ruby_llm-agents 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 463487c17c50bf1496a30c9eea51dab3c334a17010853da97da5624a6cf564b5
4
- data.tar.gz: 470a6666266d17dc8190f5118eec0c5f674fb51bae8b368d8713ea65d1882025
3
+ metadata.gz: 82355e2a179ddaf2f5003b2cbd972f373b2ca49cdcc2847535aec89fb18ed046
4
+ data.tar.gz: '09656de02af43adafdfe2615d1bfcb67aee76602fd0699d0f739eda731f29d8d'
5
5
  SHA512:
6
- metadata.gz: b1e2d4688dfc294c3b94c95df084a248fa25fbcd7d99910f31f72b85138bf37a392a8b350462ee341d5d209674b844a9d2692a177db30845857a586fa77ce3bc
7
- data.tar.gz: 50130237011f12c808a073d55b9083ce8449f8e0ddf8dd800c13134104f48233ea4f0fac3ddcbcc9a8b45b91814f67db5c57632dd6d54930c1ffd19fda825e96
6
+ metadata.gz: a5c8b20da41f0f73b8fdbffb809cecc726f1e7e6030d8351c5b994c58192b8d18da7693fa8fadec603f8dfb29ab7dd40907877600f58af185ab9d5542a884dcf
7
+ data.tar.gz: b6c0c90038a87f2824ff52b0bedd901528e291748a18caff4fd2df403affd351bf7cdf05db3042e6daae05d602c672f90a9a97434e6f54f9834c337ebae1a607
data/README.md CHANGED
@@ -135,7 +135,7 @@ result.save("logo.png")
135
135
  | **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
136
136
  | **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
137
137
  | **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
138
- | **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
138
+ | **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dynamic pricing, 28+ output formats, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
139
139
  | **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
140
140
 
141
141
  ## Quick Start
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "faraday"
4
+ require "json"
5
+
6
module RubyLLM
  module Agents
    module Audio
      module ElevenLabs
        # Fetches and caches ElevenLabs model data from the /v1/models API.
        #
        # Used for:
        # - Dynamic cost calculation via character_cost_multiplier
        # - Model validation (TTS vs STS capability)
        # - Capability awareness (style, speaker_boost, max chars, languages)
        #
        # Caching: a successful fetch is kept for
        # +configuration.elevenlabs_models_cache_ttl+ seconds (6 hours when
        # unset). A failed fetch (missing API key, HTTP error, network or JSON
        # error) keeps serving the previous data, or an empty list, but is
        # retried after FAILURE_RETRY_INTERVAL seconds instead of being cached
        # for the whole TTL.
        #
        # @example Check if a model supports TTS
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_v3")             # => true
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_english_sts_v2") # => false
        #
        # @example Get cost multiplier
        #   ElevenLabs::ModelRegistry.cost_multiplier("eleven_flash_v2_5") # => 0.5
        #
        module ModelRegistry
          extend self

          # Cache lifetime in seconds used when the configuration value is nil.
          DEFAULT_CACHE_TTL = 21_600
          # Seconds to wait before retrying after a failed fetch.
          FAILURE_RETRY_INTERVAL = 60
          # API host used when no elevenlabs_api_base is configured.
          DEFAULT_API_BASE = "https://api.elevenlabs.io"
          # Created at load time: a lazily created mutex (`@mutex ||= Mutex.new`)
          # can itself be created twice by two racing threads.
          CACHE_MUTEX = Mutex.new
          private_constant :DEFAULT_CACHE_TTL, :FAILURE_RETRY_INTERVAL,
            :DEFAULT_API_BASE, :CACHE_MUTEX

          # Returns all models from the ElevenLabs API (cached)
          #
          # @return [Array<Hash>] Array of model hashes (empty when nothing
          #   could be fetched and no earlier data exists)
          def models
            CACHE_MUTEX.synchronize do
              return @models if @models && !cache_expired?

              fresh = fetch_models
              # Remember the failure so cache_expired? applies the short retry
              # interval rather than the full TTL.
              @last_fetch_failed = fresh.nil?
              @models = fresh || @models || []
              @fetched_at = Time.now
              @models
            end
          end

          # Find a specific model by ID
          #
          # @param model_id [String] The model identifier
          # @return [Hash, nil] Model hash or nil if not found
          def find(model_id)
            wanted = model_id.to_s
            models.find { |m| m["model_id"] == wanted }
          end

          # Check if model supports text-to-speech
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def tts_model?(model_id)
            find(model_id)&.dig("can_do_text_to_speech") == true
          end

          # Get character_cost_multiplier for a model
          #
          # @param model_id [String] The model identifier
          # @return [Float] Cost multiplier (defaults to 1.0 for unknown models)
          def cost_multiplier(model_id)
            find(model_id)&.dig("model_rates", "character_cost_multiplier") || 1.0
          end

          # Get max characters per request for a model
          #
          # @param model_id [String] The model identifier
          # @return [Integer, nil] Max characters or nil if unknown
          def max_characters(model_id)
            find(model_id)&.dig("maximum_text_length_per_request")
          end

          # Get supported language IDs for a model
          #
          # @param model_id [String] The model identifier
          # @return [Array<String>] Language IDs (e.g. ["en", "es", "ja"])
          def languages(model_id)
            find(model_id)&.dig("languages")&.map { |l| l["language_id"] } || []
          end

          # Check if model supports the style voice setting
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_style?(model_id)
            find(model_id)&.dig("can_use_style") == true
          end

          # Check if model supports the speaker_boost setting
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_speaker_boost?(model_id)
            find(model_id)&.dig("can_use_speaker_boost") == true
          end

          # Check if model supports voice conversion (speech-to-speech)
          # Used by VoiceConverter agent (see plans/elevenlabs_voice_converter.md)
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def voice_conversion_model?(model_id)
            find(model_id)&.dig("can_do_voice_conversion") == true
          end

          # Force refresh the cache
          #
          # @return [Array<Hash>] Fresh model data
          def refresh!
            clear_cache!
            models
          end

          # Clear cache without re-fetching (useful for tests)
          #
          # @return [void]
          def clear_cache!
            CACHE_MUTEX.synchronize do
              @models = nil
              @fetched_at = nil
              @last_fetch_failed = false
            end
          end

          private

          # Performs the HTTP request.
          #
          # @return [Array<Hash>, nil] parsed model list on success; nil when
          #   no API key is configured or the request/parse failed
          def fetch_models
            return nil unless api_key

            response = connection.get("/v1/models")
            unless response.success?
              warn "[RubyLLM::Agents] ElevenLabs /v1/models returned HTTP #{response.status}"
              return nil
            end

            parsed = JSON.parse(response.body)
            # Keep only Hash entries so lookups can index by string key safely.
            parsed.is_a?(Array) ? parsed.grep(Hash) : []
          rescue Faraday::Error, JSON::ParserError => e
            warn "[RubyLLM::Agents] Failed to fetch ElevenLabs models: #{e.message}"
            nil
          end

          # @return [Boolean] true when there is no cached fetch or it is too old
          def cache_expired?
            return true unless @fetched_at

            Time.now - @fetched_at > cache_lifetime
          end

          # @return [Numeric] seconds the current cache entry stays valid
          def cache_lifetime
            ttl = RubyLLM::Agents.configuration.elevenlabs_models_cache_ttl || DEFAULT_CACHE_TTL
            @last_fetch_failed ? [ttl, FAILURE_RETRY_INTERVAL].min : ttl
          end

          def api_key
            RubyLLM::Agents.configuration.elevenlabs_api_key
          end

          def api_base
            base = RubyLLM::Agents.configuration.elevenlabs_api_base
            (base && !base.empty?) ? base : DEFAULT_API_BASE
          end

          def connection
            Faraday.new(url: api_base) do |f|
              f.headers["xi-api-key"] = api_key
              f.adapter Faraday.default_adapter
              f.options.timeout = 10
              f.options.open_timeout = 5
            end
          end
        end
      end
    end
  end
end
@@ -4,6 +4,7 @@ require "digest"
4
4
  require_relative "../results/speech_result"
5
5
  require_relative "speech_client"
6
6
  require_relative "speech_pricing"
7
+ require_relative "elevenlabs/model_registry"
7
8
 
8
9
  module RubyLLM
9
10
  module Agents
@@ -409,6 +410,7 @@ module RubyLLM
409
410
 
410
411
  # Executes speech synthesis
411
412
  def execute_speech(processed_text)
413
+ validate_elevenlabs_model!(processed_text)
412
414
  speak_options = build_speak_options
413
415
 
414
416
  if streaming_enabled? && @streaming_block
@@ -418,6 +420,42 @@ module RubyLLM
418
420
  end
419
421
  end
420
422
 
423
# Validates ElevenLabs model capabilities before calling the API.
# Raises on hard errors (non-TTS model), warns on soft issues.
#
# Does nothing unless the resolved provider is :elevenlabs, the model
# registry is loaded, and the registry knows the resolved model.
#
# @param text [String, nil] Text about to be synthesized
# @return [nil]
# @raise [ConfigurationError] if the model cannot do text-to-speech
def validate_elevenlabs_model!(text)
  return unless resolved_provider == :elevenlabs
  return unless defined?(Audio::ElevenLabs::ModelRegistry)

  model_id = resolved_model
  model = Audio::ElevenLabs::ModelRegistry.find(model_id)
  return unless model # Unknown model — skip validation

  # Hard error: model doesn't support TTS at all
  unless model["can_do_text_to_speech"] == true
    raise ConfigurationError,
      "ElevenLabs model '#{model_id}' does not support text-to-speech. " \
      "It may be a speech-to-speech model. Use a TTS-capable model like 'eleven_v3'."
  end

  # Warn: text exceeds model's max character limit.
  # to_s keeps a nil text from raising NoMethodError here; the generic rescue
  # below would swallow it and the style check would never run.
  text_length = text.to_s.length
  max_chars = model["maximum_text_length_per_request"]
  if max_chars && text_length > max_chars
    warn "[RubyLLM::Agents] Text length (#{text_length}) exceeds " \
      "#{model_id} max of #{max_chars} characters. The API may truncate or reject it."
  end

  # Warn: style used on model that doesn't support it
  vs = self.class.voice_settings_config
  if vs && vs.style_value && vs.style_value > 0 && model["can_use_style"] != true
    warn "[RubyLLM::Agents] Model '#{model_id}' does not support the 'style' voice setting. It will be ignored."
  end
rescue ConfigurationError
  raise
rescue => e
  # Don't block speech on validation errors
  warn "[RubyLLM::Agents] ElevenLabs model validation failed: #{e.message}"
end
458
+
421
459
  # Executes standard (non-streaming) speech synthesis
422
460
  def execute_standard_speech(text, options)
423
461
  response = speech_client.speak(
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "faraday"
4
4
  require "json"
5
+ require "set"
5
6
 
6
7
  module RubyLLM
7
8
  module Agents
@@ -266,14 +267,37 @@ module RubyLLM
266
267
  body
267
268
  end
268
269
 
270
# Convenience mapping: simple format name → ElevenLabs native format string.
# Names with no native equivalent here (ogg, flac, aac) fall back to MP3.
ELEVENLABS_FORMAT_MAP = {
  "mp3" => "mp3_44100_128",
  "wav" => "wav_44100",
  "ogg" => "mp3_44100_128", # ElevenLabs doesn't support ogg; fallback to mp3
  "pcm" => "pcm_24000",
  "opus" => "opus_48000_128",
  "flac" => "mp3_44100_128", # ElevenLabs doesn't support flac; fallback to mp3
  "aac" => "mp3_44100_128", # ElevenLabs doesn't support aac; fallback to mp3
  "alaw" => "alaw_8000",
  "ulaw" => "ulaw_8000"
}.freeze

# All valid ElevenLabs native format strings (pass-through)
ELEVENLABS_NATIVE_FORMATS = Set.new(%w[
  mp3_22050_32 mp3_24000_48 mp3_44100_32 mp3_44100_64
  mp3_44100_96 mp3_44100_128 mp3_44100_192
  pcm_8000 pcm_16000 pcm_22050 pcm_24000 pcm_32000 pcm_44100 pcm_48000
  wav_8000 wav_16000 wav_22050 wav_24000 wav_32000 wav_44100 wav_48000
  opus_48000_32 opus_48000_64 opus_48000_96 opus_48000_128 opus_48000_192
  alaw_8000 ulaw_8000
]).freeze

# Resolves a requested output format to an ElevenLabs native format string.
#
# The lookup ignores case and surrounding whitespace, so "WAV" or
# " MP3_44100_192 " resolve to the intended format instead of silently
# falling through to the MP3 default.
#
# @param format [Symbol, String, nil] Simple name (:mp3, :wav, ...) or a
#   native ElevenLabs format string
# @return [String] Native format string; "mp3_44100_128" when unrecognized
def elevenlabs_output_format(format)
  format_str = format.to_s.strip.downcase

  # Pass through native ElevenLabs format strings directly
  return format_str if ELEVENLABS_NATIVE_FORMATS.include?(format_str)

  # Map simple names to native formats
  ELEVENLABS_FORMAT_MAP.fetch(format_str, "mp3_44100_128")
end
278
302
 
279
303
  def elevenlabs_connection
@@ -8,10 +8,11 @@ module RubyLLM
8
8
  module Audio
9
9
  # Dynamic pricing resolution for text-to-speech models.
10
10
  #
11
- # Uses the same three-tier strategy as ImageGenerator::Pricing:
11
+ # Uses a four-tier pricing cascade:
12
12
  # 1. LiteLLM JSON (primary) - future-proof, auto-updating
13
13
  # 2. Configurable pricing table - user overrides via config.tts_model_pricing
14
- # 3. Hardcoded fallbacks - per-model defaults
14
+ # 3. ElevenLabs API - dynamic multiplier × base rate from /v1/models
15
+ # 4. Hardcoded fallbacks - per-model defaults
15
16
  #
16
17
  # All prices are per 1,000 characters.
17
18
  #
@@ -50,14 +51,22 @@ module RubyLLM
50
51
  # @param model_id [String] Model identifier
51
52
  # @return [Float] Cost per 1K characters in USD
52
53
  def cost_per_1k_characters(provider, model_id)
54
+ # Tier 1: LiteLLM
53
55
  if (litellm_price = from_litellm(model_id))
54
56
  return litellm_price
55
57
  end
56
58
 
59
+ # Tier 2: User config overrides
57
60
  if (config_price = from_config(model_id))
58
61
  return config_price
59
62
  end
60
63
 
64
+ # Tier 3: ElevenLabs API multiplier × base rate
65
+ if provider == :elevenlabs && (api_price = from_elevenlabs_api(model_id))
66
+ return api_price
67
+ end
68
+
69
+ # Tier 4: Hardcoded fallbacks
61
70
  fallback_price(provider, model_id)
62
71
  end
63
72
 
@@ -73,6 +82,7 @@ module RubyLLM
73
82
  {
74
83
  litellm: litellm_tts_models,
75
84
  configured: config.tts_model_pricing || {},
85
+ elevenlabs_api: elevenlabs_api_pricing,
76
86
  fallbacks: fallback_pricing_table
77
87
  }
78
88
  end
@@ -190,6 +200,19 @@ module RubyLLM
190
200
  end
191
201
  end
192
202
 
203
# Builds a price table for every model the ElevenLabs registry knows.
#
# Each price is the configured base cost per 1K characters (0.30 when unset)
# multiplied by the model's character_cost_multiplier (1.0 when absent),
# rounded to 6 decimals. Entries without a "model_id" are skipped rather than
# being stored under a nil key.
#
# @return [Hash{String => Float}] model_id => cost per 1K characters; empty
#   when the registry is not loaded or the lookup raises
def elevenlabs_api_pricing
  return {} unless defined?(ElevenLabs::ModelRegistry)

  base = config.elevenlabs_base_cost_per_1k || 0.30
  ElevenLabs::ModelRegistry.models.each_with_object({}) do |model, table|
    model_id = model["model_id"]
    next unless model_id

    multiplier = model.dig("model_rates", "character_cost_multiplier") || 1.0
    table[model_id] = (base * multiplier).round(6)
  end
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  {}
end
215
+
193
216
  # ============================================================
194
217
  # Tier 2: User configuration
195
218
  # ============================================================
@@ -207,7 +230,25 @@ module RubyLLM
207
230
  end
208
231
 
209
232
  # ============================================================
210
- # Tier 3: Hardcoded fallbacks
233
+ # Tier 3: ElevenLabs API (dynamic multiplier × base rate)
234
+ # ============================================================
235
+
236
# Tier 3 lookup: prices a model from the ElevenLabs model registry.
#
# @param model_id [String] Model identifier
# @return [Float, nil] base cost per 1K characters × character_cost_multiplier,
#   rounded to 6 decimals; nil when the registry is not loaded, the model is
#   not in the registry, or the lookup raises
def from_elevenlabs_api(model_id)
  return nil unless defined?(ElevenLabs::ModelRegistry)

  entry = ElevenLabs::ModelRegistry.find(model_id)
  return nil unless entry

  # A model without a published multiplier is charged at the base rate.
  factor = entry.dig("model_rates", "character_cost_multiplier") || 1.0
  rate = config.elevenlabs_base_cost_per_1k || 0.30
  price = rate * factor
  price.round(6)
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  nil
end
249
+
250
+ # ============================================================
251
+ # Tier 4: Hardcoded fallbacks
211
252
  # ============================================================
212
253
 
213
254
  def fallback_price(provider, model_id)
@@ -453,7 +453,9 @@ module RubyLLM
453
453
  :root_namespace,
454
454
  :tool_result_max_length,
455
455
  :redaction,
456
- :persist_audio_data
456
+ :persist_audio_data,
457
+ :elevenlabs_base_cost_per_1k,
458
+ :elevenlabs_models_cache_ttl
457
459
 
458
460
  # Attributes with validation (readers only, custom setters below)
459
461
  attr_reader :default_temperature,
@@ -738,6 +740,11 @@ module RubyLLM
738
740
 
739
741
  # Audio data persistence (disabled by default — base64 audio can be large)
740
742
  @persist_audio_data = false
743
+
744
+ # ElevenLabs dynamic pricing: base cost per 1K characters (Pro plan overage rate)
745
+ @elevenlabs_base_cost_per_1k = 0.30
746
+ # ElevenLabs models cache TTL in seconds (6 hours)
747
+ @elevenlabs_models_cache_ttl = 21_600
741
748
  end
742
749
 
743
750
  # Returns the configured cache store, falling back to Rails.cache
@@ -4,6 +4,6 @@ module RubyLLM
4
4
  module Agents
5
5
  # Current version of the RubyLLM::Agents gem
6
6
  # @return [String] Semantic version string
7
- VERSION = "3.3.0"
7
+ VERSION = "3.4.0"
8
8
  end
9
9
  end
@@ -319,23 +319,26 @@ module RubyLLM
319
319
  #
320
320
  # @return [String] MIME type
321
321
  def mime_type_for_format
322
+ fmt = format.to_s
323
+
324
+ # Handle ElevenLabs native format strings (e.g., "mp3_44100_128")
325
+ return "audio/mpeg" if fmt.start_with?("mp3")
326
+ return "audio/wav" if fmt.start_with?("wav")
327
+ return "audio/opus" if fmt.start_with?("opus")
328
+ return "audio/pcm" if fmt.start_with?("pcm")
329
+ return "audio/alaw" if fmt.start_with?("alaw")
330
+ return "audio/basic" if fmt.start_with?("ulaw")
331
+
332
+ # Handle simple symbols (backward compatible)
322
333
  case format
323
- when :mp3
324
- "audio/mpeg"
325
- when :wav
326
- "audio/wav"
327
- when :ogg
328
- "audio/ogg"
329
- when :flac
330
- "audio/flac"
331
- when :aac
332
- "audio/aac"
333
- when :opus
334
- "audio/opus"
335
- when :pcm
336
- "audio/pcm"
337
- else
338
- "audio/mpeg" # Default to mp3
334
+ when :mp3 then "audio/mpeg"
335
+ when :wav then "audio/wav"
336
+ when :ogg then "audio/ogg"
337
+ when :flac then "audio/flac"
338
+ when :aac then "audio/aac"
339
+ when :opus then "audio/opus"
340
+ when :pcm then "audio/pcm"
341
+ else "audio/mpeg"
339
342
  end
340
343
  end
341
344
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-agents
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.0
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - adham90
@@ -209,6 +209,7 @@ files:
209
209
  - lib/generators/ruby_llm_agents/upgrade_generator.rb
210
210
  - lib/ruby_llm-agents.rb
211
211
  - lib/ruby_llm/agents.rb
212
+ - lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb
212
213
  - lib/ruby_llm/agents/audio/speaker.rb
213
214
  - lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
214
215
  - lib/ruby_llm/agents/audio/speech_client.rb