ruby_llm-agents 3.2.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8c4a83ecc9e39e7df7243b98a51d1c249a963f3a1f96551ebefae13becb50c5
4
- data.tar.gz: d042bda1737b7593187896e879b3065cc855e990e1406410e99d1d853819f3a9
3
+ metadata.gz: 82355e2a179ddaf2f5003b2cbd972f373b2ca49cdcc2847535aec89fb18ed046
4
+ data.tar.gz: '09656de02af43adafdfe2615d1bfcb67aee76602fd0699d0f739eda731f29d8d'
5
5
  SHA512:
6
- metadata.gz: 78b6fa31a8a656c36e0bb51f6e7a405101e90f47afc3ca35f87c392878b6a2d986b2b275c6424826ffb6b5d4bfb059f4632e8976aa5b2a473eab540619bd18cb
7
- data.tar.gz: 3d5890ea864aea3531e96571b6010c9fdcedc5a15e8966c7974138cde0ebaec89a771e33fd20e3fd2d5097a1ecfc398236d833f9852f046fd3b1f720eaf7fb6e
6
+ metadata.gz: a5c8b20da41f0f73b8fdbffb809cecc726f1e7e6030d8351c5b994c58192b8d18da7693fa8fadec603f8dfb29ab7dd40907877600f58af185ab9d5542a884dcf
7
+ data.tar.gz: b6c0c90038a87f2824ff52b0bedd901528e291748a18caff4fd2df403affd351bf7cdf05db3042e6daae05d602c672f90a9a97434e6f54f9834c337ebae1a607
data/README.md CHANGED
@@ -135,7 +135,7 @@ result.save("logo.png")
135
135
  | **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
136
136
  | **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
137
137
  | **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
138
- | **Audio** | Text-to-speech (OpenAI, ElevenLabs) and speech-to-text with cost tracking | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
138
+ | **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dynamic pricing, 28+ output formats, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
139
139
  | **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
140
140
 
141
141
  ## Quick Start
@@ -0,0 +1,57 @@
1
<%# Audio section of the execution detail page: optional inline player, a row of audio stats, and a hint when nothing is playable. %>
<%
  # Values stored on the response payload win (string key first, then symbol key);
  # execution metadata is the fallback where a matching metadata key exists.
  payload = @execution.response || {}
  lookup  = ->(key) { payload[key.to_s] || payload[key.to_sym] }
  meta    = ->(key) { @execution.metadata&.dig(key) }

  audio_src        = lookup.call(:audio_url) || lookup.call(:audio_data_uri)
  audio_format     = lookup.call(:format)    || meta.call("audio_format")
  audio_duration   = lookup.call(:duration)  || meta.call("audio_duration_seconds")
  audio_file_size  = lookup.call(:file_size) || meta.call("audio_file_size_bytes")
  audio_voice      = lookup.call(:voice_id)  || meta.call("voice_id")
  audio_provider   = lookup.call(:provider)  || meta.call("audio_provider")
  audio_characters = meta.call("audio_characters")

  # Each stat: raw value, trailing label, and how to render the value.
  # Formatters are lambdas so helpers only run for values that are present.
  stats = [
    [audio_duration,   "duration",   ->(v) { v.is_a?(Numeric) ? "#{v.round(1)}s" : v }],
    [audio_format,     "format",     ->(v) { v }],
    [audio_file_size,  "size",       ->(v) { number_to_human_size(v) }],
    [audio_voice,      "voice",      ->(v) { v }],
    [audio_provider,   "provider",   ->(v) { v }],
    [audio_characters, "characters", ->(v) { number_to_human_short(v) }]
  ]
%>

<div class="flex items-center gap-3 mt-6 mb-3">
  <span class="text-[10px] font-medium text-gray-400 dark:text-gray-600 uppercase tracking-widest font-mono">audio</span>
  <div class="flex-1 border-t border-gray-200 dark:border-gray-800"></div>
</div>

<% if audio_src.present? %>
  <div class="mb-3">
    <audio controls preload="metadata" class="w-full max-w-lg" style="height: 36px;">
      <source src="<%= audio_src %>">
      Your browser does not support the audio element.
    </audio>
  </div>
<% end %>

<div class="flex flex-wrap items-center gap-x-4 gap-y-1 font-mono text-xs text-gray-400 dark:text-gray-500">
  <% stats.each do |raw, label, formatter| %>
    <% if raw.present? %>
      <span><span class="text-gray-800 dark:text-gray-200"><%= formatter.call(raw) %></span> <%= label %></span>
    <% end %>
  <% end %>
</div>

<% if audio_src.blank? %>
  <p class="text-xs text-gray-400 dark:text-gray-600 font-mono mt-2 italic">
    No audio data stored. Enable <code class="text-gray-500 dark:text-gray-400">persist_audio_data</code> in config to play back Speaker audio here.
  </p>
<% end %>
@@ -57,6 +57,14 @@
57
57
  <% end %>
58
58
  </div>
59
59
 
60
+ <!-- ── audio player ──────────────────── -->
61
+ <% if @execution.agent_type.to_s.match?(/Speaker|Narrator|Transcriber/i) ||
62
+ @execution.metadata&.dig("audio_duration_seconds").present? ||
63
+ @execution.response&.dig("audio_data_uri").present? ||
64
+ @execution.response&.dig("audio_url").present? %>
65
+ <%= render "ruby_llm/agents/executions/audio_player" %>
66
+ <% end %>
67
+
60
68
  <!-- ── tokens ──────────────────────── -->
61
69
  <%
62
70
  input_tokens = @execution.input_tokens || 0
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "faraday"
4
+ require "json"
5
+
6
module RubyLLM
  module Agents
    module Audio
      module ElevenLabs
        # Fetches and caches ElevenLabs model data from the /v1/models API.
        #
        # Used for:
        # - Dynamic cost calculation via character_cost_multiplier
        # - Model validation (TTS vs STS capability)
        # - Capability awareness (style, speaker_boost, max chars, languages)
        #
        # The model list is held in module-level state and refreshed when older
        # than +configuration.elevenlabs_models_cache_ttl+ seconds (21_600 when
        # the setting is nil).
        #
        # @example Check if a model supports TTS
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_v3")             # => true
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_english_sts_v2") # => false
        #
        # @example Get cost multiplier
        #   ElevenLabs::ModelRegistry.cost_multiplier("eleven_flash_v2_5") # => 0.5
        #
        module ModelRegistry
          extend self

          # Guards @models / @fetched_at. Created once at load time; a lazy
          # `@mutex ||= Mutex.new` can hand two racing threads different locks.
          CACHE_MUTEX = Mutex.new
          private_constant :CACHE_MUTEX

          # Returns all models from the ElevenLabs API (cached).
          #
          # When no API key is configured, returns an empty list WITHOUT
          # stamping the cache, so a key configured later takes effect on the
          # next call instead of after the TTL. A fetch that was attempted and
          # failed is still stamped, so the endpoint is not retried until the
          # TTL elapses.
          #
          # @return [Array<Hash>] Array of model hashes
          def models
            CACHE_MUTEX.synchronize do
              next @models if @models && !cache_expired?
              next [] unless api_key_configured?

              @models = fetch_models
              @fetched_at = Time.now
              @models
            end
          end

          # Find a specific model by ID.
          #
          # @param model_id [String, Symbol] The model identifier
          # @return [Hash, nil] Model hash or nil if not found
          def find(model_id)
            wanted = model_id.to_s
            models.find { |m| m["model_id"] == wanted }
          end

          # Check if model supports text-to-speech.
          #
          # @param model_id [String] The model identifier
          # @return [Boolean] false for unknown models
          def tts_model?(model_id)
            find(model_id)&.dig("can_do_text_to_speech") == true
          end

          # Get character_cost_multiplier for a model.
          #
          # @param model_id [String] The model identifier
          # @return [Numeric] Cost multiplier (1.0 when the model or rate is unknown)
          def cost_multiplier(model_id)
            find(model_id)&.dig("model_rates", "character_cost_multiplier") || 1.0
          end

          # Get max characters per request for a model.
          #
          # @param model_id [String] The model identifier
          # @return [Integer, nil] Max characters or nil if unknown
          def max_characters(model_id)
            find(model_id)&.dig("maximum_text_length_per_request")
          end

          # Get supported language IDs for a model.
          #
          # @param model_id [String] The model identifier
          # @return [Array<String>] Language IDs (e.g. ["en", "es", "ja"])
          def languages(model_id)
            find(model_id)&.dig("languages")&.map { |l| l["language_id"] } || []
          end

          # Check if model supports the style voice setting.
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_style?(model_id)
            find(model_id)&.dig("can_use_style") == true
          end

          # Check if model supports the speaker_boost setting.
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_speaker_boost?(model_id)
            find(model_id)&.dig("can_use_speaker_boost") == true
          end

          # Check if model supports voice conversion (speech-to-speech).
          # Used by VoiceConverter agent (see plans/elevenlabs_voice_converter.md)
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def voice_conversion_model?(model_id)
            find(model_id)&.dig("can_do_voice_conversion") == true
          end

          # Force refresh the cache.
          #
          # @return [Array<Hash>] Fresh model data
          def refresh!
            clear_cache!
            models
          end

          # Clear cache without re-fetching (useful for tests).
          #
          # @return [void]
          def clear_cache!
            CACHE_MUTEX.synchronize do
              @models = nil
              @fetched_at = nil
            end
          end

          private

          # Performs the HTTP request. Only called while CACHE_MUTEX is held.
          # On a non-success status, a transport error, or unparsable JSON it
          # warns and returns the previously cached list (or []).
          def fetch_models
            response = connection.get("/v1/models")

            unless response.success?
              warn "[RubyLLM::Agents] ElevenLabs /v1/models returned HTTP #{response.status}"
              return @models || []
            end

            parsed = JSON.parse(response.body)
            parsed.is_a?(Array) ? parsed : []
          rescue Faraday::Error, JSON::ParserError => e
            warn "[RubyLLM::Agents] Failed to fetch ElevenLabs models: #{e.message}"
            @models || []
          end

          # True when the cache has never been stamped or is older than the TTL.
          def cache_expired?
            return true unless @fetched_at

            ttl = RubyLLM::Agents.configuration.elevenlabs_models_cache_ttl || 21_600
            Time.now - @fetched_at > ttl
          end

          def api_key
            RubyLLM::Agents.configuration.elevenlabs_api_key
          end

          # A nil or empty key cannot authenticate, so no request is attempted.
          def api_key_configured?
            !api_key.to_s.empty?
          end

          # Configured base URL, or the public ElevenLabs endpoint when unset/empty.
          def api_base
            base = RubyLLM::Agents.configuration.elevenlabs_api_base
            (base && !base.empty?) ? base : "https://api.elevenlabs.io"
          end

          def connection
            Faraday.new(url: api_base) do |f|
              f.headers["xi-api-key"] = api_key
              f.adapter Faraday.default_adapter
              f.options.timeout = 10
              f.options.open_timeout = 5
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Agents
    class Speaker
      # ActiveStorage integration for speakers
      #
      # Provides a convenience class method that generates audio and attaches
      # it directly to an ActiveStorage attachment on a record.
      #
      # @example Attaching to a model
      #   class Article < ApplicationRecord
      #     has_one_attached :narration
      #   end
      #
      #   class ArticleNarrator < RubyLLM::Agents::Speaker
      #     include RubyLLM::Agents::Speaker::ActiveStorageSupport
      #
      #     provider :openai
      #     model 'tts-1-hd'
      #     voice 'nova'
      #   end
      #
      #   article = Article.find(1)
      #   result = ArticleNarrator.speak_and_attach(
      #     text: article.body,
      #     record: article,
      #     attachment_name: :narration
      #   )
      #
      module ActiveStorageSupport
        extend ActiveSupport::Concern

        class_methods do
          # Generate audio and attach it to a record.
          #
          # Returns the result untouched (nothing is attached) when generation
          # did not succeed.
          #
          # @param text [String] Text to convert to speech
          # @param record [ActiveRecord::Base] The record to attach to
          # @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
          # @param options [Hash] Additional options for generation; +:filename+
          #   overrides the generated attachment filename
          # @return [SpeechResult] The speech result, with audio_key/audio_url
          #   set when the attachment exposes a blob
          def speak_and_attach(text:, record:, attachment_name:, **options)
            # NOTE(review): options (including :filename, if given) are forwarded
            # to `call` unchanged — confirm `call` tolerates that key.
            result = call(text: text, **options)

            return result unless result.success?

            attach_audio_to_record(result, record, attachment_name, options)

            result
          end

          private

          # Attaches the generated audio and copies the blob's key and URL back
          # onto the result.
          def attach_audio_to_record(result, record, attachment_name, options)
            attachment = record.public_send(attachment_name)
            filename = options[:filename] || generate_audio_filename(result)

            attachment.attach(
              io: StringIO.new(result.audio),
              filename: filename,
              content_type: result.content_type
            )

            return unless attachment.respond_to?(:blob)

            blob = attachment.blob
            return unless blob

            result.audio_key = blob.key
            result.audio_url = blob_url(attachment)
          end

          # Best-effort URL lookup: prefers +url+, falls back to +service_url+,
          # and yields nil if neither exists or the call raises.
          def blob_url(attachment)
            blob = attachment.blob
            if blob.respond_to?(:url)
              blob.url
            elsif blob.respond_to?(:service_url)
              blob.service_url
            end
          rescue StandardError
            nil
          end

          # Builds "speech_<unix timestamp>.<ext>".
          #
          # The extension is the codec part of the format only, so a native
          # ElevenLabs format such as "mp3_44100_128" yields ".mp3" rather than
          # ".mp3_44100_128". Falls back to "mp3" when the format is blank.
          def generate_audio_filename(result)
            timestamp = Time.current.to_i
            codec = result.format.to_s.split("_").first
            ext = codec.nil? || codec.empty? ? "mp3" : codec
            "speech_#{timestamp}.#{ext}"
          end
        end
      end
    end
  end
end
@@ -4,6 +4,7 @@ require "digest"
4
4
  require_relative "../results/speech_result"
5
5
  require_relative "speech_client"
6
6
  require_relative "speech_pricing"
7
+ require_relative "elevenlabs/model_registry"
7
8
 
8
9
  module RubyLLM
9
10
  module Agents
@@ -336,6 +337,14 @@ module RubyLLM
336
337
  context.output_tokens = 0
337
338
  context.total_cost = calculate_cost(result)
338
339
 
340
+ # Store audio-specific metadata for execution tracking
341
+ context[:provider] = result[:provider].to_s
342
+ context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
343
+ context[:characters] = result[:characters]
344
+ context[:output_format] = result[:format].to_s
345
+ context[:file_size] = result[:audio]&.bytesize
346
+ context[:audio_duration_seconds] = result[:duration] if result[:duration]
347
+
339
348
  # Build final result
340
349
  context.output = build_result(
341
350
  result,
@@ -401,6 +410,7 @@ module RubyLLM
401
410
 
402
411
  # Executes speech synthesis
403
412
  def execute_speech(processed_text)
413
+ validate_elevenlabs_model!(processed_text)
404
414
  speak_options = build_speak_options
405
415
 
406
416
  if streaming_enabled? && @streaming_block
@@ -410,6 +420,42 @@ module RubyLLM
410
420
  end
411
421
  end
412
422
 
423
+ # Validates ElevenLabs model capabilities before calling the API.
424
+ # Raises on hard errors (non-TTS model), warns on soft issues.
425
def validate_elevenlabs_model!(text)
  # Only relevant for ElevenLabs, and only when the registry is loaded.
  return unless resolved_provider == :elevenlabs
  return unless defined?(Audio::ElevenLabs::ModelRegistry)

  requested_model = resolved_model
  capabilities = Audio::ElevenLabs::ModelRegistry.find(requested_model)
  # Models the registry does not know are not validated.
  return unless capabilities

  # Hard failure: the model cannot synthesize speech from text.
  if capabilities["can_do_text_to_speech"] != true
    raise ConfigurationError,
      "ElevenLabs model '#{requested_model}' does not support text-to-speech. " \
      "It may be a speech-to-speech model. Use a TTS-capable model like 'eleven_v3'."
  end

  # Soft issue: input longer than the model's per-request limit.
  char_limit = capabilities["maximum_text_length_per_request"]
  if char_limit && text.length > char_limit
    warn "[RubyLLM::Agents] Text length (#{text.length}) exceeds " \
      "#{requested_model} max of #{char_limit} characters. The API may truncate or reject it."
  end

  # Soft issue: a positive style value on a model without style support.
  settings = self.class.voice_settings_config
  style_requested = settings && settings.style_value && settings.style_value > 0
  if style_requested && capabilities["can_use_style"] != true
    warn "[RubyLLM::Agents] Model '#{requested_model}' does not support the 'style' voice setting. It will be ignored."
  end
rescue ConfigurationError
  # The hard failure above must reach the caller.
  raise
rescue => e
  # Any other problem during validation is reported but never blocks speech.
  warn "[RubyLLM::Agents] ElevenLabs model validation failed: #{e.message}"
end
458
+
413
459
  # Executes standard (non-streaming) speech synthesis
414
460
  def execute_standard_speech(text, options)
415
461
  response = speech_client.speak(
@@ -559,3 +605,5 @@ module RubyLLM
559
605
  end
560
606
  end
561
607
  end
608
+
609
+ require_relative "speaker/active_storage_support"
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "faraday"
4
4
  require "json"
5
+ require "set"
5
6
 
6
7
  module RubyLLM
7
8
  module Agents
@@ -266,14 +267,37 @@ module RubyLLM
266
267
  body
267
268
  end
268
269
 
270
+ # Convenience mapping: simple symbol → ElevenLabs native format string
269
271
  ELEVENLABS_FORMAT_MAP = {
270
272
  "mp3" => "mp3_44100_128",
271
- "pcm" => "pcm_44100",
273
+ "wav" => "wav_44100",
274
+ "ogg" => "mp3_44100_128", # ElevenLabs doesn't support ogg; fallback to mp3
275
+ "pcm" => "pcm_24000",
276
+ "opus" => "opus_48000_128",
277
+ "flac" => "mp3_44100_128", # ElevenLabs doesn't support flac; fallback to mp3
278
+ "aac" => "mp3_44100_128", # ElevenLabs doesn't support aac; fallback to mp3
279
+ "alaw" => "alaw_8000",
272
280
  "ulaw" => "ulaw_8000"
273
281
  }.freeze
274
282
 
283
+ # All valid ElevenLabs native format strings (pass-through)
284
+ ELEVENLABS_NATIVE_FORMATS = Set.new(%w[
285
+ mp3_22050_32 mp3_24000_48 mp3_44100_32 mp3_44100_64
286
+ mp3_44100_96 mp3_44100_128 mp3_44100_192
287
+ pcm_8000 pcm_16000 pcm_22050 pcm_24000 pcm_32000 pcm_44100 pcm_48000
288
+ wav_8000 wav_16000 wav_22050 wav_24000 wav_32000 wav_44100 wav_48000
289
+ opus_48000_32 opus_48000_64 opus_48000_96 opus_48000_128 opus_48000_192
290
+ alaw_8000 ulaw_8000
291
+ ]).freeze
292
+
275
293
  def elevenlabs_output_format(format)
276
- ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
294
+ format_str = format.to_s
295
+
296
+ # Pass through native ElevenLabs format strings directly
297
+ return format_str if ELEVENLABS_NATIVE_FORMATS.include?(format_str)
298
+
299
+ # Map simple symbols to native formats
300
+ ELEVENLABS_FORMAT_MAP[format_str] || "mp3_44100_128"
277
301
  end
278
302
 
279
303
  def elevenlabs_connection
@@ -8,10 +8,11 @@ module RubyLLM
8
8
  module Audio
9
9
  # Dynamic pricing resolution for text-to-speech models.
10
10
  #
11
- # Uses the same three-tier strategy as ImageGenerator::Pricing:
11
+ # Uses a four-tier pricing cascade:
12
12
  # 1. LiteLLM JSON (primary) - future-proof, auto-updating
13
13
  # 2. Configurable pricing table - user overrides via config.tts_model_pricing
14
- # 3. Hardcoded fallbacks - per-model defaults
14
+ # 3. ElevenLabs API - dynamic multiplier × base rate from /v1/models
15
+ # 4. Hardcoded fallbacks - per-model defaults
15
16
  #
16
17
  # All prices are per 1,000 characters.
17
18
  #
@@ -50,14 +51,22 @@ module RubyLLM
50
51
  # @param model_id [String] Model identifier
51
52
  # @return [Float] Cost per 1K characters in USD
52
53
  def cost_per_1k_characters(provider, model_id)
54
+ # Tier 1: LiteLLM
53
55
  if (litellm_price = from_litellm(model_id))
54
56
  return litellm_price
55
57
  end
56
58
 
59
+ # Tier 2: User config overrides
57
60
  if (config_price = from_config(model_id))
58
61
  return config_price
59
62
  end
60
63
 
64
+ # Tier 3: ElevenLabs API multiplier × base rate
65
+ if provider == :elevenlabs && (api_price = from_elevenlabs_api(model_id))
66
+ return api_price
67
+ end
68
+
69
+ # Tier 4: Hardcoded fallbacks
61
70
  fallback_price(provider, model_id)
62
71
  end
63
72
 
@@ -73,6 +82,7 @@ module RubyLLM
73
82
  {
74
83
  litellm: litellm_tts_models,
75
84
  configured: config.tts_model_pricing || {},
85
+ elevenlabs_api: elevenlabs_api_pricing,
76
86
  fallbacks: fallback_pricing_table
77
87
  }
78
88
  end
@@ -190,6 +200,19 @@ module RubyLLM
190
200
  end
191
201
  end
192
202
 
203
# Builds a { model_id => price per 1K characters } table from the ElevenLabs
# model registry: configured base rate (0.30 when unset) times each model's
# character_cost_multiplier (1.0 when absent), rounded to 6 places.
# Returns {} when the registry is not loaded or anything raises.
def elevenlabs_api_pricing
  return {} unless defined?(ElevenLabs::ModelRegistry)

  base_rate = config.elevenlabs_base_cost_per_1k || 0.30
  ElevenLabs::ModelRegistry.models.to_h do |entry|
    factor = entry.dig("model_rates", "character_cost_multiplier") || 1.0
    [entry["model_id"], (base_rate * factor).round(6)]
  end
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  {}
end
215
+
193
216
  # ============================================================
194
217
  # Tier 2: User configuration
195
218
  # ============================================================
@@ -207,7 +230,25 @@ module RubyLLM
207
230
  end
208
231
 
209
232
  # ============================================================
210
- # Tier 3: Hardcoded fallbacks
233
+ # Tier 3: ElevenLabs API (dynamic multiplier × base rate)
234
+ # ============================================================
235
+
236
# Price per 1K characters for one ElevenLabs model: configured base rate
# (0.30 when unset) times the model's character_cost_multiplier (1.0 when
# absent), rounded to 6 places. Returns nil when the registry is not loaded,
# the model is unknown, or anything raises.
def from_elevenlabs_api(model_id)
  return nil unless defined?(ElevenLabs::ModelRegistry)

  entry = ElevenLabs::ModelRegistry.find(model_id)
  return nil unless entry

  factor = entry.dig("model_rates", "character_cost_multiplier") || 1.0
  base_rate = config.elevenlabs_base_cost_per_1k || 0.30
  (base_rate * factor).round(6)
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  nil
end
249
+
250
+ # ============================================================
251
+ # Tier 4: Hardcoded fallbacks
211
252
  # ============================================================
212
253
 
213
254
  def fallback_price(provider, model_id)
@@ -318,6 +318,16 @@ module RubyLLM
318
318
  context.output_tokens = 0
319
319
  context.total_cost = calculate_cost(raw_result)
320
320
 
321
+ # Store transcription-specific metadata for execution tracking
322
+ context[:language] = resolved_language if resolved_language
323
+ context[:detected_language] = raw_result[:language] if raw_result[:language]
324
+ context[:audio_duration_seconds] = raw_result[:duration] if raw_result[:duration]
325
+ context[:audio_minutes] = (raw_result[:duration] / 60.0).round(4) if raw_result[:duration]
326
+ context[:output_format] = self.class.output_format.to_s
327
+ context[:timestamp_granularity] = self.class.include_timestamps.to_s
328
+ context[:segment_count] = raw_result[:segments]&.size if raw_result[:segments]
329
+ context[:word_count] = raw_result[:text]&.split(/\s+/)&.size if raw_result[:text]
330
+
321
331
  # Build final result
322
332
  context.output = build_result(
323
333
  raw_result,
@@ -452,7 +452,10 @@ module RubyLLM
452
452
  :root_directory,
453
453
  :root_namespace,
454
454
  :tool_result_max_length,
455
- :redaction
455
+ :redaction,
456
+ :persist_audio_data,
457
+ :elevenlabs_base_cost_per_1k,
458
+ :elevenlabs_models_cache_ttl
456
459
 
457
460
  # Attributes with validation (readers only, custom setters below)
458
461
  attr_reader :default_temperature,
@@ -734,6 +737,14 @@ module RubyLLM
734
737
 
735
738
  # Redaction defaults (disabled by default)
736
739
  @redaction = nil
740
+
741
+ # Audio data persistence (disabled by default — base64 audio can be large)
742
+ @persist_audio_data = false
743
+
744
+ # ElevenLabs dynamic pricing: base cost per 1K characters (Pro plan overage rate)
745
+ @elevenlabs_base_cost_per_1k = 0.30
746
+ # ElevenLabs models cache TTL in seconds (6 hours)
747
+ @elevenlabs_models_cache_ttl = 21_600
737
748
  end
738
749
 
739
750
  # Returns the configured cache store, falling back to Rails.cache
@@ -4,6 +4,6 @@ module RubyLLM
4
4
  module Agents
5
5
  # Current version of the RubyLLM::Agents gem
6
6
  # @return [String] Semantic version string
7
- VERSION = "3.2.0"
7
+ VERSION = "3.4.0"
8
8
  end
9
9
  end
@@ -280,6 +280,9 @@ module RubyLLM
280
280
  detail_data[:response] = serialize_response(context)
281
281
  end
282
282
 
283
+ # Persist audio data for Speaker executions
284
+ maybe_persist_audio_response(context, detail_data)
285
+
283
286
  has_data = detail_data.values.any? { |v| v.present? && v != {} && v != [] }
284
287
  return unless has_data
285
288
 
@@ -376,6 +379,10 @@ module RubyLLM
376
379
  if global_config.persist_responses && context.output.respond_to?(:content)
377
380
  detail_data[:response] = serialize_response(context)
378
381
  end
382
+
383
+ # Persist audio data for Speaker executions
384
+ maybe_persist_audio_response(context, detail_data)
385
+
379
386
  data[:_detail_data] = detail_data
380
387
 
381
388
  data
@@ -463,6 +470,48 @@ module RubyLLM
463
470
  nil
464
471
  end
465
472
 
473
+ # Persists audio response data for Speaker executions
474
+ #
475
+ # When persist_audio_data is enabled and the output is a SpeechResult with
476
+ # audio binary data, stores a base64 data URI in the response column.
477
+ # Always stores audio_url if present (lightweight, no binary).
478
+ #
479
+ # @param context [Context] The execution context
480
+ # @param detail_data [Hash] The detail data hash to modify
481
# Adds audio fields to detail_data[:response] for SpeechResult outputs.
#
# - audio_url is always recorded when present (a plain string, no binary).
# - When persist_audio_data is enabled and the result carries audio, the
#   serialized audio payload is MERGED into any response hash already
#   present, so previously serialized response data is not discarded.
#
# Any error is logged via +error+ and swallowed; nothing is raised.
#
# @param context [Context] The execution context
# @param detail_data [Hash] The detail data hash to modify
def maybe_persist_audio_response(context, detail_data)
  output = context.output
  return unless output.is_a?(RubyLLM::Agents::SpeechResult)

  # Always persist audio_url if present (it's just a string, no binary)
  if output.audio_url.present?
    detail_data[:response] ||= {}
    detail_data[:response][:audio_url] = output.audio_url
  end

  # Persist full audio data URI only when opted in
  return unless global_config.respond_to?(:persist_audio_data) && global_config.persist_audio_data
  return unless output.audio.present?

  existing = detail_data[:response]
  existing = {} unless existing.is_a?(Hash)
  detail_data[:response] = existing.merge(serialize_audio_response(output))
rescue => e
  error("Failed to persist audio response: #{e.message}")
end
498
+
499
+ # Serializes a SpeechResult into a hash for the response column
500
+ #
501
+ # @param result [SpeechResult] The speech result to serialize
502
+ # @return [Hash] Serialized audio response data
503
# Serializes a SpeechResult into a hash for the response column.
#
# Keys whose value is nil are omitted. format and provider are stringified
# only when present: a bare +nil.to_s+ would store "", which survives
# +compact+ and, being truthy, would mask fallbacks that read
# `response["format"] || ...`.
#
# @param result [SpeechResult] The speech result to serialize
# @return [Hash] Serialized audio response data
def serialize_audio_response(result)
  {
    audio_data_uri: result.to_data_uri,
    audio_url: result.audio_url,
    format: result.format&.to_s,
    duration: result.duration,
    file_size: result.file_size,
    voice_id: result.voice_id,
    provider: result.provider&.to_s
  }.compact
end
514
+
466
515
  # Queues async logging via background job
467
516
  #
468
517
  # @param data [Hash] Execution data
@@ -29,17 +29,17 @@ module RubyLLM
29
29
  # @return [String, nil] Binary audio data
30
30
  attr_reader :audio
31
31
 
32
- # @!attribute [r] audio_url
32
+ # @!attribute [rw] audio_url
33
33
  # @return [String, nil] URL if audio was stored remotely
34
- attr_reader :audio_url
34
+ attr_accessor :audio_url
35
35
 
36
- # @!attribute [r] audio_key
36
+ # @!attribute [rw] audio_key
37
37
  # @return [String, nil] Storage key if stored
38
- attr_reader :audio_key
38
+ attr_accessor :audio_key
39
39
 
40
- # @!attribute [r] audio_path
40
+ # @!attribute [rw] audio_path
41
41
  # @return [String, nil] Local file path if saved
42
- attr_reader :audio_path
42
+ attr_accessor :audio_path
43
43
 
44
44
  # @!endgroup
45
45
 
@@ -308,29 +308,37 @@ module RubyLLM
308
308
  }
309
309
  end
310
310
 
311
- private
311
+ # Returns MIME type for the audio format
312
+ #
313
+ # @return [String] MIME type
314
def content_type
  mime_type_for_format
end

# Maps the result's +format+ to a MIME type.
#
# The format is compared as a string, so Symbol and String values behave the
# same (:ogg and "ogg" both give "audio/ogg"). Prefix checks come first so
# ElevenLabs native strings such as "mp3_44100_128" resolve by codec; they
# also cover the plain mp3/wav/opus/pcm names.
#
# @return [String] MIME type ("audio/mpeg" for nil or unrecognized formats)
def mime_type_for_format
  fmt = format.to_s

  # Codec-prefixed formats, including ElevenLabs native strings
  return "audio/mpeg" if fmt.start_with?("mp3")
  return "audio/wav" if fmt.start_with?("wav")
  return "audio/opus" if fmt.start_with?("opus")
  return "audio/pcm" if fmt.start_with?("pcm")
  return "audio/alaw" if fmt.start_with?("alaw")
  return "audio/basic" if fmt.start_with?("ulaw")

  # Remaining simple format names
  case fmt
  when "ogg" then "audio/ogg"
  when "flac" then "audio/flac"
  when "aac" then "audio/aac"
  else "audio/mpeg"
  end
end
336
344
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-agents
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - adham90
@@ -111,6 +111,7 @@ files:
111
111
  - app/views/ruby_llm/agents/dashboard/_action_center.html.erb
112
112
  - app/views/ruby_llm/agents/dashboard/_tenant_budget.html.erb
113
113
  - app/views/ruby_llm/agents/dashboard/index.html.erb
114
+ - app/views/ruby_llm/agents/executions/_audio_player.html.erb
114
115
  - app/views/ruby_llm/agents/executions/_execution.html.erb
115
116
  - app/views/ruby_llm/agents/executions/_filters.html.erb
116
117
  - app/views/ruby_llm/agents/executions/_list.html.erb
@@ -208,7 +209,9 @@ files:
208
209
  - lib/generators/ruby_llm_agents/upgrade_generator.rb
209
210
  - lib/ruby_llm-agents.rb
210
211
  - lib/ruby_llm/agents.rb
212
+ - lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb
211
213
  - lib/ruby_llm/agents/audio/speaker.rb
214
+ - lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
212
215
  - lib/ruby_llm/agents/audio/speech_client.rb
213
216
  - lib/ruby_llm/agents/audio/speech_pricing.rb
214
217
  - lib/ruby_llm/agents/audio/transcriber.rb