ruby_llm-agents 3.2.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/app/views/ruby_llm/agents/executions/_audio_player.html.erb +57 -0
- data/app/views/ruby_llm/agents/executions/show.html.erb +8 -0
- data/lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb +187 -0
- data/lib/ruby_llm/agents/audio/speaker/active_storage_support.rb +87 -0
- data/lib/ruby_llm/agents/audio/speaker.rb +48 -0
- data/lib/ruby_llm/agents/audio/speech_client.rb +26 -2
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +44 -3
- data/lib/ruby_llm/agents/audio/transcriber.rb +10 -0
- data/lib/ruby_llm/agents/core/configuration.rb +12 -1
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +49 -0
- data/lib/ruby_llm/agents/results/speech_result.rb +31 -23
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 82355e2a179ddaf2f5003b2cbd972f373b2ca49cdcc2847535aec89fb18ed046
|
|
4
|
+
data.tar.gz: '09656de02af43adafdfe2615d1bfcb67aee76602fd0699d0f739eda731f29d8d'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5c8b20da41f0f73b8fdbffb809cecc726f1e7e6030d8351c5b994c58192b8d18da7693fa8fadec603f8dfb29ab7dd40907877600f58af185ab9d5542a884dcf
|
|
7
|
+
data.tar.gz: b6c0c90038a87f2824ff52b0bedd901528e291748a18caff4fd2df403affd351bf7cdf05db3042e6daae05d602c672f90a9a97434e6f54f9834c337ebae1a607
|
data/README.md
CHANGED
|
@@ -135,7 +135,7 @@ result.save("logo.png")
|
|
|
135
135
|
| **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
|
|
136
136
|
| **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
|
|
137
137
|
| **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
|
|
138
|
-
| **Audio** | Text-to-speech (OpenAI, ElevenLabs)
|
|
138
|
+
| **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dynamic pricing, 28+ output formats, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
|
|
139
139
|
| **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
|
|
140
140
|
|
|
141
141
|
## Quick Start
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
<%
|
|
2
|
+
response = @execution.response || {}
|
|
3
|
+
audio_src = response["audio_url"] || response[:audio_url] ||
|
|
4
|
+
response["audio_data_uri"] || response[:audio_data_uri]
|
|
5
|
+
audio_format = response["format"] || response[:format] ||
|
|
6
|
+
@execution.metadata&.dig("audio_format")
|
|
7
|
+
audio_duration = response["duration"] || response[:duration] ||
|
|
8
|
+
@execution.metadata&.dig("audio_duration_seconds")
|
|
9
|
+
audio_file_size = response["file_size"] || response[:file_size] ||
|
|
10
|
+
@execution.metadata&.dig("audio_file_size_bytes")
|
|
11
|
+
audio_voice = response["voice_id"] || response[:voice_id] ||
|
|
12
|
+
@execution.metadata&.dig("voice_id")
|
|
13
|
+
audio_provider = response["provider"] || response[:provider] ||
|
|
14
|
+
@execution.metadata&.dig("audio_provider")
|
|
15
|
+
audio_characters = @execution.metadata&.dig("audio_characters")
|
|
16
|
+
%>
|
|
17
|
+
|
|
18
|
+
<div class="flex items-center gap-3 mt-6 mb-3">
|
|
19
|
+
<span class="text-[10px] font-medium text-gray-400 dark:text-gray-600 uppercase tracking-widest font-mono">audio</span>
|
|
20
|
+
<div class="flex-1 border-t border-gray-200 dark:border-gray-800"></div>
|
|
21
|
+
</div>
|
|
22
|
+
|
|
23
|
+
<% if audio_src.present? %>
|
|
24
|
+
<div class="mb-3">
|
|
25
|
+
<audio controls preload="metadata" class="w-full max-w-lg" style="height: 36px;">
|
|
26
|
+
<source src="<%= audio_src %>">
|
|
27
|
+
Your browser does not support the audio element.
|
|
28
|
+
</audio>
|
|
29
|
+
</div>
|
|
30
|
+
<% end %>
|
|
31
|
+
|
|
32
|
+
<div class="flex flex-wrap items-center gap-x-4 gap-y-1 font-mono text-xs text-gray-400 dark:text-gray-500">
|
|
33
|
+
<% if audio_duration.present? %>
|
|
34
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_duration.is_a?(Numeric) ? "#{audio_duration.round(1)}s" : audio_duration %></span> duration</span>
|
|
35
|
+
<% end %>
|
|
36
|
+
<% if audio_format.present? %>
|
|
37
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_format %></span> format</span>
|
|
38
|
+
<% end %>
|
|
39
|
+
<% if audio_file_size.present? %>
|
|
40
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_size(audio_file_size) %></span> size</span>
|
|
41
|
+
<% end %>
|
|
42
|
+
<% if audio_voice.present? %>
|
|
43
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_voice %></span> voice</span>
|
|
44
|
+
<% end %>
|
|
45
|
+
<% if audio_provider.present? %>
|
|
46
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_provider %></span> provider</span>
|
|
47
|
+
<% end %>
|
|
48
|
+
<% if audio_characters.present? %>
|
|
49
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_short(audio_characters) %></span> characters</span>
|
|
50
|
+
<% end %>
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
<% if audio_src.blank? %>
|
|
54
|
+
<p class="text-xs text-gray-400 dark:text-gray-600 font-mono mt-2 italic">
|
|
55
|
+
No audio data stored. Enable <code class="text-gray-500 dark:text-gray-400">persist_audio_data</code> in config to play back Speaker audio here.
|
|
56
|
+
</p>
|
|
57
|
+
<% end %>
|
|
@@ -57,6 +57,14 @@
|
|
|
57
57
|
<% end %>
|
|
58
58
|
</div>
|
|
59
59
|
|
|
60
|
+
<!-- ── audio player ──────────────────── -->
|
|
61
|
+
<% if @execution.agent_type.to_s.match?(/Speaker|Narrator|Transcriber/i) ||
|
|
62
|
+
@execution.metadata&.dig("audio_duration_seconds").present? ||
|
|
63
|
+
@execution.response&.dig("audio_data_uri").present? ||
|
|
64
|
+
@execution.response&.dig("audio_url").present? %>
|
|
65
|
+
<%= render "ruby_llm/agents/executions/audio_player" %>
|
|
66
|
+
<% end %>
|
|
67
|
+
|
|
60
68
|
<!-- ── tokens ──────────────────────── -->
|
|
61
69
|
<%
|
|
62
70
|
input_tokens = @execution.input_tokens || 0
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module RubyLLM
  module Agents
    module Audio
      module ElevenLabs
        # Fetches and caches ElevenLabs model data from the /v1/models API.
        #
        # Used for:
        # - Dynamic cost calculation via character_cost_multiplier
        # - Model validation (TTS vs STS capability)
        # - Capability awareness (style, speaker_boost, max chars, languages)
        #
        # The model list is cached in memory and re-fetched once it is older
        # than +configuration.elevenlabs_models_cache_ttl+ seconds (21_600 when
        # that setting is nil). A failed fetch keeps serving the previous list
        # (or an empty array) until the TTL elapses again.
        #
        # @example Check if a model supports TTS
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_v3")             # => true
        #   ElevenLabs::ModelRegistry.tts_model?("eleven_english_sts_v2") # => false
        #
        # @example Get cost multiplier
        #   ElevenLabs::ModelRegistry.cost_multiplier("eleven_flash_v2_5") # => 0.5
        #
        module ModelRegistry
          extend self

          # Single lock guarding @models/@fetched_at. It is created eagerly at
          # load time: a lazily initialised `@mutex ||= Mutex.new` can hand two
          # concurrent first callers two different locks.
          CACHE_LOCK = Mutex.new
          private_constant :CACHE_LOCK

          # Returns all models from the ElevenLabs API (cached)
          #
          # @return [Array<Hash>] Array of model hashes
          def models
            CACHE_LOCK.synchronize do
              next @models if @models && !cache_expired?

              reload_models
            end
          end

          # Find a specific model by ID
          #
          # @param model_id [String] The model identifier
          # @return [Hash, nil] Model hash or nil if not found
          def find(model_id)
            wanted = model_id.to_s
            models.find { |m| m["model_id"] == wanted }
          end

          # Check if model supports text-to-speech
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def tts_model?(model_id)
            find(model_id)&.dig("can_do_text_to_speech") == true
          end

          # Get character_cost_multiplier for a model
          #
          # @param model_id [String] The model identifier
          # @return [Float] Cost multiplier (defaults to 1.0 for unknown models)
          def cost_multiplier(model_id)
            find(model_id)&.dig("model_rates", "character_cost_multiplier") || 1.0
          end

          # Get max characters per request for a model
          #
          # @param model_id [String] The model identifier
          # @return [Integer, nil] Max characters or nil if unknown
          def max_characters(model_id)
            find(model_id)&.dig("maximum_text_length_per_request")
          end

          # Get supported language IDs for a model
          #
          # @param model_id [String] The model identifier
          # @return [Array<String>] Language IDs (e.g. ["en", "es", "ja"])
          def languages(model_id)
            find(model_id)&.dig("languages")&.map { |l| l["language_id"] } || []
          end

          # Check if model supports the style voice setting
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_style?(model_id)
            find(model_id)&.dig("can_use_style") == true
          end

          # Check if model supports the speaker_boost setting
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def supports_speaker_boost?(model_id)
            find(model_id)&.dig("can_use_speaker_boost") == true
          end

          # Check if model supports voice conversion (speech-to-speech)
          # Used by VoiceConverter agent (see plans/elevenlabs_voice_converter.md)
          #
          # @param model_id [String] The model identifier
          # @return [Boolean]
          def voice_conversion_model?(model_id)
            find(model_id)&.dig("can_do_voice_conversion") == true
          end

          # Force refresh the cache
          #
          # Clearing and re-fetching happen under one lock acquisition so no
          # other thread can slip a fetch of its own in between.
          #
          # @return [Array<Hash>] Fresh model data
          def refresh!
            CACHE_LOCK.synchronize do
              reset_cache
              reload_models
            end
          end

          # Clear cache without re-fetching (useful for tests)
          #
          # @return [void]
          def clear_cache!
            CACHE_LOCK.synchronize { reset_cache }
          end

          private

          # Fetches the list and stamps the cache. Caller must hold CACHE_LOCK.
          def reload_models
            @models = fetch_models
            @fetched_at = Time.now
            @models
          end

          # Drops the cached list and its timestamp. Caller must hold CACHE_LOCK.
          def reset_cache
            @models = nil
            @fetched_at = nil
          end

          # Performs the HTTP request. Never raises for transport or parse
          # errors: it warns and falls back to the previous list (or []).
          def fetch_models
            # A nil or blank key cannot authenticate, so skip the request.
            return [] if api_key.to_s.strip.empty?

            response = connection.get("/v1/models")

            if response.success?
              parsed = JSON.parse(response.body)
              parsed.is_a?(Array) ? parsed : []
            else
              warn "[RubyLLM::Agents] ElevenLabs /v1/models returned HTTP #{response.status}"
              @models || []
            end
          rescue Faraday::Error, JSON::ParserError => e
            warn "[RubyLLM::Agents] Failed to fetch ElevenLabs models: #{e.message}"
            @models || []
          end

          # True when nothing has been fetched yet or the TTL has elapsed.
          def cache_expired?
            return true unless @fetched_at

            ttl = RubyLLM::Agents.configuration.elevenlabs_models_cache_ttl || 21_600
            Time.now - @fetched_at > ttl
          end

          def api_key
            RubyLLM::Agents.configuration.elevenlabs_api_key
          end

          # Configured API base URL, or the public ElevenLabs endpoint when unset/empty.
          def api_base
            base = RubyLLM::Agents.configuration.elevenlabs_api_base
            (base && !base.empty?) ? base : "https://api.elevenlabs.io"
          end

          # Builds a Faraday connection with the xi-api-key header and
          # 10s read / 5s open timeouts.
          def connection
            Faraday.new(url: api_base) do |f|
              f.headers["xi-api-key"] = api_key
              f.adapter Faraday.default_adapter
              f.options.timeout = 10
              f.options.open_timeout = 5
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
  module Agents
    class Speaker
      # ActiveStorage integration for speakers
      #
      # Provides convenience methods for generating audio and directly
      # attaching it to ActiveStorage attachments.
      #
      # @example Attaching to a model
      #   class Article < ApplicationRecord
      #     has_one_attached :narration
      #   end
      #
      #   class ArticleNarrator < RubyLLM::Agents::Speaker
      #     include RubyLLM::Agents::Speaker::ActiveStorageSupport
      #
      #     provider :openai
      #     model 'tts-1-hd'
      #     voice 'nova'
      #   end
      #
      #   article = Article.find(1)
      #   result = ArticleNarrator.speak_and_attach(
      #     text: article.body,
      #     record: article,
      #     attachment_name: :narration
      #   )
      #
      module ActiveStorageSupport
        extend ActiveSupport::Concern

        class_methods do
          # Generate audio and attach it to a record
          #
          # On an unsuccessful result nothing is attached and the result is
          # returned unchanged.
          #
          # @param text [String] Text to convert to speech
          # @param record [ActiveRecord::Base] The record to attach to
          # @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
          # @param options [Hash] Additional options for generation; +:filename+
          #   overrides the generated attachment filename
          # @return [SpeechResult] The speech result with audio_url set
          def speak_and_attach(text:, record:, attachment_name:, **options)
            # NOTE(review): +options+ (including :filename) is forwarded to
            # +call+ as-is — confirm +call+ tolerates the extra key.
            result = call(text: text, **options)

            return result unless result.success?

            attach_audio_to_record(result, record, attachment_name, options)

            result
          end

          private

          # Attaches the result's audio bytes to +record.<attachment_name>+ and,
          # when a blob is available afterwards, copies its key and URL onto
          # the result.
          def attach_audio_to_record(result, record, attachment_name, options)
            attachment = record.public_send(attachment_name)
            filename = options[:filename] || generate_audio_filename(result)

            attachment.attach(
              io: StringIO.new(result.audio),
              filename: filename,
              content_type: result.content_type
            )

            return unless attachment.respond_to?(:blob) && attachment.blob

            result.audio_key = attachment.blob.key
            result.audio_url = blob_url(attachment)
          end

          # Best-effort URL lookup: prefers +blob.url+, falls back to
          # +blob.service_url+, and returns nil if neither exists or raises.
          # NOTE(review): presumably URL generation can raise when no URL
          # options are configured for the storage service — confirm.
          def blob_url(attachment)
            if attachment.blob.respond_to?(:url)
              attachment.blob.url
            elsif attachment.blob.respond_to?(:service_url)
              attachment.blob.service_url
            end
          rescue => _e
            nil
          end

          # Builds "speech_<unix timestamp>.<ext>".
          #
          # The format may be a simple name (:mp3) or an ElevenLabs native
          # string such as "mp3_44100_128". Only the part before the first
          # underscore is a usable file extension, so the rate/bitrate
          # suffix is dropped. Falls back to "mp3" when the format is blank.
          def generate_audio_filename(result)
            ext = result.format.to_s.split("_").first
            ext = "mp3" if ext.nil? || ext.empty?

            "speech_#{Time.current.to_i}.#{ext}"
          end
        end
      end
    end
  end
end
|
|
@@ -4,6 +4,7 @@ require "digest"
|
|
|
4
4
|
require_relative "../results/speech_result"
|
|
5
5
|
require_relative "speech_client"
|
|
6
6
|
require_relative "speech_pricing"
|
|
7
|
+
require_relative "elevenlabs/model_registry"
|
|
7
8
|
|
|
8
9
|
module RubyLLM
|
|
9
10
|
module Agents
|
|
@@ -336,6 +337,14 @@ module RubyLLM
|
|
|
336
337
|
context.output_tokens = 0
|
|
337
338
|
context.total_cost = calculate_cost(result)
|
|
338
339
|
|
|
340
|
+
# Store audio-specific metadata for execution tracking
|
|
341
|
+
context[:provider] = result[:provider].to_s
|
|
342
|
+
context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
|
|
343
|
+
context[:characters] = result[:characters]
|
|
344
|
+
context[:output_format] = result[:format].to_s
|
|
345
|
+
context[:file_size] = result[:audio]&.bytesize
|
|
346
|
+
context[:audio_duration_seconds] = result[:duration] if result[:duration]
|
|
347
|
+
|
|
339
348
|
# Build final result
|
|
340
349
|
context.output = build_result(
|
|
341
350
|
result,
|
|
@@ -401,6 +410,7 @@ module RubyLLM
|
|
|
401
410
|
|
|
402
411
|
# Executes speech synthesis
|
|
403
412
|
def execute_speech(processed_text)
|
|
413
|
+
validate_elevenlabs_model!(processed_text)
|
|
404
414
|
speak_options = build_speak_options
|
|
405
415
|
|
|
406
416
|
if streaming_enabled? && @streaming_block
|
|
@@ -410,6 +420,42 @@ module RubyLLM
|
|
|
410
420
|
end
|
|
411
421
|
end
|
|
412
422
|
|
|
423
|
+
# Validates ElevenLabs model capabilities before calling the API.
|
|
424
|
+
# Raises on hard errors (non-TTS model), warns on soft issues.
|
|
425
|
+
def validate_elevenlabs_model!(text)
  # Only ElevenLabs requests are validated, and only when the registry is loaded.
  return if resolved_provider != :elevenlabs
  return unless defined?(Audio::ElevenLabs::ModelRegistry)

  model_id = resolved_model
  capabilities = Audio::ElevenLabs::ModelRegistry.find(model_id)
  # The registry does not know this model, so there is nothing to check against.
  return unless capabilities

  # Hard failure: the model cannot do text-to-speech at all.
  if capabilities["can_do_text_to_speech"] != true
    raise ConfigurationError,
      "ElevenLabs model '#{model_id}' does not support text-to-speech. " \
      "It may be a speech-to-speech model. Use a TTS-capable model like 'eleven_v3'."
  end

  # Soft issue: the text is longer than the model's advertised per-request limit.
  limit = capabilities["maximum_text_length_per_request"]
  if limit && text.length > limit
    warn "[RubyLLM::Agents] Text length (#{text.length}) exceeds " \
      "#{model_id} max of #{limit} characters. The API may truncate or reject it."
  end

  # Soft issue: a non-zero style is configured for a model without style support.
  settings = self.class.voice_settings_config
  style = settings ? settings.style_value : nil
  if style && style > 0 && capabilities["can_use_style"] != true
    warn "[RubyLLM::Agents] Model '#{model_id}' does not support the 'style' voice setting. It will be ignored."
  end
rescue ConfigurationError
  # The hard failure above must reach the caller.
  raise
rescue => e
  # Any other problem during validation is reported but never blocks speech.
  warn "[RubyLLM::Agents] ElevenLabs model validation failed: #{e.message}"
end
|
|
458
|
+
|
|
413
459
|
# Executes standard (non-streaming) speech synthesis
|
|
414
460
|
def execute_standard_speech(text, options)
|
|
415
461
|
response = speech_client.speak(
|
|
@@ -559,3 +605,5 @@ module RubyLLM
|
|
|
559
605
|
end
|
|
560
606
|
end
|
|
561
607
|
end
|
|
608
|
+
|
|
609
|
+
require_relative "speaker/active_storage_support"
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "faraday"
|
|
4
4
|
require "json"
|
|
5
|
+
require "set"
|
|
5
6
|
|
|
6
7
|
module RubyLLM
|
|
7
8
|
module Agents
|
|
@@ -266,14 +267,37 @@ module RubyLLM
|
|
|
266
267
|
body
|
|
267
268
|
end
|
|
268
269
|
|
|
270
|
+
# Convenience mapping: simple symbol → ElevenLabs native format string
|
|
269
271
|
ELEVENLABS_FORMAT_MAP = {
|
|
270
272
|
"mp3" => "mp3_44100_128",
|
|
271
|
-
"
|
|
273
|
+
"wav" => "wav_44100",
|
|
274
|
+
"ogg" => "mp3_44100_128", # ElevenLabs doesn't support ogg; fallback to mp3
|
|
275
|
+
"pcm" => "pcm_24000",
|
|
276
|
+
"opus" => "opus_48000_128",
|
|
277
|
+
"flac" => "mp3_44100_128", # ElevenLabs doesn't support flac; fallback to mp3
|
|
278
|
+
"aac" => "mp3_44100_128", # ElevenLabs doesn't support aac; fallback to mp3
|
|
279
|
+
"alaw" => "alaw_8000",
|
|
272
280
|
"ulaw" => "ulaw_8000"
|
|
273
281
|
}.freeze
|
|
274
282
|
|
|
283
|
+
# All valid ElevenLabs native format strings (pass-through)
|
|
284
|
+
ELEVENLABS_NATIVE_FORMATS = Set.new(%w[
|
|
285
|
+
mp3_22050_32 mp3_24000_48 mp3_44100_32 mp3_44100_64
|
|
286
|
+
mp3_44100_96 mp3_44100_128 mp3_44100_192
|
|
287
|
+
pcm_8000 pcm_16000 pcm_22050 pcm_24000 pcm_32000 pcm_44100 pcm_48000
|
|
288
|
+
wav_8000 wav_16000 wav_22050 wav_24000 wav_32000 wav_44100 wav_48000
|
|
289
|
+
opus_48000_32 opus_48000_64 opus_48000_96 opus_48000_128 opus_48000_192
|
|
290
|
+
alaw_8000 ulaw_8000
|
|
291
|
+
]).freeze
|
|
292
|
+
|
|
275
293
|
# Resolves a requested format to the string sent to ElevenLabs.
#
# Native ElevenLabs format strings are returned untouched. Simple names are
# translated through ELEVENLABS_FORMAT_MAP. Anything else falls back to
# "mp3_44100_128".
def elevenlabs_output_format(format)
  requested = format.to_s

  if ELEVENLABS_NATIVE_FORMATS.include?(requested)
    requested
  else
    ELEVENLABS_FORMAT_MAP.fetch(requested, "mp3_44100_128")
  end
end
|
|
278
302
|
|
|
279
303
|
def elevenlabs_connection
|
|
@@ -8,10 +8,11 @@ module RubyLLM
|
|
|
8
8
|
module Audio
|
|
9
9
|
# Dynamic pricing resolution for text-to-speech models.
|
|
10
10
|
#
|
|
11
|
-
# Uses
|
|
11
|
+
# Uses a four-tier pricing cascade:
|
|
12
12
|
# 1. LiteLLM JSON (primary) - future-proof, auto-updating
|
|
13
13
|
# 2. Configurable pricing table - user overrides via config.tts_model_pricing
|
|
14
|
-
# 3.
|
|
14
|
+
# 3. ElevenLabs API - dynamic multiplier × base rate from /v1/models
|
|
15
|
+
# 4. Hardcoded fallbacks - per-model defaults
|
|
15
16
|
#
|
|
16
17
|
# All prices are per 1,000 characters.
|
|
17
18
|
#
|
|
@@ -50,14 +51,22 @@ module RubyLLM
|
|
|
50
51
|
# @param model_id [String] Model identifier
|
|
51
52
|
# @return [Float] Cost per 1K characters in USD
|
|
52
53
|
def cost_per_1k_characters(provider, model_id)
  # Tiers are tried in order and the first one that yields a price wins:
  #   1. LiteLLM
  #   2. User config overrides
  #   3. ElevenLabs API multiplier × base rate (ElevenLabs provider only)
  #   4. Hardcoded fallbacks
  from_litellm(model_id) ||
    from_config(model_id) ||
    (provider == :elevenlabs && from_elevenlabs_api(model_id)) ||
    fallback_price(provider, model_id)
end
|
|
63
72
|
|
|
@@ -73,6 +82,7 @@ module RubyLLM
|
|
|
73
82
|
{
|
|
74
83
|
litellm: litellm_tts_models,
|
|
75
84
|
configured: config.tts_model_pricing || {},
|
|
85
|
+
elevenlabs_api: elevenlabs_api_pricing,
|
|
76
86
|
fallbacks: fallback_pricing_table
|
|
77
87
|
}
|
|
78
88
|
end
|
|
@@ -190,6 +200,19 @@ module RubyLLM
|
|
|
190
200
|
end
|
|
191
201
|
end
|
|
192
202
|
|
|
203
|
+
# Builds a { model_id => price per 1K characters } table from the ElevenLabs
# model registry. Each price is the configured base rate (0.30 when unset)
# times the model's character_cost_multiplier (1.0 when absent), rounded to
# six decimals. Returns {} when the registry is not loaded or anything raises.
def elevenlabs_api_pricing
  return {} unless defined?(ElevenLabs::ModelRegistry)

  base_rate = config.elevenlabs_base_cost_per_1k || 0.30
  ElevenLabs::ModelRegistry.models.to_h do |entry|
    factor = entry.dig("model_rates", "character_cost_multiplier") || 1.0
    [entry["model_id"], (base_rate * factor).round(6)]
  end
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  {}
end
|
|
215
|
+
|
|
193
216
|
# ============================================================
|
|
194
217
|
# Tier 2: User configuration
|
|
195
218
|
# ============================================================
|
|
@@ -207,7 +230,25 @@ module RubyLLM
|
|
|
207
230
|
end
|
|
208
231
|
|
|
209
232
|
# ============================================================
|
|
210
|
-
# Tier 3:
|
|
233
|
+
# Tier 3: ElevenLabs API (dynamic multiplier × base rate)
|
|
234
|
+
# ============================================================
|
|
235
|
+
|
|
236
|
+
# Prices one ElevenLabs model from registry data: configured base rate
# (0.30 when unset) × the model's character_cost_multiplier (1.0 when absent),
# rounded to six decimals. Returns nil when the registry is not loaded, the
# model is unknown, or anything raises.
def from_elevenlabs_api(model_id)
  return nil unless defined?(ElevenLabs::ModelRegistry)

  entry = ElevenLabs::ModelRegistry.find(model_id)
  return nil unless entry

  factor = entry.dig("model_rates", "character_cost_multiplier") || 1.0
  base_rate = config.elevenlabs_base_cost_per_1k || 0.30
  (base_rate * factor).round(6)
rescue => e
  warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
  nil
end
|
|
249
|
+
|
|
250
|
+
# ============================================================
|
|
251
|
+
# Tier 4: Hardcoded fallbacks
|
|
211
252
|
# ============================================================
|
|
212
253
|
|
|
213
254
|
def fallback_price(provider, model_id)
|
|
@@ -318,6 +318,16 @@ module RubyLLM
|
|
|
318
318
|
context.output_tokens = 0
|
|
319
319
|
context.total_cost = calculate_cost(raw_result)
|
|
320
320
|
|
|
321
|
+
# Store transcription-specific metadata for execution tracking
|
|
322
|
+
context[:language] = resolved_language if resolved_language
|
|
323
|
+
context[:detected_language] = raw_result[:language] if raw_result[:language]
|
|
324
|
+
context[:audio_duration_seconds] = raw_result[:duration] if raw_result[:duration]
|
|
325
|
+
context[:audio_minutes] = (raw_result[:duration] / 60.0).round(4) if raw_result[:duration]
|
|
326
|
+
context[:output_format] = self.class.output_format.to_s
|
|
327
|
+
context[:timestamp_granularity] = self.class.include_timestamps.to_s
|
|
328
|
+
context[:segment_count] = raw_result[:segments]&.size if raw_result[:segments]
|
|
329
|
+
context[:word_count] = raw_result[:text]&.split(/\s+/)&.size if raw_result[:text]
|
|
330
|
+
|
|
321
331
|
# Build final result
|
|
322
332
|
context.output = build_result(
|
|
323
333
|
raw_result,
|
|
@@ -452,7 +452,10 @@ module RubyLLM
|
|
|
452
452
|
:root_directory,
|
|
453
453
|
:root_namespace,
|
|
454
454
|
:tool_result_max_length,
|
|
455
|
-
:redaction
|
|
455
|
+
:redaction,
|
|
456
|
+
:persist_audio_data,
|
|
457
|
+
:elevenlabs_base_cost_per_1k,
|
|
458
|
+
:elevenlabs_models_cache_ttl
|
|
456
459
|
|
|
457
460
|
# Attributes with validation (readers only, custom setters below)
|
|
458
461
|
attr_reader :default_temperature,
|
|
@@ -734,6 +737,14 @@ module RubyLLM
|
|
|
734
737
|
|
|
735
738
|
# Redaction defaults (disabled by default)
|
|
736
739
|
@redaction = nil
|
|
740
|
+
|
|
741
|
+
# Audio data persistence (disabled by default — base64 audio can be large)
|
|
742
|
+
@persist_audio_data = false
|
|
743
|
+
|
|
744
|
+
# ElevenLabs dynamic pricing: base cost per 1K characters (Pro plan overage rate)
|
|
745
|
+
@elevenlabs_base_cost_per_1k = 0.30
|
|
746
|
+
# ElevenLabs models cache TTL in seconds (6 hours)
|
|
747
|
+
@elevenlabs_models_cache_ttl = 21_600
|
|
737
748
|
end
|
|
738
749
|
|
|
739
750
|
# Returns the configured cache store, falling back to Rails.cache
|
|
@@ -280,6 +280,9 @@ module RubyLLM
|
|
|
280
280
|
detail_data[:response] = serialize_response(context)
|
|
281
281
|
end
|
|
282
282
|
|
|
283
|
+
# Persist audio data for Speaker executions
|
|
284
|
+
maybe_persist_audio_response(context, detail_data)
|
|
285
|
+
|
|
283
286
|
has_data = detail_data.values.any? { |v| v.present? && v != {} && v != [] }
|
|
284
287
|
return unless has_data
|
|
285
288
|
|
|
@@ -376,6 +379,10 @@ module RubyLLM
|
|
|
376
379
|
if global_config.persist_responses && context.output.respond_to?(:content)
|
|
377
380
|
detail_data[:response] = serialize_response(context)
|
|
378
381
|
end
|
|
382
|
+
|
|
383
|
+
# Persist audio data for Speaker executions
|
|
384
|
+
maybe_persist_audio_response(context, detail_data)
|
|
385
|
+
|
|
379
386
|
data[:_detail_data] = detail_data
|
|
380
387
|
|
|
381
388
|
data
|
|
@@ -463,6 +470,48 @@ module RubyLLM
|
|
|
463
470
|
nil
|
|
464
471
|
end
|
|
465
472
|
|
|
473
|
+
# Persists audio response data for Speaker executions
|
|
474
|
+
#
|
|
475
|
+
# When persist_audio_data is enabled and the output is a SpeechResult with
|
|
476
|
+
# audio binary data, stores a base64 data URI in the response column.
|
|
477
|
+
# Always stores audio_url if present (lightweight, no binary).
|
|
478
|
+
#
|
|
479
|
+
# @param context [Context] The execution context
|
|
480
|
+
# @param detail_data [Hash] The detail data hash to modify
|
|
481
|
+
def maybe_persist_audio_response(context, detail_data)
  speech = context.output
  return unless speech.is_a?(RubyLLM::Agents::SpeechResult)

  # The URL is a short string, so it is recorded regardless of the opt-in below.
  url = speech.audio_url
  (detail_data[:response] ||= {})[:audio_url] = url if url.present?

  # The full data URI carries the audio bytes, so it needs an explicit opt-in
  # and actual audio to serialize.
  opted_in = global_config.respond_to?(:persist_audio_data) && global_config.persist_audio_data
  return unless opted_in && speech.audio.present?

  detail_data[:response] = serialize_audio_response(speech)
rescue => e
  # Failures here are logged, not raised.
  error("Failed to persist audio response: #{e.message}")
end
|
|
498
|
+
|
|
499
|
+
# Serializes a SpeechResult into a hash for the response column
|
|
500
|
+
#
|
|
501
|
+
# @param result [SpeechResult] The speech result to serialize
|
|
502
|
+
# @return [Hash] Serialized audio response data
|
|
503
|
+
# Serializes a speech result into a hash for the response column.
#
# Keys whose value is nil are omitted. +format+ and +provider+ are converted
# with safe navigation: a plain +nil.to_s+ would produce "" and survive
# +compact+, and a stored "" is truthy, so readers using
# +response["format"] || fallback+ would never reach their fallback.
#
# @param result [SpeechResult] The speech result to serialize
# @return [Hash] Serialized audio response data (symbol keys, no nil values)
def serialize_audio_response(result)
  {
    audio_data_uri: result.to_data_uri,
    audio_url: result.audio_url,
    format: result.format&.to_s,
    duration: result.duration,
    file_size: result.file_size,
    voice_id: result.voice_id,
    provider: result.provider&.to_s
  }.compact
end
|
|
514
|
+
|
|
466
515
|
# Queues async logging via background job
|
|
467
516
|
#
|
|
468
517
|
# @param data [Hash] Execution data
|
|
@@ -29,17 +29,17 @@ module RubyLLM
|
|
|
29
29
|
# @return [String, nil] Binary audio data
|
|
30
30
|
attr_reader :audio
|
|
31
31
|
|
|
32
|
-
# @!attribute [
|
|
32
|
+
# @!attribute [rw] audio_url
|
|
33
33
|
# @return [String, nil] URL if audio was stored remotely
|
|
34
|
-
|
|
34
|
+
attr_accessor :audio_url
|
|
35
35
|
|
|
36
|
-
# @!attribute [
|
|
36
|
+
# @!attribute [rw] audio_key
|
|
37
37
|
# @return [String, nil] Storage key if stored
|
|
38
|
-
|
|
38
|
+
attr_accessor :audio_key
|
|
39
39
|
|
|
40
|
-
# @!attribute [
|
|
40
|
+
# @!attribute [rw] audio_path
|
|
41
41
|
# @return [String, nil] Local file path if saved
|
|
42
|
-
|
|
42
|
+
attr_accessor :audio_path
|
|
43
43
|
|
|
44
44
|
# @!endgroup
|
|
45
45
|
|
|
@@ -308,29 +308,37 @@ module RubyLLM
|
|
|
308
308
|
}
|
|
309
309
|
end
|
|
310
310
|
|
|
311
|
-
|
|
311
|
+
# Returns MIME type for the audio format
|
|
312
|
+
#
|
|
313
|
+
# @return [String] MIME type
|
|
314
|
+
def content_type
|
|
315
|
+
mime_type_for_format
|
|
316
|
+
end
|
|
312
317
|
|
|
313
318
|
# Returns MIME type for the audio format
|
|
314
319
|
#
|
|
315
320
|
# @return [String] MIME type
|
|
316
321
|
# Maps the result's +format+ to a MIME type.
#
# The format is compared as a String, so Symbols (:ogg) and Strings ("ogg")
# resolve identically. Comparing the raw value against Symbols only would
# send String formats such as "ogg", "flac" and "aac" to the mp3 default.
#
# @return [String] MIME type ("audio/mpeg" for unknown or missing formats)
def mime_type_for_format
  fmt = format.to_s

  # Prefix checks cover both simple names ("mp3") and ElevenLabs native
  # format strings (e.g., "mp3_44100_128").
  return "audio/mpeg" if fmt.start_with?("mp3")
  return "audio/wav" if fmt.start_with?("wav")
  return "audio/opus" if fmt.start_with?("opus")
  return "audio/pcm" if fmt.start_with?("pcm")
  return "audio/alaw" if fmt.start_with?("alaw")
  return "audio/basic" if fmt.start_with?("ulaw")

  # Remaining simple formats; none of them has a native ElevenLabs form.
  case fmt
  when "ogg" then "audio/ogg"
  when "flac" then "audio/flac"
  when "aac" then "audio/aac"
  else "audio/mpeg"
  end
end
|
|
336
344
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-agents
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.
|
|
4
|
+
version: 3.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- adham90
|
|
@@ -111,6 +111,7 @@ files:
|
|
|
111
111
|
- app/views/ruby_llm/agents/dashboard/_action_center.html.erb
|
|
112
112
|
- app/views/ruby_llm/agents/dashboard/_tenant_budget.html.erb
|
|
113
113
|
- app/views/ruby_llm/agents/dashboard/index.html.erb
|
|
114
|
+
- app/views/ruby_llm/agents/executions/_audio_player.html.erb
|
|
114
115
|
- app/views/ruby_llm/agents/executions/_execution.html.erb
|
|
115
116
|
- app/views/ruby_llm/agents/executions/_filters.html.erb
|
|
116
117
|
- app/views/ruby_llm/agents/executions/_list.html.erb
|
|
@@ -208,7 +209,9 @@ files:
|
|
|
208
209
|
- lib/generators/ruby_llm_agents/upgrade_generator.rb
|
|
209
210
|
- lib/ruby_llm-agents.rb
|
|
210
211
|
- lib/ruby_llm/agents.rb
|
|
212
|
+
- lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb
|
|
211
213
|
- lib/ruby_llm/agents/audio/speaker.rb
|
|
214
|
+
- lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
|
|
212
215
|
- lib/ruby_llm/agents/audio/speech_client.rb
|
|
213
216
|
- lib/ruby_llm/agents/audio/speech_pricing.rb
|
|
214
217
|
- lib/ruby_llm/agents/audio/transcriber.rb
|