ruby_llm-agents 3.1.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/app/controllers/ruby_llm/agents/agents_controller.rb +16 -14
- data/app/controllers/ruby_llm/agents/dashboard_controller.rb +20 -20
- data/app/controllers/ruby_llm/agents/executions_controller.rb +5 -7
- data/app/helpers/ruby_llm/agents/application_helper.rb +57 -58
- data/app/models/ruby_llm/agents/execution/analytics.rb +27 -27
- data/app/models/ruby_llm/agents/execution/scopes.rb +4 -6
- data/app/models/ruby_llm/agents/execution.rb +25 -25
- data/app/models/ruby_llm/agents/tenant/budgetable.rb +16 -10
- data/app/models/ruby_llm/agents/tenant/resettable.rb +12 -12
- data/app/models/ruby_llm/agents/tenant/trackable.rb +7 -7
- data/app/services/ruby_llm/agents/agent_registry.rb +6 -6
- data/app/views/ruby_llm/agents/executions/_audio_player.html.erb +57 -0
- data/app/views/ruby_llm/agents/executions/show.html.erb +8 -0
- data/lib/generators/ruby_llm_agents/agent_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/background_remover_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/embedder_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +7 -7
- data/lib/generators/ruby_llm_agents/image_editor_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_generator_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +9 -9
- data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_variator_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/install_generator.rb +3 -3
- data/lib/generators/ruby_llm_agents/migrate_structure_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb +2 -2
- data/lib/generators/ruby_llm_agents/restructure_generator.rb +13 -13
- data/lib/generators/ruby_llm_agents/speaker_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/transcriber_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/upgrade_generator.rb +2 -2
- data/lib/ruby_llm/agents/audio/speaker/active_storage_support.rb +87 -0
- data/lib/ruby_llm/agents/audio/speaker.rb +50 -31
- data/lib/ruby_llm/agents/audio/speech_client.rb +328 -0
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +273 -0
- data/lib/ruby_llm/agents/audio/transcriber.rb +43 -33
- data/lib/ruby_llm/agents/base_agent.rb +14 -14
- data/lib/ruby_llm/agents/core/base/callbacks.rb +3 -3
- data/lib/ruby_llm/agents/core/configuration.rb +90 -73
- data/lib/ruby_llm/agents/core/errors.rb +27 -2
- data/lib/ruby_llm/agents/core/instrumentation.rb +64 -66
- data/lib/ruby_llm/agents/core/llm_tenant.rb +7 -7
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/dsl/base.rb +3 -3
- data/lib/ruby_llm/agents/dsl/reliability.rb +9 -9
- data/lib/ruby_llm/agents/image/analyzer/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/analyzer/execution.rb +4 -4
- data/lib/ruby_llm/agents/image/background_remover/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/background_remover/execution.rb +3 -3
- data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +8 -8
- data/lib/ruby_llm/agents/image/editor/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/generator/pricing.rb +9 -10
- data/lib/ruby_llm/agents/image/generator.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/dsl.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/execution.rb +9 -9
- data/lib/ruby_llm/agents/image/pipeline.rb +1 -1
- data/lib/ruby_llm/agents/image/transformer/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/execution.rb +3 -5
- data/lib/ruby_llm/agents/image/variator/execution.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/alert_manager.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +9 -9
- data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +3 -3
- data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +17 -17
- data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +1 -0
- data/lib/ruby_llm/agents/infrastructure/execution_logger_job.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/reliability.rb +6 -6
- data/lib/ruby_llm/agents/pipeline/builder.rb +11 -11
- data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +3 -3
- data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +4 -4
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +83 -22
- data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +2 -3
- data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +7 -7
- data/lib/ruby_llm/agents/results/background_removal_result.rb +6 -6
- data/lib/ruby_llm/agents/results/embedding_result.rb +15 -15
- data/lib/ruby_llm/agents/results/image_analysis_result.rb +7 -7
- data/lib/ruby_llm/agents/results/image_edit_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_generation_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_pipeline_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_transform_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_upscale_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_variation_result.rb +4 -4
- data/lib/ruby_llm/agents/results/speech_result.rb +12 -7
- data/lib/ruby_llm/agents/results/transcription_result.rb +1 -1
- data/lib/ruby_llm/agents/text/embedder.rb +13 -13
- metadata +5 -1
|
@@ -102,7 +102,7 @@ module RubyLlmAgents
|
|
|
102
102
|
|
|
103
103
|
def table_exists?(table)
|
|
104
104
|
ActiveRecord::Base.connection.table_exists?(table)
|
|
105
|
-
rescue
|
|
105
|
+
rescue
|
|
106
106
|
false
|
|
107
107
|
end
|
|
108
108
|
|
|
@@ -110,7 +110,7 @@ module RubyLlmAgents
|
|
|
110
110
|
return false unless ActiveRecord::Base.connection.table_exists?(table)
|
|
111
111
|
|
|
112
112
|
ActiveRecord::Base.connection.column_exists?(table, column)
|
|
113
|
-
rescue
|
|
113
|
+
rescue
|
|
114
114
|
false
|
|
115
115
|
end
|
|
116
116
|
end
|
|
@@ -37,25 +37,25 @@ module RubyLlmAgents
|
|
|
37
37
|
# Maps old directory -> { category:, type: }
|
|
38
38
|
DIRECTORY_MAPPING = {
|
|
39
39
|
# Top-level under llm/
|
|
40
|
-
"agents" => {
|
|
41
|
-
"tools" => {
|
|
40
|
+
"agents" => {category: nil, type: "agents"},
|
|
41
|
+
"tools" => {category: nil, type: "tools"},
|
|
42
42
|
|
|
43
43
|
# Audio group
|
|
44
|
-
"speakers" => {
|
|
45
|
-
"transcribers" => {
|
|
44
|
+
"speakers" => {category: :audio, type: "speakers"},
|
|
45
|
+
"transcribers" => {category: :audio, type: "transcribers"},
|
|
46
46
|
|
|
47
47
|
# Image group
|
|
48
|
-
"image_generators" => {
|
|
49
|
-
"image_editors" => {
|
|
50
|
-
"image_analyzers" => {
|
|
51
|
-
"image_transformers" => {
|
|
52
|
-
"image_upscalers" => {
|
|
53
|
-
"image_variators" => {
|
|
54
|
-
"background_removers" => {
|
|
48
|
+
"image_generators" => {category: :image, type: "generators"},
|
|
49
|
+
"image_editors" => {category: :image, type: "editors"},
|
|
50
|
+
"image_analyzers" => {category: :image, type: "analyzers"},
|
|
51
|
+
"image_transformers" => {category: :image, type: "transformers"},
|
|
52
|
+
"image_upscalers" => {category: :image, type: "upscalers"},
|
|
53
|
+
"image_variators" => {category: :image, type: "variators"},
|
|
54
|
+
"background_removers" => {category: :image, type: "background_removers"},
|
|
55
55
|
|
|
56
56
|
# Text group
|
|
57
|
-
"embedders" => {
|
|
58
|
-
"moderators" => {
|
|
57
|
+
"embedders" => {category: :text, type: "embedders"},
|
|
58
|
+
"moderators" => {category: :text, type: "moderators"}
|
|
59
59
|
}.freeze
|
|
60
60
|
|
|
61
61
|
def validate_root_directory
|
|
@@ -17,17 +17,17 @@ module RubyLlmAgents
|
|
|
17
17
|
source_root File.expand_path("templates", __dir__)
|
|
18
18
|
|
|
19
19
|
class_option :provider, type: :string, default: "openai",
|
|
20
|
-
|
|
20
|
+
desc: "The TTS provider to use (openai, elevenlabs)"
|
|
21
21
|
class_option :model, type: :string, default: nil,
|
|
22
|
-
|
|
22
|
+
desc: "The TTS model to use"
|
|
23
23
|
class_option :voice, type: :string, default: "nova",
|
|
24
|
-
|
|
24
|
+
desc: "The voice to use"
|
|
25
25
|
class_option :speed, type: :numeric, default: 1.0,
|
|
26
|
-
|
|
26
|
+
desc: "Speech speed (0.25-4.0 for OpenAI)"
|
|
27
27
|
class_option :format, type: :string, default: "mp3",
|
|
28
|
-
|
|
28
|
+
desc: "Output format (mp3, wav, ogg, flac)"
|
|
29
29
|
class_option :cache, type: :string, default: nil,
|
|
30
|
-
|
|
30
|
+
desc: "Cache TTL (e.g., '7.days')"
|
|
31
31
|
|
|
32
32
|
def ensure_base_class_and_skill_file
|
|
33
33
|
audio_dir = "app/agents/audio"
|
|
@@ -17,13 +17,13 @@ module RubyLlmAgents
|
|
|
17
17
|
source_root File.expand_path("templates", __dir__)
|
|
18
18
|
|
|
19
19
|
class_option :model, type: :string, default: "whisper-1",
|
|
20
|
-
|
|
20
|
+
desc: "The transcription model to use"
|
|
21
21
|
class_option :language, type: :string, default: nil,
|
|
22
|
-
|
|
22
|
+
desc: "Language code (e.g., 'en', 'es')"
|
|
23
23
|
class_option :output_format, type: :string, default: "text",
|
|
24
|
-
|
|
24
|
+
desc: "Output format (text, srt, vtt, json)"
|
|
25
25
|
class_option :cache, type: :string, default: nil,
|
|
26
|
-
|
|
26
|
+
desc: "Cache TTL (e.g., '30.days')"
|
|
27
27
|
|
|
28
28
|
def ensure_base_class_and_skill_file
|
|
29
29
|
audio_dir = "app/agents/audio"
|
|
@@ -164,13 +164,13 @@ module RubyLlmAgents
|
|
|
164
164
|
return false unless ActiveRecord::Base.connection.table_exists?(table)
|
|
165
165
|
|
|
166
166
|
ActiveRecord::Base.connection.column_exists?(table, column)
|
|
167
|
-
rescue
|
|
167
|
+
rescue
|
|
168
168
|
false
|
|
169
169
|
end
|
|
170
170
|
|
|
171
171
|
def table_exists?(table)
|
|
172
172
|
ActiveRecord::Base.connection.table_exists?(table)
|
|
173
|
-
rescue
|
|
173
|
+
rescue
|
|
174
174
|
false
|
|
175
175
|
end
|
|
176
176
|
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
class Speaker
|
|
6
|
+
# ActiveStorage integration for speakers
|
|
7
|
+
#
|
|
8
|
+
# Provides convenience methods for generating audio and directly
|
|
9
|
+
# attaching it to ActiveStorage attachments.
|
|
10
|
+
#
|
|
11
|
+
# @example Attaching to a model
|
|
12
|
+
# class Article < ApplicationRecord
|
|
13
|
+
# has_one_attached :narration
|
|
14
|
+
# end
|
|
15
|
+
#
|
|
16
|
+
# class ArticleNarrator < RubyLLM::Agents::Speaker
|
|
17
|
+
# include RubyLLM::Agents::Speaker::ActiveStorageSupport
|
|
18
|
+
#
|
|
19
|
+
# provider :openai
|
|
20
|
+
# model 'tts-1-hd'
|
|
21
|
+
# voice 'nova'
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
24
|
+
# article = Article.find(1)
|
|
25
|
+
# result = ArticleNarrator.speak_and_attach(
|
|
26
|
+
# text: article.body,
|
|
27
|
+
# record: article,
|
|
28
|
+
# attachment_name: :narration
|
|
29
|
+
# )
|
|
30
|
+
#
|
|
31
|
+
module ActiveStorageSupport
|
|
32
|
+
extend ActiveSupport::Concern
|
|
33
|
+
|
|
34
|
+
class_methods do
|
|
35
|
+
# Generate audio and attach it to a record
|
|
36
|
+
#
|
|
37
|
+
# @param text [String] Text to convert to speech
|
|
38
|
+
# @param record [ActiveRecord::Base] The record to attach to
|
|
39
|
+
# @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
|
|
40
|
+
# @param options [Hash] Additional options for generation
|
|
41
|
+
# @return [SpeechResult] The speech result with audio_url set
|
|
42
|
+
def speak_and_attach(text:, record:, attachment_name:, **options)
|
|
43
|
+
result = call(text: text, **options)
|
|
44
|
+
|
|
45
|
+
return result unless result.success?
|
|
46
|
+
|
|
47
|
+
attach_audio_to_record(result, record, attachment_name, options)
|
|
48
|
+
|
|
49
|
+
result
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def attach_audio_to_record(result, record, attachment_name, options)
|
|
55
|
+
attachment = record.public_send(attachment_name)
|
|
56
|
+
filename = options[:filename] || generate_audio_filename(result)
|
|
57
|
+
|
|
58
|
+
attachment.attach(
|
|
59
|
+
io: StringIO.new(result.audio),
|
|
60
|
+
filename: filename,
|
|
61
|
+
content_type: result.content_type
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
result.audio_key = attachment.blob.key if attachment.respond_to?(:blob) && attachment.blob
|
|
65
|
+
result.audio_url = blob_url(attachment) if attachment.respond_to?(:blob) && attachment.blob
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def blob_url(attachment)
|
|
69
|
+
if attachment.blob.respond_to?(:url)
|
|
70
|
+
attachment.blob.url
|
|
71
|
+
elsif attachment.blob.respond_to?(:service_url)
|
|
72
|
+
attachment.blob.service_url
|
|
73
|
+
end
|
|
74
|
+
rescue => _e
|
|
75
|
+
nil
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def generate_audio_filename(result)
|
|
79
|
+
timestamp = Time.current.to_i
|
|
80
|
+
ext = result.format || :mp3
|
|
81
|
+
"speech_#{timestamp}.#{ext}"
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require "digest"
|
|
4
4
|
require_relative "../results/speech_result"
|
|
5
|
+
require_relative "speech_client"
|
|
6
|
+
require_relative "speech_pricing"
|
|
5
7
|
|
|
6
8
|
module RubyLLM
|
|
7
9
|
module Agents
|
|
@@ -194,19 +196,19 @@ module RubyLLM
|
|
|
194
196
|
|
|
195
197
|
def default_tts_provider
|
|
196
198
|
RubyLLM::Agents.configuration.default_tts_provider
|
|
197
|
-
rescue
|
|
199
|
+
rescue
|
|
198
200
|
:openai
|
|
199
201
|
end
|
|
200
202
|
|
|
201
203
|
def default_tts_model
|
|
202
204
|
RubyLLM::Agents.configuration.default_tts_model
|
|
203
|
-
rescue
|
|
205
|
+
rescue
|
|
204
206
|
"tts-1"
|
|
205
207
|
end
|
|
206
208
|
|
|
207
209
|
def default_tts_voice
|
|
208
210
|
RubyLLM::Agents.configuration.default_tts_voice
|
|
209
|
-
rescue
|
|
211
|
+
rescue
|
|
210
212
|
"nova"
|
|
211
213
|
end
|
|
212
214
|
end
|
|
@@ -334,6 +336,14 @@ module RubyLLM
|
|
|
334
336
|
context.output_tokens = 0
|
|
335
337
|
context.total_cost = calculate_cost(result)
|
|
336
338
|
|
|
339
|
+
# Store audio-specific metadata for execution tracking
|
|
340
|
+
context[:provider] = result[:provider].to_s
|
|
341
|
+
context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
|
|
342
|
+
context[:characters] = result[:characters]
|
|
343
|
+
context[:output_format] = result[:format].to_s
|
|
344
|
+
context[:file_size] = result[:audio]&.bytesize
|
|
345
|
+
context[:audio_duration_seconds] = result[:duration] if result[:duration]
|
|
346
|
+
|
|
337
347
|
# Build final result
|
|
338
348
|
context.output = build_result(
|
|
339
349
|
result,
|
|
@@ -410,7 +420,15 @@ module RubyLLM
|
|
|
410
420
|
|
|
411
421
|
# Executes standard (non-streaming) speech synthesis
|
|
412
422
|
def execute_standard_speech(text, options)
|
|
413
|
-
response =
|
|
423
|
+
response = speech_client.speak(
|
|
424
|
+
text,
|
|
425
|
+
model: options[:model],
|
|
426
|
+
voice: options[:voice],
|
|
427
|
+
voice_id: resolved_voice_id,
|
|
428
|
+
speed: options[:speed],
|
|
429
|
+
response_format: options[:response_format] || "mp3",
|
|
430
|
+
voice_settings: options[:voice_settings]
|
|
431
|
+
)
|
|
414
432
|
|
|
415
433
|
{
|
|
416
434
|
audio: response.audio,
|
|
@@ -428,9 +446,17 @@ module RubyLLM
|
|
|
428
446
|
def execute_streaming_speech(text, options)
|
|
429
447
|
audio_chunks = []
|
|
430
448
|
|
|
431
|
-
|
|
449
|
+
speech_client.speak_streaming(
|
|
450
|
+
text,
|
|
451
|
+
model: options[:model],
|
|
452
|
+
voice: options[:voice],
|
|
453
|
+
voice_id: resolved_voice_id,
|
|
454
|
+
speed: options[:speed],
|
|
455
|
+
response_format: options[:response_format] || "mp3",
|
|
456
|
+
voice_settings: options[:voice_settings]
|
|
457
|
+
) do |chunk|
|
|
432
458
|
audio_chunks << chunk.audio if chunk.respond_to?(:audio)
|
|
433
|
-
@streaming_block
|
|
459
|
+
@streaming_block&.call(chunk)
|
|
434
460
|
end
|
|
435
461
|
|
|
436
462
|
{
|
|
@@ -445,7 +471,7 @@ module RubyLLM
|
|
|
445
471
|
}
|
|
446
472
|
end
|
|
447
473
|
|
|
448
|
-
# Builds options for
|
|
474
|
+
# Builds options for SpeechClient
|
|
449
475
|
def build_speak_options
|
|
450
476
|
options = {
|
|
451
477
|
model: resolved_model,
|
|
@@ -453,13 +479,11 @@ module RubyLLM
|
|
|
453
479
|
}
|
|
454
480
|
|
|
455
481
|
speed = resolved_speed
|
|
456
|
-
options[:speed] = speed if speed && speed
|
|
482
|
+
options[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
|
|
457
483
|
options[:response_format] = resolved_output_format.to_s
|
|
458
484
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
options[:voice_settings] = voice_settings.to_h if voice_settings
|
|
462
|
-
end
|
|
485
|
+
voice_settings = self.class.voice_settings_config
|
|
486
|
+
options[:voice_settings] = voice_settings.to_h if voice_settings
|
|
463
487
|
|
|
464
488
|
options
|
|
465
489
|
end
|
|
@@ -488,29 +512,17 @@ module RubyLLM
|
|
|
488
512
|
|
|
489
513
|
# Calculates cost for speech synthesis
|
|
490
514
|
def calculate_cost(raw_result)
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response].cost
|
|
515
|
+
if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response]&.cost
|
|
494
516
|
return raw_result[:raw_response].cost
|
|
495
517
|
end
|
|
496
518
|
|
|
497
|
-
|
|
498
|
-
model_name = raw_result[:model].to_s
|
|
499
|
-
|
|
500
|
-
price_per_1k_chars = case provider
|
|
501
|
-
when :openai
|
|
502
|
-
model_name.include?("hd") ? 0.030 : 0.015
|
|
503
|
-
when :elevenlabs
|
|
504
|
-
0.30
|
|
505
|
-
when :google
|
|
506
|
-
0.016
|
|
507
|
-
when :polly
|
|
508
|
-
0.016
|
|
509
|
-
else
|
|
510
|
-
0.015
|
|
511
|
-
end
|
|
519
|
+
characters = raw_result[:characters] || 0
|
|
512
520
|
|
|
513
|
-
(
|
|
521
|
+
Audio::SpeechPricing.calculate_cost(
|
|
522
|
+
provider: raw_result[:provider],
|
|
523
|
+
model_id: raw_result[:model].to_s,
|
|
524
|
+
characters: characters
|
|
525
|
+
)
|
|
514
526
|
end
|
|
515
527
|
|
|
516
528
|
# Resolves the provider to use
|
|
@@ -547,6 +559,13 @@ module RubyLLM
|
|
|
547
559
|
def streaming_enabled?
|
|
548
560
|
@runtime_streaming || self.class.streaming?
|
|
549
561
|
end
|
|
562
|
+
|
|
563
|
+
# Returns a SpeechClient for the resolved provider
|
|
564
|
+
def speech_client
|
|
565
|
+
@speech_client ||= Audio::SpeechClient.new(provider: resolved_provider)
|
|
566
|
+
end
|
|
550
567
|
end
|
|
551
568
|
end
|
|
552
569
|
end
|
|
570
|
+
|
|
571
|
+
require_relative "speaker/active_storage_support"
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module RubyLLM
|
|
7
|
+
module Agents
|
|
8
|
+
module Audio
|
|
9
|
+
# Direct HTTP client for text-to-speech APIs.
|
|
10
|
+
#
|
|
11
|
+
# Supports OpenAI and ElevenLabs providers, bypassing the need for
|
|
12
|
+
# a RubyLLM.speak() method that does not exist in the base gem.
|
|
13
|
+
#
|
|
14
|
+
# @example OpenAI
|
|
15
|
+
# client = SpeechClient.new(provider: :openai)
|
|
16
|
+
# response = client.speak("Hello", model: "tts-1", voice: "nova")
|
|
17
|
+
# response.audio # => binary audio data
|
|
18
|
+
#
|
|
19
|
+
# @example ElevenLabs
|
|
20
|
+
# client = SpeechClient.new(provider: :elevenlabs)
|
|
21
|
+
# response = client.speak("Hello",
|
|
22
|
+
# model: "eleven_v3",
|
|
23
|
+
# voice: "Rachel",
|
|
24
|
+
# voice_id: "21m00Tcm4TlvDq8ikWAM",
|
|
25
|
+
# voice_settings: { stability: 0.5, similarity_boost: 0.75 }
|
|
26
|
+
# )
|
|
27
|
+
#
|
|
28
|
+
class SpeechClient
|
|
29
|
+
SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze
|
|
30
|
+
|
|
31
|
+
Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
|
|
32
|
+
def duration
|
|
33
|
+
nil
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def cost
|
|
37
|
+
nil
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
StreamChunk = Struct.new(:audio, keyword_init: true)
|
|
42
|
+
|
|
43
|
+
# @param provider [Symbol] :openai or :elevenlabs
|
|
44
|
+
# @raise [UnsupportedProviderError] if provider is not supported
|
|
45
|
+
def initialize(provider:)
|
|
46
|
+
validate_provider!(provider)
|
|
47
|
+
@provider = provider
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Synthesize speech (non-streaming)
|
|
51
|
+
#
|
|
52
|
+
# @param text [String] text to convert
|
|
53
|
+
# @param model [String] model identifier
|
|
54
|
+
# @param voice [String] voice name
|
|
55
|
+
# @param voice_id [String, nil] voice ID (required for ElevenLabs)
|
|
56
|
+
# @param speed [Float, nil] speed multiplier
|
|
57
|
+
# @param response_format [String] output format
|
|
58
|
+
# @param voice_settings [Hash, nil] ElevenLabs voice settings
|
|
59
|
+
# @return [Response]
|
|
60
|
+
def speak(text, model:, voice:, voice_id: nil, speed: nil,
|
|
61
|
+
response_format: "mp3", voice_settings: nil)
|
|
62
|
+
case @provider
|
|
63
|
+
when :openai
|
|
64
|
+
openai_speak(text, model: model, voice: voice_id || voice,
|
|
65
|
+
speed: speed, response_format: response_format)
|
|
66
|
+
when :elevenlabs
|
|
67
|
+
elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
|
|
68
|
+
speed: speed, response_format: response_format,
|
|
69
|
+
voice_settings: voice_settings)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Synthesize speech with streaming
|
|
74
|
+
#
|
|
75
|
+
# @param text [String] text to convert
|
|
76
|
+
# @param model [String] model identifier
|
|
77
|
+
# @param voice [String] voice name
|
|
78
|
+
# @param voice_id [String, nil] voice ID
|
|
79
|
+
# @param speed [Float, nil] speed multiplier
|
|
80
|
+
# @param response_format [String] output format
|
|
81
|
+
# @param voice_settings [Hash, nil] ElevenLabs voice settings
|
|
82
|
+
# @yield [StreamChunk] each audio chunk as it arrives
|
|
83
|
+
# @return [Response]
|
|
84
|
+
def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
|
|
85
|
+
response_format: "mp3", voice_settings: nil, &block)
|
|
86
|
+
case @provider
|
|
87
|
+
when :openai
|
|
88
|
+
openai_speak_streaming(text, model: model, voice: voice_id || voice,
|
|
89
|
+
speed: speed, response_format: response_format,
|
|
90
|
+
&block)
|
|
91
|
+
when :elevenlabs
|
|
92
|
+
elevenlabs_speak_streaming(text, model: model,
|
|
93
|
+
voice_id: voice_id || voice,
|
|
94
|
+
speed: speed,
|
|
95
|
+
response_format: response_format,
|
|
96
|
+
voice_settings: voice_settings, &block)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
private
|
|
101
|
+
|
|
102
|
+
# ============================================================
|
|
103
|
+
# Provider validation
|
|
104
|
+
# ============================================================
|
|
105
|
+
|
|
106
|
+
def validate_provider!(provider)
|
|
107
|
+
return if SUPPORTED_PROVIDERS.include?(provider)
|
|
108
|
+
|
|
109
|
+
raise UnsupportedProviderError.new(
|
|
110
|
+
"Provider :#{provider} is not yet supported for text-to-speech. " \
|
|
111
|
+
"Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
|
|
112
|
+
provider: provider
|
|
113
|
+
)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# ============================================================
|
|
117
|
+
# OpenAI implementation
|
|
118
|
+
# ============================================================
|
|
119
|
+
|
|
120
|
+
def openai_speak(text, model:, voice:, speed:, response_format:)
|
|
121
|
+
body = openai_request_body(text, model: model, voice: voice,
|
|
122
|
+
speed: speed, response_format: response_format)
|
|
123
|
+
|
|
124
|
+
response = openai_connection.post("/v1/audio/speech") do |req|
|
|
125
|
+
req.headers["Content-Type"] = "application/json"
|
|
126
|
+
req.body = body.to_json
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
handle_error_response!(response) unless response.success?
|
|
130
|
+
|
|
131
|
+
Response.new(
|
|
132
|
+
audio: response.body,
|
|
133
|
+
format: response_format.to_sym,
|
|
134
|
+
model: model,
|
|
135
|
+
voice: voice
|
|
136
|
+
)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def openai_speak_streaming(text, model:, voice:, speed:,
|
|
140
|
+
response_format:, &block)
|
|
141
|
+
body = openai_request_body(text, model: model, voice: voice,
|
|
142
|
+
speed: speed, response_format: response_format)
|
|
143
|
+
chunks = []
|
|
144
|
+
|
|
145
|
+
openai_connection.post("/v1/audio/speech") do |req|
|
|
146
|
+
req.headers["Content-Type"] = "application/json"
|
|
147
|
+
req.body = body.to_json
|
|
148
|
+
req.options.on_data = proc do |chunk, _size, env|
|
|
149
|
+
if env.status == 200
|
|
150
|
+
chunk_obj = StreamChunk.new(audio: chunk)
|
|
151
|
+
chunks << chunk
|
|
152
|
+
block&.call(chunk_obj)
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
Response.new(
|
|
158
|
+
audio: chunks.join,
|
|
159
|
+
format: response_format.to_sym,
|
|
160
|
+
model: model,
|
|
161
|
+
voice: voice
|
|
162
|
+
)
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def openai_request_body(text, model:, voice:, speed:, response_format:)
|
|
166
|
+
body = {
|
|
167
|
+
model: model,
|
|
168
|
+
input: text,
|
|
169
|
+
voice: voice,
|
|
170
|
+
response_format: response_format.to_s
|
|
171
|
+
}
|
|
172
|
+
body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
|
|
173
|
+
body
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def openai_connection
|
|
177
|
+
@openai_connection ||= Faraday.new(url: openai_api_base) do |f|
|
|
178
|
+
f.headers["Authorization"] = "Bearer #{openai_api_key}"
|
|
179
|
+
f.adapter Faraday.default_adapter
|
|
180
|
+
f.options.timeout = 120
|
|
181
|
+
f.options.open_timeout = 30
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def openai_api_key
|
|
186
|
+
key = RubyLLM.config.openai_api_key
|
|
187
|
+
unless key
|
|
188
|
+
raise ConfigurationError,
|
|
189
|
+
"OpenAI API key is required for text-to-speech. " \
|
|
190
|
+
"Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
|
|
191
|
+
end
|
|
192
|
+
key
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
def openai_api_base
|
|
196
|
+
base = RubyLLM.config.openai_api_base
|
|
197
|
+
(base && !base.empty?) ? base : "https://api.openai.com"
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
# ============================================================
|
|
201
|
+
# ElevenLabs implementation
|
|
202
|
+
# ============================================================
|
|
203
|
+
|
|
204
|
+
def elevenlabs_speak(text, model:, voice_id:, speed:,
|
|
205
|
+
response_format:, voice_settings:)
|
|
206
|
+
path = "/v1/text-to-speech/#{voice_id}"
|
|
207
|
+
body = elevenlabs_request_body(text, model: model, speed: speed,
|
|
208
|
+
voice_settings: voice_settings)
|
|
209
|
+
format_param = elevenlabs_output_format(response_format)
|
|
210
|
+
|
|
211
|
+
response = elevenlabs_connection.post(path) do |req|
|
|
212
|
+
req.headers["Content-Type"] = "application/json"
|
|
213
|
+
req.params["output_format"] = format_param
|
|
214
|
+
req.body = body.to_json
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
handle_error_response!(response) unless response.success?
|
|
218
|
+
|
|
219
|
+
Response.new(
|
|
220
|
+
audio: response.body,
|
|
221
|
+
format: response_format.to_sym,
|
|
222
|
+
model: model,
|
|
223
|
+
voice: voice_id
|
|
224
|
+
)
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
|
|
228
|
+
response_format:, voice_settings:, &block)
|
|
229
|
+
path = "/v1/text-to-speech/#{voice_id}/stream"
|
|
230
|
+
body = elevenlabs_request_body(text, model: model, speed: speed,
|
|
231
|
+
voice_settings: voice_settings)
|
|
232
|
+
format_param = elevenlabs_output_format(response_format)
|
|
233
|
+
chunks = []
|
|
234
|
+
|
|
235
|
+
elevenlabs_connection.post(path) do |req|
|
|
236
|
+
req.headers["Content-Type"] = "application/json"
|
|
237
|
+
req.params["output_format"] = format_param
|
|
238
|
+
req.body = body.to_json
|
|
239
|
+
req.options.on_data = proc do |chunk, _size, env|
|
|
240
|
+
if env.status == 200
|
|
241
|
+
chunk_obj = StreamChunk.new(audio: chunk)
|
|
242
|
+
chunks << chunk
|
|
243
|
+
block&.call(chunk_obj)
|
|
244
|
+
end
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
Response.new(
|
|
249
|
+
audio: chunks.join,
|
|
250
|
+
format: response_format.to_sym,
|
|
251
|
+
model: model,
|
|
252
|
+
voice: voice_id
|
|
253
|
+
)
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
def elevenlabs_request_body(text, model:, speed:, voice_settings:)
|
|
257
|
+
body = {
|
|
258
|
+
text: text,
|
|
259
|
+
model_id: model
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
vs = voice_settings&.dup || {}
|
|
263
|
+
vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
|
|
264
|
+
body[:voice_settings] = vs unless vs.empty?
|
|
265
|
+
|
|
266
|
+
body
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
ELEVENLABS_FORMAT_MAP = {
|
|
270
|
+
"mp3" => "mp3_44100_128",
|
|
271
|
+
"pcm" => "pcm_44100",
|
|
272
|
+
"ulaw" => "ulaw_8000"
|
|
273
|
+
}.freeze
|
|
274
|
+
|
|
275
|
+
def elevenlabs_output_format(format)
|
|
276
|
+
ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
def elevenlabs_connection
|
|
280
|
+
@elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
|
|
281
|
+
f.headers["xi-api-key"] = elevenlabs_api_key
|
|
282
|
+
f.adapter Faraday.default_adapter
|
|
283
|
+
f.options.timeout = 120
|
|
284
|
+
f.options.open_timeout = 30
|
|
285
|
+
end
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
def elevenlabs_api_key
|
|
289
|
+
key = RubyLLM::Agents.configuration.elevenlabs_api_key
|
|
290
|
+
unless key
|
|
291
|
+
raise ConfigurationError,
|
|
292
|
+
"ElevenLabs API key is required for text-to-speech. " \
|
|
293
|
+
"Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
|
|
294
|
+
end
|
|
295
|
+
key
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
def elevenlabs_api_base
|
|
299
|
+
base = RubyLLM::Agents.configuration.elevenlabs_api_base
|
|
300
|
+
(base && !base.empty?) ? base : "https://api.elevenlabs.io"
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
# ============================================================
|
|
304
|
+
# Shared error handling
|
|
305
|
+
# ============================================================
|
|
306
|
+
|
|
307
|
+
def handle_error_response!(response)
|
|
308
|
+
raise SpeechApiError.new(
|
|
309
|
+
"TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
|
|
310
|
+
status: response.status,
|
|
311
|
+
response_body: response.body
|
|
312
|
+
)
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
def error_message_from(response)
|
|
316
|
+
parsed = JSON.parse(response.body)
|
|
317
|
+
if parsed.is_a?(Hash)
|
|
318
|
+
parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
|
|
319
|
+
else
|
|
320
|
+
response.body
|
|
321
|
+
end
|
|
322
|
+
rescue JSON::ParserError
|
|
323
|
+
response.body.to_s[0, 200]
|
|
324
|
+
end
|
|
325
|
+
end
|
|
326
|
+
end
|
|
327
|
+
end
|
|
328
|
+
end
|