ruby_llm-agents 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/app/controllers/ruby_llm/agents/agents_controller.rb +16 -14
- data/app/controllers/ruby_llm/agents/dashboard_controller.rb +20 -20
- data/app/controllers/ruby_llm/agents/executions_controller.rb +5 -7
- data/app/helpers/ruby_llm/agents/application_helper.rb +57 -58
- data/app/models/ruby_llm/agents/execution/analytics.rb +27 -27
- data/app/models/ruby_llm/agents/execution/scopes.rb +4 -6
- data/app/models/ruby_llm/agents/execution.rb +25 -25
- data/app/models/ruby_llm/agents/tenant/budgetable.rb +16 -10
- data/app/models/ruby_llm/agents/tenant/resettable.rb +12 -12
- data/app/models/ruby_llm/agents/tenant/trackable.rb +7 -7
- data/app/services/ruby_llm/agents/agent_registry.rb +6 -6
- data/lib/generators/ruby_llm_agents/agent_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/background_remover_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/embedder_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +7 -7
- data/lib/generators/ruby_llm_agents/image_editor_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_generator_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +9 -9
- data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_variator_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/install_generator.rb +3 -3
- data/lib/generators/ruby_llm_agents/migrate_structure_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb +2 -2
- data/lib/generators/ruby_llm_agents/restructure_generator.rb +13 -13
- data/lib/generators/ruby_llm_agents/speaker_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/transcriber_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/upgrade_generator.rb +2 -2
- data/lib/ruby_llm/agents/audio/speaker.rb +40 -31
- data/lib/ruby_llm/agents/audio/speech_client.rb +328 -0
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +273 -0
- data/lib/ruby_llm/agents/audio/transcriber.rb +33 -33
- data/lib/ruby_llm/agents/base_agent.rb +14 -14
- data/lib/ruby_llm/agents/core/base/callbacks.rb +3 -3
- data/lib/ruby_llm/agents/core/configuration.rb +86 -73
- data/lib/ruby_llm/agents/core/errors.rb +27 -2
- data/lib/ruby_llm/agents/core/instrumentation.rb +64 -66
- data/lib/ruby_llm/agents/core/llm_tenant.rb +7 -7
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/dsl/base.rb +3 -3
- data/lib/ruby_llm/agents/dsl/reliability.rb +9 -9
- data/lib/ruby_llm/agents/image/analyzer/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/analyzer/execution.rb +4 -4
- data/lib/ruby_llm/agents/image/background_remover/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/background_remover/execution.rb +3 -3
- data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +8 -8
- data/lib/ruby_llm/agents/image/editor/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/generator/pricing.rb +9 -10
- data/lib/ruby_llm/agents/image/generator.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/dsl.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/execution.rb +9 -9
- data/lib/ruby_llm/agents/image/pipeline.rb +1 -1
- data/lib/ruby_llm/agents/image/transformer/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/execution.rb +3 -5
- data/lib/ruby_llm/agents/image/variator/execution.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/alert_manager.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +9 -9
- data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +3 -3
- data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +17 -17
- data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +1 -0
- data/lib/ruby_llm/agents/infrastructure/execution_logger_job.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/reliability.rb +6 -6
- data/lib/ruby_llm/agents/pipeline/builder.rb +11 -11
- data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +3 -3
- data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +4 -4
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +34 -22
- data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +2 -3
- data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +7 -7
- data/lib/ruby_llm/agents/results/background_removal_result.rb +6 -6
- data/lib/ruby_llm/agents/results/embedding_result.rb +15 -15
- data/lib/ruby_llm/agents/results/image_analysis_result.rb +7 -7
- data/lib/ruby_llm/agents/results/image_edit_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_generation_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_pipeline_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_transform_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_upscale_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_variation_result.rb +4 -4
- data/lib/ruby_llm/agents/results/transcription_result.rb +1 -1
- data/lib/ruby_llm/agents/text/embedder.rb +13 -13
- metadata +3 -1
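
The headline change in 3.2.0 is the text-to-speech rework: two new files, audio/speech_client.rb and audio/speech_pricing.rb, plus a Speaker that now delegates to them (full diffs below). As a rough orientation, here is a minimal sketch of the new client API, assuming the public interface shown in the speech_client.rb diff further down; the require path and output file name are assumptions, not taken from the diff.

    # Sketch only -- class and method names come from the speech_client.rb diff below.
    require "ruby_llm/agents"

    client = RubyLLM::Agents::Audio::SpeechClient.new(provider: :openai)
    response = client.speak("Hello from 3.2.0", model: "tts-1", voice: "nova")
    File.binwrite("hello.mp3", response.audio) # Response#audio is raw binary audio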
data/lib/generators/ruby_llm_agents/speaker_generator.rb

@@ -17,17 +17,17 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)

     class_option :provider, type: :string, default: "openai",
-      desc: "The TTS provider to use (openai, elevenlabs)"
+      desc: "The TTS provider to use (openai, elevenlabs)"
     class_option :model, type: :string, default: nil,
-      desc: "The TTS model to use"
+      desc: "The TTS model to use"
     class_option :voice, type: :string, default: "nova",
-      desc: "The voice to use"
+      desc: "The voice to use"
     class_option :speed, type: :numeric, default: 1.0,
-      desc: "Speech speed (0.25-4.0 for OpenAI)"
+      desc: "Speech speed (0.25-4.0 for OpenAI)"
     class_option :format, type: :string, default: "mp3",
-      desc: "Output format (mp3, wav, ogg, flac)"
+      desc: "Output format (mp3, wav, ogg, flac)"
     class_option :cache, type: :string, default: nil,
-      desc: "Cache TTL (e.g., '7.days')"
+      desc: "Cache TTL (e.g., '7.days')"

     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"
data/lib/generators/ruby_llm_agents/transcriber_generator.rb

@@ -17,13 +17,13 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)

     class_option :model, type: :string, default: "whisper-1",
-      desc: "The transcription model to use"
+      desc: "The transcription model to use"
     class_option :language, type: :string, default: nil,
-      desc: "Language code (e.g., 'en', 'es')"
+      desc: "Language code (e.g., 'en', 'es')"
     class_option :output_format, type: :string, default: "text",
-      desc: "Output format (text, srt, vtt, json)"
+      desc: "Output format (text, srt, vtt, json)"
     class_option :cache, type: :string, default: nil,
-      desc: "Cache TTL (e.g., '30.days')"
+      desc: "Cache TTL (e.g., '30.days')"

     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"
@@ -164,13 +164,13 @@ module RubyLlmAgents
       return false unless ActiveRecord::Base.connection.table_exists?(table)

       ActiveRecord::Base.connection.column_exists?(table, column)
-    rescue
+    rescue
       false
     end

     def table_exists?(table)
       ActiveRecord::Base.connection.table_exists?(table)
-    rescue
+    rescue
       false
     end
   end
data/lib/ruby_llm/agents/audio/speaker.rb

@@ -2,6 +2,8 @@

 require "digest"
 require_relative "../results/speech_result"
+require_relative "speech_client"
+require_relative "speech_pricing"

 module RubyLLM
   module Agents
@@ -194,19 +196,19 @@ module RubyLLM

       def default_tts_provider
         RubyLLM::Agents.configuration.default_tts_provider
-      rescue
+      rescue
         :openai
       end

       def default_tts_model
         RubyLLM::Agents.configuration.default_tts_model
-      rescue
+      rescue
         "tts-1"
       end

       def default_tts_voice
         RubyLLM::Agents.configuration.default_tts_voice
-      rescue
+      rescue
         "nova"
       end
     end
@@ -410,7 +412,15 @@ module RubyLLM

       # Executes standard (non-streaming) speech synthesis
       def execute_standard_speech(text, options)
-        response =
+        response = speech_client.speak(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        )

         {
           audio: response.audio,
@@ -428,9 +438,17 @@ module RubyLLM
       def execute_streaming_speech(text, options)
         audio_chunks = []

-
+        speech_client.speak_streaming(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        ) do |chunk|
           audio_chunks << chunk.audio if chunk.respond_to?(:audio)
-          @streaming_block
+          @streaming_block&.call(chunk)
         end

         {
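
The streaming path now also runs through SpeechClient. For orientation, a caller-side sketch of how the yielded chunks might be consumed — speak_streaming and StreamChunk are defined in the new client shown further down; the file-writing part is illustrative.

    # Sketch: write TTS audio chunks to disk as they arrive.
    client = RubyLLM::Agents::Audio::SpeechClient.new(provider: :openai)

    File.open("notes.mp3", "wb") do |io|
      client.speak_streaming("Streaming example", model: "tts-1", voice: "nova") do |chunk|
        io.write(chunk.audio) # chunk is a SpeechClient::StreamChunk
      end
    end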
@@ -445,7 +463,7 @@ module RubyLLM
         }
       end

-      # Builds options for
+      # Builds options for SpeechClient
       def build_speak_options
         options = {
           model: resolved_model,
@@ -453,13 +471,11 @@ module RubyLLM
         }

         speed = resolved_speed
-        options[:speed] = speed if speed && speed
+        options[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
         options[:response_format] = resolved_output_format.to_s

-
-
-        options[:voice_settings] = voice_settings.to_h if voice_settings
-        end
+        voice_settings = self.class.voice_settings_config
+        options[:voice_settings] = voice_settings.to_h if voice_settings

         options
       end
@@ -488,29 +504,17 @@ module RubyLLM

       # Calculates cost for speech synthesis
       def calculate_cost(raw_result)
-
-
-        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response].cost
+        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response]&.cost
           return raw_result[:raw_response].cost
         end

-
-        model_name = raw_result[:model].to_s
-
-        price_per_1k_chars = case provider
-        when :openai
-          model_name.include?("hd") ? 0.030 : 0.015
-        when :elevenlabs
-          0.30
-        when :google
-          0.016
-        when :polly
-          0.016
-        else
-          0.015
-        end
+        characters = raw_result[:characters] || 0

-        (
+        Audio::SpeechPricing.calculate_cost(
+          provider: raw_result[:provider],
+          model_id: raw_result[:model].to_s,
+          characters: characters
+        )
       end

       # Resolves the provider to use
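
audio/speech_pricing.rb (+273 lines) appears in the file list but its body is not included in this diff. Based on the call site above, its interface presumably resembles the following hypothetical sketch; the rates shown are the inline values removed from calculate_cost, not the gem's actual table.

    # Hypothetical sketch of Audio::SpeechPricing implied by the call above.
    # The real implementation (273 lines) is not shown in this diff.
    module RubyLLM
      module Agents
        module Audio
          module SpeechPricing
            # Illustrative per-1k-character rates, mirroring the removed inline values.
            def self.calculate_cost(provider:, model_id:, characters:)
              rate =
                case provider&.to_sym
                when :openai     then model_id.include?("hd") ? 0.030 : 0.015
                when :elevenlabs then 0.30
                else 0.015
                end
              (characters / 1000.0) * rate
            end
          end
        end
      end
    end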
@@ -547,6 +551,11 @@ module RubyLLM
       def streaming_enabled?
         @runtime_streaming || self.class.streaming?
       end
+
+      # Returns a SpeechClient for the resolved provider
+      def speech_client
+        @speech_client ||= Audio::SpeechClient.new(provider: resolved_provider)
+      end
     end
   end
 end
data/lib/ruby_llm/agents/audio/speech_client.rb

@@ -0,0 +1,328 @@
+# frozen_string_literal: true
+
+require "faraday"
+require "json"
+
+module RubyLLM
+  module Agents
+    module Audio
+      # Direct HTTP client for text-to-speech APIs.
+      #
+      # Supports OpenAI and ElevenLabs providers, bypassing the need for
+      # a RubyLLM.speak() method that does not exist in the base gem.
+      #
+      # @example OpenAI
+      #   client = SpeechClient.new(provider: :openai)
+      #   response = client.speak("Hello", model: "tts-1", voice: "nova")
+      #   response.audio # => binary audio data
+      #
+      # @example ElevenLabs
+      #   client = SpeechClient.new(provider: :elevenlabs)
+      #   response = client.speak("Hello",
+      #     model: "eleven_v3",
+      #     voice: "Rachel",
+      #     voice_id: "21m00Tcm4TlvDq8ikWAM",
+      #     voice_settings: { stability: 0.5, similarity_boost: 0.75 }
+      #   )
+      #
+      class SpeechClient
+        SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze
+
+        Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
+          def duration
+            nil
+          end
+
+          def cost
+            nil
+          end
+        end
+
+        StreamChunk = Struct.new(:audio, keyword_init: true)
+
+        # @param provider [Symbol] :openai or :elevenlabs
+        # @raise [UnsupportedProviderError] if provider is not supported
+        def initialize(provider:)
+          validate_provider!(provider)
+          @provider = provider
+        end
+
+        # Synthesize speech (non-streaming)
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID (required for ElevenLabs)
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @return [Response]
+        def speak(text, model:, voice:, voice_id: nil, speed: nil,
+                  response_format: "mp3", voice_settings: nil)
+          case @provider
+          when :openai
+            openai_speak(text, model: model, voice: voice_id || voice,
+                         speed: speed, response_format: response_format)
+          when :elevenlabs
+            elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
+                             speed: speed, response_format: response_format,
+                             voice_settings: voice_settings)
+          end
+        end
+
+        # Synthesize speech with streaming
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @yield [StreamChunk] each audio chunk as it arrives
+        # @return [Response]
+        def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
+                            response_format: "mp3", voice_settings: nil, &block)
+          case @provider
+          when :openai
+            openai_speak_streaming(text, model: model, voice: voice_id || voice,
+                                   speed: speed, response_format: response_format,
+                                   &block)
+          when :elevenlabs
+            elevenlabs_speak_streaming(text, model: model,
+                                       voice_id: voice_id || voice,
+                                       speed: speed,
+                                       response_format: response_format,
+                                       voice_settings: voice_settings, &block)
+          end
+        end
+
+        private
+
+        # ============================================================
+        # Provider validation
+        # ============================================================
+
+        def validate_provider!(provider)
+          return if SUPPORTED_PROVIDERS.include?(provider)
+
+          raise UnsupportedProviderError.new(
+            "Provider :#{provider} is not yet supported for text-to-speech. " \
+            "Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
+            provider: provider
+          )
+        end
+
+        # ============================================================
+        # OpenAI implementation
+        # ============================================================
+
+        def openai_speak(text, model:, voice:, speed:, response_format:)
+          body = openai_request_body(text, model: model, voice: voice,
+                                     speed: speed, response_format: response_format)
+
+          response = openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+          end
+
+          handle_error_response!(response) unless response.success?
+
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+
+        def openai_speak_streaming(text, model:, voice:, speed:,
+                                   response_format:, &block)
+          body = openai_request_body(text, model: model, voice: voice,
+                                     speed: speed, response_format: response_format)
+          chunks = []
+
+          openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+
+        def openai_request_body(text, model:, voice:, speed:, response_format:)
+          body = {
+            model: model,
+            input: text,
+            voice: voice,
+            response_format: response_format.to_s
+          }
+          body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body
+        end
+
+        def openai_connection
+          @openai_connection ||= Faraday.new(url: openai_api_base) do |f|
+            f.headers["Authorization"] = "Bearer #{openai_api_key}"
+            f.adapter Faraday.default_adapter
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+
+        def openai_api_key
+          key = RubyLLM.config.openai_api_key
+          unless key
+            raise ConfigurationError,
+                  "OpenAI API key is required for text-to-speech. " \
+                  "Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
+          end
+          key
+        end
+
+        def openai_api_base
+          base = RubyLLM.config.openai_api_base
+          (base && !base.empty?) ? base : "https://api.openai.com"
+        end
+
+        # ============================================================
+        # ElevenLabs implementation
+        # ============================================================
+
+        def elevenlabs_speak(text, model:, voice_id:, speed:,
+                             response_format:, voice_settings:)
+          path = "/v1/text-to-speech/#{voice_id}"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+                                         voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+
+          response = elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+          end
+
+          handle_error_response!(response) unless response.success?
+
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+
+        def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
+                                       response_format:, voice_settings:, &block)
+          path = "/v1/text-to-speech/#{voice_id}/stream"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+                                         voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+          chunks = []
+
+          elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+
+        def elevenlabs_request_body(text, model:, speed:, voice_settings:)
+          body = {
+            text: text,
+            model_id: model
+          }
+
+          vs = voice_settings&.dup || {}
+          vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body[:voice_settings] = vs unless vs.empty?
+
+          body
+        end
+
+        ELEVENLABS_FORMAT_MAP = {
+          "mp3" => "mp3_44100_128",
+          "pcm" => "pcm_44100",
+          "ulaw" => "ulaw_8000"
+        }.freeze
+
+        def elevenlabs_output_format(format)
+          ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
+        end
+
+        def elevenlabs_connection
+          @elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
+            f.headers["xi-api-key"] = elevenlabs_api_key
+            f.adapter Faraday.default_adapter
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+
+        def elevenlabs_api_key
+          key = RubyLLM::Agents.configuration.elevenlabs_api_key
+          unless key
+            raise ConfigurationError,
+                  "ElevenLabs API key is required for text-to-speech. " \
+                  "Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
+          end
+          key
+        end
+
+        def elevenlabs_api_base
+          base = RubyLLM::Agents.configuration.elevenlabs_api_base
+          (base && !base.empty?) ? base : "https://api.elevenlabs.io"
+        end
+
+        # ============================================================
+        # Shared error handling
+        # ============================================================
+
+        def handle_error_response!(response)
+          raise SpeechApiError.new(
+            "TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
+            status: response.status,
+            response_body: response.body
+          )
+        end
+
+        def error_message_from(response)
+          parsed = JSON.parse(response.body)
+          if parsed.is_a?(Hash)
+            parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
+          else
+            response.body
+          end
+        rescue JSON::ParserError
+          response.body.to_s[0, 200]
+        end
+      end
+    end
+  end
+end