ruby_llm-agents 3.1.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -0
  3. data/app/controllers/ruby_llm/agents/agents_controller.rb +16 -14
  4. data/app/controllers/ruby_llm/agents/dashboard_controller.rb +20 -20
  5. data/app/controllers/ruby_llm/agents/executions_controller.rb +5 -7
  6. data/app/helpers/ruby_llm/agents/application_helper.rb +57 -58
  7. data/app/models/ruby_llm/agents/execution/analytics.rb +27 -27
  8. data/app/models/ruby_llm/agents/execution/scopes.rb +4 -6
  9. data/app/models/ruby_llm/agents/execution.rb +25 -25
  10. data/app/models/ruby_llm/agents/tenant/budgetable.rb +16 -10
  11. data/app/models/ruby_llm/agents/tenant/resettable.rb +12 -12
  12. data/app/models/ruby_llm/agents/tenant/trackable.rb +7 -7
  13. data/app/services/ruby_llm/agents/agent_registry.rb +6 -6
  14. data/app/views/ruby_llm/agents/executions/_audio_player.html.erb +57 -0
  15. data/app/views/ruby_llm/agents/executions/show.html.erb +8 -0
  16. data/lib/generators/ruby_llm_agents/agent_generator.rb +4 -4
  17. data/lib/generators/ruby_llm_agents/background_remover_generator.rb +6 -6
  18. data/lib/generators/ruby_llm_agents/embedder_generator.rb +4 -4
  19. data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +7 -7
  20. data/lib/generators/ruby_llm_agents/image_editor_generator.rb +4 -4
  21. data/lib/generators/ruby_llm_agents/image_generator_generator.rb +6 -6
  22. data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +9 -9
  23. data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +6 -6
  24. data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +4 -4
  25. data/lib/generators/ruby_llm_agents/image_variator_generator.rb +4 -4
  26. data/lib/generators/ruby_llm_agents/install_generator.rb +3 -3
  27. data/lib/generators/ruby_llm_agents/migrate_structure_generator.rb +4 -4
  28. data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb +2 -2
  29. data/lib/generators/ruby_llm_agents/restructure_generator.rb +13 -13
  30. data/lib/generators/ruby_llm_agents/speaker_generator.rb +6 -6
  31. data/lib/generators/ruby_llm_agents/transcriber_generator.rb +4 -4
  32. data/lib/generators/ruby_llm_agents/upgrade_generator.rb +2 -2
  33. data/lib/ruby_llm/agents/audio/speaker/active_storage_support.rb +87 -0
  34. data/lib/ruby_llm/agents/audio/speaker.rb +50 -31
  35. data/lib/ruby_llm/agents/audio/speech_client.rb +328 -0
  36. data/lib/ruby_llm/agents/audio/speech_pricing.rb +273 -0
  37. data/lib/ruby_llm/agents/audio/transcriber.rb +43 -33
  38. data/lib/ruby_llm/agents/base_agent.rb +14 -14
  39. data/lib/ruby_llm/agents/core/base/callbacks.rb +3 -3
  40. data/lib/ruby_llm/agents/core/configuration.rb +90 -73
  41. data/lib/ruby_llm/agents/core/errors.rb +27 -2
  42. data/lib/ruby_llm/agents/core/instrumentation.rb +64 -66
  43. data/lib/ruby_llm/agents/core/llm_tenant.rb +7 -7
  44. data/lib/ruby_llm/agents/core/version.rb +1 -1
  45. data/lib/ruby_llm/agents/dsl/base.rb +3 -3
  46. data/lib/ruby_llm/agents/dsl/reliability.rb +9 -9
  47. data/lib/ruby_llm/agents/image/analyzer/dsl.rb +1 -1
  48. data/lib/ruby_llm/agents/image/analyzer/execution.rb +4 -4
  49. data/lib/ruby_llm/agents/image/background_remover/dsl.rb +1 -1
  50. data/lib/ruby_llm/agents/image/background_remover/execution.rb +3 -3
  51. data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +8 -8
  52. data/lib/ruby_llm/agents/image/editor/execution.rb +1 -1
  53. data/lib/ruby_llm/agents/image/generator/pricing.rb +9 -10
  54. data/lib/ruby_llm/agents/image/generator.rb +6 -6
  55. data/lib/ruby_llm/agents/image/pipeline/dsl.rb +6 -6
  56. data/lib/ruby_llm/agents/image/pipeline/execution.rb +9 -9
  57. data/lib/ruby_llm/agents/image/pipeline.rb +1 -1
  58. data/lib/ruby_llm/agents/image/transformer/execution.rb +1 -1
  59. data/lib/ruby_llm/agents/image/upscaler/dsl.rb +1 -1
  60. data/lib/ruby_llm/agents/image/upscaler/execution.rb +3 -5
  61. data/lib/ruby_llm/agents/image/variator/execution.rb +1 -1
  62. data/lib/ruby_llm/agents/infrastructure/alert_manager.rb +4 -4
  63. data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +4 -4
  64. data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +9 -9
  65. data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +3 -3
  66. data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +1 -1
  67. data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +17 -17
  68. data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +1 -0
  69. data/lib/ruby_llm/agents/infrastructure/execution_logger_job.rb +1 -1
  70. data/lib/ruby_llm/agents/infrastructure/reliability.rb +6 -6
  71. data/lib/ruby_llm/agents/pipeline/builder.rb +11 -11
  72. data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +3 -3
  73. data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +4 -4
  74. data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +83 -22
  75. data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +2 -3
  76. data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +7 -7
  77. data/lib/ruby_llm/agents/results/background_removal_result.rb +6 -6
  78. data/lib/ruby_llm/agents/results/embedding_result.rb +15 -15
  79. data/lib/ruby_llm/agents/results/image_analysis_result.rb +7 -7
  80. data/lib/ruby_llm/agents/results/image_edit_result.rb +4 -4
  81. data/lib/ruby_llm/agents/results/image_generation_result.rb +5 -5
  82. data/lib/ruby_llm/agents/results/image_pipeline_result.rb +4 -4
  83. data/lib/ruby_llm/agents/results/image_transform_result.rb +4 -4
  84. data/lib/ruby_llm/agents/results/image_upscale_result.rb +5 -5
  85. data/lib/ruby_llm/agents/results/image_variation_result.rb +4 -4
  86. data/lib/ruby_llm/agents/results/speech_result.rb +12 -7
  87. data/lib/ruby_llm/agents/results/transcription_result.rb +1 -1
  88. data/lib/ruby_llm/agents/text/embedder.rb +13 -13
  89. metadata +5 -1
@@ -102,7 +102,7 @@ module RubyLlmAgents
102
102
 
103
103
  def table_exists?(table)
104
104
  ActiveRecord::Base.connection.table_exists?(table)
105
- rescue StandardError
105
+ rescue
106
106
  false
107
107
  end
108
108
 
@@ -110,7 +110,7 @@ module RubyLlmAgents
110
110
  return false unless ActiveRecord::Base.connection.table_exists?(table)
111
111
 
112
112
  ActiveRecord::Base.connection.column_exists?(table, column)
113
- rescue StandardError
113
+ rescue
114
114
  false
115
115
  end
116
116
  end
@@ -37,25 +37,25 @@ module RubyLlmAgents
37
37
  # Maps old directory -> { category:, type: }
38
38
  DIRECTORY_MAPPING = {
39
39
  # Top-level under llm/
40
- "agents" => { category: nil, type: "agents" },
41
- "tools" => { category: nil, type: "tools" },
40
+ "agents" => {category: nil, type: "agents"},
41
+ "tools" => {category: nil, type: "tools"},
42
42
 
43
43
  # Audio group
44
- "speakers" => { category: :audio, type: "speakers" },
45
- "transcribers" => { category: :audio, type: "transcribers" },
44
+ "speakers" => {category: :audio, type: "speakers"},
45
+ "transcribers" => {category: :audio, type: "transcribers"},
46
46
 
47
47
  # Image group
48
- "image_generators" => { category: :image, type: "generators" },
49
- "image_editors" => { category: :image, type: "editors" },
50
- "image_analyzers" => { category: :image, type: "analyzers" },
51
- "image_transformers" => { category: :image, type: "transformers" },
52
- "image_upscalers" => { category: :image, type: "upscalers" },
53
- "image_variators" => { category: :image, type: "variators" },
54
- "background_removers" => { category: :image, type: "background_removers" },
48
+ "image_generators" => {category: :image, type: "generators"},
49
+ "image_editors" => {category: :image, type: "editors"},
50
+ "image_analyzers" => {category: :image, type: "analyzers"},
51
+ "image_transformers" => {category: :image, type: "transformers"},
52
+ "image_upscalers" => {category: :image, type: "upscalers"},
53
+ "image_variators" => {category: :image, type: "variators"},
54
+ "background_removers" => {category: :image, type: "background_removers"},
55
55
 
56
56
  # Text group
57
- "embedders" => { category: :text, type: "embedders" },
58
- "moderators" => { category: :text, type: "moderators" }
57
+ "embedders" => {category: :text, type: "embedders"},
58
+ "moderators" => {category: :text, type: "moderators"}
59
59
  }.freeze
60
60
 
61
61
  def validate_root_directory
@@ -17,17 +17,17 @@ module RubyLlmAgents
17
17
  source_root File.expand_path("templates", __dir__)
18
18
 
19
19
  class_option :provider, type: :string, default: "openai",
20
- desc: "The TTS provider to use (openai, elevenlabs)"
20
+ desc: "The TTS provider to use (openai, elevenlabs)"
21
21
  class_option :model, type: :string, default: nil,
22
- desc: "The TTS model to use"
22
+ desc: "The TTS model to use"
23
23
  class_option :voice, type: :string, default: "nova",
24
- desc: "The voice to use"
24
+ desc: "The voice to use"
25
25
  class_option :speed, type: :numeric, default: 1.0,
26
- desc: "Speech speed (0.25-4.0 for OpenAI)"
26
+ desc: "Speech speed (0.25-4.0 for OpenAI)"
27
27
  class_option :format, type: :string, default: "mp3",
28
- desc: "Output format (mp3, wav, ogg, flac)"
28
+ desc: "Output format (mp3, wav, ogg, flac)"
29
29
  class_option :cache, type: :string, default: nil,
30
- desc: "Cache TTL (e.g., '7.days')"
30
+ desc: "Cache TTL (e.g., '7.days')"
31
31
 
32
32
  def ensure_base_class_and_skill_file
33
33
  audio_dir = "app/agents/audio"
@@ -17,13 +17,13 @@ module RubyLlmAgents
17
17
  source_root File.expand_path("templates", __dir__)
18
18
 
19
19
  class_option :model, type: :string, default: "whisper-1",
20
- desc: "The transcription model to use"
20
+ desc: "The transcription model to use"
21
21
  class_option :language, type: :string, default: nil,
22
- desc: "Language code (e.g., 'en', 'es')"
22
+ desc: "Language code (e.g., 'en', 'es')"
23
23
  class_option :output_format, type: :string, default: "text",
24
- desc: "Output format (text, srt, vtt, json)"
24
+ desc: "Output format (text, srt, vtt, json)"
25
25
  class_option :cache, type: :string, default: nil,
26
- desc: "Cache TTL (e.g., '30.days')"
26
+ desc: "Cache TTL (e.g., '30.days')"
27
27
 
28
28
  def ensure_base_class_and_skill_file
29
29
  audio_dir = "app/agents/audio"
@@ -164,13 +164,13 @@ module RubyLlmAgents
164
164
  return false unless ActiveRecord::Base.connection.table_exists?(table)
165
165
 
166
166
  ActiveRecord::Base.connection.column_exists?(table, column)
167
- rescue StandardError
167
+ rescue
168
168
  false
169
169
  end
170
170
 
171
171
  def table_exists?(table)
172
172
  ActiveRecord::Base.connection.table_exists?(table)
173
- rescue StandardError
173
+ rescue
174
174
  false
175
175
  end
176
176
  end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Agents
5
+ class Speaker
6
+ # ActiveStorage integration for speakers
7
+ #
8
+ # Provides convenience methods for generating audio and directly
9
+ # attaching it to ActiveStorage attachments.
10
+ #
11
+ # @example Attaching to a model
12
+ # class Article < ApplicationRecord
13
+ # has_one_attached :narration
14
+ # end
15
+ #
16
+ # class ArticleNarrator < RubyLLM::Agents::Speaker
17
+ # include RubyLLM::Agents::Speaker::ActiveStorageSupport
18
+ #
19
+ # provider :openai
20
+ # model 'tts-1-hd'
21
+ # voice 'nova'
22
+ # end
23
+ #
24
+ # article = Article.find(1)
25
+ # result = ArticleNarrator.speak_and_attach(
26
+ # text: article.body,
27
+ # record: article,
28
+ # attachment_name: :narration
29
+ # )
30
+ #
31
+ module ActiveStorageSupport
32
+ extend ActiveSupport::Concern
33
+
34
+ class_methods do
35
+ # Generate audio and attach it to a record
36
+ #
37
+ # @param text [String] Text to convert to speech
38
+ # @param record [ActiveRecord::Base] The record to attach to
39
+ # @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
40
+ # @param options [Hash] Additional options for generation
41
+ # @return [SpeechResult] The speech result with audio_url set
42
+ def speak_and_attach(text:, record:, attachment_name:, **options)
43
+ result = call(text: text, **options)
44
+
45
+ return result unless result.success?
46
+
47
+ attach_audio_to_record(result, record, attachment_name, options)
48
+
49
+ result
50
+ end
51
+
52
+ private
53
+
54
+ def attach_audio_to_record(result, record, attachment_name, options)
55
+ attachment = record.public_send(attachment_name)
56
+ filename = options[:filename] || generate_audio_filename(result)
57
+
58
+ attachment.attach(
59
+ io: StringIO.new(result.audio),
60
+ filename: filename,
61
+ content_type: result.content_type
62
+ )
63
+
64
+ result.audio_key = attachment.blob.key if attachment.respond_to?(:blob) && attachment.blob
65
+ result.audio_url = blob_url(attachment) if attachment.respond_to?(:blob) && attachment.blob
66
+ end
67
+
68
+ def blob_url(attachment)
69
+ if attachment.blob.respond_to?(:url)
70
+ attachment.blob.url
71
+ elsif attachment.blob.respond_to?(:service_url)
72
+ attachment.blob.service_url
73
+ end
74
+ rescue => _e
75
+ nil
76
+ end
77
+
78
+ def generate_audio_filename(result)
79
+ timestamp = Time.current.to_i
80
+ ext = result.format || :mp3
81
+ "speech_#{timestamp}.#{ext}"
82
+ end
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
@@ -2,6 +2,8 @@
2
2
 
3
3
  require "digest"
4
4
  require_relative "../results/speech_result"
5
+ require_relative "speech_client"
6
+ require_relative "speech_pricing"
5
7
 
6
8
  module RubyLLM
7
9
  module Agents
@@ -194,19 +196,19 @@ module RubyLLM
194
196
 
195
197
  def default_tts_provider
196
198
  RubyLLM::Agents.configuration.default_tts_provider
197
- rescue StandardError
199
+ rescue
198
200
  :openai
199
201
  end
200
202
 
201
203
  def default_tts_model
202
204
  RubyLLM::Agents.configuration.default_tts_model
203
- rescue StandardError
205
+ rescue
204
206
  "tts-1"
205
207
  end
206
208
 
207
209
  def default_tts_voice
208
210
  RubyLLM::Agents.configuration.default_tts_voice
209
- rescue StandardError
211
+ rescue
210
212
  "nova"
211
213
  end
212
214
  end
@@ -334,6 +336,14 @@ module RubyLLM
334
336
  context.output_tokens = 0
335
337
  context.total_cost = calculate_cost(result)
336
338
 
339
+ # Store audio-specific metadata for execution tracking
340
+ context[:provider] = result[:provider].to_s
341
+ context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
342
+ context[:characters] = result[:characters]
343
+ context[:output_format] = result[:format].to_s
344
+ context[:file_size] = result[:audio]&.bytesize
345
+ context[:audio_duration_seconds] = result[:duration] if result[:duration]
346
+
337
347
  # Build final result
338
348
  context.output = build_result(
339
349
  result,
@@ -410,7 +420,15 @@ module RubyLLM
410
420
 
411
421
  # Executes standard (non-streaming) speech synthesis
412
422
  def execute_standard_speech(text, options)
413
- response = RubyLLM.speak(text, **options)
423
+ response = speech_client.speak(
424
+ text,
425
+ model: options[:model],
426
+ voice: options[:voice],
427
+ voice_id: resolved_voice_id,
428
+ speed: options[:speed],
429
+ response_format: options[:response_format] || "mp3",
430
+ voice_settings: options[:voice_settings]
431
+ )
414
432
 
415
433
  {
416
434
  audio: response.audio,
@@ -428,9 +446,17 @@ module RubyLLM
428
446
  def execute_streaming_speech(text, options)
429
447
  audio_chunks = []
430
448
 
431
- RubyLLM.speak(text, **options.merge(stream: true)) do |chunk|
449
+ speech_client.speak_streaming(
450
+ text,
451
+ model: options[:model],
452
+ voice: options[:voice],
453
+ voice_id: resolved_voice_id,
454
+ speed: options[:speed],
455
+ response_format: options[:response_format] || "mp3",
456
+ voice_settings: options[:voice_settings]
457
+ ) do |chunk|
432
458
  audio_chunks << chunk.audio if chunk.respond_to?(:audio)
433
- @streaming_block.call(chunk) if @streaming_block
459
+ @streaming_block&.call(chunk)
434
460
  end
435
461
 
436
462
  {
@@ -445,7 +471,7 @@ module RubyLLM
445
471
  }
446
472
  end
447
473
 
448
- # Builds options for RubyLLM.speak
474
+ # Builds options for SpeechClient
449
475
  def build_speak_options
450
476
  options = {
451
477
  model: resolved_model,
@@ -453,13 +479,11 @@ module RubyLLM
453
479
  }
454
480
 
455
481
  speed = resolved_speed
456
- options[:speed] = speed if speed && speed != 1.0
482
+ options[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
457
483
  options[:response_format] = resolved_output_format.to_s
458
484
 
459
- if resolved_provider == :elevenlabs
460
- voice_settings = self.class.voice_settings_config
461
- options[:voice_settings] = voice_settings.to_h if voice_settings
462
- end
485
+ voice_settings = self.class.voice_settings_config
486
+ options[:voice_settings] = voice_settings.to_h if voice_settings
463
487
 
464
488
  options
465
489
  end
@@ -488,29 +512,17 @@ module RubyLLM
488
512
 
489
513
  # Calculates cost for speech synthesis
490
514
  def calculate_cost(raw_result)
491
- characters = raw_result[:characters] || 0
492
-
493
- if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response].cost
515
+ if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response]&.cost
494
516
  return raw_result[:raw_response].cost
495
517
  end
496
518
 
497
- provider = raw_result[:provider]
498
- model_name = raw_result[:model].to_s
499
-
500
- price_per_1k_chars = case provider
501
- when :openai
502
- model_name.include?("hd") ? 0.030 : 0.015
503
- when :elevenlabs
504
- 0.30
505
- when :google
506
- 0.016
507
- when :polly
508
- 0.016
509
- else
510
- 0.015
511
- end
519
+ characters = raw_result[:characters] || 0
512
520
 
513
- (characters / 1000.0) * price_per_1k_chars
521
+ Audio::SpeechPricing.calculate_cost(
522
+ provider: raw_result[:provider],
523
+ model_id: raw_result[:model].to_s,
524
+ characters: characters
525
+ )
514
526
  end
515
527
 
516
528
  # Resolves the provider to use
@@ -547,6 +559,13 @@ module RubyLLM
547
559
  def streaming_enabled?
548
560
  @runtime_streaming || self.class.streaming?
549
561
  end
562
+
563
+ # Returns a SpeechClient for the resolved provider
564
+ def speech_client
565
+ @speech_client ||= Audio::SpeechClient.new(provider: resolved_provider)
566
+ end
550
567
  end
551
568
  end
552
569
  end
570
+
571
+ require_relative "speaker/active_storage_support"
@@ -0,0 +1,328 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "faraday"
4
+ require "json"
5
+
6
+ module RubyLLM
7
+ module Agents
8
+ module Audio
9
+ # Direct HTTP client for text-to-speech APIs.
10
+ #
11
+ # Supports OpenAI and ElevenLabs providers, bypassing the need for
12
+ # a RubyLLM.speak() method that does not exist in the base gem.
13
+ #
14
+ # @example OpenAI
15
+ # client = SpeechClient.new(provider: :openai)
16
+ # response = client.speak("Hello", model: "tts-1", voice: "nova")
17
+ # response.audio # => binary audio data
18
+ #
19
+ # @example ElevenLabs
20
+ # client = SpeechClient.new(provider: :elevenlabs)
21
+ # response = client.speak("Hello",
22
+ # model: "eleven_v3",
23
+ # voice: "Rachel",
24
+ # voice_id: "21m00Tcm4TlvDq8ikWAM",
25
+ # voice_settings: { stability: 0.5, similarity_boost: 0.75 }
26
+ # )
27
+ #
28
+ class SpeechClient
29
+ SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze
30
+
31
+ Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
32
+ def duration
33
+ nil
34
+ end
35
+
36
+ def cost
37
+ nil
38
+ end
39
+ end
40
+
41
+ StreamChunk = Struct.new(:audio, keyword_init: true)
42
+
43
+ # @param provider [Symbol] :openai or :elevenlabs
44
+ # @raise [UnsupportedProviderError] if provider is not supported
45
+ def initialize(provider:)
46
+ validate_provider!(provider)
47
+ @provider = provider
48
+ end
49
+
50
+ # Synthesize speech (non-streaming)
51
+ #
52
+ # @param text [String] text to convert
53
+ # @param model [String] model identifier
54
+ # @param voice [String] voice name
55
+ # @param voice_id [String, nil] voice ID (required for ElevenLabs)
56
+ # @param speed [Float, nil] speed multiplier
57
+ # @param response_format [String] output format
58
+ # @param voice_settings [Hash, nil] ElevenLabs voice settings
59
+ # @return [Response]
60
+ def speak(text, model:, voice:, voice_id: nil, speed: nil,
61
+ response_format: "mp3", voice_settings: nil)
62
+ case @provider
63
+ when :openai
64
+ openai_speak(text, model: model, voice: voice_id || voice,
65
+ speed: speed, response_format: response_format)
66
+ when :elevenlabs
67
+ elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
68
+ speed: speed, response_format: response_format,
69
+ voice_settings: voice_settings)
70
+ end
71
+ end
72
+
73
+ # Synthesize speech with streaming
74
+ #
75
+ # @param text [String] text to convert
76
+ # @param model [String] model identifier
77
+ # @param voice [String] voice name
78
+ # @param voice_id [String, nil] voice ID
79
+ # @param speed [Float, nil] speed multiplier
80
+ # @param response_format [String] output format
81
+ # @param voice_settings [Hash, nil] ElevenLabs voice settings
82
+ # @yield [StreamChunk] each audio chunk as it arrives
83
+ # @return [Response]
84
+ def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
85
+ response_format: "mp3", voice_settings: nil, &block)
86
+ case @provider
87
+ when :openai
88
+ openai_speak_streaming(text, model: model, voice: voice_id || voice,
89
+ speed: speed, response_format: response_format,
90
+ &block)
91
+ when :elevenlabs
92
+ elevenlabs_speak_streaming(text, model: model,
93
+ voice_id: voice_id || voice,
94
+ speed: speed,
95
+ response_format: response_format,
96
+ voice_settings: voice_settings, &block)
97
+ end
98
+ end
99
+
100
+ private
101
+
102
+ # ============================================================
103
+ # Provider validation
104
+ # ============================================================
105
+
106
+ def validate_provider!(provider)
107
+ return if SUPPORTED_PROVIDERS.include?(provider)
108
+
109
+ raise UnsupportedProviderError.new(
110
+ "Provider :#{provider} is not yet supported for text-to-speech. " \
111
+ "Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
112
+ provider: provider
113
+ )
114
+ end
115
+
116
+ # ============================================================
117
+ # OpenAI implementation
118
+ # ============================================================
119
+
120
+ def openai_speak(text, model:, voice:, speed:, response_format:)
121
+ body = openai_request_body(text, model: model, voice: voice,
122
+ speed: speed, response_format: response_format)
123
+
124
+ response = openai_connection.post("/v1/audio/speech") do |req|
125
+ req.headers["Content-Type"] = "application/json"
126
+ req.body = body.to_json
127
+ end
128
+
129
+ handle_error_response!(response) unless response.success?
130
+
131
+ Response.new(
132
+ audio: response.body,
133
+ format: response_format.to_sym,
134
+ model: model,
135
+ voice: voice
136
+ )
137
+ end
138
+
139
+ def openai_speak_streaming(text, model:, voice:, speed:,
140
+ response_format:, &block)
141
+ body = openai_request_body(text, model: model, voice: voice,
142
+ speed: speed, response_format: response_format)
143
+ chunks = []
144
+
145
+ openai_connection.post("/v1/audio/speech") do |req|
146
+ req.headers["Content-Type"] = "application/json"
147
+ req.body = body.to_json
148
+ req.options.on_data = proc do |chunk, _size, env|
149
+ if env.status == 200
150
+ chunk_obj = StreamChunk.new(audio: chunk)
151
+ chunks << chunk
152
+ block&.call(chunk_obj)
153
+ end
154
+ end
155
+ end
156
+
157
+ Response.new(
158
+ audio: chunks.join,
159
+ format: response_format.to_sym,
160
+ model: model,
161
+ voice: voice
162
+ )
163
+ end
164
+
165
+ def openai_request_body(text, model:, voice:, speed:, response_format:)
166
+ body = {
167
+ model: model,
168
+ input: text,
169
+ voice: voice,
170
+ response_format: response_format.to_s
171
+ }
172
+ body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
173
+ body
174
+ end
175
+
176
+ def openai_connection
177
+ @openai_connection ||= Faraday.new(url: openai_api_base) do |f|
178
+ f.headers["Authorization"] = "Bearer #{openai_api_key}"
179
+ f.adapter Faraday.default_adapter
180
+ f.options.timeout = 120
181
+ f.options.open_timeout = 30
182
+ end
183
+ end
184
+
185
+ def openai_api_key
186
+ key = RubyLLM.config.openai_api_key
187
+ unless key
188
+ raise ConfigurationError,
189
+ "OpenAI API key is required for text-to-speech. " \
190
+ "Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
191
+ end
192
+ key
193
+ end
194
+
195
+ def openai_api_base
196
+ base = RubyLLM.config.openai_api_base
197
+ (base && !base.empty?) ? base : "https://api.openai.com"
198
+ end
199
+
200
+ # ============================================================
201
+ # ElevenLabs implementation
202
+ # ============================================================
203
+
204
+ def elevenlabs_speak(text, model:, voice_id:, speed:,
205
+ response_format:, voice_settings:)
206
+ path = "/v1/text-to-speech/#{voice_id}"
207
+ body = elevenlabs_request_body(text, model: model, speed: speed,
208
+ voice_settings: voice_settings)
209
+ format_param = elevenlabs_output_format(response_format)
210
+
211
+ response = elevenlabs_connection.post(path) do |req|
212
+ req.headers["Content-Type"] = "application/json"
213
+ req.params["output_format"] = format_param
214
+ req.body = body.to_json
215
+ end
216
+
217
+ handle_error_response!(response) unless response.success?
218
+
219
+ Response.new(
220
+ audio: response.body,
221
+ format: response_format.to_sym,
222
+ model: model,
223
+ voice: voice_id
224
+ )
225
+ end
226
+
227
+ def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
228
+ response_format:, voice_settings:, &block)
229
+ path = "/v1/text-to-speech/#{voice_id}/stream"
230
+ body = elevenlabs_request_body(text, model: model, speed: speed,
231
+ voice_settings: voice_settings)
232
+ format_param = elevenlabs_output_format(response_format)
233
+ chunks = []
234
+
235
+ elevenlabs_connection.post(path) do |req|
236
+ req.headers["Content-Type"] = "application/json"
237
+ req.params["output_format"] = format_param
238
+ req.body = body.to_json
239
+ req.options.on_data = proc do |chunk, _size, env|
240
+ if env.status == 200
241
+ chunk_obj = StreamChunk.new(audio: chunk)
242
+ chunks << chunk
243
+ block&.call(chunk_obj)
244
+ end
245
+ end
246
+ end
247
+
248
+ Response.new(
249
+ audio: chunks.join,
250
+ format: response_format.to_sym,
251
+ model: model,
252
+ voice: voice_id
253
+ )
254
+ end
255
+
256
+ def elevenlabs_request_body(text, model:, speed:, voice_settings:)
257
+ body = {
258
+ text: text,
259
+ model_id: model
260
+ }
261
+
262
+ vs = voice_settings&.dup || {}
263
+ vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
264
+ body[:voice_settings] = vs unless vs.empty?
265
+
266
+ body
267
+ end
268
+
269
+ ELEVENLABS_FORMAT_MAP = {
270
+ "mp3" => "mp3_44100_128",
271
+ "pcm" => "pcm_44100",
272
+ "ulaw" => "ulaw_8000"
273
+ }.freeze
274
+
275
+ def elevenlabs_output_format(format)
276
+ ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
277
+ end
278
+
279
+ def elevenlabs_connection
280
+ @elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
281
+ f.headers["xi-api-key"] = elevenlabs_api_key
282
+ f.adapter Faraday.default_adapter
283
+ f.options.timeout = 120
284
+ f.options.open_timeout = 30
285
+ end
286
+ end
287
+
288
+ def elevenlabs_api_key
289
+ key = RubyLLM::Agents.configuration.elevenlabs_api_key
290
+ unless key
291
+ raise ConfigurationError,
292
+ "ElevenLabs API key is required for text-to-speech. " \
293
+ "Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
294
+ end
295
+ key
296
+ end
297
+
298
+ def elevenlabs_api_base
299
+ base = RubyLLM::Agents.configuration.elevenlabs_api_base
300
+ (base && !base.empty?) ? base : "https://api.elevenlabs.io"
301
+ end
302
+
303
+ # ============================================================
304
+ # Shared error handling
305
+ # ============================================================
306
+
307
+ def handle_error_response!(response)
308
+ raise SpeechApiError.new(
309
+ "TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
310
+ status: response.status,
311
+ response_body: response.body
312
+ )
313
+ end
314
+
315
+ def error_message_from(response)
316
+ parsed = JSON.parse(response.body)
317
+ if parsed.is_a?(Hash)
318
+ parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
319
+ else
320
+ response.body
321
+ end
322
+ rescue JSON::ParserError
323
+ response.body.to_s[0, 200]
324
+ end
325
+ end
326
+ end
327
+ end
328
+ end