ruby_llm-agents 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/app/controllers/ruby_llm/agents/agents_controller.rb +16 -14
- data/app/controllers/ruby_llm/agents/dashboard_controller.rb +20 -20
- data/app/controllers/ruby_llm/agents/executions_controller.rb +5 -7
- data/app/helpers/ruby_llm/agents/application_helper.rb +57 -58
- data/app/models/ruby_llm/agents/execution/analytics.rb +27 -27
- data/app/models/ruby_llm/agents/execution/scopes.rb +4 -6
- data/app/models/ruby_llm/agents/execution.rb +25 -25
- data/app/models/ruby_llm/agents/tenant/budgetable.rb +16 -10
- data/app/models/ruby_llm/agents/tenant/resettable.rb +12 -12
- data/app/models/ruby_llm/agents/tenant/trackable.rb +7 -7
- data/app/services/ruby_llm/agents/agent_registry.rb +6 -6
- data/lib/generators/ruby_llm_agents/agent_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/background_remover_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/embedder_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +7 -7
- data/lib/generators/ruby_llm_agents/image_editor_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_generator_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +9 -9
- data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_variator_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/install_generator.rb +3 -3
- data/lib/generators/ruby_llm_agents/migrate_structure_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb +2 -2
- data/lib/generators/ruby_llm_agents/restructure_generator.rb +13 -13
- data/lib/generators/ruby_llm_agents/speaker_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/transcriber_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/upgrade_generator.rb +2 -2
- data/lib/ruby_llm/agents/audio/speaker.rb +40 -31
- data/lib/ruby_llm/agents/audio/speech_client.rb +328 -0
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +273 -0
- data/lib/ruby_llm/agents/audio/transcriber.rb +33 -33
- data/lib/ruby_llm/agents/base_agent.rb +14 -14
- data/lib/ruby_llm/agents/core/base/callbacks.rb +3 -3
- data/lib/ruby_llm/agents/core/configuration.rb +86 -73
- data/lib/ruby_llm/agents/core/errors.rb +27 -2
- data/lib/ruby_llm/agents/core/instrumentation.rb +64 -66
- data/lib/ruby_llm/agents/core/llm_tenant.rb +7 -7
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/dsl/base.rb +3 -3
- data/lib/ruby_llm/agents/dsl/reliability.rb +9 -9
- data/lib/ruby_llm/agents/image/analyzer/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/analyzer/execution.rb +4 -4
- data/lib/ruby_llm/agents/image/background_remover/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/background_remover/execution.rb +3 -3
- data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +8 -8
- data/lib/ruby_llm/agents/image/editor/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/generator/pricing.rb +9 -10
- data/lib/ruby_llm/agents/image/generator.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/dsl.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/execution.rb +9 -9
- data/lib/ruby_llm/agents/image/pipeline.rb +1 -1
- data/lib/ruby_llm/agents/image/transformer/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/execution.rb +3 -5
- data/lib/ruby_llm/agents/image/variator/execution.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/alert_manager.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +9 -9
- data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +3 -3
- data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +17 -17
- data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +1 -0
- data/lib/ruby_llm/agents/infrastructure/execution_logger_job.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/reliability.rb +6 -6
- data/lib/ruby_llm/agents/pipeline/builder.rb +11 -11
- data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +3 -3
- data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +4 -4
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +34 -22
- data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +2 -3
- data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +7 -7
- data/lib/ruby_llm/agents/results/background_removal_result.rb +6 -6
- data/lib/ruby_llm/agents/results/embedding_result.rb +15 -15
- data/lib/ruby_llm/agents/results/image_analysis_result.rb +7 -7
- data/lib/ruby_llm/agents/results/image_edit_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_generation_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_pipeline_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_transform_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_upscale_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_variation_result.rb +4 -4
- data/lib/ruby_llm/agents/results/transcription_result.rb +1 -1
- data/lib/ruby_llm/agents/text/embedder.rb +13 -13
- metadata +3 -1
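
The headline change in 3.2.0 is the text-to-speech rework: two new files, audio/speech_client.rb and audio/speech_pricing.rb, plus a Speaker that now delegates to them (full diffs below). As a rough orientation, here is a minimal sketch of the new client API, assuming the public interface shown in the speech_client.rb diff further down; the require path and output file name are assumptions, not taken from the diff.

    # Sketch only -- class and method names come from the speech_client.rb diff below.
    require "ruby_llm/agents"

    client = RubyLLM::Agents::Audio::SpeechClient.new(provider: :openai)
    response = client.speak("Hello from 3.2.0", model: "tts-1", voice: "nova")
    File.binwrite("hello.mp3", response.audio) # Response#audio is raw binary audio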
data/lib/generators/ruby_llm_agents/speaker_generator.rb

@@ -17,17 +17,17 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)

     class_option :provider, type: :string, default: "openai",
-      desc: "The TTS provider to use (openai, elevenlabs)"
+      desc: "The TTS provider to use (openai, elevenlabs)"
     class_option :model, type: :string, default: nil,
-      desc: "The TTS model to use"
+      desc: "The TTS model to use"
     class_option :voice, type: :string, default: "nova",
-      desc: "The voice to use"
+      desc: "The voice to use"
     class_option :speed, type: :numeric, default: 1.0,
-      desc: "Speech speed (0.25-4.0 for OpenAI)"
+      desc: "Speech speed (0.25-4.0 for OpenAI)"
     class_option :format, type: :string, default: "mp3",
-      desc: "Output format (mp3, wav, ogg, flac)"
+      desc: "Output format (mp3, wav, ogg, flac)"
     class_option :cache, type: :string, default: nil,
-      desc: "Cache TTL (e.g., '7.days')"
+      desc: "Cache TTL (e.g., '7.days')"

     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"
data/lib/generators/ruby_llm_agents/transcriber_generator.rb

@@ -17,13 +17,13 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)

     class_option :model, type: :string, default: "whisper-1",
-      desc: "The transcription model to use"
+      desc: "The transcription model to use"
     class_option :language, type: :string, default: nil,
-      desc: "Language code (e.g., 'en', 'es')"
+      desc: "Language code (e.g., 'en', 'es')"
     class_option :output_format, type: :string, default: "text",
-      desc: "Output format (text, srt, vtt, json)"
+      desc: "Output format (text, srt, vtt, json)"
     class_option :cache, type: :string, default: nil,
-      desc: "Cache TTL (e.g., '30.days')"
+      desc: "Cache TTL (e.g., '30.days')"

     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"
@@ -164,13 +164,13 @@ module RubyLlmAgents
       return false unless ActiveRecord::Base.connection.table_exists?(table)

       ActiveRecord::Base.connection.column_exists?(table, column)
-    rescue
+    rescue
       false
     end

     def table_exists?(table)
       ActiveRecord::Base.connection.table_exists?(table)
-    rescue
+    rescue
       false
     end
   end
data/lib/ruby_llm/agents/audio/speaker.rb

@@ -2,6 +2,8 @@

 require "digest"
 require_relative "../results/speech_result"
+require_relative "speech_client"
+require_relative "speech_pricing"

 module RubyLLM
   module Agents
@@ -194,19 +196,19 @@ module RubyLLM

       def default_tts_provider
         RubyLLM::Agents.configuration.default_tts_provider
-      rescue
+      rescue
         :openai
       end

       def default_tts_model
         RubyLLM::Agents.configuration.default_tts_model
-      rescue
+      rescue
         "tts-1"
       end

       def default_tts_voice
         RubyLLM::Agents.configuration.default_tts_voice
-      rescue
+      rescue
         "nova"
       end
     end
@@ -410,7 +412,15 @@ module RubyLLM

       # Executes standard (non-streaming) speech synthesis
       def execute_standard_speech(text, options)
-        response =
+        response = speech_client.speak(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        )

         {
           audio: response.audio,
@@ -428,9 +438,17 @@ module RubyLLM
       def execute_streaming_speech(text, options)
         audio_chunks = []

-
+        speech_client.speak_streaming(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        ) do |chunk|
           audio_chunks << chunk.audio if chunk.respond_to?(:audio)
-          @streaming_block
+          @streaming_block&.call(chunk)
         end

         {
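
The streaming path now also runs through SpeechClient. For orientation, a caller-side sketch of how the yielded chunks might be consumed — speak_streaming and StreamChunk are defined in the new client shown further down; the file-writing part is illustrative.

    # Sketch: write TTS audio chunks to disk as they arrive.
    client = RubyLLM::Agents::Audio::SpeechClient.new(provider: :openai)

    File.open("notes.mp3", "wb") do |io|
      client.speak_streaming("Streaming example", model: "tts-1", voice: "nova") do |chunk|
        io.write(chunk.audio) # chunk is a SpeechClient::StreamChunk
      end
    end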
@@ -445,7 +463,7 @@ module RubyLLM
         }
       end

-      # Builds options for
+      # Builds options for SpeechClient
       def build_speak_options
         options = {
           model: resolved_model,
@@ -453,13 +471,11 @@ module RubyLLM
         }

         speed = resolved_speed
-        options[:speed] = speed if speed && speed
+        options[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
         options[:response_format] = resolved_output_format.to_s

-
-
-        options[:voice_settings] = voice_settings.to_h if voice_settings
-        end
+        voice_settings = self.class.voice_settings_config
+        options[:voice_settings] = voice_settings.to_h if voice_settings

         options
       end
@@ -488,29 +504,17 @@ module RubyLLM

       # Calculates cost for speech synthesis
       def calculate_cost(raw_result)
-
-
-        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response].cost
+        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response]&.cost
           return raw_result[:raw_response].cost
         end

-
-        model_name = raw_result[:model].to_s
-
-        price_per_1k_chars = case provider
-        when :openai
-          model_name.include?("hd") ? 0.030 : 0.015
-        when :elevenlabs
-          0.30
-        when :google
-          0.016
-        when :polly
-          0.016
-        else
-          0.015
-        end
+        characters = raw_result[:characters] || 0

-        (
+        Audio::SpeechPricing.calculate_cost(
+          provider: raw_result[:provider],
+          model_id: raw_result[:model].to_s,
+          characters: characters
+        )
       end

       # Resolves the provider to use
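
audio/speech_pricing.rb (+273 lines) appears in the file list but its body is not included in this diff. Based on the call site above, its interface presumably resembles the following hypothetical sketch; the rates shown are the inline values removed from calculate_cost, not the gem's actual table.

    # Hypothetical sketch of Audio::SpeechPricing implied by the call above.
    # The real implementation (273 lines) is not shown in this diff.
    module RubyLLM
      module Agents
        module Audio
          module SpeechPricing
            # Illustrative per-1k-character rates, mirroring the removed inline values.
            def self.calculate_cost(provider:, model_id:, characters:)
              rate =
                case provider&.to_sym
                when :openai     then model_id.include?("hd") ? 0.030 : 0.015
                when :elevenlabs then 0.30
                else 0.015
                end
              (characters / 1000.0) * rate
            end
          end
        end
      end
    end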
@@ -547,6 +551,11 @@ module RubyLLM
       def streaming_enabled?
         @runtime_streaming || self.class.streaming?
       end
+
+      # Returns a SpeechClient for the resolved provider
+      def speech_client
+        @speech_client ||= Audio::SpeechClient.new(provider: resolved_provider)
+      end
     end
   end
 end
data/lib/ruby_llm/agents/audio/speech_client.rb

@@ -0,0 +1,328 @@
+# frozen_string_literal: true
+
+require "faraday"
+require "json"
+
+module RubyLLM
+  module Agents
+    module Audio
+      # Direct HTTP client for text-to-speech APIs.
+      #
+      # Supports OpenAI and ElevenLabs providers, bypassing the need for
+      # a RubyLLM.speak() method that does not exist in the base gem.
+      #
+      # @example OpenAI
+      #   client = SpeechClient.new(provider: :openai)
+      #   response = client.speak("Hello", model: "tts-1", voice: "nova")
+      #   response.audio # => binary audio data
+      #
+      # @example ElevenLabs
+      #   client = SpeechClient.new(provider: :elevenlabs)
+      #   response = client.speak("Hello",
+      #     model: "eleven_v3",
+      #     voice: "Rachel",
+      #     voice_id: "21m00Tcm4TlvDq8ikWAM",
+      #     voice_settings: { stability: 0.5, similarity_boost: 0.75 }
+      #   )
+      #
+      class SpeechClient
+        SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze
+
+        Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
+          def duration
+            nil
+          end
+
+          def cost
+            nil
+          end
+        end
+
+        StreamChunk = Struct.new(:audio, keyword_init: true)
+
+        # @param provider [Symbol] :openai or :elevenlabs
+        # @raise [UnsupportedProviderError] if provider is not supported
+        def initialize(provider:)
+          validate_provider!(provider)
+          @provider = provider
+        end
+
+        # Synthesize speech (non-streaming)
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID (required for ElevenLabs)
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @return [Response]
+        def speak(text, model:, voice:, voice_id: nil, speed: nil,
+                  response_format: "mp3", voice_settings: nil)
+          case @provider
+          when :openai
+            openai_speak(text, model: model, voice: voice_id || voice,
+                         speed: speed, response_format: response_format)
+          when :elevenlabs
+            elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
+                             speed: speed, response_format: response_format,
+                             voice_settings: voice_settings)
+          end
+        end
+
+        # Synthesize speech with streaming
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @yield [StreamChunk] each audio chunk as it arrives
+        # @return [Response]
+        def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
+                            response_format: "mp3", voice_settings: nil, &block)
+          case @provider
+          when :openai
+            openai_speak_streaming(text, model: model, voice: voice_id || voice,
+                                   speed: speed, response_format: response_format,
+                                   &block)
+          when :elevenlabs
+            elevenlabs_speak_streaming(text, model: model,
+                                       voice_id: voice_id || voice,
+                                       speed: speed,
+                                       response_format: response_format,
+                                       voice_settings: voice_settings, &block)
+          end
+        end
+
+        private
+
+        # ============================================================
+        # Provider validation
+        # ============================================================
+
+        def validate_provider!(provider)
+          return if SUPPORTED_PROVIDERS.include?(provider)
+
+          raise UnsupportedProviderError.new(
+            "Provider :#{provider} is not yet supported for text-to-speech. " \
+            "Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
+            provider: provider
+          )
+        end
+
+        # ============================================================
+        # OpenAI implementation
+        # ============================================================
+
+        def openai_speak(text, model:, voice:, speed:, response_format:)
+          body = openai_request_body(text, model: model, voice: voice,
+                                     speed: speed, response_format: response_format)
+
+          response = openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+          end
+
+          handle_error_response!(response) unless response.success?
+
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+
+        def openai_speak_streaming(text, model:, voice:, speed:,
+                                   response_format:, &block)
+          body = openai_request_body(text, model: model, voice: voice,
+                                     speed: speed, response_format: response_format)
+          chunks = []
+
+          openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+
+        def openai_request_body(text, model:, voice:, speed:, response_format:)
+          body = {
+            model: model,
+            input: text,
+            voice: voice,
+            response_format: response_format.to_s
+          }
+          body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body
+        end
+
+        def openai_connection
+          @openai_connection ||= Faraday.new(url: openai_api_base) do |f|
+            f.headers["Authorization"] = "Bearer #{openai_api_key}"
+            f.adapter Faraday.default_adapter
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+
+        def openai_api_key
+          key = RubyLLM.config.openai_api_key
+          unless key
+            raise ConfigurationError,
+                  "OpenAI API key is required for text-to-speech. " \
+                  "Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
+          end
+          key
+        end
+
+        def openai_api_base
+          base = RubyLLM.config.openai_api_base
+          (base && !base.empty?) ? base : "https://api.openai.com"
+        end
+
+        # ============================================================
+        # ElevenLabs implementation
+        # ============================================================
+
+        def elevenlabs_speak(text, model:, voice_id:, speed:,
+                             response_format:, voice_settings:)
+          path = "/v1/text-to-speech/#{voice_id}"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+                                         voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+
+          response = elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+          end
+
+          handle_error_response!(response) unless response.success?
+
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+
+        def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
+                                       response_format:, voice_settings:, &block)
+          path = "/v1/text-to-speech/#{voice_id}/stream"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+                                         voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+          chunks = []
+
+          elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+
+        def elevenlabs_request_body(text, model:, speed:, voice_settings:)
+          body = {
+            text: text,
+            model_id: model
+          }
+
+          vs = voice_settings&.dup || {}
+          vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body[:voice_settings] = vs unless vs.empty?
+
+          body
+        end
+
+        ELEVENLABS_FORMAT_MAP = {
+          "mp3" => "mp3_44100_128",
+          "pcm" => "pcm_44100",
+          "ulaw" => "ulaw_8000"
+        }.freeze
+
+        def elevenlabs_output_format(format)
+          ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
+        end
+
+        def elevenlabs_connection
+          @elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
+            f.headers["xi-api-key"] = elevenlabs_api_key
+            f.adapter Faraday.default_adapter
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+
+        def elevenlabs_api_key
+          key = RubyLLM::Agents.configuration.elevenlabs_api_key
+          unless key
+            raise ConfigurationError,
+                  "ElevenLabs API key is required for text-to-speech. " \
+                  "Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
+          end
+          key
+        end
+
+        def elevenlabs_api_base
+          base = RubyLLM::Agents.configuration.elevenlabs_api_base
+          (base && !base.empty?) ? base : "https://api.elevenlabs.io"
+        end
+
+        # ============================================================
+        # Shared error handling
+        # ============================================================
+
+        def handle_error_response!(response)
+          raise SpeechApiError.new(
+            "TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
+            status: response.status,
+            response_body: response.body
+          )
+        end
+
+        def error_message_from(response)
+          parsed = JSON.parse(response.body)
+          if parsed.is_a?(Hash)
+            parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
+          else
+            response.body
+          end
+        rescue JSON::ParserError
+          response.body.to_s[0, 200]
+        end
+      end
+    end
+  end
+end