ruby_llm-agents 3.0.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/app/controllers/ruby_llm/agents/agents_controller.rb +16 -14
- data/app/controllers/ruby_llm/agents/dashboard_controller.rb +20 -20
- data/app/controllers/ruby_llm/agents/executions_controller.rb +5 -7
- data/app/helpers/ruby_llm/agents/application_helper.rb +57 -58
- data/app/models/ruby_llm/agents/execution/analytics.rb +27 -27
- data/app/models/ruby_llm/agents/execution/scopes.rb +4 -6
- data/app/models/ruby_llm/agents/execution.rb +26 -26
- data/app/models/ruby_llm/agents/tenant/budgetable.rb +16 -10
- data/app/models/ruby_llm/agents/tenant/resettable.rb +12 -12
- data/app/models/ruby_llm/agents/tenant/trackable.rb +7 -7
- data/app/services/ruby_llm/agents/agent_registry.rb +6 -6
- data/app/views/layouts/ruby_llm/agents/application.html.erb +142 -11
- data/app/views/ruby_llm/agents/agents/show.html.erb +10 -10
- data/app/views/ruby_llm/agents/dashboard/index.html.erb +10 -10
- data/app/views/ruby_llm/agents/executions/show.html.erb +13 -0
- data/lib/generators/ruby_llm_agents/agent_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/background_remover_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/embedder_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +7 -7
- data/lib/generators/ruby_llm_agents/image_editor_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_generator_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +9 -9
- data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/image_variator_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/install_generator.rb +3 -3
- data/lib/generators/ruby_llm_agents/migrate_structure_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb +2 -2
- data/lib/generators/ruby_llm_agents/restructure_generator.rb +13 -13
- data/lib/generators/ruby_llm_agents/speaker_generator.rb +6 -6
- data/lib/generators/ruby_llm_agents/templates/add_assistant_prompt_migration.rb.tt +9 -0
- data/lib/generators/ruby_llm_agents/templates/split_execution_details_migration.rb.tt +2 -1
- data/lib/generators/ruby_llm_agents/transcriber_generator.rb +4 -4
- data/lib/generators/ruby_llm_agents/upgrade_generator.rb +22 -3
- data/lib/ruby_llm/agents/audio/speaker.rb +40 -31
- data/lib/ruby_llm/agents/audio/speech_client.rb +328 -0
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +273 -0
- data/lib/ruby_llm/agents/audio/transcriber.rb +33 -33
- data/lib/ruby_llm/agents/base_agent.rb +16 -15
- data/lib/ruby_llm/agents/core/base/callbacks.rb +3 -3
- data/lib/ruby_llm/agents/core/configuration.rb +86 -73
- data/lib/ruby_llm/agents/core/errors.rb +27 -2
- data/lib/ruby_llm/agents/core/instrumentation.rb +101 -65
- data/lib/ruby_llm/agents/core/llm_tenant.rb +7 -7
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/dsl/base.rb +3 -3
- data/lib/ruby_llm/agents/dsl/reliability.rb +9 -9
- data/lib/ruby_llm/agents/image/analyzer/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/analyzer/execution.rb +4 -4
- data/lib/ruby_llm/agents/image/background_remover/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/background_remover/execution.rb +3 -3
- data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +8 -8
- data/lib/ruby_llm/agents/image/editor/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/generator/pricing.rb +9 -10
- data/lib/ruby_llm/agents/image/generator.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/dsl.rb +6 -6
- data/lib/ruby_llm/agents/image/pipeline/execution.rb +9 -9
- data/lib/ruby_llm/agents/image/pipeline.rb +1 -1
- data/lib/ruby_llm/agents/image/transformer/execution.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/dsl.rb +1 -1
- data/lib/ruby_llm/agents/image/upscaler/execution.rb +3 -5
- data/lib/ruby_llm/agents/image/variator/execution.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/alert_manager.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +4 -4
- data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +9 -9
- data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +3 -3
- data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +17 -17
- data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +1 -0
- data/lib/ruby_llm/agents/infrastructure/execution_logger_job.rb +1 -1
- data/lib/ruby_llm/agents/infrastructure/reliability.rb +6 -6
- data/lib/ruby_llm/agents/pipeline/builder.rb +11 -11
- data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +3 -3
- data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +4 -4
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +62 -21
- data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +2 -3
- data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +82 -4
- data/lib/ruby_llm/agents/results/background_removal_result.rb +6 -6
- data/lib/ruby_llm/agents/results/embedding_result.rb +15 -15
- data/lib/ruby_llm/agents/results/image_analysis_result.rb +7 -7
- data/lib/ruby_llm/agents/results/image_edit_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_generation_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_pipeline_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_transform_result.rb +4 -4
- data/lib/ruby_llm/agents/results/image_upscale_result.rb +5 -5
- data/lib/ruby_llm/agents/results/image_variation_result.rb +4 -4
- data/lib/ruby_llm/agents/results/transcription_result.rb +1 -1
- data/lib/ruby_llm/agents/text/embedder.rb +13 -13
- metadata +4 -1
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "faraday"
require "json"

module RubyLLM
  module Agents
    module Audio
      # Direct HTTP client for text-to-speech APIs.
      #
      # Supports OpenAI and ElevenLabs providers, bypassing the need for
      # a RubyLLM.speak() method that does not exist in the base gem.
      #
      # @example OpenAI
      #   client = SpeechClient.new(provider: :openai)
      #   response = client.speak("Hello", model: "tts-1", voice: "nova")
      #   response.audio # => binary audio data
      #
      # @example ElevenLabs
      #   client = SpeechClient.new(provider: :elevenlabs)
      #   response = client.speak("Hello",
      #     model: "eleven_v3",
      #     voice: "Rachel",
      #     voice_id: "21m00Tcm4TlvDq8ikWAM",
      #     voice_settings: { stability: 0.5, similarity_boost: 0.75 }
      #   )
      #
      class SpeechClient
        SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze

        # Result of a synthesis call. Neither provider reports audio
        # duration or cost in its HTTP response, so both readers return
        # nil here; cost is computed separately (see SpeechPricing).
        Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
          def duration
            nil
          end

          def cost
            nil
          end
        end

        # One chunk of audio delivered during a streaming synthesis.
        StreamChunk = Struct.new(:audio, keyword_init: true)

        # @param provider [Symbol] :openai or :elevenlabs
        # @raise [UnsupportedProviderError] if provider is not supported
        def initialize(provider:)
          validate_provider!(provider)
          @provider = provider
        end

        # Synthesize speech (non-streaming)
        #
        # @param text [String] text to convert
        # @param model [String] model identifier
        # @param voice [String] voice name
        # @param voice_id [String, nil] voice ID (required for ElevenLabs)
        # @param speed [Float, nil] speed multiplier
        # @param response_format [String] output format
        # @param voice_settings [Hash, nil] ElevenLabs voice settings
        # @return [Response]
        # @raise [SpeechApiError] when the API returns a non-success status
        def speak(text, model:, voice:, voice_id: nil, speed: nil,
                  response_format: "mp3", voice_settings: nil)
          case @provider
          when :openai
            # OpenAI has no separate voice-ID concept; prefer voice_id when given.
            openai_speak(text, model: model, voice: voice_id || voice,
                         speed: speed, response_format: response_format)
          when :elevenlabs
            elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
                             speed: speed, response_format: response_format,
                             voice_settings: voice_settings)
          end
        end

        # Synthesize speech with streaming
        #
        # @param text [String] text to convert
        # @param model [String] model identifier
        # @param voice [String] voice name
        # @param voice_id [String, nil] voice ID
        # @param speed [Float, nil] speed multiplier
        # @param response_format [String] output format
        # @param voice_settings [Hash, nil] ElevenLabs voice settings
        # @yield [StreamChunk] each audio chunk as it arrives
        # @return [Response] full response with all chunks concatenated
        # @raise [SpeechApiError] when the API returns a non-success status
        def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
                            response_format: "mp3", voice_settings: nil, &block)
          case @provider
          when :openai
            openai_speak_streaming(text, model: model, voice: voice_id || voice,
                                   speed: speed, response_format: response_format,
                                   &block)
          when :elevenlabs
            elevenlabs_speak_streaming(text, model: model,
                                       voice_id: voice_id || voice,
                                       speed: speed,
                                       response_format: response_format,
                                       voice_settings: voice_settings, &block)
          end
        end

        private

        # ============================================================
        # Provider validation
        # ============================================================

        def validate_provider!(provider)
          return if SUPPORTED_PROVIDERS.include?(provider)

          raise UnsupportedProviderError.new(
            "Provider :#{provider} is not yet supported for text-to-speech. " \
            "Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
            provider: provider
          )
        end

        # ============================================================
        # OpenAI implementation
        # ============================================================

        def openai_speak(text, model:, voice:, speed:, response_format:)
          body = openai_request_body(text, model: model, voice: voice,
                                     speed: speed, response_format: response_format)

          response = openai_connection.post("/v1/audio/speech") do |req|
            req.headers["Content-Type"] = "application/json"
            req.body = body.to_json
          end

          handle_error_response!(response) unless response.success?

          Response.new(
            audio: response.body,
            format: response_format.to_sym,
            model: model,
            voice: voice
          )
        end

        def openai_speak_streaming(text, model:, voice:, speed:,
                                   response_format:, &block)
          body = openai_request_body(text, model: model, voice: voice,
                                     speed: speed, response_format: response_format)
          chunks = []

          response = openai_connection.post("/v1/audio/speech") do |req|
            req.headers["Content-Type"] = "application/json"
            req.body = body.to_json
            req.options.on_data = proc do |chunk, _size, env|
              # Only forward audio for successful responses; error bodies
              # are handled below via handle_error_response!.
              if env.status == 200
                chunk_obj = StreamChunk.new(audio: chunk)
                chunks << chunk
                block&.call(chunk_obj)
              end
            end
          end

          # BUGFIX: previously streaming errors were silently swallowed and
          # an empty Response was returned. Fail loudly like the
          # non-streaming path does.
          handle_error_response!(response) unless response.success?

          Response.new(
            audio: chunks.join,
            format: response_format.to_sym,
            model: model,
            voice: voice
          )
        end

        def openai_request_body(text, model:, voice:, speed:, response_format:)
          body = {
            model: model,
            input: text,
            voice: voice,
            response_format: response_format.to_s
          }
          # Omit speed when it is effectively 1.0 (the API default).
          body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
          body
        end

        def openai_connection
          @openai_connection ||= Faraday.new(url: openai_api_base) do |f|
            f.headers["Authorization"] = "Bearer #{openai_api_key}"
            f.adapter Faraday.default_adapter
            f.options.timeout = 120
            f.options.open_timeout = 30
          end
        end

        def openai_api_key
          key = RubyLLM.config.openai_api_key
          unless key
            raise ConfigurationError,
                  "OpenAI API key is required for text-to-speech. " \
                  "Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
          end
          key
        end

        def openai_api_base
          base = RubyLLM.config.openai_api_base
          (base && !base.empty?) ? base : "https://api.openai.com"
        end

        # ============================================================
        # ElevenLabs implementation
        # ============================================================

        def elevenlabs_speak(text, model:, voice_id:, speed:,
                             response_format:, voice_settings:)
          path = "/v1/text-to-speech/#{voice_id}"
          body = elevenlabs_request_body(text, model: model, speed: speed,
                                         voice_settings: voice_settings)
          format_param = elevenlabs_output_format(response_format)

          response = elevenlabs_connection.post(path) do |req|
            req.headers["Content-Type"] = "application/json"
            req.params["output_format"] = format_param
            req.body = body.to_json
          end

          handle_error_response!(response) unless response.success?

          Response.new(
            audio: response.body,
            format: response_format.to_sym,
            model: model,
            voice: voice_id
          )
        end

        def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
                                       response_format:, voice_settings:, &block)
          path = "/v1/text-to-speech/#{voice_id}/stream"
          body = elevenlabs_request_body(text, model: model, speed: speed,
                                         voice_settings: voice_settings)
          format_param = elevenlabs_output_format(response_format)
          chunks = []

          response = elevenlabs_connection.post(path) do |req|
            req.headers["Content-Type"] = "application/json"
            req.params["output_format"] = format_param
            req.body = body.to_json
            req.options.on_data = proc do |chunk, _size, env|
              if env.status == 200
                chunk_obj = StreamChunk.new(audio: chunk)
                chunks << chunk
                block&.call(chunk_obj)
              end
            end
          end

          # BUGFIX: raise on streaming errors instead of returning an
          # empty Response (parity with elevenlabs_speak).
          handle_error_response!(response) unless response.success?

          Response.new(
            audio: chunks.join,
            format: response_format.to_sym,
            model: model,
            voice: voice_id
          )
        end

        def elevenlabs_request_body(text, model:, speed:, voice_settings:)
          body = {
            text: text,
            model_id: model
          }

          # ElevenLabs nests speed inside voice_settings rather than at
          # the top level; only send the hash when it has content.
          vs = voice_settings&.dup || {}
          vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
          body[:voice_settings] = vs unless vs.empty?

          body
        end

        ELEVENLABS_FORMAT_MAP = {
          "mp3" => "mp3_44100_128",
          "pcm" => "pcm_44100",
          "ulaw" => "ulaw_8000"
        }.freeze

        def elevenlabs_output_format(format)
          ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
        end

        def elevenlabs_connection
          @elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
            f.headers["xi-api-key"] = elevenlabs_api_key
            f.adapter Faraday.default_adapter
            f.options.timeout = 120
            f.options.open_timeout = 30
          end
        end

        def elevenlabs_api_key
          key = RubyLLM::Agents.configuration.elevenlabs_api_key
          unless key
            raise ConfigurationError,
                  "ElevenLabs API key is required for text-to-speech. " \
                  "Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
          end
          key
        end

        def elevenlabs_api_base
          base = RubyLLM::Agents.configuration.elevenlabs_api_base
          (base && !base.empty?) ? base : "https://api.elevenlabs.io"
        end

        # ============================================================
        # Shared error handling
        # ============================================================

        def handle_error_response!(response)
          raise SpeechApiError.new(
            "TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
            status: response.status,
            response_body: response.body
          )
        end

        def error_message_from(response)
          # BUGFIX: streamed error responses may have a nil body;
          # JSON.parse(nil) raises TypeError, which the rescue below would
          # not catch. Coerce to String first.
          parsed = JSON.parse(response.body.to_s)
          if parsed.is_a?(Hash)
            parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
          else
            response.body
          end
        rescue JSON::ParserError
          response.body.to_s[0, 200]
        end
      end
    end
  end
end
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module RubyLLM
|
|
7
|
+
module Agents
|
|
8
|
+
module Audio
|
|
9
|
+
# Dynamic pricing resolution for text-to-speech models.
|
|
10
|
+
#
|
|
11
|
+
# Uses the same three-tier strategy as ImageGenerator::Pricing:
|
|
12
|
+
# 1. LiteLLM JSON (primary) - future-proof, auto-updating
|
|
13
|
+
# 2. Configurable pricing table - user overrides via config.tts_model_pricing
|
|
14
|
+
# 3. Hardcoded fallbacks - per-model defaults
|
|
15
|
+
#
|
|
16
|
+
# All prices are per 1,000 characters.
|
|
17
|
+
#
|
|
18
|
+
# @example Get cost for a speech operation
|
|
19
|
+
# SpeechPricing.calculate_cost(provider: :openai, model_id: "tts-1", characters: 5000)
|
|
20
|
+
# # => 0.075
|
|
21
|
+
#
|
|
22
|
+
# @example User-configured pricing
|
|
23
|
+
# RubyLLM::Agents.configure do |c|
|
|
24
|
+
# c.tts_model_pricing = {
|
|
25
|
+
# "eleven_v3" => 0.24,
|
|
26
|
+
# "tts-1" => 0.015
|
|
27
|
+
# }
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
module SpeechPricing
|
|
31
|
+
extend self
|
|
32
|
+
|
|
33
|
+
LITELLM_PRICING_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
|
|
34
|
+
DEFAULT_CACHE_TTL = 24 * 60 * 60 # 24 hours
|
|
35
|
+
|
|
36
|
+
# Calculate total cost for a speech operation
|
|
37
|
+
#
|
|
38
|
+
# @param provider [Symbol] :openai or :elevenlabs
|
|
39
|
+
# @param model_id [String] The model identifier
|
|
40
|
+
# @param characters [Integer] Number of characters synthesized
|
|
41
|
+
# @return [Float] Total cost in USD
|
|
42
|
+
def calculate_cost(provider:, model_id:, characters:)
|
|
43
|
+
price_per_1k = cost_per_1k_characters(provider, model_id)
|
|
44
|
+
((characters / 1000.0) * price_per_1k).round(6)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Get cost per 1,000 characters for a model
|
|
48
|
+
#
|
|
49
|
+
# @param provider [Symbol] Provider identifier
|
|
50
|
+
# @param model_id [String] Model identifier
|
|
51
|
+
# @return [Float] Cost per 1K characters in USD
|
|
52
|
+
def cost_per_1k_characters(provider, model_id)
|
|
53
|
+
if (litellm_price = from_litellm(model_id))
|
|
54
|
+
return litellm_price
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
if (config_price = from_config(model_id))
|
|
58
|
+
return config_price
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
fallback_price(provider, model_id)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Force refresh of cached LiteLLM data
|
|
65
|
+
def refresh!
|
|
66
|
+
@litellm_data = nil
|
|
67
|
+
@litellm_fetched_at = nil
|
|
68
|
+
litellm_data
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Expose all known pricing for debugging/dashboard
|
|
72
|
+
def all_pricing
|
|
73
|
+
{
|
|
74
|
+
litellm: litellm_tts_models,
|
|
75
|
+
configured: config.tts_model_pricing || {},
|
|
76
|
+
fallbacks: fallback_pricing_table
|
|
77
|
+
}
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
private
|
|
81
|
+
|
|
82
|
+
# ============================================================
|
|
83
|
+
# Tier 1: LiteLLM
|
|
84
|
+
# ============================================================
|
|
85
|
+
|
|
86
|
+
def from_litellm(model_id)
|
|
87
|
+
data = litellm_data
|
|
88
|
+
return nil unless data
|
|
89
|
+
|
|
90
|
+
model_data = find_litellm_model(data, model_id)
|
|
91
|
+
return nil unless model_data
|
|
92
|
+
|
|
93
|
+
extract_litellm_tts_price(model_data)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def find_litellm_model(data, model_id)
|
|
97
|
+
normalized = normalize_model_id(model_id)
|
|
98
|
+
|
|
99
|
+
candidates = [
|
|
100
|
+
model_id,
|
|
101
|
+
normalized,
|
|
102
|
+
"tts/#{model_id}",
|
|
103
|
+
"openai/#{model_id}",
|
|
104
|
+
"elevenlabs/#{model_id}"
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
candidates.each do |key|
|
|
108
|
+
return data[key] if data[key]
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
data.find do |key, _|
|
|
112
|
+
key.to_s.downcase.include?(normalized.downcase)
|
|
113
|
+
end&.last
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def extract_litellm_tts_price(model_data)
|
|
117
|
+
if model_data["input_cost_per_character"]
|
|
118
|
+
return model_data["input_cost_per_character"] * 1000
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
if model_data["output_cost_per_character"]
|
|
122
|
+
return model_data["output_cost_per_character"] * 1000
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
if model_data["output_cost_per_audio_token"]
|
|
126
|
+
return model_data["output_cost_per_audio_token"] * 250
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
nil
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def litellm_data
|
|
133
|
+
return @litellm_data if @litellm_data && !cache_expired?
|
|
134
|
+
|
|
135
|
+
@litellm_data = fetch_litellm_data
|
|
136
|
+
@litellm_fetched_at = Time.now
|
|
137
|
+
@litellm_data
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def fetch_litellm_data
|
|
141
|
+
if defined?(Rails) && Rails.respond_to?(:cache) && Rails.cache
|
|
142
|
+
Rails.cache.fetch("litellm_tts_pricing_data", expires_in: cache_ttl) do
|
|
143
|
+
fetch_from_url
|
|
144
|
+
end
|
|
145
|
+
else
|
|
146
|
+
fetch_from_url
|
|
147
|
+
end
|
|
148
|
+
rescue => e
|
|
149
|
+
warn "[RubyLLM::Agents] Failed to fetch LiteLLM TTS pricing: #{e.message}"
|
|
150
|
+
{}
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def fetch_from_url
|
|
154
|
+
uri = URI(config.litellm_pricing_url || LITELLM_PRICING_URL)
|
|
155
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
156
|
+
http.use_ssl = uri.scheme == "https"
|
|
157
|
+
http.open_timeout = 5
|
|
158
|
+
http.read_timeout = 10
|
|
159
|
+
|
|
160
|
+
request = Net::HTTP::Get.new(uri)
|
|
161
|
+
response = http.request(request)
|
|
162
|
+
|
|
163
|
+
if response.is_a?(Net::HTTPSuccess)
|
|
164
|
+
JSON.parse(response.body)
|
|
165
|
+
else
|
|
166
|
+
{}
|
|
167
|
+
end
|
|
168
|
+
rescue => e
|
|
169
|
+
warn "[RubyLLM::Agents] HTTP error fetching LiteLLM pricing: #{e.message}"
|
|
170
|
+
{}
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def cache_expired?
|
|
174
|
+
return true unless @litellm_fetched_at
|
|
175
|
+
Time.now - @litellm_fetched_at > cache_ttl
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def cache_ttl
|
|
179
|
+
ttl = config.litellm_pricing_cache_ttl
|
|
180
|
+
return DEFAULT_CACHE_TTL unless ttl
|
|
181
|
+
ttl.respond_to?(:to_i) ? ttl.to_i : ttl
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def litellm_tts_models
|
|
185
|
+
litellm_data.select do |key, value|
|
|
186
|
+
value.is_a?(Hash) && (
|
|
187
|
+
value["input_cost_per_character"] ||
|
|
188
|
+
key.to_s.match?(/tts|speech|eleven/i)
|
|
189
|
+
)
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# ============================================================
|
|
194
|
+
# Tier 2: User configuration
|
|
195
|
+
# ============================================================
|
|
196
|
+
|
|
197
|
+
def from_config(model_id)
|
|
198
|
+
table = config.tts_model_pricing
|
|
199
|
+
return nil unless table.is_a?(Hash) && !table.empty?
|
|
200
|
+
|
|
201
|
+
normalized = normalize_model_id(model_id)
|
|
202
|
+
|
|
203
|
+
price = table[model_id] || table[normalized] ||
|
|
204
|
+
table[model_id.to_sym] || table[normalized.to_sym]
|
|
205
|
+
|
|
206
|
+
price if price.is_a?(Numeric)
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
# ============================================================
|
|
210
|
+
# Tier 3: Hardcoded fallbacks
|
|
211
|
+
# ============================================================
|
|
212
|
+
|
|
213
|
+
def fallback_price(provider, model_id)
|
|
214
|
+
normalized = normalize_model_id(model_id)
|
|
215
|
+
|
|
216
|
+
case provider
|
|
217
|
+
when :openai
|
|
218
|
+
openai_fallback_price(normalized)
|
|
219
|
+
when :elevenlabs
|
|
220
|
+
elevenlabs_fallback_price(normalized)
|
|
221
|
+
else
|
|
222
|
+
config.default_tts_cost || 0.015
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
def openai_fallback_price(model_id)
|
|
227
|
+
case model_id
|
|
228
|
+
when /tts-1-hd/ then 0.030
|
|
229
|
+
when /tts-1/ then 0.015
|
|
230
|
+
else 0.015
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
def elevenlabs_fallback_price(model_id)
|
|
235
|
+
case model_id
|
|
236
|
+
when /eleven_flash_v2/ then 0.15
|
|
237
|
+
when /eleven_turbo_v2/ then 0.15
|
|
238
|
+
when /eleven_v3/ then 0.30
|
|
239
|
+
when /eleven_multilingual_v2/ then 0.30
|
|
240
|
+
when /eleven_multilingual_v1/ then 0.30
|
|
241
|
+
when /eleven_monolingual_v1/ then 0.30
|
|
242
|
+
else 0.30
|
|
243
|
+
end
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def fallback_pricing_table
|
|
247
|
+
{
|
|
248
|
+
"tts-1" => 0.015,
|
|
249
|
+
"tts-1-hd" => 0.030,
|
|
250
|
+
"eleven_monolingual_v1" => 0.30,
|
|
251
|
+
"eleven_multilingual_v1" => 0.30,
|
|
252
|
+
"eleven_multilingual_v2" => 0.30,
|
|
253
|
+
"eleven_turbo_v2" => 0.15,
|
|
254
|
+
"eleven_flash_v2" => 0.15,
|
|
255
|
+
"eleven_turbo_v2_5" => 0.15,
|
|
256
|
+
"eleven_flash_v2_5" => 0.15,
|
|
257
|
+
"eleven_v3" => 0.30
|
|
258
|
+
}
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
def normalize_model_id(model_id)
|
|
262
|
+
model_id.to_s.downcase
|
|
263
|
+
.gsub(/[^a-z0-9._-]/, "-").squeeze("-")
|
|
264
|
+
.gsub(/^-|-$/, "")
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
def config
|
|
268
|
+
RubyLLM::Agents.configuration
|
|
269
|
+
end
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
end
|