RubyGems - ruby_llm-agents - Versions diffs - 3.3.0 → 3.4.0 - Mend

ruby_llm-agents 3.3.0 → 3.4.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/README.md +1 -1
data/lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb +187 -0
data/lib/ruby_llm/agents/audio/speaker.rb +38 -0
data/lib/ruby_llm/agents/audio/speech_client.rb +26 -2
data/lib/ruby_llm/agents/audio/speech_pricing.rb +44 -3
data/lib/ruby_llm/agents/core/configuration.rb +8 -1
data/lib/ruby_llm/agents/core/version.rb +1 -1
data/lib/ruby_llm/agents/results/speech_result.rb +19 -16
metadata +2 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 463487c17c50bf1496a30c9eea51dab3c334a17010853da97da5624a6cf564b5
-  data.tar.gz: 470a6666266d17dc8190f5118eec0c5f674fb51bae8b368d8713ea65d1882025
+  metadata.gz: 82355e2a179ddaf2f5003b2cbd972f373b2ca49cdcc2847535aec89fb18ed046
+  data.tar.gz: '09656de02af43adafdfe2615d1bfcb67aee76602fd0699d0f739eda731f29d8d'
 SHA512:
-  metadata.gz: b1e2d4688dfc294c3b94c95df084a248fa25fbcd7d99910f31f72b85138bf37a392a8b350462ee341d5d209674b844a9d2692a177db30845857a586fa77ce3bc
-  data.tar.gz: 50130237011f12c808a073d55b9083ce8449f8e0ddf8dd800c13134104f48233ea4f0fac3ddcbcc9a8b45b91814f67db5c57632dd6d54930c1ffd19fda825e96
+  metadata.gz: a5c8b20da41f0f73b8fdbffb809cecc726f1e7e6030d8351c5b994c58192b8d18da7693fa8fadec603f8dfb29ab7dd40907877600f58af185ab9d5542a884dcf
+  data.tar.gz: b6c0c90038a87f2824ff52b0bedd901528e291748a18caff4fd2df403affd351bf7cdf05db3042e6daae05d602c672f90a9a97434e6f54f9834c337ebae1a607

data/README.md CHANGED

@@ -135,7 +135,7 @@ result.save("logo.png")
 | **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
 | **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
 | **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
-| **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
+| **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dynamic pricing, 28+ output formats, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
 | **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
 ## Quick Start

data/lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb ADDED

@@ -0,0 +1,187 @@
+# frozen_string_literal: true
+require "faraday"
+require "json"
+module RubyLLM
+  module Agents
+    module Audio
+      module ElevenLabs
+        # Fetches and caches ElevenLabs model data from the /v1/models API.
+        #
+        # Used for:
+        # - Dynamic cost calculation via character_cost_multiplier
+        # - Model validation (TTS vs STS capability)
+        # - Capability awareness (style, speaker_boost, max chars, languages)
+        #
+        # @example Check if a model supports TTS
+        #   ElevenLabs::ModelRegistry.tts_model?("eleven_v3") # => true
+        #   ElevenLabs::ModelRegistry.tts_model?("eleven_english_sts_v2") # => false
+        #
+        # @example Get cost multiplier
+        #   ElevenLabs::ModelRegistry.cost_multiplier("eleven_flash_v2_5") # => 0.5
+        #
+        module ModelRegistry
+          extend self
+          # Returns all models from the ElevenLabs API (cached)
+          #
+          # @return [Array<Hash>] Array of model hashes
+          def models
+            @mutex ||= Mutex.new
+            @mutex.synchronize do
+              if @models && !cache_expired?
+                return @models
+              end
+              @models = fetch_models
+              @fetched_at = Time.now
+              @models
+            end
+          end
+          # Find a specific model by ID
+          #
+          # @param model_id [String] The model identifier
+          # @return [Hash, nil] Model hash or nil if not found
+          def find(model_id)
+            models.find { |m| m["model_id"] == model_id.to_s }
+          end
+          # Check if model supports text-to-speech
+          #
+          # @param model_id [String] The model identifier
+          # @return [Boolean]
+          def tts_model?(model_id)
+            model = find(model_id)
+            return false unless model
+            model["can_do_text_to_speech"] == true
+          end
+          # Get character_cost_multiplier for a model
+          #
+          # @param model_id [String] The model identifier
+          # @return [Float] Cost multiplier (defaults to 1.0 for unknown models)
+          def cost_multiplier(model_id)
+            model = find(model_id)
+            model&.dig("model_rates", "character_cost_multiplier") || 1.0
+          end
+          # Get max characters per request for a model
+          #
+          # @param model_id [String] The model identifier
+          # @return [Integer, nil] Max characters or nil if unknown
+          def max_characters(model_id)
+            model = find(model_id)
+            model&.dig("maximum_text_length_per_request")
+          end
+          # Get supported language IDs for a model
+          #
+          # @param model_id [String] The model identifier
+          # @return [Array<String>] Language IDs (e.g. ["en", "es", "ja"])
+          def languages(model_id)
+            model = find(model_id)
+            model&.dig("languages")&.map { |l| l["language_id"] } || []
+          end
+          # Check if model supports the style voice setting
+          #
+          # @param model_id [String] The model identifier
+          # @return [Boolean]
+          def supports_style?(model_id)
+            find(model_id)&.dig("can_use_style") == true
+          end
+          # Check if model supports the speaker_boost setting
+          #
+          # @param model_id [String] The model identifier
+          # @return [Boolean]
+          def supports_speaker_boost?(model_id)
+            find(model_id)&.dig("can_use_speaker_boost") == true
+          end
+          # Check if model supports voice conversion (speech-to-speech)
+          # Used by VoiceConverter agent (see plans/elevenlabs_voice_converter.md)
+          #
+          # @param model_id [String] The model identifier
+          # @return [Boolean]
+          def voice_conversion_model?(model_id)
+            model = find(model_id)
+            return false unless model
+            model["can_do_voice_conversion"] == true
+          end
+          # Force refresh the cache
+          #
+          # @return [Array<Hash>] Fresh model data
+          def refresh!
+            @mutex ||= Mutex.new
+            @mutex.synchronize do
+              @models = nil
+              @fetched_at = nil
+            end
+            models
+          end
+          # Clear cache without re-fetching (useful for tests)
+          #
+          # @return [void]
+          def clear_cache!
+            @mutex ||= Mutex.new
+            @mutex.synchronize do
+              @models = nil
+              @fetched_at = nil
+            end
+          end
+          private
+          def fetch_models
+            return [] unless api_key
+            response = connection.get("/v1/models")
+            if response.success?
+              parsed = JSON.parse(response.body)
+              parsed.is_a?(Array) ? parsed : []
+            else
+              warn "[RubyLLM::Agents] ElevenLabs /v1/models returned HTTP #{response.status}"
+              @models || []
+            end
+          rescue Faraday::Error, JSON::ParserError => e
+            warn "[RubyLLM::Agents] Failed to fetch ElevenLabs models: #{e.message}"
+            @models || []
+          end
+          def cache_expired?
+            return true unless @fetched_at
+            ttl = RubyLLM::Agents.configuration.elevenlabs_models_cache_ttl || 21_600
+            Time.now - @fetched_at > ttl
+          end
+          def api_key
+            RubyLLM::Agents.configuration.elevenlabs_api_key
+          end
+          def api_base
+            base = RubyLLM::Agents.configuration.elevenlabs_api_base
+            (base && !base.empty?) ? base : "https://api.elevenlabs.io"
+          end
+          def connection
+            Faraday.new(url: api_base) do |f|
+              f.headers["xi-api-key"] = api_key
+              f.adapter Faraday.default_adapter
+              f.options.timeout = 10
+              f.options.open_timeout = 5
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/agents/audio/speaker.rb CHANGED

@@ -4,6 +4,7 @@ require "digest"
 require_relative "../results/speech_result"
 require_relative "speech_client"
 require_relative "speech_pricing"
+require_relative "elevenlabs/model_registry"
 module RubyLLM
   module Agents
@@ -409,6 +410,7 @@ module RubyLLM
       # Executes speech synthesis
       def execute_speech(processed_text)
+        validate_elevenlabs_model!(processed_text)
         speak_options = build_speak_options
         if streaming_enabled? && @streaming_block
@@ -418,6 +420,42 @@ module RubyLLM
         end
       end
+      # Validates ElevenLabs model capabilities before calling the API.
+      # Raises on hard errors (non-TTS model), warns on soft issues.
+      def validate_elevenlabs_model!(text)
+        return unless resolved_provider == :elevenlabs
+        return unless defined?(Audio::ElevenLabs::ModelRegistry)
+        model_id = resolved_model
+        model = Audio::ElevenLabs::ModelRegistry.find(model_id)
+        return unless model # Unknown model — skip validation
+        # Hard error: model doesn't support TTS at all
+        unless model["can_do_text_to_speech"] == true
+          raise ConfigurationError,
+            "ElevenLabs model '#{model_id}' does not support text-to-speech. " \
+            "It may be a speech-to-speech model. Use a TTS-capable model like 'eleven_v3'."
+        end
+        # Warn: text exceeds model's max character limit
+        max_chars = model["maximum_text_length_per_request"]
+        if max_chars && text.length > max_chars
+          warn "[RubyLLM::Agents] Text length (#{text.length}) exceeds " \
+               "#{model_id} max of #{max_chars} characters. The API may truncate or reject it."
+        end
+        # Warn: style used on model that doesn't support it
+        vs = self.class.voice_settings_config
+        if vs && vs.style_value && vs.style_value > 0 && model["can_use_style"] != true
+          warn "[RubyLLM::Agents] Model '#{model_id}' does not support the 'style' voice setting. It will be ignored."
+        end
+      rescue ConfigurationError
+        raise
+      rescue => e
+        # Don't block speech on validation errors
+        warn "[RubyLLM::Agents] ElevenLabs model validation failed: #{e.message}"
+      end
       # Executes standard (non-streaming) speech synthesis
       def execute_standard_speech(text, options)
         response = speech_client.speak(

data/lib/ruby_llm/agents/audio/speech_client.rb CHANGED

@@ -2,6 +2,7 @@
 require "faraday"
 require "json"
+require "set"
 module RubyLLM
   module Agents
@@ -266,14 +267,37 @@ module RubyLLM
           body
         end
+        # Convenience mapping: simple symbol → ElevenLabs native format string
         ELEVENLABS_FORMAT_MAP = {
           "mp3" => "mp3_44100_128",
-          "pcm" => "pcm_44100",
+          "wav" => "wav_44100",
+          "ogg" => "mp3_44100_128",   # ElevenLabs doesn't support ogg; fallback to mp3
+          "pcm" => "pcm_24000",
+          "opus" => "opus_48000_128",
+          "flac" => "mp3_44100_128",  # ElevenLabs doesn't support flac; fallback to mp3
+          "aac" => "mp3_44100_128",   # ElevenLabs doesn't support aac; fallback to mp3
+          "alaw" => "alaw_8000",
           "ulaw" => "ulaw_8000"
         }.freeze
+        # All valid ElevenLabs native format strings (pass-through)
+        ELEVENLABS_NATIVE_FORMATS = Set.new(%w[
+          mp3_22050_32 mp3_24000_48 mp3_44100_32 mp3_44100_64
+          mp3_44100_96 mp3_44100_128 mp3_44100_192
+          pcm_8000 pcm_16000 pcm_22050 pcm_24000 pcm_32000 pcm_44100 pcm_48000
+          wav_8000 wav_16000 wav_22050 wav_24000 wav_32000 wav_44100 wav_48000
+          opus_48000_32 opus_48000_64 opus_48000_96 opus_48000_128 opus_48000_192
+          alaw_8000 ulaw_8000
+        ]).freeze
         def elevenlabs_output_format(format)
-          ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
+          format_str = format.to_s
+          # Pass through native ElevenLabs format strings directly
+          return format_str if ELEVENLABS_NATIVE_FORMATS.include?(format_str)
+          # Map simple symbols to native formats
+          ELEVENLABS_FORMAT_MAP[format_str] || "mp3_44100_128"
         end
         def elevenlabs_connection

data/lib/ruby_llm/agents/audio/speech_pricing.rb CHANGED

@@ -8,10 +8,11 @@ module RubyLLM
     module Audio
       # Dynamic pricing resolution for text-to-speech models.
       #
-      # Uses the same three-tier strategy as ImageGenerator::Pricing:
+      # Uses a four-tier pricing cascade:
       # 1. LiteLLM JSON (primary) - future-proof, auto-updating
       # 2. Configurable pricing table - user overrides via config.tts_model_pricing
-      # 3. Hardcoded fallbacks - per-model defaults
+      # 3. ElevenLabs API - dynamic multiplier × base rate from /v1/models
+      # 4. Hardcoded fallbacks - per-model defaults
       #
       # All prices are per 1,000 characters.
       #
@@ -50,14 +51,22 @@ module RubyLLM
         # @param model_id [String] Model identifier
         # @return [Float] Cost per 1K characters in USD
         def cost_per_1k_characters(provider, model_id)
+          # Tier 1: LiteLLM
           if (litellm_price = from_litellm(model_id))
             return litellm_price
           end
+          # Tier 2: User config overrides
           if (config_price = from_config(model_id))
             return config_price
           end
+          # Tier 3: ElevenLabs API multiplier × base rate
+          if provider == :elevenlabs && (api_price = from_elevenlabs_api(model_id))
+            return api_price
+          end
+          # Tier 4: Hardcoded fallbacks
           fallback_price(provider, model_id)
         end
@@ -73,6 +82,7 @@ module RubyLLM
           {
             litellm: litellm_tts_models,
             configured: config.tts_model_pricing || {},
+            elevenlabs_api: elevenlabs_api_pricing,
             fallbacks: fallback_pricing_table
           }
         end
@@ -190,6 +200,19 @@ module RubyLLM
           end
         end
+        def elevenlabs_api_pricing
+          return {} unless defined?(ElevenLabs::ModelRegistry)
+          base = config.elevenlabs_base_cost_per_1k || 0.30
+          ElevenLabs::ModelRegistry.models.each_with_object({}) do |model, hash|
+            multiplier = model.dig("model_rates", "character_cost_multiplier") || 1.0
+            hash[model["model_id"]] = (base * multiplier).round(6)
+          end
+        rescue => e
+          warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
+          {}
+        end
         # ============================================================
         # Tier 2: User configuration
         # ============================================================
@@ -207,7 +230,25 @@ module RubyLLM
         end
         # ============================================================
-        # Tier 3: Hardcoded fallbacks
+        # Tier 3: ElevenLabs API (dynamic multiplier × base rate)
+        # ============================================================
+        def from_elevenlabs_api(model_id)
+          return nil unless defined?(ElevenLabs::ModelRegistry)
+          model = ElevenLabs::ModelRegistry.find(model_id)
+          return nil unless model
+          multiplier = model.dig("model_rates", "character_cost_multiplier") || 1.0
+          base = config.elevenlabs_base_cost_per_1k || 0.30
+          (base * multiplier).round(6)
+        rescue => e
+          warn "[RubyLLM::Agents] Failed to get ElevenLabs API pricing: #{e.message}"
+          nil
+        end
+        # ============================================================
+        # Tier 4: Hardcoded fallbacks
         # ============================================================
         def fallback_price(provider, model_id)

data/lib/ruby_llm/agents/core/configuration.rb CHANGED

@@ -453,7 +453,9 @@ module RubyLLM
         :root_namespace,
         :tool_result_max_length,
         :redaction,
-        :persist_audio_data
+        :persist_audio_data,
+        :elevenlabs_base_cost_per_1k,
+        :elevenlabs_models_cache_ttl
       # Attributes with validation (readers only, custom setters below)
       attr_reader :default_temperature,
@@ -738,6 +740,11 @@ module RubyLLM
         # Audio data persistence (disabled by default — base64 audio can be large)
         @persist_audio_data = false
+        # ElevenLabs dynamic pricing: base cost per 1K characters (Pro plan overage rate)
+        @elevenlabs_base_cost_per_1k = 0.30
+        # ElevenLabs models cache TTL in seconds (6 hours)
+        @elevenlabs_models_cache_ttl = 21_600
       end
       # Returns the configured cache store, falling back to Rails.cache

data/lib/ruby_llm/agents/core/version.rb CHANGED

@@ -4,6 +4,6 @@ module RubyLLM
   module Agents
     # Current version of the RubyLLM::Agents gem
     # @return [String] Semantic version string
-    VERSION = "3.3.0"
+    VERSION = "3.4.0"
   end
 end

data/lib/ruby_llm/agents/results/speech_result.rb CHANGED

@@ -319,23 +319,26 @@ module RubyLLM
       #
       # @return [String] MIME type
       def mime_type_for_format
+        fmt = format.to_s
+        # Handle ElevenLabs native format strings (e.g., "mp3_44100_128")
+        return "audio/mpeg" if fmt.start_with?("mp3")
+        return "audio/wav" if fmt.start_with?("wav")
+        return "audio/opus" if fmt.start_with?("opus")
+        return "audio/pcm" if fmt.start_with?("pcm")
+        return "audio/alaw" if fmt.start_with?("alaw")
+        return "audio/basic" if fmt.start_with?("ulaw")
+        # Handle simple symbols (backward compatible)
         case format
-        when :mp3
-          "audio/mpeg"
-        when :wav
-          "audio/wav"
-        when :ogg
-          "audio/ogg"
-        when :flac
-          "audio/flac"
-        when :aac
-          "audio/aac"
-        when :opus
-          "audio/opus"
-        when :pcm
-          "audio/pcm"
-        else
-          "audio/mpeg" # Default to mp3
+        when :mp3 then "audio/mpeg"
+        when :wav then "audio/wav"
+        when :ogg then "audio/ogg"
+        when :flac then "audio/flac"
+        when :aac then "audio/aac"
+        when :opus then "audio/opus"
+        when :pcm then "audio/pcm"
+        else "audio/mpeg"
         end
       end
     end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ruby_llm-agents
 version: !ruby/object:Gem::Version
-  version: 3.3.0
+  version: 3.4.0
 platform: ruby
 authors:
 - adham90
@@ -209,6 +209,7 @@ files:
 - lib/generators/ruby_llm_agents/upgrade_generator.rb
 - lib/ruby_llm-agents.rb
 - lib/ruby_llm/agents.rb
+- lib/ruby_llm/agents/audio/elevenlabs/model_registry.rb
 - lib/ruby_llm/agents/audio/speaker.rb
 - lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
 - lib/ruby_llm/agents/audio/speech_client.rb