RubyGems - ruby_llm-agents - Versions diffs - 3.1.0 → 3.3.0 - Mend

ruby_llm-agents 3.1.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

data/lib/generators/ruby_llm_agents/multi_tenancy_generator.rb CHANGED Viewed

@@ -102,7 +102,7 @@ module RubyLlmAgents
     def table_exists?(table)
       ActiveRecord::Base.connection.table_exists?(table)
-    rescue StandardError
+    rescue
       false
     end
@@ -110,7 +110,7 @@ module RubyLlmAgents
       return false unless ActiveRecord::Base.connection.table_exists?(table)
       ActiveRecord::Base.connection.column_exists?(table, column)
-    rescue StandardError
+    rescue
       false
     end
   end

data/lib/generators/ruby_llm_agents/restructure_generator.rb CHANGED Viewed

@@ -37,25 +37,25 @@ module RubyLlmAgents
     # Maps old directory -> { category:, type: }
     DIRECTORY_MAPPING = {
       # Top-level under llm/
-      "agents" => { category: nil, type: "agents" },
-      "tools" => { category: nil, type: "tools" },
+      "agents" => {category: nil, type: "agents"},
+      "tools" => {category: nil, type: "tools"},
       # Audio group
-      "speakers" => { category: :audio, type: "speakers" },
-      "transcribers" => { category: :audio, type: "transcribers" },
+      "speakers" => {category: :audio, type: "speakers"},
+      "transcribers" => {category: :audio, type: "transcribers"},
       # Image group
-      "image_generators" => { category: :image, type: "generators" },
-      "image_editors" => { category: :image, type: "editors" },
-      "image_analyzers" => { category: :image, type: "analyzers" },
-      "image_transformers" => { category: :image, type: "transformers" },
-      "image_upscalers" => { category: :image, type: "upscalers" },
-      "image_variators" => { category: :image, type: "variators" },
-      "background_removers" => { category: :image, type: "background_removers" },
+      "image_generators" => {category: :image, type: "generators"},
+      "image_editors" => {category: :image, type: "editors"},
+      "image_analyzers" => {category: :image, type: "analyzers"},
+      "image_transformers" => {category: :image, type: "transformers"},
+      "image_upscalers" => {category: :image, type: "upscalers"},
+      "image_variators" => {category: :image, type: "variators"},
+      "background_removers" => {category: :image, type: "background_removers"},
       # Text group
-      "embedders" => { category: :text, type: "embedders" },
-      "moderators" => { category: :text, type: "moderators" }
+      "embedders" => {category: :text, type: "embedders"},
+      "moderators" => {category: :text, type: "moderators"}
     }.freeze
     def validate_root_directory

data/lib/generators/ruby_llm_agents/speaker_generator.rb CHANGED Viewed

@@ -17,17 +17,17 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)
     class_option :provider, type: :string, default: "openai",
-                 desc: "The TTS provider to use (openai, elevenlabs)"
+      desc: "The TTS provider to use (openai, elevenlabs)"
     class_option :model, type: :string, default: nil,
-                 desc: "The TTS model to use"
+      desc: "The TTS model to use"
     class_option :voice, type: :string, default: "nova",
-                 desc: "The voice to use"
+      desc: "The voice to use"
     class_option :speed, type: :numeric, default: 1.0,
-                 desc: "Speech speed (0.25-4.0 for OpenAI)"
+      desc: "Speech speed (0.25-4.0 for OpenAI)"
     class_option :format, type: :string, default: "mp3",
-                 desc: "Output format (mp3, wav, ogg, flac)"
+      desc: "Output format (mp3, wav, ogg, flac)"
     class_option :cache, type: :string, default: nil,
-                 desc: "Cache TTL (e.g., '7.days')"
+      desc: "Cache TTL (e.g., '7.days')"
     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"

data/lib/generators/ruby_llm_agents/transcriber_generator.rb CHANGED Viewed

@@ -17,13 +17,13 @@ module RubyLlmAgents
     source_root File.expand_path("templates", __dir__)
     class_option :model, type: :string, default: "whisper-1",
-                 desc: "The transcription model to use"
+      desc: "The transcription model to use"
     class_option :language, type: :string, default: nil,
-                 desc: "Language code (e.g., 'en', 'es')"
+      desc: "Language code (e.g., 'en', 'es')"
     class_option :output_format, type: :string, default: "text",
-                 desc: "Output format (text, srt, vtt, json)"
+      desc: "Output format (text, srt, vtt, json)"
     class_option :cache, type: :string, default: nil,
-                 desc: "Cache TTL (e.g., '30.days')"
+      desc: "Cache TTL (e.g., '30.days')"
     def ensure_base_class_and_skill_file
       audio_dir = "app/agents/audio"

data/lib/generators/ruby_llm_agents/upgrade_generator.rb CHANGED Viewed

@@ -164,13 +164,13 @@ module RubyLlmAgents
       return false unless ActiveRecord::Base.connection.table_exists?(table)
       ActiveRecord::Base.connection.column_exists?(table, column)
-    rescue StandardError
+    rescue
       false
     end
     def table_exists?(table)
       ActiveRecord::Base.connection.table_exists?(table)
-    rescue StandardError
+    rescue
       false
     end
   end

data/lib/ruby_llm/agents/audio/speaker/active_storage_support.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Agents
+    class Speaker
+      # ActiveStorage integration for speakers
+      #
+      # Provides convenience methods for generating audio and directly
+      # attaching it to ActiveStorage attachments.
+      #
+      # @example Attaching to a model
+      #   class Article < ApplicationRecord
+      #     has_one_attached :narration
+      #   end
+      #
+      #   class ArticleNarrator < RubyLLM::Agents::Speaker
+      #     include RubyLLM::Agents::Speaker::ActiveStorageSupport
+      #
+      #     provider :openai
+      #     model 'tts-1-hd'
+      #     voice 'nova'
+      #   end
+      #
+      #   article = Article.find(1)
+      #   result = ArticleNarrator.speak_and_attach(
+      #     text: article.body,
+      #     record: article,
+      #     attachment_name: :narration
+      #   )
+      #
+      module ActiveStorageSupport
+        extend ActiveSupport::Concern
+        class_methods do
+          # Generate audio and attach it to a record
+          #
+          # @param text [String] Text to convert to speech
+          # @param record [ActiveRecord::Base] The record to attach to
+          # @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
+          # @param options [Hash] Additional options for generation
+          # @return [SpeechResult] The speech result with audio_url set
+          def speak_and_attach(text:, record:, attachment_name:, **options)
+            result = call(text: text, **options)
+            return result unless result.success?
+            attach_audio_to_record(result, record, attachment_name, options)
+            result
+          end
+          private
+          def attach_audio_to_record(result, record, attachment_name, options)
+            attachment = record.public_send(attachment_name)
+            filename = options[:filename] || generate_audio_filename(result)
+            attachment.attach(
+              io: StringIO.new(result.audio),
+              filename: filename,
+              content_type: result.content_type
+            )
+            result.audio_key = attachment.blob.key if attachment.respond_to?(:blob) && attachment.blob
+            result.audio_url = blob_url(attachment) if attachment.respond_to?(:blob) && attachment.blob
+          end
+          def blob_url(attachment)
+            if attachment.blob.respond_to?(:url)
+              attachment.blob.url
+            elsif attachment.blob.respond_to?(:service_url)
+              attachment.blob.service_url
+            end
+          rescue => _e
+            nil
+          end
+          def generate_audio_filename(result)
+            timestamp = Time.current.to_i
+            ext = result.format || :mp3
+            "speech_#{timestamp}.#{ext}"
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/agents/audio/speaker.rb CHANGED Viewed

@@ -2,6 +2,8 @@
 require "digest"
 require_relative "../results/speech_result"
+require_relative "speech_client"
+require_relative "speech_pricing"
 module RubyLLM
   module Agents
@@ -194,19 +196,19 @@ module RubyLLM
         def default_tts_provider
           RubyLLM::Agents.configuration.default_tts_provider
-        rescue StandardError
+        rescue
           :openai
         end
         def default_tts_model
           RubyLLM::Agents.configuration.default_tts_model
-        rescue StandardError
+        rescue
           "tts-1"
         end
         def default_tts_voice
           RubyLLM::Agents.configuration.default_tts_voice
-        rescue StandardError
+        rescue
           "nova"
         end
       end
@@ -334,6 +336,14 @@ module RubyLLM
         context.output_tokens = 0
         context.total_cost = calculate_cost(result)
+        # Store audio-specific metadata for execution tracking
+        context[:provider] = result[:provider].to_s
+        context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
+        context[:characters] = result[:characters]
+        context[:output_format] = result[:format].to_s
+        context[:file_size] = result[:audio]&.bytesize
+        context[:audio_duration_seconds] = result[:duration] if result[:duration]
         # Build final result
         context.output = build_result(
           result,
@@ -410,7 +420,15 @@ module RubyLLM
       # Executes standard (non-streaming) speech synthesis
       def execute_standard_speech(text, options)
-        response = RubyLLM.speak(text, **options)
+        response = speech_client.speak(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        )
         {
           audio: response.audio,
@@ -428,9 +446,17 @@ module RubyLLM
       def execute_streaming_speech(text, options)
         audio_chunks = []
-        RubyLLM.speak(text, **options.merge(stream: true)) do |chunk|
+        speech_client.speak_streaming(
+          text,
+          model: options[:model],
+          voice: options[:voice],
+          voice_id: resolved_voice_id,
+          speed: options[:speed],
+          response_format: options[:response_format] || "mp3",
+          voice_settings: options[:voice_settings]
+        ) do |chunk|
           audio_chunks << chunk.audio if chunk.respond_to?(:audio)
-          @streaming_block.call(chunk) if @streaming_block
+          @streaming_block&.call(chunk)
         end
         {
@@ -445,7 +471,7 @@ module RubyLLM
         }
       end
-      # Builds options for RubyLLM.speak
+      # Builds options for SpeechClient
       def build_speak_options
         options = {
           model: resolved_model,
@@ -453,13 +479,11 @@ module RubyLLM
         }
         speed = resolved_speed
-        options[:speed] = speed if speed && speed != 1.0
+        options[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
         options[:response_format] = resolved_output_format.to_s
-        if resolved_provider == :elevenlabs
-          voice_settings = self.class.voice_settings_config
-          options[:voice_settings] = voice_settings.to_h if voice_settings
-        end
+        voice_settings = self.class.voice_settings_config
+        options[:voice_settings] = voice_settings.to_h if voice_settings
         options
       end
@@ -488,29 +512,17 @@ module RubyLLM
       # Calculates cost for speech synthesis
       def calculate_cost(raw_result)
-        characters = raw_result[:characters] || 0
-        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response].cost
+        if raw_result[:raw_response].respond_to?(:cost) && raw_result[:raw_response]&.cost
           return raw_result[:raw_response].cost
         end
-        provider = raw_result[:provider]
-        model_name = raw_result[:model].to_s
-        price_per_1k_chars = case provider
-                            when :openai
-                              model_name.include?("hd") ? 0.030 : 0.015
-                            when :elevenlabs
-                              0.30
-                            when :google
-                              0.016
-                            when :polly
-                              0.016
-                            else
-                              0.015
-                            end
+        characters = raw_result[:characters] || 0
-        (characters / 1000.0) * price_per_1k_chars
+        Audio::SpeechPricing.calculate_cost(
+          provider: raw_result[:provider],
+          model_id: raw_result[:model].to_s,
+          characters: characters
+        )
       end
       # Resolves the provider to use
@@ -547,6 +559,13 @@ module RubyLLM
       def streaming_enabled?
         @runtime_streaming || self.class.streaming?
       end
+      # Returns a SpeechClient for the resolved provider
+      def speech_client
+        @speech_client ||= Audio::SpeechClient.new(provider: resolved_provider)
+      end
     end
   end
 end
+require_relative "speaker/active_storage_support"

data/lib/ruby_llm/agents/audio/speech_client.rb ADDED Viewed

@@ -0,0 +1,328 @@
+# frozen_string_literal: true
+require "faraday"
+require "json"
+module RubyLLM
+  module Agents
+    module Audio
+      # Direct HTTP client for text-to-speech APIs.
+      #
+      # Supports OpenAI and ElevenLabs providers, bypassing the need for
+      # a RubyLLM.speak() method that does not exist in the base gem.
+      #
+      # @example OpenAI
+      #   client = SpeechClient.new(provider: :openai)
+      #   response = client.speak("Hello", model: "tts-1", voice: "nova")
+      #   response.audio  # => binary audio data
+      #
+      # @example ElevenLabs
+      #   client = SpeechClient.new(provider: :elevenlabs)
+      #   response = client.speak("Hello",
+      #     model: "eleven_v3",
+      #     voice: "Rachel",
+      #     voice_id: "21m00Tcm4TlvDq8ikWAM",
+      #     voice_settings: { stability: 0.5, similarity_boost: 0.75 }
+      #   )
+      #
+      class SpeechClient
+        SUPPORTED_PROVIDERS = %i[openai elevenlabs].freeze
+        # Value object returned by the speak and speak_streaming calls: the audio
+        # data plus the format, model and voice that were requested.
+        Response = Struct.new(:audio, :format, :model, :voice, keyword_init: true) do
+          # Always nil: this client does not compute a duration.
+          def duration
+            nil
+          end
+          # Always nil: this client does not compute a cost.
+          def cost
+            nil
+          end
+        end
+        # Wrapper around one chunk of streamed audio, yielded to the caller's block.
+        StreamChunk = Struct.new(:audio, keyword_init: true)
+        # @param provider [Symbol] :openai or :elevenlabs
+        # @raise [UnsupportedProviderError] if provider is not supported
+        def initialize(provider:)
+          # Validate before assigning so an unsupported provider is never stored.
+          validate_provider!(provider)
+          @provider = provider
+        end
+        # Synthesize speech (non-streaming)
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID (required for ElevenLabs)
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @return [Response]
+        def speak(text, model:, voice:, voice_id: nil, speed: nil,
+          response_format: "mp3", voice_settings: nil)
+          case @provider
+          when :openai
+            openai_speak(text, model: model, voice: voice_id || voice,
+              speed: speed, response_format: response_format)
+          when :elevenlabs
+            elevenlabs_speak(text, model: model, voice_id: voice_id || voice,
+              speed: speed, response_format: response_format,
+              voice_settings: voice_settings)
+          end
+        end
+        # Synthesize speech with streaming
+        #
+        # @param text [String] text to convert
+        # @param model [String] model identifier
+        # @param voice [String] voice name
+        # @param voice_id [String, nil] voice ID
+        # @param speed [Float, nil] speed multiplier
+        # @param response_format [String] output format
+        # @param voice_settings [Hash, nil] ElevenLabs voice settings
+        # @yield [StreamChunk] each audio chunk as it arrives
+        # @return [Response]
+        def speak_streaming(text, model:, voice:, voice_id: nil, speed: nil,
+          response_format: "mp3", voice_settings: nil, &block)
+          case @provider
+          when :openai
+            openai_speak_streaming(text, model: model, voice: voice_id || voice,
+                                   speed: speed, response_format: response_format,
+              &block)
+          when :elevenlabs
+            elevenlabs_speak_streaming(text, model: model,
+                                       voice_id: voice_id || voice,
+                                       speed: speed,
+                                       response_format: response_format,
+                                       voice_settings: voice_settings, &block)
+          end
+        end
+        private
+        # ============================================================
+        # Provider validation
+        # ============================================================
+        def validate_provider!(provider)
+          return if SUPPORTED_PROVIDERS.include?(provider)
+          raise UnsupportedProviderError.new(
+            "Provider :#{provider} is not yet supported for text-to-speech. " \
+            "Supported providers: #{SUPPORTED_PROVIDERS.map { |p| ":#{p}" }.join(", ")}.",
+            provider: provider
+          )
+        end
+        # ============================================================
+        # OpenAI implementation
+        # ============================================================
+        def openai_speak(text, model:, voice:, speed:, response_format:)
+          body = openai_request_body(text, model: model, voice: voice,
+            speed: speed, response_format: response_format)
+          response = openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+          end
+          handle_error_response!(response) unless response.success?
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+        def openai_speak_streaming(text, model:, voice:, speed:,
+          response_format:, &block)
+          body = openai_request_body(text, model: model, voice: voice,
+            speed: speed, response_format: response_format)
+          chunks = []
+          openai_connection.post("/v1/audio/speech") do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice
+          )
+        end
+        def openai_request_body(text, model:, voice:, speed:, response_format:)
+          body = {
+            model: model,
+            input: text,
+            voice: voice,
+            response_format: response_format.to_s
+          }
+          body[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body
+        end
+        # Memoized Faraday connection to the OpenAI API base URL, sending the
+        # configured API key as a Bearer token.
+        def openai_connection
+          @openai_connection ||= Faraday.new(url: openai_api_base) do |f|
+            # The key is read once, when the connection is first built.
+            f.headers["Authorization"] = "Bearer #{openai_api_key}"
+            f.adapter Faraday.default_adapter
+            # Faraday request options: overall timeout and connection-open timeout.
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+        def openai_api_key
+          key = RubyLLM.config.openai_api_key
+          unless key
+            raise ConfigurationError,
+              "OpenAI API key is required for text-to-speech. " \
+              "Set it via: RubyLLM.configure { |c| c.openai_api_key = 'sk-...' }"
+          end
+          key
+        end
+        def openai_api_base
+          base = RubyLLM.config.openai_api_base
+          (base && !base.empty?) ? base : "https://api.openai.com"
+        end
+        # ============================================================
+        # ElevenLabs implementation
+        # ============================================================
+        def elevenlabs_speak(text, model:, voice_id:, speed:,
+          response_format:, voice_settings:)
+          path = "/v1/text-to-speech/#{voice_id}"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+            voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+          response = elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+          end
+          handle_error_response!(response) unless response.success?
+          Response.new(
+            audio: response.body,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+        def elevenlabs_speak_streaming(text, model:, voice_id:, speed:,
+          response_format:, voice_settings:, &block)
+          path = "/v1/text-to-speech/#{voice_id}/stream"
+          body = elevenlabs_request_body(text, model: model, speed: speed,
+            voice_settings: voice_settings)
+          format_param = elevenlabs_output_format(response_format)
+          chunks = []
+          elevenlabs_connection.post(path) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.params["output_format"] = format_param
+            req.body = body.to_json
+            req.options.on_data = proc do |chunk, _size, env|
+              if env.status == 200
+                chunk_obj = StreamChunk.new(audio: chunk)
+                chunks << chunk
+                block&.call(chunk_obj)
+              end
+            end
+          end
+          Response.new(
+            audio: chunks.join,
+            format: response_format.to_sym,
+            model: model,
+            voice: voice_id
+          )
+        end
+        def elevenlabs_request_body(text, model:, speed:, voice_settings:)
+          body = {
+            text: text,
+            model_id: model
+          }
+          vs = voice_settings&.dup || {}
+          vs[:speed] = speed if speed && (speed - 1.0).abs > Float::EPSILON
+          body[:voice_settings] = vs unless vs.empty?
+          body
+        end
+        ELEVENLABS_FORMAT_MAP = {
+          "mp3" => "mp3_44100_128",
+          "pcm" => "pcm_44100",
+          "ulaw" => "ulaw_8000"
+        }.freeze
+        def elevenlabs_output_format(format)
+          ELEVENLABS_FORMAT_MAP[format.to_s] || "mp3_44100_128"
+        end
+        # Memoized Faraday connection to the ElevenLabs API base URL, sending the
+        # configured API key in the xi-api-key header.
+        def elevenlabs_connection
+          @elevenlabs_connection ||= Faraday.new(url: elevenlabs_api_base) do |f|
+            # The key is read once, when the connection is first built.
+            f.headers["xi-api-key"] = elevenlabs_api_key
+            f.adapter Faraday.default_adapter
+            # Faraday request options: overall timeout and connection-open timeout.
+            f.options.timeout = 120
+            f.options.open_timeout = 30
+          end
+        end
+        def elevenlabs_api_key
+          key = RubyLLM::Agents.configuration.elevenlabs_api_key
+          unless key
+            raise ConfigurationError,
+              "ElevenLabs API key is required for text-to-speech. " \
+              "Set it via: RubyLLM::Agents.configure { |c| c.elevenlabs_api_key = 'xi-...' }"
+          end
+          key
+        end
+        def elevenlabs_api_base
+          base = RubyLLM::Agents.configuration.elevenlabs_api_base
+          (base && !base.empty?) ? base : "https://api.elevenlabs.io"
+        end
+        # ============================================================
+        # Shared error handling
+        # ============================================================
+        # Always raises SpeechApiError for the given response, carrying the HTTP
+        # status, the raw response body, and a message extracted by error_message_from.
+        # Callers invoke it only when the response was not successful.
+        def handle_error_response!(response)
+          raise SpeechApiError.new(
+            "TTS API request failed (HTTP #{response.status}): #{error_message_from(response)}",
+            status: response.status,
+            response_body: response.body
+          )
+        end
+        def error_message_from(response)
+          parsed = JSON.parse(response.body)
+          if parsed.is_a?(Hash)
+            parsed.dig("error", "message") || parsed["detail"] || parsed["error"] || response.body
+          else
+            response.body
+          end
+        rescue JSON::ParserError
+          response.body.to_s[0, 200]
+        end
+      end
+    end
+  end
+end