RubyGems - ruby_llm-agents - Versions diffs - 0.4.0 → 1.0.0.beta.1 - Mend

ruby_llm-agents 0.4.0 → 1.0.0.beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (208) hide show

data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb ADDED Viewed

@@ -0,0 +1,415 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Agents
+    module Pipeline
+      module Middleware
+        # Times execution and records results for observability.
+        #
+        # This middleware provides:
+        # - Execution timing (start/end timestamps, duration)
+        # - Success/failure recording to database
+        # - Token usage and cost tracking
+        # - Error details on failure
+        #
+        # Recording can be async (via background job) or sync depending
+        # on configuration.
+        #
+        # Tracking is enabled/disabled per agent type via configuration:
+        # - track_executions (conversation agents)
+        # - track_embeddings
+        # - track_moderation
+        # - track_image_generation
+        # - track_audio
+        #
+        # @example Configuration
+        #   RubyLLM::Agents.configure do |config|
+        #     config.track_executions = true
+        #     config.track_embeddings = true
+        #     config.async_logging = true  # Use background job
+        #   end
+        #
+        class Instrumentation < Base
+          # Process instrumentation
+          #
+          # Creates a "running" execution record at the start so executions
+          # appear on the dashboard immediately, then updates it when complete.
+          #
+          # @param context [Context] The execution context
+          # @return [Context] The context with timing info
+          def call(context)
+            context.started_at = Time.current
+            # Create "running" record immediately (SYNC - must appear on dashboard)
+            execution = create_running_execution(context)
+            context.execution_id = execution&.id
+            status_update_completed = false
+            raised_exception = nil
+            begin
+              @app.call(context)
+              context.completed_at = Time.current
+              begin
+                complete_execution(execution, context, status: "success")
+                status_update_completed = true
+              rescue StandardError
+                # Let ensure block handle via mark_execution_failed!
+              end
+            rescue StandardError => e
+              context.completed_at = Time.current
+              context.error = e
+              raised_exception = e
+              begin
+                complete_execution(execution, context, status: determine_error_status(e))
+                status_update_completed = true
+              rescue StandardError
+                # Let ensure block handle via mark_execution_failed!
+              end
+              raise
+            ensure
+              # Emergency fallback if update failed
+              mark_execution_failed!(execution, error: raised_exception || $!) unless status_update_completed
+            end
+            context
+          end
+          private
+          # Creates initial execution record with 'running' status
+          #
+          # Creates the record synchronously so it appears on the dashboard immediately.
+          # Returns nil on failure to avoid breaking the actual execution.
+          #
+          # @param context [Context] The execution context
+          # @return [Execution, nil] The created record, or nil on failure
+          def create_running_execution(context)
+            return nil unless tracking_enabled?(context)
+            return nil unless execution_model_available?
+            return nil if context.cached? && !track_cache_hits?
+            data = build_running_execution_data(context)
+            Execution.create!(data)
+          rescue StandardError => e
+            error("Failed to create running execution record: #{e.message}")
+            nil
+          end
+          # Updates execution record with completion data
+          #
+          # Updates the existing record with final status, duration, and metrics.
+          # Falls back to creating a new record if the initial record is nil.
+          # Errors are re-raised to allow the ensure block to handle them.
+          #
+          # @param execution [Execution, nil] The execution record to update
+          # @param context [Context] The execution context
+          # @param status [String] Final status ("success", "error", "timeout")
+          # @raise [StandardError] Re-raises any errors for ensure block to handle
+          def complete_execution(execution, context, status:)
+            return unless tracking_enabled?(context)
+            return if context.cached? && !track_cache_hits?
+            return unless execution_model_available?
+            # No "running" row exists, so write a complete record through the
+            # legacy create path instead of updating one.
+            unless execution
+              persist_execution(context, status: status)
+              return
+            end
+            # The update is always synchronous, regardless of async_logging, so
+            # the dashboard shows the final status as soon as the call returns.
+            # (The previous async/sync branches were identical.)
+            execution.update!(build_completion_data(context, status))
+          rescue StandardError => e
+            error("Failed to complete execution record: #{e.message}")
+            raise # Re-raise so the caller can fall back to mark_execution_failed!
+          end
+          # Emergency fallback to mark execution as failed
+          #
+          # Uses update_all to bypass ActiveRecord callbacks and validations,
+          # ensuring the status is updated even if the model is in an invalid state.
+          # Only updates records that are still in 'running' status.
+          #
+          # @param execution [Execution, nil] The execution record
+          # @param error [Exception, nil] The exception that caused the failure
+          def mark_execution_failed!(execution, error: nil)
+            return unless execution&.id
+            return unless execution.status == "running"
+            error_message = error ? "#{error.class}: #{error.message}".truncate(1000) : "Unknown error"
+            update_data = {
+              status: "error",
+              completed_at: Time.current,
+              error_class: error&.class&.name || "UnknownError",
+              error_message: error_message
+            }
+            execution.class.where(id: execution.id, status: "running").update_all(update_data)
+          rescue StandardError => e
+            error("CRITICAL: Failed emergency status update for execution #{execution&.id}: #{e.message}")
+          end
+          # Determines the status based on error type
+          #
+          # @param error [Exception] The exception that occurred
+          # @return [String] The determined status ("timeout" or "error")
+          def determine_error_status(error)
+            error.is_a?(Timeout::Error) ? "timeout" : "error"
+          end
+          # Builds data for initial running execution record
+          #
+          # @param context [Context] The execution context
+          # @return [Hash] Execution data for creating running record
+          def build_running_execution_data(context)
+            data = {
+              agent_type: context.agent_class&.name,
+              agent_version: config(:version, "1.0"),
+              model_id: context.model,
+              status: "running",
+              started_at: context.started_at,
+              input_tokens: 0,
+              output_tokens: 0,
+              total_cost: 0,
+              attempts_count: context.attempts_made
+            }
+            # Add tenant_id only if multi-tenancy is enabled and tenant is set
+            if global_config.multi_tenancy_enabled? && context.tenant_id.present?
+              data[:tenant_id] = context.tenant_id
+            end
+            # Add sanitized parameters
+            data[:parameters] = sanitize_parameters(context)
+            data
+          end
+          # Builds data for completing an execution record
+          #
+          # @param context [Context] The execution context
+          # @param status [String] Final status ("success", "error", "timeout")
+          # @return [Hash] Update data for completing the record
+          def build_completion_data(context, status)
+            data = {
+              status: status,
+              completed_at: context.completed_at,
+              duration_ms: context.duration_ms,
+              cache_hit: context.cached?,
+              input_tokens: context.input_tokens || 0,
+              output_tokens: context.output_tokens || 0,
+              total_cost: context.total_cost || 0,
+              attempts_count: context.attempts_made
+            }
+            # Add cache key for cache hit executions
+            if context.cached? && context[:cache_key]
+              data[:response_cache_key] = context[:cache_key]
+            end
+            # Add error details if present
+            if context.error
+              data[:error_class] = context.error.class.name
+              data[:error_message] = truncate_error_message(context.error.message)
+            end
+            # Add custom metadata
+            data[:metadata] = context.metadata if context.metadata.any?
+            data
+          end
+          # Persists execution data to database (legacy fallback)
+          #
+          # Used when initial running record creation failed.
+          #
+          # @param context [Context] The execution context
+          # @param status [String] Final status ("success", "error", or "timeout")
+          def persist_execution(context, status:)
+            return unless execution_model_available?
+            data = build_execution_data(context, status)
+            if async_logging?
+              queue_async_logging(data)
+            else
+              create_execution_record(data)
+            end
+          rescue StandardError => e
+            error("Failed to record execution: #{e.message}")
+          end
+          # Builds execution data hash
+          #
+          # @param context [Context] The execution context
+          # @param status [String] Final status ("success", "error", or "timeout")
+          # @return [Hash] Execution data
+          def build_execution_data(context, status)
+            data = {
+              agent_type: context.agent_class&.name,
+              agent_version: config(:version, "1.0"),
+              model_id: context.model,
+              status: determine_status(context, status),
+              duration_ms: context.duration_ms,
+              started_at: context.started_at,
+              completed_at: context.completed_at,
+              cache_hit: context.cached?,
+              input_tokens: context.input_tokens || 0,
+              output_tokens: context.output_tokens || 0,
+              total_cost: context.total_cost || 0,
+              attempts_count: context.attempts_made
+            }
+            # Add tenant_id only if multi-tenancy is enabled and tenant is set
+            if global_config.multi_tenancy_enabled? && context.tenant_id.present?
+              data[:tenant_id] = context.tenant_id
+            end
+            # Add cache key for cache hit executions
+            if context.cached? && context[:cache_key]
+              data[:response_cache_key] = context[:cache_key]
+            end
+            # Add error details if present
+            if context.error
+              data[:error_class] = context.error.class.name
+              data[:error_message] = truncate_error_message(context.error.message)
+            end
+            # Add custom metadata
+            data[:metadata] = context.metadata if context.metadata.any?
+            # Add sanitized parameters
+            data[:parameters] = sanitize_parameters(context)
+            data
+          end
+          # Determines the status based on context and error type
+          #
+          # @param context [Context] The execution context
+          # @param base_status [String] The base status ("success" or "error")
+          # @return [String] The determined status
+          def determine_status(context, base_status)
+            return base_status if base_status == "success"
+            # Check for timeout errors
+            if context.error.is_a?(Timeout::Error)
+              "timeout"
+            else
+              base_status
+            end
+          end
+          # Sanitizes parameters for storage, redacting sensitive values
+          #
+          # @param context [Context] The execution context
+          # @return [Hash] Sanitized parameters
+          def sanitize_parameters(context)
+            return {} unless context.agent_instance.respond_to?(:options, true)
+            params = context.agent_instance.send(:options) rescue {}
+            params = params.dup
+            params.transform_keys!(&:to_s)
+            SENSITIVE_KEYS.each do |key|
+              params[key] = "[REDACTED]" if params.key?(key)
+            end
+            params
+          end
+          # Sensitive parameter keys that should be redacted
+          SENSITIVE_KEYS = %w[
+            password token api_key secret credential auth key
+            access_token refresh_token private_key secret_key
+          ].freeze
+          # Truncates error message to prevent database issues
+          #
+          # @param message [String] The error message
+          # @return [String] Truncated message
+          def truncate_error_message(message)
+            return "" if message.nil?
+            message.to_s.truncate(1000)
+          rescue StandardError
+            message.to_s[0, 1000]
+          end
+          # Queues async logging via background job
+          #
+          # @param data [Hash] Execution data
+          def queue_async_logging(data)
+            # Hands the data hash to the logger job via perform_later.
+            # NOTE(review): presumably an ActiveJob-style enqueue, so `data`
+            # would need to be serializable — confirm against the job class.
+            Infrastructure::ExecutionLoggerJob.perform_later(data)
+          end
+          # Creates execution record synchronously
+          #
+          # @param data [Hash] Execution data
+          def create_execution_record(data)
+            # No rescue here: any error from create! propagates to the caller
+            # (persist_execution rescues StandardError and logs it).
+            Execution.create!(data)
+          end
+          # Returns whether tracking is enabled for this agent type
+          #
+          # @param context [Context] The execution context
+          # @return [Boolean]
+          def tracking_enabled?(context)
+            cfg = global_config
+            case context.agent_type
+            when :embedding
+              cfg.track_embeddings
+            when :moderation
+              cfg.track_moderation
+            when :image
+              cfg.track_image_generation
+            when :audio
+              cfg.track_audio
+            else
+              cfg.track_executions
+            end
+          rescue StandardError
+            false
+          end
+          # Returns whether to track cache hits
+          #
+          # @return [Boolean]
+          def track_cache_hits?
+            global_config.respond_to?(:track_cache_hits) && global_config.track_cache_hits
+          rescue StandardError
+            false
+          end
+          # Returns whether async logging is enabled
+          #
+          # @return [Boolean]
+          def async_logging?
+            global_config.async_logging && defined?(Infrastructure::ExecutionLoggerJob)
+          rescue StandardError
+            false
+          end
+          # Returns whether the Execution model is available
+          #
+          # @return [Boolean]
+          def execution_model_available?
+            defined?(RubyLLM::Agents::Execution)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb ADDED Viewed

@@ -0,0 +1,276 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Agents
+    module Pipeline
+      module Middleware
+        # Handles retries, fallbacks, and circuit breakers.
+        #
+        # This middleware provides reliability features for agent executions:
+        # - Retries with configurable backoff (constant or exponential)
+        # - Model fallbacks when primary model fails
+        # - Circuit breaker protection per model
+        # - Total timeout across all attempts
+        #
+        # Reliability is enabled via the agent's reliability DSL:
+        #   class MyAgent < ApplicationAgent
+        #     reliability do
+        #       retries max: 3, backoff: :exponential
+        #       fallback_models "gpt-4o-mini"
+        #       total_timeout 30
+        #       circuit_breaker errors: 5, within: 60
+        #     end
+        #   end
+        #
+        # @example Simple retry
+        #   class MyEmbedder < RubyLLM::Agents::Embedder
+        #     model "text-embedding-3-small"
+        #     reliability do
+        #       retries max: 2
+        #     end
+        #   end
+        #
+        class Reliability < Base
+          # Process with reliability features
+          #
+          # @param context [Context] The execution context
+          # @return [Context] The context after execution
+          # @raise [Agents::Reliability::AllModelsExhaustedError] If all models fail
+          # @raise [TotalTimeoutError] If total timeout exceeded
+          # @raise [CircuitOpenError] If circuit breaker is open for all models
+          def call(context)
+            return @app.call(context) unless reliability_enabled?
+            config = reliability_config
+            models_to_try = build_models_list(context, config)
+            total_deadline = calculate_deadline(config)
+            execute_with_reliability(context, models_to_try, config, total_deadline)
+          end
+          private
+          # Returns whether reliability is enabled for this agent
+          #
+          # @return [Boolean]
+          def reliability_enabled?
+            @agent_class&.respond_to?(:reliability_config) &&
+              @agent_class.reliability_config.present?
+          end
+          # Returns the reliability configuration from the agent class
+          #
+          # @return [Hash] The reliability configuration
+          def reliability_config
+            @agent_class.reliability_config || {}
+          end
+          # Builds the list of models to try
+          #
+          # @param context [Context] The execution context
+          # @param config [Hash] The reliability configuration
+          # @return [Array<String>] List of models
+          def build_models_list(context, config)
+            primary = context.model || @agent_class&.model
+            fallbacks = config[:fallback_models] || []
+            [primary, *fallbacks].compact.uniq
+          end
+          # Calculates the total deadline for all attempts
+          #
+          # @param config [Hash] The reliability configuration
+          # @return [Time, nil] The deadline or nil if no timeout
+          def calculate_deadline(config)
+            return nil unless config[:total_timeout]
+            Time.current + config[:total_timeout]
+          end
+          # Executes with retry, fallback, and circuit breaker logic
+          #
+          # @param context [Context] The execution context
+          # @param models_to_try [Array<String>] List of models to try
+          # @param config [Hash] The reliability configuration
+          # @param total_deadline [Time, nil] The total deadline
+          # @return [Context] The context after execution
+          def execute_with_reliability(context, models_to_try, config, total_deadline)
+            started_at = Time.current
+            last_error = nil
+            context.attempts_made = 0
+            models_to_try.each do |current_model|
+              # Check circuit breaker for this model
+              breaker = get_circuit_breaker(current_model, context)
+              if breaker&.open?
+                debug("Circuit breaker open for #{current_model}, skipping")
+                next
+              end
+              result = try_model_with_retries(
+                context: context,
+                model: current_model,
+                config: config,
+                total_deadline: total_deadline,
+                started_at: started_at,
+                breaker: breaker
+              )
+              return result if result
+              # Capture the last error from context for the final error
+              last_error = context.error
+            end
+            # All models exhausted
+            raise Agents::Reliability::AllModelsExhaustedError.new(models_to_try, last_error)
+          end
+          # Tries a model with retry logic
+          #
+          # @param context [Context] The execution context
+          # @param model [String] The model to try
+          # @param config [Hash] The reliability configuration
+          # @param total_deadline [Time, nil] The total deadline
+          # @param started_at [Time] When execution started
+          # @param breaker [CircuitBreaker, nil] The circuit breaker for this model
+          # @return [Context, nil] The context if successful, nil to try next model
+          def try_model_with_retries(context:, model:, config:, total_deadline:, started_at:, breaker:)
+            retries_config = config[:retries] || {}
+            max_retries = retries_config[:max] || 0
+            attempt_index = 0
+            loop do
+              # Raises if the overall time budget is already spent.
+              check_total_timeout!(total_deadline, started_at)
+              context.attempt = attempt_index + 1
+              context.attempts_made += 1
+              original_model = context.model
+              # Local flag for "this attempt raised". context.error cannot be
+              # used for that because it may hold an error from an earlier attempt.
+              attempt_failed = false
+              begin
+                # Override the model for this attempt
+                context.model = model
+                # Clear the error left by a previous failed attempt or model so a
+                # later success is not reported with stale error details.
+                context.error = nil
+                @app.call(context)
+                # Success - record in circuit breaker
+                breaker&.record_success!
+                return context
+              rescue StandardError => e
+                attempt_failed = true
+                context.error = e
+                breaker&.record_failure!
+                if should_retry?(e, config, attempt_index, max_retries, total_deadline)
+                  attempt_index += 1
+                  delay = calculate_backoff(retries_config, attempt_index)
+                  async_aware_sleep(delay)
+                else
+                  # Out of retries (or not retryable): let the caller try the next model.
+                  return nil
+                end
+              ensure
+                # Restore the original model only when this attempt failed; on
+                # success context.model keeps the model that actually answered.
+                context.model = original_model if attempt_failed
+              end
+            end
+          end
+          # Checks if we've exceeded the total timeout
+          #
+          # @param deadline [Time, nil] The deadline
+          # @param started_at [Time] When execution started
+          # @raise [TotalTimeoutError] If timeout exceeded
+          def check_total_timeout!(deadline, started_at)
+            return unless deadline && Time.current > deadline
+            elapsed = Time.current - started_at
+            timeout_value = deadline - started_at + elapsed
+            raise Agents::Reliability::TotalTimeoutError.new(timeout_value, elapsed)
+          end
+          # Determines if we should retry the error
+          #
+          # @param error [Exception] The error that occurred
+          # @param config [Hash] The reliability configuration
+          # @param attempt_index [Integer] Current attempt index
+          # @param max_retries [Integer] Maximum retries allowed
+          # @param total_deadline [Time, nil] The total deadline
+          # @return [Boolean] Whether to retry
+          def should_retry?(error, config, attempt_index, max_retries, total_deadline)
+            if attempt_index >= max_retries
+              # Retry budget for this model is used up.
+              false
+            elsif total_deadline && Time.current > total_deadline
+              # Overall deadline has passed; no point in another attempt.
+              false
+            else
+              retryable_error?(error, config)
+            end
+          end
+          # Checks if an error is retryable
+          #
+          # @param error [Exception] The error to check
+          # @param config [Hash] The reliability configuration
+          # @return [Boolean] Whether the error is retryable
+          def retryable_error?(error, config)
+            # Agent-specific error classes and patterns are passed through to the
+            # shared retryable check.
+            Agents::Reliability.retryable_error?(
+              error,
+              custom_errors: config.dig(:retries, :on) || [],
+              custom_patterns: config[:retryable_patterns]
+            )
+          end
+          # Calculates the backoff delay
+          #
+          # @param retries_config [Hash] The retries configuration
+          # @param attempt_index [Integer] The current attempt index
+          # @return [Float] The delay in seconds
+          def calculate_backoff(retries_config, attempt_index)
+            Agents::Reliability.calculate_backoff(
+              strategy: retries_config[:backoff] || :exponential,
+              base: retries_config[:base] || 0.4,
+              max_delay: retries_config[:max_delay] || 3.0,
+              attempt: attempt_index
+            )
+          end
+          # Gets or creates a circuit breaker for a model
+          #
+          # @param model_id [String] The model identifier
+          # @param context [Context] The execution context
+          # @return [CircuitBreaker, nil] The circuit breaker or nil
+          def get_circuit_breaker(model_id, context)
+            cb_config = reliability_config[:circuit_breaker]
+            return nil unless cb_config
+            CircuitBreaker.from_config(
+              @agent_class&.name,
+              model_id,
+              cb_config,
+              tenant_id: context.tenant_id
+            )
+          end
+          # Sleeps without blocking other fibers when in async context
+          #
+          # @param seconds [Numeric] Duration to sleep
+          # @return [void]
+          def async_aware_sleep(seconds)
+            config = global_config
+            if config.respond_to?(:async_context?) && config.async_context?
+              ::Async::Task.current.sleep(seconds)
+            else
+              sleep(seconds)
+            end
+          rescue StandardError
+            # Fall back to regular sleep if async detection fails
+            sleep(seconds)
+          end
+        end
+      end
+    end
+  end
+end