RubyGems - ruby_llm-agents - Versions diffs - 0.2.4 → 0.3.0 - Mend

ruby_llm-agents 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

checksums.yaml +4 -4
data/README.md +273 -0
data/app/channels/ruby_llm/agents/executions_channel.rb +24 -1
data/app/controllers/concerns/ruby_llm/agents/filterable.rb +81 -0
data/app/controllers/concerns/ruby_llm/agents/paginatable.rb +51 -0
data/app/controllers/ruby_llm/agents/agents_controller.rb +228 -59
data/app/controllers/ruby_llm/agents/dashboard_controller.rb +167 -12
data/app/controllers/ruby_llm/agents/executions_controller.rb +189 -31
data/app/controllers/ruby_llm/agents/settings_controller.rb +20 -0
data/app/helpers/ruby_llm/agents/application_helper.rb +307 -7
data/app/models/ruby_llm/agents/execution/analytics.rb +224 -20
data/app/models/ruby_llm/agents/execution/metrics.rb +41 -25
data/app/models/ruby_llm/agents/execution/scopes.rb +234 -14
data/app/models/ruby_llm/agents/execution.rb +259 -16
data/app/services/ruby_llm/agents/agent_registry.rb +49 -12
data/app/views/layouts/rubyllm/agents/application.html.erb +351 -85
data/app/views/rubyllm/agents/agents/_version_comparison.html.erb +186 -0
data/app/views/rubyllm/agents/agents/show.html.erb +233 -10
data/app/views/rubyllm/agents/dashboard/_action_center.html.erb +62 -0
data/app/views/rubyllm/agents/dashboard/_alerts_feed.html.erb +62 -0
data/app/views/rubyllm/agents/dashboard/_breaker_strip.html.erb +47 -0
data/app/views/rubyllm/agents/dashboard/_budgets_bar.html.erb +165 -0
data/app/views/rubyllm/agents/dashboard/_now_strip.html.erb +10 -0
data/app/views/rubyllm/agents/dashboard/_now_strip_values.html.erb +71 -0
data/app/views/rubyllm/agents/dashboard/index.html.erb +215 -109
data/app/views/rubyllm/agents/executions/_filters.html.erb +152 -155
data/app/views/rubyllm/agents/executions/_list.html.erb +103 -12
data/app/views/rubyllm/agents/executions/dry_run.html.erb +149 -0
data/app/views/rubyllm/agents/executions/index.html.erb +17 -72
data/app/views/rubyllm/agents/executions/index.turbo_stream.erb +16 -2
data/app/views/rubyllm/agents/executions/show.html.erb +693 -14
data/app/views/rubyllm/agents/settings/show.html.erb +369 -0
data/app/views/rubyllm/agents/shared/_filter_dropdown.html.erb +121 -0
data/app/views/rubyllm/agents/shared/_select_dropdown.html.erb +85 -0
data/config/routes.rb +7 -0
data/lib/generators/ruby_llm_agents/templates/add_attempts_migration.rb.tt +27 -0
data/lib/generators/ruby_llm_agents/templates/add_caching_migration.rb.tt +23 -0
data/lib/generators/ruby_llm_agents/templates/add_finish_reason_migration.rb.tt +19 -0
data/lib/generators/ruby_llm_agents/templates/add_routing_migration.rb.tt +19 -0
data/lib/generators/ruby_llm_agents/templates/add_streaming_migration.rb.tt +8 -0
data/lib/generators/ruby_llm_agents/templates/add_tracing_migration.rb.tt +34 -0
data/lib/generators/ruby_llm_agents/templates/agent.rb.tt +66 -4
data/lib/generators/ruby_llm_agents/templates/application_agent.rb.tt +53 -6
data/lib/generators/ruby_llm_agents/templates/initializer.rb.tt +139 -8
data/lib/generators/ruby_llm_agents/templates/migration.rb.tt +38 -1
data/lib/generators/ruby_llm_agents/upgrade_generator.rb +78 -0
data/lib/ruby_llm/agents/alert_manager.rb +207 -0
data/lib/ruby_llm/agents/attempt_tracker.rb +295 -0
data/lib/ruby_llm/agents/base.rb +580 -112
data/lib/ruby_llm/agents/budget_tracker.rb +360 -0
data/lib/ruby_llm/agents/circuit_breaker.rb +197 -0
data/lib/ruby_llm/agents/configuration.rb +279 -1
data/lib/ruby_llm/agents/engine.rb +58 -6
data/lib/ruby_llm/agents/execution_logger_job.rb +17 -6
data/lib/ruby_llm/agents/inflections.rb +13 -2
data/lib/ruby_llm/agents/instrumentation.rb +538 -87
data/lib/ruby_llm/agents/redactor.rb +130 -0
data/lib/ruby_llm/agents/reliability.rb +185 -0
data/lib/ruby_llm/agents/version.rb +3 -1
data/lib/ruby_llm/agents.rb +52 -0
metadata +41 -2
data/app/controllers/ruby_llm/agents/application_controller.rb +0 -37

data/lib/ruby_llm/agents/instrumentation.rb (changed)

@@ -4,36 +4,148 @@ module RubyLLM
   module Agents
     # Instrumentation concern for tracking agent executions
     #
-    # Provides execution timing, token tracking, cost calculation, and error handling.
-    # Logs all executions to the database via a background job.
+    # Provides comprehensive execution tracking including:
+    # - Timing metrics (started_at, completed_at, duration_ms)
+    # - Token usage tracking (input, output, cached)
+    # - Cost calculation via RubyLLM pricing data
+    # - Error and timeout handling with status tracking
+    # - Safe parameter sanitization for logging
     #
-    # == Usage
+    # Included automatically in {RubyLLM::Agents::Base}.
     #
-    # Included automatically in RubyLLM::Agents::Base
-    #
-    # == Customization
-    #
-    # Override `execution_metadata` in your agent to add custom data:
-    #
-    #   def execution_metadata
-    #     { query: query, user_id: Current.user&.id }
+    # @example Adding custom metadata to executions
+    #   class MyAgent < ApplicationAgent
+    #     def execution_metadata
+    #       { user_id: Current.user&.id, request_id: request.uuid }
+    #     end
     #   end
     #
+    # @see RubyLLM::Agents::Execution
+    # @see RubyLLM::Agents::ExecutionLoggerJob
+    # @api private
     module Instrumentation
       extend ActiveSupport::Concern
       included do
+        # @!attribute [rw] execution_id
+        #   The ID of the current execution record
+        #   @return [Integer, nil]
         attr_accessor :execution_id
       end
-      # Wrap agent execution with metrics tracking
-      # Creates execution record at start with 'running' status, updates on completion
-      # Uses ensure block to guarantee status is updated even if complete_execution fails
+      # Wraps agent execution with comprehensive metrics tracking (for reliability-enabled agents)
+      #
+      # Creates a single execution record and tracks multiple attempts within it.
+      # Used by execute_with_reliability for retry/fallback scenarios.
+      #
+      # Uses catch/throw pattern because the yielded block uses `throw :execution_success`
+      # to exit early on success. Regular `return` from within a block would bypass
+      # our completion code, so we use throw/catch to properly intercept success cases.
+      #
+      # @param models_to_try [Array<String>] List of models in the fallback chain
+      # @yield [AttemptTracker] Block receives attempt tracker for recording attempts
+      # @return [Object] The result from the yielded block
+      # @raise [Timeout::Error] Re-raised after logging timeout status
+      # @raise [StandardError] Re-raised after logging error status
+      def instrument_execution_with_attempts(models_to_try:, &block)
+        started_at = Time.current
+        @last_response = nil
+        @status_update_completed = false
+        raised_exception = nil
+        completion_error = nil
+        attempt_tracker = AttemptTracker.new
+        # Create execution record with running status and fallback chain
+        execution = create_running_execution(started_at, fallback_chain: models_to_try)
+        self.execution_id = execution&.id
+        # Use catch to intercept successful early returns from the block
+        # The block uses `throw :execution_success, result` instead of `return`
+        result = catch(:execution_success) do
+          begin
+            yield(attempt_tracker)
+            # If we reach here normally (no throw), the block completed without success
+            # This happens when AllModelsExhaustedError is raised
+            nil
+          rescue Timeout::Error, Reliability::TotalTimeoutError => e
+            raised_exception = e
+            begin
+              complete_execution_with_attempts(
+                execution,
+                attempt_tracker: attempt_tracker,
+                completed_at: Time.current,
+                status: "timeout",
+                error: e
+              )
+              @status_update_completed = true
+            rescue StandardError => completion_err
+              completion_error = completion_err
+            end
+            raise
+          rescue StandardError => e
+            raised_exception = e
+            begin
+              complete_execution_with_attempts(
+                execution,
+                attempt_tracker: attempt_tracker,
+                completed_at: Time.current,
+                status: "error",
+                error: e
+              )
+              @status_update_completed = true
+            rescue StandardError => completion_err
+              completion_error = completion_err
+            end
+            raise
+          ensure
+            # Only run emergency fallback if we haven't completed AND we're not in success path
+            # The success path completion happens AFTER the catch block
+            unless @status_update_completed || !$!
+              actual_error = completion_error || raised_exception || $!
+              mark_execution_failed!(execution, error: actual_error)
+            end
+          end
+        end
+        # If we caught a successful throw, complete the execution properly
+        # result will be non-nil if throw :execution_success was called
+        if result && !@status_update_completed
+          begin
+            complete_execution_with_attempts(
+              execution,
+              attempt_tracker: attempt_tracker,
+              completed_at: Time.current,
+              status: "success"
+            )
+            @status_update_completed = true
+          rescue StandardError => e
+            Rails.logger.error("[RubyLLM::Agents] Failed to complete successful execution: #{e.class}: #{e.message}")
+            mark_execution_failed!(execution, error: e)
+          end
+        end
+        result
+      end
+      # Wraps agent execution with comprehensive metrics tracking
+      #
+      # Execution lifecycle:
+      # 1. Creates execution record immediately with 'running' status
+      # 2. Yields to the block for actual agent execution
+      # 3. Updates record with final status and metrics
+      # 4. Uses ensure block to guarantee status update even on failures
+      #
+      # @yield The block containing the actual agent execution
+      # @return [Object] The result from the yielded block
+      # @raise [Timeout::Error] Re-raised after logging timeout status
+      # @raise [StandardError] Re-raised after logging error status
       def instrument_execution(&block)
         started_at = Time.current
         @last_response = nil
-        @execution_status_updated = false
-        original_error = nil
+        @status_update_completed = false
+        raised_exception = nil
+        completion_error = nil
         # Create execution record immediately with running status
         execution = create_running_execution(started_at)
@@ -43,44 +155,69 @@ module RubyLLM
           result = yield
           # Update to success
-          complete_execution(
-            execution,
-            completed_at: Time.current,
-            status: "success",
-            response: @last_response
-          )
-          @execution_status_updated = true
+          # NOTE: If this fails, we capture the error but DON'T re-raise
+          # The ensure block will handle it via mark_execution_failed!
+          begin
+            complete_execution(
+              execution,
+              completed_at: Time.current,
+              status: "success",
+              response: @last_response
+            )
+            @status_update_completed = true
+          rescue StandardError => e
+            completion_error = e
+            # Don't re-raise - let ensure block handle via mark_execution_failed!
+          end
           result
         rescue Timeout::Error => e
-          original_error = e
-          complete_execution(
-            execution,
-            completed_at: Time.current,
-            status: "timeout",
-            error: e
-          )
-          @execution_status_updated = true
+          raised_exception = e
+          begin
+            complete_execution(
+              execution,
+              completed_at: Time.current,
+              status: "timeout",
+              error: e
+            )
+            @status_update_completed = true
+          rescue StandardError => completion_err
+            completion_error = completion_err
+          end
           raise
-        rescue => e
-          original_error = e
-          complete_execution(
-            execution,
-            completed_at: Time.current,
-            status: "error",
-            error: e
-          )
-          @execution_status_updated = true
+        rescue StandardError => e
+          raised_exception = e
+          begin
+            complete_execution(
+              execution,
+              completed_at: Time.current,
+              status: "error",
+              error: e
+            )
+            @status_update_completed = true
+          rescue StandardError => completion_err
+            completion_error = completion_err
+          end
           raise
         ensure
-          # Guarantee execution is marked as error if complete_execution failed
-          unless @execution_status_updated
-            mark_execution_failed!(execution, error: original_error)
+          # Emergency fallback: mark as error if complete_execution itself failed
+          # This ensures executions never remain stuck in 'running' status
+          unless @status_update_completed
+            # Prefer completion_error (from update! failure) over raised_exception (from execution)
+            # Use $! as final fallback - it holds the current exception being propagated
+            actual_error = completion_error || raised_exception || $!
+            mark_execution_failed!(execution, error: actual_error)
           end
         end
       end
-      # Store response for metrics extraction
+      # Stores the LLM response for metrics extraction
+      #
+      # Called by the agent after receiving a response from the LLM.
+      # The response is used to extract token counts and model information.
+      #
+      # @param response [RubyLLM::Message] The response from the LLM
+      # @return [RubyLLM::Message] The same response (for method chaining)
       def capture_response(response)
         @last_response = response
         response
@@ -88,8 +225,15 @@ module RubyLLM
       private
-      # Create execution record with running status at start
-      def create_running_execution(started_at)
+      # Creates initial execution record with 'running' status
+      #
+      # @param started_at [Time] When the execution started
+      # @param fallback_chain [Array<String>] Optional list of models in fallback chain
+      # @return [RubyLLM::Agents::Execution, nil] The created record, or nil on failure
+      def create_running_execution(started_at, fallback_chain: [])
+        config = RubyLLM::Agents.configuration
+        metadata = execution_metadata
         execution_data = {
           agent_type: self.class.name,
           agent_version: self.class.version,
@@ -97,20 +241,45 @@ module RubyLLM
           temperature: temperature,
           started_at: started_at,
           status: "running",
-          parameters: sanitized_parameters,
-          metadata: execution_metadata,
-          system_prompt: safe_system_prompt,
-          user_prompt: safe_user_prompt
+          parameters: redacted_parameters,
+          metadata: metadata,
+          system_prompt: config.persist_prompts ? redacted_system_prompt : nil,
+          user_prompt: config.persist_prompts ? redacted_user_prompt : nil,
+          streaming: self.class.streaming
         }
+        # Extract tracing fields from metadata if present
+        execution_data[:request_id] = metadata[:request_id] if metadata[:request_id]
+        execution_data[:trace_id] = metadata[:trace_id] if metadata[:trace_id]
+        execution_data[:span_id] = metadata[:span_id] if metadata[:span_id]
+        execution_data[:parent_execution_id] = metadata[:parent_execution_id] if metadata[:parent_execution_id]
+        execution_data[:root_execution_id] = metadata[:root_execution_id] if metadata[:root_execution_id]
+        # Add fallback chain if provided (for reliability-enabled executions)
+        if fallback_chain.any?
+          execution_data[:fallback_chain] = fallback_chain
+          execution_data[:attempts] = []
+          execution_data[:attempts_count] = 0
+        end
         RubyLLM::Agents::Execution.create!(execution_data)
       rescue StandardError => e
-        # Log error but don't fail the execution
+        # Log error but don't fail the agent execution itself
         Rails.logger.error("[RubyLLM::Agents] Failed to create execution record: #{e.message}")
         nil
       end
-      # Update execution record on completion
+      # Updates execution record with completion data
+      #
+      # Calculates duration, extracts response metrics, and saves final status.
+      # Falls back to legacy logging if the initial execution record is nil.
+      #
+      # @param execution [Execution, nil] The execution record to update
+      # @param completed_at [Time] When the execution completed
+      # @param status [String] Final status ("success", "error", "timeout")
+      # @param response [RubyLLM::Message, nil] The LLM response (if successful)
+      # @param error [Exception, nil] The exception (if failed)
+      # @return [void]
       def complete_execution(execution, completed_at:, status:, response: nil, error: nil)
         return legacy_log_execution(completed_at: completed_at, status: status, response: response, error: error) unless execution
@@ -123,6 +292,9 @@ module RubyLLM
           status: status
         }
+        # Add streaming metrics if available
+        update_data[:time_to_first_token_ms] = time_to_first_token_ms if respond_to?(:time_to_first_token_ms) && time_to_first_token_ms
         # Add response data if available (using safe extraction)
         response_data = safe_extract_response_data(response)
         if response_data.any?
@@ -149,12 +321,113 @@ module RubyLLM
             Rails.logger.warn("[RubyLLM::Agents] Cost calculation failed: #{cost_error.message}")
           end
         end
+      rescue ActiveRecord::RecordInvalid => e
+        Rails.logger.error("[RubyLLM::Agents] Validation failed for execution #{execution&.id}: #{e.record.errors.full_messages.join(', ')}")
+        if Rails.env.development? || Rails.env.test?
+          Rails.logger.error("[RubyLLM::Agents] Update data: #{update_data.inspect}")
+        end
+        raise
       rescue StandardError => e
-        Rails.logger.error("[RubyLLM::Agents] Failed to update execution record: #{e.message}")
-        raise # Re-raise so ensure block can handle emergency update
+        Rails.logger.error("[RubyLLM::Agents] Failed to update execution record #{execution&.id}: #{e.class}: #{e.message}")
+        if Rails.env.development? || Rails.env.test?
+          Rails.logger.error("[RubyLLM::Agents] Update data: #{update_data.inspect}")
+        end
+        raise
       end
-      # Fallback for when initial execution creation failed
+      # Updates execution record with completion data and attempt tracking
+      #
+      # Similar to complete_execution but handles multi-attempt scenarios with
+      # aggregated token counts and costs from all attempts.
+      #
+      # @param execution [Execution, nil] The execution record to update
+      # @param attempt_tracker [AttemptTracker] The attempt tracker with attempt data
+      # @param completed_at [Time] When the execution completed
+      # @param status [String] Final status ("success", "error", "timeout")
+      # @param error [Exception, nil] The exception (if failed)
+      # @return [void]
+      def complete_execution_with_attempts(execution, attempt_tracker:, completed_at:, status:, error: nil)
+        return unless execution
+        started_at = execution.started_at
+        duration_ms = ((completed_at - started_at) * 1000).round
+        config = RubyLLM::Agents.configuration
+        update_data = {
+          completed_at: completed_at,
+          duration_ms: duration_ms,
+          status: status,
+          attempts: attempt_tracker.to_json_array,
+          attempts_count: attempt_tracker.attempts_count,
+          chosen_model_id: attempt_tracker.chosen_model_id,
+          input_tokens: attempt_tracker.total_input_tokens,
+          output_tokens: attempt_tracker.total_output_tokens,
+          total_tokens: attempt_tracker.total_tokens,
+          cached_tokens: attempt_tracker.total_cached_tokens
+        }
+        # Add streaming metrics if available
+        update_data[:time_to_first_token_ms] = time_to_first_token_ms if respond_to?(:time_to_first_token_ms) && time_to_first_token_ms
+        # Add finish reason from response if available
+        if @last_response
+          finish_reason = safe_extract_finish_reason(@last_response)
+          update_data[:finish_reason] = finish_reason if finish_reason
+        end
+        # Add routing/retry tracking fields
+        routing_data = extract_routing_data(attempt_tracker, error)
+        update_data.merge!(routing_data)
+        # Add response data if we have a last response
+        if @last_response && config.persist_responses
+          update_data[:response] = redacted_response(@last_response)
+        end
+        # Add error data if failed
+        if error
+          update_data.merge!(
+            error_message: error.message.to_s.truncate(65535),
+            error_class: error.class.name
+          )
+        end
+        execution.update!(update_data)
+        # Calculate costs from all attempts
+        if attempt_tracker.attempts_count > 0
+          begin
+            execution.aggregate_attempt_costs!
+            execution.save!
+          rescue StandardError => cost_error
+            Rails.logger.warn("[RubyLLM::Agents] Cost calculation failed: #{cost_error.message}")
+          end
+        end
+      rescue ActiveRecord::RecordInvalid => e
+        Rails.logger.error("[RubyLLM::Agents] Validation failed for execution #{execution&.id}: #{e.record.errors.full_messages.join(', ')}")
+        if Rails.env.development? || Rails.env.test?
+          Rails.logger.error("[RubyLLM::Agents] Update data: #{update_data.inspect}")
+        end
+        raise
+      rescue StandardError => e
+        Rails.logger.error("[RubyLLM::Agents] Failed to update execution record #{execution&.id}: #{e.class}: #{e.message}")
+        if Rails.env.development? || Rails.env.test?
+          Rails.logger.error("[RubyLLM::Agents] Update data: #{update_data.inspect}")
+        end
+        raise
+      end
+      # Fallback logging when initial execution record creation failed
+      #
+      # Creates execution via background job or synchronously based on configuration.
+      # Used as a last resort to ensure execution data is captured.
+      #
+      # @param completed_at [Time] When the execution completed
+      # @param status [String] Final status
+      # @param response [RubyLLM::Message, nil] The LLM response
+      # @param error [Exception, nil] The exception if failed
+      # @return [void]
       def legacy_log_execution(completed_at:, status:, response: nil, error: nil)
         execution_data = {
           agent_type: self.class.name,
@@ -192,37 +465,72 @@ module RubyLLM
         end
       end
-      # Sanitize parameters to remove sensitive data
+      # Sanitizes parameters by removing sensitive data
+      #
+      # @deprecated Use {#redacted_parameters} instead
+      # @return [Hash] Sanitized parameters safe for logging
       def sanitized_parameters
-        params = @options.dup
-        # Remove sensitive keys
-        sensitive_keys = %i[password token api_key secret credential auth key]
-        sensitive_keys.each { |key| params.delete(key) }
-        # Convert ActiveRecord objects to IDs
-        params.transform_values do |value|
-          case value
-          when defined?(ActiveRecord::Base) && ActiveRecord::Base
-            { id: value.id, type: value.class.name }
-          when Array
-            if value.first.is_a?(ActiveRecord::Base)
-              { ids: value.first(10).map(&:id), type: value.first.class.name, count: value.size }
-            else
-              value.first(10)
-            end
-          else
-            value
-          end
-        end
+        redacted_parameters
+      end
+      # Returns parameters with sensitive data redacted using the Redactor
+      #
+      # Uses the configured redaction rules to remove sensitive fields and
+      # apply pattern-based redaction. Also converts ActiveRecord objects
+      # to ID references.
+      #
+      # @return [Hash] Redacted parameters safe for logging
+      def redacted_parameters
+        params = @options.except(:skip_cache, :dry_run)
+        Redactor.redact(params)
+      end
+      # Returns the system prompt with redaction applied
+      #
+      # @return [String, nil] The redacted system prompt
+      def redacted_system_prompt
+        prompt = safe_system_prompt
+        return nil unless prompt
+        Redactor.redact_string(prompt)
+      end
+      # Returns the user prompt with redaction applied
+      #
+      # @return [String, nil] The redacted user prompt
+      def redacted_user_prompt
+        prompt = safe_user_prompt
+        return nil unless prompt
+        Redactor.redact_string(prompt)
+      end
+      # Returns the response with redaction applied
+      #
+      # @param response [RubyLLM::Message] The LLM response
+      # @return [Hash] Redacted response data
+      def redacted_response(response)
+        data = safe_serialize_response(response)
+        Redactor.redact(data)
       end
-      # Hook for subclasses to add custom metadata
+      # Hook for subclasses to add custom metadata to executions
+      #
+      # Override this method in your agent to include application-specific
+      # data like user IDs, request IDs, or feature flags.
+      #
+      # @return [Hash] Custom metadata to store with the execution
+      # @example
+      #   def execution_metadata
+      #     { user_id: Current.user&.id, experiment: "v2" }
+      #   end
       def execution_metadata
         {}
       end
-      # Safely capture system prompt (may raise or return nil)
+      # Safely captures system prompt, handling errors gracefully
+      #
+      # @return [String, nil] The system prompt or nil if unavailable
       def safe_system_prompt
         respond_to?(:system_prompt) ? system_prompt.to_s : nil
       rescue StandardError => e
@@ -230,7 +538,9 @@ module RubyLLM
         nil
       end
-      # Safely capture user prompt (may raise or return nil)
+      # Safely captures user prompt, handling errors gracefully
+      #
+      # @return [String, nil] The user prompt or nil if unavailable
       def safe_user_prompt
         respond_to?(:user_prompt) ? user_prompt.to_s : nil
       rescue StandardError => e
@@ -238,7 +548,12 @@ module RubyLLM
         nil
       end
-      # Safely extract a value from response, returning default if method doesn't exist
+      # Safely extracts a value from response object
+      #
+      # @param response [Object] The response object
+      # @param method [Symbol] The method to call
+      # @param default [Object] Default value if method unavailable
+      # @return [Object] The extracted value or default
       def safe_response_value(response, method, default = nil)
         return default unless response.respond_to?(method)
         response.public_send(method)
@@ -246,7 +561,10 @@ module RubyLLM
         default
       end
-      # Safely extract all response data with fallbacks
+      # Extracts all response metrics with safe fallbacks
+      #
+      # @param response [RubyLLM::Message, nil] The LLM response
+      # @return [Hash] Extracted response data (empty if response invalid)
       def safe_extract_response_data(response)
         return {} unless response.is_a?(RubyLLM::Message)
@@ -256,11 +574,120 @@ module RubyLLM
           cached_tokens: safe_response_value(response, :cached_tokens, 0),
           cache_creation_tokens: safe_response_value(response, :cache_creation_tokens, 0),
           model_id: safe_response_value(response, :model_id),
+          finish_reason: safe_extract_finish_reason(response),
           response: safe_serialize_response(response)
         }.compact
       end
-      # Safe version of serialize_response
+      # Extracts finish reason from response, normalizing to standard values
+      #
+      # @param response [RubyLLM::Message] The LLM response
+      # @return [String, nil] Normalized finish reason
+      def safe_extract_finish_reason(response)
+        reason = safe_response_value(response, :finish_reason) ||
+                 safe_response_value(response, :stop_reason)
+        return nil unless reason
+        # Normalize to standard values
+        normalized = reason.to_s.downcase
+        case normalized
+        when "stop", "end_turn", "stop_sequence"
+          "stop"
+        when "length", "max_tokens"
+          "length"
+        when "content_filter", "safety"
+          "content_filter"
+        when "tool_calls", "tool_use", "function_call"
+          "tool_calls"
+        else
+          "other"
+        end
+      end
+      # Extracts routing/retry tracking data from attempt tracker
+      #
+      # Analyzes the execution attempts to determine:
+      # - Why a fallback was used (fallback_reason)
+      # - Whether the error is retryable
+      # - Whether rate limiting occurred
+      #
+      # @param attempt_tracker [AttemptTracker] The attempt tracker
+      # @param error [Exception, nil] The final error (if any)
+      # @return [Hash] Routing data to merge into execution
+      def extract_routing_data(attempt_tracker, error)
+        data = {}
+        # Determine if a fallback was used and why
+        if attempt_tracker.used_fallback?
+          data[:fallback_reason] = determine_fallback_reason(attempt_tracker)
+        end
+        # Check if error is retryable
+        if error
+          data[:retryable] = retryable_error?(error)
+          data[:rate_limited] = rate_limit_error?(error)
+        end
+        data
+      end
+      # Determines the reason for using a fallback model
+      #
+      # @param attempt_tracker [AttemptTracker] The attempt tracker
+      # @return [String] Fallback reason
+      def determine_fallback_reason(attempt_tracker)
+        # Analyze failed attempts to determine why fallback was needed
+        failed = attempt_tracker.failed_attempts
+        return "other" if failed.empty?
+        last_failed = failed.last
+        error_class = last_failed[:error_class]
+        case error_class
+        when /RateLimitError/, /TooManyRequestsError/
+          "rate_limit"
+        when /Timeout/
+          "timeout"
+        when /ContentFilter/, /SafetyError/
+          "safety"
+        when /BudgetExceeded/
+          "price_limit"
+        else
+          "error"
+        end
+      end
+      # Checks if an error is retryable
+      #
+      # @param error [Exception] The error
+      # @return [Boolean] true if retryable
+      def retryable_error?(error)
+        return false unless error
+        # Check against known retryable error patterns
+        error_class = error.class.name
+        error_class.match?(/Timeout|ConnectionError|RateLimitError|ServiceUnavailable|BadGateway/)
+      end
+      # Checks if an error indicates rate limiting
+      #
+      # @param error [Exception] The error
+      # @return [Boolean] true if rate limited
+      def rate_limit_error?(error)
+        return false unless error
+        error_class = error.class.name
+        error_message = error.message.to_s.downcase
+        error_class.match?(/RateLimitError|TooManyRequests/) ||
+          error_message.include?("rate limit") ||
+          error_message.include?("too many requests")
+      end
+      # Serializes response to a hash for storage
+      #
+      # @param response [RubyLLM::Message] The LLM response
+      # @return [Hash] Serialized response data
       def safe_serialize_response(response)
         {
           content: safe_response_value(response, :content),
@@ -272,17 +699,41 @@ module RubyLLM
         }.compact
       end
-      # Emergency fallback - mark execution as error using update_columns
-      # Bypasses callbacks/validations to ensure status is always updated
+      # Emergency fallback to mark execution as failed
+      #
+      # Uses update_all to bypass ActiveRecord callbacks and validations,
+      # ensuring the status is updated even if the model is in an invalid state.
+      # Only updates records that are still in 'running' status to prevent
+      # race conditions.
+      #
+      # @param execution [Execution, nil] The execution record
+      # @param error [Exception, nil] The exception that caused the failure
+      # @return [void]
       def mark_execution_failed!(execution, error: nil)
         return unless execution&.id
         return unless execution.status == "running"
+        # If no error was captured, create a synthetic one with current stack trace
+        # This helps debug cases where error details are lost
+        if error.nil?
+          Rails.logger.error("[RubyLLM::Agents] BUG: mark_execution_failed! called with nil error")
+          Rails.logger.error("[RubyLLM::Agents] Stack trace:\n  #{caller.first(15).join("\n  ")}")
+          synthetic_error = RuntimeError.new("No error was captured - check logs for stack trace")
+          synthetic_error.set_backtrace(caller)
+          error = synthetic_error
+        end
+        # Build a detailed error message including backtrace for debugging
+        backtrace_info = error.backtrace&.first(5)&.join("\n  ") || ""
+        error_message = "#{error.class}: #{error.message}"
+        error_message += "\n  #{backtrace_info}" if backtrace_info.present?
         update_data = {
           status: "error",
           completed_at: Time.current,
-          error_class: error&.class&.name || "InstrumentationError",
-          error_message: (error&.message || "Execution status update failed").to_s.truncate(65535)
+          error_class: error.class.name,
+          error_message: error_message.to_s.truncate(65535)
         }
         execution.class.where(id: execution.id, status: "running").update_all(update_data)