RubyGems - ruby_llm-contract - Versions diffs - 0.5.2 → 0.6.2 - Mend

ruby_llm-contract 0.5.2 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +49 -2
data/Gemfile.lock +2 -2
data/README.md +173 -134
data/lib/ruby_llm/contract/concerns/eval_host.rb +25 -6
data/lib/ruby_llm/contract/eval/model_comparison.rb +33 -11
data/lib/ruby_llm/contract/eval/recommendation.rb +48 -0
data/lib/ruby_llm/contract/eval/recommender.rb +132 -0
data/lib/ruby_llm/contract/eval/report.rb +2 -2
data/lib/ruby_llm/contract/eval/report_stats.rb +6 -0
data/lib/ruby_llm/contract/eval/report_storage.rb +18 -12
data/lib/ruby_llm/contract/eval/retry_optimizer.rb +221 -0
data/lib/ruby_llm/contract/eval.rb +3 -0
data/lib/ruby_llm/contract/rake_task.rb +83 -0
data/lib/ruby_llm/contract/step/base.rb +30 -0
data/lib/ruby_llm/contract/step/retry_executor.rb +9 -3
data/lib/ruby_llm/contract/step/retry_policy.rb +27 -14
data/lib/ruby_llm/contract/version.rb +1 -1
data/lib/ruby_llm/contract.rb +21 -0
metadata +4 -1

data/lib/ruby_llm/contract/eval/recommendation.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Contract
+    module Eval
+      class Recommendation
+        include Concerns::DeepFreeze
+        attr_reader :best, :retry_chain, :score, :cost_per_call,
+                    :rationale, :current_config, :savings, :warnings
+        def initialize(best:, retry_chain:, score:, cost_per_call:,
+                       rationale:, current_config:, savings:, warnings:)
+          @best = deep_dup_freeze(best)
+          @retry_chain = deep_dup_freeze(retry_chain)
+          @score = score
+          @cost_per_call = cost_per_call
+          @rationale = deep_dup_freeze(rationale)
+          @current_config = deep_dup_freeze(current_config)
+          @savings = deep_dup_freeze(savings)
+          @warnings = deep_dup_freeze(warnings)
+          freeze
+        end
+        def to_dsl
+          return "# No recommendation — no candidate met the minimum score" if retry_chain.empty?
+          if retry_chain.length == 1 && retry_chain.first.keys == [:model]
+            "model \"#{retry_chain.first[:model]}\""
+          elsif retry_chain.all? { |c| c.keys == [:model] }
+            models_str = retry_chain.map { |c| c[:model] }.join(" ")
+            "retry_policy models: %w[#{models_str}]"
+          else
+            args = retry_chain.map { |c| config_to_ruby(c) }.join(",\n             ")
+            "retry_policy do\n  escalate(#{args})\nend"
+          end
+        end
+        private
+        def config_to_ruby(config)
+          pairs = config.map { |k, v| "#{k}: #{v.inspect}" }.join(", ")
+          "{ #{pairs} }"
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/contract/eval/recommender.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Contract
+    module Eval
+      class Recommender
+        def initialize(comparison:, min_score:, min_first_try_pass_rate: 0.8, current_config: nil)
+          @comparison = comparison
+          @min_score = min_score
+          @min_first_try_pass_rate = min_first_try_pass_rate
+          @current_config = current_config
+        end
+        def recommend
+          scored = build_scored_candidates
+          best = select_best(scored)
+          chain = build_retry_chain(scored, best)
+          rationale = build_rationale(scored, best)
+          warnings = build_warnings(scored)
+          savings = best ? calculate_savings(best) : {}
+          Recommendation.new(
+            best: best&.dig(:config),
+            retry_chain: chain,
+            score: best&.dig(:score) || 0.0,
+            cost_per_call: best&.dig(:cost_per_call) || 0.0,
+            rationale: rationale,
+            current_config: @current_config,
+            savings: savings,
+            warnings: warnings
+          )
+        end
+        private
+        def build_scored_candidates
+          @comparison.configs.filter_map do |label, config|
+            report = @comparison.reports[label]
+            next nil unless report
+            evaluated_count = report.results.count { |r| r.step_status != :skipped }
+            cases_count = [evaluated_count, 1].max
+            cost_per_call = report.total_cost.to_f / cases_count
+            {
+              label: label,
+              config: config,
+              score: report.score,
+              cost_per_call: cost_per_call,
+              latency: report.avg_latency_ms || Float::INFINITY,
+              pass_rate_ratio: report.pass_rate_ratio,
+              total_cost: report.total_cost
+            }
+          end
+        end
+        def select_best(scored)
+          eligible = scored.select { |s| s[:score] >= @min_score && cost_known?(s) }
+          eligible.min_by { |s| [s[:cost_per_call], s[:latency], s[:label]] }
+        end
+        def build_retry_chain(scored, best)
+          return [] unless best
+          first_try = scored
+            .select { |s| s[:pass_rate_ratio] >= @min_first_try_pass_rate && cost_known?(s) }
+            .min_by { |s| [s[:cost_per_call], s[:latency], s[:label]] }
+          if first_try.nil? || first_try[:label] == best[:label]
+            [best[:config]]
+          else
+            [first_try[:config], best[:config]]
+          end
+        end
+        def build_rationale(scored, best)
+          sorted = scored.sort_by { |s| [cost_known?(s) ? 0 : 1, s[:cost_per_call], s[:latency], s[:label]] }
+          sorted.map { |s| rationale_line(s, best) }
+        end
+        def rationale_line(candidate, best)
+          cost_str = cost_known?(candidate) ? "$#{format("%.4f", candidate[:cost_per_call])}/call" : "unknown pricing"
+          header = "#{candidate[:label]}, score #{format("%.2f", candidate[:score])}, at #{cost_str}"
+          notes = rationale_notes(candidate, best)
+          notes.any? ? "#{header} — #{notes.join(", ")}" : header
+        end
+        def rationale_notes(candidate, best)
+          notes = []
+          pass_pct = (candidate[:pass_rate_ratio] * 100).round
+          below_threshold = candidate[:score] < @min_score
+          if below_threshold && candidate[:pass_rate_ratio] >= @min_first_try_pass_rate
+            notes << "below #{@min_score} threshold, but good first-try (#{pass_pct}% pass rate)"
+          elsif below_threshold
+            notes << "below #{@min_score} threshold"
+          elsif candidate[:pass_rate_ratio] < 1.0
+            notes << "#{pass_pct}% pass rate"
+          end
+          notes << "recommended" if best && candidate[:label] == best[:label]
+          notes << "unknown pricing" unless cost_known?(candidate)
+          notes
+        end
+        def build_warnings(scored)
+          scored.reject { |s| cost_known?(s) }
+                .map { |s| "#{s[:label]}: unknown pricing — cost ranking may be inaccurate" }
+        end
+        def calculate_savings(best)
+          return {} unless @current_config
+          current_label = ModelComparison.candidate_label(@current_config)
+          current_report = @comparison.reports[current_label]
+          return {} unless current_report
+          current_evaluated = current_report.results.count { |r| r.step_status != :skipped }
+          current_cases = [current_evaluated, 1].max
+          current_cost = current_report.total_cost.to_f / current_cases
+          diff = current_cost - best[:cost_per_call]
+          return {} unless diff.positive?
+          { per_call: diff.round(6), monthly_at: { 10_000 => (diff * 10_000).round(2) } }
+        end
+        def cost_known?(scored_candidate)
+          scored_candidate[:cost_per_call]&.positive?
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/contract/eval/report.rb CHANGED Viewed

@@ -14,8 +14,8 @@ module RubyLLM
         HISTORY_DIR = ".eval_history"
         BASELINE_DIR = ".eval_baselines"
-        def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :total_cost, :avg_latency_ms,
-                       :passed?
+        def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :pass_rate_ratio,
+                       :total_cost, :avg_latency_ms, :passed?
         def_delegators :@presenter, :summary, :to_s, :print_summary
         def_delegators :@storage, :save_history!, :eval_history, :save_baseline!, :compare_with_baseline,
                        :baseline_exists?

data/lib/ruby_llm/contract/eval/report_stats.rb CHANGED Viewed

@@ -35,6 +35,12 @@ module RubyLLM
           "#{passed}/#{evaluated_results.length}"
         end
+        def pass_rate_ratio
+          return 0.0 if evaluated_results.empty?
+          passed.to_f / evaluated_results.length
+        end
         def total_cost
           @results.sum { |result| result.cost || 0.0 }
         end

data/lib/ruby_llm/contract/eval/report_storage.rb CHANGED Viewed

@@ -13,25 +13,29 @@ module RubyLLM
           @stats = stats
         end
-        def save_history!(path: nil, model: nil)
-          file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model)
-          EvalHistory.append(file, history_entry)
+        def save_history!(path: nil, model: nil, reasoning_effort: nil)
+          file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model, reasoning_effort: reasoning_effort)
+          entry = history_entry
+          entry[:model] = model if model
+          entry[:reasoning_effort] = reasoning_effort if reasoning_effort
+          EvalHistory.append(file, entry)
           file
         end
-        def eval_history(path: nil, model: nil)
-          EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model))
+        def eval_history(path: nil, model: nil, reasoning_effort: nil)
+          EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model,
+                                                                              reasoning_effort: reasoning_effort))
         end
-        def save_baseline!(path: nil, model: nil)
-          file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
+        def save_baseline!(path: nil, model: nil, reasoning_effort: nil)
+          file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
           FileUtils.mkdir_p(File.dirname(file))
           File.write(file, JSON.pretty_generate(serialize_for_baseline))
           file
         end
-        def compare_with_baseline(path: nil, model: nil)
-          file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
+        def compare_with_baseline(path: nil, model: nil, reasoning_effort: nil)
+          file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
           raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
           baseline_data = JSON.parse(File.read(file), symbolize_names: true)
@@ -43,8 +47,8 @@ module RubyLLM
           )
         end
-        def baseline_exists?(path: nil, model: nil)
-          File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model))
+        def baseline_exists?(path: nil, model: nil, reasoning_effort: nil)
+          File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort))
         end
         private
@@ -55,6 +59,7 @@ module RubyLLM
             score: @stats.score,
             total_cost: @stats.total_cost,
             pass_rate: @stats.pass_rate,
+            pass_rate_ratio: @stats.pass_rate_ratio,
             cases_count: @stats.evaluated_results_count
           }
         end
@@ -79,12 +84,13 @@ module RubyLLM
           }
         end
-        def storage_path(root_dir, extension, model:)
+        def storage_path(root_dir, extension, model:, reasoning_effort: nil)
           parts = [root_dir]
           parts << sanitize_name(@report.step_name) if @report.step_name
           dataset_name = sanitize_name(@report.dataset_name)
           dataset_name = "#{dataset_name}_#{sanitize_name(model)}" if model
+          dataset_name = "#{dataset_name}_effort_#{sanitize_name(reasoning_effort)}" if reasoning_effort
           File.join(*parts, "#{dataset_name}.#{extension}")
         end

data/lib/ruby_llm/contract/eval/retry_optimizer.rb ADDED Viewed

@@ -0,0 +1,221 @@
+# frozen_string_literal: true
+require "set"
+module RubyLLM
+  module Contract
+    module Eval
+      # Runs compare_models on ALL evals for a step, builds a score matrix,
+      # identifies the constraining eval, and suggests an escalation chain.
+      #
+      #   optimizer = RetryOptimizer.new(step: MyStep, candidates: [...], context: {})
+      #   result = optimizer.call
+      #   result.print_summary
+      #   result.to_dsl  # => copy-paste retry_policy
+      class RetryOptimizer
+        Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
+                            :constraining_eval, :chain, :chain_details, keyword_init: true) do
+          def print_summary(io = $stdout)
+            io.puts "#{step_name} — retry chain optimization"
+            io.puts
+            print_table(io)
+            io.puts
+            print_chain(io)
+            io.puts
+            print_dsl(io)
+          end
+          def to_dsl
+            return "# No viable chain — no candidate passes all evals" if chain.empty?
+            if chain.all? { |c| c.keys == [:model] }
+              models_str = chain.map { |c| c[:model] }.join(" ")
+              "retry_policy models: %w[#{models_str}]"
+            else
+              args = chain.map { |c| config_to_ruby(c) }.join(",\n    ")
+              "retry_policy do\n  escalate(\n    #{args}\n  )\nend"
+            end
+          end
+          private
+          def print_table(io)
+            short_labels = candidate_labels.map { |l| short_candidate_label(l) }
+            col_width = [short_labels.map(&:length).max || 0, 8].max
+            eval_width = [eval_names.map { |e| e.to_s.length }.max || 0, 12].max
+            header = format("  %-#{eval_width}s", "eval") + short_labels.map { |l| format("  %#{col_width}s", l) }.join
+            io.puts header
+            io.puts "  #{"-" * (eval_width + (col_width + 2) * short_labels.size)}"
+            eval_names.each do |eval_name|
+              row = format("  %-#{eval_width}s", eval_name.to_s)
+              candidate_labels.each do |label|
+                score = score_matrix.dig(eval_name, label) || 0.0
+                marker = eval_name == constraining_eval && score < 1.0 ? " ←" : "  "
+                row += format("  %#{col_width - 2}.2f%s", score, marker)
+              end
+              io.puts row
+            end
+            io.puts
+            io.puts "  Constraining eval: #{constraining_eval}" if constraining_eval
+          end
+          def print_chain(io)
+            if chain.empty?
+              io.puts "  No viable chain."
+              return
+            end
+            io.puts "  Suggested chain:"
+            chain_details.each_with_index do |detail, i|
+              suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
+              io.puts "    #{detail[:label]} — #{suffix}"
+            end
+          end
+          def short_candidate_label(label)
+            label
+              .sub("gpt-5-", "")
+              .sub("gpt-4.1", "4.1")
+              .sub(" (effort: ", "@")
+              .sub(")", "")
+          end
+          def print_dsl(io)
+            io.puts "  DSL:"
+            to_dsl.each_line { |line| io.puts "    #{line}" }
+          end
+          def config_to_ruby(config)
+            pairs = config.map { |k, v| "#{k}: #{v.inspect}" }.join(", ")
+            "{ #{pairs} }"
+          end
+        end
+        def initialize(step:, candidates:, context: {}, min_score: 0.95)
+          @step = step
+          @candidates = candidates
+          @context = context
+          @min_score = min_score
+        end
+        def call
+          evals = @step.eval_names
+          return empty_result(evals) if evals.empty?
+          score_matrix = {}
+          evals.each do |eval_name|
+            comparison = with_retry_disabled do
+              @step.compare_models(eval_name, candidates: @candidates, context: @context)
+            end
+            score_matrix[eval_name] = extract_scores(comparison)
+          end
+          labels = score_matrix.values.flat_map(&:keys).uniq
+          constraining = find_constraining_eval(score_matrix, labels)
+          chain, details = build_chain(score_matrix, labels, evals)
+          Result.new(
+            step_name: @step.name || @step.to_s,
+            eval_names: evals,
+            candidate_labels: labels,
+            score_matrix: score_matrix,
+            constraining_eval: constraining,
+            chain: chain,
+            chain_details: details
+          )
+        end
+        private
+        def extract_scores(comparison)
+          comparison.reports.transform_values(&:score)
+        end
+        def find_constraining_eval(matrix, labels)
+          matrix.max_by do |_eval_name, scores|
+            cheapest_passing = labels.find { |l| (scores[l] || 0) >= @min_score }
+            cheapest_passing ? labels.index(cheapest_passing) : labels.size
+          end&.first
+        end
+        # Retry escalates on validation_failed/parse_error, NOT on low eval
+        # score. A model that returns :ok with semantically wrong output won't
+        # trigger retry. Therefore the LAST model in the chain must pass ALL
+        # evals — it's the safety net. Cheaper models are prepended as
+        # first-try optimization (they handle easy inputs cheaply; when they
+        # fail validation, retry escalates to the safe fallback).
+        #
+        # Known limitation: intermediate models are assumed safe if their eval
+        # failures correspond to validation failures (retryable). If an
+        # intermediate model returns :ok with semantically wrong output on
+        # some eval, retry won't fire and the safe fallback won't run. This
+        # requires step validates to cover the same semantics as eval verify
+        # checks. A future version could inspect per-case step_status from
+        # compare_models to verify failures are actually retryable.
+        def build_chain(matrix, labels, evals)
+          total = evals.size
+          # Find cheapest model that passes every eval — the safe fallback.
+          safe_fallback = labels.find { |l| evals.all? { |e| (matrix.dig(e, l) || 0) >= @min_score } }
+          return [[], []] unless safe_fallback
+          # Prepend cheaper models that pass a strict subset.
+          chain = []
+          details = []
+          covered_evals = Set.new
+          labels.each do |label|
+            break if label == safe_fallback
+            newly_covered = evals.select { |e| (matrix.dig(e, label) || 0) >= @min_score }
+            new_additions = newly_covered.to_set - covered_evals
+            next if new_additions.empty?
+            covered_evals.merge(new_additions)
+            chain << parse_label_to_config(label)
+            details << { label: label, passes: new_additions.size, cost: label }
+          end
+          # Always end with the safe fallback.
+          chain << parse_label_to_config(safe_fallback)
+          details << { label: safe_fallback, passes: total, cost: safe_fallback }
+          [chain, details]
+        end
+        def parse_label_to_config(label)
+          if label.match?(/\(effort: (\w+)\)/)
+            model = label.sub(/\s*\(effort:.*/, "").strip
+            effort = label.match(/\(effort: (\w+)\)/)[1]
+            { model: model, reasoning_effort: effort }
+          else
+            { model: label }
+          end
+        end
+        def with_retry_disabled(&block)
+          original = @step.retry_policy if @step.respond_to?(:retry_policy)
+          @step.define_singleton_method(:retry_policy) { nil }
+          block.call
+        ensure
+          @step.define_singleton_method(:retry_policy) { original }
+        end
+        def empty_result(evals)
+          Result.new(
+            step_name: @step.name || @step.to_s,
+            eval_names: evals,
+            candidate_labels: [],
+            score_matrix: {},
+            constraining_eval: nil,
+            chain: [],
+            chain_details: []
+          )
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/contract/eval.rb CHANGED Viewed

@@ -29,3 +29,6 @@ require_relative "eval/prompt_diff_comparator"
 require_relative "eval/prompt_diff_presenter"
 require_relative "eval/prompt_diff"
 require_relative "eval/eval_history"
+require_relative "eval/recommendation"
+require_relative "eval/recommender"
+require_relative "eval/retry_optimizer"

data/lib/ruby_llm/contract/rake_task.rb CHANGED Viewed

@@ -121,5 +121,88 @@ module RubyLLM
         defined?(::Rails) ? [:environment] : []
       end
     end
+    # Standalone task: runs all evals for one step across candidates,
+    # builds a score matrix, and suggests an optimal retry chain.
+    #
+    # Loaded automatically when `require "ruby_llm/contract/rake_task"`.
+    # Usage:
+    #   rake ruby_llm_contract:optimize \
+    #     STEP=MatchProblemsToPages \
+    #     CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini
+    class OptimizeRakeTask < ::Rake::TaskLib
+      def initialize
+        super()
+        define_task
+      end
+      private
+      def define_task
+        desc "Run all evals for STEP with CANDIDATES and suggest an optimal retry chain"
+        task(:"ruby_llm_contract:optimize" => task_prerequisites) do
+          require "ruby_llm/contract"
+          eval_dirs = ENV["EVAL_DIRS"].to_s.split(",").map(&:strip).reject(&:empty?)
+          RubyLLM::Contract.load_evals!(*eval_dirs)
+          step_name = ENV["STEP"].to_s.strip
+          abort("STEP is required, e.g. STEP=MatchProblemsToPages") if step_name.empty?
+          raw_candidates = ENV["CANDIDATES"].to_s.strip
+          abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if raw_candidates.empty?
+          min_score = ENV.fetch("MIN_SCORE", "0.95").to_f
+          host = RubyLLM::Contract.eval_hosts.find { |h| h.name == step_name }
+          unless host
+            available = RubyLLM::Contract.eval_hosts.filter_map(&:name).sort
+            abort "Unknown STEP=#{step_name}. Available: #{available.join(", ")}"
+          end
+          candidates = parse_candidates(raw_candidates)
+          context = build_context
+          result = host.optimize_retry_policy(
+            candidates: candidates,
+            context: context,
+            min_score: min_score
+          )
+          result.print_summary
+        end
+      end
+      def parse_candidates(raw)
+        entries = if raw.start_with?("[")
+                    Array(JSON.parse(raw))
+                  else
+                    raw.split(",").map(&:strip).reject(&:empty?).map do |entry|
+                      model, effort = entry.split("@", 2)
+                      config = { model: model.strip }
+                      config[:reasoning_effort] = effort.strip if effort && !effort.empty?
+                      config
+                    end
+                  end
+        entries.map { |e| RubyLLM::Contract.normalize_candidate_config(e) }.uniq
+      end
+      def build_context
+        ctx = {}
+        provider = ENV["PROVIDER"].to_s.strip
+        # Only inject real adapter when LIVE=1 or PROVIDER is set — otherwise
+        # evals use sample_response (offline mode, zero API calls).
+        if ENV["LIVE"] == "1" || !provider.empty?
+          ctx[:adapter] = RubyLLM::Contract::Adapters::RubyLLM.new
+          ctx[:provider] = provider.downcase.to_sym unless provider.empty?
+        end
+        ctx
+      end
+      def task_prerequisites
+        defined?(::Rails) ? [:environment] : []
+      end
+    end
+    # Auto-register the optimize task when this file is loaded
+    OptimizeRakeTask.new
   end
 end

data/lib/ruby_llm/contract/step/base.rb CHANGED Viewed

@@ -49,6 +49,25 @@ module RubyLLM
             end
           end
+          def recommend(eval_name, candidates:, min_score: 0.95, min_first_try_pass_rate: 0.8, context: {})
+            comparison = compare_models(eval_name, candidates: candidates, context: context)
+            Eval::Recommender.new(
+              comparison: comparison,
+              min_score: min_score,
+              min_first_try_pass_rate: min_first_try_pass_rate,
+              current_config: current_model_config
+            ).recommend
+          end
+          def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
+            Eval::RetryOptimizer.new(
+              step: self,
+              candidates: candidates,
+              context: context,
+              min_score: min_score
+            ).call
+          end
           KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists reasoning_effort].freeze
           include Concerns::ContextHelpers
@@ -144,6 +163,17 @@ module RubyLLM
             }
           end
+          def current_model_config
+            policy = retry_policy
+            if policy && policy.config_list.any?
+              policy.config_list.first
+            elsif respond_to?(:model) && model
+              { model: model }
+            elsif RubyLLM::Contract.configuration.default_model
+              { model: RubyLLM::Contract.configuration.default_model }
+            end
+          end
           def resolve_adapter(context)
             adapter = context[:adapter] || RubyLLM::Contract.configuration.default_adapter
             return adapter if adapter

data/lib/ruby_llm/contract/step/retry_executor.rb CHANGED Viewed

@@ -10,12 +10,16 @@ module RubyLLM
         def run_with_retry(input, adapter:, default_model:, policy:, context_temperature: nil, extra_options: {})
           all_attempts = []
+          default_config = { model: default_model }.merge(extra_options.slice(:reasoning_effort).compact)
           policy.max_attempts.times do |attempt_index|
-            model = policy.model_for_attempt(attempt_index, default_model)
+            config = policy.config_for_attempt(attempt_index, default_config)
+            model = config[:model]
+            attempt_extra = extra_options.merge(config.except(:model))
             result = run_once(input, adapter: adapter, model: model,
-                              context_temperature: context_temperature, extra_options: extra_options)
-            all_attempts << { attempt: attempt_index + 1, model: model, result: result }
+                              context_temperature: context_temperature, extra_options: attempt_extra)
+            all_attempts << { attempt: attempt_index + 1, model: model, config: config, result: result }
             break unless policy.retryable?(result)
           end
@@ -43,6 +47,8 @@ module RubyLLM
         def build_attempt_entry(attempt)
           trace = attempt[:result].trace
           entry = { attempt: attempt[:attempt], model: attempt[:model], status: attempt[:result].status }
+          config = attempt[:config]
+          entry[:config] = config if config && config.keys != [:model]
           append_trace_fields(entry, trace)
         end