ruby_llm-contract 0.5.2 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Immutable value object describing the outcome of a model recommendation.
#
# Collection-like inputs are passed through +deep_dup_freeze+ (from
# Concerns::DeepFreeze) before being stored, and the instance itself is
# frozen at the end of construction.
class Recommendation
  include Concerns::DeepFreeze

  attr_reader :best, :retry_chain, :score, :cost_per_call,
              :rationale, :current_config, :savings, :warnings

  # All arguments are required keywords. +score+ and +cost_per_call+ are
  # stored as given; every other value goes through +deep_dup_freeze+.
  def initialize(best:, retry_chain:, score:, cost_per_call:,
                 rationale:, current_config:, savings:, warnings:)
    @score = score
    @cost_per_call = cost_per_call

    {
      best: best,
      retry_chain: retry_chain,
      rationale: rationale,
      current_config: current_config,
      savings: savings,
      warnings: warnings
    }.each do |attribute, value|
      instance_variable_set(:"@#{attribute}", deep_dup_freeze(value))
    end

    freeze
  end

  # Renders the retry chain as a DSL snippet.
  #
  # Returns a comment line when the chain is empty, a single +model+ line
  # for one model-only entry, the +%w[]+ shorthand when every entry only
  # names a model, and an +escalate(...)+ block otherwise.
  def to_dsl
    return "# No recommendation — no candidate met the minimum score" if retry_chain.empty?

    model_only = retry_chain.all? { |entry| entry.keys == [:model] }
    unless model_only
      rendered = retry_chain.map { |entry| config_to_ruby(entry) }
      return "retry_policy do\n escalate(#{rendered.join(",\n ")})\nend"
    end

    names = retry_chain.map { |entry| entry[:model] }
    return "model \"#{names.first}\"" if names.length == 1

    "retry_policy models: %w[#{names.join(" ")}]"
  end

  private

  # Formats one config hash as a Ruby hash literal with keyword-style keys.
  def config_to_ruby(config)
    body = config.map { |key, value| "#{key}: #{value.inspect}" }.join(", ")
    "{ #{body} }"
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
# Turns a model comparison into a Recommendation: the cheapest candidate
# whose score reaches +min_score+, an optional cheaper first-try model,
# one rationale line per candidate, and warnings for candidates whose
# per-call cost is not positive (treated as unknown pricing).
class Recommender
  def initialize(comparison:, min_score:, min_first_try_pass_rate: 0.8, current_config: nil)
    @comparison = comparison
    @min_score = min_score
    @min_first_try_pass_rate = min_first_try_pass_rate
    @current_config = current_config
  end

  # Builds the Recommendation. When no candidate qualifies, +best+ is nil,
  # the chain is empty and score/cost fall back to 0.0.
  def recommend
    candidates = build_scored_candidates
    winner = select_best(candidates)

    Recommendation.new(
      best: winner&.dig(:config),
      retry_chain: build_retry_chain(candidates, winner),
      score: winner&.dig(:score) || 0.0,
      cost_per_call: winner&.dig(:cost_per_call) || 0.0,
      rationale: build_rationale(candidates, winner),
      current_config: @current_config,
      savings: winner ? calculate_savings(winner) : {},
      warnings: build_warnings(candidates)
    )
  end

  private

  # One summary hash per config that has a report; configs without a
  # report are dropped.
  def build_scored_candidates
    @comparison.configs.each_with_object([]) do |(label, config), rows|
      report = @comparison.reports[label]
      next unless report

      rows << {
        label: label,
        config: config,
        score: report.score,
        cost_per_call: per_call_cost(report),
        # A missing latency sorts last.
        latency: report.avg_latency_ms || Float::INFINITY,
        pass_rate_ratio: report.pass_rate_ratio,
        total_cost: report.total_cost
      }
    end
  end

  # Total cost divided by the number of non-skipped results (at least 1,
  # so an all-skipped report does not divide by zero).
  def per_call_cost(report)
    evaluated = report.results.count { |result| result.step_status != :skipped }
    divisor = evaluated.zero? ? 1 : evaluated
    report.total_cost.to_f / divisor
  end

  # Cheapest candidate meeting the score threshold with known pricing;
  # ties are broken by latency, then label.
  def select_best(scored)
    qualifying = scored.select { |row| row[:score] >= @min_score && cost_known?(row) }
    qualifying.min_by { |row| ranking_key(row) }
  end

  # [first_try, best] when a different, cheapest candidate clears the
  # first-try pass rate; otherwise just [best]. Empty without a best.
  def build_retry_chain(scored, best)
    return [] unless best

    opener = scored
             .select { |row| row[:pass_rate_ratio] >= @min_first_try_pass_rate && cost_known?(row) }
             .min_by { |row| ranking_key(row) }

    return [best[:config]] if opener.nil? || opener[:label] == best[:label]

    [opener[:config], best[:config]]
  end

  # Rationale lines ordered by known pricing first, then cost, latency, label.
  def build_rationale(scored, best)
    ordered = scored.sort_by { |row| [cost_known?(row) ? 0 : 1, *ranking_key(row)] }
    ordered.map { |row| rationale_line(row, best) }
  end

  def rationale_line(candidate, best)
    price =
      if cost_known?(candidate)
        "$#{format("%.4f", candidate[:cost_per_call])}/call"
      else
        "unknown pricing"
      end
    header = "#{candidate[:label]}, score #{format("%.2f", candidate[:score])}, at #{price}"

    notes = rationale_notes(candidate, best)
    return header if notes.empty?

    "#{header} — #{notes.join(", ")}"
  end

  # Up to three notes: a quality remark, "recommended", "unknown pricing".
  def rationale_notes(candidate, best)
    ratio = candidate[:pass_rate_ratio]
    percent = (ratio * 100).round

    quality =
      if candidate[:score] < @min_score
        if ratio >= @min_first_try_pass_rate
          "below #{@min_score} threshold, but good first-try (#{percent}% pass rate)"
        else
          "below #{@min_score} threshold"
        end
      elsif ratio < 1.0
        "#{percent}% pass rate"
      end

    [
      quality,
      ("recommended" if best && candidate[:label] == best[:label]),
      ("unknown pricing" unless cost_known?(candidate))
    ].compact
  end

  def build_warnings(scored)
    scored.filter_map do |row|
      "#{row[:label]}: unknown pricing — cost ranking may be inaccurate" unless cost_known?(row)
    end
  end

  # Per-call and per-10,000-call savings of +best+ over the current
  # config. Empty when there is no current config, no report for it, or
  # the difference is not positive.
  def calculate_savings(best)
    return {} unless @current_config

    report = @comparison.reports[ModelComparison.candidate_label(@current_config)]
    return {} unless report

    delta = per_call_cost(report) - best[:cost_per_call]
    return {} unless delta.positive?

    { per_call: delta.round(6), monthly_at: { 10_000 => (delta * 10_000).round(2) } }
  end

  # Shared ordering key: cheaper, then faster, then label.
  def ranking_key(row)
    row.values_at(:cost_per_call, :latency, :label)
  end

  def cost_known?(scored_candidate)
    scored_candidate[:cost_per_call]&.positive?
  end
end
130
+ end
131
+ end
132
+ end
@@ -14,8 +14,8 @@ module RubyLLM
14
14
  HISTORY_DIR = ".eval_history"
15
15
  BASELINE_DIR = ".eval_baselines"
16
16
 
17
- def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :total_cost, :avg_latency_ms,
18
- :passed?
17
+ def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :pass_rate_ratio,
18
+ :total_cost, :avg_latency_ms, :passed?
19
19
  def_delegators :@presenter, :summary, :to_s, :print_summary
20
20
  def_delegators :@storage, :save_history!, :eval_history, :save_baseline!, :compare_with_baseline,
21
21
  :baseline_exists?
@@ -35,6 +35,12 @@ module RubyLLM
35
35
  "#{passed}/#{evaluated_results.length}"
36
36
  end
37
37
 
38
# Fraction of evaluated results that passed, as a Float.
# Returns 0.0 when there are no evaluated results.
def pass_rate_ratio
  total = evaluated_results.length
  total.zero? ? 0.0 : passed.to_f / total
end
43
+
38
44
  def total_cost
39
45
  @results.sum { |result| result.cost || 0.0 }
40
46
  end
@@ -13,25 +13,29 @@ module RubyLLM
13
13
  @stats = stats
14
14
  end
15
15
 
16
- def save_history!(path: nil, model: nil)
17
- file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model)
18
- EvalHistory.append(file, history_entry)
16
+ def save_history!(path: nil, model: nil, reasoning_effort: nil)
17
+ file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model, reasoning_effort: reasoning_effort)
18
+ entry = history_entry
19
+ entry[:model] = model if model
20
+ entry[:reasoning_effort] = reasoning_effort if reasoning_effort
21
+ EvalHistory.append(file, entry)
19
22
  file
20
23
  end
21
24
 
22
- def eval_history(path: nil, model: nil)
23
- EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model))
25
+ def eval_history(path: nil, model: nil, reasoning_effort: nil)
26
+ EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model,
27
+ reasoning_effort: reasoning_effort))
24
28
  end
25
29
 
26
- def save_baseline!(path: nil, model: nil)
27
- file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
30
+ def save_baseline!(path: nil, model: nil, reasoning_effort: nil)
31
+ file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
28
32
  FileUtils.mkdir_p(File.dirname(file))
29
33
  File.write(file, JSON.pretty_generate(serialize_for_baseline))
30
34
  file
31
35
  end
32
36
 
33
- def compare_with_baseline(path: nil, model: nil)
34
- file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
37
+ def compare_with_baseline(path: nil, model: nil, reasoning_effort: nil)
38
+ file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
35
39
  raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
36
40
 
37
41
  baseline_data = JSON.parse(File.read(file), symbolize_names: true)
@@ -43,8 +47,8 @@ module RubyLLM
43
47
  )
44
48
  end
45
49
 
46
- def baseline_exists?(path: nil, model: nil)
47
- File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model))
50
+ def baseline_exists?(path: nil, model: nil, reasoning_effort: nil)
51
+ File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort))
48
52
  end
49
53
 
50
54
  private
@@ -55,6 +59,7 @@ module RubyLLM
55
59
  score: @stats.score,
56
60
  total_cost: @stats.total_cost,
57
61
  pass_rate: @stats.pass_rate,
62
+ pass_rate_ratio: @stats.pass_rate_ratio,
58
63
  cases_count: @stats.evaluated_results_count
59
64
  }
60
65
  end
@@ -79,12 +84,13 @@ module RubyLLM
79
84
  }
80
85
  end
81
86
 
82
- def storage_path(root_dir, extension, model:)
87
+ def storage_path(root_dir, extension, model:, reasoning_effort: nil)
83
88
  parts = [root_dir]
84
89
  parts << sanitize_name(@report.step_name) if @report.step_name
85
90
 
86
91
  dataset_name = sanitize_name(@report.dataset_name)
87
92
  dataset_name = "#{dataset_name}_#{sanitize_name(model)}" if model
93
+ dataset_name = "#{dataset_name}_effort_#{sanitize_name(reasoning_effort)}" if reasoning_effort
88
94
 
89
95
  File.join(*parts, "#{dataset_name}.#{extension}")
90
96
  end
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
+ module RubyLLM
6
+ module Contract
7
+ module Eval
8
# Runs compare_models on ALL evals for a step, builds a score matrix,
# identifies the constraining eval, and suggests an escalation chain.
#
#   optimizer = RetryOptimizer.new(step: MyStep, candidates: [...], context: {})
#   result = optimizer.call
#   result.print_summary
#   result.to_dsl # => copy-paste retry_policy
#
# NOTE(review): "cheapest" below means "earliest label in report order";
# the code never looks at prices, so candidates are presumably expected
# to be supplied cheapest-first — confirm against callers.
class RetryOptimizer
  # Outcome of an optimization run.
  #
  # score_matrix is { eval_name => { candidate_label => score } };
  # chain is a list of config hashes, chain_details the matching
  # { label:, passes:, cost: } entries.
  Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
                      :constraining_eval, :chain, :chain_details, keyword_init: true) do
    # Writes the score table, the suggested chain and the DSL snippet to +io+.
    def print_summary(io = $stdout)
      io.puts "#{step_name} — retry chain optimization"
      io.puts
      print_table(io)
      io.puts
      print_chain(io)
      io.puts
      print_dsl(io)
    end

    # Renders the chain as a retry_policy snippet: the %w[] shorthand when
    # every entry only names a model, an escalate(...) block otherwise,
    # and a comment line when the chain is empty.
    def to_dsl
      return "# No viable chain — no candidate passes all evals" if chain.empty?

      if chain.all? { |c| c.keys == [:model] }
        models_str = chain.map { |c| c[:model] }.join(" ")
        "retry_policy models: %w[#{models_str}]"
      else
        args = chain.map { |c| config_to_ruby(c) }.join(",\n ")
        "retry_policy do\n escalate(\n #{args}\n )\nend"
      end
    end

    private

    # One row per eval, one column per candidate; missing scores print as 0.00.
    def print_table(io)
      short_labels = candidate_labels.map { |l| short_candidate_label(l) }
      col_width = [short_labels.map(&:length).max || 0, 8].max
      eval_width = [eval_names.map { |e| e.to_s.length }.max || 0, 12].max

      header = format(" %-#{eval_width}s", "eval") + short_labels.map { |l| format(" %#{col_width}s", l) }.join
      io.puts header
      io.puts " #{"-" * (eval_width + (col_width + 2) * short_labels.size)}"

      eval_names.each do |eval_name|
        row = format(" %-#{eval_width}s", eval_name.to_s)
        candidate_labels.each do |label|
          score = score_matrix.dig(eval_name, label) || 0.0
          # Flag imperfect scores on the constraining eval's row.
          marker = eval_name == constraining_eval && score < 1.0 ? " ←" : " "
          row += format(" %#{col_width - 2}.2f%s", score, marker)
        end
        io.puts row
      end

      io.puts
      io.puts " Constraining eval: #{constraining_eval}" if constraining_eval
    end

    def print_chain(io)
      if chain.empty?
        io.puts " No viable chain."
        return
      end

      io.puts " Suggested chain:"
      chain_details.each_with_index do |detail, i|
        suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
        io.puts " #{detail[:label]} — #{suffix}"
      end
    end

    # Shortens a label for table headers, e.g. "x (effort: low)" => "x@low".
    def short_candidate_label(label)
      label
        .sub("gpt-5-", "")
        .sub("gpt-4.1", "4.1")
        .sub(" (effort: ", "@")
        .sub(")", "")
    end

    def print_dsl(io)
      io.puts " DSL:"
      to_dsl.each_line { |line| io.puts " #{line}" }
    end

    def config_to_ruby(config)
      pairs = config.map { |k, v| "#{k}: #{v.inspect}" }.join(", ")
      "{ #{pairs} }"
    end
  end

  # step       - object responding to eval_names, compare_models and name
  # candidates - candidate configs, forwarded untouched to compare_models
  # context    - forwarded untouched to compare_models
  # min_score  - score a candidate needs for an eval to count as passed
  def initialize(step:, candidates:, context: {}, min_score: 0.95)
    @step = step
    @candidates = candidates
    @context = context
    @min_score = min_score
  end

  # Compares all candidates on every eval (with the step's retry policy
  # temporarily disabled) and returns a Result.
  def call
    evals = @step.eval_names
    return empty_result(evals) if evals.empty?

    score_matrix = {}
    evals.each do |eval_name|
      comparison = with_retry_disabled do
        @step.compare_models(eval_name, candidates: @candidates, context: @context)
      end
      score_matrix[eval_name] = extract_scores(comparison)
    end

    labels = score_matrix.values.flat_map(&:keys).uniq
    constraining = find_constraining_eval(score_matrix, labels)
    chain, details = build_chain(score_matrix, labels, evals)

    Result.new(
      step_name: @step.name || @step.to_s,
      eval_names: evals,
      candidate_labels: labels,
      score_matrix: score_matrix,
      constraining_eval: constraining,
      chain: chain,
      chain_details: details
    )
  end

  private

  def extract_scores(comparison)
    comparison.reports.transform_values(&:score)
  end

  # The eval whose first passing candidate sits furthest down the label
  # list; an eval nobody passes ranks beyond every label.
  def find_constraining_eval(matrix, labels)
    matrix.max_by do |_eval_name, scores|
      cheapest_passing = labels.find { |l| (scores[l] || 0) >= @min_score }
      cheapest_passing ? labels.index(cheapest_passing) : labels.size
    end&.first
  end

  # Retry escalates on validation_failed/parse_error, NOT on low eval
  # score. A model that returns :ok with semantically wrong output won't
  # trigger retry. Therefore the LAST model in the chain must pass ALL
  # evals — it's the safety net. Cheaper models are prepended as
  # first-try optimization (they handle easy inputs cheaply; when they
  # fail validation, retry escalates to the safe fallback).
  #
  # Known limitation: intermediate models are assumed safe if their eval
  # failures correspond to validation failures (retryable). If an
  # intermediate model returns :ok with semantically wrong output on
  # some eval, retry won't fire and the safe fallback won't run. This
  # requires step validates to cover the same semantics as eval verify
  # checks. A future version could inspect per-case step_status from
  # compare_models to verify failures are actually retryable.
  #
  # Returns [chain, details]; both are empty when no candidate passes
  # every eval.
  def build_chain(matrix, labels, evals)
    total = evals.size

    # Find cheapest model that passes every eval — the safe fallback.
    safe_fallback = labels.find { |l| evals.all? { |e| (matrix.dig(e, l) || 0) >= @min_score } }
    return [[], []] unless safe_fallback

    # Prepend cheaper models that pass a strict subset.
    chain = []
    details = []
    covered_evals = Set.new

    labels.each do |label|
      break if label == safe_fallback

      newly_covered = evals.select { |e| (matrix.dig(e, label) || 0) >= @min_score }
      new_additions = newly_covered.to_set - covered_evals
      # Skip models that add no eval beyond what earlier models cover.
      next if new_additions.empty?

      covered_evals.merge(new_additions)
      chain << parse_label_to_config(label)
      details << { label: label, passes: new_additions.size, cost: label }
    end

    # Always end with the safe fallback.
    chain << parse_label_to_config(safe_fallback)
    details << { label: safe_fallback, passes: total, cost: safe_fallback }

    [chain, details]
  end

  # "model (effort: low)" => { model: "model", reasoning_effort: "low" };
  # any other label       => { model: label }.
  def parse_label_to_config(label)
    if label.match?(/\(effort: (\w+)\)/)
      model = label.sub(/\s*\(effort:.*/, "").strip
      effort = label.match(/\(effort: (\w+)\)/)[1]
      { model: model, reasoning_effort: effort }
    else
      { model: label }
    end
  end

  # Runs the block while @step.retry_policy returns nil, then puts the
  # step back exactly as it was.
  #
  # The override is a singleton method on @step. Afterwards it is either
  # replaced by the singleton method that was there before, or removed so
  # the inherited/extended definition is visible again. Re-defining it as
  # a zero-arity snapshot instead would permanently shadow a DSL-style
  # retry_policy that also accepts arguments.
  def with_retry_disabled(&block)
    singleton = @step.singleton_class
    saved = own_singleton_method(singleton, :retry_policy)
    @step.define_singleton_method(:retry_policy) { nil }
    overridden = true
    block.call
  ensure
    restore_singleton_method(singleton, :retry_policy, saved) if overridden
  end

  # The UnboundMethod for +name+ when it is defined directly on
  # +singleton+; nil when it is inherited, mixed in, or absent.
  def own_singleton_method(singleton, name)
    found = singleton.instance_method(name)
    found.owner.equal?(singleton) ? found : nil
  rescue NameError
    nil
  end

  # Reinstalls +saved+ under +name+, or removes the temporary override
  # when there was nothing to save.
  def restore_singleton_method(singleton, name, saved)
    if saved
      singleton.send(:define_method, name, saved)
    else
      singleton.send(:remove_method, name)
    end
  end

  def empty_result(evals)
    Result.new(
      step_name: @step.name || @step.to_s,
      eval_names: evals,
      candidate_labels: [],
      score_matrix: {},
      constraining_eval: nil,
      chain: [],
      chain_details: []
    )
  end
end
219
+ end
220
+ end
221
+ end
@@ -29,3 +29,6 @@ require_relative "eval/prompt_diff_comparator"
29
29
  require_relative "eval/prompt_diff_presenter"
30
30
  require_relative "eval/prompt_diff"
31
31
  require_relative "eval/eval_history"
32
+ require_relative "eval/recommendation"
33
+ require_relative "eval/recommender"
34
+ require_relative "eval/retry_optimizer"
@@ -121,5 +121,88 @@ module RubyLLM
121
121
  defined?(::Rails) ? [:environment] : []
122
122
  end
123
123
  end
124
+
125
+ # Standalone task: runs all evals for one step across candidates,
126
+ # builds a score matrix, and suggests an optimal retry chain.
127
+ #
128
+ # Loaded automatically when `require "ruby_llm/contract/rake_task"`.
129
+ # Usage:
130
+ # rake ruby_llm_contract:optimize \
131
+ # STEP=MatchProblemsToPages \
132
+ # CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini
133
+ class OptimizeRakeTask < ::Rake::TaskLib
134
+ def initialize
135
+ super()
136
+ define_task
137
+ end
138
+
139
+ private
140
+
141
+ def define_task
142
+ desc "Run all evals for STEP with CANDIDATES and suggest an optimal retry chain"
143
+ task(:"ruby_llm_contract:optimize" => task_prerequisites) do
144
+ require "ruby_llm/contract"
145
+ eval_dirs = ENV["EVAL_DIRS"].to_s.split(",").map(&:strip).reject(&:empty?)
146
+ RubyLLM::Contract.load_evals!(*eval_dirs)
147
+
148
+ step_name = ENV["STEP"].to_s.strip
149
+ abort("STEP is required, e.g. STEP=MatchProblemsToPages") if step_name.empty?
150
+ raw_candidates = ENV["CANDIDATES"].to_s.strip
151
+ abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if raw_candidates.empty?
152
+ min_score = ENV.fetch("MIN_SCORE", "0.95").to_f
153
+
154
+ host = RubyLLM::Contract.eval_hosts.find { |h| h.name == step_name }
155
+ unless host
156
+ available = RubyLLM::Contract.eval_hosts.filter_map(&:name).sort
157
+ abort "Unknown STEP=#{step_name}. Available: #{available.join(", ")}"
158
+ end
159
+
160
+ candidates = parse_candidates(raw_candidates)
161
+ context = build_context
162
+
163
+ result = host.optimize_retry_policy(
164
+ candidates: candidates,
165
+ context: context,
166
+ min_score: min_score
167
+ )
168
+
169
+ result.print_summary
170
+ end
171
+ end
172
+
173
+ def parse_candidates(raw)
174
+ entries = if raw.start_with?("[")
175
+ Array(JSON.parse(raw))
176
+ else
177
+ raw.split(",").map(&:strip).reject(&:empty?).map do |entry|
178
+ model, effort = entry.split("@", 2)
179
+ config = { model: model.strip }
180
+ config[:reasoning_effort] = effort.strip if effort && !effort.empty?
181
+ config
182
+ end
183
+ end
184
+
185
+ entries.map { |e| RubyLLM::Contract.normalize_candidate_config(e) }.uniq
186
+ end
187
+
188
+ def build_context
189
+ ctx = {}
190
+ provider = ENV["PROVIDER"].to_s.strip
191
+ # Only inject real adapter when LIVE=1 or PROVIDER is set — otherwise
192
+ # evals use sample_response (offline mode, zero API calls).
193
+ if ENV["LIVE"] == "1" || !provider.empty?
194
+ ctx[:adapter] = RubyLLM::Contract::Adapters::RubyLLM.new
195
+ ctx[:provider] = provider.downcase.to_sym unless provider.empty?
196
+ end
197
+ ctx
198
+ end
199
+
200
+ def task_prerequisites
201
+ defined?(::Rails) ? [:environment] : []
202
+ end
203
+ end
204
+
205
+ # Auto-register the optimize task when this file is loaded
206
+ OptimizeRakeTask.new
124
207
  end
125
208
  end
@@ -49,6 +49,25 @@ module RubyLLM
49
49
  end
50
50
  end
51
51
 
52
# Compares +candidates+ on the given eval and returns the recommendation
# produced by Eval::Recommender, using the step's current model config
# as the reference configuration.
def recommend(eval_name, candidates:, min_score: 0.95, min_first_try_pass_rate: 0.8, context: {})
  recommender = Eval::Recommender.new(
    comparison: compare_models(eval_name, candidates: candidates, context: context),
    min_score: min_score,
    min_first_try_pass_rate: min_first_try_pass_rate,
    current_config: current_model_config
  )
  recommender.recommend
end
61
+
62
# Builds an Eval::RetryOptimizer for this step with the given
# candidates, context and score threshold, and returns the result of
# calling it.
def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
  optimizer = Eval::RetryOptimizer.new(
    step: self,
    candidates: candidates,
    context: context,
    min_score: min_score
  )
  optimizer.call
end
70
+
52
71
  KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists reasoning_effort].freeze
53
72
 
54
73
  include Concerns::ContextHelpers
@@ -144,6 +163,17 @@ module RubyLLM
144
163
  }
145
164
  end
146
165
 
166
# The config the step currently runs with, in order of precedence:
# the first entry of the retry policy's config list, then the step's own
# +model+ (when it responds to it), then the configured default model.
# Returns nil when none of these is available.
def current_model_config
  policy = retry_policy
  return policy.config_list.first if policy && policy.config_list.any?
  return { model: model } if respond_to?(:model) && model

  fallback = RubyLLM::Contract.configuration.default_model
  { model: fallback } if fallback
end
176
+
147
177
  def resolve_adapter(context)
148
178
  adapter = context[:adapter] || RubyLLM::Contract.configuration.default_adapter
149
179
  return adapter if adapter
@@ -10,12 +10,16 @@ module RubyLLM
10
10
 
11
11
  def run_with_retry(input, adapter:, default_model:, policy:, context_temperature: nil, extra_options: {})
12
12
  all_attempts = []
13
+ default_config = { model: default_model }.merge(extra_options.slice(:reasoning_effort).compact)
13
14
 
14
15
  policy.max_attempts.times do |attempt_index|
15
- model = policy.model_for_attempt(attempt_index, default_model)
16
+ config = policy.config_for_attempt(attempt_index, default_config)
17
+ model = config[:model]
18
+ attempt_extra = extra_options.merge(config.except(:model))
19
+
16
20
  result = run_once(input, adapter: adapter, model: model,
17
- context_temperature: context_temperature, extra_options: extra_options)
18
- all_attempts << { attempt: attempt_index + 1, model: model, result: result }
21
+ context_temperature: context_temperature, extra_options: attempt_extra)
22
+ all_attempts << { attempt: attempt_index + 1, model: model, config: config, result: result }
19
23
  break unless policy.retryable?(result)
20
24
  end
21
25
 
@@ -43,6 +47,8 @@ module RubyLLM
43
47
# Builds the log entry for one attempt: attempt number, model and result
# status, plus the attempt's config when it carries anything beyond the
# model. The entry is then handed to append_trace_fields together with
# the result's trace.
def build_attempt_entry(attempt)
  outcome = attempt[:result]
  trace = outcome.trace

  entry = {
    attempt: attempt[:attempt],
    model: attempt[:model],
    status: outcome.status
  }

  attempt_config = attempt[:config]
  # A config that only names the model adds nothing to the entry.
  entry[:config] = attempt_config if attempt_config && attempt_config.keys != [:model]

  append_trace_fields(entry, trace)
end
48
54