ruby_llm-contract 0.5.2 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +49 -2
- data/Gemfile.lock +2 -2
- data/README.md +173 -134
- data/lib/ruby_llm/contract/concerns/eval_host.rb +25 -6
- data/lib/ruby_llm/contract/eval/model_comparison.rb +33 -11
- data/lib/ruby_llm/contract/eval/recommendation.rb +48 -0
- data/lib/ruby_llm/contract/eval/recommender.rb +132 -0
- data/lib/ruby_llm/contract/eval/report.rb +2 -2
- data/lib/ruby_llm/contract/eval/report_stats.rb +6 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +18 -12
- data/lib/ruby_llm/contract/eval/retry_optimizer.rb +221 -0
- data/lib/ruby_llm/contract/eval.rb +3 -0
- data/lib/ruby_llm/contract/rake_task.rb +83 -0
- data/lib/ruby_llm/contract/step/base.rb +30 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +9 -3
- data/lib/ruby_llm/contract/step/retry_policy.rb +27 -14
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/lib/ruby_llm/contract.rb +21 -0
- metadata +4 -1
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Immutable value object describing the outcome of a model recommendation.
#
# Collection-like inputs are passed through +deep_dup_freeze+ (from
# Concerns::DeepFreeze) and the instance itself is frozen at the end of
# construction. +score+ and +cost_per_call+ are stored as given.
class Recommendation
  include Concerns::DeepFreeze

  attr_reader :best, :retry_chain, :score, :cost_per_call,
              :rationale, :current_config, :savings, :warnings

  def initialize(best:, retry_chain:, score:, cost_per_call:,
                 rationale:, current_config:, savings:, warnings:)
    # Plain values are kept as-is.
    @score = score
    @cost_per_call = cost_per_call

    # Everything else is defensively copied and frozen.
    @best = deep_dup_freeze(best)
    @retry_chain = deep_dup_freeze(retry_chain)
    @rationale = deep_dup_freeze(rationale)
    @current_config = deep_dup_freeze(current_config)
    @savings = deep_dup_freeze(savings)
    @warnings = deep_dup_freeze(warnings)

    freeze
  end

  # Renders the recommended chain as a snippet of step DSL.
  #
  # * empty chain                      -> an explanatory comment line
  # * one entry holding only :model    -> a `model "..."` line
  # * several entries, all only :model -> `retry_policy models: %w[...]`
  # * any entry with extra keys        -> a `retry_policy do ... end` block
  def to_dsl
    return "# No recommendation — no candidate met the minimum score" if retry_chain.empty?

    model_only = retry_chain.all? { |entry| entry.keys == [:model] }
    unless model_only
      rendered = retry_chain.map { |entry| config_to_ruby(entry) }
      return "retry_policy do\n escalate(#{rendered.join(",\n ")})\nend"
    end

    names = retry_chain.map { |entry| entry[:model] }
    return "model \"#{names.first}\"" if names.length == 1

    "retry_policy models: %w[#{names.join(" ")}]"
  end

  private

  # Formats one config hash as a Ruby hash literal, e.g. `{ model: "x" }`.
  def config_to_ruby(config)
    rendered_pairs = config.map { |key, value| "#{key}: #{value.inspect}" }
    "{ #{rendered_pairs.join(", ")} }"
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Turns a model comparison into a Recommendation.
#
# Each compared candidate is reduced to a "scored" hash
# (:label, :config, :score, :cost_per_call, :latency, :pass_rate_ratio,
# :total_cost). The cheapest candidate whose score reaches +min_score+
# becomes the recommendation; a cheaper candidate with a good first-try
# pass rate may be placed in front of it in the retry chain.
class Recommender
  # @param comparison [#configs, #reports] comparison whose +configs+ yields
  #   label/config pairs and whose +reports+ is indexed by label
  # @param min_score minimum report score a candidate needs to be recommended
  # @param min_first_try_pass_rate minimum pass-rate ratio for a candidate to
  #   be used as the first attempt of the retry chain
  # @param current_config config currently in use (used for savings), or nil
  def initialize(comparison:, min_score:, min_first_try_pass_rate: 0.8, current_config: nil)
    @comparison = comparison
    @min_score = min_score
    @min_first_try_pass_rate = min_first_try_pass_rate
    @current_config = current_config
  end

  # Builds the Recommendation. When no candidate qualifies, +best+ is nil,
  # the chain is empty, and score / cost_per_call fall back to 0.0.
  def recommend
    scored = build_scored_candidates
    best = select_best(scored)
    chain = build_retry_chain(scored, best)
    rationale = build_rationale(scored, best)
    warnings = build_warnings(scored)
    savings = best ? calculate_savings(best) : {}

    Recommendation.new(
      best: best&.dig(:config),
      retry_chain: chain,
      score: best&.dig(:score) || 0.0,
      cost_per_call: best&.dig(:cost_per_call) || 0.0,
      rationale: rationale,
      current_config: @current_config,
      savings: savings,
      warnings: warnings
    )
  end

  private

  # One scored hash per config that has a report; configs without a report
  # are dropped.
  def build_scored_candidates
    @comparison.configs.filter_map do |label, config|
      report = @comparison.reports[label]
      next nil unless report

      {
        label: label,
        config: config,
        score: report.score,
        cost_per_call: per_call_cost(report),
        # A missing latency sorts last in the cost/latency/label ordering.
        latency: report.avg_latency_ms || Float::INFINITY,
        pass_rate_ratio: report.pass_rate_ratio,
        total_cost: report.total_cost
      }
    end
  end

  # Total report cost divided by the number of non-skipped results.
  # The divisor is clamped to 1 so an all-skipped report does not divide by zero.
  def per_call_cost(report)
    evaluated_count = report.results.count { |r| r.step_status != :skipped }
    report.total_cost.to_f / [evaluated_count, 1].max
  end

  # Cheapest candidate (ties: lower latency, then label) that reaches
  # +min_score+ and has known pricing; nil when none qualifies.
  def select_best(scored)
    eligible = scored.select { |s| s[:score] >= @min_score && cost_known?(s) }
    eligible.min_by { |s| [s[:cost_per_call], s[:latency], s[:label]] }
  end

  # Returns the configs to try, in order. The last entry is always +best+.
  # A first-try candidate is put in front only when it is a different
  # candidate that does not cost more than +best+.
  def build_retry_chain(scored, best)
    return [] unless best

    first_try = scored
                .select { |s| s[:pass_rate_ratio] >= @min_first_try_pass_rate && cost_known?(s) }
                .min_by { |s| [s[:cost_per_call], s[:latency], s[:label]] }

    return [best[:config]] unless prepend_first_try?(first_try, best)

    [first_try[:config], best[:config]]
  end

  # A first-try model is only worth attempting before +best+ if it is not
  # more expensive. When +best+ itself misses the first-try pass-rate
  # threshold, the cheapest qualifying first-try candidate can cost more
  # than +best+; prepending it would escalate from expensive to cheap.
  def prepend_first_try?(first_try, best)
    return false if first_try.nil? || first_try[:label] == best[:label]

    first_try[:cost_per_call] <= best[:cost_per_call]
  end

  # One human-readable line per candidate, known-priced candidates first,
  # then ordered by cost, latency and label.
  def build_rationale(scored, best)
    sorted = scored.sort_by { |s| [cost_known?(s) ? 0 : 1, s[:cost_per_call], s[:latency], s[:label]] }
    sorted.map { |s| rationale_line(s, best) }
  end

  def rationale_line(candidate, best)
    cost_str = cost_known?(candidate) ? "$#{format("%.4f", candidate[:cost_per_call])}/call" : "unknown pricing"
    header = "#{candidate[:label]}, score #{format("%.2f", candidate[:score])}, at #{cost_str}"
    notes = rationale_notes(candidate, best)
    notes.any? ? "#{header} — #{notes.join(", ")}" : header
  end

  # Short annotations appended to a rationale line: threshold / pass-rate
  # remarks, the "recommended" marker, and an unknown-pricing flag.
  def rationale_notes(candidate, best)
    notes = []
    pass_pct = (candidate[:pass_rate_ratio] * 100).round
    below_threshold = candidate[:score] < @min_score

    if below_threshold && candidate[:pass_rate_ratio] >= @min_first_try_pass_rate
      notes << "below #{@min_score} threshold, but good first-try (#{pass_pct}% pass rate)"
    elsif below_threshold
      notes << "below #{@min_score} threshold"
    elsif candidate[:pass_rate_ratio] < 1.0
      notes << "#{pass_pct}% pass rate"
    end
    notes << "recommended" if best && candidate[:label] == best[:label]
    notes << "unknown pricing" unless cost_known?(candidate)
    notes
  end

  # One warning string per candidate whose pricing is unknown.
  def build_warnings(scored)
    scored.reject { |s| cost_known?(s) }
          .map { |s| "#{s[:label]}: unknown pricing — cost ranking may be inaccurate" }
  end

  # Per-call saving of +best+ over the current config, plus the projection
  # for 10_000 calls. Returns {} when there is no current config, no report
  # for it in the comparison, or no positive saving.
  def calculate_savings(best)
    return {} unless @current_config

    current_label = ModelComparison.candidate_label(@current_config)
    current_report = @comparison.reports[current_label]
    return {} unless current_report

    diff = per_call_cost(current_report) - best[:cost_per_call]
    return {} unless diff.positive?

    { per_call: diff.round(6), monthly_at: { 10_000 => (diff * 10_000).round(2) } }
  end

  # Pricing counts as known only when the per-call cost is positive;
  # a nil or zero cost is treated as unknown.
  def cost_known?(scored_candidate)
    scored_candidate[:cost_per_call]&.positive?
  end
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
end
|
|
@@ -14,8 +14,8 @@ module RubyLLM
|
|
|
14
14
|
HISTORY_DIR = ".eval_history"
|
|
15
15
|
BASELINE_DIR = ".eval_baselines"
|
|
16
16
|
|
|
17
|
-
def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :
|
|
18
|
-
:passed?
|
|
17
|
+
def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :pass_rate_ratio,
|
|
18
|
+
:total_cost, :avg_latency_ms, :passed?
|
|
19
19
|
def_delegators :@presenter, :summary, :to_s, :print_summary
|
|
20
20
|
def_delegators :@storage, :save_history!, :eval_history, :save_baseline!, :compare_with_baseline,
|
|
21
21
|
:baseline_exists?
|
|
@@ -35,6 +35,12 @@ module RubyLLM
|
|
|
35
35
|
"#{passed}/#{evaluated_results.length}"
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
+
# Fraction of evaluated results that passed, as a Float.
# Returns 0.0 when there are no evaluated results.
def pass_rate_ratio
  evaluated_total = evaluated_results.length
  evaluated_total.zero? ? 0.0 : passed.to_f / evaluated_total
end
|
|
43
|
+
|
|
38
44
|
def total_cost
|
|
39
45
|
@results.sum { |result| result.cost || 0.0 }
|
|
40
46
|
end
|
|
@@ -13,25 +13,29 @@ module RubyLLM
|
|
|
13
13
|
@stats = stats
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
-
def save_history!(path: nil, model: nil)
|
|
17
|
-
file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model)
|
|
18
|
-
|
|
16
|
+
def save_history!(path: nil, model: nil, reasoning_effort: nil)
|
|
17
|
+
file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model, reasoning_effort: reasoning_effort)
|
|
18
|
+
entry = history_entry
|
|
19
|
+
entry[:model] = model if model
|
|
20
|
+
entry[:reasoning_effort] = reasoning_effort if reasoning_effort
|
|
21
|
+
EvalHistory.append(file, entry)
|
|
19
22
|
file
|
|
20
23
|
end
|
|
21
24
|
|
|
22
|
-
def eval_history(path: nil, model: nil)
|
|
23
|
-
EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model
|
|
25
|
+
def eval_history(path: nil, model: nil, reasoning_effort: nil)
|
|
26
|
+
EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model,
|
|
27
|
+
reasoning_effort: reasoning_effort))
|
|
24
28
|
end
|
|
25
29
|
|
|
26
|
-
def save_baseline!(path: nil, model: nil)
|
|
27
|
-
file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
|
|
30
|
+
def save_baseline!(path: nil, model: nil, reasoning_effort: nil)
|
|
31
|
+
file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
|
|
28
32
|
FileUtils.mkdir_p(File.dirname(file))
|
|
29
33
|
File.write(file, JSON.pretty_generate(serialize_for_baseline))
|
|
30
34
|
file
|
|
31
35
|
end
|
|
32
36
|
|
|
33
|
-
def compare_with_baseline(path: nil, model: nil)
|
|
34
|
-
file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
|
|
37
|
+
def compare_with_baseline(path: nil, model: nil, reasoning_effort: nil)
|
|
38
|
+
file = path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort)
|
|
35
39
|
raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
|
|
36
40
|
|
|
37
41
|
baseline_data = JSON.parse(File.read(file), symbolize_names: true)
|
|
@@ -43,8 +47,8 @@ module RubyLLM
|
|
|
43
47
|
)
|
|
44
48
|
end
|
|
45
49
|
|
|
46
|
-
def baseline_exists?(path: nil, model: nil)
|
|
47
|
-
File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model))
|
|
50
|
+
def baseline_exists?(path: nil, model: nil, reasoning_effort: nil)
|
|
51
|
+
File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model, reasoning_effort: reasoning_effort))
|
|
48
52
|
end
|
|
49
53
|
|
|
50
54
|
private
|
|
@@ -55,6 +59,7 @@ module RubyLLM
|
|
|
55
59
|
score: @stats.score,
|
|
56
60
|
total_cost: @stats.total_cost,
|
|
57
61
|
pass_rate: @stats.pass_rate,
|
|
62
|
+
pass_rate_ratio: @stats.pass_rate_ratio,
|
|
58
63
|
cases_count: @stats.evaluated_results_count
|
|
59
64
|
}
|
|
60
65
|
end
|
|
@@ -79,12 +84,13 @@ module RubyLLM
|
|
|
79
84
|
}
|
|
80
85
|
end
|
|
81
86
|
|
|
82
|
-
def storage_path(root_dir, extension, model:)
|
|
87
|
+
def storage_path(root_dir, extension, model:, reasoning_effort: nil)
|
|
83
88
|
parts = [root_dir]
|
|
84
89
|
parts << sanitize_name(@report.step_name) if @report.step_name
|
|
85
90
|
|
|
86
91
|
dataset_name = sanitize_name(@report.dataset_name)
|
|
87
92
|
dataset_name = "#{dataset_name}_#{sanitize_name(model)}" if model
|
|
93
|
+
dataset_name = "#{dataset_name}_effort_#{sanitize_name(reasoning_effort)}" if reasoning_effort
|
|
88
94
|
|
|
89
95
|
File.join(*parts, "#{dataset_name}.#{extension}")
|
|
90
96
|
end
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
5
|
+
module RubyLLM
|
|
6
|
+
module Contract
|
|
7
|
+
module Eval
|
|
8
|
+
# Runs compare_models on ALL evals for a step, builds a score matrix,
|
|
9
|
+
# identifies the constraining eval, and suggests an escalation chain.
|
|
10
|
+
#
|
|
11
|
+
# optimizer = RetryOptimizer.new(step: MyStep, candidates: [...], context: {})
|
|
12
|
+
# result = optimizer.call
|
|
13
|
+
# result.print_summary
|
|
14
|
+
# result.to_dsl # => copy-paste retry_policy
|
|
15
|
+
class RetryOptimizer
  # Outcome of an optimization run: the eval-by-candidate score matrix, the
  # constraining eval, and the suggested chain with per-entry details.
  Result = Struct.new(:step_name, :eval_names, :candidate_labels, :score_matrix,
                      :constraining_eval, :chain, :chain_details, keyword_init: true) do
    # Writes the score table, the suggested chain and the DSL snippet to +io+.
    def print_summary(io = $stdout)
      io.puts "#{step_name} — retry chain optimization"
      io.puts
      print_table(io)
      io.puts
      print_chain(io)
      io.puts
      print_dsl(io)
    end

    # Renders the chain as step DSL: a `retry_policy models:` line when every
    # entry holds only :model, otherwise a `retry_policy do ... end` block.
    # An empty chain yields an explanatory comment line.
    def to_dsl
      return "# No viable chain — no candidate passes all evals" if chain.empty?

      if chain.all? { |c| c.keys == [:model] }
        models_str = chain.map { |c| c[:model] }.join(" ")
        "retry_policy models: %w[#{models_str}]"
      else
        args = chain.map { |c| config_to_ruby(c) }.join(",\n ")
        "retry_policy do\n escalate(\n #{args}\n )\nend"
      end
    end

    private

    # Prints one row per eval and one column per candidate. Scores missing
    # from the matrix are shown as 0.0; sub-1.0 scores on the constraining
    # eval's row are marked with an arrow.
    def print_table(io)
      short_labels = candidate_labels.map { |l| short_candidate_label(l) }
      col_width = [short_labels.map(&:length).max || 0, 8].max
      eval_width = [eval_names.map { |e| e.to_s.length }.max || 0, 12].max

      header = format(" %-#{eval_width}s", "eval") + short_labels.map { |l| format(" %#{col_width}s", l) }.join
      io.puts header
      io.puts " #{"-" * (eval_width + (col_width + 2) * short_labels.size)}"

      eval_names.each do |eval_name|
        row = format(" %-#{eval_width}s", eval_name.to_s)
        candidate_labels.each do |label|
          score = score_matrix.dig(eval_name, label) || 0.0
          marker = eval_name == constraining_eval && score < 1.0 ? " ←" : " "
          row += format(" %#{col_width - 2}.2f%s", score, marker)
        end
        io.puts row
      end

      io.puts
      io.puts " Constraining eval: #{constraining_eval}" if constraining_eval
    end

    # Prints the chain entries; the last entry is described as passing all evals.
    def print_chain(io)
      if chain.empty?
        io.puts " No viable chain."
        return
      end

      io.puts " Suggested chain:"
      chain_details.each_with_index do |detail, i|
        suffix = i == chain_details.size - 1 ? "passes all #{eval_names.size} evals" : "covers #{detail[:passes]} eval(s)"
        io.puts " #{detail[:label]} — #{suffix}"
      end
    end

    # Shortens a candidate label for use as a table column header.
    def short_candidate_label(label)
      label
        .sub("gpt-5-", "")
        .sub("gpt-4.1", "4.1")
        .sub(" (effort: ", "@")
        .sub(")", "")
    end

    def print_dsl(io)
      io.puts " DSL:"
      to_dsl.each_line { |line| io.puts " #{line}" }
    end

    # Formats one config hash as a Ruby hash literal.
    def config_to_ruby(config)
      pairs = config.map { |k, v| "#{k}: #{v.inspect}" }.join(", ")
      "{ #{pairs} }"
    end
  end

  # @param step object exposing +eval_names+, +compare_models+ and +name+
  # @param candidates candidate configs, forwarded to +compare_models+
  # @param context context hash, forwarded to +compare_models+
  # @param min_score score a candidate must reach for an eval to count as passed
  def initialize(step:, candidates:, context: {}, min_score: 0.95)
    @step = step
    @candidates = candidates
    @context = context
    @min_score = min_score
  end

  # Runs compare_models for every eval of the step (with the step's retry
  # policy stubbed out) and returns a Result. Returns an empty Result when
  # the step has no evals.
  def call
    evals = @step.eval_names
    return empty_result(evals) if evals.empty?

    score_matrix = {}
    evals.each do |eval_name|
      comparison = with_retry_disabled do
        @step.compare_models(eval_name, candidates: @candidates, context: @context)
      end
      score_matrix[eval_name] = extract_scores(comparison)
    end

    labels = score_matrix.values.flat_map(&:keys).uniq
    constraining = find_constraining_eval(score_matrix, labels)
    chain, details = build_chain(score_matrix, labels, evals)

    Result.new(
      step_name: @step.name || @step.to_s,
      eval_names: evals,
      candidate_labels: labels,
      score_matrix: score_matrix,
      constraining_eval: constraining,
      chain: chain,
      chain_details: details
    )
  end

  private

  # Maps each candidate label of a comparison to its report score.
  def extract_scores(comparison)
    comparison.reports.transform_values(&:score)
  end

  # The eval whose first passing candidate sits furthest down +labels+
  # (an eval no candidate passes ranks highest). Returns the eval name,
  # or nil for an empty matrix.
  def find_constraining_eval(matrix, labels)
    matrix.max_by do |_eval_name, scores|
      cheapest_passing = labels.find { |l| (scores[l] || 0) >= @min_score }
      cheapest_passing ? labels.index(cheapest_passing) : labels.size
    end&.first
  end

  # Retry escalates on validation_failed/parse_error, NOT on low eval
  # score. A model that returns :ok with semantically wrong output won't
  # trigger retry. Therefore the LAST model in the chain must pass ALL
  # evals — it's the safety net. Cheaper models are prepended as
  # first-try optimization (they handle easy inputs cheaply; when they
  # fail validation, retry escalates to the safe fallback).
  #
  # Known limitation: intermediate models are assumed safe if their eval
  # failures correspond to validation failures (retryable). If an
  # intermediate model returns :ok with semantically wrong output on
  # some eval, retry won't fire and the safe fallback won't run. This
  # requires step validates to cover the same semantics as eval verify
  # checks. A future version could inspect per-case step_status from
  # compare_models to verify failures are actually retryable.
  #
  # Returns [chain, details]; both are empty when no candidate passes
  # every eval.
  def build_chain(matrix, labels, evals)
    total = evals.size

    # First label (in candidate order) that passes every eval — the safe fallback.
    safe_fallback = labels.find { |l| evals.all? { |e| (matrix.dig(e, l) || 0) >= @min_score } }
    return [[], []] unless safe_fallback

    # Prepend earlier candidates that cover at least one not-yet-covered eval.
    chain = []
    details = []
    covered_evals = Set.new

    labels.each do |label|
      break if label == safe_fallback

      newly_covered = evals.select { |e| (matrix.dig(e, label) || 0) >= @min_score }
      new_additions = newly_covered.to_set - covered_evals
      next if new_additions.empty?

      covered_evals.merge(new_additions)
      chain << parse_label_to_config(label)
      details << { label: label, passes: new_additions.size, cost: label }
    end

    # Always end with the safe fallback.
    chain << parse_label_to_config(safe_fallback)
    details << { label: safe_fallback, passes: total, cost: safe_fallback }

    [chain, details]
  end

  # Converts a label such as "model (effort: low)" back into a config hash.
  # Labels without an effort suffix become { model: label }.
  def parse_label_to_config(label)
    effort_match = label.match(/\(effort: (\w+)\)/)
    return { model: label } unless effort_match

    model = label.sub(/\s*\(effort:.*/, "").strip
    { model: model, reasoning_effort: effort_match[1] }
  end

  # Runs the block while the step's +retry_policy+ returns nil, then puts
  # the step back exactly as it was.
  #
  # The stub is a singleton method. Afterwards it is either replaced by the
  # singleton method that was there before, or removed so that lookup falls
  # through to the step's regular +retry_policy+ again. Leaving a zero-arity
  # stub behind would shadow the real method for the rest of the process.
  def with_retry_disabled(&block)
    singleton = @step.singleton_class
    # Only a method defined directly on the singleton class needs saving;
    # inherited / extended definitions reappear once the stub is removed.
    previous = singleton.instance_method(:retry_policy) if singleton.method_defined?(:retry_policy, false)
    stubbed = false
    @step.define_singleton_method(:retry_policy) { nil }
    stubbed = true
    block.call
  ensure
    if stubbed
      if previous
        singleton.send(:define_method, :retry_policy, previous)
      else
        singleton.send(:remove_method, :retry_policy)
      end
    end
  end

  # Result used when the step defines no evals.
  def empty_result(evals)
    Result.new(
      step_name: @step.name || @step.to_s,
      eval_names: evals,
      candidate_labels: [],
      score_matrix: {},
      constraining_eval: nil,
      chain: [],
      chain_details: []
    )
  end
end
|
|
219
|
+
end
|
|
220
|
+
end
|
|
221
|
+
end
|
|
@@ -29,3 +29,6 @@ require_relative "eval/prompt_diff_comparator"
|
|
|
29
29
|
require_relative "eval/prompt_diff_presenter"
|
|
30
30
|
require_relative "eval/prompt_diff"
|
|
31
31
|
require_relative "eval/eval_history"
|
|
32
|
+
require_relative "eval/recommendation"
|
|
33
|
+
require_relative "eval/recommender"
|
|
34
|
+
require_relative "eval/retry_optimizer"
|
|
@@ -121,5 +121,88 @@ module RubyLLM
|
|
|
121
121
|
defined?(::Rails) ? [:environment] : []
|
|
122
122
|
end
|
|
123
123
|
end
|
|
124
|
+
|
|
125
|
+
# Standalone task: runs all evals for one step across candidates,
|
|
126
|
+
# builds a score matrix, and suggests an optimal retry chain.
|
|
127
|
+
#
|
|
128
|
+
# Loaded automatically when `require "ruby_llm/contract/rake_task"`.
|
|
129
|
+
# Usage:
|
|
130
|
+
# rake ruby_llm_contract:optimize \
|
|
131
|
+
# STEP=MatchProblemsToPages \
|
|
132
|
+
# CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini
|
|
133
|
+
# Rake task library that registers `ruby_llm_contract:optimize`.
#
# The task reads its inputs from the environment:
#   STEP        (required) name of the eval host to optimize
#   CANDIDATES  (required) comma-separated `model` / `model@effort` entries,
#               or a JSON array when the value starts with "["
#   EVAL_DIRS   optional comma-separated list passed to load_evals!
#   MIN_SCORE   optional, defaults to "0.95"
#   LIVE / PROVIDER  control whether a real adapter is injected
class OptimizeRakeTask < ::Rake::TaskLib
  def initialize
    super()
    define_task
  end

  private

  # Declares the rake task and its body.
  def define_task
    desc "Run all evals for STEP with CANDIDATES and suggest an optimal retry chain"
    task(:"ruby_llm_contract:optimize" => task_prerequisites) do
      require "ruby_llm/contract"
      dirs = ENV["EVAL_DIRS"].to_s.split(",").map(&:strip).reject(&:empty?)
      RubyLLM::Contract.load_evals!(*dirs)

      requested_step = ENV["STEP"].to_s.strip
      abort("STEP is required, e.g. STEP=MatchProblemsToPages") if requested_step.empty?
      candidate_spec = ENV["CANDIDATES"].to_s.strip
      abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if candidate_spec.empty?
      threshold = ENV.fetch("MIN_SCORE", "0.95").to_f

      host = RubyLLM::Contract.eval_hosts.find { |candidate_host| candidate_host.name == requested_step }
      if host.nil?
        known_names = RubyLLM::Contract.eval_hosts.filter_map(&:name).sort
        abort "Unknown STEP=#{requested_step}. Available: #{known_names.join(", ")}"
      end

      parsed_candidates = parse_candidates(candidate_spec)
      run_context = build_context

      summary = host.optimize_retry_policy(
        candidates: parsed_candidates,
        context: run_context,
        min_score: threshold
      )

      summary.print_summary
    end
  end

  # Parses the CANDIDATES value into normalized, de-duplicated configs.
  # A value starting with "[" is parsed as JSON; otherwise each
  # comma-separated entry is `model` or `model@effort`.
  def parse_candidates(raw)
    raw_entries =
      if raw.start_with?("[")
        Array(JSON.parse(raw))
      else
        raw.split(",").filter_map do |piece|
          token = piece.strip
          next if token.empty?

          name, effort = token.split("@", 2)
          if effort.nil? || effort.empty?
            { model: name.strip }
          else
            { model: name.strip, reasoning_effort: effort.strip }
          end
        end
      end

    raw_entries.map { |entry| RubyLLM::Contract.normalize_candidate_config(entry) }.uniq
  end

  # Only inject real adapter when LIVE=1 or PROVIDER is set — otherwise
  # evals use sample_response (offline mode, zero API calls).
  def build_context
    provider = ENV["PROVIDER"].to_s.strip
    return {} unless ENV["LIVE"] == "1" || !provider.empty?

    live_context = { adapter: RubyLLM::Contract::Adapters::RubyLLM.new }
    live_context[:provider] = provider.downcase.to_sym unless provider.empty?
    live_context
  end

  # Depend on :environment when running inside a Rails app.
  def task_prerequisites
    if defined?(::Rails)
      [:environment]
    else
      []
    end
  end
end
|
|
204
|
+
|
|
205
|
+
# Auto-register the optimize task when this file is loaded
|
|
206
|
+
OptimizeRakeTask.new
|
|
124
207
|
end
|
|
125
208
|
end
|
|
@@ -49,6 +49,25 @@ module RubyLLM
|
|
|
49
49
|
end
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
# Compares +candidates+ on +eval_name+ and returns the recommendation
# produced by Eval::Recommender, using this step's current model config
# as the baseline.
def recommend(eval_name, candidates:, min_score: 0.95, min_first_try_pass_rate: 0.8, context: {})
  recommender = Eval::Recommender.new(
    comparison: compare_models(eval_name, candidates: candidates, context: context),
    min_score: min_score,
    min_first_try_pass_rate: min_first_try_pass_rate,
    current_config: current_model_config
  )
  recommender.recommend
end
|
|
61
|
+
|
|
62
|
+
# Runs Eval::RetryOptimizer for this step over +candidates+ and returns
# its result.
def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
  optimizer = Eval::RetryOptimizer.new(step: self, candidates: candidates,
                                       context: context, min_score: min_score)
  optimizer.call
end
|
|
70
|
+
|
|
52
71
|
KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens provider assume_model_exists reasoning_effort].freeze
|
|
53
72
|
|
|
54
73
|
include Concerns::ContextHelpers
|
|
@@ -144,6 +163,17 @@ module RubyLLM
|
|
|
144
163
|
}
|
|
145
164
|
end
|
|
146
165
|
|
|
166
|
+
# Config currently in effect for this step, in order of precedence:
# the first entry of the retry policy's config list, the step's own model,
# then the globally configured default model. Returns nil when none is set.
def current_model_config
  policy = retry_policy
  return policy.config_list.first if policy && policy.config_list.any?
  return { model: model } if respond_to?(:model) && model

  fallback_model = RubyLLM::Contract.configuration.default_model
  { model: fallback_model } if fallback_model
end
|
|
176
|
+
|
|
147
177
|
def resolve_adapter(context)
|
|
148
178
|
adapter = context[:adapter] || RubyLLM::Contract.configuration.default_adapter
|
|
149
179
|
return adapter if adapter
|
|
@@ -10,12 +10,16 @@ module RubyLLM
|
|
|
10
10
|
|
|
11
11
|
def run_with_retry(input, adapter:, default_model:, policy:, context_temperature: nil, extra_options: {})
|
|
12
12
|
all_attempts = []
|
|
13
|
+
default_config = { model: default_model }.merge(extra_options.slice(:reasoning_effort).compact)
|
|
13
14
|
|
|
14
15
|
policy.max_attempts.times do |attempt_index|
|
|
15
|
-
|
|
16
|
+
config = policy.config_for_attempt(attempt_index, default_config)
|
|
17
|
+
model = config[:model]
|
|
18
|
+
attempt_extra = extra_options.merge(config.except(:model))
|
|
19
|
+
|
|
16
20
|
result = run_once(input, adapter: adapter, model: model,
|
|
17
|
-
context_temperature: context_temperature, extra_options:
|
|
18
|
-
all_attempts << { attempt: attempt_index + 1, model: model, result: result }
|
|
21
|
+
context_temperature: context_temperature, extra_options: attempt_extra)
|
|
22
|
+
all_attempts << { attempt: attempt_index + 1, model: model, config: config, result: result }
|
|
19
23
|
break unless policy.retryable?(result)
|
|
20
24
|
end
|
|
21
25
|
|
|
@@ -43,6 +47,8 @@ module RubyLLM
|
|
|
43
47
|
def build_attempt_entry(attempt)
|
|
44
48
|
trace = attempt[:result].trace
|
|
45
49
|
entry = { attempt: attempt[:attempt], model: attempt[:model], status: attempt[:result].status }
|
|
50
|
+
config = attempt[:config]
|
|
51
|
+
entry[:config] = config if config && config.keys != [:model]
|
|
46
52
|
append_trace_fields(entry, trace)
|
|
47
53
|
end
|
|
48
54
|
|