ruby_llm-contract 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +2 -2
- data/README.md +2 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +12 -2
- data/lib/ruby_llm/contract/eval/aggregated_report.rb +92 -0
- data/lib/ruby_llm/contract/eval/retry_optimizer.rb +3 -2
- data/lib/ruby_llm/contract/eval.rb +1 -0
- data/lib/ruby_llm/contract/rake_task.rb +11 -1
- data/lib/ruby_llm/contract/step/base.rb +3 -2
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 75884386ae53ddf1985760afa2c94f64ce15274b0cb0829ea40e6711506e60cf
|
|
4
|
+
data.tar.gz: 203229c4ce8ab0b1ab9e6209871668b0713b114e412eba03e79e9964dc9a43bb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0c586ce70e71d8e77c262e0ae21bab2e29ef6ccdb16df4a0e56271aeb98efba59027618db24d4b59470fd3a9d340051201584038f8c67143f7ca1bec0e5e365b
|
|
7
|
+
data.tar.gz: 6a988c62f6b36a4da860b736c5523abcc568ed1384c1f20cd8ff9659124c12eeeddb36100d5b8542af07b363cef7ade8a1aca01250929ec2c23172d4f3faec5f
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.6.3 (2026-04-20)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
- **`runs:` parameter on `compare_models` and `optimize_retry_policy`** — runs each candidate N times per eval and aggregates the mean score, mean cost per run, and mean latency. Reduces sampling variance in live mode where LLM outputs are non-deterministic (gpt-5 family enforces `temperature=1.0` server-side, so a single unlucky sample can misclassify a viable candidate as "failing"). Default `runs: 1` — backward compatible.
|
|
8
|
+
- **`RUNS=N` on `rake ruby_llm_contract:optimize`** — CLI flag for variance-aware optimization.
|
|
9
|
+
- **`Eval::AggregatedReport`** — duck-type `Report` exposing `score` (mean), `score_min`/`score_max` (spread), `total_cost` (mean per run), `pass_rate` (clean-pass count x/N), and `clean_passes`.
|
|
10
|
+
- **Guide: [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs)** — when to use it and why.
|
|
11
|
+
|
|
3
12
|
## 0.6.2 (2026-04-18)
|
|
4
13
|
|
|
5
14
|
### Features
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ruby_llm-contract (0.6.2)
|
|
4
|
+
ruby_llm-contract (0.6.3)
|
|
5
5
|
dry-types (~> 1.7)
|
|
6
6
|
ruby_llm (~> 1.0)
|
|
7
7
|
ruby_llm-schema (~> 0.3)
|
|
@@ -258,7 +258,7 @@ CHECKSUMS
|
|
|
258
258
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
259
259
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
260
260
|
ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
|
|
261
|
-
ruby_llm-contract (0.6.2)
|
|
261
|
+
ruby_llm-contract (0.6.3)
|
|
262
262
|
ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
|
|
263
263
|
ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
|
|
264
264
|
rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8
|
data/README.md
CHANGED
|
@@ -158,6 +158,8 @@ Cheapest at 100%: gpt-4.1-mini
|
|
|
158
158
|
|
|
159
159
|
Nano fails on edge cases. Mini and full both score 100% — but mini is **5x cheaper**. Now you know.
|
|
160
160
|
|
|
161
|
+
Running live against gpt-5 / o-series? Pass `runs: 3` to average out sampling variance (OpenAI forces `temperature=1.0` server-side, so one unlucky run can misclassify a viable candidate). See [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs).
|
|
162
|
+
|
|
161
163
|
## Let the gem tell you what to do
|
|
162
164
|
|
|
163
165
|
Don't read tables — get a recommendation. Supports `model + reasoning_effort` combinations:
|
data/lib/ruby_llm/contract/concerns/eval_host.rb
CHANGED
|
@@ -70,9 +70,11 @@ module RubyLLM
|
|
|
70
70
|
Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
-
def compare_models(eval_name, models: [], candidates: [], context: {})
|
|
73
|
+
def compare_models(eval_name, models: [], candidates: [], context: {}, runs: 1)
|
|
74
74
|
raise ArgumentError, "Pass either models: or candidates:, not both" if models.any? && candidates.any?
|
|
75
75
|
|
|
76
|
+
runs = coerce_runs(runs)
|
|
77
|
+
|
|
76
78
|
context = safe_context(context)
|
|
77
79
|
candidate_configs = normalize_candidates(models, candidates)
|
|
78
80
|
|
|
@@ -82,7 +84,8 @@ module RubyLLM
|
|
|
82
84
|
label = Eval::ModelComparison.candidate_label(config)
|
|
83
85
|
model_context = isolate_context(context).merge(model: config[:model])
|
|
84
86
|
model_context[:reasoning_effort] = config[:reasoning_effort] if config[:reasoning_effort]
|
|
85
|
-
|
|
87
|
+
per_run = Array.new(runs) { run_single_eval(eval_name, model_context) }
|
|
88
|
+
reports[label] = runs == 1 ? per_run.first : Eval::AggregatedReport.new(per_run)
|
|
86
89
|
configs[label] = config
|
|
87
90
|
end
|
|
88
91
|
|
|
@@ -91,6 +94,13 @@ module RubyLLM
|
|
|
91
94
|
|
|
92
95
|
private
|
|
93
96
|
|
|
97
|
+
def coerce_runs(runs)
|
|
98
|
+
raise ArgumentError, "runs must be an Integer >= 1, got #{runs.inspect}" unless runs.is_a?(Integer)
|
|
99
|
+
raise ArgumentError, "runs must be >= 1, got #{runs.inspect}" if runs < 1
|
|
100
|
+
|
|
101
|
+
runs
|
|
102
|
+
end
|
|
103
|
+
|
|
94
104
|
def normalize_candidates(models, candidates)
|
|
95
105
|
if candidates.any?
|
|
96
106
|
candidates.map { |c| RubyLLM::Contract.normalize_candidate_config(c) }.uniq
|
data/lib/ruby_llm/contract/eval/aggregated_report.rb
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Wraps N Reports from repeated runs of the same eval to reduce sampling
|
|
7
|
+
# variance in live mode (temperature=1 on gpt-5 family). Exposes the same
|
|
8
|
+
# duck-type as Report — mean score, mean cost per run, mean latency.
|
|
9
|
+
#
|
|
10
|
+
# pass_rate reports how many runs passed cleanly (x/N), not case-level
|
|
11
|
+
# pass rate, since the question is "does this candidate reliably pass?".
|
|
12
|
+
class AggregatedReport
|
|
13
|
+
attr_reader :runs, :results
|
|
14
|
+
|
|
15
|
+
def initialize(runs)
|
|
16
|
+
raise ArgumentError, "runs must not be empty" if runs.empty?
|
|
17
|
+
|
|
18
|
+
@runs = runs.freeze
|
|
19
|
+
@results = runs.flat_map(&:results).freeze
|
|
20
|
+
freeze
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def dataset_name
|
|
24
|
+
@runs.first.dataset_name
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def step_name
|
|
28
|
+
@runs.first.step_name
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def score
|
|
32
|
+
@runs.sum(&:score) / @runs.length.to_f
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def score_min
|
|
36
|
+
@runs.map(&:score).min
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def score_max
|
|
40
|
+
@runs.map(&:score).max
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def total_cost
|
|
44
|
+
@runs.sum(&:total_cost) / @runs.length.to_f
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def avg_latency_ms
|
|
48
|
+
latencies = @runs.filter_map(&:avg_latency_ms)
|
|
49
|
+
return nil if latencies.empty?
|
|
50
|
+
|
|
51
|
+
latencies.sum / latencies.length.to_f
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def pass_rate
|
|
55
|
+
"#{clean_passes}/#{@runs.length}"
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def pass_rate_ratio
|
|
59
|
+
clean_passes.to_f / @runs.length
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def each(&block)
|
|
63
|
+
@results.each(&block)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def summary
|
|
67
|
+
@runs.first.summary
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def to_s
|
|
71
|
+
@runs.first.to_s
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def print_summary(io = $stdout)
|
|
75
|
+
@runs.first.print_summary(io)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def passed?
|
|
79
|
+
@runs.all?(&:passed?)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def clean_passes
|
|
83
|
+
@runs.count(&:passed?)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def failures
|
|
87
|
+
@runs.flat_map(&:failures)
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
data/lib/ruby_llm/contract/eval/retry_optimizer.rb
CHANGED
|
@@ -94,11 +94,12 @@ module RubyLLM
|
|
|
94
94
|
end
|
|
95
95
|
end
|
|
96
96
|
|
|
97
|
-
def initialize(step:, candidates:, context: {}, min_score: 0.95)
|
|
97
|
+
def initialize(step:, candidates:, context: {}, min_score: 0.95, runs: 1)
|
|
98
98
|
@step = step
|
|
99
99
|
@candidates = candidates
|
|
100
100
|
@context = context
|
|
101
101
|
@min_score = min_score
|
|
102
|
+
@runs = runs
|
|
102
103
|
end
|
|
103
104
|
|
|
104
105
|
def call
|
|
@@ -108,7 +109,7 @@ module RubyLLM
|
|
|
108
109
|
score_matrix = {}
|
|
109
110
|
evals.each do |eval_name|
|
|
110
111
|
comparison = with_retry_disabled do
|
|
111
|
-
@step.compare_models(eval_name, candidates: @candidates, context: @context)
|
|
112
|
+
@step.compare_models(eval_name, candidates: @candidates, context: @context, runs: @runs)
|
|
112
113
|
end
|
|
113
114
|
score_matrix[eval_name] = extract_scores(comparison)
|
|
114
115
|
end
|
data/lib/ruby_llm/contract/eval.rb
CHANGED
|
@@ -21,6 +21,7 @@ require_relative "eval/report_stats"
|
|
|
21
21
|
require_relative "eval/report_presenter"
|
|
22
22
|
require_relative "eval/report_storage"
|
|
23
23
|
require_relative "eval/report"
|
|
24
|
+
require_relative "eval/aggregated_report"
|
|
24
25
|
require_relative "eval/eval_definition"
|
|
25
26
|
require_relative "eval/model_comparison"
|
|
26
27
|
require_relative "eval/baseline_diff"
|
data/lib/ruby_llm/contract/rake_task.rb
CHANGED
|
@@ -150,6 +150,7 @@ module RubyLLM
|
|
|
150
150
|
raw_candidates = ENV["CANDIDATES"].to_s.strip
|
|
151
151
|
abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if raw_candidates.empty?
|
|
152
152
|
min_score = ENV.fetch("MIN_SCORE", "0.95").to_f
|
|
153
|
+
runs = parse_runs(ENV.fetch("RUNS", "1"))
|
|
153
154
|
|
|
154
155
|
host = RubyLLM::Contract.eval_hosts.find { |h| h.name == step_name }
|
|
155
156
|
unless host
|
|
@@ -163,13 +164,22 @@ module RubyLLM
|
|
|
163
164
|
result = host.optimize_retry_policy(
|
|
164
165
|
candidates: candidates,
|
|
165
166
|
context: context,
|
|
166
|
-
min_score: min_score
|
|
167
|
+
min_score: min_score,
|
|
168
|
+
runs: runs
|
|
167
169
|
)
|
|
168
170
|
|
|
169
171
|
result.print_summary
|
|
170
172
|
end
|
|
171
173
|
end
|
|
172
174
|
|
|
175
|
+
def parse_runs(raw)
|
|
176
|
+
runs = Integer(raw.to_s.strip, 10)
|
|
177
|
+
abort("RUNS must be an integer >= 1, e.g. RUNS=1") if runs < 1
|
|
178
|
+
runs
|
|
179
|
+
rescue ArgumentError
|
|
180
|
+
abort("RUNS must be an integer >= 1, e.g. RUNS=1")
|
|
181
|
+
end
|
|
182
|
+
|
|
173
183
|
def parse_candidates(raw)
|
|
174
184
|
entries = if raw.start_with?("[")
|
|
175
185
|
Array(JSON.parse(raw))
|
data/lib/ruby_llm/contract/step/base.rb
CHANGED
|
@@ -59,12 +59,13 @@ module RubyLLM
|
|
|
59
59
|
).recommend
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
-
def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
|
|
62
|
+
def optimize_retry_policy(candidates:, context: {}, min_score: 0.95, runs: 1)
|
|
63
63
|
Eval::RetryOptimizer.new(
|
|
64
64
|
step: self,
|
|
65
65
|
candidates: candidates,
|
|
66
66
|
context: context,
|
|
67
|
-
min_score: min_score
|
|
67
|
+
min_score: min_score,
|
|
68
|
+
runs: runs
|
|
68
69
|
).call
|
|
69
70
|
end
|
|
70
71
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-contract
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.2
|
|
4
|
+
version: 0.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justyna
|
|
@@ -109,6 +109,7 @@ files:
|
|
|
109
109
|
- lib/ruby_llm/contract/dsl.rb
|
|
110
110
|
- lib/ruby_llm/contract/errors.rb
|
|
111
111
|
- lib/ruby_llm/contract/eval.rb
|
|
112
|
+
- lib/ruby_llm/contract/eval/aggregated_report.rb
|
|
112
113
|
- lib/ruby_llm/contract/eval/baseline_diff.rb
|
|
113
114
|
- lib/ruby_llm/contract/eval/case_executor.rb
|
|
114
115
|
- lib/ruby_llm/contract/eval/case_result.rb
|