ruby_llm-contract 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2636c2e59f5fef27f929a94ac9e3194793ce6b51f86cce0c18ca6a5b0caa61ab
4
- data.tar.gz: c2c81c7cc8fd281bf6c88738f0fb5bc4bdbb86b71d2fb59248e2b0ebb8d648fe
3
+ metadata.gz: 75884386ae53ddf1985760afa2c94f64ce15274b0cb0829ea40e6711506e60cf
4
+ data.tar.gz: 203229c4ce8ab0b1ab9e6209871668b0713b114e412eba03e79e9964dc9a43bb
5
5
  SHA512:
6
- metadata.gz: d9f2fca592fd3a183d987239dea0cdc2456eed639d6cccaea71e9b1ef3a3ff6e32f3e346f640c905168674e7401ccb85e5f342815a1b290cdebe05a9f7b5374f
7
- data.tar.gz: 00b8d113564871db19f88d9061276a0aee0295da7638974804782fda7929cf893a0004aadfeffc2f25f13d421935e0e9d710e553d8e56b7b5d0229f62edd129e
6
+ metadata.gz: 0c586ce70e71d8e77c262e0ae21bab2e29ef6ccdb16df4a0e56271aeb98efba59027618db24d4b59470fd3a9d340051201584038f8c67143f7ca1bec0e5e365b
7
+ data.tar.gz: 6a988c62f6b36a4da860b736c5523abcc568ed1384c1f20cd8ff9659124c12eeeddb36100d5b8542af07b363cef7ade8a1aca01250929ec2c23172d4f3faec5f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.6.3 (2026-04-20)
4
+
5
+ ### Features
6
+
7
+ - **`runs:` parameter on `compare_models` and `optimize_retry_policy`** — runs each candidate N times per eval and reports the mean score, mean cost per run, and mean latency. This reduces sampling variance in live mode, where LLM outputs are non-deterministic (the gpt-5 family enforces `temperature=1.0` server-side, so a single unlucky sample can misclassify a viable candidate as "failing"). Defaults to `runs: 1`, which is backward compatible.
8
+ - **`RUNS=N` on `rake ruby_llm_contract:optimize`** — CLI flag for variance-aware optimization.
9
+ - **`Eval::AggregatedReport`** — duck-types `Report`, exposing `score` (mean), `score_min`/`score_max` (spread), `total_cost` (mean per run), `pass_rate` (clean-pass count, formatted as `x/N`), and `clean_passes`.
10
+ - **Guide: [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs)** — when to use it and why.
11
+
3
12
  ## 0.6.2 (2026-04-18)
4
13
 
5
14
  ### Features
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby_llm-contract (0.6.1)
4
+ ruby_llm-contract (0.6.3)
5
5
  dry-types (~> 1.7)
6
6
  ruby_llm (~> 1.0)
7
7
  ruby_llm-schema (~> 0.3)
@@ -258,7 +258,7 @@ CHECKSUMS
258
258
  rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
259
259
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
260
260
  ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
261
- ruby_llm-contract (0.6.1)
261
+ ruby_llm-contract (0.6.3)
262
262
  ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
263
263
  ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
264
264
  rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8
data/README.md CHANGED
@@ -158,6 +158,8 @@ Cheapest at 100%: gpt-4.1-mini
158
158
 
159
159
  Nano fails on edge cases. Mini and full both score 100% — but mini is **5x cheaper**. Now you know.
160
160
 
161
+ Running live against gpt-5 / o-series? Pass `runs: 3` to average out sampling variance (OpenAI forces `temperature=1.0` server-side, so one unlucky run can misclassify a viable candidate). See [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs).
162
+
161
163
  ## Let the gem tell you what to do
162
164
 
163
165
  Don't read tables — get a recommendation. Supports `model + reasoning_effort` combinations:
@@ -70,9 +70,11 @@ module RubyLLM
70
70
  Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
71
71
  end
72
72
 
73
- def compare_models(eval_name, models: [], candidates: [], context: {})
73
+ def compare_models(eval_name, models: [], candidates: [], context: {}, runs: 1)
74
74
  raise ArgumentError, "Pass either models: or candidates:, not both" if models.any? && candidates.any?
75
75
 
76
+ runs = coerce_runs(runs)
77
+
76
78
  context = safe_context(context)
77
79
  candidate_configs = normalize_candidates(models, candidates)
78
80
 
@@ -82,7 +84,8 @@ module RubyLLM
82
84
  label = Eval::ModelComparison.candidate_label(config)
83
85
  model_context = isolate_context(context).merge(model: config[:model])
84
86
  model_context[:reasoning_effort] = config[:reasoning_effort] if config[:reasoning_effort]
85
- reports[label] = run_single_eval(eval_name, model_context)
87
+ per_run = Array.new(runs) { run_single_eval(eval_name, model_context) }
88
+ reports[label] = runs == 1 ? per_run.first : Eval::AggregatedReport.new(per_run)
86
89
  configs[label] = config
87
90
  end
88
91
 
@@ -91,6 +94,13 @@ module RubyLLM
91
94
 
92
95
  private
93
96
 
97
+ def coerce_runs(runs)
98
+ raise ArgumentError, "runs must be an Integer >= 1, got #{runs.inspect}" unless runs.is_a?(Integer)
99
+ raise ArgumentError, "runs must be >= 1, got #{runs.inspect}" if runs < 1
100
+
101
+ runs
102
+ end
103
+
94
104
  def normalize_candidates(models, candidates)
95
105
  if candidates.any?
96
106
  candidates.map { |c| RubyLLM::Contract.normalize_candidate_config(c) }.uniq
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ # Wraps N Reports from repeated runs of the same eval to reduce sampling
7
+ # variance in live mode (temperature=1 on gpt-5 family). Exposes the same
8
+ # duck-type as Report — mean score, mean cost per run, mean latency.
9
+ #
10
+ # pass_rate reports how many runs passed cleanly (x/N), not case-level
11
+ # pass rate, since the question is "does this candidate reliably pass?".
12
+ class AggregatedReport
13
+ attr_reader :runs, :results
14
+
15
+ def initialize(runs)
16
+ raise ArgumentError, "runs must not be empty" if runs.empty?
17
+
18
+ @runs = runs.freeze
19
+ @results = runs.flat_map(&:results).freeze
20
+ freeze
21
+ end
22
+
23
+ def dataset_name
24
+ @runs.first.dataset_name
25
+ end
26
+
27
+ def step_name
28
+ @runs.first.step_name
29
+ end
30
+
31
+ def score
32
+ @runs.sum(&:score) / @runs.length.to_f
33
+ end
34
+
35
+ def score_min
36
+ @runs.map(&:score).min
37
+ end
38
+
39
+ def score_max
40
+ @runs.map(&:score).max
41
+ end
42
+
43
+ def total_cost
44
+ @runs.sum(&:total_cost) / @runs.length.to_f
45
+ end
46
+
47
+ def avg_latency_ms
48
+ latencies = @runs.filter_map(&:avg_latency_ms)
49
+ return nil if latencies.empty?
50
+
51
+ latencies.sum / latencies.length.to_f
52
+ end
53
+
54
+ def pass_rate
55
+ "#{clean_passes}/#{@runs.length}"
56
+ end
57
+
58
+ def pass_rate_ratio
59
+ clean_passes.to_f / @runs.length
60
+ end
61
+
62
+ def each(&block)
63
+ @results.each(&block)
64
+ end
65
+
66
+ def summary
67
+ @runs.first.summary
68
+ end
69
+
70
+ def to_s
71
+ @runs.first.to_s
72
+ end
73
+
74
+ def print_summary(io = $stdout)
75
+ @runs.first.print_summary(io)
76
+ end
77
+
78
+ def passed?
79
+ @runs.all?(&:passed?)
80
+ end
81
+
82
+ def clean_passes
83
+ @runs.count(&:passed?)
84
+ end
85
+
86
+ def failures
87
+ @runs.flat_map(&:failures)
88
+ end
89
+ end
90
+ end
91
+ end
92
+ end
@@ -94,11 +94,12 @@ module RubyLLM
94
94
  end
95
95
  end
96
96
 
97
- def initialize(step:, candidates:, context: {}, min_score: 0.95)
97
+ def initialize(step:, candidates:, context: {}, min_score: 0.95, runs: 1)
98
98
  @step = step
99
99
  @candidates = candidates
100
100
  @context = context
101
101
  @min_score = min_score
102
+ @runs = runs
102
103
  end
103
104
 
104
105
  def call
@@ -108,7 +109,7 @@ module RubyLLM
108
109
  score_matrix = {}
109
110
  evals.each do |eval_name|
110
111
  comparison = with_retry_disabled do
111
- @step.compare_models(eval_name, candidates: @candidates, context: @context)
112
+ @step.compare_models(eval_name, candidates: @candidates, context: @context, runs: @runs)
112
113
  end
113
114
  score_matrix[eval_name] = extract_scores(comparison)
114
115
  end
@@ -21,6 +21,7 @@ require_relative "eval/report_stats"
21
21
  require_relative "eval/report_presenter"
22
22
  require_relative "eval/report_storage"
23
23
  require_relative "eval/report"
24
+ require_relative "eval/aggregated_report"
24
25
  require_relative "eval/eval_definition"
25
26
  require_relative "eval/model_comparison"
26
27
  require_relative "eval/baseline_diff"
@@ -150,6 +150,7 @@ module RubyLLM
150
150
  raw_candidates = ENV["CANDIDATES"].to_s.strip
151
151
  abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if raw_candidates.empty?
152
152
  min_score = ENV.fetch("MIN_SCORE", "0.95").to_f
153
+ runs = parse_runs(ENV.fetch("RUNS", "1"))
153
154
 
154
155
  host = RubyLLM::Contract.eval_hosts.find { |h| h.name == step_name }
155
156
  unless host
@@ -163,13 +164,22 @@ module RubyLLM
163
164
  result = host.optimize_retry_policy(
164
165
  candidates: candidates,
165
166
  context: context,
166
- min_score: min_score
167
+ min_score: min_score,
168
+ runs: runs
167
169
  )
168
170
 
169
171
  result.print_summary
170
172
  end
171
173
  end
172
174
 
175
+ def parse_runs(raw)
176
+ runs = Integer(raw.to_s.strip, 10)
177
+ abort("RUNS must be an integer >= 1, e.g. RUNS=1") if runs < 1
178
+ runs
179
+ rescue ArgumentError
180
+ abort("RUNS must be an integer >= 1, e.g. RUNS=1")
181
+ end
182
+
173
183
  def parse_candidates(raw)
174
184
  entries = if raw.start_with?("[")
175
185
  Array(JSON.parse(raw))
@@ -59,12 +59,13 @@ module RubyLLM
59
59
  ).recommend
60
60
  end
61
61
 
62
- def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
62
+ def optimize_retry_policy(candidates:, context: {}, min_score: 0.95, runs: 1)
63
63
  Eval::RetryOptimizer.new(
64
64
  step: self,
65
65
  candidates: candidates,
66
66
  context: context,
67
- min_score: min_score
67
+ min_score: min_score,
68
+ runs: runs
68
69
  ).call
69
70
  end
70
71
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module RubyLLM
4
4
  module Contract
5
- VERSION = "0.6.2"
5
+ VERSION = "0.6.3"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-contract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justyna
@@ -109,6 +109,7 @@ files:
109
109
  - lib/ruby_llm/contract/dsl.rb
110
110
  - lib/ruby_llm/contract/errors.rb
111
111
  - lib/ruby_llm/contract/eval.rb
112
+ - lib/ruby_llm/contract/eval/aggregated_report.rb
112
113
  - lib/ruby_llm/contract/eval/baseline_diff.rb
113
114
  - lib/ruby_llm/contract/eval/case_executor.rb
114
115
  - lib/ruby_llm/contract/eval/case_result.rb