RubyGems - ruby_llm-contract - Versions diffs - 0.6.2 → 0.6.3 - Mend

ruby_llm-contract 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/Gemfile.lock +2 -2
data/README.md +2 -0
data/lib/ruby_llm/contract/concerns/eval_host.rb +12 -2
data/lib/ruby_llm/contract/eval/aggregated_report.rb +92 -0
data/lib/ruby_llm/contract/eval/retry_optimizer.rb +3 -2
data/lib/ruby_llm/contract/eval.rb +1 -0
data/lib/ruby_llm/contract/rake_task.rb +11 -1
data/lib/ruby_llm/contract/step/base.rb +3 -2
data/lib/ruby_llm/contract/version.rb +1 -1
metadata +2 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2636c2e59f5fef27f929a94ac9e3194793ce6b51f86cce0c18ca6a5b0caa61ab
-  data.tar.gz: c2c81c7cc8fd281bf6c88738f0fb5bc4bdbb86b71d2fb59248e2b0ebb8d648fe
+  metadata.gz: 75884386ae53ddf1985760afa2c94f64ce15274b0cb0829ea40e6711506e60cf
+  data.tar.gz: 203229c4ce8ab0b1ab9e6209871668b0713b114e412eba03e79e9964dc9a43bb
 SHA512:
-  metadata.gz: d9f2fca592fd3a183d987239dea0cdc2456eed639d6cccaea71e9b1ef3a3ff6e32f3e346f640c905168674e7401ccb85e5f342815a1b290cdebe05a9f7b5374f
-  data.tar.gz: 00b8d113564871db19f88d9061276a0aee0295da7638974804782fda7929cf893a0004aadfeffc2f25f13d421935e0e9d710e553d8e56b7b5d0229f62edd129e
+  metadata.gz: 0c586ce70e71d8e77c262e0ae21bab2e29ef6ccdb16df4a0e56271aeb98efba59027618db24d4b59470fd3a9d340051201584038f8c67143f7ca1bec0e5e365b
+  data.tar.gz: 6a988c62f6b36a4da860b736c5523abcc568ed1384c1f20cd8ff9659124c12eeeddb36100d5b8542af07b363cef7ade8a1aca01250929ec2c23172d4f3faec5f

data/CHANGELOG.md CHANGED

@@ -1,5 +1,14 @@
 # Changelog
+## 0.6.3 (2026-04-20)
+### Features
+- **`runs:` parameter on `compare_models` and `optimize_retry_policy`** — runs each candidate N times per eval and aggregates the mean score, mean cost per run, and mean latency. Reduces sampling variance in live mode where LLM outputs are non-deterministic (gpt-5 family enforces `temperature=1.0` server-side, so a single unlucky sample can misclassify a viable candidate as "failing"). Default `runs: 1` — backward compatible.
+- **`RUNS=N` on `rake ruby_llm_contract:optimize`** — CLI flag for variance-aware optimization.
+- **`Eval::AggregatedReport`** — duck-type `Report` exposing `score` (mean), `score_min`/`score_max` (spread), `total_cost` (mean per run), `pass_rate` (clean-pass count x/N), and `clean_passes`.
+- **Guide: [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs)** — when to use it and why.
 ## 0.6.2 (2026-04-18)
 ### Features

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby_llm-contract (0.6.1)
+    ruby_llm-contract (0.6.3)
       dry-types (~> 1.7)
       ruby_llm (~> 1.0)
       ruby_llm-schema (~> 0.3)
@@ -258,7 +258,7 @@ CHECKSUMS
   rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
   ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
-  ruby_llm-contract (0.6.1)
+  ruby_llm-contract (0.6.3)
   ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
   ruby_parser (3.22.0) sha256=1eb4937cd9eb220aa2d194e352a24dba90aef00751e24c8dfffdb14000f15d23
   rubycritic (4.12.0) sha256=024fed90fe656fa939f6ea80aab17569699ac3863d0b52fd72cb99892247abc8

data/README.md CHANGED

@@ -158,6 +158,8 @@ Cheapest at 100%: gpt-4.1-mini
 Nano fails on edge cases. Mini and full both score 100% — but mini is **5x cheaper**. Now you know.
+Running live against gpt-5 / o-series? Pass `runs: 3` to average out sampling variance (OpenAI forces `temperature=1.0` server-side, so one unlucky run can misclassify a viable candidate). See [Reducing variance with `runs:`](docs/guide/optimizing_retry_policy.md#reducing-variance-with-runs).
 ## Let the gem tell you what to do
 Don't read tables — get a recommendation. Supports `model + reasoning_effort` combinations:

data/lib/ruby_llm/contract/concerns/eval_host.rb CHANGED

@@ -70,9 +70,11 @@ module RubyLLM
           Eval::PromptDiff.new(candidate: my_report, baseline: other_report)
         end
-        def compare_models(eval_name, models: [], candidates: [], context: {})
+        def compare_models(eval_name, models: [], candidates: [], context: {}, runs: 1)
           raise ArgumentError, "Pass either models: or candidates:, not both" if models.any? && candidates.any?
+          runs = coerce_runs(runs)
           context = safe_context(context)
           candidate_configs = normalize_candidates(models, candidates)
@@ -82,7 +84,8 @@ module RubyLLM
             label = Eval::ModelComparison.candidate_label(config)
             model_context = isolate_context(context).merge(model: config[:model])
             model_context[:reasoning_effort] = config[:reasoning_effort] if config[:reasoning_effort]
-            reports[label] = run_single_eval(eval_name, model_context)
+            per_run = Array.new(runs) { run_single_eval(eval_name, model_context) }
+            reports[label] = runs == 1 ? per_run.first : Eval::AggregatedReport.new(per_run)
             configs[label] = config
           end
@@ -91,6 +94,13 @@ module RubyLLM
         private
+        def coerce_runs(runs)
+          raise ArgumentError, "runs must be an Integer >= 1, got #{runs.inspect}" unless runs.is_a?(Integer)
+          raise ArgumentError, "runs must be >= 1, got #{runs.inspect}" if runs < 1
+          runs
+        end
         def normalize_candidates(models, candidates)
           if candidates.any?
             candidates.map { |c| RubyLLM::Contract.normalize_candidate_config(c) }.uniq

data/lib/ruby_llm/contract/eval/aggregated_report.rb ADDED

@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+module RubyLLM
+  module Contract
+    module Eval
+      # Wraps N Reports from repeated runs of the same eval to reduce sampling
+      # variance in live mode (temperature=1 on gpt-5 family). Exposes the same
+      # duck-type as Report — mean score, mean cost per run, mean latency.
+      #
+      # pass_rate reports how many runs passed cleanly (x/N), not case-level
+      # pass rate, since the question is "does this candidate reliably pass?".
+      class AggregatedReport
+        attr_reader :runs, :results
+        def initialize(runs)
+          raise ArgumentError, "runs must not be empty" if runs.empty?
+          @runs = runs.freeze
+          @results = runs.flat_map(&:results).freeze
+          freeze
+        end
+        def dataset_name
+          @runs.first.dataset_name
+        end
+        def step_name
+          @runs.first.step_name
+        end
+        def score
+          @runs.sum(&:score) / @runs.length.to_f
+        end
+        def score_min
+          @runs.map(&:score).min
+        end
+        def score_max
+          @runs.map(&:score).max
+        end
+        def total_cost
+          @runs.sum(&:total_cost) / @runs.length.to_f
+        end
+        def avg_latency_ms
+          latencies = @runs.filter_map(&:avg_latency_ms)
+          return nil if latencies.empty?
+          latencies.sum / latencies.length.to_f
+        end
+        def pass_rate
+          "#{clean_passes}/#{@runs.length}"
+        end
+        def pass_rate_ratio
+          clean_passes.to_f / @runs.length
+        end
+        def each(&block)
+          @results.each(&block)
+        end
+        def summary
+          @runs.first.summary
+        end
+        def to_s
+          @runs.first.to_s
+        end
+        def print_summary(io = $stdout)
+          @runs.first.print_summary(io)
+        end
+        def passed?
+          @runs.all?(&:passed?)
+        end
+        def clean_passes
+          @runs.count(&:passed?)
+        end
+        def failures
+          @runs.flat_map(&:failures)
+        end
+      end
+    end
+  end
+end

data/lib/ruby_llm/contract/eval/retry_optimizer.rb CHANGED

@@ -94,11 +94,12 @@ module RubyLLM
           end
         end
-        def initialize(step:, candidates:, context: {}, min_score: 0.95)
+        def initialize(step:, candidates:, context: {}, min_score: 0.95, runs: 1)
           @step = step
           @candidates = candidates
           @context = context
           @min_score = min_score
+          @runs = runs
         end
         def call
@@ -108,7 +109,7 @@ module RubyLLM
           score_matrix = {}
           evals.each do |eval_name|
             comparison = with_retry_disabled do
-              @step.compare_models(eval_name, candidates: @candidates, context: @context)
+              @step.compare_models(eval_name, candidates: @candidates, context: @context, runs: @runs)
             end
             score_matrix[eval_name] = extract_scores(comparison)
           end

data/lib/ruby_llm/contract/eval.rb CHANGED

@@ -21,6 +21,7 @@ require_relative "eval/report_stats"
 require_relative "eval/report_presenter"
 require_relative "eval/report_storage"
 require_relative "eval/report"
+require_relative "eval/aggregated_report"
 require_relative "eval/eval_definition"
 require_relative "eval/model_comparison"
 require_relative "eval/baseline_diff"

data/lib/ruby_llm/contract/rake_task.rb CHANGED

@@ -150,6 +150,7 @@ module RubyLLM
           raw_candidates = ENV["CANDIDATES"].to_s.strip
           abort("CANDIDATES is required, e.g. CANDIDATES=gpt-5-nano,gpt-5-mini@low,gpt-5-mini") if raw_candidates.empty?
           min_score = ENV.fetch("MIN_SCORE", "0.95").to_f
+          runs = parse_runs(ENV.fetch("RUNS", "1"))
           host = RubyLLM::Contract.eval_hosts.find { |h| h.name == step_name }
           unless host
@@ -163,13 +164,22 @@ module RubyLLM
           result = host.optimize_retry_policy(
             candidates: candidates,
             context: context,
-            min_score: min_score
+            min_score: min_score,
+            runs: runs
           )
           result.print_summary
         end
       end
+      def parse_runs(raw)
+        runs = Integer(raw.to_s.strip, 10)
+        abort("RUNS must be an integer >= 1, e.g. RUNS=1") if runs < 1
+        runs
+      rescue ArgumentError
+        abort("RUNS must be an integer >= 1, e.g. RUNS=1")
+      end
       def parse_candidates(raw)
         entries = if raw.start_with?("[")
                     Array(JSON.parse(raw))

data/lib/ruby_llm/contract/step/base.rb CHANGED

@@ -59,12 +59,13 @@ module RubyLLM
             ).recommend
           end
-          def optimize_retry_policy(candidates:, context: {}, min_score: 0.95)
+          def optimize_retry_policy(candidates:, context: {}, min_score: 0.95, runs: 1)
             Eval::RetryOptimizer.new(
               step: self,
               candidates: candidates,
               context: context,
-              min_score: min_score
+              min_score: min_score,
+              runs: runs
             ).call
           end

data/lib/ruby_llm/contract/version.rb CHANGED

@@ -2,6 +2,6 @@
 module RubyLLM
   module Contract
-    VERSION = "0.6.2"
+    VERSION = "0.6.3"
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ruby_llm-contract
 version: !ruby/object:Gem::Version
-  version: 0.6.2
+  version: 0.6.3
 platform: ruby
 authors:
 - Justyna
@@ -109,6 +109,7 @@ files:
 - lib/ruby_llm/contract/dsl.rb
 - lib/ruby_llm/contract/errors.rb
 - lib/ruby_llm/contract/eval.rb
+- lib/ruby_llm/contract/eval/aggregated_report.rb
 - lib/ruby_llm/contract/eval/baseline_diff.rb
 - lib/ruby_llm/contract/eval/case_executor.rb
 - lib/ruby_llm/contract/eval/case_result.rb