ruby_llm-contract 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/Gemfile.lock +2 -2
  4. data/README.md +27 -2
  5. data/lib/ruby_llm/contract/adapters/response.rb +4 -2
  6. data/lib/ruby_llm/contract/adapters/test.rb +3 -2
  7. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +23 -0
  8. data/lib/ruby_llm/contract/concerns/eval_host.rb +10 -2
  9. data/lib/ruby_llm/contract/eval/baseline_diff.rb +88 -0
  10. data/lib/ruby_llm/contract/eval/dataset.rb +11 -4
  11. data/lib/ruby_llm/contract/eval/eval_definition.rb +11 -10
  12. data/lib/ruby_llm/contract/eval/model_comparison.rb +1 -1
  13. data/lib/ruby_llm/contract/eval/report.rb +71 -2
  14. data/lib/ruby_llm/contract/eval/runner.rb +3 -2
  15. data/lib/ruby_llm/contract/eval.rb +1 -0
  16. data/lib/ruby_llm/contract/pipeline/base.rb +1 -1
  17. data/lib/ruby_llm/contract/pipeline/runner.rb +1 -1
  18. data/lib/ruby_llm/contract/pipeline/trace.rb +3 -2
  19. data/lib/ruby_llm/contract/prompt/node.rb +2 -2
  20. data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +2 -2
  21. data/lib/ruby_llm/contract/rake_task.rb +31 -4
  22. data/lib/ruby_llm/contract/rspec/helpers.rb +28 -8
  23. data/lib/ruby_llm/contract/rspec/pass_eval.rb +23 -2
  24. data/lib/ruby_llm/contract/step/base.rb +10 -5
  25. data/lib/ruby_llm/contract/step/dsl.rb +1 -1
  26. data/lib/ruby_llm/contract/step/retry_executor.rb +3 -2
  27. data/lib/ruby_llm/contract/step/retry_policy.rb +6 -0
  28. data/lib/ruby_llm/contract/step/runner.rb +3 -1
  29. data/lib/ruby_llm/contract/step/trace.rb +5 -4
  30. data/lib/ruby_llm/contract/version.rb +1 -1
  31. data/lib/ruby_llm/contract.rb +21 -18
  32. metadata +3 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '080fd81afd87ad234cf66f7577080a4ac55a59f890e0c8c479479fccec57ad32'
4
- data.tar.gz: cdabbac3ea1d81e1abd3cb850e927f410d98282bd23111be79463804ea4d84b9
3
+ metadata.gz: b032109a7818caa3f68cae651f9f99210765d4257825f52a332944a6120ad522
4
+ data.tar.gz: 8f4c1bb95cbcf79236723e100becf8c8f2b87061bd7c29827152e4d716a99ce3
5
5
  SHA512:
6
- metadata.gz: 294b36f7264a2ba8b04334f3fd1c6b4433466a04c6be4aaccf23a92df3c7e92d04061ace018aa5243e28a9ef4fe64abc7f6de5ec11143c32bf5466bf591b9130
7
- data.tar.gz: d7447319e3389264571209bc84d7dc84a441ffb76d1f64506d9cac2dc1953d26ba8cf3e1eb4169adf64576f1be7ae182bdf0c6e8e6b876220b102cea1e653fa6
6
+ metadata.gz: e84f8e58367e2eae1ea6a0a712e125be6b3edb361ce6feca984c659f15ca11ce658143adf7fdfcd09f5c1ff57d09fad31e431320f780dd08da7ab7499dd9b961
7
+ data.tar.gz: 29c98d8fb09a92df1a88136d7c67094784fdf2ae01ae9ec1aaa3fc5f1cd589fd27c7139c84663ba9e49c89e5537f98480eb451076c8a00dffcccfc3bf062f5d8
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.0 (2026-03-23)
4
+
5
+ Baseline regression detection — know when quality drops before users do.
6
+
7
+ ### Features
8
+
9
+ - **`report.save_baseline!`** — serialize eval results to `.eval_baselines/` (JSON, git-tracked)
10
+ - **`report.compare_with_baseline`** — returns `BaselineDiff` with regressions, improvements, score_delta, new/removed cases
11
+ - **`diff.regressed?`** — true when any previously-passing case now fails
12
+ - **`without_regressions` RSpec chain** — `expect(Step).to pass_eval("x").without_regressions`
13
+ - **RakeTask `fail_on_regression`** — blocks CI when regressions detected
14
+ - **RakeTask `save_baseline`** — auto-save after successful run
15
+ - **Migration guide** — `docs/guide/migration.md` with 7 patterns for adopting the gem in existing Rails apps
16
+
17
+ ### Stats
18
+
19
+ - 1086 tests, 0 failures
20
+
3
21
  ## 0.2.3 (2026-03-23)
4
22
 
5
23
  Production hardening from senior Rails review panel.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby_llm-contract (0.2.3)
4
+ ruby_llm-contract (0.3.0)
5
5
  dry-types (~> 1.7)
6
6
  ruby_llm (~> 1.0)
7
7
  ruby_llm-schema (~> 0.3)
@@ -165,7 +165,7 @@ CHECKSUMS
165
165
  rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
166
166
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
167
167
  ruby_llm (1.14.0) sha256=57c6f7034fc4a44504ea137d70f853b07824f1c1cdbe774ab3ab3522e7098deb
168
- ruby_llm-contract (0.2.3)
168
+ ruby_llm-contract (0.3.0)
169
169
  ruby_llm-schema (0.3.0) sha256=a591edc5ca1b7f0304f0e2261de61ba4b3bea17be09f5cf7558153adfda3dec6
170
170
  unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
171
171
  unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
data/README.md CHANGED
@@ -111,6 +111,30 @@ end
111
111
  # bundle exec rake ruby_llm_contract:eval
112
112
  ```
113
113
 
114
+ ## Detect quality drops
115
+
116
+ Save a baseline. Next run, see what regressed.
117
+
118
+ ```ruby
119
+ report = ClassifyTicket.run_eval("regression", context: { model: "gpt-4.1-nano" })
120
+ report.save_baseline!(model: "gpt-4.1-nano")
121
+
122
+ # Later — after prompt change, model update, or provider weight shift:
123
+ report = ClassifyTicket.run_eval("regression", context: { model: "gpt-4.1-nano" })
124
+ diff = report.compare_with_baseline(model: "gpt-4.1-nano")
125
+
126
+ diff.regressed? # => true
127
+ diff.regressions # => [{case: "outage", baseline: {passed: true}, current: {passed: false}}]
128
+ diff.score_delta # => -0.33
129
+ ```
130
+
131
+ ```ruby
132
+ # CI: block merge if any previously-passing case now fails
133
+ expect(ClassifyTicket).to pass_eval("regression")
134
+ .with_context(model: "gpt-4.1-nano")
135
+ .without_regressions
136
+ ```
137
+
114
138
  ## Predict cost before running
115
139
 
116
140
  ```ruby
@@ -140,12 +164,13 @@ Works with any ruby_llm provider (OpenAI, Anthropic, Gemini, etc).
140
164
  | [Output Schema](docs/guide/output_schema.md) | Full schema reference + constraints |
141
165
  | [Pipeline](docs/guide/pipeline.md) | Multi-step composition, timeout, fail-fast |
142
166
  | [Testing](docs/guide/testing.md) | Test adapter, RSpec matchers |
167
+ | [Migration](docs/guide/migration.md) | Adopting the gem in existing Rails apps |
143
168
 
144
169
  ## Roadmap
145
170
 
146
- **v0.2 (current):** Model comparison, cost tracking, eval with `add_case`, CI gating, Rails Railtie.
171
+ **v0.3 (current):** Baseline regression detection — `save_baseline!`, `compare_with_baseline`, `without_regressions`. Migration guide.
147
172
 
148
- **v0.3:** Regression baselines compare eval results with previous run, detect quality drift.
173
+ **v0.2:** Model comparison, cost tracking, eval with `add_case`, CI gating, Rails Railtie.
149
174
 
150
175
  **v0.4:** Auto-routing — learn which model works for which input pattern.
151
176
 
@@ -4,11 +4,13 @@ module RubyLLM
4
4
  module Contract
5
5
  module Adapters
6
6
  class Response
7
+ include Concerns::DeepFreeze
8
+
7
9
  attr_reader :content, :usage
8
10
 
9
11
  def initialize(content:, usage: {})
10
- @content = content
11
- @usage = usage
12
+ @content = deep_dup_freeze(content)
13
+ @usage = deep_dup_freeze(usage)
12
14
  freeze
13
15
  end
14
16
  end
@@ -4,8 +4,9 @@ module RubyLLM
4
4
  module Contract
5
5
  module Adapters
6
6
  class Test < Base
7
- def initialize(response: nil, responses: nil)
7
+ def initialize(response: nil, responses: nil, usage: nil)
8
8
  super()
9
+ @usage = (usage || { input_tokens: 0, output_tokens: 0 }).dup.freeze
9
10
  if responses
10
11
  raise ArgumentError, "responses: must not be empty (use response: nil for nil content)" if responses.empty?
11
12
 
@@ -36,7 +37,7 @@ module RubyLLM
36
37
  else
37
38
  @response
38
39
  end
39
- Response.new(content: content, usage: { input_tokens: 0, output_tokens: 0 })
40
+ Response.new(content: content, usage: @usage)
40
41
  end
41
42
  end
42
43
  end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Concerns
      # Deep-duplicate and freeze a value. Produces a frozen copy that shares no
      # mutable Hash/Array/String state with the original and never mutates the
      # original.
      module DeepFreeze
        private

        # Recursively copy +obj+ and freeze the copy.
        #
        # - nil, true, false, Integer, Float and Symbol are returned unchanged.
        # - Hash: keys AND values are copied recursively, so a mutable key
        #   (e.g. an Array) is no longer shared with the caller's hash.
        # - Array: every element is copied recursively.
        # - Anything else (String included): returned as-is when already frozen,
        #   otherwise a shallow +dup+ is frozen. Instance variables of arbitrary
        #   objects are not traversed.
        #
        # @param obj [Object] value to snapshot
        # @return [Object] frozen copy, or +obj+ itself when it is already immutable
        def deep_dup_freeze(obj)
          case obj
          when NilClass, Integer, Float, Symbol, TrueClass, FalseClass then obj
          when Hash then obj.to_h { |k, v| [deep_dup_freeze(k), deep_dup_freeze(v)] }.freeze
          when Array then obj.map { |v| deep_dup_freeze(v) }.freeze
          else obj.frozen? ? obj : obj.dup.freeze
          end
        end
      end
    end
  end
end
@@ -6,6 +6,7 @@ module RubyLLM
6
6
  module EvalHost
7
7
  def define_eval(name, &)
8
8
  @eval_definitions ||= {}
9
+ @file_sourced_evals ||= Set.new
9
10
  key = name.to_s
10
11
 
11
12
  if @eval_definitions.key?(key) && !Thread.current[:ruby_llm_contract_reloading]
@@ -14,12 +15,16 @@ module RubyLLM
14
15
  end
15
16
 
16
17
  @eval_definitions[key] = Eval::EvalDefinition.new(key, step_class: self, &)
18
+ @file_sourced_evals.add(key) if Thread.current[:ruby_llm_contract_reloading]
17
19
  Contract.register_eval_host(self)
18
20
  register_subclasses(self)
19
21
  end
20
22
 
21
- def clear_eval_definitions!
22
- @eval_definitions = {}
23
+ def clear_file_sourced_evals!
24
+ return unless defined?(@file_sourced_evals) && defined?(@eval_definitions)
25
+
26
+ @file_sourced_evals.each { |key| @eval_definitions.delete(key) }
27
+ @file_sourced_evals.clear
23
28
  end
24
29
 
25
30
  def eval_names
@@ -31,6 +36,7 @@ module RubyLLM
31
36
  end
32
37
 
33
38
  def run_eval(name = nil, context: {})
39
+ context ||= {}
34
40
  if name
35
41
  run_single_eval(name, context)
36
42
  else
@@ -39,6 +45,7 @@ module RubyLLM
39
45
  end
40
46
 
41
47
  def compare_models(eval_name, models:, context: {})
48
+ context ||= {}
42
49
  reports = models.each_with_object({}) do |model, hash|
43
50
  model_context = deep_dup_context(context).merge(model: model)
44
51
  hash[model] = run_single_eval(eval_name, model_context)
@@ -75,6 +82,7 @@ module RubyLLM
75
82
  end
76
83
 
77
84
  def eval_context(defn, context)
85
+ context = (context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
78
86
  return context if context[:adapter]
79
87
 
80
88
  sample_adapter = defn.build_adapter
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Case-by-case comparison between a saved baseline run and the current run
      # of an eval. Cases are matched by their +:name+.
      #
      # Every case is a Hash read through the keys +:name+, +:passed+ and
      # +:score+; current cases are additionally read through +:details+ when a
      # regression is reported.
      class BaselineDiff
        # Mean case score of the baseline run and of the current run
        # (0.0 when the respective run has no cases).
        attr_reader :baseline_score, :current_score

        # @param baseline_cases [Array<Hash>] cases stored in the baseline
        # @param current_cases [Array<Hash>] cases produced by the current run
        def initialize(baseline_cases:, current_cases:)
          @baseline = index_by_name(baseline_cases).freeze
          @current = index_by_name(current_cases).freeze
          @baseline_score = mean_score(baseline_cases)
          @current_score = mean_score(current_cases)
          freeze
        end

        # Cases present in both runs that passed in the baseline and fail now.
        #
        # @return [Array<Hash>] entries with +:case+, +:baseline+, +:current+ and
        #   +:detail+ (the current case's +:details+)
        def regressions
          shared_cases.filter_map do |name, baseline, current|
            next unless baseline[:passed] && !current[:passed]

            status_change(name, baseline, current).merge(detail: current[:details])
          end
        end

        # Cases present in both runs that failed in the baseline and pass now.
        #
        # @return [Array<Hash>] entries with +:case+, +:baseline+ and +:current+
        def improvements
          shared_cases.filter_map do |name, baseline, current|
            next unless !baseline[:passed] && current[:passed]

            status_change(name, baseline, current)
          end
        end

        # @return [Float] current mean score minus baseline mean score, rounded
        #   to 4 decimal places
        def score_delta
          (current_score - baseline_score).round(4)
        end

        # @return [Boolean] true when at least one case regressed
        def regressed?
          regressions.any?
        end

        # @return [Boolean] true when at least one case improved
        def improved?
          improvements.any?
        end

        # @return [Array] names of cases that exist only in the current run
        def new_cases
          @current.keys - @baseline.keys
        end

        # @return [Array] names of cases that exist only in the baseline
        def removed_cases
          @baseline.keys - @current.keys
        end

        # Multi-line human-readable summary: the score line first, then one line
        # per regressed, improved, new and removed case.
        #
        # @return [String]
        def to_s
          lines = ["Score: #{baseline_score.round(2)} → #{current_score.round(2)} (#{format_delta})"]
          regressions.each { |r| lines << " REGRESSED #{r[:case]}: #{r[:detail]}" }
          improvements.each { |r| lines << " IMPROVED #{r[:case]}" }
          new_cases.each { |c| lines << " NEW #{c}" }
          removed_cases.each { |c| lines << " REMOVED #{c}" }
          lines.join("\n")
        end

        private

        # Build a name => case lookup; a later case with the same name wins.
        def index_by_name(cases)
          cases.each_with_object({}) { |c, h| h[c[:name]] = c }
        end

        # Mean of the +:score+ values. Summation starts from 0.0 so that Integer
        # scores (e.g. 1/0) are not truncated by integer division.
        def mean_score(cases)
          return 0.0 if cases.empty?

          cases.sum(0.0) { |c| c[:score] } / cases.length
        end

        # [name, baseline_case, current_case] triples for every case that exists
        # in both runs, in baseline order.
        def shared_cases
          @baseline.filter_map do |name, baseline|
            current = @current[name]
            [name, baseline, current] if current
          end
        end

        # Common payload describing one case's pass/score before and after.
        def status_change(name, baseline, current)
          {
            case: name,
            baseline: { passed: baseline[:passed], score: baseline[:score] },
            current: { passed: current[:passed], score: current[:score] }
          }
        end

        # Score delta with an explicit leading "+" for non-negative values.
        def format_delta
          d = score_delta
          d >= 0 ? "+#{d}" : d.to_s
        end
      end
    end
  end
end
@@ -23,8 +23,13 @@ module RubyLLM
23
23
  # dataset.case "name", input: {...}, expected_traits: {...}
24
24
  # dataset.case "name", input: {...}, evaluator: proc
25
25
  def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil)
26
+ case_name = name || "case_#{@cases.length + 1}"
27
+ if @cases.any? { |c| c.name == case_name }
28
+ raise ArgumentError, "Duplicate case name '#{case_name}'. Case names must be unique within a dataset."
29
+ end
30
+
26
31
  @cases << Case.new(
27
- name: name || "case_#{@cases.length + 1}",
32
+ name: case_name,
28
33
  input: input,
29
34
  expected: expected,
30
35
  expected_traits: expected_traits,
@@ -37,13 +42,15 @@ module RubyLLM
37
42
  end
38
43
 
39
44
  class Case
45
+ include Concerns::DeepFreeze
46
+
40
47
  attr_reader :name, :input, :expected, :expected_traits, :evaluator
41
48
 
42
49
  def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil)
43
50
  @name = name
44
- @input = input
45
- @expected = expected
46
- @expected_traits = expected_traits
51
+ @input = deep_dup_freeze(input)
52
+ @expected = deep_dup_freeze(expected)
53
+ @expected_traits = deep_dup_freeze(expected_traits)
47
54
  @evaluator = evaluator
48
55
  freeze
49
56
  end
@@ -21,18 +21,19 @@ module RubyLLM
21
21
 
22
22
  def sample_response(response)
23
23
  @sample_response = response
24
+ @has_sample_response = true
24
25
  pre_validate_sample! if @step_class
25
26
  end
26
27
 
27
28
  def build_adapter
28
- return nil unless @sample_response
29
+ return nil unless defined?(@has_sample_response) && @has_sample_response
29
30
 
30
- Adapters::Test.new(response: @sample_response.is_a?(String) ? @sample_response : @sample_response.to_json)
31
+ Adapters::Test.new(response: @sample_response)
31
32
  end
32
33
 
33
34
  def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil)
34
- case_input = input || @default_input
35
- raise ArgumentError, "add_case requires input (set default_input or pass input:)" unless case_input
35
+ case_input = input.nil? ? @default_input : input
36
+ raise ArgumentError, "add_case requires input (set default_input or pass input:)" if case_input.nil?
36
37
 
37
38
  @cases << {
38
39
  name: description,
@@ -44,12 +45,12 @@ module RubyLLM
44
45
  end
45
46
 
46
47
  def verify(description, expected_or_proc = nil, input: nil, expect: nil)
47
- if expected_or_proc && expect
48
+ if !expected_or_proc.nil? && !expect.nil?
48
49
  raise ArgumentError, "verify accepts either a positional argument or expect: keyword, not both"
49
50
  end
50
51
 
51
- expected_or_proc = expect if expect
52
- case_input = input || @default_input
52
+ expected_or_proc = expect unless expect.nil?
53
+ case_input = input.nil? ? @default_input : input
53
54
  validate_verify_args!(expected_or_proc, case_input)
54
55
 
55
56
  evaluator = expected_or_proc.is_a?(::Proc) ? expected_or_proc : nil
@@ -78,15 +79,15 @@ module RubyLLM
78
79
 
79
80
  def effective_cases
80
81
  return @cases if @cases.any?
81
- return [] unless @default_input
82
+ return [] if @default_input.nil?
82
83
 
83
84
  # Zero-verify: auto-add a contract check case
84
85
  [{ name: "contract check", input: @default_input, expected: nil, evaluator: nil }]
85
86
  end
86
87
 
87
88
  def validate_verify_args!(expected_or_proc, case_input)
88
- raise ArgumentError, "verify requires either a positional argument or expect: keyword" unless expected_or_proc
89
- raise ArgumentError, "verify requires input (set default_input or pass input:)" unless case_input
89
+ raise ArgumentError, "verify requires either a positional argument or expect: keyword" if expected_or_proc.nil?
90
+ raise ArgumentError, "verify requires input (set default_input or pass input:)" if case_input.nil?
90
91
  end
91
92
 
92
93
  def pre_validate_sample!
@@ -8,7 +8,7 @@ module RubyLLM
8
8
 
9
9
  def initialize(eval_name:, reports:)
10
10
  @eval_name = eval_name
11
- @reports = reports.freeze # { "model_name" => Report }
11
+ @reports = reports.dup.freeze # { "model_name" => Report }
12
12
  freeze
13
13
  end
14
14
 
@@ -1,14 +1,18 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "json"
4
+ require "fileutils"
5
+
3
6
  module RubyLLM
4
7
  module Contract
5
8
  module Eval
6
9
  class Report
7
10
  attr_reader :dataset_name, :results
8
11
 
9
- def initialize(dataset_name:, results:)
12
+ def initialize(dataset_name:, results:, step_name: nil)
10
13
  @dataset_name = dataset_name
11
- @results = results.freeze
14
+ @step_name = step_name
15
+ @results = results.dup.freeze
12
16
  freeze
13
17
  end
14
18
 
@@ -78,6 +82,29 @@ module RubyLLM
78
82
  lines.join("\n")
79
83
  end
80
84
 
85
+ def save_baseline!(path: nil, model: nil)
86
+ file = path || default_baseline_path(model: model)
87
+ FileUtils.mkdir_p(File.dirname(file))
88
+ File.write(file, JSON.pretty_generate(serialize_for_baseline))
89
+ file
90
+ end
91
+
92
+ def compare_with_baseline(path: nil, model: nil)
93
+ file = path || default_baseline_path(model: model)
94
+ raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
95
+
96
+ baseline_data = JSON.parse(File.read(file), symbolize_names: true)
97
+ validate_baseline!(baseline_data)
98
+ BaselineDiff.new(
99
+ baseline_cases: baseline_data[:cases],
100
+ current_cases: evaluated_results.map { |r| serialize_case(r) }
101
+ )
102
+ end
103
+
104
+ def baseline_exists?(path: nil, model: nil)
105
+ File.exist?(path || default_baseline_path(model: model))
106
+ end
107
+
81
108
  def print_summary(io = $stdout)
82
109
  io.puts summary
83
110
  io.puts
@@ -106,6 +133,48 @@ module RubyLLM
106
133
  results.reject { |r| r.step_status == :skipped }
107
134
  end
108
135
 
136
+ def default_baseline_path(model: nil)
137
+ parts = [".eval_baselines"]
138
+ parts << sanitize_name(@step_name) if @step_name
139
+ name = sanitize_name(dataset_name)
140
+ name = "#{name}_#{sanitize_name(model)}" if model
141
+ parts << "#{name}.json"
142
+ File.join(*parts)
143
+ end
144
+
145
+ def validate_baseline!(data)
146
+ if data[:dataset_name] && data[:dataset_name] != dataset_name
147
+ raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{dataset_name}'"
148
+ end
149
+ if data[:step_name] && @step_name && data[:step_name] != @step_name
150
+ raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@step_name}'"
151
+ end
152
+ end
153
+
154
+ def sanitize_name(name)
155
+ name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
156
+ end
157
+
158
+ def serialize_for_baseline
159
+ {
160
+ dataset_name: dataset_name,
161
+ step_name: @step_name,
162
+ score: score,
163
+ total_cost: total_cost,
164
+ cases: evaluated_results.map { |r| serialize_case(r) }
165
+ }
166
+ end
167
+
168
+ def serialize_case(result)
169
+ {
170
+ name: result.name,
171
+ passed: result.passed?,
172
+ score: result.score,
173
+ details: result.details,
174
+ cost: result.cost
175
+ }
176
+ end
177
+
109
178
  def format_cost(cost)
110
179
  "$#{format("%.6f", cost)}"
111
180
  end
@@ -19,7 +19,8 @@ module RubyLLM
19
19
 
20
20
  def run
21
21
  results = @dataset.cases.map { |test_case| evaluate_case(test_case) }
22
- Report.new(dataset_name: @dataset.name, results: results)
22
+ step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
23
+ Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
23
24
  end
24
25
 
25
26
  private
@@ -81,7 +82,7 @@ module RubyLLM
81
82
  evaluate_with_custom(step_result, test_case)
82
83
  elsif test_case.expected_traits
83
84
  evaluate_traits(step_result, test_case)
84
- elsif test_case.expected
85
+ elsif !test_case.expected.nil?
85
86
  evaluate_expected(step_result, test_case)
86
87
  else
87
88
  evaluate_contract_only
@@ -14,3 +14,4 @@ require_relative "eval/runner"
14
14
  require_relative "eval/report"
15
15
  require_relative "eval/eval_definition"
16
16
  require_relative "eval/model_comparison"
17
+ require_relative "eval/baseline_diff"
@@ -20,7 +20,7 @@ module RubyLLM
20
20
  end
21
21
 
22
22
  def steps
23
- steps_registry.dup.freeze
23
+ steps_registry.map { |s| s.dup.freeze }.freeze
24
24
  end
25
25
 
26
26
  # Internal mutable steps list for registration
@@ -13,7 +13,7 @@ module RubyLLM
13
13
  raise ArgumentError, "Pipeline has no steps defined" if steps.empty?
14
14
 
15
15
  @steps = steps
16
- @context = context
16
+ @context = context || {}
17
17
  @timeout_ms = timeout_ms
18
18
  @token_budget = token_budget
19
19
  end
@@ -5,14 +5,15 @@ module RubyLLM
5
5
  module Pipeline
6
6
  class Trace
7
7
  include Concerns::TraceEquality
8
+ include Concerns::DeepFreeze
8
9
 
9
10
  attr_reader :trace_id, :total_latency_ms, :total_usage, :step_traces, :total_cost
10
11
 
11
12
  def initialize(trace_id: nil, total_latency_ms: nil, total_usage: nil, step_traces: nil)
12
13
  @trace_id = trace_id
13
14
  @total_latency_ms = total_latency_ms
14
- @total_usage = total_usage
15
- @step_traces = step_traces
15
+ @total_usage = deep_dup_freeze(total_usage)
16
+ @step_traces = step_traces&.dup&.freeze
16
17
  @total_cost = calculate_total_cost
17
18
  freeze
18
19
  end
@@ -7,8 +7,8 @@ module RubyLLM
7
7
  attr_reader :type, :content
8
8
 
9
9
  def initialize(type:, content:)
10
- @type = type.freeze
11
- @content = content.freeze
10
+ @type = type
11
+ @content = content.frozen? ? content : content.dup.freeze
12
12
  freeze
13
13
  end
14
14
 
@@ -8,8 +8,8 @@ module RubyLLM
8
8
  attr_reader :input, :output
9
9
 
10
10
  def initialize(input:, output:)
11
- @input = input.freeze
12
- @output = output.freeze
11
+ @input = input.frozen? ? input : input.dup.freeze
12
+ @output = output.frozen? ? output : output.dup.freeze
13
13
  super(type: :example, content: nil)
14
14
  end
15
15
 
@@ -6,7 +6,8 @@ require "rake/tasklib"
6
6
  module RubyLLM
7
7
  module Contract
8
8
  class RakeTask < ::Rake::TaskLib
9
- attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost, :eval_dirs
9
+ attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost,
10
+ :eval_dirs, :save_baseline, :fail_on_regression
10
11
 
11
12
  def initialize(name = :"ruby_llm_contract:eval", &block)
12
13
  super()
@@ -16,6 +17,8 @@ module RubyLLM
16
17
  @minimum_score = nil # nil = require 100%; float = threshold
17
18
  @maximum_cost = nil # nil = no cost limit; float = budget cap (suite-level)
18
19
  @eval_dirs = [] # directories to load eval files from (non-Rails)
20
+ @save_baseline = false
21
+ @fail_on_regression = false
19
22
  block&.call(self)
20
23
  define_task
21
24
  end
@@ -26,8 +29,7 @@ module RubyLLM
26
29
  desc "Run all ruby_llm-contract evals"
27
30
  task(@name => task_prerequisites) do
28
31
  require "ruby_llm/contract"
29
- @eval_dirs.each { |dir| RubyLLM::Contract.load_evals!(dir) }
30
- RubyLLM::Contract.load_evals!
32
+ RubyLLM::Contract.load_evals!(*@eval_dirs)
31
33
 
32
34
  results = RubyLLM::Contract.run_all_evals(context: @context)
33
35
 
@@ -43,12 +45,16 @@ module RubyLLM
43
45
  gate_passed = true
44
46
  suite_cost = 0.0
45
47
 
48
+ passed_reports = []
49
+
46
50
  results.each do |host, reports|
47
51
  puts "\n#{host.name || host.to_s}"
48
52
  reports.each_value do |report|
49
53
  report.print_summary
50
54
  suite_cost += report.total_cost
51
- gate_passed = false unless report_meets_score?(report)
55
+ report_ok = report_meets_score?(report) && !check_regression(report)
56
+ gate_passed = false unless report_ok
57
+ passed_reports << report if report_ok
52
58
  end
53
59
  end
54
60
 
@@ -58,6 +64,9 @@ module RubyLLM
58
64
  end
59
65
 
60
66
  abort "\nEval suite FAILED" unless gate_passed
67
+
68
+ # Save baselines only after ALL gates pass
69
+ passed_reports.each { |r| save_baseline!(r) } if @save_baseline
61
70
  puts "\nAll evals passed."
62
71
  end
63
72
  end
@@ -70,6 +79,24 @@ module RubyLLM
70
79
  end
71
80
  end
72
81
 
82
+ def check_regression(report)
83
+ return false unless @fail_on_regression && report.baseline_exists?
84
+
85
+ diff = report.compare_with_baseline
86
+ if diff.regressed?
87
+ puts "\n REGRESSIONS DETECTED:"
88
+ puts " #{diff}"
89
+ true
90
+ else
91
+ false
92
+ end
93
+ end
94
+
95
+ def save_baseline!(report)
96
+ path = report.save_baseline!
97
+ puts " Baseline saved: #{path}"
98
+ end
99
+
73
100
  def task_prerequisites
74
101
  Rake::Task.task_defined?(:environment) ? [:environment] : []
75
102
  end
@@ -10,18 +10,38 @@ module RubyLLM
10
10
  # result = ClassifyTicket.run("test")
11
11
  # result.parsed_output # => {priority: "high"}
12
12
  #
13
- # For multiple sequential responses:
14
- # stub_step(ClassifyTicket, responses: [{ a: 1 }, { a: 2 }])
13
+ # Only affects the specified step — other steps are not affected.
15
14
  #
16
15
  def stub_step(step_class, response: nil, responses: nil)
17
- adapter = if responses
18
- Adapters::Test.new(responses: responses.map { |r| r.is_a?(String) ? r : r.to_json })
19
- else
20
- content = response.is_a?(String) ? response : response.to_json
21
- Adapters::Test.new(response: content)
22
- end
16
+ adapter = build_test_adapter(response: response, responses: responses)
17
+ allow(step_class).to receive(:run).and_wrap_original do |original, input, **kwargs|
18
+ context = (kwargs[:context] || {}).merge(adapter: adapter)
19
+ original.call(input, context: context)
20
+ end
21
+ end
22
+
23
+ # Set a global test adapter for ALL steps.
24
+ #
25
+ # stub_all_steps(response: { default: true })
26
+ #
27
+ def stub_all_steps(response: nil, responses: nil)
28
+ adapter = build_test_adapter(response: response, responses: responses)
23
29
  RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
24
30
  end
31
+
32
+ private
33
+
34
+ def build_test_adapter(response: nil, responses: nil)
35
+ if responses
36
+ Adapters::Test.new(responses: responses.map { |r| normalize_test_response(r) })
37
+ else
38
+ Adapters::Test.new(response: normalize_test_response(response))
39
+ end
40
+ end
41
+
42
+ def normalize_test_response(value)
43
+ value
44
+ end
25
45
  end
26
46
  end
27
47
  end
@@ -64,12 +64,18 @@ RSpec::Matchers.define :pass_eval do |eval_name|
64
64
  @maximum_cost = cost
65
65
  end
66
66
 
67
+ chain :without_regressions do
68
+ @check_regressions = true
69
+ end
70
+
67
71
  match do |step_or_pipeline|
68
72
  @eval_name = eval_name
69
73
  @context ||= {}
70
74
  @minimum_score ||= nil
71
75
  @maximum_cost ||= nil
76
+ @check_regressions ||= false
72
77
  @error = nil
78
+ @diff = nil
73
79
  @report = step_or_pipeline.run_eval(eval_name, context: @context)
74
80
 
75
81
  score_ok = if @minimum_score
@@ -80,14 +86,29 @@ RSpec::Matchers.define :pass_eval do |eval_name|
80
86
 
81
87
  cost_ok = @maximum_cost ? @report.total_cost <= @maximum_cost : true
82
88
 
83
- score_ok && cost_ok
89
+ regression_ok = if @check_regressions && @report.baseline_exists?
90
+ @diff = @report.compare_with_baseline
91
+ !@diff.regressed?
92
+ else
93
+ true
94
+ end
95
+
96
+ score_ok && cost_ok && regression_ok
84
97
  rescue StandardError => e
85
98
  @error = e
86
99
  false
87
100
  end
88
101
 
89
102
  failure_message do
90
- format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
103
+ msg = format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
104
+ if @diff&.regressed?
105
+ msg += "\n\nRegressions from baseline:\n"
106
+ @diff.regressions.each do |r|
107
+ msg += " #{r[:case]}: was PASS, now FAIL — #{r[:detail]}\n"
108
+ end
109
+ msg += " Score delta: #{@diff.score_delta}"
110
+ end
111
+ msg
91
112
  end
92
113
 
93
114
  failure_message_when_negated do
@@ -58,18 +58,23 @@ module RubyLLM
58
58
  end
59
59
  end
60
60
 
61
- KNOWN_CONTEXT_KEYS = %i[adapter model temperature max_tokens schema provider assume_model_exists].freeze
61
+ KNOWN_CONTEXT_KEYS = %i[adapter model temperature provider assume_model_exists].freeze
62
62
 
63
63
  def run(input, context: {})
64
+ context = (context || {}).transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
64
65
  warn_unknown_context_keys(context)
65
66
  adapter = resolve_adapter(context)
66
67
  default_model = context[:model] || model || RubyLLM::Contract.configuration.default_model
67
68
  policy = retry_policy
68
69
 
70
+ ctx_temp = context[:temperature]
71
+ extra = context.slice(:provider, :assume_model_exists)
69
72
  result = if policy
70
- run_with_retry(input, adapter: adapter, default_model: default_model, policy: policy)
73
+ run_with_retry(input, adapter: adapter, default_model: default_model,
74
+ policy: policy, context_temperature: ctx_temp, extra_options: extra)
71
75
  else
72
- run_once(input, adapter: adapter, model: default_model, context_temperature: context[:temperature])
76
+ run_once(input, adapter: adapter, model: default_model,
77
+ context_temperature: ctx_temp, extra_options: extra)
73
78
  end
74
79
 
75
80
  invoke_around_call(input, result)
@@ -101,14 +106,14 @@ module RubyLLM
101
106
  "{ |c| c.default_adapter = ... } or pass context: { adapter: ... }"
102
107
  end
103
108
 
104
- def run_once(input, adapter:, model:, context_temperature: nil)
109
+ def run_once(input, adapter:, model:, context_temperature: nil, extra_options: {})
105
110
  effective_temp = context_temperature || temperature
106
111
  Runner.new(
107
112
  input_type: input_type, output_type: output_type,
108
113
  prompt_block: prompt, contract_definition: effective_contract,
109
114
  adapter: adapter, model: model, output_schema: output_schema,
110
115
  max_output: max_output, max_input: max_input, max_cost: max_cost,
111
- temperature: effective_temp
116
+ temperature: effective_temp, extra_options: extra_options
112
117
  ).call(input)
113
118
  rescue ArgumentError => e
114
119
  Result.new(status: :input_error, raw_output: nil, parsed_output: nil,
@@ -168,7 +168,7 @@ module RubyLLM
168
168
  end
169
169
 
170
170
  def retry_policy(models: nil, attempts: nil, retry_on: nil, &block)
171
- if block || models || attempts
171
+ if block || models || attempts || retry_on
172
172
  return @retry_policy = RetryPolicy.new(models: models, attempts: attempts, retry_on: retry_on, &block)
173
173
  end
174
174
 
@@ -8,12 +8,13 @@ module RubyLLM
8
8
  module RetryExecutor
9
9
  private
10
10
 
11
- def run_with_retry(input, adapter:, default_model:, policy:)
11
+ def run_with_retry(input, adapter:, default_model:, policy:, context_temperature: nil, extra_options: {})
12
12
  all_attempts = []
13
13
 
14
14
  policy.max_attempts.times do |attempt_index|
15
15
  model = policy.model_for_attempt(attempt_index, default_model)
16
- result = run_once(input, adapter: adapter, model: model)
16
+ result = run_once(input, adapter: adapter, model: model,
17
+ context_temperature: context_temperature, extra_options: extra_options)
17
18
  all_attempts << { attempt: attempt_index + 1, model: model, result: result }
18
19
  break unless policy.retryable?(result)
19
20
  end
@@ -15,6 +15,7 @@ module RubyLLM
15
15
  if block
16
16
  @max_attempts = 1
17
17
  instance_eval(&block)
18
+ warn_no_retry! if @max_attempts == 1 && @models.empty?
18
19
  else
19
20
  apply_keywords(models: models, attempts: attempts, retry_on: retry_on)
20
21
  end
@@ -65,6 +66,11 @@ module RubyLLM
65
66
  @retryable_statuses = Array(retry_on).dup if retry_on
66
67
  end
67
68
 
69
+ def warn_no_retry!
70
+ warn "[ruby_llm-contract] retry_policy has max_attempts=1 with no models. " \
71
+ "This means no actual retry will happen. Add `attempts 2` or `escalate %w[model1 model2]`."
72
+ end
73
+
68
74
  def validate_max_attempts!
69
75
  return if @max_attempts.is_a?(Integer) && @max_attempts >= 1
70
76
 
@@ -8,7 +8,7 @@ module RubyLLM
8
8
 
9
9
  def initialize(input_type:, output_type:, prompt_block:, contract_definition:,
10
10
  adapter:, model:, output_schema: nil, max_output: nil,
11
- max_input: nil, max_cost: nil, temperature: nil)
11
+ max_input: nil, max_cost: nil, temperature: nil, extra_options: {})
12
12
  @input_type = input_type
13
13
  @output_type = output_type
14
14
  @prompt_block = prompt_block
@@ -20,6 +20,7 @@ module RubyLLM
20
20
  @max_input = max_input
21
21
  @max_cost = max_cost
22
22
  @temperature = temperature
23
+ @extra_options = extra_options
23
24
  end
24
25
 
25
26
  def call(input)
@@ -86,6 +87,7 @@ module RubyLLM
86
87
  opts[:schema] = @output_schema if @output_schema
87
88
  opts[:max_tokens] = @max_output if @max_output
88
89
  opts[:temperature] = @temperature if @temperature
90
+ @extra_options.each { |k, v| opts[k] = v unless opts.key?(k) }
89
91
  end
90
92
  end
91
93
 
@@ -5,15 +5,16 @@ module RubyLLM
5
5
  module Step
6
6
  class Trace
7
7
  include Concerns::TraceEquality
8
+ include Concerns::DeepFreeze
8
9
 
9
10
  attr_reader :messages, :model, :latency_ms, :usage, :attempts, :cost
10
11
 
11
12
  def initialize(messages: nil, model: nil, latency_ms: nil, usage: nil, attempts: nil, cost: nil)
12
- @messages = messages
13
- @model = model
13
+ @messages = deep_dup_freeze(messages)
14
+ @model = model.frozen? ? model : model&.dup&.freeze
14
15
  @latency_ms = latency_ms
15
- @usage = usage
16
- @attempts = attempts
16
+ @usage = deep_dup_freeze(usage)
17
+ @attempts = deep_dup_freeze(attempts)
17
18
  @cost = cost || CostCalculator.calculate(model_name: model, usage: usage)
18
19
  freeze
19
20
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module RubyLLM
4
4
  module Contract
5
- VERSION = "0.2.3"
5
+ VERSION = "0.3.0"
6
6
  end
7
7
  end
@@ -40,25 +40,21 @@ module RubyLLM
40
40
  @eval_hosts = []
41
41
  end
42
42
 
43
- def load_evals!(dir = nil)
44
- dirs = if dir
45
- [dir]
46
- elsif defined?(::Rails)
47
- %w[app/steps/eval app/contracts/eval].filter_map do |path|
48
- full = ::Rails.root.join(path)
49
- full.to_s if full.exist?
50
- end
51
- else
52
- []
53
- end
43
+ def load_evals!(*dirs)
44
+ dirs = dirs.flatten.compact
45
+ if dirs.empty? && defined?(::Rails)
46
+ dirs = %w[app/steps/eval app/contracts/eval].filter_map do |path|
47
+ full = ::Rails.root.join(path)
48
+ full.to_s if full.exist?
49
+ end
50
+ end
54
51
 
55
52
  return if dirs.empty?
56
53
 
57
- # Clear existing eval definitions before reload to prevent stale state.
58
- # Thread-local flag suppresses the "redefining" warning during reload.
54
+ # Clear file-sourced evals ONCE, then load ALL dirs.
59
55
  Thread.current[:ruby_llm_contract_reloading] = true
60
56
  eval_hosts.each do |host|
61
- host.clear_eval_definitions! if host.respond_to?(:clear_eval_definitions!)
57
+ host.clear_file_sourced_evals! if host.respond_to?(:clear_file_sourced_evals!)
62
58
  end
63
59
 
64
60
  dirs.each do |d|
@@ -70,11 +66,17 @@ module RubyLLM
70
66
 
71
67
  private
72
68
 
73
- # Filter out GC'd anonymous classes and classes that no longer have evals
69
+ # Filter stale hosts, deduplicate by name (last wins), prune registry in-place
74
70
  def live_eval_hosts
75
- eval_hosts.select do |host|
76
- host.respond_to?(:eval_defined?) && host.eval_defined?
77
- end
71
+ # Remove hosts without evals
72
+ @eval_hosts&.reject! { |h| !h.respond_to?(:eval_defined?) || !h.eval_defined? }
73
+
74
+ # Deduplicate: if two classes share a name (reload), keep the latest
75
+ seen = {}
76
+ @eval_hosts&.each { |h| seen[h.name || h.object_id] = h }
77
+ @eval_hosts = seen.values
78
+
79
+ @eval_hosts || []
78
80
  end
79
81
 
80
82
  def auto_create_adapter!
@@ -87,6 +89,7 @@ module RubyLLM
87
89
  end
88
90
  end
89
91
 
92
+ require_relative "contract/concerns/deep_freeze"
90
93
  require_relative "contract/concerns/deep_symbolize"
91
94
  require_relative "contract/concerns/eval_host"
92
95
  require_relative "contract/concerns/trace_equality"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-contract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justyna
@@ -82,6 +82,7 @@ files:
82
82
  - lib/ruby_llm/contract/adapters/response.rb
83
83
  - lib/ruby_llm/contract/adapters/ruby_llm.rb
84
84
  - lib/ruby_llm/contract/adapters/test.rb
85
+ - lib/ruby_llm/contract/concerns/deep_freeze.rb
85
86
  - lib/ruby_llm/contract/concerns/deep_symbolize.rb
86
87
  - lib/ruby_llm/contract/concerns/eval_host.rb
87
88
  - lib/ruby_llm/contract/concerns/trace_equality.rb
@@ -97,6 +98,7 @@ files:
97
98
  - lib/ruby_llm/contract/dsl.rb
98
99
  - lib/ruby_llm/contract/errors.rb
99
100
  - lib/ruby_llm/contract/eval.rb
101
+ - lib/ruby_llm/contract/eval/baseline_diff.rb
100
102
  - lib/ruby_llm/contract/eval/case_result.rb
101
103
  - lib/ruby_llm/contract/eval/contract_detail_builder.rb
102
104
  - lib/ruby_llm/contract/eval/dataset.rb