ruby_llm-contract 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.rubycritic.yml +8 -0
  3. data/.simplecov +22 -0
  4. data/CHANGELOG.md +59 -0
  5. data/Gemfile +2 -0
  6. data/Gemfile.lock +104 -2
  7. data/README.md +42 -2
  8. data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
  9. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
  10. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
  11. data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
  12. data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
  13. data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
  14. data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
  15. data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
  16. data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
  17. data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
  18. data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
  19. data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
  20. data/lib/ruby_llm/contract/contract/validator.rb +9 -0
  21. data/lib/ruby_llm/contract/cost_calculator.rb +41 -1
  22. data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
  23. data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
  24. data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
  25. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
  26. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
  27. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
  28. data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
  29. data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
  30. data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
  31. data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
  32. data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
  33. data/lib/ruby_llm/contract/eval/report.rb +19 -191
  34. data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
  35. data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
  36. data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
  37. data/lib/ruby_llm/contract/eval/runner.rb +30 -207
  38. data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
  39. data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
  40. data/lib/ruby_llm/contract/eval.rb +13 -0
  41. data/lib/ruby_llm/contract/minitest.rb +116 -2
  42. data/lib/ruby_llm/contract/pipeline/base.rb +15 -2
  43. data/lib/ruby_llm/contract/rake_task.rb +20 -1
  44. data/lib/ruby_llm/contract/rspec/helpers.rb +91 -6
  45. data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
  46. data/lib/ruby_llm/contract/rspec.rb +18 -0
  47. data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
  48. data/lib/ruby_llm/contract/step/base.rb +94 -37
  49. data/lib/ruby_llm/contract/step/dsl.rb +61 -16
  50. data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
  51. data/lib/ruby_llm/contract/step/limit_checker.rb +28 -11
  52. data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
  53. data/lib/ruby_llm/contract/step/result.rb +3 -2
  54. data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
  55. data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
  56. data/lib/ruby_llm/contract/step/runner.rb +47 -84
  57. data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
  58. data/lib/ruby_llm/contract/step.rb +5 -0
  59. data/lib/ruby_llm/contract/version.rb +1 -1
  60. data/lib/ruby_llm/contract.rb +28 -0
  61. metadata +28 -1
@@ -4,8 +4,6 @@ module RubyLLM
4
4
  module Contract
5
5
  module Eval
6
6
  class Runner
7
- include TraitEvaluator
8
- include ContractDetailBuilder
9
7
  include Concerns::ContextHelpers
10
8
 
11
9
  def self.run(step:, dataset:, context: {}, concurrency: nil)
@@ -20,17 +18,35 @@ module RubyLLM
20
18
  end
21
19
 
22
20
  def run
23
- results = if @concurrency && @concurrency > 1
24
- run_concurrent
25
- else
26
- @dataset.cases.map { |test_case| evaluate_case(test_case) }
27
- end
28
- step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
29
- Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
21
+ Report.new(
22
+ dataset_name: @dataset.name,
23
+ results: collected_results,
24
+ step_name: step_name
25
+ )
30
26
  end
31
27
 
32
28
  private
33
29
 
30
+ def collected_results
31
+ concurrent? ? run_concurrent : run_serial
32
+ end
33
+
34
+ def run_serial
35
+ @dataset.cases.map { |test_case| case_executor.call(test_case: test_case, context: @context) }
36
+ end
37
+
38
+ def concurrent?
39
+ @concurrency && @concurrency > 1
40
+ end
41
+
42
+ def step_name
43
+ @step.respond_to?(:name) ? @step.name : @step.to_s
44
+ end
45
+
46
+ def case_executor
47
+ @case_executor ||= CaseExecutor.new(step: @step)
48
+ end
49
+
34
50
  def run_concurrent
35
51
  require "concurrent"
36
52
  pool = Concurrent::FixedThreadPool.new(@concurrency)
@@ -39,10 +55,10 @@ module RubyLLM
39
55
  # gets a single-response adapter with its own response (by index).
40
56
  per_case_contexts = build_per_case_contexts
41
57
 
42
- futures = @dataset.cases.each_with_index.map do |test_case, i|
43
- ctx = per_case_contexts[i]
58
+ futures = @dataset.cases.each_with_index.map do |test_case, index|
59
+ case_context = per_case_contexts[index]
44
60
  Concurrent::Future.execute(executor: pool) do
45
- evaluate_case_with_context(test_case, ctx)
61
+ case_executor.call(test_case: test_case, context: case_context)
46
62
  end
47
63
  end
48
64
  futures.map(&:value!)
@@ -55,10 +71,10 @@ module RubyLLM
55
71
  adapter = @context[:adapter]
56
72
  responses = adapter.respond_to?(:responses_array) ? adapter.responses_array : nil
57
73
 
58
- @dataset.cases.each_with_index.map do |_, i|
74
+ @dataset.cases.each_with_index.map do |_, index|
59
75
  if responses
60
76
  # Give each case its own single-response adapter
61
- response = responses[i] || responses.last
77
+ response = responses[index] || responses.last
62
78
  per_case_adapter = Adapters::Test.new(response: response)
63
79
  @context.merge(adapter: per_case_adapter)
64
80
  else
@@ -66,199 +82,6 @@ module RubyLLM
66
82
  end
67
83
  end
68
84
  end
69
-
70
- def evaluate_case_with_context(test_case, context)
71
- run_result = @step.run(test_case.input, context: context)
72
- step_result = normalize_result(run_result)
73
- eval_result = dispatch_evaluation(step_result, test_case)
74
-
75
- result = build_case_result(test_case, step_result, eval_result)
76
-
77
- if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
78
- run_result.respond_to?(:outputs_by_step)
79
- evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
80
- else
81
- result
82
- end
83
- rescue RubyLLM::Contract::Error => e
84
- raise unless e.message.include?("No adapter configured")
85
-
86
- skipped_result(test_case, e.message)
87
- end
88
-
89
- def evaluate_case(test_case)
90
- run_result = @step.run(test_case.input, context: @context)
91
- step_result = normalize_result(run_result)
92
- eval_result = dispatch_evaluation(step_result, test_case)
93
-
94
- result = build_case_result(test_case, step_result, eval_result)
95
-
96
- # Pipeline per-step evaluation
97
- if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
98
- run_result.respond_to?(:outputs_by_step)
99
- evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
100
- else
101
- result
102
- end
103
- rescue RubyLLM::Contract::Error => e
104
- raise unless e.message.include?("No adapter configured")
105
-
106
- skipped_result(test_case, e.message)
107
- end
108
-
109
- def build_case_result(test_case, step_result, eval_result)
110
- trace = step_result.respond_to?(:trace) ? step_result.trace : nil
111
- CaseResult.new(
112
- name: test_case.name,
113
- input: test_case.input,
114
- output: step_result.parsed_output,
115
- expected: test_case.expected,
116
- step_status: step_result.status,
117
- score: eval_result.score,
118
- passed: eval_result.passed,
119
- label: eval_result.label,
120
- details: eval_result.details,
121
- duration_ms: extract_latency(trace),
122
- cost: extract_cost(trace)
123
- )
124
- end
125
-
126
- def extract_latency(trace)
127
- return nil unless trace
128
-
129
- # Pipeline::Trace uses total_latency_ms, Step::Trace uses latency_ms
130
- if trace.respond_to?(:total_latency_ms)
131
- trace.total_latency_ms
132
- else
133
- trace[:latency_ms]
134
- end
135
- end
136
-
137
- def extract_cost(trace)
138
- return nil unless trace
139
-
140
- # Pipeline::Trace uses total_cost, Step::Trace uses cost
141
- if trace.respond_to?(:total_cost)
142
- trace.total_cost
143
- else
144
- trace[:cost]
145
- end
146
- end
147
-
148
- def dispatch_evaluation(step_result, test_case)
149
- return contract_failure(step_result) unless step_result.ok?
150
-
151
- if test_case.evaluator
152
- evaluate_with_custom(step_result, test_case)
153
- elsif test_case.expected_traits
154
- evaluate_traits(step_result, test_case)
155
- elsif !test_case.expected.nil?
156
- evaluate_expected(step_result, test_case)
157
- else
158
- evaluate_contract_only
159
- end
160
- end
161
-
162
- def normalize_result(result)
163
- return result if result.respond_to?(:parsed_output)
164
-
165
- normalize_pipeline_result(result)
166
- end
167
-
168
- def normalize_pipeline_result(result)
169
- last_result = result.step_results&.last&.dig(:result)
170
- is_ok = result.ok?
171
- pipeline_trace = result.respond_to?(:trace) ? result.trace : nil
172
-
173
- PipelineResultAdapter.new(
174
- status: result.status,
175
- ok_flag: is_ok,
176
- parsed_output: is_ok ? result.outputs_by_step.values.last : nil,
177
- validation_errors: last_result.respond_to?(:validation_errors) ? last_result.validation_errors : [],
178
- trace: pipeline_trace || (last_result.respond_to?(:trace) ? last_result.trace : {})
179
- )
180
- end
181
-
182
- def evaluate_expected(step_result, test_case)
183
- dispatch_expected_evaluator(
184
- output: step_result.parsed_output,
185
- expected: test_case.expected,
186
- input: test_case.input
187
- )
188
- end
189
-
190
- def dispatch_expected_evaluator(output:, expected:, input:)
191
- if expected.is_a?(Hash)
192
- Evaluator::JsonIncludes.new.call(output: output, expected: expected, input: input)
193
- elsif expected.is_a?(::Regexp)
194
- Evaluator::Regex.new(expected).call(output: output, input: input)
195
- else
196
- Evaluator::Exact.new.call(output: output, expected: expected, input: input)
197
- end
198
- end
199
-
200
- def evaluate_with_custom(step_result, test_case)
201
- evaluator = test_case.evaluator
202
- evaluator = Evaluator::ProcEvaluator.new(evaluator) if evaluator.is_a?(::Proc)
203
- evaluator.call(output: step_result.parsed_output, expected: test_case.expected, input: test_case.input)
204
- end
205
-
206
- def evaluate_contract_only
207
- EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
208
- end
209
-
210
- def contract_failure(step_result)
211
- EvaluationResult.new(
212
- score: 0.0, passed: false,
213
- details: "step failed: #{step_result.status} — #{step_result.validation_errors.join(", ")}"
214
- )
215
- end
216
-
217
- def evaluate_step_expectations(result, outputs_by_step, expectations)
218
- step_results = {}
219
- all_passed = true
220
-
221
- expectations.each do |step_alias, expected|
222
- output = outputs_by_step[step_alias]
223
- if output.nil?
224
- step_results[step_alias] = { passed: false, details: "step not executed" }
225
- all_passed = false
226
- else
227
- eval_res = dispatch_expected_evaluator(output: output, expected: expected, input: nil)
228
- step_results[step_alias] = { passed: eval_res.passed, score: eval_res.score, details: eval_res.details }
229
- all_passed = false unless eval_res.passed
230
- end
231
- end
232
-
233
- # Rebuild CaseResult with step_results metadata
234
- failed_steps = step_results.select { |_, v| !v[:passed] }
235
- failure_details = failed_steps.map { |k, v| "#{k}: #{v[:details]}" }.join("; ")
236
-
237
- CaseResult.new(
238
- name: result.name, input: result.input, output: result.output,
239
- expected: result.expected,
240
- step_status: all_passed ? result.step_status : :step_expectation_failed,
241
- score: all_passed ? result.score : 0.0,
242
- passed: result.passed? && all_passed,
243
- label: all_passed ? result.label : "FAIL",
244
- details: all_passed ? result.details : "step expectations failed: #{failure_details}",
245
- duration_ms: result.duration_ms, cost: result.cost
246
- )
247
- end
248
-
249
- def skipped_result(test_case, reason)
250
- CaseResult.new(
251
- name: test_case.name,
252
- input: test_case.input,
253
- output: nil,
254
- expected: test_case.expected,
255
- step_status: :skipped,
256
- score: 0.0,
257
- passed: false,
258
- label: "SKIP",
259
- details: "skipped: #{reason}"
260
- )
261
- end
262
85
  end
263
86
  end
264
87
  end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ class StepExpectationApplier
7
+ def initialize(expectation_evaluator: ExpectationEvaluator.new)
8
+ @expectation_evaluator = expectation_evaluator
9
+ end
10
+
11
+ def call(result:, run_result:, test_case:)
12
+ return result unless applicable?(test_case, run_result)
13
+
14
+ expectation_results = evaluate_expectations(run_result.outputs_by_step, test_case.step_expectations)
15
+ return result if expectation_results.values.all? { |entry| entry[:passed] }
16
+
17
+ rebuild_result(result, failure_details_for(expectation_results))
18
+ end
19
+
20
+ private
21
+
22
+ def applicable?(test_case, run_result)
23
+ test_case.respond_to?(:step_expectations) &&
24
+ test_case.step_expectations &&
25
+ run_result.respond_to?(:outputs_by_step)
26
+ end
27
+
28
+ def evaluate_expectations(outputs_by_step, expectations)
29
+ expectations.each_with_object({}) do |(step_alias, expected), results|
30
+ output = outputs_by_step[step_alias]
31
+ results[step_alias] = evaluate_single_expectation(output, expected)
32
+ end
33
+ end
34
+
35
+ def evaluate_single_expectation(output, expected)
36
+ return { passed: false, details: "step not executed" } if output.nil?
37
+
38
+ evaluation = @expectation_evaluator.call(output: output, expected: expected, input: nil)
39
+ { passed: evaluation.passed, details: evaluation.details }
40
+ end
41
+
42
+ def failure_details_for(expectation_results)
43
+ expectation_results
44
+ .select { |_, entry| !entry[:passed] }
45
+ .map { |step_alias, entry| "#{step_alias}: #{entry[:details]}" }
46
+ .join("; ")
47
+ end
48
+
49
+ def rebuild_result(result, failure_details)
50
+ CaseResult.new(
51
+ name: result.name,
52
+ input: result.input,
53
+ output: result.output,
54
+ expected: result.expected,
55
+ step_status: :step_expectation_failed,
56
+ score: 0.0,
57
+ passed: false,
58
+ label: "FAIL",
59
+ details: "step expectations failed: #{failure_details}",
60
+ duration_ms: result.duration_ms,
61
+ cost: result.cost
62
+ )
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ class StepResultNormalizer
7
+ def call(result)
8
+ return result if result.respond_to?(:parsed_output)
9
+
10
+ normalize_pipeline_result(result)
11
+ end
12
+
13
+ private
14
+
15
+ def normalize_pipeline_result(result)
16
+ last_result = result.step_results&.last&.dig(:result)
17
+ successful = result.ok?
18
+ trace = result.respond_to?(:trace) ? result.trace : nil
19
+
20
+ PipelineResultAdapter.new(
21
+ status: result.status,
22
+ ok_flag: successful,
23
+ parsed_output: successful ? result.outputs_by_step.values.last : nil,
24
+ validation_errors: validation_errors_for(last_result),
25
+ trace: trace || trace_for(last_result)
26
+ )
27
+ end
28
+
29
+ def validation_errors_for(result)
30
+ result.respond_to?(:validation_errors) ? result.validation_errors : []
31
+ end
32
+
33
+ def trace_for(result)
34
+ result.respond_to?(:trace) ? result.trace : {}
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -8,11 +8,24 @@ require_relative "eval/evaluator/json_includes"
8
8
  require_relative "eval/evaluator/proc_evaluator"
9
9
  require_relative "eval/dataset"
10
10
  require_relative "eval/pipeline_result_adapter"
11
+ require_relative "eval/expectation_evaluator"
11
12
  require_relative "eval/trait_evaluator"
12
13
  require_relative "eval/contract_detail_builder"
14
+ require_relative "eval/case_scorer"
15
+ require_relative "eval/case_result_builder"
16
+ require_relative "eval/step_result_normalizer"
17
+ require_relative "eval/step_expectation_applier"
18
+ require_relative "eval/case_executor"
13
19
  require_relative "eval/runner"
20
+ require_relative "eval/report_stats"
21
+ require_relative "eval/report_presenter"
22
+ require_relative "eval/report_storage"
14
23
  require_relative "eval/report"
15
24
  require_relative "eval/eval_definition"
16
25
  require_relative "eval/model_comparison"
17
26
  require_relative "eval/baseline_diff"
27
+ require_relative "eval/prompt_diff_serializer"
28
+ require_relative "eval/prompt_diff_comparator"
29
+ require_relative "eval/prompt_diff_presenter"
30
+ require_relative "eval/prompt_diff"
18
31
  require_relative "eval/eval_history"
@@ -5,6 +5,20 @@ require "ruby_llm/contract"
5
5
  module RubyLLM
6
6
  module Contract
7
7
  module MinitestHelpers
8
+ # Snapshot adapter before each test so teardown can restore it.
9
+ def setup
10
+ super if defined?(super)
11
+ @_contract_original_adapter = RubyLLM::Contract.configuration.default_adapter
12
+ end
13
+
14
+ # Auto-cleanup: clear overrides AND restore original adapter.
15
+ # Prevents both non-block stub_step and stub_all_steps from leaking.
16
+ def teardown
17
+ RubyLLM::Contract.step_adapter_overrides.clear
18
+ RubyLLM::Contract.configuration.default_adapter = @_contract_original_adapter
19
+ super if defined?(super)
20
+ end
21
+
8
22
  def assert_satisfies_contract(result, msg = nil)
9
23
  assert result.ok?, msg || "Expected step result to satisfy contract, " \
10
24
  "but got status: #{result.status}. Errors: #{result.validation_errors.join(", ")}"
@@ -33,13 +47,113 @@ module RubyLLM
33
47
  report
34
48
  end
35
49
 
36
- def stub_step(step_class, response: nil, responses: nil)
50
+ # Stub a specific step to return a canned response without API calls.
51
+ # Routes per-step — other steps are not affected.
52
+ #
53
+ # stub_step(ClassifyTicket, response: { priority: "high" })
54
+ #
55
+ # Supports an optional block form — the override is removed after the
56
+ # block returns (even if it raises):
57
+ #
58
+ # stub_step(ClassifyTicket, response: data) do
59
+ # result = ClassifyTicket.run("test")
60
+ # end
61
+ # # ClassifyTicket.run no longer stubbed
62
+ #
63
+ def stub_step(step_class, response: nil, responses: nil, &block)
64
+ adapter = if responses
65
+ Adapters::Test.new(responses: responses)
66
+ else
67
+ Adapters::Test.new(response: response)
68
+ end
69
+
70
+ overrides = RubyLLM::Contract.step_adapter_overrides
71
+ previous = overrides[step_class]
72
+ overrides[step_class] = adapter
73
+
74
+ if block
75
+ begin
76
+ yield
77
+ ensure
78
+ if previous
79
+ overrides[step_class] = previous
80
+ else
81
+ overrides.delete(step_class)
82
+ end
83
+ end
84
+ end
85
+ end
86
+
87
+ # Stub multiple steps at once with different responses.
88
+ # Takes a hash of step_class => options. Requires a block.
89
+ #
90
+ # stub_steps(
91
+ # ClassifyTicket => { response: { priority: "high" } },
92
+ # RouteToTeam => { response: { team: "billing" } }
93
+ # ) do
94
+ # result = TicketPipeline.run("test")
95
+ # end
96
+ #
97
+ def stub_steps(stubs, &block)
98
+ raise ArgumentError, "stub_steps requires a block" unless block
99
+
100
+ overrides = RubyLLM::Contract.step_adapter_overrides
101
+ previous = {}
102
+
103
+ stubs.each do |step_class, opts|
104
+ opts = opts.transform_keys(&:to_sym)
105
+ adapter = if opts[:responses]
106
+ Adapters::Test.new(responses: opts[:responses])
107
+ else
108
+ Adapters::Test.new(response: opts[:response])
109
+ end
110
+ previous[step_class] = overrides[step_class]
111
+ overrides[step_class] = adapter
112
+ end
113
+
114
+ begin
115
+ yield
116
+ ensure
117
+ stubs.each_key do |step_class|
118
+ if previous[step_class]
119
+ overrides[step_class] = previous[step_class]
120
+ else
121
+ overrides.delete(step_class)
122
+ end
123
+ end
124
+ end
125
+ end
126
+
127
+ # Set a global test adapter for ALL steps.
128
+ #
129
+ # stub_all_steps(response: { default: true })
130
+ #
131
+ # Supports an optional block form — the previous adapter is restored
132
+ # after the block returns (even if it raises):
133
+ #
134
+ # stub_all_steps(response: { default: true }) do
135
+ # # all steps use test adapter
136
+ # end
137
+ # # original adapter restored
138
+ #
139
+ def stub_all_steps(response: nil, responses: nil, &block)
37
140
  adapter = if responses
38
141
  Adapters::Test.new(responses: responses)
39
142
  else
40
143
  Adapters::Test.new(response: response)
41
144
  end
42
- RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
145
+
146
+ if block
147
+ previous = RubyLLM::Contract.configuration.default_adapter
148
+ begin
149
+ RubyLLM::Contract.configuration.default_adapter = adapter
150
+ yield
151
+ ensure
152
+ RubyLLM::Contract.configuration.default_adapter = previous
153
+ end
154
+ else
155
+ RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
156
+ end
43
157
  end
44
158
  end
45
159
  end
@@ -25,11 +25,24 @@ module RubyLLM
25
25
 
26
26
  # Internal mutable steps list for registration
27
27
  def steps_registry
28
- @steps_registry ||= []
28
+ @steps_registry ||= begin
29
+ inherited_steps =
30
+ if superclass.respond_to?(:steps_registry, true)
31
+ superclass.send(:steps_registry).map(&:dup)
32
+ else
33
+ []
34
+ end
35
+
36
+ inherited_steps
37
+ end
29
38
  end
30
39
 
31
40
  def token_budget(limit = nil)
32
- return @token_budget = limit if limit
41
+ if limit
42
+ raise ArgumentError, "token_budget must be positive, got #{limit}" unless limit.positive?
43
+
44
+ return @token_budget = limit
45
+ end
33
46
 
34
47
  @token_budget
35
48
  end
@@ -7,7 +7,7 @@ module RubyLLM
7
7
  module Contract
8
8
  class RakeTask < ::Rake::TaskLib
9
9
  attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost,
10
- :eval_dirs, :save_baseline, :fail_on_regression
10
+ :eval_dirs, :save_baseline, :fail_on_regression, :track_history
11
11
 
12
12
  def initialize(name = :"ruby_llm_contract:eval", &block)
13
13
  super()
@@ -19,6 +19,7 @@ module RubyLLM
19
19
  @eval_dirs = [] # directories to load eval files from (non-Rails)
20
20
  @save_baseline = false
21
21
  @fail_on_regression = false
22
+ @track_history = false
22
23
  block&.call(self)
23
24
  define_task
24
25
  end
@@ -47,18 +48,23 @@ module RubyLLM
47
48
  suite_cost = 0.0
48
49
 
49
50
  passed_reports = []
51
+ all_reports = []
50
52
 
51
53
  results.each do |host, reports|
52
54
  puts "\n#{host.name || host.to_s}"
53
55
  reports.each_value do |report|
54
56
  report.print_summary
55
57
  suite_cost += report.total_cost
58
+ all_reports << [host, report]
56
59
  report_ok = report_meets_score?(report) && !check_regression(report)
57
60
  gate_passed = false unless report_ok
58
61
  passed_reports << report if report_ok
59
62
  end
60
63
  end
61
64
 
65
+ # Save history BEFORE gating — failures are valuable trend data (ADR-0016 F3)
66
+ save_all_history!(all_reports, context) if @track_history
67
+
62
68
  if @maximum_cost && suite_cost > @maximum_cost
63
69
  abort "\nEval suite FAILED: total cost $#{format("%.4f", suite_cost)} " \
64
70
  "exceeds budget $#{format("%.4f", @maximum_cost)}"
@@ -68,6 +74,7 @@ module RubyLLM
68
74
 
69
75
  # Save baselines only after ALL gates pass
70
76
  passed_reports.each { |r| save_baseline!(r) } if @save_baseline
77
+
71
78
  puts "\nAll evals passed."
72
79
  end
73
80
  end
@@ -98,6 +105,18 @@ module RubyLLM
98
105
  puts " Baseline saved: #{path}"
99
106
  end
100
107
 
108
+ def save_all_history!(host_reports, context)
109
+ context_model = (context[:model] || context["model"]) if context.is_a?(Hash)
110
+ host_reports.each do |host, report|
111
+ # Model priority: context > step DSL > default config
112
+ model = context_model
113
+ model ||= (host.model if host.respond_to?(:model))
114
+ model ||= RubyLLM::Contract.configuration.default_model rescue nil
115
+ path = report.save_history!(model: model)
116
+ puts " History saved: #{path}"
117
+ end
118
+ end
119
+
101
120
  def task_prerequisites
102
121
  defined?(::Rails) ? [:environment] : []
103
122
  end