ruby_llm-contract 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/.rubycritic.yml +8 -0
  3. data/.simplecov +22 -0
  4. data/CHANGELOG.md +19 -0
  5. data/Gemfile +2 -0
  6. data/Gemfile.lock +104 -2
  7. data/README.md +42 -2
  8. data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
  9. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
  10. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
  11. data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
  12. data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
  13. data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
  14. data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
  15. data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
  16. data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
  17. data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
  18. data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
  19. data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
  20. data/lib/ruby_llm/contract/contract/validator.rb +9 -0
  21. data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
  22. data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
  23. data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
  24. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
  25. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
  26. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
  27. data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
  28. data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
  29. data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
  30. data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
  31. data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
  32. data/lib/ruby_llm/contract/eval/report.rb +19 -191
  33. data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
  34. data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
  35. data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
  36. data/lib/ruby_llm/contract/eval/runner.rb +30 -207
  37. data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
  38. data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
  39. data/lib/ruby_llm/contract/eval.rb +13 -0
  40. data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
  41. data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
  42. data/lib/ruby_llm/contract/rspec.rb +5 -0
  43. data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
  44. data/lib/ruby_llm/contract/step/base.rb +93 -38
  45. data/lib/ruby_llm/contract/step/dsl.rb +10 -0
  46. data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
  47. data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
  48. data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
  49. data/lib/ruby_llm/contract/step/result.rb +3 -2
  50. data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
  51. data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
  52. data/lib/ruby_llm/contract/step/runner.rb +46 -85
  53. data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
  54. data/lib/ruby_llm/contract/step.rb +5 -0
  55. data/lib/ruby_llm/contract/version.rb +1 -1
  56. metadata +28 -1
@@ -4,8 +4,6 @@ module RubyLLM
4
4
  module Contract
5
5
  module Eval
6
6
  class Runner
7
- include TraitEvaluator
8
- include ContractDetailBuilder
9
7
  include Concerns::ContextHelpers
10
8
 
11
9
  def self.run(step:, dataset:, context: {}, concurrency: nil)
@@ -20,17 +18,35 @@ module RubyLLM
20
18
  end
21
19
 
22
20
  def run
23
- results = if @concurrency && @concurrency > 1
24
- run_concurrent
25
- else
26
- @dataset.cases.map { |test_case| evaluate_case(test_case) }
27
- end
28
- step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
29
- Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
21
+ Report.new(
22
+ dataset_name: @dataset.name,
23
+ results: collected_results,
24
+ step_name: step_name
25
+ )
30
26
  end
31
27
 
32
28
  private
33
29
 
30
+ def collected_results
31
+ concurrent? ? run_concurrent : run_serial
32
+ end
33
+
34
+ def run_serial
35
+ @dataset.cases.map { |test_case| case_executor.call(test_case: test_case, context: @context) }
36
+ end
37
+
38
+ def concurrent?
39
+ @concurrency && @concurrency > 1
40
+ end
41
+
42
+ def step_name
43
+ @step.respond_to?(:name) ? @step.name : @step.to_s
44
+ end
45
+
46
+ def case_executor
47
+ @case_executor ||= CaseExecutor.new(step: @step)
48
+ end
49
+
34
50
  def run_concurrent
35
51
  require "concurrent"
36
52
  pool = Concurrent::FixedThreadPool.new(@concurrency)
@@ -39,10 +55,10 @@ module RubyLLM
39
55
  # gets a single-response adapter with its own response (by index).
40
56
  per_case_contexts = build_per_case_contexts
41
57
 
42
- futures = @dataset.cases.each_with_index.map do |test_case, i|
43
- ctx = per_case_contexts[i]
58
+ futures = @dataset.cases.each_with_index.map do |test_case, index|
59
+ case_context = per_case_contexts[index]
44
60
  Concurrent::Future.execute(executor: pool) do
45
- evaluate_case_with_context(test_case, ctx)
61
+ case_executor.call(test_case: test_case, context: case_context)
46
62
  end
47
63
  end
48
64
  futures.map(&:value!)
@@ -55,10 +71,10 @@ module RubyLLM
55
71
  adapter = @context[:adapter]
56
72
  responses = adapter.respond_to?(:responses_array) ? adapter.responses_array : nil
57
73
 
58
- @dataset.cases.each_with_index.map do |_, i|
74
+ @dataset.cases.each_with_index.map do |_, index|
59
75
  if responses
60
76
  # Give each case its own single-response adapter
61
- response = responses[i] || responses.last
77
+ response = responses[index] || responses.last
62
78
  per_case_adapter = Adapters::Test.new(response: response)
63
79
  @context.merge(adapter: per_case_adapter)
64
80
  else
@@ -66,199 +82,6 @@ module RubyLLM
66
82
  end
67
83
  end
68
84
  end
69
-
70
- def evaluate_case_with_context(test_case, context)
71
- run_result = @step.run(test_case.input, context: context)
72
- step_result = normalize_result(run_result)
73
- eval_result = dispatch_evaluation(step_result, test_case)
74
-
75
- result = build_case_result(test_case, step_result, eval_result)
76
-
77
- if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
78
- run_result.respond_to?(:outputs_by_step)
79
- evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
80
- else
81
- result
82
- end
83
- rescue RubyLLM::Contract::Error => e
84
- raise unless e.message.include?("No adapter configured")
85
-
86
- skipped_result(test_case, e.message)
87
- end
88
-
89
- def evaluate_case(test_case)
90
- run_result = @step.run(test_case.input, context: @context)
91
- step_result = normalize_result(run_result)
92
- eval_result = dispatch_evaluation(step_result, test_case)
93
-
94
- result = build_case_result(test_case, step_result, eval_result)
95
-
96
- # Pipeline per-step evaluation
97
- if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
98
- run_result.respond_to?(:outputs_by_step)
99
- evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
100
- else
101
- result
102
- end
103
- rescue RubyLLM::Contract::Error => e
104
- raise unless e.message.include?("No adapter configured")
105
-
106
- skipped_result(test_case, e.message)
107
- end
108
-
109
- def build_case_result(test_case, step_result, eval_result)
110
- trace = step_result.respond_to?(:trace) ? step_result.trace : nil
111
- CaseResult.new(
112
- name: test_case.name,
113
- input: test_case.input,
114
- output: step_result.parsed_output,
115
- expected: test_case.expected,
116
- step_status: step_result.status,
117
- score: eval_result.score,
118
- passed: eval_result.passed,
119
- label: eval_result.label,
120
- details: eval_result.details,
121
- duration_ms: extract_latency(trace),
122
- cost: extract_cost(trace)
123
- )
124
- end
125
-
126
- def extract_latency(trace)
127
- return nil unless trace
128
-
129
- # Pipeline::Trace uses total_latency_ms, Step::Trace uses latency_ms
130
- if trace.respond_to?(:total_latency_ms)
131
- trace.total_latency_ms
132
- else
133
- trace[:latency_ms]
134
- end
135
- end
136
-
137
- def extract_cost(trace)
138
- return nil unless trace
139
-
140
- # Pipeline::Trace uses total_cost, Step::Trace uses cost
141
- if trace.respond_to?(:total_cost)
142
- trace.total_cost
143
- else
144
- trace[:cost]
145
- end
146
- end
147
-
148
- def dispatch_evaluation(step_result, test_case)
149
- return contract_failure(step_result) unless step_result.ok?
150
-
151
- if test_case.evaluator
152
- evaluate_with_custom(step_result, test_case)
153
- elsif test_case.expected_traits
154
- evaluate_traits(step_result, test_case)
155
- elsif !test_case.expected.nil?
156
- evaluate_expected(step_result, test_case)
157
- else
158
- evaluate_contract_only
159
- end
160
- end
161
-
162
- def normalize_result(result)
163
- return result if result.respond_to?(:parsed_output)
164
-
165
- normalize_pipeline_result(result)
166
- end
167
-
168
- def normalize_pipeline_result(result)
169
- last_result = result.step_results&.last&.dig(:result)
170
- is_ok = result.ok?
171
- pipeline_trace = result.respond_to?(:trace) ? result.trace : nil
172
-
173
- PipelineResultAdapter.new(
174
- status: result.status,
175
- ok_flag: is_ok,
176
- parsed_output: is_ok ? result.outputs_by_step.values.last : nil,
177
- validation_errors: last_result.respond_to?(:validation_errors) ? last_result.validation_errors : [],
178
- trace: pipeline_trace || (last_result.respond_to?(:trace) ? last_result.trace : {})
179
- )
180
- end
181
-
182
- def evaluate_expected(step_result, test_case)
183
- dispatch_expected_evaluator(
184
- output: step_result.parsed_output,
185
- expected: test_case.expected,
186
- input: test_case.input
187
- )
188
- end
189
-
190
- def dispatch_expected_evaluator(output:, expected:, input:)
191
- if expected.is_a?(Hash)
192
- Evaluator::JsonIncludes.new.call(output: output, expected: expected, input: input)
193
- elsif expected.is_a?(::Regexp)
194
- Evaluator::Regex.new(expected).call(output: output, input: input)
195
- else
196
- Evaluator::Exact.new.call(output: output, expected: expected, input: input)
197
- end
198
- end
199
-
200
- def evaluate_with_custom(step_result, test_case)
201
- evaluator = test_case.evaluator
202
- evaluator = Evaluator::ProcEvaluator.new(evaluator) if evaluator.is_a?(::Proc)
203
- evaluator.call(output: step_result.parsed_output, expected: test_case.expected, input: test_case.input)
204
- end
205
-
206
- def evaluate_contract_only
207
- EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
208
- end
209
-
210
- def contract_failure(step_result)
211
- EvaluationResult.new(
212
- score: 0.0, passed: false,
213
- details: "step failed: #{step_result.status} — #{step_result.validation_errors.join(", ")}"
214
- )
215
- end
216
-
217
- def evaluate_step_expectations(result, outputs_by_step, expectations)
218
- step_results = {}
219
- all_passed = true
220
-
221
- expectations.each do |step_alias, expected|
222
- output = outputs_by_step[step_alias]
223
- if output.nil?
224
- step_results[step_alias] = { passed: false, details: "step not executed" }
225
- all_passed = false
226
- else
227
- eval_res = dispatch_expected_evaluator(output: output, expected: expected, input: nil)
228
- step_results[step_alias] = { passed: eval_res.passed, score: eval_res.score, details: eval_res.details }
229
- all_passed = false unless eval_res.passed
230
- end
231
- end
232
-
233
- # Rebuild CaseResult with step_results metadata
234
- failed_steps = step_results.select { |_, v| !v[:passed] }
235
- failure_details = failed_steps.map { |k, v| "#{k}: #{v[:details]}" }.join("; ")
236
-
237
- CaseResult.new(
238
- name: result.name, input: result.input, output: result.output,
239
- expected: result.expected,
240
- step_status: all_passed ? result.step_status : :step_expectation_failed,
241
- score: all_passed ? result.score : 0.0,
242
- passed: result.passed? && all_passed,
243
- label: all_passed ? result.label : "FAIL",
244
- details: all_passed ? result.details : "step expectations failed: #{failure_details}",
245
- duration_ms: result.duration_ms, cost: result.cost
246
- )
247
- end
248
-
249
- def skipped_result(test_case, reason)
250
- CaseResult.new(
251
- name: test_case.name,
252
- input: test_case.input,
253
- output: nil,
254
- expected: test_case.expected,
255
- step_status: :skipped,
256
- score: 0.0,
257
- passed: false,
258
- label: "SKIP",
259
- details: "skipped: #{reason}"
260
- )
261
- end
262
85
  end
263
86
  end
264
87
  end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ class StepExpectationApplier
7
+ def initialize(expectation_evaluator: ExpectationEvaluator.new)
8
+ @expectation_evaluator = expectation_evaluator
9
+ end
10
+
11
+ def call(result:, run_result:, test_case:)
12
+ return result unless applicable?(test_case, run_result)
13
+
14
+ expectation_results = evaluate_expectations(run_result.outputs_by_step, test_case.step_expectations)
15
+ return result if expectation_results.values.all? { |entry| entry[:passed] }
16
+
17
+ rebuild_result(result, failure_details_for(expectation_results))
18
+ end
19
+
20
+ private
21
+
22
+ def applicable?(test_case, run_result)
23
+ test_case.respond_to?(:step_expectations) &&
24
+ test_case.step_expectations &&
25
+ run_result.respond_to?(:outputs_by_step)
26
+ end
27
+
28
+ def evaluate_expectations(outputs_by_step, expectations)
29
+ expectations.each_with_object({}) do |(step_alias, expected), results|
30
+ output = outputs_by_step[step_alias]
31
+ results[step_alias] = evaluate_single_expectation(output, expected)
32
+ end
33
+ end
34
+
35
+ def evaluate_single_expectation(output, expected)
36
+ return { passed: false, details: "step not executed" } if output.nil?
37
+
38
+ evaluation = @expectation_evaluator.call(output: output, expected: expected, input: nil)
39
+ { passed: evaluation.passed, details: evaluation.details }
40
+ end
41
+
42
+ def failure_details_for(expectation_results)
43
+ expectation_results
44
+ .select { |_, entry| !entry[:passed] }
45
+ .map { |step_alias, entry| "#{step_alias}: #{entry[:details]}" }
46
+ .join("; ")
47
+ end
48
+
49
+ def rebuild_result(result, failure_details)
50
+ CaseResult.new(
51
+ name: result.name,
52
+ input: result.input,
53
+ output: result.output,
54
+ expected: result.expected,
55
+ step_status: :step_expectation_failed,
56
+ score: 0.0,
57
+ passed: false,
58
+ label: "FAIL",
59
+ details: "step expectations failed: #{failure_details}",
60
+ duration_ms: result.duration_ms,
61
+ cost: result.cost
62
+ )
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ class StepResultNormalizer
7
+ def call(result)
8
+ return result if result.respond_to?(:parsed_output)
9
+
10
+ normalize_pipeline_result(result)
11
+ end
12
+
13
+ private
14
+
15
+ def normalize_pipeline_result(result)
16
+ last_result = result.step_results&.last&.dig(:result)
17
+ successful = result.ok?
18
+ trace = result.respond_to?(:trace) ? result.trace : nil
19
+
20
+ PipelineResultAdapter.new(
21
+ status: result.status,
22
+ ok_flag: successful,
23
+ parsed_output: successful ? result.outputs_by_step.values.last : nil,
24
+ validation_errors: validation_errors_for(last_result),
25
+ trace: trace || trace_for(last_result)
26
+ )
27
+ end
28
+
29
+ def validation_errors_for(result)
30
+ result.respond_to?(:validation_errors) ? result.validation_errors : []
31
+ end
32
+
33
+ def trace_for(result)
34
+ result.respond_to?(:trace) ? result.trace : {}
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -8,11 +8,24 @@ require_relative "eval/evaluator/json_includes"
8
8
  require_relative "eval/evaluator/proc_evaluator"
9
9
  require_relative "eval/dataset"
10
10
  require_relative "eval/pipeline_result_adapter"
11
+ require_relative "eval/expectation_evaluator"
11
12
  require_relative "eval/trait_evaluator"
12
13
  require_relative "eval/contract_detail_builder"
14
+ require_relative "eval/case_scorer"
15
+ require_relative "eval/case_result_builder"
16
+ require_relative "eval/step_result_normalizer"
17
+ require_relative "eval/step_expectation_applier"
18
+ require_relative "eval/case_executor"
13
19
  require_relative "eval/runner"
20
+ require_relative "eval/report_stats"
21
+ require_relative "eval/report_presenter"
22
+ require_relative "eval/report_storage"
14
23
  require_relative "eval/report"
15
24
  require_relative "eval/eval_definition"
16
25
  require_relative "eval/model_comparison"
17
26
  require_relative "eval/baseline_diff"
27
+ require_relative "eval/prompt_diff_serializer"
28
+ require_relative "eval/prompt_diff_comparator"
29
+ require_relative "eval/prompt_diff_presenter"
30
+ require_relative "eval/prompt_diff"
18
31
  require_relative "eval/eval_history"
@@ -25,7 +25,16 @@ module RubyLLM
25
25
 
26
26
  # Internal mutable steps list for registration
27
27
  def steps_registry
28
- @steps_registry ||= []
28
+ @steps_registry ||= begin
29
+ inherited_steps =
30
+ if superclass.respond_to?(:steps_registry, true)
31
+ superclass.send(:steps_registry).map(&:dup)
32
+ else
33
+ []
34
+ end
35
+
36
+ inherited_steps
37
+ end
29
38
  end
30
39
 
31
40
  def token_budget(limit = nil)
@@ -68,15 +68,28 @@ RSpec::Matchers.define :pass_eval do |eval_name|
68
68
  @check_regressions = true
69
69
  end
70
70
 
71
+ chain :compared_with do |other_step|
72
+ @comparison_step = other_step
73
+ @check_regressions = true # compared_with implies regression check
74
+ end
75
+
71
76
  match do |step_or_pipeline|
72
77
  @eval_name = eval_name
73
78
  @context ||= {}
74
79
  @minimum_score ||= nil
75
80
  @maximum_cost ||= nil
76
81
  @check_regressions ||= false
82
+ @comparison_step ||= nil
77
83
  @error = nil
78
84
  @diff = nil
79
- @report = step_or_pipeline.run_eval(eval_name, context: @context)
85
+ @prompt_diff = nil
86
+
87
+ if @comparison_step && @check_regressions
88
+ @prompt_diff = step_or_pipeline.compare_with(@comparison_step, eval: eval_name, context: @context)
89
+ @report = @prompt_diff.candidate_report
90
+ else
91
+ @report = step_or_pipeline.run_eval(eval_name, context: @context)
92
+ end
80
93
 
81
94
  score_ok = if @minimum_score
82
95
  @report.score >= @minimum_score
@@ -86,7 +99,9 @@ RSpec::Matchers.define :pass_eval do |eval_name|
86
99
 
87
100
  cost_ok = @maximum_cost ? @report.total_cost <= @maximum_cost : true
88
101
 
89
- regression_ok = if @check_regressions && @report.baseline_exists?
102
+ regression_ok = if @prompt_diff
103
+ @prompt_diff.safe_to_switch?
104
+ elsif @check_regressions && @report.baseline_exists?
90
105
  @diff = @report.compare_with_baseline
91
106
  !@diff.regressed?
92
107
  else
@@ -100,11 +115,67 @@ RSpec::Matchers.define :pass_eval do |eval_name|
100
115
  end
101
116
 
102
117
  failure_message do
118
+ if @prompt_diff && !@prompt_diff.safe_to_switch?
119
+ msg = "expected #{@eval_name} eval to be safe to switch from baseline prompt\n"
120
+
121
+ # Check empty sides first — most fundamental problem
122
+ bl_empty = @prompt_diff.baseline_empty?
123
+ cd_empty = @prompt_diff.candidate_empty?
124
+ if bl_empty || cd_empty
125
+ msg += " One side has no evaluated cases (all skipped or no adapter?)\n"
126
+ if sample_response_only_compare?
127
+ msg += " compare_with ignores sample_response; pass model: or with_context(adapter: ...)\n"
128
+ end
129
+ msg += " Candidate score: #{@prompt_diff.candidate_score}, Baseline score: #{@prompt_diff.baseline_score}"
130
+ next msg
131
+ end
132
+
133
+ # Check dataset comparability — names, inputs, AND expected must match
134
+ unless @prompt_diff.cases_comparable?
135
+ unless @prompt_diff.case_names_match?
136
+ mm = @prompt_diff.mismatched_cases
137
+ msg += " Case set mismatch — candidate and baseline must have identical cases:\n"
138
+ mm[:only_in_baseline].each { |n| msg += " only in baseline: #{n}\n" }
139
+ mm[:only_in_candidate].each { |n| msg += " only in candidate: #{n}\n" }
140
+ end
141
+ @prompt_diff.input_mismatches.each do |m|
142
+ msg += " Input mismatch for '#{m[:case]}' — same name but different inputs\n"
143
+ end
144
+ @prompt_diff.expected_mismatches.each do |m|
145
+ msg += " Expected mismatch for '#{m[:case]}' — same name/input but different expected values\n"
146
+ end
147
+ next msg
148
+ end
149
+
150
+ # Check per-case score regressions (even if global average is flat)
151
+ if @prompt_diff.score_regressions.any?
152
+ msg += " Per-case score regressions (#{@prompt_diff.score_regressions.length}):\n"
153
+ @prompt_diff.score_regressions.each do |r|
154
+ msg += " #{r[:case]}: #{r[:baseline_score]} -> #{r[:candidate_score]} (#{r[:delta]})\n"
155
+ end
156
+ msg += " Score delta: #{@prompt_diff.score_delta}"
157
+ next msg
158
+ end
159
+
160
+ # Check pass/fail regressions and removed cases
161
+ removed = @prompt_diff.removed_passing_cases
162
+ reg_count = @prompt_diff.regressions.length + removed.length
163
+ msg += " Found #{reg_count} regression(s):\n"
164
+ @prompt_diff.regressions.each do |r|
165
+ msg += " #{r[:case]}: was PASS, now FAIL -- #{r[:detail]}\n"
166
+ end
167
+ removed.each do |name|
168
+ msg += " #{name}: REMOVED (was passing in baseline)\n"
169
+ end
170
+ msg += " Score delta: #{@prompt_diff.score_delta}"
171
+ next msg
172
+ end
173
+
103
174
  msg = format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
104
175
  if @diff&.regressed?
105
176
  msg += "\n\nRegressions from baseline:\n"
106
177
  @diff.regressions.each do |r|
107
- msg += " #{r[:case]}: was PASS, now FAIL #{r[:detail]}\n"
178
+ msg += " #{r[:case]}: was PASS, now FAIL -- #{r[:detail]}\n"
108
179
  end
109
180
  msg += " Score delta: #{@diff.score_delta}"
110
181
  end
@@ -114,4 +185,14 @@ RSpec::Matchers.define :pass_eval do |eval_name|
114
185
  failure_message_when_negated do
115
186
  "expected #{@eval_name} eval NOT to pass, but it passed with score: #{@report.score.round(2)}"
116
187
  end
188
+
189
+ def sample_response_only_compare?
190
+ return false unless @comparison_step
191
+ return false if @context[:adapter] || @context[:model]
192
+
193
+ defn = @comparison_step.send(:all_eval_definitions)[@eval_name.to_s]
194
+ defn&.build_adapter
195
+ rescue StandardError
196
+ false
197
+ end
117
198
  end
@@ -13,11 +13,16 @@ RSpec.configure do |config|
13
13
  # Prevents non-block stub_all_steps from leaking between examples.
14
14
  config.around(:each) do |example|
15
15
  original_adapter = RubyLLM::Contract.configuration.default_adapter
16
+ original_logger = RubyLLM::Contract.configuration.logger
17
+ original_eval_hosts = RubyLLM::Contract.eval_hosts.dup
16
18
  original_overrides = RubyLLM::Contract.step_adapter_overrides.dup
17
19
  begin
18
20
  example.run
19
21
  ensure
20
22
  RubyLLM::Contract.configuration.default_adapter = original_adapter
23
+ RubyLLM::Contract.configuration.logger = original_logger
24
+ RubyLLM::Contract.reset_eval_hosts!
25
+ RubyLLM::Contract.eval_hosts.concat(original_eval_hosts)
21
26
  RubyLLM::Contract.step_adapter_overrides.replace(original_overrides)
22
27
  end
23
28
  end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Step
6
+ class AdapterCaller
7
+ def initialize(adapter:, adapter_options:)
8
+ @adapter = adapter
9
+ @adapter_options = adapter_options
10
+ end
11
+
12
+ def call(messages)
13
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
14
+ response = @adapter.call(messages: messages, **@adapter_options)
15
+ latency_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
16
+ [response, latency_ms]
17
+ rescue StandardError => error
18
+ [Result.new(status: :adapter_error, raw_output: nil, parsed_output: nil, validation_errors: [error.message]), 0]
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end