ruby_llm-contract 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +19 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +42 -2
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +5 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +93 -38
- data/lib/ruby_llm/contract/step/dsl.rb +10 -0
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +46 -85
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +28 -1
|
@@ -4,8 +4,6 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Eval
|
|
6
6
|
class Runner
|
|
7
|
-
include TraitEvaluator
|
|
8
|
-
include ContractDetailBuilder
|
|
9
7
|
include Concerns::ContextHelpers
|
|
10
8
|
|
|
11
9
|
def self.run(step:, dataset:, context: {}, concurrency: nil)
|
|
@@ -20,17 +18,35 @@ module RubyLLM
|
|
|
20
18
|
end
|
|
21
19
|
|
|
22
20
|
def run
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
|
|
29
|
-
Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
|
|
21
|
+
Report.new(
|
|
22
|
+
dataset_name: @dataset.name,
|
|
23
|
+
results: collected_results,
|
|
24
|
+
step_name: step_name
|
|
25
|
+
)
|
|
30
26
|
end
|
|
31
27
|
|
|
32
28
|
private
|
|
33
29
|
|
|
30
|
+
def collected_results
|
|
31
|
+
concurrent? ? run_concurrent : run_serial
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def run_serial
|
|
35
|
+
@dataset.cases.map { |test_case| case_executor.call(test_case: test_case, context: @context) }
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def concurrent?
|
|
39
|
+
@concurrency && @concurrency > 1
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def step_name
|
|
43
|
+
@step.respond_to?(:name) ? @step.name : @step.to_s
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def case_executor
|
|
47
|
+
@case_executor ||= CaseExecutor.new(step: @step)
|
|
48
|
+
end
|
|
49
|
+
|
|
34
50
|
def run_concurrent
|
|
35
51
|
require "concurrent"
|
|
36
52
|
pool = Concurrent::FixedThreadPool.new(@concurrency)
|
|
@@ -39,10 +55,10 @@ module RubyLLM
|
|
|
39
55
|
# gets a single-response adapter with its own response (by index).
|
|
40
56
|
per_case_contexts = build_per_case_contexts
|
|
41
57
|
|
|
42
|
-
futures = @dataset.cases.each_with_index.map do |test_case,
|
|
43
|
-
|
|
58
|
+
futures = @dataset.cases.each_with_index.map do |test_case, index|
|
|
59
|
+
case_context = per_case_contexts[index]
|
|
44
60
|
Concurrent::Future.execute(executor: pool) do
|
|
45
|
-
|
|
61
|
+
case_executor.call(test_case: test_case, context: case_context)
|
|
46
62
|
end
|
|
47
63
|
end
|
|
48
64
|
futures.map(&:value!)
|
|
@@ -55,10 +71,10 @@ module RubyLLM
|
|
|
55
71
|
adapter = @context[:adapter]
|
|
56
72
|
responses = adapter.respond_to?(:responses_array) ? adapter.responses_array : nil
|
|
57
73
|
|
|
58
|
-
@dataset.cases.each_with_index.map do |_,
|
|
74
|
+
@dataset.cases.each_with_index.map do |_, index|
|
|
59
75
|
if responses
|
|
60
76
|
# Give each case its own single-response adapter
|
|
61
|
-
response = responses[
|
|
77
|
+
response = responses[index] || responses.last
|
|
62
78
|
per_case_adapter = Adapters::Test.new(response: response)
|
|
63
79
|
@context.merge(adapter: per_case_adapter)
|
|
64
80
|
else
|
|
@@ -66,199 +82,6 @@ module RubyLLM
|
|
|
66
82
|
end
|
|
67
83
|
end
|
|
68
84
|
end
|
|
69
|
-
|
|
70
|
-
def evaluate_case_with_context(test_case, context)
|
|
71
|
-
run_result = @step.run(test_case.input, context: context)
|
|
72
|
-
step_result = normalize_result(run_result)
|
|
73
|
-
eval_result = dispatch_evaluation(step_result, test_case)
|
|
74
|
-
|
|
75
|
-
result = build_case_result(test_case, step_result, eval_result)
|
|
76
|
-
|
|
77
|
-
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
78
|
-
run_result.respond_to?(:outputs_by_step)
|
|
79
|
-
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
80
|
-
else
|
|
81
|
-
result
|
|
82
|
-
end
|
|
83
|
-
rescue RubyLLM::Contract::Error => e
|
|
84
|
-
raise unless e.message.include?("No adapter configured")
|
|
85
|
-
|
|
86
|
-
skipped_result(test_case, e.message)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
def evaluate_case(test_case)
|
|
90
|
-
run_result = @step.run(test_case.input, context: @context)
|
|
91
|
-
step_result = normalize_result(run_result)
|
|
92
|
-
eval_result = dispatch_evaluation(step_result, test_case)
|
|
93
|
-
|
|
94
|
-
result = build_case_result(test_case, step_result, eval_result)
|
|
95
|
-
|
|
96
|
-
# Pipeline per-step evaluation
|
|
97
|
-
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
98
|
-
run_result.respond_to?(:outputs_by_step)
|
|
99
|
-
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
100
|
-
else
|
|
101
|
-
result
|
|
102
|
-
end
|
|
103
|
-
rescue RubyLLM::Contract::Error => e
|
|
104
|
-
raise unless e.message.include?("No adapter configured")
|
|
105
|
-
|
|
106
|
-
skipped_result(test_case, e.message)
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
def build_case_result(test_case, step_result, eval_result)
|
|
110
|
-
trace = step_result.respond_to?(:trace) ? step_result.trace : nil
|
|
111
|
-
CaseResult.new(
|
|
112
|
-
name: test_case.name,
|
|
113
|
-
input: test_case.input,
|
|
114
|
-
output: step_result.parsed_output,
|
|
115
|
-
expected: test_case.expected,
|
|
116
|
-
step_status: step_result.status,
|
|
117
|
-
score: eval_result.score,
|
|
118
|
-
passed: eval_result.passed,
|
|
119
|
-
label: eval_result.label,
|
|
120
|
-
details: eval_result.details,
|
|
121
|
-
duration_ms: extract_latency(trace),
|
|
122
|
-
cost: extract_cost(trace)
|
|
123
|
-
)
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
def extract_latency(trace)
|
|
127
|
-
return nil unless trace
|
|
128
|
-
|
|
129
|
-
# Pipeline::Trace uses total_latency_ms, Step::Trace uses latency_ms
|
|
130
|
-
if trace.respond_to?(:total_latency_ms)
|
|
131
|
-
trace.total_latency_ms
|
|
132
|
-
else
|
|
133
|
-
trace[:latency_ms]
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def extract_cost(trace)
|
|
138
|
-
return nil unless trace
|
|
139
|
-
|
|
140
|
-
# Pipeline::Trace uses total_cost, Step::Trace uses cost
|
|
141
|
-
if trace.respond_to?(:total_cost)
|
|
142
|
-
trace.total_cost
|
|
143
|
-
else
|
|
144
|
-
trace[:cost]
|
|
145
|
-
end
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
def dispatch_evaluation(step_result, test_case)
|
|
149
|
-
return contract_failure(step_result) unless step_result.ok?
|
|
150
|
-
|
|
151
|
-
if test_case.evaluator
|
|
152
|
-
evaluate_with_custom(step_result, test_case)
|
|
153
|
-
elsif test_case.expected_traits
|
|
154
|
-
evaluate_traits(step_result, test_case)
|
|
155
|
-
elsif !test_case.expected.nil?
|
|
156
|
-
evaluate_expected(step_result, test_case)
|
|
157
|
-
else
|
|
158
|
-
evaluate_contract_only
|
|
159
|
-
end
|
|
160
|
-
end
|
|
161
|
-
|
|
162
|
-
def normalize_result(result)
|
|
163
|
-
return result if result.respond_to?(:parsed_output)
|
|
164
|
-
|
|
165
|
-
normalize_pipeline_result(result)
|
|
166
|
-
end
|
|
167
|
-
|
|
168
|
-
def normalize_pipeline_result(result)
|
|
169
|
-
last_result = result.step_results&.last&.dig(:result)
|
|
170
|
-
is_ok = result.ok?
|
|
171
|
-
pipeline_trace = result.respond_to?(:trace) ? result.trace : nil
|
|
172
|
-
|
|
173
|
-
PipelineResultAdapter.new(
|
|
174
|
-
status: result.status,
|
|
175
|
-
ok_flag: is_ok,
|
|
176
|
-
parsed_output: is_ok ? result.outputs_by_step.values.last : nil,
|
|
177
|
-
validation_errors: last_result.respond_to?(:validation_errors) ? last_result.validation_errors : [],
|
|
178
|
-
trace: pipeline_trace || (last_result.respond_to?(:trace) ? last_result.trace : {})
|
|
179
|
-
)
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
def evaluate_expected(step_result, test_case)
|
|
183
|
-
dispatch_expected_evaluator(
|
|
184
|
-
output: step_result.parsed_output,
|
|
185
|
-
expected: test_case.expected,
|
|
186
|
-
input: test_case.input
|
|
187
|
-
)
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
def dispatch_expected_evaluator(output:, expected:, input:)
|
|
191
|
-
if expected.is_a?(Hash)
|
|
192
|
-
Evaluator::JsonIncludes.new.call(output: output, expected: expected, input: input)
|
|
193
|
-
elsif expected.is_a?(::Regexp)
|
|
194
|
-
Evaluator::Regex.new(expected).call(output: output, input: input)
|
|
195
|
-
else
|
|
196
|
-
Evaluator::Exact.new.call(output: output, expected: expected, input: input)
|
|
197
|
-
end
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
def evaluate_with_custom(step_result, test_case)
|
|
201
|
-
evaluator = test_case.evaluator
|
|
202
|
-
evaluator = Evaluator::ProcEvaluator.new(evaluator) if evaluator.is_a?(::Proc)
|
|
203
|
-
evaluator.call(output: step_result.parsed_output, expected: test_case.expected, input: test_case.input)
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
def evaluate_contract_only
|
|
207
|
-
EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
|
|
208
|
-
end
|
|
209
|
-
|
|
210
|
-
def contract_failure(step_result)
|
|
211
|
-
EvaluationResult.new(
|
|
212
|
-
score: 0.0, passed: false,
|
|
213
|
-
details: "step failed: #{step_result.status} — #{step_result.validation_errors.join(", ")}"
|
|
214
|
-
)
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
def evaluate_step_expectations(result, outputs_by_step, expectations)
|
|
218
|
-
step_results = {}
|
|
219
|
-
all_passed = true
|
|
220
|
-
|
|
221
|
-
expectations.each do |step_alias, expected|
|
|
222
|
-
output = outputs_by_step[step_alias]
|
|
223
|
-
if output.nil?
|
|
224
|
-
step_results[step_alias] = { passed: false, details: "step not executed" }
|
|
225
|
-
all_passed = false
|
|
226
|
-
else
|
|
227
|
-
eval_res = dispatch_expected_evaluator(output: output, expected: expected, input: nil)
|
|
228
|
-
step_results[step_alias] = { passed: eval_res.passed, score: eval_res.score, details: eval_res.details }
|
|
229
|
-
all_passed = false unless eval_res.passed
|
|
230
|
-
end
|
|
231
|
-
end
|
|
232
|
-
|
|
233
|
-
# Rebuild CaseResult with step_results metadata
|
|
234
|
-
failed_steps = step_results.select { |_, v| !v[:passed] }
|
|
235
|
-
failure_details = failed_steps.map { |k, v| "#{k}: #{v[:details]}" }.join("; ")
|
|
236
|
-
|
|
237
|
-
CaseResult.new(
|
|
238
|
-
name: result.name, input: result.input, output: result.output,
|
|
239
|
-
expected: result.expected,
|
|
240
|
-
step_status: all_passed ? result.step_status : :step_expectation_failed,
|
|
241
|
-
score: all_passed ? result.score : 0.0,
|
|
242
|
-
passed: result.passed? && all_passed,
|
|
243
|
-
label: all_passed ? result.label : "FAIL",
|
|
244
|
-
details: all_passed ? result.details : "step expectations failed: #{failure_details}",
|
|
245
|
-
duration_ms: result.duration_ms, cost: result.cost
|
|
246
|
-
)
|
|
247
|
-
end
|
|
248
|
-
|
|
249
|
-
def skipped_result(test_case, reason)
|
|
250
|
-
CaseResult.new(
|
|
251
|
-
name: test_case.name,
|
|
252
|
-
input: test_case.input,
|
|
253
|
-
output: nil,
|
|
254
|
-
expected: test_case.expected,
|
|
255
|
-
step_status: :skipped,
|
|
256
|
-
score: 0.0,
|
|
257
|
-
passed: false,
|
|
258
|
-
label: "SKIP",
|
|
259
|
-
details: "skipped: #{reason}"
|
|
260
|
-
)
|
|
261
|
-
end
|
|
262
85
|
end
|
|
263
86
|
end
|
|
264
87
|
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
class StepExpectationApplier
|
|
7
|
+
def initialize(expectation_evaluator: ExpectationEvaluator.new)
|
|
8
|
+
@expectation_evaluator = expectation_evaluator
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def call(result:, run_result:, test_case:)
|
|
12
|
+
return result unless applicable?(test_case, run_result)
|
|
13
|
+
|
|
14
|
+
expectation_results = evaluate_expectations(run_result.outputs_by_step, test_case.step_expectations)
|
|
15
|
+
return result if expectation_results.values.all? { |entry| entry[:passed] }
|
|
16
|
+
|
|
17
|
+
rebuild_result(result, failure_details_for(expectation_results))
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
private
|
|
21
|
+
|
|
22
|
+
def applicable?(test_case, run_result)
|
|
23
|
+
test_case.respond_to?(:step_expectations) &&
|
|
24
|
+
test_case.step_expectations &&
|
|
25
|
+
run_result.respond_to?(:outputs_by_step)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def evaluate_expectations(outputs_by_step, expectations)
|
|
29
|
+
expectations.each_with_object({}) do |(step_alias, expected), results|
|
|
30
|
+
output = outputs_by_step[step_alias]
|
|
31
|
+
results[step_alias] = evaluate_single_expectation(output, expected)
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def evaluate_single_expectation(output, expected)
|
|
36
|
+
return { passed: false, details: "step not executed" } if output.nil?
|
|
37
|
+
|
|
38
|
+
evaluation = @expectation_evaluator.call(output: output, expected: expected, input: nil)
|
|
39
|
+
{ passed: evaluation.passed, details: evaluation.details }
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def failure_details_for(expectation_results)
|
|
43
|
+
expectation_results
|
|
44
|
+
.select { |_, entry| !entry[:passed] }
|
|
45
|
+
.map { |step_alias, entry| "#{step_alias}: #{entry[:details]}" }
|
|
46
|
+
.join("; ")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def rebuild_result(result, failure_details)
|
|
50
|
+
CaseResult.new(
|
|
51
|
+
name: result.name,
|
|
52
|
+
input: result.input,
|
|
53
|
+
output: result.output,
|
|
54
|
+
expected: result.expected,
|
|
55
|
+
step_status: :step_expectation_failed,
|
|
56
|
+
score: 0.0,
|
|
57
|
+
passed: false,
|
|
58
|
+
label: "FAIL",
|
|
59
|
+
details: "step expectations failed: #{failure_details}",
|
|
60
|
+
duration_ms: result.duration_ms,
|
|
61
|
+
cost: result.cost
|
|
62
|
+
)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
class StepResultNormalizer
|
|
7
|
+
def call(result)
|
|
8
|
+
return result if result.respond_to?(:parsed_output)
|
|
9
|
+
|
|
10
|
+
normalize_pipeline_result(result)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
private
|
|
14
|
+
|
|
15
|
+
def normalize_pipeline_result(result)
|
|
16
|
+
last_result = result.step_results&.last&.dig(:result)
|
|
17
|
+
successful = result.ok?
|
|
18
|
+
trace = result.respond_to?(:trace) ? result.trace : nil
|
|
19
|
+
|
|
20
|
+
PipelineResultAdapter.new(
|
|
21
|
+
status: result.status,
|
|
22
|
+
ok_flag: successful,
|
|
23
|
+
parsed_output: successful ? result.outputs_by_step.values.last : nil,
|
|
24
|
+
validation_errors: validation_errors_for(last_result),
|
|
25
|
+
trace: trace || trace_for(last_result)
|
|
26
|
+
)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def validation_errors_for(result)
|
|
30
|
+
result.respond_to?(:validation_errors) ? result.validation_errors : []
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def trace_for(result)
|
|
34
|
+
result.respond_to?(:trace) ? result.trace : {}
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -8,11 +8,24 @@ require_relative "eval/evaluator/json_includes"
|
|
|
8
8
|
require_relative "eval/evaluator/proc_evaluator"
|
|
9
9
|
require_relative "eval/dataset"
|
|
10
10
|
require_relative "eval/pipeline_result_adapter"
|
|
11
|
+
require_relative "eval/expectation_evaluator"
|
|
11
12
|
require_relative "eval/trait_evaluator"
|
|
12
13
|
require_relative "eval/contract_detail_builder"
|
|
14
|
+
require_relative "eval/case_scorer"
|
|
15
|
+
require_relative "eval/case_result_builder"
|
|
16
|
+
require_relative "eval/step_result_normalizer"
|
|
17
|
+
require_relative "eval/step_expectation_applier"
|
|
18
|
+
require_relative "eval/case_executor"
|
|
13
19
|
require_relative "eval/runner"
|
|
20
|
+
require_relative "eval/report_stats"
|
|
21
|
+
require_relative "eval/report_presenter"
|
|
22
|
+
require_relative "eval/report_storage"
|
|
14
23
|
require_relative "eval/report"
|
|
15
24
|
require_relative "eval/eval_definition"
|
|
16
25
|
require_relative "eval/model_comparison"
|
|
17
26
|
require_relative "eval/baseline_diff"
|
|
27
|
+
require_relative "eval/prompt_diff_serializer"
|
|
28
|
+
require_relative "eval/prompt_diff_comparator"
|
|
29
|
+
require_relative "eval/prompt_diff_presenter"
|
|
30
|
+
require_relative "eval/prompt_diff"
|
|
18
31
|
require_relative "eval/eval_history"
|
|
@@ -25,7 +25,16 @@ module RubyLLM
|
|
|
25
25
|
|
|
26
26
|
# Internal mutable steps list for registration
|
|
27
27
|
def steps_registry
|
|
28
|
-
@steps_registry ||=
|
|
28
|
+
@steps_registry ||= begin
|
|
29
|
+
inherited_steps =
|
|
30
|
+
if superclass.respond_to?(:steps_registry, true)
|
|
31
|
+
superclass.send(:steps_registry).map(&:dup)
|
|
32
|
+
else
|
|
33
|
+
[]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
inherited_steps
|
|
37
|
+
end
|
|
29
38
|
end
|
|
30
39
|
|
|
31
40
|
def token_budget(limit = nil)
|
|
@@ -68,15 +68,28 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
68
68
|
@check_regressions = true
|
|
69
69
|
end
|
|
70
70
|
|
|
71
|
+
chain :compared_with do |other_step|
|
|
72
|
+
@comparison_step = other_step
|
|
73
|
+
@check_regressions = true # compared_with implies regression check
|
|
74
|
+
end
|
|
75
|
+
|
|
71
76
|
match do |step_or_pipeline|
|
|
72
77
|
@eval_name = eval_name
|
|
73
78
|
@context ||= {}
|
|
74
79
|
@minimum_score ||= nil
|
|
75
80
|
@maximum_cost ||= nil
|
|
76
81
|
@check_regressions ||= false
|
|
82
|
+
@comparison_step ||= nil
|
|
77
83
|
@error = nil
|
|
78
84
|
@diff = nil
|
|
79
|
-
@
|
|
85
|
+
@prompt_diff = nil
|
|
86
|
+
|
|
87
|
+
if @comparison_step && @check_regressions
|
|
88
|
+
@prompt_diff = step_or_pipeline.compare_with(@comparison_step, eval: eval_name, context: @context)
|
|
89
|
+
@report = @prompt_diff.candidate_report
|
|
90
|
+
else
|
|
91
|
+
@report = step_or_pipeline.run_eval(eval_name, context: @context)
|
|
92
|
+
end
|
|
80
93
|
|
|
81
94
|
score_ok = if @minimum_score
|
|
82
95
|
@report.score >= @minimum_score
|
|
@@ -86,7 +99,9 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
86
99
|
|
|
87
100
|
cost_ok = @maximum_cost ? @report.total_cost <= @maximum_cost : true
|
|
88
101
|
|
|
89
|
-
regression_ok = if @
|
|
102
|
+
regression_ok = if @prompt_diff
|
|
103
|
+
@prompt_diff.safe_to_switch?
|
|
104
|
+
elsif @check_regressions && @report.baseline_exists?
|
|
90
105
|
@diff = @report.compare_with_baseline
|
|
91
106
|
!@diff.regressed?
|
|
92
107
|
else
|
|
@@ -100,11 +115,67 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
100
115
|
end
|
|
101
116
|
|
|
102
117
|
failure_message do
|
|
118
|
+
if @prompt_diff && !@prompt_diff.safe_to_switch?
|
|
119
|
+
msg = "expected #{@eval_name} eval to be safe to switch from baseline prompt\n"
|
|
120
|
+
|
|
121
|
+
# Check empty sides first — most fundamental problem
|
|
122
|
+
bl_empty = @prompt_diff.baseline_empty?
|
|
123
|
+
cd_empty = @prompt_diff.candidate_empty?
|
|
124
|
+
if bl_empty || cd_empty
|
|
125
|
+
msg += " One side has no evaluated cases (all skipped or no adapter?)\n"
|
|
126
|
+
if sample_response_only_compare?
|
|
127
|
+
msg += " compare_with ignores sample_response; pass model: or with_context(adapter: ...)\n"
|
|
128
|
+
end
|
|
129
|
+
msg += " Candidate score: #{@prompt_diff.candidate_score}, Baseline score: #{@prompt_diff.baseline_score}"
|
|
130
|
+
next msg
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# Check dataset comparability — names, inputs, AND expected must match
|
|
134
|
+
unless @prompt_diff.cases_comparable?
|
|
135
|
+
unless @prompt_diff.case_names_match?
|
|
136
|
+
mm = @prompt_diff.mismatched_cases
|
|
137
|
+
msg += " Case set mismatch — candidate and baseline must have identical cases:\n"
|
|
138
|
+
mm[:only_in_baseline].each { |n| msg += " only in baseline: #{n}\n" }
|
|
139
|
+
mm[:only_in_candidate].each { |n| msg += " only in candidate: #{n}\n" }
|
|
140
|
+
end
|
|
141
|
+
@prompt_diff.input_mismatches.each do |m|
|
|
142
|
+
msg += " Input mismatch for '#{m[:case]}' — same name but different inputs\n"
|
|
143
|
+
end
|
|
144
|
+
@prompt_diff.expected_mismatches.each do |m|
|
|
145
|
+
msg += " Expected mismatch for '#{m[:case]}' — same name/input but different expected values\n"
|
|
146
|
+
end
|
|
147
|
+
next msg
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Check per-case score regressions (even if global average is flat)
|
|
151
|
+
if @prompt_diff.score_regressions.any?
|
|
152
|
+
msg += " Per-case score regressions (#{@prompt_diff.score_regressions.length}):\n"
|
|
153
|
+
@prompt_diff.score_regressions.each do |r|
|
|
154
|
+
msg += " #{r[:case]}: #{r[:baseline_score]} -> #{r[:candidate_score]} (#{r[:delta]})\n"
|
|
155
|
+
end
|
|
156
|
+
msg += " Score delta: #{@prompt_diff.score_delta}"
|
|
157
|
+
next msg
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# Check pass/fail regressions and removed cases
|
|
161
|
+
removed = @prompt_diff.removed_passing_cases
|
|
162
|
+
reg_count = @prompt_diff.regressions.length + removed.length
|
|
163
|
+
msg += " Found #{reg_count} regression(s):\n"
|
|
164
|
+
@prompt_diff.regressions.each do |r|
|
|
165
|
+
msg += " #{r[:case]}: was PASS, now FAIL -- #{r[:detail]}\n"
|
|
166
|
+
end
|
|
167
|
+
removed.each do |name|
|
|
168
|
+
msg += " #{name}: REMOVED (was passing in baseline)\n"
|
|
169
|
+
end
|
|
170
|
+
msg += " Score delta: #{@prompt_diff.score_delta}"
|
|
171
|
+
next msg
|
|
172
|
+
end
|
|
173
|
+
|
|
103
174
|
msg = format_failure_message(@eval_name, @error, @report, @minimum_score, @maximum_cost)
|
|
104
175
|
if @diff&.regressed?
|
|
105
176
|
msg += "\n\nRegressions from baseline:\n"
|
|
106
177
|
@diff.regressions.each do |r|
|
|
107
|
-
msg += " #{r[:case]}: was PASS, now FAIL
|
|
178
|
+
msg += " #{r[:case]}: was PASS, now FAIL -- #{r[:detail]}\n"
|
|
108
179
|
end
|
|
109
180
|
msg += " Score delta: #{@diff.score_delta}"
|
|
110
181
|
end
|
|
@@ -114,4 +185,14 @@ RSpec::Matchers.define :pass_eval do |eval_name|
|
|
|
114
185
|
failure_message_when_negated do
|
|
115
186
|
"expected #{@eval_name} eval NOT to pass, but it passed with score: #{@report.score.round(2)}"
|
|
116
187
|
end
|
|
188
|
+
|
|
189
|
+
def sample_response_only_compare?
|
|
190
|
+
return false unless @comparison_step
|
|
191
|
+
return false if @context[:adapter] || @context[:model]
|
|
192
|
+
|
|
193
|
+
defn = @comparison_step.send(:all_eval_definitions)[@eval_name.to_s]
|
|
194
|
+
defn&.build_adapter
|
|
195
|
+
rescue StandardError
|
|
196
|
+
false
|
|
197
|
+
end
|
|
117
198
|
end
|
|
@@ -13,11 +13,16 @@ RSpec.configure do |config|
|
|
|
13
13
|
# Prevents non-block stub_all_steps from leaking between examples.
|
|
14
14
|
config.around(:each) do |example|
|
|
15
15
|
original_adapter = RubyLLM::Contract.configuration.default_adapter
|
|
16
|
+
original_logger = RubyLLM::Contract.configuration.logger
|
|
17
|
+
original_eval_hosts = RubyLLM::Contract.eval_hosts.dup
|
|
16
18
|
original_overrides = RubyLLM::Contract.step_adapter_overrides.dup
|
|
17
19
|
begin
|
|
18
20
|
example.run
|
|
19
21
|
ensure
|
|
20
22
|
RubyLLM::Contract.configuration.default_adapter = original_adapter
|
|
23
|
+
RubyLLM::Contract.configuration.logger = original_logger
|
|
24
|
+
RubyLLM::Contract.reset_eval_hosts!
|
|
25
|
+
RubyLLM::Contract.eval_hosts.concat(original_eval_hosts)
|
|
21
26
|
RubyLLM::Contract.step_adapter_overrides.replace(original_overrides)
|
|
22
27
|
end
|
|
23
28
|
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Step
|
|
6
|
+
class AdapterCaller
|
|
7
|
+
def initialize(adapter:, adapter_options:)
|
|
8
|
+
@adapter = adapter
|
|
9
|
+
@adapter_options = adapter_options
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def call(messages)
|
|
13
|
+
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
14
|
+
response = @adapter.call(messages: messages, **@adapter_options)
|
|
15
|
+
latency_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
|
|
16
|
+
[response, latency_ms]
|
|
17
|
+
rescue StandardError => error
|
|
18
|
+
[Result.new(status: :adapter_error, raw_output: nil, parsed_output: nil, validation_errors: [error.message]), 0]
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|