ruby_llm-contract 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +59 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +42 -2
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +41 -1
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/minitest.rb +116 -2
- data/lib/ruby_llm/contract/pipeline/base.rb +15 -2
- data/lib/ruby_llm/contract/rake_task.rb +20 -1
- data/lib/ruby_llm/contract/rspec/helpers.rb +91 -6
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +18 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +94 -37
- data/lib/ruby_llm/contract/step/dsl.rb +61 -16
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +28 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +47 -84
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- data/lib/ruby_llm/contract.rb +28 -0
- metadata +28 -1
|
@@ -4,8 +4,6 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Eval
|
|
6
6
|
class Runner
|
|
7
|
-
include TraitEvaluator
|
|
8
|
-
include ContractDetailBuilder
|
|
9
7
|
include Concerns::ContextHelpers
|
|
10
8
|
|
|
11
9
|
def self.run(step:, dataset:, context: {}, concurrency: nil)
|
|
@@ -20,17 +18,35 @@ module RubyLLM
|
|
|
20
18
|
end
|
|
21
19
|
|
|
22
20
|
def run
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
step_name = @step.respond_to?(:name) ? @step.name : @step.to_s
|
|
29
|
-
Report.new(dataset_name: @dataset.name, results: results, step_name: step_name)
|
|
21
|
+
Report.new(
|
|
22
|
+
dataset_name: @dataset.name,
|
|
23
|
+
results: collected_results,
|
|
24
|
+
step_name: step_name
|
|
25
|
+
)
|
|
30
26
|
end
|
|
31
27
|
|
|
32
28
|
private
|
|
33
29
|
|
|
30
|
+
def collected_results
|
|
31
|
+
concurrent? ? run_concurrent : run_serial
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def run_serial
|
|
35
|
+
@dataset.cases.map { |test_case| case_executor.call(test_case: test_case, context: @context) }
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def concurrent?
|
|
39
|
+
@concurrency && @concurrency > 1
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def step_name
|
|
43
|
+
@step.respond_to?(:name) ? @step.name : @step.to_s
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def case_executor
|
|
47
|
+
@case_executor ||= CaseExecutor.new(step: @step)
|
|
48
|
+
end
|
|
49
|
+
|
|
34
50
|
def run_concurrent
|
|
35
51
|
require "concurrent"
|
|
36
52
|
pool = Concurrent::FixedThreadPool.new(@concurrency)
|
|
@@ -39,10 +55,10 @@ module RubyLLM
|
|
|
39
55
|
# gets a single-response adapter with its own response (by index).
|
|
40
56
|
per_case_contexts = build_per_case_contexts
|
|
41
57
|
|
|
42
|
-
futures = @dataset.cases.each_with_index.map do |test_case,
|
|
43
|
-
|
|
58
|
+
futures = @dataset.cases.each_with_index.map do |test_case, index|
|
|
59
|
+
case_context = per_case_contexts[index]
|
|
44
60
|
Concurrent::Future.execute(executor: pool) do
|
|
45
|
-
|
|
61
|
+
case_executor.call(test_case: test_case, context: case_context)
|
|
46
62
|
end
|
|
47
63
|
end
|
|
48
64
|
futures.map(&:value!)
|
|
@@ -55,10 +71,10 @@ module RubyLLM
|
|
|
55
71
|
adapter = @context[:adapter]
|
|
56
72
|
responses = adapter.respond_to?(:responses_array) ? adapter.responses_array : nil
|
|
57
73
|
|
|
58
|
-
@dataset.cases.each_with_index.map do |_,
|
|
74
|
+
@dataset.cases.each_with_index.map do |_, index|
|
|
59
75
|
if responses
|
|
60
76
|
# Give each case its own single-response adapter
|
|
61
|
-
response = responses[
|
|
77
|
+
response = responses[index] || responses.last
|
|
62
78
|
per_case_adapter = Adapters::Test.new(response: response)
|
|
63
79
|
@context.merge(adapter: per_case_adapter)
|
|
64
80
|
else
|
|
@@ -66,199 +82,6 @@ module RubyLLM
|
|
|
66
82
|
end
|
|
67
83
|
end
|
|
68
84
|
end
|
|
69
|
-
|
|
70
|
-
def evaluate_case_with_context(test_case, context)
|
|
71
|
-
run_result = @step.run(test_case.input, context: context)
|
|
72
|
-
step_result = normalize_result(run_result)
|
|
73
|
-
eval_result = dispatch_evaluation(step_result, test_case)
|
|
74
|
-
|
|
75
|
-
result = build_case_result(test_case, step_result, eval_result)
|
|
76
|
-
|
|
77
|
-
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
78
|
-
run_result.respond_to?(:outputs_by_step)
|
|
79
|
-
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
80
|
-
else
|
|
81
|
-
result
|
|
82
|
-
end
|
|
83
|
-
rescue RubyLLM::Contract::Error => e
|
|
84
|
-
raise unless e.message.include?("No adapter configured")
|
|
85
|
-
|
|
86
|
-
skipped_result(test_case, e.message)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
def evaluate_case(test_case)
|
|
90
|
-
run_result = @step.run(test_case.input, context: @context)
|
|
91
|
-
step_result = normalize_result(run_result)
|
|
92
|
-
eval_result = dispatch_evaluation(step_result, test_case)
|
|
93
|
-
|
|
94
|
-
result = build_case_result(test_case, step_result, eval_result)
|
|
95
|
-
|
|
96
|
-
# Pipeline per-step evaluation
|
|
97
|
-
if test_case.respond_to?(:step_expectations) && test_case.step_expectations &&
|
|
98
|
-
run_result.respond_to?(:outputs_by_step)
|
|
99
|
-
evaluate_step_expectations(result, run_result.outputs_by_step, test_case.step_expectations)
|
|
100
|
-
else
|
|
101
|
-
result
|
|
102
|
-
end
|
|
103
|
-
rescue RubyLLM::Contract::Error => e
|
|
104
|
-
raise unless e.message.include?("No adapter configured")
|
|
105
|
-
|
|
106
|
-
skipped_result(test_case, e.message)
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
def build_case_result(test_case, step_result, eval_result)
|
|
110
|
-
trace = step_result.respond_to?(:trace) ? step_result.trace : nil
|
|
111
|
-
CaseResult.new(
|
|
112
|
-
name: test_case.name,
|
|
113
|
-
input: test_case.input,
|
|
114
|
-
output: step_result.parsed_output,
|
|
115
|
-
expected: test_case.expected,
|
|
116
|
-
step_status: step_result.status,
|
|
117
|
-
score: eval_result.score,
|
|
118
|
-
passed: eval_result.passed,
|
|
119
|
-
label: eval_result.label,
|
|
120
|
-
details: eval_result.details,
|
|
121
|
-
duration_ms: extract_latency(trace),
|
|
122
|
-
cost: extract_cost(trace)
|
|
123
|
-
)
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
def extract_latency(trace)
|
|
127
|
-
return nil unless trace
|
|
128
|
-
|
|
129
|
-
# Pipeline::Trace uses total_latency_ms, Step::Trace uses latency_ms
|
|
130
|
-
if trace.respond_to?(:total_latency_ms)
|
|
131
|
-
trace.total_latency_ms
|
|
132
|
-
else
|
|
133
|
-
trace[:latency_ms]
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def extract_cost(trace)
|
|
138
|
-
return nil unless trace
|
|
139
|
-
|
|
140
|
-
# Pipeline::Trace uses total_cost, Step::Trace uses cost
|
|
141
|
-
if trace.respond_to?(:total_cost)
|
|
142
|
-
trace.total_cost
|
|
143
|
-
else
|
|
144
|
-
trace[:cost]
|
|
145
|
-
end
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
def dispatch_evaluation(step_result, test_case)
|
|
149
|
-
return contract_failure(step_result) unless step_result.ok?
|
|
150
|
-
|
|
151
|
-
if test_case.evaluator
|
|
152
|
-
evaluate_with_custom(step_result, test_case)
|
|
153
|
-
elsif test_case.expected_traits
|
|
154
|
-
evaluate_traits(step_result, test_case)
|
|
155
|
-
elsif !test_case.expected.nil?
|
|
156
|
-
evaluate_expected(step_result, test_case)
|
|
157
|
-
else
|
|
158
|
-
evaluate_contract_only
|
|
159
|
-
end
|
|
160
|
-
end
|
|
161
|
-
|
|
162
|
-
def normalize_result(result)
|
|
163
|
-
return result if result.respond_to?(:parsed_output)
|
|
164
|
-
|
|
165
|
-
normalize_pipeline_result(result)
|
|
166
|
-
end
|
|
167
|
-
|
|
168
|
-
def normalize_pipeline_result(result)
|
|
169
|
-
last_result = result.step_results&.last&.dig(:result)
|
|
170
|
-
is_ok = result.ok?
|
|
171
|
-
pipeline_trace = result.respond_to?(:trace) ? result.trace : nil
|
|
172
|
-
|
|
173
|
-
PipelineResultAdapter.new(
|
|
174
|
-
status: result.status,
|
|
175
|
-
ok_flag: is_ok,
|
|
176
|
-
parsed_output: is_ok ? result.outputs_by_step.values.last : nil,
|
|
177
|
-
validation_errors: last_result.respond_to?(:validation_errors) ? last_result.validation_errors : [],
|
|
178
|
-
trace: pipeline_trace || (last_result.respond_to?(:trace) ? last_result.trace : {})
|
|
179
|
-
)
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
def evaluate_expected(step_result, test_case)
|
|
183
|
-
dispatch_expected_evaluator(
|
|
184
|
-
output: step_result.parsed_output,
|
|
185
|
-
expected: test_case.expected,
|
|
186
|
-
input: test_case.input
|
|
187
|
-
)
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
def dispatch_expected_evaluator(output:, expected:, input:)
|
|
191
|
-
if expected.is_a?(Hash)
|
|
192
|
-
Evaluator::JsonIncludes.new.call(output: output, expected: expected, input: input)
|
|
193
|
-
elsif expected.is_a?(::Regexp)
|
|
194
|
-
Evaluator::Regex.new(expected).call(output: output, input: input)
|
|
195
|
-
else
|
|
196
|
-
Evaluator::Exact.new.call(output: output, expected: expected, input: input)
|
|
197
|
-
end
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
def evaluate_with_custom(step_result, test_case)
|
|
201
|
-
evaluator = test_case.evaluator
|
|
202
|
-
evaluator = Evaluator::ProcEvaluator.new(evaluator) if evaluator.is_a?(::Proc)
|
|
203
|
-
evaluator.call(output: step_result.parsed_output, expected: test_case.expected, input: test_case.input)
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
def evaluate_contract_only
|
|
207
|
-
EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
|
|
208
|
-
end
|
|
209
|
-
|
|
210
|
-
def contract_failure(step_result)
|
|
211
|
-
EvaluationResult.new(
|
|
212
|
-
score: 0.0, passed: false,
|
|
213
|
-
details: "step failed: #{step_result.status} — #{step_result.validation_errors.join(", ")}"
|
|
214
|
-
)
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
def evaluate_step_expectations(result, outputs_by_step, expectations)
|
|
218
|
-
step_results = {}
|
|
219
|
-
all_passed = true
|
|
220
|
-
|
|
221
|
-
expectations.each do |step_alias, expected|
|
|
222
|
-
output = outputs_by_step[step_alias]
|
|
223
|
-
if output.nil?
|
|
224
|
-
step_results[step_alias] = { passed: false, details: "step not executed" }
|
|
225
|
-
all_passed = false
|
|
226
|
-
else
|
|
227
|
-
eval_res = dispatch_expected_evaluator(output: output, expected: expected, input: nil)
|
|
228
|
-
step_results[step_alias] = { passed: eval_res.passed, score: eval_res.score, details: eval_res.details }
|
|
229
|
-
all_passed = false unless eval_res.passed
|
|
230
|
-
end
|
|
231
|
-
end
|
|
232
|
-
|
|
233
|
-
# Rebuild CaseResult with step_results metadata
|
|
234
|
-
failed_steps = step_results.select { |_, v| !v[:passed] }
|
|
235
|
-
failure_details = failed_steps.map { |k, v| "#{k}: #{v[:details]}" }.join("; ")
|
|
236
|
-
|
|
237
|
-
CaseResult.new(
|
|
238
|
-
name: result.name, input: result.input, output: result.output,
|
|
239
|
-
expected: result.expected,
|
|
240
|
-
step_status: all_passed ? result.step_status : :step_expectation_failed,
|
|
241
|
-
score: all_passed ? result.score : 0.0,
|
|
242
|
-
passed: result.passed? && all_passed,
|
|
243
|
-
label: all_passed ? result.label : "FAIL",
|
|
244
|
-
details: all_passed ? result.details : "step expectations failed: #{failure_details}",
|
|
245
|
-
duration_ms: result.duration_ms, cost: result.cost
|
|
246
|
-
)
|
|
247
|
-
end
|
|
248
|
-
|
|
249
|
-
def skipped_result(test_case, reason)
|
|
250
|
-
CaseResult.new(
|
|
251
|
-
name: test_case.name,
|
|
252
|
-
input: test_case.input,
|
|
253
|
-
output: nil,
|
|
254
|
-
expected: test_case.expected,
|
|
255
|
-
step_status: :skipped,
|
|
256
|
-
score: 0.0,
|
|
257
|
-
passed: false,
|
|
258
|
-
label: "SKIP",
|
|
259
|
-
details: "skipped: #{reason}"
|
|
260
|
-
)
|
|
261
|
-
end
|
|
262
85
|
end
|
|
263
86
|
end
|
|
264
87
|
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
class StepExpectationApplier
|
|
7
|
+
def initialize(expectation_evaluator: ExpectationEvaluator.new)
|
|
8
|
+
@expectation_evaluator = expectation_evaluator
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def call(result:, run_result:, test_case:)
|
|
12
|
+
return result unless applicable?(test_case, run_result)
|
|
13
|
+
|
|
14
|
+
expectation_results = evaluate_expectations(run_result.outputs_by_step, test_case.step_expectations)
|
|
15
|
+
return result if expectation_results.values.all? { |entry| entry[:passed] }
|
|
16
|
+
|
|
17
|
+
rebuild_result(result, failure_details_for(expectation_results))
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
private
|
|
21
|
+
|
|
22
|
+
def applicable?(test_case, run_result)
|
|
23
|
+
test_case.respond_to?(:step_expectations) &&
|
|
24
|
+
test_case.step_expectations &&
|
|
25
|
+
run_result.respond_to?(:outputs_by_step)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def evaluate_expectations(outputs_by_step, expectations)
|
|
29
|
+
expectations.each_with_object({}) do |(step_alias, expected), results|
|
|
30
|
+
output = outputs_by_step[step_alias]
|
|
31
|
+
results[step_alias] = evaluate_single_expectation(output, expected)
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def evaluate_single_expectation(output, expected)
|
|
36
|
+
return { passed: false, details: "step not executed" } if output.nil?
|
|
37
|
+
|
|
38
|
+
evaluation = @expectation_evaluator.call(output: output, expected: expected, input: nil)
|
|
39
|
+
{ passed: evaluation.passed, details: evaluation.details }
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def failure_details_for(expectation_results)
|
|
43
|
+
expectation_results
|
|
44
|
+
.select { |_, entry| !entry[:passed] }
|
|
45
|
+
.map { |step_alias, entry| "#{step_alias}: #{entry[:details]}" }
|
|
46
|
+
.join("; ")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def rebuild_result(result, failure_details)
|
|
50
|
+
CaseResult.new(
|
|
51
|
+
name: result.name,
|
|
52
|
+
input: result.input,
|
|
53
|
+
output: result.output,
|
|
54
|
+
expected: result.expected,
|
|
55
|
+
step_status: :step_expectation_failed,
|
|
56
|
+
score: 0.0,
|
|
57
|
+
passed: false,
|
|
58
|
+
label: "FAIL",
|
|
59
|
+
details: "step expectations failed: #{failure_details}",
|
|
60
|
+
duration_ms: result.duration_ms,
|
|
61
|
+
cost: result.cost
|
|
62
|
+
)
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
class StepResultNormalizer
|
|
7
|
+
def call(result)
|
|
8
|
+
return result if result.respond_to?(:parsed_output)
|
|
9
|
+
|
|
10
|
+
normalize_pipeline_result(result)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
private
|
|
14
|
+
|
|
15
|
+
def normalize_pipeline_result(result)
|
|
16
|
+
last_result = result.step_results&.last&.dig(:result)
|
|
17
|
+
successful = result.ok?
|
|
18
|
+
trace = result.respond_to?(:trace) ? result.trace : nil
|
|
19
|
+
|
|
20
|
+
PipelineResultAdapter.new(
|
|
21
|
+
status: result.status,
|
|
22
|
+
ok_flag: successful,
|
|
23
|
+
parsed_output: successful ? result.outputs_by_step.values.last : nil,
|
|
24
|
+
validation_errors: validation_errors_for(last_result),
|
|
25
|
+
trace: trace || trace_for(last_result)
|
|
26
|
+
)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def validation_errors_for(result)
|
|
30
|
+
result.respond_to?(:validation_errors) ? result.validation_errors : []
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def trace_for(result)
|
|
34
|
+
result.respond_to?(:trace) ? result.trace : {}
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -8,11 +8,24 @@ require_relative "eval/evaluator/json_includes"
|
|
|
8
8
|
require_relative "eval/evaluator/proc_evaluator"
|
|
9
9
|
require_relative "eval/dataset"
|
|
10
10
|
require_relative "eval/pipeline_result_adapter"
|
|
11
|
+
require_relative "eval/expectation_evaluator"
|
|
11
12
|
require_relative "eval/trait_evaluator"
|
|
12
13
|
require_relative "eval/contract_detail_builder"
|
|
14
|
+
require_relative "eval/case_scorer"
|
|
15
|
+
require_relative "eval/case_result_builder"
|
|
16
|
+
require_relative "eval/step_result_normalizer"
|
|
17
|
+
require_relative "eval/step_expectation_applier"
|
|
18
|
+
require_relative "eval/case_executor"
|
|
13
19
|
require_relative "eval/runner"
|
|
20
|
+
require_relative "eval/report_stats"
|
|
21
|
+
require_relative "eval/report_presenter"
|
|
22
|
+
require_relative "eval/report_storage"
|
|
14
23
|
require_relative "eval/report"
|
|
15
24
|
require_relative "eval/eval_definition"
|
|
16
25
|
require_relative "eval/model_comparison"
|
|
17
26
|
require_relative "eval/baseline_diff"
|
|
27
|
+
require_relative "eval/prompt_diff_serializer"
|
|
28
|
+
require_relative "eval/prompt_diff_comparator"
|
|
29
|
+
require_relative "eval/prompt_diff_presenter"
|
|
30
|
+
require_relative "eval/prompt_diff"
|
|
18
31
|
require_relative "eval/eval_history"
|
|
@@ -5,6 +5,20 @@ require "ruby_llm/contract"
|
|
|
5
5
|
module RubyLLM
|
|
6
6
|
module Contract
|
|
7
7
|
module MinitestHelpers
|
|
8
|
+
# Snapshot adapter before each test so teardown can restore it.
|
|
9
|
+
def setup
|
|
10
|
+
super if defined?(super)
|
|
11
|
+
@_contract_original_adapter = RubyLLM::Contract.configuration.default_adapter
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# Auto-cleanup: clear overrides AND restore original adapter.
|
|
15
|
+
# Prevents both non-block stub_step and stub_all_steps from leaking.
|
|
16
|
+
def teardown
|
|
17
|
+
RubyLLM::Contract.step_adapter_overrides.clear
|
|
18
|
+
RubyLLM::Contract.configuration.default_adapter = @_contract_original_adapter
|
|
19
|
+
super if defined?(super)
|
|
20
|
+
end
|
|
21
|
+
|
|
8
22
|
def assert_satisfies_contract(result, msg = nil)
|
|
9
23
|
assert result.ok?, msg || "Expected step result to satisfy contract, " \
|
|
10
24
|
"but got status: #{result.status}. Errors: #{result.validation_errors.join(", ")}"
|
|
@@ -33,13 +47,113 @@ module RubyLLM
|
|
|
33
47
|
report
|
|
34
48
|
end
|
|
35
49
|
|
|
36
|
-
|
|
50
|
+
# Stub a specific step to return a canned response without API calls.
|
|
51
|
+
# Routes per-step — other steps are not affected.
|
|
52
|
+
#
|
|
53
|
+
# stub_step(ClassifyTicket, response: { priority: "high" })
|
|
54
|
+
#
|
|
55
|
+
# Supports an optional block form — the override is removed after the
|
|
56
|
+
# block returns (even if it raises):
|
|
57
|
+
#
|
|
58
|
+
# stub_step(ClassifyTicket, response: data) do
|
|
59
|
+
# result = ClassifyTicket.run("test")
|
|
60
|
+
# end
|
|
61
|
+
# # ClassifyTicket.run no longer stubbed
|
|
62
|
+
#
|
|
63
|
+
def stub_step(step_class, response: nil, responses: nil, &block)
|
|
64
|
+
adapter = if responses
|
|
65
|
+
Adapters::Test.new(responses: responses)
|
|
66
|
+
else
|
|
67
|
+
Adapters::Test.new(response: response)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
71
|
+
previous = overrides[step_class]
|
|
72
|
+
overrides[step_class] = adapter
|
|
73
|
+
|
|
74
|
+
if block
|
|
75
|
+
begin
|
|
76
|
+
yield
|
|
77
|
+
ensure
|
|
78
|
+
if previous
|
|
79
|
+
overrides[step_class] = previous
|
|
80
|
+
else
|
|
81
|
+
overrides.delete(step_class)
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Stub multiple steps at once with different responses.
|
|
88
|
+
# Takes a hash of step_class => options. Requires a block.
|
|
89
|
+
#
|
|
90
|
+
# stub_steps(
|
|
91
|
+
# ClassifyTicket => { response: { priority: "high" } },
|
|
92
|
+
# RouteToTeam => { response: { team: "billing" } }
|
|
93
|
+
# ) do
|
|
94
|
+
# result = TicketPipeline.run("test")
|
|
95
|
+
# end
|
|
96
|
+
#
|
|
97
|
+
def stub_steps(stubs, &block)
|
|
98
|
+
raise ArgumentError, "stub_steps requires a block" unless block
|
|
99
|
+
|
|
100
|
+
overrides = RubyLLM::Contract.step_adapter_overrides
|
|
101
|
+
previous = {}
|
|
102
|
+
|
|
103
|
+
stubs.each do |step_class, opts|
|
|
104
|
+
opts = opts.transform_keys(&:to_sym)
|
|
105
|
+
adapter = if opts[:responses]
|
|
106
|
+
Adapters::Test.new(responses: opts[:responses])
|
|
107
|
+
else
|
|
108
|
+
Adapters::Test.new(response: opts[:response])
|
|
109
|
+
end
|
|
110
|
+
previous[step_class] = overrides[step_class]
|
|
111
|
+
overrides[step_class] = adapter
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
begin
|
|
115
|
+
yield
|
|
116
|
+
ensure
|
|
117
|
+
stubs.each_key do |step_class|
|
|
118
|
+
if previous[step_class]
|
|
119
|
+
overrides[step_class] = previous[step_class]
|
|
120
|
+
else
|
|
121
|
+
overrides.delete(step_class)
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Set a global test adapter for ALL steps.
|
|
128
|
+
#
|
|
129
|
+
# stub_all_steps(response: { default: true })
|
|
130
|
+
#
|
|
131
|
+
# Supports an optional block form — the previous adapter is restored
|
|
132
|
+
# after the block returns (even if it raises):
|
|
133
|
+
#
|
|
134
|
+
# stub_all_steps(response: { default: true }) do
|
|
135
|
+
# # all steps use test adapter
|
|
136
|
+
# end
|
|
137
|
+
# # original adapter restored
|
|
138
|
+
#
|
|
139
|
+
def stub_all_steps(response: nil, responses: nil, &block)
|
|
37
140
|
adapter = if responses
|
|
38
141
|
Adapters::Test.new(responses: responses)
|
|
39
142
|
else
|
|
40
143
|
Adapters::Test.new(response: response)
|
|
41
144
|
end
|
|
42
|
-
|
|
145
|
+
|
|
146
|
+
if block
|
|
147
|
+
previous = RubyLLM::Contract.configuration.default_adapter
|
|
148
|
+
begin
|
|
149
|
+
RubyLLM::Contract.configuration.default_adapter = adapter
|
|
150
|
+
yield
|
|
151
|
+
ensure
|
|
152
|
+
RubyLLM::Contract.configuration.default_adapter = previous
|
|
153
|
+
end
|
|
154
|
+
else
|
|
155
|
+
RubyLLM::Contract.configure { |c| c.default_adapter = adapter }
|
|
156
|
+
end
|
|
43
157
|
end
|
|
44
158
|
end
|
|
45
159
|
end
|
|
@@ -25,11 +25,24 @@ module RubyLLM
|
|
|
25
25
|
|
|
26
26
|
# Internal mutable steps list for registration
|
|
27
27
|
def steps_registry
|
|
28
|
-
@steps_registry ||=
|
|
28
|
+
@steps_registry ||= begin
|
|
29
|
+
inherited_steps =
|
|
30
|
+
if superclass.respond_to?(:steps_registry, true)
|
|
31
|
+
superclass.send(:steps_registry).map(&:dup)
|
|
32
|
+
else
|
|
33
|
+
[]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
inherited_steps
|
|
37
|
+
end
|
|
29
38
|
end
|
|
30
39
|
|
|
31
40
|
def token_budget(limit = nil)
|
|
32
|
-
|
|
41
|
+
if limit
|
|
42
|
+
raise ArgumentError, "token_budget must be positive, got #{limit}" unless limit.positive?
|
|
43
|
+
|
|
44
|
+
return @token_budget = limit
|
|
45
|
+
end
|
|
33
46
|
|
|
34
47
|
@token_budget
|
|
35
48
|
end
|
|
@@ -7,7 +7,7 @@ module RubyLLM
|
|
|
7
7
|
module Contract
|
|
8
8
|
class RakeTask < ::Rake::TaskLib
|
|
9
9
|
attr_accessor :name, :context, :fail_on_empty, :minimum_score, :maximum_cost,
|
|
10
|
-
:eval_dirs, :save_baseline, :fail_on_regression
|
|
10
|
+
:eval_dirs, :save_baseline, :fail_on_regression, :track_history
|
|
11
11
|
|
|
12
12
|
def initialize(name = :"ruby_llm_contract:eval", &block)
|
|
13
13
|
super()
|
|
@@ -19,6 +19,7 @@ module RubyLLM
|
|
|
19
19
|
@eval_dirs = [] # directories to load eval files from (non-Rails)
|
|
20
20
|
@save_baseline = false
|
|
21
21
|
@fail_on_regression = false
|
|
22
|
+
@track_history = false
|
|
22
23
|
block&.call(self)
|
|
23
24
|
define_task
|
|
24
25
|
end
|
|
@@ -47,18 +48,23 @@ module RubyLLM
|
|
|
47
48
|
suite_cost = 0.0
|
|
48
49
|
|
|
49
50
|
passed_reports = []
|
|
51
|
+
all_reports = []
|
|
50
52
|
|
|
51
53
|
results.each do |host, reports|
|
|
52
54
|
puts "\n#{host.name || host.to_s}"
|
|
53
55
|
reports.each_value do |report|
|
|
54
56
|
report.print_summary
|
|
55
57
|
suite_cost += report.total_cost
|
|
58
|
+
all_reports << [host, report]
|
|
56
59
|
report_ok = report_meets_score?(report) && !check_regression(report)
|
|
57
60
|
gate_passed = false unless report_ok
|
|
58
61
|
passed_reports << report if report_ok
|
|
59
62
|
end
|
|
60
63
|
end
|
|
61
64
|
|
|
65
|
+
# Save history BEFORE gating — failures are valuable trend data (ADR-0016 F3)
|
|
66
|
+
save_all_history!(all_reports, context) if @track_history
|
|
67
|
+
|
|
62
68
|
if @maximum_cost && suite_cost > @maximum_cost
|
|
63
69
|
abort "\nEval suite FAILED: total cost $#{format("%.4f", suite_cost)} " \
|
|
64
70
|
"exceeds budget $#{format("%.4f", @maximum_cost)}"
|
|
@@ -68,6 +74,7 @@ module RubyLLM
|
|
|
68
74
|
|
|
69
75
|
# Save baselines only after ALL gates pass
|
|
70
76
|
passed_reports.each { |r| save_baseline!(r) } if @save_baseline
|
|
77
|
+
|
|
71
78
|
puts "\nAll evals passed."
|
|
72
79
|
end
|
|
73
80
|
end
|
|
@@ -98,6 +105,18 @@ module RubyLLM
|
|
|
98
105
|
puts " Baseline saved: #{path}"
|
|
99
106
|
end
|
|
100
107
|
|
|
108
|
+
def save_all_history!(host_reports, context)
|
|
109
|
+
context_model = (context[:model] || context["model"]) if context.is_a?(Hash)
|
|
110
|
+
host_reports.each do |host, report|
|
|
111
|
+
# Model priority: context > step DSL > default config
|
|
112
|
+
model = context_model
|
|
113
|
+
model ||= (host.model if host.respond_to?(:model))
|
|
114
|
+
model ||= RubyLLM::Contract.configuration.default_model rescue nil
|
|
115
|
+
path = report.save_history!(model: model)
|
|
116
|
+
puts " History saved: #{path}"
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
101
120
|
def task_prerequisites
|
|
102
121
|
defined?(::Rails) ? [:environment] : []
|
|
103
122
|
end
|