ruby_llm-contract 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +19 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +42 -2
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +5 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +93 -38
- data/lib/ruby_llm/contract/step/dsl.rb +10 -0
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +46 -85
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +28 -1
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Scores a single eval case from the result of running a step.
#
# Strategy order in #call: a step result that is not ok is a zero-score
# failure; otherwise the case's custom evaluator, its expected traits, its
# expected value, and finally a contract-only pass are tried in that order.
#
# NOTE(review): evaluate_traits and build_contract_details are not defined in
# this class; presumably they come from the TraitEvaluator and
# ContractDetailBuilder mixins -- confirm.
class CaseScorer
  include TraitEvaluator
  include ContractDetailBuilder

  # @param step [Object] kept in @step; not read by the methods defined in
  #   this class (presumably used by the mixins -- confirm)
  # @param expectation_evaluator [#call] scores output against an expected
  #   value; defaults to a new ExpectationEvaluator
  def initialize(step:, expectation_evaluator: ExpectationEvaluator.new)
    @step = step
    @expectation_evaluator = expectation_evaluator
  end

  # @param test_case [#evaluator, #expected_traits, #expected, #input]
  # @param step_result [#ok?, #parsed_output, #status, #validation_errors]
  # @return [Object] whatever the selected strategy returns
  def call(test_case:, step_result:)
    return contract_failure(step_result) unless step_result.ok?
    return evaluate_with_custom(test_case, step_result) if test_case.evaluator
    return evaluate_traits(step_result, test_case) if test_case.expected_traits
    return evaluate_contract_only if test_case.expected.nil?

    evaluate_expected(test_case, step_result)
  end

  private

  # Delegates to the injected expectation evaluator.
  def evaluate_expected(test_case, step_result)
    @expectation_evaluator.call(**evaluation_arguments(test_case, step_result))
  end

  # Delegates to the case's own evaluator (bare procs are wrapped first).
  def evaluate_with_custom(test_case, step_result)
    wrapped_custom_evaluator(test_case).call(**evaluation_arguments(test_case, step_result))
  end

  # Keyword arguments shared by every evaluator invocation.
  def evaluation_arguments(test_case, step_result)
    {
      output: step_result.parsed_output,
      expected: test_case.expected,
      input: test_case.input
    }
  end

  # Procs are adapted through Evaluator::ProcEvaluator; any other evaluator
  # object is used as-is.
  def wrapped_custom_evaluator(test_case)
    custom = test_case.evaluator
    return custom unless custom.is_a?(::Proc)

    Evaluator::ProcEvaluator.new(custom)
  end

  # No expectation was declared, so an ok step result is a full pass.
  def evaluate_contract_only
    EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
  end

  # Zero-score result describing why the step itself failed.
  def contract_failure(step_result)
    status = step_result.status
    errors = step_result.validation_errors.join(", ")

    EvaluationResult.new(score: 0.0, passed: false, details: "step failed: #{status} — #{errors}")
  end
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
@@ -4,14 +4,16 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Eval
|
|
6
6
|
module Evaluator
|
|
7
|
+
# Exact-match evaluator: passes only when `output == expected`.
class Exact
  # @param output [Object] value to check
  # @param expected [Object] value it must equal (compared with ==)
  # @param input [Object, nil] accepted for interface parity; not used
  # @return [EvaluationResult] score 1.0 on a match, otherwise score 0.0 with
  #   both values inspected in the details
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    if output == expected
      EvaluationResult.new(score: 1.0, passed: true, details: "exact match")
    else
      mismatch = "expected #{expected.inspect}, got #{output.inspect}"
      EvaluationResult.new(score: 0.0, passed: false, details: mismatch)
    end
  end
end
|
|
17
19
|
end
|
|
@@ -4,24 +4,36 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Eval
|
|
6
6
|
module Evaluator
|
|
7
|
+
# Adapts custom Ruby callables to the EvaluationResult contract.
|
|
7
8
|
class ProcEvaluator
|
|
8
9
|
# Wraps a callable so it can be invoked through #call.
#
# @param callable [Object] stored as-is in @callable; nothing is validated here
def initialize(callable)
  @callable = callable
end
|
|
11
12
|
|
|
12
13
|
# Invokes the wrapped callable and converts what it returns into an
# evaluation result. A nil return value triggers a warning before the
# conversion.
#
# @param output [Object] forwarded to the callable
# @param expected [Object, nil] accepted for interface parity; not used
# @param input [Object, nil] forwarded to invoke_callable
def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument,Metrics
  invoke_callable(output, input).then do |outcome|
    warn_nil_result if outcome.nil?
    build_evaluation_result(outcome)
  end
end
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# Calls the wrapped callable with (output, input) when it accepts a second
# argument, otherwise with output alone.
def invoke_callable(output, input)
  return @callable.call(output) unless callable_accepts_input?

  @callable.call(output, input)
end
|
|
24
|
+
|
|
25
|
+
# Whether the wrapped callable should be invoked with (output, input)
# rather than with output alone.
#
# True when arity is exactly 2, or when arity is negative (optional or splat
# parameters present) and at least two *positional* parameters are declared.
# Keyword, keyword-splat and block parameters are not counted: they make
# arity negative and appear in #parameters, but cannot receive a second
# positional argument, so counting them made lambdas such as
# `->(out, **opts)` raise ArgumentError when called with two arguments.
#
# @return [Boolean]
def callable_accepts_input?
  arity = @callable.arity
  return true if arity == 2
  return false unless arity.negative?

  positional = @callable.parameters.count { |kind, _name| %i[req opt rest].include?(kind) }
  positional >= 2
end
|
|
29
|
+
|
|
30
|
+
# Emits a warning (via Kernel#warn) that the callable returned nil, with a
# hint about the usual cause.
def warn_nil_result
  message = "[ruby_llm-contract] verify/evaluator proc returned nil. " \
            "This usually means a key mismatch (string vs symbol). " \
            "Output keys are always symbols."
  warn(message)
end
|
|
24
35
|
|
|
36
|
+
def build_evaluation_result(result)
|
|
25
37
|
case result
|
|
26
38
|
when true
|
|
27
39
|
EvaluationResult.new(score: 1.0, passed: true, details: "passed")
|
|
@@ -4,21 +4,24 @@ module RubyLLM
|
|
|
4
4
|
module Contract
|
|
5
5
|
module Eval
|
|
6
6
|
module Evaluator
|
|
7
|
+
# Matches a regex against the flattened textual representation of output.
class Regex
  # @param pattern [Regexp, Object] a Regexp is used as-is; anything else is
  #   passed to Regexp.new
  def initialize(pattern)
    @pattern = pattern.is_a?(::Regexp) ? pattern : ::Regexp.new(pattern)
  end

  # @param output [Object] Hash values are joined with spaces; anything else
  #   is converted with #to_s before matching
  # @param expected [Object, nil] accepted for interface parity; not used
  # @param input [Object, nil] accepted for interface parity; not used
  # @return [EvaluationResult] score 1.0 when the pattern matches, else 0.0
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    # Decide pass/fail from the match itself; the message is derived from it,
    # not the other way round.
    matched = text_for(output).match?(@pattern)
    verdict = matched ? "matches" : "does not match"

    EvaluationResult.new(
      score: matched ? 1.0 : 0.0,
      passed: matched,
      details: "#{verdict} #{@pattern.inspect}"
    )
  end

  private

  # Text the pattern is applied to.
  def text_for(output)
    output.is_a?(Hash) ? output.values.join(" ") : output.to_s
  end
end
|
|
24
27
|
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Picks an evaluator from the type of the expected value and runs it.
#
# Hash expectations use Evaluator::JsonIncludes, Regexp expectations use
# Evaluator::Regex (built from the expected pattern), and everything else
# uses Evaluator::Exact.
class ExpectationEvaluator
  # @param output [Object] value produced by the step
  # @param expected [Object] expectation; its class selects the evaluator
  # @param input [Object, nil] forwarded unchanged; optional, matching the
  #   `input: nil` default of the evaluators this class dispatches to
  # @return [Object] whatever the selected evaluator's #call returns
  def call(output:, expected:, input: nil)
    evaluator_for(expected).call(output: output, expected: expected, input: input)
  end

  private

  def evaluator_for(expected)
    case expected
    when Hash
      Evaluator::JsonIncludes.new
    when ::Regexp
      Evaluator::Regex.new(expected)
    else
      Evaluator::Exact.new
    end
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "forwardable"
|
|
4
|
+
|
|
5
|
+
module RubyLLM
|
|
6
|
+
module Contract
|
|
7
|
+
module Eval
|
|
8
|
+
# Frozen facade over a prompt A/B comparison between a candidate report and a
# baseline report.
#
# Construction serializes both reports into case lists, builds the baseline
# diff, the comparator and the presenter, then freezes the instance. Public
# queries are delegated to those collaborators.
class PromptDiff
  extend Forwardable

  attr_reader :candidate_report, :baseline_report

  # Pass/fail level differences come from the baseline diff.
  def_delegators :@diff, :improvements, :regressions, :score_delta, :removed_passing_cases

  # Safety and mismatch rules come from the comparator.
  def_delegators :@comparator, :safe_to_switch?, :case_names_match?, :cases_comparable?
  def_delegators :@comparator, :mismatched_cases, :input_mismatches, :expected_mismatches, :score_regressions
  def_delegators :@comparator, :candidate_score, :baseline_score, :baseline_empty?, :candidate_empty?

  # Console rendering comes from the presenter.
  def_delegator :@presenter, :print_summary

  # @param candidate [Object] report for the candidate prompt
  # @param baseline [Object] report for the baseline prompt
  def initialize(candidate:, baseline:)
    @candidate_report = candidate
    @baseline_report = baseline

    to_cases = PromptDiffSerializer.new
    current = to_cases.call(candidate)
    reference = to_cases.call(baseline)

    @diff = BaselineDiff.new(baseline_cases: reference, current_cases: current)
    @comparator = PromptDiffComparator.new(candidate_cases: current, baseline_cases: reference, diff: @diff)
    @presenter = PromptDiffPresenter.new(prompt_diff: self, comparator: @comparator)
    freeze
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Encapsulates the safety and mismatch rules for prompt A/B comparison.
#
# Cases are hashes read through the :name, :input, :expected and :score keys.
# Baseline cases are indexed by name; when several baseline cases share a
# name, the last one is the one candidate cases are paired with.
class PromptDiffComparator
  # @param candidate_cases [Array<Hash>] cases from the candidate run
  # @param baseline_cases [Array<Hash>] cases from the baseline run
  # @param diff [#regressed?, #current_score, #baseline_score] aggregate diff
  def initialize(candidate_cases:, baseline_cases:, diff:)
    @candidate_cases = candidate_cases
    @baseline_cases = baseline_cases
    @diff = diff
    @baseline_case_index = baseline_cases.to_h { |case_data| [case_data[:name], case_data] }
  end

  # True only when both sides have cases, the case sets are comparable, no
  # per-case score dropped, and the aggregate diff reports no regression.
  def safe_to_switch?
    return false if empty_comparison?
    return false unless cases_comparable?
    return false if score_regressions.any?

    !@diff.regressed?
  end

  # Whether both sides contain the same (sorted) list of case names.
  def case_names_match?
    case_names(@baseline_cases) == case_names(@candidate_cases)
  end

  # Whether both sides contain the same cases by name, input and expected.
  def cases_comparable?
    case_signatures(@baseline_cases) == case_signatures(@candidate_cases)
  end

  # @return [Hash] names present on only one side, under :only_in_baseline
  #   and :only_in_candidate
  def mismatched_cases
    baseline_names = case_names(@baseline_cases)
    candidate_names = case_names(@candidate_cases)

    {
      only_in_baseline: baseline_names - candidate_names,
      only_in_candidate: candidate_names - baseline_names
    }
  end

  # @return [Array<Hash>] one entry per shared case whose inputs differ
  def input_mismatches
    attribute_mismatches(:input, :baseline_input, :candidate_input)
  end

  # @return [Array<Hash>] one entry per shared case whose expected values differ
  def expected_mismatches
    attribute_mismatches(:expected, :baseline_expected, :candidate_expected)
  end

  # @return [Array<Hash>] one entry per shared case whose candidate score is
  #   strictly lower than its baseline score; :delta is rounded to 4 places
  def score_regressions
    paired_cases.filter_map do |candidate_case, baseline_case|
      before = baseline_case[:score]
      after = candidate_case[:score]
      next unless after < before

      {
        case: candidate_case[:name],
        baseline_score: before,
        candidate_score: after,
        delta: (after - before).round(4)
      }
    end
  end

  # Aggregate candidate score, as reported by the diff's current_score.
  def candidate_score
    @diff.current_score
  end

  # Aggregate baseline score, as reported by the diff.
  def baseline_score
    @diff.baseline_score
  end

  def candidate_empty?
    @candidate_cases.empty?
  end

  def baseline_empty?
    @baseline_cases.empty?
  end

  # True when either side has no cases at all.
  def empty_comparison?
    baseline_empty? || candidate_empty?
  end

  private

  def case_names(cases)
    cases.map { |case_data| case_data[:name] }.sort
  end

  # Sorted [name, input, expected] triples. Ties on name are broken by the
  # inspected input/expected values: sort_by gives no ordering guarantee for
  # equal keys, so sorting on the name alone would make the comparison of
  # two lists with duplicate names depend on their original order.
  def case_signatures(cases)
    cases
      .map { |case_data| [case_data[:name], case_data[:input], case_data[:expected]] }
      .sort_by { |name, input, expected| [name, input.inspect, expected.inspect] }
  end

  # Candidate cases paired with the baseline case of the same name;
  # candidates without a baseline counterpart are dropped.
  def paired_cases
    @candidate_cases.filter_map do |candidate_case|
      baseline_case = @baseline_case_index[candidate_case[:name]]
      [candidate_case, baseline_case] if baseline_case
    end
  end

  # Entries for shared cases whose +attribute+ differs between the two sides,
  # reported under the given baseline/candidate keys.
  def attribute_mismatches(attribute, baseline_key, candidate_key)
    paired_cases.filter_map do |candidate_case, baseline_case|
      baseline_value = baseline_case[attribute]
      candidate_value = candidate_case[attribute]
      next if baseline_value == candidate_value

      {
        case: candidate_case[:name],
        baseline_key => baseline_value,
        candidate_key => candidate_value
      }
    end
  end
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Renders a prompt diff as a readable console summary.
#
# Output order: score header, optional empty-side warning, case-set / input /
# expected mismatches, regressions, score drops, improvements, removed
# passing cases, and the final "Safe to switch" verdict. Empty sections are
# omitted.
class PromptDiffPresenter
  VARIANT_LABEL_WIDTH = 12
  TABLE_WIDTH = 26
  CASE_SET_MISMATCH_TITLE = " Case set mismatch (safe_to_switch? = NO):"
  INPUT_MISMATCH_TITLE = " Input mismatch (safe_to_switch? = NO):"
  EXPECTED_MISMATCH_TITLE = " Expected mismatch (safe_to_switch? = NO):"
  REGRESSIONS_TITLE = " Regressions (PASS -> FAIL):"
  SCORE_DROPS_TITLE = " Score drops:"
  IMPROVEMENTS_TITLE = " Improvements:"
  REMOVED_PASSING_TITLE = " Removed (were passing in baseline):"

  # @param prompt_diff [#score_delta, #regressions, #improvements, #removed_passing_cases]
  # @param comparator [Object] source of scores, mismatches and the verdict
  def initialize(prompt_diff:, comparator:)
    @prompt_diff = prompt_diff
    @comparator = comparator
  end

  # Writes the whole summary to +io+.
  #
  # @param io [#puts] destination; defaults to $stdout
  def print_summary(io = $stdout)
    print_header(io)
    print_warning(io, "one side has no evaluated cases (all skipped?)") if @comparator.empty_comparison?
    print_case_set_mismatch(io)
    print_comparability_sections(io)
    print_outcome_sections(io)

    verdict = @comparator.safe_to_switch? ? "YES" : "NO"
    io.puts " Safe to switch: #{verdict}"
  end

  private

  # Input and expected-value mismatches between the two case sets.
  def print_comparability_sections(io)
    print_formatted_section(io, INPUT_MISMATCH_TITLE, @comparator.input_mismatches) do |entry|
      "#{entry[:case]}: inputs differ between candidate and baseline"
    end
    print_formatted_section(io, EXPECTED_MISMATCH_TITLE, @comparator.expected_mismatches) do |entry|
      "#{entry[:case]}: expected values differ between candidate and baseline"
    end
  end

  # Regressions, score drops, improvements and removed passing cases.
  def print_outcome_sections(io)
    print_formatted_section(io, REGRESSIONS_TITLE, @prompt_diff.regressions) do |entry|
      "#{entry[:case]}: was PASS, now FAIL -- #{entry[:detail]}"
    end
    print_formatted_section(io, SCORE_DROPS_TITLE, @comparator.score_regressions) do |drop|
      "#{drop[:case]}: #{drop[:baseline_score]} -> #{drop[:candidate_score]} (#{drop[:delta]})"
    end
    print_formatted_section(io, IMPROVEMENTS_TITLE, @prompt_diff.improvements) do |entry|
      "#{entry[:case]}: was FAIL, now PASS"
    end
    print_formatted_section(io, REMOVED_PASSING_TITLE, @prompt_diff.removed_passing_cases, &:to_s)
  end

  # Title, two-row score table and the overall score delta.
  def print_header(io)
    score_row = " %-#{VARIANT_LABEL_WIDTH}s %.2f"

    emit_lines(
      io,
      [
        "Prompt A/B comparison",
        nil,
        format(" %-#{VARIANT_LABEL_WIDTH}s Score", "Variant"),
        " #{"-" * TABLE_WIDTH}",
        format(score_row, "Candidate", @comparator.candidate_score),
        format(score_row, "Baseline", @comparator.baseline_score),
        nil,
        " Score delta: #{format_delta(@prompt_diff.score_delta)}",
        nil
      ]
    )
  end

  def print_warning(io, message)
    emit_lines(io, [" WARNING: #{message}", nil])
  end

  # Lists case names present on only one side; silent when the names match.
  def print_case_set_mismatch(io)
    return if @comparator.case_names_match?

    gaps = @comparator.mismatched_cases
    baseline_only = gaps[:only_in_baseline].map { |name| "only in baseline: #{name}" }
    candidate_only = gaps[:only_in_candidate].map { |name| "only in candidate: #{name}" }
    emit_section(io, CASE_SET_MISMATCH_TITLE, baseline_only + candidate_only)
  end

  # Prints a titled section with one line per entry, formatted by the given
  # block; prints nothing for an empty collection.
  def print_formatted_section(io, title, collection, &formatter)
    return if collection.empty?

    emit_section(io, title, collection.map(&formatter))
  end

  # Title, prefixed body lines, then a blank line.
  def emit_section(io, title, lines)
    body = lines.map { |line| " #{line}" }
    emit_lines(io, [title, *body, nil])
  end

  # Writes each line with puts; nil entries become blank lines.
  def emit_lines(io, lines)
    lines.each do |line|
      if line.nil?
        io.puts
      else
        io.puts(line)
      end
    end
  end

  # Non-negative deltas get an explicit "+" sign.
  def format_delta(delta)
    return delta.to_s unless delta >= 0

    "+#{delta}"
  end
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Normalizes report results into comparable prompt-diff case hashes.
class PromptDiffSerializer
  # Converts every non-skipped result of +report+ into a plain hash.
  #
  # @param report [#results] report whose results respond to step_status,
  #   name, input, expected, passed?, score and details
  # @return [Array<Hash>] hashes with :name, :input, :expected, :passed,
  #   :score and :details keys, in the order of report.results; results whose
  #   step_status is :skipped are left out
  def call(report)
    # Single pass: filter and transform together instead of building an
    # intermediate array with reject before map.
    report.results.filter_map do |result|
      next if result.step_status == :skipped

      {
        name: result.name,
        input: result.input,
        expected: result.expected,
        passed: result.passed?,
        score: result.score,
        details: result.details
      }
    end
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|