ruby_llm-contract 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +55 -0
- data/CHANGELOG.md +76 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +176 -0
- data/LICENSE +21 -0
- data/README.md +154 -0
- data/Rakefile +8 -0
- data/examples/00_basics.rb +500 -0
- data/examples/01_classify_threads.rb +220 -0
- data/examples/02_generate_comment.rb +203 -0
- data/examples/03_target_audience.rb +201 -0
- data/examples/04_real_llm.rb +410 -0
- data/examples/05_output_schema.rb +258 -0
- data/examples/07_keyword_extraction.rb +239 -0
- data/examples/08_translation.rb +353 -0
- data/examples/09_eval_dataset.rb +287 -0
- data/examples/10_reddit_full_showcase.rb +363 -0
- data/examples/README.md +140 -0
- data/lib/ruby_llm/contract/adapters/base.rb +13 -0
- data/lib/ruby_llm/contract/adapters/response.rb +17 -0
- data/lib/ruby_llm/contract/adapters/ruby_llm.rb +94 -0
- data/lib/ruby_llm/contract/adapters/test.rb +44 -0
- data/lib/ruby_llm/contract/adapters.rb +6 -0
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +17 -0
- data/lib/ruby_llm/contract/concerns/eval_host.rb +109 -0
- data/lib/ruby_llm/contract/concerns/trace_equality.rb +15 -0
- data/lib/ruby_llm/contract/concerns/usage_aggregator.rb +43 -0
- data/lib/ruby_llm/contract/configuration.rb +21 -0
- data/lib/ruby_llm/contract/contract/definition.rb +39 -0
- data/lib/ruby_llm/contract/contract/invariant.rb +23 -0
- data/lib/ruby_llm/contract/contract/parser.rb +143 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +239 -0
- data/lib/ruby_llm/contract/contract/validator.rb +104 -0
- data/lib/ruby_llm/contract/contract.rb +7 -0
- data/lib/ruby_llm/contract/cost_calculator.rb +38 -0
- data/lib/ruby_llm/contract/dsl.rb +13 -0
- data/lib/ruby_llm/contract/errors.rb +19 -0
- data/lib/ruby_llm/contract/eval/case_result.rb +76 -0
- data/lib/ruby_llm/contract/eval/contract_detail_builder.rb +47 -0
- data/lib/ruby_llm/contract/eval/dataset.rb +53 -0
- data/lib/ruby_llm/contract/eval/eval_definition.rb +112 -0
- data/lib/ruby_llm/contract/eval/evaluation_result.rb +27 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +20 -0
- data/lib/ruby_llm/contract/eval/evaluator/json_includes.rb +58 -0
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +40 -0
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +27 -0
- data/lib/ruby_llm/contract/eval/model_comparison.rb +80 -0
- data/lib/ruby_llm/contract/eval/pipeline_result_adapter.rb +15 -0
- data/lib/ruby_llm/contract/eval/report.rb +115 -0
- data/lib/ruby_llm/contract/eval/runner.rb +162 -0
- data/lib/ruby_llm/contract/eval/trait_evaluator.rb +75 -0
- data/lib/ruby_llm/contract/eval.rb +16 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +62 -0
- data/lib/ruby_llm/contract/pipeline/result.rb +131 -0
- data/lib/ruby_llm/contract/pipeline/runner.rb +139 -0
- data/lib/ruby_llm/contract/pipeline/trace.rb +72 -0
- data/lib/ruby_llm/contract/pipeline.rb +6 -0
- data/lib/ruby_llm/contract/prompt/ast.rb +38 -0
- data/lib/ruby_llm/contract/prompt/builder.rb +47 -0
- data/lib/ruby_llm/contract/prompt/node.rb +25 -0
- data/lib/ruby_llm/contract/prompt/nodes/example_node.rb +27 -0
- data/lib/ruby_llm/contract/prompt/nodes/rule_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/section_node.rb +26 -0
- data/lib/ruby_llm/contract/prompt/nodes/system_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes/user_node.rb +15 -0
- data/lib/ruby_llm/contract/prompt/nodes.rb +7 -0
- data/lib/ruby_llm/contract/prompt/renderer.rb +76 -0
- data/lib/ruby_llm/contract/railtie.rb +20 -0
- data/lib/ruby_llm/contract/rake_task.rb +78 -0
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +96 -0
- data/lib/ruby_llm/contract/rspec/satisfy_contract.rb +31 -0
- data/lib/ruby_llm/contract/rspec.rb +6 -0
- data/lib/ruby_llm/contract/step/base.rb +138 -0
- data/lib/ruby_llm/contract/step/dsl.rb +144 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +64 -0
- data/lib/ruby_llm/contract/step/result.rb +38 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +90 -0
- data/lib/ruby_llm/contract/step/retry_policy.rb +76 -0
- data/lib/ruby_llm/contract/step/runner.rb +126 -0
- data/lib/ruby_llm/contract/step/trace.rb +70 -0
- data/lib/ruby_llm/contract/step.rb +10 -0
- data/lib/ruby_llm/contract/token_estimator.rb +19 -0
- data/lib/ruby_llm/contract/types.rb +11 -0
- data/lib/ruby_llm/contract/version.rb +7 -0
- data/lib/ruby_llm/contract.rb +108 -0
- data/ruby_llm-contract.gemspec +33 -0
- metadata +172 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# A named, ordered list of evaluation cases built through a small DSL.
#
# The optional block given to the constructor is run with +instance_eval+,
# so +add_case+ is called without a receiver:
#
#   Dataset.define("smoke") do
#     add_case "named", input: {...}, expected: {...}
#     add_case "traits", input: {...}, expected_traits: {...}
#     add_case input: {...}, evaluator: proc
#   end
class Dataset
  attr_reader :name, :cases

  # Shorthand for +new+; forwards the name and the DSL block unchanged.
  def self.define(name = "unnamed", &)
    new(name, &)
  end

  # @param name [Object] label for the dataset (defaults to "unnamed")
  # @yield optional DSL block, evaluated against the new instance
  def initialize(name = "unnamed", &block)
    @name = name
    @cases = []
    instance_eval(&block) if block
  end

  # Appends one Case to the dataset.
  #
  # @param name [Object, nil] case label; when nil a positional label
  #   "case_<n>" is generated from the current number of cases
  # @param input [Object] required input for the case
  # @param expected [Object, nil] stored on the case as-is
  # @param expected_traits [Object, nil] stored on the case as-is
  # @param evaluator [Object, nil] stored on the case as-is
  # @return [Array<Case>] the full list of cases after appending
  def add_case(name = nil, input:, expected: nil, expected_traits: nil, evaluator: nil)
    @cases << Case.new(
      name: name || "case_#{@cases.length + 1}",
      input: input,
      expected: expected,
      expected_traits: expected_traits,
      evaluator: evaluator
    )
  end
end
|
|
38
|
+
|
|
39
|
+
# Frozen record holding the data of a single evaluation case.
# Only +name+ and +input+ are mandatory; the remaining fields default to nil.
class Case
  attr_reader :name, :input, :expected, :expected_traits, :evaluator

  # Stores every argument as given, then freezes the instance so it cannot
  # be reassigned afterwards.
  def initialize(name:, input:, expected: nil, expected_traits: nil, evaluator: nil)
    @evaluator = evaluator
    @expected_traits = expected_traits
    @expected = expected
    @input = input
    @name = name

    freeze
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# DSL object describing one named eval: a default input, an optional canned
# sample response and a list of cases. The block handed to the constructor is
# run with +instance_eval+, so the DSL methods below are called without a
# receiver.
class EvalDefinition
  attr_reader :name, :cases

  # @param name [Object] label of the eval; reused as the dataset name
  # @param step_class [Object, nil] when present and responding to
  #   +output_schema+, a sample response is checked against that schema
  # @yield optional DSL block
  def initialize(name, step_class: nil, &block)
    @name = name
    @step_class = step_class
    @default_input = nil
    @sample_response = nil
    @cases = []
    # instance_eval raises ArgumentError when it receives neither a block nor
    # a string, so a definition created without a block must skip it.
    instance_eval(&block) if block
  end

  # DSL: sets the input used by cases that do not pass their own +input:+.
  def default_input(input)
    @default_input = input
  end

  # DSL: records a canned response and, when a step class was given,
  # validates it against the step's output schema right away.
  #
  # @raise [ArgumentError] if the sample does not satisfy the schema
  def sample_response(response)
    @sample_response = response
    pre_validate_sample! if @step_class
  end

  # Builds a test adapter that replies with the sample response
  # (strings are passed through, anything else is serialized with +to_json+).
  #
  # @return [Adapters::Test, nil] nil when no sample response was recorded
  def build_adapter
    return nil unless @sample_response

    Adapters::Test.new(response: @sample_response.is_a?(String) ? @sample_response : @sample_response.to_json)
  end

  # DSL: registers a case with explicit expectations.
  #
  # @raise [ArgumentError] when neither +input:+ nor a default input is set
  def add_case(description, input: nil, expected: nil, expected_traits: nil, evaluator: nil)
    case_input = input || @default_input
    raise ArgumentError, "add_case requires input (set default_input or pass input:)" unless case_input

    @cases << {
      name: description,
      input: case_input,
      expected: expected,
      expected_traits: expected_traits,
      evaluator: evaluator
    }
  end

  # DSL: registers a case from a single expectation, given either
  # positionally or through +expect:+ (never both). A Proc becomes the
  # case's evaluator; any other value becomes its +expected+.
  #
  # @raise [ArgumentError] when both or neither expectation form is given,
  #   or when no input is available
  def verify(description, expected_or_proc = nil, input: nil, expect: nil)
    if expected_or_proc && expect
      raise ArgumentError, "verify accepts either a positional argument or expect: keyword, not both"
    end

    expected_or_proc = expect if expect
    case_input = input || @default_input
    validate_verify_args!(expected_or_proc, case_input)

    evaluator = expected_or_proc.is_a?(::Proc) ? expected_or_proc : nil

    @cases << {
      name: description,
      input: case_input,
      expected: evaluator ? nil : expected_or_proc,
      evaluator: evaluator
    }
  end

  # Converts the recorded cases into a Dataset named after this eval.
  def build_dataset
    # Copied into locals because the Dataset block runs under instance_eval,
    # where this object's methods and instance variables are out of reach.
    eval_cases = effective_cases
    eval_name = @name
    Dataset.define(eval_name) do
      eval_cases.each do |eval_case|
        add_case(eval_case[:name], input: eval_case[:input], expected: eval_case[:expected],
                                   expected_traits: eval_case[:expected_traits],
                                   evaluator: eval_case[:evaluator])
      end
    end
  end

  private

  # The recorded cases, or a single expectation-free "contract check" case
  # when none were recorded but a default input exists; otherwise empty.
  def effective_cases
    return @cases if @cases.any?
    return [] unless @default_input

    # Zero-verify: auto-add a contract check case
    [{ name: "contract check", input: @default_input, expected: nil, evaluator: nil }]
  end

  def validate_verify_args!(expected_or_proc, case_input)
    raise ArgumentError, "verify requires either a positional argument or expect: keyword" unless expected_or_proc
    raise ArgumentError, "verify requires input (set default_input or pass input:)" unless case_input
  end

  # Raises when the sample response violates the step's output schema.
  # Does nothing when the step exposes no schema or the sample is not JSON.
  def pre_validate_sample!
    schema = @step_class.respond_to?(:output_schema) ? @step_class.output_schema : nil
    return unless schema

    errors = validate_sample_against_schema(schema)
    return if errors.empty?

    raise ArgumentError, "sample_response does not satisfy step schema: #{errors.join(", ")}"
  rescue JSON::ParserError
    # Not JSON -- skip pre-validation
  end

  # Hashes are validated as-is; anything else is parsed from its string form.
  def validate_sample_against_schema(schema)
    response_hash = @sample_response.is_a?(Hash) ? @sample_response : JSON.parse(@sample_response.to_s)
    symbolized = Parser.symbolize_keys(response_hash)
    SchemaValidator.validate(symbolized, schema)
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Frozen outcome of a single evaluation: a score kept within 0.0..1.0,
# a pass flag, a short label and optional free-form details.
class EvaluationResult
  attr_reader :score, :passed, :label, :details

  # @param score [#to_f] converted to Float and clamped to 0.0..1.0
  # @param passed [Object] stored as given; its truthiness picks the default label
  # @param label [String, nil] defaults to "PASS" or "FAIL"
  # @param details [Object, nil] optional explanation shown by #to_s
  def initialize(score:, passed:, label: nil, details: nil)
    @details = details
    @passed = passed
    @label = if label
               label
             elsif passed
               "PASS"
             else
               "FAIL"
             end
    @score = score.to_f.clamp(0.0, 1.0)
    freeze
  end

  # True when the pass flag is falsy.
  def failed?
    @passed ? false : true
  end

  # Renders the label and score, followed by the details when present.
  def to_s
    suffix = @details ? " — #{@details}" : ""
    "#{@label} (score: #{@score}#{suffix})"
  end
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
module Evaluator
|
|
7
|
+
# Evaluator that passes only when the output equals the expected value (+==+).
class Exact
  # @param output [Object] value produced by the step
  # @param expected [Object] value it must equal
  # @param input [Object, nil] accepted for interface parity; not used
  # @return [EvaluationResult] score 1.0 on equality, 0.0 otherwise
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    unless output == expected
      mismatch = "expected #{expected.inspect}, got #{output.inspect}"
      return EvaluationResult.new(score: 0.0, passed: false, details: mismatch)
    end

    EvaluationResult.new(score: 1.0, passed: true, details: "exact match")
  end
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
module Evaluator
|
|
7
|
+
# Evaluator that checks a Hash output contains every key of the expected
# Hash with a matching value. Extra keys in the output are ignored.
# An expected Regexp is matched against the string form of the actual value;
# any other expected value is compared with +!=+.
class JsonIncludes
  # @param output [Object] value produced by the step; must be a Hash
  # @param expected [Object] required keys and values; must be a Hash
  # @param input [Object, nil] accepted for interface parity; not used
  # @return [EvaluationResult] passes when every expected key matches; on
  #   failure the score is the fraction of expected keys that matched
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    return type_error(output, expected) unless output.is_a?(Hash) && expected.is_a?(Hash)

    errors = check_keys(output, expected)
    build_result(errors, expected.length)
  end

  private

  # Collects one error message per expected key that is absent or mismatched.
  def check_keys(output, expected)
    expected.filter_map { |key, value| check_single_key(output, key, value) }
  end

  # Returns an error message for +key+, or nil when it matches.
  def check_single_key(output, key, expected_value)
    # Ask the hash whether the key exists instead of testing the value for
    # nil, so a key that is present with a nil value is not reported as
    # missing (and `expected: { k: nil }` can actually pass).
    return "missing key: #{key}" unless output.key?(key)

    actual = output[key]
    mismatch_message(key, expected_value, actual) if mismatch?(expected_value, actual)
  end

  # A nil value never satisfies a pattern; other values are matched on to_s.
  def mismatch?(expected_value, actual)
    return actual.nil? || !actual.to_s.match?(expected_value) if expected_value.is_a?(::Regexp)

    actual != expected_value
  end

  def mismatch_message(key, expected_value, actual)
    "#{key}: expected #{expected_value.inspect}, got #{actual.inspect}"
  end

  def build_result(errors, total)
    if errors.empty?
      return EvaluationResult.new(score: 1.0, passed: true,
                                  details: "all expected keys present and matching")
    end

    # Every error stems from one expected key, so total is at least 1 here.
    matched = total - errors.length
    EvaluationResult.new(score: matched.to_f / total, passed: false, details: errors.join("; "))
  end

  def type_error(output, expected)
    EvaluationResult.new(score: 0.0, passed: false,
                         details: "expected Hash, got #{output.class} and #{expected.class}")
  end
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
module Evaluator
|
|
7
|
+
# Evaluator that delegates the verdict to a user-supplied callable.
#
# The callable receives the output and, when its signature asks for a second
# argument, the input as well. Its return value is mapped as follows:
# true/false => pass/fail, Numeric => score (passes at 0.5 or more),
# anything else => pass when truthy.
class ProcEvaluator
  # @param callable [#call] a Proc, lambda, Method or any object with #call
  def initialize(callable)
    @callable = callable
  end

  # @param output [Object] value produced by the step
  # @param expected [Object, nil] accepted for interface parity; not used
  # @param input [Object, nil] forwarded to two-argument callables
  # @return [EvaluationResult]
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    result = invoke(output, input)
    warn_nil_result if result.nil?
    build_result(result)
  end

  private

  # Calls the callable with (output, input) or just (output).
  def invoke(output, input)
    wants_input? ? @callable.call(output, input) : @callable.call(output)
  end

  # True when the callable takes exactly two arguments, or has optional
  # arguments and declares at least two parameters.
  def wants_input?
    # Plain objects that only implement #call have no #arity of their own;
    # inspect the signature of their #call method instead.
    signature = @callable.respond_to?(:arity) ? @callable : @callable.method(:call)
    signature.arity == 2 || (signature.arity.negative? && signature.parameters.length >= 2)
  end

  def warn_nil_result
    warn "[ruby_llm-contract] verify/evaluator proc returned nil. " \
         "This usually means a key mismatch (string vs symbol). " \
         "Output keys are always symbols."
  end

  # Maps the callable's return value onto an EvaluationResult.
  def build_result(result)
    case result
    when true
      EvaluationResult.new(score: 1.0, passed: true, details: "passed")
    when false
      EvaluationResult.new(score: 0.0, passed: false, details: "not passed")
    when Numeric
      EvaluationResult.new(score: result, passed: result >= 0.5, details: "custom score: #{result}")
    else
      EvaluationResult.new(score: result ? 1.0 : 0.0, passed: !!result, details: "custom: #{result}")
    end
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
module Evaluator
|
|
7
|
+
# Evaluator that passes when the output text matches a pattern.
# A Hash output is flattened to its values joined by single spaces;
# any other output is converted with +to_s+.
class Regex
  # @param pattern [Regexp, String] non-Regexp patterns are compiled with Regexp.new
  def initialize(pattern)
    @pattern = case pattern
               when ::Regexp then pattern
               else ::Regexp.new(pattern)
               end
  end

  # @param output [Object] value produced by the step
  # @param expected [Object, nil] accepted for interface parity; not used
  # @param input [Object, nil] accepted for interface parity; not used
  # @return [EvaluationResult] score 1.0 on a match, 0.0 otherwise
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    haystack = output.is_a?(Hash) ? output.values.join(" ") : output.to_s
    matched = haystack.match?(@pattern)
    verdict = matched ? "matches" : "does not match"

    EvaluationResult.new(score: matched ? 1.0 : 0.0, passed: matched,
                         details: "#{verdict} #{@pattern.inspect}")
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Frozen side-by-side view of the reports produced by running one eval
# against several models. Reports are expected to respond to +score+,
# +total_cost+, +avg_latency_ms+, +pass_rate+ and +passed?+.
class ModelComparison
  attr_reader :eval_name, :reports

  # @param eval_name [Object] label printed in the summary heading
  # @param reports [Hash] { "model_name" => Report }
  def initialize(eval_name:, reports:)
    @eval_name = eval_name
    # Freeze a private copy: freezing the argument itself would make the
    # caller's own hash immutable as a side effect of building a comparison.
    @reports = reports.dup.freeze
    freeze
  end

  # @return [Array] model names in insertion order
  def models
    @reports.keys
  end

  # @return [Numeric, nil] score of +model+, nil for an unknown model
  def score_for(model)
    @reports[model]&.score
  end

  # @return [Numeric, nil] total cost of +model+, nil for an unknown model
  def cost_for(model)
    @reports[model]&.total_cost
  end

  # Cheapest model whose score is above zero and at least +min_score+.
  #
  # @return [Object, nil] the model name, or nil when no model qualifies
  def best_for(min_score: 0.0)
    eligible = @reports.select { |_, report| report.score > 0.0 && report.score >= min_score }
    return nil if eligible.empty?

    eligible.min_by { |_, report| report.total_cost }&.first
  end

  # Total cost divided by score for each model; infinity when the score is
  # not positive.
  def cost_per_point
    @reports.transform_values do |report|
      report.score.positive? ? report.total_cost / report.score : Float::INFINITY
    end
  end

  # Plain-text table with one row per model. Missing latency and
  # non-positive cost are shown as "n/a".
  def table
    lines = [" Model Score Cost Avg Latency"]
    lines << " #{"-" * 57}"

    @reports.each do |model, report|
      latency = report.avg_latency_ms ? "#{report.avg_latency_ms.round}ms" : "n/a"
      cost = report.total_cost.positive? ? "$#{format("%.4f", report.total_cost)}" : "n/a"
      lines << format(" %-25s %6.2f %10s %12s", model, report.score, cost, latency)
    end

    lines.join("\n")
  end

  # Writes the heading, the table and the best_for picks to +io+.
  def print_summary(io = $stdout)
    io.puts "#{@eval_name} — model comparison"
    io.puts
    io.puts table
    io.puts

    best = best_for(min_score: 0.0)
    io.puts " Best overall: #{best}" if best

    cheapest_passing = best_for(min_score: 1.0)
    io.puts " Cheapest at 100%: #{cheapest_passing}" if cheapest_passing
  end

  # @return [Hash] per-model hash of score, cost, latency and pass data
  def to_h
    @reports.transform_values do |report|
      {
        score: report.score,
        total_cost: report.total_cost,
        avg_latency_ms: report.avg_latency_ms,
        pass_rate: report.pass_rate,
        passed: report.passed?
      }
    end
  end
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Struct-backed adapter that presents a Pipeline::Result through the
# interface of a Step::Result.
# Used instead of OpenStruct in Runner#normalize_pipeline_result.
PipelineResultAdapter = Struct.new(:status, :ok_flag, :parsed_output, :validation_errors, :trace) do
  # Predicate form of the +ok_flag+ member.
  alias_method :ok?, :ok_flag
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Frozen summary of one eval run over a dataset.
#
# Each result is expected to respond to +name+, +score+, +passed?+,
# +failed?+, +step_status+, +cost+, +duration_ms+, +details+ and +label+.
# Results whose +step_status+ is +:skipped+ are left out of the score and
# of the pass/fail counts, but still count toward cost and latency.
class Report
  # Detail strings that carry no information beyond pass/fail.
  GENERIC_DETAILS = ["passed", "not passed"].freeze

  attr_reader :dataset_name, :results

  # @param dataset_name [Object] label used in the summary line
  # @param results [Array] per-case results
  def initialize(dataset_name:, results:)
    @dataset_name = dataset_name
    # Freeze a private copy: freezing the argument itself would make the
    # caller's own array immutable as a side effect of building a report.
    @results = results.dup.freeze
    freeze
  end

  # Mean score of the evaluated (non-skipped) results.
  #
  # @return [Float] 0.0 when nothing was evaluated
  def score
    evaluated = evaluated_results
    return 0.0 if evaluated.empty?

    # to_f guards against integer division when scores are Integers.
    evaluated.sum(&:score).to_f / evaluated.length
  end

  # @return [Integer] number of evaluated results that passed
  def passed
    evaluated_results.count(&:passed?)
  end

  # @return [Integer] number of evaluated results that failed
  def failed
    evaluated_results.count(&:failed?)
  end

  # @return [Integer] number of results with step_status :skipped
  def skipped
    results.count { |r| r.step_status == :skipped }
  end

  # @return [Array] evaluated results that failed
  def failures
    evaluated_results.select(&:failed?)
  end

  # @return [String] "<passed>/<evaluated>"
  def pass_rate
    "#{passed}/#{evaluated_results.length}"
  end

  # Sum of all result costs, treating a missing cost as zero.
  #
  # @return [Float] 0.0 when there are no results
  def total_cost
    results.sum(0.0) { |r| r.cost || 0.0 }
  end

  # @return [Float, nil] mean of the recorded durations, nil when none exist
  def avg_latency_ms
    latencies = results.filter_map(&:duration_ms)
    return nil if latencies.empty?

    latencies.sum.to_f / latencies.length
  end

  # True only when at least one result was evaluated and all of them passed.
  def passed?
    evaluated = evaluated_results
    return false if evaluated.empty?

    evaluated.all?(&:passed?)
  end

  # Iterates over every result, skipped ones included.
  def each(&)
    results.each(&)
  end

  # One-line summary: pass rate, then skipped count and cost when non-zero.
  def summary
    parts = ["#{dataset_name}: #{pass_rate} checks passed"]
    parts << "#{skipped} skipped" if skipped.positive?
    parts << format_cost(total_cost) if total_cost.positive?
    parts.join(", ")
  end

  # Summary line followed by one line per failure.
  def to_s
    lines = [summary]
    failures.each do |result|
      lines << format_failure(result)
    end
    lines.join("\n")
  end

  # Writes the summary and one line per result (with cost and latency when
  # available) to +io+; failed results also get their details when those
  # add information.
  def print_summary(io = $stdout)
    io.puts summary
    io.puts
    results.each do |result|
      icon = result.label
      cost_str = result.cost ? " #{format_cost(result.cost)}" : ""
      latency_str = result.duration_ms ? " #{result.duration_ms}ms" : ""
      io.puts " #{icon} #{result.name}#{cost_str}#{latency_str}"
      io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
    end
  end

  private

  def format_failure(result)
    line = " FAIL #{result.name}"
    line += ": #{result.details}" if useful_details?(result.details)
    line
  end

  # Details are worth printing when present and not a generic verdict.
  def useful_details?(details)
    details && !GENERIC_DETAILS.include?(details)
  end

  def evaluated_results
    results.reject { |r| r.step_status == :skipped }
  end

  def format_cost(cost)
    "$#{format("%.6f", cost)}"
  end
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
end
|