ruby_llm-contract 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.rubycritic.yml +8 -0
  3. data/.simplecov +22 -0
  4. data/CHANGELOG.md +19 -0
  5. data/Gemfile +2 -0
  6. data/Gemfile.lock +104 -2
  7. data/README.md +42 -2
  8. data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
  9. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
  10. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
  11. data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
  12. data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
  13. data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
  14. data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
  15. data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
  16. data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
  17. data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
  18. data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
  19. data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
  20. data/lib/ruby_llm/contract/contract/validator.rb +9 -0
  21. data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
  22. data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
  23. data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
  24. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
  25. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
  26. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
  27. data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
  28. data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
  29. data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
  30. data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
  31. data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
  32. data/lib/ruby_llm/contract/eval/report.rb +19 -191
  33. data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
  34. data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
  35. data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
  36. data/lib/ruby_llm/contract/eval/runner.rb +30 -207
  37. data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
  38. data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
  39. data/lib/ruby_llm/contract/eval.rb +13 -0
  40. data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
  41. data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
  42. data/lib/ruby_llm/contract/rspec.rb +5 -0
  43. data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
  44. data/lib/ruby_llm/contract/step/base.rb +93 -38
  45. data/lib/ruby_llm/contract/step/dsl.rb +10 -0
  46. data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
  47. data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
  48. data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
  49. data/lib/ruby_llm/contract/step/result.rb +3 -2
  50. data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
  51. data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
  52. data/lib/ruby_llm/contract/step/runner.rb +46 -85
  53. data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
  54. data/lib/ruby_llm/contract/step.rb +5 -0
  55. data/lib/ruby_llm/contract/version.rb +1 -1
  56. metadata +28 -1
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Turns one (test case, step result) pair into an EvaluationResult.
      #
      # A step result that is not ok is reported as a contract failure. Otherwise
      # the first applicable check wins, in this order: the case's custom
      # evaluator, its expected traits, its expected value, and finally a
      # contract-only pass.
      class CaseScorer
        include TraitEvaluator
        include ContractDetailBuilder

        # @param step the step under evaluation; stored but not read in this
        #   class body (NOTE(review): presumably used by the included modules — confirm)
        # @param expectation_evaluator [#call] scores an output against an
        #   expected value; defaults to a new ExpectationEvaluator
        def initialize(step:, expectation_evaluator: ExpectationEvaluator.new)
          @step = step
          @expectation_evaluator = expectation_evaluator
        end

        # @param test_case responds to evaluator, expected_traits, expected and input
        # @param step_result responds to ok?, status, validation_errors and parsed_output
        # @return the result of whichever check applied
        def call(test_case:, step_result:)
          return contract_failure(step_result) unless step_result.ok?
          return evaluate_with_custom(test_case, step_result) if test_case.evaluator
          # evaluate_traits is not defined here; presumably provided by TraitEvaluator.
          return evaluate_traits(step_result, test_case) if test_case.expected_traits
          return evaluate_contract_only if test_case.expected.nil?

          evaluate_expected(test_case, step_result)
        end

        private

        # Scores the parsed output against the case's expected value.
        def evaluate_expected(test_case, step_result)
          @expectation_evaluator.call(**evaluation_arguments(test_case, step_result))
        end

        # Scores the parsed output with the case's own evaluator. A bare Proc is
        # wrapped in Evaluator::ProcEvaluator; anything else is called as-is.
        def evaluate_with_custom(test_case, step_result)
          custom = test_case.evaluator
          custom = Evaluator::ProcEvaluator.new(custom) if custom.is_a?(::Proc)
          custom.call(**evaluation_arguments(test_case, step_result))
        end

        # Keyword arguments shared by every evaluator invocation.
        def evaluation_arguments(test_case, step_result)
          {
            output: step_result.parsed_output,
            expected: test_case.expected,
            input: test_case.input
          }
        end

        # A case with nothing to compare passes once the step itself succeeded.
        # build_contract_details is not defined here; presumably provided by
        # ContractDetailBuilder.
        def evaluate_contract_only
          EvaluationResult.new(score: 1.0, passed: true, details: build_contract_details)
        end

        # Zero-score result carrying the step status and its validation errors.
        def contract_failure(step_result)
          reasons = step_result.validation_errors.join(", ")
          EvaluationResult.new(
            score: 0.0,
            passed: false,
            details: "step failed: #{step_result.status} — #{reasons}"
          )
        end
      end
    end
  end
end
@@ -4,14 +4,16 @@ module RubyLLM
4
4
  module Contract
5
5
  module Eval
6
6
  module Evaluator
7
# Compares output to expected using Ruby equality semantics.
class Exact
  # Scores the output against the expected value with +==+.
  #
  # @param output the value to check
  # @param expected the value it should equal
  # @param input accepted but not used by this evaluator
  # @return [EvaluationResult] score 1.0 and passed on equality; otherwise
  #   score 0.0 with both values inspected in the details
  def call(output:, expected:, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    unless output == expected
      mismatch = "expected #{expected.inspect}, got #{output.inspect}"
      return EvaluationResult.new(score: 0.0, passed: false, details: mismatch)
    end

    EvaluationResult.new(score: 1.0, passed: true, details: "exact match")
  end
end
17
19
  end
@@ -4,24 +4,36 @@ module RubyLLM
4
4
  module Contract
5
5
  module Eval
6
6
  module Evaluator
7
+ # Adapts custom Ruby callables to the EvaluationResult contract.
7
8
  class ProcEvaluator
8
9
  def initialize(callable)
9
10
  @callable = callable
10
11
  end
11
12
 
12
13
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument,Metrics
13
- result = if @callable.arity == 2 || (@callable.arity.negative? && @callable.parameters.length >= 2)
14
- @callable.call(output, input)
15
- else
16
- @callable.call(output)
17
- end
14
+ result = invoke_callable(output, input)
15
+ warn_nil_result if result.nil?
16
+ build_evaluation_result(result)
17
+ end
18
18
 
19
- if result.nil?
20
- warn "[ruby_llm-contract] verify/evaluator proc returned nil. " \
21
- "This usually means a key mismatch (string vs symbol). " \
22
- "Output keys are always symbols."
23
- end
19
+ private
20
+
21
+ def invoke_callable(output, input)
22
+ callable_accepts_input? ? @callable.call(output, input) : @callable.call(output)
23
+ end
24
+
25
+ def callable_accepts_input?
26
+ arity = @callable.arity
27
+ arity == 2 || (arity.negative? && @callable.parameters.length >= 2)
28
+ end
29
+
30
+ def warn_nil_result
31
+ warn "[ruby_llm-contract] verify/evaluator proc returned nil. " \
32
+ "This usually means a key mismatch (string vs symbol). " \
33
+ "Output keys are always symbols."
34
+ end
24
35
 
36
+ def build_evaluation_result(result)
25
37
  case result
26
38
  when true
27
39
  EvaluationResult.new(score: 1.0, passed: true, details: "passed")
@@ -4,21 +4,24 @@ module RubyLLM
4
4
  module Contract
5
5
  module Eval
6
6
  module Evaluator
7
# Matches a regex against the flattened textual representation of output.
class Regex
  # @param pattern [Regexp, String] used as-is when already a Regexp,
  #   otherwise compiled with Regexp.new
  def initialize(pattern)
    @pattern = pattern.is_a?(::Regexp) ? pattern : ::Regexp.new(pattern)
  end

  # Tests the pattern against the output's text.
  #
  # @param output a Hash (its values are joined with single spaces) or any
  #   object (converted with to_s)
  # @param expected accepted but not used by this evaluator
  # @param input accepted but not used by this evaluator
  # @return [EvaluationResult] score 1.0 and passed on a match, otherwise
  #   score 0.0; details name the pattern either way
  def call(output:, expected: nil, input: nil) # rubocop:disable Lint/UnusedMethodArgument
    # Decide the verdict from the match itself and derive the message from it,
    # so pass/fail never depends on how the message is worded.
    passed = text_for(output).match?(@pattern)
    verdict = passed ? "matches" : "does not match"

    EvaluationResult.new(
      score: passed ? 1.0 : 0.0,
      passed: passed,
      details: "#{verdict} #{@pattern.inspect}"
    )
  end

  private

  # Flattens the output into the string the pattern is tested against.
  def text_for(output)
    output.is_a?(Hash) ? output.values.join(" ") : output.to_s
  end
end
24
27
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Chooses a built-in evaluator from the type of the expected value and
      # runs it against the output.
      class ExpectationEvaluator
        # @param output the value to score
        # @param expected [Hash, Regexp, Object] selects the evaluator and is
        #   forwarded to it
        # @param input forwarded to the evaluator unchanged
        # @return whatever the selected evaluator returns
        def call(output:, expected:, input:)
          evaluator = evaluator_for(expected)
          evaluator.call(output: output, expected: expected, input: input)
        end

        private

        # Hash -> JsonIncludes, Regexp -> Regex built from that pattern,
        # anything else -> Exact.
        def evaluator_for(expected)
          return Evaluator::JsonIncludes.new if expected.is_a?(Hash)
          return Evaluator::Regex.new(expected) if expected.is_a?(::Regexp)

          Evaluator::Exact.new
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+
5
module RubyLLM
  module Contract
    module Eval
      # Entry point for comparing a candidate eval report against a baseline.
      #
      # Both reports are serialized into case lists once; a BaselineDiff, a
      # PromptDiffComparator and a PromptDiffPresenter are built from them, and
      # their query methods are exposed here by delegation. The instance is
      # frozen at the end of construction.
      class PromptDiff
        extend Forwardable

        attr_reader :candidate_report, :baseline_report

        def_delegators :@diff, *%i[improvements regressions score_delta removed_passing_cases]
        def_delegators :@comparator, *%i[
          safe_to_switch? case_names_match? cases_comparable? mismatched_cases
          input_mismatches expected_mismatches score_regressions candidate_score baseline_score
          baseline_empty? candidate_empty?
        ]
        def_delegators :@presenter, :print_summary

        # @param candidate the report for the prompt being considered
        # @param baseline the report it is compared against
        def initialize(candidate:, baseline:)
          @candidate_report = candidate
          @baseline_report = baseline

          to_cases = PromptDiffSerializer.new
          current = to_cases.call(candidate)
          reference = to_cases.call(baseline)

          @diff = BaselineDiff.new(baseline_cases: reference, current_cases: current)
          @comparator = PromptDiffComparator.new(candidate_cases: current, baseline_cases: reference, diff: @diff)
          # The presenter receives this object itself, so it is built last.
          @presenter = PromptDiffPresenter.new(prompt_diff: self, comparator: @comparator)
          freeze
        end
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Encapsulates the safety and mismatch rules for prompt A/B comparison.
      #
      # Cases are hashes read through the keys :name, :input, :expected and
      # :score. Baseline cases are indexed by name so each candidate case can be
      # paired with its counterpart.
      class PromptDiffComparator
        # @param candidate_cases [Array<Hash>] cases from the candidate report
        # @param baseline_cases [Array<Hash>] cases from the baseline report
        # @param diff object answering regressed?, current_score and baseline_score
        def initialize(candidate_cases:, baseline_cases:, diff:)
          @candidate_cases = candidate_cases
          @baseline_cases = baseline_cases
          @diff = diff
          @baseline_case_index = baseline_cases.each_with_object({}) do |case_data, index|
            index[case_data[:name]] = case_data
          end
        end

        # True only when both sides have cases, the case signatures are
        # identical, no paired case scored lower, and the diff reports no
        # regression.
        def safe_to_switch?
          !empty_comparison? && cases_comparable? && score_regressions.none? && !@diff.regressed?
        end

        # Whether both sides contain the same sorted list of case names.
        def case_names_match?
          baseline_names = case_names(@baseline_cases)
          baseline_names == case_names(@candidate_cases)
        end

        # Whether both sides agree on name, input and expected for every case.
        def cases_comparable?
          baseline_signatures = case_signatures(@baseline_cases)
          baseline_signatures == case_signatures(@candidate_cases)
        end

        # @return [Hash] case names present on only one side, under
        #   :only_in_baseline and :only_in_candidate
        def mismatched_cases
          in_baseline = case_names(@baseline_cases)
          in_candidate = case_names(@candidate_cases)

          { only_in_baseline: in_baseline - in_candidate, only_in_candidate: in_candidate - in_baseline }
        end

        # Paired cases whose :input differs between the two sides.
        def input_mismatches
          attribute_mismatches(:input, :baseline_input, :candidate_input)
        end

        # Paired cases whose :expected differs between the two sides.
        def expected_mismatches
          attribute_mismatches(:expected, :baseline_expected, :candidate_expected)
        end

        # Paired cases whose candidate score is lower than the baseline score,
        # with the delta rounded to four decimals.
        def score_regressions
          paired_cases.filter_map do |before, after|
            previous = before[:score]
            current = after[:score]
            next unless current < previous

            {
              case: after[:name],
              baseline_score: previous,
              candidate_score: current,
              delta: (current - previous).round(4)
            }
          end
        end

        def candidate_score
          @diff.current_score
        end

        def baseline_score
          @diff.baseline_score
        end

        def candidate_empty?
          @candidate_cases.empty?
        end

        def baseline_empty?
          @baseline_cases.empty?
        end

        # True when either side has no cases at all.
        def empty_comparison?
          baseline_empty? || candidate_empty?
        end

        private

        # Sorted names of the given cases.
        def case_names(cases)
          names = cases.map { |case_data| case_data[:name] }
          names.sort
        end

        # [name, input, expected] triples ordered by name.
        def case_signatures(cases)
          triples = cases.map { |case_data| case_data.values_at(:name, :input, :expected) }
          triples.sort_by(&:first)
        end

        # [baseline_case, candidate_case] pairs, in candidate order, for every
        # candidate case whose name also exists in the baseline.
        def paired_cases
          @candidate_cases.filter_map do |candidate_case|
            counterpart = @baseline_case_index[candidate_case[:name]]
            [counterpart, candidate_case] if counterpart
          end
        end

        # Reports each paired case whose value for +attribute+ differs, storing
        # the two values under the given result keys.
        def attribute_mismatches(attribute, baseline_key, candidate_key)
          paired_cases.filter_map do |before, after|
            old_value = before[attribute]
            new_value = after[attribute]
            next if old_value == new_value

            { case: after[:name], baseline_key => old_value, candidate_key => new_value }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Renders a prompt diff as a readable console summary.
      class PromptDiffPresenter
        VARIANT_LABEL_WIDTH = 12
        TABLE_WIDTH = 26
        CASE_SET_MISMATCH_TITLE = " Case set mismatch (safe_to_switch? = NO):"
        INPUT_MISMATCH_TITLE = " Input mismatch (safe_to_switch? = NO):"
        EXPECTED_MISMATCH_TITLE = " Expected mismatch (safe_to_switch? = NO):"
        REGRESSIONS_TITLE = " Regressions (PASS -> FAIL):"
        SCORE_DROPS_TITLE = " Score drops:"
        IMPROVEMENTS_TITLE = " Improvements:"
        REMOVED_PASSING_TITLE = " Removed (were passing in baseline):"

        # @param prompt_diff source of regressions, improvements,
        #   removed_passing_cases and score_delta
        # @param comparator source of scores, mismatches, score regressions and
        #   the safe-to-switch verdict
        def initialize(prompt_diff:, comparator:)
          @prompt_diff = prompt_diff
          @comparator = comparator
        end

        # Writes the full summary to +io+: header, optional empty-side warning,
        # every non-empty section in a fixed order, then the verdict line.
        #
        # @param io [#puts] destination, $stdout by default
        def print_summary(io = $stdout)
          print_header(io)
          print_warning(io, "one side has no evaluated cases (all skipped?)") if @comparator.empty_comparison?
          print_case_set_mismatch(io)
          print_comparability_sections(io)
          print_outcome_sections(io)
          verdict = @comparator.safe_to_switch? ? "YES" : "NO"
          io.puts " Safe to switch: #{verdict}"
        end

        private

        # Title, score table for both variants, and the overall score delta.
        def print_header(io)
          emit_lines(
            io,
            [
              "Prompt A/B comparison",
              nil,
              format(" %-#{VARIANT_LABEL_WIDTH}s Score", "Variant"),
              " #{"-" * TABLE_WIDTH}",
              score_row("Candidate", @comparator.candidate_score),
              score_row("Baseline", @comparator.baseline_score),
              nil,
              " Score delta: #{format_delta(@prompt_diff.score_delta)}",
              nil
            ]
          )
        end

        # One table row: left-aligned label and a two-decimal score.
        def score_row(label, score)
          format(" %-#{VARIANT_LABEL_WIDTH}s %.2f", label, score)
        end

        def print_warning(io, message)
          emit_lines(io, [" WARNING: #{message}", nil])
        end

        # Lists names that exist on only one side; silent when the sets match.
        def print_case_set_mismatch(io)
          return if @comparator.case_names_match?

          one_sided = @comparator.mismatched_cases
          baseline_only = one_sided[:only_in_baseline].map { |name| "only in baseline: #{name}" }
          candidate_only = one_sided[:only_in_candidate].map { |name| "only in candidate: #{name}" }
          emit_section(io, CASE_SET_MISMATCH_TITLE, baseline_only + candidate_only)
        end

        # Sections for cases whose input or expected value differs between sides.
        def print_comparability_sections(io)
          print_formatted_section(io, INPUT_MISMATCH_TITLE, @comparator.input_mismatches) do |entry|
            "#{entry[:case]}: inputs differ between candidate and baseline"
          end
          print_formatted_section(io, EXPECTED_MISMATCH_TITLE, @comparator.expected_mismatches) do |entry|
            "#{entry[:case]}: expected values differ between candidate and baseline"
          end
        end

        # Sections for regressions, score drops, improvements and removed cases.
        def print_outcome_sections(io)
          print_formatted_section(io, REGRESSIONS_TITLE, @prompt_diff.regressions) do |entry|
            "#{entry[:case]}: was PASS, now FAIL -- #{entry[:detail]}"
          end
          print_formatted_section(io, SCORE_DROPS_TITLE, @comparator.score_regressions) do |entry|
            "#{entry[:case]}: #{entry[:baseline_score]} -> #{entry[:candidate_score]} (#{entry[:delta]})"
          end
          print_formatted_section(io, IMPROVEMENTS_TITLE, @prompt_diff.improvements) do |entry|
            "#{entry[:case]}: was FAIL, now PASS"
          end
          print_formatted_section(io, REMOVED_PASSING_TITLE, @prompt_diff.removed_passing_cases, &:to_s)
        end

        # Prints a titled section with one formatted line per entry; prints
        # nothing when the collection is empty.
        def print_formatted_section(io, title, collection, &formatter)
          emit_section(io, title, collection.map(&formatter)) unless collection.empty?
        end

        # Title, each line prefixed with a space, then a blank line.
        def emit_section(io, title, lines)
          prefixed = lines.map { |line| " #{line}" }
          emit_lines(io, [title, *prefixed, nil])
        end

        # Writes each entry with puts; nil entries become blank lines.
        def emit_lines(io, lines)
          lines.each do |line|
            if line.nil?
              io.puts
            else
              io.puts(line)
            end
          end
        end

        # Prefixes non-negative deltas with an explicit plus sign.
        def format_delta(delta)
          return delta.to_s unless delta >= 0

          "+#{delta}"
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module RubyLLM
  module Contract
    module Eval
      # Normalizes report results into comparable prompt-diff case hashes.
      class PromptDiffSerializer
        # Converts every result whose step_status is not :skipped into a plain
        # hash, keeping the report's order.
        #
        # @param report responds to results; each result responds to name,
        #   input, expected, passed?, score, details and step_status
        # @return [Array<Hash>] hashes with :name, :input, :expected, :passed,
        #   :score and :details
        def call(report)
          report.results.filter_map do |result|
            next if result.step_status == :skipped

            serialize(result)
          end
        end

        private

        # Copies the fields used for comparison out of a single result.
        def serialize(result)
          {
            name: result.name,
            input: result.input,
            expected: result.expected,
            passed: result.passed?,
            score: result.score,
            details: result.details
          }
        end
      end
    end
  end
end
+ end