ruby_llm-contract 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/.rubycritic.yml +8 -0
  3. data/.simplecov +22 -0
  4. data/CHANGELOG.md +19 -0
  5. data/Gemfile +2 -0
  6. data/Gemfile.lock +104 -2
  7. data/README.md +42 -2
  8. data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
  9. data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
  10. data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
  11. data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
  12. data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
  13. data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
  14. data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
  15. data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
  16. data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
  17. data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
  18. data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
  19. data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
  20. data/lib/ruby_llm/contract/contract/validator.rb +9 -0
  21. data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
  22. data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
  23. data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
  24. data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
  25. data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
  26. data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
  27. data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
  28. data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
  29. data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
  30. data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
  31. data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
  32. data/lib/ruby_llm/contract/eval/report.rb +19 -191
  33. data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
  34. data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
  35. data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
  36. data/lib/ruby_llm/contract/eval/runner.rb +30 -207
  37. data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
  38. data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
  39. data/lib/ruby_llm/contract/eval.rb +13 -0
  40. data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
  41. data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
  42. data/lib/ruby_llm/contract/rspec.rb +5 -0
  43. data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
  44. data/lib/ruby_llm/contract/step/base.rb +93 -38
  45. data/lib/ruby_llm/contract/step/dsl.rb +10 -0
  46. data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
  47. data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
  48. data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
  49. data/lib/ruby_llm/contract/step/result.rb +3 -2
  50. data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
  51. data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
  52. data/lib/ruby_llm/contract/step/runner.rb +46 -85
  53. data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
  54. data/lib/ruby_llm/contract/step.rb +5 -0
  55. data/lib/ruby_llm/contract/version.rb +1 -1
  56. metadata +28 -1
@@ -1,209 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "json"
4
- require "fileutils"
3
+ require "forwardable"
5
4
 
6
5
  module RubyLLM
7
6
  module Contract
8
7
  module Eval
9
8
  class Report
10
- attr_reader :dataset_name, :results
9
+ extend Forwardable
10
+
11
+ attr_reader :dataset_name, :results, :step_name
12
+
13
+ GENERIC_DETAILS = ["passed", "not passed"].freeze
14
+ HISTORY_DIR = ".eval_history"
15
+ BASELINE_DIR = ".eval_baselines"
16
+
17
+ def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :total_cost, :avg_latency_ms,
18
+ :passed?
19
+ def_delegators :@presenter, :summary, :to_s, :print_summary
20
+ def_delegators :@storage, :save_history!, :eval_history, :save_baseline!, :compare_with_baseline,
21
+ :baseline_exists?
11
22
 
12
23
  def initialize(dataset_name:, results:, step_name: nil)
13
24
  @dataset_name = dataset_name
14
25
  @step_name = step_name
15
26
  @results = results.dup.freeze
27
+ @stats = ReportStats.new(results: @results)
28
+ @presenter = ReportPresenter.new(report: self, stats: @stats)
29
+ @storage = ReportStorage.new(report: self, stats: @stats)
16
30
  freeze
17
31
  end
18
32
 
19
- def score
20
- evaluated = evaluated_results
21
- return 0.0 if evaluated.empty?
22
-
23
- evaluated.sum(&:score) / evaluated.length
24
- end
25
-
26
- def passed
27
- evaluated_results.count(&:passed?)
28
- end
29
-
30
- def failed
31
- evaluated_results.count(&:failed?)
32
- end
33
-
34
- def skipped
35
- results.count { |r| r.step_status == :skipped }
36
- end
37
-
38
- def failures
39
- evaluated_results.select(&:failed?)
40
- end
41
-
42
- def pass_rate
43
- "#{passed}/#{evaluated_results.length}"
44
- end
45
-
46
- def total_cost
47
- results.sum { |r| r.cost || 0.0 }
48
- end
49
-
50
- def avg_latency_ms
51
- latencies = results.filter_map(&:duration_ms)
52
- return nil if latencies.empty?
53
-
54
- latencies.sum.to_f / latencies.length
55
- end
56
-
57
- def passed?
58
- evaluated = evaluated_results
59
- return false if evaluated.empty?
60
-
61
- evaluated.all?(&:passed?)
62
- end
63
-
64
- def each(&)
65
- results.each(&)
66
- end
67
-
68
- def summary
69
- parts = ["#{dataset_name}: #{pass_rate} checks passed"]
70
- parts << "#{skipped} skipped" if skipped.positive?
71
- parts << format_cost(total_cost) if total_cost.positive?
72
- parts.join(", ")
73
- end
74
-
75
- GENERIC_DETAILS = ["passed", "not passed"].freeze
76
-
77
- def to_s
78
- lines = [summary]
79
- failures.each do |result|
80
- lines << format_failure(result)
81
- end
82
- lines.join("\n")
83
- end
84
-
85
- def save_history!(path: nil, model: nil)
86
- file = path || default_history_path(model: model)
87
- run_data = {
88
- date: Time.now.strftime("%Y-%m-%d"),
89
- score: score,
90
- total_cost: total_cost,
91
- pass_rate: pass_rate,
92
- cases_count: evaluated_results.length
93
- }
94
- EvalHistory.append(file, run_data)
95
- file
96
- end
97
-
98
- def eval_history(path: nil, model: nil)
99
- file = path || default_history_path(model: model)
100
- EvalHistory.load(file)
101
- end
102
-
103
- def save_baseline!(path: nil, model: nil)
104
- file = path || default_baseline_path(model: model)
105
- FileUtils.mkdir_p(File.dirname(file))
106
- File.write(file, JSON.pretty_generate(serialize_for_baseline))
107
- file
108
- end
109
-
110
- def compare_with_baseline(path: nil, model: nil)
111
- file = path || default_baseline_path(model: model)
112
- raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
113
-
114
- baseline_data = JSON.parse(File.read(file), symbolize_names: true)
115
- validate_baseline!(baseline_data)
116
- BaselineDiff.new(
117
- baseline_cases: baseline_data[:cases],
118
- current_cases: results.map { |r| serialize_case(r) }
119
- )
120
- end
121
-
122
- def baseline_exists?(path: nil, model: nil)
123
- File.exist?(path || default_baseline_path(model: model))
124
- end
125
-
126
- def print_summary(io = $stdout)
127
- io.puts summary
128
- io.puts
129
- results.each do |result|
130
- icon = result.label
131
- cost_str = result.cost ? " #{format_cost(result.cost)}" : ""
132
- latency_str = result.duration_ms ? " #{result.duration_ms}ms" : ""
133
- io.puts " #{icon} #{result.name}#{cost_str}#{latency_str}"
134
- io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
135
- end
136
- end
137
-
138
- private
139
-
140
- def format_failure(result)
141
- line = " FAIL #{result.name}"
142
- line += ": #{result.details}" if useful_details?(result.details)
143
- line
144
- end
145
-
146
- def useful_details?(details)
147
- details && !GENERIC_DETAILS.include?(details)
148
- end
149
-
150
- def evaluated_results
151
- results.reject { |r| r.step_status == :skipped }
152
- end
153
-
154
- def default_history_path(model: nil)
155
- parts = [".eval_history"]
156
- parts << sanitize_name(@step_name) if @step_name
157
- name = sanitize_name(dataset_name)
158
- name = "#{name}_#{sanitize_name(model)}" if model
159
- parts << "#{name}.jsonl"
160
- File.join(*parts)
161
- end
162
-
163
- def default_baseline_path(model: nil)
164
- parts = [".eval_baselines"]
165
- parts << sanitize_name(@step_name) if @step_name
166
- name = sanitize_name(dataset_name)
167
- name = "#{name}_#{sanitize_name(model)}" if model
168
- parts << "#{name}.json"
169
- File.join(*parts)
170
- end
171
-
172
- def validate_baseline!(data)
173
- if data[:dataset_name] && data[:dataset_name] != dataset_name
174
- raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{dataset_name}'"
175
- end
176
- if data[:step_name] && @step_name && data[:step_name] != @step_name
177
- raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@step_name}'"
178
- end
179
- end
180
-
181
- def sanitize_name(name)
182
- name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
183
- end
184
-
185
- def serialize_for_baseline
186
- {
187
- dataset_name: dataset_name,
188
- step_name: @step_name,
189
- score: score,
190
- total_cost: total_cost,
191
- cases: evaluated_results.map { |r| serialize_case(r) }
192
- }
193
- end
194
-
195
- def serialize_case(result)
196
- {
197
- name: result.name,
198
- passed: result.passed?,
199
- score: result.score,
200
- details: result.details,
201
- cost: result.cost
202
- }
203
- end
204
-
205
- def format_cost(cost)
206
- "$#{format("%.6f", cost)}"
33
+ def each(&block)
34
+ results.each(&block)
207
35
  end
208
36
  end
209
37
  end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ # Formats eval reports for console and string output.
7
+ class ReportPresenter
8
+ def initialize(report:, stats:)
9
+ @report = report
10
+ @stats = stats
11
+ end
12
+
13
+ def summary
14
+ summary_parts.join(", ")
15
+ end
16
+
17
+ def to_s
18
+ ([summary] + @stats.failures.map { |result| format_failure(result) }).join("\n")
19
+ end
20
+
21
+ def print_summary(io = $stdout)
22
+ io.puts summary
23
+ io.puts
24
+ @report.results.each { |result| print_result(io, result) }
25
+ end
26
+
27
+ private
28
+
29
+ def summary_parts
30
+ parts = ["#{@report.dataset_name}: #{@stats.pass_rate} checks passed"]
31
+ parts << "#{@stats.skipped} skipped" if @stats.skipped.positive?
32
+ parts << format_cost(@stats.total_cost) if @stats.total_cost.positive?
33
+ parts
34
+ end
35
+
36
+ def format_failure(result)
37
+ line = " FAIL #{result.name}"
38
+ line += ": #{result.details}" if useful_details?(result.details)
39
+ line
40
+ end
41
+
42
+ def print_result(io, result)
43
+ io.puts " #{result.label} #{result.name}#{result_cost(result)}#{result_latency(result)}"
44
+ io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
45
+ end
46
+
47
+ def useful_details?(details)
48
+ details && !Report::GENERIC_DETAILS.include?(details)
49
+ end
50
+
51
+ def result_cost(result)
52
+ result.cost ? " #{format_cost(result.cost)}" : ""
53
+ end
54
+
55
+ def result_latency(result)
56
+ result.duration_ms ? " #{result.duration_ms}ms" : ""
57
+ end
58
+
59
+ def format_cost(cost)
60
+ "$#{format("%.6f", cost)}"
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyLLM
4
+ module Contract
5
+ module Eval
6
+ # Computes aggregate metrics for an eval report.
7
+ class ReportStats
8
+ def initialize(results:)
9
+ @results = results
10
+ end
11
+
12
+ def score
13
+ return 0.0 if evaluated_results.empty?
14
+
15
+ evaluated_results.sum(&:score) / evaluated_results.length
16
+ end
17
+
18
+ def passed
19
+ evaluated_results.count(&:passed?)
20
+ end
21
+
22
+ def failed
23
+ evaluated_results.count(&:failed?)
24
+ end
25
+
26
+ def skipped
27
+ @results.count { |result| result.step_status == :skipped }
28
+ end
29
+
30
+ def failures
31
+ evaluated_results.select(&:failed?)
32
+ end
33
+
34
+ def pass_rate
35
+ "#{passed}/#{evaluated_results.length}"
36
+ end
37
+
38
+ def total_cost
39
+ @results.sum { |result| result.cost || 0.0 }
40
+ end
41
+
42
+ def avg_latency_ms
43
+ latencies = @results.filter_map(&:duration_ms)
44
+ return nil if latencies.empty?
45
+
46
+ latencies.sum.to_f / latencies.length
47
+ end
48
+
49
+ def passed?
50
+ return false if evaluated_results.empty?
51
+
52
+ evaluated_results.all?(&:passed?)
53
+ end
54
+
55
+ def evaluated_results
56
+ @evaluated_results ||= @results.reject { |result| result.step_status == :skipped }
57
+ end
58
+
59
+ def evaluated_results_count
60
+ evaluated_results.length
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+
6
+ module RubyLLM
7
+ module Contract
8
+ module Eval
9
+ # Persists eval reports as history entries and regression baselines.
10
+ class ReportStorage
11
+ def initialize(report:, stats:)
12
+ @report = report
13
+ @stats = stats
14
+ end
15
+
16
+ def save_history!(path: nil, model: nil)
17
+ file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model)
18
+ EvalHistory.append(file, history_entry)
19
+ file
20
+ end
21
+
22
+ def eval_history(path: nil, model: nil)
23
+ EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model))
24
+ end
25
+
26
+ def save_baseline!(path: nil, model: nil)
27
+ file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
28
+ FileUtils.mkdir_p(File.dirname(file))
29
+ File.write(file, JSON.pretty_generate(serialize_for_baseline))
30
+ file
31
+ end
32
+
33
+ def compare_with_baseline(path: nil, model: nil)
34
+ file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
35
+ raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
36
+
37
+ baseline_data = JSON.parse(File.read(file), symbolize_names: true)
38
+ validate_baseline!(baseline_data)
39
+
40
+ BaselineDiff.new(
41
+ baseline_cases: baseline_data[:cases],
42
+ current_cases: @report.results.map { |result| serialize_case(result) }
43
+ )
44
+ end
45
+
46
+ def baseline_exists?(path: nil, model: nil)
47
+ File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model))
48
+ end
49
+
50
+ private
51
+
52
+ def history_entry
53
+ {
54
+ date: Time.now.strftime("%Y-%m-%d"),
55
+ score: @stats.score,
56
+ total_cost: @stats.total_cost,
57
+ pass_rate: @stats.pass_rate,
58
+ cases_count: @stats.evaluated_results_count
59
+ }
60
+ end
61
+
62
+ def serialize_for_baseline
63
+ {
64
+ dataset_name: @report.dataset_name,
65
+ step_name: @report.step_name,
66
+ score: @stats.score,
67
+ total_cost: @stats.total_cost,
68
+ cases: @stats.evaluated_results.map { |result| serialize_case(result) }
69
+ }
70
+ end
71
+
72
+ def serialize_case(result)
73
+ {
74
+ name: result.name,
75
+ passed: result.passed?,
76
+ score: result.score,
77
+ details: result.details,
78
+ cost: result.cost
79
+ }
80
+ end
81
+
82
+ def storage_path(root_dir, extension, model:)
83
+ parts = [root_dir]
84
+ parts << sanitize_name(@report.step_name) if @report.step_name
85
+
86
+ dataset_name = sanitize_name(@report.dataset_name)
87
+ dataset_name = "#{dataset_name}_#{sanitize_name(model)}" if model
88
+
89
+ File.join(*parts, "#{dataset_name}.#{extension}")
90
+ end
91
+
92
+ def validate_baseline!(data)
93
+ if data[:dataset_name] && data[:dataset_name] != @report.dataset_name
94
+ raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{@report.dataset_name}'"
95
+ end
96
+ if data[:step_name] && @report.step_name && data[:step_name] != @report.step_name
97
+ raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@report.step_name}'"
98
+ end
99
+ end
100
+
101
+ def sanitize_name(name)
102
+ name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end