ruby_llm-contract 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubycritic.yml +8 -0
- data/.simplecov +22 -0
- data/CHANGELOG.md +19 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +104 -2
- data/README.md +42 -2
- data/lib/ruby_llm/contract/concerns/context_helpers.rb +11 -10
- data/lib/ruby_llm/contract/concerns/deep_freeze.rb +13 -7
- data/lib/ruby_llm/contract/concerns/deep_symbolize.rb +15 -5
- data/lib/ruby_llm/contract/concerns/eval_host.rb +51 -7
- data/lib/ruby_llm/contract/contract/schema_validator/bound_rule.rb +85 -0
- data/lib/ruby_llm/contract/contract/schema_validator/enum_rule.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/node.rb +70 -0
- data/lib/ruby_llm/contract/contract/schema_validator/object_rules.rb +66 -0
- data/lib/ruby_llm/contract/contract/schema_validator/scalar_rules.rb +22 -0
- data/lib/ruby_llm/contract/contract/schema_validator/schema_extractor.rb +23 -0
- data/lib/ruby_llm/contract/contract/schema_validator/type_rule.rb +30 -0
- data/lib/ruby_llm/contract/contract/schema_validator.rb +41 -266
- data/lib/ruby_llm/contract/contract/validator.rb +9 -0
- data/lib/ruby_llm/contract/eval/case_executor.rb +52 -0
- data/lib/ruby_llm/contract/eval/case_result_builder.rb +35 -0
- data/lib/ruby_llm/contract/eval/case_scorer.rb +66 -0
- data/lib/ruby_llm/contract/eval/evaluator/exact.rb +8 -6
- data/lib/ruby_llm/contract/eval/evaluator/proc_evaluator.rb +22 -10
- data/lib/ruby_llm/contract/eval/evaluator/regex.rb +11 -8
- data/lib/ruby_llm/contract/eval/expectation_evaluator.rb +26 -0
- data/lib/ruby_llm/contract/eval/prompt_diff.rb +39 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_comparator.rb +116 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_presenter.rb +99 -0
- data/lib/ruby_llm/contract/eval/prompt_diff_serializer.rb +23 -0
- data/lib/ruby_llm/contract/eval/report.rb +19 -191
- data/lib/ruby_llm/contract/eval/report_presenter.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_stats.rb +65 -0
- data/lib/ruby_llm/contract/eval/report_storage.rb +107 -0
- data/lib/ruby_llm/contract/eval/runner.rb +30 -207
- data/lib/ruby_llm/contract/eval/step_expectation_applier.rb +67 -0
- data/lib/ruby_llm/contract/eval/step_result_normalizer.rb +39 -0
- data/lib/ruby_llm/contract/eval.rb +13 -0
- data/lib/ruby_llm/contract/pipeline/base.rb +10 -1
- data/lib/ruby_llm/contract/rspec/pass_eval.rb +84 -3
- data/lib/ruby_llm/contract/rspec.rb +5 -0
- data/lib/ruby_llm/contract/step/adapter_caller.rb +23 -0
- data/lib/ruby_llm/contract/step/base.rb +93 -38
- data/lib/ruby_llm/contract/step/dsl.rb +10 -0
- data/lib/ruby_llm/contract/step/input_validator.rb +34 -0
- data/lib/ruby_llm/contract/step/limit_checker.rb +11 -11
- data/lib/ruby_llm/contract/step/prompt_compiler.rb +33 -0
- data/lib/ruby_llm/contract/step/result.rb +3 -2
- data/lib/ruby_llm/contract/step/result_builder.rb +60 -0
- data/lib/ruby_llm/contract/step/retry_executor.rb +1 -0
- data/lib/ruby_llm/contract/step/runner.rb +46 -85
- data/lib/ruby_llm/contract/step/runner_config.rb +37 -0
- data/lib/ruby_llm/contract/step.rb +5 -0
- data/lib/ruby_llm/contract/version.rb +1 -1
- metadata +28 -1
|
@@ -1,209 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "json"
|
|
4
|
-
require "fileutils"
|
|
3
|
+
require "forwardable"
|
|
5
4
|
|
|
6
5
|
module RubyLLM
|
|
7
6
|
module Contract
|
|
8
7
|
module Eval
|
|
9
8
|
class Report
|
|
10
|
-
attr_reader :dataset_name, :results
|
9
|
+
extend Forwardable
|
|
10
|
+
|
|
11
|
+
attr_reader :dataset_name, :results, :step_name
|
|
12
|
+
|
|
13
|
+
GENERIC_DETAILS = ["passed", "not passed"].freeze
|
|
14
|
+
HISTORY_DIR = ".eval_history"
|
|
15
|
+
BASELINE_DIR = ".eval_baselines"
|
|
16
|
+
|
|
17
|
+
def_delegators :@stats, :score, :passed, :failed, :skipped, :failures, :pass_rate, :total_cost, :avg_latency_ms,
|
|
18
|
+
:passed?
|
|
19
|
+
def_delegators :@presenter, :summary, :to_s, :print_summary
|
|
20
|
+
def_delegators :@storage, :save_history!, :eval_history, :save_baseline!, :compare_with_baseline,
|
|
21
|
+
:baseline_exists?
|
|
11
22
|
|
|
12
23
|
def initialize(dataset_name:, results:, step_name: nil)
|
|
13
24
|
@dataset_name = dataset_name
|
|
14
25
|
@step_name = step_name
|
|
15
26
|
@results = results.dup.freeze
|
|
27
|
+
@stats = ReportStats.new(results: @results)
|
|
28
|
+
@presenter = ReportPresenter.new(report: self, stats: @stats)
|
|
29
|
+
@storage = ReportStorage.new(report: self, stats: @stats)
|
|
16
30
|
freeze
|
|
17
31
|
end
|
|
18
32
|
|
|
19
|
-
def score
|
|
20
|
-
evaluated = evaluated_results
|
21
|
-
return 0.0 if evaluated.empty?
|
|
22
|
-
|
|
23
|
-
evaluated.sum(&:score) / evaluated.length
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def passed
|
|
27
|
-
evaluated_results.count(&:passed?)
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
def failed
|
|
31
|
-
evaluated_results.count(&:failed?)
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
def skipped
|
|
35
|
-
results.count { |r| r.step_status == :skipped }
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
def failures
|
|
39
|
-
evaluated_results.select(&:failed?)
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
def pass_rate
|
|
43
|
-
"#{passed}/#{evaluated_results.length}"
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def total_cost
|
|
47
|
-
results.sum { |r| r.cost || 0.0 }
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
def avg_latency_ms
|
|
51
|
-
latencies = results.filter_map(&:duration_ms)
|
|
52
|
-
return nil if latencies.empty?
|
|
53
|
-
|
|
54
|
-
latencies.sum.to_f / latencies.length
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
def passed?
|
|
58
|
-
evaluated = evaluated_results
|
|
59
|
-
return false if evaluated.empty?
|
|
60
|
-
|
|
61
|
-
evaluated.all?(&:passed?)
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def each(&)
|
|
65
|
-
results.each(&)
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def summary
|
|
69
|
-
parts = ["#{dataset_name}: #{pass_rate} checks passed"]
|
|
70
|
-
parts << "#{skipped} skipped" if skipped.positive?
|
|
71
|
-
parts << format_cost(total_cost) if total_cost.positive?
|
|
72
|
-
parts.join(", ")
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
GENERIC_DETAILS = ["passed", "not passed"].freeze
|
|
76
|
-
|
|
77
|
-
def to_s
|
|
78
|
-
lines = [summary]
|
|
79
|
-
failures.each do |result|
|
|
80
|
-
lines << format_failure(result)
|
|
81
|
-
end
|
|
82
|
-
lines.join("\n")
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def save_history!(path: nil, model: nil)
|
|
86
|
-
file = path || default_history_path(model: model)
|
|
87
|
-
run_data = {
|
|
88
|
-
date: Time.now.strftime("%Y-%m-%d"),
|
|
89
|
-
score: score,
|
|
90
|
-
total_cost: total_cost,
|
|
91
|
-
pass_rate: pass_rate,
|
|
92
|
-
cases_count: evaluated_results.length
|
|
93
|
-
}
|
|
94
|
-
EvalHistory.append(file, run_data)
|
|
95
|
-
file
|
|
96
|
-
end
|
|
97
|
-
|
|
98
|
-
def eval_history(path: nil, model: nil)
|
|
99
|
-
file = path || default_history_path(model: model)
|
|
100
|
-
EvalHistory.load(file)
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
def save_baseline!(path: nil, model: nil)
|
|
104
|
-
file = path || default_baseline_path(model: model)
|
|
105
|
-
FileUtils.mkdir_p(File.dirname(file))
|
|
106
|
-
File.write(file, JSON.pretty_generate(serialize_for_baseline))
|
|
107
|
-
file
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
def compare_with_baseline(path: nil, model: nil)
|
|
111
|
-
file = path || default_baseline_path(model: model)
|
|
112
|
-
raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
|
|
113
|
-
|
|
114
|
-
baseline_data = JSON.parse(File.read(file), symbolize_names: true)
|
|
115
|
-
validate_baseline!(baseline_data)
|
|
116
|
-
BaselineDiff.new(
|
|
117
|
-
baseline_cases: baseline_data[:cases],
|
|
118
|
-
current_cases: results.map { |r| serialize_case(r) }
|
|
119
|
-
)
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
def baseline_exists?(path: nil, model: nil)
|
|
123
|
-
File.exist?(path || default_baseline_path(model: model))
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
def print_summary(io = $stdout)
|
|
127
|
-
io.puts summary
|
|
128
|
-
io.puts
|
|
129
|
-
results.each do |result|
|
|
130
|
-
icon = result.label
|
|
131
|
-
cost_str = result.cost ? " #{format_cost(result.cost)}" : ""
|
|
132
|
-
latency_str = result.duration_ms ? " #{result.duration_ms}ms" : ""
|
|
133
|
-
io.puts " #{icon} #{result.name}#{cost_str}#{latency_str}"
|
|
134
|
-
io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
|
|
135
|
-
end
|
|
136
|
-
end
|
|
137
|
-
|
|
138
|
-
private
|
|
139
|
-
|
|
140
|
-
def format_failure(result)
|
|
141
|
-
line = " FAIL #{result.name}"
|
|
142
|
-
line += ": #{result.details}" if useful_details?(result.details)
|
|
143
|
-
line
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
def useful_details?(details)
|
|
147
|
-
details && !GENERIC_DETAILS.include?(details)
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
def evaluated_results
|
|
151
|
-
results.reject { |r| r.step_status == :skipped }
|
|
152
|
-
end
|
|
153
|
-
|
|
154
|
-
def default_history_path(model: nil)
|
|
155
|
-
parts = [".eval_history"]
|
|
156
|
-
parts << sanitize_name(@step_name) if @step_name
|
|
157
|
-
name = sanitize_name(dataset_name)
|
|
158
|
-
name = "#{name}_#{sanitize_name(model)}" if model
|
|
159
|
-
parts << "#{name}.jsonl"
|
|
160
|
-
File.join(*parts)
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
def default_baseline_path(model: nil)
|
|
164
|
-
parts = [".eval_baselines"]
|
|
165
|
-
parts << sanitize_name(@step_name) if @step_name
|
|
166
|
-
name = sanitize_name(dataset_name)
|
|
167
|
-
name = "#{name}_#{sanitize_name(model)}" if model
|
|
168
|
-
parts << "#{name}.json"
|
|
169
|
-
File.join(*parts)
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
def validate_baseline!(data)
|
|
173
|
-
if data[:dataset_name] && data[:dataset_name] != dataset_name
|
|
174
|
-
raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{dataset_name}'"
|
|
175
|
-
end
|
|
176
|
-
if data[:step_name] && @step_name && data[:step_name] != @step_name
|
|
177
|
-
raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@step_name}'"
|
|
178
|
-
end
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
def sanitize_name(name)
|
|
182
|
-
name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
|
|
183
|
-
end
|
|
184
|
-
|
|
185
|
-
def serialize_for_baseline
|
|
186
|
-
{
|
|
187
|
-
dataset_name: dataset_name,
|
|
188
|
-
step_name: @step_name,
|
|
189
|
-
score: score,
|
|
190
|
-
total_cost: total_cost,
|
|
191
|
-
cases: evaluated_results.map { |r| serialize_case(r) }
|
|
192
|
-
}
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
def serialize_case(result)
|
|
196
|
-
{
|
|
197
|
-
name: result.name,
|
|
198
|
-
passed: result.passed?,
|
|
199
|
-
score: result.score,
|
|
200
|
-
details: result.details,
|
|
201
|
-
cost: result.cost
|
|
202
|
-
}
|
|
203
|
-
end
|
|
204
|
-
|
|
205
|
-
def format_cost(cost)
|
|
206
|
-
"$#{format("%.6f", cost)}"
|
|
33
|
+
def each(&block)
|
|
34
|
+
results.each(&block)
|
|
207
35
|
end
|
|
208
36
|
end
|
|
209
37
|
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Formats eval reports for console and string output.
|
|
7
|
+
class ReportPresenter
|
|
8
|
+
def initialize(report:, stats:)
|
|
9
|
+
@report = report
|
|
10
|
+
@stats = stats
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def summary
|
|
14
|
+
summary_parts.join(", ")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def to_s
|
|
18
|
+
([summary] + @stats.failures.map { |result| format_failure(result) }).join("\n")
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def print_summary(io = $stdout)
|
|
22
|
+
io.puts summary
|
|
23
|
+
io.puts
|
|
24
|
+
@report.results.each { |result| print_result(io, result) }
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
def summary_parts
|
|
30
|
+
parts = ["#{@report.dataset_name}: #{@stats.pass_rate} checks passed"]
|
|
31
|
+
parts << "#{@stats.skipped} skipped" if @stats.skipped.positive?
|
|
32
|
+
parts << format_cost(@stats.total_cost) if @stats.total_cost.positive?
|
|
33
|
+
parts
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def format_failure(result)
|
|
37
|
+
line = " FAIL #{result.name}"
|
|
38
|
+
line += ": #{result.details}" if useful_details?(result.details)
|
|
39
|
+
line
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def print_result(io, result)
|
|
43
|
+
io.puts " #{result.label} #{result.name}#{result_cost(result)}#{result_latency(result)}"
|
|
44
|
+
io.puts " #{result.details}" if result.failed? && useful_details?(result.details)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def useful_details?(details)
|
|
48
|
+
details && !Report::GENERIC_DETAILS.include?(details)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def result_cost(result)
|
|
52
|
+
result.cost ? " #{format_cost(result.cost)}" : ""
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def result_latency(result)
|
|
56
|
+
result.duration_ms ? " #{result.duration_ms}ms" : ""
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def format_cost(cost)
|
|
60
|
+
"$#{format("%.6f", cost)}"
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Contract
|
|
5
|
+
module Eval
|
|
6
|
+
# Computes aggregate metrics for an eval report.
|
|
7
|
+
class ReportStats
|
|
8
|
+
def initialize(results:)
|
|
9
|
+
@results = results
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def score
|
|
13
|
+
return 0.0 if evaluated_results.empty?
|
|
14
|
+
|
|
15
|
+
evaluated_results.sum(&:score) / evaluated_results.length
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def passed
|
|
19
|
+
evaluated_results.count(&:passed?)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def failed
|
|
23
|
+
evaluated_results.count(&:failed?)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def skipped
|
|
27
|
+
@results.count { |result| result.step_status == :skipped }
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def failures
|
|
31
|
+
evaluated_results.select(&:failed?)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def pass_rate
|
|
35
|
+
"#{passed}/#{evaluated_results.length}"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def total_cost
|
|
39
|
+
@results.sum { |result| result.cost || 0.0 }
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def avg_latency_ms
|
|
43
|
+
latencies = @results.filter_map(&:duration_ms)
|
|
44
|
+
return nil if latencies.empty?
|
|
45
|
+
|
|
46
|
+
latencies.sum.to_f / latencies.length
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def passed?
|
|
50
|
+
return false if evaluated_results.empty?
|
|
51
|
+
|
|
52
|
+
evaluated_results.all?(&:passed?)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def evaluated_results
|
|
56
|
+
@evaluated_results ||= @results.reject { |result| result.step_status == :skipped }
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def evaluated_results_count
|
|
60
|
+
evaluated_results.length
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module RubyLLM
|
|
7
|
+
module Contract
|
|
8
|
+
module Eval
|
|
9
|
+
# Persists eval reports as history entries and regression baselines.
|
|
10
|
+
class ReportStorage
|
|
11
|
+
def initialize(report:, stats:)
|
|
12
|
+
@report = report
|
|
13
|
+
@stats = stats
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def save_history!(path: nil, model: nil)
|
|
17
|
+
file = path || storage_path(Report::HISTORY_DIR, "jsonl", model: model)
|
|
18
|
+
EvalHistory.append(file, history_entry)
|
|
19
|
+
file
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def eval_history(path: nil, model: nil)
|
|
23
|
+
EvalHistory.load(path || storage_path(Report::HISTORY_DIR, "jsonl", model: model))
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def save_baseline!(path: nil, model: nil)
|
|
27
|
+
file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
|
|
28
|
+
FileUtils.mkdir_p(File.dirname(file))
|
|
29
|
+
File.write(file, JSON.pretty_generate(serialize_for_baseline))
|
|
30
|
+
file
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def compare_with_baseline(path: nil, model: nil)
|
|
34
|
+
file = path || storage_path(Report::BASELINE_DIR, "json", model: model)
|
|
35
|
+
raise ArgumentError, "No baseline found at #{file}" unless File.exist?(file)
|
|
36
|
+
|
|
37
|
+
baseline_data = JSON.parse(File.read(file), symbolize_names: true)
|
|
38
|
+
validate_baseline!(baseline_data)
|
|
39
|
+
|
|
40
|
+
BaselineDiff.new(
|
|
41
|
+
baseline_cases: baseline_data[:cases],
|
|
42
|
+
current_cases: @report.results.map { |result| serialize_case(result) }
|
|
43
|
+
)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def baseline_exists?(path: nil, model: nil)
|
|
47
|
+
File.exist?(path || storage_path(Report::BASELINE_DIR, "json", model: model))
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
private
|
|
51
|
+
|
|
52
|
+
def history_entry
|
|
53
|
+
{
|
|
54
|
+
date: Time.now.strftime("%Y-%m-%d"),
|
|
55
|
+
score: @stats.score,
|
|
56
|
+
total_cost: @stats.total_cost,
|
|
57
|
+
pass_rate: @stats.pass_rate,
|
|
58
|
+
cases_count: @stats.evaluated_results_count
|
|
59
|
+
}
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def serialize_for_baseline
|
|
63
|
+
{
|
|
64
|
+
dataset_name: @report.dataset_name,
|
|
65
|
+
step_name: @report.step_name,
|
|
66
|
+
score: @stats.score,
|
|
67
|
+
total_cost: @stats.total_cost,
|
|
68
|
+
cases: @stats.evaluated_results.map { |result| serialize_case(result) }
|
|
69
|
+
}
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def serialize_case(result)
|
|
73
|
+
{
|
|
74
|
+
name: result.name,
|
|
75
|
+
passed: result.passed?,
|
|
76
|
+
score: result.score,
|
|
77
|
+
details: result.details,
|
|
78
|
+
cost: result.cost
|
|
79
|
+
}
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def storage_path(root_dir, extension, model:)
|
|
83
|
+
parts = [root_dir]
|
|
84
|
+
parts << sanitize_name(@report.step_name) if @report.step_name
|
|
85
|
+
|
|
86
|
+
dataset_name = sanitize_name(@report.dataset_name)
|
|
87
|
+
dataset_name = "#{dataset_name}_#{sanitize_name(model)}" if model
|
|
88
|
+
|
|
89
|
+
File.join(*parts, "#{dataset_name}.#{extension}")
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def validate_baseline!(data)
|
|
93
|
+
if data[:dataset_name] && data[:dataset_name] != @report.dataset_name
|
|
94
|
+
raise ArgumentError, "Baseline eval '#{data[:dataset_name]}' does not match '#{@report.dataset_name}'"
|
|
95
|
+
end
|
|
96
|
+
if data[:step_name] && @report.step_name && data[:step_name] != @report.step_name
|
|
97
|
+
raise ArgumentError, "Baseline step '#{data[:step_name]}' does not match '#{@report.step_name}'"
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
def sanitize_name(name)
|
|
102
|
+
name.to_s.gsub(/[^a-zA-Z0-9_-]/, "_")
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
end
|