ruby-skill-bench 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +166 -35
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +12 -1
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +52 -11
- data/lib/skill_bench/execution/sandbox.rb +58 -11
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +171 -19
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +2 -3
- metadata +17 -36
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
require_relative 'formatting_helpers'
|
|
5
|
+
require_relative '../delta_report'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Services
|
|
9
|
+
# Formats evaluation results as a complete, self-contained HTML document.
|
|
10
|
+
#
|
|
11
|
+
# The output embeds all styling inline (no external assets) and escapes every
|
|
12
|
+
# dynamic, user-derived value with {CGI.escapeHTML} to prevent HTML injection.
|
|
13
|
+
# Both the modern DeltaReport shape and the legacy result shape are supported.
|
|
14
|
+
class HtmlFormatter
  extend FormattingHelpers

  # Inline stylesheet embedded in every generated document.
  STYLE = <<~CSS
    body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; color: #1a1a1a; background: #fafafa; }
    main { max-width: 960px; margin: 0 auto; }
    header { border-bottom: 2px solid #ddd; padding-bottom: 1rem; margin-bottom: 1.5rem; }
    h1 { margin: 0 0 0.5rem; font-size: 1.6rem; }
    dl.meta { display: grid; grid-template-columns: max-content 1fr; gap: 0.2rem 1rem; margin: 0.5rem 0; }
    dl.meta dt { font-weight: 600; color: #555; }
    dl.meta dd { margin: 0; }
    p.usage { color: #555; font-variant-numeric: tabular-nums; }
    table { border-collapse: collapse; width: 100%; margin: 1rem 0; }
    th, td { padding: 0.4rem 0.75rem; text-align: right; border-bottom: 1px solid #e2e2e2; }
    th:first-child, td:first-child { text-align: left; }
    tr.total td { font-weight: 700; border-top: 2px solid #bbb; }
    p.verdict { font-weight: 700; padding: 0.5rem 0.75rem; border-radius: 4px; display: inline-block; }
    p.verdict.pass { background: #e6f4ea; color: #1e7e34; }
    p.verdict.fail { background: #fde8e8; color: #c0392b; }
    p.error { color: #c0392b; }
    section.iterations h3 { margin-bottom: 0.25rem; }
    ol { margin: 0.25rem 0 1rem; }
    li { margin: 0.2rem 0; }
    span.tools, span.observation { color: #555; }
  CSS

  # Format an eval result as a full HTML document.
  #
  # A result whose [:response][:report] is a SkillBench::DeltaReport gets the
  # scoring table plus iteration timeline; anything else is rendered through
  # the legacy status section.
  #
  # @param result [Hash] Eval result envelope (DeltaReport or legacy shape).
  # @return [String] A complete HTML document string.
  def self.format(result)
    report = result.dig(:response, :report)
    body = report.is_a?(SkillBench::DeltaReport) ? delta_body(result, report) : legacy_section(result)
    build_document(result, body)
  end

  # Builds the body for a DeltaReport result (table plus iteration timeline).
  #
  # @param result [Hash] Eval result envelope.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML for the report and iteration sections.
  def self.delta_body(result, report)
    "#{report_section(report)}\n#{iterations_section(result)}"
  end
  private_class_method :delta_body

  # Wraps body HTML in a complete, styled HTML document.
  #
  # @param result [Hash] Eval result envelope (used for the header/title).
  # @param body [String] Pre-rendered body HTML (inserted verbatim, so it must
  #   already be escaped).
  # @return [String] A complete HTML document string.
  def self.build_document(result, body)
    title = escape(result[:eval_name] || 'Report')
    <<~HTML
      <!DOCTYPE html>
      <html lang="en">
      <head>
      <meta charset="utf-8">
      <title>SkillBench Report — #{title}</title>
      <style>#{STYLE}</style>
      </head>
      <body>
      <main>
      #{header_html(result)}
      #{body}
      </main>
      </body>
      </html>
    HTML
  end
  private_class_method :build_document

  # Builds the header with eval/skill/provider names and the usage line.
  #
  # @param result [Hash] Eval result envelope.
  # @return [String] HTML for the document header.
  def self.header_html(result)
    <<~HTML.chomp
      <header>
      <h1>SkillBench Report</h1>
      <dl class="meta">
      <dt>Eval</dt><dd>#{escape(result[:eval_name])}</dd>
      <dt>Skill</dt><dd>#{escape(result[:skill_name])}</dd>
      <dt>Provider</dt><dd>#{escape(result[:provider_name])}</dd>
      </dl>
      <p class="usage">#{usage_line(result)}</p>
      </header>
    HTML
  end
  private_class_method :header_html

  # Builds the token/cost summary line for the header.
  #
  # Top-level :tokens / :cost win when present. When the top-level key is
  # absent the values nested under :response are used instead, so a result
  # that only carries usage inside its response does not render as
  # "Tokens: 0". An explicit top-level `cost: nil` is respected and shown
  # as "—".
  #
  # @param result [Hash] Eval result envelope; reads :tokens and :cost, with
  #   [:response][:tokens] and [:response][:cost] as fallbacks.
  # @return [String] An escaped "Tokens / Est. Cost" line.
  def self.usage_line(result)
    tokens = result[:tokens] || result.dig(:response, :tokens) || {}
    # Token hashes may arrive with symbol or string keys.
    total = tokens[:total_tokens] || tokens['total_tokens'] || 0
    cost = result.key?(:cost) ? result[:cost] : result.dig(:response, :cost)
    cost_label = cost ? Kernel.format('$%.4f', cost) : '—'
    "Tokens: #{escape(total)} | Est. Cost: #{escape(cost_label)}"
  end
  private_class_method :usage_line

  # Builds the scoring table and verdict for a DeltaReport.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML for the report section.
  def self.report_section(report)
    <<~HTML.chomp
      <section class="report">
      <h2>Delta Report</h2>
      <table>
      <thead><tr><th>Dimension</th><th>Baseline</th><th>Context</th><th>Delta</th></tr></thead>
      <tbody>
      #{dimension_rows(report)}
      #{total_row(report)}
      </tbody>
      </table>
      #{verdict_html(report)}
      </section>
    HTML
  end
  private_class_method :report_section

  # Builds one table row per scored dimension.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML table rows joined by newlines.
  def self.dimension_rows(report)
    report.deltas.map { |name, delta| dimension_row(name, delta, report) }.join("\n")
  end
  private_class_method :dimension_rows

  # Builds a single dimension table row.
  #
  # The label carries the dimension's max score in parentheses when the
  # dimension is found in the report's criteria; otherwise only the
  # humanized name is shown.
  #
  # @param name [String] Dimension name.
  # @param delta [Numeric] Context-minus-baseline delta.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML table row.
  def self.dimension_row(name, delta, report)
    dim = report.criteria.dimensions.find { |candidate| candidate.name == name }
    humanized = humanize(name)
    label = dim ? "#{humanized} (#{dim.max_score})" : humanized
    baseline = report.baseline_scores[name]
    context = report.context_scores[name]
    row_cells('dimension', label, baseline, context, delta_str(delta))
  end
  private_class_method :dimension_row

  # Builds the totals table row.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML table row for the totals.
  def self.total_row(report)
    total_delta = report.deltas.values.sum
    row_cells('total', 'Total', "#{report.baseline_total}/100",
              "#{report.context_total}/100", delta_str(total_delta))
  end
  private_class_method :total_row

  # Builds an HTML table row, escaping every cell value.
  #
  # @param css_class [String] CSS class for the row (inserted unescaped;
  #   callers pass fixed literals).
  # @param label [String] First-column label.
  # @param baseline [Object] Baseline score cell.
  # @param context [Object] Context score cell.
  # @param delta [String] Delta cell.
  # @return [String] An HTML table row.
  def self.row_cells(css_class, label, baseline, context, delta)
    "<tr class=\"#{css_class}\"><td>#{escape(label)}</td><td>#{escape(baseline)}</td>" \
      "<td>#{escape(context)}</td><td>#{escape(delta)}</td></tr>"
  end
  private_class_method :row_cells

  # Builds the verdict paragraph.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML verdict paragraph.
  def self.verdict_html(report)
    verdict = report.verdict
    criteria = report.criteria
    status = verdict ? 'PASS' : 'FAIL'
    css = verdict ? 'pass' : 'fail'
    threshold = escape(criteria.pass_threshold)
    minimum_delta = escape(criteria.minimum_delta)
    %(<p class="verdict #{css}">Verdict: #{status} (threshold: #{threshold}, minimum delta: #{minimum_delta})</p>)
  end
  private_class_method :verdict_html

  # Builds the baseline/context iteration timeline section.
  #
  # @param result [Hash] Eval result envelope.
  # @return [String] HTML for the iterations section, or empty string when
  #   neither run recorded any iterations.
  def self.iterations_section(result)
    baseline = result.dig(:response, :baseline_iterations) || []
    context = result.dig(:response, :context_iterations) || []
    baseline_empty = baseline.empty?
    context_empty = context.empty?
    return '' if baseline_empty && context_empty

    blocks = []
    blocks << iteration_block('Baseline Iterations', baseline) unless baseline_empty
    blocks << iteration_block('Context Iterations', context) unless context_empty
    %(<section class="iterations">\n<h2>Iteration Timeline</h2>\n#{blocks.join("\n")}\n</section>)
  end
  private_class_method :iterations_section

  # Builds one named iteration timeline block.
  #
  # @param title [String] Section title.
  # @param iterations [Array<Hash>] Iteration metadata entries.
  # @return [String] HTML for the timeline block.
  def self.iteration_block(title, iterations)
    items = iterations.map { |iteration| iteration_item(iteration) }.join("\n")
    %(<div class="timeline"><h3>#{escape(title)}</h3><ol>\n#{items}\n</ol></div>)
  end
  private_class_method :iteration_block

  # Builds one list item for a single iteration step.
  #
  # The tools and observation spans are omitted entirely when the iteration
  # has no tools or an empty observation.
  #
  # @param iteration [Hash] Iteration metadata with :step_number, :thought,
  #   :tools_used, and :observation_summary keys.
  # @return [String] An HTML list item.
  def self.iteration_item(iteration)
    tools = iteration[:tools_used] || []
    tools_html = tools.empty? ? '' : %( <span class="tools">Tools: #{escape(tools.join(', '))}</span>)
    observation = iteration[:observation_summary].to_s
    observation_html = observation.empty? ? '' : %( <span class="observation">Observation: #{escape(observation)}</span>)
    step = "Step #{escape(iteration[:step_number])}: #{escape(iteration[:thought])}"
    %(<li><span class="thought">#{step}</span>#{tools_html}#{observation_html}</li>)
  end
  private_class_method :iteration_item

  # Builds the body for a legacy (non-DeltaReport) result.
  #
  # @param result [Hash] Legacy eval result envelope; reads :pass and :score.
  # @return [String] HTML for the legacy status section.
  def self.legacy_section(result)
    passed = result[:pass]
    status = passed ? 'PASSED' : 'FAILED'
    css = passed ? 'pass' : 'fail'
    score = result[:score]&.round(2)
    <<~HTML.chomp
      <section class="report legacy">
      <h2>Result</h2>
      <p class="verdict #{css}">Status: #{status}</p>
      <p class="score">Score: #{escape(score || 'N/A')}</p>
      #{legacy_error(result)}
      </section>
    HTML
  end
  private_class_method :legacy_section

  # Builds the optional error paragraph for a legacy result.
  #
  # @param result [Hash] Legacy eval result envelope.
  # @return [String] An HTML error paragraph, or empty string.
  def self.legacy_error(result)
    message = result.dig(:response, :error, :message)
    message ? %(<p class="error">Error: #{escape(message)}</p>) : ''
  end
  private_class_method :legacy_error

  # Escapes any value for safe HTML embedding.
  #
  # @param value [Object] The value to escape (coerced via #to_s, so nil
  #   becomes an empty string).
  # @return [String] HTML-escaped text.
  def self.escape(value)
    CGI.escapeHTML(value.to_s)
  end
  private_class_method :escape
end
|
|
288
|
+
end
|
|
289
|
+
end
|
|
@@ -6,12 +6,30 @@ module SkillBench
|
|
|
6
6
|
module Services
|
|
7
7
|
# Formats evaluation results as JSON.
|
|
8
8
|
class JsonFormatter
  # Zeroed token usage used when a result carries no usage data.
  EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze

  # Format result as JSON.
  #
  # Ensures top-level :tokens and :cost fields are always present (additive;
  # existing keys are preserved) so JSON consumers see a stable shape.
  #
  # @param result [Hash] Eval result.
  # @return [String] JSON-formatted string.
  def self.format(result)
    JSON.pretty_generate(with_usage_fields(result))
  end

  # Returns the result augmented with token/cost fields when missing.
  #
  # Lookup order: the top-level key, then the same key nested under
  # :response, then a default (EMPTY_USAGE for tokens, nil for cost). The
  # nested lookup only happens when :response is a Hash; a String or other
  # scalar response is left alone instead of raising TypeError from #dig.
  # An explicit top-level `cost: nil` is kept as nil rather than replaced by
  # the nested value.
  #
  # @param result [Hash] Eval result (returned unchanged when not a Hash).
  # @return [Hash] A new Hash with :tokens and :cost guaranteed present; the
  #   input is not mutated.
  def self.with_usage_fields(result)
    return result unless result.is_a?(Hash)

    response = result[:response]
    nested = response.is_a?(Hash) ? response : {}
    tokens = result[:tokens] || nested[:tokens] || EMPTY_USAGE
    cost = result.key?(:cost) ? result[:cost] : nested[:cost]
    result.merge(tokens: tokens, cost: cost)
  end
end
|
|
17
35
|
end
|
|
@@ -4,39 +4,89 @@ require 'cgi'
|
|
|
4
4
|
|
|
5
5
|
module SkillBench
|
|
6
6
|
module Services
|
|
7
|
-
# Formats evaluation results as JUnit XML.
|
|
7
|
+
# Formats evaluation results as JUnit XML for CI consumption.
|
|
8
|
+
#
|
|
9
|
+
# Two entry points share the same per-result verdict/score logic:
|
|
10
|
+
# {.format} emits a single-result suite (one <testcase>), while
|
|
11
|
+
# {.format_batch} aggregates many results into one suite so a batch
|
|
12
|
+
# `skill-bench run --all` produces a single JUnit artifact.
|
|
8
13
|
class JUnitFormatter
  # Value used for the suite name and for every <testcase> classname attribute.
  CLASSNAME = 'SkillBench'

  # Renders one eval result as a standalone JUnit XML document.
  #
  # Works for both the legacy shape (result[:pass] / result[:score]) and the
  # modern shape carrying a report object under [:response][:report].
  #
  # @param result [Hash] Eval result.
  # @return [String] JUnit XML-formatted string.
  def self.format(result)
    suite([result])
  end

  # Renders a batch envelope as one JUnit XML document.
  #
  # All results share a single <testsuite>; each becomes a <testcase>, and
  # failing ones get a <failure> child. A missing :results key is treated as
  # an empty batch.
  #
  # @param aggregate [Hash] Aggregate envelope with a :results array.
  # @return [String] JUnit XML-formatted string.
  def self.format_batch(aggregate)
    suite(aggregate[:results] || [])
  end

  # Assembles the XML declaration and the <testsuite> around the test cases.
  #
  # @param results [Array<Hash>] Per-eval result envelopes.
  # @return [String] JUnit XML-formatted string, terminated by a newline.
  def self.suite(results)
    failed = results.reject { |entry| passing?(entry) }
    document = [
      '<?xml version="1.0"?>',
      %(<testsuite name="#{CLASSNAME}" tests="#{results.size}" failures="#{failed.size}">),
      results.map { |entry| testcase(entry) }.join("\n"),
      '</testsuite>'
    ]
    "#{document.join("\n")}\n"
  end

  # Renders the <testcase> element for one result.
  #
  # Passing results collapse to a self-closing element; failing results wrap
  # a <failure> whose message carries the escaped score.
  #
  # @param result [Hash] A single-eval result envelope.
  # @return [String] A <testcase> XML fragment.
  def self.testcase(result)
    escaped_name = CGI.escapeHTML(result[:eval_name].to_s)
    opening = %( <testcase name="#{escaped_name}" classname="#{CLASSNAME}")
    return "#{opening}/>" if passing?(result)

    escaped_score = CGI.escapeHTML(score_for(result).to_s)
    [
      "#{opening}>",
      %( <failure message="Score: #{escaped_score}">Eval failed</failure>),
      ' </testcase>'
    ].join("\n")
  end

  # Decides whether a result counts as passed.
  #
  # @param result [Hash] A single-eval result envelope.
  # @return [Boolean] The report's verdict when the report exposes one,
  #   otherwise the legacy :pass flag.
  def self.passing?(result)
    report = result.dig(:response, :report)
    return result[:pass] unless report.respond_to?(:verdict)

    report.verdict
  end

  # Picks the score shown in a failing test case's message.
  #
  # @param result [Hash] A single-eval result envelope.
  # @return [Object] The report's context_total when available, otherwise
  #   the legacy :score.
  def self.score_for(result)
    report = result.dig(:response, :report)
    return result[:score] unless report.respond_to?(:context_total)

    report.context_total
  end

  private_class_method :suite, :testcase, :passing?, :score_for
end
|
|
41
91
|
end
|
|
42
92
|
end
|
|
@@ -51,11 +51,14 @@ module SkillBench
|
|
|
51
51
|
private
|
|
52
52
|
|
|
53
53
|
def resolve_provider
|
|
54
|
-
config = SkillBench::Models::Config.
|
|
54
|
+
config = SkillBench::Models::Config.loaded
|
|
55
55
|
provider = config.to_provider
|
|
56
56
|
return provider if provider
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
# Explicit `{"provider":"mock"}` is a valid choice, not a load failure,
|
|
59
|
+
# so it falls through to the mock provider without a warning. A missing
|
|
60
|
+
# provider key (genuine misconfiguration) still warns below.
|
|
61
|
+
warn 'Config load failed, using mock provider' unless config.mock?
|
|
59
62
|
MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
|
|
60
63
|
rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
|
|
61
64
|
# Config parsing/validation errors or missing config file - fall back to mock
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Content-addressed, in-memory cache for LLM responses.
|
|
9
|
+
#
|
|
10
|
+
# The cache is opt-in and disabled by default. When enabled it lets repeated,
|
|
11
|
+
# identical LLM requests reuse a previously computed response instead of
|
|
12
|
+
# hitting the network again. The canonical example is `compare`, which runs
|
|
13
|
+
# the skill-less baseline twice with identical inputs.
|
|
14
|
+
#
|
|
15
|
+
# The backing store is a process-lifetime {Hash} keyed by a stable SHA-256
|
|
16
|
+
# digest of the request, so the same logical request always maps to the same
|
|
17
|
+
# entry regardless of hash-key ordering. Access to the store is serialized
|
|
18
|
+
# with a mutex so concurrent callers (e.g. {Parallel}-driven agents) cannot
|
|
19
|
+
# corrupt it or double-store a key.
|
|
20
|
+
class ResponseCache
  # Environment variable that opts caching on when set to a truthy value.
  ENV_FLAG = 'SKILL_BENCH_CACHE'

  # Raw env values treated as "on".
  TRUTHY_VALUES = %w[1 true yes on].freeze

  # Guards every read/write of the shared store. Concurrent agents/judges run
  # on separate threads; without this, the membership check and the write in
  # {fetch} could interleave and store a key more than once.
  MUTEX = Mutex.new

  class << self
    # Whether response caching is currently enabled.
    #
    # Enabled when {ENV_FLAG} is set to a truthy value (one of
    # {TRUTHY_VALUES}, compared after stripping whitespace and downcasing);
    # disabled when unset or set to anything else.
    #
    # @return [Boolean] true when caching is on
    def enabled?
      raw = ENV.fetch(ENV_FLAG, '').to_s.strip.downcase
      TRUTHY_VALUES.include?(raw)
    end

    # Computes a stable content-addressed cache key for a request.
    #
    # The inputs are assembled into a canonical structure (hash keys sorted
    # and stringified recursively) and hashed, so semantically identical
    # requests always produce the same digest. Request-affecting provider
    # configuration (endpoint/base URL/etc.) is included so two providers that
    # share a name but target different endpoints never collide.
    #
    # @param provider [Symbol, String] Resolved provider identifier
    # @param model [String, nil] Model name
    # @param system_prompt [String] System prompt
    # @param messages [Array<Hash>] Conversation messages
    # @param tools [Array<Hash>, nil] Tool definitions, when present
    # @param temperature [Float, nil] Sampling temperature, when present
    # @param provider_config [Hash] Request-affecting provider settings such as
    #   base_url, request_path, endpoint, location, project_id, api_version
    # @return [String] Hex-encoded SHA-256 digest of the canonical request
    def key(provider:, model:, system_prompt:, messages:, tools: nil, temperature: nil, provider_config: {})
      payload = {
        provider: provider.to_s,
        model: model,
        system_prompt: system_prompt,
        messages: messages,
        tools: tools,
        temperature: temperature,
        provider_config: provider_config
      }
      Digest::SHA256.hexdigest(JSON.generate(canonicalize(payload)))
    end

    # Returns the cached value for a key, computing and storing it on a miss.
    #
    # The value is computed outside the lock so requests for distinct keys run
    # concurrently (and so the block may itself call {fetch}); the store read
    # and the store write are each serialized by {MUTEX}. Two callers that
    # miss at the same time may both run their block, but only the first
    # non-nil result is kept and both callers receive it.
    #
    # A nil entry always counts as a miss. A nil block result is therefore
    # returned to the caller but never retained, and a stored `false` is a
    # real entry that later writers do not replace.
    #
    # @param key [String] Cache key from {key}
    # @yield Computes the value to cache when the key is absent
    # @yieldreturn [Object] The value to cache
    # @return [Object] The cached value (existing on a hit, freshly stored on a miss)
    def fetch(key)
      hit = MUTEX.synchronize { store[key] }
      return hit unless hit.nil?

      value = yield
      MUTEX.synchronize do
        existing = store[key]
        if existing.nil?
          # Still a miss: keep our result, unless it is nil (never served as a hit).
          store[key] = value unless value.nil?
          value
        else
          # Another caller stored an entry while we were computing; theirs wins.
          existing
        end
      end
    end

    # Removes every cached entry.
    #
    # @return [void]
    def clear
      MUTEX.synchronize { store.clear }
    end

    private

    # The process-lifetime backing store. Only called while holding {MUTEX},
    # which also makes the lazy initialization safe.
    #
    # @return [Hash{String => Object}] digest => cached response
    def store
      @store ||= {}
    end

    # Recursively rewrites a value into a stable form for serialization.
    #
    # Hashes get their keys stringified and sorted so that key ordering does
    # not affect the resulting digest; arrays and scalars are preserved.
    #
    # @param value [Object] The value to canonicalize
    # @return [Object] A canonical, order-stable copy of the value
    def canonicalize(value)
      case value
      when Hash
        value
          .sort_by { |entry| entry.first.to_s }
          .each_with_object({}) { |(name, val), acc| acc[name.to_s] = canonicalize(val) }
      when Array
        value.map { |element| canonicalize(element) }
      else
        value
      end
    end
  end
end
|
|
129
|
+
end
|
|
130
|
+
end
|