ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +166 -35
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  9. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  10. data/lib/skill_bench/cli/help_printer.rb +10 -2
  11. data/lib/skill_bench/cli/init_command.rb +2 -1
  12. data/lib/skill_bench/cli/result_printer.rb +1 -1
  13. data/lib/skill_bench/cli/run_command.rb +47 -9
  14. data/lib/skill_bench/cli/validate_command.rb +242 -0
  15. data/lib/skill_bench/cli.rb +3 -0
  16. data/lib/skill_bench/client.rb +43 -1
  17. data/lib/skill_bench/clients/all.rb +2 -0
  18. data/lib/skill_bench/clients/base_client.rb +12 -1
  19. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  20. data/lib/skill_bench/clients/provider_config.rb +34 -1
  21. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  22. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  23. data/lib/skill_bench/commands/init.rb +5 -0
  24. data/lib/skill_bench/commands/skill_new.rb +3 -1
  25. data/lib/skill_bench/config/applier.rb +2 -0
  26. data/lib/skill_bench/config/defaults.rb +2 -0
  27. data/lib/skill_bench/config/facade_readers.rb +7 -0
  28. data/lib/skill_bench/config/facade_writers.rb +17 -0
  29. data/lib/skill_bench/config/json_loader.rb +1 -1
  30. data/lib/skill_bench/config/store.rb +29 -0
  31. data/lib/skill_bench/config.rb +18 -0
  32. data/lib/skill_bench/evaluation/runner.rb +20 -3
  33. data/lib/skill_bench/execution/context_hydrator.rb +52 -11
  34. data/lib/skill_bench/execution/sandbox.rb +58 -11
  35. data/lib/skill_bench/judge/judge.rb +4 -0
  36. data/lib/skill_bench/judge/prompt.rb +42 -6
  37. data/lib/skill_bench/models/config.rb +32 -0
  38. data/lib/skill_bench/output_formatter.rb +60 -1
  39. data/lib/skill_bench/package_verifier.rb +1 -1
  40. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  41. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  42. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  43. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  44. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  45. data/lib/skill_bench/services/html_formatter.rb +289 -0
  46. data/lib/skill_bench/services/json_formatter.rb +19 -1
  47. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  48. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  49. data/lib/skill_bench/services/response_cache.rb +130 -0
  50. data/lib/skill_bench/services/runner_service.rb +88 -4
  51. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  52. data/lib/skill_bench/services/template_registry.rb +43 -9
  53. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  54. data/lib/skill_bench/tools/registry.rb +29 -3
  55. data/lib/skill_bench/tools/run_command.rb +171 -19
  56. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  57. data/lib/skill_bench/trend_tracker.rb +5 -5
  58. data/lib/skill_bench/version.rb +1 -1
  59. data/lib/skill_bench.rb +2 -3
  60. metadata +17 -36
@@ -0,0 +1,289 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+ require_relative 'formatting_helpers'
5
+ require_relative '../delta_report'
6
+
7
module SkillBench
  module Services
    # Renders an evaluation result as one self-contained HTML document.
    #
    # All styling is embedded inline via {STYLE}; no external assets are
    # referenced. Every dynamic value interpolated into the markup goes through
    # {.escape} (CGI.escapeHTML) so user-derived text cannot inject HTML.
    # Results whose `response.report` is a {SkillBench::DeltaReport} get the
    # scoring table plus an iteration timeline; any other result is rendered
    # with the legacy pass/score section.
    class HtmlFormatter
      # NOTE(review): `humanize` and `delta_str` used below are presumably
      # provided by FormattingHelpers — confirm against that module.
      extend FormattingHelpers

      # Inline stylesheet embedded in every generated document.
      STYLE = <<~CSS
        body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; color: #1a1a1a; background: #fafafa; }
        main { max-width: 960px; margin: 0 auto; }
        header { border-bottom: 2px solid #ddd; padding-bottom: 1rem; margin-bottom: 1.5rem; }
        h1 { margin: 0 0 0.5rem; font-size: 1.6rem; }
        dl.meta { display: grid; grid-template-columns: max-content 1fr; gap: 0.2rem 1rem; margin: 0.5rem 0; }
        dl.meta dt { font-weight: 600; color: #555; }
        dl.meta dd { margin: 0; }
        p.usage { color: #555; font-variant-numeric: tabular-nums; }
        table { border-collapse: collapse; width: 100%; margin: 1rem 0; }
        th, td { padding: 0.4rem 0.75rem; text-align: right; border-bottom: 1px solid #e2e2e2; }
        th:first-child, td:first-child { text-align: left; }
        tr.total td { font-weight: 700; border-top: 2px solid #bbb; }
        p.verdict { font-weight: 700; padding: 0.5rem 0.75rem; border-radius: 4px; display: inline-block; }
        p.verdict.pass { background: #e6f4ea; color: #1e7e34; }
        p.verdict.fail { background: #fde8e8; color: #c0392b; }
        p.error { color: #c0392b; }
        section.iterations h3 { margin-bottom: 0.25rem; }
        ol { margin: 0.25rem 0 1rem; }
        li { margin: 0.2rem 0; }
        span.tools, span.observation { color: #555; }
      CSS

      # Format an eval result as a full HTML document.
      #
      # Chooses the DeltaReport body when `result[:response][:report]` is a
      # {SkillBench::DeltaReport}; otherwise falls back to the legacy section.
      #
      # @param result [Hash] Eval result envelope (DeltaReport or legacy shape).
      # @return [String] A complete HTML document string.
      def self.format(result)
        report = result.dig(:response, :report)
        body = report.is_a?(SkillBench::DeltaReport) ? delta_body(result, report) : legacy_section(result)
        build_document(result, body)
      end

      # Builds the body for a DeltaReport result: the scoring table section
      # followed by the iteration timeline section, separated by a newline.
      #
      # @param result [Hash] Eval result envelope.
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] HTML for the report and iteration sections.
      def self.delta_body(result, report)
        "#{report_section(report)}\n#{iterations_section(result)}"
      end
      private_class_method :delta_body

      # Wraps pre-rendered body HTML in a complete, styled HTML document.
      #
      # The <title> uses the escaped :eval_name, or "Report" when it is absent.
      #
      # @param result [Hash] Eval result envelope (used for the header/title).
      # @param body [String] Pre-rendered body HTML (inserted verbatim).
      # @return [String] A complete HTML document string.
      def self.build_document(result, body)
        title = escape(result[:eval_name] || 'Report')
        <<~HTML
          <!DOCTYPE html>
          <html lang="en">
          <head>
          <meta charset="utf-8">
          <title>SkillBench Report — #{title}</title>
          <style>#{STYLE}</style>
          </head>
          <body>
          <main>
          #{header_html(result)}
          #{body}
          </main>
          </body>
          </html>
        HTML
      end
      private_class_method :build_document

      # Builds the header with eval/skill/provider names and the usage line.
      #
      # @param result [Hash] Eval result envelope; reads :eval_name,
      #   :skill_name and :provider_name.
      # @return [String] HTML for the document header (no trailing newline).
      def self.header_html(result)
        <<~HTML.chomp
          <header>
          <h1>SkillBench Report</h1>
          <dl class="meta">
          <dt>Eval</dt><dd>#{escape(result[:eval_name])}</dd>
          <dt>Skill</dt><dd>#{escape(result[:skill_name])}</dd>
          <dt>Provider</dt><dd>#{escape(result[:provider_name])}</dd>
          </dl>
          <p class="usage">#{usage_line(result)}</p>
          </header>
        HTML
      end
      private_class_method :header_html

      # Builds the token/cost summary line for the header.
      #
      # Top-level :tokens / :cost win when present; otherwise the values nested
      # under :response are used, mirroring the lookup JsonFormatter performs so
      # the HTML and JSON outputs report the same usage for the same result.
      # The token total accepts either a Symbol or String "total_tokens" key and
      # defaults to 0; a missing cost is shown as an em dash.
      #
      # @param result [Hash] Eval result envelope; reads :tokens and :cost
      #   (falling back to response.tokens / response.cost).
      # @return [String] An escaped "Tokens / Est. Cost" line.
      def self.usage_line(result)
        tokens = result[:tokens] || result.dig(:response, :tokens) || {}
        total = tokens[:total_tokens] || tokens['total_tokens'] || 0
        # An explicit top-level :cost key (even nil) takes precedence.
        cost = result.key?(:cost) ? result[:cost] : result.dig(:response, :cost)
        # Kernel.format is spelled out because this class defines its own .format.
        cost_label = cost ? Kernel.format('$%.4f', cost) : '—'
        "Tokens: #{escape(total)} | Est. Cost: #{escape(cost_label)}"
      end
      private_class_method :usage_line

      # Builds the scoring table and verdict for a DeltaReport.
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] HTML for the report section (no trailing newline).
      def self.report_section(report)
        <<~HTML.chomp
          <section class="report">
          <h2>Delta Report</h2>
          <table>
          <thead><tr><th>Dimension</th><th>Baseline</th><th>Context</th><th>Delta</th></tr></thead>
          <tbody>
          #{dimension_rows(report)}
          #{total_row(report)}
          </tbody>
          </table>
          #{verdict_html(report)}
          </section>
        HTML
      end
      private_class_method :report_section

      # Builds one table row per entry of `report.deltas`.
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] HTML table rows joined by newlines.
      def self.dimension_rows(report)
        report.deltas.map { |name, delta| dimension_row(name, delta, report) }.join("\n")
      end
      private_class_method :dimension_rows

      # Builds a single dimension table row.
      #
      # The label is the humanized dimension name, suffixed with the
      # dimension's max score when a criteria dimension of that name exists.
      #
      # @param name [String] Dimension name.
      # @param delta [Numeric] Context-minus-baseline delta.
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] An HTML table row.
      def self.dimension_row(name, delta, report)
        dim = report.criteria.dimensions.find { |candidate| candidate.name == name }
        humanized = humanize(name)
        label = dim ? "#{humanized} (#{dim.max_score})" : humanized
        baseline = report.baseline_scores[name]
        context = report.context_scores[name]
        row_cells('dimension', label, baseline, context, delta_str(delta))
      end
      private_class_method :dimension_row

      # Builds the totals table row; totals are displayed as "<n>/100" and the
      # delta cell is the sum of all per-dimension deltas.
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] An HTML table row for the totals.
      def self.total_row(report)
        total_delta = report.deltas.values.sum
        row_cells('total', 'Total', "#{report.baseline_total}/100",
                  "#{report.context_total}/100", delta_str(total_delta))
      end
      private_class_method :total_row

      # Builds an HTML table row, escaping every cell value.
      #
      # @param css_class [String] CSS class for the row (inserted unescaped;
      #   callers pass fixed literals).
      # @param label [String] First-column label.
      # @param baseline [Object] Baseline score cell.
      # @param context [Object] Context score cell.
      # @param delta [String] Delta cell.
      # @return [String] An HTML table row.
      def self.row_cells(css_class, label, baseline, context, delta)
        "<tr class=\"#{css_class}\"><td>#{escape(label)}</td><td>#{escape(baseline)}</td>" \
          "<td>#{escape(context)}</td><td>#{escape(delta)}</td></tr>"
      end
      private_class_method :row_cells

      # Builds the verdict paragraph (PASS/FAIL plus the criteria thresholds).
      #
      # @param report [SkillBench::DeltaReport] The delta report.
      # @return [String] An HTML verdict paragraph.
      def self.verdict_html(report)
        verdict = report.verdict
        criteria = report.criteria
        status = verdict ? 'PASS' : 'FAIL'
        css = verdict ? 'pass' : 'fail'
        threshold = escape(criteria.pass_threshold)
        minimum_delta = escape(criteria.minimum_delta)
        %(<p class="verdict #{css}">Verdict: #{status} (threshold: #{threshold}, minimum delta: #{minimum_delta})</p>)
      end
      private_class_method :verdict_html

      # Builds the baseline/context iteration timeline section.
      #
      # Only non-empty iteration lists produce a block; when both are empty the
      # whole section is omitted.
      #
      # @param result [Hash] Eval result envelope; reads
      #   response.baseline_iterations and response.context_iterations.
      # @return [String] HTML for the iterations section, or empty string.
      def self.iterations_section(result)
        baseline = result.dig(:response, :baseline_iterations) || []
        context = result.dig(:response, :context_iterations) || []
        baseline_empty = baseline.empty?
        context_empty = context.empty?
        return '' if baseline_empty && context_empty

        blocks = []
        blocks << iteration_block('Baseline Iterations', baseline) unless baseline_empty
        blocks << iteration_block('Context Iterations', context) unless context_empty
        %(<section class="iterations">\n<h2>Iteration Timeline</h2>\n#{blocks.join("\n")}\n</section>)
      end
      private_class_method :iterations_section

      # Builds one named iteration timeline block (heading plus ordered list).
      #
      # @param title [String] Section title.
      # @param iterations [Array<Hash>] Iteration metadata entries.
      # @return [String] HTML for the timeline block.
      def self.iteration_block(title, iterations)
        items = iterations.map { |iteration| iteration_item(iteration) }.join("\n")
        %(<div class="timeline"><h3>#{escape(title)}</h3><ol>\n#{items}\n</ol></div>)
      end
      private_class_method :iteration_block

      # Builds one list item for a single iteration step.
      #
      # The tools and observation spans are emitted only when there is
      # something to show.
      #
      # @param iteration [Hash] Iteration metadata with :step_number, :thought,
      #   :tools_used, and :observation_summary keys.
      # @return [String] An HTML list item.
      def self.iteration_item(iteration)
        tools = iteration[:tools_used] || []
        tools_html = tools.empty? ? '' : %( <span class="tools">Tools: #{escape(tools.join(', '))}</span>)
        observation = iteration[:observation_summary].to_s
        observation_html = observation.empty? ? '' : %( <span class="observation">Observation: #{escape(observation)}</span>)
        step = "Step #{escape(iteration[:step_number])}: #{escape(iteration[:thought])}"
        %(<li><span class="thought">#{step}</span>#{tools_html}#{observation_html}</li>)
      end
      private_class_method :iteration_item

      # Builds the body for a legacy (non-DeltaReport) result.
      #
      # Shows PASSED/FAILED from :pass, the :score rounded to two decimals (or
      # "N/A" when absent), and the error message if one is present.
      #
      # @param result [Hash] Legacy eval result envelope.
      # @return [String] HTML for the legacy status section (no trailing newline).
      def self.legacy_section(result)
        passed = result[:pass]
        status = passed ? 'PASSED' : 'FAILED'
        css = passed ? 'pass' : 'fail'
        score = result[:score]&.round(2)
        <<~HTML.chomp
          <section class="report legacy">
          <h2>Result</h2>
          <p class="verdict #{css}">Status: #{status}</p>
          <p class="score">Score: #{escape(score || 'N/A')}</p>
          #{legacy_error(result)}
          </section>
        HTML
      end
      private_class_method :legacy_section

      # Builds the optional error paragraph for a legacy result.
      #
      # @param result [Hash] Legacy eval result envelope; reads
      #   response.error.message.
      # @return [String] An HTML error paragraph, or empty string.
      def self.legacy_error(result)
        message = result.dig(:response, :error, :message)
        message ? %(<p class="error">Error: #{escape(message)}</p>) : ''
      end
      private_class_method :legacy_error

      # Escapes any value for safe HTML embedding.
      #
      # @param value [Object] The value to escape (coerced via #to_s, so nil
      #   becomes an empty string).
      # @return [String] HTML-escaped text.
      def self.escape(value)
        CGI.escapeHTML(value.to_s)
      end
      private_class_method :escape
    end
  end
end
@@ -6,12 +6,30 @@ module SkillBench
6
6
  module Services
7
7
  # Formats evaluation results as JSON.
8
8
  class JsonFormatter
9
+ # Zeroed token usage used when a result carries no usage data.
10
+ EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze
11
+
9
12
  # Format result as JSON.
10
13
  #
14
+ # Ensures top-level :tokens and :cost fields are always present (additive;
15
+ # existing keys are preserved) so JSON consumers see a stable shape.
16
+ #
11
17
  # @param result [Hash] Eval result.
12
18
  # @return [String] JSON-formatted string.
13
19
  def self.format(result)
14
- JSON.pretty_generate(result)
20
+ JSON.pretty_generate(with_usage_fields(result))
21
+ end
22
+
23
+ # Returns the result augmented with token/cost fields when missing.
24
+ #
25
+ # @param result [Hash] Eval result (returned unchanged when not a Hash).
26
+ # @return [Hash] Result with :tokens and :cost guaranteed present.
27
# Adds the usage fields JSON consumers rely on, without mutating the input.
#
# :tokens is taken from the top level, then from response.tokens, then
# from EMPTY_USAGE. :cost keeps an explicit top-level value (even nil) and
# otherwise falls back to response.cost.
#
# @param result [Hash] Eval result (returned unchanged when not a Hash).
# @return [Hash] A merged copy with :tokens and :cost guaranteed present.
def self.with_usage_fields(result)
  return result unless result.is_a?(Hash)

  usage = result[:tokens] || result.dig(:response, :tokens) || EMPTY_USAGE
  # fetch-with-block only consults the nested value when :cost is truly absent.
  spend = result.fetch(:cost) { result.dig(:response, :cost) }
  result.merge(tokens: usage, cost: spend)
end
16
34
  end
17
35
  end
@@ -4,39 +4,89 @@ require 'cgi'
4
4
 
5
5
  module SkillBench
6
6
  module Services
7
- # Formats evaluation results as JUnit XML.
7
+ # Formats evaluation results as JUnit XML for CI consumption.
8
+ #
9
+ # Two entry points share the same per-result verdict/score logic:
10
+ # {.format} emits a single-result suite (one <testcase>), while
11
+ # {.format_batch} aggregates many results into one suite so a batch
12
+ # `skill-bench run --all` produces a single JUnit artifact.
8
13
  class JUnitFormatter
9
- # Format result as JUnit XML.
14
+ # classname attribute applied to every emitted <testcase>.
15
+ CLASSNAME = 'SkillBench'
16
+
17
+ # Format a single result as a JUnit XML document.
10
18
  #
11
19
  # Supports both legacy format (result[:pass]) and modern DeltaReport format.
12
20
  #
13
21
  # @param result [Hash] Eval result.
14
22
  # @return [String] JUnit XML-formatted string.
15
23
  def self.format(result)
24
+ suite([result])
25
+ end
26
+
27
+ # Format an aggregate batch envelope as one JUnit XML document.
28
+ #
29
+ # Emits a single <testsuite> with one <testcase> per result, adding a
30
+ # <failure> child for every failing eval.
31
+ #
32
+ # @param aggregate [Hash] Aggregate envelope with a :results array.
33
+ # @return [String] JUnit XML-formatted string.
34
+ def self.format_batch(aggregate)
35
+ suite(aggregate[:results] || [])
36
+ end
37
+
38
+ # Builds a <testsuite> wrapping one <testcase> per result.
39
+ #
40
+ # @param results [Array<Hash>] Per-eval result envelopes.
41
+ # @return [String] JUnit XML-formatted string.
42
# Wraps the given results in one <testsuite> document.
#
# The `tests` attribute is the number of results and `failures` is the
# number of results for which {passing?} is falsy; each result contributes
# one rendered <testcase> fragment, joined by newlines.
#
# @param results [Array<Hash>] Per-eval result envelopes.
# @return [String] JUnit XML-formatted string.
def self.suite(results)
  failed = results.reject { |entry| passing?(entry) }.size
  rendered = results.map { |entry| testcase(entry) }
  <<~XML
    <?xml version="1.0"?>
    <testsuite name="#{CLASSNAME}" tests="#{results.size}" failures="#{failed}">
    #{rendered.join("\n")}
    </testsuite>
  XML
end
52
+ private_class_method :suite
53
+
54
+ # Renders one <testcase> element (indented two spaces) for a result.
55
+ #
56
+ # @param result [Hash] A single-eval result envelope.
57
+ # @return [String] A <testcase> XML fragment.
58
+ def self.testcase(result)
59
+ name = CGI.escapeHTML(result[:eval_name].to_s)
60
+ return %( <testcase name="#{name}" classname="#{CLASSNAME}"/>) if passing?(result)
61
+
62
+ score = CGI.escapeHTML(score_for(result).to_s)
63
+ [
64
+ %( <testcase name="#{name}" classname="#{CLASSNAME}">),
65
+ %( <failure message="Score: #{score}">Eval failed</failure>),
66
+ ' </testcase>'
67
+ ].join("\n")
68
+ end
69
+ private_class_method :testcase
70
+
71
+ # Whether a result passed (DeltaReport verdict or legacy :pass).
72
+ #
73
+ # @param result [Hash] A single-eval result envelope.
74
+ # @return [Boolean] true when the eval passed.
75
# Pass/fail signal for one result.
#
# When response.report responds to #verdict that verdict is returned;
# otherwise the legacy :pass value is returned as-is (which may be nil).
#
# @param result [Hash] A single-eval result envelope.
# @return [Boolean, nil] Truthy when the eval passed.
def self.passing?(result)
  report = result.dig(:response, :report)
  return report.verdict if report.respond_to?(:verdict)

  result[:pass]
end
79
+ private_class_method :passing?
80
+
81
+ # The score reported for a failing eval.
82
+ #
83
+ # @param result [Hash] A single-eval result envelope.
84
+ # @return [Object] DeltaReport context_total or legacy :score.
85
+ def self.score_for(result)
16
86
  report = result.dig(:response, :report)
17
- verdict = report.respond_to?(:verdict) ? report.verdict : result[:pass]
18
- eval_name = CGI.escapeHTML(result[:eval_name].to_s)
19
-
20
- if verdict
21
- <<~XML
22
- <?xml version="1.0"?>
23
- <testsuite name="SkillBench" tests="1" failures="0">
24
- <testcase name="#{eval_name}" classname="SkillBench"/>
25
- </testsuite>
26
- XML
27
- else
28
- score = report.respond_to?(:context_total) ? report.context_total : result[:score]
29
- escaped_score = CGI.escapeHTML(score.to_s)
30
- <<~XML
31
- <?xml version="1.0"?>
32
- <testsuite name="SkillBench" tests="1" failures="1">
33
- <testcase name="#{eval_name}" classname="SkillBench">
34
- <failure message="Score: #{escaped_score}">Eval failed</failure>
35
- </testcase>
36
- </testsuite>
37
- XML
38
- end
87
+ report.respond_to?(:context_total) ? report.context_total : result[:score]
39
88
  end
89
+ private_class_method :score_for
40
90
  end
41
91
  end
42
92
  end
@@ -51,11 +51,14 @@ module SkillBench
51
51
  private
52
52
 
53
53
  def resolve_provider
54
- config = SkillBench::Models::Config.load
54
+ config = SkillBench::Models::Config.loaded
55
55
  provider = config.to_provider
56
56
  return provider if provider
57
57
 
58
- warn 'Config load failed, using mock provider'
58
+ # Explicit `{"provider":"mock"}` is a valid choice, not a load failure,
59
+ # so it falls through to the mock provider without a warning. A missing
60
+ # provider key (genuine misconfiguration) still warns below.
61
+ warn 'Config load failed, using mock provider' unless config.mock?
59
62
  MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
60
63
  rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
61
64
  # Config parsing/validation errors or missing config file - fall back to mock
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'json'
5
+
6
module SkillBench
  module Services
    # Content-addressed, in-memory cache for LLM responses.
    #
    # Caching is opt-in via the {ENV_FLAG} environment variable. Entries live
    # in a process-lifetime Hash keyed by a SHA-256 digest of the canonicalized
    # request, so the same logical request maps to the same entry regardless of
    # hash-key ordering. Every access to the store goes through {MUTEX}.
    class ResponseCache
      # Environment variable that opts caching on when set to a truthy value.
      ENV_FLAG = 'SKILL_BENCH_CACHE'

      # Raw env values (after strip/downcase) treated as "on".
      TRUTHY_VALUES = %w[1 true yes on].freeze

      # Guards every read/write of the shared store so the membership check and
      # the write in {fetch} cannot interleave between threads.
      MUTEX = Mutex.new

      class << self
        # Whether response caching is currently enabled.
        #
        # Enabled when {ENV_FLAG} is set to one of {TRUTHY_VALUES}
        # (case-insensitive, surrounding whitespace ignored); disabled when
        # unset or set to anything else.
        #
        # @return [Boolean] true when caching is on
        def enabled?
          raw = ENV.fetch(ENV_FLAG, '').to_s.strip.downcase
          TRUTHY_VALUES.include?(raw)
        end

        # Computes a stable content-addressed cache key for a request.
        #
        # The inputs are assembled into one structure, canonicalized (hash keys
        # stringified and sorted recursively), serialized as JSON and hashed.
        # Provider configuration is part of the payload so two providers that
        # share a name but target different endpoints do not collide.
        #
        # @param provider [Symbol, String] Resolved provider identifier
        # @param model [String, nil] Model name
        # @param system_prompt [String] System prompt
        # @param messages [Array<Hash>] Conversation messages
        # @param tools [Array<Hash>, nil] Tool definitions, when present
        # @param temperature [Float, nil] Sampling temperature, when present
        # @param provider_config [Hash] Request-affecting provider settings such as
        #   base_url, request_path, endpoint, location, project_id, api_version
        # @return [String] Hex-encoded SHA-256 digest of the canonical request
        def key(provider:, model:, system_prompt:, messages:, tools: nil, temperature: nil, provider_config: {})
          payload = {
            provider: provider.to_s,
            model: model,
            system_prompt: system_prompt,
            messages: messages,
            tools: tools,
            temperature: temperature,
            provider_config: provider_config
          }
          Digest::SHA256.hexdigest(JSON.generate(canonicalize(payload)))
        end

        # Returns the cached value for a key, computing and storing it on a miss.
        #
        # The block runs outside the lock so requests for distinct keys can
        # proceed concurrently. The write is decided by key presence rather
        # than truthiness, so whichever caller stores first wins even when the
        # stored value is `false`. A `nil` result is returned but never stored,
        # which means it is recomputed on the next call.
        #
        # @param key [String] Cache key from {key}
        # @yield Computes the value to cache when the key is absent
        # @yieldreturn [Object] The value to cache
        # @return [Object] The cached value (existing on a hit, freshly stored on a miss)
        def fetch(key)
          hit = MUTEX.synchronize { store[key] }
          return hit unless hit.nil?

          value = yield
          MUTEX.synchronize do
            if store.key?(key)
              # Another caller stored a value while the block ran: keep theirs.
              store[key]
            elsif value.nil?
              nil
            else
              store[key] = value
            end
          end
        end

        # Removes every cached entry.
        #
        # @return [void]
        def clear
          MUTEX.synchronize { store.clear }
        end

        private

        # The process-lifetime backing store, created lazily.
        #
        # @return [Hash{String => Object}] digest => cached response
        def store
          @store ||= {}
        end

        # Recursively rewrites a value into a stable form for serialization.
        #
        # Hashes get their keys stringified and sorted so key ordering does not
        # affect the digest; arrays keep their order with each element
        # canonicalized; any other value is returned as-is.
        #
        # @param value [Object] The value to canonicalize
        # @return [Object] A canonical, order-stable copy of the value
        def canonicalize(value)
          case value
          when Hash
            value
              .sort_by { |entry| entry.first.to_s }
              .each_with_object({}) { |(name, val), acc| acc[name.to_s] = canonicalize(val) }
          when Array
            value.map { |element| canonicalize(element) }
          else
            value
          end
        end
      end
    end
  end
end