ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/README.md +299 -23
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/agent/react_agent.rb +2 -1
  9. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  10. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  11. data/lib/skill_bench/cli/help_printer.rb +10 -2
  12. data/lib/skill_bench/cli/init_command.rb +2 -1
  13. data/lib/skill_bench/cli/result_printer.rb +1 -1
  14. data/lib/skill_bench/cli/run_command.rb +47 -9
  15. data/lib/skill_bench/cli/validate_command.rb +242 -0
  16. data/lib/skill_bench/cli.rb +3 -0
  17. data/lib/skill_bench/client.rb +43 -1
  18. data/lib/skill_bench/clients/all.rb +3 -0
  19. data/lib/skill_bench/clients/base_client.rb +14 -6
  20. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  21. data/lib/skill_bench/clients/provider_config.rb +34 -1
  22. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  23. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  24. data/lib/skill_bench/clients/request_builder.rb +2 -4
  25. data/lib/skill_bench/clients/response_builder.rb +91 -0
  26. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  27. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  28. data/lib/skill_bench/commands/init.rb +5 -0
  29. data/lib/skill_bench/commands/skill_new.rb +3 -1
  30. data/lib/skill_bench/config/applier.rb +2 -0
  31. data/lib/skill_bench/config/defaults.rb +2 -0
  32. data/lib/skill_bench/config/facade_readers.rb +7 -0
  33. data/lib/skill_bench/config/facade_writers.rb +17 -0
  34. data/lib/skill_bench/config/json_loader.rb +1 -1
  35. data/lib/skill_bench/config/store.rb +29 -0
  36. data/lib/skill_bench/config.rb +18 -0
  37. data/lib/skill_bench/constants.rb +58 -0
  38. data/lib/skill_bench/evaluation/runner.rb +20 -3
  39. data/lib/skill_bench/execution/context_hydrator.rb +66 -15
  40. data/lib/skill_bench/execution/sandbox.rb +76 -14
  41. data/lib/skill_bench/judge/judge.rb +4 -0
  42. data/lib/skill_bench/judge/prompt.rb +42 -6
  43. data/lib/skill_bench/models/config.rb +32 -0
  44. data/lib/skill_bench/output_formatter.rb +60 -1
  45. data/lib/skill_bench/package_verifier.rb +1 -1
  46. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  47. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  48. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  49. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  50. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  51. data/lib/skill_bench/services/html_formatter.rb +289 -0
  52. data/lib/skill_bench/services/json_formatter.rb +19 -1
  53. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  54. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  55. data/lib/skill_bench/services/response_cache.rb +130 -0
  56. data/lib/skill_bench/services/runner_service.rb +88 -4
  57. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  58. data/lib/skill_bench/services/template_registry.rb +43 -9
  59. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  60. data/lib/skill_bench/tools/registry.rb +29 -3
  61. data/lib/skill_bench/tools/run_command.rb +172 -35
  62. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  63. data/lib/skill_bench/trend_tracker.rb +5 -5
  64. data/lib/skill_bench/version.rb +1 -1
  65. data/lib/skill_bench.rb +3 -3
  66. metadata +19 -36
@@ -7,6 +7,9 @@ module SkillBench
7
7
  module Services
8
8
  # Spawns and executes LLM agents for evaluation.
9
9
  class AgentSpawnerService
10
+ # Zeroed token usage used when a run produces no usage data (e.g. mock, rescue).
11
+ EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze
12
+
10
13
  # Spawns the LLM agent with the given system prompt.
11
14
  #
12
15
  # @param evaluation [SkillBench::Models::Eval] The eval being run
@@ -33,7 +36,7 @@ module SkillBench
33
36
  #
34
37
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
35
38
  def call
36
- return { result: 'mock result', status: :success, iterations: [] } if @provider.name == 'mock'
39
+ return { result: 'mock result', status: :success, iterations: [], usage: EMPTY_USAGE } if @provider.name == 'mock'
37
40
 
38
41
  client_params = build_client_params
39
42
  max_iterations = @config&.[](:max_iterations) || @config&.[]('max_iterations') || 25
@@ -63,6 +66,7 @@ module SkillBench
63
66
  final_answer = agent_result.dig(:response, :content) || ''
64
67
  diff = Execution::Sandbox.capture_diff(sandbox.path)
65
68
  iterations = agent_result.dig(:response, :iterations) || []
69
+ usage = agent_result.dig(:response, :usage) || EMPTY_USAGE
66
70
 
67
71
  output = [final_answer, diff].reject(&:empty?).join("\n\n")
68
72
 
@@ -70,7 +74,7 @@ module SkillBench
70
74
  result: output,
71
75
  status: status,
72
76
  runtime: @provider.runtime,
73
- usage: {},
77
+ usage: usage,
74
78
  raw_response: agent_result,
75
79
  iterations: iterations
76
80
  }
@@ -80,7 +84,7 @@ module SkillBench
80
84
  result: "Error: #{e.message}",
81
85
  status: :error,
82
86
  runtime: @provider.runtime,
83
- usage: {},
87
+ usage: EMPTY_USAGE,
84
88
  raw_response: { error: e.message, backtrace: e.backtrace },
85
89
  iterations: []
86
90
  }
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'parallel'
5
+ require_relative 'runner_service'
6
+ require_relative '../output_formatter'
7
+ require_relative '../runner'
8
+
9
module SkillBench
  module Services
    # Orchestrates running many evals in a single batch.
    #
    # Discovers every eval under a target directory and runs
    # {RunnerService} over each, returning an aggregate envelope with
    # per-eval results and a pass/fail summary.
    #
    # Discovery reuses {SkillBench::Runner.discover_task_dirs} but never
    # routes through the deprecated {SkillBench::Task::Evaluator}: each eval
    # is executed by the supported {RunnerService}.
    class BatchRunnerService
      # Default directory scanned for evals when none is supplied.
      DEFAULT_EVALS_DIR = 'evals'

      # Default batch-level thread count.
      #
      # Each {RunnerService.call} already runs its baseline and context
      # agents concurrently (#26), so this is kept modest to bound nested
      # thread usage (batch threads x per-eval threads).
      DEFAULT_THREADS = 2

      # Runs every eval discovered under +evals_dir+.
      #
      # @param skill_names [Array<String>] Names of the skills to apply to every eval
      # @param evals_dir [String] Directory to scan for evals
      # @param pack [String, nil] Optional pack name for registry-based skill resolution
      # @param registry_manifest [String, nil] Optional path to registry.json manifest
      # @param threads [Integer, nil] Batch-level thread count; nil falls back to {DEFAULT_THREADS}
      # @return [Hash] Aggregate envelope with :results and :summary
      # @raise [ArgumentError] when no evals are found under +evals_dir+, or when
      #   +threads+ is negative or not convertible to an Integer
      def self.call(skill_names:, evals_dir: DEFAULT_EVALS_DIR, pack: nil, registry_manifest: nil, threads: DEFAULT_THREADS)
        new(
          skill_names: skill_names,
          evals_dir: evals_dir,
          pack: pack,
          registry_manifest: registry_manifest,
          threads: threads
        ).call
      end

      # @param skill_names [Array<String>] Names of the skills
      # @param evals_dir [String] Directory to scan for evals
      # @param pack [String, nil] Optional pack name
      # @param registry_manifest [String, nil] Optional registry.json path
      # @param threads [Integer, nil] Batch-level thread count; nil falls back to {DEFAULT_THREADS}
      # @raise [ArgumentError] when +threads+ is negative or not convertible to an Integer
      def initialize(skill_names:, evals_dir:, pack:, registry_manifest:, threads:)
        @skill_names = skill_names
        @evals_dir = evals_dir
        @pack = pack
        @registry_manifest = registry_manifest
        @threads = normalize_threads(threads)
      end

      # Discovers the target evals and runs each through {RunnerService}.
      #
      # @return [Hash] Aggregate envelope with :results and :summary
      # @raise [ArgumentError] when no evals are found under the directory
      def call
        eval_dirs = discover_eval_dirs
        raise ArgumentError, "No evals found under #{evals_dir}" if eval_dirs.empty?

        results = run_all(eval_dirs)
        { results: results, summary: summarize(results) }
      end

      private

      attr_reader :skill_names, :evals_dir, :pack, :registry_manifest, :threads

      # Coerces the requested thread count into a safe Integer.
      #
      # A nil count (e.g. an unset CLI option forwarded verbatim) must not reach
      # Parallel: as far as the parallel gem's option handling is documented, a
      # falsy :in_threads is treated as "not given" and the gem then picks its
      # own default (process-based) strategy instead of threads.
      # NOTE(review): confirm against the pinned parallel version.
      # Zero is passed through unchanged.
      #
      # @param value [Integer, String, nil] Requested thread count
      # @return [Integer] A non-negative thread count
      # @raise [ArgumentError] when the value is negative or not an integer
      def normalize_threads(value)
        return DEFAULT_THREADS if value.nil?

        count = Integer(value, exception: false)
        if count.nil? || count.negative?
          raise ArgumentError, "threads must be a non-negative integer (got #{value.inspect})"
        end

        count
      end

      # Finds every eval directory under the configured root.
      #
      # @return [Array<Pathname>] Directories that contain a task.md
      def discover_eval_dirs
        SkillBench::Runner.discover_task_dirs(Pathname.new(evals_dir))
      end

      # Runs every eval directory through {RunnerService} concurrently.
      #
      # @param eval_dirs [Array<Pathname>] Discovered eval directories
      # @return [Array<Hash>] Per-eval RunnerService results
      def run_all(eval_dirs)
        Parallel.map(eval_dirs, in_threads: threads) do |eval_dir|
          RunnerService.call(
            eval_name: eval_dir.to_s,
            skill_names: skill_names,
            pack: pack,
            registry_manifest: registry_manifest
          )
        end
      end

      # Tallies pass/fail counts, reusing the single-eval exit-code logic.
      #
      # @param results [Array<Hash>] Per-eval results
      # @return [Hash] Summary with :total, :passed and :failed counts
      def summarize(results)
        passed = results.count { |result| SkillBench::OutputFormatter.exit_code(result).zero? }
        { total: results.size, passed: passed, failed: results.size - passed }
      end
    end
  end
end
@@ -44,6 +44,7 @@ module SkillBench
44
44
  opts.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') { |v| options[:variant_b] = v }
45
45
  opts.on('--eval PATH', 'Path to the eval directory') { |v| options[:eval] = v }
46
46
  opts.on('--format FORMAT', 'Output format (human, json)') { |v| options[:format] = v.to_sym }
47
+ opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
47
48
  opts.on('-h', '--help', 'Prints this help') do
48
49
  puts opts
49
50
  raise SkillBench::HelpRequested
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Estimates the USD cost of an LLM run from token usage and a model name.
    #
    # Prices are approximate, drawn from public OpenAI/Anthropic pricing pages,
    # and expressed in USD per 1,000 tokens. Provider pricing changes over time,
    # so treat the result as a rough estimate and extend {PRICES} as needed.
    class CostCalculator
      # Approximate per-model prices in USD per 1,000 tokens.
      # Keyed by a canonical model prefix; longer prefixes win on lookup so that
      # dated variants (e.g. "claude-sonnet-4-20250514") resolve correctly.
      # Source: public OpenAI and Anthropic pricing pages (approximate).
      #
      # Both the table and every per-model entry are frozen, so callers that
      # receive an entry cannot alter pricing for the rest of the process.
      PRICES = {
        'gpt-4o-mini' => { input: 0.00015, output: 0.0006 },
        'gpt-4o' => { input: 0.005, output: 0.015 },
        'gpt-4-turbo' => { input: 0.01, output: 0.03 },
        'gpt-4' => { input: 0.03, output: 0.06 },
        'gpt-3.5-turbo' => { input: 0.0005, output: 0.0015 },
        'claude-opus-4' => { input: 0.015, output: 0.075 },
        'claude-sonnet-4' => { input: 0.003, output: 0.015 },
        'claude-3-5-sonnet' => { input: 0.003, output: 0.015 },
        'claude-3-5-haiku' => { input: 0.0008, output: 0.004 },
        'claude-3-opus' => { input: 0.015, output: 0.075 },
        'claude-3-sonnet' => { input: 0.003, output: 0.015 },
        'claude-3-haiku' => { input: 0.00025, output: 0.00125 }
      }.each_value(&:freeze).freeze

      # Token count that one priced unit of {PRICES} covers.
      TOKENS_PER_UNIT = 1000.0

      # Estimates the USD cost for a run.
      #
      # @param usage [Hash, nil] Token usage with :prompt_tokens and :completion_tokens.
      # @param model [String, nil] The model name (e.g. "gpt-4o").
      # @return [Float, nil] Estimated cost in USD, or nil when the model is unknown.
      def self.call(usage:, model:)
        new(usage, model).call
      end

      # @param usage [Hash, nil] Token usage hash; nil is treated as empty usage.
      # @param model [String, nil] The model name.
      def initialize(usage, model)
        @usage = usage || {}
        @model = model
      end

      # Estimates the USD cost for the configured usage and model.
      #
      # @return [Float, nil] Estimated cost in USD (rounded to 6 decimals),
      #   or nil when the model is unknown.
      def call
        price = price_for(@model)
        return nil unless price

        input_cost = units(:prompt_tokens) * price[:input]
        output_cost = units(:completion_tokens) * price[:output]
        (input_cost + output_cost).round(6)
      end

      private

      # Finds the price entry for a model by longest matching name prefix.
      #
      # The lookup is case-insensitive. An exact key match short-circuits the
      # prefix scan.
      #
      # @param model [String, nil] The model name.
      # @return [Hash, nil] Price entry with :input and :output, or nil when unknown.
      def price_for(model)
        key = model.to_s.downcase
        return PRICES[key] if PRICES.key?(key)

        # Longest prefix wins so "gpt-4o-mini-…" is not priced as "gpt-4o" or "gpt-4".
        PRICES.select { |name, _| key.start_with?(name) }.max_by { |name, _| name.length }&.last
      end

      # Converts a usage token count into priced 1K-token units.
      #
      # @param key [Symbol] The usage key to read.
      # @return [Float] The number of priced units.
      def units(key)
        token_count(key) / TOKENS_PER_UNIT
      end

      # Reads a token count from the usage hash, tolerating string keys.
      #
      # @param key [Symbol] The usage key (e.g. :prompt_tokens).
      # @return [Integer] The token count, or zero when absent.
      def token_count(key)
        (@usage[key] || @usage[key.to_s] || 0).to_i
      end
    end
  end
end
@@ -0,0 +1,289 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+ require_relative 'formatting_helpers'
5
+ require_relative '../delta_report'
6
+
7
module SkillBench
  module Services
    # Renders an evaluation result as a single, self-contained HTML page.
    #
    # The stylesheet is inlined (no external assets) and every dynamic value
    # is passed through {CGI.escapeHTML} before being embedded, to prevent HTML
    # injection. Results carrying a {SkillBench::DeltaReport} get a scoring
    # table plus an iteration timeline; anything else is rendered with the
    # legacy status/score layout.
    class HtmlFormatter
      extend FormattingHelpers

      # Inline stylesheet embedded in every generated document.
      STYLE = <<~CSS
        body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; color: #1a1a1a; background: #fafafa; }
        main { max-width: 960px; margin: 0 auto; }
        header { border-bottom: 2px solid #ddd; padding-bottom: 1rem; margin-bottom: 1.5rem; }
        h1 { margin: 0 0 0.5rem; font-size: 1.6rem; }
        dl.meta { display: grid; grid-template-columns: max-content 1fr; gap: 0.2rem 1rem; margin: 0.5rem 0; }
        dl.meta dt { font-weight: 600; color: #555; }
        dl.meta dd { margin: 0; }
        p.usage { color: #555; font-variant-numeric: tabular-nums; }
        table { border-collapse: collapse; width: 100%; margin: 1rem 0; }
        th, td { padding: 0.4rem 0.75rem; text-align: right; border-bottom: 1px solid #e2e2e2; }
        th:first-child, td:first-child { text-align: left; }
        tr.total td { font-weight: 700; border-top: 2px solid #bbb; }
        p.verdict { font-weight: 700; padding: 0.5rem 0.75rem; border-radius: 4px; display: inline-block; }
        p.verdict.pass { background: #e6f4ea; color: #1e7e34; }
        p.verdict.fail { background: #fde8e8; color: #c0392b; }
        p.error { color: #c0392b; }
        section.iterations h3 { margin-bottom: 0.25rem; }
        ol { margin: 0.25rem 0 1rem; }
        li { margin: 0.2rem 0; }
        span.tools, span.observation { color: #555; }
      CSS

      class << self
        # Formats an eval result as a full HTML document.
        #
        # @param result [Hash] Eval result envelope (DeltaReport or legacy shape).
        # @return [String] A complete HTML document string.
        def format(result)
          report = result.dig(:response, :report)
          body =
            if report.is_a?(SkillBench::DeltaReport)
              delta_body(result, report)
            else
              legacy_section(result)
            end
          build_document(result, body)
        end

        private

        # Renders the DeltaReport body: scoring table followed by the timeline.
        #
        # @param result [Hash] Eval result envelope.
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] HTML for the report and iteration sections.
        def delta_body(result, report)
          [report_section(report), iterations_section(result)].join("\n")
        end

        # Wraps pre-rendered body HTML in the styled document shell.
        #
        # @param result [Hash] Eval result envelope (used for the header/title).
        # @param body [String] Pre-rendered body HTML.
        # @return [String] A complete HTML document string.
        def build_document(result, body)
          title = escape(result[:eval_name] || 'Report')
          header = header_html(result)
          <<~HTML
            <!DOCTYPE html>
            <html lang="en">
            <head>
            <meta charset="utf-8">
            <title>SkillBench Report — #{title}</title>
            <style>#{STYLE}</style>
            </head>
            <body>
            <main>
            #{header}
            #{body}
            </main>
            </body>
            </html>
          HTML
        end

        # Renders the page header: eval, skill and provider names plus usage.
        #
        # @param result [Hash] Eval result envelope.
        # @return [String] HTML for the document header.
        def header_html(result)
          eval_name = escape(result[:eval_name])
          skill_name = escape(result[:skill_name])
          provider_name = escape(result[:provider_name])
          usage = usage_line(result)
          <<~HTML.chomp
            <header>
            <h1>SkillBench Report</h1>
            <dl class="meta">
            <dt>Eval</dt><dd>#{eval_name}</dd>
            <dt>Skill</dt><dd>#{skill_name}</dd>
            <dt>Provider</dt><dd>#{provider_name}</dd>
            </dl>
            <p class="usage">#{usage}</p>
            </header>
          HTML
        end

        # Renders the token/cost summary shown in the header.
        #
        # @param result [Hash] Eval result envelope; reads :tokens and :cost.
        # @return [String] An escaped "Tokens / Est. Cost" line.
        def usage_line(result)
          tokens = result[:tokens] || {}
          total = tokens[:total_tokens] || tokens['total_tokens'] || 0
          cost = result[:cost]
          cost_label =
            if cost
              # Kernel.format is called explicitly because this class defines
              # its own .format.
              Kernel.format('$%.4f', cost)
            else
              '—'
            end
          "Tokens: #{escape(total)} | Est. Cost: #{escape(cost_label)}"
        end

        # Renders the scoring table and verdict for a DeltaReport.
        #
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] HTML for the report section.
        def report_section(report)
          rows = dimension_rows(report)
          totals = total_row(report)
          verdict = verdict_html(report)
          <<~HTML.chomp
            <section class="report">
            <h2>Delta Report</h2>
            <table>
            <thead><tr><th>Dimension</th><th>Baseline</th><th>Context</th><th>Delta</th></tr></thead>
            <tbody>
            #{rows}
            #{totals}
            </tbody>
            </table>
            #{verdict}
            </section>
          HTML
        end

        # Renders one table row per scored dimension.
        #
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] HTML table rows joined by newlines.
        def dimension_rows(report)
          rows = report.deltas.map do |dimension, change|
            dimension_row(dimension, change, report)
          end
          rows.join("\n")
        end

        # Renders a single dimension row; the label carries the dimension's
        # maximum score when the criteria define that dimension.
        #
        # @param name [String] Dimension name.
        # @param delta [Numeric] Context-minus-baseline delta.
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] An HTML table row.
        def dimension_row(name, delta, report)
          match = report.criteria.dimensions.find { |candidate| candidate.name == name }
          # humanize/delta_str are presumably supplied by FormattingHelpers.
          label = humanize(name)
          label = "#{label} (#{match.max_score})" if match
          row_cells(
            'dimension',
            label,
            report.baseline_scores[name],
            report.context_scores[name],
            delta_str(delta)
          )
        end

        # Renders the totals row.
        #
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] An HTML table row for the totals.
        def total_row(report)
          overall_delta = report.deltas.values.sum
          baseline_cell = "#{report.baseline_total}/100"
          context_cell = "#{report.context_total}/100"
          row_cells('total', 'Total', baseline_cell, context_cell, delta_str(overall_delta))
        end

        # Renders a table row, escaping every cell value.
        #
        # @param css_class [String] CSS class for the row.
        # @param label [String] First-column label.
        # @param baseline [Object] Baseline score cell.
        # @param context [Object] Context score cell.
        # @param delta [String] Delta cell.
        # @return [String] An HTML table row.
        def row_cells(css_class, label, baseline, context, delta)
          cells = [label, baseline, context, delta].map { |value| "<td>#{escape(value)}</td>" }
          "<tr class=\"#{css_class}\">#{cells.join}</tr>"
        end

        # Renders the verdict paragraph.
        #
        # @param report [SkillBench::DeltaReport] The delta report.
        # @return [String] An HTML verdict paragraph.
        def verdict_html(report)
          status, css = report.verdict ? %w[PASS pass] : %w[FAIL fail]
          criteria = report.criteria
          threshold = escape(criteria.pass_threshold)
          minimum_delta = escape(criteria.minimum_delta)
          %(<p class="verdict #{css}">Verdict: #{status} (threshold: #{threshold}, minimum delta: #{minimum_delta})</p>)
        end

        # Renders the baseline/context iteration timeline section.
        #
        # @param result [Hash] Eval result envelope.
        # @return [String] HTML for the iterations section, or empty string
        #   when neither run recorded any iterations.
        def iterations_section(result)
          timelines = {
            'Baseline Iterations' => result.dig(:response, :baseline_iterations) || [],
            'Context Iterations' => result.dig(:response, :context_iterations) || []
          }.reject { |_title, steps| steps.empty? }
          return '' if timelines.empty?

          blocks = timelines.map { |title, steps| iteration_block(title, steps) }
          %(<section class="iterations">\n<h2>Iteration Timeline</h2>\n#{blocks.join("\n")}\n</section>)
        end

        # Renders one named iteration timeline block.
        #
        # @param title [String] Section title.
        # @param iterations [Array<Hash>] Iteration metadata entries.
        # @return [String] HTML for the timeline block.
        def iteration_block(title, iterations)
          items = iterations.map { |iteration| iteration_item(iteration) }
          %(<div class="timeline"><h3>#{escape(title)}</h3><ol>\n#{items.join("\n")}\n</ol></div>)
        end

        # Renders one list item for a single iteration step.
        #
        # @param iteration [Hash] Iteration metadata with :step_number, :thought,
        #   :tools_used, and :observation_summary keys.
        # @return [String] An HTML list item.
        def iteration_item(iteration)
          step = "Step #{escape(iteration[:step_number])}: #{escape(iteration[:thought])}"
          parts = [%(<span class="thought">#{step}</span>)]

          tools = iteration[:tools_used] || []
          parts << %( <span class="tools">Tools: #{escape(tools.join(', '))}</span>) unless tools.empty?

          observation = iteration[:observation_summary].to_s
          parts << %( <span class="observation">Observation: #{escape(observation)}</span>) unless observation.empty?

          "<li>#{parts.join}</li>"
        end

        # Renders the body for a legacy (non-DeltaReport) result.
        #
        # @param result [Hash] Legacy eval result envelope.
        # @return [String] HTML for the legacy status section.
        def legacy_section(result)
          status, css = result[:pass] ? %w[PASSED pass] : %w[FAILED fail]
          score = result[:score]&.round(2)
          score_text = escape(score || 'N/A')
          error_html = legacy_error(result)
          <<~HTML.chomp
            <section class="report legacy">
            <h2>Result</h2>
            <p class="verdict #{css}">Status: #{status}</p>
            <p class="score">Score: #{score_text}</p>
            #{error_html}
            </section>
          HTML
        end

        # Renders the optional error paragraph for a legacy result.
        #
        # @param result [Hash] Legacy eval result envelope.
        # @return [String] An HTML error paragraph, or empty string.
        def legacy_error(result)
          message = result.dig(:response, :error, :message)
          return '' unless message

          %(<p class="error">Error: #{escape(message)}</p>)
        end

        # Escapes any value for safe HTML embedding.
        #
        # @param value [Object] The value to escape (coerced via #to_s).
        # @return [String] HTML-escaped text.
        def escape(value)
          CGI.escapeHTML(value.to_s)
        end
      end
    end
  end
end
@@ -6,12 +6,30 @@ module SkillBench
6
6
  module Services
7
7
  # Formats evaluation results as JSON.
8
8
  class JsonFormatter
9
+ # Zeroed token usage used when a result carries no usage data.
10
+ EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze
11
+
9
12
  # Format result as JSON.
10
13
  #
14
+ # Ensures top-level :tokens and :cost fields are always present (additive;
15
+ # existing keys are preserved) so JSON consumers see a stable shape.
16
+ #
11
17
  # @param result [Hash] Eval result.
12
18
  # @return [String] JSON-formatted string.
13
19
  def self.format(result)
14
- JSON.pretty_generate(result)
20
+ JSON.pretty_generate(with_usage_fields(result))
21
+ end
22
+
23
+ # Returns the result augmented with token/cost fields when missing.
24
+ #
25
+ # @param result [Hash] Eval result (returned unchanged when not a Hash).
26
+ # @return [Hash] Result with :tokens and :cost guaranteed present.
27
+ def self.with_usage_fields(result)
28
+ return result unless result.is_a?(Hash)
29
+
30
+ tokens = result[:tokens] || result.dig(:response, :tokens) || EMPTY_USAGE
31
+ cost = result.key?(:cost) ? result[:cost] : result.dig(:response, :cost)
32
+ result.merge(tokens: tokens, cost: cost)
15
33
  end
16
34
  end
17
35
  end