ruby-skill-bench 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +299 -23
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +3 -0
- data/lib/skill_bench/clients/base_client.rb +14 -6
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +66 -15
- data/lib/skill_bench/execution/sandbox.rb +76 -14
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +172 -35
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +3 -3
- metadata +19 -36
|
@@ -7,6 +7,9 @@ module SkillBench
|
|
|
7
7
|
module Services
|
|
8
8
|
# Spawns and executes LLM agents for evaluation.
|
|
9
9
|
class AgentSpawnerService
|
|
10
|
+
# Zeroed token usage used when a run produces no usage data (e.g. mock, rescue).
|
|
11
|
+
EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze
|
|
12
|
+
|
|
10
13
|
# Spawns the LLM agent with the given system prompt.
|
|
11
14
|
#
|
|
12
15
|
# @param evaluation [SkillBench::Models::Eval] The eval being run
|
|
@@ -33,7 +36,7 @@ module SkillBench
|
|
|
33
36
|
#
|
|
34
37
|
# @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
|
|
35
38
|
def call
|
|
36
|
-
return { result: 'mock result', status: :success, iterations: [] } if @provider.name == 'mock'
|
|
39
|
+
return { result: 'mock result', status: :success, iterations: [], usage: EMPTY_USAGE } if @provider.name == 'mock'
|
|
37
40
|
|
|
38
41
|
client_params = build_client_params
|
|
39
42
|
max_iterations = @config&.[](:max_iterations) || @config&.[]('max_iterations') || 25
|
|
@@ -63,6 +66,7 @@ module SkillBench
|
|
|
63
66
|
final_answer = agent_result.dig(:response, :content) || ''
|
|
64
67
|
diff = Execution::Sandbox.capture_diff(sandbox.path)
|
|
65
68
|
iterations = agent_result.dig(:response, :iterations) || []
|
|
69
|
+
usage = agent_result.dig(:response, :usage) || EMPTY_USAGE
|
|
66
70
|
|
|
67
71
|
output = [final_answer, diff].reject(&:empty?).join("\n\n")
|
|
68
72
|
|
|
@@ -70,7 +74,7 @@ module SkillBench
|
|
|
70
74
|
result: output,
|
|
71
75
|
status: status,
|
|
72
76
|
runtime: @provider.runtime,
|
|
73
|
-
usage:
|
|
77
|
+
usage: usage,
|
|
74
78
|
raw_response: agent_result,
|
|
75
79
|
iterations: iterations
|
|
76
80
|
}
|
|
@@ -80,7 +84,7 @@ module SkillBench
|
|
|
80
84
|
result: "Error: #{e.message}",
|
|
81
85
|
status: :error,
|
|
82
86
|
runtime: @provider.runtime,
|
|
83
|
-
usage:
|
|
87
|
+
usage: EMPTY_USAGE,
|
|
84
88
|
raw_response: { error: e.message, backtrace: e.backtrace },
|
|
85
89
|
iterations: []
|
|
86
90
|
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require 'parallel'
|
|
5
|
+
require_relative 'runner_service'
|
|
6
|
+
require_relative '../output_formatter'
|
|
7
|
+
require_relative '../runner'
|
|
8
|
+
|
|
9
|
+
module SkillBench
|
|
10
|
+
module Services
|
|
11
|
+
# Orchestrates running many evals in a single batch.
|
|
12
|
+
#
|
|
13
|
+
# Discovers every eval under a target directory and runs
|
|
14
|
+
# {RunnerService} over each, returning an aggregate envelope with
|
|
15
|
+
# per-eval results and a pass/fail summary.
|
|
16
|
+
#
|
|
17
|
+
# Discovery reuses {SkillBench::Runner.discover_task_dirs} but never
|
|
18
|
+
# routes through the deprecated {SkillBench::Task::Evaluator}: each eval
|
|
19
|
+
# is executed by the supported {RunnerService}.
|
|
20
|
+
class BatchRunnerService
  # Directory scanned for evals when the caller does not name one.
  DEFAULT_EVALS_DIR = 'evals'

  # Batch-level thread count used when the caller does not pass one.
  #
  # Kept small on purpose: per the original design note, each
  # {RunnerService.call} already runs its baseline and context agents
  # concurrently (#26), so total threads are batch threads x per-eval threads.
  DEFAULT_THREADS = 2

  # Convenience entry point: builds a service instance and runs it.
  #
  # @param skill_names [Array<String>] Names of the skills to apply to every eval
  # @param evals_dir [String] Directory to scan for evals
  # @param pack [String, nil] Optional pack name for registry-based skill resolution
  # @param registry_manifest [String, nil] Optional path to registry.json manifest
  # @param threads [Integer] Batch-level thread count
  # @return [Hash] Aggregate envelope with :results and :summary
  # @raise [ArgumentError] when no evals are found under +evals_dir+
  def self.call(skill_names:, evals_dir: DEFAULT_EVALS_DIR, pack: nil, registry_manifest: nil, threads: DEFAULT_THREADS)
    service = new(
      skill_names: skill_names,
      evals_dir: evals_dir,
      pack: pack,
      registry_manifest: registry_manifest,
      threads: threads
    )
    service.call
  end

  # Stores the batch configuration; no work happens until {#call}.
  #
  # @param skill_names [Array<String>] Names of the skills
  # @param evals_dir [String] Directory to scan for evals
  # @param pack [String, nil] Optional pack name
  # @param registry_manifest [String, nil] Optional registry.json path
  # @param threads [Integer] Batch-level thread count
  def initialize(skill_names:, evals_dir:, pack:, registry_manifest:, threads:)
    @threads = threads
    @registry_manifest = registry_manifest
    @pack = pack
    @evals_dir = evals_dir
    @skill_names = skill_names
  end

  # Discovers the evals, runs each one and wraps the outcomes with a summary.
  #
  # @return [Hash] Aggregate envelope with :results and :summary
  # @raise [ArgumentError] when no evals are found under the directory
  def call
    targets = discover_eval_dirs
    raise ArgumentError, "No evals found under #{evals_dir}" if targets.empty?

    outcomes = run_all(targets)
    { results: outcomes, summary: summarize(outcomes) }
  end

  private

  attr_reader :skill_names, :evals_dir, :pack, :registry_manifest, :threads

  # Asks {SkillBench::Runner.discover_task_dirs} for the eval directories
  # under the configured root.
  #
  # @return [Array<Pathname>] Directories that contain a task.md
  def discover_eval_dirs
    root = Pathname.new(evals_dir)
    SkillBench::Runner.discover_task_dirs(root)
  end

  # Maps every eval directory to its {RunnerService} result using a
  # thread pool of +threads+ workers.
  #
  # @param eval_dirs [Array<Pathname>] Discovered eval directories
  # @return [Array<Hash>] Per-eval RunnerService results
  def run_all(eval_dirs)
    Parallel.map(eval_dirs, in_threads: threads) { |dir| run_one(dir) }
  end

  # Runs a single eval directory through {RunnerService}.
  #
  # @param dir [Pathname] One eval directory
  # @return [Hash] The RunnerService result for that eval
  def run_one(dir)
    RunnerService.call(
      eval_name: dir.to_s,
      skill_names: skill_names,
      pack: pack,
      registry_manifest: registry_manifest
    )
  end

  # Counts passes and failures; an eval passes when the single-eval
  # exit-code logic in {SkillBench::OutputFormatter} yields zero.
  #
  # @param results [Array<Hash>] Per-eval results
  # @return [Hash] Summary with :total, :passed and :failed counts
  def summarize(results)
    succeeded, failed = results.partition do |outcome|
      SkillBench::OutputFormatter.exit_code(outcome).zero?
    end
    { total: results.size, passed: succeeded.size, failed: failed.size }
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
@@ -44,6 +44,7 @@ module SkillBench
|
|
|
44
44
|
opts.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') { |v| options[:variant_b] = v }
|
|
45
45
|
opts.on('--eval PATH', 'Path to the eval directory') { |v| options[:eval] = v }
|
|
46
46
|
opts.on('--format FORMAT', 'Output format (human, json)') { |v| options[:format] = v.to_sym }
|
|
47
|
+
opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
|
|
47
48
|
opts.on('-h', '--help', 'Prints this help') do
|
|
48
49
|
puts opts
|
|
49
50
|
raise SkillBench::HelpRequested
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Estimates the USD cost of an LLM run from token usage and a model name.
|
|
6
|
+
#
|
|
7
|
+
# Prices are approximate, drawn from public OpenAI/Anthropic pricing pages,
|
|
8
|
+
# and expressed in USD per 1,000 tokens. Provider pricing changes over time,
|
|
9
|
+
# so treat the result as a rough estimate and extend {PRICES} as needed.
|
|
10
|
+
class CostCalculator
  # Approximate per-model prices in USD per 1,000 tokens.
  # Keyed by a canonical model name. Lookup tries an exact match first, then
  # the longest key that prefixes the model name at a variant boundary, so
  # dated variants (e.g. "claude-sonnet-4-20250514") resolve correctly while
  # different families that merely share leading characters (e.g. "gpt-4.1"
  # vs. "gpt-4") are reported as unknown instead of being mispriced.
  # Source: public OpenAI and Anthropic pricing pages (approximate).
  PRICES = {
    'gpt-4o-mini' => { input: 0.00015, output: 0.0006 },
    'gpt-4o' => { input: 0.005, output: 0.015 },
    'gpt-4-turbo' => { input: 0.01, output: 0.03 },
    'gpt-4' => { input: 0.03, output: 0.06 },
    'gpt-3.5-turbo' => { input: 0.0005, output: 0.0015 },
    'claude-opus-4' => { input: 0.015, output: 0.075 },
    'claude-sonnet-4' => { input: 0.003, output: 0.015 },
    'claude-3-5-sonnet' => { input: 0.003, output: 0.015 },
    'claude-3-5-haiku' => { input: 0.0008, output: 0.004 },
    'claude-3-opus' => { input: 0.015, output: 0.075 },
    'claude-3-sonnet' => { input: 0.003, output: 0.015 },
    'claude-3-haiku' => { input: 0.00025, output: 0.00125 }
  }.freeze

  # Token count that one priced unit of {PRICES} covers.
  TOKENS_PER_UNIT = 1000.0

  # Characters that may directly follow a {PRICES} key in a variant name
  # (date/version suffixes such as "-20250514", ":latest" or "@20240620").
  VARIANT_SEPARATORS = ['-', ':', '@'].freeze

  # Estimates the USD cost for a run.
  #
  # @param usage [Hash, nil] Token usage with :prompt_tokens and :completion_tokens.
  # @param model [String, nil] The model name (e.g. "gpt-4o").
  # @return [Float, nil] Estimated cost in USD, or nil when the model is unknown.
  def self.call(usage:, model:)
    new(usage, model).call
  end

  # @param usage [Hash, nil] Token usage hash; nil is treated as empty.
  # @param model [String, nil] The model name.
  def initialize(usage, model)
    @usage = usage || {}
    @model = model
  end

  # Estimates the USD cost for the configured usage and model.
  #
  # @return [Float, nil] Estimated cost in USD rounded to 6 decimals,
  #   or nil when the model is unknown.
  def call
    price = price_for(@model)
    return nil unless price

    input_cost = units(:prompt_tokens) * price[:input]
    output_cost = units(:completion_tokens) * price[:output]
    (input_cost + output_cost).round(6)
  end

  private

  # Finds the price entry for a model: exact (case-insensitive) match first,
  # otherwise the longest {PRICES} key the model name is a variant of.
  #
  # @param model [String, nil] The model name.
  # @return [Hash, nil] Price entry with :input and :output, or nil when unknown.
  def price_for(model)
    key = model.to_s.downcase
    return PRICES[key] if PRICES.key?(key)

    family = PRICES.keys.select { |name| variant_of?(key, name) }.max_by(&:length)
    family && PRICES[family]
  end

  # Tells whether +key+ is +name+ followed by a variant suffix.
  #
  # A bare prefix match is not enough: "gpt-4.1" starts with "gpt-4" but is a
  # different model, so the character after the prefix must be a separator.
  #
  # @param key [String] Downcased model name being looked up.
  # @param name [String] Candidate {PRICES} key.
  # @return [Boolean] true when +key+ is a suffixed variant of +name+.
  def variant_of?(key, name)
    key.start_with?(name) && key.delete_prefix(name).start_with?(*VARIANT_SEPARATORS)
  end

  # Converts a usage token count into priced 1K-token units.
  #
  # @param key [Symbol] The usage key to read.
  # @return [Float] The number of priced units.
  def units(key)
    token_count(key) / TOKENS_PER_UNIT
  end

  # Reads a token count from the usage hash, tolerating string keys.
  #
  # @param key [Symbol] The usage key (e.g. :prompt_tokens).
  # @return [Integer] The token count, or zero when absent.
  def token_count(key)
    (@usage[key] || @usage[key.to_s] || 0).to_i
  end
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
require_relative 'formatting_helpers'
|
|
5
|
+
require_relative '../delta_report'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Services
|
|
9
|
+
# Formats evaluation results as a complete, self-contained HTML document.
|
|
10
|
+
#
|
|
11
|
+
# The output embeds all styling inline (no external assets) and escapes every
|
|
12
|
+
# dynamic, user-derived value with {CGI.escapeHTML} to prevent HTML injection.
|
|
13
|
+
# Both the modern DeltaReport shape and the legacy result shape are supported.
|
|
14
|
+
class HtmlFormatter
  extend FormattingHelpers

  # Inline stylesheet embedded in every generated document.
  STYLE = <<~CSS
    body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; color: #1a1a1a; background: #fafafa; }
    main { max-width: 960px; margin: 0 auto; }
    header { border-bottom: 2px solid #ddd; padding-bottom: 1rem; margin-bottom: 1.5rem; }
    h1 { margin: 0 0 0.5rem; font-size: 1.6rem; }
    dl.meta { display: grid; grid-template-columns: max-content 1fr; gap: 0.2rem 1rem; margin: 0.5rem 0; }
    dl.meta dt { font-weight: 600; color: #555; }
    dl.meta dd { margin: 0; }
    p.usage { color: #555; font-variant-numeric: tabular-nums; }
    table { border-collapse: collapse; width: 100%; margin: 1rem 0; }
    th, td { padding: 0.4rem 0.75rem; text-align: right; border-bottom: 1px solid #e2e2e2; }
    th:first-child, td:first-child { text-align: left; }
    tr.total td { font-weight: 700; border-top: 2px solid #bbb; }
    p.verdict { font-weight: 700; padding: 0.5rem 0.75rem; border-radius: 4px; display: inline-block; }
    p.verdict.pass { background: #e6f4ea; color: #1e7e34; }
    p.verdict.fail { background: #fde8e8; color: #c0392b; }
    p.error { color: #c0392b; }
    section.iterations h3 { margin-bottom: 0.25rem; }
    ol { margin: 0.25rem 0 1rem; }
    li { margin: 0.2rem 0; }
    span.tools, span.observation { color: #555; }
  CSS

  # Format an eval result as a full HTML document.
  #
  # A result whose response carries a {SkillBench::DeltaReport} under :report
  # gets the delta table and iteration timeline; anything else is rendered
  # with the legacy status section.
  #
  # @param result [Hash] Eval result envelope (DeltaReport or legacy shape).
  # @return [String] A complete HTML document string.
  def self.format(result)
    report = response_hash(result)[:report]
    body = report.is_a?(SkillBench::DeltaReport) ? delta_body(result, report) : legacy_section(result)
    build_document(result, body)
  end

  # Returns the :response entry of a result when it is a Hash.
  #
  # Nested lookups go through this helper because Hash#dig raises TypeError
  # when an intermediate value (e.g. a String response) cannot be dug into;
  # a report should still render in that case.
  #
  # @param result [Hash] Eval result envelope.
  # @return [Hash] The response hash, or an empty hash when absent/not a Hash.
  def self.response_hash(result)
    response = result[:response]
    response.is_a?(Hash) ? response : {}
  end
  private_class_method :response_hash

  # Builds the body for a DeltaReport result (table plus iteration timeline).
  #
  # @param result [Hash] Eval result envelope.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML for the report and iteration sections.
  def self.delta_body(result, report)
    "#{report_section(report)}\n#{iterations_section(result)}"
  end
  private_class_method :delta_body

  # Wraps body HTML in a complete, styled HTML document.
  #
  # @param result [Hash] Eval result envelope (used for the header/title).
  # @param body [String] Pre-rendered body HTML (inserted as-is, not escaped).
  # @return [String] A complete HTML document string.
  def self.build_document(result, body)
    title = escape(result[:eval_name] || 'Report')
    <<~HTML
      <!DOCTYPE html>
      <html lang="en">
      <head>
      <meta charset="utf-8">
      <title>SkillBench Report — #{title}</title>
      <style>#{STYLE}</style>
      </head>
      <body>
      <main>
      #{header_html(result)}
      #{body}
      </main>
      </body>
      </html>
    HTML
  end
  private_class_method :build_document

  # Builds the header with eval/skill/provider names and the usage line.
  #
  # @param result [Hash] Eval result envelope.
  # @return [String] HTML for the document header.
  def self.header_html(result)
    <<~HTML.chomp
      <header>
      <h1>SkillBench Report</h1>
      <dl class="meta">
      <dt>Eval</dt><dd>#{escape(result[:eval_name])}</dd>
      <dt>Skill</dt><dd>#{escape(result[:skill_name])}</dd>
      <dt>Provider</dt><dd>#{escape(result[:provider_name])}</dd>
      </dl>
      <p class="usage">#{usage_line(result)}</p>
      </header>
    HTML
  end
  private_class_method :header_html

  # Builds the token/cost summary line for the header.
  #
  # @param result [Hash] Eval result envelope; reads :tokens and :cost.
  # @return [String] An escaped "Tokens / Est. Cost" line.
  def self.usage_line(result)
    tokens = result[:tokens] || {}
    total = tokens[:total_tokens] || tokens['total_tokens'] || 0
    cost = result[:cost]
    # Kernel.format is spelled out because this class defines its own .format.
    cost_label = cost ? Kernel.format('$%.4f', cost) : '—'
    "Tokens: #{escape(total)} | Est. Cost: #{escape(cost_label)}"
  end
  private_class_method :usage_line

  # Builds the scoring table and verdict for a DeltaReport.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML for the report section.
  def self.report_section(report)
    <<~HTML.chomp
      <section class="report">
      <h2>Delta Report</h2>
      <table>
      <thead><tr><th>Dimension</th><th>Baseline</th><th>Context</th><th>Delta</th></tr></thead>
      <tbody>
      #{dimension_rows(report)}
      #{total_row(report)}
      </tbody>
      </table>
      #{verdict_html(report)}
      </section>
    HTML
  end
  private_class_method :report_section

  # Builds one table row per scored dimension.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] HTML table rows joined by newlines.
  def self.dimension_rows(report)
    report.deltas.map { |name, delta| dimension_row(name, delta, report) }.join("\n")
  end
  private_class_method :dimension_rows

  # Builds a single dimension table row.
  #
  # The label includes the dimension's max score when the dimension is found
  # in the report's criteria.
  #
  # @param name [String] Dimension name.
  # @param delta [Numeric] Context-minus-baseline delta.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML table row.
  def self.dimension_row(name, delta, report)
    dim = report.criteria.dimensions.find { |candidate| candidate.name == name }
    humanized = humanize(name)
    label = dim ? "#{humanized} (#{dim.max_score})" : humanized
    baseline = report.baseline_scores[name]
    context = report.context_scores[name]
    row_cells('dimension', label, baseline, context, delta_str(delta))
  end
  private_class_method :dimension_row

  # Builds the totals table row.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML table row for the totals.
  def self.total_row(report)
    total_delta = report.deltas.values.sum
    row_cells('total', 'Total', "#{report.baseline_total}/100",
              "#{report.context_total}/100", delta_str(total_delta))
  end
  private_class_method :total_row

  # Builds an HTML table row, escaping every cell value.
  #
  # @param css_class [String] CSS class for the row (a fixed literal at call sites).
  # @param label [String] First-column label.
  # @param baseline [Object] Baseline score cell.
  # @param context [Object] Context score cell.
  # @param delta [String] Delta cell.
  # @return [String] An HTML table row.
  def self.row_cells(css_class, label, baseline, context, delta)
    "<tr class=\"#{css_class}\"><td>#{escape(label)}</td><td>#{escape(baseline)}</td>" \
      "<td>#{escape(context)}</td><td>#{escape(delta)}</td></tr>"
  end
  private_class_method :row_cells

  # Builds the verdict paragraph.
  #
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] An HTML verdict paragraph.
  def self.verdict_html(report)
    verdict = report.verdict
    criteria = report.criteria
    status = verdict ? 'PASS' : 'FAIL'
    css = verdict ? 'pass' : 'fail'
    threshold = escape(criteria.pass_threshold)
    minimum_delta = escape(criteria.minimum_delta)
    %(<p class="verdict #{css}">Verdict: #{status} (threshold: #{threshold}, minimum delta: #{minimum_delta})</p>)
  end
  private_class_method :verdict_html

  # Builds the baseline/context iteration timeline section.
  #
  # @param result [Hash] Eval result envelope.
  # @return [String] HTML for the iterations section, or empty string when
  #   neither run recorded any iterations.
  def self.iterations_section(result)
    response = response_hash(result)
    baseline = response[:baseline_iterations] || []
    context = response[:context_iterations] || []
    baseline_empty = baseline.empty?
    context_empty = context.empty?
    return '' if baseline_empty && context_empty

    blocks = []
    blocks << iteration_block('Baseline Iterations', baseline) unless baseline_empty
    blocks << iteration_block('Context Iterations', context) unless context_empty
    %(<section class="iterations">\n<h2>Iteration Timeline</h2>\n#{blocks.join("\n")}\n</section>)
  end
  private_class_method :iterations_section

  # Builds one named iteration timeline block.
  #
  # @param title [String] Section title.
  # @param iterations [Array<Hash>] Iteration metadata entries.
  # @return [String] HTML for the timeline block.
  def self.iteration_block(title, iterations)
    items = iterations.map { |iteration| iteration_item(iteration) }.join("\n")
    %(<div class="timeline"><h3>#{escape(title)}</h3><ol>\n#{items}\n</ol></div>)
  end
  private_class_method :iteration_block

  # Builds one list item for a single iteration step.
  #
  # @param iteration [Hash] Iteration metadata with :step_number, :thought,
  #   :tools_used, and :observation_summary keys.
  # @return [String] An HTML list item.
  def self.iteration_item(iteration)
    tools = iteration[:tools_used] || []
    tools_html = tools.empty? ? '' : %( <span class="tools">Tools: #{escape(tools.join(', '))}</span>)
    observation = iteration[:observation_summary].to_s
    observation_html = observation.empty? ? '' : %( <span class="observation">Observation: #{escape(observation)}</span>)
    step = "Step #{escape(iteration[:step_number])}: #{escape(iteration[:thought])}"
    %(<li><span class="thought">#{step}</span>#{tools_html}#{observation_html}</li>)
  end
  private_class_method :iteration_item

  # Builds the body for a legacy (non-DeltaReport) result.
  #
  # @param result [Hash] Legacy eval result envelope; reads :pass and :score.
  # @return [String] HTML for the legacy status section.
  def self.legacy_section(result)
    passed = result[:pass]
    status = passed ? 'PASSED' : 'FAILED'
    css = passed ? 'pass' : 'fail'
    score = result[:score]&.round(2)
    <<~HTML.chomp
      <section class="report legacy">
      <h2>Result</h2>
      <p class="verdict #{css}">Status: #{status}</p>
      <p class="score">Score: #{escape(score || 'N/A')}</p>
      #{legacy_error(result)}
      </section>
    HTML
  end
  private_class_method :legacy_section

  # Builds the optional error paragraph for a legacy result.
  #
  # The error may be a Hash carrying :message or a plain value such as a
  # String; a plain value is rendered directly rather than raising while
  # digging into it.
  #
  # @param result [Hash] Legacy eval result envelope.
  # @return [String] An HTML error paragraph, or empty string.
  def self.legacy_error(result)
    error = response_hash(result)[:error]
    message = error.is_a?(Hash) ? error[:message] : error
    message ? %(<p class="error">Error: #{escape(message)}</p>) : ''
  end
  private_class_method :legacy_error

  # Escapes any value for safe HTML embedding.
  #
  # @param value [Object] The value to escape (coerced via #to_s).
  # @return [String] HTML-escaped text.
  def self.escape(value)
    CGI.escapeHTML(value.to_s)
  end
  private_class_method :escape
end
|
|
288
|
+
end
|
|
289
|
+
end
|
|
@@ -6,12 +6,30 @@ module SkillBench
|
|
|
6
6
|
module Services
|
|
7
7
|
# Formats evaluation results as JSON.
|
|
8
8
|
class JsonFormatter
  # Zeroed token usage used when a result carries no usage data.
  EMPTY_USAGE = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }.freeze

  # Format result as JSON.
  #
  # Ensures top-level :tokens and :cost fields are always present (additive;
  # existing keys are preserved) so JSON consumers see a stable shape.
  #
  # @param result [Hash] Eval result.
  # @return [String] JSON-formatted string.
  def self.format(result)
    JSON.pretty_generate(with_usage_fields(result))
  end

  # Returns the result augmented with token/cost fields when missing.
  #
  # Top-level values win; otherwise the values nested under :response are
  # used. The nested lookup only happens when :response is a Hash, because
  # Hash#dig raises TypeError on a non-diggable value (e.g. a String
  # response) and formatting should not fail on such results.
  #
  # @param result [Hash] Eval result (returned unchanged when not a Hash).
  # @return [Hash] Result with :tokens and :cost guaranteed present; :tokens
  #   defaults to {EMPTY_USAGE} and :cost to nil.
  def self.with_usage_fields(result)
    return result unless result.is_a?(Hash)

    response = result[:response]
    nested = response.is_a?(Hash) ? response : {}
    tokens = result[:tokens] || nested[:tokens] || EMPTY_USAGE
    # key? (not ||) so an explicit top-level nil cost is respected.
    cost = result.key?(:cost) ? result[:cost] : nested[:cost]
    result.merge(tokens: tokens, cost: cost)
  end
end
|
|
17
35
|
end
|