ruby-skill-bench 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +299 -23
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +3 -0
- data/lib/skill_bench/clients/base_client.rb +14 -6
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +66 -15
- data/lib/skill_bench/execution/sandbox.rb +76 -14
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +172 -35
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +3 -3
- metadata +19 -36
|
@@ -4,39 +4,89 @@ require 'cgi'
|
|
|
4
4
|
|
|
5
5
|
module SkillBench
|
|
6
6
|
module Services
|
|
7
|
-
# Formats evaluation results as JUnit XML.
|
|
7
|
+
# Formats evaluation results as JUnit XML for CI consumption.
|
|
8
|
+
#
|
|
9
|
+
# Two entry points share the same per-result verdict/score logic:
|
|
10
|
+
# {.format} emits a single-result suite (one <testcase>), while
|
|
11
|
+
# {.format_batch} aggregates many results into one suite so a batch
|
|
12
|
+
# `skill-bench run --all` produces a single JUnit artifact.
|
|
8
13
|
class JUnitFormatter
  # Value used for the suite name and for every <testcase> classname attribute.
  CLASSNAME = 'SkillBench'

  class << self
    # Renders one eval result as a complete JUnit XML document.
    #
    # Works with both result shapes: the legacy one (result[:pass] /
    # result[:score]) and the report-object one found under
    # result[:response][:report].
    #
    # @param result [Hash] Eval result.
    # @return [String] JUnit XML-formatted string.
    def format(result)
      suite([result])
    end

    # Renders a batch envelope as a single JUnit XML document containing one
    # <testcase> per entry of aggregate[:results].
    #
    # @param aggregate [Hash] Aggregate envelope with a :results array.
    # @return [String] JUnit XML-formatted string.
    def format_batch(aggregate)
      entries = aggregate[:results]
      suite(entries || [])
    end

    private

    # Wraps the rendered test cases in a <testsuite> element whose tests and
    # failures attributes are derived from the given results.
    #
    # @param results [Array<Hash>] Per-eval result envelopes.
    # @return [String] JUnit XML-formatted string (newline-terminated).
    def suite(results)
      failed = results.reject { |entry| passing?(entry) }.size
      body = results.map { |entry| testcase(entry) }.join("\n")
      lines = [
        '<?xml version="1.0"?>',
        %(<testsuite name="#{CLASSNAME}" tests="#{results.size}" failures="#{failed}">),
        body,
        '</testsuite>'
      ]
      "#{lines.join("\n")}\n"
    end

    # Renders a single <testcase> fragment; the fragment carries its own
    # leading indentation. Failing results get a nested <failure> element
    # whose message reports the score.
    #
    # @param result [Hash] A single-eval result envelope.
    # @return [String] A <testcase> XML fragment.
    def testcase(result)
      escaped_name = CGI.escapeHTML(result[:eval_name].to_s)
      opening = %( <testcase name="#{escaped_name}" classname="#{CLASSNAME}")
      return "#{opening}/>" if passing?(result)

      escaped_score = CGI.escapeHTML(score_for(result).to_s)
      failure = %( <failure message="Score: #{escaped_score}">Eval failed</failure>)
      "#{opening}>\n#{failure}\n </testcase>"
    end

    # Pass/fail verdict for a result: the report's verdict when the report
    # object exposes one, otherwise the legacy :pass entry.
    #
    # @param result [Hash] A single-eval result envelope.
    # @return [Object] Truthy when the eval passed (the raw verdict/:pass value).
    def passing?(result)
      report = result.dig(:response, :report)
      return report.verdict if report.respond_to?(:verdict)

      result[:pass]
    end

    # Score shown in the failure message: the report's context_total when
    # available, otherwise the legacy :score entry.
    #
    # @param result [Hash] A single-eval result envelope.
    # @return [Object] The score value (may be nil when neither source has one).
    def score_for(result)
      report = result.dig(:response, :report)
      return report.context_total if report.respond_to?(:context_total)

      result[:score]
    end
  end
end
|
|
41
91
|
end
|
|
42
92
|
end
|
|
@@ -51,11 +51,14 @@ module SkillBench
|
|
|
51
51
|
private
|
|
52
52
|
|
|
53
53
|
def resolve_provider
|
|
54
|
-
config = SkillBench::Models::Config.
|
|
54
|
+
config = SkillBench::Models::Config.loaded
|
|
55
55
|
provider = config.to_provider
|
|
56
56
|
return provider if provider
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
# Explicit `{"provider":"mock"}` is a valid choice, not a load failure,
|
|
59
|
+
# so it falls through to the mock provider without a warning. A missing
|
|
60
|
+
# provider key (genuine misconfiguration) still warns below.
|
|
61
|
+
warn 'Config load failed, using mock provider' unless config.mock?
|
|
59
62
|
MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
|
|
60
63
|
rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
|
|
61
64
|
# Config parsing/validation errors or missing config file - fall back to mock
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Content-addressed, in-memory cache for LLM responses.
|
|
9
|
+
#
|
|
10
|
+
# The cache is opt-in and disabled by default. When enabled it lets repeated,
|
|
11
|
+
# identical LLM requests reuse a previously computed response instead of
|
|
12
|
+
# hitting the network again. The canonical example is `compare`, which runs
|
|
13
|
+
# the skill-less baseline twice with identical inputs.
|
|
14
|
+
#
|
|
15
|
+
# The backing store is a process-lifetime {Hash} keyed by a stable SHA-256
|
|
16
|
+
# digest of the request, so the same logical request always maps to the same
|
|
17
|
+
# entry regardless of hash-key ordering. Access to the store is serialized
|
|
18
|
+
# with a mutex so concurrent callers (e.g. {Parallel}-driven agents) cannot
|
|
19
|
+
# corrupt it or double-store a key.
|
|
20
|
+
class ResponseCache
  # Environment variable that opts caching on when set to a truthy value.
  ENV_FLAG = 'SKILL_BENCH_CACHE'

  # Raw env values (after strip + downcase) treated as "on".
  TRUTHY_VALUES = %w[1 true yes on].freeze

  # Guards every read/write of the shared store so the membership check and
  # the write in {fetch} cannot interleave between threads.
  MUTEX = Mutex.new

  class << self
    # Whether response caching is currently enabled.
    #
    # Enabled when {ENV_FLAG} is set to one of {TRUTHY_VALUES} (compared
    # case-insensitively, ignoring surrounding whitespace); disabled when the
    # variable is unset or holds anything else.
    #
    # @return [Boolean] true when caching is on
    def enabled?
      raw = ENV.fetch(ENV_FLAG, '').to_s.strip.downcase
      TRUTHY_VALUES.include?(raw)
    end

    # Computes a stable content-addressed cache key for a request.
    #
    # All inputs are gathered into one structure, canonicalized (hash keys
    # stringified and sorted recursively) and hashed, so requests that differ
    # only in hash-key ordering or symbol-vs-string keys share a digest.
    #
    # @param provider [Symbol, String] Resolved provider identifier
    # @param model [String, nil] Model name
    # @param system_prompt [String] System prompt
    # @param messages [Array<Hash>] Conversation messages
    # @param tools [Array<Hash>, nil] Tool definitions, when present
    # @param temperature [Float, nil] Sampling temperature, when present
    # @param provider_config [Hash] Request-affecting provider settings such as
    #   base_url, request_path, endpoint, location, project_id, api_version
    # @return [String] Hex-encoded SHA-256 digest of the canonical request
    def key(provider:, model:, system_prompt:, messages:, tools: nil, temperature: nil, provider_config: {})
      payload = {
        provider: provider.to_s,
        model: model,
        system_prompt: system_prompt,
        messages: messages,
        tools: tools,
        temperature: temperature,
        provider_config: provider_config
      }
      Digest::SHA256.hexdigest(JSON.generate(canonicalize(payload)))
    end

    # Returns the cached value for a key, computing and storing it on a miss.
    #
    # The block runs outside the lock so distinct keys can be computed
    # concurrently; the lookup and the write are each serialized by {MUTEX}.
    # Membership is decided with +Hash#key?+ rather than by the truthiness of
    # the stored value, so a computed +nil+ or +false+ is cached like any
    # other value and the first writer for a key always wins.
    #
    # @param key [String] Cache key from {key}
    # @yield Computes the value to cache when the key is absent
    # @yieldreturn [Object] The value to cache
    # @return [Object] The cached value (existing on a hit, freshly stored on a miss)
    def fetch(key)
      MUTEX.synchronize { return store[key] if store.key?(key) }

      value = yield
      # Another thread may have stored the key while the block ran; keep its
      # value so every caller observes the same cached object.
      MUTEX.synchronize { store.fetch(key) { store[key] = value } }
    end

    # Removes every cached entry.
    #
    # @return [void]
    def clear
      MUTEX.synchronize { store.clear }
    end

    private

    # The lazily created backing store, held in a class-level instance variable.
    #
    # @return [Hash{String => Object}] digest => cached response
    def store
      @store ||= {}
    end

    # Recursively rewrites a value into a stable form for serialization.
    #
    # Hashes get their keys stringified and sorted so key ordering does not
    # affect the digest; arrays keep their order with each element
    # canonicalized; every other value is returned unchanged.
    #
    # @param value [Object] The value to canonicalize
    # @return [Object] A canonical, order-stable copy of the value
    def canonicalize(value)
      case value
      when Hash
        value
          .sort_by { |entry| entry.first.to_s }
          .each_with_object({}) { |(name, val), acc| acc[name.to_s] = canonicalize(val) }
      when Array
        value.map { |element| canonicalize(element) }
      else
        value
      end
    end
  end
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'parallel'
|
|
3
4
|
require_relative '../evaluation/runner'
|
|
4
5
|
require_relative 'eval_resolver'
|
|
5
6
|
require_relative 'skill_resolver_service'
|
|
@@ -10,6 +11,7 @@ require_relative 'context_loader_service'
|
|
|
10
11
|
require_relative 'judge_params_builder'
|
|
11
12
|
require_relative 'error_response_builder'
|
|
12
13
|
require_relative 'trend_recorder_service'
|
|
14
|
+
require_relative 'cost_calculator'
|
|
13
15
|
require_relative 'output_formatter'
|
|
14
16
|
|
|
15
17
|
module SkillBench
|
|
@@ -61,13 +63,11 @@ module SkillBench
|
|
|
61
63
|
provider = provider_result[:provider]
|
|
62
64
|
config = provider_result[:config]
|
|
63
65
|
|
|
64
|
-
baseline_output = run_baseline_agent(evaluation, provider, config)
|
|
65
|
-
return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
|
|
66
|
-
|
|
67
66
|
skill_context = ContextLoaderService.call(skills)
|
|
68
67
|
return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?
|
|
69
68
|
|
|
70
|
-
context_output =
|
|
69
|
+
baseline_output, context_output = run_agents_concurrently(evaluation, skills, skill_context, provider, config)
|
|
70
|
+
return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
|
|
71
71
|
return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error
|
|
72
72
|
|
|
73
73
|
context = EvaluationContext.new(
|
|
@@ -111,6 +111,29 @@ module SkillBench
|
|
|
111
111
|
AgentSpawnerService.call(evaluation, context_prompt, provider, config)
|
|
112
112
|
end
|
|
113
113
|
|
|
114
|
+
# Runs the baseline and context agents concurrently.
|
|
115
|
+
#
|
|
116
|
+
# The two runs are independent: each spawns its own `Dir.mktmpdir`
|
|
117
|
+
# sandbox and uses a per-call client, and neither reads the other's
|
|
118
|
+
# state. The work is I/O-bound (HTTP + subprocess), so threads release
|
|
119
|
+
# the GIL and the agent phase is bound by the slower run instead of the
|
|
120
|
+
# sum of both. The skill context is built once by the caller and passed
|
|
121
|
+
# in, so no skill-dependent work is duplicated or moved into the baseline.
|
|
122
|
+
#
|
|
123
|
+
# @param evaluation [SkillBench::Models::Eval] The eval being run
|
|
124
|
+
# @param skills [Array<SkillBench::Models::Skill>] Resolved skills
|
|
125
|
+
# @param skill_context [String] Combined skill context, built once by the caller before the agent threads start
|
|
126
|
+
# @param provider [Object] The resolved provider
|
|
127
|
+
# @param config [Hash, nil] Provider config
|
|
128
|
+
# @return [Array(Hash, Hash)] Baseline and context outputs, in that order
|
|
129
|
+
# Starts the baseline run and the context run on separate threads and waits
# for both. The caller destructures the result as [baseline, context], so
# the jobs are listed in that order.
#
# @param evaluation [SkillBench::Models::Eval] The eval being run
# @param skills [Array<SkillBench::Models::Skill>] Resolved skills
# @param skill_context [String] Combined skill context prepared by the caller
# @param provider [Object] The resolved provider
# @param config [Hash, nil] Provider config
# @return [Array(Hash, Hash)] Baseline and context outputs
def run_agents_concurrently(evaluation, skills, skill_context, provider, config)
  baseline_job = -> { run_baseline_agent(evaluation, provider, config) }
  context_job = -> { run_context_agent(evaluation, skills, skill_context, provider, config) }
  jobs = [baseline_job, context_job]
  # One thread per job.
  Parallel.map(jobs, in_threads: jobs.size, &:call)
end
|
|
136
|
+
|
|
114
137
|
def evaluate_and_record_trend(context)
|
|
115
138
|
evaluation = context.evaluation
|
|
116
139
|
provider = context.provider
|
|
@@ -133,11 +156,16 @@ module SkillBench
|
|
|
133
156
|
trend_result = TrendRecorderService.call(result, eval_name, skill_names)
|
|
134
157
|
return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
|
|
135
158
|
|
|
159
|
+
tokens = aggregate_usage(context.baseline_output, context.context_output)
|
|
160
|
+
cost = CostCalculator.call(usage: tokens, model: agent_model(config, provider))
|
|
161
|
+
|
|
136
162
|
{
|
|
137
163
|
success: true,
|
|
138
164
|
eval_name: eval_name,
|
|
139
165
|
skill_name: skill_names.join(', '),
|
|
140
166
|
provider_name: provider.name,
|
|
167
|
+
tokens: tokens,
|
|
168
|
+
cost: cost,
|
|
141
169
|
response: result[:response].merge(
|
|
142
170
|
trend: trend_result[:trend],
|
|
143
171
|
baseline_iterations: context.baseline_output[:iterations] || [],
|
|
@@ -145,6 +173,62 @@ module SkillBench
|
|
|
145
173
|
)
|
|
146
174
|
}
|
|
147
175
|
end
|
|
176
|
+
|
|
177
|
+
# Sums the token usage of the baseline and context agent runs.
|
|
178
|
+
#
|
|
179
|
+
# Judge-side usage is not yet threaded through (the judge lives under the
|
|
180
|
+
# untouched `clients/` boundary), so this is scoped to agent usage.
|
|
181
|
+
#
|
|
182
|
+
# @param baseline_output [Hash] The baseline agent output (carries :usage).
|
|
183
|
+
# @param context_output [Hash] The context agent output (carries :usage).
|
|
184
|
+
# @return [Hash] Combined prompt/completion/total token counts.
|
|
185
|
+
# Combines the :usage hashes of the baseline and context agent outputs into
# one running total, starting from a zeroed accumulator.
#
# @param baseline_output [Hash] The baseline agent output (read via :usage).
# @param context_output [Hash] The context agent output (read via :usage).
# @return [Hash] Combined prompt/completion/total token counts.
def aggregate_usage(baseline_output, context_output)
  after_baseline = add_usage(empty_usage, baseline_output[:usage])
  add_usage(after_baseline, context_output[:usage])
end
|
|
191
|
+
|
|
192
|
+
# A zeroed token-usage accumulator.
|
|
193
|
+
#
|
|
194
|
+
# @return [Hash] Usage hash with prompt/completion/total token counts set to zero.
|
|
195
|
+
# Builds a fresh usage accumulator with every counter at zero.
#
# @return [Hash] prompt_tokens, completion_tokens and total_tokens, all 0.
def empty_usage
  %i[prompt_tokens completion_tokens total_tokens].to_h { |field| [field, 0] }
end
|
|
198
|
+
|
|
199
|
+
# Adds one usage hash onto a running total.
|
|
200
|
+
#
|
|
201
|
+
# @param total [Hash] The running usage total.
|
|
202
|
+
# @param usage [Hash, nil] A run's usage hash (may be nil or empty).
|
|
203
|
+
# @return [Hash] A new summed usage hash.
|
|
204
|
+
# Returns a new total with one run's usage added to each counter; the
# running total passed in is not modified.
#
# @param total [Hash] The running usage total (symbol keys).
# @param usage [Hash, nil] A run's usage hash; nil counts as empty.
# @return [Hash] A new summed usage hash.
def add_usage(total, usage)
  increments = usage || {}
  %i[prompt_tokens completion_tokens total_tokens].to_h do |field|
    [field, total[field] + token_count(increments, field)]
  end
end
|
|
212
|
+
|
|
213
|
+
# Reads a token count from a usage hash, tolerating string keys.
|
|
214
|
+
#
|
|
215
|
+
# @param usage [Hash] The usage hash.
|
|
216
|
+
# @param key [Symbol] The usage key (e.g. :prompt_tokens).
|
|
217
|
+
# @return [Integer] The token count, or zero when absent.
|
|
218
|
+
# Looks up a token count under the symbol key first, then under the
# equivalent string key, and coerces whatever is found with #to_i.
#
# @param usage [Hash] The usage hash.
# @param key [Symbol] The usage key (e.g. :prompt_tokens).
# @return [Integer] The token count, or zero when neither key holds a value.
def token_count(usage, key)
  raw = usage[key]
  raw = usage[key.to_s] unless raw
  raw ? raw.to_i : 0
end
|
|
221
|
+
|
|
222
|
+
# Resolves the model name used for pricing from config, falling back to the provider LLM.
|
|
223
|
+
#
|
|
224
|
+
# @param config [Hash, nil] Provider config.
|
|
225
|
+
# @param provider [Object] The resolved provider.
|
|
226
|
+
# @return [String] The model name (e.g. "gpt-4o").
|
|
227
|
+
# Picks the model name handed to the cost calculation: the config's :model
# (or 'model') entry when config is a Hash and has one, otherwise provider.llm.
#
# @param config [Hash, nil] Provider config.
# @param provider [Object] The resolved provider (must respond to #llm).
# @return [Object] The configured model, or the provider's llm value.
def agent_model(config, provider)
  configured =
    if config.is_a?(Hash)
      config[:model] || config['model']
    end
  configured || provider.llm
end
|
|
148
232
|
end
|
|
149
233
|
end
|
|
150
234
|
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Builds a compact JSON summary of a batch run for CI gating.
|
|
8
|
+
#
|
|
9
|
+
# Surfaces the aggregate pass/fail counts plus rolled-up token and cost
|
|
10
|
+
# usage and the single worst skill-vs-baseline delta across the batch, so
|
|
11
|
+
# a CI job can gate on (and archive) one machine-readable artifact.
|
|
12
|
+
class SummaryFormatter
  # Format an aggregate batch envelope as a pretty JSON summary string.
  #
  # @param aggregate [Hash] Aggregate envelope with :results and :summary.
  # @return [String] Pretty-printed JSON summary.
  def self.format(aggregate)
    new(aggregate).format
  end

  # @param aggregate [Hash] Aggregate envelope with :results and :summary;
  #   a missing :results is treated as [] and a missing :summary as {}.
  def initialize(aggregate)
    @results = aggregate[:results] || []
    @summary = aggregate[:summary] || {}
  end

  # Builds the JSON summary document: the pass/fail/total counts copied from
  # the envelope's :summary, plus token, cost and worst-delta roll-ups
  # computed from the individual results.
  #
  # @return [String] Pretty-printed JSON summary.
  def format
    JSON.pretty_generate(
      passed: summary[:passed],
      failed: summary[:failed],
      total: summary[:total],
      tokens: total_tokens,
      cost: total_cost,
      worst_delta: worst_delta
    )
  end

  private

  attr_reader :results, :summary

  # Sums total_tokens across every result, treating missing usage as 0.
  #
  # @return [Integer] Aggregate token count.
  def total_tokens
    results.sum { |result| tokens_for(result) }
  end

  # Reads a single result's total token count.
  #
  # Accepts symbol or string keys and coerces the value with #to_i so a
  # count that arrives as a String cannot make the sum raise. A :tokens
  # entry that is not a Hash counts as 0.
  #
  # @param result [Hash] A single-eval result envelope.
  # @return [Integer] total_tokens, or 0 when absent.
  def tokens_for(result)
    tokens = result[:tokens]
    return 0 unless tokens.is_a?(Hash)

    (tokens[:total_tokens] || tokens['total_tokens'] || 0).to_i
  end

  # Sums non-nil per-result costs.
  #
  # @return [Numeric, nil] Total cost, or nil when no result reports a cost.
  def total_cost
    costs = results.filter_map { |result| result[:cost] }
    costs.empty? ? nil : costs.sum
  end

  # Finds the eval with the smallest (context_total - baseline_total) delta.
  #
  # @return [Hash, nil] {:eval_name, :delta} for the worst eval, or nil
  #   when no result carries a usable delta report.
  def worst_delta
    results
      .filter_map { |result| delta_entry(result) }
      .min_by { |entry| entry[:delta] }
  end

  # Builds a {eval_name, delta} entry for a result with a delta report.
  #
  # @param result [Hash] A single-eval result envelope.
  # @return [Hash, nil] Entry hash, or nil when the report does not expose
  #   both totals or either total is nil.
  def delta_entry(result)
    report = result.dig(:response, :report)
    return nil unless report.respond_to?(:context_total) && report.respond_to?(:baseline_total)

    context_total = report.context_total
    baseline_total = report.baseline_total
    # A report with a missing total cannot be ranked; skip it rather than
    # letting nil arithmetic abort the whole summary.
    return nil if context_total.nil? || baseline_total.nil?

    { eval_name: result[:eval_name], delta: context_total - baseline_total }
  end
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
|
+
require_relative '../dimension'
|
|
4
5
|
require_relative 'template_registry/category_data'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
@@ -21,6 +22,24 @@ module SkillBench
|
|
|
21
22
|
TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
|
|
22
23
|
CATEGORIES = REGISTRY.keys.freeze
|
|
23
24
|
|
|
25
|
+
# Score weight per core scoring dimension. Keyed by the canonical
|
|
26
|
+
# +SkillBench::DEFAULT_DIMENSIONS+ names so scaffolded criteria can never
|
|
27
|
+
# drift from the names the runtime loader requires; values sum to 100.
|
|
28
|
+
CRITERIA_DIMENSION_SCORES = {
|
|
29
|
+
'correctness' => 30,
|
|
30
|
+
'skill_adherence' => 25,
|
|
31
|
+
'code_quality' => 20,
|
|
32
|
+
'test_coverage' => 15,
|
|
33
|
+
'documentation' => 10
|
|
34
|
+
}.freeze
|
|
35
|
+
|
|
36
|
+
# Canonical dimension descriptions keyed by name, sourced from the runtime defaults.
|
|
37
|
+
CORE_DIMENSION_DESCRIPTIONS = SkillBench::DEFAULT_DIMENSIONS.to_h { |dimension| [dimension.name, dimension.description] }.freeze
|
|
38
|
+
|
|
39
|
+
# Top-level thresholds emitted with scaffolded criteria.
|
|
40
|
+
CRITERIA_PASS_THRESHOLD = 70
|
|
41
|
+
CRITERIA_MINIMUM_DELTA = 10
|
|
42
|
+
|
|
24
43
|
# @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
|
|
25
44
|
# @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
|
|
26
45
|
# @param variables [Hash{Symbol, String => String}] Variables for interpolation
|
|
@@ -105,21 +124,36 @@ module SkillBench
|
|
|
105
124
|
MARKDOWN
|
|
106
125
|
end
|
|
107
126
|
|
|
127
|
+
# Builds runtime-loadable scoring criteria for the category.
|
|
128
|
+
#
|
|
129
|
+
# Emits the five core dimensions required by {SkillBench::Criteria}
|
|
130
|
+
# (+correctness+, +skill_adherence+, +code_quality+, +test_coverage+,
|
|
131
|
+
# +documentation+) with integer +max_score+ values summing to 100, plus
|
|
132
|
+
# the top-level +pass_threshold+ and +minimum_delta+ the loader expects.
|
|
133
|
+
# Category-specific flavor lives only in the dimension descriptions.
|
|
134
|
+
#
|
|
135
|
+
# @return [String] Pretty-printed criteria JSON.
|
|
108
136
|
# Serializes the scaffolded criteria document: the category name, the
# dimension list from #criteria_dimensions, and the two top-level
# thresholds taken from the class constants.
#
# @return [String] Pretty-printed criteria JSON.
def build_criteria_json
  document = {
    category: category.to_s,
    dimensions: criteria_dimensions,
    pass_threshold: CRITERIA_PASS_THRESHOLD,
    minimum_delta: CRITERIA_MINIMUM_DELTA
  }
  JSON.pretty_generate(document)
end
|
|
122
144
|
|
|
145
|
+
# @return [Array<Hash>] Core dimensions with integer +max_score+ summing to 100.
|
|
146
|
+
# Builds one entry per CRITERIA_DIMENSION_SCORES pair. Each description is
# the core description for that dimension name followed by the category and
# its :focus value from the category data.
#
# @return [Array<Hash>] Hashes with :name, :max_score and :description.
def criteria_dimensions
  focus = category_data.criteria[:focus]
  CRITERIA_DIMENSION_SCORES.each_with_object([]) do |(name, max_score), dimensions|
    base_description = CORE_DIMENSION_DESCRIPTIONS.fetch(name)
    dimensions << {
      name: name,
      max_score: max_score,
      description: "#{base_description} (#{category} focus: #{focus})"
    }
  end
end
|
|
156
|
+
|
|
123
157
|
def build_skill_md
|
|
124
158
|
<<~MARKDOWN
|
|
125
159
|
# Skill: {{skill_name}} (#{category})
|
|
@@ -6,6 +6,13 @@ module SkillBench
|
|
|
6
6
|
module Services
|
|
7
7
|
# Records evaluation results and computes trends.
|
|
8
8
|
class TrendRecorderService
|
|
9
|
+
# Serializes the load -> append -> write of the shared trend history
|
|
10
|
+
# file. Batch runs ({BatchRunnerService}) execute evals concurrently and
|
|
11
|
+
# the trend file is process-global shared state; without this lock,
|
|
12
|
+
# concurrent records race on the temp-file rename and silently lose
|
|
13
|
+
# appended entries.
|
|
14
|
+
WRITE_MUTEX = Mutex.new
|
|
15
|
+
|
|
9
16
|
# Records evaluation results and computes trends.
|
|
10
17
|
#
|
|
11
18
|
# @param result [Hash] The evaluation result from Evaluation::Runner
|
|
@@ -27,12 +34,14 @@ module SkillBench
|
|
|
27
34
|
|
|
28
35
|
# Records evaluation results and computes trends.
|
|
29
36
|
#
|
|
37
|
+
# Loads the trend history once and reuses it for both the trend
|
|
38
|
+
# computation and the append+write, avoiding a duplicate parse per run.
|
|
39
|
+
#
|
|
30
40
|
# @return [Hash] Result with success status and trend data
|
|
31
41
|
def call
|
|
32
42
|
tracker = TrendTracker.new
|
|
33
43
|
enriched = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
|
|
34
|
-
trend = tracker
|
|
35
|
-
record_result = tracker.record(enriched)
|
|
44
|
+
trend, record_result = record_atomically(tracker, enriched)
|
|
36
45
|
|
|
37
46
|
record_success = record_result.is_a?(Hash) && record_result[:success]
|
|
38
47
|
unless record_success
|
|
@@ -62,6 +71,24 @@ module SkillBench
|
|
|
62
71
|
SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
|
|
63
72
|
{ success: false, response: { error: { message: e.message } } }
|
|
64
73
|
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
# Loads history, computes the trend, and records the entry while holding
|
|
78
|
+
# {WRITE_MUTEX}, so concurrent batch evals serialize their read-modify-
|
|
79
|
+
# write of the shared trend file. History is still loaded exactly once
|
|
80
|
+
# per run and reused for both the trend computation and the append.
|
|
81
|
+
#
|
|
82
|
+
# @param tracker [SkillBench::TrendTracker] The trend tracker
|
|
83
|
+
# @param enriched [Hash] Result enriched with eval_name and skill_names
|
|
84
|
+
# @return [Array(Hash, Hash)] The computed trend and the record result
|
|
85
|
+
# Reads the history, derives the trend from it, and records the new entry,
# all inside a single WRITE_MUTEX critical section. The same history object
# is passed to both the trend computation and the record call.
#
# @param tracker [SkillBench::TrendTracker] The trend tracker
# @param enriched [Hash] Result enriched with eval_name and skill_names
# @return [Array(Hash, Hash)] The computed trend and the record result
def record_atomically(tracker, enriched)
  WRITE_MUTEX.synchronize do
    snapshot = tracker.history
    computed_trend = tracker.trend_for(enriched, snapshot)
    outcome = tracker.record(enriched, snapshot)
    [computed_trend, outcome]
  end
end
|
|
65
92
|
end
|
|
66
93
|
end
|
|
67
94
|
end
|
|
@@ -8,15 +8,41 @@ module SkillBench
|
|
|
8
8
|
module Tools
|
|
9
9
|
# Registry for all available tools, providing their definitions to the LLM.
|
|
10
10
|
class Registry
|
|
11
|
-
#
|
|
11
|
+
# Recursively deep-freezes a tool-definition value (Hash/Array and contents)
|
|
12
|
+
# so accidental mutation by a downstream consumer raises immediately.
|
|
12
13
|
#
|
|
13
|
-
# @
|
|
14
|
-
|
|
14
|
+
# @param value [Object] The value to deep-freeze in place.
|
|
15
|
+
# @return [Object] The same value, frozen along with everything it contains.
|
|
16
|
+
# Freezes a value in place together with everything reachable through
# nested Hashes and Arrays: Array elements, Hash values, and Hash keys.
# Keys are included so a mutable key object (e.g. an Array) cannot be
# altered after the structure is frozen. Any other object is simply frozen.
#
# @param value [Object] The value to deep-freeze in place.
# @return [Object] The same value, frozen along with everything it contains.
def self.deep_freeze(value)
  case value
  when Hash
    value.each do |key, child|
      deep_freeze(key)
      deep_freeze(child)
    end
  when Array
    value.each { |child| deep_freeze(child) }
  end
  value.freeze
end
|
|
25
|
+
private_class_method :deep_freeze
|
|
26
|
+
|
|
27
|
+
# The static tool definitions sent to the LLM API. The tool schemas are
|
|
28
|
+
# constant JSON-schema specs (no per-call state or runtime config), so the
|
|
29
|
+
# array and its nested hashes are built once and deep-frozen for reuse
|
|
30
|
+
# across every ReAct step instead of being reallocated on each call.
|
|
31
|
+
#
|
|
32
|
+
# @return [Array<Hash>] Frozen list of tools with their names, descriptions, and schemas.
|
|
33
|
+
DEFINITIONS = deep_freeze(
|
|
15
34
|
[
|
|
16
35
|
ReadFile.definition,
|
|
17
36
|
WriteFile.definition,
|
|
18
37
|
RunCommand.definition
|
|
19
38
|
]
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Returns the memoized, frozen array of tool definitions for the LLM API.
|
|
42
|
+
#
|
|
43
|
+
# @return [Array<Hash>] The frozen list of available tools with their names, descriptions, and schemas.
|
|
44
|
+
def self.definitions
# Returns the DEFINITIONS constant itself (no copy is made), so every caller shares one array.
|
|
45
|
+
DEFINITIONS
|
|
20
46
|
end
|
|
21
47
|
end
|
|
22
48
|
end
|