ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/README.md +299 -23
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/agent/react_agent.rb +2 -1
  9. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  10. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  11. data/lib/skill_bench/cli/help_printer.rb +10 -2
  12. data/lib/skill_bench/cli/init_command.rb +2 -1
  13. data/lib/skill_bench/cli/result_printer.rb +1 -1
  14. data/lib/skill_bench/cli/run_command.rb +47 -9
  15. data/lib/skill_bench/cli/validate_command.rb +242 -0
  16. data/lib/skill_bench/cli.rb +3 -0
  17. data/lib/skill_bench/client.rb +43 -1
  18. data/lib/skill_bench/clients/all.rb +3 -0
  19. data/lib/skill_bench/clients/base_client.rb +14 -6
  20. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  21. data/lib/skill_bench/clients/provider_config.rb +34 -1
  22. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  23. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  24. data/lib/skill_bench/clients/request_builder.rb +2 -4
  25. data/lib/skill_bench/clients/response_builder.rb +91 -0
  26. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  27. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  28. data/lib/skill_bench/commands/init.rb +5 -0
  29. data/lib/skill_bench/commands/skill_new.rb +3 -1
  30. data/lib/skill_bench/config/applier.rb +2 -0
  31. data/lib/skill_bench/config/defaults.rb +2 -0
  32. data/lib/skill_bench/config/facade_readers.rb +7 -0
  33. data/lib/skill_bench/config/facade_writers.rb +17 -0
  34. data/lib/skill_bench/config/json_loader.rb +1 -1
  35. data/lib/skill_bench/config/store.rb +29 -0
  36. data/lib/skill_bench/config.rb +18 -0
  37. data/lib/skill_bench/constants.rb +58 -0
  38. data/lib/skill_bench/evaluation/runner.rb +20 -3
  39. data/lib/skill_bench/execution/context_hydrator.rb +66 -15
  40. data/lib/skill_bench/execution/sandbox.rb +76 -14
  41. data/lib/skill_bench/judge/judge.rb +4 -0
  42. data/lib/skill_bench/judge/prompt.rb +42 -6
  43. data/lib/skill_bench/models/config.rb +32 -0
  44. data/lib/skill_bench/output_formatter.rb +60 -1
  45. data/lib/skill_bench/package_verifier.rb +1 -1
  46. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  47. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  48. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  49. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  50. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  51. data/lib/skill_bench/services/html_formatter.rb +289 -0
  52. data/lib/skill_bench/services/json_formatter.rb +19 -1
  53. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  54. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  55. data/lib/skill_bench/services/response_cache.rb +130 -0
  56. data/lib/skill_bench/services/runner_service.rb +88 -4
  57. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  58. data/lib/skill_bench/services/template_registry.rb +43 -9
  59. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  60. data/lib/skill_bench/tools/registry.rb +29 -3
  61. data/lib/skill_bench/tools/run_command.rb +172 -35
  62. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  63. data/lib/skill_bench/trend_tracker.rb +5 -5
  64. data/lib/skill_bench/version.rb +1 -1
  65. data/lib/skill_bench.rb +3 -3
  66. metadata +19 -36
@@ -4,39 +4,89 @@ require 'cgi'
4
4
 
5
5
  module SkillBench
6
6
  module Services
7
- # Formats evaluation results as JUnit XML.
7
+ # Formats evaluation results as JUnit XML for CI consumption.
8
+ #
9
+ # Two entry points share the same per-result verdict/score logic:
10
+ # {.format} emits a single-result suite (one <testcase>), while
11
+ # {.format_batch} aggregates many results into one suite so a batch
12
+ # `skill-bench run --all` produces a single JUnit artifact.
8
13
  class JUnitFormatter
9
- # Format result as JUnit XML.
14
+ # classname attribute applied to every emitted <testcase>.
15
+ CLASSNAME = 'SkillBench'
16
+
17
+ # Format a single result as a JUnit XML document.
10
18
  #
11
19
  # Supports both legacy format (result[:pass]) and modern DeltaReport format.
12
20
  #
13
21
  # @param result [Hash] Eval result.
14
22
  # @return [String] JUnit XML-formatted string.
15
23
  def self.format(result)
24
+ suite([result])
25
+ end
26
+
27
+ # Format an aggregate batch envelope as one JUnit XML document.
28
+ #
29
+ # Emits a single <testsuite> with one <testcase> per result, adding a
30
+ # <failure> child for every failing eval.
31
+ #
32
+ # @param aggregate [Hash] Aggregate envelope with a :results array.
33
+ # @return [String] JUnit XML-formatted string.
34
+ def self.format_batch(aggregate)
35
+ suite(aggregate[:results] || [])
36
+ end
37
+
38
+ # Builds a <testsuite> wrapping one <testcase> per result.
39
+ #
40
+ # @param results [Array<Hash>] Per-eval result envelopes.
41
+ # @return [String] JUnit XML-formatted string.
42
+ def self.suite(results)
43
+ failures = results.count { |result| !passing?(result) }
44
+ cases = results.map { |result| testcase(result) }.join("\n")
45
+ <<~XML
46
+ <?xml version="1.0"?>
47
+ <testsuite name="#{CLASSNAME}" tests="#{results.size}" failures="#{failures}">
48
+ #{cases}
49
+ </testsuite>
50
+ XML
51
+ end
52
+ private_class_method :suite
53
+
54
+ # Renders one <testcase> element (indented two spaces) for a result.
55
+ #
56
+ # @param result [Hash] A single-eval result envelope.
57
+ # @return [String] A <testcase> XML fragment.
58
+ def self.testcase(result)
59
+ name = CGI.escapeHTML(result[:eval_name].to_s)
60
+ return %( <testcase name="#{name}" classname="#{CLASSNAME}"/>) if passing?(result)
61
+
62
+ score = CGI.escapeHTML(score_for(result).to_s)
63
+ [
64
+ %( <testcase name="#{name}" classname="#{CLASSNAME}">),
65
+ %( <failure message="Score: #{score}">Eval failed</failure>),
66
+ ' </testcase>'
67
+ ].join("\n")
68
+ end
69
+ private_class_method :testcase
70
+
71
+ # Whether a result passed (DeltaReport verdict or legacy :pass).
72
+ #
73
+ # @param result [Hash] A single-eval result envelope.
74
+ # @return [Boolean] true when the eval passed.
75
+ def self.passing?(result)
76
+ report = result.dig(:response, :report)
77
+ report.respond_to?(:verdict) ? report.verdict : result[:pass]
78
+ end
79
+ private_class_method :passing?
80
+
81
+ # The score reported for a failing eval.
82
+ #
83
+ # @param result [Hash] A single-eval result envelope.
84
+ # @return [Object] DeltaReport context_total or legacy :score.
85
+ def self.score_for(result)
16
86
  report = result.dig(:response, :report)
17
- verdict = report.respond_to?(:verdict) ? report.verdict : result[:pass]
18
- eval_name = CGI.escapeHTML(result[:eval_name].to_s)
19
-
20
- if verdict
21
- <<~XML
22
- <?xml version="1.0"?>
23
- <testsuite name="SkillBench" tests="1" failures="0">
24
- <testcase name="#{eval_name}" classname="SkillBench"/>
25
- </testsuite>
26
- XML
27
- else
28
- score = report.respond_to?(:context_total) ? report.context_total : result[:score]
29
- escaped_score = CGI.escapeHTML(score.to_s)
30
- <<~XML
31
- <?xml version="1.0"?>
32
- <testsuite name="SkillBench" tests="1" failures="1">
33
- <testcase name="#{eval_name}" classname="SkillBench">
34
- <failure message="Score: #{escaped_score}">Eval failed</failure>
35
- </testcase>
36
- </testsuite>
37
- XML
38
- end
87
+ report.respond_to?(:context_total) ? report.context_total : result[:score]
39
88
  end
89
+ private_class_method :score_for
40
90
  end
41
91
  end
42
92
  end
@@ -51,11 +51,14 @@ module SkillBench
51
51
  private
52
52
 
53
53
  def resolve_provider
54
- config = SkillBench::Models::Config.load
54
+ config = SkillBench::Models::Config.loaded
55
55
  provider = config.to_provider
56
56
  return provider if provider
57
57
 
58
- warn 'Config load failed, using mock provider'
58
+ # Explicit `{"provider":"mock"}` is a valid choice, not a load failure,
59
+ # so it falls through to the mock provider without a warning. A missing
60
+ # provider key (genuine misconfiguration) still warns below.
61
+ warn 'Config load failed, using mock provider' unless config.mock?
59
62
  MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
60
63
  rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
61
64
  # Config parsing/validation errors or missing config file - fall back to mock
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'json'
5
+
6
+ module SkillBench
7
+ module Services
8
+ # Content-addressed, in-memory cache for LLM responses.
9
+ #
10
+ # The cache is opt-in and disabled by default. When enabled it lets repeated,
11
+ # identical LLM requests reuse a previously computed response instead of
12
+ # hitting the network again. The canonical example is `compare`, which runs
13
+ # the skill-less baseline twice with identical inputs.
14
+ #
15
+ # The backing store is a process-lifetime {Hash} keyed by a stable SHA-256
16
+ # digest of the request, so the same logical request always maps to the same
17
+ # entry regardless of hash-key ordering. Access to the store is serialized
18
+ # with a mutex so concurrent callers (e.g. {Parallel}-driven agents) cannot
19
+ # corrupt it or double-store a key.
20
+ class ResponseCache
21
+ # Environment variable that opts caching on when set to a truthy value.
22
+ ENV_FLAG = 'SKILL_BENCH_CACHE'
23
+
24
+ # Raw env values treated as "on".
25
+ TRUTHY_VALUES = %w[1 true yes on].freeze
26
+
27
+ # Guards every read/write of the shared store. Concurrent agents/judges run
28
+ # on separate threads; without this, the membership check and the write in
29
+ # {fetch} could interleave and store a key more than once.
30
+ MUTEX = Mutex.new
31
+
32
+ class << self
33
+ # Whether response caching is currently enabled.
34
+ #
35
+ # Enabled when {ENV_FLAG} is set to a truthy value (one of
36
+ # {TRUTHY_VALUES}); disabled when unset or set to anything else.
37
+ #
38
+ # @return [Boolean] true when caching is on
39
+ def enabled?
40
+ raw = ENV.fetch(ENV_FLAG, '').to_s.strip.downcase
41
+ TRUTHY_VALUES.include?(raw)
42
+ end
43
+
44
+ # Computes a stable content-addressed cache key for a request.
45
+ #
46
+ # The inputs are assembled into a canonical structure (hash keys sorted
47
+ # and stringified recursively) and hashed, so semantically identical
48
+ # requests always produce the same digest. Request-affecting provider
49
+ # configuration (endpoint/base URL/etc.) is included so two providers that
50
+ # share a name but target different endpoints never collide.
51
+ #
52
+ # @param provider [Symbol, String] Resolved provider identifier
53
+ # @param model [String, nil] Model name
54
+ # @param system_prompt [String] System prompt
55
+ # @param messages [Array<Hash>] Conversation messages
56
+ # @param tools [Array<Hash>, nil] Tool definitions, when present
57
+ # @param temperature [Float, nil] Sampling temperature, when present
58
+ # @param provider_config [Hash] Request-affecting provider settings such as
59
+ # base_url, request_path, endpoint, location, project_id, api_version
60
+ # @return [String] Hex-encoded SHA-256 digest of the canonical request
61
+ def key(provider:, model:, system_prompt:, messages:, tools: nil, temperature: nil, provider_config: {})
62
+ payload = {
63
+ provider: provider.to_s,
64
+ model: model,
65
+ system_prompt: system_prompt,
66
+ messages: messages,
67
+ tools: tools,
68
+ temperature: temperature,
69
+ provider_config: provider_config
70
+ }
71
+ Digest::SHA256.hexdigest(JSON.generate(canonicalize(payload)))
72
+ end
73
+
74
+ # Returns the cached value for a key, computing and storing it on a miss.
75
+ #
76
+ # The value is computed outside the lock so requests for distinct keys run
77
+ # concurrently; the store read and the store write are each serialized by
78
+ # {MUTEX}; a missing key is stored once (first writer wins), though concurrent misses may each run the block.
79
+ #
80
+ # @param key [String] Cache key from {key}
81
+ # @yield Computes the value to cache when the key is absent
82
+ # @yieldreturn [Object] The value to cache
83
+ # @return [Object] The cached value (existing on a hit, freshly stored on a miss)
84
+ def fetch(key)
85
+ hit = MUTEX.synchronize { store[key] }
86
+ return hit unless hit.nil?
87
+
88
+ value = yield
89
+ MUTEX.synchronize { store[key] ||= value }
90
+ end
91
+
92
+ # Removes every cached entry.
93
+ #
94
+ # @return [void]
95
+ def clear
96
+ MUTEX.synchronize { store.clear }
97
+ end
98
+
99
+ private
100
+
101
+ # The process-lifetime backing store.
102
+ #
103
+ # @return [Hash{String => Object}] digest => cached response
104
+ def store
105
+ @store ||= {}
106
+ end
107
+
108
+ # Recursively rewrites a value into a stable form for serialization.
109
+ #
110
+ # Hashes get their keys stringified and sorted so that key ordering does
111
+ # not affect the resulting digest; arrays and scalars are preserved.
112
+ #
113
+ # @param value [Object] The value to canonicalize
114
+ # @return [Object] A canonical, order-stable copy of the value
115
+ def canonicalize(value)
116
+ case value
117
+ when Hash
118
+ value
119
+ .sort_by { |entry| entry.first.to_s }
120
+ .each_with_object({}) { |(name, val), acc| acc[name.to_s] = canonicalize(val) }
121
+ when Array
122
+ value.map { |element| canonicalize(element) }
123
+ else
124
+ value
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
130
+ end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'parallel'
3
4
  require_relative '../evaluation/runner'
4
5
  require_relative 'eval_resolver'
5
6
  require_relative 'skill_resolver_service'
@@ -10,6 +11,7 @@ require_relative 'context_loader_service'
10
11
  require_relative 'judge_params_builder'
11
12
  require_relative 'error_response_builder'
12
13
  require_relative 'trend_recorder_service'
14
+ require_relative 'cost_calculator'
13
15
  require_relative 'output_formatter'
14
16
 
15
17
  module SkillBench
@@ -61,13 +63,11 @@ module SkillBench
61
63
  provider = provider_result[:provider]
62
64
  config = provider_result[:config]
63
65
 
64
- baseline_output = run_baseline_agent(evaluation, provider, config)
65
- return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
66
-
67
66
  skill_context = ContextLoaderService.call(skills)
68
67
  return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?
69
68
 
70
- context_output = run_context_agent(evaluation, skills, skill_context, provider, config)
69
+ baseline_output, context_output = run_agents_concurrently(evaluation, skills, skill_context, provider, config)
70
+ return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
71
71
  return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error
72
72
 
73
73
  context = EvaluationContext.new(
@@ -111,6 +111,29 @@ module SkillBench
111
111
  AgentSpawnerService.call(evaluation, context_prompt, provider, config)
112
112
  end
113
113
 
114
+ # Runs the baseline and context agents concurrently.
115
+ #
116
+ # The two runs are independent: each spawns its own `Dir.mktmpdir`
117
+ # sandbox and uses a per-call client, and neither reads the other's
118
+ # state. The work is I/O-bound (HTTP + subprocess), so threads release
119
+ # the GVL and the agent phase is bound by the slower run instead of the
120
+ # sum of both. The skill context is built once by the caller and passed
121
+ # in, so no skill-dependent work is duplicated or moved into the baseline.
122
+ #
123
+ # @param evaluation [SkillBench::Models::Eval] The eval being run
124
+ # @param skills [Array<SkillBench::Models::Skill>] Resolved skills
125
+ # @param skill_context [String] Combined skill context, built once pre-fork
126
+ # @param provider [Object] The resolved provider
127
+ # @param config [Hash, nil] Provider config
128
+ # @return [Array(Hash, Hash)] Baseline and context outputs, in that order
129
+ def run_agents_concurrently(evaluation, skills, skill_context, provider, config)
130
+ runs = [
131
+ -> { run_baseline_agent(evaluation, provider, config) },
132
+ -> { run_context_agent(evaluation, skills, skill_context, provider, config) }
133
+ ]
134
+ Parallel.map(runs, in_threads: runs.size, &:call)
135
+ end
136
+
114
137
  def evaluate_and_record_trend(context)
115
138
  evaluation = context.evaluation
116
139
  provider = context.provider
@@ -133,11 +156,16 @@ module SkillBench
133
156
  trend_result = TrendRecorderService.call(result, eval_name, skill_names)
134
157
  return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
135
158
 
159
+ tokens = aggregate_usage(context.baseline_output, context.context_output)
160
+ cost = CostCalculator.call(usage: tokens, model: agent_model(config, provider))
161
+
136
162
  {
137
163
  success: true,
138
164
  eval_name: eval_name,
139
165
  skill_name: skill_names.join(', '),
140
166
  provider_name: provider.name,
167
+ tokens: tokens,
168
+ cost: cost,
141
169
  response: result[:response].merge(
142
170
  trend: trend_result[:trend],
143
171
  baseline_iterations: context.baseline_output[:iterations] || [],
@@ -145,6 +173,62 @@ module SkillBench
145
173
  )
146
174
  }
147
175
  end
176
+
177
+ # Sums the token usage of the baseline and context agent runs.
178
+ #
179
+ # Judge-side usage is not yet threaded through (the judge lives under the
180
+ # untouched `clients/` boundary), so this is scoped to agent usage.
181
+ #
182
+ # @param baseline_output [Hash] The baseline agent output (carries :usage).
183
+ # @param context_output [Hash] The context agent output (carries :usage).
184
+ # @return [Hash] Combined prompt/completion/total token counts.
185
+ def aggregate_usage(baseline_output, context_output)
186
+ add_usage(
187
+ add_usage(empty_usage, baseline_output[:usage]),
188
+ context_output[:usage]
189
+ )
190
+ end
191
+
192
+ # A zeroed token-usage accumulator.
193
+ #
194
+ # @return [Hash] Usage hash with prompt/completion/total token counts set to zero.
195
+ def empty_usage
196
+ { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }
197
+ end
198
+
199
+ # Adds one usage hash onto a running total.
200
+ #
201
+ # @param total [Hash] The running usage total.
202
+ # @param usage [Hash, nil] A run's usage hash (may be nil or empty).
203
+ # @return [Hash] A new summed usage hash.
204
+ def add_usage(total, usage)
205
+ usage ||= {}
206
+ {
207
+ prompt_tokens: total[:prompt_tokens] + token_count(usage, :prompt_tokens),
208
+ completion_tokens: total[:completion_tokens] + token_count(usage, :completion_tokens),
209
+ total_tokens: total[:total_tokens] + token_count(usage, :total_tokens)
210
+ }
211
+ end
212
+
213
+ # Reads a token count from a usage hash, tolerating string keys.
214
+ #
215
+ # @param usage [Hash] The usage hash.
216
+ # @param key [Symbol] The usage key (e.g. :prompt_tokens).
217
+ # @return [Integer] The token count, or zero when absent.
218
+ def token_count(usage, key)
219
+ (usage[key] || usage[key.to_s] || 0).to_i
220
+ end
221
+
222
+ # Resolves the model name used for pricing from config, falling back to the provider LLM.
223
+ #
224
+ # @param config [Hash, nil] Provider config.
225
+ # @param provider [Object] The resolved provider.
226
+ # @return [String] The model name (e.g. "gpt-4o").
227
+ def agent_model(config, provider)
228
+ return provider.llm unless config.is_a?(Hash)
229
+
230
+ config[:model] || config['model'] || provider.llm
231
+ end
148
232
  end
149
233
  end
150
234
  end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SkillBench
6
+ module Services
7
+ # Builds a compact JSON summary of a batch run for CI gating.
8
+ #
9
+ # Surfaces the aggregate pass/fail counts plus rolled-up token and cost
10
+ # usage and the single worst skill-vs-baseline delta across the batch, so
11
+ # a CI job can gate on (and archive) one machine-readable artifact.
12
+ class SummaryFormatter
13
+ # Format an aggregate batch envelope as a pretty JSON summary string.
14
+ #
15
+ # @param aggregate [Hash] Aggregate envelope with :results and :summary.
16
+ # @return [String] Pretty-printed JSON summary.
17
+ def self.format(aggregate)
18
+ new(aggregate).format
19
+ end
20
+
21
+ # @param aggregate [Hash] Aggregate envelope with :results and :summary.
22
+ def initialize(aggregate)
23
+ @results = aggregate[:results] || []
24
+ @summary = aggregate[:summary] || {}
25
+ end
26
+
27
+ # Builds the JSON summary document.
28
+ #
29
+ # @return [String] Pretty-printed JSON summary.
30
+ def format
31
+ JSON.pretty_generate(
32
+ passed: summary[:passed],
33
+ failed: summary[:failed],
34
+ total: summary[:total],
35
+ tokens: total_tokens,
36
+ cost: total_cost,
37
+ worst_delta: worst_delta
38
+ )
39
+ end
40
+
41
+ private
42
+
43
+ attr_reader :results, :summary
44
+
45
+ # Sums total_tokens across every result, treating missing usage as 0.
46
+ #
47
+ # @return [Integer] Aggregate token count.
48
+ def total_tokens
49
+ results.sum { |result| tokens_for(result) }
50
+ end
51
+
52
+ # Reads a single result's total token count.
53
+ #
54
+ # @param result [Hash] A single-eval result envelope.
55
+ # @return [Integer] total_tokens, or 0 when absent.
56
+ def tokens_for(result)
57
+ tokens = result[:tokens] || {}
58
+ tokens[:total_tokens] || tokens['total_tokens'] || 0
59
+ end
60
+
61
+ # Sums non-nil per-result costs.
62
+ #
63
+ # @return [Float, nil] Total cost, or nil when no result reports a cost.
64
+ def total_cost
65
+ costs = results.filter_map { |result| result[:cost] }
66
+ costs.empty? ? nil : costs.sum
67
+ end
68
+
69
+ # Finds the eval with the smallest skill-vs-baseline delta.
70
+ #
71
+ # @return [Hash, nil] {:eval_name, :delta} for the worst eval, or nil
72
+ # when no result carries a delta report.
73
+ def worst_delta
74
+ scored = results.filter_map { |result| delta_entry(result) }
75
+ scored.min_by { |entry| entry[:delta] }
76
+ end
77
+
78
+ # Builds a {eval_name, delta} entry for a result with a delta report.
79
+ #
80
+ # @param result [Hash] A single-eval result envelope.
81
+ # @return [Hash, nil] Entry hash, or nil when the report lacks deltas.
82
+ def delta_entry(result)
83
+ report = result.dig(:response, :report)
84
+ return nil unless report.respond_to?(:context_total) && report.respond_to?(:baseline_total)
85
+
86
+ { eval_name: result[:eval_name], delta: report.context_total - report.baseline_total }
87
+ end
88
+ end
89
+ end
90
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'json'
4
+ require_relative '../dimension'
4
5
  require_relative 'template_registry/category_data'
5
6
 
6
7
  module SkillBench
@@ -21,6 +22,24 @@ module SkillBench
21
22
  TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
22
23
  CATEGORIES = REGISTRY.keys.freeze
23
24
 
25
+ # Score weight per core scoring dimension. Keyed by the canonical
26
+ # +SkillBench::DEFAULT_DIMENSIONS+ names so scaffolded criteria can never
27
+ # drift from the names the runtime loader requires; values sum to 100.
28
+ CRITERIA_DIMENSION_SCORES = {
29
+ 'correctness' => 30,
30
+ 'skill_adherence' => 25,
31
+ 'code_quality' => 20,
32
+ 'test_coverage' => 15,
33
+ 'documentation' => 10
34
+ }.freeze
35
+
36
+ # Canonical dimension descriptions keyed by name, sourced from the runtime defaults.
37
+ CORE_DIMENSION_DESCRIPTIONS = SkillBench::DEFAULT_DIMENSIONS.to_h { |dimension| [dimension.name, dimension.description] }.freeze
38
+
39
+ # Top-level thresholds emitted with scaffolded criteria.
40
+ CRITERIA_PASS_THRESHOLD = 70
41
+ CRITERIA_MINIMUM_DELTA = 10
42
+
24
43
  # @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
25
44
  # @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
26
45
  # @param variables [Hash{Symbol, String => String}] Variables for interpolation
@@ -105,21 +124,36 @@ module SkillBench
105
124
  MARKDOWN
106
125
  end
107
126
 
127
+ # Builds runtime-loadable scoring criteria for the category.
128
+ #
129
+ # Emits the five core dimensions required by {SkillBench::Criteria}
130
+ # (+correctness+, +skill_adherence+, +code_quality+, +test_coverage+,
131
+ # +documentation+) with integer +max_score+ values summing to 100, plus
132
+ # the top-level +pass_threshold+ and +minimum_delta+ the loader expects.
133
+ # Category-specific flavor lives only in the dimension descriptions.
134
+ #
135
+ # @return [String] Pretty-printed criteria JSON.
108
136
  def build_criteria_json
109
137
  JSON.pretty_generate(
110
138
  category: category.to_s,
111
- dimensions: [
112
- { name: 'correctness', weight: 30, pass_threshold: 70 },
113
- { name: 'adherence', weight: 25, pass_threshold: 60 },
114
- { name: 'quality', weight: 20, pass_threshold: 60 },
115
- { name: 'tests', weight: 15, pass_threshold: 80 },
116
- { name: 'docs', weight: 10, pass_threshold: 50 }
117
- ],
118
- minimum_delta: 5,
119
- category_specific: category_data.criteria
139
+ dimensions: criteria_dimensions,
140
+ pass_threshold: CRITERIA_PASS_THRESHOLD,
141
+ minimum_delta: CRITERIA_MINIMUM_DELTA
120
142
  )
121
143
  end
122
144
 
145
+ # @return [Array<Hash>] Core dimensions with integer +max_score+ summing to 100.
146
+ def criteria_dimensions
147
+ focus = category_data.criteria[:focus]
148
+ CRITERIA_DIMENSION_SCORES.map do |name, max_score|
149
+ {
150
+ name: name,
151
+ max_score: max_score,
152
+ description: "#{CORE_DIMENSION_DESCRIPTIONS.fetch(name)} (#{category} focus: #{focus})"
153
+ }
154
+ end
155
+ end
156
+
123
157
  def build_skill_md
124
158
  <<~MARKDOWN
125
159
  # Skill: {{skill_name}} (#{category})
@@ -6,6 +6,13 @@ module SkillBench
6
6
  module Services
7
7
  # Records evaluation results and computes trends.
8
8
  class TrendRecorderService
9
+ # Serializes the load -> append -> write of the shared trend history
10
+ # file. Batch runs ({BatchRunnerService}) execute evals concurrently and
11
+ # the trend file is process-global shared state; without this lock,
12
+ # concurrent records race on the temp-file rename and silently lose
13
+ # appended entries.
14
+ WRITE_MUTEX = Mutex.new
15
+
9
16
  # Records evaluation results and computes trends.
10
17
  #
11
18
  # @param result [Hash] The evaluation result from Evaluation::Runner
@@ -27,12 +34,14 @@ module SkillBench
27
34
 
28
35
  # Records evaluation results and computes trends.
29
36
  #
37
+ # Loads the trend history once and reuses it for both the trend
38
+ # computation and the append+write, avoiding a duplicate parse per run.
39
+ #
30
40
  # @return [Hash] Result with success status and trend data
31
41
  def call
32
42
  tracker = TrendTracker.new
33
43
  enriched = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
34
- trend = tracker.trend_for(enriched)
35
- record_result = tracker.record(enriched)
44
+ trend, record_result = record_atomically(tracker, enriched)
36
45
 
37
46
  record_success = record_result.is_a?(Hash) && record_result[:success]
38
47
  unless record_success
@@ -62,6 +71,24 @@ module SkillBench
62
71
  SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
63
72
  { success: false, response: { error: { message: e.message } } }
64
73
  end
74
+
75
+ private
76
+
77
+ # Loads history, computes the trend, and records the entry while holding
78
+ # {WRITE_MUTEX}, so concurrent batch evals serialize their read-modify-
79
+ # write of the shared trend file. History is still loaded exactly once
80
+ # per run and reused for both the trend computation and the append.
81
+ #
82
+ # @param tracker [SkillBench::TrendTracker] The trend tracker
83
+ # @param enriched [Hash] Result enriched with eval_name and skill_names
84
+ # @return [Array(Hash, Hash)] The computed trend and the record result
85
+ def record_atomically(tracker, enriched)
86
+ WRITE_MUTEX.synchronize do
87
+ history = tracker.history
88
+ trend = tracker.trend_for(enriched, history)
89
+ [trend, tracker.record(enriched, history)]
90
+ end
91
+ end
65
92
  end
66
93
  end
67
94
  end
@@ -8,15 +8,41 @@ module SkillBench
8
8
  module Tools
9
9
  # Registry for all available tools, providing their definitions to the LLM.
10
10
  class Registry
11
- # Returns an array of tool definitions in the format expected by the LLM API.
11
+ # Recursively deep-freezes a tool-definition value (Hash/Array and contents)
12
+ # so accidental mutation by a downstream consumer raises immediately.
12
13
  #
13
- # @return [Array<Hash>] The list of available tools with their names, descriptions, and schemas.
14
- def self.definitions
14
+ # @param value [Object] The value to deep-freeze in place.
15
+ # @return [Object] The same value, frozen along with everything it contains.
16
+ def self.deep_freeze(value)
17
+ children = case value
18
+ when Hash then value.values
19
+ when Array then value
20
+ else []
21
+ end
22
+ children.each { |child| deep_freeze(child) }
23
+ value.freeze
24
+ end
25
+ private_class_method :deep_freeze
26
+
27
+ # The static tool definitions sent to the LLM API. The tool schemas are
28
+ # constant JSON-schema specs (no per-call state or runtime config), so the
29
+ # array and its nested hashes are built once and deep-frozen for reuse
30
+ # across every ReAct step instead of being reallocated on each call.
31
+ #
32
+ # @return [Array<Hash>] Frozen list of tools with their names, descriptions, and schemas.
33
+ DEFINITIONS = deep_freeze(
15
34
  [
16
35
  ReadFile.definition,
17
36
  WriteFile.definition,
18
37
  RunCommand.definition
19
38
  ]
39
+ )
40
+
41
+ # Returns the memoized, frozen array of tool definitions for the LLM API.
42
+ #
43
+ # @return [Array<Hash>] The frozen list of available tools with their names, descriptions, and schemas.
44
+ def self.definitions
45
+ DEFINITIONS
20
46
  end
21
47
  end
22
48
  end