ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +166 -35
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  9. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  10. data/lib/skill_bench/cli/help_printer.rb +10 -2
  11. data/lib/skill_bench/cli/init_command.rb +2 -1
  12. data/lib/skill_bench/cli/result_printer.rb +1 -1
  13. data/lib/skill_bench/cli/run_command.rb +47 -9
  14. data/lib/skill_bench/cli/validate_command.rb +242 -0
  15. data/lib/skill_bench/cli.rb +3 -0
  16. data/lib/skill_bench/client.rb +43 -1
  17. data/lib/skill_bench/clients/all.rb +2 -0
  18. data/lib/skill_bench/clients/base_client.rb +12 -1
  19. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  20. data/lib/skill_bench/clients/provider_config.rb +34 -1
  21. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  22. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  23. data/lib/skill_bench/commands/init.rb +5 -0
  24. data/lib/skill_bench/commands/skill_new.rb +3 -1
  25. data/lib/skill_bench/config/applier.rb +2 -0
  26. data/lib/skill_bench/config/defaults.rb +2 -0
  27. data/lib/skill_bench/config/facade_readers.rb +7 -0
  28. data/lib/skill_bench/config/facade_writers.rb +17 -0
  29. data/lib/skill_bench/config/json_loader.rb +1 -1
  30. data/lib/skill_bench/config/store.rb +29 -0
  31. data/lib/skill_bench/config.rb +18 -0
  32. data/lib/skill_bench/evaluation/runner.rb +20 -3
  33. data/lib/skill_bench/execution/context_hydrator.rb +52 -11
  34. data/lib/skill_bench/execution/sandbox.rb +58 -11
  35. data/lib/skill_bench/judge/judge.rb +4 -0
  36. data/lib/skill_bench/judge/prompt.rb +42 -6
  37. data/lib/skill_bench/models/config.rb +32 -0
  38. data/lib/skill_bench/output_formatter.rb +60 -1
  39. data/lib/skill_bench/package_verifier.rb +1 -1
  40. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  41. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  42. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  43. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  44. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  45. data/lib/skill_bench/services/html_formatter.rb +289 -0
  46. data/lib/skill_bench/services/json_formatter.rb +19 -1
  47. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  48. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  49. data/lib/skill_bench/services/response_cache.rb +130 -0
  50. data/lib/skill_bench/services/runner_service.rb +88 -4
  51. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  52. data/lib/skill_bench/services/template_registry.rb +43 -9
  53. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  54. data/lib/skill_bench/tools/registry.rb +29 -3
  55. data/lib/skill_bench/tools/run_command.rb +171 -19
  56. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  57. data/lib/skill_bench/trend_tracker.rb +5 -5
  58. data/lib/skill_bench/version.rb +1 -1
  59. data/lib/skill_bench.rb +2 -3
  60. metadata +17 -36
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'parallel'
3
4
  require_relative '../evaluation/runner'
4
5
  require_relative 'eval_resolver'
5
6
  require_relative 'skill_resolver_service'
@@ -10,6 +11,7 @@ require_relative 'context_loader_service'
10
11
  require_relative 'judge_params_builder'
11
12
  require_relative 'error_response_builder'
12
13
  require_relative 'trend_recorder_service'
14
+ require_relative 'cost_calculator'
13
15
  require_relative 'output_formatter'
14
16
 
15
17
  module SkillBench
@@ -61,13 +63,11 @@ module SkillBench
61
63
  provider = provider_result[:provider]
62
64
  config = provider_result[:config]
63
65
 
64
- baseline_output = run_baseline_agent(evaluation, provider, config)
65
- return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
66
-
67
66
  skill_context = ContextLoaderService.call(skills)
68
67
  return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?
69
68
 
70
- context_output = run_context_agent(evaluation, skills, skill_context, provider, config)
69
+ baseline_output, context_output = run_agents_concurrently(evaluation, skills, skill_context, provider, config)
70
+ return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
71
71
  return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error
72
72
 
73
73
  context = EvaluationContext.new(
@@ -111,6 +111,29 @@ module SkillBench
111
111
  AgentSpawnerService.call(evaluation, context_prompt, provider, config)
112
112
  end
113
113
 
114
# Executes the baseline run and the skill-context run side by side.
#
# Each run is wrapped in a lambda and both are handed to `Parallel.map`
# with one thread per run. The skill context is received as an argument,
# so it is not rebuilt inside either run.
#
# @param evaluation [SkillBench::Models::Eval] The eval being run
# @param skills [Array<SkillBench::Models::Skill>] Resolved skills
# @param skill_context [String] Combined skill context, built by the caller before the threads start
# @param provider [Object] The resolved provider
# @param config [Hash, nil] Provider config
# @return [Array(Hash, Hash)] Baseline and context outputs, in that order
def run_agents_concurrently(evaluation, skills, skill_context, provider, config)
  baseline_job = -> { run_baseline_agent(evaluation, provider, config) }
  context_job = -> { run_context_agent(evaluation, skills, skill_context, provider, config) }
  jobs = [baseline_job, context_job]

  Parallel.map(jobs, in_threads: jobs.size) { |job| job.call }
end
136
+
114
137
  def evaluate_and_record_trend(context)
115
138
  evaluation = context.evaluation
116
139
  provider = context.provider
@@ -133,11 +156,16 @@ module SkillBench
133
156
  trend_result = TrendRecorderService.call(result, eval_name, skill_names)
134
157
  return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
135
158
 
159
+ tokens = aggregate_usage(context.baseline_output, context.context_output)
160
+ cost = CostCalculator.call(usage: tokens, model: agent_model(config, provider))
161
+
136
162
  {
137
163
  success: true,
138
164
  eval_name: eval_name,
139
165
  skill_name: skill_names.join(', '),
140
166
  provider_name: provider.name,
167
+ tokens: tokens,
168
+ cost: cost,
141
169
  response: result[:response].merge(
142
170
  trend: trend_result[:trend],
143
171
  baseline_iterations: context.baseline_output[:iterations] || [],
@@ -145,6 +173,62 @@ module SkillBench
145
173
  )
146
174
  }
147
175
  end
176
+
177
# Sums the token usage of the baseline and context agent runs.
#
# Only the two agent outputs are combined here; judge-side usage is not
# passed in, so the result is scoped to agent usage.
#
# @param baseline_output [Hash] The baseline agent output (read via :usage).
# @param context_output [Hash] The context agent output (read via :usage).
# @return [Hash] Combined prompt/completion/total token counts.
def aggregate_usage(baseline_output, context_output)
  [baseline_output, context_output].reduce(empty_usage) do |total, output|
    add_usage(total, output[:usage])
  end
end

# A zeroed token-usage accumulator.
#
# @return [Hash] Usage hash with prompt/completion/total token counts set to zero.
def empty_usage
  { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }
end

# Adds one usage hash onto a running total.
#
# When the run's usage does not report `total_tokens` at all, the run's
# total is derived as prompt + completion so the aggregate total cannot
# silently stay at zero while the other two counters grow. An explicitly
# reported total (including 0) is always used as-is.
#
# @param total [Hash] The running usage total.
# @param usage [Hash, nil] A run's usage hash (may be nil or empty).
# @return [Hash] A new summed usage hash.
def add_usage(total, usage)
  usage ||= {}
  prompt = token_count(usage, :prompt_tokens)
  completion = token_count(usage, :completion_tokens)
  reported_total = usage[:total_tokens] || usage['total_tokens']
  run_total = reported_total.nil? ? prompt + completion : reported_total.to_i

  {
    prompt_tokens: total[:prompt_tokens] + prompt,
    completion_tokens: total[:completion_tokens] + completion,
    total_tokens: total[:total_tokens] + run_total
  }
end

# Reads a token count from a usage hash, tolerating string keys.
#
# The value is coerced with `to_i`, so numeric strings are accepted.
#
# @param usage [Hash] The usage hash.
# @param key [Symbol] The usage key (e.g. :prompt_tokens).
# @return [Integer] The token count, or zero when absent.
def token_count(usage, key)
  (usage[key] || usage[key.to_s] || 0).to_i
end
221
+
222
# Picks the model name handed to the cost calculation.
#
# A Hash config is consulted first, under the :model symbol key and then
# the 'model' string key; anything else (including a nil config) falls
# back to the provider's `llm`.
#
# @param config [Hash, nil] Provider config.
# @param provider [Object] The resolved provider (must respond to `llm`).
# @return [String] The model name (e.g. "gpt-4o").
def agent_model(config, provider)
  configured = config.is_a?(Hash) ? (config[:model] || config['model']) : nil
  configured || provider.llm
end
148
232
  end
149
233
  end
150
234
  end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module SkillBench
  module Services
    # Builds a compact JSON summary of a batch run for CI gating.
    #
    # The summary carries the pass/fail/total counts from the aggregate's
    # :summary, the token and cost totals rolled up from :results, and the
    # single smallest skill-vs-baseline delta found across the results.
    class SummaryFormatter
      # Format an aggregate batch envelope as a pretty JSON summary string.
      #
      # @param aggregate [Hash] Aggregate envelope with :results and :summary.
      # @return [String] Pretty-printed JSON summary.
      def self.format(aggregate)
        new(aggregate).format
      end

      # @param aggregate [Hash] Aggregate envelope with :results and :summary;
      #   either key may be missing and then defaults to empty.
      def initialize(aggregate)
        @results = aggregate[:results] || []
        @summary = aggregate[:summary] || {}
      end

      # Builds the JSON summary document.
      #
      # @return [String] Pretty-printed JSON summary.
      def format
        JSON.pretty_generate(
          passed: summary[:passed],
          failed: summary[:failed],
          total: summary[:total],
          tokens: total_tokens,
          cost: total_cost,
          worst_delta: worst_delta
        )
      end

      private

      attr_reader :results, :summary

      # Sums total_tokens across every result, treating missing usage as 0.
      #
      # @return [Integer] Aggregate token count.
      def total_tokens
        results.sum { |result| tokens_for(result) }
      end

      # Reads a single result's total token count.
      #
      # The count is coerced with `to_i` so a string-valued count cannot
      # break the integer sum, and a non-Hash :tokens value counts as 0.
      #
      # @param result [Hash] A single-eval result envelope.
      # @return [Integer] total_tokens, or 0 when absent.
      def tokens_for(result)
        tokens = result[:tokens]
        return 0 unless tokens.is_a?(Hash)

        (tokens[:total_tokens] || tokens['total_tokens'] || 0).to_i
      end

      # Sums non-nil per-result costs.
      #
      # @return [Float, nil] Total cost, or nil when no result reports a cost.
      def total_cost
        costs = results.filter_map { |result| result[:cost] }
        costs.empty? ? nil : costs.sum
      end

      # Finds the eval with the smallest skill-vs-baseline delta.
      #
      # @return [Hash, nil] {:eval_name, :delta} for the worst eval, or nil
      #   when no result carries a usable delta report.
      def worst_delta
        results.filter_map { |result| delta_entry(result) }.min_by { |entry| entry[:delta] }
      end

      # Builds a {eval_name, delta} entry for a result with a delta report.
      #
      # A report whose totals are not both numeric is skipped instead of
      # raising, so one incomplete report cannot abort the whole summary.
      #
      # @param result [Hash] A single-eval result envelope.
      # @return [Hash, nil] Entry hash, or nil when the report lacks deltas.
      def delta_entry(result)
        report = result.dig(:response, :report)
        return nil unless report.respond_to?(:context_total) && report.respond_to?(:baseline_total)

        context_total = report.context_total
        baseline_total = report.baseline_total
        return nil unless context_total.is_a?(Numeric) && baseline_total.is_a?(Numeric)

        { eval_name: result[:eval_name], delta: context_total - baseline_total }
      end
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'json'
4
+ require_relative '../dimension'
4
5
  require_relative 'template_registry/category_data'
5
6
 
6
7
  module SkillBench
@@ -21,6 +22,24 @@ module SkillBench
21
22
  TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
22
23
  CATEGORIES = REGISTRY.keys.freeze
23
24
 
25
+ # Score weight per core scoring dimension. Keyed by the canonical
26
+ # +SkillBench::DEFAULT_DIMENSIONS+ names so scaffolded criteria can never
27
+ # drift from the names the runtime loader requires; values sum to 100.
28
+ CRITERIA_DIMENSION_SCORES = {
29
+ 'correctness' => 30,
30
+ 'skill_adherence' => 25,
31
+ 'code_quality' => 20,
32
+ 'test_coverage' => 15,
33
+ 'documentation' => 10
34
+ }.freeze
35
+
36
+ # Canonical dimension descriptions keyed by name, sourced from the runtime defaults.
37
+ CORE_DIMENSION_DESCRIPTIONS = SkillBench::DEFAULT_DIMENSIONS.to_h { |dimension| [dimension.name, dimension.description] }.freeze
38
+
39
+ # Top-level thresholds emitted with scaffolded criteria.
40
+ CRITERIA_PASS_THRESHOLD = 70
41
+ CRITERIA_MINIMUM_DELTA = 10
42
+
24
43
  # @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
25
44
  # @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
26
45
  # @param variables [Hash{Symbol, String => String}] Variables for interpolation
@@ -105,21 +124,36 @@ module SkillBench
105
124
  MARKDOWN
106
125
  end
107
126
 
127
# Renders the scaffolded scoring criteria for the category as JSON.
#
# The document holds the category name, the dimension entries produced by
# +criteria_dimensions+, and the top-level +pass_threshold+ and
# +minimum_delta+ taken from CRITERIA_PASS_THRESHOLD and
# CRITERIA_MINIMUM_DELTA.
#
# @return [String] Pretty-printed criteria JSON.
def build_criteria_json
  document = {
    category: category.to_s,
    dimensions: criteria_dimensions,
    pass_threshold: CRITERIA_PASS_THRESHOLD,
    minimum_delta: CRITERIA_MINIMUM_DELTA
  }
  JSON.pretty_generate(document)
end
122
144
 
145
# Builds one entry per CRITERIA_DIMENSION_SCORES pair. Each description is
# the canonical text from CORE_DIMENSION_DESCRIPTIONS followed by the
# category name and the category's :focus value in parentheses.
#
# @return [Array<Hash>] Dimension hashes with :name, :max_score and :description.
def criteria_dimensions
  focus = category_data.criteria[:focus]
  focus_note = "(#{category} focus: #{focus})"

  CRITERIA_DIMENSION_SCORES.map do |name, max_score|
    base_description = CORE_DIMENSION_DESCRIPTIONS.fetch(name)
    { name: name, max_score: max_score, description: "#{base_description} #{focus_note}" }
  end
end
156
+
123
157
  def build_skill_md
124
158
  <<~MARKDOWN
125
159
  # Skill: {{skill_name}} (#{category})
@@ -6,6 +6,13 @@ module SkillBench
6
6
  module Services
7
7
  # Records evaluation results and computes trends.
8
8
  class TrendRecorderService
9
+ # Serializes the load -> append -> write of the shared trend history
10
+ # file. Batch runs ({BatchRunnerService}) execute evals concurrently and
11
+ # the trend file is process-global shared state; without this lock,
12
+ # concurrent records race on the temp-file rename and silently lose
13
+ # appended entries.
14
+ WRITE_MUTEX = Mutex.new
15
+
9
16
  # Records evaluation results and computes trends.
10
17
  #
11
18
  # @param result [Hash] The evaluation result from Evaluation::Runner
@@ -27,12 +34,14 @@ module SkillBench
27
34
 
28
35
  # Records evaluation results and computes trends.
29
36
  #
37
+ # Loads the trend history once and reuses it for both the trend
38
+ # computation and the append+write, avoiding a duplicate parse per run.
39
+ #
30
40
  # @return [Hash] Result with success status and trend data
31
41
  def call
32
42
  tracker = TrendTracker.new
33
43
  enriched = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
34
- trend = tracker.trend_for(enriched)
35
- record_result = tracker.record(enriched)
44
+ trend, record_result = record_atomically(tracker, enriched)
36
45
 
37
46
  record_success = record_result.is_a?(Hash) && record_result[:success]
38
47
  unless record_success
@@ -62,6 +71,24 @@ module SkillBench
62
71
  SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
63
72
  { success: false, response: { error: { message: e.message } } }
64
73
  end
74
+
75
+ private
76
+
77
# Reads the history, computes the trend and records the entry as one
# critical section guarded by WRITE_MUTEX, so threads in this process
# cannot interleave their read-modify-write of the trend history. The
# history is fetched once and passed to both tracker calls.
#
# @param tracker [SkillBench::TrendTracker] The trend tracker
# @param enriched [Hash] Result enriched with eval_name and skill_names
# @return [Array(Hash, Hash)] The computed trend and the record result
def record_atomically(tracker, enriched)
  WRITE_MUTEX.synchronize do
    history = tracker.history
    trend = tracker.trend_for(enriched, history)
    record_result = tracker.record(enriched, history)
    [trend, record_result]
  end
end
65
92
  end
66
93
  end
67
94
  end
@@ -8,15 +8,41 @@ module SkillBench
8
8
  module Tools
9
9
  # Registry for all available tools, providing their definitions to the LLM.
10
10
  class Registry
11
- # Returns an array of tool definitions in the format expected by the LLM API.
11
# Recursively deep-freezes a tool-definition value so accidental mutation
# by a downstream consumer raises immediately.
#
# Arrays have every element frozen; hashes have every key and every value
# frozen (keys included, so a mutable non-String key cannot be altered
# behind the hash's back). Any other value is simply frozen.
#
# @param value [Object] The value to deep-freeze in place.
# @return [Object] The same value, frozen along with everything it contains.
def self.deep_freeze(value)
  case value
  when Hash
    value.each do |key, child|
      deep_freeze(key)
      deep_freeze(child)
    end
  when Array
    value.each { |child| deep_freeze(child) }
  end
  value.freeze
end
25
+ private_class_method :deep_freeze
26
+
27
# Static tool definitions sent to the LLM API: the `definition` of
# ReadFile, WriteFile and RunCommand, in that order. The array is built
# once when the class body is evaluated and passed through deep_freeze,
# so every caller shares the same frozen objects.
#
# @return [Array<Hash>] Frozen list of tools with their names, descriptions, and schemas.
DEFINITIONS = deep_freeze([ReadFile, WriteFile, RunCommand].map(&:definition))

# Returns the shared, frozen array of tool definitions for the LLM API.
#
# @return [Array<Hash>] The frozen list of available tools with their names, descriptions, and schemas.
def self.definitions
  DEFINITIONS
end
21
47
  end
22
48
  end
@@ -1,15 +1,35 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'open3'
4
- require 'timeout'
5
4
  require 'shellwords'
6
5
  require_relative '../config'
7
6
  require_relative '../constants'
7
+ require_relative '../error_logger'
8
8
 
9
9
  module SkillBench
10
10
  module Tools
11
11
  # Handles executing a shell command within the working directory.
12
+ #
13
+ # Real container isolation is not yet shipped, so an active sandbox means a
14
+ # temporary git directory on the host. To honor the documented security
15
+ # model the tool fails closed: when no container isolation is active it
16
+ # refuses to run unless `allow_host_execution` is explicitly enabled.
12
17
  class RunCommand
18
+ # Refusal returned when no container isolation is active and host execution
19
+ # has not been explicitly enabled. Deliberately omits the allowlist.
20
+ HOST_EXECUTION_REFUSED = 'Command execution refused: no sandbox isolation is active and ' \
21
+ "'allow_host_execution' is not enabled. Set \"allow_host_execution\": true in " \
22
+ 'skill-bench.json to permit running commands directly on the host (NOT isolated).'
23
+
24
+ # Warning emitted when a command runs un-isolated on the host because
25
+ # `allow_host_execution` is enabled and no container is active.
26
+ HOST_EXECUTION_WARNING = 'Warning: running command directly on the host with NO sandbox isolation ' \
27
+ '(allow_host_execution is enabled). Commands are not isolated from your machine.'
28
+
29
+ # Seconds to wait after SIGTERM before escalating to SIGKILL when a command
30
+ # exceeds its execution deadline.
31
+ TERM_GRACE_PERIOD = 2
32
+
13
33
  # @return [Hash] The tool definition for the LLM API.
14
34
  def self.definition
15
35
  {
@@ -34,11 +54,16 @@ module SkillBench
34
54
  # Tokenizes the command string before execution so that arguments are passed
35
55
  # directly to the OS without shell interpretation, preventing shell injection.
36
56
  #
57
+ # Fails closed: when no container isolation is active (`container_id` is nil)
58
+ # and `allow_host_execution` is false, the command is refused and nothing
59
+ # runs. When host execution is explicitly allowed, a warning is emitted once
60
+ # per command before running un-isolated on the host.
61
+ #
37
62
  # @param command [String] The command to run (e.g. "rspec spec/models").
38
63
  # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
39
64
  # @param container_id [String, nil] The Docker container ID for isolated execution.
40
- # @return [String] A formatted string containing the exit status, STDOUT, and STDERR.
41
- # @raise [Timeout::Error] Internally rescued; returns a timeout message string.
65
+ # @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
66
+ # or a standardized error/refusal message.
42
67
  def self.call(command, working_dir_path, container_id = nil)
43
68
  argv = command.shellsplit
44
69
  return 'Error: Empty command.' if argv.empty?
@@ -50,25 +75,152 @@ module SkillBench
50
75
  return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
51
76
  return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)
52
77
 
78
+ return "Error: Command '#{base_cmd}' arguments are not permitted by the configured argument constraints." unless arguments_permitted?(base_cmd, argv)
79
+
80
+ return HOST_EXECUTION_REFUSED unless container_id || SkillBench::Config.allow_host_execution
81
+
82
+ warn_unisolated_host_execution unless container_id
83
+ execute(argv, working_dir_path, container_id)
84
+ end
85
+
86
# Checks the command's arguments against the optional, per-command
# argument constraints from configuration.
#
# When no constraints are configured, or none apply to +base_cmd+, every
# argument is permitted. When a constraint exists for +base_cmd+, the
# command is refused if any argument (everything after the first token)
# contains one of the disallowed substrings.
#
# @param base_cmd [String] The base command (first token of the command).
# @param argv [Array<String>] The tokenized command and arguments.
# @return [Boolean] true when the arguments are permitted to run.
def self.arguments_permitted?(base_cmd, argv)
  constraints = SkillBench::Config.command_argument_constraints
  return true if constraints.nil? || constraints.empty?

  # Constraint keys may be strings or symbols, so look the command up under
  # both. Array() normalises a constraint written as a single bare value
  # (e.g. "git": "--exec") into a one-element list instead of crashing on
  # String#any?.
  disallowed = Array(constraints[base_cmd] || constraints[base_cmd.to_sym]).map(&:to_s)
  return true if disallowed.empty?

  argv.drop(1).none? { |arg| disallowed.any? { |pattern| arg.include?(pattern) } }
end
110
+ private_class_method :arguments_permitted?
111
+
112
# Runs the command under the configured execution timeout and renders the
# outcome as a string.
#
# The invocation (argv plus spawn options) comes from resolve_invocation
# and is run by capture, which reports `:timed_out` when the deadline from
# `SkillBench::Config.max_execution_time` is exceeded.
#
# @param argv [Array<String>] The tokenized command and arguments.
# @param working_dir_path [Pathname] The host directory for host execution.
# @param container_id [String, nil] The Docker container ID for isolated execution.
# @return [String] Formatted exit status, STDOUT, and STDERR, or a timeout message.
def self.execute(argv, working_dir_path, container_id)
  max_time = SkillBench::Config.max_execution_time
  command, spawn_opts = resolve_invocation(argv, working_dir_path, container_id)
  outcome = capture(command, spawn_opts, max_time)

  if outcome == :timed_out
    "Error: Command execution timed out after #{max_time} seconds."
  else
    stdout_str, stderr_str, status = outcome
    format_result(status, stdout_str, stderr_str)
  end
end
134
+ private_class_method :execute
135
+
136
# Formats the captured command output into the standard result string.
#
# `exitstatus` is nil when the process did not exit normally (for example
# when it was terminated by a signal); in that case the terminating signal
# number is reported instead of leaving the "Exit Status:" line blank.
#
# @param status [Process::Status] The exit status of the command.
# @param stdout_str [String] The captured standard output.
# @param stderr_str [String] The captured standard error.
# @return [String] Formatted exit status, STDOUT, and STDERR.
def self.format_result(status, stdout_str, stderr_str)
  exit_label = status.exitstatus || "terminated by signal #{status.termsig}"
  <<~RESULT
    Exit Status: #{exit_label}
    STDOUT:
    #{stdout_str}
    STDERR:
    #{stderr_str}
  RESULT
end
151
+ private_class_method :format_result
152
+
153
# Produces the argv and spawn options for a command.
#
# With a container id the command is wrapped in
# `docker exec -w /sandbox <container_id> ...`; otherwise the argv is used
# unchanged and `chdir` is set to the working directory. Both variants set
# `pgroup: true`.
#
# @param argv [Array<String>] The tokenized command and arguments.
# @param working_dir_path [Pathname] The host directory for host execution.
# @param container_id [String, nil] The Docker container ID for isolated execution.
# @return [Array(Array<String>, Hash)] The full command array and spawn options.
def self.resolve_invocation(argv, working_dir_path, container_id)
  if container_id
    docker_prefix = ['docker', 'exec', '-w', '/sandbox', container_id]
    [docker_prefix + argv, { pgroup: true }]
  else
    [argv, { chdir: working_dir_path.to_s, pgroup: true }]
  end
end
166
+ private_class_method :resolve_invocation
167
+
168
# Spawns the command, drains STDOUT/STDERR on separate threads, and
# enforces the deadline on both the process and the output drain.
#
# Waiting for the process alone is not enough: the reader threads only see
# EOF once every holder of the pipes has closed them, so a descendant that
# outlives the command would otherwise block this method indefinitely. The
# drain is therefore bounded by TERM_GRACE_PERIOD per reader; if it
# overruns, the process group is sent SIGKILL and the drain is retried
# once. If the pipes are still held after that, the readers are abandoned
# and the call is reported as timed out.
#
# @param command [Array<String>] The full command array (no shell).
# @param spawn_opts [Hash] Options passed to the spawner (includes `pgroup`).
# @param max_time [Numeric] Maximum execution time in seconds.
# @return [Array(String, String, Process::Status), Symbol] STDOUT, STDERR, and
#   status on completion, or `:timed_out` when the deadline is exceeded or
#   the output pipes can never be drained.
def self.capture(command, spawn_opts, max_time)
  Open3.popen3(*command, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
    stdin.close
    readers = [stdout, stderr].map { |io| Thread.new { io.read } }
    # Thread#join(limit) returns nil on timeout, so `all?` is true only when
    # every reader has reached EOF.
    drained = -> { readers.map { |reader| reader.join(TERM_GRACE_PERIOD) }.all? }

    completed = wait_thr.join(max_time)
    terminate_process_group(wait_thr) unless completed

    unless drained.call
      # Something in the group still holds the pipes open; kill the group.
      signal_group('KILL', wait_thr.pid)
      unless drained.call
        readers.each(&:kill)
        next :timed_out
      end
    end

    completed ? [*readers.map(&:value), wait_thr.value] : :timed_out
  end
end
187
+ private_class_method :capture
188
+
189
# Stops the command's process group in two steps: SIGTERM, then SIGKILL if
# the wait thread has not finished within TERM_GRACE_PERIOD seconds. The
# wait thread's pid is used as the process group id.
#
# @param wait_thr [Process::Waiter] The wait thread for the spawned process group leader.
# @return [void]
def self.terminate_process_group(wait_thr)
  group_id = wait_thr.pid
  signal_group('TERM', group_id)
  exited_in_time = wait_thr.join(TERM_GRACE_PERIOD)
  signal_group('KILL', group_id) unless exited_in_time
end
200
+ private_class_method :terminate_process_group
201
+
202
# Delivers a signal to every process in a group by passing the negated
# group id to `Process.kill`. A group that no longer exists
# (`Errno::ESRCH`) is treated as already gone and yields nil.
#
# @param signal [String] The signal name (e.g. "TERM", "KILL").
# @param pgid [Integer] The process group id (leader pid) to signal.
# @return [void]
def self.signal_group(signal, pgid)
  group_target = -pgid
  Process.kill(signal, group_target)
rescue Errno::ESRCH
  nil
end
212
+ private_class_method :signal_group
213
+
214
# Writes HOST_EXECUTION_WARNING via `warn`, unless
# `SkillBench::ErrorLogger.skip_stderr_output?` asks for stderr output to
# be suppressed.
#
# @return [void]
def self.warn_unisolated_host_execution
  warn(HOST_EXECUTION_WARNING) unless SkillBench::ErrorLogger.skip_stderr_output?
end
223
+ private_class_method :warn_unisolated_host_execution
72
224
  end
73
225
  end
74
226
  end