RubyGems - ruby-skill-bench - Versions diffs - 1.1.0 → 1.2.0 - Mend

ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +4 -4
data/README.md +166 -35
data/docs/architecture.md +3 -1
data/docs/first-eval-guide.md +7 -7
data/docs/testing-guide.md +1 -1
data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
data/lib/skill_bench/agent/react_agent/step.rb +7 -1
data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
data/lib/skill_bench/cli/help_printer.rb +10 -2
data/lib/skill_bench/cli/init_command.rb +2 -1
data/lib/skill_bench/cli/result_printer.rb +1 -1
data/lib/skill_bench/cli/run_command.rb +47 -9
data/lib/skill_bench/cli/validate_command.rb +242 -0
data/lib/skill_bench/cli.rb +3 -0
data/lib/skill_bench/client.rb +43 -1
data/lib/skill_bench/clients/all.rb +2 -0
data/lib/skill_bench/clients/base_client.rb +12 -1
data/lib/skill_bench/clients/base_url_validator.rb +105 -0
data/lib/skill_bench/clients/provider_config.rb +34 -1
data/lib/skill_bench/clients/provider_schemas.rb +4 -0
data/lib/skill_bench/clients/providers/mistral.rb +47 -0
data/lib/skill_bench/commands/init.rb +5 -0
data/lib/skill_bench/commands/skill_new.rb +3 -1
data/lib/skill_bench/config/applier.rb +2 -0
data/lib/skill_bench/config/defaults.rb +2 -0
data/lib/skill_bench/config/facade_readers.rb +7 -0
data/lib/skill_bench/config/facade_writers.rb +17 -0
data/lib/skill_bench/config/json_loader.rb +1 -1
data/lib/skill_bench/config/store.rb +29 -0
data/lib/skill_bench/config.rb +18 -0
data/lib/skill_bench/evaluation/runner.rb +20 -3
data/lib/skill_bench/execution/context_hydrator.rb +52 -11
data/lib/skill_bench/execution/sandbox.rb +58 -11
data/lib/skill_bench/judge/judge.rb +4 -0
data/lib/skill_bench/judge/prompt.rb +42 -6
data/lib/skill_bench/models/config.rb +32 -0
data/lib/skill_bench/output_formatter.rb +60 -1
data/lib/skill_bench/package_verifier.rb +1 -1
data/lib/skill_bench/rails/skill_templates.rb +19 -5
data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
data/lib/skill_bench/services/batch_runner_service.rb +111 -0
data/lib/skill_bench/services/compare_option_parser.rb +1 -0
data/lib/skill_bench/services/cost_calculator.rb +91 -0
data/lib/skill_bench/services/html_formatter.rb +289 -0
data/lib/skill_bench/services/json_formatter.rb +19 -1
data/lib/skill_bench/services/junit_formatter.rb +74 -24
data/lib/skill_bench/services/provider_resolver.rb +5 -2
data/lib/skill_bench/services/response_cache.rb +130 -0
data/lib/skill_bench/services/runner_service.rb +88 -4
data/lib/skill_bench/services/summary_formatter.rb +90 -0
data/lib/skill_bench/services/template_registry.rb +43 -9
data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
data/lib/skill_bench/tools/registry.rb +29 -3
data/lib/skill_bench/tools/run_command.rb +171 -19
data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
data/lib/skill_bench/trend_tracker.rb +5 -5
data/lib/skill_bench/version.rb +1 -1
data/lib/skill_bench.rb +2 -3
metadata +17 -36

data/lib/skill_bench/services/runner_service.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require 'parallel'
 require_relative '../evaluation/runner'
 require_relative 'eval_resolver'
 require_relative 'skill_resolver_service'
@@ -10,6 +11,7 @@ require_relative 'context_loader_service'
 require_relative 'judge_params_builder'
 require_relative 'error_response_builder'
 require_relative 'trend_recorder_service'
+require_relative 'cost_calculator'
 require_relative 'output_formatter'
 module SkillBench
@@ -61,13 +63,11 @@ module SkillBench
         provider = provider_result[:provider]
         config = provider_result[:config]
-        baseline_output = run_baseline_agent(evaluation, provider, config)
-        return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
         skill_context = ContextLoaderService.call(skills)
         return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?
-        context_output = run_context_agent(evaluation, skills, skill_context, provider, config)
+        baseline_output, context_output = run_agents_concurrently(evaluation, skills, skill_context, provider, config)
+        return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
         return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error
         context = EvaluationContext.new(
@@ -111,6 +111,29 @@ module SkillBench
         AgentSpawnerService.call(evaluation, context_prompt, provider, config)
       end
+      # Executes the baseline run and the skill-context run side by side.
+      #
+      # Both runs are handed to `Parallel.map` with one thread per run, so the
+      # agent phase is expected to take about as long as the slower run rather
+      # than the sum of the two. The skill context is built once by the caller
+      # and passed in; only the context run receives it, so nothing
+      # skill-dependent reaches the baseline run.
+      #
+      # @param evaluation [SkillBench::Models::Eval] The eval being run
+      # @param skills [Array<SkillBench::Models::Skill>] Resolved skills
+      # @param skill_context [String] Combined skill context, built once by the caller
+      # @param provider [Object] The resolved provider
+      # @param config [Hash, nil] Provider config
+      # @return [Array(Hash, Hash)] Baseline and context outputs, in that order
+      def run_agents_concurrently(evaluation, skills, skill_context, provider, config)
+        baseline_run = -> { run_baseline_agent(evaluation, provider, config) }
+        context_run = -> { run_context_agent(evaluation, skills, skill_context, provider, config) }
+        Parallel.map([baseline_run, context_run], in_threads: 2, &:call)
+      end
       def evaluate_and_record_trend(context)
         evaluation = context.evaluation
         provider = context.provider
@@ -133,11 +156,16 @@ module SkillBench
         trend_result = TrendRecorderService.call(result, eval_name, skill_names)
         return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
+        tokens = aggregate_usage(context.baseline_output, context.context_output)
+        cost = CostCalculator.call(usage: tokens, model: agent_model(config, provider))
         {
           success: true,
           eval_name: eval_name,
           skill_name: skill_names.join(', '),
           provider_name: provider.name,
+          tokens: tokens,
+          cost: cost,
           response: result[:response].merge(
             trend: trend_result[:trend],
             baseline_iterations: context.baseline_output[:iterations] || [],
@@ -145,6 +173,62 @@ module SkillBench
           )
         }
       end
+      # Combines the token usage reported by the baseline and context agent runs.
+      #
+      # Only agent-side usage is counted here; judge-side usage is not part of
+      # this total.
+      #
+      # @param baseline_output [Hash] The baseline agent output (carries :usage).
+      # @param context_output [Hash] The context agent output (carries :usage).
+      # @return [Hash] Combined prompt/completion/total token counts.
+      def aggregate_usage(baseline_output, context_output)
+        [baseline_output, context_output].reduce(empty_usage) do |running_total, output|
+          add_usage(running_total, output[:usage])
+        end
+      end
+      # Builds a fresh usage accumulator with every counter at zero.
+      #
+      # @return [Hash] Usage hash with prompt/completion/total token counts set to zero.
+      def empty_usage
+        %i[prompt_tokens completion_tokens total_tokens].to_h { |counter| [counter, 0] }
+      end
+      # Returns a new usage hash equal to the running total plus one run's usage.
+      #
+      # @param total [Hash] The running usage total.
+      # @param usage [Hash, nil] A run's usage hash (may be nil or empty).
+      # @return [Hash] A new summed usage hash; neither argument is mutated.
+      def add_usage(total, usage)
+        reported = usage || {}
+        %i[prompt_tokens completion_tokens total_tokens].to_h do |counter|
+          [counter, total[counter] + token_count(reported, counter)]
+        end
+      end
+      # Looks up one counter in a usage hash under its symbol key first, then
+      # under the equivalent string key, and coerces the value with `to_i`.
+      #
+      # @param usage [Hash] The usage hash.
+      # @param key [Symbol] The usage key (e.g. :prompt_tokens).
+      # @return [Integer] The token count, or zero when absent.
+      def token_count(usage, key)
+        raw_count = usage[key] || usage[key.to_s]
+        raw_count ? raw_count.to_i : 0
+      end
+      # Picks the model name used for pricing: the `model` entry of a Hash
+      # config (symbol key first, then string key), otherwise the provider's LLM.
+      #
+      # @param config [Hash, nil] Provider config.
+      # @param provider [Object] The resolved provider.
+      # @return [String] The model name (e.g. "gpt-4o").
+      def agent_model(config, provider)
+        configured_model = config.is_a?(Hash) ? (config[:model] || config['model']) : nil
+        configured_model || provider.llm
+      end
     end
   end
 end

data/lib/skill_bench/services/summary_formatter.rb ADDED Viewed

@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+require 'json'
+module SkillBench
+  module Services
+    # Builds a compact JSON summary of a batch run for CI gating.
+    #
+    # Surfaces the aggregate pass/fail counts plus rolled-up token and cost
+    # usage and the single worst skill-vs-baseline delta across the batch, so
+    # a CI job can gate on (and archive) one machine-readable artifact.
+    class SummaryFormatter
+      # Format an aggregate batch envelope as a pretty JSON summary string.
+      #
+      # @param aggregate [Hash] Aggregate envelope with :results and :summary.
+      # @return [String] Pretty-printed JSON summary.
+      def self.format(aggregate)
+        new(aggregate).format
+      end
+      # @param aggregate [Hash] Aggregate envelope with :results and :summary;
+      #   either key may be absent, in which case an empty default is used.
+      def initialize(aggregate)
+        @results = aggregate[:results] || []
+        @summary = aggregate[:summary] || {}
+      end
+      # Builds the JSON summary document.
+      #
+      # @return [String] Pretty-printed JSON summary.
+      def format
+        JSON.pretty_generate(
+          passed: summary[:passed],
+          failed: summary[:failed],
+          total: summary[:total],
+          tokens: total_tokens,
+          cost: total_cost,
+          worst_delta: worst_delta
+        )
+      end
+      private
+      attr_reader :results, :summary
+      # Sums total_tokens across every result, treating missing usage as 0.
+      #
+      # @return [Integer] Aggregate token count.
+      def total_tokens
+        results.sum { |result| tokens_for(result) }
+      end
+      # Reads a single result's total token count.
+      #
+      # The value is coerced with `to_i` so a count that arrives as a String or
+      # Float cannot make the batch-wide `sum` raise.
+      #
+      # @param result [Hash] A single-eval result envelope.
+      # @return [Integer] total_tokens, or 0 when absent.
+      def tokens_for(result)
+        tokens = result[:tokens] || {}
+        (tokens[:total_tokens] || tokens['total_tokens'] || 0).to_i
+      end
+      # Sums non-nil per-result costs.
+      #
+      # @return [Float, nil] Total cost, or nil when no result reports a cost.
+      def total_cost
+        costs = results.filter_map { |result| result[:cost] }
+        costs.empty? ? nil : costs.sum
+      end
+      # Finds the eval with the smallest skill-vs-baseline delta.
+      #
+      # @return [Hash, nil] {:eval_name, :delta} for the worst eval, or nil
+      #   when no result carries a usable delta report.
+      def worst_delta
+        scored = results.filter_map { |result| delta_entry(result) }
+        scored.min_by { |entry| entry[:delta] }
+      end
+      # Builds a {eval_name, delta} entry for a result with a delta report.
+      #
+      # Results without a Hash :response, without a report exposing both
+      # totals, or with a nil total are skipped rather than raising, so one
+      # malformed envelope cannot abort the whole summary.
+      #
+      # @param result [Hash] A single-eval result envelope.
+      # @return [Hash, nil] Entry hash, or nil when the report lacks deltas.
+      def delta_entry(result)
+        response = result[:response]
+        report = response[:report] if response.is_a?(Hash)
+        return nil unless report.respond_to?(:context_total) && report.respond_to?(:baseline_total)
+        context_total = report.context_total
+        baseline_total = report.baseline_total
+        return nil if context_total.nil? || baseline_total.nil?
+        { eval_name: result[:eval_name], delta: context_total - baseline_total }
+      end
+    end
+  end
+end

data/lib/skill_bench/services/template_registry.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require 'json'
+require_relative '../dimension'
 require_relative 'template_registry/category_data'
 module SkillBench
@@ -21,6 +22,24 @@ module SkillBench
       TEMPLATE_TYPES = %i[task_md criteria_json skill_md].freeze
       CATEGORIES = REGISTRY.keys.freeze
+      # Score weight per core scoring dimension. Keyed by the canonical
+      # +SkillBench::DEFAULT_DIMENSIONS+ names so scaffolded criteria can never
+      # drift from the names the runtime loader requires; values sum to 100.
+      CRITERIA_DIMENSION_SCORES = {
+        'correctness' => 30,
+        'skill_adherence' => 25,
+        'code_quality' => 20,
+        'test_coverage' => 15,
+        'documentation' => 10
+      }.freeze
+      # Canonical dimension descriptions keyed by name, sourced from the runtime defaults.
+      CORE_DIMENSION_DESCRIPTIONS = SkillBench::DEFAULT_DIMENSIONS.to_h { |dimension| [dimension.name, dimension.description] }.freeze
+      # Top-level thresholds emitted with scaffolded criteria.
+      CRITERIA_PASS_THRESHOLD = 70
+      CRITERIA_MINIMUM_DELTA = 10
       # @param template_type [Symbol, String] Template type (:task_md, :criteria_json, :skill_md)
       # @param category [Symbol, String] Category (:crud, :api, :background_job, etc.)
       # @param variables [Hash{Symbol, String => String}] Variables for interpolation
@@ -105,21 +124,36 @@ module SkillBench
         MARKDOWN
       end
+      # Builds runtime-loadable scoring criteria for the category.
+      #
+      # Emits the five core dimensions required by {SkillBench::Criteria}
+      # (+correctness+, +skill_adherence+, +code_quality+, +test_coverage+,
+      # +documentation+) with integer +max_score+ values summing to 100, plus
+      # the top-level +pass_threshold+ and +minimum_delta+ the loader expects.
+      # Category-specific flavor lives only in the dimension descriptions.
+      #
+      # @return [String] Pretty-printed criteria JSON.
       def build_criteria_json
         JSON.pretty_generate(
           category: category.to_s,
-          dimensions: [
-            { name: 'correctness', weight: 30, pass_threshold: 70 },
-            { name: 'adherence',   weight: 25, pass_threshold: 60 },
-            { name: 'quality',     weight: 20, pass_threshold: 60 },
-            { name: 'tests',       weight: 15, pass_threshold: 80 },
-            { name: 'docs',        weight: 10, pass_threshold: 50 }
-          ],
-          minimum_delta: 5,
-          category_specific: category_data.criteria
+          dimensions: criteria_dimensions,
+          pass_threshold: CRITERIA_PASS_THRESHOLD,
+          minimum_delta: CRITERIA_MINIMUM_DELTA
         )
       end
+      # Builds one entry per core dimension, pairing its score with the
+      # canonical description suffixed by this category's focus.
+      #
+      # @return [Array<Hash>] Core dimensions with integer +max_score+ summing to 100.
+      def criteria_dimensions
+        category_focus = category_data.criteria[:focus]
+        CRITERIA_DIMENSION_SCORES.map do |dimension_name, score|
+          base_description = CORE_DIMENSION_DESCRIPTIONS.fetch(dimension_name)
+          {
+            name: dimension_name,
+            max_score: score,
+            description: "#{base_description} (#{category} focus: #{category_focus})"
+          }
+        end
+      end
       def build_skill_md
         <<~MARKDOWN
           # Skill: {{skill_name}} (#{category})

data/lib/skill_bench/services/trend_recorder_service.rb CHANGED Viewed

@@ -6,6 +6,13 @@ module SkillBench
   module Services
     # Records evaluation results and computes trends.
     class TrendRecorderService
+      # Serializes the load -> append -> write of the shared trend history
+      # file. Batch runs ({BatchRunnerService}) execute evals concurrently and
+      # the trend file is process-global shared state; without this lock,
+      # concurrent records race on the temp-file rename and silently lose
+      # appended entries.
+      WRITE_MUTEX = Mutex.new
       # Records evaluation results and computes trends.
       #
       # @param result [Hash] The evaluation result from Evaluation::Runner
@@ -27,12 +34,14 @@ module SkillBench
       # Records evaluation results and computes trends.
       #
+      # Loads the trend history once and reuses it for both the trend
+      # computation and the append+write, avoiding a duplicate parse per run.
+      #
       # @return [Hash] Result with success status and trend data
       def call
         tracker = TrendTracker.new
         enriched = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
-        trend = tracker.trend_for(enriched)
-        record_result = tracker.record(enriched)
+        trend, record_result = record_atomically(tracker, enriched)
         record_success = record_result.is_a?(Hash) && record_result[:success]
         unless record_success
@@ -62,6 +71,24 @@ module SkillBench
         SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
         { success: false, response: { error: { message: e.message } } }
       end
+      private
+      # Performs the load -> trend -> record sequence while holding
+      # {WRITE_MUTEX}, so threads in this process take turns on the
+      # read-modify-write of the trend history. The history is fetched once
+      # and handed to both the trend computation and the record call.
+      #
+      # NOTE(review): a Mutex only coordinates threads inside one process;
+      # separate processes writing the same trend file are not serialized here.
+      #
+      # @param tracker [SkillBench::TrendTracker] The trend tracker
+      # @param enriched [Hash] Result enriched with eval_name and skill_names
+      # @return [Array(Hash, Hash)] The computed trend and the record result
+      def record_atomically(tracker, enriched)
+        WRITE_MUTEX.synchronize do
+          loaded_history = tracker.history
+          computed_trend = tracker.trend_for(enriched, loaded_history)
+          record_result = tracker.record(enriched, loaded_history)
+          [computed_trend, record_result]
+        end
+      end
     end
   end
 end

data/lib/skill_bench/tools/registry.rb CHANGED Viewed

@@ -8,15 +8,41 @@ module SkillBench
   module Tools
     # Registry for all available tools, providing their definitions to the LLM.
     class Registry
-      # Returns an array of tool definitions in the format expected by the LLM API.
+      # Recursively deep-freezes a tool-definition value (Hash/Array and contents)
+      # so accidental mutation by a downstream consumer raises immediately.
       #
-      # @return [Array<Hash>] The list of available tools with their names, descriptions, and schemas.
-      def self.definitions
+      # @param value [Object] The value to deep-freeze in place.
+      # @return [Object] The same value, frozen along with everything it contains.
+      def self.deep_freeze(value)
+        # Freeze contained values first (Hash values / Array elements), then
+        # the container itself; any other object is frozen directly.
+        case value
+        when Hash  then value.each_value { |nested| deep_freeze(nested) }
+        when Array then value.each { |nested| deep_freeze(nested) }
+        end
+        value.freeze
+      end
+      private_class_method :deep_freeze
+      # The static tool definitions sent to the LLM API. The tool schemas are
+      # constant JSON-schema specs (no per-call state or runtime config), so the
+      # array and its nested hashes are built once and deep-frozen for reuse
+      # across every ReAct step instead of being reallocated on each call.
+      #
+      # @return [Array<Hash>] Frozen list of tools with their names, descriptions, and schemas.
+      DEFINITIONS = deep_freeze(
         [
           ReadFile.definition,
           WriteFile.definition,
           RunCommand.definition
         ]
+      )
+      # Returns the memoized, frozen array of tool definitions for the LLM API.
+      #
+      # @return [Array<Hash>] The frozen list of available tools with their names, descriptions, and schemas.
+      def self.definitions
+        DEFINITIONS
       end
     end
   end

data/lib/skill_bench/tools/run_command.rb CHANGED Viewed

@@ -1,15 +1,35 @@
 # frozen_string_literal: true
 require 'open3'
-require 'timeout'
 require 'shellwords'
 require_relative '../config'
 require_relative '../constants'
+require_relative '../error_logger'
 module SkillBench
   module Tools
     # Handles executing a shell command within the working directory.
+    #
+    # Real container isolation is not yet shipped, so an active sandbox means a
+    # temporary git directory on the host. To honor the documented security
+    # model the tool fails closed: when no container isolation is active it
+    # refuses to run unless `allow_host_execution` is explicitly enabled.
     class RunCommand
+      # Refusal returned when no container isolation is active and host execution
+      # has not been explicitly enabled. Deliberately omits the allowlist.
+      HOST_EXECUTION_REFUSED = 'Command execution refused: no sandbox isolation is active and ' \
+                               "'allow_host_execution' is not enabled. Set \"allow_host_execution\": true in " \
+                               'skill-bench.json to permit running commands directly on the host (NOT isolated).'
+      # Warning emitted when a command runs un-isolated on the host because
+      # `allow_host_execution` is enabled and no container is active.
+      HOST_EXECUTION_WARNING = 'Warning: running command directly on the host with NO sandbox isolation ' \
+                               '(allow_host_execution is enabled). Commands are not isolated from your machine.'
+      # Seconds to wait after SIGTERM before escalating to SIGKILL when a command
+      # exceeds its execution deadline.
+      TERM_GRACE_PERIOD = 2
       # @return [Hash] The tool definition for the LLM API.
       def self.definition
         {
@@ -34,11 +54,16 @@ module SkillBench
       # Tokenizes the command string before execution so that arguments are passed
       # directly to the OS without shell interpretation, preventing shell injection.
       #
+      # Fails closed: when no container isolation is active (`container_id` is nil)
+      # and `allow_host_execution` is false, the command is refused and nothing
+      # runs. When host execution is explicitly allowed, a warning is emitted once
+      # per command before running un-isolated on the host.
+      #
       # @param command [String] The command to run (e.g. "rspec spec/models").
       # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
       # @param container_id [String, nil] The Docker container ID for isolated execution.
-      # @return [String] A formatted string containing the exit status, STDOUT, and STDERR.
-      # @raise [Timeout::Error] Internally rescued; returns a timeout message string.
+      # @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
+      #   or a standardized error/refusal message.
       def self.call(command, working_dir_path, container_id = nil)
         argv = command.shellsplit
         return 'Error: Empty command.' if argv.empty?
@@ -50,25 +75,152 @@ module SkillBench
         return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
         return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)
+        return "Error: Command '#{base_cmd}' arguments are not permitted by the configured argument constraints." unless arguments_permitted?(base_cmd, argv)
+        return HOST_EXECUTION_REFUSED unless container_id || SkillBench::Config.allow_host_execution
+        warn_unisolated_host_execution unless container_id
+        execute(argv, working_dir_path, container_id)
+      end
+      # Checks the command's arguments against the optional, per-command
+      # argument constraints from configuration.
+      #
+      # This is a default-off seam: the command allowlist remains the primary
+      # authorization control, and any allowlisted wrapper binary still grants
+      # broad host execution. When no constraints are configured (the default),
+      # or none apply to +base_cmd+, every argument is permitted so behavior is
+      # unchanged. When a constraint exists for +base_cmd+, the command is
+      # refused if any argument contains a disallowed substring/flag.
+      #
+      # A constraint may be a list of patterns or a single scalar pattern; a
+      # scalar is treated as a one-element list instead of raising.
+      #
+      # @param base_cmd [String] The base command (first token of the command).
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @return [Boolean] true when the arguments are permitted to run.
+      def self.arguments_permitted?(base_cmd, argv)
+        constraints = SkillBench::Config.command_argument_constraints
+        return true if constraints.nil? || constraints.empty?
+        # Constraint keys may be strings (facade API) or symbols (loaded from
+        # JSON via symbolize_names), so look the command up under both.
+        # Array() normalizes nil to [] and a lone String to [String], which
+        # keeps a hand-written scalar constraint enforced rather than crashing.
+        disallowed = Array(constraints[base_cmd] || constraints[base_cmd.to_sym]).map(&:to_s)
+        return true if disallowed.empty?
+        argv.drop(1).none? { |arg| disallowed.any? { |bad| arg.include?(bad) } }
+      end
+      private_class_method :arguments_permitted?
+      # Runs the resolved command and formats its result, enforcing the
+      # configured execution timeout.
+      #
+      # The command is spawned in its own process group so that, on timeout, the
+      # whole group (the command and any children it forked) can be signalled —
+      # something `Timeout.timeout` around `Open3.capture3` could not do, because
+      # `capture3`'s `ensure` blocks on `wait_thr.value` and never signals the
+      # child.
+      #
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @param working_dir_path [Pathname] The host directory for host execution.
+      # @param container_id [String, nil] The Docker container ID for isolated execution.
+      # @return [String] Formatted exit status, STDOUT, and STDERR, or a timeout message.
+      def self.execute(argv, working_dir_path, container_id)
         max_time = SkillBench::Config.max_execution_time
-        Timeout.timeout(max_time) do
-          stdout_str, stderr_str, status = if container_id
-                                             docker_cmd = ['docker', 'exec', '-w', '/sandbox', container_id] + argv
-                                             Open3.capture3(*docker_cmd)
-                                           else
-                                             Open3.capture3(*argv, chdir: working_dir_path.to_s)
-                                           end
-          <<~RESULT
-            Exit Status: #{status.exitstatus}
-            STDOUT:
-            #{stdout_str}
-            STDERR:
-            #{stderr_str}
-          RESULT
+        command, spawn_opts = resolve_invocation(argv, working_dir_path, container_id)
+        result = capture(command, spawn_opts, max_time)
+        return "Error: Command execution timed out after #{max_time} seconds." if result == :timed_out
+        stdout_str, stderr_str, status = result
+        format_result(status, stdout_str, stderr_str)
+      end
+      private_class_method :execute
+      # Renders the exit status and both captured streams as the standard
+      # newline-separated result text returned to the caller.
+      #
+      # @param status [Process::Status] The exit status of the command.
+      # @param stdout_str [String] The captured standard output.
+      # @param stderr_str [String] The captured standard error.
+      # @return [String] Formatted exit status, STDOUT, and STDERR.
+      def self.format_result(status, stdout_str, stderr_str)
+        sections = ["Exit Status: #{status.exitstatus}", 'STDOUT:', stdout_str, 'STDERR:', stderr_str]
+        "#{sections.join("\n")}\n"
+      end
+      private_class_method :format_result
+      # Produces the argv to spawn and the spawn options for the chosen target.
+      # With a container id the command is wrapped in `docker exec -w /sandbox`;
+      # otherwise it runs as-is with `chdir` set to the working directory. Both
+      # variants request a new process group (`pgroup: true`).
+      #
+      # NOTE(review): for the container case the new process group holds the
+      # `docker exec` client; whether signalling it also stops the process
+      # inside the container is not established by this code — confirm.
+      #
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @param working_dir_path [Pathname] The host directory for host execution.
+      # @param container_id [String, nil] The Docker container ID for isolated execution.
+      # @return [Array(Array<String>, Hash)] The full command array and spawn options.
+      def self.resolve_invocation(argv, working_dir_path, container_id)
+        if container_id
+          docker_command = ['docker', 'exec', '-w', '/sandbox', container_id] + argv
+          [docker_command, { pgroup: true }]
+        else
+          [argv, { chdir: working_dir_path.to_s, pgroup: true }]
+        end
+      end
+      private_class_method :resolve_invocation
+      # Spawns the command, draining STDOUT/STDERR on separate threads so a chatty
+      # child never deadlocks the reader, and enforces the deadline with a
+      # watchdog that kills the process group when the command overruns.
+      #
+      # The deadline covers the output pipes as well as the group leader: a
+      # descendant that inherited the pipes keeps them open after the leader
+      # exits, so the reader threads share the same time budget. If they are
+      # still blocked once the group has been signalled (e.g. a process that
+      # left the group), they are killed after a short grace period so this
+      # method always returns.
+      #
+      # @param command [Array<String>] The full command array (no shell).
+      # @param spawn_opts [Hash] Options passed to the spawner (includes `pgroup`).
+      # @param max_time [Integer, nil] Maximum execution time in seconds; nil waits indefinitely.
+      # @return [Array(String, String, Process::Status), Symbol] STDOUT, STDERR, and
+      #   status on completion, or `:timed_out` when the deadline is exceeded.
+      def self.capture(command, spawn_opts, max_time)
+        Open3.popen3(*command, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
+          stdin.close
+          deadline = max_time && (Process.clock_gettime(Process::CLOCK_MONOTONIC) + max_time)
+          # Remaining budget in seconds, clamped at zero; nil means "no limit".
+          time_left = -> { deadline && [deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC), 0].max }
+          readers = [Thread.new { stdout.read }, Thread.new { stderr.read }]
+          completed = wait_thr.join(max_time) && readers.all? { |reader| reader.join(time_left.call) }
+          if completed
+            [*readers.map(&:value), wait_thr.value]
+          else
+            terminate_process_group(wait_thr)
+            # Bounded wait: never block forever on a pipe something still holds open.
+            readers.each { |reader| reader.kill unless reader.join(TERM_GRACE_PERIOD) }
+            :timed_out
+          end
         end
-      rescue Timeout::Error
-        "Error: Command execution timed out after #{max_time} seconds."
       end
+      private_class_method :capture
+      # Stops the whole process group of the spawned command. SIGTERM is sent
+      # first; if the leader has not exited within TERM_GRACE_PERIOD seconds,
+      # SIGKILL follows. The leader's pid doubles as the process group id.
+      #
+      # @param wait_thr [Process::Waiter] The wait thread for the spawned process group leader.
+      # @return [void]
+      def self.terminate_process_group(wait_thr)
+        group_leader_pid = wait_thr.pid
+        signal_group('TERM', group_leader_pid)
+        exited_in_time = wait_thr.join(TERM_GRACE_PERIOD)
+        signal_group('KILL', group_leader_pid) unless exited_in_time
+      end
+      private_class_method :terminate_process_group
+      # Delivers a signal to every process in a group by signalling the negated
+      # group id. A group that no longer exists (ESRCH) is treated as success.
+      #
+      # @param signal [String] The signal name (e.g. "TERM", "KILL").
+      # @param pgid [Integer] The process group id (leader pid) to signal.
+      # @return [void]
+      def self.signal_group(signal, pgid)
+        group_target = -pgid
+        Process.kill(signal, group_target)
+      rescue Errno::ESRCH
+        nil
+      end
+      private_class_method :signal_group
+      # Writes the host-execution warning to stderr unless the error logger
+      # reports that stderr output should be skipped.
+      #
+      # @return [void]
+      def self.warn_unisolated_host_execution
+        suppressed = SkillBench::ErrorLogger.skip_stderr_output?
+        warn(HOST_EXECUTION_WARNING) unless suppressed
+      end
+      private_class_method :warn_unisolated_host_execution
     end
   end
 end