RubyGems - ruby-skill-bench - Versions diffs - 1.0.1 → 1.2.0 - Mend

ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

checksums.yaml +4 -4
data/README.md +299 -23
data/docs/architecture.md +3 -1
data/docs/first-eval-guide.md +7 -7
data/docs/testing-guide.md +1 -1
data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
data/lib/skill_bench/agent/react_agent/step.rb +7 -1
data/lib/skill_bench/agent/react_agent.rb +2 -1
data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
data/lib/skill_bench/cli/help_printer.rb +10 -2
data/lib/skill_bench/cli/init_command.rb +2 -1
data/lib/skill_bench/cli/result_printer.rb +1 -1
data/lib/skill_bench/cli/run_command.rb +47 -9
data/lib/skill_bench/cli/validate_command.rb +242 -0
data/lib/skill_bench/cli.rb +3 -0
data/lib/skill_bench/client.rb +43 -1
data/lib/skill_bench/clients/all.rb +3 -0
data/lib/skill_bench/clients/base_client.rb +14 -6
data/lib/skill_bench/clients/base_url_validator.rb +105 -0
data/lib/skill_bench/clients/provider_config.rb +34 -1
data/lib/skill_bench/clients/provider_schemas.rb +4 -0
data/lib/skill_bench/clients/providers/mistral.rb +47 -0
data/lib/skill_bench/clients/request_builder.rb +2 -4
data/lib/skill_bench/clients/response_builder.rb +91 -0
data/lib/skill_bench/clients/response_error_handler.rb +5 -17
data/lib/skill_bench/clients/retry_handler.rb +4 -7
data/lib/skill_bench/commands/init.rb +5 -0
data/lib/skill_bench/commands/skill_new.rb +3 -1
data/lib/skill_bench/config/applier.rb +2 -0
data/lib/skill_bench/config/defaults.rb +2 -0
data/lib/skill_bench/config/facade_readers.rb +7 -0
data/lib/skill_bench/config/facade_writers.rb +17 -0
data/lib/skill_bench/config/json_loader.rb +1 -1
data/lib/skill_bench/config/store.rb +29 -0
data/lib/skill_bench/config.rb +18 -0
data/lib/skill_bench/constants.rb +58 -0
data/lib/skill_bench/evaluation/runner.rb +20 -3
data/lib/skill_bench/execution/context_hydrator.rb +66 -15
data/lib/skill_bench/execution/sandbox.rb +76 -14
data/lib/skill_bench/judge/judge.rb +4 -0
data/lib/skill_bench/judge/prompt.rb +42 -6
data/lib/skill_bench/models/config.rb +32 -0
data/lib/skill_bench/output_formatter.rb +60 -1
data/lib/skill_bench/package_verifier.rb +1 -1
data/lib/skill_bench/rails/skill_templates.rb +19 -5
data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
data/lib/skill_bench/services/batch_runner_service.rb +111 -0
data/lib/skill_bench/services/compare_option_parser.rb +1 -0
data/lib/skill_bench/services/cost_calculator.rb +91 -0
data/lib/skill_bench/services/html_formatter.rb +289 -0
data/lib/skill_bench/services/json_formatter.rb +19 -1
data/lib/skill_bench/services/junit_formatter.rb +74 -24
data/lib/skill_bench/services/provider_resolver.rb +5 -2
data/lib/skill_bench/services/response_cache.rb +130 -0
data/lib/skill_bench/services/runner_service.rb +88 -4
data/lib/skill_bench/services/summary_formatter.rb +90 -0
data/lib/skill_bench/services/template_registry.rb +43 -9
data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
data/lib/skill_bench/tools/registry.rb +29 -3
data/lib/skill_bench/tools/run_command.rb +172 -35
data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
data/lib/skill_bench/trend_tracker.rb +5 -5
data/lib/skill_bench/version.rb +1 -1
data/lib/skill_bench.rb +3 -3
metadata +19 -36

data/lib/skill_bench/tools/run_command.rb CHANGED

@@ -1,29 +1,34 @@
 # frozen_string_literal: true
 require 'open3'
-require 'timeout'
 require 'shellwords'
 require_relative '../config'
+require_relative '../constants'
+require_relative '../error_logger'
 module SkillBench
   module Tools
     # Handles executing a shell command within the working directory.
+    #
+    # Real container isolation is not yet shipped, so an active sandbox means a
+    # temporary git directory on the host. To honor the documented security
+    # model the tool fails closed: when no container isolation is active it
+    # refuses to run unless `allow_host_execution` is explicitly enabled.
     class RunCommand
-      # Commands that are always blocked even if listed in allowed_commands,
-      # because they can be used to escape the sandbox or execute arbitrary code.
-      DANGEROUS_COMMANDS = %w[
-        bash sh zsh fish dash ksh csh tcsh
-        python python3 python2 ruby perl node
-        php lua tcl wish
-        curl wget nc ncat socat
-        eval exec
-        sudo su doas
-        chmod chown mount umount
-        dd mkfs fdisk parted
-        insmod rmmod modprobe
-        systemctl service
-        passwd useradd userdel groupadd groupdel
-      ].freeze
+      # Refusal returned when no container isolation is active and host execution
+      # has not been explicitly enabled. Deliberately omits the allowlist.
+      HOST_EXECUTION_REFUSED = 'Command execution refused: no sandbox isolation is active and ' \
+                               "'allow_host_execution' is not enabled. Set \"allow_host_execution\": true in " \
+                               'skill-bench.json to permit running commands directly on the host (NOT isolated).'
+      # Warning emitted when a command runs un-isolated on the host because
+      # `allow_host_execution` is enabled and no container is active.
+      HOST_EXECUTION_WARNING = 'Warning: running command directly on the host with NO sandbox isolation ' \
+                               '(allow_host_execution is enabled). Commands are not isolated from your machine.'
+      # Seconds to wait after SIGTERM before escalating to SIGKILL when a command
+      # exceeds its execution deadline.
+      TERM_GRACE_PERIOD = 2
       # @return [Hash] The tool definition for the LLM API.
       def self.definition
@@ -49,41 +54,173 @@ module SkillBench
       # Tokenizes the command string before execution so that arguments are passed
       # directly to the OS without shell interpretation, preventing shell injection.
       #
+      # Fails closed: when no container isolation is active (`container_id` is nil)
+      # and `allow_host_execution` is false, the command is refused and nothing
+      # runs. When host execution is explicitly allowed, a warning is emitted once
+      # per command before running un-isolated on the host.
+      #
       # @param command [String] The command to run (e.g. "rspec spec/models").
       # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
       # @param container_id [String, nil] The Docker container ID for isolated execution.
-      # @return [String] A formatted string containing the exit status, STDOUT, and STDERR.
-      # @raise [Timeout::Error] Internally rescued; returns a timeout message string.
+      # @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
+      #   or a standardized error/refusal message.
       def self.call(command, working_dir_path, container_id = nil)
         argv = command.shellsplit
         return 'Error: Empty command.' if argv.empty?
         base_cmd = argv.first
-        return "Error: Command '#{base_cmd}' is blocked for security reasons." if DANGEROUS_COMMANDS.include?(base_cmd)
+        return "Error: Command '#{base_cmd}' is blocked for security reasons." if Constants::Tools::DANGEROUS_COMMANDS.include?(base_cmd)
         allowed = SkillBench::Config.allowed_commands
         return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
         return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)
+        return "Error: Command '#{base_cmd}' arguments are not permitted by the configured argument constraints." unless arguments_permitted?(base_cmd, argv)
+        return HOST_EXECUTION_REFUSED unless container_id || SkillBench::Config.allow_host_execution
+        warn_unisolated_host_execution unless container_id
+        execute(argv, working_dir_path, container_id)
+      end
+      # Checks the command's arguments against the optional, per-command
+      # argument constraints from configuration.
+      #
+      # This is a default-off seam: the command allowlist remains the primary
+      # authorization control, and any allowlisted wrapper binary still grants
+      # broad host execution. When no constraints are configured (the default),
+      # or none apply to +base_cmd+, every argument is permitted so behavior is
+      # unchanged. When a constraint exists for +base_cmd+, the command is
+      # refused if any argument contains a disallowed substring/flag.
+      #
+      # @param base_cmd [String] The base command (first token of the command).
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @return [Boolean] true when the arguments are permitted to run.
+      def self.arguments_permitted?(base_cmd, argv)
+        constraints = SkillBench::Config.command_argument_constraints
+        return true if constraints.nil? || constraints.empty?
+        # Constraint keys may be strings (facade API) or symbols (loaded from
+        # JSON via symbolize_names), so look the command up under both.
+        disallowed = constraints[base_cmd] || constraints[base_cmd.to_sym]
+        return true if disallowed.nil? || disallowed.empty?
+        argv.drop(1).none? { |arg| disallowed.any? { |bad| arg.include?(bad.to_s) } }
+      end
+      private_class_method :arguments_permitted?
+      # Runs the resolved command and formats its result, enforcing the
+      # configured execution timeout.
+      #
+      # The command is spawned in its own process group so that, on timeout, the
+      # whole group (the command and any children it forked) can be signalled —
+      # something `Timeout.timeout` around `Open3.capture3` could not do, because
+      # `capture3`'s `ensure` blocks on `wait_thr.value` and never signals the
+      # child.
+      #
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @param working_dir_path [Pathname] The host directory for host execution.
+      # @param container_id [String, nil] The Docker container ID for isolated execution.
+      # @return [String] Formatted exit status, STDOUT, and STDERR, or a timeout message.
+      def self.execute(argv, working_dir_path, container_id)
         max_time = SkillBench::Config.max_execution_time
-        Timeout.timeout(max_time) do
-          stdout_str, stderr_str, status = if container_id
-                                             docker_cmd = ['docker', 'exec', '-w', '/sandbox', container_id] + argv
-                                             Open3.capture3(*docker_cmd)
-                                           else
-                                             Open3.capture3(*argv, chdir: working_dir_path.to_s)
-                                           end
-          <<~RESULT
-            Exit Status: #{status.exitstatus}
-            STDOUT:
-            #{stdout_str}
-            STDERR:
-            #{stderr_str}
-          RESULT
+        command, spawn_opts = resolve_invocation(argv, working_dir_path, container_id)
+        result = capture(command, spawn_opts, max_time)
+        return "Error: Command execution timed out after #{max_time} seconds." if result == :timed_out
+        stdout_str, stderr_str, status = result
+        format_result(status, stdout_str, stderr_str)
+      end
+      private_class_method :execute
+      # Formats the captured command output into the standard result string.
+      #
+      # @param status [Process::Status] The exit status of the command.
+      # @param stdout_str [String] The captured standard output.
+      # @param stderr_str [String] The captured standard error.
+      # @return [String] Formatted exit status, STDOUT, and STDERR.
+      def self.format_result(status, stdout_str, stderr_str)
+        <<~RESULT
+          Exit Status: #{status.exitstatus}
+          STDOUT:
+          #{stdout_str}
+          STDERR:
+          #{stderr_str}
+        RESULT
+      end
+      private_class_method :format_result
+      # Builds the command array and spawn options for either container or host
+      # execution. Both run in their own process group (`pgroup: true`) so the
+      # watchdog can kill the whole group on timeout.
+      #
+      # @param argv [Array<String>] The tokenized command and arguments.
+      # @param working_dir_path [Pathname] The host directory for host execution.
+      # @param container_id [String, nil] The Docker container ID for isolated execution.
+      # @return [Array(Array<String>, Hash)] The full command array and spawn options.
+      def self.resolve_invocation(argv, working_dir_path, container_id)
+        return [['docker', 'exec', '-w', '/sandbox', container_id, *argv], { pgroup: true }] if container_id
+        [argv, { chdir: working_dir_path.to_s, pgroup: true }]
+      end
+      private_class_method :resolve_invocation
+      # Spawns the command, draining STDOUT/STDERR on separate threads so a chatty
+      # or hung child never deadlocks the reader, and enforces the deadline with a
+      # watchdog that kills the process group when the command overruns.
+      #
+      # @param command [Array<String>] The full command array (no shell).
+      # @param spawn_opts [Hash] Options passed to the spawner (includes `pgroup`).
+      # @param max_time [Integer] Maximum execution time in seconds.
+      # @return [Array(String, String, Process::Status), Symbol] STDOUT, STDERR, and
+      #   status on completion, or `:timed_out` when the deadline is exceeded.
+      def self.capture(command, spawn_opts, max_time)
+        Open3.popen3(*command, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
+          stdin.close
+          readers = [Thread.new { stdout.read }, Thread.new { stderr.read }]
+          completed = wait_thr.join(max_time)
+          terminate_process_group(wait_thr) unless completed
+          stdout_str, stderr_str = readers.map(&:value)
+          completed ? [stdout_str, stderr_str, wait_thr.value] : :timed_out
         end
-      rescue Timeout::Error
-        "Error: Command execution timed out after #{max_time} seconds."
       end
+      private_class_method :capture
+      # Terminates the command's entire process group: SIGTERM first, then SIGKILL
+      # after a short grace period if it has not exited. Signalling the negated
+      # process group id reaches the command and any children it forked.
+      #
+      # @param wait_thr [Process::Waiter] The wait thread for the spawned process group leader.
+      # @return [void]
+      def self.terminate_process_group(wait_thr)
+        pgid = wait_thr.pid
+        signal_group('TERM', pgid)
+        signal_group('KILL', pgid) unless wait_thr.join(TERM_GRACE_PERIOD)
+      end
+      private_class_method :terminate_process_group
+      # Sends a signal to a whole process group, ignoring an already-exited group.
+      #
+      # @param signal [String] The signal name (e.g. "TERM", "KILL").
+      # @param pgid [Integer] The process group id (leader pid) to signal.
+      # @return [void]
+      def self.signal_group(signal, pgid)
+        Process.kill(signal, -pgid)
+      rescue Errno::ESRCH
+        nil
+      end
+      private_class_method :signal_group
+      # Emits a single warning that the command will run un-isolated on the host,
+      # honoring the test-suite stderr suppression convention.
+      #
+      # @return [void]
+      def self.warn_unisolated_host_execution
+        return if SkillBench::ErrorLogger.skip_stderr_output?
+        warn(HOST_EXECUTION_WARNING)
+      end
+      private_class_method :warn_unisolated_host_execution
     end
   end
 end

data/lib/skill_bench/trend_tracker/persistence.rb CHANGED

@@ -2,6 +2,7 @@
 require 'json'
 require 'pathname'
+require 'fileutils'
 module SkillBench
   class TrendTracker
@@ -27,23 +28,24 @@ module SkillBench
         []
       end
-      # Writes history to file with atomic operation and backup.
-      # Returns a result hash so callers do not need to rescue SystemCallError.
+      # Writes history to file atomically, snapshotting the previous good
+      # version into the backup first.
+      #
+      # The existing history file (if any) is copied to +#{history_file}.bak+
+      # before the new content is written, so the backup always holds the
+      # previous good version rather than a duplicate of the current file. The
+      # new content is serialized once and written via a temp-file + rename so
+      # the main file is never left partially written. Returns a result hash so
+      # callers do not need to rescue SystemCallError.
       #
       # @param history [Array<Hash>] History entries to write
       # @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
       def write(history)
-        json = JSON.pretty_generate(history)
+        backup_previous_version
         temp_file = "#{history_file}.tmp"
-        File.write(temp_file, json)
+        File.write(temp_file, JSON.pretty_generate(history))
         File.rename(temp_file, history_file)
-        begin
-          File.write("#{history_file}.bak", json)
-        rescue SystemCallError => e
-          warn "Backup write failed for #{history_file}: #{e.message}"
-        end
         { success: true }
       rescue SystemCallError => e
         { success: false, error: { message: e.message } }
@@ -53,6 +55,21 @@ module SkillBench
       attr_reader :history_file
+      # Copies the current history file to the backup path so the backup keeps
+      # the previous good version. No-op on the first run when no history file
+      # exists yet. A failed copy is non-fatal: it warns and lets the main
+      # write proceed.
+      #
+      # @return [void]
+      def backup_previous_version
+        source = history_file
+        return unless File.exist?(source)
+        FileUtils.cp(source, "#{source}.bak")
+      rescue SystemCallError => e
+        warn "Backup copy failed for #{source}: #{e.message}"
+      end
       # Reads backup file if it exists
       #
       # @return [Array<Hash>, nil] Backup data or nil if unavailable

data/lib/skill_bench/trend_tracker.rb CHANGED

@@ -17,9 +17,9 @@ module SkillBench
     # Records an evaluation result.
     #
     # @param result [Hash] The evaluation result from EvaluationRunner.
+    # @param history [Array<Hash>] Pre-loaded history to append to; defaults to a fresh load.
     # @return [Hash] Service response.
-    def record(result)
-      history = @persistence.load
+    def record(result, history = @persistence.load)
       history << extract_entry(result)
       write_result = @persistence.write(history)
@@ -41,11 +41,11 @@ module SkillBench
     # Computes the trend of the given result against the most recent matching history entry.
     #
     # @param result [Hash] The current evaluation result.
+    # @param history [Array<Hash>] Pre-loaded history to compare against; defaults to a fresh load.
     # @return [Hash, nil] Trend data or nil if no matching history exists.
-    def trend_for(result)
-      entries = @persistence.load
+    def trend_for(result, history = @persistence.load)
       current = extract_entry(result)
-      TrendCalculator.compute_trend(entries, current)
+      TrendCalculator.compute_trend(history, current)
     end
     private

data/lib/skill_bench/version.rb CHANGED

@@ -2,5 +2,5 @@
 module SkillBench
   # The current gem version.
-  VERSION = '1.0.1'
+  VERSION = '1.2.0'
 end

data/lib/skill_bench.rb CHANGED

@@ -8,6 +8,7 @@
 # Core modules
 require_relative 'skill_bench/version'
+require_relative 'skill_bench/constants'
 require_relative 'skill_bench/dimension'
 require_relative 'skill_bench/criteria'
 require_relative 'skill_bench/delta_report'
@@ -72,6 +73,8 @@ require_relative 'skill_bench/commands/eval_new'
 # Services
 require_relative 'skill_bench/services/runner_service'
+require_relative 'skill_bench/services/batch_runner_service'
+require_relative 'skill_bench/services/summary_formatter'
 require_relative 'skill_bench/services/template_registry'
 # Tools
@@ -87,9 +90,6 @@ require_relative 'skill_bench/trend_tracker'
 require_relative 'skill_bench/trend_tracker/persistence'
 require_relative 'skill_bench/trend_tracker/trend_calculator'
-# Rails integrations
-require_relative 'skill_bench/rails/skill_templates'
 # Migration utilities
 require_relative 'skill_bench/migration/provider_migrator'

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ruby-skill-bench
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.2.0
 platform: ruby
 authors:
 - Ismael Marin
@@ -9,48 +9,20 @@ bindir: bin
 cert_chain: []
 date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: activesupport
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '6.0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '6.0'
 - !ruby/object:Gem::Dependency
   name: cgi
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.5.1
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 0.5.1
-- !ruby/object:Gem::Dependency
-  name: dotenv
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 3.2.0
+        version: 0.5.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 3.2.0
+        version: 0.5.2
 - !ruby/object:Gem::Dependency
   name: faraday
   requirement: !ruby/object:Gem::Requirement
@@ -71,28 +43,28 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.19'
+        version: '2.20'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.19'
+        version: '2.20'
 - !ruby/object:Gem::Dependency
   name: parallel
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.26'
+        version: 2.0.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.26'
+        version: 2.0.0
 description: |
   ruby-skill-bench orchestrates evaluation runs of AI coding agents
   inside isolated git sandboxes, then scores the results using deterministic
@@ -119,6 +91,7 @@ files:
 - lib/skill_bench/agent/runner.rb
 - lib/skill_bench/agent/summary.rb
 - lib/skill_bench/cli.rb
+- lib/skill_bench/cli/batch_result_printer.rb
 - lib/skill_bench/cli/compare_command.rb
 - lib/skill_bench/cli/eval/eval_command_registry.rb
 - lib/skill_bench/cli/eval/eval_commands.rb
@@ -129,9 +102,11 @@ files:
 - lib/skill_bench/cli/result_printer.rb
 - lib/skill_bench/cli/run_command.rb
 - lib/skill_bench/cli/skill_command.rb
+- lib/skill_bench/cli/validate_command.rb
 - lib/skill_bench/client.rb
 - lib/skill_bench/clients/all.rb
 - lib/skill_bench/clients/base_client.rb
+- lib/skill_bench/clients/base_url_validator.rb
 - lib/skill_bench/clients/provider_config.rb
 - lib/skill_bench/clients/provider_registry.rb
 - lib/skill_bench/clients/provider_schemas.rb
@@ -140,6 +115,7 @@ files:
 - lib/skill_bench/clients/providers/deepseek.rb
 - lib/skill_bench/clients/providers/gemini.rb
 - lib/skill_bench/clients/providers/groq.rb
+- lib/skill_bench/clients/providers/mistral.rb
 - lib/skill_bench/clients/providers/mock.rb
 - lib/skill_bench/clients/providers/null_client.rb
 - lib/skill_bench/clients/providers/ollama.rb
@@ -147,6 +123,7 @@ files:
 - lib/skill_bench/clients/providers/opencode.rb
 - lib/skill_bench/clients/providers/openrouter.rb
 - lib/skill_bench/clients/request_builder.rb
+- lib/skill_bench/clients/response_builder.rb
 - lib/skill_bench/clients/response_error_handler.rb
 - lib/skill_bench/clients/response_parser.rb
 - lib/skill_bench/clients/retry_handler.rb
@@ -162,6 +139,7 @@ files:
 - lib/skill_bench/config/facade_writers.rb
 - lib/skill_bench/config/json_loader.rb
 - lib/skill_bench/config/store.rb
+- lib/skill_bench/constants.rb
 - lib/skill_bench/criteria.rb
 - lib/skill_bench/delta_report.rb
 - lib/skill_bench/dimension.rb
@@ -196,16 +174,19 @@ files:
 - lib/skill_bench/registry/pack_resolver.rb
 - lib/skill_bench/runner.rb
 - lib/skill_bench/services/agent_spawner_service.rb
+- lib/skill_bench/services/batch_runner_service.rb
 - lib/skill_bench/services/compare_option_parser.rb
 - lib/skill_bench/services/comparison_reporter.rb
 - lib/skill_bench/services/comparison_runner.rb
 - lib/skill_bench/services/context_loader_service.rb
+- lib/skill_bench/services/cost_calculator.rb
 - lib/skill_bench/services/delta_table_formatter.rb
 - lib/skill_bench/services/error_response_builder.rb
 - lib/skill_bench/services/eval_resolver.rb
 - lib/skill_bench/services/exit_code_calculator.rb
 - lib/skill_bench/services/feedback_generator.rb
 - lib/skill_bench/services/formatting_helpers.rb
+- lib/skill_bench/services/html_formatter.rb
 - lib/skill_bench/services/iteration_formatter.rb
 - lib/skill_bench/services/json_formatter.rb
 - lib/skill_bench/services/judge_params_builder.rb
@@ -217,11 +198,13 @@ files:
 - lib/skill_bench/services/output_persistence_service.rb
 - lib/skill_bench/services/prompt_builder_service.rb
 - lib/skill_bench/services/provider_resolver.rb
+- lib/skill_bench/services/response_cache.rb
 - lib/skill_bench/services/result_printer_service.rb
 - lib/skill_bench/services/runner_service.rb
 - lib/skill_bench/services/skill_resolver.rb
 - lib/skill_bench/services/skill_resolver_service.rb
 - lib/skill_bench/services/source_path_resolver_service.rb
+- lib/skill_bench/services/summary_formatter.rb
 - lib/skill_bench/services/template_registry.rb
 - lib/skill_bench/services/template_registry/category_data.rb
 - lib/skill_bench/services/trend_recorder_service.rb
@@ -262,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.12
+rubygems_version: 3.6.9
 specification_version: 4
 summary: The evaluation engine for AI Agent Skills benchmarking.
 test_files: []