RubyGems - ruby-skill-bench - Versions diffs - 1.0.1 → 1.2.0 - Mend

ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

checksums.yaml +4 -4
data/README.md +299 -23
data/docs/architecture.md +3 -1
data/docs/first-eval-guide.md +7 -7
data/docs/testing-guide.md +1 -1
data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
data/lib/skill_bench/agent/react_agent/step.rb +7 -1
data/lib/skill_bench/agent/react_agent.rb +2 -1
data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
data/lib/skill_bench/cli/help_printer.rb +10 -2
data/lib/skill_bench/cli/init_command.rb +2 -1
data/lib/skill_bench/cli/result_printer.rb +1 -1
data/lib/skill_bench/cli/run_command.rb +47 -9
data/lib/skill_bench/cli/validate_command.rb +242 -0
data/lib/skill_bench/cli.rb +3 -0
data/lib/skill_bench/client.rb +43 -1
data/lib/skill_bench/clients/all.rb +3 -0
data/lib/skill_bench/clients/base_client.rb +14 -6
data/lib/skill_bench/clients/base_url_validator.rb +105 -0
data/lib/skill_bench/clients/provider_config.rb +34 -1
data/lib/skill_bench/clients/provider_schemas.rb +4 -0
data/lib/skill_bench/clients/providers/mistral.rb +47 -0
data/lib/skill_bench/clients/request_builder.rb +2 -4
data/lib/skill_bench/clients/response_builder.rb +91 -0
data/lib/skill_bench/clients/response_error_handler.rb +5 -17
data/lib/skill_bench/clients/retry_handler.rb +4 -7
data/lib/skill_bench/commands/init.rb +5 -0
data/lib/skill_bench/commands/skill_new.rb +3 -1
data/lib/skill_bench/config/applier.rb +2 -0
data/lib/skill_bench/config/defaults.rb +2 -0
data/lib/skill_bench/config/facade_readers.rb +7 -0
data/lib/skill_bench/config/facade_writers.rb +17 -0
data/lib/skill_bench/config/json_loader.rb +1 -1
data/lib/skill_bench/config/store.rb +29 -0
data/lib/skill_bench/config.rb +18 -0
data/lib/skill_bench/constants.rb +58 -0
data/lib/skill_bench/evaluation/runner.rb +20 -3
data/lib/skill_bench/execution/context_hydrator.rb +66 -15
data/lib/skill_bench/execution/sandbox.rb +76 -14
data/lib/skill_bench/judge/judge.rb +4 -0
data/lib/skill_bench/judge/prompt.rb +42 -6
data/lib/skill_bench/models/config.rb +32 -0
data/lib/skill_bench/output_formatter.rb +60 -1
data/lib/skill_bench/package_verifier.rb +1 -1
data/lib/skill_bench/rails/skill_templates.rb +19 -5
data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
data/lib/skill_bench/services/batch_runner_service.rb +111 -0
data/lib/skill_bench/services/compare_option_parser.rb +1 -0
data/lib/skill_bench/services/cost_calculator.rb +91 -0
data/lib/skill_bench/services/html_formatter.rb +289 -0
data/lib/skill_bench/services/json_formatter.rb +19 -1
data/lib/skill_bench/services/junit_formatter.rb +74 -24
data/lib/skill_bench/services/provider_resolver.rb +5 -2
data/lib/skill_bench/services/response_cache.rb +130 -0
data/lib/skill_bench/services/runner_service.rb +88 -4
data/lib/skill_bench/services/summary_formatter.rb +90 -0
data/lib/skill_bench/services/template_registry.rb +43 -9
data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
data/lib/skill_bench/tools/registry.rb +29 -3
data/lib/skill_bench/tools/run_command.rb +172 -35
data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
data/lib/skill_bench/trend_tracker.rb +5 -5
data/lib/skill_bench/version.rb +1 -1
data/lib/skill_bench.rb +3 -3
metadata +19 -36

data/lib/skill_bench/clients/base_url_validator.rb ADDED Viewed

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+require 'uri'
+module SkillBench
+  module Clients
+    # Validates a provider `base_url` before it is used to build an HTTP
+    # connection that may carry an API key / bearer token.
+    #
+    # Security rationale: `base_url` is taken verbatim from config/env input and
+    # the authenticated request attaches a credential to whatever host it names.
+    # Left unchecked this is an SSRF surface, and an `http://` URL would transmit
+    # the credential in cleartext. This service enforces:
+    #
+    # - the URL must be an absolute `http`/`https` URL with a host (empty/relative
+    #   /garbage values are rejected);
+    # - when a credential will be attached, non-loopback hosts MUST use `https`;
+    #   loopback hosts (`localhost`, `127.0.0.1`, `::1`) MAY use `http` — the
+    #   legitimate self-hosted/Ollama case — and an explicit opt-in
+    #   (`allow_insecure_base_url`) can permit cleartext for non-loopback hosts.
+    #
+    # A blank (`nil`/empty) `base_url` is allowed so providers may supply their
+    # own (https) default downstream. Error messages describe only the transport
+    # and never include the credential.
+    class BaseUrlValidator
+      # Hosts permitted to use cleartext `http` even with a credential attached.
+      # Entries are lowercase; the parsed host is downcased before comparison
+      # because host names are case-insensitive.
+      LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1].freeze
+      # Raised when a base URL is structurally invalid or would leak a credential
+      # over cleartext transport. The message never contains the credential.
+      class InvalidBaseURLError < StandardError; end
+      # Validates a base URL and returns it unchanged when valid.
+      #
+      # @param base_url [String, nil] the URL to validate; blank values are
+      #   returned as-is so a provider default can be applied later.
+      # @param has_credential [Boolean] whether a credential (api key/bearer
+      #   token) will be attached to requests sent to this URL.
+      # @param allow_insecure [Boolean] explicit opt-in that permits cleartext
+      #   `http` to a non-loopback host even when a credential is attached.
+      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
+      # @return [String, nil] the validated URL (blank input returned unchanged).
+      def self.call(base_url:, has_credential: false, allow_insecure: false)
+        new(base_url, has_credential, allow_insecure).call
+      end
+      # @param base_url [String, nil] the URL to validate.
+      # @param has_credential [Boolean] whether a credential will be attached.
+      # @param allow_insecure [Boolean] opt-in permitting cleartext non-loopback.
+      def initialize(base_url, has_credential, allow_insecure)
+        @base_url = base_url
+        @has_credential = has_credential
+        @allow_insecure = allow_insecure
+      end
+      # Runs the validation.
+      #
+      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
+      # @return [String, nil] the validated URL.
+      def call
+        return @base_url if blank?(@base_url)
+        validate_absolute_http_url!
+        validate_secure_transport!
+        @base_url
+      end
+      private
+      # @param value [Object] any value; it is stringified before the check.
+      # @return [Boolean] true for nil, empty, or whitespace-only values.
+      def blank?(value)
+        value.to_s.strip.empty?
+      end
+      # Parses the base URL once and memoizes the result.
+      #
+      # @return [URI::Generic, nil] the parsed URI, or nil when it cannot be parsed.
+      def uri
+        @uri ||= URI.parse(@base_url.to_s)
+      rescue URI::InvalidURIError
+        nil
+      end
+      # Requires an absolute http/https URL with a non-blank host.
+      # `URI::HTTPS` subclasses `URI::HTTP`, so one `is_a?` check covers both.
+      #
+      # @raise [InvalidBaseURLError] when the URL is unparsable, relative,
+      #   uses another scheme, or has no host.
+      # @return [nil]
+      def validate_absolute_http_url!
+        return if uri.is_a?(URI::HTTP) && !blank?(uri.hostname)
+        raise InvalidBaseURLError,
+              "Invalid provider base_url #{@base_url.inspect}: " \
+              'must be an absolute http(s) URL with a host.'
+      end
+      # Refuses cleartext http to a non-loopback host when a credential is
+      # attached, unless the caller explicitly opted in.
+      #
+      # @raise [InvalidBaseURLError] when the credential would travel in cleartext.
+      # @return [nil]
+      def validate_secure_transport!
+        return unless @has_credential
+        return if uri.scheme == 'https'
+        return if loopback?
+        return if @allow_insecure
+        raise InvalidBaseURLError,
+              'Insecure provider base_url: refusing to send a credential over cleartext http ' \
+              "to non-loopback host #{uri.hostname.inspect}. Use https, target a loopback host, " \
+              'or set allow_insecure_base_url: true to override.'
+      end
+      # @return [Boolean] true when the URL's host is one of {LOOPBACK_HOSTS}.
+      def loopback?
+        # URI.parse keeps the host's original case, so `LocalHost` must be
+        # normalized or it would be treated as a remote host.
+        LOOPBACK_HOSTS.include?(uri.hostname.to_s.downcase)
+      end
+    end
+  end
+end

data/lib/skill_bench/clients/provider_config.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative '../config'
+require_relative 'base_url_validator'
 module SkillBench
   module Clients
@@ -13,6 +14,8 @@ module SkillBench
         new(provider, options).call
       end
+      # @param provider [Symbol, String] provider identifier, coerced to a Symbol (e.g., :openai, :ollama)
+      # @param options [Hash] override options that take precedence over the loaded provider config
       def initialize(provider, options)
         @provider = provider.to_sym
         @options = options
@@ -21,8 +24,21 @@ module SkillBench
       # Loads and returns standardized provider configuration.
       #
+      # The resolved transport URLs (`base_url` and, for Azure, `endpoint`) are
+      # validated before being returned: they must be absolute http(s) URLs, and
+      # a credential is never sent over cleartext http to a non-loopback host.
+      #
+      # @raise [BaseUrlValidator::InvalidBaseURLError] when a transport URL is
+      #   structurally invalid or would leak the credential over cleartext http.
       # @return [Hash] Standardized configuration with api_key, model, base_url, etc.
       def call
+        validate_transport_urls!
+        standardized_config
+      end
+      private
+      def standardized_config
         {
           api_key: fetch_config(:api_key),
           model: fetch_config(:model),
@@ -37,7 +53,24 @@ module SkillBench
         }
       end
-      private
+      # Validates every transport URL that could carry the credential. Both
+      # `base_url` and Azure's `endpoint` are user-supplied URLs that the
+      # authenticated request targets, so both are checked with one helper.
+      #
+      # @raise [BaseUrlValidator::InvalidBaseURLError] on an invalid/insecure URL.
+      # @return [void]
+      def validate_transport_urls!
+        credential_present = !fetch_config(:api_key).to_s.empty?
+        insecure_opt_in = truthy?(fetch_config(:allow_insecure_base_url))
+        # Resolve both transport URLs first, then run each through the validator.
+        transport_urls = %i[base_url endpoint].map { |key| fetch_config(key) }
+        transport_urls.each do |candidate|
+          BaseUrlValidator.call(
+            base_url: candidate,
+            has_credential: credential_present,
+            allow_insecure: insecure_opt_in
+          )
+        end
+      end
+      # Interprets a config value as a Boolean opt-in.
+      #
+      # @param value [Object] raw config value (Boolean, String, nil, ...).
+      # @return [Boolean] true for `true` or any casing of the String "true"
+      #   (surrounding whitespace ignored); false otherwise.
+      def truthy?(value)
+        return true if value == true
+        value.to_s.strip.casecmp?('true')
+      end
       def fetch_config(key)
         @options[key] || @config[key]

data/lib/skill_bench/clients/provider_schemas.rb CHANGED Viewed

@@ -39,6 +39,10 @@ module SkillBench
           api_key: nil,
           model: 'deepseek-chat'
         }.freeze,
+        mistral: {
+          api_key: nil,
+          model: 'mistral-large-latest'
+        }.freeze,
         opencode: {
           api_key: nil,
           model: 'opencode-model',

data/lib/skill_bench/clients/providers/mistral.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+require_relative '../base_client'
+require_relative '../provider_registry'
+module SkillBench
+  module Clients
+    module Providers
+      # Mistral (la Plateforme) LLM client.
+      # Uses Mistral's OpenAI-compatible chat completions API with bearer-token auth.
+      #
+      # NOTE: AWS Bedrock access to Mistral models (which requires SigV4 request
+      # signing rather than a static bearer token) is intentionally not handled
+      # here and is left as a follow-up.
+      class Mistral < BaseClient
+        # Default API host; deliberately carries no path component.
+        API_HOST = 'https://api.mistral.ai'
+        # Default chat completions path, including the `/v1` version segment.
+        CHAT_COMPLETIONS_PATH = '/v1/chat/completions'
+        private_constant :API_HOST, :CHAT_COMPLETIONS_PATH
+        SkillBench::Clients::ProviderRegistry.register(:mistral, self)
+        # Identifies this client's provider.
+        #
+        # @return [Symbol] always `:mistral`
+        def provider_name
+          :mistral
+        end
+        protected
+        # Base URL used to build the connection.
+        #
+        # Mistral's API lives under https://api.mistral.ai/v1, but the `/v1`
+        # segment is kept in {#request_path} rather than here: an absolute
+        # request path replaces any path component of the Faraday connection
+        # base URL, so a version segment placed here would be dropped.
+        #
+        # @return [String] the configured base URL, or the default host
+        def base_url
+          @base_url_config || API_HOST
+        end
+        # Request path for the chat completions endpoint.
+        #
+        # @return [String] the configured path, or the default versioned path
+        def request_path
+          @request_path_config || CHAT_COMPLETIONS_PATH
+        end
+      end
+    end
+  end
+end

data/lib/skill_bench/clients/request_builder.rb CHANGED Viewed

@@ -1,22 +1,20 @@
 # frozen_string_literal: true
 require 'faraday'
+require_relative '../constants'
 module SkillBench
   module Clients
     # Builds and executes HTTP requests to LLM provider APIs.
     # Encapsulates Faraday connection setup and request execution.
     class RequestBuilder
-      DEFAULT_OPEN_TIMEOUT = 10
-      DEFAULT_TIMEOUT = 120
       # Creates a Faraday connection with JSON middleware.
       #
       # @param base_url [String] The API base URL
       # @param open_timeout [Integer] Connection open timeout in seconds
       # @param timeout [Integer] Request timeout in seconds
       # @return [Faraday::Connection] Configured Faraday connection
-      def self.build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
+      def self.build_connection(base_url, open_timeout: Constants::HttpClient::DEFAULT_OPEN_TIMEOUT, timeout: Constants::HttpClient::DEFAULT_TIMEOUT)
         Faraday.new(url: base_url) do |f|
           f.request :json
           f.response :json

data/lib/skill_bench/clients/response_builder.rb ADDED Viewed

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+module SkillBench
+  module Clients
+    # Service object for building standardized response hashes.
+    # Eliminates duplication of error response formatting across the codebase.
+    class ResponseBuilder
+      class << self
+        # Builds the generic failure hash that the prefixed builders share.
+        #
+        # @param message [String] The error message.
+        # @param status [String] The status identifier (default: 'error').
+        # @return [Hash] Hash with :success, :response, :result and :status.
+        def error(message:, status: 'error')
+          {
+            success: false,
+            response: { error: { message: message } },
+            result: message,
+            status: status
+          }
+        end
+        # Builds the success hash.
+        #
+        # @param content [String] The response content.
+        # @param metadata [Hash] Extra entries merged into the :response hash.
+        # @return [Hash] Hash with :success, :result, :response and :status.
+        def success(content:, metadata: {})
+          payload = { content: content }.merge(metadata)
+          {
+            success: true,
+            result: content,
+            response: payload,
+            status: 'success'
+          }
+        end
+        # Builds an API failure hash; the message is prefixed with "API Error: "
+        # and the token usage is carried alongside it.
+        #
+        # @param error_message [String] The API error message.
+        # @param usage [Hash] Token usage information.
+        # @return [Hash] Hash with :success, :result, :usage, :response and :status.
+        def api_error(error_message:, usage: {})
+          text = "API Error: #{error_message}"
+          {
+            success: false,
+            result: text,
+            usage: usage,
+            response: { error: { message: text } },
+            status: 'error'
+          }
+        end
+        # Builds a failure hash whose message is prefixed with "Network Error: ".
+        #
+        # @param error_message [String] The network error message.
+        # @return [Hash] Standardized network error response hash.
+        def network_error(error_message:)
+          error(message: "Network Error: #{error_message}")
+        end
+        # Builds a failure hash whose message is prefixed with "Parsing Error: ".
+        #
+        # @param error_message [String] The parsing error message.
+        # @return [Hash] Standardized parsing error response hash.
+        def parsing_error(error_message:)
+          error(message: "Parsing Error: #{error_message}")
+        end
+        # Builds a failure hash whose message is prefixed with "Unexpected Error: ".
+        #
+        # @param error_message [String] The unexpected error message.
+        # @return [Hash] Standardized unexpected error response hash.
+        def unexpected_error(error_message:)
+          error(message: "Unexpected Error: #{error_message}")
+        end
+      end
+    end
+  end
+end

data/lib/skill_bench/clients/response_error_handler.rb CHANGED Viewed

@@ -23,14 +23,8 @@ module SkillBench
           error_msg += " - #{detail}"
         end
-        {
-          success: false,
-          result: error_msg,
-          usage: usage_extractor.call(parsed),
-          response: { error: { message: error_msg } },
-          status: 'error',
-          code: response.status
-        }
+        base_response = ResponseBuilder.api_error(error_message: error_msg, usage: usage_extractor.call(parsed))
+        base_response.merge(code: response.status)
       end
       # Creates an error response when the LLM response has no message content.
@@ -41,14 +35,8 @@ module SkillBench
       # @return [Hash] Standardized error response
       def self.missing_message_response(response, parsed, &usage_extractor)
         error_msg = 'LLM response missing message content'
-        {
-          success: false,
-          result: error_msg,
-          usage: usage_extractor.call(parsed),
-          response: { error: { message: error_msg } },
-          status: 'error',
-          code: response.status
-        }
+        base_response = ResponseBuilder.error(message: error_msg)
+        base_response.merge(usage: usage_extractor.call(parsed), code: response.status)
       end
       # Handles an exception by logging and returning a standardized error response.
@@ -58,7 +46,7 @@ module SkillBench
       # @return [Hash] Standardized error response
       def self.handle_exception(error, type)
         log_error(error)
-        { success: false, result: "#{type}: #{error.message}", status: 'error' }
+        ResponseBuilder.error(message: "#{type}: #{error.message}")
       end
       # Logs an error message and backtrace to Rails.logger or stderr.

data/lib/skill_bench/clients/retry_handler.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'faraday'
 require_relative '../error_logger'
+require_relative '../constants'
 module SkillBench
   module Clients
@@ -9,10 +10,6 @@ module SkillBench
     # Retries on transient errors (429, 503). Raises permanent errors immediately.
     # Returns the block result on success.
     class RetryHandler
-      RETRYABLE_STATUSES = [429, 503].freeze
-      MAX_DELAY = 30 # Maximum delay cap in seconds
       # Executes the given block with retry logic.
       #
       # @param max_attempts [Integer] Maximum number of attempts (default: 3).
@@ -21,7 +18,7 @@ module SkillBench
       # @return [Object] The block's return value on success.
       # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
       # @raise [ArgumentError] if no block is given or max_attempts < 1.
-      def self.call(max_attempts: 3, base_delay: 1, &block)
+      def self.call(max_attempts: Constants::HttpClient::DEFAULT_MAX_RETRIES, base_delay: Constants::HttpClient::DEFAULT_RETRY_DELAY, &block)
         raise ArgumentError, 'RetryHandler requires a block' unless block
         raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1
@@ -59,11 +56,11 @@ module SkillBench
       private
       def retryable?(status, attempt)
-        RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
+        Constants::HttpClient::RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
       end
       def compute_delay(attempt)
-        [@base_delay * (2**(attempt - 1)), MAX_DELAY].min
+        [@base_delay * (2**(attempt - 1)), Constants::ReactAgent::DEFAULT_MAX_DELAY].min
       end
       def extract_status(error)

data/lib/skill_bench/commands/init.rb CHANGED Viewed

@@ -24,10 +24,15 @@ module SkillBench
       # Generates configuration hash for a specific provider.
       #
+      # The built-in `:mock` provider needs no credentials, so it produces a
+      # minimal offline config without a nested `config:` block.
+      #
       # @param provider [Symbol] LLM provider name
       # @return [Hash] Single-provider configuration
       # @raise [ArgumentError] if provider is not registered
       def self.config_for_provider(provider)
+        return { provider: :mock, max_execution_time: 30 } if provider == :mock
         {
           provider: provider,
           max_execution_time: 30,

data/lib/skill_bench/commands/skill_new.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # frozen_string_literal: true
 require 'fileutils'
-require_relative '../rails/skill_templates'
 module SkillBench
   module Commands
@@ -107,6 +106,9 @@ module SkillBench
         file_name = RAILS_TEMPLATES[template]
         raise ArgumentError, "Invalid template: #{template}. Use one of: #{RAILS_TEMPLATES.keys.join(', ')}." unless file_name
+        # Lazily load the scaffold generator so a normal `skill-bench run` does
+        # not pull it (and its dependencies) in at boot.
+        require_relative '../rails/skill_templates'
         content = Rails::SkillTemplates.public_send(template.to_sym, name)
         File.write(File.join(path, file_name), content)
       end

data/lib/skill_bench/config/applier.rb CHANGED Viewed

@@ -41,6 +41,8 @@ module SkillBench
         assign_current_provider
         @store.assign_max_execution_time(@data[:max_execution_time]) if @data.key?(:max_execution_time)
         @store.assign_allowed_commands(@data[:allowed_commands]) if @data.key?(:allowed_commands)
+        @store.assign_allow_host_execution(@data[:allow_host_execution]) if @data.key?(:allow_host_execution)
+        @store.assign_command_argument_constraints(@data[:command_argument_constraints]) if @data.key?(:command_argument_constraints)
         @store.skill_sources = @data[:skill_sources] if @data.key?(:skill_sources)
       end

data/lib/skill_bench/config/defaults.rb CHANGED Viewed

@@ -19,6 +19,8 @@ module SkillBench
           current_llm_provider: :openai,
           max_execution_time: 30,
           allowed_commands: nil,
+          allow_host_execution: false,
+          command_argument_constraints: {},
           skill_sources: {},
           llm_providers_config: {
             openai: { api_key: nil, model: 'gpt-4o' },

data/lib/skill_bench/config/facade_readers.rb CHANGED Viewed

@@ -25,6 +25,13 @@ module SkillBench
         store.allowed_commands
       end
+      # Returns whether un-isolated host command execution is permitted.
+      #
+      # @return [Boolean, nil] true when host execution is explicitly allowed
+      def allow_host_execution
+        store.allow_host_execution
+      end
       # Returns provider configuration.
       #
       # @return [Hash] provider configuration by provider name

data/lib/skill_bench/config/facade_writers.rb CHANGED Viewed

@@ -102,6 +102,23 @@ module SkillBench
         store.assign_allowed_commands(value)
       end
+      # Sets whether un-isolated host command execution is permitted.
+      #
+      # @param value [Boolean] true to permit un-isolated host execution
+      # @return [Boolean] assigned host execution flag
+      def allow_host_execution=(value)
+        store.assign_allow_host_execution(value)
+      end
+      # Sets the optional per-command argument constraints.
+      #
+      # @param value [Hash, nil] base command => disallowed argument
+      #   substrings/flags
+      # @return [Hash, nil] assigned constraints
+      def command_argument_constraints=(value)
+        store.assign_command_argument_constraints(value)
+      end
       # Replaces provider configuration.
       #
       # @param value [Hash] provider configuration

data/lib/skill_bench/config/json_loader.rb CHANGED Viewed

@@ -29,7 +29,7 @@ module SkillBench
         data = JSON.parse(File.read(@path), symbolize_names: true)
         return warn_invalid_config unless data.is_a?(Hash)
-        success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :skill_sources).compact
+        success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :allow_host_execution, :command_argument_constraints, :skill_sources).compact
         success_data[:current_llm_provider] ||= data[:provider] if data.key?(:provider)
         success(success_data.merge(providers: normalized_providers(data[:providers])))
       rescue JSON::ParserError => e

data/lib/skill_bench/config/store.rb CHANGED Viewed

@@ -19,6 +19,18 @@ module SkillBench
       # @return [Array<String>, nil] allowed commands
       attr_accessor :allowed_commands
+      # Returns whether running commands directly on the host is permitted
+      # when no real sandbox isolation (container) is active.
+      #
+      # @return [Boolean, nil] true when host execution is explicitly allowed
+      attr_reader :allow_host_execution
+      # Returns the optional per-command argument constraints.
+      #
+      # @return [Hash, nil] base command => disallowed argument
+      #   substrings/flags, or nil when unconfigured
+      attr_reader :command_argument_constraints
       # Returns provider configuration.
       #
       # @return [Hash, nil] provider configuration by provider name
@@ -109,6 +121,23 @@ module SkillBench
         @allowed_commands = value
       end
+      # Sets whether host command execution is permitted without isolation.
+      #
+      # @param value [Boolean] true to permit un-isolated host execution
+      # @return [Boolean] assigned host execution flag
+      def assign_allow_host_execution(value)
+        @allow_host_execution = value
+      end
+      # Sets the optional per-command argument constraints.
+      #
+      # @param value [Hash, nil] base command => disallowed argument
+      #   substrings/flags
+      # @return [Hash, nil] assigned constraints
+      def assign_command_argument_constraints(value)
+        @command_argument_constraints = value
+      end
       # Sets provider configuration.
       #
       # @param value [Hash] provider configuration

data/lib/skill_bench/config.rb CHANGED Viewed

@@ -95,6 +95,24 @@ module SkillBench
         store.allowed_commands
       end
+      # Returns whether commands may run directly on the host when no sandbox
+      # isolation (container) is active. Defaults to false (fail closed).
+      #
+      # @return [Boolean] true when un-isolated host execution is explicitly enabled
+      def allow_host_execution
+        # Fail closed: only a literal `true` enables un-isolated host execution.
+        # The store keeps whatever value the config supplied, so a truthy
+        # non-Boolean (for example the String "false") must not enable it.
+        store.allow_host_execution == true
+      end
+      # Returns the optional per-command argument constraints.
+      #
+      # When unconfigured, returns an empty Hash meaning no argument constraints
+      # apply (the allowlist remains the only command-authorization control).
+      #
+      # @return [Hash] base command => disallowed argument substrings/flags
+      def command_argument_constraints
+        store.command_argument_constraints || {}
+      end
       # Returns max execution time from configuration.
       #
       # @return [Integer] Maximum execution time in seconds

data/lib/skill_bench/constants.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+module SkillBench
+  # Centralized configuration constants for the SkillBench system.
+  # This eliminates magic numbers and provides a single source of truth
+  # for configurable values across the codebase.
+  module Constants
+    # ReAct Agent Configuration
+    module ReactAgent
+      # NOTE(review): presumably the default cap on agent loop iterations —
+      # confirm against the loop runner.
+      DEFAULT_MAX_ITERATIONS = 25
+      # Also read by Clients::RetryHandler#compute_delay as the upper bound on
+      # its exponential backoff delay.
+      DEFAULT_MAX_DELAY = 30 # Maximum delay cap in seconds for retry logic
+    end
+    # HTTP Client Configuration
+    module HttpClient
+      # Default connection-open timeout (seconds) for
+      # Clients::RequestBuilder.build_connection.
+      DEFAULT_OPEN_TIMEOUT = 10
+      # Default request timeout (seconds) for
+      # Clients::RequestBuilder.build_connection.
+      DEFAULT_TIMEOUT = 120
+      # Default `max_attempts` for Clients::RetryHandler.call.
+      DEFAULT_MAX_RETRIES = 3
+      # Default `base_delay` for Clients::RetryHandler.call; doubled per attempt.
+      DEFAULT_RETRY_DELAY = 1
+      # HTTP statuses Clients::RetryHandler treats as transient and retries.
+      RETRYABLE_STATUSES = [429, 503].freeze
+    end
+    # Context Hydration Configuration
+    module ContextHydration
+      MAX_FILE_SIZE = 50_000 # Maximum file size in bytes
+      MAX_TOTAL_CONTEXT_SIZE = 1_000_000 # Maximum total context size in bytes (1MB)
+      # NOTE(review): presumably the file extensions treated as text when
+      # hydrating context — confirm against the context hydrator.
+      TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
+    end
+    # Sandbox Configuration
+    module Sandbox
+      # NOTE(review): presumably the Docker image the sandbox runs — confirm
+      # against the sandbox implementation.
+      DOCKER_IMAGE_NAME = 'evaluator-sandbox'
+    end
+    # Tool Execution Configuration
+    module Tools
+      # Command names flagged as dangerous: shells, language interpreters,
+      # network clients, privilege escalation, and disk/kernel/service/account
+      # administration tools.
+      # NOTE(review): how this list is enforced is not visible here — confirm
+      # against the run_command tool.
+      DANGEROUS_COMMANDS = %w[
+        bash sh zsh fish dash ksh csh tcsh
+        python python3 python2 ruby perl node
+        php lua tcl wish
+        curl wget nc ncat socat
+        eval exec
+        sudo su doas
+        chmod chown mount umount
+        dd mkfs fdisk parted
+        insmod rmmod modprobe
+        systemctl service
+        passwd useradd userdel groupadd groupdel
+      ].freeze
+    end
+    # File Path Configuration
+    module FilePath
+      # Matches a whole string made only of ASCII letters, digits, `.`, `_`,
+      # `-` and `/`. It does not by itself reject `..` path segments.
+      ALLOWED_PATH_PATTERN = %r{\A[a-zA-Z0-9._\-/]+\z}
+      # NOTE(review): presumably the maximum accepted path length in
+      # characters — confirm against the caller.
+      MAX_PATH_LENGTH = 4096
+    end
+  end
+end