ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/README.md +299 -23
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/agent/react_agent.rb +2 -1
  9. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  10. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  11. data/lib/skill_bench/cli/help_printer.rb +10 -2
  12. data/lib/skill_bench/cli/init_command.rb +2 -1
  13. data/lib/skill_bench/cli/result_printer.rb +1 -1
  14. data/lib/skill_bench/cli/run_command.rb +47 -9
  15. data/lib/skill_bench/cli/validate_command.rb +242 -0
  16. data/lib/skill_bench/cli.rb +3 -0
  17. data/lib/skill_bench/client.rb +43 -1
  18. data/lib/skill_bench/clients/all.rb +3 -0
  19. data/lib/skill_bench/clients/base_client.rb +14 -6
  20. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  21. data/lib/skill_bench/clients/provider_config.rb +34 -1
  22. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  23. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  24. data/lib/skill_bench/clients/request_builder.rb +2 -4
  25. data/lib/skill_bench/clients/response_builder.rb +91 -0
  26. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  27. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  28. data/lib/skill_bench/commands/init.rb +5 -0
  29. data/lib/skill_bench/commands/skill_new.rb +3 -1
  30. data/lib/skill_bench/config/applier.rb +2 -0
  31. data/lib/skill_bench/config/defaults.rb +2 -0
  32. data/lib/skill_bench/config/facade_readers.rb +7 -0
  33. data/lib/skill_bench/config/facade_writers.rb +17 -0
  34. data/lib/skill_bench/config/json_loader.rb +1 -1
  35. data/lib/skill_bench/config/store.rb +29 -0
  36. data/lib/skill_bench/config.rb +18 -0
  37. data/lib/skill_bench/constants.rb +58 -0
  38. data/lib/skill_bench/evaluation/runner.rb +20 -3
  39. data/lib/skill_bench/execution/context_hydrator.rb +66 -15
  40. data/lib/skill_bench/execution/sandbox.rb +76 -14
  41. data/lib/skill_bench/judge/judge.rb +4 -0
  42. data/lib/skill_bench/judge/prompt.rb +42 -6
  43. data/lib/skill_bench/models/config.rb +32 -0
  44. data/lib/skill_bench/output_formatter.rb +60 -1
  45. data/lib/skill_bench/package_verifier.rb +1 -1
  46. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  47. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  48. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  49. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  50. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  51. data/lib/skill_bench/services/html_formatter.rb +289 -0
  52. data/lib/skill_bench/services/json_formatter.rb +19 -1
  53. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  54. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  55. data/lib/skill_bench/services/response_cache.rb +130 -0
  56. data/lib/skill_bench/services/runner_service.rb +88 -4
  57. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  58. data/lib/skill_bench/services/template_registry.rb +43 -9
  59. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  60. data/lib/skill_bench/tools/registry.rb +29 -3
  61. data/lib/skill_bench/tools/run_command.rb +172 -35
  62. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  63. data/lib/skill_bench/trend_tracker.rb +5 -5
  64. data/lib/skill_bench/version.rb +1 -1
  65. data/lib/skill_bench.rb +3 -3
  66. metadata +19 -36
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'uri'
4
+
5
module SkillBench
  module Clients
    # Validates a provider `base_url` before it is used to build an HTTP
    # connection that may carry an API key / bearer token.
    #
    # Security rationale: `base_url` is taken verbatim from config/env input and
    # the authenticated request attaches a credential to whatever host it names.
    # Left unchecked this is an SSRF surface, and an `http://` URL would transmit
    # the credential in cleartext. This service enforces:
    #
    # - the URL must be an absolute `http`/`https` URL with a host (empty/relative
    #   /garbage values are rejected);
    # - when a credential will be attached, non-loopback hosts MUST use `https`;
    #   loopback hosts (`localhost`, `127.0.0.1`, `::1`) MAY use `http` — the
    #   legitimate self-hosted/Ollama case — and an explicit opt-in
    #   (`allow_insecure_base_url`) can permit cleartext for non-loopback hosts.
    #
    # Loopback host names are matched case-insensitively, because host names
    # are case-insensitive and `URI.parse` keeps the host exactly as written.
    #
    # A blank (`nil`/empty) `base_url` is allowed so providers may supply their
    # own (https) default downstream. Error messages describe only the transport
    # and never include the credential.
    class BaseUrlValidator
      # Hosts permitted to use cleartext `http` even with a credential attached.
      # Entries are lowercase; candidates are lower-cased before comparison.
      LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1].freeze

      # Raised when a base URL is structurally invalid or would leak a credential
      # over cleartext transport. The message never contains the credential.
      class InvalidBaseURLError < StandardError; end

      # Validates a base URL and returns it unchanged when valid.
      #
      # @param base_url [String, nil] the URL to validate; blank values are
      #   returned as-is so a provider default can be applied later.
      # @param has_credential [Boolean] whether a credential (api key/bearer
      #   token) will be attached to requests sent to this URL.
      # @param allow_insecure [Boolean] explicit opt-in that permits cleartext
      #   `http` to a non-loopback host even when a credential is attached.
      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
      # @return [String, nil] the validated URL (blank input returned unchanged).
      def self.call(base_url:, has_credential: false, allow_insecure: false)
        new(base_url, has_credential, allow_insecure).call
      end

      # @param base_url [String, nil] the URL to validate.
      # @param has_credential [Boolean] whether a credential will be attached.
      # @param allow_insecure [Boolean] opt-in permitting cleartext non-loopback.
      def initialize(base_url, has_credential, allow_insecure)
        @base_url = base_url
        @has_credential = has_credential
        @allow_insecure = allow_insecure
      end

      # Runs the validation.
      #
      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
      # @return [String, nil] the validated URL.
      def call
        return @base_url if blank?(@base_url)

        validate_absolute_http_url!
        validate_secure_transport!
        @base_url
      end

      private

      # @param value [Object] any value responding to `to_s`
      # @return [Boolean] true when the value is nil, empty or whitespace-only
      def blank?(value)
        value.to_s.strip.empty?
      end

      # Parses `@base_url` once and caches the outcome, including a failed
      # parse (cached as nil), so an unparseable URL is not re-parsed.
      #
      # @return [URI::Generic, nil] the parsed URI, or nil when unparseable
      def uri
        return @uri if defined?(@uri)

        @uri = begin
          URI.parse(@base_url.to_s)
        rescue URI::InvalidURIError
          nil
        end
      end

      # Requires an http/https URI (URI::HTTPS is a URI::HTTP subclass) that
      # names a host.
      #
      # @raise [InvalidBaseURLError] when the URL is not an absolute http(s) URL
      # @return [void]
      def validate_absolute_http_url!
        return if uri.is_a?(URI::HTTP) && !blank?(uri.hostname)

        raise InvalidBaseURLError,
              "Invalid provider base_url #{@base_url.inspect}: " \
              'must be an absolute http(s) URL with a host.'
      end

      # Refuses cleartext http when a credential would be sent to a
      # non-loopback host, unless the caller explicitly opted in.
      #
      # @raise [InvalidBaseURLError] when the credential would travel in cleartext
      # @return [void]
      def validate_secure_transport!
        return unless @has_credential
        return if uri.scheme == 'https'
        return if loopback?
        return if @allow_insecure

        raise InvalidBaseURLError,
              'Insecure provider base_url: refusing to send a credential over cleartext http ' \
              "to non-loopback host #{uri.hostname.inspect}. Use https, target a loopback host, " \
              'or set allow_insecure_base_url: true to override.'
      end

      # `URI#hostname` strips IPv6 brackets but preserves case, so the host is
      # lower-cased before comparing against the lowercase allow-list.
      #
      # @return [Boolean] true when the URL targets a known loopback host
      def loopback?
        LOOPBACK_HOSTS.include?(uri.hostname.to_s.downcase)
      end
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative '../config'
4
+ require_relative 'base_url_validator'
4
5
 
5
6
  module SkillBench
6
7
  module Clients
@@ -13,6 +14,8 @@ module SkillBench
13
14
  new(provider, options).call
14
15
  end
15
16
 
17
+ # @param provider [Symbol, String] provider identifier, coerced to a Symbol (e.g., :openai, :ollama)
18
+ # @param options [Hash] override options that take precedence over the loaded provider config
16
19
  def initialize(provider, options)
17
20
  @provider = provider.to_sym
18
21
  @options = options
@@ -21,8 +24,21 @@ module SkillBench
21
24
 
22
25
  # Loads and returns standardized provider configuration.
23
26
  #
27
+ # The resolved transport URLs (`base_url` and, for Azure, `endpoint`) are
28
+ # validated before being returned: they must be absolute http(s) URLs, and
29
+ # a credential is never sent over cleartext http to a non-loopback host.
30
+ #
31
+ # @raise [BaseUrlValidator::InvalidBaseURLError] when a transport URL is
32
+ # structurally invalid or would leak the credential over cleartext http.
24
33
  # @return [Hash] Standardized configuration with api_key, model, base_url, etc.
25
34
  def call
35
+ validate_transport_urls!
36
+ standardized_config
37
+ end
38
+
39
+ private
40
+
41
+ def standardized_config
26
42
  {
27
43
  api_key: fetch_config(:api_key),
28
44
  model: fetch_config(:model),
@@ -37,7 +53,24 @@ module SkillBench
37
53
  }
38
54
  end
39
55
 
40
- private
56
+ # Validates every transport URL that could carry the credential. Both
57
+ # `base_url` and Azure's `endpoint` are user-supplied URLs that the
58
+ # authenticated request targets, so both are checked with one helper.
59
+ #
60
+ # @raise [BaseUrlValidator::InvalidBaseURLError] on an invalid/insecure URL.
61
+ # @return [void]
62
+ def validate_transport_urls!
63
+ has_credential = !fetch_config(:api_key).to_s.empty?
64
+ allow_insecure = truthy?(fetch_config(:allow_insecure_base_url))
65
+
66
+ [fetch_config(:base_url), fetch_config(:endpoint)].each do |url|
67
+ BaseUrlValidator.call(base_url: url, has_credential: has_credential, allow_insecure: allow_insecure)
68
+ end
69
+ end
70
+
71
+ def truthy?(value)
72
+ value == true || value.to_s.strip.casecmp?('true')
73
+ end
41
74
 
42
75
  def fetch_config(key)
43
76
  @options[key] || @config[key]
@@ -39,6 +39,10 @@ module SkillBench
39
39
  api_key: nil,
40
40
  model: 'deepseek-chat'
41
41
  }.freeze,
42
+ mistral: {
43
+ api_key: nil,
44
+ model: 'mistral-large-latest'
45
+ }.freeze,
42
46
  opencode: {
43
47
  api_key: nil,
44
48
  model: 'opencode-model',
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../base_client'
4
+ require_relative '../provider_registry'
5
+
6
module SkillBench
  module Clients
    module Providers
      # Client for Mistral (la Plateforme).
      # Talks to Mistral's OpenAI-compatible chat completions API using
      # bearer-token authentication.
      #
      # NOTE: Mistral models served through AWS Bedrock need SigV4 request
      # signing instead of a static bearer token; that path is deliberately
      # out of scope here and left as a follow-up.
      class Mistral < BaseClient
        SkillBench::Clients::ProviderRegistry.register(:mistral, self)

        # Identifies this provider.
        #
        # @return [Symbol] always `:mistral`
        def provider_name
          :mistral
        end

        protected

        # Base URL used for the Mistral API connection.
        #
        # The documented API base is https://api.mistral.ai/v1, but the `/v1`
        # segment is kept in {#request_path}: an absolute request path replaces
        # any path component of the connection base URL, so Faraday would
        # otherwise drop it.
        #
        # @return [String] the configured override, or the Mistral default
        def base_url
          return @base_url_config if @base_url_config

          'https://api.mistral.ai'
        end

        # Path of the chat completions endpoint.
        #
        # @return [String] the configured override, or the default path
        def request_path
          return @request_path_config if @request_path_config

          '/v1/chat/completions'
        end
      end
    end
  end
end
@@ -1,22 +1,20 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'faraday'
4
+ require_relative '../constants'
4
5
 
5
6
  module SkillBench
6
7
  module Clients
7
8
  # Builds and executes HTTP requests to LLM provider APIs.
8
9
  # Encapsulates Faraday connection setup and request execution.
9
10
  class RequestBuilder
10
- DEFAULT_OPEN_TIMEOUT = 10
11
- DEFAULT_TIMEOUT = 120
12
-
13
11
  # Creates a Faraday connection with JSON middleware.
14
12
  #
15
13
  # @param base_url [String] The API base URL
16
14
  # @param open_timeout [Integer] Connection open timeout in seconds
17
15
  # @param timeout [Integer] Request timeout in seconds
18
16
  # @return [Faraday::Connection] Configured Faraday connection
19
- def self.build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
17
+ def self.build_connection(base_url, open_timeout: Constants::HttpClient::DEFAULT_OPEN_TIMEOUT, timeout: Constants::HttpClient::DEFAULT_TIMEOUT)
20
18
  Faraday.new(url: base_url) do |f|
21
19
  f.request :json
22
20
  f.response :json
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Clients
    # Service object for building standardized response hashes.
    # Eliminates duplication of error response formatting across the codebase.
    #
    # Every prefixed error builder (network/parsing/unexpected) delegates to
    # {.error}, so the error hash shape is defined in exactly one place.
    class ResponseBuilder
      # Builds a standardized error response.
      #
      # @param message [String] The error message.
      # @param status [String] The status identifier (default: 'error').
      # @return [Hash] Standardized error response hash.
      def self.error(message:, status: 'error')
        {
          success: false,
          response: { error: { message: message } },
          result: message,
          status: status
        }
      end

      # Builds a standardized success response.
      #
      # Metadata keys are merged into the `response` hash; a `:content` key in
      # metadata overrides the content stored there.
      #
      # @param content [String] The response content.
      # @param metadata [Hash, nil] Additional metadata to include in response;
      #   nil is treated as no metadata.
      # @return [Hash] Standardized success response hash.
      def self.success(content:, metadata: {})
        {
          success: true,
          result: content,
          # `Hash#merge(nil)` raises TypeError, so fall back to an empty hash.
          response: { content: content }.merge(metadata || {}),
          status: 'success'
        }
      end

      # Builds a standardized API error response.
      #
      # @param error_message [String] The API error message.
      # @param usage [Hash] Token usage information.
      # @return [Hash] Standardized API error response hash.
      def self.api_error(error_message:, usage: {})
        full_message = "API Error: #{error_message}"
        {
          success: false,
          result: full_message,
          usage: usage,
          response: { error: { message: full_message } },
          status: 'error'
        }
      end

      # Builds a standardized network error response.
      #
      # @param error_message [String] The network error message.
      # @return [Hash] Standardized network error response hash.
      def self.network_error(error_message:)
        error(message: "Network Error: #{error_message}")
      end

      # Builds a standardized parsing error response.
      #
      # @param error_message [String] The parsing error message.
      # @return [Hash] Standardized parsing error response hash.
      def self.parsing_error(error_message:)
        error(message: "Parsing Error: #{error_message}")
      end

      # Builds a standardized unexpected error response.
      #
      # @param error_message [String] The unexpected error message.
      # @return [Hash] Standardized unexpected error response hash.
      def self.unexpected_error(error_message:)
        error(message: "Unexpected Error: #{error_message}")
      end
    end
  end
end
@@ -23,14 +23,8 @@ module SkillBench
23
23
  error_msg += " - #{detail}"
24
24
  end
25
25
 
26
- {
27
- success: false,
28
- result: error_msg,
29
- usage: usage_extractor.call(parsed),
30
- response: { error: { message: error_msg } },
31
- status: 'error',
32
- code: response.status
33
- }
26
+ base_response = ResponseBuilder.api_error(error_message: error_msg, usage: usage_extractor.call(parsed))
27
+ base_response.merge(code: response.status)
34
28
  end
35
29
 
36
30
  # Creates an error response when the LLM response has no message content.
@@ -41,14 +35,8 @@ module SkillBench
41
35
  # @return [Hash] Standardized error response
42
36
  def self.missing_message_response(response, parsed, &usage_extractor)
43
37
  error_msg = 'LLM response missing message content'
44
- {
45
- success: false,
46
- result: error_msg,
47
- usage: usage_extractor.call(parsed),
48
- response: { error: { message: error_msg } },
49
- status: 'error',
50
- code: response.status
51
- }
38
+ base_response = ResponseBuilder.error(message: error_msg)
39
+ base_response.merge(usage: usage_extractor.call(parsed), code: response.status)
52
40
  end
53
41
 
54
42
  # Handles an exception by logging and returning a standardized error response.
@@ -58,7 +46,7 @@ module SkillBench
58
46
  # @return [Hash] Standardized error response
59
47
  def self.handle_exception(error, type)
60
48
  log_error(error)
61
- { success: false, result: "#{type}: #{error.message}", status: 'error' }
49
+ ResponseBuilder.error(message: "#{type}: #{error.message}")
62
50
  end
63
51
 
64
52
  # Logs an error message and backtrace to Rails.logger or stderr.
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'faraday'
4
4
  require_relative '../error_logger'
5
+ require_relative '../constants'
5
6
 
6
7
  module SkillBench
7
8
  module Clients
@@ -9,10 +10,6 @@ module SkillBench
9
10
  # Retries on transient errors (429, 503). Raises permanent errors immediately.
10
11
  # Returns the block result on success.
11
12
  class RetryHandler
12
- RETRYABLE_STATUSES = [429, 503].freeze
13
-
14
- MAX_DELAY = 30 # Maximum delay cap in seconds
15
-
16
13
  # Executes the given block with retry logic.
17
14
  #
18
15
  # @param max_attempts [Integer] Maximum number of attempts (default: 3).
@@ -21,7 +18,7 @@ module SkillBench
21
18
  # @return [Object] The block's return value on success.
22
19
  # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
23
20
  # @raise [ArgumentError] if no block is given or max_attempts < 1.
24
- def self.call(max_attempts: 3, base_delay: 1, &block)
21
+ def self.call(max_attempts: Constants::HttpClient::DEFAULT_MAX_RETRIES, base_delay: Constants::HttpClient::DEFAULT_RETRY_DELAY, &block)
25
22
  raise ArgumentError, 'RetryHandler requires a block' unless block
26
23
  raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1
27
24
 
@@ -59,11 +56,11 @@ module SkillBench
59
56
  private
60
57
 
61
58
  def retryable?(status, attempt)
62
- RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
59
+ Constants::HttpClient::RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
63
60
  end
64
61
 
65
62
  def compute_delay(attempt)
66
- [@base_delay * (2**(attempt - 1)), MAX_DELAY].min
63
+ [@base_delay * (2**(attempt - 1)), Constants::ReactAgent::DEFAULT_MAX_DELAY].min
67
64
  end
68
65
 
69
66
  def extract_status(error)
@@ -24,10 +24,15 @@ module SkillBench
24
24
 
25
25
  # Generates configuration hash for a specific provider.
26
26
  #
27
+ # The built-in `:mock` provider needs no credentials, so it produces a
28
+ # minimal offline config without a nested `config:` block.
29
+ #
27
30
  # @param provider [Symbol] LLM provider name
28
31
  # @return [Hash] Single-provider configuration
29
32
  # @raise [ArgumentError] if provider is not registered
30
33
  def self.config_for_provider(provider)
34
+ return { provider: :mock, max_execution_time: 30 } if provider == :mock
35
+
31
36
  {
32
37
  provider: provider,
33
38
  max_execution_time: 30,
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'fileutils'
4
- require_relative '../rails/skill_templates'
5
4
 
6
5
  module SkillBench
7
6
  module Commands
@@ -107,6 +106,9 @@ module SkillBench
107
106
  file_name = RAILS_TEMPLATES[template]
108
107
  raise ArgumentError, "Invalid template: #{template}. Use one of: #{RAILS_TEMPLATES.keys.join(', ')}." unless file_name
109
108
 
109
+ # Lazily load the scaffold generator so a normal `skill-bench run` does
110
+ # not pull it (and its dependencies) in at boot.
111
+ require_relative '../rails/skill_templates'
110
112
  content = Rails::SkillTemplates.public_send(template.to_sym, name)
111
113
  File.write(File.join(path, file_name), content)
112
114
  end
@@ -41,6 +41,8 @@ module SkillBench
41
41
  assign_current_provider
42
42
  @store.assign_max_execution_time(@data[:max_execution_time]) if @data.key?(:max_execution_time)
43
43
  @store.assign_allowed_commands(@data[:allowed_commands]) if @data.key?(:allowed_commands)
44
+ @store.assign_allow_host_execution(@data[:allow_host_execution]) if @data.key?(:allow_host_execution)
45
+ @store.assign_command_argument_constraints(@data[:command_argument_constraints]) if @data.key?(:command_argument_constraints)
44
46
  @store.skill_sources = @data[:skill_sources] if @data.key?(:skill_sources)
45
47
  end
46
48
 
@@ -19,6 +19,8 @@ module SkillBench
19
19
  current_llm_provider: :openai,
20
20
  max_execution_time: 30,
21
21
  allowed_commands: nil,
22
+ allow_host_execution: false,
23
+ command_argument_constraints: {},
22
24
  skill_sources: {},
23
25
  llm_providers_config: {
24
26
  openai: { api_key: nil, model: 'gpt-4o' },
@@ -25,6 +25,13 @@ module SkillBench
25
25
  store.allowed_commands
26
26
  end
27
27
 
28
+ # Returns whether un-isolated host command execution is permitted.
29
+ #
30
+ # @return [Boolean, nil] true when host execution is explicitly allowed
31
+ def allow_host_execution
32
+ store.allow_host_execution
33
+ end
34
+
28
35
  # Returns provider configuration.
29
36
  #
30
37
  # @return [Hash] provider configuration by provider name
@@ -102,6 +102,23 @@ module SkillBench
102
102
  store.assign_allowed_commands(value)
103
103
  end
104
104
 
105
+ # Sets whether un-isolated host command execution is permitted.
106
+ #
107
+ # @param value [Boolean] true to permit un-isolated host execution
108
+ # @return [Boolean] assigned host execution flag
109
+ def allow_host_execution=(value)
110
+ store.assign_allow_host_execution(value)
111
+ end
112
+
113
+ # Sets the optional per-command argument constraints.
114
+ #
115
+ # @param value [Hash, nil] base command => disallowed argument
116
+ # substrings/flags
117
+ # @return [Hash, nil] assigned constraints
118
+ def command_argument_constraints=(value)
119
+ store.assign_command_argument_constraints(value)
120
+ end
121
+
105
122
  # Replaces provider configuration.
106
123
  #
107
124
  # @param value [Hash] provider configuration
@@ -29,7 +29,7 @@ module SkillBench
29
29
  data = JSON.parse(File.read(@path), symbolize_names: true)
30
30
  return warn_invalid_config unless data.is_a?(Hash)
31
31
 
32
- success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :skill_sources).compact
32
+ success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :allow_host_execution, :command_argument_constraints, :skill_sources).compact
33
33
  success_data[:current_llm_provider] ||= data[:provider] if data.key?(:provider)
34
34
  success(success_data.merge(providers: normalized_providers(data[:providers])))
35
35
  rescue JSON::ParserError => e
@@ -19,6 +19,18 @@ module SkillBench
19
19
  # @return [Array<String>, nil] allowed commands
20
20
  attr_accessor :allowed_commands
21
21
 
22
+ # Returns whether running commands directly on the host is permitted
23
+ # when no real sandbox isolation (container) is active.
24
+ #
25
+ # @return [Boolean, nil] true when host execution is explicitly allowed
26
+ attr_reader :allow_host_execution
27
+
28
+ # Returns the optional per-command argument constraints.
29
+ #
30
+ # @return [Hash, nil] base command => disallowed argument
31
+ # substrings/flags, or nil when unconfigured
32
+ attr_reader :command_argument_constraints
33
+
22
34
  # Returns provider configuration.
23
35
  #
24
36
  # @return [Hash, nil] provider configuration by provider name
@@ -109,6 +121,23 @@ module SkillBench
109
121
  @allowed_commands = value
110
122
  end
111
123
 
124
+ # Sets whether host command execution is permitted without isolation.
125
+ #
126
+ # @param value [Boolean] true to permit un-isolated host execution
127
+ # @return [Boolean] assigned host execution flag
128
+ def assign_allow_host_execution(value)
129
+ @allow_host_execution = value
130
+ end
131
+
132
+ # Sets the optional per-command argument constraints.
133
+ #
134
+ # @param value [Hash, nil] base command => disallowed argument
135
+ # substrings/flags
136
+ # @return [Hash, nil] assigned constraints
137
+ def assign_command_argument_constraints(value)
138
+ @command_argument_constraints = value
139
+ end
140
+
112
141
  # Sets provider configuration.
113
142
  #
114
143
  # @param value [Hash] provider configuration
@@ -95,6 +95,24 @@ module SkillBench
95
95
  store.allowed_commands
96
96
  end
97
97
 
98
+ # Returns whether commands may run directly on the host when no sandbox
99
+ # isolation (container) is active. Defaults to false (fail closed).
100
+ #
101
+ # @return [Boolean] true when un-isolated host execution is explicitly enabled
102
+ def allow_host_execution
103
+ store.allow_host_execution || false
104
+ end
105
+
106
+ # Returns the optional per-command argument constraints.
107
+ #
108
+ # When unconfigured, returns an empty Hash meaning no argument constraints
109
+ # apply (the allowlist remains the only command-authorization control).
110
+ #
111
+ # @return [Hash] base command => disallowed argument substrings/flags
112
+ def command_argument_constraints
113
+ store.command_argument_constraints || {}
114
+ end
115
+
98
116
  # Returns max execution time from configuration.
99
117
  #
100
118
  # @return [Integer] Maximum execution time in seconds
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Shared, named configuration values for SkillBench.
  # Keeping them here replaces scattered magic numbers with a single source of
  # truth for tunable values.
  module Constants
    # Values for the ReAct agent.
    module ReactAgent
      DEFAULT_MAX_ITERATIONS = 25
      # Upper bound, in seconds, applied to retry delays.
      DEFAULT_MAX_DELAY = 30
    end

    # Values for the HTTP client.
    module HttpClient
      DEFAULT_OPEN_TIMEOUT = 10
      DEFAULT_TIMEOUT = 120
      DEFAULT_MAX_RETRIES = 3
      DEFAULT_RETRY_DELAY = 1
      RETRYABLE_STATUSES = [429, 503].freeze
    end

    # Values for context hydration.
    module ContextHydration
      # Largest single file size, in bytes.
      MAX_FILE_SIZE = 50_000
      # Largest combined context size, in bytes (1MB).
      MAX_TOTAL_CONTEXT_SIZE = 1_000_000
      TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
    end

    # Values for the sandbox.
    module Sandbox
      DOCKER_IMAGE_NAME = 'evaluator-sandbox'
    end

    # Values for tool execution.
    module Tools
      DANGEROUS_COMMANDS = [
        *%w[bash sh zsh fish dash ksh csh tcsh],
        *%w[python python3 python2 ruby perl node],
        *%w[php lua tcl wish],
        *%w[curl wget nc ncat socat],
        *%w[eval exec],
        *%w[sudo su doas],
        *%w[chmod chown mount umount],
        *%w[dd mkfs fdisk parted],
        *%w[insmod rmmod modprobe],
        *%w[systemctl service],
        *%w[passwd useradd userdel groupadd groupdel]
      ].freeze
    end

    # Values for file path handling.
    module FilePath
      ALLOWED_PATH_PATTERN = %r{\A[a-zA-Z0-9._\-/]+\z}
      MAX_PATH_LENGTH = 4096
    end
  end
end