ruby-skill-bench 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +299 -23
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +3 -0
- data/lib/skill_bench/clients/base_client.rb +14 -6
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +66 -15
- data/lib/skill_bench/execution/sandbox.rb +76 -14
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +172 -35
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +3 -3
- metadata +19 -36
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Clients
    # Validates a provider `base_url` before it is used to build an HTTP
    # connection that may carry an API key / bearer token.
    #
    # Security rationale: `base_url` is taken verbatim from config/env input and
    # the authenticated request attaches a credential to whatever host it names.
    # Left unchecked this is an SSRF surface, and an `http://` URL would transmit
    # the credential in cleartext. This service enforces:
    #
    # - the URL must be an absolute `http`/`https` URL with a host (empty/relative
    #   /garbage values are rejected);
    # - when a credential will be attached, non-loopback hosts MUST use `https`;
    #   loopback hosts (`localhost`, `127.0.0.1`, `::1`) MAY use `http` — the
    #   legitimate self-hosted/Ollama case — and an explicit opt-in
    #   (`allow_insecure_base_url`) can permit cleartext for non-loopback hosts.
    #
    # Host names are compared case-insensitively, so `http://LOCALHOST:11434`
    # is recognised as loopback just like `http://localhost:11434`.
    #
    # A blank (`nil`/empty) `base_url` is allowed so providers may supply their
    # own (https) default downstream. Error messages describe only the transport
    # and never include the credential.
    class BaseUrlValidator
      # Hosts permitted to use cleartext `http` even with a credential attached.
      # Entries are lowercase; candidates are downcased before comparison.
      LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1].freeze

      # Raised when a base URL is structurally invalid or would leak a credential
      # over cleartext transport. The message never contains the credential.
      class InvalidBaseURLError < StandardError; end

      # Validates a base URL and returns it unchanged when valid.
      #
      # @param base_url [String, nil] the URL to validate; blank values are
      #   returned as-is so a provider default can be applied later.
      # @param has_credential [Boolean] whether a credential (api key/bearer
      #   token) will be attached to requests sent to this URL.
      # @param allow_insecure [Boolean] explicit opt-in that permits cleartext
      #   `http` to a non-loopback host even when a credential is attached.
      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
      # @return [String, nil] the validated URL (blank input returned unchanged).
      def self.call(base_url:, has_credential: false, allow_insecure: false)
        new(base_url, has_credential, allow_insecure).call
      end

      # @param base_url [String, nil] the URL to validate.
      # @param has_credential [Boolean] whether a credential will be attached.
      # @param allow_insecure [Boolean] opt-in permitting cleartext non-loopback.
      def initialize(base_url, has_credential, allow_insecure)
        @base_url = base_url
        @has_credential = has_credential
        @allow_insecure = allow_insecure
      end

      # Runs the validation.
      #
      # @raise [InvalidBaseURLError] when the URL is invalid or insecure.
      # @return [String, nil] the validated URL.
      def call
        return @base_url if blank?(@base_url)

        validate_absolute_http_url!
        validate_secure_transport!
        @base_url
      end

      private

      # @param value [Object] any value; compared via its string form.
      # @return [Boolean] true for nil, empty, or whitespace-only values.
      def blank?(value)
        value.to_s.strip.empty?
      end

      # Parses the base URL once and caches the outcome, including a failed
      # parse (cached as nil), so an invalid URL is not re-parsed on every access.
      #
      # @return [URI::Generic, nil] the parsed URI, or nil when unparsable.
      def uri
        return @uri if defined?(@uri)

        @uri = begin
          URI.parse(@base_url.to_s)
        rescue URI::InvalidURIError
          nil
        end
      end

      # Requires an http/https URI (URI::HTTPS is a subclass of URI::HTTP)
      # that names a non-blank host.
      #
      # @raise [InvalidBaseURLError] when the URL is not an absolute http(s) URL.
      # @return [void]
      def validate_absolute_http_url!
        return if uri.is_a?(URI::HTTP) && !blank?(uri.hostname)

        raise InvalidBaseURLError,
              "Invalid provider base_url #{@base_url.inspect}: " \
              'must be an absolute http(s) URL with a host.'
      end

      # Refuses cleartext http when a credential would be sent to a
      # non-loopback host, unless the caller explicitly opted in.
      #
      # @raise [InvalidBaseURLError] when the credential would travel in cleartext.
      # @return [void]
      def validate_secure_transport!
        return unless @has_credential
        return if uri.scheme == 'https'
        return if loopback?
        return if @allow_insecure

        raise InvalidBaseURLError,
              'Insecure provider base_url: refusing to send a credential over cleartext http ' \
              "to non-loopback host #{uri.hostname.inspect}. Use https, target a loopback host, " \
              'or set allow_insecure_base_url: true to override.'
      end

      # URI.parse keeps the host's original letter case, so the host is
      # downcased before the lookup; `hostname` already strips the brackets
      # around IPv6 literals such as `[::1]`.
      #
      # @return [Boolean] true when the URL targets a known loopback host.
      def loopback?
        LOOPBACK_HOSTS.include?(uri.hostname.to_s.downcase)
      end
    end
  end
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative '../config'
|
|
4
|
+
require_relative 'base_url_validator'
|
|
4
5
|
|
|
5
6
|
module SkillBench
|
|
6
7
|
module Clients
|
|
@@ -13,6 +14,8 @@ module SkillBench
|
|
|
13
14
|
new(provider, options).call
|
|
14
15
|
end
|
|
15
16
|
|
|
17
|
+
# @param provider [Symbol, String] provider identifier, coerced to a Symbol (e.g., :openai, :ollama)
|
|
18
|
+
# @param options [Hash] override options that take precedence over the loaded provider config
|
|
16
19
|
def initialize(provider, options)
|
|
17
20
|
@provider = provider.to_sym
|
|
18
21
|
@options = options
|
|
@@ -21,8 +24,21 @@ module SkillBench
|
|
|
21
24
|
|
|
22
25
|
# Loads and returns standardized provider configuration.
|
|
23
26
|
#
|
|
27
|
+
# The resolved transport URLs (`base_url` and, for Azure, `endpoint`) are
|
|
28
|
+
# validated before being returned: they must be absolute http(s) URLs, and
|
|
29
|
+
# a credential is never sent over cleartext http to a non-loopback host.
|
|
30
|
+
#
|
|
31
|
+
# @raise [BaseUrlValidator::InvalidBaseURLError] when a transport URL is
|
|
32
|
+
# structurally invalid or would leak the credential over cleartext http.
|
|
24
33
|
# @return [Hash] Standardized configuration with api_key, model, base_url, etc.
|
|
25
34
|
def call
|
|
35
|
+
validate_transport_urls!
|
|
36
|
+
standardized_config
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
def standardized_config
|
|
26
42
|
{
|
|
27
43
|
api_key: fetch_config(:api_key),
|
|
28
44
|
model: fetch_config(:model),
|
|
@@ -37,7 +53,24 @@ module SkillBench
|
|
|
37
53
|
}
|
|
38
54
|
end
|
|
39
55
|
|
|
40
|
-
|
|
56
|
+
# Validates every transport URL that could carry the credential. Both
|
|
57
|
+
# `base_url` and Azure's `endpoint` are user-supplied URLs that the
|
|
58
|
+
# authenticated request targets, so both are checked with one helper.
|
|
59
|
+
#
|
|
60
|
+
# @raise [BaseUrlValidator::InvalidBaseURLError] on an invalid/insecure URL.
|
|
61
|
+
# @return [void]
|
|
62
|
+
def validate_transport_urls!
|
|
63
|
+
has_credential = !fetch_config(:api_key).to_s.empty?
|
|
64
|
+
allow_insecure = truthy?(fetch_config(:allow_insecure_base_url))
|
|
65
|
+
|
|
66
|
+
[fetch_config(:base_url), fetch_config(:endpoint)].each do |url|
|
|
67
|
+
BaseUrlValidator.call(base_url: url, has_credential: has_credential, allow_insecure: allow_insecure)
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def truthy?(value)
|
|
72
|
+
value == true || value.to_s.strip.casecmp?('true')
|
|
73
|
+
end
|
|
41
74
|
|
|
42
75
|
def fetch_config(key)
|
|
43
76
|
@options[key] || @config[key]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../base_client'
|
|
4
|
+
require_relative '../provider_registry'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Clients
    module Providers
      # Mistral (la Plateforme) LLM client.
      # Uses Mistral's OpenAI-compatible chat completions API with bearer-token auth.
      #
      # NOTE: AWS Bedrock access to Mistral models (which requires SigV4 request
      # signing rather than a static bearer token) is intentionally not handled
      # here and is left as a follow-up.
      class Mistral < BaseClient
        SkillBench::Clients::ProviderRegistry.register(:mistral, self)

        # Returns the provider identifier.
        #
        # @return [Symbol]
        def provider_name
          :mistral
        end

        protected

        # Returns the base URL for the Mistral API.
        #
        # The Mistral API base is https://api.mistral.ai/v1; the version segment
        # lives in {#request_path} so Faraday does not drop it (an absolute
        # request path replaces any path component of the connection base URL).
        #
        # A blank (nil, empty, or whitespace-only) configured value is treated
        # as "not configured" and falls back to the default. A plain `||` would
        # let an empty string through, because "" is truthy in Ruby.
        #
        # @return [String]
        def base_url
          @base_url_config.to_s.strip.empty? ? 'https://api.mistral.ai' : @base_url_config
        end

        # Returns the request path for chat completions.
        #
        # A blank configured value falls back to the default path, for the same
        # reason as in {#base_url}.
        #
        # @return [String]
        def request_path
          @request_path_config.to_s.strip.empty? ? '/v1/chat/completions' : @request_path_config
        end
      end
    end
  end
end
|
|
@@ -1,22 +1,20 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
|
+
require_relative '../constants'
|
|
4
5
|
|
|
5
6
|
module SkillBench
|
|
6
7
|
module Clients
|
|
7
8
|
# Builds and executes HTTP requests to LLM provider APIs.
|
|
8
9
|
# Encapsulates Faraday connection setup and request execution.
|
|
9
10
|
class RequestBuilder
|
|
10
|
-
DEFAULT_OPEN_TIMEOUT = 10
|
|
11
|
-
DEFAULT_TIMEOUT = 120
|
|
12
|
-
|
|
13
11
|
# Creates a Faraday connection with JSON middleware.
|
|
14
12
|
#
|
|
15
13
|
# @param base_url [String] The API base URL
|
|
16
14
|
# @param open_timeout [Integer] Connection open timeout in seconds
|
|
17
15
|
# @param timeout [Integer] Request timeout in seconds
|
|
18
16
|
# @return [Faraday::Connection] Configured Faraday connection
|
|
19
|
-
def self.build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
|
|
17
|
+
def self.build_connection(base_url, open_timeout: Constants::HttpClient::DEFAULT_OPEN_TIMEOUT, timeout: Constants::HttpClient::DEFAULT_TIMEOUT)
|
|
20
18
|
Faraday.new(url: base_url) do |f|
|
|
21
19
|
f.request :json
|
|
22
20
|
f.response :json
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  module Clients
    # Service object for building standardized response hashes.
    # Eliminates duplication of error response formatting across the codebase.
    #
    # Every error builder returns the keys :success, :response, :result and
    # :status; `api_error` additionally carries :usage.
    class ResponseBuilder
      # Builds a standardized error response.
      #
      # @param message [String] The error message.
      # @param status [String] The status identifier (default: 'error').
      # @return [Hash] Standardized error response hash.
      def self.error(message:, status: 'error')
        {
          success: false,
          response: { error: { message: message } },
          result: message,
          status: status
        }
      end

      # Builds a standardized success response.
      #
      # @param content [String] The response content.
      # @param metadata [Hash, nil] Additional metadata merged into the nested
      #   :response hash (a :content key in metadata overrides the content
      #   there). nil is treated as "no metadata".
      # @return [Hash] Standardized success response hash.
      def self.success(content:, metadata: {})
        {
          success: true,
          result: content,
          response: { content: content }.merge(metadata || {}),
          status: 'success'
        }
      end

      # Builds a standardized API error response.
      #
      # @param error_message [String] The API error message.
      # @param usage [Hash] Token usage information.
      # @return [Hash] Standardized API error response hash.
      def self.api_error(error_message:, usage: {})
        message = "API Error: #{error_message}"
        {
          success: false,
          result: message,
          usage: usage,
          response: { error: { message: message } },
          status: 'error'
        }
      end

      # Builds a standardized network error response.
      #
      # @param error_message [String] The network error message.
      # @return [Hash] Standardized network error response hash.
      def self.network_error(error_message:)
        error(message: "Network Error: #{error_message}")
      end

      # Builds a standardized parsing error response.
      #
      # @param error_message [String] The parsing error message.
      # @return [Hash] Standardized parsing error response hash.
      def self.parsing_error(error_message:)
        error(message: "Parsing Error: #{error_message}")
      end

      # Builds a standardized unexpected error response.
      #
      # @param error_message [String] The unexpected error message.
      # @return [Hash] Standardized unexpected error response hash.
      def self.unexpected_error(error_message:)
        error(message: "Unexpected Error: #{error_message}")
      end
    end
  end
end
|
|
@@ -23,14 +23,8 @@ module SkillBench
|
|
|
23
23
|
error_msg += " - #{detail}"
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
result: error_msg,
|
|
29
|
-
usage: usage_extractor.call(parsed),
|
|
30
|
-
response: { error: { message: error_msg } },
|
|
31
|
-
status: 'error',
|
|
32
|
-
code: response.status
|
|
33
|
-
}
|
|
26
|
+
base_response = ResponseBuilder.api_error(error_message: error_msg, usage: usage_extractor.call(parsed))
|
|
27
|
+
base_response.merge(code: response.status)
|
|
34
28
|
end
|
|
35
29
|
|
|
36
30
|
# Creates an error response when the LLM response has no message content.
|
|
@@ -41,14 +35,8 @@ module SkillBench
|
|
|
41
35
|
# @return [Hash] Standardized error response
|
|
42
36
|
def self.missing_message_response(response, parsed, &usage_extractor)
|
|
43
37
|
error_msg = 'LLM response missing message content'
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
result: error_msg,
|
|
47
|
-
usage: usage_extractor.call(parsed),
|
|
48
|
-
response: { error: { message: error_msg } },
|
|
49
|
-
status: 'error',
|
|
50
|
-
code: response.status
|
|
51
|
-
}
|
|
38
|
+
base_response = ResponseBuilder.error(message: error_msg)
|
|
39
|
+
base_response.merge(usage: usage_extractor.call(parsed), code: response.status)
|
|
52
40
|
end
|
|
53
41
|
|
|
54
42
|
# Handles an exception by logging and returning a standardized error response.
|
|
@@ -58,7 +46,7 @@ module SkillBench
|
|
|
58
46
|
# @return [Hash] Standardized error response
|
|
59
47
|
def self.handle_exception(error, type)
|
|
60
48
|
log_error(error)
|
|
61
|
-
|
|
49
|
+
ResponseBuilder.error(message: "#{type}: #{error.message}")
|
|
62
50
|
end
|
|
63
51
|
|
|
64
52
|
# Logs an error message and backtrace to Rails.logger or stderr.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
4
|
require_relative '../error_logger'
|
|
5
|
+
require_relative '../constants'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
7
8
|
module Clients
|
|
@@ -9,10 +10,6 @@ module SkillBench
|
|
|
9
10
|
# Retries on transient errors (429, 503). Raises permanent errors immediately.
|
|
10
11
|
# Returns the block result on success.
|
|
11
12
|
class RetryHandler
|
|
12
|
-
RETRYABLE_STATUSES = [429, 503].freeze
|
|
13
|
-
|
|
14
|
-
MAX_DELAY = 30 # Maximum delay cap in seconds
|
|
15
|
-
|
|
16
13
|
# Executes the given block with retry logic.
|
|
17
14
|
#
|
|
18
15
|
# @param max_attempts [Integer] Maximum number of attempts (default: 3).
|
|
@@ -21,7 +18,7 @@ module SkillBench
|
|
|
21
18
|
# @return [Object] The block's return value on success.
|
|
22
19
|
# @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
|
|
23
20
|
# @raise [ArgumentError] if no block is given or max_attempts < 1.
|
|
24
|
-
def self.call(max_attempts:
|
|
21
|
+
def self.call(max_attempts: Constants::HttpClient::DEFAULT_MAX_RETRIES, base_delay: Constants::HttpClient::DEFAULT_RETRY_DELAY, &block)
|
|
25
22
|
raise ArgumentError, 'RetryHandler requires a block' unless block
|
|
26
23
|
raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1
|
|
27
24
|
|
|
@@ -59,11 +56,11 @@ module SkillBench
|
|
|
59
56
|
private
|
|
60
57
|
|
|
61
58
|
def retryable?(status, attempt)
|
|
62
|
-
RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
|
|
59
|
+
Constants::HttpClient::RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
|
|
63
60
|
end
|
|
64
61
|
|
|
65
62
|
def compute_delay(attempt)
|
|
66
|
-
[@base_delay * (2**(attempt - 1)),
|
|
63
|
+
[@base_delay * (2**(attempt - 1)), Constants::ReactAgent::DEFAULT_MAX_DELAY].min
|
|
67
64
|
end
|
|
68
65
|
|
|
69
66
|
def extract_status(error)
|
|
@@ -24,10 +24,15 @@ module SkillBench
|
|
|
24
24
|
|
|
25
25
|
# Generates configuration hash for a specific provider.
|
|
26
26
|
#
|
|
27
|
+
# The built-in `:mock` provider needs no credentials, so it produces a
|
|
28
|
+
# minimal offline config without a nested `config:` block.
|
|
29
|
+
#
|
|
27
30
|
# @param provider [Symbol] LLM provider name
|
|
28
31
|
# @return [Hash] Single-provider configuration
|
|
29
32
|
# @raise [ArgumentError] if provider is not registered
|
|
30
33
|
def self.config_for_provider(provider)
|
|
34
|
+
return { provider: :mock, max_execution_time: 30 } if provider == :mock
|
|
35
|
+
|
|
31
36
|
{
|
|
32
37
|
provider: provider,
|
|
33
38
|
max_execution_time: 30,
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'fileutils'
|
|
4
|
-
require_relative '../rails/skill_templates'
|
|
5
4
|
|
|
6
5
|
module SkillBench
|
|
7
6
|
module Commands
|
|
@@ -107,6 +106,9 @@ module SkillBench
|
|
|
107
106
|
file_name = RAILS_TEMPLATES[template]
|
|
108
107
|
raise ArgumentError, "Invalid template: #{template}. Use one of: #{RAILS_TEMPLATES.keys.join(', ')}." unless file_name
|
|
109
108
|
|
|
109
|
+
# Lazily load the scaffold generator so a normal `skill-bench run` does
|
|
110
|
+
# not pull it (and its dependencies) in at boot.
|
|
111
|
+
require_relative '../rails/skill_templates'
|
|
110
112
|
content = Rails::SkillTemplates.public_send(template.to_sym, name)
|
|
111
113
|
File.write(File.join(path, file_name), content)
|
|
112
114
|
end
|
|
@@ -41,6 +41,8 @@ module SkillBench
|
|
|
41
41
|
assign_current_provider
|
|
42
42
|
@store.assign_max_execution_time(@data[:max_execution_time]) if @data.key?(:max_execution_time)
|
|
43
43
|
@store.assign_allowed_commands(@data[:allowed_commands]) if @data.key?(:allowed_commands)
|
|
44
|
+
@store.assign_allow_host_execution(@data[:allow_host_execution]) if @data.key?(:allow_host_execution)
|
|
45
|
+
@store.assign_command_argument_constraints(@data[:command_argument_constraints]) if @data.key?(:command_argument_constraints)
|
|
44
46
|
@store.skill_sources = @data[:skill_sources] if @data.key?(:skill_sources)
|
|
45
47
|
end
|
|
46
48
|
|
|
@@ -19,6 +19,8 @@ module SkillBench
|
|
|
19
19
|
current_llm_provider: :openai,
|
|
20
20
|
max_execution_time: 30,
|
|
21
21
|
allowed_commands: nil,
|
|
22
|
+
allow_host_execution: false,
|
|
23
|
+
command_argument_constraints: {},
|
|
22
24
|
skill_sources: {},
|
|
23
25
|
llm_providers_config: {
|
|
24
26
|
openai: { api_key: nil, model: 'gpt-4o' },
|
|
@@ -25,6 +25,13 @@ module SkillBench
|
|
|
25
25
|
store.allowed_commands
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# Returns whether un-isolated host command execution is permitted.
|
|
29
|
+
#
|
|
30
|
+
# @return [Boolean, nil] true when host execution is explicitly allowed
|
|
31
|
+
def allow_host_execution
|
|
32
|
+
store.allow_host_execution
|
|
33
|
+
end
|
|
34
|
+
|
|
28
35
|
# Returns provider configuration.
|
|
29
36
|
#
|
|
30
37
|
# @return [Hash] provider configuration by provider name
|
|
@@ -102,6 +102,23 @@ module SkillBench
|
|
|
102
102
|
store.assign_allowed_commands(value)
|
|
103
103
|
end
|
|
104
104
|
|
|
105
|
+
# Sets whether un-isolated host command execution is permitted.
|
|
106
|
+
#
|
|
107
|
+
# @param value [Boolean] true to permit un-isolated host execution
|
|
108
|
+
# @return [Boolean] assigned host execution flag
|
|
109
|
+
def allow_host_execution=(value)
|
|
110
|
+
store.assign_allow_host_execution(value)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Sets the optional per-command argument constraints.
|
|
114
|
+
#
|
|
115
|
+
# @param value [Hash, nil] base command => disallowed argument
|
|
116
|
+
# substrings/flags
|
|
117
|
+
# @return [Hash, nil] assigned constraints
|
|
118
|
+
def command_argument_constraints=(value)
|
|
119
|
+
store.assign_command_argument_constraints(value)
|
|
120
|
+
end
|
|
121
|
+
|
|
105
122
|
# Replaces provider configuration.
|
|
106
123
|
#
|
|
107
124
|
# @param value [Hash] provider configuration
|
|
@@ -29,7 +29,7 @@ module SkillBench
|
|
|
29
29
|
data = JSON.parse(File.read(@path), symbolize_names: true)
|
|
30
30
|
return warn_invalid_config unless data.is_a?(Hash)
|
|
31
31
|
|
|
32
|
-
success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :skill_sources).compact
|
|
32
|
+
success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :allow_host_execution, :command_argument_constraints, :skill_sources).compact
|
|
33
33
|
success_data[:current_llm_provider] ||= data[:provider] if data.key?(:provider)
|
|
34
34
|
success(success_data.merge(providers: normalized_providers(data[:providers])))
|
|
35
35
|
rescue JSON::ParserError => e
|
|
@@ -19,6 +19,18 @@ module SkillBench
|
|
|
19
19
|
# @return [Array<String>, nil] allowed commands
|
|
20
20
|
attr_accessor :allowed_commands
|
|
21
21
|
|
|
22
|
+
# Returns whether running commands directly on the host is permitted
|
|
23
|
+
# when no real sandbox isolation (container) is active.
|
|
24
|
+
#
|
|
25
|
+
# @return [Boolean, nil] true when host execution is explicitly allowed
|
|
26
|
+
attr_reader :allow_host_execution
|
|
27
|
+
|
|
28
|
+
# Returns the optional per-command argument constraints.
|
|
29
|
+
#
|
|
30
|
+
# @return [Hash, nil] base command => disallowed argument
|
|
31
|
+
# substrings/flags, or nil when unconfigured
|
|
32
|
+
attr_reader :command_argument_constraints
|
|
33
|
+
|
|
22
34
|
# Returns provider configuration.
|
|
23
35
|
#
|
|
24
36
|
# @return [Hash, nil] provider configuration by provider name
|
|
@@ -109,6 +121,23 @@ module SkillBench
|
|
|
109
121
|
@allowed_commands = value
|
|
110
122
|
end
|
|
111
123
|
|
|
124
|
+
# Sets whether host command execution is permitted without isolation.
|
|
125
|
+
#
|
|
126
|
+
# @param value [Boolean] true to permit un-isolated host execution
|
|
127
|
+
# @return [Boolean] assigned host execution flag
|
|
128
|
+
def assign_allow_host_execution(value)
|
|
129
|
+
@allow_host_execution = value
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
# Sets the optional per-command argument constraints.
|
|
133
|
+
#
|
|
134
|
+
# @param value [Hash, nil] base command => disallowed argument
|
|
135
|
+
# substrings/flags
|
|
136
|
+
# @return [Hash, nil] assigned constraints
|
|
137
|
+
def assign_command_argument_constraints(value)
|
|
138
|
+
@command_argument_constraints = value
|
|
139
|
+
end
|
|
140
|
+
|
|
112
141
|
# Sets provider configuration.
|
|
113
142
|
#
|
|
114
143
|
# @param value [Hash] provider configuration
|
data/lib/skill_bench/config.rb
CHANGED
|
@@ -95,6 +95,24 @@ module SkillBench
|
|
|
95
95
|
store.allowed_commands
|
|
96
96
|
end
|
|
97
97
|
|
|
98
|
+
# Returns whether commands may run directly on the host when no sandbox
# isolation (container) is active. Defaults to false (fail closed).
#
# Only an explicit `true` (or the string "true", case-insensitive, with
# surrounding whitespace ignored) enables host execution. Any other stored
# value — nil, false, "false", 0, ... — yields false, so a mistyped config
# value cannot silently switch off the safety default.
#
# @return [Boolean] true when un-isolated host execution is explicitly enabled
def allow_host_execution
  flag = store.allow_host_execution
  flag == true || flag.to_s.strip.casecmp?('true')
end
|
|
105
|
+
|
|
106
|
+
# Returns the optional per-command argument constraints.
|
|
107
|
+
#
|
|
108
|
+
# When unconfigured, returns an empty Hash meaning no argument constraints
|
|
109
|
+
# apply (the allowlist remains the only command-authorization control).
|
|
110
|
+
#
|
|
111
|
+
# @return [Hash] base command => disallowed argument substrings/flags
|
|
112
|
+
def command_argument_constraints
|
|
113
|
+
store.command_argument_constraints || {}
|
|
114
|
+
end
|
|
115
|
+
|
|
98
116
|
# Returns max execution time from configuration.
|
|
99
117
|
#
|
|
100
118
|
# @return [Integer] Maximum execution time in seconds
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  # Centralized configuration constants for the SkillBench system.
  # This eliminates magic numbers and provides a single source of truth
  # for configurable values across the codebase.
  module Constants
    # ReAct Agent Configuration
    module ReactAgent
      DEFAULT_MAX_ITERATIONS = 25
      DEFAULT_MAX_DELAY = 30 # Maximum delay cap in seconds for retry logic
    end

    # HTTP Client Configuration
    module HttpClient
      DEFAULT_OPEN_TIMEOUT = 10 # Connection open timeout in seconds
      DEFAULT_TIMEOUT = 120 # Request timeout in seconds
      DEFAULT_MAX_RETRIES = 3 # Default number of attempts for retried requests
      DEFAULT_RETRY_DELAY = 1 # Base delay between retries (presumably seconds — confirm)
      # Cap for the retry backoff delay, exposed under the HTTP namespace where
      # the retry logic lives. Aliases the existing ReactAgent value so there
      # is still exactly one place that defines the number.
      MAX_RETRY_DELAY = ReactAgent::DEFAULT_MAX_DELAY
      RETRYABLE_STATUSES = [429, 503].freeze
    end

    # Context Hydration Configuration
    module ContextHydration
      MAX_FILE_SIZE = 50_000 # Maximum file size in bytes
      MAX_TOTAL_CONTEXT_SIZE = 1_000_000 # Maximum total context size in bytes (1MB)
      TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
    end

    # Sandbox Configuration
    module Sandbox
      DOCKER_IMAGE_NAME = 'evaluator-sandbox'
    end

    # Tool Execution Configuration
    module Tools
      # Base command names considered dangerous: shells, language interpreters,
      # network clients, privilege escalation, permission/mount changes, disk
      # tools, kernel modules, service managers and account management.
      DANGEROUS_COMMANDS = %w[
        bash sh zsh fish dash ksh csh tcsh
        python python3 python2 ruby perl node
        php lua tcl wish
        curl wget nc ncat socat
        eval exec
        sudo su doas
        chmod chown mount umount
        dd mkfs fdisk parted
        insmod rmmod modprobe
        systemctl service
        passwd useradd userdel groupadd groupdel
      ].freeze
    end

    # File Path Configuration
    module FilePath
      # Whole-string match (\A ... \z) of ASCII letters, digits, '.', '_', '-'
      # and '/'.
      # NOTE(review): this pattern alone still accepts '..' segments and
      # absolute paths — confirm that callers reject traversal separately.
      ALLOWED_PATH_PATTERN = %r{\A[a-zA-Z0-9._\-/]+\z}
      MAX_PATH_LENGTH = 4096
    end
  end
end
|