ruby-skill-bench 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +231 -0
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +2 -5
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/context_hydrator.rb +16 -6
- data/lib/skill_bench/execution/sandbox.rb +18 -3
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/tools/run_command.rb +2 -17
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +1 -0
- metadata +25 -2
|
@@ -23,14 +23,8 @@ module SkillBench
|
|
|
23
23
|
error_msg += " - #{detail}"
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
result: error_msg,
|
|
29
|
-
usage: usage_extractor.call(parsed),
|
|
30
|
-
response: { error: { message: error_msg } },
|
|
31
|
-
status: 'error',
|
|
32
|
-
code: response.status
|
|
33
|
-
}
|
|
26
|
+
base_response = ResponseBuilder.api_error(error_message: error_msg, usage: usage_extractor.call(parsed))
|
|
27
|
+
base_response.merge(code: response.status)
|
|
34
28
|
end
|
|
35
29
|
|
|
36
30
|
# Creates an error response when the LLM response has no message content.
|
|
@@ -41,14 +35,8 @@ module SkillBench
|
|
|
41
35
|
# @return [Hash] Standardized error response
|
|
42
36
|
def self.missing_message_response(response, parsed, &usage_extractor)
|
|
43
37
|
error_msg = 'LLM response missing message content'
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
result: error_msg,
|
|
47
|
-
usage: usage_extractor.call(parsed),
|
|
48
|
-
response: { error: { message: error_msg } },
|
|
49
|
-
status: 'error',
|
|
50
|
-
code: response.status
|
|
51
|
-
}
|
|
38
|
+
base_response = ResponseBuilder.error(message: error_msg)
|
|
39
|
+
base_response.merge(usage: usage_extractor.call(parsed), code: response.status)
|
|
52
40
|
end
|
|
53
41
|
|
|
54
42
|
# Handles an exception by logging and returning a standardized error response.
|
|
@@ -58,7 +46,7 @@ module SkillBench
|
|
|
58
46
|
# @return [Hash] Standardized error response
|
|
59
47
|
def self.handle_exception(error, type)
|
|
60
48
|
log_error(error)
|
|
61
|
-
|
|
49
|
+
ResponseBuilder.error(message: "#{type}: #{error.message}")
|
|
62
50
|
end
|
|
63
51
|
|
|
64
52
|
# Logs an error message and backtrace to Rails.logger or stderr.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
4
|
require_relative '../error_logger'
|
|
5
|
+
require_relative '../constants'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
7
8
|
module Clients
|
|
@@ -9,10 +10,6 @@ module SkillBench
|
|
|
9
10
|
# Retries on transient errors (429, 503). Raises permanent errors immediately.
|
|
10
11
|
# Returns the block result on success.
|
|
11
12
|
class RetryHandler
|
|
12
|
-
RETRYABLE_STATUSES = [429, 503].freeze
|
|
13
|
-
|
|
14
|
-
MAX_DELAY = 30 # Maximum delay cap in seconds
|
|
15
|
-
|
|
16
13
|
# Executes the given block with retry logic.
|
|
17
14
|
#
|
|
18
15
|
# @param max_attempts [Integer] Maximum number of attempts (default: 3).
|
|
@@ -21,7 +18,7 @@ module SkillBench
|
|
|
21
18
|
# @return [Object] The block's return value on success.
|
|
22
19
|
# @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
|
|
23
20
|
# @raise [ArgumentError] if no block is given or max_attempts < 1.
|
|
24
|
-
def self.call(max_attempts:
|
|
21
|
+
def self.call(max_attempts: Constants::HttpClient::DEFAULT_MAX_RETRIES, base_delay: Constants::HttpClient::DEFAULT_RETRY_DELAY, &block)
|
|
25
22
|
raise ArgumentError, 'RetryHandler requires a block' unless block
|
|
26
23
|
raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1
|
|
27
24
|
|
|
@@ -59,11 +56,11 @@ module SkillBench
|
|
|
59
56
|
private
|
|
60
57
|
|
|
61
58
|
def retryable?(status, attempt)
|
|
62
|
-
RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
|
|
59
|
+
Constants::HttpClient::RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
|
|
63
60
|
end
|
|
64
61
|
|
|
65
62
|
def compute_delay(attempt)
|
|
66
|
-
[@base_delay * (2**(attempt - 1)),
|
|
63
|
+
[@base_delay * (2**(attempt - 1)), Constants::ReactAgent::DEFAULT_MAX_DELAY].min
|
|
67
64
|
end
|
|
68
65
|
|
|
69
66
|
def extract_status(error)
|
|
@@ -9,11 +9,15 @@ module SkillBench
|
|
|
9
9
|
# Run an eval with specified skill(s)
|
|
10
10
|
# @param eval_name [String] Name of eval to run (e.g., 'test-eval' or 'evals/test-eval')
|
|
11
11
|
# @param skill_names [Array<String>] Names of skills to use
|
|
12
|
+
# @param pack [String, nil] Optional pack name for registry-based skill resolution
|
|
13
|
+
# @param registry_manifest [String, nil] Optional path to registry.json manifest
|
|
12
14
|
# @return [Hash] Result with pass/fail and score
|
|
13
|
-
def self.run(eval_name:, skill_names:)
|
|
15
|
+
def self.run(eval_name:, skill_names:, pack: nil, registry_manifest: nil)
|
|
14
16
|
Services::RunnerService.call(
|
|
15
17
|
eval_name: eval_name,
|
|
16
|
-
skill_names: skill_names
|
|
18
|
+
skill_names: skill_names,
|
|
19
|
+
pack: pack,
|
|
20
|
+
registry_manifest: registry_manifest
|
|
17
21
|
)
|
|
18
22
|
end
|
|
19
23
|
end
|
|
@@ -41,6 +41,7 @@ module SkillBench
|
|
|
41
41
|
assign_current_provider
|
|
42
42
|
@store.assign_max_execution_time(@data[:max_execution_time]) if @data.key?(:max_execution_time)
|
|
43
43
|
@store.assign_allowed_commands(@data[:allowed_commands]) if @data.key?(:allowed_commands)
|
|
44
|
+
@store.skill_sources = @data[:skill_sources] if @data.key?(:skill_sources)
|
|
44
45
|
end
|
|
45
46
|
|
|
46
47
|
def apply_provider_values
|
|
@@ -19,6 +19,7 @@ module SkillBench
|
|
|
19
19
|
current_llm_provider: :openai,
|
|
20
20
|
max_execution_time: 30,
|
|
21
21
|
allowed_commands: nil,
|
|
22
|
+
skill_sources: {},
|
|
22
23
|
llm_providers_config: {
|
|
23
24
|
openai: { api_key: nil, model: 'gpt-4o' },
|
|
24
25
|
anthropic: { api_key: nil, model: 'claude-sonnet-4-20250514' },
|
|
@@ -32,6 +32,13 @@ module SkillBench
|
|
|
32
32
|
store.llm_providers_config
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
+
# Returns skill sources mapping.
|
|
36
|
+
#
|
|
37
|
+
# @return [Hash, nil] skill source name → directory path
|
|
38
|
+
def skill_sources
|
|
39
|
+
store.skill_sources
|
|
40
|
+
end
|
|
41
|
+
|
|
35
42
|
# Returns the API key for the current LLM provider.
|
|
36
43
|
#
|
|
37
44
|
# @return [String, nil] API key for the current provider
|
|
@@ -29,9 +29,9 @@ module SkillBench
|
|
|
29
29
|
data = JSON.parse(File.read(@path), symbolize_names: true)
|
|
30
30
|
return warn_invalid_config unless data.is_a?(Hash)
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
success_data = data.slice(:current_llm_provider, :max_execution_time, :allowed_commands, :skill_sources).compact
|
|
33
|
+
success_data[:current_llm_provider] ||= data[:provider] if data.key?(:provider)
|
|
34
|
+
success(success_data.merge(providers: normalized_providers(data[:providers])))
|
|
35
35
|
rescue JSON::ParserError => e
|
|
36
36
|
log_parse_error(e)
|
|
37
37
|
failure('Failed to parse config file')
|
|
@@ -24,6 +24,11 @@ module SkillBench
|
|
|
24
24
|
# @return [Hash, nil] provider configuration by provider name
|
|
25
25
|
attr_accessor :llm_providers_config
|
|
26
26
|
|
|
27
|
+
# Returns skill sources mapping.
|
|
28
|
+
#
|
|
29
|
+
# @return [Hash, nil] skill source name → directory path
|
|
30
|
+
attr_accessor :skill_sources
|
|
31
|
+
|
|
27
32
|
# Initializes a new configuration store with empty provider settings.
|
|
28
33
|
def initialize
|
|
29
34
|
@llm_providers_config = {}
|
data/lib/skill_bench/config.rb
CHANGED
|
@@ -74,7 +74,9 @@ module SkillBench
|
|
|
74
74
|
@store = Config::Store.new
|
|
75
75
|
apply_defaults
|
|
76
76
|
apply_json_config(home_config_path)
|
|
77
|
-
|
|
77
|
+
local_path = Pathname.new(Dir.pwd).join(CONFIG_FILENAME)
|
|
78
|
+
is_workspace_file = File.exist?(File.join(Dir.pwd, 'ruby-skill-bench.gemspec'))
|
|
79
|
+
apply_json_config(local_path) unless defined?(Minitest) && is_workspace_file
|
|
78
80
|
apply_env_overrides
|
|
79
81
|
end
|
|
80
82
|
|
|
@@ -122,6 +124,13 @@ module SkillBench
|
|
|
122
124
|
store.llm_providers_config || {}
|
|
123
125
|
end
|
|
124
126
|
|
|
127
|
+
# Returns skill sources mapping.
|
|
128
|
+
#
|
|
129
|
+
# @return [Hash] skill source name → directory path (empty hash when none are configured)
|
|
130
|
+
def skill_sources
|
|
131
|
+
store.skill_sources || {}
|
|
132
|
+
end
|
|
133
|
+
|
|
125
134
|
# Returns API key from configuration.
|
|
126
135
|
#
|
|
127
136
|
# @return [String, nil] API key
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
# Centralized configuration constants for the SkillBench system.
|
|
5
|
+
# This eliminates magic numbers and provides a single source of truth
|
|
6
|
+
# for configurable values across the codebase.
|
|
7
|
+
module Constants
|
|
8
|
+
# ReAct Agent Configuration
|
|
9
|
+
module ReactAgent
|
|
10
|
+
DEFAULT_MAX_ITERATIONS = 25
|
|
11
|
+
DEFAULT_MAX_DELAY = 30 # Maximum delay cap in seconds for retry logic
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# HTTP Client Configuration
|
|
15
|
+
module HttpClient
|
|
16
|
+
DEFAULT_OPEN_TIMEOUT = 10
|
|
17
|
+
DEFAULT_TIMEOUT = 120
|
|
18
|
+
DEFAULT_MAX_RETRIES = 3
|
|
19
|
+
DEFAULT_RETRY_DELAY = 1
|
|
20
|
+
RETRYABLE_STATUSES = [429, 503].freeze
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Context Hydration Configuration
|
|
24
|
+
module ContextHydration
|
|
25
|
+
MAX_FILE_SIZE = 50_000 # Maximum file size in bytes
|
|
26
|
+
MAX_TOTAL_CONTEXT_SIZE = 1_000_000 # Maximum total context size in bytes (1MB)
|
|
27
|
+
TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Sandbox Configuration
|
|
31
|
+
module Sandbox
|
|
32
|
+
DOCKER_IMAGE_NAME = 'evaluator-sandbox'
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Tool Execution Configuration
|
|
36
|
+
module Tools
|
|
37
|
+
DANGEROUS_COMMANDS = %w[
|
|
38
|
+
bash sh zsh fish dash ksh csh tcsh
|
|
39
|
+
python python3 python2 ruby perl node
|
|
40
|
+
php lua tcl wish
|
|
41
|
+
curl wget nc ncat socat
|
|
42
|
+
eval exec
|
|
43
|
+
sudo su doas
|
|
44
|
+
chmod chown mount umount
|
|
45
|
+
dd mkfs fdisk parted
|
|
46
|
+
insmod rmmod modprobe
|
|
47
|
+
systemctl service
|
|
48
|
+
passwd useradd userdel groupadd groupdel
|
|
49
|
+
].freeze
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# File Path Configuration
|
|
53
|
+
module FilePath
|
|
54
|
+
ALLOWED_PATH_PATTERN = %r{\A[a-zA-Z0-9._\-/]+\z}
|
|
55
|
+
MAX_PATH_LENGTH = 4096
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -49,6 +49,26 @@ module SkillBench
|
|
|
49
49
|
{ success: false, response: { error: { message: e.message } } }
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
# Compatibility methods for ComparisonReporter
|
|
53
|
+
|
|
54
|
+
# Returns the list of dimensions from the context run.
|
|
55
|
+
#
|
|
56
|
+
# @return [Array<Object>] List of objects responding to name and score
|
|
57
|
+
def dimensions
|
|
58
|
+
return [] unless context_dimensions
|
|
59
|
+
|
|
60
|
+
context_dimensions.map do |name, dim_hash|
|
|
61
|
+
Struct.new(:name, :score).new(name.to_s, dim_hash[:score] || dim_hash['score'])
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Returns the total context score.
|
|
66
|
+
#
|
|
67
|
+
# @return [Numeric, nil]
|
|
68
|
+
def total
|
|
69
|
+
context_total
|
|
70
|
+
end
|
|
71
|
+
|
|
52
72
|
private
|
|
53
73
|
|
|
54
74
|
attr_reader :baseline, :context
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'pathname'
|
|
4
4
|
require 'cgi'
|
|
5
|
+
require_relative '../constants'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
7
8
|
module Execution
|
|
@@ -10,10 +11,6 @@ module SkillBench
|
|
|
10
11
|
class ContextHydrator
|
|
11
12
|
# Error message returned when context hydration fails.
|
|
12
13
|
HYDRATION_FAILED = 'Failed to hydrate context from source path'
|
|
13
|
-
# File extensions considered for context hydration.
|
|
14
|
-
TEXT_EXTENSIONS = %w[.md .rb .json .yml .yaml .txt].freeze
|
|
15
|
-
# Maximum file size (in bytes) for files included in context hydration.
|
|
16
|
-
MAX_FILE_SIZE = 50_000
|
|
17
14
|
|
|
18
15
|
# Loads and formats source context files.
|
|
19
16
|
#
|
|
@@ -50,6 +47,8 @@ module SkillBench
|
|
|
50
47
|
return missing_path_result unless full_path.exist? && full_path.directory?
|
|
51
48
|
|
|
52
49
|
context_files = collect_context_files(full_path)
|
|
50
|
+
return missing_path_result unless validate_total_size?(context_files)
|
|
51
|
+
|
|
53
52
|
xml_context = build_xml(context_files)
|
|
54
53
|
|
|
55
54
|
{ success: true, response: { context: xml_context } }
|
|
@@ -65,12 +64,23 @@ module SkillBench
|
|
|
65
64
|
end
|
|
66
65
|
|
|
67
66
|
def collect_context_files(full_path)
|
|
68
|
-
pattern = full_path.join("*{#{TEXT_EXTENSIONS.join(',')}}").to_s
|
|
67
|
+
pattern = full_path.join("*{#{Constants::ContextHydration::TEXT_EXTENSIONS.join(',')}}").to_s
|
|
69
68
|
Dir.glob(pattern).reject { |f| File.symlink?(f) }
|
|
70
|
-
.select { |f| File.size(f) <= MAX_FILE_SIZE }
|
|
69
|
+
.select { |f| File.size(f) <= Constants::ContextHydration::MAX_FILE_SIZE }
|
|
71
70
|
.sort
|
|
72
71
|
end
|
|
73
72
|
|
|
73
|
+
def validate_total_size?(context_files)
|
|
74
|
+
total_size = context_files.sum { |f| File.size(f) }
|
|
75
|
+
return true if total_size <= Constants::ContextHydration::MAX_TOTAL_CONTEXT_SIZE
|
|
76
|
+
|
|
77
|
+
SkillBench::ErrorLogger.log_error(
|
|
78
|
+
StandardError.new("Total context size #{total_size} exceeds maximum #{Constants::ContextHydration::MAX_TOTAL_CONTEXT_SIZE}"),
|
|
79
|
+
'ContextHydrator'
|
|
80
|
+
)
|
|
81
|
+
false
|
|
82
|
+
end
|
|
83
|
+
|
|
74
84
|
# Builds the XML structure wrapping the contents of the context files.
|
|
75
85
|
#
|
|
76
86
|
# @param context_files [Array<String>] List of absolute paths to context files.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require 'fileutils'
|
|
4
4
|
require 'tmpdir'
|
|
5
5
|
require 'open3'
|
|
6
|
+
require_relative '../constants'
|
|
6
7
|
|
|
7
8
|
module SkillBench
|
|
8
9
|
module Execution
|
|
@@ -143,18 +144,32 @@ module SkillBench
|
|
|
143
144
|
|
|
144
145
|
# Starts a Docker container for isolated command execution.
|
|
145
146
|
# Always invokes `docker build`; Docker's layer cache makes this a no-op when the image is already up to date.
|
|
147
|
+
# Uses hardened security settings for production safety.
|
|
146
148
|
#
|
|
147
149
|
# @raise [RuntimeError] when the Docker image cannot be built or the container fails to start.
|
|
148
150
|
def start_container
|
|
149
|
-
image_name =
|
|
151
|
+
image_name = Constants::Sandbox::DOCKER_IMAGE_NAME
|
|
150
152
|
docker_dir = File.expand_path('docker', __dir__)
|
|
151
153
|
|
|
152
154
|
# Build image (Docker layer cache handles no-op builds)
|
|
153
155
|
raise "Failed to build Docker image #{image_name}" unless system('docker', 'build', '-t', image_name, docker_dir, '--quiet')
|
|
154
156
|
|
|
155
|
-
# Start a detached container
|
|
157
|
+
# Start a detached container with hardened security settings
|
|
158
|
+
# --user <uid>:<gid>: Runs as the invoking host user's UID/GID (non-root unless the host process itself is root)
|
|
159
|
+
# --security-opt no-new-privileges: Prevents privilege escalation
|
|
160
|
+
# --cap-drop ALL: Drops all Linux capabilities
|
|
161
|
+
# --cap-add CHOWN, DAC_OVERRIDE: Adds back minimal capabilities for git operations
|
|
162
|
+
# --network none: Disables network access for additional isolation
|
|
156
163
|
stdout, stderr, status = Open3.capture3(
|
|
157
|
-
'docker', 'run', '-d', '--rm',
|
|
164
|
+
'docker', 'run', '-d', '--rm',
|
|
165
|
+
'--user', "#{Process.uid}:#{Process.gid}",
|
|
166
|
+
'--security-opt', 'no-new-privileges',
|
|
167
|
+
'--cap-drop', 'ALL',
|
|
168
|
+
'--cap-add', 'CHOWN',
|
|
169
|
+
'--cap-add', 'DAC_OVERRIDE',
|
|
170
|
+
'--network', 'none',
|
|
171
|
+
'-v', "#{@path}:/sandbox:rw",
|
|
172
|
+
image_name
|
|
158
173
|
)
|
|
159
174
|
|
|
160
175
|
raise "Failed to start Docker container: #{stderr}" unless status.success?
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'pathname'
|
|
4
|
+
|
|
3
5
|
module SkillBench
|
|
4
6
|
module Execution
|
|
5
7
|
# Resolves the source skill or workflow path for a given evaluation target.
|
|
@@ -8,6 +10,8 @@ module SkillBench
|
|
|
8
10
|
#
|
|
9
11
|
# @param eval_folder_path [String] Relative path to the eval directory.
|
|
10
12
|
# @param skill_path [String, nil] Optional explicit override for the source directory.
|
|
13
|
+
# @param skill_sources [Hash] Optional skill source name → directory path mapping for fallback.
|
|
14
|
+
# When provided and local resolution does not yield an existing path, each source is checked.
|
|
11
15
|
# @return [String, nil] The resolved source path relative to the evaluator repo root, or nil if unmappable.
|
|
12
16
|
# @example Infer a skill source path (NEW format):
|
|
13
17
|
# SkillBench::Execution::SourcePathResolver.call(
|
|
@@ -19,12 +23,57 @@ module SkillBench
|
|
|
19
23
|
# eval_folder_path: 'evals/skills/code-quality/rails-code-review/review-order'
|
|
20
24
|
# )
|
|
21
25
|
# # => "skills/code-quality/rails-code-review"
|
|
22
|
-
def self.call(eval_folder_path:, skill_path: nil)
|
|
26
|
+
def self.call(eval_folder_path:, skill_path: nil, skill_sources: {})
|
|
23
27
|
return skill_path if skill_path && !skill_path.empty?
|
|
24
28
|
|
|
25
|
-
segments = eval_folder_path.to_s
|
|
29
|
+
segments = Pathname.new(eval_folder_path.to_s).each_filename.to_a
|
|
30
|
+
|
|
31
|
+
local = resolve_skills_path(segments) || resolve_workflows_path(segments)
|
|
32
|
+
|
|
33
|
+
unless local.nil? || skill_sources.empty?
|
|
34
|
+
skill_name = extract_skill_name(segments)
|
|
35
|
+
return local unless skill_name
|
|
36
|
+
return local if skill_exists_at?(local)
|
|
37
|
+
|
|
38
|
+
skill_sources.each_value do |source_path|
|
|
39
|
+
candidate = find_skill_in_source(source_path, skill_name)
|
|
40
|
+
return candidate if candidate
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
local
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Extracts the skill name from the eval path segments.
|
|
48
|
+
#
|
|
49
|
+
# @param segments [Array<String>] Path segments
|
|
50
|
+
# @return [String, nil] Skill name or nil
|
|
51
|
+
def self.extract_skill_name(segments)
|
|
52
|
+
index = segments.rindex('skills')
|
|
53
|
+
return nil unless index
|
|
54
|
+
|
|
55
|
+
remaining = segments[(index + 1)..]
|
|
56
|
+
return nil if remaining.empty?
|
|
26
57
|
|
|
27
|
-
|
|
58
|
+
remaining[0]
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Finds a skill directory within a source path by name.
|
|
62
|
+
#
|
|
63
|
+
# @param source_path [String] Root directory containing skill categories
|
|
64
|
+
# @param skill_name [String] Name of the skill to find
|
|
65
|
+
# @return [String, nil] Path to the skill directory or nil
|
|
66
|
+
def self.find_skill_in_source(source_path, skill_name)
|
|
67
|
+
return nil unless source_path && Dir.exist?(source_path)
|
|
68
|
+
|
|
69
|
+
Dir.glob(File.join(source_path, '*')).each do |entry|
|
|
70
|
+
next unless Dir.exist?(entry)
|
|
71
|
+
|
|
72
|
+
candidate = File.join(entry, skill_name)
|
|
73
|
+
return candidate if Dir.exist?(candidate) && File.exist?(File.join(candidate, 'SKILL.md'))
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
nil
|
|
28
77
|
end
|
|
29
78
|
|
|
30
79
|
private_class_method def self.resolve_skills_path(segments)
|
|
@@ -55,6 +104,13 @@ module SkillBench
|
|
|
55
104
|
workflow_name = segments[index + 1]
|
|
56
105
|
"workflows/#{workflow_name}" if workflow_name
|
|
57
106
|
end
|
|
107
|
+
|
|
108
|
+
private_class_method def self.skill_exists_at?(path)
|
|
109
|
+
return false unless path
|
|
110
|
+
|
|
111
|
+
full_path = path.end_with?('SKILL.md') ? path : File.join(path, 'SKILL.md')
|
|
112
|
+
File.exist?(full_path)
|
|
113
|
+
end
|
|
58
114
|
end
|
|
59
115
|
end
|
|
60
116
|
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Registry
|
|
7
|
+
# Resolves skill paths from the ecosystem registry manifest.
|
|
8
|
+
# Reads a registry.json (from agent-mcp-runtime) and resolves
|
|
9
|
+
# pack → tile.json → skill path.
|
|
10
|
+
class PackResolver
|
|
11
|
+
# @param registry_path [String] Path to registry.json manifest
|
|
12
|
+
def initialize(registry_path)
|
|
13
|
+
@manifest = JSON.parse(File.read(registry_path))
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# Resolves a skill path within a named pack.
|
|
17
|
+
#
|
|
18
|
+
# @param pack_name [String] Pack name (e.g. "rails", "core", "hanami")
|
|
19
|
+
# @param skill_name [String] Skill name (e.g. "code-review")
|
|
20
|
+
# @return [String, nil] Absolute path to the skill directory, or nil
|
|
21
|
+
# @param visited [Array<String>] Pack names already visited; guards against circular redirects and dependencies
|
|
22
|
+
def resolve_skill(pack_name, skill_name, visited = [])
|
|
23
|
+
return nil if visited.include?(pack_name)
|
|
24
|
+
|
|
25
|
+
visited += [pack_name]
|
|
26
|
+
|
|
27
|
+
pack = @manifest.dig('packs', pack_name)
|
|
28
|
+
return nil unless pack
|
|
29
|
+
|
|
30
|
+
source_path = resolve_source(pack['source'])
|
|
31
|
+
return nil unless source_path
|
|
32
|
+
|
|
33
|
+
tile_path = File.join(source_path, pack['tile'])
|
|
34
|
+
return nil unless File.exist?(tile_path)
|
|
35
|
+
|
|
36
|
+
tile = JSON.parse(File.read(tile_path))
|
|
37
|
+
|
|
38
|
+
# 1. Try to resolve directly
|
|
39
|
+
resolved = resolve_direct(tile, source_path, skill_name)
|
|
40
|
+
return resolved if resolved
|
|
41
|
+
|
|
42
|
+
# 2. Try to resolve via deprecated_skills redirect
|
|
43
|
+
resolved = resolve_redirect(tile, skill_name, visited)
|
|
44
|
+
return resolved if resolved
|
|
45
|
+
|
|
46
|
+
# 3. Try to resolve via depends_on packs in registry
|
|
47
|
+
resolve_dependencies(pack, skill_name, visited)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Lists available pack names from the manifest.
|
|
51
|
+
#
|
|
52
|
+
# @return [Array<String>] Available pack names
|
|
53
|
+
def pack_names
|
|
54
|
+
@manifest.fetch('packs', {}).keys
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
def resolve_direct(tile, source_path, skill_name)
|
|
60
|
+
skill_entry = tile.dig('skills', skill_name)
|
|
61
|
+
return nil unless skill_entry
|
|
62
|
+
|
|
63
|
+
skill_path = File.join(source_path, skill_entry['path'])
|
|
64
|
+
resolved = File.expand_path(skill_path)
|
|
65
|
+
resolved = File.dirname(resolved) if resolved.end_with?('SKILL.md')
|
|
66
|
+
base = File.expand_path(source_path)
|
|
67
|
+
|
|
68
|
+
# Ensure resolved path is inside source directory
|
|
69
|
+
resolved == base || resolved.start_with?(base + File::SEPARATOR) ? resolved : nil
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def resolve_redirect(tile, skill_name, visited)
|
|
73
|
+
dep_entry = tile.dig('deprecated_skills', skill_name)
|
|
74
|
+
return nil unless dep_entry
|
|
75
|
+
|
|
76
|
+
moved_to = dep_entry['moved_to']
|
|
77
|
+
return nil unless moved_to
|
|
78
|
+
|
|
79
|
+
target_pack = find_pack_by_source(moved_to)
|
|
80
|
+
return nil unless target_pack
|
|
81
|
+
|
|
82
|
+
resolve_skill(target_pack, skill_name, visited)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def resolve_dependencies(pack, skill_name, visited)
|
|
86
|
+
depends_on = pack['depends_on']
|
|
87
|
+
return nil unless depends_on.is_a?(Array)
|
|
88
|
+
|
|
89
|
+
depends_on.each do |dep_pack|
|
|
90
|
+
resolved = resolve_skill(dep_pack, skill_name, visited)
|
|
91
|
+
return resolved if resolved
|
|
92
|
+
end
|
|
93
|
+
nil
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def find_pack_by_source(source)
|
|
97
|
+
@manifest.fetch('packs', {}).each do |pack_name, pack_config|
|
|
98
|
+
if pack_config['source'] == source ||
|
|
99
|
+
pack_config['source'].to_s.split('/').last == source.to_s.split('/').last
|
|
100
|
+
return pack_name
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
nil
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def resolve_source(source)
|
|
107
|
+
return nil unless source.is_a?(String) && !source.empty?
|
|
108
|
+
|
|
109
|
+
repo_name = source.split('/').last
|
|
110
|
+
candidates = [
|
|
111
|
+
File.expand_path("../#{repo_name}", Dir.pwd),
|
|
112
|
+
File.expand_path("../../#{repo_name}", Dir.pwd),
|
|
113
|
+
File.join(Dir.home, '.agent-mcp-runtime', 'cache', repo_name)
|
|
114
|
+
]
|
|
115
|
+
candidates.find { |c| Dir.exist?(c) }
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|