ruby-skill-bench 0.1.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +86 -0
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +1 -0
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/version.rb +1 -1
- metadata +23 -2
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Registry
|
|
7
|
+
# Resolves skill paths from the ecosystem registry manifest.
|
|
8
|
+
# Reads a registry.json (from agent-mcp-runtime) and resolves
|
|
9
|
+
# pack → tile.json → skill path.
|
|
10
|
+
class PackResolver
  # Loads and parses the registry manifest.
  #
  # @param registry_path [String] Path to registry.json manifest
  # @raise [Errno::ENOENT] when the manifest file does not exist
  # @raise [JSON::ParserError] when the manifest is not valid JSON
  def initialize(registry_path)
    @manifest = JSON.parse(File.read(registry_path))
  end

  # Resolves a skill path within a named pack.
  #
  # Lookup order: the pack's own tile, then a deprecated_skills redirect
  # declared in that tile, then each pack listed under depends_on.
  #
  # @param pack_name [String] Pack name (e.g. "rails", "core", "hanami")
  # @param skill_name [String] Skill name (e.g. "code-review")
  # @param visited [Array<String>] Packs already on the current resolution
  #   path; used to stop redirect/dependency cycles
  # @return [String, nil] Absolute path to the skill directory, or nil
  # @raise [JSON::ParserError] when a tile file is not valid JSON
  def resolve_skill(pack_name, skill_name, visited = [])
    return nil if visited.include?(pack_name)

    # Build a new array rather than mutating, so the caller's list (and the
    # default argument) are never modified.
    visited += [pack_name]

    pack = @manifest.dig('packs', pack_name)
    return nil unless pack.is_a?(Hash)

    source_path = resolve_source(pack['source'])
    return nil unless source_path

    tile = load_tile(source_path, pack['tile'])
    return nil unless tile

    # 1. direct entry, 2. deprecated_skills redirect, 3. depends_on packs
    resolve_direct(tile, source_path, skill_name) ||
      resolve_redirect(tile, skill_name, visited) ||
      resolve_dependencies(pack, skill_name, visited)
  end

  # Lists available pack names from the manifest.
  #
  # @return [Array<String>] Available pack names
  def pack_names
    @manifest.fetch('packs', {}).keys
  end

  private

  # Reads and parses a pack's tile file.
  #
  # @param source_path [String] Directory of the pack's source checkout
  # @param tile_file [String, nil] Tile file name from the manifest entry
  # @return [Hash, nil] Parsed tile, or nil when the entry names no tile,
  #   the tile is not a regular file, or its top level is not a JSON object
  def load_tile(source_path, tile_file)
    # A manifest entry without a "tile" key must not reach File.join(nil).
    return nil unless tile_file.is_a?(String)

    tile_path = File.join(source_path, tile_file)
    # File.file? rather than File.exist?: reading a directory raises EISDIR.
    return nil unless File.file?(tile_path)

    tile = JSON.parse(File.read(tile_path))
    tile.is_a?(Hash) ? tile : nil
  end

  # Resolves a skill listed directly in the tile's "skills" table.
  #
  # @param tile [Hash] Parsed tile
  # @param source_path [String] Directory of the pack's source checkout
  # @param skill_name [String] Skill name to look up
  # @return [String, nil] Absolute skill directory, or nil when the skill is
  #   not listed, has no string "path", or points outside the source directory
  def resolve_direct(tile, source_path, skill_name)
    skill_entry = tile.dig('skills', skill_name)
    return nil unless skill_entry.is_a?(Hash)

    relative_path = skill_entry['path']
    return nil unless relative_path.is_a?(String)

    resolved = File.expand_path(File.join(source_path, relative_path))
    # Entries may point at the SKILL.md file itself; callers want the directory.
    resolved = File.dirname(resolved) if resolved.end_with?('SKILL.md')
    base = File.expand_path(source_path)

    # Ensure resolved path is inside source directory
    resolved == base || resolved.start_with?(base + File::SEPARATOR) ? resolved : nil
  end

  # Follows a "deprecated_skills" redirect to the pack the skill moved to.
  #
  # @param tile [Hash] Parsed tile
  # @param skill_name [String] Skill name to look up
  # @param visited [Array<String>] Packs already on the resolution path
  # @return [String, nil] Resolved skill directory, or nil
  def resolve_redirect(tile, skill_name, visited)
    dep_entry = tile.dig('deprecated_skills', skill_name)
    return nil unless dep_entry.is_a?(Hash)

    moved_to = dep_entry['moved_to']
    return nil unless moved_to

    target_pack = find_pack_by_source(moved_to)
    return nil unless target_pack

    resolve_skill(target_pack, skill_name, visited)
  end

  # Tries each pack named in the pack's "depends_on" list, in order.
  #
  # @param pack [Hash] Manifest entry of the current pack
  # @param skill_name [String] Skill name to look up
  # @param visited [Array<String>] Packs already on the resolution path
  # @return [String, nil] First successful resolution, or nil
  def resolve_dependencies(pack, skill_name, visited)
    depends_on = pack['depends_on']
    return nil unless depends_on.is_a?(Array)

    depends_on.each do |dep_pack|
      resolved = resolve_skill(dep_pack, skill_name, visited)
      return resolved if resolved
    end
    nil
  end

  # Finds the first pack whose source equals +source+ or shares its last
  # "/"-separated segment.
  #
  # @param source [String] Source identifier from a moved_to entry
  # @return [String, nil] Matching pack name, or nil
  def find_pack_by_source(source)
    repo_name = source.to_s.split('/').last
    pack_name, = @manifest.fetch('packs', {}).find do |_name, pack_config|
      pack_source = pack_config['source']
      pack_source == source || pack_source.to_s.split('/').last == repo_name
    end
    pack_name
  end

  # Locates a local checkout for a pack source.
  #
  # Candidates, in order: a sibling of the working directory, a directory two
  # levels up, then ~/.agent-mcp-runtime/cache.
  #
  # @param source [String, nil] Source identifier from the manifest
  # @return [String, nil] First existing candidate directory, or nil
  def resolve_source(source)
    return nil unless source.is_a?(String) && !source.empty?

    repo_name = source.split('/').last
    candidates = [
      File.expand_path("../#{repo_name}", Dir.pwd),
      File.expand_path("../../#{repo_name}", Dir.pwd),
      File.join(Dir.home, '.agent-mcp-runtime', 'cache', repo_name)
    ]
    candidates.find { |c| Dir.exist?(c) }
  end
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../execution/sandbox'
|
|
4
|
+
require_relative '../agent/react_agent'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Spawns and executes LLM agents for evaluation.
|
|
9
|
+
class AgentSpawnerService
  # One-shot entry point: builds a spawner and runs it.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def self.call(evaluation, system_prompt, provider, config)
    spawner = new(evaluation, system_prompt, provider, config)
    spawner.call
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(evaluation, system_prompt, provider, config)
    @config = config
    @provider = provider
    @system_prompt = system_prompt
    @evaluation = evaluation
  end

  # Runs the agent, short-circuiting with a canned response for the
  # provider named 'mock'.
  #
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def call
    if @provider.name == 'mock'
      return { result: 'mock result', status: :success, iterations: [] }
    end

    params = build_client_params
    run_agent(params, iteration_limit)
  end

  private

  # Reads max_iterations from the config under a symbol or string key,
  # falling back to 25.
  #
  # @return [Integer] Maximum iterations for the agent
  def iteration_limit
    return 25 if @config.nil?

    @config[:max_iterations] || @config['max_iterations'] || 25
  end

  # Runs the agent inside a sandbox; any StandardError becomes an error hash.
  #
  # @param client_params [Hash] Client parameters for the agent
  # @param max_iterations [Integer] Maximum iterations for the agent
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def run_agent(client_params, max_iterations)
    Execution::Sandbox.run(@evaluation.path) do |sandbox|
      outcome = Agent::ReactAgent.call(
        system_prompt: @system_prompt,
        initial_prompt: @evaluation.task,
        working_dir: sandbox.path,
        container_id: sandbox.container_id,
        client_params: client_params,
        max_iterations: max_iterations
      )
      assemble_result(outcome, sandbox)
    end
  rescue StandardError => e
    failure_result(e)
  end

  # Turns the raw agent outcome plus the sandbox diff into the response hash.
  #
  # @param outcome [Hash] Value returned by the agent
  # @param sandbox [Object] Sandbox yielded by Execution::Sandbox.run
  # @return [Hash] Agent response
  def assemble_result(outcome, sandbox)
    status = outcome[:success] ? :success : :error
    answer = outcome.dig(:response, :content) || ''
    workspace_diff = Execution::Sandbox.capture_diff(sandbox.path)
    steps = outcome.dig(:response, :iterations) || []

    # Final answer first, then the diff; empty parts are dropped.
    sections = [answer, workspace_diff].reject(&:empty?)

    {
      result: sections.join("\n\n"),
      status: status,
      runtime: @provider.runtime,
      usage: {},
      raw_response: outcome,
      iterations: steps
    }
  end

  # Builds the response hash for a failed run.
  #
  # @param error [StandardError] The rescued error
  # @return [Hash] Agent response with status :error
  def failure_result(error)
    {
      result: "Error: #{error.message}",
      status: :error,
      runtime: @provider.runtime,
      usage: {},
      raw_response: { error: error.message, backtrace: error.backtrace },
      iterations: []
    }
  end

  # Builds client parameters for the ReactAgent from the explicit config or,
  # failing that, the provider's merged config. Any error yields {}.
  #
  # @return [Hash] Client parameters
  def build_client_params
    source = @config || safe_merged_config
    return {} unless source

    source.dup.tap do |params|
      params[:model] ||= @provider.llm
      params[:provider] = @provider.runtime.to_sym
    end
  rescue StandardError
    {}
  end

  # Calls merged_config on the provider, returning nil on any error.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    begin
      @provider.merged_config
    rescue StandardError
      nil
    end
  end
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Parses CLI options for the compare command.
|
|
8
|
+
class CompareOptionParser
  # Parses the given argv and returns the options hash.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def self.call(argv)
    instance = new(argv)
    instance.call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options out of argv in place; recognised options are removed from
  # the array and :format defaults to :human.
  #
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def call
    { format: :human }.tap do |options|
      build_parser(options).parse!(@argv)
    end
  end

  private

  # Builds the OptionParser whose handlers write into +options+.
  #
  # @param options [Hash] Options hash to populate
  # @return [OptionParser] Configured parser
  def build_parser(options)
    parser = OptionParser.new
    parser.banner = 'Usage: skill-bench compare <skill-name> [options]'

    parser.on('--variant-a SPEC', 'First variant (e.g., "pack:rails" or "/path/to/skill")') do |spec|
      options[:variant_a] = spec
    end
    parser.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') do |spec|
      options[:variant_b] = spec
    end
    parser.on('--eval PATH', 'Path to the eval directory') do |path|
      options[:eval] = path
    end
    parser.on('--format FORMAT', 'Output format (human, json)') do |name|
      options[:format] = name.to_sym
    end
    parser.on('-h', '--help', 'Prints this help') do
      puts parser
      raise SkillBench::HelpRequested
    end

    parser
  end
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Prints a formatted comparison report for two evaluation results.
|
|
6
|
+
class ComparisonReporter
  # Prints the comparison report to stdout.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  # @return [nil]
  def self.call(result_a, result_b, label_a, label_b)
    reporter = new(result_a, result_b, label_a, label_b)
    reporter.call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  def initialize(result_a, result_b, label_a, label_b)
    @label_a = label_a
    @label_b = label_b
    @result_a = result_a
    @result_b = result_b
  end

  # Prints the table header, then (only when both results carry a report)
  # one row per dimension of the first report, a TOTAL row and the verdicts.
  #
  # @return [nil]
  def call
    puts "\n=== Comparison Report ==="
    puts "| Dimension | #{@label_a} | #{@label_b} | Delta |"
    puts '|-----------|----------|----------|-------|'

    first, second = [@result_a, @result_b].map { |result| result.dig(:response, :report) }
    return unless first && second

    print_dimension_scores(first, second)
    print_total_scores(first, second)
    print_verdicts(first, second)
  end

  private

  # Prints one row per dimension of the first report; a dimension missing
  # from the second report is scored 0 there.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_dimension_scores(report_a, report_b)
    counterparts = report_b.dimensions.each_with_object({}) do |dimension, index|
      index[dimension.name] = dimension
    end

    report_a.dimensions.each do |dimension|
      left = dimension.score
      right = counterparts[dimension.name]&.score || 0
      print_row(dimension.name, left, right, left - right)
    end
  end

  # Prints the TOTAL row, skipped when either total is missing.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_total_scores(report_a, report_b)
    left = report_a.total
    right = report_b.total
    return unless left && right

    print_row('TOTAL', left.to_f, right.to_f, left - right)
  end

  # Prints both verdicts on a single line.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_verdicts(report_a, report_b)
    labels = [report_a, report_b].map { |report| format_verdict(report.verdict) }
    puts "| A: #{labels.first} | B: #{labels.last} |"
  end

  # Prints a single fixed-width table row.
  #
  # @param name [String] Row label
  # @param left [Numeric] Score of the first variant
  # @param right [Numeric] Score of the second variant
  # @param delta [Numeric] left minus right
  def print_row(name, left, right, delta)
    puts format('| %<name>-9s | %<a>8.1f | %<b>8.1f | %<delta>+5.1f |',
                name: name, a: left, b: right, delta: delta.to_f)
  end

  # Maps boolean verdicts to PASS/FAIL; anything else is stringified.
  #
  # @param verdict [Object] Verdict taken from a report
  # @return [String] Display text
  def format_verdict(verdict)
    return 'PASS' if verdict.equal?(true)
    return 'FAIL' if verdict.equal?(false)

    verdict.to_s
  end
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'variant_resolver'
|
|
4
|
+
require_relative 'runner_service'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Runs both variants of a skill comparison.
|
|
9
|
+
class ComparisonRunner
  # Runs both variants and returns their results.
  #
  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  # @return [Hash] Hash with :result_a and :result_b keys
  def self.call(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    runner = new(variant_a, variant_b, skill_name, eval_path, manifest_path: manifest_path)
    runner.call
  end

  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  def initialize(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    @manifest_path = manifest_path
    @eval_path = eval_path
    @skill_name = skill_name
    @variant_a = variant_a
    @variant_b = variant_b
  end

  # Resolves both variants first, then runs the eval once per variant
  # (A before B in both phases).
  #
  # @return [Hash] Hash with :result_a and :result_b keys
  def call
    resolved = [@variant_a, @variant_b].map { |variant| resolve(variant) }
    outcome_a, outcome_b = resolved.map { |skill_paths| evaluate(skill_paths) }

    { result_a: outcome_a, result_b: outcome_b }
  end

  private

  # Resolves one variant through VariantResolver.
  #
  # @param variant [Hash] Variant specification
  # @return [Object] Whatever VariantResolver.call returns for the variant
  def resolve(variant)
    VariantResolver.call(variant, @skill_name, manifest_path: @manifest_path)
  end

  # Runs the eval against one resolved variant through RunnerService.
  #
  # @param skill_paths [Object] Value returned by #resolve
  # @return [Object] Whatever RunnerService.call returns
  def evaluate(skill_paths)
    RunnerService.call(eval_name: @eval_path, skill_names: skill_paths)
  end
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Loads and combines skill context from SKILL.md files.
|
|
6
|
+
class ContextLoaderService
  # Loads and combines skill context from SKILL.md files.
  #
  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  # @return [String] The combined skill context
  def self.call(skills)
    new(skills).call
  end

  # @param skills [Array<SkillBench::Models::Skill>, nil] The skills to load context from
  def initialize(skills)
    @skills = skills
  end

  # Reads each skill's SKILL.md and joins the non-empty contents with a
  # 40-character "=" rule surrounded by blank lines.
  #
  # @return [String] The combined skill context; '' when there are no skills
  #   or none of them has a readable, non-empty SKILL.md
  def call
    return '' if @skills.nil? || @skills.empty?

    @skills.map { |skill| load_skill_context(skill) }
           .reject(&:empty?)
           .join("\n\n#{'=' * 40}\n\n")
  end

  private

  # Loads the skill context from a single skill's SKILL.md file.
  #
  # @param skill [SkillBench::Models::Skill] The skill to load context from
  # @return [String] The skill context or empty string if not found
  def load_skill_context(skill)
    skill_md = File.join(skill.path, 'SKILL.md')
    # File.file? instead of File.exist?: a directory named SKILL.md exists
    # but cannot be read (File.read raises Errno::EISDIR), so treat it as
    # "no context" like a missing file.
    File.file?(skill_md) ? File.read(skill_md) : ''
  end
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Builds standardized error responses with metadata.
|
|
6
|
+
class ErrorResponseBuilder
  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.config_error(error, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).config_error(error)
  end

  # Builds an agent error response.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.agent_error(result, phase, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).agent_error(result, phase)
  end

  # Builds an empty context error response.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.empty_context_error(evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).empty_context_error
  end

  # Enriches an existing error result with metadata.
  #
  # @param result [Hash] The existing error result
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Enriched error result with metadata
  def self.enrich_error(result, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).enrich_error(result)
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>, nil] Names of the skills
  def initialize(evaluation, provider, skill_names)
    @evaluation = evaluation
    @provider = provider
    @skill_names = skill_names
  end

  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @return [Hash] Error result with metadata
  def config_error(error)
    base_error_result("Configuration error: #{error.message}")
  end

  # Builds an agent error response.
  #
  # The message is taken from the result's :raw_response, looking at
  # response.error first and then at the top-level error. Each may be a Hash
  # carrying :message or a plain message string; 'unknown error' is used when
  # neither yields a message.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @return [Hash] Error result with metadata
  def agent_error(result, phase)
    error_msg = extract_error_message(result[:raw_response]) || 'unknown error'
    base_error_result("#{phase.capitalize} agent failed: #{error_msg}")
  end

  # Builds an empty context error response.
  #
  # @return [Hash] Error result with metadata
  def empty_context_error
    base_error_result('Skill context is empty. Ensure SKILL.md exists and has content.')
  end

  # Enriches an existing error result with metadata.
  #
  # @param result [Hash] The existing error result
  # @return [Hash] Enriched error result with metadata
  def enrich_error(result)
    result.merge(metadata)
  end

  private

  # Pulls an error message out of a raw agent response without assuming the
  # shape of its :error values.
  #
  # Hash#dig raises TypeError when an intermediate value is a String, and a
  # raw response of the form { error: "message", backtrace: [...] } has
  # exactly that shape, so each level is type-checked instead of dug through.
  #
  # @param raw [Hash, nil] The raw response
  # @return [Object, nil] The message, or nil when none is present
  def extract_error_message(raw)
    return nil unless raw.is_a?(Hash)

    response = raw[:response]
    nested_error = response.is_a?(Hash) ? response[:error] : nil

    [nested_error, raw[:error]].each do |candidate|
      message = candidate.is_a?(Hash) ? candidate[:message] : candidate
      return message if message
    end
    nil
  end

  # Metadata attached to every error result.
  #
  # @return [Hash] eval_name, skill_name (comma-joined) and provider_name
  def metadata
    {
      eval_name: @evaluation.name,
      # Array() tolerates a nil or single-string skill list.
      skill_name: Array(@skill_names).join(', '),
      provider_name: @provider.name
    }
  end

  # Builds a base error result with metadata.
  #
  # @param message [String] The error message
  # @return [Hash] Error result with metadata
  def base_error_result(message)
    {
      success: false,
      response: {
        error: {
          message: message
        }
      }
    }.merge(metadata)
  end
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../models/eval'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Resolves an eval from a name or path.
|
|
8
|
+
class EvalResolver
  # Resolves an eval from a name or path.
  #
  # @param eval_name [String] Name or path of the eval to resolve
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def self.call(eval_name)
    resolver = new(eval_name)
    resolver.call
  end

  # @param eval_name [String] Name or path of the eval
  def initialize(eval_name)
    @eval_name = eval_name
  end

  # Loads the eval found at the resolved path.
  #
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def call
    SkillBench::Models::Eval.load(resolved_path)
  end

  private

  # A value containing "/" is used as a path verbatim; a bare name is looked
  # up under the "evals/" directory.
  #
  # @return [String] Path handed to the eval loader
  def resolved_path
    return @eval_name if @eval_name.include?('/')

    "evals/#{@eval_name}"
  end
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Calculates the exit code based on comparison results.
|
|
6
|
+
class ExitCodeCalculator
  # Calculates the exit code from comparison results.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @return [Integer] 0 if both pass, 1 otherwise
  def self.call(result_a, result_b)
    new(result_a, result_b).call
  end

  # @param result_a [Hash, nil] First evaluation result
  # @param result_b [Hash, nil] Second evaluation result
  def initialize(result_a, result_b)
    @result_a = result_a
    @result_b = result_b
  end

  # Calculates the exit code from comparison results.
  #
  # A result passes when its report verdict is boolean true or 'PASS'.
  # A missing result, report or verdict counts as a failure.
  #
  # @return [Integer] 0 if both pass, 1 otherwise
  def call
    passed?(@result_a) && passed?(@result_b) ? 0 : 1
  end

  private

  # Reads the verdict from a result's report, which may be a Hash with a
  # :verdict key or an object responding to #verdict.
  #
  # @param result [Hash, nil] Evaluation result
  # @return [Object, nil] The verdict, or nil when it cannot be read
  def verdict_for(result)
    return nil unless result.is_a?(Hash)

    report = result.dig(:response, :report)
    return report[:verdict] if report.is_a?(Hash)

    report.verdict if report.respond_to?(:verdict)
  end

  # Verdicts can be booleans as well as 'PASS'/'FAIL' labels, so both
  # passing forms are accepted.
  #
  # @param result [Hash, nil] Evaluation result
  # @return [Boolean] Whether the result's verdict is a pass
  def passed?(result)
    verdict = verdict_for(result)
    verdict == true || verdict.to_s == 'PASS'
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|