ruby-skill-bench 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +231 -0
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +2 -5
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/context_hydrator.rb +16 -6
- data/lib/skill_bench/execution/sandbox.rb +18 -3
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/tools/run_command.rb +2 -17
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +1 -0
- metadata +25 -2
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../execution/sandbox'
|
|
4
|
+
require_relative '../agent/react_agent'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Runs an evaluation task through an LLM agent inside a sandbox and
# normalizes the outcome into a plain result hash.
class AgentSpawnerService
  # Convenience entry point: builds the service and runs it.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run (its +path+ and +task+ are read)
  # @param system_prompt [String] The system prompt handed to the agent
  # @param provider [Object] The resolved provider (+name+, +llm+, +runtime+, +merged_config+ are read)
  # @param config [Hash, nil] Provider config
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  #   (the mock short-circuit only carries result, status and iterations)
  def self.call(evaluation, system_prompt, provider, config)
    new(evaluation, system_prompt, provider, config).call
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt handed to the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(evaluation, system_prompt, provider, config)
    @evaluation = evaluation
    @system_prompt = system_prompt
    @provider = provider
    @config = config
  end

  # Executes the agent, or returns a canned response when the provider is
  # named 'mock'.
  #
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def call
    return { result: 'mock result', status: :success, iterations: [] } if @provider.name == 'mock'

    run_agent(build_client_params, iteration_limit)
  end

  private

  # Reads the iteration cap from the explicit config, accepting either a
  # symbol or a string key, and falls back to 25.
  #
  # @return [Integer] Maximum number of agent iterations
  def iteration_limit
    return 25 if @config.nil?

    @config[:max_iterations] || @config['max_iterations'] || 25
  end

  # Runs the agent inside a sandbox; any StandardError raised along the way
  # is converted into an error-shaped response instead of propagating.
  #
  # @param client_params [Hash] Client parameters for the agent
  # @param max_iterations [Integer] Maximum iterations for the agent
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def run_agent(client_params, max_iterations)
    Execution::Sandbox.run(@evaluation.path) do |sandbox|
      outcome = Agent::ReactAgent.call(
        system_prompt: @system_prompt,
        initial_prompt: @evaluation.task,
        working_dir: sandbox.path,
        container_id: sandbox.container_id,
        client_params: client_params,
        max_iterations: max_iterations
      )

      success_payload(outcome, sandbox)
    end
  rescue StandardError => e
    failure_payload(e)
  end

  # Shapes the agent outcome: the final answer and the sandbox diff are
  # joined by a blank line, skipping whichever of the two is empty.
  #
  # @param outcome [Hash] Raw hash returned by the agent
  # @param sandbox [Object] The sandbox the agent ran in
  # @return [Hash] Agent response
  def success_payload(outcome, sandbox)
    status = outcome[:success] ? :success : :error
    answer = outcome.dig(:response, :content) || ''
    changes = Execution::Sandbox.capture_diff(sandbox.path)
    steps = outcome.dig(:response, :iterations) || []

    {
      result: [answer, changes].reject(&:empty?).join("\n\n"),
      status: status,
      runtime: @provider.runtime,
      usage: {},
      raw_response: outcome,
      iterations: steps
    }
  end

  # Shapes a rescued exception into an error response.
  #
  # @param error [StandardError] The exception that interrupted the run
  # @return [Hash] Agent response with status :error
  def failure_payload(error)
    {
      result: "Error: #{error.message}",
      status: :error,
      runtime: @provider.runtime,
      usage: {},
      raw_response: { error: error.message, backtrace: error.backtrace },
      iterations: []
    }
  end

  # Builds client parameters for the ReactAgent from the explicit config or,
  # when that is absent, the provider's merged config. Any StandardError
  # yields an empty hash.
  #
  # @return [Hash] Client parameters
  def build_client_params
    source = @config || safe_merged_config
    return {} unless source

    source.dup.tap do |params|
      params[:model] ||= @provider.llm
      params[:provider] = @provider.runtime.to_sym
    end
  rescue StandardError
    {}
  end

  # Calls merged_config on the provider, returning nil on any StandardError.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    @provider.merged_config
  rescue StandardError
    nil
  end
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Parses CLI options for the compare command.
class CompareOptionParser
  # Output formats accepted by --format. Passing this list to OptionParser
  # makes it reject anything else instead of symbolizing arbitrary input.
  FORMATS = %i[human json].freeze

  # Parses the given argv and returns the options hash.
  #
  # @param argv [Array<String>] Raw CLI arguments; recognized options are
  #   removed from this array in place (OptionParser#parse!), leaving only
  #   positional arguments behind
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails, including an
  #   unsupported --format value (OptionParser::InvalidArgument)
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options from argv. :format defaults to :human when not given.
  #
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def call
    options = { format: :human }
    build_parser(options).parse!(@argv)
    options
  end

  private

  # Builds the OptionParser instance whose handlers write into +options+.
  #
  # @param options [Hash] Options hash to populate
  # @return [OptionParser] Configured parser
  def build_parser(options)
    OptionParser.new do |opts|
      opts.banner = 'Usage: skill-bench compare <skill-name> [options]'
      opts.on('--variant-a SPEC', 'First variant (e.g., "pack:rails" or "/path/to/skill")') { |v| options[:variant_a] = v }
      opts.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') { |v| options[:variant_b] = v }
      opts.on('--eval PATH', 'Path to the eval directory') { |v| options[:eval] = v }
      # The enumerated list validates the value and yields the matching symbol;
      # OptionParser also accepts unambiguous abbreviations of listed values.
      opts.on('--format FORMAT', FORMATS, 'Output format (human, json)') { |v| options[:format] = v }
      opts.on('-h', '--help', 'Prints this help') do
        puts opts
        raise SkillBench::HelpRequested
      end
    end
  end
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Renders a side-by-side score table for two evaluation results on stdout.
class ComparisonReporter
  # Convenience entry point: builds a reporter and prints the report.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Column label for the first variant
  # @param label_b [String] Column label for the second variant
  # @return [nil]
  def self.call(result_a, result_b, label_a, label_b)
    new(result_a, result_b, label_a, label_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Column label for the first variant
  # @param label_b [String] Column label for the second variant
  def initialize(result_a, result_b, label_a, label_b)
    @result_a = result_a
    @result_b = result_b
    @label_a = label_a
    @label_b = label_b
  end

  # Prints the table header, then the dimension rows, the total row and the
  # verdict line. Only the header is printed when either result has no
  # report under [:response][:report].
  #
  # @return [nil]
  def call
    print_header

    first, second = [@result_a, @result_b].map { |result| result.dig(:response, :report) }
    return unless first && second

    print_dimension_scores(first, second)
    print_total_scores(first, second)
    print_verdicts(first, second)
  end

  private

  # Prints the report title and the table heading rows.
  def print_header
    [
      "\n=== Comparison Report ===",
      "| Dimension | #{@label_a} | #{@label_b} | Delta |",
      '|-----------|----------|----------|-------|'
    ].each { |line| puts line }
  end

  # Formats one table row.
  #
  # @param name [Object] Row label
  # @param left [Numeric] Score of the first variant
  # @param right [Numeric] Score of the second variant
  # @param delta [Numeric] Difference (first minus second)
  # @return [String] The formatted row
  def table_row(name, left, right, delta)
    format('| %<name>-9s | %<a>8.1f | %<b>8.1f | %<delta>+5.1f |',
           name: name, a: left, b: right, delta: delta)
  end

  # Prints one row per dimension of the first report. The second report's
  # score is looked up by dimension name and counts as 0 when missing.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_dimension_scores(report_a, report_b)
    others = report_b.dimensions.each_with_object({}) { |entry, index| index[entry.name] = entry }

    report_a.dimensions.each do |entry|
      left = entry.score
      right = others[entry.name]&.score || 0
      puts table_row(entry.name, left, right, (left - right).to_f)
    end
  end

  # Prints the TOTAL row; skipped when either report has no total.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_total_scores(report_a, report_b)
    left = report_a.total
    right = report_b.total
    return unless left && right

    difference = left - right
    puts table_row('TOTAL', left.to_f, right.to_f, difference.to_f)
  end

  # Prints both verdicts on a single line.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_verdicts(report_a, report_b)
    first = format_verdict(report_a.verdict)
    second = format_verdict(report_b.verdict)
    puts "| A: #{first} | B: #{second} |"
  end

  # Maps boolean verdicts to PASS/FAIL; anything else is stringified.
  #
  # @param verdict [Object] The verdict carried by a report
  # @return [String] Printable verdict
  def format_verdict(verdict)
    return 'PASS' if verdict.equal?(true)
    return 'FAIL' if verdict.equal?(false)

    verdict.to_s
  end
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'variant_resolver'
|
|
4
|
+
require_relative 'runner_service'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Resolves and evaluates the two variants of a skill comparison.
class ComparisonRunner
  # Convenience entry point: builds the runner and executes both variants.
  #
  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  # @return [Hash] Hash with :result_a and :result_b keys
  def self.call(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    new(variant_a, variant_b, skill_name, eval_path, manifest_path: manifest_path).call
  end

  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  def initialize(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    @variant_a = variant_a
    @variant_b = variant_b
    @skill_name = skill_name
    @eval_path = eval_path
    @manifest_path = manifest_path
  end

  # Resolves both variants first (A, then B) and only then runs the eval for
  # each of them (A, then B).
  #
  # @return [Hash] Hash with :result_a and :result_b keys
  def call
    resolved = [@variant_a, @variant_b].map { |variant| resolve(variant) }
    first, second = resolved.map { |skill_paths| evaluate(skill_paths) }

    { result_a: first, result_b: second }
  end

  private

  # Resolves one variant through VariantResolver.
  #
  # @param variant [Hash] Variant specification
  # @return [Object] Whatever VariantResolver returns for the variant
  def resolve(variant)
    VariantResolver.call(variant, @skill_name, manifest_path: @manifest_path)
  end

  # Runs the eval against the resolved skills through RunnerService.
  #
  # @param skill_paths [Object] Resolved skills for one variant
  # @return [Object] The RunnerService result
  def evaluate(skill_paths)
    RunnerService.call(eval_name: @eval_path, skill_names: skill_paths)
  end
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Reads the SKILL.md of each skill and concatenates the non-empty ones.
class ContextLoaderService
  # Convenience entry point: builds the loader and returns the combined text.
  #
  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  # @return [String] The combined skill context
  def self.call(skills)
    new(skills).call
  end

  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  def initialize(skills)
    @skills = skills
  end

  # Joins the SKILL.md contents of all skills, in order, separated by a
  # 40-character '=' rule surrounded by blank lines. Skills whose file is
  # missing or empty contribute nothing.
  #
  # @return [String] The combined skill context ('' when there are no skills)
  def call
    return '' if @skills.nil?
    return '' if @skills.empty?

    divider = "\n\n#{'=' * 40}\n\n"
    sections = @skills.each_with_object([]) do |skill, kept|
      text = load_skill_context(skill)
      kept << text unless text.empty?
    end

    sections.join(divider)
  end

  private

  # Reads SKILL.md from the skill's directory.
  #
  # @param skill [SkillBench::Models::Skill] The skill to load context from
  # @return [String] The file contents, or an empty string if the file does not exist
  def load_skill_context(skill)
    location = File.join(skill.path, 'SKILL.md')
    return '' unless File.exist?(location)

    File.read(location)
  end
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Builds standardized error responses with metadata (eval name, skill names,
# provider name).
class ErrorResponseBuilder
  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.config_error(error, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).config_error(error)
  end

  # Builds an agent error response.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.agent_error(result, phase, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).agent_error(result, phase)
  end

  # Builds an empty context error response.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.empty_context_error(evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).empty_context_error
  end

  # Enriches an existing error result with metadata.
  #
  # @param result [Hash] The existing error result
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Enriched error result with metadata
  def self.enrich_error(result, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).enrich_error(result)
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  def initialize(evaluation, provider, skill_names)
    @evaluation = evaluation
    @provider = provider
    @skill_names = skill_names
  end

  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @return [Hash] Error result with metadata
  def config_error(error)
    base_error_result("Configuration error: #{error.message}")
  end

  # Builds an agent error response. The message is taken from the result's
  # :raw_response and falls back to 'unknown error' when none can be found.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @return [Hash] Error result with metadata
  def agent_error(result, phase)
    detail = extract_error_message(result[:raw_response]) || 'unknown error'
    base_error_result("#{phase.capitalize} agent failed: #{detail}")
  end

  # Builds an empty context error response.
  #
  # @return [Hash] Error result with metadata
  def empty_context_error
    base_error_result('Skill context is empty. Ensure SKILL.md exists and has content.')
  end

  # Enriches an existing error result with metadata. Metadata keys overwrite
  # same-named keys already present in +result+; +result+ itself is not mutated.
  #
  # @param result [Hash] The existing error result
  # @return [Hash] Enriched error result with metadata
  def enrich_error(result)
    result.merge(metadata)
  end

  private

  # Finds an error message in a raw agent response. Checked in order:
  # raw[:response][:error], then raw[:error]. Each may be either a Hash
  # carrying :message or the message itself (e.g. a String). A bare
  # Hash#dig chain would raise TypeError on the String form and mask the
  # failure being reported.
  #
  # @param raw [Hash, nil] The raw agent response
  # @return [Object, nil] The message, or nil when none is present
  def extract_error_message(raw)
    return unless raw.is_a?(Hash)

    response = raw[:response]
    nested = response[:error] if response.is_a?(Hash)

    [nested, raw[:error]].each do |candidate|
      message = candidate.is_a?(Hash) ? candidate[:message] : candidate
      return message if message
    end

    nil
  end

  # Metadata shared by every response built here.
  #
  # @return [Hash] Hash with :eval_name, :skill_name and :provider_name
  def metadata
    {
      eval_name: @evaluation.name,
      skill_name: @skill_names.join(', '),
      provider_name: @provider.name
    }
  end

  # Builds a base error result with metadata.
  #
  # @param message [String] The error message
  # @return [Hash] Error result with metadata
  def base_error_result(message)
    { success: false, response: { error: { message: message } } }.merge(metadata)
  end
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../models/eval'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Services
|
|
7
|
+
# Turns an eval name or path into a loaded eval.
class EvalResolver
  # Convenience entry point: builds the resolver and loads the eval.
  #
  # @param eval_name [String] Name or path of the eval to resolve
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def self.call(eval_name)
    new(eval_name).call
  end

  # @param eval_name [String] Name or path of the eval
  def initialize(eval_name)
    @eval_name = eval_name
  end

  # Loads the eval via SkillBench::Models::Eval.load.
  #
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def call
    SkillBench::Models::Eval.load(eval_path)
  end

  private

  # A value containing '/' is used as a path verbatim; a bare name is looked
  # up under the relative 'evals/' directory.
  #
  # @return [String] Path handed to the eval loader
  def eval_path
    return @eval_name if @eval_name.include?('/')

    "evals/#{@eval_name}"
  end
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Calculates the exit code based on comparison results.
class ExitCodeCalculator
  # Calculates the exit code from comparison results.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @return [Integer] 0 if both pass, 1 otherwise
  def self.call(result_a, result_b)
    new(result_a, result_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  def initialize(result_a, result_b)
    @result_a = result_a
    @result_b = result_b
  end

  # Calculates the exit code from comparison results. A result without a
  # report under [:response][:report] counts as a failure.
  #
  # @return [Integer] 0 if both pass, 1 otherwise
  def call
    verdicts = [@result_a, @result_b].map { |result| verdict_of(result.dig(:response, :report)) }
    verdicts.all? { |verdict| passed?(verdict) } ? 0 : 1
  end

  private

  # Reads the verdict from a report given either as a Hash (:verdict key) or
  # as an object responding to #verdict.
  #
  # @param report [Hash, Object, nil] The evaluation report
  # @return [Object, nil] The verdict, or nil when there is no report
  def verdict_of(report)
    report.is_a?(Hash) ? report[:verdict] : report&.verdict
  end

  # A verdict passes when it is 'PASS' (String or Symbol) or boolean true.
  # Accepting true keeps the exit code consistent with report output that
  # displays a true verdict as PASS.
  #
  # @param verdict [Object] The verdict carried by a report
  # @return [Boolean] Whether the verdict counts as a pass
  def passed?(verdict)
    verdict.equal?(true) || verdict.to_s == 'PASS'
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Derives the parameters handed to the judge from a provider and its config.
class JudgeParamsBuilder
  # Convenience entry point: builds the builder and returns the parameters.
  #
  # @param provider [Object] The resolved provider (+name+, +llm+, +runtime+, +merged_config+ are read)
  # @param config [Hash, nil] Provider config
  # @return [Hash] Judge parameters with api_key, model, and provider
  def self.call(provider, config)
    new(provider, config).call
  end

  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(provider, config)
    @provider = provider
    @config = config
  end

  # Returns { provider: :mock } for a provider named 'mock'. Otherwise the
  # explicit config is used, falling back to the provider's merged config;
  # with neither available the result is an empty hash. KeyError and
  # NoMethodError raised while building also yield an empty hash.
  #
  # @return [Hash] Judge parameters with api_key, model, and provider
  def call
    return { provider: :mock } if @provider.name == 'mock'

    settings = @config || safe_merged_config
    settings ? params_from(settings) : {}
  rescue KeyError, NoMethodError
    # Expected errors from missing config keys or nil config
    {}
  end

  private

  # Assembles the parameter hash; the model falls back to the provider's llm
  # when the settings carry none.
  #
  # @param settings [Hash] The config to read :api_key and :model from
  # @return [Hash] Judge parameters with api_key, model, and provider
  def params_from(settings)
    {
      api_key: settings[:api_key],
      model: settings[:model] || @provider.llm,
      provider: @provider.runtime.to_sym
    }
  end

  # Calls merged_config on the provider, returning nil on any StandardError.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    @provider.merged_config
  rescue StandardError
    nil
  end
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Finds the registry manifest file path.
class ManifestFinder
  # Default path relative to current working directory.
  DEFAULT_PATH = '../agent-mcp-runtime/registry.json'

  # Finds the registry manifest file.
  #
  # @param path [String, nil] Optional custom path to the manifest
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def self.call(path: nil)
    new(path: path).call
  end

  # @param path [String, nil] Optional custom path to the manifest
  def initialize(path: nil)
    @path = path
  end

  # Finds the registry manifest file. Both a custom path and the default are
  # expanded against the current working directory, so the returned path is
  # always absolute as documented.
  #
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def call
    manifest_path = File.expand_path(@path || DEFAULT_PATH, Dir.pwd)
    # File.file? rather than File.exist?: a directory (e.g. an empty custom
    # path expanding to the working directory) is not a usable manifest.
    raise ArgumentError, "Registry manifest not found: #{manifest_path}" unless File.file?(manifest_path)

    manifest_path
  end
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Extracts the agent's output as text for evaluation.
class OutputFormatter
  # Convenience entry point: builds the formatter and returns the text.
  #
  # @param agent_result [Hash] The agent result containing the output under :result
  # @return [String] The formatted output
  def self.call(agent_result)
    new(agent_result).call
  end

  # @param agent_result [Hash] The agent result containing the output under :result
  def initialize(agent_result)
    @agent_result = agent_result
  end

  # Stringifies whatever is stored under :result; a missing or nil value
  # becomes an empty string.
  #
  # @return [String] The formatted output
  def call
    output = @agent_result[:result]
    output.to_s
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|