ruby-skill-bench 0.1.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +86 -0
  3. data/lib/skill_bench/cli/compare_command.rb +91 -0
  4. data/lib/skill_bench/cli/help_printer.rb +9 -1
  5. data/lib/skill_bench/cli/run_command.rb +6 -4
  6. data/lib/skill_bench/cli.rb +7 -4
  7. data/lib/skill_bench/clients/all.rb +1 -0
  8. data/lib/skill_bench/clients/providers/mock.rb +56 -0
  9. data/lib/skill_bench/commands/run.rb +6 -2
  10. data/lib/skill_bench/config/applier.rb +1 -0
  11. data/lib/skill_bench/config/defaults.rb +1 -0
  12. data/lib/skill_bench/config/facade_readers.rb +7 -0
  13. data/lib/skill_bench/config/json_loader.rb +3 -3
  14. data/lib/skill_bench/config/store.rb +5 -0
  15. data/lib/skill_bench/config.rb +10 -1
  16. data/lib/skill_bench/delta_report.rb +20 -0
  17. data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
  18. data/lib/skill_bench/registry/pack_resolver.rb +119 -0
  19. data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
  20. data/lib/skill_bench/services/compare_option_parser.rb +55 -0
  21. data/lib/skill_bench/services/comparison_reporter.rb +97 -0
  22. data/lib/skill_bench/services/comparison_runner.rb +49 -0
  23. data/lib/skill_bench/services/context_loader_service.rb +42 -0
  24. data/lib/skill_bench/services/error_response_builder.rb +119 -0
  25. data/lib/skill_bench/services/eval_resolver.rb +33 -0
  26. data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
  27. data/lib/skill_bench/services/judge_params_builder.rb +54 -0
  28. data/lib/skill_bench/services/manifest_finder.rb +36 -0
  29. data/lib/skill_bench/services/output_formatter.rb +28 -0
  30. data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
  31. data/lib/skill_bench/services/provider_resolver.rb +73 -0
  32. data/lib/skill_bench/services/runner_service.rb +84 -315
  33. data/lib/skill_bench/services/skill_resolver.rb +37 -9
  34. data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
  35. data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
  36. data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
  37. data/lib/skill_bench/services/variant_parser.rb +32 -0
  38. data/lib/skill_bench/services/variant_resolver.rb +63 -0
  39. data/lib/skill_bench/version.rb +1 -1
  40. metadata +23 -2
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SkillBench
6
+ module Registry
7
+ # Resolves skill paths from the ecosystem registry manifest.
8
+ # Reads a registry.json (from agent-mcp-runtime) and resolves
9
+ # pack → tile.json → skill path.
10
class PackResolver
  # @param registry_path [String] Path to registry.json manifest
  # @raise [Errno::ENOENT] when the manifest file does not exist
  # @raise [JSON::ParserError] when the manifest is not valid JSON
  def initialize(registry_path)
    @manifest = JSON.parse(File.read(registry_path))
  end

  # Resolves a skill path within a named pack.
  #
  # Lookup order: the pack's own tile, then a `deprecated_skills` redirect
  # declared in that tile, then each pack listed under `depends_on`.
  #
  # @param pack_name [String] Pack name (e.g. "rails", "core", "hanami")
  # @param skill_name [String] Skill name (e.g. "code-review")
  # @param visited [Array<String>] Pack names already visited on the current
  #   resolution path; used to break redirect/dependency cycles
  # @return [String, nil] Absolute path to the skill directory, or nil
  # @raise [JSON::ParserError] when the pack's tile file is not valid JSON
  def resolve_skill(pack_name, skill_name, visited = [])
    return nil if visited.include?(pack_name)

    # Non-destructive append: the caller's array is never mutated.
    visited += [pack_name]

    pack = @manifest.dig('packs', pack_name)
    return nil unless pack

    source_path = resolve_source(pack['source'])
    return nil unless source_path

    tile = load_tile(source_path, pack['tile'])
    return nil unless tile

    resolve_direct(tile, source_path, skill_name) ||
      resolve_redirect(tile, skill_name, visited) ||
      resolve_dependencies(pack, skill_name, visited)
  end

  # Lists available pack names from the manifest.
  #
  # @return [Array<String>] Available pack names
  def pack_names
    @manifest.fetch('packs', {}).keys
  end

  private

  # Reads and parses a pack's tile file.
  #
  # @param source_path [String] Directory of the pack's source checkout
  # @param tile_name [Object] The pack's `tile` manifest value
  # @return [Hash, nil] Parsed tile, or nil when the manifest has no usable
  #   tile name or the tile is not a regular file
  def load_tile(source_path, tile_name)
    # A missing or empty tile name would otherwise raise in File.join or
    # point at the source directory itself.
    return nil unless tile_name.is_a?(String) && !tile_name.empty?

    tile_path = File.join(source_path, tile_name)
    return nil unless File.file?(tile_path)

    JSON.parse(File.read(tile_path))
  end

  # Resolves a skill listed directly in the tile's `skills` table.
  #
  # @return [String, nil] Path inside the source directory, or nil
  def resolve_direct(tile, source_path, skill_name)
    skill_entry = tile.dig('skills', skill_name)
    relative = skill_entry['path'] if skill_entry.is_a?(Hash)
    return nil unless relative.is_a?(String)

    resolved = File.expand_path(File.join(source_path, relative))
    # An entry may point at the SKILL.md file; callers want its directory.
    resolved = File.dirname(resolved) if resolved.end_with?('SKILL.md')
    base = File.expand_path(source_path)

    # Ensure resolved path is inside source directory
    inside = resolved == base || resolved.start_with?(base + File::SEPARATOR)
    inside ? resolved : nil
  end

  # Follows a `deprecated_skills` entry's `moved_to` source to another pack.
  #
  # @return [String, nil] Resolved path from the target pack, or nil
  def resolve_redirect(tile, skill_name, visited)
    dep_entry = tile.dig('deprecated_skills', skill_name)
    return nil unless dep_entry

    moved_to = dep_entry['moved_to']
    return nil unless moved_to

    target_pack = find_pack_by_source(moved_to)
    return nil unless target_pack

    resolve_skill(target_pack, skill_name, visited)
  end

  # Tries each pack named in `depends_on`, in order, returning the first hit.
  #
  # @return [String, nil] Resolved path from a dependency, or nil
  def resolve_dependencies(pack, skill_name, visited)
    depends_on = pack['depends_on']
    return nil unless depends_on.is_a?(Array)

    depends_on.each do |dep_pack|
      resolved = resolve_skill(dep_pack, skill_name, visited)
      return resolved if resolved
    end
    nil
  end

  # Finds the first pack whose `source` equals the given source, or shares
  # its last path segment.
  #
  # @param source [Object] Source identifier taken from `moved_to`
  # @return [String, nil] Matching pack name, or nil
  def find_pack_by_source(source)
    wanted = source.to_s
    return nil if wanted.empty?

    wanted_repo = wanted.split('/').last
    match = @manifest.fetch('packs', {}).find do |_pack_name, pack_config|
      candidate = pack_config['source'].to_s
      # Packs without a source must never match (nil == nil on basenames).
      next false if candidate.empty?

      candidate == wanted || (wanted_repo && candidate.split('/').last == wanted_repo)
    end
    match&.first
  end

  # Locates a local checkout for a source by its last path segment.
  #
  # Candidates, in order: a sibling of the working directory, a sibling of
  # its parent, then ~/.agent-mcp-runtime/cache.
  #
  # @param source [Object] The pack's `source` manifest value
  # @return [String, nil] First existing candidate directory, or nil
  def resolve_source(source)
    return nil unless source.is_a?(String) && !source.empty?

    repo_name = source.split('/').last
    candidates = [
      File.expand_path("../#{repo_name}", Dir.pwd),
      File.expand_path("../../#{repo_name}", Dir.pwd),
      File.join(Dir.home, '.agent-mcp-runtime', 'cache', repo_name)
    ]
    candidates.find { |c| Dir.exist?(c) }
  end
end
118
+ end
119
+ end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../execution/sandbox'
4
+ require_relative '../agent/react_agent'
5
+
6
+ module SkillBench
7
+ module Services
8
+ # Spawns and executes LLM agents for evaluation.
9
class AgentSpawnerService
  # Convenience entry point: builds a spawner and runs it.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def self.call(evaluation, system_prompt, provider, config)
    new(evaluation, system_prompt, provider, config).call
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(evaluation, system_prompt, provider, config)
    @evaluation = evaluation
    @system_prompt = system_prompt
    @provider = provider
    @config = config
  end

  # Runs the agent, short-circuiting with a canned response for the mock
  # provider (that response carries only result, status and iterations).
  #
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def call
    if @provider.name == 'mock'
      return { result: 'mock result', status: :success, iterations: [] }
    end

    params = build_client_params
    run_agent(params, iteration_limit)
  end

  private

  # Reads the iteration cap from the config under a symbol or string key.
  #
  # @return [Integer] Configured cap, or 25 when none is set
  def iteration_limit
    return 25 if @config.nil?

    @config[:max_iterations] || @config['max_iterations'] || 25
  end

  # Executes the agent inside a sandbox; any StandardError is converted
  # into an error-status response instead of propagating.
  #
  # @param client_params [Hash] Client parameters for the agent
  # @param max_iterations [Integer] Maximum iterations for the agent
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def run_agent(client_params, max_iterations)
    Execution::Sandbox.run(@evaluation.path) do |sandbox|
      outcome = Agent::ReactAgent.call(
        system_prompt: @system_prompt,
        initial_prompt: @evaluation.task,
        working_dir: sandbox.path,
        container_id: sandbox.container_id,
        client_params: client_params,
        max_iterations: max_iterations
      )

      state = outcome[:success] ? :success : :error
      answer = outcome.dig(:response, :content) || ''
      changes = Execution::Sandbox.capture_diff(sandbox.path)
      steps = outcome.dig(:response, :iterations) || []

      # The reported result is the final answer followed by the sandbox
      # diff, skipping whichever part is empty.
      sections = [answer, changes].reject(&:empty?)

      {
        result: sections.join("\n\n"),
        status: state,
        runtime: @provider.runtime,
        usage: {},
        raw_response: outcome,
        iterations: steps
      }
    end
  rescue StandardError => e
    failure = { error: e.message, backtrace: e.backtrace }
    {
      result: "Error: #{e.message}",
      status: :error,
      runtime: @provider.runtime,
      usage: {},
      raw_response: failure,
      iterations: []
    }
  end

  # Builds client parameters for the ReactAgent from the explicit config,
  # falling back to the provider's merged config. Works on a copy so the
  # source hash is not modified. Any failure yields an empty hash.
  #
  # @return [Hash] Client parameters
  def build_client_params
    base = @config || @provider.merged_config
    return {} unless base

    base.dup.tap do |params|
      params[:model] ||= @provider.llm
      params[:provider] = @provider.runtime.to_sym
    end
  rescue StandardError
    {}
  end
end
113
+ end
114
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
+ module SkillBench
6
+ module Services
7
+ # Parses CLI options for the compare command.
8
class CompareOptionParser
  # Output formats accepted by --format.
  FORMATS = %i[human json].freeze

  # Parses the given argv and returns the options hash.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options from argv.
  #
  # Recognized options are removed from the argv array in place, leaving
  # positional arguments behind. :format defaults to :human.
  #
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails, including an
  #   unsupported --format value
  def call
    options = { format: :human }
    build_parser(options).parse!(@argv)
    options
  end

  private

  # Builds the OptionParser instance.
  #
  # @param options [Hash] Options hash to populate
  # @return [OptionParser] Configured parser
  def build_parser(options)
    OptionParser.new do |opts|
      opts.banner = 'Usage: skill-bench compare <skill-name> [options]'
      opts.on('--variant-a SPEC', 'First variant (e.g., "pack:rails" or "/path/to/skill")') do |spec|
        options[:variant_a] = spec
      end
      opts.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') do |spec|
        options[:variant_b] = spec
      end
      opts.on('--eval PATH', 'Path to the eval directory') { |path| options[:eval] = path }
      # The candidate list makes OptionParser reject unknown formats and
      # yield the matching symbol, so user input is never passed to to_sym.
      opts.on('--format FORMAT', FORMATS, 'Output format (human, json)') do |fmt|
        options[:format] = fmt
      end
      opts.on('-h', '--help', 'Prints this help') do
        puts opts
        raise SkillBench::HelpRequested
      end
    end
  end
end
54
+ end
55
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
+ # Prints a formatted comparison report for two evaluation results.
6
class ComparisonReporter
  # Row template shared by the per-dimension rows and the TOTAL row.
  ROW_FORMAT = '| %<name>-9s | %<a>8.1f | %<b>8.1f | %<delta>+5.1f |'.freeze

  # Prints the comparison report to stdout.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  # @return [nil]
  def self.call(result_a, result_b, label_a, label_b)
    new(result_a, result_b, label_a, label_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  def initialize(result_a, result_b, label_a, label_b)
    @result_a = result_a
    @result_b = result_b
    @label_a = label_a
    @label_b = label_b
  end

  # Prints the comparison report to stdout.
  #
  # The table header is always printed. Score, total and verdict rows are
  # printed only when both results carry a report under [:response][:report].
  #
  # @return [nil]
  def call
    puts "\n=== Comparison Report ==="
    puts "| Dimension | #{@label_a} | #{@label_b} | Delta |"
    puts '|-----------|----------|----------|-------|'

    report_a = @result_a.dig(:response, :report)
    report_b = @result_b.dig(:response, :report)
    return unless report_a && report_b

    print_dimension_scores(report_a, report_b)
    print_total_scores(report_a, report_b)
    print_verdicts(report_a, report_b)
  end

  private

  # Prints one row per dimension: every dimension of report A in its own
  # order, followed by dimensions that exist only in report B. A dimension
  # (or score) missing on one side is shown as 0.0 for that side.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_dimension_scores(report_a, report_b)
    scores_b = report_b.dimensions.to_h { |dim| [dim.name, dim.score] }
    names_a = report_a.dimensions.map(&:name)

    report_a.dimensions.each do |dim|
      print_row(dim.name, dim.score, scores_b[dim.name])
    end

    # Without this pass, dimensions scored only for variant B would vanish
    # from the report.
    report_b.dimensions.each do |dim|
      print_row(dim.name, nil, dim.score) unless names_a.include?(dim.name)
    end
  end

  # Prints the TOTAL row; skipped when either report has no total.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_total_scores(report_a, report_b)
    total_a = report_a.total
    total_b = report_b.total
    return unless total_a && total_b

    print_row('TOTAL', total_a, total_b)
  end

  # Prints verdict comparison.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_verdicts(report_a, report_b)
    verdict_a = format_verdict(report_a.verdict)
    verdict_b = format_verdict(report_b.verdict)
    puts "| A: #{verdict_a} | B: #{verdict_b} |"
  end

  # Prints a single score row with its A-minus-B delta.
  #
  # @param name [String] Row label
  # @param score_a [Numeric, nil] Score for variant A (nil counts as 0)
  # @param score_b [Numeric, nil] Score for variant B (nil counts as 0)
  def print_row(name, score_a, score_b)
    a = score_a.to_f
    b = score_b.to_f
    puts format(ROW_FORMAT, name: name, a: a, b: b, delta: a - b)
  end

  # Renders a verdict: booleans become PASS/FAIL, anything else is
  # stringified as-is.
  #
  # @param verdict [Object] Verdict taken from a report
  # @return [String]
  def format_verdict(verdict)
    case verdict
    when true then 'PASS'
    when false then 'FAIL'
    else verdict.to_s
    end
  end
end
96
+ end
97
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'variant_resolver'
4
+ require_relative 'runner_service'
5
+
6
+ module SkillBench
7
+ module Services
8
+ # Runs both variants of a skill comparison.
9
class ComparisonRunner
  # Builds a runner and executes the comparison in one step.
  #
  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  # @return [Hash] Hash with :result_a and :result_b keys
  def self.call(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    new(variant_a, variant_b, skill_name, eval_path, manifest_path: manifest_path).call
  end

  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  def initialize(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    @variant_a = variant_a
    @variant_b = variant_b
    @skill_name = skill_name
    @eval_path = eval_path
    @manifest_path = manifest_path
  end

  # Resolves both variants first (A, then B), then runs the eval once per
  # variant in the same order.
  #
  # @return [Hash] Hash with :result_a and :result_b keys
  def call
    resolved_a, resolved_b = [@variant_a, @variant_b].map do |variant|
      VariantResolver.call(variant, @skill_name, manifest_path: @manifest_path)
    end

    {
      result_a: RunnerService.call(eval_name: @eval_path, skill_names: resolved_a),
      result_b: RunnerService.call(eval_name: @eval_path, skill_names: resolved_b)
    }
  end
end
48
+ end
49
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
+ # Loads and combines skill context from SKILL.md files.
6
class ContextLoaderService
  # Divider placed between the contexts of consecutive skills.
  SEPARATOR = "\n\n#{'=' * 40}\n\n".freeze

  # Loads and combines skill context from SKILL.md files.
  #
  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  # @return [String] The combined skill context
  def self.call(skills)
    new(skills).call
  end

  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  def initialize(skills)
    @skills = skills
  end

  # Loads and combines skill context from SKILL.md files.
  #
  # Skills whose SKILL.md is missing or empty contribute nothing; the
  # remaining contexts are joined with a 40-character "=" divider.
  #
  # @return [String] The combined skill context, or '' when there are no skills
  def call
    return '' if @skills.nil? || @skills.empty?

    contexts = @skills.map { |skill| load_skill_context(skill) }
    contexts.reject(&:empty?).join(SEPARATOR)
  end

  private

  # Loads the skill context from a single skill's SKILL.md file.
  #
  # @param skill [SkillBench::Models::Skill] The skill to load context from
  # @return [String] The skill context or empty string if not found
  def load_skill_context(skill)
    skill_md = File.join(skill.path, 'SKILL.md')
    # File.file? (rather than File.exist?) so that a directory named
    # SKILL.md counts as "not found" instead of raising Errno::EISDIR.
    File.file?(skill_md) ? File.read(skill_md) : ''
  end
end
41
+ end
42
+ end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
+ # Builds standardized error responses with metadata.
6
class ErrorResponseBuilder
  # Message used when an agent result carries no recognizable error text.
  UNKNOWN_ERROR = 'unknown error'.freeze

  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.config_error(error, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).config_error(error)
  end

  # Builds an agent error response.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.agent_error(result, phase, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).agent_error(result, phase)
  end

  # Builds an empty context error response.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.empty_context_error(evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).empty_context_error
  end

  # Enriches an existing error result with metadata.
  #
  # @param result [Hash] The existing error result
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Enriched error result with metadata
  def self.enrich_error(result, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).enrich_error(result)
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  def initialize(evaluation, provider, skill_names)
    @evaluation = evaluation
    @provider = provider
    @skill_names = skill_names
  end

  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @return [Hash] Error result with metadata
  def config_error(error)
    base_error_result("Configuration error: #{error.message}")
  end

  # Builds an agent error response.
  #
  # The error text is taken from result[:raw_response], looking first at
  # [:response][:error] and then at [:error]. Each may be either a Hash
  # with a :message key or a plain String.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @return [Hash] Error result with metadata
  def agent_error(result, phase)
    error_msg = extract_error_message(result[:raw_response])
    base_error_result("#{phase.capitalize} agent failed: #{error_msg}")
  end

  # Builds an empty context error response.
  #
  # @return [Hash] Error result with metadata
  def empty_context_error
    base_error_result('Skill context is empty. Ensure SKILL.md exists and has content.')
  end

  # Enriches an existing error result with metadata.
  #
  # The input hash is not modified; metadata keys override any existing
  # keys of the same name.
  #
  # @param result [Hash] The existing error result
  # @return [Hash] Enriched error result with metadata
  def enrich_error(result)
    result.merge(metadata)
  end

  private

  # Builds a base error result with metadata.
  #
  # @param message [String] The error message
  # @return [Hash] Error result with metadata
  def base_error_result(message)
    {
      success: false,
      response: {
        error: {
          message: message
        }
      }
    }.merge(metadata)
  end

  # Identifies the eval, skills and provider an error belongs to.
  #
  # @return [Hash] Metadata with :eval_name, :skill_name and :provider_name
  def metadata
    {
      eval_name: @evaluation.name,
      # Array() tolerates a nil or single-string skill list.
      skill_name: Array(@skill_names).join(', '),
      provider_name: @provider.name
    }
  end

  # Pulls a human-readable message out of an agent's raw response.
  #
  # Hash#dig cannot be used across the :error key because that value may be
  # a plain String, on which dig raises TypeError.
  #
  # @param raw [Object] The agent's raw response
  # @return [String] The error message, or UNKNOWN_ERROR when none is found
  def extract_error_message(raw)
    return UNKNOWN_ERROR unless raw.is_a?(Hash)

    response = raw[:response]
    nested = response[:error] if response.is_a?(Hash)

    [nested, raw[:error]].each do |error|
      message = message_from(error)
      return message if message
    end
    UNKNOWN_ERROR
  end

  # @param error [Object] An error payload (Hash with :message, or String)
  # @return [Object, nil] The message, or nil for any other payload shape
  def message_from(error)
    case error
    when Hash then error[:message]
    when String then error
    end
  end
end
118
+ end
119
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../models/eval'
4
+
5
+ module SkillBench
6
+ module Services
7
+ # Resolves an eval from a name or path.
8
class EvalResolver
  # Builds a resolver for the given name and resolves it immediately.
  #
  # @param eval_name [String] Name or path of the eval to resolve
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def self.call(eval_name)
    new(eval_name).call
  end

  # @param eval_name [String] Name or path of the eval
  def initialize(eval_name)
    @eval_name = eval_name
  end

  # Loads the eval found at the resolved location.
  #
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def call
    SkillBench::Models::Eval.load(eval_path)
  end

  private

  # A value containing "/" is used as a path verbatim; anything else is
  # treated as a bare name under the "evals" directory.
  #
  # @return [String] Path handed to the eval loader
  def eval_path
    return @eval_name if @eval_name.include?('/')

    "evals/#{@eval_name}"
  end
end
32
+ end
33
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
+ # Calculates the exit code based on comparison results.
6
class ExitCodeCalculator
  # Verdict values that count as passing: the 'PASS' string, and boolean
  # true (which the comparison report also renders as PASS).
  PASSING_VERDICTS = [true, 'PASS'].freeze

  # Calculates the exit code from comparison results.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @return [Integer] 0 if both pass, 1 otherwise
  def self.call(result_a, result_b)
    new(result_a, result_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  def initialize(result_a, result_b)
    @result_a = result_a
    @result_b = result_b
  end

  # Calculates the exit code from comparison results.
  #
  # A result without a report (for example an error result) counts as a
  # failure.
  #
  # @return [Integer] 0 if both pass, 1 otherwise
  def call
    passed?(@result_a) && passed?(@result_b) ? 0 : 1
  end

  private

  # @param result [Hash, nil] An evaluation result
  # @return [Boolean] Whether the result's report carries a passing verdict
  def passed?(result)
    report = result&.dig(:response, :report)
    # Reports may be plain hashes or objects exposing #verdict.
    verdict = report.is_a?(Hash) ? report[:verdict] : report&.verdict
    PASSING_VERDICTS.include?(verdict)
  end
end
38
+ end
39
+ end