ruby-skill-bench 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +231 -0
  3. data/lib/skill_bench/agent/react_agent.rb +2 -1
  4. data/lib/skill_bench/cli/compare_command.rb +91 -0
  5. data/lib/skill_bench/cli/help_printer.rb +9 -1
  6. data/lib/skill_bench/cli/run_command.rb +6 -4
  7. data/lib/skill_bench/cli.rb +7 -4
  8. data/lib/skill_bench/clients/all.rb +2 -0
  9. data/lib/skill_bench/clients/base_client.rb +2 -5
  10. data/lib/skill_bench/clients/providers/mock.rb +56 -0
  11. data/lib/skill_bench/clients/request_builder.rb +2 -4
  12. data/lib/skill_bench/clients/response_builder.rb +91 -0
  13. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  14. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  15. data/lib/skill_bench/commands/run.rb +6 -2
  16. data/lib/skill_bench/config/applier.rb +1 -0
  17. data/lib/skill_bench/config/defaults.rb +1 -0
  18. data/lib/skill_bench/config/facade_readers.rb +7 -0
  19. data/lib/skill_bench/config/json_loader.rb +3 -3
  20. data/lib/skill_bench/config/store.rb +5 -0
  21. data/lib/skill_bench/config.rb +10 -1
  22. data/lib/skill_bench/constants.rb +58 -0
  23. data/lib/skill_bench/delta_report.rb +20 -0
  24. data/lib/skill_bench/execution/context_hydrator.rb +16 -6
  25. data/lib/skill_bench/execution/sandbox.rb +18 -3
  26. data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
  27. data/lib/skill_bench/registry/pack_resolver.rb +119 -0
  28. data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
  29. data/lib/skill_bench/services/compare_option_parser.rb +55 -0
  30. data/lib/skill_bench/services/comparison_reporter.rb +97 -0
  31. data/lib/skill_bench/services/comparison_runner.rb +49 -0
  32. data/lib/skill_bench/services/context_loader_service.rb +42 -0
  33. data/lib/skill_bench/services/error_response_builder.rb +119 -0
  34. data/lib/skill_bench/services/eval_resolver.rb +33 -0
  35. data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
  36. data/lib/skill_bench/services/judge_params_builder.rb +54 -0
  37. data/lib/skill_bench/services/manifest_finder.rb +36 -0
  38. data/lib/skill_bench/services/output_formatter.rb +28 -0
  39. data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
  40. data/lib/skill_bench/services/provider_resolver.rb +73 -0
  41. data/lib/skill_bench/services/runner_service.rb +84 -315
  42. data/lib/skill_bench/services/skill_resolver.rb +37 -9
  43. data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
  44. data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
  45. data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
  46. data/lib/skill_bench/services/variant_parser.rb +32 -0
  47. data/lib/skill_bench/services/variant_resolver.rb +63 -0
  48. data/lib/skill_bench/tools/run_command.rb +2 -17
  49. data/lib/skill_bench/version.rb +1 -1
  50. data/lib/skill_bench.rb +1 -0
  51. metadata +25 -2
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../execution/sandbox'
4
+ require_relative '../agent/react_agent'
5
+
6
+ module SkillBench
7
+ module Services
8
# Runs one evaluation task through an LLM agent inside a sandbox and packages
# the outcome (final answer plus captured workspace diff) as a plain Hash.
class AgentSpawnerService
  # Builds a service instance and immediately runs it.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def self.call(evaluation, system_prompt, provider, config)
    new(evaluation, system_prompt, provider, config).call
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param system_prompt [String] The system prompt for the agent
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(evaluation, system_prompt, provider, config)
    @evaluation = evaluation
    @system_prompt = system_prompt
    @provider = provider
    @config = config
  end

  # Runs the agent, short-circuiting with a canned response when the
  # provider is named 'mock' (that response only carries :result, :status
  # and :iterations).
  #
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def call
    if @provider.name == 'mock'
      return { result: 'mock result', status: :success, iterations: [] }
    end

    run_agent(build_client_params, iteration_limit)
  end

  private

  # Reads the iteration cap from the config under either a Symbol or a
  # String key, defaulting to 25 when neither is set.
  #
  # @return [Integer] Maximum iterations for the agent
  def iteration_limit
    return 25 unless @config

    @config[:max_iterations] || @config['max_iterations'] || 25
  end

  # Executes the agent inside a sandbox. Any StandardError raised while the
  # sandbox or the agent is running is converted into an error result
  # instead of propagating.
  #
  # @param client_params [Hash] Client parameters for the agent
  # @param max_iterations [Integer] Maximum iterations for the agent
  # @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations
  def run_agent(client_params, max_iterations)
    Execution::Sandbox.run(@evaluation.path) do |sandbox|
      outcome = Agent::ReactAgent.call(
        system_prompt: @system_prompt,
        initial_prompt: @evaluation.task,
        working_dir: sandbox.path,
        container_id: sandbox.container_id,
        client_params: client_params,
        max_iterations: max_iterations
      )

      completed_payload(outcome, sandbox)
    end
  rescue StandardError => e
    failed_payload(e)
  end

  # Turns a finished agent run into the response hash. The result text is
  # the agent's final answer followed by the sandbox diff, skipping
  # whichever of the two is empty.
  #
  # @param outcome [Hash] Raw value returned by the agent
  # @param sandbox [Object] The sandbox the agent ran in
  # @return [Hash] Agent response
  def completed_payload(outcome, sandbox)
    run_status = outcome[:success] ? :success : :error
    answer = outcome.dig(:response, :content) || ''
    workspace_diff = Execution::Sandbox.capture_diff(sandbox.path)
    steps = outcome.dig(:response, :iterations) || []
    sections = [answer, workspace_diff].reject(&:empty?)

    {
      result: sections.join("\n\n"),
      status: run_status,
      runtime: @provider.runtime,
      usage: {},
      raw_response: outcome,
      iterations: steps
    }
  end

  # Builds the response hash for a run that raised.
  #
  # @param error [StandardError] The error raised during the run
  # @return [Hash] Agent response with status :error
  def failed_payload(error)
    {
      result: "Error: #{error.message}",
      status: :error,
      runtime: @provider.runtime,
      usage: {},
      raw_response: { error: error.message, backtrace: error.backtrace },
      iterations: []
    }
  end

  # Builds client parameters for the ReactAgent from the explicit config,
  # falling back to the provider's merged config. Returns an empty Hash when
  # no config is available or when anything goes wrong while building.
  #
  # @return [Hash] Client parameters
  def build_client_params
    source = @config || safe_merged_config
    return {} unless source

    source.dup.tap do |params|
      params[:model] ||= @provider.llm
      params[:provider] = @provider.runtime.to_sym
    end
  rescue StandardError
    {}
  end

  # Calls merged_config on the provider, returning nil on any error.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    @provider.merged_config
  rescue StandardError
    nil
  end
end
113
+ end
114
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
+ module SkillBench
6
+ module Services
7
# Turns the raw arguments of the compare command into an options Hash.
class CompareOptionParser
  # Parses the given argv and returns the options hash.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options from argv. Recognised options are removed from the argv
  # array in place (OptionParser#parse!); :format starts out as :human.
  #
  # @return [Hash] Parsed options with keys: :variant_a, :variant_b, :eval, :format
  # @raise [OptionParser::ParseError] when option parsing fails
  def call
    { format: :human }.tap { |parsed| build_parser(parsed).parse!(@argv) }
  end

  private

  # Builds the OptionParser instance whose handlers write into +options+.
  #
  # @param options [Hash] Options hash to populate
  # @return [OptionParser] Configured parser
  def build_parser(options)
    parser = OptionParser.new
    parser.banner = 'Usage: skill-bench compare <skill-name> [options]'

    parser.on('--variant-a SPEC', 'First variant (e.g., "pack:rails" or "/path/to/skill")') do |spec|
      options[:variant_a] = spec
    end
    parser.on('--variant-b SPEC', 'Second variant (e.g., "pack:hanami" or "/path/to/skill")') do |spec|
      options[:variant_b] = spec
    end
    parser.on('--eval PATH', 'Path to the eval directory') do |eval_dir|
      options[:eval] = eval_dir
    end
    parser.on('--format FORMAT', 'Output format (human, json)') do |format_name|
      options[:format] = format_name.to_sym
    end
    parser.on('-h', '--help', 'Prints this help') do
      # Print the usage text, then signal the caller through an exception.
      puts parser
      raise SkillBench::HelpRequested
    end

    parser
  end
end
54
+ end
55
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Writes a side-by-side table comparing two evaluation results to stdout.
class ComparisonReporter
  # Layout shared by every score row (dimension rows and the TOTAL row).
  ROW_TEMPLATE = '| %<name>-9s | %<a>8.1f | %<b>8.1f | %<delta>+5.1f |'
  # Labels used for boolean verdicts; any other verdict is printed via #to_s.
  VERDICT_LABELS = { true => 'PASS', false => 'FAIL' }.freeze
  private_constant :ROW_TEMPLATE, :VERDICT_LABELS

  # Prints the comparison report to stdout.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  # @return [nil]
  def self.call(result_a, result_b, label_a, label_b)
    new(result_a, result_b, label_a, label_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @param label_a [String] Label for first variant
  # @param label_b [String] Label for second variant
  def initialize(result_a, result_b, label_a, label_b)
    @result_a = result_a
    @result_b = result_b
    @label_a = label_a
    @label_b = label_b
  end

  # Prints the table header, then the dimension rows, the TOTAL row and the
  # verdict line. Only the header is printed when either result has no
  # report under [:response][:report].
  #
  # @return [nil]
  def call
    puts "\n=== Comparison Report ===",
         "| Dimension | #{@label_a} | #{@label_b} | Delta |",
         '|-----------|----------|----------|-------|'

    reports = [@result_a, @result_b].map { |result| result.dig(:response, :report) }
    return unless reports.all?

    print_dimension_scores(*reports)
    print_total_scores(*reports)
    print_verdicts(*reports)
  end

  private

  # Prints one row per dimension of the first report. A dimension that the
  # second report lacks (or scores as nil) counts as 0 on the B side.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_dimension_scores(report_a, report_b)
    b_by_name = report_b.dimensions.each_with_object({}) do |dimension, index|
      index[dimension.name] = dimension
    end

    report_a.dimensions.each do |dimension|
      left = dimension.score
      right = b_by_name[dimension.name]&.score || 0
      print_row(dimension.name, left, right, left - right)
    end
  end

  # Prints the TOTAL row; prints nothing when either total is missing.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_total_scores(report_a, report_b)
    left = report_a.total
    right = report_b.total
    return unless left && right

    print_row('TOTAL', left.to_f, right.to_f, left - right)
  end

  # Prints the verdict of both reports on a single line.
  #
  # @param report_a [Object] First evaluation report
  # @param report_b [Object] Second evaluation report
  def print_verdicts(report_a, report_b)
    labels = [report_a, report_b].map { |report| format_verdict(report.verdict) }
    puts "| A: #{labels.first} | B: #{labels.last} |"
  end

  # Prints a single score row; the delta is shown with an explicit sign.
  #
  # @param name [String] Row label
  # @param left [Numeric] Score of variant A
  # @param right [Numeric] Score of variant B
  # @param delta [Numeric] Difference A - B
  def print_row(name, left, right, delta)
    puts format(ROW_TEMPLATE, name: name, a: left, b: right, delta: delta.to_f)
  end

  # Maps true/false to 'PASS'/'FAIL' and stringifies anything else.
  #
  # @param verdict [Object] Verdict taken from a report
  # @return [String] Printable verdict
  def format_verdict(verdict)
    VERDICT_LABELS.fetch(verdict) { verdict.to_s }
  end
end
96
+ end
97
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'variant_resolver'
4
+ require_relative 'runner_service'
5
+
6
+ module SkillBench
7
+ module Services
8
# Resolves two skill variants and evaluates each of them against one eval.
class ComparisonRunner
  # Runs both variants and returns their results.
  #
  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  # @return [Hash] Hash with :result_a and :result_b keys
  def self.call(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    new(variant_a, variant_b, skill_name, eval_path, manifest_path: manifest_path).call
  end

  # @param variant_a [Hash] First variant specification
  # @param variant_b [Hash] Second variant specification
  # @param skill_name [String] Name of the skill to compare
  # @param eval_path [String] Path to the eval directory
  # @param manifest_path [String, nil] Optional path to registry manifest
  def initialize(variant_a, variant_b, skill_name, eval_path, manifest_path: nil)
    @variant_a = variant_a
    @variant_b = variant_b
    @skill_name = skill_name
    @eval_path = eval_path
    @manifest_path = manifest_path
  end

  # Resolves both variants first, then runs A followed by B, so a variant
  # that fails to resolve aborts the comparison before anything is run.
  #
  # @return [Hash] Hash with :result_a and :result_b keys
  def call
    paths_a, paths_b = [@variant_a, @variant_b].map { |variant| resolve(variant) }

    { result_a: evaluate(paths_a), result_b: evaluate(paths_b) }
  end

  private

  # @param variant [Hash] Variant specification
  # @return [Object] Whatever VariantResolver returns for the variant
  def resolve(variant)
    VariantResolver.call(variant, @skill_name, manifest_path: @manifest_path)
  end

  # @param skill_paths [Object] Resolved skill paths for one variant
  # @return [Hash] Result of running the eval with those skills
  def evaluate(skill_paths)
    RunnerService.call(eval_name: @eval_path, skill_names: skill_paths)
  end
end
48
+ end
49
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Reads the SKILL.md file of each skill and stitches the texts together.
class ContextLoaderService
  # Loads and combines skill context from SKILL.md files.
  #
  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  # @return [String] The combined skill context
  def self.call(skills)
    new(skills).call
  end

  # @param skills [Array<SkillBench::Models::Skill>] The skills to load context from
  def initialize(skills)
    @skills = skills
  end

  # Joins the non-empty SKILL.md contents, in skill order, with a divider
  # made of 40 '=' characters surrounded by blank lines. Returns '' when
  # there are no skills.
  #
  # @return [String] The combined skill context
  def call
    return '' if @skills.nil? || @skills.empty?

    divider = "\n\n#{'=' * 40}\n\n"
    sections = @skills.each_with_object([]) do |skill, collected|
      text = load_skill_context(skill)
      collected << text unless text.empty?
    end
    sections.join(divider)
  end

  private

  # Reads SKILL.md from the skill's directory.
  #
  # @param skill [SkillBench::Models::Skill] The skill to load context from
  # @return [String] The skill context or empty string if not found
  def load_skill_context(skill)
    markdown_path = File.join(skill.path, 'SKILL.md')
    return '' unless File.exist?(markdown_path)

    File.read(markdown_path)
  end
end
41
+ end
42
+ end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Builds standardized error responses with metadata.
#
# Every response produced here carries the eval name, the comma-joined skill
# names and the provider name alongside the error itself.
class ErrorResponseBuilder
  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.config_error(error, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).config_error(error)
  end

  # Builds an agent error response.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.agent_error(result, phase, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).agent_error(result, phase)
  end

  # Builds an empty context error response.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Error result with metadata
  def self.empty_context_error(evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).empty_context_error
  end

  # Enriches an existing error result with metadata.
  #
  # @param result [Hash] The existing error result
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  # @return [Hash] Enriched error result with metadata
  def self.enrich_error(result, evaluation, provider, skill_names)
    new(evaluation, provider, skill_names).enrich_error(result)
  end

  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param provider [Object] The resolved provider
  # @param skill_names [Array<String>] Names of the skills
  def initialize(evaluation, provider, skill_names)
    @evaluation = evaluation
    @provider = provider
    @skill_names = skill_names
  end

  # Builds a configuration error response.
  #
  # @param error [Exception] The configuration error
  # @return [Hash] Error result with metadata
  def config_error(error)
    base_error_result("Configuration error: #{error.message}")
  end

  # Builds an agent error response. The message is looked up in the agent's
  # raw response; 'unknown error' is used when none can be found.
  #
  # @param result [Hash] The agent result containing the error
  # @param phase [String] The phase that failed (e.g., 'baseline', 'context')
  # @return [Hash] Error result with metadata
  def agent_error(result, phase)
    error_msg = extract_error_message(result[:raw_response]) || 'unknown error'
    base_error_result("#{phase.capitalize} agent failed: #{error_msg}")
  end

  # Builds an empty context error response.
  #
  # @return [Hash] Error result with metadata
  def empty_context_error
    base_error_result('Skill context is empty. Ensure SKILL.md exists and has content.')
  end

  # Enriches an existing error result with metadata. The metadata keys
  # overwrite any same-named keys already present in the result.
  #
  # @param result [Hash] The existing error result
  # @return [Hash] Enriched error result with metadata
  def enrich_error(result)
    result.merge(metadata)
  end

  private

  # Finds the error message inside an agent's raw response.
  #
  # Two locations are checked, in this order: raw[:response][:error] and
  # raw[:error]. Each may be a Hash holding :message or the message itself
  # (for example a plain String). Digging straight through with Hash#dig
  # would raise TypeError on the String form, hiding the real failure.
  #
  # @param raw [Hash, nil] The agent's raw response
  # @return [Object, nil] The message, or nil when none is present
  def extract_error_message(raw)
    return nil unless raw.is_a?(Hash)

    response = raw[:response]
    nested = response[:error] if response.is_a?(Hash)

    [nested, raw[:error]].each do |candidate|
      message = candidate.is_a?(Hash) ? candidate[:message] : candidate
      return message unless message.nil?
    end
    nil
  end

  # Metadata attached to every response built by this class.
  #
  # @return [Hash] Hash with :eval_name, :skill_name and :provider_name
  def metadata
    {
      eval_name: @evaluation.name,
      skill_name: @skill_names.join(', '),
      provider_name: @provider.name
    }
  end

  # Builds a base error result with metadata.
  #
  # @param message [String] The error message
  # @return [Hash] Error result with metadata
  def base_error_result(message)
    { success: false, response: { error: { message: message } } }.merge(metadata)
  end
end
118
+ end
119
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../models/eval'
4
+
5
+ module SkillBench
6
+ module Services
7
# Loads an eval given either a bare name or a path.
class EvalResolver
  # Resolves an eval from a name or path.
  #
  # @param eval_name [String] Name or path of the eval to resolve
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def self.call(eval_name)
    new(eval_name).call
  end

  # @param eval_name [String] Name or path of the eval
  def initialize(eval_name)
    @eval_name = eval_name
  end

  # Loads the eval from the directory the name resolves to.
  #
  # @return [SkillBench::Models::Eval] The resolved eval
  # @raise [Errno::ENOENT] when the eval directory does not exist
  def call
    SkillBench::Models::Eval.load(eval_directory)
  end

  private

  # A value containing '/' is used as the path unchanged; anything else is
  # treated as a name inside the relative 'evals' directory.
  #
  # @return [String] Path handed to the eval loader
  def eval_directory
    return @eval_name if @eval_name.include?('/')

    "evals/#{@eval_name}"
  end
end
32
+ end
33
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Calculates the exit code based on comparison results.
class ExitCodeCalculator
  # Verdict values that count as a pass. Both the String form and the
  # boolean form are accepted, so a boolean verdict is no longer treated as
  # a failure.
  PASSING_VERDICTS = ['PASS', true].freeze
  private_constant :PASSING_VERDICTS

  # Calculates the exit code from comparison results.
  #
  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  # @return [Integer] 0 if both pass, 1 otherwise
  def self.call(result_a, result_b)
    new(result_a, result_b).call
  end

  # @param result_a [Hash] First evaluation result
  # @param result_b [Hash] Second evaluation result
  def initialize(result_a, result_b)
    @result_a = result_a
    @result_b = result_b
  end

  # Calculates the exit code from comparison results. A result that is nil
  # or has no report counts as a failure.
  #
  # @return [Integer] 0 if both pass, 1 otherwise
  def call
    passed?(@result_a) && passed?(@result_b) ? 0 : 1
  end

  private

  # @param result [Hash, nil] An evaluation result
  # @return [Boolean] true when the result's verdict is a passing one
  def passed?(result)
    PASSING_VERDICTS.include?(verdict_of(result))
  end

  # Reads the verdict from the report stored under [:response][:report].
  # The report may be a Hash (verdict under :verdict) or an object that
  # responds to #verdict.
  #
  # @param result [Hash, nil] An evaluation result
  # @return [Object, nil] The verdict, or nil when there is no report
  def verdict_of(result)
    report = result&.dig(:response, :report)
    report.is_a?(Hash) ? report[:verdict] : report&.verdict
  end
end
38
+ end
39
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Derives the parameters handed to the judge from a provider and its config.
class JudgeParamsBuilder
  # Builds judge parameters from provider configuration.
  #
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  # @return [Hash] Judge parameters with api_key, model, and provider
  def self.call(provider, config)
    new(provider, config).call
  end

  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(provider, config)
    @provider = provider
    @config = config
  end

  # Builds judge parameters from provider configuration. A provider named
  # 'mock' yields only { provider: :mock }. A KeyError or NoMethodError
  # raised while building is swallowed and an empty Hash is returned.
  #
  # @return [Hash] Judge parameters with api_key, model, and provider
  def call
    if @provider.name == 'mock'
      { provider: :mock }
    else
      params_from(@config || safe_merged_config)
    end
  rescue KeyError, NoMethodError
    # Expected errors from missing config keys or nil config
    {}
  end

  private

  # Assembles the parameter Hash from the given settings; the provider's
  # llm is used when the settings carry no :model.
  #
  # @param settings [Hash, nil] Explicit config or the provider's merged config
  # @return [Hash] Judge parameters, or an empty Hash without settings
  def params_from(settings)
    return {} unless settings

    {
      api_key: settings[:api_key],
      model: settings[:model] || @provider.llm,
      provider: @provider.runtime.to_sym
    }
  end

  # Calls merged_config on the provider, returning nil on any error.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    @provider.merged_config
  rescue StandardError
    nil
  end
end
53
+ end
54
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Finds the registry manifest file path.
class ManifestFinder
  # Default path relative to current working directory.
  DEFAULT_PATH = '../agent-mcp-runtime/registry.json'

  # Finds the registry manifest file.
  #
  # @param path [String, nil] Optional custom path to the manifest
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def self.call(path: nil)
    new(path: path).call
  end

  # @param path [String, nil] Optional custom path to the manifest
  def initialize(path: nil)
    @path = path
  end

  # Finds the registry manifest file.
  #
  # The custom path (or DEFAULT_PATH when none was given) is expanded
  # against the current working directory, so the returned value is
  # absolute in both cases. The path must point at a regular file; a
  # directory is rejected like a missing file.
  #
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def call
    manifest_path = File.expand_path(@path || DEFAULT_PATH, Dir.pwd)
    raise ArgumentError, "Registry manifest not found: #{manifest_path}" unless File.file?(manifest_path)

    manifest_path
  end
end
35
+ end
36
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Services
5
# Extracts the text to be evaluated from an agent result.
class OutputFormatter
  # Formats agent output for evaluation.
  #
  # @param agent_result [Hash] The agent result containing the output
  # @return [String] The formatted output
  def self.call(agent_result)
    new(agent_result).call
  end

  # @param agent_result [Hash] The agent result containing the output
  def initialize(agent_result)
    @agent_result = agent_result
  end

  # Returns the value stored under :result as a String; a missing or nil
  # value becomes ''.
  #
  # @return [String] The formatted output
  def call
    raw_output = @agent_result[:result]
    raw_output.to_s
  end
end
+ end
27
+ end
28
+ end