ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../execution/sandbox'
4
+ require_relative '../execution/context_hydrator'
5
+ require_relative 'react_agent'
6
+
7
module SkillBench
  module Agent
    # Executes one evaluation scenario (baseline or context-hydrated) inside an
    # isolated sandbox: builds the system prompt, runs the ReAct agent, and
    # collects the agent's final answer together with the resulting diff.
    class Runner
      # Modes accepted for the `:mode` parameter.
      ALLOWED_MODES = %i[baseline context].freeze
      private_constant :ALLOWED_MODES

      # Executes the agent run scenario.
      #
      # @param params [Hash] The configuration parameters for the run.
      # @option params [Symbol] :mode The mode to run in (`:baseline` or `:context`).
      # @option params [Pathname] :full_eval_path The path to the evaluation directory.
      # @option params [String] :task_content The task description.
      # @option params [Hash] :client_params Parameters for the LLM client (defaults to `{}`).
      # @option params [String] :source_path Required if mode is `:context`.
      # @option params [Pathname] :base_path Required if mode is `:context`.
      # @return [Array<String, String>] The agent's final answer and the git diff.
      # @raise [KeyError] when `:mode`, `:full_eval_path` or `:task_content` is missing.
      # @raise [ArgumentError] when `:mode` is not one of the allowed modes.
      def self.call(params)
        new(params).call
      end

      # @param params [Hash] The configuration parameters for the run (see {.call}).
      # @raise [KeyError] when a required key is missing.
      # @raise [ArgumentError] when `:mode` is not one of the allowed modes.
      def initialize(params)
        @mode = validate_mode(params.fetch(:mode))
        @full_eval_path = params.fetch(:full_eval_path)
        @task_content = params.fetch(:task_content)
        @client_params = params.fetch(:client_params, {})

        @source_path = params[:source_path]
        @base_path = params[:base_path]
      end

      # Runs the evaluation scenario and captures the results.
      #
      # @return [Array<String, String>] The final answer and the diff, as produced
      #   by the sandbox block (assumes Execution::Sandbox.run returns its block's
      #   value — TODO confirm against Sandbox).
      # @raise [RuntimeError] when context hydration fails in `:context` mode.
      def call
        # Build the prompt before opening the sandbox so that a context-hydration
        # failure aborts the run without a sandbox having been opened first.
        system_prompt = build_system_prompt

        Execution::Sandbox.run(@full_eval_path) do |sandbox|
          working_dir = sandbox.path
          agent_result = ReactAgent.call(
            client_params: @client_params,
            working_dir: working_dir,
            container_id: sandbox.container_id,
            system_prompt: system_prompt,
            initial_prompt: @task_content
          )

          [final_answer_from(agent_result), Execution::Sandbox.capture_diff(working_dir)]
        end
      end

      private

      # Turns the agent's service response into the text handed back to the caller.
      #
      # @param agent_result [Hash] Result hash with `:success` and `:response` keys.
      # @return [String] The response content on success, otherwise an
      #   `Error: ...` string describing the failure.
      def final_answer_from(agent_result)
        response = agent_result[:response]
        return response&.dig(:content) || 'Error: Empty response from agent' if agent_result[:success]

        "Error: #{response&.dig(:error, :message) || 'Unknown error'}"
      end

      # Builds the appropriate system prompt based on the execution mode.
      #
      # @return [String] The system prompt for the agent.
      # @raise [RuntimeError] when context hydration fails in :context mode.
      def build_system_prompt
        case @mode
        when :baseline
          baseline_system_prompt
        when :context
          context_system_prompt
        end
      end

      # @return [String] The prompt used when no skill context is supplied.
      def baseline_system_prompt
        <<~PROMPT
          You are an expert Ruby on Rails developer.#{' '}
          Your job is to read the task, modify the codebase using the tools provided to meet the requirements, and then explain what you did.
        PROMPT
      end

      # Hydrates the skill context and embeds it in the system prompt.
      #
      # @return [String] The prompt including the hydrated context.
      # @raise [RuntimeError] when the hydrator reports a failure.
      def context_system_prompt
        hydrator_result = Execution::ContextHydrator.call(source_path: @source_path, base_path: @base_path)
        raise "Context hydration failed: #{hydrator_result.dig(:response, :error, :message)}" unless hydrator_result[:success]

        context_xml = hydrator_result[:response][:context]

        <<~PROMPT
          You are an expert Ruby on Rails developer.
          You have access to specific skill files wrapped in <agent_context> tags.
          Use these skills exactly as instructed to solve the user's task.
          Modify the codebase using the tools provided to meet the requirements, and then explain what you did.

          #{context_xml}
        PROMPT
      end

      # @param mode [Object] The requested mode.
      # @return [Symbol] The mode, unchanged, when it is allowed.
      # @raise [ArgumentError] when the mode is not allowed.
      def validate_mode(mode)
        return mode if ALLOWED_MODES.include?(mode)

        # Allowed values are shown with #inspect so that a String such as
        # "baseline" is visibly different from the accepted Symbol :baseline.
        raise ArgumentError, "Invalid mode: #{mode.inspect}. Allowed: #{ALLOWED_MODES.map(&:inspect).join(', ')}"
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Agent
    # Carries the metadata of one sandbox execution: which files changed,
    # which commands ran, and an excerpt of the agent's reasoning.
    # Intended for delivery to the judge.
    class Summary
      attr_reader :files_changed, :commands_run, :agent_reasoning

      # Builds a summary and wraps it in the service response format.
      #
      # @param files_changed [Array<String>] List of file paths modified.
      # @param commands_run [Array<String>] List of shell commands executed.
      # @param agent_reasoning [String] Excerpt of agent reasoning.
      # @return [Hash] `{ success: true, response: { agent_summary: Summary } }`
      def self.call(files_changed: [], commands_run: [], agent_reasoning: '')
        summary = new(
          files_changed: files_changed,
          commands_run: commands_run,
          agent_reasoning: agent_reasoning
        )
        summary.call
      end

      # @param files_changed [Array<String>] Modified file paths.
      # @param commands_run [Array<String>] Executed commands.
      # @param agent_reasoning [String] Agent reasoning excerpt.
      def initialize(files_changed:, commands_run:, agent_reasoning:)
        @files_changed, @commands_run, @agent_reasoning = files_changed, commands_run, agent_reasoning
      end

      # Wraps this summary in the service response format.
      #
      # @return [Hash] Successful service response whose `:agent_summary` is this object.
      def call
        payload = { agent_summary: self }
        { success: true, response: payload }
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Namespace for the agent subsystem.
  #
  # Groups the classes that execute AI agents within isolated sandboxes,
  # following the ReAct (Reasoning and Acting) loop pattern. The module
  # itself defines no behavior.
  module Agent
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Cli
    module Eval
      # Lookup table from `eval` action names to the classes that handle them.
      class EvalCommandRegistry
        # @api private
        # Maps eval action names to their handler classes
        COMMANDS = {
          'new' => NewEvalCommand,
          'generate' => GenerateEvalCommand,
          'help' => HelpEvalCommand
        }.freeze

        class << self
          # Resolves the handler class for an action.
          #
          # A missing action and the spellings `-h`, `--help` and `help` all
          # resolve to the help handler.
          #
          # @param action [String, nil] Command action name
          # @return [Class<BaseEvalCommand>, nil] Command class or nil if not found
          def get_command(action)
            wants_help = action.nil? || %w[-h --help help].include?(action)
            lookup_key = wants_help ? 'help' : action
            COMMANDS[lookup_key]
          end

          # Lists all available actions
          #
          # @return [Array<String>] Available action names
          def available_actions
            COMMANDS.keys
          end
        end
      end
    end
  end
end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../evaluation/generator'
4
+ require_relative '../../commands/eval_new'
5
+
6
module SkillBench
  module Cli
    module Eval
      # Base class for eval command handlers
      class BaseEvalCommand
        # Executes command.
        #
        # @param argv [Array<String>] Command line arguments
        # @return [Integer] Exit code
        # @raise [NotImplementedError] always — subclasses must override
        def call(argv)
          raise NotImplementedError, 'Subclasses must implement #call'
        end

        protected

        # Runs the given block, translating the two expected failure modes into
        # exit codes: HelpRequested becomes 0, any other StandardError is
        # reported on stderr and becomes 1. Neither is re-raised.
        #
        # @yield Block that implements the command logic
        # @return [Integer] Exit code from the block, 0 for help, or 1 on error
        def run_with_rescue
          yield
        rescue HelpRequested
          0
        rescue StandardError => e
          warn "Error: #{e.message}"
          1
        end

        # Reports a missing required argument on stderr.
        #
        # @param message [String] Error message
        # @return [Integer] Exit code 1
        def error_missing(message)
          warn "Error: #{message}"
          1
        end
      end

      # Handles 'eval new' command
      class NewEvalCommand < BaseEvalCommand
        # Creates a new evaluation
        #
        # @param argv [Array<String>] Command line arguments; options are removed
        #   in place and the first remaining argument is taken as the eval name
        # @return [Integer] Exit code
        def call(argv)
          run_with_rescue do
            options_parser = NewEvalOptions.new
            options_parser.parse!(argv)

            name = argv.shift
            # `return` inside the block exits #call itself with this exit code.
            return error_missing('eval name is required') unless name

            Commands::EvalNew.run(name: name, **options_parser.options)
            puts "Created eval: #{name}"
            0
          end
        end
      end

      # Handles 'eval generate' command
      class GenerateEvalCommand < BaseEvalCommand
        # Generates an evaluation from a skill
        #
        # The eval name defaults to `<skill-name>-eval` when `--name` is not given.
        #
        # @param argv [Array<String>] Command line arguments; options are removed
        #   in place and the first remaining argument is taken as the skill name
        # @return [Integer] Exit code (0 on success, 1 on any failure)
        def call(argv)
          run_with_rescue do
            options_parser = GenerateEvalOptions.new
            options_parser.parse!(argv)

            skill_name = argv.shift
            # `return` inside the block exits #call itself with this exit code.
            return error_missing('skill name is required') unless skill_name

            eval_name = options_parser.options[:eval_name] || "#{skill_name}-eval"
            result = Evaluation::Generator.new(skill_name: skill_name, eval_name: eval_name).call

            if result[:success]
              puts "Generated eval: #{eval_name} from skill: #{skill_name}"
              0
            else
              # A failure result may lack a response or error payload; dig keeps
              # the report readable instead of surfacing a NoMethodError text.
              warn "Error: #{result.dig(:response, :error, :message) || 'Unknown error'}"
              1
            end
          end
        end
      end

      # Handles help display for eval commands
      class HelpEvalCommand < BaseEvalCommand
        # Shows help information
        #
        # @param _argv [Array<String>] Unused arguments
        # @return [Integer] Exit code 0
        def call(_argv)
          puts 'Usage: skill-bench eval new <name> [options]'
          puts ' --runtime TYPE rails, ruby, etc. (default: ruby)'
          puts 'Usage: skill-bench eval generate <skill-name> [--name <eval-name>]'
          0
        end
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
module SkillBench
  module Cli
    module Eval
      # Shared option-parsing scaffold for the `eval` subcommands. Subclasses
      # supply their defaults and their configured OptionParser.
      class BaseEvalOptions
        attr_reader :options, :parser

        def initialize
          @options = default_options
          @parser = create_parser
        end

        # Parses command line arguments, removing recognized options from argv.
        #
        # @param argv [Array<String>] Command line arguments
        # @return [Array<String>] Remaining arguments after parsing options
        def parse!(argv)
          @parser.parse!(argv)
        end

        protected

        # Override in subclasses to define default options
        #
        # @return [Hash] Initial option values
        def default_options
          {}
        end

        # Override in subclasses to configure OptionParser
        #
        # @return [OptionParser] Parser used by #parse!
        def create_parser
          OptionParser.new
        end

        # Registers the `-h/--help` switch, which prints the parser's help text
        # and raises HelpRequested.
        #
        # @param opts [OptionParser] Parser to extend
        # @return [void]
        def register_help(opts)
          opts.on('-h', '--help', 'Prints this help') do
            puts opts
            raise HelpRequested
          end
        end
      end

      # Options parser for 'eval new' command
      class NewEvalOptions < BaseEvalOptions
        protected

        # @return [Hash] Defaults for `eval new` (runtime is 'ruby')
        def default_options
          { runtime: 'ruby' }
        end

        # @return [OptionParser] Parser accepting `--runtime TYPE` and `-h/--help`
        def create_parser
          opts = OptionParser.new
          opts.banner = 'Usage: skill-bench eval new <name> [options]'
          opts.on('--runtime TYPE', 'rails, ruby, etc.') { |runtime| @options[:runtime] = runtime }
          register_help(opts)
          opts
        end
      end

      # Options parser for 'eval generate' command
      class GenerateEvalOptions < BaseEvalOptions
        protected

        # @return [OptionParser] Parser accepting `--name NAME` and `-h/--help`
        def create_parser
          opts = OptionParser.new
          opts.banner = 'Usage: skill-bench eval generate <skill-name> [options]'
          opts.on('--name NAME', 'Name for generated eval') { |eval_name| @options[:eval_name] = eval_name }
          register_help(opts)
          opts
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'eval/eval_options'
4
+ require_relative 'eval/eval_commands'
5
+ require_relative 'eval/eval_command_registry'
6
+
7
module SkillBench
  module Cli
    # Entry point for the `skill-bench eval` subcommand: takes the action
    # name off the argument list and hands the rest to that action's handler.
    class EvalCommand
      # Parses argv and executes eval command.
      #
      # @param argv [Array<String>] Raw CLI arguments
      # @return [Integer] Exit code
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments (mutated by #call)
      def initialize(argv)
        @argv = argv
      end

      # Dispatches to appropriate eval action.
      #
      # @return [Integer] The handler's exit code, or 1 when the action is unknown
      def call
        action = @argv.shift
        handler = Eval::EvalCommandRegistry.get_command(action)
        return handler.new.call(@argv) if handler

        warn "Unknown eval action: #{action}"
        1
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Cli
    # Renders the top-level usage text for the `skill-bench` executable.
    class HelpPrinter
      # Prints the help message and returns exit code 0.
      #
      # The provider flags listed under `init` come from
      # SkillBench::Clients::ProviderSchemas.names, each prefixed with `--`.
      #
      # @return [Integer] Exit code (always 0)
      def self.call
        provider_flags = SkillBench::Clients::ProviderSchemas.names.map { |provider| "--#{provider}" }
        puts usage_text(provider_flags.join(', '))
        0
      end

      # Builds the usage text.
      #
      # @param providers [String] Comma-separated provider flags
      # @return [String] The complete usage message, newline-terminated
      def self.usage_text(providers)
        <<~USAGE
          Usage: skill-bench <subcommand> [options]

          Subcommands:
          init --<provider> [--force]
          Generate configuration file
          Providers: #{providers}
          --force Overwrite existing config file

          run <eval> --skill <name> [--skill <name>] [--format FORMAT]
          Run an evaluation
          --skill Skill to use (can be specified multiple times)
          --format Output format: human, json, junit (default: human)

          skill new <name> [--mode MODE] [--template TYPE]
          Create a new skill
          --mode simple, advanced, or rails (default: simple)
          --template service_object, concern, active_record_model (default: service_object)

          eval new <name> [--runtime TYPE]
          Create a new eval
          --runtime rails, ruby, etc. (default: ruby)

          eval generate <skill-name> [--name <eval-name>]
          Auto-generate an eval from a skill
          --name Name for the generated eval (optional)

          Global Options:
          -h, --help Show this help message
        USAGE
      end
      private_class_method :usage_text
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
module SkillBench
  module Cli
    # Handles the `skill-bench init` subcommand.
    # Parses options and delegates to Commands::Init.
    class InitCommand
      # Parses argv and runs init.
      #
      # @param argv [Array<String>] Raw CLI arguments
      # @return [Integer] Exit code
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments (options are removed by #call)
      def initialize(argv)
        @argv = argv
      end

      # Parses options and runs init.
      #
      # @return [Integer] 0 when the config was created or help was shown,
      #   1 when no provider was given or any StandardError occurred
      def call
        begin
          settings = { force: false, provider: nil }
          build_parser(settings).parse!(@argv)

          return error_missing_provider unless settings[:provider]

          Commands::Init.run(**settings)
          puts "Created #{SkillBench::Config::CONFIG_FILENAME}"
          0
        rescue SkillBench::HelpRequested
          0
        rescue StandardError => e
          warn "Error: #{e.message}"
          1
        end
      end

      private

      # Builds the parser: one flag per provider, then `--force` and `-h/--help`.
      #
      # @param options [Hash] Hash the switches write `:provider` and `:force` into
      # @return [OptionParser]
      def build_parser(options)
        parser = OptionParser.new
        parser.banner = 'Usage: skill-bench init --<provider> [options]'
        register_provider_options(parser, options)
        parser.on('--force', 'Overwrite existing config file') { options[:force] = true }
        parser.on('-h', '--help', 'Prints this help') do
          puts parser
          raise SkillBench::HelpRequested
        end
        parser
      end

      # Adds a `--<name>` switch for every name in ProviderSchemas.names; the
      # switch stores that name under `:provider`.
      #
      # @param parser [OptionParser] Parser to extend
      # @param options [Hash] Hash receiving the selected provider
      # @return [void]
      def register_provider_options(parser, options)
        SkillBench::Clients::ProviderSchemas.names.each do |provider|
          description = "Generate config for #{provider.to_s.capitalize}"
          parser.on("--#{provider}", description) { options[:provider] = provider }
        end
      end

      # Reports the missing provider together with the accepted flags.
      #
      # @return [Integer] Exit code 1
      def error_missing_provider
        flags = SkillBench::Clients::ProviderSchemas.names.map { |provider| "--#{provider}" }
        warn "Error: provider is required. Use one of: #{flags.join(', ')}"
        1
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../output_formatter'
4
+
5
module SkillBench
  module Cli
    # Writes the outcome of a `skill-bench run` command to standard output.
    class ResultPrinter
      # Prints the formatted result, then reports the exit code for it.
      # Both the rendering and the exit code are delegated to OutputFormatter.
      #
      # @param result [Hash] Result from ScoringService
      # @param format [Symbol] Output format (:human, :json, :junit)
      # @return [Integer] Exit code (0 for pass, 1 for fail)
      def self.call(result, format: :human)
        rendered = OutputFormatter.format(result, format: format)
        puts rendered
        OutputFormatter.exit_code(result)
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
module SkillBench
  module Cli
    # Handles the `skill-bench run` subcommand.
    # Parses options and delegates to Commands::Run.
    class RunCommand
      # Output formats accepted by `--format`.
      FORMATS = %i[human json junit].freeze
      private_constant :FORMATS

      # Parses argv and runs the eval.
      #
      # @param argv [Array<String>] Raw CLI arguments
      # @return [Integer] Exit code
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments (options are removed by #call)
      def initialize(argv)
        @argv = argv
      end

      # Parses options and runs the eval.
      #
      # All arguments, including `--format`, are validated before the eval is
      # started, so a mistyped option fails immediately instead of after the run.
      #
      # @return [Integer] The exit code reported by ResultPrinter, 0 when help
      #   was shown, or 1 on a usage error or any StandardError
      def call
        options = { skill_names: [] }
        build_parser(options).parse!(@argv)

        eval_name = @argv.shift
        return error_missing_eval unless eval_name
        return error_missing_skill if options[:skill_names].empty?

        # :format only affects printing; it is not an argument of Commands::Run.
        format = options.delete(:format) || :human
        result = Commands::Run.run(**options, eval_name: eval_name)
        ResultPrinter.call(result, format: format)
      rescue SkillBench::HelpRequested
        0
      rescue StandardError => e
        warn "Error: #{e.message}"
        1
      end

      private

      # Builds the parser for `--skill` (repeatable), `--format` and `-h/--help`.
      #
      # @param options [Hash] Hash the switches write `:skill_names` and `:format` into
      # @return [OptionParser]
      def build_parser(options)
        OptionParser.new do |opts|
          opts.banner = 'Usage: skill-bench run <eval> [options]'
          opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
          # Passing the candidate list makes OptionParser reject any other value
          # with OptionParser::InvalidArgument and yield the matching Symbol.
          opts.on('--format FORMAT', FORMATS, 'Output format (human, json, junit)') { |v| options[:format] = v }
          opts.on('-h', '--help', 'Prints this help') do
            puts opts
            raise SkillBench::HelpRequested
          end
        end
      end

      # @return [Integer] Exit code 1
      def error_missing_eval
        warn 'Error: eval name is required'
        warn 'Usage: skill-bench run <eval> --skill <name>'
        1
      end

      # @return [Integer] Exit code 1
      def error_missing_skill
        warn 'Error: skill name is required'
        warn 'Usage: skill-bench run <eval> --skill <name>'
        1
      end
    end
  end
end