ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../execution/sandbox'
|
|
4
|
+
require_relative '../execution/context_hydrator'
|
|
5
|
+
require_relative 'react_agent'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Agent
|
|
9
|
+
# Runs one evaluation scenario (baseline or context-hydrated) inside a sandbox:
# builds the system prompt for the selected mode, hands the task to the ReAct
# agent, and reports the agent's answer together with the captured diff.
class Runner
  # Convenience entry point: builds a runner and executes it.
  #
  # @param params [Hash] The configuration parameters for the run.
  # @option params [Symbol] :mode The mode to run in (`:baseline` or `:context`).
  # @option params [Pathname] :full_eval_path The path to the evaluation directory.
  # @option params [String] :task_content The task description.
  # @option params [Hash] :client_params Parameters for the LLM client.
  # @option params [String] :source_path Required if mode is `:context`.
  # @option params [Pathname] :base_path Required if mode is `:context`.
  # @return [Array<String, String>] The agent's final answer and the captured diff.
  # @raise [ArgumentError] when :mode is neither `:baseline` nor `:context`.
  def self.call(params)
    new(params).call
  end

  # Reads the run configuration. :mode, :full_eval_path and :task_content are
  # fetched strictly (a missing key fails in `fetch`); :client_params defaults
  # to an empty hash; :source_path and :base_path are optional lookups.
  #
  # @param params [Hash] The configuration parameters for the run.
  # @raise [ArgumentError] when :mode is neither `:baseline` nor `:context`.
  def initialize(params)
    @mode = validate_mode(params.fetch(:mode))
    @full_eval_path = params.fetch(:full_eval_path)
    @task_content = params.fetch(:task_content)
    @client_params = params.fetch(:client_params, {})
    @source_path, @base_path = params.values_at(:source_path, :base_path)
  end

  # Runs the agent inside the sandbox and collects its outcome.
  #
  # @return [Array<String, String>] A tuple of the final answer (or an
  #   "Error: ..." string) and the diff captured from the sandbox path.
  def call
    Execution::Sandbox.run(@full_eval_path) do |sandbox|
      workspace = sandbox.path
      outcome = ReactAgent.call(
        client_params: @client_params,
        working_dir: workspace,
        container_id: sandbox.container_id,
        system_prompt: build_system_prompt,
        initial_prompt: @task_content
      )

      [extract_final_answer(outcome), Execution::Sandbox.capture_diff(workspace)]
    end
  end

  private

  # Turns the agent's service response into the final answer string.
  #
  # @param outcome [Hash] Agent result with :success and :response keys.
  # @return [String] The response content, or an "Error: ..." description.
  def extract_final_answer(outcome)
    reply = outcome[:response]
    return reply&.dig(:content) || 'Error: Empty response from agent' if outcome[:success]

    "Error: #{reply&.dig(:error, :message) || 'Unknown error'}"
  end

  # Picks the system prompt matching the execution mode.
  #
  # @return [String] The system prompt for the agent.
  # @raise [RuntimeError] when context hydration fails in :context mode.
  def build_system_prompt
    if @mode == :baseline
      baseline_system_prompt
    elsif @mode == :context
      context_system_prompt
    end
  end

  # @return [String] The prompt used when no skill context is supplied.
  def baseline_system_prompt
    prompt_lines = [
      # The first line intentionally keeps its trailing space.
      'You are an expert Ruby on Rails developer. ',
      'Your job is to read the task, modify the codebase using the tools provided ' \
      'to meet the requirements, and then explain what you did.'
    ]
    "#{prompt_lines.join("\n")}\n"
  end

  # Hydrates the skill context and embeds it in the prompt.
  #
  # @return [String] The prompt including the hydrated context.
  # @raise [RuntimeError] when the hydrator reports a failure.
  def context_system_prompt
    hydration = Execution::ContextHydrator.call(source_path: @source_path, base_path: @base_path)
    unless hydration[:success]
      raise "Context hydration failed: #{hydration.dig(:response, :error, :message)}"
    end

    skills_xml = hydration[:response][:context]

    <<~PROMPT
      You are an expert Ruby on Rails developer.
      You have access to specific skill files wrapped in <agent_context> tags.
      Use these skills exactly as instructed to solve the user's task.
      Modify the codebase using the tools provided to meet the requirements, and then explain what you did.

      #{skills_xml}
    PROMPT
  end

  # @param mode [Object] The requested mode.
  # @return [Symbol] The mode, unchanged, when it is allowed.
  # @raise [ArgumentError] when the mode is not allowed.
  def validate_mode(mode)
    permitted = %i[baseline context]
    unless permitted.include?(mode)
      raise ArgumentError, "Invalid mode: #{mode.inspect}. Allowed: #{permitted.join(', ')}"
    end

    mode
  end
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Agent
|
|
5
|
+
# Value object describing what happened during a sandboxed run.
#
# Carries the changed files, the executed commands and an excerpt of the
# agent's reasoning so they can be passed on to the judge.
class Summary
  attr_reader :files_changed, :commands_run, :agent_reasoning

  # Builds a summary and wraps it in the service response format.
  #
  # @param files_changed [Array<String>] List of file paths modified.
  # @param commands_run [Array<String>] List of shell commands executed.
  # @param agent_reasoning [String] Excerpt of agent reasoning.
  # @return [Hash] Service response holding the summary under
  #   `response[:agent_summary]`.
  def self.call(files_changed: [], commands_run: [], agent_reasoning: '')
    summary = new(
      files_changed: files_changed,
      commands_run: commands_run,
      agent_reasoning: agent_reasoning
    )
    summary.call
  end

  # Stores the given values as-is (no copying or freezing).
  #
  # @param files_changed [Array<String>] Modified file paths.
  # @param commands_run [Array<String>] Executed commands.
  # @param agent_reasoning [String] Agent reasoning excerpt.
  def initialize(files_changed:, commands_run:, agent_reasoning:)
    @agent_reasoning = agent_reasoning
    @commands_run = commands_run
    @files_changed = files_changed
  end

  # Wraps this summary in the service response format.
  #
  # @return [Hash] `{ success: true, response: { agent_summary: self } }`.
  def call
    payload = { agent_summary: self }
    { success: true, response: payload }
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Cli
|
|
5
|
+
module Eval
|
|
6
|
+
# Lookup table from `eval` action names to the classes that handle them.
class EvalCommandRegistry
  # @api private
  # Action name => handler class.
  COMMANDS = {
    'new' => NewEvalCommand,
    'generate' => GenerateEvalCommand,
    'help' => HelpEvalCommand
  }.freeze

  # Resolves the handler class for an action.
  #
  # A missing action and the help spellings (`-h`, `--help`, `help`) all
  # resolve to the help handler.
  #
  # @param action [String, nil] Command action name
  # @return [Class<BaseEvalCommand>, nil] Command class or nil if not found
  def self.get_command(action)
    case action
    when nil, '-h', '--help', 'help'
      COMMANDS['help']
    else
      COMMANDS[action]
    end
  end

  # Lists the registered action names.
  #
  # @return [Array<String>] Available action names
  def self.available_actions
    COMMANDS.each_key.to_a
  end
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../evaluation/generator'
|
|
4
|
+
require_relative '../../commands/eval_new'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Cli
|
|
8
|
+
module Eval
|
|
9
|
+
# Common superclass of the `eval` command handlers. Supplies the shared
# error-reporting helpers; subclasses provide #call.
class BaseEvalCommand
  # Executes the command. The base implementation only raises.
  #
  # @param argv [Array<String>] Command line arguments
  # @return [Integer] Exit code
  # @raise [NotImplementedError] always — subclasses must override
  def call(argv)
    raise NotImplementedError, 'Subclasses must implement #call'
  end

  protected

  # Runs the given block and converts exceptions into exit codes.
  #
  # HelpRequested yields exit code 0. Any other StandardError is reported
  # on stderr as "Error: <message>" and yields exit code 1.
  #
  # @yield Block that implements the command logic
  # @return [Integer] The block's value, 0 for help, or 1 on error
  def run_with_rescue
    yield
  rescue HelpRequested
    0
  rescue StandardError => failure
    # Same "Error: ..." line and exit code as a missing-argument failure.
    error_missing(failure.message)
  end

  # Reports an error on stderr and returns the failure exit code.
  #
  # @param message [String] Error message
  # @return [Integer] Exit code 1
  def error_missing(message)
    warn "Error: #{message}"
    1
  end
end
|
|
47
|
+
|
|
48
|
+
# Handler for the 'eval new' command.
class NewEvalCommand < BaseEvalCommand
  # Creates a new evaluation.
  #
  # Options are parsed out of argv first; the next remaining argument is
  # taken as the eval name and passed, together with the parsed options,
  # to Commands::EvalNew.
  #
  # @param argv [Array<String>] Command line arguments (consumed in place)
  # @return [Integer] Exit code
  def call(argv)
    run_with_rescue do
      parsed = NewEvalOptions.new
      parsed.parse!(argv)

      eval_name = argv.shift
      if eval_name
        Commands::EvalNew.run(name: eval_name, **parsed.options)
        puts "Created eval: #{eval_name}"
        0
      else
        error_missing('eval name is required')
      end
    end
  end
end
|
|
68
|
+
|
|
69
|
+
# Handles 'eval generate' command
class GenerateEvalCommand < BaseEvalCommand
  # Generates an evaluation from a skill.
  #
  # Options are parsed out of argv first; the next remaining argument is
  # taken as the skill name. The eval name is the value given via --name,
  # or "<skill-name>-eval" when none was given.
  #
  # @param argv [Array<String>] Command line arguments (consumed in place)
  # @return [Integer] 0 when generation succeeds (or help was requested),
  #   1 when the skill name is missing or generation fails
  def call(argv)
    run_with_rescue do
      options_parser = GenerateEvalOptions.new
      options_parser.parse!(argv)

      skill_name = argv.shift
      return error_missing('skill name is required') unless skill_name

      eval_name = options_parser.options[:eval_name] || "#{skill_name}-eval"
      result = Evaluation::Generator.new(skill_name: skill_name, eval_name: eval_name).call

      if result[:success]
        puts "Generated eval: #{eval_name} from skill: #{skill_name}"
        0
      else
        # dig keeps an incomplete failure payload from surfacing as a
        # confusing NoMethodError text; fall back to a generic message.
        warn "Error: #{result.dig(:response, :error, :message) || 'Unknown error'}"
        1
      end
    end
  end
end
|
|
96
|
+
|
|
97
|
+
# Handler that prints usage information for the eval commands.
class HelpEvalCommand < BaseEvalCommand
  # Prints the usage lines to stdout.
  #
  # @param _argv [Array<String>] Unused arguments
  # @return [Integer] Exit code 0
  def call(_argv)
    usage_lines = [
      'Usage: skill-bench eval new <name> [options]',
      ' --runtime TYPE rails, ruby, etc. (default: ruby)',
      'Usage: skill-bench eval generate <skill-name> [--name <eval-name>]'
    ]
    # puts prints each array element on its own line.
    puts usage_lines
    0
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Cli
|
|
7
|
+
module Eval
|
|
8
|
+
# Shared skeleton for eval option parsers: holds the options hash and the
# OptionParser built by the two overridable hooks below.
class BaseEvalOptions
  attr_reader :options, :parser

  # Builds the default options first, then the parser (whose handlers may
  # write into the options hash).
  def initialize
    @options = default_options
    @parser = create_parser
  end

  # Parses command line arguments, removing recognised options from argv.
  #
  # @param argv [Array<String>] Command line arguments
  # @return [Array<String>] Remaining arguments after parsing options
  def parse!(argv)
    @parser.parse!(argv)
  end

  protected

  # Hook: the initial options hash. Subclasses override to add defaults.
  #
  # @return [Hash] Empty by default
  def default_options
    {}
  end

  # Hook: the OptionParser to use. Subclasses override to register switches.
  #
  # @return [OptionParser] A parser with no custom switches by default
  def create_parser
    OptionParser.new
  end
end
|
|
37
|
+
|
|
38
|
+
# Option parser for the 'eval new' command.
class NewEvalOptions < BaseEvalOptions
  protected

  # @return [Hash] Defaults: runtime 'ruby'
  def default_options
    { runtime: 'ruby' }
  end

  # Builds the parser: --runtime stores its value in the options hash;
  # -h/--help prints the help text and raises HelpRequested.
  #
  # @return [OptionParser]
  def create_parser
    cli = OptionParser.new
    cli.banner = 'Usage: skill-bench eval new <name> [options]'
    cli.on('--runtime TYPE', 'rails, ruby, etc.') { |runtime| @options[:runtime] = runtime }
    cli.on('-h', '--help', 'Prints this help') do
      puts cli
      raise HelpRequested
    end
    cli
  end
end
|
|
57
|
+
|
|
58
|
+
# Option parser for the 'eval generate' command.
class GenerateEvalOptions < BaseEvalOptions
  protected

  # Builds the parser: --name stores its value under :eval_name in the
  # options hash; -h/--help prints the help text and raises HelpRequested.
  #
  # @return [OptionParser]
  def create_parser
    cli = OptionParser.new
    cli.banner = 'Usage: skill-bench eval generate <skill-name> [options]'
    cli.on('--name NAME', 'Name for generated eval') { |eval_name| @options[:eval_name] = eval_name }
    cli.on('-h', '--help', 'Prints this help') do
      puts cli
      raise HelpRequested
    end
    cli
  end
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'eval/eval_options'
|
|
4
|
+
require_relative 'eval/eval_commands'
|
|
5
|
+
require_relative 'eval/eval_command_registry'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Cli
|
|
9
|
+
# Entry point for the `skill-bench eval` subcommand.
# Looks up the handler for the requested action and delegates to it.
class EvalCommand
  # Builds the command and runs it.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Integer] Exit code
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Takes the first argument as the action and dispatches the rest to the
  # handler registered for it.
  #
  # @return [Integer] The handler's exit code, or 1 (with a message on
  #   stderr) when no handler is registered for the action
  def call
    action = @argv.shift
    handler = Eval::EvalCommandRegistry.get_command(action)
    return handler.new.call(@argv) if handler

    warn "Unknown eval action: #{action}"
    1
  end
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Cli
|
|
5
|
+
# Writes the top-level CLI usage message.
class HelpPrinter
  # Prints the usage text to stdout. The provider flags are derived from
  # the names reported by SkillBench::Clients::ProviderSchemas.
  #
  # @return [Integer] Exit code (always 0)
  def self.call
    provider_flags = SkillBench::Clients::ProviderSchemas.names.map { |provider| "--#{provider}" }
    providers = provider_flags.join(', ')

    usage = <<~USAGE
      Usage: skill-bench <subcommand> [options]

      Subcommands:
      init --<provider> [--force]
      Generate configuration file
      Providers: #{providers}
      --force Overwrite existing config file

      run <eval> --skill <name> [--skill <name>] [--format FORMAT]
      Run an evaluation
      --skill Skill to use (can be specified multiple times)
      --format Output format: human, json, junit (default: human)

      skill new <name> [--mode MODE] [--template TYPE]
      Create a new skill
      --mode simple, advanced, or rails (default: simple)
      --template service_object, concern, active_record_model (default: service_object)

      eval new <name> [--runtime TYPE]
      Create a new eval
      --runtime rails, ruby, etc. (default: ruby)

      eval generate <skill-name> [--name <eval-name>]
      Auto-generate an eval from a skill
      --name Name for the generated eval (optional)

      Global Options:
      -h, --help Show this help message
    USAGE

    puts usage
    0
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Cli
|
|
7
|
+
# Entry point for the `skill-bench init` subcommand.
# Parses the CLI options and hands them to Commands::Init.
class InitCommand
  # Builds the command and runs it.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Integer] Exit code
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options and runs init.
  #
  # @return [Integer] 0 on success or when help was requested; 1 when no
  #   provider flag was given or any StandardError occurred (the message is
  #   written to stderr)
  def call
    settings = { force: false, provider: nil }
    build_parser(settings).parse!(@argv)
    return error_missing_provider unless settings[:provider]

    Commands::Init.run(**settings)
    puts "Created #{SkillBench::Config::CONFIG_FILENAME}"
    0
  rescue SkillBench::HelpRequested
    0
  rescue StandardError => failure
    warn "Error: #{failure.message}"
    1
  end

  private

  # Builds the option parser; its handlers write into the given hash.
  #
  # @param options [Hash] Destination for parsed values (:force, :provider)
  # @return [OptionParser]
  def build_parser(options)
    cli = OptionParser.new
    cli.banner = 'Usage: skill-bench init --<provider> [options]'
    register_provider_options(cli, options)
    cli.on('--force', 'Overwrite existing config file') { options[:force] = true }
    cli.on('-h', '--help', 'Prints this help') do
      puts cli
      raise SkillBench::HelpRequested
    end
    cli
  end

  # Registers one `--<name>` switch per provider name; the last switch
  # given on the command line wins.
  #
  # @param parser [OptionParser] Parser to extend
  # @param options [Hash] Destination for the selected provider
  def register_provider_options(parser, options)
    SkillBench::Clients::ProviderSchemas.names.each do |provider|
      label = provider.to_s.capitalize
      parser.on("--#{provider}", "Generate config for #{label}") { options[:provider] = provider }
    end
  end

  # Reports the missing provider together with the list of valid flags.
  #
  # @return [Integer] Exit code 1
  def error_missing_provider
    flags = SkillBench::Clients::ProviderSchemas.names.map { |provider| "--#{provider}" }
    warn "Error: provider is required. Use one of: #{flags.join(', ')}"
    1
  end
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../output_formatter'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Cli
|
|
7
|
+
# Writes the outcome of a `skill-bench run` command to stdout.
class ResultPrinter
  # Prints the formatted result and reports the matching exit code. Both
  # the rendering and the exit code come from OutputFormatter.
  #
  # @param result [Hash] Result from ScoringService
  # @param format [Symbol] Output format (:human, :json, :junit)
  # @return [Integer] Exit code (0 for pass, 1 for fail)
  def self.call(result, format: :human)
    rendered = OutputFormatter.format(result, format: format)
    puts rendered
    OutputFormatter.exit_code(result)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Cli
|
|
7
|
+
# Handles the `skill-bench run` subcommand.
# Parses options and delegates to Commands::Run.
class RunCommand
  # Output formats accepted by --format, matching the option's help text.
  SUPPORTED_FORMATS = %w[human json junit].freeze
  private_constant :SUPPORTED_FORMATS

  # Builds the command and runs it.
  #
  # @param argv [Array<String>] Raw CLI arguments
  # @return [Integer] Exit code
  def self.call(argv)
    new(argv).call
  end

  # @param argv [Array<String>] Raw CLI arguments
  def initialize(argv)
    @argv = argv
  end

  # Parses options and runs the eval.
  #
  # The first argument left after option parsing is the eval name. An
  # unsupported --format value is rejected during parsing, before the
  # evaluation is started.
  #
  # @return [Integer] The exit code from ResultPrinter; 0 when help was
  #   requested; 1 when the eval name or --skill is missing or any
  #   StandardError occurred (the message is written to stderr)
  def call
    options = { skill_names: [] }
    build_parser(options).parse!(@argv)

    eval_name = @argv.shift
    return error_missing_eval unless eval_name
    return error_missing_skill if options[:skill_names].empty?

    options[:eval_name] = eval_name
    # :format only affects printing, so it is not forwarded to the run.
    exec_options = options.reject { |key, _value| key == :format }
    result = Commands::Run.run(**exec_options)
    ResultPrinter.call(result, format: options[:format] || :human)
  rescue SkillBench::HelpRequested
    0
  rescue StandardError => e
    warn "Error: #{e.message}"
    1
  end

  private

  # Builds the option parser; its handlers write into the given hash.
  #
  # @param options [Hash] Destination for parsed values (:skill_names, :format)
  # @return [OptionParser]
  def build_parser(options)
    OptionParser.new do |opts|
      opts.banner = 'Usage: skill-bench run <eval> [options]'
      opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
      opts.on('--format FORMAT', 'Output format (human, json, junit)') do |value|
        # Fail fast on a typo instead of discovering it after the run.
        raise OptionParser::InvalidArgument, value unless SUPPORTED_FORMATS.include?(value)

        options[:format] = value.to_sym
      end
      opts.on('-h', '--help', 'Prints this help') do
        puts opts
        raise SkillBench::HelpRequested
      end
    end
  end

  # @return [Integer] Exit code 1, after reporting the missing eval name
  def error_missing_eval
    warn 'Error: eval name is required'
    warn 'Usage: skill-bench run <eval> --skill <name>'
    1
  end

  # @return [Integer] Exit code 1, after reporting the missing skill
  def error_missing_skill
    warn 'Error: skill name is required'
    warn 'Usage: skill-bench run <eval> --skill <name>'
    1
  end
end
|
|
71
|
+
end
|
|
72
|
+
end
|