ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+
5
+ module SkillBench
6
+ module Clients
7
# Thin Faraday wrapper used to talk to LLM provider HTTP APIs.
# Knows how to build a JSON-aware connection and how to POST a payload on it.
class RequestBuilder
  # Default connection-open timeout, in seconds.
  DEFAULT_OPEN_TIMEOUT = 10
  # Default overall request timeout, in seconds.
  DEFAULT_TIMEOUT = 120

  class << self
    # Builds a Faraday connection with the JSON request and response middleware.
    #
    # @param base_url [String] The API base URL
    # @param open_timeout [Integer] Connection open timeout in seconds
    # @param timeout [Integer] Request timeout in seconds
    # @return [Faraday::Connection] Configured Faraday connection
    def build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
      Faraday.new(url: base_url) do |faraday|
        faraday.request :json
        faraday.response :json
        faraday.options.open_timeout = open_timeout
        faraday.options.timeout = timeout
      end
    end

    # Sends a POST with the given headers and the body serialized via #to_json.
    #
    # @param connection [Faraday::Connection] The connection to post on
    # @param path [String] The request path
    # @param headers [Hash] Request headers merged into the outgoing request
    # @param body [Hash] Request body; serialized with #to_json before sending
    # @return [Faraday::Response] The HTTP response
    def execute(connection, path, headers:, body:)
      connection.post(path) do |request|
        request.headers.update(headers)
        request.body = body.to_json
      end
    end
  end
end
42
+ end
43
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ module Clients
5
# Builds standardized error envelopes for LLM provider clients and
# forwards exceptions to the shared error logger.
class ResponseErrorHandler
  API_FAILED = 'API Request failed'
  MISSING_MESSAGE = 'LLM response missing message content'

  # Creates an error response for a failed HTTP request.
  #
  # The message is "API Request failed: <status>", followed by " - <detail>"
  # when the parsed body carries a non-empty error detail.
  #
  # @param response [Faraday::Response] The HTTP response (only #status is read)
  # @param parsed [Hash, Object] Parsed response body
  # @param usage_extractor [Proc] Optional block extracting usage data from
  #   +parsed+; usage defaults to an empty Hash when no block is given
  # @return [Hash] Standardized error response
  def self.failure_response(response, parsed, &usage_extractor)
    error_msg = "#{API_FAILED}: #{response.status}"
    detail = error_detail(parsed)
    error_msg = "#{error_msg} - #{detail}" unless detail.empty?

    error_envelope(error_msg, response, parsed, usage_extractor)
  end

  # Creates an error response when the LLM response has no message content.
  #
  # @param response [Faraday::Response] The HTTP response (only #status is read)
  # @param parsed [Hash, Object] Parsed response body
  # @param usage_extractor [Proc] Optional block extracting usage data from
  #   +parsed+; usage defaults to an empty Hash when no block is given
  # @return [Hash] Standardized error response
  def self.missing_message_response(response, parsed, &usage_extractor)
    error_envelope(MISSING_MESSAGE, response, parsed, usage_extractor)
  end

  # Handles an exception by logging it and returning a standardized error response.
  #
  # @param error [StandardError] The exception that occurred
  # @param type [String] The error type label prefixed to the message
  # @return [Hash] Standardized error response
  def self.handle_exception(error, type)
    log_error(error)
    { success: false, result: "#{type}: #{error.message}", status: 'error' }
  end

  # Forwards the exception to SkillBench::ErrorLogger.log_error.
  #
  # @param error [StandardError] The exception to log
  # @return [void]
  def self.log_error(error)
    SkillBench::ErrorLogger.log_error(error)
  end

  # Extracts a human-readable detail string from a parsed error body.
  # Looks under the :error / 'error' key first, then for a :message / 'message'.
  # Returns an empty string when there is nothing meaningful to report, so the
  # caller never appends a dangling " - " or " - {}" suffix.
  def self.error_detail(parsed)
    detail = parsed.is_a?(Hash) ? (parsed[:error] || parsed['error'] || parsed) : parsed
    message = detail.is_a?(Hash) ? (detail[:message] || detail['message']) : nil
    return message.to_s if message
    return '' if detail.respond_to?(:empty?) && detail.empty?

    detail.to_s
  end
  private_class_method :error_detail

  # Assembles the shared error envelope; tolerates a missing usage extractor.
  def self.error_envelope(error_msg, response, parsed, usage_extractor)
    {
      success: false,
      result: error_msg,
      usage: usage_extractor ? usage_extractor.call(parsed) : {},
      response: { error: { message: error_msg } },
      status: 'error',
      code: response.status
    }
  end
  private_class_method :error_envelope
end
72
+ end
73
+ end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SkillBench
6
+ module Clients
7
# Parses LLM provider responses and extracts messages and usage data.
# Handles JSON parsing, message extraction, and validation.
class ResponseParser
  # Parses the response body into a Hash.
  #
  # A Hash body is returned untouched. A String body is decoded as JSON with
  # symbolized keys. Anything that does not end up as a Hash (nil, Array,
  # JSON scalars/arrays, malformed JSON) is wrapped as
  # { error: { message: <body as string> } } so callers always get a Hash.
  #
  # @param response [Faraday::Response] The HTTP response (only #body is read)
  # @return [Hash] Parsed response body
  def self.parse_body(response)
    raw = response.body
    return raw if raw.is_a?(Hash)
    return error_body(raw) unless raw.is_a?(String)

    decoded = JSON.parse(raw, symbolize_names: true)
    decoded.is_a?(Hash) ? decoded : error_body(raw)
  rescue JSON::ParserError
    error_body(response.body)
  end

  # Strips a surrounding markdown code fence from a string if present.
  #
  # Whitespace around the fenced block is ignored when detecting the fence.
  # Text that is not fenced (or is not a String) is returned unchanged.
  #
  # @param text [String] The text to clean
  # @return [String] Cleaned text
  def self.strip_markdown_fences(text)
    return text unless text.is_a?(String)

    trimmed = text.strip
    return text unless trimmed.start_with?('```')

    lines = trimmed.lines
    lines.shift # opening fence, possibly with a language tag
    lines.pop if lines.last&.strip == '```'
    lines.join.strip
  end

  # Checks if a message is valid (has content or tool calls).
  #
  # @param message [Hash, String, nil] The message to validate
  # @return [Boolean] True if the message is valid
  def self.valid_message?(message)
    return false if message.nil?

    content = extract_content(message)
    tool_calls = extract_tool_calls(message)

    !content.nil? || !Array(tool_calls).empty?
  end

  # Extracts the content from a message.
  # A non-Hash message is treated as the content itself.
  #
  # @param message [Hash, String] The message
  # @return [String, nil] The content or nil
  def self.extract_content(message)
    return message unless message.is_a?(Hash)

    message[:content] || message['content']
  end

  # Extracts tool calls from a message.
  #
  # @param message [Hash] The message
  # @return [Array, nil] The tool calls or nil
  def self.extract_tool_calls(message)
    return nil unless message.is_a?(Hash)

    message[:tool_calls] || message['tool_calls']
  end

  # Extracts the message of the first choice from an OpenAI-compatible body.
  #
  # @param body [Hash] The parsed response body
  # @return [Hash, nil] The message or nil
  def self.extract_openai_message(body)
    choices = body[:choices] || body['choices']
    return nil unless choices&.any?

    choices.first[:message] || choices.first['message']
  end

  # Extracts usage data from an OpenAI-compatible response.
  #
  # @param body [Hash] The parsed response body
  # @return [Hash] Usage data, or an empty Hash when absent
  def self.extract_openai_usage(body)
    body[:usage] || body['usage'] || {}
  end

  # Wraps an unparseable or non-Hash body in the error shape used by parse_body.
  def self.error_body(raw)
    { error: { message: raw.to_s } }
  end
  private_class_method :error_body
end
92
+ end
93
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require_relative '../error_logger'
5
+
6
+ module SkillBench
7
+ module Clients
8
# Service object for retrying HTTP requests with exponential backoff.
# Retries Faraday errors whose response status is transient (429, 503);
# every other error is re-raised immediately.
# Returns the block result on success.
class RetryHandler
  RETRYABLE_STATUSES = [429, 503].freeze

  MAX_DELAY = 30 # Maximum delay cap in seconds

  # Executes the given block with retry logic.
  #
  # @param max_attempts [Integer] Maximum number of attempts (default: 3).
  # @param base_delay [Numeric] Base delay in seconds before first retry (doubles each attempt).
  # @yield The request block to execute.
  # @return [Object] The block's return value on success.
  # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
  # @raise [ArgumentError] if no block is given or max_attempts < 1.
  def self.call(max_attempts: 3, base_delay: 1, &block)
    raise ArgumentError, 'RetryHandler requires a block' unless block
    raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1

    new(max_attempts: max_attempts, base_delay: base_delay, block: block).call
  end

  # @param max_attempts [Integer] Maximum number of attempts.
  # @param base_delay [Numeric] Base delay before first retry.
  # @param block [Proc] The request block to execute.
  def initialize(max_attempts:, base_delay:, block:)
    @max_attempts = max_attempts
    @base_delay = base_delay
    @block = block
  end

  # Executes the block with retry logic.
  #
  # Uses begin/rescue/retry rather than Kernel#loop: loop silently rescues
  # StopIteration, which would swallow such an exception raised by the block.
  # The retry is bounded by the attempt counter checked in #retryable?.
  #
  # @return [Object] The block's return value on success.
  # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
  def call
    attempt = 0

    begin
      attempt += 1
      @block.call
    rescue Faraday::Error => e
      raise unless retryable?(extract_status(e), attempt)

      wait(compute_delay(attempt))
      retry
    end
  end

  private

  # True when the status is transient and attempts remain.
  def retryable?(status, attempt)
    RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
  end

  # Exponential backoff: base, 2x base, 4x base, ... capped at MAX_DELAY.
  def compute_delay(attempt)
    [@base_delay * (2**(attempt - 1)), MAX_DELAY].min
  end

  # Reads the HTTP status from the error when it exposes one; 0 otherwise.
  def extract_status(error)
    error.respond_to?(:response_status) ? error.response_status : 0
  end

  # Isolated so the pause can be replaced (e.g. stubbed) without touching #call.
  def wait(delay)
    sleep(delay)
  end
end
77
+ end
78
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+
6
+ module SkillBench
7
+ module Commands
8
# Handles the `skill-bench eval new` command.
# Scaffolds evals/<name>/ with task.md, criteria.json and, for the rails
# runtime, a rails_helper.rb.
class EvalNew
  # Allowed runtime values for eval scaffolding.
  ALLOWED_RUNTIMES = %w[ruby rails].freeze
  # Directory (relative to the working directory) that holds all evals.
  EVALS_DIR = 'evals'

  # Run the eval new command
  # @param name [String] Eval name; must resolve to a directory inside evals/
  # @param runtime [String] "ruby" or "rails" (default: ruby)
  # @return [void]
  # @raise [ArgumentError] if runtime is not in ALLOWED_RUNTIMES, or if name is
  #   blank or resolves to evals/ itself or a location outside it.
  def self.run(name:, runtime: 'ruby')
    unless ALLOWED_RUNTIMES.include?(runtime)
      raise ArgumentError, "Unsupported runtime '#{runtime}'. Allowed: #{ALLOWED_RUNTIMES.join(', ')}"
    end

    # Validate before touching the filesystem so a bad name leaves no trace.
    eval_path = resolve_eval_path(name)
    FileUtils.mkdir_p(eval_path)

    create_task_md(eval_path, name)
    create_criteria_json(eval_path, runtime)
    create_rails_files(eval_path, name) if runtime == 'rails'
  end

  # Create task.md for the eval
  # @param path [String] Eval directory path
  # @param name [String] Eval name
  # @return [void]
  def self.create_task_md(path, name)
    File.write(File.join(path, 'task.md'), task_template(name))
  end

  # Create criteria.json for the eval
  # @param path [String] Eval directory path
  # @param runtime [String] Runtime type
  # @return [void]
  def self.create_criteria_json(path, runtime)
    criteria = default_criteria(runtime)
    File.write(File.join(path, 'criteria.json'), JSON.pretty_generate(criteria))
  end

  # Generate task.md template
  # @param name [String] Eval name
  # @return [String] Markdown template
  def self.task_template(name)
    <<~MARKDOWN
      # Eval: #{name}

      ## Task
      Describe the task for the agent here.

      ## Success Criteria
      Define what constitutes a successful completion.
    MARKDOWN
  end

  # Generate default criteria hash.
  #
  # @param runtime [String] Runtime type.
  # @return [Hash] Criteria configuration in the new format.
  def self.default_criteria(runtime)
    {
      context: "Evaluate #{runtime} task",
      dimensions: [
        { name: 'correctness', max_score: 30 },
        { name: 'skill_adherence', max_score: 25 },
        { name: 'code_quality', max_score: 20 },
        { name: 'test_coverage', max_score: 15 },
        { name: 'documentation', max_score: 10 }
      ],
      pass_threshold: 70,
      minimum_delta: 10
    }
  end

  # Create Rails-specific files for the eval
  # @param path [String] Eval directory path
  # @param _name [String] Eval name
  # @return [void]
  def self.create_rails_files(path, _name)
    File.write(File.join(path, 'rails_helper.rb'), "require 'rails_helper'\n")
  end

  # Builds the eval directory path and rejects names that are blank or that
  # resolve to EVALS_DIR itself or anywhere outside it (e.g. ".", "..").
  def self.resolve_eval_path(name)
    label = name.to_s
    eval_path = File.join(EVALS_DIR, label)
    evals_root = "#{File.expand_path(EVALS_DIR)}#{File::SEPARATOR}"
    inside = File.expand_path(eval_path).start_with?(evals_root)
    return eval_path if inside && !label.strip.empty?

    raise ArgumentError, "Invalid eval name '#{name}'. It must name a directory inside '#{EVALS_DIR}/'."
  end
  private_class_method :resolve_eval_path
end
88
+ end
89
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative '../clients/provider_schemas'
5
+
6
+ module SkillBench
7
+ module Commands
8
# Implements the `skill-bench init` command: writes a single-provider
# configuration file named by SkillBench::Config::CONFIG_FILENAME.
class Init
  class << self
    # Writes the config file for the given provider.
    #
    # @param provider [Symbol] LLM provider name (e.g., :openai, :gemini)
    # @param force [Boolean] Whether to overwrite an existing config file.
    # @return [void]
    # @raise [RuntimeError] if config file exists and force is false
    # @raise [ArgumentError] if provider is not registered
    def run(provider:, force: false)
      target = SkillBench::Config::CONFIG_FILENAME
      already_there = File.exist?(target)
      if already_there && !force
        raise "Config file '#{target}' already exists. Use --force to overwrite."
      end

      # The provider settings are resolved before the file is written.
      settings = config_for_provider(provider)
      File.write(target, JSON.pretty_generate(settings))
    end

    # Builds the configuration hash for one provider.
    #
    # @param provider [Symbol] LLM provider name
    # @return [Hash] Single-provider configuration
    # @raise [ArgumentError] if provider is not registered
    def config_for_provider(provider)
      provider_settings = SkillBench::Clients::ProviderSchemas.for(provider)
      { provider: provider, max_execution_time: 30, config: provider_settings }
    end
  end
end
38
+ end
39
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../services/runner_service'
4
+
5
+ module SkillBench
6
+ module Commands
7
# Implements the `skill-bench run` command by delegating to the runner service.
class Run
  class << self
    # Runs an eval with the given skill(s).
    #
    # @param eval_name [String] Name of eval to run (e.g., 'test-eval' or 'evals/test-eval')
    # @param skill_names [Array<String>] Names of skills to use
    # @return [Hash] Result with pass/fail and score
    def run(eval_name:, skill_names:)
      runner_options = { eval_name: eval_name, skill_names: skill_names }
      Services::RunnerService.call(**runner_options)
    end
  end
end
20
+ end
21
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require_relative '../rails/skill_templates'
5
+
6
+ module SkillBench
7
+ module Commands
8
# Handles the `skill-bench skill new` command.
# Scaffolds skills/<name>/ with a SKILL.md, a Ruby skill class, or a Rails
# template file, depending on the mode.
class SkillNew
  # Supported scaffolding modes.
  MODES = %w[simple advanced rails].freeze

  # Maps each Rails template type to the file it is written to.
  RAILS_TEMPLATES = {
    'service_object' => 'service.rb',
    'concern' => 'concern.rb',
    'active_record_model' => 'model.rb'
  }.freeze

  # Run the skill new command
  # @param name [String] Skill name
  # @param mode [String] "simple", "advanced", or "rails"
  # @param template [String] Rails template type (service_object, concern, active_record_model)
  # @return [void]
  # @raise [ArgumentError] if mode is invalid, or if mode is "rails" and
  #   template is not a key of RAILS_TEMPLATES
  def self.run(name:, mode: 'simple', template: 'service_object')
    # Validate everything first so an invalid call leaves no empty directory.
    raise ArgumentError, "Invalid mode: #{mode}. Use 'simple', 'advanced', or 'rails'." unless MODES.include?(mode)

    rails_file_name(template) if mode == 'rails'

    skill_path = File.join('skills', name)
    FileUtils.mkdir_p(skill_path)

    case mode
    when 'simple' then create_simple_skill(skill_path, name)
    when 'advanced' then create_advanced_skill(skill_path, name)
    when 'rails' then create_rails_skill(skill_path, name, template)
    end
  end

  # Create a simple skill with SKILL.md
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @return [void]
  def self.create_simple_skill(path, name)
    File.write(File.join(path, 'SKILL.md'), simple_skill_template(name))
  end

  # Create an advanced skill with Ruby class
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @return [void]
  def self.create_advanced_skill(path, name)
    File.write(File.join(path, 'skill.rb'), advanced_skill_template(name))
  end

  # Generate simple skill template
  # @param name [String] Skill name
  # @return [String] Markdown template
  def self.simple_skill_template(name)
    <<~MARKDOWN
      # Skill: #{name}

      ## Description
      Add skill description here.

      ## Context
      Add context injection content here.

      ## Workflow
      Add workflow steps here.
    MARKDOWN
  end

  # Convert a snake_case, kebab-case or space-separated name to CamelCase.
  # Hyphens are treated as separators so that names such as "my-skill"
  # produce a valid Ruby constant ("MySkill").
  # @param string [String] String to convert
  # @return [String] CamelCase string
  def self.camelize(string)
    string.split(/[-_\s]+/).map(&:capitalize).join
  end

  # Generate advanced skill template
  # @param name [String] Skill name
  # @return [String] Ruby class template
  def self.advanced_skill_template(name)
    class_name = camelize(name)
    <<~RUBY
      # frozen_string_literal: true

      module SkillBench
        module Skills
          class #{class_name}
            def initialize; end

            def call
              # Implement skill logic here
            end
          end
        end
      end
    RUBY
  end

  # Create a Rails skill using templates
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @param template [String] Template type (service_object, concern, active_record_model)
  # @return [void]
  # @raise [ArgumentError] if template is not a key of RAILS_TEMPLATES
  def self.create_rails_skill(path, name, template)
    file_name = rails_file_name(template)

    # public_send is only reached for whitelisted template names.
    content = Rails::SkillTemplates.public_send(template.to_sym, name)
    File.write(File.join(path, file_name), content)
  end

  # Looks up the output file for a Rails template, raising on unknown types.
  def self.rails_file_name(template)
    file_name = RAILS_TEMPLATES[template]
    return file_name if file_name

    raise ArgumentError, "Invalid template: #{template}. Use one of: #{RAILS_TEMPLATES.keys.join(', ')}."
  end
  private_class_method :rails_file_name
end
114
+ end
115
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ class Config
5
# Pushes a normalized configuration hash into a mutable store and reports
# the outcome as a result envelope.
class Applier
  # Convenience entry point: builds an applier and runs it.
  #
  # @param store [Store] mutable configuration store
  # @param data [Hash] normalized configuration values
  # @return [Hash] result envelope with applied status
  def self.call(store:, data:)
    new(store: store, data: data).call
  end

  # @param store [Store] mutable configuration store
  # @param data [Hash] normalized configuration values
  # @return [Applier] an applier instance
  def initialize(store:, data:)
    @store = store
    @data = data
  end

  # Writes the scalar settings, then the provider settings, to the store.
  # Any StandardError is logged and turned into a failure envelope.
  #
  # @return [Hash] result envelope with applied status
  def call
    write_scalars
    write_providers
    { success: true, response: { applied: true } }
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'Applier Error')
    { success: false, response: { error: { message: e.message } } }
  end

  private

  # Current provider first, then each scalar that is present in the data.
  def write_scalars
    write_current_provider
    @store.assign_max_execution_time(@data[:max_execution_time]) if @data.key?(:max_execution_time)
    @store.assign_allowed_commands(@data[:allowed_commands]) if @data.key?(:allowed_commands)
  end

  # A full :llm_providers_config replaces the store's provider config with a
  # per-provider copy; otherwise :providers (or an empty hash) is applied.
  def write_providers
    unless @data.key?(:llm_providers_config)
      @store.apply_provider_config(@data[:providers] || {})
      return
    end

    snapshot = @data[:llm_providers_config].transform_values(&:dup)
    @store.replace_provider_config(snapshot)
  end

  # Skipped when the key is absent or the value is blank after stripping.
  def write_current_provider
    return unless @data.key?(:current_llm_provider)

    label = @data[:current_llm_provider].to_s.strip
    @store.assign_current_llm_provider(label.to_sym) unless label.empty?
  end
end
66
+ end
67
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ class Config
5
# Supplies the default evaluator configuration.
class Defaults
  class << self
    # Wraps the default configuration in a result envelope.
    #
    # @return [Hash] result envelope with default provider, timeout, command, and provider settings
    def call
      { success: true, response: { config: config } }
    end

    # Builds a fresh default configuration hash on every call.
    #
    # @return [Hash] default provider, timeout, command, and provider settings
    def config
      {
        current_llm_provider: :openai,
        max_execution_time: 30,
        allowed_commands: nil,
        llm_providers_config: provider_defaults
      }
    end

    private

    # Per-provider default settings, keyed by provider name.
    def provider_defaults
      {
        openai: { api_key: nil, model: 'gpt-4o' },
        anthropic: { api_key: nil, model: 'claude-sonnet-4-20250514' },
        gemini: gemini_defaults,
        ollama: { api_key: nil, model: 'qwen:7b', base_url: nil },
        azure: { api_key: nil, model: 'gpt-4', endpoint: nil, api_version: nil },
        groq: { api_key: nil, model: 'llama-3.3-70b-versatile' },
        deepseek: { api_key: nil, model: 'deepseek-chat' },
        opencode: { api_key: nil, model: 'opencode-model', base_url: nil },
        openrouter: { api_key: nil, model: 'anthropic/claude-3.5-sonnet' }
      }
    end

    # Gemini carries location and project settings in addition to key and model.
    def gemini_defaults
      {
        api_key: nil,
        model: 'gemini-1.5-flash-latest',
        location: 'us-central1',
        project_id: nil
      }
    end
  end
end
41
+ end
42
+ end