ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Clients
|
|
7
|
+
# Builds and executes HTTP requests to LLM provider APIs.
|
|
8
|
+
# Encapsulates Faraday connection setup and request execution.
|
|
9
|
+
class RequestBuilder
  # Default connection-open timeout handed to Faraday.
  DEFAULT_OPEN_TIMEOUT = 10
  # Default overall request timeout handed to Faraday.
  DEFAULT_TIMEOUT = 120

  class << self
    # Builds a Faraday connection that uses the JSON request and response
    # middleware and carries the given timeouts.
    #
    # @param base_url [String] The API base URL
    # @param open_timeout [Integer] Connection open timeout in seconds
    # @param timeout [Integer] Request timeout in seconds
    # @return [Faraday::Connection] Configured Faraday connection
    def build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
      Faraday.new(url: base_url) do |faraday|
        faraday.request :json
        faraday.response :json
        faraday.options.open_timeout = open_timeout
        faraday.options.timeout = timeout
      end
    end

    # Sends a POST to +path+ on the given connection.
    # The headers are merged into the request headers and the body is
    # serialized with +to_json+ before it is assigned to the request.
    #
    # @param connection [Faraday::Connection] The Faraday connection
    # @param path [String] The request path
    # @param headers [Hash] Request headers
    # @param body [Hash] Request body
    # @return [Faraday::Response] The HTTP response
    def execute(connection, path, headers:, body:)
      connection.post(path) do |request|
        request.headers.update(headers)
        request.body = body.to_json
      end
    end
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Clients
|
|
5
|
+
# Handles error responses and logging for LLM provider clients.
|
|
6
|
+
# Encapsulates error formatting, logging, and exception handling.
|
|
7
|
+
class ResponseErrorHandler
  # Prefix used for every failed-request message.
  API_FAILED = 'API Request failed'
  # Message used when a response carries no message content.
  MISSING_MESSAGE = 'LLM response missing message content'

  # Creates an error response for failed HTTP requests.
  #
  # The message is "API Request failed: <status>", followed by " - <detail>"
  # when the parsed body carries an error message (under :error / 'error',
  # or at the top level) or any other non-empty detail.
  #
  # @param response [Faraday::Response] The HTTP response (only +status+ is read)
  # @param parsed [Hash, Object] Parsed response body
  # @yield [parsed] Optional block that extracts usage data from the parsed body
  # @return [Hash] Standardized error response
  def self.failure_response(response, parsed, &usage_extractor)
    error_envelope(failure_message(response, parsed), response, parsed, usage_extractor)
  end

  # Creates an error response when the LLM response has no message content.
  #
  # @param response [Faraday::Response] The HTTP response (only +status+ is read)
  # @param parsed [Hash, Object] Parsed response body
  # @yield [parsed] Optional block that extracts usage data from the parsed body
  # @return [Hash] Standardized error response
  def self.missing_message_response(response, parsed, &usage_extractor)
    error_envelope(MISSING_MESSAGE, response, parsed, usage_extractor)
  end

  # Handles an exception by logging it and returning a standardized error response.
  #
  # @param error [StandardError] The exception that occurred
  # @param type [String] The error type label, used as the message prefix
  # @return [Hash] Standardized error response
  def self.handle_exception(error, type)
    log_error(error)
    { success: false, result: "#{type}: #{error.message}", status: 'error' }
  end

  # Logs an exception by delegating to SkillBench::ErrorLogger.log_error.
  #
  # @param error [StandardError] The exception to log
  # @return [void]
  def self.log_error(error)
    SkillBench::ErrorLogger.log_error(error)
  end

  # Builds the human-readable message for a failed HTTP request.
  #
  # @param response [#status] The HTTP response
  # @param parsed [Hash, Object] Parsed response body
  # @return [String] The failure message
  def self.failure_message(response, parsed)
    base = "#{API_FAILED}: #{response.status}"
    # Prefer the nested error payload; fall back to the whole body.
    detail = parsed.is_a?(Hash) ? (parsed[:error] || parsed['error'] || parsed) : parsed
    message = detail[:message] || detail['message'] if detail.is_a?(Hash)

    if message
      "#{base} - #{message}"
    elsif detail.to_s.empty?
      base
    else
      "#{base} - #{detail}"
    end
  end

  # Assembles the error envelope shared by all HTTP-level failures.
  #
  # @param error_msg [String] The message placed in :result and :response
  # @param response [#status] The HTTP response
  # @param parsed [Hash, Object] Parsed response body
  # @param usage_extractor [Proc, nil] Usage extractor; nil when the caller passed no block
  # @return [Hash] Standardized error response
  def self.error_envelope(error_msg, response, parsed, usage_extractor)
    {
      success: false,
      result: error_msg,
      # A caller that passes no block gets empty usage instead of a NoMethodError.
      usage: usage_extractor ? usage_extractor.call(parsed) : {},
      response: { error: { message: error_msg } },
      status: 'error',
      code: response.status
    }
  end

  private_class_method :failure_message, :error_envelope
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Clients
|
|
7
|
+
# Parses LLM provider responses and extracts messages and usage data.
|
|
8
|
+
# Handles JSON parsing, message extraction, and validation.
|
|
9
|
+
class ResponseParser
  # Parses the response body into a Hash.
  #
  # A Hash body is returned untouched. A String body is decoded as JSON with
  # symbolized keys. Anything that does not end up as a Hash (nil or empty
  # body, an Array, invalid JSON, or JSON that decodes to an array/scalar)
  # is wrapped in an error envelope carrying the raw body as its message.
  #
  # @param response [Faraday::Response] The HTTP response (only +body+ is read)
  # @return [Hash] Parsed response body or an error envelope
  def self.parse_body(response)
    raw = response.body
    return raw if raw.is_a?(Hash)

    # Only Strings go to JSON.parse; nil and other objects would raise TypeError.
    decoded = raw.is_a?(String) ? JSON.parse(raw, symbolize_names: true) : nil
    decoded.is_a?(Hash) ? decoded : unparseable_body(raw)
  rescue JSON::ParserError
    unparseable_body(response.body)
  end

  # Strips markdown code fences from a string if present.
  #
  # Only text that starts with ``` is altered: the opening fence line is
  # removed, a closing ``` line is removed when it is the last line, and the
  # remainder is stripped of surrounding whitespace.
  #
  # @param text [String, Object] The text to clean
  # @return [String, Object] Cleaned text, or the input unchanged
  def self.strip_markdown_fences(text)
    return text unless text.is_a?(String) && text.start_with?('```')

    inner = text.lines.drop(1)
    inner.pop if inner.last&.strip == '```'
    inner.join.strip
  end

  # Checks if a message is valid (has content or tool calls).
  #
  # @param message [Hash, String, nil] The message to validate
  # @return [Boolean] True if the message has non-nil content or at least one tool call
  def self.valid_message?(message)
    return false if message.nil?

    !extract_content(message).nil? || !Array(extract_tool_calls(message)).empty?
  end

  # Extracts the content from a message.
  # A non-Hash message is treated as the content itself.
  #
  # @param message [Hash, String] The message
  # @return [String, nil] The content or nil
  def self.extract_content(message)
    return message unless message.is_a?(Hash)

    message[:content] || message['content']
  end

  # Extracts tool calls from a message.
  #
  # @param message [Hash] The message
  # @return [Array, nil] The tool calls, or nil when absent or the message is not a Hash
  def self.extract_tool_calls(message)
    return nil unless message.is_a?(Hash)

    message[:tool_calls] || message['tool_calls']
  end

  # Extracts the message of the first choice from an OpenAI-compatible response body.
  #
  # @param body [Hash] The parsed response body
  # @return [Hash, nil] The message or nil when there are no choices
  def self.extract_openai_message(body)
    choices = body[:choices] || body['choices']
    return nil unless choices&.any?

    first_choice = choices.first
    first_choice[:message] || first_choice['message']
  end

  # Extracts usage data from an OpenAI-compatible response.
  #
  # @param body [Hash] The parsed response body
  # @return [Hash] Usage data, or an empty Hash when absent
  def self.extract_openai_usage(body)
    body[:usage] || body['usage'] || {}
  end

  # Wraps a body that could not be interpreted as a JSON object.
  #
  # @param raw [Object] The raw response body
  # @return [Hash] Error envelope with the stringified body as message
  def self.unparseable_body(raw)
    { error: { message: raw.to_s } }
  end

  private_class_method :unparseable_body
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require_relative '../error_logger'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Clients
|
|
8
|
+
# Service object for retrying HTTP requests with exponential backoff.
|
|
9
|
+
# Retries on transient errors (429, 503). Raises permanent errors immediately.
|
|
10
|
+
# Returns the block result on success.
|
|
11
|
+
class RetryHandler
  # HTTP statuses treated as transient and therefore retried.
  RETRYABLE_STATUSES = [429, 503].freeze

  MAX_DELAY = 30 # Maximum delay cap in seconds

  # Executes the given block with retry logic.
  #
  # @param max_attempts [Integer] Maximum number of attempts (default: 3).
  # @param base_delay [Numeric] Base delay in seconds before first retry (doubles each attempt).
  # @yield The request block to execute.
  # @return [Object] The block's return value on success.
  # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
  # @raise [ArgumentError] if no block is given or max_attempts < 1.
  def self.call(max_attempts: 3, base_delay: 1, &block)
    raise ArgumentError, 'RetryHandler requires a block' unless block
    raise ArgumentError, 'max_attempts must be >= 1' if max_attempts < 1

    new(max_attempts:, base_delay:, block:).call
  end

  # @param max_attempts [Integer] Maximum number of attempts.
  # @param base_delay [Numeric] Base delay before first retry.
  # @param block [Proc] The request block to execute.
  def initialize(max_attempts:, base_delay:, block:)
    @max_attempts = max_attempts
    @base_delay = base_delay
    @block = block
  end

  # Executes the block, retrying while it raises a Faraday::Error whose
  # response status is retryable and attempts remain.
  #
  # Uses begin/rescue/retry rather than Kernel#loop: `loop` rescues
  # StopIteration, which would silently swallow that exception if the
  # request block raised it.
  #
  # @return [Object] The block's return value on success.
  # @raise [Faraday::Error] On non-retryable errors or after exhausting retries.
  def call
    attempt = 0

    begin
      attempt += 1
      @block.call
    rescue Faraday::Error => e
      raise e unless retryable?(extract_status(e), attempt)

      wait(compute_delay(attempt))
      retry
    end
  end

  private

  # True when the status is transient and another attempt is allowed.
  def retryable?(status, attempt)
    RETRYABLE_STATUSES.include?(status) && attempt < @max_attempts
  end

  # Exponential backoff: base_delay * 2^(attempt - 1), capped at MAX_DELAY.
  def compute_delay(attempt)
    [@base_delay * (2**(attempt - 1)), MAX_DELAY].min
  end

  # Reads the HTTP status from the error; 0 when the error exposes none.
  def extract_status(error)
    error.respond_to?(:response_status) ? error.response_status : 0
  end

  # Blocks the current thread for +delay+ seconds.
  def wait(delay)
    sleep(delay)
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Commands
|
|
8
|
+
# Handles the `skill-bench eval new` command
|
|
9
|
+
class EvalNew
  # Allowed runtime values for eval scaffolding.
  ALLOWED_RUNTIMES = %w[ruby rails].freeze

  class << self
    # Scaffolds a new eval under evals/<name>: task.md, criteria.json and,
    # for the rails runtime, a rails_helper.rb.
    #
    # @param name [String] Eval name
    # @param runtime [String] "ruby" or "rails" (default: ruby)
    # @return [void]
    # @raise [ArgumentError] if runtime is not in ALLOWED_RUNTIMES.
    def run(name:, runtime: 'ruby')
      unless ALLOWED_RUNTIMES.include?(runtime)
        raise ArgumentError, "Unsupported runtime '#{runtime}'. Allowed: #{ALLOWED_RUNTIMES.join(', ')}"
      end

      target_dir = File.join('evals', name)
      FileUtils.mkdir_p(target_dir)

      create_task_md(target_dir, name)
      create_criteria_json(target_dir, runtime)
      create_rails_files(target_dir, name) if runtime == 'rails'
    end

    # Writes task.md into the eval directory.
    #
    # @param path [String] Eval directory path
    # @param name [String] Eval name
    # @return [void]
    def create_task_md(path, name)
      destination = File.join(path, 'task.md')
      File.write(destination, task_template(name))
    end

    # Writes criteria.json (pretty-printed default criteria) into the eval directory.
    #
    # @param path [String] Eval directory path
    # @param runtime [String] Runtime type
    # @return [void]
    def create_criteria_json(path, runtime)
      destination = File.join(path, 'criteria.json')
      File.write(destination, JSON.pretty_generate(default_criteria(runtime)))
    end

    # Renders the task.md skeleton for an eval.
    #
    # @param name [String] Eval name
    # @return [String] Markdown template
    def task_template(name)
      <<~MARKDOWN
        # Eval: #{name}

        ## Task
        Describe the task for the agent here.

        ## Success Criteria
        Define what constitutes a successful completion.
      MARKDOWN
    end

    # Builds the default criteria hash: five scoring dimensions, a pass
    # threshold and a minimum delta.
    #
    # @param runtime [String] Runtime type.
    # @return [Hash] Criteria configuration in the new format.
    def default_criteria(runtime)
      weights = [
        ['correctness', 30],
        ['skill_adherence', 25],
        ['code_quality', 20],
        ['test_coverage', 15],
        ['documentation', 10]
      ]

      {
        context: "Evaluate #{runtime} task",
        dimensions: weights.map { |label, points| { name: label, max_score: points } },
        pass_threshold: 70,
        minimum_delta: 10
      }
    end

    # Writes the Rails-specific helper file for the eval.
    #
    # @param path [String] Eval directory path
    # @param _name [String] Eval name (unused)
    # @return [void]
    def create_rails_files(path, _name)
      destination = File.join(path, 'rails_helper.rb')
      File.write(destination, "require 'rails_helper'\n")
    end
  end
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative '../clients/provider_schemas'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Commands
|
|
8
|
+
# Handles the `skill-bench init` command.
|
|
9
|
+
# Generates a skill-bench.json config file with single-provider settings.
|
|
10
|
+
class Init
  class << self
    # Writes the config file (SkillBench::Config::CONFIG_FILENAME) for a
    # single provider as pretty-printed JSON.
    #
    # @param provider [Symbol] LLM provider name (e.g., :openai, :gemini)
    # @param force [Boolean] Whether to overwrite an existing config file.
    # @return [void]
    # @raise [RuntimeError] if config file exists and force is false
    # @raise [ArgumentError] if provider is not registered
    #   (presumably raised by ProviderSchemas.for; confirm there)
    def run(provider:, force: false)
      target = SkillBench::Config::CONFIG_FILENAME
      if !force && File.exist?(target)
        raise "Config file '#{target}' already exists. Use --force to overwrite."
      end

      settings = config_for_provider(provider)
      File.write(target, JSON.pretty_generate(settings))
    end

    # Generates configuration hash for a specific provider.
    #
    # @param provider [Symbol] LLM provider name
    # @return [Hash] Single-provider configuration
    # @raise [ArgumentError] if provider is not registered
    #   (presumably raised by ProviderSchemas.for; confirm there)
    def config_for_provider(provider)
      schema = SkillBench::Clients::ProviderSchemas.for(provider)
      { provider: provider, max_execution_time: 30, config: schema }
    end
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../services/runner_service'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Commands
|
|
7
|
+
# Handles the `skill-bench run` command
|
|
8
|
+
class Run
  # Runs an eval with the specified skill(s) by delegating to
  # Services::RunnerService.call.
  #
  # @param eval_name [String] Name of eval to run (e.g., 'test-eval' or 'evals/test-eval')
  # @param skill_names [Array<String>] Names of skills to use
  # @return [Hash] Whatever RunnerService.call returns
  #   (documented upstream as a result with pass/fail and score)
  def self.run(eval_name:, skill_names:)
    Services::RunnerService.call(eval_name:, skill_names:)
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require_relative '../rails/skill_templates'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Commands
|
|
8
|
+
# Handles the `skill-bench skill new` command
|
|
9
|
+
class SkillNew
  # Modes accepted by .run.
  MODES = %w[simple advanced rails].freeze

  # Maps each Rails template type to the file it is written to.
  RAILS_TEMPLATES = {
    'service_object' => 'service.rb',
    'concern' => 'concern.rb',
    'active_record_model' => 'model.rb'
  }.freeze

  # Run the skill new command.
  #
  # Mode and (for rails) template are validated before anything is written,
  # so invalid input does not leave an empty skills/<name> directory behind.
  #
  # @param name [String] Skill name
  # @param mode [String] "simple", "advanced", or "rails"
  # @param template [String] Rails template type (service_object, concern, active_record_model)
  # @return [void]
  # @raise [ArgumentError] if mode is invalid, or if mode is "rails" and template is invalid
  def self.run(name:, mode: 'simple', template: 'service_object')
    raise ArgumentError, "Invalid mode: #{mode}. Use 'simple', 'advanced', or 'rails'." unless MODES.include?(mode)

    rails_file_name(template) if mode == 'rails' # validate up front; raises on unknown template

    skill_path = File.join('skills', name)
    FileUtils.mkdir_p(skill_path)

    case mode
    when 'simple' then create_simple_skill(skill_path, name)
    when 'advanced' then create_advanced_skill(skill_path, name)
    when 'rails' then create_rails_skill(skill_path, name, template)
    end
  end

  # Create a simple skill with SKILL.md
  #
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @return [void]
  def self.create_simple_skill(path, name)
    File.write(File.join(path, 'SKILL.md'), simple_skill_template(name))
  end

  # Create an advanced skill with a Ruby class in skill.rb
  #
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @return [void]
  def self.create_advanced_skill(path, name)
    File.write(File.join(path, 'skill.rb'), advanced_skill_template(name))
  end

  # Generate simple skill template
  #
  # @param name [String] Skill name
  # @return [String] Markdown template
  def self.simple_skill_template(name)
    <<~MARKDOWN
      # Skill: #{name}

      ## Description
      Add skill description here.

      ## Context
      Add context injection content here.

      ## Workflow
      Add workflow steps here.
    MARKDOWN
  end

  # Convert a snake_case, kebab-case or space-separated name to CamelCase.
  # Hyphens are treated as separators so that names such as "my-skill"
  # produce a valid Ruby constant ("MySkill") in the generated template.
  #
  # @param string [String] String to convert
  # @return [String] CamelCase string
  def self.camelize(string)
    string.split(/[-_\s]+/).map(&:capitalize).join
  end

  # Generate advanced skill template
  #
  # @param name [String] Skill name
  # @return [String] Ruby class template
  def self.advanced_skill_template(name)
    class_name = camelize(name)
    <<~RUBY
      # frozen_string_literal: true

      module SkillBench
        module Skills
          class #{class_name}
            def initialize; end

            def call
              # Implement skill logic here
            end
          end
        end
      end
    RUBY
  end

  # Create a Rails skill using templates.
  # The content comes from the Rails::SkillTemplates method named after the template.
  #
  # @param path [String] Skill directory path
  # @param name [String] Skill name
  # @param template [String] Template type (service_object, concern, active_record_model)
  # @return [void]
  # @raise [ArgumentError] if template is not a key of RAILS_TEMPLATES
  def self.create_rails_skill(path, name, template)
    file_name = rails_file_name(template)

    # public_send is safe here: template was checked against RAILS_TEMPLATES above.
    content = Rails::SkillTemplates.public_send(template.to_sym, name)
    File.write(File.join(path, file_name), content)
  end

  # Looks up the output file name for a Rails template type.
  #
  # @param template [String] Template type
  # @return [String] File name for the template
  # @raise [ArgumentError] if template is not a key of RAILS_TEMPLATES
  def self.rails_file_name(template)
    file_name = RAILS_TEMPLATES[template]
    raise ArgumentError, "Invalid template: #{template}. Use one of: #{RAILS_TEMPLATES.keys.join(', ')}." unless file_name

    file_name
  end

  private_class_method :rails_file_name
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
class Config
|
|
5
|
+
# Applies normalized configuration hashes to a mutable store.
|
|
6
|
+
class Applier
  # Convenience entry point: builds an applier and runs it.
  #
  # @param store [Store] mutable configuration store
  # @param data [Hash] normalized configuration values
  # @return [Hash] result envelope with applied status
  def self.call(store:, data:)
    new(store: store, data: data).call
  end

  # Initializes the applier.
  #
  # @param store [Store] mutable configuration store
  # @param data [Hash] normalized configuration values
  # @return [Applier] an applier instance
  def initialize(store:, data:)
    @store = store
    @data = data
  end

  # Applies scalar values first, then provider values, to the store.
  # Any StandardError is logged through SkillBench::ErrorLogger and
  # reported as a failure envelope instead of being raised.
  #
  # @return [Hash] result envelope with applied status
  def call
    apply_scalar_values
    apply_provider_values
    { success: true, response: { applied: true } }
  rescue StandardError => error
    SkillBench::ErrorLogger.log_error(error, 'Applier Error')
    { success: false, response: { error: { message: error.message } } }
  end

  private

  # Assigns the current provider, then each scalar setting that is present in the data.
  def apply_scalar_values
    assign_current_provider

    if @data.key?(:max_execution_time)
      @store.assign_max_execution_time(@data[:max_execution_time])
    end

    return unless @data.key?(:allowed_commands)

    @store.assign_allowed_commands(@data[:allowed_commands])
  end

  # A full :llm_providers_config replaces the store's provider config;
  # otherwise the (possibly empty) :providers hash is applied to it.
  def apply_provider_values
    unless @data.key?(:llm_providers_config)
      @store.apply_provider_config(@data[:providers] || {})
      return
    end

    @store.replace_provider_config(copied_provider_config)
  end

  # Assigns the current provider as a Symbol; a missing key or a blank value is skipped.
  def assign_current_provider
    return unless @data.key?(:current_llm_provider)

    normalized = @data[:current_llm_provider].to_s.strip
    @store.assign_current_llm_provider(normalized.to_sym) unless normalized.empty?
  end

  # Copies the provider config one level deep (each provider's settings is dup'ed).
  def copied_provider_config
    @data[:llm_providers_config].transform_values { |settings| settings.dup }
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
class Config
|
|
5
|
+
# Builds the default evaluator configuration state.
|
|
6
|
+
class Defaults
  class << self
    # Returns the default configuration wrapped in a result envelope.
    #
    # @return [Hash] result envelope with default provider, timeout, command, and provider settings
    def call
      { success: true, response: { config: config } }
    end

    # Builds the raw default configuration hash. A fresh Hash is built on every call.
    #
    # @return [Hash] default provider, timeout, command, and provider settings
    def config
      {
        current_llm_provider: :openai,
        max_execution_time: 30,
        allowed_commands: nil,
        llm_providers_config: provider_defaults
      }
    end

    private

    # Default per-provider settings, keyed by provider name.
    def provider_defaults
      {
        openai: provider_entry('gpt-4o'),
        anthropic: provider_entry('claude-sonnet-4-20250514'),
        gemini: provider_entry('gemini-1.5-flash-latest', location: 'us-central1', project_id: nil),
        ollama: provider_entry('qwen:7b', base_url: nil),
        azure: provider_entry('gpt-4', endpoint: nil, api_version: nil),
        groq: provider_entry('llama-3.3-70b-versatile'),
        deepseek: provider_entry('deepseek-chat'),
        opencode: provider_entry('opencode-model', base_url: nil),
        openrouter: provider_entry('anthropic/claude-3.5-sonnet')
      }
    end

    # One provider's settings: an unset api_key, the model, then any extra keys in order.
    def provider_entry(model, **extras)
      { api_key: nil, model: model, **extras }
    end
  end
end
|
|
41
|
+
end
|
|
42
|
+
end
|