ruby-skill-bench 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +299 -23
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +3 -0
- data/lib/skill_bench/clients/base_client.rb +14 -6
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +66 -15
- data/lib/skill_bench/execution/sandbox.rb +76 -14
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +172 -35
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +3 -3
- metadata +19 -36
|
@@ -12,7 +12,8 @@ module SkillBench
|
|
|
12
12
|
#
|
|
13
13
|
# @param messages [Array<Hash>] The conversation history.
|
|
14
14
|
# @param config [Hash] Configuration for this step (client params, system prompt, working dir).
|
|
15
|
-
# @return [Hash] Step outcome containing :continue (boolean), :result (hash, if finished),
|
|
15
|
+
# @return [Hash] Step outcome containing :continue (boolean), :result (hash, if finished),
|
|
16
|
+
# :usage (token usage for this step), and :messages.
|
|
16
17
|
def self.call(messages, config)
|
|
17
18
|
messages = messages.dup
|
|
18
19
|
client_result = Client.call(
|
|
@@ -21,12 +22,14 @@ module SkillBench
|
|
|
21
22
|
tools: Tools.definitions,
|
|
22
23
|
**config[:client_params]
|
|
23
24
|
)
|
|
25
|
+
usage = client_result[:usage] || {}
|
|
24
26
|
|
|
25
27
|
unless client_result[:success]
|
|
26
28
|
error_msg = client_result.dig(:response, :error, :message) || 'Unknown error'
|
|
27
29
|
return {
|
|
28
30
|
continue: false,
|
|
29
31
|
result: client_result,
|
|
32
|
+
usage: usage,
|
|
30
33
|
iteration: build_iteration(thought: '', tools_used: [], observation_summary: error_msg)
|
|
31
34
|
}
|
|
32
35
|
end
|
|
@@ -36,6 +39,7 @@ module SkillBench
|
|
|
36
39
|
return {
|
|
37
40
|
continue: false,
|
|
38
41
|
result: { success: false, response: { error: { message: 'Empty response from LLM' } } },
|
|
42
|
+
usage: usage,
|
|
39
43
|
iteration: build_iteration(thought: '', tools_used: [], observation_summary: 'Empty response from LLM')
|
|
40
44
|
}
|
|
41
45
|
end
|
|
@@ -51,6 +55,7 @@ module SkillBench
|
|
|
51
55
|
return {
|
|
52
56
|
continue: false,
|
|
53
57
|
result: { success: true, response: { content: content } },
|
|
58
|
+
usage: usage,
|
|
54
59
|
iteration: build_iteration(thought: thought, tools_used: [], observation_summary: '')
|
|
55
60
|
}
|
|
56
61
|
end
|
|
@@ -69,6 +74,7 @@ module SkillBench
|
|
|
69
74
|
{
|
|
70
75
|
continue: true,
|
|
71
76
|
messages: messages,
|
|
77
|
+
usage: usage,
|
|
72
78
|
iteration: build_iteration(thought: thought, tools_used: tools_used, observation_summary: observation_summary)
|
|
73
79
|
}
|
|
74
80
|
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../constants'
|
|
3
4
|
require_relative 'react_agent/step'
|
|
4
5
|
require_relative 'react_agent/loop_runner'
|
|
5
6
|
|
|
@@ -29,7 +30,7 @@ module SkillBench
|
|
|
29
30
|
def initialize(params)
|
|
30
31
|
@system_prompt = params[:system_prompt]
|
|
31
32
|
@initial_prompt = params[:initial_prompt]
|
|
32
|
-
@max_iterations = params[:max_iterations] ||
|
|
33
|
+
@max_iterations = params[:max_iterations] || Constants::ReactAgent::DEFAULT_MAX_ITERATIONS
|
|
33
34
|
@working_dir = params[:working_dir] || Dir.pwd
|
|
34
35
|
@container_id = params[:container_id]
|
|
35
36
|
@client_params = params[:client_params] || {}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../output_formatter'
|
|
4
|
+
require_relative '../services/summary_formatter'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Cli
|
|
8
|
+
# Prints the aggregate result of a batch `skill-bench run --all` command.
|
|
9
|
+
#
|
|
10
|
+
# Defaults to the human-readable batch summary, but can instead emit a
|
|
11
|
+
# JUnit document (`format: :junit`) or a JSON gate (`summary: true`). The
|
|
12
|
+
# returned exit code is always {OutputFormatter.batch_exit_code}, so CI
|
|
13
|
+
# gating works identically across every output mode.
|
|
14
|
+
class BatchResultPrinter
|
|
15
|
+
# Prints the aggregate summary and returns the appropriate exit code.
|
|
16
|
+
#
|
|
17
|
+
# @param aggregate [Hash] Aggregate envelope from BatchRunnerService.
|
|
18
|
+
# @param format [Symbol, nil] Output format (:junit for JUnit XML, else human).
|
|
19
|
+
# @param summary [Boolean] When true, print the JSON summary gate instead.
|
|
20
|
+
# @return [Integer] Exit code (0 when all pass, 1 when any fails).
|
|
21
|
+
def self.call(aggregate, format: nil, summary: false)
|
|
22
|
+
puts batch_output(aggregate, format: format, summary: summary)
|
|
23
|
+
OutputFormatter.batch_exit_code(aggregate)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Selects the rendered batch output for the requested mode.
|
|
27
|
+
#
|
|
28
|
+
# `:junit` and `:json` produce machine-readable batch output; `:json` maps
|
|
29
|
+
# to the same JSON gate as `summary: true`. `:html` (and any other format)
|
|
30
|
+
# falls back to the human batch summary, since there is no batch HTML report.
|
|
31
|
+
#
|
|
32
|
+
# @param aggregate [Hash] Aggregate envelope from BatchRunnerService.
|
|
33
|
+
# @param format [Symbol, nil] Output format (:junit, :json, else human).
|
|
34
|
+
# @param summary [Boolean] When true, render the JSON summary gate.
|
|
35
|
+
# @return [String] The formatted batch output.
|
|
36
|
+
def self.batch_output(aggregate, format:, summary:)
|
|
37
|
+
return Services::SummaryFormatter.format(aggregate) if summary || format == :json
|
|
38
|
+
return Services::JUnitFormatter.format_batch(aggregate) if format == :junit
|
|
39
|
+
|
|
40
|
+
OutputFormatter.format_batch(aggregate)
|
|
41
|
+
end
|
|
42
|
+
private_class_method :batch_output
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -9,6 +9,7 @@ module SkillBench
|
|
|
9
9
|
class BaseEvalOptions
|
|
10
10
|
attr_reader :options, :parser
|
|
11
11
|
|
|
12
|
+
# Initializes the option set and the OptionParser used to parse the command's arguments.
|
|
12
13
|
def initialize
|
|
13
14
|
@options = default_options
|
|
14
15
|
@parser = create_parser
|
|
@@ -39,10 +40,12 @@ module SkillBench
|
|
|
39
40
|
class NewEvalOptions < BaseEvalOptions
|
|
40
41
|
protected
|
|
41
42
|
|
|
43
|
+
# @return [Hash] default options for the `eval new` command, with the runtime defaulting to "ruby"
|
|
42
44
|
def default_options
|
|
43
45
|
{ runtime: 'ruby' }
|
|
44
46
|
end
|
|
45
47
|
|
|
48
|
+
# @return [OptionParser] parser for the `eval new` command, handling --runtime and --help
|
|
46
49
|
def create_parser
|
|
47
50
|
OptionParser.new do |opts|
|
|
48
51
|
opts.banner = 'Usage: skill-bench eval new <name> [options]'
|
|
@@ -59,6 +62,7 @@ module SkillBench
|
|
|
59
62
|
class GenerateEvalOptions < BaseEvalOptions
|
|
60
63
|
protected
|
|
61
64
|
|
|
65
|
+
# @return [OptionParser] parser for the `eval generate` command, handling --name and --help
|
|
62
66
|
def create_parser
|
|
63
67
|
OptionParser.new do |opts|
|
|
64
68
|
opts.banner = 'Usage: skill-bench eval generate <skill-name> [options]'
|
|
@@ -20,11 +20,14 @@ module SkillBench
|
|
|
20
20
|
--force Overwrite existing config file
|
|
21
21
|
|
|
22
22
|
run <eval> --skill <name> [--skill <name>] [--format FORMAT] [--pack NAME]
|
|
23
|
-
Run an evaluation
|
|
23
|
+
Run an evaluation (single eval, or a whole directory with --all)
|
|
24
24
|
--skill Skill to use (can be specified multiple times)
|
|
25
25
|
--pack Pack context for registry-based skill resolution
|
|
26
26
|
--registry-manifest PATH Path to registry.json manifest
|
|
27
|
-
--format Output format: human, json, junit (default: human)
|
|
27
|
+
--format Output format: human, json, junit, html (default: human)
|
|
28
|
+
--all Run every eval under evals/ (batch mode)
|
|
29
|
+
--evals-dir DIR Run every eval under DIR (batch mode)
|
|
30
|
+
--summary Emit a JSON summary gate for a batch run (batch mode)
|
|
28
31
|
|
|
29
32
|
compare <skill-name> --variant-a SPEC --variant-b SPEC --eval PATH
|
|
30
33
|
Compare the same skill across two pack variants
|
|
@@ -45,6 +48,11 @@ module SkillBench
|
|
|
45
48
|
Auto-generate an eval from a skill
|
|
46
49
|
--name Name for the generated eval (optional)
|
|
47
50
|
|
|
51
|
+
validate (alias: doctor) [--criteria PATH] [--config PATH]
|
|
52
|
+
Run read-only pre-flight checks (no eval, no network)
|
|
53
|
+
--criteria Criteria JSON to validate (default: criteria.json)
|
|
54
|
+
--config Config file to validate (default: skill-bench.json)
|
|
55
|
+
|
|
48
56
|
Global Options:
|
|
49
57
|
-h, --help Show this help message
|
|
50
58
|
USAGE
|
|
@@ -45,6 +45,7 @@ module SkillBench
|
|
|
45
45
|
OptionParser.new do |opts|
|
|
46
46
|
opts.banner = 'Usage: skill-bench init --<provider> [options]'
|
|
47
47
|
register_provider_options(opts, options)
|
|
48
|
+
opts.on('--mock', 'Generate offline mock config (no API key required)') { options[:provider] = :mock }
|
|
48
49
|
opts.on('--force', 'Overwrite existing config file') { options[:force] = true }
|
|
49
50
|
opts.on('-h', '--help', 'Prints this help') do
|
|
50
51
|
puts opts
|
|
@@ -60,7 +61,7 @@ module SkillBench
|
|
|
60
61
|
end
|
|
61
62
|
|
|
62
63
|
def error_missing_provider
|
|
63
|
-
providers = SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" }.join(', ')
|
|
64
|
+
providers = (SkillBench::Clients::ProviderSchemas.names.map { |provider_name| "--#{provider_name}" } + ['--mock']).join(', ')
|
|
64
65
|
warn "Error: provider is required. Use one of: #{providers}"
|
|
65
66
|
1
|
|
66
67
|
end
|
|
@@ -9,7 +9,7 @@ module SkillBench
|
|
|
9
9
|
# Prints the result and returns the appropriate exit code.
|
|
10
10
|
#
|
|
11
11
|
# @param result [Hash] Result from ScoringService
|
|
12
|
-
# @param format [Symbol] Output format (:human, :json, :junit)
|
|
12
|
+
# @param format [Symbol] Output format (:human, :json, :junit, :html)
|
|
13
13
|
# @return [Integer] Exit code (0 for pass, 1 for fail)
|
|
14
14
|
def self.call(result, format: :human)
|
|
15
15
|
puts OutputFormatter.format(result, format: format)
|
|
@@ -19,7 +19,7 @@ module SkillBench
|
|
|
19
19
|
@argv = argv
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
-
# Parses options and runs the eval.
|
|
22
|
+
# Parses options and runs the eval(s).
|
|
23
23
|
#
|
|
24
24
|
# @return [Integer] Exit code
|
|
25
25
|
def call
|
|
@@ -27,14 +27,9 @@ module SkillBench
|
|
|
27
27
|
parser = build_parser(options)
|
|
28
28
|
parser.parse!(@argv)
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
return error_missing_eval unless eval_name
|
|
32
|
-
return error_missing_skill if options[:skill_names].empty? && !options[:pack]
|
|
30
|
+
return run_batch(options) if batch_requested?(options)
|
|
33
31
|
|
|
34
|
-
options
|
|
35
|
-
exec_options = options.reject { |key| key == :format }
|
|
36
|
-
result = Commands::Run.run(**exec_options)
|
|
37
|
-
ResultPrinter.call(result, format: options[:format] || :human)
|
|
32
|
+
run_single(options)
|
|
38
33
|
rescue HelpRequested
|
|
39
34
|
0
|
|
40
35
|
rescue StandardError => e
|
|
@@ -44,13 +39,56 @@ module SkillBench
|
|
|
44
39
|
|
|
45
40
|
private
|
|
46
41
|
|
|
42
|
+
# Whether a whole-directory batch run was requested.
|
|
43
|
+
#
|
|
44
|
+
# @param options [Hash] Parsed options
|
|
45
|
+
# @return [Boolean] true when --all or --evals-dir was given
|
|
46
|
+
def batch_requested?(options)
|
|
47
|
+
options[:all] || options[:evals_dir]
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Runs a single eval (the original `run <eval> --skill ...` path).
|
|
51
|
+
#
|
|
52
|
+
# @param options [Hash] Parsed options
|
|
53
|
+
# @return [Integer] Exit code
|
|
54
|
+
def run_single(options)
|
|
55
|
+
eval_name = @argv.shift
|
|
56
|
+
return error_missing_eval unless eval_name
|
|
57
|
+
return error_missing_skill if options[:skill_names].empty? && !options[:pack]
|
|
58
|
+
|
|
59
|
+
options[:eval_name] = eval_name
|
|
60
|
+
exec_options = options.reject { |key| %i[format summary all evals_dir].include?(key) }
|
|
61
|
+
result = Commands::Run.run(**exec_options)
|
|
62
|
+
ResultPrinter.call(result, format: options[:format] || :human)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Runs every eval under the target directory and prints an aggregate.
|
|
66
|
+
#
|
|
67
|
+
# @param options [Hash] Parsed options
|
|
68
|
+
# @return [Integer] Exit code
|
|
69
|
+
def run_batch(options)
|
|
70
|
+
return error_missing_skill if options[:skill_names].empty? && !options[:pack]
|
|
71
|
+
|
|
72
|
+
aggregate = Services::BatchRunnerService.call(
|
|
73
|
+
evals_dir: options[:evals_dir] || Services::BatchRunnerService::DEFAULT_EVALS_DIR,
|
|
74
|
+
skill_names: options[:skill_names],
|
|
75
|
+
pack: options[:pack],
|
|
76
|
+
registry_manifest: options[:registry_manifest]
|
|
77
|
+
)
|
|
78
|
+
BatchResultPrinter.call(aggregate, format: options[:format], summary: options[:summary])
|
|
79
|
+
end
|
|
80
|
+
|
|
47
81
|
def build_parser(options)
|
|
48
82
|
OptionParser.new do |opts|
|
|
49
83
|
opts.banner = 'Usage: skill-bench run <eval> [options]'
|
|
50
84
|
opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
|
|
51
85
|
opts.on('--pack NAME', 'Pack context for skill resolution') { |v| options[:pack] = v }
|
|
52
86
|
opts.on('--registry-manifest PATH', 'Path to registry.json manifest') { |v| options[:registry_manifest] = v }
|
|
53
|
-
opts.on('--format FORMAT', 'Output format (human, json, junit)') { |v| options[:format] = v.to_sym }
|
|
87
|
+
opts.on('--format FORMAT', 'Output format (human, json, junit, html)') { |v| options[:format] = v.to_sym }
|
|
88
|
+
opts.on('--all', 'Run every eval under the default evals/ directory') { options[:all] = true }
|
|
89
|
+
opts.on('--evals-dir DIR', 'Run every eval under DIR') { |v| options[:evals_dir] = v }
|
|
90
|
+
opts.on('--summary', 'Emit a JSON summary gate for a batch run') { options[:summary] = true }
|
|
91
|
+
opts.on('--cache', 'Enable content-addressed response caching') { ENV['SKILL_BENCH_CACHE'] = '1' }
|
|
54
92
|
opts.on('-h', '--help', 'Prints this help') do
|
|
55
93
|
puts opts
|
|
56
94
|
raise SkillBench::HelpRequested
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'optparse'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Cli
|
|
8
|
+
# Handles the `skill-bench validate` / `doctor` subcommand.
|
|
9
|
+
#
|
|
10
|
+
# Runs read-only pre-flight checks and prints a PASS/FAIL report:
|
|
11
|
+
# 1. Criteria JSON structure (via {Models::CriteriaValidator}).
|
|
12
|
+
# 2. skill-bench.json shape (hand-rolled, lightweight schema check).
|
|
13
|
+
# 3. Provider credentials for the configured non-mock provider.
|
|
14
|
+
#
|
|
15
|
+
# It never runs an eval and never makes a network call.
|
|
16
|
+
class ValidateCommand
|
|
17
|
+
# Default criteria file validated when --criteria is not given.
|
|
18
|
+
DEFAULT_CRITERIA = 'criteria.json'
|
|
19
|
+
|
|
20
|
+
# @param argv [Array<String>] Raw CLI arguments
|
|
21
|
+
# @return [Integer] Exit code
|
|
22
|
+
def self.call(argv)
|
|
23
|
+
new(argv).call
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# @param argv [Array<String>] Raw CLI arguments
|
|
27
|
+
def initialize(argv)
|
|
28
|
+
@argv = argv
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Parses options, runs the pre-flight checks, and prints the report.
|
|
32
|
+
#
|
|
33
|
+
# @return [Integer] Exit code (0 when all checks pass, 1 otherwise)
|
|
34
|
+
def call
|
|
35
|
+
options = parse_options
|
|
36
|
+
config_path = options[:config] || SkillBench::Config::CONFIG_FILENAME
|
|
37
|
+
config_data = load_config_data(config_path)
|
|
38
|
+
results = [
|
|
39
|
+
check_criteria(options),
|
|
40
|
+
check_config(config_path, config_data),
|
|
41
|
+
check_provider_key(config_data)
|
|
42
|
+
]
|
|
43
|
+
print_report(results)
|
|
44
|
+
results.any? { |result| result[:status] == :fail } ? 1 : 0
|
|
45
|
+
rescue HelpRequested
|
|
46
|
+
0
|
|
47
|
+
rescue StandardError => e
|
|
48
|
+
warn "Error: #{e.message}"
|
|
49
|
+
1
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def parse_options
|
|
55
|
+
options = {}
|
|
56
|
+
build_parser(options).parse!(@argv)
|
|
57
|
+
options
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def build_parser(options)
|
|
61
|
+
OptionParser.new do |opts|
|
|
62
|
+
opts.banner = 'Usage: skill-bench validate [options]'
|
|
63
|
+
opts.on('--criteria PATH', 'Criteria JSON file to validate (default: criteria.json)') { |v| options[:criteria] = v }
|
|
64
|
+
opts.on('--config PATH', 'Config file to validate (default: skill-bench.json)') { |v| options[:config] = v }
|
|
65
|
+
opts.on('-h', '--help', 'Prints this help') do
|
|
66
|
+
puts opts
|
|
67
|
+
raise SkillBench::HelpRequested
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# --- Check (a): criteria ------------------------------------------------
|
|
73
|
+
|
|
74
|
+
def check_criteria(options)
|
|
75
|
+
path = options[:criteria] || DEFAULT_CRITERIA
|
|
76
|
+
unless File.exist?(path)
|
|
77
|
+
return fail_result('criteria', "criteria file not found: #{path}") if options[:criteria]
|
|
78
|
+
|
|
79
|
+
return skip_result('criteria', "no #{DEFAULT_CRITERIA} found (skipped)")
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
result = Models::CriteriaValidator.call(path:)
|
|
83
|
+
return pass_result('criteria', "#{path} is valid") if result[:success]
|
|
84
|
+
|
|
85
|
+
fail_result('criteria', "#{path}: #{criteria_error(result)}")
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def criteria_error(result)
|
|
89
|
+
result.dig(:response, :error, :message) || 'invalid criteria'
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# --- Check (b): config shape -------------------------------------------
|
|
93
|
+
|
|
94
|
+
def check_config(path, config_data)
|
|
95
|
+
case config_data[:status]
|
|
96
|
+
when :missing
|
|
97
|
+
fail_result('config', "#{path} not found")
|
|
98
|
+
when :invalid_json
|
|
99
|
+
fail_result('config', "#{path} is not valid JSON: #{config_data[:message]}")
|
|
100
|
+
else
|
|
101
|
+
validate_config_shape(path, config_data[:data])
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def validate_config_shape(path, data)
|
|
106
|
+
return fail_result('config', "#{path} must contain a JSON object") unless data.is_a?(Hash)
|
|
107
|
+
|
|
108
|
+
errors = config_shape_errors(data)
|
|
109
|
+
return fail_result('config', errors.join('; ')) if errors.any?
|
|
110
|
+
|
|
111
|
+
pass_result('config', "#{path} matches the expected shape")
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def config_shape_errors(data)
|
|
115
|
+
errors = provider_errors(data[:provider])
|
|
116
|
+
errors.concat(max_execution_time_errors(data[:max_execution_time]))
|
|
117
|
+
errors << "'config' must be an object" if data.key?(:config) && !data[:config].is_a?(Hash)
|
|
118
|
+
errors
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def provider_errors(provider)
|
|
122
|
+
return ["'provider' is required"] if provider.nil?
|
|
123
|
+
return ["'provider' must be a string"] unless provider.is_a?(String)
|
|
124
|
+
|
|
125
|
+
allowed = Models::Provider::ALLOWED_PROVIDERS
|
|
126
|
+
return [] if allowed.include?(provider)
|
|
127
|
+
|
|
128
|
+
["'provider' '#{provider}' is not one of: #{allowed.join(', ')}"]
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def max_execution_time_errors(value)
|
|
132
|
+
return [] if value.nil?
|
|
133
|
+
return [] if value.is_a?(Integer) && value.positive?
|
|
134
|
+
|
|
135
|
+
["'max_execution_time' must be a positive integer"]
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# --- Check (c): provider key -------------------------------------------
|
|
139
|
+
|
|
140
|
+
def check_provider_key(config_data)
|
|
141
|
+
return skip_result('provider key', 'skipped (no usable config)') unless config_data[:status] == :ok
|
|
142
|
+
|
|
143
|
+
provider = config_provider(config_data[:data])
|
|
144
|
+
return skip_result('provider key', 'skipped (provider invalid)') unless provider
|
|
145
|
+
return pass_result('provider key', 'mock provider requires no API key') if provider == 'mock'
|
|
146
|
+
|
|
147
|
+
missing = missing_provider_keys(provider, config_data[:data][:config])
|
|
148
|
+
return pass_result('provider key', "#{provider} credentials present") if missing.empty?
|
|
149
|
+
|
|
150
|
+
fail_result('provider key', "#{provider} is missing: #{missing.join(', ')}")
|
|
151
|
+
rescue StandardError => e
|
|
152
|
+
# Building the client can raise on unrelated config (e.g. base_url
|
|
153
|
+
# validation); surface that as a structured FAIL rather than crashing.
|
|
154
|
+
fail_result('provider key', "#{provider} config is invalid: #{e.message}")
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def config_provider(data)
|
|
158
|
+
return nil unless data.is_a?(Hash)
|
|
159
|
+
|
|
160
|
+
provider = data[:provider]
|
|
161
|
+
return nil unless provider.is_a?(String) && Models::Provider::ALLOWED_PROVIDERS.include?(provider)
|
|
162
|
+
|
|
163
|
+
provider
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
def missing_provider_keys(provider, provider_config)
|
|
167
|
+
provider_sym = provider.to_sym
|
|
168
|
+
options = provider_client_options(provider_sym, provider_config)
|
|
169
|
+
client = Clients::ProviderRegistry.for(provider_sym).new(options)
|
|
170
|
+
return [] unless client.respond_to?(:missing_config_keys, true)
|
|
171
|
+
|
|
172
|
+
client.send(:missing_config_keys)
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
def provider_client_options(provider_sym, provider_config)
|
|
176
|
+
options = provider_config.is_a?(Hash) ? provider_config.dup : {}
|
|
177
|
+
Models::Provider::ENV_OVERRIDABLE_SETTINGS.each do |setting|
|
|
178
|
+
value = env_setting(provider_sym, setting)
|
|
179
|
+
options[setting] = value unless value.nil?
|
|
180
|
+
end
|
|
181
|
+
options
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def env_setting(provider_sym, setting)
|
|
185
|
+
provider = provider_sym.to_s.upcase
|
|
186
|
+
name = setting.to_s.upcase
|
|
187
|
+
["SKILL_BENCH_#{provider}_#{name}", "#{provider}_#{name}"].each do |var|
|
|
188
|
+
value = ENV.fetch(var, nil)
|
|
189
|
+
return value if value && !value.empty?
|
|
190
|
+
end
|
|
191
|
+
nil
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# --- Config loading ----------------------------------------------------
|
|
195
|
+
|
|
196
|
+
def load_config_data(path)
|
|
197
|
+
return { status: :missing } unless File.exist?(path)
|
|
198
|
+
|
|
199
|
+
{ status: :ok, data: JSON.parse(File.read(path), symbolize_names: true) }
|
|
200
|
+
rescue JSON::ParserError => e
|
|
201
|
+
{ status: :invalid_json, message: e.message }
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# --- Reporting ---------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
def print_report(results)
|
|
207
|
+
puts 'skill-bench validate'
|
|
208
|
+
puts
|
|
209
|
+
results.each { |result| puts format_result(result) }
|
|
210
|
+
puts
|
|
211
|
+
puts summary_line(results)
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
def format_result(result)
|
|
215
|
+
"[#{label(result[:status])}] #{result[:name].ljust(13)} #{result[:message]}"
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def label(status)
|
|
219
|
+
{ pass: 'PASS', fail: 'FAIL', skip: 'SKIP' }.fetch(status)
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
def summary_line(results)
|
|
223
|
+
failed = results.count { |result| result[:status] == :fail }
|
|
224
|
+
return "#{failed} check(s) failed." if failed.positive?
|
|
225
|
+
|
|
226
|
+
'All checks passed.'
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
def pass_result(name, message)
|
|
230
|
+
{ name:, status: :pass, message: }
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
def fail_result(name, message)
|
|
234
|
+
{ name:, status: :fail, message: }
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
def skip_result(name, message)
|
|
238
|
+
{ name:, status: :skip, message: }
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
end
|
data/lib/skill_bench/cli.rb
CHANGED
|
@@ -5,8 +5,10 @@ require_relative 'cli/run_command'
|
|
|
5
5
|
require_relative 'cli/compare_command'
|
|
6
6
|
require_relative 'cli/skill_command'
|
|
7
7
|
require_relative 'cli/eval_command'
|
|
8
|
+
require_relative 'cli/validate_command'
|
|
8
9
|
require_relative 'cli/help_printer'
|
|
9
10
|
require_relative 'cli/result_printer'
|
|
11
|
+
require_relative 'cli/batch_result_printer'
|
|
10
12
|
|
|
11
13
|
module SkillBench
|
|
12
14
|
# Raised when -h/--help is passed to abort OptionParser and return exit code 0.
|
|
@@ -42,6 +44,7 @@ module SkillBench
|
|
|
42
44
|
when 'compare' then Cli::CompareCommand.call(@argv)
|
|
43
45
|
when 'skill' then Cli::SkillCommand.call(@argv)
|
|
44
46
|
when 'eval' then Cli::EvalCommand.call(@argv)
|
|
47
|
+
when 'validate', 'doctor' then Cli::ValidateCommand.call(@argv)
|
|
45
48
|
when '-h', '--help', 'help'
|
|
46
49
|
help.call
|
|
47
50
|
else
|
data/lib/skill_bench/client.rb
CHANGED
|
@@ -1,13 +1,27 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'clients/all'
|
|
4
|
+
require_relative 'services/response_cache'
|
|
4
5
|
|
|
5
6
|
module SkillBench
|
|
6
7
|
# Facade for calling LLM clients.
|
|
7
8
|
# Delegates to the configured provider.
|
|
8
9
|
class Client
|
|
10
|
+
# Provider clients that must never be cached: their results either signal a
|
|
11
|
+
# configuration error (NullClient) or are cheap, deterministic test doubles
|
|
12
|
+
# (Mock). Caching them would provide no benefit and could mask errors.
|
|
13
|
+
UNCACHEABLE_CLIENTS = [
|
|
14
|
+
Clients::Providers::NullClient,
|
|
15
|
+
Clients::Providers::Mock
|
|
16
|
+
].freeze
|
|
17
|
+
|
|
9
18
|
# Calls the configured LLM provider with the given parameters.
|
|
10
19
|
#
|
|
20
|
+
# When response caching is enabled (see {Services::ResponseCache.enabled?})
|
|
21
|
+
# and the resolved provider is cacheable, identical requests reuse a cached
|
|
22
|
+
# response instead of calling the provider again. When caching is disabled
|
|
23
|
+
# (the default), the provider is always invoked, leaving behavior unchanged.
|
|
24
|
+
#
|
|
11
25
|
# @param system_prompt [String] System prompt for the LLM
|
|
12
26
|
# @param messages [Array<Hash>] Conversation messages
|
|
13
27
|
# @param provider [Symbol, nil] Override the configured LLM provider (e.g., :deepseek, :openai)
|
|
@@ -17,7 +31,35 @@ module SkillBench
|
|
|
17
31
|
resolved = provider || Config.current_llm_provider || :openai
|
|
18
32
|
client_class = Clients::ProviderRegistry.for(resolved)
|
|
19
33
|
warn "WARNING: LLM provider '#{resolved}' is not configured. Falling back to null client." if client_class == Clients::Providers::NullClient
|
|
20
|
-
|
|
34
|
+
|
|
35
|
+
invoke = -> { client_class.call(system_prompt: system_prompt, messages: messages, **options) }
|
|
36
|
+
return invoke.call unless cache_eligible?(client_class)
|
|
37
|
+
|
|
38
|
+
cache_key = Services::ResponseCache.key(
|
|
39
|
+
provider: resolved,
|
|
40
|
+
model: options[:model],
|
|
41
|
+
system_prompt: system_prompt,
|
|
42
|
+
messages: messages,
|
|
43
|
+
tools: options[:tools],
|
|
44
|
+
temperature: options[:temperature],
|
|
45
|
+
provider_config: options.slice(:base_url, :request_path, :endpoint, :location, :project_id, :api_version)
|
|
46
|
+
)
|
|
47
|
+
Services::ResponseCache.fetch(cache_key, &invoke)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Whether a resolved provider client may be served from the cache.
|
|
51
|
+
#
|
|
52
|
+
# Requires caching to be enabled and the client to not be one of the
|
|
53
|
+
# {UNCACHEABLE_CLIENTS} (null/mock), so disabling the cache restores the
|
|
54
|
+
# original, uncached behavior exactly.
|
|
55
|
+
#
|
|
56
|
+
# @param client_class [Class] The resolved provider client class
|
|
57
|
+
# @return [Boolean] true when the call should go through the cache
|
|
58
|
+
def self.cache_eligible?(client_class)
|
|
59
|
+
return false unless Services::ResponseCache.enabled?
|
|
60
|
+
|
|
61
|
+
!UNCACHEABLE_CLIENTS.include?(client_class)
|
|
21
62
|
end
|
|
63
|
+
private_class_method :cache_eligible?
|
|
22
64
|
end
|
|
23
65
|
end
|
|
@@ -2,8 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'response_parser'
|
|
4
4
|
require_relative 'response_error_handler'
|
|
5
|
+
require_relative 'response_builder'
|
|
5
6
|
require_relative 'request_builder'
|
|
6
7
|
require_relative 'retry_handler'
|
|
8
|
+
require_relative 'base_url_validator'
|
|
7
9
|
require_relative 'base_client'
|
|
8
10
|
require_relative 'provider_config'
|
|
9
11
|
require_relative 'provider_registry'
|
|
@@ -16,5 +18,6 @@ require_relative 'providers/azure_openai'
|
|
|
16
18
|
require_relative 'providers/opencode'
|
|
17
19
|
require_relative 'providers/groq'
|
|
18
20
|
require_relative 'providers/deepseek'
|
|
21
|
+
require_relative 'providers/mistral'
|
|
19
22
|
require_relative 'providers/openrouter'
|
|
20
23
|
require_relative 'providers/mock'
|
|
@@ -4,6 +4,7 @@ require_relative '../config'
|
|
|
4
4
|
require_relative 'provider_config'
|
|
5
5
|
require_relative 'response_parser'
|
|
6
6
|
require_relative 'response_error_handler'
|
|
7
|
+
require_relative 'response_builder'
|
|
7
8
|
require_relative 'request_builder'
|
|
8
9
|
require_relative 'retry_handler'
|
|
9
10
|
|
|
@@ -135,7 +136,7 @@ module SkillBench
|
|
|
135
136
|
else
|
|
136
137
|
"#{missing.first} not set for #{@provider_display_name}"
|
|
137
138
|
end
|
|
138
|
-
|
|
139
|
+
ResponseBuilder.error(message: message)
|
|
139
140
|
end
|
|
140
141
|
|
|
141
142
|
# Extracts the message hash from the provider's specific response body structure.
|
|
@@ -158,11 +159,22 @@ module SkillBench
|
|
|
158
159
|
|
|
159
160
|
def execute_request
|
|
160
161
|
RetryHandler.call do
|
|
161
|
-
connection = RequestBuilder.build_connection(base_url)
|
|
162
162
|
RequestBuilder.execute(connection, request_path, headers: request_headers, body: request_body)
|
|
163
163
|
end
|
|
164
164
|
end
|
|
165
165
|
|
|
166
|
+
# Lazily builds and memoizes the Faraday connection for this client instance.
|
|
167
|
+
#
|
|
168
|
+
# Reusing one connection across the instance's sequential requests and retry
|
|
169
|
+
# attempts enables HTTP keep-alive, avoiding a fresh TCP + TLS handshake per turn.
|
|
170
|
+
# Memoization is intentionally per-instance (never global/shared) so concurrent
|
|
171
|
+
# agent and judge clients each own a connection, keeping net/http thread-safe.
|
|
172
|
+
#
|
|
173
|
+
# @return [Faraday::Connection] the reused connection for this instance.
|
|
174
|
+
def connection
|
|
175
|
+
@connection ||= RequestBuilder.build_connection(base_url)
|
|
176
|
+
end
|
|
177
|
+
|
|
166
178
|
def handle_response(response)
|
|
167
179
|
parsed = ResponseParser.parse_body(response)
|
|
168
180
|
return failure_response(response, parsed) unless response.success?
|
|
@@ -182,10 +194,6 @@ module SkillBench
|
|
|
182
194
|
message = extract_message(parsed)
|
|
183
195
|
return missing_message_response(response, parsed) unless ResponseParser.valid_message?(message)
|
|
184
196
|
|
|
185
|
-
success_response(parsed, message)
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
def success_response(parsed, message)
|
|
189
197
|
content = ResponseParser.extract_content(message)
|
|
190
198
|
{
|
|
191
199
|
success: true,
|