ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Judge
|
|
7
|
+
# Parses and validates structured JSON responses from the LLM judge.
|
|
8
|
+
#
|
|
9
|
+
# Expects a JSON object with a 'dimensions' key mapping dimension names
|
|
10
|
+
# to score hashes, and an optional 'overall_reasoning' string.
|
|
11
|
+
class Response
  attr_reader :dimensions, :overall_reasoning

  # Parses a judge JSON string.
  #
  # @param json [String] The raw JSON string from the judge.
  # @return [Hash] Service response with parsed judge response or error.
  def self.call(json:)
    new(json:).call
  end

  # @param json [String] The raw JSON string from the judge.
  def initialize(json:)
    @json = json
  end

  # Parses and validates the judge JSON.
  #
  # On success the parsed values are exposed through {#dimensions} and
  # {#overall_reasoning} and the instance itself is returned in the envelope.
  #
  # @return [Hash] `{ success: true, response: { judge_response: self } }` or
  #   `{ success: false, response: { error: { message: String } } }`.
  def call
    data = parse_json
    return data unless data[:success]

    payload = data[:response][:data]
    validation = validate_structure(payload)
    return validation unless validation[:success]

    extracted = extract_dimensions(fetch_key(payload, 'dimensions'))
    return extracted unless extracted[:success]

    @dimensions = extracted[:response][:dimensions]
    @overall_reasoning = fetch_key(payload, 'overall_reasoning') || ''

    { success: true, response: { judge_response: self } }
  rescue StandardError => e
    # Last-resort guard: anything unexpected is logged and reported as a failure.
    SkillBench::ErrorLogger.log_error(e, 'Judge::Response Parse Error')
    failure(e.message)
  end

  private

  attr_reader :json

  # Builds a successful, payload-free service response.
  def success
    { success: true, response: {} }
  end

  # Builds a failed service response carrying +message+.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end

  # Reads +key+ from a hash that may use String or Symbol keys.
  def fetch_key(hash, key)
    hash[key] || hash[key.to_sym]
  end

  # Decodes the raw judge output after removing any markdown code fence.
  def parse_json
    data = JSON.parse(strip_markdown_fences(json))
    { success: true, response: { data: data } }
  rescue JSON::ParserError => e
    failure("Invalid JSON: #{e.message}")
  end

  # Removes a surrounding ``` fence (with optional language tag).
  # Surrounding whitespace is trimmed first so a fence preceded by a blank
  # line is still recognised; nil input is treated as an empty string.
  def strip_markdown_fences(text)
    trimmed = text.to_s.strip
    return trimmed unless trimmed.start_with?('```')

    lines = trimmed.lines
    lines.shift
    lines.pop if lines.last&.strip == '```'
    lines.join.strip
  end

  # Checks that the payload is an object with a non-empty 'dimensions' object.
  def validate_structure(payload)
    return failure('Judge response must be a JSON object') unless payload.is_a?(Hash)

    dims = fetch_key(payload, 'dimensions')
    return missing_dimensions_result if dims.nil?
    return failure("Judge response 'dimensions' must be an object") unless dims.is_a?(Hash)
    return empty_dimensions_result if dims.empty?

    success
  end

  def missing_dimensions_result
    failure("Judge response missing 'dimensions' key")
  end

  def empty_dimensions_result
    failure("Judge response 'dimensions' is empty")
  end

  # Validates every dimension, stopping at the first invalid one.
  #
  # @return [Hash] Service response with a :dimensions hash keyed by name.
  def extract_dimensions(dims)
    dimensions = {}

    dims.each do |name, dim|
      validated = validate_dimension(name, dim)
      return validated unless validated[:success]

      dimensions[name] = validated[:response][:dimension]
    end

    { success: true, response: { dimensions: dimensions } }
  end

  # Validates a single dimension entry and normalises it to
  # `{ score:, max_score:, reasoning: }`.
  def validate_dimension(name, dim)
    return failure("Judge dimension '#{name}' must be an object") unless dim.is_a?(Hash)

    score = fetch_key(dim, 'score')
    return missing_score_result(name) if score.nil?

    numeric_score = parse_numeric(score)
    return invalid_score_result(name, score) if numeric_score.nil?

    max_score = fetch_key(dim, 'max_score')
    bounds = validate_max_score(name, numeric_score, max_score)
    return bounds unless bounds[:success]

    {
      success: true,
      response: {
        dimension: {
          score: numeric_score,
          max_score: max_score,
          reasoning: fetch_key(dim, 'reasoning') || ''
        }
      }
    }
  end

  # Checks the score against its bounds. A negative score is rejected even
  # when no max_score is supplied; the upper bound is only enforced when a
  # numeric max_score is present.
  def validate_max_score(name, numeric_score, max_score)
    if max_score
      return invalid_max_score_result(name, max_score) unless max_score.is_a?(Numeric)
      return out_of_bounds_result(name, numeric_score, max_score) if numeric_score.negative? || numeric_score > max_score
    elsif numeric_score.negative?
      return negative_score_result(name, numeric_score)
    end

    success
  end

  # Returns +value+ as a number, or nil when it cannot be read as one.
  def parse_numeric(value)
    return value if value.is_a?(Numeric)

    Float(value, exception: false)
  end

  def missing_score_result(name)
    failure("Judge dimension '#{name}' missing score")
  end

  def invalid_score_result(name, score)
    failure("Judge dimension '#{name}' has invalid score: #{score.inspect}")
  end

  def negative_score_result(name, score)
    failure("Judge dimension '#{name}' score #{score} out of bounds (must be >= 0)")
  end

  def out_of_bounds_result(name, score, max_score)
    failure("Judge dimension '#{name}' score #{score} out of bounds (0..#{max_score})")
  end

  def invalid_max_score_result(name, max_score)
    failure("Judge dimension '#{name}' has invalid max_score: #{max_score.inspect} (must be numeric)")
  end
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Migration
|
|
8
|
+
# Migrates old provider classes to new YAML-based configuration
|
|
9
|
+
class ProviderMigrator
  # Migrate providers to YAML config file.
  #
  # Entries are merged into the file's 'providers' mapping (created when
  # absent); provider names and their config keys are written as strings.
  # Any other top-level content already in the file is preserved.
  #
  # @param providers [Hash] Providers to migrate (name => config hash)
  # @param yaml_path [String] Path to YAML config file (default: .agent-eval.yml)
  # @return [Integer] Number of bytes written to +yaml_path+
  # @raise [ArgumentError] if the existing file, or its 'providers' entry,
  #   is not a YAML mapping (the file is left untouched in that case)
  def self.migrate(providers, yaml_path = '.agent-eval.yml')
    existing = load_existing(yaml_path)
    section = existing['providers'] ||= {}
    unless section.is_a?(Hash)
      raise ArgumentError, "Expected 'providers' in #{yaml_path} to be a mapping, got #{section.class}"
    end

    providers.each do |name, config|
      section[name.to_s] = config.transform_keys(&:to_s)
    end

    File.write(yaml_path, existing.to_yaml)
  end

  # Reads the current config file, returning an empty hash when the file is
  # missing or empty.
  #
  # @param yaml_path [String] Path to YAML config file
  # @return [Hash] Parsed top-level mapping
  # @raise [ArgumentError] if the file's top level is not a mapping
  def self.load_existing(yaml_path)
    return {} unless File.exist?(yaml_path)

    loaded = YAML.safe_load_file(yaml_path, permitted_classes: [], aliases: false) || {}
    return loaded if loaded.is_a?(Hash)

    raise ArgumentError, "Expected #{yaml_path} to contain a YAML mapping, got #{loaded.class}"
  end
  private_class_method :load_existing
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'provider'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Models
|
|
8
|
+
# Represents the skill-bench configuration loaded from skill-bench.json
|
|
9
|
+
class Config
  # @param data [Hash] Raw configuration data. Top-level keys may be Strings
  #   or Symbols; they are normalised to Symbols so that a hash built by hand
  #   (or parsed without symbolize_names) is read the same way as one
  #   produced by {.load}.
  # @raise [ArgumentError] if data is not a Hash
  def initialize(data = {})
    raise ArgumentError, 'Config-data must be a Hash' unless data.is_a?(Hash)

    @data = data.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
  end

  # Load configuration from a JSON file
  # @param path [String] Path to config file (default: skill-bench.json)
  # @return [SkillBench::Models::Config] Loaded config instance
  # @raise [Errno::ENOENT] if config file not found
  # @raise [JSON::ParserError] if the file is not valid JSON
  # @raise [ArgumentError] if the JSON document is not an object
  def self.load(path = 'skill-bench.json')
    raw_data = JSON.parse(File.read(path), symbolize_names: true)
    new(raw_data)
  end

  # Returns the configured provider name
  # @return [String, nil] Provider name
  def provider_name
    @data[:provider]
  end

  # Returns the provider configuration
  # @return [Hash] Provider configuration (empty when not configured)
  def provider_config
    @data[:config] || {}
  end

  # Returns max execution time
  # @return [Integer] Max execution time in seconds (defaults to 30)
  def max_execution_time
    @data[:max_execution_time] || 30
  end

  # Builds a Provider model from the current configuration.
  # The provider name is used for the provider's name, runtime and llm.
  #
  # @return [SkillBench::Models::Provider, nil] The configured provider, or
  #   nil when no provider is configured or the provider name is 'mock'
  def to_provider
    return nil if provider_name.nil? || provider_name == 'mock'

    Provider.new(
      name: provider_name,
      runtime: provider_name,
      llm: provider_name,
      config: provider_config
    )
  end
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Models
|
|
7
|
+
# Validates evaluation criteria JSON structure without building rich objects.
|
|
8
|
+
class CriteriaValidator
  # Total that all dimension max_score values must add up to.
  REQUIRED_SCORE_TOTAL = 100
  # Tolerance used when comparing the total, so Float max_score values that
  # add up to 100 are not rejected because of rounding error.
  SCORE_TOTAL_TOLERANCE = 1e-9

  # Validates criteria from a JSON file
  #
  # @param path [String] Path to criteria JSON file
  # @return [Hash] Validation result with success status and criteria data
  def self.call(path:)
    new(path).call
  end

  # @param path [String] Path to criteria JSON file
  def initialize(path)
    @path = path
  end

  # Validates the criteria file.
  #
  # @return [Hash] `{ success: true, response: { criteria: Hash } }` with the
  #   parsed (symbol-keyed) criteria, or
  #   `{ success: false, response: { error: { message: String } } }`.
  def call
    return file_not_found_result unless File.exist?(path)

    data = parse_json(path)
    return data unless data[:success]

    parsed = data[:response][:data]
    validation = validate(parsed)
    return validation unless validation[:success]

    { success: true, response: { criteria: parsed } }
  end

  private

  attr_reader :path

  # Builds a successful, payload-free result.
  def success
    { success: true, response: {} }
  end

  # Builds a failed result carrying +message+.
  def failure(message)
    { success: false, response: { error: { message: message } } }
  end

  def file_not_found_result
    failure("File not found: #{path}")
  end

  # Reads and decodes the file. Read failures (e.g. the path is a directory
  # or is unreadable) are reported through the result instead of raising.
  def parse_json(file_path)
    parsed = JSON.parse(File.read(file_path), symbolize_names: true)
    { success: true, response: { data: parsed } }
  rescue JSON::ParserError => e
    failure("Invalid JSON: #{e.message}")
  rescue SystemCallError => e
    failure("Unable to read file: #{e.message}")
  end

  # Runs each check in order and returns the first failure, if any.
  def validate(data)
    return failure('Criteria must be a JSON object') unless data.is_a?(Hash)

    dim_result = validate_dimensions(data.fetch(:dimensions, []))
    return dim_result unless dim_result[:success]

    field_result = validate_required_fields(data)
    return field_result unless field_result[:success]

    threshold_result = validate_pass_threshold(data[:pass_threshold])
    return threshold_result unless threshold_result[:success]

    validate_minimum_delta(data[:minimum_delta])
  end

  # Each dimension must be an object with a name and a numeric max_score,
  # and the max_score values must add up to REQUIRED_SCORE_TOTAL.
  def validate_dimensions(dimensions)
    return invalid_dimensions_result unless dimensions.is_a?(Array)
    return invalid_dimensions_result unless dimensions.all? { |dim| valid_dimension?(dim) }

    total = dimensions.sum { |dim| dim[:max_score] }
    return score_sum_result(total) unless (total - REQUIRED_SCORE_TOTAL).abs < SCORE_TOTAL_TOLERANCE

    success
  end

  def valid_dimension?(dim)
    dim.is_a?(Hash) && dim[:name] && dim[:max_score].is_a?(Numeric)
  end

  def invalid_dimensions_result
    failure('Invalid dimensions format')
  end

  def score_sum_result(total)
    failure("Dimension scores must sum to 100, got #{total}")
  end

  def validate_required_fields(data)
    missing = %i[pass_threshold minimum_delta].select { |field| data[field].nil? }
    return success if missing.empty?

    failure("Missing required fields: #{missing.join(', ')}")
  end

  # pass_threshold must be an Integer in 0..100.
  def validate_pass_threshold(value)
    return success if value.is_a?(Integer) && value.between?(0, 100)

    failure('Pass threshold must be between 0 and 100')
  end

  # minimum_delta must be an Integer >= 0.
  def validate_minimum_delta(value)
    return success if value.is_a?(Integer) && value >= 0

    failure('Minimum delta must be non-negative integer')
  end
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
require_relative '../criteria'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Models
|
|
9
|
+
# Represents an evaluation scenario
|
|
10
|
+
class Eval
  attr_reader :name, :path, :task, :criteria, :source_code, :metadata

  # @param name [String] Eval name
  # @param path [String] Path to eval directory
  # @param task [String] Task description from task.md
  # @param criteria [SkillBench::Criteria, Hash] Criteria from criteria.json
  #   ({.load} passes the object produced by SkillBench::Criteria; the
  #   default is an empty Hash)
  # @param source_code [String] Source code to evaluate
  # @param metadata [Hash] Metadata from metadata.json
  def initialize(name:, path:, task: '', criteria: {}, source_code: '', metadata: {})
    @name = name
    @path = path
    @task = task
    @criteria = criteria
    @source_code = source_code
    @metadata = metadata
  end

  # Load an eval from a directory. The eval name is the directory's basename;
  # task.md, criteria.json and metadata.json are each optional.
  #
  # @param dir_path [String] Path to eval directory
  # @return [SkillBench::Models::Eval] Loaded eval instance
  # @raise [Errno::ENOENT] if dir_path does not exist or is not a directory
  # @raise [RuntimeError] if criteria.json cannot be loaded
  # @raise [JSON::ParserError] if metadata.json is malformed
  def self.load(dir_path)
    path = Pathname.new(dir_path)
    # A regular file would otherwise be accepted and load as an empty eval.
    raise Errno::ENOENT, "Eval directory not found: #{dir_path}" unless path.directory?

    new(
      name: path.basename.to_s,
      path: dir_path,
      task: load_task(path),
      criteria: load_criteria(path),
      metadata: load_metadata(path)
    )
  end

  # Load task description from task.md
  # @param path [Pathname] Path to eval directory
  # @return [String] Task description or empty string if file doesn't exist
  def self.load_task(path)
    task_md = path.join('task.md')
    task_md.exist? ? File.read(task_md) : ''
  end

  # Load evaluation criteria from criteria.json
  # @param path [Pathname] Path to eval directory
  # @return [SkillBench::Criteria] Parsed criteria or empty criteria if file doesn't exist
  # @raise [RuntimeError] if JSON is malformed or criteria validation fails
  def self.load_criteria(path)
    criteria_json = path.join('criteria.json')
    return SkillBench::Criteria.empty unless criteria_json.exist?

    result = SkillBench::Criteria.call(path: criteria_json.to_s)
    response = result[:response]
    return response[:criteria] if result[:success]

    raise "Failed to load criteria: #{response[:error][:message]}"
  end

  # Load metadata from metadata.json
  # @param path [Pathname] Path to eval directory
  # @return [Hash] Parsed metadata or empty hash if file doesn't exist
  # @raise [JSON::ParserError] if JSON is malformed
  def self.load_metadata(path)
    metadata_file = path.join('metadata.json')
    return {} unless metadata_file.exist?

    JSON.parse(File.read(metadata_file))
  end

  private_class_method :load_task, :load_criteria, :load_metadata
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../clients/provider_schemas'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Models
|
|
7
|
+
# Represents an agent runtime + LLM provider
|
|
8
|
+
class Provider
  attr_reader :name, :runtime, :llm, :config

  # Provider names accepted by {#merged_config}: every name reported by
  # Clients::ProviderSchemas plus 'mock'.
  ALLOWED_PROVIDERS = (Clients::ProviderSchemas.names.map(&:to_s) + %w[mock]).freeze

  # Settings that can be overridden via environment variables.
  ENV_OVERRIDABLE_SETTINGS = %i[api_key model base_url endpoint location project_id api_version].freeze

  # Initialize a new Provider
  # @param name [String] Provider name (e.g., "openai")
  # @param runtime [String] Agent runtime (e.g., "opencode")
  # @param llm [String] LLM provider (e.g., "openai")
  # @param config [Hash] Provider-specific configuration; keys are converted
  #   to Symbols. A non-Hash value is replaced by an empty Hash.
  def initialize(name:, runtime:, llm:, config: {})
    @name = name
    @runtime = runtime
    @llm = llm
    @config = config.is_a?(Hash) ? config.transform_keys(&:to_sym) : {}
  end

  # Returns merged config with environment variable fallbacks.
  # Checks both `SKILL_BENCH_<PROVIDER>_<SETTING>` (documented standard)
  # and `<PROVIDER>_<SETTING>` (legacy) naming conventions.
  #
  # Settings that resolve to nil are not added to the result, so a caller
  # applying its own defaults (`fetch`, `merge`) is not handed explicit nil
  # entries for settings that were never configured.
  #
  # @return [Hash] Merged configuration
  # @raise [ArgumentError] if provider name is invalid or API key is missing
  def merged_config
    raise ArgumentError, "Invalid provider name: #{name}" unless ALLOWED_PROVIDERS.include?(name)

    merged = config.dup
    ENV_OVERRIDABLE_SETTINGS.each do |setting|
      value = resolve_env_setting(setting)
      merged[setting] = value unless value.nil?
    end

    api_key = merged[:api_key]
    if api_key.nil? || api_key.to_s.empty?
      raise ArgumentError,
            "API key not found for provider '#{name}'. " \
            "Set SKILL_BENCH_#{name.upcase}_API_KEY environment variable or provide in config."
    end

    merged
  end

  private

  # Resolves a single setting from environment variables.
  # Prefers `SKILL_BENCH_<PROVIDER>_<SETTING>`, falls back to
  # `<PROVIDER>_<SETTING>`, then to the config file value.
  # Empty environment values are treated as unset.
  #
  # @param setting [Symbol] The setting name (e.g., :api_key)
  # @return [String, nil] The resolved value
  def resolve_env_setting(setting)
    env_suffix = "#{name.upcase}_#{setting.upcase}"

    ["SKILL_BENCH_#{env_suffix}", env_suffix].each do |variable|
      value = ENV.fetch(variable, nil)
      return value if value && !value.to_s.empty?
    end

    config[setting]
  end
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
|
|
5
|
+
module SkillBench
|
|
6
|
+
module Models
|
|
7
|
+
# Represents a reusable skill for agent evaluation
|
|
8
|
+
class Skill
  attr_reader :name, :path

  # Initialize a new Skill
  # @param name [String] Skill name
  # @param path [String] Path to skill directory
  def initialize(name:, path:)
    @name = name
    @path = path
  end

  # Discover skills from a directory recursively. Every directory containing
  # a SKILL.md file becomes a skill named after that directory.
  #
  # @param base_path [String] Directory to search (default: "skills/")
  # @return [Array<SkillBench::Models::Skill>] Discovered skills (empty when
  #   base_path is not a directory)
  def self.discover(base_path = 'skills/')
    return [] unless Dir.exist?(base_path)

    # base_path is passed via `base:` rather than embedded in the pattern so
    # that characters such as [, {, * or ? in the directory name are taken
    # literally instead of being interpreted as glob syntax.
    Dir.glob(File.join('**', 'SKILL.md'), base: base_path).map do |relative_path|
      skill_dir = File.dirname(File.join(base_path, relative_path))
      new(name: File.basename(skill_dir), path: skill_dir)
    end
  end
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'services/iteration_formatter'
|
|
4
|
+
require_relative 'services/delta_table_formatter'
|
|
5
|
+
require_relative 'services/feedback_generator'
|
|
6
|
+
require_relative 'services/json_formatter'
|
|
7
|
+
require_relative 'services/junit_formatter'
|
|
8
|
+
|
|
9
|
+
module SkillBench
|
|
10
|
+
# Handles formatting output for different use cases (human, CI, etc.).
|
|
11
|
+
# Delegates all presentation logic to focused service objects under
|
|
12
|
+
# {SkillBench::Services}.
|
|
13
|
+
class OutputFormatter
  # Format the eval result for output.
  #
  # @param result [Hash] Eval result with keys like :eval_name, :pass, :score, etc.
  # @param format [Symbol, String] Output format (:human, :json, :junit).
  #   String names are accepted as well; anything unrecognised (including
  #   nil) falls back to the human-readable format.
  # @return [String] Formatted output string
  def self.format(result, format: :human)
    case format.to_s
    when 'json'
      Services::JsonFormatter.format(result)
    when 'junit'
      Services::JUnitFormatter.format(result)
    else
      format_human(result)
    end
  end

  # Determine exit code based on eval result.
  #
  # A truthy :pass wins outright. Otherwise the result must be successful
  # and carry a report under response/report whose verdict is truthy.
  #
  # @param result [Hash] Eval result with :pass or :success/:response keys.
  # @return [Integer] 0 if passed, 1 if failed
  def self.exit_code(result)
    return 0 if result[:pass]
    return 1 unless result[:success]

    report = result.dig(:response, :report)
    report&.verdict ? 0 : 1
  end

  # Format result as human-readable text.
  #
  # @param result [Hash] Eval result in old or new format.
  # @return [String] Human-readable formatted string.
  def self.format_human(result)
    report = result.dig(:response, :report)
    return format_legacy_human(result) unless delta_report?(report)

    format_delta_report(result, report)
  end
  private_class_method :format_human

  # Checks whether a report object is a DeltaReport.
  #
  # @param report [Object] The report to inspect.
  # @return [Boolean] true when the report has DeltaReport attributes.
  def self.delta_report?(report)
    %i[deltas criteria baseline_scores context_scores].all? { |attribute| report.respond_to?(attribute) }
  end
  private_class_method :delta_report?

  # Formats a legacy result hash.
  #
  # @param result [Hash] Legacy eval result.
  # @return [String] Human-readable formatted string.
  def self.format_legacy_human(result)
    rule = '=' * 60
    status = result[:pass] ? 'PASSED' : 'FAILED'
    lines = [
      rule,
      "Eval: #{result[:eval_name] || ''}",
      "Skill: #{result[:skill_name] || ''}",
      "Provider: #{result[:provider_name] || ''}",
      "Status: #{status}",
      "Score: #{result[:score]&.round(2) || 'N/A'}"
    ]
    error_msg = result.dig(:response, :error, :message)
    lines << "Error: #{error_msg}" if error_msg
    lines << rule
    lines.join("\n")
  end
  private_class_method :format_legacy_human

  # Formats a DeltaReport as a human-readable report.
  #
  # @param result [Hash] Eval result envelope.
  # @param report [SkillBench::DeltaReport] The delta report.
  # @return [String] Formatted report string.
  def self.format_delta_report(result, report)
    rule = '═' * 55
    lines = [
      rule,
      "  Eval:    #{result[:eval_name] || ''}",
      "  Skill:   #{result[:skill_name] || ''}",
      "  Provider: #{result[:provider_name] || ''}",
      rule,
      ''
    ]

    lines.concat(build_iteration_lines(result))
    lines << Services::DeltaTableFormatter.format(report, result)

    feedback_result = Services::FeedbackGenerator.call(report)
    if feedback_result[:success]
      output = feedback_result.dig(:response, :output)
      # The feedback section is optional: skip it when absent or blank.
      lines << output if output && !output.empty?
    end

    lines.join("\n")
  end
  private_class_method :format_delta_report

  # Builds iteration timeline lines from the result response.
  #
  # @param result [Hash] Eval result envelope.
  # @return [Array<String>] Lines to append, or empty array.
  def self.build_iteration_lines(result)
    baseline = result.dig(:response, :baseline_iterations) || []
    context = result.dig(:response, :context_iterations) || []
    return [] if baseline.empty? && context.empty?

    lines = []
    lines << Services::IterationFormatter.format('BASELINE ITERATIONS', baseline) unless baseline.empty?
    lines << Services::IterationFormatter.format('CONTEXT ITERATIONS', context) unless context.empty?
    lines << ''
  end
  private_class_method :build_iteration_lines
end
|
|
132
|
+
end
|