ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative 'config/defaults'
|
|
5
|
+
require_relative 'config/store'
|
|
6
|
+
require_relative 'config/applier'
|
|
7
|
+
require_relative 'config/json_loader'
|
|
8
|
+
require_relative 'config/env_overrides'
|
|
9
|
+
require_relative 'config/facade_readers'
|
|
10
|
+
require_relative 'config/facade_writers'
|
|
11
|
+
|
|
12
|
+
module SkillBench
  # Process-wide configuration facade for SkillBench.
  # Sources are layered in increasing precedence:
  # Defaults < Home JSON < Local JSON < ENV Variables.
  class Config
    # Basename of the JSON configuration file looked up in the home directory
    # and in the current working directory.
    CONFIG_FILENAME = 'skill-bench.json'

    class << self
      include Config::FacadeReaders
      include Config::FacadeWriters

      # Memoized configuration store backing this facade.
      # A fresh, empty store is created on first access.
      #
      # @return [Config::Store] configuration state store
      def store
        return @store if @store

        @store = Config::Store.new
      end

      # Delegates to {Config::Defaults.call}.
      #
      # @return [Hash] result of Config::Defaults.call (the private loader in
      #   this class reads it as a service response with :success and :response)
      def defaults
        Config::Defaults.call
      end

      # Hands the current store contents (as a Hash) to {Config::Applier.call}.
      #
      # NOTE(review): the private loaders below invoke the applier with
      # `store:`/`data:` keywords; confirm this positional form is supported.
      #
      # @return [Object] whatever Config::Applier.call returns
      def apply
        Config::Applier.call(store.to_h)
      end

      # Delegates JSON loading to {Config::JsonLoader.call}.
      #
      # @param path [String] Path to JSON file
      # @return [Hash] result of Config::JsonLoader.call
      def load_from_file(path)
        Config::JsonLoader.call(path)
      end

      # Delegates persistence to {Config::FacadeWriters.save_to_file}.
      #
      # @param path [String] Path to JSON file
      # @param config [Hash] Configuration to save
      # @return [Object] whatever the writer returns
      def save_to_file(path, config)
        Config::FacadeWriters.save_to_file(path, config)
      end

      # Delegates to {Config::EnvOverrides.call}.
      #
      # @return [Hash] result of Config::EnvOverrides.call
      def env_overrides
        Config::EnvOverrides.call
      end

      # Discards the current store and rebuilds it from every source, in order:
      # Defaults → Home JSON → Local JSON → ENV overrides.
      #
      # @return [Object, nil] result of the final (ENV) step; callers should not rely on it
      def reset
        @store = Config::Store.new
        apply_defaults
        apply_json_config(home_config_path)
        apply_json_config(Pathname.pwd.join(CONFIG_FILENAME))
        apply_env_overrides
      end

      # Yields the store so callers can configure it in a block.
      #
      # @yieldparam config [Config::Store] Configuration store for modification
      # @return [Object] the block's return value
      def setup
        yield(store)
      end

      # @return [Array<String>, nil] allowed commands as held by the store
      def allowed_commands
        store.allowed_commands
      end

      # @return [Integer] maximum execution time; 30 when the store has none
      def max_execution_time
        configured = store.max_execution_time
        configured || 30
      end

      # @return [Symbol] current provider name; :openai when the store has none
      def current_llm_provider
        configured = store.current_llm_provider
        configured || :openai
      end

      # Assigns the current LLM provider on the store.
      #
      # @param provider [Symbol] Provider name
      # @return [void]
      def current_llm_provider=(provider)
        store.assign_current_llm_provider(provider)
      end

      # @return [Hash] providers configuration; empty Hash when the store has none
      def llm_providers_config
        configured = store.llm_providers_config
        configured || {}
      end

      # @return [String, nil] API key as held by the store
      def api_key
        store.api_key
      end

      # @return [String, nil] model name as held by the store
      def model
        store.model
      end

      private

      # Location of the per-user config file, or nil when Dir.home raises
      # ArgumentError (home directory cannot be determined).
      def home_config_path
        Pathname.new(Dir.home) + CONFIG_FILENAME
      rescue ArgumentError
        nil
      end

      # Applies the built-in defaults to the store.
      def apply_defaults
        merge_config_result(Config::Defaults.call)
      end

      # Applies a JSON config file to the store; silently skipped when the
      # path is nil, the file is absent, or loading reports failure.
      def apply_json_config(path)
        return unless path && File.exist?(path)

        merge_config_result(Config::JsonLoader.call(path))
      end

      # Applies environment-variable overrides to the store.
      def apply_env_overrides
        outcome = Config::EnvOverrides.call
        store.apply_provider_config(outcome[:response][:overrides]) if outcome[:success]
      end

      # Feeds a successful service response's :config payload through the applier.
      def merge_config_result(outcome)
        return unless outcome[:success]

        Config::Applier.call(store: store, data: outcome[:response][:config])
      end
    end
  end
end
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  # Loads, validates, and represents evaluation criteria from criteria.json.
  #
  # Merges eval-specific dimension overrides with built-in default descriptions
  # and validates that dimension weights sum to exactly 100.
  class Criteria
    # Pass threshold used when criteria.json does not specify one.
    DEFAULT_PASS_THRESHOLD = 70
    # Minimum delta used when criteria.json does not specify one.
    DEFAULT_MINIMUM_DELTA = 10

    attr_reader :dimensions, :context, :pass_threshold, :minimum_delta

    # Loads criteria from a JSON file.
    #
    # Never raises for StandardError subclasses: every failure (missing file,
    # invalid JSON, bad structure, unexpected error) is reported through the
    # returned service response.
    #
    # @param path [String] Path to the criteria.json file.
    # @return [Hash] Service response with :success and :response keys.
    def self.call(path:)
      new(path:).call
    end

    # Returns an empty criteria with default thresholds and no dimensions.
    #
    # @return [SkillBench::Criteria] An empty criteria instance.
    def self.empty
      # Reuse the regular attribute assignment so the defaults live in one place.
      new(path: '').tap { |criteria| criteria.send(:assign_attributes, {}, []) }
    end

    # @param path [String] Path to the criteria.json file.
    def initialize(path:)
      @path = path
    end

    # Loads and validates the criteria file.
    #
    # @return [Hash] Service response with criteria or error.
    def call
      raw = load_json
      return raw unless raw[:success]

      data = raw[:response][:data]
      raw_dimensions = data['dimensions'] || data[:dimensions] || []

      shape_validation = validate_shape(raw_dimensions)
      return shape_validation unless shape_validation[:success]

      dimensions = build_dimensions(raw_dimensions)

      core_validation = validate_core_dimensions(dimensions)
      return core_validation unless core_validation[:success]

      validation = validate_dimensions(dimensions)
      return validation unless validation[:success]

      assign_attributes(data, dimensions)

      { success: true, response: { criteria: self } }
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'Criteria Load Error')
      failure(e.message)
    end

    private

    attr_reader :path

    # Reads and parses the file; the top-level JSON value must be an object.
    def load_json
      return missing_file_result unless File.exist?(path)

      data = JSON.parse(File.read(path))
      # An array/scalar at the top level would otherwise surface later as an
      # obscure TypeError from the key lookups.
      return failure('Criteria file must contain a JSON object') unless data.is_a?(Hash)

      { success: true, response: { data: data } }
    rescue JSON::ParserError => e
      failure("Invalid JSON: #{e.message}")
    end

    def missing_file_result
      failure("Criteria file #{path} does not exist")
    end

    # Ensures "dimensions" is an array of objects before any entry is read.
    def validate_shape(raw_dimensions)
      return failure('Criteria "dimensions" must be an array') unless raw_dimensions.is_a?(Array)
      return failure('Each criteria dimension must be a JSON object') unless raw_dimensions.all?(Hash)

      success
    end

    # Builds Dimension objects, falling back to the built-in description
    # (or an empty string) when the eval does not provide one.
    def build_dimensions(raw_dimensions)
      defaults = DEFAULT_DIMENSIONS.to_h { |d| [d.name, d] }

      raw_dimensions.map do |raw|
        name = raw['name'] || raw[:name]
        default = defaults[name]
        description = raw['description'] || raw[:description] || default&.description || ''

        Dimension.new(
          name: name,
          description: description,
          max_score: raw['max_score'] || raw[:max_score]
        )
      end
    end

    # Every max_score must be numeric and the weights must total exactly 100.
    def validate_dimensions(dimensions)
      invalid = invalid_dimensions(dimensions)
      return invalid_max_score_result(invalid) unless invalid.empty?

      total = dimensions.sum(&:max_score)
      return success if total == 100

      failure("Dimension max_scores must sum to 100, got #{total}")
    end

    def invalid_dimensions(dimensions)
      dimensions.reject { |d| d.max_score.is_a?(Numeric) }
    end

    def invalid_max_score_result(invalid)
      names = invalid.map(&:name).join(', ')
      failure("Dimensions missing or invalid max_score: #{names}")
    end

    # All names listed in DEFAULT_DIMENSIONS must be present in the eval.
    def validate_core_dimensions(dimensions)
      missing = DEFAULT_DIMENSIONS.map(&:name) - dimensions.map(&:name)
      return success if missing.empty?

      failure("missing required core dimensions: #{missing.join(', ')}")
    end

    def assign_attributes(data, dimensions)
      @context = data['context'] || data[:context] || ''
      @pass_threshold = [data['pass_threshold'], data[:pass_threshold]].compact.first || DEFAULT_PASS_THRESHOLD
      @minimum_delta = [data['minimum_delta'], data[:minimum_delta]].compact.first || DEFAULT_MINIMUM_DELTA
      @dimensions = dimensions
    end

    # Successful service response with an empty payload.
    def success
      { success: true, response: {} }
    end

    # Failed service response carrying +message+.
    def failure(message)
      { success: false, response: { error: { message: message } } }
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  # Computes baseline vs context deltas per dimension and determines verdict.
  #
  # Verdict is true when context score meets pass_threshold AND
  # the total delta meets minimum_delta.
  class DeltaReport
    attr_reader :deltas, :baseline_total, :context_total, :verdict, :baseline_scores, :context_scores, :criteria,
                :baseline_dimensions, :context_dimensions

    # Computes deltas and verdict from baseline and context judge responses.
    #
    # @param baseline [Hash] Baseline judge dimensions hash.
    # @param context [Hash] Context judge dimensions hash.
    # @param criteria [SkillBench::Criteria] The eval criteria with thresholds.
    # @return [Hash] Service response with delta_report or error.
    def self.call(baseline:, context:, criteria:)
      new(baseline:, context:, criteria:).call
    end

    # @param baseline [Hash] Baseline dimensions, keyed by dimension name; each
    #   value is read with [:score] / ['score'].
    # @param context [Hash] Context dimensions, same layout as +baseline+.
    # @param criteria [SkillBench::Criteria] Eval criteria (pass_threshold, minimum_delta).
    def initialize(baseline:, context:, criteria:)
      @baseline = baseline
      @context = context
      @criteria = criteria
      @deltas = {}
    end

    # Computes deltas and determines verdict.
    #
    # Any StandardError raised while computing is logged and returned as a
    # failed service response instead of propagating.
    #
    # @return [Hash] Service response with delta_report or error.
    def call
      return mismatch_result unless dimensions_match?

      @baseline_dimensions = deep_copy_dimensions(baseline)
      @context_dimensions = deep_copy_dimensions(context)
      @baseline_scores = extract_scores(baseline)
      @context_scores = extract_scores(context)
      compute_totals
      compute_deltas
      determine_verdict

      { success: true, response: { delta_report: self } }
    rescue StandardError => e
      SkillBench::ErrorLogger.log_error(e, 'DeltaReport Error')
      { success: false, response: { error: { message: e.message } } }
    end

    private

    attr_reader :baseline, :context

    # Both sides must score exactly the same set of dimension names.
    def dimensions_match?
      baseline.keys.sort == context.keys.sort
    end

    def mismatch_result
      { success: false, response: { error: { message: 'Baseline and context dimension names mismatch' } } }
    end

    def compute_totals
      @baseline_total = baseline_scores.values.sum
      @context_total = context_scores.values.sum
    end

    # Per-dimension difference: context score minus baseline score.
    def compute_deltas
      @deltas = baseline_scores.to_h { |name, base_score| [name, context_scores[name] - base_score] }
    end

    # Accepts both symbol- and string-keyed dimension hashes.
    def extract_score(dim)
      dim[:score] || dim['score']
    end

    def extract_scores(dimensions)
      dimensions.transform_values { |dim| extract_score(dim) }
    end

    # Snapshot of the dimensions that shares no mutable Hash/Array/String with
    # the caller's input, so later mutation of the input cannot alter the report.
    def deep_copy_dimensions(dimensions)
      dimensions.transform_values { |dim| deep_copy(dim) }
    end

    # Recursively copies hashes and arrays; other values get a plain dup.
    def deep_copy(value)
      case value
      when Hash then value.transform_values { |nested| deep_copy(nested) }
      when Array then value.map { |nested| deep_copy(nested) }
      else value.dup
      end
    end

    def determine_verdict
      @verdict = context_total >= criteria.pass_threshold && total_delta >= criteria.minimum_delta
    end

    def total_delta
      context_total - baseline_total
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  # Immutable-by-convention value object describing one scoring dimension.
  #
  # The judge scores agent output along dimensions such as correctness,
  # code quality, and skill adherence.
  class Dimension
    attr_reader :name, :description, :max_score

    # @param name [String] The machine-friendly identifier for the dimension.
    # @param description [String] Human-readable explanation of what the dimension measures.
    # @param max_score [Integer, nil] Maximum score this dimension can contribute. Nil in defaults.
    def initialize(name:, description:, max_score:)
      @name = name
      @description = description
      @max_score = max_score
    end

    # Value equality: another Dimension whose three attributes all compare equal.
    #
    # @param other [Object] The object to compare against.
    # @return [Boolean] true when all attributes match.
    def ==(other)
      return false unless other.is_a?(Dimension)

      %i[name description max_score].all? do |attribute|
        public_send(attribute) == other.public_send(attribute)
      end
    end
    alias_method :eql?, :==

    # Hash code derived from the same attributes used by #==, so equal
    # dimensions behave as the same Hash key.
    #
    # @return [Integer] The hash code.
    def hash
      [name, description, max_score].hash
    end
  end

  # Canonical dimensions used when eval authors do not override descriptions.
  # Weights (max_score) are nil here; the eval's criteria.json provides them.
  DEFAULT_DIMENSIONS = {
    'correctness' =>
      'Does the output fulfill the task requirements? Are all specified behaviors present and correct?',
    'skill_adherence' =>
      'Did the agent follow the specific patterns, hard gates, and workflows defined in the skill?',
    'code_quality' =>
      'Is the code clean, well-structured, free of smells, follows SRP, and avoids duplication?',
    'test_coverage' =>
      'Are there meaningful tests? Do they test the right things? Are they following TDD/best practices from the skill?',
    'documentation' =>
      'Is there adequate YARD documentation, clear intent, and helpful inline comments where needed?'
  }.map do |dimension_name, dimension_description|
    Dimension.new(name: dimension_name, description: dimension_description, max_score: nil)
  end.freeze
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  # Shared error logging mixin for service objects.
  # Logs error message and backtrace to Rails.logger or stderr.
  module ErrorLogger
    # Logs an error with message and backtrace (first five frames).
    #
    # When a Rails logger is available it always receives the entry. Otherwise
    # the entry goes to stderr, unless {#skip_stderr_output?} says stderr
    # should stay quiet.
    #
    # @param error [StandardError] The exception to log
    # @param prefix [String] Optional prefix for the log message
    # @return [void]
    def log_error(error, prefix = nil)
      message = prefix ? "#{prefix}: #{error.message}" : error.message
      backtrace = error.backtrace&.first(5)&.join("\n") || '(no backtrace)'

      if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
        Rails.logger.error(message)
        Rails.logger.error(backtrace)
      elsif !skip_stderr_output?
        # The skip only concerns stderr noise; it must not suppress the
        # Rails logger branch above.
        warn(message)
        warn(backtrace)
      end
    end

    # @return [Boolean] true when stderr should be skipped (test mode without explicit capture).
    def skip_stderr_output?
      return false unless defined?(Minitest)

      !$stderr.is_a?(StringIO)
    end

    module_function :log_error
    module_function :skip_stderr_output?
  end
end
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative 'runner'
|
|
5
|
+
require_relative 'services/option_parser_service'
|
|
6
|
+
require_relative 'services/result_printer_service'
|
|
7
|
+
require_relative 'services/output_persistence_service'
|
|
8
|
+
|
|
9
|
+
module SkillBench
  # Implements the `skill-bench run` CLI command.
  # Orchestrates option parsing, evaluation execution, result printing, and output persistence.
  # @deprecated Use {SkillBench::Cli::RunCommand} and {SkillBench::Services::RunnerService} instead.
  class EvaluateCommand
    # Parses arguments, runs the evaluator, prints the report, and records history.
    #
    # @param argv [Array<String>] Raw CLI arguments.
    # @param stdout [#puts, #write] Output stream for user-visible messages.
    # @return [Integer] Shell-compatible exit code.
    # @raise [OptionParser::ParseError] when invalid CLI flags are provided.
    # @raise [SystemCallError] if writing output fails.
    # @raise [ArgumentError] when --eval or --skill resolves outside the current
    #   working directory or is not accessible.
    def self.call(argv, stdout: $stdout)
      new(argv, stdout: stdout).call
    end

    # @param argv [Array<String>] Raw CLI arguments.
    # @param stdout [#puts, #write] Output stream for user-visible messages.
    def initialize(argv, stdout:)
      @argv = argv
      @stdout = stdout
      @options = nil
    end

    # Executes the command by orchestrating service objects.
    #
    # Returns 1 as soon as option parsing, option validation, the evaluation
    # itself, or output persistence reports failure; history is recorded only
    # after all of them succeed.
    #
    # @return [Integer] Shell-compatible exit code.
    # @raise [OptionParser::ParseError] when invalid CLI flags are provided.
    # @raise [SystemCallError] when the optional JSON output file cannot be written.
    # @raise [ArgumentError] when --eval or --skill resolves outside the current
    #   working directory or is not accessible.
    def call
      return 1 unless parse_options? && validate_options?

      result = run_evaluation
      return 1 unless result[:success]

      return 1 unless persist_output?(result)

      SkillBench::HistoryRecorder.record(
        result,
        source_path: result[:source_path],
        model: SkillBench::Config.model
      )

      0
    end

    private

    # Parses argv into @options; prints the parser's error and returns false on failure.
    def parse_options?
      options_result = Services::OptionParserService.call(@argv)
      @options = options_result[:response]

      unless options_result[:success]
        @stdout.puts "Error: #{@options[:error][:message]}"
        return false
      end

      true
    end

    # The --eval option is mandatory; prints usage guidance when it is absent.
    def validate_options?
      eval_path = @options[:eval]
      return true if eval_path

      @stdout.puts 'Error: The --eval option is required.'
      @stdout.puts 'Example: bin/evaluate -e evals/skills/infrastructure/rails-api-versioning/api-versioning-with-controller-inheritan'
      false
    end

    # Runs the evaluation for the validated paths and prints its result
    # (the printer is invoked for failed results as well).
    def run_evaluation
      skill_option = @options[:skill]
      eval_path = safe_expand_path(@options[:eval])
      skill_path = skill_option ? safe_expand_path(skill_option) : nil

      result = SkillBench::Runner.call(
        eval_folder_path: eval_path,
        skill_path: skill_path
      )
      Services::ResultPrinterService.call(result, stdout: @stdout)
      result
    end

    # Persists the report via the output service and echoes its message or error.
    def persist_output?(result)
      output_result = Services::OutputPersistenceService.call(result, output_path: @options[:output])
      output_response = output_result[:response]
      message = output_response[:message]

      if output_result[:success]
        @stdout.puts(message) if message
        true
      else
        @stdout.puts "Error saving report: #{output_response[:error][:message]}"
        false
      end
    end

    # Expands +path+ and refuses anything that resolves outside the current
    # working directory (symlinks are resolved when the target exists).
    #
    # @param path [String] User-supplied path.
    # @return [String] The expanded (not symlink-resolved) absolute path.
    # @raise [ArgumentError] when the path escapes the working directory or
    #   cannot be accessed.
    def safe_expand_path(path)
      expanded = File.expand_path(path)
      base = File.expand_path(Dir.pwd)

      real_expanded = File.exist?(expanded) ? File.realpath(expanded) : expanded
      real_base = File.realpath(base)

      relative = Pathname.new(real_expanded).relative_path_from(Pathname.new(real_base))
      # Compare the first path component rather than a string prefix so that
      # in-tree entries such as "..hidden/eval" are not mistaken for "../".
      if relative.each_filename.first == '..'
        raise ArgumentError, "Path '#{path}' resolves outside the current working directory"
      end

      expanded
    rescue Errno::ENOENT, Errno::EACCES => e
      raise ArgumentError, "Path '#{path}' is not accessible: #{e.message}"
    end
  end
end
|