ruby-skill-bench 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +299 -23
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +3 -0
- data/lib/skill_bench/clients/base_client.rb +14 -6
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +66 -15
- data/lib/skill_bench/execution/sandbox.rb +76 -14
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +172 -35
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +3 -3
- metadata +19 -36
|
@@ -1,29 +1,34 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'open3'
|
|
4
|
-
require 'timeout'
|
|
5
4
|
require 'shellwords'
|
|
6
5
|
require_relative '../config'
|
|
6
|
+
require_relative '../constants'
|
|
7
|
+
require_relative '../error_logger'
|
|
7
8
|
|
|
8
9
|
module SkillBench
|
|
9
10
|
module Tools
|
|
10
11
|
# Handles executing a shell command within the working directory.
|
|
12
|
+
#
|
|
13
|
+
# Real container isolation is not yet shipped, so an active sandbox means a
|
|
14
|
+
# temporary git directory on the host. To honor the documented security
|
|
15
|
+
# model the tool fails closed: when no container isolation is active it
|
|
16
|
+
# refuses to run unless `allow_host_execution` is explicitly enabled.
|
|
11
17
|
class RunCommand
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
].freeze
|
|
18
|
+
# Refusal returned when no container isolation is active and host execution
|
|
19
|
+
# has not been explicitly enabled. Deliberately omits the allowlist.
|
|
20
|
+
HOST_EXECUTION_REFUSED = 'Command execution refused: no sandbox isolation is active and ' \
|
|
21
|
+
"'allow_host_execution' is not enabled. Set \"allow_host_execution\": true in " \
|
|
22
|
+
'skill-bench.json to permit running commands directly on the host (NOT isolated).'
|
|
23
|
+
|
|
24
|
+
# Warning emitted when a command runs un-isolated on the host because
|
|
25
|
+
# `allow_host_execution` is enabled and no container is active.
|
|
26
|
+
HOST_EXECUTION_WARNING = 'Warning: running command directly on the host with NO sandbox isolation ' \
|
|
27
|
+
'(allow_host_execution is enabled). Commands are not isolated from your machine.'
|
|
28
|
+
|
|
29
|
+
# Seconds to wait after SIGTERM before escalating to SIGKILL when a command
|
|
30
|
+
# exceeds its execution deadline.
|
|
31
|
+
TERM_GRACE_PERIOD = 2
|
|
27
32
|
|
|
28
33
|
# @return [Hash] The tool definition for the LLM API.
|
|
29
34
|
def self.definition
|
|
@@ -49,41 +54,173 @@ module SkillBench
|
|
|
49
54
|
# Tokenizes the command string before execution so that arguments are passed
|
|
50
55
|
# directly to the OS without shell interpretation, preventing shell injection.
|
|
51
56
|
#
|
|
57
|
+
# Fails closed: when no container isolation is active (`container_id` is nil)
|
|
58
|
+
# and `allow_host_execution` is false, the command is refused and nothing
|
|
59
|
+
# runs. When host execution is explicitly allowed, a warning is emitted once
|
|
60
|
+
# per command before running un-isolated on the host.
|
|
61
|
+
#
|
|
52
62
|
# @param command [String] The command to run (e.g. "rspec spec/models").
|
|
53
63
|
# @param working_dir_path [Pathname] The host directory (ignored if container_id present).
|
|
54
64
|
# @param container_id [String, nil] The Docker container ID for isolated execution.
|
|
55
|
-
# @return [String] A formatted string containing the exit status, STDOUT, and STDERR
|
|
56
|
-
#
|
|
65
|
+
# @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
|
|
66
|
+
# or a standardized error/refusal message.
|
|
57
67
|
def self.call(command, working_dir_path, container_id = nil)
|
|
58
68
|
argv = command.shellsplit
|
|
59
69
|
return 'Error: Empty command.' if argv.empty?
|
|
60
70
|
|
|
61
71
|
base_cmd = argv.first
|
|
62
|
-
return "Error: Command '#{base_cmd}' is blocked for security reasons." if DANGEROUS_COMMANDS.include?(base_cmd)
|
|
72
|
+
return "Error: Command '#{base_cmd}' is blocked for security reasons." if Constants::Tools::DANGEROUS_COMMANDS.include?(base_cmd)
|
|
63
73
|
|
|
64
74
|
allowed = SkillBench::Config.allowed_commands
|
|
65
75
|
return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
|
|
66
76
|
return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)
|
|
67
77
|
|
|
78
|
+
return "Error: Command '#{base_cmd}' arguments are not permitted by the configured argument constraints." unless arguments_permitted?(base_cmd, argv)
|
|
79
|
+
|
|
80
|
+
return HOST_EXECUTION_REFUSED unless container_id || SkillBench::Config.allow_host_execution
|
|
81
|
+
|
|
82
|
+
warn_unisolated_host_execution unless container_id
|
|
83
|
+
execute(argv, working_dir_path, container_id)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Checks the command's arguments against the optional, per-command
|
|
87
|
+
# argument constraints from configuration.
|
|
88
|
+
#
|
|
89
|
+
# This is a default-off seam: the command allowlist remains the primary
|
|
90
|
+
# authorization control, and any allowlisted wrapper binary still grants
|
|
91
|
+
# broad host execution. When no constraints are configured (the default),
|
|
92
|
+
# or none apply to +base_cmd+, every argument is permitted so behavior is
|
|
93
|
+
# unchanged. When a constraint exists for +base_cmd+, the command is
|
|
94
|
+
# refused if any argument contains a disallowed substring/flag.
|
|
95
|
+
#
|
|
96
|
+
# @param base_cmd [String] The base command (first token of the command).
|
|
97
|
+
# @param argv [Array<String>] The tokenized command and arguments.
|
|
98
|
+
# @return [Boolean] true when the arguments are permitted to run.
|
|
99
|
+
def self.arguments_permitted?(base_cmd, argv)
|
|
100
|
+
constraints = SkillBench::Config.command_argument_constraints
|
|
101
|
+
return true if constraints.nil? || constraints.empty?
|
|
102
|
+
|
|
103
|
+
# Constraint keys may be strings (facade API) or symbols (loaded from
|
|
104
|
+
# JSON via symbolize_names), so look the command up under both.
|
|
105
|
+
disallowed = constraints[base_cmd] || constraints[base_cmd.to_sym]
|
|
106
|
+
return true if disallowed.nil? || disallowed.empty?
|
|
107
|
+
|
|
108
|
+
argv.drop(1).none? { |arg| disallowed.any? { |bad| arg.include?(bad.to_s) } }
|
|
109
|
+
end
|
|
110
|
+
private_class_method :arguments_permitted?
|
|
111
|
+
|
|
112
|
+
# Runs the resolved command and formats its result, enforcing the
|
|
113
|
+
# configured execution timeout.
|
|
114
|
+
#
|
|
115
|
+
# The command is spawned in its own process group so that, on timeout, the
|
|
116
|
+
# whole group (the command and any children it forked) can be signalled —
|
|
117
|
+
# something `Timeout.timeout` around `Open3.capture3` could not do, because
|
|
118
|
+
# `capture3`'s `ensure` blocks on `wait_thr.value` and never signals the
|
|
119
|
+
# child.
|
|
120
|
+
#
|
|
121
|
+
# @param argv [Array<String>] The tokenized command and arguments.
|
|
122
|
+
# @param working_dir_path [Pathname] The host directory for host execution.
|
|
123
|
+
# @param container_id [String, nil] The Docker container ID for isolated execution.
|
|
124
|
+
# @return [String] Formatted exit status, STDOUT, and STDERR, or a timeout message.
|
|
125
|
+
def self.execute(argv, working_dir_path, container_id)
|
|
68
126
|
max_time = SkillBench::Config.max_execution_time
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
127
|
+
command, spawn_opts = resolve_invocation(argv, working_dir_path, container_id)
|
|
128
|
+
result = capture(command, spawn_opts, max_time)
|
|
129
|
+
return "Error: Command execution timed out after #{max_time} seconds." if result == :timed_out
|
|
130
|
+
|
|
131
|
+
stdout_str, stderr_str, status = result
|
|
132
|
+
format_result(status, stdout_str, stderr_str)
|
|
133
|
+
end
|
|
134
|
+
private_class_method :execute
|
|
135
|
+
|
|
136
|
+
# Formats the captured command output into the standard result string.
|
|
137
|
+
#
|
|
138
|
+
# @param status [Process::Status] The exit status of the command.
|
|
139
|
+
# @param stdout_str [String] The captured standard output.
|
|
140
|
+
# @param stderr_str [String] The captured standard error.
|
|
141
|
+
# @return [String] Formatted exit status, STDOUT, and STDERR.
|
|
142
|
+
def self.format_result(status, stdout_str, stderr_str)
|
|
143
|
+
<<~RESULT
|
|
144
|
+
Exit Status: #{status.exitstatus}
|
|
145
|
+
STDOUT:
|
|
146
|
+
#{stdout_str}
|
|
147
|
+
STDERR:
|
|
148
|
+
#{stderr_str}
|
|
149
|
+
RESULT
|
|
150
|
+
end
|
|
151
|
+
private_class_method :format_result
|
|
152
|
+
|
|
153
|
+
# Builds the command array and spawn options for either container or host
|
|
154
|
+
# execution. Both run in their own process group (`pgroup: true`) so the
|
|
155
|
+
# watchdog can kill the whole group on timeout.
|
|
156
|
+
#
|
|
157
|
+
# @param argv [Array<String>] The tokenized command and arguments.
|
|
158
|
+
# @param working_dir_path [Pathname] The host directory for host execution.
|
|
159
|
+
# @param container_id [String, nil] The Docker container ID for isolated execution.
|
|
160
|
+
# @return [Array(Array<String>, Hash)] The full command array and spawn options.
|
|
161
|
+
def self.resolve_invocation(argv, working_dir_path, container_id)
|
|
162
|
+
return [['docker', 'exec', '-w', '/sandbox', container_id, *argv], { pgroup: true }] if container_id
|
|
163
|
+
|
|
164
|
+
[argv, { chdir: working_dir_path.to_s, pgroup: true }]
|
|
165
|
+
end
|
|
166
|
+
private_class_method :resolve_invocation
|
|
167
|
+
|
|
168
|
+
# Spawns the command, draining STDOUT/STDERR on separate threads so a chatty
|
|
169
|
+
# or hung child never deadlocks the reader, and enforces the deadline with a
|
|
170
|
+
# watchdog that kills the process group when the command overruns.
|
|
171
|
+
#
|
|
172
|
+
# @param command [Array<String>] The full command array (no shell).
|
|
173
|
+
# @param spawn_opts [Hash] Options passed to the spawner (includes `pgroup`).
|
|
174
|
+
# @param max_time [Integer] Maximum execution time in seconds.
|
|
175
|
+
# @return [Array(String, String, Process::Status), Symbol] STDOUT, STDERR, and
|
|
176
|
+
# status on completion, or `:timed_out` when the deadline is exceeded.
|
|
177
|
+
def self.capture(command, spawn_opts, max_time)
|
|
178
|
+
Open3.popen3(*command, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
|
|
179
|
+
stdin.close
|
|
180
|
+
readers = [Thread.new { stdout.read }, Thread.new { stderr.read }]
|
|
181
|
+
completed = wait_thr.join(max_time)
|
|
182
|
+
terminate_process_group(wait_thr) unless completed
|
|
183
|
+
stdout_str, stderr_str = readers.map(&:value)
|
|
184
|
+
completed ? [stdout_str, stderr_str, wait_thr.value] : :timed_out
|
|
83
185
|
end
|
|
84
|
-
rescue Timeout::Error
|
|
85
|
-
"Error: Command execution timed out after #{max_time} seconds."
|
|
86
186
|
end
|
|
187
|
+
private_class_method :capture
|
|
188
|
+
|
|
189
|
+
# Terminates the command's entire process group: SIGTERM first, then SIGKILL
|
|
190
|
+
# after a short grace period if it has not exited. Signalling the negated
|
|
191
|
+
# process group id reaches the command and any children it forked.
|
|
192
|
+
#
|
|
193
|
+
# @param wait_thr [Process::Waiter] The wait thread for the spawned process group leader.
|
|
194
|
+
# @return [void]
|
|
195
|
+
def self.terminate_process_group(wait_thr)
|
|
196
|
+
pgid = wait_thr.pid
|
|
197
|
+
signal_group('TERM', pgid)
|
|
198
|
+
signal_group('KILL', pgid) unless wait_thr.join(TERM_GRACE_PERIOD)
|
|
199
|
+
end
|
|
200
|
+
private_class_method :terminate_process_group
|
|
201
|
+
|
|
202
|
+
# Sends a signal to a whole process group, ignoring an already-exited group.
|
|
203
|
+
#
|
|
204
|
+
# @param signal [String] The signal name (e.g. "TERM", "KILL").
|
|
205
|
+
# @param pgid [Integer] The process group id (leader pid) to signal.
|
|
206
|
+
# @return [void]
|
|
207
|
+
def self.signal_group(signal, pgid)
|
|
208
|
+
Process.kill(signal, -pgid)
|
|
209
|
+
rescue Errno::ESRCH
|
|
210
|
+
nil
|
|
211
|
+
end
|
|
212
|
+
private_class_method :signal_group
|
|
213
|
+
|
|
214
|
+
# Emits a single warning that the command will run un-isolated on the host,
|
|
215
|
+
# honoring the test-suite stderr suppression convention.
|
|
216
|
+
#
|
|
217
|
+
# @return [void]
|
|
218
|
+
def self.warn_unisolated_host_execution
|
|
219
|
+
return if SkillBench::ErrorLogger.skip_stderr_output?
|
|
220
|
+
|
|
221
|
+
warn(HOST_EXECUTION_WARNING)
|
|
222
|
+
end
|
|
223
|
+
private_class_method :warn_unisolated_host_execution
|
|
87
224
|
end
|
|
88
225
|
end
|
|
89
226
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
4
|
require 'pathname'
|
|
5
|
+
require 'fileutils'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
7
8
|
class TrendTracker
|
|
@@ -27,23 +28,24 @@ module SkillBench
|
|
|
27
28
|
[]
|
|
28
29
|
end
|
|
29
30
|
|
|
30
|
-
# Writes history to file
|
|
31
|
-
#
|
|
31
|
+
# Writes history to file atomically, snapshotting the previous good
|
|
32
|
+
# version into the backup first.
|
|
33
|
+
#
|
|
34
|
+
# The existing history file (if any) is copied to +#{history_file}.bak+
|
|
35
|
+
# before the new content is written, so the backup always holds the
|
|
36
|
+
# previous good version rather than a duplicate of the current file. The
|
|
37
|
+
# new content is serialized once and written via a temp-file + rename so
|
|
38
|
+
# the main file is never left partially written. Returns a result hash so
|
|
39
|
+
# callers do not need to rescue SystemCallError.
|
|
32
40
|
#
|
|
33
41
|
# @param history [Array<Hash>] History entries to write
|
|
34
42
|
# @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
|
|
35
43
|
def write(history)
|
|
36
|
-
|
|
44
|
+
backup_previous_version
|
|
37
45
|
temp_file = "#{history_file}.tmp"
|
|
38
|
-
File.write(temp_file, json)
|
|
46
|
+
File.write(temp_file, JSON.pretty_generate(history))
|
|
39
47
|
File.rename(temp_file, history_file)
|
|
40
48
|
|
|
41
|
-
begin
|
|
42
|
-
File.write("#{history_file}.bak", json)
|
|
43
|
-
rescue SystemCallError => e
|
|
44
|
-
warn "Backup write failed for #{history_file}: #{e.message}"
|
|
45
|
-
end
|
|
46
|
-
|
|
47
49
|
{ success: true }
|
|
48
50
|
rescue SystemCallError => e
|
|
49
51
|
{ success: false, error: { message: e.message } }
|
|
@@ -53,6 +55,21 @@ module SkillBench
|
|
|
53
55
|
|
|
54
56
|
attr_reader :history_file
|
|
55
57
|
|
|
58
|
+
# Copies the current history file to the backup path so the backup keeps
|
|
59
|
+
# the previous good version. No-op on the first run when no history file
|
|
60
|
+
# exists yet. A failed copy is non-fatal: it warns and lets the main
|
|
61
|
+
# write proceed.
|
|
62
|
+
#
|
|
63
|
+
# @return [void]
|
|
64
|
+
def backup_previous_version
|
|
65
|
+
source = history_file
|
|
66
|
+
return unless File.exist?(source)
|
|
67
|
+
|
|
68
|
+
FileUtils.cp(source, "#{source}.bak")
|
|
69
|
+
rescue SystemCallError => e
|
|
70
|
+
warn "Backup copy failed for #{source}: #{e.message}"
|
|
71
|
+
end
|
|
72
|
+
|
|
56
73
|
# Reads backup file if it exists
|
|
57
74
|
#
|
|
58
75
|
# @return [Array<Hash>, nil] Backup data or nil if unavailable
|
|
@@ -17,9 +17,9 @@ module SkillBench
|
|
|
17
17
|
# Records an evaluation result.
|
|
18
18
|
#
|
|
19
19
|
# @param result [Hash] The evaluation result from EvaluationRunner.
|
|
20
|
+
# @param history [Array<Hash>] Pre-loaded history to append to; defaults to a fresh load.
|
|
20
21
|
# @return [Hash] Service response.
|
|
21
|
-
def record(result)
|
|
22
|
-
history = @persistence.load
|
|
22
|
+
def record(result, history = @persistence.load)
|
|
23
23
|
history << extract_entry(result)
|
|
24
24
|
write_result = @persistence.write(history)
|
|
25
25
|
|
|
@@ -41,11 +41,11 @@ module SkillBench
|
|
|
41
41
|
# Computes the trend of the given result against the most recent matching history entry.
|
|
42
42
|
#
|
|
43
43
|
# @param result [Hash] The current evaluation result.
|
|
44
|
+
# @param history [Array<Hash>] Pre-loaded history to compare against; defaults to a fresh load.
|
|
44
45
|
# @return [Hash, nil] Trend data or nil if no matching history exists.
|
|
45
|
-
def trend_for(result)
|
|
46
|
-
entries = @persistence.load
|
|
46
|
+
def trend_for(result, history = @persistence.load)
|
|
47
47
|
current = extract_entry(result)
|
|
48
|
-
TrendCalculator.compute_trend(entries, current)
|
|
48
|
+
TrendCalculator.compute_trend(history, current)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
private
|
data/lib/skill_bench/version.rb
CHANGED
data/lib/skill_bench.rb
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
# Core modules
|
|
10
10
|
require_relative 'skill_bench/version'
|
|
11
|
+
require_relative 'skill_bench/constants'
|
|
11
12
|
require_relative 'skill_bench/dimension'
|
|
12
13
|
require_relative 'skill_bench/criteria'
|
|
13
14
|
require_relative 'skill_bench/delta_report'
|
|
@@ -72,6 +73,8 @@ require_relative 'skill_bench/commands/eval_new'
|
|
|
72
73
|
|
|
73
74
|
# Services
|
|
74
75
|
require_relative 'skill_bench/services/runner_service'
|
|
76
|
+
require_relative 'skill_bench/services/batch_runner_service'
|
|
77
|
+
require_relative 'skill_bench/services/summary_formatter'
|
|
75
78
|
require_relative 'skill_bench/services/template_registry'
|
|
76
79
|
|
|
77
80
|
# Tools
|
|
@@ -87,9 +90,6 @@ require_relative 'skill_bench/trend_tracker'
|
|
|
87
90
|
require_relative 'skill_bench/trend_tracker/persistence'
|
|
88
91
|
require_relative 'skill_bench/trend_tracker/trend_calculator'
|
|
89
92
|
|
|
90
|
-
# Rails integrations
|
|
91
|
-
require_relative 'skill_bench/rails/skill_templates'
|
|
92
|
-
|
|
93
93
|
# Migration utilities
|
|
94
94
|
require_relative 'skill_bench/migration/provider_migrator'
|
|
95
95
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-skill-bench
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.1
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ismael Marin
|
|
@@ -9,48 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
-
- !ruby/object:Gem::Dependency
|
|
13
|
-
name: activesupport
|
|
14
|
-
requirement: !ruby/object:Gem::Requirement
|
|
15
|
-
requirements:
|
|
16
|
-
- - ">="
|
|
17
|
-
- !ruby/object:Gem::Version
|
|
18
|
-
version: '6.0'
|
|
19
|
-
type: :runtime
|
|
20
|
-
prerelease: false
|
|
21
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
-
requirements:
|
|
23
|
-
- - ">="
|
|
24
|
-
- !ruby/object:Gem::Version
|
|
25
|
-
version: '6.0'
|
|
26
12
|
- !ruby/object:Gem::Dependency
|
|
27
13
|
name: cgi
|
|
28
14
|
requirement: !ruby/object:Gem::Requirement
|
|
29
15
|
requirements:
|
|
30
16
|
- - "~>"
|
|
31
17
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 0.5.1
|
|
33
|
-
type: :runtime
|
|
34
|
-
prerelease: false
|
|
35
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
-
requirements:
|
|
37
|
-
- - "~>"
|
|
38
|
-
- !ruby/object:Gem::Version
|
|
39
|
-
version: 0.5.1
|
|
40
|
-
- !ruby/object:Gem::Dependency
|
|
41
|
-
name: dotenv
|
|
42
|
-
requirement: !ruby/object:Gem::Requirement
|
|
43
|
-
requirements:
|
|
44
|
-
- - "~>"
|
|
45
|
-
- !ruby/object:Gem::Version
|
|
46
|
-
version: 3.2.0
|
|
18
|
+
version: 0.5.2
|
|
47
19
|
type: :runtime
|
|
48
20
|
prerelease: false
|
|
49
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
50
22
|
requirements:
|
|
51
23
|
- - "~>"
|
|
52
24
|
- !ruby/object:Gem::Version
|
|
53
|
-
version: 3.2.0
|
|
25
|
+
version: 0.5.2
|
|
54
26
|
- !ruby/object:Gem::Dependency
|
|
55
27
|
name: faraday
|
|
56
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -71,28 +43,28 @@ dependencies:
|
|
|
71
43
|
requirements:
|
|
72
44
|
- - "~>"
|
|
73
45
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: '2.
|
|
46
|
+
version: '2.20'
|
|
75
47
|
type: :runtime
|
|
76
48
|
prerelease: false
|
|
77
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
78
50
|
requirements:
|
|
79
51
|
- - "~>"
|
|
80
52
|
- !ruby/object:Gem::Version
|
|
81
|
-
version: '2.
|
|
53
|
+
version: '2.20'
|
|
82
54
|
- !ruby/object:Gem::Dependency
|
|
83
55
|
name: parallel
|
|
84
56
|
requirement: !ruby/object:Gem::Requirement
|
|
85
57
|
requirements:
|
|
86
58
|
- - "~>"
|
|
87
59
|
- !ruby/object:Gem::Version
|
|
88
|
-
version:
|
|
60
|
+
version: 2.0.0
|
|
89
61
|
type: :runtime
|
|
90
62
|
prerelease: false
|
|
91
63
|
version_requirements: !ruby/object:Gem::Requirement
|
|
92
64
|
requirements:
|
|
93
65
|
- - "~>"
|
|
94
66
|
- !ruby/object:Gem::Version
|
|
95
|
-
version:
|
|
67
|
+
version: 2.0.0
|
|
96
68
|
description: |
|
|
97
69
|
ruby-skill-bench orchestrates evaluation runs of AI coding agents
|
|
98
70
|
inside isolated git sandboxes, then scores the results using deterministic
|
|
@@ -119,6 +91,7 @@ files:
|
|
|
119
91
|
- lib/skill_bench/agent/runner.rb
|
|
120
92
|
- lib/skill_bench/agent/summary.rb
|
|
121
93
|
- lib/skill_bench/cli.rb
|
|
94
|
+
- lib/skill_bench/cli/batch_result_printer.rb
|
|
122
95
|
- lib/skill_bench/cli/compare_command.rb
|
|
123
96
|
- lib/skill_bench/cli/eval/eval_command_registry.rb
|
|
124
97
|
- lib/skill_bench/cli/eval/eval_commands.rb
|
|
@@ -129,9 +102,11 @@ files:
|
|
|
129
102
|
- lib/skill_bench/cli/result_printer.rb
|
|
130
103
|
- lib/skill_bench/cli/run_command.rb
|
|
131
104
|
- lib/skill_bench/cli/skill_command.rb
|
|
105
|
+
- lib/skill_bench/cli/validate_command.rb
|
|
132
106
|
- lib/skill_bench/client.rb
|
|
133
107
|
- lib/skill_bench/clients/all.rb
|
|
134
108
|
- lib/skill_bench/clients/base_client.rb
|
|
109
|
+
- lib/skill_bench/clients/base_url_validator.rb
|
|
135
110
|
- lib/skill_bench/clients/provider_config.rb
|
|
136
111
|
- lib/skill_bench/clients/provider_registry.rb
|
|
137
112
|
- lib/skill_bench/clients/provider_schemas.rb
|
|
@@ -140,6 +115,7 @@ files:
|
|
|
140
115
|
- lib/skill_bench/clients/providers/deepseek.rb
|
|
141
116
|
- lib/skill_bench/clients/providers/gemini.rb
|
|
142
117
|
- lib/skill_bench/clients/providers/groq.rb
|
|
118
|
+
- lib/skill_bench/clients/providers/mistral.rb
|
|
143
119
|
- lib/skill_bench/clients/providers/mock.rb
|
|
144
120
|
- lib/skill_bench/clients/providers/null_client.rb
|
|
145
121
|
- lib/skill_bench/clients/providers/ollama.rb
|
|
@@ -147,6 +123,7 @@ files:
|
|
|
147
123
|
- lib/skill_bench/clients/providers/opencode.rb
|
|
148
124
|
- lib/skill_bench/clients/providers/openrouter.rb
|
|
149
125
|
- lib/skill_bench/clients/request_builder.rb
|
|
126
|
+
- lib/skill_bench/clients/response_builder.rb
|
|
150
127
|
- lib/skill_bench/clients/response_error_handler.rb
|
|
151
128
|
- lib/skill_bench/clients/response_parser.rb
|
|
152
129
|
- lib/skill_bench/clients/retry_handler.rb
|
|
@@ -162,6 +139,7 @@ files:
|
|
|
162
139
|
- lib/skill_bench/config/facade_writers.rb
|
|
163
140
|
- lib/skill_bench/config/json_loader.rb
|
|
164
141
|
- lib/skill_bench/config/store.rb
|
|
142
|
+
- lib/skill_bench/constants.rb
|
|
165
143
|
- lib/skill_bench/criteria.rb
|
|
166
144
|
- lib/skill_bench/delta_report.rb
|
|
167
145
|
- lib/skill_bench/dimension.rb
|
|
@@ -196,16 +174,19 @@ files:
|
|
|
196
174
|
- lib/skill_bench/registry/pack_resolver.rb
|
|
197
175
|
- lib/skill_bench/runner.rb
|
|
198
176
|
- lib/skill_bench/services/agent_spawner_service.rb
|
|
177
|
+
- lib/skill_bench/services/batch_runner_service.rb
|
|
199
178
|
- lib/skill_bench/services/compare_option_parser.rb
|
|
200
179
|
- lib/skill_bench/services/comparison_reporter.rb
|
|
201
180
|
- lib/skill_bench/services/comparison_runner.rb
|
|
202
181
|
- lib/skill_bench/services/context_loader_service.rb
|
|
182
|
+
- lib/skill_bench/services/cost_calculator.rb
|
|
203
183
|
- lib/skill_bench/services/delta_table_formatter.rb
|
|
204
184
|
- lib/skill_bench/services/error_response_builder.rb
|
|
205
185
|
- lib/skill_bench/services/eval_resolver.rb
|
|
206
186
|
- lib/skill_bench/services/exit_code_calculator.rb
|
|
207
187
|
- lib/skill_bench/services/feedback_generator.rb
|
|
208
188
|
- lib/skill_bench/services/formatting_helpers.rb
|
|
189
|
+
- lib/skill_bench/services/html_formatter.rb
|
|
209
190
|
- lib/skill_bench/services/iteration_formatter.rb
|
|
210
191
|
- lib/skill_bench/services/json_formatter.rb
|
|
211
192
|
- lib/skill_bench/services/judge_params_builder.rb
|
|
@@ -217,11 +198,13 @@ files:
|
|
|
217
198
|
- lib/skill_bench/services/output_persistence_service.rb
|
|
218
199
|
- lib/skill_bench/services/prompt_builder_service.rb
|
|
219
200
|
- lib/skill_bench/services/provider_resolver.rb
|
|
201
|
+
- lib/skill_bench/services/response_cache.rb
|
|
220
202
|
- lib/skill_bench/services/result_printer_service.rb
|
|
221
203
|
- lib/skill_bench/services/runner_service.rb
|
|
222
204
|
- lib/skill_bench/services/skill_resolver.rb
|
|
223
205
|
- lib/skill_bench/services/skill_resolver_service.rb
|
|
224
206
|
- lib/skill_bench/services/source_path_resolver_service.rb
|
|
207
|
+
- lib/skill_bench/services/summary_formatter.rb
|
|
225
208
|
- lib/skill_bench/services/template_registry.rb
|
|
226
209
|
- lib/skill_bench/services/template_registry/category_data.rb
|
|
227
210
|
- lib/skill_bench/services/trend_recorder_service.rb
|
|
@@ -262,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
262
245
|
- !ruby/object:Gem::Version
|
|
263
246
|
version: '0'
|
|
264
247
|
requirements: []
|
|
265
|
-
rubygems_version:
|
|
248
|
+
rubygems_version: 3.6.9
|
|
266
249
|
specification_version: 4
|
|
267
250
|
summary: The evaluation engine for AI Agent Skills benchmarking.
|
|
268
251
|
test_files: []
|