ruby-skill-bench 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/README.md +299 -23
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/agent/react_agent.rb +2 -1
  9. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  10. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  11. data/lib/skill_bench/cli/help_printer.rb +10 -2
  12. data/lib/skill_bench/cli/init_command.rb +2 -1
  13. data/lib/skill_bench/cli/result_printer.rb +1 -1
  14. data/lib/skill_bench/cli/run_command.rb +47 -9
  15. data/lib/skill_bench/cli/validate_command.rb +242 -0
  16. data/lib/skill_bench/cli.rb +3 -0
  17. data/lib/skill_bench/client.rb +43 -1
  18. data/lib/skill_bench/clients/all.rb +3 -0
  19. data/lib/skill_bench/clients/base_client.rb +14 -6
  20. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  21. data/lib/skill_bench/clients/provider_config.rb +34 -1
  22. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  23. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  24. data/lib/skill_bench/clients/request_builder.rb +2 -4
  25. data/lib/skill_bench/clients/response_builder.rb +91 -0
  26. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  27. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  28. data/lib/skill_bench/commands/init.rb +5 -0
  29. data/lib/skill_bench/commands/skill_new.rb +3 -1
  30. data/lib/skill_bench/config/applier.rb +2 -0
  31. data/lib/skill_bench/config/defaults.rb +2 -0
  32. data/lib/skill_bench/config/facade_readers.rb +7 -0
  33. data/lib/skill_bench/config/facade_writers.rb +17 -0
  34. data/lib/skill_bench/config/json_loader.rb +1 -1
  35. data/lib/skill_bench/config/store.rb +29 -0
  36. data/lib/skill_bench/config.rb +18 -0
  37. data/lib/skill_bench/constants.rb +58 -0
  38. data/lib/skill_bench/evaluation/runner.rb +20 -3
  39. data/lib/skill_bench/execution/context_hydrator.rb +66 -15
  40. data/lib/skill_bench/execution/sandbox.rb +76 -14
  41. data/lib/skill_bench/judge/judge.rb +4 -0
  42. data/lib/skill_bench/judge/prompt.rb +42 -6
  43. data/lib/skill_bench/models/config.rb +32 -0
  44. data/lib/skill_bench/output_formatter.rb +60 -1
  45. data/lib/skill_bench/package_verifier.rb +1 -1
  46. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  47. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  48. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  49. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  50. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  51. data/lib/skill_bench/services/html_formatter.rb +289 -0
  52. data/lib/skill_bench/services/json_formatter.rb +19 -1
  53. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  54. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  55. data/lib/skill_bench/services/response_cache.rb +130 -0
  56. data/lib/skill_bench/services/runner_service.rb +88 -4
  57. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  58. data/lib/skill_bench/services/template_registry.rb +43 -9
  59. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  60. data/lib/skill_bench/tools/registry.rb +29 -3
  61. data/lib/skill_bench/tools/run_command.rb +172 -35
  62. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  63. data/lib/skill_bench/trend_tracker.rb +5 -5
  64. data/lib/skill_bench/version.rb +1 -1
  65. data/lib/skill_bench.rb +3 -3
  66. metadata +19 -36
data/lib/skill_bench/tools/run_command.rb CHANGED
@@ -1,29 +1,34 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'open3'
4
- require 'timeout'
5
4
  require 'shellwords'
6
5
  require_relative '../config'
6
+ require_relative '../constants'
7
+ require_relative '../error_logger'
7
8
 
8
9
  module SkillBench
9
10
  module Tools
10
11
  # Handles executing a shell command within the working directory.
12
+ #
13
+ # Real container isolation is not yet shipped, so an active sandbox means a
14
+ # temporary git directory on the host. To honor the documented security
15
+ # model the tool fails closed: when no container isolation is active it
16
+ # refuses to run unless `allow_host_execution` is explicitly enabled.
11
17
  class RunCommand
12
- # Commands that are always blocked even if listed in allowed_commands,
13
- # because they can be used to escape the sandbox or execute arbitrary code.
14
- DANGEROUS_COMMANDS = %w[
15
- bash sh zsh fish dash ksh csh tcsh
16
- python python3 python2 ruby perl node
17
- php lua tcl wish
18
- curl wget nc ncat socat
19
- eval exec
20
- sudo su doas
21
- chmod chown mount umount
22
- dd mkfs fdisk parted
23
- insmod rmmod modprobe
24
- systemctl service
25
- passwd useradd userdel groupadd groupdel
26
- ].freeze
18
+ # Refusal returned when no container isolation is active and host execution
19
+ # has not been explicitly enabled. Deliberately omits the allowlist.
20
+ HOST_EXECUTION_REFUSED = 'Command execution refused: no sandbox isolation is active and ' \
21
+ "'allow_host_execution' is not enabled. Set \"allow_host_execution\": true in " \
22
+ 'skill-bench.json to permit running commands directly on the host (NOT isolated).'
23
+
24
+ # Warning emitted when a command runs un-isolated on the host because
25
+ # `allow_host_execution` is enabled and no container is active.
26
+ HOST_EXECUTION_WARNING = 'Warning: running command directly on the host with NO sandbox isolation ' \
27
+ '(allow_host_execution is enabled). Commands are not isolated from your machine.'
28
+
29
+ # Seconds to wait after SIGTERM before escalating to SIGKILL when a command
30
+ # exceeds its execution deadline.
31
+ TERM_GRACE_PERIOD = 2
27
32
 
28
33
  # @return [Hash] The tool definition for the LLM API.
29
34
  def self.definition
@@ -49,41 +54,173 @@ module SkillBench
49
54
  # Tokenizes the command string before execution so that arguments are passed
50
55
  # directly to the OS without shell interpretation, preventing shell injection.
51
56
  #
57
+ # Fails closed: when no container isolation is active (`container_id` is nil)
58
+ # and `allow_host_execution` is false, the command is refused and nothing
59
+ # runs. When host execution is explicitly allowed, a warning is emitted once
60
+ # per command before running un-isolated on the host.
61
+ #
52
62
  # @param command [String] The command to run (e.g. "rspec spec/models").
53
63
  # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
54
64
  # @param container_id [String, nil] The Docker container ID for isolated execution.
55
- # @return [String] A formatted string containing the exit status, STDOUT, and STDERR.
56
- # @raise [Timeout::Error] Internally rescued; returns a timeout message string.
65
+ # @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
66
+ # or a standardized error/refusal message.
57
67
  def self.call(command, working_dir_path, container_id = nil)
58
68
  argv = command.shellsplit
59
69
  return 'Error: Empty command.' if argv.empty?
60
70
 
61
71
  base_cmd = argv.first
62
- return "Error: Command '#{base_cmd}' is blocked for security reasons." if DANGEROUS_COMMANDS.include?(base_cmd)
72
+ return "Error: Command '#{base_cmd}' is blocked for security reasons." if Constants::Tools::DANGEROUS_COMMANDS.include?(base_cmd)
63
73
 
64
74
  allowed = SkillBench::Config.allowed_commands
65
75
  return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
66
76
  return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)
67
77
 
78
+ return "Error: Command '#{base_cmd}' arguments are not permitted by the configured argument constraints." unless arguments_permitted?(base_cmd, argv)
79
+
80
+ return HOST_EXECUTION_REFUSED unless container_id || SkillBench::Config.allow_host_execution
81
+
82
+ warn_unisolated_host_execution unless container_id
83
+ execute(argv, working_dir_path, container_id)
84
+ end
85
+
86
+ # Checks the command's arguments against the optional, per-command
87
+ # argument constraints from configuration.
88
+ #
89
+ # This is a default-off seam: the command allowlist remains the primary
90
+ # authorization control, and any allowlisted wrapper binary still grants
91
+ # broad host execution. When no constraints are configured (the default),
92
+ # or none apply to +base_cmd+, every argument is permitted so behavior is
93
+ # unchanged. When a constraint exists for +base_cmd+, the command is
94
+ # refused if any argument contains a disallowed substring/flag.
95
+ #
96
+ # @param base_cmd [String] The base command (first token of the command).
97
+ # @param argv [Array<String>] The tokenized command and arguments.
98
+ # @return [Boolean] true when the arguments are permitted to run.
99
+ def self.arguments_permitted?(base_cmd, argv)
100
+ constraints = SkillBench::Config.command_argument_constraints
101
+ return true if constraints.nil? || constraints.empty?
102
+
103
+ # Constraint keys may be strings (facade API) or symbols (loaded from
104
+ # JSON via symbolize_names), so look the command up under both.
105
+ disallowed = constraints[base_cmd] || constraints[base_cmd.to_sym]
106
+ return true if disallowed.nil? || disallowed.empty?
107
+
108
+ argv.drop(1).none? { |arg| disallowed.any? { |bad| arg.include?(bad.to_s) } }
109
+ end
110
+ private_class_method :arguments_permitted?
111
+
112
+ # Runs the resolved command and formats its result, enforcing the
113
+ # configured execution timeout.
114
+ #
115
+ # The command is spawned in its own process group so that, on timeout, the
116
+ # whole group (the command and any children it forked) can be signalled —
117
+ # something `Timeout.timeout` around `Open3.capture3` could not do, because
118
+ # `capture3`'s `ensure` blocks on `wait_thr.value` and never signals the
119
+ # child.
120
+ #
121
+ # @param argv [Array<String>] The tokenized command and arguments.
122
+ # @param working_dir_path [Pathname] The host directory for host execution.
123
+ # @param container_id [String, nil] The Docker container ID for isolated execution.
124
+ # @return [String] Formatted exit status, STDOUT, and STDERR, or a timeout message.
125
+ def self.execute(argv, working_dir_path, container_id)
68
126
  max_time = SkillBench::Config.max_execution_time
69
- Timeout.timeout(max_time) do
70
- stdout_str, stderr_str, status = if container_id
71
- docker_cmd = ['docker', 'exec', '-w', '/sandbox', container_id] + argv
72
- Open3.capture3(*docker_cmd)
73
- else
74
- Open3.capture3(*argv, chdir: working_dir_path.to_s)
75
- end
76
- <<~RESULT
77
- Exit Status: #{status.exitstatus}
78
- STDOUT:
79
- #{stdout_str}
80
- STDERR:
81
- #{stderr_str}
82
- RESULT
127
+ command, spawn_opts = resolve_invocation(argv, working_dir_path, container_id)
128
+ result = capture(command, spawn_opts, max_time)
129
+ return "Error: Command execution timed out after #{max_time} seconds." if result == :timed_out
130
+
131
+ stdout_str, stderr_str, status = result
132
+ format_result(status, stdout_str, stderr_str)
133
+ end
134
+ private_class_method :execute
135
+
136
+ # Formats the captured command output into the standard result string.
137
+ #
138
+ # @param status [Process::Status] The exit status of the command.
139
+ # @param stdout_str [String] The captured standard output.
140
+ # @param stderr_str [String] The captured standard error.
141
+ # @return [String] Formatted exit status, STDOUT, and STDERR.
142
+ def self.format_result(status, stdout_str, stderr_str)
143
+ <<~RESULT
144
+ Exit Status: #{status.exitstatus}
145
+ STDOUT:
146
+ #{stdout_str}
147
+ STDERR:
148
+ #{stderr_str}
149
+ RESULT
150
+ end
151
+ private_class_method :format_result
152
+
153
+ # Builds the command array and spawn options for either container or host
154
+ # execution. Both run in their own process group (`pgroup: true`) so the
155
+ # watchdog can kill the whole group on timeout.
156
+ #
157
+ # @param argv [Array<String>] The tokenized command and arguments.
158
+ # @param working_dir_path [Pathname] The host directory for host execution.
159
+ # @param container_id [String, nil] The Docker container ID for isolated execution.
160
+ # @return [Array(Array<String>, Hash)] The full command array and spawn options.
161
+ def self.resolve_invocation(argv, working_dir_path, container_id)
162
+ return [['docker', 'exec', '-w', '/sandbox', container_id, *argv], { pgroup: true }] if container_id
163
+
164
+ [argv, { chdir: working_dir_path.to_s, pgroup: true }]
165
+ end
166
+ private_class_method :resolve_invocation
167
+
168
+ # Spawns the command, draining STDOUT/STDERR on separate threads so a chatty
169
+ # or hung child never deadlocks the reader, and enforces the deadline with a
170
+ # watchdog that kills the process group when the command overruns.
171
+ #
172
+ # @param command [Array<String>] The full command array (no shell).
173
+ # @param spawn_opts [Hash] Options passed to the spawner (includes `pgroup`).
174
+ # @param max_time [Integer] Maximum execution time in seconds.
175
+ # @return [Array(String, String, Process::Status), Symbol] STDOUT, STDERR, and
176
+ # status on completion, or `:timed_out` when the deadline is exceeded.
177
+ def self.capture(command, spawn_opts, max_time)
178
+ Open3.popen3(*command, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
179
+ stdin.close
180
+ readers = [Thread.new { stdout.read }, Thread.new { stderr.read }]
181
+ completed = wait_thr.join(max_time)
182
+ terminate_process_group(wait_thr) unless completed
183
+ stdout_str, stderr_str = readers.map(&:value)
184
+ completed ? [stdout_str, stderr_str, wait_thr.value] : :timed_out
83
185
  end
84
- rescue Timeout::Error
85
- "Error: Command execution timed out after #{max_time} seconds."
86
186
  end
187
+ private_class_method :capture
188
+
189
+ # Terminates the command's entire process group: SIGTERM first, then SIGKILL
190
+ # after a short grace period if it has not exited. Signalling the negated
191
+ # process group id reaches the command and any children it forked.
192
+ #
193
+ # @param wait_thr [Process::Waiter] The wait thread for the spawned process group leader.
194
+ # @return [void]
195
+ def self.terminate_process_group(wait_thr)
196
+ pgid = wait_thr.pid
197
+ signal_group('TERM', pgid)
198
+ signal_group('KILL', pgid) unless wait_thr.join(TERM_GRACE_PERIOD)
199
+ end
200
+ private_class_method :terminate_process_group
201
+
202
+ # Sends a signal to a whole process group, ignoring an already-exited group.
203
+ #
204
+ # @param signal [String] The signal name (e.g. "TERM", "KILL").
205
+ # @param pgid [Integer] The process group id (leader pid) to signal.
206
+ # @return [void]
207
+ def self.signal_group(signal, pgid)
208
+ Process.kill(signal, -pgid)
209
+ rescue Errno::ESRCH
210
+ nil
211
+ end
212
+ private_class_method :signal_group
213
+
214
+ # Emits a single warning that the command will run un-isolated on the host,
215
+ # honoring the test-suite stderr suppression convention.
216
+ #
217
+ # @return [void]
218
+ def self.warn_unisolated_host_execution
219
+ return if SkillBench::ErrorLogger.skip_stderr_output?
220
+
221
+ warn(HOST_EXECUTION_WARNING)
222
+ end
223
+ private_class_method :warn_unisolated_host_execution
87
224
  end
88
225
  end
89
226
  end
data/lib/skill_bench/trend_tracker/persistence.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'json'
4
4
  require 'pathname'
5
+ require 'fileutils'
5
6
 
6
7
  module SkillBench
7
8
  class TrendTracker
@@ -27,23 +28,24 @@ module SkillBench
27
28
  []
28
29
  end
29
30
 
30
- # Writes history to file with atomic operation and backup.
31
- # Returns a result hash so callers do not need to rescue SystemCallError.
31
+ # Writes history to file atomically, snapshotting the previous good
32
+ # version into the backup first.
33
+ #
34
+ # The existing history file (if any) is copied to +#{history_file}.bak+
35
+ # before the new content is written, so the backup always holds the
36
+ # previous good version rather than a duplicate of the current file. The
37
+ # new content is serialized once and written via a temp-file + rename so
38
+ # the main file is never left partially written. Returns a result hash so
39
+ # callers do not need to rescue SystemCallError.
32
40
  #
33
41
  # @param history [Array<Hash>] History entries to write
34
42
  # @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
35
43
  def write(history)
36
- json = JSON.pretty_generate(history)
44
+ backup_previous_version
37
45
  temp_file = "#{history_file}.tmp"
38
- File.write(temp_file, json)
46
+ File.write(temp_file, JSON.pretty_generate(history))
39
47
  File.rename(temp_file, history_file)
40
48
 
41
- begin
42
- File.write("#{history_file}.bak", json)
43
- rescue SystemCallError => e
44
- warn "Backup write failed for #{history_file}: #{e.message}"
45
- end
46
-
47
49
  { success: true }
48
50
  rescue SystemCallError => e
49
51
  { success: false, error: { message: e.message } }
@@ -53,6 +55,21 @@ module SkillBench
53
55
 
54
56
  attr_reader :history_file
55
57
 
58
+ # Copies the current history file to the backup path so the backup keeps
59
+ # the previous good version. No-op on the first run when no history file
60
+ # exists yet. A failed copy is non-fatal: it warns and lets the main
61
+ # write proceed.
62
+ #
63
+ # @return [void]
64
+ def backup_previous_version
65
+ source = history_file
66
+ return unless File.exist?(source)
67
+
68
+ FileUtils.cp(source, "#{source}.bak")
69
+ rescue SystemCallError => e
70
+ warn "Backup copy failed for #{source}: #{e.message}"
71
+ end
72
+
56
73
  # Reads backup file if it exists
57
74
  #
58
75
  # @return [Array<Hash>, nil] Backup data or nil if unavailable
data/lib/skill_bench/trend_tracker.rb CHANGED
@@ -17,9 +17,9 @@ module SkillBench
17
17
  # Records an evaluation result.
18
18
  #
19
19
  # @param result [Hash] The evaluation result from EvaluationRunner.
20
+ # @param history [Array<Hash>] Pre-loaded history to append to; defaults to a fresh load.
20
21
  # @return [Hash] Service response.
21
- def record(result)
22
- history = @persistence.load
22
+ def record(result, history = @persistence.load)
23
23
  history << extract_entry(result)
24
24
  write_result = @persistence.write(history)
25
25
 
@@ -41,11 +41,11 @@ module SkillBench
41
41
  # Computes the trend of the given result against the most recent matching history entry.
42
42
  #
43
43
  # @param result [Hash] The current evaluation result.
44
+ # @param history [Array<Hash>] Pre-loaded history to compare against; defaults to a fresh load.
44
45
  # @return [Hash, nil] Trend data or nil if no matching history exists.
45
- def trend_for(result)
46
- entries = @persistence.load
46
+ def trend_for(result, history = @persistence.load)
47
47
  current = extract_entry(result)
48
- TrendCalculator.compute_trend(entries, current)
48
+ TrendCalculator.compute_trend(history, current)
49
49
  end
50
50
 
51
51
  private
data/lib/skill_bench/version.rb CHANGED
@@ -2,5 +2,5 @@
2
2
 
3
3
  module SkillBench
4
4
  # The current gem version.
5
- VERSION = '1.0.1'
5
+ VERSION = '1.2.0'
6
6
  end
data/lib/skill_bench.rb CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  # Core modules
10
10
  require_relative 'skill_bench/version'
11
+ require_relative 'skill_bench/constants'
11
12
  require_relative 'skill_bench/dimension'
12
13
  require_relative 'skill_bench/criteria'
13
14
  require_relative 'skill_bench/delta_report'
@@ -72,6 +73,8 @@ require_relative 'skill_bench/commands/eval_new'
72
73
 
73
74
  # Services
74
75
  require_relative 'skill_bench/services/runner_service'
76
+ require_relative 'skill_bench/services/batch_runner_service'
77
+ require_relative 'skill_bench/services/summary_formatter'
75
78
  require_relative 'skill_bench/services/template_registry'
76
79
 
77
80
  # Tools
@@ -87,9 +90,6 @@ require_relative 'skill_bench/trend_tracker'
87
90
  require_relative 'skill_bench/trend_tracker/persistence'
88
91
  require_relative 'skill_bench/trend_tracker/trend_calculator'
89
92
 
90
- # Rails integrations
91
- require_relative 'skill_bench/rails/skill_templates'
92
-
93
93
  # Migration utilities
94
94
  require_relative 'skill_bench/migration/provider_migrator'
95
95
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-skill-bench
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ismael Marin
@@ -9,48 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
- - !ruby/object:Gem::Dependency
13
- name: activesupport
14
- requirement: !ruby/object:Gem::Requirement
15
- requirements:
16
- - - ">="
17
- - !ruby/object:Gem::Version
18
- version: '6.0'
19
- type: :runtime
20
- prerelease: false
21
- version_requirements: !ruby/object:Gem::Requirement
22
- requirements:
23
- - - ">="
24
- - !ruby/object:Gem::Version
25
- version: '6.0'
26
12
  - !ruby/object:Gem::Dependency
27
13
  name: cgi
28
14
  requirement: !ruby/object:Gem::Requirement
29
15
  requirements:
30
16
  - - "~>"
31
17
  - !ruby/object:Gem::Version
32
- version: 0.5.1
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - "~>"
38
- - !ruby/object:Gem::Version
39
- version: 0.5.1
40
- - !ruby/object:Gem::Dependency
41
- name: dotenv
42
- requirement: !ruby/object:Gem::Requirement
43
- requirements:
44
- - - "~>"
45
- - !ruby/object:Gem::Version
46
- version: 3.2.0
18
+ version: 0.5.2
47
19
  type: :runtime
48
20
  prerelease: false
49
21
  version_requirements: !ruby/object:Gem::Requirement
50
22
  requirements:
51
23
  - - "~>"
52
24
  - !ruby/object:Gem::Version
53
- version: 3.2.0
25
+ version: 0.5.2
54
26
  - !ruby/object:Gem::Dependency
55
27
  name: faraday
56
28
  requirement: !ruby/object:Gem::Requirement
@@ -71,28 +43,28 @@ dependencies:
71
43
  requirements:
72
44
  - - "~>"
73
45
  - !ruby/object:Gem::Version
74
- version: '2.19'
46
+ version: '2.20'
75
47
  type: :runtime
76
48
  prerelease: false
77
49
  version_requirements: !ruby/object:Gem::Requirement
78
50
  requirements:
79
51
  - - "~>"
80
52
  - !ruby/object:Gem::Version
81
- version: '2.19'
53
+ version: '2.20'
82
54
  - !ruby/object:Gem::Dependency
83
55
  name: parallel
84
56
  requirement: !ruby/object:Gem::Requirement
85
57
  requirements:
86
58
  - - "~>"
87
59
  - !ruby/object:Gem::Version
88
- version: '1.26'
60
+ version: 2.0.0
89
61
  type: :runtime
90
62
  prerelease: false
91
63
  version_requirements: !ruby/object:Gem::Requirement
92
64
  requirements:
93
65
  - - "~>"
94
66
  - !ruby/object:Gem::Version
95
- version: '1.26'
67
+ version: 2.0.0
96
68
  description: |
97
69
  ruby-skill-bench orchestrates evaluation runs of AI coding agents
98
70
  inside isolated git sandboxes, then scores the results using deterministic
@@ -119,6 +91,7 @@ files:
119
91
  - lib/skill_bench/agent/runner.rb
120
92
  - lib/skill_bench/agent/summary.rb
121
93
  - lib/skill_bench/cli.rb
94
+ - lib/skill_bench/cli/batch_result_printer.rb
122
95
  - lib/skill_bench/cli/compare_command.rb
123
96
  - lib/skill_bench/cli/eval/eval_command_registry.rb
124
97
  - lib/skill_bench/cli/eval/eval_commands.rb
@@ -129,9 +102,11 @@ files:
129
102
  - lib/skill_bench/cli/result_printer.rb
130
103
  - lib/skill_bench/cli/run_command.rb
131
104
  - lib/skill_bench/cli/skill_command.rb
105
+ - lib/skill_bench/cli/validate_command.rb
132
106
  - lib/skill_bench/client.rb
133
107
  - lib/skill_bench/clients/all.rb
134
108
  - lib/skill_bench/clients/base_client.rb
109
+ - lib/skill_bench/clients/base_url_validator.rb
135
110
  - lib/skill_bench/clients/provider_config.rb
136
111
  - lib/skill_bench/clients/provider_registry.rb
137
112
  - lib/skill_bench/clients/provider_schemas.rb
@@ -140,6 +115,7 @@ files:
140
115
  - lib/skill_bench/clients/providers/deepseek.rb
141
116
  - lib/skill_bench/clients/providers/gemini.rb
142
117
  - lib/skill_bench/clients/providers/groq.rb
118
+ - lib/skill_bench/clients/providers/mistral.rb
143
119
  - lib/skill_bench/clients/providers/mock.rb
144
120
  - lib/skill_bench/clients/providers/null_client.rb
145
121
  - lib/skill_bench/clients/providers/ollama.rb
@@ -147,6 +123,7 @@ files:
147
123
  - lib/skill_bench/clients/providers/opencode.rb
148
124
  - lib/skill_bench/clients/providers/openrouter.rb
149
125
  - lib/skill_bench/clients/request_builder.rb
126
+ - lib/skill_bench/clients/response_builder.rb
150
127
  - lib/skill_bench/clients/response_error_handler.rb
151
128
  - lib/skill_bench/clients/response_parser.rb
152
129
  - lib/skill_bench/clients/retry_handler.rb
@@ -162,6 +139,7 @@ files:
162
139
  - lib/skill_bench/config/facade_writers.rb
163
140
  - lib/skill_bench/config/json_loader.rb
164
141
  - lib/skill_bench/config/store.rb
142
+ - lib/skill_bench/constants.rb
165
143
  - lib/skill_bench/criteria.rb
166
144
  - lib/skill_bench/delta_report.rb
167
145
  - lib/skill_bench/dimension.rb
@@ -196,16 +174,19 @@ files:
196
174
  - lib/skill_bench/registry/pack_resolver.rb
197
175
  - lib/skill_bench/runner.rb
198
176
  - lib/skill_bench/services/agent_spawner_service.rb
177
+ - lib/skill_bench/services/batch_runner_service.rb
199
178
  - lib/skill_bench/services/compare_option_parser.rb
200
179
  - lib/skill_bench/services/comparison_reporter.rb
201
180
  - lib/skill_bench/services/comparison_runner.rb
202
181
  - lib/skill_bench/services/context_loader_service.rb
182
+ - lib/skill_bench/services/cost_calculator.rb
203
183
  - lib/skill_bench/services/delta_table_formatter.rb
204
184
  - lib/skill_bench/services/error_response_builder.rb
205
185
  - lib/skill_bench/services/eval_resolver.rb
206
186
  - lib/skill_bench/services/exit_code_calculator.rb
207
187
  - lib/skill_bench/services/feedback_generator.rb
208
188
  - lib/skill_bench/services/formatting_helpers.rb
189
+ - lib/skill_bench/services/html_formatter.rb
209
190
  - lib/skill_bench/services/iteration_formatter.rb
210
191
  - lib/skill_bench/services/json_formatter.rb
211
192
  - lib/skill_bench/services/judge_params_builder.rb
@@ -217,11 +198,13 @@ files:
217
198
  - lib/skill_bench/services/output_persistence_service.rb
218
199
  - lib/skill_bench/services/prompt_builder_service.rb
219
200
  - lib/skill_bench/services/provider_resolver.rb
201
+ - lib/skill_bench/services/response_cache.rb
220
202
  - lib/skill_bench/services/result_printer_service.rb
221
203
  - lib/skill_bench/services/runner_service.rb
222
204
  - lib/skill_bench/services/skill_resolver.rb
223
205
  - lib/skill_bench/services/skill_resolver_service.rb
224
206
  - lib/skill_bench/services/source_path_resolver_service.rb
207
+ - lib/skill_bench/services/summary_formatter.rb
225
208
  - lib/skill_bench/services/template_registry.rb
226
209
  - lib/skill_bench/services/template_registry/category_data.rb
227
210
  - lib/skill_bench/services/trend_recorder_service.rb
@@ -262,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
245
  - !ruby/object:Gem::Version
263
246
  version: '0'
264
247
  requirements: []
265
- rubygems_version: 4.0.12
248
+ rubygems_version: 3.6.9
266
249
  specification_version: 4
267
250
  summary: The evaluation engine for AI Agent Skills benchmarking.
268
251
  test_files: []