ruby-skill-bench 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +166 -35
- data/docs/architecture.md +3 -1
- data/docs/first-eval-guide.md +7 -7
- data/docs/testing-guide.md +1 -1
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
- data/lib/skill_bench/agent/react_agent/step.rb +7 -1
- data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
- data/lib/skill_bench/cli/help_printer.rb +10 -2
- data/lib/skill_bench/cli/init_command.rb +2 -1
- data/lib/skill_bench/cli/result_printer.rb +1 -1
- data/lib/skill_bench/cli/run_command.rb +47 -9
- data/lib/skill_bench/cli/validate_command.rb +242 -0
- data/lib/skill_bench/cli.rb +3 -0
- data/lib/skill_bench/client.rb +43 -1
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +12 -1
- data/lib/skill_bench/clients/base_url_validator.rb +105 -0
- data/lib/skill_bench/clients/provider_config.rb +34 -1
- data/lib/skill_bench/clients/provider_schemas.rb +4 -0
- data/lib/skill_bench/clients/providers/mistral.rb +47 -0
- data/lib/skill_bench/commands/init.rb +5 -0
- data/lib/skill_bench/commands/skill_new.rb +3 -1
- data/lib/skill_bench/config/applier.rb +2 -0
- data/lib/skill_bench/config/defaults.rb +2 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/facade_writers.rb +17 -0
- data/lib/skill_bench/config/json_loader.rb +1 -1
- data/lib/skill_bench/config/store.rb +29 -0
- data/lib/skill_bench/config.rb +18 -0
- data/lib/skill_bench/evaluation/runner.rb +20 -3
- data/lib/skill_bench/execution/context_hydrator.rb +52 -11
- data/lib/skill_bench/execution/sandbox.rb +58 -11
- data/lib/skill_bench/judge/judge.rb +4 -0
- data/lib/skill_bench/judge/prompt.rb +42 -6
- data/lib/skill_bench/models/config.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +60 -1
- data/lib/skill_bench/package_verifier.rb +1 -1
- data/lib/skill_bench/rails/skill_templates.rb +19 -5
- data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
- data/lib/skill_bench/services/batch_runner_service.rb +111 -0
- data/lib/skill_bench/services/compare_option_parser.rb +1 -0
- data/lib/skill_bench/services/cost_calculator.rb +91 -0
- data/lib/skill_bench/services/html_formatter.rb +289 -0
- data/lib/skill_bench/services/json_formatter.rb +19 -1
- data/lib/skill_bench/services/junit_formatter.rb +74 -24
- data/lib/skill_bench/services/provider_resolver.rb +5 -2
- data/lib/skill_bench/services/response_cache.rb +130 -0
- data/lib/skill_bench/services/runner_service.rb +88 -4
- data/lib/skill_bench/services/summary_formatter.rb +90 -0
- data/lib/skill_bench/services/template_registry.rb +43 -9
- data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
- data/lib/skill_bench/tools/registry.rb +29 -3
- data/lib/skill_bench/tools/run_command.rb +171 -19
- data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
- data/lib/skill_bench/trend_tracker.rb +5 -5
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +2 -3
- metadata +17 -36
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
4
|
require 'pathname'
|
|
5
|
+
require 'fileutils'
|
|
5
6
|
|
|
6
7
|
module SkillBench
|
|
7
8
|
class TrendTracker
|
|
@@ -27,23 +28,24 @@ module SkillBench
|
|
|
27
28
|
[]
|
|
28
29
|
end
|
|
29
30
|
|
|
30
|
-
# Writes history to file
|
|
31
|
-
#
|
|
31
|
+
# Writes history to file atomically, snapshotting the previous good
|
|
32
|
+
# version into the backup first.
|
|
33
|
+
#
|
|
34
|
+
# The existing history file (if any) is copied to +#{history_file}.bak+
|
|
35
|
+
# before the new content is written, so the backup always holds the
|
|
36
|
+
# previous good version rather than a duplicate of the current file. The
|
|
37
|
+
# new content is serialized once and written via a temp-file + rename so
|
|
38
|
+
# the main file is never left partially written. Returns a result hash so
|
|
39
|
+
# callers do not need to rescue SystemCallError.
|
|
32
40
|
#
|
|
33
41
|
# @param history [Array<Hash>] History entries to write
|
|
34
42
|
# @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
|
|
35
43
|
def write(history)
|
|
36
|
-
|
|
44
|
+
backup_previous_version
|
|
37
45
|
temp_file = "#{history_file}.tmp"
|
|
38
|
-
File.write(temp_file, json)
|
|
46
|
+
File.write(temp_file, JSON.pretty_generate(history))
|
|
39
47
|
File.rename(temp_file, history_file)
|
|
40
48
|
|
|
41
|
-
begin
|
|
42
|
-
File.write("#{history_file}.bak", json)
|
|
43
|
-
rescue SystemCallError => e
|
|
44
|
-
warn "Backup write failed for #{history_file}: #{e.message}"
|
|
45
|
-
end
|
|
46
|
-
|
|
47
49
|
{ success: true }
|
|
48
50
|
rescue SystemCallError => e
|
|
49
51
|
{ success: false, error: { message: e.message } }
|
|
@@ -53,6 +55,21 @@ module SkillBench
|
|
|
53
55
|
|
|
54
56
|
attr_reader :history_file
|
|
55
57
|
|
|
58
|
+
# Copies the current history file to the backup path so the backup keeps
|
|
59
|
+
# the previous good version. No-op on the first run when no history file
|
|
60
|
+
# exists yet. A failed copy is non-fatal: it warns and lets the main
|
|
61
|
+
# write proceed.
|
|
62
|
+
#
|
|
63
|
+
# @return [void]
|
|
64
|
+
def backup_previous_version
|
|
65
|
+
source = history_file
|
|
66
|
+
return unless File.exist?(source)
|
|
67
|
+
|
|
68
|
+
FileUtils.cp(source, "#{source}.bak")
|
|
69
|
+
rescue SystemCallError => e
|
|
70
|
+
warn "Backup copy failed for #{source}: #{e.message}"
|
|
71
|
+
end
|
|
72
|
+
|
|
56
73
|
# Reads backup file if it exists
|
|
57
74
|
#
|
|
58
75
|
# @return [Array<Hash>, nil] Backup data or nil if unavailable
|
|
@@ -17,9 +17,9 @@ module SkillBench
|
|
|
17
17
|
# Records an evaluation result.
|
|
18
18
|
#
|
|
19
19
|
# @param result [Hash] The evaluation result from EvaluationRunner.
|
|
20
|
+
# @param history [Array<Hash>] Pre-loaded history to append to; defaults to a fresh load.
|
|
20
21
|
# @return [Hash] Service response.
|
|
21
|
-
def record(result)
|
|
22
|
-
history = @persistence.load
|
|
22
|
+
def record(result, history = @persistence.load)
|
|
23
23
|
history << extract_entry(result)
|
|
24
24
|
write_result = @persistence.write(history)
|
|
25
25
|
|
|
@@ -41,11 +41,11 @@ module SkillBench
|
|
|
41
41
|
# Computes the trend of the given result against the most recent matching history entry.
|
|
42
42
|
#
|
|
43
43
|
# @param result [Hash] The current evaluation result.
|
|
44
|
+
# @param history [Array<Hash>] Pre-loaded history to compare against; defaults to a fresh load.
|
|
44
45
|
# @return [Hash, nil] Trend data or nil if no matching history exists.
|
|
45
|
-
def trend_for(result)
|
|
46
|
-
entries = @persistence.load
|
|
46
|
+
def trend_for(result, history = @persistence.load)
|
|
47
47
|
current = extract_entry(result)
|
|
48
|
-
TrendCalculator.compute_trend(entries, current)
|
|
48
|
+
TrendCalculator.compute_trend(history, current)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
private
|
data/lib/skill_bench/version.rb
CHANGED
data/lib/skill_bench.rb
CHANGED
|
@@ -73,6 +73,8 @@ require_relative 'skill_bench/commands/eval_new'
|
|
|
73
73
|
|
|
74
74
|
# Services
|
|
75
75
|
require_relative 'skill_bench/services/runner_service'
|
|
76
|
+
require_relative 'skill_bench/services/batch_runner_service'
|
|
77
|
+
require_relative 'skill_bench/services/summary_formatter'
|
|
76
78
|
require_relative 'skill_bench/services/template_registry'
|
|
77
79
|
|
|
78
80
|
# Tools
|
|
@@ -88,9 +90,6 @@ require_relative 'skill_bench/trend_tracker'
|
|
|
88
90
|
require_relative 'skill_bench/trend_tracker/persistence'
|
|
89
91
|
require_relative 'skill_bench/trend_tracker/trend_calculator'
|
|
90
92
|
|
|
91
|
-
# Rails integrations
|
|
92
|
-
require_relative 'skill_bench/rails/skill_templates'
|
|
93
|
-
|
|
94
93
|
# Migration utilities
|
|
95
94
|
require_relative 'skill_bench/migration/provider_migrator'
|
|
96
95
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-skill-bench
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.0
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ismael Marin
|
|
@@ -9,48 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
-
- !ruby/object:Gem::Dependency
|
|
13
|
-
name: activesupport
|
|
14
|
-
requirement: !ruby/object:Gem::Requirement
|
|
15
|
-
requirements:
|
|
16
|
-
- - ">="
|
|
17
|
-
- !ruby/object:Gem::Version
|
|
18
|
-
version: '6.0'
|
|
19
|
-
type: :runtime
|
|
20
|
-
prerelease: false
|
|
21
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
-
requirements:
|
|
23
|
-
- - ">="
|
|
24
|
-
- !ruby/object:Gem::Version
|
|
25
|
-
version: '6.0'
|
|
26
12
|
- !ruby/object:Gem::Dependency
|
|
27
13
|
name: cgi
|
|
28
14
|
requirement: !ruby/object:Gem::Requirement
|
|
29
15
|
requirements:
|
|
30
16
|
- - "~>"
|
|
31
17
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 0.5.1
|
|
33
|
-
type: :runtime
|
|
34
|
-
prerelease: false
|
|
35
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
-
requirements:
|
|
37
|
-
- - "~>"
|
|
38
|
-
- !ruby/object:Gem::Version
|
|
39
|
-
version: 0.5.1
|
|
40
|
-
- !ruby/object:Gem::Dependency
|
|
41
|
-
name: dotenv
|
|
42
|
-
requirement: !ruby/object:Gem::Requirement
|
|
43
|
-
requirements:
|
|
44
|
-
- - "~>"
|
|
45
|
-
- !ruby/object:Gem::Version
|
|
46
|
-
version: 3.2.0
|
|
18
|
+
version: 0.5.2
|
|
47
19
|
type: :runtime
|
|
48
20
|
prerelease: false
|
|
49
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
50
22
|
requirements:
|
|
51
23
|
- - "~>"
|
|
52
24
|
- !ruby/object:Gem::Version
|
|
53
|
-
version: 3.2.0
|
|
25
|
+
version: 0.5.2
|
|
54
26
|
- !ruby/object:Gem::Dependency
|
|
55
27
|
name: faraday
|
|
56
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -71,28 +43,28 @@ dependencies:
|
|
|
71
43
|
requirements:
|
|
72
44
|
- - "~>"
|
|
73
45
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: '2.
|
|
46
|
+
version: '2.20'
|
|
75
47
|
type: :runtime
|
|
76
48
|
prerelease: false
|
|
77
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
78
50
|
requirements:
|
|
79
51
|
- - "~>"
|
|
80
52
|
- !ruby/object:Gem::Version
|
|
81
|
-
version: '2.
|
|
53
|
+
version: '2.20'
|
|
82
54
|
- !ruby/object:Gem::Dependency
|
|
83
55
|
name: parallel
|
|
84
56
|
requirement: !ruby/object:Gem::Requirement
|
|
85
57
|
requirements:
|
|
86
58
|
- - "~>"
|
|
87
59
|
- !ruby/object:Gem::Version
|
|
88
|
-
version:
|
|
60
|
+
version: 2.0.0
|
|
89
61
|
type: :runtime
|
|
90
62
|
prerelease: false
|
|
91
63
|
version_requirements: !ruby/object:Gem::Requirement
|
|
92
64
|
requirements:
|
|
93
65
|
- - "~>"
|
|
94
66
|
- !ruby/object:Gem::Version
|
|
95
|
-
version:
|
|
67
|
+
version: 2.0.0
|
|
96
68
|
description: |
|
|
97
69
|
ruby-skill-bench orchestrates evaluation runs of AI coding agents
|
|
98
70
|
inside isolated git sandboxes, then scores the results using deterministic
|
|
@@ -119,6 +91,7 @@ files:
|
|
|
119
91
|
- lib/skill_bench/agent/runner.rb
|
|
120
92
|
- lib/skill_bench/agent/summary.rb
|
|
121
93
|
- lib/skill_bench/cli.rb
|
|
94
|
+
- lib/skill_bench/cli/batch_result_printer.rb
|
|
122
95
|
- lib/skill_bench/cli/compare_command.rb
|
|
123
96
|
- lib/skill_bench/cli/eval/eval_command_registry.rb
|
|
124
97
|
- lib/skill_bench/cli/eval/eval_commands.rb
|
|
@@ -129,9 +102,11 @@ files:
|
|
|
129
102
|
- lib/skill_bench/cli/result_printer.rb
|
|
130
103
|
- lib/skill_bench/cli/run_command.rb
|
|
131
104
|
- lib/skill_bench/cli/skill_command.rb
|
|
105
|
+
- lib/skill_bench/cli/validate_command.rb
|
|
132
106
|
- lib/skill_bench/client.rb
|
|
133
107
|
- lib/skill_bench/clients/all.rb
|
|
134
108
|
- lib/skill_bench/clients/base_client.rb
|
|
109
|
+
- lib/skill_bench/clients/base_url_validator.rb
|
|
135
110
|
- lib/skill_bench/clients/provider_config.rb
|
|
136
111
|
- lib/skill_bench/clients/provider_registry.rb
|
|
137
112
|
- lib/skill_bench/clients/provider_schemas.rb
|
|
@@ -140,6 +115,7 @@ files:
|
|
|
140
115
|
- lib/skill_bench/clients/providers/deepseek.rb
|
|
141
116
|
- lib/skill_bench/clients/providers/gemini.rb
|
|
142
117
|
- lib/skill_bench/clients/providers/groq.rb
|
|
118
|
+
- lib/skill_bench/clients/providers/mistral.rb
|
|
143
119
|
- lib/skill_bench/clients/providers/mock.rb
|
|
144
120
|
- lib/skill_bench/clients/providers/null_client.rb
|
|
145
121
|
- lib/skill_bench/clients/providers/ollama.rb
|
|
@@ -198,16 +174,19 @@ files:
|
|
|
198
174
|
- lib/skill_bench/registry/pack_resolver.rb
|
|
199
175
|
- lib/skill_bench/runner.rb
|
|
200
176
|
- lib/skill_bench/services/agent_spawner_service.rb
|
|
177
|
+
- lib/skill_bench/services/batch_runner_service.rb
|
|
201
178
|
- lib/skill_bench/services/compare_option_parser.rb
|
|
202
179
|
- lib/skill_bench/services/comparison_reporter.rb
|
|
203
180
|
- lib/skill_bench/services/comparison_runner.rb
|
|
204
181
|
- lib/skill_bench/services/context_loader_service.rb
|
|
182
|
+
- lib/skill_bench/services/cost_calculator.rb
|
|
205
183
|
- lib/skill_bench/services/delta_table_formatter.rb
|
|
206
184
|
- lib/skill_bench/services/error_response_builder.rb
|
|
207
185
|
- lib/skill_bench/services/eval_resolver.rb
|
|
208
186
|
- lib/skill_bench/services/exit_code_calculator.rb
|
|
209
187
|
- lib/skill_bench/services/feedback_generator.rb
|
|
210
188
|
- lib/skill_bench/services/formatting_helpers.rb
|
|
189
|
+
- lib/skill_bench/services/html_formatter.rb
|
|
211
190
|
- lib/skill_bench/services/iteration_formatter.rb
|
|
212
191
|
- lib/skill_bench/services/json_formatter.rb
|
|
213
192
|
- lib/skill_bench/services/judge_params_builder.rb
|
|
@@ -219,11 +198,13 @@ files:
|
|
|
219
198
|
- lib/skill_bench/services/output_persistence_service.rb
|
|
220
199
|
- lib/skill_bench/services/prompt_builder_service.rb
|
|
221
200
|
- lib/skill_bench/services/provider_resolver.rb
|
|
201
|
+
- lib/skill_bench/services/response_cache.rb
|
|
222
202
|
- lib/skill_bench/services/result_printer_service.rb
|
|
223
203
|
- lib/skill_bench/services/runner_service.rb
|
|
224
204
|
- lib/skill_bench/services/skill_resolver.rb
|
|
225
205
|
- lib/skill_bench/services/skill_resolver_service.rb
|
|
226
206
|
- lib/skill_bench/services/source_path_resolver_service.rb
|
|
207
|
+
- lib/skill_bench/services/summary_formatter.rb
|
|
227
208
|
- lib/skill_bench/services/template_registry.rb
|
|
228
209
|
- lib/skill_bench/services/template_registry/category_data.rb
|
|
229
210
|
- lib/skill_bench/services/trend_recorder_service.rb
|
|
@@ -264,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
264
245
|
- !ruby/object:Gem::Version
|
|
265
246
|
version: '0'
|
|
266
247
|
requirements: []
|
|
267
|
-
rubygems_version:
|
|
248
|
+
rubygems_version: 3.6.9
|
|
268
249
|
specification_version: 4
|
|
269
250
|
summary: The evaluation engine for AI Agent Skills benchmarking.
|
|
270
251
|
test_files: []
|