ruby-skill-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +166 -35
  3. data/docs/architecture.md +3 -1
  4. data/docs/first-eval-guide.md +7 -7
  5. data/docs/testing-guide.md +1 -1
  6. data/lib/skill_bench/agent/react_agent/loop_runner.rb +44 -9
  7. data/lib/skill_bench/agent/react_agent/step.rb +7 -1
  8. data/lib/skill_bench/cli/batch_result_printer.rb +45 -0
  9. data/lib/skill_bench/cli/eval/eval_options.rb +4 -0
  10. data/lib/skill_bench/cli/help_printer.rb +10 -2
  11. data/lib/skill_bench/cli/init_command.rb +2 -1
  12. data/lib/skill_bench/cli/result_printer.rb +1 -1
  13. data/lib/skill_bench/cli/run_command.rb +47 -9
  14. data/lib/skill_bench/cli/validate_command.rb +242 -0
  15. data/lib/skill_bench/cli.rb +3 -0
  16. data/lib/skill_bench/client.rb +43 -1
  17. data/lib/skill_bench/clients/all.rb +2 -0
  18. data/lib/skill_bench/clients/base_client.rb +12 -1
  19. data/lib/skill_bench/clients/base_url_validator.rb +105 -0
  20. data/lib/skill_bench/clients/provider_config.rb +34 -1
  21. data/lib/skill_bench/clients/provider_schemas.rb +4 -0
  22. data/lib/skill_bench/clients/providers/mistral.rb +47 -0
  23. data/lib/skill_bench/commands/init.rb +5 -0
  24. data/lib/skill_bench/commands/skill_new.rb +3 -1
  25. data/lib/skill_bench/config/applier.rb +2 -0
  26. data/lib/skill_bench/config/defaults.rb +2 -0
  27. data/lib/skill_bench/config/facade_readers.rb +7 -0
  28. data/lib/skill_bench/config/facade_writers.rb +17 -0
  29. data/lib/skill_bench/config/json_loader.rb +1 -1
  30. data/lib/skill_bench/config/store.rb +29 -0
  31. data/lib/skill_bench/config.rb +18 -0
  32. data/lib/skill_bench/evaluation/runner.rb +20 -3
  33. data/lib/skill_bench/execution/context_hydrator.rb +52 -11
  34. data/lib/skill_bench/execution/sandbox.rb +58 -11
  35. data/lib/skill_bench/judge/judge.rb +4 -0
  36. data/lib/skill_bench/judge/prompt.rb +42 -6
  37. data/lib/skill_bench/models/config.rb +32 -0
  38. data/lib/skill_bench/output_formatter.rb +60 -1
  39. data/lib/skill_bench/package_verifier.rb +1 -1
  40. data/lib/skill_bench/rails/skill_templates.rb +19 -5
  41. data/lib/skill_bench/services/agent_spawner_service.rb +7 -3
  42. data/lib/skill_bench/services/batch_runner_service.rb +111 -0
  43. data/lib/skill_bench/services/compare_option_parser.rb +1 -0
  44. data/lib/skill_bench/services/cost_calculator.rb +91 -0
  45. data/lib/skill_bench/services/html_formatter.rb +289 -0
  46. data/lib/skill_bench/services/json_formatter.rb +19 -1
  47. data/lib/skill_bench/services/junit_formatter.rb +74 -24
  48. data/lib/skill_bench/services/provider_resolver.rb +5 -2
  49. data/lib/skill_bench/services/response_cache.rb +130 -0
  50. data/lib/skill_bench/services/runner_service.rb +88 -4
  51. data/lib/skill_bench/services/summary_formatter.rb +90 -0
  52. data/lib/skill_bench/services/template_registry.rb +43 -9
  53. data/lib/skill_bench/services/trend_recorder_service.rb +29 -2
  54. data/lib/skill_bench/tools/registry.rb +29 -3
  55. data/lib/skill_bench/tools/run_command.rb +171 -19
  56. data/lib/skill_bench/trend_tracker/persistence.rb +27 -10
  57. data/lib/skill_bench/trend_tracker.rb +5 -5
  58. data/lib/skill_bench/version.rb +1 -1
  59. data/lib/skill_bench.rb +2 -3
  60. metadata +17 -36
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'json'
4
4
  require 'pathname'
5
+ require 'fileutils'
5
6
 
6
7
  module SkillBench
7
8
  class TrendTracker
@@ -27,23 +28,24 @@ module SkillBench
27
28
  []
28
29
  end
29
30
 
30
- # Writes history to file with atomic operation and backup.
31
- # Returns a result hash so callers do not need to rescue SystemCallError.
31
+ # Writes history to file atomically, snapshotting the previous good
32
+ # version into the backup first.
33
+ #
34
+ # The existing history file (if any) is copied to +#{history_file}.bak+
35
+ # before the new content is written, so the backup always holds the
36
+ # previous good version rather than a duplicate of the current file. The
37
+ # new content is serialized once and written via a temp-file + rename so
38
+ # the main file is never left partially written. Returns a result hash so
39
+ # callers do not need to rescue SystemCallError.
32
40
  #
33
41
  # @param history [Array<Hash>] History entries to write
34
42
  # @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
35
43
  def write(history)
36
- json = JSON.pretty_generate(history)
44
+ backup_previous_version
37
45
  temp_file = "#{history_file}.tmp"
38
- File.write(temp_file, json)
46
+ File.write(temp_file, JSON.pretty_generate(history))
39
47
  File.rename(temp_file, history_file)
40
48
 
41
- begin
42
- File.write("#{history_file}.bak", json)
43
- rescue SystemCallError => e
44
- warn "Backup write failed for #{history_file}: #{e.message}"
45
- end
46
-
47
49
  { success: true }
48
50
  rescue SystemCallError => e
49
51
  { success: false, error: { message: e.message } }
@@ -53,6 +55,21 @@ module SkillBench
53
55
 
54
56
  attr_reader :history_file
55
57
 
58
+ # Copies the current history file to the backup path so the backup keeps
59
+ # the previous good version. No-op on the first run when no history file
60
+ # exists yet. A failed copy is non-fatal: it warns and lets the main
61
+ # write proceed.
62
+ #
63
+ # @return [void]
64
+ def backup_previous_version
65
+ source = history_file
66
+ return unless File.exist?(source)
67
+
68
+ FileUtils.cp(source, "#{source}.bak")
69
+ rescue SystemCallError => e
70
+ warn "Backup copy failed for #{source}: #{e.message}"
71
+ end
72
+
56
73
  # Reads backup file if it exists
57
74
  #
58
75
  # @return [Array<Hash>, nil] Backup data or nil if unavailable
@@ -17,9 +17,9 @@ module SkillBench
17
17
  # Records an evaluation result.
18
18
  #
19
19
  # @param result [Hash] The evaluation result from EvaluationRunner.
20
+ # @param history [Array<Hash>] Pre-loaded history to append to; defaults to a fresh load.
20
21
  # @return [Hash] Service response.
21
- def record(result)
22
- history = @persistence.load
22
+ def record(result, history = @persistence.load)
23
23
  history << extract_entry(result)
24
24
  write_result = @persistence.write(history)
25
25
 
@@ -41,11 +41,11 @@ module SkillBench
41
41
  # Computes the trend of the given result against the most recent matching history entry.
42
42
  #
43
43
  # @param result [Hash] The current evaluation result.
44
+ # @param history [Array<Hash>] Pre-loaded history to compare against; defaults to a fresh load.
44
45
  # @return [Hash, nil] Trend data or nil if no matching history exists.
45
- def trend_for(result)
46
- entries = @persistence.load
46
+ def trend_for(result, history = @persistence.load)
47
47
  current = extract_entry(result)
48
- TrendCalculator.compute_trend(entries, current)
48
+ TrendCalculator.compute_trend(history, current)
49
49
  end
50
50
 
51
51
  private
@@ -2,5 +2,5 @@
2
2
 
3
3
  module SkillBench
4
4
  # The current gem version.
5
- VERSION = '1.1.0'
5
+ VERSION = '1.2.0'
6
6
  end
data/lib/skill_bench.rb CHANGED
@@ -73,6 +73,8 @@ require_relative 'skill_bench/commands/eval_new'
73
73
 
74
74
  # Services
75
75
  require_relative 'skill_bench/services/runner_service'
76
+ require_relative 'skill_bench/services/batch_runner_service'
77
+ require_relative 'skill_bench/services/summary_formatter'
76
78
  require_relative 'skill_bench/services/template_registry'
77
79
 
78
80
  # Tools
@@ -88,9 +90,6 @@ require_relative 'skill_bench/trend_tracker'
88
90
  require_relative 'skill_bench/trend_tracker/persistence'
89
91
  require_relative 'skill_bench/trend_tracker/trend_calculator'
90
92
 
91
- # Rails integrations
92
- require_relative 'skill_bench/rails/skill_templates'
93
-
94
93
  # Migration utilities
95
94
  require_relative 'skill_bench/migration/provider_migrator'
96
95
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-skill-bench
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ismael Marin
@@ -9,48 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
- - !ruby/object:Gem::Dependency
13
- name: activesupport
14
- requirement: !ruby/object:Gem::Requirement
15
- requirements:
16
- - - ">="
17
- - !ruby/object:Gem::Version
18
- version: '6.0'
19
- type: :runtime
20
- prerelease: false
21
- version_requirements: !ruby/object:Gem::Requirement
22
- requirements:
23
- - - ">="
24
- - !ruby/object:Gem::Version
25
- version: '6.0'
26
12
  - !ruby/object:Gem::Dependency
27
13
  name: cgi
28
14
  requirement: !ruby/object:Gem::Requirement
29
15
  requirements:
30
16
  - - "~>"
31
17
  - !ruby/object:Gem::Version
32
- version: 0.5.1
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - "~>"
38
- - !ruby/object:Gem::Version
39
- version: 0.5.1
40
- - !ruby/object:Gem::Dependency
41
- name: dotenv
42
- requirement: !ruby/object:Gem::Requirement
43
- requirements:
44
- - - "~>"
45
- - !ruby/object:Gem::Version
46
- version: 3.2.0
18
+ version: 0.5.2
47
19
  type: :runtime
48
20
  prerelease: false
49
21
  version_requirements: !ruby/object:Gem::Requirement
50
22
  requirements:
51
23
  - - "~>"
52
24
  - !ruby/object:Gem::Version
53
- version: 3.2.0
25
+ version: 0.5.2
54
26
  - !ruby/object:Gem::Dependency
55
27
  name: faraday
56
28
  requirement: !ruby/object:Gem::Requirement
@@ -71,28 +43,28 @@ dependencies:
71
43
  requirements:
72
44
  - - "~>"
73
45
  - !ruby/object:Gem::Version
74
- version: '2.19'
46
+ version: '2.20'
75
47
  type: :runtime
76
48
  prerelease: false
77
49
  version_requirements: !ruby/object:Gem::Requirement
78
50
  requirements:
79
51
  - - "~>"
80
52
  - !ruby/object:Gem::Version
81
- version: '2.19'
53
+ version: '2.20'
82
54
  - !ruby/object:Gem::Dependency
83
55
  name: parallel
84
56
  requirement: !ruby/object:Gem::Requirement
85
57
  requirements:
86
58
  - - "~>"
87
59
  - !ruby/object:Gem::Version
88
- version: '1.26'
60
+ version: 2.0.0
89
61
  type: :runtime
90
62
  prerelease: false
91
63
  version_requirements: !ruby/object:Gem::Requirement
92
64
  requirements:
93
65
  - - "~>"
94
66
  - !ruby/object:Gem::Version
95
- version: '1.26'
67
+ version: 2.0.0
96
68
  description: |
97
69
  ruby-skill-bench orchestrates evaluation runs of AI coding agents
98
70
  inside isolated git sandboxes, then scores the results using deterministic
@@ -119,6 +91,7 @@ files:
119
91
  - lib/skill_bench/agent/runner.rb
120
92
  - lib/skill_bench/agent/summary.rb
121
93
  - lib/skill_bench/cli.rb
94
+ - lib/skill_bench/cli/batch_result_printer.rb
122
95
  - lib/skill_bench/cli/compare_command.rb
123
96
  - lib/skill_bench/cli/eval/eval_command_registry.rb
124
97
  - lib/skill_bench/cli/eval/eval_commands.rb
@@ -129,9 +102,11 @@ files:
129
102
  - lib/skill_bench/cli/result_printer.rb
130
103
  - lib/skill_bench/cli/run_command.rb
131
104
  - lib/skill_bench/cli/skill_command.rb
105
+ - lib/skill_bench/cli/validate_command.rb
132
106
  - lib/skill_bench/client.rb
133
107
  - lib/skill_bench/clients/all.rb
134
108
  - lib/skill_bench/clients/base_client.rb
109
+ - lib/skill_bench/clients/base_url_validator.rb
135
110
  - lib/skill_bench/clients/provider_config.rb
136
111
  - lib/skill_bench/clients/provider_registry.rb
137
112
  - lib/skill_bench/clients/provider_schemas.rb
@@ -140,6 +115,7 @@ files:
140
115
  - lib/skill_bench/clients/providers/deepseek.rb
141
116
  - lib/skill_bench/clients/providers/gemini.rb
142
117
  - lib/skill_bench/clients/providers/groq.rb
118
+ - lib/skill_bench/clients/providers/mistral.rb
143
119
  - lib/skill_bench/clients/providers/mock.rb
144
120
  - lib/skill_bench/clients/providers/null_client.rb
145
121
  - lib/skill_bench/clients/providers/ollama.rb
@@ -198,16 +174,19 @@ files:
198
174
  - lib/skill_bench/registry/pack_resolver.rb
199
175
  - lib/skill_bench/runner.rb
200
176
  - lib/skill_bench/services/agent_spawner_service.rb
177
+ - lib/skill_bench/services/batch_runner_service.rb
201
178
  - lib/skill_bench/services/compare_option_parser.rb
202
179
  - lib/skill_bench/services/comparison_reporter.rb
203
180
  - lib/skill_bench/services/comparison_runner.rb
204
181
  - lib/skill_bench/services/context_loader_service.rb
182
+ - lib/skill_bench/services/cost_calculator.rb
205
183
  - lib/skill_bench/services/delta_table_formatter.rb
206
184
  - lib/skill_bench/services/error_response_builder.rb
207
185
  - lib/skill_bench/services/eval_resolver.rb
208
186
  - lib/skill_bench/services/exit_code_calculator.rb
209
187
  - lib/skill_bench/services/feedback_generator.rb
210
188
  - lib/skill_bench/services/formatting_helpers.rb
189
+ - lib/skill_bench/services/html_formatter.rb
211
190
  - lib/skill_bench/services/iteration_formatter.rb
212
191
  - lib/skill_bench/services/json_formatter.rb
213
192
  - lib/skill_bench/services/judge_params_builder.rb
@@ -219,11 +198,13 @@ files:
219
198
  - lib/skill_bench/services/output_persistence_service.rb
220
199
  - lib/skill_bench/services/prompt_builder_service.rb
221
200
  - lib/skill_bench/services/provider_resolver.rb
201
+ - lib/skill_bench/services/response_cache.rb
222
202
  - lib/skill_bench/services/result_printer_service.rb
223
203
  - lib/skill_bench/services/runner_service.rb
224
204
  - lib/skill_bench/services/skill_resolver.rb
225
205
  - lib/skill_bench/services/skill_resolver_service.rb
226
206
  - lib/skill_bench/services/source_path_resolver_service.rb
207
+ - lib/skill_bench/services/summary_formatter.rb
227
208
  - lib/skill_bench/services/template_registry.rb
228
209
  - lib/skill_bench/services/template_registry/category_data.rb
229
210
  - lib/skill_bench/services/trend_recorder_service.rb
@@ -264,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
264
245
  - !ruby/object:Gem::Version
265
246
  version: '0'
266
247
  requirements: []
267
- rubygems_version: 4.0.12
248
+ rubygems_version: 3.6.9
268
249
  specification_version: 4
269
250
  summary: The evaluation engine for AI Agent Skills benchmarking.
270
251
  test_files: []