ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
metadata ADDED
@@ -0,0 +1,247 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-skill-bench
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ismael Marin
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: activesupport
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '6.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '6.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: cgi
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: 0.5.1
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.5.1
40
+ - !ruby/object:Gem::Dependency
41
+ name: dotenv
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: 3.2.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: 3.2.0
54
+ - !ruby/object:Gem::Dependency
55
+ name: faraday
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '2.14'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '2.14'
68
+ - !ruby/object:Gem::Dependency
69
+ name: json
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '2.19'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '2.19'
82
+ - !ruby/object:Gem::Dependency
83
+ name: parallel
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '1.26'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '1.26'
96
+ description: |
97
+ ruby-skill-bench orchestrates evaluation runs of AI coding agents
98
+ inside isolated git sandboxes, then scores the results using deterministic
99
+ and LLM-powered judges.
100
+ email:
101
+ - ismael.marin@gmail.com
102
+ executables:
103
+ - skill-bench
104
+ extensions: []
105
+ extra_rdoc_files: []
106
+ files:
107
+ - LICENSE
108
+ - README.md
109
+ - bin/skill-bench
110
+ - docs/architecture.md
111
+ - docs/first-eval-guide.md
112
+ - docs/testing-guide.md
113
+ - lib/skill_bench.rb
114
+ - lib/skill_bench/agent.rb
115
+ - lib/skill_bench/agent/react_agent.rb
116
+ - lib/skill_bench/agent/react_agent/loop_runner.rb
117
+ - lib/skill_bench/agent/react_agent/step.rb
118
+ - lib/skill_bench/agent/react_agent/tool_executor.rb
119
+ - lib/skill_bench/agent/runner.rb
120
+ - lib/skill_bench/agent/summary.rb
121
+ - lib/skill_bench/cli.rb
122
+ - lib/skill_bench/cli/eval/eval_command_registry.rb
123
+ - lib/skill_bench/cli/eval/eval_commands.rb
124
+ - lib/skill_bench/cli/eval/eval_options.rb
125
+ - lib/skill_bench/cli/eval_command.rb
126
+ - lib/skill_bench/cli/help_printer.rb
127
+ - lib/skill_bench/cli/init_command.rb
128
+ - lib/skill_bench/cli/result_printer.rb
129
+ - lib/skill_bench/cli/run_command.rb
130
+ - lib/skill_bench/cli/skill_command.rb
131
+ - lib/skill_bench/client.rb
132
+ - lib/skill_bench/clients/all.rb
133
+ - lib/skill_bench/clients/base_client.rb
134
+ - lib/skill_bench/clients/provider_config.rb
135
+ - lib/skill_bench/clients/provider_registry.rb
136
+ - lib/skill_bench/clients/provider_schemas.rb
137
+ - lib/skill_bench/clients/providers/anthropic.rb
138
+ - lib/skill_bench/clients/providers/azure_openai.rb
139
+ - lib/skill_bench/clients/providers/deepseek.rb
140
+ - lib/skill_bench/clients/providers/gemini.rb
141
+ - lib/skill_bench/clients/providers/groq.rb
142
+ - lib/skill_bench/clients/providers/null_client.rb
143
+ - lib/skill_bench/clients/providers/ollama.rb
144
+ - lib/skill_bench/clients/providers/openai.rb
145
+ - lib/skill_bench/clients/providers/opencode.rb
146
+ - lib/skill_bench/clients/providers/openrouter.rb
147
+ - lib/skill_bench/clients/request_builder.rb
148
+ - lib/skill_bench/clients/response_error_handler.rb
149
+ - lib/skill_bench/clients/response_parser.rb
150
+ - lib/skill_bench/clients/retry_handler.rb
151
+ - lib/skill_bench/commands/eval_new.rb
152
+ - lib/skill_bench/commands/init.rb
153
+ - lib/skill_bench/commands/run.rb
154
+ - lib/skill_bench/commands/skill_new.rb
155
+ - lib/skill_bench/config.rb
156
+ - lib/skill_bench/config/applier.rb
157
+ - lib/skill_bench/config/defaults.rb
158
+ - lib/skill_bench/config/env_overrides.rb
159
+ - lib/skill_bench/config/facade_readers.rb
160
+ - lib/skill_bench/config/facade_writers.rb
161
+ - lib/skill_bench/config/json_loader.rb
162
+ - lib/skill_bench/config/store.rb
163
+ - lib/skill_bench/criteria.rb
164
+ - lib/skill_bench/delta_report.rb
165
+ - lib/skill_bench/dimension.rb
166
+ - lib/skill_bench/error_logger.rb
167
+ - lib/skill_bench/evaluate_command.rb
168
+ - lib/skill_bench/evaluation.rb
169
+ - lib/skill_bench/evaluation/generator.rb
170
+ - lib/skill_bench/evaluation/runner.rb
171
+ - lib/skill_bench/execution.rb
172
+ - lib/skill_bench/execution/context_hydrator.rb
173
+ - lib/skill_bench/execution/sandbox.rb
174
+ - lib/skill_bench/execution/source_path_resolver.rb
175
+ - lib/skill_bench/history_recorder.rb
176
+ - lib/skill_bench/history_recorder/history_file.rb
177
+ - lib/skill_bench/history_recorder/history_path_resolver.rb
178
+ - lib/skill_bench/history_recorder/persistence_service.rb
179
+ - lib/skill_bench/history_recorder/summary_service.rb
180
+ - lib/skill_bench/interactive.rb
181
+ - lib/skill_bench/judge.rb
182
+ - lib/skill_bench/judge/judge.rb
183
+ - lib/skill_bench/judge/prompt.rb
184
+ - lib/skill_bench/judge/response.rb
185
+ - lib/skill_bench/migration/provider_migrator.rb
186
+ - lib/skill_bench/models/config.rb
187
+ - lib/skill_bench/models/criteria_validator.rb
188
+ - lib/skill_bench/models/eval.rb
189
+ - lib/skill_bench/models/provider.rb
190
+ - lib/skill_bench/models/skill.rb
191
+ - lib/skill_bench/output_formatter.rb
192
+ - lib/skill_bench/package_verifier.rb
193
+ - lib/skill_bench/rails/skill_templates.rb
194
+ - lib/skill_bench/runner.rb
195
+ - lib/skill_bench/services/delta_table_formatter.rb
196
+ - lib/skill_bench/services/feedback_generator.rb
197
+ - lib/skill_bench/services/formatting_helpers.rb
198
+ - lib/skill_bench/services/iteration_formatter.rb
199
+ - lib/skill_bench/services/json_formatter.rb
200
+ - lib/skill_bench/services/judge_score_parser_service.rb
201
+ - lib/skill_bench/services/junit_formatter.rb
202
+ - lib/skill_bench/services/option_parser_service.rb
203
+ - lib/skill_bench/services/output_persistence_service.rb
204
+ - lib/skill_bench/services/result_printer_service.rb
205
+ - lib/skill_bench/services/runner_service.rb
206
+ - lib/skill_bench/services/skill_resolver.rb
207
+ - lib/skill_bench/services/template_registry.rb
208
+ - lib/skill_bench/services/template_registry/category_data.rb
209
+ - lib/skill_bench/task.rb
210
+ - lib/skill_bench/task/evaluator.rb
211
+ - lib/skill_bench/task/file_reader.rb
212
+ - lib/skill_bench/tools.rb
213
+ - lib/skill_bench/tools/argument_parser.rb
214
+ - lib/skill_bench/tools/base.rb
215
+ - lib/skill_bench/tools/dispatcher.rb
216
+ - lib/skill_bench/tools/read_file.rb
217
+ - lib/skill_bench/tools/registry.rb
218
+ - lib/skill_bench/tools/run_command.rb
219
+ - lib/skill_bench/tools/write_file.rb
220
+ - lib/skill_bench/trend_tracker.rb
221
+ - lib/skill_bench/trend_tracker/persistence.rb
222
+ - lib/skill_bench/trend_tracker/trend_calculator.rb
223
+ - lib/skill_bench/version.rb
224
+ homepage: https://github.com/igmarin/ruby-skill-bench
225
+ licenses:
226
+ - MIT
227
+ metadata:
228
+ rubygems_mfa_required: 'true'
229
+ source_code_uri: https://github.com/igmarin/ruby-skill-bench
230
+ rdoc_options: []
231
+ require_paths:
232
+ - lib
233
+ required_ruby_version: !ruby/object:Gem::Requirement
234
+ requirements:
235
+ - - ">="
236
+ - !ruby/object:Gem::Version
237
+ version: '3.1'
238
+ required_rubygems_version: !ruby/object:Gem::Requirement
239
+ requirements:
240
+ - - ">="
241
+ - !ruby/object:Gem::Version
242
+ version: '0'
243
+ requirements: []
244
+ rubygems_version: 4.0.11
245
+ specification_version: 4
246
+ summary: The evaluation engine for AI Agent Skills benchmarking.
247
+ test_files: []