ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,522 @@
1
+ # SkillBench - 5 Minute First Eval Guide
2
+
3
+ Get started with Ruby Skill Bench in 5 minutes. No prior AI eval experience required.
4
+
5
+ ---
6
+
7
+ ## Prerequisites
8
+
9
+ - Ruby 3.1+
10
+ - Bundler
11
+
12
+ Not sure? Run:
13
+
14
+ ```bash
15
+ ruby --version # Should be 3.1 or higher
16
+ bundle --version # Should print a version number
17
+ ```
18
+
19
+ ---
20
+
21
+ ## Step 1: Installation
22
+
23
+ Add to your Gemfile:
24
+
25
+ ```ruby
26
+ gem 'ruby-skill-bench'
27
+ ```
28
+
29
+ Or install globally:
30
+
31
+ ```bash
32
+ gem install ruby-skill-bench
33
+ ```
34
+
35
+ ---
36
+
37
+ ## Step 2: Initialize Configuration
38
+
39
+ ```bash
40
+ skill-bench init --openai
41
+ ```
42
+
43
+ This creates `skill-bench.json` with the OpenAI provider config. Use `--force` to overwrite.
44
+
45
+ **Available providers:** `--openai`, `--anthropic`, `--gemini`, `--ollama`, `--azure`, `--groq`, `--deepseek`, `--opencode`
46
+
47
+ > **What is `skill-bench.json`?** This is your config file. It stores your API key, chosen LLM model, timeout, and allowed shell commands. Think of it as `.env` but structured as JSON. You edit it; SkillBench reads it.
48
+
49
+ ---
50
+
51
+ ## Step 3: Create Your First Skill
52
+
53
+ ```bash
54
+ skill-bench skill new my-service --mode=rails --template=service_object
55
+ ```
56
+
57
+ This creates `skills/my-service/SKILL.md` with a Rails service object template.
58
+
59
+ **What is a skill?** A skill is a set of instructions (written in Markdown) that you want the AI agent to follow. It is like a style guide or a cheat sheet. The agent reads it before solving the task.
60
+
61
+ **What goes in `SKILL.md`:**
62
+ - What pattern the skill implements (e.g. "Service Object with `.call`")
63
+ - Hard rules the agent must follow
64
+ - Code examples
65
+ - Response format expectations
66
+
67
+ **Example `SKILL.md`:**
68
+
69
+ ````markdown
70
+ # Service Object Skill
71
+
72
+ ## Pattern
73
+
74
+ All service objects use the `.call` class method and return a standardized hash:
75
+
76
+ ```ruby
77
+ { success: true, response: { data: ... } }
78
+ ```
79
+
80
+ ## Hard Rules
81
+
82
+ 1. Every `.rb` file begins with `# frozen_string_literal: true`
83
+ 2. Every public method has YARD docs (`@param`, `@return`, `@raise`)
84
+ 3. `rescue StandardError` blocks must log backtrace
85
+ ````
86
+
87
+ ---
88
+
89
+ ## Step 4: Create an Eval
90
+
91
+ You have three options.
92
+
93
+ ### Option A — Manual (recommended for learning)
94
+
95
+ ```bash
96
+ skill-bench eval new my-first-eval --runtime=rails
97
+ ```
98
+
99
+ **Creates:**
100
+
101
+ ```text
102
+ evals/
103
+ └── my-first-eval/
104
+ ├── task.md # <- The prompt given to the agent
105
+ └── criteria.json # <- How the judge scores the result
106
+ ```
107
+
108
+ #### What goes in `task.md`
109
+
110
+ This is the **user prompt** the agent receives. Be specific — the agent has no other context.
111
+
112
+ **Bad example (too vague):**
113
+
114
+ ```markdown
115
+ Create a user service.
116
+ ```
117
+
118
+ **Good example (specific requirements):**
119
+
120
+ ```markdown
121
+ Create a `UserRegistrationService` that:
122
+
123
+ 1. Accepts `email` and `password` parameters
124
+ 2. Validates email format with a regex (must contain @ and a domain)
125
+ 3. Validates password length (minimum 8 characters)
126
+ 4. Returns `{ success: true, response: { user_id: ... } }` on success
127
+ 5. Returns `{ success: false, response: { error: { message: ... } } }` on failure
128
+ 6. Includes YARD documentation for every public method
129
+ 7. Includes RSpec tests covering both success and failure paths
130
+ 8. Follows the frozen_string_literal convention
131
+
132
+ Do not use ActiveRecord. Use plain Ruby objects.
133
+ ```
134
+
135
+ #### What goes in `criteria.json`
136
+
137
+ This tells the judge how to score. The 5 core dimensions are mandatory.
138
+
139
+ **Minimal example (copy-paste ready):**
140
+
141
+ ```json
142
+ {
143
+ "context": "Evaluate service object creation skill",
144
+ "dimensions": [
145
+ { "name": "correctness", "max_score": 30 },
146
+ { "name": "skill_adherence", "max_score": 25 },
147
+ { "name": "code_quality", "max_score": 20 },
148
+ { "name": "test_coverage", "max_score": 15 },
149
+ { "name": "documentation", "max_score": 10 }
150
+ ],
151
+ "pass_threshold": 70,
152
+ "minimum_delta": 10
153
+ }
154
+ ```
155
+
156
+ **With custom descriptions (recommended):**
157
+
158
+ ```json
159
+ {
160
+ "context": "Evaluate service object creation skill",
161
+ "dimensions": [
162
+ { "name": "correctness", "max_score": 30 },
163
+ { "name": "skill_adherence", "max_score": 25, "description": "Did the agent use the .call pattern and return the standardized hash?" },
164
+ { "name": "code_quality", "max_score": 20 },
165
+ { "name": "test_coverage", "max_score": 15, "description": "Are there tests for both success and failure paths?" },
166
+ { "name": "documentation", "max_score": 10 }
167
+ ],
168
+ "pass_threshold": 70,
169
+ "minimum_delta": 10
170
+ }
171
+ ```
172
+
173
+ **Key rules:**
174
+
175
+ - `max_score` values must sum to exactly 100
176
+ - All 5 core dimensions (`correctness`, `skill_adherence`, `code_quality`, `test_coverage`, `documentation`) are required
177
+ - `pass_threshold` = minimum context score to pass (0-100)
178
+ - `minimum_delta` = minimum improvement over baseline to pass
179
+
180
+ ---
181
+
182
+ ### Option B — Auto-Generated (from a skill)
183
+
184
+ If you already have a skill and want the LLM to design the eval for you:
185
+
186
+ ```bash
187
+ skill-bench eval generate my-service --name my-first-eval
188
+ ```
189
+
190
+ This reads `skills/my-service/SKILL.md` and generates both `task.md` and `criteria.json`. The output is validated immediately — if the generated `criteria.json` has invalid dimensions or its `max_score` values don't sum to 100, you'll see an error and can fix the file manually.
191
+
192
+ ---
193
+
194
+ ### Option C — Using TemplateRegistry (Programmatic)
195
+
196
+ For automated eval creation or building tools on top of SkillBench, use `TemplateRegistry` to generate scaffolding from pre-built templates.
197
+
198
+ **Basic Usage:**
199
+
200
+ ```ruby
201
+ require 'skill_bench'
202
+
203
+ # Generate all eval files from templates
204
+ task_md = SkillBench::Services::TemplateRegistry.call(:task_md, :crud, skill_name: "UserCreator")
205
+ criteria_json = SkillBench::Services::TemplateRegistry.call(:criteria_json, :crud)
206
+ skill_md = SkillBench::Services::TemplateRegistry.call(:skill_md, :crud, skill_name: "UserCreator")
207
+
208
+ # Write to disk
209
+ FileUtils.mkdir_p("evals/user-creator")
210
+ File.write("evals/user-creator/task.md", task_md)
211
+ File.write("evals/user-creator/criteria.json", criteria_json)
212
+
213
+ FileUtils.mkdir_p("skills/user-creator")
214
+ File.write("skills/user-creator/SKILL.md", skill_md)
215
+ ```
216
+
217
+ **Available Categories:**
218
+
219
+ | Category | Use Case |
220
+ |----------|----------|
221
+ | `crud` | Service Objects with Create, Read, Update, Delete |
222
+ | `api` | API clients with authentication and error handling |
223
+ | `background_job` | ActiveJob/Sidekiq workers with retry logic |
224
+ | `controller` | RESTful controllers with strong parameters |
225
+ | `model` | ActiveRecord models with validations |
226
+ | `migration` | Database migrations with indexes |
227
+ | `concern` | ActiveSupport::Concern modules |
228
+ | `policy` | Authorization policies (Pundit-style) |
229
+ | `form_object` | Form objects with validations |
230
+ | `view_component` | ViewComponent components with previews |
231
+
232
+ **Template Types:**
233
+
234
+ | Type | Output | Purpose |
235
+ |------|--------|---------|
236
+ | `task_md` | Markdown | Agent prompt with requirements |
237
+ | `criteria_json` | JSON | Scoring rules and dimensions |
238
+ | `skill_md` | Markdown | Skill instructions for the agent |
239
+
240
+ **Variable Interpolation:**
241
+
242
+ Templates support `{{variable_name}}` syntax for dynamic content:
243
+
244
+ ```ruby
245
+ task = SkillBench::Services::TemplateRegistry.call(
246
+ :task_md,
247
+ :api,
248
+ skill_name: "PaymentGateway",
249
+ endpoint: "/api/v1/payments"
250
+ )
251
+ ```
252
+
253
+ > **Tip:** `TemplateRegistry` returns template strings you can customize before writing to disk. It's a pure function with no side effects.
254
+
255
+ ---
256
+
257
+ ## Step 5: Run the Eval
258
+
259
+ ```bash
260
+ skill-bench run my-first-eval --skill=my-service
261
+ ```
262
+
263
+ The provider is read from `skill-bench.json` — no `--provider` flag is needed.
264
+
265
+ **What happens behind the scenes:**
266
+
267
+ 1. Agent runs **without** skill context → produces baseline output
268
+ 2. Agent runs **with** skill context → produces context output
269
+ 3. Judge scores both independently → per-dimension scores
270
+ 4. Engine computes deltas → applies pass/fail logic
271
+ 5. Result is recorded in `.skill-bench-history.json` for trend tracking
272
+
273
+ **Run with multiple skills:**
274
+
275
+ ```bash
276
+ skill-bench run my-first-eval --skill=skill-a --skill=skill-b
277
+ ```
278
+
279
+ Both skill contexts are concatenated. The judge evaluates whether the combined context improves results.
280
+
281
+ **Available Providers (configured via `skill-bench init`):**
282
+
283
+ - `openai` — OpenAI GPT models
284
+ - `anthropic` — Anthropic Claude
285
+ - `gemini` — Google Gemini
286
+ - `azure` — Azure OpenAI
287
+ - `ollama` — Local Ollama models
288
+ - `groq` — Groq fast inference
289
+ - `deepseek` — DeepSeek models
290
+ - `opencode` — OpenCode platform (**requires custom `base_url`**: OpenCode does not host a public LLM API. Provide your own OpenAI-compatible endpoint via `config.base_url`)
291
+
292
+ ---
293
+
294
+ ## Step 6: Check Results
295
+
296
+ **Human-readable output (default):**
297
+
298
+ ```text
299
+ ═══════════════════════════════════════════════════════
300
+ Eval: my-first-eval
301
+ Skill: my-service
302
+ Provider: openai
303
+ ═══════════════════════════════════════════════════════
304
+
305
+ === BASELINE ITERATIONS ===
306
+ Step 1: Read task → Tool: read_file → Observation: content...
307
+ Step 2: Plan changes → Tool: write_file → Observation: Success...
308
+ Step 3: Run tests → Tool: run_command → Observation: 3 runs, 0 failures
309
+ Step 4: Final answer
310
+
311
+ === CONTEXT ITERATIONS ===
312
+ Step 1: Read task → Tool: read_file → Observation: content...
313
+ Step 2: Apply skill pattern → Tool: write_file, run_command → Observation: Success...
314
+ Step 3: Final answer
315
+
316
+ DIMENSION BASELINE CONTEXT DELTA
317
+ ──────────────────────── ───────── ───────── ───────
318
+ Correctness (30) 12 28 +16
319
+ Skill Adherence (25) 5 22 +17
320
+ Code Quality (20) 10 16 +6
321
+ Test Coverage (15) 3 13 +10
322
+ Documentation (10) 2 8 +6
323
+ ──────────────────────── ───────── ───────── ───────
324
+ TOTAL 32/100 87/100 +55
325
+
326
+ TREND: baseline ↑ (+2), context ↑ (+7)
327
+ VERDICT: PASS (threshold: 70, minimum delta: 10)
328
+ ═══════════════════════════════════════════════════════
329
+
330
+ === WHAT WENT WELL ===
331
+ Correctness (28/30, baseline: 12/30)
332
+ The agent correctly implemented all required behaviors.
333
+ Skill Adherence (22/25, baseline: 5/25)
334
+ Followed the service object pattern and hard gates.
335
+
336
+ === WHAT WENT WRONG ===
337
+ Test Coverage (13/15, baseline: 3/15)
338
+ Tests exist but edge cases are missing.
339
+ Advice: Are there meaningful tests? Do they test the right things?
340
+ ```
341
+
342
+ **Column meanings:**
343
+
344
+ | Column | Meaning |
345
+ |--------|---------|
346
+ | **BASELINE** | Score without skill (unaided performance). Think: "How well does the AI do on its own?" |
347
+ | **CONTEXT** | Score with skill (aided performance). Think: "How well does the AI do when it reads my skill?" |
348
+ | **DELTA** | Improvement = CONTEXT - BASELINE. Think: "How many points did my skill add?" |
349
+ | **TREND** | Change since the *previous* run of this exact eval + skill. Stored in `.skill-bench-history.json`. |
350
+ | **VERDICT** | PASS only if CONTEXT >= threshold AND DELTA >= minimum_delta. Both must be true. |
351
+ | **Iterations** | ReAct loop steps for each run: thought → tools → observation. Helps you understand *how* the agent worked. |
352
+ | **What went well** | Dimensions scoring ≥ 80% of max, with judge reasoning. Strengths of your skill. |
353
+ | **What went wrong** | Dimensions scoring < 80% of max, with judge reasoning + baseline comparison. Weaknesses to fix. |
354
+ | **Advice** | Description from `criteria.json` for each low-scoring dimension. Actionable guidance for improvement. |
355
+
356
+ **Why both conditions for PASS?**
357
+
358
+ - `pass_threshold` alone would pass even if the skill didn't help (e.g. baseline=80, context=80, delta=0).
359
+ - `minimum_delta` alone would pass even if the absolute score is terrible (e.g. baseline=10, context=20, delta=10).
360
+ - Both together ensure the skill is **both effective and meaningful**.
361
+
362
+ **The four possible outcomes:**
363
+
364
+ | Context Score | Delta | Verdict | What it means |
365
+ |---------------|-------|---------|---------------|
366
+ | 87 | +55 | **PASS** | Skill helped a lot. Context >= 70 **and** delta >= 10. |
367
+ | 87 | -2 | **FAIL** | Skill made things **worse**. Context >= 70 **but** delta < 10. |
368
+ | 65 | +15 | **FAIL** | Skill helped, but not enough. Delta >= 10 **but** context < 70. |
369
+ | 65 | +5 | **FAIL** | Skill didn't help enough. Both conditions failed. |
370
+
371
+ **Most common surprise: negative delta**
372
+
373
+ If baseline=89 and context=87, your skill confused the agent. The agent scored higher *without* reading your skill. This usually means:
374
+
375
+ 1. **Skill is too long** — the agent fixates on following the skill and ignores the actual task
376
+ 2. **Skill contradicts the task** — e.g., skill says "use Service Objects" but task says "write a script"
377
+ 3. **Over-engineering** — skill adds boilerplate (factories, decorators) that the judge penalizes as unnecessary
378
+
379
+ **Fix:** Remove rules that don't directly improve the weakest dimension. To find it, look at the dimension with the smallest (or most negative) delta, then delete or rewrite the rules targeting that dimension.
380
+
381
+ **Using the iteration timeline:**
382
+
383
+ The iteration timeline shows every tool call the agent made. Watch for:
384
+
385
+ - **Many `read_file` errors** — The agent is guessing filenames. Add `ls` or `find` to `allowed_commands` in `skill-bench.json`, or write a more explicit task.
386
+ - **Tool errors in baseline but not context** — Your skill taught the agent correct file paths or command sequences. Good!
387
+ - **Tool errors in both runs** — The task references files that don't exist in the sandbox. Check your `task.md` for stale paths.
388
+ - **Many iterations, no final answer** — The agent hit the max iteration limit. Increase `max_iterations` in your provider config:
389
+
390
+ ```json
391
+ {
392
+ "config": {
393
+ "max_iterations": 50
394
+ }
395
+ }
396
+ ```
397
+
398
+ Default is 25. Complex Rails tasks often need 30–50+ steps.
399
+
400
+ **JSON output:**
401
+
402
+ ```bash
403
+ skill-bench run my-first-eval --skill=my-service --format json
404
+ ```
405
+
406
+ **JUnit XML output:**
407
+
408
+ ```bash
409
+ skill-bench run my-first-eval --skill=my-service --format junit
410
+ ```
411
+
412
+ ---
413
+
414
+ ## Step 7: Iterate and Improve
415
+
416
+ Your first run probably will not pass. That is normal. Here is how to improve.
417
+
418
+ ### Use the History File
419
+
420
+ After each run, SkillBench appends to `.skill-bench-history.json`. You can read it to track progress:
421
+
422
+ ```bash
423
+ cat .skill-bench-history.json | jq '.[-1]'
424
+ ```
425
+
426
+ Look at the dimension with the **smallest delta**. That is where your skill is weakest. Open `SKILL.md` and add a concrete rule targeting that dimension.
427
+
428
+ ### Example Iteration
429
+
430
+ **Run 1:** Test Coverage delta is only `+3`.
431
+
432
+ **Action:** Add to `SKILL.md`:
433
+
434
+ ```markdown
435
+ ## Testing Rules
436
+
437
+ Every service must have RSpec tests with:
438
+ - One test for the happy path (valid input succeeds)
439
+ - One test for the error path (invalid input returns errors)
440
+ - Use `describe`, `context`, and `it` blocks
441
+ ```
442
+
443
+ **Run 2:** Test Coverage delta jumps to `+10`. TREND line shows `context ↑ (+5)`.
444
+
445
+ **Repeat** until the eval passes consistently and deltas are stable.
446
+
447
+ ---
448
+
449
+ ## Understanding the Files on Disk
450
+
451
+ SkillBench manages three files you should know about:
452
+
453
+ ### `skill-bench.json` — Your Configuration (You Edit This)
454
+
455
+ Created by `skill-bench init`. Stores provider, API key, model, timeout, and allowed commands. You edit this file by hand or with the CLI.
456
+
457
+ ```json
458
+ {
459
+ "provider": "openai",
460
+ "max_execution_time": 300,
461
+ "allowed_commands": ["rspec", "bundle", "ruby", "git"],
462
+ "config": {
463
+ "api_key": "sk-...",
464
+ "model": "gpt-4o",
465
+ "max_iterations": 25
466
+ }
467
+ }
468
+ ```
469
+
470
+ ### `.skill-bench-history.json` — Evaluation History (Auto-Generated)
471
+
472
+ A JSON array recording every successful eval run. SkillBench writes it automatically. It stores timestamps, eval names, skill names, scores, and deltas. This powers the **TREND** line in your output.
473
+
474
+ ```json
475
+ [
476
+ {
477
+ "timestamp": "2026-05-12T10:30:00Z",
478
+ "eval_name": "my-first-eval",
479
+ "skill_names": ["my-service"],
480
+ "verdict": true,
481
+ "baseline_total": 32,
482
+ "context_total": 87,
483
+ "deltas": { "correctness": 16, "skill_adherence": 17, ... }
484
+ }
485
+ ]
486
+ ```
487
+
488
+ **Tip:** Commit this file to git if you want to share trend data with your team.
489
+
490
+ ### `.skill-bench-history.json.bak` — Backup (Auto-Generated)
491
+
492
+ A safety copy of the history file. If the main file gets corrupted, SkillBench recovers from this backup automatically. You never need to touch it.
493
+
494
+ ---
495
+
496
+ ## Troubleshooting
497
+
498
+ ### "Dimension max_scores must sum to 100"
499
+
500
+ Check your `criteria.json`. All `max_score` values must add up to exactly 100.
501
+
502
+ ### "missing required core dimensions: documentation"
503
+
504
+ You are missing one of the 5 mandatory dimensions. All of these must be present: `correctness`, `skill_adherence`, `code_quality`, `test_coverage`, `documentation`.
505
+
506
+ ### "Config load failed, using mock provider"
507
+
508
+ Run `skill-bench init --<provider>` to create `skill-bench.json`, or ensure it exists in the current directory.
509
+
510
+ ### "Baseline agent failed" or "Context agent failed"
511
+
512
+ The LLM provider returned an error. Check your API key in `skill-bench.json` or environment variables.
513
+
514
+ ### "Base URL not set for Opencode"
515
+
516
+ You selected `opencode` as the provider but did not set a `base_url`. OpenCode does not host a public API. Either switch to a provider that hosts its own API (`openrouter`, `groq`, etc.) or set `config.base_url` to your own OpenAI-compatible endpoint.
517
+
518
+ ## Next Steps
519
+
520
+ - Explore skill templates with `skill-bench skill new --help`
521
+ - Read `docs/architecture.md` for the full component map
522
+ - Read `docs/testing-guide.md` for advanced eval authoring techniques