@intentsolutionsio/skill-creator 5.0.0 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33):
  1. package/package.json +1 -1
  2. package/scripts/validate-skill.py +45 -22
  3. package/skills/agent-creator/SKILL.md +40 -14
  4. package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
  5. package/skills/skill-creator/SKILL.md +34 -9
  6. package/skills/skill-creator/agents/analyzer.md +11 -0
  7. package/skills/skill-creator/agents/comparator.md +3 -0
  8. package/skills/skill-creator/agents/grader.md +4 -0
  9. package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
  10. package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
  11. package/skills/skill-creator/references/anthropic-comparison.md +3 -0
  12. package/skills/skill-creator/references/creation-guide.md +20 -1
  13. package/skills/skill-creator/references/errors-template.md +1 -0
  14. package/skills/skill-creator/references/examples-template.md +1 -0
  15. package/skills/skill-creator/references/frontmatter-spec.md +1 -0
  16. package/skills/skill-creator/references/implementation-template.md +1 -0
  17. package/skills/skill-creator/references/output-patterns.md +7 -0
  18. package/skills/skill-creator/references/schemas.md +5 -0
  19. package/skills/skill-creator/references/source-of-truth.md +40 -2
  20. package/skills/skill-creator/references/validation-rules.md +19 -1
  21. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
  22. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-312.pyc +0 -0
  23. package/skills/skill-creator/scripts/__pycache__/utils.cpython-312.pyc +0 -0
  24. package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
  25. package/skills/skill-creator/scripts/generate_report.py +29 -17
  26. package/skills/skill-creator/scripts/improve_description.py +18 -21
  27. package/skills/skill-creator/scripts/package_skill.py +2 -2
  28. package/skills/skill-creator/scripts/quick_validate.py +16 -15
  29. package/skills/skill-creator/scripts/run_eval.py +14 -10
  30. package/skills/skill-creator/scripts/run_loop.py +51 -31
  31. package/skills/skill-creator/scripts/utils.py +5 -4
  32. package/skills/skill-creator/templates/agent-template.md +3 -0
  33. package/skills/skill-creator/templates/skill-template.md +4 -0
@@ -32,9 +32,32 @@ METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}
32
32
 
33
33
  # Extensions we render as inline text
34
34
  TEXT_EXTENSIONS = {
35
- ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
36
- ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
37
- ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
35
+ ".txt",
36
+ ".md",
37
+ ".json",
38
+ ".csv",
39
+ ".py",
40
+ ".js",
41
+ ".ts",
42
+ ".tsx",
43
+ ".jsx",
44
+ ".yaml",
45
+ ".yml",
46
+ ".xml",
47
+ ".html",
48
+ ".css",
49
+ ".sh",
50
+ ".rb",
51
+ ".go",
52
+ ".rs",
53
+ ".java",
54
+ ".c",
55
+ ".cpp",
56
+ ".h",
57
+ ".hpp",
58
+ ".sql",
59
+ ".r",
60
+ ".toml",
38
61
  }
39
62
 
40
63
  # Extensions we render as inline images
@@ -224,9 +247,7 @@ def load_previous_iteration(workspace: Path) -> dict[str, dict]:
224
247
  try:
225
248
  data = json.loads(feedback_path.read_text())
226
249
  feedback_map = {
227
- r["run_id"]: r["feedback"]
228
- for r in data.get("reviews", [])
229
- if r.get("feedback", "").strip()
250
+ r["run_id"]: r["feedback"] for r in data.get("reviews", []) if r.get("feedback", "").strip()
230
251
  }
231
252
  except (json.JSONDecodeError, OSError, KeyError):
232
253
  pass
@@ -285,12 +306,15 @@ def generate_html(
285
306
  # HTTP server (stdlib only, zero dependencies)
286
307
  # ---------------------------------------------------------------------------
287
308
 
309
+
288
310
  def _kill_port(port: int) -> None:
289
311
  """Kill any process listening on the given port."""
290
312
  try:
291
313
  result = subprocess.run(
292
314
  ["lsof", "-ti", f":{port}"],
293
- capture_output=True, text=True, timeout=5,
315
+ capture_output=True,
316
+ text=True,
317
+ timeout=5,
294
318
  )
295
319
  for pid_str in result.stdout.strip().split("\n"):
296
320
  if pid_str.strip():
@@ -305,6 +329,7 @@ def _kill_port(port: int) -> None:
305
329
  except FileNotFoundError:
306
330
  print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
307
331
 
332
+
308
333
  class ReviewHandler(BaseHTTPRequestHandler):
309
334
  """Serves the review HTML and handles feedback saves.
310
335
 
@@ -390,15 +415,22 @@ def main() -> None:
390
415
  parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
391
416
  parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
392
417
  parser.add_argument(
393
- "--previous-workspace", type=Path, default=None,
418
+ "--previous-workspace",
419
+ type=Path,
420
+ default=None,
394
421
  help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
395
422
  )
396
423
  parser.add_argument(
397
- "--benchmark", type=Path, default=None,
424
+ "--benchmark",
425
+ type=Path,
426
+ default=None,
398
427
  help="Path to benchmark.json to show in the Benchmark tab",
399
428
  )
400
429
  parser.add_argument(
401
- "--static", "-s", type=Path, default=None,
430
+ "--static",
431
+ "-s",
432
+ type=Path,
433
+ default=None,
402
434
  help="Write standalone HTML to this path instead of starting a server",
403
435
  )
404
436
  args = parser.parse_args()
@@ -447,8 +479,8 @@ def main() -> None:
447
479
  port = server.server_address[1]
448
480
 
449
481
  url = f"http://localhost:{port}"
450
- print(f"\n Eval Viewer")
451
- print(f" ─────────────────────────────────")
482
+ print("\n Eval Viewer")
483
+ print(" ─────────────────────────────────")
452
484
  print(f" URL: {url}")
453
485
  print(f" Workspace: {workspace}")
454
486
  print(f" Feedback: {feedback_path}")
@@ -456,7 +488,7 @@ def main() -> None:
456
488
  print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
457
489
  if benchmark_path:
458
490
  print(f" Benchmark: {benchmark_path}")
459
- print(f"\n Press Ctrl+C to stop.\n")
491
+ print("\n Press Ctrl+C to stop.\n")
460
492
 
461
493
  webbrowser.open(url)
462
494
 
@@ -24,6 +24,7 @@ For each test case, spawn two subagents in the same turn — one with the skill,
24
24
  Launch everything at once so runs finish around the same time.
25
25
 
26
26
  **With-skill run:**
27
+
27
28
  ```
28
29
  Execute this task:
29
30
  - Skill path: <path-to-skill>
@@ -34,11 +35,13 @@ Execute this task:
34
35
  ```
35
36
 
36
37
  **Baseline run** (same prompt, no skill):
38
+
37
39
  - **Creating a new skill**: no skill at all. Save to `without_skill/outputs/`.
38
40
  - **Improving an existing skill**: snapshot the old version first (`cp -r`), point baseline
39
41
  at the snapshot. Save to `old_skill/outputs/`.
40
42
 
41
43
  Write an `eval_metadata.json` for each test case:
44
+
42
45
  ```json
43
46
  {
44
47
  "eval_id": 0,
@@ -63,6 +66,7 @@ Update `eval_metadata.json` files and `evals/evals.json` with the assertions. Se
63
66
  When each subagent task completes, the notification contains `total_tokens` and
64
67
  `duration_ms`. Save immediately to `timing.json` — this is the only opportunity to
65
68
  capture this data:
69
+
66
70
  ```json
67
71
  {
68
72
  "total_tokens": 84852,
@@ -83,9 +87,11 @@ Once all runs are done:
83
87
  eyeballing it.
84
88
 
85
89
  2. **Aggregate into benchmark**:
90
+
86
91
  ```bash
87
92
  python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <name>
88
93
  ```
94
+
89
95
  This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for
90
96
  each configuration, with mean +/- stddev and the delta. If generating benchmark.json
91
97
  manually, see `${CLAUDE_SKILL_DIR}/references/schemas.md` for the exact schema the
@@ -96,6 +102,7 @@ Once all runs are done:
96
102
  what to look for — non-discriminating assertions, high-variance evals, time/token tradeoffs.
97
103
 
98
104
  4. **Launch the viewer**:
105
+
99
106
  ```bash
100
107
  nohup python ${CLAUDE_SKILL_DIR}/eval-viewer/generate_review.py \
101
108
  <workspace>/iteration-N \
@@ -104,6 +111,7 @@ Once all runs are done:
104
111
  > /dev/null 2>&1 &
105
112
  VIEWER_PID=$!
106
113
  ```
114
+
107
115
  For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
108
116
 
109
117
  **Headless/Cowork:** Use `--static <output_path>` to write standalone HTML instead of
@@ -115,6 +123,7 @@ Once all runs are done:
115
123
  ### What the user sees in the viewer
116
124
 
117
125
  The "Outputs" tab shows one test case at a time:
126
+
118
127
  - **Prompt**: the task that was given
119
128
  - **Output**: the files the skill produced, rendered inline where possible
120
129
  - **Previous Output** (iteration 2+): collapsed section showing last iteration's output
@@ -131,6 +140,7 @@ all feedback to `feedback.json`.
131
140
  ### Step E5: Read the feedback
132
141
 
133
142
  When the user is done, read `feedback.json`:
143
+
134
144
  ```json
135
145
  {
136
146
  "reviews": [
@@ -140,6 +150,7 @@ When the user is done, read `feedback.json`:
140
150
  "status": "complete"
141
151
  }
142
152
  ```
153
+
143
154
  Empty feedback means the user thought it was fine. Focus improvements on test cases with
144
155
  specific complaints. Kill the viewer server when done: `kill $VIEWER_PID 2>/dev/null`.
145
156
 
@@ -172,6 +183,7 @@ After running test cases and collecting feedback, improve the skill based on wha
172
183
  ### The iteration loop
173
184
 
174
185
  After improving the skill:
186
+
175
187
  1. Apply improvements to the skill
176
188
  2. Rerun all test cases into a new `iteration-<N+1>/` directory, including baselines
177
189
  3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration
@@ -179,6 +191,7 @@ After improving the skill:
179
191
  5. Read new feedback, improve again, repeat
180
192
 
181
193
  Keep going until:
194
+
182
195
  - The user says they're happy
183
196
  - The feedback is all empty (everything looks good)
184
197
  - You're not making meaningful progress
@@ -194,6 +207,7 @@ accuracy.
194
207
  ### Step D1: Generate trigger eval queries
195
208
 
196
209
  Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON:
210
+
197
211
  ```json
198
212
  [
199
213
  {"query": "the user prompt", "should_trigger": true},
@@ -219,6 +233,7 @@ Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called s
219
233
  ### Step D2: Review with user
220
234
 
221
235
  Present the eval set using the HTML template:
236
+
222
237
  1. Read `${CLAUDE_SKILL_DIR}/assets/eval_review.html`
223
238
  2. Replace placeholders: `__EVAL_DATA_PLACEHOLDER__` (JSON array, no quotes — it's a JS
224
239
  variable assignment), `__SKILL_NAME_PLACEHOLDER__`, `__SKILL_DESCRIPTION_PLACEHOLDER__`
@@ -298,6 +313,7 @@ direct the user to the resulting `.skill` file path so they can install it.
298
313
  ### Claude.ai
299
314
 
300
315
  The core workflow (draft -> test -> review -> improve) is the same, but without subagents:
316
+
301
317
  - **Test cases**: Run them yourself one at a time. Skip baseline runs.
302
318
  - **Review**: Present results directly in conversation. Save output files and tell the user
303
319
  where they are.
@@ -3,6 +3,7 @@
3
3
  Sources: [AgentSkills.io spec](https://agentskills.io/specification) · [Anthropic docs](https://code.claude.com/docs/en/skills) · [anthropics/skills repo](https://github.com/anthropics/skills)
4
4
 
5
5
  Comparison of our skill-creator implementation against:
6
+
6
7
  - AgentSkills.io specification (canonical open standard)
7
8
  - Anthropic best practices (platform.claude.com)
8
9
  - anthropics/skills official skill-creator
@@ -73,6 +74,7 @@ Comparison of our skill-creator implementation against:
73
74
  ### For Existing Skills
74
75
 
75
76
  Existing skills that pass our old validator will mostly pass the new one because:
77
+
76
78
  - Enterprise tier (default) still checks `metadata.author`, `metadata.version`, scoped tools
77
79
  - The body section checks are warnings, not errors
78
80
  - "Use when" / "Trigger with" are now recommended patterns, not hard requirements
@@ -85,6 +87,7 @@ Existing skills that pass our old validator will mostly pass the new one because
85
87
  ### New Capabilities
86
88
 
87
89
  Skills can now use:
90
+
88
91
  - `$ARGUMENTS` for dynamic input
89
92
  - `context: fork` for subagent execution
90
93
  - `hooks` for lifecycle automation
@@ -10,6 +10,7 @@ Generate the SKILL.md using the template from `${CLAUDE_SKILL_DIR}/templates/ski
10
10
  **Frontmatter rules** (see `${CLAUDE_SKILL_DIR}/references/frontmatter-spec.md`):
11
11
 
12
12
  Required fields:
13
+
13
14
  ```yaml
14
15
  name: {skill-name} # Must match directory name
15
16
  description: | # Third person, what + when + keywords
@@ -18,10 +19,12 @@ description: | # Third person, what + when + keywords
18
19
  ```
19
20
 
20
21
  **Frontmatter constraints (Anthropic spec):**
22
+
21
23
  - `name`: No XML tags (`<`, `>` characters prohibited). No reserved words (`anthropic`, `claude`) in isolation.
22
24
  - `description`: No XML tags. Description is injected into Claude's system prompt — third person prevents discovery issues where Claude speaks as the skill author.
23
25
 
24
26
  Identity fields (top-level — marketplace validator scores these here):
27
+
25
28
  ```yaml
26
29
  version: 1.0.0
27
30
  author: {name} <{email}>
@@ -32,12 +35,14 @@ license: MIT
32
35
  Do NOT nest them under `metadata:`. The marketplace 100-point validator checks them at top-level.
33
36
 
34
37
  Recommended fields:
38
+
35
39
  ```yaml
36
40
  allowed-tools: "{scoped tools}"
37
41
  model: inherit
38
42
  ```
39
43
 
40
44
  Optional Claude Code extensions:
45
+
41
46
  ```yaml
42
47
  argument-hint: "[arg]" # If accepts $ARGUMENTS
43
48
  context: fork # If needs isolated execution
@@ -71,6 +76,7 @@ Pattern (enterprise): "Use when [scenario]" (+3 pts) + "Trigger with [phrases]"
71
76
  **Body content guidelines — section recommendations:**
72
77
 
73
78
  Anthropic's spec places no format restrictions on body content. The sections below are enterprise-tier quality recommendations scored by the Intent Solutions marketplace rubric. At standard tier, these are not required but are still good practice:
79
+
74
80
  ```
75
81
  ## Overview (>50 chars content: +4 pts enterprise)
76
82
  ## Prerequisites (+2 pts enterprise)
@@ -83,10 +89,11 @@ Anthropic's spec places no format restrictions on body content. The sections bel
83
89
  ```
84
90
 
85
91
  Additional guidelines:
92
+
86
93
  - Keep under 500 lines (offload to `references/` if longer)
87
94
  - Concise — Claude is smart, don't over-explain
88
95
  - Concrete examples over abstract descriptions
89
- - Reference supporting files with relative markdown links: `[details](reference.md)` or `[API](references/api.md)` — Claude reads these on demand
96
+ - Reference supporting files with relative markdown links: `details` or `API` — Claude reads these on demand
90
97
  - Use `${CLAUDE_SKILL_DIR}/` in DCI/bash contexts only: exclamation + backtick-wrapped command, e.g. `cat ${CLAUDE_SKILL_DIR}/references/config.md`
91
98
  - Sections >20 lines (Output, Error Handling, Examples) → offload to `references/` with relative links
92
99
  - If skill has 3+ distinct user operations → split into individual `commands/*.md` files
@@ -100,6 +107,7 @@ Additional guidelines:
100
107
  - **No surprise behavior**: Skills must not contain malware, exploit code, or content that could compromise security. A skill's behavior should not surprise the user if described honestly
101
108
 
102
109
  **String substitutions available:**
110
+
103
111
  - `$ARGUMENTS` / `$0`, `$1` - user-provided arguments (pair with `argument-hint` frontmatter)
104
112
  - `${CLAUDE_SESSION_ID}` - current session ID
105
113
  - `` !`command` `` syntax — dynamic context injection (Anthropic spec feature):
@@ -111,6 +119,7 @@ Additional guidelines:
111
119
  ## Step 5: Create Supporting Files
112
120
 
113
121
  **Scripts** (`scripts/`):
122
+
114
123
  - Scripts should solve problems, not punt to Claude
115
124
  - Explicit error handling
116
125
  - No voodoo constants (document all magic values)
@@ -118,16 +127,19 @@ Additional guidelines:
118
127
  - Make executable: `chmod +x scripts/*.py`
119
128
 
120
129
  **References** (`references/`):
130
+
121
131
  - Heavy documentation that doesn't need to load at activation
122
132
  - Use clear section headers for navigability
123
133
  - For reference files >100 lines, include a TOC at the top so Claude can see full scope even with partial reads
124
134
  - One-level-deep references only (no `references/sub/dir/`)
125
135
 
126
136
  **Templates** (`templates/`):
137
+
127
138
  - Boilerplate files used for generation
128
139
  - Use clear placeholder syntax (`{{PLACEHOLDER}}`)
129
140
 
130
141
  **Assets** (`assets/`):
142
+
131
143
  - Static resources (images, configs, data files)
132
144
 
133
145
  ## Step 6: Validate
@@ -142,6 +154,7 @@ python3 ${CLAUDE_SKILL_DIR}/scripts/validate-skill.py --grade {skill-dir}/SKILL.
142
154
  Standard tier is the default (no required fields, broad compatibility). Use `--enterprise` for full 100-point marketplace grading.
143
155
 
144
156
  **Validation checks:**
157
+
145
158
  - Frontmatter: required fields, types, constraints
146
159
  - Description: third person, what + when, keywords, length
147
160
  - Body: under 500 lines, no absolute paths, has instructions + examples
@@ -151,6 +164,7 @@ Standard tier is the default (no required fields, broad compatibility). Use `--e
151
164
  - Progressive disclosure: appropriate use of references/
152
165
 
153
166
  **If validation fails:** fix issues and re-run. Common fixes:
167
+
154
168
  - Scope Bash tools: `Bash(git:*)` not `Bash`
155
169
  - Remove absolute paths, use `${CLAUDE_SKILL_DIR}/`
156
170
  - Split long SKILL.md into references
@@ -172,6 +186,7 @@ Create `evals/evals.json` with minimum 3 scenarios: happy path, edge case, negat
172
186
  Run parallel evaluation: Claude A with skill installed vs Claude B without. Compare outputs against assertions — the skill should produce meaningfully better results for its target use cases.
173
187
 
174
188
  **Additional testing practices:**
189
+
175
190
  - **Team feedback**: If applicable, share the skill with teammates and observe usage patterns
176
191
  - **Observe Claude navigation**: Watch how Claude reads and navigates the skill — look for unexpected exploration paths, missed references, or overreliance on certain sections
177
192
 
@@ -197,6 +212,7 @@ Tips: front-load distinctive keywords, include specific file types/tools/domains
197
212
  ## Step 10: Report
198
213
 
199
214
  Show the user:
215
+
200
216
  ```
201
217
  SKILL CREATED
202
218
  ====================================
@@ -235,6 +251,7 @@ When the user wants to validate, grade, or audit an existing skill:
235
251
  ### Step V1: Locate the Skill
236
252
 
237
253
  Ask for the SKILL.md path or detect from context. Common locations:
254
+
238
255
  - `~/.claude/skills/{name}/SKILL.md` (global)
239
256
  - `.claude/skills/{name}/SKILL.md` (project)
240
257
 
@@ -268,6 +285,7 @@ Present the grade report with specific fix recommendations. Prioritize fixes by
268
285
  ### Step V5: Auto-Fix (if requested)
269
286
 
270
287
  If the user says "fix it" or "auto-fix", apply the suggested improvements:
288
+
271
289
  1. Add missing sections (Overview, Prerequisites, Output)
272
290
  2. Add "Use when" / "Trigger with" to description
273
291
  3. Move author/version from metadata to top-level
@@ -301,5 +319,6 @@ For A/B testing between skill versions, read `${CLAUDE_SKILL_DIR}/agents/compara
301
319
  ## Platform-Specific Notes
302
320
 
303
321
  See `${CLAUDE_SKILL_DIR}/references/advanced-eval-workflow.md` (section "Platform-Specific Notes").
322
+
304
323
  - **Claude.ai**: No subagents — run tests yourself, skip benchmarking/description optimization.
305
324
  - **Cowork**: Full subagent workflow. Use `--static` for eval viewer. Generate viewer BEFORE self-evaluation.
@@ -22,6 +22,7 @@ Standard troubleshooting reference for all marketplace skills. Every references/
22
22
  ```
23
23
 
24
24
  Categories should match the skill's domain. Examples:
25
+
25
26
  - For GCP skills: Authentication Errors, API Errors, Deployment Errors, Quota Errors
26
27
  - For database skills: Connection Errors, Query Errors, Permission Errors, Data Errors
27
28
  - For CI/CD skills: Build Errors, Deploy Errors, Configuration Errors, Permission Errors
@@ -34,6 +34,7 @@ Standard usage examples reference for all marketplace skills. Every references/e
34
34
  ```
35
35
 
36
36
  Rules:
37
+
37
38
  - Minimum 3 examples, maximum 6
38
39
  - Each example has: scenario title, user request, what's produced, code/config
39
40
  - Code must be real and runnable — no pseudocode, no "add your logic here"
@@ -94,6 +94,7 @@ allowed-tools: "Read,Write,Bash" # Warning in Standard, Error in Enterprise
94
94
  ```
95
95
 
96
96
  **Bash Scoping Patterns**:
97
+
97
98
  ```yaml
98
99
  Bash(git:*) # All git commands
99
100
  Bash(npm:*) # All npm commands
@@ -37,6 +37,7 @@ Standard implementation guide for all marketplace skills. Every references/imple
37
37
  ```
38
38
 
39
39
  Rules:
40
+
40
41
  - Write from the perspective of how this specific skill works, not generic patterns
41
42
  - Include real commands, real API calls, real config values
42
43
  - Keep under 200 lines — this is a reference, not a textbook
@@ -29,6 +29,7 @@ Generate output using this exact template:
29
29
  ```
30
30
 
31
31
  Replace placeholders with gathered values. Do not add extra fields.
32
+
32
33
  ```
33
34
 
34
35
  ### Flexible Template (Medium Degrees of Freedom)
@@ -69,20 +70,24 @@ Provide input/output pairs that demonstrate expected behavior.
69
70
  **Input**: `/skill-name auth.py`
70
71
  **Output**:
71
72
  ```
73
+
72
74
  auth.py: 3 issues found
73
75
  Line 15: SQL injection risk in query builder
74
76
  Line 42: Hardcoded credential detected
75
77
  Line 89: Missing input validation
78
+
76
79
  ```
77
80
 
78
81
  ### Complex case
79
82
  **Input**: `/skill-name --deep src/`
80
83
  **Output**:
81
84
  ```
85
+
82
86
  Deep scan: 12 files, 7 issues
83
87
  CRITICAL (2): sql-injection, hardcoded-secret
84
88
  WARNING (3): missing-validation, weak-hash, cors-wildcard
85
89
  INFO (2): deprecated-api, unused-import
90
+
86
91
  ```
87
92
  ```
88
93
 
@@ -135,6 +140,7 @@ Example HTML structure:
135
140
  </body>
136
141
  </html>
137
142
  ```
143
+
138
144
  ```
139
145
 
140
146
  ### When to Use
@@ -177,6 +183,7 @@ Results are written as JSON to `{output_path}/results.json`:
177
183
  ```
178
184
 
179
185
  Additionally, a human-readable summary is printed to the conversation.
186
+
180
187
  ```
181
188
 
182
189
  ---
@@ -27,6 +27,7 @@ Defines the evals for a skill. Located at `evals/evals.json` within the skill di
27
27
  ```
28
28
 
29
29
  **Fields:**
30
+
30
31
  - `skill_name`: Name matching the skill's frontmatter
31
32
  - `evals[].id`: Unique integer identifier
32
33
  - `evals[].prompt`: The task to execute
@@ -72,6 +73,7 @@ Tracks version progression in Improve mode. Located at workspace root.
72
73
  ```
73
74
 
74
75
  **Fields:**
76
+
75
77
  - `started_at`: ISO timestamp of when improvement started
76
78
  - `skill_name`: Name of the skill being improved
77
79
  - `current_best`: Version identifier of the best performer
@@ -150,6 +152,7 @@ Output from the grader agent. Located at `<run-dir>/grading.json`.
150
152
  ```
151
153
 
152
154
  **Fields:**
155
+
153
156
  - `expectations[]`: Graded expectations with evidence
154
157
  - `summary`: Aggregate pass/fail counts
155
158
  - `execution_metrics`: Tool usage and output size (from executor's metrics.json)
@@ -184,6 +187,7 @@ Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
184
187
  ```
185
188
 
186
189
  **Fields:**
190
+
187
191
  - `tool_calls`: Count per tool type
188
192
  - `total_tool_calls`: Sum of all tool calls
189
193
  - `total_steps`: Number of major execution steps
@@ -286,6 +290,7 @@ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
286
290
  ```
287
291
 
288
292
  **Fields:**
293
+
289
294
  - `metadata`: Information about the benchmark run
290
295
  - `skill_name`: Name of the skill
291
296
  - `timestamp`: When the benchmark was run