@intentsolutionsio/skill-creator 5.0.0 → 5.0.3
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/validate-skill.py +45 -22
- package/skills/agent-creator/SKILL.md +40 -14
- package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
- package/skills/skill-creator/SKILL.md +34 -9
- package/skills/skill-creator/agents/analyzer.md +11 -0
- package/skills/skill-creator/agents/comparator.md +3 -0
- package/skills/skill-creator/agents/grader.md +4 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
- package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
- package/skills/skill-creator/references/anthropic-comparison.md +3 -0
- package/skills/skill-creator/references/creation-guide.md +20 -1
- package/skills/skill-creator/references/errors-template.md +1 -0
- package/skills/skill-creator/references/examples-template.md +1 -0
- package/skills/skill-creator/references/frontmatter-spec.md +1 -0
- package/skills/skill-creator/references/implementation-template.md +1 -0
- package/skills/skill-creator/references/output-patterns.md +7 -0
- package/skills/skill-creator/references/schemas.md +5 -0
- package/skills/skill-creator/references/source-of-truth.md +40 -2
- package/skills/skill-creator/references/validation-rules.md +19 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-312.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-312.pyc +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
- package/skills/skill-creator/scripts/generate_report.py +29 -17
- package/skills/skill-creator/scripts/improve_description.py +18 -21
- package/skills/skill-creator/scripts/package_skill.py +2 -2
- package/skills/skill-creator/scripts/quick_validate.py +16 -15
- package/skills/skill-creator/scripts/run_eval.py +14 -10
- package/skills/skill-creator/scripts/run_loop.py +51 -31
- package/skills/skill-creator/scripts/utils.py +5 -4
- package/skills/skill-creator/templates/agent-template.md +3 -0
- package/skills/skill-creator/templates/skill-template.md +4 -0
|
@@ -32,9 +32,32 @@ METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}
|
|
|
32
32
|
|
|
33
33
|
# Extensions we render as inline text
|
|
34
34
|
TEXT_EXTENSIONS = {
|
|
35
|
-
".txt",
|
|
36
|
-
".
|
|
37
|
-
".
|
|
35
|
+
".txt",
|
|
36
|
+
".md",
|
|
37
|
+
".json",
|
|
38
|
+
".csv",
|
|
39
|
+
".py",
|
|
40
|
+
".js",
|
|
41
|
+
".ts",
|
|
42
|
+
".tsx",
|
|
43
|
+
".jsx",
|
|
44
|
+
".yaml",
|
|
45
|
+
".yml",
|
|
46
|
+
".xml",
|
|
47
|
+
".html",
|
|
48
|
+
".css",
|
|
49
|
+
".sh",
|
|
50
|
+
".rb",
|
|
51
|
+
".go",
|
|
52
|
+
".rs",
|
|
53
|
+
".java",
|
|
54
|
+
".c",
|
|
55
|
+
".cpp",
|
|
56
|
+
".h",
|
|
57
|
+
".hpp",
|
|
58
|
+
".sql",
|
|
59
|
+
".r",
|
|
60
|
+
".toml",
|
|
38
61
|
}
|
|
39
62
|
|
|
40
63
|
# Extensions we render as inline images
|
|
@@ -224,9 +247,7 @@ def load_previous_iteration(workspace: Path) -> dict[str, dict]:
|
|
|
224
247
|
try:
|
|
225
248
|
data = json.loads(feedback_path.read_text())
|
|
226
249
|
feedback_map = {
|
|
227
|
-
r["run_id"]: r["feedback"]
|
|
228
|
-
for r in data.get("reviews", [])
|
|
229
|
-
if r.get("feedback", "").strip()
|
|
250
|
+
r["run_id"]: r["feedback"] for r in data.get("reviews", []) if r.get("feedback", "").strip()
|
|
230
251
|
}
|
|
231
252
|
except (json.JSONDecodeError, OSError, KeyError):
|
|
232
253
|
pass
|
|
@@ -285,12 +306,15 @@ def generate_html(
|
|
|
285
306
|
# HTTP server (stdlib only, zero dependencies)
|
|
286
307
|
# ---------------------------------------------------------------------------
|
|
287
308
|
|
|
309
|
+
|
|
288
310
|
def _kill_port(port: int) -> None:
|
|
289
311
|
"""Kill any process listening on the given port."""
|
|
290
312
|
try:
|
|
291
313
|
result = subprocess.run(
|
|
292
314
|
["lsof", "-ti", f":{port}"],
|
|
293
|
-
capture_output=True,
|
|
315
|
+
capture_output=True,
|
|
316
|
+
text=True,
|
|
317
|
+
timeout=5,
|
|
294
318
|
)
|
|
295
319
|
for pid_str in result.stdout.strip().split("\n"):
|
|
296
320
|
if pid_str.strip():
|
|
@@ -305,6 +329,7 @@ def _kill_port(port: int) -> None:
|
|
|
305
329
|
except FileNotFoundError:
|
|
306
330
|
print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
|
|
307
331
|
|
|
332
|
+
|
|
308
333
|
class ReviewHandler(BaseHTTPRequestHandler):
|
|
309
334
|
"""Serves the review HTML and handles feedback saves.
|
|
310
335
|
|
|
@@ -390,15 +415,22 @@ def main() -> None:
|
|
|
390
415
|
parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
|
|
391
416
|
parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
|
|
392
417
|
parser.add_argument(
|
|
393
|
-
"--previous-workspace",
|
|
418
|
+
"--previous-workspace",
|
|
419
|
+
type=Path,
|
|
420
|
+
default=None,
|
|
394
421
|
help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
|
|
395
422
|
)
|
|
396
423
|
parser.add_argument(
|
|
397
|
-
"--benchmark",
|
|
424
|
+
"--benchmark",
|
|
425
|
+
type=Path,
|
|
426
|
+
default=None,
|
|
398
427
|
help="Path to benchmark.json to show in the Benchmark tab",
|
|
399
428
|
)
|
|
400
429
|
parser.add_argument(
|
|
401
|
-
"--static",
|
|
430
|
+
"--static",
|
|
431
|
+
"-s",
|
|
432
|
+
type=Path,
|
|
433
|
+
default=None,
|
|
402
434
|
help="Write standalone HTML to this path instead of starting a server",
|
|
403
435
|
)
|
|
404
436
|
args = parser.parse_args()
|
|
@@ -447,8 +479,8 @@ def main() -> None:
|
|
|
447
479
|
port = server.server_address[1]
|
|
448
480
|
|
|
449
481
|
url = f"http://localhost:{port}"
|
|
450
|
-
print(
|
|
451
|
-
print(
|
|
482
|
+
print("\n Eval Viewer")
|
|
483
|
+
print(" ─────────────────────────────────")
|
|
452
484
|
print(f" URL: {url}")
|
|
453
485
|
print(f" Workspace: {workspace}")
|
|
454
486
|
print(f" Feedback: {feedback_path}")
|
|
@@ -456,7 +488,7 @@ def main() -> None:
|
|
|
456
488
|
print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
|
|
457
489
|
if benchmark_path:
|
|
458
490
|
print(f" Benchmark: {benchmark_path}")
|
|
459
|
-
print(
|
|
491
|
+
print("\n Press Ctrl+C to stop.\n")
|
|
460
492
|
|
|
461
493
|
webbrowser.open(url)
|
|
462
494
|
|
|
@@ -24,6 +24,7 @@ For each test case, spawn two subagents in the same turn — one with the skill,
|
|
|
24
24
|
Launch everything at once so runs finish around the same time.
|
|
25
25
|
|
|
26
26
|
**With-skill run:**
|
|
27
|
+
|
|
27
28
|
```
|
|
28
29
|
Execute this task:
|
|
29
30
|
- Skill path: <path-to-skill>
|
|
@@ -34,11 +35,13 @@ Execute this task:
|
|
|
34
35
|
```
|
|
35
36
|
|
|
36
37
|
**Baseline run** (same prompt, no skill):
|
|
38
|
+
|
|
37
39
|
- **Creating a new skill**: no skill at all. Save to `without_skill/outputs/`.
|
|
38
40
|
- **Improving an existing skill**: snapshot the old version first (`cp -r`), point baseline
|
|
39
41
|
at the snapshot. Save to `old_skill/outputs/`.
|
|
40
42
|
|
|
41
43
|
Write an `eval_metadata.json` for each test case:
|
|
44
|
+
|
|
42
45
|
```json
|
|
43
46
|
{
|
|
44
47
|
"eval_id": 0,
|
|
@@ -63,6 +66,7 @@ Update `eval_metadata.json` files and `evals/evals.json` with the assertions. Se
|
|
|
63
66
|
When each subagent task completes, the notification contains `total_tokens` and
|
|
64
67
|
`duration_ms`. Save immediately to `timing.json` — this is the only opportunity to
|
|
65
68
|
capture this data:
|
|
69
|
+
|
|
66
70
|
```json
|
|
67
71
|
{
|
|
68
72
|
"total_tokens": 84852,
|
|
@@ -83,9 +87,11 @@ Once all runs are done:
|
|
|
83
87
|
eyeballing it.
|
|
84
88
|
|
|
85
89
|
2. **Aggregate into benchmark**:
|
|
90
|
+
|
|
86
91
|
```bash
|
|
87
92
|
python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <name>
|
|
88
93
|
```
|
|
94
|
+
|
|
89
95
|
This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for
|
|
90
96
|
each configuration, with mean +/- stddev and the delta. If generating benchmark.json
|
|
91
97
|
manually, see `${CLAUDE_SKILL_DIR}/references/schemas.md` for the exact schema the
|
|
@@ -96,6 +102,7 @@ Once all runs are done:
|
|
|
96
102
|
what to look for — non-discriminating assertions, high-variance evals, time/token tradeoffs.
|
|
97
103
|
|
|
98
104
|
4. **Launch the viewer**:
|
|
105
|
+
|
|
99
106
|
```bash
|
|
100
107
|
nohup python ${CLAUDE_SKILL_DIR}/eval-viewer/generate_review.py \
|
|
101
108
|
<workspace>/iteration-N \
|
|
@@ -104,6 +111,7 @@ Once all runs are done:
|
|
|
104
111
|
> /dev/null 2>&1 &
|
|
105
112
|
VIEWER_PID=$!
|
|
106
113
|
```
|
|
114
|
+
|
|
107
115
|
For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
|
|
108
116
|
|
|
109
117
|
**Headless/Cowork:** Use `--static <output_path>` to write standalone HTML instead of
|
|
@@ -115,6 +123,7 @@ Once all runs are done:
|
|
|
115
123
|
### What the user sees in the viewer
|
|
116
124
|
|
|
117
125
|
The "Outputs" tab shows one test case at a time:
|
|
126
|
+
|
|
118
127
|
- **Prompt**: the task that was given
|
|
119
128
|
- **Output**: the files the skill produced, rendered inline where possible
|
|
120
129
|
- **Previous Output** (iteration 2+): collapsed section showing last iteration's output
|
|
@@ -131,6 +140,7 @@ all feedback to `feedback.json`.
|
|
|
131
140
|
### Step E5: Read the feedback
|
|
132
141
|
|
|
133
142
|
When the user is done, read `feedback.json`:
|
|
143
|
+
|
|
134
144
|
```json
|
|
135
145
|
{
|
|
136
146
|
"reviews": [
|
|
@@ -140,6 +150,7 @@ When the user is done, read `feedback.json`:
|
|
|
140
150
|
"status": "complete"
|
|
141
151
|
}
|
|
142
152
|
```
|
|
153
|
+
|
|
143
154
|
Empty feedback means the user thought it was fine. Focus improvements on test cases with
|
|
144
155
|
specific complaints. Kill the viewer server when done: `kill $VIEWER_PID 2>/dev/null`.
|
|
145
156
|
|
|
@@ -172,6 +183,7 @@ After running test cases and collecting feedback, improve the skill based on wha
|
|
|
172
183
|
### The iteration loop
|
|
173
184
|
|
|
174
185
|
After improving the skill:
|
|
186
|
+
|
|
175
187
|
1. Apply improvements to the skill
|
|
176
188
|
2. Rerun all test cases into a new `iteration-<N+1>/` directory, including baselines
|
|
177
189
|
3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration
|
|
@@ -179,6 +191,7 @@ After improving the skill:
|
|
|
179
191
|
5. Read new feedback, improve again, repeat
|
|
180
192
|
|
|
181
193
|
Keep going until:
|
|
194
|
+
|
|
182
195
|
- The user says they're happy
|
|
183
196
|
- The feedback is all empty (everything looks good)
|
|
184
197
|
- You're not making meaningful progress
|
|
@@ -194,6 +207,7 @@ accuracy.
|
|
|
194
207
|
### Step D1: Generate trigger eval queries
|
|
195
208
|
|
|
196
209
|
Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON:
|
|
210
|
+
|
|
197
211
|
```json
|
|
198
212
|
[
|
|
199
213
|
{"query": "the user prompt", "should_trigger": true},
|
|
@@ -219,6 +233,7 @@ Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called s
|
|
|
219
233
|
### Step D2: Review with user
|
|
220
234
|
|
|
221
235
|
Present the eval set using the HTML template:
|
|
236
|
+
|
|
222
237
|
1. Read `${CLAUDE_SKILL_DIR}/assets/eval_review.html`
|
|
223
238
|
2. Replace placeholders: `__EVAL_DATA_PLACEHOLDER__` (JSON array, no quotes — it's a JS
|
|
224
239
|
variable assignment), `__SKILL_NAME_PLACEHOLDER__`, `__SKILL_DESCRIPTION_PLACEHOLDER__`
|
|
@@ -298,6 +313,7 @@ direct the user to the resulting `.skill` file path so they can install it.
|
|
|
298
313
|
### Claude.ai
|
|
299
314
|
|
|
300
315
|
The core workflow (draft -> test -> review -> improve) is the same, but without subagents:
|
|
316
|
+
|
|
301
317
|
- **Test cases**: Run them yourself one at a time. Skip baseline runs.
|
|
302
318
|
- **Review**: Present results directly in conversation. Save output files and tell the user
|
|
303
319
|
where they are.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
Sources: [AgentSkills.io spec](https://agentskills.io/specification) · [Anthropic docs](https://code.claude.com/docs/en/skills) · [anthropics/skills repo](https://github.com/anthropics/skills)
|
|
4
4
|
|
|
5
5
|
Comparison of our skill-creator implementation against:
|
|
6
|
+
|
|
6
7
|
- AgentSkills.io specification (canonical open standard)
|
|
7
8
|
- Anthropic best practices (platform.claude.com)
|
|
8
9
|
- anthropics/skills official skill-creator
|
|
@@ -73,6 +74,7 @@ Comparison of our skill-creator implementation against:
|
|
|
73
74
|
### For Existing Skills
|
|
74
75
|
|
|
75
76
|
Existing skills that pass our old validator will mostly pass the new one because:
|
|
77
|
+
|
|
76
78
|
- Enterprise tier (default) still checks `metadata.author`, `metadata.version`, scoped tools
|
|
77
79
|
- The body section checks are warnings, not errors
|
|
78
80
|
- "Use when" / "Trigger with" are now recommended patterns, not hard requirements
|
|
@@ -85,6 +87,7 @@ Existing skills that pass our old validator will mostly pass the new one because
|
|
|
85
87
|
### New Capabilities
|
|
86
88
|
|
|
87
89
|
Skills can now use:
|
|
90
|
+
|
|
88
91
|
- `$ARGUMENTS` for dynamic input
|
|
89
92
|
- `context: fork` for subagent execution
|
|
90
93
|
- `hooks` for lifecycle automation
|
|
@@ -10,6 +10,7 @@ Generate the SKILL.md using the template from `${CLAUDE_SKILL_DIR}/templates/ski
|
|
|
10
10
|
**Frontmatter rules** (see `${CLAUDE_SKILL_DIR}/references/frontmatter-spec.md`):
|
|
11
11
|
|
|
12
12
|
Required fields:
|
|
13
|
+
|
|
13
14
|
```yaml
|
|
14
15
|
name: {skill-name} # Must match directory name
|
|
15
16
|
description: | # Third person, what + when + keywords
|
|
@@ -18,10 +19,12 @@ description: | # Third person, what + when + keywords
|
|
|
18
19
|
```
|
|
19
20
|
|
|
20
21
|
**Frontmatter constraints (Anthropic spec):**
|
|
22
|
+
|
|
21
23
|
- `name`: No XML tags (`<`, `>` characters prohibited). No reserved words (`anthropic`, `claude`) in isolation.
|
|
22
24
|
- `description`: No XML tags. Description is injected into Claude's system prompt — third person prevents discovery issues where Claude speaks as the skill author.
|
|
23
25
|
|
|
24
26
|
Identity fields (top-level — marketplace validator scores these here):
|
|
27
|
+
|
|
25
28
|
```yaml
|
|
26
29
|
version: 1.0.0
|
|
27
30
|
author: {name} <{email}>
|
|
@@ -32,12 +35,14 @@ license: MIT
|
|
|
32
35
|
Do NOT nest them under `metadata:`. The marketplace 100-point validator checks them at top-level.
|
|
33
36
|
|
|
34
37
|
Recommended fields:
|
|
38
|
+
|
|
35
39
|
```yaml
|
|
36
40
|
allowed-tools: "{scoped tools}"
|
|
37
41
|
model: inherit
|
|
38
42
|
```
|
|
39
43
|
|
|
40
44
|
Optional Claude Code extensions:
|
|
45
|
+
|
|
41
46
|
```yaml
|
|
42
47
|
argument-hint: "[arg]" # If accepts $ARGUMENTS
|
|
43
48
|
context: fork # If needs isolated execution
|
|
@@ -71,6 +76,7 @@ Pattern (enterprise): "Use when [scenario]" (+3 pts) + "Trigger with [phrases]"
|
|
|
71
76
|
**Body content guidelines — section recommendations:**
|
|
72
77
|
|
|
73
78
|
Anthropic's spec places no format restrictions on body content. The sections below are enterprise-tier quality recommendations scored by the Intent Solutions marketplace rubric. At standard tier, these are not required but are still good practice:
|
|
79
|
+
|
|
74
80
|
```
|
|
75
81
|
## Overview (>50 chars content: +4 pts enterprise)
|
|
76
82
|
## Prerequisites (+2 pts enterprise)
|
|
@@ -83,10 +89,11 @@ Anthropic's spec places no format restrictions on body content. The sections bel
|
|
|
83
89
|
```
|
|
84
90
|
|
|
85
91
|
Additional guidelines:
|
|
92
|
+
|
|
86
93
|
- Keep under 500 lines (offload to `references/` if longer)
|
|
87
94
|
- Concise — Claude is smart, don't over-explain
|
|
88
95
|
- Concrete examples over abstract descriptions
|
|
89
|
-
- Reference supporting files with relative markdown links: `
|
|
96
|
+
- Reference supporting files with relative markdown links: `[details](references/details.md)` or `[API](references/api.md)` — Claude reads these on demand
|
|
90
97
|
- Use `${CLAUDE_SKILL_DIR}/` in DCI/bash contexts only: exclamation + backtick-wrapped command, e.g. `cat ${CLAUDE_SKILL_DIR}/references/config.md`
|
|
91
98
|
- Sections >20 lines (Output, Error Handling, Examples) → offload to `references/` with relative links
|
|
92
99
|
- If skill has 3+ distinct user operations → split into individual `commands/*.md` files
|
|
@@ -100,6 +107,7 @@ Additional guidelines:
|
|
|
100
107
|
- **No surprise behavior**: Skills must not contain malware, exploit code, or content that could compromise security. If a skill's behavior is described honestly, nothing it does should surprise the user.
|
|
101
108
|
|
|
102
109
|
**String substitutions available:**
|
|
110
|
+
|
|
103
111
|
- `$ARGUMENTS` / `$0`, `$1` - user-provided arguments (pair with `argument-hint` frontmatter)
|
|
104
112
|
- `${CLAUDE_SESSION_ID}` - current session ID
|
|
105
113
|
- `` !`command` `` syntax — dynamic context injection (Anthropic spec feature):
|
|
@@ -111,6 +119,7 @@ Additional guidelines:
|
|
|
111
119
|
## Step 5: Create Supporting Files
|
|
112
120
|
|
|
113
121
|
**Scripts** (`scripts/`):
|
|
122
|
+
|
|
114
123
|
- Scripts should solve problems, not punt to Claude
|
|
115
124
|
- Explicit error handling
|
|
116
125
|
- No voodoo constants (document all magic values)
|
|
@@ -118,16 +127,19 @@ Additional guidelines:
|
|
|
118
127
|
- Make executable: `chmod +x scripts/*.py`
|
|
119
128
|
|
|
120
129
|
**References** (`references/`):
|
|
130
|
+
|
|
121
131
|
- Heavy documentation that doesn't need to load at activation
|
|
122
132
|
- Use clear section headers for navigability
|
|
123
133
|
- For reference files >100 lines, include a TOC at the top so Claude can see full scope even with partial reads
|
|
124
134
|
- One-level-deep references only (no `references/sub/dir/`)
|
|
125
135
|
|
|
126
136
|
**Templates** (`templates/`):
|
|
137
|
+
|
|
127
138
|
- Boilerplate files used for generation
|
|
128
139
|
- Use clear placeholder syntax (`{{PLACEHOLDER}}`)
|
|
129
140
|
|
|
130
141
|
**Assets** (`assets/`):
|
|
142
|
+
|
|
131
143
|
- Static resources (images, configs, data files)
|
|
132
144
|
|
|
133
145
|
## Step 6: Validate
|
|
@@ -142,6 +154,7 @@ python3 ${CLAUDE_SKILL_DIR}/scripts/validate-skill.py --grade {skill-dir}/SKILL.
|
|
|
142
154
|
Standard tier is the default (no required fields, broad compatibility). Use `--enterprise` for full 100-point marketplace grading.
|
|
143
155
|
|
|
144
156
|
**Validation checks:**
|
|
157
|
+
|
|
145
158
|
- Frontmatter: required fields, types, constraints
|
|
146
159
|
- Description: third person, what + when, keywords, length
|
|
147
160
|
- Body: under 500 lines, no absolute paths, has instructions + examples
|
|
@@ -151,6 +164,7 @@ Standard tier is the default (no required fields, broad compatibility). Use `--e
|
|
|
151
164
|
- Progressive disclosure: appropriate use of references/
|
|
152
165
|
|
|
153
166
|
**If validation fails:** fix issues and re-run. Common fixes:
|
|
167
|
+
|
|
154
168
|
- Scope Bash tools: `Bash(git:*)` not `Bash`
|
|
155
169
|
- Remove absolute paths, use `${CLAUDE_SKILL_DIR}/`
|
|
156
170
|
- Split long SKILL.md into references
|
|
@@ -172,6 +186,7 @@ Create `evals/evals.json` with minimum 3 scenarios: happy path, edge case, negat
|
|
|
172
186
|
Run parallel evaluation: Claude A with skill installed vs Claude B without. Compare outputs against assertions — the skill should produce meaningfully better results for its target use cases.
|
|
173
187
|
|
|
174
188
|
**Additional testing practices:**
|
|
189
|
+
|
|
175
190
|
- **Team feedback**: If applicable, share the skill with teammates and observe usage patterns
|
|
176
191
|
- **Observe Claude navigation**: Watch how Claude reads and navigates the skill — look for unexpected exploration paths, missed references, or overreliance on certain sections
|
|
177
192
|
|
|
@@ -197,6 +212,7 @@ Tips: front-load distinctive keywords, include specific file types/tools/domains
|
|
|
197
212
|
## Step 10: Report
|
|
198
213
|
|
|
199
214
|
Show the user:
|
|
215
|
+
|
|
200
216
|
```
|
|
201
217
|
SKILL CREATED
|
|
202
218
|
====================================
|
|
@@ -235,6 +251,7 @@ When the user wants to validate, grade, or audit an existing skill:
|
|
|
235
251
|
### Step V1: Locate the Skill
|
|
236
252
|
|
|
237
253
|
Ask for the SKILL.md path or detect from context. Common locations:
|
|
254
|
+
|
|
238
255
|
- `~/.claude/skills/{name}/SKILL.md` (global)
|
|
239
256
|
- `.claude/skills/{name}/SKILL.md` (project)
|
|
240
257
|
|
|
@@ -268,6 +285,7 @@ Present the grade report with specific fix recommendations. Prioritize fixes by
|
|
|
268
285
|
### Step V5: Auto-Fix (if requested)
|
|
269
286
|
|
|
270
287
|
If the user says "fix it" or "auto-fix", apply the suggested improvements:
|
|
288
|
+
|
|
271
289
|
1. Add missing sections (Overview, Prerequisites, Output)
|
|
272
290
|
2. Add "Use when" / "Trigger with" to description
|
|
273
291
|
3. Move author/version from metadata to top-level
|
|
@@ -301,5 +319,6 @@ For A/B testing between skill versions, read `${CLAUDE_SKILL_DIR}/agents/compara
|
|
|
301
319
|
## Platform-Specific Notes
|
|
302
320
|
|
|
303
321
|
See `${CLAUDE_SKILL_DIR}/references/advanced-eval-workflow.md` (section "Platform-Specific Notes").
|
|
322
|
+
|
|
304
323
|
- **Claude.ai**: No subagents — run tests yourself, skip benchmarking/description optimization.
|
|
305
324
|
- **Cowork**: Full subagent workflow. Use `--static` for eval viewer. Generate viewer BEFORE self-evaluation.
|
|
@@ -22,6 +22,7 @@ Standard troubleshooting reference for all marketplace skills. Every references/
|
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
Categories should match the skill's domain. Examples:
|
|
25
|
+
|
|
25
26
|
- For GCP skills: Authentication Errors, API Errors, Deployment Errors, Quota Errors
|
|
26
27
|
- For database skills: Connection Errors, Query Errors, Permission Errors, Data Errors
|
|
27
28
|
- For CI/CD skills: Build Errors, Deploy Errors, Configuration Errors, Permission Errors
|
|
@@ -34,6 +34,7 @@ Standard usage examples reference for all marketplace skills. Every references/e
|
|
|
34
34
|
```
|
|
35
35
|
|
|
36
36
|
Rules:
|
|
37
|
+
|
|
37
38
|
- Minimum 3 examples, maximum 6
|
|
38
39
|
- Each example has: scenario title, user request, what's produced, code/config
|
|
39
40
|
- Code must be real and runnable — no pseudocode, no "add your logic here"
|
|
@@ -37,6 +37,7 @@ Standard implementation guide for all marketplace skills. Every references/imple
|
|
|
37
37
|
```
|
|
38
38
|
|
|
39
39
|
Rules:
|
|
40
|
+
|
|
40
41
|
- Write from the perspective of how this specific skill works, not generic patterns
|
|
41
42
|
- Include real commands, real API calls, real config values
|
|
42
43
|
- Keep under 200 lines — this is a reference, not a textbook
|
|
@@ -29,6 +29,7 @@ Generate output using this exact template:
|
|
|
29
29
|
```
|
|
30
30
|
|
|
31
31
|
Replace placeholders with gathered values. Do not add extra fields.
|
|
32
|
+
|
|
32
33
|
```
|
|
33
34
|
|
|
34
35
|
### Flexible Template (Medium Degrees of Freedom)
|
|
@@ -69,20 +70,24 @@ Provide input/output pairs that demonstrate expected behavior.
|
|
|
69
70
|
**Input**: `/skill-name auth.py`
|
|
70
71
|
**Output**:
|
|
71
72
|
```
|
|
73
|
+
|
|
72
74
|
auth.py: 3 issues found
|
|
73
75
|
Line 15: SQL injection risk in query builder
|
|
74
76
|
Line 42: Hardcoded credential detected
|
|
75
77
|
Line 89: Missing input validation
|
|
78
|
+
|
|
76
79
|
```
|
|
77
80
|
|
|
78
81
|
### Complex case
|
|
79
82
|
**Input**: `/skill-name --deep src/`
|
|
80
83
|
**Output**:
|
|
81
84
|
```
|
|
85
|
+
|
|
82
86
|
Deep scan: 12 files, 7 issues
|
|
83
87
|
CRITICAL (2): sql-injection, hardcoded-secret
|
|
84
88
|
WARNING (3): missing-validation, weak-hash, cors-wildcard
|
|
85
89
|
INFO (2): deprecated-api, unused-import
|
|
90
|
+
|
|
86
91
|
```
|
|
87
92
|
```
|
|
88
93
|
|
|
@@ -135,6 +140,7 @@ Example HTML structure:
|
|
|
135
140
|
</body>
|
|
136
141
|
</html>
|
|
137
142
|
```
|
|
143
|
+
|
|
138
144
|
```
|
|
139
145
|
|
|
140
146
|
### When to Use
|
|
@@ -177,6 +183,7 @@ Results are written as JSON to `{output_path}/results.json`:
|
|
|
177
183
|
```
|
|
178
184
|
|
|
179
185
|
Additionally, a human-readable summary is printed to the conversation.
|
|
186
|
+
|
|
180
187
|
```
|
|
181
188
|
|
|
182
189
|
---
|
|
@@ -27,6 +27,7 @@ Defines the evals for a skill. Located at `evals/evals.json` within the skill di
|
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
**Fields:**
|
|
30
|
+
|
|
30
31
|
- `skill_name`: Name matching the skill's frontmatter
|
|
31
32
|
- `evals[].id`: Unique integer identifier
|
|
32
33
|
- `evals[].prompt`: The task to execute
|
|
@@ -72,6 +73,7 @@ Tracks version progression in Improve mode. Located at workspace root.
|
|
|
72
73
|
```
|
|
73
74
|
|
|
74
75
|
**Fields:**
|
|
76
|
+
|
|
75
77
|
- `started_at`: ISO timestamp of when improvement started
|
|
76
78
|
- `skill_name`: Name of the skill being improved
|
|
77
79
|
- `current_best`: Version identifier of the best performer
|
|
@@ -150,6 +152,7 @@ Output from the grader agent. Located at `<run-dir>/grading.json`.
|
|
|
150
152
|
```
|
|
151
153
|
|
|
152
154
|
**Fields:**
|
|
155
|
+
|
|
153
156
|
- `expectations[]`: Graded expectations with evidence
|
|
154
157
|
- `summary`: Aggregate pass/fail counts
|
|
155
158
|
- `execution_metrics`: Tool usage and output size (from executor's metrics.json)
|
|
@@ -184,6 +187,7 @@ Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
|
|
|
184
187
|
```
|
|
185
188
|
|
|
186
189
|
**Fields:**
|
|
190
|
+
|
|
187
191
|
- `tool_calls`: Count per tool type
|
|
188
192
|
- `total_tool_calls`: Sum of all tool calls
|
|
189
193
|
- `total_steps`: Number of major execution steps
|
|
@@ -286,6 +290,7 @@ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
|
|
|
286
290
|
```
|
|
287
291
|
|
|
288
292
|
**Fields:**
|
|
293
|
+
|
|
289
294
|
- `metadata`: Information about the benchmark run
|
|
290
295
|
- `skill_name`: Name of the skill
|
|
291
296
|
- `timestamp`: When the benchmark was run
|