@intentsolutionsio/skill-creator 5.0.0 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +1 -1
  2. package/scripts/validate-skill.py +45 -22
  3. package/skills/agent-creator/SKILL.md +40 -14
  4. package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
  5. package/skills/skill-creator/SKILL.md +34 -9
  6. package/skills/skill-creator/agents/analyzer.md +11 -0
  7. package/skills/skill-creator/agents/comparator.md +3 -0
  8. package/skills/skill-creator/agents/grader.md +4 -0
  9. package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
  10. package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
  11. package/skills/skill-creator/references/anthropic-comparison.md +3 -0
  12. package/skills/skill-creator/references/creation-guide.md +20 -1
  13. package/skills/skill-creator/references/errors-template.md +1 -0
  14. package/skills/skill-creator/references/examples-template.md +1 -0
  15. package/skills/skill-creator/references/frontmatter-spec.md +1 -0
  16. package/skills/skill-creator/references/implementation-template.md +1 -0
  17. package/skills/skill-creator/references/output-patterns.md +7 -0
  18. package/skills/skill-creator/references/schemas.md +5 -0
  19. package/skills/skill-creator/references/source-of-truth.md +40 -2
  20. package/skills/skill-creator/references/validation-rules.md +19 -1
  21. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
  22. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-312.pyc +0 -0
  23. package/skills/skill-creator/scripts/__pycache__/utils.cpython-312.pyc +0 -0
  24. package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
  25. package/skills/skill-creator/scripts/generate_report.py +29 -17
  26. package/skills/skill-creator/scripts/improve_description.py +18 -21
  27. package/skills/skill-creator/scripts/package_skill.py +2 -2
  28. package/skills/skill-creator/scripts/quick_validate.py +16 -15
  29. package/skills/skill-creator/scripts/run_eval.py +14 -10
  30. package/skills/skill-creator/scripts/run_loop.py +51 -31
  31. package/skills/skill-creator/scripts/utils.py +5 -4
  32. package/skills/skill-creator/templates/agent-template.md +3 -0
  33. package/skills/skill-creator/templates/skill-template.md +4 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@intentsolutionsio/skill-creator",
3
- "version": "5.0.0",
3
+ "version": "5.0.3",
4
4
  "description": "Create and validate production-grade agent skills with 100-point marketplace grading",
5
5
  "keywords": [
6
6
  "skill-creation",
@@ -32,19 +32,41 @@ except ImportError:
32
32
  # === CONSTANTS ===
33
33
 
34
34
  VALID_TOOLS = {
35
- "Read", "Write", "Edit", "Bash", "Glob", "Grep",
36
- "WebFetch", "WebSearch", "Task", "NotebookEdit",
37
- "AskUserQuestion", "Skill",
35
+ "Read",
36
+ "Write",
37
+ "Edit",
38
+ "Bash",
39
+ "Glob",
40
+ "Grep",
41
+ "WebFetch",
42
+ "WebSearch",
43
+ "Task",
44
+ "NotebookEdit",
45
+ "AskUserQuestion",
46
+ "Skill",
38
47
  }
39
48
 
40
49
  KNOWN_FRONTMATTER_FIELDS = {
41
50
  # AgentSkills.io spec
42
- "name", "description", "license", "compatibility", "metadata", "allowed-tools",
51
+ "name",
52
+ "description",
53
+ "license",
54
+ "compatibility",
55
+ "metadata",
56
+ "allowed-tools",
43
57
  # Top-level identity fields (marketplace standard)
44
- "version", "author", "compatible-with", "tags",
58
+ "version",
59
+ "author",
60
+ "compatible-with",
61
+ "tags",
45
62
  # Claude Code extensions
46
- "argument-hint", "disable-model-invocation", "user-invocable", "model",
47
- "context", "agent", "hooks",
63
+ "argument-hint",
64
+ "disable-model-invocation",
65
+ "user-invocable",
66
+ "model",
67
+ "context",
68
+ "agent",
69
+ "hooks",
48
70
  }
49
71
 
50
72
  DEPRECATED_FIELDS = {
@@ -53,7 +75,13 @@ DEPRECATED_FIELDS = {
53
75
  }
54
76
 
55
77
  VALID_PLATFORMS = {
56
- "claude-code", "codex", "openclaw", "aider", "continue", "cursor", "windsurf",
78
+ "claude-code",
79
+ "codex",
80
+ "openclaw",
81
+ "aider",
82
+ "continue",
83
+ "cursor",
84
+ "windsurf",
57
85
  }
58
86
 
59
87
  RE_FRONTMATTER = re.compile(r"^---\s*\n(.*?)\n---\s*\n(.*)$", re.DOTALL)
@@ -69,7 +97,10 @@ RE_XML_TAG = re.compile(r"[<>]")
69
97
  RE_TIME_SENSITIVE = [
70
98
  re.compile(r"\b(20\d{2}[-/]\d{2}[-/]\d{2})\b"), # dates like 2025-01-01
71
99
  re.compile(r"\b(v\d+\.\d+\.\d+)\b", re.IGNORECASE), # version numbers like v1.2.3
72
- re.compile(r"\b(as of|since|after|before) (January|February|March|April|May|June|July|August|September|October|November|December)\b", re.IGNORECASE),
100
+ re.compile(
101
+ r"\b(as of|since|after|before) (January|February|March|April|May|June|July|August|September|October|November|December)\b",
102
+ re.IGNORECASE,
103
+ ),
73
104
  ]
74
105
 
75
106
  ABSOLUTE_PATH_PATTERNS = [
@@ -285,9 +316,7 @@ def validate_optional_fields(fm: dict) -> Tuple[List[str], List[str]]:
285
316
  if isinstance(metadata, dict):
286
317
  for field in ("author", "version", "license", "tags"):
287
318
  if field in metadata and field not in fm:
288
- warnings.append(
289
- f"'{field}' found in metadata block - move to top-level for marketplace scoring"
290
- )
319
+ warnings.append(f"'{field}' found in metadata block - move to top-level for marketplace scoring")
291
320
 
292
321
  # Unknown fields
293
322
  all_known = KNOWN_FRONTMATTER_FIELDS | DEPRECATED_FIELDS.keys()
@@ -369,10 +398,7 @@ def validate_body(body: str, path: Path, enterprise: bool) -> Tuple[List[str], L
369
398
  )
370
399
  if instr_match:
371
400
  instr = instr_match.group(1)
372
- has_steps = (
373
- re.search(r"(?m)^\s*\d+\.\s+", instr)
374
- or re.search(r"(?mi)^\s*#{2,6}\s*step\s*\d+", instr)
375
- )
401
+ has_steps = re.search(r"(?m)^\s*\d+\.\s+", instr) or re.search(r"(?mi)^\s*#{2,6}\s*step\s*\d+", instr)
376
402
  if not has_steps:
377
403
  warnings.append("Instructions should have numbered steps or ### Step N headings")
378
404
 
@@ -386,8 +412,7 @@ def validate_body(body: str, path: Path, enterprise: bool) -> Tuple[List[str], L
386
412
  refs_dir = skill_dir / "references"
387
413
  if line_count > 300 and not refs_dir.exists():
388
414
  warnings.append(
389
- f"SKILL.md is {line_count} lines with no references/ directory - "
390
- "consider splitting heavy content"
415
+ f"SKILL.md is {line_count} lines with no references/ directory - consider splitting heavy content"
391
416
  )
392
417
 
393
418
  if "{baseDir}/../" in body:
@@ -1003,7 +1028,7 @@ def print_result(result: Dict[str, Any], path: Path) -> None:
1003
1028
  print(f" - {i}")
1004
1029
  print()
1005
1030
 
1006
- print(f"Stats:")
1031
+ print("Stats:")
1007
1032
  print(f" Words: {stats['word_count']}")
1008
1033
  print(f" Lines: {stats['line_count']}")
1009
1034
  print(f" Tokens (est.): {stats['token_estimate']}")
@@ -1056,9 +1081,7 @@ def print_grade(grade_result: Dict[str, Any], path: Path) -> None:
1056
1081
 
1057
1082
 
1058
1083
  def main():
1059
- parser = argparse.ArgumentParser(
1060
- description="Validate SKILL.md files (Standard tier by default)"
1061
- )
1084
+ parser = argparse.ArgumentParser(description="Validate SKILL.md files (Standard tier by default)")
1062
1085
  parser.add_argument("path", help="Path to SKILL.md file")
1063
1086
  parser.add_argument(
1064
1087
  "--standard",
@@ -1,20 +1,33 @@
1
1
  ---
2
2
  name: agent-creator
3
- description: |
4
- Create production-grade agent .md files aligned with the Anthropic 2026 spec (16-field schema).
5
- Also validates existing agents against the marketplace compliance rules. Use when building custom
6
- subagents, reviewing agent quality, or creating parallel agent architectures for orchestrator skills.
7
- Trigger with "/agent-creator", "create an agent", "build a subagent", or "validate my agent".
8
- Make sure to use this skill whenever creating agents/*.md files for plugins or standalone use.
9
- allowed-tools: "Read,Write,Edit,Glob,Grep,Bash(python:*),AskUserQuestion"
3
+ description: 'Create production-grade agent .md files aligned with the Anthropic 2026
4
+ spec (16-field schema).
5
+
6
+ Also validates existing agents against the marketplace compliance rules. Use when
7
+ building custom
8
+
9
+ subagents, reviewing agent quality, or creating parallel agent architectures for
10
+ orchestrator skills.
11
+
12
+ Trigger with "/agent-creator", "create an agent", "build a subagent", or "validate
13
+ my agent".
14
+
15
+ Make sure to use this skill whenever creating agents/*.md files for plugins or standalone
16
+ use.
17
+
18
+ '
19
+ allowed-tools: Read,Write,Edit,Glob,Grep,Bash(python:*),AskUserQuestion
10
20
  version: 1.0.0
11
21
  author: Jeremy Longshore <jeremy@intentsolutions.io>
12
22
  license: MIT
13
- compatible-with: claude-code, codex, openclaw
14
- tags: [agent-creation, validation, meta-tooling, subagents]
23
+ tags:
24
+ - agent-creation
25
+ - validation
26
+ - meta-tooling
27
+ - subagents
15
28
  model: inherit
29
+ compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
16
30
  ---
17
-
18
31
  # Agent Creator
19
32
 
20
33
  Creates spec-compliant agent .md files following the Anthropic 2026 16-field schema. Supports
@@ -45,6 +58,7 @@ that drives the subagent — it does NOT receive the full Claude Code system pro
45
58
  ### Mode Detection
46
59
 
47
60
  Determine user intent from their prompt:
61
+
48
62
  - **Create mode**: "create an agent", "build a subagent", "new agent" -> Step 1
49
63
  - **Validate mode**: "validate agent", "check agent", "grade agent" -> Validation Workflow
50
64
 
@@ -53,21 +67,25 @@ Determine user intent from their prompt:
53
67
  Ask the user with AskUserQuestion:
54
68
 
55
69
  **Agent Identity:**
70
+
56
71
  - Name (kebab-case, 1-64 chars, e.g., `risk-assessor`, `clause-analyzer`)
57
72
  - Specialty description (20-200 chars — shown in agent selection UI)
58
73
 
59
74
  **Execution Context:**
75
+
60
76
  - Plugin agent (`plugins/*/agents/`) or standalone (`~/.claude/agents/`)?
61
77
  - Will it be spawned by an orchestrator skill via `Task` tool?
62
78
  - Does it need to preload specific skills? (`skills: [skill-name]`)
63
79
 
64
80
  **Behavioral Controls:**
81
+
65
82
  - Model override? (`sonnet` for speed, `opus` for quality, `inherit` for default)
66
83
  - Reasoning effort? (`low` for simple, `medium` default, `high` for complex analysis)
67
84
  - Max iterations? (`maxTurns` — how many tool-use loops before stopping)
68
85
  - Tools to deny? (`disallowedTools` — denylist approach, opposite of skills)
69
86
 
70
87
  **Plugin Restrictions (if plugin agent):**
88
+
71
89
  - `hooks` — NOT supported in plugin agents (use plugin-level hooks)
72
90
  - `mcpServers` — NOT supported in plugin agents
73
91
  - `permissionMode` — standalone only, NOT plugin agents
@@ -78,6 +96,7 @@ Before writing, determine:
78
96
 
79
97
  **Agent Role Clarity:**
80
98
  The agent body must make three things unambiguous:
99
+
81
100
  1. **What it IS responsible for** — its specific domain/methodology
82
101
  2. **What it is NOT responsible for** — boundaries with other agents
83
102
  3. **How it communicates results** — output format and structure
@@ -99,6 +118,7 @@ All production agents should follow this body structure:
99
118
  | `## Examples` | Concrete interaction examples | For complex agents |
100
119
 
101
120
  **Output Structure Decision:**
121
+
102
122
  - If the agent feeds into an orchestrator: use **JSON output** (machine-parseable)
103
123
  - If the agent is user-facing: use **markdown output** (human-readable)
104
124
  - If the agent produces both: JSON primary with markdown summary
@@ -113,12 +133,14 @@ Generate the agent .md using the template from
113
133
  See [Anthropic Agent Spec](references/anthropic-agent-spec.md) for the full official reference.
114
134
 
115
135
  Required fields:
136
+
116
137
  ```yaml
117
138
  name: {agent-name} # Lowercase letters and hyphens, unique identifier
118
139
  description: "{specialty}" # When Claude should delegate to this subagent
119
140
  ```
120
141
 
121
142
  Optional fields (include only what's needed):
143
+
122
144
  ```yaml
123
145
  tools: "Read, Glob, Grep" # Allowlist — inherits all tools if omitted
124
146
  disallowedTools: "Write" # Denylist — removed from inherited/specified list
@@ -137,12 +159,14 @@ mcpServers: {} # Standalone only, NOT plugin agents
137
159
  ```
138
160
 
139
161
  **Tool access:**
162
+
140
163
  - `tools` = allowlist (like skills' `allowed-tools`)
141
164
  - `disallowedTools` = denylist (remove specific tools)
142
165
  - If both are set: `disallowedTools` is applied first, then `tools` is resolved against the remaining set
143
166
  - If neither set: inherits all tools from parent conversation
144
167
 
145
168
  **Invalid fields (ERROR — never use these):**
169
+
146
170
  - `capabilities` — looks valid but flagged by validator
147
171
  - `expertise_level` — invented, not in Anthropic spec
148
172
  - `activation_priority` — invented, not in Anthropic spec
@@ -191,6 +215,7 @@ Run validation against the Anthropic 16-field schema:
191
215
  | Body under 300 lines | Offload to references if longer (prevents context bloat) |
192
216
 
193
217
  **Automated validation:**
218
+
194
219
  ```bash
195
220
  python3 ${CLAUDE_SKILL_DIR}/../skill-creator/scripts/validate-skill.py --agents-only {plugin-dir}/
196
221
  ```
@@ -209,6 +234,7 @@ Test the agent by spawning it via the `Task` tool or the `Agent` tool:
209
234
  ### Step 6: Report
210
235
 
211
236
  Provide a summary:
237
+
212
238
  - Agent name and file path
213
239
  - Frontmatter field count (of 16 possible)
214
240
  - Body line count
@@ -299,7 +325,7 @@ actionable).
299
325
  ## Resources
300
326
 
301
327
  - [Anthropic Agent Spec](references/anthropic-agent-spec.md) — Official 16-field schema from code.claude.com/docs/en/sub-agents
302
- - [Agent template](${CLAUDE_SKILL_DIR}/../skill-creator/templates/agent-template.md) — Skeleton with placeholders
303
- - [Frontmatter spec](${CLAUDE_SKILL_DIR}/../skill-creator/references/frontmatter-spec.md) — Field reference (internal)
304
- - [Source of truth](${CLAUDE_SKILL_DIR}/../skill-creator/references/source-of-truth.md) — Canonical spec
305
- - [Validation rules](${CLAUDE_SKILL_DIR}/../skill-creator/references/validation-rules.md) — Agent validation section
328
+ - Agent template — Skeleton with placeholders
329
+ - Frontmatter spec — Field reference (internal)
330
+ - Source of truth — Canonical spec
331
+ - Validation rules — Agent validation section
@@ -39,6 +39,7 @@ Total: 16 official fields.
39
39
  ## Plugin Agent Restrictions
40
40
 
41
41
  Plugin agents (`plugins/*/agents/*.md`) do NOT support:
42
+
42
43
  - `hooks` — ignored when loading from plugin
43
44
  - `mcpServers` — ignored when loading from plugin
44
45
  - `permissionMode` — ignored when loading from plugin
@@ -1,20 +1,31 @@
1
1
  ---
2
2
  name: skill-creator
3
- description: |
4
- Create production-grade agent skills aligned with the 2026 AgentSkills.io spec and Anthropic
5
- best practices (2026). Also validates existing skills against the Intent Solutions 100-point rubric.
3
+ description: 'Create production-grade agent skills aligned with the 2026 AgentSkills.io
4
+ spec and Anthropic
5
+
6
+ best practices (2026). Also validates existing skills against the Intent Solutions
7
+ 100-point rubric.
8
+
6
9
  Use when building, testing, validating, or optimizing Claude Code skills.
7
- Trigger with "/skill-creator", "create a skill", "validate my skill", or "check skill quality".
8
- Make sure to use this skill whenever creating a new skill, slash command, or agent capability.
9
- allowed-tools: "Read,Write,Edit,Glob,Grep,Bash(mkdir:*),Bash(chmod:*),Bash(python:*),Bash(claude:*),Task,AskUserQuestion"
10
+
11
+ Trigger with "/skill-creator", "create a skill", "validate my skill", or "check
12
+ skill quality".
13
+
14
+ Make sure to use this skill whenever creating a new skill, slash command, or agent
15
+ capability.
16
+
17
+ '
18
+ allowed-tools: Read,Write,Edit,Glob,Grep,Bash(mkdir:*),Bash(chmod:*),Bash(python:*),Bash(claude:*),Task,AskUserQuestion
10
19
  version: 5.1.0
11
20
  author: Jeremy Longshore <jeremy@intentsolutions.io>
12
21
  license: MIT
13
- compatible-with: claude-code, codex, openclaw
14
- tags: [skill-creation, validation, meta-tooling]
22
+ tags:
23
+ - skill-creation
24
+ - validation
25
+ - meta-tooling
15
26
  model: inherit
27
+ compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
16
28
  ---
17
-
18
29
  # Skill Creator
19
30
 
20
31
  Creates complete, spec-compliant skill packages following AgentSkills.io and Anthropic standards.
@@ -39,12 +50,14 @@ scratch with full validation, or grade/audit existing skills with actionable fix
39
50
  ### Mode Detection
40
51
 
41
52
  Determine user intent from their prompt:
53
+
42
54
  - **Create mode**: "create a skill", "build a skill", "new skill" -> proceed to Step 1
43
55
  - **Validate mode**: "validate", "check", "grade", "score", "audit" -> jump to Validation Workflow
44
56
 
45
57
  ### Communicating with the User
46
58
 
47
59
  Pay attention to context cues to understand the user's technical level. Skill creator is used by people across a wide range of familiarity — from first-time coders to senior engineers. In the default case:
60
+
48
61
  - "evaluation" and "benchmark" are borderline but OK
49
62
  - For "JSON" and "assertion", check for cues the user knows these terms before using them without explanation
50
63
  - Briefly explain terms if in doubt
@@ -56,21 +69,25 @@ If the current conversation already contains a workflow the user wants to captur
56
69
  Ask the user with AskUserQuestion:
57
70
 
58
71
  **Skill Identity:**
72
+
59
73
  - Name (kebab-case, gerund preferred: `processing-pdfs`, `analyzing-data`)
60
74
  - Purpose (1-2 sentences: what it does + when to use it)
61
75
 
62
76
  **Execution Model:**
77
+
63
78
  - User-invocable via `/name`? Or background knowledge only?
64
79
  - Accepts arguments? (`$ARGUMENTS` substitution)
65
80
  - Needs isolated context? (`context: fork` for subagent execution)
66
81
  - Explicit-only invocation? (`disable-model-invocation: true` — prevents auto-activation, requires `/name`)
67
82
 
68
83
  **Required Tools:**
84
+
69
85
  - Read, Write, Edit, Glob, Grep, WebFetch, WebSearch, Task, AskUserQuestion, Skill
70
86
  - Bash must be scoped: `Bash(git:*)`, `Bash(npm:*)`, etc.
71
87
  - MCP tools: `ServerName:tool_name`
72
88
 
73
89
  **Complexity:**
90
+
74
91
  - Simple (SKILL.md only)
75
92
  - With scripts (automation code in `scripts/`)
76
93
  - With references (documentation in `references/`)
@@ -78,6 +95,7 @@ Ask the user with AskUserQuestion:
78
95
  - Full package (all directories)
79
96
 
80
97
  **Location:**
98
+
81
99
  - Global: `~/.claude/skills/<skill-name>/`
82
100
  - Project: `.claude/skills/<skill-name>/`
83
101
 
@@ -86,6 +104,7 @@ Ask the user with AskUserQuestion:
86
104
  Before writing, determine:
87
105
 
88
106
  **Degrees of Freedom:**
107
+
89
108
  | Level | When to Use |
90
109
  |-------|-------------|
91
110
  | High | Creative/open-ended tasks (analysis, writing) |
@@ -95,6 +114,7 @@ Before writing, determine:
95
114
  Think of it as **narrow bridge vs open field**: a deployment skill is a narrow bridge (one safe path, guard rails everywhere), while a writing skill is an open field (Claude roams freely within broad boundaries). Match constraint level to the task.
96
115
 
97
116
  **Workflow Pattern** (see `${CLAUDE_SKILL_DIR}/references/workflows.md`):
117
+
98
118
  - Sequential: fixed steps in order
99
119
  - Conditional: branch based on input
100
120
  - Wizard: interactive multi-step gathering
@@ -104,6 +124,7 @@ Think of it as **narrow bridge vs open field**: a deployment skill is a narrow b
104
124
  - Search-Analyze-Report: explore and summarize
105
125
 
106
126
  **Output Pattern** (see `${CLAUDE_SKILL_DIR}/references/output-patterns.md`):
127
+
107
128
  - Strict template (exact format)
108
129
  - Flexible template (structure with creative content)
109
130
  - Examples-driven (input/output pairs)
@@ -128,6 +149,7 @@ mkdir -p {location}/{skill-name}/evals # for eval-driven development
128
149
  For detailed guidance on writing SKILL.md (frontmatter rules, description scoring, body guidelines, string substitutions, DCI syntax), creating supporting files, validation, testing, iteration, description optimization, and final reporting, see [Creation Guide](references/creation-guide.md).
129
150
 
130
151
  Key rules:
152
+
131
153
  - `version`, `author`, `license`, `tags`, `compatible-with` are TOP-LEVEL fields (not nested under `metadata:`)
132
154
  - Scope Bash: `Bash(git:*)` not bare `Bash`
133
155
  - Keep under 500 lines; offload to `references/` if longer
@@ -244,15 +266,18 @@ Output:
244
266
  ## Resources
245
267
 
246
268
  **References:** `${CLAUDE_SKILL_DIR}/references/`
269
+
247
270
  - `creation-guide.md` — Detailed Steps 4-10 and Validation Workflow (V1-V5)
248
271
  - `source-of-truth.md` — Canonical spec ([AgentSkills.io](https://agentskills.io/specification), [Anthropic docs](https://code.claude.com/docs/en/skills), [Lee Han Chung deep dive](https://leehanchung.github.io/blogs/2025/10/26/claude-skills-deep-dive/)) | `frontmatter-spec.md` — Field reference | `validation-rules.md` — 100-point rubric
249
272
  - `workflows.md` — Workflow patterns | `output-patterns.md` — Output formats | `schemas.md` — JSON schemas (evals, grading, benchmarks)
250
273
  - `anthropic-comparison.md` — Gap analysis | `advanced-eval-workflow.md` — Eval, iteration, optimization, platform notes
251
274
 
252
275
  **Agents** (read when spawning subagents): `${CLAUDE_SKILL_DIR}/agents/`
276
+
253
277
  - `grader.md` — Assertion evaluation | `comparator.md` — Blind A/B comparison | `analyzer.md` — Benchmark analysis
254
278
 
255
279
  **Scripts:** `${CLAUDE_SKILL_DIR}/scripts/`
280
+
256
281
  - `validate-skill.py` — 100-point rubric grading | `quick_validate.py` — Lightweight validation
257
282
  - `aggregate_benchmark.py` — Benchmark stats | `run_eval.py` — Trigger accuracy testing
258
283
  - `run_loop.py` — Description optimization loop | `improve_description.py` — LLM-powered rewriting
@@ -54,6 +54,7 @@ You receive these parameters in your prompt:
54
54
  ### Step 4: Analyze Instruction Following
55
55
 
56
56
  For each transcript, evaluate:
57
+
57
58
  - Did the agent follow the skill's explicit instructions?
58
59
  - Did the agent use the skill's provided tools/scripts?
59
60
  - Were there missed opportunities to leverage skill content?
@@ -64,6 +65,7 @@ Score instruction following 1-10 and note specific issues.
64
65
  ### Step 5: Identify Winner Strengths
65
66
 
66
67
  Determine what made the winner better:
68
+
67
69
  - Clearer instructions that led to better behavior?
68
70
  - Better scripts/tools that produced better output?
69
71
  - More comprehensive examples that guided edge cases?
@@ -74,6 +76,7 @@ Be specific. Quote from skills/transcripts where relevant.
74
76
  ### Step 6: Identify Loser Weaknesses
75
77
 
76
78
  Determine what held the loser back:
79
+
77
80
  - Ambiguous instructions that led to suboptimal choices?
78
81
  - Missing tools/scripts that forced workarounds?
79
82
  - Gaps in edge case coverage?
@@ -82,6 +85,7 @@ Determine what held the loser back:
82
85
  ### Step 7: Generate Improvement Suggestions
83
86
 
84
87
  Based on the analysis, produce actionable suggestions for improving the loser skill:
88
+
85
89
  - Specific instruction changes to make
86
90
  - Tools/scripts to add or modify
87
91
  - Examples to include
@@ -216,6 +220,7 @@ You receive these parameters in your prompt:
216
220
  ### Step 2: Analyze Per-Assertion Patterns
217
221
 
218
222
  For each expectation across all runs:
223
+
219
224
  - Does it **always pass** in both configurations? (may not differentiate skill value)
220
225
  - Does it **always fail** in both configurations? (may be broken or beyond capability)
221
226
  - Does it **always pass with skill but fail without**? (skill clearly adds value here)
@@ -225,6 +230,7 @@ For each expectation across all runs:
225
230
  ### Step 3: Analyze Cross-Eval Patterns
226
231
 
227
232
  Look for patterns across evals:
233
+
228
234
  - Are certain eval types consistently harder/easier?
229
235
  - Do some evals show high variance while others are stable?
230
236
  - Are there surprising results that contradict expectations?
@@ -232,6 +238,7 @@ Look for patterns across evals:
232
238
  ### Step 4: Analyze Metrics Patterns
233
239
 
234
240
  Look at time_seconds, tokens, tool_calls:
241
+
235
242
  - Does the skill significantly increase execution time?
236
243
  - Is there high variance in resource usage?
237
244
  - Are there outlier runs that skew the aggregates?
@@ -239,11 +246,13 @@ Look at time_seconds, tokens, tool_calls:
239
246
  ### Step 5: Generate Notes
240
247
 
241
248
  Write freeform observations as a list of strings. Each note should:
249
+
242
250
  - State a specific observation
243
251
  - Be grounded in the data (not speculation)
244
252
  - Help the user understand something the aggregate metrics don't show
245
253
 
246
254
  Examples:
255
+
247
256
  - "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
248
257
  - "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
249
258
  - "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
@@ -267,12 +276,14 @@ Save notes to `{output_path}` as a JSON array of strings:
267
276
  ## Guidelines
268
277
 
269
278
  **DO:**
279
+
270
280
  - Report what you observe in the data
271
281
  - Be specific about which evals, expectations, or runs you're referring to
272
282
  - Note patterns that aggregate metrics would hide
273
283
  - Provide context that helps interpret the numbers
274
284
 
275
285
  **DO NOT:**
286
+
276
287
  - Suggest improvements to the skill (that's for the improvement step, not benchmarking)
277
288
  - Make subjective quality judgments ("the output was good/bad")
278
289
  - Speculate about causes without evidence
@@ -44,6 +44,7 @@ You receive these parameters in your prompt:
44
44
  Based on the task, generate a rubric with two dimensions:
45
45
 
46
46
  **Content Rubric** (what the output contains):
47
+
47
48
  | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
48
49
  |-----------|----------|----------------|---------------|
49
50
  | Correctness | Major errors | Minor errors | Fully correct |
@@ -51,6 +52,7 @@ Based on the task, generate a rubric with two dimensions:
51
52
  | Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
52
53
 
53
54
  **Structure Rubric** (how the output is organized):
55
+
54
56
  | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
55
57
  |-----------|----------|----------------|---------------|
56
58
  | Organization | Disorganized | Reasonably organized | Clear, logical structure |
@@ -58,6 +60,7 @@ Based on the task, generate a rubric with two dimensions:
58
60
  | Usability | Difficult to use | Usable with effort | Easy to use |
59
61
 
60
62
  Adapt criteria to the specific task. For example:
63
+
61
64
  - PDF form → "Field alignment", "Text readability", "Data placement"
62
65
  - Document → "Section structure", "Heading hierarchy", "Paragraph flow"
63
66
  - Data output → "Schema correctness", "Data types", "Completeness"
@@ -66,6 +66,7 @@ This catches issues that predefined expectations might miss.
66
66
  ### Step 5: Read User Notes
67
67
 
68
68
  If `{outputs_dir}/user_notes.md` exists:
69
+
69
70
  1. Read it and note any uncertainties or issues flagged by the executor
70
71
  2. Include relevant concerns in the grading output
71
72
  3. These may reveal problems even when expectations pass
@@ -77,6 +78,7 @@ After grading, consider whether the evals themselves could be improved. Only sur
77
78
  Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
78
79
 
79
80
  Suggestions worth raising:
81
+
80
82
  - An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
81
83
  - An important outcome you observed — good or bad — that no assertion covers at all
82
84
  - An assertion that can't actually be verified from the available outputs
@@ -90,11 +92,13 @@ Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
90
92
  ## Grading Criteria
91
93
 
92
94
  **PASS when**:
95
+
93
96
  - The transcript or outputs clearly demonstrate the expectation is true
94
97
  - Specific evidence can be cited
95
98
  - The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
96
99
 
97
100
  **FAIL when**:
101
+
98
102
  - No evidence found for the expectation
99
103
  - Evidence contradicts the expectation
100
104
  - The expectation cannot be verified from available information