@intentsolutionsio/skill-creator 5.0.0 → 5.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/validate-skill.py +61 -1100
- package/skills/agent-creator/SKILL.md +40 -14
- package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
- package/skills/skill-creator/SKILL.md +34 -9
- package/skills/skill-creator/agents/analyzer.md +39 -1
- package/skills/skill-creator/agents/comparator.md +31 -1
- package/skills/skill-creator/agents/grader.md +32 -1
- package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
- package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
- package/skills/skill-creator/references/anthropic-comparison.md +3 -0
- package/skills/skill-creator/references/creation-guide.md +20 -1
- package/skills/skill-creator/references/errors-template.md +1 -0
- package/skills/skill-creator/references/examples-template.md +1 -0
- package/skills/skill-creator/references/frontmatter-spec.md +1 -0
- package/skills/skill-creator/references/implementation-template.md +1 -0
- package/skills/skill-creator/references/output-patterns.md +7 -0
- package/skills/skill-creator/references/schemas.md +5 -0
- package/skills/skill-creator/references/source-of-truth.md +40 -2
- package/skills/skill-creator/references/validation-rules.md +19 -1
- package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
- package/skills/skill-creator/scripts/generate_report.py +29 -17
- package/skills/skill-creator/scripts/improve_description.py +18 -21
- package/skills/skill-creator/scripts/package_skill.py +2 -2
- package/skills/skill-creator/scripts/quick_validate.py +16 -15
- package/skills/skill-creator/scripts/run_eval.py +14 -10
- package/skills/skill-creator/scripts/run_loop.py +51 -31
- package/skills/skill-creator/scripts/utils.py +5 -4
- package/skills/skill-creator/templates/agent-template.md +3 -0
- package/skills/skill-creator/templates/skill-template.md +4 -0
|
@@ -1,20 +1,33 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: agent-creator
|
|
3
|
-
description:
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
3
|
+
description: 'Create production-grade agent .md files aligned with the Anthropic 2026
|
|
4
|
+
spec (16-field schema).
|
|
5
|
+
|
|
6
|
+
Also validates existing agents against the marketplace compliance rules. Use when
|
|
7
|
+
building custom
|
|
8
|
+
|
|
9
|
+
subagents, reviewing agent quality, or creating parallel agent architectures for
|
|
10
|
+
orchestrator skills.
|
|
11
|
+
|
|
12
|
+
Trigger with "/agent-creator", "create an agent", "build a subagent", or "validate
|
|
13
|
+
my agent".
|
|
14
|
+
|
|
15
|
+
Make sure to use this skill whenever creating agents/*.md files for plugins or standalone
|
|
16
|
+
use.
|
|
17
|
+
|
|
18
|
+
'
|
|
19
|
+
allowed-tools: Read,Write,Edit,Glob,Grep,Bash(python:*),AskUserQuestion
|
|
10
20
|
version: 1.0.0
|
|
11
21
|
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
12
22
|
license: MIT
|
|
13
|
-
|
|
14
|
-
|
|
23
|
+
tags:
|
|
24
|
+
- agent-creation
|
|
25
|
+
- validation
|
|
26
|
+
- meta-tooling
|
|
27
|
+
- subagents
|
|
15
28
|
model: inherit
|
|
29
|
+
compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
|
|
16
30
|
---
|
|
17
|
-
|
|
18
31
|
# Agent Creator
|
|
19
32
|
|
|
20
33
|
Creates spec-compliant agent .md files following the Anthropic 2026 16-field schema. Supports
|
|
@@ -45,6 +58,7 @@ that drives the subagent — it does NOT receive the full Claude Code system pro
|
|
|
45
58
|
### Mode Detection
|
|
46
59
|
|
|
47
60
|
Determine user intent from their prompt:
|
|
61
|
+
|
|
48
62
|
- **Create mode**: "create an agent", "build a subagent", "new agent" -> Step 1
|
|
49
63
|
- **Validate mode**: "validate agent", "check agent", "grade agent" -> Validation Workflow
|
|
50
64
|
|
|
@@ -53,21 +67,25 @@ Determine user intent from their prompt:
|
|
|
53
67
|
Ask the user with AskUserQuestion:
|
|
54
68
|
|
|
55
69
|
**Agent Identity:**
|
|
70
|
+
|
|
56
71
|
- Name (kebab-case, 1-64 chars, e.g., `risk-assessor`, `clause-analyzer`)
|
|
57
72
|
- Specialty description (20-200 chars — shown in agent selection UI)
|
|
58
73
|
|
|
59
74
|
**Execution Context:**
|
|
75
|
+
|
|
60
76
|
- Plugin agent (`plugins/*/agents/`) or standalone (`~/.claude/agents/`)?
|
|
61
77
|
- Will it be spawned by an orchestrator skill via `Task` tool?
|
|
62
78
|
- Does it need to preload specific skills? (`skills: [skill-name]`)
|
|
63
79
|
|
|
64
80
|
**Behavioral Controls:**
|
|
81
|
+
|
|
65
82
|
- Model override? (`sonnet` for speed, `opus` for quality, `inherit` for default)
|
|
66
83
|
- Reasoning effort? (`low` for simple, `medium` default, `high` for complex analysis)
|
|
67
84
|
- Max iterations? (`maxTurns` — how many tool-use loops before stopping)
|
|
68
85
|
- Tools to deny? (`disallowedTools` — a denylist, the opposite of the `allowed-tools` allowlist that skills use)
|
|
69
86
|
|
|
70
87
|
**Plugin Restrictions (if plugin agent):**
|
|
88
|
+
|
|
71
89
|
- `hooks` — NOT supported in plugin agents (use plugin-level hooks)
|
|
72
90
|
- `mcpServers` — NOT supported in plugin agents
|
|
73
91
|
- `permissionMode` — standalone only, NOT plugin agents
|
|
@@ -78,6 +96,7 @@ Before writing, determine:
|
|
|
78
96
|
|
|
79
97
|
**Agent Role Clarity:**
|
|
80
98
|
The agent body must make three things unambiguous:
|
|
99
|
+
|
|
81
100
|
1. **What it IS responsible for** — its specific domain/methodology
|
|
82
101
|
2. **What it is NOT responsible for** — boundaries with other agents
|
|
83
102
|
3. **How it communicates results** — output format and structure
|
|
@@ -99,6 +118,7 @@ All production agents should follow this body structure:
|
|
|
99
118
|
| `## Examples` | Concrete interaction examples | For complex agents |
|
|
100
119
|
|
|
101
120
|
**Output Structure Decision:**
|
|
121
|
+
|
|
102
122
|
- If the agent feeds into an orchestrator: use **JSON output** (machine-parseable)
|
|
103
123
|
- If the agent is user-facing: use **markdown output** (human-readable)
|
|
104
124
|
- If the agent produces both: JSON primary with markdown summary
|
|
@@ -113,12 +133,14 @@ Generate the agent .md using the template from
|
|
|
113
133
|
See [Anthropic Agent Spec](references/anthropic-agent-spec.md) for the full official reference.
|
|
114
134
|
|
|
115
135
|
Required fields:
|
|
136
|
+
|
|
116
137
|
```yaml
|
|
117
138
|
name: {agent-name} # Lowercase letters and hyphens, unique identifier
|
|
118
139
|
description: "{specialty}" # When Claude should delegate to this subagent
|
|
119
140
|
```
|
|
120
141
|
|
|
121
142
|
Optional fields (include only what's needed):
|
|
143
|
+
|
|
122
144
|
```yaml
|
|
123
145
|
tools: "Read, Glob, Grep" # Allowlist — inherits all tools if omitted
|
|
124
146
|
disallowedTools: "Write" # Denylist — removed from inherited/specified list
|
|
@@ -137,12 +159,14 @@ mcpServers: {} # Standalone only, NOT plugin agents
|
|
|
137
159
|
```
|
|
138
160
|
|
|
139
161
|
**Tool access:**
|
|
162
|
+
|
|
140
163
|
- `tools` = allowlist (like skills' `allowed-tools`)
|
|
141
164
|
- `disallowedTools` = denylist (remove specific tools)
|
|
142
165
|
- If both are set: `disallowedTools` is applied first, then `tools` is resolved
|
|
143
166
|
- If neither set: inherits all tools from parent conversation
|
|
144
167
|
|
|
145
168
|
**Invalid fields (ERROR — never use these):**
|
|
169
|
+
|
|
146
170
|
- `capabilities` — looks valid but is flagged by the validator
|
|
147
171
|
- `expertise_level` — invented, not in Anthropic spec
|
|
148
172
|
- `activation_priority` — invented, not in Anthropic spec
|
|
@@ -191,6 +215,7 @@ Run validation against the Anthropic 16-field schema:
|
|
|
191
215
|
| Body under 300 lines | Offload to references if longer (prevents context bloat) |
|
|
192
216
|
|
|
193
217
|
**Automated validation:**
|
|
218
|
+
|
|
194
219
|
```bash
|
|
195
220
|
python3 ${CLAUDE_SKILL_DIR}/../skill-creator/scripts/validate-skill.py --agents-only {plugin-dir}/
|
|
196
221
|
```
|
|
@@ -209,6 +234,7 @@ Test the agent by spawning it via the `Task` tool or the `Agent` tool:
|
|
|
209
234
|
### Step 6: Report
|
|
210
235
|
|
|
211
236
|
Provide a summary:
|
|
237
|
+
|
|
212
238
|
- Agent name and file path
|
|
213
239
|
- Frontmatter field count (of 16 possible)
|
|
214
240
|
- Body line count
|
|
@@ -299,7 +325,7 @@ actionable).
|
|
|
299
325
|
## Resources
|
|
300
326
|
|
|
301
327
|
- [Anthropic Agent Spec](references/anthropic-agent-spec.md) — Official 16-field schema from code.claude.com/docs/en/sub-agents
|
|
302
|
-
-
|
|
303
|
-
-
|
|
304
|
-
-
|
|
305
|
-
-
|
|
328
|
+
- Agent template — Skeleton with placeholders
|
|
329
|
+
- Frontmatter spec — Field reference (internal)
|
|
330
|
+
- Source of truth — Canonical spec
|
|
331
|
+
- Validation rules — Agent validation section
|
|
@@ -39,6 +39,7 @@ Total: 16 official fields.
|
|
|
39
39
|
## Plugin Agent Restrictions
|
|
40
40
|
|
|
41
41
|
Plugin agents (`plugins/*/agents/*.md`) do NOT support:
|
|
42
|
+
|
|
42
43
|
- `hooks` — ignored when loading from plugin
|
|
43
44
|
- `mcpServers` — ignored when loading from plugin
|
|
44
45
|
- `permissionMode` — ignored when loading from plugin
|
|
@@ -1,20 +1,31 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: skill-creator
|
|
3
|
-
description:
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
description: 'Create production-grade agent skills aligned with the 2026 AgentSkills.io
|
|
4
|
+
spec and Anthropic
|
|
5
|
+
|
|
6
|
+
best practices (2026). Also validates existing skills against the Intent Solutions
|
|
7
|
+
100-point rubric.
|
|
8
|
+
|
|
6
9
|
Use when building, testing, validating, or optimizing Claude Code skills.
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
+
|
|
11
|
+
Trigger with "/skill-creator", "create a skill", "validate my skill", or "check
|
|
12
|
+
skill quality".
|
|
13
|
+
|
|
14
|
+
Make sure to use this skill whenever creating a new skill, slash command, or agent
|
|
15
|
+
capability.
|
|
16
|
+
|
|
17
|
+
'
|
|
18
|
+
allowed-tools: Read,Write,Edit,Glob,Grep,Bash(mkdir:*),Bash(chmod:*),Bash(python:*),Bash(claude:*),Task,AskUserQuestion
|
|
10
19
|
version: 5.1.0
|
|
11
20
|
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
12
21
|
license: MIT
|
|
13
|
-
|
|
14
|
-
|
|
22
|
+
tags:
|
|
23
|
+
- skill-creation
|
|
24
|
+
- validation
|
|
25
|
+
- meta-tooling
|
|
15
26
|
model: inherit
|
|
27
|
+
compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
|
|
16
28
|
---
|
|
17
|
-
|
|
18
29
|
# Skill Creator
|
|
19
30
|
|
|
20
31
|
Creates complete, spec-compliant skill packages following AgentSkills.io and Anthropic standards.
|
|
@@ -39,12 +50,14 @@ scratch with full validation, or grade/audit existing skills with actionable fix
|
|
|
39
50
|
### Mode Detection
|
|
40
51
|
|
|
41
52
|
Determine user intent from their prompt:
|
|
53
|
+
|
|
42
54
|
- **Create mode**: "create a skill", "build a skill", "new skill" -> proceed to Step 1
|
|
43
55
|
- **Validate mode**: "validate", "check", "grade", "score", "audit" -> jump to Validation Workflow
|
|
44
56
|
|
|
45
57
|
### Communicating with the User
|
|
46
58
|
|
|
47
59
|
Pay attention to context cues to understand the user's technical level. Skill creator is used by people across a wide range of familiarity — from first-time coders to senior engineers. In the default case:
|
|
60
|
+
|
|
48
61
|
- "evaluation" and "benchmark" are borderline but OK
|
|
49
62
|
- For "JSON" and "assertion", check for cues the user knows these terms before using them without explanation
|
|
50
63
|
- Briefly explain terms if in doubt
|
|
@@ -56,21 +69,25 @@ If the current conversation already contains a workflow the user wants to captur
|
|
|
56
69
|
Ask the user with AskUserQuestion:
|
|
57
70
|
|
|
58
71
|
**Skill Identity:**
|
|
72
|
+
|
|
59
73
|
- Name (kebab-case, gerund preferred: `processing-pdfs`, `analyzing-data`)
|
|
60
74
|
- Purpose (1-2 sentences: what it does + when to use it)
|
|
61
75
|
|
|
62
76
|
**Execution Model:**
|
|
77
|
+
|
|
63
78
|
- User-invocable via `/name`? Or background knowledge only?
|
|
64
79
|
- Accepts arguments? (`$ARGUMENTS` substitution)
|
|
65
80
|
- Needs isolated context? (`context: fork` for subagent execution)
|
|
66
81
|
- Explicit-only invocation? (`disable-model-invocation: true` — prevents auto-activation, requires `/name`)
|
|
67
82
|
|
|
68
83
|
**Required Tools:**
|
|
84
|
+
|
|
69
85
|
- Read, Write, Edit, Glob, Grep, WebFetch, WebSearch, Task, AskUserQuestion, Skill
|
|
70
86
|
- Bash must be scoped: `Bash(git:*)`, `Bash(npm:*)`, etc.
|
|
71
87
|
- MCP tools: `ServerName:tool_name`
|
|
72
88
|
|
|
73
89
|
**Complexity:**
|
|
90
|
+
|
|
74
91
|
- Simple (SKILL.md only)
|
|
75
92
|
- With scripts (automation code in `scripts/`)
|
|
76
93
|
- With references (documentation in `references/`)
|
|
@@ -78,6 +95,7 @@ Ask the user with AskUserQuestion:
|
|
|
78
95
|
- Full package (all directories)
|
|
79
96
|
|
|
80
97
|
**Location:**
|
|
98
|
+
|
|
81
99
|
- Global: `~/.claude/skills/<skill-name>/`
|
|
82
100
|
- Project: `.claude/skills/<skill-name>/`
|
|
83
101
|
|
|
@@ -86,6 +104,7 @@ Ask the user with AskUserQuestion:
|
|
|
86
104
|
Before writing, determine:
|
|
87
105
|
|
|
88
106
|
**Degrees of Freedom:**
|
|
107
|
+
|
|
89
108
|
| Level | When to Use |
|
|
90
109
|
|-------|-------------|
|
|
91
110
|
| High | Creative/open-ended tasks (analysis, writing) |
|
|
@@ -95,6 +114,7 @@ Before writing, determine:
|
|
|
95
114
|
Think of it as **narrow bridge vs open field**: a deployment skill is a narrow bridge (one safe path, guard rails everywhere), while a writing skill is an open field (Claude roams freely within broad boundaries). Match constraint level to the task.
|
|
96
115
|
|
|
97
116
|
**Workflow Pattern** (see `${CLAUDE_SKILL_DIR}/references/workflows.md`):
|
|
117
|
+
|
|
98
118
|
- Sequential: fixed steps in order
|
|
99
119
|
- Conditional: branch based on input
|
|
100
120
|
- Wizard: interactive multi-step gathering
|
|
@@ -104,6 +124,7 @@ Think of it as **narrow bridge vs open field**: a deployment skill is a narrow b
|
|
|
104
124
|
- Search-Analyze-Report: explore and summarize
|
|
105
125
|
|
|
106
126
|
**Output Pattern** (see `${CLAUDE_SKILL_DIR}/references/output-patterns.md`):
|
|
127
|
+
|
|
107
128
|
- Strict template (exact format)
|
|
108
129
|
- Flexible template (structure with creative content)
|
|
109
130
|
- Examples-driven (input/output pairs)
|
|
@@ -128,6 +149,7 @@ mkdir -p {location}/{skill-name}/evals # for eval-driven development
|
|
|
128
149
|
For detailed guidance on writing SKILL.md (frontmatter rules, description scoring, body guidelines, string substitutions, DCI syntax), creating supporting files, validation, testing, iteration, description optimization, and final reporting, see [Creation Guide](references/creation-guide.md).
|
|
129
150
|
|
|
130
151
|
Key rules:
|
|
152
|
+
|
|
131
153
|
- `version`, `author`, `license`, `tags`, `compatible-with` are TOP-LEVEL fields (not nested under `metadata:`)
|
|
132
154
|
- Scope Bash: `Bash(git:*)` not bare `Bash`
|
|
133
155
|
- Keep under 500 lines; offload to `references/` if longer
|
|
@@ -244,15 +266,18 @@ Output:
|
|
|
244
266
|
## Resources
|
|
245
267
|
|
|
246
268
|
**References:** `${CLAUDE_SKILL_DIR}/references/`
|
|
269
|
+
|
|
247
270
|
- `creation-guide.md` — Detailed Steps 4-10 and Validation Workflow (V1-V5)
|
|
248
271
|
- `source-of-truth.md` — Canonical spec ([AgentSkills.io](https://agentskills.io/specification), [Anthropic docs](https://code.claude.com/docs/en/skills), [Lee Han Chung deep dive](https://leehanchung.github.io/blogs/2025/10/26/claude-skills-deep-dive/)) | `frontmatter-spec.md` — Field reference | `validation-rules.md` — 100-point rubric
|
|
249
272
|
- `workflows.md` — Workflow patterns | `output-patterns.md` — Output formats | `schemas.md` — JSON schemas (evals, grading, benchmarks)
|
|
250
273
|
- `anthropic-comparison.md` — Gap analysis | `advanced-eval-workflow.md` — Eval, iteration, optimization, platform notes
|
|
251
274
|
|
|
252
275
|
**Agents** (read when spawning subagents): `${CLAUDE_SKILL_DIR}/agents/`
|
|
276
|
+
|
|
253
277
|
- `grader.md` — Assertion evaluation | `comparator.md` — Blind A/B comparison | `analyzer.md` — Benchmark analysis
|
|
254
278
|
|
|
255
279
|
**Scripts:** `${CLAUDE_SKILL_DIR}/scripts/`
|
|
280
|
+
|
|
256
281
|
- `validate-skill.py` — 100-point rubric grading | `quick_validate.py` — Lightweight validation
|
|
257
282
|
- `aggregate_benchmark.py` — Benchmark stats | `run_eval.py` — Trigger accuracy testing
|
|
258
283
|
- `run_loop.py` — Description optimization loop | `improve_description.py` — LLM-powered rewriting
|
|
@@ -1,8 +1,35 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: analyzer
|
|
3
3
|
description: Analyze blind comparison results to understand why the winner won and generate improvement suggestions
|
|
4
|
+
tools:
|
|
5
|
+
- Read
|
|
6
|
+
- Write
|
|
7
|
+
- Edit
|
|
8
|
+
- Bash
|
|
9
|
+
- Glob
|
|
10
|
+
- Grep
|
|
11
|
+
- WebFetch
|
|
12
|
+
- WebSearch
|
|
13
|
+
- Task
|
|
14
|
+
- TodoWrite
|
|
15
|
+
model: sonnet
|
|
16
|
+
color: orange
|
|
17
|
+
version: 1.0.0
|
|
18
|
+
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
19
|
+
tags:
|
|
20
|
+
- skill-enhancers
|
|
21
|
+
- analyzer
|
|
22
|
+
disallowedTools: []
|
|
23
|
+
skills: []
|
|
24
|
+
background: false
|
|
25
|
+
# ── upgrade levers — uncomment + set when tuning this agent ──
|
|
26
|
+
# effort: high # reasoning depth: low/medium/high/xhigh/max (omit = inherit session)
|
|
27
|
+
# maxTurns: 50 # cap the agentic loop (omit = engine default)
|
|
28
|
+
# memory: project # persistent scope: user/project/local (omit = ephemeral)
|
|
29
|
+
# isolation: worktree # run in an isolated git worktree
|
|
30
|
+
# initialPrompt: "…" # seed the agent's first turn
|
|
31
|
+
# hooks / mcpServers / permissionMode → set at the PLUGIN level, not on a plugin agent
|
|
4
32
|
---
|
|
5
|
-
|
|
6
33
|
# Post-hoc Analyzer Agent
|
|
7
34
|
|
|
8
35
|
Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.
|
|
@@ -54,6 +81,7 @@ You receive these parameters in your prompt:
|
|
|
54
81
|
### Step 4: Analyze Instruction Following
|
|
55
82
|
|
|
56
83
|
For each transcript, evaluate:
|
|
84
|
+
|
|
57
85
|
- Did the agent follow the skill's explicit instructions?
|
|
58
86
|
- Did the agent use the skill's provided tools/scripts?
|
|
59
87
|
- Were there missed opportunities to leverage skill content?
|
|
@@ -64,6 +92,7 @@ Score instruction following 1-10 and note specific issues.
|
|
|
64
92
|
### Step 5: Identify Winner Strengths
|
|
65
93
|
|
|
66
94
|
Determine what made the winner better:
|
|
95
|
+
|
|
67
96
|
- Clearer instructions that led to better behavior?
|
|
68
97
|
- Better scripts/tools that produced better output?
|
|
69
98
|
- More comprehensive examples that guided edge cases?
|
|
@@ -74,6 +103,7 @@ Be specific. Quote from skills/transcripts where relevant.
|
|
|
74
103
|
### Step 6: Identify Loser Weaknesses
|
|
75
104
|
|
|
76
105
|
Determine what held the loser back:
|
|
106
|
+
|
|
77
107
|
- Ambiguous instructions that led to suboptimal choices?
|
|
78
108
|
- Missing tools/scripts that forced workarounds?
|
|
79
109
|
- Gaps in edge case coverage?
|
|
@@ -82,6 +112,7 @@ Determine what held the loser back:
|
|
|
82
112
|
### Step 7: Generate Improvement Suggestions
|
|
83
113
|
|
|
84
114
|
Based on the analysis, produce actionable suggestions for improving the loser skill:
|
|
115
|
+
|
|
85
116
|
- Specific instruction changes to make
|
|
86
117
|
- Tools/scripts to add or modify
|
|
87
118
|
- Examples to include
|
|
@@ -216,6 +247,7 @@ You receive these parameters in your prompt:
|
|
|
216
247
|
### Step 2: Analyze Per-Assertion Patterns
|
|
217
248
|
|
|
218
249
|
For each expectation across all runs:
|
|
250
|
+
|
|
219
251
|
- Does it **always pass** in both configurations? (may not differentiate skill value)
|
|
220
252
|
- Does it **always fail** in both configurations? (may be broken or beyond capability)
|
|
221
253
|
- Does it **always pass with skill but fail without**? (skill clearly adds value here)
|
|
@@ -225,6 +257,7 @@ For each expectation across all runs:
|
|
|
225
257
|
### Step 3: Analyze Cross-Eval Patterns
|
|
226
258
|
|
|
227
259
|
Look for patterns across evals:
|
|
260
|
+
|
|
228
261
|
- Are certain eval types consistently harder/easier?
|
|
229
262
|
- Do some evals show high variance while others are stable?
|
|
230
263
|
- Are there surprising results that contradict expectations?
|
|
@@ -232,6 +265,7 @@ Look for patterns across evals:
|
|
|
232
265
|
### Step 4: Analyze Metrics Patterns
|
|
233
266
|
|
|
234
267
|
Look at time_seconds, tokens, tool_calls:
|
|
268
|
+
|
|
235
269
|
- Does the skill significantly increase execution time?
|
|
236
270
|
- Is there high variance in resource usage?
|
|
237
271
|
- Are there outlier runs that skew the aggregates?
|
|
@@ -239,11 +273,13 @@ Look at time_seconds, tokens, tool_calls:
|
|
|
239
273
|
### Step 5: Generate Notes
|
|
240
274
|
|
|
241
275
|
Write freeform observations as a list of strings. Each note should:
|
|
276
|
+
|
|
242
277
|
- State a specific observation
|
|
243
278
|
- Be grounded in the data (not speculation)
|
|
244
279
|
- Help the user understand something the aggregate metrics don't show
|
|
245
280
|
|
|
246
281
|
Examples:
|
|
282
|
+
|
|
247
283
|
- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
|
|
248
284
|
- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
|
|
249
285
|
- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
|
|
@@ -267,12 +303,14 @@ Save notes to `{output_path}` as a JSON array of strings:
|
|
|
267
303
|
## Guidelines
|
|
268
304
|
|
|
269
305
|
**DO:**
|
|
306
|
+
|
|
270
307
|
- Report what you observe in the data
|
|
271
308
|
- Be specific about which evals, expectations, or runs you're referring to
|
|
272
309
|
- Note patterns that aggregate metrics would hide
|
|
273
310
|
- Provide context that helps interpret the numbers
|
|
274
311
|
|
|
275
312
|
**DO NOT:**
|
|
313
|
+
|
|
276
314
|
- Suggest improvements to the skill (that's for the improvement step, not benchmarking)
|
|
277
315
|
- Make subjective quality judgments ("the output was good/bad")
|
|
278
316
|
- Speculate about causes without evidence
|
|
@@ -1,8 +1,35 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: comparator
|
|
3
3
|
description: Compare two outputs blindly without knowing which skill produced them
|
|
4
|
+
tools:
|
|
5
|
+
- Read
|
|
6
|
+
- Write
|
|
7
|
+
- Edit
|
|
8
|
+
- Bash
|
|
9
|
+
- Glob
|
|
10
|
+
- Grep
|
|
11
|
+
- WebFetch
|
|
12
|
+
- WebSearch
|
|
13
|
+
- Task
|
|
14
|
+
- TodoWrite
|
|
15
|
+
model: sonnet
|
|
16
|
+
color: red
|
|
17
|
+
version: 1.0.0
|
|
18
|
+
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
19
|
+
tags:
|
|
20
|
+
- skill-enhancers
|
|
21
|
+
- comparator
|
|
22
|
+
disallowedTools: []
|
|
23
|
+
skills: []
|
|
24
|
+
background: false
|
|
25
|
+
# ── upgrade levers — uncomment + set when tuning this agent ──
|
|
26
|
+
# effort: high # reasoning depth: low/medium/high/xhigh/max (omit = inherit session)
|
|
27
|
+
# maxTurns: 50 # cap the agentic loop (omit = engine default)
|
|
28
|
+
# memory: project # persistent scope: user/project/local (omit = ephemeral)
|
|
29
|
+
# isolation: worktree # run in an isolated git worktree
|
|
30
|
+
# initialPrompt: "…" # seed the agent's first turn
|
|
31
|
+
# hooks / mcpServers / permissionMode → set at the PLUGIN level, not on a plugin agent
|
|
4
32
|
---
|
|
5
|
-
|
|
6
33
|
# Blind Comparator Agent
|
|
7
34
|
|
|
8
35
|
Compare two outputs WITHOUT knowing which skill produced them.
|
|
@@ -44,6 +71,7 @@ You receive these parameters in your prompt:
|
|
|
44
71
|
Based on the task, generate a rubric with two dimensions:
|
|
45
72
|
|
|
46
73
|
**Content Rubric** (what the output contains):
|
|
74
|
+
|
|
47
75
|
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|
|
48
76
|
|-----------|----------|----------------|---------------|
|
|
49
77
|
| Correctness | Major errors | Minor errors | Fully correct |
|
|
@@ -51,6 +79,7 @@ Based on the task, generate a rubric with two dimensions:
|
|
|
51
79
|
| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
|
|
52
80
|
|
|
53
81
|
**Structure Rubric** (how the output is organized):
|
|
82
|
+
|
|
54
83
|
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|
|
55
84
|
|-----------|----------|----------------|---------------|
|
|
56
85
|
| Organization | Disorganized | Reasonably organized | Clear, logical structure |
|
|
@@ -58,6 +87,7 @@ Based on the task, generate a rubric with two dimensions:
|
|
|
58
87
|
| Usability | Difficult to use | Usable with effort | Easy to use |
|
|
59
88
|
|
|
60
89
|
Adapt criteria to the specific task. For example:
|
|
90
|
+
|
|
61
91
|
- PDF form → "Field alignment", "Text readability", "Data placement"
|
|
62
92
|
- Document → "Section structure", "Heading hierarchy", "Paragraph flow"
|
|
63
93
|
- Data output → "Schema correctness", "Data types", "Completeness"
|
|
@@ -1,8 +1,35 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: grader
|
|
3
3
|
description: Evaluate expectations against execution transcripts and outputs
|
|
4
|
+
tools:
|
|
5
|
+
- Read
|
|
6
|
+
- Write
|
|
7
|
+
- Edit
|
|
8
|
+
- Bash
|
|
9
|
+
- Glob
|
|
10
|
+
- Grep
|
|
11
|
+
- WebFetch
|
|
12
|
+
- WebSearch
|
|
13
|
+
- Task
|
|
14
|
+
- TodoWrite
|
|
15
|
+
model: sonnet
|
|
16
|
+
color: pink
|
|
17
|
+
version: 1.0.0
|
|
18
|
+
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
19
|
+
tags:
|
|
20
|
+
- skill-enhancers
|
|
21
|
+
- grader
|
|
22
|
+
disallowedTools: []
|
|
23
|
+
skills: []
|
|
24
|
+
background: false
|
|
25
|
+
# ── upgrade levers — uncomment + set when tuning this agent ──
|
|
26
|
+
# effort: high # reasoning depth: low/medium/high/xhigh/max (omit = inherit session)
|
|
27
|
+
# maxTurns: 50 # cap the agentic loop (omit = engine default)
|
|
28
|
+
# memory: project # persistent scope: user/project/local (omit = ephemeral)
|
|
29
|
+
# isolation: worktree # run in an isolated git worktree
|
|
30
|
+
# initialPrompt: "…" # seed the agent's first turn
|
|
31
|
+
# hooks / mcpServers / permissionMode → set at the PLUGIN level, not on a plugin agent
|
|
4
32
|
---
|
|
5
|
-
|
|
6
33
|
# Grader Agent
|
|
7
34
|
|
|
8
35
|
Evaluate expectations against an execution transcript and outputs.
|
|
@@ -66,6 +93,7 @@ This catches issues that predefined expectations might miss.
|
|
|
66
93
|
### Step 5: Read User Notes
|
|
67
94
|
|
|
68
95
|
If `{outputs_dir}/user_notes.md` exists:
|
|
96
|
+
|
|
69
97
|
1. Read it and note any uncertainties or issues flagged by the executor
|
|
70
98
|
2. Include relevant concerns in the grading output
|
|
71
99
|
3. These may reveal problems even when expectations pass
|
|
@@ -77,6 +105,7 @@ After grading, consider whether the evals themselves could be improved. Only sur
|
|
|
77
105
|
Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
|
|
78
106
|
|
|
79
107
|
Suggestions worth raising:
|
|
108
|
+
|
|
80
109
|
- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
|
|
81
110
|
- An important outcome you observed — good or bad — that no assertion covers at all
|
|
82
111
|
- An assertion that can't actually be verified from the available outputs
|
|
@@ -90,11 +119,13 @@ Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
|
|
|
90
119
|
## Grading Criteria
|
|
91
120
|
|
|
92
121
|
**PASS when**:
|
|
122
|
+
|
|
93
123
|
- The transcript or outputs clearly demonstrate the expectation is true
|
|
94
124
|
- Specific evidence can be cited
|
|
95
125
|
- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
|
|
96
126
|
|
|
97
127
|
**FAIL when**:
|
|
128
|
+
|
|
98
129
|
- No evidence found for the expectation
|
|
99
130
|
- Evidence contradicts the expectation
|
|
100
131
|
- The expectation cannot be verified from available information
|
|
@@ -32,9 +32,32 @@ METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}
|
|
|
32
32
|
|
|
33
33
|
# Extensions we render as inline text
|
|
34
34
|
TEXT_EXTENSIONS = {
|
|
35
|
-
".txt",
|
|
36
|
-
".
|
|
37
|
-
".
|
|
35
|
+
".txt",
|
|
36
|
+
".md",
|
|
37
|
+
".json",
|
|
38
|
+
".csv",
|
|
39
|
+
".py",
|
|
40
|
+
".js",
|
|
41
|
+
".ts",
|
|
42
|
+
".tsx",
|
|
43
|
+
".jsx",
|
|
44
|
+
".yaml",
|
|
45
|
+
".yml",
|
|
46
|
+
".xml",
|
|
47
|
+
".html",
|
|
48
|
+
".css",
|
|
49
|
+
".sh",
|
|
50
|
+
".rb",
|
|
51
|
+
".go",
|
|
52
|
+
".rs",
|
|
53
|
+
".java",
|
|
54
|
+
".c",
|
|
55
|
+
".cpp",
|
|
56
|
+
".h",
|
|
57
|
+
".hpp",
|
|
58
|
+
".sql",
|
|
59
|
+
".r",
|
|
60
|
+
".toml",
|
|
38
61
|
}
|
|
39
62
|
|
|
40
63
|
# Extensions we render as inline images
|
|
@@ -224,9 +247,7 @@ def load_previous_iteration(workspace: Path) -> dict[str, dict]:
|
|
|
224
247
|
try:
|
|
225
248
|
data = json.loads(feedback_path.read_text())
|
|
226
249
|
feedback_map = {
|
|
227
|
-
r["run_id"]: r["feedback"]
|
|
228
|
-
for r in data.get("reviews", [])
|
|
229
|
-
if r.get("feedback", "").strip()
|
|
250
|
+
r["run_id"]: r["feedback"] for r in data.get("reviews", []) if r.get("feedback", "").strip()
|
|
230
251
|
}
|
|
231
252
|
except (json.JSONDecodeError, OSError, KeyError):
|
|
232
253
|
pass
|
|
@@ -285,12 +306,15 @@ def generate_html(
|
|
|
285
306
|
# HTTP server (stdlib only, zero dependencies)
|
|
286
307
|
# ---------------------------------------------------------------------------
|
|
287
308
|
|
|
309
|
+
|
|
288
310
|
def _kill_port(port: int) -> None:
|
|
289
311
|
"""Kill any process listening on the given port."""
|
|
290
312
|
try:
|
|
291
313
|
result = subprocess.run(
|
|
292
314
|
["lsof", "-ti", f":{port}"],
|
|
293
|
-
capture_output=True,
|
|
315
|
+
capture_output=True,
|
|
316
|
+
text=True,
|
|
317
|
+
timeout=5,
|
|
294
318
|
)
|
|
295
319
|
for pid_str in result.stdout.strip().split("\n"):
|
|
296
320
|
if pid_str.strip():
|
|
@@ -305,6 +329,7 @@ def _kill_port(port: int) -> None:
|
|
|
305
329
|
except FileNotFoundError:
|
|
306
330
|
print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
|
|
307
331
|
|
|
332
|
+
|
|
308
333
|
class ReviewHandler(BaseHTTPRequestHandler):
|
|
309
334
|
"""Serves the review HTML and handles feedback saves.
|
|
310
335
|
|
|
@@ -390,15 +415,22 @@ def main() -> None:
|
|
|
390
415
|
parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
|
|
391
416
|
parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
|
|
392
417
|
parser.add_argument(
|
|
393
|
-
"--previous-workspace",
|
|
418
|
+
"--previous-workspace",
|
|
419
|
+
type=Path,
|
|
420
|
+
default=None,
|
|
394
421
|
help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
|
|
395
422
|
)
|
|
396
423
|
parser.add_argument(
|
|
397
|
-
"--benchmark",
|
|
424
|
+
"--benchmark",
|
|
425
|
+
type=Path,
|
|
426
|
+
default=None,
|
|
398
427
|
help="Path to benchmark.json to show in the Benchmark tab",
|
|
399
428
|
)
|
|
400
429
|
parser.add_argument(
|
|
401
|
-
"--static",
|
|
430
|
+
"--static",
|
|
431
|
+
"-s",
|
|
432
|
+
type=Path,
|
|
433
|
+
default=None,
|
|
402
434
|
help="Write standalone HTML to this path instead of starting a server",
|
|
403
435
|
)
|
|
404
436
|
args = parser.parse_args()
|
|
@@ -447,8 +479,8 @@ def main() -> None:
|
|
|
447
479
|
port = server.server_address[1]
|
|
448
480
|
|
|
449
481
|
url = f"http://localhost:{port}"
|
|
450
|
-
print(
|
|
451
|
-
print(
|
|
482
|
+
print("\n Eval Viewer")
|
|
483
|
+
print(" ─────────────────────────────────")
|
|
452
484
|
print(f" URL: {url}")
|
|
453
485
|
print(f" Workspace: {workspace}")
|
|
454
486
|
print(f" Feedback: {feedback_path}")
|
|
@@ -456,7 +488,7 @@ def main() -> None:
|
|
|
456
488
|
print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
|
|
457
489
|
if benchmark_path:
|
|
458
490
|
print(f" Benchmark: {benchmark_path}")
|
|
459
|
-
print(
|
|
491
|
+
print("\n Press Ctrl+C to stop.\n")
|
|
460
492
|
|
|
461
493
|
webbrowser.open(url)
|
|
462
494
|
|