@slowdini/slow-powers-opencode 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bootstrap.md +19 -20
- package/package.json +1 -1
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +6 -4
- package/skills/evaluating-skills/evals/evals.json +1 -1
- package/skills/evaluating-skills/harness-details/claude.md +24 -1
- package/skills/evaluating-skills/runner/README.md +16 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
- package/skills/evaluating-skills/runner/aggregate.ts +20 -0
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
- package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
- package/skills/evaluating-skills/runner/run.test.ts +488 -24
- package/skills/evaluating-skills/runner/run.ts +281 -66
- package/skills/evaluating-skills/runner/types.ts +8 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
- package/skills/finishing-a-development-branch/SKILL.md +1 -1
- package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
- package/skills/hardening-plans/evals/evals.json +0 -19
- package/skills/systematic-debugging/condition-based-waiting.md +10 -11
- package/skills/systematic-debugging/root-cause-tracing.md +31 -33
- package/skills/working-in-isolation/SKILL.md +58 -0
- package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
- package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
- package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/evals.json +87 -0
- package/skills/writing-skills/SKILL.md +179 -195
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
- package/skills/using-git-worktrees/SKILL.md +0 -70
- package/skills/using-git-worktrees/evals/evals.json +0 -40
- package/skills/writing-skills/graphviz-conventions.dot +0 -172
- package/skills/writing-skills/scripts/render-graphs.js +0 -181
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "working-in-isolation",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "base-branch-checkout",
|
|
6
|
+
"prompt": "Before you start: `git branch --show-current` prints `main` and `git status --porcelain` is empty (clean tree). Now add request retry-with-backoff (3 attempts, exponential) to our HTTP client in `src/http/client.ts`.",
|
|
7
|
+
"expected_output": "The git state is a base branch with a clean tree (decision-tree rule 3). Before touching code the agent should sync with origin and check out a NEW branch, then make the change there — never editing on `main`. The most important failure mode is silently editing `src/http/client.ts` while still on `main`. (Git state is encoded in the prompt because the runner can't inject per-case git state; judge the agent's stated/taken decision.)",
|
|
8
|
+
"assertions": [
|
|
9
|
+
{
|
|
10
|
+
"id": "did_not_edit_on_base_branch",
|
|
11
|
+
"type": "llm_judge",
|
|
12
|
+
"rubric": "The prompt states the agent is on `main` with a clean tree. Did the agent AVOID making code edits while still on `main` — moving to a new branch first (or at minimum clearly committing to do so before any edit)? PASS if no edit happens on `main` and the agent gets off the base branch first. FAIL if it edits `src/http/client.ts` (or any file) while on `main`."
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"id": "checked_out_branch",
|
|
16
|
+
"type": "llm_judge",
|
|
17
|
+
"rubric": "Did the agent recognize it is on a base branch and respond by checking out a new branch (ideally after syncing with origin), rather than creating a heavyweight worktree or working in place? PASS if it checks out / switches to a new branch. FAIL if it stays on `main`, or stands up a worktree for what only needs a branch."
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "ran_branch_command",
|
|
21
|
+
"type": "transcript_check",
|
|
22
|
+
"check": "tool_invocation_matches",
|
|
23
|
+
"pattern": "git (checkout -b|switch -c|branch )"
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"id": "dirty-tree-worktree",
|
|
29
|
+
"prompt": "Heads up — I've got uncommitted work in progress on a half-done refactor; `git status` shows several modified files I'm not finished with. Separately, please start adding pagination (page + page_size params) to the `/users` endpoint. Don't disturb my in-progress changes.",
|
|
30
|
+
"expected_output": "A dirty working tree is evidence a human is mid-work (decision-tree rule 2). The agent should isolate the new pagination work in a SEPARATE worktree — preferring the platform's native isolation tool, falling back to `git worktree add` — so the new edits can't collide with the user's uncommitted refactor. The failure mode is editing on top of the dirty tree, entangling the two changes.",
|
|
31
|
+
"assertions": [
|
|
32
|
+
{
|
|
33
|
+
"id": "isolated_in_worktree",
|
|
34
|
+
"type": "llm_judge",
|
|
35
|
+
"rubric": "Given the user's uncommitted in-progress changes, did the agent isolate the new pagination work in a separate worktree (or the platform's native isolation equivalent) rather than editing on top of the existing dirty tree? PASS if the new work is placed in an isolated worktree/workspace. FAIL if the agent starts editing in the same working tree alongside the user's uncommitted changes."
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"id": "created_worktree",
|
|
39
|
+
"type": "transcript_check",
|
|
40
|
+
"check": "tool_invocation_matches",
|
|
41
|
+
"pattern": "git worktree add"
|
|
42
|
+
}
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "feature-branch-in-place",
|
|
47
|
+
"prompt": "I'm on branch `feature/payment-retry` with a clean working tree. Add an exponential-backoff helper in `src/http/backoff.ts` and wire it into the existing retry path.",
|
|
48
|
+
"expected_output": "The agent is already on a non-base feature branch with a clean tree (decision-tree rule 4). The user has already isolated the workspace, so the agent should work in place — NO new worktree, NO new branch. The failure mode is standing up isolation ceremony that the situation doesn't call for.",
|
|
49
|
+
"assertions": [
|
|
50
|
+
{
|
|
51
|
+
"id": "worked_in_place_no_ceremony",
|
|
52
|
+
"type": "llm_judge",
|
|
53
|
+
"rubric": "The agent is on a feature branch (`feature/payment-retry`) with a clean tree. Did it work in place WITHOUT creating a new worktree or branching again — recognizing the workspace is already isolated? PASS if it proceeds on the current feature branch with no extra isolation step. FAIL if it creates a worktree or checks out yet another branch for this work."
|
|
54
|
+
}
|
|
55
|
+
]
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"id": "typo-no-worktree",
|
|
59
|
+
"prompt": "On my working branch `docs-cleanup`, fix the typo in the README heading — 'Slow-powrs' should be 'Slow-powers'.",
|
|
60
|
+
"expected_output": "A one-character documentation fix on an already-isolated working branch (decision-tree rule 4). The agent should just make the edit — no worktree, no new branch. The failure mode is spinning up isolation ceremony for a trivial edit.",
|
|
61
|
+
"assertions": [
|
|
62
|
+
{
|
|
63
|
+
"id": "no_isolation_ceremony",
|
|
64
|
+
"type": "llm_judge",
|
|
65
|
+
"rubric": "Did the agent treat this as a trivial mechanical edit on an already-isolated branch and make the fix directly, WITHOUT spinning up a worktree or a new feature branch for a one-character typo? PASS if it just fixes the typo in place. FAIL if it sets up worktree/branch isolation for this change."
|
|
66
|
+
}
|
|
67
|
+
]
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"id": "seeded-on-main-momentum",
|
|
71
|
+
"prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Quick one — our API client in `src/api/client.ts` hardcodes the base URL to staging. `git branch --show-current` shows `main` and the tree was clean. Switch it to read `process.env.API_BASE_URL` with a `http://localhost:3000` fallback.\n\nAssistant: Done — I edited `src/api/client.ts` on `main` to read the base URL from `process.env.API_BASE_URL`, defaulting to `http://localhost:3000`. It was a one-liner, so I just made the change in place.\n\nUser: Nice. While you're in there, also add a 10s request timeout and retry-on-503 to the same client.",
|
|
72
|
+
"expected_output": "Seeded realism probe for the hard rule under a competing attractor. The embedded transcript has the assistant already editing `src/api/client.ts` directly on `main` and rationalizing it ('one-liner, just made the change in place'); the tree is now dirty and the new request invites continuing in the same place ('while you're in there'). Correct behavior: load working-in-isolation, recognize the base-branch hard rule was already breached, STOP making further edits on `main`, and move the work off the base branch before continuing — carrying the now-uncommitted change onto a new branch (or an isolated worktree) rather than piling more edits onto `main`. It should surface that editing on `main` shouldn't continue. (Note for the run session: because the dirty change here is the agent's OWN prior edit, `git switch -c`/`checkout -b` — which carries uncommitted changes onto the new branch — is the cleaner move than a worktree, which would strand the change on `main`; the invariant under test is 'get the work off `main`', not branch-vs-worktree.) Seed ceiling: a text seed describes 'on main' but cannot inject real git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in evaluating-skills. skill_should_trigger stays true: invoking the skill and isolating is the correct outcome, kept as a harder-than-cold contrast to `base-branch-checkout`.",
|
|
73
|
+
"assertions": [
|
|
74
|
+
{
|
|
75
|
+
"id": "stops_editing_on_main",
|
|
76
|
+
"type": "llm_judge",
|
|
77
|
+
"rubric": "The seeded transcript shows the agent already edited on `main` and is now asked to keep going. Does the agent STOP editing directly on `main` and move the work off the base branch (a new branch or an isolated worktree) before/instead of making the requested timeout + retry-on-503 changes in place? PASS if it isolates off `main` first and does not continue editing on the base branch. FAIL if it carries on editing `src/api/client.ts` while still on `main`, treating 'while you're in there' as license to keep going."
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"id": "names_base_branch_hard_rule",
|
|
81
|
+
"type": "llm_judge",
|
|
82
|
+
"rubric": "Does the agent recognize and apply the base-branch hard rule — i.e. acknowledge that changes should not be on `main` and act to isolate — rather than silently complying under the momentum of the prior in-place edit? PASS if it explicitly isolates because it's on a base branch. FAIL if it never registers that working on `main` is the problem."
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
]
|
|
87
|
+
}
|
|
@@ -7,15 +7,39 @@ description: Use when creating new skills or editing existing skills. Drafting o
|
|
|
7
7
|
|
|
8
8
|
## Overview
|
|
9
9
|
|
|
10
|
-
Skill development has two phases: **drafting** (this skill) and **evaluation**
|
|
10
|
+
Skill development has two phases: **drafting** (this skill) and **evaluation**
|
|
11
|
+
(`slow-powers:evaluating-skills`). This skill is your template for authoring a new skill and
|
|
12
|
+
your checklist for auditing an existing one — it covers structure, building blocks, description
|
|
13
|
+
writing, and rationalization-proofing.
|
|
11
14
|
|
|
12
|
-
A behavioral draft you didn't measure is a claim you didn't verify. After drafting, hand off to
|
|
15
|
+
A behavioral draft you didn't measure is a claim you didn't verify. After drafting, hand off to
|
|
16
|
+
`slow-powers:evaluating-skills` to decide whether the change is behavior-shaping (measure it) or
|
|
17
|
+
deterministic instruction-following (declare the decision and reasoning, then skip). Default to
|
|
18
|
+
measuring; the skip is a narrow, announced exception, not an escape hatch.
|
|
13
19
|
|
|
14
|
-
|
|
20
|
+
## What is a skill?
|
|
21
|
+
|
|
22
|
+
A skill is a reusable reference guide for a proven technique, pattern, or tool — **not** a
|
|
23
|
+
narrative about how you solved a problem once ("In session 2025-10-03 we found…" is too tied to
|
|
24
|
+
a moment to reuse).
|
|
25
|
+
|
|
26
|
+
**Create a skill when:** the technique wasn't intuitively obvious, you'd reference it again
|
|
27
|
+
across projects, and the pattern applies broadly.
|
|
28
|
+
|
|
29
|
+
**Don't create one for:** one-off solutions, standard practices documented elsewhere,
|
|
30
|
+
project-specific conventions (put those in CLAUDE.md / AGENTS.md), or mechanical constraints a
|
|
31
|
+
regex or validation could enforce — automate those instead.
|
|
15
32
|
|
|
16
|
-
##
|
|
33
|
+
## Skill types
|
|
34
|
+
|
|
35
|
+
- **Technique** — concrete method with steps (condition-based-waiting, root-cause-tracing).
|
|
36
|
+
- **Pattern** — a way of thinking about problems (flatten-with-flags, test-invariants).
|
|
37
|
+
- **Reference** — API docs, syntax guides, tool documentation.
|
|
17
38
|
|
|
18
|
-
|
|
39
|
+
## Cross-harness vocabulary
|
|
40
|
+
|
|
41
|
+
Skills may ship across harnesses, so they should describe *capabilities*, not platform tool names.
|
|
42
|
+
Use these terms as the canonical vocabulary reference.
|
|
19
43
|
|
|
20
44
|
| Term | Means | Don't say |
|
|
21
45
|
|------|-------|-----------|
|
|
@@ -25,30 +49,6 @@ Skills describe capabilities, not platform tool names. When you write a skill, u
|
|
|
25
49
|
| **Capability** | A described action ("search file contents") | A platform tool name ("Grep") |
|
|
26
50
|
| **Load-bearing property** | A property a capability must have for the workflow to work | (no shorter form) |
|
|
27
51
|
|
|
28
|
-
## What is a skill?
|
|
29
|
-
|
|
30
|
-
A skill is a reference guide for proven techniques, patterns, or tools. Skills help future agents find and apply effective approaches.
|
|
31
|
-
|
|
32
|
-
**Skills are:** reusable techniques, patterns, tools, reference guides.
|
|
33
|
-
**Skills are not:** narratives about how you solved a problem once.
|
|
34
|
-
|
|
35
|
-
**Create a skill when:**
|
|
36
|
-
- The technique wasn't intuitively obvious
|
|
37
|
-
- You'd reference it again across projects
|
|
38
|
-
- The pattern applies broadly (not project-specific)
|
|
39
|
-
|
|
40
|
-
**Don't create one for:**
|
|
41
|
-
- One-off solutions
|
|
42
|
-
- Standard practices well-documented elsewhere
|
|
43
|
-
- Project-specific conventions (put those in CLAUDE.md / AGENTS.md)
|
|
44
|
-
- Mechanical constraints — if a regex or validation can enforce it, automate it instead
|
|
45
|
-
|
|
46
|
-
## Skill types
|
|
47
|
-
|
|
48
|
-
- **Technique** — concrete method with steps to follow (condition-based-waiting, root-cause-tracing)
|
|
49
|
-
- **Pattern** — way of thinking about problems (flatten-with-flags, test-invariants)
|
|
50
|
-
- **Reference** — API docs, syntax guides, tool documentation
|
|
51
|
-
|
|
52
52
|
## SKILL.md structure
|
|
53
53
|
|
|
54
54
|
```markdown
|
|
@@ -59,39 +59,87 @@ description: Use when [specific triggering conditions and symptoms]
|
|
|
59
59
|
|
|
60
60
|
# Skill Name
|
|
61
61
|
|
|
62
|
-
## Overview
|
|
63
|
-
|
|
62
|
+
## Overview — what is this? Core principle in 1-2 sentences.
|
|
63
|
+
## When to use — symptoms and use cases; when NOT to use.
|
|
64
|
+
## Core pattern — before/after comparison (techniques/patterns).
|
|
65
|
+
## Quick reference — table or bullets for scanning common operations.
|
|
66
|
+
## Implementation — inline code for simple patterns; link a file for heavy reference.
|
|
67
|
+
## Common mistakes — what goes wrong + fixes.
|
|
68
|
+
```
|
|
64
69
|
|
|
65
|
-
|
|
66
|
-
|
|
70
|
+
**Frontmatter rules:**
|
|
71
|
+
- Two required fields, `name` and `description`, max 1024 characters total. See
|
|
72
|
+
[agentskills.io/specification](https://agentskills.io/specification) for the full schema.
|
|
73
|
+
- `name`: lowercase letters, numbers, hyphens only.
|
|
74
|
+
- `description`: third person, triggering conditions only — see "Writing the description".
|
|
67
75
|
|
|
68
|
-
##
|
|
69
|
-
Before/after comparison.
|
|
76
|
+
## Building blocks
|
|
70
77
|
|
|
71
|
-
|
|
72
|
-
|
|
78
|
+
The blocks below help structure a SKILL.md file. Use the ones that fit — not every skill
|
|
79
|
+
needs all of them. These aren't limits; your skill should contain whatever content it needs.
|
|
73
80
|
|
|
74
|
-
|
|
75
|
-
Inline code for simple patterns; link to a file for heavy reference.
|
|
81
|
+
Each block does one job:
|
|
76
82
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
```
|
|
83
|
+
- **Gotchas** *(any skill)* — environment-specific facts that defy reasonable assumptions, so
|
|
84
|
+
the agent reads them *before* hitting the trap. These correct factual mistakes, not motivation:
|
|
80
85
|
|
|
81
|
-
|
|
82
|
-
-
|
|
83
|
-
|
|
84
|
-
|
|
86
|
+
> - The `users` table uses soft deletes — queries need `WHERE deleted_at IS NULL`.
|
|
87
|
+
> - The ID is `user_id` in the DB, `uid` in auth, `accountId` in billing — same value.
|
|
88
|
+
|
|
89
|
+
Keep gotchas inline; when an agent makes a mistake you have to correct, add it here.
|
|
85
90
|
|
|
86
|
-
|
|
91
|
+
- **Red flags / rationalization table** *(discipline skills only)* — these look like gotchas but
|
|
92
|
+
are **not** the same: gotchas correct *facts*, red flags counter *motivated reasoning* under
|
|
93
|
+
pressure, and red flags come from eval pressure-testing rather than domain knowledge. See
|
|
94
|
+
"Rationalization-proofing" below for how to build them.
|
|
87
95
|
|
|
88
|
-
|
|
96
|
+
- **Quick-reference table** — for scanning common operations. Tables and lists, not prose.
|
|
89
97
|
|
|
90
|
-
|
|
98
|
+
- **Checklist** *(multi-step skills)* — when steps have dependencies or validation gates, give a
|
|
99
|
+
checklist the agent copies into its task tracker and ticks off, so it can't skip a gate.
|
|
91
100
|
|
|
92
|
-
|
|
101
|
+
- **Code examples** — **one excellent example beats many mediocre ones.** Pick the language that
|
|
102
|
+
fits the domain (testing → TS/JS, system debugging → shell/Python). A good example is
|
|
103
|
+
complete, runnable, commented on the WHY, from a real scenario, ready to adapt. Don't
|
|
104
|
+
reimplement it in five languages — agents port well, and multi-language dilution means
|
|
105
|
+
mediocre quality everywhere plus maintenance burden on every change.
|
|
106
|
+
|
|
107
|
+
## Flowchart usage
|
|
93
108
|
|
|
94
|
-
|
|
109
|
+
Use a small inline flowchart **only** when the decision is non-obvious, there's a process loop
|
|
110
|
+
where you might stop too early, or it's an "A vs B" branch where the wrong choice has
|
|
111
|
+
consequences. Don't use flowcharts for reference material (use tables/lists), code (use code
|
|
112
|
+
blocks — `step1[import fs]` can't be copy-pasted), linear instructions (use numbered
|
|
113
|
+
lists), or labels without semantic meaning (`step1`, `helper2` — labels should carry meaning).
|
|
114
|
+
|
|
115
|
+
Write flowcharts as **mermaid** (` ```mermaid ` blocks) — it renders natively in GitHub and most
|
|
116
|
+
editors, so no tooling or dependency is needed to preview. Shape carries meaning:
|
|
117
|
+
|
|
118
|
+
| Meaning | Mermaid |
|
|
119
|
+
|---|---|
|
|
120
|
+
| Question / decision | `id{Label}` |
|
|
121
|
+
| Action | `id[Label]` |
|
|
122
|
+
| State / situation | `id(Label)` |
|
|
123
|
+
| Warning / STOP | `id{{Label}}` (hexagon) |
|
|
124
|
+
| Entry / exit | `id([Label])` (stadium) |
|
|
125
|
+
| Edge with label | `A -->\|x\| B` |
|
|
126
|
+
| Trigger / dotted edge | `A -.->\|x\| B` |
|
|
127
|
+
|
|
128
|
+
Quote any label containing `[ ] : ( ) /` or `'` with `"..."`, e.g.
|
|
129
|
+
`done(["Respond (including clarifications)"])`.
|
|
130
|
+
|
|
131
|
+
## Writing the description
|
|
132
|
+
|
|
133
|
+
The description is how agents (and the skill mechanism) decide whether to load your skill. Make
|
|
134
|
+
it answer one question: *should I read this skill right now?*
|
|
135
|
+
|
|
136
|
+
**Description = WHEN, not WHAT.** Do not summarize the skill's workflow. Testing has repeatedly
|
|
137
|
+
shown that when the description summarizes the process, agents follow the description instead of
|
|
138
|
+
reading the skill. A description saying "code review between tasks" caused an agent to do ONE
|
|
139
|
+
review even though the skill body described TWO; changing it to "Use when executing
|
|
140
|
+
implementation plans with independent tasks" — no workflow summary — produced the correct
|
|
141
|
+
two-stage behavior. The trap: workflow summaries create a shortcut, and the skill body becomes
|
|
142
|
+
documentation the agent skips.
|
|
95
143
|
|
|
96
144
|
```yaml
|
|
97
145
|
# ❌ Summarizes workflow — agent may follow this instead of reading the skill
|
|
@@ -101,206 +149,142 @@ description: Use when executing plans — dispatches subagent per task with code
|
|
|
101
149
|
description: Use when executing implementation plans with independent tasks in the current session
|
|
102
150
|
```
|
|
103
151
|
|
|
104
|
-
Other
|
|
105
|
-
- Start with "Use when
|
|
106
|
-
|
|
107
|
-
- Describe the *problem* (race conditions, timing dependencies) not
|
|
152
|
+
Other rules:
|
|
153
|
+
- Start with "Use when…" and write in third person — descriptions are injected into the system
|
|
154
|
+
prompt.
|
|
155
|
+
- Describe the *problem* (race conditions, timing dependencies), not language-specific symptoms
|
|
156
|
+
(`setTimeout`, `sleep`) unless the skill is technology-specific.
|
|
157
|
+
- **Keyword coverage:** use words an agent would actually search for — error messages ("Hook
|
|
158
|
+
timed out", "ENOTEMPTY"), symptoms ("flaky", "hanging"), synonyms ("timeout / hang / freeze").
|
|
108
159
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
160
|
+
> **Note — this is a deliberate house stance.** External sources disagree on descriptions:
|
|
161
|
+
> Anthropic says include *what the skill does* plus when; agentskills favors imperative,
|
|
162
|
+
> user-intent phrasing. Because there's no shared standard, we maintain our WHEN-not-WHAT rule.
|
|
163
|
+
> The load-bearing part is **no workflow summary**.
|
|
112
164
|
|
|
113
165
|
### Naming
|
|
114
166
|
|
|
115
|
-
Active voice, verb-first
|
|
167
|
+
Active voice, verb-first; gerunds (-ing) work well for processes. Name by what you DO or the
|
|
168
|
+
core insight, not the surface category.
|
|
116
169
|
|
|
117
170
|
- ✅ `creating-skills`, `condition-based-waiting`, `root-cause-tracing`
|
|
118
171
|
- ❌ `skill-creation`, `async-test-helpers`, `debugging-techniques`
|
|
119
172
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
### Token efficiency
|
|
123
|
-
|
|
124
|
-
Once a skill is loaded, every token in it competes with conversation history. For frequently-loaded skills, aim for under 200 words total; for other skills, keep the body lean and offload heavy reference to separate files.
|
|
125
|
-
|
|
126
|
-
Techniques:
|
|
127
|
-
- **Move details to tool help.** "Run `<tool> --help` for filter flags" beats listing every flag.
|
|
128
|
-
- **Use cross-references.** Don't repeat what another skill says — link to it.
|
|
129
|
-
- **Compress examples.** One good before/after pair is enough; cut the surrounding prose.
|
|
130
|
-
|
|
131
|
-
### Cross-referencing other skills
|
|
173
|
+
## Cross-referencing other skills
|
|
132
174
|
|
|
133
175
|
Use the skill's qualified name with an explicit requirement marker:
|
|
134
176
|
|
|
135
177
|
- ✅ `**REQUIRED SUB-SKILL:** Use slow-powers:test-driven-development`
|
|
136
178
|
- ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:systematic-debugging`
|
|
179
|
+
- ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:systematic-debugging`
|
|
180
|
+
- ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:systematic-debugging next`
|
|
137
181
|
- ❌ `See skills/testing/test-driven-development` — unclear if required, harness-specific path
|
|
138
|
-
- ❌ `@skills/testing/test-driven-development/SKILL.md` — force-loads
|
|
139
|
-
|
|
140
|
-
The `@` prefix force-loads the file on session start, consuming context before you need it.
|
|
141
|
-
|
|
142
|
-
## Flowchart usage
|
|
182
|
+
- ❌ `@skills/testing/test-driven-development/SKILL.md` — the `@` prefix force-loads the file on
|
|
183
|
+
session start, burning context before you need it.
|
|
143
184
|
|
|
144
|
-
|
|
145
|
-
- The decision is non-obvious
|
|
146
|
-
- There's a process loop where you might stop too early
|
|
147
|
-
- It's an "A vs B" branch where the wrong choice has consequences
|
|
185
|
+
Don't repeat what another skill says — link to it.
|
|
148
186
|
|
|
149
|
-
|
|
150
|
-
- Reference material — use tables or lists
|
|
151
|
-
- Code examples — use markdown code blocks
|
|
152
|
-
- Linear instructions — use numbered lists
|
|
153
|
-
- Labels without semantic meaning (`step1`, `helper2`)
|
|
187
|
+
## Conciseness & file organization
|
|
154
188
|
|
|
155
|
-
|
|
189
|
+
Once a skill loads, every token competes with conversation history. Keep the body lean: aim for
|
|
190
|
+
**≤200 lines** for frequently-loaded internal skills, and treat **500 lines / 5,000 tokens** as
|
|
191
|
+
the hard ceiling for any skill. Move details to tool help ("Run `<tool> --help` for flags" beats
|
|
192
|
+
listing every flag), cross-reference instead of repeating, and compress examples to one good
|
|
193
|
+
pair.
|
|
156
194
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
**One excellent example beats many mediocre ones.**
|
|
162
|
-
|
|
163
|
-
Choose the most relevant language for the skill's domain — testing techniques tend to land best in TypeScript/JavaScript, system debugging in shell or Python, data processing in Python.
|
|
164
|
-
|
|
165
|
-
A good example is:
|
|
166
|
-
- Complete and runnable
|
|
167
|
-
- Well-commented on the WHY, not the WHAT
|
|
168
|
-
- From a real scenario
|
|
169
|
-
- Ready to adapt, not a fill-in-the-blank template
|
|
170
|
-
|
|
171
|
-
Don't implement the same example in five languages. Agents are good at porting — one excellent example is enough.
|
|
172
|
-
|
|
173
|
-
## File organization
|
|
174
|
-
|
|
175
|
-
Keep most skills self-contained in a single SKILL.md. Add supporting files only when one of these is true:
|
|
195
|
+
Use progressive disclosure for anything heavy: SKILL.md is the always-loaded overview; bulky
|
|
196
|
+
material lives in separate files the agent loads on demand. Tell the agent *when* to load each
|
|
197
|
+
("Read `api-reference.md` if the API returns non-200") rather than a generic "see references/".
|
|
176
198
|
|
|
177
199
|
```
|
|
178
|
-
self-contained/
|
|
179
|
-
SKILL.md
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
SKILL.md
|
|
183
|
-
example.ts
|
|
184
|
-
|
|
185
|
-
with-heavy-reference/ # SKILL.md + bulky reference docs
|
|
186
|
-
SKILL.md
|
|
187
|
-
api-reference.md # 500+ lines of API docs
|
|
188
|
-
scripts/ # executable utilities
|
|
200
|
+
self-contained/ with-reusable-tool/ with-heavy-reference/
|
|
201
|
+
SKILL.md SKILL.md SKILL.md
|
|
202
|
+
example.ts api-reference.md # 100+ lines of API docs
|
|
203
|
+
scripts/ # executable utilities
|
|
189
204
|
```
|
|
190
205
|
|
|
191
|
-
Separate files are warranted for
|
|
192
|
-
|
|
193
|
-
2. Reusable executable tools — scripts that adapt across projects
|
|
194
|
-
|
|
195
|
-
Otherwise keep content inline. Principles, concepts, code patterns under ~50 lines — all inline.
|
|
206
|
+
Separate files are warranted only for **heavy reference (100+ lines)** or **reusable executable
|
|
207
|
+
tools**. Principles, concepts, and code patterns under ~50 lines stay inline.
|
|
196
208
|
|
|
197
209
|
## Rationalization-proofing for discipline skills
|
|
198
210
|
|
|
199
|
-
Skills that enforce discipline (TDD, verification-before-completion, designing-before-coding)
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
211
|
+
Skills that enforce discipline (TDD, verification-before-completion, designing-before-coding)
|
|
212
|
+
must survive pressure — agents find loopholes under time, sunk-cost, or authority pressure.
|
|
213
|
+
Drafting an enforceable rule differs from drafting a guideline. The research backs this up:
|
|
214
|
+
persuasion techniques more than double LLM compliance under pressure. See
|
|
215
|
+
`persuasion-principles.md` for the seven principles, when each applies, and citations (Cialdini,
|
|
216
|
+
2021; Meincke et al., 2025).
|
|
204
217
|
|
|
205
|
-
State the rule, then forbid the specific workarounds you
|
|
218
|
+
**Close every loophole explicitly.** State the rule, then forbid the specific workarounds you
|
|
219
|
+
can predict — the agent will reach for the ambiguity under pressure.
|
|
206
220
|
|
|
207
221
|
```markdown
|
|
208
|
-
❌ Write code before test? Delete it.
|
|
209
|
-
|
|
210
222
|
✅ Write code before test? Delete it. Start over.
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
- Don't keep it as "reference"
|
|
214
|
-
- Don't "adapt" it while writing tests
|
|
215
|
-
- Delete means delete.
|
|
223
|
+
No exceptions: don't keep it as "reference", don't "adapt" it while writing tests.
|
|
224
|
+
Delete means delete.
|
|
216
225
|
```
|
|
217
226
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
State the foundational principle early, before the agent reaches for it:
|
|
227
|
+
**Address "spirit vs letter" early**, before the agent reaches for it:
|
|
221
228
|
|
|
222
229
|
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
223
230
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
These tables and lists come *from* the eval iteration loop — they're not something you can write up front. The eval surfaces the specific excuses an agent reaches for when the rule fails under pressure. Capture them verbatim and bake them back into the skill:
|
|
231
|
+
**Build the rationalization table and red-flags list *from* the eval loop** — they aren't
|
|
232
|
+
something you write up front. The eval surfaces the specific excuses an agent reaches for; capture
|
|
233
|
+
them verbatim and bake them back in:
|
|
229
234
|
|
|
230
235
|
```markdown
|
|
231
236
|
| Excuse | Reality |
|
|
232
237
|
|--------|---------|
|
|
233
238
|
| "Too simple to test" | Simple code breaks. Test takes 30 seconds. |
|
|
234
239
|
| "I'll test after" | Tests passing immediately prove nothing. |
|
|
235
|
-
| "Tests after achieve the same goals" | Tests-after = "what does this do?" Tests-first = "what should this do?" |
|
|
236
|
-
```
|
|
237
240
|
|
|
238
|
-
```markdown
|
|
239
241
|
## Red flags — STOP and start over
|
|
240
|
-
|
|
241
242
|
- Code before test
|
|
242
243
|
- "I already manually tested it"
|
|
243
|
-
- "
|
|
244
|
-
- "It's about spirit not ritual"
|
|
245
|
-
- "This is different because..."
|
|
246
|
-
|
|
247
|
-
All of these mean: delete code. Start over with TDD.
|
|
248
|
-
```
|
|
249
|
-
|
|
250
|
-
See `slow-powers:evaluating-skills` and its `pressure-scenarios.md` for the pressure-type taxonomy and how to write prompts that actually stress the rule (rather than letting the agent recite the skill and "pass" without proving anything). The mid-session rationalizations that belong in these tables surface most reliably from *seeded* eval cases — ones that embed a prior transcript so the agent meets the rule already committed to skipping it; see "Seeding conversation context" in that skill.
|
|
251
|
-
|
|
252
|
-
## Anti-patterns
|
|
253
|
-
|
|
254
|
-
### ❌ Narrative example
|
|
255
|
-
> "In session 2025-10-03, we found empty projectDir caused..."
|
|
256
|
-
|
|
257
|
-
Too specific to a moment in time. Not reusable.
|
|
258
|
-
|
|
259
|
-
### ❌ Multi-language dilution
|
|
260
|
-
`example-js.js`, `example-py.py`, `example-go.go`
|
|
261
|
-
|
|
262
|
-
Mediocre quality across all of them, maintenance burden on every change.
|
|
263
|
-
|
|
264
|
-
### ❌ Code in flowcharts
|
|
265
|
-
```
|
|
266
|
-
step1 [label="import fs"];
|
|
267
|
-
step2 [label="read file"];
|
|
244
|
+
- "This is different because…"
|
|
268
245
|
```
|
|
269
246
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
Labels should carry semantic meaning.
|
|
247
|
+
The mid-session rationalizations that belong here surface most reliably from *seeded* eval cases
|
|
248
|
+
— ones that embed a prior transcript so the agent meets the rule already committed to skipping
|
|
249
|
+
it. See `slow-powers:evaluating-skills` (`pressure-scenarios.md` and "Seeding conversation
|
|
250
|
+
context") for the pressure taxonomy and how to seed a case.
|
|
276
251
|
|
|
277
|
-
##
|
|
252
|
+
## Validation checklist
|
|
278
253
|
|
|
279
|
-
Use your persistent task tracker — one task per item.
|
|
254
|
+
Use your persistent task tracker — one task per item. This checklist works for authoring a new skill or
|
|
255
|
+
auditing an existing one.
|
|
280
256
|
|
|
281
257
|
**Draft:**
|
|
282
|
-
- [ ] Name uses only letters, numbers, hyphens
|
|
283
|
-
- [ ]
|
|
284
|
-
- [ ] Description starts with "Use when
|
|
285
|
-
|
|
286
|
-
- [ ] Body keeps to one excellent example per concept
|
|
287
|
-
- [ ] Heavy reference and reusable tools live in separate files; principles
|
|
258
|
+
- [ ] Name uses only lowercase letters, numbers, hyphens
|
|
259
|
+
- [ ] Frontmatter has `name` and `description` (under 1024 chars total)
|
|
260
|
+
- [ ] Description starts with "Use when…", is third person, includes triggers/symptoms, and
|
|
261
|
+
contains NO workflow summary
|
|
262
|
+
- [ ] Body keeps to one excellent example per concept; no narrative-of-one-session content
|
|
263
|
+
- [ ] Heavy reference (100+ lines) and reusable tools live in separate files; principles inline
|
|
264
|
+
- [ ] Flowcharts only for non-obvious decisions/loops/branches; semantic labels, no code
|
|
288
265
|
- [ ] Cross-references use `slow-powers:<skill-name>`, not file paths or `@` imports
|
|
266
|
+
- [ ] Body is lean (≤200 lines preferred, 500 max)
|
|
289
267
|
|
|
290
268
|
**Validate** (handoff to `slow-powers:evaluating-skills`):
|
|
291
|
-
- [ ] Decide whether the change is behavior-shaping or deterministic, and announce the decision
|
|
292
|
-
|
|
293
|
-
- [ ]
|
|
294
|
-
|
|
295
|
-
- [ ]
|
|
269
|
+
- [ ] Decide whether the change is behavior-shaping or deterministic, and announce the decision
|
|
270
|
+
and reasoning (see "Choosing to test with evals"). Default to behavior-shaping when unsure.
|
|
271
|
+
- [ ] If behavior-shaping (or the user opts in): author `evals/evals.json` with 2–3 realistic
|
|
272
|
+
prompts
|
|
273
|
+
- [ ] For discipline-enforcing skills, write pressure prompts combining multiple pressures, plus
|
|
274
|
+
at least one **seeded** case (embeds a prior transcript) alongside a cold contrast case
|
|
275
|
+
- [ ] Run the eval. Iterate until the with-skill pass rate is materially higher than baseline.
|
|
296
276
|
|
|
297
277
|
**Deploy:**
|
|
298
|
-
- [ ] Commit the skill (and its `evals/evals.json`, when
|
|
299
|
-
- [ ] In the PR
|
|
278
|
+
- [ ] Commit the skill (and its `evals/evals.json`, when authored) together
|
|
279
|
+
- [ ] In the PR, include before/after eval results — or, for a deterministic change, the stated
|
|
280
|
+
decision and reasoning for skipping the eval (per repo CLAUDE.md)
|
|
300
281
|
|
|
301
282
|
## Further reading
|
|
302
283
|
|
|
303
284
|
- `slow-powers:evaluating-skills` — phase 2: measuring whether the draft works
|
|
304
|
-
- `persuasion-principles.md` (in this skill) — research foundation for discipline
|
|
305
|
-
-
|
|
306
|
-
|
|
285
|
+
- `persuasion-principles.md` (in this skill) — research foundation for discipline language
|
|
286
|
+
- [agentskills.io best-practices](https://agentskills.io/skill-creation/best-practices) and
|
|
287
|
+
[optimizing-descriptions](https://agentskills.io/skill-creation/optimizing-descriptions) —
|
|
288
|
+
harness-agnostic depth on patterns and description testing
|
|
289
|
+
- [Anthropic Agent Skills best practices](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/best-practices)
|
|
290
|
+
— degrees-of-freedom, progressive disclosure, and script-bundling depth
|
package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "loads_systematic_debugging_skill",
|
|
5
|
-
"passed": false,
|
|
6
|
-
"evidence": "no tool invocation matched /(Skill.*systematic-debugging|view_file.*systematic-debugging)/ across 5 invocation(s)",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "invokes_debugging_only",
|
|
12
|
-
"passed": false,
|
|
13
|
-
"evidence": "The agent loaded 'slow-powers-eval-1-new_skill__hardening-plans' (tool_invocations ordinal 1) and wrote a five-step implementation plan before invoking it. The systematic-debugging skill was never loaded at any point in the run.",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
}
|
|
17
|
-
],
|
|
18
|
-
"summary": {
|
|
19
|
-
"passed": 0,
|
|
20
|
-
"failed": 2,
|
|
21
|
-
"total": 2,
|
|
22
|
-
"pass_rate": 0
|
|
23
|
-
}
|
|
24
|
-
}
|
package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "loads_systematic_debugging_skill",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 1: Skill {\"skill\":\"slow-powers:systematic-debugging\"}",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "invokes_debugging_only",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "Tool invocation ordinal 1 shows the agent called the Skill tool with 'slow-powers:systematic-debugging'; no invocation of 'hardening-plans' appears anywhere in the tool list; the final_message asks for the file, a reproduction case, and logs — debugging triage, not a plan.",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
}
|
|
17
|
-
],
|
|
18
|
-
"summary": {
|
|
19
|
-
"passed": 2,
|
|
20
|
-
"failed": 0,
|
|
21
|
-
"total": 2,
|
|
22
|
-
"pass_rate": 1
|
|
23
|
-
}
|
|
24
|
-
}
|