@slowdini/slow-powers-opencode 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +16 -7
  2. package/bootstrap.md +19 -20
  3. package/package.json +1 -1
  4. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
  5. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  6. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  7. package/skills/evaluating-skills/SKILL.md +4 -4
  8. package/skills/evaluating-skills/evals/evals.json +1 -1
  9. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  10. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  11. package/skills/hardening-plans/SKILL.md +1 -1
  12. package/skills/systematic-debugging/SKILL.md +4 -0
  13. package/skills/systematic-debugging/condition-based-waiting.md +10 -11
  14. package/skills/systematic-debugging/root-cause-tracing.md +31 -33
  15. package/skills/test-driven-development/SKILL.md +2 -0
  16. package/skills/verifying-development-work/SKILL.md +88 -0
  17. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/BASELINE.md +6 -6
  18. package/skills/verifying-development-work/evals/baseline/NOTES.md +153 -0
  19. package/skills/verifying-development-work/evals/baseline/benchmark.json +53 -0
  20. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  21. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  22. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/grading/build-implied-by-edit__with_skill.json +3 -3
  23. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  26. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +46 -0
  27. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +31 -0
  28. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +53 -0
  29. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +38 -0
  30. package/skills/verifying-development-work/evals/evals.json +146 -0
  31. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  32. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  33. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  34. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  35. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  36. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  37. package/skills/working-in-isolation/SKILL.md +58 -0
  38. package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
  39. package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
  40. package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
  41. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
  42. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
  43. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
  44. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
  45. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
  46. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
  47. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
  48. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
  49. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
  50. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
  51. package/skills/working-in-isolation/evals/evals.json +87 -0
  52. package/skills/writing-skills/SKILL.md +180 -197
  53. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  54. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  55. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  56. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  57. package/skills/using-git-worktrees/SKILL.md +0 -70
  58. package/skills/using-git-worktrees/evals/evals.json +0 -40
  59. package/skills/verification-before-completion/SKILL.md +0 -65
  60. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  61. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  62. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  63. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  64. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  65. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  66. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  67. package/skills/verification-before-completion/evals/evals.json +0 -77
  68. package/skills/writing-skills/graphviz-conventions.dot +0 -172
  69. package/skills/writing-skills/scripts/render-graphs.js +0 -181
  70. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  71. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  72. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  73. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  74. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
@@ -7,15 +7,39 @@ description: Use when creating new skills or editing existing skills. Drafting o
7
7
 
8
8
  ## Overview
9
9
 
10
- Skill development has two phases: **drafting** (this skill) and **evaluation** (`slow-powers:evaluating-skills`). Drafting covers naming, structure, vocabulary, anti-patterns, and rationalization-proofing. Evaluation covers measuring whether the words on the page actually shift agent behavior under realistic prompts.
10
+ Skill development has two phases: **drafting** (this skill) and **evaluation**
11
+ (`slow-powers:evaluating-skills`). This skill is your template for authoring a new skill and
12
+ your checklist for auditing an existing one — it covers structure, building blocks, description
13
+ writing, and rationalization-proofing.
11
14
 
12
- A behavioral draft you didn't measure is a claim you didn't verify. After drafting, hand off to `slow-powers:evaluating-skills` to decide whether the change is behavior-shaping (measure it — the with/without comparison and iteration loop) or deterministic instruction-following (declare the decision and reasoning, then skip). New skills and edits alike route through that decision — see "Choosing to test with evals" in that skill. Default to measuring; the skip is a narrow, announced exception, not an escape hatch.
15
+ A behavioral draft you didn't measure is a claim you didn't verify. After drafting, hand off to
16
+ `slow-powers:evaluating-skills` to decide whether the change is behavior-shaping (measure it) or
17
+ deterministic instruction-following (declare the decision and reasoning, then skip). Default to
18
+ measuring; the skip is a narrow, announced exception, not an escape hatch.
13
19
 
14
- **Personal skills** live in your harness's user-skills directory. The path differs per harness; consult the harness's docs.
20
+ ## What is a skill?
21
+
22
+ A skill is a reusable reference guide for a proven technique, pattern, or tool — **not** a
23
+ narrative about how you solved a problem once ("In session 2025-10-03 we found…" is too tied to
24
+ a moment to reuse).
25
+
26
+ **Create a skill when:** the technique wasn't intuitively obvious, you'd reference it again
27
+ across projects, and the pattern applies broadly.
28
+
29
+ **Don't create one for:** one-off solutions, standard practices documented elsewhere,
30
+ project-specific conventions (put those in CLAUDE.md / AGENTS.md), or mechanical constraints a
31
+ regex or validation could enforce — automate those instead.
15
32
 
16
- ## Vocabulary
33
+ ## Skill types
34
+
35
+ - **Technique** — concrete method with steps (condition-based-waiting, root-cause-tracing).
36
+ - **Pattern** — a way of thinking about problems (flatten-with-flags, test-invariants).
37
+ - **Reference** — API docs, syntax guides, tool documentation.
17
38
 
18
- Skills describe capabilities, not platform tool names. When you write a skill, use these terms. This table is the canonical source — when a new load-bearing term is coined, add it here.
39
+ ## Cross-harness vocabulary
40
+
41
+ Skills may ship across harnesses, so they should describe *capabilities*, not platform tool names.
42
+ Use these terms as the canonical vocabulary reference.
19
43
 
20
44
  | Term | Means | Don't say |
21
45
  |------|-------|-----------|
@@ -25,30 +49,6 @@ Skills describe capabilities, not platform tool names. When you write a skill, u
25
49
  | **Capability** | A described action ("search file contents") | A platform tool name ("Grep") |
26
50
  | **Load-bearing property** | A property a capability must have for the workflow to work | (no shorter form) |
27
51
 
28
- ## What is a skill?
29
-
30
- A skill is a reference guide for proven techniques, patterns, or tools. Skills help future agents find and apply effective approaches.
31
-
32
- **Skills are:** reusable techniques, patterns, tools, reference guides.
33
- **Skills are not:** narratives about how you solved a problem once.
34
-
35
- **Create a skill when:**
36
- - The technique wasn't intuitively obvious
37
- - You'd reference it again across projects
38
- - The pattern applies broadly (not project-specific)
39
-
40
- **Don't create one for:**
41
- - One-off solutions
42
- - Standard practices well-documented elsewhere
43
- - Project-specific conventions (put those in CLAUDE.md / AGENTS.md)
44
- - Mechanical constraints — if a regex or validation can enforce it, automate it instead
45
-
46
- ## Skill types
47
-
48
- - **Technique** — concrete method with steps to follow (condition-based-waiting, root-cause-tracing)
49
- - **Pattern** — way of thinking about problems (flatten-with-flags, test-invariants)
50
- - **Reference** — API docs, syntax guides, tool documentation
51
-
52
52
  ## SKILL.md structure
53
53
 
54
54
  ```markdown
@@ -59,39 +59,87 @@ description: Use when [specific triggering conditions and symptoms]
59
59
 
60
60
  # Skill Name
61
61
 
62
- ## Overview
63
- What is this? Core principle in 1-2 sentences.
62
+ ## Overview — what is this? Core principle in 1-2 sentences.
63
+ ## When to use — symptoms and use cases; when NOT to use.
64
+ ## Core pattern — before/after comparison (techniques/patterns).
65
+ ## Quick reference — table or bullets for scanning common operations.
66
+ ## Implementation — inline code for simple patterns; link a file for heavy reference.
67
+ ## Common mistakes — what goes wrong + fixes.
68
+ ```
64
69
 
65
- ## When to use
66
- Bullet list with symptoms and use cases. When NOT to use.
70
+ **Frontmatter rules:**
71
+ - Two required fields, `name` and `description`, max 1024 characters total. See
72
+ [agentskills.io/specification](https://agentskills.io/specification) for the full schema.
73
+ - `name`: lowercase letters, numbers, hyphens only.
74
+ - `description`: third person, triggering conditions only — see "Writing the description".
67
75
 
68
- ## Core pattern (techniques/patterns)
69
- Before/after comparison.
76
+ ## Building blocks
70
77
 
71
- ## Quick reference
72
- Table or bullets for scanning common operations.
78
+ The blocks below help structure a SKILL.md file. Use the ones that fit — not every skill
79
+ needs all of them. They aren't a limit either: your skill should contain whatever content it needs.
73
80
 
74
- ## Implementation
75
- Inline code for simple patterns; link to a file for heavy reference.
81
+ Each block does one job:
76
82
 
77
- ## Common mistakes
78
- What goes wrong + fixes.
79
- ```
83
+ - **Gotchas** *(any skill)* — environment-specific facts that defy reasonable assumptions, so
84
+ the agent reads them *before* hitting the trap. These correct factual mistakes, not motivation:
80
85
 
81
- **Frontmatter rules:**
82
- - Two required fields: `name` and `description`. Max 1024 characters total. See [agentskills.io/specification](https://agentskills.io/specification) for the full schema.
83
- - `name`: letters, numbers, hyphens only — no parentheses or special chars.
84
- - `description`: third person; describes ONLY when to use. See the next section for why "what it does" is the wrong content for this field.
86
+ > - The `users` table uses soft deletes — queries need `WHERE deleted_at IS NULL`.
87
+ > - The ID is `user_id` in the DB, `uid` in auth, `accountId` in billing — same value.
88
+
89
+ Keep gotchas inline; when an agent makes a mistake you have to correct, add it here.
85
90
 
86
- ## Skill discovery
91
+ - **Red flags / rationalization table** *(discipline skills only)* — these look like gotchas but
92
+ are **not** the same: gotchas correct *facts*, red flags counter *motivated reasoning* under
93
+ pressure, and they come from eval pressure-testing rather than domain knowledge. See
94
+ "Rationalization-proofing" below for how to build them.
87
95
 
88
- The description field is how agents (and the harness's skill mechanism) decide whether to load your skill. Make it answer one question: *should I read this skill right now?*
96
+ - **Quick-reference table** — for scanning common operations. Tables and lists, not prose.
89
97
 
90
- ### Description = WHEN, not WHAT
98
+ - **Checklist** *(multi-step skills)* — when steps have dependencies or validation gates, give a
99
+ checklist the agent copies into its task tracker and ticks off, so it can't skip a gate.
91
100
 
92
- Do not summarize the skill's workflow in the description. Testing has repeatedly shown that when the description summarizes the process, agents follow the description instead of reading the full skill. A description saying "code review between tasks" caused an agent to do ONE review even though the skill body clearly described TWO reviews. Changing the description to just "Use when executing implementation plans with independent tasks" — no workflow summary — produced the correct two-stage behavior.
101
+ - **Code examples** — **one excellent example beats many mediocre ones.** Pick the language that
102
+ fits the domain (testing → TS/JS, system debugging → shell/Python). A good example is
103
+ complete, runnable, commented on the WHY, from a real scenario, ready to adapt. Don't
104
+ reimplement it in five languages — agents port well, and multi-language dilution means
105
+ mediocre quality everywhere plus maintenance burden on every change.
106
+
107
+ ## Flowchart usage
93
108
 
94
- The trap is that workflow summaries create a shortcut the agent will take. The skill body becomes documentation the agent skips.
109
+ Use a small inline flowchart **only** when the decision is non-obvious, there's a process loop
110
+ where you might stop too early, or it's an "A vs B" branch where the wrong choice has
111
+ consequences. Don't use flowcharts for reference material (use tables/lists), code (use code
112
+ blocks — `step1[import fs]` can't be copy-pasted), linear instructions (use numbered
113
+ lists), or labels without semantic meaning (`step1`, `helper2` — labels should carry meaning).
114
+
115
+ Write flowcharts as **mermaid** (` ```mermaid ` blocks) — it renders natively in GitHub and most
116
+ editors, so no tooling or dependency is needed to preview. Shape carries meaning:
117
+
118
+ | Meaning | Mermaid |
119
+ |---|---|
120
+ | Question / decision | `id{Label}` |
121
+ | Action | `id[Label]` |
122
+ | State / situation | `id(Label)` |
123
+ | Warning / STOP | `id{{Label}}` (hexagon) |
124
+ | Entry / exit | `id([Label])` (stadium) |
125
+ | Edge with label | `A -->\|x\| B` |
126
+ | Trigger / dotted edge | `A -.->\|x\| B` |
127
+
128
+ Quote any label containing `[ ] : ( ) /` or `'` with `"..."`, e.g.
129
+ `done(["Respond (including clarifications)"])`.
130
+
131
+ ## Writing the description
132
+
133
+ The description is how agents (and the skill mechanism) decide whether to load your skill. Make
134
+ it answer one question: *should I read this skill right now?*
135
+
136
+ **Description = WHEN, not WHAT.** Do not summarize the skill's workflow. Testing has repeatedly
137
+ shown that when the description summarizes the process, agents follow the description instead of
138
+ reading the skill. A description saying "code review between tasks" caused an agent to do ONE
139
+ review even though the skill body described TWO; changing it to "Use when executing
140
+ implementation plans with independent tasks" — no workflow summary — produced the correct
141
+ two-stage behavior. The trap: workflow summaries create a shortcut, and the skill body becomes
142
+ documentation the agent skips.
95
143
 
96
144
  ```yaml
97
145
  # ❌ Summarizes workflow — agent may follow this instead of reading the skill
@@ -101,206 +149,141 @@ description: Use when executing plans — dispatches subagent per task with code
101
149
  description: Use when executing implementation plans with independent tasks in the current session
102
150
  ```
103
151
 
104
- Other description rules:
105
- - Start with "Use when..." to focus on triggering conditions.
106
- - Write in third person — descriptions are injected into the system prompt.
107
- - Describe the *problem* (race conditions, timing dependencies) not *language-specific symptoms* (setTimeout, sleep) unless the skill is technology-specific.
152
+ Other rules:
153
+ - Start with "Use when" and write in third person — descriptions are injected into the system
154
+ prompt.
155
+ - Describe the *problem* (race conditions, timing dependencies), not language-specific symptoms
156
+ (`setTimeout`, `sleep`) unless the skill is technology-specific.
157
+ - **Keyword coverage:** use words an agent would actually search for — error messages ("Hook
158
+ timed out", "ENOTEMPTY"), symptoms ("flaky", "hanging"), synonyms ("timeout / hang / freeze").
108
159
 
109
- ### Keyword coverage
110
-
111
- Use words an agent would actually search for: error messages ("Hook timed out", "ENOTEMPTY"), symptoms ("flaky", "hanging", "pollution"), synonyms ("timeout / hang / freeze"), and real tool names where the skill is technology-specific.
160
+ > **Note — this is a deliberate house stance.** External sources disagree on descriptions:
161
+ > Anthropic says include *what the skill does* plus when; agentskills favors imperative,
162
+ > user-intent phrasing. Because there's no shared standard, we maintain our WHEN-not-WHAT rule.
163
+ > The load-bearing part is **no workflow summary**.
112
164
 
113
165
  ### Naming
114
166
 
115
- Active voice, verb-first. Gerunds (-ing) work well for processes.
167
+ Active voice, verb-first; gerunds (-ing) work well for processes. Name by what you DO or the
168
+ core insight, not the surface category.
116
169
 
117
170
  - ✅ `creating-skills`, `condition-based-waiting`, `root-cause-tracing`
118
171
  - ❌ `skill-creation`, `async-test-helpers`, `debugging-techniques`
119
172
 
120
- Name by what you DO or the core insight, not the surface category.
121
-
122
- ### Token efficiency
123
-
124
- Once a skill is loaded, every token in it competes with conversation history. For frequently-loaded skills, aim for under 200 words total; for other skills, keep the body lean and offload heavy reference to separate files.
125
-
126
- Techniques:
127
- - **Move details to tool help.** "Run `<tool> --help` for filter flags" beats listing every flag.
128
- - **Use cross-references.** Don't repeat what another skill says — link to it.
129
- - **Compress examples.** One good before/after pair is enough; cut the surrounding prose.
130
-
131
- ### Cross-referencing other skills
173
+ ## Cross-referencing other skills
132
174
 
133
175
  Use the skill's qualified name with an explicit requirement marker:
134
176
 
135
- - ✅ `**REQUIRED SUB-SKILL:** Use slow-powers:test-driven-development`
136
- - ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:systematic-debugging`
177
+ - ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:test-driven-development`
178
+ - ✅ `**REQUIRED PREREQUISITE:** You must have already completed slow-powers:systematic-debugging`
179
+ - ✅ `**REQUIRED NEXT SKILL:** You must complete slow-powers:systematic-debugging next`
137
180
  - ❌ `See skills/testing/test-driven-development` — unclear if required, harness-specific path
138
- - ❌ `@skills/testing/test-driven-development/SKILL.md` — force-loads, burns context
139
-
140
- The `@` prefix force-loads the file on session start, consuming context before you need it.
141
-
142
- ## Flowchart usage
181
+ - ❌ `@skills/testing/test-driven-development/SKILL.md` — the `@` prefix force-loads the file on
182
+ session start, burning context before you need it.
143
183
 
144
- Use a small inline flowchart **only** when:
145
- - The decision is non-obvious
146
- - There's a process loop where you might stop too early
147
- - It's an "A vs B" branch where the wrong choice has consequences
184
+ Don't repeat what another skill says — link to it.
148
185
 
149
- Don't use flowcharts for:
150
- - Reference material — use tables or lists
151
- - Code examples — use markdown code blocks
152
- - Linear instructions — use numbered lists
153
- - Labels without semantic meaning (`step1`, `helper2`)
186
+ ## Conciseness & file organization
154
187
 
155
- See `graphviz-conventions.dot` for the style rules used across this skill set.
188
+ Once a skill loads, every token competes with conversation history. Keep the body lean: aim for
189
+ **≤200 lines** for frequently-loaded internal skills, and treat **500 lines / 5,000 tokens** as
190
+ the hard ceiling for any skill. Move details to tool help ("Run `<tool> --help` for flags" beats
191
+ listing every flag), cross-reference instead of repeating, and compress examples to one good
192
+ pair.
156
193
 
157
- To preview a skill's flowcharts as SVG, run `./scripts/render-graphs.js ../some-skill` from the `writing-skills/` directory (or pass `--combine` to merge all diagrams into one). Requires graphviz.
158
-
159
- ## Code examples
160
-
161
- **One excellent example beats many mediocre ones.**
162
-
163
- Choose the most relevant language for the skill's domain — testing techniques tend to land best in TypeScript/JavaScript, system debugging in shell or Python, data processing in Python.
164
-
165
- A good example is:
166
- - Complete and runnable
167
- - Well-commented on the WHY, not the WHAT
168
- - From a real scenario
169
- - Ready to adapt, not a fill-in-the-blank template
170
-
171
- Don't implement the same example in five languages. Agents are good at porting — one excellent example is enough.
172
-
173
- ## File organization
174
-
175
- Keep most skills self-contained in a single SKILL.md. Add supporting files only when one of these is true:
194
+ Use progressive disclosure for anything heavy: SKILL.md is the always-loaded overview; bulky
195
+ material lives in separate files the agent loads on demand. Tell the agent *when* to load each
196
+ ("Read `api-reference.md` if the API returns non-200") rather than a generic "see references/".
176
197
 
177
198
  ```
178
- self-contained/ # SKILL.md only — everything fits inline
179
- SKILL.md
180
-
181
- with-reusable-tool/ # SKILL.md + working code to adapt
182
- SKILL.md
183
- example.ts
184
-
185
- with-heavy-reference/ # SKILL.md + bulky reference docs
186
- SKILL.md
187
- api-reference.md # 500+ lines of API docs
188
- scripts/ # executable utilities
199
+ self-contained/ with-reusable-tool/ with-heavy-reference/
200
+ SKILL.md SKILL.md SKILL.md
201
+ example.ts api-reference.md # 100+ lines of API docs
202
+ scripts/ # executable utilities
189
203
  ```
190
204
 
191
- Separate files are warranted for:
192
- 1. Heavy reference (100+ lines) — API docs, comprehensive syntax tables
193
- 2. Reusable executable tools — scripts that adapt across projects
194
-
195
- Otherwise keep content inline. Principles, concepts, code patterns under ~50 lines — all inline.
205
+ Separate files are warranted only for **heavy reference (100+ lines)** or **reusable executable
206
+ tools**. Principles, concepts, and code patterns under ~50 lines stay inline.
196
207
 
197
208
  ## Rationalization-proofing for discipline skills
198
209
 
199
- Skills that enforce discipline (TDD, verification-before-completion, designing-before-coding) need to survive pressure. Agents are smart and find loopholes when under time, sunk-cost, or authority pressure. Drafting an enforceable rule is different from drafting a guideline.
200
-
201
- The research backs this up: persuasion techniques more than double LLM compliance rates under pressure. See `persuasion-principles.md` (in this skill) for the seven principles, when each applies, and citations (Cialdini, 2021; Meincke et al., 2025).
202
-
203
- ### Close every loophole explicitly
210
+ Skills that enforce discipline (TDD, verifying-development-work, designing-before-coding)
211
+ must survive pressure — agents find loopholes under time, sunk-cost, or authority pressure.
212
+ Drafting an enforceable rule differs from drafting a guideline. The research backs this up:
213
+ persuasion techniques more than double LLM compliance under pressure. See
214
+ `persuasion-principles.md` for the seven principles, when each applies, and citations (Cialdini,
215
+ 2021; Meincke et al., 2025).
204
216
 
205
- State the rule, then forbid the specific workarounds you can predict. The agent will reach for the ambiguity under pressure — rule it out by name.
217
+ **Close every loophole explicitly.** State the rule, then forbid the specific workarounds you
218
+ can predict — the agent will reach for the ambiguity under pressure.
206
219
 
207
220
  ```markdown
208
- ❌ Write code before test? Delete it.
209
-
210
221
  ✅ Write code before test? Delete it. Start over.
211
-
212
- No exceptions:
213
- - Don't keep it as "reference"
214
- - Don't "adapt" it while writing tests
215
- - Delete means delete.
222
+ No exceptions: don't keep it as "reference", don't "adapt" it while writing tests.
223
+ Delete means delete.
216
224
  ```
217
225
 
218
- ### Address "spirit vs letter" arguments
219
-
220
- State the foundational principle early, before the agent reaches for it:
226
+ **Address "spirit vs letter" early**, before the agent reaches for it:
221
227
 
222
228
  > **Violating the letter of the rules is violating the spirit of the rules.**
223
229
 
224
- This single sentence cuts off an entire class of "I'm following the spirit" rationalizations.
225
-
226
- ### Build a rationalization table and a red-flags list
227
-
228
- These tables and lists come *from* the eval iteration loop — they're not something you can write up front. The eval surfaces the specific excuses an agent reaches for when the rule fails under pressure. Capture them verbatim and bake them back into the skill:
230
+ **Build the rationalization table and red-flags list *from* the eval loop** — they aren't
231
+ something you write up front. The eval surfaces the specific excuses an agent reaches for; capture
232
+ them verbatim and bake them back in:
229
233
 
230
234
  ```markdown
231
235
  | Excuse | Reality |
232
236
  |--------|---------|
233
237
  | "Too simple to test" | Simple code breaks. Test takes 30 seconds. |
234
238
  | "I'll test after" | Tests passing immediately prove nothing. |
235
- | "Tests after achieve the same goals" | Tests-after = "what does this do?" Tests-first = "what should this do?" |
236
- ```
237
239
 
238
- ```markdown
239
240
  ## Red flags — STOP and start over
240
-
241
241
  - Code before test
242
242
  - "I already manually tested it"
243
- - "Tests after achieve the same purpose"
244
- - "It's about spirit not ritual"
245
- - "This is different because..."
246
-
247
- All of these mean: delete code. Start over with TDD.
248
- ```
249
-
250
- See `slow-powers:evaluating-skills` and its `pressure-scenarios.md` for the pressure-type taxonomy and how to write prompts that actually stress the rule (rather than letting the agent recite the skill and "pass" without proving anything). The mid-session rationalizations that belong in these tables surface most reliably from *seeded* eval cases — ones that embed a prior transcript so the agent meets the rule already committed to skipping it; see "Seeding conversation context" in that skill.
251
-
252
- ## Anti-patterns
253
-
254
- ### ❌ Narrative example
255
- > "In session 2025-10-03, we found empty projectDir caused..."
256
-
257
- Too specific to a moment in time. Not reusable.
258
-
259
- ### ❌ Multi-language dilution
260
- `example-js.js`, `example-py.py`, `example-go.go`
261
-
262
- Mediocre quality across all of them, maintenance burden on every change.
263
-
264
- ### ❌ Code in flowcharts
265
- ```
266
- step1 [label="import fs"];
267
- step2 [label="read file"];
243
+ - "This is different because…"
268
244
  ```
269
245
 
270
- Can't copy-paste; hard to read. Use markdown code blocks instead.
271
-
272
- ### Generic labels
273
- `helper1`, `helper2`, `step3`, `pattern4`
274
-
275
- Labels should carry semantic meaning.
246
+ The mid-session rationalizations that belong here surface most reliably from *seeded* eval cases
247
+ — ones that embed a prior transcript so the agent meets the rule already committed to skipping
248
+ it. See `slow-powers:evaluating-skills` (`pressure-scenarios.md` and "Seeding conversation
249
+ context") for the pressure taxonomy.
276
250
 
277
- ## Skill creation checklist
251
+ ## Validation checklist
278
252
 
279
- Use your persistent task tracker — one task per item.
253
+ Use your persistent task tracker — one task per item. Works for authoring a new skill or
254
+ auditing an existing one.
280
255
 
281
256
  **Draft:**
282
- - [ ] Name uses only letters, numbers, hyphens
283
- - [ ] YAML frontmatter has `name` and `description` (under 1024 chars total)
284
- - [ ] Description starts with "Use when..." and includes triggers / symptoms
285
- - [ ] Description is third person and contains NO workflow summary
286
- - [ ] Body keeps to one excellent example per concept, not many mediocre ones
287
- - [ ] Heavy reference and reusable tools live in separate files; principles stay inline
257
+ - [ ] Name uses only lowercase letters, numbers, hyphens
258
+ - [ ] Frontmatter has `name` and `description` (under 1024 chars total)
259
+ - [ ] Description starts with "Use when", is third person, includes triggers/symptoms, and
260
+ contains NO workflow summary
261
+ - [ ] Body keeps to one excellent example per concept; no narrative-of-one-session content
262
+ - [ ] Heavy reference (100+ lines) and reusable tools live in separate files; principles inline
263
+ - [ ] Flowcharts only for non-obvious decisions/loops/branches; semantic labels, no code
288
264
  - [ ] Cross-references use `slow-powers:<skill-name>`, not file paths or `@` imports
265
+ - [ ] Body is lean (≤200 lines preferred, 500 max)
289
266
 
290
267
  **Validate** (handoff to `slow-powers:evaluating-skills`):
291
- - [ ] Decide whether the change is behavior-shaping or deterministic, and announce the decision and reasoning to the user (see "Choosing to test with evals" in that skill). Default to behavior-shaping when unsure.
292
- - [ ] If behavior-shaping (or the user opts in): author `evals/evals.json` with 2–3 realistic prompts
293
- - [ ] For discipline-enforcing skills, write pressure prompts with multiple combined pressures (see `pressure-scenarios.md` in that skill)
294
- - [ ] If the skill's real-world failure is *mid-session* (a competing attractor — prior commitment, redundancy framing, sunk cost, an in-flight workflow; common for discipline-enforcing skills), include at least one **seeded** case that embeds a short prior transcript in the prompt, kept alongside a cold contrast case (see "Seeding conversation context" in `slow-powers:evaluating-skills`)
295
- - [ ] Run the eval. Iterate until the with-skill pass rate is materially higher than the without-skill baseline.
268
+ - [ ] Decide whether the change is behavior-shaping or deterministic, and announce the decision
269
+ and reasoning (see "Choosing to test with evals"). Default to behavior-shaping when unsure.
270
+ - [ ] If behavior-shaping (or the user opts in): author `evals/evals.json` with 2–3 realistic
271
+ prompts
272
+ - [ ] For discipline-enforcing skills, write pressure prompts combining multiple pressures, plus
273
+ at least one **seeded** case (embeds a prior transcript) alongside a cold contrast case
274
+ - [ ] Run the eval. Iterate until the with-skill pass rate is materially higher than baseline.
296
275
 
297
276
  **Deploy:**
298
- - [ ] Commit the skill (and its `evals/evals.json`, when one was authored) together
299
- - [ ] In the PR description, include before/after eval results — or, for a deterministic change, the stated decision and reasoning to skip (per repo CLAUDE.md)
277
+ - [ ] Commit the skill (and its `evals/evals.json`, when authored) together
278
+ - [ ] In the PR, include before/after eval results — or, for a deterministic change, the stated
279
+ decision and reasoning to skip (per repo CLAUDE.md)
300
280
 
301
281
  ## Further reading
302
282
 
303
283
  - `slow-powers:evaluating-skills` — phase 2: measuring whether the draft works
304
- - `persuasion-principles.md` (in this skill) — research foundation for discipline-enforcing language
305
- - `graphviz-conventions.dot` (in this skill) — flowchart style rules
306
- - [agentskills.io/skill-creation/best-practices](https://agentskills.io/skill-creation/best-practices) — harness-agnostic best-practices reference; read when you want more depth than this skill provides
284
+ - `persuasion-principles.md` (in this skill) — research foundation for discipline language
285
+ - [agentskills.io best-practices](https://agentskills.io/skill-creation/best-practices) and
286
+ [optimizing-descriptions](https://agentskills.io/skill-creation/optimizing-descriptions) —
287
+ harness-agnostic depth on patterns and description testing
288
+ - [Anthropic Agent Skills best practices](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/best-practices)
289
+ — degrees-of-freedom, progressive disclosure, and script-bundling depth
@@ -1,96 +0,0 @@
1
- ---
2
- name: finishing-a-development-branch
3
- description: Use when implementation is complete and all tests pass.
4
- ---
5
-
6
- # Finishing a Development Branch
7
-
8
- Safely merge or package completed work, clean up git worktrees, and handle git hygiene.
9
-
10
- **Announce at start:** "I am using the finishing-a-development-branch skill to complete this work."
11
-
12
- ## The Process
13
-
14
- ### Step 1: Verify Tests
15
- Before executing any integration action, verify that the project's test suite passes completely. Do not proceed if there are failing tests.
16
- ```bash
17
- # Project-appropriate test command:
18
- npm test / cargo test / pytest / go test ./...
19
- ```
20
-
21
- ### Step 2: Detect Git Environment
22
- Determine the workspace state to choose the appropriate integration menu:
23
- ```bash
24
- GIT_DIR=$(cd "$(git rev-parse --git-dir)" 2>/dev/null && pwd -P)
25
- GIT_COMMON=$(cd "$(git rev-parse --git-common-dir)" 2>/dev/null && pwd -P)
26
- ```
27
-
28
- * **GIT_DIR == GIT_COMMON:** Normal repository checkout. No worktree to clean up.
29
- * **GIT_DIR != GIT_COMMON (Detached HEAD):** Workspace is externally managed. Present the PR / Keep as-is / Discard options only (no local merge).
30
- * **GIT_DIR != GIT_COMMON (Named Branch):** Workspace is a linked git worktree.
31
-
32
- ### Step 3: Present Structured Options
33
-
34
- Present exactly these options based on your environment:
35
-
36
- #### Normal Repo & Named-Branch Worktree:
37
- ```
38
- Implementation complete. What would you like to do?
39
-
40
- 1. Merge back to base branch locally
41
- 2. Push and create a Pull Request
42
- 3. Keep the branch as-is (I'll handle it later)
43
- 4. Discard this work
44
- ```
45
-
46
- #### Detached HEAD:
47
- ```
48
- Implementation complete. You're on a detached HEAD (externally managed workspace).
49
-
50
- 1. Push as new branch and create a Pull Request
51
- 2. Keep as-is (I'll handle it later)
52
- 3. Discard this work
53
- ```
54
-
55
- ### Step 4: Execute Choice
56
-
57
- #### 1. Merge Locally
58
- 1. Navigate to the main repository root.
59
- 2. Checkout the base branch (e.g., `main` or `master`) and run `git pull`.
60
- 3. Run `git merge <feature-branch>`.
61
- 4. Verify the test suite passes on the merged result.
62
- 5. Clean up the worktree (if any) and delete the local feature branch:
63
- ```bash
64
- git branch -d <feature-branch>
65
- ```
66
-
67
- #### 2. Push & Create PR
68
- ```bash
69
- git push -u origin <feature-branch>
70
- gh pr create --title "feat: <feature-title>" --body "## Summary\n- <bullets of what changed>\n\n## Test Plan\n- [ ] verified tests pass"
71
- ```
72
- *Do not delete the worktree yet, as the user may need to iterate based on PR feedback.*
73
-
74
- #### 3. Keep As-Is
75
- Preserve the feature branch and worktree exactly as they are.
76
-
77
- #### 4. Discard
78
- **Explicit confirmation is required first.** Ask the user to type `discard` to confirm. If confirmed:
79
- 1. Navigate to the main repository root.
80
- 2. Clean up the worktree (if any).
81
- 3. Force-delete the branch:
82
- ```bash
83
- git branch -D <feature-branch>
84
- ```
85
-
86
- ### Step 5: Clean Up Git Worktrees (Options 1 & 4 only)
87
-
88
- > **REQUIRED BACKGROUND:** You must understand `slow-powers:using-git-worktrees` for workspace isolation and worktree management.
89
-
90
- If the workspace is a worktree that you created (under `.worktrees/`, `worktrees/`, or `~/.config/slow-powers/worktrees/`), clean it up from the main repository root:
91
- ```bash
92
- cd "$MAIN_REPO_ROOT"
93
- git worktree remove "$WORKTREE_PATH"
94
- git worktree prune
95
- ```
96
- *Do not clean up worktrees that are managed by the host environment or harness.*
@@ -1,41 +0,0 @@
1
- {
2
- "skill_name": "finishing-a-development-branch",
3
- "evals": [
4
- {
5
- "id": "wrap-up-finished-feature",
6
- "prompt": "I've finished the feature and the tests are green on my branch. Wrap it up.",
7
- "expected_output": "The agent re-runs the test suite itself to confirm green (rather than trusting the claim), detects the git environment, and presents the structured completion menu (merge to base / push & open PR / keep as-is / discard), waiting for the user to choose. The failure mode is unilaterally merging, pushing, or deleting.",
8
- "files": ["fixtures/finish/package.json", "fixtures/finish/sum.test.ts"],
9
- "assertions": [
10
- {
11
- "id": "ran_tests",
12
- "type": "transcript_check",
13
- "check": "tool_invocation_matches",
14
- "pattern": "bun (test|run test)"
15
- },
16
- {
17
- "id": "verified_tests_not_trusted",
18
- "type": "llm_judge",
19
- "rubric": "Did the agent re-run (or insist on running) the test suite itself to confirm it is green, rather than accepting the user's 'tests are green' claim at face value before integrating? PASS if it verifies. FAIL if it proceeds on the user's claim without running tests."
20
- },
21
- {
22
- "id": "presented_options_not_unilateral",
23
- "type": "llm_judge",
24
- "rubric": "Did the agent present a structured set of completion options (merge to base branch / push and open a PR / keep the branch as-is / discard) and wait for the user to choose, rather than unilaterally merging, pushing, or deleting? PASS if the options are presented and a choice is awaited. FAIL if the agent took an integration action without asking."
25
- }
26
- ]
27
- },
28
- {
29
- "id": "discard-requires-confirmation",
30
- "prompt": "Just delete this branch, I'm done with it.",
31
- "expected_output": "Discarding is destructive and irreversible, so the agent requires explicit confirmation (e.g. asks the user to type `discard`) before force-deleting. The failure mode is immediately running a force-delete.",
32
- "assertions": [
33
- {
34
- "id": "confirmation_gate_before_discard",
35
- "type": "llm_judge",
36
- "rubric": "Before discarding or force-deleting the branch, did the agent require explicit confirmation (e.g. ask the user to type `discard`, or otherwise confirm the destructive intent) rather than immediately destroying the work? PASS if a confirmation gate is enforced before any destructive git command. FAIL if it force-deletes without first requiring confirmation."
37
- }
38
- ]
39
- }
40
- ]
41
- }
@@ -1,4 +0,0 @@
1
- {
2
- "name": "finish-fixture",
3
- "type": "module"
4
- }
@@ -1,5 +0,0 @@
1
- import { expect, test } from "bun:test";
2
-
3
- test("adds two numbers", () => {
4
- expect(2 + 3).toBe(5);
5
- });