@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: writing-skills
|
|
3
|
+
description: Use when creating new skills or editing existing skills. Drafting only; see slow-powers:evaluating-skills for measuring whether the draft works.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Writing Skills
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
|
|
10
|
+
Skill development has two phases: **drafting** (this skill) and **evaluation** (`slow-powers:evaluating-skills`). Drafting covers naming, structure, vocabulary, anti-patterns, and rationalization-proofing. Evaluation covers measuring whether the words on the page actually shift agent behavior under realistic prompts.
|
|
11
|
+
|
|
12
|
+
A behavioral draft you didn't measure is a claim you didn't verify. After drafting, hand off to `slow-powers:evaluating-skills` to decide whether the change is behavior-shaping (measure it — the with/without comparison and iteration loop) or deterministic instruction-following (declare the decision and reasoning, then skip). New skills and edits alike route through that decision — see "Choosing to test with evals" in that skill. Default to measuring; the skip is a narrow, announced exception, not an escape hatch.
|
|
13
|
+
|
|
14
|
+
**Personal skills** live in your harness's user-skills directory. The path differs per harness; consult the harness's docs.
|
|
15
|
+
|
|
16
|
+
## Vocabulary
|
|
17
|
+
|
|
18
|
+
Skills describe capabilities, not platform tool names. When you write a skill, use these terms. This table is the canonical source — when a new load-bearing term is coined, add it here.
|
|
19
|
+
|
|
20
|
+
| Term | Means | Don't say |
|
|
21
|
+
|------|-------|-----------|
|
|
22
|
+
| **Skill mechanism** | The platform's dedicated skill loader | "Skill tool" (Claude-specific) |
|
|
23
|
+
| **Persistent task tracker** | A todo tool whose state survives subagent dispatches and context churn | "TodoWrite", "write_todos" |
|
|
24
|
+
| **General-purpose subagent** | A subagent without a specialized role | "Task tool", "@generalist" |
|
|
25
|
+
| **Capability** | A described action ("search file contents") | A platform tool name ("Grep") |
|
|
26
|
+
| **Load-bearing property** | A property a capability must have for the workflow to work | (no shorter form) |
|
|
27
|
+
|
|
28
|
+
## What is a skill?
|
|
29
|
+
|
|
30
|
+
A skill is a reference guide for proven techniques, patterns, or tools. Skills help future agents find and apply effective approaches.
|
|
31
|
+
|
|
32
|
+
**Skills are:** reusable techniques, patterns, tools, reference guides.
|
|
33
|
+
**Skills are not:** narratives about how you solved a problem once.
|
|
34
|
+
|
|
35
|
+
**Create a skill when:**
|
|
36
|
+
- The technique wasn't intuitively obvious
|
|
37
|
+
- You'd reference it again across projects
|
|
38
|
+
- The pattern applies broadly (not project-specific)
|
|
39
|
+
|
|
40
|
+
**Don't create one for:**
|
|
41
|
+
- One-off solutions
|
|
42
|
+
- Standard practices well-documented elsewhere
|
|
43
|
+
- Project-specific conventions (put those in CLAUDE.md / AGENTS.md)
|
|
44
|
+
- Mechanical constraints — if a regex or validation can enforce it, automate it instead
|
|
45
|
+
|
|
46
|
+
## Skill types
|
|
47
|
+
|
|
48
|
+
- **Technique** — concrete method with steps to follow (condition-based-waiting, root-cause-tracing)
|
|
49
|
+
- **Pattern** — way of thinking about problems (flatten-with-flags, test-invariants)
|
|
50
|
+
- **Reference** — API docs, syntax guides, tool documentation
|
|
51
|
+
|
|
52
|
+
## SKILL.md structure
|
|
53
|
+
|
|
54
|
+
```markdown
|
|
55
|
+
---
|
|
56
|
+
name: skill-name-with-hyphens
|
|
57
|
+
description: Use when [specific triggering conditions and symptoms]
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
# Skill Name
|
|
61
|
+
|
|
62
|
+
## Overview
|
|
63
|
+
What is this? Core principle in 1-2 sentences.
|
|
64
|
+
|
|
65
|
+
## When to use
|
|
66
|
+
Bullet list with symptoms and use cases. When NOT to use.
|
|
67
|
+
|
|
68
|
+
## Core pattern (techniques/patterns)
|
|
69
|
+
Before/after comparison.
|
|
70
|
+
|
|
71
|
+
## Quick reference
|
|
72
|
+
Table or bullets for scanning common operations.
|
|
73
|
+
|
|
74
|
+
## Implementation
|
|
75
|
+
Inline code for simple patterns; link to a file for heavy reference.
|
|
76
|
+
|
|
77
|
+
## Common mistakes
|
|
78
|
+
What goes wrong + fixes.
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Frontmatter rules:**
|
|
82
|
+
- Two required fields: `name` and `description`. Keep the frontmatter to at most 1024 characters in total. See [agentskills.io/specification](https://agentskills.io/specification) for the full schema.
|
|
83
|
+
- `name`: letters, numbers, hyphens only — no parentheses or special chars.
|
|
84
|
+
- `description`: third person; describes ONLY when to use. See the next section for why "what it does" is the wrong content for this field.
|
|
85
|
+
|
|
86
|
+
## Skill discovery
|
|
87
|
+
|
|
88
|
+
The description field is how agents (and the harness's skill mechanism) decide whether to load your skill. Make it answer one question: *should I read this skill right now?*
|
|
89
|
+
|
|
90
|
+
### Description = WHEN, not WHAT
|
|
91
|
+
|
|
92
|
+
Do not summarize the skill's workflow in the description. Testing has repeatedly shown that when the description summarizes the process, agents follow the description instead of reading the full skill. A description saying "code review between tasks" caused an agent to do ONE review even though the skill body clearly described TWO reviews. Changing the description to just "Use when executing implementation plans with independent tasks" — no workflow summary — produced the correct two-stage behavior.
|
|
93
|
+
|
|
94
|
+
The trap is that workflow summaries create a shortcut the agent will take. The skill body becomes documentation the agent skips.
|
|
95
|
+
|
|
96
|
+
```yaml
|
|
97
|
+
# ❌ Summarizes workflow — agent may follow this instead of reading the skill
|
|
98
|
+
description: Use when executing plans — dispatches subagent per task with code review between tasks
|
|
99
|
+
|
|
100
|
+
# ✅ Triggering conditions only
|
|
101
|
+
description: Use when executing implementation plans with independent tasks in the current session
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Other description rules:
|
|
105
|
+
- Start with "Use when..." to focus on triggering conditions.
|
|
106
|
+
- Write in third person — descriptions are injected into the system prompt.
|
|
107
|
+
- Describe the *problem* (race conditions, timing dependencies), not *language-specific symptoms* (setTimeout, sleep), unless the skill is technology-specific.
|
|
108
|
+
|
|
109
|
+
### Keyword coverage
|
|
110
|
+
|
|
111
|
+
Use words an agent would actually search for: error messages ("Hook timed out", "ENOTEMPTY"), symptoms ("flaky", "hanging", "pollution"), synonyms ("timeout / hang / freeze"), and real tool names where the skill is technology-specific.
|
|
112
|
+
|
|
113
|
+
### Naming
|
|
114
|
+
|
|
115
|
+
Active voice, built around the action being performed. Gerunds (-ing) work well for processes.
|
|
116
|
+
|
|
117
|
+
- ✅ `creating-skills`, `condition-based-waiting`, `root-cause-tracing`
|
|
118
|
+
- ❌ `skill-creation`, `async-test-helpers`, `debugging-techniques`
|
|
119
|
+
|
|
120
|
+
Name by what you DO or the core insight, not the surface category.
|
|
121
|
+
|
|
122
|
+
### Token efficiency
|
|
123
|
+
|
|
124
|
+
Once a skill is loaded, every token in it competes with conversation history. For frequently-loaded skills, aim for under 200 words total; for other skills, keep the body lean and offload heavy reference to separate files.
|
|
125
|
+
|
|
126
|
+
Techniques:
|
|
127
|
+
- **Move details to tool help.** "Run `<tool> --help` for filter flags" beats listing every flag.
|
|
128
|
+
- **Use cross-references.** Don't repeat what another skill says — link to it.
|
|
129
|
+
- **Compress examples.** One good before/after pair is enough; cut the surrounding prose.
|
|
130
|
+
|
|
131
|
+
### Cross-referencing other skills
|
|
132
|
+
|
|
133
|
+
Use the skill's qualified name with an explicit requirement marker:
|
|
134
|
+
|
|
135
|
+
- ✅ `**REQUIRED SUB-SKILL:** Use slow-powers:test-driven-development`
|
|
136
|
+
- ✅ `**REQUIRED BACKGROUND:** You must understand slow-powers:systematic-debugging`
|
|
137
|
+
- ❌ `See skills/testing/test-driven-development` — unclear if required, harness-specific path
|
|
138
|
+
- ❌ `@skills/testing/test-driven-development/SKILL.md` — force-loads, burns context
|
|
139
|
+
|
|
140
|
+
The `@` prefix force-loads the file on session start, consuming context before you need it.
|
|
141
|
+
|
|
142
|
+
## Flowchart usage
|
|
143
|
+
|
|
144
|
+
Use a small inline flowchart **only** when:
|
|
145
|
+
- The decision is non-obvious
|
|
146
|
+
- There's a process loop where you might stop too early
|
|
147
|
+
- It's an "A vs B" branch where the wrong choice has consequences
|
|
148
|
+
|
|
149
|
+
Don't use flowcharts for:
|
|
150
|
+
- Reference material — use tables or lists
|
|
151
|
+
- Code examples — use markdown code blocks
|
|
152
|
+
- Linear instructions — use numbered lists
|
|
153
|
+
- Labels without semantic meaning (`step1`, `helper2`)
|
|
154
|
+
|
|
155
|
+
See `graphviz-conventions.dot` for the style rules used across this skill set.
|
|
156
|
+
|
|
157
|
+
To preview a skill's flowcharts as SVG, run `./scripts/render-graphs.js ../some-skill` from the `writing-skills/` directory; add `--combine` to merge all diagrams into one. Requires Graphviz.
|
|
158
|
+
|
|
159
|
+
## Code examples
|
|
160
|
+
|
|
161
|
+
**One excellent example beats many mediocre ones.**
|
|
162
|
+
|
|
163
|
+
Choose the most relevant language for the skill's domain — testing techniques tend to land best in TypeScript/JavaScript, system debugging in shell or Python, data processing in Python.
|
|
164
|
+
|
|
165
|
+
A good example is:
|
|
166
|
+
- Complete and runnable
|
|
167
|
+
- Well-commented on the WHY, not the WHAT
|
|
168
|
+
- From a real scenario
|
|
169
|
+
- Ready to adapt, not a fill-in-the-blank template
|
|
170
|
+
|
|
171
|
+
Don't implement the same example in five languages. Agents are good at porting — one excellent example is enough.
|
|
172
|
+
|
|
173
|
+
## File organization
|
|
174
|
+
|
|
175
|
+
Keep most skills self-contained in a single SKILL.md. Add supporting files only when the skill needs reusable code or heavy reference material. Typical layouts:
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
self-contained/ # SKILL.md only — everything fits inline
|
|
179
|
+
SKILL.md
|
|
180
|
+
|
|
181
|
+
with-reusable-tool/ # SKILL.md + working code to adapt
|
|
182
|
+
SKILL.md
|
|
183
|
+
example.ts
|
|
184
|
+
|
|
185
|
+
with-heavy-reference/ # SKILL.md + bulky reference docs
|
|
186
|
+
SKILL.md
|
|
187
|
+
api-reference.md # 500+ lines of API docs
|
|
188
|
+
scripts/ # executable utilities
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Separate files are warranted for:
|
|
192
|
+
1. Heavy reference (100+ lines) — API docs, comprehensive syntax tables
|
|
193
|
+
2. Reusable executable tools — scripts that adapt across projects
|
|
194
|
+
|
|
195
|
+
Otherwise keep content inline. Principles, concepts, code patterns under ~50 lines — all inline.
|
|
196
|
+
|
|
197
|
+
## Rationalization-proofing for discipline skills
|
|
198
|
+
|
|
199
|
+
Skills that enforce discipline (TDD, verification-before-completion, designing-before-coding) need to survive pressure. Agents are smart and find loopholes when under time, sunk-cost, or authority pressure. Drafting an enforceable rule is different from drafting a guideline.
|
|
200
|
+
|
|
201
|
+
The research backs this up: persuasion techniques more than double LLM compliance rates under pressure. See `persuasion-principles.md` (in this skill) for the seven principles, when each applies, and citations (Cialdini, 2021; Meincke et al., 2025).
|
|
202
|
+
|
|
203
|
+
### Close every loophole explicitly
|
|
204
|
+
|
|
205
|
+
State the rule, then forbid the specific workarounds you can predict. The agent will reach for the ambiguity under pressure — rule it out by name.
|
|
206
|
+
|
|
207
|
+
```markdown
|
|
208
|
+
❌ Write code before test? Delete it.
|
|
209
|
+
|
|
210
|
+
✅ Write code before test? Delete it. Start over.
|
|
211
|
+
|
|
212
|
+
No exceptions:
|
|
213
|
+
- Don't keep it as "reference"
|
|
214
|
+
- Don't "adapt" it while writing tests
|
|
215
|
+
- Delete means delete.
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Address "spirit vs letter" arguments
|
|
219
|
+
|
|
220
|
+
State the foundational principle early, before the agent reaches for it:
|
|
221
|
+
|
|
222
|
+
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
223
|
+
|
|
224
|
+
This single sentence cuts off an entire class of "I'm following the spirit" rationalizations.
|
|
225
|
+
|
|
226
|
+
### Build a rationalization table and a red-flags list
|
|
227
|
+
|
|
228
|
+
These tables and lists come *from* the eval iteration loop — they're not something you can write up front. The eval surfaces the specific excuses an agent reaches for when the rule fails under pressure. Capture them verbatim and bake them back into the skill:
|
|
229
|
+
|
|
230
|
+
```markdown
|
|
231
|
+
| Excuse | Reality |
|
|
232
|
+
|--------|---------|
|
|
233
|
+
| "Too simple to test" | Simple code breaks. Test takes 30 seconds. |
|
|
234
|
+
| "I'll test after" | Tests passing immediately prove nothing. |
|
|
235
|
+
| "Tests after achieve the same goals" | Tests-after = "what does this do?" Tests-first = "what should this do?" |
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
```markdown
|
|
239
|
+
## Red flags — STOP and start over
|
|
240
|
+
|
|
241
|
+
- Code before test
|
|
242
|
+
- "I already manually tested it"
|
|
243
|
+
- "Tests after achieve the same purpose"
|
|
244
|
+
- "It's about spirit not ritual"
|
|
245
|
+
- "This is different because..."
|
|
246
|
+
|
|
247
|
+
All of these mean: delete code. Start over with TDD.
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
See `slow-powers:evaluating-skills` and its `pressure-scenarios.md` for the pressure-type taxonomy and how to write prompts that actually stress the rule (rather than letting the agent recite the skill and "pass" without proving anything). The mid-session rationalizations that belong in these tables surface most reliably from *seeded* eval cases — ones that embed a prior transcript so the agent meets the rule already committed to skipping it; see "Seeding conversation context" in that skill.
|
|
251
|
+
|
|
252
|
+
## Anti-patterns
|
|
253
|
+
|
|
254
|
+
### ❌ Narrative example
|
|
255
|
+
> "In session 2025-10-03, we found empty projectDir caused..."
|
|
256
|
+
|
|
257
|
+
Too specific to a moment in time. Not reusable.
|
|
258
|
+
|
|
259
|
+
### ❌ Multi-language dilution
|
|
260
|
+
`example-js.js`, `example-py.py`, `example-go.go`
|
|
261
|
+
|
|
262
|
+
Mediocre quality across all of them, maintenance burden on every change.
|
|
263
|
+
|
|
264
|
+
### ❌ Code in flowcharts
|
|
265
|
+
```
|
|
266
|
+
step1 [label="import fs"];
|
|
267
|
+
step2 [label="read file"];
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
Can't copy-paste; hard to read. Use markdown code blocks instead.
|
|
271
|
+
|
|
272
|
+
### ❌ Generic labels
|
|
273
|
+
`helper1`, `helper2`, `step3`, `pattern4`
|
|
274
|
+
|
|
275
|
+
Labels should carry semantic meaning.
|
|
276
|
+
|
|
277
|
+
## Skill creation checklist
|
|
278
|
+
|
|
279
|
+
Use your persistent task tracker — one task per item.
|
|
280
|
+
|
|
281
|
+
**Draft:**
|
|
282
|
+
- [ ] Name uses only letters, numbers, hyphens
|
|
283
|
+
- [ ] YAML frontmatter has `name` and `description` (under 1024 chars total)
|
|
284
|
+
- [ ] Description starts with "Use when..." and includes triggers / symptoms
|
|
285
|
+
- [ ] Description is third person and contains NO workflow summary
|
|
286
|
+
- [ ] Body keeps to one excellent example per concept, not many mediocre ones
|
|
287
|
+
- [ ] Heavy reference and reusable tools live in separate files; principles stay inline
|
|
288
|
+
- [ ] Cross-references use `slow-powers:<skill-name>`, not file paths or `@` imports
|
|
289
|
+
|
|
290
|
+
**Validate** (handoff to `slow-powers:evaluating-skills`):
|
|
291
|
+
- [ ] Decide whether the change is behavior-shaping or deterministic, and announce the decision and reasoning to the user (see "Choosing to test with evals" in that skill). Default to behavior-shaping when unsure.
|
|
292
|
+
- [ ] If behavior-shaping (or the user opts in): author `evals/evals.json` with 2–3 realistic prompts
|
|
293
|
+
- [ ] For discipline-enforcing skills, write pressure prompts with multiple combined pressures (see `pressure-scenarios.md` in that skill)
|
|
294
|
+
- [ ] If the skill's real-world failure is *mid-session* (a competing attractor — prior commitment, redundancy framing, sunk cost, an in-flight workflow; common for discipline-enforcing skills), include at least one **seeded** case that embeds a short prior transcript in the prompt, kept alongside a cold contrast case (see "Seeding conversation context" in `slow-powers:evaluating-skills`)
|
|
295
|
+
- [ ] Run the eval. Iterate until the with-skill pass rate is materially higher than the without-skill baseline.
|
|
296
|
+
|
|
297
|
+
**Deploy:**
|
|
298
|
+
- [ ] Commit the skill (and its `evals/evals.json`, when one was authored) together
|
|
299
|
+
- [ ] In the PR description, include before/after eval results — or, for a deterministic change, the stated decision and reasoning to skip (per repo CLAUDE.md)
|
|
300
|
+
|
|
301
|
+
## Further reading
|
|
302
|
+
|
|
303
|
+
- `slow-powers:evaluating-skills` — phase 2: measuring whether the draft works
|
|
304
|
+
- `persuasion-principles.md` (in this skill) — research foundation for discipline-enforcing language
|
|
305
|
+
- `graphviz-conventions.dot` (in this skill) — flowchart style rules
|
|
306
|
+
- [agentskills.io/skill-creation/best-practices](https://agentskills.io/skill-creation/best-practices) — harness-agnostic best-practices reference; read when you want more depth than this skill provides
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "writing-skills",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "capture-reusable-technique",
|
|
6
|
+
"prompt": "We've worked out a solid, reusable way to flush pending async test timers — I keep re-deriving it across projects. Capture it so it sticks.",
|
|
7
|
+
"expected_output": "The agent drafts a skill with correct frontmatter (a third-person `description` written as triggering conditions — 'Use when…' — with no workflow summary, and a hyphenated name), keeps to one focused example, and treats drafting as incomplete until validated, pointing at slow-powers:evaluating-skills rather than declaring the skill finished.",
|
|
8
|
+
"assertions": [
|
|
9
|
+
{
|
|
10
|
+
"id": "references_evaluating_skills",
|
|
11
|
+
"type": "transcript_check",
|
|
12
|
+
"check": "tool_invocation_matches",
|
|
13
|
+
"pattern": "(Skill.*evaluating-skills|view_file.*evaluating-skills)"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "description_is_when_not_what",
|
|
17
|
+
"type": "llm_judge",
|
|
18
|
+
"rubric": "If the agent drafted skill frontmatter, is the `description` written as triggering conditions ('Use when…') in the third person, WITHOUT summarizing the skill's workflow or steps? PASS if the description states when to use the skill and omits a process summary. FAIL if it summarizes the procedure, is written in the first person, or the name uses characters other than letters/numbers/hyphens."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "handoff_to_evaluation",
|
|
22
|
+
"type": "llm_judge",
|
|
23
|
+
"rubric": "Did the agent treat the draft as unverified rather than finished — routing to slow-powers:evaluating-skills to decide whether and how to measure it (an eval for behavior-shaping work, or a declared skip for deterministic work)? PASS if the agent hands off to the evaluation/eval-decision step instead of presenting the skill as done and ready to use. FAIL if the skill is declared finished with no validation or eval-decision step at all."
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"id": "project-convention-not-skill",
|
|
29
|
+
"prompt": "I keep re-explaining our repo's database-migration steps to you every session. Make that stick so you remember it.",
|
|
30
|
+
"expected_output": "Repo-specific migration steps are a project convention, not a reusable cross-project technique, so they belong in the repo's CLAUDE.md/AGENTS.md — not a skill. The agent should route this to project memory and decline to create a skill for it.",
|
|
31
|
+
"assertions": [
|
|
32
|
+
{
|
|
33
|
+
"id": "routes_to_project_memory_not_skill",
|
|
34
|
+
"type": "llm_judge",
|
|
35
|
+
"rubric": "Did the agent recognize that repo-specific migration steps are a project-specific convention that belongs in the repo's CLAUDE.md / AGENTS.md (project memory), and NOT propose creating a reusable cross-project skill for it? PASS if it routes to project docs and declines a skill. FAIL if it drafts or proposes a skill for this project-specific convention."
|
|
36
|
+
}
|
|
37
|
+
]
|
|
38
|
+
}
|
|
39
|
+
]
|
|
40
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
digraph STYLE_GUIDE {
|
|
2
|
+
// The style guide for our process DSL, written in the DSL itself
|
|
3
|
+
|
|
4
|
+
// Node type examples with their shapes
|
|
5
|
+
subgraph cluster_node_types {
    label = "NODE TYPES AND SHAPES";

    // Canonical node for each type. The shape carries the meaning:
    // diamond = question, box = action, plaintext = literal command,
    // ellipse = state, red octagon = warning, doublecircle = entry/exit.
    "Is this a question?"    [shape=diamond];
    "Take an action"         [shape=box];
    "git commit -m 'msg'"    [shape=plaintext];
    "Current state"          [shape=ellipse];
    "STOP: Critical warning" [
        shape=octagon,
        style=filled,
        fillcolor=red,
        fontcolor=white
    ];
    "Process starts"         [shape=doublecircle];
    "Process complete"       [shape=doublecircle];

    // The same types again, this time with realistic wording.
    "Is test passing?"       [shape=diamond];
    "Write test first"       [shape=box];
    "npm test"               [shape=plaintext];
    "I am stuck"             [shape=ellipse];
    "NEVER use git add -A"   [
        shape=octagon,
        style=filled,
        fillcolor=red,
        fontcolor=white
    ];
}
|
|
34
|
+
|
|
35
|
+
// Edge naming conventions
|
|
36
|
+
subgraph cluster_edge_types {
    label = "EDGE LABELS";

    // Most nodes in this cluster are plain action boxes, so box is the
    // cluster-local default; decisions and entry/exit nodes override it.
    node [shape=box];

    // Binary decisions: label the two branches "yes" and "no".
    "Binary decision?" [shape=diamond];
    "Yes path";
    "No path";

    "Binary decision?" -> "Yes path" [label="yes"];
    "Binary decision?" -> "No path"  [label="no"];

    // Multi-way decisions: label each branch with its condition and
    // finish with an "otherwise" catch-all.
    "Multiple choice?" [shape=diamond];
    "Option A";
    "Option B";
    "Option C";

    "Multiple choice?" -> "Option A" [label="condition A"];
    "Multiple choice?" -> "Option B" [label="condition B"];
    "Multiple choice?" -> "Option C" [label="otherwise"];

    // Hand-off between two processes: dotted edge labelled "triggers".
    "Process A done"   [shape=doublecircle];
    "Process B starts" [shape=doublecircle];

    "Process A done" -> "Process B starts" [style=dotted, label="triggers"];
}
|
|
60
|
+
|
|
61
|
+
// Naming patterns
|
|
62
|
+
subgraph cluster_naming_patterns {
    label="NAMING PATTERNS";

    // Every node below carries the shape this guide assigns to its type.
    // Without an explicit shape Graphviz draws an ellipse, which this guide
    // reserves for states -- so unshaped questions, actions and commands
    // would all be rendered as if they were states.

    // Questions end with ? (decisions -> diamond)
    "Should I do X?" [shape=diamond];
    "Can this be Y?" [shape=diamond];
    "Is Z true?" [shape=diamond];
    "Have I done W?" [shape=diamond];

    // Actions start with a verb (actions -> box)
    "Write the test" [shape=box];
    "Search for patterns" [shape=box];
    "Commit changes" [shape=box];
    "Ask for help" [shape=box];

    // Commands are the literal text to run (commands -> plaintext)
    "grep -r 'pattern' ." [shape=plaintext];
    "git status" [shape=plaintext];
    "npm run build" [shape=plaintext];

    // States describe the current situation (states -> ellipse)
    "Test is failing" [shape=ellipse];
    "Build complete" [shape=ellipse];
    "Stuck on error" [shape=ellipse];
}
|
|
87
|
+
|
|
88
|
+
// Process structure template
|
|
89
|
+
subgraph cluster_structure {
    label="PROCESS STRUCTURE TEMPLATE";

    // Skeleton of a typical process: trigger -> check -> action -> command
    // -> check -> exit, with an alternative branch and a hard stop.

    "Trigger: Something happens" [shape=ellipse];
    "Initial check?" [shape=diamond];
    "Main action" [shape=box];
    // "git status" and "Process complete" are already declared as nodes in
    // earlier clusters of this graph. Node names are global in DOT and a node
    // is drawn in only one cluster, so reusing those names here would attach
    // this template's edges to nodes drawn elsewhere. Cluster-local IDs with
    // a label= keep the same visible text while staying inside this cluster.
    structure_git_status [label="git status", shape=plaintext];
    "Another check?" [shape=diamond];
    "Alternative action" [shape=box];
    "STOP: Don't do this" [shape=octagon, style=filled, fillcolor=red, fontcolor=white];
    structure_complete [label="Process complete", shape=doublecircle];

    "Trigger: Something happens" -> "Initial check?";
    "Initial check?" -> "Main action" [label="yes"];
    "Initial check?" -> "Alternative action" [label="no"];
    "Main action" -> structure_git_status;
    structure_git_status -> "Another check?";
    "Another check?" -> structure_complete [label="ok"];
    "Another check?" -> "STOP: Don't do this" [label="problem"];
    "Alternative action" -> structure_complete;
}
|
|
110
|
+
|
|
111
|
+
// When to use which shape
|
|
112
|
+
subgraph cluster_shape_rules {
    label = "WHEN TO USE EACH SHAPE";

    // Starting state of the decision chain.
    "Choosing a shape" [shape=ellipse];

    // Each question in the chain is a diamond. Make that the default for
    // nodes declared from here on in this cluster; every answer node states
    // its own shape explicitly and is filled so it stands out as a result.
    node [shape=diamond];

    "Is it a decision?";
    "Use diamond"      [shape=diamond,      fillcolor=lightblue,   style=filled];

    "Is it a command?";
    "Use plaintext"    [shape=plaintext,    fillcolor=lightgray,   style=filled];

    "Is it a warning?";
    "Use octagon"      [shape=octagon,      fillcolor=pink,        style=filled];

    "Is it entry/exit?";
    "Use doublecircle" [shape=doublecircle, fillcolor=lightgreen,  style=filled];

    "Is it a state?";
    "Use ellipse"      [shape=ellipse,      fillcolor=lightyellow, style=filled];

    // Fallback when none of the questions match.
    "Default: use box" [shape=box,          fillcolor=lightcyan,   style=filled];

    // "yes" leaves the chain with a shape; "no" moves to the next question.
    "Choosing a shape"  -> "Is it a decision?";
    "Is it a decision?" -> "Use diamond"       [label="yes"];
    "Is it a decision?" -> "Is it a command?"  [label="no"];
    "Is it a command?"  -> "Use plaintext"     [label="yes"];
    "Is it a command?"  -> "Is it a warning?"  [label="no"];
    "Is it a warning?"  -> "Use octagon"       [label="yes"];
    "Is it a warning?"  -> "Is it entry/exit?" [label="no"];
    "Is it entry/exit?" -> "Use doublecircle"  [label="yes"];
    "Is it entry/exit?" -> "Is it a state?"    [label="no"];
    "Is it a state?"    -> "Use ellipse"       [label="yes"];
    "Is it a state?"    -> "Default: use box"  [label="no"];
}
|
|
146
|
+
|
|
147
|
+
// Good vs bad examples
|
|
148
|
+
subgraph cluster_examples {
    label = "GOOD VS BAD EXAMPLES";

    // GOOD: each label is specific and each node uses the shape for its type.
    "Test failed"         [shape=ellipse];
    "Read error message"  [shape=box];
    "Can reproduce?"      [shape=diamond];
    "git diff HEAD~1"     [shape=plaintext];
    "NEVER ignore errors" [
        shape=octagon,
        style=filled,
        fillcolor=red,
        fontcolor=white
    ];

    "Test failed" -> "Read error message" -> "Can reproduce?";
    "Can reproduce?" -> "git diff HEAD~1" [label="yes"];

    // BAD: vague labels, and everything drawn as a box regardless of type.
    bad_1 [shape=box, label="Something wrong"];  // a state: should be an ellipse
    bad_2 [shape=box, label="Fix it"];           // too vague to act on
    bad_3 [shape=box, label="Check"];            // a decision: should be a diamond
    bad_4 [shape=box, label="Run command"];      // should be plaintext holding the actual command

    bad_1 -> bad_2 -> bad_3 -> bad_4;
}
|
|
172
|
+
}
|