agileflow 2.51.0 → 2.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +80 -460
  2. package/package.json +18 -3
  3. package/scripts/agileflow-configure.js +134 -63
  4. package/scripts/agileflow-welcome.js +161 -31
  5. package/scripts/generators/agent-registry.js +45 -57
  6. package/scripts/generators/command-registry.js +48 -32
  7. package/scripts/generators/index.js +2 -6
  8. package/scripts/generators/inject-babysit.js +9 -2
  9. package/scripts/generators/inject-help.js +3 -1
  10. package/scripts/generators/inject-readme.js +7 -3
  11. package/scripts/generators/skill-registry.js +60 -33
  12. package/scripts/get-env.js +13 -12
  13. package/scripts/lib/frontmatter-parser.js +82 -0
  14. package/scripts/obtain-context.js +79 -26
  15. package/scripts/session-coordinator.sh +232 -0
  16. package/scripts/session-manager.js +512 -0
  17. package/src/core/agents/orchestrator.md +275 -0
  18. package/src/core/commands/adr.md +38 -16
  19. package/src/core/commands/agent.md +39 -22
  20. package/src/core/commands/assign.md +17 -0
  21. package/src/core/commands/auto.md +60 -46
  22. package/src/core/commands/babysit.md +302 -637
  23. package/src/core/commands/baseline.md +20 -0
  24. package/src/core/commands/blockers.md +33 -48
  25. package/src/core/commands/board.md +19 -0
  26. package/src/core/commands/changelog.md +20 -0
  27. package/src/core/commands/ci.md +17 -0
  28. package/src/core/commands/context.md +43 -40
  29. package/src/core/commands/debt.md +76 -45
  30. package/src/core/commands/deploy.md +20 -0
  31. package/src/core/commands/deps.md +40 -46
  32. package/src/core/commands/diagnose.md +24 -18
  33. package/src/core/commands/docs.md +18 -0
  34. package/src/core/commands/epic.md +31 -0
  35. package/src/core/commands/feedback.md +33 -21
  36. package/src/core/commands/handoff.md +29 -0
  37. package/src/core/commands/help.md +16 -7
  38. package/src/core/commands/impact.md +31 -61
  39. package/src/core/commands/metrics.md +17 -35
  40. package/src/core/commands/packages.md +21 -0
  41. package/src/core/commands/pr.md +15 -0
  42. package/src/core/commands/readme-sync.md +42 -9
  43. package/src/core/commands/research.md +58 -11
  44. package/src/core/commands/retro.md +42 -50
  45. package/src/core/commands/review.md +22 -27
  46. package/src/core/commands/session/end.md +53 -297
  47. package/src/core/commands/session/history.md +38 -257
  48. package/src/core/commands/session/init.md +44 -446
  49. package/src/core/commands/session/new.md +152 -0
  50. package/src/core/commands/session/resume.md +51 -447
  51. package/src/core/commands/session/status.md +32 -244
  52. package/src/core/commands/sprint.md +33 -0
  53. package/src/core/commands/status.md +18 -0
  54. package/src/core/commands/story-validate.md +32 -0
  55. package/src/core/commands/story.md +21 -6
  56. package/src/core/commands/template.md +18 -0
  57. package/src/core/commands/tests.md +22 -0
  58. package/src/core/commands/update.md +72 -58
  59. package/src/core/commands/validate-expertise.md +25 -37
  60. package/src/core/commands/velocity.md +33 -74
  61. package/src/core/commands/verify.md +16 -0
  62. package/src/core/experts/documentation/expertise.yaml +16 -2
  63. package/src/core/skills/agileflow-retro-facilitator/SKILL.md +57 -219
  64. package/src/core/skills/agileflow-retro-facilitator/cookbook/4ls.md +86 -0
  65. package/src/core/skills/agileflow-retro-facilitator/cookbook/glad-sad-mad.md +79 -0
  66. package/src/core/skills/agileflow-retro-facilitator/cookbook/start-stop-continue.md +142 -0
  67. package/src/core/skills/agileflow-retro-facilitator/prompts/action-items.md +83 -0
  68. package/src/core/skills/writing-skills/SKILL.md +352 -0
  69. package/src/core/skills/writing-skills/testing-skills-with-subagents.md +232 -0
  70. package/tools/cli/agileflow-cli.js +4 -2
  71. package/tools/cli/commands/config.js +20 -13
  72. package/tools/cli/commands/doctor.js +25 -9
  73. package/tools/cli/commands/list.js +10 -6
  74. package/tools/cli/commands/setup.js +54 -3
  75. package/tools/cli/commands/status.js +6 -8
  76. package/tools/cli/commands/uninstall.js +5 -5
  77. package/tools/cli/commands/update.js +51 -7
  78. package/tools/cli/installers/core/installer.js +8 -4
  79. package/tools/cli/installers/ide/_base-ide.js +58 -1
  80. package/tools/cli/installers/ide/claude-code.js +3 -61
  81. package/tools/cli/installers/ide/codex.js +440 -0
  82. package/tools/cli/installers/ide/cursor.js +21 -51
  83. package/tools/cli/installers/ide/manager.js +2 -6
  84. package/tools/cli/installers/ide/windsurf.js +20 -50
  85. package/tools/cli/lib/content-injector.js +26 -49
  86. package/tools/cli/lib/docs-setup.js +3 -2
  87. package/tools/cli/lib/npm-utils.js +39 -12
  88. package/tools/cli/lib/ui.js +31 -10
  89. package/tools/cli/lib/version-checker.js +3 -3
  90. package/tools/postinstall.js +2 -3
@@ -0,0 +1,79 @@
1
+ # Glad/Sad/Mad Retrospective Format
2
+
3
+ Best for emotional topics and team dynamics. Use when morale is low or interpersonal issues need addressing.
4
+
5
+ ## When to Use
6
+
7
+ - Team seems frustrated or burnt out
8
+ - Recent conflicts or tensions
9
+ - Major changes (team composition, process, tools)
10
+ - After particularly difficult sprints
11
+ - When standard retro feels stale
12
+
13
+ ## Template
14
+
15
+ ```markdown
16
+ # Glad/Sad/Mad Retrospective
17
+
18
+ **Date**: YYYY-MM-DD
19
+ **Facilitator**: [Name]
20
+ **Attendees**: [Team members present]
21
+ **Context**: [Why using this format]
22
+
23
+ ## Glad (Things that made us happy)
24
+
25
+ - [Glad 1: Celebration or positive moment]
26
+ - [Glad 2: Something that worked well]
27
+ - [Glad 3: Team success]
28
+
29
+ ## Sad (Things that disappointed us)
30
+
31
+ - [Sad 1: Unmet expectation]
32
+ - [Sad 2: Missed opportunity]
33
+ - [Sad 3: Something we hoped would go better]
34
+
35
+ ## Mad (Things that frustrated us)
36
+
37
+ - [Mad 1: Recurring problem]
38
+ - [Mad 2: Blocker or impediment]
39
+ - [Mad 3: Process that isn't working]
40
+
41
+ ## Patterns Identified
42
+
43
+ - [Pattern 1: Theme across categories]
44
+ - [Pattern 2: Root cause analysis]
45
+
46
+ ## Action Items
47
+
48
+ - [ ] **[Action 1]** - @Owner - Due: [Date]
49
+ - Addresses: [Which Sad/Mad item]
50
+
51
+ - [ ] **[Action 2]** - @Owner - Due: [Date]
52
+ - Addresses: [Which Sad/Mad item]
53
+
54
+ ## What We Want to Protect
55
+
56
+ - [Glad item to protect and maintain]
57
+ - [Process or practice to keep]
58
+ ```
59
+
60
+ ## Facilitation Notes
61
+
62
+ **Creating Psychological Safety**:
63
+ - Acknowledge that emotions are valid
64
+ - Focus on situations, not individuals
65
+ - Use "I feel..." statements
66
+ - Allow venting, then redirect to solutions
67
+
68
+ **Timing**:
69
+ - Glad: 10 minutes
70
+ - Sad: 15 minutes
71
+ - Mad: 15 minutes
72
+ - Action Items: 15 minutes
73
+ - Total: ~60 minutes
74
+
75
+ **Tips**:
76
+ - Start with Glad to set positive tone
77
+ - Let people express frustration fully before problem-solving
78
+ - End with concrete actions to channel emotions productively
79
+ - Follow up on emotional items in 1:1s if needed
@@ -0,0 +1,142 @@
1
+ # Start/Stop/Continue Retrospective Format
2
+
3
+ The standard format for sprint retrospectives. Best for regular, recurring retros.
4
+
5
+ ## Template
6
+
7
+ ```markdown
8
+ # Sprint [Number] Retrospective
9
+
10
+ **Date**: YYYY-MM-DD
11
+ **Facilitator**: [Name]
12
+ **Attendees**: [Team members present]
13
+ **Sprint Duration**: [Start] - [End]
14
+
15
+ ## Sprint Metrics
16
+
17
+ - **Committed**: X story points
18
+ - **Completed**: Y story points
19
+ - **Velocity**: Z% (completed ÷ committed)
20
+ - **Stories Done**: A / B
21
+ - **Bugs Found**: C
22
+
23
+ ## What Went Well
24
+
25
+ - [Positive 1: Specific thing that worked]
26
+ - [Positive 2: Team success]
27
+ - [Positive 3: Process improvement]
28
+
29
+ ## What Didn't Go Well
30
+
31
+ - [Challenge 1: Specific problem]
32
+ - [Challenge 2: Blocker or delay]
33
+ - [Challenge 3: Process issue]
34
+
35
+ ## Start (New Practices)
36
+
37
+ - **[Practice 1]**
38
+ - Why: [Reasoning]
39
+ - Owner: [Who will drive this]
40
+ - Success metric: [How we'll measure]
41
+
42
+ ## Stop (Remove Practices)
43
+
44
+ - **[Practice 1]**
45
+ - Why it's not working: [Reasoning]
46
+ - Alternative: [What we'll do instead]
47
+
48
+ ## Continue (Keep Doing)
49
+
50
+ - **[Practice 1]**
51
+ - Why it's working: [Reasoning]
52
+ - How to maintain: [Keep it going]
53
+
54
+ ## Action Items
55
+
56
+ - [ ] **[Action 1]** - @Owner - Due: [Date]
57
+ - Success criteria: [How we know it's done]
58
+
59
+ - [ ] **[Action 2]** - @Owner - Due: [Date]
60
+ - Success criteria: [How we know it's done]
61
+
62
+ ## Previous Action Items Review
63
+
64
+ - [x] **[Completed Action]** - Implemented, improved X by Y%
65
+ - [~] **[In Progress Action]** - Still working on it, 60% done
66
+ - [!] **[Not Done Action]** - Blocked by Z, rolling to next sprint
67
+
68
+ ## Key Insights
69
+
70
+ 1. [Insight 1: Pattern or learning]
71
+ 2. [Insight 2: Team dynamic observation]
72
+ 3. [Insight 3: Process discovery]
73
+ ```
74
+
75
+ ## Metrics to Track
76
+
77
+ **Sprint Health**:
78
+ - Velocity trend (increasing, stable, decreasing?)
79
+ - Commitment accuracy (completed vs committed)
80
+ - Bug count (increasing, decreasing?)
81
+ - Blocker frequency
82
+
83
+ **Team Health**:
84
+ - Meeting effectiveness
85
+ - Communication quality
86
+ - Collaboration level
87
+ - Work-life balance
88
+
89
+ **Process Health**:
90
+ - Cycle time (story start to done)
91
+ - Code review turnaround
92
+ - Deployment frequency
93
+ - Incident count
94
+
95
+ ## Common Themes to Watch For
96
+
97
+ **Positive Patterns**:
98
+ - Consistent velocity
99
+ - Low bug count
100
+ - Fast code reviews
101
+ - Clear requirements
102
+ - Good collaboration
103
+
104
+ **Warning Signs**:
105
+ - Declining velocity
106
+ - Recurring blockers
107
+ - Communication issues
108
+ - Scope creep
109
+ - Burnout indicators
110
+
111
+ ## Facilitator Tips
112
+
113
+ **Do**:
114
+ - Create safe space for honest feedback
115
+ - Focus on process, not people
116
+ - Time-box discussions (5-10 min per topic)
117
+ - Ensure everyone participates
118
+ - End on positive note
119
+ - Follow up on action items
120
+
121
+ **Don't**:
122
+ - Blame individuals
123
+ - Let discussions run too long
124
+ - Skip retros ("too busy")
125
+ - Create action items without owners
126
+ - Ignore previous action items
127
+
128
+ ## Remote Retro Adaptations
129
+
130
+ For distributed teams:
131
+ - Use anonymous feedback tools (Retrium, Metro Retro)
132
+ - Give time for async reflection before meeting
133
+ - Use polls/voting for prioritization
134
+ - Record session for absent team members
135
+ - Use collaborative docs for brainstorming
136
+
137
+ ## Frequency Guidelines
138
+
139
+ - **Every sprint**: Standard retros (60-90 min)
140
+ - **Major milestones**: Extended retros (2-3 hours)
141
+ - **Quarterly**: Big-picture retros (process, tools, culture)
142
+ - **Post-incident**: Blameless postmortems (as needed)
@@ -0,0 +1,83 @@
1
+ # Action Items Template
2
+
3
+ Shared template for creating SMART action items across all retro formats.
4
+
5
+ ## SMART Action Items
6
+
7
+ - **S**pecific: Clear what needs to be done
8
+ - **M**easurable: Can verify it's complete
9
+ - **A**ssignable: Has an owner
10
+ - **R**elevant: Addresses the issue
11
+ - **T**ime-bound: Has a deadline
12
+
13
+ ## Good vs Bad Examples
14
+
15
+ **Good (Specific, Actionable)**:
16
+ ```
17
+ - [ ] **Create PR size guideline** - @TechLead - Due: Before next sprint
18
+ - Success criteria: Document written, shared with team, added to CLAUDE.md
19
+ - Metric: 80% of PRs under 300 lines
20
+ ```
21
+
22
+ **Bad (Vague, Unactionable)**:
23
+ ```
24
+ - [ ] Fix code reviews
25
+ - [ ] Be better at communication
26
+ - [ ] Improve quality
27
+ ```
28
+
29
+ ## Good vs Bad Feedback
30
+
31
+ **Good (Specific, Constructive)**:
32
+ ```
33
+ "Daily standups ran long (20+ min) because we discussed
34
+ implementation details. Consider moving technical discussions
35
+ to separate sessions."
36
+
37
+ "Code reviews were faster this sprint (avg 4 hours vs 24 hours
38
+ last sprint) thanks to smaller PR sizes."
39
+ ```
40
+
41
+ **Bad (Vague, Blame-Oriented)**:
42
+ ```
43
+ "Meetings were bad"
44
+ "Bob didn't do his job"
45
+ "Everything was terrible"
46
+ "Process is broken"
47
+ ```
48
+
49
+ ## Action Item Template
50
+
51
+ ```markdown
52
+ - [ ] **[Specific Action]** - @Owner - Due: [Date]
53
+ - Context: [What retro item this addresses]
54
+ - Success criteria: [How we know it's done]
55
+ - Metric: [Measurable outcome if applicable]
56
+ ```
57
+
58
+ ## Tracking Action Items
59
+
60
+ **Status Markers**:
61
+ - `[ ]` - Not started
62
+ - `[~]` - In progress
63
+ - `[x]` - Completed
64
+ - `[!]` - Blocked
65
+ - `[-]` - Cancelled/Deferred
66
+
67
+ **Review Format**:
68
+ ```markdown
69
+ ## Previous Action Items Review
70
+
71
+ - [x] **[Completed Action]** - Implemented, improved X by Y%
72
+ - [~] **[In Progress Action]** - 60% done, on track for next week
73
+ - [!] **[Blocked Action]** - Blocked by Z, need help from [team]
74
+ - [-] **[Cancelled Action]** - No longer relevant due to [reason]
75
+ ```
76
+
77
+ ## Best Practices
78
+
79
+ 1. **Limit to 3-5 actions per retro** - More than 5 rarely get done
80
+ 2. **Assign single owners** - Shared ownership = no ownership
81
+ 3. **Set realistic deadlines** - Usually before next retro
82
+ 4. **Review at next retro** - Accountability matters
83
+ 5. **Track completion rate** - Target: >80%
@@ -0,0 +1,352 @@
1
+ ---
2
+ name: writing-skills
3
+ description: Use when creating new skills, editing existing skills, or verifying skills work before deployment
4
+ ---
5
+
6
+ # Writing Skills
7
+
8
+ ## Overview
9
+
10
+ **Writing skills IS Test-Driven Development applied to process documentation.**
11
+
12
+ You write test cases (pressure scenarios with subagents), watch them fail (baseline behavior), write the skill (documentation), watch tests pass (agents comply), and refactor (close loopholes).
13
+
14
+ **Core principle:** If you didn't watch an agent fail without the skill, you don't know if the skill teaches the right thing.
15
+
16
+ ## What is a Skill?
17
+
18
+ A **skill** is a reference guide for proven techniques, patterns, or tools. Skills help future Claude instances find and apply effective approaches.
19
+
20
+ **Skills are:** Reusable techniques, patterns, tools, reference guides
21
+
22
+ **Skills are NOT:** Narratives about how you solved a problem once
23
+
24
+ ## TDD Mapping for Skills
25
+
26
+ | TDD Concept | Skill Creation |
27
+ |-------------|----------------|
28
+ | **Test case** | Pressure scenario with subagent |
29
+ | **Production code** | Skill document (SKILL.md) |
30
+ | **Test fails (RED)** | Agent violates rule without skill (baseline) |
31
+ | **Test passes (GREEN)** | Agent complies with skill present |
32
+ | **Refactor** | Close loopholes while maintaining compliance |
33
+ | **Write test first** | Run baseline scenario BEFORE writing skill |
34
+ | **Watch it fail** | Document exact rationalizations agent uses |
35
+ | **Minimal code** | Write skill addressing those specific violations |
36
+ | **Watch it pass** | Verify agent now complies |
37
+ | **Refactor cycle** | Find new rationalizations → plug → re-verify |
38
+
39
+ ## When to Create a Skill
40
+
41
+ **Create when:**
42
+ - Technique wasn't intuitively obvious to you
43
+ - You'd reference this again across projects
44
+ - Pattern applies broadly (not project-specific)
45
+ - Others would benefit
46
+
47
+ **Don't create for:**
48
+ - One-off solutions
49
+ - Standard practices well-documented elsewhere
50
+ - Project-specific conventions (put in CLAUDE.md)
51
+ - Mechanical constraints (if enforceable with validation, automate it)
52
+
53
+ ## Skill Types
54
+
55
+ ### Technique
56
+ Concrete method with steps to follow (condition-based-waiting, root-cause-tracing)
57
+
58
+ ### Pattern
59
+ Way of thinking about problems (flatten-with-flags, test-invariants)
60
+
61
+ ### Reference
62
+ API docs, syntax guides, tool documentation
63
+
64
+ ## Directory Structure
65
+
66
+ ```
67
+ skills/
68
+ skill-name/
69
+ SKILL.md # Main reference (required)
70
+ cookbook/ # Per-use-case docs (if multiple workflows)
71
+ prompts/ # Reusable prompt templates
72
+ tools/ # Scripts, utilities
73
+ supporting-file.* # Only if needed
74
+ ```
75
+
76
+ **Flat namespace** - all skills in one searchable namespace
77
+
78
+ **Separate files for:**
79
+ 1. **Heavy reference** (100+ lines) - API docs, comprehensive syntax
80
+ 2. **Reusable tools** - Scripts, utilities, templates
81
+ 3. **Multiple workflows** - Use cookbook/ pattern for progressive disclosure
82
+
83
+ **Keep inline:**
84
+ - Principles and concepts
85
+ - Code patterns (< 50 lines)
86
+ - Everything else
87
+
88
+ ## SKILL.md Structure
89
+
90
+ **Frontmatter (YAML):**
91
+ - Only two fields supported: `name` and `description`
92
+ - Max 1024 characters total
93
+ - `name`: Use letters, numbers, and hyphens only
94
+ - `description`: Third-person, describes ONLY when to use (NOT what it does)
95
+
96
+ ```markdown
97
+ ---
98
+ name: skill-name-with-hyphens
99
+ description: Use when [specific triggering conditions and symptoms]
100
+ ---
101
+
102
+ # Skill Name
103
+
104
+ ## Overview
105
+ What is this? Core principle in 1-2 sentences.
106
+
107
+ ## When to Use
108
+ Bullet list with SYMPTOMS and use cases
109
+ When NOT to use
110
+
111
+ ## Variables (if using cookbook pattern)
112
+ Feature flags for conditional behavior
113
+
114
+ ## Cookbook (if multiple workflows)
115
+ If condition A → read cookbook/a.md
116
+ If condition B → read cookbook/b.md
117
+
118
+ ## Core Pattern (for techniques/patterns)
119
+ Before/after code comparison
120
+
121
+ ## Quick Reference
122
+ Table or bullets for scanning common operations
123
+
124
+ ## Implementation
125
+ Inline code for simple patterns
126
+ Link to file for heavy reference
127
+
128
+ ## Common Mistakes
129
+ What goes wrong + fixes
130
+ ```
131
+
132
+ ## Claude Search Optimization (CSO)
133
+
134
+ **Critical for discovery:** Future Claude needs to FIND your skill
135
+
136
+ ### 1. Rich Description Field
137
+
138
+ **Purpose:** Claude reads description to decide which skills to load. Make it answer: "Should I read this skill right now?"
139
+
140
+ **Format:** Start with "Use when..." to focus on triggering conditions
141
+
142
+ **CRITICAL: Description = When to Use, NOT What the Skill Does**
143
+
144
+ ```yaml
145
+ # BAD: Summarizes workflow - Claude may follow this instead of reading skill
146
+ description: Use when executing plans - dispatches subagent per task with code review
147
+
148
+ # GOOD: Just triggering conditions, no workflow summary
149
+ description: Use when executing implementation plans with independent tasks
150
+ ```
151
+
152
+ ### 2. Keyword Coverage
153
+
154
+ Use words Claude would search for:
155
+ - Error messages: "Hook timed out", "race condition"
156
+ - Symptoms: "flaky", "hanging", "zombie"
157
+ - Synonyms: "timeout/hang/freeze", "cleanup/teardown"
158
+ - Tools: Actual commands, library names, file types
159
+
160
+ ### 3. Descriptive Naming
161
+
162
+ **Use active voice, verb-first:**
163
+ - `creating-skills` not `skill-creation`
164
+ - `condition-based-waiting` not `async-test-helpers`
165
+
166
+ ### 4. Token Efficiency
167
+
168
+ **Target word counts:**
169
+ - Frequently-loaded skills: <200 words total
170
+ - Other skills: <500 words (still be concise)
171
+
172
+ **Techniques:**
173
+ - Move details to tool help
174
+ - Use cross-references to other skills
175
+ - Compress examples
176
+ - Eliminate redundancy
177
+
178
+ ## The Iron Law
179
+
180
+ ```
181
+ NO SKILL WITHOUT A FAILING TEST FIRST
182
+ ```
183
+
184
+ This applies to NEW skills AND EDITS to existing skills.
185
+
186
+ Write skill before testing? Delete it. Start over.
187
+ Edit skill without testing? Same violation.
188
+
189
+ **No exceptions:**
190
+ - Not for "simple additions"
191
+ - Not for "just adding a section"
192
+ - Not for "documentation updates"
193
+ - Delete means delete
194
+
195
+ ## Testing Skill Types
196
+
197
+ ### Discipline-Enforcing Skills (rules/requirements)
198
+
199
+ **Test with:**
200
+ - Academic questions: Do they understand the rules?
201
+ - Pressure scenarios: Do they comply under stress?
202
+ - Multiple pressures combined: time + sunk cost + exhaustion
203
+
204
+ **Success criteria:** Agent follows rule under maximum pressure
205
+
206
+ ### Technique Skills (how-to guides)
207
+
208
+ **Test with:**
209
+ - Application scenarios: Can they apply the technique correctly?
210
+ - Variation scenarios: Do they handle edge cases?
211
+ - Missing information tests: Do instructions have gaps?
212
+
213
+ **Success criteria:** Agent successfully applies technique to new scenario
214
+
215
+ ### Pattern Skills (mental models)
216
+
217
+ **Test with:**
218
+ - Recognition scenarios: Do they recognize when pattern applies?
219
+ - Counter-examples: Do they know when NOT to apply?
220
+
221
+ **Success criteria:** Agent correctly identifies when/how to apply pattern
222
+
223
+ ### Reference Skills (documentation/APIs)
224
+
225
+ **Test with:**
226
+ - Retrieval scenarios: Can they find the right information?
227
+ - Gap testing: Are common use cases covered?
228
+
229
+ **Success criteria:** Agent finds and correctly applies reference information
230
+
231
+ ## Common Rationalizations for Skipping Testing
232
+
233
+ | Excuse | Reality |
234
+ |--------|---------|
235
+ | "Skill is obviously clear" | Clear to you ≠ clear to other agents. Test it. |
236
+ | "It's just a reference" | References can have gaps. Test retrieval. |
237
+ | "Testing is overkill" | Untested skills have issues. Always. |
238
+ | "I'll test if problems emerge" | Problems = agents can't use skill. Test BEFORE. |
239
+ | "Too tedious to test" | Testing is less tedious than debugging later. |
240
+ | "No time to test" | Deploying an untested skill wastes more time on fixes later. |
241
+
242
+ ## Bulletproofing Against Rationalization
243
+
244
+ ### Close Every Loophole Explicitly
245
+
246
+ Don't just state the rule - forbid specific workarounds:
247
+
248
+ ```markdown
249
+ # BAD
250
+ Write code before test? Delete it.
251
+
252
+ # GOOD
253
+ Write code before test? Delete it. Start over.
254
+
255
+ **No exceptions:**
256
+ - Don't keep it as "reference"
257
+ - Don't "adapt" it while writing tests
258
+ - Delete means delete
259
+ ```
260
+
261
+ ### Build Rationalization Table
262
+
263
+ Capture rationalizations from baseline testing:
264
+
265
+ ```markdown
266
+ | Excuse | Reality |
267
+ |--------|---------|
268
+ | "Too simple to test" | Simple code breaks. Test takes 30 seconds. |
269
+ | "I'll test after" | Tests passing immediately prove nothing. |
270
+ ```
271
+
272
+ ### Create Red Flags List
273
+
274
+ ```markdown
275
+ ## Red Flags - STOP and Start Over
276
+
277
+ - Code before test
278
+ - "I already manually tested it"
279
+ - "This is different because..."
280
+
281
+ **All of these mean: Delete code. Start over.**
282
+ ```
283
+
284
+ ## RED-GREEN-REFACTOR for Skills
285
+
286
+ ### RED: Write Failing Test (Baseline)
287
+
288
+ Run pressure scenario with subagent WITHOUT the skill. Document exact behavior:
289
+ - What choices did they make?
290
+ - What rationalizations did they use (verbatim)?
291
+
292
+ ### GREEN: Write Minimal Skill
293
+
294
+ Write skill that addresses those specific rationalizations. Don't add extra content for hypothetical cases.
295
+
296
+ Run same scenarios WITH skill. Agent should now comply.
297
+
298
+ ### REFACTOR: Close Loopholes
299
+
300
+ Agent found new rationalization? Add explicit counter. Re-test until bulletproof.
301
+
302
+ ## Anti-Patterns
303
+
304
+ ### Narrative Example
305
+ "In session 2025-10-03, we found empty projectDir caused..."
306
+ **Why bad:** Too specific, not reusable
307
+
308
+ ### Multi-Language Dilution
309
+ example-js.js, example-py.py, example-go.go
310
+ **Why bad:** Mediocre quality, maintenance burden
311
+
312
+ ### Generic Labels
313
+ helper1, helper2, step3, pattern4
314
+ **Why bad:** Labels should have semantic meaning
315
+
316
+ ## Skill Creation Checklist
317
+
318
+ **RED Phase - Write Failing Test:**
319
+ - [ ] Create pressure scenarios (3+ combined pressures for discipline skills)
320
+ - [ ] Run scenarios WITHOUT skill - document baseline behavior verbatim
321
+ - [ ] Identify patterns in rationalizations/failures
322
+
323
+ **GREEN Phase - Write Minimal Skill:**
324
+ - [ ] Name uses only letters, numbers, hyphens
325
+ - [ ] YAML frontmatter with only name and description
326
+ - [ ] Description starts with "Use when..." with specific triggers
327
+ - [ ] Keywords throughout for search
328
+ - [ ] Address specific baseline failures identified in RED
329
+ - [ ] One excellent example (not multi-language)
330
+ - [ ] Run scenarios WITH skill - verify agents now comply
331
+
332
+ **REFACTOR Phase - Close Loopholes:**
333
+ - [ ] Identify NEW rationalizations from testing
334
+ - [ ] Add explicit counters (if discipline skill)
335
+ - [ ] Build rationalization table from all test iterations
336
+ - [ ] Re-test until bulletproof
337
+
338
+ **Quality Checks:**
339
+ - [ ] Quick reference table
340
+ - [ ] Common mistakes section
341
+ - [ ] No narrative storytelling
342
+ - [ ] Supporting files only for tools, heavy reference, or multiple workflows (cookbook/)
343
+
344
+ ## The Bottom Line
345
+
346
+ **Creating skills IS TDD for process documentation.**
347
+
348
+ Same Iron Law: No skill without failing test first.
349
+ Same cycle: RED (baseline) → GREEN (write skill) → REFACTOR (close loopholes).
350
+ Same benefits: Better quality, fewer surprises, bulletproof results.
351
+
352
+ If you follow TDD for code, follow it for skills. It's the same discipline applied to documentation.