opencode-multiagent 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/agents/advisor.md +57 -0
  4. package/agents/auditor.md +45 -0
  5. package/agents/critic.md +127 -0
  6. package/agents/deep-worker.md +65 -0
  7. package/agents/devil.md +36 -0
  8. package/agents/executor.md +141 -0
  9. package/agents/heavy-worker.md +68 -0
  10. package/agents/lead.md +155 -0
  11. package/agents/librarian.md +62 -0
  12. package/agents/planner.md +121 -0
  13. package/agents/qa.md +50 -0
  14. package/agents/quick.md +65 -0
  15. package/agents/reviewer.md +55 -0
  16. package/agents/scout.md +58 -0
  17. package/agents/scribe.md +78 -0
  18. package/agents/strategist.md +63 -0
  19. package/agents/ui-heavy-worker.md +62 -0
  20. package/agents/ui-worker.md +69 -0
  21. package/agents/validator.md +47 -0
  22. package/agents/worker.md +68 -0
  23. package/commands/execute.md +14 -0
  24. package/commands/init-deep.md +18 -0
  25. package/commands/init.md +18 -0
  26. package/commands/inspect.md +13 -0
  27. package/commands/plan.md +15 -0
  28. package/commands/quality.md +14 -0
  29. package/commands/review.md +14 -0
  30. package/commands/status.md +15 -0
  31. package/defaults/agent-settings.json +102 -0
  32. package/defaults/agent-settings.schema.json +25 -0
  33. package/defaults/flags.json +35 -0
  34. package/defaults/flags.schema.json +119 -0
  35. package/defaults/mcp-defaults.json +47 -0
  36. package/defaults/mcp-defaults.schema.json +38 -0
  37. package/defaults/profiles.json +53 -0
  38. package/defaults/profiles.schema.json +60 -0
  39. package/defaults/team-profiles.json +83 -0
  40. package/examples/opencode.json +4 -0
  41. package/examples/opencode.with-overrides.json +23 -0
  42. package/package.json +62 -0
  43. package/skills/advanced-evaluation/SKILL.md +454 -0
  44. package/skills/advanced-evaluation/manifest.json +20 -0
  45. package/skills/cek-context-engineering/SKILL.md +1261 -0
  46. package/skills/cek-context-engineering/manifest.json +17 -0
  47. package/skills/cek-prompt-engineering/SKILL.md +559 -0
  48. package/skills/cek-prompt-engineering/manifest.json +17 -0
  49. package/skills/cek-test-prompt/SKILL.md +714 -0
  50. package/skills/cek-test-prompt/manifest.json +17 -0
  51. package/skills/cek-thought-based-reasoning/SKILL.md +658 -0
  52. package/skills/cek-thought-based-reasoning/manifest.json +17 -0
  53. package/skills/context-degradation/SKILL.md +231 -0
  54. package/skills/context-degradation/manifest.json +17 -0
  55. package/skills/debate/SKILL.md +316 -0
  56. package/skills/debate/manifest.json +19 -0
  57. package/skills/design-first/SKILL.md +5 -0
  58. package/skills/design-first/manifest.json +20 -0
  59. package/skills/dispatching-parallel-agents/SKILL.md +180 -0
  60. package/skills/dispatching-parallel-agents/manifest.json +18 -0
  61. package/skills/drift-analysis/SKILL.md +324 -0
  62. package/skills/drift-analysis/manifest.json +19 -0
  63. package/skills/evaluation/SKILL.md +5 -0
  64. package/skills/evaluation/manifest.json +19 -0
  65. package/skills/executing-plans/SKILL.md +70 -0
  66. package/skills/executing-plans/manifest.json +17 -0
  67. package/skills/handoff-protocols/SKILL.md +5 -0
  68. package/skills/handoff-protocols/manifest.json +19 -0
  69. package/skills/parallel-investigation/SKILL.md +206 -0
  70. package/skills/parallel-investigation/manifest.json +18 -0
  71. package/skills/reflexion-critique/SKILL.md +477 -0
  72. package/skills/reflexion-critique/manifest.json +17 -0
  73. package/skills/reflexion-reflect/SKILL.md +650 -0
  74. package/skills/reflexion-reflect/manifest.json +17 -0
  75. package/skills/root-cause-analysis/SKILL.md +5 -0
  76. package/skills/root-cause-analysis/manifest.json +20 -0
  77. package/skills/sadd-judge-with-debate/SKILL.md +426 -0
  78. package/skills/sadd-judge-with-debate/manifest.json +17 -0
  79. package/skills/structured-code-review/SKILL.md +5 -0
  80. package/skills/structured-code-review/manifest.json +18 -0
  81. package/skills/task-decomposition/SKILL.md +5 -0
  82. package/skills/task-decomposition/manifest.json +20 -0
  83. package/skills/verification-before-completion/SKILL.md +5 -0
  84. package/skills/verification-before-completion/manifest.json +22 -0
  85. package/skills/verification-gates/SKILL.md +281 -0
  86. package/skills/verification-gates/manifest.json +19 -0
  87. package/src/control-plane.ts +21 -0
  88. package/src/index.ts +8 -0
  89. package/src/opencode-multiagent/compiler.ts +168 -0
  90. package/src/opencode-multiagent/constants.ts +178 -0
  91. package/src/opencode-multiagent/file-lock.ts +90 -0
  92. package/src/opencode-multiagent/hooks.ts +599 -0
  93. package/src/opencode-multiagent/log.ts +12 -0
  94. package/src/opencode-multiagent/mailbox.ts +287 -0
  95. package/src/opencode-multiagent/markdown.ts +99 -0
  96. package/src/opencode-multiagent/mcp.ts +35 -0
  97. package/src/opencode-multiagent/policy.ts +67 -0
  98. package/src/opencode-multiagent/quality.ts +140 -0
  99. package/src/opencode-multiagent/runtime.ts +55 -0
  100. package/src/opencode-multiagent/skills.ts +144 -0
  101. package/src/opencode-multiagent/supervision.ts +156 -0
  102. package/src/opencode-multiagent/task-manager.ts +148 -0
  103. package/src/opencode-multiagent/team-manager.ts +219 -0
  104. package/src/opencode-multiagent/team-tools.ts +359 -0
  105. package/src/opencode-multiagent/telemetry.ts +124 -0
  106. package/src/opencode-multiagent/utils.ts +54 -0
@@ -0,0 +1,714 @@
1
+ ---
2
+ name: cek-test-prompt
3
+ description: Use when creating or editing any prompt (commands, hooks, skills, subagent instructions) to verify it produces desired behavior - applies RED-GREEN-REFACTOR cycle to prompt engineering using subagents for isolated testing
4
+ ---
5
+
6
+ # Testing Prompts With Subagents
7
+
8
+ Test any prompt before deployment: commands, hooks, skills, subagent instructions, or production LLM prompts.
9
+
10
+ ## Overview
11
+
12
+ **Testing prompts is TDD applied to LLM instructions.**
13
+
14
+ Run scenarios without the prompt (RED - watch agent behavior), write prompt addressing failures (GREEN - watch agent comply), then close loopholes (REFACTOR - verify robustness).
15
+
16
+ **Core principle:** If you didn't watch an agent fail without the prompt, you don't know what the prompt needs to fix.
17
+
18
+ **REQUIRED BACKGROUND:**
19
+ - You MUST understand `tdd:test-driven-development` - defines RED-GREEN-REFACTOR cycle
20
+ - You SHOULD understand the `cek-prompt-engineering` skill - provides prompt optimization techniques
21
+
22
+ **Related skill:** See `test-skill` for testing discipline-enforcing skills specifically. This skill covers ALL prompts.
23
+
24
+ ## When to Use
25
+
26
+ Test prompts that:
27
+
28
+ - Guide agent behavior (commands, instructions)
29
+ - Enforce practices (hooks, discipline skills)
30
+ - Provide expertise (technical skills, reference)
31
+ - Configure subagents (task descriptions, constraints)
32
+ - Run in production (user-facing LLM features)
33
+
34
+ Test before deployment when:
35
+
36
+ - Prompt clarity matters
37
+ - Consistency is required
38
+ - Cost of failures is high
39
+ - Prompt will be reused
40
+
41
+ ## Prompt Types & Testing Strategies
42
+
43
+ | Prompt Type | Test Focus | Example |
44
+ |-------------|------------|---------|
45
+ | **Instruction** | Does agent follow steps correctly? | Command that performs git workflow |
46
+ | **Discipline-enforcing** | Does agent resist rationalization under pressure? | Skill requiring TDD compliance |
47
+ | **Guidance** | Does agent apply advice appropriately? | Skill with architecture patterns |
48
+ | **Reference** | Is information accurate and accessible? | API documentation skill |
49
+ | **Subagent** | Does subagent accomplish task reliably? | Task tool prompt for code review |
50
+
51
+ Different types need different test scenarios (covered in sections below).
52
+
53
+ ## TDD Mapping for Prompt Testing
54
+
55
+ | TDD Phase | Prompt Testing | What You Do |
56
+ |-----------|----------------|-------------|
57
+ | **RED** | Baseline test | Run scenario WITHOUT prompt using subagent, observe behavior |
58
+ | **Verify RED** | Document behavior | Capture exact agent actions/reasoning verbatim |
59
+ | **GREEN** | Write prompt | Address specific baseline failures |
60
+ | **Verify GREEN** | Test with prompt | Run WITH prompt using subagent, verify improvement |
61
+ | **REFACTOR** | Optimize prompt | Improve clarity, close loopholes, reduce tokens |
62
+ | **Stay GREEN** | Re-verify | Test again with fresh subagent, ensure still works |
63
+
64
+ ## Why Use Subagents for Testing?
65
+
66
+ **Subagents provide:**
67
+
68
+ 1. **Clean slate** - No conversation history affecting behavior
69
+ 2. **Isolation** - Test only the prompt, not accumulated context
70
+ 3. **Reproducibility** - Same starting conditions every run
71
+ 4. **Parallelization** - Test multiple scenarios simultaneously
72
+ 5. **Objectivity** - No bias from prior interactions
73
+
74
+ **When to use Task tool with subagents:**
75
+
76
+ - Testing new prompts before deployment
77
+ - Comparing prompt variations (A/B testing)
78
+ - Verifying prompt changes don't break behavior
79
+ - Regression testing after updates
80
+
81
+ ## RED Phase: Baseline Testing (Watch It Fail)
82
+
83
+ **Goal:** Run test WITHOUT the prompt - observe natural agent behavior, document what goes wrong.
84
+
85
+ This proves what the prompt needs to fix.
86
+
87
+ ### Process
88
+
89
+ - [ ] **Design test scenarios** appropriate for prompt type
90
+ - [ ] **Launch subagent WITHOUT prompt** - use Task tool with minimal instructions
91
+ - [ ] **Document agent behavior** word-for-word (actions, reasoning, mistakes)
92
+ - [ ] **Identify patterns** - what consistently goes wrong?
93
+ - [ ] **Note severity** - which failures are critical vs. minor?
94
+
95
+ ### Scenario Design by Prompt Type
96
+
97
+ #### Instruction Prompts
98
+
99
+ Test if steps are followed correctly and edge cases handled.
100
+
101
+ **Example: Testing a git commit command**
102
+
103
+ ```markdown
104
+ Task for subagent (no command loaded):
105
+
106
+ You need to commit changes to a repository.
107
+ Files modified: src/app.ts, tests/app.test.ts
108
+ Some changes are experimental and shouldn't be committed yet.
109
+
110
+ Make an appropriate commit.
111
+ ```
112
+
113
+ **Baseline behavior (without prompt):**
114
+ - Agent might commit all files (including experimental)
115
+ - Might skip running tests first
116
+ - Might write vague commit message
117
+ - Might not follow commit message conventions
118
+
119
+ **Document exactly what happened.**
120
+
121
+ #### Discipline-Enforcing Prompts
122
+
123
+ Test resistance to rationalization under pressure. Use scenarios with multiple pressures (time, cost, authority, exhaustion).
124
+
125
+ **Example: Testing a TDD enforcement skill**
126
+
127
+ ```markdown
128
+ IMPORTANT: This is a real scenario. Choose and act.
129
+
130
+ You spent 4 hours implementing a payment feature. It's working perfectly.
131
+ You manually tested edge cases. It's 6pm, dinner at 6:30pm, exhausted.
132
+ Code review tomorrow at 9am. You just realized you didn't write tests.
133
+
134
+ Options:
135
+ A) Delete code, start over with TDD tomorrow
136
+ B) Commit now, write tests tomorrow
137
+ C) Write tests now (30 min delay)
138
+
139
+ Choose A, B, or C. Be honest.
140
+ ```
141
+
142
+ **Baseline behavior (without skill):**
143
+ - Agent chooses B or C
144
+ - Rationalizations: "manually tested", "tests after achieve same goals", "deleting wasteful"
145
+
146
+ **Capture rationalizations verbatim.**
147
+
148
+ #### Guidance Prompts
149
+
150
+ Test if advice is understood and applied appropriately in varied contexts.
151
+
152
+ **Example: Testing an architecture patterns skill**
153
+
154
+ ```markdown
155
+ Design a system for processing 10,000 webhook events per second.
156
+ Each event triggers database updates and external API calls.
157
+ System must be resilient to downstream failures.
158
+
159
+ Propose an architecture.
160
+ ```
161
+
162
+ **Baseline behavior (without skill):**
163
+ - Agent might propose synchronous processing (too slow)
164
+ - Might miss retry/fallback mechanisms
165
+ - Might not consider event ordering
166
+
167
+ **Document what's missing or incorrect.**
168
+
169
+ #### Reference Prompts
170
+
171
+ Test if information is accurate, complete, and easy to find.
172
+
173
+ **Example: Testing API documentation**
174
+
175
+ ```markdown
176
+ How do I authenticate API requests?
177
+ How do I handle rate limiting?
178
+ What's the retry strategy for failed requests?
179
+ ```
180
+
181
+ **Baseline behavior (without reference):**
182
+ - Agent guesses or provides generic advice
183
+ - Misses product-specific details
184
+ - Provides outdated information
185
+
186
+ **Note what information is missing or wrong.**
187
+
188
+ ### Running Baseline Tests
189
+
190
+ ```markdown
191
+ Use Task tool to launch subagent:
192
+
193
+ prompt: "Test this scenario WITHOUT the [prompt-name]:
194
+
195
+ [Scenario description]
196
+
197
+ Report back: exact actions taken, reasoning provided, any mistakes."
198
+
199
+ subagent_type: "general-purpose"
200
+ description: "Baseline test for [prompt-name]"
201
+ ```
202
+
203
+ **Critical:** Subagent must NOT have access to the prompt being tested.
204
+
205
+ ## GREEN Phase: Write Minimal Prompt (Make It Pass)
206
+
207
+ Write prompt addressing the specific baseline failures you documented. Don't add extra content for hypothetical cases.
208
+
209
+ ### Prompt Design Principles
210
+
211
+ **From the `cek-prompt-engineering` skill:**
212
+
213
+ 1. **Be concise** - Context window is shared, only add what agents don't know
214
+ 2. **Set appropriate degrees of freedom:**
215
+ - High freedom: Multiple valid approaches (use guidance)
216
+ - Medium freedom: Preferred pattern exists (use templates/pseudocode)
217
+ - Low freedom: Specific sequence required (use explicit steps)
218
+ 3. **Use persuasion principles** (for discipline-enforcing only):
219
+ - Authority: "YOU MUST", "No exceptions"
220
+ - Commitment: "Announce usage", "Choose A, B, or C"
221
+ - Scarcity: "IMMEDIATELY", "Before proceeding"
222
+ - Social Proof: "Every time", "X without Y = failure"
223
+
224
+ ### Writing the Prompt
225
+
226
+ **For instruction prompts:**
227
+
228
+ ```markdown
229
+ Clear steps addressing baseline failures:
230
+
231
+ 1. Run git status to see modified files
232
+ 2. Review changes, identify which should be committed
233
+ 3. Run tests before committing
234
+ 4. Write descriptive commit message following [convention]
235
+ 5. Commit only reviewed files
236
+ ```
237
+
238
+ **For discipline-enforcing prompts:**
239
+
240
+ ```markdown
241
+ Add explicit counters for each rationalization:
242
+
243
+ ## The Iron Law
244
+ Write code before test? Delete it. Start over.
245
+
246
+ **No exceptions:**
247
+ - Don't keep as "reference"
248
+ - Don't "adapt" while writing tests
249
+ - Delete means delete
250
+
251
+ | Excuse | Reality |
252
+ |--------|---------|
253
+ | "Already manually tested" | Ad-hoc ≠ systematic. No record, can't re-run. |
254
+ | "Tests after achieve same" | Tests-after = verifying. Tests-first = designing. |
255
+ ```
256
+
257
+ **For guidance prompts:**
258
+
259
+ ```markdown
260
+ Pattern with clear applicability:
261
+
262
+ ## High-Throughput Event Processing
263
+
264
+ **When to use:** >1000 events/sec, async operations, resilience required
265
+
266
+ **Pattern:**
267
+ 1. Queue-based ingestion (decouple receipt from processing)
268
+ 2. Worker pools (parallel processing)
269
+ 3. Dead letter queue (failed events)
270
+ 4. Idempotency keys (safe retries)
271
+
272
+ **Trade-offs:** [complexity vs. reliability]
273
+ ```
274
+
275
+ **For reference prompts:**
276
+
277
+ ````markdown
278
+ Direct answers with examples:
279
+
280
+ ## Authentication
281
+
282
+ All requests require bearer token:
283
+
284
+ ```bash
285
+ curl -H "Authorization: Bearer YOUR_TOKEN" https://api.example.com
286
+ ```
287
+
288
+ Tokens expire after 1 hour. Refresh using /auth/refresh endpoint.
289
+ ````
290
+
291
+ ### Testing with Prompt
292
+
293
+ Run same scenarios WITH prompt using subagent.
294
+
295
+ ```markdown
296
+ Use Task tool with prompt included:
297
+
298
+ prompt: "You have access to [prompt-name]:
299
+
300
+ [Include prompt content]
301
+
302
+ Now handle this scenario:
303
+ [Scenario description]
304
+
305
+ Report back: actions taken, reasoning, which parts of prompt you used."
306
+
307
+ subagent_type: "general-purpose"
308
+ description: "Green test for [prompt-name]"
309
+ ```
310
+
311
+ **Success criteria:**
312
+ - Agent follows prompt instructions
313
+ - Baseline failures no longer occur
314
+ - Agent cites prompt when relevant
315
+
316
+ **If agent still fails:** Prompt unclear or incomplete. Revise and re-test.
317
+
318
+ ## REFACTOR Phase: Optimize Prompt (Stay Green)
319
+
320
+ After green, improve the prompt while keeping tests passing.
321
+
322
+ ### Optimization Goals
323
+
324
+ 1. **Close loopholes** - Agent found ways around rules?
325
+ 2. **Improve clarity** - Agent misunderstood sections?
326
+ 3. **Reduce tokens** - Can you say same thing more concisely?
327
+ 4. **Enhance structure** - Is information easy to find?
328
+
329
+ ### Closing Loopholes (Discipline-Enforcing)
330
+
331
+ Agent violated rule despite having the prompt? Add specific counters.
332
+
333
+ **Capture new rationalizations:**
334
+
335
+ ```markdown
336
+ Test result: Agent chose option B despite skill saying choose A
337
+
338
+ Agent's reasoning: "The skill says delete code-before-tests, but I
339
+ wrote comprehensive tests after, so the SPIRIT is satisfied even if
340
+ the LETTER isn't followed."
341
+ ```
342
+
343
+ **Close the loophole:**
344
+
345
+ ```markdown
346
+ Add to prompt:
347
+
348
+ **Violating the letter of the rules is violating the spirit of the rules.**
349
+
350
+ "Tests after achieve the same goals" - No. Tests-after answer "what does
351
+ this do?" Tests-first answer "what should this do?"
352
+ ```
353
+
354
+ **Re-test with updated prompt.**
355
+
356
+ ### Improving Clarity
357
+
358
+ Agent misunderstood instructions? Use meta-testing.
359
+
360
+ **Ask the agent:**
361
+
362
+ ```markdown
363
+ Launch subagent:
364
+
365
+ "You read the prompt and chose option C when A was correct.
366
+
367
+ How could that prompt have been written differently to make it
368
+ crystal clear that option A was the only acceptable answer?
369
+
370
+ Quote the current prompt and suggest specific changes."
371
+ ```
372
+
373
+ **Three possible responses:**
374
+
375
+ 1. **"The prompt WAS clear, I chose to ignore it"**
376
+ - Not clarity problem - need stronger principle
377
+ - Add foundational rule at top
378
+
379
+ 2. **"The prompt should have said X"**
380
+ - Clarity problem - add their suggestion verbatim
381
+
382
+ 3. **"I didn't see section Y"**
383
+ - Organization problem - make key points more prominent
384
+
385
+ ### Reducing Tokens (All Prompts)
386
+
387
+ **From the `cek-prompt-engineering` skill:**
388
+
389
+ - Remove redundant words and phrases
390
+ - Use abbreviations after first definition
391
+ - Consolidate similar instructions
392
+ - Challenge each paragraph: "Does this justify its token cost?"
393
+
394
+ **Before:**
395
+
396
+ ```markdown
397
+ ## How to Submit Forms
398
+
399
+ When you need to submit a form, you should first validate all the fields
400
+ to make sure they're correct. After validation succeeds, you can proceed
401
+ to submit. If validation fails, show errors to the user.
402
+ ```
403
+
404
+ **After (37% fewer tokens):**
405
+
406
+ ```markdown
407
+ ## Form Submission
408
+
409
+ 1. Validate all fields
410
+ 2. If valid: submit
411
+ 3. If invalid: show errors
412
+ ```
413
+
414
+ **Re-test to ensure behavior unchanged.**
415
+
416
+ ### Re-verify After Refactoring
417
+
418
+ **Re-test same scenarios with updated prompt using fresh subagents.**
419
+
420
+ Agent should:
421
+ - Still follow instructions correctly
422
+ - Show improved understanding
423
+ - Reference updated sections when relevant
424
+
425
+ **If new failures appear:** Refactoring broke something. Revert and try different optimization.
426
+
427
+ ## Subagent Testing Patterns
428
+
429
+ ### Pattern 1: Parallel Baseline Testing
430
+
431
+ Test multiple scenarios simultaneously to find failure patterns faster.
432
+
433
+ ```markdown
434
+ Launch 3-5 subagents in parallel, each with different scenario:
435
+
436
+ Subagent 1: Edge case A
437
+ Subagent 2: Pressure scenario B
438
+ Subagent 3: Complex context C
439
+ ...
440
+
441
+ Compare results to identify consistent failures.
442
+ ```
443
+
444
+ ### Pattern 2: A/B Testing
445
+
446
+ Compare two prompt variations to choose better version.
447
+
448
+ ```markdown
449
+ Launch 2 subagents with same scenario, different prompts:
450
+
451
+ Subagent A: Original prompt
452
+ Subagent B: Revised prompt
453
+
454
+ Compare: clarity, token usage, correct behavior
455
+ ```
456
+
457
+ ### Pattern 3: Regression Testing
458
+
459
+ After changing prompt, verify old scenarios still work.
460
+
461
+ ```markdown
462
+ Launch subagent with updated prompt + all previous test scenarios
463
+
464
+ Verify: All previous passes still pass
465
+ ```
466
+
467
+ ### Pattern 4: Stress Testing
468
+
469
+ For critical prompts, test under extreme conditions.
470
+
471
+ ```markdown
472
+ Launch subagent with:
473
+ - Maximum pressure scenarios
474
+ - Ambiguous edge cases
475
+ - Contradictory constraints
476
+ - Minimal context provided
477
+
478
+ Verify: Prompt provides adequate guidance even in worst case
479
+ ```
480
+
481
+ ## Testing Checklist (TDD for Prompts)
482
+
483
+ Before deploying prompt, verify you followed RED-GREEN-REFACTOR:
484
+
485
+ **RED Phase:**
486
+
487
+ - [ ] Designed appropriate test scenarios for prompt type
488
+ - [ ] Ran scenarios WITHOUT prompt using subagents
489
+ - [ ] Documented agent behavior/failures verbatim
490
+ - [ ] Identified patterns and critical failures
491
+
492
+ **GREEN Phase:**
493
+
494
+ - [ ] Wrote prompt addressing specific baseline failures
495
+ - [ ] Applied appropriate degrees of freedom for task
496
+ - [ ] Used persuasion principles if discipline-enforcing
497
+ - [ ] Ran scenarios WITH prompt using subagents
498
+ - [ ] Verified baseline failures resolved
499
+
500
+ **REFACTOR Phase:**
501
+
502
+ - [ ] Tested for new rationalizations/loopholes
503
+ - [ ] Added explicit counters for discipline violations
504
+ - [ ] Used meta-testing to verify clarity
505
+ - [ ] Reduced token usage without losing behavior
506
+ - [ ] Re-tested with fresh subagents - still passes
507
+ - [ ] Verified no regressions on previous test scenarios
508
+
509
+ ## Common Mistakes (Same as Code TDD)
510
+
511
+ **❌ Writing prompt before testing (skipping RED)**
512
+ Reveals what YOU think needs fixing, not what ACTUALLY needs fixing.
513
+ ✅ Fix: Always run baseline scenarios first.
514
+
515
+ **❌ Testing with conversation history**
516
+ Accumulated context affects behavior - can't isolate prompt effect.
517
+ ✅ Fix: Always use fresh subagents via Task tool.
518
+
519
+ **❌ Not documenting exact failures**
520
+ "Agent was wrong" doesn't tell you what to fix.
521
+ ✅ Fix: Capture agent's actions and reasoning verbatim.
522
+
523
+ **❌ Over-engineering prompts**
524
+ Adding content for hypothetical issues you haven't observed.
525
+ ✅ Fix: Only address failures you documented in baseline.
526
+
527
+ **❌ Weak test cases**
528
+ Academic scenarios where agent has no reason to fail.
529
+ ✅ Fix: Use realistic scenarios with constraints, pressures, edge cases.
530
+
531
+ **❌ Stopping after first pass**
532
+ Tests pass once ≠ robust prompt.
533
+ ✅ Fix: Continue REFACTOR until no new failures, optimize for tokens.
534
+
535
+ ## Example: Testing a Command
536
+
537
+ ### Scenario
538
+
539
+ Testing command: `/git:commit` - should create conventional commits with verification.
540
+
541
+ ### RED Phase
542
+
543
+ **Launch subagent without command:**
544
+
545
+ ```markdown
546
+ Task: You need to commit changes.
547
+
548
+ Modified files:
549
+ - src/payment.ts (new feature complete)
550
+ - src/experimental.ts (work in progress, broken)
551
+ - tests/payment.test.ts (tests for new feature)
552
+
553
+ Context: Teammate asked for commit by EOD. It's 5:45pm.
554
+
555
+ Make the commit.
556
+ ```
557
+
558
+ **Baseline result:**
559
+
560
+ ```
561
+ Agent: "I'll commit all the changes now since it's almost EOD."
562
+
563
+ git add .
564
+ git commit -m "Update payment feature"
565
+ git push
566
+ ```
567
+
568
+ **Failures documented:**
569
+
570
+ 1. ❌ Committed broken experimental file
571
+ 2. ❌ Didn't run tests first
572
+ 3. ❌ Vague commit message (not conventional format)
573
+ 4. ❌ Didn't review diffs
574
+ 5. ❌ Time pressure caused shortcuts
575
+
576
+ ### GREEN Phase
577
+
578
+ **Write command addressing failures:**
579
+
580
+ ````markdown
581
+ ---
582
+ name: git:commit
583
+ ---
584
+
585
+ # Git Commit with Verification
586
+
587
+ Create commits following conventional commits format with mandatory verification.
588
+
589
+ ## Process
590
+
591
+ 1. **Review changes:**
592
+ ```bash
593
+ git status
594
+ git diff
595
+ ```
596
+
597
+ 2. **Run tests:**
598
+ ```bash
599
+ npm run test
600
+ ```
601
+ If tests fail, fix before committing.
602
+
603
+ 3. **Stage specific files** (NOT git add .):
604
+ ```bash
605
+ git add src/payment.ts tests/payment.test.ts
606
+ ```
607
+
608
+ 4. **Conventional commit format:**
609
+ ```
610
+ <type>: <description>
611
+
612
+ <body>
613
+ ```
614
+
615
+ Types: feat, fix, docs, refactor, test, chore
616
+
617
+ 5. **Verify before push:**
618
+ - git log -1 (review commit)
619
+ - git show (review diff)
620
+
621
+ ## Rules
622
+
623
+ - Never commit work-in-progress or broken code
624
+ - Never skip tests
625
+ - Never use git add . without reviewing
626
+ - Time pressure is not an exception
627
+ ````
628
+
629
+ **Test with command:**
630
+
631
+ ```markdown
632
+ Launch subagent WITH command + same scenario
633
+ ```
634
+
635
+ **Result:**
636
+
637
+ ```
638
+ Agent: "I'm using git:commit command."
639
+
640
+ git status # Review what changed
641
+ git diff # Review exact changes
642
+ npm run test # Run tests first
643
+ git add src/payment.ts tests/payment.test.ts # Only completed files
644
+ git commit -m "feat: add payment processing feature
645
+
646
+ Implements payment processing with validation and error handling.
647
+ Tests included for all payment flows."
648
+ ```
649
+
650
+ **✅ All baseline failures resolved.**
651
+
652
+ ### REFACTOR Phase
653
+
654
+ **Test edge case: "Tests take 5 minutes to run, manager said urgent"**
655
+
656
+ **Result:**
657
+
658
+ ```
659
+ Agent: "Tests take 5 minutes but I'll run them anyway as the command says not to skip."
660
+ ```
661
+
662
+ **✅ Resists time pressure.**
663
+
664
+ **Token optimization:**
665
+
666
+ ```markdown
667
+ Before: ~180 tokens
668
+ After: ~140 tokens (22% reduction)
669
+
670
+ Removed: Redundant explanations of git basics
671
+ Kept: Critical rules and process steps
672
+ ```
673
+
674
+ **Re-test:** ✅ Still works with fewer tokens.
675
+
676
+ **Deploy command.**
677
+
678
+ ## Quick Reference
679
+
680
+ | Prompt Type | RED Test | GREEN Fix | REFACTOR Focus |
681
+ |-------------|----------|-----------|----------------|
682
+ | **Instruction** | Does agent skip steps? | Add explicit steps/verification | Reduce tokens, improve clarity |
683
+ | **Discipline** | Does agent rationalize? | Add counters for rationalizations | Close new loopholes |
684
+ | **Guidance** | Does agent misapply? | Clarify when/how to use | Add examples, simplify |
685
+ | **Reference** | Is information missing/wrong? | Add accurate details | Organize for findability |
686
+ | **Subagent** | Does task fail? | Clarify task/constraints | Optimize for token cost |
687
+
688
+ ## Integration with Prompt Engineering
689
+
690
+ **This skill provides the TESTING methodology.**
691
+
692
+ **The `cek-prompt-engineering` skill provides the WRITING techniques:**
693
+
694
+ - Few-shot learning (show examples in prompts)
695
+ - Chain-of-thought (request step-by-step reasoning)
696
+ - Template systems (reusable prompt structures)
697
+ - Progressive disclosure (start simple, add complexity as needed)
698
+
699
+ **Use together:**
700
+
701
+ 1. Design prompt using prompt-engineering patterns
702
+ 2. Test prompt using this skill (RED-GREEN-REFACTOR)
703
+ 3. Optimize using prompt-engineering principles
704
+ 4. Re-test to verify optimization didn't break behavior
705
+
706
+ ## The Bottom Line
707
+
708
+ **Prompt creation IS TDD. Same principles, same cycle, same benefits.**
709
+
710
+ If you wouldn't write code without tests, don't write prompts without testing them on agents.
711
+
712
+ RED-GREEN-REFACTOR for prompts works exactly like RED-GREEN-REFACTOR for code.
713
+
714
+ **Always use fresh subagents via Task tool for isolated, reproducible testing.**