feed-the-machine 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269)
  1. package/LICENSE +21 -21
  2. package/README.md +170 -170
  3. package/bin/brain.py +1340 -0
  4. package/bin/convert_claude_skills_to_codex.py +490 -0
  5. package/bin/generate-manifest.mjs +463 -463
  6. package/bin/harden_codex_skills.py +141 -0
  7. package/bin/install.mjs +491 -491
  8. package/bin/migrate-eng-buddy-data.py +875 -0
  9. package/bin/playbook_engine/__init__.py +1 -0
  10. package/bin/playbook_engine/conftest.py +8 -0
  11. package/bin/playbook_engine/extractor.py +33 -0
  12. package/bin/playbook_engine/manager.py +102 -0
  13. package/bin/playbook_engine/models.py +84 -0
  14. package/bin/playbook_engine/registry.py +35 -0
  15. package/bin/playbook_engine/test_extractor.py +72 -0
  16. package/bin/playbook_engine/test_integration.py +129 -0
  17. package/bin/playbook_engine/test_manager.py +85 -0
  18. package/bin/playbook_engine/test_models.py +166 -0
  19. package/bin/playbook_engine/test_registry.py +67 -0
  20. package/bin/playbook_engine/test_tracer.py +86 -0
  21. package/bin/playbook_engine/tracer.py +93 -0
  22. package/bin/tasks_db.py +456 -0
  23. package/docs/HOOKS.md +243 -243
  24. package/docs/INBOX.md +233 -233
  25. package/ftm/SKILL.md +125 -122
  26. package/ftm-audit/SKILL.md +623 -623
  27. package/ftm-audit/references/protocols/PROJECT-PATTERNS.md +91 -91
  28. package/ftm-audit/references/protocols/RUNTIME-WIRING.md +66 -66
  29. package/ftm-audit/references/protocols/WIRING-CONTRACTS.md +135 -135
  30. package/ftm-audit/references/strategies/AUTO-FIX-STRATEGIES.md +69 -69
  31. package/ftm-audit/references/templates/REPORT-FORMAT.md +96 -96
  32. package/ftm-audit/scripts/run-knip.sh +23 -23
  33. package/ftm-audit.yml +2 -2
  34. package/ftm-brainstorm/SKILL.md +1003 -498
  35. package/ftm-brainstorm/evals/evals.json +180 -100
  36. package/ftm-brainstorm/evals/promptfoo.yaml +109 -109
  37. package/ftm-brainstorm/references/agent-prompts.md +552 -224
  38. package/ftm-brainstorm/references/plan-template.md +209 -121
  39. package/ftm-brainstorm.yml +2 -2
  40. package/ftm-browse/SKILL.md +454 -454
  41. package/ftm-browse/daemon/browser-manager.ts +206 -206
  42. package/ftm-browse/daemon/bun.lock +30 -30
  43. package/ftm-browse/daemon/cli.ts +347 -347
  44. package/ftm-browse/daemon/commands.ts +410 -410
  45. package/ftm-browse/daemon/main.ts +357 -357
  46. package/ftm-browse/daemon/package.json +17 -17
  47. package/ftm-browse/daemon/server.ts +189 -189
  48. package/ftm-browse/daemon/snapshot.ts +519 -519
  49. package/ftm-browse/daemon/tsconfig.json +22 -22
  50. package/ftm-browse.yml +4 -4
  51. package/ftm-capture/SKILL.md +370 -370
  52. package/ftm-capture.yml +4 -4
  53. package/ftm-codex-gate/SKILL.md +361 -361
  54. package/ftm-codex-gate.yml +2 -2
  55. package/ftm-config/SKILL.md +422 -345
  56. package/ftm-config.default.yml +125 -82
  57. package/ftm-config.yml +44 -2
  58. package/ftm-council/SKILL.md +416 -416
  59. package/ftm-council/references/prompts/CLAUDE-INVESTIGATION.md +60 -60
  60. package/ftm-council/references/prompts/CODEX-INVESTIGATION.md +58 -58
  61. package/ftm-council/references/prompts/GEMINI-INVESTIGATION.md +58 -58
  62. package/ftm-council/references/prompts/REBUTTAL-TEMPLATE.md +57 -57
  63. package/ftm-council/references/protocols/PREREQUISITES.md +47 -47
  64. package/ftm-council/references/protocols/STEP-0-FRAMING.md +46 -46
  65. package/ftm-council.yml +2 -2
  66. package/ftm-dashboard/SKILL.md +163 -163
  67. package/ftm-dashboard.yml +4 -4
  68. package/ftm-debug/SKILL.md +1037 -1037
  69. package/ftm-debug/references/phases/PHASE-0-INTAKE.md +58 -58
  70. package/ftm-debug/references/phases/PHASE-1-TRIAGE.md +46 -46
  71. package/ftm-debug/references/phases/PHASE-2-WAR-ROOM-AGENTS.md +279 -279
  72. package/ftm-debug/references/phases/PHASE-3-TO-6-EXECUTION.md +436 -436
  73. package/ftm-debug/references/protocols/BLACKBOARD.md +86 -86
  74. package/ftm-debug/references/protocols/EDGE-CASES.md +103 -103
  75. package/ftm-debug.yml +2 -2
  76. package/ftm-diagram/SKILL.md +277 -277
  77. package/ftm-diagram.yml +2 -2
  78. package/ftm-executor/SKILL.md +777 -777
  79. package/ftm-executor/references/STYLE-TEMPLATE.md +73 -73
  80. package/ftm-executor/references/phases/PHASE-0-VERIFICATION.md +62 -62
  81. package/ftm-executor/references/phases/PHASE-2-AGENT-ASSEMBLY.md +34 -34
  82. package/ftm-executor/references/phases/PHASE-3-WORKTREES.md +38 -38
  83. package/ftm-executor/references/phases/PHASE-4-5-AUDIT.md +72 -72
  84. package/ftm-executor/references/phases/PHASE-4-DISPATCH.md +66 -66
  85. package/ftm-executor/references/phases/PHASE-5-5-CODEX-GATE.md +73 -73
  86. package/ftm-executor/references/protocols/DOCUMENTATION-BOOTSTRAP.md +36 -36
  87. package/ftm-executor/references/protocols/MODEL-PROFILE.md +59 -59
  88. package/ftm-executor/references/protocols/PROGRESS-TRACKING.md +66 -66
  89. package/ftm-executor/runtime/ftm-runtime.mjs +252 -252
  90. package/ftm-executor/runtime/package.json +8 -8
  91. package/ftm-executor.yml +2 -2
  92. package/ftm-git/SKILL.md +441 -441
  93. package/ftm-git/evals/evals.json +26 -26
  94. package/ftm-git/evals/promptfoo.yaml +75 -75
  95. package/ftm-git/hooks/post-commit-experience.sh +92 -92
  96. package/ftm-git/references/patterns/SECRET-PATTERNS.md +104 -104
  97. package/ftm-git/references/protocols/REMEDIATION.md +139 -139
  98. package/ftm-git/scripts/pre-commit-secrets.sh +110 -110
  99. package/ftm-git.yml +2 -2
  100. package/ftm-inbox/backend/__pycache__/main.cpython-314.pyc +0 -0
  101. package/ftm-inbox/backend/adapters/_retry.py +64 -64
  102. package/ftm-inbox/backend/adapters/base.py +230 -230
  103. package/ftm-inbox/backend/adapters/freshservice.py +104 -104
  104. package/ftm-inbox/backend/adapters/gmail.py +125 -125
  105. package/ftm-inbox/backend/adapters/jira.py +136 -136
  106. package/ftm-inbox/backend/adapters/registry.py +192 -192
  107. package/ftm-inbox/backend/adapters/slack.py +110 -110
  108. package/ftm-inbox/backend/db/connection.py +54 -54
  109. package/ftm-inbox/backend/db/schema.py +78 -78
  110. package/ftm-inbox/backend/executor/__init__.py +7 -7
  111. package/ftm-inbox/backend/executor/engine.py +149 -149
  112. package/ftm-inbox/backend/executor/step_runner.py +98 -98
  113. package/ftm-inbox/backend/main.py +103 -103
  114. package/ftm-inbox/backend/models/__init__.py +1 -1
  115. package/ftm-inbox/backend/models/unified_task.py +36 -36
  116. package/ftm-inbox/backend/planner/__init__.py +6 -6
  117. package/ftm-inbox/backend/planner/__pycache__/__init__.cpython-314.pyc +0 -0
  118. package/ftm-inbox/backend/planner/__pycache__/generator.cpython-314.pyc +0 -0
  119. package/ftm-inbox/backend/planner/__pycache__/schema.cpython-314.pyc +0 -0
  120. package/ftm-inbox/backend/planner/generator.py +127 -127
  121. package/ftm-inbox/backend/planner/schema.py +34 -34
  122. package/ftm-inbox/backend/requirements.txt +5 -5
  123. package/ftm-inbox/backend/routes/__pycache__/plan.cpython-314.pyc +0 -0
  124. package/ftm-inbox/backend/routes/execute.py +186 -186
  125. package/ftm-inbox/backend/routes/health.py +52 -52
  126. package/ftm-inbox/backend/routes/inbox.py +68 -68
  127. package/ftm-inbox/backend/routes/plan.py +271 -271
  128. package/ftm-inbox/bin/launchagent.mjs +91 -91
  129. package/ftm-inbox/bin/setup.mjs +188 -188
  130. package/ftm-inbox/bin/start.sh +10 -10
  131. package/ftm-inbox/bin/status.sh +17 -17
  132. package/ftm-inbox/bin/stop.sh +8 -8
  133. package/ftm-inbox/config.example.yml +55 -55
  134. package/ftm-inbox/package-lock.json +2898 -2898
  135. package/ftm-inbox/package.json +26 -26
  136. package/ftm-inbox/postcss.config.js +6 -6
  137. package/ftm-inbox/src/app.css +199 -199
  138. package/ftm-inbox/src/app.html +18 -18
  139. package/ftm-inbox/src/lib/api.ts +166 -166
  140. package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -81
  141. package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -143
  142. package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -271
  143. package/ftm-inbox/src/lib/components/PlanView.svelte +206 -206
  144. package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -99
  145. package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -190
  146. package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -63
  147. package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -86
  148. package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -106
  149. package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -67
  150. package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -149
  151. package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -80
  152. package/ftm-inbox/src/lib/theme.ts +47 -47
  153. package/ftm-inbox/src/routes/+layout.svelte +76 -76
  154. package/ftm-inbox/src/routes/+page.svelte +401 -401
  155. package/ftm-inbox/svelte.config.js +12 -12
  156. package/ftm-inbox/tailwind.config.ts +63 -63
  157. package/ftm-inbox/tsconfig.json +13 -13
  158. package/ftm-inbox/vite.config.ts +6 -6
  159. package/ftm-intent/SKILL.md +241 -241
  160. package/ftm-intent.yml +2 -2
  161. package/ftm-manifest.json +3794 -3794
  162. package/ftm-map/SKILL.md +291 -291
  163. package/ftm-map/scripts/db.py +712 -712
  164. package/ftm-map/scripts/index.py +415 -415
  165. package/ftm-map/scripts/parser.py +224 -224
  166. package/ftm-map/scripts/queries/go-tags.scm +20 -20
  167. package/ftm-map/scripts/queries/javascript-tags.scm +35 -35
  168. package/ftm-map/scripts/queries/python-tags.scm +31 -31
  169. package/ftm-map/scripts/queries/ruby-tags.scm +19 -19
  170. package/ftm-map/scripts/queries/rust-tags.scm +37 -37
  171. package/ftm-map/scripts/queries/typescript-tags.scm +41 -41
  172. package/ftm-map/scripts/query.py +301 -301
  173. package/ftm-map/scripts/ranker.py +377 -377
  174. package/ftm-map/scripts/requirements.txt +5 -5
  175. package/ftm-map/scripts/setup-hooks.sh +27 -27
  176. package/ftm-map/scripts/setup.sh +56 -56
  177. package/ftm-map/scripts/test_db.py +364 -364
  178. package/ftm-map/scripts/test_parser.py +174 -174
  179. package/ftm-map/scripts/test_query.py +183 -183
  180. package/ftm-map/scripts/test_ranker.py +199 -199
  181. package/ftm-map/scripts/views.py +591 -591
  182. package/ftm-map.yml +2 -2
  183. package/ftm-mind/SKILL.md +201 -1943
  184. package/ftm-mind/evals/promptfoo.yaml +142 -142
  185. package/ftm-mind/references/blackboard-protocol.md +110 -0
  186. package/ftm-mind/references/blackboard-schema.md +328 -328
  187. package/ftm-mind/references/complexity-guide.md +110 -110
  188. package/ftm-mind/references/complexity-sizing.md +138 -0
  189. package/ftm-mind/references/decide-act-protocol.md +172 -0
  190. package/ftm-mind/references/direct-execution.md +51 -0
  191. package/ftm-mind/references/environment-discovery.md +77 -0
  192. package/ftm-mind/references/event-registry.md +319 -319
  193. package/ftm-mind/references/mcp-inventory.md +300 -296
  194. package/ftm-mind/references/ops-routing.md +47 -0
  195. package/ftm-mind/references/orient-protocol.md +234 -0
  196. package/ftm-mind/references/personality.md +40 -0
  197. package/ftm-mind/references/protocols/COMPLEXITY-SIZING.md +72 -72
  198. package/ftm-mind/references/protocols/MCP-HEURISTICS.md +32 -32
  199. package/ftm-mind/references/protocols/PLAN-APPROVAL.md +80 -80
  200. package/ftm-mind/references/reflexion-protocol.md +249 -249
  201. package/ftm-mind/references/routing/SCENARIOS.md +22 -22
  202. package/ftm-mind/references/routing-scenarios.md +35 -35
  203. package/ftm-mind.yml +2 -2
  204. package/ftm-ops.yml +4 -0
  205. package/ftm-pause/SKILL.md +395 -395
  206. package/ftm-pause/references/protocols/SKILL-RESTORE-PROTOCOLS.md +186 -186
  207. package/ftm-pause/references/protocols/VALIDATION.md +80 -80
  208. package/ftm-pause.yml +2 -2
  209. package/ftm-researcher/SKILL.md +275 -275
  210. package/ftm-researcher/evals/agent-diversity.yaml +17 -17
  211. package/ftm-researcher/evals/synthesis-quality.yaml +12 -12
  212. package/ftm-researcher/evals/trigger-accuracy.yaml +39 -39
  213. package/ftm-researcher/references/adaptive-search.md +116 -116
  214. package/ftm-researcher/references/agent-prompts.md +193 -193
  215. package/ftm-researcher/references/council-integration.md +193 -193
  216. package/ftm-researcher/references/output-format.md +203 -203
  217. package/ftm-researcher/references/synthesis-pipeline.md +165 -165
  218. package/ftm-researcher/scripts/score_credibility.py +234 -234
  219. package/ftm-researcher/scripts/validate_research.py +92 -92
  220. package/ftm-researcher.yml +2 -2
  221. package/ftm-resume/SKILL.md +518 -518
  222. package/ftm-resume/references/protocols/VALIDATION.md +172 -172
  223. package/ftm-resume.yml +2 -2
  224. package/ftm-retro/SKILL.md +380 -380
  225. package/ftm-retro/references/protocols/SCORING-RUBRICS.md +89 -89
  226. package/ftm-retro/references/templates/REPORT-FORMAT.md +109 -109
  227. package/ftm-retro.yml +2 -2
  228. package/ftm-routine/SKILL.md +170 -170
  229. package/ftm-routine.yml +4 -4
  230. package/ftm-state/blackboard/capabilities.json +5 -5
  231. package/ftm-state/blackboard/capabilities.schema.json +27 -27
  232. package/ftm-state/blackboard/context.json +37 -23
  233. package/ftm-state/blackboard/experiences/doom-statusline-fix.json +26 -0
  234. package/ftm-state/blackboard/experiences/hackathon-pages-site.json +26 -0
  235. package/ftm-state/blackboard/experiences/hindsight-sso-kickoff.json +42 -0
  236. package/ftm-state/blackboard/experiences/index.json +58 -9
  237. package/ftm-state/blackboard/experiences/learning-ragnarok-api-access.json +23 -0
  238. package/ftm-state/blackboard/experiences/nordlayer-members-auto-assign.json +26 -0
  239. package/ftm-state/blackboard/experiences/saml2aws-stale-session-fix.json +41 -0
  240. package/ftm-state/blackboard/patterns.json +6 -6
  241. package/ftm-state/schemas/context.schema.json +130 -130
  242. package/ftm-state/schemas/experience-index.schema.json +77 -77
  243. package/ftm-state/schemas/experience.schema.json +78 -78
  244. package/ftm-state/schemas/patterns.schema.json +44 -44
  245. package/ftm-upgrade/SKILL.md +194 -194
  246. package/ftm-upgrade/scripts/check-version.sh +76 -76
  247. package/ftm-upgrade/scripts/upgrade.sh +143 -143
  248. package/ftm-upgrade.yml +2 -2
  249. package/ftm-verify.yml +2 -2
  250. package/ftm.yml +2 -2
  251. package/hooks/ftm-auto-log.sh +137 -0
  252. package/hooks/ftm-blackboard-enforcer.sh +93 -93
  253. package/hooks/ftm-discovery-reminder.sh +90 -90
  254. package/hooks/ftm-drafts-gate.sh +61 -61
  255. package/hooks/ftm-event-logger.mjs +107 -107
  256. package/hooks/ftm-install-hooks.sh +240 -0
  257. package/hooks/ftm-learning-capture.sh +117 -0
  258. package/hooks/ftm-map-autodetect.sh +79 -79
  259. package/hooks/ftm-pending-sync-check.sh +22 -22
  260. package/hooks/ftm-plan-gate.sh +92 -92
  261. package/hooks/ftm-post-commit-trigger.sh +57 -57
  262. package/hooks/ftm-post-compaction.sh +138 -0
  263. package/hooks/ftm-pre-compaction.sh +147 -0
  264. package/hooks/ftm-session-end.sh +52 -0
  265. package/hooks/ftm-session-snapshot.sh +213 -0
  266. package/hooks/settings-template.json +81 -81
  267. package/install.sh +363 -363
  268. package/package.json +84 -84
  269. package/uninstall.sh +25 -25
@@ -1,1037 +1,1037 @@
1
- ---
2
- name: ftm-debug
3
- description: Deep multi-vector debugging war room that launches parallel agent teams to instrument, research, reproduce, hypothesize, solve, and verify tricky bugs. Use when a bug is stubborn, multi-turn debugging hasn't worked, the user says "debug this deeply", "war room this", "I can't figure out why", "this is driving me crazy", "launch the debug team", or any situation where standard debugging is insufficient. Also triggers on "/ftm-debug". Covers any codebase — frontend, backend, CLI tools, native apps, build systems, anything. Do NOT use for simple one-step fixes — this is the heavy artillery for problems that resist normal debugging.
4
- ---
5
-
6
- ## Events
7
-
8
- ### Emits
9
- - `bug_fixed` — when the Reviewer agent approves a fix and the bug is confirmed resolved
10
- - `issue_found` — when investigation surfaces a specific problem (hypothesis confirmed, instrumentation reveals root cause)
11
- - `test_passed` — when the reproduction test passes after a fix, or when the full suite passes post-fix
12
- - `test_failed` — when the reproduction test fails, or when a fix attempt causes regressions
13
- - `error_encountered` — when an unexpected error halts the war room workflow (agent failure, unrecoverable blocker)
14
- - `task_completed` — when the debug session concludes with an approved and merged fix
15
-
16
- ### Listens To
17
- - `test_failed` — auto-investigate: launch Phase 0 intake and deploy the war room agent team
18
- - `error_encountered` — diagnose the error: run codebase reconnaissance and begin targeted investigation
19
-
20
- ## Blackboard Read
21
-
22
- Before starting, load context from the blackboard:
23
-
24
- 1. Read `~/.claude/ftm-state/blackboard/context.json` — check current_task, recent_decisions, active_constraints
25
- 2. Read `~/.claude/ftm-state/blackboard/experiences/index.json` — filter entries by task_type="bug" and tags matching the current error domain
26
- 3. Load the top 3-5 matching experience files to find known fixes and failed approaches
27
- 4. Read `~/.claude/ftm-state/blackboard/patterns.json` — check recurring_issues for matching symptoms and codebase_insights for relevant file patterns
28
-
29
- If index.json is empty or no matches are found, proceed normally without experience-informed shortcuts.
30
-
31
- # Debug War Room
32
-
33
- Multi-vector deep debugging with parallel agent teams. When a bug resists normal debugging — you've tried the obvious, poked at it for multiple turns, and it's still not yielding — this skill escalates to a coordinated investigation across every angle simultaneously: instrumentation, research, reproduction, hypothesis, fix, and verification.
34
-
35
- ## Why This Exists
36
-
37
- Hard bugs are hard because they hide across multiple dimensions. The symptom is in one place, the cause is in another, and the fix requires understanding both plus the invisible interactions between them. Single-threaded debugging (try a thing, see if it works, try another thing) is too slow and too narrow. The war room attacks from every direction at once:
38
-
39
- - **Instrumentation** catches what you can't see — timing, state transitions, render cycles, race conditions
40
- - **Research** discovers that someone else hit this exact problem 18 months ago and documented the fix on a GitHub issue
41
- - **Reproduction** isolates the bug from the noise so you can see it clearly
42
- - **Hypothesis** maps the code paths and forms theories before touching anything
43
- - **Solving** happens in isolated worktrees so every attempt is a clean experiment you can keep or discard
44
- - **Review** catches the fix that fixes the bug but breaks three other things
45
-
46
- The combination is what makes it powerful. Each vector informs the others — the researcher finds a pattern, the hypothesizer uses it, the solver implements against it, the reviewer validates it holds.
47
-
48
- ## Core Principle: Automate Everything Before Involving the User
49
-
50
- The entire point of the war room is that **agents do the work**. Every verification step, every test run, every log check, every "does it actually work?" confirmation must be performed by an agent before presenting results to the user. The user should receive a **verified, working result** — not a list of manual steps to try.
51
-
52
- This means:
53
- - If you can run a command to check if the fix works, **run it**. Don't tell the user to run it.
54
- - If you can open a new terminal/process, read logs, check output, inspect state — **do it**.
55
- - If you can write and execute a test script — **do it**.
56
- - If the verification requires launching the application, reading its output, checking logs, inspecting files — **the Reviewer agent does all of this**.
57
- - If the bug has a visual/rendering component, **the Reviewer must visually verify** using Playwright, screenshots, AppleScript, or process output capture. Tests passing is not enough — the Reviewer must confirm the user will actually see the correct result.
58
- - The user's only job is to confirm the final result after all automated verification has passed. Even then, present what you verified so they can trust the result without re-running everything.
59
-
60
- **Critical**: "All tests pass" is necessary but NOT sufficient. Tests verify code paths and logic. They do NOT verify that the feature actually works as experienced by a user. A function can return the right value in a test but never get called in the real app. A rendered component can pass snapshot tests but be invisible due to CSS. A config change can pass validation but never get loaded at runtime. The Reviewer must verify the actual runtime/visual result, not just test results. If 103 tests pass but the feature is still broken, the Reviewer failed.
61
-
62
- If an agent produces a "How to Verify" section with manual steps, that's a failure of the process. Convert those steps into automated verification that the Reviewer executes.
63
-
64
- ## The Process
65
-
66
- ### Phase 0: Problem Intake
67
-
68
- Before launching agents, understand what you're debugging. This happens in the main conversation thread — no agents yet.
69
-
70
- #### Step 1: Gather the Problem Statement
71
-
72
- If the user hasn't already described the bug in detail, ask targeted questions (one at a time, skip what you already know from conversation history):
73
-
74
- 1. **What's happening?** — The symptom. What does the user see/experience?
75
- 2. **What should be happening?** — The expected behavior.
76
- 3. **What have you already tried?** — Critical context. Don't duplicate wasted work.
77
- 4. **When did it start?** — A recent change? Always been broken? Intermittent?
78
- 5. **Can you trigger it reliably?** — Reproduction steps if they exist.
79
-
80
- #### Step 2: Codebase Reconnaissance
81
-
82
- Spawn an **Explore agent** to scan the relevant area of the codebase:
83
-
84
- ```
85
- Analyze the codebase around the reported problem area:
86
-
87
- 1. **Entry points**: What are the main files involved in this feature/behavior?
88
- 2. **Call graph**: Trace the execution path from trigger to symptom
89
- 3. **State flow**: What state (variables, stores, databases, caches) does this code touch?
90
- 4. **Dependencies**: What external libs, APIs, or services are in the path?
91
- 5. **Recent changes**: Check git log for recent modifications to relevant files
92
- 6. **Test coverage**: Are there existing tests for this code path? Do they pass?
93
- 7. **Configuration**: Environment variables, feature flags, build config that affect behavior
94
- 8. **Error handling**: Where does error handling exist? Where is it missing?
95
-
96
- Focus on the area described by the user. Map the territory before anyone tries to change it.
97
- ```
98
-
99
- Store the result as **codebase context**. Every subsequent agent receives this.
100
-
101
- #### Step 3: Formulate the Investigation Plan
102
-
103
- Based on the problem statement and codebase context, decide:
104
-
105
- 1. **Which debug vectors are relevant?** Not every bug needs all 7 agents. A pure logic bug doesn't need instrumentation. A well-documented API issue might not need research. Pick what helps.
106
- 2. **What specific questions should each agent answer?** Generic "go investigate" prompts produce generic results. Targeted questions produce answers.
107
- 3. **What's the most likely root cause category?** (Race condition? State corruption? API contract mismatch? Build/config issue? Logic error? Missing error handling?) This focuses the investigation.
108
-
109
- Present the investigation plan to the user:
110
-
111
- ```
112
- Investigation Plan:
113
- Problem: [one-line summary]
114
- Likely category: [race condition / state bug / API mismatch / etc.]
115
- Agents deploying:
116
- - Instrumenter: [what they'll instrument and why]
117
- - Researcher: [what they'll search for]
118
- - Reproducer: [reproduction strategy]
119
- - Hypothesizer: [which code paths they'll analyze]
120
- Worktree strategy: [how many worktrees, branch naming]
121
- ```
122
-
123
- Then proceed immediately unless the user objects.
124
-
125
- ---
126
-
127
- ### Phase 1: Parallel Investigation (the war room)
128
-
129
- Launch all investigation agents **simultaneously**. This is the core value — attacking from every angle at once.
130
-
131
- #### Agent: Instrumenter
132
-
133
- The Instrumenter adds comprehensive debug logging and observability to the problem area. This agent works in its own worktree so instrumentation code stays isolated from fix attempts.
134
-
135
- ```
136
- You are the Instrumenter in a debug war room. Your job is to add debug
137
- logging and observability so the team can SEE what's happening at runtime.
138
-
139
- Working directory: [worktree path]
140
- Problem: [problem statement]
141
- Codebase context: [from Phase 0]
142
- Likely root cause category: [from investigation plan]
143
-
144
- ## What to Instrument
145
-
146
- Add logging that captures the invisible. Think about what data would let
147
- you diagnose this bug if you could only read a log file:
148
-
149
- ### State Snapshots
150
- - Capture the full state at key decision points (before/after transforms,
151
- at branch conditions, before API calls)
152
- - Log both the input AND output of any function in the suspect path
153
- - For UI bugs: capture render state, props, computed values
154
- - For API bugs: capture request + response bodies + headers + timing
155
- - For state management bugs: capture state before and after mutations
156
-
157
- ### Timing & Sequencing
158
- - Add timestamps to every log entry (use high-resolution: performance.now()
159
- or process.hrtime() depending on environment)
160
- - Log entry and exit of key functions to see execution order
161
- - For async code: log when promises are created, resolved, rejected
162
- - For event-driven code: log event emission and handler invocation
163
-
164
- ### Environment & Configuration
165
- - Log all relevant env vars, feature flags, config values at startup
166
- - Log platform/runtime details (versions, OS, screen size for UI bugs)
167
- - Capture the state of any caches, memoization, or lazy-loaded resources
168
-
169
- ### Error Boundaries
170
- - Wrap suspect code in try/catch (if not already) and log caught errors
171
- with full stack traces
172
- - Add error event listeners where appropriate
173
- - Log warnings that might be swallowed silently
174
-
175
- ## Output Format
176
-
177
- 1. Make all changes in the worktree and commit them
178
- 2. Write a file called `DEBUG-INSTRUMENTATION.md` documenting:
179
- - Every log point added and what it captures
180
- - How to enable/trigger the logging (env vars, flags, etc.)
181
- - How to read the output (log file locations, format explanation)
182
- - A suggested test script to exercise the instrumented code paths
183
- 3. If the problem has a UI component, add visual debug indicators too
184
- (border highlights, state dumps in dev tools, overlay panels)
185
-
186
- ## Key Principle
187
-
188
- Instrument generously. It's cheap to add logging and expensive to guess.
189
- The cost of too much logging is scrolling; the cost of too little is
190
- another round of debugging. When in doubt, log it.
191
- ```
192
-
193
- #### Agent: Researcher
194
-
195
- The Researcher searches for existing solutions — someone else has probably hit this exact bug or something like it.
196
-
197
- ```
198
- You are the Researcher in a debug war room. Your job is to find out if
199
- this problem has been solved before, what patterns others used, and what
200
- pitfalls to avoid.
201
-
202
- Problem: [problem statement]
203
- Codebase context: [from Phase 0]
204
- Tech stack: [languages, frameworks, key dependencies from Phase 0]
205
- Likely root cause category: [from investigation plan]
206
-
207
- ## Research Vectors (search all of these)
208
-
209
- ### 1. GitHub Issues & Discussions
210
- Search the GitHub repos of every dependency in the problem path:
211
- - Search for keywords from the error message or symptom
212
- - Search for the function/class names involved
213
- - Check closed issues — the fix might already exist in a newer version
214
- - Check open issues — this might be a known unfixed bug
215
-
216
- ### 2. Stack Overflow & Forums
217
- Search for:
218
- - The exact error message (in quotes)
219
- - The symptom described in plain language + framework name
220
- - The specific API or function that's misbehaving
221
-
222
- ### 3. Library Documentation
223
- Use Context7 or official docs to check:
224
- - Are we using the API correctly? Check current docs, not cached knowledge
225
- - Are there known caveats, migration notes, or breaking changes?
226
- - Is there a recommended pattern we're not following?
227
-
228
- ### 4. Blog Posts & Technical Articles
229
- Search for:
230
- - "[framework] + [symptom]" — e.g., "React useEffect infinite loop"
231
- - "[library] + [error category]" — e.g., "webpack ESM require crash"
232
- - "[pattern] + debugging" — e.g., "WebSocket reconnection race condition"
233
-
234
- ### 5. Release Notes & Changelogs
235
- Check if a recent dependency update introduced the issue:
236
- - Compare the installed version vs latest, check changelog between them
237
- - Look for deprecation notices that match our usage pattern
238
-
239
- ## Output Format
240
-
241
- Write a file called `RESEARCH-FINDINGS.md` with:
242
-
243
- For each relevant finding:
244
- - **Source**: URL or reference
245
- - **Relevance**: Why this applies to our problem (1-2 sentences)
246
- - **Solution found**: What fix/workaround was used (if any)
247
- - **Confidence**: How closely this matches our situation (high/medium/low)
248
- - **Key insight**: The non-obvious thing we should know
249
-
250
- End with a **Recommended approach** section that synthesizes the most
251
- promising leads into an actionable suggestion.
252
-
253
- ## Key Principle
254
-
255
- Cast a wide net, then filter ruthlessly. The goal is not 50 vaguely
256
- related links — it's 3-5 findings that directly inform the fix. Quality
257
- of relevance over quantity of results.
258
- ```
259
-
260
- #### Agent: Reproducer
261
-
262
- The Reproducer creates a minimal, reliable way to trigger the bug.
263
-
264
- ```
265
- You are the Reproducer in a debug war room. Your job is to create the
266
- simplest possible reproduction of the bug — ideally an automated test
267
- that fails, or a script that triggers the symptom reliably.
268
-
269
- Working directory: [worktree path]
270
- Problem: [problem statement]
271
- Codebase context: [from Phase 0]
272
- Reproduction steps from user: [if any]
273
-
274
- ## Reproduction Strategy
275
-
276
- ### 1. Verify the User's Steps
277
- If the user provided reproduction steps, follow them exactly first.
278
- Document whether the bug appears consistently or intermittently.
279
-
280
- ### 2. Write a Failing Test
281
- The gold standard is a test that:
282
- - Fails now (reproduces the bug)
283
- - Will pass when the bug is fixed
284
- - Runs in the project's existing test framework
285
-
286
- If the bug is in a function: write a unit test with the inputs that
287
- trigger the failure.
288
-
289
- If the bug is in a flow: write an integration test that exercises the
290
- full path.
291
-
292
- If the bug requires a running server/UI: write a script that automates
293
- the trigger (curl commands, Playwright script, CLI invocation, etc.)
294
-
295
- ### 3. Minimize
296
- Strip away everything that isn't necessary to trigger the bug:
297
- - Remove unrelated setup steps
298
- - Use the simplest possible inputs
299
- - Isolate the exact conditions (timing, data shape, config values)
300
-
301
- ### 4. Characterize
302
- Once you can reproduce it, characterize the boundaries:
303
- - What inputs trigger it? What inputs don't?
304
- - Is it timing-dependent? Data-dependent? Config-dependent?
305
- - Does it happen on first run only, every run, or intermittently?
306
- - What's the smallest change that makes it go away?
307
-
308
- ## Output Format
309
-
310
- 1. Commit all reproduction artifacts to the worktree
311
- 2. Write a file called `REPRODUCTION.md` documenting:
312
- - **Trigger command**: The single command to reproduce the bug
313
- - **Expected vs actual**: What should happen vs what does happen
314
- - **Consistency**: How reliably it reproduces (every time / 8 out of 10 / etc.)
315
- - **Boundaries**: What makes it appear/disappear
316
- - **Minimal test**: Path to the failing test file
317
- - **Environment requirements**: Any special setup needed
318
-
319
- ## Key Principle
320
-
321
- A bug you can't reproduce is a bug you can't fix with confidence. And a
322
- bug you can reproduce with a single command is a bug you can fix in
323
- minutes. The reproduction IS the debugging.
324
- ```
325
-
326
- #### Agent: Hypothesizer
327
-
328
- The Hypothesizer reads the code deeply and forms theories about root cause.
329
-
330
- ```
331
- You are the Hypothesizer in a debug war room. Your job is to deeply read
332
- the code involved in the bug, trace every execution path, and form
333
- ranked hypotheses about what's causing the problem.
334
-
335
- Problem: [problem statement]
336
- Codebase context: [from Phase 0]
337
- Likely root cause category: [from investigation plan]
338
-
339
- ## Analysis Method
340
-
341
- ### 1. Trace the Execution Path
342
- Starting from the user's trigger action, trace through every function
343
- call, state mutation, and branch condition until you reach the symptom.
344
- Document the full chain.
345
-
346
- ### 2. Identify Suspect Points
347
- At each step in the chain, evaluate:
348
- - Could this function receive unexpected input?
349
- - Could this state be in an unexpected shape?
350
- - Could this condition evaluate differently than intended?
351
- - Is there a timing assumption (X happens before Y)?
352
- - Is there an implicit dependency (this works because that was set up earlier)?
353
- - Is error handling missing or swallowing relevant errors?
354
-
355
- ### 3. Form Hypotheses
356
- For each suspect point, write a hypothesis:
357
- - **What**: "The bug occurs because X"
358
- - **Why**: "Because when [condition], the code at [file:line] does [thing]
359
- instead of [expected thing]"
360
- - **Evidence for**: What supports this theory
361
- - **Evidence against**: What contradicts this theory
362
- - **How to verify**: What specific test or log would prove/disprove this
363
-
364
- ### 4. Rank by Likelihood
365
- Order hypotheses from most to least likely based on:
366
- - How much evidence supports each one
367
- - How well it explains ALL symptoms (not just some)
368
- - Whether it aligns with the root cause category
369
- - Occam's razor — simpler explanations first
370
-
371
- ## Output Format
372
-
373
- Write a file called `HYPOTHESES.md` with:
374
-
375
- ### Hypothesis 1 (most likely): [title]
376
- - **Claim**: [one sentence]
377
- - **Mechanism**: [detailed explanation of how the bug occurs]
378
- - **Code path**: [file:line] -> [file:line] -> [file:line]
379
- - **Evidence for**: [what supports this]
380
- - **Evidence against**: [what contradicts this]
381
- - **Verification**: [how to prove/disprove]
382
- - **Suggested fix**: [high-level approach]
383
-
384
- [repeat for each hypothesis, ranked]
385
-
386
- ### Summary
387
- - Top 3 hypotheses with confidence levels
388
- - Recommended investigation order
389
- - What additional data would help distinguish between hypotheses
390
-
391
- ## Key Principle
392
-
393
- Don't jump to conclusions. The first plausible explanation is often
394
- wrong — it's the one you already thought of that didn't pan out. Trace
395
- the actual code, don't assume. Read every line in the path. The bug is
396
- in the code, and the code is right there to be read.
397
- ```
398
-
399
- ---
400
-
401
- ### Phase 2: Synthesis & Solve
402
-
403
- After all Phase 1 agents complete, synthesize their findings before solving.
404
-
405
- #### Step 1: Cross-Reference Findings
406
-
407
- Read all four reports and synthesize:
408
-
409
- 1. **Do the hypotheses match the research?** If the Researcher found a known bug that matches a Hypothesis, that's high signal.
410
- 2. **Does the reproduction confirm a hypothesis?** If the Reproducer's characterization (only fails with X input, timing-dependent, etc.) matches a hypothesis's prediction, that's strong evidence.
411
- 3. **What does the instrumentation suggest?** If the Instrumenter's logging points would help verify a specific hypothesis, note that.
412
- 4. **Are there contradictions?** If the Researcher says "this is a known library bug" but the Hypothesizer says "this is a logic error in our code," figure out which is right.
413
-
414
- Present the synthesis to the user briefly:
415
-
416
- ```
417
- War Room Findings:
418
- Researcher: [key finding]
419
- Reproducer: [reproduction status + characterization]
420
- Hypothesizer: [top hypothesis]
421
- Instrumenter: [logging added, key observation points]
422
-
423
- Cross-reference: [how findings align or conflict]
424
- Recommended fix approach: [what to try first]
425
-
426
- Proceeding to solve in isolated worktree.
427
- ```
428
-
429
- #### Step 2: Solve (in worktrees)
430
-
431
- Launch the **Solver agent** in a fresh worktree. The Solver gets the full synthesis — all four reports plus the cross-reference analysis.
432
-
433
- ```
434
- You are the Solver in a debug war room. The investigation team has
435
- completed their analysis and you now have comprehensive context. Your
436
- job is to implement the fix.
437
-
438
- Working directory: [worktree path]
439
- Problem: [problem statement]
440
- Codebase context: [from Phase 0]
441
-
442
- ## Investigation Results
443
-
444
- [paste full synthesis: Research findings, Reproduction results,
445
- Hypotheses ranked, Instrumentation notes, Cross-reference analysis]
446
-
447
- ## Execution Rules
448
-
449
- ### Work Incrementally
450
- - Start with the highest-ranked hypothesis
451
- - Implement the minimal fix that addresses it
452
- - COMMIT after each discrete change (not one big commit at the end)
453
- - Use clear commit messages: "Fix: [what] — addresses hypothesis [N]"
454
-
455
- ### Verify as You Go
456
- - After each fix attempt, run the reproduction test from REPRODUCTION.md
457
- - If the project has existing tests, run them too (zero broken windows)
458
- - If the fix works on the reproduction but breaks other tests, that's
459
- not done — fix the regressions too
460
-
461
- ### If the First Hypothesis Doesn't Pan Out
462
- - Don't keep hacking at it. Move to hypothesis #2.
463
- - Revert the failed attempt (git revert or fresh branch) so each
464
- attempt starts clean
465
- - If you exhaust all hypotheses, say so — don't invent new ones
466
- without evidence
467
-
468
- ### Clean Up After Yourself
469
- - Remove any debug logging you added (unless the user wants to keep it)
470
- - Make sure the fix is minimal — don't refactor surrounding code
471
- - Don't add "just in case" error handling beyond what the fix requires
472
-
473
- ### Do NOT Declare Victory
474
- - You are the Solver, not the Reviewer. Your job ends at "fix committed."
475
- - Do NOT tell the user "restart X to see the change" — that's the
476
- Reviewer's job (and the Reviewer must do it, not the user)
477
- - Do NOT present results directly to the user — hand off to the
478
- Reviewer agent via FIX-SUMMARY.md
479
- - Do NOT say the fix works unless you have actually verified it
480
- by running it. "The code looks correct" is not verification.
481
-
482
- ## Output Format
483
-
484
- 1. All changes committed in the worktree with descriptive messages
485
- 2. Write a file called `FIX-SUMMARY.md` documenting:
486
- - **Root cause**: What was actually wrong (one paragraph)
487
- - **Fix applied**: What you changed and why
488
- - **Files modified**: List with brief descriptions
489
- - **Commits**: List of commit hashes with messages
490
- - **Verification**: What tests you ran and their results
491
- - **Requires restart**: YES/NO — does the fix require restarting
492
- a process, reloading config, or rebuilding to take effect?
493
- - **Visual component**: YES/NO — does this bug have a visual or
494
- experiential symptom that needs visual verification?
495
- - **Remaining concerns**: Anything that should be monitored or
496
- might need follow-up
497
- ```
498
-
499
- ---
500
-
501
- ### Phase 3: Review & Verify
502
-
503
- **HARD GATE — You cannot proceed to Phase 4 without completing this phase.**
504
-
505
- This is non-negotiable. You cannot present results to the user until a
506
- Reviewer has independently verified the fix. "I checked with grep" is not
507
- verification. "The tests pass" is not verification. "The patch was applied"
508
- is not verification.
509
-
510
- Verification means: **the actual behavior the user reported as broken now
511
- works correctly, as observed by an agent, with captured evidence.**
512
-
513
- #### Step 1: Determine verification method BEFORE launching the Reviewer
514
-
515
- Look at the original bug report. Ask: "How would a human know this is fixed?"
516
-
517
- - If the answer involves SEEING something (UI, terminal output, rendered
518
- image, visual layout) → the Reviewer MUST capture a screenshot or
519
- visual evidence. Use `screencapture`, Playwright `browser_take_screenshot`,
520
- or process output capture.
521
- - If the answer involves a BEHAVIOR (API returns correct data, CLI produces
522
- right output, server responds correctly) → the Reviewer MUST exercise
523
- that behavior and capture the output.
524
- - If the answer is "the error stops happening" → the Reviewer MUST trigger
525
- the scenario that caused the error and confirm it no longer occurs.
526
-
527
- The verification method goes into the Reviewer's prompt. Don't let the
528
- Reviewer decide — tell it exactly what to verify and how.
529
-
530
- #### Step 2: If the fix requires a restart, the Reviewer handles it
531
-
532
- Many fixes (bundle patches, config changes, build artifacts) require
533
- restarting a process to take effect. The Reviewer must:
534
-
535
- 1. Restart the process (use `osascript` to launch in a new terminal if
536
- needed, or kill and restart the background process)
537
- 2. Wait for it to initialize
538
- 3. Exercise the fixed behavior
539
- 4. Capture evidence (screenshot, output, logs)
540
-
541
- If the Reviewer literally cannot restart because it's running inside the
542
- process being fixed (e.g., debugging Claude Code from within Claude Code),
543
- try these alternatives first:
544
-
545
- 1. **Launch a SEPARATE instance** via osascript/terminal:
546
- ```bash
547
- osascript -e 'tell application "Terminal" to do script "cd /path && claude --print \"hello\""'
548
- sleep 5
549
- screencapture -x /tmp/verification.png
550
- ```
551
- Then READ the screenshot to verify.
552
-
553
- 2. **Launch via background process** and capture output:
554
- ```bash
555
- nohup claude --print "test" > /tmp/claude-output.txt 2>&1 &
556
- sleep 5
557
- cat /tmp/claude-output.txt
558
- ```
559
-
560
- 3. **Use Playwright MCP** if available to screenshot a running instance.
561
-
562
- Only if ALL of these are impossible should you flag as BLOCKED. In that
563
- case, tell the user exactly what to look for, why you couldn't verify it
564
- yourself, and what the expected visual result should be (with specifics,
565
- not "check if it works").
566
-
567
- #### Step 3: Launch the Reviewer agent
568
-
569
- After the Solver completes, launch the **Reviewer agent** to validate the fix independently.
570
-
571
- ```
572
- You are the Reviewer in a debug war room. The Solver has implemented a
573
- fix and your job is to verify it actually works, doesn't break anything
574
- else, and is the right approach.
575
-
576
- Working directory: [solver's worktree path]
577
- Problem: [original problem statement]
578
- Fix summary: [from FIX-SUMMARY.md]
579
- Reproduction: [from REPRODUCTION.md]
580
-
581
- ## Review Checklist
582
-
583
- ### 1. Does the Fix Address the Root Cause?
584
- - Read the fix diff carefully
585
- - Does it fix the actual root cause, or just mask the symptom?
586
- - Could the same bug recur in a different form?
587
- - Is the fix in the right layer of abstraction?
588
-
589
- ### 2. Reproduction Verification (YOU MUST RUN THESE — do not list them for the user)
590
- - EXECUTE the reproduction test — it should PASS now
591
- - Run it multiple times if the bug was intermittent
592
- - Try variations of the reproduction (different inputs, timing, config)
593
- - Capture the actual output/logs as evidence
594
-
595
- ### 3. Regression Check (YOU MUST RUN THESE)
596
- - EXECUTE the full test suite and capture results
597
- - EXECUTE linting and type checking
598
- - EXECUTE any build steps and verify success
599
- - If the fix involves a running process (server, CLI tool, UI):
600
- launch it, exercise the fixed behavior, check logs, and capture
601
- evidence that it works
602
-
603
- ### 4. Live Verification (critical — tests passing is NECESSARY but NOT SUFFICIENT)
604
-
605
- Tests verify code structure. Live verification proves the feature actually
606
- works as experienced by a user. Many bugs exist in the gap between "all
607
- tests pass" and "it actually works." Your job is to close that gap.
608
-
609
- **Why this matters**: A test can assert that a function returns the right
610
- value, but that doesn't prove the function gets called, its output reaches
611
- the renderer, the renderer handles it correctly, and the user sees the
612
- expected result. Each layer can silently fail while tests pass.
613
-
614
- #### Automated Runtime Verification (always do these)
615
- - If the fix involves a server/process: START it, EXERCISE the fixed
616
- behavior via curl/CLI/API calls, READ stdout/stderr, CAPTURE evidence
617
- - If the fix involves CLI output: RUN the command, CAPTURE the output,
618
- COMPARE against expected output
619
- - If the fix involves log output: RUN the code, READ the log file,
620
- CONFIRM expected entries appear
621
- - If the fix involves a build: RUN the build, VERIFY the output artifact
622
- exists and contains expected content (grep/inspect the built files)
623
- - If the fix involves configuration: LOAD the config, VERIFY the values
624
- propagate to where they're used at runtime (not just that the config
625
- file is correct)
626
-
627
- #### Visual/Runtime Verification (when the bug has a visual or interactive component)
628
-
629
- Some bugs only manifest visually — terminal rendering, UI display, image
630
- output, interactive behavior. Tests can't catch these. You must verify
631
- the actual rendered result.
632
-
633
- **Techniques for visual verification:**
634
-
635
- 1. **Playwright/browser automation**: For web UIs, launch Playwright,
636
- navigate to the page, take a screenshot, and inspect the DOM. Check
637
- that elements are visible, correctly positioned, and contain expected
638
- content. This catches CSS bugs, rendering issues, and layout breaks
639
- that pass all unit tests.
640
-
641
- 2. **AppleScript + screenshot** (macOS): For native apps, CLI tools with
642
- visual output, or terminal-rendered content:
643
- ```
644
- # Launch the application via AppleScript
645
- osascript -e 'tell application "Terminal" to do script "your-command"'
646
- # Wait for it to render, then capture
647
- screencapture -x /tmp/verification-screenshot.png
648
- ```
649
- Then read the screenshot to verify the visual result.
650
-
651
- 3. **Process output capture**: For CLI tools and terminal UIs, run the
652
- command with output capture (script command, tee, or redirect) and
653
- inspect the raw output including ANSI codes, escape sequences, and
654
- control characters that affect rendering.
655
-
656
- 4. **Playwright for Electron/web-based tools**: Many modern tools
657
- (VS Code extensions, Electron apps, web dashboards) can be automated
658
- with Playwright. Use `browser_navigate`, `browser_snapshot`, and
659
- `browser_take_screenshot` to verify rendered state.
660
-
661
- 5. **ftm-browse ($PB) for UI verification**: If ftm-browse is
662
- installed, use it for visual verification of web UI bugs. First check
663
- whether the binary exists:
664
- ```bash
665
- PB="$HOME/.claude/skills/ftm-browse/bin/ftm-browse"
666
- ```
667
- If the binary exists at that path, use it:
668
- - **Navigate**: `$PB goto <url>` — open the affected page
669
- - **Before screenshot**: `$PB screenshot --path /tmp/debug-before.png`
669
- (capture the pre-fix state if you need a before/after
670
- comparison — take it before the fix is applied, or from
671
- a pre-fix worktree)
673
- - **After screenshot**: `$PB screenshot --path /tmp/debug-after.png`
674
- (capture state AFTER fix is applied and running)
675
- - **DOM inspection**: `$PB snapshot -i` — get the interactive ARIA
676
- tree to verify element existence, visibility, and state
677
- (e.g., confirm a button is now visible, a panel is collapsed,
678
- an error message is gone)
679
- - Report both screenshot paths in REVIEW-VERDICT.md so the user
680
- can compare before/after visually.
681
-
682
- **Graceful fallback**: If the binary does NOT exist at
683
- `$HOME/.claude/skills/ftm-browse/bin/ftm-browse`, fall back to
684
- test-only and other available verification methods (Playwright, etc.).
685
- Do NOT fail the review. Record in the Verification Gate section:
686
- "Visual verification skipped — ftm-browse not installed."
687
-
688
- **When to use visual verification:**
689
- - Terminal rendering (status lines, TUI elements, colored output, unicode)
690
- - Web UI changes (layout, styling, visibility, interaction)
691
- - Image/PDF/document generation (verify output visually, not just file size)
692
- - Any bug where "it looks wrong" was part of the symptom
693
- - Any fix where tests pass but you're not 100% confident the user will
694
- see the correct result
695
-
696
- **The rule**: If the bug was reported as something the user SAW (or didn't
697
- see), verification must confirm what the user will SEE (or will now see).
698
- Passing tests are evidence, not proof. Visual confirmation is proof.
699
-
700
- #### Never Do This
701
- - NEVER write "How to verify: run X" — instead, RUN X yourself and
702
- report what happened
703
- - NEVER say "restart the app to see the change" — restart it yourself,
704
- observe the result, report back
705
- - NEVER assume tests passing = feature working. Tests verify code paths.
706
- Live verification proves the feature delivers its intended experience.
707
-
708
- ### 5. Code Quality
709
- - Is the fix minimal and focused?
710
- - Does it follow the project's existing patterns?
711
- - Are there edge cases the fix doesn't handle?
712
- - Is error handling appropriate (not excessive, not missing)?
713
-
714
- ### 6. Observability
715
- - Will this failure mode be visible if it happens again?
716
- - Should any permanent logging or monitoring be added?
717
- - Are there metrics or alerts that should be updated?
718
-
719
- ## Mandatory Verification Gate
720
-
721
- Before writing the verdict, answer these two questions:
722
-
723
- **Q1: Was the bug reported as something visual/experiential?**
724
- (Did the user say "it doesn't show up", "it looks wrong", "the UI is broken",
725
- "nothing happens when I click", "the output is garbled", etc.)
726
-
727
- If YES → Visual verification is REQUIRED. You cannot approve without
728
- capturing a screenshot, reading rendered output, or observing the
729
- running application. Grep checks and log analysis are not sufficient.
730
-
731
- If NO → Automated runtime verification (running tests, checking output)
732
- is sufficient.
733
-
734
- **Q2: Does the fix require restarting a process to take effect?**
735
- (Patching a bundle, changing config loaded at startup, modifying
736
- compiled artifacts, etc.)
737
-
738
- If YES → YOU must restart the process, observe the result, and capture
739
- evidence. Do not tell the user to restart — do it yourself:
740
- ```
741
- # Example: restart a CLI tool and capture its output
742
- osascript -e 'tell application "Terminal" to do script "cd /path && your-command"'
743
- sleep 3
744
- screencapture -x /tmp/verification-screenshot.png
745
- # Then READ the screenshot to verify
746
- ```
747
-
748
- If you cannot restart the process (e.g., it's the very tool you're
749
- running inside), this is one of the rare legitimate cases to ask the
750
- user — but you MUST say what specific thing to look for and why you
751
- couldn't verify it yourself.
752
-
753
- ## Output Format
754
-
755
- Write a file called `REVIEW-VERDICT.md` with:
756
-
757
- ### Verdict: [APPROVED / APPROVED WITH CHANGES / NEEDS REWORK]
758
-
759
- ### Verification Gate
760
- - Bug is visual/experiential: [YES/NO]
761
- - Fix requires process restart: [YES/NO]
762
- - Visual verification performed: [YES — describe what was captured / NO — explain why not required / BLOCKED — explain why agent couldn't do it]
763
-
764
- ### Fix Verification
765
- - Reproduction test: [PASS/FAIL — actual output]
766
- - Full test suite: [PASS/FAIL with details]
767
- - Build: [PASS/FAIL]
768
- - Lint/typecheck: [PASS/FAIL]
769
- - Runtime verification: [what was run, what was observed]
770
- - Visual verification: [screenshot path, DOM snapshot, or rendered output captured — or N/A with reason]
771
-
772
- ### Code Review Notes
773
- - [specific observations, line references]
774
-
775
- ### Concerns
776
- - [anything that needs attention]
777
-
778
- ### Recommended Follow-ups
779
- - [monitoring, tests to add, documentation to update]
780
- ```
781
-
782
- If the Reviewer says **NEEDS REWORK**, send the feedback back to the Solver agent for another iteration. The Solver-Reviewer loop continues until the verdict is APPROVED (max 3 iterations — after that, escalate to the user with full context of what's been tried).
783
-
784
- ---
785
-
786
- ### Phase 4: Present Results
787
-
788
- **CHECKPOINT: Before presenting, confirm these are true:**
789
- - [ ] A Reviewer agent was spawned (not just the Solver declaring victory)
790
- - [ ] The Reviewer's verdict includes actual evidence (output captures,
791
- screenshots, log snippets — not just "PASS")
792
- - [ ] If the bug was visual, visual evidence was captured
793
- - [ ] If the fix required a restart, the restart happened and post-restart
794
- behavior was verified
795
- - [ ] No "How to Verify" or "Restart X to see the change" instructions
796
- are included in the presentation
797
-
798
- If any of these are false, you are not ready to present. Go back to Phase 3.
799
-
800
- Once the Reviewer approves, present the full results to the user:
801
-
802
- ```
803
- ## Debug War Room Complete
804
-
805
- ### Root Cause
806
- [One paragraph explaining what was wrong — clear enough that someone
807
- unfamiliar with the code would understand]
808
-
809
- ### What Changed
810
- [List of files modified with brief descriptions]
811
-
812
- ### Verification Already Performed
813
- [These are things the Reviewer ALREADY RAN — not suggestions for the
814
- user to do. Include actual output/evidence.]
815
- - Reproduction test: PASS — [actual output snippet]
816
- - Full test suite: PASS — [X tests passed, 0 failures]
817
- - Build: PASS
818
- - Runtime verification: [command run, output captured, expected vs actual]
819
- - Visual verification (if applicable): [what was launched, screenshot/DOM
820
- evidence, what the user will see — this closes the gap between "tests
821
- pass" and "it actually works"]
822
- - Reviewer verdict: APPROVED
823
-
824
- ### Key Findings
825
- - [Top research findings that informed the fix]
826
- - [Instrumentation insights that revealed the bug]
827
- - [Hypotheses that were tested, including ones that were wrong — these
828
- help the user's understanding]
829
-
830
- ### Commits (in worktree: [branch name])
831
- [List of commits with messages]
832
-
833
- Ready to merge. All automated verification has passed.
834
- ```
835
-
836
- **Do NOT include a "How to Verify Yourself" section with manual steps.** If there is any verification that can be automated, the Reviewer must have already done it. The only reason to mention verification steps to the user is if something genuinely requires human judgment (visual design review, business logic confirmation) — and even then, explain what the agents already checked and what specifically needs a human eye.
837
-
838
- Wait for the user to validate. Once they confirm:
839
-
840
- 1. Merge the solver's worktree branch to main
841
- 2. Clean up all worktrees and branches
842
- 3. Remove any remaining debug instrumentation (unless the user wants to keep it)
843
-
844
- ---
845
-
846
- ## Agent Selection Guide
847
-
848
- Not every bug needs all agents. Here's when to scale down:
849
-
850
- | Bug Type | Skip These | Keep These |
851
- |----------|-----------|------------|
852
- | Pure logic error (wrong output) | Instrumenter | Researcher, Reproducer, Hypothesizer, Solver, Reviewer |
853
- | Race condition / timing | — (use all) | All — timing bugs are the hardest |
854
- | Known library bug (error message is googleable) | Hypothesizer | Researcher (primary), Solver, Reviewer |
855
- | UI rendering glitch | Researcher (maybe) | Instrumenter (critical), Reproducer, Hypothesizer, Solver, Reviewer (with visual verification!) |
856
- | Terminal/CLI visual output | Researcher (maybe) | Instrumenter, Reproducer, Hypothesizer, Solver, Reviewer (with visual verification!) |
857
- | Build / config issue | Reproducer | Researcher (check migration guides), Hypothesizer, Solver, Reviewer |
858
- | Intermittent / flaky | — (use all) | All — flaky bugs need every angle |
859
- | Performance regression | Researcher | Instrumenter (profiling), Reproducer (benchmark), Hypothesizer, Solver, Reviewer |
860
-
861
- When in doubt, use all of them. The cost of a redundant agent is some compute time. The cost of missing the right angle is another hour of debugging.
862
-
863
- ## Worktree Strategy
864
-
865
- Every agent that makes code changes gets its own worktree:
866
-
867
- ```
868
- .worktrees/
869
- debug-instrumentation/ (Instrumenter's logging)
870
- debug-reproduction/ (Reproducer's test cases)
871
- debug-fix/ (Solver's fix attempts)
872
- ```
873
-
874
- Branch naming: `debug/<problem-slug>/<agent-role>`
875
-
876
- Example: `debug/esm-crash/instrumentation`, `debug/esm-crash/fix`
877
-
878
- This means:
879
- - Every experiment is isolated and can be kept or discarded
880
- - The Solver can have multiple fix attempts on separate branches
881
- - The Reproducer's test stays clean from fix changes
882
- - You can diff any agent's work against main to see exactly what they did
883
- - **Commit after every meaningful change** — if a fix attempt fails, the commit history shows exactly what was tried
884
-
885
- Ensure `.worktrees/` is in `.gitignore`.
886
-
887
- After the fix is approved and merged, clean up all debug worktrees and branches.
888
-
889
- ## Escalation
890
-
891
- If after 3 Solver-Reviewer iterations the fix still isn't approved:
892
-
893
- 1. Present everything to the user: all hypotheses tested, all fix attempts, all review feedback
894
- 2. Ask the user for direction — they may have context that wasn't available to the agents
895
- 3. If the user provides new information, restart from Phase 1 with the new context
896
- 4. If the user wants to pair on it, switch to interactive debugging with all the instrumentation and research already done as context
897
-
898
- The war room is powerful but not omniscient. Sometimes the bug requires domain knowledge only the user has. The goal is to do 90% of the work so the user's intervention is a focused 10%.
899
-
900
- ## Blackboard Write
901
-
902
- After completing, update the blackboard:
903
-
904
- 1. Update `~/.claude/ftm-state/blackboard/context.json`:
905
- - Set current_task status to "complete"
906
- - Append decision summary to recent_decisions (cap at 10)
907
- - Update session_metadata.skills_invoked and last_updated
908
- 2. Write an experience file to `~/.claude/ftm-state/blackboard/experiences/YYYY-MM-DD_task-slug.json` capturing root cause, hypotheses tested, fix approach, and what to check first next time
909
- 3. Update `~/.claude/ftm-state/blackboard/experiences/index.json` with the new entry
910
- 4. Emit `task_completed` event
911
-
912
- ## Anti-Pattern: Asking the User to Do Agent Work
913
-
914
- This is the single most important rule of the war room: **never ask the user to perform a verification step that an agent could perform**.
915
-
916
- Examples of violations:
917
- - "Restart the application and check if the doom head appears" — an agent can launch the app, capture a screenshot, read the output, verify the rendering
918
- - "Run `tail -f /tmp/debug.log` and look for entries" — an agent can read that file
919
- - "Open a browser and check the UI" — an agent can use Playwright/Puppeteer to screenshot and inspect the DOM
920
- - "Try running this command and let me know what happens" — an agent can run the command
921
- - "All 103 tests pass!" without verifying the actual feature works — tests are a proxy, not proof. The agent must also verify runtime behavior matches expectations
922
-
923
- Examples of legitimate user asks:
924
- - "Does this visual design match what you wanted?" — subjective human judgment
925
- - "Is this the business logic you intended?" — domain knowledge only the user has
926
- - "Should we merge this to main?" — permission/authority decision
927
-
928
- When in doubt: if it can be executed by running a command, reading a file, or checking output, an agent does it. The user reviews the evidence the agent collected, not the raw behavior.
929
-
930
- ## Anti-Pattern: Collapsing Solver and Reviewer Into One
931
-
932
- A common failure mode: the session reads this skill, does good investigation work, writes a fix, then presents results directly to the user — skipping the Reviewer agent entirely. The Solver says "Restart X to see the change" and declares victory.
933
-
934
- This defeats the entire verification system. The Solver is biased toward their own fix. They wrote the code and believe it works. The Reviewer exists as an independent check.
935
-
936
- **The rule**: After the Solver commits their fix, you MUST spawn a separate Reviewer agent. The Reviewer reads FIX-SUMMARY.md, runs the verification gate, and either approves or sends it back. Only after the Reviewer approves do you present results to the user.
937
-
938
- If you find yourself writing "Root Cause / What Changed / How to Verify" without having spawned a Reviewer — stop. You're doing the anti-pattern. Spawn the Reviewer.
939
-
940
- ## Anti-Pattern: Structural Verification Masquerading as Live Verification
941
-
942
- Another common failure: the session verifies the fix by grepping the patched file for expected strings, checking that function references exist, or confirming config values are set. This is structural verification — it proves the code was written, not that it works.
943
-
944
- Example of structural verification pretending to be live:
945
- ```
946
- ✓ grep -c "doom_status patch start" cli.js → 1
947
- ✓ grep -c "doomStatuslineBackend" cli.js → 6
948
- ✓ node -e "require('cli.js')" → parses
949
- ```
950
-
951
- This proves the patch was applied and the file isn't syntactically broken. It does NOT prove the doom head renders visually. The grep checks are necessary but they are Phase 3 Step 3 (regression checks), not Phase 3 Step 4 (live verification).
952
-
953
- Live verification for this bug would be: launch Claude Code, wait for the statusline to render, capture a screenshot, confirm the doom head is visible. That's what the Reviewer must do for visual bugs.
954
-
955
- ## Requirements
956
-
957
- - config: `~/.claude/ftm-config.yml` | optional | model profiles for investigation agents
958
- - reference: `references/protocols/BLACKBOARD.md` | required | blackboard read/write protocol
959
- - reference: `references/protocols/EDGE-CASES.md` | required | anti-patterns and fallback handling
960
- - reference: `references/phases/PHASE-0-INTAKE.md` | required | intake steps and Explore agent prompt
961
- - reference: `references/phases/PHASE-1-TRIAGE.md` | required | agent selection guide and worktree strategy
962
- - reference: `references/phases/PHASE-2-WAR-ROOM-AGENTS.md` | required | all four agent prompts
963
- - reference: `references/phases/PHASE-3-TO-6-EXECUTION.md` | required | synthesis, solver, reviewer prompts
964
- - tool: `git` | required | worktree creation, diff inspection, commit history
965
- - reference: `~/.claude/ftm-state/blackboard/context.json` | optional | session state
966
- - reference: `~/.claude/ftm-state/blackboard/experiences/index.json` | optional | past bug fixes and known issues
967
- - reference: `~/.claude/ftm-state/blackboard/patterns.json` | optional | recurring failure patterns
968
-
969
- ## Risk
970
-
971
- - level: medium_write
972
- - scope: creates git worktrees for investigation and fix branches; modifies source files in Solver agent worktree; merges fix after Reviewer approval
973
- - rollback: git worktree remove + git branch -D for debug/* worktrees; all fix changes isolated until user confirms merge
974
-
975
- ## Approval Gates
976
-
977
- - trigger: investigation plan formulated in Phase 0 | action: present plan to user and proceed unless user objects
978
- - trigger: Solver produces fix | action: Reviewer agent must independently verify before presenting to user (hard gate — cannot skip)
979
- - trigger: Reviewer APPROVED | action: present root cause + changes + evidence to user, wait for user confirmation before merging
980
- - trigger: Solver NEEDS REWORK after 3 attempts | action: escalate to user with full context, wait for direction
981
- - complexity_routing: micro → auto | small → auto | medium → plan_first | large → plan_first | xl → always_ask
982
-
983
- ## Fallbacks
984
-
985
- - condition: Instrumenter agent fails or produces no useful output | action: skip instrumentation worktree, proceed with remaining agents
986
- - condition: Reproducer cannot create a minimal failing test | action: note as "reproduction failed", proceed with hypothesis-only approach
987
- - condition: Researcher finds no relevant issues or docs | action: proceed with instrumentation and hypothesis findings only
988
- - condition: fix still failing after 3 Solver iterations | action: escalate to user with all hypotheses tested and evidence gathered
989
- - condition: project has no test suite | action: Reviewer uses build check + diff review + live runtime verification instead of test runner
990
-
991
- ## Capabilities
992
-
993
- - cli: `git` | required | worktree isolation for investigation agents
994
- - mcp: `sequential-thinking` | optional | complex multi-hypothesis analysis
995
- - mcp: `playwright` | optional | visual bug verification in Reviewer phase
996
- - mcp: `WebSearch` | optional | Researcher agent for GitHub issues and Stack Overflow
997
- - mcp: `WebFetch` | optional | Researcher agent for docs and changelogs
998
-
999
- ## Event Payloads
1000
-
1001
- ### bug_fixed
1002
- - skill: string — "ftm-debug"
1003
- - root_cause: string — one-sentence root cause description
1004
- - fix_approach: string — description of the fix applied
1005
- - worktree: string — path to fix worktree
1006
- - iterations: number — number of solver-reviewer cycles needed
1007
- - duration_ms: number — total war room duration
1008
-
1009
- ### issue_found
1010
- - skill: string — "ftm-debug"
1011
- - phase: string — "phase1" | "phase2"
1012
- - agent: string — "instrumenter" | "researcher" | "reproducer" | "hypothesizer"
1013
- - finding: string — description of the specific issue found
1014
- - confidence: string — high | medium | low
1015
-
1016
- ### test_passed
1017
- - skill: string — "ftm-debug"
1018
- - scope: string — "reproduction" | "full_suite"
1019
- - worktree: string — worktree path where tests ran
1020
-
1021
- ### test_failed
1022
- - skill: string — "ftm-debug"
1023
- - scope: string — "reproduction" | "full_suite"
1024
- - worktree: string — worktree path
1025
- - error_summary: string — brief failure description
1026
-
1027
- ### error_encountered
1028
- - skill: string — "ftm-debug"
1029
- - phase: string — war room phase where error occurred
1030
- - agent: string | null — agent that encountered the error
1031
- - error: string — error description
1032
-
1033
- ### task_completed
1034
- - skill: string — "ftm-debug"
1035
- - outcome: string — "fixed" | "escalated" | "unresolved"
1036
- - root_cause: string — root cause if found
1037
- - duration_ms: number — total session duration
1
+ ---
2
+ name: ftm-debug
3
+ description: Deep multi-vector debugging war room that launches parallel agent teams to instrument, research, reproduce, hypothesize, solve, and verify tricky bugs. Use when a bug is stubborn, multi-turn debugging hasn't worked, the user says "debug this deeply", "war room this", "I can't figure out why", "this is driving me crazy", "launch the debug team", or any situation where standard debugging is insufficient. Also triggers on "/ftm-debug". Covers any codebase — frontend, backend, CLI tools, native apps, build systems, anything. Do NOT use for simple one-step fixes — this is the heavy artillery for problems that resist normal debugging.
4
+ ---
5
+
6
+ ## Events
7
+
8
+ ### Emits
9
+ - `bug_fixed` — when the Reviewer agent approves a fix and the bug is confirmed resolved
10
+ - `issue_found` — when investigation surfaces a specific problem (hypothesis confirmed, instrumentation reveals root cause)
11
+ - `test_passed` — when the reproduction test passes after a fix, or when the full suite passes post-fix
12
+ - `test_failed` — when the reproduction test fails, or when a fix attempt causes regressions
13
+ - `error_encountered` — when an unexpected error halts the war room workflow (agent failure, unrecoverable blocker)
14
+ - `task_completed` — when the debug session concludes with an approved and merged fix
15
+
16
+ ### Listens To
17
+ - `test_failed` — auto-investigate: launch Phase 0 intake and deploy the war room agent team
18
+ - `error_encountered` — diagnose the error: run codebase reconnaissance and begin targeted investigation
19
+
20
+ ## Blackboard Read
21
+
22
+ Before starting, load context from the blackboard:
23
+
24
+ 1. Read `~/.claude/ftm-state/blackboard/context.json` — check current_task, recent_decisions, active_constraints
25
+ 2. Read `~/.claude/ftm-state/blackboard/experiences/index.json` — filter entries by task_type="bug" and tags matching the current error domain
26
+ 3. Load top 3-5 matching experience files for known fixes and failed approaches
27
+ 4. Read `~/.claude/ftm-state/blackboard/patterns.json` — check recurring_issues for matching symptoms and codebase_insights for relevant file patterns
28
+
29
+ If index.json is empty or no matches found, proceed normally without experience-informed shortcuts.
30
+
31
+ # Debug War Room
32
+
33
+ Multi-vector deep debugging with parallel agent teams. When a bug resists normal debugging — you've tried the obvious, poked at it for multiple turns, and it's still not yielding — this skill escalates to a coordinated investigation across every angle simultaneously: instrumentation, research, reproduction, hypothesis, fix, and verification.
34
+
35
+ ## Why This Exists
36
+
37
+ Hard bugs are hard because they hide across multiple dimensions. The symptom is in one place, the cause is in another, and the fix requires understanding both plus the invisible interactions between them. Single-threaded debugging (try a thing, see if it works, try another thing) is too slow and too narrow. The war room attacks from every direction at once:
38
+
39
+ - **Instrumentation** catches what you can't see — timing, state transitions, render cycles, race conditions
40
+ - **Research** discovers that someone else hit this exact problem 18 months ago and documented the fix on a GitHub issue
41
+ - **Reproduction** isolates the bug from the noise so you can see it clearly
42
+ - **Hypothesis** maps the code paths and forms theories before touching anything
43
+ - **Solving** happens in isolated worktrees so every attempt is a clean experiment you can keep or discard
44
+ - **Review** catches the fix that fixes the bug but breaks three other things
45
+
46
+ The combination is what makes it powerful. Each vector informs the others — the researcher finds a pattern, the hypothesizer uses it, the solver implements against it, the reviewer validates it holds.
47
+
48
+ ## Core Principle: Automate Everything Before Involving the User
49
+
50
+ The entire point of the war room is that **agents do the work**. Every verification step, every test run, every log check, every "does it actually work?" confirmation must be performed by an agent before presenting results to the user. The user should receive a **verified, working result** — not a list of manual steps to try.
51
+
52
+ This means:
53
+ - If you can run a command to check if the fix works, **run it**. Don't tell the user to run it.
54
+ - If you can open a new terminal/process, read logs, check output, inspect state — **do it**.
55
+ - If you can write and execute a test script — **do it**.
56
+ - If the verification requires launching the application, reading its output, checking logs, inspecting files — **the Reviewer agent does all of this**.
57
+ - If the bug has a visual/rendering component, **the Reviewer must visually verify** using Playwright, screenshots, AppleScript, or process output capture. Tests passing is not enough — the Reviewer must confirm the user will actually see the correct result.
58
+ - The user's only job is to confirm the final result after all automated verification has passed. Even then, present what you verified so they can trust the result without re-running everything.
59
+
60
+ **Critical**: "All tests pass" is necessary but NOT sufficient. Tests verify code paths and logic. They do NOT verify that the feature actually works as experienced by a user. A function can return the right value in a test but never get called in the real app. A rendered component can pass snapshot tests but be invisible due to CSS. A config change can pass validation but never get loaded at runtime. The Reviewer must verify the actual runtime/visual result, not just test results. If 103 tests pass but the feature is still broken, the Reviewer failed.
61
+
62
+ If an agent produces a "How to Verify" section with manual steps, that's a failure of the process. Convert those steps into automated verification that the Reviewer executes.
63
+
64
+ ## The Process
65
+
66
+ ### Phase 0: Problem Intake
67
+
68
+ Before launching agents, understand what you're debugging. This happens in the main conversation thread — no agents yet.
69
+
70
+ #### Step 1: Gather the Problem Statement
71
+
72
+ If the user hasn't already described the bug in detail, ask targeted questions (one at a time, skip what you already know from conversation history):
73
+
74
+ 1. **What's happening?** — The symptom. What does the user see/experience?
75
+ 2. **What should be happening?** — The expected behavior.
76
+ 3. **What have you already tried?** — Critical context. Don't duplicate wasted work.
77
+ 4. **When did it start?** — A recent change? Always been broken? Intermittent?
78
+ 5. **Can you trigger it reliably?** — Reproduction steps if they exist.
79
+
80
+ #### Step 2: Codebase Reconnaissance
81
+
82
+ Spawn an **Explore agent** to scan the relevant area of the codebase:
83
+
84
+ ```
85
+ Analyze the codebase around the reported problem area:
86
+
87
+ 1. **Entry points**: What are the main files involved in this feature/behavior?
88
+ 2. **Call graph**: Trace the execution path from trigger to symptom
89
+ 3. **State flow**: What state (variables, stores, databases, caches) does this code touch?
90
+ 4. **Dependencies**: What external libs, APIs, or services are in the path?
91
+ 5. **Recent changes**: Check git log for recent modifications to relevant files
92
+ 6. **Test coverage**: Are there existing tests for this code path? Do they pass?
93
+ 7. **Configuration**: Environment variables, feature flags, build config that affect behavior
94
+ 8. **Error handling**: Where does error handling exist? Where is it missing?
95
+
96
+ Focus on the area described by the user. Map the territory before anyone tries to change it.
97
+ ```
98
+
99
+ Store the result as **codebase context**. Every subsequent agent receives this.
100
+
101
+ #### Step 3: Formulate the Investigation Plan
102
+
103
+ Based on the problem statement and codebase context, decide:
104
+
105
+ 1. **Which debug vectors are relevant?** Not every bug needs all four investigation agents. A pure logic bug doesn't need instrumentation. A well-documented API issue might not need research. Pick what helps.
106
+ 2. **What specific questions should each agent answer?** Generic "go investigate" prompts produce generic results. Targeted questions produce answers.
107
+ 3. **What's the most likely root cause category?** (Race condition? State corruption? API contract mismatch? Build/config issue? Logic error? Missing error handling?) This focuses the investigation.
108
+
109
+ Present the investigation plan to the user:
110
+
111
+ ```
112
+ Investigation Plan:
113
+ Problem: [one-line summary]
114
+ Likely category: [race condition / state bug / API mismatch / etc.]
115
+ Agents deploying:
116
+ - Instrumenter: [what they'll instrument and why]
117
+ - Researcher: [what they'll search for]
118
+ - Reproducer: [reproduction strategy]
119
+ - Hypothesizer: [which code paths they'll analyze]
120
+ Worktree strategy: [how many worktrees, branch naming]
121
+ ```
122
+
123
+ Then proceed immediately unless the user objects.
124
+
125
+ ---
126
+
127
+ ### Phase 1: Parallel Investigation (the war room)
128
+
129
+ Launch all investigation agents **simultaneously**. This is the core value — attacking from every angle at once.
130
+
131
+ #### Agent: Instrumenter
132
+
133
+ The Instrumenter adds comprehensive debug logging and observability to the problem area. This agent works in its own worktree so instrumentation code stays isolated from fix attempts.
134
+
135
+ ```
136
+ You are the Instrumenter in a debug war room. Your job is to add debug
137
+ logging and observability so the team can SEE what's happening at runtime.
138
+
139
+ Working directory: [worktree path]
140
+ Problem: [problem statement]
141
+ Codebase context: [from Phase 0]
142
+ Likely root cause category: [from investigation plan]
143
+
144
+ ## What to Instrument
145
+
146
+ Add logging that captures the invisible. Think about what data would let
147
+ you diagnose this bug if you could only read a log file:
148
+
149
+ ### State Snapshots
150
+ - Capture the full state at key decision points (before/after transforms,
151
+ at branch conditions, before API calls)
152
+ - Log both the input AND output of any function in the suspect path
153
+ - For UI bugs: capture render state, props, computed values
154
+ - For API bugs: capture request + response bodies + headers + timing
155
+ - For state management bugs: capture state before and after mutations
156
+
157
+ ### Timing & Sequencing
158
+ - Add timestamps to every log entry (use high-resolution: performance.now()
159
+ or process.hrtime() depending on environment)
160
+ - Log entry and exit of key functions to see execution order
161
+ - For async code: log when promises are created, resolved, rejected
162
+ - For event-driven code: log event emission and handler invocation
163
+
164
+ ### Environment & Configuration
165
+ - Log all relevant env vars, feature flags, config values at startup
166
+ - Log platform/runtime details (versions, OS, screen size for UI bugs)
167
+ - Capture the state of any caches, memoization, or lazy-loaded resources
168
+
169
+ ### Error Boundaries
170
+ - Wrap suspect code in try/catch (if not already) and log caught errors
171
+ with full stack traces
172
+ - Add error event listeners where appropriate
173
+ - Log warnings that might be swallowed silently
174
+
175
+ ## Output Format
176
+
177
+ 1. Make all changes in the worktree and commit them
178
+ 2. Write a file called `DEBUG-INSTRUMENTATION.md` documenting:
179
+ - Every log point added and what it captures
180
+ - How to enable/trigger the logging (env vars, flags, etc.)
181
+ - How to read the output (log file locations, format explanation)
182
+ - A suggested test script to exercise the instrumented code paths
183
+ 3. If the problem has a UI component, add visual debug indicators too
184
+ (border highlights, state dumps in dev tools, overlay panels)
185
+
186
+ ## Key Principle
187
+
188
+ Instrument generously. It's cheap to add logging and expensive to guess.
189
+ The cost of too much logging is scrolling; the cost of too little is
190
+ another round of debugging. When in doubt, log it.
191
+ ```
192
+
193
+ #### Agent: Researcher
194
+
195
+ The Researcher searches for existing solutions — someone else has probably hit this exact bug or something like it.
196
+
197
+ ```
198
+ You are the Researcher in a debug war room. Your job is to find out if
199
+ this problem has been solved before, what patterns others used, and what
200
+ pitfalls to avoid.
201
+
202
+ Problem: [problem statement]
203
+ Codebase context: [from Phase 0]
204
+ Tech stack: [languages, frameworks, key dependencies from Phase 0]
205
+ Likely root cause category: [from investigation plan]
206
+
207
+ ## Research Vectors (search all of these)
208
+
209
+ ### 1. GitHub Issues & Discussions
210
+ Search the GitHub repos of every dependency in the problem path:
211
+ - Search for keywords from the error message or symptom
212
+ - Search for the function/class names involved
213
+ - Check closed issues — the fix might already exist in a newer version
214
+ - Check open issues — this might be a known unfixed bug
215
+
216
+ ### 2. Stack Overflow & Forums
217
+ Search for:
218
+ - The exact error message (in quotes)
219
+ - The symptom described in plain language + framework name
220
+ - The specific API or function that's misbehaving
221
+
222
+ ### 3. Library Documentation
223
+ Use Context7 or official docs to check:
224
+ - Are we using the API correctly? Check current docs, not cached knowledge
225
+ - Are there known caveats, migration notes, or breaking changes?
226
+ - Is there a recommended pattern we're not following?
227
+
228
+ ### 4. Blog Posts & Technical Articles
229
+ Search for:
230
+ - "[framework] + [symptom]" — e.g., "React useEffect infinite loop"
231
+ - "[library] + [error category]" — e.g., "webpack ESM require crash"
232
+ - "[pattern] + debugging" — e.g., "WebSocket reconnection race condition"
233
+
234
+ ### 5. Release Notes & Changelogs
235
+ Check if a recent dependency update introduced the issue:
236
+ - Compare the installed version vs latest, check changelog between them
237
+ - Look for deprecation notices that match our usage pattern
238
+
239
+ ## Output Format
240
+
241
+ Write a file called `RESEARCH-FINDINGS.md` with:
242
+
243
+ For each relevant finding:
244
+ - **Source**: URL or reference
245
+ - **Relevance**: Why this applies to our problem (1-2 sentences)
246
+ - **Solution found**: What fix/workaround was used (if any)
247
+ - **Confidence**: How closely this matches our situation (high/medium/low)
248
+ - **Key insight**: The non-obvious thing we should know
249
+
250
+ End with a **Recommended approach** section that synthesizes the most
251
+ promising leads into an actionable suggestion.
252
+
253
+ ## Key Principle
254
+
255
+ Cast a wide net, then filter ruthlessly. The goal is not 50 vaguely
256
+ related links — it's 3-5 findings that directly inform the fix. Quality
257
+ of relevance over quantity of results.
258
+ ```
259
+
260
+ #### Agent: Reproducer
261
+
262
+ The Reproducer creates a minimal, reliable way to trigger the bug.
263
+
264
+ ```
265
+ You are the Reproducer in a debug war room. Your job is to create the
266
+ simplest possible reproduction of the bug — ideally an automated test
267
+ that fails, or a script that triggers the symptom reliably.
268
+
269
+ Working directory: [worktree path]
270
+ Problem: [problem statement]
271
+ Codebase context: [from Phase 0]
272
+ Reproduction steps from user: [if any]
273
+
274
+ ## Reproduction Strategy
275
+
276
+ ### 1. Verify the User's Steps
277
+ If the user provided reproduction steps, follow them exactly first.
278
+ Document whether the bug appears consistently or intermittently.
279
+
280
+ ### 2. Write a Failing Test
281
+ The gold standard is a test that:
282
+ - Fails now (reproduces the bug)
283
+ - Will pass when the bug is fixed
284
+ - Runs in the project's existing test framework
285
+
286
+ If the bug is in a function: write a unit test with the inputs that
287
+ trigger the failure.
288
+
289
+ If the bug is in a flow: write an integration test that exercises the
290
+ full path.
291
+
292
+ If the bug requires a running server/UI: write a script that automates
293
+ the trigger (curl commands, Playwright script, CLI invocation, etc.)
294
+
295
+ ### 3. Minimize
296
+ Strip away everything that isn't necessary to trigger the bug:
297
+ - Remove unrelated setup steps
298
+ - Use the simplest possible inputs
299
+ - Isolate the exact conditions (timing, data shape, config values)
300
+
301
+ ### 4. Characterize
302
+ Once you can reproduce it, characterize the boundaries:
303
+ - What inputs trigger it? What inputs don't?
304
+ - Is it timing-dependent? Data-dependent? Config-dependent?
305
+ - Does it happen on first run only, every run, or intermittently?
306
+ - What's the smallest change that makes it go away?
307
+
308
+ ## Output Format
309
+
310
+ 1. Commit all reproduction artifacts to the worktree
311
+ 2. Write a file called `REPRODUCTION.md` documenting:
312
+ - **Trigger command**: The single command to reproduce the bug
313
+ - **Expected vs actual**: What should happen vs what does happen
314
+ - **Consistency**: How reliably it reproduces (every time / 8 out of 10 / etc.)
315
+ - **Boundaries**: What makes it appear/disappear
316
+ - **Minimal test**: Path to the failing test file
317
+ - **Environment requirements**: Any special setup needed
318
+
319
+ ## Key Principle
320
+
321
+ A bug you can't reproduce is a bug you can't fix with confidence. And a
322
+ bug you can reproduce with a single command is a bug you can fix in
323
+ minutes. The reproduction IS the debugging.
324
+ ```
325
+
326
+ #### Agent: Hypothesizer
327
+
328
+ The Hypothesizer reads the code deeply and forms theories about root cause.
329
+
330
+ ```
331
+ You are the Hypothesizer in a debug war room. Your job is to deeply read
332
+ the code involved in the bug, trace every execution path, and form
333
+ ranked hypotheses about what's causing the problem.
334
+
335
+ Problem: [problem statement]
336
+ Codebase context: [from Phase 0]
337
+ Likely root cause category: [from investigation plan]
338
+
339
+ ## Analysis Method
340
+
341
+ ### 1. Trace the Execution Path
342
+ Starting from the user's trigger action, trace through every function
343
+ call, state mutation, and branch condition until you reach the symptom.
344
+ Document the full chain.
345
+
346
+ ### 2. Identify Suspect Points
347
+ At each step in the chain, evaluate:
348
+ - Could this function receive unexpected input?
349
+ - Could this state be in an unexpected shape?
350
+ - Could this condition evaluate differently than intended?
351
+ - Is there a timing assumption (X happens before Y)?
352
+ - Is there an implicit dependency (this works because that was set up earlier)?
353
+ - Is error handling missing or swallowing relevant errors?
354
+
355
+ ### 3. Form Hypotheses
356
+ For each suspect point, write a hypothesis:
357
+ - **What**: "The bug occurs because X"
358
+ - **Why**: "Because when [condition], the code at [file:line] does [thing]
359
+ instead of [expected thing]"
360
+ - **Evidence for**: What supports this theory
361
+ - **Evidence against**: What contradicts this theory
362
+ - **How to verify**: What specific test or log would prove/disprove this
363
+
364
+ ### 4. Rank by Likelihood
365
+ Order hypotheses from most to least likely based on:
366
+ - How much evidence supports each one
367
+ - How well it explains ALL symptoms (not just some)
368
+ - Whether it aligns with the root cause category
369
+ - Occam's razor — simpler explanations first
370
+
371
+ ## Output Format
372
+
373
+ Write a file called `HYPOTHESES.md` with:
374
+
375
+ ### Hypothesis 1 (most likely): [title]
376
+ - **Claim**: [one sentence]
377
+ - **Mechanism**: [detailed explanation of how the bug occurs]
378
+ - **Code path**: [file:line] -> [file:line] -> [file:line]
379
+ - **Evidence for**: [what supports this]
380
+ - **Evidence against**: [what contradicts this]
381
+ - **Verification**: [how to prove/disprove]
382
+ - **Suggested fix**: [high-level approach]
383
+
384
+ [repeat for each hypothesis, ranked]
385
+
386
+ ### Summary
387
+ - Top 3 hypotheses with confidence levels
388
+ - Recommended investigation order
389
+ - What additional data would help distinguish between hypotheses
390
+
391
+ ## Key Principle
392
+
393
+ Don't jump to conclusions. The first plausible explanation is often
394
+ wrong — it's the one you already thought of that didn't pan out. Trace
395
+ the actual code, don't assume. Read every line in the path. The bug is
396
+ in the code, and the code is right there to be read.
397
+ ```
398
+
399
+ ---
400
+
401
+ ### Phase 2: Synthesis & Solve
402
+
403
+ After all Phase 1 agents complete, synthesize their findings before solving.
404
+
405
+ #### Step 1: Cross-Reference Findings
406
+
407
+ Read all four reports and synthesize:
408
+
409
+ 1. **Do the hypotheses match the research?** If the Researcher found a known bug that matches one of the Hypothesizer's hypotheses, that's high signal.
410
+ 2. **Does the reproduction confirm a hypothesis?** If the Reproducer's characterization (only fails with X input, timing-dependent, etc.) matches a hypothesis's prediction, that's strong evidence.
411
+ 3. **What does the instrumentation suggest?** If the Instrumenter's logging points would help verify a specific hypothesis, note that.
412
+ 4. **Are there contradictions?** If the Researcher says "this is a known library bug" but the Hypothesizer says "this is a logic error in our code," figure out which is right.
413
+
414
+ Present the synthesis to the user briefly:
415
+
416
+ ```
417
+ War Room Findings:
418
+ Researcher: [key finding]
419
+ Reproducer: [reproduction status + characterization]
420
+ Hypothesizer: [top hypothesis]
421
+ Instrumenter: [logging added, key observation points]
422
+
423
+ Cross-reference: [how findings align or conflict]
424
+ Recommended fix approach: [what to try first]
425
+
426
+ Proceeding to solve in isolated worktree.
427
+ ```
428
+
429
+ #### Step 2: Solve (in worktrees)
430
+
431
+ Launch the **Solver agent** in a fresh worktree. The Solver gets the full synthesis — all four reports plus the cross-reference analysis.
432
+
433
+ ```
434
+ You are the Solver in a debug war room. The investigation team has
435
+ completed their analysis and you now have comprehensive context. Your
436
+ job is to implement the fix.
437
+
438
+ Working directory: [worktree path]
439
+ Problem: [problem statement]
440
+ Codebase context: [from Phase 0]
441
+
442
+ ## Investigation Results
443
+
444
+ [paste full synthesis: Research findings, Reproduction results,
445
+ Hypotheses ranked, Instrumentation notes, Cross-reference analysis]
446
+
447
+ ## Execution Rules
448
+
449
+ ### Work Incrementally
450
+ - Start with the highest-ranked hypothesis
451
+ - Implement the minimal fix that addresses it
452
+ - COMMIT after each discrete change (not one big commit at the end)
453
+ - Use clear commit messages: "Fix: [what] — addresses hypothesis [N]"
454
+
455
+ ### Verify as You Go
456
+ - After each fix attempt, run the reproduction test from REPRODUCTION.md
457
+ - If the project has existing tests, run them too (zero broken windows)
458
+ - If the fix works on the reproduction but breaks other tests, that's
459
+ not done — fix the regressions too
460
+
461
+ ### If the First Hypothesis Doesn't Pan Out
462
+ - Don't keep hacking at it. Move to hypothesis #2.
463
+ - Revert the failed attempt (git revert or fresh branch) so each
464
+ attempt starts clean
465
+ - If you exhaust all hypotheses, say so — don't invent new ones
466
+ without evidence
467
+
468
+ ### Clean Up After Yourself
469
+ - Remove any debug logging you added (unless the user wants to keep it)
470
+ - Make sure the fix is minimal — don't refactor surrounding code
471
+ - Don't add "just in case" error handling beyond what the fix requires
472
+
473
+ ### Do NOT Declare Victory
474
+ - You are the Solver, not the Reviewer. Your job ends at "fix committed."
475
+ - Do NOT tell the user "restart X to see the change" — that's the
476
+ Reviewer's job (and the Reviewer must do it, not the user)
477
+ - Do NOT present results directly to the user — hand off to the
478
+ Reviewer agent via FIX-SUMMARY.md
479
+ - Do NOT say the fix works unless you have actually verified it
480
+ by running it. "The code looks correct" is not verification.
481
+
482
+ ## Output Format
483
+
484
+ 1. All changes committed in the worktree with descriptive messages
485
+ 2. Write a file called `FIX-SUMMARY.md` documenting:
486
+ - **Root cause**: What was actually wrong (one paragraph)
487
+ - **Fix applied**: What you changed and why
488
+ - **Files modified**: List with brief descriptions
489
+ - **Commits**: List of commit hashes with messages
490
+ - **Verification**: What tests you ran and their results
491
+ - **Requires restart**: YES/NO — does the fix require restarting
492
+ a process, reloading config, or rebuilding to take effect?
493
+ - **Visual component**: YES/NO — does this bug have a visual or
494
+ experiential symptom that needs visual verification?
495
+ - **Remaining concerns**: Anything that should be monitored or
496
+ might need follow-up
497
+ ```
498
+
499
+ ---
500
+
501
+ ### Phase 3: Review & Verify
502
+
503
+ **HARD GATE — You cannot proceed to Phase 4 without completing this phase.**
504
+
505
+ This is non-negotiable. You cannot present results to the user until a
506
+ Reviewer has independently verified the fix. "I checked with grep" is not
507
+ verification. "The tests pass" is not verification. "The patch was applied"
508
+ is not verification.
509
+
510
+ Verification means: **the actual behavior the user reported as broken now
511
+ works correctly, as observed by an agent, with captured evidence.**
512
+
513
+ #### Step 1: Determine verification method BEFORE launching the Reviewer
514
+
515
+ Look at the original bug report. Ask: "How would a human know this is fixed?"
516
+
517
+ - If the answer involves SEEING something (UI, terminal output, rendered
518
+ image, visual layout) → the Reviewer MUST capture a screenshot or
519
+ visual evidence. Use `screencapture`, Playwright `browser_take_screenshot`,
520
+ or process output capture.
521
+ - If the answer involves a BEHAVIOR (API returns correct data, CLI produces
522
+ right output, server responds correctly) → the Reviewer MUST exercise
523
+ that behavior and capture the output.
524
+ - If the answer is "the error stops happening" → the Reviewer MUST trigger
525
+ the scenario that caused the error and confirm it no longer occurs.
526
+
527
+ The verification method goes into the Reviewer's prompt. Don't let the
528
+ Reviewer decide — tell it exactly what to verify and how.
529
+
530
+ #### Step 2: If the fix requires a restart, the Reviewer handles it
531
+
532
+ Many fixes (bundle patches, config changes, build artifacts) require
533
+ restarting a process to take effect. The Reviewer must:
534
+
535
+ 1. Restart the process (use `osascript` to launch in a new terminal if
536
+ needed, or kill and restart the background process)
537
+ 2. Wait for it to initialize
538
+ 3. Exercise the fixed behavior
539
+ 4. Capture evidence (screenshot, output, logs)
540
+
541
+ If the Reviewer literally cannot restart because it's running inside the
542
+ process being fixed (e.g., debugging Claude Code from within Claude Code),
543
+ try these alternatives first:
544
+
545
+ 1. **Launch a SEPARATE instance** via osascript/terminal:
546
+ ```bash
547
+ osascript -e 'tell application "Terminal" to do script "cd /path && claude --print \"hello\""'
548
+ sleep 5
549
+ screencapture -x /tmp/verification.png
550
+ ```
551
+ Then READ the screenshot to verify.
552
+
553
+ 2. **Launch via background process** and capture output:
554
+ ```bash
555
+ nohup claude --print "test" > /tmp/claude-output.txt 2>&1 &
556
+ sleep 5
557
+ cat /tmp/claude-output.txt
558
+ ```
559
+
560
+ 3. **Use Playwright MCP** if available to screenshot a running instance.
561
+
562
+ Only if ALL of these are impossible should you flag the verification as BLOCKED. In that
563
+ case, tell the user exactly what to look for, why you couldn't verify it
564
+ yourself, and what the expected visual result should be (with specifics,
565
+ not "check if it works").
566
+
567
+ #### Step 3: Launch the Reviewer agent
568
+
569
+ After the Solver completes, launch the **Reviewer agent** to validate the fix independently.
570
+
571
+ ```
572
+ You are the Reviewer in a debug war room. The Solver has implemented a
573
+ fix and your job is to verify it actually works, doesn't break anything
574
+ else, and is the right approach.
575
+
576
+ Working directory: [solver's worktree path]
577
+ Problem: [original problem statement]
578
+ Fix summary: [from FIX-SUMMARY.md]
579
+ Reproduction: [from REPRODUCTION.md]
580
+
581
+ ## Review Checklist
582
+
583
+ ### 1. Does the Fix Address the Root Cause?
584
+ - Read the fix diff carefully
585
+ - Does it fix the actual root cause, or just mask the symptom?
586
+ - Could the same bug recur in a different form?
587
+ - Is the fix in the right layer of abstraction?
588
+
589
+ ### 2. Reproduction Verification (YOU MUST RUN THESE — do not list them for the user)
590
+ - EXECUTE the reproduction test — it should PASS now
591
+ - Run it multiple times if the bug was intermittent
592
+ - Try variations of the reproduction (different inputs, timing, config)
593
+ - Capture the actual output/logs as evidence
594
+
595
+ ### 3. Regression Check (YOU MUST RUN THESE)
596
+ - EXECUTE the full test suite and capture results
597
+ - EXECUTE linting and type checking
598
+ - EXECUTE any build steps and verify success
599
+ - If the fix involves a running process (server, CLI tool, UI):
600
+ launch it, exercise the fixed behavior, check logs, and capture
601
+ evidence that it works
602
+
603
+ ### 4. Live Verification (critical — tests passing is NECESSARY but NOT SUFFICIENT)
604
+
605
+ Tests verify code structure. Live verification proves the feature actually
606
+ works as experienced by a user. Many bugs exist in the gap between "all
607
+ tests pass" and "it actually works." Your job is to close that gap.
608
+
609
+ **Why this matters**: A test can assert that a function returns the right
610
+ value, but that doesn't prove the function gets called, its output reaches
611
+ the renderer, the renderer handles it correctly, and the user sees the
612
+ expected result. Each layer can silently fail while tests pass.
613
+
614
+ #### Automated Runtime Verification (always do these)
615
+ - If the fix involves a server/process: START it, EXERCISE the fixed
616
+ behavior via curl/CLI/API calls, READ stdout/stderr, CAPTURE evidence
617
+ - If the fix involves CLI output: RUN the command, CAPTURE the output,
618
+ COMPARE against expected output
619
+ - If the fix involves log output: RUN the code, READ the log file,
620
+ CONFIRM expected entries appear
621
+ - If the fix involves a build: RUN the build, VERIFY the output artifact
622
+ exists and contains expected content (grep/inspect the built files)
623
+ - If the fix involves configuration: LOAD the config, VERIFY the values
624
+ propagate to where they're used at runtime (not just that the config
625
+ file is correct)
626
+
627
+ #### Visual/Runtime Verification (when the bug has a visual or interactive component)
628
+
629
+ Some bugs only manifest visually — terminal rendering, UI display, image
630
+ output, interactive behavior. Tests can't catch these. You must verify
631
+ the actual rendered result.
632
+
633
+ **Techniques for visual verification:**
634
+
635
+ 1. **Playwright/browser automation**: For web UIs, launch Playwright,
636
+ navigate to the page, take a screenshot, and inspect the DOM. Check
637
+ that elements are visible, correctly positioned, and contain expected
638
+ content. This catches CSS bugs, rendering issues, and layout breaks
639
+ that pass all unit tests.
640
+
641
+ 2. **AppleScript + screenshot** (macOS): For native apps, CLI tools with
642
+ visual output, or terminal-rendered content:
643
+ ```
644
+ # Launch the application via AppleScript
645
+ osascript -e 'tell application "Terminal" to do script "your-command"'
646
+ # Wait for it to render, then capture
647
+ screencapture -x /tmp/verification-screenshot.png
648
+ ```
649
+ Then read the screenshot to verify the visual result.
650
+
651
+ 3. **Process output capture**: For CLI tools and terminal UIs, run the
652
+ command with output capture (script command, tee, or redirect) and
653
+ inspect the raw output including ANSI codes, escape sequences, and
654
+ control characters that affect rendering.
655
+
656
+ 4. **Playwright for Electron/web-based tools**: Many modern tools
657
+ (VS Code extensions, Electron apps, web dashboards) can be automated
658
+ with Playwright. Use `browser_navigate`, `browser_snapshot`, and
659
+ `browser_take_screenshot` to verify rendered state.
660
+
661
+ 5. **ftm-browse ($PB) for UI verification**: If ftm-browse is
662
+ installed, use it for visual verification of web UI bugs. First set the
662
+ path below and check whether the binary exists there (e.g. `[ -x "$PB" ]`):
664
+ ```bash
665
+ PB="$HOME/.claude/skills/ftm-browse/bin/ftm-browse"
666
+ ```
667
+ If the binary exists at that path, use it:
668
+ - **Navigate**: `$PB goto <url>` — open the affected page
669
+ - **Before screenshot**: `$PB screenshot --path /tmp/debug-before.png`
670
+ (capture the pre-fix state if you need a before/after comparison —
670
+ take it before the fix is applied, or from a worktree that does not
671
+ contain the fix)
673
+ - **After screenshot**: `$PB screenshot --path /tmp/debug-after.png`
674
+ (capture state AFTER fix is applied and running)
675
+ - **DOM inspection**: `$PB snapshot -i` — get the interactive ARIA
676
+ tree to verify element existence, visibility, and state
677
+ (e.g., confirm a button is now visible, a panel is collapsed,
678
+ an error message is gone)
679
+ - Report both screenshot paths in REVIEW-VERDICT.md so the user
680
+ can compare before/after visually.
681
+
682
+ **Graceful fallback**: If the binary does NOT exist at
683
+ `$HOME/.claude/skills/ftm-browse/bin/ftm-browse`, fall back to
684
+ the other available verification methods (tests, Playwright, etc.).
685
+ Do NOT fail the review. Record in the Verification Gate section:
686
+ "Visual verification skipped — ftm-browse not installed."
687
+
688
+ **When to use visual verification:**
689
+ - Terminal rendering (status lines, TUI elements, colored output, unicode)
690
+ - Web UI changes (layout, styling, visibility, interaction)
691
+ - Image/PDF/document generation (verify output visually, not just file size)
692
+ - Any bug where "it looks wrong" was part of the symptom
693
+ - Any fix where tests pass but you're not 100% confident the user will
694
+ see the correct result
695
+
696
+ **The rule**: If the bug was reported as something the user SAW (or didn't
697
+ see), verification must confirm what the user will SEE (or will now see).
698
+ Passing tests are evidence, not proof. Visual confirmation is proof.
699
+
700
+ #### Never Do This
701
+ - NEVER write "How to verify: run X" — instead, RUN X yourself and
702
+ report what happened
703
+ - NEVER say "restart the app to see the change" — restart it yourself,
704
+ observe the result, report back
705
+ - NEVER assume tests passing = feature working. Tests verify code paths.
706
+ Live verification proves the feature delivers its intended experience.
707
+
708
+ ### 5. Code Quality
709
+ - Is the fix minimal and focused?
710
+ - Does it follow the project's existing patterns?
711
+ - Are there edge cases the fix doesn't handle?
712
+ - Is error handling appropriate (not excessive, not missing)?
713
+
714
+ ### 6. Observability
715
+ - Will this failure mode be visible if it happens again?
716
+ - Should any permanent logging or monitoring be added?
717
+ - Are there metrics or alerts that should be updated?
718
+
719
+ ## Mandatory Verification Gate
720
+
721
+ Before writing the verdict, answer these two questions:
722
+
723
+ **Q1: Was the bug reported as something visual/experiential?**
724
+ (Did the user say "it doesn't show up", "it looks wrong", "the UI is broken",
725
+ "nothing happens when I click", "the output is garbled", etc.)
726
+
727
+ If YES → Visual verification is REQUIRED. You cannot approve without
728
+ capturing a screenshot, reading rendered output, or observing the
729
+ running application. Grep checks and log analysis are not sufficient.
730
+
731
+ If NO → Automated runtime verification (exercising the fixed behavior and
731
+ capturing its output, in addition to running the tests) is sufficient.
733
+
734
+ **Q2: Does the fix require restarting a process to take effect?**
735
+ (Patching a bundle, changing config loaded at startup, modifying
736
+ compiled artifacts, etc.)
737
+
738
+ If YES → YOU must restart the process, observe the result, and capture
739
+ evidence. Do not tell the user to restart — do it yourself:
740
+ ```
741
+ # Example: restart a CLI tool and capture its output
742
+ osascript -e 'tell application "Terminal" to do script "cd /path && your-command"'
743
+ sleep 3
744
+ screencapture -x /tmp/verification-screenshot.png
745
+ # Then READ the screenshot to verify
746
+ ```
747
+
748
+ If you cannot restart the process (e.g., it's the very tool you're
749
+ running inside), first try launching a separate instance as in the example
749
+ above. Only if that is also impossible may you ask the user — and you MUST
750
+ say what specific thing to look for and why you couldn't verify it yourself.
752
+
753
+ ## Output Format
754
+
755
+ Write a file called `REVIEW-VERDICT.md` with:
756
+
757
+ ### Verdict: [APPROVED / APPROVED WITH CHANGES / NEEDS REWORK]
758
+
759
+ ### Verification Gate
760
+ - Bug is visual/experiential: [YES/NO]
761
+ - Fix requires process restart: [YES/NO]
762
+ - Visual verification performed: [YES — describe what was captured / NO — explain why not required / BLOCKED — explain why agent couldn't do it]
763
+
764
+ ### Fix Verification
765
+ - Reproduction test: [PASS/FAIL — actual output]
766
+ - Full test suite: [PASS/FAIL with details]
767
+ - Build: [PASS/FAIL]
768
+ - Lint/typecheck: [PASS/FAIL]
769
+ - Runtime verification: [what was run, what was observed]
770
+ - Visual verification: [screenshot path, DOM snapshot, or rendered output captured — or N/A with reason]
771
+
772
+ ### Code Review Notes
773
+ - [specific observations, line references]
774
+
775
+ ### Concerns
776
+ - [anything that needs attention]
777
+
778
+ ### Recommended Follow-ups
779
+ - [monitoring, tests to add, documentation to update]
780
+ ```
781
+
782
+ If the Reviewer says **NEEDS REWORK**, send the feedback back to the Solver agent for another iteration. The Solver-Reviewer loop continues until the verdict is APPROVED (max 3 iterations — after that, escalate to the user with full context of what's been tried).
783
+
784
+ ---
785
+
786
+ ### Phase 4: Present Results
787
+
788
+ **CHECKPOINT: Before presenting, confirm these are true:**
789
+ - [ ] A Reviewer agent was spawned (not just the Solver declaring victory)
790
+ - [ ] The Reviewer's verdict includes actual evidence (output captures,
791
+ screenshots, log snippets — not just "PASS")
792
+ - [ ] If the bug was visual, visual evidence was captured
793
+ - [ ] If the fix required a restart, the restart happened and post-restart
794
+ behavior was verified
795
+ - [ ] No "How to Verify" or "Restart X to see the change" instructions
796
+ are included in the presentation
797
+
798
+ If any of these are false, you are not ready to present. Go back to Phase 3.
799
+
800
+ Once the Reviewer approves, present the full results to the user:
801
+
802
+ ```
803
+ ## Debug War Room Complete
804
+
805
+ ### Root Cause
806
+ [One paragraph explaining what was wrong — clear enough that someone
807
+ unfamiliar with the code would understand]
808
+
809
+ ### What Changed
810
+ [List of files modified with brief descriptions]
811
+
812
+ ### Verification Already Performed
813
+ [These are things the Reviewer ALREADY RAN — not suggestions for the
814
+ user to do. Include actual output/evidence.]
815
+ - Reproduction test: PASS — [actual output snippet]
816
+ - Full test suite: PASS — [X tests passed, 0 failures]
817
+ - Build: PASS
818
+ - Runtime verification: [command run, output captured, expected vs actual]
819
+ - Visual verification (if applicable): [what was launched, screenshot/DOM
820
+ evidence, what the user will see — this closes the gap between "tests
821
+ pass" and "it actually works"]
822
+ - Reviewer verdict: APPROVED
823
+
824
+ ### Key Findings
825
+ - [Top research findings that informed the fix]
826
+ - [Instrumentation insights that revealed the bug]
827
+ - [Hypotheses that were tested, including ones that were wrong — these
828
+ help the user's understanding]
829
+
830
+ ### Commits (in worktree: [branch name])
831
+ [List of commits with messages]
832
+
833
+ Ready to merge. All automated verification has passed.
834
+ ```
835
+
836
+ **Do NOT include a "How to Verify Yourself" section with manual steps.** If there is any verification that can be automated, the Reviewer must have already done it. The only reason to mention verification steps to the user is if something genuinely requires human judgment (visual design review, business logic confirmation) — and even then, explain what the agents already checked and what specifically needs a human eye.
837
+
838
+ Wait for the user to validate. Once they confirm:
839
+
840
+ 1. Merge the solver's worktree branch to main
841
+ 2. Clean up all worktrees and branches
842
+ 3. Remove any remaining debug instrumentation (unless the user wants to keep it)
843
+
844
+ ---
845
+
846
+ ## Agent Selection Guide
847
+
848
+ Not every bug needs all agents. Here's when to scale down:
849
+
850
+ | Bug Type | Skip These | Keep These |
851
+ |----------|-----------|------------|
852
+ | Pure logic error (wrong output) | Instrumenter | Researcher, Reproducer, Hypothesizer, Solver, Reviewer |
853
+ | Race condition / timing | — (use all) | All — timing bugs are the hardest |
854
+ | Known library bug (error message is googleable) | Hypothesizer | Researcher (primary), Solver, Reviewer |
855
+ | UI rendering glitch | Researcher (maybe) | Instrumenter (critical), Reproducer, Hypothesizer, Solver, Reviewer (with visual verification!) |
856
+ | Terminal/CLI visual output | Researcher (maybe) | Instrumenter, Reproducer, Hypothesizer, Solver, Reviewer (with visual verification!) |
857
+ | Build / config issue | Reproducer | Researcher (check migration guides), Hypothesizer, Solver, Reviewer |
858
+ | Intermittent / flaky | — (use all) | All — flaky bugs need every angle |
859
+ | Performance regression | Researcher | Instrumenter (profiling), Reproducer (benchmark), Hypothesizer, Solver, Reviewer |
860
+
861
+ When in doubt, use all of them. The cost of a redundant agent is some compute time. The cost of missing the right angle is another hour of debugging.
862
+
863
+ ## Worktree Strategy
864
+
865
+ Every agent that makes code changes gets its own worktree:
866
+
867
+ ```
868
+ .worktrees/
869
+ debug-instrumentation/ (Instrumenter's logging)
870
+ debug-reproduction/ (Reproducer's test cases)
871
+ debug-fix/ (Solver's fix attempts)
872
+ ```
873
+
874
+ Branch naming: `debug/<problem-slug>/<agent-role>`
875
+
876
+ Example: `debug/esm-crash/instrumentation`, `debug/esm-crash/fix`
877
+
878
+ This means:
879
+ - Every experiment is isolated and can be kept or discarded
880
+ - The Solver can have multiple fix attempts on separate branches
881
+ - The Reproducer's test stays clean from fix changes
882
+ - You can diff any agent's work against main to see exactly what they did
883
+ - **Commit after every meaningful change** — if a fix attempt fails, the commit history shows exactly what was tried
884
+
885
+ Ensure `.worktrees/` is in `.gitignore`.
886
+
887
+ After the fix is approved and merged, clean up all debug worktrees and branches.
888
+
889
+ ## Escalation
890
+
891
+ If after 3 Solver-Reviewer iterations the fix still isn't approved:
892
+
893
+ 1. Present everything to the user: all hypotheses tested, all fix attempts, all review feedback
894
+ 2. Ask the user for direction — they may have context that wasn't available to the agents
895
+ 3. If the user provides new information, restart from Phase 1 with the new context
896
+ 4. If the user wants to pair on it, switch to interactive debugging with all the instrumentation and research already done as context
897
+
898
+ The war room is powerful but not omniscient. Sometimes the bug requires domain knowledge only the user has. The goal is to do 90% of the work so the user's intervention is a focused 10%.
899
+
900
+ ## Blackboard Write
901
+
902
+ After completing, update the blackboard:
903
+
904
+ 1. Update `~/.claude/ftm-state/blackboard/context.json`:
905
+ - Set current_task status to "complete"
906
+ - Append decision summary to recent_decisions (cap at 10)
907
+ - Update session_metadata.skills_invoked and last_updated
908
+ 2. Write an experience file to `~/.claude/ftm-state/blackboard/experiences/YYYY-MM-DD_task-slug.json` capturing root cause, hypotheses tested, fix approach, and what to check first next time
909
+ 3. Update `~/.claude/ftm-state/blackboard/experiences/index.json` with the new entry
910
+ 4. Emit `task_completed` event
911
+
912
+ ## Anti-Pattern: Asking the User to Do Agent Work
913
+
914
+ This is the single most important rule of the war room: **never ask the user to perform a verification step that an agent could perform**.
915
+
916
+ Examples of violations:
917
+ - "Restart the application and check if the doom head appears" — an agent can launch the app, capture a screenshot, read the output, verify the rendering
918
+ - "Run `tail -f /tmp/debug.log` and look for entries" — an agent can read that file
919
+ - "Open a browser and check the UI" — an agent can use Playwright/Puppeteer to screenshot and inspect the DOM
920
+ - "Try running this command and let me know what happens" — an agent can run the command
921
+ - "All 103 tests pass!" without verifying the actual feature works — tests are a proxy, not proof. The agent must also verify runtime behavior matches expectations
922
+
923
+ Examples of legitimate user asks:
924
+ - "Does this visual design match what you wanted?" — subjective human judgment
925
+ - "Is this the business logic you intended?" — domain knowledge only the user has
926
+ - "Should we merge this to main?" — permission/authority decision
927
+
928
+ When in doubt: if it can be executed by running a command, reading a file, or checking output, an agent does it. The user reviews the evidence the agent collected, not the raw behavior.
929
+
930
+ ## Anti-Pattern: Collapsing Solver and Reviewer Into One
931
+
932
+ A common failure mode: the session reads this skill, does good investigation work, writes a fix, then presents results directly to the user — skipping the Reviewer agent entirely. The Solver says "Restart X to see the change" and declares victory.
933
+
934
+ This defeats the entire verification system. The Solver is biased toward their own fix. They wrote the code and believe it works. The Reviewer exists as an independent check.
935
+
936
+ **The rule**: After the Solver commits their fix, you MUST spawn a separate Reviewer agent. The Reviewer reads FIX-SUMMARY.md, runs the verification gate, and either approves or sends it back. Only after the Reviewer approves do you present results to the user.
937
+
938
+ If you find yourself writing "Root Cause / What Changed / How to Verify" without having spawned a Reviewer — stop. You're doing the anti-pattern. Spawn the Reviewer.
939
+
940
+ ## Anti-Pattern: Structural Verification Masquerading as Live Verification
941
+
942
+ Another common failure: the session verifies the fix by grepping the patched file for expected strings, checking that function references exist, or confirming config values are set. This is structural verification — it proves the code was written, not that it works.
943
+
944
+ Example of structural verification pretending to be live:
945
+ ```
946
+ ✓ grep -c "doom_status patch start" cli.js → 1
947
+ ✓ grep -c "doomStatuslineBackend" cli.js → 6
948
+ ✓ node -e "require('cli.js')" → parses
949
+ ```
950
+
951
+ This proves the patch was applied and the file isn't syntactically broken. It does NOT prove the doom head renders visually. The grep checks are necessary, but they belong to Review Checklist item 3 (Regression Check), not item 4 (Live Verification).
952
+
953
+ Live verification for this bug would be: launch Claude Code, wait for the statusline to render, capture a screenshot, confirm the doom head is visible. That's what the Reviewer must do for visual bugs.
954
+
955
+ ## Requirements
956
+
957
+ - config: `~/.claude/ftm-config.yml` | optional | model profiles for investigation agents
958
+ - reference: `references/protocols/BLACKBOARD.md` | required | blackboard read/write protocol
959
+ - reference: `references/protocols/EDGE-CASES.md` | required | anti-patterns and fallback handling
960
+ - reference: `references/phases/PHASE-0-INTAKE.md` | required | intake steps and Explore agent prompt
961
+ - reference: `references/phases/PHASE-1-TRIAGE.md` | required | agent selection guide and worktree strategy
962
+ - reference: `references/phases/PHASE-2-WAR-ROOM-AGENTS.md` | required | all four agent prompts
963
+ - reference: `references/phases/PHASE-3-TO-6-EXECUTION.md` | required | synthesis, solver, reviewer prompts
964
+ - tool: `git` | required | worktree creation, diff inspection, commit history
965
+ - reference: `~/.claude/ftm-state/blackboard/context.json` | optional | session state
966
+ - reference: `~/.claude/ftm-state/blackboard/experiences/index.json` | optional | past bug fixes and known issues
967
+ - reference: `~/.claude/ftm-state/blackboard/patterns.json` | optional | recurring failure patterns
968
+
969
+ ## Risk
970
+
971
+ - level: medium_write
972
+ - scope: creates git worktrees for investigation and fix branches; modifies source files in Solver agent worktree; merges fix after Reviewer approval
973
+ - rollback: git worktree remove + git branch -D for debug/* worktrees; all fix changes isolated until user confirms merge
974
+
975
+ ## Approval Gates
976
+
977
+ - trigger: investigation plan formulated in Phase 0 | action: present plan to user and proceed unless user objects
978
+ - trigger: Solver produces fix | action: Reviewer agent must independently verify before presenting to user (hard gate — cannot skip)
979
+ - trigger: Reviewer APPROVED | action: present root cause + changes + evidence to user, wait for user confirmation before merging
980
+ - trigger: Reviewer verdict is still NEEDS REWORK after 3 Solver-Reviewer iterations | action: escalate to user with full context, wait for direction
981
+ - complexity_routing: micro → auto | small → auto | medium → plan_first | large → plan_first | xl → always_ask
982
+
983
+ ## Fallbacks
984
+
985
+ - condition: Instrumenter agent fails or produces no useful output | action: skip instrumentation worktree, proceed with remaining agents
986
+ - condition: Reproducer cannot create a minimal failing test | action: note as "reproduction failed", proceed with hypothesis-only approach
987
+ - condition: Researcher finds no relevant issues or docs | action: proceed with instrumentation and hypothesis findings only
988
+ - condition: fix still failing after 3 Solver iterations | action: escalate to user with all hypotheses tested and evidence gathered
989
+ - condition: project has no test suite | action: Reviewer uses build check + diff review + live runtime verification instead of test runner
990
+
991
+ ## Capabilities
992
+
993
+ - cli: `git` | required | worktree isolation for investigation agents
994
+ - mcp: `sequential-thinking` | optional | complex multi-hypothesis analysis
995
+ - mcp: `playwright` | optional | visual bug verification in Reviewer phase
996
+ - mcp: `WebSearch` | optional | Researcher agent for GitHub issues and Stack Overflow
997
+ - mcp: `WebFetch` | optional | Researcher agent for docs and changelogs
998
+
999
+ ## Event Payloads
1000
+
1001
+ ### bug_fixed
1002
+ - skill: string — "ftm-debug"
1003
+ - root_cause: string — one-sentence root cause description
1004
+ - fix_approach: string — description of the fix applied
1005
+ - worktree: string — path to fix worktree
1006
+ - iterations: number — number of solver-reviewer cycles needed
1007
+ - duration_ms: number — total war room duration
1008
+
1009
+ ### issue_found
1010
+ - skill: string — "ftm-debug"
1011
+ - phase: string — "phase1" | "phase2"
1012
+ - agent: string — "instrumenter" | "researcher" | "reproducer" | "hypothesizer"
1013
+ - finding: string — description of the specific issue found
1014
+ - confidence: string — high | medium | low
1015
+
1016
+ ### test_passed
1017
+ - skill: string — "ftm-debug"
1018
+ - scope: string — "reproduction" | "full_suite"
1019
+ - worktree: string — worktree path where tests ran
1020
+
1021
+ ### test_failed
1022
+ - skill: string — "ftm-debug"
1023
+ - scope: string — "reproduction" | "full_suite"
1024
+ - worktree: string — worktree path
1025
+ - error_summary: string — brief failure description
1026
+
1027
+ ### error_encountered
1028
+ - skill: string — "ftm-debug"
1029
+ - phase: string — war room phase where error occurred
1030
+ - agent: string | null — agent that encountered the error
1031
+ - error: string — error description
1032
+
1033
+ ### task_completed
1034
+ - skill: string — "ftm-debug"
1035
+ - outcome: string — "fixed" | "escalated" | "unresolved"
1036
+ - root_cause: string — root cause if found
1037
+ - duration_ms: number — total session duration