forge-workflow 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/.claude/commands/dev.md +314 -0
  2. package/.claude/commands/plan.md +389 -0
  3. package/.claude/commands/premerge.md +179 -0
  4. package/.claude/commands/research.md +42 -0
  5. package/.claude/commands/review.md +442 -0
  6. package/.claude/commands/rollback.md +721 -0
  7. package/.claude/commands/ship.md +134 -0
  8. package/.claude/commands/sonarcloud.md +152 -0
  9. package/.claude/commands/status.md +77 -0
  10. package/.claude/commands/validate.md +237 -0
  11. package/.claude/commands/verify.md +221 -0
  12. package/.claude/rules/greptile-review-process.md +285 -0
  13. package/.claude/rules/workflow.md +105 -0
  14. package/.claude/scripts/greptile-resolve.sh +526 -0
  15. package/.claude/scripts/load-env.sh +32 -0
  16. package/.forge/hooks/check-tdd.js +240 -0
  17. package/.github/PLUGIN_TEMPLATE.json +32 -0
  18. package/.mcp.json.example +12 -0
  19. package/AGENTS.md +169 -0
  20. package/CLAUDE.md +99 -0
  21. package/LICENSE +21 -0
  22. package/README.md +414 -0
  23. package/bin/forge-cmd.js +313 -0
  24. package/bin/forge-validate.js +303 -0
  25. package/bin/forge.js +4228 -0
  26. package/docs/AGENT_INSTALL_PROMPT.md +342 -0
  27. package/docs/ENHANCED_ONBOARDING.md +602 -0
  28. package/docs/EXAMPLES.md +482 -0
  29. package/docs/GREPTILE_SETUP.md +400 -0
  30. package/docs/MANUAL_REVIEW_GUIDE.md +106 -0
  31. package/docs/ROADMAP.md +359 -0
  32. package/docs/SETUP.md +632 -0
  33. package/docs/TOOLCHAIN.md +849 -0
  34. package/docs/VALIDATION.md +363 -0
  35. package/docs/WORKFLOW.md +400 -0
  36. package/docs/planning/PROGRESS.md +396 -0
  37. package/docs/plans/.gitkeep +0 -0
  38. package/docs/plans/2026-02-27-forge-test-suite-v2-decisions.md +21 -0
  39. package/docs/plans/2026-02-27-forge-test-suite-v2-design.md +362 -0
  40. package/docs/plans/2026-02-27-forge-test-suite-v2-tasks.md +343 -0
  41. package/docs/plans/2026-03-02-superpowers-gaps-decisions.md +26 -0
  42. package/docs/plans/2026-03-02-superpowers-gaps-design.md +239 -0
  43. package/docs/plans/2026-03-02-superpowers-gaps-tasks.md +260 -0
  44. package/docs/plans/2026-03-04-agent-command-parity-design.md +163 -0
  45. package/docs/plans/2026-03-04-verify-worktree-cleanup-decisions.md +7 -0
  46. package/docs/plans/2026-03-04-verify-worktree-cleanup-design.md +165 -0
  47. package/docs/plans/2026-03-05-forge-uto-decisions.md +6 -0
  48. package/docs/plans/2026-03-05-forge-uto-design.md +116 -0
  49. package/docs/plans/2026-03-05-forge-uto-tasks.md +244 -0
  50. package/docs/plans/2026-03-10-command-creator-and-eval-decisions.md +52 -0
  51. package/docs/plans/2026-03-10-command-creator-and-eval-design.md +350 -0
  52. package/docs/plans/2026-03-10-command-creator-and-eval-tasks.md +426 -0
  53. package/docs/plans/2026-03-10-stale-workflow-refs-decisions.md +8 -0
  54. package/docs/plans/2026-03-10-stale-workflow-refs-design.md +80 -0
  55. package/docs/plans/2026-03-10-stale-workflow-refs-tasks.md +90 -0
  56. package/docs/plans/2026-03-14-beads-plan-context-decisions.md +9 -0
  57. package/docs/plans/2026-03-14-beads-plan-context-design.md +171 -0
  58. package/docs/plans/2026-03-14-beads-plan-context-tasks.md +160 -0
  59. package/docs/plans/2026-03-14-skill-eval-loop-decisions.md +33 -0
  60. package/docs/plans/2026-03-14-skill-eval-loop-design.md +118 -0
  61. package/docs/plans/2026-03-14-skill-eval-loop-results.md +78 -0
  62. package/docs/plans/2026-03-14-skill-eval-loop-tasks.md +160 -0
  63. package/docs/plans/2026-03-15-agent-command-parity-v2-decisions.md +11 -0
  64. package/docs/plans/2026-03-15-agent-command-parity-v2-design.md +145 -0
  65. package/docs/plans/2026-03-15-agent-command-parity-v2-tasks.md +211 -0
  66. package/docs/research/TEMPLATE.md +292 -0
  67. package/docs/research/advanced-testing.md +297 -0
  68. package/docs/research/agent-permissions.md +167 -0
  69. package/docs/research/dependency-chain.md +328 -0
  70. package/docs/research/forge-workflow-v2.md +550 -0
  71. package/docs/research/plugin-architecture.md +772 -0
  72. package/docs/research/pr4-cli-automation.md +326 -0
  73. package/docs/research/premerge-verify-restructure.md +205 -0
  74. package/docs/research/skills-restructure.md +508 -0
  75. package/docs/research/sonarcloud-perfection-plan.md +166 -0
  76. package/docs/research/sonarcloud-quality-gate.md +184 -0
  77. package/docs/research/superpowers-integration.md +403 -0
  78. package/docs/research/superpowers.md +319 -0
  79. package/docs/research/test-environment.md +519 -0
  80. package/install.sh +1062 -0
  81. package/lefthook.yml +39 -0
  82. package/lib/agents/README.md +198 -0
  83. package/lib/agents/claude.plugin.json +28 -0
  84. package/lib/agents/cline.plugin.json +22 -0
  85. package/lib/agents/codex.plugin.json +19 -0
  86. package/lib/agents/copilot.plugin.json +24 -0
  87. package/lib/agents/cursor.plugin.json +25 -0
  88. package/lib/agents/kilocode.plugin.json +22 -0
  89. package/lib/agents/opencode.plugin.json +20 -0
  90. package/lib/agents/roo.plugin.json +23 -0
  91. package/lib/agents-config.js +2112 -0
  92. package/lib/commands/dev.js +513 -0
  93. package/lib/commands/plan.js +696 -0
  94. package/lib/commands/recommend.js +119 -0
  95. package/lib/commands/ship.js +377 -0
  96. package/lib/commands/status.js +378 -0
  97. package/lib/commands/validate.js +602 -0
  98. package/lib/context-merge.js +359 -0
  99. package/lib/plugin-catalog.js +360 -0
  100. package/lib/plugin-manager.js +166 -0
  101. package/lib/plugin-recommender.js +141 -0
  102. package/lib/project-discovery.js +491 -0
  103. package/lib/setup.js +118 -0
  104. package/lib/workflow-profiles.js +203 -0
  105. package/package.json +115 -0
@@ -0,0 +1,343 @@
1
+ # Task List: Forge Test Suite v2
2
+
3
+ **Feature**: forge-test-suite-v2
4
+ **Beads**: forge-5vf
5
+ **Branch**: feat/forge-test-suite-v2
6
+ **Design doc**: docs/plans/2026-02-27-forge-test-suite-v2-design.md
7
+ **Baseline**: 107/107 tests passing
8
+
9
+ ---
10
+
11
+ ## Ordering Rationale
12
+
13
+ 1. **Delete stale code first** (Tasks 1-2) — removes dead exports so new tests can't accidentally pass by testing removed code
14
+ 2. **Unit tests for lib functions** (Tasks 3-4) — deterministic, fast, bun:test migration
15
+ 3. **Structural command-file tests** (Task 6) — no lib dependency, can run independently
16
+ 4. **commitlint script tests** (Task 5) — isolated, no lib dependency
17
+ 5. **gh-aw behavioral workflow** (Tasks 8-10) — CI-only, built last after unit tests confirm baseline
18
+
19
+ ---
20
+
21
+ ## Task 1: Audit and delete stale lib exports
22
+
23
+ **File(s)**: `lib/commands/research.js`, `lib/commands/plan.js`
24
+
25
+ **What to implement**:
26
+ Grep entire codebase for imports/requires of `lib/commands/research.js` and the OpenSpec functions in `lib/commands/plan.js` (`createOpenSpecProposal`, `formatProposalPRBody`, `createProposalPR`). If zero usages found outside test files, delete `lib/commands/research.js` entirely and remove the three OpenSpec functions from `lib/commands/plan.js`. Update `package.json` exports if needed.
27
+
28
+ **TDD steps**:
29
+ 1. Write test: none — this is a deletion task. Run `grep -r "require.*commands/research" . --include="*.js" --exclude-dir=node_modules --exclude-dir=test` and `grep -r "createOpenSpecProposal\|formatProposalPRBody\|createProposalPR" . --include="*.js" --exclude-dir=node_modules --exclude-dir=test` first.
30
+ 2. Confirm zero usages outside test files
31
+ 3. Delete `lib/commands/research.js`
32
+ 4. Remove OpenSpec functions from `lib/commands/plan.js`
33
+ 5. Run `bun test` — if any test now fails with "Cannot find module", that test was using the deleted code and must also be deleted in Task 2
34
+ 6. Commit: `refactor: delete stale research lib and OpenSpec functions`
35
+
36
+ **Expected output**: `bun test` still passes all non-stale tests. Zero references to deleted exports in non-test files.
37
+
38
+ ---
39
+
40
+ ## Task 2: Delete stale test files
41
+
42
+ **File(s)**: `test/commands/research.test.js`, `test/commands/plan.test.js` (OpenSpec tests only)
43
+
44
+ **What to implement**:
45
+ Delete `test/commands/research.test.js` entirely — it tests `lib/commands/research.js` which no longer exists after Task 1. In `test/commands/plan.test.js`, remove the test blocks for `createOpenSpecProposal`, `formatProposalPRBody`, and `createProposalPR`. Keep `detectScope`, `createBeadsIssue`, `createFeatureBranch`, `extractDesignDecisions` — these are still valid. Migrate kept tests from `node:test` to `bun:test` import syntax in the same step.
46
+
47
+ **TDD steps**:
48
+ 1. Write test: none — deletion task
49
+ 2. Delete `test/commands/research.test.js`
50
+ 3. Remove OpenSpec test blocks from `test/commands/plan.test.js`
51
+ 4. Migrate remaining `plan.test.js` imports: `require('node:test')` → `import { describe, test } from "bun:test"`, `require('node:assert/strict')` → `import { expect } from "bun:test"`
52
+ 5. Run `bun test test/commands/plan.test.js` — must pass
53
+ 6. Commit: `refactor: delete stale research tests and OpenSpec test blocks`
54
+
55
+ **Expected output**: `test/commands/research.test.js` does not exist. `test/commands/plan.test.js` has no OpenSpec references.
56
+
57
+ ---
58
+
59
+ ## Task 3: Add Phase 1/2/3 coverage to plan.test.js
60
+
61
+ **File(s)**: `test/commands/plan.test.js`, `lib/commands/plan.js`
62
+
63
+ **What to implement**:
64
+ Add test coverage for the new `/plan` workflow mechanics. Tests use `bun:test` with `mock.module` for `node:child_process` and `node:fs`. Add mock.module declarations at the top of the file BEFORE any lib import. Cover:
65
+
66
+ - `validateDesignDoc(content)` — returns `{ valid: true, sections: [...] }` for complete doc; `{ valid: false, missing: ['OWASP'] }` for missing OWASP section
67
+ - `validateDesignDoc` minimum content length check — OWASP section < 200 chars → invalid
68
+ - `validateDesignDoc` placeholder detection — doc containing "[describe" → invalid
69
+ - `validateTaskList(content)` — returns `{ valid: true, taskCount: N }` when ≥3 tasks with TDD steps; `{ valid: false, reason: '...' }` when < 50% of tasks have RED/GREEN/REFACTOR
70
+ - `readResearchDoc` now reads from `docs/plans/` (not `docs/research/`) — assert correct path
71
+ - `createFeatureBranch` with `--strategic` flag — assert proposal branch naming `feat/<slug>-proposal`
72
+
73
+ **TDD steps**:
74
+ 1. Write test: `describe("validateDesignDoc")` block with 5 cases (happy path, missing OWASP, short OWASP, placeholder, missing HARD-GATE)
75
+ 2. Run: confirm RED — `validateDesignDoc is not a function`
76
+ 3. Implement `validateDesignDoc(content)` in `lib/commands/plan.js`
77
+ 4. Run: confirm GREEN
78
+ 5. Write test: `describe("validateTaskList")` block with 3 cases (≥3 tasks all with TDD, ≥3 tasks only 30% with TDD → invalid, < 3 tasks → invalid)
79
+ 6. Run: confirm RED
80
+ 7. Implement `validateTaskList(content)` in `lib/commands/plan.js`
81
+ 8. Run: confirm GREEN
82
+ 9. Write test: `readResearchDoc` path assertion — mock `fs.existsSync` to capture the path argument, assert it includes `docs/plans/`
83
+ 10. Run: confirm GREEN or RED depending on current path in lib
84
+ 11. Fix path in lib if needed
85
+ 12. Commit: `test: add Phase 1/2/3 coverage to plan.test.js` then `feat: add validateDesignDoc and validateTaskList`
86
+
87
+ **Expected output**: All new tests pass. `validateDesignDoc` and `validateTaskList` exported from `lib/commands/plan.js`.
88
+
89
+ ---
90
+
91
+ ## Task 4: Add decision gate + subagent tests to dev.test.js
92
+
93
+ **File(s)**: `test/commands/dev.test.js`, `lib/commands/dev.js`
94
+
95
+ **What to implement**:
96
+ Migrate `test/commands/dev.test.js` from `node:test` to `bun:test`. Add `mock.module` for `node:child_process` at top before lib import. Add coverage for:
97
+
98
+ - `evaluateDecisionGate(score)` — score 0-3 → `{ route: 'PROCEED' }`, score 4-7 → `{ route: 'SPEC-REVIEWER' }`, score 8+ → `{ route: 'BLOCKED' }`
99
+ - `orderReviewers(task)` — always returns spec compliance reviewer BEFORE code quality reviewer (spec-before-quality HARD-GATE)
100
+ - `dispatchImplementer(task, designDoc)` — mock the subprocess call, assert it receives full task text (not just task number), assert it receives relevant design doc sections
101
+ - `dispatchImplementer` with missing task list → returns `{ success: false, error: 'task-list-not-found' }`
102
+
103
+ **TDD steps**:
104
+ 1. Migrate existing `dev.test.js` imports to `bun:test` (same pattern as Task 2)
105
+ 2. Add `mock.module("node:child_process", ...)` at top
106
+ 3. Write test: `describe("evaluateDecisionGate")` — 3 score ranges, 3 boundary cases (0, 3, 4, 7, 8, 15)
107
+ 4. Run: confirm RED — `evaluateDecisionGate is not a function`
108
+ 5. Implement `evaluateDecisionGate(score)` in `lib/commands/dev.js`
109
+ 6. Run: confirm GREEN
110
+ 7. Write test: `describe("orderReviewers")` — assert spec reviewer index < quality reviewer index in returned array
111
+ 8. Run: confirm RED or GREEN (may already exist)
112
+ 9. Implement or fix `orderReviewers` if needed
113
+ 10. Write test: `describe("dispatchImplementer")` — mock `execFileSync`, assert call args contain full task text
114
+ 11. Run: confirm RED → implement → GREEN
115
+ 12. Commit: `test: add decision gate and subagent dispatch tests` then `feat: implement evaluateDecisionGate and orderReviewers`
116
+
117
+ **Expected output**: Decision gate routing tested at all 6 boundary values. Spec-before-quality ordering verified. Subagent dispatch mock asserts correct arguments.
118
+
119
+ ---
120
+
121
+ ## Task 5: Add commitlint script tests
122
+
123
+ **File(s)**: `test/scripts/commitlint.test.js` (new file), `scripts/commitlint.js`
124
+
125
+ **What to implement**:
126
+ Create `test/scripts/commitlint.test.js`. Test the cross-platform commitlint runner. Use `bun:test` with `mock.module` for `node:child_process` and `node:fs`.
127
+
128
+ Cover:
129
+ - `getCommitlintRunner()` — `bun.lock` exists → returns `'bunx'`; no `bun.lock` → returns `'npx'`
130
+ - `getCommitlintRunner()` on Windows (`process.platform === 'win32'`) → shell option is `true`
131
+ - Missing commit message file argument → process exits with code 1 + error message
132
+ - Exit code propagation — if spawnSync returns `{ status: 1 }`, script exits with 1
133
+ - Exit code propagation — if spawnSync returns `{ status: 0 }`, script exits with 0
134
+
135
+ **TDD steps**:
136
+ 1. Write test file with 5 test cases listed above
137
+ 2. Run: confirm RED — `test/scripts/commitlint.test.js` doesn't exist yet, or functions not exported
138
+ 3. Refactor `scripts/commitlint.js` to export `getCommitlintRunner()` for testability (extract from inline logic), keep `if (require.main === module)` guard for CLI usage
139
+ 4. Run: confirm GREEN
140
+ 5. Commit: `test: add commitlint script tests` then `refactor: extract getCommitlintRunner for testability`
141
+
142
+ **Expected output**: 5 tests passing for `scripts/commitlint.js`. Function exported without breaking lefthook hook behavior.
143
+
144
+ ---
145
+
146
+ ## Task 6: Add structural command-file tests
147
+
148
+ **File(s)**: `test/commands/plan-structure.test.js` (new file), `test/commands/dev-structure.test.js` (new file)
149
+
150
+ **What to implement**:
151
+ Using the same pattern as `test/ci-workflow.test.js` (reads a file and asserts content structure), create two test files that read `.claude/commands/plan.md` and `.claude/commands/dev.md` and assert required structural elements exist:
152
+
153
+ **plan-structure.test.js asserts:**
154
+ - `<!-- WORKFLOW-SYNC:START -->` and `<!-- WORKFLOW-SYNC:END -->` markers present
155
+ - Phase 1 header `## Phase 1` exists
156
+ - Phase 2 header `## Phase 2` exists
157
+ - Phase 3 header `## Phase 3` exists
158
+ - `<HARD-GATE: Phase 1 exit>` block exists
159
+ - `<HARD-GATE: Phase 2 exit>` block exists
160
+ - `<HARD-GATE: /plan exit>` block exists
161
+ - `Skill("parallel-web-search")` call present in Phase 2
162
+ - `git worktree add` command present in Phase 3
163
+ - `docs/plans/` path present (not `docs/research/`)
164
+ - `docs/plans/YYYY-MM-DD-<slug>-tasks.md` task list path format present
165
+
166
+ **dev-structure.test.js asserts:**
167
+ - `<HARD-GATE: /dev entry>` block exists
168
+ - Spec compliance reviewer step present (text: "Spec compliance reviewer" or "spec-before-quality")
169
+ - Code quality reviewer step present AFTER spec reviewer
170
+ - Decision gate scoring documented (text: "PROCEED", "SPEC-REVIEWER", "BLOCKED" all present)
171
+ - `docs/plans/` path present for task list reading
172
+ - `decisions.md` path present
173
+
174
+ **TDD steps**:
175
+ 1. Write `test/commands/plan-structure.test.js` with all assertions (using `fs.readFileSync` + `includes()`, same pattern as ci-workflow.test.js)
176
+ 2. Run: confirm which assertions pass/fail against current `.claude/commands/plan.md`
177
+ 3. Fix any missing markers in `.claude/commands/plan.md` (add WORKFLOW-SYNC markers if missing)
178
+ 4. Run: GREEN for plan-structure
179
+ 5. Write `test/commands/dev-structure.test.js`
180
+ 6. Run: confirm which pass/fail
181
+ 7. Fix any missing markers in `.claude/commands/dev.md`
182
+ 8. Run: GREEN for dev-structure
183
+ 9. Commit: `test: add structural command-file tests for plan and dev`
184
+
185
+ **Expected output**: Both structural test files pass. `.claude/commands/plan.md` and `dev.md` contain all required structural markers.
186
+
187
+ ---
188
+
189
+ ## Task 7: Update test script in package.json
190
+
191
+ **File(s)**: `package.json`
192
+
193
+ **What to implement**:
194
+ Add the new test files to the `bun test` script in `package.json` so they run in CI:
195
+ - `test/commands/plan.test.js`
196
+ - `test/commands/dev.test.js`
197
+ - `test/commands/plan-structure.test.js`
198
+ - `test/commands/dev-structure.test.js`
199
+ - `test/scripts/commitlint.test.js`
200
+
201
+ Also verify `test/commands/check.test.js`, `test/commands/ship.test.js`, `test/commands/status.test.js` are already included or add them.
202
+
203
+ Run full `bun test` with the updated script to confirm all tests pass.
204
+
205
+ **TDD steps**:
206
+ 1. Read current `package.json` test script
207
+ 2. Add new test file paths
208
+ 3. Run `bun test <all files>` — confirm all pass
209
+ 4. Commit: `chore: add new test files to bun test script`
210
+
211
+ **Expected output**: `bun test` (using package.json script) runs all test files and all pass.
212
+
213
+ ---
214
+
215
+ ## Task 8: Create gh-aw behavioral workflow markdown
216
+
217
+ **File(s)**: `.github/workflows/behavioral-test.md` (new file), `.github/workflows/detect-command-file-changes.yml` (new file)
218
+
219
+ **What to implement**:
220
+ Create the gh-aw behavioral test workflow in markdown format. Two files:
221
+
222
+ **`detect-command-file-changes.yml`** — lightweight standard GitHub Actions YAML that triggers when `.claude/commands/plan.md`, `.claude/commands/dev.md`, or `AGENTS.md` changes on push to master. Its completion triggers the behavioral test via that workflow's `workflow_run` event.
223
+
224
+ **`.github/workflows/behavioral-test.md`** — gh-aw markdown workflow:
225
+
226
+ ```yaml
227
+ ---
228
+ name: forge-workflow-behavioral-test
229
+ description: "Tests that a real AI agent correctly follows the Forge /plan workflow"
230
+ on:
231
+ - schedule: "0 3 * * SUN"
232
+ - workflow_dispatch
233
+ - workflow_run:
234
+ workflows: ["detect-command-file-changes.yml"]
235
+ types: [completed]
236
+ permissions:
237
+ contents: read
238
+ actions: read
239
+ secrets:
240
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
241
+ OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
242
+ engine:
243
+ type: claude
244
+ model: claude-sonnet-4-6
245
+ max-turns: 20
246
+ tools:
247
+ - bash
248
+ - edit
249
+ - github:
250
+ toolsets: [repos, actions]
251
+ ---
252
+ ```
253
+
254
+ Markdown body instructs the agent to:
255
+ 1. Create a temp directory as synthetic test repo
256
+ 2. Run `/plan` on 3-4 rotating test prompts
257
+ 3. Assert artifacts exist (design doc, task list)
258
+ 4. Save Q&A transcript to temp file
259
+ 5. Run judge evaluation (curl to OpenRouter with MiniMax M2.5)
260
+ 6. Parse judge JSON output
261
+ 7. Apply 3-layer scoring (blockers → dimensions → band)
262
+ 8. Append score to `.github/behavioral-test-scores.json`
263
+ 9. If FAIL → exit non-zero (fails the workflow run)
264
+ 10. If INCONCLUSIVE (API error) → exit 0 with warning comment
265
+ 11. Cleanup temp directory
266
+
267
+ **TDD steps**:
268
+ 1. Write `detect-command-file-changes.yml` (standard YAML, no gh-aw)
269
+ 2. Write `.github/workflows/behavioral-test.md` with frontmatter + markdown body
270
+ 3. Run `gh aw compile .github/workflows/behavioral-test.md` to generate `.lock.yml`
271
+ 4. Verify `.lock.yml` was created and is valid YAML
272
+ 5. Commit: `feat: add gh-aw behavioral test workflow`
273
+
274
+ **Expected output**: Both files exist. `.github/workflows/behavioral-test.lock.yml` generated and committed.
275
+
276
+ ---
277
+
278
+ ## Task 9: Create judge scoring script
279
+
280
+ **File(s)**: `scripts/behavioral-judge.sh` (new file)
281
+
282
+ **What to implement**:
283
+ Bash script called by the behavioral test workflow to run the judge evaluation. Takes design doc path + task list path + Q&A transcript path as args. Calls OpenRouter MiniMax M2.5, parses response, applies 3-layer scoring, returns JSON result.
284
+
285
+ Covers all 16 loophole fixes:
286
+ - Layer 1 blocker checks (existence, content length, placeholder detection, timestamp recency, majority TDD threshold)
287
+ - Layer 2 weighted scoring (security ×3, TDD ×3, design ×2, structural ×1, max 45)
288
+ - Layer 3 trend comparison (read previous score from `.github/behavioral-test-scores.json`, compare per-dimension)
289
+ - INCONCLUSIVE on API errors (429/5xx)
290
+ - Calibration mode flag (first 4 runs don't enforce FAIL gate)
291
+ - MiniMax M2.5 with MiniMax K2.5 fallback
292
+
293
+ **TDD steps**:
294
+ 1. Write `test/scripts/behavioral-judge.test.js` with mocked OpenRouter responses covering:
295
+ - All Layer 1 blockers fire correctly
296
+ - Weighted scoring math (security 4/5 × 3 = 12, etc.)
297
+ - INCONCLUSIVE on 429
298
+ - Calibration mode: score below threshold but result is still PASS (with warning)
299
+ - Trend alert: current score 20, previous was 29 → ≥8 point drop → alert
300
+ 2. Run: confirm RED
301
+ 3. Implement `scripts/behavioral-judge.sh` (bash) + export testable functions to a `lib/behavioral-judge.js` wrapper for unit testing
302
+ 4. Run: confirm GREEN
303
+ 5. Commit: `test: add behavioral judge scoring tests` then `feat: implement behavioral judge scoring script`
304
+
305
+ **Expected output**: Judge script handles all 16 loophole scenarios correctly. INCONCLUSIVE does not cause FAIL.
306
+
307
+ ---
308
+
309
+ ## Task 10: Add CI sync check for .lock.yml
310
+
311
+ **File(s)**: `.github/workflows/test.yml`
312
+
313
+ **What to implement**:
314
+ Add a job to `test.yml` that verifies `.github/workflows/behavioral-test.lock.yml` is in sync with `.github/workflows/behavioral-test.md`. On every PR and push, runs `gh aw compile --dry-run` and diffs output against committed `.lock.yml`. Fails if they diverge.
315
+
316
+ Also add: `test/workflows/behavioral-test-sync.test.js` that asserts the `.lock.yml` exists and is non-empty (structural sanity check without requiring gh-aw CLI in unit test environment).
317
+
318
+ **TDD steps**:
319
+ 1. Write `test/workflows/behavioral-test-sync.test.js` — assert `.github/workflows/behavioral-test.lock.yml` exists and `behavioral-test.md` exists
320
+ 2. Run: RED (files don't exist yet from Task 8)
321
+ 3. After Task 8 creates the files, rerun: GREEN
322
+ 4. Add sync check job to `.github/workflows/test.yml`
323
+ 5. Run `gh pr checks` to verify new job appears
324
+ 6. Commit: `test: add behavioral test lock file sync check`
325
+
326
+ **Expected output**: CI fails if `.lock.yml` is out of sync with `.md`. Structural test confirms both files exist.
327
+
328
+ ---
329
+
330
+ ## Parallelization Map
331
+
332
+ ```
333
+ Sequential (must run in order):
334
+ Task 1 (delete stale lib) → Task 2 (delete stale tests) → Task 3-5 (unit tests)
335
+
336
+ Parallel after Task 2:
337
+ Track A: Tasks 3, 4, 5 (unit tests — independent of each other)
338
+ Track B: Task 6 (structural tests — no lib dependency)
339
+ Track C: Task 7 (package.json — wait for Tasks 3-6 to know file names)
340
+
341
+ Sequential after all unit/structural tests:
342
+ Task 8 → Task 9 → Task 10 (behavioral workflow — builds on stable unit test foundation)
343
+ ```
@@ -0,0 +1,26 @@
1
+ # Decisions Log: superpowers-gaps
2
+
3
+ **Feature**: superpowers-gaps
4
+ **Branch**: feat/superpowers-gaps
5
+ **Dev session started**: 2026-03-02
6
+ **Design doc**: `docs/plans/2026-03-02-superpowers-gaps-design.md`
7
+ **Ambiguity policy**: Follow /dev decision gate (7-dimension scoring). Low-impact → proceed + document. High-impact → pause and ask.
8
+
9
+ ---
10
+
11
+ ## /dev Summary
12
+
13
+ **Completed**: 2026-03-02
14
+ **Tasks**: 6 (0a, 0b, 1, 2, 3, 4)
15
+ **Decision gates fired**: 0 (plan quality: Excellent — all ambiguity resolved in Phase 1 Q&A)
16
+ **Final test result**: 1227 pass, 31 skip, 0 fail (1258 total across 72 files)
17
+
18
+ ### Post-implementation fix (final code review finding)
19
+
20
+ **Issue 1**: Duplicate 4-phase debug section in `validate.md` (copy-paste artifact from Task 4 implementation). Removed in commit `5baddcc`.
21
+
22
+ **Issue 2**: Incomplete `/check` → `/validate` rename — `bin/forge.js`, `lib/workflow-profiles.js`, `lib/agents-config.js`, `lib/commands/status.js`, `README.md`, `QUICKSTART.md`, `GEMINI.md`, and 6 test files still referenced `/check`. All updated in commit `5baddcc`.
23
+
24
+ No decision gates were fired during implementation. All ambiguity was resolved upfront in Phase 1 Q&A.
25
+
26
+ **Status**: All decisions RESOLVED. Ready for /validate.
@@ -0,0 +1,239 @@
1
+ # Design Doc: superpowers-gaps
2
+
3
+ **Feature**: superpowers-gaps
4
+ **Date**: 2026-03-02
5
+ **Status**: Phase 3 complete — ready for /dev
6
+ **Branch**: feat/superpowers-gaps
7
+ **Beads**: forge-6od (in_progress)
8
+
9
+ ---
10
+
11
+ ## Purpose
12
+
13
+ Fill 5 workflow gaps identified in the OBRA/Superpowers integration research (`docs/research/superpowers.md`, `docs/research/superpowers-integration.md`, beads `forge-6od`):
14
+
15
+ 1. **Worktree isolation** — `/plan` had no entry gate; planning could run on any branch, contaminating unrelated feature branches (discovered when superpowers-gaps commits leaked into forge-test-suite-v2 history)
16
+ 2. **YAGNI enforcement** — No gate in `/plan` Phase 3 prevents over-scoped tasks
17
+ 3. **DRY enforcement** — No gate in `/plan` Phase 2 checks for existing implementations before planning new ones
18
+ 4. **Verification-before-completion** — `/dev` task completion and `/check` don't require end-to-end verification, only unit test passage
19
+ 5. **Systematic debugging** — No structured investigation workflow when validation fails
20
+
21
+ These gaps mean: planning commits bleed into wrong branches (isolation), code gets planned that already exists (DRY), tasks get created that aren't in the design (YAGNI), and validation failures get "fixed" without root-cause investigation (debug).
22
+
23
+ ---
24
+
25
+ ## Success Criteria
26
+
27
+ 1. ✅ `/plan` has a HARD-GATE at entry that checks the current branch, stops if not on master, then creates `feat/<slug>` + `.worktrees/<slug>` before any Phase 1 work begins (commit `86eaec8`)
28
+ 2. ✅ `/plan` Phase 3 branch creation explicitly uses `git checkout master` as base, not the current branch (commit `9b31bd9`)
29
+ 3. `/plan` Phase 2 includes an explicit DRY check step that searches for existing implementations before finalizing approach
30
+ 4. `/plan` Phase 3 task-writing includes a YAGNI filter: each task must map to a requirement in the design doc; tasks without a design doc anchor are flagged
31
+ 5. `/dev` task completion HARD-GATE requires actual behavior verification (run the feature/function, not just unit tests) before marking a task done
32
+ 6. `/check` is renamed to `/validate` and upgraded: failure path triggers automatic 4-phase systematic debug mode (Reproduce → Root-cause → Fix → Verify) with HARD-GATE: no fix without completed root-cause phase
33
+ 7. AGENTS.md, `docs/WORKFLOW.md`, and workflow table updated to reflect `/validate` naming and new capabilities
34
+ 8. All existing tests pass after changes
35
+
36
+ ---
37
+
38
+ ## Out of Scope
39
+
40
+ - Separate `/debug` command — debug mode is embedded inside `/validate`
41
+ - New review subagent for YAGNI/DRY at code-writing time — enforcement is at planning stage
42
+ - Changes to `/dev` subagent architecture (spec → quality review stays 2-stage; scope compliance is handled in `/plan` pre-work)
43
+ - Changing how Beads integrates with existing commands
44
+ - Any changes to `/ship`, `/review`, `/premerge`, `/verify`
45
+
46
+ ---
47
+
48
+ ## Approach Selected: A+B Hybrid (inline gates + automatic review)
49
+
50
+ **Why not A alone (inline gates only)**: User explicitly wants best quality and automatic process evaluation. Inline planning gates catch scope creep at planning time, but they provide no enforcement at validation time — the automatic 4-phase debug mode in `/validate` covers that gap.
51
+
52
+ **Why not B alone (new subagent per task)**: Adding a scope compliance reviewer as a 3rd subagent per task in `/dev` would slow every task. YAGNI/DRY enforcement is better done at planning time (before any code is written) rather than per-task.
53
+
54
+ **The hybrid**:
55
+ - **Pre-code enforcement** (planning): DRY check in Phase 2, YAGNI filter in Phase 3 — catch problems before code is written
56
+ - **Post-code enforcement** (validation): Verification HARD-GATE in `/dev` task completion, automatic debug mode in `/validate` — catch problems before shipping
57
+
58
+ ---
59
+
60
+ ## Implementation Plan (High-Level)
61
+
62
+ ### Change 1: DRY gate in `/plan` Phase 2
63
+ **File**: `.claude/commands/plan.md`
64
+ **Where**: Phase 2, codebase exploration section
65
+ **What**: Add an explicit step: before finalizing the approach, search the codebase for existing implementations that could be reused or extended. Document what was found and whether the new work extends existing code or starts fresh.
66
+
67
+ ### Change 2: YAGNI filter in `/plan` Phase 3
68
+ **File**: `.claude/commands/plan.md`
69
+ **Where**: Phase 3, Step 5 (task list creation)
70
+ **What**: Add a YAGNI filter step after initial task drafting: for each task, confirm it maps to a specific requirement in the design doc. Tasks without a clear design doc anchor must be either (a) traced back to a requirement or (b) removed. Present any removed tasks to the user as "out of scope" before finalizing.
71
+
72
+ ### Change 3: Verification HARD-GATE in `/dev` task completion
73
+ **File**: `.claude/commands/dev.md`
74
+ **Where**: Task completion HARD-GATE (currently at line ~178)
75
+ **What**: Upgrade the completion gate to require: in addition to tests passing, run the actual implemented function/feature and observe real output. This is the "verification-before-completion" pattern from Superpowers — tests can pass but behavior can still be wrong.
76
+
77
+ ### Change 4: Rename `/check` to `/validate` + add debug mode
78
+ **Files**:
79
+ - `.claude/commands/check.md` → rename to `.claude/commands/validate.md`
80
+ - All references to `/check` in AGENTS.md, `docs/WORKFLOW.md`, `docs/plans/`, `.claude/rules/workflow.md`
81
+ **What**:
82
+ - Rename the command file
83
+ - Add failure path: when any validation step fails, automatically enter 4-phase debug mode:
84
+ - **Phase D1: Reproduce** — confirm failure is deterministic, get exact error output
85
+ - **Phase D2: Root-cause trace** — trace the failure to its actual source (not symptoms)
86
+ - **Phase D3: Fix** — minimal targeted fix for the root cause
87
+ - **Phase D4: Verify** — re-run full validation, confirm fix works end-to-end
88
+ - HARD-GATE: No fix commit without completing Phase D2 (root-cause confirmed in writing)
89
+ - After fix, automatically re-run validation from the beginning
90
+
91
+ ---
92
+
93
+ ## Constraints
94
+
95
+ - **Additive only**: No restructuring of existing phases, no removing steps
96
+ - **Lean gates**: YAGNI filter = checklist, not a new phase. DRY check = one search step, not a research loop. Gates should add ~2-3 lines of instruction, not new procedures.
97
+ - **No new ceremony**: Debug mode in `/validate` activates only on failure. Passing runs are unchanged.
98
+ - **Ambiguity policy**: Follow existing `/dev` decision gate (7-dimension scoring). Low-impact spec gaps → agent makes reasonable choice, documents in decisions file. High-impact gaps → pause and ask.
99
+
100
+ ---
101
+
102
+ ## Edge Cases (from Q&A)
103
+
104
+ 1. **YAGNI filter removes all tasks**: If every task is flagged as out-of-scope, the design doc needs more requirements. Present this as "design doc doesn't cover all tasks — needs amendment" rather than error.
105
+ 2. **DRY check finds partial match**: If codebase has something 80% similar, document it as "extend existing" in the approach — don't create a net-new implementation.
106
+ 3. **Debug mode loops**: If the Phase D3 fix doesn't pass Phase D4 verification, re-enter Phase D1 with more specific reproduction steps. Max 3 debug cycles before surfacing to user with full context.
107
+ 4. **Validation passes on re-run after fix, but fix is wrong**: Phase D4 requires not just "tests pass" but "behavior is correct" — run actual feature, not just tests.
108
+
109
+ ---
110
+
111
+ ## Ambiguity Policy
112
+
113
+ Follow existing `/dev` decision gate (7-dimension scoring system):
114
+ - Score ≤ threshold: Agent makes reasonable choice, documents in `docs/plans/YYYY-MM-DD-superpowers-gaps-decisions.md`
115
+ - Score > threshold: Pause and ask user
116
+
117
+ Phase 1 Q&A pre-resolved all major design questions. Remaining ambiguity should be rare.
118
+
119
+ ---
120
+
121
+ ## Technical Research
122
+
123
+ ### YAGNI/DRY Enforcement — Key Findings
124
+
125
+ **Critical discovery**: Superpowers `writing-plans/SKILL.md` only contains "DRY. YAGNI. TDD." as aspirational bullet points — no actual gates or enforcement mechanisms. Our approach (proper HARD-GATE wording) is stronger than Superpowers' implementation.
126
+
127
+ **Effective YAGNI gate wording** (from Claude Code system prompts, Cursor rules, community research):
128
+ - "Do not add features, refactor, or improve beyond what was asked."
129
+ - "Only make changes that are directly requested or clearly necessary."
130
+ - "YAGNI: No speculative implementation." (applied during GREEN phase)
131
+
132
+ **Effective DRY gate wording**:
133
+ - "Check if logic already exists before writing new code." (Cursor rules)
134
+ - "Before creating new code, search the codebase for existing implementations" + explicit grep/glob tool calls
135
+
136
+ **Critical gotcha**: Aspirational lists ("DRY. YAGNI.") are ignored under pressure. Effective enforcement requires imperative gate language AND explicit search commands (not just "check"). Agents hallucinate that nothing equivalent exists if not forced to search with tools.
137
+
138
+ ### Verification-Before-Completion — Key Findings
139
+
140
+ From `superpowers:verification-before-completion/SKILL.md`:
141
+
142
+ **Iron Law**: `NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE`
143
+
144
+ **5-step gate**:
145
+ 1. IDENTIFY: What command proves this claim?
146
+ 2. RUN: Execute the FULL command (fresh, complete)
147
+ 3. READ: Full output, check exit code, count failures
148
+ 4. VERIFY: Does output confirm the claim?
149
+ 5. ONLY THEN: Make the claim
150
+
151
+ **Enforcement**: "Skip any step = lying, not verifying"
152
+
153
+ **Common failures table** (forbidden substitutes):
154
+ - "Tests pass" ← `"Previous run", "should pass"` is not evidence
155
+ - "Bug fixed" ← `"Code changed, assumed fixed"` is not evidence
156
+ - "Requirements met" ← `"Tests passing"` alone is not sufficient
157
+
158
+ **Red Flags — STOP**: Using "should", "probably", "seems to"; expressing satisfaction ("Great!", "Done!") before verification; trusting agent success reports.
159
+
160
+ ### Systematic Debugging — Key Findings
161
+
162
+ From `superpowers:systematic-debugging/SKILL.md`:
163
+
164
+ **Iron Law**: `NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST`
165
+
166
+ **4-phase structure** (MUST complete each before proceeding):
167
+ - Phase 1: Root Cause Investigation (reproduce, trace data flow)
168
+ - Phase 2: Pattern Analysis (find working examples, compare references)
169
+ - Phase 3: Hypothesis and Testing (form SINGLE hypothesis, test MINIMALLY)
170
+ - Phase 4: Implementation (failing test FIRST, ONE change at a time)
171
+
172
+ **3-fix architectural HARD-GATE**: If >= 3 fix attempts fail → STOP, question architecture. Do NOT attempt Fix #4.
173
+
174
+ **Red Flags — STOP** (return to Phase 1):
175
+ - "Quick fix for now, investigate later"
176
+ - "It's probably X, let me fix that"
177
+ - "I don't fully understand but this might work"
178
+
179
+ **Key principle**: "Fix at source, not at symptom." Seeing symptoms ≠ understanding root cause.
180
+
181
+ ### Rename Scope (/check → /validate)
182
+
183
+ **Files affected**: 25 files, ~70+ instances
184
+ - Command file: `.claude/commands/check.md` → `.claude/commands/validate.md`
185
+ - Implementation: `lib/commands/check.js` → `lib/commands/validate.js`
186
+ - Test file: `test/commands/check.test.js` → `test/commands/validate.test.js`
187
+ - Stage references in all command docs (dev.md, plan.md, ship.md, review.md, premerge.md, verify.md, research.md, rollback.md)
188
+ - Docs: AGENTS.md, docs/WORKFLOW.md, docs/TOOLCHAIN.md, docs/VALIDATION.md, docs/EXAMPLES.md, docs/README-v1.3.md, docs/ROADMAP.md, docs/MANUAL_REVIEW_GUIDE.md, docs/ENHANCED_ONBOARDING.md
189
+ - GitHub: .github/CONTRIBUTING.md, .github/pull_request_template.md, .github/agentic-workflows/behavioral-test.md
190
+ - Rules: .claude/rules/workflow.md
191
+
192
+ **Strategy**: Batch sed replacement across all files for `/check` → `/validate`, then manually update:
193
+ - File renames (check.md → validate.md, check.js → validate.js, check.test.js → validate.test.js)
194
+ - `<HARD-GATE: /check exit>` tag names
195
+ - Function names in check.js that reference "check" semantically
196
+
197
+ ### OWASP Analysis
198
+
199
+ All changes are to `.md` instruction files and `.js` command implementations. No security surface: no user input, no cryptography, no access control, no external service calls.
200
+
201
+ Risk: Near-zero. No OWASP categories apply to this change type.
202
+
203
+ ### TDD Test Scenarios
204
+
205
+ **Test 1 (Happy path — YAGNI filter)**:
206
+ - Input: plan Phase 3 with 5 tasks, 3 mapped to design doc, 2 not mapped
207
+ - Expected: `extractTasksFromDesign()` returns flagged tasks list: 2 tasks with `yagniFlag: true`
208
+ - Test file: `test/commands/plan.phases.test.js`
209
+
210
+ **Test 2 (Happy path — /validate rename)**:
211
+ - Input: `executeValidate({ skip: ['lint', 'security', 'tests'] })`
212
+ - Expected: returns `{ success: boolean, checks: object, summary: string }` (same shape as check)
213
+ - Test file: `test/commands/validate.test.js`
214
+
215
+ **Test 3 (Verification gate — no completion without evidence)**:
216
+ - Input: `validateCompletion({ claimed: 'tests pass', evidence: null })`
217
+ - Expected: throws or returns `{ valid: false, reason: 'No fresh run evidence provided' }`
218
+ - Test file: `test/commands/validate.test.js`
219
+
220
+ **Test 4 (Edge case — all tasks flagged as YAGNI)**:
221
+ - Input: plan Phase 3 with design doc that has no matching tasks
222
+ - Expected: returns `{ allFlagged: true, message: 'Design doc doesn\'t cover all tasks — needs amendment' }`
223
+ - Test file: `test/commands/plan.phases.test.js`
224
+
225
+ **Test 5 (Debug mode — 3-fix architectural gate)**:
226
+ - Input: `debugMode({ fixAttempts: 3, error: 'test failure' })`
227
+ - Expected: returns `{ escalate: true, message: 'STOP: 3+ fixes attempted. Question architecture before Fix #4.' }`
228
+ - Test file: `test/commands/validate.test.js`
229
+
230
+ ---
231
+
232
+ ## Sources
233
+
234
+ - `docs/research/superpowers.md` — Full Superpowers analysis, 14 skills, HARD-GATE pattern
235
+ - `docs/research/superpowers-integration.md` — 5 integration options, decision matrix, recommended path
236
+ - `forge-6od` — Confirmed gaps list with primary sources
237
+ - `.claude/commands/plan.md` — Current plan command state (HARD-GATE blocks confirmed present)
238
+ - `.claude/commands/dev.md` — Current dev command state (two-stage review confirmed present)
239
+ - `.claude/commands/check.md` — Current check command state (verification-before-completion confirmed missing)