voidforge-build 23.11.4 → 23.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/.claude/agents/batman-qa.md +1 -0
  2. package/dist/.claude/agents/galadriel-frontend.md +2 -0
  3. package/dist/.claude/agents/kusanagi-devops.md +4 -0
  4. package/dist/.claude/agents/lucius-config.md +6 -0
  5. package/dist/.claude/agents/samwise-accessibility.md +4 -0
  6. package/dist/.claude/agents/silver-surfer-herald.md +13 -4
  7. package/dist/.claude/commands/architect.md +9 -0
  8. package/dist/.claude/commands/assemble.md +4 -1
  9. package/dist/.claude/commands/assess.md +13 -1
  10. package/dist/.claude/commands/audit-docs.md +106 -0
  11. package/dist/.claude/commands/deploy.md +29 -1
  12. package/dist/.claude/commands/engage.md +19 -1
  13. package/dist/.claude/commands/gauntlet.md +23 -4
  14. package/dist/.claude/commands/imagine.md +15 -0
  15. package/dist/.claude/commands/sentinel.md +15 -0
  16. package/dist/.claude/commands/ux.md +36 -0
  17. package/dist/.claude/commands/void.md +1 -0
  18. package/dist/CHANGELOG.md +65 -0
  19. package/dist/CLAUDE.md +9 -0
  20. package/dist/VERSION.md +3 -1
  21. package/dist/docs/methods/AI_INTELLIGENCE.md +33 -0
  22. package/dist/docs/methods/ASSEMBLER.md +31 -2
  23. package/dist/docs/methods/BUILD_PROTOCOL.md +2 -0
  24. package/dist/docs/methods/CAMPAIGN.md +46 -0
  25. package/dist/docs/methods/DEVOPS_ENGINEER.md +194 -0
  26. package/dist/docs/methods/DOC_AUDIT.md +92 -0
  27. package/dist/docs/methods/FORGE_KEEPER.md +16 -5
  28. package/dist/docs/methods/GAUNTLET.md +38 -0
  29. package/dist/docs/methods/PRODUCT_DESIGN_FRONTEND.md +57 -0
  30. package/dist/docs/methods/QA_ENGINEER.md +21 -0
  31. package/dist/docs/methods/RELEASE_MANAGER.md +27 -0
  32. package/dist/docs/methods/SECURITY_AUDITOR.md +12 -1
  33. package/dist/docs/methods/SUB_AGENTS.md +54 -0
  34. package/dist/docs/methods/SYSTEMS_ARCHITECT.md +13 -0
  35. package/dist/docs/methods/TESTING.md +19 -0
  36. package/dist/docs/patterns/README.md +3 -0
  37. package/dist/docs/patterns/ai-eval.ts +63 -0
  38. package/dist/docs/patterns/daemon-process.ts +90 -0
  39. package/dist/docs/patterns/database-migration.ts +65 -0
  40. package/dist/docs/patterns/deploy-preflight.ts +85 -2
  41. package/dist/docs/patterns/design-tokens.ts +338 -0
  42. package/dist/docs/patterns/error-message-categorization.tsx +376 -0
  43. package/dist/wizard/lib/patterns/daemon-process.d.ts +2 -1
  44. package/dist/wizard/lib/patterns/daemon-process.js +89 -1
  45. package/package.json +2 -2
@@ -230,3 +230,30 @@ After pushing to remote, if the project runs on a persistent server (PM2, system
230
230
  2. **If stale:** Prompt: "Server is running an older version. Rebuild and restart? [Y/n]"
231
231
  3. **In blitz mode:** Auto-rebuild if a deploy script or PM2 ecosystem config exists.
232
232
  4. Pushing code to GitHub is NOT deploying it. The server must be rebuilt and restarted for changes to take effect. (Field report #104: 22 commits pushed but PM2 was still running v3.8.1 while code was v3.10.0.)
233
+
234
+ ## No Auto-Rotting Production-Status Footer (field report #342 F-4)
235
+
236
+ Do NOT add a "Production binary still vX.Y — vA, B, C await operator deploy" footer to the `PROJECT_VERSION.md` template (or any per-version block). The pattern is seductive — it reads as a helpful reminder when written — but it rots silently: it is accurate only at the instant of the version it was written under, and the *next* version bump leaves it pointing at a stale "still on vX.Y" claim that nobody re-reads. By the third release it actively lies about what production is running.
237
+
238
+ **Rule:** Production-deploy status lives in exactly two places, both of which a release bump already touches:
239
+
240
+ 1. **The single source of truth**, if the project keeps one — `docs/_truth.yml` (or equivalent machine-readable status file). One canonical `production_version:` field, not a prose footer.
241
+ 2. **The topmost "Current" block** of `PROJECT_VERSION.md` — the line Coulson already rewrites every bump (Step 5 changes `**Current:** X.Y.Z`). Deploy state, if tracked here at all, belongs adjacent to that line so it is impossible to bump the version without confronting it.
242
+
243
+ A per-version footer fails because it is *additive* — each bump appends a new one and leaves the old ones in place, so the file accumulates N footers of which N−1 are false. The Current block and the truth file are *overwritten* each bump, so they cannot drift. Coulson rejects any release diff that introduces an "await operator deploy" or "Production binary still" footer; route that information to the Current block instead.
244
+
245
+ ## Regenerating Generated CLAUDE.md Stack Blocks (field report #342 F-2)
246
+
247
+ When a generated `CLAUDE.md` (or any generated doc) embeds a project stack/inventory block — framework, language, test count, package versions — do NOT leave a promissory placeholder marker (`<!-- stack block: fill me in -->`, `[STACK_TBD]`, etc.) that depends on a human remembering to update it. Placeholder markers rot the same way the footer in F-4 does: they survive review, ship, and then read as authoritative once the brackets are forgotten.
248
+
249
+ **Pattern:** If the project keeps a machine-readable truth source — `docs/_truth.yml`, `package.json`, a manifest — a regeneration helper rewrites a **clearly-delimited generated block** in place from that source, so the block is reproducible and drift is impossible (re-run the helper, diff, commit). Wrap the block in explicit sentinels so the rewrite is surgical and the hand-written prose around it is never clobbered:
250
+
251
+ ```
252
+ <!-- BEGIN GENERATED: stack (do not edit by hand — run scripts/regen-claude-md.sh) -->
253
+ - **Framework:** Next.js 15.4
254
+ - **Language:** TypeScript 5.6 (strict)
255
+ - **Tests:** 1209 passing
256
+ <!-- END GENERATED: stack -->
257
+ ```
258
+
259
+ A working `scripts/regen-claude-md.sh` may ship alongside this discipline (reading `docs/_truth.yml` / `package.json` and rewriting only the text between the sentinels, leaving everything else byte-identical). If that script is absent, this section documents the intended pattern: the *generated* block is derived, never authored by hand, and never a placeholder. On every MINOR/MAJOR bump Coulson regenerates the block (or flags it for regeneration) rather than trusting that someone updated the prose by hand.
@@ -263,6 +263,17 @@ For any system that sends URLs to users (transactional emails, SMS, push notific
263
263
 
264
264
  This is the outbound mirror of SSRF prevention: SSRF stops external URLs from reaching internal services, outbound URL safety stops internal URLs from reaching external users. (Field report #44: verification email sent with `localhost:5005` URL — worked on same machine, broke from any other device.)
265
265
 
266
+ ### Enforcement-Layer Severity Rubric (field report #354 F2)
267
+
268
+ Key a finding's severity to the **enforcement layer**, not the **symptom location**. The question that sets severity is not "where did I see the leak?" but **"where is this actually enforced?"** Before you assign P0/P1, trace the request to the layer that *decides* — the server-side authorization check, the database query scope, the policy engine — and confirm the gap exists *there*.
269
+
270
+ - **Client-side affordance leak with intact server enforcement = UX-only (P2/P3), not a breach.** A hidden admin button that renders in the DOM, a disabled-but-present form field, an action the SPA shows but the API rejects with 403/404 — these are **render-then-403** patterns. The client showed something it shouldn't, but the actually-enforcing layer (the server) still says no. That is an information-disclosure or UX-polish finding, not a Critical. Rating a server-enforced client affordance leak as Critical is a false-positive that wastes a remediation slot and erodes trust in the report.
271
+ - **A gap at the actually-enforcing layer = P0/P1.** If the server itself does not check ownership, the role gate is missing on the route, or the query has no `org_id` scope, the breach is real regardless of what the client renders. The symptom may surface in the UI, but the severity comes from the server hole.
272
+
273
+ **Verification before scoring (always do this for any "exposed in the UI" finding):** reproduce the action against the API directly — `curl`/Postman with the victim's resource ID and the attacker's credentials, no browser. If the server returns 403/404/401 and writes nothing, the enforcing layer holds → downgrade to P2/P3 and note "server-enforced; client affordance leak only." If the server returns 200 + data or commits a write, the enforcing layer is breached → P0/P1. Never infer the server's behavior from the client's rendering.
274
+
275
+ This is an explicit lens in **both** the audit (Phase 1/2: for every "this is visible/clickable" observation, ask "where is this actually enforced?" and probe that layer) and the re-verify pass (Phase 4: Maul must confirm a downgraded affordance-leak finding by hitting the API directly, not by re-checking the DOM). (#354 F2)
276
+
266
277
  ### Credentials Never in API Responses
267
278
 
268
279
  API responses must NEVER include credentials, tokens, or secrets — even in "admin-only" or "internal" endpoints. Grep for responses that include: `password`, `secret`, `token`, `api_key`, `private_key`, `credentials`. Common violations: user profile endpoints returning the password hash, API key management endpoints including the full key in GET responses (show only last 4 characters), internal debug endpoints returning environment variables. (Field report #66: API settings endpoint returned full MCP connection credentials in the response body.)
@@ -366,7 +377,7 @@ When fixing an auth, authorization, or validation check: trace ALL callers of th
366
377
 
367
378
  After remediations are applied:
368
379
 
369
- **Maul — Red Team Verification:** Re-probe all remediated vulnerabilities. Verify fixes hold under adversarial conditions. Check that fixes didn't introduce new attack vectors. Attempt to bypass the remediations.
380
+ **Maul — Red Team Verification:** Re-probe all remediated vulnerabilities. Verify fixes hold under adversarial conditions. Check that fixes didn't introduce new attack vectors. Attempt to bypass the remediations. **Apply the enforcement-layer lens (#354 F2):** for any finding rated Critical/High off a UI-visible symptom, confirm severity by hitting the API directly — a finding that only reproduces in the DOM but returns 403/404 server-side is a server-enforced affordance leak (P2/P3), not the breach it was filed as. Re-score before sign-off.
370
381
 
371
382
  **Padmé — Functional Verification:** After Maul confirms security holds, Padmé verifies the primary user flow still works end-to-end. Open the app, complete the main task, verify output. This catches "secure but broken" regressions that pure security re-testing misses.
372
383
 
@@ -115,6 +115,10 @@ This powers the Danger Room's live agent ticker. The wizard server watches this
115
115
 
116
116
  This is **methodology-driven logging**, not hook-driven. Hooks cannot extract agent identity from tool input — the orchestrator must write the log entry explicitly. (Field report #128, architectural review)
117
117
 
118
+ ### Workflow-Tool Progress-Tree Labels
119
+
120
+ When dispatching via the Workflow tool, set the agent **label** so the named character surfaces in the `/workflows` progress tree. Use the form `"<agent> · <key>"` (e.g., `"Picard · review:architecture"`, `"Kenobi · sentinel:auth"`, `"Galadriel · ux:a11y"`), or omit the label entirely so the underlying `agentType` surfaces on its own. If you instead pass only a dimension key like `review:architecture` as the label, that key OVERRIDES the agent identity and the tree shows the dimension instead of Picard/Kenobi/Galadriel — the roster becomes anonymous in the dashboard and the Danger Room ticker correlation breaks. Keep the character name as the leading token of every workflow label. (Field report #348 #2.)
121
+
118
122
  ## Delegation Template
119
123
 
120
124
  ```
@@ -330,8 +334,37 @@ This pattern applies to:
330
334
  - Galadriel's UX (Samwise + Radagast re-verify)
331
335
  - Kenobi's Security (Maul re-probes remediations)
332
336
 
337
+ #### Verify the FIX, not just the finding
338
+
339
+ The adversarial-verify step has two distinct jobs, and orchestrators routinely collapse them into one:
340
+
341
+ 1. **Re-probe the fixed AREA** — after a fix lands, confirm the original finding is gone and no neighboring regression appeared. This is the Pass 2 above.
342
+ 2. **Interrogate the fix DESIGN** — before or as the fix lands, challenge the *proposed remediation itself* for NEW failure modes it introduces: wedge (a state that can never be exited inside the available budget), unbounded retry, infinite loop, orphaned record, double-send. This is NOT the same as re-probing the area; it scrutinizes the design of the change, not its installed effect.
343
+
344
+ Job 2 is **especially mandatory when the fix adds a coordination primitive** — a sentinel, a lock, a retry-state record, a fence, a dedup marker — **without a corresponding liveness signal** (a guaranteed path that releases the primitive, an upper bound on retries, a reclaim window that is actually reachable). A coordination primitive with no liveness signal is a wedge waiting to happen: it makes the original bug rarer but converts it into a stuck state that is harder to diagnose.
345
+
346
+ Motivating incidents:
347
+ - **M5 mint-fence** (field report #348 #1 / #350 #4): the fix added a mint fence so a draft couldn't be re-minted concurrently, with a reclaim window to recover abandoned fences. But the reclaim window was set *longer than the retry budget* — so every retry exhausted before the window opened, and the reclaim path was algebraically unreachable inside the retry budget. Drafts wedged permanently in `FAILED`. The fix's own coordination primitive (the fence) had no reachable liveness path.
348
+ - **M6 lifecycle-sweep** (field report #348 #1 / #350 #4): the fix swept lifecycle records on a schedule but compared against a stale `send_at` snapshot captured before the sweep, so a record whose `send_at` had advanced got swept AND re-sent — a double-send introduced by the remediation, not present in the original bug.
349
+
350
+ Both would have been caught by an adversarial pass that asked "what new failure mode does THIS fix create?" rather than only "is the old finding gone?" When a fix introduces a sentinel/lock/retry-state, the verify dispatch brief MUST name the wedge/loop/orphan/double-send checklist explicitly and require the agent to trace the liveness path.
351
+
333
352
  **Important distinction:** The Agent tool enables **parallel analysis**, not parallel coding. Sub-agents return text findings — the lead agent then implements code changes sequentially. This is still faster than sequential analysis, but don't expect parallel file edits.
334
353
 
354
+ ### The Default Review Shape: Find → Cluster/Dedupe → 3-Lens Verify → Fix Only Survivors
355
+
356
+ Every review command — `/engage`, `/sentinel`, `/gauntlet` — runs the same four-stage shape, not a flat "list findings then fix everything" pass. v23.12.0 added the refute-pass mechanics to `/gauntlet`; this is the generalized naming so the same discipline is the DEFAULT everywhere, not Gauntlet-only (field report #354 F1).
357
+
358
+ 1. **Find** — fan out the roster; each lens produces raw findings against the same diff (see Intentionally Overlapping Mandates).
359
+ 2. **Cluster/Dedupe** — collapse the raw findings into distinct claims. The same root cause flagged by Stark + Kenobi + Ahsoka is ONE claim with three votes, not three findings. LLM-assigned finding ids are display labels, not keys — dedupe on the claim, not the id.
360
+ 3. **3-Lens Verify** — every surviving claim is interrogated through three lenses before it earns a fix:
361
+ - **Correctness** — is the asserted behavior actually wrong? (the bug is real, the logic is genuinely broken)
362
+ - **Reachability** — can production actually hit this path? (not provably-dead-code, not behind a `DEV_ONLY` gate — see the WARN/cosmetic contract above)
363
+ - **Refutation** — assign a **skeptic agent whose explicit job is to REFUTE the finding, casting a confirm vote only if the refutation fails.** The skeptic is told to argue the finding is wrong/unreachable/already-handled and to vote CONFIRM only if it cannot make that argument hold. A claim that survives a reviewer instructed to kill it is a real claim. This is the defining element of the shape: not "does another lens agree?" but "does a lens TRYING to disprove it fail to?" A finding nobody was assigned to refute is unverified, regardless of how many agents independently raised it.
364
+ 4. **Fix Only Survivors** — only claims that pass all three lenses (correct AND reachable AND survive a confirm-vote refutation) enter the fix batch. Refuted claims are logged with the refutation rationale and dropped — never silently, so a future review doesn't re-raise them.
365
+
366
+ The refutation lens is what separates this from the Intentionally Overlapping Mandates convergence rule: convergence asks independent agents to agree; refutation assigns one agent to disagree on purpose. Run both — convergence raises confidence on what's flagged, refutation removes false positives from the fix batch. (Field report #354 F1.)
367
+
335
368
  ### Multi-Session Parallelism (Separate Terminals)
336
369
 
337
370
  For larger projects where agents need to make code changes simultaneously, use separate Claude Code sessions in different terminal windows. Each session works on separate files within defined scope boundaries.
@@ -372,6 +405,18 @@ Proven in production: a full `/assemble --muster` (11 phases, 15+ agents) ran en
372
405
  | Track status, report to user | Do work an agent could do |
373
406
  | Git operations (commit, push) | Launch agent-to-agent dispatch |
374
407
 
408
+ ### Default to Fixing, Not to Asking Which to Fix
409
+
410
+ When a review surfaces a clear list of fixable findings, the orchestrator's DEFAULT is to apply them in batches — not to surface a multi-option "which subset should I fix?" picker and wait. A list of well-scoped findings with obvious remediations is a work queue, not a decision fork. Presenting it back to the user as a menu of options offloads triage the orchestrator was dispatched to do, and stalls a batch that could already be landing.
411
+
412
+ Apply the findings in batches (partition by domain/concern per the Concurrency Rules), verify after each batch, and report what was fixed. Only stop to ask when a choice is **genuinely architectural or irreversible** — e.g., two incompatible schema directions, a data migration that can't be rolled back, a dependency that changes the deploy target, or a trade-off the PRD is silent on (then follow Multi-agent conflict resolution). "Which of these 9 lint/logic findings should I fix?" is not such a choice; "should this be event-sourced or CRUD?" is. (Field report #343 F5.)
413
+
414
+ ### Use AskUserQuestion at Genuine Forks
415
+
416
+ The flip side of the anti-picker rule: when the orchestrator hits a **genuine creative or scope fork** — 2-3 mutually-exclusive directions, none obviously dominant, where guessing wrong means rework — present them with `AskUserQuestion` and an option preview for each, rather than silently picking one or surfacing a single take-it-or-leave-it option. Give each option a short label and a one-line preview of what it commits to (the trade-off, the consequence, what it forecloses), so the user can decide in one read instead of an interview.
417
+
418
+ Use it for: which of two layouts/IA directions, which scope to ship first when both are valid, an irreversible architectural split, a naming/contract convention that downstream agents will all inherit. Do NOT use it as a substitute for triage you should be doing yourself (see the anti-picker rule above), and do NOT pad it past 3 options — a fork with 6 options usually means the scope wasn't analyzed enough to narrow it. One option presented as a question ("shall I do X?") is also an anti-pattern: either it's the obvious default (just do it) or there's a real alternative (show both). (Field report #351 #5.)
419
+
375
420
  ### Standard Agent Brief
376
421
 
377
422
  Every agent launch MUST include a structured brief:
@@ -425,6 +470,15 @@ Field report #324 (Union Station v7.8 R2): three agents (Discovery + Stark + Ken
425
470
  - **Wait for ALL parallel agents before synthesizing** (field report #300).
426
471
  - Partition strategies: by domain (frontend/backend), by concern (security/UX), or read-only vs. write.
427
472
 
473
+ ### Directory / Migration Fan-Out: Glob the List, Sweep the Remainder
474
+
475
+ When a wave fans out per-file or per-entity work across a directory or migration (one agent per file/module/route/migration), two rules are MANDATORY (field report #355 F2):
476
+
477
+ 1. **Derive the per-agent file list from a GLOB, never a hand-typed list.** Run `ls`/`Glob` (e.g., `Glob "src/routes/**/*.ts"`, `git ls-files 'migrations/*.sql'`) and partition the GLOB output into agent assignments. A hand-typed list silently drops the files the orchestrator forgot existed — and those are exactly the ones with the unmigrated legacy pattern, because they weren't top-of-mind. The glob is the source of truth for "what's in scope," not the orchestrator's memory of the tree.
478
+ 2. **Pair every fan-out with a post-fan-out completeness sweep before the wave is "done."** After all fan-out agents return, run ONE grep for the legacy pattern across the WHOLE target tree (not just the assigned files) — e.g., `grep -rn "oldApiCall(" src/` or `grep -rln "TODO: migrate" .`. A wave is not complete while that grep returns hits. The sweep catches: files the glob/partition missed, files created mid-wave by a parallel agent, and occurrences an agent declared done but left behind. Zero hits is the completion gate, not "all dispatched agents reported done."
479
+
480
+ The failure mode this prevents: a fan-out reports "9/9 agents complete" while 3 files still carry the legacy pattern — because they were never in the hand-typed list, and nobody grepped the whole tree to confirm. "All my agents finished" is not "the migration is complete." The completeness sweep is the difference. (Field report #355 F2.)
481
+
428
482
  ### Context Passing Between Phases
429
483
 
430
484
  - Pass **findings summaries** between phases, not raw file contents
@@ -211,6 +211,19 @@ When reviewing architecture, identify all endpoints/services that mutate the sam
211
211
 
212
212
  When architecture requires accepting a known security risk (e.g., iframe sandbox weakening for UX, storing tokens in memory for operational continuity), document it as an ADR with explicit risk acceptance. Include: the tradeoff made, what is gained, what attack surface is expanded, what mitigations are in place, and who accepted the risk. This prevents the same finding from appearing in every future audit and reduces Gauntlet noise. (Field report #102: preview iframe `allow-scripts + allow-same-origin` sandbox escape was a known tradeoff but was never documented — flagged in every security pass.)
213
213
 
214
+ ### Fix-Direction Reconciliation Against Doctrine
215
+
216
+ For any access, permission, or contract fix, "verified" is not sufficient to make the fix actionable. A finding can be reproduced, root-caused, and confirmed by multiple agents and *still* carry a backwards fix — one that widens a permission, grants access to the wrong principal, or relaxes a contract the doctrine intends to tighten. Reproduction proves the behavior; it does not prove the fix moves in the correct direction. (Field report #349 F-2)
217
+
218
+ Before any such fix is accepted, the architect MUST do two things explicitly:
219
+
220
+ 1. **Name the governing SSOT.** Identify the single source of truth that governs the access/permission/contract being changed — the permission matrix, the relevant ADR, or the published API contract. If no SSOT exists for the boundary being touched, that absence is itself a finding: the fix is unanchored and must wait until the doctrine is written.
221
+ 2. **Reconcile the fix DIRECTION against that SSOT.** State, in the fix record, whether the change *loosens* or *tightens* the boundary, and *who gains or loses access* as a result. Then compare that direction to what the named SSOT prescribes. If the fix loosens a permission the matrix says should be tightened (or grants a role access the ADR reserves for another), the fix is backwards — reject it and re-derive the correct change from doctrine, regardless of how well-verified the underlying finding is.
222
+
223
+ The reconciliation belongs in the same record as the finding: *"SSOT: <permission-matrix row / ADR-NNN / contract endpoint>. Direction: <loosen|tighten>; <principal> gains/loses <access>. Doctrine prescribes: <tighten|loosen>. Reconciled: <match|MISMATCH — fix is backwards>."* A MISMATCH blocks the fix.
224
+
225
+ This mirrors the engage.md Step 2 requirement that access/permission findings name their governing SSOT and reconcile fix direction before synthesis — Picard applies the same gate at the architecture layer so a backwards fix never reaches an ADR or an implementer. (Field report #349 F-2)
226
+
214
227
  ### Strategy Consolidation Check
215
228
 
216
229
  When a system implements N parallel strategies for the same goal (payment providers, notification channels, API versions, deployment targets, content pipelines), periodically verify that each strategy still justifies its maintenance cost. If usage data shows one strategy handling 95%+ of traffic or value while the others sit idle or near-zero, the idle strategies are not "options" — they are dead code with maintenance burden.
@@ -356,6 +356,25 @@ Do NOT create custom DDL in test files — it drifts from the real schema (missi
356
356
 
357
357
  Custom DDL causes test DB schema mismatches that require 2-3 fix-and-retry cycles per occurrence. (Field report #31)
358
358
 
359
+ ### Failure Attribution in Shared-State Suites
360
+
361
+ When a test fails in a suite that shares mutable state across files (a shared test DB, module-level singletons, a global fixture, an ordering-sensitive runner), do NOT attribute a multi-file failure to your change until you have reproduced it in isolation. Shared state means a failure can surface in file B while the root cause lives in file A — or in test ordering itself, not in your edit at all. (Field report #349 F-3)
362
+
363
+ **Procedure:**
364
+
365
+ 1. **Isolate the failing file.** Run only the failing test file (or the single test), so cross-file state pollution can't contribute. Use the framework's isolation/single-worker flag so the runner doesn't parallelize or randomize:
366
+
367
+ | Framework | Isolate single-worker / no parallelism | Disable random ordering |
368
+ |-----------|----------------------------------------|-------------------------|
369
+ | vitest | `vitest run --no-threads <file>` (or `--pool=forks --poolOptions.forks.singleFork`) | `--sequence.shuffle=false` |
370
+ | jest | `jest --runInBand <file>` | No flag needed — order is deterministic by default; do not pass `--randomize`, and remove any custom `--testSequencer` that shuffles |
371
+ | pytest | `pytest <file>::<test>` | `pytest -p no:randomly` (disable pytest-randomly) |
372
+
373
+ 2. **Compare against clean HEAD.** Stash your change (`git stash`, or `git stash -u` if the change adds untracked files — a plain stash leaves them in the tree) and re-run the same isolated command on a clean tree. If it still fails on clean HEAD, the failure is pre-existing — not yours. Restore with `git stash pop` afterward.
374
+ 3. **Only after isolation + clean-HEAD comparison** attribute the failure to your change, and fix the actual cause rather than the symptom.
375
+
376
+ This is the canonical rule in `/docs/methods/QA_ENGINEER.md` (Failure Attribution) — see it for the full decision tree. This section is the testing-runner-flag companion to it.
377
+
359
378
  ## Setup Checklist
360
379
 
361
380
  When setting up testing for a new project:
@@ -34,6 +34,9 @@ Reference implementations for common code structures. These show the **shape and
34
34
  | Data Pipeline | `data-pipeline.ts` | ETL with checkpoint/resume, quality checks, idempotent processing | Node.js streams, Python polars, SQL/dbt |
35
35
  | Backtest Engine | `backtest-engine.ts` | Walk-forward validation, no-lookahead, Sharpe/drawdown metrics | Python vectorbt/backtrader |
36
36
  | Execution Safety | `execution-safety.ts` | Order validation, position limits, exchange precision, paper/live toggle | CCXT, Alpaca, IBKR |
37
+ | Design Tokens | `design-tokens.ts` | Semantic color/type tokens so theme pivots are a token change (field report #351) | CSS vars + Tailwind + React |
38
+ | Nginx Vhost | `nginx-vhost.conf` | Cloudflare-Flexible-safe vhost: security headers, ACME passthrough (field report #351) | Nginx |
39
+ | Error Message Categorization | `error-message-categorization.tsx` | Categorize errors at the UI boundary before showing copy (field report #351) | React (framework-agnostic notes) |
37
40
 
38
41
  ## How to Use
39
42
 
@@ -337,6 +337,69 @@ export const CLAUDE_PROMPT_EVAL_CATEGORIES = {
337
337
  * 7. cost per case within 20% of baseline
338
338
  */
339
339
 
340
+ // --- Live eval layer: the pre-launch gate (field report #352, #4) ---
341
+
342
+ /**
343
+ * THE LIVE EVAL LAYER IS THE PRE-LAUNCH GATE.
344
+ *
345
+ * Deterministic and sandbox-adapter evals (fixed inputs, fake-data runners)
346
+ * verify your *plumbing* — scoring functions, tag breakdowns, comparison
347
+ * thresholds. They CANNOT catch model-output-shape bugs, because the runner
348
+ * never calls a real model. The shape of what a live model actually emits —
349
+ * extra prose, null fields, reordered keys, casing drift — only appears when
350
+ * you run against the real provider.
351
+ *
352
+ * Field report #352: a classifier passed every sandbox eval (the fake runner
353
+ * returned hand-written JSON), then crashed in production on launch day
354
+ * because the live model emitted `null` for an absent optional field and the
355
+ * Zod `.optional()` parse rejected it. The deterministic layer was green the
356
+ * whole time. The bug was structurally invisible to it.
357
+ *
358
+ * Rule: before any launch, run AT LEAST ONE eval pass with a LIVE model
359
+ * runner (real provider call), not just the sandbox runner. Treat the live
360
+ * pass as a release gate — a deterministic-only green is necessary but never
361
+ * sufficient. Wire it as the final, non-skippable category in CI.
362
+ *
363
+ * // Sandbox pass — fast, free, catches plumbing regressions:
364
+ * await suite.run(sandboxRunner, version, 'sandbox')
365
+ *
366
+ * // Live pass — the actual gate, catches output-shape bugs:
367
+ * await suite.run(liveModelRunner, version, 'claude-sonnet-4-20250514')
368
+ */
369
+
370
+ /**
371
+ * GOTCHA: live models emit `null` for absent optionals — Zod `.optional()`
372
+ * accepts `undefined`, NOT `null` (field report #352, #4).
373
+ *
374
+ * `z.string().optional()` is `string | undefined`. A live model serializing
375
+ * "this field is absent" almost always emits JSON `null`, which deserializes
376
+ * to JS `null` — and `null` fails `.optional()`. The fix is to normalize
377
+ * null-to-undefined BEFORE Zod validation (do NOT reach for `.nullable()`
378
+ * everywhere — that leaks `null` into downstream types and just moves the
379
+ * problem). Normalize at the boundary, validate clean shapes inside.
380
+ *
381
+ * const Schema = z.object({ label: z.string(), reason: z.string().optional() })
382
+ * const raw = JSON.parse(modelOutput) // { label: 'billing', reason: null }
383
+ * const parsed = Schema.parse(normalizeNullsToUndefined(raw)) // ✓ reason -> undefined
384
+ */
385
/**
 * Returns a copy of `value` in which every `null` has been turned into
 * "absent", so the result can be validated against Zod `.optional()` fields.
 *
 * - A top-level `null` becomes `undefined`.
 * - Arrays are mapped element by element; a `null` element becomes
 *   `undefined` in place, so length and indices are preserved.
 * - For any other object, own enumerable string keys are copied into a fresh
 *   object, and keys whose normalized value is `undefined` are dropped.
 * - All remaining values (strings, numbers, booleans, `undefined`, ...) are
 *   returned unchanged.
 *
 * The input is never mutated.
 *
 * NOTE(review): every non-array object is rebuilt from `Object.entries`, so a
 * class instance would come back as a plain object — presumably callers only
 * pass `JSON.parse` output; confirm before using on richer values.
 *
 * @param value - Parsed model output (or any JSON-like value).
 * @returns The normalized copy, asserted back to `T` for caller convenience.
 */
export function normalizeNullsToUndefined<T>(value: T): T {
  if (value === null) return undefined as T
  if (Array.isArray(value)) {
    return value.map((item) => normalizeNullsToUndefined(item)) as unknown as T
  }
  if (value && typeof value === 'object') {
    const kept: Array<[string, unknown]> = []
    for (const [key, val] of Object.entries(value as Record<string, unknown>)) {
      const normalized = normalizeNullsToUndefined(val)
      // Drop keys whose value normalized to undefined so Zod `.optional()`
      // treats them as truly absent rather than present-with-undefined.
      if (normalized !== undefined) kept.push([key, normalized])
    }
    // Build with Object.fromEntries rather than `out[key] = ...`: it defines
    // own data properties, so a "__proto__" key in untrusted model output
    // stays an ordinary key instead of invoking the Object.prototype.__proto__
    // setter and replacing the result's prototype.
    return Object.fromEntries(kept) as T
  }
  return value
}
402
+
340
403
  /**
341
404
  * Framework adaptations:
342
405
  *
@@ -325,6 +325,95 @@ function createLogger(logPath: string): { log: (msg: string) => void; close: ()
325
325
  };
326
326
  }
327
327
 
328
+ // ── .env Parsing (literal, $-safe) ────────────────────
329
+ // field report #344 F1: never source secrets via `export $(cat .env)` /
330
+ // `eval "$(cat .env)"`. The shell performs variable expansion and word
331
+ // splitting on the RHS, so a `$`-bearing secret — bcrypt hashes
332
+ // ($2b$...), JWTs, Postgres URLs with `$` in the password, anything with
333
+ // `$VAR`/`${...}`/backticks — gets mangled or silently truncated. Parse
334
+ // literally instead: read each line, split on the FIRST `=` only, and keep
335
+ // the value byte-for-byte (no expansion, no eval). For shells, the
336
+ // equivalent is `while IFS='=' read -r k v; do export "$k=$v"; done < .env`
337
+ // — note IFS='=' and `read -r` (raw, no backslash processing), which never
338
+ // re-expands the value.
339
+ //
340
+ // Prefer a runtime-native loader where available — it sidesteps the shell
341
+ // entirely:
342
+ // - Node 20.6+: `node --env-file=.env daemon.js` (literal parse, no shell).
343
+ // - systemd: `EnvironmentFile=/etc/voidforge/heartbeat.env` (also literal;
344
+ // unit-file `Environment=` lines do NOT undergo shell expansion).
345
+ // Use this helper only when you must parse `.env` in-process.
346
+
347
/**
 * Parse `.env`-style text into a key/value map with NO shell semantics:
 * no `$` expansion, no command substitution, no escape processing.
 *
 * Rules, per line:
 * - Blank lines and lines whose first non-blank character is `#` are skipped.
 * - A leading `export` keyword (followed by spaces or tabs) is tolerated.
 * - The line is split on the FIRST `=` only; lines without `=` are ignored.
 * - The key is trimmed and must match `[A-Za-z_][A-Za-z0-9_]*`, else the line
 *   is ignored.
 * - If the value, ignoring surrounding whitespace, is wrapped in one matching
 *   pair of `"` or `'`, that single layer is removed and the inside is kept
 *   byte-for-byte. Whitespace outside the quotes (`KEY = "v"  `) is dropped
 *   rather than leaving the quote characters in the value.
 * - An unquoted value only has trailing whitespace removed.
 * - Later duplicates of a key overwrite earlier ones.
 *
 * @param contents - Raw text of the `.env` file (LF or CRLF line endings).
 * @returns Map of variable name to literal value.
 */
function parseDotenv(contents: string): Record<string, string> {
  const out: Record<string, string> = {};
  for (const rawLine of contents.split('\n')) {
    const line = rawLine.replace(/\r$/, '');
    // Skip blanks and comments. A leading `export` prefix is tolerated.
    const trimmed = line.trimStart();
    if (trimmed === '' || trimmed.startsWith('#')) continue;
    const body = trimmed.replace(/^export\s+/, '');

    // Split on the FIRST `=` only — values may legitimately contain `=`.
    const eq = body.indexOf('=');
    if (eq < 0) continue; // not a KEY=VALUE line — ignore, don't guess
    const key = body.slice(0, eq).trim();
    if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(key)) continue; // invalid env name

    const rawValue = body.slice(eq + 1);
    // Detect quotes on the whitespace-trimmed value: a stray blank after the
    // closing quote (or before the opening one) must not defeat the match and
    // leave literal quote characters inside the secret.
    const candidate = rawValue.trim();
    const opener = candidate[0];
    const isQuoted =
      candidate.length >= 2 &&
      (opener === '"' || opener === "'") &&
      candidate[candidate.length - 1] === opener;

    // Quoted: strip one layer, keep the inside LITERALLY (`PASS='p@$$w0rd'`
    // keeps its `$$`). Unquoted: trim trailing whitespace only, never touch
    // interior `$` characters.
    out[key] = isQuoted ? candidate.slice(1, -1) : rawValue.trimEnd();
  }
  return out;
}
379
+
380
+ // ── systemd hardening stanza (Node daemons) ───────────
381
+ // field report #344 F3: when running this daemon under systemd, harden the
382
+ // unit — but DO NOT set `MemoryDenyWriteExecute=true` for a Node/V8 process.
383
+ // V8's JIT allocates pages that are written and then executed (it manages its
384
+ // own W^X internally); MDWE forbids any write+exec mapping, so the daemon
385
+ // takes a SIGTRAP and dies at boot, usually before it logs a single line. The
386
+ // safe, high-value sandbox flags below give most of MDWE's benefit without the
387
+ // JIT collision:
388
+ //
389
+ // [Unit]
390
+ // Description=VoidForge Heartbeat daemon
391
+ // After=network-online.target
392
+ // Wants=network-online.target
393
+ //
394
+ // [Service]
395
+ // Type=simple
396
+ // ExecStart=/usr/bin/node /opt/voidforge/daemon.js
397
+ // # literal parse — see #344 F1
+ // EnvironmentFile=/etc/voidforge/heartbeat.env
398
+ // Restart=on-failure
399
+ // RestartSec=5
400
+ //
401
+ // # Hardening — keep these. (systemd has no trailing comments: a `#` after a
+ // # value becomes part of the value, so each note sits on its own line.)
+ // # no setuid/setgid privilege escalation
+ // NoNewPrivileges=true
+ // # /usr, /boot, /etc mounted read-only
+ // ProtectSystem=full
+ // # /home, /root, /run/user hidden
+ // ProtectHome=true
+ // # private /tmp + /var/tmp namespace
+ // PrivateTmp=true
+ // # OMITTED ON PURPOSE: breaks V8 JIT (SIGTRAP at boot). Re-enable ONLY for
+ // # Go/Rust/static daemons with no JIT.
+ // # MemoryDenyWriteExecute=true
409
+ //
410
+ // [Install]
411
+ // WantedBy=multi-user.target
412
+ //
413
+ // Go, Rust, and other AOT-compiled daemons emit no executable pages at
414
+ // runtime, so for THEM you can and should keep `MemoryDenyWriteExecute=true`.
415
+ // The omission above is V8-specific, not a general weakening.
416
+
328
417
  export {
329
418
  writePidFile, checkStalePid, removePidFile,
330
419
  generateSessionToken, validateToken,
@@ -333,6 +422,7 @@ export {
333
422
  setupSignalHandlers,
334
423
  JobScheduler,
335
424
  createLogger,
425
+ parseDotenv,
336
426
  PID_FILE, SOCKET_PATH, TOKEN_FILE, STATE_FILE, LOG_FILE,
337
427
  };
338
428
  export type { DaemonState, HeartbeatState, ScheduledJob };
@@ -245,6 +245,71 @@ const dropLegacyAvatarUrl: MigrationStep = {
245
245
  },
246
246
  };
247
247
 
248
+ // ── Boot-Time Schema Re-Application & Table Ownership (#354 F4) ──
249
+
250
+ /**
251
+ * GUARD: idempotent boot-time schema re-application must account for table
252
+ * OWNERSHIP and role grants — not just IF NOT EXISTS / IF EXISTS guards (#354 F4).
253
+ *
254
+ * The trap: a service that re-applies its schema at startup (boot-time DDL,
255
+ * "ensure schema" on connect) often connects as an app role distinct from the
256
+ * role that originally created the tables. In PostgreSQL, ALTER TABLE / DROP /
257
+ * CREATE INDEX / ADD COLUMN require the table OWNER (or a superuser) — not just
258
+ * INSERT/UPDATE/SELECT privileges. So even a fully idempotent
+ * `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` / `CREATE INDEX IF NOT EXISTS`
+ * will FAIL at boot with "must be owner of table X" when the table is owned by
+ * a different DB role (e.g. a migration/admin role) than the connecting app role.
+ * (`CREATE TABLE IF NOT EXISTS` on an existing table is the exception: it needs
+ * only CREATE on the schema and is skipped with a NOTICE, not an ownership error.)
+ * The IF [NOT] EXISTS guard does not save you here — ownership is checked before
+ * the existence short-circuit on both ALTER TABLE and CREATE INDEX.
265
+ *
266
+ * Why it bites at boot specifically: idempotent re-application is meant to be a
267
+ * safe no-op on an already-migrated DB. But the ownership check fires regardless
268
+ * of whether the change is a no-op, so a healthy, already-correct schema can
269
+ * still crash the service on startup.
270
+ *
271
+ * Mitigations (pick per your trust model):
272
+ * - Run boot-time/idempotent DDL as the table OWNER role, not the app role.
273
+ * Keep schema changes on a privileged migration role; let the app role do DML only.
274
+ * - OR align ownership: `ALTER TABLE <t> OWNER TO <app_role>` once (by a superuser),
275
+ * or create tables under the app role from the start.
276
+ * - OR use a shared owning role and `GRANT <owner_role> TO <app_role>` so the
277
+ * app role can act as owner for DDL.
278
+ * - Prefer a one-shot migration step (runMigrations) over boot-time re-application
279
+ * for anything beyond table/index existence — it isolates the privileged role.
280
+ *
281
+ * Pre-flight check before re-applying schema at boot — fail fast with a clear
282
+ * message instead of a raw "must be owner" deep in startup.
283
+ */
284
async function assertTableOwnership(
  ctx: MigrationContext,
  table: string,
  expectedRole: string
): Promise<void> {
  const EVENT = 'migration.ownership_check';

  // PostgreSQL exposes each table's owning role through the pg_tables view.
  const ownerLookup = await ctx.execute(
    `SELECT tableowner FROM pg_tables WHERE tablename = $1`,
    [table]
  );

  if (ownerLookup.rowCount !== 0) {
    // The table exists. This reference version only records the expectation;
    // a real adapter would read `tableowner` from the returned row and compare
    // it with `expectedRole`, because boot-time ALTER / CREATE INDEX on a table
    // owned by another role fails with "must be owner of table" (#354 F4).
    ctx.log(EVENT, {
      table,
      present: true,
      expectedRole,
      note: 'boot-time DDL requires table owner or a superuser; app-role DML privileges are not enough',
    });
    return;
  }

  // Zero rows: the table does not exist yet, so a later CREATE TABLE IF NOT
  // EXISTS creates it under the connecting role and ownership cannot mismatch.
  ctx.log(EVENT, { table, present: false });
}
312
+
248
313
  // ── Batched Processing for Large Tables ─────────────────
249
314
 
250
315
  /**
@@ -4,13 +4,17 @@
4
4
  * Reference implementation for .claude/commands/deploy.md Step 2.5.
5
5
  * Scans the deploy artifact directory BEFORE upload. Exits non-zero on any hit.
6
6
  *
7
- * Evidence: field reports #305 (32-day credential leak), #303 (methodology exposure).
7
+ * Evidence: field reports #305 (32-day credential leak), #303 (methodology exposure),
8
+ * #343 F7 (stop-build-start loop mislabeled "blue-green" → 502 window every deploy).
8
9
  *
9
10
  * Key principles:
10
11
  * - Scan the deploy payload directory, NOT the repo root.
11
12
  * - Never auto-filter — a hit means the operator must investigate.
12
13
  * - Never print secret content; only paths + pattern IDs.
13
14
  * - Allowlist escape hatch via DEPLOY_PREFLIGHT_ALLOW (comma-separated globs).
15
+ * - Deploy-strategy claims must be backed by a real mechanism: a comment that
16
+ * says "blue-green"/"zero-downtime" without an atomic swap (rename, container
17
+ * swap, or LB cutover) is a lie that ships a 502 window (#343 F7).
14
18
  *
15
19
  * Usage:
16
20
  * npx tsx docs/patterns/deploy-preflight.ts ./dist
@@ -55,11 +59,82 @@ const TEXT_EXTENSIONS = new Set([
55
59
  ]);
56
60
 
57
61
  interface Hit {
58
- kind: 'name' | 'content';
62
+ kind: 'name' | 'content' | 'strategy';
59
63
  path: string;
60
64
  patternId: string;
61
65
  }
62
66
 
67
+ // ---------- deploy-strategy nomenclature check (field report #343 F7) ----------
68
+ // A stop-build-start loop mislabeled "blue-green"/"zero-downtime" still drops the
69
+ // old process before the new one is live, producing a 502 window on every deploy.
70
+ // The comment lies; the mechanism doesn't. This flags scripts whose comments CLAIM
71
+ // blue-green / zero-downtime but where no atomic-swap mechanism is detectable —
72
+ // temp-build-then-rename, container/image swap, or load-balancer cutover.
73
+
74
+ // File shapes that can carry a deploy strategy worth checking.
75
const DEPLOY_SCRIPT_EXTENSIONS = new Set([
  '.sh', '.bash', '.zsh', '.yml', '.yaml', '.ps1',
]);
const DEPLOY_SCRIPT_BASENAMES = new Set([
  'Dockerfile', 'Procfile', 'Makefile',
]);

// Text that CLAIMS an atomic deploy strategy. Matched against the whole file,
// not only comment lines.
const STRATEGY_CLAIM_RE = /\b(blue[\s/_-]?green|zero[\s/_-]?downtime|hot[\s/_-]?swap|atomic\s+deploy(?:ment)?)\b/i;

// Any one of these signals a real atomic-swap mechanism is present.
const ATOMIC_SWAP_SIGNALS: { id: string; re: RegExp }[] = [
  // temp build dir then rename/symlink-swap into place (release-then-link pattern)
  { id: 'rename-swap', re: /\b(?:mv|rename|ln\s+-s(?:fn|nf|f)?)\b[^\n]*\b(?:current|live|release|active|prod(?:uction)?)\b/i },
  { id: 'symlink-current', re: /\bln\s+-s(?:fn|nf|f)?\b[^\n]*\bcurrent\b/i },
  // container / image swap: new container up, traffic moved, old removed
  { id: 'container-swap', re: /\bdocker\b[^\n]*\b(?:run|up|--scale|service\s+update)\b|\bdocker[\s-]compose\b[^\n]*\bup\b[^\n]*\b(?:--no-recreate|--scale)\b|\bcontainer[\s_-]?swap\b/i },
  { id: 'orchestrator-rollout', re: /\b(?:kubectl\s+rollout|helm\s+upgrade|nomad\s+job\s+run|ecs\b[^\n]*update-service)\b/i },
  // load-balancer / proxy cutover: register new target, then drain/deregister old
  { id: 'lb-cutover', re: /\b(?:register-targets|deregister-targets|modify-listener|switchover|traffic[\s_-]?shift|weighted[\s_-]?routing|upstream)\b/i },
  { id: 'proxy-reload', re: /\b(?:nginx\s+-s\s+reload|caddy\s+reload|envoy\b[^\n]*config|haproxy\b[^\n]*reload)\b/i },
];

// Sequences that betray a stop-build-start loop (kill old, then start new).
// Used only to strengthen the signal — a claim with NO atomic mechanism is
// already a hit; this just confirms the anti-pattern is actively present.
// The start-side verbs must not run on into more letters, so prose such as
// "uploading", "runtime" or "startup" does not count as a start command, while
// `start_app` / `run.sh` still do. `node` keeps its `\b` so `node_modules` is
// not mistaken for launching node.
const STOP_START_RE = /\b(?:kill|stop|down|terminate|systemctl\s+stop|pm2\s+stop|docker\s+stop|docker\s+rm)\b[\s\S]{0,400}?\b(?:(?:start|up|run|systemctl\s+start|pm2\s+start|npm\s+(?:run\s+)?start)(?![A-Za-z])|node\b)/i;

/**
 * Deploy-strategy nomenclature check for a single file (field report #343 F7).
 *
 * A file is examined only when it looks like a deploy script: a known script
 * extension, a known basename, or a basename containing
 * deploy/release/rollout/cutover. Files over 2,000,000 bytes and files that
 * cannot be stat'ed or read are skipped (returns `null`) instead of failing
 * the scan.
 *
 * @param fullPath - Path used to stat and read the file.
 * @param relPath - Path relative to the scan root; only its final segment
 *   (split on the platform separator) is used for the basename checks.
 * @returns `null` when the file is out of scope, makes no strategy claim, or
 *   shows at least one atomic-swap signal. Otherwise a pattern ID:
 *   `'strategy-mislabel-stop-start'` when a stop-then-start sequence is also
 *   present, else `'strategy-claim-no-atomic-swap'`.
 */
function scanStrategy(fullPath: string, relPath: string): string | null {
  const base = relPath.split(sep).pop() ?? '';
  const ext = extname(fullPath).toLowerCase();
  const looksLikeDeployScript =
    DEPLOY_SCRIPT_EXTENSIONS.has(ext) ||
    DEPLOY_SCRIPT_BASENAMES.has(base) ||
    /deploy|release|rollout|cutover/i.test(base);
  if (!looksLikeDeployScript) return null;

  let buf: string;
  try {
    // Size gate first so oversized artifacts are never loaded into memory.
    if (statSync(fullPath).size > 2_000_000) return null;
    buf = readFileSync(fullPath, 'utf8');
  } catch {
    // Unreadable or vanished file: this check cannot say anything about it.
    return null;
  }

  if (!STRATEGY_CLAIM_RE.test(buf)) return null; // no claim, nothing to verify
  const hasAtomicSwap = ATOMIC_SWAP_SIGNALS.some((s) => s.re.test(buf));
  if (hasAtomicSwap) return null; // claim is backed by a real mechanism

  // Claim present, no atomic-swap mechanism. Distinguish the worst case:
  // an actual stop-build-start loop wearing a blue-green label.
  return STOP_START_RE.test(buf)
    ? 'strategy-mislabel-stop-start'
    : 'strategy-claim-no-atomic-swap';
}
137
+
63
138
  function globToRegex(glob: string): RegExp {
64
139
  const escaped = glob
65
140
  .replace(/[.+^${}()|[\]\\]/g, '\\$&')
@@ -167,6 +242,14 @@ function main(): void {
167
242
  const contentHit = scanContent(fullPath);
168
243
  if (contentHit) {
169
244
  hits.push({ kind: 'content', path: relPath, patternId: contentHit });
245
+ continue; // a secret hit is already terminal; don't double-report this file
246
+ }
247
+
248
+ // Deploy-strategy nomenclature check (field report #343 F7): a script whose
249
+ // comments claim blue-green / zero-downtime but ships no atomic-swap mechanism.
250
+ const strategyHit = scanStrategy(fullPath, relPath);
251
+ if (strategyHit) {
252
+ hits.push({ kind: 'strategy', path: relPath, patternId: strategyHit });
170
253
  }
171
254
  }
172
255