npm - voidforge-build - Versions diffs - 23.11.4 → 23.12.1 - Mend

voidforge-build 23.11.4 → 23.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

package/dist/.claude/agents/batman-qa.md +1 -0
package/dist/.claude/agents/galadriel-frontend.md +2 -0
package/dist/.claude/agents/kusanagi-devops.md +4 -0
package/dist/.claude/agents/lucius-config.md +6 -0
package/dist/.claude/agents/samwise-accessibility.md +4 -0
package/dist/.claude/agents/silver-surfer-herald.md +13 -4
package/dist/.claude/commands/architect.md +9 -0
package/dist/.claude/commands/assemble.md +4 -1
package/dist/.claude/commands/assess.md +13 -1
package/dist/.claude/commands/audit-docs.md +106 -0
package/dist/.claude/commands/deploy.md +29 -1
package/dist/.claude/commands/engage.md +19 -1
package/dist/.claude/commands/gauntlet.md +23 -4
package/dist/.claude/commands/imagine.md +15 -0
package/dist/.claude/commands/sentinel.md +15 -0
package/dist/.claude/commands/ux.md +36 -0
package/dist/.claude/commands/void.md +1 -0
package/dist/CHANGELOG.md +65 -0
package/dist/CLAUDE.md +9 -0
package/dist/VERSION.md +3 -1
package/dist/docs/methods/AI_INTELLIGENCE.md +33 -0
package/dist/docs/methods/ASSEMBLER.md +31 -2
package/dist/docs/methods/BUILD_PROTOCOL.md +2 -0
package/dist/docs/methods/CAMPAIGN.md +46 -0
package/dist/docs/methods/DEVOPS_ENGINEER.md +194 -0
package/dist/docs/methods/DOC_AUDIT.md +92 -0
package/dist/docs/methods/FORGE_KEEPER.md +16 -5
package/dist/docs/methods/GAUNTLET.md +38 -0
package/dist/docs/methods/PRODUCT_DESIGN_FRONTEND.md +57 -0
package/dist/docs/methods/QA_ENGINEER.md +21 -0
package/dist/docs/methods/RELEASE_MANAGER.md +27 -0
package/dist/docs/methods/SECURITY_AUDITOR.md +12 -1
package/dist/docs/methods/SUB_AGENTS.md +54 -0
package/dist/docs/methods/SYSTEMS_ARCHITECT.md +13 -0
package/dist/docs/methods/TESTING.md +19 -0
package/dist/docs/patterns/README.md +3 -0
package/dist/docs/patterns/ai-eval.ts +63 -0
package/dist/docs/patterns/daemon-process.ts +90 -0
package/dist/docs/patterns/database-migration.ts +65 -0
package/dist/docs/patterns/deploy-preflight.ts +85 -2
package/dist/docs/patterns/design-tokens.ts +338 -0
package/dist/docs/patterns/error-message-categorization.tsx +376 -0
package/dist/wizard/lib/patterns/daemon-process.d.ts +2 -1
package/dist/wizard/lib/patterns/daemon-process.js +89 -1
package/package.json +2 -2

package/dist/docs/methods/RELEASE_MANAGER.md CHANGED Viewed

@@ -230,3 +230,30 @@ After pushing to remote, if the project runs on a persistent server (PM2, system
 2. **If stale:** Prompt: "Server is running an older version. Rebuild and restart? [Y/n]"
 3. **In blitz mode:** Auto-rebuild if a deploy script or PM2 ecosystem config exists.
 4. Pushing code to GitHub is NOT deploying it. The server must be rebuilt and restarted for changes to take effect. (Field report #104: 22 commits pushed but PM2 was still running v3.8.1 while code was v3.10.0.)
+## No Auto-Rotting Production-Status Footer (field report #342 F-4)
+Do NOT add a "Production binary still vX.Y — vA, B, C await operator deploy" footer to the `PROJECT_VERSION.md` template (or any per-version block). The pattern is seductive — it reads as a helpful reminder when written — but it rots silently: it is accurate only at the instant of the version it was written under, and the *next* version bump leaves it pointing at a stale "still on vX.Y" claim that nobody re-reads. By the third release it actively lies about what production is running.
+**Rule:** Production-deploy status lives in exactly two places, both of which a release bump already touches:
+1. **The single source of truth**, if the project keeps one — `docs/_truth.yml` (or equivalent machine-readable status file). One canonical `production_version:` field, not a prose footer.
+2. **The topmost "Current" block** of `PROJECT_VERSION.md` — the line Coulson already rewrites every bump (Step 5 changes `**Current:** X.Y.Z`). Deploy state, if tracked here at all, belongs adjacent to that line so it is impossible to bump the version without confronting it.
+A per-version footer fails because it is *additive* — each bump appends a new one and leaves the old ones in place, so the file accumulates N footers of which N−1 are false. The Current block and the truth file are *overwritten* each bump, so they cannot drift. Coulson rejects any release diff that introduces an "await operator deploy" or "Production binary still" footer; route that information to the Current block instead.
+## Regenerating Generated CLAUDE.md Stack Blocks (field report #342 F-2)
+When a generated `CLAUDE.md` (or any generated doc) embeds a project stack/inventory block — framework, language, test count, package versions — do NOT leave a promissory placeholder marker (`<!-- stack block: fill me in -->`, `[STACK_TBD]`, etc.) that depends on a human remembering to update it. Placeholder markers rot the same way the footer in F-4 does: they survive review, ship, and then read as authoritative once the brackets are forgotten.
+**Pattern:** If the project keeps a machine-readable truth source — `docs/_truth.yml`, `package.json`, a manifest — a regeneration helper rewrites a **clearly-delimited generated block** in place from that source, so the block is reproducible and drift is impossible (re-run the helper, diff, commit). Wrap the block in explicit sentinels so the rewrite is surgical and the hand-written prose around it is never clobbered:
+```
+<!-- BEGIN GENERATED: stack (do not edit by hand — run scripts/regen-claude-md.sh) -->
+- **Framework:** Next.js 15.4
+- **Language:** TypeScript 5.6 (strict)
+- **Tests:** 1209 passing
+<!-- END GENERATED: stack -->
+```
+A working `scripts/regen-claude-md.sh` may ship alongside this discipline (reading `docs/_truth.yml` / `package.json` and rewriting only the text between the sentinels, leaving everything else byte-identical). If that script is absent, this section documents the intended pattern: the *generated* block is derived, never authored by hand, and never a placeholder. On every MINOR/MAJOR bump Coulson regenerates the block (or flags it for regeneration) rather than trusting that someone updated the prose by hand.

package/dist/docs/methods/SECURITY_AUDITOR.md CHANGED Viewed

@@ -263,6 +263,17 @@ For any system that sends URLs to users (transactional emails, SMS, push notific
 This is the outbound mirror of SSRF prevention: SSRF stops external URLs from reaching internal services, outbound URL safety stops internal URLs from reaching external users. (Field report #44: verification email sent with `localhost:5005` URL — worked on same machine, broke from any other device.)
+### Enforcement-Layer Severity Rubric (field report #354 F2)
+Key a finding's severity to the **enforcement layer**, not the **symptom location**. The question that sets severity is not "where did I see the leak?" but **"where is this actually enforced?"** Before you assign P0/P1, trace the request to the layer that *decides* — the server-side authorization check, the database query scope, the policy engine — and confirm the gap exists *there*.
+- **Client-side affordance leak with intact server enforcement = UX-only (P2/P3), not a breach.** A hidden admin button that renders in the DOM, a disabled-but-present form field, an action the SPA shows but the API rejects with 403/404 — these are **render-then-403** patterns. The client showed something it shouldn't, but the actually-enforcing layer (the server) still says no. That is an information-disclosure or UX-polish finding, not a Critical. Rating a server-enforced client affordance leak as Critical is a false-positive that wastes a remediation slot and erodes trust in the report.
+- **A gap at the actually-enforcing layer = P0/P1.** If the server itself does not check ownership, the role gate is missing on the route, or the query has no `org_id` scope, the breach is real regardless of what the client renders. The symptom may surface in the UI, but the severity comes from the server hole.
+**Verification before scoring (always do this for any "exposed in the UI" finding):** reproduce the action against the API directly — `curl`/Postman with the victim's resource ID and the attacker's credentials, no browser. If the server returns 403/404/401 and writes nothing, the enforcing layer holds → downgrade to P2/P3 and note "server-enforced; client affordance leak only." If the server returns 200 + data or commits a write, the enforcing layer is breached → P0/P1. Never infer the server's behavior from the client's rendering.
+This is an explicit lens in **both** the audit (Phase 1/2: for every "this is visible/clickable" observation, ask "where is this actually enforced?" and probe that layer) and the re-verify pass (Phase 4: Maul must confirm a downgraded affordance-leak finding by hitting the API directly, not by re-checking the DOM). (#354 F2)
 ### Credentials Never in API Responses
 API responses must NEVER include credentials, tokens, or secrets — even in "admin-only" or "internal" endpoints. Grep for responses that include: `password`, `secret`, `token`, `api_key`, `private_key`, `credentials`. Common violations: user profile endpoints returning the password hash, API key management endpoints including the full key in GET responses (show only last 4 characters), internal debug endpoints returning environment variables. (Field report #66: API settings endpoint returned full MCP connection credentials in the response body.)
@@ -366,7 +377,7 @@ When fixing an auth, authorization, or validation check: trace ALL callers of th
 After remediations are applied:
-**Maul — Red Team Verification:** Re-probe all remediated vulnerabilities. Verify fixes hold under adversarial conditions. Check that fixes didn't introduce new attack vectors. Attempt to bypass the remediations.
+**Maul — Red Team Verification:** Re-probe all remediated vulnerabilities. Verify fixes hold under adversarial conditions. Check that fixes didn't introduce new attack vectors. Attempt to bypass the remediations. **Apply the enforcement-layer lens (#354 F2):** for any finding rated Critical/High off a UI-visible symptom, confirm severity by hitting the API directly — a finding that only reproduces in the DOM but returns 403/404 server-side is a server-enforced affordance leak (P2/P3), not the breach it was filed as. Re-score before sign-off.
 **Padmé — Functional Verification:** After Maul confirms security holds, Padmé verifies the primary user flow still works end-to-end. Open the app, complete the main task, verify output. This catches "secure but broken" regressions that pure security re-testing misses.

package/dist/docs/methods/SUB_AGENTS.md CHANGED Viewed

@@ -115,6 +115,10 @@ This powers the Danger Room's live agent ticker. The wizard server watches this
 This is **methodology-driven logging**, not hook-driven. Hooks cannot extract agent identity from tool input — the orchestrator must write the log entry explicitly. (Field report #128, architectural review)
+### Workflow-Tool Progress-Tree Labels
+When dispatching via the Workflow tool, set the agent **label** so the named character surfaces in the `/workflows` progress tree. Use the form `"<agent> · <key>"` (e.g., `"Picard · review:architecture"`, `"Kenobi · sentinel:auth"`, `"Galadriel · ux:a11y"`), or omit the label entirely so the underlying `agentType` surfaces on its own. If you instead pass only a dimension key like `review:architecture` as the label, that key OVERRIDES the agent identity and the tree shows the dimension instead of Picard/Kenobi/Galadriel — the roster becomes anonymous in the dashboard and the Danger Room ticker correlation breaks. Keep the character name as the leading token of every workflow label. (Field report #348 #2.)
 ## Delegation Template
 ```
@@ -330,8 +334,37 @@ This pattern applies to:
 - Galadriel's UX (Samwise + Radagast re-verify)
 - Kenobi's Security (Maul re-probes remediations)
+#### Verify the FIX, not just the finding
+The adversarial-verify step has two distinct jobs, and orchestrators routinely collapse them into one:
+1. **Re-probe the fixed AREA** — after a fix lands, confirm the original finding is gone and no neighboring regression appeared. This is the Pass 2 above.
+2. **Interrogate the fix DESIGN** — before or as the fix lands, challenge the *proposed remediation itself* for NEW failure modes it introduces: wedge (a state that can never be exited inside the available budget), unbounded retry, infinite loop, orphaned record, double-send. This is NOT the same as re-probing the area; it scrutinizes the design of the change, not its installed effect.
+Job 2 is **especially mandatory when the fix adds a coordination primitive** — a sentinel, a lock, a retry-state record, a fence, a dedup marker — **without a corresponding liveness signal** (a guaranteed path that releases the primitive, an upper bound on retries, a reclaim window that is actually reachable). A coordination primitive with no liveness signal is a wedge waiting to happen: it makes the original bug rarer but converts it into a stuck state that is harder to diagnose.
+Motivating incidents:
+- **M5 mint-fence** (field report #348 #1 / #350 #4): the fix added a mint fence so a draft couldn't be re-minted concurrently, with a reclaim window to recover abandoned fences. But the reclaim window was set *longer than the retry budget* — so every retry exhausted before the window opened, and the reclaim path was algebraically unreachable inside the retry budget. Drafts wedged permanently in `FAILED`. The fix's own coordination primitive (the fence) had no reachable liveness path.
+- **M6 lifecycle-sweep** (field report #348 #1 / #350 #4): the fix swept lifecycle records on a schedule but compared against a stale `send_at` snapshot captured before the sweep, so a record whose `send_at` had advanced got swept AND re-sent — a double-send introduced by the remediation, not present in the original bug.
+Both would have been caught by an adversarial pass that asked "what new failure mode does THIS fix create?" rather than only "is the old finding gone?" When a fix introduces a sentinel/lock/retry-state, the verify dispatch brief MUST name the wedge/loop/orphan/double-send checklist explicitly and require the agent to trace the liveness path.
 **Important distinction:** The Agent tool enables **parallel analysis**, not parallel coding. Sub-agents return text findings — the lead agent then implements code changes sequentially. This is still faster than sequential analysis, but don't expect parallel file edits.
+### The Default Review Shape: Find → Cluster/Dedupe → 3-Lens Verify → Fix Only Survivors
+Every review command — `/engage`, `/sentinel`, `/gauntlet` — runs the same four-stage shape, not a flat "list findings then fix everything" pass. v23.12.0 added the refute-pass mechanics to `/gauntlet`; this is the generalized naming so the same discipline is the DEFAULT everywhere, not Gauntlet-only (field report #354 F1).
+1. **Find** — fan out the roster; each lens produces raw findings against the same diff (see Intentionally Overlapping Mandates).
+2. **Cluster/Dedupe** — collapse the raw findings into distinct claims. The same root cause flagged by Stark + Kenobi + Ahsoka is ONE claim with three votes, not three findings. LLM-assigned finding ids are display labels, not keys — dedupe on the claim, not the id.
+3. **3-Lens Verify** — every surviving claim is interrogated through three lenses before it earns a fix:
+   - **Correctness** — is the asserted behavior actually wrong? (the bug is real, the logic is genuinely broken)
+   - **Reachability** — can production actually hit this path? (not provably-dead-code, not behind a `DEV_ONLY` gate — see the WARN/cosmetic contract above)
+   - **Refutation** — assign a **skeptic agent whose explicit job is to try to REFUTE the finding, casting a CONFIRM vote only if that attempt fails.** The skeptic is told to argue the finding is wrong/unreachable/already-handled and to vote CONFIRM only if it cannot. A claim that survives a reviewer instructed to kill it is a real claim. This is the defining element of the shape: not "does another lens agree?" but "does a lens TRYING to disprove it fail to?" A finding nobody was assigned to refute is unverified, regardless of how many agents independently raised it.
+4. **Fix Only Survivors** — only claims that pass all three lenses (correct AND reachable AND still standing, with a CONFIRM vote, after the skeptic's refutation attempt) enter the fix batch. Refuted claims are logged with the refutation rationale and dropped — never silently, so a future review doesn't re-raise them.
+The refutation lens is what separates this from the Intentionally Overlapping Mandates convergence rule: convergence asks independent agents to agree; refutation assigns one agent to disagree on purpose. Run both — convergence raises confidence on what's flagged, refutation removes false positives from the fix batch. (Field report #354 F1.)
 ### Multi-Session Parallelism (Separate Terminals)
 For larger projects where agents need to make code changes simultaneously, use separate Claude Code sessions in different terminal windows. Each session works on separate files within defined scope boundaries.
@@ -372,6 +405,18 @@ Proven in production: a full `/assemble --muster` (11 phases, 15+ agents) ran en
 | Track status, report to user | Do work an agent could do |
 | Git operations (commit, push) | Launch agent-to-agent dispatch |
+### Default to Fixing, Not to Asking Which to Fix
+When a review surfaces a clear list of fixable findings, the orchestrator's DEFAULT is to apply them in batches — not to surface a multi-option "which subset should I fix?" picker and wait. A list of well-scoped findings with obvious remediations is a work queue, not a decision fork. Presenting it back to the user as a menu of options offloads triage the orchestrator was dispatched to do, and stalls a batch that could already be landing.
+Apply the findings in batches (partition by domain/concern per the Concurrency Rules), verify after each batch, and report what was fixed. Only stop to ask when a choice is **genuinely architectural or irreversible** — e.g., two incompatible schema directions, a data migration that can't be rolled back, a dependency that changes the deploy target, or a trade-off the PRD is silent on (then follow Multi-agent conflict resolution). "Which of these 9 lint/logic findings should I fix?" is not such a choice; "should this be event-sourced or CRUD?" is. (Field report #343 F5.)
+### Use AskUserQuestion at Genuine Forks
+The flip side of the anti-picker rule: when the orchestrator hits a **genuine creative or scope fork** — 2-3 mutually-exclusive directions, none obviously dominant, where guessing wrong means rework — present them with `AskUserQuestion` and an option preview for each, rather than silently picking one or surfacing a single take-it-or-leave-it option. Give each option a short label and a one-line preview of what it commits to (the trade-off, the consequence, what it forecloses), so the user can decide in one read instead of an interview.
+Use it for: which of two layouts/IA directions, which scope to ship first when both are valid, an irreversible architectural split, a naming/contract convention that downstream agents will all inherit. Do NOT use it as a substitute for triage you should be doing yourself (see the anti-picker rule above), and do NOT pad it past 3 options — a fork with 6 options usually means the scope wasn't analyzed enough to narrow it. One option presented as a question ("shall I do X?") is also an anti-pattern: either it's the obvious default (just do it) or there's a real alternative (show both). (Field report #351 #5.)
 ### Standard Agent Brief
 Every agent launch MUST include a structured brief:
@@ -425,6 +470,15 @@ Field report #324 (Union Station v7.8 R2): three agents (Discovery + Stark + Ken
 - **Wait for ALL parallel agents before synthesizing** (field report #300).
 - Partition strategies: by domain (frontend/backend), by concern (security/UX), or read-only vs. write.
+### Directory / Migration Fan-Out: Glob the List, Sweep the Remainder
+When a wave fans out per-file or per-entity work across a directory or migration (one agent per file/module/route/migration), two rules are MANDATORY (field report #355 F2):
+1. **Derive the per-agent file list from a GLOB, never a hand-typed list.** Run `ls`/`Glob` (e.g., `Glob "src/routes/**/*.ts"`, `git ls-files 'migrations/*.sql'`) and partition the GLOB output into agent assignments. A hand-typed list silently drops the files the orchestrator forgot existed — and those are exactly the ones with the unmigrated legacy pattern, because they weren't top-of-mind. The glob is the source of truth for "what's in scope," not the orchestrator's memory of the tree.
+2. **Pair every fan-out with a post-fan-out completeness sweep before the wave is "done."** After all fan-out agents return, run ONE grep for the legacy pattern across the WHOLE target tree (not just the assigned files) — e.g., `grep -rn "oldApiCall(" src/` or `grep -rln "TODO: migrate" .`. A wave is not complete while that grep returns hits. The sweep catches: files the glob/partition missed, files created mid-wave by a parallel agent, and occurrences an agent declared done but left behind. Zero hits is the completion gate, not "all dispatched agents reported done."
+The failure mode this prevents: a fan-out reports "9/9 agents complete" while 3 files still carry the legacy pattern — because they were never in the hand-typed list, and nobody grepped the whole tree to confirm. "All my agents finished" is not "the migration is complete." The completeness sweep is the difference. (Field report #355 F2.)
 ### Context Passing Between Phases
 - Pass **findings summaries** between phases, not raw file contents

package/dist/docs/methods/SYSTEMS_ARCHITECT.md CHANGED Viewed

@@ -211,6 +211,19 @@ When reviewing architecture, identify all endpoints/services that mutate the sam
 When architecture requires accepting a known security risk (e.g., iframe sandbox weakening for UX, storing tokens in memory for operational continuity), document it as an ADR with explicit risk acceptance. Include: the tradeoff made, what is gained, what attack surface is expanded, what mitigations are in place, and who accepted the risk. This prevents the same finding from appearing in every future audit and reduces Gauntlet noise. (Field report #102: preview iframe `allow-scripts + allow-same-origin` sandbox escape was a known tradeoff but was never documented — flagged in every security pass.)
+### Fix-Direction Reconciliation Against Doctrine
+For any access, permission, or contract fix, "verified" is not sufficient to make the fix actionable. A finding can be reproduced, root-caused, and confirmed by multiple agents and *still* carry a backwards fix — one that widens a permission, grants access to the wrong principal, or relaxes a contract the doctrine intends to tighten. Reproduction proves the behavior; it does not prove the fix moves in the correct direction. (Field report #349 F-2)
+Before any such fix is accepted, the architect MUST do two things explicitly:
+1. **Name the governing SSOT.** Identify the single source of truth that governs the access/permission/contract being changed — the permission matrix, the relevant ADR, or the published API contract. If no SSOT exists for the boundary being touched, that absence is itself a finding: the fix is unanchored and must wait until the doctrine is written.
+2. **Reconcile the fix DIRECTION against that SSOT.** State, in the fix record, whether the change *loosens* or *tightens* the boundary, and *who gains or loses access* as a result. Then compare that direction to what the named SSOT prescribes. If the fix loosens a permission the matrix says should be tightened (or grants a role access the ADR reserves for another), the fix is backwards — reject it and re-derive the correct change from doctrine, regardless of how well-verified the underlying finding is.
+The reconciliation belongs in the same record as the finding: *"SSOT: <permission-matrix row / ADR-NNN / contract endpoint>. Direction: <loosen|tighten>; <principal> gains/loses <access>. Doctrine prescribes: <tighten|loosen>. Reconciled: <match|MISMATCH — fix is backwards>."* A MISMATCH blocks the fix.
+This mirrors the engage.md Step 2 requirement that access/permission findings name their governing SSOT and reconcile fix direction before synthesis — Picard applies the same gate at the architecture layer so a backwards fix never reaches an ADR or an implementer. (Field report #349 F-2)
 ### Strategy Consolidation Check
 When a system implements N parallel strategies for the same goal (payment providers, notification channels, API versions, deployment targets, content pipelines), periodically verify that each strategy still justifies its maintenance cost. If usage data shows one strategy handling 95%+ of traffic or value while the others sit idle or near-zero, the idle strategies are not "options" — they are dead code with maintenance burden.

package/dist/docs/methods/TESTING.md CHANGED Viewed

@@ -356,6 +356,25 @@ Do NOT create custom DDL in test files — it drifts from the real schema (missi
 Custom DDL causes test DB schema mismatches that require 2-3 fix-and-retry cycles per occurrence. (Field report #31)
+### Failure Attribution in Shared-State Suites
+When a test fails in a suite that shares mutable state across files (a shared test DB, module-level singletons, a global fixture, an ordering-sensitive runner), do NOT attribute a multi-file failure to your change until you have reproduced it in isolation. Shared state means a failure can surface in file B while the root cause lives in file A — or in test ordering itself, not in your edit at all. (Field report #349 F-3)
+**Procedure:**
+1. **Isolate the failing file.** Run only the failing test file (or the single test), so cross-file state pollution can't contribute. Use the framework's isolation/single-worker flag so the runner doesn't parallelize or randomize:
+   | Framework | Isolate single-worker / no parallelism | Disable random ordering |
+   |-----------|----------------------------------------|-------------------------|
+   | vitest | `vitest run --no-threads <file>` (or `--pool=forks --poolOptions.forks.singleFork`) | `--sequence.shuffle=false` |
+   | jest | `jest --runInBand <file>` | Default order is deterministic — just do not pass `--randomize` (`--testSequencer` is only needed for a custom order) |
+   | pytest | `pytest <file>::<test>` | `pytest -p no:randomly` (disable pytest-randomly) |
+2. **Compare against clean HEAD.** Stash your change (`git stash`) and re-run the same isolated command on a clean tree. If it still fails on clean HEAD, the failure is pre-existing — not yours. Restore with `git stash pop` afterward.
+3. **Only after isolation + clean-HEAD comparison** attribute the failure to your change, and fix the actual cause rather than the symptom.
+The canonical rule lives in `/docs/methods/QA_ENGINEER.md` (Failure Attribution) — see it for the full decision tree. This section is its testing-runner-flag companion.
 ## Setup Checklist
 When setting up testing for a new project:

package/dist/docs/patterns/README.md CHANGED Viewed

@@ -34,6 +34,9 @@ Reference implementations for common code structures. These show the **shape and
 | Data Pipeline | `data-pipeline.ts` | ETL with checkpoint/resume, quality checks, idempotent processing | Node.js streams, Python polars, SQL/dbt |
 | Backtest Engine | `backtest-engine.ts` | Walk-forward validation, no-lookahead, Sharpe/drawdown metrics | Python vectorbt/backtrader |
 | Execution Safety | `execution-safety.ts` | Order validation, position limits, exchange precision, paper/live toggle | CCXT, Alpaca, IBKR |
+| Design Tokens | `design-tokens.ts` | Semantic color/type tokens so theme pivots are a token change (field report #351) | CSS vars + Tailwind + React |
+| Nginx Vhost | `nginx-vhost.conf` | Cloudflare-Flexible-safe vhost: security headers, ACME passthrough (field report #351) | Nginx |
+| Error Message Categorization | `error-message-categorization.tsx` | Categorize errors at the UI boundary before showing copy (field report #351) | React (framework-agnostic notes) |
 ## How to Use

package/dist/docs/patterns/ai-eval.ts CHANGED Viewed

@@ -337,6 +337,69 @@ export const CLAUDE_PROMPT_EVAL_CATEGORIES = {
  *   7. cost per case within 20% of baseline
  */
+// --- Live eval layer: the pre-launch gate (field report #352, #4) ---
+/**
+ * THE LIVE EVAL LAYER IS THE PRE-LAUNCH GATE.
+ *
+ * Deterministic and sandbox-adapter evals (fixed inputs, fake-data runners)
+ * verify your *plumbing* — scoring functions, tag breakdowns, comparison
+ * thresholds. They CANNOT catch model-output-shape bugs, because the runner
+ * never calls a real model. The shape of what a live model actually emits —
+ * extra prose, null fields, reordered keys, casing drift — only appears when
+ * you run against the real provider.
+ *
+ * Field report #352: a classifier passed every sandbox eval (the fake runner
+ * returned hand-written JSON), then crashed in production on launch day
+ * because the live model emitted `null` for an absent optional field and the
+ * Zod `.optional()` parse rejected it. The deterministic layer was green the
+ * whole time. The bug was structurally invisible to it.
+ *
+ * Rule: before any launch, run AT LEAST ONE eval pass with a LIVE model
+ * runner (real provider call), not just the sandbox runner. Treat the live
+ * pass as a release gate — a deterministic-only green is necessary but never
+ * sufficient. Wire it as the final, non-skippable category in CI.
+ *
+ *   // Sandbox pass — fast, free, catches plumbing regressions:
+ *   await suite.run(sandboxRunner, version, 'sandbox')
+ *
+ *   // Live pass — the actual gate, catches output-shape bugs:
+ *   await suite.run(liveModelRunner, version, 'claude-sonnet-4-20250514')
+ */
+/**
+ * GOTCHA: live models emit `null` for absent optionals — Zod `.optional()`
+ * accepts `undefined`, NOT `null` (field report #352, #4).
+ *
+ * `z.string().optional()` is `string | undefined`. A live model serializing
+ * "this field is absent" almost always emits JSON `null`, which deserializes
+ * to JS `null` — and `null` fails `.optional()`. The fix is to normalize
+ * null-to-undefined BEFORE Zod validation (do NOT reach for `.nullable()`
+ * everywhere — that leaks `null` into downstream types and just moves the
+ * problem). Normalize at the boundary, validate clean shapes inside.
+ *
+ *   const Schema = z.object({ label: z.string(), reason: z.string().optional() })
+ *   const raw = JSON.parse(modelOutput)            // { label: 'billing', reason: null }
+ *   const parsed = Schema.parse(normalizeNullsToUndefined(raw)) // ✓ reason -> undefined
+ */
+export function normalizeNullsToUndefined<T>(value: T): T { // recursive; the return is asserted as T even though nulls become undefined
+  if (value === null) return undefined as T // a bare null maps to undefined
+  if (Array.isArray(value)) {
+    return value.map((item) => normalizeNullsToUndefined(item)) as unknown as T // arrays keep their length: null elements become undefined, they are not removed
+  }
+  if (value && typeof value === 'object') { // any other non-null object is rebuilt as a fresh plain object
+    const out: Record<string, unknown> = {}
+    for (const [key, val] of Object.entries(value as Record<string, unknown>)) { // own enumerable string keys only
+      const normalized = normalizeNullsToUndefined(val)
+      // Drop keys whose value normalized to undefined so Zod `.optional()`
+      // treats them as truly absent rather than present-with-undefined.
+      if (normalized !== undefined) out[key] = normalized // keys that were already undefined in the input are dropped too
+    }
+    return out as T
+  }
+  return value // primitives, functions and undefined pass through unchanged
+}
 /**
  * Framework adaptations:
  *

package/dist/docs/patterns/daemon-process.ts CHANGED Viewed

@@ -325,6 +325,95 @@ function createLogger(logPath: string): { log: (msg: string) => void; close: ()
   };
 }
+// ── .env Parsing (literal, $-safe) ────────────────────
+// field report #344 F1: never source secrets via `export $(cat .env)` /
+// `eval "$(cat .env)"`. The shell performs variable expansion and word
+// splitting on the RHS, so a `$`-bearing secret — bcrypt hashes
+// ($2b$...), JWTs, Postgres URLs with `$` in the password, anything with
+// `$VAR`/`${...}`/backticks — gets mangled or silently truncated. Parse
+// literally instead: read each line, split on the FIRST `=` only, and keep
+// the value byte-for-byte (no expansion, no eval). For shells, the
+// equivalent is `while IFS='=' read -r k v; do export "$k=$v"; done < .env`
+// — note IFS='=' and `read -r` (raw, no backslash processing), which never
+// re-expands the value.
+//
+// Prefer a runtime-native loader where available — it sidesteps the shell
+// entirely:
+//   - Node 20.6+: `node --env-file=.env daemon.js` (literal parse, no shell).
+//   - systemd:    `EnvironmentFile=/etc/voidforge/heartbeat.env` (also literal;
+//                 unit-file `Environment=` lines do NOT undergo shell expansion).
+// Use this helper only when you must parse `.env` in-process.
+/**
+ * Parse `.env` text literally into a key/value map — no `$` expansion, no eval.
+ *
+ * Rules, per line:
+ *  - blank lines and lines starting with `#` are skipped;
+ *  - an optional leading `export` keyword (followed by any whitespace) is ignored;
+ *  - the line is split on the FIRST `=` only; lines without `=` or with a key
+ *    that is not a valid env name are skipped;
+ *  - a value wrapped in one matching pair of `'` or `"` has that pair removed
+ *    and its interior kept byte-for-byte; whitespace outside the quotes is
+ *    ignored, so `KEY="v"   ` yields `v`;
+ *  - an unquoted value only has trailing whitespace trimmed;
+ *  - a later duplicate key overwrites an earlier one.
+ *
+ * @param contents - raw text of the `.env` file (LF or CRLF line endings).
+ * @returns map of variable name to literal value.
+ */
+function parseDotenv(contents: string): Record<string, string> {
+  const out: Record<string, string> = {};
+  for (const rawLine of contents.split('\n')) {
+    const line = rawLine.replace(/\r$/, '');
+    // Skip blanks and comments. A leading `export` prefix is tolerated.
+    const trimmed = line.trimStart();
+    if (trimmed === '' || trimmed.startsWith('#')) continue;
+    // `export` may be followed by a tab or several spaces, not just one space.
+    const body = trimmed.replace(/^export\s+/, '');
+    // Split on the FIRST `=` only — values may legitimately contain `=`.
+    const eq = body.indexOf('=');
+    if (eq < 0) continue; // not a KEY=VALUE line — ignore, don't guess
+    const key = body.slice(0, eq).trim();
+    if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(key)) continue; // invalid env name
+    const rawValue = body.slice(eq + 1);
+    // Detect quoting on the whitespace-trimmed value: stray spaces after the
+    // closing quote must not leave the quotes in the result.
+    const candidate = rawValue.trim();
+    const quote = candidate[0];
+    if (candidate.length >= 2 &&
+        (quote === '"' || quote === "'") &&
+        candidate.endsWith(quote)) {
+      // Inside quotes the value is taken LITERALLY — no `$` expansion, no
+      // eval — which is the whole point: `PASS='p@$$w0rd'` keeps its `$$`.
+      out[key] = candidate.slice(1, -1);
+    } else {
+      // Unquoted: trim trailing inline whitespace only (POSIX-ish), never
+      // touch interior `$` characters.
+      out[key] = rawValue.trimEnd();
+    }
+  }
+  return out;
+}
+// ── systemd hardening stanza (Node daemons) ───────────
+// field report #344 F3: when running this daemon under systemd, harden the
+// unit — but DO NOT set `MemoryDenyWriteExecute=true` for a Node/V8 process.
+// V8's JIT allocates pages that are written and then executed (it manages its
+// own W^X internally); MDWE forbids any write+exec mapping, so the daemon
+// takes a SIGTRAP and dies at boot, usually before it logs a single line. The
+// safe, high-value sandbox flags below give most of MDWE's benefit without the
+// JIT collision:
+//
+//   [Unit]
+//   Description=VoidForge Heartbeat daemon
+//   After=network-online.target
+//   Wants=network-online.target
+//
+//   [Service]
+//   Type=simple
+//   ExecStart=/usr/bin/node /opt/voidforge/daemon.js
+//   # literal parse — see #344 F1
+//   EnvironmentFile=/etc/voidforge/heartbeat.env
+//   Restart=on-failure
+//   RestartSec=5
+//
+//   # Hardening — keep these. NOTE: systemd has no trailing inline comments;
+//   # a `#` after a value becomes part of the value (the boolean then fails to
+//   # parse and the flag is ignored), so every note sits on its own line.
+//   # no setuid/setgid privilege escalation
+//   NoNewPrivileges=true
+//   # /usr, /boot, /etc mounted read-only
+//   ProtectSystem=full
+//   # /home, /root, /run/user hidden
+//   ProtectHome=true
+//   # private /tmp + /var/tmp namespace
+//   PrivateTmp=true
+//   # MemoryDenyWriteExecute=true is OMITTED ON PURPOSE: it breaks the V8 JIT
+//   # (SIGTRAP at boot). Re-enable ONLY for Go/Rust/static daemons with no JIT.
+//
+//   [Install]
+//   WantedBy=multi-user.target
+//
+// Go, Rust, and other AOT-compiled daemons emit no executable pages at
+// runtime, so for THEM you can and should keep `MemoryDenyWriteExecute=true`.
+// The omission above is V8-specific, not a general weakening.
 export {
   writePidFile, checkStalePid, removePidFile,
   generateSessionToken, validateToken,
@@ -333,6 +422,7 @@ export {
   setupSignalHandlers,
   JobScheduler,
   createLogger,
+  parseDotenv,
   PID_FILE, SOCKET_PATH, TOKEN_FILE, STATE_FILE, LOG_FILE,
 };
 export type { DaemonState, HeartbeatState, ScheduledJob };

package/dist/docs/patterns/database-migration.ts CHANGED Viewed

@@ -245,6 +245,71 @@ const dropLegacyAvatarUrl: MigrationStep = {
   },
 };
+// ── Boot-Time Schema Re-Application & Table Ownership (#354 F4) ──
+/**
+ * GUARD: idempotent boot-time schema re-application must account for table
+ * OWNERSHIP and role grants — not just IF NOT EXISTS / IF EXISTS guards (#354 F4).
+ *
+ * The trap: a service that re-applies its schema at startup (boot-time DDL,
+ * "ensure schema" on connect) often connects as an app role distinct from the
+ * role that originally created the tables. In PostgreSQL, ALTER TABLE / DROP /
+ * CREATE INDEX / ADD COLUMN require the table OWNER (or a superuser) — not just
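+ * INSERT/UPDATE/SELECT privileges. So even a fully idempotent
+ * `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` / `CREATE INDEX IF NOT EXISTS`
+ * will FAIL at boot with "must be owner of table X" when the table is owned by
+ * a different DB role (e.g. a migration/admin role) than the connecting app role.
+ * (`CREATE TABLE IF NOT EXISTS` on an already-existing table is the exception:
+ * it needs only CREATE on the schema and skips with a NOTICE — the statements
+ * that follow it are the ones that hit the owner check.)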
+ * The IF [NOT] EXISTS guard does not save you here — ownership is checked before
+ * the existence short-circuit on ALTER, and CREATE INDEX has no existence
+ * short-circuit at all for the ownership check.
+ *
+ * Why it bites at boot specifically: idempotent re-application is meant to be a
+ * safe no-op on an already-migrated DB. But the ownership check fires regardless
+ * of whether the change is a no-op, so a healthy, already-correct schema can
+ * still crash the service on startup.
+ *
+ * Mitigations (pick per your trust model):
+ *  - Run boot-time/idempotent DDL as the table OWNER role, not the app role.
+ *    Keep schema changes on a privileged migration role; let the app role do DML only.
+ *  - OR align ownership: `ALTER TABLE <t> OWNER TO <app_role>` once (by a superuser),
+ *    or create tables under the app role from the start.
+ *  - OR use a shared owning role and `GRANT <owner_role> TO <app_role>` so the
+ *    app role can act as owner for DDL.
+ *  - Prefer a one-shot migration step (runMigrations) over boot-time re-application
+ *    for anything beyond table/index existence — it isolates the privileged role.
+ *
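+ * Pre-flight check before re-applying schema at boot — the goal is to fail fast
+ * with a clear message instead of a raw "must be owner" deep in startup. The
+ * reference implementation below only looks the table up and logs the check; a
+ * real adapter must read the owner from the returned row, compare it with
+ * `expectedRole`, and throw on mismatch.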
+ */
+// Reference sketch of the ownership pre-flight: looks `table` up in pg_tables
+// and emits one `migration.ownership_check` log event. `expectedRole` is only
+// logged here — this sketch performs no comparison and never throws by itself.
+async function assertTableOwnership(
+  ctx: MigrationContext,
+  table: string,
+  expectedRole: string
+): Promise<void> {
+  // pg_tables exposes the owning role of each table as `tableowner`.
+  const lookup = await ctx.execute(
+    `SELECT tableowner FROM pg_tables WHERE tablename = $1`,
+    [table]
+  );
+  const tableExists = lookup.rowCount !== 0;
+  if (tableExists) {
+    // A real adapter would read the owner from the returned row at this point;
+    // when it differs from the expected role, boot-time ALTER / CREATE INDEX on
+    // this table fails with "must be owner of table" (#354 F4).
+    ctx.log('migration.ownership_check', {
+      table,
+      present: true,
+      expectedRole,
+      note: 'boot-time DDL requires table owner or a superuser; app-role DML privileges are not enough',
+    });
+  } else {
+    // Table absent: CREATE TABLE IF NOT EXISTS will create it under the
+    // connecting role, so there is no ownership mismatch to report yet.
+    ctx.log('migration.ownership_check', { table, present: false });
+  }
+}
 // ── Batched Processing for Large Tables ─────────────────
 /**

package/dist/docs/patterns/deploy-preflight.ts CHANGED Viewed

@@ -4,13 +4,17 @@
  * Reference implementation for .claude/commands/deploy.md Step 2.5.
  * Scans the deploy artifact directory BEFORE upload. Exits non-zero on any hit.
  *
- * Evidence: field reports #305 (32-day credential leak), #303 (methodology exposure).
+ * Evidence: field reports #305 (32-day credential leak), #303 (methodology exposure),
+ * #343 F7 (stop-build-start loop mislabeled "blue-green" → 502 window every deploy).
  *
  * Key principles:
  * - Scan the deploy payload directory, NOT the repo root.
  * - Never auto-filter — a hit means the operator must investigate.
  * - Never print secret content; only paths + pattern IDs.
  * - Allowlist escape hatch via DEPLOY_PREFLIGHT_ALLOW (comma-separated globs).
+ * - Deploy-strategy claims must be backed by a real mechanism: a comment that
+ *   says "blue-green"/"zero-downtime" without an atomic swap (rename, container
+ *   swap, or LB cutover) is a lie that ships a 502 window (#343 F7).
  *
  * Usage:
  *   npx tsx docs/patterns/deploy-preflight.ts ./dist
@@ -55,11 +59,82 @@ const TEXT_EXTENSIONS = new Set([
 ]);
 interface Hit {
-  kind: 'name' | 'content';
+  kind: 'name' | 'content' | 'strategy';
   path: string;
   patternId: string;
 }
+// ---------- deploy-strategy nomenclature check (field report #343 F7) ----------
+// A stop-build-start loop mislabeled "blue-green"/"zero-downtime" still drops the
+// old process before the new one is live, producing a 502 window on every deploy.
+// The comment lies; the mechanism doesn't. This flags scripts whose comments CLAIM
+// blue-green / zero-downtime but where no atomic-swap mechanism is detectable —
+// temp-build-then-rename, container/image swap, or load-balancer cutover.
+// File shapes that can carry a deploy strategy worth checking.
+// Lower-cased file extensions treated as potential deploy scripts.
+const DEPLOY_SCRIPT_EXTENSIONS = new Set([
+  '.sh', '.bash', '.zsh', '.yml', '.yaml', '.ps1',
+]);
+// Exact (case-sensitive) basenames treated as potential deploy scripts.
+const DEPLOY_SCRIPT_BASENAMES = new Set([
+  'Dockerfile', 'Procfile', 'Makefile',
+]);
+// Comments that CLAIM an atomic deploy strategy.
+const STRATEGY_CLAIM_RE = /\b(blue[\s/_-]?green|zero[\s/_-]?downtime|hot[\s/_-]?swap|atomic\s+deploy(?:ment)?)\b/i;
+// Any one of these signals a real atomic-swap mechanism is present.
+// Regex notes:
+//  - `ln` accepts any short-flag cluster containing `s` (`-s`, `-sfn`, `-snf`,
+//    `-sfT`, `-sn`, ...); a fixed list of suffixes missed common spellings.
+//  - `\b` cannot sit directly in front of `--flag`: after whitespace both
+//    neighbours are non-word characters, so the boundary never matches. Flag
+//    alternatives are therefore anchored on their trailing side only.
+const ATOMIC_SWAP_SIGNALS: { id: string; re: RegExp }[] = [
+  // temp build dir then rename/symlink-swap into place (release-then-link pattern)
+  { id: 'rename-swap', re: /\b(?:mv|rename|ln\s+-[A-Za-z]*s[A-Za-z]*)\b[^\n]*\b(?:current|live|release|active|prod(?:uction)?)\b/i },
+  { id: 'symlink-current', re: /\bln\s+-[A-Za-z]*s[A-Za-z]*\b[^\n]*\bcurrent\b/i },
+  // container / image swap: new container up, traffic moved, old removed
+  { id: 'container-swap', re: /\bdocker\b[^\n]*(?:\b(?:run|up|service\s+update)\b|--scale\b)|\bdocker[\s-]compose\b[^\n]*\bup\b[^\n]*(?:--no-recreate|--scale)\b|\bcontainer[\s_-]?swap\b/i },
+  { id: 'orchestrator-rollout', re: /\b(?:kubectl\s+rollout|helm\s+upgrade|nomad\s+job\s+run|ecs\b[^\n]*update-service)\b/i },
+  // load-balancer / proxy cutover: register new target, then drain/deregister old
+  { id: 'lb-cutover', re: /\b(?:register-targets|deregister-targets|modify-listener|switchover|traffic[\s_-]?shift|weighted[\s_-]?routing|upstream)\b/i },
+  { id: 'proxy-reload', re: /\b(?:nginx\s+-s\s+reload|caddy\s+reload|envoy\b[^\n]*config|haproxy\b[^\n]*reload)\b/i },
+];
+// Sequences that betray a stop-build-start loop (kill old, then start new).
+// Used only to strengthen the signal — a claim with NO atomic mechanism is
+// already a hit; this just confirms the anti-pattern is actively present.
+const STOP_START_RE = /\b(?:kill|stop|down|terminate|systemctl\s+stop|pm2\s+stop|docker\s+stop|docker\s+rm)\b[\s\S]{0,400}?\b(?:start|up|run|systemctl\s+start|pm2\s+start|npm\s+(?:run\s+)?start|node\b)/i;
+/**
+ * Check one file for a deploy-strategy claim that no mechanism backs up.
+ *
+ * A file is inspected only when its extension, its exact basename, or a
+ * deploy-ish word in its basename marks it as a candidate deploy script.
+ *
+ * @param fullPath - path used to stat/read the file and to derive the extension.
+ * @param relPath - payload-relative path; its last segment is the basename.
+ * @returns `null` when the file is not a candidate, cannot be stat'ed or read,
+ *   is larger than 2,000,000 bytes, makes no strategy claim, or matches at
+ *   least one atomic-swap signal. Otherwise `'strategy-mislabel-stop-start'`
+ *   when a stop-then-start sequence is found, else
+ *   `'strategy-claim-no-atomic-swap'`.
+ */
+function scanStrategy(fullPath: string, relPath: string): string | null {
+  const fileName = relPath.split(sep).pop() ?? '';
+  const extension = extname(fullPath).toLowerCase();
+  const isCandidate = [
+    DEPLOY_SCRIPT_EXTENSIONS.has(extension),
+    DEPLOY_SCRIPT_BASENAMES.has(fileName),
+    /deploy|release|rollout|cutover/i.test(fileName),
+  ].some(Boolean);
+  if (!isCandidate) return null;
+
+  // Any filesystem failure (stat or read) means "nothing to report".
+  let text: string;
+  try {
+    if (statSync(fullPath).size > 2_000_000) return null;
+    text = readFileSync(fullPath, 'utf8');
+  } catch {
+    return null;
+  }
+
+  // No claim, nothing to verify.
+  if (!STRATEGY_CLAIM_RE.test(text)) return null;
+
+  // One matching mechanism is enough to consider the claim backed.
+  for (const signal of ATOMIC_SWAP_SIGNALS) {
+    if (signal.re.test(text)) return null;
+  }
+
+  // Claim present, no atomic-swap mechanism. Single out the worst case: an
+  // actual stop-build-start loop wearing a blue-green label.
+  if (STOP_START_RE.test(text)) return 'strategy-mislabel-stop-start';
+  return 'strategy-claim-no-atomic-swap';
+}
 function globToRegex(glob: string): RegExp {
   const escaped = glob
     .replace(/[.+^${}()|[\]\\]/g, '\\$&')
@@ -167,6 +242,14 @@ function main(): void {
     const contentHit = scanContent(fullPath);
     if (contentHit) {
       hits.push({ kind: 'content', path: relPath, patternId: contentHit });
+      continue; // a secret hit is already terminal; don't double-report this file
+    }
+    // Deploy-strategy nomenclature check (field report #343 F7): a script whose
+    // comments claim blue-green / zero-downtime but ships no atomic-swap mechanism.
+    const strategyHit = scanStrategy(fullPath, relPath);
+    if (strategyHit) {
+      hits.push({ kind: 'strategy', path: relPath, patternId: strategyHit });
     }
   }