npm - @nathapp/nax - Versions diffs - 0.18.1 → 0.18.3 - Mend

@nathapp/nax 0.18.1 → 0.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

package/.gitlab-ci.yml +12 -6
package/bun.lock +1 -1
package/bunfig.toml +2 -1
package/docker-compose.test.yml +17 -0
package/docs/ROADMAP.md +121 -36
package/docs/specs/verification-architecture-v2.md +343 -0
package/nax/config.json +13 -10
package/nax/features/smart-test-runner/plan.md +7 -0
package/nax/features/smart-test-runner/prd.json +203 -0
package/nax/features/smart-test-runner/progress.txt +13 -0
package/nax/features/smart-test-runner/spec.md +7 -0
package/nax/features/smart-test-runner/tasks.md +8 -0
package/nax/features/v0.18.3-execution-reliability/prd.json +80 -0
package/nax/features/v0.18.3-execution-reliability/progress.txt +3 -0
package/package.json +2 -2
package/src/config/defaults.ts +2 -0
package/src/config/schema.ts +1 -0
package/src/config/schemas.ts +24 -0
package/src/config/types.ts +16 -1
package/src/context/builder.ts +11 -0
package/src/context/elements.ts +38 -1
package/src/execution/escalation/tier-escalation.ts +28 -3
package/src/execution/post-verify-rectification.ts +4 -2
package/src/execution/post-verify.ts +73 -9
package/src/execution/progress.ts +2 -0
package/src/pipeline/stages/review.ts +5 -3
package/src/pipeline/stages/routing.ts +14 -9
package/src/pipeline/stages/verify.ts +54 -1
package/src/prd/index.ts +16 -1
package/src/prd/types.ts +33 -0
package/src/precheck/index.ts +9 -4
package/src/routing/strategies/llm.ts +5 -0
package/src/verification/gate.ts +2 -1
package/src/verification/smart-runner.ts +214 -0
package/src/verification/types.ts +2 -0
package/test/US-002-orchestrator.test.ts +5 -5
package/test/context/prior-failures.test.ts +462 -0
package/test/execution/post-verify-bug026.test.ts +443 -0
package/test/execution/post-verify.test.ts +32 -0
package/test/execution/structured-failure.test.ts +414 -0
package/test/integration/logger.test.ts +1 -1
package/test/integration/review-plugin-integration.test.ts +2 -1
package/test/integration/story-id-in-events.test.ts +1 -1
package/test/unit/config/smart-runner-flag.test.ts +249 -0
package/test/unit/pipeline/routing-partial-override.test.ts +141 -0
package/test/unit/pipeline/verify-smart-runner.test.ts +344 -0
package/test/unit/prd-get-next-story.test.ts +28 -0
package/test/unit/routing.test.ts +102 -0
package/test/unit/smart-test-runner.test.ts +512 -0
package/test/unit/verification/smart-runner.test.ts +246 -0

package/.gitlab-ci.yml CHANGED Viewed

@@ -15,9 +15,11 @@ stages:
 # --- Stage: Test ---
 test:
   stage: test
-  image: nathapp/node-bun:22.21.0-1.3.9-alpine
+  image:
+    name: nathapp/node-bun:22.21.0-1.3.9-alpine
+    pull_policy: if-not-present
   before_script:
-    - apk add --no-cache git python3 make g++
+    - apk add --no-cache git python3 make g++
     - git config --global safe.directory '*'
     - git config --global user.name "CI Runner"
     - git config --global user.email "ci@nathapp.io"
@@ -32,7 +34,7 @@ test:
     - bun install --frozen-lockfile --ignore-scripts
     - bun run typecheck
     - bun run lint
-    - NAX_SKIP_PRECHECK=1 bun test test/ --timeout=60000
+    - bun run test:unit
   rules:
     - if: '$CI_COMMIT_MESSAGE =~ /release-by-bot/ || $CI_COMMIT_TAG'
       when: never
@@ -43,7 +45,9 @@ test:
 # --- Stage: Release ---
 release:
   stage: release
-  image: nathapp/node-bun:22.21.0-1.3.9-alpine
+  image:
+    name: nathapp/node-bun:22.21.0-1.3.9-alpine
+    pull_policy: if-not-present
   cache:
     key:
       files:
@@ -80,10 +84,12 @@ release:
 # --- Stage: Notify ---
 notify:
   stage: notify
-  image: nathapp/node-bun:22.21.0-1.3.9-alpine
+  image:
+    name: registry-intl.cn-hongkong.aliyuncs.com/gkci/node:22.14.0-alpine-ci
+    pull_policy: if-not-present
   needs: [release]
   script:
-    - VERSION=$(bun -e "console.log(require('./package.json').version)")
+    - VERSION=$(node -e "console.log(require('./package.json').version)")
     - 'curl -s -X POST -H "Content-Type: application/json" -d "{\"chat_id\": \"$TELEGRAM_CHAT_ID\", \"text\": \"nax v${VERSION} released\"}" https://api.telegram.org/bot$TELEGRAM_BOT_TOKEN/sendMessage'
   rules:
     - if: '$CI_COMMIT_MESSAGE =~ /release-by-bot/ || $CI_COMMIT_TAG'

package/bun.lock CHANGED Viewed

@@ -18,7 +18,7 @@
       },
       "devDependencies": {
         "@biomejs/biome": "^1.9.4",
-        "@types/bun": "^1.2.4",
+        "@types/bun": "^1.3.8",
         "react-devtools-core": "^7.0.1",
         "typescript": "^5.7.3",
       },

package/bunfig.toml CHANGED Viewed

@@ -8,4 +8,5 @@ timeout = 30000
 # Exclude nax dogfood feature directories (contain acceptance tests from nax runs, not source tests)
 exclude = ["nax/**"]
-# Note: E2E tests may override this with longer timeouts using test options
+[test]
+concurrent = 1

package/docker-compose.test.yml ADDED Viewed

@@ -0,0 +1,17 @@
+version: "3.9"
+services:
+  app:
+    image: oven/bun:1.3.8
+    working_dir: /app
+    volumes:
+      - .:/app
+    command: >
+      sh -c "
+        echo 'Running pre-step...' &&
+        apt-get update && apt-get install -y --no-install-recommends git &&
+        bun install &&
+        bun run test:unit
+      "
+    environment:
+      - NAX_SKIP_PRECHECK=1
+      - CI=true

package/docs/ROADMAP.md CHANGED Viewed

@@ -24,47 +24,44 @@
 ---
-## v0.18.1 — Type Safety + Per-Story testStrategy
+## v0.18.1 — Type Safety + CI Pipeline ✅
-**Theme:** Fix all TypeScript/lint errors + fine-grained test strategy control
-**Status:** 🔲 Planned
+**Theme:** Fix all TypeScript/lint errors, establish CI pipeline
+**Status:** ✅ Shipped (2026-03-03)
 ### TypeScript Fixes (60 errors across 21 files)
-- [ ] **TS-001:** Fix context module exports — add `BuiltContext`, `ContextElement`, `ContextBudget`, `StoryContext` to `context/types.ts` (13 errors)
-- [ ] **TS-002:** Fix config/command type safety — type `{}` → proper types in `config/loader.ts`, `commands/logs.ts`, `agents/claude.ts` (12 errors)
-- [ ] **TS-003:** Fix review/verification types — add `softViolations`, `warnings`, `description` to review result types (9 errors)
-- [ ] **TS-004:** Fix escalation PRD type construction — ensure escalation produces valid `PRD` objects (4 errors)
-- [ ] **TS-005:** Fix misc — Logger mock types, null checks, missing exports (`RectificationState`, `TestSummary`, `TestFailure`) (6 errors)
-### Lint Fixes (12 errors)
-- [ ] **LINT-001:** Run `biome check --fix` + manual review of unsafe fixes
-### Verify Stage Fix
-- [ ] **TEST-001:** Fix hanging "test command that throws error" test — add timeout or proper process kill
-### Per-Story testStrategy
-- [ ] Add optional `testStrategy` field to userStory PRD schema (`"test-after" | "three-session-tdd" | "three-session-tdd-lite"`)
-- [ ] When set, overrides global config + task classification for that story
-- [ ] Update routing stage to check `story.testStrategy` before config/LLM
-- [ ] Docs + tests
-### Re-enable Checks
-- [ ] Re-enable `typecheck` in `nax/config.json` review checks after TS fixes land
+- [x] ~~**TS-001:** Fix context module exports (13 errors)~~
+- [x] ~~**TS-002:** Fix config/command type safety (12 errors)~~
+- [x] ~~**TS-003:** Fix review/verification types (9 errors)~~
+- [x] ~~**TS-004:** Fix escalation PRD type construction (4 errors)~~
+- [x] ~~**TS-005:** Fix misc types (6 errors)~~
+- [x] ~~**LINT-001:** Run biome check --fix + manual review~~
+### CI Pipeline (new)
+- [x] `.gitlab-ci.yml` — stages: test → release → notify
+- [x] Image: `nathapp/node-bun:22.21.0-1.3.9-alpine` (test/release), `gkci/node:22.14.0-alpine-ci` (notify)
+- [x] `before_script`: apk add git python3 make g++, safe.directory, git identity
+- [x] Test env: `NAX_SKIP_PRECHECK=1 bun test test/ --timeout=60000`
+- [x] CI skip guards for env-sensitive tests (claude binary, PID checks, subprocess integration)
+- [x] Fixed `checkClaudeCLI()` ENOENT crash — try/catch around Bun.spawn
+- [x] Release trigger: `[run-release]` in commit message on master
+- [x] Runner requirement: 8GB shared runner (`saas-linux-small-amd64`)
+- [x] **Result: 1952 pass, 56 skip, 0 fail**
 ---
-## v0.18.2 — Smart Test Runner + Bun PTY Migration
+## v0.18.2 — Smart Test Runner + Routing Fix ✅
-**Theme:** Scope verify to changed files only + remove node-pty native addon
-**Status:** 🔲 Planned
+**Theme:** Scope verify to changed files only + fix routing override
+**Status:** ✅ Shipped (2026-03-03)
 ### Smart Test Runner
-- [ ] After agent implementation, run `git diff --name-only` to get changed source files
-- [ ] Map source → test files by naming convention (`src/foo/bar.ts` → `test/unit/foo/bar.test.ts`)
-- [ ] Run only related tests for verify (instead of full suite)
-- [ ] Fallback to full suite when mapping yields no test files
-- [ ] Config flag `execution.smartTestRunner: true` (default: true) to opt out
-- [ ] Result: verify drops from ~125s to ~10-20s for typical single-file fixes
+- [x] ~~After agent implementation, run `git diff --name-only` to get changed source files~~
+- [x] ~~Map source → test files by naming convention (`src/foo/bar.ts` → `test/unit/foo/bar.test.ts`)~~
+- [x] ~~Run only related tests for verify (instead of full suite)~~
+- [x] ~~Fallback to full suite when mapping yields no test files~~
+- [x] ~~Config flag `execution.smartTestRunner: true` (default: true) to opt out~~
+- [x] ~~Result: verify drops from ~125s to ~10-20s for typical single-file fixes~~
 ### Bun PTY Migration (BUN-001)
 - [ ] Replace `node-pty` (native addon, requires python/make/g++ to build) with `Bun.Terminal` API (v1.3.5+)
@@ -83,12 +80,68 @@
 ---
-## v0.19.0 — Central Run Registry
+## v0.18.3 — Execution Reliability ✅
+**Theme:** Fix execution pipeline bugs (escalation, routing, review), structured failure context, and Smart Runner enhancement
+**Status:** ✅ Shipped (2026-03-04)
+**Spec:** [docs/specs/verification-architecture-v2.md](specs/verification-architecture-v2.md) (Phase 1)
+### Bugfixes — Completed
+- [x] **BUG-026:** Regression gate timeout → accept scoped pass + warn (not escalate). Config: `regressionGate.acceptOnTimeout: true`.
+- [x] **BUG-028:** Routing cache ignores escalation tier — `clearCacheForStory(storyId)` in `llm.ts`, called on tier escalation in both `preIterationTierCheck()` and `handleTierEscalation()`.
+### Structured Failure Context — Completed
+- [x] **SFC-001:** `StructuredFailure` type with `TestFailureContext[]` + `priorFailures?: StructuredFailure[]` on `UserStory`. Populated on verify, regression, rectification, and escalation failures.
+- [x] **SFC-002:** Format `priorFailures` into agent prompt at priority 95 via `createPriorFailuresContext()` in `context/builder.ts`.
+### Bugfixes — Completed (Round 2)
+- [x] **BUG-029:** Escalation resets story to `pending` → bypasses BUG-022 retry priority. After escalation, `getNextStory()` picks the next pending story instead of retrying the escalated one. **Location:** `src/prd/index.ts:getNextStory()`. **Fix:** Recognize escalated-pending stories in Priority 1 (e.g. check `story.routing.modelTier` changed, or use `"retry-pending"` status).
+- [x] **BUG-030:** Review lint/typecheck failure → hard `"fail"`, no rectification or retry. `review.ts:92` returns `{ action: "fail" }` → `markStoryFailed()` permanently. Lint errors are auto-fixable but story is killed with zero retry. **Fix:** Return `"escalate"` for lint/typecheck failures (or add review-rectification loop). Reserve `"fail"` for plugin reviewer rejection only.
+- [x] **BUG-032:** Routing stage overrides escalated `modelTier` with complexity-derived tier. `routing.ts:43` always runs `complexityToModelTier()` even when `story.routing.modelTier` was set by escalation → escalated tier silently ignored. BUG-013 fix (`applyCachedRouting`) runs too late. **Fix:** Skip `complexityToModelTier()` when `story.routing.modelTier` is explicitly set.
+### STR-007: Smart Test Runner Enhancement — Completed
+- [x] Configurable `testFilePatterns` in config (default: `test/**/*.test.ts`)
+- [x] `testFileFallback` config option: `"import-grep"` | `"full-suite"` (default: `"import-grep"`)
+- [x] 3-pass test discovery: path-convention → import-grep (grep test files for changed module name) → full-suite
+- [x] Config schema update: `execution.smartTestRunner` becomes object `{ enabled, testFilePatterns, fallback }` (backward compat: boolean coerced)
+---
+## v0.18.4 — Routing Stability
+**Theme:** Fix routing classifier consistency and LLM routing reliability
+**Status:** 🔲 Planned
+### Bugfixes
+- [ ] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. `priorErrors` text shifts keyword classification. **Fix:** Keyword classifier should only use original story fields; or lock `story.routing.testStrategy` once set.
+- [ ] **BUG-033:** LLM routing has no retry on timeout — single 15s attempt, then keyword fallback. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Raise default timeout to 30s for batch routing.
+---
+## v0.19.0 — Verification Architecture v2
-**Theme:** Unified run tracking across worktrees + dashboard integration
+**Theme:** Eliminate duplicate test runs, deferred regression gate, structured escalation context
 **Status:** 🔲 Planned
+**Spec:** [docs/specs/verification-architecture-v2.md](specs/verification-architecture-v2.md) (Phase 2)
+### Remove Duplicate Test Execution
+- [ ] Pipeline verify stage is the single test execution point (Smart Test Runner)
+- [ ] Remove scoped re-test in `post-verify.ts` (duplicate of pipeline verify)
+- [ ] Review stage runs typecheck + lint only — remove `review.commands.test` execution
-- [ ] **Central Run Registry** — `~/.nax/runs/<project>-<feature>-<runId>/` with status.json + events.jsonl symlink. Dashboard reads from registry.
+### Deferred Regression Gate
+- [ ] New `src/execution/lifecycle/run-regression.ts` — run full suite once at run-end (not per-story)
+- [ ] Reverse Smart Test Runner mapping: failing test → source file → responsible story
+- [ ] Targeted rectification per responsible story with full failure context
+- [ ] Config: `execution.regressionGate.mode: "deferred" | "per-story" | "disabled"` (default `"deferred"`)
+- [ ] Call deferred regression in `run-completion.ts` before final metrics
+### Full Structured Failure Context
+- [ ] `priorFailures` injected into escalated agent prompts via `context/builder.ts`
+- [ ] Reverse file mapping for regression attribution
+### Central Run Registry (carried forward)
+- [ ] `~/.nax/runs/<project>-<feature>-<runId>/` with status.json + events.jsonl symlink
 ---
@@ -96,6 +149,9 @@
 | Version | Theme | Date | Details |
 |:---|:---|:---|:---|
+| v0.18.1 | Type Safety + CI Pipeline | 2026-03-03 | 60 TS errors + 12 lint errors fixed, GitLab CI green (1952/56/0) |
+| v0.18.3 | Execution Reliability + Smart Runner | 2026-03-04 | BUG-026/028/029/030/032 + SFC-001/002 + STR-007, all items complete |
+| v0.18.2 | Smart Test Runner + Routing Fix | 2026-03-03 | FIX-001 + STR-001–006, 2038 pass/11 skip/0 fail |
 | v0.18.0 | Orchestration Quality | 2026-03-03 | BUG-016/017/018/019/020/021/022/023/025 all fixed |
 | v0.17.0 | Config Management | 2026-03-02 | CM-001 --explain, CM-002 --diff, CM-003 default view |
 | v0.16.4 | Bugfixes: Routing + Env Allowlist | 2026-03-02 | BUG-012/013/014 |
@@ -130,7 +186,28 @@
 - [x] ~~BUG-012: Greenfield detection ignores pre-existing test files~~
 - [x] ~~BUG-013: Escalation routing not applied in iterations~~
 - [x] ~~BUG-014: buildAllowedEnv() strips USER/LOGNAME~~
+- [x] ~~**BUG-015:** `loadConstitution()` leaks global `~/.nax/constitution.md` into unit tests — fixed via `skipGlobal: true` in all unit tests~~
+- [ ] **BUG-027:** `runPrecheck()` always prints to stdout — pollutes test output when called programmatically.
+  - **Observed (2026-03-03):** `bun test` output starts with precheck JSON from `US-002-orchestrator.test.ts` calling `runPrecheck()`, which unconditionally calls `console.log()`. nax verify stage captures this, making every failure look like a `git-repo-exists` blocker.
+  - **Root cause:** `runPrecheck()` mixes side-effects (printing) with logic (returning result).
+  - **Fix:** Add `silent?: boolean` to `PrecheckOptions`; test callers pass `silent: true`.
+  - **Workaround (active):** `silent` option + test update shipped in v0.18.2 branch.
+  - **Target:** v0.18.2
+- [ ] **BUG-028:** Routing cache ignores escalation tier — escalated stories re-run at original tier.
+  - **Observed (2026-03-03):** STR-006 escalated to `powerful`. Router returned LLM cache hit from prior `balanced` run → agent ran as `balanced` anyway.
+  - **Root cause:** Cache key does not include requested tier. Lower-tier cache hit served for higher-tier request.
+  - **Fix:** Include `requestedTier` in cache key; only serve cache hit if cached tier >= requested tier.
+  - **Target:** v0.19.0
+- [ ] **BUG-026:** Regression gate failure triggers full story re-implementation instead of targeted rectification.
+  - **Observed (2026-03-03):** During v0.18.2 smart-runner development on Mac01, STR-001 passed scoped verification (5/5 tests green) but the full-suite regression gate timed out (exit code 132, SIGILL/Bun crash). nax treated this as a story failure and re-ran the coding agent, which rewrote already-correct code. The retry agent then produced a different (worse) implementation that failed verification.
+  - **Root cause:** Escalation logic does not distinguish between "story code is wrong" and "story code is fine but introduced a regression". Both flow through the same retry path.
+  - **Fix:** After regression gate failure, spawn a rectification agent with context of what regressed (failing test names + diff), not a full story re-implementation. Only fall back to full re-implementation if rectification also fails.
+  - **Workaround (active):** Disabled regression gate via `rectification.enabled: false` in project nax/config.json for self-dev runs. CI on VPS is the regression gate instead.
+  - **Target:** v0.19.0
 - [x] ~~**BUG-016:** Hardcoded 120s timeout in pipeline verify stage → fixed in v0.18.0~~
 - [x] ~~**BUG-017:** run.complete not emitted on SIGTERM → fixed in v0.18.0~~
 - [x] ~~**BUG-018:** Test-writer wastes ~3min/retry when tests already exist → fixed in v0.18.0~~
@@ -141,12 +218,20 @@
 - [x] ~~**BUG-023:** Agent failure silent — no exitCode/stderr in JSONL → fixed in v0.18.0~~
 - [x] ~~**BUG-025:** `needsHumanReview` not triggering interactive plugin → fixed in v0.18.0~~
+- [x] **BUG-029:** Escalation resets story to `pending` → bypasses BUG-022 retry priority. `handleTierEscalation()` sets `status: "pending"` after escalation, but `getNextStory()` Priority 1 only checks `status === "failed"`. Result: after BUG-026 escalated (iter 1), nax moved to BUG-028 (iter 2) instead of retrying BUG-026 immediately. **Location:** `src/prd/index.ts:getNextStory()` + `src/execution/escalation/tier-escalation.ts`. **Fix:** `getNextStory()` should also prioritize stories with `story.routing.modelTier` that changed since last attempt (escalation marker), or `handleTierEscalation` should use a distinct status like `"retry-pending"` that Priority 1 recognizes.
+- [x] **BUG-030:** Review lint failure → hard `"fail"`, no rectification or retry. `src/pipeline/stages/review.ts:92` returns `{ action: "fail" }` for all review failures including lint. In `pipeline-result-handler.ts`, `"fail"` calls `markStoryFailed()` — permanently dead. But lint errors are auto-fixable (agent can run `biome check --fix`). Contrast with verify stage which returns `"escalate"` on test failure, allowing retry. SFC-001 and SFC-002 both hit this — tests passed but 5 Biome lint errors killed the stories permanently. **Fix:** Review stage should return `"escalate"` (not `"fail"`) for lint/typecheck failures, or add a review-rectification loop (like verify has) that gives the agent one retry with the lint output as context. Reserve `"fail"` for unfixable review issues (e.g. plugin reviewer rejection).
+- [ ] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. BUG-026 was classified as `test-after` on iter 1 (keyword fallback), but `three-session-tdd-lite` on iter 5 (same keyword fallback). The keyword classifier in `src/routing/strategies/keyword.ts:classifyComplexity()` may be influenced by `priorErrors` text added between attempts, shifting the keyword match result. **Location:** `src/routing/strategies/keyword.ts`. **Fix:** Keyword classifier should only consider the story's original title + description + acceptance criteria, not accumulated `priorErrors` or `priorFailures`. Alternatively, once a strategy is set in `story.routing.testStrategy`, the routing stage should preserve it across retries (already partially done in `routing.ts:40-41` but may not apply when LLM falls back to keyword).
+- [x] **BUG-032:** Routing stage overrides escalated `modelTier` with complexity-derived tier. `src/pipeline/stages/routing.ts:43` always runs `complexityToModelTier(routing.complexity, config)` even when `story.routing.modelTier` was explicitly set by `handleTierEscalation()`. BUG-026 was escalated to `balanced` (logged in iteration header), but `Task classified` shows `modelTier=fast` because `complexityToModelTier("simple", config)` → `"fast"`. Related to BUG-013 (escalation routing not applied) which was marked fixed, but the fix in `applyCachedRouting()` in `pipeline-result-handler.ts:295-310` runs **after** the routing stage — too late. **Location:** `src/pipeline/stages/routing.ts:43`. **Fix:** When `story.routing.modelTier` is explicitly set (by escalation), skip `complexityToModelTier()` and use the cached tier directly. Only derive from complexity when `story.routing.modelTier` is absent.
+- [ ] **BUG-033:** LLM routing has no retry on timeout — single attempt with hardcoded 15s default. All 5 LLM routing attempts in the v0.18.3 run timed out at 15s, forcing keyword fallback every time. `src/routing/strategies/llm.ts:63` reads `llmConfig?.timeoutMs ?? 15000` but there's no retry logic — one timeout = immediate fallback. **Location:** `src/routing/strategies/llm.ts:callLlm()`. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Also surface `routing.llm.timeoutMs` in `nax config --explain` and consider raising default to 30s for batch routing which processes multiple stories.
 ### Features
 - [x] ~~`nax unlock` command~~
 - [x] ~~Constitution file support~~
 - [x] ~~Per-story testStrategy override — v0.18.1~~
 - [x] ~~Smart Test Runner — v0.18.2~~
 - [ ] Central Run Registry — v0.19.0 (planned; carried forward)
+- [ ] **BUN-001:** Bun PTY Migration — replace `node-pty` with `Bun.Terminal` API
+- [ ] **CI-001:** CI Memory Optimization — parallel test sharding for 1GB runners
 - [ ] Cost tracking dashboard
 - [ ] npm publish setup
 - [ ] `nax diagnose --ai` flag (LLM-assisted, future TBD)
@@ -162,4 +247,4 @@ Sequential canary → stable: `v0.12.0-canary.0` → `canary.N` → `v0.12.0`
 Canary: `npm publish --tag canary`
 Stable: `npm publish` (latest)
-*Last updated: 2026-03-03 (v0.18.0 shipped — all 9 bugs fixed)*
+*Last updated: 2026-03-04 (v0.18.3 shipped; v0.18.4: BUG-031/033; v0.19.0: Verification Architecture v2)*

package/docs/specs/verification-architecture-v2.md ADDED Viewed

@@ -0,0 +1,343 @@
+# Verification Architecture v2
+**Status:** Proposal
+**Target:** v0.19.0
+**Author:** Nax Dev
+**Date:** 2026-03-04
+**Fixes:** BUG-026, BUG-028, plus architectural debt in verification pipeline
+---
+## 1. Problems with Current Architecture
+### 1.1 Triple Test Execution (Waste)
+Current per-story flow runs tests up to 3 times:
+```
+Pipeline verify stage     → scoped tests (Smart Test Runner)
+Pipeline review stage     → test command (if review.commands.test configured)
+Post-verify               → scoped tests AGAIN + full regression gate
+```
+On Mac01 with ~2000 tests, this means:
+- Scoped: ~10-20s × 2 runs = 20-40s, of which the duplicate run (~10-20s) is wasted
+- Full regression: ~125s per story
+- Total: ~150s+ of test execution per story
+### 1.2 Regression Gate Per Story (BUG-026)
+The regression gate runs a **full test suite after every story**. Problems:
+- **Timeout:** Full suite frequently times out on Mac01 (~125s)
+- **False escalation:** Timeout is treated as story failure → bumps `story.attempts` → triggers tier escalation
+- **Wasted compute:** Agent's implementation was correct (scoped tests passed), but full suite timeout causes a complete redo at a higher (more expensive) tier
+- **Cascading waste:** N stories × 1 full suite each = N full suite runs. Most are redundant.
+### 1.3 Escalation Context Loss
+When a story fails and escalates to a higher tier, the error context passed is:
+```
+priorErrors: ["Attempt 1 failed with model tier: fast"]
+```
+The actual test output — which tests failed, error messages, stack traces — is **discarded**. The escalated agent gets a vague hint instead of actionable failure context.
+| Stage | Context Available | What's Stored in priorErrors |
+|-------|-------------------|------------------------------|
+| Rectification loop | Full `TestFailure[]` with file, testName, error, stackTrace | *(used internally, then discarded)* |
+| Post-verify failure | `verificationResult.error` (summary string) | Generic: `"Verification failed: TEST_FAILURE"` |
+| Regression gate failure | Full test output | Generic: `"REGRESSION: full-suite regression detected"` |
+| Tier escalation | Nothing new | `"Attempt N failed with model tier: X"` |
+Result: `fast → balanced → powerful` escalation chain has **zero actionable context** about what actually failed.
+### 1.4 Routing Cache Ignores Escalation Tier (BUG-028)
+LLM routing cache is keyed by `story.id` only. When escalation updates `story.routing.modelTier` from `balanced` → `powerful`, the next iteration hits the cache and returns the old `balanced` routing decision, overriding the escalation.
+---
+## 2. Proposed Architecture
+### 2.1 Verification Flow (Simplified)
+```
+Pipeline per-story:
+  1. Agent execution
+  2. Scoped verify (Smart Test Runner)        ← ONLY test run per story
+  3. Scoped rectification (if verify fails)   ← has full test failure context
+  4. Review (typecheck + lint only)            ← NO test re-run
+  5. Story marked "passed" or escalated
+Run-end (after all stories pass):
+  6. Deferred regression gate (full suite)     ← ONE full suite run total
+  7. Targeted regression rectification         ← per-story, with failure context
+  8. Run marked complete or stalled
+```
+**Key changes:**
+- **Remove duplicate test runs** — pipeline verify is the single source of truth
+- **Review stage runs typecheck + lint only** — no test command
+- **Remove post-verify scoped re-test** — pipeline verify already did this
+- **Move regression gate to run-end** — one full suite run instead of N
+- **Targeted regression rectification** — map failing tests back to responsible stories
+### 2.2 Deferred Regression Gate
+Instead of running the full suite after every story, run it **once** after all stories complete.
+```typescript
+// New: src/execution/lifecycle/run-regression.ts
+interface DeferredRegressionOptions {
+  config: NaxConfig;
+  workdir: string;
+  prd: PRD;
+  prdPath: string;
+  allStoryMetrics: StoryMetrics[];
+}
+interface DeferredRegressionResult {
+  passed: boolean;
+  failedTests?: TestFailure[];
+  storyMapping?: Map<string, TestFailure[]>; // storyId → failures caused by that story
+}
+```
+**Failure handling:**
+1. Run full suite
+2. Parse failures into `TestFailure[]`
+3. For each failing test, use reverse Smart Test Runner mapping:
+   - `test/unit/foo/bar.test.ts` → `src/foo/bar.ts` → which story touched this file? (from git log per story)
+4. Group failures by responsible story
+5. Attempt targeted rectification per story (agent gets FULL failure context)
+6. Re-run full suite to confirm fix
+7. If still failing → mark responsible stories as failed
+**Config:**
+```jsonc
+{
+  "execution": {
+    "regressionGate": {
+      "enabled": true,
+      "mode": "deferred",        // "deferred" | "per-story" | "disabled"
+      "timeoutSeconds": 300,
+      "maxRectificationAttempts": 2
+    }
+  }
+}
+```
+### 2.3 Structured Failure Context for Escalation
+Replace vague `priorErrors` strings with structured failure data.
+**New PRD field:** `priorFailures` (alongside existing `priorErrors` for backward compat)
+```typescript
+// In src/prd/types.ts
+interface StructuredFailure {
+  /** Which attempt this failure occurred on */
+  attempt: number;
+  /** Model tier that was used */
+  modelTier: string;
+  /** What stage failed */
+  stage: "verify" | "review" | "regression" | "rectification" | "agent-session";
+  /** Human-readable summary */
+  summary: string;
+  /** Structured test failures (if applicable) */
+  testFailures?: TestFailureContext[];
+  /** Timestamp */
+  timestamp: string;
+}
+interface TestFailureContext {
+  file: string;
+  testName: string;
+  error: string;
+  /** First 5 lines of stack trace */
+  stackTrace: string[];
+}
+```
+**How it flows through escalation:**
+```
+fast attempt 1 → verify fails
+  → priorFailures: [{
+      attempt: 1,
+      modelTier: "fast",
+      stage: "verify",
+      summary: "3 tests failed in src/routing/router.ts",
+      testFailures: [
+        { file: "test/unit/routing/router.test.ts",
+          testName: "should route to balanced",
+          error: "Expected 'balanced' got 'fast'",
+          stackTrace: [...] },
+        ...
+      ]
+    }]
+balanced attempt 1 → agent gets FULL context of what fast couldn't fix
+```
+**Context injection** (`context/builder.ts`):
+Format `priorFailures` into actionable markdown for the agent prompt:
+```markdown
+## Prior Attempt 1 (fast, verify)
+3 tests failed in src/routing/router.ts
+### Test Failures:
+- **test/unit/routing/router.test.ts** > should route to balanced
+  Error: Expected 'balanced' got 'fast'
+  Stack: at Router.route (src/routing/router.ts:42)
+```
+### 2.4 BUG-028 Fix: Cache Invalidation on Escalation
+Add `clearCacheForStory(storyId)` to `src/routing/strategies/llm.ts`.
+Call it in `tier-escalation.ts` when updating `story.routing.modelTier`.
+---
+## 3. Migration Plan
+### Phase 1: v0.18.3 — Minimal Fixes (no architecture change)
+1. **BUG-026 quick fix:** Regression gate timeout → accept scoped pass + warn (not escalate)
+2. **BUG-028 fix:** `clearCacheForStory()` on escalation
+3. **Store structured failures:** Start populating `priorFailures` alongside `priorErrors` (backward compat)
+### Phase 2: v0.19.0 — Architecture v2
+1. **Remove post-verify duplicate test run** — pipeline verify is authoritative
+2. **Review stage: typecheck + lint only** — remove test command from review
+3. **Deferred regression gate** — run-end full suite with targeted rectification
+4. **Reverse Smart Test Runner mapping** — failing test → source file → responsible story
+5. **Full structured failure context** — `priorFailures` injected into agent prompts
+6. **Config:** `regressionGate.mode: "deferred"` (default)
+### Phase 3: Future
+- **Incremental regression:** Only run tests related to ALL changed files across all stories (union of Smart Test Runner scopes)
+- **Test impact analysis:** AST-based dependency graph for more precise test scoping
+- **Parallel story regression:** Run rectification for multiple stories concurrently
+---
+## 4. Files Affected
+### Phase 1 (v0.18.3)
+| File | Change |
+|------|--------|
+| `src/execution/post-verify.ts` | Regression gate timeout → accept + warn |
+| `src/routing/strategies/llm.ts` | Add `clearCacheForStory()` export |
+| `src/execution/escalation/tier-escalation.ts` | Call `clearCacheForStory()` on escalation |
+| `src/execution/post-verify-rectification.ts` | Store `StructuredFailure` in `priorFailures` |
+| `src/prd/types.ts` | Add `priorFailures?: StructuredFailure[]` to `UserStory` |
+### Phase 2 (v0.19.0)
+| File | Change |
+|------|--------|
+| `src/pipeline/stages/review.ts` | Remove test command execution |
+| `src/execution/post-verify.ts` | Remove scoped re-test, keep regression call only |
+| `src/execution/lifecycle/run-regression.ts` | **New:** Deferred regression gate + targeted rectification |
+| `src/execution/lifecycle/run-completion.ts` | Call deferred regression before final metrics |
+| `src/verification/smart-runner.ts` | Add reverse mapping: test file → source file → story |
+| `src/context/builder.ts` | Format `priorFailures` into agent prompt |
+| `src/config/schemas.ts` | Add `regressionGate.mode` enum |
+---
+## 5. Test Plan
+### Phase 1 Tests
+- Regression gate timeout returns "passed" with warning (not "failed")
+- `clearCacheForStory()` removes the cached decision; the next `route()` call re-evaluates
+- `priorFailures` populated with structured `TestFailureContext` on verify failure
+- Backward compat: `priorErrors` still populated alongside `priorFailures`
+### Phase 2 Tests
+- Pipeline verify is single test execution (no duplicate)
+- Review stage skips test command
+- Deferred regression runs once at run-end
+- Reverse mapping correctly identifies responsible story
+- Targeted rectification receives full failure context
+- Escalated agent prompt includes formatted `priorFailures`
+- Config `regressionGate.mode: "per-story"` preserves current behavior
+---
+## 6. Historical Context (Why It's Like This)
+### Why post-verify exists separately from pipeline verify
+The pipeline (`src/pipeline/pipeline.ts`) runs stages in sequence: routing → context → prompt → execution → **verify** → review → completion. This was the original single verification point.
+Later, **post-agent verification** was added in `src/execution/pipeline-result-handler.ts` → `handlePipelineSuccess()` → `runPostAgentVerification()`. This was meant to handle:
+- **Scoped verification** with git-diff-based test file detection (before Smart Test Runner existed in the pipeline)
+- **Rectification** — retry loop with agent when tests fail
+- **Regression gate** (BUG-009 fix) — full suite after scoped pass
+When Smart Test Runner was added to the **pipeline verify stage** (v0.18.2), it duplicated the scoped test logic that post-verify already had, and the post-verify scoped test was never removed.
+### Current code flow with exact locations
+```
+sequential-executor.ts:170  → pipelineRunner.run(story)
+  pipeline.ts:execute()     → runs stages in order:
+    verify.ts:execute()     → Smart Test Runner scoped tests    [TEST RUN #1]
+    review.ts:execute()     → runReview() which may run tests   [TEST RUN #2 if review.commands.test set]
+pipeline-result-handler.ts:76  → runPostAgentVerification()
+  post-verify.ts:85            → runVerification(scopedCommand) [TEST RUN #3 — duplicate of #1]
+  post-verify.ts:118           → runRegressionGate()
+    post-verify.ts:180         → runVerification(fullSuite)     [TEST RUN #4 — full suite]
+```
+### Review stage test command
+`review.ts` calls `runReview()` from `src/review/index.ts`, which runs `config.review.commands.test` if configured. In the default config, `review.commands` includes `test`, `typecheck`, and `lint`, so review runs tests by default, creating the triple-test problem.
+### Decision rationale
+**Why deferred regression (Option C) over per-story (A) or disabled (B):**
+- **Option A (keep per-story):** 125s timeout per story is the root cause of BUG-026. Even with timeout-acceptance, it's wasteful.
+- **Option B (disable entirely):** Too risky — cross-story regressions are real (BUG-009 was filed for this exact reason).
+- **Option C (deferred):** One full suite run at the end. If it fails, we can trace back to responsible stories via reverse file mapping. Best balance of safety vs speed.
+**Why cache invalidation (Option C for BUG-028) over cache key change (A) or bypass (B):**
+- **Option A (include tier in key):** Works but creates multiple cache entries per story. If a story is re-routed 3 times, 3 entries exist. Cache eviction becomes unpredictable.
+- **Option B (bypass when routing set):** Almost all stories have `story.routing` set after first pass, so cache would rarely be used at all — defeats the purpose.
+- **Option C (clear on escalation):** Surgical — one `delete()` call at the exact moment routing changes. Cache works normally for non-escalated stories.
+## 7. Edge Cases
+### Partial completion (stalled run)
+If only 3 of 5 stories pass and nax stalls (the remaining stories failed or were paused):
+- Deferred regression still runs on the 3 passed stories
+- If regression fails, only the passed stories are candidates for rectification
+- Failed/paused stories are untouched
+### Stories that touch the same files
+If story A and story B both modify `src/utils/parser.ts`:
+- Reverse mapping may attribute the same failing test to both stories
+- Rectification should try the **last story that touched the file** first (git log order)
+- If that doesn't fix it, try the other story
+### No test mapping possible
+If a failing test can't be mapped to any story's changed files:
+- Log warning: "Unmapped regression — cannot attribute to a specific story"
+- Mark ALL passed stories as needing re-verification
+- This is the worst case but should be rare with good test naming conventions