@nathapp/nax 0.18.1 → 0.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/.gitlab-ci.yml +12 -6
  2. package/bun.lock +1 -1
  3. package/bunfig.toml +2 -1
  4. package/docker-compose.test.yml +17 -0
  5. package/docs/ROADMAP.md +121 -36
  6. package/docs/specs/verification-architecture-v2.md +343 -0
  7. package/nax/config.json +13 -10
  8. package/nax/features/smart-test-runner/plan.md +7 -0
  9. package/nax/features/smart-test-runner/prd.json +203 -0
  10. package/nax/features/smart-test-runner/progress.txt +13 -0
  11. package/nax/features/smart-test-runner/spec.md +7 -0
  12. package/nax/features/smart-test-runner/tasks.md +8 -0
  13. package/nax/features/v0.18.3-execution-reliability/prd.json +80 -0
  14. package/nax/features/v0.18.3-execution-reliability/progress.txt +3 -0
  15. package/package.json +2 -2
  16. package/src/config/defaults.ts +2 -0
  17. package/src/config/schema.ts +1 -0
  18. package/src/config/schemas.ts +24 -0
  19. package/src/config/types.ts +16 -1
  20. package/src/context/builder.ts +11 -0
  21. package/src/context/elements.ts +38 -1
  22. package/src/execution/escalation/tier-escalation.ts +28 -3
  23. package/src/execution/post-verify-rectification.ts +4 -2
  24. package/src/execution/post-verify.ts +73 -9
  25. package/src/execution/progress.ts +2 -0
  26. package/src/pipeline/stages/review.ts +5 -3
  27. package/src/pipeline/stages/routing.ts +14 -9
  28. package/src/pipeline/stages/verify.ts +54 -1
  29. package/src/prd/index.ts +16 -1
  30. package/src/prd/types.ts +33 -0
  31. package/src/precheck/index.ts +9 -4
  32. package/src/routing/strategies/llm.ts +5 -0
  33. package/src/verification/gate.ts +2 -1
  34. package/src/verification/smart-runner.ts +214 -0
  35. package/src/verification/types.ts +2 -0
  36. package/test/US-002-orchestrator.test.ts +5 -5
  37. package/test/context/prior-failures.test.ts +462 -0
  38. package/test/execution/post-verify-bug026.test.ts +443 -0
  39. package/test/execution/post-verify.test.ts +32 -0
  40. package/test/execution/structured-failure.test.ts +414 -0
  41. package/test/integration/logger.test.ts +1 -1
  42. package/test/integration/review-plugin-integration.test.ts +2 -1
  43. package/test/integration/story-id-in-events.test.ts +1 -1
  44. package/test/unit/config/smart-runner-flag.test.ts +249 -0
  45. package/test/unit/pipeline/routing-partial-override.test.ts +141 -0
  46. package/test/unit/pipeline/verify-smart-runner.test.ts +344 -0
  47. package/test/unit/prd-get-next-story.test.ts +28 -0
  48. package/test/unit/routing.test.ts +102 -0
  49. package/test/unit/smart-test-runner.test.ts +512 -0
  50. package/test/unit/verification/smart-runner.test.ts +246 -0
package/.gitlab-ci.yml CHANGED
@@ -15,9 +15,11 @@ stages:
15
15
  # --- Stage: Test ---
16
16
  test:
17
17
  stage: test
18
- image: nathapp/node-bun:22.21.0-1.3.9-alpine
18
+ image:
19
+ name: nathapp/node-bun:22.21.0-1.3.9-alpine
20
+ pull_policy: if-not-present
19
21
  before_script:
20
- - apk add --no-cache git python3 make g++
22
+ - apk add --no-cache git python3 make g++
21
23
  - git config --global safe.directory '*'
22
24
  - git config --global user.name "CI Runner"
23
25
  - git config --global user.email "ci@nathapp.io"
@@ -32,7 +34,7 @@ test:
32
34
  - bun install --frozen-lockfile --ignore-scripts
33
35
  - bun run typecheck
34
36
  - bun run lint
35
- - NAX_SKIP_PRECHECK=1 bun test test/ --timeout=60000
37
+ - bun run test:unit
36
38
  rules:
37
39
  - if: '$CI_COMMIT_MESSAGE =~ /release-by-bot/ || $CI_COMMIT_TAG'
38
40
  when: never
@@ -43,7 +45,9 @@ test:
43
45
  # --- Stage: Release ---
44
46
  release:
45
47
  stage: release
46
- image: nathapp/node-bun:22.21.0-1.3.9-alpine
48
+ image:
49
+ name: nathapp/node-bun:22.21.0-1.3.9-alpine
50
+ pull_policy: if-not-present
47
51
  cache:
48
52
  key:
49
53
  files:
@@ -80,10 +84,12 @@ release:
80
84
  # --- Stage: Notify ---
81
85
  notify:
82
86
  stage: notify
83
- image: nathapp/node-bun:22.21.0-1.3.9-alpine
87
+ image:
88
+ name: registry-intl.cn-hongkong.aliyuncs.com/gkci/node:22.14.0-alpine-ci
89
+ pull_policy: if-not-present
84
90
  needs: [release]
85
91
  script:
86
- - VERSION=$(bun -e "console.log(require('./package.json').version)")
92
+ - VERSION=$(node -e "console.log(require('./package.json').version)")
87
93
  - 'curl -s -X POST -H "Content-Type: application/json" -d "{\"chat_id\": \"$TELEGRAM_CHAT_ID\", \"text\": \"nax v${VERSION} released\"}" https://api.telegram.org/bot$TELEGRAM_BOT_TOKEN/sendMessage'
88
94
  rules:
89
95
  - if: '$CI_COMMIT_MESSAGE =~ /release-by-bot/ || $CI_COMMIT_TAG'
package/bun.lock CHANGED
@@ -18,7 +18,7 @@
18
18
  },
19
19
  "devDependencies": {
20
20
  "@biomejs/biome": "^1.9.4",
21
- "@types/bun": "^1.2.4",
21
+ "@types/bun": "^1.3.8",
22
22
  "react-devtools-core": "^7.0.1",
23
23
  "typescript": "^5.7.3",
24
24
  },
package/bunfig.toml CHANGED
@@ -8,4 +8,5 @@ timeout = 30000
8
8
  # Exclude nax dogfood feature directories (contain acceptance tests from nax runs, not source tests)
9
9
  exclude = ["nax/**"]
10
10
 
11
- # Note: E2E tests may override this with longer timeouts using test options
11
+ [test]
12
+ concurrent = 1
@@ -0,0 +1,17 @@
1
+ version: "3.9"
2
+ services:
3
+ app:
4
+ image: oven/bun:1.3.8
5
+ working_dir: /app
6
+ volumes:
7
+ - .:/app
8
+ command: >
9
+ sh -c "
10
+ echo 'Running pre-step...' &&
11
+ apt-get update && apt-get install -y --no-install-recommends git &&
12
+ bun install &&
13
+ bun run test:unit
14
+ "
15
+ environment:
16
+ - NAX_SKIP_PRECHECK=1
17
+ - CI=true
package/docs/ROADMAP.md CHANGED
@@ -24,47 +24,44 @@
24
24
 
25
25
  ---
26
26
 
27
- ## v0.18.1 — Type Safety + Per-Story testStrategy
27
+ ## v0.18.1 — Type Safety + CI Pipeline ✅
28
28
 
29
- **Theme:** Fix all TypeScript/lint errors + fine-grained test strategy control
30
- **Status:** 🔲 Planned
29
+ **Theme:** Fix all TypeScript/lint errors, establish CI pipeline
30
+ **Status:** ✅ Shipped (2026-03-03)
31
31
 
32
32
  ### TypeScript Fixes (60 errors across 21 files)
33
- - [ ] **TS-001:** Fix context module exports — add `BuiltContext`, `ContextElement`, `ContextBudget`, `StoryContext` to `context/types.ts` (13 errors)
34
- - [ ] **TS-002:** Fix config/command type safety — type `{}` → proper types in `config/loader.ts`, `commands/logs.ts`, `agents/claude.ts` (12 errors)
35
- - [ ] **TS-003:** Fix review/verification types — add `softViolations`, `warnings`, `description` to review result types (9 errors)
36
- - [ ] **TS-004:** Fix escalation PRD type construction — ensure escalation produces valid `PRD` objects (4 errors)
37
- - [ ] **TS-005:** Fix misc — Logger mock types, null checks, missing exports (`RectificationState`, `TestSummary`, `TestFailure`) (6 errors)
38
-
39
- ### Lint Fixes (12 errors)
40
- - [ ] **LINT-001:** Run `biome check --fix` + manual review of unsafe fixes
41
-
42
- ### Verify Stage Fix
43
- - [ ] **TEST-001:** Fix hanging "test command that throws error" test — add timeout or proper process kill
44
-
45
- ### Per-Story testStrategy
46
- - [ ] Add optional `testStrategy` field to userStory PRD schema (`"test-after" | "three-session-tdd" | "three-session-tdd-lite"`)
47
- - [ ] When set, overrides global config + task classification for that story
48
- - [ ] Update routing stage to check `story.testStrategy` before config/LLM
49
- - [ ] Docs + tests
50
-
51
- ### Re-enable Checks
52
- - [ ] Re-enable `typecheck` in `nax/config.json` review checks after TS fixes land
33
+ - [x] ~~**TS-001:** Fix context module exports (13 errors)~~
34
+ - [x] ~~**TS-002:** Fix config/command type safety (12 errors)~~
35
+ - [x] ~~**TS-003:** Fix review/verification types (9 errors)~~
36
+ - [x] ~~**TS-004:** Fix escalation PRD type construction (4 errors)~~
37
+ - [x] ~~**TS-005:** Fix misc types (6 errors)~~
38
+ - [x] ~~**LINT-001:** Run biome check --fix + manual review~~
39
+
40
+ ### CI Pipeline (new)
41
+ - [x] `.gitlab-ci.yml` — stages: test → release → notify
42
+ - [x] Image: `nathapp/node-bun:22.21.0-1.3.9-alpine` (test/release), `gkci/node:22.14.0-alpine-ci` (notify)
43
+ - [x] `before_script`: apk add git python3 make g++, safe.directory, git identity
44
+ - [x] Test env: `NAX_SKIP_PRECHECK=1 bun test test/ --timeout=60000`
45
+ - [x] CI skip guards for env-sensitive tests (claude binary, PID checks, subprocess integration)
46
+ - [x] Fixed `checkClaudeCLI()` ENOENT crash — try/catch around Bun.spawn
47
+ - [x] Release trigger: `[run-release]` in commit message on master
48
+ - [x] Runner requirement: 8GB shared runner (`saas-linux-small-amd64`)
49
+ - [x] **Result: 1952 pass, 56 skip, 0 fail**
53
50
 
54
51
  ---
55
52
 
56
- ## v0.18.2 — Smart Test Runner + Bun PTY Migration
53
+ ## v0.18.2 — Smart Test Runner + Routing Fix
57
54
 
58
- **Theme:** Scope verify to changed files only + remove node-pty native addon
59
- **Status:** 🔲 Planned
55
+ **Theme:** Scope verify to changed files only + fix routing override
56
+ **Status:** ✅ Shipped (2026-03-03)
60
57
 
61
58
  ### Smart Test Runner
62
- - [ ] After agent implementation, run `git diff --name-only` to get changed source files
63
- - [ ] Map source → test files by naming convention (`src/foo/bar.ts` → `test/unit/foo/bar.test.ts`)
64
- - [ ] Run only related tests for verify (instead of full suite)
65
- - [ ] Fallback to full suite when mapping yields no test files
66
- - [ ] Config flag `execution.smartTestRunner: true` (default: true) to opt out
67
- - [ ] Result: verify drops from ~125s to ~10-20s for typical single-file fixes
59
+ - [x] ~~After agent implementation, run `git diff --name-only` to get changed source files~~
60
+ - [x] ~~Map source → test files by naming convention (`src/foo/bar.ts` → `test/unit/foo/bar.test.ts`)~~
61
+ - [x] ~~Run only related tests for verify (instead of full suite)~~
62
+ - [x] ~~Fallback to full suite when mapping yields no test files~~
63
+ - [x] ~~Config flag `execution.smartTestRunner: true` (default: true) to opt out~~
64
+ - [x] ~~Result: verify drops from ~125s to ~10-20s for typical single-file fixes~~
68
65
 
69
66
  ### Bun PTY Migration (BUN-001)
70
67
  - [ ] Replace `node-pty` (native addon, requires python/make/g++ to build) with `Bun.Terminal` API (v1.3.5+)
@@ -83,12 +80,68 @@
83
80
 
84
81
  ---
85
82
 
86
- ## v0.19.0 — Central Run Registry
83
+ ## v0.18.3 — Execution Reliability
84
+
85
+ **Theme:** Fix execution pipeline bugs (escalation, routing, review), structured failure context, and Smart Runner enhancement
86
+ **Status:** ✅ Shipped (2026-03-04)
87
+ **Spec:** [docs/specs/verification-architecture-v2.md](specs/verification-architecture-v2.md) (Phase 1)
88
+
89
+ ### Bugfixes — Completed
90
+ - [x] **BUG-026:** Regression gate timeout → accept scoped pass + warn (not escalate). Config: `regressionGate.acceptOnTimeout: true`.
91
+ - [x] **BUG-028:** Routing cache ignores escalation tier — `clearCacheForStory(storyId)` in `llm.ts`, called on tier escalation in both `preIterationTierCheck()` and `handleTierEscalation()`.
92
+
93
+ ### Structured Failure Context — Completed
94
+ - [x] **SFC-001:** `StructuredFailure` type with `TestFailureContext[]` + `priorFailures?: StructuredFailure[]` on `UserStory`. Populated on verify, regression, rectification, and escalation failures.
95
+ - [x] **SFC-002:** Format `priorFailures` into agent prompt at priority 95 via `createPriorFailuresContext()` in `context/builder.ts`.
96
+
97
+ ### Bugfixes — Completed (Round 2)
98
+ - [x] **BUG-029:** Escalation resets story to `pending` → bypasses BUG-022 retry priority. After escalation, `getNextStory()` picks the next pending story instead of retrying the escalated one. **Location:** `src/prd/index.ts:getNextStory()`. **Fix:** Recognize escalated-pending stories in Priority 1 (e.g. check `story.routing.modelTier` changed, or use `"retry-pending"` status).
99
+ - [x] **BUG-030:** Review lint/typecheck failure → hard `"fail"`, no rectification or retry. `review.ts:92` returns `{ action: "fail" }` → `markStoryFailed()` permanently. Lint errors are auto-fixable but story is killed with zero retry. **Fix:** Return `"escalate"` for lint/typecheck failures (or add review-rectification loop). Reserve `"fail"` for plugin reviewer rejection only.
100
+ - [x] **BUG-032:** Routing stage overrides escalated `modelTier` with complexity-derived tier. `routing.ts:43` always runs `complexityToModelTier()` even when `story.routing.modelTier` was set by escalation → escalated tier silently ignored. BUG-013 fix (`applyCachedRouting`) runs too late. **Fix:** Skip `complexityToModelTier()` when `story.routing.modelTier` is explicitly set.
101
+
102
+ ### STR-007: Smart Test Runner Enhancement — Completed
103
+ - [x] Configurable `testFilePatterns` in config (default: `test/**/*.test.ts`)
104
+ - [x] `testFileFallback` config option: `"import-grep"` | `"full-suite"` (default: `"import-grep"`)
105
+ - [x] 3-pass test discovery: path-convention → import-grep (grep test files for changed module name) → full-suite
106
+ - [x] Config schema update: `execution.smartTestRunner` becomes object `{ enabled, testFilePatterns, fallback }` (backward compat: boolean coerced)
107
+
108
+ ---
109
+
110
+ ## v0.18.4 — Routing Stability
111
+
112
+ **Theme:** Fix routing classifier consistency and LLM routing reliability
113
+ **Status:** 🔲 Planned
114
+
115
+ ### Bugfixes
116
+ - [ ] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. `priorErrors` text shifts keyword classification. **Fix:** Keyword classifier should only use original story fields; or lock `story.routing.testStrategy` once set.
117
+ - [ ] **BUG-033:** LLM routing has no retry on timeout — single 15s attempt, then keyword fallback. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Raise default timeout to 30s for batch routing.
118
+
119
+ ---
120
+
121
+ ## v0.19.0 — Verification Architecture v2
87
122
 
88
- **Theme:** Unified run tracking across worktrees + dashboard integration
123
+ **Theme:** Eliminate duplicate test runs, deferred regression gate, structured escalation context
89
124
  **Status:** 🔲 Planned
125
+ **Spec:** [docs/specs/verification-architecture-v2.md](specs/verification-architecture-v2.md) (Phase 2)
126
+
127
+ ### Remove Duplicate Test Execution
128
+ - [ ] Pipeline verify stage is the single test execution point (Smart Test Runner)
129
+ - [ ] Remove scoped re-test in `post-verify.ts` (duplicate of pipeline verify)
130
+ - [ ] Review stage runs typecheck + lint only — remove `review.commands.test` execution
90
131
 
91
- - [ ] **Central Run Registry** — `~/.nax/runs/<project>-<feature>-<runId>/` with status.json + events.jsonl symlink. Dashboard reads from registry.
132
+ ### Deferred Regression Gate
133
+ - [ ] New `src/execution/lifecycle/run-regression.ts` — run full suite once at run-end (not per-story)
134
+ - [ ] Reverse Smart Test Runner mapping: failing test → source file → responsible story
135
+ - [ ] Targeted rectification per responsible story with full failure context
136
+ - [ ] Config: `execution.regressionGate.mode: "deferred" | "per-story" | "disabled"` (default `"deferred"`)
137
+ - [ ] Call deferred regression in `run-completion.ts` before final metrics
138
+
139
+ ### Full Structured Failure Context
140
+ - [ ] `priorFailures` injected into escalated agent prompts via `context/builder.ts`
141
+ - [ ] Reverse file mapping for regression attribution
142
+
143
+ ### Central Run Registry (carried forward)
144
+ - [ ] `~/.nax/runs/<project>-<feature>-<runId>/` with status.json + events.jsonl symlink
92
145
 
93
146
  ---
94
147
 
@@ -96,6 +149,9 @@
96
149
 
97
150
  | Version | Theme | Date | Details |
98
151
  |:---|:---|:---|:---|
152
+ | v0.18.1 | Type Safety + CI Pipeline | 2026-03-03 | 60 TS errors + 12 lint errors fixed, GitLab CI green (1952/56/0) |
153
+ | v0.18.3 | Execution Reliability + Smart Runner | 2026-03-04 | BUG-026/028/029/030/032 + SFC-001/002 + STR-007, all items complete |
154
+ | v0.18.2 | Smart Test Runner + Routing Fix | 2026-03-03 | FIX-001 + STR-001–006, 2038 pass/11 skip/0 fail |
99
155
  | v0.18.0 | Orchestration Quality | 2026-03-03 | BUG-016/017/018/019/020/021/022/023/025 all fixed |
100
156
  | v0.17.0 | Config Management | 2026-03-02 | CM-001 --explain, CM-002 --diff, CM-003 default view |
101
157
  | v0.16.4 | Bugfixes: Routing + Env Allowlist | 2026-03-02 | BUG-012/013/014 |
@@ -130,7 +186,28 @@
130
186
  - [x] ~~BUG-012: Greenfield detection ignores pre-existing test files~~
131
187
  - [x] ~~BUG-013: Escalation routing not applied in iterations~~
132
188
  - [x] ~~BUG-014: buildAllowedEnv() strips USER/LOGNAME~~
189
+ <<<<<<< Updated upstream
190
+ - [x] ~~**BUG-015:** `loadConstitution()` leaks global `~/.nax/constitution.md` into unit tests — fixed via `skipGlobal: true` in all unit tests~~
191
+ =======
133
192
  - [ ] **BUG-015:** `loadConstitution()` leaks global `~/.nax/constitution.md` into unit tests
193
+ - [ ] **BUG-027:** `runPrecheck()` always prints to stdout — pollutes test output when called programmatically.
194
+ - **Observed (2026-03-03):** `bun test` output starts with precheck JSON from `US-002-orchestrator.test.ts` calling `runPrecheck()`, which unconditionally calls `console.log()`. nax verify stage captures this, making every failure look like a `git-repo-exists` blocker.
195
+ - **Root cause:** `runPrecheck()` mixes side-effects (printing) with logic (returning result).
196
+ - **Fix:** Add `silent?: boolean` to `PrecheckOptions`; test callers pass `silent: true`.
197
+ - **Workaround (active):** `silent` option + test update shipped in v0.18.2 branch.
198
+ - **Target:** v0.18.2
199
+ - [ ] **BUG-028:** Routing cache ignores escalation tier — escalated stories re-run at original tier.
200
+ - **Observed (2026-03-03):** STR-006 escalated to `powerful`. Router returned LLM cache hit from prior `balanced` run → agent ran as `balanced` anyway.
201
+ - **Root cause:** Cache key does not include requested tier. Lower-tier cache hit served for higher-tier request.
202
+ - **Fix:** Include `requestedTier` in cache key; only serve cache hit if cached tier >= requested tier.
203
+ - **Target:** v0.19.0
204
+ - [ ] **BUG-026:** Regression gate failure triggers full story re-implementation instead of targeted rectification.
205
+ - **Observed (2026-03-03):** During v0.18.2 smart-runner development on Mac01, STR-001 passed scoped verification (5/5 tests green) but the full-suite regression gate timed out (exit code 132, SIGILL/Bun crash). nax treated this as a story failure and re-ran the coding agent, which rewrote already-correct code. The retry agent then produced a different (worse) implementation that failed verification.
206
+ - **Root cause:** Escalation logic does not distinguish between "story code is wrong" and "story code is fine but introduced a regression". Both flow through the same retry path.
207
+ - **Fix:** After regression gate failure, spawn a rectification agent with context of what regressed (failing test names + diff), not a full story re-implementation. Only fall back to full re-implementation if rectification also fails.
208
+ - **Workaround (active):** Disabled regression gate via `rectification.enabled: false` in project nax/config.json for self-dev runs. CI on VPS is the regression gate instead.
209
+ - **Target:** v0.19.0
210
+ >>>>>>> Stashed changes
134
211
  - [x] ~~**BUG-016:** Hardcoded 120s timeout in pipeline verify stage → fixed in v0.18.0~~
135
212
  - [x] ~~**BUG-017:** run.complete not emitted on SIGTERM → fixed in v0.18.0~~
136
213
  - [x] ~~**BUG-018:** Test-writer wastes ~3min/retry when tests already exist → fixed in v0.18.0~~
@@ -141,12 +218,20 @@
141
218
  - [x] ~~**BUG-023:** Agent failure silent — no exitCode/stderr in JSONL → fixed in v0.18.0~~
142
219
  - [x] ~~**BUG-025:** `needsHumanReview` not triggering interactive plugin → fixed in v0.18.0~~
143
220
 
221
+ - [x] **BUG-029:** Escalation resets story to `pending` → bypasses BUG-022 retry priority. `handleTierEscalation()` sets `status: "pending"` after escalation, but `getNextStory()` Priority 1 only checks `status === "failed"`. Result: after BUG-026 escalated (iter 1), nax moved to BUG-028 (iter 2) instead of retrying BUG-026 immediately. **Location:** `src/prd/index.ts:getNextStory()` + `src/execution/escalation/tier-escalation.ts`. **Fix:** `getNextStory()` should also prioritize stories with `story.routing.modelTier` that changed since last attempt (escalation marker), or `handleTierEscalation` should use a distinct status like `"retry-pending"` that Priority 1 recognizes.
222
+ - [x] **BUG-030:** Review lint failure → hard `"fail"`, no rectification or retry. `src/pipeline/stages/review.ts:92` returns `{ action: "fail" }` for all review failures including lint. In `pipeline-result-handler.ts`, `"fail"` calls `markStoryFailed()` — permanently dead. But lint errors are auto-fixable (agent can run `biome check --fix`). Contrast with verify stage which returns `"escalate"` on test failure, allowing retry. SFC-001 and SFC-002 both hit this — tests passed but 5 Biome lint errors killed the stories permanently. **Fix:** Review stage should return `"escalate"` (not `"fail"`) for lint/typecheck failures, or add a review-rectification loop (like verify has) that gives the agent one retry with the lint output as context. Reserve `"fail"` for unfixable review issues (e.g. plugin reviewer rejection).
223
+ - [ ] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. BUG-026 was classified as `test-after` on iter 1 (keyword fallback), but `three-session-tdd-lite` on iter 5 (same keyword fallback). The keyword classifier in `src/routing/strategies/keyword.ts:classifyComplexity()` may be influenced by `priorErrors` text added between attempts, shifting the keyword match result. **Location:** `src/routing/strategies/keyword.ts`. **Fix:** Keyword classifier should only consider the story's original title + description + acceptance criteria, not accumulated `priorErrors` or `priorFailures`. Alternatively, once a strategy is set in `story.routing.testStrategy`, the routing stage should preserve it across retries (already partially done in `routing.ts:40-41` but may not apply when LLM falls back to keyword).
224
+ - [x] **BUG-032:** Routing stage overrides escalated `modelTier` with complexity-derived tier. `src/pipeline/stages/routing.ts:43` always runs `complexityToModelTier(routing.complexity, config)` even when `story.routing.modelTier` was explicitly set by `handleTierEscalation()`. BUG-026 was escalated to `balanced` (logged in iteration header), but `Task classified` shows `modelTier=fast` because `complexityToModelTier("simple", config)` → `"fast"`. Related to BUG-013 (escalation routing not applied) which was marked fixed, but the fix in `applyCachedRouting()` in `pipeline-result-handler.ts:295-310` runs **after** the routing stage — too late. **Location:** `src/pipeline/stages/routing.ts:43`. **Fix:** When `story.routing.modelTier` is explicitly set (by escalation), skip `complexityToModelTier()` and use the cached tier directly. Only derive from complexity when `story.routing.modelTier` is absent.
225
+ - [ ] **BUG-033:** LLM routing has no retry on timeout — single attempt with hardcoded 15s default. All 5 LLM routing attempts in the v0.18.3 run timed out at 15s, forcing keyword fallback every time. `src/routing/strategies/llm.ts:63` reads `llmConfig?.timeoutMs ?? 15000` but there's no retry logic — one timeout = immediate fallback. **Location:** `src/routing/strategies/llm.ts:callLlm()`. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Also surface `routing.llm.timeoutMs` in `nax config --explain` and consider raising default to 30s for batch routing which processes multiple stories.
226
+
144
227
  ### Features
145
228
  - [x] ~~`nax unlock` command~~
146
229
  - [x] ~~Constitution file support~~
147
230
  - [x] ~~Per-story testStrategy override — v0.18.1~~
148
231
  - [x] ~~Smart Test Runner — v0.18.2~~
149
232
  - [x] ~~Central Run Registry — v0.19.0~~
233
+ - [ ] **BUN-001:** Bun PTY Migration — replace `node-pty` with `Bun.Terminal` API
234
+ - [ ] **CI-001:** CI Memory Optimization — parallel test sharding for 1GB runners
150
235
  - [ ] Cost tracking dashboard
151
236
  - [ ] npm publish setup
152
237
  - [ ] `nax diagnose --ai` flag (LLM-assisted, future TBD)
@@ -162,4 +247,4 @@ Sequential canary → stable: `v0.12.0-canary.0` → `canary.N` → `v0.12.0`
162
247
  Canary: `npm publish --tag canary`
163
248
  Stable: `npm publish` (latest)
164
249
 
165
- *Last updated: 2026-03-03 (v0.18.0 shipped — all 9 bugs fixed)*
250
+ *Last updated: 2026-03-04 (v0.18.3 shipped; v0.18.4: BUG-031/033; v0.19.0: Verification Architecture v2)*
@@ -0,0 +1,343 @@
1
+ # Verification Architecture v2
2
+
3
+ **Status:** Proposal
4
+ **Target:** v0.19.0
5
+ **Author:** Nax Dev
6
+ **Date:** 2026-03-04
7
+ **Fixes:** BUG-026, BUG-028, plus architectural debt in verification pipeline
8
+
9
+ ---
10
+
11
+ ## 1. Problems with Current Architecture
12
+
13
+ ### 1.1 Triple Test Execution (Waste)
14
+
15
+ Current per-story flow runs tests up to 3 times:
16
+
17
+ ```
18
+ Pipeline verify stage → scoped tests (Smart Test Runner)
19
+ Pipeline review stage → test command (if review.commands.test configured)
20
+ Post-verify → scoped tests AGAIN + full regression gate
21
+ ```
22
+
23
+ On Mac01 with ~2000 tests, this means:
24
+ - Scoped: ~10-20s × 2 (duplicate run) = 20-40s, of which ~10-20s is wasted
25
+ - Full regression: ~125s per story
26
+ - Total: ~150s+ of test execution per story
27
+
28
+ ### 1.2 Regression Gate Per Story (BUG-026)
29
+
30
+ The regression gate runs a **full test suite after every story**. Problems:
31
+ - **Timeout:** Full suite frequently times out on Mac01 (~125s)
32
+ - **False escalation:** Timeout is treated as story failure → bumps `story.attempts` → triggers tier escalation
33
+ - **Wasted compute:** Agent's implementation was correct (scoped tests passed), but full suite timeout causes a complete redo at a higher (more expensive) tier
34
+ - **Cascading waste:** N stories × 1 full suite each = N full suite runs. Most are redundant.
35
+
36
+ ### 1.3 Escalation Context Loss
37
+
38
+ When a story fails and escalates to a higher tier, the error context passed is:
39
+
40
+ ```
41
+ priorErrors: ["Attempt 1 failed with model tier: fast"]
42
+ ```
43
+
44
+ The actual test output — which tests failed, error messages, stack traces — is **discarded**. The escalated agent gets a vague hint instead of actionable failure context.
45
+
46
+ | Stage | Context Available | What's Stored in priorErrors |
47
+ |-------|-------------------|------------------------------|
48
+ | Rectification loop | Full `TestFailure[]` with file, testName, error, stackTrace | *(used internally, then discarded)* |
49
+ | Post-verify failure | `verificationResult.error` (summary string) | Generic: `"Verification failed: TEST_FAILURE"` |
50
+ | Regression gate failure | Full test output | Generic: `"REGRESSION: full-suite regression detected"` |
51
+ | Tier escalation | Nothing new | `"Attempt N failed with model tier: X"` |
52
+
53
+ Result: `fast → balanced → powerful` escalation chain has **zero actionable context** about what actually failed.
54
+
55
+ ### 1.4 Routing Cache Ignores Escalation Tier (BUG-028)
56
+
57
+ LLM routing cache is keyed by `story.id` only. When escalation updates `story.routing.modelTier` from `balanced` → `powerful`, the next iteration hits the cache and returns the old `balanced` routing decision, overriding the escalation.
58
+
59
+ ---
60
+
61
+ ## 2. Proposed Architecture
62
+
63
+ ### 2.1 Verification Flow (Simplified)
64
+
65
+ ```
66
+ Pipeline per-story:
67
+ 1. Agent execution
68
+ 2. Scoped verify (Smart Test Runner) ← ONLY test run per story
69
+ 3. Scoped rectification (if verify fails) ← has full test failure context
70
+ 4. Review (typecheck + lint only) ← NO test re-run
71
+ 5. Story marked "passed" or escalated
72
+
73
+ Run-end (after all stories pass):
74
+ 6. Deferred regression gate (full suite) ← ONE full suite run total
75
+ 7. Targeted regression rectification ← per-story, with failure context
76
+ 8. Run marked complete or stalled
77
+ ```
78
+
79
+ **Key changes:**
80
+ - **Remove duplicate test runs** — pipeline verify is the single source of truth
81
+ - **Review stage runs typecheck + lint only** — no test command
82
+ - **Remove post-verify scoped re-test** — pipeline verify already did this
83
+ - **Move regression gate to run-end** — one full suite run instead of N
84
+ - **Targeted regression rectification** — map failing tests back to responsible stories
85
+
86
+ ### 2.2 Deferred Regression Gate
87
+
88
+ Instead of running the full suite after every story, run it **once** after all stories complete.
89
+
90
+ ```typescript
91
+ // New: src/execution/lifecycle/run-regression.ts
92
+
93
+ interface DeferredRegressionOptions {
94
+ config: NaxConfig;
95
+ workdir: string;
96
+ prd: PRD;
97
+ prdPath: string;
98
+ allStoryMetrics: StoryMetrics[];
99
+ }
100
+
101
+ interface DeferredRegressionResult {
102
+ passed: boolean;
103
+ failedTests?: TestFailure[];
104
+ storyMapping?: Map<string, TestFailure[]>; // storyId → failures caused by that story
105
+ }
106
+ ```
107
+
108
+ **Failure handling:**
109
+ 1. Run full suite
110
+ 2. Parse failures into `TestFailure[]`
111
+ 3. For each failing test, use reverse Smart Test Runner mapping:
112
+ - `test/unit/foo/bar.test.ts` → `src/foo/bar.ts` → which story touched this file? (from git log per story)
113
+ 4. Group failures by responsible story
114
+ 5. Attempt targeted rectification per story (agent gets FULL failure context)
115
+ 6. Re-run full suite to confirm fix
116
+ 7. If still failing → mark responsible stories as failed
117
+
118
+ **Config:**
119
+
120
+ ```jsonc
121
+ {
122
+ "execution": {
123
+ "regressionGate": {
124
+ "enabled": true,
125
+ "mode": "deferred", // "deferred" | "per-story" | "disabled"
126
+ "timeoutSeconds": 300,
127
+ "maxRectificationAttempts": 2
128
+ }
129
+ }
130
+ }
131
+ ```
132
+
133
+ ### 2.3 Structured Failure Context for Escalation
134
+
135
+ Replace vague `priorErrors` strings with structured failure data.
136
+
137
+ **New PRD field:** `priorFailures` (alongside existing `priorErrors` for backward compat)
138
+
139
+ ```typescript
140
+ // In src/prd/types.ts
141
+
142
+ interface StructuredFailure {
143
+ /** Which attempt this failure occurred on */
144
+ attempt: number;
145
+ /** Model tier that was used */
146
+ modelTier: string;
147
+ /** What stage failed */
148
+ stage: "verify" | "review" | "regression" | "rectification" | "agent-session";
149
+ /** Human-readable summary */
150
+ summary: string;
151
+ /** Structured test failures (if applicable) */
152
+ testFailures?: TestFailureContext[];
153
+ /** Timestamp */
154
+ timestamp: string;
155
+ }
156
+
157
+ interface TestFailureContext {
158
+ file: string;
159
+ testName: string;
160
+ error: string;
161
+ /** First 5 lines of stack trace */
162
+ stackTrace: string[];
163
+ }
164
+ ```
165
+
166
+ **How it flows through escalation:**
167
+
168
+ ```
169
+ fast attempt 1 → verify fails
170
+ → priorFailures: [{
171
+ attempt: 1,
172
+ modelTier: "fast",
173
+ stage: "verify",
174
+ summary: "3 tests failed in src/routing/router.ts",
175
+ testFailures: [
176
+ { file: "test/unit/routing/router.test.ts",
177
+ testName: "should route to balanced",
178
+ error: "Expected 'balanced' got 'fast'",
179
+ stackTrace: [...] },
180
+ ...
181
+ ]
182
+ }]
183
+
184
+ balanced attempt 1 → agent gets FULL context of what fast couldn't fix
185
+ ```
186
+
187
+ **Context injection** (`context/builder.ts`):
188
+
189
+ Format `priorFailures` into actionable markdown for the agent prompt:
190
+
191
+ ```markdown
192
+ ## Prior Attempt 1 (fast, verify)
193
+ 3 tests failed in src/routing/router.ts
194
+
195
+ ### Test Failures:
196
+ - **test/unit/routing/router.test.ts** > should route to balanced
197
+ Error: Expected 'balanced' got 'fast'
198
+ Stack: at Router.route (src/routing/router.ts:42)
199
+ ```
200
+
201
+ ### 2.4 BUG-028 Fix: Cache Invalidation on Escalation
202
+
203
+ Add `clearCacheForStory(storyId)` to `src/routing/strategies/llm.ts`.
204
+
205
+ Call it in `tier-escalation.ts` when updating `story.routing.modelTier`.
206
+
207
+ ---
208
+
209
+ ## 3. Migration Plan
210
+
211
+ ### Phase 1: v0.18.3 — Minimal Fixes (no architecture change)
212
+
213
+ 1. **BUG-026 quick fix:** Regression gate timeout → accept scoped pass + warn (not escalate)
214
+ 2. **BUG-028 fix:** `clearCacheForStory()` on escalation
215
+ 3. **Store structured failures:** Start populating `priorFailures` alongside `priorErrors` (backward compat)
216
+
217
+ ### Phase 2: v0.19.0 — Architecture v2
218
+
219
+ 1. **Remove post-verify duplicate test run** — pipeline verify is authoritative
220
+ 2. **Review stage: typecheck + lint only** — remove test command from review
221
+ 3. **Deferred regression gate** — run-end full suite with targeted rectification
222
+ 4. **Reverse Smart Test Runner mapping** — failing test → source file → responsible story
223
+ 5. **Full structured failure context** — `priorFailures` injected into agent prompts
224
+ 6. **Config:** `regressionGate.mode: "deferred"` (default)
225
+
226
+ ### Phase 3: Future
227
+
228
+ - **Incremental regression:** Only run tests related to ALL changed files across all stories (union of Smart Test Runner scopes)
229
+ - **Test impact analysis:** AST-based dependency graph for more precise test scoping
230
+ - **Parallel story regression:** Run rectification for multiple stories concurrently
231
+
232
+ ---
233
+
234
+ ## 4. Files Affected
235
+
236
+ ### Phase 1 (v0.18.3)
237
+
238
+ | File | Change |
239
+ |------|--------|
240
+ | `src/execution/post-verify.ts` | Regression gate timeout → accept + warn |
241
+ | `src/routing/strategies/llm.ts` | Add `clearCacheForStory()` export |
242
+ | `src/execution/escalation/tier-escalation.ts` | Call `clearCacheForStory()` on escalation |
243
+ | `src/execution/post-verify-rectification.ts` | Store `StructuredFailure` in `priorFailures` |
244
+ | `src/prd/types.ts` | Add `priorFailures?: StructuredFailure[]` to `UserStory` |
245
+
246
+ ### Phase 2 (v0.19.0)
247
+
248
+ | File | Change |
249
+ |------|--------|
250
+ | `src/pipeline/stages/review.ts` | Remove test command execution |
251
+ | `src/execution/post-verify.ts` | Remove scoped re-test, keep regression call only |
252
+ | `src/execution/lifecycle/run-regression.ts` | **New:** Deferred regression gate + targeted rectification |
253
+ | `src/execution/lifecycle/run-completion.ts` | Call deferred regression before final metrics |
254
+ | `src/verification/smart-runner.ts` | Add reverse mapping: test file → source file → story |
255
+ | `src/context/builder.ts` | Format `priorFailures` into agent prompt |
256
+ | `src/config/schemas.ts` | Add `regressionGate.mode` enum |
257
+
258
+ ---
259
+
260
+ ## 5. Test Plan
261
+
262
+ ### Phase 1 Tests
263
+ - Regression gate timeout returns "passed" with warning (not "failed")
264
+ - `clearCacheForStory()` removes the cached decision; the next `route()` call re-evaluates
265
+ - `priorFailures` populated with structured `TestFailureContext` on verify failure
266
+ - Backward compat: `priorErrors` still populated alongside `priorFailures`
267
+
268
+ ### Phase 2 Tests
269
+ - Pipeline verify is single test execution (no duplicate)
270
+ - Review stage skips test command
271
+ - Deferred regression runs once at run-end
272
+ - Reverse mapping correctly identifies responsible story
273
+ - Targeted rectification receives full failure context
274
+ - Escalated agent prompt includes formatted `priorFailures`
275
+ - Config `regressionGate.mode: "per-story"` preserves current behavior
276
+
277
+ ---
278
+
279
+ ## 6. Historical Context (Why It's Like This)
280
+
281
+ ### Why post-verify exists separately from pipeline verify
282
+
283
+ The pipeline (`src/pipeline/pipeline.ts`) runs stages in sequence: routing → context → prompt → execution → **verify** → review → completion. This was the original single verification point.
284
+
285
+ Later, **post-agent verification** was added in `src/execution/pipeline-result-handler.ts` → `handlePipelineSuccess()` → `runPostAgentVerification()`. This was meant to handle:
286
+ - **Scoped verification** with git-diff-based test file detection (before Smart Test Runner existed in the pipeline)
287
+ - **Rectification** — retry loop with agent when tests fail
288
+ - **Regression gate** (BUG-009 fix) — full suite after scoped pass
289
+
290
+ When Smart Test Runner was added to the **pipeline verify stage** (v0.18.2), it duplicated the scoped test logic that post-verify already had. Nobody removed the post-verify scoped test.
291
+
292
+ ### Current code flow with exact locations
293
+
294
+ ```
295
+ sequential-executor.ts:170 → pipelineRunner.run(story)
296
+ pipeline.ts:execute() → runs stages in order:
297
+ verify.ts:execute() → Smart Test Runner scoped tests [TEST RUN #1]
298
+ review.ts:execute() → runReview() which may run tests [TEST RUN #2 if review.commands.test set]
299
+
300
+ pipeline-result-handler.ts:76 → runPostAgentVerification()
301
+ post-verify.ts:85 → runVerification(scopedCommand) [TEST RUN #3 — duplicate of #1]
302
+ post-verify.ts:118 → runRegressionGate()
303
+ post-verify.ts:180 → runVerification(fullSuite) [TEST RUN #4 — full suite]
304
+ ```
305
+
306
+ ### Review stage test command
307
+
308
+ `review.ts` calls `runReview()` from `src/review/index.ts`, which runs `config.review.commands.test` if configured. In the default config, `review.commands` includes `test`, `typecheck`, and `lint`, so review runs tests by default — creating the triple-test problem (pipeline verify, review, and post-verify each run tests).
309
+
310
+ ### Decision rationale
311
+
312
+ **Why deferred regression (Option C) over per-story (A) or disabled (B):**
313
+ - **Option A (keep per-story):** 125s timeout per story is the root cause of BUG-026. Even with timeout-acceptance, it's wasteful.
314
+ - **Option B (disable entirely):** Too risky — cross-story regressions are real (BUG-009 was filed for this exact reason).
315
+ - **Option C (deferred):** One full suite run at the end. If it fails, we can trace back to responsible stories via reverse file mapping. Best balance of safety vs speed.
316
+
317
+ **Why cache invalidation (Option C for BUG-028) over cache key change (A) or bypass (B):**
318
+ - **Option A (include tier in key):** Works, but creates multiple cache entries per story: if a story is re-routed 3 times, 3 entries exist. Cache eviction becomes unpredictable.
319
+ - **Option B (bypass when routing set):** Almost all stories have `story.routing` set after the first pass, so the cache would rarely be used at all — which defeats the purpose.
320
+ - **Option C (clear on escalation):** Surgical — one `delete()` call at the exact moment routing changes. Cache works normally for non-escalated stories.
321
+
322
+ ## 7. Edge Cases
323
+
324
+ ### Partial completion (stalled run)
325
+
326
+ If only 3 of 5 stories pass and nax stalls (remaining stories failed/paused):
327
+ - Deferred regression still runs on the 3 passed stories
328
+ - If regression fails, only the passed stories are candidates for rectification
329
+ - Failed/paused stories are untouched
330
+
331
+ ### Stories that touch the same files
332
+
333
+ If story A and story B both modify `src/utils/parser.ts`:
334
+ - Reverse mapping may attribute the same failing test to both stories
335
+ - Rectification should try the **last story that touched the file** first (git log order)
336
+ - If that doesn't fix it, try the other story
337
+
338
+ ### No test mapping possible
339
+
340
+ If a failing test can't be mapped to any story's changed files:
341
+ - Log warning: "Unmapped regression — cannot attribute to a specific story"
342
+ - Mark ALL passed stories as needing re-verification
343
+ - This is the worst case but should be rare with good test naming conventions