@nathapp/nax 0.18.2 → 0.18.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/rules/01-project-conventions.md +34 -0
- package/.claude/rules/02-test-architecture.md +39 -0
- package/.claude/rules/03-test-writing.md +58 -0
- package/.claude/rules/04-forbidden-patterns.md +29 -0
- package/.githooks/pre-commit +13 -0
- package/.gitlab-ci.yml +11 -5
- package/CHANGELOG.md +9 -0
- package/CLAUDE.md +45 -122
- package/bun.lock +1 -1
- package/bunfig.toml +2 -1
- package/docker-compose.test.yml +15 -0
- package/docs/ROADMAP.md +83 -14
- package/docs/specs/verification-architecture-v2.md +343 -0
- package/nax/config.json +7 -7
- package/nax/features/v0.18.3-execution-reliability/prd.json +80 -0
- package/nax/features/v0.18.3-execution-reliability/progress.txt +3 -0
- package/package.json +2 -2
- package/src/config/defaults.ts +1 -0
- package/src/config/schema.ts +1 -0
- package/src/config/schemas.ts +26 -1
- package/src/config/types.ts +21 -4
- package/src/context/builder.ts +11 -0
- package/src/context/elements.ts +38 -1
- package/src/execution/escalation/tier-escalation.ts +28 -3
- package/src/execution/post-verify-rectification.ts +4 -2
- package/src/execution/post-verify.ts +102 -20
- package/src/execution/progress.ts +2 -0
- package/src/pipeline/stages/execution.ts +10 -2
- package/src/pipeline/stages/review.ts +5 -3
- package/src/pipeline/stages/routing.ts +28 -9
- package/src/pipeline/stages/verify.ts +49 -8
- package/src/prd/index.ts +16 -1
- package/src/prd/types.ts +33 -0
- package/src/routing/strategies/keyword.ts +7 -4
- package/src/routing/strategies/llm.ts +45 -4
- package/src/verification/gate.ts +2 -1
- package/src/verification/smart-runner.ts +68 -0
- package/src/verification/types.ts +2 -0
- package/test/context/prior-failures.test.ts +462 -0
- package/test/execution/structured-failure.test.ts +414 -0
- package/test/integration/logger.test.ts +1 -1
- package/test/{US-002-orchestrator.test.ts → integration/precheck-orchestrator.test.ts} +3 -3
- package/test/integration/review-plugin-integration.test.ts +2 -1
- package/test/integration/story-id-in-events.test.ts +1 -1
- package/test/unit/config/smart-runner-flag.test.ts +36 -12
- package/test/unit/execution/post-verify-regression.test.ts +415 -0
- package/test/{execution → unit/execution}/post-verify.test.ts +33 -1
- package/test/unit/pipeline/routing-partial-override.test.ts +15 -36
- package/test/unit/pipeline/verify-smart-runner.test.ts +8 -6
- package/test/unit/prd-get-next-story.test.ts +28 -0
- package/test/unit/routing/routing-stability.test.ts +207 -0
- package/test/unit/routing.test.ts +102 -0
- package/test/unit/storyid-events.test.ts +20 -32
- package/test/unit/verification/smart-runner-config.test.ts +162 -0
- package/test/unit/verification/smart-runner-discovery.test.ts +353 -0
- package/test/TEST_COVERAGE_US001.md +0 -217
- package/test/TEST_COVERAGE_US003.md +0 -84
- package/test/TEST_COVERAGE_US005.md +0 -86
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# Verification Architecture v2
|
|
2
|
+
|
|
3
|
+
**Status:** Proposal
|
|
4
|
+
**Target:** v0.19.0
|
|
5
|
+
**Author:** Nax Dev
|
|
6
|
+
**Date:** 2026-03-04
|
|
7
|
+
**Fixes:** BUG-026, BUG-028, plus architectural debt in verification pipeline
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 1. Problems with Current Architecture
|
|
12
|
+
|
|
13
|
+
### 1.1 Triple Test Execution (Waste)
|
|
14
|
+
|
|
15
|
+
The current per-story flow runs tests in up to three separate stages:
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
Pipeline verify stage → scoped tests (Smart Test Runner)
|
|
19
|
+
Pipeline review stage → test command (if review.commands.test configured)
|
|
20
|
+
Post-verify → scoped tests AGAIN + full regression gate
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
On Mac01 with ~2000 tests, this means:
|
|
24
|
+
- Scoped: ~10-20s × 2 runs = 20-40s, of which the second run (~10-20s) is a wasted duplicate
|
|
25
|
+
- Full regression: ~125s per story
|
|
26
|
+
- Total: ~150s+ of test execution per story
|
|
27
|
+
|
|
28
|
+
### 1.2 Regression Gate Per Story (BUG-026)
|
|
29
|
+
|
|
30
|
+
The regression gate runs a **full test suite after every story**. Problems:
|
|
31
|
+
- **Timeout:** Full suite frequently times out on Mac01 (~125s)
|
|
32
|
+
- **False escalation:** Timeout is treated as story failure → bumps `story.attempts` → triggers tier escalation
|
|
33
|
+
- **Wasted compute:** Agent's implementation was correct (scoped tests passed), but full suite timeout causes a complete redo at a higher (more expensive) tier
|
|
34
|
+
- **Cascading waste:** N stories × 1 full suite each = N full suite runs. Most are redundant.
|
|
35
|
+
|
|
36
|
+
### 1.3 Escalation Context Loss
|
|
37
|
+
|
|
38
|
+
When a story fails and escalates to a higher tier, the error context passed is:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
priorErrors: ["Attempt 1 failed with model tier: fast"]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The actual test output — which tests failed, error messages, stack traces — is **discarded**. The escalated agent gets a vague hint instead of actionable failure context.
|
|
45
|
+
|
|
46
|
+
| Stage | Context Available | What's Stored in priorErrors |
|
|
47
|
+
|-------|-------------------|------------------------------|
|
|
48
|
+
| Rectification loop | Full `TestFailure[]` with file, testName, error, stackTrace | *(used internally, then discarded)* |
|
|
49
|
+
| Post-verify failure | `verificationResult.error` (summary string) | Generic: `"Verification failed: TEST_FAILURE"` |
|
|
50
|
+
| Regression gate failure | Full test output | Generic: `"REGRESSION: full-suite regression detected"` |
|
|
51
|
+
| Tier escalation | Nothing new | `"Attempt N failed with model tier: X"` |
|
|
52
|
+
|
|
53
|
+
Result: the `fast → balanced → powerful` escalation chain carries **zero actionable context** about what actually failed.
|
|
54
|
+
|
|
55
|
+
### 1.4 Routing Cache Ignores Escalation Tier (BUG-028)
|
|
56
|
+
|
|
57
|
+
The LLM routing cache is keyed by `story.id` only. When escalation updates `story.routing.modelTier` from `balanced` to `powerful`, the next iteration hits the cache and returns the old `balanced` routing decision, overriding the escalation.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 2. Proposed Architecture
|
|
62
|
+
|
|
63
|
+
### 2.1 Verification Flow (Simplified)
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
Pipeline per-story:
|
|
67
|
+
1. Agent execution
|
|
68
|
+
2. Scoped verify (Smart Test Runner) ← ONLY test run per story
|
|
69
|
+
3. Scoped rectification (if verify fails) ← has full test failure context
|
|
70
|
+
4. Review (typecheck + lint only) ← NO test re-run
|
|
71
|
+
5. Story marked "passed" or escalated
|
|
72
|
+
|
|
73
|
+
Run-end (after all stories have been processed):
|
|
74
|
+
6. Deferred regression gate (full suite) ← ONE full suite run total
|
|
75
|
+
7. Targeted regression rectification ← per-story, with failure context
|
|
76
|
+
8. Run marked complete or stalled
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Key changes:**
|
|
80
|
+
- **Remove duplicate test runs** — pipeline verify is the single source of truth
|
|
81
|
+
- **Review stage runs typecheck + lint only** — no test command
|
|
82
|
+
- **Remove post-verify scoped re-test** — pipeline verify already did this
|
|
83
|
+
- **Move regression gate to run-end** — one full suite run instead of N
|
|
84
|
+
- **Targeted regression rectification** — map failing tests back to responsible stories
|
|
85
|
+
|
|
86
|
+
### 2.2 Deferred Regression Gate
|
|
87
|
+
|
|
88
|
+
Instead of running the full suite after every story, run it **once** after all stories complete.
|
|
89
|
+
|
|
90
|
+
```typescript
|
|
91
|
+
// New: src/execution/lifecycle/run-regression.ts
|
|
92
|
+
|
|
93
|
+
interface DeferredRegressionOptions {
|
|
94
|
+
config: NaxConfig;
|
|
95
|
+
workdir: string;
|
|
96
|
+
prd: PRD;
|
|
97
|
+
prdPath: string;
|
|
98
|
+
allStoryMetrics: StoryMetrics[];
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
interface DeferredRegressionResult {
|
|
102
|
+
passed: boolean;
|
|
103
|
+
failedTests?: TestFailure[];
|
|
104
|
+
storyMapping?: Map<string, TestFailure[]>; // storyId → failures caused by that story
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Failure handling:**
|
|
109
|
+
1. Run full suite
|
|
110
|
+
2. Parse failures into `TestFailure[]`
|
|
111
|
+
3. For each failing test, use reverse Smart Test Runner mapping:
|
|
112
|
+
- `test/unit/foo/bar.test.ts` → `src/foo/bar.ts` → which story touched this file? (from git log per story)
|
|
113
|
+
4. Group failures by responsible story
|
|
114
|
+
5. Attempt targeted rectification per story (agent gets FULL failure context)
|
|
115
|
+
6. Re-run full suite to confirm fix
|
|
116
|
+
7. If still failing → mark responsible stories as failed
|
|
117
|
+
|
|
118
|
+
**Config:**
|
|
119
|
+
|
|
120
|
+
```jsonc
|
|
121
|
+
{
|
|
122
|
+
"execution": {
|
|
123
|
+
"regressionGate": {
|
|
124
|
+
"enabled": true,
|
|
125
|
+
"mode": "deferred", // "deferred" | "per-story" | "disabled"
|
|
126
|
+
"timeoutSeconds": 300,
|
|
127
|
+
"maxRectificationAttempts": 2
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### 2.3 Structured Failure Context for Escalation
|
|
134
|
+
|
|
135
|
+
Replace vague `priorErrors` strings with structured failure data.
|
|
136
|
+
|
|
137
|
+
**New PRD field:** `priorFailures` (alongside existing `priorErrors` for backward compat)
|
|
138
|
+
|
|
139
|
+
```typescript
|
|
140
|
+
// In src/prd/types.ts
|
|
141
|
+
|
|
142
|
+
interface StructuredFailure {
|
|
143
|
+
/** Which attempt this failure occurred on */
|
|
144
|
+
attempt: number;
|
|
145
|
+
/** Model tier that was used */
|
|
146
|
+
modelTier: string;
|
|
147
|
+
/** What stage failed */
|
|
148
|
+
stage: "verify" | "review" | "regression" | "rectification" | "agent-session";
|
|
149
|
+
/** Human-readable summary */
|
|
150
|
+
summary: string;
|
|
151
|
+
/** Structured test failures (if applicable) */
|
|
152
|
+
testFailures?: TestFailureContext[];
|
|
153
|
+
/** Timestamp */
|
|
154
|
+
timestamp: string;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
interface TestFailureContext {
|
|
158
|
+
file: string;
|
|
159
|
+
testName: string;
|
|
160
|
+
error: string;
|
|
161
|
+
/** First 5 lines of stack trace */
|
|
162
|
+
stackTrace: string[];
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**How it flows through escalation:**
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
fast attempt 1 → verify fails
|
|
170
|
+
→ priorFailures: [{
|
|
171
|
+
attempt: 1,
|
|
172
|
+
modelTier: "fast",
|
|
173
|
+
stage: "verify",
|
|
174
|
+
summary: "3 tests failed in src/routing/router.ts",
|
|
175
|
+
testFailures: [
|
|
176
|
+
{ file: "test/unit/routing/router.test.ts",
|
|
177
|
+
testName: "should route to balanced",
|
|
178
|
+
error: "Expected 'balanced' got 'fast'",
|
|
179
|
+
stackTrace: [...] },
|
|
180
|
+
...
|
|
181
|
+
]
|
|
182
|
+
}]
|
|
183
|
+
|
|
184
|
+
balanced attempt 1 → agent gets FULL context of what fast couldn't fix
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
**Context injection** (`context/builder.ts`):
|
|
188
|
+
|
|
189
|
+
Format `priorFailures` into actionable markdown for the agent prompt:
|
|
190
|
+
|
|
191
|
+
```markdown
|
|
192
|
+
## Prior Attempt 1 (fast, verify)
|
|
193
|
+
3 tests failed in src/routing/router.ts
|
|
194
|
+
|
|
195
|
+
### Test Failures:
|
|
196
|
+
- **test/unit/routing/router.test.ts** > should route to balanced
|
|
197
|
+
Error: Expected 'balanced' got 'fast'
|
|
198
|
+
Stack: at Router.route (src/routing/router.ts:42)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### 2.4 BUG-028 Fix: Cache Invalidation on Escalation
|
|
202
|
+
|
|
203
|
+
Add `clearCacheForStory(storyId)` to `src/routing/strategies/llm.ts`.
|
|
204
|
+
|
|
205
|
+
Call it in `tier-escalation.ts` when updating `story.routing.modelTier`.
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## 3. Migration Plan
|
|
210
|
+
|
|
211
|
+
### Phase 1: v0.18.3 — Minimal Fixes (no architecture change)
|
|
212
|
+
|
|
213
|
+
1. **BUG-026 quick fix:** Regression gate timeout → accept scoped pass + warn (not escalate)
|
|
214
|
+
2. **BUG-028 fix:** `clearCacheForStory()` on escalation
|
|
215
|
+
3. **Store structured failures:** Start populating `priorFailures` alongside `priorErrors` (backward compat)
|
|
216
|
+
|
|
217
|
+
### Phase 2: v0.19.0 — Architecture v2
|
|
218
|
+
|
|
219
|
+
1. **Remove post-verify duplicate test run** — pipeline verify is authoritative
|
|
220
|
+
2. **Review stage: typecheck + lint only** — remove test command from review
|
|
221
|
+
3. **Deferred regression gate** — run-end full suite with targeted rectification
|
|
222
|
+
4. **Reverse Smart Test Runner mapping** — failing test → source file → responsible story
|
|
223
|
+
5. **Full structured failure context** — `priorFailures` injected into agent prompts
|
|
224
|
+
6. **Config:** `regressionGate.mode: "deferred"` (default)
|
|
225
|
+
|
|
226
|
+
### Phase 3: Future
|
|
227
|
+
|
|
228
|
+
- **Incremental regression:** Only run tests related to ALL changed files across all stories (union of Smart Test Runner scopes)
|
|
229
|
+
- **Test impact analysis:** AST-based dependency graph for more precise test scoping
|
|
230
|
+
- **Parallel story regression:** Run rectification for multiple stories concurrently
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## 4. Files Affected
|
|
235
|
+
|
|
236
|
+
### Phase 1 (v0.18.3)
|
|
237
|
+
|
|
238
|
+
| File | Change |
|
|
239
|
+
|------|--------|
|
|
240
|
+
| `src/execution/post-verify.ts` | Regression gate timeout → accept + warn |
|
|
241
|
+
| `src/routing/strategies/llm.ts` | Add `clearCacheForStory()` export |
|
|
242
|
+
| `src/execution/escalation/tier-escalation.ts` | Call `clearCacheForStory()` on escalation |
|
|
243
|
+
| `src/execution/post-verify-rectification.ts` | Store `StructuredFailure` in `priorFailures` |
|
|
244
|
+
| `src/prd/types.ts` | Add `priorFailures?: StructuredFailure[]` to `UserStory` |
|
|
245
|
+
|
|
246
|
+
### Phase 2 (v0.19.0)
|
|
247
|
+
|
|
248
|
+
| File | Change |
|
|
249
|
+
|------|--------|
|
|
250
|
+
| `src/pipeline/stages/review.ts` | Remove test command execution |
|
|
251
|
+
| `src/execution/post-verify.ts` | Remove scoped re-test, keep regression call only |
|
|
252
|
+
| `src/execution/lifecycle/run-regression.ts` | **New:** Deferred regression gate + targeted rectification |
|
|
253
|
+
| `src/execution/lifecycle/run-completion.ts` | Call deferred regression before final metrics |
|
|
254
|
+
| `src/verification/smart-runner.ts` | Add reverse mapping: test file → source file → story |
|
|
255
|
+
| `src/context/builder.ts` | Format `priorFailures` into agent prompt |
|
|
256
|
+
| `src/config/schemas.ts` | Add `regressionGate.mode` enum |
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## 5. Test Plan
|
|
261
|
+
|
|
262
|
+
### Phase 1 Tests
|
|
263
|
+
- Regression gate timeout returns "passed" with warning (not "failed")
|
|
264
|
+
- `clearCacheForStory()` removes cached decision; next route() re-evaluates
|
|
265
|
+
- `priorFailures` populated with structured `TestFailureContext` on verify failure
|
|
266
|
+
- Backward compat: `priorErrors` still populated alongside `priorFailures`
|
|
267
|
+
|
|
268
|
+
### Phase 2 Tests
|
|
269
|
+
- Pipeline verify is single test execution (no duplicate)
|
|
270
|
+
- Review stage skips test command
|
|
271
|
+
- Deferred regression runs once at run-end
|
|
272
|
+
- Reverse mapping correctly identifies responsible story
|
|
273
|
+
- Targeted rectification receives full failure context
|
|
274
|
+
- Escalated agent prompt includes formatted `priorFailures`
|
|
275
|
+
- Config `regressionGate.mode: "per-story"` preserves current behavior
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## 6. Historical Context (Why It's Like This)
|
|
280
|
+
|
|
281
|
+
### Why post-verify exists separately from pipeline verify
|
|
282
|
+
|
|
283
|
+
The pipeline (`src/pipeline/pipeline.ts`) runs stages in sequence: routing → context → prompt → execution → **verify** → review → completion. This was the original single verification point.
|
|
284
|
+
|
|
285
|
+
Later, **post-agent verification** was added in `src/execution/pipeline-result-handler.ts` → `handlePipelineSuccess()` → `runPostAgentVerification()`. This was meant to handle:
|
|
286
|
+
- **Scoped verification** with git-diff-based test file detection (before Smart Test Runner existed in the pipeline)
|
|
287
|
+
- **Rectification** — retry loop with agent when tests fail
|
|
288
|
+
- **Regression gate** (BUG-009 fix) — full suite after scoped pass
|
|
289
|
+
|
|
290
|
+
When Smart Test Runner was added to the **pipeline verify stage** (v0.18.2), it duplicated the scoped test logic that post-verify already had. Nobody removed the post-verify scoped test.
|
|
291
|
+
|
|
292
|
+
### Current code flow with exact locations
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
sequential-executor.ts:170 → pipelineRunner.run(story)
|
|
296
|
+
pipeline.ts:execute() → runs stages in order:
|
|
297
|
+
verify.ts:execute() → Smart Test Runner scoped tests [TEST RUN #1]
|
|
298
|
+
review.ts:execute() → runReview() which may run tests [TEST RUN #2 if review.commands.test set]
|
|
299
|
+
|
|
300
|
+
pipeline-result-handler.ts:76 → runPostAgentVerification()
|
|
301
|
+
post-verify.ts:85 → runVerification(scopedCommand) [TEST RUN #3 — duplicate of #1]
|
|
302
|
+
post-verify.ts:118 → runRegressionGate()
|
|
303
|
+
post-verify.ts:180 → runVerification(fullSuite) [TEST RUN #4 — full suite]
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Review stage test command
|
|
307
|
+
|
|
308
|
+
`review.ts` calls `runReview()` from `src/review/index.ts` which runs `config.review.commands.test` if configured. In default config, `review.commands` includes `test`, `typecheck`, and `lint`. So yes — review runs tests by default, creating the triple-test problem.
|
|
309
|
+
|
|
310
|
+
### Decision rationale
|
|
311
|
+
|
|
312
|
+
**Why deferred regression (Option C) over per-story (A) or disabled (B):**
|
|
313
|
+
- **Option A (keep per-story):** 125s timeout per story is the root cause of BUG-026. Even with timeout-acceptance, it's wasteful.
|
|
314
|
+
- **Option B (disable entirely):** Too risky — cross-story regressions are real (BUG-009 was filed for this exact reason).
|
|
315
|
+
- **Option C (deferred):** One full suite run at the end. If it fails, we can trace back to responsible stories via reverse file mapping. Best balance of safety vs speed.
|
|
316
|
+
|
|
317
|
+
**Why cache invalidation (Option C for BUG-028) over cache key change (A) or bypass (B):**
|
|
318
|
+
- **Option A (include tier in key):** Works but creates multiple cache entries per story. If a story is re-routed 3 times, 3 entries exist. Cache eviction becomes unpredictable.
|
|
319
|
+
- **Option B (bypass when routing set):** Almost all stories have `story.routing` set after first pass, so cache would rarely be used at all — defeats the purpose.
|
|
320
|
+
- **Option C (clear on escalation):** Surgical — one `delete()` call at the exact moment routing changes. Cache works normally for non-escalated stories.
|
|
321
|
+
|
|
322
|
+
## 7. Edge Cases
|
|
323
|
+
|
|
324
|
+
### Partial completion (stalled run)
|
|
325
|
+
|
|
326
|
+
If only 3 of 5 stories pass and nax stalls (remaining stories failed/paused):
|
|
327
|
+
- Deferred regression still runs on the 3 passed stories
|
|
328
|
+
- If regression fails, only the passed stories are candidates for rectification
|
|
329
|
+
- Failed/paused stories are untouched
|
|
330
|
+
|
|
331
|
+
### Stories that touch the same files
|
|
332
|
+
|
|
333
|
+
If story A and story B both modify `src/utils/parser.ts`:
|
|
334
|
+
- Reverse mapping may attribute the same failing test to both stories
|
|
335
|
+
- Rectification should try the **last story that touched the file** first (git log order)
|
|
336
|
+
- If that doesn't fix it, try the other story
|
|
337
|
+
|
|
338
|
+
### No test mapping possible
|
|
339
|
+
|
|
340
|
+
If a failing test can't be mapped to any story's changed files:
|
|
341
|
+
- Log warning: "Unmapped regression — cannot attribute to a specific story"
|
|
342
|
+
- Mark ALL passed stories as needing re-verification
|
|
343
|
+
- This is the worst case but should be rare with good test naming conventions
|
package/nax/config.json
CHANGED
|
@@ -63,18 +63,19 @@
|
|
|
63
63
|
"verificationTimeoutSeconds": 300,
|
|
64
64
|
"maxStoriesPerFeature": 15,
|
|
65
65
|
"rectification": {
|
|
66
|
-
"enabled":
|
|
66
|
+
"enabled": true,
|
|
67
67
|
"maxRetries": 2,
|
|
68
68
|
"fullSuiteTimeoutSeconds": 600,
|
|
69
69
|
"maxFailureSummaryChars": 2000,
|
|
70
70
|
"abortOnIncreasingFailures": true
|
|
71
71
|
},
|
|
72
72
|
"regressionGate": {
|
|
73
|
-
"enabled": false
|
|
73
|
+
"enabled": false,
|
|
74
|
+
"timeoutSeconds": 300
|
|
74
75
|
}
|
|
75
76
|
},
|
|
76
77
|
"quality": {
|
|
77
|
-
"requireTypecheck":
|
|
78
|
+
"requireTypecheck": true,
|
|
78
79
|
"requireLint": true,
|
|
79
80
|
"requireTests": true,
|
|
80
81
|
"commands": {
|
|
@@ -117,13 +118,12 @@
|
|
|
117
118
|
"maxCodebaseSummaryTokens": 5000
|
|
118
119
|
},
|
|
119
120
|
"review": {
|
|
120
|
-
"enabled":
|
|
121
|
+
"enabled": true,
|
|
121
122
|
"checks": [
|
|
122
|
-
"
|
|
123
|
+
"typecheck",
|
|
123
124
|
"lint"
|
|
124
125
|
],
|
|
125
126
|
"commands": {
|
|
126
|
-
"test": "bun run test",
|
|
127
127
|
"typecheck": "bun run typecheck",
|
|
128
128
|
"lint": "bun run lint"
|
|
129
129
|
}
|
|
@@ -147,4 +147,4 @@
|
|
|
147
147
|
"scopeToStory": true
|
|
148
148
|
}
|
|
149
149
|
}
|
|
150
|
-
}
|
|
150
|
+
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
{
|
|
2
|
+
"project": "nax",
|
|
3
|
+
"branchName": "feat/v0.18.3-execution-reliability",
|
|
4
|
+
"feature": "v0.18.3-execution-reliability",
|
|
5
|
+
"userStories": [
|
|
6
|
+
{
|
|
7
|
+
"id": "BUG-026",
|
|
8
|
+
"title": "Regression gate timeout accepts scoped pass instead of escalating",
|
|
9
|
+
"description": "In src/execution/post-verify.ts, the runRegressionGate() function returns 'failed' when the full suite times out. This causes revertStoriesOnFailure() to bump story.attempts and trigger tier escalation, even though scoped verification already passed. Fix: when regressionResult.status === 'TIMEOUT', return 'passed' with a warning log instead of 'failed'. The scoped Smart Test Runner already verified the story's changes. A timeout is not evidence of regression \u2014 only TEST_FAILURE is. Keep existing behavior for TEST_FAILURE (attempt rectification). Add config flag execution.regressionGate.acceptOnTimeout (default true).",
|
|
10
|
+
"complexity": 2,
|
|
11
|
+
"status": "passed",
|
|
12
|
+
"testStrategy": "test-after",
|
|
13
|
+
"attempts": 0,
|
|
14
|
+
"priorErrors": [
|
|
15
|
+
"Attempt 1 failed with model tier: fast"
|
|
16
|
+
],
|
|
17
|
+
"escalations": [],
|
|
18
|
+
"dependencies": [],
|
|
19
|
+
"tags": [],
|
|
20
|
+
"acceptanceCriteria": [],
|
|
21
|
+
"storyPoints": 1,
|
|
22
|
+
"routing": {
|
|
23
|
+
"complexity": "simple",
|
|
24
|
+
"modelTier": "balanced",
|
|
25
|
+
"testStrategy": "three-session-tdd-lite",
|
|
26
|
+
"reasoning": "three-session-tdd-lite: strategy:lite"
|
|
27
|
+
},
|
|
28
|
+
"passes": true
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"id": "BUG-028",
|
|
32
|
+
"title": "Clear LLM routing cache on tier escalation",
|
|
33
|
+
"description": "In src/routing/strategies/llm.ts, the routing cache is keyed by story.id only. When tier escalation updates story.routing.modelTier (e.g. balanced \u2192 powerful), the next iteration hits the cache and returns the old routing decision, overriding the escalation. Fix: add a clearCacheForStory(storyId: string) export to llm.ts that calls cachedDecisions.delete(storyId). Call clearCacheForStory() in src/execution/escalation/tier-escalation.ts in both escalation paths: (1) preIterationTierCheck() after updating story routing, (2) handleTierEscalation() after updating story routing. Write unit tests verifying: cache hit returns old decision, clearCacheForStory removes it, next route() call re-evaluates.",
|
|
34
|
+
"complexity": 2,
|
|
35
|
+
"status": "passed",
|
|
36
|
+
"testStrategy": "test-after",
|
|
37
|
+
"attempts": 0,
|
|
38
|
+
"priorErrors": [],
|
|
39
|
+
"escalations": [],
|
|
40
|
+
"dependencies": [],
|
|
41
|
+
"tags": [],
|
|
42
|
+
"acceptanceCriteria": [],
|
|
43
|
+
"storyPoints": 1,
|
|
44
|
+
"passes": true
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": "SFC-001",
|
|
48
|
+
"title": "Add StructuredFailure type and populate priorFailures on verify failure",
|
|
49
|
+
"description": "Add structured failure context so escalated tiers know exactly what failed. Changes: (1) In src/prd/types.ts, add StructuredFailure interface with fields: attempt (number), modelTier (string), stage ('verify'|'review'|'regression'|'rectification'|'agent-session'), summary (string), testFailures (optional TestFailureContext[]), timestamp (string). Add TestFailureContext interface with fields: file (string), testName (string), error (string), stackTrace (string[]). Add priorFailures?: StructuredFailure[] to UserStory type. (2) In src/execution/post-verify.ts, when verification fails (both scoped and regression), build a StructuredFailure from the VerificationResult and parseBunTestOutput(), and push to story.priorFailures. (3) In src/execution/post-verify-rectification.ts revertStoriesOnFailure(), also populate priorFailures alongside existing priorErrors. (4) In src/execution/escalation/tier-escalation.ts, populate priorFailures with stage='escalation' when escalating. (5) Ensure priorFailures is initialized to [] in prd/index.ts loadPRD() like priorErrors.",
|
|
50
|
+
"complexity": 3,
|
|
51
|
+
"status": "passed",
|
|
52
|
+
"testStrategy": "test-after",
|
|
53
|
+
"attempts": 1,
|
|
54
|
+
"priorErrors": [],
|
|
55
|
+
"escalations": [],
|
|
56
|
+
"dependencies": [],
|
|
57
|
+
"tags": [],
|
|
58
|
+
"acceptanceCriteria": [],
|
|
59
|
+
"storyPoints": 1,
|
|
60
|
+
"passes": true
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"id": "SFC-002",
|
|
64
|
+
"title": "Format priorFailures into agent prompt context",
|
|
65
|
+
"description": "In src/context/builder.ts, the existing code injects priorErrors as high-priority context elements (priority 90). Add similar handling for priorFailures: format each StructuredFailure into actionable markdown showing the attempt number, model tier, stage, summary, and detailed test failures (file, testName, error, first stack trace line). Inject as priority 95 (higher than priorErrors) so the agent sees structured failures first. Keep priorErrors injection for backward compat. Write unit tests verifying: (1) priorFailures formatted correctly, (2) test failure details included, (3) empty priorFailures produces no context element, (4) priority ordering is correct.",
|
|
66
|
+
"complexity": 2,
|
|
67
|
+
"status": "passed",
|
|
68
|
+
"testStrategy": "test-after",
|
|
69
|
+
"attempts": 1,
|
|
70
|
+
"priorErrors": [],
|
|
71
|
+
"escalations": [],
|
|
72
|
+
"dependencies": [],
|
|
73
|
+
"tags": [],
|
|
74
|
+
"acceptanceCriteria": [],
|
|
75
|
+
"storyPoints": 1,
|
|
76
|
+
"passes": true
|
|
77
|
+
}
|
|
78
|
+
],
|
|
79
|
+
"updatedAt": "2026-03-04T05:01:36.021Z"
|
|
80
|
+
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
[2026-03-04T04:36:19.998Z] BUG-028 — PASSED — Clear LLM routing cache on tier escalation — Cost: $0.0779
|
|
2
|
+
[2026-03-04T04:51:46.868Z] SFC-001 — FAILED — Add StructuredFailure type and populate priorFailures on verify failure — Review failed: lint failed (exit code 1)
|
|
3
|
+
[2026-03-04T05:01:36.022Z] SFC-002 — FAILED — Format priorFailures into agent prompt context — Review failed: lint failed (exit code 1)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@nathapp/nax",
|
|
3
|
-
"version": "0.18.
|
|
3
|
+
"version": "0.18.4",
|
|
4
4
|
"description": "AI Coding Agent Orchestrator \u2014 loops until done",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
},
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@biomejs/biome": "^1.9.4",
|
|
34
|
-
"@types/bun": "^1.
|
|
34
|
+
"@types/bun": "^1.3.8",
|
|
35
35
|
"react-devtools-core": "^7.0.1",
|
|
36
36
|
"typescript": "^5.7.3"
|
|
37
37
|
},
|
package/src/config/defaults.ts
CHANGED
package/src/config/schema.ts
CHANGED
package/src/config/schemas.ts
CHANGED
|
@@ -62,8 +62,31 @@ const RectificationConfigSchema = z.object({
|
|
|
62
62
|
const RegressionGateConfigSchema = z.object({
|
|
63
63
|
enabled: z.boolean().default(true),
|
|
64
64
|
timeoutSeconds: z.number().int().min(10).max(600).default(120),
|
|
65
|
+
acceptOnTimeout: z.boolean().default(true),
|
|
65
66
|
});
|
|
66
67
|
|
|
68
|
+
const SmartTestRunnerConfigSchema = z.object({
|
|
69
|
+
enabled: z.boolean().default(true),
|
|
70
|
+
testFilePatterns: z.array(z.string()).default(["test/**/*.test.ts"]),
|
|
71
|
+
fallback: z.enum(["import-grep", "full-suite"]).default("import-grep"),
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
const SMART_TEST_RUNNER_DEFAULT = {
|
|
75
|
+
enabled: true,
|
|
76
|
+
testFilePatterns: ["test/**/*.test.ts"],
|
|
77
|
+
fallback: "import-grep" as const,
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
/** Coerces boolean → SmartTestRunnerConfig for backward compat */
|
|
81
|
+
const smartTestRunnerFieldSchema = z
|
|
82
|
+
.preprocess((val) => {
|
|
83
|
+
if (typeof val === "boolean") {
|
|
84
|
+
return { enabled: val, testFilePatterns: ["test/**/*.test.ts"], fallback: "import-grep" };
|
|
85
|
+
}
|
|
86
|
+
return val;
|
|
87
|
+
}, SmartTestRunnerConfigSchema)
|
|
88
|
+
.default(SMART_TEST_RUNNER_DEFAULT);
|
|
89
|
+
|
|
67
90
|
const ExecutionConfigSchema = z.object({
|
|
68
91
|
maxIterations: z.number().int().positive({ message: "maxIterations must be > 0" }),
|
|
69
92
|
iterationDelayMs: z.number().int().nonnegative(),
|
|
@@ -81,7 +104,7 @@ const ExecutionConfigSchema = z.object({
|
|
|
81
104
|
lintCommand: z.string().nullable().optional(),
|
|
82
105
|
typecheckCommand: z.string().nullable().optional(),
|
|
83
106
|
dangerouslySkipPermissions: z.boolean().default(true),
|
|
84
|
-
smartTestRunner:
|
|
107
|
+
smartTestRunner: smartTestRunnerFieldSchema,
|
|
85
108
|
});
|
|
86
109
|
|
|
87
110
|
const QualityConfigSchema = z.object({
|
|
@@ -189,6 +212,8 @@ const LlmRoutingConfigSchema = z.object({
|
|
|
189
212
|
mode: z.enum(["one-shot", "per-story", "hybrid"]).optional(),
|
|
190
213
|
batchMode: z.boolean().optional(), // deprecated, for backward compat
|
|
191
214
|
timeoutMs: z.number().int().positive({ message: "llm.timeoutMs must be > 0" }).optional(),
|
|
215
|
+
retries: z.number().int().min(0, { message: "llm.retries must be >= 0" }).optional(),
|
|
216
|
+
retryDelayMs: z.number().int().min(0, { message: "llm.retryDelayMs must be >= 0" }).optional(),
|
|
192
217
|
});
|
|
193
218
|
|
|
194
219
|
const RoutingConfigSchema = z
|
package/src/config/types.ts
CHANGED
|
@@ -70,12 +70,24 @@ export interface RectificationConfig {
|
|
|
70
70
|
abortOnIncreasingFailures: boolean;
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
/** Regression gate config (BUG-009) */
|
|
73
|
+
/** Regression gate config (BUG-009, BUG-026) */
|
|
74
74
|
export interface RegressionGateConfig {
|
|
75
75
|
/** Enable full-suite regression gate after scoped verification (default: true) */
|
|
76
76
|
enabled: boolean;
|
|
77
77
|
/** Timeout for full-suite regression run in seconds (default: 120) */
|
|
78
78
|
timeoutSeconds: number;
|
|
79
|
+
/** Accept timeout as pass instead of failing (BUG-026, default: true) */
|
|
80
|
+
acceptOnTimeout?: boolean;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/** Smart test runner configuration (STR-007) */
|
|
84
|
+
export interface SmartTestRunnerConfig {
|
|
85
|
+
/** Enable smart test runner (default: true) */
|
|
86
|
+
enabled: boolean;
|
|
87
|
+
/** Glob patterns to scan for test files during import-grep fallback */
|
|
88
|
+
testFilePatterns: string[];
|
|
89
|
+
/** Fallback strategy when path-convention mapping yields no results */
|
|
90
|
+
fallback: "import-grep" | "full-suite";
|
|
79
91
|
}
|
|
80
92
|
|
|
81
93
|
/** Execution limits */
|
|
@@ -106,8 +118,9 @@ export interface ExecutionConfig {
|
|
|
106
118
|
typecheckCommand?: string | null;
|
|
107
119
|
/** Use --dangerously-skip-permissions flag for agent (default: true for backward compat, SEC-1 fix) */
|
|
108
120
|
dangerouslySkipPermissions?: boolean;
|
|
109
|
-
/** Enable smart test runner to scope test runs to changed files (default: true)
|
|
110
|
-
|
|
121
|
+
/** Enable smart test runner to scope test runs to changed files (default: true).
|
|
122
|
+
* Accepts boolean for backward compat or a SmartTestRunnerConfig object. */
|
|
123
|
+
smartTestRunner?: boolean | SmartTestRunnerConfig;
|
|
111
124
|
}
|
|
112
125
|
|
|
113
126
|
/** Quality gate config */
|
|
@@ -352,8 +365,12 @@ export interface LlmRoutingConfig {
|
|
|
352
365
|
mode?: LlmRoutingMode;
|
|
353
366
|
/** @deprecated Use mode instead. Will be removed in v1.0 */
|
|
354
367
|
batchMode?: boolean;
|
|
355
|
-
/** Timeout for LLM call in milliseconds (default:
|
|
368
|
+
/** Timeout for LLM call in milliseconds (default: 30000) */
|
|
356
369
|
timeoutMs?: number;
|
|
370
|
+
/** Number of retries on LLM timeout or transient failure (default: 1) */
|
|
371
|
+
retries?: number;
|
|
372
|
+
/** Delay between retries in milliseconds (default: 1000) */
|
|
373
|
+
retryDelayMs?: number;
|
|
357
374
|
}
|
|
358
375
|
|
|
359
376
|
/** Routing config */
|
package/src/context/builder.ts
CHANGED
|
@@ -14,6 +14,7 @@ import {
|
|
|
14
14
|
createDependencyContext,
|
|
15
15
|
createErrorContext,
|
|
16
16
|
createFileContext,
|
|
17
|
+
createPriorFailuresContext,
|
|
17
18
|
createProgressContext,
|
|
18
19
|
createStoryContext,
|
|
19
20
|
createTestCoverageContext,
|
|
@@ -31,6 +32,7 @@ export {
|
|
|
31
32
|
createProgressContext,
|
|
32
33
|
createFileContext,
|
|
33
34
|
createTestCoverageContext,
|
|
35
|
+
createPriorFailuresContext,
|
|
34
36
|
} from "./elements";
|
|
35
37
|
export { formatContextAsMarkdown } from "./formatter";
|
|
36
38
|
|
|
@@ -89,6 +91,15 @@ export async function buildContext(storyContext: StoryContext, budget: ContextBu
|
|
|
89
91
|
// Add progress summary (highest priority)
|
|
90
92
|
elements.push(createProgressContext(generateProgressSummary(prd), 100));
|
|
91
93
|
|
|
94
|
+
// Add prior failures (highest priority after progress, priority 95)
|
|
95
|
+
if (
|
|
96
|
+
currentStory.priorFailures &&
|
|
97
|
+
Array.isArray(currentStory.priorFailures) &&
|
|
98
|
+
currentStory.priorFailures.length > 0
|
|
99
|
+
) {
|
|
100
|
+
elements.push(createPriorFailuresContext(currentStory.priorFailures, 95));
|
|
101
|
+
}
|
|
102
|
+
|
|
92
103
|
// Add prior errors (high priority)
|
|
93
104
|
if (currentStory.priorErrors && Array.isArray(currentStory.priorErrors) && currentStory.priorErrors.length > 0) {
|
|
94
105
|
for (const error of currentStory.priorErrors) {
|