@nathapp/nax 0.22.2 → 0.22.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/ROADMAP.md CHANGED
@@ -118,22 +118,22 @@
118
118
 
119
119
  ---
120
120
 
121
- ## v0.22.2 — Status File Consolidation
121
+ ## v0.23.0 — Status File Consolidation
122
122
 
123
123
  **Theme:** Auto-write status.json to well-known paths, align readers, remove dead options
124
- **Status:** 🔲 Planned
124
+ **Status:** 🔄 In Progress (self-dev running, SFC-001 ✅)
125
125
  **Spec:** [docs/specs/status-file-consolidation.md](specs/status-file-consolidation.md)
126
- **Pre-requisite for:** v0.23.0 (Central Run Registry)
126
+ **Pre-requisite for:** v0.24.0 (Central Run Registry)
127
127
 
128
128
  ### Stories
129
- - [ ] **SFC-001:** Auto-write project-level status — remove `--status-file` flag, always write to `<workdir>/nax/status.json`
129
+ - [x] ~~**SFC-001:** Auto-write project-level status — remove `--status-file` flag, always write to `<workdir>/nax/status.json`~~
130
130
  - [ ] **SFC-002:** Write feature-level status on run end — copy final snapshot to `<workdir>/nax/features/<feature>/status.json`
131
131
  - [ ] **SFC-003:** Align status readers — `nax status` + `nax diagnose` read from correct paths
132
132
  - [ ] **SFC-004:** Clean up dead code — remove `--status-file` option, `.nax-status.json` references
133
133
 
134
134
  ---
135
135
 
136
- ## v0.23.0 — Central Run Registry
136
+ ## v0.24.0 — Central Run Registry
137
137
 
138
138
  **Theme:** Global run index across all projects — single source of truth for all nax run history
139
139
  **Status:** 🔲 Planned
@@ -221,6 +221,7 @@
221
221
  | Version | Theme | Date | Details |
222
222
  |:---|:---|:---|:---|
223
223
  | v0.18.1 | Type Safety + CI Pipeline | 2026-03-03 | 60 TS errors + 12 lint errors fixed, GitLab CI green (1952/56/0) |
224
+ | v0.22.2 | Routing Stability + SFC-001 | 2026-03-07 | BUG-040 floating outputPromise crash on LLM timeout retry; SFC-001 auto-write status.json |
224
225
  | v0.22.1 | Pipeline Re-Architecture | 2026-03-07 | VerificationOrchestrator, EventBus, new stages (rectify/autofix/regression/deferred-regression), post-run SSOT. 2264 pass |
225
226
  | v0.20.0 | Verification Architecture v2 | 2026-03-06 | Deferred regression gate, remove duplicate tests, BUG-037 |
226
227
  | v0.19.0 | Hardening & Compliance | 2026-03-04 | SEC-1 to SEC-5, BUG-1, Node.js API removal, _deps rollout |
@@ -288,7 +289,7 @@
288
289
  - [x] ~~Constitution file support~~
289
290
  - [x] ~~Per-story testStrategy override — v0.18.1~~
290
291
  - [x] ~~Smart Test Runner — v0.18.2~~
291
- - [ ] **Central Run Registry** — moved to v0.23.0
292
+ - [ ] **Central Run Registry** — moved to v0.24.0
292
293
  - [x] ~~**BUN-001:** Bun PTY Migration — replace `node-pty` with `Bun.spawn` (piped stdio). Shipped in v0.18.5.~~
293
294
  - [ ] **CI-001:** CI Memory Optimization — parallel test sharding for 1GB runners
294
295
  - [ ] **CI-001:** CI Memory Optimization — parallel test sharding to pass on 1GB runners (currently requires 8GB). Evaluate `bun test --shard` when stable.
@@ -0,0 +1,137 @@
1
+ {
2
+ "project": "nax",
3
+ "branchName": "feat/post-rearch-bugfix",
4
+ "feature": "post-rearch-bugfix",
5
+ "version": "0.22.3",
6
+ "description": "Fix all critical and key high-priority bugs found in post-re-architecture code review. Stream deadlocks, unhandled rejections, signal handler safety, lock file reliability, interaction system, parallel executor race, and error swallowing.",
7
+ "userStories": [
8
+ {
9
+ "id": "FIX-C1",
10
+ "title": "Fix stream deadlock in acceptance and autofix stages",
11
+ "description": "In src/pipeline/stages/acceptance.ts (line ~136) and src/pipeline/stages/autofix.ts (line ~116), the code awaits proc.exited BEFORE reading stdout/stderr. When output exceeds the 64KB OS pipe buffer, the child blocks on write and proc.exited never resolves, causing a silent deadlock. Fix: use Promise.all([proc.exited, new Response(proc.stdout).text(), new Response(proc.stderr).text()]) to read streams concurrently with exit.",
12
+ "complexity": "simple",
13
+ "status": "pending",
14
+ "acceptanceCriteria": [
15
+ "acceptance.ts reads stdout/stderr concurrently with proc.exited using Promise.all",
16
+ "autofix.ts reads stdout/stderr concurrently with proc.exited using Promise.all",
17
+ "No sequential await proc.exited before stream reads in either file",
18
+ "Existing tests pass"
19
+ ]
20
+ },
21
+ {
22
+ "id": "FIX-C2",
23
+ "title": "Fix emitAsync never called for human-in-the-loop interaction",
24
+ "description": "In src/execution/pipeline-result-handler.ts (line ~151), human-review:requested is emitted via fire-and-forget emit() instead of emitAsync(). The emitAsync() method in src/pipeline/subscribers/interaction.ts was specifically designed to wait for human response but is never called anywhere. The pipeline races past without waiting for human input. Fix: use await pipelineEventBus.emitAsync() for human-review events.",
25
+ "complexity": "medium",
26
+ "status": "pending",
27
+ "acceptanceCriteria": [
28
+ "human-review:requested event uses emitAsync instead of emit",
29
+ "Pipeline waits for human response before continuing",
30
+ "emitAsync is properly awaited at the call site",
31
+ "Existing tests pass"
32
+ ]
33
+ },
34
+ {
35
+ "id": "FIX-C5",
36
+ "title": "Fix timeoutPromise unhandled rejection in LLM routing",
37
+ "description": "In src/routing/strategies/llm.ts, the timeoutPromise created in callLlmOnce() uses reject() but if the timer fires between race resolution and clearTimeout, the rejection is unhandled. Add timeoutPromise.catch(() => {}) right after creation, or restructure to use a clearable pattern that does not reject.",
38
+ "complexity": "simple",
39
+ "status": "pending",
40
+ "acceptanceCriteria": [
41
+ "timeoutPromise rejection is always handled (no unhandled rejection possible)",
42
+ "Existing BUG-040 tests still pass",
43
+ "Add test verifying no unhandled rejection when timeout fires after successful completion"
44
+ ]
45
+ },
46
+ {
47
+ "id": "FIX-C3",
48
+ "title": "Fix TDZ crash in signal handler — prd accessed before initialization",
49
+ "description": "In src/execution/runner.ts around line 123, the crash handler closure references prd which is declared later (~line 134). If SIGTERM arrives during setupRun(), accessing prd throws ReferenceError. Fix: declare let prd: PRD | undefined before the crash handler setup, and add a null guard in the getter: () => prd ? countStories(prd).total : 0.",
50
+ "complexity": "simple",
51
+ "status": "pending",
52
+ "acceptanceCriteria": [
53
+ "prd variable declared before crash handler registration",
54
+ "Crash handler getter has null guard for prd",
55
+ "SIGTERM during setupRun does not throw ReferenceError",
56
+ "Existing tests pass"
57
+ ]
58
+ },
59
+ {
60
+ "id": "FIX-C6",
61
+ "title": "Fix parallel executor shared mutable state race condition",
62
+ "description": "In src/execution/parallel.ts around line 191 and 213, results.totalCost += ... and executing.splice(index, 1) are mutated concurrently from parallel promises. The splice inside .finally() can corrupt array indices when two promises resolve in the same microtask batch. Fix: replace executing array with a Set pattern; use executing.delete(p) instead of splice.",
63
+ "complexity": "medium",
64
+ "status": "pending",
65
+ "acceptanceCriteria": [
66
+ "executing collection uses Set instead of Array with splice",
67
+ "totalCost accumulation is safe against concurrent updates",
68
+ "No array index corruption possible when promises resolve simultaneously",
69
+ "Existing tests pass"
70
+ ]
71
+ },
72
+ {
73
+ "id": "FIX-C7",
74
+ "title": "Fix corrupt lock file permanently blocking all runs",
75
+ "description": "In src/execution/lock.ts around line 48-79, if JSON.parse(lockContent) throws on a corrupted lock file, the error propagates to the outer catch which returns false, the caller interprets this as another process is running. Fix: wrap JSON.parse in its own try-catch; treat unparseable lock files as stale and delete them.",
76
+ "complexity": "simple",
77
+ "status": "pending",
78
+ "acceptanceCriteria": [
79
+ "Corrupt/unparseable lock file is treated as stale and deleted",
80
+ "A warning is logged when a corrupt lock file is found",
81
+ "nax can start normally after encountering a corrupt lock file",
82
+ "Existing tests pass"
83
+ ]
84
+ },
85
+ {
86
+ "id": "FIX-C8",
87
+ "title": "Fix empty catch in drainWithDeadline swallowing all errors",
88
+ "description": "In src/verification/executor.ts around line 36-39, the catch block in drainWithDeadline swallows ALL exceptions including TypeError, OutOfMemoryError etc. Output silently becomes empty string with no diagnostic. Fix: narrow the catch to expected stream-destroyed errors only; log unexpected errors at debug level.",
89
+ "complexity": "simple",
90
+ "status": "pending",
91
+ "acceptanceCriteria": [
92
+ "Expected stream errors (after kill) are still silently handled",
93
+ "Unexpected errors are logged at debug level",
94
+ "Output defaults to empty string on expected stream errors",
95
+ "Existing tests pass"
96
+ ]
97
+ },
98
+ {
99
+ "id": "FIX-H16",
100
+ "title": "Fix lock file not released when setupRun fails",
101
+ "description": "In src/execution/lifecycle/run-setup.ts around line 153-193, the lock is acquired during setupRun but if setupRun fails, it is outside the runner main try block so the lock is never released. Fix: ensure lock release in a finally block within setupRun, or move lock acquisition inside the runner try/finally.",
102
+ "complexity": "medium",
103
+ "status": "pending",
104
+ "acceptanceCriteria": [
105
+ "Lock file is released when setupRun throws an error",
106
+ "Lock file is released on all error paths during setup",
107
+ "Existing tests pass"
108
+ ]
109
+ },
110
+ {
111
+ "id": "FIX-C4",
112
+ "title": "Replace uncancellable Bun.sleep timer in executor",
113
+ "description": "In src/verification/executor.ts around line 97-100, Bun.sleep() is used for the timeout promise but cannot be cancelled. When the process exits quickly, the sleep continues for the full timeoutMs. Fix: replace with a clearable setTimeout-based promise pattern, clearing the timer in the success path.",
114
+ "complexity": "simple",
115
+ "status": "pending",
116
+ "acceptanceCriteria": [
117
+ "Timeout in executeWithTimeout uses clearable setTimeout not Bun.sleep",
118
+ "Timer is cleared when process exits before timeout",
119
+ "No timer leak after successful execution",
120
+ "Existing tests pass"
121
+ ]
122
+ },
123
+ {
124
+ "id": "FIX-H5",
125
+ "title": "Add hard deadline to async signal handlers",
126
+ "description": "In src/execution/crash-recovery.ts around line 149-170, signal handlers contain multiple await operations. If any hangs, process.exit() is never reached. Fix: add a setTimeout hard deadline (e.g. 10s) at the top of each signal handler that calls process.exit() as a fallback.",
127
+ "complexity": "simple",
128
+ "status": "pending",
129
+ "acceptanceCriteria": [
130
+ "SIGTERM handler has a hard deadline timeout (10s) that calls process.exit",
131
+ "SIGINT handler has the same hard deadline",
132
+ "Hard deadline fires even if async operations hang",
133
+ "Existing tests pass"
134
+ ]
135
+ }
136
+ ]
137
+ }
package/nax/status.json CHANGED
@@ -1,27 +1,28 @@
1
1
  {
2
2
  "version": 1,
3
3
  "run": {
4
- "id": "run-2026-03-05T02-37-04-540Z",
5
- "feature": "nax-compliance",
6
- "startedAt": "2026-03-05T02:37:04.540Z",
7
- "status": "stalled",
4
+ "id": "run-2026-03-07T06-14-21-018Z",
5
+ "feature": "status-file-consolidation",
6
+ "startedAt": "2026-03-07T06:14:21.018Z",
7
+ "status": "completed",
8
8
  "dryRun": false,
9
- "pid": 814245
9
+ "pid": 217461
10
10
  },
11
11
  "progress": {
12
- "total": 1,
13
- "passed": 0,
14
- "failed": 1,
12
+ "total": 4,
13
+ "passed": 4,
14
+ "failed": 0,
15
15
  "paused": 0,
16
16
  "blocked": 0,
17
17
  "pending": 0
18
18
  },
19
19
  "cost": {
20
20
  "spent": 0,
21
- "limit": 8
21
+ "limit": 3
22
22
  },
23
23
  "current": null,
24
- "iterations": 3,
25
- "updatedAt": "2026-03-05T02:38:46.469Z",
26
- "durationMs": 101929
24
+ "iterations": 0,
25
+ "updatedAt": "2026-03-07T06:19:54.528Z",
26
+ "durationMs": 1000,
27
+ "lastHeartbeat": "2026-03-07T06:19:34.987Z"
27
28
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nathapp/nax",
3
- "version": "0.22.2",
3
+ "version": "0.22.3",
4
4
  "description": "AI Coding Agent Orchestrator \u2014 loops until done",
5
5
  "type": "module",
6
6
  "bin": {
@@ -44,4 +44,4 @@
44
44
  "tdd",
45
45
  "coding"
46
46
  ]
47
- }
47
+ }
@@ -147,6 +147,12 @@ export function installCrashHandlers(ctx: CrashRecoveryContext): () => void {
147
147
 
148
148
  // Signal handler
149
149
  const handleSignal = async (signal: NodeJS.Signals) => {
150
+ // Hard deadline: force exit if any async operation hangs (FIX-H5)
151
+ const hardDeadline = setTimeout(() => {
152
+ process.exit(128 + getSignalNumber(signal));
153
+ }, 10_000);
154
+ if (hardDeadline.unref) hardDeadline.unref();
155
+
150
156
  logger?.error("crash-recovery", `Received ${signal}, shutting down...`, { signal });
151
157
 
152
158
  // Kill all spawned agent processes
@@ -166,6 +172,7 @@ export function installCrashHandlers(ctx: CrashRecoveryContext): () => void {
166
172
  // Stop heartbeat
167
173
  stopHeartbeat();
168
174
 
175
+ clearTimeout(hardDeadline);
169
176
  // Exit cleanly
170
177
  process.exit(128 + getSignalNumber(signal));
171
178
  };
@@ -26,7 +26,7 @@ import type { PluginRegistry } from "../../plugins/registry";
26
26
  import type { PRD } from "../../prd";
27
27
  import { loadPRD } from "../../prd";
28
28
  import { installCrashHandlers } from "../crash-recovery";
29
- import { acquireLock, hookCtx } from "../helpers";
29
+ import { acquireLock, hookCtx, releaseLock } from "../helpers";
30
30
  import { PidRegistry } from "../pid-registry";
31
31
  import { StatusWriter } from "../status-writer";
32
32
 
@@ -157,48 +157,56 @@ export async function setupRun(options: RunSetupOptions): Promise<RunSetupResult
157
157
  throw new LockAcquisitionError(workdir);
158
158
  }
159
159
 
160
- // Load plugins (before try block so it's accessible in finally)
161
- const globalPluginsDir = path.join(os.homedir(), ".nax", "plugins");
162
- const projectPluginsDir = path.join(workdir, "nax", "plugins");
163
- const configPlugins = config.plugins || [];
164
- const pluginRegistry = await loadPlugins(globalPluginsDir, projectPluginsDir, configPlugins, workdir);
165
-
166
- // Log plugins loaded
167
- logger?.info("plugins", `Loaded ${pluginRegistry.plugins.length} plugins`, {
168
- plugins: pluginRegistry.plugins.map((p) => ({ name: p.name, version: p.version, provides: p.provides })),
169
- });
160
+ // Everything after lock acquisition is wrapped in try-catch to ensure
161
+ // the lock is released if any setup step fails (FIX-H16)
162
+ try {
163
+ // Load plugins (before try block so it's accessible in finally)
164
+ const globalPluginsDir = path.join(os.homedir(), ".nax", "plugins");
165
+ const projectPluginsDir = path.join(workdir, "nax", "plugins");
166
+ const configPlugins = config.plugins || [];
167
+ const pluginRegistry = await loadPlugins(globalPluginsDir, projectPluginsDir, configPlugins, workdir);
168
+
169
+ // Log plugins loaded
170
+ logger?.info("plugins", `Loaded ${pluginRegistry.plugins.length} plugins`, {
171
+ plugins: pluginRegistry.plugins.map((p) => ({ name: p.name, version: p.version, provides: p.provides })),
172
+ });
170
173
 
171
- // Log run start
172
- const routingMode = config.routing.llm?.mode ?? "hybrid";
173
- logger?.info("run.start", `Starting feature: ${feature}`, {
174
- runId,
175
- feature,
176
- workdir,
177
- dryRun,
178
- routingMode,
179
- });
174
+ // Log run start
175
+ const routingMode = config.routing.llm?.mode ?? "hybrid";
176
+ logger?.info("run.start", `Starting feature: ${feature}`, {
177
+ runId,
178
+ feature,
179
+ workdir,
180
+ dryRun,
181
+ routingMode,
182
+ });
180
183
 
181
- // Fire on-start hook
182
- await fireHook(hooks, "on-start", hookCtx(feature), workdir);
184
+ // Fire on-start hook
185
+ await fireHook(hooks, "on-start", hookCtx(feature), workdir);
183
186
 
184
- // Initialize run: check agent, reconcile state, validate limits
185
- const { initializeRun } = await import("./run-initialization");
186
- const initResult = await initializeRun({
187
- config,
188
- prdPath,
189
- workdir,
190
- dryRun,
191
- });
192
- prd = initResult.prd;
193
- const counts = initResult.storyCounts;
187
+ // Initialize run: check agent, reconcile state, validate limits
188
+ const { initializeRun } = await import("./run-initialization");
189
+ const initResult = await initializeRun({
190
+ config,
191
+ prdPath,
192
+ workdir,
193
+ dryRun,
194
+ });
195
+ prd = initResult.prd;
196
+ const counts = initResult.storyCounts;
194
197
 
195
- return {
196
- statusWriter,
197
- pidRegistry,
198
- cleanupCrashHandlers,
199
- pluginRegistry,
200
- prd,
201
- storyCounts: counts,
202
- interactionChain,
203
- };
198
+ return {
199
+ statusWriter,
200
+ pidRegistry,
201
+ cleanupCrashHandlers,
202
+ pluginRegistry,
203
+ prd,
204
+ storyCounts: counts,
205
+ interactionChain,
206
+ };
207
+ } catch (error) {
208
+ // Release lock before re-throwing so the directory isn't permanently locked
209
+ await releaseLock(workdir);
210
+ throw error;
211
+ }
204
212
  }
@@ -49,22 +49,38 @@ export async function acquireLock(workdir: string): Promise<boolean> {
49
49
  if (exists) {
50
50
  // Read lock data
51
51
  const lockContent = await lockFile.text();
52
- const lockData = JSON.parse(lockContent);
53
- const lockPid = lockData.pid;
54
-
55
- // Check if the process is still alive
56
- if (isProcessAlive(lockPid)) {
57
- // Process is alive, lock is valid
58
- return false;
52
+ let lockData: { pid: number };
53
+ try {
54
+ lockData = JSON.parse(lockContent);
55
+ } catch {
56
+ // Corrupt/unparseable lock file — treat as stale and delete
57
+ const logger = getSafeLogger();
58
+ logger?.warn("execution", "Corrupt lock file detected, removing", {
59
+ lockPath,
60
+ });
61
+ const fs = await import("node:fs/promises");
62
+ await fs.unlink(lockPath).catch(() => {});
63
+ // Fall through to create a new lock
64
+ lockData = undefined as unknown as { pid: number };
59
65
  }
60
66
 
61
- // Process is dead, remove stale lock
62
- const logger = getSafeLogger();
63
- logger?.warn("execution", "Removing stale lock", {
64
- pid: lockPid,
65
- });
66
- const fs = await import("node:fs/promises");
67
- await fs.unlink(lockPath).catch(() => {});
67
+ if (lockData) {
68
+ const lockPid = lockData.pid;
69
+
70
+ // Check if the process is still alive
71
+ if (isProcessAlive(lockPid)) {
72
+ // Process is alive, lock is valid
73
+ return false;
74
+ }
75
+
76
+ // Process is dead, remove stale lock
77
+ const logger = getSafeLogger();
78
+ logger?.warn("execution", "Removing stale lock", {
79
+ pid: lockPid,
80
+ });
81
+ const fs = await import("node:fs/promises");
82
+ await fs.unlink(lockPath).catch(() => {});
83
+ }
68
84
  }
69
85
 
70
86
  // Create lock file atomically using exclusive create (O_CREAT | O_EXCL)
@@ -180,8 +180,7 @@ async function executeParallelBatch(
180
180
  }
181
181
 
182
182
  // Execute stories in parallel with concurrency limit
183
- const executing: Promise<void>[] = [];
184
- let activeCount = 0;
183
+ const executing = new Set<Promise<void>>();
185
184
 
186
185
  for (const { story, worktreePath } of worktreeSetup) {
187
186
  const routing = routeTask(story.title, story.description, story.acceptanceCriteria, story.tags, config);
@@ -205,19 +204,13 @@ async function executeParallelBatch(
205
204
  }
206
205
  })
207
206
  .finally(() => {
208
- activeCount--;
209
- // BUG-4 fix: Remove completed promise from executing array
210
- const index = executing.indexOf(executePromise);
211
- if (index > -1) {
212
- executing.splice(index, 1);
213
- }
207
+ executing.delete(executePromise);
214
208
  });
215
209
 
216
- executing.push(executePromise);
217
- activeCount++;
210
+ executing.add(executePromise);
218
211
 
219
212
  // Wait if we've hit the concurrency limit
220
- if (activeCount >= maxConcurrency) {
213
+ if (executing.size >= maxConcurrency) {
221
214
  await Promise.race(executing);
222
215
  }
223
216
  }
@@ -148,7 +148,7 @@ export async function handlePipelineFailure(
148
148
  });
149
149
 
150
150
  if (ctx.story.attempts !== undefined && ctx.story.attempts >= ctx.config.execution.rectification.maxRetries) {
151
- pipelineEventBus.emit({
151
+ await pipelineEventBus.emitAsync({
152
152
  type: "human-review:requested",
153
153
  storyId: ctx.story.id,
154
154
  reason: pipelineResult.reason || "Max retries exceeded",
@@ -99,6 +99,9 @@ export async function run(options: RunOptions): Promise<RunResult> {
99
99
 
100
100
  const logger = getSafeLogger();
101
101
 
102
+ // Declare prd before crash handler setup to avoid TDZ if SIGTERM arrives during setupRun
103
+ let prd: Awaited<ReturnType<typeof import("./lifecycle/run-setup").setupRun>>["prd"] | undefined;
104
+
102
105
  // ── Execute initial setup phase ──────────────────────────────────────────────
103
106
  const { setupRun } = await import("./lifecycle/run-setup");
104
107
  const setupResult = await setupRun({
@@ -120,7 +123,7 @@ export async function run(options: RunOptions): Promise<RunResult> {
120
123
  getIterations: () => iterations,
121
124
  // BUG-017: Pass getters for run.complete event on SIGTERM
122
125
  getStoriesCompleted: () => storiesCompleted,
123
- getTotalStories: () => countStories(prd).total,
126
+ getTotalStories: () => (prd ? countStories(prd).total : 0),
124
127
  });
125
128
 
126
129
  const {
@@ -131,7 +134,7 @@ export async function run(options: RunOptions): Promise<RunResult> {
131
134
  storyCounts: counts,
132
135
  interactionChain,
133
136
  } = setupResult;
134
- let prd = setupResult.prd;
137
+ prd = setupResult.prd;
135
138
 
136
139
  try {
137
140
  // ── Output run header in headless mode ─────────────────────────────────
@@ -133,9 +133,11 @@ export const acceptanceStage: PipelineStage = {
133
133
  stderr: "pipe",
134
134
  });
135
135
 
136
- const exitCode = await proc.exited;
137
- const stdout = await new Response(proc.stdout).text();
138
- const stderr = await new Response(proc.stderr).text();
136
+ const [exitCode, stdout, stderr] = await Promise.all([
137
+ proc.exited,
138
+ new Response(proc.stdout).text(),
139
+ new Response(proc.stderr).text(),
140
+ ]);
139
141
 
140
142
  // Combine stdout and stderr for parsing
141
143
  const output = `${stdout}\n${stderr}`;
@@ -113,9 +113,11 @@ interface CommandResult {
113
113
  async function runCommand(cmd: string, cwd: string): Promise<CommandResult> {
114
114
  const parts = cmd.split(/\s+/);
115
115
  const proc = Bun.spawn(parts, { cwd, stdout: "pipe", stderr: "pipe" });
116
- const exitCode = await proc.exited;
117
- const stdout = await new Response(proc.stdout).text();
118
- const stderr = await new Response(proc.stderr).text();
116
+ const [exitCode, stdout, stderr] = await Promise.all([
117
+ proc.exited,
118
+ new Response(proc.stdout).text(),
119
+ new Response(proc.stderr).text(),
120
+ ]);
119
121
  return { exitCode, output: `${stdout}\n${stderr}` };
120
122
  }
121
123
 
@@ -98,6 +98,8 @@ async function callLlmOnce(modelTier: string, prompt: string, config: NaxConfig,
98
98
  reject(new Error(`LLM call timeout after ${timeoutMs}ms`));
99
99
  }, timeoutMs);
100
100
  });
101
+ // Prevent unhandled rejection if timer fires between race resolution and clearTimeout
102
+ timeoutPromise.catch(() => {});
101
103
 
102
104
  const outputPromise = (async () => {
103
105
  const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()]);
@@ -116,20 +118,16 @@ async function callLlmOnce(modelTier: string, prompt: string, config: NaxConfig,
116
118
  return result;
117
119
  } catch (err) {
118
120
  clearTimeout(timeoutId);
119
- // Silence the floating outputPromise: after kill() the proc exits non-zero,
120
- // causing outputPromise to throw. Without this, it becomes an unhandled rejection.
121
+ // Silence the floating outputPromise BEFORE killing the process.
122
+ // proc.kill() causes piped streams to error -> Response.text() rejects ->
123
+ // outputPromise rejects. The .catch() must be attached first to prevent
124
+ // an unhandled rejection that crashes nax via crash-recovery.
121
125
  outputPromise.catch(() => {});
122
- try {
123
- proc.stdout.cancel();
124
- } catch {
125
- // ignore cancel errors — stream may already be locked by Response
126
- }
127
- try {
128
- proc.stderr.cancel();
129
- } catch {
130
- // ignore cancel errors — stream may already be locked by Response
131
- }
132
126
  proc.kill();
127
+ // DO NOT call proc.stdout.cancel() / proc.stderr.cancel() here.
128
+ // The streams are locked by Response.text() readers. Per Web Streams spec,
129
+ // cancel() on a locked stream returns a rejected Promise (not a sync throw),
130
+ // which becomes an unhandled rejection. Let proc.kill() handle cleanup.
133
131
  throw err;
134
132
  }
135
133
  }
@@ -34,8 +34,16 @@ async function drainWithDeadline(proc: Subprocess, deadlineMs: number): Promise<
34
34
  if (o !== EMPTY) out += o;
35
35
  if (e !== EMPTY) out += (out ? "\n" : "") + e;
36
36
  } catch (error) {
37
- // Streams may already be destroyed - this is expected after kill
38
- // No logger available in this utility function context
37
+ // Expected: streams destroyed after kill (e.g. TypeError from closed ReadableStream)
38
+ const isExpectedStreamError =
39
+ error instanceof TypeError ||
40
+ (error instanceof Error && /abort|cancel|close|destroy|locked/i.test(error.message));
41
+ if (!isExpectedStreamError) {
42
+ const { getSafeLogger } = await import("../logger");
43
+ getSafeLogger()?.debug("executor", "Unexpected error draining process output", {
44
+ error: error instanceof Error ? error.message : String(error),
45
+ });
46
+ }
39
47
  }
40
48
  return out;
41
49
  }
@@ -93,15 +101,19 @@ export async function executeWithTimeout(
93
101
  const timeoutMs = timeoutSeconds * 1000;
94
102
 
95
103
  let timedOut = false;
104
+ const timer = { id: undefined as ReturnType<typeof setTimeout> | undefined };
96
105
 
97
- const timeoutPromise = (async () => {
98
- await Bun.sleep(timeoutMs);
99
- timedOut = true;
100
- })();
106
+ const timeoutPromise = new Promise<void>((resolve) => {
107
+ timer.id = setTimeout(() => {
108
+ timedOut = true;
109
+ resolve();
110
+ }, timeoutMs);
111
+ });
101
112
 
102
113
  const processPromise = proc.exited;
103
114
 
104
115
  const raceResult = await Promise.race([processPromise, timeoutPromise]);
116
+ clearTimeout(timer.id);
105
117
 
106
118
  if (timedOut) {
107
119
  const pid = proc.pid;
@@ -283,9 +283,9 @@ describe("acquireLock and releaseLock", () => {
283
283
  // Create invalid JSON lock file
284
284
  await Bun.write(lockPath, "not valid json");
285
285
 
286
- // Should fail to acquire but not crash
286
+ // Should treat corrupt lock as stale and acquire successfully
287
287
  const acquired = await acquireLock(testDir);
288
- expect(acquired).toBe(false);
288
+ expect(acquired).toBe(true);
289
289
  });
290
290
 
291
291
  test("handles release when lock file doesn't exist", async () => {
@@ -112,17 +112,15 @@ afterEach(() => {
112
112
  resetLogger();
113
113
  });
114
114
 
115
- describe("BUG-039: callLlmOnce stream drain on timeout", () => {
116
- test("cancels stdout and stderr before proc.kill() on timeout", async () => {
117
- const { proc, stdoutCancelled, stderrCancelled, killCalled, killCalledAfterCancel } = makeHangingProc();
115
+ describe("BUG-039/BUG-040: stream cleanup on timeout", () => {
116
+ test("kills process on timeout without calling cancel() on locked streams", async () => {
117
+ const { proc, stdoutCancelled, stderrCancelled, killCalled } = makeHangingProc();
118
118
 
119
119
  const originalSpawn = _deps.spawn;
120
120
  _deps.spawn = mock(() => proc as PipedProc);
121
121
 
122
122
  const config = makeConfig({ timeoutMs: 30 });
123
123
 
124
- // Import callLlmOnce indirectly through llmStrategy to trigger the private function.
125
- // We test via the exported llmStrategy.route() which calls callLlm → callLlmOnce.
126
124
  const { llmStrategy } = await import("../../../../src/routing/strategies/llm");
127
125
 
128
126
  const story = {
@@ -147,15 +145,72 @@ describe("BUG-039: callLlmOnce stream drain on timeout", () => {
147
145
  // Should resolve promptly — within 500ms of the 30ms timeout
148
146
  expect(elapsed).toBeLessThan(500);
149
147
 
150
- expect(stdoutCancelled.value).toBe(true);
151
- expect(stderrCancelled.value).toBe(true);
148
+ // BUG-040: cancel() must NOT be called on locked streams — it returns a rejected
149
+ // Promise (per Web Streams spec) which becomes an unhandled rejection crash.
150
+ expect(stdoutCancelled.value).toBe(false);
151
+ expect(stderrCancelled.value).toBe(false);
152
152
  expect(killCalled.value).toBe(true);
153
- // kill() was called after both streams were cancelled
154
- expect(killCalledAfterCancel.value).toBe(true);
155
153
 
156
154
  _deps.spawn = originalSpawn;
157
155
  });
158
156
 
157
+ test("no unhandled rejection when Response.text() locks streams and proc is killed", async () => {
158
+ // Simulate the exact BUG-040 scenario:
159
+ // 1. Spawn proc with piped streams
160
+ // 2. Response(proc.stdout).text() locks the streams
161
+ // 3. Timeout fires, proc.kill() called
162
+ // 4. No unhandled rejection should occur
163
+
164
+ const unhandledRejections: Error[] = [];
165
+ const handler = (event: PromiseRejectionEvent) => {
166
+ unhandledRejections.push(event.reason as Error);
167
+ event.preventDefault();
168
+ };
169
+
170
+ // biome-ignore lint/suspicious/noGlobalAssign: test-only override
171
+ globalThis.addEventListener("unhandledrejection", handler);
172
+
173
+ // Create a proc where streams are locked by Response readers
174
+ const stdout = new ReadableStream({ start() {} });
175
+ const stderr = new ReadableStream({ start() {} });
176
+ const proc = {
177
+ stdout,
178
+ stderr,
179
+ exited: new Promise<number>(() => {}),
180
+ kill: mock(() => {}),
181
+ };
182
+
183
+ const originalSpawn = _deps.spawn;
184
+ _deps.spawn = mock(() => proc as PipedProc);
185
+
186
+ const config = makeConfig({ timeoutMs: 20, retries: 0 });
187
+
188
+ const { llmStrategy } = await import("../../../../src/routing/strategies/llm");
189
+ const story = {
190
+ id: "BUG040",
191
+ title: "Bug test",
192
+ description: "Test",
193
+ acceptanceCriteria: ["AC1"],
194
+ tags: [],
195
+ dependencies: [],
196
+ status: "pending" as const,
197
+ passes: false,
198
+ escalations: [],
199
+ attempts: 0,
200
+ };
201
+
202
+ await expect(llmStrategy.route(story, { config })).rejects.toThrow(/timeout/i);
203
+
204
+ // Give microtasks time to settle
205
+ await Bun.sleep(50);
206
+
207
+ globalThis.removeEventListener("unhandledrejection", handler);
208
+ _deps.spawn = originalSpawn;
209
+
210
+ // No unhandled rejections should have occurred
211
+ expect(unhandledRejections).toHaveLength(0);
212
+ });
213
+
159
214
  test("clearTimeout is called on success path (no resource leak)", async () => {
160
215
  const originalSpawn = _deps.spawn;
161
216