pi-crew 0.2.19 → 0.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/CHANGELOG.md +23 -10
  2. package/README.md +4 -2
  3. package/docs/PROJECT_REVIEW.md +271 -0
  4. package/docs/PROJECT_REVIEW_FIXES.md +343 -0
  5. package/docs/PROJECT_REVIEW_ROUND4.md +156 -0
  6. package/docs/PROJECT_REVIEW_ROUND5.md +86 -0
  7. package/docs/fixes/BATCH_A_H1_H2.md +86 -0
  8. package/docs/fixes/bug-006-foreground-cancel-concurrent.md +78 -0
  9. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +112 -0
  10. package/docs/fixes/bug-008-child-process-silent-timeout.md +100 -0
  11. package/docs/fixes/bug-009-executor-yield-limit-needs-attention.md +75 -0
  12. package/docs/fixes/bug-010-child-process-api-key-filtered.md +109 -0
  13. package/docs/fixes/bug-011-spawn-pi-enoent.md +92 -0
  14. package/docs/fixes/bug-012-essential-env-stripped.md +89 -0
  15. package/docs/fixes/bug-013-background-runner-death.md +84 -0
  16. package/docs/fixes/bug-014-infinite-retry-loop-needs-attention.md +82 -0
  17. package/docs/fixes/bug-015-background-runner-sigterm.md +65 -0
  18. package/docs/fixes/bug-017-background-runner-session-shutdown.md +66 -0
  19. package/docs/fixes/bug-017-background-runner-sigkill-double-fork.md +28 -0
  20. package/docs/fixes/bug-018-child-pi-worker-stdin-hang.md +61 -0
  21. package/docs/fixes/bug-019-phantom-runs-temp-workspace.md +52 -0
  22. package/docs/pi-crew-bugs.md +954 -0
  23. package/docs/pi-crew-investigation-report.md +411 -0
  24. package/docs/pi-crew-test-final.md +120 -0
  25. package/docs/pi-crew-test-results.md +260 -0
  26. package/docs/pi-crew-test-round2.md +136 -0
  27. package/docs/pi-crew-test-round4.md +100 -0
  28. package/docs/pi-crew-test-round5.md +70 -0
  29. package/docs/pi-crew-test-round6.md +110 -0
  30. package/docs/usage.md +14 -0
  31. package/package.json +7 -5
  32. package/src/adapters/export-util.ts +12 -6
  33. package/src/agents/agent-config.ts +2 -0
  34. package/src/config/defaults.ts +1 -1
  35. package/src/config/markers.ts +22 -17
  36. package/src/config/resilient-parser.ts +1 -1
  37. package/src/extension/async-notifier.ts +4 -2
  38. package/src/extension/management.ts +52 -0
  39. package/src/extension/register.ts +47 -10
  40. package/src/extension/run-index.ts +20 -2
  41. package/src/extension/run-maintenance.ts +2 -2
  42. package/src/extension/team-tool/parallel-dispatch.ts +1 -1
  43. package/src/extension/team-tool/run.ts +3 -6
  44. package/src/extension/team-tool.ts +67 -11
  45. package/src/observability/event-to-metric.ts +2 -1
  46. package/src/runtime/async-runner.ts +42 -34
  47. package/src/runtime/background-runner.ts +165 -7
  48. package/src/runtime/child-pi.ts +111 -18
  49. package/src/runtime/code-summary.ts +1 -1
  50. package/src/runtime/crash-recovery.ts +1 -1
  51. package/src/runtime/crew-agent-runtime.ts +2 -1
  52. package/src/runtime/heartbeat-watcher.ts +4 -0
  53. package/src/runtime/live-agent-manager.ts +1 -1
  54. package/src/runtime/live-session-runtime.ts +2 -1
  55. package/src/runtime/manifest-cache.ts +2 -2
  56. package/src/runtime/model-fallback.ts +2 -1
  57. package/src/runtime/phase-progress.ts +1 -1
  58. package/src/runtime/pi-args.ts +3 -1
  59. package/src/runtime/pi-spawn.ts +6 -0
  60. package/src/runtime/prose-compressor.ts +1 -1
  61. package/src/runtime/result-extractor.ts +0 -1
  62. package/src/runtime/retry-executor.ts +1 -1
  63. package/src/runtime/runtime-resolver.ts +1 -1
  64. package/src/runtime/skill-instructions.ts +0 -1
  65. package/src/runtime/stale-reconciler.ts +30 -3
  66. package/src/runtime/subagent-manager.ts +2 -0
  67. package/src/runtime/task-display.ts +1 -1
  68. package/src/runtime/task-graph-scheduler.ts +1 -1
  69. package/src/runtime/task-runner/tail-read.ts +26 -0
  70. package/src/runtime/task-runner.ts +1007 -383
  71. package/src/runtime/team-runner.ts +9 -5
  72. package/src/runtime/worker-startup.ts +3 -1
  73. package/src/schema/team-tool-schema.ts +2 -1
  74. package/src/state/active-run-registry.ts +8 -2
  75. package/src/state/atomic-write.ts +17 -0
  76. package/src/state/contracts.ts +5 -2
  77. package/src/state/event-log-rotation.ts +118 -31
  78. package/src/state/event-log.ts +33 -5
  79. package/src/state/event-reconstructor.ts +4 -2
  80. package/src/state/mailbox.ts +5 -1
  81. package/src/state/schedule.ts +146 -0
  82. package/src/state/types.ts +40 -0
  83. package/src/state/usage.ts +20 -0
  84. package/src/ui/crew-widget.ts +2 -2
  85. package/src/ui/run-event-bus.ts +1 -1
  86. package/src/ui/run-snapshot-cache.ts +2 -1
  87. package/src/ui/snapshot-types.ts +1 -0
  88. package/src/utils/gh-protocol.ts +2 -2
  89. package/src/utils/names.ts +1 -1
  90. package/src/utils/sse-parser.ts +0 -2
  91. package/src/worktree/branch-freshness.ts +1 -1
  92. package/src/worktree/cleanup.ts +54 -14
  93. package/src/worktree/worktree-manager.ts +19 -9
@@ -1,4 +1,4 @@
1
- import { spawn, type SpawnOptions } from "node:child_process";
1
+ import { spawn } from "node:child_process";
2
2
  import { createRequire } from "node:module";
3
3
  import * as fs from "node:fs";
4
4
  import * as path from "node:path";
@@ -102,14 +102,20 @@ export function getBackgroundRunnerCommand(
102
102
  ): { args: string[]; loader: "jiti" | "strip-types" } {
103
103
  const loader = normalizeLoaderInput(loaderInput);
104
104
  if (!loader) throw new Error(buildLoaderUnavailableMessage(packageRootFromRuntime()));
105
+ // Limit V8 heap to 512MB for the background runner to avoid triggering the
106
+ // Linux OOM killer. The runner itself is lightweight — it delegates work to
107
+ // child Pi processes — so 512MB is generous. Without this limit, Node.js
108
+ // defaults to ~1.5GB on 64-bit systems, which combined with jiti compilation
109
+ // and child processes can exhaust system memory.
110
+ const memoryLimit = "--max-old-space-size=512";
105
111
  if (loader.kind === "jiti") {
106
112
  return {
107
- args: ["--import", pathToFileURL(loader.path).href, runnerPath, "--cwd", cwd, "--run-id", runId],
113
+ args: [memoryLimit, "--trace-uncaught", "--import", pathToFileURL(loader.path).href, runnerPath, "--cwd", cwd, "--run-id", runId],
108
114
  loader: "jiti",
109
115
  };
110
116
  }
111
117
  return {
112
- args: ["--experimental-strip-types", runnerPath, "--cwd", cwd, "--run-id", runId],
118
+ args: [memoryLimit, "--experimental-strip-types", runnerPath, "--cwd", cwd, "--run-id", runId],
113
119
  loader: "strip-types",
114
120
  };
115
121
  }
@@ -119,42 +125,44 @@ export interface SpawnBackgroundTeamRunResult {
119
125
  logPath: string;
120
126
  }
121
127
 
122
- export function buildBackgroundSpawnOptions(manifest: TeamRunManifest, logFd: number): SpawnOptions {
128
+ export async function spawnBackgroundTeamRun(manifest: TeamRunManifest): Promise<SpawnBackgroundTeamRunResult> {
129
+ const runnerPath = path.join(path.dirname(fileURLToPath(import.meta.url)), "background-runner.ts");
130
+ const logPath = path.join(manifest.stateRoot, "background.log");
131
+ fs.mkdirSync(manifest.stateRoot, { recursive: true });
132
+
123
133
  // NOTE: Do NOT set PI_CREW_PARENT_PID for the background runner.
124
- // The background runner is a top-level worker spawned by the team tool.
125
- // When the team tool finishes, its process exits, and the background runner
126
- // would incorrectly detect a "dead parent" and self-terminate.
127
- // Child workers spawned BY the background runner will have the background
128
- // runner as their parent, so they correctly die when the runner exits.
129
134
  const { PI_CREW_PARENT_PID: _, ...envWithoutParentPid } = process.env;
130
- return {
135
+
136
+ const loader = resolveTypeScriptLoader();
137
+ if (!loader) {
138
+ const message = buildLoaderUnavailableMessage(packageRootFromRuntime());
139
+ appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
140
+ throw new Error(message);
141
+ }
142
+ const command = getBackgroundRunnerCommand(runnerPath, manifest.cwd, manifest.runId, loader);
143
+ fs.appendFileSync(logPath, `[pi-crew] background loader=${command.loader}\n`, "utf-8");
144
+
145
+ // Spawn the background runner as a fully detached process with its own session.
146
+ // BUG #17 FIX: setsid:true + detached:true creates a process that:
147
+ // 1. Has its own session (SID = PID) — immune to terminal/SIGTERM signals
148
+ // 2. Is detached (unref'd) — parent exit doesn't affect it
149
+ // 3. Has its own process group (PGID = PID) — process group kills don't reach it
150
+ //
151
+ // IMPORTANT: session_shutdown handlers must NOT kill async runners.
152
+ // See register.ts cleanupRuntime — the kill loop was commented out.
153
+ const child = spawn(process.execPath, command.args, {
131
154
  cwd: manifest.cwd,
132
155
  detached: true,
133
- stdio: ["ignore", logFd, logFd],
156
+ setsid: true as any,
157
+ stdio: ["ignore", "pipe", "pipe"],
134
158
  env: envWithoutParentPid,
135
159
  windowsHide: true,
136
- };
137
- }
160
+ } as any) as any;
161
+ child.on("error", (error: Error) => {
162
+ console.error(`[pi-crew] async spawn failed: ${error.message}`);
163
+ });
164
+ child.unref();
138
165
 
139
- export function spawnBackgroundTeamRun(manifest: TeamRunManifest): SpawnBackgroundTeamRunResult {
140
- const runnerPath = path.join(path.dirname(fileURLToPath(import.meta.url)), "background-runner.ts");
141
- const logPath = path.join(manifest.stateRoot, "background.log");
142
- fs.mkdirSync(manifest.stateRoot, { recursive: true });
143
- const logFd = fs.openSync(logPath, "a");
144
- try {
145
- const loader = resolveTypeScriptLoader();
146
- if (!loader) {
147
- const message = buildLoaderUnavailableMessage(packageRootFromRuntime());
148
- appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
149
- throw new Error(message);
150
- }
151
- const command = getBackgroundRunnerCommand(runnerPath, manifest.cwd, manifest.runId, loader);
152
- fs.appendFileSync(logPath, `[pi-crew] background loader=${command.loader}\n`, "utf-8");
153
- const child = spawn(process.execPath, command.args, buildBackgroundSpawnOptions(manifest, logFd));
154
- child.unref();
155
-
156
- return { pid: child.pid, logPath };
157
- } finally {
158
- fs.closeSync(logFd);
159
- }
166
+ return { pid: child.pid, logPath };
160
167
  }
168
+
@@ -24,6 +24,34 @@ import { expandParallelResearchWorkflow } from "./parallel-research.ts";
24
24
  import { writeAsyncStartMarker } from "./async-marker.ts";
25
25
  import { startParentGuard, stopParentGuard } from "./parent-guard.ts";
26
26
 
27
+ /**
28
+ * Heartbeat mechanism: periodically write a heartbeat file so the stale reconciler
29
+ * can distinguish "process died" from "process still alive but quiet".
30
+ * Without this, the reconciler relies solely on process.kill(pid, 0) which can
31
+ * false-positive when a process is SIGKILLed and the PID hasn't been recycled yet.
32
+ */
33
+ function startHeartbeat(stateRoot: string, eventsPath: string, runId: string): () => void {
34
+ const heartbeatPath = path.join(stateRoot, "heartbeat.json");
35
+ const writeHeartbeat = (): void => {
36
+ try {
37
+ const mem = process.memoryUsage();
38
+ fs.writeFileSync(heartbeatPath, JSON.stringify({
39
+ pid: process.pid,
40
+ at: Date.now(),
41
+ runId,
42
+ memory: { heapUsedMb: Math.round(mem.heapUsed / 1024 / 1024), rssMb: Math.round(mem.rss / 1024 / 1024) },
43
+ }), "utf-8");
44
+ } catch {
45
+ /* ignore — best-effort */
46
+ }
47
+ };
48
+ // Write immediately so the stale reconciler can use heartbeat age as liveness evidence.
49
+ writeHeartbeat();
50
+ const interval = setInterval(writeHeartbeat, 15_000);
51
+ interval.unref();
52
+ return () => clearInterval(interval);
53
+ }
54
+
27
55
  /**
28
56
  * Remove macOS malloc-stack-logging vars that get inherited by child shells.
29
57
  * Without this, every subprocess prints "MallocStackLogging: can't turn off..." to stderr.
@@ -69,28 +97,111 @@ function setupUnhandledRejectionGuard(state: { cwd?: string; runId?: string; eve
69
97
  process.on("unhandledRejection", (reason, promise) => {
70
98
  const message = reason instanceof Error ? reason.message : String(reason);
71
99
  console.error("[background-runner] UNHANDLED REJECTION:", reason);
100
+ console.error("[background-runner] Stack:", reason instanceof Error ? reason.stack : "N/A");
72
101
  try {
73
- // Try to write async.failed event if we have the necessary state
74
102
  if (state.eventsPath && state.runId) {
75
103
  appendEvent(state.eventsPath, {
76
104
  type: "async.failed",
77
105
  runId: state.runId,
78
106
  message: `Unhandled rejection: ${message}`,
79
- data: { reason: String(reason), handled: false },
107
+ data: { reason: String(reason), stack: reason instanceof Error ? reason.stack : undefined, handled: false },
80
108
  });
81
109
  }
82
110
  } catch (appendErr) {
83
111
  console.error("[background-runner] Failed to write async.failed event:", appendErr);
84
112
  }
113
+ // BUG #17 FIX: Do NOT call process.exit() here. Previously, unhandled
114
+ // rejection from child Pi workers would kill the entire background runner.
115
+ // Instead, set exitCode and let the run complete normally.
85
116
  process.exitCode = 1;
86
- // Give async operations a moment to flush before exit
87
- setTimeout(() => process.exit(1), 100);
88
117
  });
89
118
  }
90
119
 
91
120
  async function main(): Promise<void> {
121
+ // Redirect console to background.log since stdio is "ignore" in detached mode.
122
+ // Must be BEFORE any console.log/console.error calls.
123
+ const _cwd = argValue("--cwd");
124
+ const _runId = argValue("--run-id");
125
+ if (_cwd && _runId) {
126
+ try {
127
+ const logPath = path.join(_cwd, ".crew/state/runs", _runId, "background.log");
128
+ const logFd = fs.openSync(logPath, "a");
129
+ const origWrite = (prefix: string) => (data: any, ...args: any[]) => {
130
+ const msg = [data, ...args].map(String).join(" ") + "\n";
131
+ fs.writeSync(logFd, msg);
132
+ };
133
+ console.log = origWrite("OUT");
134
+ console.error = origWrite("ERR");
135
+ } catch { /* best-effort */ }
136
+ }
137
+
92
138
  // Scrub macOS malloc vars BEFORE anything else — must be clean for all child processes
93
139
  scrubProcessEnv();
140
+ // Install signal handlers EARLY — log events before exiting so we can distinguish
141
+ // OOM/SIGKILL (no event) from SIGTERM/SIGINT (event written).
142
+ const signalLog = (sig: string): void => {
143
+ const cwd = argValue("--cwd");
144
+ const runId = argValue("--run-id");
145
+ if (cwd && runId) {
146
+ const loaded = loadRunManifestById(cwd, runId);
147
+ if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.failed", runId, message: `Background runner received ${sig} — exiting.`, data: { signal: sig, pid: process.pid } });
148
+ }
149
+ };
150
+ // BUG #17 DIAGNOSTIC: Write exit code to file for debugging.
151
+ process.on("exit", (code) => {
152
+ try {
153
+ require("node:fs").appendFileSync(
154
+ manifest.stateRoot + '/exit-code.txt',
155
+ `${new Date().toISOString()} exit_code=${code} pid=${process.pid}\n`
156
+ );
157
+ } catch {}
158
+ });
159
+
160
+ process.on("SIGTERM", () => {
161
+ // BUG #17 FIX: Ignore SIGTERM.
162
+ // IMPORTANT: Perform real I/O here to flush io_uring state after EINTR.
163
+ // Without I/O, io_uring can enter corrupted state and cause silent crash.
164
+ const cwd = argValue("--cwd");
165
+ const runId = argValue("--run-id");
166
+ if (cwd && runId) {
167
+ try {
168
+ const loaded = loadRunManifestById(cwd, runId);
169
+ if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.sigterm_ignored", runId, message: `SIGTERM ignored pid=${process.pid}`, data: { pid: process.pid, ppid: process.ppid } });
170
+ } catch { /* best-effort */ }
171
+ }
172
+ });
173
+ process.on("SIGINT", () => { signalLog("SIGINT"); process.exit(130); });
174
+ // BUG #17: Catch ALL signals to identify what kills the background runner
175
+ for (const sig of ["SIGHUP", "SIGUSR1", "SIGUSR2", "SIGPIPE", "SIGALRM", "SIGPROF", "SIGIO", "SIGPWR", "SIGSYS", "SIGURG", "SIGWINCH", "SIGCONT", "SIGTSTP", "SIGTTIN", "SIGTTOU", "SIGVTALRM", "SIGXCPU", "SIGXFSZ"] as const) {
176
+ try {
177
+ process.on(sig, () => {
178
+ signalLog(sig);
179
+ try {
180
+ const loaded = loadRunManifestById(cwd!, runId!);
181
+ if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.signal", runId: runId!, message: `Background runner received ${sig}`, data: { signal: sig, pid: process.pid } });
182
+ } catch { /* best-effort */ }
183
+ });
184
+ } catch { /* some signals not supported on this platform */ }
185
+ }
186
+ // Hook Node.js abort — if process.exit is called with code 1 (uncaught exception, assert failure)
187
+ // we log it before exiting so it appears in background.log
188
+ const origExit = process.exit.bind(process);
189
+ // Intercept all exit(code) calls to log them as async.exit events before exiting.
190
+ // This surfaces uncaught exceptions / early exits that would otherwise vanish silently.
191
+ process.exit = ((code?: number | string): never => {
192
+ const cwd2 = argValue("--cwd");
193
+ const runId2 = argValue("--run-id");
194
+ const codeStr = code === undefined ? "<none>" : String(code);
195
+ if (cwd2 && runId2) {
196
+ try {
197
+ const loaded = loadRunManifestById(cwd2, runId2);
198
+ if (loaded) {
199
+ appendEvent(loaded.manifest.eventsPath, { type: "async.exit", runId: runId2, message: `Background runner exit(${codeStr}) pid=${process.pid}`, data: { code, pid: process.pid } });
200
+ }
201
+ } catch { /* best-effort */ }
202
+ }
203
+ return origExit(code);
204
+ }) as typeof process.exit;
94
205
 
95
206
  // Start parent guard FIRST — if parent is already dead, exit immediately
96
207
  const parentPid = Number(process.env.PI_CREW_PARENT_PID);
@@ -100,27 +211,52 @@ async function main(): Promise<void> {
100
211
  const runId = argValue("--run-id");
101
212
  if (!cwd || !runId) throw new Error("Usage: background-runner.ts --cwd <cwd> --run-id <runId>");
102
213
 
214
+ // Log PGID and SID for debugging process group isolation
215
+ try {
216
+ const stat = fs.readFileSync("/proc/self/stat", "utf8").split(" ");
217
+ console.log(`[background-runner] DEBUG: pid=${process.pid} ppid=${process.ppid} pgid=${stat[4]} sid=${stat[5]} cwd=${cwd} runId=${runId}`);
218
+ } catch {
219
+ console.log(`[background-runner] DEBUG: pid=${process.pid} ppid=${process.ppid} cwd=${cwd} runId=${runId}`);
220
+ }
221
+
103
222
  const loaded = loadRunManifestById(cwd, runId);
104
223
  if (!loaded) throw new Error(`Run '${runId}' not found.`);
105
224
  let { manifest, tasks } = loaded;
106
225
 
226
+ console.log(`[background-runner] DEBUG: manifest loaded, eventsPath=${manifest.eventsPath}`);
227
+
107
228
  // Setup unhandled rejection guard EARLY — must be before any async operations
108
229
  // that might produce unhandled rejections during cleanup.
109
230
  const rejectionGuardState = { cwd, runId, eventsPath: loaded.manifest.eventsPath };
110
231
  setupUnhandledRejectionGuard(rejectionGuardState);
111
232
 
112
233
  appendEvent(manifest.eventsPath, { type: "async.started", runId: manifest.runId, data: { pid: process.pid } });
234
+ console.log(`[background-runner] DEBUG: async.started written, pid=${process.pid}`);
113
235
  writeAsyncStartMarker(manifest, { pid: process.pid, startedAt: new Date().toISOString() });
236
+ const stopHeartbeat = startHeartbeat(manifest.stateRoot, manifest.eventsPath, manifest.runId);
114
237
  const stopInterruptGuard = startInterruptGuard(manifest);
238
+ console.log(`[background-runner] DEBUG: heartbeat+interrupt guard started`);
239
+ // BUG #17: Keep-alive interval prevents event loop from exiting during
240
+ // jiti compilation. Pure empty interval (no I/O to avoid io_uring issues).
241
+ const keepAlive = setInterval(() => {}, 5000);
242
+
115
243
 
116
244
  try {
245
+ console.log(`[background-runner] DEBUG: about to call discoverAgents`);
117
246
  const agents = allAgents(discoverAgents(cwd));
247
+ console.log(`[background-runner] DEBUG: discoverAgents done, ${agents.length} agents`);
248
+ fs.fsyncSync(fs.openSync(manifest.eventsPath, "a")); // FORCE flush so we see this before death
249
+ console.log(`[background-runner] DEBUG: calling directTeamAndWorkflowFromRun`);
118
250
  const direct = directTeamAndWorkflowFromRun(manifest, tasks, agents);
251
+ console.log(`[background-runner] DEBUG: direct done, finding team`);
119
252
  const team = direct?.team ?? allTeams(discoverTeams(cwd)).find((candidate) => candidate.name === manifest.team);
120
253
  if (!team) throw new Error(`Team '${manifest.team}' not found.`);
254
+ console.log(`[background-runner] DEBUG: team=${team.name}, finding workflow`);
121
255
  const baseWorkflow = direct?.workflow ?? allWorkflows(discoverWorkflows(cwd)).find((candidate) => candidate.name === manifest.workflow);
122
256
  if (!baseWorkflow) throw new Error(`Workflow '${manifest.workflow ?? ""}' not found.`);
257
+ console.log(`[background-runner] DEBUG: workflow=${baseWorkflow.name}`);
123
258
  const workflow = expandParallelResearchWorkflow(baseWorkflow, cwd);
259
+ console.log(`[background-runner] DEBUG: loading config`);
124
260
  const loadedConfig = loadConfig(cwd);
125
261
  const runConfig = manifest.runConfig && typeof manifest.runConfig === "object" && !Array.isArray(manifest.runConfig) ? manifest.runConfig as typeof loadedConfig.config : loadedConfig.config;
126
262
  const runtime = manifest.runtimeResolution ? { kind: manifest.runtimeResolution.kind, requestedMode: manifest.runtimeResolution.requestedMode, available: manifest.runtimeResolution.available, fallback: manifest.runtimeResolution.fallback, steer: manifest.runtimeResolution.kind === "live-session", resume: manifest.runtimeResolution.kind === "live-session", liveToolActivity: manifest.runtimeResolution.kind === "live-session", transcript: manifest.runtimeResolution.kind !== "scaffold", reason: manifest.runtimeResolution.reason, safety: manifest.runtimeResolution.safety } : await resolveCrewRuntime(runConfig);
@@ -130,12 +266,27 @@ async function main(): Promise<void> {
130
266
  appendEvent(manifest.eventsPath, { type: "runtime.resolved", runId: manifest.runId, message: `Runtime resolved: ${runtime.kind} safety=${runtime.safety}`, data: { runtimeResolution, async: true } });
131
267
  if (runtime.safety === "blocked") throw new Error(runtime.reason ?? "Child worker execution is disabled; refusing to create no-op scaffold subagents.");
132
268
  const executeWorkers = runtime.kind !== "scaffold";
133
- // Use ownerSessionId for workspaceId to ensure agents are only visible to the session that spawned them.
269
+ // Use ownerSessionId for workspaceId to ensure agents are only visible to the session that spawned them.
134
270
  // manifest.cwd would cause cross-session visibility since all sessions share the same project directory.
135
- const result = await executeTeamRun({ manifest, tasks, team, workflow, agents, executeWorkers, limits: runConfig.limits, runtime, runtimeConfig: runConfig.runtime, skillOverride: manifest.skillOverride, reliability: runConfig.reliability, workspaceId: manifest.ownerSessionId ?? manifest.cwd });
271
+ // Mark this as background mode so task-runner writes events to background.log for debugging.
272
+ process.env.PI_CREW_BACKGROUND_MODE = "1";
273
+ // BUG #17: Keep-alive interval (NOT unref'd) prevents event loop from exiting
274
+ // during jiti compilation of team-runner.ts. Without this, the event loop
275
+ // can drain when import() blocks, causing the process to exit prematurely.
276
+ console.log(`[background-runner] DEBUG: calling executeTeamRun`);
277
+ let result;
278
+ try {
279
+ result = await executeTeamRun({ manifest, tasks, team, workflow, agents, executeWorkers, limits: runConfig.limits, runtime, runtimeConfig: runConfig.runtime, skillOverride: manifest.skillOverride, reliability: runConfig.reliability, workspaceId: manifest.ownerSessionId ?? manifest.cwd });
280
+ console.log(`[background-runner] DEBUG: executeTeamRun returned, status=${result.manifest.status}`);
281
+ } catch (execError) {
282
+ console.log(`[background-runner] DEBUG: executeTeamRun THREW: ${execError instanceof Error ? execError.message : String(execError)}`);
283
+ console.log(`[background-runner] DEBUG: stack: ${execError instanceof Error ? execError.stack : "N/A"}`);
284
+ throw execError;
285
+ }
136
286
  manifest = result.manifest;
137
287
  tasks = result.tasks;
138
288
  appendEvent(manifest.eventsPath, { type: "async.completed", runId: manifest.runId, data: { status: manifest.status, tasks: tasks.length } });
289
+ console.log(`[background-runner] DEBUG: async.completed written, status=${manifest.status}`);
139
290
  if (manifest.status === "failed" || manifest.status === "cancelled" || manifest.status === "blocked") process.exitCode = 1;
140
291
  } catch (error) {
141
292
  // Terminate live agents on failure too — agents are done when the run fails
@@ -151,10 +302,17 @@ async function main(): Promise<void> {
151
302
  manifest = updateRunStatus(manifest, "failed", message);
152
303
  appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
153
304
  process.exitCode = 1;
305
+ console.log(`[background-runner] DEBUG: catch block, error=${error instanceof Error ? error.message : String(error)}`);
154
306
  } finally {
307
+ console.log(`[background-runner] DEBUG: finally block, exitCode=${process.exitCode}`);
155
308
  stopInterruptGuard();
156
309
  stopParentGuard();
310
+ stopHeartbeat();
311
+ clearInterval(keepAlive);
157
312
  }
158
313
  }
159
314
 
160
- await main();
315
+ await main().catch((err) => {
316
+ console.error(`[background-runner] DEBUG: main() uncaught: ${err?.message ?? err}`);
317
+ process.exit(1);
318
+ });
@@ -115,6 +115,8 @@ export interface ChildPiLifecycleEvent {
115
115
  exitCode?: number | null;
116
116
  /** Error message for error events. */
117
117
  error?: string;
118
+ /** Stderr captured at timeout moment (for response_timeout events). */
119
+ stderr?: string;
118
120
  /** Timestamp (ISO). */
119
121
  ts: string;
120
122
  }
@@ -136,6 +138,10 @@ export interface ChildPiRunInput {
136
138
  finalDrainMs?: number;
137
139
  hardKillMs?: number;
138
140
  responseTimeoutMs?: number;
141
+ /** Soft limit on assistant turns — inject steer at this count. */
142
+ maxTurns?: number;
143
+ /** Extra turns after soft limit before hard abort. Default: 5. */
144
+ graceTurns?: number;
139
145
  }
140
146
 
141
147
  export interface ChildPiRunResult {
@@ -144,18 +150,57 @@ export interface ChildPiRunResult {
144
150
  stderr: string;
145
151
  error?: string;
146
152
  exitStatus?: WorkerExitStatus;
153
+ /** True if the agent was hard-aborted (max_turns + grace exceeded). */
154
+ aborted?: boolean;
155
+ /** True if the agent was steered to wrap up (hit soft turn limit) but finished in time. */
156
+ steered?: boolean;
147
157
  }
148
158
 
149
159
  export function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): SpawnOptions {
150
- // Filter out env vars whose keys match secret patterns to avoid leaking credentials to child processes
151
- const filteredEnv = sanitizeEnvSecrets(env);
160
+ // Filter out env vars whose keys match secret patterns to avoid leaking credentials to child processes.
161
+ // IMPORTANT: preserve model provider API keys — they are needed by the child Pi to call the LLM.
162
+ // Also preserve essential non-secret vars (PATH, HOME, USER, etc.) so the child process can function.
163
+ // Bug #10 fix: allow-list preserves model provider keys.
164
+ // Bug #12 fix: essential env vars (PATH, HOME, etc.) are always preserved so child can find npm/node.
165
+ const filteredEnv = sanitizeEnvSecrets(env, {
166
+ allowList: [
167
+ // Model provider API keys (these are safe to pass — they're meant for API calls)
168
+ "MINIMAX_*",
169
+ "OPENAI_*",
170
+ "ANTHROPIC_*",
171
+ "GOOGLE_*",
172
+ "AZURE_*",
173
+ "AWS_*",
174
+ "ZEU_*",
175
+ "ZERODEV_*",
176
+ "*_API_KEY",
177
+ "*_TOKEN",
178
+ "*_SECRET",
179
+ // Essential non-secret vars for child process to function
180
+ "PATH",
181
+ "HOME",
182
+ "USER",
183
+ "SHELL",
184
+ "TERM",
185
+ "LANG",
186
+ "LC_*",
187
+ "XDG_*",
188
+ "NVM_*",
189
+ "NODE_*",
190
+ "npm_*",
191
+ "PI_*",
192
+ "PI_CREW_*",
193
+ "PI_TEAMS_*",
194
+ ],
195
+ });
152
196
  return {
153
197
  cwd,
154
198
  env: { ...filteredEnv, PI_CREW_PARENT_PID: String(process.pid) },
155
- stdio: ["ignore", "pipe", "pipe"],
199
+ stdio: ["ignore", "pipe", "pipe"], // stdin=ignore: child doesn't wait for input; task comes via CLI args
156
200
  detached: process.platform !== "win32",
201
+ setsid: true,
157
202
  windowsHide: true,
158
- };
203
+ } as SpawnOptions;
159
204
  }
160
205
 
161
206
  function appendTranscript(input: ChildPiRunInput, line: string): void {
@@ -326,7 +371,7 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
326
371
  if (mock === "retryable-failure") return { exitCode: 1, stdout: "", stderr: "rate limit: mock failure" };
327
372
  return { exitCode: 1, stdout: "", stderr: `mock failure: ${mock}` };
328
373
  }
329
- const built = buildPiWorkerArgs({ task: input.task, agent: input.agent, model: input.model, sessionEnabled: false, maxDepth: input.maxDepth, skillPaths: input.skillPaths });
374
+ const built = buildPiWorkerArgs({ task: input.task, agent: input.agent, model: input.model, sessionEnabled: true, maxDepth: input.maxDepth, skillPaths: input.skillPaths });
330
375
  const spawnSpec = getPiSpawnCommand(built.args);
331
376
  try {
332
377
  return await new Promise<ChildPiRunResult>((resolve) => {
@@ -355,12 +400,20 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
355
400
  let abortRequested = input.signal?.aborted === true;
356
401
  let hardKilled = false;
357
402
  const cleanupErrors: string[] = [];
403
+ let turnCount = 0;
404
+ let softLimitReached = false;
405
+ const maxTurns = input.maxTurns;
406
+ const graceTurns = input.graceTurns;
407
+ let abortDueToParentSignal = false;
408
+ input.signal?.addEventListener("abort", () => { abortDueToParentSignal = true; }, { once: true });
358
409
  const restartNoResponseTimer = (): void => {
359
410
  if (responseTimeoutMs <= 0) return;
360
411
  if (noResponseTimer) clearTimeout(noResponseTimer);
361
412
  noResponseTimer = setTimeout(() => {
362
413
  responseTimeoutHit = true;
363
- input.onLifecycleEvent?.({ type: "response_timeout", pid: child.pid, error: `No output for ${responseTimeoutMs}ms`, ts: new Date().toISOString() });
414
+ // Capture stderr at timeout moment for debugging
415
+ const timeoutStderr = stderr.slice(-1024); // Last 1KB of stderr
416
+ input.onLifecycleEvent?.({ type: "response_timeout", pid: child.pid, error: `No output for ${responseTimeoutMs}ms`, ts: new Date().toISOString(), stderr: timeoutStderr || undefined });
364
417
  killProcessTree(child.pid, child);
365
418
  try {
366
419
  child.kill(process.platform === "win32" ? undefined : "SIGTERM");
@@ -384,6 +437,21 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
384
437
  },
385
438
  onJsonEvent: (event) => {
386
439
  restartNoResponseTimer();
440
+ // Turn-count-based steering: soft limit steer + hard abort after graceTurns
441
+ if (event && typeof event === "object" && !Array.isArray(event)) {
442
+ const obj = event as Record<string, unknown>;
443
+ if (obj.type === "turn_end") {
444
+ turnCount += 1;
445
+ if (maxTurns !== undefined && !softLimitReached && turnCount >= maxTurns) {
446
+ softLimitReached = true;
447
+ // Inject steer via stdin to tell child to wrap up
448
+ child.stdin?.write(JSON.stringify({ type: "steer", message: "You have reached your turn limit. Wrap up immediately — provide your final answer now." }) + "\n");
449
+ } else if (maxTurns !== undefined && softLimitReached && turnCount >= maxTurns + (graceTurns ?? 5)) {
450
+ // Hard abort — terminate after grace turns
451
+ try { child.kill(process.platform === "win32" ? undefined : "SIGTERM"); } catch { /* best-effort */ }
452
+ }
453
+ }
454
+ }
387
455
  input.onJsonEvent?.(event);
388
456
  if (!isFinalAssistantEvent(event) || childExited || settled || finalDrainTimer) return;
389
457
  finalDrainTimer = setTimeout(() => {
@@ -440,7 +508,12 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
440
508
  } catch (error) {
441
509
  cleanupErrors.push(error instanceof Error ? error.message : String(error));
442
510
  }
443
- resolve({ ...result, exitStatus: result.exitStatus ?? { exitCode: result.exitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
511
+ // Catch all errors from settle to prevent unhandled rejection from propagating
512
+ try {
513
+ resolve({ ...result, exitStatus: result.exitStatus ?? { exitCode: result.exitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
514
+ } catch (resolveError) {
515
+ logInternalError("child-pi.settle-resolve", resolveError, `result=${JSON.stringify({ exitCode: result.exitCode })}`);
516
+ }
444
517
  };
445
518
 
446
519
  const abort = (): void => {
@@ -485,7 +558,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
485
558
  restartNoResponseTimer();
486
559
  const text = chunk.toString("utf-8");
487
560
  backpressureBytes += text.length;
488
- lineObserver.observe(text);
561
+ try {
562
+ lineObserver.observe(text);
563
+ } catch (err) {
564
+ logInternalError("child-pi.line-observer-observe", err, `text=${text.slice(0, 100)}`);
565
+ }
489
566
  if (backpressureBytes > BACKPRESSURE_HIGH && child.stdout && !child.stdout.isPaused()) {
490
567
  try { child.stdout.pause(); } catch { /* ignore */ }
491
568
  const timer = setTimeout(releaseBackpressure, 50);
@@ -497,7 +574,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
497
574
  stderr = appendBoundedTail(stderr, chunk.toString("utf-8"));
498
575
  });
499
576
  child.on("error", (error) => {
500
- input.onLifecycleEvent?.({ type: "spawn_error", pid: child.pid, error: error.message, ts: new Date().toISOString() });
577
+ try {
578
+ input.onLifecycleEvent?.({ type: "spawn_error", pid: child.pid, error: error.message, ts: new Date().toISOString() });
579
+ } catch (err) {
580
+ logInternalError("child-pi.on-lifecycle-event", err, `event=error, pid=${child.pid}`);
581
+ }
501
582
  settle({ exitCode: null, stdout, stderr, error: error.message });
502
583
  });
503
584
  child.on("exit", (code) => {
@@ -505,7 +586,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
505
586
  activeChildProcesses.delete(child.pid);
506
587
  clearHardKillTimer(child.pid);
507
588
  }
508
- input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString() });
589
+ try {
590
+ input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString() });
591
+ } catch (err) {
592
+ logInternalError("child-pi.on-lifecycle-event", err, `event=exit, pid=${child.pid}`);
593
+ }
509
594
  childExited = true;
510
595
  clearNoResponseTimer();
511
596
  clearFinalDrainTimers();
@@ -521,14 +606,22 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
521
606
  activeChildProcesses.delete(child.pid);
522
607
  clearHardKillTimer(child.pid);
523
608
  }
524
- input.onLifecycleEvent?.({ type: "close", pid: child.pid, exitCode, ts: new Date().toISOString() });
525
- const timeoutError = responseTimeoutHit && !stderr.trim() ? { error: `Child Pi produced no new output for ${responseTimeoutMs}ms; process was terminated as unresponsive.` } : undefined;
526
- const finalExitCode = forcedFinalDrain && !timeoutError ? 0 : exitCode;
527
- // A final assistant event is the child Pi contract for "the worker produced its answer".
528
- // Some Pi processes can linger during post-final cleanup/stdio shutdown; finalDrain terminates
529
- // that lingering process so the parent can continue, but it must not turn a completed
530
- // subagent answer into a failed task. Real pre-final response timeouts still report errors.
531
- settle({ exitCode: finalExitCode, stdout, stderr, ...(timeoutError ? { error: timeoutError.error } : {}), exitStatus: { exitCode: finalExitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
609
+ try {
610
+ input.onLifecycleEvent?.({ type: "close", pid: child.pid, exitCode, ts: new Date().toISOString() });
611
+ } catch (err) {
612
+ logInternalError("child-pi.on-lifecycle-event", err, `event=close, pid=${child.pid}`);
613
+ }
614
+ const timeoutError = responseTimeoutHit && !stderr.trim() ? { error: `Child Pi produced no new output for ${responseTimeoutMs}ms; process was terminated as unresponsive.` } : responseTimeoutHit && stderr.trim() ? { error: `Child Pi timed out after ${responseTimeoutMs}ms with stderr: ${stderr.slice(-500)}` } : undefined;
615
+ // M6 fix: log when forced final drain converts non-zero exit to 0.
616
+ // This is expected in normal operation (child finished cleanly but linger was killed),
617
+ // but the telemetry helps detect regressions where crashes are hidden.
618
+ if (forcedFinalDrain && !timeoutError && exitCode !== 0) {
619
+ logInternalError("child-pi.final-drain-zero-exit", new Error(`Child exit code overridden to 0 after forced final drain (original=${exitCode})`), `pid=${child.pid}, finalDrainMs=${finalDrainMs}`);
620
+ }
621
+ const finalExitCode = forcedFinalDrain && !timeoutError ? 0 : exitCode;
622
+ const wasGraceAborted = softLimitReached && turnCount >= (maxTurns ?? 0) + (graceTurns ?? 5);
623
+ const wasParentAborted = abortDueToParentSignal && !wasGraceAborted;
624
+ settle({ exitCode: finalExitCode, stdout, stderr, ...(timeoutError ? { error: timeoutError.error } : {}), aborted: wasGraceAborted || wasParentAborted, steered: softLimitReached && !wasGraceAborted, exitStatus: { exitCode: finalExitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
532
625
  });
533
626
  });
534
627
  } finally {
@@ -88,7 +88,7 @@ function findBraceRanges(lines: string[], openPattern: RegExp, minBody: number):
88
88
  // ── TypeScript / JavaScript ──
89
89
 
90
90
  const TS_FN_SIG =
91
- /^\s*(export\s+)?(async\s+)?function\s|^\s*(export\s+)?(static\s+|get\s+|set\s+|private\s+|public\s+|protected\s+|readonly\s+)*\*?\s*\w+\s*[\(<]/;
91
+ /^\s*(export\s+)?(async\s+)?function\s|^\s*(export\s+)?(static\s+|get\s+|set\s+|private\s+|public\s+|protected\s+|readonly\s+)*\*?\s*\w+\s*[(<]/;
92
92
  const TS_CLASS_SIG = /^\s*(export\s+)?(default\s+)?(abstract\s+)?class\s/;
93
93
  const TS_STRUCT_SIG = /^\s*(export\s+)?(default\s+)?(const|let|var)\s+\w+\s*=\s*(\[[\s]*$|\{[\s]*$)/;
94
94
 
@@ -24,7 +24,7 @@ export interface RecoveryPlan {
24
24
  }
25
25
 
26
26
  function isTerminalTask(task: TeamTaskState): boolean {
27
- return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped";
27
+ return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped" || task.status === "needs_attention";
28
28
  }
29
29
 
30
30
  function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
@@ -2,7 +2,7 @@ import type { TeamTaskStatus } from "../state/contracts.ts";
2
2
  import type { CrewActivityState, ModelRoutingState, UsageState } from "../state/types.ts";
3
3
 
4
4
  export type CrewRuntimeKind = "scaffold" | "child-process" | "live-session";
5
- export type CrewAgentStatus = "queued" | "running" | "waiting" | "completed" | "failed" | "cancelled" | "stopped";
5
+ export type CrewAgentStatus = "queued" | "running" | "waiting" | "completed" | "failed" | "cancelled" | "stopped" | "needs_attention";
6
6
 
7
7
  export interface CrewAgentRecentTool {
8
8
  tool: string;
@@ -56,5 +56,6 @@ export function taskStatusToAgentStatus(status: TeamTaskStatus): CrewAgentStatus
56
56
  if (status === "cancelled" || status === "skipped") return "cancelled";
57
57
  if (status === "running") return "running";
58
58
  if (status === "waiting") return "waiting";
59
+ if (status === "needs_attention") return "needs_attention";
59
60
  return "queued";
60
61
  }
@@ -1,3 +1,4 @@
1
+ import * as fs from "node:fs";
1
2
  import type { NotificationDescriptor } from "../extension/notification-router.ts";
2
3
  import type { MetricRegistry } from "../observability/metric-registry.ts";
3
4
  import { appendEvent } from "../state/event-log.ts";
@@ -84,6 +85,9 @@ export class HeartbeatWatcher {
84
85
 
85
86
  for (const run of this.opts.manifestCache.list(50)) {
86
87
  if (run.status !== "running") continue;
88
+ // Bug #5 fix: if stateRoot doesn't exist, the run was pruned — skip it silently.
89
+ // This prevents stale "heartbeat dead" notifications for runs that no longer exist.
90
+ if (!fs.existsSync(run.stateRoot)) continue;
87
91
  const loaded = loadRunManifestById(this.opts.cwd, run.runId);
88
92
  if (!loaded) continue;
89
93
  for (const task of loaded.tasks) {
@@ -200,7 +200,7 @@ export function evictStaleLiveAgentHandles(now = Date.now()): number {
200
200
  }
201
201
 
202
202
  export function listLiveAgents(): LiveAgentHandle[] {
203
- return [...liveAgents.values()].sort((a, b) => b.updatedAt.localeCompare(a.updatedAt));
203
+ return [...liveAgents.values()].sort((a, b) => (b.updatedAt ?? "").localeCompare(a.updatedAt ?? ""));
204
204
  }
205
205
 
206
206
  export function listActiveLiveAgents(): LiveAgentHandle[] {
@@ -267,8 +267,9 @@ function liveSystemPrompt(input: LiveSessionSpawnInput): string {
267
267
  function filterActiveTools(session: LiveSessionLike, agent: AgentConfig): void {
268
268
  if (typeof session.getActiveToolNames !== "function" || typeof session.setActiveToolsByName !== "function") return;
269
269
  const recursiveTools = new Set(["team", "Team", "Agent", "get_subagent_result", "steer_subagent"]);
270
+ const disallowed = agent.disallowedTools?.length ? new Set(agent.disallowedTools) : undefined;
270
271
  const allowed = agent.tools?.length ? new Set(agent.tools) : undefined;
271
- const active = session.getActiveToolNames().filter((name) => !recursiveTools.has(name) && (!allowed || allowed.has(name)));
272
+ const active = session.getActiveToolNames().filter((name) => !recursiveTools.has(name) && (!disallowed || !disallowed.has(name)) && (!allowed || allowed.has(name)));
272
273
  session.setActiveToolsByName(active);
273
274
  }
274
275