pi-crew 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -10
- package/README.md +4 -2
- package/docs/PROJECT_REVIEW.md +271 -0
- package/docs/PROJECT_REVIEW_FIXES.md +343 -0
- package/docs/PROJECT_REVIEW_ROUND4.md +156 -0
- package/docs/PROJECT_REVIEW_ROUND5.md +86 -0
- package/docs/fixes/BATCH_A_H1_H2.md +86 -0
- package/docs/fixes/bug-006-foreground-cancel-concurrent.md +78 -0
- package/docs/fixes/bug-007-async-notifier-stale-ctx.md +112 -0
- package/docs/fixes/bug-008-child-process-silent-timeout.md +100 -0
- package/docs/fixes/bug-009-executor-yield-limit-needs-attention.md +75 -0
- package/docs/fixes/bug-010-child-process-api-key-filtered.md +109 -0
- package/docs/fixes/bug-011-spawn-pi-enoent.md +92 -0
- package/docs/fixes/bug-012-essential-env-stripped.md +89 -0
- package/docs/fixes/bug-013-background-runner-death.md +84 -0
- package/docs/fixes/bug-014-infinite-retry-loop-needs-attention.md +82 -0
- package/docs/fixes/bug-015-background-runner-sigterm.md +65 -0
- package/docs/fixes/bug-017-background-runner-session-shutdown.md +66 -0
- package/docs/fixes/bug-017-background-runner-sigkill-double-fork.md +28 -0
- package/docs/fixes/bug-018-child-pi-worker-stdin-hang.md +61 -0
- package/docs/fixes/bug-019-phantom-runs-temp-workspace.md +52 -0
- package/docs/pi-crew-bugs.md +954 -0
- package/docs/pi-crew-investigation-report.md +411 -0
- package/docs/pi-crew-test-final.md +120 -0
- package/docs/pi-crew-test-results.md +260 -0
- package/docs/pi-crew-test-round2.md +136 -0
- package/docs/pi-crew-test-round4.md +100 -0
- package/docs/pi-crew-test-round5.md +70 -0
- package/docs/pi-crew-test-round6.md +110 -0
- package/docs/usage.md +14 -0
- package/package.json +7 -5
- package/src/adapters/export-util.ts +12 -6
- package/src/agents/agent-config.ts +2 -0
- package/src/config/defaults.ts +1 -1
- package/src/config/markers.ts +22 -17
- package/src/config/resilient-parser.ts +1 -1
- package/src/extension/async-notifier.ts +4 -2
- package/src/extension/management.ts +52 -0
- package/src/extension/register.ts +47 -10
- package/src/extension/run-index.ts +20 -2
- package/src/extension/run-maintenance.ts +2 -2
- package/src/extension/team-tool/parallel-dispatch.ts +1 -1
- package/src/extension/team-tool/run.ts +3 -6
- package/src/extension/team-tool.ts +67 -11
- package/src/observability/event-to-metric.ts +2 -1
- package/src/runtime/async-runner.ts +42 -34
- package/src/runtime/background-runner.ts +165 -7
- package/src/runtime/child-pi.ts +111 -18
- package/src/runtime/code-summary.ts +1 -1
- package/src/runtime/crash-recovery.ts +1 -1
- package/src/runtime/crew-agent-runtime.ts +2 -1
- package/src/runtime/heartbeat-watcher.ts +4 -0
- package/src/runtime/live-agent-manager.ts +1 -1
- package/src/runtime/live-session-runtime.ts +2 -1
- package/src/runtime/manifest-cache.ts +2 -2
- package/src/runtime/model-fallback.ts +2 -1
- package/src/runtime/phase-progress.ts +1 -1
- package/src/runtime/pi-args.ts +3 -1
- package/src/runtime/pi-spawn.ts +6 -0
- package/src/runtime/prose-compressor.ts +1 -1
- package/src/runtime/result-extractor.ts +0 -1
- package/src/runtime/retry-executor.ts +1 -1
- package/src/runtime/runtime-resolver.ts +1 -1
- package/src/runtime/skill-instructions.ts +0 -1
- package/src/runtime/stale-reconciler.ts +30 -3
- package/src/runtime/subagent-manager.ts +2 -0
- package/src/runtime/task-display.ts +1 -1
- package/src/runtime/task-graph-scheduler.ts +1 -1
- package/src/runtime/task-runner/tail-read.ts +26 -0
- package/src/runtime/task-runner.ts +1007 -383
- package/src/runtime/team-runner.ts +9 -5
- package/src/runtime/worker-startup.ts +3 -1
- package/src/schema/team-tool-schema.ts +2 -1
- package/src/state/active-run-registry.ts +8 -2
- package/src/state/atomic-write.ts +17 -0
- package/src/state/contracts.ts +5 -2
- package/src/state/event-log-rotation.ts +118 -31
- package/src/state/event-log.ts +33 -5
- package/src/state/event-reconstructor.ts +4 -2
- package/src/state/mailbox.ts +5 -1
- package/src/state/schedule.ts +146 -0
- package/src/state/types.ts +40 -0
- package/src/state/usage.ts +20 -0
- package/src/ui/crew-widget.ts +2 -2
- package/src/ui/run-event-bus.ts +1 -1
- package/src/ui/run-snapshot-cache.ts +2 -1
- package/src/ui/snapshot-types.ts +1 -0
- package/src/utils/gh-protocol.ts +2 -2
- package/src/utils/names.ts +1 -1
- package/src/utils/sse-parser.ts +0 -2
- package/src/worktree/branch-freshness.ts +1 -1
- package/src/worktree/cleanup.ts +54 -14
- package/src/worktree/worktree-manager.ts +19 -9
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { spawn
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import * as fs from "node:fs";
|
|
4
4
|
import * as path from "node:path";
|
|
@@ -102,14 +102,20 @@ export function getBackgroundRunnerCommand(
|
|
|
102
102
|
): { args: string[]; loader: "jiti" | "strip-types" } {
|
|
103
103
|
const loader = normalizeLoaderInput(loaderInput);
|
|
104
104
|
if (!loader) throw new Error(buildLoaderUnavailableMessage(packageRootFromRuntime()));
|
|
105
|
+
// Limit V8 heap to 512MB for the background runner to avoid triggering the
|
|
106
|
+
// Linux OOM killer. The runner itself is lightweight — it delegates work to
|
|
107
|
+
// child Pi processes — so 512MB is generous. Without this limit, Node.js
|
|
108
|
+
// defaults to ~1.5GB on 64-bit systems, which combined with jiti compilation
|
|
109
|
+
// and child processes can exhaust system memory.
|
|
110
|
+
const memoryLimit = "--max-old-space-size=512";
|
|
105
111
|
if (loader.kind === "jiti") {
|
|
106
112
|
return {
|
|
107
|
-
args: ["--import", pathToFileURL(loader.path).href, runnerPath, "--cwd", cwd, "--run-id", runId],
|
|
113
|
+
args: [memoryLimit, "--trace-uncaught", "--import", pathToFileURL(loader.path).href, runnerPath, "--cwd", cwd, "--run-id", runId],
|
|
108
114
|
loader: "jiti",
|
|
109
115
|
};
|
|
110
116
|
}
|
|
111
117
|
return {
|
|
112
|
-
args: ["--experimental-strip-types", runnerPath, "--cwd", cwd, "--run-id", runId],
|
|
118
|
+
args: [memoryLimit, "--experimental-strip-types", runnerPath, "--cwd", cwd, "--run-id", runId],
|
|
113
119
|
loader: "strip-types",
|
|
114
120
|
};
|
|
115
121
|
}
|
|
@@ -119,42 +125,44 @@ export interface SpawnBackgroundTeamRunResult {
|
|
|
119
125
|
logPath: string;
|
|
120
126
|
}
|
|
121
127
|
|
|
122
|
-
export function
|
|
128
|
+
export async function spawnBackgroundTeamRun(manifest: TeamRunManifest): Promise<SpawnBackgroundTeamRunResult> {
|
|
129
|
+
const runnerPath = path.join(path.dirname(fileURLToPath(import.meta.url)), "background-runner.ts");
|
|
130
|
+
const logPath = path.join(manifest.stateRoot, "background.log");
|
|
131
|
+
fs.mkdirSync(manifest.stateRoot, { recursive: true });
|
|
132
|
+
|
|
123
133
|
// NOTE: Do NOT set PI_CREW_PARENT_PID for the background runner.
|
|
124
|
-
// The background runner is a top-level worker spawned by the team tool.
|
|
125
|
-
// When the team tool finishes, its process exits, and the background runner
|
|
126
|
-
// would incorrectly detect a "dead parent" and self-terminate.
|
|
127
|
-
// Child workers spawned BY the background runner will have the background
|
|
128
|
-
// runner as their parent, so they correctly die when the runner exits.
|
|
129
134
|
const { PI_CREW_PARENT_PID: _, ...envWithoutParentPid } = process.env;
|
|
130
|
-
|
|
135
|
+
|
|
136
|
+
const loader = resolveTypeScriptLoader();
|
|
137
|
+
if (!loader) {
|
|
138
|
+
const message = buildLoaderUnavailableMessage(packageRootFromRuntime());
|
|
139
|
+
appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
|
|
140
|
+
throw new Error(message);
|
|
141
|
+
}
|
|
142
|
+
const command = getBackgroundRunnerCommand(runnerPath, manifest.cwd, manifest.runId, loader);
|
|
143
|
+
fs.appendFileSync(logPath, `[pi-crew] background loader=${command.loader}\n`, "utf-8");
|
|
144
|
+
|
|
145
|
+
// Spawn the background runner as a fully detached process with its own session.
|
|
146
|
+
// BUG #17 FIX: setsid:true + detached:true creates a process that:
|
|
147
|
+
// 1. Has its own session (SID = PID) — immune to terminal/SIGTERM signals
|
|
148
|
+
// 2. Is detached (unref'd) — parent exit doesn't affect it
|
|
149
|
+
// 3. Has its own process group (PGID = PID) — process group kills don't reach it
|
|
150
|
+
//
|
|
151
|
+
// IMPORTANT: session_shutdown handlers must NOT kill async runners.
|
|
152
|
+
// See register.ts cleanupRuntime — the kill loop was commented out.
|
|
153
|
+
const child = spawn(process.execPath, command.args, {
|
|
131
154
|
cwd: manifest.cwd,
|
|
132
155
|
detached: true,
|
|
133
|
-
|
|
156
|
+
setsid: true as any,
|
|
157
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
134
158
|
env: envWithoutParentPid,
|
|
135
159
|
windowsHide: true,
|
|
136
|
-
};
|
|
137
|
-
|
|
160
|
+
} as any) as any;
|
|
161
|
+
child.on("error", (error: Error) => {
|
|
162
|
+
console.error(`[pi-crew] async spawn failed: ${error.message}`);
|
|
163
|
+
});
|
|
164
|
+
child.unref();
|
|
138
165
|
|
|
139
|
-
|
|
140
|
-
const runnerPath = path.join(path.dirname(fileURLToPath(import.meta.url)), "background-runner.ts");
|
|
141
|
-
const logPath = path.join(manifest.stateRoot, "background.log");
|
|
142
|
-
fs.mkdirSync(manifest.stateRoot, { recursive: true });
|
|
143
|
-
const logFd = fs.openSync(logPath, "a");
|
|
144
|
-
try {
|
|
145
|
-
const loader = resolveTypeScriptLoader();
|
|
146
|
-
if (!loader) {
|
|
147
|
-
const message = buildLoaderUnavailableMessage(packageRootFromRuntime());
|
|
148
|
-
appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
|
|
149
|
-
throw new Error(message);
|
|
150
|
-
}
|
|
151
|
-
const command = getBackgroundRunnerCommand(runnerPath, manifest.cwd, manifest.runId, loader);
|
|
152
|
-
fs.appendFileSync(logPath, `[pi-crew] background loader=${command.loader}\n`, "utf-8");
|
|
153
|
-
const child = spawn(process.execPath, command.args, buildBackgroundSpawnOptions(manifest, logFd));
|
|
154
|
-
child.unref();
|
|
155
|
-
|
|
156
|
-
return { pid: child.pid, logPath };
|
|
157
|
-
} finally {
|
|
158
|
-
fs.closeSync(logFd);
|
|
159
|
-
}
|
|
166
|
+
return { pid: child.pid, logPath };
|
|
160
167
|
}
|
|
168
|
+
|
|
@@ -24,6 +24,34 @@ import { expandParallelResearchWorkflow } from "./parallel-research.ts";
|
|
|
24
24
|
import { writeAsyncStartMarker } from "./async-marker.ts";
|
|
25
25
|
import { startParentGuard, stopParentGuard } from "./parent-guard.ts";
|
|
26
26
|
|
|
27
|
+
/**
|
|
28
|
+
* Heartbeat mechanism: periodically write a heartbeat file so the stale reconciler
|
|
29
|
+
* can distinguish "process died" from "process still alive but quiet".
|
|
30
|
+
* Without this, the reconciler relies solely on process.kill(pid, 0) which can
|
|
31
|
+
* false-positive when a process is SIGKILLed and the PID hasn't been recycled yet.
|
|
32
|
+
*/
|
|
33
|
+
function startHeartbeat(stateRoot: string, eventsPath: string, runId: string): () => void {
|
|
34
|
+
const heartbeatPath = path.join(stateRoot, "heartbeat.json");
|
|
35
|
+
const writeHeartbeat = (): void => {
|
|
36
|
+
try {
|
|
37
|
+
const mem = process.memoryUsage();
|
|
38
|
+
fs.writeFileSync(heartbeatPath, JSON.stringify({
|
|
39
|
+
pid: process.pid,
|
|
40
|
+
at: Date.now(),
|
|
41
|
+
runId,
|
|
42
|
+
memory: { heapUsedMb: Math.round(mem.heapUsed / 1024 / 1024), rssMb: Math.round(mem.rss / 1024 / 1024) },
|
|
43
|
+
}), "utf-8");
|
|
44
|
+
} catch {
|
|
45
|
+
/* ignore — best-effort */
|
|
46
|
+
}
|
|
47
|
+
};
|
|
48
|
+
// Write immediately so the stale reconciler can use heartbeat age as liveness evidence.
|
|
49
|
+
writeHeartbeat();
|
|
50
|
+
const interval = setInterval(writeHeartbeat, 15_000);
|
|
51
|
+
interval.unref();
|
|
52
|
+
return () => clearInterval(interval);
|
|
53
|
+
}
|
|
54
|
+
|
|
27
55
|
/**
|
|
28
56
|
* Remove macOS malloc-stack-logging vars that get inherited by child shells.
|
|
29
57
|
* Without this, every subprocess prints "MallocStackLogging: can't turn off..." to stderr.
|
|
@@ -69,28 +97,111 @@ function setupUnhandledRejectionGuard(state: { cwd?: string; runId?: string; eve
|
|
|
69
97
|
process.on("unhandledRejection", (reason, promise) => {
|
|
70
98
|
const message = reason instanceof Error ? reason.message : String(reason);
|
|
71
99
|
console.error("[background-runner] UNHANDLED REJECTION:", reason);
|
|
100
|
+
console.error("[background-runner] Stack:", reason instanceof Error ? reason.stack : "N/A");
|
|
72
101
|
try {
|
|
73
|
-
// Try to write async.failed event if we have the necessary state
|
|
74
102
|
if (state.eventsPath && state.runId) {
|
|
75
103
|
appendEvent(state.eventsPath, {
|
|
76
104
|
type: "async.failed",
|
|
77
105
|
runId: state.runId,
|
|
78
106
|
message: `Unhandled rejection: ${message}`,
|
|
79
|
-
data: { reason: String(reason), handled: false },
|
|
107
|
+
data: { reason: String(reason), stack: reason instanceof Error ? reason.stack : undefined, handled: false },
|
|
80
108
|
});
|
|
81
109
|
}
|
|
82
110
|
} catch (appendErr) {
|
|
83
111
|
console.error("[background-runner] Failed to write async.failed event:", appendErr);
|
|
84
112
|
}
|
|
113
|
+
// BUG #17 FIX: Do NOT call process.exit() here. Previously, unhandled
|
|
114
|
+
// rejection from child Pi workers would kill the entire background runner.
|
|
115
|
+
// Instead, set exitCode and let the run complete normally.
|
|
85
116
|
process.exitCode = 1;
|
|
86
|
-
// Give async operations a moment to flush before exit
|
|
87
|
-
setTimeout(() => process.exit(1), 100);
|
|
88
117
|
});
|
|
89
118
|
}
|
|
90
119
|
|
|
91
120
|
async function main(): Promise<void> {
|
|
121
|
+
// Redirect console to background.log since stdio is "ignore" in detached mode.
|
|
122
|
+
// Must be BEFORE any console.log/console.error calls.
|
|
123
|
+
const _cwd = argValue("--cwd");
|
|
124
|
+
const _runId = argValue("--run-id");
|
|
125
|
+
if (_cwd && _runId) {
|
|
126
|
+
try {
|
|
127
|
+
const logPath = path.join(_cwd, ".crew/state/runs", _runId, "background.log");
|
|
128
|
+
const logFd = fs.openSync(logPath, "a");
|
|
129
|
+
const origWrite = (prefix: string) => (data: any, ...args: any[]) => {
|
|
130
|
+
const msg = [data, ...args].map(String).join(" ") + "\n";
|
|
131
|
+
fs.writeSync(logFd, msg);
|
|
132
|
+
};
|
|
133
|
+
console.log = origWrite("OUT");
|
|
134
|
+
console.error = origWrite("ERR");
|
|
135
|
+
} catch { /* best-effort */ }
|
|
136
|
+
}
|
|
137
|
+
|
|
92
138
|
// Scrub macOS malloc vars BEFORE anything else — must be clean for all child processes
|
|
93
139
|
scrubProcessEnv();
|
|
140
|
+
// Install signal handlers EARLY — log events before exiting so we can distinguish
|
|
141
|
+
// OOM/SIGKILL (no event) from SIGTERM/SIGINT (event written).
|
|
142
|
+
const signalLog = (sig: string): void => {
|
|
143
|
+
const cwd = argValue("--cwd");
|
|
144
|
+
const runId = argValue("--run-id");
|
|
145
|
+
if (cwd && runId) {
|
|
146
|
+
const loaded = loadRunManifestById(cwd, runId);
|
|
147
|
+
if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.failed", runId, message: `Background runner received ${sig} — exiting.`, data: { signal: sig, pid: process.pid } });
|
|
148
|
+
}
|
|
149
|
+
};
|
|
150
|
+
// BUG #17 DIAGNOSTIC: Write exit code to file for debugging.
|
|
151
|
+
process.on("exit", (code) => {
|
|
152
|
+
try {
|
|
153
|
+
require("node:fs").appendFileSync(
|
|
154
|
+
manifest.stateRoot + '/exit-code.txt',
|
|
155
|
+
`${new Date().toISOString()} exit_code=${code} pid=${process.pid}\n`
|
|
156
|
+
);
|
|
157
|
+
} catch {}
|
|
158
|
+
});
|
|
159
|
+
|
|
160
|
+
process.on("SIGTERM", () => {
|
|
161
|
+
// BUG #17 FIX: Ignore SIGTERM.
|
|
162
|
+
// IMPORTANT: Perform real I/O here to flush io_uring state after EINTR.
|
|
163
|
+
// Without I/O, io_uring can enter corrupted state and cause silent crash.
|
|
164
|
+
const cwd = argValue("--cwd");
|
|
165
|
+
const runId = argValue("--run-id");
|
|
166
|
+
if (cwd && runId) {
|
|
167
|
+
try {
|
|
168
|
+
const loaded = loadRunManifestById(cwd, runId);
|
|
169
|
+
if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.sigterm_ignored", runId, message: `SIGTERM ignored pid=${process.pid}`, data: { pid: process.pid, ppid: process.ppid } });
|
|
170
|
+
} catch { /* best-effort */ }
|
|
171
|
+
}
|
|
172
|
+
});
|
|
173
|
+
process.on("SIGINT", () => { signalLog("SIGINT"); process.exit(130); });
|
|
174
|
+
// BUG #17: Catch ALL signals to identify what kills the background runner
|
|
175
|
+
for (const sig of ["SIGHUP", "SIGUSR1", "SIGUSR2", "SIGPIPE", "SIGALRM", "SIGPROF", "SIGIO", "SIGPWR", "SIGSYS", "SIGURG", "SIGWINCH", "SIGCONT", "SIGTSTP", "SIGTTIN", "SIGTTOU", "SIGVTALRM", "SIGXCPU", "SIGXFSZ"] as const) {
|
|
176
|
+
try {
|
|
177
|
+
process.on(sig, () => {
|
|
178
|
+
signalLog(sig);
|
|
179
|
+
try {
|
|
180
|
+
const loaded = loadRunManifestById(cwd!, runId!);
|
|
181
|
+
if (loaded) appendEvent(loaded.manifest.eventsPath, { type: "async.signal", runId: runId!, message: `Background runner received ${sig}`, data: { signal: sig, pid: process.pid } });
|
|
182
|
+
} catch { /* best-effort */ }
|
|
183
|
+
});
|
|
184
|
+
} catch { /* some signals not supported on this platform */ }
|
|
185
|
+
}
|
|
186
|
+
// Hook Node.js abort — if process.exit is called with code 1 (uncaught exception, assert failure)
|
|
187
|
+
// we log it before exiting so it appears in background.log
|
|
188
|
+
const origExit = process.exit.bind(process);
|
|
189
|
+
// Intercept all exit(code) calls to log them as async.exit events before exiting.
|
|
190
|
+
// This surfaces uncaught exceptions / early exits that would otherwise vanish silently.
|
|
191
|
+
process.exit = ((code?: number | string): never => {
|
|
192
|
+
const cwd2 = argValue("--cwd");
|
|
193
|
+
const runId2 = argValue("--run-id");
|
|
194
|
+
const codeStr = code === undefined ? "<none>" : String(code);
|
|
195
|
+
if (cwd2 && runId2) {
|
|
196
|
+
try {
|
|
197
|
+
const loaded = loadRunManifestById(cwd2, runId2);
|
|
198
|
+
if (loaded) {
|
|
199
|
+
appendEvent(loaded.manifest.eventsPath, { type: "async.exit", runId: runId2, message: `Background runner exit(${codeStr}) pid=${process.pid}`, data: { code, pid: process.pid } });
|
|
200
|
+
}
|
|
201
|
+
} catch { /* best-effort */ }
|
|
202
|
+
}
|
|
203
|
+
return origExit(code);
|
|
204
|
+
}) as typeof process.exit;
|
|
94
205
|
|
|
95
206
|
// Start parent guard FIRST — if parent is already dead, exit immediately
|
|
96
207
|
const parentPid = Number(process.env.PI_CREW_PARENT_PID);
|
|
@@ -100,27 +211,52 @@ async function main(): Promise<void> {
|
|
|
100
211
|
const runId = argValue("--run-id");
|
|
101
212
|
if (!cwd || !runId) throw new Error("Usage: background-runner.ts --cwd <cwd> --run-id <runId>");
|
|
102
213
|
|
|
214
|
+
// Log PGID and SID for debugging process group isolation
|
|
215
|
+
try {
|
|
216
|
+
const stat = fs.readFileSync("/proc/self/stat", "utf8").split(" ");
|
|
217
|
+
console.log(`[background-runner] DEBUG: pid=${process.pid} ppid=${process.ppid} pgid=${stat[4]} sid=${stat[5]} cwd=${cwd} runId=${runId}`);
|
|
218
|
+
} catch {
|
|
219
|
+
console.log(`[background-runner] DEBUG: pid=${process.pid} ppid=${process.ppid} cwd=${cwd} runId=${runId}`);
|
|
220
|
+
}
|
|
221
|
+
|
|
103
222
|
const loaded = loadRunManifestById(cwd, runId);
|
|
104
223
|
if (!loaded) throw new Error(`Run '${runId}' not found.`);
|
|
105
224
|
let { manifest, tasks } = loaded;
|
|
106
225
|
|
|
226
|
+
console.log(`[background-runner] DEBUG: manifest loaded, eventsPath=${manifest.eventsPath}`);
|
|
227
|
+
|
|
107
228
|
// Setup unhandled rejection guard EARLY — must be before any async operations
|
|
108
229
|
// that might produce unhandled rejections during cleanup.
|
|
109
230
|
const rejectionGuardState = { cwd, runId, eventsPath: loaded.manifest.eventsPath };
|
|
110
231
|
setupUnhandledRejectionGuard(rejectionGuardState);
|
|
111
232
|
|
|
112
233
|
appendEvent(manifest.eventsPath, { type: "async.started", runId: manifest.runId, data: { pid: process.pid } });
|
|
234
|
+
console.log(`[background-runner] DEBUG: async.started written, pid=${process.pid}`);
|
|
113
235
|
writeAsyncStartMarker(manifest, { pid: process.pid, startedAt: new Date().toISOString() });
|
|
236
|
+
const stopHeartbeat = startHeartbeat(manifest.stateRoot, manifest.eventsPath, manifest.runId);
|
|
114
237
|
const stopInterruptGuard = startInterruptGuard(manifest);
|
|
238
|
+
console.log(`[background-runner] DEBUG: heartbeat+interrupt guard started`);
|
|
239
|
+
// BUG #17: Keep-alive interval prevents event loop from exiting during
|
|
240
|
+
// jiti compilation. Pure empty interval (no I/O to avoid io_uring issues).
|
|
241
|
+
const keepAlive = setInterval(() => {}, 5000);
|
|
242
|
+
|
|
115
243
|
|
|
116
244
|
try {
|
|
245
|
+
console.log(`[background-runner] DEBUG: about to call discoverAgents`);
|
|
117
246
|
const agents = allAgents(discoverAgents(cwd));
|
|
247
|
+
console.log(`[background-runner] DEBUG: discoverAgents done, ${agents.length} agents`);
|
|
248
|
+
fs.fsyncSync(fs.openSync(manifest.eventsPath, "a")); // FORCE flush so we see this before death
|
|
249
|
+
console.log(`[background-runner] DEBUG: calling directTeamAndWorkflowFromRun`);
|
|
118
250
|
const direct = directTeamAndWorkflowFromRun(manifest, tasks, agents);
|
|
251
|
+
console.log(`[background-runner] DEBUG: direct done, finding team`);
|
|
119
252
|
const team = direct?.team ?? allTeams(discoverTeams(cwd)).find((candidate) => candidate.name === manifest.team);
|
|
120
253
|
if (!team) throw new Error(`Team '${manifest.team}' not found.`);
|
|
254
|
+
console.log(`[background-runner] DEBUG: team=${team.name}, finding workflow`);
|
|
121
255
|
const baseWorkflow = direct?.workflow ?? allWorkflows(discoverWorkflows(cwd)).find((candidate) => candidate.name === manifest.workflow);
|
|
122
256
|
if (!baseWorkflow) throw new Error(`Workflow '${manifest.workflow ?? ""}' not found.`);
|
|
257
|
+
console.log(`[background-runner] DEBUG: workflow=${baseWorkflow.name}`);
|
|
123
258
|
const workflow = expandParallelResearchWorkflow(baseWorkflow, cwd);
|
|
259
|
+
console.log(`[background-runner] DEBUG: loading config`);
|
|
124
260
|
const loadedConfig = loadConfig(cwd);
|
|
125
261
|
const runConfig = manifest.runConfig && typeof manifest.runConfig === "object" && !Array.isArray(manifest.runConfig) ? manifest.runConfig as typeof loadedConfig.config : loadedConfig.config;
|
|
126
262
|
const runtime = manifest.runtimeResolution ? { kind: manifest.runtimeResolution.kind, requestedMode: manifest.runtimeResolution.requestedMode, available: manifest.runtimeResolution.available, fallback: manifest.runtimeResolution.fallback, steer: manifest.runtimeResolution.kind === "live-session", resume: manifest.runtimeResolution.kind === "live-session", liveToolActivity: manifest.runtimeResolution.kind === "live-session", transcript: manifest.runtimeResolution.kind !== "scaffold", reason: manifest.runtimeResolution.reason, safety: manifest.runtimeResolution.safety } : await resolveCrewRuntime(runConfig);
|
|
@@ -130,12 +266,27 @@ async function main(): Promise<void> {
|
|
|
130
266
|
appendEvent(manifest.eventsPath, { type: "runtime.resolved", runId: manifest.runId, message: `Runtime resolved: ${runtime.kind} safety=${runtime.safety}`, data: { runtimeResolution, async: true } });
|
|
131
267
|
if (runtime.safety === "blocked") throw new Error(runtime.reason ?? "Child worker execution is disabled; refusing to create no-op scaffold subagents.");
|
|
132
268
|
const executeWorkers = runtime.kind !== "scaffold";
|
|
133
|
-
|
|
269
|
+
// Use ownerSessionId for workspaceId to ensure agents are only visible to the session that spawned them.
|
|
134
270
|
// manifest.cwd would cause cross-session visibility since all sessions share the same project directory.
|
|
135
|
-
|
|
271
|
+
// Mark this as background mode so task-runner writes events to background.log for debugging.
|
|
272
|
+
process.env.PI_CREW_BACKGROUND_MODE = "1";
|
|
273
|
+
// BUG #17: Keep-alive interval (NOT unref'd) prevents event loop from exiting
|
|
274
|
+
// during jiti compilation of team-runner.ts. Without this, the event loop
|
|
275
|
+
// can drain when import() blocks, causing the process to exit prematurely.
|
|
276
|
+
console.log(`[background-runner] DEBUG: calling executeTeamRun`);
|
|
277
|
+
let result;
|
|
278
|
+
try {
|
|
279
|
+
result = await executeTeamRun({ manifest, tasks, team, workflow, agents, executeWorkers, limits: runConfig.limits, runtime, runtimeConfig: runConfig.runtime, skillOverride: manifest.skillOverride, reliability: runConfig.reliability, workspaceId: manifest.ownerSessionId ?? manifest.cwd });
|
|
280
|
+
console.log(`[background-runner] DEBUG: executeTeamRun returned, status=${result.manifest.status}`);
|
|
281
|
+
} catch (execError) {
|
|
282
|
+
console.log(`[background-runner] DEBUG: executeTeamRun THREW: ${execError instanceof Error ? execError.message : String(execError)}`);
|
|
283
|
+
console.log(`[background-runner] DEBUG: stack: ${execError instanceof Error ? execError.stack : "N/A"}`);
|
|
284
|
+
throw execError;
|
|
285
|
+
}
|
|
136
286
|
manifest = result.manifest;
|
|
137
287
|
tasks = result.tasks;
|
|
138
288
|
appendEvent(manifest.eventsPath, { type: "async.completed", runId: manifest.runId, data: { status: manifest.status, tasks: tasks.length } });
|
|
289
|
+
console.log(`[background-runner] DEBUG: async.completed written, status=${manifest.status}`);
|
|
139
290
|
if (manifest.status === "failed" || manifest.status === "cancelled" || manifest.status === "blocked") process.exitCode = 1;
|
|
140
291
|
} catch (error) {
|
|
141
292
|
// Terminate live agents on failure too — agents are done when the run fails
|
|
@@ -151,10 +302,17 @@ async function main(): Promise<void> {
|
|
|
151
302
|
manifest = updateRunStatus(manifest, "failed", message);
|
|
152
303
|
appendEvent(manifest.eventsPath, { type: "async.failed", runId: manifest.runId, message });
|
|
153
304
|
process.exitCode = 1;
|
|
305
|
+
console.log(`[background-runner] DEBUG: catch block, error=${error instanceof Error ? error.message : String(error)}`);
|
|
154
306
|
} finally {
|
|
307
|
+
console.log(`[background-runner] DEBUG: finally block, exitCode=${process.exitCode}`);
|
|
155
308
|
stopInterruptGuard();
|
|
156
309
|
stopParentGuard();
|
|
310
|
+
stopHeartbeat();
|
|
311
|
+
clearInterval(keepAlive);
|
|
157
312
|
}
|
|
158
313
|
}
|
|
159
314
|
|
|
160
|
-
await main()
|
|
315
|
+
await main().catch((err) => {
|
|
316
|
+
console.error(`[background-runner] DEBUG: main() uncaught: ${err?.message ?? err}`);
|
|
317
|
+
process.exit(1);
|
|
318
|
+
});
|
package/src/runtime/child-pi.ts
CHANGED
|
@@ -115,6 +115,8 @@ export interface ChildPiLifecycleEvent {
|
|
|
115
115
|
exitCode?: number | null;
|
|
116
116
|
/** Error message for error events. */
|
|
117
117
|
error?: string;
|
|
118
|
+
/** Stderr captured at timeout moment (for response_timeout events). */
|
|
119
|
+
stderr?: string;
|
|
118
120
|
/** Timestamp (ISO). */
|
|
119
121
|
ts: string;
|
|
120
122
|
}
|
|
@@ -136,6 +138,10 @@ export interface ChildPiRunInput {
|
|
|
136
138
|
finalDrainMs?: number;
|
|
137
139
|
hardKillMs?: number;
|
|
138
140
|
responseTimeoutMs?: number;
|
|
141
|
+
/** Soft limit on assistant turns — inject steer at this count. */
|
|
142
|
+
maxTurns?: number;
|
|
143
|
+
/** Extra turns after soft limit before hard abort. Default: 5. */
|
|
144
|
+
graceTurns?: number;
|
|
139
145
|
}
|
|
140
146
|
|
|
141
147
|
export interface ChildPiRunResult {
|
|
@@ -144,18 +150,57 @@ export interface ChildPiRunResult {
|
|
|
144
150
|
stderr: string;
|
|
145
151
|
error?: string;
|
|
146
152
|
exitStatus?: WorkerExitStatus;
|
|
153
|
+
/** True if the agent was hard-aborted (max_turns + grace exceeded). */
|
|
154
|
+
aborted?: boolean;
|
|
155
|
+
/** True if the agent was steered to wrap up (hit soft turn limit) but finished in time. */
|
|
156
|
+
steered?: boolean;
|
|
147
157
|
}
|
|
148
158
|
|
|
149
159
|
export function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): SpawnOptions {
|
|
150
|
-
// Filter out env vars whose keys match secret patterns to avoid leaking credentials to child processes
|
|
151
|
-
|
|
160
|
+
// Filter out env vars whose keys match secret patterns to avoid leaking credentials to child processes.
|
|
161
|
+
// IMPORTANT: preserve model provider API keys — they are needed by the child Pi to call the LLM.
|
|
162
|
+
// Also preserve essential non-secret vars (PATH, HOME, USER, etc.) so the child process can function.
|
|
163
|
+
// Bug #10 fix: allow-list preserves model provider keys.
|
|
164
|
+
// Bug #12 fix: essential env vars (PATH, HOME, etc.) are always preserved so child can find npm/node.
|
|
165
|
+
const filteredEnv = sanitizeEnvSecrets(env, {
|
|
166
|
+
allowList: [
|
|
167
|
+
// Model provider API keys (these are safe to pass — they're meant for API calls)
|
|
168
|
+
"MINIMAX_*",
|
|
169
|
+
"OPENAI_*",
|
|
170
|
+
"ANTHROPIC_*",
|
|
171
|
+
"GOOGLE_*",
|
|
172
|
+
"AZURE_*",
|
|
173
|
+
"AWS_*",
|
|
174
|
+
"ZEU_*",
|
|
175
|
+
"ZERODEV_*",
|
|
176
|
+
"*_API_KEY",
|
|
177
|
+
"*_TOKEN",
|
|
178
|
+
"*_SECRET",
|
|
179
|
+
// Essential non-secret vars for child process to function
|
|
180
|
+
"PATH",
|
|
181
|
+
"HOME",
|
|
182
|
+
"USER",
|
|
183
|
+
"SHELL",
|
|
184
|
+
"TERM",
|
|
185
|
+
"LANG",
|
|
186
|
+
"LC_*",
|
|
187
|
+
"XDG_*",
|
|
188
|
+
"NVM_*",
|
|
189
|
+
"NODE_*",
|
|
190
|
+
"npm_*",
|
|
191
|
+
"PI_*",
|
|
192
|
+
"PI_CREW_*",
|
|
193
|
+
"PI_TEAMS_*",
|
|
194
|
+
],
|
|
195
|
+
});
|
|
152
196
|
return {
|
|
153
197
|
cwd,
|
|
154
198
|
env: { ...filteredEnv, PI_CREW_PARENT_PID: String(process.pid) },
|
|
155
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
199
|
+
stdio: ["ignore", "pipe", "pipe"], // stdin=ignore: child doesn't wait for input; task comes via CLI args
|
|
156
200
|
detached: process.platform !== "win32",
|
|
201
|
+
setsid: true,
|
|
157
202
|
windowsHide: true,
|
|
158
|
-
};
|
|
203
|
+
} as SpawnOptions;
|
|
159
204
|
}
|
|
160
205
|
|
|
161
206
|
function appendTranscript(input: ChildPiRunInput, line: string): void {
|
|
@@ -326,7 +371,7 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
326
371
|
if (mock === "retryable-failure") return { exitCode: 1, stdout: "", stderr: "rate limit: mock failure" };
|
|
327
372
|
return { exitCode: 1, stdout: "", stderr: `mock failure: ${mock}` };
|
|
328
373
|
}
|
|
329
|
-
const built = buildPiWorkerArgs({ task: input.task, agent: input.agent, model: input.model, sessionEnabled:
|
|
374
|
+
const built = buildPiWorkerArgs({ task: input.task, agent: input.agent, model: input.model, sessionEnabled: true, maxDepth: input.maxDepth, skillPaths: input.skillPaths });
|
|
330
375
|
const spawnSpec = getPiSpawnCommand(built.args);
|
|
331
376
|
try {
|
|
332
377
|
return await new Promise<ChildPiRunResult>((resolve) => {
|
|
@@ -355,12 +400,20 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
355
400
|
let abortRequested = input.signal?.aborted === true;
|
|
356
401
|
let hardKilled = false;
|
|
357
402
|
const cleanupErrors: string[] = [];
|
|
403
|
+
let turnCount = 0;
|
|
404
|
+
let softLimitReached = false;
|
|
405
|
+
const maxTurns = input.maxTurns;
|
|
406
|
+
const graceTurns = input.graceTurns;
|
|
407
|
+
let abortDueToParentSignal = false;
|
|
408
|
+
input.signal?.addEventListener("abort", () => { abortDueToParentSignal = true; }, { once: true });
|
|
358
409
|
const restartNoResponseTimer = (): void => {
|
|
359
410
|
if (responseTimeoutMs <= 0) return;
|
|
360
411
|
if (noResponseTimer) clearTimeout(noResponseTimer);
|
|
361
412
|
noResponseTimer = setTimeout(() => {
|
|
362
413
|
responseTimeoutHit = true;
|
|
363
|
-
|
|
414
|
+
// Capture stderr at timeout moment for debugging
|
|
415
|
+
const timeoutStderr = stderr.slice(-1024); // Last 1KB of stderr
|
|
416
|
+
input.onLifecycleEvent?.({ type: "response_timeout", pid: child.pid, error: `No output for ${responseTimeoutMs}ms`, ts: new Date().toISOString(), stderr: timeoutStderr || undefined });
|
|
364
417
|
killProcessTree(child.pid, child);
|
|
365
418
|
try {
|
|
366
419
|
child.kill(process.platform === "win32" ? undefined : "SIGTERM");
|
|
@@ -384,6 +437,21 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
384
437
|
},
|
|
385
438
|
onJsonEvent: (event) => {
|
|
386
439
|
restartNoResponseTimer();
|
|
440
|
+
// Turn-count-based steering: soft limit steer + hard abort after graceTurns
|
|
441
|
+
if (event && typeof event === "object" && !Array.isArray(event)) {
|
|
442
|
+
const obj = event as Record<string, unknown>;
|
|
443
|
+
if (obj.type === "turn_end") {
|
|
444
|
+
turnCount += 1;
|
|
445
|
+
if (maxTurns !== undefined && !softLimitReached && turnCount >= maxTurns) {
|
|
446
|
+
softLimitReached = true;
|
|
447
|
+
// Inject steer via stdin to tell child to wrap up
|
|
448
|
+
child.stdin?.write(JSON.stringify({ type: "steer", message: "You have reached your turn limit. Wrap up immediately — provide your final answer now." }) + "\n");
|
|
449
|
+
} else if (maxTurns !== undefined && softLimitReached && turnCount >= maxTurns + (graceTurns ?? 5)) {
|
|
450
|
+
// Hard abort — terminate after grace turns
|
|
451
|
+
try { child.kill(process.platform === "win32" ? undefined : "SIGTERM"); } catch { /* best-effort */ }
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
387
455
|
input.onJsonEvent?.(event);
|
|
388
456
|
if (!isFinalAssistantEvent(event) || childExited || settled || finalDrainTimer) return;
|
|
389
457
|
finalDrainTimer = setTimeout(() => {
|
|
@@ -440,7 +508,12 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
440
508
|
} catch (error) {
|
|
441
509
|
cleanupErrors.push(error instanceof Error ? error.message : String(error));
|
|
442
510
|
}
|
|
443
|
-
|
|
511
|
+
// Catch all errors from settle to prevent unhandled rejection from propagating
|
|
512
|
+
try {
|
|
513
|
+
resolve({ ...result, exitStatus: result.exitStatus ?? { exitCode: result.exitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
|
|
514
|
+
} catch (resolveError) {
|
|
515
|
+
logInternalError("child-pi.settle-resolve", resolveError, `result=${JSON.stringify({ exitCode: result.exitCode })}`);
|
|
516
|
+
}
|
|
444
517
|
};
|
|
445
518
|
|
|
446
519
|
const abort = (): void => {
|
|
@@ -485,7 +558,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
485
558
|
restartNoResponseTimer();
|
|
486
559
|
const text = chunk.toString("utf-8");
|
|
487
560
|
backpressureBytes += text.length;
|
|
488
|
-
|
|
561
|
+
try {
|
|
562
|
+
lineObserver.observe(text);
|
|
563
|
+
} catch (err) {
|
|
564
|
+
logInternalError("child-pi.line-observer-observe", err, `text=${text.slice(0, 100)}`);
|
|
565
|
+
}
|
|
489
566
|
if (backpressureBytes > BACKPRESSURE_HIGH && child.stdout && !child.stdout.isPaused()) {
|
|
490
567
|
try { child.stdout.pause(); } catch { /* ignore */ }
|
|
491
568
|
const timer = setTimeout(releaseBackpressure, 50);
|
|
@@ -497,7 +574,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
497
574
|
stderr = appendBoundedTail(stderr, chunk.toString("utf-8"));
|
|
498
575
|
});
|
|
499
576
|
child.on("error", (error) => {
|
|
500
|
-
|
|
577
|
+
try {
|
|
578
|
+
input.onLifecycleEvent?.({ type: "spawn_error", pid: child.pid, error: error.message, ts: new Date().toISOString() });
|
|
579
|
+
} catch (err) {
|
|
580
|
+
logInternalError("child-pi.on-lifecycle-event", err, `event=error, pid=${child.pid}`);
|
|
581
|
+
}
|
|
501
582
|
settle({ exitCode: null, stdout, stderr, error: error.message });
|
|
502
583
|
});
|
|
503
584
|
child.on("exit", (code) => {
|
|
@@ -505,7 +586,11 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
505
586
|
activeChildProcesses.delete(child.pid);
|
|
506
587
|
clearHardKillTimer(child.pid);
|
|
507
588
|
}
|
|
508
|
-
|
|
589
|
+
try {
|
|
590
|
+
input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString() });
|
|
591
|
+
} catch (err) {
|
|
592
|
+
logInternalError("child-pi.on-lifecycle-event", err, `event=exit, pid=${child.pid}`);
|
|
593
|
+
}
|
|
509
594
|
childExited = true;
|
|
510
595
|
clearNoResponseTimer();
|
|
511
596
|
clearFinalDrainTimers();
|
|
@@ -521,14 +606,22 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
521
606
|
activeChildProcesses.delete(child.pid);
|
|
522
607
|
clearHardKillTimer(child.pid);
|
|
523
608
|
}
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
//
|
|
531
|
-
|
|
609
|
+
try {
|
|
610
|
+
input.onLifecycleEvent?.({ type: "close", pid: child.pid, exitCode, ts: new Date().toISOString() });
|
|
611
|
+
} catch (err) {
|
|
612
|
+
logInternalError("child-pi.on-lifecycle-event", err, `event=close, pid=${child.pid}`);
|
|
613
|
+
}
|
|
614
|
+
const timeoutError = responseTimeoutHit && !stderr.trim() ? { error: `Child Pi produced no new output for ${responseTimeoutMs}ms; process was terminated as unresponsive.` } : responseTimeoutHit && stderr.trim() ? { error: `Child Pi timed out after ${responseTimeoutMs}ms with stderr: ${stderr.slice(-500)}` } : undefined;
|
|
615
|
+
// M6 fix: log when forced final drain converts non-zero exit to 0.
|
|
616
|
+
// This is expected in normal operation (child finished cleanly but linger was killed),
|
|
617
|
+
// but the telemetry helps detect regressions where crashes are hidden.
|
|
618
|
+
if (forcedFinalDrain && !timeoutError && exitCode !== 0) {
|
|
619
|
+
logInternalError("child-pi.final-drain-zero-exit", new Error(`Child exit code overridden to 0 after forced final drain (original=${exitCode})`), `pid=${child.pid}, finalDrainMs=${finalDrainMs}`);
|
|
620
|
+
}
|
|
621
|
+
const finalExitCode = forcedFinalDrain && !timeoutError ? 0 : exitCode;
|
|
622
|
+
const wasGraceAborted = softLimitReached && turnCount >= (maxTurns ?? 0) + (graceTurns ?? 5);
|
|
623
|
+
const wasParentAborted = abortDueToParentSignal && !wasGraceAborted;
|
|
624
|
+
settle({ exitCode: finalExitCode, stdout, stderr, ...(timeoutError ? { error: timeoutError.error } : {}), aborted: wasGraceAborted || wasParentAborted, steered: softLimitReached && !wasGraceAborted, exitStatus: { exitCode: finalExitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
|
|
532
625
|
});
|
|
533
626
|
});
|
|
534
627
|
} finally {
|
|
@@ -88,7 +88,7 @@ function findBraceRanges(lines: string[], openPattern: RegExp, minBody: number):
|
|
|
88
88
|
// ── TypeScript / JavaScript ──
|
|
89
89
|
|
|
90
90
|
const TS_FN_SIG =
|
|
91
|
-
/^\s*(export\s+)?(async\s+)?function\s|^\s*(export\s+)?(static\s+|get\s+|set\s+|private\s+|public\s+|protected\s+|readonly\s+)*\*?\s*\w+\s*[
|
|
91
|
+
/^\s*(export\s+)?(async\s+)?function\s|^\s*(export\s+)?(static\s+|get\s+|set\s+|private\s+|public\s+|protected\s+|readonly\s+)*\*?\s*\w+\s*[(<]/;
|
|
92
92
|
const TS_CLASS_SIG = /^\s*(export\s+)?(default\s+)?(abstract\s+)?class\s/;
|
|
93
93
|
const TS_STRUCT_SIG = /^\s*(export\s+)?(default\s+)?(const|let|var)\s+\w+\s*=\s*(\[[\s]*$|\{[\s]*$)/;
|
|
94
94
|
|
|
@@ -24,7 +24,7 @@ export interface RecoveryPlan {
|
|
|
24
24
|
}
|
|
25
25
|
|
|
26
26
|
function isTerminalTask(task: TeamTaskState): boolean {
|
|
27
|
-
return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped";
|
|
27
|
+
return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped" || task.status === "needs_attention";
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
|
|
@@ -2,7 +2,7 @@ import type { TeamTaskStatus } from "../state/contracts.ts";
|
|
|
2
2
|
import type { CrewActivityState, ModelRoutingState, UsageState } from "../state/types.ts";
|
|
3
3
|
|
|
4
4
|
export type CrewRuntimeKind = "scaffold" | "child-process" | "live-session";
|
|
5
|
-
export type CrewAgentStatus = "queued" | "running" | "waiting" | "completed" | "failed" | "cancelled" | "stopped";
|
|
5
|
+
export type CrewAgentStatus = "queued" | "running" | "waiting" | "completed" | "failed" | "cancelled" | "stopped" | "needs_attention";
|
|
6
6
|
|
|
7
7
|
export interface CrewAgentRecentTool {
|
|
8
8
|
tool: string;
|
|
@@ -56,5 +56,6 @@ export function taskStatusToAgentStatus(status: TeamTaskStatus): CrewAgentStatus
|
|
|
56
56
|
if (status === "cancelled" || status === "skipped") return "cancelled";
|
|
57
57
|
if (status === "running") return "running";
|
|
58
58
|
if (status === "waiting") return "waiting";
|
|
59
|
+
if (status === "needs_attention") return "needs_attention";
|
|
59
60
|
return "queued";
|
|
60
61
|
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
1
2
|
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
3
|
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
4
|
import { appendEvent } from "../state/event-log.ts";
|
|
@@ -84,6 +85,9 @@ export class HeartbeatWatcher {
|
|
|
84
85
|
|
|
85
86
|
for (const run of this.opts.manifestCache.list(50)) {
|
|
86
87
|
if (run.status !== "running") continue;
|
|
88
|
+
// Bug #5 fix: if stateRoot doesn't exist, the run was pruned — skip it silently.
|
|
89
|
+
// This prevents stale "heartbeat dead" notifications for runs that no longer exist.
|
|
90
|
+
if (!fs.existsSync(run.stateRoot)) continue;
|
|
87
91
|
const loaded = loadRunManifestById(this.opts.cwd, run.runId);
|
|
88
92
|
if (!loaded) continue;
|
|
89
93
|
for (const task of loaded.tasks) {
|
|
@@ -200,7 +200,7 @@ export function evictStaleLiveAgentHandles(now = Date.now()): number {
|
|
|
200
200
|
}
|
|
201
201
|
|
|
202
202
|
export function listLiveAgents(): LiveAgentHandle[] {
|
|
203
|
-
return [...liveAgents.values()].sort((a, b) => b.updatedAt.localeCompare(a.updatedAt));
|
|
203
|
+
return [...liveAgents.values()].sort((a, b) => (b.updatedAt ?? "").localeCompare(a.updatedAt ?? ""));
|
|
204
204
|
}
|
|
205
205
|
|
|
206
206
|
export function listActiveLiveAgents(): LiveAgentHandle[] {
|
|
@@ -267,8 +267,9 @@ function liveSystemPrompt(input: LiveSessionSpawnInput): string {
|
|
|
267
267
|
function filterActiveTools(session: LiveSessionLike, agent: AgentConfig): void {
|
|
268
268
|
if (typeof session.getActiveToolNames !== "function" || typeof session.setActiveToolsByName !== "function") return;
|
|
269
269
|
const recursiveTools = new Set(["team", "Team", "Agent", "get_subagent_result", "steer_subagent"]);
|
|
270
|
+
const disallowed = agent.disallowedTools?.length ? new Set(agent.disallowedTools) : undefined;
|
|
270
271
|
const allowed = agent.tools?.length ? new Set(agent.tools) : undefined;
|
|
271
|
-
const active = session.getActiveToolNames().filter((name) => !recursiveTools.has(name) && (!allowed || allowed.has(name)));
|
|
272
|
+
const active = session.getActiveToolNames().filter((name) => !recursiveTools.has(name) && (!disallowed || !disallowed.has(name)) && (!allowed || allowed.has(name)));
|
|
272
273
|
session.setActiveToolsByName(active);
|
|
273
274
|
}
|
|
274
275
|
|