pi-crew 0.3.7 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/CHANGELOG.md +17 -0
  2. package/package.json +1 -1
  3. package/src/agents/discover-agents.ts +2 -1
  4. package/src/config/config.ts +732 -208
  5. package/src/config/types.ts +34 -5
  6. package/src/extension/help.ts +1 -0
  7. package/src/extension/register.ts +1173 -257
  8. package/src/extension/registration/commands.ts +15 -2
  9. package/src/extension/registration/team-tool.ts +1 -1
  10. package/src/extension/session-summary.ts +11 -1
  11. package/src/extension/team-tool/api.ts +4 -1
  12. package/src/extension/team-tool/cache-control.ts +23 -0
  13. package/src/extension/team-tool/cancel.ts +15 -5
  14. package/src/extension/team-tool/context.ts +2 -0
  15. package/src/extension/team-tool/handle-settings.ts +2 -0
  16. package/src/extension/team-tool/health-monitor.ts +563 -0
  17. package/src/extension/team-tool/inspect.ts +10 -3
  18. package/src/extension/team-tool/respond.ts +5 -2
  19. package/src/extension/team-tool/status.ts +4 -1
  20. package/src/extension/team-tool-types.ts +2 -0
  21. package/src/extension/team-tool.ts +901 -177
  22. package/src/runtime/adaptive-plan.ts +1 -1
  23. package/src/runtime/foreground-watchdog.ts +129 -0
  24. package/src/runtime/manifest-cache.ts +4 -2
  25. package/src/runtime/run-tracker.ts +11 -0
  26. package/src/runtime/runtime-policy.ts +15 -2
  27. package/src/runtime/stale-reconciler.ts +322 -18
  28. package/src/runtime/task-runner.ts +6 -1
  29. package/src/schema/config-schema.ts +1 -0
  30. package/src/schema/team-tool-schema.ts +204 -76
  31. package/src/state/state-store.ts +9 -1
  32. package/src/teams/discover-teams.ts +2 -1
  33. package/src/ui/run-event-bus.ts +2 -1
  34. package/src/ui/settings-overlay.ts +2 -0
  35. package/src/workflows/discover-workflows.ts +5 -1
@@ -263,7 +263,7 @@ export interface InjectAdaptivePlanResult {
263
263
  export function injectAdaptivePlanIfReady(input: InjectAdaptivePlanInput): InjectAdaptivePlanResult {
264
264
  if (input.workflow.name !== "implementation") return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
265
265
  if (input.tasks.some((task) => task.stepId?.startsWith("adaptive-"))) return { tasks: input.tasks, workflow: reconstructAdaptiveWorkflow(input.workflow, input.tasks), injected: false, missingPlan: false };
266
- const completedAssess = input.tasks.find((task) => task.stepId === "assess" && task.status === "completed");
266
+ const completedAssess = input.tasks.find((task) => task.stepId === "assess" && (task.status === "completed" || task.status === "needs_attention"));
267
267
  if (!completedAssess) return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
268
268
  if (!completedAssess.resultArtifact?.path) {
269
269
  appendEvent(input.manifest.eventsPath, { type: "adaptive.plan_missing", runId: input.manifest.runId, taskId: completedAssess.id, message: "Adaptive planner result artifact is missing." });
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Foreground run watchdog — periodically checks that active foreground runs
3
+ * are making progress and auto-notifies the assistant if a run appears hung.
4
+ *
5
+ * Problem: foreground runs run in background via startForegroundRun(). The Pi
6
+ * assistant has no way to know when a run completes or gets stuck without
7
+ * manual polling. This watchdog monitors active runs and:
8
+ *
9
+ * 1. Detects hung runs (active status, no heartbeat update for >10 min)
10
+ * 2. Injects a followUp message via pi.sendUserMessage() so the assistant
11
+ * is automatically notified — no manual sleep+check needed.
12
+ * 3. Cleans up after itself when the run completes or the session ends.
13
+ */
14
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
15
+ import { loadRunManifestById } from "../state/state-store.ts";
16
+ import { readCrewAgents } from "./crew-agent-records.ts";
17
+ import { isActiveRunStatus, isLikelyOrphanedActiveRun } from "./process-status.ts";
18
+
19
/** Settings accepted by `startForegroundWatchdog`. */
export interface WatchdogOptions {
  /** Extension API handle; the watchdog posts follow-up messages through it. */
  pi: ExtensionAPI;
  /** Working directory handed to the run-manifest lookup. */
  cwd: string;
  /** Identifier of the run being monitored. */
  runId: string;
  /** Delay between two checks, in milliseconds. Defaults to 5 minutes. */
  checkIntervalMs?: number;
  /** Upper bound on the total monitoring time, in milliseconds. Defaults to 2 hours. */
  maxMonitorMs?: number;
}
28
+
29
/** Default delay between watchdog checks: 5 minutes. */
const DEFAULT_CHECK_INTERVAL_MS = 5 * 60 * 1000;
/** Default cap on how long a single run is monitored: 2 hours. */
const DEFAULT_MAX_MONITOR_MS = 2 * 60 * 60 * 1000;

/** Pending watchdog timers, keyed by runId so they can be cancelled later. */
const activeWatchdogs: Map<string, ReturnType<typeof setTimeout>> = new Map();
34
+
35
/**
 * Cancel the pending watchdog timer registered for `runId` and remove it from
 * the registry. Does nothing when no watchdog is tracked for that run.
 */
export function stopWatchdog(runId: string): void {
  const pending = activeWatchdogs.get(runId);
  if (!pending) return;
  clearTimeout(pending);
  activeWatchdogs.delete(runId);
}
43
+
44
/**
 * Stop every active watchdog: clear each pending timer, then empty the
 * registry. Intended to be called on session shutdown.
 */
export function stopAllWatchdogs(): void {
  // Only the timer handles are needed here; the runId keys are irrelevant.
  for (const timer of activeWatchdogs.values()) {
    clearTimeout(timer);
  }
  activeWatchdogs.clear();
}
51
+
52
/**
 * Start a periodic watchdog for a foreground run.
 *
 * Every `checkIntervalMs` the run manifest is reloaded and inspected:
 * - manifest not found → the watchdog stops silently;
 * - status no longer active → a follow-up message announcing the final status
 *   is sent and the watchdog stops;
 * - run judged hung by `isLikelyOrphanedActiveRun` → a follow-up message with
 *   a hint is sent and monitoring continues (the message is repeated on every
 *   check while the run still looks hung).
 *
 * The watchdog also stops once `maxMonitorMs` has elapsed, or when it is
 * stopped explicitly via `stopWatchdog()` / `stopAllWatchdogs()`. Starting a
 * second watchdog for a run that already has one is a no-op.
 *
 * @param opts Run to monitor plus optional timing overrides.
 */
export function startForegroundWatchdog(opts: WatchdogOptions): void {
  const { pi, cwd, runId } = opts;
  const checkIntervalMs = opts.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS;
  const maxMonitorMs = opts.maxMonitorMs ?? DEFAULT_MAX_MONITOR_MS;

  // Don't stack watchdogs for the same run.
  if (activeWatchdogs.has(runId)) return;

  const startTime = Date.now();
  // Handle of the timer that is pending (or currently firing) for THIS watchdog.
  let currentTimer: ReturnType<typeof setTimeout> | undefined;

  /** True while the registry entry for runId still belongs to this watchdog. */
  const ownsRegistration = (): boolean => activeWatchdogs.get(runId) === currentTimer;

  /** Drop the registry entry, but never one owned by a newer watchdog. */
  const release = (): void => {
    if (ownsRegistration()) activeWatchdogs.delete(runId);
  };

  /** Best-effort follow-up message; a failed send must not kill the watchdog. */
  const notify = (text: string): void => {
    try {
      pi.sendUserMessage(text, { deliverAs: "followUp" });
    } catch {
      /* non-critical */
    }
  };

  const check = (): void => {
    // Give up once the monitoring budget is spent.
    if (Date.now() - startTime > maxMonitorMs) {
      release();
      return;
    }

    try {
      const loaded = loadRunManifestById(cwd, runId);
      if (!loaded) {
        // Run not found — stop watchdog.
        release();
        return;
      }

      const { manifest } = loaded;

      // Terminal status — send completion notification and stop.
      if (!isActiveRunStatus(manifest.status)) {
        const teamName = manifest.team ?? "unknown";
        notify(`pi-crew run ${manifest.status}: ${runId} (${teamName}/${manifest.workflow ?? "default"})`);
        release();
        return;
      }

      // Still active: warn if the run appears hung, but keep monitoring —
      // the assistant or user may intervene.
      const agents = readCrewAgents(manifest);
      const now = Date.now();
      if (isLikelyOrphanedActiveRun(manifest, agents, now)) {
        const detail = `status=${manifest.status}, updatedAt=${manifest.updatedAt}, agents=${agents.length}`;
        notify(
          `pi-crew watchdog: run ${runId} appears hung (${detail}). Consider running team action='cancel' runId='${runId}' or team action='doctor'.`,
        );
      }
    } catch {
      // Non-critical — skip this check and try again at the next interval.
    }

    // If the watchdog was stopped while this check was executing, the registry
    // entry is gone (or belongs to a newer watchdog); do not resurrect it.
    if (!ownsRegistration()) return;
    scheduleNext();
  };

  /** Arm the next check and record its timer so it can be cancelled. */
  const scheduleNext = (): void => {
    currentTimer = setTimeout(check, checkIntervalMs);
    currentTimer.unref(); // Don't prevent process exit
    activeWatchdogs.set(runId, currentTimer);
  };

  // First check after the initial interval.
  scheduleNext();
}
@@ -108,8 +108,10 @@ function parseManifestIfChanged(root: string, runId: string, filePath: string, p
108
108
 
109
109
  function listRunRoots(cwd: string): string[] {
110
110
  const roots = new Set<string>();
111
- const base = findRepoRoot(cwd) ? projectCrewRoot(cwd) : userCrewRoot();
112
- roots.add(path.join(base, DEFAULT_PATHS.state.runsSubdir));
111
+ // Always include user-level runs (fast-fix, direct-agent, etc. write here)
112
+ roots.add(path.join(userCrewRoot(), DEFAULT_PATHS.state.runsSubdir));
113
+ const projectRoot = findRepoRoot(cwd);
114
+ if (projectRoot) roots.add(path.join(projectCrewRoot(cwd), DEFAULT_PATHS.state.runsSubdir));
113
115
  return [...roots];
114
116
  }
115
117
 
@@ -1,4 +1,6 @@
1
1
  import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
2
+ import * as fs from "node:fs";
3
+ import * as path from "node:path";
2
4
  import { loadRunManifestById } from "../state/state-store.ts";
3
5
  import { isFinishedRunStatus } from "./process-status.ts";
4
6
 
@@ -75,6 +77,15 @@ export async function waitForRun(
75
77
  // Slow path: background run — poll with exponential backoff capped at pollIntervalMs
76
78
  let attempt = 0;
77
79
  while (Date.now() < deadline) {
80
+ if (attempt === 0) {
81
+ // Early exit: if the run directory doesn't exist, don't waste time polling
82
+ const runDir = path.join(cwd, ".crew", "state", "runs", runId);
83
+ if (!fs.existsSync(runDir)) {
84
+ throw new Error(
85
+ `Run ${runId} not found. No run directory at ${runDir}`,
86
+ );
87
+ }
88
+ }
78
89
  const fresh = loadRunManifestById(cwd, runId);
79
90
  if (fresh && isFinishedRunStatus(fresh.manifest.status)) {
80
91
  return fresh;
@@ -9,12 +9,25 @@ import { currentCrewDepth } from "./pi-args.ts";
9
9
  * - If the role appears in `isolationPolicy.isolatedRoles`, use child-process (crash isolation).
10
10
  * - Otherwise, use `isolationPolicy.defaultRuntime` when configured, then fall back to globalKind.
11
11
  */
12
- export function resolveTaskRuntimeKind(globalKind: CrewRuntimeKind, role: string, isolationPolicy: CrewRuntimeConfig["isolationPolicy"], env: NodeJS.ProcessEnv = process.env): CrewRuntimeKind {
12
+ export function resolveTaskRuntimeKind(
13
+ globalKind: CrewRuntimeKind,
14
+ role: string,
15
+ isolationPolicy: CrewRuntimeConfig["isolationPolicy"],
16
+ env: NodeJS.ProcessEnv = process.env,
17
+ ): CrewRuntimeKind {
13
18
  if (globalKind === "scaffold") return "scaffold";
14
19
  // Safety: when already inside a pi-crew worker (depth > 0), never nest live-session.
15
20
  // Live-session creates in-process Pi agent sessions, which would recursively
16
21
  // try to use pi-crew, leading to "Cannot read properties of undefined" errors.
17
- if (globalKind === "live-session" && currentCrewDepth(env) > 0) return "child-process";
22
+ // Exception: when PI_CREW_MOCK_LIVE_SESSION is set, we're in a test harness
23
+ // that mocks the live-session path — forcing child-process would spawn a real
24
+ // pi process and hang the test.
25
+ if (
26
+ globalKind === "live-session" &&
27
+ currentCrewDepth(env) > 0 &&
28
+ env.PI_CREW_MOCK_LIVE_SESSION !== "success"
29
+ )
30
+ return "child-process";
18
31
  const isolatedRoles = isolationPolicy?.isolatedRoles ?? [];
19
32
  if (isolatedRoles.includes(role)) return "child-process";
20
33
  return isolationPolicy?.defaultRuntime ?? globalKind;
@@ -1,8 +1,12 @@
1
1
  import * as fs from "node:fs";
2
+ import * as os from "node:os";
2
3
  import * as path from "node:path";
3
4
  import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
4
- import { checkProcessLiveness } from "./process-status.ts";
5
5
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
+ import { checkProcessLiveness } from "./process-status.ts";
7
+
8
+ /** Age threshold for orphaned temp directory cleanup: 1 hour. */
9
+ const ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS = 60 * 60 * 1000;
6
10
 
7
11
  /**
8
12
  * Result of reconciling a single stale run.
@@ -10,7 +14,12 @@ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
10
14
  export interface ReconcileResult {
11
15
  runId: string;
12
16
  /** What was found and what action was taken */
13
- verdict: "healthy" | "result_exists" | "pid_dead" | "pid_alive_stale" | "no_status";
17
+ verdict:
18
+ | "healthy"
19
+ | "result_exists"
20
+ | "pid_dead"
21
+ | "pid_alive_stale"
22
+ | "no_status";
14
23
  /** Whether repair was applied */
15
24
  repaired: boolean;
16
25
  /** Human-readable detail */
@@ -21,6 +30,8 @@ export interface ReconcileResult {
21
30
 
22
31
  const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
23
32
  const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
33
+ /** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
34
+ const NO_PID_HEARTBEAT_STALE_MS = 5 * 60 * 1000; // 5 minutes — same as heartbeat-gradient deadMs
24
35
 
25
36
  /**
26
37
  * Phase 1: Check if a result file already exists for the run.
@@ -31,14 +42,28 @@ function checkResultFile(
31
42
  tasks: TeamTaskState[],
32
43
  ): { found: boolean; repaired: boolean } {
33
44
  // Check if all tasks already have terminal status (result was written but manifest wasn't updated)
34
- const allTerminal = tasks.length > 0 && tasks.every(
35
- (t) => t.status === "completed" || t.status === "failed" || t.status === "cancelled" || t.status === "skipped" || t.status === "needs_attention",
36
- );
45
+ const allTerminal =
46
+ tasks.length > 0 &&
47
+ tasks.every(
48
+ (t) =>
49
+ t.status === "completed" ||
50
+ t.status === "failed" ||
51
+ t.status === "cancelled" ||
52
+ t.status === "skipped" ||
53
+ t.status === "needs_attention",
54
+ );
37
55
  if (allTerminal) {
38
56
  // Sync agent records even when tasks are already terminal
39
57
  // (e.g., a previous reconcile fixed tasks but crashed before updating agents)
40
58
  for (const task of tasks) {
41
- try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
59
+ try {
60
+ upsertCrewAgent(
61
+ manifest,
62
+ recordFromTask(manifest, task, "scaffold"),
63
+ );
64
+ } catch {
65
+ /* non-critical */
66
+ }
42
67
  }
43
68
  return { found: true, repaired: false };
44
69
  }
@@ -52,7 +77,10 @@ function checkResultFile(
52
77
  * written, treat the PID as alive even if process.kill returns false
53
78
  * (handles SIGKILL race where PID hasn't been recycled yet).
54
79
  */
55
- function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
80
+ function checkPidLiveness(
81
+ pid: number | undefined,
82
+ stateRoot?: string,
83
+ ): {
56
84
  alive: boolean;
57
85
  detail: string;
58
86
  } {
@@ -67,13 +95,18 @@ function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
67
95
  const heartbeatPath = path.join(stateRoot, "heartbeat.json");
68
96
  try {
69
97
  if (fs.existsSync(heartbeatPath)) {
70
- const hb = JSON.parse(fs.readFileSync(heartbeatPath, "utf-8")) as { pid?: number; at?: number };
98
+ const hb = JSON.parse(
99
+ fs.readFileSync(heartbeatPath, "utf-8"),
100
+ ) as { pid?: number; at?: number };
71
101
  if (hb?.pid === pid && hb?.at) {
72
102
  const ageMs = Date.now() - hb.at;
73
103
  // Heartbeat written < 5 min ago → process was alive recently.
74
104
  // Don't repair yet; let the next reconciliation cycle catch it.
75
105
  if (ageMs < 5 * 60_000) {
76
- return { alive: true, detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old` };
106
+ return {
107
+ alive: true,
108
+ detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old`,
109
+ };
77
110
  }
78
111
  }
79
112
  }
@@ -101,18 +134,76 @@ function evaluateStaleness(
101
134
  return { stale: false, reason: "updated_at_invalid" };
102
135
  }
103
136
  if (now - updatedAt > STALE_ALIVE_PID_MS) {
104
- return { stale: true, reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h` };
137
+ return {
138
+ stale: true,
139
+ reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h`,
140
+ };
105
141
  }
106
142
  return { stale: false, reason: "alive_and_recent" };
107
143
  }
108
144
 
109
145
  function hasRecentActiveEvidence(tasks: TeamTaskState[], now: number): boolean {
110
146
  return tasks.some((task) => {
111
- if (task.status !== "running" && task.status !== "waiting") return false;
112
- const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
113
- if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS) return true;
114
- const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
115
- return Number.isFinite(activityAt) && now - activityAt <= ACTIVE_EVIDENCE_TTL_MS;
147
+ if (task.status !== "running" && task.status !== "waiting")
148
+ return false;
149
+ const heartbeatAt = task.heartbeat?.lastSeenAt
150
+ ? new Date(task.heartbeat.lastSeenAt).getTime()
151
+ : Number.NaN;
152
+ if (
153
+ task.heartbeat?.alive !== false &&
154
+ Number.isFinite(heartbeatAt) &&
155
+ now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS
156
+ )
157
+ return true;
158
+ const activityAt = task.agentProgress?.lastActivityAt
159
+ ? new Date(task.agentProgress.lastActivityAt).getTime()
160
+ : Number.NaN;
161
+ return (
162
+ Number.isFinite(activityAt) &&
163
+ now - activityAt <= ACTIVE_EVIDENCE_TTL_MS
164
+ );
165
+ });
166
+ }
167
+
168
/**
 * For runs without a recorded PID: decide whether every running/waiting task
 * looks dead based on its timestamps.
 *
 * A task counts as stale when it has at least one valid timestamp (heartbeat
 * `lastSeenAt` or agent-progress `lastActivityAt`) and none of the valid ones
 * falls within NO_PID_HEARTBEAT_STALE_MS of `now`. A task with neither
 * timestamp is treated as NOT stale (it may be newly spawned and not have
 * reported yet).
 *
 * @returns `true` only when there is at least one running/waiting task and
 *   all of them are stale.
 */
function allRunningTasksHeartbeatStale(
  tasks: TeamTaskState[],
  now: number,
): boolean {
  let activeCount = 0;
  for (const task of tasks) {
    if (task.status !== "running" && task.status !== "waiting") continue;
    activeCount += 1;

    const lastSeen = task.heartbeat?.lastSeenAt;
    const heartbeatMs = lastSeen ? new Date(lastSeen).getTime() : Number.NaN;
    const lastActivity = task.agentProgress?.lastActivityAt;
    const activityMs = lastActivity ? new Date(lastActivity).getTime() : Number.NaN;

    const hasHeartbeat = Number.isFinite(heartbeatMs);
    const hasActivity = Number.isFinite(activityMs);

    // No usable timestamp at all: staleness cannot be determined.
    if (!hasHeartbeat && !hasActivity) return false;
    // Any recent signal keeps the task (and therefore the run) alive.
    if (hasHeartbeat && now - heartbeatMs <= NO_PID_HEARTBEAT_STALE_MS) return false;
    if (hasActivity && now - activityMs <= NO_PID_HEARTBEAT_STALE_MS) return false;
  }
  // With no running/waiting task there is nothing to declare stale.
  return activeCount > 0;
}
118
209
 
@@ -126,7 +217,11 @@ function repairStaleRun(
126
217
  ): TeamTaskState[] {
127
218
  const now = new Date().toISOString();
128
219
  const repairedTasks = tasks.map((task) => {
129
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
220
+ if (
221
+ task.status === "running" ||
222
+ task.status === "queued" ||
223
+ task.status === "waiting"
224
+ ) {
130
225
  return {
131
226
  ...task,
132
227
  status: "cancelled" as const,
@@ -138,7 +233,14 @@ function repairStaleRun(
138
233
  });
139
234
  // Update agent records so widget sees cancelled status immediately
140
235
  for (const task of repairedTasks) {
141
- try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
236
+ try {
237
+ upsertCrewAgent(
238
+ manifest,
239
+ recordFromTask(manifest, task, "scaffold"),
240
+ );
241
+ } catch {
242
+ /* non-critical */
243
+ }
142
244
  }
143
245
  return repairedTasks;
144
246
  }
@@ -183,8 +285,31 @@ export function reconcileStaleRun(
183
285
  detail: "No PID recorded, but recent task heartbeat/progress exists; not repairing",
184
286
  };
185
287
  }
288
+ // No PID and no recent activity. If ALL running tasks have stale heartbeats
289
+ // (beyond NO_PID_HEARTBEAT_STALE_MS = 5min), repair immediately — the worker
290
+ // process is dead but we have no PID to check. This handles /tmp/ live-session
291
+ // workspaces where agents exit without calling submit_result.
292
+ if (allRunningTasksHeartbeatStale(tasks, now)) {
293
+ const repaired = repairStaleRun(
294
+ manifest,
295
+ tasks,
296
+ "no_pid_heartbeat_stale",
297
+ );
298
+ return {
299
+ runId,
300
+ verdict: "no_status",
301
+ repaired: true,
302
+ detail: `No PID; all running task heartbeats stale >${Math.round(NO_PID_HEARTBEAT_STALE_MS / 60_000)}min; repaired ${repaired.filter((t) => t.status === "cancelled").length} tasks`,
303
+ repairedTasks: repaired,
304
+ };
305
+ }
306
+ // Fall through: no recent activity but not all tasks stale enough yet.
307
+ // Check the longer STALE_ALIVE_PID_MS threshold for very old runs.
186
308
  const updatedAt = new Date(manifest.updatedAt).getTime();
187
- if (Number.isFinite(updatedAt) && now - updatedAt > STALE_ALIVE_PID_MS) {
309
+ if (
310
+ Number.isFinite(updatedAt) &&
311
+ now - updatedAt > STALE_ALIVE_PID_MS
312
+ ) {
188
313
  const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
189
314
  return {
190
315
  runId,
@@ -223,3 +348,182 @@ export function reconcileStaleRun(
223
348
  repairedTasks: repaired,
224
349
  };
225
350
  }
351
+
352
/**
 * Counters returned by `reconcileOrphanedTempWorkspaces`.
 */
export interface OrphanReconcileResult {
  /** How many runs were repaired, i.e. had their manifest rewritten as cancelled. */
  repaired: number;
  /** How many `pi-crew-*` workspace directories under the temp dir were deleted. */
  cleanedDirs: number;
}
361
+
362
/**
 * Scan the OS temp directory for `pi-crew-*` workspaces and reconcile any
 * stale runs found in their `.crew/state/runs` directories.
 *
 * For every manifest whose status is "running", `reconcileStaleRun` is
 * consulted; when it reports a repair, the repaired tasks and a cancelled
 * manifest are written back and agent records are refreshed.
 *
 * Unless `options.cleanupOrphanedTempDirs` is explicitly `false`, a workspace
 * directory is then deleted when no manifest in it is still "running" and the
 * directory's mtime is older than ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS.
 *
 * All filesystem and parse errors are swallowed (best effort); a failure on
 * one run or workspace never prevents the others from being processed.
 *
 * @param now Reference time in epoch milliseconds (defaults to `Date.now()`).
 * @param options Optional switches; see `cleanupOrphanedTempDirs` above.
 * @returns Number of runs repaired and directories cleaned.
 */
export function reconcileOrphanedTempWorkspaces(
  now = Date.now(),
  options?: { cleanupOrphanedTempDirs?: boolean },
): OrphanReconcileResult {
  const tmpDir = getSafeTempDir();
  if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };

  // Loop-invariant: cleanup is on unless explicitly disabled.
  const cleanupEnabled = options?.cleanupOrphanedTempDirs !== false;
  let repaired = 0;
  let cleanedDirs = 0;
  try {
    for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
      if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-")) continue;
      const workspaceDir = path.join(tmpDir, entry.name);
      const stateRunsDir = path.join(workspaceDir, ".crew", "state", "runs");
      if (!fs.existsSync(stateRunsDir)) continue;

      const outcome = reconcileWorkspaceRuns(stateRunsDir, now);
      repaired += outcome.repaired;

      // The remaining checks only matter for deletion; skip the extra disk
      // scan entirely when cleanup is turned off.
      if (!cleanupEnabled) continue;
      if (outcome.stillActive || workspaceHasRunningManifest(stateRunsDir)) continue;
      if (removeWorkspaceIfExpired(workspaceDir, now)) cleanedDirs++;
    }
  } catch {
    /* skip if tmpdir unreadable */
  }
  return { repaired, cleanedDirs };
}

/** Reconcile every "running" run under `stateRunsDir`, persisting any repairs. */
function reconcileWorkspaceRuns(
  stateRunsDir: string,
  now: number,
): { repaired: number; stillActive: boolean } {
  let repaired = 0;
  let stillActive = false;
  try {
    for (const runDir of fs.readdirSync(stateRunsDir)) {
      const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
      const tasksPath = path.join(stateRunsDir, runDir, "tasks.json");
      if (!fs.existsSync(manifestPath) || !fs.existsSync(tasksPath)) continue;
      try {
        const manifest: TeamRunManifest = JSON.parse(
          fs.readFileSync(manifestPath, "utf-8"),
        );
        if (manifest.status !== "running") continue;
        const tasks: TeamTaskState[] = JSON.parse(
          fs.readFileSync(tasksPath, "utf-8"),
        );
        const result = reconcileStaleRun(manifest, tasks, now);
        if (result.repaired && result.repairedTasks) {
          // Persist repaired tasks first, then the cancelled manifest.
          fs.writeFileSync(tasksPath, JSON.stringify(result.repairedTasks, null, 2));
          const updated = {
            ...manifest,
            status: "cancelled" as const,
            updatedAt: new Date(now).toISOString(),
            summary: `Stale run reconciled: ${result.detail}`,
          };
          fs.writeFileSync(manifestPath, JSON.stringify(updated, null, 2));
          // Refresh agent records against the updated manifest.
          for (const task of result.repairedTasks) {
            try {
              upsertCrewAgent(updated, recordFromTask(updated, task, "scaffold"));
            } catch {
              /* non-critical */
            }
          }
          repaired++;
        }
        // Run judged alive (or undecided and untouched): keep the workspace.
        if (
          result.verdict === "healthy" ||
          (result.verdict === "no_status" && !result.repaired)
        ) {
          stillActive = true;
        }
      } catch {
        /* skip corrupt manifests */
      }
    }
  } catch {
    /* skip unreadable dirs */
  }
  return { repaired, stillActive };
}

/** True when any readable manifest under `stateRunsDir` still has status "running". */
function workspaceHasRunningManifest(stateRunsDir: string): boolean {
  if (!fs.existsSync(stateRunsDir)) return false;
  try {
    for (const runDir of fs.readdirSync(stateRunsDir)) {
      const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
      if (!fs.existsSync(manifestPath)) continue;
      try {
        const manifest: TeamRunManifest = JSON.parse(
          fs.readFileSync(manifestPath, "utf-8"),
        );
        if (manifest.status === "running") return true;
      } catch {
        /* skip corrupt */
      }
    }
  } catch {
    /* skip unreadable */
  }
  return false;
}

/** Delete `workspaceDir` if its mtime exceeds the age threshold; report whether it was deleted. */
function removeWorkspaceIfExpired(workspaceDir: string, now: number): boolean {
  try {
    const dirAge = now - fs.statSync(workspaceDir).mtimeMs;
    if (dirAge <= ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS) return false;
    fs.rmSync(workspaceDir, { recursive: true, force: true });
    return true;
  } catch {
    /* skip if stat or rm fails */
    return false;
  }
}
522
+
523
/**
 * Resolve the OS temp directory.
 *
 * @returns The path reported by `os.tmpdir()` when it exists on disk,
 *   otherwise `undefined` (also when resolving it throws).
 */
function getSafeTempDir(): string | undefined {
  try {
    // Resolve once so the existence check and the returned value agree.
    const tmpDir = os.tmpdir();
    if (!fs.existsSync(tmpDir)) return undefined;
    return tmpDir;
  } catch {
    return undefined;
  }
}
@@ -829,8 +829,13 @@ export async function runTeamTask(
829
829
  // _yieldResult: preserved for future use — yield completion contract not yet wired to task.result
830
830
  let _yieldResult: YieldResult | undefined;
831
831
  let noYield = false;
832
+ // Child-process workers do not have a submit_result tool — the yield contract
833
+ // only applies to live-session workers where submit_result is injected by the
834
+ // runtime. Skipping yield detection for child-process prevents every child
835
+ // worker from incorrectly being marked needs_attention.
832
836
  const yieldEnabled =
833
- input.runtimeConfig?.yield?.enabled ?? DEFAULT_YIELD_CONFIG.enabled;
837
+ runtimeKind !== "child-process" &&
838
+ (input.runtimeConfig?.yield?.enabled ?? DEFAULT_YIELD_CONFIG.enabled);
834
839
  if (yieldEnabled && collectedJsonEvents.length > 0) {
835
840
  if (hasYieldInOutput(collectedJsonEvents)) {
836
841
  const yieldEvent = collectedJsonEvents.find((e) =>
@@ -113,6 +113,7 @@ export const PiTeamsReliabilityConfigSchema = Type.Object({
113
113
  }, { additionalProperties: false })),
114
114
  autoRecover: Type.Optional(Type.Boolean()),
115
115
  deadletterThreshold: Type.Optional(Type.Integer({ minimum: 1 })),
116
+ cleanupOrphanedTempDirs: Type.Optional(Type.Boolean()),
116
117
  }, { additionalProperties: false });
117
118
 
118
119
  export const PiTeamsOtlpConfigSchema = Type.Object({