npm - pi-crew - Versions diffs - 0.3.7 → 0.3.9 - Mend

pi-crew 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/CHANGELOG.md +17 -0
package/package.json +1 -1
package/src/agents/discover-agents.ts +354 -15
package/src/config/config.ts +732 -208
package/src/config/types.ts +34 -5
package/src/extension/help.ts +1 -0
package/src/extension/register.ts +1173 -257
package/src/extension/registration/commands.ts +15 -2
package/src/extension/registration/team-tool.ts +1 -1
package/src/extension/session-summary.ts +11 -1
package/src/extension/team-tool/api.ts +4 -1
package/src/extension/team-tool/cache-control.ts +23 -0
package/src/extension/team-tool/cancel.ts +15 -5
package/src/extension/team-tool/context.ts +2 -0
package/src/extension/team-tool/handle-settings.ts +2 -0
package/src/extension/team-tool/health-monitor.ts +563 -0
package/src/extension/team-tool/inspect.ts +10 -3
package/src/extension/team-tool/respond.ts +5 -2
package/src/extension/team-tool/status.ts +4 -1
package/src/extension/team-tool-types.ts +2 -0
package/src/extension/team-tool.ts +901 -177
package/src/runtime/adaptive-plan.ts +1 -1
package/src/runtime/foreground-watchdog.ts +129 -0
package/src/runtime/manifest-cache.ts +4 -2
package/src/runtime/run-tracker.ts +11 -0
package/src/runtime/runtime-policy.ts +15 -2
package/src/runtime/skill-instructions.ts +8 -2
package/src/runtime/stale-reconciler.ts +322 -18
package/src/runtime/task-packet.ts +48 -1
package/src/runtime/task-runner.ts +6 -1
package/src/schema/config-schema.ts +1 -0
package/src/schema/team-tool-schema.ts +204 -76
package/src/state/state-store.ts +9 -1
package/src/teams/discover-teams.ts +2 -1
package/src/ui/run-event-bus.ts +2 -1
package/src/ui/settings-overlay.ts +2 -0
package/src/workflows/discover-workflows.ts +5 -1

package/src/runtime/adaptive-plan.ts CHANGED Viewed

@@ -263,7 +263,7 @@ export interface InjectAdaptivePlanResult {
 export function injectAdaptivePlanIfReady(input: InjectAdaptivePlanInput): InjectAdaptivePlanResult {
 	if (input.workflow.name !== "implementation") return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
 	if (input.tasks.some((task) => task.stepId?.startsWith("adaptive-"))) return { tasks: input.tasks, workflow: reconstructAdaptiveWorkflow(input.workflow, input.tasks), injected: false, missingPlan: false };
-	const completedAssess = input.tasks.find((task) => task.stepId === "assess" && task.status === "completed");
+	const completedAssess = input.tasks.find((task) => task.stepId === "assess" && (task.status === "completed" || task.status === "needs_attention"));
 	if (!completedAssess) return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
 	if (!completedAssess.resultArtifact?.path) {
 		appendEvent(input.manifest.eventsPath, { type: "adaptive.plan_missing", runId: input.manifest.runId, taskId: completedAssess.id, message: "Adaptive planner result artifact is missing." });

package/src/runtime/foreground-watchdog.ts ADDED Viewed

@@ -0,0 +1,129 @@
+/**
+ * Foreground run watchdog — periodically checks that active foreground runs
+ * are making progress and auto-notifies the assistant if a run appears hung.
+ *
+ * Problem: foreground runs are started via startForegroundRun() and keep
+ * executing in the background. The Pi assistant has no way to know when a run
+ * completes or gets stuck without manual polling. This watchdog monitors active runs and:
+ *
+ * 1. Detects hung runs (active status, no heartbeat update for >10 min)
+ * 2. Injects a followUp message via pi.sendUserMessage() so the assistant
+ *    is automatically notified — no manual sleep+check needed.
+ * 3. Cleans up after itself when the run completes or the session ends.
+ */
+import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
+import { loadRunManifestById } from "../state/state-store.ts";
+import { readCrewAgents } from "./crew-agent-records.ts";
+import { isActiveRunStatus, isLikelyOrphanedActiveRun } from "./process-status.ts";
+/** Options accepted by startForegroundWatchdog(). */
+export interface WatchdogOptions {
+	/** Extension API handle; its sendUserMessage() is used to deliver follow-up notifications. */
+	pi: ExtensionAPI;
+	/** Working directory passed to loadRunManifestById() to locate the run. */
+	cwd: string;
+	/** Id of the run to monitor; also the key under which the watchdog timer is tracked. */
+	runId: string;
+	/** Check interval in ms. Default: 5 minutes. */
+	checkIntervalMs?: number;
+	/** Maximum time to monitor in ms. Default: 2 hours. */
+	maxMonitorMs?: number;
+}
+/** Delay between consecutive checks when `checkIntervalMs` is not supplied. */
+const DEFAULT_CHECK_INTERVAL_MS = 300_000; // 5 minutes
+/** Monitoring cut-off applied when `maxMonitorMs` is not supplied. */
+const DEFAULT_MAX_MONITOR_MS = 7_200_000; // 2 hours
+/** Active watchdog timers — keyed by runId for cleanup. */
+const activeWatchdogs = new Map<string, ReturnType<typeof setTimeout>>();
+/**
+ * Cancel the pending check for `runId` and forget its timer.
+ * Does nothing when no watchdog is registered for that run.
+ */
+export function stopWatchdog(runId: string): void {
+	const pending = activeWatchdogs.get(runId);
+	if (!pending) return;
+	clearTimeout(pending);
+	activeWatchdogs.delete(runId);
+}
+/**
+ * Stop all active watchdogs: cancels every pending check and empties the
+ * registry. Intended to be called on session shutdown.
+ */
+export function stopAllWatchdogs(): void {
+	// Only the timer handles are needed here; the run ids are dropped by clear().
+	for (const timer of activeWatchdogs.values()) {
+		clearTimeout(timer);
+	}
+	activeWatchdogs.clear();
+}
+/**
+ * Start a periodic watchdog for a foreground run.
+ *
+ * Every `checkIntervalMs` the run manifest is reloaded and:
+ * - if the run is no longer in an active status, one followUp message with
+ *   the final status is sent and the watchdog stops;
+ * - if isLikelyOrphanedActiveRun() reports the run as hung, a followUp
+ *   message suggesting `cancel` / `doctor` is sent. The warning is sent once
+ *   per hung episode instead of on every tick, so a run that stays hung does
+ *   not flood the conversation; it is re-armed once the run stops looking hung.
+ *
+ * Automatically stops when:
+ * - The run reaches a non-active status
+ * - The run manifest can no longer be found
+ * - The max monitor time is exceeded
+ * - Explicitly stopped via stopWatchdog() / stopAllWatchdogs()
+ *
+ * Calling this again for a run that already has a watchdog is a no-op.
+ * Errors raised while loading state or sending a message are swallowed on
+ * purpose: the watchdog is best-effort and simply retries on the next tick.
+ *
+ * @param opts Run to monitor plus optional interval / max-duration overrides.
+ */
+export function startForegroundWatchdog(opts: WatchdogOptions): void {
+	const { pi, cwd, runId } = opts;
+	// Don't stack watchdogs for the same run
+	if (activeWatchdogs.has(runId)) return;
+	const checkIntervalMs = opts.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS;
+	const maxMonitorMs = opts.maxMonitorMs ?? DEFAULT_MAX_MONITOR_MS;
+	const startTime = Date.now();
+	// True while the current hung episode has already been reported.
+	let hungNotified = false;
+	/** Deliver a followUp message; delivery failures are non-critical. */
+	const notify = (message: string): void => {
+		try {
+			pi.sendUserMessage(message, { deliverAs: "followUp" });
+		} catch {
+			/* non-critical */
+		}
+	};
+	/** Arm the next check and record its timer so stopWatchdog() can cancel it. */
+	const schedule = (): void => {
+		const timer = setTimeout(check, checkIntervalMs);
+		timer.unref(); // Don't prevent process exit
+		activeWatchdogs.set(runId, timer);
+	};
+	/** Inspect the run once. Returns false when monitoring should end. */
+	const inspectRun = (): boolean => {
+		const loaded = loadRunManifestById(cwd, runId);
+		// Run not found — stop watchdog
+		if (!loaded) return false;
+		const { manifest } = loaded;
+		// Non-active status — send completion notification and stop
+		if (!isActiveRunStatus(manifest.status)) {
+			const teamName = manifest.team ?? "unknown";
+			notify(
+				`pi-crew run ${manifest.status}: ${runId} (${teamName}/${manifest.workflow ?? "default"})`,
+			);
+			return false;
+		}
+		const agents = readCrewAgents(manifest);
+		if (!isLikelyOrphanedActiveRun(manifest, agents, Date.now())) {
+			// Run looks healthy again: allow a future hung episode to be reported.
+			hungNotified = false;
+			return true;
+		}
+		if (!hungNotified) {
+			hungNotified = true;
+			const detail = `status=${manifest.status}, updatedAt=${manifest.updatedAt}, agents=${agents.length}`;
+			notify(
+				`pi-crew watchdog: run ${runId} appears hung (${detail}). Consider running team action='cancel' runId='${runId}' or team action='doctor'.`,
+			);
+		}
+		// Don't stop — keep monitoring. The assistant or user may intervene.
+		return true;
+	};
+	function check(): void {
+		// Check if max monitor time exceeded
+		if (Date.now() - startTime > maxMonitorMs) {
+			activeWatchdogs.delete(runId);
+			return;
+		}
+		let keepWatching = true;
+		try {
+			keepWatching = inspectRun();
+		} catch {
+			// Non-critical — skip this check and try again on the next tick
+		}
+		if (keepWatching) {
+			schedule();
+		} else {
+			activeWatchdogs.delete(runId);
+		}
+	}
+	// First check after initial interval
+	schedule();
+}

package/src/runtime/manifest-cache.ts CHANGED Viewed

@@ -108,8 +108,10 @@ function parseManifestIfChanged(root: string, runId: string, filePath: string, p
 function listRunRoots(cwd: string): string[] {
 	const roots = new Set<string>();
-	const base = findRepoRoot(cwd) ? projectCrewRoot(cwd) : userCrewRoot();
-	roots.add(path.join(base, DEFAULT_PATHS.state.runsSubdir));
+	// Always include user-level runs (fast-fix, direct-agent, etc. write here)
+	roots.add(path.join(userCrewRoot(), DEFAULT_PATHS.state.runsSubdir));
+	const projectRoot = findRepoRoot(cwd);
+	if (projectRoot) roots.add(path.join(projectCrewRoot(cwd), DEFAULT_PATHS.state.runsSubdir));
 	return [...roots];
 }

package/src/runtime/run-tracker.ts CHANGED Viewed

@@ -1,4 +1,6 @@
 import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
+import * as fs from "node:fs";
+import * as path from "node:path";
 import { loadRunManifestById } from "../state/state-store.ts";
 import { isFinishedRunStatus } from "./process-status.ts";
@@ -75,6 +77,15 @@ export async function waitForRun(
 	// Slow path: background run — poll with exponential backoff capped at pollIntervalMs
 	let attempt = 0;
 	while (Date.now() < deadline) {
+		if (attempt === 0) {
+			// Early exit: if the run directory doesn't exist, don't waste time polling
+			const runDir = path.join(cwd, ".crew", "state", "runs", runId);
+			if (!fs.existsSync(runDir)) {
+				throw new Error(
+					`Run ${runId} not found. No run directory at ${runDir}`,
+				);
+			}
+		}
 		const fresh = loadRunManifestById(cwd, runId);
 		if (fresh && isFinishedRunStatus(fresh.manifest.status)) {
 			return fresh;

package/src/runtime/runtime-policy.ts CHANGED Viewed

@@ -9,12 +9,25 @@ import { currentCrewDepth } from "./pi-args.ts";
  * - If the role appears in `isolationPolicy.isolatedRoles`, use child-process (crash isolation).
  * - Otherwise, use `isolationPolicy.defaultRuntime` when configured, then fall back to globalKind.
  */
-export function resolveTaskRuntimeKind(globalKind: CrewRuntimeKind, role: string, isolationPolicy: CrewRuntimeConfig["isolationPolicy"], env: NodeJS.ProcessEnv = process.env): CrewRuntimeKind {
+export function resolveTaskRuntimeKind(
+	globalKind: CrewRuntimeKind,
+	role: string,
+	isolationPolicy: CrewRuntimeConfig["isolationPolicy"],
+	env: NodeJS.ProcessEnv = process.env,
+): CrewRuntimeKind {
 	if (globalKind === "scaffold") return "scaffold";
 	// Safety: when already inside a pi-crew worker (depth > 0), never nest live-session.
 	// Live-session creates in-process Pi agent sessions, which would recursively
 	// try to use pi-crew, leading to "Cannot read properties of undefined" errors.
-	if (globalKind === "live-session" && currentCrewDepth(env) > 0) return "child-process";
+	// Exception: when PI_CREW_MOCK_LIVE_SESSION is set, we're in a test harness
+	// that mocks the live-session path — forcing child-process would spawn a real
+	// pi process and hang the test.
+	if (
+		globalKind === "live-session" &&
+		currentCrewDepth(env) > 0 &&
+		env.PI_CREW_MOCK_LIVE_SESSION !== "success"
+	)
+		return "child-process";
 	const isolatedRoles = isolationPolicy?.isolatedRoles ?? [];
 	if (isolatedRoles.includes(role)) return "child-process";
 	return isolationPolicy?.defaultRuntime ?? globalKind;

package/src/runtime/skill-instructions.ts CHANGED Viewed

@@ -91,10 +91,16 @@ export function resolveTaskSkillNames(input: ResolveTaskSkillsInput): string[] {
 	return collectTaskSkillNames(input).slice(0, MAX_SELECTED_SKILLS);
 }
+// ═══════════════════════════════════════════════════════════════════════════
+// SEC-003 Fix: Reverse skill search order (package first, project second)
+// Prevents malicious project skills from overriding trusted package skills.
+// See: SECURITY-ISSUES.md SEC-003
+// ═══════════════════════════════════════════════════════════════════════════
 function candidateSkillDirs(cwd: string): Array<{ root: string; source: "project" | "package" }> {
 	return [
-		{ root: path.resolve(cwd, "skills"), source: "project" },
-		{ root: PACKAGE_SKILLS_DIR, source: "package" },
+		{ root: PACKAGE_SKILLS_DIR, source: "package" },   // ✓ Trusted first
+		{ root: path.resolve(cwd, "skills"), source: "project" },  // ⚠️ Override second
 	];
 }

package/src/runtime/stale-reconciler.ts CHANGED Viewed

@@ -1,8 +1,12 @@
 import * as fs from "node:fs";
+import * as os from "node:os";
 import * as path from "node:path";
 import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
-import { checkProcessLiveness } from "./process-status.ts";
 import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
+import { checkProcessLiveness } from "./process-status.ts";
+/** Age threshold for orphaned temp directory cleanup: 1 hour. */
+const ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS = 60 * 60 * 1000;
 /**
  * Result of reconciling a single stale run.
@@ -10,7 +14,12 @@ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
 export interface ReconcileResult {
 	runId: string;
 	/** What was found and what action was taken */
-	verdict: "healthy" | "result_exists" | "pid_dead" | "pid_alive_stale" | "no_status";
+	verdict:
+		| "healthy"
+		| "result_exists"
+		| "pid_dead"
+		| "pid_alive_stale"
+		| "no_status";
 	/** Whether repair was applied */
 	repaired: boolean;
 	/** Human-readable detail */
@@ -21,6 +30,8 @@ export interface ReconcileResult {
 const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
 const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
+/** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
+const NO_PID_HEARTBEAT_STALE_MS = 5 * 60 * 1000; // 5 minutes — same as heartbeat-gradient deadMs
 /**
  * Phase 1: Check if a result file already exists for the run.
@@ -31,14 +42,28 @@ function checkResultFile(
 	tasks: TeamTaskState[],
 ): { found: boolean; repaired: boolean } {
 	// Check if all tasks already have terminal status (result was written but manifest wasn't updated)
-	const allTerminal = tasks.length > 0 && tasks.every(
-		(t) => t.status === "completed" || t.status === "failed" || t.status === "cancelled" || t.status === "skipped" || t.status === "needs_attention",
-	);
+	const allTerminal =
+		tasks.length > 0 &&
+		tasks.every(
+			(t) =>
+				t.status === "completed" ||
+				t.status === "failed" ||
+				t.status === "cancelled" ||
+				t.status === "skipped" ||
+				t.status === "needs_attention",
+		);
 	if (allTerminal) {
 		// Sync agent records even when tasks are already terminal
 		// (e.g., a previous reconcile fixed tasks but crashed before updating agents)
 		for (const task of tasks) {
-			try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
+			try {
+				upsertCrewAgent(
+					manifest,
+					recordFromTask(manifest, task, "scaffold"),
+				);
+			} catch {
+				/* non-critical */
+			}
 		}
 		return { found: true, repaired: false };
 	}
@@ -52,7 +77,10 @@ function checkResultFile(
  * written, treat the PID as alive even if process.kill returns false
  * (handles SIGKILL race where PID hasn't been recycled yet).
  */
-function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
+function checkPidLiveness(
+	pid: number | undefined,
+	stateRoot?: string,
+): {
 	alive: boolean;
 	detail: string;
 } {
@@ -67,13 +95,18 @@ function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
 		const heartbeatPath = path.join(stateRoot, "heartbeat.json");
 		try {
 			if (fs.existsSync(heartbeatPath)) {
-				const hb = JSON.parse(fs.readFileSync(heartbeatPath, "utf-8")) as { pid?: number; at?: number };
+				const hb = JSON.parse(
+					fs.readFileSync(heartbeatPath, "utf-8"),
+				) as { pid?: number; at?: number };
 				if (hb?.pid === pid && hb?.at) {
 					const ageMs = Date.now() - hb.at;
 					// Heartbeat written < 5 min ago → process was alive recently.
 					// Don't repair yet; let the next reconciliation cycle catch it.
 					if (ageMs < 5 * 60_000) {
-						return { alive: true, detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old` };
+						return {
+							alive: true,
+							detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old`,
+						};
 					}
 				}
 			}
@@ -101,18 +134,76 @@ function evaluateStaleness(
 		return { stale: false, reason: "updated_at_invalid" };
 	}
 	if (now - updatedAt > STALE_ALIVE_PID_MS) {
-		return { stale: true, reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h` };
+		return {
+			stale: true,
+			reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h`,
+		};
 	}
 	return { stale: false, reason: "alive_and_recent" };
 }
 function hasRecentActiveEvidence(tasks: TeamTaskState[], now: number): boolean {
 	return tasks.some((task) => {
-		if (task.status !== "running" && task.status !== "waiting") return false;
-		const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
-		if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS) return true;
-		const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
-		return Number.isFinite(activityAt) && now - activityAt <= ACTIVE_EVIDENCE_TTL_MS;
+		if (task.status !== "running" && task.status !== "waiting")
+			return false;
+		const heartbeatAt = task.heartbeat?.lastSeenAt
+			? new Date(task.heartbeat.lastSeenAt).getTime()
+			: Number.NaN;
+		if (
+			task.heartbeat?.alive !== false &&
+			Number.isFinite(heartbeatAt) &&
+			now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS
+		)
+			return true;
+		const activityAt = task.agentProgress?.lastActivityAt
+			? new Date(task.agentProgress.lastActivityAt).getTime()
+			: Number.NaN;
+		return (
+			Number.isFinite(activityAt) &&
+			now - activityAt <= ACTIVE_EVIDENCE_TTL_MS
+		);
+	});
+}
+/**
+ * No-PID staleness probe: reports whether every task that is still
+ * "running" or "waiting" has been quiet for longer than
+ * NO_PID_HEARTBEAT_STALE_MS.
+ *
+ * A task counts as quiet when it has at least one parseable timestamp
+ * (heartbeat.lastSeenAt or agentProgress.lastActivityAt) and none of the
+ * parseable ones lies within the threshold. A task with no parseable
+ * timestamp at all is treated as NOT stale (it may not have reported yet),
+ * and an empty set of running/waiting tasks yields false.
+ */
+function allRunningTasksHeartbeatStale(
+	tasks: TeamTaskState[],
+	now: number,
+): boolean {
+	const inFlight = tasks.filter(
+		(candidate) =>
+			candidate.status === "running" || candidate.status === "waiting",
+	);
+	if (inFlight.length === 0) return false;
+	for (const task of inFlight) {
+		const stamps = [
+			task.heartbeat?.lastSeenAt
+				? new Date(task.heartbeat.lastSeenAt).getTime()
+				: Number.NaN,
+			task.agentProgress?.lastActivityAt
+				? new Date(task.agentProgress.lastActivityAt).getTime()
+				: Number.NaN,
+		].filter((ms) => Number.isFinite(ms));
+		// Nothing to judge by — assume the task is not stale.
+		if (stamps.length === 0) return false;
+		// Any signal inside the window means the task is still alive.
+		if (stamps.some((ms) => now - ms <= NO_PID_HEARTBEAT_STALE_MS))
+			return false;
+	}
+	// Every in-flight task had at least one signal and all of them were old.
+	return true;
+}
@@ -126,7 +217,11 @@ function repairStaleRun(
 ): TeamTaskState[] {
 	const now = new Date().toISOString();
 	const repairedTasks = tasks.map((task) => {
-		if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
+		if (
+			task.status === "running" ||
+			task.status === "queued" ||
+			task.status === "waiting"
+		) {
 			return {
 				...task,
 				status: "cancelled" as const,
@@ -138,7 +233,14 @@ function repairStaleRun(
 	});
 	// Update agent records so widget sees cancelled status immediately
 	for (const task of repairedTasks) {
-		try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
+		try {
+			upsertCrewAgent(
+				manifest,
+				recordFromTask(manifest, task, "scaffold"),
+			);
+		} catch {
+			/* non-critical */
+		}
 	}
 	return repairedTasks;
 }
@@ -183,8 +285,31 @@ export function reconcileStaleRun(
 				detail: "No PID recorded, but recent task heartbeat/progress exists; not repairing",
 			};
 		}
+		// No PID and no recent activity. If ALL running tasks have stale heartbeats
+		// (beyond NO_PID_HEARTBEAT_STALE_MS = 5min), repair immediately — the worker
+		// process is dead but we have no PID to check. This handles /tmp/ live-session
+		// workspaces where agents exit without calling submit_result.
+		if (allRunningTasksHeartbeatStale(tasks, now)) {
+			const repaired = repairStaleRun(
+				manifest,
+				tasks,
+				"no_pid_heartbeat_stale",
+			);
+			return {
+				runId,
+				verdict: "no_status",
+				repaired: true,
+				detail: `No PID; all running task heartbeats stale >${Math.round(NO_PID_HEARTBEAT_STALE_MS / 60_000)}min; repaired ${repaired.filter((t) => t.status === "cancelled").length} tasks`,
+				repairedTasks: repaired,
+			};
+		}
+		// Fall through: no recent activity but not all tasks stale enough yet.
+		// Check the longer STALE_ALIVE_PID_MS threshold for very old runs.
 		const updatedAt = new Date(manifest.updatedAt).getTime();
-		if (Number.isFinite(updatedAt) && now - updatedAt > STALE_ALIVE_PID_MS) {
+		if (
+			Number.isFinite(updatedAt) &&
+			now - updatedAt > STALE_ALIVE_PID_MS
+		) {
 			const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
 			return {
 				runId,
@@ -223,3 +348,182 @@ export function reconcileStaleRun(
 		repairedTasks: repaired,
 	};
 }
+/**
+ * Result of orphaned temp workspace reconciliation.
+ */
+export interface OrphanReconcileResult {
+	/** Number of runs repaired (manifests rewritten with status "cancelled"). */
+	repaired: number;
+	/** Number of pi-crew-* workspace directories removed from the temp directory. */
+	cleanedDirs: number;
+}
+/**
+ * Reconcile every run stored under one workspace's `.crew/state/runs` dir.
+ * Returns how many runs were repaired and whether any run must keep the
+ * workspace alive. Unreadable directories and corrupt files are skipped.
+ */
+function reconcileWorkspaceRuns(
+	stateRunsDir: string,
+	now: number,
+): { repaired: number; hasRunning: boolean } {
+	let repaired = 0;
+	let hasRunning = false;
+	let runDirs: string[];
+	try {
+		runDirs = fs.readdirSync(stateRunsDir);
+	} catch {
+		/* skip unreadable dirs */
+		return { repaired, hasRunning };
+	}
+	for (const runDir of runDirs) {
+		const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
+		const tasksPath = path.join(stateRunsDir, runDir, "tasks.json");
+		if (!fs.existsSync(manifestPath) || !fs.existsSync(tasksPath)) continue;
+		try {
+			const manifest: TeamRunManifest = JSON.parse(
+				fs.readFileSync(manifestPath, "utf-8"),
+			);
+			if (manifest.status !== "running") continue;
+			const tasks: TeamTaskState[] = JSON.parse(
+				fs.readFileSync(tasksPath, "utf-8"),
+			);
+			const result = reconcileStaleRun(manifest, tasks, now);
+			if (result.repaired && result.repairedTasks) {
+				// Persist repaired tasks first, then flip the manifest to cancelled.
+				fs.writeFileSync(
+					tasksPath,
+					JSON.stringify(result.repairedTasks, null, 2),
+				);
+				const updated = {
+					...manifest,
+					status: "cancelled" as const,
+					updatedAt: new Date(now).toISOString(),
+					summary: `Stale run reconciled: ${result.detail}`,
+				};
+				fs.writeFileSync(manifestPath, JSON.stringify(updated, null, 2));
+				// Update agent records
+				for (const task of result.repairedTasks) {
+					try {
+						upsertCrewAgent(
+							updated,
+							recordFromTask(updated, task, "scaffold"),
+						);
+					} catch {
+						/* non-critical */
+					}
+				}
+				// Counted only after both files were written successfully.
+				repaired++;
+			}
+			// If still running after reconciliation attempt, preserve the dir
+			if (
+				result.verdict === "healthy" ||
+				(result.verdict === "no_status" && !result.repaired)
+			) {
+				hasRunning = true;
+			}
+		} catch {
+			/* skip corrupt manifests */
+		}
+	}
+	return { repaired, hasRunning };
+}
+/**
+ * Re-read the manifests on disk and report whether any still has status
+ * "running". Unreadable directories and corrupt manifests count as not running.
+ */
+function hasRunningManifest(stateRunsDir: string): boolean {
+	let runDirs: string[];
+	try {
+		runDirs = fs.readdirSync(stateRunsDir);
+	} catch {
+		/* skip unreadable */
+		return false;
+	}
+	for (const runDir of runDirs) {
+		const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
+		if (!fs.existsSync(manifestPath)) continue;
+		try {
+			const manifest: TeamRunManifest = JSON.parse(
+				fs.readFileSync(manifestPath, "utf-8"),
+			);
+			if (manifest.status === "running") return true;
+		} catch {
+			/* skip corrupt */
+		}
+	}
+	return false;
+}
+/**
+ * Delete `workspaceDir` when its mtime is older than
+ * ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS. Returns true only when it was removed;
+ * stat or removal failures are swallowed.
+ */
+function removeWorkspaceIfExpired(workspaceDir: string, now: number): boolean {
+	try {
+		const dirAge = now - fs.statSync(workspaceDir).mtimeMs;
+		// Negated `>` keeps a NaN age on the "do not delete" side.
+		if (!(dirAge > ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS)) return false;
+		fs.rmSync(workspaceDir, { recursive: true, force: true });
+		return true;
+	} catch {
+		/* skip if stat or rm fails */
+		return false;
+	}
+}
+/**
+ * Scan os.tmpdir() for pi-crew-* workspaces and reconcile any stale runs
+ * found in their `.crew/state/runs` directories.
+ *
+ * When `cleanupOrphanedTempDirs` is not explicitly set to `false`, a
+ * workspace whose directory mtime is older than 1 hour and which has no
+ * manifest left in status "running" is deleted after its runs are
+ * reconciled. When cleanup is disabled, runs are still reconciled but no
+ * directory is inspected for removal.
+ *
+ * NOTE(review): only manifests whose status is exactly "running" are
+ * reconciled or protect a workspace from deletion — confirm that no other
+ * in-progress status can appear in these workspaces.
+ *
+ * @param now Reference time in ms since the epoch; defaults to Date.now().
+ * @param options Set `cleanupOrphanedTempDirs: false` to keep all directories.
+ * @returns Number of runs repaired and directories cleaned.
+ */
+export function reconcileOrphanedTempWorkspaces(
+	now = Date.now(),
+	options?: { cleanupOrphanedTempDirs?: boolean },
+): OrphanReconcileResult {
+	const tmpDir = getSafeTempDir();
+	if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
+	const cleanupEnabled = options?.cleanupOrphanedTempDirs !== false;
+	let repaired = 0;
+	let cleanedDirs = 0;
+	try {
+		const entries = fs.readdirSync(tmpDir, { withFileTypes: true });
+		for (const entry of entries) {
+			if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
+				continue;
+			const workspaceDir = path.join(tmpDir, entry.name);
+			const stateRunsDir = path.join(workspaceDir, ".crew", "state", "runs");
+			if (!fs.existsSync(stateRunsDir)) continue;
+			const outcome = reconcileWorkspaceRuns(stateRunsDir, now);
+			repaired += outcome.repaired;
+			// The remaining work only decides whether to delete the directory.
+			if (!cleanupEnabled || outcome.hasRunning) continue;
+			// Re-scan manifests to confirm no running runs remain
+			// (e.g. a run that was left untouched by this pass).
+			if (hasRunningManifest(stateRunsDir)) continue;
+			if (removeWorkspaceIfExpired(workspaceDir, now)) cleanedDirs++;
+		}
+	} catch {
+		/* skip if tmpdir unreadable */
+	}
+	return { repaired, cleanedDirs };
+}
+/**
+ * Resolve the OS temp directory, or undefined when it does not exist on
+ * disk or the lookup throws.
+ */
+function getSafeTempDir(): string | undefined {
+	try {
+		const candidate = os.tmpdir();
+		if (fs.existsSync(candidate)) return candidate;
+	} catch {
+		/* fall through to undefined */
+	}
+	return undefined;
+}