npm - pi-crew - Versions diffs - 0.7.5 → 0.7.7 - Mend

pi-crew 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/CHANGELOG.md +71 -0
package/README.md +11 -11
package/docs/commands-reference.md +14 -10
package/docs/troubleshooting.md +131 -0
package/docs/usage.md +9 -4
package/package.json +1 -1
package/src/config/config.ts +11 -4
package/src/extension/action-suggestions.ts +71 -0
package/src/extension/context-status-injection.ts +32 -1
package/src/extension/register.ts +71 -65
package/src/extension/team-tool/api.ts +3 -2
package/src/extension/team-tool/cancel.ts +5 -4
package/src/extension/team-tool/explain.ts +2 -1
package/src/extension/team-tool/failure-patterns.ts +124 -0
package/src/extension/team-tool/inspect.ts +10 -6
package/src/extension/team-tool/lifecycle-actions.ts +5 -4
package/src/extension/team-tool/respond.ts +4 -3
package/src/extension/team-tool/run-not-found.ts +54 -0
package/src/extension/team-tool/run.ts +26 -4
package/src/extension/team-tool/status.ts +58 -4
package/src/extension/team-tool.ts +5 -3
package/src/runtime/async-runner.ts +7 -0
package/src/runtime/background-runner.ts +7 -1
package/src/runtime/chain-parser.ts +13 -5
package/src/runtime/checkpoint.ts +13 -1
package/src/runtime/child-pi.ts +9 -1
package/src/runtime/crash-recovery.ts +21 -1
package/src/runtime/live-session-runtime.ts +15 -1
package/src/runtime/parent-guard.ts +2 -2
package/src/runtime/pi-spawn.ts +66 -0
package/src/runtime/stale-reconciler.ts +38 -3
package/src/runtime/task-runner.ts +10 -1
package/src/runtime/team-runner.ts +19 -2
package/src/runtime/verification-gates.ts +21 -1
package/src/schema/team-tool-schema.ts +9 -0
package/src/state/blob-store.ts +12 -10
package/src/state/event-log-rotation.ts +114 -93
package/src/state/event-log.ts +79 -20
package/src/state/health-store.ts +6 -1
package/src/state/locks.ts +66 -16
package/src/state/state-store.ts +14 -1
package/src/ui/card-colors.ts +7 -3
package/src/ui/dashboard-panes/agents-pane.ts +15 -2
package/src/ui/live-duration.ts +58 -0
package/src/ui/tool-render.ts +7 -11
package/src/ui/tool-renderers/index.ts +6 -3
package/src/ui/widget/widget-formatters.ts +2 -13
package/src/utils/fs-watch.ts +11 -60
package/src/utils/run-watcher-registry.ts +164 -0
package/src/workflows/discover-workflows.ts +2 -1
package/src/workflows/workflow-config.ts +5 -0
package/src/runtime/dynamic-script-runner.ts +0 -497
package/src/runtime/sandbox.ts +0 -335

package/src/extension/team-tool/status.ts CHANGED Viewed

@@ -3,24 +3,31 @@ import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
 import { appendEvent, readEvents } from "../../state/event-log.ts";
 import { readDeliveryState, readMailbox } from "../../state/mailbox.ts";
 import { loadRunManifestById, updateRunStatus, saveRunTasks } from "../../state/state-store.ts";
-import { aggregateUsage, formatUsage } from "../../state/usage.ts";
+import { aggregateUsage, formatUsage, formatCost } from "../../state/usage.ts";
 import { applyAttentionState, formatActivityAge, resolveCrewControlConfig } from "../../runtime/agent-control.ts";
 import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
 import { checkProcessLiveness, isActiveRunStatus } from "../../runtime/process-status.ts";
 import { formatTaskGraphLines, waitingReason } from "../../runtime/task-display.ts";
+import { computePhaseProgress } from "../../runtime/phase-progress.ts";
+import { formatDuration } from "../../ui/tool-render.ts";
 import { verifyTaskCompletion } from "../../runtime/completion-guard.ts";
 import { evaluateRunEffectiveness } from "../../runtime/effectiveness.ts";
 import type { PiTeamsToolResult } from "../tool-result.ts";
 import { locateRunCwd } from "../team-tool.ts";
 import { result, type TeamContext } from "./context.ts";
+import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
 export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
 	if (!params.runId) return result("Status requires runId.", { action: "status", status: "error" }, true);
 	const runCwd = locateRunCwd(params.runId, ctx.cwd);
-	if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
+	if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "status", status: "error" }, true);
 	const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
-	if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
+	if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "status", status: "error" }, true);
 	let { manifest, tasks } = loaded;
+	// DX (Round 16 F3): compact status mode. Default = full (backward compatible).
+	// details=false gives a tight summary (status, goal, counts, failed/attention
+	// errors) for quick checks without 40 lines of dense key=value noise.
+	const fullDetails = params.details !== false;
 	let asyncLivenessLine: string | undefined;
 	if (manifest.async) {
 		const asyncState = manifest.async;
@@ -35,6 +42,7 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
 	}
 	const counts = new Map<string, number>();
 	for (const task of tasks) counts.set(task.status, (counts.get(task.status) ?? 0) + 1);
+	const phaseProgress = computePhaseProgress(tasks);
 	const allEvents = readEvents(manifest.eventsPath);
 	const events = allEvents.slice(-8);
 	const attentionByTask = new Map(allEvents.filter((event) => event.type === "task.attention" && event.taskId).map((event) => [event.taskId!, event]));
@@ -62,12 +70,13 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
 	const activeAgents = crewAgents.filter((agent) => agent.status === "running");
 	const completedAgents = crewAgents.filter((agent) => agent.status !== "running");
 	const waitingTasks = tasks.filter((task) => task.status === "queued" || task.status === "waiting");
-	const agentLine = (agent: typeof crewAgents[number]): string => `- ${agent.id} [${agent.status}] ${agent.role} -> ${agent.agent} runtime=${agent.runtime}${agent.model ? ` model=${agent.model}` : ""}${agent.usage ? ` usage=${formatUsage(agent.usage)}` : ""}${agent.progress?.activityState ? ` activityState=${agent.progress.activityState}` : ""}${formatActivityAge(agent) ? ` activity=${formatActivityAge(agent)}` : ""}${agent.progress?.currentTool ? ` tool=${agent.progress.currentTool}` : ""}${agent.toolUses ? ` tools=${agent.toolUses}` : ""}${!agent.usage && agent.progress?.tokens ? ` tokens=${agent.progress.tokens}` : ""}${agent.progress?.turns ? ` turns=${agent.progress.turns}` : ""}${agent.jsonEvents !== undefined ? ` jsonEvents=${agent.jsonEvents}` : ""}${agent.outputPath ? ` output=${agent.outputPath}` : ""}${agent.transcriptPath ? ` transcript=${agent.transcriptPath}` : ""}${agent.statusPath ? ` status=${agent.statusPath}` : ""}${agent.error ? ` error=${agent.error}` : ""}`;
+	const agentLine = (agent: typeof crewAgents[number]): string => `- ${agent.id} [${agent.status}] ${agent.role} -> ${agent.agent} runtime=${agent.runtime}${agent.model ? ` model=${agent.model}` : ""}${agent.usage ? ` usage=${formatUsage(agent.usage)}` : ""}${agent.usage?.cost ? ` cost=${formatCost(agent.usage.cost)}` : ""}${agent.progress?.activityState ? ` activityState=${agent.progress.activityState}` : ""}${formatActivityAge(agent) ? ` activity=${formatActivityAge(agent)}` : ""}${agent.progress?.currentTool ? ` tool=${agent.progress.currentTool}` : ""}${agent.toolUses ? ` tools=${agent.toolUses}` : ""}${!agent.usage && agent.progress?.tokens ? ` tokens=${agent.progress.tokens}` : ""}${agent.progress?.turns ? ` turns=${agent.progress.turns}` : ""}${agent.jsonEvents !== undefined ? ` jsonEvents=${agent.jsonEvents}` : ""}${agent.outputPath ? ` output=${agent.outputPath}` : ""}${agent.transcriptPath ? ` transcript=${agent.transcriptPath}` : ""}${agent.statusPath ? ` status=${agent.statusPath}` : ""}${agent.error ? ` error=${agent.error}` : ""}`;
 	const lines = [
 		`Run: ${manifest.runId}`,
 		`Team: ${manifest.team}`,
 		`Workflow: ${manifest.workflow ?? "(none)"}`,
 		`Status: ${manifest.status}`,
+		`Progress: ${phaseProgress.overallPercentage}% (~${formatDuration(phaseProgress.estimatedRemainingMs)} remaining)`,
 		`Workspace mode: ${manifest.workspaceMode}`,
 		...(manifest.runtimeResolution ? [`Runtime: ${manifest.runtimeResolution.kind}`, `Runtime safety: ${manifest.runtimeResolution.safety}`, `Runtime requested: ${manifest.runtimeResolution.requestedMode}${manifest.runtimeResolution.reason ? ` (${manifest.runtimeResolution.reason})` : ""}`] : []),
 		`Goal: ${manifest.goal}`,
@@ -109,5 +118,50 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
 		"Recent events:",
 		...(events.length ? events.map((event) => `- ${event.time} ${event.type}${event.taskId ? ` ${event.taskId}` : ""}${event.message ? `: ${event.message}` : ""}`) : ["- (none)"]),
 	];
+	if (!fullDetails) {
+		return result(
+			buildCompactStatus(manifest, tasks, counts, asyncLivenessLine, phaseProgress).join("\n"),
+			{ action: "status", status: "ok", runId: manifest.runId, artifactsRoot: manifest.artifactsRoot, intent: `status ${manifest.runId}: ${manifest.status} (compact)` },
+		);
+	}
 	return result(lines.join("\n"), { action: "status", status: "ok", runId: manifest.runId, artifactsRoot: manifest.artifactsRoot, intent: `status ${manifest.runId}: ${manifest.status}` });
 }
+/**
+ * Compact status builder (DX: Round 16 F3). A tight summary for quick checks:
+ * identity, status, optional progress, goal, optional async liveness line,
+ * task counts, and an "Issues:" list restricted to failed / needs_attention /
+ * cancelled tasks — not the 40-line dense dump. Invoked when
+ * params.details === false.
+ *
+ * @param manifest - Run identity fields. `workflow`, when set, is appended to
+ *   the Team line in parentheses. `workspaceMode` is accepted but not rendered.
+ * @param tasks - Tasks scanned for the Issues section. `agent` is accepted but
+ *   not rendered.
+ * @param counts - Per-status task counts, rendered as `status=count` pairs in
+ *   map iteration order; an empty map renders as "none".
+ * @param asyncLivenessLine - Pre-formatted line inserted verbatim after the
+ *   Goal line when non-empty.
+ * @param progress - When provided, adds a "Progress:" line with the overall
+ *   percentage and the remaining time formatted by formatDuration.
+ * @returns The output lines; the last line is always the details=true tip.
+ *
+ * Exported for unit testing.
+ */
+export function buildCompactStatus(
+	manifest: { runId: string; team: string; workflow?: string; status: string; goal: string; workspaceMode?: string },
+	tasks: Array<{ id: string; status: string; role: string; agent: string; error?: string }>,
+	counts: Map<string, number>,
+	asyncLivenessLine?: string,
+	progress?: { overallPercentage: number; estimatedRemainingMs: number },
+): string[] {
+	const failedOrAttention = tasks.filter(
+		(t) =>
+			t.status === "failed" ||
+			t.status === "needs_attention" ||
+			t.status === "cancelled", // cancelled tasks are reported under "Issues:" as well
+	);
+	const lines = [
+		`Run: ${manifest.runId}`,
+		`Team: ${manifest.team}${manifest.workflow ? ` (${manifest.workflow})` : ""}`,
+		`Status: ${manifest.status}`,
+		...(progress ? [`Progress: ${progress.overallPercentage}% (~${formatDuration(progress.estimatedRemainingMs)} remaining)`] : []),
+		`Goal: ${manifest.goal}`,
+		...(asyncLivenessLine ? [asyncLivenessLine] : []),
+		`Tasks: ${[...counts.entries()].map(([s, c]) => `${s}=${c}`).join(", ") || "none"}`, // empty map joins to "" and falls back to "none"
+	];
+	if (failedOrAttention.length > 0) {
+		lines.push("Issues:");
+		for (const t of failedOrAttention) {
+			lines.push(`- ${t.id} [${t.status}] ${t.role}: ${t.error ?? "(no error detail)"}`); // placeholder when the task carries no error text
+		}
+	}
+	lines.push("Tip: pass details=true for full output (task graph, agents, effectiveness, events).");
+	return lines;
+}

package/src/extension/team-tool.ts CHANGED Viewed

@@ -156,6 +156,8 @@ import { handleParallel } from "./team-tool/parallel-dispatch.ts";
 import { handlePlan } from "./team-tool/plan.ts";
 import { handleRespond } from "./team-tool/respond.ts";
 import { handleStatus } from "./team-tool/status.ts";
+import { RUN_NOT_FOUND_HINT } from "./team-tool/run-not-found.ts";
+import { formatActionSuggestion } from "./action-suggestions.ts";
 export { handleApi } from "./team-tool/api.ts";
 export { handleRetry } from "./team-tool/cancel.ts";
@@ -459,14 +461,14 @@ export async function handleResume(
 	const runCwd = locateRunCwd(params.runId, ctx.cwd);
 	if (!runCwd)
 		return result(
-			`Run '${params.runId}' not found.`,
+			`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`,
 			{ action: "resume", status: "error" },
 			true,
 		);
 	const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
 	if (!loaded)
 		return result(
-			`Run '${params.runId}' not found.`,
+			`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`,
 			{ action: "resume", status: "error" },
 			true,
 		);
@@ -1347,7 +1349,7 @@ export async function handleTeamTool(
 		}
 		default:
 			return result(
-				`Unknown action: ${action}`,
+				`Unknown action: ${action}${formatActionSuggestion(String(action))}`,
 				{ action: "unknown", status: "error" },
 				true,
 			);

package/src/runtime/async-runner.ts CHANGED Viewed

@@ -231,6 +231,13 @@ export async function spawnBackgroundTeamRun(manifest: TeamRunManifest): Promise
 		windowsHide: true,
 	} as unknown as Parameters<typeof spawn>[2];
 	const child = spawn(process.execPath, command.args, spawnOpts);
+	// Round 27 (BUG 3): the piped stdout/stderr are NEVER read or destroyed →
+	// 2 FDs leak per background spawn, and if the child writes >64KB (pipe
+	// buffer) it blocks forever (nobody drains the pipe) → background runner
+	// hangs. The background runner redirects its own console to a file, so we
+	// don't need this output — destroy the read ends immediately.
+	child.stdout?.destroy();
+	child.stderr?.destroy();
 	child.on("error", (error: Error) => {
 		logInternalError("async-runner.spawn", error, `pid=${child.pid ?? "unknown"}`);
 	});

package/src/runtime/background-runner.ts CHANGED Viewed

@@ -525,7 +525,13 @@ async function main(): Promise<void> {
 		const agents = allAgents(discoverAgents(cwd));
 		debugLog(`[background-runner] discoverAgents done, ${agents.length} agents`,
 		);
-		try { fs.fsyncSync(fs.openSync(manifest.eventsPath, "a")); } catch { /* best-effort */ } // FORCE flush so we see this before death
+		// Round 27 (BUG 2): openSync returned an fd that was never closed → FD
+		// leak per background runner startup. Close it in a finally (matches the
+		// canonical pattern in checkpoint.ts:83 and event-log.ts:582).
+		try {
+			const fd = fs.openSync(manifest.eventsPath, "a");
+			try { fs.fsyncSync(fd); } finally { try { fs.closeSync(fd); } catch { /* best-effort */ } }
+		} catch { /* best-effort */ } // FORCE flush so we see this before death
 		debugLog(`[background-runner] calling directTeamAndWorkflowFromRun`,
 		);
 		const direct = directTeamAndWorkflowFromRun(manifest, tasks, agents);

package/src/runtime/chain-parser.ts CHANGED Viewed

@@ -122,10 +122,10 @@ class ChainParser {
 	parse(): ChainStep[] {
 		const steps: ChainStep[] = [];
-		steps.push(this.parseStep());
+		steps.push(this.parseStep(0));
 		while (this.peek("ARROW")) {
 			this.consume("ARROW");
-			steps.push(this.parseStep());
+			steps.push(this.parseStep(0));
 		}
 		if (this.pos < this.tokens.length) {
 			throw new Error(`Unexpected token '${this.tokens[this.pos]?.value}' at position ${this.pos}`);
@@ -133,16 +133,24 @@ class ChainParser {
 		return steps;
 	}
-	private parseStep(): ChainStep {
+	private parseStep(depth: number = 0): ChainStep {
+		// Round 22 (BUG 2): guard against stack overflow on deeply nested input.
+		// Without this, a crafted 'parallel(parallel(parallel(...)))' input would
+		// recurse unbounded and crash the process with RangeError. Each nesting
+		// level needs >=9 chars, so ~130KB could overflow V8's ~15K-frame stack.
+		const MAX_CHAIN_NESTING = 100;
+		if (depth > MAX_CHAIN_NESTING) {
+			throw new Error(`Chain DSL nesting too deep (max ${MAX_CHAIN_NESTING}); likely unbalanced or malicious input`);
+		}
 		// Check for parallel(...) construct
 		if (this.peek("NAME", "parallel")) {
 			this.consume("NAME"); // eat "parallel"
 			this.consume("LPAREN");
 			const parallel: ChainStep[] = [];
-			parallel.push(this.parseStep());
+			parallel.push(this.parseStep(depth + 1));
 			while (this.peek("COMMA")) {
 				this.consume("COMMA");
-				parallel.push(this.parseStep());
+				parallel.push(this.parseStep(depth + 1));
 			}
 			this.consume("RPAREN");
 			const step: ChainStep = { name: "parallel", parallel };

package/src/runtime/checkpoint.ts CHANGED Viewed

@@ -64,7 +64,19 @@ export class FileCheckpointStore implements CheckpointStore {
 		// Atomic write: write to temp file first, then rename, then fsync parent.
 		// This guarantees either the old file or the new file, never a partial
 		// write, even on network filesystems or certain journal modes.
-		const tmp = path.join(this.checkpointDir(), ".tmp.checkpoint");
+		//
+		// Round 22 (BUG 1): the temp filename MUST be unique per save call.
+		// Previously a fixed '.tmp.checkpoint' was shared across ALL concurrent
+		// saves; pi-crew's multi-process architecture (main + detached background
+		// workers each checkpointing their own tasks) made this realistic: two
+		// processes writing '.tmp.checkpoint' at once → one's rename picks up the
+		// other's data (silent corruption) and the second rename hits ENOENT
+		// (silent data loss). Including taskId + pid + timestamp + a short random
+		// suffix keeps the temp name unique across processes and across tasks.
+		const tmp = path.join(
+			this.checkpointDir(),
+			`.tmp.${checkpoint.taskId}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}`,
+		);
 		fs.writeFileSync(tmp, JSON.stringify(checkpoint, null, 2), "utf-8");
 		fs.renameSync(tmp, p);
 		// fsync parent directory to ensure the rename is durable

package/src/runtime/child-pi.ts CHANGED Viewed

@@ -628,7 +628,14 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
 			let graceTurns = input.graceTurns;
 			if (graceTurns !== undefined && graceTurns > 1000) graceTurns = 1000;
 			let abortDueToParentSignal = false;
-			input.signal?.addEventListener("abort", () => { abortDueToParentSignal = true; }, { once: true });
+			// Round 27 (BUG 4): extract to a named handler so settle() can remove it.
+			// The previous anonymous listener was never removed → on runs with >10
+			// tasks sharing one AbortSignal (background-runner), Node emitted
+			// MaxListenersExceededWarning and each leaked listener pinned the task's
+			// stack frame (abortDueToParentSignal closure) in memory. { once: true }
+			// only auto-removes AFTER the signal fires; on normal completion it leaks.
+			const onParentAbort = (): void => { abortDueToParentSignal = true; };
+			input.signal?.addEventListener("abort", onParentAbort, { once: true });
 			const restartNoResponseTimer = (): void => {
 				if (responseTimeoutMs <= 0) return;
 				if (noResponseTimer) clearTimeout(noResponseTimer);
@@ -747,6 +754,7 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
 				clearChildPiTimeouts();
 				lineObserver.flush();
 				input.signal?.removeEventListener("abort", abort);
+				input.signal?.removeEventListener("abort", onParentAbort);
 				try {
 					cleanupTempDir(built.tempDir);
 				} catch (error) {

package/src/runtime/crash-recovery.ts CHANGED Viewed

@@ -9,7 +9,7 @@ import type { TeamTaskState } from "../state/types.ts";
 import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
 import type { ManifestCache } from "./manifest-cache.ts";
 import { checkProcessLiveness } from "./process-status.ts";
-import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
+import { isPlanApprovalPending, reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
 import { executeHook, appendHookEvent } from "../hooks/registry.ts";
 import { unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
 import { resolveRealContainedPath } from "../utils/safe-paths.ts";
@@ -38,6 +38,8 @@ export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache,
 	const plans: RecoveryPlan[] = [];
 	for (const manifest of manifestCache.list(50)) {
 		if (manifest.status !== "running" && manifest.status !== "blocked") continue;
+		// Preserve runs intentionally blocked on plan approval — not crashes.
+		if (isPlanApprovalPending(manifest)) continue;
 		if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
 		// NOTE: no withRunLock — best-effort only; concurrent writes may cause inconsistency
 		const loaded = loadRunManifestById(cwd, manifest.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
@@ -107,6 +109,12 @@ export function cancelOrphanedRuns(
 	// Phase 1: Scan project-level manifests via manifestCache
 	for (const manifest of manifestCache.list(50)) {
 		if (manifest.status !== "running" && manifest.status !== "blocked") continue;
+		// Preserve plan-approval-blocked runs — they belong to their owner and are
+		// waiting on a human decision, not orphaned by a dead owner process.
+		if (isPlanApprovalPending(manifest)) {
+			skipped.push(manifest.runId);
+			continue;
+		}
 		// Only consider runs owned by a different session
 		const ownerId = manifest.ownerSessionId;
@@ -340,6 +348,18 @@ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache,
 			// Re-read inside lock to get freshest data
 			const fresh = loadRunManifestById(cwd, runId); // NOTE: inside withRunLockSync - consistent read
 			if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
+			// Belt-and-suspenders: reconcileStaleRun itself guards this, but the run
+			// may have flipped to blocked+plan-approval between cache-list and lock
+			// acquisition — re-check the freshest manifest under the lock.
+			if (isPlanApprovalPending(fresh.manifest)) {
+				results.push({
+					runId,
+					verdict: "blocked_awaiting_approval",
+					repaired: false,
+					detail: "Plan approval is pending; stale reconciliation skipped",
+				});
+				return;
+			}
 			const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
 			if (result.repaired || result.verdict === "result_exists") {
 				if (result.repairedTasks) {

package/src/runtime/live-session-runtime.ts CHANGED Viewed

@@ -384,6 +384,12 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
 	const agentId = `${input.manifest.runId}:${input.task.id}`;
+	// Round 27 (BUG 4): hoisted to function scope so the finally block can remove
+	// it. const inside try{} is block-scoped and invisible to finally{}. The
+	// handler resolves `session` lazily at call time (it may be assigned later
+	// inside the try), so declaring it here is safe.
+	let onSignalAbort: (() => void) | undefined;
 	try {
 		const agentDir = typeof mod.getAgentDir === "function" ? mod.getAgentDir() : undefined;
 		let resourceLoader: unknown;
@@ -545,9 +551,14 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
 				}
 			});
 		}
+		// Round 27 (BUG 4): named abort handler (removed in finally below).
+		onSignalAbort = (): void => { void session?.abort?.(); };
 		if (input.signal) {
 			if (input.signal.aborted) await session.abort?.();
-			else input.signal.addEventListener("abort", () => { void session?.abort?.(); }, { once: true });
+			// Round 27 (BUG 4): named handler so the finally block can remove it.
+			// The previous anonymous listener leaked on normal completion (only
+			// auto-removed by { once: true } AFTER the signal fires).
+			else input.signal.addEventListener("abort", onSignalAbort, { once: true });
 		}
 		const effectivePrompt = input.runtimeConfig?.inheritContext === true && input.parentContext ? `${input.parentContext}\n\n---\n# Live Subagent Task\n${input.prompt}` : input.prompt;
@@ -687,6 +698,9 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
 		// H6: Unsubscribe listeners FIRST before clearing timer to prevent race
 		unsubscribe?.();
 		unsubscribeControlRealtime?.();
+		// Round 27 (BUG 4): remove the named abort listener to avoid leaking it
+		// on the shared AbortSignal across many live-session tasks.
+		if (onSignalAbort) input.signal?.removeEventListener("abort", onSignalAbort);
 		if (controlTimer) clearInterval(controlTimer);
 		streamOut?.close();
 		if (input.signal?.aborted) {

package/src/runtime/parent-guard.ts CHANGED Viewed

@@ -29,8 +29,8 @@
  * signal, NOT a security boundary:
  *   - It only causes the (already-compromised) child to exit earlier.
  *   - A truly malicious child can simply not call `startParentGuard()`.
- *   - Real protection against hostile children comes from the sandbox,
- *     env-filter allowlist, and redaction — all enforced before spawn.
+ *   - Real protection against hostile children comes from the env-filter
+ *     allowlist and redaction — all enforced before spawn.
  *
  * The guard exists for the benign case: a parent dies (user closes the
  * terminal, pi crashes, machine loses power) and we want all detached

package/src/runtime/pi-spawn.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import * as fs from "node:fs";
 import * as os from "node:os";
+import { execSync } from "node:child_process";
 import { fileURLToPath } from "node:url";
 import * as path from "node:path";
@@ -118,6 +119,63 @@ function findPiPackageJsonFrom(startDir: string): string | undefined {
 	return undefined;
 }
+/**
+ * Discover the real npm global node_modules directory at runtime.
+ *
+ * Why this exists (Issue #33): on Windows, pi may be installed somewhere
+ * other than %APPDATA%\npm — e.g. nvm-windows puts the global node_modules
+ * under %NVM_HOME%/<version>/node_modules, Volta under
+ * %LOCALAPPDATA%\Volta, fnm under %LOCALAPPDATA%\fnm_multishells. The static
+ * %APPDATA%\npm paths in resolvePiCliScript() miss all of those, and the
+ * fallback spawn("pi") then fails with ENOENT because child_process.spawn does
+ * NOT do PATHEXT resolution on Windows (only exec/execSync via cmd.exe do).
+ *
+ * `npm root -g` is the canonical way to find the global node_modules dir and
+ * works across every npm-based install layout. We run it via execSync, which
+ * DOES resolve `npm.cmd` through PATHEXT. Capped at 5s; any failure (npm not
+ * on PATH, slow start, etc.) just falls through to the other resolution roots.
+ *
+ * Memoized: the npm global root does not change during a process lifetime, so
+ * this is a one-time ~200ms cost rather than per-worker.
+ *
+ * NOTE(review): only a successful lookup is actually memoized. A failed or
+ * empty lookup is written back as `null`, which is the same value that means
+ * "nothing memoized yet", so the next call runs `npm root -g` again (bounded
+ * by the 5s timeout each time). Storing `undefined` for "resolved, not found"
+ * would make the failure case one-time as well — confirm the intended
+ * behavior before changing it.
+ *
+ * @returns The trimmed stdout of `npm root -g`, or undefined when the command
+ *   throws or prints nothing.
+ *
+ * @internal — exported for unit-test injection via __setNpmGlobalRootForTest.
+ */
+let cachedNpmGlobalRoot: string | undefined | null = null; // null = no memoized result
+export function resolveNpmGlobalRoot(): string | undefined {
+	if (cachedNpmGlobalRoot !== null) {
+		return cachedNpmGlobalRoot ?? undefined; // memo hit: skip spawning npm
+	}
+	let resolved: string | undefined;
+	try {
+		const out = execSync("npm root -g", {
+			encoding: "utf-8",
+			timeout: 5000,
+			stdio: ["pipe", "pipe", "pipe"], // suppress npm's stderr chatter
+			windowsHide: true,
+		}).trim();
+		resolved = out.length > 0 ? out : undefined; // blank output counts as "not found"
+	} catch {
+		resolved = undefined; // best-effort: callers fall back to other roots
+	}
+	cachedNpmGlobalRoot = resolved ?? null; // undefined collapses to null, i.e. "not memoized"
+	return resolved;
+}
+/**
+ * Given an npm global node_modules root, derive the candidate package dirs for
+ * each supported pi scope. Pure + exported so the mapping is unit-testable
+ * without spawning npm.
+ *
+ * Each entry of PI_PACKAGE_NAMES is split on "/" and the segments are joined
+ * onto the root with path.join, so the result uses the platform separator.
+ * NOTE(review): PI_PACKAGE_NAMES is defined outside this view — presumably
+ * scoped package names of the form "@scope/name"; confirm at its declaration.
+ *
+ * @param npmGlobalRoot - Global node_modules directory to resolve against.
+ * @returns One candidate directory per PI_PACKAGE_NAMES entry, in the same
+ *   order. Existence on disk is not checked here.
+ * @internal
+ */
+export function buildNpmGlobalPackageDirs(npmGlobalRoot: string): string[] {
+	return PI_PACKAGE_NAMES.map((pkgName) => path.join(npmGlobalRoot, ...pkgName.split("/")));
+}
+/**
+ * @internal — test hook: inject a fake global root (or undefined) and reset the memo.
+ *
+ * A string is stored as the memoized value and returned by later
+ * resolveNpmGlobalRoot() calls. Passing undefined stores `null`, which clears
+ * the memo: the next resolveNpmGlobalRoot() call runs the real lookup again
+ * rather than returning an injected "not found".
+ *
+ * @param root - Fake global root to memoize, or undefined to clear the memo.
+ */
+export function __setNpmGlobalRootForTest(root: string | undefined): void {
+	cachedNpmGlobalRoot = root ?? null;
+}
 function resolvePiCliScript(): string | undefined {
 	const argv1 = process.argv[1];
 	if (argv1) {
@@ -125,8 +183,16 @@ function resolvePiCliScript(): string | undefined {
 		if (isRunnableNodeScript(argvPath)) return argvPath;
 	}
+	// npm-global package dirs derived from `npm root -g` — placed BEFORE the
+	// %APPDATA%\npm static paths and the cwd/import.meta fallbacks so that a pi
+	// install under nvm-windows / Volta / fnm is found even when %APPDATA%\npm
+	// doesn't contain it. Covers Issue #33.
+	const npmGlobalRoot = resolveNpmGlobalRoot();
+	const npmGlobalDirs = npmGlobalRoot ? buildNpmGlobalPackageDirs(npmGlobalRoot) : [];
 	const roots = [
 		resolvePiPackageRoot(),
+		...npmGlobalDirs,
 		process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@earendil-works", "pi-coding-agent") : undefined,
 		process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@mariozechner", "pi-coding-agent") : undefined,
 		path.dirname(fileURLToPath(import.meta.url)),

package/src/runtime/stale-reconciler.ts CHANGED Viewed

@@ -24,6 +24,7 @@ export interface ReconcileResult {
 	/** What was found and what action was taken */
 	verdict:
 		| "healthy"
+		| "blocked_awaiting_approval"
 		| "result_exists"
 		| "pid_dead"
 		| "pid_alive_stale"
@@ -36,6 +37,23 @@ export interface ReconcileResult {
 	repairedTasks?: TeamTaskState[];
 }
+/**
+ * Is this run intentionally waiting for human plan approval?
+ *
+ * Such runs are NOT stale even if their owning session died or their async PID
+ * is no longer live — they are blocked on a human decision, not a crash. Crash
+ * recovery and stale reconciliation must preserve them rather than mark them
+ * failed or orphan-cancel them. See PR #32 (gustavo-pelissaro) for the
+ * original analysis of this failure mode.
+ *
+ * @param manifest - Run manifest to inspect; only `status` and `planApproval`
+ *   are read.
+ * @returns true only when all three conditions hold: status is "blocked",
+ *   planApproval.required is exactly true, and planApproval.status is
+ *   "pending". A manifest without planApproval yields false.
+ */
+export function isPlanApprovalPending(manifest: TeamRunManifest): boolean {
+	return (
+		manifest.status === "blocked" &&
+		manifest.planApproval?.required === true &&
+		manifest.planApproval.status === "pending"
+	);
+}
 const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
 const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
 /** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
@@ -347,6 +365,18 @@ export function reconcileStaleRun(
 ): ReconcileResult {
 	const runId = manifest.runId;
+	// Preserve runs intentionally blocked on human plan approval. These are not
+	// crashes even if the owning PID is gone — they are waiting for a decision.
+	// Must short-circuit before Phase 1 (result check) and Phase 2 (PID liveness).
+	if (isPlanApprovalPending(manifest)) {
+		return {
+			runId,
+			verdict: "blocked_awaiting_approval",
+			repaired: false,
+			detail: "Plan approval is pending; blocked run is intentionally waiting and must not be stale-repaired",
+		};
+	}
 	// Phase 1: Check if results already exist
 	const phase1 = checkResultFile(manifest, tasks);
 	if (phase1.found) {
@@ -485,9 +515,13 @@ export interface OrphanReconcileResult {
  */
 export function reconcileOrphanedTempWorkspaces(
 	now = Date.now(),
-	options?: { cleanupOrphanedTempDirs?: boolean },
+	options?: { cleanupOrphanedTempDirs?: boolean; tmpDir?: string; scanBatchSize?: number },
 ): OrphanReconcileResult {
-	const tmpDir = getSafeTempDir();
+	// Injectable tmpDir + scanBatchSize for deterministic unit testing
+	// (Round 19: tests must not depend on global /tmp cleanliness; the
+	// production ORPHAN_TEMP_SCAN_BATCH_SIZE cap could exclude a test's dir
+	// when leftover dirs accumulate). Defaults remain getSafeTempDir() and
+	// ORPHAN_TEMP_SCAN_BATCH_SIZE.
+	const tmpDir = options?.tmpDir ?? getSafeTempDir();
 	if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
 	let repaired = 0;
 	let cleanedDirs = 0;
@@ -496,10 +530,11 @@ export function reconcileOrphanedTempWorkspaces(
 		// Sort for deterministic order; cap to ORPHAN_TEMP_SCAN_BATCH_SIZE per
 		// tick to avoid main-thread stalls when /tmp has thousands of
 		// pi-crew-* dirs from past interrupted test runs.
+		const scanBatch = options?.scanBatchSize ?? ORPHAN_TEMP_SCAN_BATCH_SIZE;
 		const candidates = entries
 			.filter((e) => e.isDirectory() && e.name.startsWith("pi-crew-"))
 			.sort((a, b) => a.name.localeCompare(b.name))
-			.slice(0, ORPHAN_TEMP_SCAN_BATCH_SIZE);
+			.slice(0, scanBatch);
 		for (const entry of candidates) {
 			if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
 				continue;

package/src/runtime/task-runner.ts CHANGED Viewed

@@ -292,7 +292,16 @@ export async function runTeamTask(
 				const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
 				// E1 (Round 15): structured CrewError with code E009 + help hint,
 				// instead of a raw Error. Surfaces the script path, exit code, and stderr.
-				throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
+				// Round 21 (E4): if preStepOptional is set, a failing hook is NON-FATAL.
+				// Log a warning + emit a 'warning' event, then proceed without the
+				// pre-step output rather than aborting the task (advisory hooks).
+				if (input.step.preStepOptional) {
+					const warnMsg = `[preStepOptional] pre-step hook '${input.step.preStepScript}' failed (exit ${exitCode ?? "?"}) but preStepOptional=true; continuing without its output.`;
+					try { appendEventFireAndForget(manifest.eventsPath, { type: "hook.pre_step_optional_failed", runId: manifest.runId, taskId: task.id, message: warnMsg, data: { script: input.step.preStepScript, exitCode: exitCode ?? null } }); } catch { /* best-effort event log */ }
+					preStepOutput = undefined;
+				} else {
+					throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
+				}
 			}
 		}

package/src/runtime/team-runner.ts CHANGED Viewed

@@ -455,6 +455,15 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
 		return result;
 	} catch (error) {
+		// Round 27 (BUG 1): the success path calls stopTeamHeartbeat() but this
+			// catch path did NOT. The team heartbeat is a non-unref'd setInterval
+			// (30s) that deliberately keeps the event loop alive — without this
+			// call, a failed team run leaves the interval firing forever and the
+			// foreground pi process hangs (never returns to the prompt); in
+			// background-runner mode the worker never exits. clearInterval is
+			// idempotent so a double-call (if this runs after the success path)
+			// is harmless.
+		stopTeamHeartbeat();
 		// P1: Catch unhandled errors — ensure manifest/tasks/agents are terminal so they don't stay "running" forever.
 		const message = error instanceof Error ? error.message : String(error);
 		// Reload manifest with lock to avoid stale data overwriting concurrent writes.
@@ -922,8 +931,16 @@ tasks = mergeResult.resultTasks;
 		await saveRunTasksAsync(finalManifest, tasks);
 	});
 	manifest = finalManifest;
-	// Save health snapshot on run completion
-	const crewRoot = path.dirname(path.dirname(finalManifest.stateRoot));
+	// Save health snapshot on run completion.
+	// BUG A (pts/2 hang investigation 2026-06-16): stateRoot = `<crewRoot>/state/runs/<runId>`,
+	// so the crew root is THREE dirnames up, not two. Two dirnames gave `<crewRoot>/state`
+	// (the state dir), and HealthStore then joined HEALTH_DIR (`.crew/state/health`)
+	// onto it → `<crewRoot>/state/.crew/state/health` — a double-joined BOGUS path.
+	// That wrote health snapshots to a nonexistent subtree (silently breaking the
+	// health feature) AND created junk dirs that the recursive state watcher then
+	// attached extra inotify watches to. Fix: compute the real crew root (3 up)
+	// and make HEALTH_DIR relative to it.
+	const crewRoot = path.dirname(path.dirname(path.dirname(finalManifest.stateRoot)));
 	const healthStore = new HealthStore(crewRoot);
 	healthStore.saveSnapshot({
 		runId: finalManifest.runId,

package/src/runtime/verification-gates.ts CHANGED Viewed

@@ -57,7 +57,12 @@ export const CARGO_RUST_GATES: Array<{ name: string; command: string; critical:
  * Execute a single command and capture output.
  */
 /** Characters/patterns that indicate dangerous shell metacharacters. */
-const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[^&])/;
+// Round 25 (VULN-3/VULN-4): also block raw newlines (sh -c treats \n as a
+// command separator -> injection) and bare $VARNAME references (can exfiltrate
+// secrets into captured gate output, e.g. `echo $ANTHROPIC_API_KEY`).
+// $+word-char is blocked; special vars like $?/$$/$! are left alone. Built-in
+// gates use only `2>&1` (no $VAR), so this does not break them.
+// Input redirects stay blocked via `<[^&]` (any `<` not followed by `&`); the
+// negated class must contain ONLY `&`, otherwise other characters after `<`
+// (e.g. `<^`) would slip past the guard.
+const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\$\w|\b(eval|exec)\b|>>|<[^&]|[\r\n])/;
 // Note: single `>` is NOT blocked here because `2>&1` is a safe redirect used by built-in gates.
 // `>>` (append) is still blocked. `<` without `&` (input redirect) is still blocked.
@@ -66,7 +71,22 @@ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[
  * Rejects commands with shell metacharacters that could enable injection.
  * Allows: pipes (|), redirection of stderr (2>&1), and basic npm/cargo/npx commands.
  */
+/**
+ * @internal — exported for injection-guard unit testing (Round 25).
+ *
+ * Forwards `command` unchanged to the module-private `validateGateCommand`
+ * and returns nothing; anything that function throws (e.g. its raw-newline
+ * rejection) propagates to the caller.
+ *
+ * NOTE(review): this wrapper sits between the doc comment that describes
+ * `validateGateCommand` and that function's declaration — consider moving
+ * the wrapper below the function so the doc comment stays attached to it.
+ */
+export function __test__validateGateCommand(command: string): void {
+	validateGateCommand(command);
+}
 function validateGateCommand(command: string): void {
+	// Round 25 (VULN-3): check the ORIGINAL command for raw newlines BEFORE
+	// normalization. The regex below runs on the NORMALIZED command (which
+	// collapses \s+ incl. newlines to a single space), so a newline would be
+	// hidden from it - but `sh -c` treats a raw newline as a command
+	// separator, enabling injection (e.g. `npm test\nrm -rf x`).
+	if (/[\r\n]/.test(command)) {
+		throw new Error(
+			`Security: verification gate command rejected (raw newline - potential command injection): ${JSON.stringify(command)}`,
+		);
+	}
 	const normalized = command
 		.replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '')  // ANSI escape sequences
 		.replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '')  // control chars