pi-crew 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/CHANGELOG.md +71 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/extension/action-suggestions.ts +71 -0
  9. package/src/extension/context-status-injection.ts +32 -1
  10. package/src/extension/register.ts +71 -65
  11. package/src/extension/team-tool/api.ts +3 -2
  12. package/src/extension/team-tool/cancel.ts +5 -4
  13. package/src/extension/team-tool/explain.ts +2 -1
  14. package/src/extension/team-tool/failure-patterns.ts +124 -0
  15. package/src/extension/team-tool/inspect.ts +10 -6
  16. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  17. package/src/extension/team-tool/respond.ts +4 -3
  18. package/src/extension/team-tool/run-not-found.ts +54 -0
  19. package/src/extension/team-tool/run.ts +26 -4
  20. package/src/extension/team-tool/status.ts +58 -4
  21. package/src/extension/team-tool.ts +5 -3
  22. package/src/runtime/async-runner.ts +7 -0
  23. package/src/runtime/background-runner.ts +7 -1
  24. package/src/runtime/chain-parser.ts +13 -5
  25. package/src/runtime/checkpoint.ts +13 -1
  26. package/src/runtime/child-pi.ts +9 -1
  27. package/src/runtime/crash-recovery.ts +21 -1
  28. package/src/runtime/live-session-runtime.ts +15 -1
  29. package/src/runtime/parent-guard.ts +2 -2
  30. package/src/runtime/pi-spawn.ts +66 -0
  31. package/src/runtime/stale-reconciler.ts +38 -3
  32. package/src/runtime/task-runner.ts +10 -1
  33. package/src/runtime/team-runner.ts +19 -2
  34. package/src/runtime/verification-gates.ts +21 -1
  35. package/src/schema/team-tool-schema.ts +9 -0
  36. package/src/state/blob-store.ts +12 -10
  37. package/src/state/event-log-rotation.ts +114 -93
  38. package/src/state/event-log.ts +79 -20
  39. package/src/state/health-store.ts +6 -1
  40. package/src/state/locks.ts +66 -16
  41. package/src/state/state-store.ts +14 -1
  42. package/src/ui/card-colors.ts +7 -3
  43. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  44. package/src/ui/live-duration.ts +58 -0
  45. package/src/ui/tool-render.ts +7 -11
  46. package/src/ui/tool-renderers/index.ts +6 -3
  47. package/src/ui/widget/widget-formatters.ts +2 -13
  48. package/src/utils/fs-watch.ts +11 -60
  49. package/src/utils/run-watcher-registry.ts +164 -0
  50. package/src/workflows/discover-workflows.ts +2 -1
  51. package/src/workflows/workflow-config.ts +5 -0
  52. package/src/runtime/dynamic-script-runner.ts +0 -497
  53. package/src/runtime/sandbox.ts +0 -335
@@ -3,24 +3,31 @@ import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
3
3
  import { appendEvent, readEvents } from "../../state/event-log.ts";
4
4
  import { readDeliveryState, readMailbox } from "../../state/mailbox.ts";
5
5
  import { loadRunManifestById, updateRunStatus, saveRunTasks } from "../../state/state-store.ts";
6
- import { aggregateUsage, formatUsage } from "../../state/usage.ts";
6
+ import { aggregateUsage, formatUsage, formatCost } from "../../state/usage.ts";
7
7
  import { applyAttentionState, formatActivityAge, resolveCrewControlConfig } from "../../runtime/agent-control.ts";
8
8
  import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
9
9
  import { checkProcessLiveness, isActiveRunStatus } from "../../runtime/process-status.ts";
10
10
  import { formatTaskGraphLines, waitingReason } from "../../runtime/task-display.ts";
11
+ import { computePhaseProgress } from "../../runtime/phase-progress.ts";
12
+ import { formatDuration } from "../../ui/tool-render.ts";
11
13
  import { verifyTaskCompletion } from "../../runtime/completion-guard.ts";
12
14
  import { evaluateRunEffectiveness } from "../../runtime/effectiveness.ts";
13
15
  import type { PiTeamsToolResult } from "../tool-result.ts";
14
16
  import { locateRunCwd } from "../team-tool.ts";
15
17
  import { result, type TeamContext } from "./context.ts";
18
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
16
19
 
17
20
  export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
18
21
  if (!params.runId) return result("Status requires runId.", { action: "status", status: "error" }, true);
19
22
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
20
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
23
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "status", status: "error" }, true);
21
24
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
22
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
25
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "status", status: "error" }, true);
23
26
  let { manifest, tasks } = loaded;
27
+ // DX (Round 16 F3): compact status mode. Default = full (backward compatible).
28
+ // details=false gives a tight summary (status, goal, counts, failed/attention
29
+ // errors) for quick checks without 40 lines of dense key=value noise.
30
+ const fullDetails = params.details !== false;
24
31
  let asyncLivenessLine: string | undefined;
25
32
  if (manifest.async) {
26
33
  const asyncState = manifest.async;
@@ -35,6 +42,7 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
35
42
  }
36
43
  const counts = new Map<string, number>();
37
44
  for (const task of tasks) counts.set(task.status, (counts.get(task.status) ?? 0) + 1);
45
+ const phaseProgress = computePhaseProgress(tasks);
38
46
  const allEvents = readEvents(manifest.eventsPath);
39
47
  const events = allEvents.slice(-8);
40
48
  const attentionByTask = new Map(allEvents.filter((event) => event.type === "task.attention" && event.taskId).map((event) => [event.taskId!, event]));
@@ -62,12 +70,13 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
62
70
  const activeAgents = crewAgents.filter((agent) => agent.status === "running");
63
71
  const completedAgents = crewAgents.filter((agent) => agent.status !== "running");
64
72
  const waitingTasks = tasks.filter((task) => task.status === "queued" || task.status === "waiting");
65
- const agentLine = (agent: typeof crewAgents[number]): string => `- ${agent.id} [${agent.status}] ${agent.role} -> ${agent.agent} runtime=${agent.runtime}${agent.model ? ` model=${agent.model}` : ""}${agent.usage ? ` usage=${formatUsage(agent.usage)}` : ""}${agent.progress?.activityState ? ` activityState=${agent.progress.activityState}` : ""}${formatActivityAge(agent) ? ` activity=${formatActivityAge(agent)}` : ""}${agent.progress?.currentTool ? ` tool=${agent.progress.currentTool}` : ""}${agent.toolUses ? ` tools=${agent.toolUses}` : ""}${!agent.usage && agent.progress?.tokens ? ` tokens=${agent.progress.tokens}` : ""}${agent.progress?.turns ? ` turns=${agent.progress.turns}` : ""}${agent.jsonEvents !== undefined ? ` jsonEvents=${agent.jsonEvents}` : ""}${agent.outputPath ? ` output=${agent.outputPath}` : ""}${agent.transcriptPath ? ` transcript=${agent.transcriptPath}` : ""}${agent.statusPath ? ` status=${agent.statusPath}` : ""}${agent.error ? ` error=${agent.error}` : ""}`;
73
+ const agentLine = (agent: typeof crewAgents[number]): string => `- ${agent.id} [${agent.status}] ${agent.role} -> ${agent.agent} runtime=${agent.runtime}${agent.model ? ` model=${agent.model}` : ""}${agent.usage ? ` usage=${formatUsage(agent.usage)}` : ""}${agent.usage?.cost ? ` cost=${formatCost(agent.usage.cost)}` : ""}${agent.progress?.activityState ? ` activityState=${agent.progress.activityState}` : ""}${formatActivityAge(agent) ? ` activity=${formatActivityAge(agent)}` : ""}${agent.progress?.currentTool ? ` tool=${agent.progress.currentTool}` : ""}${agent.toolUses ? ` tools=${agent.toolUses}` : ""}${!agent.usage && agent.progress?.tokens ? ` tokens=${agent.progress.tokens}` : ""}${agent.progress?.turns ? ` turns=${agent.progress.turns}` : ""}${agent.jsonEvents !== undefined ? ` jsonEvents=${agent.jsonEvents}` : ""}${agent.outputPath ? ` output=${agent.outputPath}` : ""}${agent.transcriptPath ? ` transcript=${agent.transcriptPath}` : ""}${agent.statusPath ? ` status=${agent.statusPath}` : ""}${agent.error ? ` error=${agent.error}` : ""}`;
66
74
  const lines = [
67
75
  `Run: ${manifest.runId}`,
68
76
  `Team: ${manifest.team}`,
69
77
  `Workflow: ${manifest.workflow ?? "(none)"}`,
70
78
  `Status: ${manifest.status}`,
79
+ `Progress: ${phaseProgress.overallPercentage}% (~${formatDuration(phaseProgress.estimatedRemainingMs)} remaining)`,
71
80
  `Workspace mode: ${manifest.workspaceMode}`,
72
81
  ...(manifest.runtimeResolution ? [`Runtime: ${manifest.runtimeResolution.kind}`, `Runtime safety: ${manifest.runtimeResolution.safety}`, `Runtime requested: ${manifest.runtimeResolution.requestedMode}${manifest.runtimeResolution.reason ? ` (${manifest.runtimeResolution.reason})` : ""}`] : []),
73
82
  `Goal: ${manifest.goal}`,
@@ -109,5 +118,50 @@ export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiT
109
118
  "Recent events:",
110
119
  ...(events.length ? events.map((event) => `- ${event.time} ${event.type}${event.taskId ? ` ${event.taskId}` : ""}${event.message ? `: ${event.message}` : ""}`) : ["- (none)"]),
111
120
  ];
121
+ if (!fullDetails) {
122
+ return result(
123
+ buildCompactStatus(manifest, tasks, counts, asyncLivenessLine, phaseProgress).join("\n"),
124
+ { action: "status", status: "ok", runId: manifest.runId, artifactsRoot: manifest.artifactsRoot, intent: `status ${manifest.runId}: ${manifest.status} (compact)` },
125
+ );
126
+ }
112
127
  return result(lines.join("\n"), { action: "status", status: "ok", runId: manifest.runId, artifactsRoot: manifest.artifactsRoot, intent: `status ${manifest.runId}: ${manifest.status}` });
113
128
  }
129
+
130
+ /**
131
+ * Compact status builder (DX: Round 16 F3). A tight summary for quick checks:
132
+ * identity, status, goal, task counts, and ONLY failed / attention / cancelled task
133
+ * errors — not the 40-line dense dump. Invoked when params.details === false.
134
+ *
135
+ * Exported for unit testing.
136
+ */
137
+ export function buildCompactStatus(
138
+ manifest: { runId: string; team: string; workflow?: string; status: string; goal: string; workspaceMode?: string },
139
+ tasks: Array<{ id: string; status: string; role: string; agent: string; error?: string }>,
140
+ counts: Map<string, number>,
141
+ asyncLivenessLine?: string,
142
+ progress?: { overallPercentage: number; estimatedRemainingMs: number },
143
+ ): string[] {
144
+ const failedOrAttention = tasks.filter(
145
+ (t) =>
146
+ t.status === "failed" ||
147
+ t.status === "needs_attention" ||
148
+ t.status === "cancelled",
149
+ );
150
+ const lines = [
151
+ `Run: ${manifest.runId}`,
152
+ `Team: ${manifest.team}${manifest.workflow ? ` (${manifest.workflow})` : ""}`,
153
+ `Status: ${manifest.status}`,
154
+ ...(progress ? [`Progress: ${progress.overallPercentage}% (~${formatDuration(progress.estimatedRemainingMs)} remaining)`] : []),
155
+ `Goal: ${manifest.goal}`,
156
+ ...(asyncLivenessLine ? [asyncLivenessLine] : []),
157
+ `Tasks: ${[...counts.entries()].map(([s, c]) => `${s}=${c}`).join(", ") || "none"}`,
158
+ ];
159
+ if (failedOrAttention.length > 0) {
160
+ lines.push("Issues:");
161
+ for (const t of failedOrAttention) {
162
+ lines.push(`- ${t.id} [${t.status}] ${t.role}: ${t.error ?? "(no error detail)"}`);
163
+ }
164
+ }
165
+ lines.push("Tip: pass details=true for full output (task graph, agents, effectiveness, events).");
166
+ return lines;
167
+ }
@@ -156,6 +156,8 @@ import { handleParallel } from "./team-tool/parallel-dispatch.ts";
156
156
  import { handlePlan } from "./team-tool/plan.ts";
157
157
  import { handleRespond } from "./team-tool/respond.ts";
158
158
  import { handleStatus } from "./team-tool/status.ts";
159
+ import { RUN_NOT_FOUND_HINT } from "./team-tool/run-not-found.ts";
160
+ import { formatActionSuggestion } from "./action-suggestions.ts";
159
161
 
160
162
  export { handleApi } from "./team-tool/api.ts";
161
163
  export { handleRetry } from "./team-tool/cancel.ts";
@@ -459,14 +461,14 @@ export async function handleResume(
459
461
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
460
462
  if (!runCwd)
461
463
  return result(
462
- `Run '${params.runId}' not found.`,
464
+ `Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`,
463
465
  { action: "resume", status: "error" },
464
466
  true,
465
467
  );
466
468
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
467
469
  if (!loaded)
468
470
  return result(
469
- `Run '${params.runId}' not found.`,
471
+ `Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`,
470
472
  { action: "resume", status: "error" },
471
473
  true,
472
474
  );
@@ -1347,7 +1349,7 @@ export async function handleTeamTool(
1347
1349
  }
1348
1350
  default:
1349
1351
  return result(
1350
- `Unknown action: ${action}`,
1352
+ `Unknown action: ${action}${formatActionSuggestion(String(action))}`,
1351
1353
  { action: "unknown", status: "error" },
1352
1354
  true,
1353
1355
  );
@@ -231,6 +231,13 @@ export async function spawnBackgroundTeamRun(manifest: TeamRunManifest): Promise
231
231
  windowsHide: true,
232
232
  } as unknown as Parameters<typeof spawn>[2];
233
233
  const child = spawn(process.execPath, command.args, spawnOpts);
234
+ // Round 27 (BUG 3): the piped stdout/stderr are NEVER read or destroyed →
235
+ // 2 FDs leak per background spawn, and if the child writes >64KB (pipe
236
+ // buffer) it blocks forever (nobody drains the pipe) → background runner
237
+ // hangs. The background runner redirects its own console to a file, so we
238
+ // don't need this output — destroy the read ends immediately.
239
+ child.stdout?.destroy();
240
+ child.stderr?.destroy();
234
241
  child.on("error", (error: Error) => {
235
242
  logInternalError("async-runner.spawn", error, `pid=${child.pid ?? "unknown"}`);
236
243
  });
@@ -525,7 +525,13 @@ async function main(): Promise<void> {
525
525
  const agents = allAgents(discoverAgents(cwd));
526
526
  debugLog(`[background-runner] discoverAgents done, ${agents.length} agents`,
527
527
  );
528
- try { fs.fsyncSync(fs.openSync(manifest.eventsPath, "a")); } catch { /* best-effort */ } // FORCE flush so we see this before death
528
+ // Round 27 (BUG 2): openSync returned an fd that was never closed → FD
529
+ // leak per background runner startup. Close it in a finally (matches the
530
+ // canonical pattern in checkpoint.ts:83 and event-log.ts:582).
531
+ try {
532
+ const fd = fs.openSync(manifest.eventsPath, "a");
533
+ try { fs.fsyncSync(fd); } finally { try { fs.closeSync(fd); } catch { /* best-effort */ } }
534
+ } catch { /* best-effort */ } // FORCE flush so we see this before death
529
535
  debugLog(`[background-runner] calling directTeamAndWorkflowFromRun`,
530
536
  );
531
537
  const direct = directTeamAndWorkflowFromRun(manifest, tasks, agents);
@@ -122,10 +122,10 @@ class ChainParser {
122
122
 
123
123
  parse(): ChainStep[] {
124
124
  const steps: ChainStep[] = [];
125
- steps.push(this.parseStep());
125
+ steps.push(this.parseStep(0));
126
126
  while (this.peek("ARROW")) {
127
127
  this.consume("ARROW");
128
- steps.push(this.parseStep());
128
+ steps.push(this.parseStep(0));
129
129
  }
130
130
  if (this.pos < this.tokens.length) {
131
131
  throw new Error(`Unexpected token '${this.tokens[this.pos]?.value}' at position ${this.pos}`);
@@ -133,16 +133,24 @@ class ChainParser {
133
133
  return steps;
134
134
  }
135
135
 
136
- private parseStep(): ChainStep {
136
+ private parseStep(depth: number = 0): ChainStep {
137
+ // Round 22 (BUG 2): guard against stack overflow on deeply nested input.
138
+ // Without this, a crafted 'parallel(parallel(parallel(...)))' input would
139
+ // recurse unbounded and crash the process with RangeError. Each nesting
140
+ // level needs >=9 chars, so ~130KB could overflow V8's ~15K-frame stack.
141
+ const MAX_CHAIN_NESTING = 100;
142
+ if (depth > MAX_CHAIN_NESTING) {
143
+ throw new Error(`Chain DSL nesting too deep (max ${MAX_CHAIN_NESTING}); likely unbalanced or malicious input`);
144
+ }
137
145
  // Check for parallel(...) construct
138
146
  if (this.peek("NAME", "parallel")) {
139
147
  this.consume("NAME"); // eat "parallel"
140
148
  this.consume("LPAREN");
141
149
  const parallel: ChainStep[] = [];
142
- parallel.push(this.parseStep());
150
+ parallel.push(this.parseStep(depth + 1));
143
151
  while (this.peek("COMMA")) {
144
152
  this.consume("COMMA");
145
- parallel.push(this.parseStep());
153
+ parallel.push(this.parseStep(depth + 1));
146
154
  }
147
155
  this.consume("RPAREN");
148
156
  const step: ChainStep = { name: "parallel", parallel };
@@ -64,7 +64,19 @@ export class FileCheckpointStore implements CheckpointStore {
64
64
  // Atomic write: write to temp file first, then rename, then fsync parent.
65
65
  // This guarantees either the old file or the new file, never a partial
66
66
  // write, even on network filesystems or certain journal modes.
67
- const tmp = path.join(this.checkpointDir(), ".tmp.checkpoint");
67
+ //
68
+ // Round 22 (BUG 1): the temp filename MUST be unique per save call.
69
+ // Previously a fixed '.tmp.checkpoint' was shared across ALL concurrent
70
+ // saves; pi-crew's multi-process architecture (main + detached background
71
+ // workers each checkpointing their own tasks) made this realistic: two
72
+ // processes writing '.tmp.checkpoint' at once → one's rename picks up the
73
+ // other's data (silent corruption) and the second rename hits ENOENT
74
+ // (silent data loss). Including taskId + pid + timestamp + a random suffix guarantees
75
+ // uniqueness across processes and across tasks.
76
+ const tmp = path.join(
77
+ this.checkpointDir(),
78
+ `.tmp.${checkpoint.taskId}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}`,
79
+ );
68
80
  fs.writeFileSync(tmp, JSON.stringify(checkpoint, null, 2), "utf-8");
69
81
  fs.renameSync(tmp, p);
70
82
  // fsync parent directory to ensure the rename is durable
@@ -628,7 +628,14 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
628
628
  let graceTurns = input.graceTurns;
629
629
  if (graceTurns !== undefined && graceTurns > 1000) graceTurns = 1000;
630
630
  let abortDueToParentSignal = false;
631
- input.signal?.addEventListener("abort", () => { abortDueToParentSignal = true; }, { once: true });
631
+ // Round 27 (BUG 4): extract to a named handler so settle() can remove it.
632
+ // The previous anonymous listener was never removed → on runs with >10
633
+ // tasks sharing one AbortSignal (background-runner), Node emitted
634
+ // MaxListenersExceededWarning and each leaked listener pinned the task's
635
+ // stack frame (abortDueToParentSignal closure) in memory. { once: true }
636
+ // only auto-removes AFTER the signal fires; on normal completion it leaks.
637
+ const onParentAbort = (): void => { abortDueToParentSignal = true; };
638
+ input.signal?.addEventListener("abort", onParentAbort, { once: true });
632
639
  const restartNoResponseTimer = (): void => {
633
640
  if (responseTimeoutMs <= 0) return;
634
641
  if (noResponseTimer) clearTimeout(noResponseTimer);
@@ -747,6 +754,7 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
747
754
  clearChildPiTimeouts();
748
755
  lineObserver.flush();
749
756
  input.signal?.removeEventListener("abort", abort);
757
+ input.signal?.removeEventListener("abort", onParentAbort);
750
758
  try {
751
759
  cleanupTempDir(built.tempDir);
752
760
  } catch (error) {
@@ -9,7 +9,7 @@ import type { TeamTaskState } from "../state/types.ts";
9
9
  import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
10
  import type { ManifestCache } from "./manifest-cache.ts";
11
11
  import { checkProcessLiveness } from "./process-status.ts";
12
- import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
12
+ import { isPlanApprovalPending, reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
13
13
  import { executeHook, appendHookEvent } from "../hooks/registry.ts";
14
14
  import { unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
15
15
  import { resolveRealContainedPath } from "../utils/safe-paths.ts";
@@ -38,6 +38,8 @@ export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache,
38
38
  const plans: RecoveryPlan[] = [];
39
39
  for (const manifest of manifestCache.list(50)) {
40
40
  if (manifest.status !== "running" && manifest.status !== "blocked") continue;
41
+ // Preserve runs intentionally blocked on plan approval — not crashes.
42
+ if (isPlanApprovalPending(manifest)) continue;
41
43
  if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
42
44
  // NOTE: no withRunLock — best-effort only; concurrent writes may cause inconsistency
43
45
  const loaded = loadRunManifestById(cwd, manifest.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
@@ -107,6 +109,12 @@ export function cancelOrphanedRuns(
107
109
  // Phase 1: Scan project-level manifests via manifestCache
108
110
  for (const manifest of manifestCache.list(50)) {
109
111
  if (manifest.status !== "running" && manifest.status !== "blocked") continue;
112
+ // Preserve plan-approval-blocked runs — they belong to their owner and are
113
+ // waiting on a human decision, not orphaned by a dead owner process.
114
+ if (isPlanApprovalPending(manifest)) {
115
+ skipped.push(manifest.runId);
116
+ continue;
117
+ }
110
118
 
111
119
  // Only consider runs owned by a different session
112
120
  const ownerId = manifest.ownerSessionId;
@@ -340,6 +348,18 @@ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache,
340
348
  // Re-read inside lock to get freshest data
341
349
  const fresh = loadRunManifestById(cwd, runId); // NOTE: inside withRunLockSync - consistent read
342
350
  if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
351
+ // Belt-and-suspenders: reconcileStaleRun itself guards this, but the run
352
+ // may have flipped to blocked+plan-approval between cache-list and lock
353
+ // acquisition — re-check the freshest manifest under the lock.
354
+ if (isPlanApprovalPending(fresh.manifest)) {
355
+ results.push({
356
+ runId,
357
+ verdict: "blocked_awaiting_approval",
358
+ repaired: false,
359
+ detail: "Plan approval is pending; stale reconciliation skipped",
360
+ });
361
+ return;
362
+ }
343
363
  const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
344
364
  if (result.repaired || result.verdict === "result_exists") {
345
365
  if (result.repairedTasks) {
@@ -384,6 +384,12 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
384
384
 
385
385
  const agentId = `${input.manifest.runId}:${input.task.id}`;
386
386
 
387
+ // Round 27 (BUG 4): hoisted to function scope so the finally block can remove
388
+ // it. const inside try{} is block-scoped and invisible to finally{}. The
389
+ // handler resolves `session` lazily at call time (it may be assigned later
390
+ // inside the try), so declaring it here is safe.
391
+ let onSignalAbort: (() => void) | undefined;
392
+
387
393
  try {
388
394
  const agentDir = typeof mod.getAgentDir === "function" ? mod.getAgentDir() : undefined;
389
395
  let resourceLoader: unknown;
@@ -545,9 +551,14 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
545
551
  }
546
552
  });
547
553
  }
554
+ // Round 27 (BUG 4): named abort handler (removed in finally below).
555
+ onSignalAbort = (): void => { void session?.abort?.(); };
548
556
  if (input.signal) {
549
557
  if (input.signal.aborted) await session.abort?.();
550
- else input.signal.addEventListener("abort", () => { void session?.abort?.(); }, { once: true });
558
+ // Round 27 (BUG 4): named handler so the finally block can remove it.
559
+ // The previous anonymous listener leaked on normal completion (only
560
+ // auto-removed by { once: true } AFTER the signal fires).
561
+ else input.signal.addEventListener("abort", onSignalAbort, { once: true });
551
562
  }
552
563
  const effectivePrompt = input.runtimeConfig?.inheritContext === true && input.parentContext ? `${input.parentContext}\n\n---\n# Live Subagent Task\n${input.prompt}` : input.prompt;
553
564
 
@@ -687,6 +698,9 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
687
698
  // H6: Unsubscribe listeners FIRST before clearing timer to prevent race
688
699
  unsubscribe?.();
689
700
  unsubscribeControlRealtime?.();
701
+ // Round 27 (BUG 4): remove the named abort listener to avoid leaking it
702
+ // on the shared AbortSignal across many live-session tasks.
703
+ if (onSignalAbort) input.signal?.removeEventListener("abort", onSignalAbort);
690
704
  if (controlTimer) clearInterval(controlTimer);
691
705
  streamOut?.close();
692
706
  if (input.signal?.aborted) {
@@ -29,8 +29,8 @@
29
29
  * signal, NOT a security boundary:
30
30
  * - It only causes the (already-compromised) child to exit earlier.
31
31
  * - A truly malicious child can simply not call `startParentGuard()`.
32
- * - Real protection against hostile children comes from the sandbox,
33
- * env-filter allowlist, and redaction — all enforced before spawn.
32
+ * - Real protection against hostile children comes from the env-filter
33
+ * allowlist and redaction — all enforced before spawn.
34
34
  *
35
35
  * The guard exists for the benign case: a parent dies (user closes the
36
36
  * terminal, pi crashes, machine loses power) and we want all detached
@@ -1,5 +1,6 @@
1
1
  import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
+ import { execSync } from "node:child_process";
3
4
  import { fileURLToPath } from "node:url";
4
5
  import * as path from "node:path";
5
6
 
@@ -118,6 +119,63 @@ function findPiPackageJsonFrom(startDir: string): string | undefined {
118
119
  return undefined;
119
120
  }
120
121
 
122
+ /**
123
+ * Discover the real npm global node_modules directory at runtime.
124
+ *
125
+ * Why this exists (Issue #33): on Windows, pi may be installed somewhere
126
+ * other than %APPDATA%\npm — e.g. nvm-windows puts the global node_modules
127
+ * under %NVM_HOME%/<version>/node_modules, Volta under
128
+ * %LOCALAPPDATA%\Volta, fnm under %LOCALAPPDATA%\fnm_multishells. The static
129
+ * %APPDATA%\npm paths in resolvePiCliScript() miss all of those, and the
130
+ * fallback spawn("pi") then fails with ENOENT because child_process.spawn does
131
+ * NOT do PATHEXT resolution on Windows (only exec/execSync via cmd.exe do).
132
+ *
133
+ * `npm root -g` is the canonical way to find the global node_modules dir and
134
+ * works across every npm-based install layout. We run it via execSync, which
135
+ * DOES resolve `npm.cmd` through PATHEXT. Capped at 5s; any failure (npm not
136
+ * on PATH, slow start, etc.) just falls through to the other resolution roots.
137
+ *
138
+ * Memoized: the npm global root does not change during a process lifetime, so
139
+ * this is a one-time ~200ms cost rather than per-worker.
140
+ *
141
+ * @internal — exported for unit-test injection via __setNpmGlobalRootForTest.
142
+ */
143
+ let cachedNpmGlobalRoot: string | undefined | null = null;
144
+ export function resolveNpmGlobalRoot(): string | undefined {
145
+ if (cachedNpmGlobalRoot !== null) {
146
+ return cachedNpmGlobalRoot ?? undefined;
147
+ }
148
+ let resolved: string | undefined;
149
+ try {
150
+ const out = execSync("npm root -g", {
151
+ encoding: "utf-8",
152
+ timeout: 5000,
153
+ stdio: ["pipe", "pipe", "pipe"], // suppress npm's stderr chatter
154
+ windowsHide: true,
155
+ }).trim();
156
+ resolved = out.length > 0 ? out : undefined;
157
+ } catch {
158
+ resolved = undefined;
159
+ }
160
+ cachedNpmGlobalRoot = resolved ?? null;
161
+ return resolved;
162
+ }
163
+
164
+ /**
165
+ * Given an npm global node_modules root, derive the candidate package dirs for
166
+ * each supported pi scope. Pure + exported so the mapping is unit-testable
167
+ * without spawning npm.
168
+ * @internal
169
+ */
170
+ export function buildNpmGlobalPackageDirs(npmGlobalRoot: string): string[] {
171
+ return PI_PACKAGE_NAMES.map((pkgName) => path.join(npmGlobalRoot, ...pkgName.split("/")));
172
+ }
173
+
174
+ /** @internal — test hook: inject a fake global root (or undefined) and reset the memo. */
175
+ export function __setNpmGlobalRootForTest(root: string | undefined): void {
176
+ cachedNpmGlobalRoot = root ?? null;
177
+ }
178
+
121
179
  function resolvePiCliScript(): string | undefined {
122
180
  const argv1 = process.argv[1];
123
181
  if (argv1) {
@@ -125,8 +183,16 @@ function resolvePiCliScript(): string | undefined {
125
183
  if (isRunnableNodeScript(argvPath)) return argvPath;
126
184
  }
127
185
 
186
+ // npm-global package dirs derived from `npm root -g` — placed BEFORE the
187
+ // %APPDATA%\npm static paths and the cwd/import.meta fallbacks so that a pi
188
+ // install under nvm-windows / Volta / fnm is found even when %APPDATA%\npm
189
+ // doesn't contain it. Covers Issue #33.
190
+ const npmGlobalRoot = resolveNpmGlobalRoot();
191
+ const npmGlobalDirs = npmGlobalRoot ? buildNpmGlobalPackageDirs(npmGlobalRoot) : [];
192
+
128
193
  const roots = [
129
194
  resolvePiPackageRoot(),
195
+ ...npmGlobalDirs,
130
196
  process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@earendil-works", "pi-coding-agent") : undefined,
131
197
  process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@mariozechner", "pi-coding-agent") : undefined,
132
198
  path.dirname(fileURLToPath(import.meta.url)),
@@ -24,6 +24,7 @@ export interface ReconcileResult {
24
24
  /** What was found and what action was taken */
25
25
  verdict:
26
26
  | "healthy"
27
+ | "blocked_awaiting_approval"
27
28
  | "result_exists"
28
29
  | "pid_dead"
29
30
  | "pid_alive_stale"
@@ -36,6 +37,23 @@ export interface ReconcileResult {
36
37
  repairedTasks?: TeamTaskState[];
37
38
  }
38
39
 
40
+ /**
41
+ * Is this run intentionally waiting for human plan approval?
42
+ *
43
+ * Such runs are NOT stale even if their owning session died or their async PID
44
+ * is no longer live — they are blocked on a human decision, not a crash. Crash
45
+ * recovery and stale reconciliation must preserve them rather than mark them
46
+ * failed or orphan-cancel them. See PR #32 (gustavo-pelissaro) for the
47
+ * original analysis of this failure mode.
48
+ */
49
+ export function isPlanApprovalPending(manifest: TeamRunManifest): boolean {
50
+ return (
51
+ manifest.status === "blocked" &&
52
+ manifest.planApproval?.required === true &&
53
+ manifest.planApproval.status === "pending"
54
+ );
55
+ }
56
+
39
57
  const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
40
58
  const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
41
59
  /** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
@@ -347,6 +365,18 @@ export function reconcileStaleRun(
347
365
  ): ReconcileResult {
348
366
  const runId = manifest.runId;
349
367
 
368
+ // Preserve runs intentionally blocked on human plan approval. These are not
369
+ // crashes even if the owning PID is gone — they are waiting for a decision.
370
+ // Must short-circuit before Phase 1 (result check) and Phase 2 (PID liveness).
371
+ if (isPlanApprovalPending(manifest)) {
372
+ return {
373
+ runId,
374
+ verdict: "blocked_awaiting_approval",
375
+ repaired: false,
376
+ detail: "Plan approval is pending; blocked run is intentionally waiting and must not be stale-repaired",
377
+ };
378
+ }
379
+
350
380
  // Phase 1: Check if results already exist
351
381
  const phase1 = checkResultFile(manifest, tasks);
352
382
  if (phase1.found) {
@@ -485,9 +515,13 @@ export interface OrphanReconcileResult {
485
515
  */
486
516
  export function reconcileOrphanedTempWorkspaces(
487
517
  now = Date.now(),
488
- options?: { cleanupOrphanedTempDirs?: boolean },
518
+ options?: { cleanupOrphanedTempDirs?: boolean; tmpDir?: string; scanBatchSize?: number },
489
519
  ): OrphanReconcileResult {
490
- const tmpDir = getSafeTempDir();
520
+ // Injectable tmpDir + scanBatchSize for deterministic unit testing
521
+ // (Round 19: tests must not depend on global /tmp cleanliness; the
522
+ // production ORPHAN_TEMP_SCAN_BATCH_SIZE cap could exclude a test's dir
523
+ // when leftover dirs accumulate). Defaults remain getSafeTempDir() and that cap.
524
+ const tmpDir = options?.tmpDir ?? getSafeTempDir();
491
525
  if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
492
526
  let repaired = 0;
493
527
  let cleanedDirs = 0;
@@ -496,10 +530,11 @@ export function reconcileOrphanedTempWorkspaces(
496
530
  // Sort for deterministic order; cap to ORPHAN_TEMP_SCAN_BATCH_SIZE per
497
531
  // tick to avoid main-thread stalls when /tmp has thousands of
498
532
  // pi-crew-* dirs from past interrupted test runs.
533
+ const scanBatch = options?.scanBatchSize ?? ORPHAN_TEMP_SCAN_BATCH_SIZE;
499
534
  const candidates = entries
500
535
  .filter((e) => e.isDirectory() && e.name.startsWith("pi-crew-"))
501
536
  .sort((a, b) => a.name.localeCompare(b.name))
502
- .slice(0, ORPHAN_TEMP_SCAN_BATCH_SIZE);
537
+ .slice(0, scanBatch);
503
538
  for (const entry of candidates) {
504
539
  if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
505
540
  continue;
@@ -292,7 +292,16 @@ export async function runTeamTask(
292
292
  const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
293
293
  // E1 (Round 15): structured CrewError with code E009 + help hint,
294
294
  // instead of a raw Error. Surfaces the script path, exit code, and stderr.
295
- throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
295
+ // Round 21 (E4): if preStepOptional is set, a failing hook is NON-FATAL.
296
+ // Emit a best-effort 'hook.pre_step_optional_failed' event, then proceed without the
297
+ // pre-step output rather than aborting the task (advisory hooks).
298
+ if (input.step.preStepOptional) {
299
+ const warnMsg = `[preStepOptional] pre-step hook '${input.step.preStepScript}' failed (exit ${exitCode ?? "?"}) but preStepOptional=true; continuing without its output.`;
300
+ try { appendEventFireAndForget(manifest.eventsPath, { type: "hook.pre_step_optional_failed", runId: manifest.runId, taskId: task.id, message: warnMsg, data: { script: input.step.preStepScript, exitCode: exitCode ?? null } }); } catch { /* best-effort event log */ }
301
+ preStepOutput = undefined;
302
+ } else {
303
+ throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
304
+ }
296
305
  }
297
306
  }
298
307
 
@@ -455,6 +455,15 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
455
455
 
456
456
  return result;
457
457
  } catch (error) {
458
+ // Round 27 (BUG 1): the success path calls stopTeamHeartbeat() but this
459
+ // catch path did NOT. The team heartbeat is a non-unref'd setInterval
460
+ // (30s) that deliberately keeps the event loop alive — without this
461
+ // call, a failed team run leaves the interval firing forever and the
462
+ // foreground pi process hangs (never returns to the prompt); in
463
+ // background-runner mode the worker never exits. clearInterval is
464
+ // idempotent so a double-call (if this runs after the success path)
465
+ // is harmless.
466
+ stopTeamHeartbeat();
458
467
  // P1: Catch unhandled errors — ensure manifest/tasks/agents are terminal so they don't stay "running" forever.
459
468
  const message = error instanceof Error ? error.message : String(error);
460
469
  // Reload manifest with lock to avoid stale data overwriting concurrent writes.
@@ -922,8 +931,16 @@ tasks = mergeResult.resultTasks;
922
931
  await saveRunTasksAsync(finalManifest, tasks);
923
932
  });
924
933
  manifest = finalManifest;
925
- // Save health snapshot on run completion
926
- const crewRoot = path.dirname(path.dirname(finalManifest.stateRoot));
934
+ // Save health snapshot on run completion.
935
+ // BUG A (pts/2 hang investigation 2026-06-16): stateRoot = `<crewRoot>/state/runs/<runId>`,
936
+ // so the crew root is THREE dirnames up, not two. Two dirnames gave `<crewRoot>/state`
937
+ // (the state dir), and HealthStore then joined HEALTH_DIR (`.crew/state/health`)
938
+ // onto it → `<crewRoot>/state/.crew/state/health` — a double-joined BOGUS path.
939
+ // That wrote health snapshots to a nonexistent subtree (silently breaking the
940
+ // health feature) AND created junk dirs that the recursive state watcher then
941
+ // attached extra inotify watches to. Fix: compute the real crew root (3 up)
942
+ // and make HEALTH_DIR relative to it.
943
+ const crewRoot = path.dirname(path.dirname(path.dirname(finalManifest.stateRoot)));
927
944
  const healthStore = new HealthStore(crewRoot);
928
945
  healthStore.saveSnapshot({
929
946
  runId: finalManifest.runId,
@@ -57,7 +57,12 @@ export const CARGO_RUST_GATES: Array<{ name: string; command: string; critical:
57
57
  * Execute a single command and capture output.
58
58
  */
59
59
  /** Characters/patterns that indicate dangerous shell metacharacters. */
60
- const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[^&])/;
60
+ // Round 25 (VULN-3/VULN-4): also block raw newlines (sh -c treats \n as a
61
+ // command separator -> injection) and bare $VARNAME references (can exfiltrate
62
+ // secrets into captured gate output, e.g. `echo $ANTHROPIC_API_KEY`).
63
+ // $+word-char is blocked; special vars like $?/$$/$! are left alone. Built-in
64
+ // gates use only `2>&1` (no $VAR), so this does not break them.
65
+ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\$\w|\b(eval|exec)\b|>>|<[^^&]|[\r\n])/;
61
66
  // Note: single `>` is NOT blocked here because `2>&1` is a safe redirect used by built-in gates.
62
67
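  // `>>` (append) is still blocked. `<` (input redirect) is blocked unless the next character is `&` or `^` (NOTE(review): the `^` exemption comes from the `[^^&]` class in the pattern — confirm it is intended).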
63
68
 
@@ -66,7 +71,22 @@ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[
66
71
  * Rejects commands with shell metacharacters that could enable injection.
67
72
  * Allows: pipes (|), redirection of stderr (2>&1), and basic npm/cargo/npx commands.
68
73
  */
74
+ /** @internal — exported for injection-guard unit testing (Round 25). */
75
+ export function __test__validateGateCommand(command: string): void {
76
+ validateGateCommand(command);
77
+ }
78
+
69
79
  function validateGateCommand(command: string): void {
80
+ // Round 25 (VULN-3): check the ORIGINAL command for raw newlines BEFORE
81
+ // normalization. The regex below runs on the NORMALIZED command (which
82
+ // collapses \s+ incl. newlines to a single space), so a newline would be
83
+ // hidden from it - but `sh -c` treats a raw newline as a command
84
+ // separator, enabling injection (e.g. `npm test\nrm -rf x`).
85
+ if (/[\r\n]/.test(command)) {
86
+ throw new Error(
87
+ `Security: verification gate command rejected (raw newline - potential command injection): ${JSON.stringify(command)}`,
88
+ );
89
+ }
70
90
  const normalized = command
71
91
  .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '') // ANSI escape sequences
72
92
  .replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '') // control chars