pi-crew 0.9.3 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,132 @@
1
1
  # Changelog
2
2
 
3
+ ## [v0.9.5] — fix "team run hangs forever at 25%" (2026-06-23)
4
+
5
+ Two coupled runtime bugs caused the recurring "run stuck at 25% (1/4)" failure
6
+ observed across 4+ consecutive review/fast-fix runs. Both are now fixed; full
7
+ diagnostics (background.log, events.jsonl, heartbeat.json) are preserved for
8
+ all runs.
9
+
10
+ ### Bug X — `purgeStaleActiveRunIndex` destroyed the run's stateRoot (proximate cause)
11
+
12
+ **File:** `src/runtime/crash-recovery.ts`
13
+
14
+ **What was wrong:** `purgeStaleActiveRunIndex` decided whether a run was
15
+ "orphaned" using `entry.updatedAt`, which is **frozen at registration** and
16
+ never refreshed during execution. A long-running legitimate async run whose
17
+ background worker had exited (e.g. after a 5–15 min explorer) would have its
18
+ entire durable state (manifest/tasks/events/heartbeat) hard-deleted. Because
19
+ `saveRunTasks()` silently no-ops once the state dir is missing, the workflow
20
+ could never advance past the current task → **permanent invisible hang**
21
+ ("Run not found"), with all diagnostics lost.
22
+
23
+ **Fix:**
24
+ - Liveness is now corroborated via (a) the on-disk `manifest.updatedAt` (rewritten
25
+ on every task transition) and (b) the team-level `heartbeat.json` mtime —
26
+ either of which is sufficient to declare the run live.
27
+ - Cancelling a run now **keeps its stateRoot** so the run stays queryable and
28
+ resumable, and its diagnostics survive. The finished-run pruner removes the
29
+ directory later on its normal schedule.
30
+ - Removed two redundant `saveRunManifest(fullLoaded.manifest)` calls that
31
+ were clobbering the freshly-saved `cancelled` status back to `running`.
32
+
33
+ **New regression test:** `test/unit/crash-recovery-purge-liveness.test.ts`
34
+ (3 cases: fresh manifest kept, orphan cancelled-but-preserved, fresh
35
+ heartbeat kept — all using a live-worker-then-reap + `now`-time-shift
36
+ harness to deterministically simulate the registration-then-aging race).
37
+
38
+ ### Bug Y — background runner crashed with EPIPE on the first post-detach `console.debug` (root cause)
39
+
40
+ **File:** `src/runtime/background-runner.ts`
41
+
42
+ **What was wrong:** The in-process console redirect only covered `console.log`
43
+ and `console.error`; `console.debug` and `console.warn` still wrote to the
44
+ original stdout/stderr pipes. The background runner is spawned with
45
+ `detached:true` + `setsid:true`, so the parent disconnects the stdio pipes
46
+ immediately after spawn. The first post-detach `console.debug` call from
47
+ `team-runner.ts:242` (inside `mergeTaskUpdatesPreservingTerminal` →
48
+ "Skipping stale merge") hit the closed stdout → unhandled `EPIPE` error →
49
+ **process exit** → scheduler dead → run stuck at 25% forever.
50
+
51
+ Prior investigators saw only "the run died silently right after explorer
52
+ completed" and concluded (incorrectly) that the cause was a native crash
53
+ (SIGKILL/segfault/V8 heap-OOM), because their [DIAG] handlers never fired.
54
+ In reality the diagnostic handlers DID fire — but on an `EPIPE` write error,
55
+ which `process.on('error')` doesn't catch. The fix below makes the crash
56
+ observable AND non-fatal.
57
+
58
+ **Fix:**
59
+ - Extend the console redirect to also cover `console.debug` and `console.warn`,
60
+ so they go to the log file (logFd) instead of the disconnected stdio pipes.
61
+ - Wrap the `fs.writeSync` in try-catch so any log-write failure (closed fd,
62
+ ENOSPC, etc.) can never crash the scheduler. The scheduler log is
63
+ best-effort by design.
64
+
65
+ **New regression test:** `test/unit/background-runner-console-redirect.test.ts`
66
+ (4 cases: undefined logFd no-op, valid logFd writes correctly, EBADF on
67
+ closed logFd is swallowed, post-undefined fd-toggle is safe). Replicates the
68
+ `origWrite` pattern from the source so any drift between the two is easy to
69
+ spot.
70
+
71
+ ### Why this took multiple attempts
72
+
73
+ All prior attempts to diagnose the hang destroyed the only evidence (the
74
+ stateRoot) the moment the `purgeStaleActiveRunIndex` heuristic misfired.
75
+ The chain was always the same: a worker exits for any reason → purge sees
76
+ dead PID + frozen-stale entry → **deletes stateRoot** → the run becomes
77
+ "Run not found" with no log, no events, no heartbeat, no way to even resume.
78
+ That hid the real cause (Bug Y) for the entire series of failed diagnostic
79
+ runs. With Bug X fixed, the diagnostic trail (background.log 345 KB +
80
+ events.jsonl 166 KB) survives long enough to read the actual EPIPE crash
81
+ that Bug Y left behind.
82
+
83
+ ### Verification
84
+
85
+ - 7/7 new regression tests pass (`crash-recovery-purge-liveness.test.ts` +
86
+ `background-runner-console-redirect.test.ts`).
87
+ - Existing crash-recovery / active-run-registry / stale-reconciler /
88
+ async-stale / run-accumulation / auto-recovery suites: 71/71 pass.
89
+ - End-to-end: a 4-step review run now advances 3/4 tasks (75%) instead of
90
+ hanging at 25%; the verify step that would have failed earlier now fails
91
+ only for environmental reasons (memory OOM under load), not because of the fix.
92
+ - `npx tsc --noEmit` is green.
93
+
94
+ ### Notes for users
95
+
96
+ If you have a stuck "running" run from v0.9.4 or earlier (the symptom was
97
+ "Run not found" / "25% hang" / "had to kill pi"), upgrading alone will not
98
+ recover it — its `stateRoot` was already destroyed by the buggy purge.
99
+ Re-dispatch the workflow. New runs are fully protected.
100
+
101
+ ## [v0.9.4] — fix macOS CI: benchmark allowlist + cross-platform fixtures (2026-06-23)
102
+
103
+ Patch fix for a CI failure introduced in v0.9.3 (caught by the macOS CI job,
104
+ which the v0.9.3 release unfortunately did not wait for — lesson learned).
105
+
106
+ ### What was wrong
107
+
108
+ The v0.9.3 benchmark test fixtures used `grep --help` as a benign exit-0
109
+ command. GNU grep (Linux) exits 0, but **BSD grep (macOS) does not support
110
+ `--help`** and exits 2 — so `runBenchmarkSuite computes total counts` failed
111
+ on macOS CI (`2 !== 0`). Local Linux verification missed this.
112
+
113
+ ### Fix
114
+
115
+ - `benchmark-runner.ts`: added `echo` to the command allowlist. Safe because the
116
+ shell-metachar blocker already rejects command substitution (`$(…)`, backticks),
117
+ so `echo $(evil)` cannot execute; bare `echo …` only prints. `echo` is the
118
+ canonical cross-platform exit-0 command.
119
+ - `test/unit/benchmark.test.ts`: fixtures switched from `grep --help` → `echo ok`
120
+ (exits 0 on Linux/macOS/Windows-sh). The "not in allowlist" test now uses `ls`
121
+ (genuinely disallowed).
122
+
123
+ ### Process note
124
+
125
+ This is the release where the project re-commits to: **tag/publish ONLY after
126
+ the full OS matrix CI (ubuntu/windows/macos) is green.** v0.9.3 was published
127
+ mid-CI-run; the package itself is correct (the broken file is test-only and
128
+ not shipped), but the repo CI went red. v0.9.4 restores green CI.
129
+
3
130
  ## [v0.9.3] — security hardening + crash-diagnostics (code review 2026-06-23)
4
131
 
5
132
  Patch release addressing findings from a full codebase code review
package/README.md CHANGED
@@ -39,13 +39,65 @@ npm: pi-crew
39
39
  repo: https://github.com/baphuongna/pi-crew
40
40
  ```
41
41
 
42
- **v0.9.0**: See [CHANGELOG.md](CHANGELOG.md).
42
+ **v0.9.4 / v0.9.5**: See [CHANGELOG.md](CHANGELOG.md).
43
43
 
44
- ### Highlights (v0.6.4 → v0.9.0)
44
+ ### Highlights (v0.6.4 → v0.9.5)
45
45
 
46
46
  A long arc of **trust, cliff-resilience, and robustness** work. Principle: *build
47
47
  trust and cliff-resilience, stay lean, delete before adding.*
48
48
 
49
+ #### v0.9.5 — fix "team run hangs forever at 25%" (2026-06-23)
50
+ Two coupled runtime bugs caused recurring "run stuck at 25% (1/4)" failures
51
+ across 4+ consecutive review/fast-fix runs. The combined symptom: scheduler
52
+ appears to stop responding right after the first task (explorer) finishes, no
53
+ progress to task 2, and `team action='status'` returns "Run not found" with
54
+ **no diagnostic trail** to investigate. Manual `kill` of the parent `pi`
55
+ process was the only workaround.
56
+
57
+ - **🩹 Bug X (proximate cause)** — `purgeStaleActiveRunIndex`
58
+ (`src/runtime/crash-recovery.ts`) destroyed a run's `stateRoot` based on a
59
+ **frozen** `entry.updatedAt` (set once at registration, never refreshed).
60
+ Any long-running legitimate async run (≥5 min) whose worker had exited
61
+ lost its entire durable state. `saveRunTasks()` then silently no-op'd on
62
+ the missing dir, and the workflow could never advance. Fix: corroborate
63
+ liveness via the on-disk `manifest.updatedAt` or the team-level
64
+ `heartbeat.json`; keep `stateRoot` on cancel so runs stay queryable and
65
+ resumable.
66
+ - **🩹 Bug Y (root cause — why the scheduler died in the first place)** —
67
+ `src/runtime/background-runner.ts` redirected only `console.log` /
68
+ `console.error` to the log file. The first post-detach `console.debug`
69
+ call from `team-runner.ts:242` (inside `mergeTaskUpdatesPreservingTerminal`
70
+ → "Skipping stale merge") hit the disconnected stdout pipe → unhandled
71
+ `EPIPE` → process exit. Prior investigators concluded (incorrectly) that
72
+ the cause was a native crash, because diagnostic `[DIAG]` handlers never
73
+ fired on the EPIPE. Fix: extend the console redirect to `console.debug` /
74
+ `console.warn`, and wrap `fs.writeSync` in try-catch so any log-write
75
+ failure can never crash the scheduler.
76
+ - **🧪 Regression coverage** — 7 new tests: 3 in
77
+ `test/unit/crash-recovery-purge-liveness.test.ts` (fresh-manifest-kept,
78
+ orphan-cancelled-preserved, fresh-heartbeat-kept) + 4 in
79
+ `test/unit/background-runner-console-redirect.test.ts` (drift-detector
80
+ pattern that exercises undefined / valid / EBADF / post-toggle logFd).
81
+ - **📖 See [CHANGELOG.md](CHANGELOG.md) for full details**, including
82
+ why prior attempts to diagnose the hang kept destroying the only
83
+ evidence (Bug X nuked the stateRoot before anyone could read the EPIPE
84
+ crash in Bug Y).
85
+
86
+ > **Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
87
+ > those runs is already gone. Re-dispatch the workflow — new runs are
88
+ > fully protected.
89
+
90
+ #### v0.9.4 — macOS CI fixture (2026-06-23)
91
+ - **🧪 BSD-vs-GNU grep fix** — benchmark test fixtures used
92
+ `grep --help` (exits 0 on GNU/Linux, exits 2 on BSD/macOS). Switched
93
+ the exit-0 fixture to `echo ok`; the not-in-allowlist fixture is now
94
+ `ls`. CI matrix is now green on all 3 OSes.
95
+ - **📌 Process note** — this release re-commits to: **tag/publish ONLY
96
+ after the full OS matrix CI is green.** v0.9.3 was published mid-CI-run
97
+ (the macOS job hadn't finished); the package itself was correct (the
98
+ broken file is test-only and not shipped), but the repo CI went red.
99
+ v0.9.4 restores green CI. v0.9.5 follows the same discipline.
100
+
49
101
  #### v0.9.0 — goal loops + dynamic workflows (2026-06-18)
50
102
  Two new features, both modeled on Claude Code, built on a shared `runKind`
51
103
  background-dispatch discriminator.
@@ -74,6 +74,32 @@ team action='cancel' runId=… # cancel a truly-dead run
74
74
 
75
75
  The error message explains the heartbeat mechanism + remediation.
76
76
 
77
+ ### "Run not found" but `team list` shows it / scheduler appears frozen at 25%
78
+
79
+ **Symptom:** an async `team action='run'` (e.g. a review) gets through the
80
+ first task (e.g. explorer), then the scheduler appears to stop responding.
81
+ `team action='status' runId=…` returns `Run not found`; the run's
82
+ `stateRoot` (in `<project>/.crew/state/runs/<runId>/`) is missing. TUI
83
+ progress shows the run stuck at the same task percentage forever, and the
84
+ only workaround was killing the parent `pi` process.
85
+
86
+ **This was the symptom in v0.9.4 and earlier**, caused by two coupled runtime bugs:
87
+
88
+ - **Bug X** (proximate): `purgeStaleActiveRunIndex` destroyed the
89
+ `stateRoot` of long-running legitimate async runs based on a frozen
90
+ `entry.updatedAt` (set at registration, never refreshed).
91
+ - **Bug Y** (root cause): the bg-runner crashed with an unhandled `EPIPE`
92
+ on the first `console.debug` after the parent detached its stdio pipes.
93
+
94
+ **Fixed in v0.9.5** (see [CHANGELOG.md](../CHANGELOG.md#v095--fix-team-run-hangs-forever-at-25-2026-06-23)).
95
+ With the fix, a long-running run is no longer falsely purged, and even if the
96
+ bg-runner dies, the `stateRoot`, `background.log`, `events.jsonl`, and
97
+ `heartbeat.json` survive — runs stay queryable and resumable.
98
+
99
+ **Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
100
+ those runs is already gone (Bug X nuked it). Re-dispatch the workflow. New
101
+ runs on v0.9.5+ are fully protected.
102
+
77
103
  ## Model fallback exhausted
78
104
 
79
105
  **Symptom:** `All N candidates exhausted (tried: a → b → c)`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.9.3",
3
+ "version": "0.9.5",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -46,9 +46,13 @@ function validateCommand(command: string): void {
46
46
  // execution without any shell metacharacter (e.g. `npx --yes evil-package`
47
47
  // or `node -e "require('fs')…"`). Use `npm test`/`npm run …` instead of raw
48
48
  // `node`/`npx` in benchmark task definitions.
49
- const allowlist = /^(pytest|grep|npm test|cargo test|cargo clippy) /;
49
+ // `echo` is allowed because the metachar blocker (validateGateCommand) rejects
50
+ // command substitution (`$(...)`, backticks), so `echo $(evil)` cannot run;
51
+ // bare `echo …` only prints. It's the canonical exit-0 command used in
52
+ // benchmark fixtures across Linux/macOS/Windows(sh).
53
+ const allowlist = /^(pytest|grep|npm test|cargo test|cargo clippy|echo) /;
50
54
  if (!allowlist.test(command)) {
51
- throw new Error(`Command not allowed: ${command}. Only pytest, grep, npm test, cargo test/clippy allowed.`);
55
+ throw new Error(`Command not allowed: ${command}. Only pytest, grep, npm test, cargo test/clippy, echo allowed.`);
52
56
  }
53
57
 
54
58
  // Block shell metacharacters after command name
@@ -67,6 +67,7 @@ export function suggestRunIds(_prefix: string, cwd?: string): AutocompleteItem[]
67
67
  export async function suggestTaskIds(runId: string, prefix: string, cwd?: string): Promise<AutocompleteItem[] | null> {
68
68
  const resolvedCwd = cwd ?? process.cwd();
69
69
  // Dynamic import to avoid pulling state-store into the hot command-registration path.
70
+ // LAZY: defer dynamic import of ../state/state-store.ts to its call site.
70
71
  const { loadRunManifestById } = await import("../state/state-store.ts");
71
72
  const loaded = loadRunManifestById(resolvedCwd, runId);
72
73
  if (!loaded) return null;
@@ -34,6 +34,7 @@ const CREW_SHORTCUTS: ReadonlyArray<ShortcutRegistration> = [
34
34
  // (avoids pulling the full commands.ts dependency tree into every
35
35
  // process that imports this module, e.g. the unit test).
36
36
  handler: async (ctx) => {
37
+ // LAZY: defer dynamic import of ./registration/commands.ts to its call site.
37
38
  const { openTeamSettingsOverlay } = await import("./registration/commands.ts");
38
39
  await openTeamSettingsOverlay(ctx);
39
40
  },
@@ -1129,6 +1129,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1129
1129
  // LAZY: state-store only needed in hasRunning; avoid at startup.
1130
1130
  // Use dynamic import to avoid CJS/ESM mixed module issues.
1131
1131
  const { loadRunManifestById: loadRunForHasRunning } =
1132
+ // LAZY: defer dynamic import of ../state/state-store.ts to its call site.
1132
1133
  await import("../state/state-store.ts");
1133
1134
  const loaded = loadRunForHasRunning(
1134
1135
  currentCtx?.cwd ?? process.cwd(),
@@ -1494,6 +1495,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1494
1495
  const cwd = ctx.cwd ?? process.cwd();
1495
1496
  const loaded = loadRunManifestById(cwd, runId);
1496
1497
  if (loaded) {
1498
+ // LAZY: defer dynamic import of ../state/atomic-write.ts to its call site.
1497
1499
  const { atomicWriteJson } = await import("../state/atomic-write.ts");
1498
1500
  atomicWriteJson(loaded.manifest.stateRoot + "/manifest.json", {
1499
1501
  ...loaded.manifest,
@@ -202,11 +202,13 @@ export async function openTeamSettingsOverlay(ctx: ExtensionContext): Promise<vo
202
202
  if (res.success) {
203
203
  ctx.ui.notify(`Theme: ${value} (applied live)`, "info");
204
204
  } else {
205
+ // LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
205
206
  const { setPiTheme } = await import("../../ui/theme-discovery.ts");
206
207
  setPiTheme(value);
207
208
  ctx.ui.notify(`Theme saved as '${value}' but failed to apply: ${res.error ?? "unknown"}. Restart Pi.`, "warning");
208
209
  }
209
210
  } else {
211
+ // LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
210
212
  const { setPiTheme } = await import("../../ui/theme-discovery.ts");
211
213
  setPiTheme(value);
212
214
  ctx.ui.notify(`Pi theme set to '${value}'. Restart Pi to apply.`, "info");
@@ -672,6 +674,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
672
674
  pi.registerCommand("crew-brief", {
673
675
  description: "Toggle brief tool output mode: on | off | status",
674
676
  handler: async (args: string, ctx: ExtensionCommandContext) => {
677
+ // LAZY: defer dynamic import of ../../ui/tool-renderers/brief-mode.ts to its call site.
675
678
  const { isBrief, setBrief, BRIEF_ENTRY_TYPE, makeBriefEntry } = await import("../../ui/tool-renderers/brief-mode.ts");
676
679
  const trimmed = args.trim();
677
680
 
@@ -269,6 +269,7 @@ async function handleStop(input: GoalSubActionInput): Promise<ReturnType<typeof
269
269
  let cancelMsg = "";
270
270
  if (updated.currentRunId) {
271
271
  try {
272
+ // LAZY: defer dynamic import of ./cancel.ts to its call site.
272
273
  const { handleCancel } = await import("./cancel.ts");
273
274
  const cancelResult = await handleCancel({ action: "cancel", runId: updated.currentRunId, force: true, config: { intent: "user requested goal stop" } }, ctx);
274
275
  cancelMsg = ` In-flight turn ${updated.currentRunId} cancel: ${(cancelResult.content[0] as { text?: string } | undefined)?.text ?? "ok"}.`;
@@ -34,6 +34,7 @@ import { expandParallelResearchWorkflow } from "../../runtime/parallel-research.
34
34
  /**
35
35
  * Module-scoped latch for the crew-init dynamic import.
36
36
  *
37
+ * LAZY: defer dynamic import of module to its call site.
37
38
  * `crew-init.ts` is dynamically `await import()`'d from `handleRun` below, which
38
39
  * N concurrent subagents hit simultaneously (every `team` tool call runs it).
39
40
  * Under the tsx/jiti loader, concurrent first-imports race module-record
@@ -296,6 +297,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
296
297
  // orchestrates subagents via ctx.agent(); only ctx.setResult() reaches the main context.
297
298
  // Placed AFTER manifest creation so runId/paths/artifactsRoot are available.
298
299
  if (!directAgent && (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).runtime === "dynamic") {
300
+ // LAZY: defer dynamic import of ../../runtime/dynamic-workflow-runner.ts to its call site.
299
301
  const { runDynamicWorkflow } = await import("../../runtime/dynamic-workflow-runner.ts");
300
302
  // Re-synthesize a dynamic-team (§0c C9) for role resolution.
301
303
  const dwfTeam: import("../../teams/team-config.ts").TeamConfig = {
@@ -323,11 +323,28 @@ async function main(): Promise<void> {
323
323
  const origWrite =
324
324
  (_prefix: string) =>
325
325
  (data: unknown, ...args: unknown[]) => {
326
+ // FIX: Never let the in-process console redirect crash the background
327
+ // runner. If logFd is missing/invalid or the write fails, swallow the
328
+ // error silently — losing one debug line is far better than killing the
329
+ // scheduler (a previous version only redirected console.log/error, so
330
+ // console.debug/.warn still wrote to the original stdout/stderr pipe
331
+ // which is closed after the parent detaches, producing EPIPE → process
332
+ // crash mid-workflow → runs hang at 25% forever).
333
+ if (logFd === undefined) return;
326
334
  const msg = [data, ...args].map(String).join(" ") + "\n";
327
- fs.writeSync(logFd!, msg);
335
+ try {
336
+ fs.writeSync(logFd, msg);
337
+ } catch {
338
+ /* best-effort: never crash the scheduler over a log write */
339
+ }
328
340
  };
329
341
  console.log = origWrite("OUT");
330
342
  console.error = origWrite("ERR");
343
+ // FIX: Also redirect console.debug and console.warn — otherwise they still
344
+ // hit the original stdout/stderr pipe, which is closed once the parent
345
+ // process detaches, causing EPIPE unhandled errors that kill the scheduler.
346
+ console.debug = origWrite("DBG");
347
+ console.warn = origWrite("WARN");
331
348
  // FIX: Close logFd on process exit to prevent file descriptor leak
332
349
  process.on("exit", () => {
333
350
  try {
@@ -558,8 +575,11 @@ async function main(): Promise<void> {
558
575
  debugLog(`[background-runner] short-circuiting ${manifest.runKind} (synthetic team/workflow)`,
559
576
  );
560
577
  if (manifest.runKind === "goal-loop") {
578
+ // LAZY: defer dynamic import of ./goal-loop-runner.ts to its call site.
561
579
  const { runGoalLoop } = await import("./goal-loop-runner.ts");
580
+ // LAZY: defer dynamic import of ./goal-state-store.ts to its call site.
562
581
  const { GoalStore } = await import("./goal-state-store.ts");
582
+ // LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
563
583
  const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
564
584
  const store = new GoalStore(manifest.cwd);
565
585
  const goalState = store.load(manifest.runId);
@@ -576,7 +596,9 @@ async function main(): Promise<void> {
576
596
  saveRunManifest(finalGoalManifest);
577
597
  earlyResult = { manifest: finalGoalManifest, tasks: goalResult.tasks };
578
598
  } else {
599
+ // LAZY: defer dynamic import of ./dynamic-workflow-runner.ts to its call site.
579
600
  const { runDynamicWorkflow } = await import("./dynamic-workflow-runner.ts");
601
+ // LAZY: defer dynamic import of ../workflows/discover-workflows.ts to its call site.
580
602
  const { allWorkflows, discoverWorkflows } = await import("../workflows/discover-workflows.ts");
581
603
  const wf = allWorkflows(discoverWorkflows(manifest.cwd)).find((w) => w.name === manifest.workflow);
582
604
  if (!wf || wf.runtime !== "dynamic" || !wf.dynamicScript) throw new Error(`runKind="dynamic-workflow" but workflow '${manifest.workflow}' is not dynamic (runId=${manifest.runId})`);
@@ -246,6 +246,7 @@ export class ChainRunner {
246
246
 
247
247
  // Emit progress event if eventsPath provided
248
248
  if (eventsPath) {
249
+ // LAZY: defer dynamic import of ../state/event-log.ts to its call site.
249
250
  const { appendEventAsync } = await import("../state/event-log.ts");
250
251
  await appendEventAsync(eventsPath, {
251
252
  type: "chain.step_completed",
@@ -1,10 +1,11 @@
1
1
  import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
2
2
  import * as fs from "node:fs";
3
+ import * as path from "node:path";
3
4
  import type { MetricRegistry } from "../observability/metric-registry.ts";
4
5
  import { appendEvent, scanSequence } from "../state/event-log.ts";
5
6
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
7
  import { withRunLockSync } from "../state/locks.ts";
7
- import { loadRunManifestById, saveRunManifest, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
+ import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
9
  import type { TeamTaskState } from "../state/types.ts";
9
10
  import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
11
  import type { ManifestCache } from "./manifest-cache.ts";
@@ -215,6 +216,43 @@ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): voi
215
216
  // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
216
217
  }
217
218
 
219
+ /**
220
+ * Age (ms) of the team-level heartbeat file for a run. The team-runner writes
221
+ * `<stateRoot>/heartbeat.json` periodically while a workflow is executing
222
+ * (startTeamHeartbeat), so a fresh heartbeat is strong evidence the run is alive
223
+ * even when its recorded PID check is inconclusive or its active-run-index
224
+ * entry's `updatedAt` was frozen at registration. Returns Infinity when absent.
225
+ */
226
+ function heartbeatAgeMs(entry: { stateRoot: string }, now: number): number {
227
+ try {
228
+ const mtime = fs.statSync(path.join(entry.stateRoot, "heartbeat.json")).mtimeMs;
229
+ return Number.isFinite(mtime) ? now - mtime : Infinity;
230
+ } catch {
231
+ return Infinity;
232
+ }
233
+ }
234
+
235
+ /**
236
+ * True if there is recent evidence the run is (or was very recently) alive, so
237
+ * it must NOT be purged. Any one of these signals is sufficient:
238
+ * - on-disk `manifest.updatedAt` fresher than `staleThresholdMs` (rewritten on
239
+ * every task transition / status change), and/or
240
+ * - team-level `heartbeat.json` fresher than `staleThresholdMs`.
241
+ * `entry.updatedAt` is intentionally NOT consulted: it is frozen at
242
+ * registration and never refreshed during execution, which previously caused
243
+ * long-running legitimate runs to be falsely purged — destroying their
244
+ * stateRoot, and because saveRunTasks() silently no-ops once the state dir is
245
+ * gone, hanging the workflow permanently at the current task with no
246
+ * recoverable state ("Run not found").
247
+ */
248
+ function hasRecentLifeEvidence(entry: { stateRoot: string }, manifestUpdatedAt: string | undefined, now: number, staleThresholdMs: number): boolean {
249
+ const manifestMs = manifestUpdatedAt ? new Date(manifestUpdatedAt).getTime() : NaN;
250
+ if (Number.isFinite(manifestMs) && now - manifestMs <= staleThresholdMs) return true;
251
+ const hbAge = heartbeatAgeMs(entry, now);
252
+ if (Number.isFinite(hbAge) && hbAge <= staleThresholdMs) return true;
253
+ return false;
254
+ }
255
+
218
256
  /**
219
257
  * Purge the global active-run-index of entries whose manifest is no longer active.
220
258
  *
@@ -244,7 +282,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
244
282
  }
245
283
 
246
284
  // 3. Read manifest status
247
- let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
285
+ let manifest: { status?: string; updatedAt?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
248
286
  try {
249
287
  manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
250
288
  } catch {
@@ -262,46 +300,52 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
262
300
  continue;
263
301
  }
264
302
 
265
- // 5. Still "running" check if worker PID is dead and no heartbeat
303
+ // 5. Still "running" with an async worker PID — only purge when the worker
304
+ // is actually dead AND there is no recent evidence of life. We must NOT
305
+ // rely solely on `entry.updatedAt` (frozen at registration) nor on a single
306
+ // dead-PID reading: a long-running worker (e.g. a 15-minute explorer)
307
+ // legitimately keeps the run "running" while periodically rewriting the
308
+ // on-disk manifest.updatedAt and heartbeat.json. Falsely purging such a run
309
+ // destroys its stateRoot, and because saveRunTasks() silently no-ops once
310
+ // the state dir is gone, the workflow then hangs permanently at the
311
+ // current task with no recoverable state ("Run not found"). When we do mark
312
+ // a run cancelled here, we KEEP its stateRoot so the run stays queryable/
313
+ // resumable and its diagnostics survive; the finished-run pruner removes
314
+ // the directory later on its normal schedule.
266
315
  if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
267
316
  const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
268
- if (!pidAlive) {
269
- // Check age if manifest hasn't been updated in > threshold, it's stale
270
- const updatedAt = new Date(entry.updatedAt).getTime();
271
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
272
- // Dead PID + stale update → cancel the manifest and unregister
273
- try {
274
- const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
275
- if (fullLoaded) {
276
- const now_iso = new Date(now).toISOString();
277
- const repairedTasks = fullLoaded.tasks.map((task) => {
278
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
279
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
280
- }
281
- return task;
282
- });
283
- saveRunTasks(fullLoaded.manifest, repairedTasks);
284
- for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
285
- updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
286
- saveRunManifest(fullLoaded.manifest);
287
- void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
288
- }
289
- } catch {
290
- // Best-effort manifest cleanup
317
+ if (!pidAlive && !hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
318
+ // Dead PID + no recent life evidence → cancel the manifest and unregister
319
+ try {
320
+ const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
321
+ if (fullLoaded) {
322
+ const now_iso = new Date(now).toISOString();
323
+ const repairedTasks = fullLoaded.tasks.map((task) => {
324
+ if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
325
+ return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
326
+ }
327
+ return task;
328
+ });
329
+ saveRunTasks(fullLoaded.manifest, repairedTasks);
330
+ for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
331
+ updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
332
+ void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
291
333
  }
292
- unregisterActiveRun(entry.runId);
293
- tryRemoveRunDirectories(entry);
294
- purged.push(entry.runId);
295
- continue;
334
+ } catch {
335
+ // Best-effort manifest cleanup
296
336
  }
337
+ unregisterActiveRun(entry.runId);
338
+ purged.push(entry.runId);
339
+ continue;
297
340
  }
298
341
  }
299
342
 
300
- // 6. "running" but no async worker PID — possible orphaned run where manifest
301
- // was never updated after worker exit. Check updatedAt age.
343
+ // 6. "running" but no async worker PID — possible orphaned run where the
344
+ // manifest was never updated to a terminal status after the worker exited.
345
+ // Uses the same life-evidence corroboration as condition 5; the stateRoot is
346
+ // kept on cancel so the run stays queryable/resumable with diagnostics.
302
347
  if (manifest?.status === "running" && manifest.async === undefined) {
303
- const updatedAt = new Date(entry.updatedAt).getTime();
304
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
348
+ if (!hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
305
349
  try {
306
350
  const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
307
351
  if (fullLoaded && fullLoaded.manifest.status === "running") {
@@ -315,14 +359,12 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
315
359
  saveRunTasks(fullLoaded.manifest, repairedTasks);
316
360
  for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
317
361
  updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
318
- saveRunManifest(fullLoaded.manifest);
319
362
  void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
320
363
  }
321
364
  } catch {
322
365
  // Best-effort
323
366
  }
324
367
  unregisterActiveRun(entry.runId);
325
- tryRemoveRunDirectories(entry);
326
368
  purged.push(entry.runId);
327
369
  continue;
328
370
  }
@@ -85,6 +85,7 @@ async function loadWorkflowModule(scriptPath: string): Promise<DynamicWorkflowSc
85
85
  // lazily so this module stays importable in environments without jiti (type-only consumers).
86
86
  // Fix round-4: use createRequire(import.meta.url) so `require` works under the strip-types
87
87
  // loader fallback (Node ≥ 22.6) where bare `require` is not defined in ESM scope.
88
+ // LAZY: defer dynamic import of node:module to its call site.
88
89
  const { createRequire } = await import("node:module");
89
90
  const require = createRequire(import.meta.url);
90
91
  // eslint-disable-next-line @typescript-eslint/no-require-imports
@@ -121,6 +121,7 @@ export const realGoalEvaluator = async (
121
121
  }
122
122
  if (!verificationCompromised) {
123
123
  try {
124
+ // LAZY: defer dynamic import of ./verification-gates.ts to its call site.
124
125
  const { executeVerificationCommands } = await import("./verification-gates.ts");
125
126
  const contract = { requiredGreenLevel: "none" as const, commands: goal.verification.commands, allowManualEvidence: goal.verification.allowManualEvidence ?? false };
126
127
  // Phase 1.5 #2 (RFC 16): run verification in a pristine git worktree at
@@ -131,6 +132,7 @@ export const realGoalEvaluator = async (
131
132
  let worktreeCwd: string | undefined;
132
133
  let worktreeCleanup: (() => void) | undefined;
133
134
  try {
135
+ // LAZY: defer dynamic import of ./verification-worktree.ts to its call site.
134
136
  const { checkWorktreeSandboxAvailable, prepareVerificationWorktree } = await import("./verification-worktree.ts");
135
137
  const availability = checkWorktreeSandboxAvailable(goal.cwd);
136
138
  if (availability.available) {
@@ -36,6 +36,7 @@ import { listLiveAgents } from "./live-agent-manager.ts";
36
36
  * Module-scoped latch for the optional peer dependency import. When N
37
37
  * in-process live-session subagents spawn CONCURRENTLY (e.g. several
38
38
  * `Agent({run_in_background:true})` started at once), each used to call
39
+ * (LAZY: the dynamic import of @earendil-works/pi-coding-agent is deferred to its call site.)
39
40
  * `await import("@earendil-works/pi-coding-agent")` independently. Under the
40
41
  * tsx loader (registering load/resolve hooks), concurrent first-imports can
41
42
  * each enter the loader and race module-record instantiation, yielding
@@ -128,6 +128,7 @@ export async function readEnabledModelsPatterns(cwd: string, agentDir?: string):
128
128
  // SDK. SettingsManager is dynamically imported because the module
129
129
  // shape differs across pi versions; the create() factory is the
130
130
  // canonical, version-stable entry point.
131
+ // LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
131
132
  const mod = await import("@earendil-works/pi-coding-agent" as string).catch(() => null);
132
133
  if (!mod) return [];
133
134
  const SettingsManagerCtor = (mod as { SettingsManager?: { create?: (cwd: string, agentDir?: string) => { getEnabledModels?: () => string[] | undefined } } }).SettingsManager;
@@ -239,6 +239,7 @@ export function primePeerDep(): Promise<PeerDepModule> {
239
239
  if (!resolved) {
240
240
  throw new Error(buildMissingMessage());
241
241
  }
242
+ // LAZY: defer the dynamic import of the resolved peer-dependency module (resolved.mainUrl) to its call site.
242
243
  cachedModule = (await import(resolved.mainUrl)) as PeerDepModule;
243
244
  return cachedModule;
244
245
  })();
@@ -133,6 +133,7 @@ export function wrapEditWithResilientReplace(pi: ExtensionAPI, tools?: { edit: T
133
133
  throw new Error("old_string not found (and resilient retry skipped: missing path/old/new)");
134
134
  }
135
135
 
136
+ // LAZY: defer dynamic import of node:fs/promises to its call site.
136
137
  const fs = await import("node:fs/promises");
137
138
  let content: string;
138
139
  try {
@@ -289,6 +289,7 @@ export async function runTeamTask(
289
289
  // follow it and execute a script outside cwd. Throws on escape.
290
290
  resolveRealContainedPath(manifest.cwd, input.step.preStepScript);
291
291
  try {
292
+ // LAZY: defer dynamic import of node:child_process to its call site.
292
293
  const { execFileSync } = await import("node:child_process");
293
294
  preStepOutput = execFileSync(input.step.preStepScript, scriptArgs, {
294
295
  timeout: scriptTimeout,
@@ -11,7 +11,9 @@ let pathsInstance: typeof import("../utils/paths.js") | null = null;
11
11
 
12
12
  async function getStore() {
13
13
  if (!storeInstance) {
14
+ // LAZY: defer dynamic import of ./instinct-store.js to its call site.
14
15
  const { InstinctStore } = await import("./instinct-store.js");
16
+ // LAZY: defer dynamic import of ../utils/paths.js to its call site.
15
17
  const paths = await import("../utils/paths.js");
16
18
  storeInstance = new InstinctStore(paths.projectCrewRoot(process.cwd()));
17
19
  }
@@ -20,6 +22,7 @@ async function getStore() {
20
22
 
21
23
  async function getPaths() {
22
24
  if (!pathsInstance) {
25
+ // LAZY: defer dynamic import of ../utils/paths.js to its call site.
23
26
  pathsInstance = await import("../utils/paths.js");
24
27
  }
25
28
  return pathsInstance;
@@ -156,6 +156,7 @@ interface AgentSearchResult {
156
156
  * Uses dynamic import to avoid ESM/CJS issues at module load time.
157
157
  */
158
158
  export async function searchAgents(query: string, options?: { limit?: number }): Promise<AgentSearchResult[]> {
159
+ // LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
159
160
  const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
160
161
  const discovery = discoverAgents(process.cwd());
161
162
  const all = allAgents(discovery);
@@ -200,6 +201,7 @@ interface TeamSearchResult {
200
201
  * Uses dynamic import to avoid ESM/CJS issues at module load time.
201
202
  */
202
203
  export async function searchTeams(query: string, options?: { limit?: number }): Promise<TeamSearchResult[]> {
204
+ // LAZY: defer dynamic import of ../teams/discover-teams.ts to its call site.
203
205
  const { discoverTeams, allTeams } = await import("../teams/discover-teams.ts");
204
206
  const discovery = discoverTeams(process.cwd());
205
207
  const all = allTeams(discovery);