pi-crew 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +592 -0
  2. package/README.md +55 -3
  3. package/docs/HARNESS_BACKLOG.md +51 -3
  4. package/docs/dynamic-workflows.md +315 -2
  5. package/docs/fix-plan-disabletools-exit-null.md +219 -0
  6. package/docs/troubleshooting.md +102 -0
  7. package/package.json +8 -2
  8. package/src/extension/command-completions.ts +1 -0
  9. package/src/extension/crew-shortcuts.ts +1 -0
  10. package/src/extension/register.ts +2 -0
  11. package/src/extension/registration/commands.ts +3 -0
  12. package/src/extension/team-tool/doctor.ts +14 -0
  13. package/src/extension/team-tool/goal.ts +1 -0
  14. package/src/extension/team-tool/run.ts +4 -0
  15. package/src/runtime/background-runner.ts +24 -2
  16. package/src/runtime/chain-runner.ts +1 -0
  17. package/src/runtime/child-pi.ts +101 -10
  18. package/src/runtime/crash-recovery.ts +78 -36
  19. package/src/runtime/deterministic-ast.ts +161 -0
  20. package/src/runtime/dwf-state-store.ts +97 -0
  21. package/src/runtime/dynamic-workflow-context.ts +381 -7
  22. package/src/runtime/dynamic-workflow-runner.ts +94 -2
  23. package/src/runtime/goal-loop-runner.ts +2 -0
  24. package/src/runtime/live-session-runtime.ts +1 -0
  25. package/src/runtime/model-scope.ts +1 -0
  26. package/src/runtime/peer-dep.ts +1 -0
  27. package/src/runtime/pi-args.ts +11 -0
  28. package/src/runtime/resilient-edit.ts +1 -0
  29. package/src/runtime/result-extractor.ts +72 -7
  30. package/src/runtime/task-runner.ts +1 -0
  31. package/src/runtime/team-runner.ts +8 -3
  32. package/src/runtime/zombie-scanner.ts +297 -0
  33. package/src/schema/team-tool-schema.ts +28 -0
  34. package/src/state/contracts.ts +1 -0
  35. package/src/state/hook-instinct-bridge.ts +3 -0
  36. package/src/state/state-store.ts +3 -0
  37. package/src/state/types.ts +9 -0
  38. package/src/ui/dashboard-panes/progress-pane.ts +5 -0
  39. package/src/ui/dwf-phase-display.ts +151 -0
  40. package/src/ui/run-snapshot-cache.ts +4 -0
  41. package/src/ui/snapshot-types.ts +3 -0
  42. package/src/utils/bm25-search.ts +2 -0
  43. package/src/workflows/workflow-config.ts +3 -0
  44. package/src/worktree/worktree-manager.ts +94 -0
  45. package/types/dwf.d.ts +187 -0
@@ -74,6 +74,32 @@ team action='cancel' runId=… # cancel a truly-dead run
74
74
 
75
75
  The error message explains the heartbeat mechanism + remediation.
76
76
 
77
+ ### "Run not found" but `team list` shows it / scheduler appears frozen at 25%
78
+
79
+ **Symptom:** an async `team action='run'` (e.g. a review) gets through the
80
+ first task (e.g. explorer), then the scheduler appears to stop responding.
81
+ `team action='status' runId=…` returns `Run not found`; the run's
82
+ `stateRoot` (in `<project>/.crew/state/runs/<runId>/`) is missing. TUI
83
+ progress shows the run stuck at the same task percentage forever, and the
84
+ only workaround was killing the parent `pi` process.
85
+
86
+ **This was the v0.9.4 symptom** caused by two coupled runtime bugs:
87
+
88
+ - **Bug X** (proximate): `purgeStaleActiveRunIndex` destroyed the
89
+ `stateRoot` of long-running legitimate async runs based on a frozen
90
+ `entry.updatedAt` (set at registration, never refreshed).
91
+ - **Bug Y** (root cause): the bg-runner crashed with an unhandled `EPIPE`
92
+ on the first `console.debug` after the parent detached its stdio pipes.
93
+
94
+ **Fixed in v0.9.5** (see [CHANGELOG.md](../CHANGELOG.md#v095--fix-team-run-hangs-forever-at-25-2026-06-23)).
95
+ With the fix, a long-running run is no longer falsely purged, and even if the
96
+ bg-runner dies, the `stateRoot`, `background.log`, `events.jsonl`, and
97
+ `heartbeat.json` survive — runs stay queryable and resumable.
98
+
99
+ **Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
100
+ those runs is already gone (Bug X nuked it). Re-dispatch the workflow. New
101
+ runs on v0.9.5+ are fully protected.
102
+
77
103
  ## Model fallback exhausted
78
104
 
79
105
  **Symptom:** `All N candidates exhausted (tried: a → b → c)`.
@@ -129,3 +155,79 @@ code + a help hint inline. Common ones:
129
155
  - `team action='summary' runId=…` — includes common failure-pattern detection
130
156
  ("4 of 5 failures share 2 root causes").
131
157
  - `team action='events' runId=…` — full event timeline for forensics.
158
+
159
+ ## Stuck / orphaned sub-agent processes ("zombies")
160
+
161
+ A pi-crew sub-agent whose parent crashed may linger as an orphaned process.
162
+ **Do NOT kill `pi` processes by eye** (uptime/RSS heuristics will match your
163
+ own interactive main session — that is unrecoverable). Use the safe scanner:
164
+
165
+ ```
166
+ team action='doctor' focus='zombies'
167
+ ```
168
+
169
+ This is **read-only**. It matches ONLY processes carrying the authoritative
170
+ `PI_CREW_KIND=subagent` env marker (set by every child-pi spawn) whose
171
+ `PI_CREW_PARENT_PID` is no longer alive. Your main session never carries the
172
+ marker, so it can never appear in the list. (The marker is an env var, not an
173
+ argv flag — pi's strict option parser rejects unknown flags, so we can't use
174
+ a `--crew-subagent` CLI flag.)
175
+
176
+ To kill a confirmed zombie: `kill <PID>` (the OS reaps it). The scanner never
177
+ kills on your behalf.
178
+
179
+ ### Why the marker exists
180
+
181
+ Before `PI_CREW_KIND`, a heuristic zombie "cleanup" killed a live main session
182
+ by accident. The marker makes sub-agent identity authoritative rather than
183
+ guessed. See `src/runtime/zombie-scanner.ts` and `.crew/knowledge.md`.
184
+
185
+ ## `ctx.agent({disableTools: true})` — historical `exit null` (FIXED)
186
+
187
+ Previously, `ctx.agent({disableTools: true, maxTurns: 1})` could return
188
+ `exit null` because the steer-injection code mis-treated normal Node stdin
189
+ backpressure (`write() === false`) as a fatal failure and `killProcessTree`'d
190
+ the worker mid-answer. **Fixed**: steer injection is now advisory — a
191
+ backpressure return or non-writable stdin is logged, not fatal; the
192
+ hard-abort at `maxTurns + graceTurns` remains the safety net for genuine
193
+ runaways. The `disableTools` correlation was a red herring — the real trigger
194
+ was `maxTurns:1` hitting on the first turn. See CHANGELOG "Real-world smoke
195
+ testing findings" and `test/unit/child-pi-steer-backpressure.test.ts`.
196
+
197
+ ## Running the real-binary smoke suite (HB-004)
198
+
199
+ The default `npm test` mocks child-pi (`PI_TEAMS_MOCK_CHILD_PI`), so it cannot
200
+ catch bugs that only manifest against the real `pi` binary. The smoke suite
201
+ shells out to real pi + makes real LLM calls, so it bills tokens and is gated
202
+ behind `PI_CREW_SMOKE=1`.
203
+
204
+ ### Run locally
205
+
206
+ ```bash
207
+ # All smoke tests (~5 tests, ~1 min, bills tokens):
208
+ PI_CREW_SMOKE=1 npm run test:smoke
209
+
210
+ # One smoke test in isolation:
211
+ PI_CREW_SMOKE=1 npx tsx --test test/smoke/agent-disabletools.smoke.ts
212
+ ```
213
+
214
+ Smoke tests live in `test/smoke/*.smoke.ts` and are NOT picked up by the default
215
+ `npm test` glob (`test/unit/*` + `test/integration/*`). Each test self-skips
216
+ unless `PI_CREW_SMOKE=1`.
217
+
218
+ ### What each covers
219
+
220
+ | File | Feature family | Catches |
221
+ |---|---|---|
222
+ | `argv-flags.smoke.ts` | buildPiWorkerArgs argv | unknown-flag rejection (e.g. `--crew-subagent`) |
223
+ | `agent-plain.smoke.ts` | ctx.agent() baseline | spawn-path breakage |
224
+ | `agent-schema.smoke.ts` | ctx.agent({schema, systemPrompt}) | persona-leak / schema-validation failures |
225
+ | `agent-disabletools.smoke.ts` | ctx.agent({disableTools, maxTurns:1}) ×5 | HB-003a steer-backpressure exit-null (flaky → 5×) |
226
+ | `dwf-workflow.smoke.ts` | full DWF end-to-end | phase/log/args/budget/pipeline/agent/setResult integration |
227
+
228
+ ### Run in CI (manual dispatch)
229
+
230
+ GitHub Actions → "Smoke (real-binary, manual)" → Run workflow → pick OS.
231
+ Requires the `PI_AUTH_JSON` repo secret (the contents of `~/.pi/agent/auth.json`)
232
+ so the spawned `pi` can authenticate with the model provider. If unset, the
233
+ LLM-calling smoke tests fail with a clear auth error.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.9.4",
3
+ "version": "0.9.7",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -39,6 +39,7 @@
39
39
  "docs/",
40
40
  "tsconfig.json",
41
41
  "schema.json",
42
+ "types/",
42
43
  "CHANGELOG.md",
43
44
  "LICENSE",
44
45
  "NOTICE.md"
@@ -52,6 +53,7 @@
52
53
  "test:unit": "node scripts/test-runner.mjs --test-concurrency=4 --test-timeout=180000 --test-force-exit test/unit/*.test.ts",
53
54
  "test:watch": "tsx --watch --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
54
55
  "test:integration": "node scripts/test-runner.mjs --test-concurrency=1 --test-timeout=120000 test/integration/*.test.ts",
56
+ "test:smoke": "node scripts/test-runner.mjs --test-concurrency=1 --test-timeout=180000 test/smoke/*.smoke.ts",
55
57
  "build:bundle": "node scripts/build-bundle.mjs",
56
58
  "bench": "node scripts/run-bench.mjs",
57
59
  "bench:check": "node scripts/bench-check.mjs",
@@ -63,7 +65,10 @@
63
65
  "smoke:release": "node scripts/release-smoke.mjs"
64
66
  },
65
67
  "exports": {
66
- "./schema.json": "./schema.json"
68
+ "./schema.json": "./schema.json",
69
+ "./workflow": {
70
+ "types": "./types/dwf.d.ts"
71
+ }
67
72
  },
68
73
  "pi": {
69
74
  "extensions": [
@@ -81,6 +86,7 @@
81
86
  },
82
87
  "dependencies": {
83
88
  "@sinclair/typebox": "^0.34.49",
89
+ "acorn": "^8.17.0",
84
90
  "ajv": "^8.20.0",
85
91
  "cli-highlight": "^2.1.11",
86
92
  "diff": "^5.2.0",
@@ -67,6 +67,7 @@ export function suggestRunIds(_prefix: string, cwd?: string): AutocompleteItem[]
67
67
  export async function suggestTaskIds(runId: string, prefix: string, cwd?: string): Promise<AutocompleteItem[] | null> {
68
68
  const resolvedCwd = cwd ?? process.cwd();
69
69
  // Dynamic import to avoid pulling state-store into the hot command-registration path.
70
+ // LAZY: defer dynamic import of ../state/state-store.ts to its call site.
70
71
  const { loadRunManifestById } = await import("../state/state-store.ts");
71
72
  const loaded = loadRunManifestById(resolvedCwd, runId);
72
73
  if (!loaded) return null;
@@ -34,6 +34,7 @@ const CREW_SHORTCUTS: ReadonlyArray<ShortcutRegistration> = [
34
34
  // (avoids pulling the full commands.ts dependency tree into every
35
35
  // process that imports this module, e.g. the unit test).
36
36
  handler: async (ctx) => {
37
+ // LAZY: defer dynamic import of ./registration/commands.ts to its call site.
37
38
  const { openTeamSettingsOverlay } = await import("./registration/commands.ts");
38
39
  await openTeamSettingsOverlay(ctx);
39
40
  },
@@ -1129,6 +1129,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1129
1129
  // LAZY: state-store only needed in hasRunning; avoid at startup.
1130
1130
  // Use dynamic import to avoid CJS/ESM mixed module issues.
1131
1131
  const { loadRunManifestById: loadRunForHasRunning } =
1132
+ // LAZY: defer dynamic import of ../state/state-store.ts to its call site.
1132
1133
  await import("../state/state-store.ts");
1133
1134
  const loaded = loadRunForHasRunning(
1134
1135
  currentCtx?.cwd ?? process.cwd(),
@@ -1494,6 +1495,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1494
1495
  const cwd = ctx.cwd ?? process.cwd();
1495
1496
  const loaded = loadRunManifestById(cwd, runId);
1496
1497
  if (loaded) {
1498
+ // LAZY: defer dynamic import of ../state/atomic-write.ts to its call site.
1497
1499
  const { atomicWriteJson } = await import("../state/atomic-write.ts");
1498
1500
  atomicWriteJson(loaded.manifest.stateRoot + "/manifest.json", {
1499
1501
  ...loaded.manifest,
@@ -202,11 +202,13 @@ export async function openTeamSettingsOverlay(ctx: ExtensionContext): Promise<vo
202
202
  if (res.success) {
203
203
  ctx.ui.notify(`Theme: ${value} (applied live)`, "info");
204
204
  } else {
205
+ // LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
205
206
  const { setPiTheme } = await import("../../ui/theme-discovery.ts");
206
207
  setPiTheme(value);
207
208
  ctx.ui.notify(`Theme saved as '${value}' but failed to apply: ${res.error ?? "unknown"}. Restart Pi.`, "warning");
208
209
  }
209
210
  } else {
211
+ // LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
210
212
  const { setPiTheme } = await import("../../ui/theme-discovery.ts");
211
213
  setPiTheme(value);
212
214
  ctx.ui.notify(`Pi theme set to '${value}'. Restart Pi to apply.`, "info");
@@ -672,6 +674,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
672
674
  pi.registerCommand("crew-brief", {
673
675
  description: "Toggle brief tool output mode: on | off | status",
674
676
  handler: async (args: string, ctx: ExtensionCommandContext) => {
677
+ // LAZY: defer dynamic import of ../../ui/tool-renderers/brief-mode.ts to its call site.
675
678
  const { isBrief, setBrief, BRIEF_ENTRY_TYPE, makeBriefEntry } = await import("../../ui/tool-renderers/brief-mode.ts");
676
679
  const trimmed = args.trim();
677
680
 
@@ -10,6 +10,7 @@ import { DEFAULT_PATHS } from "../../config/defaults.ts";
10
10
  import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
11
11
  import { getPiSpawnCommand } from "../../runtime/pi-spawn.ts";
12
12
  import { getRuntimeWarmupStatus } from "../../runtime/runtime-warmup.ts";
13
+ import { scanZombieSubagents, formatZombieReport } from "../../runtime/zombie-scanner.ts";
13
14
  import { validateResources } from "../validate-resources.ts";
14
15
  import { detectDrift, formatDriftReport, type DriftReport } from "../../config/drift-detector.ts";
15
16
  import { TeamToolParams } from "../../schema/team-tool-schema.ts";
@@ -237,6 +238,19 @@ export function buildTeamDoctorReport(input: TeamDoctorReportInput): TeamDoctorR
237
238
  }
238
239
 
239
240
  export function handleDoctor(ctx: TeamContext, params: TeamToolParamsValue = {}): PiTeamsToolResult {
241
+ // Sub-focus: zombie sub-agent scan. READ-ONLY — never kills. Returns a table of
242
+ // orphaned pi-crew sub-agents identified by the authoritative PI_CREW_KIND=subagent
243
+ // marker. The user's main session never carries that marker, so it can never appear.
244
+ if (params.focus === "zombies") {
245
+ const scan = scanZombieSubagents();
246
+ const text = formatZombieReport(scan);
247
+ return result(text, {
248
+ action: "doctor",
249
+ status: "ok",
250
+ data: { zombies: scan.zombies.length, live: scan.live.length, errors: scan.errors.length },
251
+ }, false);
252
+ }
253
+
240
254
  const loadedConfig = loadConfig(ctx.cwd);
241
255
  let smokeChildPi: { ok: boolean; detail: string } | undefined;
242
256
  if (configRecord(params.config).smokeChildPi === true) {
@@ -269,6 +269,7 @@ async function handleStop(input: GoalSubActionInput): Promise<ReturnType<typeof
269
269
  let cancelMsg = "";
270
270
  if (updated.currentRunId) {
271
271
  try {
272
+ // LAZY: defer dynamic import of ./cancel.ts to its call site.
272
273
  const { handleCancel } = await import("./cancel.ts");
273
274
  const cancelResult = await handleCancel({ action: "cancel", runId: updated.currentRunId, force: true, config: { intent: "user requested goal stop" } }, ctx);
274
275
  cancelMsg = ` In-flight turn ${updated.currentRunId} cancel: ${(cancelResult.content[0] as { text?: string } | undefined)?.text ?? "ok"}.`;
@@ -34,6 +34,7 @@ import { expandParallelResearchWorkflow } from "../../runtime/parallel-research.
34
34
  /**
35
35
  * Module-scoped latch for the crew-init dynamic import.
36
36
  *
37
+ * LAZY: the dynamic import of `crew-init.ts` is deferred to its call site.
37
38
  * `crew-init.ts` is dynamically `await import()`'d from `handleRun` below, which
38
39
  * N concurrent subagents hit simultaneously (every `team` tool call runs it).
39
40
  * Under the tsx/jiti loader, concurrent first-imports race module-record
@@ -280,6 +281,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
280
281
  workspaceMode: params.workspaceMode,
281
282
  ownerSessionId: ctx.sessionId,
282
283
  runKind: params.runKind,
284
+ args: params.args,
283
285
  });
284
286
  const goalArtifact = writeArtifact(paths.artifactsRoot, {
285
287
  kind: "prompt",
@@ -296,6 +298,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
296
298
  // orchestrates subagents via ctx.agent(); only ctx.setResult() reaches the main context.
297
299
  // Placed AFTER manifest creation so runId/paths/artifactsRoot are available.
298
300
  if (!directAgent && (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).runtime === "dynamic") {
301
+ // LAZY: defer dynamic import of ../../runtime/dynamic-workflow-runner.ts to its call site.
299
302
  const { runDynamicWorkflow } = await import("../../runtime/dynamic-workflow-runner.ts");
300
303
  // Re-synthesize a dynamic-team (§0c C9) for role resolution.
301
304
  const dwfTeam: import("../../teams/team-config.ts").TeamConfig = {
@@ -321,6 +324,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
321
324
  team: dwfTeam,
322
325
  signal: ctx.signal ?? AbortSignal.timeout(3_600_000),
323
326
  modelOverride: params.model,
327
+ tokenBudget: params.tokenBudget ?? (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).maxTokenBudget,
324
328
  });
325
329
  } catch (runnerError) {
326
330
  // Round-11 runtime fix: persist manifest with status=failed when runner throws
@@ -323,11 +323,28 @@ async function main(): Promise<void> {
323
323
  const origWrite =
324
324
  (_prefix: string) =>
325
325
  (data: unknown, ...args: unknown[]) => {
326
+ // FIX: Never let the in-process console redirect crash the background
327
+ // runner. If logFd is missing/invalid or the write fails, swallow the
328
+ // error silently — losing one debug line is far better than killing the
329
+ // scheduler (a previous version only redirected console.log/error, so
330
+ // console.debug/.warn still wrote to the original stdout/stderr pipe
331
+ // which is closed after the parent detaches, producing EPIPE → process
332
+ // crash mid-workflow → runs hang at 25% forever).
333
+ if (logFd === undefined) return;
326
334
  const msg = [data, ...args].map(String).join(" ") + "\n";
327
- fs.writeSync(logFd!, msg);
335
+ try {
336
+ fs.writeSync(logFd, msg);
337
+ } catch {
338
+ /* best-effort: never crash the scheduler over a log write */
339
+ }
328
340
  };
329
341
  console.log = origWrite("OUT");
330
342
  console.error = origWrite("ERR");
343
+ // FIX: Also redirect console.debug and console.warn — otherwise they still
344
+ // hit the original stdout/stderr pipe, which is closed once the parent
345
+ // process detaches, causing EPIPE unhandled errors that kill the scheduler.
346
+ console.debug = origWrite("DBG");
347
+ console.warn = origWrite("WARN");
331
348
  // FIX: Close logFd on process exit to prevent file descriptor leak
332
349
  process.on("exit", () => {
333
350
  try {
@@ -558,8 +575,11 @@ async function main(): Promise<void> {
558
575
  debugLog(`[background-runner] short-circuiting ${manifest.runKind} (synthetic team/workflow)`,
559
576
  );
560
577
  if (manifest.runKind === "goal-loop") {
578
+ // LAZY: defer dynamic import of ./goal-loop-runner.ts to its call site.
561
579
  const { runGoalLoop } = await import("./goal-loop-runner.ts");
580
+ // LAZY: defer dynamic import of ./goal-state-store.ts to its call site.
562
581
  const { GoalStore } = await import("./goal-state-store.ts");
582
+ // LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
563
583
  const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
564
584
  const store = new GoalStore(manifest.cwd);
565
585
  const goalState = store.load(manifest.runId);
@@ -576,11 +596,13 @@ async function main(): Promise<void> {
576
596
  saveRunManifest(finalGoalManifest);
577
597
  earlyResult = { manifest: finalGoalManifest, tasks: goalResult.tasks };
578
598
  } else {
599
+ // LAZY: defer dynamic import of ./dynamic-workflow-runner.ts to its call site.
579
600
  const { runDynamicWorkflow } = await import("./dynamic-workflow-runner.ts");
601
+ // LAZY: defer dynamic import of ../workflows/discover-workflows.ts to its call site.
580
602
  const { allWorkflows, discoverWorkflows } = await import("../workflows/discover-workflows.ts");
581
603
  const wf = allWorkflows(discoverWorkflows(manifest.cwd)).find((w) => w.name === manifest.workflow);
582
604
  if (!wf || wf.runtime !== "dynamic" || !wf.dynamicScript) throw new Error(`runKind="dynamic-workflow" but workflow '${manifest.workflow}' is not dynamic (runId=${manifest.runId})`);
583
- const dwfResult = await runDynamicWorkflow({ manifest, workflow: wf as import("../workflows/workflow-config.ts").DynamicWorkflowConfig, signal: abortController.signal });
605
+ const dwfResult = await runDynamicWorkflow({ manifest, workflow: wf as import("../workflows/workflow-config.ts").DynamicWorkflowConfig, signal: abortController.signal, tokenBudget: wf.maxTokenBudget });
584
606
  saveRunManifest(dwfResult.manifest);
585
607
  earlyResult = dwfResult;
586
608
  }
@@ -246,6 +246,7 @@ export class ChainRunner {
246
246
 
247
247
  // Emit progress event if eventsPath provided
248
248
  if (eventsPath) {
249
+ // LAZY: defer dynamic import of ../state/event-log.ts to its call site.
249
250
  const { appendEventAsync } = await import("../state/event-log.ts");
250
251
  await appendEventAsync(eventsPath, {
251
252
  type: "chain.step_completed",
@@ -95,6 +95,17 @@ export function killProcessPid(pid: number): void {
95
95
  }
96
96
 
97
97
  function killProcessTree(pid: number | undefined, child?: ChildProcess): void {
98
+ // Phase-0 diagnostic (HB-003a): capture who invoked killProcessTree so the
99
+ // exit-null race has a provenance trail. .stack is best-effort (may be undefined
100
+ // under deep async), so we fall back to "(no stack)" when it is missing.
101
+ try {
102
+ const callerStack = new Error("killProcessTree caller").stack ?? "(no stack)";
103
+ logInternalError(
104
+ "child-pi.kill-process-tree-invoked",
105
+ new Error(`pid=${pid} called from:\n${callerStack.split("\n").slice(0, 8).join("\n")}`),
106
+ `pid=${pid}`,
107
+ );
108
+ } catch { /* diagnostic best-effort */ }
98
109
  if (!pid || !Number.isInteger(pid) || pid <= 0) return;
99
110
  if (child && child.exitCode !== null) return;
100
111
  killProcessPid(pid);
@@ -124,6 +135,18 @@ export interface ChildPiLifecycleEvent {
124
135
  stderrExcerpt?: string;
125
136
  /** Timestamp (ISO). */
126
137
  ts: string;
138
+ /** Phase-0 diagnostic (HB-003a): the signal that killed the child (when
139
+ * available). Was previously discarded after building the error string. */
140
+ signal?: string;
141
+ /** Phase-0 diagnostic (HB-003a): final-drain race timing, present only on
142
+ * exit events where a drain timer was armed. Surfaces the exit-null race. */
143
+ diagnostic?: {
144
+ finalDrainArmed: boolean;
145
+ forcedFinalDrain: boolean;
146
+ finalDrainFiredMonotonicMs?: number;
147
+ finalAssistantEventMonotonicMs?: number;
148
+ exitMonotonicMs: number;
149
+ };
127
150
  }
128
151
 
129
152
  export interface ChildPiRunInput {
@@ -267,6 +290,9 @@ export function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): S
267
290
  "PI_CREW_MAX_DEPTH",
268
291
  "PI_CREW_INHERIT_PROJECT_CONTEXT",
269
292
  "PI_CREW_INHERIT_SKILLS",
293
+ // PI_CREW_KIND marks this process as a crew sub-agent (vs the user's main session).
294
+ // `team action='doctor' focus='zombies'` matches it to safely list orphaned sub-agents only.
295
+ "PI_CREW_KIND",
270
296
  // PI_CREW_PARENT_PID is needed by child-pi's parent-guard (uses
271
297
  // process.kill(pid, 0) liveness check). The PID is not a secret.
272
298
  "PI_CREW_PARENT_PID",
@@ -577,6 +603,15 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
577
603
  let noResponseTimer: NodeJS.Timeout | undefined;
578
604
  const finalDrainMs = input.finalDrainMs ?? FINAL_DRAIN_MS;
579
605
  const hardKillMs = input.hardKillMs ?? HARD_KILL_MS;
606
+ // Phase-0 diagnostic (HB-003a): track the final-drain race that produces
607
+ // `exit null` for ctx.agent({disableTools:true}). These vars are READ-ONLY
608
+ // instrumentation — no behavior change. finalDrainArmed lets the close
609
+ // handler know a drain timer existed even after clearFinalDrainTimers() ran;
610
+ // spawnMonotonicMs gives us relative timing to distinguish a race from a crash.
611
+ let finalDrainArmed = false;
612
+ let finalDrainFiredMonotonicMs: number | undefined;
613
+ const spawnMonotonicMs = performance.now();
614
+ let finalAssistantEventMonotonicMs: number | undefined;
580
615
  // FIX (Round 14): Bound the env-controlled response timeout to
581
616
  // [1_000ms, 3_600_000ms] (1s–1h) so a hostile or accidental value
582
617
  // (e.g. 1, or 999_999_999) cannot disable the timeout or cause
@@ -680,20 +715,27 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
680
715
  if (maxTurns !== undefined && !softLimitReached && turnCount >= maxTurns) {
681
716
  softLimitReached = true;
682
717
  // Inject steer via stdin to tell child to wrap up.
683
- // If stdin is not writable or the write fails (backpressure/closed),
684
- // the steer cannot be injected and the agent could run indefinitely.
685
- // Kill the process tree in that case to enforce the turn limit.
718
+ // Steer injection is ADVISORY: it asks the worker to wrap up. The real
719
+ // enforcement is the hard-abort at maxTurns + graceTurns (below). So a
720
+ // failed/non-writable stdin must NOT kill the worker — doing so destroys a
721
+ // valid answer already in stdout (Phase-0 root cause of the
722
+ // disableTools/maxTurns:1 exit-null bug). Just log + let the hard-abort
723
+ // path handle a genuinely runaway worker.
686
724
  if (child.stdin?.writable) {
687
725
  const steerPayload = JSON.stringify({ type: "steer", message: "You have reached your turn limit. Wrap up immediately — provide your final answer now." }) + "\n";
688
726
  const writeSucceeded = child.stdin.write(steerPayload);
689
727
  if (!writeSucceeded) {
690
- logInternalError("child-pi.steer-backpressure", new Error("stdin write returned false during steer injection; buffer full"), `pid=${child.pid}`);
691
- steerInjectionFailed = true;
692
- killProcessTree(child.pid, child);
728
+ // Normal Node backpressure: the payload is buffered and will flush on
729
+ // 'drain'. NOT a failure — do NOT kill the worker. The steer is
730
+ // advisory; if the worker ignores it and runs past maxTurns +
731
+ // graceTurns, the hard-abort below terminates it.
732
+ logInternalError("child-pi.steer-backpressure", new Error("stdin write returned false (normal backpressure); steer buffered, worker NOT killed"), `pid=${child.pid}`);
693
733
  }
694
734
  } else {
695
- logInternalError("child-pi.steer-not-writable", new Error("stdin not writable when attempting steer injection"), `pid=${child.pid}`);
696
- killProcessTree(child.pid, child);
735
+ // stdin closed (worker already finished) or otherwise unwritable.
736
+ // Also advisory — the worker is done or nearly done; let it exit
737
+ // naturally. Hard-abort remains the safety net for true runaways.
738
+ logInternalError("child-pi.steer-not-writable", new Error("stdin not writable when attempting steer injection (worker may be done); worker NOT killed"), `pid=${child.pid}`);
697
739
  }
698
740
  } else if (maxTurns !== undefined && softLimitReached && turnCount >= maxTurns + (graceTurns ?? 5)) {
699
741
  // Hard abort — terminate after grace turns
@@ -708,9 +750,12 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
708
750
  }
709
751
  input.onJsonEvent?.(event);
710
752
  if (!isFinalAssistantEvent(event) || childExited || settled || finalDrainTimer) return;
753
+ finalAssistantEventMonotonicMs = performance.now();
754
+ finalDrainArmed = true; // Phase-0 diagnostic: track that a drain timer was created.
711
755
  finalDrainTimer = setTimeout(() => {
712
756
  if (settled || childExited) return;
713
757
  forcedFinalDrain = true;
758
+ finalDrainFiredMonotonicMs = performance.now(); // Phase-0 diagnostic: race timing.
714
759
  input.onLifecycleEvent?.({ type: "final_drain", pid: child.pid, ts: new Date().toISOString() });
715
760
  try {
716
761
  child.kill(process.platform === "win32" ? undefined : "SIGTERM");
@@ -765,7 +810,27 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
765
810
  }
766
811
  // Catch all errors from settle to prevent unhandled rejection from propagating
767
812
  try {
768
- resolve({ ...result, exitStatus: result.exitStatus ?? { exitCode: result.exitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
813
+ resolve({
814
+ ...result,
815
+ exitStatus: result.exitStatus ?? {
816
+ exitCode: result.exitCode,
817
+ cancelled: abortRequested,
818
+ timedOut: responseTimeoutHit,
819
+ killed: hardKilled,
820
+ // Phase-0 diagnostic (HB-003a): surface the final-drain race state.
821
+ // finalDrainArmed lets Phase 1 decide whether a signal-death (exitCode=null)
822
+ // should be treated as a forced final drain. READ-ONLY for now.
823
+ ...(finalDrainArmed || forcedFinalDrain
824
+ ? {
825
+ finalDrainArmed,
826
+ forcedFinalDrain,
827
+ finalDrainFiredMonotonicMs,
828
+ }
829
+ : {}),
830
+ cleanupErrors,
831
+ finalDrainMs,
832
+ },
833
+ });
769
834
  } catch (resolveError) {
770
835
  logInternalError("child-pi.settle-resolve", resolveError, `result=${JSON.stringify({ exitCode: result.exitCode })}`);
771
836
  }
@@ -866,7 +931,30 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
866
931
  rejectPendingOperations(exitError);
867
932
  }
868
933
  try {
869
- input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString(), error: exitError?.message, stderrExcerpt: isUnexpectedExit ? stderr.slice(-1000) || undefined : undefined });
934
+ // Phase-0 diagnostic (HB-003a): capture signal + drain timing in the
935
+ // exit lifecycle event so the exit-null race is diagnosable instead of
936
+ // opaque. `signal` was previously discarded after building the error msg.
937
+ input.onLifecycleEvent?.({
938
+ type: "exit",
939
+ pid: child.pid,
940
+ exitCode: code,
941
+ ts: new Date().toISOString(),
942
+ error: exitError?.message,
943
+ stderrExcerpt: isUnexpectedExit ? stderr.slice(-1000) || undefined : undefined,
944
+ // Phase-0 diagnostic fields (declared optional on ChildPiLifecycleEvent; omitted when unset).
945
+ ...(signal ? { signal } : {}),
946
+ ...(finalDrainArmed || forcedFinalDrain
947
+ ? {
948
+ diagnostic: {
949
+ finalDrainArmed,
950
+ forcedFinalDrain,
951
+ finalDrainFiredMonotonicMs,
952
+ finalAssistantEventMonotonicMs,
953
+ exitMonotonicMs: performance.now() - spawnMonotonicMs,
954
+ },
955
+ }
956
+ : {}),
957
+ });
870
958
  } catch (err) {
871
959
  logInternalError("child-pi.on-lifecycle-event", err, `event=exit, pid=${child.pid}`);
872
960
  }
@@ -902,6 +990,9 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
902
990
  const finalExitCode = forcedFinalDrain && !timeoutError ? 0 : exitCode;
903
991
  const wasGraceAborted = softLimitReached && turnCount >= (maxTurns ?? 0) + (graceTurns ?? 5);
904
992
  const wasParentAborted = abortDueToParentSignal && !wasGraceAborted;
993
+ // steerInjectionFailed is now always false (Phase-1 fix: steer backpressure
994
+ // is logged, not fatal). The steerError branch is retained for safety in
995
+ // case a future change reintroduces a fatal steer path.
905
996
  const steerError = steerInjectionFailed ? "Steer injection failed due to stdin backpressure; process killed" : undefined;
906
997
  settle({ exitCode: finalExitCode, stdout, stderr, ...(timeoutError ? { error: timeoutError.error } : {}), ...(steerError ? { error: steerError } : {}), aborted: wasGraceAborted || wasParentAborted, steered: softLimitReached && !wasGraceAborted, exitStatus: { exitCode: finalExitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
907
998
  });