pi-crew 0.9.4 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +592 -0
- package/README.md +55 -3
- package/docs/HARNESS_BACKLOG.md +51 -3
- package/docs/dynamic-workflows.md +315 -2
- package/docs/fix-plan-disabletools-exit-null.md +219 -0
- package/docs/troubleshooting.md +102 -0
- package/package.json +8 -2
- package/src/extension/command-completions.ts +1 -0
- package/src/extension/crew-shortcuts.ts +1 -0
- package/src/extension/register.ts +2 -0
- package/src/extension/registration/commands.ts +3 -0
- package/src/extension/team-tool/doctor.ts +14 -0
- package/src/extension/team-tool/goal.ts +1 -0
- package/src/extension/team-tool/run.ts +4 -0
- package/src/runtime/background-runner.ts +24 -2
- package/src/runtime/chain-runner.ts +1 -0
- package/src/runtime/child-pi.ts +101 -10
- package/src/runtime/crash-recovery.ts +78 -36
- package/src/runtime/deterministic-ast.ts +161 -0
- package/src/runtime/dwf-state-store.ts +97 -0
- package/src/runtime/dynamic-workflow-context.ts +381 -7
- package/src/runtime/dynamic-workflow-runner.ts +94 -2
- package/src/runtime/goal-loop-runner.ts +2 -0
- package/src/runtime/live-session-runtime.ts +1 -0
- package/src/runtime/model-scope.ts +1 -0
- package/src/runtime/peer-dep.ts +1 -0
- package/src/runtime/pi-args.ts +11 -0
- package/src/runtime/resilient-edit.ts +1 -0
- package/src/runtime/result-extractor.ts +72 -7
- package/src/runtime/task-runner.ts +1 -0
- package/src/runtime/team-runner.ts +8 -3
- package/src/runtime/zombie-scanner.ts +297 -0
- package/src/schema/team-tool-schema.ts +28 -0
- package/src/state/contracts.ts +1 -0
- package/src/state/hook-instinct-bridge.ts +3 -0
- package/src/state/state-store.ts +3 -0
- package/src/state/types.ts +9 -0
- package/src/ui/dashboard-panes/progress-pane.ts +5 -0
- package/src/ui/dwf-phase-display.ts +151 -0
- package/src/ui/run-snapshot-cache.ts +4 -0
- package/src/ui/snapshot-types.ts +3 -0
- package/src/utils/bm25-search.ts +2 -0
- package/src/workflows/workflow-config.ts +3 -0
- package/src/worktree/worktree-manager.ts +94 -0
- package/types/dwf.d.ts +187 -0
package/docs/troubleshooting.md
CHANGED
|
@@ -74,6 +74,32 @@ team action='cancel' runId=… # cancel a truly-dead run
|
|
|
74
74
|
|
|
75
75
|
The error message explains the heartbeat mechanism + remediation.
|
|
76
76
|
|
|
77
|
+
### "Run not found" but `team list` shows it / scheduler appears frozen at 25%
|
|
78
|
+
|
|
79
|
+
**Symptom:** an async `team action='run'` (e.g. a review) gets through the
|
|
80
|
+
first task (e.g. explorer), then the scheduler appears to stop responding.
|
|
81
|
+
`team action='status' runId=…` returns `Run not found`; the run's
|
|
82
|
+
`stateRoot` (in `<project>/.crew/state/runs/<runId>/`) is missing. TUI
|
|
83
|
+
progress shows the run stuck at the same task percentage forever, and the
|
|
84
|
+
only workaround was killing the parent `pi` process.
|
|
85
|
+
|
|
86
|
+
**This was the v0.9.4 symptom** caused by two coupled runtime bugs:
|
|
87
|
+
|
|
88
|
+
- **Bug X** (proximate): `purgeStaleActiveRunIndex` destroyed the
|
|
89
|
+
`stateRoot` of long-running legitimate async runs based on a frozen
|
|
90
|
+
`entry.updatedAt` (set at registration, never refreshed).
|
|
91
|
+
- **Bug Y** (root cause): the bg-runner crashed with an unhandled `EPIPE`
|
|
92
|
+
on the first `console.debug` after the parent detached its stdio pipes.
|
|
93
|
+
|
|
94
|
+
**Fixed in v0.9.5** (see [CHANGELOG.md](../CHANGELOG.md#v095--fix-team-run-hangs-forever-at-25-2026-06-23)).
|
|
95
|
+
With the fix, a long-running run is no longer falsely purged, and even if the
|
|
96
|
+
bg-runner dies, the `stateRoot`, `background.log`, `events.jsonl`, and
|
|
97
|
+
`heartbeat.json` survive — runs stay queryable and resumable.
|
|
98
|
+
|
|
99
|
+
**Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
|
|
100
|
+
those runs is already gone (Bug X nuked it). Re-dispatch the workflow. New
|
|
101
|
+
runs on v0.9.5+ are fully protected.
|
|
102
|
+
|
|
77
103
|
## Model fallback exhausted
|
|
78
104
|
|
|
79
105
|
**Symptom:** `All N candidates exhausted (tried: a → b → c)`.
|
|
@@ -129,3 +155,79 @@ code + a help hint inline. Common ones:
|
|
|
129
155
|
- `team action='summary' runId=…` — includes common failure-pattern detection
|
|
130
156
|
("4 of 5 failures share 2 root causes").
|
|
131
157
|
- `team action='events' runId=…` — full event timeline for forensics.
|
|
158
|
+
|
|
159
|
+
## Stuck / orphaned sub-agent processes ("zombies")
|
|
160
|
+
|
|
161
|
+
A pi-crew sub-agent whose parent crashed may linger as an orphaned process.
|
|
162
|
+
**Do NOT kill `pi` processes by eye** (uptime/RSS heuristics will match your
|
|
163
|
+
own interactive main session — that is unrecoverable). Use the safe scanner:
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
team action='doctor' focus='zombies'
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
This is **read-only**. It matches ONLY processes carrying the authoritative
|
|
170
|
+
`PI_CREW_KIND=subagent` env marker (set by every child-pi spawn) whose
|
|
171
|
+
`PI_CREW_PARENT_PID` is no longer alive. Your main session never carries the
|
|
172
|
+
marker, so it can never appear in the list. (The marker is an env var, not an
|
|
173
|
+
argv flag — pi's strict option parser rejects unknown flags, so we can't use
|
|
174
|
+
a `--crew-subagent` CLI flag.)
|
|
175
|
+
|
|
176
|
+
To kill a confirmed zombie: `kill <PID>` (the OS reaps it). The scanner never
|
|
177
|
+
kills on your behalf.
|
|
178
|
+
|
|
179
|
+
### Why the marker exists
|
|
180
|
+
|
|
181
|
+
Before `PI_CREW_KIND`, a heuristic zombie "cleanup" killed a live main session
|
|
182
|
+
by accident. The marker makes sub-agent identity authoritative rather than
|
|
183
|
+
guessed. See `src/runtime/zombie-scanner.ts` and `.crew/knowledge.md`.
|
|
184
|
+
|
|
185
|
+
## `ctx.agent({disableTools: true})` — historical `exit null` (FIXED)
|
|
186
|
+
|
|
187
|
+
Previously, `ctx.agent({disableTools: true, maxTurns: 1})` could return
|
|
188
|
+
`exit null` because the steer-injection code mis-treated normal Node stdin
|
|
189
|
+
backpressure (`write() === false`) as a fatal failure and `killProcessTree`'d
|
|
190
|
+
the worker mid-answer. **Fixed**: steer injection is now advisory — a
|
|
191
|
+
backpressure return or non-writable stdin is logged, not fatal; the
|
|
192
|
+
hard-abort at `maxTurns + graceTurns` remains the safety net for genuine
|
|
193
|
+
runaways. The `disableTools` correlation was a red herring — the real trigger
|
|
194
|
+
was `maxTurns:1` hitting on the first turn. See CHANGELOG "Real-world smoke
|
|
195
|
+
testing findings" and `test/unit/child-pi-steer-backpressure.test.ts`.
|
|
196
|
+
|
|
197
|
+
## Running the real-binary smoke suite (HB-004)
|
|
198
|
+
|
|
199
|
+
The default `npm test` mocks child-pi (`PI_TEAMS_MOCK_CHILD_PI`), so it cannot
|
|
200
|
+
catch bugs that only manifest against the real `pi` binary. The smoke suite
|
|
201
|
+
shells out to real pi + makes real LLM calls, so it bills tokens and is gated
|
|
202
|
+
behind `PI_CREW_SMOKE=1`.
|
|
203
|
+
|
|
204
|
+
### Run locally
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
# All smoke tests (~5 tests, ~1 min, bills tokens):
|
|
208
|
+
PI_CREW_SMOKE=1 npm run test:smoke
|
|
209
|
+
|
|
210
|
+
# One smoke test in isolation:
|
|
211
|
+
PI_CREW_SMOKE=1 npx tsx --test test/smoke/agent-disabletools.smoke.ts
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
Smoke tests live in `test/smoke/*.smoke.ts` and are NOT picked up by the default
|
|
215
|
+
`npm test` glob (`test/unit/*` + `test/integration/*`). Each test self-skips
|
|
216
|
+
unless `PI_CREW_SMOKE=1`.
|
|
217
|
+
|
|
218
|
+
### What each covers
|
|
219
|
+
|
|
220
|
+
| File | Feature family | Catches |
|
|
221
|
+
|---|---|---|
|
|
222
|
+
| `argv-flags.smoke.ts` | buildPiWorkerArgs argv | unknown-flag rejection (e.g. `--crew-subagent`) |
|
|
223
|
+
| `agent-plain.smoke.ts` | ctx.agent() baseline | spawn-path breakage |
|
|
224
|
+
| `agent-schema.smoke.ts` | ctx.agent({schema, systemPrompt}) | persona-leak / schema-validation failures |
|
|
225
|
+
| `agent-disabletools.smoke.ts` | ctx.agent({disableTools, maxTurns:1}) ×5 | HB-003a steer-backpressure exit-null (flaky → 5×) |
|
|
226
|
+
| `dwf-workflow.smoke.ts` | full DWF end-to-end | phase/log/args/budget/pipeline/agent/setResult integration |
|
|
227
|
+
|
|
228
|
+
### Run in CI (manual dispatch)
|
|
229
|
+
|
|
230
|
+
GitHub Actions → "Smoke (real-binary, manual)" → Run workflow → pick OS.
|
|
231
|
+
Requires the `PI_AUTH_JSON` repo secret (the contents of `~/.pi/agent/auth.json`)
|
|
232
|
+
so the spawned `pi` can authenticate with the model provider. If unset, the
|
|
233
|
+
LLM-calling smoke tests fail with a clear auth error.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-crew",
|
|
3
|
-
"version": "0.9.4",
|
|
3
|
+
"version": "0.9.7",
|
|
4
4
|
"description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
|
|
5
5
|
"author": "baphuongna",
|
|
6
6
|
"license": "MIT",
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
"docs/",
|
|
40
40
|
"tsconfig.json",
|
|
41
41
|
"schema.json",
|
|
42
|
+
"types/",
|
|
42
43
|
"CHANGELOG.md",
|
|
43
44
|
"LICENSE",
|
|
44
45
|
"NOTICE.md"
|
|
@@ -52,6 +53,7 @@
|
|
|
52
53
|
"test:unit": "node scripts/test-runner.mjs --test-concurrency=4 --test-timeout=180000 --test-force-exit test/unit/*.test.ts",
|
|
53
54
|
"test:watch": "tsx --watch --test --test-concurrency=4 --test-timeout=30000 --test-force-exit test/unit/*.test.ts",
|
|
54
55
|
"test:integration": "node scripts/test-runner.mjs --test-concurrency=1 --test-timeout=120000 test/integration/*.test.ts",
|
|
56
|
+
"test:smoke": "node scripts/test-runner.mjs --test-concurrency=1 --test-timeout=180000 test/smoke/*.smoke.ts",
|
|
55
57
|
"build:bundle": "node scripts/build-bundle.mjs",
|
|
56
58
|
"bench": "node scripts/run-bench.mjs",
|
|
57
59
|
"bench:check": "node scripts/bench-check.mjs",
|
|
@@ -63,7 +65,10 @@
|
|
|
63
65
|
"smoke:release": "node scripts/release-smoke.mjs"
|
|
64
66
|
},
|
|
65
67
|
"exports": {
|
|
66
|
-
"./schema.json": "./schema.json"
|
|
68
|
+
"./schema.json": "./schema.json",
|
|
69
|
+
"./workflow": {
|
|
70
|
+
"types": "./types/dwf.d.ts"
|
|
71
|
+
}
|
|
67
72
|
},
|
|
68
73
|
"pi": {
|
|
69
74
|
"extensions": [
|
|
@@ -81,6 +86,7 @@
|
|
|
81
86
|
},
|
|
82
87
|
"dependencies": {
|
|
83
88
|
"@sinclair/typebox": "^0.34.49",
|
|
89
|
+
"acorn": "^8.17.0",
|
|
84
90
|
"ajv": "^8.20.0",
|
|
85
91
|
"cli-highlight": "^2.1.11",
|
|
86
92
|
"diff": "^5.2.0",
|
|
@@ -67,6 +67,7 @@ export function suggestRunIds(_prefix: string, cwd?: string): AutocompleteItem[]
|
|
|
67
67
|
export async function suggestTaskIds(runId: string, prefix: string, cwd?: string): Promise<AutocompleteItem[] | null> {
|
|
68
68
|
const resolvedCwd = cwd ?? process.cwd();
|
|
69
69
|
// Dynamic import to avoid pulling state-store into the hot command-registration path.
|
|
70
|
+
// LAZY: defer dynamic import of ../state/state-store.ts to its call site.
|
|
70
71
|
const { loadRunManifestById } = await import("../state/state-store.ts");
|
|
71
72
|
const loaded = loadRunManifestById(resolvedCwd, runId);
|
|
72
73
|
if (!loaded) return null;
|
|
@@ -34,6 +34,7 @@ const CREW_SHORTCUTS: ReadonlyArray<ShortcutRegistration> = [
|
|
|
34
34
|
// (avoids pulling the full commands.ts dependency tree into every
|
|
35
35
|
// process that imports this module, e.g. the unit test).
|
|
36
36
|
handler: async (ctx) => {
|
|
37
|
+
// LAZY: defer dynamic import of ./registration/commands.ts to its call site.
|
|
37
38
|
const { openTeamSettingsOverlay } = await import("./registration/commands.ts");
|
|
38
39
|
await openTeamSettingsOverlay(ctx);
|
|
39
40
|
},
|
|
@@ -1129,6 +1129,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
|
|
|
1129
1129
|
// LAZY: state-store only needed in hasRunning; avoid at startup.
|
|
1130
1130
|
// Use dynamic import to avoid CJS/ESM mixed module issues.
|
|
1131
1131
|
const { loadRunManifestById: loadRunForHasRunning } =
|
|
1132
|
+
// LAZY: defer dynamic import of ../state/state-store.ts to its call site.
|
|
1132
1133
|
await import("../state/state-store.ts");
|
|
1133
1134
|
const loaded = loadRunForHasRunning(
|
|
1134
1135
|
currentCtx?.cwd ?? process.cwd(),
|
|
@@ -1494,6 +1495,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
|
|
|
1494
1495
|
const cwd = ctx.cwd ?? process.cwd();
|
|
1495
1496
|
const loaded = loadRunManifestById(cwd, runId);
|
|
1496
1497
|
if (loaded) {
|
|
1498
|
+
// LAZY: defer dynamic import of ../state/atomic-write.ts to its call site.
|
|
1497
1499
|
const { atomicWriteJson } = await import("../state/atomic-write.ts");
|
|
1498
1500
|
atomicWriteJson(loaded.manifest.stateRoot + "/manifest.json", {
|
|
1499
1501
|
...loaded.manifest,
|
|
@@ -202,11 +202,13 @@ export async function openTeamSettingsOverlay(ctx: ExtensionContext): Promise<vo
|
|
|
202
202
|
if (res.success) {
|
|
203
203
|
ctx.ui.notify(`Theme: ${value} (applied live)`, "info");
|
|
204
204
|
} else {
|
|
205
|
+
// LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
|
|
205
206
|
const { setPiTheme } = await import("../../ui/theme-discovery.ts");
|
|
206
207
|
setPiTheme(value);
|
|
207
208
|
ctx.ui.notify(`Theme saved as '${value}' but failed to apply: ${res.error ?? "unknown"}. Restart Pi.`, "warning");
|
|
208
209
|
}
|
|
209
210
|
} else {
|
|
211
|
+
// LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
|
|
210
212
|
const { setPiTheme } = await import("../../ui/theme-discovery.ts");
|
|
211
213
|
setPiTheme(value);
|
|
212
214
|
ctx.ui.notify(`Pi theme set to '${value}'. Restart Pi to apply.`, "info");
|
|
@@ -672,6 +674,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
|
|
|
672
674
|
pi.registerCommand("crew-brief", {
|
|
673
675
|
description: "Toggle brief tool output mode: on | off | status",
|
|
674
676
|
handler: async (args: string, ctx: ExtensionCommandContext) => {
|
|
677
|
+
// LAZY: defer dynamic import of ../../ui/tool-renderers/brief-mode.ts to its call site.
|
|
675
678
|
const { isBrief, setBrief, BRIEF_ENTRY_TYPE, makeBriefEntry } = await import("../../ui/tool-renderers/brief-mode.ts");
|
|
676
679
|
const trimmed = args.trim();
|
|
677
680
|
|
|
@@ -10,6 +10,7 @@ import { DEFAULT_PATHS } from "../../config/defaults.ts";
|
|
|
10
10
|
import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
|
|
11
11
|
import { getPiSpawnCommand } from "../../runtime/pi-spawn.ts";
|
|
12
12
|
import { getRuntimeWarmupStatus } from "../../runtime/runtime-warmup.ts";
|
|
13
|
+
import { scanZombieSubagents, formatZombieReport } from "../../runtime/zombie-scanner.ts";
|
|
13
14
|
import { validateResources } from "../validate-resources.ts";
|
|
14
15
|
import { detectDrift, formatDriftReport, type DriftReport } from "../../config/drift-detector.ts";
|
|
15
16
|
import { TeamToolParams } from "../../schema/team-tool-schema.ts";
|
|
@@ -237,6 +238,19 @@ export function buildTeamDoctorReport(input: TeamDoctorReportInput): TeamDoctorR
|
|
|
237
238
|
}
|
|
238
239
|
|
|
239
240
|
export function handleDoctor(ctx: TeamContext, params: TeamToolParamsValue = {}): PiTeamsToolResult {
|
|
241
|
+
// Sub-focus: zombie sub-agent scan. READ-ONLY — never kills. Returns a table of
|
|
242
|
+
// orphaned pi-crew sub-agents identified by the authoritative PI_CREW_KIND=subagent
|
|
243
|
+
// marker. The user's main session never carries that marker, so it can never appear.
|
|
244
|
+
if (params.focus === "zombies") {
|
|
245
|
+
const scan = scanZombieSubagents();
|
|
246
|
+
const text = formatZombieReport(scan);
|
|
247
|
+
return result(text, {
|
|
248
|
+
action: "doctor",
|
|
249
|
+
status: "ok",
|
|
250
|
+
data: { zombies: scan.zombies.length, live: scan.live.length, errors: scan.errors.length },
|
|
251
|
+
}, false);
|
|
252
|
+
}
|
|
253
|
+
|
|
240
254
|
const loadedConfig = loadConfig(ctx.cwd);
|
|
241
255
|
let smokeChildPi: { ok: boolean; detail: string } | undefined;
|
|
242
256
|
if (configRecord(params.config).smokeChildPi === true) {
|
|
@@ -269,6 +269,7 @@ async function handleStop(input: GoalSubActionInput): Promise<ReturnType<typeof
|
|
|
269
269
|
let cancelMsg = "";
|
|
270
270
|
if (updated.currentRunId) {
|
|
271
271
|
try {
|
|
272
|
+
// LAZY: defer dynamic import of ./cancel.ts to its call site.
|
|
272
273
|
const { handleCancel } = await import("./cancel.ts");
|
|
273
274
|
const cancelResult = await handleCancel({ action: "cancel", runId: updated.currentRunId, force: true, config: { intent: "user requested goal stop" } }, ctx);
|
|
274
275
|
cancelMsg = ` In-flight turn ${updated.currentRunId} cancel: ${(cancelResult.content[0] as { text?: string } | undefined)?.text ?? "ok"}.`;
|
|
@@ -34,6 +34,7 @@ import { expandParallelResearchWorkflow } from "../../runtime/parallel-research.
|
|
|
34
34
|
/**
|
|
35
35
|
* Module-scoped latch for the crew-init dynamic import.
|
|
36
36
|
*
|
|
37
|
+
* LAZY: the dynamic import of `crew-init.ts` is deferred to its call site.
|
|
37
38
|
* `crew-init.ts` is dynamically `await import()`'d from `handleRun` below, which
|
|
38
39
|
* N concurrent subagents hit simultaneously (every `team` tool call runs it).
|
|
39
40
|
* Under the tsx/jiti loader, concurrent first-imports race module-record
|
|
@@ -280,6 +281,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
280
281
|
workspaceMode: params.workspaceMode,
|
|
281
282
|
ownerSessionId: ctx.sessionId,
|
|
282
283
|
runKind: params.runKind,
|
|
284
|
+
args: params.args,
|
|
283
285
|
});
|
|
284
286
|
const goalArtifact = writeArtifact(paths.artifactsRoot, {
|
|
285
287
|
kind: "prompt",
|
|
@@ -296,6 +298,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
296
298
|
// orchestrates subagents via ctx.agent(); only ctx.setResult() reaches the main context.
|
|
297
299
|
// Placed AFTER manifest creation so runId/paths/artifactsRoot are available.
|
|
298
300
|
if (!directAgent && (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).runtime === "dynamic") {
|
|
301
|
+
// LAZY: defer dynamic import of ../../runtime/dynamic-workflow-runner.ts to its call site.
|
|
299
302
|
const { runDynamicWorkflow } = await import("../../runtime/dynamic-workflow-runner.ts");
|
|
300
303
|
// Re-synthesize a dynamic-team (§0c C9) for role resolution.
|
|
301
304
|
const dwfTeam: import("../../teams/team-config.ts").TeamConfig = {
|
|
@@ -321,6 +324,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
321
324
|
team: dwfTeam,
|
|
322
325
|
signal: ctx.signal ?? AbortSignal.timeout(3_600_000),
|
|
323
326
|
modelOverride: params.model,
|
|
327
|
+
tokenBudget: params.tokenBudget ?? (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).maxTokenBudget,
|
|
324
328
|
});
|
|
325
329
|
} catch (runnerError) {
|
|
326
330
|
// Round-11 runtime fix: persist manifest with status=failed when runner throws
|
|
@@ -323,11 +323,28 @@ async function main(): Promise<void> {
|
|
|
323
323
|
const origWrite =
|
|
324
324
|
(_prefix: string) =>
|
|
325
325
|
(data: unknown, ...args: unknown[]) => {
|
|
326
|
+
// FIX: Never let the in-process console redirect crash the background
|
|
327
|
+
// runner. If logFd is missing/invalid or the write fails, swallow the
|
|
328
|
+
// error silently — losing one debug line is far better than killing the
|
|
329
|
+
// scheduler (a previous version only redirected console.log/error, so
|
|
330
|
+
// console.debug/.warn still wrote to the original stdout/stderr pipe
|
|
331
|
+
// which is closed after the parent detaches, producing EPIPE → process
|
|
332
|
+
// crash mid-workflow → runs hang at 25% forever).
|
|
333
|
+
if (logFd === undefined) return;
|
|
326
334
|
const msg = [data, ...args].map(String).join(" ") + "\n";
|
|
327
|
-
|
|
335
|
+
try {
|
|
336
|
+
fs.writeSync(logFd, msg);
|
|
337
|
+
} catch {
|
|
338
|
+
/* best-effort: never crash the scheduler over a log write */
|
|
339
|
+
}
|
|
328
340
|
};
|
|
329
341
|
console.log = origWrite("OUT");
|
|
330
342
|
console.error = origWrite("ERR");
|
|
343
|
+
// FIX: Also redirect console.debug and console.warn — otherwise they still
|
|
344
|
+
// hit the original stdout/stderr pipe, which is closed once the parent
|
|
345
|
+
// process detaches, causing EPIPE unhandled errors that kill the scheduler.
|
|
346
|
+
console.debug = origWrite("DBG");
|
|
347
|
+
console.warn = origWrite("WARN");
|
|
331
348
|
// FIX: Close logFd on process exit to prevent file descriptor leak
|
|
332
349
|
process.on("exit", () => {
|
|
333
350
|
try {
|
|
@@ -558,8 +575,11 @@ async function main(): Promise<void> {
|
|
|
558
575
|
debugLog(`[background-runner] short-circuiting ${manifest.runKind} (synthetic team/workflow)`,
|
|
559
576
|
);
|
|
560
577
|
if (manifest.runKind === "goal-loop") {
|
|
578
|
+
// LAZY: defer dynamic import of ./goal-loop-runner.ts to its call site.
|
|
561
579
|
const { runGoalLoop } = await import("./goal-loop-runner.ts");
|
|
580
|
+
// LAZY: defer dynamic import of ./goal-state-store.ts to its call site.
|
|
562
581
|
const { GoalStore } = await import("./goal-state-store.ts");
|
|
582
|
+
// LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
|
|
563
583
|
const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
|
|
564
584
|
const store = new GoalStore(manifest.cwd);
|
|
565
585
|
const goalState = store.load(manifest.runId);
|
|
@@ -576,11 +596,13 @@ async function main(): Promise<void> {
|
|
|
576
596
|
saveRunManifest(finalGoalManifest);
|
|
577
597
|
earlyResult = { manifest: finalGoalManifest, tasks: goalResult.tasks };
|
|
578
598
|
} else {
|
|
599
|
+
// LAZY: defer dynamic import of ./dynamic-workflow-runner.ts to its call site.
|
|
579
600
|
const { runDynamicWorkflow } = await import("./dynamic-workflow-runner.ts");
|
|
601
|
+
// LAZY: defer dynamic import of ../workflows/discover-workflows.ts to its call site.
|
|
580
602
|
const { allWorkflows, discoverWorkflows } = await import("../workflows/discover-workflows.ts");
|
|
581
603
|
const wf = allWorkflows(discoverWorkflows(manifest.cwd)).find((w) => w.name === manifest.workflow);
|
|
582
604
|
if (!wf || wf.runtime !== "dynamic" || !wf.dynamicScript) throw new Error(`runKind="dynamic-workflow" but workflow '${manifest.workflow}' is not dynamic (runId=${manifest.runId})`);
|
|
583
|
-
const dwfResult = await runDynamicWorkflow({ manifest, workflow: wf as import("../workflows/workflow-config.ts").DynamicWorkflowConfig, signal: abortController.signal });
|
|
605
|
+
const dwfResult = await runDynamicWorkflow({ manifest, workflow: wf as import("../workflows/workflow-config.ts").DynamicWorkflowConfig, signal: abortController.signal, tokenBudget: wf.maxTokenBudget });
|
|
584
606
|
saveRunManifest(dwfResult.manifest);
|
|
585
607
|
earlyResult = dwfResult;
|
|
586
608
|
}
|
|
@@ -246,6 +246,7 @@ export class ChainRunner {
|
|
|
246
246
|
|
|
247
247
|
// Emit progress event if eventsPath provided
|
|
248
248
|
if (eventsPath) {
|
|
249
|
+
// LAZY: defer dynamic import of ../state/event-log.ts to its call site.
|
|
249
250
|
const { appendEventAsync } = await import("../state/event-log.ts");
|
|
250
251
|
await appendEventAsync(eventsPath, {
|
|
251
252
|
type: "chain.step_completed",
|
package/src/runtime/child-pi.ts
CHANGED
|
@@ -95,6 +95,17 @@ export function killProcessPid(pid: number): void {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
function killProcessTree(pid: number | undefined, child?: ChildProcess): void {
|
|
98
|
+
// Phase-0 diagnostic (HB-003a): capture who invoked killProcessTree so the
|
|
99
|
+
// exit-null race has a provenance trail. .stack is best-effort (may be undefined
|
|
100
|
+
// under deep async), so we fall back to a "(no stack)" placeholder.
|
|
101
|
+
try {
|
|
102
|
+
const callerStack = new Error("killProcessTree caller").stack ?? "(no stack)";
|
|
103
|
+
logInternalError(
|
|
104
|
+
"child-pi.kill-process-tree-invoked",
|
|
105
|
+
new Error(`pid=${pid} called from:\n${callerStack.split("\n").slice(0, 8).join("\n")}`),
|
|
106
|
+
`pid=${pid}`,
|
|
107
|
+
);
|
|
108
|
+
} catch { /* diagnostic best-effort */ }
|
|
98
109
|
if (!pid || !Number.isInteger(pid) || pid <= 0) return;
|
|
99
110
|
if (child && child.exitCode !== null) return;
|
|
100
111
|
killProcessPid(pid);
|
|
@@ -124,6 +135,18 @@ export interface ChildPiLifecycleEvent {
|
|
|
124
135
|
stderrExcerpt?: string;
|
|
125
136
|
/** Timestamp (ISO). */
|
|
126
137
|
ts: string;
|
|
138
|
+
/** Phase-0 diagnostic (HB-003a): the signal that killed the child (when
|
|
139
|
+
* available). Was previously discarded after building the error string. */
|
|
140
|
+
signal?: string;
|
|
141
|
+
/** Phase-0 diagnostic (HB-003a): final-drain race timing, present only on
|
|
142
|
+
* exit events where a drain timer was armed. Surfaces the exit-null race. */
|
|
143
|
+
diagnostic?: {
|
|
144
|
+
finalDrainArmed: boolean;
|
|
145
|
+
forcedFinalDrain: boolean;
|
|
146
|
+
finalDrainFiredMonotonicMs?: number;
|
|
147
|
+
finalAssistantEventMonotonicMs?: number;
|
|
148
|
+
exitMonotonicMs: number;
|
|
149
|
+
};
|
|
127
150
|
}
|
|
128
151
|
|
|
129
152
|
export interface ChildPiRunInput {
|
|
@@ -267,6 +290,9 @@ export function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): S
|
|
|
267
290
|
"PI_CREW_MAX_DEPTH",
|
|
268
291
|
"PI_CREW_INHERIT_PROJECT_CONTEXT",
|
|
269
292
|
"PI_CREW_INHERIT_SKILLS",
|
|
293
|
+
// PI_CREW_KIND marks this process as a crew sub-agent (vs the user's main session).
|
|
294
|
+
// `team action='doctor' focus='zombies'` matches it to safely list orphaned sub-agents only.
|
|
295
|
+
"PI_CREW_KIND",
|
|
270
296
|
// PI_CREW_PARENT_PID is needed by child-pi's parent-guard (uses
|
|
271
297
|
// process.kill(pid, 0) liveness check). The PID is not a secret.
|
|
272
298
|
"PI_CREW_PARENT_PID",
|
|
@@ -577,6 +603,15 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
577
603
|
let noResponseTimer: NodeJS.Timeout | undefined;
|
|
578
604
|
const finalDrainMs = input.finalDrainMs ?? FINAL_DRAIN_MS;
|
|
579
605
|
const hardKillMs = input.hardKillMs ?? HARD_KILL_MS;
|
|
606
|
+
// Phase-0 diagnostic (HB-003a): track the final-drain race that produces
|
|
607
|
+
// `exit null` for ctx.agent({disableTools:true}). These vars are READ-ONLY
|
|
608
|
+
// instrumentation — no behavior change. finalDrainArmed lets the close
|
|
609
|
+
// handler know a drain timer existed even after clearFinalDrainTimers() ran;
|
|
610
|
+
// spawnMonotonicMs gives us relative timing to distinguish a race from a crash.
|
|
611
|
+
let finalDrainArmed = false;
|
|
612
|
+
let finalDrainFiredMonotonicMs: number | undefined;
|
|
613
|
+
const spawnMonotonicMs = performance.now();
|
|
614
|
+
let finalAssistantEventMonotonicMs: number | undefined;
|
|
580
615
|
// FIX (Round 14): Bound the env-controlled response timeout to
|
|
581
616
|
// [1_000ms, 3_600_000ms] (1s–1h) so a hostile or accidental value
|
|
582
617
|
// (e.g. 1, or 999_999_999) cannot disable the timeout or cause
|
|
@@ -680,20 +715,27 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
680
715
|
if (maxTurns !== undefined && !softLimitReached && turnCount >= maxTurns) {
|
|
681
716
|
softLimitReached = true;
|
|
682
717
|
// Inject steer via stdin to tell child to wrap up.
|
|
683
|
-
//
|
|
684
|
-
//
|
|
685
|
-
//
|
|
718
|
+
// Steer injection is ADVISORY: it asks the worker to wrap up. The real
|
|
719
|
+
// enforcement is the hard-abort at maxTurns + graceTurns (below). So a
|
|
720
|
+
// failed/non-writable stdin must NOT kill the worker — that destroys a
|
|
721
|
+
// valid answer already in stdout (Phase-0 root cause of the
|
|
722
|
+
// disableTools/maxTurns:1 exit-null bug). Just log + let the hard-abort
|
|
723
|
+
// path handle a genuinely runaway worker.
|
|
686
724
|
if (child.stdin?.writable) {
|
|
687
725
|
const steerPayload = JSON.stringify({ type: "steer", message: "You have reached your turn limit. Wrap up immediately — provide your final answer now." }) + "\n";
|
|
688
726
|
const writeSucceeded = child.stdin.write(steerPayload);
|
|
689
727
|
if (!writeSucceeded) {
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
728
|
+
// Normal Node backpressure: the payload is buffered and will flush on
|
|
729
|
+
// 'drain'. NOT a failure — do NOT kill the worker. The steer is
|
|
730
|
+
// advisory; if the worker ignores it and runs past maxTurns +
|
|
731
|
+
// graceTurns, the hard-abort below terminates it.
|
|
732
|
+
logInternalError("child-pi.steer-backpressure", new Error("stdin write returned false (normal backpressure); steer buffered, worker NOT killed"), `pid=${child.pid}`);
|
|
693
733
|
}
|
|
694
734
|
} else {
|
|
695
|
-
|
|
696
|
-
|
|
735
|
+
// stdin closed (worker already finished) or otherwise unwritable.
|
|
736
|
+
// Also advisory — the worker is done or nearly done; let it exit
|
|
737
|
+
// naturally. Hard-abort remains the safety net for true runaways.
|
|
738
|
+
logInternalError("child-pi.steer-not-writable", new Error("stdin not writable when attempting steer injection (worker may be done); worker NOT killed"), `pid=${child.pid}`);
|
|
697
739
|
}
|
|
698
740
|
} else if (maxTurns !== undefined && softLimitReached && turnCount >= maxTurns + (graceTurns ?? 5)) {
|
|
699
741
|
// Hard abort — terminate after grace turns
|
|
@@ -708,9 +750,12 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
708
750
|
}
|
|
709
751
|
input.onJsonEvent?.(event);
|
|
710
752
|
if (!isFinalAssistantEvent(event) || childExited || settled || finalDrainTimer) return;
|
|
753
|
+
finalAssistantEventMonotonicMs = performance.now();
|
|
754
|
+
finalDrainArmed = true; // Phase-0 diagnostic: track that a drain timer was created.
|
|
711
755
|
finalDrainTimer = setTimeout(() => {
|
|
712
756
|
if (settled || childExited) return;
|
|
713
757
|
forcedFinalDrain = true;
|
|
758
|
+
finalDrainFiredMonotonicMs = performance.now(); // Phase-0 diagnostic: race timing.
|
|
714
759
|
input.onLifecycleEvent?.({ type: "final_drain", pid: child.pid, ts: new Date().toISOString() });
|
|
715
760
|
try {
|
|
716
761
|
child.kill(process.platform === "win32" ? undefined : "SIGTERM");
|
|
@@ -765,7 +810,27 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
765
810
|
}
|
|
766
811
|
// Catch all errors from settle to prevent unhandled rejection from propagating
|
|
767
812
|
try {
|
|
768
|
-
resolve({
|
|
813
|
+
resolve({
|
|
814
|
+
...result,
|
|
815
|
+
exitStatus: result.exitStatus ?? {
|
|
816
|
+
exitCode: result.exitCode,
|
|
817
|
+
cancelled: abortRequested,
|
|
818
|
+
timedOut: responseTimeoutHit,
|
|
819
|
+
killed: hardKilled,
|
|
820
|
+
// Phase-0 diagnostic (HB-003a): surface the final-drain race state.
|
|
821
|
+
// finalDrainArmed lets Phase 1 decide whether a signal-death (exitCode=null)
|
|
822
|
+
// should be treated as a forced final drain. READ-ONLY for now.
|
|
823
|
+
...(finalDrainArmed || forcedFinalDrain
|
|
824
|
+
? {
|
|
825
|
+
finalDrainArmed,
|
|
826
|
+
forcedFinalDrain,
|
|
827
|
+
finalDrainFiredMonotonicMs,
|
|
828
|
+
}
|
|
829
|
+
: {}),
|
|
830
|
+
cleanupErrors,
|
|
831
|
+
finalDrainMs,
|
|
832
|
+
},
|
|
833
|
+
});
|
|
769
834
|
} catch (resolveError) {
|
|
770
835
|
logInternalError("child-pi.settle-resolve", resolveError, `result=${JSON.stringify({ exitCode: result.exitCode })}`);
|
|
771
836
|
}
|
|
@@ -866,7 +931,30 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
866
931
|
rejectPendingOperations(exitError);
|
|
867
932
|
}
|
|
868
933
|
try {
|
|
869
|
-
|
|
934
|
+
// Phase-0 diagnostic (HB-003a): capture signal + drain timing in the
|
|
935
|
+
// exit lifecycle event so the exit-null race is diagnosable instead of
|
|
936
|
+
// opaque. `signal` was previously discarded after building the error msg.
|
|
937
|
+
input.onLifecycleEvent?.({
|
|
938
|
+
type: "exit",
|
|
939
|
+
pid: child.pid,
|
|
940
|
+
exitCode: code,
|
|
941
|
+
ts: new Date().toISOString(),
|
|
942
|
+
error: exitError?.message,
|
|
943
|
+
stderrExcerpt: isUnexpectedExit ? stderr.slice(-1000) || undefined : undefined,
|
|
944
|
+
// Phase-0 diagnostic fields (declared optional on ChildPiLifecycleEvent, so existing consumers are unaffected).
|
|
945
|
+
...(signal ? { signal } : {}),
|
|
946
|
+
...(finalDrainArmed || forcedFinalDrain
|
|
947
|
+
? {
|
|
948
|
+
diagnostic: {
|
|
949
|
+
finalDrainArmed,
|
|
950
|
+
forcedFinalDrain,
|
|
951
|
+
finalDrainFiredMonotonicMs,
|
|
952
|
+
finalAssistantEventMonotonicMs,
|
|
953
|
+
exitMonotonicMs: performance.now() - spawnMonotonicMs,
|
|
954
|
+
},
|
|
955
|
+
}
|
|
956
|
+
: {}),
|
|
957
|
+
});
|
|
870
958
|
} catch (err) {
|
|
871
959
|
logInternalError("child-pi.on-lifecycle-event", err, `event=exit, pid=${child.pid}`);
|
|
872
960
|
}
|
|
@@ -902,6 +990,9 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
902
990
|
const finalExitCode = forcedFinalDrain && !timeoutError ? 0 : exitCode;
|
|
903
991
|
const wasGraceAborted = softLimitReached && turnCount >= (maxTurns ?? 0) + (graceTurns ?? 5);
|
|
904
992
|
const wasParentAborted = abortDueToParentSignal && !wasGraceAborted;
|
|
993
|
+
// steerInjectionFailed is now always false (Phase-1 fix: steer backpressure
|
|
994
|
+
// is logged, not fatal). The steerError branch is retained for safety in
|
|
995
|
+
// case a future change reintroduces a fatal steer path.
|
|
905
996
|
const steerError = steerInjectionFailed ? "Steer injection failed due to stdin backpressure; process killed" : undefined;
|
|
906
997
|
settle({ exitCode: finalExitCode, stdout, stderr, ...(timeoutError ? { error: timeoutError.error } : {}), ...(steerError ? { error: steerError } : {}), aborted: wasGraceAborted || wasParentAborted, steered: softLimitReached && !wasGraceAborted, exitStatus: { exitCode: finalExitCode, cancelled: abortRequested, timedOut: responseTimeoutHit, killed: hardKilled, cleanupErrors, finalDrainMs } });
|
|
907
998
|
});
|