pi-crew 0.9.4 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +98 -0
- package/README.md +54 -2
- package/docs/troubleshooting.md +26 -0
- package/package.json +1 -1
- package/src/extension/command-completions.ts +1 -0
- package/src/extension/crew-shortcuts.ts +1 -0
- package/src/extension/register.ts +2 -0
- package/src/extension/registration/commands.ts +3 -0
- package/src/extension/team-tool/goal.ts +1 -0
- package/src/extension/team-tool/run.ts +2 -0
- package/src/runtime/background-runner.ts +23 -1
- package/src/runtime/chain-runner.ts +1 -0
- package/src/runtime/crash-recovery.ts +78 -36
- package/src/runtime/dynamic-workflow-runner.ts +1 -0
- package/src/runtime/goal-loop-runner.ts +2 -0
- package/src/runtime/live-session-runtime.ts +1 -0
- package/src/runtime/model-scope.ts +1 -0
- package/src/runtime/peer-dep.ts +1 -0
- package/src/runtime/resilient-edit.ts +1 -0
- package/src/runtime/task-runner.ts +1 -0
- package/src/state/hook-instinct-bridge.ts +3 -0
- package/src/utils/bm25-search.ts +2 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,103 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [v0.9.5] — fix "team run hangs forever at 25%" (2026-06-23)
|
|
4
|
+
|
|
5
|
+
Two coupled runtime bugs caused the recurring "run stuck at 25% (1/4)" failure
|
|
6
|
+
observed across 4+ consecutive review/fast-fix runs. Both are now fixed; full
|
|
7
|
+
diagnostics (background.log, events.jsonl, heartbeat.json) are preserved for
|
|
8
|
+
all runs.
|
|
9
|
+
|
|
10
|
+
### Bug X — `purgeStaleActiveRunIndex` destroyed the run's stateRoot (proximate cause)
|
|
11
|
+
|
|
12
|
+
**File:** `src/runtime/crash-recovery.ts`
|
|
13
|
+
|
|
14
|
+
**What was wrong:** `purgeStaleActiveRunIndex` decided whether a run was
|
|
15
|
+
"orphaned" using `entry.updatedAt`, which is **frozen at registration** and
|
|
16
|
+
never refreshed during execution. A long-running legitimate async run whose
|
|
17
|
+
background worker had exited (e.g. after a 5–15 min explorer) would have its
|
|
18
|
+
entire durable state (manifest/tasks/events/heartbeat) hard-deleted. Because
|
|
19
|
+
`saveRunTasks()` silently no-ops once the state dir is missing, the workflow
|
|
20
|
+
could never advance past the current task → **permanent invisible hang**
|
|
21
|
+
("Run not found"), with all diagnostics lost.
|
|
22
|
+
|
|
23
|
+
**Fix:**
|
|
24
|
+
- Liveness is now corroborated via (a) the on-disk `manifest.updatedAt` (rewritten
|
|
25
|
+
on every task transition) and (b) the team-level `heartbeat.json` mtime —
|
|
26
|
+
either of which is sufficient to declare the run live.
|
|
27
|
+
- Cancelling a run now **keeps its stateRoot** so the run stays queryable and
|
|
28
|
+
resumable, and its diagnostics survive. The finished-run pruner removes the
|
|
29
|
+
directory later on its normal schedule.
|
|
30
|
+
- Removed two redundant `saveRunManifest(fullLoaded.manifest)` calls that
|
|
31
|
+
were clobbering the freshly-saved `cancelled` status back to `running`.
|
|
32
|
+
|
|
33
|
+
**New regression test:** `test/unit/crash-recovery-purge-liveness.test.ts`
|
|
34
|
+
(3 cases: fresh manifest kept, orphan cancelled-but-preserved, fresh
|
|
35
|
+
heartbeat kept — all using a live-worker-then-reap + `now`-time-shift
|
|
36
|
+
harness to deterministically simulate the registration-then-aging race).
|
|
37
|
+
|
|
38
|
+
### Bug Y — background runner crashed with EPIPE on the first post-detach `console.debug` (root cause)
|
|
39
|
+
|
|
40
|
+
**File:** `src/runtime/background-runner.ts`
|
|
41
|
+
|
|
42
|
+
**What was wrong:** The in-process console redirect only covered `console.log`
|
|
43
|
+
and `console.error`; `console.debug` and `console.warn` still wrote to the
|
|
44
|
+
original stdout/stderr pipes. The background runner is spawned with
|
|
45
|
+
`detached:true` + `setsid:true`, so the parent disconnects the stdio pipes
|
|
46
|
+
immediately after spawn. The first post-detach `console.debug` call from
|
|
47
|
+
`team-runner.ts:242` (inside `mergeTaskUpdatesPreservingTerminal` →
|
|
48
|
+
"Skipping stale merge") hit the closed stdout → unhandled `EPIPE` error →
|
|
49
|
+
**process exit** → scheduler dead → run stuck at 25% forever.
|
|
50
|
+
|
|
51
|
+
Prior investigators saw only "the run died silently right after explorer
|
|
52
|
+
completed" and concluded (incorrectly) that the cause was a native crash
|
|
53
|
+
(SIGKILL/segfault/V8 heap-OOM), because their [DIAG] handlers never fired.
|
|
54
|
+
In reality the diagnostic handlers DID fire — but on an `EPIPE` write error,
|
|
55
|
+
which `process.on('error')` doesn't catch. The fix below makes the crash
|
|
56
|
+
observable AND non-fatal.
|
|
57
|
+
|
|
58
|
+
**Fix:**
|
|
59
|
+
- Extend the console redirect to also cover `console.debug` and `console.warn`,
|
|
60
|
+
so they go to the log file (logFd) instead of the disconnected stdio pipes.
|
|
61
|
+
- Wrap the `fs.writeSync` in try-catch so any log-write failure (closed fd,
|
|
62
|
+
ENOSPC, etc.) can never crash the scheduler. The scheduler log is
|
|
63
|
+
best-effort by design.
|
|
64
|
+
|
|
65
|
+
**New regression test:** `test/unit/background-runner-console-redirect.test.ts`
|
|
66
|
+
(4 cases: undefined logFd no-op, valid logFd writes correctly, EBADF on
|
|
67
|
+
closed logFd is swallowed, post-undefined fd-toggle is safe). Replicates the
|
|
68
|
+
`origWrite` pattern from the source so any drift between the two is easy to
|
|
69
|
+
spot.
|
|
70
|
+
|
|
71
|
+
### Why this took multiple attempts
|
|
72
|
+
|
|
73
|
+
All prior attempts to diagnose the hang destroyed the only evidence (the
|
|
74
|
+
stateRoot) the moment the `purgeStaleActiveRunIndex` heuristic misfired.
|
|
75
|
+
The chain was always the same: a worker exits for any reason → purge sees
|
|
76
|
+
dead PID + frozen-stale entry → **deletes stateRoot** → the run becomes
|
|
77
|
+
"Run not found" with no log, no events, no heartbeat, no way to even resume.
|
|
78
|
+
That hid the real cause (Bug Y) for the entire series of failed diagnostic
|
|
79
|
+
runs. With Bug X fixed, the diagnostic trail (background.log 345 KB +
|
|
80
|
+
events.jsonl 166 KB) survives long enough to read the actual EPIPE crash
|
|
81
|
+
that Bug Y left behind.
|
|
82
|
+
|
|
83
|
+
### Verification
|
|
84
|
+
|
|
85
|
+
- 7/7 new regression tests pass (`crash-recovery-purge-liveness.test.ts` +
|
|
86
|
+
`background-runner-console-redirect.test.ts`).
|
|
87
|
+
- Existing crash-recovery / active-run-registry / stale-reconciler /
|
|
88
|
+
async-stale / run-accumulation / auto-recovery suites: 71/71 pass.
|
|
89
|
+
- End-to-end: a 4-step review run now advances 3/4 tasks (75%) instead of
|
|
90
|
+
hanging at 25%; the verify step that would have failed earlier now fails
|
|
91
|
+
only for environmental reasons (memory OOM under load), not because of the fix.
|
|
92
|
+
- `npx tsc --noEmit` is green.
|
|
93
|
+
|
|
94
|
+
### Notes for users
|
|
95
|
+
|
|
96
|
+
If you have a stuck "running" run from v0.9.4 or earlier (the symptom was
|
|
97
|
+
"Run not found" / "25% hang" / "had to kill pi"), upgrading alone will not
|
|
98
|
+
recover it — its `stateRoot` was already destroyed by the buggy purge.
|
|
99
|
+
Re-dispatch the workflow. New runs are fully protected.
|
|
100
|
+
|
|
3
101
|
## [v0.9.4] — fix macOS CI: benchmark allowlist + cross-platform fixtures (2026-06-23)
|
|
4
102
|
|
|
5
103
|
Patch fix for a CI failure introduced in v0.9.3 (caught by the macOS CI job,
|
package/README.md
CHANGED
|
@@ -39,13 +39,65 @@ npm: pi-crew
|
|
|
39
39
|
repo: https://github.com/baphuongna/pi-crew
|
|
40
40
|
```
|
|
41
41
|
|
|
42
|
-
**v0.9.
|
|
42
|
+
**v0.9.4 / v0.9.5**: See [CHANGELOG.md](CHANGELOG.md).
|
|
43
43
|
|
|
44
|
-
### Highlights (v0.6.4 → v0.9.
|
|
44
|
+
### Highlights (v0.6.4 → v0.9.5)
|
|
45
45
|
|
|
46
46
|
A long arc of **trust, cliff-resilience, and robustness** work. Principle: *build
|
|
47
47
|
trust and cliff-resilience, stay lean, delete before adding.*
|
|
48
48
|
|
|
49
|
+
#### v0.9.5 — fix "team run hangs forever at 25%" (2026-06-23)
|
|
50
|
+
Two coupled runtime bugs caused recurring "run stuck at 25% (1/4)" failures
|
|
51
|
+
across 4+ consecutive review/fast-fix runs. The combined symptom: scheduler
|
|
52
|
+
appears to stop responding right after the first task (explorer) finishes, no
|
|
53
|
+
progress to task 2, and `team action='status'` returns "Run not found" with
|
|
54
|
+
**no diagnostic trail** to investigate. Manual `kill` of the parent `pi`
|
|
55
|
+
process was the only workaround.
|
|
56
|
+
|
|
57
|
+
- **🩹 Bug X (proximate cause)** — `purgeStaleActiveRunIndex`
|
|
58
|
+
(`src/runtime/crash-recovery.ts`) destroyed a run's `stateRoot` based on a
|
|
59
|
+
**frozen** `entry.updatedAt` (set once at registration, never refreshed).
|
|
60
|
+
Any long-running legitimate async run (≥5 min) whose worker had exited
|
|
61
|
+
lost its entire durable state. `saveRunTasks()` then silently no-op'd on
|
|
62
|
+
the missing dir, and the workflow could never advance. Fix: corroborate
|
|
63
|
+
liveness via either the on-disk `manifest.updatedAt` or the team-level
|
|
64
|
+
`heartbeat.json`; keep `stateRoot` on cancel so runs stay queryable and
|
|
65
|
+
resumable.
|
|
66
|
+
- **🩹 Bug Y (root cause — why the scheduler died in the first place)** —
|
|
67
|
+
`src/runtime/background-runner.ts` redirected only `console.log` /
|
|
68
|
+
`console.error` to the log file. The first post-detach `console.debug`
|
|
69
|
+
call from `team-runner.ts:242` (inside `mergeTaskUpdatesPreservingTerminal`
|
|
70
|
+
→ "Skipping stale merge") hit the disconnected stdout pipe → unhandled
|
|
71
|
+
`EPIPE` → process exit. Prior investigators concluded (incorrectly) that
|
|
72
|
+
the cause was a native crash, because diagnostic `[DIAG]` handlers never
|
|
73
|
+
fired on the EPIPE. Fix: extend the console redirect to `console.debug` /
|
|
74
|
+
`console.warn`, and wrap `fs.writeSync` in try-catch so any log-write
|
|
75
|
+
failure can never crash the scheduler.
|
|
76
|
+
- **🧪 Regression coverage** — 7 new tests: 3 in
|
|
77
|
+
`test/unit/crash-recovery-purge-liveness.test.ts` (fresh-manifest-kept,
|
|
78
|
+
orphan-cancelled-preserved, fresh-heartbeat-kept) + 4 in
|
|
79
|
+
`test/unit/background-runner-console-redirect.test.ts` (drift-detector
|
|
80
|
+
pattern that exercises undefined / valid / EBADF / post-toggle logFd).
|
|
81
|
+
- **📖 See [CHANGELOG.md](CHANGELOG.md) for full details**, including
|
|
82
|
+
why prior attempts to diagnose the hang kept destroying the only
|
|
83
|
+
evidence (Bug X nuked the stateRoot before anyone could read the EPIPE
|
|
84
|
+
crash in Bug Y).
|
|
85
|
+
|
|
86
|
+
> **Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
|
|
87
|
+
> those runs is already gone. Re-dispatch the workflow — new runs are
|
|
88
|
+
> fully protected.
|
|
89
|
+
|
|
90
|
+
#### v0.9.4 — macOS CI fixture (2026-06-23)
|
|
91
|
+
- **🧪 BSD-vs-GNU grep fix** — benchmark test fixtures used
|
|
92
|
+
`grep --help` (exits 0 on GNU/Linux, exits 2 on BSD/macOS). Switched
|
|
93
|
+
the exit-0 fixture to `echo ok`; the not-in-allowlist fixture is now
|
|
94
|
+
`ls`. CI matrix is now green on all 3 OSes.
|
|
95
|
+
- **📌 Process note** — this release re-commits to: **tag/publish ONLY
|
|
96
|
+
after the full OS matrix CI is green.** v0.9.3 was published mid-CI-run
|
|
97
|
+
(the macOS job hadn't finished); the package itself was correct (the
|
|
98
|
+
broken file is test-only and not shipped), but the repo CI went red.
|
|
99
|
+
v0.9.4 restores green CI. v0.9.5 follows the same discipline.
|
|
100
|
+
|
|
49
101
|
#### v0.9.0 — goal loops + dynamic workflows (2026-06-18)
|
|
50
102
|
Two new features, both modeled on Claude Code, built on a shared `runKind`
|
|
51
103
|
background-dispatch discriminator.
|
package/docs/troubleshooting.md
CHANGED
|
@@ -74,6 +74,32 @@ team action='cancel' runId=… # cancel a truly-dead run
|
|
|
74
74
|
|
|
75
75
|
The error message explains the heartbeat mechanism + remediation.
|
|
76
76
|
|
|
77
|
+
### "Run not found" but `team list` shows it / scheduler appears frozen at 25%
|
|
78
|
+
|
|
79
|
+
**Symptom:** an async `team action='run'` (e.g. a review) gets through the
|
|
80
|
+
first task (e.g. explorer), then the scheduler appears to stop responding.
|
|
81
|
+
`team action='status' runId=…` returns `Run not found`; the run's
|
|
82
|
+
`stateRoot` (in `<project>/.crew/state/runs/<runId>/`) is missing. TUI
|
|
83
|
+
progress shows the run stuck at the same task percentage forever, and the
|
|
84
|
+
only workaround was killing the parent `pi` process.
|
|
85
|
+
|
|
86
|
+
**This was the symptom in v0.9.4 and earlier**, caused by two coupled runtime bugs:
|
|
87
|
+
|
|
88
|
+
- **Bug X** (proximate): `purgeStaleActiveRunIndex` destroyed the
|
|
89
|
+
`stateRoot` of long-running legitimate async runs based on a frozen
|
|
90
|
+
`entry.updatedAt` (set at registration, never refreshed).
|
|
91
|
+
- **Bug Y** (root cause): the bg-runner crashed with an unhandled `EPIPE`
|
|
92
|
+
on the first `console.debug` after the parent detached its stdio pipes.
|
|
93
|
+
|
|
94
|
+
**Fixed in v0.9.5** (see [CHANGELOG.md](../CHANGELOG.md#v095--fix-team-run-hangs-forever-at-25-2026-06-23)).
|
|
95
|
+
With the fix, a long-running run is no longer falsely purged, and even if the
|
|
96
|
+
bg-runner dies, the `stateRoot`, `background.log`, `events.jsonl`, and
|
|
97
|
+
`heartbeat.json` survive — runs stay queryable and resumable.
|
|
98
|
+
|
|
99
|
+
**Recovering a stuck run from v0.9.4 or earlier:** the `stateRoot` for
|
|
100
|
+
those runs is already gone (Bug X nuked it). Re-dispatch the workflow. New
|
|
101
|
+
runs on v0.9.5+ are fully protected.
|
|
102
|
+
|
|
77
103
|
## Model fallback exhausted
|
|
78
104
|
|
|
79
105
|
**Symptom:** `All N candidates exhausted (tried: a → b → c)`.
|
package/package.json
CHANGED
|
@@ -67,6 +67,7 @@ export function suggestRunIds(_prefix: string, cwd?: string): AutocompleteItem[]
|
|
|
67
67
|
export async function suggestTaskIds(runId: string, prefix: string, cwd?: string): Promise<AutocompleteItem[] | null> {
|
|
68
68
|
const resolvedCwd = cwd ?? process.cwd();
|
|
69
69
|
// Dynamic import to avoid pulling state-store into the hot command-registration path.
|
|
70
|
+
// LAZY: defer dynamic import of ../state/state-store.ts to its call site.
|
|
70
71
|
const { loadRunManifestById } = await import("../state/state-store.ts");
|
|
71
72
|
const loaded = loadRunManifestById(resolvedCwd, runId);
|
|
72
73
|
if (!loaded) return null;
|
|
@@ -34,6 +34,7 @@ const CREW_SHORTCUTS: ReadonlyArray<ShortcutRegistration> = [
|
|
|
34
34
|
// (avoids pulling the full commands.ts dependency tree into every
|
|
35
35
|
// process that imports this module, e.g. the unit test).
|
|
36
36
|
handler: async (ctx) => {
|
|
37
|
+
// LAZY: defer dynamic import of ./registration/commands.ts to its call site.
|
|
37
38
|
const { openTeamSettingsOverlay } = await import("./registration/commands.ts");
|
|
38
39
|
await openTeamSettingsOverlay(ctx);
|
|
39
40
|
},
|
|
@@ -1129,6 +1129,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
|
|
|
1129
1129
|
// LAZY: state-store only needed in hasRunning; avoid at startup.
|
|
1130
1130
|
// Use dynamic import to avoid CJS/ESM mixed module issues.
|
|
1131
1131
|
const { loadRunManifestById: loadRunForHasRunning } =
|
|
1132
|
+
// LAZY: defer dynamic import of ../state/state-store.ts to its call site.
|
|
1132
1133
|
await import("../state/state-store.ts");
|
|
1133
1134
|
const loaded = loadRunForHasRunning(
|
|
1134
1135
|
currentCtx?.cwd ?? process.cwd(),
|
|
@@ -1494,6 +1495,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
|
|
|
1494
1495
|
const cwd = ctx.cwd ?? process.cwd();
|
|
1495
1496
|
const loaded = loadRunManifestById(cwd, runId);
|
|
1496
1497
|
if (loaded) {
|
|
1498
|
+
// LAZY: defer dynamic import of ../state/atomic-write.ts to its call site.
|
|
1497
1499
|
const { atomicWriteJson } = await import("../state/atomic-write.ts");
|
|
1498
1500
|
atomicWriteJson(loaded.manifest.stateRoot + "/manifest.json", {
|
|
1499
1501
|
...loaded.manifest,
|
|
@@ -202,11 +202,13 @@ export async function openTeamSettingsOverlay(ctx: ExtensionContext): Promise<vo
|
|
|
202
202
|
if (res.success) {
|
|
203
203
|
ctx.ui.notify(`Theme: ${value} (applied live)`, "info");
|
|
204
204
|
} else {
|
|
205
|
+
// LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
|
|
205
206
|
const { setPiTheme } = await import("../../ui/theme-discovery.ts");
|
|
206
207
|
setPiTheme(value);
|
|
207
208
|
ctx.ui.notify(`Theme saved as '${value}' but failed to apply: ${res.error ?? "unknown"}. Restart Pi.`, "warning");
|
|
208
209
|
}
|
|
209
210
|
} else {
|
|
211
|
+
// LAZY: defer dynamic import of ../../ui/theme-discovery.ts to its call site.
|
|
210
212
|
const { setPiTheme } = await import("../../ui/theme-discovery.ts");
|
|
211
213
|
setPiTheme(value);
|
|
212
214
|
ctx.ui.notify(`Pi theme set to '${value}'. Restart Pi to apply.`, "info");
|
|
@@ -672,6 +674,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
|
|
|
672
674
|
pi.registerCommand("crew-brief", {
|
|
673
675
|
description: "Toggle brief tool output mode: on | off | status",
|
|
674
676
|
handler: async (args: string, ctx: ExtensionCommandContext) => {
|
|
677
|
+
// LAZY: defer dynamic import of ../../ui/tool-renderers/brief-mode.ts to its call site.
|
|
675
678
|
const { isBrief, setBrief, BRIEF_ENTRY_TYPE, makeBriefEntry } = await import("../../ui/tool-renderers/brief-mode.ts");
|
|
676
679
|
const trimmed = args.trim();
|
|
677
680
|
|
|
@@ -269,6 +269,7 @@ async function handleStop(input: GoalSubActionInput): Promise<ReturnType<typeof
|
|
|
269
269
|
let cancelMsg = "";
|
|
270
270
|
if (updated.currentRunId) {
|
|
271
271
|
try {
|
|
272
|
+
// LAZY: defer dynamic import of ./cancel.ts to its call site.
|
|
272
273
|
const { handleCancel } = await import("./cancel.ts");
|
|
273
274
|
const cancelResult = await handleCancel({ action: "cancel", runId: updated.currentRunId, force: true, config: { intent: "user requested goal stop" } }, ctx);
|
|
274
275
|
cancelMsg = ` In-flight turn ${updated.currentRunId} cancel: ${(cancelResult.content[0] as { text?: string } | undefined)?.text ?? "ok"}.`;
|
|
@@ -34,6 +34,7 @@ import { expandParallelResearchWorkflow } from "../../runtime/parallel-research.
|
|
|
34
34
|
/**
|
|
35
35
|
* Module-scoped latch for the crew-init dynamic import.
|
|
36
36
|
*
|
|
37
|
+
* LAZY: the dynamic import of `crew-init.ts` is deferred to its call site.
|
|
37
38
|
* `crew-init.ts` is dynamically `await import()`'d from `handleRun` below, which
|
|
38
39
|
* N concurrent subagents hit simultaneously (every `team` tool call runs it).
|
|
39
40
|
* Under the tsx/jiti loader, concurrent first-imports race module-record
|
|
@@ -296,6 +297,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
296
297
|
// orchestrates subagents via ctx.agent(); only ctx.setResult() reaches the main context.
|
|
297
298
|
// Placed AFTER manifest creation so runId/paths/artifactsRoot are available.
|
|
298
299
|
if (!directAgent && (workflow as import("../../workflows/workflow-config.ts").DynamicWorkflowConfig).runtime === "dynamic") {
|
|
300
|
+
// LAZY: defer dynamic import of ../../runtime/dynamic-workflow-runner.ts to its call site.
|
|
299
301
|
const { runDynamicWorkflow } = await import("../../runtime/dynamic-workflow-runner.ts");
|
|
300
302
|
// Re-synthesize a dynamic-team (§0c C9) for role resolution.
|
|
301
303
|
const dwfTeam: import("../../teams/team-config.ts").TeamConfig = {
|
|
@@ -323,11 +323,28 @@ async function main(): Promise<void> {
|
|
|
323
323
|
const origWrite =
|
|
324
324
|
(_prefix: string) =>
|
|
325
325
|
(data: unknown, ...args: unknown[]) => {
|
|
326
|
+
// FIX: Never let the in-process console redirect crash the background
|
|
327
|
+
// runner. If logFd is missing/invalid or the write fails, swallow the
|
|
328
|
+
// error silently — losing one debug line is far better than killing the
|
|
329
|
+
// scheduler (a previous version only redirected console.log/error, so
|
|
330
|
+
// console.debug/.warn still wrote to the original stdout/stderr pipe
|
|
331
|
+
// which is closed after the parent detaches, producing EPIPE → process
|
|
332
|
+
// crash mid-workflow → runs hang at 25% forever).
|
|
333
|
+
if (logFd === undefined) return;
|
|
326
334
|
const msg = [data, ...args].map(String).join(" ") + "\n";
|
|
327
|
-
|
|
335
|
+
try {
|
|
336
|
+
fs.writeSync(logFd, msg);
|
|
337
|
+
} catch {
|
|
338
|
+
/* best-effort: never crash the scheduler over a log write */
|
|
339
|
+
}
|
|
328
340
|
};
|
|
329
341
|
console.log = origWrite("OUT");
|
|
330
342
|
console.error = origWrite("ERR");
|
|
343
|
+
// FIX: Also redirect console.debug and console.warn — otherwise they still
|
|
344
|
+
// hit the original stdout/stderr pipe, which is closed once the parent
|
|
345
|
+
// process detaches, causing EPIPE unhandled errors that kill the scheduler.
|
|
346
|
+
console.debug = origWrite("DBG");
|
|
347
|
+
console.warn = origWrite("WARN");
|
|
331
348
|
// FIX: Close logFd on process exit to prevent file descriptor leak
|
|
332
349
|
process.on("exit", () => {
|
|
333
350
|
try {
|
|
@@ -558,8 +575,11 @@ async function main(): Promise<void> {
|
|
|
558
575
|
debugLog(`[background-runner] short-circuiting ${manifest.runKind} (synthetic team/workflow)`,
|
|
559
576
|
);
|
|
560
577
|
if (manifest.runKind === "goal-loop") {
|
|
578
|
+
// LAZY: defer dynamic import of ./goal-loop-runner.ts to its call site.
|
|
561
579
|
const { runGoalLoop } = await import("./goal-loop-runner.ts");
|
|
580
|
+
// LAZY: defer dynamic import of ./goal-state-store.ts to its call site.
|
|
562
581
|
const { GoalStore } = await import("./goal-state-store.ts");
|
|
582
|
+
// LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
|
|
563
583
|
const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
|
|
564
584
|
const store = new GoalStore(manifest.cwd);
|
|
565
585
|
const goalState = store.load(manifest.runId);
|
|
@@ -576,7 +596,9 @@ async function main(): Promise<void> {
|
|
|
576
596
|
saveRunManifest(finalGoalManifest);
|
|
577
597
|
earlyResult = { manifest: finalGoalManifest, tasks: goalResult.tasks };
|
|
578
598
|
} else {
|
|
599
|
+
// LAZY: defer dynamic import of ./dynamic-workflow-runner.ts to its call site.
|
|
579
600
|
const { runDynamicWorkflow } = await import("./dynamic-workflow-runner.ts");
|
|
601
|
+
// LAZY: defer dynamic import of ../workflows/discover-workflows.ts to its call site.
|
|
580
602
|
const { allWorkflows, discoverWorkflows } = await import("../workflows/discover-workflows.ts");
|
|
581
603
|
const wf = allWorkflows(discoverWorkflows(manifest.cwd)).find((w) => w.name === manifest.workflow);
|
|
582
604
|
if (!wf || wf.runtime !== "dynamic" || !wf.dynamicScript) throw new Error(`runKind="dynamic-workflow" but workflow '${manifest.workflow}' is not dynamic (runId=${manifest.runId})`);
|
|
@@ -246,6 +246,7 @@ export class ChainRunner {
|
|
|
246
246
|
|
|
247
247
|
// Emit progress event if eventsPath provided
|
|
248
248
|
if (eventsPath) {
|
|
249
|
+
// LAZY: defer dynamic import of ../state/event-log.ts to its call site.
|
|
249
250
|
const { appendEventAsync } = await import("../state/event-log.ts");
|
|
250
251
|
await appendEventAsync(eventsPath, {
|
|
251
252
|
type: "chain.step_completed",
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import * as fs from "node:fs";
|
|
3
|
+
import * as path from "node:path";
|
|
3
4
|
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
4
5
|
import { appendEvent, scanSequence } from "../state/event-log.ts";
|
|
5
6
|
import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
|
|
6
7
|
import { withRunLockSync } from "../state/locks.ts";
|
|
7
|
-
import { loadRunManifestById,
|
|
8
|
+
import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
|
|
8
9
|
import type { TeamTaskState } from "../state/types.ts";
|
|
9
10
|
import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
|
|
10
11
|
import type { ManifestCache } from "./manifest-cache.ts";
|
|
@@ -215,6 +216,43 @@ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): voi
|
|
|
215
216
|
// NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
|
|
216
217
|
}
|
|
217
218
|
|
|
219
|
+
/**
|
|
220
|
+
* Age (ms) of the team-level heartbeat file for a run. The team-runner writes
|
|
221
|
+
* `<stateRoot>/heartbeat.json` periodically while a workflow is executing
|
|
222
|
+
* (startTeamHeartbeat), so a fresh heartbeat is strong evidence the run is alive
|
|
223
|
+
* even when its recorded PID check is inconclusive or its active-run-index
|
|
224
|
+
* entry's `updatedAt` was frozen at registration. Returns Infinity when absent.
|
|
225
|
+
*/
|
|
226
|
+
function heartbeatAgeMs(entry: { stateRoot: string }, now: number): number {
|
|
227
|
+
try {
|
|
228
|
+
const mtime = fs.statSync(path.join(entry.stateRoot, "heartbeat.json")).mtimeMs;
|
|
229
|
+
return Number.isFinite(mtime) ? now - mtime : Infinity;
|
|
230
|
+
} catch {
|
|
231
|
+
return Infinity;
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
/**
|
|
236
|
+
* True if there is recent evidence the run is (or was very recently) alive, so
|
|
237
|
+
* it must NOT be purged. Any one of these signals is sufficient:
|
|
238
|
+
* - on-disk `manifest.updatedAt` fresher than `staleThresholdMs` (rewritten on
|
|
239
|
+
* every task transition / status change), and/or
|
|
240
|
+
* - team-level `heartbeat.json` fresher than `staleThresholdMs`.
|
|
241
|
+
* `entry.updatedAt` is intentionally NOT consulted: it is frozen at
|
|
242
|
+
* registration and never refreshed during execution, which previously caused
|
|
243
|
+
* long-running legitimate runs to be falsely purged — destroying their
|
|
244
|
+
* stateRoot, and because saveRunTasks() silently no-ops once the state dir is
|
|
245
|
+
* gone, hanging the workflow permanently at the current task with no
|
|
246
|
+
* recoverable state ("Run not found").
|
|
247
|
+
*/
|
|
248
|
+
function hasRecentLifeEvidence(entry: { stateRoot: string }, manifestUpdatedAt: string | undefined, now: number, staleThresholdMs: number): boolean {
|
|
249
|
+
const manifestMs = manifestUpdatedAt ? new Date(manifestUpdatedAt).getTime() : NaN;
|
|
250
|
+
if (Number.isFinite(manifestMs) && now - manifestMs <= staleThresholdMs) return true;
|
|
251
|
+
const hbAge = heartbeatAgeMs(entry, now);
|
|
252
|
+
if (Number.isFinite(hbAge) && hbAge <= staleThresholdMs) return true;
|
|
253
|
+
return false;
|
|
254
|
+
}
|
|
255
|
+
|
|
218
256
|
/**
|
|
219
257
|
* Purge the global active-run-index of entries whose manifest is no longer active.
|
|
220
258
|
*
|
|
@@ -244,7 +282,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
244
282
|
}
|
|
245
283
|
|
|
246
284
|
// 3. Read manifest status
|
|
247
|
-
let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
|
|
285
|
+
let manifest: { status?: string; updatedAt?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
|
|
248
286
|
try {
|
|
249
287
|
manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
|
|
250
288
|
} catch {
|
|
@@ -262,46 +300,52 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
262
300
|
continue;
|
|
263
301
|
}
|
|
264
302
|
|
|
265
|
-
// 5. Still "running"
|
|
303
|
+
// 5. Still "running" with an async worker PID — only purge when the worker
|
|
304
|
+
// is actually dead AND there is no recent evidence of life. We must NOT
|
|
305
|
+
// rely solely on `entry.updatedAt` (frozen at registration) nor on a single
|
|
306
|
+
// dead-PID reading: a long-running worker (e.g. a 15-minute explorer)
|
|
307
|
+
// legitimately keeps the run "running" while periodically rewriting the
|
|
308
|
+
// on-disk manifest.updatedAt and heartbeat.json. Falsely purging such a run
|
|
309
|
+
// destroys its stateRoot, and because saveRunTasks() silently no-ops once
|
|
310
|
+
// the state dir is gone, the workflow then hangs permanently at the
|
|
311
|
+
// current task with no recoverable state ("Run not found"). When we do mark
|
|
312
|
+
// a run cancelled here, we KEEP its stateRoot so the run stays queryable/
|
|
313
|
+
// resumable and its diagnostics survive; the finished-run pruner removes
|
|
314
|
+
// the directory later on its normal schedule.
|
|
266
315
|
if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
|
|
267
316
|
const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
|
|
268
|
-
if (!pidAlive) {
|
|
269
|
-
//
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
const
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
285
|
-
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
|
|
286
|
-
saveRunManifest(fullLoaded.manifest);
|
|
287
|
-
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
288
|
-
}
|
|
289
|
-
} catch {
|
|
290
|
-
// Best-effort manifest cleanup
|
|
317
|
+
if (!pidAlive && !hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
|
|
318
|
+
// Dead PID + no recent life evidence → cancel the manifest and unregister
|
|
319
|
+
try {
|
|
320
|
+
const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
|
|
321
|
+
if (fullLoaded) {
|
|
322
|
+
const now_iso = new Date(now).toISOString();
|
|
323
|
+
const repairedTasks = fullLoaded.tasks.map((task) => {
|
|
324
|
+
if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
|
|
325
|
+
return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
|
|
326
|
+
}
|
|
327
|
+
return task;
|
|
328
|
+
});
|
|
329
|
+
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
330
|
+
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
331
|
+
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
|
|
332
|
+
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
291
333
|
}
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
purged.push(entry.runId);
|
|
295
|
-
continue;
|
|
334
|
+
} catch {
|
|
335
|
+
// Best-effort manifest cleanup
|
|
296
336
|
}
|
|
337
|
+
unregisterActiveRun(entry.runId);
|
|
338
|
+
purged.push(entry.runId);
|
|
339
|
+
continue;
|
|
297
340
|
}
|
|
298
341
|
}
|
|
299
342
|
|
|
300
|
-
// 6. "running" but no async worker PID — possible orphaned run where
|
|
301
|
-
// was never updated
|
|
343
|
+
// 6. "running" but no async worker PID — possible orphaned run where the
|
|
344
|
+
// manifest was never updated to a terminal status after the worker exited.
|
|
345
|
+
// Uses the same life-evidence corroboration as condition 5; the stateRoot is
|
|
346
|
+
// kept on cancel so the run stays queryable/resumable with diagnostics.
|
|
302
347
|
if (manifest?.status === "running" && manifest.async === undefined) {
|
|
303
|
-
|
|
304
|
-
if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
|
|
348
|
+
if (!hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
|
|
305
349
|
try {
|
|
306
350
|
const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
|
|
307
351
|
if (fullLoaded && fullLoaded.manifest.status === "running") {
|
|
@@ -315,14 +359,12 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
315
359
|
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
316
360
|
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
317
361
|
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
|
|
318
|
-
saveRunManifest(fullLoaded.manifest);
|
|
319
362
|
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
320
363
|
}
|
|
321
364
|
} catch {
|
|
322
365
|
// Best-effort
|
|
323
366
|
}
|
|
324
367
|
unregisterActiveRun(entry.runId);
|
|
325
|
-
tryRemoveRunDirectories(entry);
|
|
326
368
|
purged.push(entry.runId);
|
|
327
369
|
continue;
|
|
328
370
|
}
|
|
@@ -85,6 +85,7 @@ async function loadWorkflowModule(scriptPath: string): Promise<DynamicWorkflowSc
|
|
|
85
85
|
// lazily so this module stays importable in environments without jiti (type-only consumers).
|
|
86
86
|
// Fix round-4: use createRequire(import.meta.url) so `require` works under the strip-types
|
|
87
87
|
// loader fallback (Node ≥ 22.6) where bare `require` is not defined in ESM scope.
|
|
88
|
+
// LAZY: defer dynamic import of node:module to its call site.
|
|
88
89
|
const { createRequire } = await import("node:module");
|
|
89
90
|
const require = createRequire(import.meta.url);
|
|
90
91
|
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
@@ -121,6 +121,7 @@ export const realGoalEvaluator = async (
|
|
|
121
121
|
}
|
|
122
122
|
if (!verificationCompromised) {
|
|
123
123
|
try {
|
|
124
|
+
// LAZY: defer dynamic import of ./verification-gates.ts to its call site.
|
|
124
125
|
const { executeVerificationCommands } = await import("./verification-gates.ts");
|
|
125
126
|
const contract = { requiredGreenLevel: "none" as const, commands: goal.verification.commands, allowManualEvidence: goal.verification.allowManualEvidence ?? false };
|
|
126
127
|
// Phase 1.5 #2 (RFC 16): run verification in a pristine git worktree at
|
|
@@ -131,6 +132,7 @@ export const realGoalEvaluator = async (
|
|
|
131
132
|
let worktreeCwd: string | undefined;
|
|
132
133
|
let worktreeCleanup: (() => void) | undefined;
|
|
133
134
|
try {
|
|
135
|
+
// LAZY: defer dynamic import of ./verification-worktree.ts to its call site.
|
|
134
136
|
const { checkWorktreeSandboxAvailable, prepareVerificationWorktree } = await import("./verification-worktree.ts");
|
|
135
137
|
const availability = checkWorktreeSandboxAvailable(goal.cwd);
|
|
136
138
|
if (availability.available) {
|
|
@@ -36,6 +36,7 @@ import { listLiveAgents } from "./live-agent-manager.ts";
|
|
|
36
36
|
* Module-scoped latch for the optional peer dependency import. When N
|
|
37
37
|
* in-process live-session subagents spawn CONCURRENTLY (e.g. several
|
|
38
38
|
* `Agent({run_in_background:true})` started at once), each used to call
|
|
39
|
+
// LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
|
|
39
40
|
* `await import("@earendil-works/pi-coding-agent")` independently. Under the
|
|
40
41
|
* tsx loader (registering load/resolve hooks), concurrent first-imports can
|
|
41
42
|
* each enter the loader and race module-record instantiation, yielding
|
|
@@ -128,6 +128,7 @@ export async function readEnabledModelsPatterns(cwd: string, agentDir?: string):
|
|
|
128
128
|
// SDK. SettingsManager is dynamically imported because the module
|
|
129
129
|
// shape differs across pi versions; the create() factory is the
|
|
130
130
|
// canonical, version-stable entry point.
|
|
131
|
+
// LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
|
|
131
132
|
const mod = await import("@earendil-works/pi-coding-agent" as string).catch(() => null);
|
|
132
133
|
if (!mod) return [];
|
|
133
134
|
const SettingsManagerCtor = (mod as { SettingsManager?: { create?: (cwd: string, agentDir?: string) => { getEnabledModels?: () => string[] | undefined } } }).SettingsManager;
|
package/src/runtime/peer-dep.ts
CHANGED
|
@@ -239,6 +239,7 @@ export function primePeerDep(): Promise<PeerDepModule> {
|
|
|
239
239
|
if (!resolved) {
|
|
240
240
|
throw new Error(buildMissingMessage());
|
|
241
241
|
}
|
|
242
|
+
// LAZY: defer dynamic import of module to its call site.
|
|
242
243
|
cachedModule = (await import(resolved.mainUrl)) as PeerDepModule;
|
|
243
244
|
return cachedModule;
|
|
244
245
|
})();
|
|
@@ -133,6 +133,7 @@ export function wrapEditWithResilientReplace(pi: ExtensionAPI, tools?: { edit: T
|
|
|
133
133
|
throw new Error("old_string not found (and resilient retry skipped: missing path/old/new)");
|
|
134
134
|
}
|
|
135
135
|
|
|
136
|
+
// LAZY: defer dynamic import of node:fs/promises to its call site.
|
|
136
137
|
const fs = await import("node:fs/promises");
|
|
137
138
|
let content: string;
|
|
138
139
|
try {
|
|
@@ -289,6 +289,7 @@ export async function runTeamTask(
|
|
|
289
289
|
// follow it and execute a script outside cwd. Throws on escape.
|
|
290
290
|
resolveRealContainedPath(manifest.cwd, input.step.preStepScript);
|
|
291
291
|
try {
|
|
292
|
+
// LAZY: defer dynamic import of node:child_process to its call site.
|
|
292
293
|
const { execFileSync } = await import("node:child_process");
|
|
293
294
|
preStepOutput = execFileSync(input.step.preStepScript, scriptArgs, {
|
|
294
295
|
timeout: scriptTimeout,
|
|
@@ -11,7 +11,9 @@ let pathsInstance: typeof import("../utils/paths.js") | null = null;
|
|
|
11
11
|
|
|
12
12
|
async function getStore() {
|
|
13
13
|
if (!storeInstance) {
|
|
14
|
+
// LAZY: defer dynamic import of ./instinct-store.js to its call site.
|
|
14
15
|
const { InstinctStore } = await import("./instinct-store.js");
|
|
16
|
+
// LAZY: defer dynamic import of ../utils/paths.js to its call site.
|
|
15
17
|
const paths = await import("../utils/paths.js");
|
|
16
18
|
storeInstance = new InstinctStore(paths.projectCrewRoot(process.cwd()));
|
|
17
19
|
}
|
|
@@ -20,6 +22,7 @@ async function getStore() {
|
|
|
20
22
|
|
|
21
23
|
async function getPaths() {
|
|
22
24
|
if (!pathsInstance) {
|
|
25
|
+
// LAZY: defer dynamic import of ../utils/paths.js to its call site.
|
|
23
26
|
pathsInstance = await import("../utils/paths.js");
|
|
24
27
|
}
|
|
25
28
|
return pathsInstance;
|
package/src/utils/bm25-search.ts
CHANGED
|
@@ -156,6 +156,7 @@ interface AgentSearchResult {
|
|
|
156
156
|
* Uses dynamic import to avoid ESM/CJS issues at module load time.
|
|
157
157
|
*/
|
|
158
158
|
export async function searchAgents(query: string, options?: { limit?: number }): Promise<AgentSearchResult[]> {
|
|
159
|
+
// LAZY: defer dynamic import of ../agents/discover-agents.ts to its call site.
|
|
159
160
|
const { discoverAgents, allAgents } = await import("../agents/discover-agents.ts");
|
|
160
161
|
const discovery = discoverAgents(process.cwd());
|
|
161
162
|
const all = allAgents(discovery);
|
|
@@ -200,6 +201,7 @@ interface TeamSearchResult {
|
|
|
200
201
|
* Uses dynamic import to avoid ESM/CJS issues at module load time.
|
|
201
202
|
*/
|
|
202
203
|
export async function searchTeams(query: string, options?: { limit?: number }): Promise<TeamSearchResult[]> {
|
|
204
|
+
// LAZY: defer dynamic import of ../teams/discover-teams.ts to its call site.
|
|
203
205
|
const { discoverTeams, allTeams } = await import("../teams/discover-teams.ts");
|
|
204
206
|
const discovery = discoverTeams(process.cwd());
|
|
205
207
|
const all = allTeams(discovery);
|