pi-crew 0.7.6 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/package.json +1 -1
- package/src/runtime/crash-recovery.ts +21 -1
- package/src/runtime/pi-spawn.ts +66 -0
- package/src/runtime/stale-reconciler.ts +30 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.7.7] — Windows spawn fix + plan-approval crash-recovery fix + CI flake fixes (2026-06-16)
|
|
4
|
+
|
|
5
|
+
A focused patch release driven by two community reports (Issue #33 and PR #32) plus the CI flake surfaced while validating them. CI green on Windows / Ubuntu / macOS (run 27599121797). 4965 tests pass / 0 fail.
|
|
6
|
+
|
|
7
|
+
### Bug Fixes
|
|
8
|
+
|
|
9
|
+
- **`#33` — Windows `spawn pi ENOENT`** (commit `afc23b4`): when pi is installed outside `%APPDATA%\npm` (nvm-windows / Volta / fnm put the global `node_modules` elsewhere), the static `%APPDATA%\npm` paths in `resolvePiCliScript()` all miss, and the fallback `spawn("pi")` fails with `ENOENT` because `child_process.spawn` does NOT do PATHEXT resolution on Windows (only `exec`/`execSync` via `cmd.exe` do). **Fix**: pi-crew now discovers the real npm global `node_modules` dir at runtime via `npm root -g` (run through `execSync`, which DOES resolve `npm.cmd` via PATHEXT), then derives the `@earendil-works` / `@mariozechner` package dirs from it and checks them BEFORE the static `%APPDATA%\npm` paths and the cwd fallback. Covers standard installs **and** nvm-windows / Volta / fnm uniformly. Memoized once per process (one-time ~200ms cost). Injection-safe — no `shell: true` on the real worker spawn. +6 tests.
|
|
10
|
+
- **Plan-approval-blocked runs crash-recovery fix** (commit `421b76d`, adapts PR #32 change #1 by @gustavo-pelissaro): crash recovery and stale reconciliation both treated `status === "blocked"` runs as repair candidates, so a run legitimately blocked on **human** plan approval (`requirePlanApproval`, `planApproval.status="pending"`) was marked failed and/or orphan-cancelled when its owning session died or its async PID was no longer live — destroying an in-flight human-in-the-loop (HITL) checkpoint. **Fix**: new `isPlanApprovalPending(manifest)` guard (status=blocked AND `planApproval.required=true` AND `planApproval.status=pending`). Guarded in `reconcileStaleRun` (new `blocked_awaiting_approval` verdict, `repaired=false` — which automatically covers `reconcileAllStaleRuns`), `detectInterruptedRuns` (skip), `cancelOrphanedRuns` (push to `skipped`), and a belt-and-suspenders re-check under the lock in `reconcileAllStaleRuns`. The guard is intentionally narrow: a plain `blocked` run (no planApproval, or already approved/cancelled) is still a recovery candidate, so existing orphaned-blocked-run handling is unchanged. +6 tests.
|
|
11
|
+
|
|
12
|
+
### Tests (CI reliability)
|
|
13
|
+
|
|
14
|
+
- **`run-watcher-registry` macOS cancellation** (commit `dccb5e7`): the two fs.watch-dependent tests used unbounded `done()` callbacks that hung the whole test file on macOS CI runners (fs.watch events are slow/dropped under `/var/folders` + VM-runner FS load). Fixed with bounded async waits (1.5s deadline) consistent with production semantics, where fs.watch is best-effort and the preload poll loop is the source of truth.
|
|
15
|
+
- **`operator-experience` ubuntu redaction flake** (commit `2da1a1b`): the redaction test seeded a secret literally named `abc` and asserted `/abc/` does not leak, but the runId hash (`randomBytes(8).toString("hex")`) occasionally spells `...abc...` (e.g. `team_..._9791deabc2f52485`) → false failure, even though redaction worked perfectly. Fixed by switching to a `ZZ_LEAK_CANARY` marker — uppercase letters never appear in a lowercase-hex hash, so the marker is collision-proof.
|
|
16
|
+
|
|
17
|
+
### Community
|
|
18
|
+
|
|
19
|
+
- Thanks to **@YrFnS** for the textbook-quality Issue #33 report and diagnosis (PATHEXT, spawn vs execSync matrix) that pinpointed the fix.
|
|
20
|
+
- Thanks to **@gustavo-pelissaro** for PR #32 — change #1 (plan-approval preservation) landed here; changes #2/#3 (child exit-143 normalization, symlinked temp base) were closed for heavy conflicts but will be revisited.
|
|
21
|
+
- PR #34 (closed) overlapped the existing `%APPDATA%\npm` resolution; superseded by the runtime `npm root -g` probe.
|
|
22
|
+
|
|
3
23
|
## [0.7.6] — DX, observability, and a critical interactive-session hang fix (2026-06-16)
|
|
4
24
|
|
|
5
25
|
This release bundles Rounds 16–28: a developer-experience pass, an observability pass, and eight correctness/security audits — culminating in the **fix for the pts/2 interactive-session busy-loop hang** (two separate Pi sessions had hung at 71.5% CPU with 339 inotify watches). All 24 commits passed CI on Windows, Ubuntu, and macOS.
|
package/package.json
CHANGED
|
(+1 -1; hunk body not shown in this diff)
|
package/src/runtime/crash-recovery.ts
CHANGED
|
@@ -9,7 +9,7 @@ import type { TeamTaskState } from "../state/types.ts";
|
|
|
9
9
|
import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
|
|
10
10
|
import type { ManifestCache } from "./manifest-cache.ts";
|
|
11
11
|
import { checkProcessLiveness } from "./process-status.ts";
|
|
12
|
-
import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
|
|
12
|
+
import { isPlanApprovalPending, reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
|
|
13
13
|
import { executeHook, appendHookEvent } from "../hooks/registry.ts";
|
|
14
14
|
import { unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
|
|
15
15
|
import { resolveRealContainedPath } from "../utils/safe-paths.ts";
|
|
@@ -38,6 +38,8 @@ export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache,
|
|
|
38
38
|
const plans: RecoveryPlan[] = [];
|
|
39
39
|
for (const manifest of manifestCache.list(50)) {
|
|
40
40
|
if (manifest.status !== "running" && manifest.status !== "blocked") continue;
|
|
41
|
+
// Preserve runs intentionally blocked on plan approval — not crashes.
|
|
42
|
+
if (isPlanApprovalPending(manifest)) continue;
|
|
41
43
|
if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
|
|
42
44
|
// NOTE: no withRunLock — best-effort only; concurrent writes may cause inconsistency
|
|
43
45
|
const loaded = loadRunManifestById(cwd, manifest.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
|
|
@@ -107,6 +109,12 @@ export function cancelOrphanedRuns(
|
|
|
107
109
|
// Phase 1: Scan project-level manifests via manifestCache
|
|
108
110
|
for (const manifest of manifestCache.list(50)) {
|
|
109
111
|
if (manifest.status !== "running" && manifest.status !== "blocked") continue;
|
|
112
|
+
// Preserve plan-approval-blocked runs — they belong to their owner and are
|
|
113
|
+
// waiting on a human decision, not orphaned by a dead owner process.
|
|
114
|
+
if (isPlanApprovalPending(manifest)) {
|
|
115
|
+
skipped.push(manifest.runId);
|
|
116
|
+
continue;
|
|
117
|
+
}
|
|
110
118
|
|
|
111
119
|
// Only consider runs owned by a different session
|
|
112
120
|
const ownerId = manifest.ownerSessionId;
|
|
@@ -340,6 +348,18 @@ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache,
|
|
|
340
348
|
// Re-read inside lock to get freshest data
|
|
341
349
|
const fresh = loadRunManifestById(cwd, runId); // NOTE: inside withRunLockSync - consistent read
|
|
342
350
|
if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
|
|
351
|
+
// Belt-and-suspenders: reconcileStaleRun itself guards this, but the run
|
|
352
|
+
// may have flipped to blocked+plan-approval between cache-list and lock
|
|
353
|
+
// acquisition — re-check the freshest manifest under the lock.
|
|
354
|
+
if (isPlanApprovalPending(fresh.manifest)) {
|
|
355
|
+
results.push({
|
|
356
|
+
runId,
|
|
357
|
+
verdict: "blocked_awaiting_approval",
|
|
358
|
+
repaired: false,
|
|
359
|
+
detail: "Plan approval is pending; stale reconciliation skipped",
|
|
360
|
+
});
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
343
363
|
const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
|
|
344
364
|
if (result.repaired || result.verdict === "result_exists") {
|
|
345
365
|
if (result.repairedTasks) {
|
package/src/runtime/pi-spawn.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import * as os from "node:os";
|
|
3
|
+
import { execSync } from "node:child_process";
|
|
3
4
|
import { fileURLToPath } from "node:url";
|
|
4
5
|
import * as path from "node:path";
|
|
5
6
|
|
|
@@ -118,6 +119,63 @@ function findPiPackageJsonFrom(startDir: string): string | undefined {
|
|
|
118
119
|
return undefined;
|
|
119
120
|
}
|
|
120
121
|
|
|
122
|
+
/**
 * Memo for resolveNpmGlobalRoot().
 *   null      — not probed yet (or reset via __setNpmGlobalRootForTest(undefined))
 *   undefined — probed, but npm failed / timed out / printed nothing
 *   string    — the trimmed output of `npm root -g`
 */
let cachedNpmGlobalRoot: string | undefined | null = null;

/**
 * Discover the real npm global node_modules directory at runtime.
 *
 * Why this exists (Issue #33): on Windows, pi may be installed somewhere
 * other than %APPDATA%\npm — e.g. nvm-windows puts the global node_modules
 * under %NVM_HOME%/<version>/node_modules, Volta under
 * %LOCALAPPDATA%\Volta, fnm under %LOCALAPPDATA%\fnm_multishells. The static
 * %APPDATA%\npm paths in resolvePiCliScript() miss all of those, and the
 * fallback spawn("pi") then fails with ENOENT because child_process.spawn does
 * NOT do PATHEXT resolution on Windows (only exec/execSync via cmd.exe do).
 *
 * `npm root -g` is the canonical way to find the global node_modules dir and
 * works across every npm-based install layout. We run it via execSync, which
 * DOES resolve `npm.cmd` through PATHEXT. Capped at 5s; any failure (npm not
 * on PATH, slow start, etc.) just falls through to the other resolution roots.
 *
 * Memoized: the npm global root does not change during a process lifetime, so
 * this is a one-time cost rather than per-worker. A FAILED probe is memoized
 * as well — otherwise a machine without a working npm would pay the blocking
 * execSync (up to the 5s timeout) again on every call.
 *
 * @returns the trimmed stdout of `npm root -g`, or undefined when the probe
 *   throws, times out, or prints nothing.
 * @internal — exported for unit-test injection via __setNpmGlobalRootForTest.
 */
export function resolveNpmGlobalRoot(): string | undefined {
	// Anything other than the `null` sentinel is a settled answer, including
	// `undefined` ("we already looked and found nothing").
	if (cachedNpmGlobalRoot !== null) {
		return cachedNpmGlobalRoot;
	}
	let resolved: string | undefined;
	try {
		const out = execSync("npm root -g", {
			encoding: "utf-8",
			timeout: 5000,
			stdio: ["pipe", "pipe", "pipe"], // suppress npm's stderr chatter
			windowsHide: true,
		}).trim();
		resolved = out.length > 0 ? out : undefined;
	} catch {
		// Deliberately best-effort: callers fall back to other resolution roots.
		resolved = undefined;
	}
	// Store the outcome verbatim. Collapsing `undefined` back to `null` here
	// would discard the memo and re-run npm on every subsequent call.
	cachedNpmGlobalRoot = resolved;
	return resolved;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Map an npm global node_modules root to one candidate install directory per
 * entry in PI_PACKAGE_NAMES.
 *
 * Each package name is split on "/" so that a scoped name such as
 * "@scope/pkg" becomes two path segments joined under the root with the
 * platform separator. Pure (no filesystem or process access), which keeps the
 * mapping unit-testable without spawning npm.
 *
 * @param npmGlobalRoot - directory to join the package segments onto
 * @returns one joined path per PI_PACKAGE_NAMES entry, in the same order
 * @internal
 */
export function buildNpmGlobalPackageDirs(npmGlobalRoot: string): string[] {
	const packageDirs: string[] = [];
	for (const scopedName of PI_PACKAGE_NAMES) {
		const segments = scopedName.split("/");
		packageDirs.push(path.join(npmGlobalRoot, ...segments));
	}
	return packageDirs;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Test hook for the npm-global-root memo.
 *
 * Passing a string pins that value as the memoized root. Passing undefined
 * stores the `null` "not resolved yet" sentinel instead, i.e. it clears the
 * memo so the next resolveNpmGlobalRoot() call probes again.
 *
 * @param root - fake global root to pin, or undefined to clear the memo
 * @internal
 */
export function __setNpmGlobalRootForTest(root: string | undefined): void {
	if (root === undefined) {
		cachedNpmGlobalRoot = null;
		return;
	}
	cachedNpmGlobalRoot = root;
}
|
|
178
|
+
|
|
121
179
|
function resolvePiCliScript(): string | undefined {
|
|
122
180
|
const argv1 = process.argv[1];
|
|
123
181
|
if (argv1) {
|
|
@@ -125,8 +183,16 @@ function resolvePiCliScript(): string | undefined {
|
|
|
125
183
|
if (isRunnableNodeScript(argvPath)) return argvPath;
|
|
126
184
|
}
|
|
127
185
|
|
|
186
|
+
// npm-global package dirs derived from `npm root -g` — placed BEFORE the
|
|
187
|
+
// %APPDATA%\npm static paths and the cwd/import.meta fallbacks so that a pi
|
|
188
|
+
// install under nvm-windows / Volta / fnm is found even when %APPDATA%\npm
|
|
189
|
+
// doesn't contain it. Covers Issue #33.
|
|
190
|
+
const npmGlobalRoot = resolveNpmGlobalRoot();
|
|
191
|
+
const npmGlobalDirs = npmGlobalRoot ? buildNpmGlobalPackageDirs(npmGlobalRoot) : [];
|
|
192
|
+
|
|
128
193
|
const roots = [
|
|
129
194
|
resolvePiPackageRoot(),
|
|
195
|
+
...npmGlobalDirs,
|
|
130
196
|
process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@earendil-works", "pi-coding-agent") : undefined,
|
|
131
197
|
process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@mariozechner", "pi-coding-agent") : undefined,
|
|
132
198
|
path.dirname(fileURLToPath(import.meta.url)),
|
package/src/runtime/stale-reconciler.ts
CHANGED
|
@@ -24,6 +24,7 @@ export interface ReconcileResult {
|
|
|
24
24
|
/** What was found and what action was taken */
|
|
25
25
|
verdict:
|
|
26
26
|
| "healthy"
|
|
27
|
+
| "blocked_awaiting_approval"
|
|
27
28
|
| "result_exists"
|
|
28
29
|
| "pid_dead"
|
|
29
30
|
| "pid_alive_stale"
|
|
@@ -36,6 +37,23 @@ export interface ReconcileResult {
|
|
|
36
37
|
repairedTasks?: TeamTaskState[];
|
|
37
38
|
}
|
|
38
39
|
|
|
40
|
+
/**
 * Report whether a run is parked on a human plan-approval decision.
 *
 * True only when all three hold: the manifest status is "blocked", its
 * planApproval record has `required === true`, and that record's status is
 * "pending". A blocked run without a planApproval record, or whose approval
 * is in any other state, yields false.
 *
 * Such runs are NOT stale even if their owning session died or their async PID
 * is no longer live — they are waiting on a person, not recovering from a
 * crash — so crash recovery and stale reconciliation must leave them alone
 * instead of failing or orphan-cancelling them. See PR #32 (gustavo-pelissaro)
 * for the original analysis of this failure mode.
 *
 * @param manifest - run manifest to inspect
 * @returns true when the run is blocked awaiting plan approval
 */
export function isPlanApprovalPending(manifest: TeamRunManifest): boolean {
	if (manifest.status !== "blocked") return false;

	const approval = manifest.planApproval;
	if (approval == null || approval.required !== true) return false;

	return approval.status === "pending";
}
|
|
56
|
+
|
|
39
57
|
const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
|
|
40
58
|
const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
|
|
41
59
|
/** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
|
|
@@ -347,6 +365,18 @@ export function reconcileStaleRun(
|
|
|
347
365
|
): ReconcileResult {
|
|
348
366
|
const runId = manifest.runId;
|
|
349
367
|
|
|
368
|
+
// Preserve runs intentionally blocked on human plan approval. These are not
|
|
369
|
+
// crashes even if the owning PID is gone — they are waiting for a decision.
|
|
370
|
+
// Must short-circuit before Phase 1 (result check) and Phase 2 (PID liveness).
|
|
371
|
+
if (isPlanApprovalPending(manifest)) {
|
|
372
|
+
return {
|
|
373
|
+
runId,
|
|
374
|
+
verdict: "blocked_awaiting_approval",
|
|
375
|
+
repaired: false,
|
|
376
|
+
detail: "Plan approval is pending; blocked run is intentionally waiting and must not be stale-repaired",
|
|
377
|
+
};
|
|
378
|
+
}
|
|
379
|
+
|
|
350
380
|
// Phase 1: Check if results already exist
|
|
351
381
|
const phase1 = checkResultFile(manifest, tasks);
|
|
352
382
|
if (phase1.found) {
|