pi-crew 0.7.6 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.7.7] — Windows spawn fix + plan-approval crash-recovery fix + CI flake fixes (2026-06-16)
4
+
5
+ A focused patch release driven by two community reports (Issue #33 and PR #32) plus the CI flake surfaced while validating them. CI green on Windows / Ubuntu / macOS (run 27599121797). 4965 tests pass / 0 fail.
6
+
7
+ ### Bug Fixes
8
+
9
+ - **`#33` — Windows `spawn pi ENOENT`** (commit `afc23b4`): when pi is installed outside `%APPDATA%\npm` (nvm-windows / Volta / fnm put the global `node_modules` elsewhere), the static `%APPDATA%\npm` paths in `resolvePiCliScript()` all miss, and the fallback `spawn("pi")` fails with `ENOENT` because `child_process.spawn` does NOT do PATHEXT resolution on Windows (only `exec`/`execSync` via `cmd.exe` do). **Fix**: pi-crew now discovers the real npm global `node_modules` dir at runtime via `npm root -g` (run through `execSync`, which DOES resolve `npm.cmd` via PATHEXT), then derives the `@earendil-works` / `@mariozechner` package dirs from it and checks them BEFORE the static `%APPDATA%\npm` paths and the cwd fallback. Covers standard installs **and** nvm-windows / Volta / fnm uniformly. Memoized once per process (one-time ~200ms cost). Injection-safe — no `shell: true` on the real worker spawn. +6 tests.
10
+ - **Plan-approval-blocked runs crash-recovery fix** (commit `421b76d`, adapts PR #32 change #1 by @gustavo-pelissaro): crash recovery and stale reconciliation both treated `status === "blocked"` runs as repair candidates, so a run legitimately blocked on **human** plan approval (`requirePlanApproval`, `planApproval.status="pending"`) was marked failed and/or orphan-cancelled when its owning session died or its async PID was no longer live — destroying an in-flight HITL checkpoint. **Fix**: new `isPlanApprovalPending(manifest)` guard (status=blocked AND `planApproval.required=true` AND `planApproval.status=pending`). Guarded in `reconcileStaleRun` (new `blocked_awaiting_approval` verdict, `repaired=false` — which automatically covers `reconcileAllStaleRuns`), `detectInterruptedRuns` (skip), `cancelOrphanedRuns` (push to `skipped`), and a belt-and-suspenders re-check under the lock in `reconcileAllStaleRuns`. The guard is intentionally narrow: a plain `blocked` run (no planApproval, or already approved/cancelled) is still a recovery candidate, so existing orphaned-blocked-run handling is unchanged. +6 tests.
11
+
12
+ ### Tests (CI reliability)
13
+
14
+ - **`run-watcher-registry` macOS cancellation** (commit `dccb5e7`): the two fs.watch-dependent tests used unbounded `done()` callbacks that hung the whole test file on macOS CI runners (fs.watch events are slow/dropped under `/var/folders` + VM-runner FS load). Fixed with bounded async waits (1.5s deadline) consistent with production semantics, where fs.watch is best-effort and the preload poll loop is the source of truth.
15
+ - **`operator-experience` ubuntu redaction flake** (commit `2da1a1b`): the redaction test seeded a secret literally named `abc` and asserted `/abc/` does not leak, but the runId hash (`randomBytes(8).toString("hex")`) occasionally spells `...abc...` (e.g. `team_..._9791deabc2f52485`) → false failure, even though redaction worked perfectly. Fixed by switching to a `ZZ_LEAK_CANARY` marker — uppercase letters never appear in a lowercase-hex hash, so the marker is collision-proof.
16
+
17
+ ### Community
18
+
19
+ - Thanks to **@YrFnS** for the textbook-quality Issue #33 report and diagnosis (PATHEXT, spawn vs execSync matrix) that pinpointed the fix.
20
+ - Thanks to **@gustavo-pelissaro** for PR #32 — change #1 (plan-approval preservation) landed here; changes #2/#3 (child exit-143 normalization, symlinked temp base) were closed for heavy conflicts but will be revisited.
21
+ - PR #34 (closed) overlapped the existing `%APPDATA%\npm` resolution; superseded by the runtime `npm root -g` probe.
22
+
3
23
  ## [0.7.6] — DX, observability, and a critical interactive-session hang fix (2026-06-16)
4
24
 
5
25
  This release bundles Rounds 16–28: a developer-experience pass, an observability pass, and eight correctness/security audits — culminating in the **fix for the pts/2 interactive-session busy-loop hang** (two separate Pi sessions had hung at 71.5% CPU with 339 inotify watches). All 24 commits passed CI on Windows, Ubuntu, and macOS.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.7.6",
3
+ "version": "0.7.7",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -9,7 +9,7 @@ import type { TeamTaskState } from "../state/types.ts";
9
9
  import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
10
  import type { ManifestCache } from "./manifest-cache.ts";
11
11
  import { checkProcessLiveness } from "./process-status.ts";
12
- import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
12
+ import { isPlanApprovalPending, reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
13
13
  import { executeHook, appendHookEvent } from "../hooks/registry.ts";
14
14
  import { unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
15
15
  import { resolveRealContainedPath } from "../utils/safe-paths.ts";
@@ -38,6 +38,8 @@ export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache,
38
38
  const plans: RecoveryPlan[] = [];
39
39
  for (const manifest of manifestCache.list(50)) {
40
40
  if (manifest.status !== "running" && manifest.status !== "blocked") continue;
41
+ // Preserve runs intentionally blocked on plan approval — not crashes.
42
+ if (isPlanApprovalPending(manifest)) continue;
41
43
  if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
42
44
  // NOTE: no withRunLock — best-effort only; concurrent writes may cause inconsistency
43
45
  const loaded = loadRunManifestById(cwd, manifest.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
@@ -107,6 +109,12 @@ export function cancelOrphanedRuns(
107
109
  // Phase 1: Scan project-level manifests via manifestCache
108
110
  for (const manifest of manifestCache.list(50)) {
109
111
  if (manifest.status !== "running" && manifest.status !== "blocked") continue;
112
+ // Preserve plan-approval-blocked runs — they belong to their owner and are
113
+ // waiting on a human decision, not orphaned by a dead owner process.
114
+ if (isPlanApprovalPending(manifest)) {
115
+ skipped.push(manifest.runId);
116
+ continue;
117
+ }
110
118
 
111
119
  // Only consider runs owned by a different session
112
120
  const ownerId = manifest.ownerSessionId;
@@ -340,6 +348,18 @@ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache,
340
348
  // Re-read inside lock to get freshest data
341
349
  const fresh = loadRunManifestById(cwd, runId); // NOTE: inside withRunLockSync - consistent read
342
350
  if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
351
+ // Belt-and-suspenders: reconcileStaleRun itself guards this, but the run
352
+ // may have flipped to blocked+plan-approval between cache-list and lock
353
+ // acquisition — re-check the freshest manifest under the lock.
354
+ if (isPlanApprovalPending(fresh.manifest)) {
355
+ results.push({
356
+ runId,
357
+ verdict: "blocked_awaiting_approval",
358
+ repaired: false,
359
+ detail: "Plan approval is pending; stale reconciliation skipped",
360
+ });
361
+ return;
362
+ }
343
363
  const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
344
364
  if (result.repaired || result.verdict === "result_exists") {
345
365
  if (result.repairedTasks) {
@@ -1,5 +1,6 @@
1
1
  import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
+ import { execSync } from "node:child_process";
3
4
  import { fileURLToPath } from "node:url";
4
5
  import * as path from "node:path";
5
6
 
@@ -118,6 +119,63 @@ function findPiPackageJsonFrom(startDir: string): string | undefined {
118
119
  return undefined;
119
120
  }
120
121
 
122
+ /**
123
+ * Discover the real npm global node_modules directory at runtime.
124
+ *
125
+ * Why this exists (Issue #33): on Windows, pi may be installed somewhere
126
+ * other than %APPDATA%\npm — e.g. nvm-windows puts the global node_modules
127
+ * under %NVM_HOME%/<version>/node_modules, Volta under
128
+ * %LOCALAPPDATA%\Volta, fnm under %LOCALAPPDATA%\fnm_multishells. The static
129
+ * %APPDATA%\npm paths in resolvePiCliScript() miss all of those, and the
130
+ * fallback spawn("pi") then fails with ENOENT because child_process.spawn does
131
+ * NOT do PATHEXT resolution on Windows (only exec/execSync via cmd.exe do).
132
+ *
133
+ * `npm root -g` is the canonical way to find the global node_modules dir and
134
+ * works across every npm-based install layout. We run it via execSync, which
135
+ * DOES resolve `npm.cmd` through PATHEXT. Capped at 5s; any failure (npm not
136
+ * on PATH, slow start, etc.) just falls through to the other resolution roots.
137
+ *
138
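+ * Memoized: the npm global root does not change during a process lifetime, so
139
+ * a successful lookup is a one-time ~200ms cost rather than per-worker. Only successes are cached: a failed or empty lookup stores the same `null` sentinel that means "not yet probed", so the probe is retried on the next call.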
140
+ *
141
+ * @internal — exported for unit-test injection via __setNpmGlobalRootForTest.
142
+ */
143
+ let cachedNpmGlobalRoot: string | undefined | null = null;
144
+ export function resolveNpmGlobalRoot(): string | undefined {
145
+ if (cachedNpmGlobalRoot !== null) {
146
+ return cachedNpmGlobalRoot ?? undefined;
147
+ }
148
+ let resolved: string | undefined;
149
+ try {
150
+ const out = execSync("npm root -g", {
151
+ encoding: "utf-8",
152
+ timeout: 5000,
153
+ stdio: ["pipe", "pipe", "pipe"], // suppress npm's stderr chatter
154
+ windowsHide: true,
155
+ }).trim();
156
+ resolved = out.length > 0 ? out : undefined;
157
+ } catch {
158
+ resolved = undefined;
159
+ }
160
+ cachedNpmGlobalRoot = resolved ?? null;
161
+ return resolved;
162
+ }
163
+
164
+ /**
165
+ * Given an npm global node_modules root, derive the candidate package dirs for
166
+ * each supported pi scope. Pure + exported so the mapping is unit-testable
167
+ * without spawning npm.
168
+ * @internal
169
+ */
170
+ export function buildNpmGlobalPackageDirs(npmGlobalRoot: string): string[] {
171
+ return PI_PACKAGE_NAMES.map((pkgName) => path.join(npmGlobalRoot, ...pkgName.split("/")));
172
+ }
173
+
174
+ /** @internal — test hook: inject a fake global root, or pass undefined to clear the memo so the next resolveNpmGlobalRoot() call probes npm again. */
175
+ export function __setNpmGlobalRootForTest(root: string | undefined): void {
176
+ cachedNpmGlobalRoot = root ?? null;
177
+ }
178
+
121
179
  function resolvePiCliScript(): string | undefined {
122
180
  const argv1 = process.argv[1];
123
181
  if (argv1) {
@@ -125,8 +183,16 @@ function resolvePiCliScript(): string | undefined {
125
183
  if (isRunnableNodeScript(argvPath)) return argvPath;
126
184
  }
127
185
 
186
+ // npm-global package dirs derived from `npm root -g` — placed BEFORE the
187
+ // %APPDATA%\npm static paths and the cwd/import.meta fallbacks so that a pi
188
+ // install under nvm-windows / Volta / fnm is found even when %APPDATA%\npm
189
+ // doesn't contain it. Covers Issue #33.
190
+ const npmGlobalRoot = resolveNpmGlobalRoot();
191
+ const npmGlobalDirs = npmGlobalRoot ? buildNpmGlobalPackageDirs(npmGlobalRoot) : [];
192
+
128
193
  const roots = [
129
194
  resolvePiPackageRoot(),
195
+ ...npmGlobalDirs,
130
196
  process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@earendil-works", "pi-coding-agent") : undefined,
131
197
  process.env.APPDATA ? path.join(process.env.APPDATA, "npm", "node_modules", "@mariozechner", "pi-coding-agent") : undefined,
132
198
  path.dirname(fileURLToPath(import.meta.url)),
@@ -24,6 +24,7 @@ export interface ReconcileResult {
24
24
  /** What was found and what action was taken */
25
25
  verdict:
26
26
  | "healthy"
27
+ | "blocked_awaiting_approval"
27
28
  | "result_exists"
28
29
  | "pid_dead"
29
30
  | "pid_alive_stale"
@@ -36,6 +37,23 @@ export interface ReconcileResult {
36
37
  repairedTasks?: TeamTaskState[];
37
38
  }
38
39
 
40
+ /**
41
+ * Is this run intentionally waiting for human plan approval?
42
+ *
43
+ * Such runs are NOT stale even if their owning session died or their async PID
44
+ * is no longer live — they are blocked on a human decision, not a crash. Crash
45
+ * recovery and stale reconciliation must preserve them rather than mark them
46
+ * failed or orphan-cancel them. See PR #32 (gustavo-pelissaro) for the
47
+ * original analysis of this failure mode.
48
+ */
49
+ export function isPlanApprovalPending(manifest: TeamRunManifest): boolean {
50
+ return (
51
+ manifest.status === "blocked" &&
52
+ manifest.planApproval?.required === true &&
53
+ manifest.planApproval.status === "pending"
54
+ );
55
+ }
56
+
39
57
  const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
40
58
  const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
41
59
  /** For no-PID runs, repair when ALL running tasks have heartbeat stale beyond this threshold. */
@@ -347,6 +365,18 @@ export function reconcileStaleRun(
347
365
  ): ReconcileResult {
348
366
  const runId = manifest.runId;
349
367
 
368
+ // Preserve runs intentionally blocked on human plan approval. These are not
369
+ // crashes even if the owning PID is gone — they are waiting for a decision.
370
+ // Must short-circuit before Phase 1 (result check) and Phase 2 (PID liveness).
371
+ if (isPlanApprovalPending(manifest)) {
372
+ return {
373
+ runId,
374
+ verdict: "blocked_awaiting_approval",
375
+ repaired: false,
376
+ detail: "Plan approval is pending; blocked run is intentionally waiting and must not be stale-repaired",
377
+ };
378
+ }
379
+
350
380
  // Phase 1: Check if results already exist
351
381
  const phase1 = checkResultFile(manifest, tasks);
352
382
  if (phase1.found) {