pi-crew 0.2.20 → 0.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/CHANGELOG.md +23 -10
  2. package/README.md +4 -2
  3. package/docs/PROJECT_REVIEW.md +271 -0
  4. package/docs/PROJECT_REVIEW_FIXES.md +343 -0
  5. package/docs/PROJECT_REVIEW_ROUND4.md +156 -0
  6. package/docs/PROJECT_REVIEW_ROUND5.md +86 -0
  7. package/docs/fixes/BATCH_A_H1_H2.md +86 -0
  8. package/docs/fixes/bug-006-foreground-cancel-concurrent.md +78 -0
  9. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +112 -0
  10. package/docs/fixes/bug-008-child-process-silent-timeout.md +100 -0
  11. package/docs/fixes/bug-009-executor-yield-limit-needs-attention.md +75 -0
  12. package/docs/fixes/bug-010-child-process-api-key-filtered.md +109 -0
  13. package/docs/fixes/bug-011-spawn-pi-enoent.md +92 -0
  14. package/docs/fixes/bug-012-essential-env-stripped.md +89 -0
  15. package/docs/fixes/bug-013-background-runner-death.md +84 -0
  16. package/docs/fixes/bug-014-infinite-retry-loop-needs-attention.md +82 -0
  17. package/docs/fixes/bug-015-background-runner-sigterm.md +65 -0
  18. package/docs/fixes/bug-017-background-runner-session-shutdown.md +66 -0
  19. package/docs/fixes/bug-017-background-runner-sigkill-double-fork.md +28 -0
  20. package/docs/fixes/bug-018-child-pi-worker-stdin-hang.md +61 -0
  21. package/docs/fixes/bug-019-phantom-runs-temp-workspace.md +52 -0
  22. package/docs/pi-crew-bugs.md +954 -0
  23. package/docs/pi-crew-investigation-report.md +411 -0
  24. package/docs/pi-crew-test-final.md +120 -0
  25. package/docs/pi-crew-test-results.md +260 -0
  26. package/docs/pi-crew-test-round2.md +136 -0
  27. package/docs/pi-crew-test-round4.md +100 -0
  28. package/docs/pi-crew-test-round5.md +70 -0
  29. package/docs/pi-crew-test-round6.md +110 -0
  30. package/docs/usage.md +14 -0
  31. package/package.json +4 -2
  32. package/src/adapters/export-util.ts +12 -6
  33. package/src/agents/agent-config.ts +2 -0
  34. package/src/config/defaults.ts +1 -1
  35. package/src/config/markers.ts +22 -17
  36. package/src/config/resilient-parser.ts +1 -1
  37. package/src/extension/async-notifier.ts +4 -2
  38. package/src/extension/management.ts +52 -0
  39. package/src/extension/register.ts +47 -10
  40. package/src/extension/run-index.ts +20 -2
  41. package/src/extension/run-maintenance.ts +2 -2
  42. package/src/extension/team-tool/parallel-dispatch.ts +1 -1
  43. package/src/extension/team-tool/run.ts +3 -6
  44. package/src/extension/team-tool.ts +67 -11
  45. package/src/observability/event-to-metric.ts +2 -1
  46. package/src/runtime/async-runner.ts +42 -34
  47. package/src/runtime/background-runner.ts +165 -7
  48. package/src/runtime/child-pi.ts +111 -18
  49. package/src/runtime/code-summary.ts +1 -1
  50. package/src/runtime/crash-recovery.ts +1 -1
  51. package/src/runtime/crew-agent-runtime.ts +2 -1
  52. package/src/runtime/heartbeat-watcher.ts +4 -0
  53. package/src/runtime/live-agent-manager.ts +1 -1
  54. package/src/runtime/live-session-runtime.ts +2 -1
  55. package/src/runtime/manifest-cache.ts +2 -2
  56. package/src/runtime/model-fallback.ts +2 -1
  57. package/src/runtime/phase-progress.ts +1 -1
  58. package/src/runtime/pi-args.ts +3 -1
  59. package/src/runtime/pi-spawn.ts +6 -0
  60. package/src/runtime/prose-compressor.ts +1 -1
  61. package/src/runtime/result-extractor.ts +0 -1
  62. package/src/runtime/retry-executor.ts +1 -1
  63. package/src/runtime/runtime-resolver.ts +8 -3
  64. package/src/runtime/skill-instructions.ts +0 -1
  65. package/src/runtime/stale-reconciler.ts +30 -3
  66. package/src/runtime/subagent-manager.ts +2 -0
  67. package/src/runtime/task-display.ts +1 -1
  68. package/src/runtime/task-graph-scheduler.ts +1 -1
  69. package/src/runtime/task-runner/live-executor.ts +15 -0
  70. package/src/runtime/task-runner/tail-read.ts +26 -0
  71. package/src/runtime/task-runner.ts +1007 -383
  72. package/src/runtime/team-runner.ts +9 -5
  73. package/src/runtime/worker-startup.ts +3 -1
  74. package/src/schema/team-tool-schema.ts +2 -1
  75. package/src/state/active-run-registry.ts +8 -2
  76. package/src/state/atomic-write.ts +17 -0
  77. package/src/state/contracts.ts +5 -2
  78. package/src/state/event-log-rotation.ts +118 -31
  79. package/src/state/event-log.ts +33 -5
  80. package/src/state/event-reconstructor.ts +4 -2
  81. package/src/state/mailbox.ts +5 -1
  82. package/src/state/schedule.ts +146 -0
  83. package/src/state/types.ts +40 -0
  84. package/src/state/usage.ts +20 -0
  85. package/src/ui/crew-widget.ts +2 -2
  86. package/src/ui/run-event-bus.ts +1 -1
  87. package/src/ui/run-snapshot-cache.ts +2 -1
  88. package/src/ui/snapshot-types.ts +1 -0
  89. package/src/utils/gh-protocol.ts +2 -2
  90. package/src/utils/names.ts +1 -1
  91. package/src/utils/sse-parser.ts +0 -2
  92. package/src/worktree/branch-freshness.ts +1 -1
  93. package/src/worktree/cleanup.ts +54 -14
  94. package/src/worktree/worktree-manager.ts +19 -9
@@ -0,0 +1,78 @@
1
+ # Bug #6: Foreground fast-fix team cancelled after explore — ROOT CAUSE CONFIRMED
2
+
3
+ | Field | Value |
4
+ |---|---|
5
+ | **Severity** | 🔴 HIGH |
6
+ | **Status** | Root cause confirmed, 100% reproducible |
7
+ | **Affected** | Foreground team runs when concurrent tool calls happen |
8
+ | **Symptom** | Explore completes with zero output, run immediately cancelled |
9
+
10
+ ## Reproduce (100%)
11
+
12
+ 1. Start a foreground fast-fix team run
13
+ 2. While it's running, call ANY other team action (get, plan, settings, etc.)
14
+ 3. Result: Explore "completes" with `outputLength=0, jsonEvents=0, toolUses=0` → run cancelled
15
+
16
+ ## Without concurrent calls
17
+
18
+ Same fast-fix run with NO other tool calls → **completes successfully** with `jsonEvents=120`, full output.
19
+
20
+ ## Root Cause
21
+
22
+ When multiple tool calls happen concurrently with a foreground live-session run:
23
+
24
+ 1. Foreground run starts, spawns live-session agent for explore
25
+ 2. User calls `team get`, `team plan`, `team settings`, etc.
26
+ 3. Pi processes these tool calls in the same session
27
+ 4. Pi may trigger **auto-compaction** (context grows from tool outputs)
28
+ 5. Compaction or context operation **interrupts the live-session agent**
29
+ 6. Live-session agent's prompt returns with **zero output** (`outputLength:0`)
30
+ 7. team-runner marks task as "completed" (exit code 0, but no output)
31
+ 8. Next phase transition fails — run gets `"caller_cancelled"` abort
32
+
33
+ ### Evidence chain
34
+
35
+ ```
36
+ Successful run (no concurrent calls):
37
+ live-session.prompt_done: elapsedMs=30888, jsonEvents=30, outputLength=20894
38
+ → All 3 tasks complete
39
+
40
+ Failed run (with concurrent tool calls):
41
+ live-session.prompt_done: elapsedMs=32304, jsonEvents=0, outputLength=0
42
+ → Explore "completes" with nothing, run immediately cancelled
43
+ ```
44
+
45
+ The `"outputLength":0` is the smoking gun — the live-session agent's prompt completed
46
+ without producing any output because Pi was busy processing other tool calls.
47
+
48
+ ### Key difference in status.json
49
+
50
+ | Field | Successful | Failed |
51
+ |---|---|---|
52
+ | jsonEvents | 120 | 0 |
53
+ | toolUses | many | 0 |
54
+ | output | full context | empty |
55
+
56
+ ## Fix suggestions
57
+
58
+ ### Option A: Queue concurrent tool calls during foreground run
59
+ When a foreground run is active, queue other team tool calls instead of processing immediately.
60
+
61
+ ### Option B: Protect live-session prompt from interruption
62
+ In `live-session-runtime.ts`, add a guard that prevents context operations during `session.prompt()`.
63
+
64
+ ### Option C: Detect zero-output completion as failure
65
+ In `team-runner.ts`, when a live-session task completes with `outputLength=0` and `toolUses=0`, treat it as a failure and retry instead of proceeding.
66
+
67
+ ### Option D: Warn user
68
+ When foreground run is active and user calls another team action, return a warning:
69
+ "Foreground run is active. Concurrent operations may interrupt the running agent."
70
+
71
+ ## Files
72
+
73
+ ```
74
+ pi-crew/src/extension/register.ts — startForegroundRun(), concurrent tool handling
75
+ pi-crew/src/runtime/live-session-runtime.ts — promptWithTimeout(), output capture
76
+ pi-crew/src/runtime/team-runner.ts — task completion handling
77
+ pi-crew/src/extension/registration/compaction-guard.ts — auto-compaction during runs
78
+ ```
@@ -0,0 +1,112 @@
1
+ # Bug #7: Async notifier "stale ctx detected; stopping notifier" — không restart
2
+
3
+ | Field | Value |
4
+ |---|---|
5
+ | **Severity** | 🔴 HIGH |
6
+ | **Status** | Root cause confirmed, fix pending |
7
+ | **Affected** | Tất cả pi-crew users sau khi Pi session restarts/compacts |
8
+ | **Symptom** | Sau restart/compact, notifier dừng hoàn toàn — không còn nhận notifications cho run completions |
9
+
10
+ ## Mô tả
11
+
12
+ Sau khi Pi restart (hoặc /clear, /compact, session switch), xuất hiện error:
13
+ ```
14
+ [pi-crew] async notifier stale ctx detected; stopping notifier.
15
+ ```
16
+
17
+ Sau đó, pi-crew **không còn deliver notifications** cho run completions. Background runs hoàn thành nhưng user không được báo.
18
+
19
+ ## Root cause
20
+
21
+ ### Flow bình thường (mong đợi):
22
+
23
+ ```
24
+ Pi session_start event
25
+ → sessionGeneration++
26
+ → currentCtx = newCtx
27
+ → cleanupRuntime() (stop old notifier)
28
+ → startAsyncRunNotifier(newCtx, ...)
29
+ → New notifier hoạt động với newCtx ✅
30
+ ```
31
+
32
+ ### Flow bị lỗi:
33
+
34
+ ```
35
+ 1. Old notifier interval tick fires
36
+ 2. isCurrent(generation) check → true (generation chưa increment)
37
+ 3. ctx.ui.notify() called
38
+ 4. Pi đã invalidate old ctx → throw Error("This extension ctx is stale...")
39
+ 5. Catch block: message.includes("stale") → true
40
+ 6. stopAsyncRunNotifier(state) → clearInterval(interval)
41
+ 7. console.error("stale ctx detected; stopping notifier.")
42
+ 8. ❌ Notifier stopped permanently
43
+ ```
44
+
45
+ Vấn đề: **session_start handler** CHƯA kịp chạy để start new notifier.
46
+
47
+ ### Tại sao session_start chưa chạy?
48
+
49
+ Khi Pi invalidate ctx, old notifier interval **vẫn đang chạy** (clearInterval chưa được gọi). Interval tick xảy ra **trước khi** `cleanupRuntime()` hoặc `session_start` handler chạy. Đây là **race condition** giữa:
50
+ - Old notifier's setInterval tick
51
+ - Pi's session shutdown/start event sequence
52
+
53
+ ### Code location
54
+
55
+ **`/home/bom/source/my_pi/pi-crew/src/extension/async-notifier.ts`**, lines 103-112:
56
+ ```typescript
57
+ } catch (error) {
58
+ const message = error instanceof Error ? error.message : String(error);
59
+ if (message.includes("stale") || message.includes("session replacement") || message.includes("old ctx")) {
60
+ console.error(`[pi-crew] async notifier stale ctx detected; stopping notifier.`);
61
+ try { stopAsyncRunNotifier(state); } catch { /* ignore */ }
62
+ return; // ❌ Stops the interval, never restarts
63
+ }
64
+ }
65
+ ```
66
+
67
+ ### Why it matters
68
+
69
+ - User restarts Pi → notifier dies → background runs complete silently
70
+ - User phải manually check `team status` để biết runs hoàn thành
71
+ - Ảnh hưởng UX nghiêm trọng: pi-crew "câm" sau restart
72
+
73
+ ## Fix
74
+
75
+ ### Option A: Silent swallow stale errors (recommended)
76
+
77
+ Thay vì stop notifier, chỉ **skip notification** này và chờ session_start restart notifier với new ctx:
78
+
79
+ ```typescript
80
+ } catch (error) {
81
+ const message = error instanceof Error ? error.message : String(error);
82
+ if (message.includes("stale") || message.includes("session replacement") || message.includes("old ctx")) {
83
+ // Don't stop — session_start will create a new notifier with the new ctx.
84
+ // This old notifier's isCurrent guard will return false on next tick,
85
+ // making it effectively dormant until cleaned up.
86
+ return;
87
+ }
88
+ console.error(`[pi-crew] async notifier error: ${message}`);
89
+ }
90
+ ```
91
+
92
+ Rationale: `isCurrent` guard sẽ return false sau khi `sessionGeneration++` → old notifier interval vẫn chạy nhưng không làm gì (silent). New notifier từ `session_start` sẽ hoạt động bình thường.
93
+
94
+ ### Option B: Add explicit restart mechanism
95
+
96
+ Add `restartAsyncRunNotifier()` function và gọi từ `session_start` handler. Nhưng Option A đơn giản hơn và đủ.
97
+
98
+ ## Key files
99
+
100
+ ```
101
+ pi-crew/src/extension/async-notifier.ts — startAsyncRunNotifier(), stopAsyncRunNotifier()
102
+ pi-crew/src/extension/register.ts — session_start handler, isCurrent guard
103
+ ```
104
+
105
+ ## Pi SDK reference
106
+
107
+ Pi's `ExtensionRunner.invalidate()` sets `staleMessage` → `assertActive()` throws:
108
+ ```
109
+ "This extension ctx is stale after session replacement or reload.
110
+ Do not use a captured pi or command ctx after ctx.newSession(),
111
+ ctx.fork(), ctx.switchSession(), or ctx.reload()."
112
+ ```
@@ -0,0 +1,100 @@
1
+ # Bug #8: Background child-process 300s timeout — Silent hang
2
+
3
+ | Field | Value |
4
+ |---|---|
5
+ | **Severity** | 🟠 MEDIUM |
6
+ | **Status** | Root cause identified, fix partially applied |
7
+ | **Affected** | All async/background team runs (child-process runtime) |
8
+
9
+ ## Symptom
10
+
11
+ ```
12
+ worker.spawned: pid=177677 ✅ (real process spawned)
13
+ worker.response_timeout: No output for 300000ms
14
+ crew.task.heartbeat_dead: elapsedMs=300774
15
+ worker.exit: exitCode=null (killed)
16
+ Result: jsonEvents=0, toolUses=0, output.log=DOES NOT EXIST, stderr=EMPTY
17
+ ```
18
+
19
+ ## Root Cause
20
+
21
+ Child Pi process:
22
+ 1. Spawned successfully (real OS process with valid PID)
23
+ 2. Ran completely SILENT — zero stdout, zero stderr for 5 minutes
24
+ 3. Timed out after 300s with no output
25
+ 4. Killed by SIGTERM
26
+
27
+ **The process was alive but produced no output at all.** This is NOT:
28
+ - ❌ Crash (would have stderr)
29
+ - ❌ 429 rate limit (Round 1 fix handles this)
30
+ - ❌ Model error (would have error output)
31
+
32
+ **Possible causes:**
33
+ 1. MiniMax provider silently hangs in child-process context
34
+ 2. Child Pi startup error but stderr channel not capturing it
35
+ 3. Model called but produces empty response → child waits forever
36
+ 4. IPC communication failure between parent and child
37
+
38
+ ## Evidence
39
+
40
+ ```
41
+ Event timeline (team_20260519090953_e6ddc7b21b0048fa):
42
+ 09:09:54.511Z worker.spawned pid=177677
43
+ 09:13:38.075Z task.attention: 223s idle (no observed activity)
44
+ 09:14:54.516Z worker.response_timeout: No output for 300000ms
45
+ → stderr: NOT IN EVENT (timeoutStderr was empty/undefined)
46
+ 09:14:57.535Z worker.exit exitCode=null (killed)
47
+ ```
48
+
49
+ ```
50
+ Agent files: only status.json exists
51
+ output.log: DOES NOT EXIST
52
+ stderr.log: DOES NOT EXIST
53
+ result: "(no output)" (11 bytes)
54
+ ```
55
+
56
+ ## Partial Fix Applied
57
+
58
+ `timeoutStderr` is now captured and included in `response_timeout` events. However, in this case the stderr was empty — meaning the child process ran but produced nothing to stderr.
59
+
60
+ ## Files Involved
61
+
62
+ ```
63
+ pi-crew/src/runtime/child-pi.ts — spawn, timeout, stderr capture
64
+ pi-crew/src/runtime/background-runner.ts — async run management
65
+ pi-crew/src/runtime/task-runner.ts — worker lifecycle
66
+ ```
67
+
68
+ ## Fix Suggestions
69
+
70
+ ### Fix A: Add stderr capture at spawn moment
71
+ Capture any startup errors from child Pi's stderr pipe immediately after spawn.
72
+
73
+ ```typescript
74
+ // In child-pi.ts, after child spawn:
75
+ child.stderr?.on("data", (chunk) => {
76
+ stderr += chunk.toString();
77
+ // Also log to parent for debugging
78
+ console.error("[pi-crew:child-stderr]", chunk.toString());
79
+ });
80
+ ```
81
+
82
+ ### Fix B: Reduce timeout for background workers
83
+ For background/async runs, use shorter timeout (60s) since output should stream quickly.
84
+
85
+ ### Fix C: Detect silent spawn (no output within 30s = warning)
86
+ If a worker spawns but produces zero output within 30s, emit a warning event.
87
+
88
+ ### Fix D: Add process startup verification
89
+ Send a ping to the child Pi and expect a response within 10s. If no response, consider it a spawn failure.
90
+
91
+ ## Comparison: Live-session vs Child-process
92
+
93
+ | Aspect | Live-session | Child-process |
94
+ |---|---|---|
95
+ | Startup | Immediate | Delayed (~1s) |
96
+ | Model output | Streaming JSON | Silent hang |
97
+ | Error visibility | Direct | Hidden (no stderr) |
98
+ | Timeout | Works correctly | Silent 300s hang |
99
+
100
+ This suggests the issue is specific to **child-process runtime with MiniMax model** — the model provider silently fails in the background subprocess context.
@@ -0,0 +1,75 @@
1
+ # Bug #9 Fix: Executor Yield Limit — New `needs_attention` Status
2
+
3
+ **Date**: 2026-05-19
4
+ **Root Cause**: Executor agent completes work but doesn't call `submit_result` → yield enforcement sends 3 reminders → task marked `completed` with `exitCode: 0` → artifact missing/incomplete
5
+ **Status**: ✅ Fixed
6
+
7
+ ## Problem
8
+
9
+ When an executor agent (or any live-session worker) completes its task but doesn't call the `submit_result` tool:
10
+
11
+ 1. The live-session runtime's yield enforcement loop runs (max 3 reminders × 500ms = 1.5s window)
12
+ 2. After 3 reminders with no `submit_result`, a `task.attention` event fires with `reason: "no_yield"`
13
+ 3. But the task is still marked `status: "completed"` with `exitCode: 0`
14
+ 4. The `resultArtifact` contains `liveResult.stdout || "(no output)"` — which may be empty
15
+ 5. The executor's actual file write was completed but never captured in the result artifact
16
+
17
+ This means tasks that didn't properly submit their result appear "completed" in the UI, misleading users into thinking the work was done.
18
+
19
+ ## Fix
20
+
21
+ Added a new `needs_attention` task status that is set when a worker completes without calling `submit_result`.
22
+
23
+ ### New Status: `needs_attention`
24
+
25
+ - **Type**: Terminal status (like `completed`, `failed`, `cancelled`, `skipped`)
26
+ - **Meaning**: Worker finished executing but didn't submit a result — work may or may not be complete
27
+ - **Transitions**: `running → needs_attention`, `needs_attention → queued` (retry), `needs_attention → running` (re-run)
28
+ - **Icon**: ⚠ (warning sign) in UI
29
+
30
+ ### Changes
31
+
32
+ | File | Change |
33
+ |---|---|
34
+ | `src/state/contracts.ts` | Added `"needs_attention"` to `TEAM_TASK_STATUSES`, `TEAM_TERMINAL_TASK_STATUSES`, `TEAM_TASK_STATUS_TRANSITIONS`, `TEAM_EVENT_TYPES` |
35
+ | `src/runtime/task-runner.ts` | Added `noYield` flag; when no yield detected → set `status: "needs_attention"` instead of `"completed"`, emit `"task.needs_attention"` event |
36
+ | `src/runtime/crew-agent-runtime.ts` | Added `"needs_attention"` to `CrewAgentStatus` type; updated `taskStatusToAgentStatus()` |
37
+ | `src/runtime/team-runner.ts` | Added `"needs_attention"` to `terminalStatuses` set for workflow phase advancement |
38
+ | `src/runtime/stale-reconciler.ts` | Added `"needs_attention"` to `allTerminal` check |
39
+ | `src/runtime/crash-recovery.ts` | Added `"needs_attention"` to `isTerminalTask()` |
40
+ | `src/runtime/phase-progress.ts` | Added `"needs_attention"` to `TERMINAL_STATUSES` |
41
+ | `src/runtime/task-display.ts` | Added ⚠ icon for `"needs_attention"` status |
42
+ | `src/ui/snapshot-types.ts` | Added `needsAttention?: number` to `RunUiProgress` |
43
+ | `src/ui/run-snapshot-cache.ts` | Track `"needs_attention"` tasks in progress calculation |
44
+ | `src/ui/crew-widget.ts` | Added `"needs_attention"` to `ERROR_STATUSES`; added ⚠ icon |
45
+ | `src/ui/run-event-bus.ts` | Added `"task.needs_attention"` to `WORKER_LIFECYCLE_TYPES` |
46
+ | `src/config/defaults.ts` | Added `"task.needs_attention"` to `terminalEventTypes` |
47
+ | `src/state/event-reconstructor.ts` | Added `"task.needs_attention"` event → `"needs_attention"` status mapping |
48
+ | `src/observability/event-to-metric.ts` | Added `"crew.task.needs_attention"` metric |
49
+
50
+ ## Status Transition Graph (Updated)
51
+
52
+ ```
53
+ queued → running → completed ✓
54
+ → failed ✗
55
+ → cancelled ■
56
+ → needs_attention ⚠ (NEW)
57
+
58
+ needs_attention → queued (retry)
59
+ → running (re-run)
60
+ ```
61
+
62
+ ## Verification
63
+
64
+ ```bash
65
+ cd /home/bom/source/my_pi/pi-crew
66
+ npx tsc --noEmit # No errors
67
+ npx vitest run test/unit/stale-reconciler.test.ts # All 8 tests pass
68
+ ```
69
+
70
+ ## User Impact
71
+
72
+ - Tasks that previously showed as "completed" (✓) with missing artifacts now show as "⚠ needs_attention"
73
+ - Users can clearly see which tasks need manual review
74
+ - Downstream tasks (verifier, etc.) will see the task as "needs_attention" instead of "completed" and can adjust behavior accordingly
75
+ - Workflow phase advancement correctly treats `needs_attention` as a terminal status
@@ -0,0 +1,109 @@
1
+ # Bug #10: Child-Process Silent Timeout — MINIMAX_API_KEY Filtered Out
2
+
3
+ **Date:** 2026-05-19
4
+ **Severity:** HIGH
5
+ **Type:** Bug (not design issue)
6
+ **Status:** OPEN — Fix applied, pending verification
7
+
8
+ ## Summary
9
+
10
+ Background child-process workers silently time out after 300 seconds with zero output
11
+ because `MINIMAX_API_KEY` is filtered out by `sanitizeEnvSecrets()` before the child
12
+ process is spawned. The child Pi has no API credentials to call the model.
13
+
14
+ ## Root Cause
15
+
16
+ In `src/runtime/child-pi.ts`, lines 159-161:
17
+
18
+ ```typescript
19
+ function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): SpawnOptions {
20
+ const filteredEnv = sanitizeEnvSecrets(env); // ← STRIPS ALL *API_KEY* VARS
21
+ return {
22
+ cwd,
23
+ env: { ...filteredEnv, PI_CREW_PARENT_PID: String(process.pid) },
24
+ ...
25
+ };
26
+ }
27
+ ```
28
+
29
+ The `sanitizeEnvSecrets()` function (in `src/utils/env-filter.ts`) uses a deny-list
30
+ pattern to filter out keys matching secret patterns:
31
+
32
+ ```typescript
33
+ SECRET_KEY_PATTERN = /(?:^|[_.-])(token|api[-_]?key|password|passwd|secret|credential|authorization|private[-_]?key)(?:$|[_.-])/i;
34
+ ```
35
+
36
+ `MINIMAX_API_KEY` matches this pattern because `_API_KEY` contains `api_key` as a
37
+ substring, which matches the `api[-_]?key` part of the regex.
38
+
39
+ ### Why foreground (live-session) works fine
40
+
41
+ The live-session runtime in `live-session-runtime.ts` uses the SAME parent Pi session
42
+ that already has the API key loaded. There's no separate child process — the live
43
+ session inherits the parent's environment directly. No `sanitizeEnvSecrets()` call.
44
+
45
+ ### Why background (child-process) fails
46
+
47
+ 1. `team action='run'` with async=true → `background-runner.ts` → `child-pi.ts`
48
+ 2. Child Pi is spawned via `spawn()` with `buildChildPiSpawnOptions()`
49
+ 3. `buildChildPiSpawnOptions` calls `sanitizeEnvSecrets()` which strips `MINIMAX_API_KEY`
50
+ 4. Child Pi starts with no API key → cannot authenticate with MiniMax → hangs silently
51
+ 5. After 300s of no output, `response_timeout` fires and kills the process
52
+
53
+ ## Evidence
54
+
55
+ All 7+ failed background workers show the same pattern:
56
+ - `worker.spawned` — PID confirmed, process is alive
57
+ - `task.attention` — 223+ seconds of idle (no stdout, no stderr, no jsonEvents)
58
+ - `worker.response_timeout` — "No output for 300000ms"
59
+ - `worker.exit` — `exitCode=null` (SIGTERM kill)
60
+
61
+ No error output because the child Pi can't even report an auth failure — it simply
62
+ silently waits for a model response that never comes.
63
+
64
+ ## Fix
65
+
66
+ **File:** `src/runtime/child-pi.ts`
67
+
68
+ In `buildChildPiSpawnOptions()`, change the `sanitizeEnvSecrets()` call to preserve
69
+ model provider API keys:
70
+
71
+ ```typescript
72
+ function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): SpawnOptions {
73
+ // Preserve model provider API keys (MINIMAX_API_KEY, OPENAI_API_KEY, etc.)
74
+ // These are needed by the child Pi to call the configured model provider.
75
+ const filteredEnv = sanitizeEnvSecrets(env, {
76
+ allowList: ["MINIMAX_*", "OPENAI_*", "ANTHROPIC_*", "GOOGLE_*", "AZURE_*", "AWS_*", "ZEU_*", "ZERODEV_*", "*_API_KEY", "*_TOKEN", "*_SECRET"],
77
+ });
78
+ return {
79
+ cwd,
80
+ env: { ...filteredEnv, PI_CREW_PARENT_PID: String(process.pid) },
81
+ stdio: ["pipe", "pipe", "pipe"],
82
+ detached: process.platform !== "win32",
83
+ windowsHide: true,
84
+ };
85
+ }
86
+ ```
87
+
88
+ This uses the allow-list mode of `sanitizeEnvSecrets()`, which preserves ONLY keys
89
+ matching the globs. Every other variable is stripped — not just secret-like keys
90
+ (passwords, credentials, etc.) but also essential ones such as `PATH` and `HOME`,
91
+ which must therefore be added to the allow-list as well (see Bug #12).
92
+
93
+ ## Why this is safe
94
+
95
+ - Model API keys are not sensitive in the same way as passwords — they're designed
96
+ to be passed to API endpoints
97
+ - The allow-list names known provider prefixes, but note that the `*_API_KEY`,
98
+ `*_TOKEN` and `*_SECRET` globs also pass through ANY variable with those suffixes
99
+ - Secrets that match none of the globs (DB passwords, internal credentials) are still filtered out
100
+ - The `PI_CREW_PARENT_PID` is added after filtering so it won't conflict
101
+
102
+ ## Verification Plan
103
+
104
+ 1. Apply the fix to `src/runtime/child-pi.ts`
105
+ 2. Restart Pi to reload the extension
106
+ 3. Run a background team (e.g., `team action='run', team='research', async=true`)
107
+ 4. Verify the child process produces output within the first 30 seconds
108
+ 5. Compare env of child process (add temporary debug logging) to confirm
109
+ `MINIMAX_API_KEY` is present
@@ -0,0 +1,92 @@
1
+ # Bug #11: Background Runner "spawn pi ENOENT" — pi binary not in PATH
2
+
3
+ **Date:** 2026-05-19
4
+ **Severity:** 🔴 HIGH
5
+ **Type:** Regression (broken in current session)
6
+ **Status:** ✅ Fixed — added `resolvePiCliScript()` call for non-Windows platforms
7
+
8
+ ## Summary
9
+
10
+ Background async team runs fail immediately with "spawn pi ENOENT" because the `getPiSpawnCommand()` function returns `command: "pi"` without resolving the full path on non-Windows platforms. When the detached background-runner process starts with a minimal PATH environment, it cannot find the `pi` binary.
11
+
12
+ ## Root Cause
13
+
14
+ In `src/runtime/pi-spawn.ts`, lines 153-171:
15
+
16
+ ```typescript
17
+ export function getPiSpawnCommand(args: string[]): PiSpawnCommand {
18
+ // ...
19
+ if (process.platform === "win32") {
20
+ const script = resolvePiCliScript(); // ← Only called on Windows!
21
+ if (script) return { command: process.execPath, args: [script, ...args] };
22
+ }
23
+ return { command: "pi", args }; // ← Returns bare "pi" on Linux/macOS!
24
+ }
25
+ ```
26
+
27
+ On Windows, the full path to the Pi entry point script is resolved via `resolvePiCliScript()`. On Linux/macOS, the function returns `command: "pi"` which relies on PATH lookup. When the detached background-runner process inherits a minimal PATH, `pi` is not found → `ENOENT`.
28
+
29
+ ## Why Live-Session Works
30
+
31
+ Live-session runs use `child-pi.ts` → `getPiSpawnCommand()` for spawning child workers, but the parent Pi session has the full PATH including `/home/bom/.nvm/versions/node/v22.22.0/bin`. However, there was also a live-session failure in round 3 with "caller_cancelled" — likely a different issue (compaction guard, Bug #6).
32
+
33
+ ## Why Earlier Async Runs Worked
34
+
35
+ Earlier async runs (before current session) had workers that:
36
+ 1. Spawned successfully (`worker.spawned` event with real PID)
37
+ 2. Ran for 5 minutes producing zero output
38
+ 3. Timed out with `response_timeout`
39
+
40
+ Those were the SAME underlying bug (#8/#10): `MINIMAX_API_KEY` was filtered out. But they DID find the `pi` binary — meaning something changed in the current session that broke the PATH further.
41
+
42
+ The NEW "spawn pi ENOENT" failure in the current session test is a SEPARATE issue: the background runner itself can't find `pi`, not just the child workers.
43
+
44
+ ## Fix
45
+
46
+ **File:** `src/runtime/pi-spawn.ts`
47
+
48
+ Changed from:
49
+
50
+ ```typescript
51
+ if (process.platform === "win32") {
52
+ const script = resolvePiCliScript();
53
+ if (script) return { command: process.execPath, args: [script, ...args] };
54
+ }
55
+ return { command: "pi", args };
56
+ ```
57
+
58
+ To:
59
+
60
+ ```typescript
61
+ if (process.platform === "win32") {
62
+ const script = resolvePiCliScript();
63
+ if (script) return { command: process.execPath, args: [script, ...args] };
64
+ }
65
+ // Linux/macOS: also resolve the full path so child processes can find 'pi' even if
66
+ // PATH is minimal (e.g. in detached background-runner processes). Fall back to "pi"
67
+ // only if resolution fails.
68
+ const script = resolvePiCliScript();
69
+ if (script) return { command: process.execPath, args: [script, ...args] };
70
+ return { command: "pi", args };
71
+ ```
72
+
73
+ `resolvePiCliScript()` on Linux walks from `argv[1]` upward to find the Pi package root (`@mariozechner/pi-coding-agent`), then locates the Pi CLI script from that package's `bin` field. This gives an absolute path that doesn't depend on PATH.
74
+
75
+ ## Why `resolvePiCliScript()` Works
76
+
77
+ On Linux, `process.argv[1]` for the running Node process points to the pi-crew entry script. Walking up the directory tree finds the `@mariozechner/pi-coding-agent` package root, then reads its `bin.pi` field to get the absolute path to the Pi CLI script (e.g., `/home/bom/.nvm/versions/node/v22.22.0/lib/node_modules/@mariozechner/pi-coding-agent/dist/cli.cjs`).
78
+
79
+ This absolute path is then passed to `process.execPath` (Node.js) as the first argument, so the child process runs with:
80
+ ```
81
+ node /path/to/pi/dist/cli.cjs [args]
82
+ ```
83
+
84
+ This doesn't need PATH at all.
85
+
86
+ ## Verification Plan
87
+
88
+ 1. Restart Pi to reload pi-crew with the fix
89
+ 2. Run `team action='run', async=true` with a simple research task
90
+ 3. Verify `worker.spawned` events appear within 5 seconds
91
+ 4. Verify workers produce output within 60 seconds (not 300s timeout)
92
+ 5. Verify final run status is `completed` not `failed`
@@ -0,0 +1,89 @@
1
+ # Bug #12: Child-process crash "Failed to run npm root -g" — essential env vars stripped
2
+
3
+ **Date:** 2026-05-19
4
+ **Severity:** 🔴 HIGH
5
+ **Type:** Bug (regression from Bug #10 fix)
6
+ **Status:** ✅ Fixed — added essential env vars to allow-list
7
+
8
+ ## Summary
9
+
10
+ Child Pi workers crash immediately after spawning with:
11
+ ```
12
+ Error: Failed to run npm root -g: undefined
13
+ at DefaultPackageManager.runNpmCommandSync (...)
14
+ at DefaultPackageManager.getGlobalNpmRoot (...)
15
+ ```
16
+
17
+ The child Pi starts but can't find `npm` because `PATH` was stripped from its environment.
18
+
19
+ ## Root Cause
20
+
21
+ **Bug #10's fix introduced Bug #12.** The `sanitizeEnvSecrets()` function in allow-list mode ONLY preserves keys matching the allow-list. All other keys are stripped. The Bug #10 fix used an allow-list that only included model provider API keys, but stripped ALL other env vars including essential ones like `PATH`, `HOME`, `USER`, etc.
22
+
23
+ ```typescript
24
+ // Bug #10 fix (BROKEN):
25
+ const filteredEnv = sanitizeEnvSecrets(env, {
26
+ allowList: [
27
+ "MINIMAX_*", "OPENAI_*", "*_API_KEY", "*_TOKEN", "*_SECRET",
28
+ // Missing: PATH, HOME, USER, etc.
29
+ ],
30
+ });
31
+ // Result: child process gets ONLY model API keys, nothing else
32
+ // → Child can't find npm/node/PATH → crashes immediately
33
+ ```
34
+
35
+ **The `sanitizeEnvSecrets` allow-list mode works like this:**
36
+ - With allow-list: preserve ONLY keys matching the list, strip everything else
37
+ - Without allow-list: strip only keys matching SECRET_KEY_PATTERN, preserve everything else
38
+
39
+ So the Bug #10 fix preserved `MINIMAX_API_KEY` but stripped `PATH`, making it impossible for the child Pi to find `npm`.
40
+
41
+ ## Why It Looked Like "spawn pi ENOENT" Before (Bug #11)
42
+
43
+ In the previous session, background workers failed with `spawn pi ENOENT` immediately because:
44
+ 1. `getPiSpawnCommand()` returned bare `"pi"` without path resolution (Bug #11)
45
+ 2. The detached background runner had minimal PATH
46
+ 3. Fix: added `resolvePiCliScript()` for non-Windows (Bug #11 fix)
47
+
48
+ After Bug #11 fix + Pi restart, workers spawn OK but crash with `npm root -g` error because:
49
+ 1. `getPiSpawnCommand()` now resolves full path
50
+ 2. Child process starts but has no `PATH` → can't find `npm`
51
+ 3. Pi's package manager calls `npm root -g` → fails with ENOENT
52
+
53
+ ## Fix Applied
54
+
55
+ **File:** `src/runtime/child-pi.ts` — `buildChildPiSpawnOptions()`
56
+
57
+ Added essential non-secret env vars to the allow-list:
58
+
59
+ ```typescript
60
+ const filteredEnv = sanitizeEnvSecrets(env, {
61
+ allowList: [
62
+ // Model provider API keys (Bug #10)
63
+ "MINIMAX_*", "OPENAI_*", "ANTHROPIC_*", "GOOGLE_*",
64
+ "AZURE_*", "AWS_*", "ZEU_*", "ZERODEV_*",
65
+ "*_API_KEY", "*_TOKEN", "*_SECRET",
66
+ // Essential non-secret vars (Bug #12 fix)
67
+ "PATH", "HOME", "USER", "SHELL", "TERM", "LANG", "LC_*", "XDG_*",
68
+ "NVM_*", "NODE_*", "npm_*", "PI_*", "PI_CREW_*", "PI_TEAMS_*",
69
+ ],
70
+ });
71
+ ```
72
+
73
+ This preserves both:
74
+ 1. Model provider API keys (Bug #10 fix)
75
+ 2. Essential environment variables so child process can function (Bug #12 fix)
76
+
77
+ ## Why Not Use Deny-List Mode?
78
+
79
+ In deny-list mode (no allow-list), `SECRET_KEY_PATTERN` strips any key matching secret patterns including `MINIMAX_API_KEY` (because `_API_KEY` matches the pattern). This was the original Bug #10 root cause.
80
+
81
+ The allow-list approach is correct — we just need to include both model API keys AND essential env vars.
82
+
83
+ ## Verification
84
+
85
+ After this fix, background workers should:
86
+ 1. Spawn successfully (Bug #11 verified)
87
+ 2. Find `npm` and `node` via PATH (Bug #12 verified)
88
+ 3. Authenticate with MiniMax via `MINIMAX_API_KEY` (Bug #10 verified)
89
+ 4. Produce output within 60 seconds (not 300s timeout)