pi-crew 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +71 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/extension/action-suggestions.ts +71 -0
  9. package/src/extension/context-status-injection.ts +32 -1
  10. package/src/extension/register.ts +71 -65
  11. package/src/extension/team-tool/api.ts +3 -2
  12. package/src/extension/team-tool/cancel.ts +5 -4
  13. package/src/extension/team-tool/explain.ts +2 -1
  14. package/src/extension/team-tool/failure-patterns.ts +124 -0
  15. package/src/extension/team-tool/inspect.ts +10 -6
  16. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  17. package/src/extension/team-tool/respond.ts +4 -3
  18. package/src/extension/team-tool/run-not-found.ts +54 -0
  19. package/src/extension/team-tool/run.ts +26 -4
  20. package/src/extension/team-tool/status.ts +58 -4
  21. package/src/extension/team-tool.ts +5 -3
  22. package/src/runtime/async-runner.ts +7 -0
  23. package/src/runtime/background-runner.ts +7 -1
  24. package/src/runtime/chain-parser.ts +13 -5
  25. package/src/runtime/checkpoint.ts +13 -1
  26. package/src/runtime/child-pi.ts +9 -1
  27. package/src/runtime/crash-recovery.ts +21 -1
  28. package/src/runtime/live-session-runtime.ts +15 -1
  29. package/src/runtime/parent-guard.ts +2 -2
  30. package/src/runtime/pi-spawn.ts +66 -0
  31. package/src/runtime/stale-reconciler.ts +38 -3
  32. package/src/runtime/task-runner.ts +10 -1
  33. package/src/runtime/team-runner.ts +19 -2
  34. package/src/runtime/verification-gates.ts +21 -1
  35. package/src/schema/team-tool-schema.ts +9 -0
  36. package/src/state/blob-store.ts +12 -10
  37. package/src/state/event-log-rotation.ts +114 -93
  38. package/src/state/event-log.ts +79 -20
  39. package/src/state/health-store.ts +6 -1
  40. package/src/state/locks.ts +66 -16
  41. package/src/state/state-store.ts +14 -1
  42. package/src/ui/card-colors.ts +7 -3
  43. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  44. package/src/ui/live-duration.ts +58 -0
  45. package/src/ui/tool-render.ts +7 -11
  46. package/src/ui/tool-renderers/index.ts +6 -3
  47. package/src/ui/widget/widget-formatters.ts +2 -13
  48. package/src/utils/fs-watch.ts +11 -60
  49. package/src/utils/run-watcher-registry.ts +164 -0
  50. package/src/workflows/discover-workflows.ts +2 -1
  51. package/src/workflows/workflow-config.ts +5 -0
  52. package/src/runtime/dynamic-script-runner.ts +0 -497
  53. package/src/runtime/sandbox.ts +0 -335
@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
135
135
  description: "Run in background when execution support is enabled.",
136
136
  }),
137
137
  ),
138
+ details: Type.Optional(
139
+ Type.Boolean({
140
+ default: true,
141
+ description:
142
+ "(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
143
+ }),
144
+ ),
138
145
  workspaceMode: Type.Optional(
139
146
  Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
140
147
  description:
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
318
325
  taskId?: string;
319
326
  message?: string;
320
327
  async?: boolean;
328
+ /** (status) Output detail level. false = compact summary. Default: true (full). */
329
+ details?: boolean;
321
330
  workspaceMode?: "single" | "worktree";
322
331
  context?: "fresh" | "fork";
323
332
  cwd?: string;
@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
190
190
  metadataWritten = true;
191
191
  });
192
192
  } catch (error) {
193
- // Issue 4 fix: Clean up orphaned blob if metadata write fails.
194
- // If metadata write fails (e.g., concurrent conflict), the blob content
195
- // is orphaned since no metadata references it. Clean it up to reclaim space.
196
- // Issue 8 fix: Do NOT delete blob content on metadata failure.
197
- // If metadata write fails due to concurrent conflict (different values),
198
- // the blob content is still valid. Another process has written metadata
199
- // referencing this blob - deleting the blob would orphan their metadata.
200
- // The caller can retry the metadata write if needed.
201
- // However, if metadata was never written (metadataWritten === false),
202
- // the blob is orphaned and should be cleaned up.
193
+ // Round 24 (BUG 4 note): the catch block previously checked
194
+ // `if (!blobContentWritten)` — the WRONG variable (the local comment said
195
+ // `metadataWritten === false`). For a CONTENT-ADDRESSED store the blob path
196
+ // is the content hash, so the blob may be referenced by another process's
197
+ // metadata even when OUR metadata write failed (e.g. a concurrent conflict
198
+ // where the peer already wrote metadata for the same hash). Deleting it
199
+ // would orphan their metadata. The safe behavior is therefore to NEVER
200
+ // delete on a metadata write failure and let the periodic
201
+ // cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
202
+ // only removes a blob when its CONTENT was never written (a stray/partial
203
+ // file from a failed content write) — which is the only unambiguously-safe
204
+ // case to clean up here.
203
205
  if (!blobContentWritten) {
204
206
  try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
205
207
  }
@@ -1,5 +1,5 @@
1
1
  import * as fs from "node:fs";
2
- import { readEvents } from "./event-log.ts";
2
+ import { readEvents, type TeamEvent } from "./event-log.ts";
3
3
  import { atomicWriteFile } from "./atomic-write.ts";
4
4
  import { logInternalError } from "../utils/internal-error.ts";
5
5
  import { withEventLogLockSync } from "./event-log.ts";
@@ -65,6 +65,25 @@ export interface CompactionResult {
65
65
  * 6. Return compaction stats
66
66
  */
67
67
  export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
68
+ const prepared = prepareCompaction(eventsPath, config);
69
+ if (!prepared) return undefined;
70
+ // FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
71
+ // event loss during compaction. Without lock, events can be appended between
72
+ // read and write, lost silently.
73
+ //
74
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
75
+ // appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
76
+ // directly — calling compactEventLog from inside the lock deadlocks (the
77
+ // mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
78
+ // log grew unbounded until events were silently dropped past 50MB).
79
+ return withEventLogLockSync(eventsPath, () => applyCompactionUnlocked(eventsPath, prepared));
80
+ }
81
+
82
+ /** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
83
+ * outside the lock (read-only). Returns the compacted lines + stats needed
84
+ * for the write phase. */
85
+ export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
86
+ { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
68
87
  if (!fs.existsSync(eventsPath)) return undefined;
69
88
  const cfg = resolveConfig(config);
70
89
  let originalSize: number;
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
74
93
  if (originalCount <= cfg.compactToCount) return undefined;
75
94
  const kept = allEvents.slice(-cfg.compactToCount);
76
95
  const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
96
+ return { lines, originalSize, originalCount, kept };
97
+ }
77
98
 
78
- // FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
79
- // event loss during compaction. Without lock, events can be appended between
80
- // read and write, lost silently.
81
- return withEventLogLockSync(eventsPath, () => {
82
- try {
83
- atomicWriteFile(eventsPath, lines);
84
- } catch (err) {
85
- // Concurrent write conflict — skip compaction this cycle
86
- logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
87
- return undefined;
88
- }
89
- // C2: Re-read to recover any events appended during the compaction window.
90
- // Events appended during the compaction window are preserved because they
91
- // appear in afterWrite and the condition afterWrite.length >= kept.length is
92
- // true, so they are included in the return stats without entering the
93
- // recovery branch.
94
- try {
95
- const afterWrite = readEvents(eventsPath);
96
- // FIX: Check if events were actually lost (afterWrite.length < kept.length)
97
- // rather than using appendedDuringWindow >= 0 which is always true.
98
- // Also use sequence numbers for comparison instead of JSON.stringify
99
- // which is fragile due to key ordering and floating point differences.
100
- if (afterWrite.length >= kept.length) {
101
- // No data loss — either events were appended and kept, or nothing happened.
102
- return {
103
- originalSize,
104
- compactedSize: fs.statSync(eventsPath).size,
105
- eventsRemoved: originalCount - kept.length,
106
- eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
107
- };
108
- }
109
- // afterWrite.length < kept.length — events were lost during compaction window.
110
- // Find missing events and re-append them.
111
- // FIX: Use sequence numbers for comparison instead of JSON.stringify.
112
- const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
113
- const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
114
- let recoveredCount = 0;
115
- let recoveryFailed = false;
116
- if (missingEvents.length > 0) {
117
- // BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
118
- // which REPLACES the entire file each iteration — destroying the
119
- // compacted log and all previously-recovered events, leaving only the
120
- // LAST missing event. FIX: accumulate all missing events into one
121
- // string and append in a single write (appendFileSync appends without
122
- // destroying existing content).
123
- const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
124
- try {
125
- fs.appendFileSync(eventsPath, recoveryLines);
126
- recoveredCount = missingEvents.length;
127
- } catch (err) {
128
- recoveryFailed = true;
129
- logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
130
- }
131
- }
99
+ /** Round 24 (BUG 1): the write+recover phase of compaction. Assumes the
100
+ * caller ALREADY holds the event-log lock (or accepts the unlocked race). */
101
+ export function applyCompactionUnlocked(
102
+ eventsPath: string,
103
+ prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
104
+ ): CompactionResult | undefined {
105
+ const { lines, originalSize, originalCount, kept } = prepared;
106
+ try {
107
+ atomicWriteFile(eventsPath, lines);
108
+ } catch (err) {
109
+ // Concurrent write conflict — skip compaction this cycle
110
+ logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
111
+ return undefined;
112
+ }
113
+ // C2: Re-read to recover any events appended during the compaction window.
114
+ // Events appended during the compaction window are preserved because they
115
+ // appear in afterWrite and the condition afterWrite.length >= kept.length is
116
+ // true, so they are included in the return stats without entering the
117
+ // recovery branch.
118
+ try {
119
+ const afterWrite = readEvents(eventsPath);
120
+ // FIX: Check if events were actually lost (afterWrite.length < kept.length)
121
+ // rather than using appendedDuringWindow >= 0 which is always true.
122
+ // Also use sequence numbers for comparison instead of JSON.stringify
123
+ // which is fragile due to key ordering and floating point differences.
124
+ if (afterWrite.length >= kept.length) {
132
125
  return {
133
126
  originalSize,
134
127
  compactedSize: fs.statSync(eventsPath).size,
135
128
  eventsRemoved: originalCount - kept.length,
136
- eventsKept: kept.length + recoveredCount,
137
- recoveryFailed,
138
- };
139
- } catch {
140
- // Post-write verification failed — compaction likely succeeded.
141
- const compactedSize = fs.statSync(eventsPath).size;
142
- return {
143
- originalSize,
144
- compactedSize,
145
- eventsRemoved: originalCount - kept.length,
146
- eventsKept: kept.length,
129
+ eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
147
130
  };
148
131
  }
149
- });
132
+ // afterWrite.length < kept.length — events were lost during compaction window.
133
+ const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
134
+ const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
135
+ let recoveredCount = 0;
136
+ let recoveryFailed = false;
137
+ if (missingEvents.length > 0) {
138
+ const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
139
+ try {
140
+ fs.appendFileSync(eventsPath, recoveryLines);
141
+ recoveredCount = missingEvents.length;
142
+ } catch (err) {
143
+ recoveryFailed = true;
144
+ logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
145
+ }
146
+ }
147
+ return {
148
+ originalSize,
149
+ compactedSize: fs.statSync(eventsPath).size,
150
+ eventsRemoved: originalCount - kept.length,
151
+ eventsKept: kept.length + recoveredCount,
152
+ recoveryFailed,
153
+ };
154
+ } catch {
155
+ // Post-write verification failed — compaction likely succeeded.
156
+ return {
157
+ originalSize,
158
+ compactedSize: fs.statSync(eventsPath).size,
159
+ eventsRemoved: originalCount - kept.length,
160
+ eventsKept: kept.length,
161
+ };
162
+ }
150
163
  }
151
164
 
152
165
  /**
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
161
174
  // FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
162
175
  // Order of operations: (1) create new empty file, (2) rename old file to archive.
163
176
  // This ensures eventsPath always exists — a reader never sees a missing file.
164
- return withEventLogLockSync(eventsPath, () => {
165
- try {
166
- const ts = new Date().toISOString().replace(/[:.]/g, "-");
167
- let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
168
- // Round 12: avoid timestamp collisions when two rotations happen within
169
- // the same millisecond (copyFileSync would silently overwrite the
170
- // first archive). Append a counter until the path is free.
171
- let collision = 1;
172
- while (fs.existsSync(archivePath)) {
173
- archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
174
- collision++;
175
- }
176
- // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
177
- // rename) destroyed ALL events — atomicWriteFile replaces the file
178
- // in place, so the rename then moved an EMPTY file to the archive.
179
- // FIX: copy current content to the archive first (archive is populated,
180
- // original still intact), then truncate the original to empty in place.
181
- // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
182
- // (no missing-file window for concurrent readers).
183
- fs.copyFileSync(eventsPath, archivePath);
184
- fs.writeFileSync(eventsPath, "", "utf-8");
185
- return true;
186
- } catch (error) {
187
- logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
188
- return false;
177
+ //
178
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
179
+ // rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
180
+ return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
181
+ }
182
+
183
+ /** Round 24 (BUG 1): the lock-free core of rotation. Assumes the caller
184
+ * already holds the event-log lock (or accepts the unlocked race). */
185
+ export function rotateEventLogUnlocked(eventsPath: string): boolean {
186
+ if (!fs.existsSync(eventsPath)) return false;
187
+ try {
188
+ const ts = new Date().toISOString().replace(/[:.]/g, "-");
189
+ let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
190
+ // Round 12: avoid timestamp collisions when two rotations happen within
191
+ // the same millisecond (copyFileSync would silently overwrite the
192
+ // first archive). Append a counter until the path is free.
193
+ let collision = 1;
194
+ while (fs.existsSync(archivePath)) {
195
+ archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
196
+ collision++;
189
197
  }
190
- });
198
+ // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
199
+ // rename) destroyed ALL events — atomicWriteFile replaces the file
200
+ // in place, so the rename then moved an EMPTY file to the archive.
201
+ // FIX: copy current content to the archive first (archive is populated,
202
+ // original still intact), then truncate the original to empty in place.
203
+ // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
204
+ // (no missing-file window for concurrent readers).
205
+ fs.copyFileSync(eventsPath, archivePath);
206
+ fs.writeFileSync(eventsPath, "", "utf-8");
207
+ return true;
208
+ } catch (error) {
209
+ logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
210
+ return false;
211
+ }
191
212
  }
192
213
 
193
214
  export interface EventLogStats {
@@ -9,7 +9,7 @@ import { logInternalError } from "../utils/internal-error.ts";
9
9
  import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
10
10
  import { redactSecrets } from "../utils/redaction.ts";
11
11
  import { sleepSync } from "../utils/sleep.ts";
12
- import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
12
+ import { needsRotation, compactEventLog, rotateEventLog, applyCompactionUnlocked, prepareCompaction, rotateEventLogUnlocked } from "./event-log-rotation.ts";
13
13
 
14
14
  export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
15
15
  export type TeamWatcherAction = "act" | "observe" | "ignore";
@@ -76,7 +76,7 @@ let overflowCounter = 0;
76
76
  * `flushOneEventLogBuffer`, and `state/mailbox.ts`. Prefer the async alternative
77
77
  * (`appendEventAsync`) for all new code.
78
78
  */
79
- export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
79
+ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T, options?: { timeoutMs?: number; staleMs?: number }): T {
80
80
  // Ensure parent directory exists before attempting lock
81
81
  fs.mkdirSync(path.dirname(eventsPath), { recursive: true });
82
82
  const lockDir = `${eventsPath}.lock`;
@@ -86,8 +86,8 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
86
86
  // event loop indefinitely. 500 retries × 10ms = 5s max. After timeout, we
87
87
  // throw a clear error instead of blocking forever. This ensures AbortSignal
88
88
  // handlers, SIGTERM, and graceful shutdown can fire within seconds.
89
- const timeout = 5000;
90
- const staleMs = 10000;
89
+ const timeout = options?.timeoutMs ?? 5000;
90
+ const staleMs = options?.staleMs ?? 10000;
91
91
  let acquired = false;
92
92
  while (true) {
93
93
  try {
@@ -110,24 +110,35 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
110
110
  // to check for orphaned .lock dirs / stale processes.
111
111
  throw errors.eventLogLockTimeout(eventsPath, timeout);
112
112
  }
113
- // Stale detection: if the owning process is dead, remove the stale lock.
113
+ // Round 26 (BUG 3): mtime-based stale check INDEPENDENT of pidFile.
114
+ // If the holder crashed between mkdir and writing pidFile, there is no
115
+ // pidFile to read — the old code just slept until the 5s timeout, then
116
+ // threw, leaving the dir orphaned FOREVER (every retry repeats the
117
+ // timeout). Now: if the lock dir's mtime exceeds staleMs, reclaim it.
118
+ try {
119
+ const dirStat = fs.statSync(lockDir);
120
+ if (Date.now() - dirStat.mtimeMs > staleMs) {
121
+ fs.rmSync(lockDir, { recursive: true, force: true });
122
+ continue;
123
+ }
124
+ } catch { /* dir vanished — let loop retry */ }
125
+ // Round 26 (BUG 4): the mtime check was previously NESTED inside
126
+ // `if (!alive)`, so a recycled PID (crashed holder's PID reused by an
127
+ // unrelated live process) kept `alive=true` and the mtime check NEVER
128
+ // fired → permanent wedge. mtime is now checked FIRST (above) for ALL
129
+ // holders. The PID check below is a secondary fast-path: if the holder
130
+ // PID is provably dead AND the lock isn't stale yet, we still wait
131
+ // (don't steal a fresh lock just because the pid lookup raced).
114
132
  try {
115
133
  const raw = fs.readFileSync(pidFile, "utf-8").trim();
116
134
  const ownerPid = Number.parseInt(raw, 10);
117
135
  if (!Number.isNaN(ownerPid) && ownerPid !== process.pid) {
118
136
  let alive = false;
119
137
  try { process.kill(ownerPid, 0); alive = true; } catch { /* dead */ }
120
- if (!alive) {
121
- try {
122
- const stat = fs.statSync(lockDir);
123
- if (Date.now() - stat.mtimeMs > staleMs) {
124
- fs.rmSync(lockDir, { recursive: true, force: true });
125
- continue;
126
- }
127
- } catch { /* race — let loop sleep */ }
128
- }
138
+ // (mtime already handled above; nothing to do here for dead-but-fresh.)
139
+ void alive;
129
140
  }
130
- } catch { /* no pid file — fall through to sleep */ }
141
+ } catch { /* no pid file — mtime check above already handles it */ }
131
142
  sleepSync(10);
132
143
  }
133
144
  }
@@ -135,7 +146,19 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
135
146
  return fn();
136
147
  } finally {
137
148
  if (acquired) {
138
- try { fs.rmSync(lockDir, { recursive: true, force: true }); } catch { /* best-effort */ }
149
+ // Round 26 (BUG 5): token/PID-guarded release. Previously the release
150
+ // was an UNCONDITIONAL rmSync. If our fn exceeded staleMs, another
151
+ // process could steal our lock (rm our dir, make its own); when our fn
152
+ // finished our finally block would then DELETE THE STEALER's dir → both
153
+ // in the critical section + lost lock. Verify the pidFile still records
154
+ // OUR pid before removing; if it doesn't, the lock was stolen and the
155
+ // current holder owns the dir.
156
+ try {
157
+ const currentPid = fs.readFileSync(pidFile, "utf-8").trim();
158
+ if (currentPid === String(process.pid)) {
159
+ fs.rmSync(lockDir, { recursive: true, force: true });
160
+ }
161
+ } catch { /* lock stolen or already gone — do not touch */ }
139
162
  }
140
163
  }
141
164
  }
@@ -152,6 +175,29 @@ function evictOldestSequenceCacheEntries(): void {
152
175
  }
153
176
  }
154
177
 
178
+ /** @internal — exported for sequence-cache LRU testing (Round 19). */
179
+ export function __test__sequenceCacheSize(): number {
180
+ return sequenceCache.size;
181
+ }
182
+
183
+ /** @internal — seed an entry into the sequence cache for testing. */
184
+ export function __test__seedSequenceCache(eventsPath: string, lastAccessMs: number): void {
185
+ sequenceCache.set(eventsPath, { size: 1, mtimeMs: 0, seq: 0, lastAccessMs });
186
+ }
187
+
188
+ /** @internal — expose eviction for testing. */
189
+ export function __test__evictOldestSequenceCacheEntries(): void {
190
+ evictOldestSequenceCacheEntries();
191
+ }
192
+
193
+ /** @internal — clear the sequence cache. */
194
+ export function __test__clearSequenceCache(): void {
195
+ sequenceCache.clear();
196
+ }
197
+
198
+ /** @internal — the max sequence cache entries bound. */
199
+ export const MAX_SEQUENCE_CACHE_ENTRIES_VALUE = MAX_SEQUENCE_CACHE_ENTRIES;
200
+
155
201
  export function sequencePath(eventsPath: string): string {
156
202
  return `${eventsPath}.seq`;
157
203
  }
@@ -497,9 +543,14 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
497
543
  if (!isTerminal && fs.existsSync(eventsPath)) {
498
544
  const stat = fs.statSync(eventsPath);
499
545
  if (stat.size > MAX_EVENTS_BYTES) {
500
- // Try immediate compact (not waiting for counter % 100)
546
+ // Try immediate compact (not waiting for counter % 100).
547
+ // Round 24 (BUG 1): we are INSIDE withEventLogLockSync. Use the unlocked
548
+ // apply/rotate cores — the locked variants would deadlock (mkdir lock
549
+ // is not re-entrant → 5s timeout → compaction/rotation never ran →
550
+ // unbounded log growth → events silently dropped past 50MB).
501
551
  try {
502
- compactEventLog(eventsPath);
552
+ const prepared = prepareCompaction(eventsPath);
553
+ if (prepared) applyCompactionUnlocked(eventsPath, prepared);
503
554
  } catch (error) {
504
555
  logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
505
556
  }
@@ -507,7 +558,7 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
507
558
  if (fs.existsSync(eventsPath)) {
508
559
  const afterCompact = fs.statSync(eventsPath);
509
560
  if (afterCompact.size > MAX_EVENTS_BYTES) {
510
- rotateEventLog(eventsPath);
561
+ rotateEventLogUnlocked(eventsPath);
511
562
  }
512
563
  }
513
564
  }
@@ -555,7 +606,15 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
555
606
  }
556
607
  appendCounter++;
557
608
  if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
558
- try { compactEventLog(eventsPath); } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
609
+ // Round 24 (BUG 1): we are INSIDE withEventLogLockSync here (called via
610
+ // appendEventInsideLock). The mkdir lock is NOT re-entrant, so calling the
611
+ // locked compactEventLog would deadlock → 5s timeout → compaction never
612
+ // ran → unbounded log growth → events silently dropped past 50MB. Use the
613
+ // unlocked apply path instead (lock already held).
614
+ try {
615
+ const prepared = prepareCompaction(eventsPath);
616
+ if (prepared) applyCompactionUnlocked(eventsPath, prepared);
617
+ } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
559
618
  }
560
619
  try { emitFromTeamEvent(fullEvent); } catch (error) { logInternalError("event-log.emit", error); }
561
620
  return fullEvent;
@@ -4,7 +4,12 @@ import type { RunHealth } from "../runtime/task-health.ts";
4
4
  import { computeRunHealth } from "../runtime/task-health.ts";
5
5
  import type { ManifestSummary } from "../runtime/task-health.ts";
6
6
 
7
- const HEALTH_DIR = ".crew/state/health";
7
+ // Relative to the crew root (`<cwd>/.crew`). BUG A fix (pts/2 hang
8
+ // investigation 2026-06-16): this was `.crew/state/health`, which double-joined
9
+ // to `<crewRoot>/state/.crew/state/health` because the caller passed the state
10
+ // dir (not the crew root). Now the caller passes the real crew root, so this is
11
+ // a plain `state/health` suffix.
12
+ const HEALTH_DIR = "state/health";
8
13
 
9
14
  export interface HealthSnapshot {
10
15
  runId: string;
@@ -66,6 +66,57 @@ function isLockHolderAlive(filePath: string): boolean {
66
66
  }
67
67
  }
68
68
 
69
+ /**
70
+ * Round 26 (BUG 1): read the lock file ONCE and evaluate staleness + holder
71
+ * liveness from that single snapshot.
72
+ *
73
+ * Previously `acquireLockWithRetry` called `isLockStale()` and
74
+ * `isLockHolderAlive()` separately, each performing its own `readFileSync`.
75
+ * Between those two reads the lock could transition stale→fresh (old holder
76
+ * released, new holder acquired): isLockStale saw the OLD createdAt → stale,
77
+ * isLockHolderAlive saw the NEW pid → alive, yielding `!stale && alive` =
78
+ * false → we forcibly rm the NEW holder's freshly-acquired lock and take it
79
+ * ourselves → BOTH in the critical section. Reading once closes the window.
80
+ *
81
+ * Returns `{ canSteal: true }` if the lock is stale OR the holder is dead
82
+ * (safe to forcibly remove); `{ canSteal: false }` if it is fresh AND held by
83
+ * a live process (must keep waiting).
84
+ */
85
+ function readLockSnapshot(filePath: string, staleMs: number): { canSteal: boolean } {
86
+ let stat: fs.Stats | undefined;
87
+ let raw: string | undefined;
88
+ try {
89
+ stat = fs.statSync(filePath);
90
+ raw = fs.readFileSync(filePath, "utf-8");
91
+ } catch {
92
+ // File vanished between writeLockFile's EEXIST and now (holder released).
93
+ // Loop will retry the create; safe to signal "nothing to steal".
94
+ return { canSteal: false };
95
+ }
96
+ // Staleness from a single snapshot.
97
+ let createdAt = parseCreatedAtFromLock(raw);
98
+ if (createdAt === undefined) createdAt = stat.mtimeMs;
99
+ const isStale = Date.now() - createdAt > staleMs;
100
+ // Holder liveness from the SAME snapshot.
101
+ let isAlive = true; // Unknown holder — assume alive to be safe (matches isLockHolderAlive).
102
+ try {
103
+ const parsed = JSON.parse(raw) as { pid?: unknown };
104
+ const pid = typeof parsed.pid === "number" ? parsed.pid : undefined;
105
+ if (pid !== undefined) {
106
+ try {
107
+ process.kill(pid, 0);
108
+ isAlive = true;
109
+ } catch (error) {
110
+ const code = (error as NodeJS.ErrnoException).code;
111
+ // EPERM/ESRCH → treat as not-alive (stealable), see isLockHolderAlive.
112
+ isAlive = false;
113
+ }
114
+ }
115
+ } catch { /* malformed payload — keep isAlive=true */ }
116
+ // Steal if stale OR holder dead — matches the original intent.
117
+ return { canSteal: isStale || !isAlive };
118
+ }
119
+
69
120
  /**
70
121
  * Lock file kinds. Discriminator written to the lock file payload so that:
71
122
  * - Debugging tools (e.g. a future `pi-crew locks` command) can identify
@@ -180,9 +231,10 @@ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind
180
231
  if (Date.now() > deadline) {
181
232
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
182
233
  }
183
- const isStale = isLockStale(filePath, staleMs);
184
- const isHolderAlive = isLockHolderAlive(filePath);
185
- if (!isStale && isHolderAlive) {
234
+ // Round 26 (BUG 1): single-snapshot read closes the TOCTOU window between
235
+ // separate stale + alive reads (which could race stale→fresh).
236
+ const { canSteal } = readLockSnapshot(filePath, staleMs);
237
+ if (!canSteal) {
186
238
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
187
239
  }
188
240
  // Stale or dead holder — forcibly remove the lock.
@@ -213,9 +265,9 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind
213
265
  if (Date.now() > deadline) {
214
266
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
215
267
  }
216
- const isStale = isLockStale(filePath, staleMs);
217
- const isHolderAlive = isLockHolderAlive(filePath);
218
- if (!isStale && isHolderAlive) {
268
+ // Round 26 (BUG 1): single-snapshot read (see sync variant).
269
+ const { canSteal } = readLockSnapshot(filePath, staleMs);
270
+ if (!canSteal) {
219
271
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
220
272
  }
221
273
  // Stale or dead holder — forcibly remove the lock.
@@ -244,16 +296,14 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
244
296
  // Between mkdir and lock acquisition, an attacker could plant a symlink.
245
297
  if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
246
298
  fs.mkdirSync(path.dirname(lockFile), { recursive: true });
247
- // FIX: Validate that the target file still exists. If it was deleted and
248
- // recreated since the last lock cycle, the old .lock file may be orphaned
249
- // and should not block the new cycle. Clean it up if the target is missing.
250
- try {
251
- fs.statSync(filePath);
252
- } catch {
253
- // Target file doesn't exist — clean up any stale .lock file and proceed.
254
- // The lock will be acquired fresh for the new file (if fn creates it).
255
- try { fs.rmSync(lockFile, { force: true }); } catch { /* ignore */ }
256
- }
299
+ // Round 26 (BUG 2): REMOVED the pre-acquisition target-file-existence check.
300
+ // It was racy — between statSync(target) and acquire, a concurrent process
301
+ // could acquire the lock to CREATE the target, and we'd delete its active
302
+ // lock. It was also actively wrong for callers that pass a path already
303
+ // ending in `.lock` (config.ts: the checked "target" never exists, so the
304
+ // cleanup ALWAYS fired, deleting a fresh concurrent holder's lock). Genuine
305
+ // orphan locks (crashed holder) are reclaimed by acquireLockWithRetry's
306
+ // staleMs-based steal logic after at most `staleMs`.
257
307
  // FIX (TOCTOU): Re-validate symlink safety before each lock acquisition
258
308
  // attempt. Between our initial check and the acquisition (and between
259
309
  // acquireLockWithRetry's internal retries), an attacker could plant a
@@ -57,7 +57,7 @@ export interface RunPaths {
57
57
  eventsPath: string;
58
58
  }
59
59
 
60
- interface ManifestCacheEntry {
60
+ export interface ManifestCacheEntry {
61
61
  manifest: TeamRunManifest;
62
62
  tasks: TeamTaskState[];
63
63
  manifestMtimeMs: number;
@@ -76,6 +76,19 @@ const MANIFEST_CACHE_TTL_MS = 15 * 1000; // 15 seconds (FIX: increased from 5s f
76
76
  const LOAD_MANIFEST_RETRY_LIMIT = 5; // Configurable retry limit for mtime/size stability checks under contention
77
77
  const manifestCache = new Map<string, ManifestCacheEntry>();
78
78
 
79
+ /** @internal — exported for TTL-eviction unit testing (Round 19). */
80
+ export function __test__setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
81
+ setManifestCache(stateRoot, entry);
82
+ }
83
+
84
+ /** @internal — exported for TTL-eviction unit testing (Round 19). */
85
+ export function __test__getManifestCacheEntry(stateRoot: string): ManifestCacheEntry | undefined {
86
+ return manifestCache.get(stateRoot);
87
+ }
88
+
89
+ /** @internal — the TTL in ms used for manifest cache eviction. */
90
+ export const MANIFEST_CACHE_TTL_MS_VALUE = MANIFEST_CACHE_TTL_MS;
91
+
79
92
  function setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
80
93
  if (manifestCache.has(stateRoot)) manifestCache.delete(stateRoot);
81
94
  entry.cachedAt = Date.now();