npm - pi-crew - Versions diffs - 0.7.5 → 0.7.7 - Mend

pi-crew 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/CHANGELOG.md +71 -0
package/README.md +11 -11
package/docs/commands-reference.md +14 -10
package/docs/troubleshooting.md +131 -0
package/docs/usage.md +9 -4
package/package.json +1 -1
package/src/config/config.ts +11 -4
package/src/extension/action-suggestions.ts +71 -0
package/src/extension/context-status-injection.ts +32 -1
package/src/extension/register.ts +71 -65
package/src/extension/team-tool/api.ts +3 -2
package/src/extension/team-tool/cancel.ts +5 -4
package/src/extension/team-tool/explain.ts +2 -1
package/src/extension/team-tool/failure-patterns.ts +124 -0
package/src/extension/team-tool/inspect.ts +10 -6
package/src/extension/team-tool/lifecycle-actions.ts +5 -4
package/src/extension/team-tool/respond.ts +4 -3
package/src/extension/team-tool/run-not-found.ts +54 -0
package/src/extension/team-tool/run.ts +26 -4
package/src/extension/team-tool/status.ts +58 -4
package/src/extension/team-tool.ts +5 -3
package/src/runtime/async-runner.ts +7 -0
package/src/runtime/background-runner.ts +7 -1
package/src/runtime/chain-parser.ts +13 -5
package/src/runtime/checkpoint.ts +13 -1
package/src/runtime/child-pi.ts +9 -1
package/src/runtime/crash-recovery.ts +21 -1
package/src/runtime/live-session-runtime.ts +15 -1
package/src/runtime/parent-guard.ts +2 -2
package/src/runtime/pi-spawn.ts +66 -0
package/src/runtime/stale-reconciler.ts +38 -3
package/src/runtime/task-runner.ts +10 -1
package/src/runtime/team-runner.ts +19 -2
package/src/runtime/verification-gates.ts +21 -1
package/src/schema/team-tool-schema.ts +9 -0
package/src/state/blob-store.ts +12 -10
package/src/state/event-log-rotation.ts +114 -93
package/src/state/event-log.ts +79 -20
package/src/state/health-store.ts +6 -1
package/src/state/locks.ts +66 -16
package/src/state/state-store.ts +14 -1
package/src/ui/card-colors.ts +7 -3
package/src/ui/dashboard-panes/agents-pane.ts +15 -2
package/src/ui/live-duration.ts +58 -0
package/src/ui/tool-render.ts +7 -11
package/src/ui/tool-renderers/index.ts +6 -3
package/src/ui/widget/widget-formatters.ts +2 -13
package/src/utils/fs-watch.ts +11 -60
package/src/utils/run-watcher-registry.ts +164 -0
package/src/workflows/discover-workflows.ts +2 -1
package/src/workflows/workflow-config.ts +5 -0
package/src/runtime/dynamic-script-runner.ts +0 -497
package/src/runtime/sandbox.ts +0 -335

package/src/schema/team-tool-schema.ts CHANGED Viewed

@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
 			description: "Run in background when execution support is enabled.",
 		}),
 	),
+	details: Type.Optional(
+		Type.Boolean({
+			default: true,
+			description:
+				"(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
+		}),
+	),
 	workspaceMode: Type.Optional(
 		Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
 			description:
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
 	taskId?: string;
 	message?: string;
 	async?: boolean;
+	/** (status) Output detail level. false = compact summary. Default: true (full). */
+	details?: boolean;
 	workspaceMode?: "single" | "worktree";
 	context?: "fresh" | "fork";
 	cwd?: string;

package/src/state/blob-store.ts CHANGED Viewed

@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
 			metadataWritten = true;
 		});
 	} catch (error) {
-		// Issue 4 fix: Clean up orphaned blob if metadata write fails.
-		// If metadata write fails (e.g., concurrent conflict), the blob content
-		// is orphaned since no metadata references it. Clean it up to reclaim space.
-		// Issue 8 fix: Do NOT delete blob content on metadata failure.
-		// If metadata write fails due to concurrent conflict (different values),
-		// the blob content is still valid. Another process has written metadata
-		// referencing this blob - deleting the blob would orphan their metadata.
-		// The caller can retry the metadata write if needed.
-		// However, if metadata was never written (metadataWritten === false),
-		// the blob is orphaned and should be cleaned up.
+		// Round 24 (BUG 4 note): this catch block guards cleanup with
+		// `!blobContentWritten`, although the earlier comment here described the
+		// condition as `metadataWritten === false`. The guard is kept as-is on
+		// purpose. For a CONTENT-ADDRESSED store the blob path is the content
+		// hash, so the blob may be referenced by another process's metadata even
+		// when OUR metadata write failed (e.g. a concurrent conflict where the
+		// peer already wrote metadata for the same hash). Deleting it would
+		// orphan their metadata. The safe behavior is therefore to NEVER delete
+		// a written blob on a metadata write failure and let the periodic
+		// cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
+		// only removes a blob when its CONTENT was never written (a stray/partial
+		// file from a failed content write) — the only unambiguously-safe case.
 		if (!blobContentWritten) {
 			try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
 		}

package/src/state/event-log-rotation.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import * as fs from "node:fs";
-import { readEvents } from "./event-log.ts";
+import { readEvents, type TeamEvent } from "./event-log.ts";
 import { atomicWriteFile } from "./atomic-write.ts";
 import { logInternalError } from "../utils/internal-error.ts";
 import { withEventLogLockSync } from "./event-log.ts";
@@ -65,6 +65,25 @@ export interface CompactionResult {
  * 6. Return compaction stats
  */
 export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
+	const prepared = prepareCompaction(eventsPath, config);
+	if (!prepared) return undefined;
+	// Only the write+recover phase runs under the event-log lock. The read in
+	// prepareCompaction() above happens BEFORE the lock is taken, so events
+	// appended between that read and lock acquisition can be overwritten (lost).
+	//
+	// NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
+	// appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
+	// directly — calling compactEventLog from inside the lock deadlocks (the
+	// mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
+	// log grew unbounded until events were silently dropped past 50MB).
+	return withEventLogLockSync(eventsPath, () => applyCompactionUnlocked(eventsPath, prepared));
+}
+/** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
+ * outside the lock (read-only). Returns the compacted lines + stats needed
+ * for the write phase. */
+export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
+	{ lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
 	if (!fs.existsSync(eventsPath)) return undefined;
 	const cfg = resolveConfig(config);
 	let originalSize: number;
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
 	if (originalCount <= cfg.compactToCount) return undefined;
 	const kept = allEvents.slice(-cfg.compactToCount);
 	const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
+	return { lines, originalSize, originalCount, kept };
+}
-	// FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
-	// event loss during compaction. Without lock, events can be appended between
-	// read and write, lost silently.
-	return withEventLogLockSync(eventsPath, () => {
-		try {
-			atomicWriteFile(eventsPath, lines);
-		} catch (err) {
-			// Concurrent write conflict — skip compaction this cycle
-			logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
-			return undefined;
-		}
-		// C2: Re-read to recover any events appended during the compaction window.
-			// Events appended during the compaction window are preserved because they
-			// appear in afterWrite and the condition afterWrite.length >= kept.length is
-			// true, so they are included in the return stats without entering the
-			// recovery branch.
-		try {
-			const afterWrite = readEvents(eventsPath);
-			// FIX: Check if events were actually lost (afterWrite.length < kept.length)
-			// rather than using appendedDuringWindow >= 0 which is always true.
-			// Also use sequence numbers for comparison instead of JSON.stringify
-			// which is fragile due to key ordering and floating point differences.
-			if (afterWrite.length >= kept.length) {
-				// No data loss — either events were appended and kept, or nothing happened.
-				return {
-					originalSize,
-					compactedSize: fs.statSync(eventsPath).size,
-					eventsRemoved: originalCount - kept.length,
-					eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
-				};
-			}
-			// afterWrite.length < kept.length — events were lost during compaction window.
-			// Find missing events and re-append them.
-			// FIX: Use sequence numbers for comparison instead of JSON.stringify.
-			const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
-			const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
-			let recoveredCount = 0;
-			let recoveryFailed = false;
-			if (missingEvents.length > 0) {
-				// BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
-				// which REPLACES the entire file each iteration — destroying the
-				// compacted log and all previously-recovered events, leaving only the
-				// LAST missing event. FIX: accumulate all missing events into one
-				// string and append in a single write (appendFileSync appends without
-				// destroying existing content).
-				const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
-				try {
-					fs.appendFileSync(eventsPath, recoveryLines);
-					recoveredCount = missingEvents.length;
-				} catch (err) {
-					recoveryFailed = true;
-					logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
-				}
-			}
+/** Round 24 (BUG 1): the write+recover phase of compaction. Assumes the
+ * caller ALREADY holds the event-log lock (or accepts the unlocked race). */
+export function applyCompactionUnlocked(
+	eventsPath: string,
+	prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
+): CompactionResult | undefined {
+	const { lines, originalSize, originalCount, kept } = prepared;
+	try {
+		atomicWriteFile(eventsPath, lines);
+	} catch (err) {
+		// Concurrent write conflict — skip compaction this cycle
+		logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
+		return undefined;
+	}
+	// C2: Re-read to recover any events appended during the compaction window.
+	// Events appended during the compaction window are preserved because they
+	// appear in afterWrite and the condition afterWrite.length >= kept.length is
+	// true, so they are included in the return stats without entering the
+	// recovery branch.
+	try {
+		const afterWrite = readEvents(eventsPath);
+		// FIX: Check if events were actually lost (afterWrite.length < kept.length)
+		// rather than using appendedDuringWindow >= 0 which is always true.
+		// Also use sequence numbers for comparison instead of JSON.stringify
+		// which is fragile due to key ordering and floating point differences.
+		if (afterWrite.length >= kept.length) {
 			return {
 				originalSize,
 				compactedSize: fs.statSync(eventsPath).size,
 				eventsRemoved: originalCount - kept.length,
-				eventsKept: kept.length + recoveredCount,
-				recoveryFailed,
-			};
-		} catch {
-			// Post-write verification failed — compaction likely succeeded.
-			const compactedSize = fs.statSync(eventsPath).size;
-			return {
-				originalSize,
-				compactedSize,
-				eventsRemoved: originalCount - kept.length,
-				eventsKept: kept.length,
+				eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
 			};
 		}
-	});
+		// afterWrite.length < kept.length — events were lost during compaction window.
+		const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
+		const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
+		let recoveredCount = 0;
+		let recoveryFailed = false;
+		if (missingEvents.length > 0) {
+			const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
+			try {
+				fs.appendFileSync(eventsPath, recoveryLines);
+				recoveredCount = missingEvents.length;
+			} catch (err) {
+				recoveryFailed = true;
+				logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
+			}
+		}
+		return {
+			originalSize,
+			compactedSize: fs.statSync(eventsPath).size,
+			eventsRemoved: originalCount - kept.length,
+			eventsKept: kept.length + recoveredCount,
+			recoveryFailed,
+		};
+	} catch {
+		// Post-write verification failed — compaction likely succeeded.
+		return {
+			originalSize,
+			compactedSize: fs.statSync(eventsPath).size,
+			eventsRemoved: originalCount - kept.length,
+			eventsKept: kept.length,
+		};
+	}
 }
 /**
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
 	// FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
 	// Order of operations: (1) create new empty file, (2) rename old file to archive.
 	// This ensures eventsPath always exists — a reader never sees a missing file.
-	return withEventLogLockSync(eventsPath, () => {
-		try {
-			const ts = new Date().toISOString().replace(/[:.]/g, "-");
-			let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
-			// Round 12: avoid timestamp collisions when two rotations happen within
-			// the same millisecond (copyFileSync would silently overwrite the
-			// first archive). Append a counter until the path is free.
-			let collision = 1;
-			while (fs.existsSync(archivePath)) {
-				archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
-				collision++;
-			}
-			// BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
-			// rename) destroyed ALL events — atomicWriteFile replaces the file
-			// in place, so the rename then moved an EMPTY file to the archive.
-			// FIX: copy current content to the archive first (archive is populated,
-			// original still intact), then truncate the original to empty in place.
-			// copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
-			// (no missing-file window for concurrent readers).
-			fs.copyFileSync(eventsPath, archivePath);
-			fs.writeFileSync(eventsPath, "", "utf-8");
-			return true;
-		} catch (error) {
-			logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
-			return false;
+	//
+	// NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
+	// rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
+	return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
+}
+/** Round 24 (BUG 1): the lock-free core of rotation. Assumes the caller
+ * already holds the event-log lock (or accepts the unlocked race). */
+export function rotateEventLogUnlocked(eventsPath: string): boolean {
+	if (!fs.existsSync(eventsPath)) return false;
+	try {
+		const ts = new Date().toISOString().replace(/[:.]/g, "-");
+		let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
+		// Round 12: avoid timestamp collisions when two rotations happen within
+		// the same millisecond (copyFileSync would silently overwrite the
+		// first archive). Append a counter until the path is free.
+		let collision = 1;
+		while (fs.existsSync(archivePath)) {
+			archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
+			collision++;
 		}
-	});
+		// BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
+		// rename) destroyed ALL events — atomicWriteFile replaces the file
+		// in place, so the rename then moved an EMPTY file to the archive.
+		// FIX: copy current content to the archive first (archive is populated,
+		// original still intact), then truncate the original to empty in place.
+		// copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
+		// (no missing-file window for concurrent readers).
+		fs.copyFileSync(eventsPath, archivePath);
+		fs.writeFileSync(eventsPath, "", "utf-8");
+		return true;
+	} catch (error) {
+		logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
+		return false;
+	}
 }
 export interface EventLogStats {

package/src/state/event-log.ts CHANGED Viewed

@@ -9,7 +9,7 @@ import { logInternalError } from "../utils/internal-error.ts";
 import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
 import { redactSecrets } from "../utils/redaction.ts";
 import { sleepSync } from "../utils/sleep.ts";
-import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
+import { needsRotation, compactEventLog, rotateEventLog, applyCompactionUnlocked, prepareCompaction, rotateEventLogUnlocked } from "./event-log-rotation.ts";
 export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
 export type TeamWatcherAction = "act" | "observe" | "ignore";
@@ -76,7 +76,7 @@ let overflowCounter = 0;
  *  `flushOneEventLogBuffer`, and `state/mailbox.ts`. Prefer the async alternative
  *  (`appendEventAsync`) for all new code.
  */
-export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
+export function withEventLogLockSync<T>(eventsPath: string, fn: () => T, options?: { timeoutMs?: number; staleMs?: number }): T {
 	// Ensure parent directory exists before attempting lock
 	fs.mkdirSync(path.dirname(eventsPath), { recursive: true });
 	const lockDir = `${eventsPath}.lock`;
@@ -86,8 +86,8 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
 	// event loop indefinitely. 500 retries × 10ms = 5s max. After timeout, we
 	// throw a clear error instead of blocking forever. This ensures AbortSignal
 	// handlers, SIGTERM, and graceful shutdown can fire within seconds.
-	const timeout = 5000;
-	const staleMs = 10000;
+	const timeout = options?.timeoutMs ?? 5000;
+	const staleMs = options?.staleMs ?? 10000;
 	let acquired = false;
 	while (true) {
 		try {
@@ -110,24 +110,35 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
 				// to check for orphaned .lock dirs / stale processes.
 				throw errors.eventLogLockTimeout(eventsPath, timeout);
 			}
-			// Stale detection: if the owning process is dead, remove the stale lock.
+			// Round 26 (BUG 3): mtime-based stale check INDEPENDENT of pidFile.
+			// If the holder crashed between mkdir and writing pidFile, there is no
+			// pidFile to read — the old code just slept until the 5s timeout, then
+			// threw, leaving the dir orphaned FOREVER (every retry repeats the
+			// timeout). Now: if the lock dir's mtime exceeds staleMs, reclaim it.
+			try {
+				const dirStat = fs.statSync(lockDir);
+				if (Date.now() - dirStat.mtimeMs > staleMs) {
+					fs.rmSync(lockDir, { recursive: true, force: true });
+					continue;
+				}
+			} catch { /* dir vanished — let loop retry */ }
+			// Round 26 (BUG 4): the mtime check was previously NESTED inside
+			// `if (!alive)`, so a recycled PID (crashed holder's PID reused by an
+			// unrelated live process) kept `alive=true` and the mtime check NEVER
+			// fired → permanent wedge. mtime is now checked FIRST (above) for ALL
+			// holders, and it is the ONLY reclaim path. The PID probe below no
+			// longer affects control flow (its result is discarded via `void alive`):
+			// a dead-but-fresh holder is waited on until its lock dir exceeds staleMs.
 			try {
 				const raw = fs.readFileSync(pidFile, "utf-8").trim();
 				const ownerPid = Number.parseInt(raw, 10);
 				if (!Number.isNaN(ownerPid) && ownerPid !== process.pid) {
 					let alive = false;
 					try { process.kill(ownerPid, 0); alive = true; } catch { /* dead */ }
-					if (!alive) {
-						try {
-							const stat = fs.statSync(lockDir);
-							if (Date.now() - stat.mtimeMs > staleMs) {
-								fs.rmSync(lockDir, { recursive: true, force: true });
-								continue;
-							}
-						} catch { /* race — let loop sleep */ }
-					}
+					// (mtime already handled above; nothing to do here for dead-but-fresh.)
+					void alive;
 				}
-			} catch { /* no pid file — fall through to sleep */ }
+			} catch { /* no pid file — mtime check above already handles it */ }
 			sleepSync(10);
 		}
 	}
@@ -135,7 +146,19 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
 		return fn();
 	} finally {
 		if (acquired) {
-			try { fs.rmSync(lockDir, { recursive: true, force: true }); } catch { /* best-effort */ }
+			// Round 26 (BUG 5): token/PID-guarded release. Previously the release
+			// was an UNCONDITIONAL rmSync. If our fn exceeded staleMs, another
+			// process could steal our lock (rm our dir, make its own); when our fn
+			// finished our finally block would then DELETE THE STEALER's dir → both
+			// in the critical section + lost lock. Verify the pidFile still records
+			// OUR pid before removing; if it doesn't, the lock was stolen and the
+			// current holder owns the dir.
+			try {
+				const currentPid = fs.readFileSync(pidFile, "utf-8").trim();
+				if (currentPid === String(process.pid)) {
+					fs.rmSync(lockDir, { recursive: true, force: true });
+				}
+			} catch { /* lock stolen or already gone — do not touch */ }
 		}
 	}
 }
@@ -152,6 +175,29 @@ function evictOldestSequenceCacheEntries(): void {
 	}
 }
+/** @internal — exported for sequence-cache LRU testing (Round 19). */
+export function __test__sequenceCacheSize(): number {
+	return sequenceCache.size;
+}
+/** @internal — seed an entry into the sequence cache for testing. */
+export function __test__seedSequenceCache(eventsPath: string, lastAccessMs: number): void {
+	sequenceCache.set(eventsPath, { size: 1, mtimeMs: 0, seq: 0, lastAccessMs });
+}
+/** @internal — expose eviction for testing. */
+export function __test__evictOldestSequenceCacheEntries(): void {
+	evictOldestSequenceCacheEntries();
+}
+/** @internal — clear the sequence cache. */
+export function __test__clearSequenceCache(): void {
+	sequenceCache.clear();
+}
+/** @internal — the max sequence cache entries bound. */
+export const MAX_SEQUENCE_CACHE_ENTRIES_VALUE = MAX_SEQUENCE_CACHE_ENTRIES;
 export function sequencePath(eventsPath: string): string {
 	return `${eventsPath}.seq`;
 }
@@ -497,9 +543,14 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
 	if (!isTerminal && fs.existsSync(eventsPath)) {
 		const stat = fs.statSync(eventsPath);
 		if (stat.size > MAX_EVENTS_BYTES) {
-			// Try immediate compact (not waiting for counter % 100)
+			// Try immediate compact (not waiting for counter % 100).
+			// Round 24 (BUG 1): we are INSIDE withEventLogLockSync. Use the unlocked
+			// apply/rotate cores — the locked variants would deadlock (mkdir lock
+			// is not re-entrant → 5s timeout → compaction/rotation never ran →
+			// unbounded log growth → events silently dropped past 50MB).
 			try {
-				compactEventLog(eventsPath);
+				const prepared = prepareCompaction(eventsPath);
+				if (prepared) applyCompactionUnlocked(eventsPath, prepared);
 			} catch (error) {
 				logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
 			}
@@ -507,7 +558,7 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
 			if (fs.existsSync(eventsPath)) {
 				const afterCompact = fs.statSync(eventsPath);
 				if (afterCompact.size > MAX_EVENTS_BYTES) {
-					rotateEventLog(eventsPath);
+					rotateEventLogUnlocked(eventsPath);
 				}
 			}
 		}
@@ -555,7 +606,15 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
 	}
 	appendCounter++;
 	if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
-		try { compactEventLog(eventsPath); } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
+		// Round 24 (BUG 1): we are INSIDE withEventLogLockSync here (called via
+		// appendEventInsideLock). The mkdir lock is NOT re-entrant, so calling the
+		// locked compactEventLog would deadlock → 5s timeout → compaction never
+		// ran → unbounded log growth → events silently dropped past 50MB. Use the
+		// unlocked apply path instead (lock already held).
+		try {
+			const prepared = prepareCompaction(eventsPath);
+			if (prepared) applyCompactionUnlocked(eventsPath, prepared);
+		} catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
 	}
 	try { emitFromTeamEvent(fullEvent); } catch (error) { logInternalError("event-log.emit", error); }
 	return fullEvent;

package/src/state/health-store.ts CHANGED Viewed

@@ -4,7 +4,12 @@ import type { RunHealth } from "../runtime/task-health.ts";
 import { computeRunHealth } from "../runtime/task-health.ts";
 import type { ManifestSummary } from "../runtime/task-health.ts";
-const HEALTH_DIR = ".crew/state/health";
+// Relative to the crew root (`<cwd>/.crew`). BUG A fix (pts/2 hang
+// investigation 2026-06-16): this was `.crew/state/health`, which double-joined
+// to `<crewRoot>/state/.crew/state/health` because the caller passed the state
+// dir (not the crew root). Now the caller passes the real crew root, so this is
+// a plain `state/health` suffix.
+const HEALTH_DIR = "state/health";
 export interface HealthSnapshot {
   runId: string;

package/src/state/locks.ts CHANGED Viewed

@@ -66,6 +66,57 @@ function isLockHolderAlive(filePath: string): boolean {
 	}
 }
+/**
+ * Round 26 (BUG 1): read the lock file ONCE and evaluate staleness + holder
+ * liveness from that single snapshot.
+ *
+ * Previously `acquireLockWithRetry` called `isLockStale()` and
+ * `isLockHolderAlive()` separately, each performing its own `readFileSync`.
+ * Between those two reads the lock could transition stale→fresh (old holder
+ * released, new holder acquired): isLockStale saw the OLD createdAt → stale,
+ * isLockHolderAlive saw the NEW pid → alive, yielding `!stale && alive` =
+ * false → we forcibly rm the NEW holder's freshly-acquired lock and take it
+ * ourselves → BOTH in the critical section. Reading once closes the window.
+ *
+ * Returns `{ canSteal: true }` if the lock is stale OR the holder is dead
+ * (safe to forcibly remove); `{ canSteal: false }` if it is fresh AND held by
+ * a live process (must keep waiting).
+ */
+function readLockSnapshot(filePath: string, staleMs: number): { canSteal: boolean } {
+	let stat: fs.Stats | undefined;
+	let raw: string | undefined;
+	try {
+		stat = fs.statSync(filePath);
+		raw = fs.readFileSync(filePath, "utf-8");
+	} catch {
+		// File vanished between writeLockFile's EEXIST and now (holder released).
+		// Loop will retry the create; safe to signal "nothing to steal".
+		return { canSteal: false };
+	}
+	// Staleness from a single snapshot.
+	let createdAt = parseCreatedAtFromLock(raw);
+	if (createdAt === undefined) createdAt = stat.mtimeMs;
+	const isStale = Date.now() - createdAt > staleMs;
+	// Holder liveness from the SAME snapshot.
+	let isAlive = true; // Unknown holder — assume alive to be safe (matches isLockHolderAlive).
+	try {
+		const parsed = JSON.parse(raw) as { pid?: unknown };
+		const pid = typeof parsed.pid === "number" ? parsed.pid : undefined;
+		if (pid !== undefined) {
+			try {
+				process.kill(pid, 0);
+				isAlive = true;
+			} catch (error) {
+				const code = (error as NodeJS.ErrnoException).code;
+				// EPERM/ESRCH → treat as not-alive (stealable), see isLockHolderAlive. NOTE: `code` is unused; EPERM means the pid exists.
+				isAlive = false;
+			}
+		}
+	} catch { /* malformed payload — keep isAlive=true */ }
+	// Steal if stale OR holder dead — matches the original intent.
+	return { canSteal: isStale || !isAlive };
+}
 /**
  * Lock file kinds. Discriminator written to the lock file payload so that:
  *   - Debugging tools (e.g. a future `pi-crew locks` command) can identify
@@ -180,9 +231,10 @@ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind
 			if (Date.now() > deadline) {
 				throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
 			}
-			const isStale = isLockStale(filePath, staleMs);
-			const isHolderAlive = isLockHolderAlive(filePath);
-			if (!isStale && isHolderAlive) {
+			// Round 26 (BUG 1): single-snapshot read closes the TOCTOU window between
+			// separate stale + alive reads (which could race stale→fresh).
+			const { canSteal } = readLockSnapshot(filePath, staleMs);
+			if (!canSteal) {
 				throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
 			}
 			// Stale or dead holder — forcibly remove the lock.
@@ -213,9 +265,9 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind
 			if (Date.now() > deadline) {
 				throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
 			}
-			const isStale = isLockStale(filePath, staleMs);
-			const isHolderAlive = isLockHolderAlive(filePath);
-			if (!isStale && isHolderAlive) {
+			// Round 26 (BUG 1): single-snapshot read (see sync variant).
+			const { canSteal } = readLockSnapshot(filePath, staleMs);
+			if (!canSteal) {
 				throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
 			}
 			// Stale or dead holder — forcibly remove the lock.
@@ -244,16 +296,14 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
 	// Between mkdir and lock acquisition, an attacker could plant a symlink.
 	if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
 	fs.mkdirSync(path.dirname(lockFile), { recursive: true });
-	// FIX: Validate that the target file still exists. If it was deleted and
-	// recreated since the last lock cycle, the old .lock file may be orphaned
-	// and should not block the new cycle. Clean it up if the target is missing.
-	try {
-		fs.statSync(filePath);
-	} catch {
-		// Target file doesn't exist — clean up any stale .lock file and proceed.
-		// The lock will be acquired fresh for the new file (if fn creates it).
-		try { fs.rmSync(lockFile, { force: true }); } catch { /* ignore */ }
-	}
+	// Round 26 (BUG 2): REMOVED the pre-acquisition target-file-existence check.
+	// It was racy — between statSync(target) and acquire, a concurrent process
+	// could acquire the lock to CREATE the target, and we'd delete its active
+	// lock. It was also actively wrong for callers that pass a path already
+	// ending in `.lock` (config.ts: the checked "target" never exists, so the
+	// cleanup ALWAYS fired, deleting a fresh concurrent holder's lock). Genuine
+	// orphan locks (crashed holder) are reclaimed by acquireLockWithRetry's
+	// staleMs-based steal logic after at most `staleMs`.
 	// FIX (TOCTOU): Re-validate symlink safety before each lock acquisition
 	// attempt. Between our initial check and the acquisition (and between
 	// acquireLockWithRetry's internal retries), an attacker could plant a

package/src/state/state-store.ts CHANGED Viewed

@@ -57,7 +57,7 @@ export interface RunPaths {
 	eventsPath: string;
 }
-interface ManifestCacheEntry {
+export interface ManifestCacheEntry {
 	manifest: TeamRunManifest;
 	tasks: TeamTaskState[];
 	manifestMtimeMs: number;
@@ -76,6 +76,19 @@ const MANIFEST_CACHE_TTL_MS = 15 * 1000; // 15 seconds (FIX: increased from 5s f
 const LOAD_MANIFEST_RETRY_LIMIT = 5; // Configurable retry limit for mtime/size stability checks under contention
 const manifestCache = new Map<string, ManifestCacheEntry>();
+/** @internal — exported for TTL-eviction unit testing (Round 19). */
+export function __test__setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
+	setManifestCache(stateRoot, entry);
+}
+/** @internal — exported for TTL-eviction unit testing (Round 19). */
+export function __test__getManifestCacheEntry(stateRoot: string): ManifestCacheEntry | undefined {
+	return manifestCache.get(stateRoot);
+}
+/** @internal — the TTL in ms used for manifest cache eviction. */
+export const MANIFEST_CACHE_TTL_MS_VALUE = MANIFEST_CACHE_TTL_MS;
 function setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
 	if (manifestCache.has(stateRoot)) manifestCache.delete(stateRoot);
 	entry.cachedAt = Date.now();