npm - pi-crew - Versions diffs - 0.9.9 → 0.9.11 - Mend

pi-crew 0.9.9 → 0.9.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/CHANGELOG.md +330 -0
package/docs/fixes/v0.9.10/locks-fix-verify.md +3 -0
package/docs/fixes/v0.9.10/smoke-test.md +12 -0
package/package.json +1 -1
package/src/config/role-tools.ts +39 -6
package/src/extension/team-tool/doctor.ts +41 -18
package/src/runtime/async-runner.ts +70 -74
package/src/runtime/background-runner.ts +13 -2
package/src/runtime/child-pi.ts +122 -22
package/src/runtime/compact-pipeline.ts +56 -0
package/src/runtime/compact-stages/ansi-strip-stage.ts +25 -0
package/src/runtime/compact-stages/blank-collapse-stage.ts +31 -0
package/src/runtime/compact-stages/deduplicate-stage.ts +34 -0
package/src/runtime/compact-stages/head-snap-stage.ts +57 -0
package/src/runtime/compact-stages/index.ts +13 -0
package/src/runtime/compact-stages/tail-capture-stage.ts +72 -0
package/src/runtime/compact-stages/truncation-stage.ts +71 -0
package/src/runtime/handoff-manager.ts +10 -0
package/src/runtime/important-line-classifier.ts +130 -0
package/src/runtime/iteration-hooks.ts +7 -19
package/src/runtime/live-session-runtime.ts +50 -1
package/src/runtime/model-fallback.ts +29 -1
package/src/runtime/role-permission.ts +5 -21
package/src/runtime/stream-preview.ts +9 -2
package/src/runtime/task-output-context.ts +161 -27
package/src/runtime/task-runner/prompt-builder.ts +1 -0
package/src/runtime/task-runner.ts +76 -15
package/src/state/artifact-store.ts +22 -2
package/src/state/locks.ts +16 -0
package/src/state/state-store.ts +8 -2
package/src/ui/live-run-sidebar.ts +6 -1
package/src/ui/loaders.ts +24 -4
package/src/ui/run-dashboard.ts +6 -1
package/src/ui/run-event-bus.ts +1 -1
package/src/ui/run-snapshot-cache.ts +50 -16
package/src/ui/widget/index.ts +27 -5
package/src/ui/widget/widget-renderer.ts +43 -13
package/src/utils/redaction.ts +66 -32
package/src/utils/visual.ts +6 -0
package/src/ui/crew-widget.ts +0 -544

package/src/runtime/task-output-context.ts CHANGED Viewed

@@ -5,6 +5,8 @@ import { writeArtifact } from "../state/artifact-store.ts";
 import { resolveRealContainedPath } from "../utils/safe-paths.ts";
 import type { WorkflowStep } from "../workflows/workflow-config.ts";
 import { pruneToolOutputs, type ToolResultEntry, type FileEditEvent, DEFAULT_PRUNE_CONFIG } from "./tool-output-pruner.ts";
+import { applyCompactPipeline } from "./compact-pipeline.ts";
+import { ANSI_STRIP_STAGE, BLANK_COLLAPSE_STAGE, TruncationStage } from "./compact-stages/index.ts";
 export interface DependencyContextEntry {
 	taskId: string;
@@ -19,7 +21,14 @@ export interface DependencyContextEntry {
 export interface DependencyOutputContext {
 	dependencies: DependencyContextEntry[];
-	sharedReads: Array<{ name: string; path: string; content: string }>;
+	/**
+	 * Each shared artifact read, truncated for inline injection. When truncation
+	 * is materially lossy (content length > 2× MAX_RESULT_INLINE_BYTES chars) and
+	 * the best-effort tee write succeeds, the FULL content is also written to
+	 * `${artifactsRoot}/tee/${taskId}-${name}.full.txt` and that path is exposed
+	 * via `fullOutputPath` so the worker can `read` back the dropped middle.
+	 */
+	sharedReads: Array<{ name: string; path: string; content: string; fullOutputPath?: string }>;
 }
 function containedExists(filePath: string, baseDir?: string): boolean {
@@ -39,35 +48,127 @@ function containedExists(filePath: string, baseDir?: string): boolean {
  * (24K/40K/80K) which truncated the same artifact differently depending on
  * which code path read it.
  */
-const MAX_RESULT_INLINE_BYTES = 32_000;
+export const MAX_RESULT_INLINE_BYTES = 32_000;
-function readIfSmall(filePath: string, baseDir?: string): string | undefined {
-	const maxBytes = MAX_RESULT_INLINE_BYTES;
+/**
+ * Opt-in tee-recovery settings for {@link readIfSmallWithTee}. When supplied,
+ * and the file content is longer than 2× {@link MAX_RESULT_INLINE_BYTES}
+ * characters, the full untruncated content is also written (best-effort) to
+ * `fullOutputPath` so a reader can recover what inline truncation dropped.
+ */
+export interface TeeRecoveryOptions {
+	/** Absolute path to write the full (non-truncated) content to. */
+	fullOutputPath: string;
+}
+export interface ReadIfSmallTeeResult {
+	/** Truncated content (or full content when no truncation). */
+	content: string;
+	/** Set only when tee was actually written (content length > 2× threshold and the write succeeded). */
+	fullOutputPath?: string;
+}
+/**
+ * Sanitize a taskId / artifactName into a flat tee filename. Any character
+ * outside [A-Za-z0-9._-] is replaced with underscore so the resulting path
+ * is always single-segment and cannot escape the tee directory.
+ */
+function safeTeeName(taskId: string, artifactName: string): string {
+	const safe = (s: string): string => s.replace(/[^A-Za-z0-9._-]/g, "_");
+	return `${safe(taskId)}-${safe(artifactName)}.full.txt`;
+}
+/**
+ * Canonical tee path for a shared artifact read.
+ *
+ * Format: `${artifactsRoot}/tee/${taskId}-${artifactName}.full.txt`
+ *
+ * The downstream worker prompt includes this path so the worker can `read`
+ * the full content when it needs the dropped middle.
+ */
+export function teePathForArtifact(artifactsRoot: string, taskId: string, artifactName: string): string {
+	return path.join(artifactsRoot, "tee", safeTeeName(taskId, artifactName));
+}
+/**
+ * Best-effort tee write. Returns true on success, false on any error (write
+ * failures are silent — tee is enhancement, never a hard dependency). The
+ * truncated inline content is still returned by the caller either way.
+ */
+function writeTeeFile(fullOutputPath: string, content: string): boolean {
 	try {
-		const safePath = baseDir ? resolveRealContainedPath(baseDir, filePath) : filePath;
-		const stat = fs.statSync(safePath);
-		if (stat.size > maxBytes) {
-			// L4: head + tail instead of head-only. Keeps closing markdown
-			// structure (code fences, headings) instead of leaving them truncated.
-			const head = Math.floor(maxBytes * 0.75);
-			const tail = maxBytes - head;
-			const headBuf = Buffer.alloc(head);
-			const tailBuf = Buffer.alloc(tail);
-			const fd = fs.openSync(safePath, "r");
-			try {
-				fs.readSync(fd, headBuf, 0, head, 0);
-				fs.readSync(fd, tailBuf, 0, tail, stat.size - tail);
-			} finally {
-				fs.closeSync(fd);
+		fs.mkdirSync(path.dirname(fullOutputPath), { recursive: true });
+		fs.writeFileSync(fullOutputPath, content, "utf-8");
+		return true;
+	} catch {
+		return false;
+	}
+}
+/**
+ * Read a file with optional tee-recovery (P1-A). Returns the truncated
+ * content AND (when tee was actually written) the absolute path to the full
+ * file. Returns undefined if the file cannot be read at all.
+ *
+ * Tee threshold: only when content.length > 2 * MAX_RESULT_INLINE_BYTES
+ * (the head+tail is materially lossy — small over-threshold files are not
+ * teed because the inline content is mostly intact and the worker can live
+ * with the 75/25 split). File content is read once and reused for both the
+ * pipeline (truncation) and the tee write (full file).
+ *
+ * Truncation behavior is unchanged from the P0-A pipeline: ANSI strip +
+ * blank collapse BEFORE truncation, important-line preservation (P0-B)
+ * inside TruncationStage, marker wording matches the pre-P1-A `readIfSmall`
+ * output exactly (L4 backward-compat).
+ */
+export function readIfSmallWithTee(
+	filePath: string,
+	opts: { baseDir?: string; tee?: TeeRecoveryOptions } = {},
+): ReadIfSmallTeeResult | undefined {
+	const maxChars = MAX_RESULT_INLINE_BYTES;
+	try {
+		const safePath = opts.baseDir ? resolveRealContainedPath(opts.baseDir, filePath) : filePath;
+		const content = fs.readFileSync(safePath, "utf-8");
+		if (content.length > maxChars) {
+			let fullOutputPath: string | undefined;
+			// Tee only when truncation is materially lossy (>2× threshold).
+			if (opts.tee && content.length > maxChars * 2) {
+				if (writeTeeFile(opts.tee.fullOutputPath, content)) {
+					fullOutputPath = opts.tee.fullOutputPath;
+				}
 			}
-			return `${headBuf.toString("utf-8")}\n\n...[pi-crew truncated ${stat.size - maxBytes} bytes, head+tail preserved]...\n${tailBuf.toString("utf-8")}`;
+			const result = applyCompactPipeline(content, [
+				ANSI_STRIP_STAGE,
+				BLANK_COLLAPSE_STAGE,
+				new TruncationStage(maxChars, {
+					preserveImportant: true,
+					marker: { verb: "truncated", unit: "chars", headSeparator: "\n\n", tailSeparator: "\n" },
+				}),
+			]);
+			return fullOutputPath ? { content: result.text, fullOutputPath } : { content: result.text };
 		}
-		return fs.readFileSync(safePath, "utf-8");
+		return { content };
 	} catch {
 		return undefined;
 	}
 }
+/**
+ * Read a file and return its content, truncating to a head+tail slice if it
+ * exceeds {@link MAX_RESULT_INLINE_BYTES} characters. Multi-byte UTF-8
+ * sequences are preserved by reading the full file as a UTF-8 string and
+ * slicing by character count (not raw bytes).
+ *
+ * Thin wrapper around {@link readIfSmallWithTee} for backward compatibility
+ * — callers that do not need tee-recovery metadata get just the content
+ * string. New tee-recovery call sites should use {@link readIfSmallWithTee}
+ * directly so they can include the full output path in the worker prompt.
+ */
+export function readIfSmall(filePath: string, baseDir?: string): string | undefined {
+	const result = readIfSmallWithTee(filePath, { baseDir });
+	return result?.content;
+}
 function safeSharedName(name: string): string {
 	const normalized = name.replaceAll("\\", "/").replace(/^\.\/+/, "");
 	if (!normalized || normalized.split("/").some((segment) => segment === "..") || path.isAbsolute(normalized)) throw new Error(`Invalid shared artifact name: ${name}`);
@@ -127,6 +228,7 @@ function aggregateUsage(task: TeamTaskState): DependencyContextEntry["usage"] {
 function pruneSharedReads(
 	reads: Array<{ name: string; path: string; content: string }>,
 	dependencies: DependencyContextEntry[],
+	artifactsRoot: string,
 ): Array<{ name: string; path: string; content: string }> {
 	if (reads.length === 0) return reads;
 	// Convert shared reads to tool result entries (ordered oldest → newest
@@ -140,15 +242,19 @@ function pruneSharedReads(
 	// Collect file edit events from dependency artifacts produced to shared/.
 	// A dependency that wrote a shared file after an earlier read invalidates
 	// that read (the content is now stale relative to the latest version).
-	const sharedRoot = path.resolve("shared");
+	// Artifact entries from listTaskArtifacts() are already relative to
+	// artifactsRoot (e.g. "shared/foo.md"), so resolve directly against
+	// artifactsRoot — NOT against a "shared" subdirectory (which would
+	// double-prefix to <artifactsRoot>/shared/shared/foo.md).
 	const fileEdits: FileEditEvent[] = [];
 	for (let depIndex = 0; depIndex < dependencies.length; depIndex++) {
 		const dep = dependencies[depIndex]!;
 		const produced = dep.artifactsProduced ?? [];
 		for (const artifact of produced) {
 			if (typeof artifact !== "string") continue;
-			// Map artifact path to shared-relative and check against read targets.
-			fileEdits.push({ target: path.resolve(sharedRoot, artifact), index: reads.length + depIndex });
+			// Map artifact path (relative to artifactsRoot) to absolute and
+			// check against read targets.
+			fileEdits.push({ target: path.resolve(artifactsRoot, artifact), index: reads.length + depIndex });
 		}
 	}
 	const pruned = pruneToolOutputs(entries, DEFAULT_PRUNE_CONFIG);
@@ -175,13 +281,33 @@ export function collectDependencyOutputContext(manifest: TeamRunManifest, tasks:
 	});
 	const rawSharedReads = (step.reads === false ? [] : step.reads ?? []).map((name) => {
 		const filePath = sharedPath(manifest, name);
-		return { name, path: filePath, content: readIfSmall(filePath, path.resolve(manifest.artifactsRoot, "shared")) ?? "" };
+		// P1-A tee-recovery: when the shared artifact is large enough that the
+		// 75/25 head+tail split is materially lossy (>2× MAX_RESULT_INLINE_BYTES),
+		// tee the full content to ${artifactsRoot}/tee/${taskId}-${name}.full.txt
+		// and expose the path so the downstream worker can `read` the full file
+		// if it needs the dropped middle. The truncated content is still
+		// included inline; tee is an enhancement, not a hard dependency. Tee
+		// write is best-effort (writeTeeFile swallows I/O errors and the result
+		// simply omits fullOutputPath in that case).
+		const teePath = teePathForArtifact(manifest.artifactsRoot, task.id, name);
+		const teeResult = readIfSmallWithTee(filePath, {
+			baseDir: path.resolve(manifest.artifactsRoot, "shared"),
+			tee: { fullOutputPath: teePath },
+		});
+		if (teeResult === undefined) return { name, path: filePath, content: "" };
+		const entry: { name: string; path: string; content: string; fullOutputPath?: string } = {
+			name,
+			path: filePath,
+			content: teeResult.content,
+		};
+		if (teeResult.fullOutputPath) entry.fullOutputPath = teeResult.fullOutputPath;
+		return entry;
 	}).filter((item) => item.content.trim().length > 0);
 	// Apply staleness-aware pruning to shared reads: drops superseded reads
 	// (same file re-read with different selectors) and replaces stale large
 	// outputs with compact digest notices before injecting into the worker
 	// prompt. OPT-IN: default config protects recent results.
-	const sharedReads = pruneSharedReads(rawSharedReads, dependencies);
+	const sharedReads = pruneSharedReads(rawSharedReads, dependencies, manifest.artifactsRoot);
 	return { dependencies, sharedReads };
 }
@@ -198,7 +324,15 @@ export function renderDependencyOutputContext(context: DependencyOutputContext):
 	}
 	if (context.sharedReads.length) {
 		parts.push("# Shared Run Context Reads", "");
-		for (const read of context.sharedReads) parts.push(`## shared/${read.name}`, `Path: ${read.path}`, "", read.content.trim(), "");
+		for (const read of context.sharedReads) {
+			parts.push(`## shared/${read.name}`, `Path: ${read.path}`);
+			// P1-A tee-recovery hint: when the file was materially truncated
+			// (>2× threshold) the full content was teed to fullOutputPath so the
+			// worker can read the dropped middle if needed. The path is inside
+			// artifactsRoot/tee/ and goes through the normal permission gate.
+			if (read.fullOutputPath) parts.push(`Full output (if you need the missing middle): ${read.fullOutputPath}`);
+			parts.push("", read.content.trim(), "");
+		}
 	}
 	return parts.join("\n").trim();
 }

package/src/runtime/task-runner/prompt-builder.ts CHANGED Viewed

@@ -30,6 +30,7 @@ function readOnlyRoleInstructions(role: string): string {
 		"- Do not use shell redirects, heredocs, in-place edits, package installs, git commit/merge/rebase/reset/checkout, or other state-mutating commands.",
 		"- If implementation changes are needed, report exact recommendations instead of applying them.",
 		"- Prefer read/grep/find/listing tools and read-only git inspection commands.",
+		"- Your final RESULT TEXT is persisted automatically by the runner (as a result artifact and, if the step declares `output:`, to a shared file). To deliver a plan, report, or findings, EMIT THEM AS TEXT in your final result — do NOT try to write a file yourself.",
 	].join("\n");
 }

package/src/runtime/task-runner.ts CHANGED Viewed

@@ -763,7 +763,30 @@ export async function runTeamTask(
 					"",
 				);
 				if (!error) break;
-				const nextModel = attemptModels[i + 1];
+				let nextModel = attemptModels[i + 1];
+				// FIX 1 (task packet 01_01-agent): when the precomputed attempt
+				// chain is exhausted but the failure is retryable, do a one-shot
+				// re-resolve via buildConfiguredModelRouting with the failed
+				// model as parent. This finds alternative providers/models the
+				// original chain missed (e.g. a registry gained new fallbacks
+				// after the precompute, or the precompute ran before the parent
+				// model was known). If a different candidate is found, use it as
+				// nextModel; otherwise fall through to the existing break.
+				if (!nextModel && isRetryableModelFailure(error)) {
+					const reResolved = buildConfiguredModelRouting({
+						overrideModel: undefined,
+						stepModel: undefined,
+						teamRoleModel: undefined,
+						agentModel: undefined,
+						fallbackModels: undefined,
+						parentModel: attempt.model,
+						modelRegistry: input.modelRegistry,
+						cwd: task.cwd,
+						scopeModelsPatterns: await resolveTaskScopeModelsPatterns(task.cwd),
+					});
+					const alt = reResolved.candidates.find((c) => c !== attempt.model);
+					if (alt) nextModel = alt;
+				}
 				if (!nextModel || !isRetryableModelFailure(error)) break;
 				logs.push(formatModelAttemptNote(attempt, nextModel), "");
 			}
@@ -1368,19 +1391,57 @@ async function resolveTaskScopeModelsPatterns(cwd: string): Promise<string[]> {
  * or when there are no retryable error messages.
  */
 export function detectRetryableModelFailureFromOutput(parsed: ParsedPiJsonOutput): string | undefined {
+	// Primary signal: pre-extracted `errorMessages` (from pi-json-output parser).
+	// The parser already filters to non-empty trimmed strings from message_end
+	// events.
 	const messages = parsed.errorMessages;
-	if (!messages || messages.length === 0) return undefined;
-	// Find the first retryable model-failure message (429 / rate-limit / overloaded / 5xx / ...).
-	const retryable = messages.find((m) => isRetryableModelFailure(m));
-	if (!retryable) return undefined;
-	// Did the run actually produce real output despite the transient errors?
-	// If finalText / textEvents / patches exist, the model recovered and we
-	// should NOT mark the run as failed — only flag it when the worker yielded
-	// nothing (the 429-only case from the bug report).
-	const hasRealOutput =
-		(parsed.finalText?.trim().length ?? 0) > 0 ||
-		parsed.textEvents.some((t) => t.trim().length > 0) ||
-		(parsed.patches?.length ?? 0) > 0;
-	if (hasRealOutput) return undefined;
-	return `Model returned only retryable errors and no output: ${retryable}`;
+	if (messages && messages.length > 0) {
+		// Find the first retryable model-failure message
+		// (429 / rate-limit / overloaded / 5xx / ...).
+		const retryable = messages.find((m) => isRetryableModelFailure(m));
+		if (retryable) {
+			// Did the run actually produce real output despite the transient errors?
+			// If finalText / textEvents / patches exist, the model recovered and we
+			// should NOT mark the run as failed — only flag it when the worker
+			// yielded nothing (the 429-only case from the bug report).
+			const hasRealOutput =
+				(parsed.finalText?.trim().length ?? 0) > 0 ||
+				parsed.textEvents.some((t) => t.trim().length > 0) ||
+				(parsed.patches?.length ?? 0) > 0;
+			if (hasRealOutput) return undefined;
+			return `Model returned only retryable errors and no output: ${retryable}`;
+		}
+	}
+	// Secondary signal (FIX 3, task packet 01_01-agent): inspect a raw
+	// `messageEndEvents` (or `transcript`) array on the parsed output. The
+	// ParsedPiJsonOutput type does not currently declare this field, so we
+	// read it through a local extension cast. Callers that pass it (tests, a
+	// future parser that captures the full event stream) get a second chance
+	// to surface retryable failures. Primary path still wins when it matches.
+	const raw = parsed as ParsedPiJsonOutput & {
+		messageEndEvents?: unknown;
+		transcript?: unknown;
+	};
+	const eventSource = Array.isArray(raw.messageEndEvents)
+		? raw.messageEndEvents
+		: Array.isArray(raw.transcript)
+			? raw.transcript
+			: undefined;
+	if (!eventSource || eventSource.length === 0) return undefined;
+	for (const candidate of eventSource) {
+		if (!candidate || typeof candidate !== "object") continue;
+		const event = candidate as { stopReason?: unknown; errorMessage?: unknown };
+		if (event.stopReason !== "error") continue;
+		if (typeof event.errorMessage !== "string" || event.errorMessage.length === 0) continue;
+		if (!isRetryableModelFailure(event.errorMessage)) continue;
+		// Same real-output gate as the primary signal — don't flag runs that
+		// recovered with real final text / patches.
+		const hasRealOutput =
+			(parsed.finalText?.trim().length ?? 0) > 0 ||
+			parsed.textEvents.some((t) => t.trim().length > 0) ||
+			(parsed.patches?.length ?? 0) > 0;
+		if (hasRealOutput) return undefined;
+		return `Model returned only retryable errors and no output: ${event.errorMessage}`;
+	}
+	return undefined;
 }

package/src/state/artifact-store.ts CHANGED Viewed

@@ -4,7 +4,7 @@ import { createHash } from "node:crypto";
 import type { ArtifactDescriptor } from "./types.ts";
 import { atomicWriteFile } from "./atomic-write.ts";
 import { resolveRealContainedPath } from "../utils/safe-paths.ts";
-import { redactSecretString } from "../utils/redaction.ts";
+import { redactSecretString, redactSecrets } from "../utils/redaction.ts";
 function hashContent(content: string): string {
 	return createHash("sha256").update(content).digest("hex");
@@ -127,7 +127,27 @@ export function writeArtifact(artifactsRoot: string, options: ArtifactWriteOptio
 	const filePath = resolveInside(artifactsRoot, options.relativePath);
 	fs.mkdirSync(path.dirname(filePath), { recursive: true });
 	resolveRealContainedPath(artifactsRoot, path.dirname(filePath));
-	const content = redactSecretString(options.content);
+	let content = options.content;
+	// Structural JSON redaction first — catches quoted-JSON secrets
+	// ("api_key":"sk-...") and nested keys that flat redactSecretString misses.
+	// The flat scan below still catches free-text patterns (Bearer/JWT/Auth
+	// headers) that may live inside JSON string values. See security review M2.
+	//
+	// Formatting preservation: re-stringify with the SAME indentation as the
+	// input so pretty-printed artifacts (e.g. group-join metadata expected by
+	// test/integration/phase4-runtime.test.ts to contain `"partial": false`)
+	// keep their whitespace. Detect pretty-vs-compact from the raw input.
+	const trimmed = content.trim();
+	if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
+		try {
+			const parsed: unknown = JSON.parse(content);
+			const isPretty = /\n|"\s*:\s/.test(content);
+			content = JSON.stringify(redactSecrets(parsed), null, isPretty ? 2 : undefined);
+		} catch {
+			// not valid JSON — fall through to flat redaction only
+		}
+	}
+	content = redactSecretString(content);
 	atomicWriteFile(filePath, content);
 	// Compute hash on written bytes for integrity verification.
 	// Read back the actual file content to handle atomicWrite fallback path

package/src/state/locks.ts CHANGED Viewed

@@ -292,6 +292,17 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
 	// append, or even the lock acquisition itself) would race with the lock.
 	const lockFile = `${filePath}.lock`;
 	const staleMs = options.staleMs ?? DEFAULT_STALE_MS;
+	// FIX (Round 29): re-entrance guard — mirrors withRunLockSync below.
+	// When the same call stack already holds the file lock (e.g.
+	// registerWorker -> cleanupOrphanWorkers -> readRegistry), the second
+	// acquisition would otherwise read its own freshly-written lock file
+	// (same pid, fresh createdAt), fail the steal check, and deadlock for
+	// the full staleMs window. Strace-confirmed in
+	// .github/issues/pre-existing-2026-06-10/04-orphan-worker-registry-tests.md:75-86.
+	const existingToken = fileLockHeldByUs.get(lockFile);
+	if (existingToken) {
+		return fn();
+	}
 	// FIX: Validate the parent directory is not a symlink BEFORE calling mkdirSync.
 	// Between mkdir and lock acquisition, an attacker could plant a symlink.
 	if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
@@ -322,10 +333,12 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
 		}
 	}
 	if (token === "") throw new Error(`Run '${path.basename(lockFile)}' is locked by another operation.`);
+	fileLockHeldByUs.set(lockFile, token);
 	try {
 		return fn();
 	} finally {
 		// Token-guarded release: don't rm the lock if it has been stolen.
+		fileLockHeldByUs.delete(lockFile);
 		releaseLock(lockFile, token);
 	}
 }
@@ -353,6 +366,9 @@ export function withRunLockSync<T>(manifest: TeamRunManifest, fn: () => T, optio
 // already held by this call stack (handleResume -> executeTeamRun ->
 // executeTeamRunCore), we skip re-acquisition to avoid deadlock.
 const runLockHeldByUs = new Map<string, string>(); // filePath -> token
+// Round 29: parallel map for withFileLockSync re-entrance. See the comment
+// at the top of withFileLockSync for the full deadlock mechanism.
+const fileLockHeldByUs = new Map<string, string>(); // lockFile -> token
 export async function withRunLock<T>(manifest: TeamRunManifest, fn: () => Promise<T>, options: RunLockOptions = {}): Promise<T> {
 	const filePath = lockPath(manifest);

package/src/state/state-store.ts CHANGED Viewed

@@ -634,7 +634,10 @@ export function loadRunManifestById(cwd: string, runId: string): { manifest: Tea
 	// between the final stat and the read. Callers needing strict consistency
 	// MUST use withRunLock() around load+modify+save.
 	if (attempts > 0) {
-		console.warn(`[state-store] loadRunManifestById: retry loop detected instability for run ${runId} after ${attempts} attempt(s) — best-effort only, use withRunLock() for strict consistency`);
+		// Round 19: downgrade to debug — retry-loop instability is expected under
+		// concurrent writes (live team runs constantly append to tasks.json).
+		// This is best-effort by design; strict consistency requires withRunLock().
+		console.debug(`[state-store] loadRunManifestById: retry loop detected instability for run ${runId} after ${attempts} attempt(s) — best-effort only, use withRunLock() for strict consistency`);
 	}
 	// NOTE: manifest mtime may legitimately be >= tasks mtime because
 	// saveManifestAndTasksAtomicSync writes manifest before tasks. However,
@@ -724,7 +727,10 @@ export async function loadRunManifestByIdAsync(cwd: string, runId: string): Prom
 	// between the final stat and the read. Callers needing strict consistency
 	// MUST use withRunLock() around load+modify+save.
 	if (attempts > 0) {
-		console.warn(`[state-store] loadRunManifestByIdAsync: retry loop detected instability for run ${runId} after ${attempts} attempt(s) — best-effort only, use withRunLock() for strict consistency`);
+		// Round 19: downgrade to debug — retry-loop instability is expected under
+		// concurrent writes (live team runs constantly append to tasks.json).
+		// This is best-effort by design; strict consistency requires withRunLock().
+		console.debug(`[state-store] loadRunManifestByIdAsync: retry loop detected instability for run ${runId} after ${attempts} attempt(s) — best-effort only, use withRunLock() for strict consistency`);
 	}
 	// NOTE: manifest mtime may legitimately be >= tasks mtime because
 	// saveManifestAndTasksAtomicSync writes manifest before tasks. However,

package/src/ui/live-run-sidebar.ts CHANGED Viewed

@@ -76,7 +76,12 @@ export class LiveRunSidebar {
 		this.config = input.config ?? {};
 		this.snapshotCache = input.snapshotCache;
 		this.unsubscribeTheme = subscribeThemeChange(input.theme, () => this.invalidate());
-		this.unsubscribeEventBus = runEventBus.onAny(() => this.invalidate());
+		this.unsubscribeEventBus = (() => {
+		const unsub1 = runEventBus.onChannel("run:state", () => this.invalidate());
+		const unsub2 = runEventBus.onChannel("worker:lifecycle", () => this.invalidate());
+		const unsub3 = runEventBus.onChannel("ui:invalidate", () => this.invalidate());
+		return () => { unsub1(); unsub2(); unsub3(); };
+	})();
 	}
 	private buildSignature(manifestStatus: string, tasks: TeamTaskState[], agents: ReturnType<typeof readCrewAgents>, waitingCount: number, snapshot?: RunUiSnapshot): string {

package/src/ui/loaders.ts CHANGED Viewed

@@ -113,23 +113,43 @@ export class CountdownTimer {
 	private readonly timeoutMs: number;
 	private timer: ReturnType<typeof setTimeout> | undefined;
 	private expired = false;
+	private lastEmittedSeconds = -1;
 	constructor(options: CountdownTimerOptions) {
 		this.timeoutMs = Math.max(0, options.timeoutMs);
 		this.onTick = options.onTick;
 		this.onExpire = options.onExpire;
 		this.startedAt = Date.now();
-		this.onTick(this.secondsLeft());
+		this.lastEmittedSeconds = this.secondsLeft();
+		this.onTick(this.lastEmittedSeconds);
 		if (this.timeoutMs === 0) {
 			this.emitExpire();
 			return;
 		}
-		this.timer = setInterval(() => {
+		this.scheduleNextTick();
+	}
+	/**
+	 * Schedule the next tick via recursive setTimeout. Each tick re-emits the
+	 * current `secondsLeft()` only if it differs from the last emitted value
+	 * (lastEmittedSeconds guard). This makes the countdown correct even under
+	 * event-loop pressure: if the previous tick fired 1.2s late, the next
+	 * tick still emits the right value for the current second rather than
+	 * skipping it (the pre-fix `setInterval` could SKIP a second value when
+	 * the loop was busy, producing [3,2,0] instead of [3,2,1,0] in tests).
+	 */
+	private scheduleNextTick(): void {
+		this.timer = setTimeout(() => {
 			const seconds = this.secondsLeft();
-			this.onTick(seconds);
+			if (seconds !== this.lastEmittedSeconds) {
+				this.lastEmittedSeconds = seconds;
+				this.onTick(seconds);
+			}
 			if (seconds <= 0) {
 				this.emitExpire();
+				return;
 			}
+			this.scheduleNextTick();
 		}, 1000);
 		// Defense-in-depth: never let the countdown timer keep the event loop
 		// alive. If dispose() is missed (e.g. UI unmount race), the timer must
@@ -151,7 +171,7 @@ export class CountdownTimer {
 	dispose(): void {
 		if (this.timer === undefined) return;
-		clearInterval(this.timer);
+		clearTimeout(this.timer);
 		this.timer = undefined;
 	}
 }

package/src/ui/run-dashboard.ts CHANGED Viewed

@@ -294,7 +294,12 @@ export class RunDashboard implements DashboardComponent {
 		this.theme = asCrewTheme(theme);
 		this.options = options;
 		this.unsubscribeTheme = subscribeThemeChange(theme, () => this.invalidateAndRender());
-		this.unsubscribeEventBus = runEventBus.onAny(() => this.invalidateAndRender());
+		this.unsubscribeEventBus = (() => {
+		const unsub1 = runEventBus.onChannel("run:state", () => this.invalidateAndRender());
+		const unsub2 = runEventBus.onChannel("worker:lifecycle", () => this.invalidateAndRender());
+		const unsub3 = runEventBus.onChannel("ui:invalidate", () => this.invalidateAndRender());
+		return () => { unsub1(); unsub2(); unsub3(); };
+	})();
 	}
 	/**

package/src/ui/run-event-bus.ts CHANGED Viewed

@@ -40,7 +40,7 @@ const RUN_STATE_TYPES = new Set([
 	"manifest.saved", "task.claimed", "task.unclaimed", "mailbox_updated",
 ]);
 const UI_INVALIDATE_TYPES = new Set([
-	"effectiveness_changed", "snapshot_stale",
+	"effectiveness_changed", "snapshot_stale", "run.cache_invalidated",
 ]);
 /** Classify an event type string into a typed channel. */

package/src/ui/run-snapshot-cache.ts CHANGED Viewed

@@ -787,11 +787,55 @@ export function createRunSnapshotCache(cwd: string, options: RunSnapshotCacheOpt
 		}
 	}
-	const unsubscribe = runEventBus.onAny((event) => {
-		if (entries.has(event.runId)) {
-			entries.delete(event.runId);
-		}
+	// Coalesced eager refresh on event-bus signals. Previously every
+	// `run:state` / `worker:lifecycle` event deleted the cache entry, leaving
+	// a window where `widget-model.ts: snapshotCache.get(runId)` returned
+	// `undefined`. The widget then fell back to `agentsFor(run)` (a disk read
+	// with no snapshot.tasks) and rendered the "0/1 done" branch of
+	// `widget-renderer.ts:39-41` instead of the "Phase 1/1 default: 0% (0/3)"
+	// branch — producing the live flicker between those two progressPart
+	// values every render tick. Replacing the delete with a coalesced
+	// refreshIfStale keeps the cache populated so the widget always sees the
+	// same logical snapshot between stamp changes; multiple events for the
+	// same runId within INVAL_COALESCE_MS are batched into one refresh.
+	function localRefresh(runId: string): RunUiSnapshot {
+		const previous = entries.get(runId);
+		const entry = build(runId, previous);
+		entries.set(runId, entry);
+		evictIfNeeded();
+		return entry.snapshot;
+	}
+	function localRefreshIfStale(runId: string): RunUiSnapshot {
+		const previous = entries.get(runId);
+		if (!previous) return localRefresh(runId);
+		const now = Date.now();
+		if (now - previous.loadedAtMs < ttlMs) return touch(runId, previous);
+		const stamps = currentStamps(previous);
+		if (sameStamps(stamps, previous.stamps)) return touch(runId, previous);
+		return localRefresh(runId);
+	}
+	const pendingRefreshes = new Map<string, ReturnType<typeof setTimeout>>();
+	const INVAL_COALESCE_MS = 80;
+	const scheduleRefresh = (runId: string): void => {
+		const existing = pendingRefreshes.get(runId);
+		if (existing) clearTimeout(existing);
+		pendingRefreshes.set(runId, setTimeout(() => {
+			pendingRefreshes.delete(runId);
+			try { localRefreshIfStale(runId); } catch { /* best-effort; widget falls back gracefully */ }
+		}, INVAL_COALESCE_MS));
+	};
+	const unsubState = runEventBus.onChannel("run:state", (event) => {
+		if (entries.has(event.runId)) scheduleRefresh(event.runId);
 	});
+	const unsubLifecycle = runEventBus.onChannel("worker:lifecycle", (event) => {
+		if (entries.has(event.runId)) scheduleRefresh(event.runId);
+	});
+	const unsubscribe = () => {
+		unsubState();
+		unsubLifecycle();
+		for (const timer of pendingRefreshes.values()) clearTimeout(timer);
+		pendingRefreshes.clear();
+	};
 	return {
 		get(runId: string): RunUiSnapshot | undefined {
@@ -799,20 +843,10 @@ export function createRunSnapshotCache(cwd: string, options: RunSnapshotCacheOpt
 			return entry ? touch(runId, entry) : undefined;
 		},
 		refresh(runId: string): RunUiSnapshot {
-			const previous = entries.get(runId);
-			const entry = build(runId, previous);
-			entries.set(runId, entry);
-			evictIfNeeded();
-			return entry.snapshot;
+			return localRefresh(runId);
 		},
 		refreshIfStale(runId: string): RunUiSnapshot {
-			const previous = entries.get(runId);
-			if (!previous) return this.refresh(runId);
-			const now = Date.now();
-			if (now - previous.loadedAtMs < ttlMs) return touch(runId, previous);
-			const stamps = currentStamps(previous);
-			if (sameStamps(stamps, previous.stamps)) return touch(runId, previous);
-			return this.refresh(runId);
+			return localRefreshIfStale(runId);
 		},
 		preloadStale,
 		preloadAllStale,