npm - pi-crew - Versions diffs - 0.7.4 → 0.7.6 - Mend

pi-crew 0.7.4 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/CHANGELOG.md +79 -0
package/README.md +11 -11
package/docs/commands-reference.md +14 -10
package/docs/troubleshooting.md +131 -0
package/docs/usage.md +9 -4
package/package.json +1 -1
package/src/config/config.ts +11 -4
package/src/config/types.ts +2 -0
package/src/errors.ts +66 -0
package/src/extension/action-suggestions.ts +71 -0
package/src/extension/context-status-injection.ts +174 -0
package/src/extension/knowledge-injection.ts +29 -1
package/src/extension/register.ts +81 -65
package/src/extension/team-tool/api.ts +3 -2
package/src/extension/team-tool/cancel.ts +5 -4
package/src/extension/team-tool/explain.ts +2 -1
package/src/extension/team-tool/failure-patterns.ts +124 -0
package/src/extension/team-tool/inspect.ts +10 -6
package/src/extension/team-tool/lifecycle-actions.ts +5 -4
package/src/extension/team-tool/respond.ts +4 -3
package/src/extension/team-tool/run-not-found.ts +54 -0
package/src/extension/team-tool/run.ts +26 -4
package/src/extension/team-tool/status.ts +58 -4
package/src/extension/team-tool.ts +5 -3
package/src/runtime/async-runner.ts +7 -0
package/src/runtime/background-runner.ts +7 -1
package/src/runtime/chain-parser.ts +13 -5
package/src/runtime/checkpoint.ts +13 -1
package/src/runtime/child-pi.ts +9 -1
package/src/runtime/live-session-runtime.ts +15 -1
package/src/runtime/parent-guard.ts +2 -2
package/src/runtime/pipeline-runner.ts +3 -1
package/src/runtime/stale-reconciler.ts +28 -4
package/src/runtime/task-runner.ts +50 -20
package/src/runtime/team-runner.ts +19 -2
package/src/runtime/verification-gates.ts +21 -1
package/src/runtime/workspace-tree.ts +28 -2
package/src/schema/team-tool-schema.ts +9 -0
package/src/state/blob-store.ts +12 -10
package/src/state/event-log-rotation.ts +114 -93
package/src/state/event-log.ts +83 -23
package/src/state/health-store.ts +6 -1
package/src/state/locks.ts +66 -16
package/src/state/state-store.ts +46 -2
package/src/ui/card-colors.ts +7 -3
package/src/ui/dashboard-panes/agents-pane.ts +15 -2
package/src/ui/live-duration.ts +58 -0
package/src/ui/tool-render.ts +7 -11
package/src/ui/tool-renderers/index.ts +6 -3
package/src/ui/widget/widget-formatters.ts +2 -13
package/src/utils/fs-watch.ts +11 -60
package/src/utils/run-watcher-registry.ts +164 -0
package/src/workflows/discover-workflows.ts +2 -1
package/src/workflows/workflow-config.ts +5 -0
package/src/runtime/dynamic-script-runner.ts +0 -497
package/src/runtime/sandbox.ts +0 -335

package/src/extension/context-status-injection.ts ADDED Viewed

@@ -0,0 +1,174 @@
+/**
+ * context-status-injection.ts — Ambient crew-status injection (GAP-2).
+ *
+ * Registers a `context` event handler that keeps the parent agent continuously
+ * aware of in-flight crew runs. Without this, the agent "forgets" about active
+ * runs between turns unless it explicitly calls the `team` tool.
+ *
+ * ## How it works
+ *
+ * Pi's `context` event fires before EVERY LLM call (see Pi source
+ * `extensions/runner.ts:emitContext`). The handler receives the full messages
+ * array and may return a modified copy. Critically, the returned messages are
+ * used ONLY for that single LLM call (`agent-loop.ts:283-289` feeds the result
+ * straight into `convertToLlm` for the request) — they do NOT mutate the
+ * agent's persistent `state.messages`. So injection is transient per-call:
+ *   - No accumulation across turns (the note never enters history).
+ *   - No need to dedup against prior injections.
+ *   - No risk of corrupting the conversation transcript.
+ *
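+ * The injected note is a compact ambient status of 4–7 lines (header, run
+ * count, up to 3 run entries, an optional "…and N more" line, and a `team`
+ * tool hint), inserted BEFORE the last message so the last message remains the
+ * active turn driver (intended to preserve the user/assistant/tool alternation
+ * the LLMs expect).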
+ *
+ * ## Safety
+ *
+ * - No-op when zero runs are in-flight (returns undefined → Pi uses original
+ *   messages unchanged). Normal single-agent operation is completely unaffected.
+ * - `emitContext` already wraps handlers in try/catch and emits errors instead
+ *   of crashing the loop (Pi `runner.ts:933`), so a throw here can't break the
+ *   agent — but we also guard defensively.
+ * - Opt-out: `runtime.reliability.ambientStatusInjection: false` in config.
+ */
+import type { AgentMessage } from "@earendil-works/pi-agent-core";
+import type { Message } from "@earendil-works/pi-ai";
+import type { ExtensionAPI, ContextEvent } from "@earendil-works/pi-coding-agent";
+import { collectInFlightRuns } from "./registration/compaction-guard.ts";
+import type { TeamRunManifest } from "../state/types.ts";
+/**
+ * Sentinel that marks an injected ambient-status user message. This is only
+ * the OPENING of the bracketed header line; `formatAmbientStatus` appends the
+ * rest of the header text and the closing `]`.
+ */
+export const AMBIENT_STATUS_SENTINEL = "[pi-crew ambient status";
+/** Cap the number of runs listed inline to keep the note compact; any further runs are summarised as "…and N more". */
+const MAX_INLINE_RUNS = 3;
+/** Truncate long goals (to this many characters, ellipsis included) so one run can't dominate the context window. */
+const MAX_GOAL_LEN = 80;
+/**
+ * Derive a short age suffix for a run from its manifest timestamps alone
+ * (no extra I/O, so it is cheap enough to run on every LLM call).
+ *
+ * The result is a fragment meant to be appended directly after the run's
+ * parenthesised details, so it carries its own leading ", ":
+ *   - `updatedAt` parses      → ", updated just now" (under one minute) or
+ *                               ", updated <age> ago"
+ *   - only `createdAt` parses → ", running <age>"
+ *   - neither parses          → "" (caller appends nothing)
+ *
+ * @param createdAt  timestamp string understood by `Date.parse`, if known
+ * @param updatedAt  timestamp string understood by `Date.parse`, if known
+ */
+function runAge(createdAt?: string, updatedAt?: string): string {
+	// Missing values map to NaN so a single finiteness check covers both
+	// "absent" and "unparseable".
+	const toEpochMs = (stamp?: string): number => (stamp ? Date.parse(stamp) : NaN);
+	try {
+		const updatedMs = toEpochMs(updatedAt);
+		const createdMs = toEpochMs(createdAt);
+		if (!Number.isFinite(updatedMs)) {
+			// No usable update time: fall back to time since creation, if any.
+			return Number.isFinite(createdMs)
+				? `, running ${humanizeMs(Date.now() - createdMs)}`
+				: "";
+		}
+		const idleMs = Date.now() - updatedMs;
+		return idleMs < 60_000 ? `, updated just now` : `, updated ${humanizeMs(idleMs)} ago`;
+	} catch {
+		/* ignore malformed timestamps */
+		return "";
+	}
+}
+/**
+ * Render a millisecond duration as a compact label: "42s", "12m", "3h5m"
+ * or "2d".
+ *
+ * Every unit is floored, so a duration never displays as the next unit's
+ * boundary value (59.9s stays "59s" rather than "60s"). Negative or
+ * non-finite input — e.g. a timestamp slightly in the future because of
+ * clock skew — is treated as zero instead of rendering "-5s" or "NaNd".
+ *
+ * @param ms  elapsed time in milliseconds
+ * @returns   seconds below one minute, whole minutes below one hour,
+ *            hours+minutes below one day, whole days otherwise
+ */
+function humanizeMs(ms: number): string {
+	const elapsed = Number.isFinite(ms) && ms > 0 ? ms : 0;
+	if (elapsed < 60_000) return `${Math.floor(elapsed / 1000)}s`;
+	const m = Math.floor(elapsed / 60_000);
+	if (m < 60) return `${m}m`;
+	const h = Math.floor(m / 60);
+	return h < 24 ? `${h}h${m % 60}m` : `${Math.floor(h / 24)}d`;
+}
+/**
+ * Build a compact, human+LLM-readable ambient status string for the given
+ * in-flight runs. Returns "" for an empty list (caller treats as no-op).
+ *
+ * Layout, one item per line:
+ *   1. bracketed sentinel header marking the note as environmental context
+ *   2. count of in-flight runs
+ *   3. one bullet per run, for at most MAX_INLINE_RUNS runs
+ *   4. an "…and N more" bullet when runs were omitted
+ *   5. a hint listing the `team` tool actions for inspecting/joining runs
+ *
+ * Each goal is whitespace-collapsed before truncation so that a multi-line
+ * goal cannot break the one-line-per-run layout; a missing or blank goal is
+ * shown as "(no goal)".
+ *
+ * Exported for unit testing.
+ */
+export function formatAmbientStatus(runs: TeamRunManifest[]): string {
+	if (runs.length === 0) return "";
+	const truncate = (s: string, n: number): string =>
+		s.length > n ? `${s.slice(0, n - 1)}…` : s;
+	const lines: string[] = [
+		`${AMBIENT_STATUS_SENTINEL} — environmental context, not a user request]`,
+		`${runs.length} pi-crew run${runs.length === 1 ? "" : "s"} in flight:`,
+	];
+	for (const run of runs.slice(0, MAX_INLINE_RUNS)) {
+		const wf = run.workflow ? `, ${run.workflow}` : "";
+		const age = runAge(run.createdAt, run.updatedAt);
+		// Newlines/tabs inside a goal would spill the bullet across lines.
+		const goal = (run.goal ?? "").replace(/\s+/g, " ").trim() || "(no goal)";
+		lines.push(`• ${run.runId} (${run.status}, ${run.team}${wf})${age}: ${truncate(goal, MAX_GOAL_LEN)}`);
+	}
+	if (runs.length > MAX_INLINE_RUNS) {
+		lines.push(`• …and ${runs.length - MAX_INLINE_RUNS} more`);
+	}
+	lines.push("Inspect/join via the `team` tool: action=\"status\" (list), action=\"wait\" (join running), action=\"summary\"/action=\"get\" (results).");
+	return lines.join("\n");
+}
+/**
+ * Wrap the ambient status text in a `user`-role message stamped with the
+ * current time. The `user` role is used because the Message union has no
+ * `system` role (the system prompt is a separate field); the sentinel prefix
+ * in the text is what tells the model this is environmental information
+ * rather than a typed user instruction.
+ *
+ * Exported for unit testing.
+ */
+export function buildStatusMessage(runs: TeamRunManifest[]): Message {
+	const text = formatAmbientStatus(runs);
+	const timestamp = Date.now();
+	return { role: "user", content: [{ type: "text", text }], timestamp };
+}
+/**
+ * Result type for the `context` event handler (mirrors Pi's ContextEventResult,
+ * which isn't re-exported from the coding-agent package entry).
+ */
+export interface AmbientContextResult {
+	/** Replacement message list for the LLM call; `handleContextEvent` always sets it when it returns a result. */
+	messages?: AgentMessage[];
+}
+/**
+ * Core handler logic, kept apart from the Pi registration so it can be unit
+ * tested without a live ExtensionAPI.
+ *
+ * @param event  the `context` event; only `event.messages` is read, and the
+ *               array itself is never mutated (a new array is returned)
+ * @param cwd    project directory whose in-flight runs are collected
+ * @returns `{messages}` with the ambient status placed immediately before the
+ *          last message (or appended when there are 0–1 messages), or
+ *          `undefined` to leave the context untouched — when no runs are in
+ *          flight or when reading run state throws.
+ *
+ * NOTE(review): when the last message is a tool result, the note lands between
+ * it and the preceding message. Confirm that the downstream message
+ * conversion tolerates a user message between a tool call and its result.
+ *
+ * Exported for unit testing.
+ */
+export function handleContextEvent(event: ContextEvent, cwd: string): AmbientContextResult | undefined {
+	let inFlight: TeamRunManifest[];
+	try {
+		inFlight = collectInFlightRuns(cwd);
+	} catch {
+		// Best-effort awareness feature: a state read failure means "inject
+		// nothing" rather than surfacing a noisy handler error.
+		return undefined;
+	}
+	if (inFlight.length === 0) return undefined;
+	const history = event.messages;
+	const note = buildStatusMessage(inFlight) as unknown as AgentMessage;
+	// Keep the genuine last message (the current turn driver) last; with 0–1
+	// messages the split point is the end, i.e. the note is appended.
+	const splitAt = history.length > 1 ? history.length - 1 : history.length;
+	return { messages: [...history.slice(0, splitAt), note, ...history.slice(splitAt)] };
+}
+/**
+ * Register the ambient-status `context` event handler on `pi`.
+ *
+ * The project directory is taken from `process.cwd()` each time the handler
+ * fires (falling back to "." if `process.cwd` is not a function), and the
+ * work is delegated to `handleContextEvent`.
+ *
+ * @param pi    extension API used to subscribe to the `context` event
+ * @param opts  pass `enabled: false` (from
+ *              `runtime.reliability.ambientStatusInjection`) to disable the
+ *              feature; in that case no handler is registered at all. Any
+ *              other value, including omission, enables it.
+ */
+export function registerContextStatusInjection(
+	pi: ExtensionAPI,
+	opts: { enabled?: boolean } = {},
+): void {
+	const disabled = opts.enabled === false;
+	if (disabled) return;
+	const onContext = (event: ContextEvent): AmbientContextResult | undefined => {
+		const cwd = typeof process.cwd === "function" ? process.cwd() : ".";
+		return handleContextEvent(event, cwd);
+	};
+	pi.on("context", onContext);
+}

package/src/extension/knowledge-injection.ts CHANGED Viewed

@@ -29,17 +29,45 @@ export function knowledgePath(cwd: string): string {
 export function readKnowledge(cwd: string): string {
 	try {
 		const p = knowledgePath(cwd);
-		if (!fs.existsSync(p)) return "";
+		const stat = tryStat(p);
+		if (!stat) {
+			knowledgeCache.delete(p);
+			return "";
+		}
+		// P5 (Round 15): mtime+size cache. readKnowledge fires on every agent
+		// start (main session + every worker), re-reading the file each time.
+		// For a run with N workers this is N redundant readFileSync of the same
+		// file. Cache by (mtimeMs, size) and only re-read when the file changes.
+		const cacheKey = `${stat.mtimeMs}:${stat.size}`;
+		const cached = knowledgeCache.get(p);
+		if (cached && cached.key === cacheKey) return cached.content;
 		let content = fs.readFileSync(p, "utf8").trim();
 		if (content.length > MAX_KNOWLEDGE_BYTES) {
 			content = `${content.slice(0, MAX_KNOWLEDGE_BYTES)}\n\n<!-- knowledge.md truncated at ${MAX_KNOWLEDGE_BYTES} bytes -->`;
 		}
+		knowledgeCache.set(p, { key: cacheKey, content });
 		return content;
 	} catch {
 		return "";
 	}
 }
+/**
+ * Synchronously stat `p`, keeping only the two fields used to detect a change
+ * (modification time and size). Any failure — missing file, permissions, etc.
+ * — yields `undefined` instead of throwing.
+ */
+function tryStat(p: string): { mtimeMs: number; size: number } | undefined {
+	let stats: fs.Stats;
+	try {
+		stats = fs.statSync(p);
+	} catch {
+		return undefined;
+	}
+	const { mtimeMs, size } = stats;
+	return { mtimeMs, size };
+}
+/** One cached read of a knowledge file. */
+interface CachedKnowledge {
+	/** Change-detection key, built by `readKnowledge` as `${mtimeMs}:${size}`. */
+	key: string;
+	/** Content as returned by `readKnowledge` (trimmed, and truncated when over the limit). */
+	content: string;
+}
+/** Module-level cache keyed by knowledge-file path; the entry is dropped when the file can no longer be stat'ed. */
+const knowledgeCache = new Map<string, CachedKnowledge>();
 /** Build the injected prompt fragment (empty if no knowledge). */
 export function buildKnowledgeFragment(cwd: string): string {
 	const content = readKnowledge(cwd);

package/src/extension/register.ts CHANGED Viewed

@@ -82,7 +82,8 @@ import {
 import { RenderScheduler } from "../ui/render-scheduler.ts";
 import { runEventBus } from "../ui/run-event-bus.ts";
 import { createRunSnapshotCache } from "../ui/run-snapshot-cache.ts";
-import { closeWatcher, watchCrewState } from "../utils/fs-watch.ts";
+import { closeWatcher } from "../utils/fs-watch.ts";
+import { RunWatcherRegistry } from "../utils/run-watcher-registry.ts";
 import { logInternalError } from "../utils/internal-error.ts";
 import {
 	clearProjectRootCache,
@@ -113,6 +114,7 @@ import { registerCrewMessageRenderers } from "./message-renderers.ts";
 import { registerCrewInputRouter } from "./crew-input-router.ts";
 import { registerCrewAutocomplete } from "./crew-autocomplete.ts";
 import { registerCrewShortcuts } from "./crew-shortcuts.ts";
+import { registerContextStatusInjection } from "./context-status-injection.ts";
 import { registerTeamTool } from "./registration/team-tool.ts";
 import { handleTeamTool } from "./team-tool.ts";
 import { persistScheduledJobUpdate } from "./team-tool/handle-schedule.ts";
@@ -724,8 +726,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
 	// Linux), file changes (manifest/tasks/events/agents) trigger an
 	// immediate cache invalidate via renderScheduler.schedule. Falls back to
 	// poll-only behavior on systems where fs.watch errors.
-	let crewWatcher: import("node:fs").FSWatcher | undefined;
-	let userCrewWatcher: import("node:fs").FSWatcher | undefined;
+	// pts/2 hang fix (2026-06-16): the previous RECURSIVE fs.watch(<state>, {recursive:true})
+	// exploded to O(total run history) inotify watches on Linux (109→339 observed) and
+	// caused a permanent busy-loop. Replaced with bounded per-active-run watchers via
+	// RunWatcherRegistry (root watcher on runs/ for new-run detection + one non-recursive
+	// watcher per active run, reconciled each preload tick in buildFrame).
+	let crewRunWatchers: RunWatcherRegistry | undefined;
+	let userCrewWatchers: RunWatcherRegistry | undefined;
 	// Separate map for foreground team-run AbortControllers (distinct from subagent controllers).
 	// P0 fix: stopSessionBoundSubagents must NOT abort foreground team runs on session switch.
 	// Foreground team runs run in the same process as the session; they naturally clean up
@@ -1115,10 +1122,10 @@ export function registerPiTeams(pi: ExtensionAPI): void {
 			clearTimeout(preloadTimer);
 			preloadTimer = undefined;
 		}
-		closeWatcher(crewWatcher);
-		crewWatcher = undefined;
-		closeWatcher(userCrewWatcher);
-		userCrewWatcher = undefined;
+		crewRunWatchers?.closeAll();
+		crewRunWatchers = undefined;
+		userCrewWatchers?.closeAll();
+		userCrewWatchers = undefined;
 		stopSessionBoundSubagents();
 		// P0 fix: also abort foreground team runs on session shutdown (not on session switch).
 		// This is the only place where foreground team run controllers should be aborted.
@@ -1589,6 +1596,25 @@ export function registerPiTeams(pi: ExtensionAPI): void {
 			lastFrameSnapshotCache = getRunSnapshotCache(currentCtx.cwd);
 			const manifests = lastFrameManifestCache.list(20);
 			lastPreloadedManifests = manifests;
+			// pts/2 hang fix: reconcile per-run watchers against the ACTIVE set only.
+			// This bounds inotify cost to O(active runs) — completed runs stop being
+			// watched as soon as they leave running/queued/planning status, instead of
+			// the recursive watcher watching the entire run history forever.
+			{
+				const onRunChange = (runId: string): void => {
+					if (cleanedUp || sessionGeneration !== ownerGeneration) return;
+					getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
+					renderScheduler?.schedule({ runId });
+				};
+				const onWatchErr = (error: unknown): void => {
+					logInternalError("register.runWatcher.change", error);
+				};
+				const active = manifests
+					.filter((r) => r.status === "running" || r.status === "queued" || r.status === "planning")
+					.map((r) => ({ runId: r.runId, runDir: r.stateRoot }));
+				crewRunWatchers?.reconcile(active, onRunChange, onWatchErr);
+				userCrewWatchers?.reconcile(active, onRunChange, onWatchErr);
+			}
 			const runIds = manifests.map((r) => r.runId);
 			await lastFrameSnapshotCache.preloadAllStale(runIds);
 			return true;
@@ -1814,72 +1840,53 @@ export function registerPiTeams(pi: ExtensionAPI): void {
 		renderSchedulerUnsubscribers.push(unsubscribeRunEvents);
 		// Start async preload loop — refreshes snapshot cache in background
 		startPreloadLoop(fallbackMs, effectiveRefreshMs);
-		// 1.3: native FS watcher on `<crewRoot>/state`. Triggers an immediate
-		// renderScheduler.schedule({runId}) when files inside any run change so
-		// the snapshot cache invalidates well before the 1s preload tick. Falls
-		// back silently to poll-only behavior on systems where recursive
-		// fs.watch is not supported.
+		// 1.3: BOUNDED run watcher (pts/2 hang fix 2026-06-16). Previously this was
+		// a RECURSIVE fs.watch(<state>, {recursive:true}) which on Linux expands to
+		// ONE inotify watch PER SUBDIR — with many historical runs under
+		// .crew/state/runs/ this ballooned to hundreds of watches (109→339 observed)
+		// and the event volume caused a permanent busy-loop (71% CPU, 400KB/s read).
+		// Now: a single non-recursive watcher on the runs/ ROOT (to detect new run
+		// dirs appearing — crew.run.created is never emitted) plus per-active-run
+		// watchers reconciled each preload tick in buildFrame. Total inotify cost is
+		// O(active runs), not O(total history). Falls back to poll-only (the preload
+		// loop already polls every effectiveRefreshMs) on systems where fs.watch
+		// errors or the runs dir is absent.
+		const crewRunWatcherOnChange = (runId: string): void => {
+			if (cleanedUp || sessionGeneration !== ownerGeneration) return;
+			getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
+			renderScheduler?.schedule({ runId });
+		};
+		const crewRunWatcherOnError = (error: unknown): void => {
+			logInternalError("register.crewRunWatchers.error", error);
+		};
 		try {
-			closeWatcher(crewWatcher);
-			crewWatcher = undefined;
-			const stateDir = path.join(projectCrewRoot(ctx.cwd), "state");
-			const watcher = watchCrewState(
-				stateDir,
-				(runId) => {
-					if (cleanedUp || sessionGeneration !== ownerGeneration)
-						return;
-					// Invalidate snapshot cache so the next renderTick reads fresh state from disk.
-					// Without this, renderTick re-renders from stale lastPreloadedManifests and
-					// shows ghost "running" entries for runs that already completed on disk.
-					const sc = getRunSnapshotCache(
-						currentCtx?.cwd ?? process.cwd(),
-					);
-					sc.invalidate(runId);
-					renderScheduler?.schedule({ runId });
-				},
-				(error) => {
-					logInternalError("register.crewWatcher.error", error);
-					closeWatcher(crewWatcher);
-					crewWatcher = undefined;
-				},
-			);
-			if (watcher) crewWatcher = watcher;
+			crewRunWatchers?.closeAll();
+			crewRunWatchers = undefined;
+			const crewRunsDir = path.join(projectCrewRoot(ctx.cwd), "state", "runs");
+			if (fs.existsSync(crewRunsDir)) {
+				crewRunWatchers = new RunWatcherRegistry();
+				crewRunWatchers.setRootWatcher(crewRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
+			}
 		} catch (error) {
-			logInternalError("register.crewWatcher.start", error);
+			logInternalError("register.crewRunWatchers.start", error);
 		}
-		// Also watch user-level state dir — fast-fix and other user-scoped runs
-		// write manifests there. Without this watcher, runs completing in user-level
+		// Also watch user-level runs dir — fast-fix and other user-scoped runs
+		// write manifests there. Without this, runs completing in user-level
 		// state never trigger cache invalidation, causing ghost "running" entries.
 		try {
-			closeWatcher(userCrewWatcher);
-			userCrewWatcher = undefined;
-			const userStateDir = path.join(userCrewRoot(), "state");
-			if (fs.existsSync(userStateDir)) {
-				const userWatcher = watchCrewState(
-					userStateDir,
-					(runId) => {
-						if (cleanedUp || sessionGeneration !== ownerGeneration)
-							return;
-						const sc = getRunSnapshotCache(
-							currentCtx?.cwd ?? process.cwd(),
-						);
-						sc.invalidate(runId);
-						renderScheduler?.schedule({ runId });
-					},
-					(error) => {
-						logInternalError(
-							"register.userCrewWatcher.error",
-							error,
-						);
-						closeWatcher(userCrewWatcher);
-						userCrewWatcher = undefined;
-					},
-				);
-				if (userWatcher) userCrewWatcher = userWatcher;
+			userCrewWatchers?.closeAll();
+			userCrewWatchers = undefined;
+			const userRunsDir = path.join(userCrewRoot(), "state", "runs");
+			if (fs.existsSync(userRunsDir)) {
+				userCrewWatchers = new RunWatcherRegistry();
+				userCrewWatchers.setRootWatcher(userRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
 			}
 		} catch (error) {
-			logInternalError("register.userCrewWatcher.start", error);
+			logInternalError("register.userCrewWatchers.start", error);
 		}
+		// Kick an immediate preload so the first buildFrame reconciles per-run
+		// watchers for any runs that are already active on session start.
+		backgroundPreload();
 	});
 	pi.on("session_before_switch", () => {
 		sessionGeneration++;
@@ -2065,4 +2072,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
 	// (The crew autocomplete provider is registered from session_start once
 	// a UI context is available — see the session_start handler below.)
 	registerCrewShortcuts(pi);
+	// GAP-2 (Round 11): ambient crew-status injection. Registers a `context`
+	// event handler that appends a compact in-flight-runs note to the agent
+	// context on every LLM call, so the agent never "forgets" active runs.
+	// Transient per-call (does not pollute history), and a no-op when no runs
+	// are in-flight. Toggle via runtime.reliability.ambientStatusInjection.
+	registerContextStatusInjection(pi, {
+		enabled: loadConfig(process.cwd()).config.reliability?.ambientStatusInjection !== false,
+	});
 }

package/src/extension/team-tool/api.ts CHANGED Viewed

@@ -24,6 +24,7 @@ import { resolveRealContainedPath } from "../../utils/safe-paths.ts";
 import type { PiTeamsToolResult } from "../tool-result.ts";
 import { locateRunCwd } from "../team-tool.ts";
 import { configRecord, result, type TeamContext } from "./context.ts";
+import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
 export function globMatch(value: string, pattern: string): boolean {
 	// Prevent ReDoS: reject excessively long patterns
@@ -91,9 +92,9 @@ export async function handleApi(params: TeamToolParamsValue, ctx: TeamContext):
 	}
 	if (!params.runId) return result("API requires runId.", { action: "api", status: "error" }, true);
 	const runCwd = locateRunCwd(params.runId, ctx.cwd);
-	if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
+	if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
 	const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
-	if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
+	if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
 	if (operation === "read-manifest") {
 		return result(JSON.stringify(loaded.manifest, null, 2), { action: "api", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
 	}

package/src/extension/team-tool/cancel.ts CHANGED Viewed

@@ -12,6 +12,7 @@ import { executeHook, appendHookEvent } from "../../hooks/registry.ts";
 import type { PiTeamsToolResult } from "../tool-result.ts";
 import { locateRunCwd } from "../team-tool.ts";
 import { result, type TeamContext } from "./context.ts";
+import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
 import { enforceDestructiveIntent, intentFromConfig } from "./intent-policy.ts";
 import { invalidateSnapshot, type CacheControlDeps } from "./cache-control.ts";
@@ -80,9 +81,9 @@ function cancelReasonFromParams(params: TeamToolParamsValue): CancellationReason
 export async function handleRetry(params: TeamToolParamsValue, ctx: TeamContext, deps?: CacheControlDeps): Promise<PiTeamsToolResult> {
 	if (!params.runId) return result("Retry requires runId.", { action: "retry", status: "error" }, true);
 	const runCwd = locateRunCwd(params.runId, ctx.cwd);
-	if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
+	if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
 	const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
-	if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
+	if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
 	// Pre-lock ownership check: reject foreign-owned runs unless force is set
 	const foreignRun = typeof loaded.manifest.ownerSessionId === "string" && loaded.manifest.ownerSessionId !== ctx.sessionId;
@@ -145,9 +146,9 @@ export async function handleCancel(params: TeamToolParamsValue, ctx: TeamContext
 	if (intentError) return intentError;
 	if (!params.runId) return result("Cancel requires runId.", { action: "cancel", status: "error" }, true);
 	const runCwd = locateRunCwd(params.runId, ctx.cwd);
-	if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
+	if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
 	const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
-	if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
+	if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
 	// Pre-lock ownership check: reject foreign-owned runs unless force is set
 	const preCheck = abortOwned(loaded.manifest.runId, undefined, ctx, params.force);

package/src/extension/team-tool/explain.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import * as fs from "node:fs";
+import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
 import * as path from "node:path";
 import { loadRunManifestById } from "../../state/state-store.ts";
 import type { TeamRunManifest, TeamTaskState } from "../../state/types.ts";
@@ -211,7 +212,7 @@ export function handleExplain(params: {
   const loaded = loadRunManifestById(cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
   if (!loaded) {
-    return result(`Run '${params.runId}' not found.`, { action: "explain", status: "error" }, true);
+    return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "explain", status: "error" }, true);
   }
   const { manifest, tasks } = loaded;

package/src/extension/team-tool/failure-patterns.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * failure-patterns.ts — Group failed tasks by error similarity (Round 17 BS-4).
+ *
+ * Before this, a run with 8 failed tasks surfaced 8 separate raw error
+ * strings. The user had to mentally group them ("5 of these say 'model
+ * routing fallback failed'"). This module detects common failure patterns
+ * so `summary` can say "5 of 8 failures share root cause: X".
+ *
+ * Grouping strategy (cheap, deterministic, no ML):
+ *   1. Normalize: lowercase, collapse whitespace, strip task ids / run ids /
+ *      absolute paths / numbers → a canonical "signature".
+ *   2. Bucket by signature. Buckets with >1 member are "common patterns".
+ *   3. Sort by frequency desc.
+ *
+ * Conservative: only buckets with >=2 members count as a pattern (a single
+ * failure is just itself). Returns [] when there are no repeated signatures.
+ */
+/** A group of failed tasks whose errors normalize to the same signature. */
+export interface FailurePattern {
+	/** Canonical error signature used for grouping. */
+	signature: string;
+	/** A representative original error (the shortest variant) for display. */
+	representative: string;
+	/** Task ids that hit this pattern, in the order the tasks were supplied. */
+	taskIds: string[];
+	/** Count of failures in this bucket (== taskIds.length). */
+	count: number;
+}
+/** Minimal view of a task needed for failure grouping. */
+export interface FailurePatternInput {
+	/** Task id, reported back in `FailurePattern.taskIds`. */
+	id: string;
+	/** Task status; only "failed" and "cancelled" are treated as failures. */
+	status: string;
+	/** Raw error text, if any; a missing or empty error groups under "(no error detail)". */
+	error?: string;
+}
+/**
+ * Normalize an error string into a grouping signature.
+ *
+ * Steps, in order: lowercase; replace run ids with `<run>`; replace task ids
+ * with `<task>`; replace absolute paths under common roots with `<path>`;
+ * replace numbers with `N`; collapse whitespace. A missing, empty, or
+ * whitespace-only error yields "(no error detail)".
+ *
+ * Task ids must be two or more digits followed by a `_`/`-` separator and a
+ * letter (`01_explore`, `adaptive-03-executor`). Plain numbers therefore fall
+ * through to the number rule, so "exit code 1" and "exit code 127" share one
+ * signature. Numbers carrying a unit suffix keep the suffix
+ * ("5000ms" → "Nms") so differing durations still group together.
+ *
+ * Exported for unit testing.
+ *
+ * @param error  raw error text from a failed task, if any
+ * @returns      canonical signature; equal signatures mean "same pattern"
+ */
+export function normalizeErrorSignature(error: string | undefined): string {
+	if (!error) return "(no error detail)";
+	let s = error.toLowerCase();
+	// Strip run ids (team_YYYYMMDDHHMMSS_xxxxxxxxxxxxxxxx)
+	s = s.replace(/team_\d{8,}_[a-z0-9]{12,}/g, "<run>");
+	// Strip task ids (01_explore, adaptive-03-executor, etc.). The separator
+	// and letter are required so bare numbers (127, 2024-01-15) are not
+	// mistaken for task ids.
+	s = s.replace(/\b(?:adaptive-)?\d{2,}[_-][a-z][a-z0-9_-]*/g, "<task>");
+	// Strip absolute paths
+	s = s.replace(/\/(?:home|users|tmp|var|opt|root)[^\s'"]*/g, "<path>");
+	// Strip numbers (line numbers, counts, pids, ms durations). The lookahead
+	// lets a trailing unit such as "ms" follow the digits without blocking
+	// the match; only the digits are replaced.
+	s = s.replace(/\b\d+(?=[a-z]*\b)/g, "N");
+	// Collapse whitespace
+	s = s.replace(/\s+/g, " ").trim();
+	return s || "(no error detail)";
+}
+/**
+ * Bucket failed tasks by normalized error signature and return the buckets
+ * that repeat. Singletons are dropped (one failure is not a "pattern"), and
+ * the result is ordered by descending count.
+ *
+ * @param tasks  the run's tasks; those with status 'failed' or 'cancelled'
+ *               are treated as failures, all others are ignored.
+ * @returns      repeated patterns, or [] when nothing failed or no signature
+ *               occurs at least twice.
+ */
+export function aggregateFailurePatterns(tasks: FailurePatternInput[]): FailurePattern[] {
+	const bySignature = new Map<string, FailurePattern>();
+	for (const task of tasks) {
+		if (task.status !== "failed" && task.status !== "cancelled") continue;
+		const signature = normalizeErrorSignature(task.error);
+		const bucket = bySignature.get(signature);
+		if (!bucket) {
+			bySignature.set(signature, {
+				signature,
+				representative: task.error ?? "(no error detail)",
+				taskIds: [task.id],
+				count: 1,
+			});
+			continue;
+		}
+		bucket.taskIds.push(task.id);
+		bucket.count += 1;
+		// Prefer the shortest non-empty original error as the display text.
+		const candidate = task.error;
+		if (!candidate) continue;
+		if (!bucket.representative || candidate.length < bucket.representative.length) {
+			bucket.representative = candidate;
+		}
+	}
+	// Keep only repeated root causes, most frequent first.
+	const repeated: FailurePattern[] = [];
+	for (const bucket of bySignature.values()) {
+		if (bucket.count >= 2) repeated.push(bucket);
+	}
+	return repeated.sort((x, y) => y.count - x.count);
+}
+/**
+ * Turn repeated failure patterns into display lines for the `summary`
+ * action. An empty array means "no repeated patterns", letting the caller
+ * drop the section altogether.
+ *
+ * Output shape: one header line, then two lines per pattern — the count with
+ * a representative error (cut to 100 characters, ellipsis included) and an
+ * indented list of at most 6 task ids followed by ", +N more" when needed.
+ *
+ * Example output:
+ *   Common failure patterns (5 of 5 failures share 2 root causes):
+ *   - [×3] model routing fallback failed: all 2 candidates exhausted
+ *       tasks: 02_exec, 03_exec, 04_exec
+ *   - [×2] EPERM: operation not permitted, rename
+ *       tasks: 05_exec, 06_exec
+ */
+export function formatFailurePatterns(tasks: FailurePatternInput[]): string[] {
+	const patterns = aggregateFailurePatterns(tasks);
+	if (patterns.length === 0) return [];
+	let failedCount = 0;
+	for (const task of tasks) {
+		if (task.status === "failed" || task.status === "cancelled") failedCount += 1;
+	}
+	let groupedCount = 0;
+	for (const pattern of patterns) groupedCount += pattern.count;
+	const header = `Common failure patterns (${groupedCount} of ${failedCount} failures share ${patterns.length} root cause${patterns.length === 1 ? "" : "s"}):`;
+	const detail = patterns.flatMap((pattern) => {
+		const full = pattern.representative;
+		const rep = full.length > 100 ? `${full.slice(0, 99)}…` : full;
+		const hidden = pattern.taskIds.length - 6;
+		const more = hidden > 0 ? `, +${hidden} more` : "";
+		return [
+			`- [×${pattern.count}] ${rep}`,
+			`    tasks: ${pattern.taskIds.slice(0, 6).join(", ")}${more}`,
+		];
+	});
+	return [header, ...detail];
+}