pi-crew 0.7.4 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CHANGELOG.md +79 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/config/types.ts +2 -0
  9. package/src/errors.ts +66 -0
  10. package/src/extension/action-suggestions.ts +71 -0
  11. package/src/extension/context-status-injection.ts +174 -0
  12. package/src/extension/knowledge-injection.ts +29 -1
  13. package/src/extension/register.ts +81 -65
  14. package/src/extension/team-tool/api.ts +3 -2
  15. package/src/extension/team-tool/cancel.ts +5 -4
  16. package/src/extension/team-tool/explain.ts +2 -1
  17. package/src/extension/team-tool/failure-patterns.ts +124 -0
  18. package/src/extension/team-tool/inspect.ts +10 -6
  19. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  20. package/src/extension/team-tool/respond.ts +4 -3
  21. package/src/extension/team-tool/run-not-found.ts +54 -0
  22. package/src/extension/team-tool/run.ts +26 -4
  23. package/src/extension/team-tool/status.ts +58 -4
  24. package/src/extension/team-tool.ts +5 -3
  25. package/src/runtime/async-runner.ts +7 -0
  26. package/src/runtime/background-runner.ts +7 -1
  27. package/src/runtime/chain-parser.ts +13 -5
  28. package/src/runtime/checkpoint.ts +13 -1
  29. package/src/runtime/child-pi.ts +9 -1
  30. package/src/runtime/live-session-runtime.ts +15 -1
  31. package/src/runtime/parent-guard.ts +2 -2
  32. package/src/runtime/pipeline-runner.ts +3 -1
  33. package/src/runtime/stale-reconciler.ts +28 -4
  34. package/src/runtime/task-runner.ts +50 -20
  35. package/src/runtime/team-runner.ts +19 -2
  36. package/src/runtime/verification-gates.ts +21 -1
  37. package/src/runtime/workspace-tree.ts +28 -2
  38. package/src/schema/team-tool-schema.ts +9 -0
  39. package/src/state/blob-store.ts +12 -10
  40. package/src/state/event-log-rotation.ts +114 -93
  41. package/src/state/event-log.ts +83 -23
  42. package/src/state/health-store.ts +6 -1
  43. package/src/state/locks.ts +66 -16
  44. package/src/state/state-store.ts +46 -2
  45. package/src/ui/card-colors.ts +7 -3
  46. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  47. package/src/ui/live-duration.ts +58 -0
  48. package/src/ui/tool-render.ts +7 -11
  49. package/src/ui/tool-renderers/index.ts +6 -3
  50. package/src/ui/widget/widget-formatters.ts +2 -13
  51. package/src/utils/fs-watch.ts +11 -60
  52. package/src/utils/run-watcher-registry.ts +164 -0
  53. package/src/workflows/discover-workflows.ts +2 -1
  54. package/src/workflows/workflow-config.ts +5 -0
  55. package/src/runtime/dynamic-script-runner.ts +0 -497
  56. package/src/runtime/sandbox.ts +0 -335
@@ -0,0 +1,174 @@
1
+ /**
2
+ * context-status-injection.ts — Ambient crew-status injection (GAP-2).
3
+ *
4
+ * Registers a `context` event handler that keeps the parent agent continuously
5
+ * aware of in-flight crew runs. Without this, the agent "forgets" about active
6
+ * runs between turns unless it explicitly calls the `team` tool.
7
+ *
8
+ * ## How it works
9
+ *
10
+ * Pi's `context` event fires before EVERY LLM call (see Pi source
11
+ * `extensions/runner.ts:emitContext`). The handler receives the full messages
12
+ * array and may return a modified copy. Critically, the returned messages are
13
+ * used ONLY for that single LLM call (`agent-loop.ts:283-289` feeds the result
14
+ * straight into `convertToLlm` for the request) — they do NOT mutate the
15
+ * agent's persistent `state.messages`. So injection is transient per-call:
16
+ * - No accumulation across turns (the note never enters history).
17
+ * - No need to dedup against prior injections.
18
+ * - No risk of corrupting the conversation transcript.
19
+ *
20
+ * The injected note is a compact 1–4 line ambient status, inserted BEFORE the
21
+ * last message so the last message remains the active turn driver (preserves
22
+ * the user/assistant/tool alternation the LLMs expect).
23
+ *
24
+ * ## Safety
25
+ *
26
+ * - No-op when zero runs are in-flight (returns undefined → Pi uses original
27
+ * messages unchanged). Normal single-agent operation is completely unaffected.
28
+ * - `emitContext` already wraps handlers in try/catch and emits errors instead
29
+ * of crashing the loop (Pi `runner.ts:933`), so a throw here can't break the
30
+ * agent — but we also guard defensively.
31
+ * - Opt-out: `runtime.reliability.ambientStatusInjection: false` in config.
32
+ */
33
+
34
+ import type { AgentMessage } from "@earendil-works/pi-agent-core";
35
+ import type { Message } from "@earendil-works/pi-ai";
36
+ import type { ExtensionAPI, ContextEvent } from "@earendil-works/pi-coding-agent";
37
+ import { collectInFlightRuns } from "./registration/compaction-guard.ts";
38
+ import type { TeamRunManifest } from "../state/types.ts";
39
+
40
+ /** Opening prefix of every injected ambient-status note; formatAmbientStatus appends the rest of the bracketed header. */
41
+ export const AMBIENT_STATUS_SENTINEL = "[pi-crew ambient status";
42
+
43
+ /** Maximum number of runs listed individually; the remainder is summarized as "…and N more". */
44
+ const MAX_INLINE_RUNS = 3;
45
+ /** Maximum rendered goal length in characters, trailing ellipsis included, so one run can't dominate the note. */
46
+ const MAX_GOAL_LEN = 80;
47
+
/**
 * Cheap human-readable run age derived from manifest timestamps (no extra I/O).
 *
 * Prefers `updatedAt`: returns ", updated just now" when the last update is
 * less than a minute old (or lies in the future because of clock skew), and
 * ", updated <age> ago" otherwise. Falls back to ", running <age>" based on
 * `createdAt`. Returns "" when neither timestamp parses.
 *
 * @param createdAt timestamp string of run creation, if known
 * @param updatedAt timestamp string of the last manifest update, if known
 * @returns a suffix starting with ", " or the empty string
 */
function runAge(createdAt?: string, updatedAt?: string): string {
	try {
		const updated = updatedAt ? Date.parse(updatedAt) : NaN;
		if (Number.isFinite(updated)) {
			const sinceUpdate = Date.now() - updated;
			return sinceUpdate < 60_000
				? `, updated just now`
				: `, updated ${humanizeMs(sinceUpdate)} ago`;
		}
		const created = createdAt ? Date.parse(createdAt) : NaN;
		if (Number.isFinite(created)) {
			// humanizeMs clamps negative spans, so a createdAt slightly in the
			// future (clock skew) renders as "0s" instead of a negative age.
			return `, running ${humanizeMs(Date.now() - created)}`;
		}
	} catch { /* ignore malformed timestamps */ }
	return "";
}

/**
 * Render a millisecond span as a short age: "42s", "12m", "3h05"-style
 * "3h5m", or whole days ("2d") from 24 hours up.
 *
 * Every unit is floored so the output never overshoots a boundary (59.9s is
 * "59s", not "60s"). Negative or non-finite input is treated as 0.
 *
 * @param ms elapsed time in milliseconds
 * @returns the compact age string
 */
function humanizeMs(ms: number): string {
	const span = Number.isFinite(ms) ? Math.max(0, ms) : 0;
	if (span < 60_000) return `${Math.floor(span / 1000)}s`;
	const m = Math.floor(span / 60_000);
	if (m < 60) return `${m}m`;
	const h = Math.floor(m / 60);
	return h < 24 ? `${h}h${m % 60}m` : `${Math.floor(h / 24)}d`;
}
77
+
/**
 * Build a compact, human+LLM-readable ambient status string for the given
 * in-flight runs.
 *
 * Layout: a sentinel header line, a count line, one bullet per run (at most
 * MAX_INLINE_RUNS, followed by an "…and N more" bullet for the remainder),
 * and a closing hint on how to inspect runs through the `team` tool.
 *
 * Goals are flattened to a single line before truncation so a multi-line
 * goal cannot break the one-bullet-per-run layout; a missing or blank goal is
 * shown as "(no goal)".
 *
 * Exported for unit testing.
 *
 * @param runs in-flight run manifests to describe
 * @returns the note text, or "" for an empty list (caller treats as no-op)
 */
export function formatAmbientStatus(runs: TeamRunManifest[]): string {
	if (runs.length === 0) return "";
	const oneLine = (s: string): string => s.replace(/\s+/g, " ").trim();
	const truncate = (s: string, n: number): string =>
		s.length > n ? `${s.slice(0, n - 1)}…` : s;
	const lines: string[] = [
		`${AMBIENT_STATUS_SENTINEL} — environmental context, not a user request]`,
		`${runs.length} pi-crew run${runs.length === 1 ? "" : "s"} in flight:`,
	];
	for (const run of runs.slice(0, MAX_INLINE_RUNS)) {
		const wf = run.workflow ? `, ${run.workflow}` : "";
		const age = runAge(run.createdAt, run.updatedAt);
		// Collapse newlines/tabs first so the bullet stays on one line.
		const goal = oneLine(run.goal ?? "") || "(no goal)";
		lines.push(`• ${run.runId} (${run.status}, ${run.team}${wf})${age}: ${truncate(goal, MAX_GOAL_LEN)}`);
	}
	if (runs.length > MAX_INLINE_RUNS) {
		lines.push(`• …and ${runs.length - MAX_INLINE_RUNS} more`);
	}
	lines.push("Inspect/join via the `team` tool: action=\"status\" (list), action=\"wait\" (join running), action=\"summary\"/action=\"get\" (results).");
	return lines.join("\n");
}
104
+
/**
 * Wrap the ambient status text in a `user`-role message.
 *
 * The `user` role is used because the Message union has no `system` role (the
 * system prompt is a separate field); the sentinel prefix inside the text is
 * what tells the model this is environmental information rather than a typed
 * user instruction.
 *
 * Exported for unit testing.
 *
 * @param runs in-flight run manifests to summarize
 * @returns a message whose single text part is `formatAmbientStatus(runs)`,
 *          stamped with the current time
 */
export function buildStatusMessage(runs: TeamRunManifest[]): Message {
	const text = formatAmbientStatus(runs);
	const timestamp = Date.now();
	return { role: "user", content: [{ type: "text", text }], timestamp };
}
120
+
121
+ /** Return shape of the `context` event handler. Declared locally to mirror Pi's
122
+ * ContextEventResult, which isn't re-exported from the coding-agent package entry. */
123
+ export interface AmbientContextResult {
124
+ messages?: AgentMessage[]; // message list to use in place of the event's messages
125
+ }
126
+
/**
 * Core handler logic, kept apart from the Pi registration so it can be unit
 * tested without a live ExtensionAPI.
 *
 * Collects the in-flight runs for `cwd`; when there are none (or the state
 * read throws) the context is left untouched. Otherwise returns a copy of the
 * event's messages with the ambient status message placed immediately before
 * the final message, or appended when there are fewer than two messages.
 *
 * NOTE(review): when the final message is a tool result, this places a
 * user-role message between the preceding assistant message and that result.
 * Some provider APIs require tool results to directly follow the tool call —
 * confirm how Pi's message conversion handles this ordering.
 *
 * Exported for unit testing.
 *
 * @param event the `context` event carrying the messages for the next LLM call
 * @param cwd project directory whose crew state is inspected
 * @returns `{ messages }` with the status inserted, or `undefined` for no change
 */
export function handleContextEvent(event: ContextEvent, cwd: string): AmbientContextResult | undefined {
	let inFlight: TeamRunManifest[];
	try {
		inFlight = collectInFlightRuns(cwd);
	} catch {
		// Best-effort awareness feature: a failed state read means no
		// injection rather than a noisy handler error.
		return undefined;
	}
	if (inFlight.length === 0) return undefined;

	const snapshot = [...event.messages];
	const note = buildStatusMessage(inFlight) as unknown as AgentMessage;
	// Keep the genuine last message (the current turn driver) last whenever
	// there is something to insert in front of.
	const keepLast = snapshot.length > 1;
	const head = keepLast ? snapshot.slice(0, -1) : snapshot;
	const tail = keepLast ? snapshot.slice(-1) : [];
	return { messages: [...head, note, ...tail] };
}
157
+
/**
 * Register the ambient-status `context` event handler on the given Pi
 * extension API.
 *
 * The working directory is resolved with `process.cwd()` each time the event
 * fires (falling back to "." when `process.cwd` is not a function) and passed
 * to `handleContextEvent`.
 *
 * @param pi extension API to attach the handler to
 * @param opts `enabled: false` (from
 *        `runtime.reliability.ambientStatusInjection`) skips registration
 *        entirely; any other value registers the handler
 */
export function registerContextStatusInjection(
	pi: ExtensionAPI,
	opts: { enabled?: boolean } = {},
): void {
	const disabled = opts.enabled === false;
	if (disabled) return;

	const onContext = (event: ContextEvent): AmbientContextResult | undefined => {
		const canResolveCwd = typeof process.cwd === "function";
		return handleContextEvent(event, canResolveCwd ? process.cwd() : ".");
	};
	pi.on("context", onContext);
}
@@ -29,17 +29,45 @@ export function knowledgePath(cwd: string): string {
29
29
  export function readKnowledge(cwd: string): string {
30
30
  try {
31
31
  const p = knowledgePath(cwd);
32
- if (!fs.existsSync(p)) return "";
32
+ const stat = tryStat(p);
33
+ if (!stat) {
34
+ knowledgeCache.delete(p); // file missing or unreadable: drop any cached content for it
35
+ return "";
36
+ }
37
+ // P5 (Round 15): mtime+size cache. readKnowledge runs on every agent
38
+ // start (main session + every worker), so without a cache a run with N
39
+ // workers performs N redundant readFileSync calls on the same file.
40
+ // The entry is keyed by (mtimeMs, size) and the file is re-read only when either changes.
41
+ const cacheKey = `${stat.mtimeMs}:${stat.size}`;
42
+ const cached = knowledgeCache.get(p);
43
+ if (cached && cached.key === cacheKey) return cached.content;
33
44
  let content = fs.readFileSync(p, "utf8").trim();
34
45
  if (content.length > MAX_KNOWLEDGE_BYTES) { // NOTE(review): compares string length (UTF-16 code units), not encoded bytes
35
46
  content = `${content.slice(0, MAX_KNOWLEDGE_BYTES)}\n\n<!-- knowledge.md truncated at ${MAX_KNOWLEDGE_BYTES} bytes -->`;
36
47
  }
48
+ knowledgeCache.set(p, { key: cacheKey, content });
37
49
  return content;
38
50
  } catch { // best-effort: any failure while reading yields no knowledge
39
51
  return "";
40
52
  }
41
53
  }
42
54
 
/**
 * Stat a path without throwing.
 *
 * @param p path to inspect
 * @returns the entry's `mtimeMs` and `size`, or `undefined` when `statSync`
 *          fails for any reason (missing file, permissions, ...)
 */
function tryStat(p: string): { mtimeMs: number; size: number } | undefined {
	let stats: fs.Stats;
	try {
		stats = fs.statSync(p);
	} catch {
		return undefined;
	}
	const { mtimeMs, size } = stats;
	return { mtimeMs, size };
}
64
+
65
+ interface CachedKnowledge { // one cached read of a knowledge file
66
+ key: string; // "<mtimeMs>:<size>" fingerprint of the file at the time it was read
67
+ content: string; // trimmed (and possibly truncated) text that readKnowledge returned
68
+ }
69
+ const knowledgeCache = new Map<string, CachedKnowledge>(); // keyed by knowledge file path
70
+
43
71
  /** Build the injected prompt fragment (empty if no knowledge). */
44
72
  export function buildKnowledgeFragment(cwd: string): string {
45
73
  const content = readKnowledge(cwd);
@@ -82,7 +82,8 @@ import {
82
82
  import { RenderScheduler } from "../ui/render-scheduler.ts";
83
83
  import { runEventBus } from "../ui/run-event-bus.ts";
84
84
  import { createRunSnapshotCache } from "../ui/run-snapshot-cache.ts";
85
- import { closeWatcher, watchCrewState } from "../utils/fs-watch.ts";
85
+ import { closeWatcher } from "../utils/fs-watch.ts";
86
+ import { RunWatcherRegistry } from "../utils/run-watcher-registry.ts";
86
87
  import { logInternalError } from "../utils/internal-error.ts";
87
88
  import {
88
89
  clearProjectRootCache,
@@ -113,6 +114,7 @@ import { registerCrewMessageRenderers } from "./message-renderers.ts";
113
114
  import { registerCrewInputRouter } from "./crew-input-router.ts";
114
115
  import { registerCrewAutocomplete } from "./crew-autocomplete.ts";
115
116
  import { registerCrewShortcuts } from "./crew-shortcuts.ts";
117
+ import { registerContextStatusInjection } from "./context-status-injection.ts";
116
118
  import { registerTeamTool } from "./registration/team-tool.ts";
117
119
  import { handleTeamTool } from "./team-tool.ts";
118
120
  import { persistScheduledJobUpdate } from "./team-tool/handle-schedule.ts";
@@ -724,8 +726,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
724
726
  // Linux), file changes (manifest/tasks/events/agents) trigger an
725
727
  // immediate cache invalidate via renderScheduler.schedule. Falls back to
726
728
  // poll-only behavior on systems where fs.watch errors.
727
- let crewWatcher: import("node:fs").FSWatcher | undefined;
728
- let userCrewWatcher: import("node:fs").FSWatcher | undefined;
729
+ // pts/2 hang fix (2026-06-16): the previous RECURSIVE fs.watch(<state>, {recursive:true})
730
+ // exploded to O(total run history) inotify watches on Linux (109→339 observed) and
731
+ // caused a permanent busy-loop. Replaced with bounded per-active-run watchers via
732
+ // RunWatcherRegistry (root watcher on runs/ for new-run detection + one non-recursive
733
+ // watcher per active run, reconciled each preload tick in buildFrame).
734
+ let crewRunWatchers: RunWatcherRegistry | undefined;
735
+ let userCrewWatchers: RunWatcherRegistry | undefined;
729
736
  // Separate map for foreground team-run AbortControllers (distinct from subagent controllers).
730
737
  // P0 fix: stopSessionBoundSubagents must NOT abort foreground team runs on session switch.
731
738
  // Foreground team runs run in the same process as the session; they naturally clean up
@@ -1115,10 +1122,10 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1115
1122
  clearTimeout(preloadTimer);
1116
1123
  preloadTimer = undefined;
1117
1124
  }
1118
- closeWatcher(crewWatcher);
1119
- crewWatcher = undefined;
1120
- closeWatcher(userCrewWatcher);
1121
- userCrewWatcher = undefined;
1125
+ crewRunWatchers?.closeAll();
1126
+ crewRunWatchers = undefined;
1127
+ userCrewWatchers?.closeAll();
1128
+ userCrewWatchers = undefined;
1122
1129
  stopSessionBoundSubagents();
1123
1130
  // P0 fix: also abort foreground team runs on session shutdown (not on session switch).
1124
1131
  // This is the only place where foreground team run controllers should be aborted.
@@ -1589,6 +1596,25 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1589
1596
  lastFrameSnapshotCache = getRunSnapshotCache(currentCtx.cwd);
1590
1597
  const manifests = lastFrameManifestCache.list(20);
1591
1598
  lastPreloadedManifests = manifests;
1599
+ // pts/2 hang fix: reconcile per-run watchers against the ACTIVE set only.
1600
+ // This bounds inotify cost to O(active runs) — completed runs stop being
1601
+ // watched as soon as they leave running/queued/planning status, instead of
1602
+ // the recursive watcher watching the entire run history forever.
1603
+ {
1604
+ const onRunChange = (runId: string): void => {
1605
+ if (cleanedUp || sessionGeneration !== ownerGeneration) return;
1606
+ getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
1607
+ renderScheduler?.schedule({ runId });
1608
+ };
1609
+ const onWatchErr = (error: unknown): void => {
1610
+ logInternalError("register.runWatcher.change", error);
1611
+ };
1612
+ const active = manifests
1613
+ .filter((r) => r.status === "running" || r.status === "queued" || r.status === "planning")
1614
+ .map((r) => ({ runId: r.runId, runDir: r.stateRoot }));
1615
+ crewRunWatchers?.reconcile(active, onRunChange, onWatchErr);
1616
+ userCrewWatchers?.reconcile(active, onRunChange, onWatchErr);
1617
+ }
1592
1618
  const runIds = manifests.map((r) => r.runId);
1593
1619
  await lastFrameSnapshotCache.preloadAllStale(runIds);
1594
1620
  return true;
@@ -1814,72 +1840,53 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1814
1840
  renderSchedulerUnsubscribers.push(unsubscribeRunEvents);
1815
1841
  // Start async preload loop — refreshes snapshot cache in background
1816
1842
  startPreloadLoop(fallbackMs, effectiveRefreshMs);
1817
- // 1.3: native FS watcher on `<crewRoot>/state`. Triggers an immediate
1818
- // renderScheduler.schedule({runId}) when files inside any run change so
1819
- // the snapshot cache invalidates well before the 1s preload tick. Falls
1820
- // back silently to poll-only behavior on systems where recursive
1821
- // fs.watch is not supported.
1843
+ // 1.3: BOUNDED run watcher (pts/2 hang fix 2026-06-16). Previously this was
1844
+ // a RECURSIVE fs.watch(<state>, {recursive:true}) which on Linux expands to
1845
+ // ONE inotify watch PER SUBDIR with many historical runs under
1846
+ // .crew/state/runs/ this ballooned to hundreds of watches (109→339 observed)
1847
+ // and the event volume caused a permanent busy-loop (71% CPU, 400KB/s read).
1848
+ // Now: a single non-recursive watcher on the runs/ ROOT (to detect new run
1849
+ // dirs appearing — crew.run.created is never emitted) plus per-active-run
1850
+ // watchers reconciled each preload tick in buildFrame. Total inotify cost is
1851
+ // O(active runs), not O(total history). Falls back to poll-only (the preload
1852
+ // loop already polls every effectiveRefreshMs) on systems where fs.watch
1853
+ // errors or the runs dir is absent.
1854
+ const crewRunWatcherOnChange = (runId: string): void => {
1855
+ if (cleanedUp || sessionGeneration !== ownerGeneration) return;
1856
+ getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
1857
+ renderScheduler?.schedule({ runId });
1858
+ };
1859
+ const crewRunWatcherOnError = (error: unknown): void => {
1860
+ logInternalError("register.crewRunWatchers.error", error);
1861
+ };
1822
1862
  try {
1823
- closeWatcher(crewWatcher);
1824
- crewWatcher = undefined;
1825
- const stateDir = path.join(projectCrewRoot(ctx.cwd), "state");
1826
- const watcher = watchCrewState(
1827
- stateDir,
1828
- (runId) => {
1829
- if (cleanedUp || sessionGeneration !== ownerGeneration)
1830
- return;
1831
- // Invalidate snapshot cache so the next renderTick reads fresh state from disk.
1832
- // Without this, renderTick re-renders from stale lastPreloadedManifests and
1833
- // shows ghost "running" entries for runs that already completed on disk.
1834
- const sc = getRunSnapshotCache(
1835
- currentCtx?.cwd ?? process.cwd(),
1836
- );
1837
- sc.invalidate(runId);
1838
- renderScheduler?.schedule({ runId });
1839
- },
1840
- (error) => {
1841
- logInternalError("register.crewWatcher.error", error);
1842
- closeWatcher(crewWatcher);
1843
- crewWatcher = undefined;
1844
- },
1845
- );
1846
- if (watcher) crewWatcher = watcher;
1863
+ crewRunWatchers?.closeAll();
1864
+ crewRunWatchers = undefined;
1865
+ const crewRunsDir = path.join(projectCrewRoot(ctx.cwd), "state", "runs");
1866
+ if (fs.existsSync(crewRunsDir)) {
1867
+ crewRunWatchers = new RunWatcherRegistry();
1868
+ crewRunWatchers.setRootWatcher(crewRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
1869
+ }
1847
1870
  } catch (error) {
1848
- logInternalError("register.crewWatcher.start", error);
1871
+ logInternalError("register.crewRunWatchers.start", error);
1849
1872
  }
1850
- // Also watch user-level state dir — fast-fix and other user-scoped runs
1851
- // write manifests there. Without this watcher, runs completing in user-level
1873
+ // Also watch user-level runs dir — fast-fix and other user-scoped runs
1874
+ // write manifests there. Without this, runs completing in user-level
1852
1875
  // state never trigger cache invalidation, causing ghost "running" entries.
1853
1876
  try {
1854
- closeWatcher(userCrewWatcher);
1855
- userCrewWatcher = undefined;
1856
- const userStateDir = path.join(userCrewRoot(), "state");
1857
- if (fs.existsSync(userStateDir)) {
1858
- const userWatcher = watchCrewState(
1859
- userStateDir,
1860
- (runId) => {
1861
- if (cleanedUp || sessionGeneration !== ownerGeneration)
1862
- return;
1863
- const sc = getRunSnapshotCache(
1864
- currentCtx?.cwd ?? process.cwd(),
1865
- );
1866
- sc.invalidate(runId);
1867
- renderScheduler?.schedule({ runId });
1868
- },
1869
- (error) => {
1870
- logInternalError(
1871
- "register.userCrewWatcher.error",
1872
- error,
1873
- );
1874
- closeWatcher(userCrewWatcher);
1875
- userCrewWatcher = undefined;
1876
- },
1877
- );
1878
- if (userWatcher) userCrewWatcher = userWatcher;
1877
+ userCrewWatchers?.closeAll();
1878
+ userCrewWatchers = undefined;
1879
+ const userRunsDir = path.join(userCrewRoot(), "state", "runs");
1880
+ if (fs.existsSync(userRunsDir)) {
1881
+ userCrewWatchers = new RunWatcherRegistry();
1882
+ userCrewWatchers.setRootWatcher(userRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
1879
1883
  }
1880
1884
  } catch (error) {
1881
- logInternalError("register.userCrewWatcher.start", error);
1885
+ logInternalError("register.userCrewWatchers.start", error);
1882
1886
  }
1887
+ // Kick an immediate preload so the first buildFrame reconciles per-run
1888
+ // watchers for any runs that are already active on session start.
1889
+ backgroundPreload();
1883
1890
  });
1884
1891
  pi.on("session_before_switch", () => {
1885
1892
  sessionGeneration++;
@@ -2065,4 +2072,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
2065
2072
  // (The crew autocomplete provider is registered from session_start once
2066
2073
  // a UI context is available — see the session_start handler below.)
2067
2074
  registerCrewShortcuts(pi);
2075
+
2076
+ // GAP-2 (Round 11): ambient crew-status injection. Registers a `context`
2077
+ // event handler that appends a compact in-flight-runs note to the agent
2078
+ // context on every LLM call, so the agent never "forgets" active runs.
2079
+ // Transient per-call (does not pollute history), and a no-op when no runs
2080
+ // are in-flight. Toggle via runtime.reliability.ambientStatusInjection.
2081
+ registerContextStatusInjection(pi, {
2082
+ enabled: loadConfig(process.cwd()).config.reliability?.ambientStatusInjection !== false,
2083
+ });
2068
2084
  }
@@ -24,6 +24,7 @@ import { resolveRealContainedPath } from "../../utils/safe-paths.ts";
24
24
  import type { PiTeamsToolResult } from "../tool-result.ts";
25
25
  import { locateRunCwd } from "../team-tool.ts";
26
26
  import { configRecord, result, type TeamContext } from "./context.ts";
27
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
27
28
 
28
29
  export function globMatch(value: string, pattern: string): boolean {
29
30
  // Prevent ReDoS: reject excessively long patterns
@@ -91,9 +92,9 @@ export async function handleApi(params: TeamToolParamsValue, ctx: TeamContext):
91
92
  }
92
93
  if (!params.runId) return result("API requires runId.", { action: "api", status: "error" }, true);
93
94
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
94
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
95
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
95
96
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
96
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
97
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
97
98
  if (operation === "read-manifest") {
98
99
  return result(JSON.stringify(loaded.manifest, null, 2), { action: "api", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
99
100
  }
@@ -12,6 +12,7 @@ import { executeHook, appendHookEvent } from "../../hooks/registry.ts";
12
12
  import type { PiTeamsToolResult } from "../tool-result.ts";
13
13
  import { locateRunCwd } from "../team-tool.ts";
14
14
  import { result, type TeamContext } from "./context.ts";
15
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
15
16
  import { enforceDestructiveIntent, intentFromConfig } from "./intent-policy.ts";
16
17
  import { invalidateSnapshot, type CacheControlDeps } from "./cache-control.ts";
17
18
 
@@ -80,9 +81,9 @@ function cancelReasonFromParams(params: TeamToolParamsValue): CancellationReason
80
81
  export async function handleRetry(params: TeamToolParamsValue, ctx: TeamContext, deps?: CacheControlDeps): Promise<PiTeamsToolResult> {
81
82
  if (!params.runId) return result("Retry requires runId.", { action: "retry", status: "error" }, true);
82
83
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
83
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
84
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
84
85
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
85
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
86
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
86
87
 
87
88
  // Pre-lock ownership check: reject foreign-owned runs unless force is set
88
89
  const foreignRun = typeof loaded.manifest.ownerSessionId === "string" && loaded.manifest.ownerSessionId !== ctx.sessionId;
@@ -145,9 +146,9 @@ export async function handleCancel(params: TeamToolParamsValue, ctx: TeamContext
145
146
  if (intentError) return intentError;
146
147
  if (!params.runId) return result("Cancel requires runId.", { action: "cancel", status: "error" }, true);
147
148
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
148
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
149
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
149
150
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
150
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
151
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
151
152
 
152
153
  // Pre-lock ownership check: reject foreign-owned runs unless force is set
153
154
  const preCheck = abortOwned(loaded.manifest.runId, undefined, ctx, params.force);
@@ -1,4 +1,5 @@
1
1
  import * as fs from "node:fs";
2
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
2
3
  import * as path from "node:path";
3
4
  import { loadRunManifestById } from "../../state/state-store.ts";
4
5
  import type { TeamRunManifest, TeamTaskState } from "../../state/types.ts";
@@ -211,7 +212,7 @@ export function handleExplain(params: {
211
212
 
212
213
  const loaded = loadRunManifestById(cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
213
214
  if (!loaded) {
214
- return result(`Run '${params.runId}' not found.`, { action: "explain", status: "error" }, true);
215
+ return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "explain", status: "error" }, true);
215
216
  }
216
217
 
217
218
  const { manifest, tasks } = loaded;
@@ -0,0 +1,124 @@
1
+ /**
2
+ * failure-patterns.ts — Group failed tasks by error similarity (Round 17 BS-4).
3
+ *
4
+ * Before this, a run with 8 failed tasks surfaced 8 separate raw error
5
+ * strings. The user had to mentally group them ("5 of these say 'model
6
+ * routing fallback failed'"). This module detects common failure patterns
7
+ * so `summary` can say "5 of 8 failures share root cause: X".
8
+ *
9
+ * Grouping strategy (cheap, deterministic, no ML):
10
+ * 1. Normalize: lowercase, collapse whitespace, strip task ids / run ids /
11
+ * absolute paths / numbers → a canonical "signature".
12
+ * 2. Bucket by signature. Buckets with >1 member are "common patterns".
13
+ * 3. Sort by frequency desc.
14
+ *
15
+ * Conservative: only buckets with >=2 members count as a pattern (a single
16
+ * failure is just itself). Returns [] when there are no repeated signatures.
17
+ */
18
+
19
+ export interface FailurePattern {
20
+ /** Canonical error signature (output of normalizeErrorSignature) used as the bucket key. */
21
+ signature: string;
22
+ /** A representative original error (the shortest variant seen in the bucket) for display. */
23
+ representative: string;
24
+ /** Ids of the tasks that hit this pattern, in input order. */
25
+ taskIds: string[];
26
+ /** Count of failures in this bucket (== taskIds.length). */
27
+ count: number;
28
+ }
29
+
30
+ export interface FailurePatternInput {
31
+ id: string; // task id, reported back in FailurePattern.taskIds
32
+ status: string; // only "failed" and "cancelled" tasks are aggregated
33
+ error?: string; // raw error text; a missing value is grouped under "(no error detail)"
34
+ }
35
+
/**
 * Normalize an error string into a grouping signature.
 *
 * The text is lowercased and every run-specific detail is replaced with a
 * placeholder so errors that differ only in those details share a signature:
 *   - run ids (team_YYYYMMDDHHMMSS_xxxxxxxxxxxxxxxx)        -> `<run>`
 *   - ISO-8601 timestamps                                    -> `<time>`
 *   - task ids (01_explore, adaptive-03-executor, ...)       -> `<task>`
 *   - absolute paths under common Unix roots                 -> `<path>`
 *   - numbers, optionally with an ms/s/m/h unit suffix       -> `N`
 * Whitespace runs are then collapsed to single spaces.
 *
 * A task id must be digits followed by `_` or `-` and a letter, so plain
 * numbers and durations ("12345", "30000ms") are normalized as numbers rather
 * than mistaken for task ids; otherwise "pid 99" and "pid 12345" would end up
 * with different signatures.
 *
 * Exported for unit testing.
 *
 * @param error raw error text, if any
 * @returns the signature, or "(no error detail)" when there is no usable text
 */
export function normalizeErrorSignature(error: string | undefined): string {
	if (!error) return "(no error detail)";
	const signature = error
		.toLowerCase()
		// Run ids (team_YYYYMMDDHHMMSS_xxxxxxxxxxxxxxxx)
		.replace(/team_\d{8,}_[a-z0-9]{12,}/g, "<run>")
		// ISO timestamps, handled before task ids/numbers so they collapse as a unit
		.replace(/\b\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:z|[+-]\d{2}:?\d{2})?/g, "<time>")
		// Task ids (01_explore, adaptive-03-executor, etc.)
		.replace(/\b(?:adaptive-)?\d{2,}[_-][a-z][a-z0-9_-]*/g, "<task>")
		// Absolute paths
		.replace(/\/(?:home|users|tmp|var|opt|root)[^\s'"]*/g, "<path>")
		// Numbers (line numbers, counts, pids) and durations such as 5ms / 30s
		.replace(/\b\d+(?:\.\d+)?(?:ms|s|m|h)?\b/g, "N")
		// Collapse whitespace
		.replace(/\s+/g, " ")
		.trim();
	return signature || "(no error detail)";
}
55
+
/**
 * Group failed tasks by error-pattern similarity. Only groups with >=2
 * members are returned (singletons are not "patterns"), sorted by count
 * descending.
 *
 * Each group's representative is the shortest original error text among its
 * members. Errors that are missing, empty, or whitespace-only carry no
 * detail: they are grouped together and shown as "(no error detail)" rather
 * than producing a blank representative.
 *
 * @param tasks the run's tasks (any with status 'failed'/'cancelled' are
 *   considered failures for aggregation purposes).
 * @returns the repeated patterns, or [] when no signature occurs twice
 */
export function aggregateFailurePatterns(tasks: FailurePatternInput[]): FailurePattern[] {
	const buckets = new Map<string, FailurePattern>();
	for (const task of tasks) {
		if (task.status !== "failed" && task.status !== "cancelled") continue;
		// Blank text is treated like an absent error so it can never become
		// (or replace) a bucket's representative.
		const detail = task.error !== undefined && task.error.trim() !== "" ? task.error : undefined;
		const signature = normalizeErrorSignature(detail);
		const bucket = buckets.get(signature);
		if (!bucket) {
			buckets.set(signature, {
				signature,
				representative: detail ?? "(no error detail)",
				taskIds: [task.id],
				count: 1,
			});
			continue;
		}
		bucket.taskIds.push(task.id);
		bucket.count += 1;
		// Keep the shortest variant as representative (most readable).
		if (detail !== undefined && detail.length < bucket.representative.length) {
			bucket.representative = detail;
		}
	}
	// Only patterns with >=2 members (repeated root causes).
	return [...buckets.values()]
		.filter((b) => b.count >= 2)
		.sort((a, b) => b.count - a.count);
}
93
+
/**
 * Render failure patterns as human-readable lines for the `summary` action.
 * Returns [] when there are no repeated patterns (so the caller can omit the
 * section entirely).
 *
 * Each pattern contributes exactly two lines: a bullet with the count and the
 * representative error (flattened to one line and cut to 100 characters), and
 * an indented list of up to 6 task ids with a "+N more" suffix.
 *
 * Example output:
 *   Common failure patterns (3 of 5 failures share 2 root causes):
 *   - [×3] model routing fallback failed: all 2 candidates exhausted
 *     tasks: 02_exec, 03_exec, 04_exec
 *   - [×2] EPERM: operation not permitted, rename
 *     tasks: 05_exec, 06_exec
 *
 * @param tasks the run's tasks; only 'failed'/'cancelled' ones are counted
 * @returns the lines to print, header first
 */
export function formatFailurePatterns(tasks: FailurePatternInput[]): string[] {
	const patterns = aggregateFailurePatterns(tasks);
	if (patterns.length === 0) return [];
	const failedCount = tasks.filter(
		(t) => t.status === "failed" || t.status === "cancelled",
	).length;
	const groupedCount = patterns.reduce((sum, p) => sum + p.count, 0);
	const lines = [
		`Common failure patterns (${groupedCount} of ${failedCount} failures share ${patterns.length} root cause${patterns.length === 1 ? "" : "s"}):`,
	];
	for (const p of patterns) {
		// Errors are often multi-line (stack traces); flatten them so every
		// pattern stays on a single bullet line, and never print a blank one.
		const flat = p.representative.replace(/\s+/g, " ").trim() || "(no error detail)";
		const rep = flat.length > 100 ? `${flat.slice(0, 99)}…` : flat;
		lines.push(`- [×${p.count}] ${rep}`);
		const shown = p.taskIds.slice(0, 6);
		const more = p.taskIds.length > 6 ? `, +${p.taskIds.length - 6} more` : "";
		lines.push(`  tasks: ${shown.join(", ")}${more}`);
	}
	return lines;
}