pi-crew 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +71 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/extension/action-suggestions.ts +71 -0
  9. package/src/extension/context-status-injection.ts +32 -1
  10. package/src/extension/register.ts +71 -65
  11. package/src/extension/team-tool/api.ts +3 -2
  12. package/src/extension/team-tool/cancel.ts +5 -4
  13. package/src/extension/team-tool/explain.ts +2 -1
  14. package/src/extension/team-tool/failure-patterns.ts +124 -0
  15. package/src/extension/team-tool/inspect.ts +10 -6
  16. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  17. package/src/extension/team-tool/respond.ts +4 -3
  18. package/src/extension/team-tool/run-not-found.ts +54 -0
  19. package/src/extension/team-tool/run.ts +26 -4
  20. package/src/extension/team-tool/status.ts +58 -4
  21. package/src/extension/team-tool.ts +5 -3
  22. package/src/runtime/async-runner.ts +7 -0
  23. package/src/runtime/background-runner.ts +7 -1
  24. package/src/runtime/chain-parser.ts +13 -5
  25. package/src/runtime/checkpoint.ts +13 -1
  26. package/src/runtime/child-pi.ts +9 -1
  27. package/src/runtime/crash-recovery.ts +21 -1
  28. package/src/runtime/live-session-runtime.ts +15 -1
  29. package/src/runtime/parent-guard.ts +2 -2
  30. package/src/runtime/pi-spawn.ts +66 -0
  31. package/src/runtime/stale-reconciler.ts +38 -3
  32. package/src/runtime/task-runner.ts +10 -1
  33. package/src/runtime/team-runner.ts +19 -2
  34. package/src/runtime/verification-gates.ts +21 -1
  35. package/src/schema/team-tool-schema.ts +9 -0
  36. package/src/state/blob-store.ts +12 -10
  37. package/src/state/event-log-rotation.ts +114 -93
  38. package/src/state/event-log.ts +79 -20
  39. package/src/state/health-store.ts +6 -1
  40. package/src/state/locks.ts +66 -16
  41. package/src/state/state-store.ts +14 -1
  42. package/src/ui/card-colors.ts +7 -3
  43. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  44. package/src/ui/live-duration.ts +58 -0
  45. package/src/ui/tool-render.ts +7 -11
  46. package/src/ui/tool-renderers/index.ts +6 -3
  47. package/src/ui/widget/widget-formatters.ts +2 -13
  48. package/src/utils/fs-watch.ts +11 -60
  49. package/src/utils/run-watcher-registry.ts +164 -0
  50. package/src/workflows/discover-workflows.ts +2 -1
  51. package/src/workflows/workflow-config.ts +5 -0
  52. package/src/runtime/dynamic-script-runner.ts +0 -497
  53. package/src/runtime/sandbox.ts +0 -335
@@ -82,7 +82,8 @@ import {
82
82
  import { RenderScheduler } from "../ui/render-scheduler.ts";
83
83
  import { runEventBus } from "../ui/run-event-bus.ts";
84
84
  import { createRunSnapshotCache } from "../ui/run-snapshot-cache.ts";
85
- import { closeWatcher, watchCrewState } from "../utils/fs-watch.ts";
85
+ import { closeWatcher } from "../utils/fs-watch.ts";
86
+ import { RunWatcherRegistry } from "../utils/run-watcher-registry.ts";
86
87
  import { logInternalError } from "../utils/internal-error.ts";
87
88
  import {
88
89
  clearProjectRootCache,
@@ -725,8 +726,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
725
726
  // Linux), file changes (manifest/tasks/events/agents) trigger an
726
727
  // immediate cache invalidate via renderScheduler.schedule. Falls back to
727
728
  // poll-only behavior on systems where fs.watch errors.
728
- let crewWatcher: import("node:fs").FSWatcher | undefined;
729
- let userCrewWatcher: import("node:fs").FSWatcher | undefined;
729
+ // pts/2 hang fix (2026-06-16): the previous RECURSIVE fs.watch(<state>, {recursive:true})
730
+ // exploded to O(total run history) inotify watches on Linux (109→339 observed) and
731
+ // caused a permanent busy-loop. Replaced with bounded per-active-run watchers via
732
+ // RunWatcherRegistry (root watcher on runs/ for new-run detection + one non-recursive
733
+ // watcher per active run, reconciled each preload tick in buildFrame).
734
+ let crewRunWatchers: RunWatcherRegistry | undefined;
735
+ let userCrewWatchers: RunWatcherRegistry | undefined;
730
736
  // Separate map for foreground team-run AbortControllers (distinct from subagent controllers).
731
737
  // P0 fix: stopSessionBoundSubagents must NOT abort foreground team runs on session switch.
732
738
  // Foreground team runs run in the same process as the session; they naturally clean up
@@ -1116,10 +1122,10 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1116
1122
  clearTimeout(preloadTimer);
1117
1123
  preloadTimer = undefined;
1118
1124
  }
1119
- closeWatcher(crewWatcher);
1120
- crewWatcher = undefined;
1121
- closeWatcher(userCrewWatcher);
1122
- userCrewWatcher = undefined;
1125
+ crewRunWatchers?.closeAll();
1126
+ crewRunWatchers = undefined;
1127
+ userCrewWatchers?.closeAll();
1128
+ userCrewWatchers = undefined;
1123
1129
  stopSessionBoundSubagents();
1124
1130
  // P0 fix: also abort foreground team runs on session shutdown (not on session switch).
1125
1131
  // This is the only place where foreground team run controllers should be aborted.
@@ -1590,6 +1596,25 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1590
1596
  lastFrameSnapshotCache = getRunSnapshotCache(currentCtx.cwd);
1591
1597
  const manifests = lastFrameManifestCache.list(20);
1592
1598
  lastPreloadedManifests = manifests;
1599
+ // pts/2 hang fix: reconcile per-run watchers against the ACTIVE set only.
1600
+ // This bounds inotify cost to O(active runs) — completed runs stop being
1601
+ // watched as soon as they leave running/queued/planning status, instead of
1602
+ // the recursive watcher watching the entire run history forever.
1603
+ {
1604
+ const onRunChange = (runId: string): void => {
1605
+ if (cleanedUp || sessionGeneration !== ownerGeneration) return;
1606
+ getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
1607
+ renderScheduler?.schedule({ runId });
1608
+ };
1609
+ const onWatchErr = (error: unknown): void => {
1610
+ logInternalError("register.runWatcher.change", error);
1611
+ };
1612
+ const active = manifests
1613
+ .filter((r) => r.status === "running" || r.status === "queued" || r.status === "planning")
1614
+ .map((r) => ({ runId: r.runId, runDir: r.stateRoot }));
1615
+ crewRunWatchers?.reconcile(active, onRunChange, onWatchErr);
1616
+ userCrewWatchers?.reconcile(active, onRunChange, onWatchErr);
1617
+ }
1593
1618
  const runIds = manifests.map((r) => r.runId);
1594
1619
  await lastFrameSnapshotCache.preloadAllStale(runIds);
1595
1620
  return true;
@@ -1815,72 +1840,53 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1815
1840
  renderSchedulerUnsubscribers.push(unsubscribeRunEvents);
1816
1841
  // Start async preload loop — refreshes snapshot cache in background
1817
1842
  startPreloadLoop(fallbackMs, effectiveRefreshMs);
1818
- // 1.3: native FS watcher on `<crewRoot>/state`. Triggers an immediate
1819
- // renderScheduler.schedule({runId}) when files inside any run change so
1820
- // the snapshot cache invalidates well before the 1s preload tick. Falls
1821
- // back silently to poll-only behavior on systems where recursive
1822
- // fs.watch is not supported.
1843
+ // 1.3: BOUNDED run watcher (pts/2 hang fix 2026-06-16). Previously this was
1844
+ // a RECURSIVE fs.watch(<state>, {recursive:true}) which on Linux expands to
1845
+ // ONE inotify watch PER SUBDIR. With many historical runs under
1846
+ // .crew/state/runs/, this ballooned to hundreds of watches (109→339 observed)
1847
+ // and the event volume caused a permanent busy-loop (71% CPU, 400KB/s read).
1848
+ // Now: a single non-recursive watcher on the runs/ ROOT (to detect new run
1849
+ // dirs appearing — crew.run.created is never emitted) plus per-active-run
1850
+ // watchers reconciled each preload tick in buildFrame. Total inotify cost is
1851
+ // O(active runs), not O(total history). Falls back to poll-only (the preload
1852
+ // loop already polls every effectiveRefreshMs) on systems where fs.watch
1853
+ // errors or the runs dir is absent.
1854
+ const crewRunWatcherOnChange = (runId: string): void => {
1855
+ if (cleanedUp || sessionGeneration !== ownerGeneration) return;
1856
+ getRunSnapshotCache(currentCtx?.cwd ?? process.cwd()).invalidate(runId);
1857
+ renderScheduler?.schedule({ runId });
1858
+ };
1859
+ const crewRunWatcherOnError = (error: unknown): void => {
1860
+ logInternalError("register.crewRunWatchers.error", error);
1861
+ };
1823
1862
  try {
1824
- closeWatcher(crewWatcher);
1825
- crewWatcher = undefined;
1826
- const stateDir = path.join(projectCrewRoot(ctx.cwd), "state");
1827
- const watcher = watchCrewState(
1828
- stateDir,
1829
- (runId) => {
1830
- if (cleanedUp || sessionGeneration !== ownerGeneration)
1831
- return;
1832
- // Invalidate snapshot cache so the next renderTick reads fresh state from disk.
1833
- // Without this, renderTick re-renders from stale lastPreloadedManifests and
1834
- // shows ghost "running" entries for runs that already completed on disk.
1835
- const sc = getRunSnapshotCache(
1836
- currentCtx?.cwd ?? process.cwd(),
1837
- );
1838
- sc.invalidate(runId);
1839
- renderScheduler?.schedule({ runId });
1840
- },
1841
- (error) => {
1842
- logInternalError("register.crewWatcher.error", error);
1843
- closeWatcher(crewWatcher);
1844
- crewWatcher = undefined;
1845
- },
1846
- );
1847
- if (watcher) crewWatcher = watcher;
1863
+ crewRunWatchers?.closeAll();
1864
+ crewRunWatchers = undefined;
1865
+ const crewRunsDir = path.join(projectCrewRoot(ctx.cwd), "state", "runs");
1866
+ if (fs.existsSync(crewRunsDir)) {
1867
+ crewRunWatchers = new RunWatcherRegistry();
1868
+ crewRunWatchers.setRootWatcher(crewRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
1869
+ }
1848
1870
  } catch (error) {
1849
- logInternalError("register.crewWatcher.start", error);
1871
+ logInternalError("register.crewRunWatchers.start", error);
1850
1872
  }
1851
- // Also watch user-level state dir — fast-fix and other user-scoped runs
1852
- // write manifests there. Without this watcher, runs completing in user-level
1873
+ // Also watch user-level runs dir — fast-fix and other user-scoped runs
1874
+ // write manifests there. Without this, runs completing in user-level
1853
1875
  // state never trigger cache invalidation, causing ghost "running" entries.
1854
1876
  try {
1855
- closeWatcher(userCrewWatcher);
1856
- userCrewWatcher = undefined;
1857
- const userStateDir = path.join(userCrewRoot(), "state");
1858
- if (fs.existsSync(userStateDir)) {
1859
- const userWatcher = watchCrewState(
1860
- userStateDir,
1861
- (runId) => {
1862
- if (cleanedUp || sessionGeneration !== ownerGeneration)
1863
- return;
1864
- const sc = getRunSnapshotCache(
1865
- currentCtx?.cwd ?? process.cwd(),
1866
- );
1867
- sc.invalidate(runId);
1868
- renderScheduler?.schedule({ runId });
1869
- },
1870
- (error) => {
1871
- logInternalError(
1872
- "register.userCrewWatcher.error",
1873
- error,
1874
- );
1875
- closeWatcher(userCrewWatcher);
1876
- userCrewWatcher = undefined;
1877
- },
1878
- );
1879
- if (userWatcher) userCrewWatcher = userWatcher;
1877
+ userCrewWatchers?.closeAll();
1878
+ userCrewWatchers = undefined;
1879
+ const userRunsDir = path.join(userCrewRoot(), "state", "runs");
1880
+ if (fs.existsSync(userRunsDir)) {
1881
+ userCrewWatchers = new RunWatcherRegistry();
1882
+ userCrewWatchers.setRootWatcher(userRunsDir, crewRunWatcherOnChange, crewRunWatcherOnError);
1880
1883
  }
1881
1884
  } catch (error) {
1882
- logInternalError("register.userCrewWatcher.start", error);
1885
+ logInternalError("register.userCrewWatchers.start", error);
1883
1886
  }
1887
+ // Kick an immediate preload so the first buildFrame reconciles per-run
1888
+ // watchers for any runs that are already active on session start.
1889
+ backgroundPreload();
1884
1890
  });
1885
1891
  pi.on("session_before_switch", () => {
1886
1892
  sessionGeneration++;
@@ -24,6 +24,7 @@ import { resolveRealContainedPath } from "../../utils/safe-paths.ts";
24
24
  import type { PiTeamsToolResult } from "../tool-result.ts";
25
25
  import { locateRunCwd } from "../team-tool.ts";
26
26
  import { configRecord, result, type TeamContext } from "./context.ts";
27
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
27
28
 
28
29
  export function globMatch(value: string, pattern: string): boolean {
29
30
  // Prevent ReDoS: reject excessively long patterns
@@ -91,9 +92,9 @@ export async function handleApi(params: TeamToolParamsValue, ctx: TeamContext):
91
92
  }
92
93
  if (!params.runId) return result("API requires runId.", { action: "api", status: "error" }, true);
93
94
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
94
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
95
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
95
96
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
96
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
97
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "api", status: "error" }, true);
97
98
  if (operation === "read-manifest") {
98
99
  return result(JSON.stringify(loaded.manifest, null, 2), { action: "api", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
99
100
  }
@@ -12,6 +12,7 @@ import { executeHook, appendHookEvent } from "../../hooks/registry.ts";
12
12
  import type { PiTeamsToolResult } from "../tool-result.ts";
13
13
  import { locateRunCwd } from "../team-tool.ts";
14
14
  import { result, type TeamContext } from "./context.ts";
15
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
15
16
  import { enforceDestructiveIntent, intentFromConfig } from "./intent-policy.ts";
16
17
  import { invalidateSnapshot, type CacheControlDeps } from "./cache-control.ts";
17
18
 
@@ -80,9 +81,9 @@ function cancelReasonFromParams(params: TeamToolParamsValue): CancellationReason
80
81
  export async function handleRetry(params: TeamToolParamsValue, ctx: TeamContext, deps?: CacheControlDeps): Promise<PiTeamsToolResult> {
81
82
  if (!params.runId) return result("Retry requires runId.", { action: "retry", status: "error" }, true);
82
83
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
83
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
84
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
84
85
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
85
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "retry", status: "error" }, true);
86
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "retry", status: "error" }, true);
86
87
 
87
88
  // Pre-lock ownership check: reject foreign-owned runs unless force is set
88
89
  const foreignRun = typeof loaded.manifest.ownerSessionId === "string" && loaded.manifest.ownerSessionId !== ctx.sessionId;
@@ -145,9 +146,9 @@ export async function handleCancel(params: TeamToolParamsValue, ctx: TeamContext
145
146
  if (intentError) return intentError;
146
147
  if (!params.runId) return result("Cancel requires runId.", { action: "cancel", status: "error" }, true);
147
148
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
148
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
149
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
149
150
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
150
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "cancel", status: "error" }, true);
151
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cancel", status: "error" }, true);
151
152
 
152
153
  // Pre-lock ownership check: reject foreign-owned runs unless force is set
153
154
  const preCheck = abortOwned(loaded.manifest.runId, undefined, ctx, params.force);
@@ -1,4 +1,5 @@
1
1
  import * as fs from "node:fs";
2
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
2
3
  import * as path from "node:path";
3
4
  import { loadRunManifestById } from "../../state/state-store.ts";
4
5
  import type { TeamRunManifest, TeamTaskState } from "../../state/types.ts";
@@ -211,7 +212,7 @@ export function handleExplain(params: {
211
212
 
212
213
  const loaded = loadRunManifestById(cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
213
214
  if (!loaded) {
214
- return result(`Run '${params.runId}' not found.`, { action: "explain", status: "error" }, true);
215
+ return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "explain", status: "error" }, true);
215
216
  }
216
217
 
217
218
  const { manifest, tasks } = loaded;
@@ -0,0 +1,124 @@
1
/**
 * failure-patterns.ts — Group failed tasks by error similarity (Round 17 BS-4).
 *
 * Before this, a run with 8 failed tasks surfaced 8 separate raw error
 * strings. The user had to mentally group them ("5 of these say 'model
 * routing fallback failed'"). This module detects common failure patterns
 * so `summary` can say "5 of 8 failures share root cause: X".
 *
 * Grouping strategy (cheap, deterministic, no ML):
 *   1. Normalize: lowercase, collapse whitespace, strip run ids / timestamps /
 *      task ids / absolute paths / numbers → a canonical "signature".
 *   2. Bucket by signature. Buckets with >1 member are "common patterns".
 *   3. Sort by frequency desc.
 *
 * Conservative: only buckets with >=2 members count as a pattern (a single
 * failure is just itself). Returns [] when there are no repeated signatures.
 */

/** One group of failures that normalize to the same error signature. */
export interface FailurePattern {
  /** Canonical error signature used for grouping. */
  signature: string;
  /** A representative original error (the shortest variant) for display. */
  representative: string;
  /** Task ids that hit this pattern, in input order. */
  taskIds: string[];
  /** Count of failures in this bucket (== taskIds.length). */
  count: number;
}

/** Minimal task shape needed for failure aggregation. */
export interface FailurePatternInput {
  /** Task id, echoed back in `FailurePattern.taskIds`. */
  id: string;
  /** Task status; only "failed" and "cancelled" are treated as failures. */
  status: string;
  /** Raw error text, if the task recorded one. */
  error?: string;
}

/** Signature and display text used when a failure carries no usable error text. */
const NO_ERROR_DETAIL = "(no error detail)";
/** Longest representative (in code points) printed before truncation. */
const MAX_REPRESENTATIVE_CHARS = 100;
/** Maximum number of task ids listed per pattern before "+N more". */
const MAX_TASK_IDS_SHOWN = 6;

/** True for the statuses this module aggregates as failures. */
function isFailure(task: FailurePatternInput): boolean {
  return task.status === "failed" || task.status === "cancelled";
}

/**
 * Normalize an error string into a grouping signature.
 * Exported for unit testing.
 *
 * Two errors that differ only in run id, timestamp, task id, absolute path,
 * numeric value or whitespace produce the same signature.
 *
 * @param error raw error text; `undefined`, empty or whitespace-only input
 *   yields the "(no error detail)" signature.
 * @returns the lowercase canonical signature.
 */
export function normalizeErrorSignature(error: string | undefined): string {
  if (!error) return NO_ERROR_DETAIL;
  let s = error.toLowerCase();
  // Strip run ids (team_YYYYMMDDHHMMSS_xxxxxxxxxxxxxxxx)
  s = s.replace(/team_\d{8,}_[a-z0-9]{12,}/g, "<run>");
  // Strip ISO-8601 timestamps before the numeric passes can split them apart.
  s = s.replace(
    /\b\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:z|[+-]\d{2}:?\d{2})?/g,
    "<time>",
  );
  // Strip task ids (01_explore, adaptive-03-executor, etc.). A separator and a
  // letter are required after the digits so that plain numbers ("137") and
  // durations ("500ms") are NOT mistaken for task ids — otherwise the same
  // error with a different number would land in a different bucket.
  s = s.replace(/\b(?:adaptive-)?\d{2,}[_-][a-z][a-z0-9_-]*/g, "<task>");
  // Strip absolute paths
  s = s.replace(/\/(?:home|users|tmp|var|opt|root)[^\s'"]*/g, "<path>");
  // Strip numbers (line numbers, counts, pids, durations). The lookahead lets a
  // number directly followed by a common unit ("500ms", "3s", "12mb") match
  // while keeping the unit in the signature.
  s = s.replace(/\b\d+(?=(?:ms|s|m|h|kb|mb|gb)?\b)/g, "N");
  // Collapse whitespace
  s = s.replace(/\s+/g, " ").trim();
  return s || NO_ERROR_DETAIL;
}

/**
 * Group failed tasks by error-pattern similarity. Only groups with >=2
 * members are returned (singletons are not "patterns"). Sorted by count desc;
 * ties keep first-seen order.
 *
 * @param tasks the run's tasks (any with status 'failed'/'cancelled' are
 *   considered failures for aggregation purposes).
 * @returns the repeated patterns, or [] when no signature repeats.
 */
export function aggregateFailurePatterns(tasks: FailurePatternInput[]): FailurePattern[] {
  const buckets = new Map<string, FailurePattern>();
  for (const task of tasks) {
    if (!isFailure(task)) continue;
    const signature = normalizeErrorSignature(task.error);
    const existing = buckets.get(signature);
    if (existing === undefined) {
      buckets.set(signature, {
        signature,
        representative: task.error ?? NO_ERROR_DETAIL,
        taskIds: [task.id],
        count: 1,
      });
      continue;
    }
    existing.taskIds.push(task.id);
    existing.count += 1;
    // Keep the shortest non-empty variant as representative (most readable).
    if (task.error && (!existing.representative || task.error.length < existing.representative.length)) {
      existing.representative = task.error;
    }
  }
  // Only patterns with >=2 members (repeated root causes).
  return [...buckets.values()]
    .filter((bucket) => bucket.count >= 2)
    .sort((a, b) => b.count - a.count);
}

/**
 * Render failure patterns as human-readable lines for the `summary` action.
 * Returns [] when there are no repeated patterns (so the caller can omit the
 * section entirely).
 *
 * Each representative error is flattened to a single line (stack traces and
 * other multi-line errors would otherwise break the bullet list) and truncated
 * by code point so a surrogate pair is never cut in half.
 *
 * Example output:
 *   Common failure patterns (5 of 5 failures share 2 root causes):
 *   - [×3] model routing fallback failed: all 2 candidates exhausted
 *    tasks: 02_exec, 03_exec, 04_exec
 *   - [×2] EPERM: operation not permitted, rename
 *    tasks: 05_exec, 06_exec
 *
 * @param tasks the run's tasks; non-failure statuses are ignored.
 * @returns one header line plus two lines per pattern, or [].
 */
export function formatFailurePatterns(tasks: FailurePatternInput[]): string[] {
  const patterns = aggregateFailurePatterns(tasks);
  if (patterns.length === 0) return [];
  const failedCount = tasks.filter(isFailure).length;
  const groupedCount = patterns.reduce((sum, p) => sum + p.count, 0);
  const lines = [
    `Common failure patterns (${groupedCount} of ${failedCount} failures share ${patterns.length} root cause${patterns.length === 1 ? "" : "s"}):`,
  ];
  for (const p of patterns) {
    // Flatten to one line; fall back to the placeholder for empty error text.
    const oneLine = p.representative.replace(/\s+/g, " ").trim() || NO_ERROR_DETAIL;
    const chars = Array.from(oneLine);
    const rep =
      chars.length > MAX_REPRESENTATIVE_CHARS
        ? `${chars.slice(0, MAX_REPRESENTATIVE_CHARS - 1).join("")}…`
        : oneLine;
    lines.push(`- [×${p.count}] ${rep}`);
    const shown = p.taskIds.slice(0, MAX_TASK_IDS_SHOWN);
    const more =
      p.taskIds.length > MAX_TASK_IDS_SHOWN ? `, +${p.taskIds.length - MAX_TASK_IDS_SHOWN} more` : "";
    lines.push(` tasks: ${shown.join(", ")}${more}`);
  }
  return lines;
}
@@ -5,13 +5,15 @@ import { aggregateUsage, formatUsage, formatCostReport } from "../../state/usage
5
5
  import type { PiTeamsToolResult } from "../tool-result.ts";
6
6
  import { locateRunCwd } from "../team-tool.ts";
7
7
  import { result, type TeamContext } from "./context.ts";
8
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
9
+ import { formatFailurePatterns } from "./failure-patterns.ts";
8
10
 
9
11
  export function handleEvents(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
10
12
  if (!params.runId) return result("Events requires runId.", { action: "events", status: "error" }, true);
11
13
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
12
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "events", status: "error" }, true);
14
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "events", status: "error" }, true);
13
15
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
14
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "events", status: "error" }, true);
16
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "events", status: "error" }, true);
15
17
  const events = readEvents(loaded.manifest.eventsPath);
16
18
  const lines = [`Events for ${loaded.manifest.runId}:`, ...(events.length ? events.map((event) => `${event.time} ${event.type}${event.taskId ? ` ${event.taskId}` : ""}${event.message ? `: ${event.message}` : ""}${event.data ? ` ${JSON.stringify(event.data)}` : ""}`) : ["(none)"])];
17
19
  return result(lines.join("\n"), { action: "events", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
@@ -20,9 +22,9 @@ export function handleEvents(params: TeamToolParamsValue, ctx: TeamContext): PiT
20
22
  export function handleArtifacts(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
21
23
  if (!params.runId) return result("Artifacts requires runId.", { action: "artifacts", status: "error" }, true);
22
24
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
23
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "artifacts", status: "error" }, true);
25
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "artifacts", status: "error" }, true);
24
26
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
25
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "artifacts", status: "error" }, true);
27
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "artifacts", status: "error" }, true);
26
28
  const lines = [`Artifacts for ${loaded.manifest.runId}:`, ...(loaded.manifest.artifacts.length ? loaded.manifest.artifacts.map((artifact) => `- ${artifact.kind}: ${artifact.path}${artifact.sizeBytes !== undefined ? ` (${artifact.sizeBytes} bytes)` : ""}${artifact.contentHash ? ` sha256=${artifact.contentHash.slice(0, 12)}` : ""}`) : ["- (none)"])];
27
29
  return result(lines.join("\n"), { action: "artifacts", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
28
30
  }
@@ -30,10 +32,11 @@ export function handleArtifacts(params: TeamToolParamsValue, ctx: TeamContext):
30
32
  export function handleSummary(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
31
33
  if (!params.runId) return result("Summary requires runId.", { action: "summary", status: "error" }, true);
32
34
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
33
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "summary", status: "error" }, true);
35
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "summary", status: "error" }, true);
34
36
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
35
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "summary", status: "error" }, true);
37
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "summary", status: "error" }, true);
36
38
  const usage = aggregateUsage(loaded.tasks);
39
+ const failurePatternLines = formatFailurePatterns(loaded.tasks);
37
40
  const lines = [
38
41
  `Summary for ${loaded.manifest.runId}`,
39
42
  `Status: ${loaded.manifest.status}`,
@@ -43,6 +46,7 @@ export function handleSummary(params: TeamToolParamsValue, ctx: TeamContext): Pi
43
46
  `Usage: ${formatUsage(usage)}`,
44
47
  "",
45
48
  formatCostReport(loaded.tasks),
49
+ ...(failurePatternLines.length > 0 ? ["", ...failurePatternLines] : []),
46
50
  "",
47
51
  "Tasks:",
48
52
  ...loaded.tasks.map((task) => `- ${task.id}: ${task.status} (${task.role} -> ${task.agent})${task.error ? ` - ${task.error}` : ""}`),
@@ -9,6 +9,7 @@ import { importRunBundle } from "../run-import.ts";
9
9
  import { pruneFinishedRuns } from "../run-maintenance.ts";
10
10
  import type { PiTeamsToolResult } from "../tool-result.ts";
11
11
  import { configRecord, result, type TeamContext } from "./context.ts";
12
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
12
13
  import { enforceDestructiveIntent, intentFromConfig } from "./intent-policy.ts";
13
14
  import { executeHook, appendHookEvent } from "../../hooks/registry.ts";
14
15
  import { resolveRealContainedPath } from "../../utils/safe-paths.ts";
@@ -18,7 +19,7 @@ import * as path from "node:path";
18
19
  export function handleWorktrees(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
19
20
  if (!params.runId) return result("Worktrees requires runId.", { action: "worktrees", status: "error" }, true);
20
21
  const loaded = loadRunManifestById(ctx.cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
21
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "worktrees", status: "error" }, true);
22
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "worktrees", status: "error" }, true);
22
23
  const withWorktrees = loaded.tasks.filter((task) => task.worktree);
23
24
  const lines = [`Worktrees for ${loaded.manifest.runId}:`, ...(withWorktrees.length ? withWorktrees.map((task) => `- ${task.id}: ${task.worktree!.path} branch=${task.worktree!.branch} reused=${task.worktree!.reused ? "true" : "false"}`) : ["- (none)"])];
24
25
  return result(lines.join("\n"), { action: "worktrees", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
@@ -47,7 +48,7 @@ export function handleImport(params: TeamToolParamsValue, ctx: TeamContext): PiT
47
48
  export async function handleExport(params: TeamToolParamsValue, ctx: TeamContext): Promise<PiTeamsToolResult> {
48
49
  if (!params.runId) return result("Export requires runId.", { action: "export", status: "error" }, true);
49
50
  const loaded = loadRunManifestById(ctx.cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
50
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "export", status: "error" }, true);
51
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "export", status: "error" }, true);
51
52
 
52
53
  // SECURITY: Ownership check — only the owner session may export a run.
53
54
  // Foreign-run export requires confirm: true (explicit user intent).
@@ -96,7 +97,7 @@ export async function handleForget(params: TeamToolParamsValue, ctx: TeamContext
96
97
  if (!params.runId) return result("Forget requires runId.", { action: "forget", status: "error" }, true);
97
98
  if (!params.confirm) return result("forget requires confirm: true.", { action: "forget", status: "error" }, true);
98
99
  const loaded = loadRunManifestById(ctx.cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
99
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "forget", status: "error" }, true);
100
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "forget", status: "error" }, true);
100
101
 
101
102
  // Ownership check — prevent cross-session deletion unless force is set
102
103
  const foreignRun = typeof loaded.manifest.ownerSessionId === "string" && loaded.manifest.ownerSessionId !== ctx.sessionId;
@@ -126,7 +127,7 @@ export async function handleCleanup(params: TeamToolParamsValue, ctx: TeamContex
126
127
  if (intentError) return intentError;
127
128
  if (!params.runId) return result("Cleanup requires runId.", { action: "cleanup", status: "error" }, true);
128
129
  const loaded = loadRunManifestById(ctx.cwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
129
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "cleanup", status: "error" }, true);
130
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "cleanup", status: "error" }, true);
130
131
 
131
132
  // Ownership check — prevent cross-session worktree cleanup unless force is set
132
133
  const foreignRun = typeof loaded.manifest.ownerSessionId === "string" && loaded.manifest.ownerSessionId !== ctx.sessionId;
@@ -8,6 +8,7 @@ import { logInternalError } from "../../utils/internal-error.ts";
8
8
  import type { PiTeamsToolResult } from "../tool-result.ts";
9
9
  import { locateRunCwd } from "../team-tool.ts";
10
10
  import { result, type TeamContext } from "./context.ts";
11
+ import { RUN_NOT_FOUND_HINT } from "./run-not-found.ts";
11
12
 
12
13
  /**
13
14
  * Handle `respond` action: send a message to a waiting (interactive) task.
@@ -19,13 +20,13 @@ export function handleRespond(params: TeamToolParamsValue, ctx: TeamContext): Pi
19
20
  if (!params.message && !params.taskId) return result("Respond requires taskId and/or message.", { action: "respond", status: "error" }, true);
20
21
 
21
22
  const runCwd = locateRunCwd(params.runId, ctx.cwd);
22
- if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
23
+ if (!runCwd) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "respond", status: "error" }, true);
23
24
  const loaded = loadRunManifestById(runCwd, params.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
24
- if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
25
+ if (!loaded) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "respond", status: "error" }, true);
25
26
 
26
27
  return withRunLockSync(loaded.manifest, () => {
27
28
  const fresh = loadRunManifestById(loaded.manifest.cwd, params.runId!); // NOTE: inside withRunLockSync - consistent read
28
- if (!fresh) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
29
+ if (!fresh) return result(`Run '${params.runId}' not found.${RUN_NOT_FOUND_HINT}`, { action: "respond", status: "error" }, true);
29
30
  const foreignRun = typeof fresh.manifest.ownerSessionId === "string" && fresh.manifest.ownerSessionId !== ctx.sessionId;
30
31
  if (foreignRun && !params.force) return result(`Run ${fresh.manifest.runId} belongs to another session. Use force: true to override.`, { action: "respond", status: "error", runId: fresh.manifest.runId }, true);
31
32
 
@@ -0,0 +1,54 @@
1
+ /**
2
+ * run-not-found.ts — Centralized "Run not found" error helper (DX: F2).
3
+ *
4
+ * Round 16 DX audit found that a stale/typo'd runId hits a blank
5
+ * "Run '<id>' not found." wall in 8+ handlers (status, resume, steer, export,
6
+ * forget, cleanup, invalidate, worktrees, events, artifacts). The run IDs are
7
+ * long (`team_20260615173318_b9c8fe49a74e0760`), so typos/truncation are
8
+ * near-certain for new users — yet `team list` (which shows recent runs) is
9
+ * never suggested.
10
+ *
11
+ * This module centralizes the message + recovery hint so every handler stays
12
+ * consistent and the hint never drifts.
13
+ */
14
+
15
+ import { result, type TeamContext } from "./context.ts";
16
+ import type { TeamToolDetails } from "../team-tool-types.ts";
17
+
18
+ /** Recovery hint appended to every "Run not found" message. */
19
+ export const RUN_NOT_FOUND_HINT =
20
+ "\n\nTip: run action='list' to see recent runs and their IDs.";
21
+
22
+ /**
23
+ * Build the standard "Run not found" error result with a recovery hint.
24
+ *
25
+ * @param runId the (missing/typo'd) run id the caller passed
26
+ * @param action the action that was attempted (for the details.action field)
27
+ */
28
+ export function runNotFound(runId: string, action: string): ReturnType<typeof result> {
29
+ return result(
30
+ `Run '${runId}' not found.${RUN_NOT_FOUND_HINT}`,
31
+ { action, status: "error" } satisfies TeamToolDetails,
32
+ true,
33
+ );
34
+ }
35
+
36
+ /**
37
+ * Helper: resolve a runId to its cwd, returning a runNotFound() result when
38
+ * missing. Reduces the boilerplate `locateRunCwd → if (!runCwd) return ...`
39
+ * duplicated across handlers.
40
+ */
41
+ export function resolveRunOrNotFound(
42
+ runId: string,
43
+ action: string,
44
+ cwd: string,
45
+ locate: (runId: string, cwd: string) => string | undefined,
46
+ ): { kind: "found"; runCwd: string } | { kind: "notfound"; result: ReturnType<typeof result> } {
47
+ const runCwd = locate(runId, cwd);
48
+ if (!runCwd) return { kind: "notfound", result: runNotFound(runId, action) };
49
+ return { kind: "found", runCwd };
50
+ }
51
+
52
+ // Re-export TeamContext so callers importing this helper don't need a second
53
+ // import line — keeps the diff in each handler to a single import swap.
54
+ export type { TeamContext };
@@ -184,13 +184,17 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
184
184
  // connecting PipelineRunner to the actual team execution system
185
185
  const stageInfo = pipelineWorkflow.stages.map((s) => `- ${s.name} (${s.team})`).join("\n");
186
186
  return result([
187
- `Pipeline workflow: ${workflow.name}`,
187
+ `Pipeline workflow '${workflow.name}' is not yet wired into the team execution system.`,
188
188
  `Goal: ${goal}`,
189
- `Stages (${pipelineWorkflow.stages.length}):`,
189
+ `Defined stages (${pipelineWorkflow.stages.length}):`,
190
190
  stageInfo,
191
191
  "",
192
- "Pipeline execution is available via the PipelineRunner API.",
193
- "Full CLI integration requires connecting to the team execution system.",
192
+ "To actually run work right now, use a supported workflow instead:",
193
+ " - action='run' workflow='default' (explore plan execute verify)",
194
+ " - action='run' workflow='implementation' (adaptive, parallel specialists)",
195
+ " - action='run' workflow='research' (explore → analyze → write)",
196
+ "",
197
+ "Run action='list' resource='workflow' to see all available workflows.",
194
198
  ].join("\n"), { action: "run", status: "ok" }, false);
195
199
  }
196
200
 
@@ -219,6 +223,24 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
219
223
  registerActiveRun(updatedManifest);
220
224
 
221
225
  const loadedConfig = loadConfig(resolvedCtx.cwd);
226
+ // DX (Round 16 F4): surface config errors/warnings instead of silently
227
+ // proceeding with defaults. Non-blocking: emit a config.warning event so
228
+ // it shows in the run timeline and status, and log it. A malformed config
229
+ // (bad JSON / wrong types) should not be a silent no-op — doctor/config
230
+ // actions already surface these; run should too.
231
+ const configIssues = [
232
+ ...(loadedConfig.error ? [`Config error: ${loadedConfig.error}`] : []),
233
+ ...(loadedConfig.warnings ?? []),
234
+ ];
235
+ if (configIssues.length > 0) {
236
+ void appendEventAsync(updatedManifest.eventsPath, {
237
+ type: "config.warning",
238
+ runId: updatedManifest.runId,
239
+ message: `Loaded config from ${loadedConfig.path || "(defaults)"} with ${configIssues.length} issue(s): ${configIssues.join("; ")}`,
240
+ data: { error: loadedConfig.error, warnings: loadedConfig.warnings, path: loadedConfig.path },
241
+ }).catch((error) => logInternalError("team-tool.run.configWarning", error, `runId=${updatedManifest.runId}`));
242
+ logInternalError("team-tool.run.configWarning", new Error(`config issues: ${configIssues.join("; ")}`), `runId=${updatedManifest.runId} path=${loadedConfig.path ?? "(defaults)"}`);
243
+ }
222
244
  const executedConfig = effectiveRunConfig(loadedConfig.config, params.config);
223
245
  const runtime = await resolveCrewRuntime(executedConfig);
224
246
  const runtimeResolution = runtimeResolutionState(runtime);