pi-crew 0.7.4 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,33 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.7.5] — Ambient context status + perf hardening + error taxonomy (2026-06-15)
4
+
5
+ Three workstreams from the Round 11 API-gap and Round 15 perf/error audits: a new `context`-event feature, three performance fixes, and a full error-taxonomy expansion.
6
+
7
+ ### Features
8
+
9
+ - **Ambient crew-status injection (GAP-2)** — registers Pi's `context` event handler so the parent agent stays continuously aware of in-flight crew runs on every LLM call, without calling the `team` tool. Injects a compact status note (runId/team/status/goal, capped at 3 inline) before the last message. **Transient and safe**: Pi uses the result only for that call (`agent-loop.ts:283-289`) — it never mutates persistent `state.messages`, so there's no accumulation or history corruption. No-op when zero runs are active. Toggle: `reliability.ambientStatusInjection`.
10
+
11
+ ### Performance (Round 15 audit)
12
+
13
+ - **P1 (CRITICAL): throttle `persistSingleTaskUpdate` in `onJsonEvent`** — previously every child JSON event did a full locked read-parse-write of `tasks.json`; a 200-event task produced 200 such cycles. Now throttled to 500ms (in-memory progress stays fresh every event; final state force-flushed on completion).
14
+ - **P4: `buildWorkspaceTree` TTL cache (30s)** — workers in a run share a cwd, so the recursive walk was repeated once per task.
15
+ - **P5: `readKnowledge` mtime+size cache** — fired on every agent start (main + every worker), re-reading the same file N×/run.
16
+
17
+ ### Error experience (Round 15 audit)
18
+
19
+ - **E1: extended CrewError taxonomy E007–E012** — the taxonomy previously covered only file I/O and discovery. The most common *runtime* failures (child timeout, model exhaustion, pre-step failure, event-log lock timeout, depth limit, stale run) now throw structured `CrewError`s with a machine-readable code, a default actionable help hint, and context. Wired into all six throw sites (`task-runner.ts`, `event-log.ts`, `pipeline-runner.ts`, `stale-reconciler.ts`).
20
+ - **E2: model fallback exhaustion surfaces the full chain tried** ("All N candidates exhausted (tried: a → b → c). Last failure: …") instead of only the last attempt's raw error.
21
+ - **E3: stale-reconcile error explains the heartbeat mechanism + remediation** instead of the bare "Stale run reconciled: <reason>".
22
+
23
+ ### Tests
24
+
25
+ - +20 tests (context-status-injection: 11, errors E007–E012: 9). 4800+ pass / 0 fail.
26
+
27
+ ### Research
28
+
29
+ This release was driven by the Round 11 Pi-API gap audit and the Round 15 performance/cost + error-experience audit, documented in `research-findings/`.
30
+
3
31
  ## [0.7.4] — Editor autocomplete + settings shortcut (2026-06-15)
4
32
 
5
33
  Round 13 UX quick wins round-out: the remaining two Pi extension API integrations plus a hard-won CI reliability fix after the state-store test flake re-emerged on Windows and macOS.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.7.4",
3
+ "version": "0.7.5",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -178,6 +178,8 @@ export interface CrewReliabilityConfig {
178
178
  autoRepairIntervalMs?: number;
179
179
  /** Remove /tmp/pi-crew-* directories after their orphaned runs are reconciled. Default: true. */
180
180
  cleanupOrphanedTempDirs?: boolean;
181
+ /** Inject a compact ambient crew-status note into the agent's context on every LLM call while crew runs are in-flight, so the agent stays continuously aware of active runs without calling the `team` tool. No-op when no runs are active. Default: true. */
182
+ ambientStatusInjection?: boolean;
181
183
  }
182
184
 
183
185
  export interface CrewOtlpConfig {
package/src/errors.ts CHANGED
@@ -30,6 +30,14 @@ export const ErrorCode = {
30
30
  InvalidStatusTransition: "E004", // Run/task status cannot legally transition
31
31
  ConfigError: "E005", // Malformed config or missing required field
32
32
  ResourceNotFound: "E006", // Agent/team/workflow not found in discovery paths
33
+ // E1 (Round 15): runtime failure categories that previously threw raw Error
34
+ // with no code, no help hint, and no context. Surfaces actionable guidance.
35
+ ChildTimeout: "E007", // Child Pi worker became unresponsive and was killed
36
+ ModelExhausted: "E008", // All model candidates in the fallback chain failed
37
+ PreStepFailed: "E009", // A pre-step hook script returned a non-zero exit
38
+ EventLogLockTimeout: "E010", // Could not acquire the event-log file lock
39
+ DepthLimitExceeded: "E011", // Pipeline/chain recursion depth limit hit (circular dep)
40
+ RunStale: "E012", // Run reconciled as stale/zombie (heartbeat expired)
33
41
  } as const;
34
42
 
35
43
  export type ErrorCode = typeof ErrorCode[keyof typeof ErrorCode];
@@ -41,6 +49,13 @@ const DEFAULT_HELP: Record<ErrorCode, string | undefined> = {
41
49
  [ErrorCode.InvalidStatusTransition]: "Verify the run status using `team status` before retrying.",
42
50
  [ErrorCode.ConfigError]: "Check the configuration file for syntax errors or missing required fields.",
43
51
  [ErrorCode.ResourceNotFound]: "Use `team list` to see available agents, teams, and workflows.",
52
+ // E1 (Round 15): help hints for the new runtime categories.
53
+ [ErrorCode.ChildTimeout]: "The child Pi worker produced no output for too long and was terminated. Re-run the team; if it recurs, raise the response timeout in config or reduce the task scope.",
54
+ [ErrorCode.ModelExhausted]: "Every model in the fallback chain failed. Check your API key/quota and the per-attempt errors, then retry or swap the model in config.",
55
+ [ErrorCode.PreStepFailed]: "The pre-step hook script exited non-zero. Inspect its stderr, or mark it optional in the workflow step (preStepOptional).",
56
+ [ErrorCode.EventLogLockTimeout]: "Another process holds the event-log lock. Check for orphaned `.lock` files or stale pi-crew processes, then retry.",
57
+ [ErrorCode.DepthLimitExceeded]: "A pipeline/chain exceeded the recursion depth limit, which usually indicates a circular stage dependency. Review step `dependsOn` chains.",
58
+ [ErrorCode.RunStale]: "The worker stopped heartbeating and was treated as a zombie. Re-run the team (resume or fresh); if it recurs, check `runtime.executeWorkers` / system load.",
44
59
  };
45
60
 
46
61
  /**
@@ -122,4 +137,55 @@ export const errors = {
122
137
  `${type} '${name}' not found in any discovery path`,
123
138
  );
124
139
  },
140
+
141
+ // E1 (Round 15): runtime failure constructors. These wrap the raw-throw
142
+ // sites identified in the Round 15 error-experience audit so failures carry
143
+ // a machine-readable code, a help hint, and structured context.
144
+ childTimeout(detail: { timeoutMs?: number; taskId?: string; stderr?: string }): CrewError {
145
+ const tail = detail.stderr ? ` Stderr tail: ${detail.stderr.slice(-400)}` : "";
146
+ const dur = detail.timeoutMs ? ` after ${detail.timeoutMs}ms of no output` : "";
147
+ return new CrewError(
148
+ ErrorCode.ChildTimeout,
149
+ `Child Pi worker became unresponsive${dur} and was terminated.${tail}`,
150
+ ).withContext(`worker execution${detail.taskId ? ` (task ${detail.taskId})` : ""}`);
151
+ },
152
+
153
+ modelExhausted(chain: string[], lastFailure?: string): CrewError {
154
+ const tried = chain.join(" → ");
155
+ const last = lastFailure ? ` Last failure: ${lastFailure}` : "";
156
+ return new CrewError(
157
+ ErrorCode.ModelExhausted,
158
+ `All ${chain.length} model candidates exhausted (tried: ${tried}).${last}`,
159
+ ).withContext("model fallback chain");
160
+ },
161
+
162
+ preStepFailed(script: string, exitCode: number | undefined, stderr?: string): CrewError {
163
+ const tail = stderr ? ` Stderr: ${stderr.slice(-400)}` : "";
164
+ return new CrewError(
165
+ ErrorCode.PreStepFailed,
166
+ `preStepScript '${script}' exited ${exitCode ?? "non-zero"}.${tail}`,
167
+ ).withContext("pre-step hook execution");
168
+ },
169
+
170
+ eventLogLockTimeout(eventsPath: string, timeoutMs: number): CrewError {
171
+ return new CrewError(
172
+ ErrorCode.EventLogLockTimeout,
173
+ `Event log lock timeout for ${eventsPath}: could not acquire lock within ${timeoutMs}ms`,
174
+ ).withContext("event-log append");
175
+ },
176
+
177
+ depthLimitExceeded(depth: number, kind = "pipeline"): CrewError {
178
+ return new CrewError(
179
+ ErrorCode.DepthLimitExceeded,
180
+ `${kind[0].toUpperCase() + kind.slice(1)} recursion depth limit exceeded (${depth}). Possible circular dependency.`,
181
+ ).withContext(`${kind} execution`);
182
+ },
183
+
184
+ runStale(reason: string, heartbeatAgeSeconds?: number): CrewError {
185
+ const age = heartbeatAgeSeconds !== undefined ? ` Last heartbeat was ${heartbeatAgeSeconds}s ago.` : "";
186
+ return new CrewError(
187
+ ErrorCode.RunStale,
188
+ `Stale run reconciled (reason=${reason}).${age} The worker stopped heartbeating and was treated as dead/zombie.`,
189
+ ).withContext("stale-run reconciliation");
190
+ },
125
191
  } as const;
@@ -0,0 +1,143 @@
1
+ /**
2
+ * context-status-injection.ts — Ambient crew-status injection (GAP-2).
3
+ *
4
+ * Registers a `context` event handler that keeps the parent agent continuously
5
+ * aware of in-flight crew runs. Without this, the agent "forgets" about active
6
+ * runs between turns unless it explicitly calls the `team` tool.
7
+ *
8
+ * ## How it works
9
+ *
10
+ * Pi's `context` event fires before EVERY LLM call (see Pi source
11
+ * `extensions/runner.ts:emitContext`). The handler receives the full messages
12
+ * array and may return a modified copy. Critically, the returned messages are
13
+ * used ONLY for that single LLM call (`agent-loop.ts:283-289` feeds the result
14
+ * straight into `convertToLlm` for the request) — they do NOT mutate the
15
+ * agent's persistent `state.messages`. So injection is transient per-call:
16
+ * - No accumulation across turns (the note never enters history).
17
+ * - No need to dedup against prior injections.
18
+ * - No risk of corrupting the conversation transcript.
19
+ *
20
+ * The injected note is a compact 4–7 line ambient status, inserted BEFORE the
21
+ * last message so the last message remains the active turn driver (preserves
22
+ * the user/assistant/tool alternation the LLMs expect).
23
+ *
24
+ * ## Safety
25
+ *
26
+ * - No-op when zero runs are in-flight (returns undefined → Pi uses original
27
+ * messages unchanged). Normal single-agent operation is completely unaffected.
28
+ * - `emitContext` already wraps handlers in try/catch and emits errors instead
29
+ * of crashing the loop (Pi `runner.ts:933`), so a throw here can't break the
30
+ * agent — but we also guard defensively.
31
+ * - Opt-out: `reliability.ambientStatusInjection: false` in config.
32
+ */
33
+
34
+ import type { AgentMessage } from "@earendil-works/pi-agent-core";
35
+ import type { Message } from "@earendil-works/pi-ai";
36
+ import type { ExtensionAPI, ContextEvent } from "@earendil-works/pi-coding-agent";
37
+ import { collectInFlightRuns } from "./registration/compaction-guard.ts";
38
+ import type { TeamRunManifest } from "../state/types.ts";
39
+
40
+ /** Sentinel that marks an injected ambient-status user message. */
41
+ export const AMBIENT_STATUS_SENTINEL = "[pi-crew ambient status";
42
+
43
+ /** Cap the number of runs listed inline to keep the note compact. */
44
+ const MAX_INLINE_RUNS = 3;
45
+ /** Truncate long goals so one run can't dominate the context window. */
46
+ const MAX_GOAL_LEN = 80;
47
+
48
+ /**
49
+ * Build a compact, human+LLM-readable ambient status string for the given
50
+ * in-flight runs. Returns "" for an empty list (caller treats as no-op).
51
+ *
52
+ * Exported for unit testing.
53
+ */
54
+ export function formatAmbientStatus(runs: TeamRunManifest[]): string {
55
+ if (runs.length === 0) return "";
56
+ const truncate = (s: string, n: number): string =>
57
+ s.length > n ? `${s.slice(0, n - 1)}…` : s;
58
+ const lines: string[] = [
59
+ `${AMBIENT_STATUS_SENTINEL} — environmental context, not a user request]`,
60
+ `${runs.length} pi-crew run${runs.length === 1 ? "" : "s"} in flight:`,
61
+ ];
62
+ const shown = runs.slice(0, MAX_INLINE_RUNS);
63
+ for (const run of shown) {
64
+ const wf = run.workflow ? `, ${run.workflow}` : "";
65
+ lines.push(`• ${run.runId} (${run.status}, ${run.team}${wf}): ${truncate(run.goal ?? "(no goal)", MAX_GOAL_LEN)}`);
66
+ }
67
+ if (runs.length > MAX_INLINE_RUNS) {
68
+ lines.push(`• …and ${runs.length - MAX_INLINE_RUNS} more`);
69
+ }
70
+ lines.push("Inspect/join via the `team` tool: action=\"status\" (list), action=\"wait\" (join running), action=\"summary\"/action=\"get\" (results).");
71
+ return lines.join("\n");
72
+ }
73
+
74
+ /**
75
+ * Construct a user-role `Message` carrying the ambient status. Uses the
76
+ * `user` role (the Message union has no `system` role — the system prompt is a
77
+ * separate field). The sentinel prefix signals to the model that this is
78
+ * environmental information, not a typed user instruction.
79
+ *
80
+ * Exported for unit testing.
81
+ */
82
+ export function buildStatusMessage(runs: TeamRunManifest[]): Message {
83
+ return {
84
+ role: "user",
85
+ content: [{ type: "text", text: formatAmbientStatus(runs) }],
86
+ timestamp: Date.now(),
87
+ };
88
+ }
89
+
90
+ /** Result type for the `context` event handler (mirrors Pi's ContextEventResult,
91
+ * which isn't re-exported from the coding-agent package entry). */
92
+ export interface AmbientContextResult {
93
+ messages?: AgentMessage[];
94
+ }
95
+
96
+ /**
97
+ * Core handler logic, separated from the Pi registration so it is trivially
98
+ * unit-testable without a live ExtensionAPI.
99
+ *
100
+ * Returns `{messages}` with the ambient status inserted before the last
101
+ * message, or `undefined` to leave the context untouched (no in-flight runs).
102
+ *
103
+ * Exported for unit testing.
104
+ */
105
+ export function handleContextEvent(event: ContextEvent, cwd: string): AmbientContextResult | undefined {
106
+ let runs: TeamRunManifest[] = [];
107
+ try {
108
+ runs = collectInFlightRuns(cwd);
109
+ } catch {
110
+ // State read failure → don't inject, don't crash. Pi catches handler
111
+ // errors anyway, but we avoid noisy error emission for a best-effort
112
+ // awareness feature.
113
+ return undefined;
114
+ }
115
+ if (runs.length === 0) return undefined;
116
+
117
+ const messages = [...event.messages];
118
+ const statusMsg = buildStatusMessage(runs);
119
+ // Insert BEFORE the last message so the genuine last message (the current
120
+ // turn driver — user prompt or tool result) stays last. When there are 0–1
121
+ // messages, appending is the only sensible option.
122
+ const insertAt = messages.length > 1 ? messages.length - 1 : messages.length;
123
+ messages.splice(insertAt, 0, statusMsg as unknown as AgentMessage);
124
+ return { messages };
125
+ }
126
+
127
+ /**
128
+ * Register the ambient-status `context` event handler. Reads the project cwd
129
+ * via `process.cwd()` on each call (crew state is per-project).
130
+ *
131
+ * Pass `enabled: false` (from `reliability.ambientStatusInjection`) to
132
+ * disable the feature without unwiring the handler.
133
+ */
134
+ export function registerContextStatusInjection(
135
+ pi: ExtensionAPI,
136
+ opts: { enabled?: boolean } = {},
137
+ ): void {
138
+ if (opts.enabled === false) return;
139
+ pi.on("context", (event: ContextEvent): AmbientContextResult | undefined => {
140
+ const cwd = typeof process.cwd === "function" ? process.cwd() : ".";
141
+ return handleContextEvent(event, cwd);
142
+ });
143
+ }
@@ -29,17 +29,45 @@ export function knowledgePath(cwd: string): string {
29
29
  export function readKnowledge(cwd: string): string {
30
30
  try {
31
31
  const p = knowledgePath(cwd);
32
- if (!fs.existsSync(p)) return "";
32
+ const stat = tryStat(p);
33
+ if (!stat) {
34
+ knowledgeCache.delete(p);
35
+ return "";
36
+ }
37
+ // P5 (Round 15): mtime+size cache. readKnowledge fires on every agent
38
+ // start (main session + every worker), re-reading the file each time.
39
+ // For a run with N workers this is N redundant readFileSync of the same
40
+ // file. Cache by (mtimeMs, size) and only re-read when the file changes.
41
+ const cacheKey = `${stat.mtimeMs}:${stat.size}`;
42
+ const cached = knowledgeCache.get(p);
43
+ if (cached && cached.key === cacheKey) return cached.content;
33
44
  let content = fs.readFileSync(p, "utf8").trim();
34
45
  if (content.length > MAX_KNOWLEDGE_BYTES) {
35
46
  content = `${content.slice(0, MAX_KNOWLEDGE_BYTES)}\n\n<!-- knowledge.md truncated at ${MAX_KNOWLEDGE_BYTES} bytes -->`;
36
47
  }
48
+ knowledgeCache.set(p, { key: cacheKey, content });
37
49
  return content;
38
50
  } catch {
39
51
  return "";
40
52
  }
41
53
  }
42
54
 
55
+ /** Stat helper returning undefined on error (file missing, perms, etc.). */
56
+ function tryStat(p: string): { mtimeMs: number; size: number } | undefined {
57
+ try {
58
+ const s = fs.statSync(p);
59
+ return { mtimeMs: s.mtimeMs, size: s.size };
60
+ } catch {
61
+ return undefined;
62
+ }
63
+ }
64
+
65
+ interface CachedKnowledge {
66
+ key: string;
67
+ content: string;
68
+ }
69
+ const knowledgeCache = new Map<string, CachedKnowledge>();
70
+
43
71
  /** Build the injected prompt fragment (empty if no knowledge). */
44
72
  export function buildKnowledgeFragment(cwd: string): string {
45
73
  const content = readKnowledge(cwd);
@@ -113,6 +113,7 @@ import { registerCrewMessageRenderers } from "./message-renderers.ts";
113
113
  import { registerCrewInputRouter } from "./crew-input-router.ts";
114
114
  import { registerCrewAutocomplete } from "./crew-autocomplete.ts";
115
115
  import { registerCrewShortcuts } from "./crew-shortcuts.ts";
116
+ import { registerContextStatusInjection } from "./context-status-injection.ts";
116
117
  import { registerTeamTool } from "./registration/team-tool.ts";
117
118
  import { handleTeamTool } from "./team-tool.ts";
118
119
  import { persistScheduledJobUpdate } from "./team-tool/handle-schedule.ts";
@@ -2065,4 +2066,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
2065
2066
  // (The crew autocomplete provider is registered from session_start once
2066
2067
  // a UI context is available — see the session_start handler below.)
2067
2068
  registerCrewShortcuts(pi);
2069
+
2070
+ // GAP-2 (Round 11): ambient crew-status injection. Registers a `context`
2071
+ // event handler that inserts a compact in-flight-runs note into the agent
2072
+ // context on every LLM call, so the agent never "forgets" active runs.
2073
+ // Transient per-call (does not pollute history), and a no-op when no runs
2074
+ // are in-flight. Toggle via reliability.ambientStatusInjection.
2075
+ registerContextStatusInjection(pi, {
2076
+ enabled: loadConfig(process.cwd()).config.reliability?.ambientStatusInjection !== false,
2077
+ });
2068
2078
  }
@@ -3,6 +3,7 @@ import type { WorkflowConfig, WorkflowStep } from "../workflows/workflow-config.
3
3
  import type { TeamConfig } from "../teams/team-config.ts";
4
4
  import type { AgentConfig } from "../agents/agent-config.ts";
5
5
  import { appendEventAsync } from "../state/event-log.ts";
6
+ import { errors } from "../errors.ts";
6
7
  import { mapConcurrent } from "./parallel-utils.ts";
7
8
 
8
9
  /**
@@ -242,7 +243,8 @@ export class PipelineRunner {
242
243
  ): Promise<unknown[]> {
243
244
  // CRITICAL-6: Prevent stack overflow from deep recursion
244
245
  if (depth > 50) {
245
- throw new Error(`Pipeline recursion depth limit exceeded (${depth}). Possible circular stage dependency.`);
246
+ // E1 (Round 15): structured CrewError (E011) with help hint.
247
+ throw errors.depthLimitExceeded(depth, "pipeline");
246
248
  }
247
249
 
248
250
  const fanOut = stage.fanOut ?? true;
@@ -2,6 +2,7 @@ import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
3
  import * as path from "node:path";
4
4
  import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
5
+ import { errors } from "../errors.ts";
5
6
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
7
  import { checkProcessLiveness } from "./process-status.ts";
7
8
  import { saveRunManifest } from "../state/state-store.ts";
@@ -272,6 +273,23 @@ function getRunningTaskStaleness(
272
273
  /**
273
274
  * Repair a stale run by marking it as failed and cancelling running tasks.
274
275
  */
276
+ /**
277
+ * E3/E1 (Round 15): Build a human-actionable error string for a stale-reconciled
278
+ * task. Explains WHY the run was marked stale (the detected reason) and gives
279
+ * concrete remediation, instead of the bare 'Stale run reconciled: <reason>'.
280
+ * Now returns a structured CrewError (E012) so callers also get a machine-
281
+ * readable code + help hint; `.message` carries the same rich text as before.
282
+ */
283
+ function buildStaleReconcileError(task: TeamTaskState, reason: string): Error {
284
+ const heartbeatAgeSeconds = task.heartbeat?.lastSeenAt ? Math.round((Date.now() - new Date(task.heartbeat.lastSeenAt).getTime()) / 1000) : undefined;
285
+ return errors.runStale(reason, heartbeatAgeSeconds);
286
+ }
287
+
288
+ /** @deprecated use buildStaleReconcileError (returns a structured CrewError). Kept for any external callers. */
289
+ function formatStaleReconcileError(task: TeamTaskState, reason: string): string {
290
+ return buildStaleReconcileError(task, reason).message;
291
+ }
292
+
275
293
  function repairStaleRun(
276
294
  manifest: TeamRunManifest,
277
295
  tasks: TeamTaskState[],
@@ -288,7 +306,8 @@ function repairStaleRun(
288
306
  ...task,
289
307
  status: "cancelled" as const,
290
308
  finishedAt: now,
291
- error: `Stale run reconciled: ${reason}`,
309
+ // E3/E1 (Round 15): structured CrewError (E012) with code + help hint.
310
+ error: buildStaleReconcileError(task, reason).message,
292
311
  };
293
312
  }
294
313
  return task;
@@ -11,6 +11,7 @@ import type {
11
11
  VerificationEvidence,
12
12
  } from "../state/types.ts";
13
13
  import { logInternalError } from "../utils/internal-error.ts";
14
+ import { errors } from "../errors.ts";
14
15
  import { writeArtifact } from "../state/artifact-store.ts";
15
16
  import { appendEventAsync, appendEventFireAndForget } from "../state/event-log.ts";
16
17
  import { saveRunManifest } from "../state/state-store.ts";
@@ -288,7 +289,10 @@ export async function runTeamTask(
288
289
  });
289
290
  } catch (err) {
290
291
  const msg = err instanceof Error ? err.message : String(err);
291
- throw new Error(`preStepScript failed: ${input.step.preStepScript}: ${msg}`);
292
+ const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
293
+ // E1 (Round 15): structured CrewError with code E009 + help hint,
294
+ // instead of a raw Error. Surfaces the script path, exit code, and stderr.
295
+ throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
292
296
  }
293
297
  }
294
298
 
@@ -383,6 +387,7 @@ export async function runTeamTask(
383
387
  let lastAgentRecordPersistedAt = 0;
384
388
  let lastHeartbeatPersistedAt = 0;
385
389
  let lastRunProgressPersistedAt = 0;
390
+ let lastTaskProgressPersistedAt = 0;
386
391
  let lastRunProgressSummary: ProgressEventSummary | undefined;
387
392
  const persistHeartbeat = (force = false): void => {
388
393
  const now = Date.now();
@@ -573,26 +578,23 @@ export async function runTeamTask(
573
578
  const eventLine = typeof event === "object" && !Array.isArray(event) ? JSON.stringify(event) : String(event);
574
579
  fs.appendFileSync(bgLogPath, `${eventLine}\n`);
575
580
  }
576
- // Apply agentProgress update first, then persist, then update in-memory array.
577
- // This ensures disk state is always >= in-memory state, preventing
578
- // fresher in-memory state from being lost on crash.
579
- tasks = persistSingleTaskUpdate(manifest, tasks, {
580
- ...task,
581
- agentProgress: applyAgentProgressEvent(
582
- task.agentProgress ?? emptyCrewAgentProgress(),
583
- event,
584
- task.startedAt,
585
- ),
586
- });
587
- task = {
588
- ...task,
589
- agentProgress: applyAgentProgressEvent(
590
- task.agentProgress ?? emptyCrewAgentProgress(),
591
- event,
592
- task.startedAt,
593
- ),
594
- };
581
+ // Always keep in-memory agentProgress fresh (cheap) so the UI/events see
582
+ // the latest progress, but THROTTLE the disk persist. Previously this
583
+ // did a full locked read-parse-write of tasks.json on EVERY child JSON
584
+ // event — a 200-event task produced 200 such cycles (Round 15 P1).
585
+ // Final state is force-flushed on task completion (persistHeartbeat(true)).
586
+ const nextProgress = applyAgentProgressEvent(
587
+ task.agentProgress ?? emptyCrewAgentProgress(),
588
+ event,
589
+ task.startedAt,
590
+ );
591
+ task = { ...task, agentProgress: nextProgress };
595
592
  tasks = updateTask(tasks, task);
593
+ const progressNow = Date.now();
594
+ if (progressNow - lastTaskProgressPersistedAt >= 500) {
595
+ tasks = persistSingleTaskUpdate(manifest, tasks, task);
596
+ lastTaskProgressPersistedAt = progressNow;
597
+ }
596
598
  // Bridge event to UI event bus for near-instant updates
597
599
  const bridgeEvent = bridgeEventFromJsonEvent(
598
600
  manifest.runId,
@@ -701,6 +703,15 @@ export async function runTeamTask(
701
703
  ? childResult.stderr ||
702
704
  `Child Pi exited with ${childResult.exitCode}`
703
705
  : undefined);
706
+ // E1/E7 (Round 15): when the child timed out, surface a structured
707
+ // CrewError (E007) so users get a code + actionable help hint instead
708
+ // of a bare 'no new output for N ms'. We keep .message as the task error.
709
+ if (childResult.exitStatus?.timedOut) {
710
+ error = errors.childTimeout({
711
+ taskId: task.id,
712
+ stderr: childResult.stderr,
713
+ }).message;
714
+ }
704
715
  persistHeartbeat(true);
705
716
  persistChildProgress({ type: "attempt_finished" }, true);
706
717
  const attempt: ModelAttemptSummary = {
@@ -724,6 +735,16 @@ export async function runTeamTask(
724
735
  if (!nextModel || !isRetryableModelFailure(error)) break;
725
736
  logs.push(formatModelAttemptNote(attempt, nextModel), "");
726
737
  }
738
+ // E2 (Round 15): when the fallback chain was used and STILL failed, surface
739
+ // that explicitly. Without this the task error only shows the last
740
+ // attempt's raw failure, so users can't tell whether to fix an API key,
741
+ // upgrade a plan, or change the model config. Include the chain tried +
742
+ // the final reason.
743
+ if (error && modelAttempts.length > 1) {
744
+ // E2/E1 (Round 15): structured CrewError (E008). Build via the factory so
745
+ // the error carries a code + help hint; keep its .message as the task error.
746
+ error = errors.modelExhausted(modelAttempts.map((a) => a.model), error).message;
747
+ }
727
748
  // NEW-8 fix: register all attempt transcripts as artifacts, not just the used one.
728
749
  // Earlier failed attempts' transcripts exist on disk but were invisible to the artifact system.
729
750
  const successfulAttemptIndex = modelAttempts.findIndex(
@@ -252,7 +252,7 @@ function applyLineCap(
252
252
  return { lines: kept, elided: removable.length };
253
253
  }
254
254
 
255
- // ── Public API ─────────────────────────────────────────────────────────
255
+ // ── Public API ────────────────────────────────────────────────────────
256
256
 
257
257
  const emptyResult = (rootPath: string): WorkspaceTree => ({
258
258
  rootPath,
@@ -261,11 +261,35 @@ const emptyResult = (rootPath: string): WorkspaceTree => ({
261
261
  totalLines: 0,
262
262
  });
263
263
 
264
+ /**
265
+ * Per-cwd TTL cache for the rendered workspace tree. Workers in the same run
266
+ * share a cwd, so the recursive walk was previously repeated once per task
267
+ * (Round 15 P4). The tree is informational context for the worker; short-lived
268
+ * staleness is acceptable, so a 30s TTL is safe and keeps prompts fresh during
269
+ * long active runs while eliminating redundant walks.
270
+ */
271
+ const TREE_CACHE_TTL_MS = 30_000;
272
+ interface CachedTree {
273
+ tree: WorkspaceTree;
274
+ expiresAt: number;
275
+ }
276
+ const treeCache = new Map<string, CachedTree>();
277
+
278
+ function treeCacheKey(cwd: string, options?: WorkspaceTreeOptions): string {
279
+ // Cache is keyed on the inputs that affect the walk output.
280
+ return `${path.resolve(cwd)}|${options?.maxDepth ?? ""}|${options?.dirLimit ?? ""}|${options?.lineCap ?? ""}`;
281
+ }
282
+
264
283
  export async function buildWorkspaceTree(
265
284
  cwd: string,
266
285
  options?: WorkspaceTreeOptions,
267
286
  ): Promise<WorkspaceTree> {
268
287
  const rootPath = path.resolve(cwd);
288
+ const cacheKey = treeCacheKey(cwd, options);
289
+ const cached = treeCache.get(cacheKey);
290
+ if (cached && cached.expiresAt > Date.now()) {
291
+ return cached.tree;
292
+ }
269
293
  try {
270
294
  const maxDepth = options?.maxDepth ?? DEFAULT_MAX_DEPTH;
271
295
  const dirLimit = options?.dirLimit ?? DEFAULT_DIR_LIMIT;
@@ -286,12 +310,14 @@ export async function buildWorkspaceTree(
286
310
  const { lines: capped, elided } = applyLineCap(lines, lineCap);
287
311
  const rendered = capped.map((l) => l.text).join("\n");
288
312
 
289
- return {
313
+ const result: WorkspaceTree = {
290
314
  rootPath,
291
315
  rendered,
292
316
  truncated: dirTruncated || elided > 0,
293
317
  totalLines: capped.length,
294
318
  };
319
+ treeCache.set(cacheKey, { tree: result, expiresAt: Date.now() + TREE_CACHE_TTL_MS });
320
+ return result;
295
321
  } catch {
296
322
  return emptyResult(rootPath);
297
323
  }
@@ -3,6 +3,7 @@ import * as fs from "node:fs";
3
3
  import * as path from "node:path";
4
4
  import { DEFAULT_EVENT_LOG } from "../config/defaults.ts";
5
5
  import { atomicWriteFile } from "./atomic-write.ts";
6
+ import { errors } from "../errors.ts";
6
7
  import { emitFromTeamEvent } from "../ui/run-event-bus.ts";
7
8
  import { logInternalError } from "../utils/internal-error.ts";
8
9
  import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
@@ -105,9 +106,9 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
105
106
  // SECURITY (HIGH #2 fix): Throw instead of continuing without lock.
106
107
  // Previously this logged and broke out of the loop, executing the
107
108
  // operation without lock protection. Now we throw so callers can retry.
108
- throw new Error(
109
- `Event log lock timeout for ${eventsPath}: could not acquire lock within ${timeout}ms`,
110
- );
109
+ // E1 (Round 15): structured CrewError (E010) with help hint so users know
110
+ // to check for orphaned .lock dirs / stale processes.
111
+ throw errors.eventLogLockTimeout(eventsPath, timeout);
111
112
  }
112
113
  // Stale detection: if the owning process is dead, remove the stale lock.
113
114
  try {
@@ -17,6 +17,37 @@ import type { WorkflowConfig } from "../workflows/workflow-config.ts";
17
17
  import { toPiSessionId } from "../utils/session-utils.ts";
18
18
  import { HealthStore } from "./health-store.ts";
19
19
 
20
+ /**
21
+ * stat() the manifest with a brief retry on Windows for the AV-scan window.
22
+ *
23
+ * On the GitHub Actions windows-latest runner, Windows Defender real-time
24
+ * scanning can make a freshly-written manifest.json briefly invisible to
25
+ * statSync (ENOENT) even though the write succeeded and the file is on disk.
26
+ * loadRunManifestById is called right after createRunManifest in tests and in
27
+ * production (e.g. refreshPersistedSubagentRecord), so without a retry the
28
+ * caller sees a phantom "missing" run.
29
+ *
30
+ * On non-Windows, ENOENT means the file genuinely doesn't exist — passthrough
31
+ * (throw immediately) with no retry. On Windows, ENOENT/EPERM/EBUSY/EAGAIN get
32
+ * a handful of short retries (~30ms worst case) before giving up and throwing
33
+ * so the caller's catch returns undefined as before.
34
+ */
35
+ function statManifestWithWindowsRetry(manifestPath: string): fs.Stats {
36
+ if (process.platform !== "win32") return fs.statSync(manifestPath);
37
+ const retryable = new Set(["ENOENT", "EPERM", "EBUSY", "EAGAIN"]);
38
+ for (let attempt = 0; attempt < 5; attempt++) {
39
+ try {
40
+ return fs.statSync(manifestPath);
41
+ } catch (error) {
42
+ const code = (error as NodeJS.ErrnoException).code;
43
+ if (!retryable.has(code ?? "")) throw error;
44
+ const end = Date.now() + Math.min(8, 1 * 2 ** attempt);
45
+ while (Date.now() < end) { /* brief spin to ride out the AV scan window */ }
46
+ }
47
+ }
48
+ return fs.statSync(manifestPath); // last attempt — let caller's catch handle ENOENT
49
+ }
50
+
20
51
  export interface RunPaths {
21
52
  runId: string;
22
53
  stateRoot: string;
@@ -506,7 +537,7 @@ export function loadRunManifestById(cwd: string, runId: string): { manifest: Tea
506
537
 
507
538
  let manifestStat: fs.Stats;
508
539
  try {
509
- manifestStat = fs.statSync(manifestPath);
540
+ manifestStat = statManifestWithWindowsRetry(manifestPath);
510
541
  } catch {
511
542
  return undefined;
512
543
  }