pi-crew 0.1.43 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/docs/research-phase10-distillation.md +199 -0
  2. package/docs/research-phase11-distillation.md +201 -0
  3. package/package.json +1 -1
  4. package/src/agents/discover-agents.ts +1 -0
  5. package/src/config/config.ts +19 -0
  6. package/src/extension/register.ts +127 -8
  7. package/src/extension/registration/team-tool.ts +2 -1
  8. package/src/extension/run-index.ts +19 -0
  9. package/src/extension/team-tool/api.ts +1 -1
  10. package/src/extension/team-tool/cancel.ts +103 -31
  11. package/src/extension/team-tool/context.ts +1 -0
  12. package/src/extension/team-tool/respond.ts +67 -0
  13. package/src/extension/team-tool/run.ts +2 -2
  14. package/src/extension/team-tool/status.ts +7 -1
  15. package/src/extension/team-tool-types.ts +4 -0
  16. package/src/extension/team-tool.ts +2 -0
  17. package/src/observability/event-to-metric.ts +6 -0
  18. package/src/runtime/completion-guard.ts +190 -103
  19. package/src/runtime/crash-recovery.ts +30 -0
  20. package/src/runtime/crew-agent-runtime.ts +2 -1
  21. package/src/runtime/delivery-coordinator.ts +143 -0
  22. package/src/runtime/model-fallback.ts +5 -2
  23. package/src/runtime/overflow-recovery.ts +157 -0
  24. package/src/runtime/process-status.ts +1 -1
  25. package/src/runtime/session-resources.ts +25 -0
  26. package/src/runtime/session-snapshot.ts +59 -0
  27. package/src/runtime/stale-reconciler.ts +179 -0
  28. package/src/runtime/supervisor-contact.ts +59 -0
  29. package/src/runtime/task-runner.ts +14 -0
  30. package/src/runtime/team-runner.ts +6 -4
  31. package/src/schema/config-schema.ts +1 -0
  32. package/src/schema/team-tool-schema.ts +6 -1
  33. package/src/state/contracts.ts +6 -2
  34. package/src/ui/crew-widget.ts +5 -4
  35. package/src/ui/powerbar-publisher.ts +3 -3
  36. package/src/ui/run-snapshot-cache.ts +275 -1
  37. package/src/ui/status-colors.ts +4 -0
  38. package/src/utils/atomic-write.ts +33 -0
@@ -0,0 +1,157 @@
1
+ import { logInternalError } from "../utils/internal-error.ts";
2
+
3
+ export type OverflowPhase = "none" | "compaction" | "retrying" | "recovered" | "failed";
4
+
5
+ export interface OverflowRecoveryState {
6
+ taskId: string;
7
+ runId: string;
8
+ phase: OverflowPhase;
9
+ startedAt: number;
10
+ lastEventAt: number;
11
+ compactionCount: number;
12
+ retryCount: number;
13
+ }
14
+
15
+ export interface OverflowRecoveryCallbacks {
16
+ onPhaseChange?: (state: OverflowRecoveryState, previousPhase: OverflowPhase) => void;
17
+ onTimeout?: (state: OverflowRecoveryState) => void;
18
+ }
19
+
20
/** Default inactivity budget (ms) granted to a task while it is in "compaction" or "retrying". */
const PHASE_TIMEOUT_MS = 120_000;

/**
 * Tracks the context-overflow recovery lifecycle of child tasks from their event stream.
 *
 * Phase flow per task: "none" → "compaction" → "retrying" → "recovered" | "failed".
 * "recovered" and "failed" are final for a given run: later events only refresh
 * `lastEventAt`. Call {@link removeTask} to start tracking a task afresh.
 *
 * While a task is in "compaction" or "retrying" an inactivity timer is kept armed and
 * is restarted by every event fed for that task. If it fires, the task is marked
 * "failed" and `onTimeout` followed by `onPhaseChange` are invoked.
 */
export class OverflowRecoveryTracker {
  private readonly states = new Map<string, OverflowRecoveryState>();
  private readonly timers = new Map<string, ReturnType<typeof setTimeout>>();
  private readonly callbacks: OverflowRecoveryCallbacks;
  private readonly phaseTimeoutMs: number;

  /**
   * @param callbacks Optional listeners; exceptions they throw are logged, never propagated.
   * @param phaseTimeoutMs Inactivity budget in milliseconds for the active recovery phases.
   */
  constructor(callbacks: OverflowRecoveryCallbacks = {}, phaseTimeoutMs: number = PHASE_TIMEOUT_MS) {
    this.callbacks = callbacks;
    this.phaseTimeoutMs = phaseTimeoutMs;
  }

  /**
   * Feed one child event and return the task's resulting phase.
   *
   * Unknown event types leave the phase unchanged but still refresh `lastEventAt`
   * (and restart the inactivity timer when a recovery is in progress).
   */
  feedEvent(taskId: string, runId: string, eventType: string): OverflowPhase {
    const now = Date.now();
    const stored = this.states.get(taskId);
    // State is keyed by task id only; an entry recorded for another run describes a
    // different task instance and must not leak its (possibly terminal) phase.
    const existing = stored !== undefined && stored.runId === runId ? stored : undefined;

    if (existing && (existing.phase === "recovered" || existing.phase === "failed")) {
      this.states.set(taskId, { ...existing, lastEventAt: now });
      return existing.phase;
    }

    const previousPhase: OverflowPhase = existing?.phase ?? "none";
    let phase: OverflowPhase = previousPhase;
    let compactionCount = existing?.compactionCount ?? 0;
    let retryCount = existing?.retryCount ?? 0;

    switch (eventType) {
      case "compaction_start":
        phase = "compaction";
        compactionCount++;
        break;
      case "compaction_end":
        // A retry is expected next; remain in "compaction" until it starts.
        break;
      case "auto_retry_start":
        phase = "retrying";
        retryCount++;
        break;
      case "auto_retry_end":
        // The retry completed, so the overflow is considered handled.
        phase = "recovered";
        break;
      case "agent_end":
        // The agent ended while recovery was still in progress: recovery did not complete.
        if (phase === "compaction" || phase === "retrying") {
          phase = "failed";
        }
        break;
      default:
        // Unrelated event type: no phase change.
        break;
    }

    const state: OverflowRecoveryState = {
      taskId,
      runId,
      phase,
      startedAt: existing?.startedAt ?? now,
      lastEventAt: now,
      compactionCount,
      retryCount,
    };
    this.states.set(taskId, state);

    // Only an in-progress recovery can time out; do not keep timers for idle or finished tasks.
    if (OverflowRecoveryTracker.isActivePhase(phase)) {
      this.resetTimeout(taskId);
    } else {
      this.cancelTimeout(taskId);
    }

    if (previousPhase !== phase) {
      this.notifyPhaseChange(state, previousPhase, "overflow-recovery.onPhaseChange");
    }
    return phase;
  }

  /** Latest recorded state for the task, or `undefined` when nothing is tracked. */
  getState(taskId: string): OverflowRecoveryState | undefined {
    return this.states.get(taskId);
  }

  /** Current phase for the task; "none" when nothing is tracked. */
  getPhase(taskId: string): OverflowPhase {
    return this.states.get(taskId)?.phase ?? "none";
  }

  /** Forget a task entirely and cancel its pending timeout, if any. */
  removeTask(taskId: string): void {
    this.states.delete(taskId);
    this.cancelTimeout(taskId);
  }

  /** Cancel every pending timeout and drop all tracked state. */
  dispose(): void {
    for (const timer of this.timers.values()) clearTimeout(timer);
    this.timers.clear();
    this.states.clear();
  }

  private static isActivePhase(phase: OverflowPhase): boolean {
    return phase === "compaction" || phase === "retrying";
  }

  private cancelTimeout(taskId: string): void {
    const timer = this.timers.get(taskId);
    if (timer !== undefined) {
      clearTimeout(timer);
      this.timers.delete(taskId);
    }
  }

  private notifyPhaseChange(state: OverflowRecoveryState, previousPhase: OverflowPhase, label: string): void {
    if (!this.callbacks.onPhaseChange) return;
    try {
      this.callbacks.onPhaseChange(state, previousPhase);
    } catch (error) {
      logInternalError(label, error, `taskId=${state.taskId}`);
    }
  }

  private resetTimeout(taskId: string): void {
    this.cancelTimeout(taskId);

    const timer = setTimeout(() => {
      this.timers.delete(taskId);
      const current = this.states.get(taskId);
      if (!current || !OverflowRecoveryTracker.isActivePhase(current.phase)) return;

      const previousPhase = current.phase;
      // Store a fresh object so states already handed to listeners are not changed under them.
      const failed: OverflowRecoveryState = { ...current, phase: "failed", lastEventAt: Date.now() };
      this.states.set(taskId, failed);

      if (this.callbacks.onTimeout) {
        try {
          this.callbacks.onTimeout(failed);
        } catch (error) {
          logInternalError("overflow-recovery.onTimeout", error, `taskId=${taskId}`);
        }
      }
      this.notifyPhaseChange(failed, previousPhase, "overflow-recovery.onPhaseChange-timeout");
    }, this.phaseTimeoutMs);

    // Do not let a pending recovery timeout keep the process alive.
    timer.unref();
    this.timers.set(taskId, timer);
  }
}
@@ -27,7 +27,7 @@ export function checkProcessLiveness(pid: number | undefined): ProcessLiveness {
27
27
  }
28
28
 
29
29
  export function isActiveRunStatus(status: string): boolean {
30
- return status === "queued" || status === "planning" || status === "running";
30
+ return status === "queued" || status === "planning" || status === "running" || status === "waiting";
31
31
  }
32
32
 
33
33
  export function isLikelyOrphanedActiveRun(run: TeamRunManifest, agents: CrewAgentRecord[] = [], now = Date.now(), staleMs = ORPHANED_ACTIVE_RUN_MS): boolean {
@@ -0,0 +1,25 @@
1
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
2
+ import { logInternalError } from "../utils/internal-error.ts";
3
+
4
/**
 * Try to register a cleanup function with Pi's session resource cleanup API (v0.72+).
 *
 * The API is looked up dynamically under the name `registerSessionResourceCleanup`
 * so that hosts without it keep working.
 *
 * @param pi Extension API object that may expose `registerSessionResourceCleanup`.
 * @param cleanup Function to run when the session's resources are released.
 * @returns The unregister function handed back by the host, or `undefined` when the API
 *   is missing, throws during registration, or returns nothing (in that last case the
 *   cleanup is registered but cannot be unregistered).
 */
export function tryRegisterSessionCleanup(pi: ExtensionAPI, cleanup: () => void): (() => void) | undefined {
  const registerFn: unknown = (pi as unknown as Record<string, unknown>)["registerSessionResourceCleanup"];
  if (typeof registerFn !== "function") return undefined;

  try {
    // Call it as a method of `pi`: a detached call would lose `this` and break hosts
    // that implement the API as an ordinary (unbound) method.
    const unregister: unknown = registerFn.call(pi, cleanup);
    return typeof unregister === "function" ? (unregister as () => void) : undefined;
  } catch (error) {
    logInternalError("session-resources.register", error);
    return undefined;
  }
}
@@ -0,0 +1,59 @@
1
+ import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
2
+
3
/**
 * Build a detached copy of a task's state for handing to event listeners.
 *
 * Top-level fields are copied by spread; the nested objects and arrays listed below
 * are each copied one level deep, so a listener mutating the snapshot's containers
 * does not touch the live task. Optional nested fields that are absent on the task
 * are present on the snapshot with the value `undefined`.
 */
export function snapshotTaskState(task: TeamTaskState): Readonly<TeamTaskState> {
  // One-level copy of an optional object.
  const copyObject = <T extends object>(value: T | null | undefined): T | undefined =>
    value ? { ...value } : undefined;
  // One-level copy of every element of an optional array.
  const copyEach = <T extends object>(items: readonly T[] | null | undefined): T[] | undefined =>
    items?.map((item) => ({ ...item }));

  return {
    ...task,
    dependsOn: [...task.dependsOn],
    usage: copyObject(task.usage),
    agentProgress: copyObject(task.agentProgress),
    heartbeat: copyObject(task.heartbeat),
    modelAttempts: copyEach(task.modelAttempts),
    modelRouting: copyObject(task.modelRouting),
    claim: copyObject(task.claim),
    checkpoint: copyObject(task.checkpoint),
    attempts: copyEach(task.attempts),
    worktree: copyObject(task.worktree),
  };
}
22
+
23
+ /**
24
+ * Session state snapshot for persistence before session switches.
25
+ * Captures the minimal set of data needed to resume operations.
26
+ */
27
+ export interface SessionStateSnapshot {
28
+ /** ISO timestamp of the snapshot */
29
+ capturedAt: string;
30
+ /** Active run IDs at time of snapshot */
31
+ activeRunIds: string[];
32
+ /** Number of pending deliveries */
33
+ pendingDeliveryCount: number;
34
+ /** Session generation counter */
35
+ sessionGeneration: number;
36
+ /** Count of active runs keyed by run status (runs are tallied, not individual tasks) */
37
+ taskSummary: Record<string, number>;
38
+ }
39
+
40
/**
 * Capture a session state snapshot for persistence ahead of a session switch.
 *
 * @param activeRuns Manifests of the runs considered active at capture time.
 * @param pendingDeliveryCount Number of deliveries still pending; stored as given.
 * @param sessionGeneration Session generation counter; stored as given.
 * @returns A snapshot stamped with the current time. `taskSummary` holds the number
 *   of the given runs per run status.
 */
export function createSessionSnapshot(
  activeRuns: TeamRunManifest[],
  pendingDeliveryCount: number,
  sessionGeneration: number,
): SessionStateSnapshot {
  const countsByStatus = activeRuns.reduce<Record<string, number>>((counts, { status }) => {
    counts[status] = (counts[status] ?? 0) + 1;
    return counts;
  }, {});
  const capturedAt = new Date().toISOString();
  const activeRunIds = activeRuns.map(({ runId }) => runId);

  return {
    capturedAt,
    activeRunIds,
    pendingDeliveryCount,
    sessionGeneration,
    taskSummary: countsByStatus,
  };
}
@@ -0,0 +1,179 @@
1
+ import * as fs from "node:fs";
2
+ import * as path from "node:path";
3
+ import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
4
+ import { checkProcessLiveness } from "./process-status.ts";
5
+ import { logInternalError } from "../utils/internal-error.ts";
6
+ import { writeAtomicJson } from "../utils/atomic-write.ts";
7
+
8
+ /**
9
+ * Result of reconciling a single stale run.
10
+ */
11
+ export interface ReconcileResult {
12
+ runId: string;
13
+ /** What was found and what action was taken */
14
+ verdict: "healthy" | "result_exists" | "pid_dead" | "pid_alive_stale" | "no_status";
15
+ /** Whether repair was applied */
16
+ repaired: boolean;
17
+ /** Human-readable detail */
18
+ detail: string;
19
+ }
20
+
21
+ const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
22
+
23
/**
 * Phase 1: decide whether the run's work has in fact already finished.
 *
 * Only the task list is inspected: the result counts as found when there is at
 * least one task and every task is in a terminal status (completed, failed,
 * cancelled or skipped). Nothing is read from disk and nothing is repaired here,
 * so `repaired` is always `false`; `manifest` is currently unused.
 */
function checkResultFile(
  manifest: TeamRunManifest,
  tasks: TeamTaskState[],
): { found: boolean; repaired: boolean } {
  const terminalStatuses = new Set<string>(["completed", "failed", "cancelled", "skipped"]);
  const hasPendingTask = tasks.some((task) => !terminalStatuses.has(task.status));
  const found = tasks.length > 0 && !hasPendingTask;
  return { found, repaired: false };
}
40
+
41
/**
 * Phase 2: report whether the recorded process is still alive.
 *
 * A missing, non-integer, zero or negative pid is reported as not alive with the
 * detail "no pid recorded"; any other pid is delegated to `checkProcessLiveness`.
 */
function checkPidLiveness(pid: number | undefined): {
  alive: boolean;
  detail: string;
} {
  const hasUsablePid = pid !== undefined && Number.isInteger(pid) && pid > 0;
  if (!hasUsablePid) {
    return { alive: false, detail: "no pid recorded" };
  }
  const { alive, detail } = checkProcessLiveness(pid);
  return { alive, detail };
}
54
+
55
/**
 * Phase 3: decide whether a run should be treated as stale.
 *
 * - Dead process: always stale ("pid_dead").
 * - Live process with an unparseable `updatedAt`: not stale ("updated_at_invalid").
 * - Live process: stale only when `updatedAt` is more than STALE_ALIVE_PID_MS before
 *   `now`; the reason then carries the age rounded to whole hours.
 */
function evaluateStaleness(
  manifest: TeamRunManifest,
  pidAlive: boolean,
  now: number,
): { stale: boolean; reason: string } {
  if (!pidAlive) return { stale: true, reason: "pid_dead" };

  const lastUpdateMs = new Date(manifest.updatedAt).getTime();
  if (!Number.isFinite(lastUpdateMs)) return { stale: false, reason: "updated_at_invalid" };

  const ageMs = now - lastUpdateMs;
  if (ageMs > STALE_ALIVE_PID_MS) {
    const ageHours = Math.round(ageMs / 3600_000);
    return { stale: true, reason: `alive_but_stale_${ageHours}h` };
  }
  return { stale: false, reason: "alive_and_recent" };
}
76
+
77
/**
 * Cancel every unfinished task of a stale run and persist the updated task list.
 *
 * Tasks that are running, queued or waiting become "cancelled", stamped with the
 * current time and an error naming the reconciliation reason; all other tasks are
 * returned unchanged. When the manifest has a `tasksPath`, the new list is written
 * there with `writeAtomicJson`; a failed write is logged and otherwise ignored.
 * The manifest itself is not modified here.
 *
 * @returns The task list after repair, in the same order as `tasks`.
 */
function repairStaleRun(
  manifest: TeamRunManifest,
  tasks: TeamTaskState[],
  reason: string,
): TeamTaskState[] {
  const finishedAt = new Date().toISOString();
  const unfinished = new Set<string>(["running", "queued", "waiting"]);

  const repairedTasks = tasks.map((task) =>
    unfinished.has(task.status)
      ? {
          ...task,
          status: "cancelled" as const,
          finishedAt,
          error: `Stale run reconciled: ${reason}`,
        }
      : task,
  );

  const { tasksPath } = manifest;
  if (!tasksPath) return repairedTasks;

  try {
    writeAtomicJson(tasksPath, repairedTasks);
  } catch (error) {
    logInternalError("stale-reconciler.repair-tasks", error, `runId=${manifest.runId}`);
  }
  return repairedTasks;
}
110
+
111
/**
 * Number of tasks that the repair actually moved to "cancelled".
 * Expects `after` to be index-aligned with `before`; tasks that were already
 * cancelled before the repair are not counted.
 */
function countNewlyCancelled(before: TeamTaskState[], after: TeamTaskState[]): number {
  return after.filter((task, index) => task.status === "cancelled" && before[index]?.status !== "cancelled").length;
}

/**
 * Three-phase stale run reconciliation.
 *
 * 1. If every task is already terminal, report "result_exists" and change nothing.
 * 2. Check liveness of the pid recorded in `manifest.async`.
 *    With no usable pid, the run is repaired only when `manifest.updatedAt` is older
 *    than STALE_ALIVE_PID_MS; both outcomes use the verdict "no_status".
 * 3. Dead pid → repair immediately ("pid_dead"); live pid → repair only when the
 *    manifest has not been updated for longer than STALE_ALIVE_PID_MS
 *    ("pid_alive_stale"), otherwise report "healthy".
 *
 * @param manifest Run manifest under inspection.
 * @param tasks Current task states of that run.
 * @param now Reference time in epoch milliseconds (defaults to the current time).
 * @returns Verdict, whether a repair was carried out, and a human-readable detail
 *   that reports how many tasks the repair cancelled.
 */
export function reconcileStaleRun(
  manifest: TeamRunManifest,
  tasks: TeamTaskState[],
  now = Date.now(),
): ReconcileResult {
  const runId = manifest.runId;

  // Phase 1: Check if results already exist
  const phase1 = checkResultFile(manifest, tasks);
  if (phase1.found) {
    return {
      runId,
      verdict: "result_exists",
      repaired: false,
      detail: "All tasks already terminal — no repair needed",
    };
  }

  // Phase 2: Check PID liveness
  const pid = manifest.async?.pid;
  const pidStatus = checkPidLiveness(pid);

  // "no pid recorded" is the detail checkPidLiveness reports for a missing or invalid pid.
  if (pidStatus.detail === "no pid recorded") {
    // No async PID — not an async run; fall back to updatedAt staleness.
    const updatedAt = new Date(manifest.updatedAt).getTime();
    if (Number.isFinite(updatedAt) && now - updatedAt > STALE_ALIVE_PID_MS) {
      const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
      return {
        runId,
        verdict: "no_status",
        repaired: true,
        detail: `No PID; stale ${Math.round((now - updatedAt) / 3600_000)}h; repaired ${countNewlyCancelled(tasks, repaired)} tasks`,
      };
    }
    return {
      runId,
      verdict: "no_status",
      repaired: false,
      detail: "No PID recorded; not stale enough to repair",
    };
  }

  // Phase 3: Evaluate staleness
  const staleness = evaluateStaleness(manifest, pidStatus.alive, now);
  if (!staleness.stale) {
    return {
      runId,
      verdict: "healthy",
      repaired: false,
      detail: `PID ${pid}: ${pidStatus.detail}, ${staleness.reason}`,
    };
  }

  // Repair
  const repaired = repairStaleRun(manifest, tasks, staleness.reason);
  return {
    runId,
    verdict: pidStatus.alive ? "pid_alive_stale" : "pid_dead",
    repaired: true,
    detail: `PID ${pid}: ${pidStatus.detail}; ${staleness.reason}; repaired ${countNewlyCancelled(tasks, repaired)} tasks`,
  };
}
@@ -0,0 +1,59 @@
1
+ import type { TeamRunManifest } from "../state/types.ts";
2
+ import { appendEvent } from "../state/event-log.ts";
3
+ import { logInternalError } from "../utils/internal-error.ts";
4
+
5
+ export interface SupervisorContactPayload {
6
+ runId: string;
7
+ taskId: string;
8
+ reason: "decision_needed" | "clarification" | "approval" | "error_escalation" | "custom";
9
+ message: string;
10
+ data?: Record<string, unknown>;
11
+ timestamp: string;
12
+ }
13
+
14
/**
 * Append a "supervisor.contact" event to the run's event log on behalf of a child task.
 *
 * The payload is stamped with the current time and stored as the event's data.
 * This is best-effort: a failure while appending is logged and not rethrown.
 *
 * @param manifest Run manifest supplying the event log path and the run id.
 * @param payload Contact details from the child; the timestamp is added here.
 */
export function recordSupervisorContact(manifest: TeamRunManifest, payload: Omit<SupervisorContactPayload, "timestamp">): void {
  const stamped: SupervisorContactPayload = Object.assign({}, payload, { timestamp: new Date().toISOString() });

  try {
    const event = {
      type: "supervisor.contact",
      runId: manifest.runId,
      taskId: payload.taskId,
      data: stamped as unknown as Record<string, unknown>,
    };
    appendEvent(manifest.eventsPath, event);
  } catch (error) {
    logInternalError("supervisor-contact.record", error, `runId=${manifest.runId} taskId=${payload.taskId}`);
  }
}
35
+
36
/** Reasons accepted verbatim from a child; anything else is reported as "custom". */
const SUPERVISOR_CONTACT_REASONS: readonly string[] = ["decision_needed", "clarification", "approval", "error_escalation", "custom"];

/**
 * Parse a supervisor contact request from one line of child Pi stdout.
 *
 * A line qualifies when it is a JSON object whose `type` is "supervisor_contact"
 * or "crew_supervisor_contact". Fields are normalised as follows:
 * - `taskId`: kept when it is a string, otherwise "".
 * - `reason`: kept when it is one of the known reasons, otherwise "custom".
 * - `message`: strings are kept; a missing or null message becomes ""; objects and
 *   arrays are JSON-encoded; other values are converted with `String`.
 * - `data`: kept only when it is a non-array object.
 *
 * @param line A single stdout line.
 * @returns The normalised request, or `undefined` when the line is not a contact request.
 */
export function parseSupervisorContactFromLine(line: string): Omit<SupervisorContactPayload, "timestamp" | "runId"> | undefined {
  const trimmed = line.trim();
  // Only a JSON object can qualify, so ordinary output lines are rejected without
  // paying for a JSON.parse that would throw.
  if (!trimmed.startsWith("{")) return undefined;

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    return undefined;
  }
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return undefined;

  const record = parsed as Record<string, unknown>;
  if (record.type !== "supervisor_contact" && record.type !== "crew_supervisor_contact") return undefined;

  const rawReason = record.reason;
  const reason: SupervisorContactPayload["reason"] =
    typeof rawReason === "string" && SUPERVISOR_CONTACT_REASONS.includes(rawReason)
      ? (rawReason as SupervisorContactPayload["reason"])
      : "custom";

  const rawMessage = record.message;
  let message: string;
  if (typeof rawMessage === "string") {
    message = rawMessage;
  } else if (rawMessage === null || rawMessage === undefined) {
    message = "";
  } else if (typeof rawMessage === "object") {
    // String() would collapse a structured message to "[object Object]".
    message = JSON.stringify(rawMessage);
  } else {
    message = String(rawMessage);
  }

  const rawData = record.data;
  const data = rawData && typeof rawData === "object" && !Array.isArray(rawData) ? (rawData as Record<string, unknown>) : undefined;

  return {
    taskId: typeof record.taskId === "string" ? record.taskId : "",
    reason,
    message,
    data,
  };
}
@@ -27,6 +27,7 @@ import { checkpointTask, persistSingleTaskUpdate, updateTask } from "./task-runn
27
27
  import { cleanResultText, isFinalChildEvent } from "./task-runner/result-utils.ts";
28
28
  import { evaluateCompletionMutationGuard } from "./completion-guard.ts";
29
29
  import { appendTaskAttentionEvent } from "./attention-events.ts";
30
+ import { parseSupervisorContactFromLine, recordSupervisorContact } from "./supervisor-contact.ts";
30
31
 
31
32
  export interface TaskRunnerInput {
32
33
  manifest: TeamRunManifest;
@@ -44,6 +45,8 @@ export interface TaskRunnerInput {
44
45
  modelOverride?: string;
45
46
  limits?: CrewLimitsConfig;
46
47
  dependencyContextText?: string;
48
+ /** Optional callback for JSON events from child Pi. Used for overflow recovery tracking. */
49
+ onJsonEvent?: (taskId: string, runId: string, event: unknown) => void;
47
50
  }
48
51
 
49
52
  export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: TeamRunManifest; tasks: TeamTaskState[] }> {
@@ -154,12 +157,23 @@ export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: T
154
157
  onStdoutLine: (line) => {
155
158
  appendCrewAgentOutput(manifest, task.id, line);
156
159
  persistHeartbeat();
160
+ // Check for supervisor contact requests from child Pi
161
+ const contact = parseSupervisorContactFromLine(line);
162
+ if (contact) {
163
+ recordSupervisorContact(manifest, { runId: manifest.runId, ...contact });
164
+ }
157
165
  },
158
166
  onJsonEvent: (event) => {
159
167
  appendCrewAgentEvent(manifest, task.id, event);
160
168
  persistHeartbeat();
161
169
  task = { ...task, agentProgress: applyAgentProgressEvent(task.agentProgress ?? emptyCrewAgentProgress(), event, task.startedAt) };
162
170
  tasks = updateTask(tasks, task);
171
+ // Feed overflow recovery tracker
172
+ if (input.onJsonEvent) {
173
+ try {
174
+ input.onJsonEvent(task.id, manifest.runId, event);
175
+ } catch { /* overflow tracking errors should not affect task */ }
176
+ }
163
177
  if (!finalCheckpointWritten && isFinalChildEvent(event)) {
164
178
  finalCheckpointWritten = true;
165
179
  ({ task, tasks } = checkpointTask(manifest, tasks, task, "child-stdout-final"));
@@ -43,6 +43,8 @@ export interface ExecuteTeamRunInput {
43
43
  signal?: AbortSignal;
44
44
  reliability?: CrewReliabilityConfig;
45
45
  metricRegistry?: MetricRegistry;
46
+ /** Optional callback for JSON events from child Pi. Used for overflow recovery tracking. */
47
+ onJsonEvent?: (taskId: string, runId: string, event: unknown) => void;
46
48
  }
47
49
 
48
50
  function findStep(workflow: WorkflowConfig, task: TeamTaskState): WorkflowStep {
@@ -68,7 +70,7 @@ function mergeArtifacts(items: ArtifactDescriptor[]): ArtifactDescriptor[] {
68
70
  }
69
71
 
70
72
  function isNonTerminalTaskStatus(status: TeamTaskState["status"]): boolean {
71
- return status === "queued" || status === "running";
73
+ return status === "queued" || status === "running" || status === "waiting";
72
74
  }
73
75
 
74
76
  function shouldMergeTaskUpdate(current: TeamTaskState, updated: TeamTaskState): boolean {
@@ -483,7 +485,7 @@ function ensurePlanApprovalRequested(manifest: TeamRunManifest, tasks: TeamTaskS
483
485
  }
484
486
 
485
487
  function cancelPlanTasks(tasks: TeamTaskState[], reason: string): TeamTaskState[] {
486
- return tasks.map((task) => task.status === "queued" || task.status === "running" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: reason, graph: task.graph ? { ...task.graph, queue: "done" } : undefined } : task);
488
+ return tasks.map((task) => task.status === "queued" || task.status === "running" || task.status === "waiting" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: reason, graph: task.graph ? { ...task.graph, queue: "done" } : undefined } : task);
487
489
  }
488
490
 
489
491
  function hasPendingMutatingAdaptiveTask(tasks: TeamTaskState[]): boolean {
@@ -533,7 +535,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
533
535
 
534
536
  while (tasks.some((task) => task.status === "queued")) {
535
537
  if (input.signal?.aborted) {
536
- tasks = tasks.map((task) => task.status === "queued" || task.status === "running" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: "Run cancelled." } : task);
538
+ tasks = tasks.map((task) => task.status === "queued" || task.status === "running" || task.status === "waiting" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: "Run cancelled." } : task);
537
539
  await saveRunTasksAsync(manifest, tasks);
538
540
  manifest = updateRunStatus(manifest, "cancelled", "Run cancelled.");
539
541
  return { manifest, tasks };
@@ -579,7 +581,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
579
581
  async (task) => {
580
582
  const step = findStep(workflow, task);
581
583
  const agent = findAgent(input.agents, task);
582
- const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits };
584
+ const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits, onJsonEvent: input.onJsonEvent };
583
585
  if (input.reliability?.autoRetry !== true) return withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
584
586
  let lastFailed: { manifest: TeamRunManifest; tasks: TeamTaskState[] } | undefined;
585
587
  const attemptsSoFar: TaskAttemptState[] = [...(task.attempts ?? [])];
@@ -58,6 +58,7 @@ export const AgentOverrideSchema = Type.Object({
58
58
  fallbackModels: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
59
59
  thinking: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
60
60
  tools: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
61
+ skills: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
61
62
  }, { additionalProperties: false });
62
63
 
63
64
  export const PiTeamsAgentsConfigSchema = Type.Object({
@@ -24,6 +24,7 @@ export const TeamToolParams = Type.Object({
24
24
  Type.Literal("get"),
25
25
  Type.Literal("cancel"),
26
26
  Type.Literal("resume"),
27
+ Type.Literal("respond"),
27
28
  Type.Literal("create"),
28
29
  Type.Literal("update"),
29
30
  Type.Literal("delete"),
@@ -59,6 +60,8 @@ export const TeamToolParams = Type.Object({
59
60
  goal: Type.Optional(Type.String({ description: "High-level objective for a team run." })),
60
61
  task: Type.Optional(Type.String({ description: "Concrete task text for direct role/agent execution." })),
61
62
  runId: Type.Optional(Type.String({ description: "Run ID for status, cancel, or resume." })),
63
+ taskId: Type.Optional(Type.String({ description: "Task ID for respond action." })),
64
+ message: Type.Optional(Type.String({ description: "Message for respond action." })),
62
65
  async: Type.Optional(Type.Boolean({ description: "Run in background when execution support is enabled." })),
63
66
  workspaceMode: Type.Optional(Type.Union([
64
67
  Type.Literal("single"),
@@ -85,7 +88,7 @@ export const TeamToolParams = Type.Object({
85
88
  });
86
89
 
87
90
  export interface TeamToolParamsValue {
88
- action?: "run" | "plan" | "status" | "list" | "get" | "cancel" | "resume" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings";
91
+ action?: "run" | "plan" | "status" | "list" | "get" | "cancel" | "resume" | "respond" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings";
89
92
  resource?: "agent" | "team" | "workflow";
90
93
  team?: string;
91
94
  workflow?: string;
@@ -94,6 +97,8 @@ export interface TeamToolParamsValue {
94
97
  goal?: string;
95
98
  task?: string;
96
99
  runId?: string;
100
+ taskId?: string;
101
+ message?: string;
97
102
  async?: boolean;
98
103
  workspaceMode?: "single" | "worktree";
99
104
  context?: "fresh" | "fork";
@@ -1,7 +1,7 @@
1
1
  export const TEAM_RUN_STATUSES = ["queued", "planning", "running", "blocked", "completed", "failed", "cancelled"] as const;
2
2
  export type TeamRunStatus = typeof TEAM_RUN_STATUSES[number];
3
3
 
4
- export const TEAM_TASK_STATUSES = ["queued", "running", "completed", "failed", "cancelled", "skipped"] as const;
4
+ export const TEAM_TASK_STATUSES = ["queued", "running", "waiting", "completed", "failed", "cancelled", "skipped"] as const;
5
5
  export type TeamTaskStatus = typeof TEAM_TASK_STATUSES[number];
6
6
 
7
7
  export const TEAM_TERMINAL_RUN_STATUSES: ReadonlySet<TeamRunStatus> = new Set(["blocked", "completed", "failed", "cancelled"]);
@@ -19,7 +19,8 @@ export const TEAM_RUN_STATUS_TRANSITIONS: Readonly<Record<TeamRunStatus, readonl
19
19
 
20
20
  export const TEAM_TASK_STATUS_TRANSITIONS: Readonly<Record<TeamTaskStatus, readonly TeamTaskStatus[]>> = {
21
21
  queued: ["running", "cancelled", "skipped", "failed"],
22
- running: ["completed", "failed", "cancelled", "queued"],
22
+ running: ["completed", "failed", "cancelled", "queued", "waiting"],
23
+ waiting: ["running", "completed", "failed", "cancelled"],
23
24
  completed: ["queued"],
24
25
  failed: ["queued", "cancelled"],
25
26
  cancelled: ["queued"],
@@ -59,6 +60,9 @@ export const TEAM_EVENT_TYPES = [
59
60
  "async.completed",
60
61
  "async.failed",
61
62
  "async.stale",
63
+ "task.waiting",
64
+ "task.resumed",
65
+ "supervisor.contact",
62
66
  ] as const;
63
67
  export type TeamEventType = typeof TEAM_EVENT_TYPES[number];
64
68
 
@@ -100,12 +100,12 @@ function agentsFor(run: TeamRunManifest): CrewAgentRecord[] {
100
100
  }
101
101
  }
102
102
 
103
- export function activeWidgetRuns(cwd: string, manifestCache?: ManifestCache, snapshotCache?: RunSnapshotCache): WidgetRun[] {
104
- const runs = manifestCache ? manifestCache.list(20) : listRecentRuns(cwd, 20);
103
+ export function activeWidgetRuns(cwd: string, manifestCache?: ManifestCache, snapshotCache?: RunSnapshotCache, preloadedManifests?: TeamRunManifest[]): WidgetRun[] {
104
+ const runs = preloadedManifests ?? (manifestCache ? manifestCache.list(20) : listRecentRuns(cwd, 20));
105
105
  return runs
106
106
  .map((run) => {
107
107
  try {
108
- const snapshot = snapshotCache?.refreshIfStale(run.runId);
108
+ const snapshot = snapshotCache?.get(run.runId) ?? snapshotCache?.refreshIfStale(run.runId);
109
109
  return snapshot ? { run: snapshot.manifest, agents: snapshot.agents, snapshot } : { run, agents: agentsFor(run) };
110
110
  } catch {
111
111
  return { run, agents: agentsFor(run) };
@@ -279,11 +279,12 @@ export function updateCrewWidget(
279
279
  config?: CrewUiConfig,
280
280
  manifestCache?: ManifestCache,
281
281
  snapshotCache?: RunSnapshotCache,
282
+ preloadedManifests?: TeamRunManifest[],
282
283
  ): void {
283
284
  if (!ctx.hasUI) return;
284
285
  state.frame += 1;
285
286
  const maxLines = config?.widgetMaxLines ?? MAX_LINES_DEFAULT;
286
- const runs = activeWidgetRuns(ctx.cwd, manifestCache, snapshotCache);
287
+ const runs = activeWidgetRuns(ctx.cwd, manifestCache, snapshotCache, preloadedManifests);
287
288
  const lines = buildCrewWidgetLines(ctx.cwd, state.frame, maxLines, runs, state.notificationCount ?? 0);
288
289
  const placement = config?.widgetPlacement ?? "aboveEditor";
289
290
  ctx.ui.setStatus(STATUS_KEY, lines.length ? statusSummary(runs) : undefined);