pi-crew 0.1.43 → 0.1.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/research-phase10-distillation.md +199 -0
- package/docs/research-phase11-distillation.md +201 -0
- package/package.json +1 -1
- package/src/agents/discover-agents.ts +1 -0
- package/src/config/config.ts +19 -0
- package/src/extension/register.ts +127 -8
- package/src/extension/registration/team-tool.ts +2 -1
- package/src/extension/run-index.ts +19 -0
- package/src/extension/team-tool/api.ts +1 -1
- package/src/extension/team-tool/cancel.ts +103 -31
- package/src/extension/team-tool/context.ts +1 -0
- package/src/extension/team-tool/respond.ts +67 -0
- package/src/extension/team-tool/run.ts +2 -2
- package/src/extension/team-tool/status.ts +7 -1
- package/src/extension/team-tool-types.ts +4 -0
- package/src/extension/team-tool.ts +2 -0
- package/src/observability/event-to-metric.ts +6 -0
- package/src/runtime/completion-guard.ts +190 -103
- package/src/runtime/crash-recovery.ts +30 -0
- package/src/runtime/crew-agent-runtime.ts +2 -1
- package/src/runtime/delivery-coordinator.ts +143 -0
- package/src/runtime/model-fallback.ts +5 -2
- package/src/runtime/overflow-recovery.ts +157 -0
- package/src/runtime/process-status.ts +1 -1
- package/src/runtime/session-resources.ts +25 -0
- package/src/runtime/session-snapshot.ts +59 -0
- package/src/runtime/stale-reconciler.ts +179 -0
- package/src/runtime/supervisor-contact.ts +59 -0
- package/src/runtime/task-runner.ts +14 -0
- package/src/runtime/team-runner.ts +6 -4
- package/src/schema/config-schema.ts +1 -0
- package/src/schema/team-tool-schema.ts +6 -1
- package/src/state/contracts.ts +6 -2
- package/src/ui/crew-widget.ts +5 -4
- package/src/ui/powerbar-publisher.ts +3 -3
- package/src/ui/run-snapshot-cache.ts +275 -1
- package/src/ui/status-colors.ts +4 -0
- package/src/utils/atomic-write.ts +33 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
|
+
|
|
3
|
+
/**
 * Phase of a task's overflow recovery as reported by `OverflowRecoveryTracker`.
 * `"none"` is what the tracker reports for a task it holds no recovery state for.
 */
export type OverflowPhase =
	| "none"
	| "compaction"
	| "retrying"
	| "recovered"
	| "failed";
|
|
4
|
+
|
|
5
|
+
/** Per-task bookkeeping record kept by `OverflowRecoveryTracker`. */
export interface OverflowRecoveryState {
	/** Task the record belongs to; the tracker also uses it as its map key. */
	taskId: string;
	/** Run ID supplied with the event that last rewrote this record. */
	runId: string;
	/** Current recovery phase. */
	phase: OverflowPhase;
	/** `Date.now()` value captured when the first event for the task was fed. */
	startedAt: number;
	/** `Date.now()` value of the most recent event (or of the timeout that failed the task). */
	lastEventAt: number;
	/** Number of `compaction_start` events counted so far. */
	compactionCount: number;
	/** Number of `auto_retry_start` events counted so far. */
	retryCount: number;
}
|
|
14
|
+
|
|
15
|
+
/** Optional observers that `OverflowRecoveryTracker` invokes; exceptions they throw are logged, not propagated. */
export interface OverflowRecoveryCallbacks {
	/** Called with the updated state and the phase it had before, whenever the phase changes. */
	onPhaseChange?: (state: OverflowRecoveryState, previousPhase: OverflowPhase) => void;
	/** Called when a task sat in an active recovery phase until the phase timeout expired. */
	onTimeout?: (state: OverflowRecoveryState) => void;
}
|
|
19
|
+
|
|
20
|
+
/** Maximum quiet time (ms) a task may spend in an active recovery phase before it is marked failed. */
const PHASE_TIMEOUT_MS = 120_000; // 120 seconds per phase

/**
 * Tracks overflow recovery per task from the event types fed to it.
 *
 * Phase transitions:
 * - `compaction_start`  → `"compaction"` (increments `compactionCount`)
 * - `compaction_end`    → no change (the tracker waits for the retry to start)
 * - `auto_retry_start`  → `"retrying"` (increments `retryCount`)
 * - `auto_retry_end`    → `"recovered"`
 * - `agent_end`         → `"failed"` if a recovery was still in progress, otherwise no change
 * - anything else       → no change
 *
 * `"recovered"` and `"failed"` are terminal for a given task and run: further
 * events only refresh `lastEventAt`. A task left in `"compaction"` or
 * `"retrying"` with no event for `PHASE_TIMEOUT_MS` is failed by a timer.
 */
export class OverflowRecoveryTracker {
	private states = new Map<string, OverflowRecoveryState>();
	private timers = new Map<string, ReturnType<typeof setTimeout>>();
	private callbacks: OverflowRecoveryCallbacks;

	/**
	 * @param callbacks Optional observers for phase changes and timeouts.
	 */
	constructor(callbacks: OverflowRecoveryCallbacks = {}) {
		this.callbacks = callbacks;
	}

	/**
	 * Feed one event for a task and return the resulting phase.
	 *
	 * @param taskId Task the event belongs to (state is keyed by this ID).
	 * @param runId Run the task belongs to.
	 * @param eventType Event type string; unknown types leave the phase untouched.
	 * @returns The task's phase after applying the event.
	 */
	feedEvent(taskId: string, runId: string, eventType: string): OverflowPhase {
		const now = Date.now();
		let existing = this.states.get(taskId);

		// State is keyed by task ID alone. If the same task ID shows up under a
		// different run, the old record (possibly terminal) must not leak into
		// the new run, so start from scratch.
		if (existing && existing.runId !== runId) {
			this.removeTask(taskId);
			existing = undefined;
		}

		// Terminal phases are sticky: only the activity timestamp is refreshed.
		if (existing && (existing.phase === "recovered" || existing.phase === "failed")) {
			existing.lastEventAt = now;
			return existing.phase;
		}

		const previousPhase: OverflowPhase = existing?.phase ?? "none";
		let phase: OverflowPhase = previousPhase;
		let compactionCount = existing?.compactionCount ?? 0;
		let retryCount = existing?.retryCount ?? 0;

		switch (eventType) {
			case "compaction_start":
				phase = "compaction";
				compactionCount++;
				break;
			case "compaction_end":
				// A retry is expected next; stay in "compaction" until it starts.
				break;
			case "auto_retry_start":
				phase = "retrying";
				retryCount++;
				break;
			case "auto_retry_end":
				// The retry finished, so the task counts as recovered from here on.
				phase = "recovered";
				break;
			case "agent_end":
				// The agent ended while a recovery was still in flight.
				if (phase === "compaction" || phase === "retrying") {
					phase = "failed";
				}
				break;
			default:
				// Unknown event type — no phase change.
				break;
		}

		const state: OverflowRecoveryState = {
			taskId,
			runId,
			phase,
			startedAt: existing?.startedAt ?? now,
			lastEventAt: now,
			compactionCount,
			retryCount,
		};
		this.states.set(taskId, state);

		// The timeout only ever acts on active recovery phases, so a timer is
		// armed only then; any other phase just drops a pending timer instead
		// of scheduling one that would do nothing.
		if (phase === "compaction" || phase === "retrying") {
			this.resetTimeout(taskId);
		} else {
			this.clearTimer(taskId);
		}

		if (previousPhase !== phase && this.callbacks.onPhaseChange) {
			try {
				this.callbacks.onPhaseChange(state, previousPhase);
			} catch (error) {
				logInternalError("overflow-recovery.onPhaseChange", error, `taskId=${taskId}`);
			}
		}

		return phase;
	}

	/** Returns the live state record for a task, or `undefined` if none is tracked. */
	getState(taskId: string): OverflowRecoveryState | undefined {
		return this.states.get(taskId);
	}

	/** Returns the task's current phase, or `"none"` if the task is not tracked. */
	getPhase(taskId: string): OverflowPhase {
		return this.states.get(taskId)?.phase ?? "none";
	}

	/** Forgets a task: drops its state and cancels its pending timeout. */
	removeTask(taskId: string): void {
		this.states.delete(taskId);
		this.clearTimer(taskId);
	}

	/** Cancels every pending timeout and drops all tracked state. */
	dispose(): void {
		for (const timer of this.timers.values()) clearTimeout(timer);
		this.timers.clear();
		this.states.clear();
	}

	/** Cancels and forgets the pending timeout for a task, if there is one. */
	private clearTimer(taskId: string): void {
		const timer = this.timers.get(taskId);
		if (timer) {
			clearTimeout(timer);
			this.timers.delete(taskId);
		}
	}

	/** (Re)starts the phase timeout for a task that is in an active recovery phase. */
	private resetTimeout(taskId: string): void {
		this.clearTimer(taskId);

		const timer = setTimeout(() => {
			this.timers.delete(taskId);
			const state = this.states.get(taskId);
			if (!state) return;
			if (state.phase === "recovered" || state.phase === "failed" || state.phase === "none") return;

			// The record is updated in place so holders of the state see the failure.
			const previousPhase = state.phase;
			state.phase = "failed";
			state.lastEventAt = Date.now();

			if (this.callbacks.onTimeout) {
				try {
					this.callbacks.onTimeout(state);
				} catch (error) {
					logInternalError("overflow-recovery.onTimeout", error, `taskId=${taskId}`);
				}
			}
			if (this.callbacks.onPhaseChange) {
				try {
					this.callbacks.onPhaseChange(state, previousPhase);
				} catch (error) {
					logInternalError("overflow-recovery.onPhaseChange-timeout", error, `taskId=${taskId}`);
				}
			}
		}, PHASE_TIMEOUT_MS);

		// Do not let a pending recovery timeout keep the process alive.
		timer.unref();
		this.timers.set(taskId, timer);
	}
}
|
|
@@ -27,7 +27,7 @@ export function checkProcessLiveness(pid: number | undefined): ProcessLiveness {
|
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
export function isActiveRunStatus(status: string): boolean {
|
|
30
|
-
return status === "queued" || status === "planning" || status === "running";
|
|
30
|
+
return status === "queued" || status === "planning" || status === "running" || status === "waiting";
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
export function isLikelyOrphanedActiveRun(run: TeamRunManifest, agents: CrewAgentRecord[] = [], now = Date.now(), staleMs = ORPHANED_ACTIVE_RUN_MS): boolean {
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
3
|
+
|
|
4
|
+
/**
 * Try to register a cleanup function with Pi's session resource cleanup API (v0.72+).
 *
 * The API is looked up dynamically as `registerSessionResourceCleanup` on `pi`,
 * so hosts that do not provide it are tolerated.
 *
 * @param pi Extension API object that may or may not expose the cleanup hook.
 * @param cleanup Function to hand to the host.
 * @returns The unregister function returned by the host, or `undefined` when the
 *   API is missing, when registration threw (the error is logged), or when the
 *   host registered the cleanup but returned nothing to unregister it with.
 */
export function tryRegisterSessionCleanup(pi: ExtensionAPI, cleanup: () => void): (() => void) | undefined {
	const registerFn = (pi as unknown as Record<string, unknown>)["registerSessionResourceCleanup"];
	if (typeof registerFn !== "function") return undefined;

	try {
		// Call it as a method of `pi`: invoking the detached function would lose
		// `this`, and an implementation that relies on it would throw here.
		const unregister: unknown = registerFn.call(pi, cleanup);
		// A non-function result means the cleanup is registered but cannot be unregistered.
		return typeof unregister === "function" ? (unregister as () => void) : undefined;
	} catch (error) {
		logInternalError("session-resources.register", error);
		return undefined;
	}
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
2
|
+
|
|
3
|
+
/**
 * Builds a detached copy of a task for event emission.
 *
 * Top-level fields are copied by spread. `dependsOn` and the listed nested
 * objects/arrays are copied one level deep so that a callback holding the
 * snapshot is not affected by later changes to those containers on the live
 * task. Values nested deeper than that are still shared.
 *
 * @param task Live task state to copy.
 * @returns A copy of `task` with its own first-level containers.
 */
export function snapshotTaskState(task: TeamTaskState): Readonly<TeamTaskState> {
	const {
		dependsOn,
		usage,
		agentProgress,
		heartbeat,
		modelAttempts,
		modelRouting,
		claim,
		checkpoint,
		attempts,
		worktree,
	} = task;

	return {
		...task,
		dependsOn: [...dependsOn],
		usage: usage ? { ...usage } : undefined,
		agentProgress: agentProgress ? { ...agentProgress } : undefined,
		heartbeat: heartbeat ? { ...heartbeat } : undefined,
		modelAttempts: modelAttempts?.map((attempt) => ({ ...attempt })),
		modelRouting: modelRouting ? { ...modelRouting } : undefined,
		claim: claim ? { ...claim } : undefined,
		checkpoint: checkpoint ? { ...checkpoint } : undefined,
		attempts: attempts?.map((attempt) => ({ ...attempt })),
		worktree: worktree ? { ...worktree } : undefined,
	};
}
|
|
22
|
+
|
|
23
|
+
/**
 * Session state snapshot intended for persistence before a session switch.
 * Holds a small summary rather than the full run data.
 */
export interface SessionStateSnapshot {
	/** ISO-8601 timestamp taken when the snapshot was created. */
	capturedAt: string;
	/** IDs of the runs that were active when the snapshot was taken. */
	activeRunIds: string[];
	/** Number of deliveries still pending at snapshot time. */
	pendingDeliveryCount: number;
	/** Session generation counter at snapshot time. */
	sessionGeneration: number;
	/**
	 * Counts keyed by status. `createSessionSnapshot` fills this with the
	 * number of active runs per run status.
	 */
	taskSummary: Record<string, number>;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Builds the snapshot that is persisted before a session switch.
 *
 * @param activeRuns Manifests of the currently active runs.
 * @param pendingDeliveryCount Number of deliveries still pending.
 * @param sessionGeneration Current session generation counter.
 * @returns Snapshot stamped with the current time. `taskSummary` holds the
 *   number of runs per `run.status`.
 */
export function createSessionSnapshot(
	activeRuns: TeamRunManifest[],
	pendingDeliveryCount: number,
	sessionGeneration: number,
): SessionStateSnapshot {
	const activeRunIds: string[] = [];
	const taskSummary: Record<string, number> = {};

	// One pass collects both the run IDs and the per-status tally.
	for (const { runId, status } of activeRuns) {
		activeRunIds.push(runId);
		const seenSoFar = taskSummary[status] ?? 0;
		taskSummary[status] = seenSoFar + 1;
	}

	const capturedAt = new Date().toISOString();
	return { capturedAt, activeRunIds, pendingDeliveryCount, sessionGeneration, taskSummary };
}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as path from "node:path";
|
|
3
|
+
import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
4
|
+
import { checkProcessLiveness } from "./process-status.ts";
|
|
5
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
6
|
+
import { writeAtomicJson } from "../utils/atomic-write.ts";
|
|
7
|
+
|
|
8
|
+
/**
 * Outcome of reconciling one possibly stale run.
 */
export interface ReconcileResult {
	/** ID of the run that was examined. */
	runId: string;
	/**
	 * What the reconciler found:
	 * - `"result_exists"`: every task was already in a terminal status
	 * - `"no_status"`: the manifest had no usable async PID
	 * - `"healthy"`: the PID is alive and the manifest was updated recently enough
	 * - `"pid_dead"`: the recorded PID is no longer alive
	 * - `"pid_alive_stale"`: the PID is alive but the manifest is older than the stale threshold
	 */
	verdict: "healthy" | "result_exists" | "pid_dead" | "pid_alive_stale" | "no_status";
	/** True when the run's active tasks were rewritten as cancelled. */
	repaired: boolean;
	/** Human-readable explanation of the verdict. */
	detail: string;
}
|
|
20
|
+
|
|
21
|
+
/** Age of `manifest.updatedAt` (24 hours, in ms) beyond which a run with a live or missing PID counts as stale. */
const STALE_ALIVE_PID_MS = 24 * 3600_000;
|
|
22
|
+
|
|
23
|
+
/**
 * Phase 1: decide whether the run has in fact already finished.
 *
 * Despite the name, no file is read here: the run counts as finished when
 * there is at least one task and every task is in a terminal status
 * (completed, failed, cancelled or skipped). `manifest` is currently unused,
 * and this phase never repairs anything, so `repaired` is always false.
 *
 * @param manifest Run manifest (not consulted).
 * @param tasks Task states of the run.
 * @returns `found: true` when all tasks are terminal.
 */
function checkResultFile(
	manifest: TeamRunManifest,
	tasks: TeamTaskState[],
): { found: boolean; repaired: boolean } {
	const isTerminal = (task: TeamTaskState): boolean => {
		switch (task.status) {
			case "completed":
			case "failed":
			case "cancelled":
			case "skipped":
				return true;
			default:
				return false;
		}
	};
	// An empty task list is never treated as "finished".
	const found = tasks.length > 0 && tasks.every(isTerminal);
	return { found, repaired: false };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Phase 2: check whether the recorded process is still alive.
 *
 * @param pid PID taken from the manifest, if any.
 * @returns `alive: false` with detail `"no pid recorded"` when `pid` is missing
 *   or not a positive integer; otherwise the `alive`/`detail` pair reported by
 *   `checkProcessLiveness`.
 */
function checkPidLiveness(pid: number | undefined): {
	alive: boolean;
	detail: string;
} {
	const hasUsablePid = pid !== undefined && Number.isInteger(pid) && pid > 0;
	if (!hasUsablePid) {
		return { alive: false, detail: "no pid recorded" };
	}
	const { alive, detail } = checkProcessLiveness(pid);
	return { alive, detail };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Phase 3: decide whether a run should be treated as stale.
 *
 * A dead PID is stale immediately. With a live PID the run is stale only when
 * `manifest.updatedAt` is older than STALE_ALIVE_PID_MS; an unparseable
 * `updatedAt` is reported as not stale.
 *
 * @param manifest Run manifest whose `updatedAt` is inspected.
 * @param pidAlive Result of the liveness check.
 * @param now Current time in epoch milliseconds.
 * @returns The staleness flag plus a short reason code.
 */
function evaluateStaleness(
	manifest: TeamRunManifest,
	pidAlive: boolean,
	now: number,
): { stale: boolean; reason: string } {
	if (!pidAlive) return { stale: true, reason: "pid_dead" };

	const lastUpdateMs = new Date(manifest.updatedAt).getTime();
	if (!Number.isFinite(lastUpdateMs)) return { stale: false, reason: "updated_at_invalid" };

	const ageMs = now - lastUpdateMs;
	const stale = ageMs > STALE_ALIVE_PID_MS;
	// The stale reason carries the age rounded to whole hours.
	const reason = stale ? `alive_but_stale_${Math.round(ageMs / 3600_000)}h` : "alive_and_recent";
	return { stale, reason };
}
|
|
76
|
+
|
|
77
|
+
/**
 * Repairs a stale run by cancelling every task that is still active.
 *
 * Tasks in status running, queued or waiting are replaced by copies marked
 * cancelled, stamped with the current time and an error naming `reason`; all
 * other tasks are returned as the same objects. When the manifest has a
 * `tasksPath`, the resulting list is written there with `writeAtomicJson`; a
 * write failure is logged and does not stop the function. The manifest itself
 * is not modified here.
 *
 * @param manifest Manifest of the stale run.
 * @param tasks Current task states; the array and its items are not mutated.
 * @param reason Short reason code embedded in each cancelled task's error.
 * @returns The task list after repair.
 */
function repairStaleRun(
	manifest: TeamRunManifest,
	tasks: TeamTaskState[],
	reason: string,
): TeamTaskState[] {
	const finishedAt = new Date().toISOString();
	const error = `Stale run reconciled: ${reason}`;

	const repairedTasks = tasks.map((task) => {
		const stillActive = task.status === "running" || task.status === "queued" || task.status === "waiting";
		return stillActive ? { ...task, status: "cancelled" as const, finishedAt, error } : task;
	});

	// Persist the repaired list when the manifest says where the tasks live.
	const { tasksPath } = manifest;
	if (tasksPath) {
		try {
			writeAtomicJson(tasksPath, repairedTasks);
		} catch (writeError) {
			logInternalError("stale-reconciler.repair-tasks", writeError, `runId=${manifest.runId}`);
		}
	}

	return repairedTasks;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Three-phase stale run reconciliation.
 *
 * 1. If every task is already terminal, report `"result_exists"` and change nothing.
 * 2. Check liveness of `manifest.async.pid`. Without a usable PID the run is
 *    repaired only when `manifest.updatedAt` is older than STALE_ALIVE_PID_MS
 *    (verdict `"no_status"` either way).
 * 3. Dead PID → repair immediately; alive PID → repair only if stale > 24h.
 *
 * "Repair" means the run's active tasks are rewritten as cancelled by
 * `repairStaleRun`; the `detail` text reports how many tasks that call newly
 * cancelled.
 *
 * @param manifest Manifest of the run to examine.
 * @param tasks Current task states of the run.
 * @param now Current time in epoch milliseconds (injectable for tests).
 * @returns Verdict, whether a repair was applied, and a human-readable detail.
 */
export function reconcileStaleRun(
	manifest: TeamRunManifest,
	tasks: TeamTaskState[],
	now = Date.now(),
): ReconcileResult {
	const runId = manifest.runId;

	// Tasks that were cancelled before any repair must not be reported as
	// repaired, so the repair count is the increase in cancelled tasks.
	const countCancelled = (list: TeamTaskState[]): number => list.filter((t) => t.status === "cancelled").length;
	const cancelledBefore = countCancelled(tasks);

	// Phase 1: Check if results already exist
	const phase1 = checkResultFile(manifest, tasks);
	if (phase1.found) {
		return {
			runId,
			verdict: "result_exists",
			repaired: false,
			detail: "All tasks already terminal — no repair needed",
		};
	}

	// Phase 2: Check PID liveness
	const pid = manifest.async?.pid;
	const pidStatus = checkPidLiveness(pid);

	// checkPidLiveness signals a missing/invalid PID through this exact detail string.
	if (pidStatus.detail === "no pid recorded") {
		// No async PID — not an async run, check updatedAt staleness
		const updatedAt = new Date(manifest.updatedAt).getTime();
		if (Number.isFinite(updatedAt) && now - updatedAt > STALE_ALIVE_PID_MS) {
			const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
			return {
				runId,
				verdict: "no_status",
				repaired: true,
				detail: `No PID; stale ${Math.round((now - updatedAt) / 3600_000)}h; repaired ${countCancelled(repaired) - cancelledBefore} tasks`,
			};
		}
		return {
			runId,
			verdict: "no_status",
			repaired: false,
			detail: "No PID recorded; not stale enough to repair",
		};
	}

	// Phase 3: Evaluate staleness
	const staleness = evaluateStaleness(manifest, pidStatus.alive, now);
	if (!staleness.stale) {
		return {
			runId,
			verdict: "healthy",
			repaired: false,
			detail: `PID ${pid}: ${pidStatus.detail}, ${staleness.reason}`,
		};
	}

	// Repair
	const repaired = repairStaleRun(manifest, tasks, staleness.reason);
	return {
		runId,
		verdict: pidStatus.alive ? "pid_alive_stale" : "pid_dead",
		repaired: true,
		detail: `PID ${pid}: ${pidStatus.detail}; ${staleness.reason}; repaired ${countCancelled(repaired) - cancelledBefore} tasks`,
	};
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import type { TeamRunManifest } from "../state/types.ts";
|
|
2
|
+
import { appendEvent } from "../state/event-log.ts";
|
|
3
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
4
|
+
|
|
5
|
+
/** Payload stored with a `supervisor.contact` event. */
export interface SupervisorContactPayload {
	/** Run the contacting task belongs to. */
	runId: string;
	/** Task that is contacting its supervisor. */
	taskId: string;
	/** Why the task is reaching out. */
	reason: "decision_needed" | "clarification" | "approval" | "error_escalation" | "custom";
	/** Free-text message from the task. */
	message: string;
	/** Optional structured data accompanying the message. */
	data?: Record<string, unknown>;
	/** ISO-8601 time at which the contact was recorded. */
	timestamp: string;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Appends a `supervisor.contact` event for a child task to the run's event log.
 *
 * The event's data is the given payload plus a timestamp taken at call time.
 * The event's `runId` comes from the manifest. A failure to append is logged
 * and not rethrown.
 *
 * @param manifest Manifest of the run; supplies `eventsPath` and `runId`.
 * @param payload Contact details without the timestamp.
 */
export function recordSupervisorContact(manifest: TeamRunManifest, payload: Omit<SupervisorContactPayload, "timestamp">): void {
	const { runId } = manifest;
	const { taskId } = payload;
	const timestamp = new Date().toISOString();
	const stamped: SupervisorContactPayload = { ...payload, timestamp };

	try {
		appendEvent(manifest.eventsPath, {
			type: "supervisor.contact",
			runId,
			taskId,
			data: stamped as unknown as Record<string, unknown>,
		});
	} catch (error) {
		logInternalError("supervisor-contact.record", error, `runId=${runId} taskId=${taskId}`);
	}
}
|
|
35
|
+
|
|
36
|
+
/**
 * Parse a supervisor contact request from one line of child Pi stdout.
 *
 * A line qualifies when it is a JSON object whose `type` is
 * `"supervisor_contact"` or `"crew_supervisor_contact"`. Fields are normalized:
 * - `taskId`: the string value, or `""` when missing or not a string
 * - `reason`: one of the known reasons, otherwise `"custom"`
 * - `message`: strings are kept as-is; objects and arrays are JSON-serialized
 *   rather than collapsing to "[object Object]"; other values go through
 *   `String()`, with null/undefined becoming `""`
 * - `data`: kept only when it is a plain (non-array) object
 *
 * @param line One stdout line.
 * @returns The normalized contact request, or `undefined` when the line is not one.
 */
export function parseSupervisorContactFromLine(line: string): Omit<SupervisorContactPayload, "timestamp" | "runId"> | undefined {
	// Only a JSON object can qualify, and its text must open with "{". Checking
	// that first keeps ordinary output lines away from a throwing JSON.parse.
	if (!line.trim().startsWith("{")) return undefined;

	let parsed: unknown;
	try {
		parsed = JSON.parse(line);
	} catch {
		return undefined;
	}
	if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return undefined;

	const record = parsed as Record<string, unknown>;
	if (record.type !== "supervisor_contact" && record.type !== "crew_supervisor_contact") return undefined;

	const knownReasons: readonly string[] = ["decision_needed", "clarification", "approval", "error_escalation", "custom"];
	const reason = typeof record.reason === "string" && knownReasons.includes(record.reason)
		? record.reason as SupervisorContactPayload["reason"]
		: "custom";

	const rawMessage = record.message;
	let message: string;
	if (typeof rawMessage === "string") {
		message = rawMessage;
	} else if (rawMessage !== null && typeof rawMessage === "object") {
		// The value came out of JSON.parse, so it serializes back without cycles.
		message = JSON.stringify(rawMessage);
	} else {
		message = String(rawMessage ?? "");
	}

	const rawData = record.data;
	const data = rawData && typeof rawData === "object" && !Array.isArray(rawData)
		? rawData as Record<string, unknown>
		: undefined;

	return {
		taskId: typeof record.taskId === "string" ? record.taskId : "",
		reason,
		message,
		data,
	};
}
|
|
@@ -27,6 +27,7 @@ import { checkpointTask, persistSingleTaskUpdate, updateTask } from "./task-runn
|
|
|
27
27
|
import { cleanResultText, isFinalChildEvent } from "./task-runner/result-utils.ts";
|
|
28
28
|
import { evaluateCompletionMutationGuard } from "./completion-guard.ts";
|
|
29
29
|
import { appendTaskAttentionEvent } from "./attention-events.ts";
|
|
30
|
+
import { parseSupervisorContactFromLine, recordSupervisorContact } from "./supervisor-contact.ts";
|
|
30
31
|
|
|
31
32
|
export interface TaskRunnerInput {
|
|
32
33
|
manifest: TeamRunManifest;
|
|
@@ -44,6 +45,8 @@ export interface TaskRunnerInput {
|
|
|
44
45
|
modelOverride?: string;
|
|
45
46
|
limits?: CrewLimitsConfig;
|
|
46
47
|
dependencyContextText?: string;
|
|
48
|
+
/** Optional callback for JSON events from child Pi. Used for overflow recovery tracking. */
|
|
49
|
+
onJsonEvent?: (taskId: string, runId: string, event: unknown) => void;
|
|
47
50
|
}
|
|
48
51
|
|
|
49
52
|
export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: TeamRunManifest; tasks: TeamTaskState[] }> {
|
|
@@ -154,12 +157,23 @@ export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: T
|
|
|
154
157
|
onStdoutLine: (line) => {
|
|
155
158
|
appendCrewAgentOutput(manifest, task.id, line);
|
|
156
159
|
persistHeartbeat();
|
|
160
|
+
// Check for supervisor contact requests from child Pi
|
|
161
|
+
const contact = parseSupervisorContactFromLine(line);
|
|
162
|
+
if (contact) {
|
|
163
|
+
recordSupervisorContact(manifest, { runId: manifest.runId, ...contact });
|
|
164
|
+
}
|
|
157
165
|
},
|
|
158
166
|
onJsonEvent: (event) => {
|
|
159
167
|
appendCrewAgentEvent(manifest, task.id, event);
|
|
160
168
|
persistHeartbeat();
|
|
161
169
|
task = { ...task, agentProgress: applyAgentProgressEvent(task.agentProgress ?? emptyCrewAgentProgress(), event, task.startedAt) };
|
|
162
170
|
tasks = updateTask(tasks, task);
|
|
171
|
+
// Feed overflow recovery tracker
|
|
172
|
+
if (input.onJsonEvent) {
|
|
173
|
+
try {
|
|
174
|
+
input.onJsonEvent(task.id, manifest.runId, event);
|
|
175
|
+
} catch { /* overflow tracking errors should not affect task */ }
|
|
176
|
+
}
|
|
163
177
|
if (!finalCheckpointWritten && isFinalChildEvent(event)) {
|
|
164
178
|
finalCheckpointWritten = true;
|
|
165
179
|
({ task, tasks } = checkpointTask(manifest, tasks, task, "child-stdout-final"));
|
|
@@ -43,6 +43,8 @@ export interface ExecuteTeamRunInput {
|
|
|
43
43
|
signal?: AbortSignal;
|
|
44
44
|
reliability?: CrewReliabilityConfig;
|
|
45
45
|
metricRegistry?: MetricRegistry;
|
|
46
|
+
/** Optional callback for JSON events from child Pi. Used for overflow recovery tracking. */
|
|
47
|
+
onJsonEvent?: (taskId: string, runId: string, event: unknown) => void;
|
|
46
48
|
}
|
|
47
49
|
|
|
48
50
|
function findStep(workflow: WorkflowConfig, task: TeamTaskState): WorkflowStep {
|
|
@@ -68,7 +70,7 @@ function mergeArtifacts(items: ArtifactDescriptor[]): ArtifactDescriptor[] {
|
|
|
68
70
|
}
|
|
69
71
|
|
|
70
72
|
function isNonTerminalTaskStatus(status: TeamTaskState["status"]): boolean {
|
|
71
|
-
return status === "queued" || status === "running";
|
|
73
|
+
return status === "queued" || status === "running" || status === "waiting";
|
|
72
74
|
}
|
|
73
75
|
|
|
74
76
|
function shouldMergeTaskUpdate(current: TeamTaskState, updated: TeamTaskState): boolean {
|
|
@@ -483,7 +485,7 @@ function ensurePlanApprovalRequested(manifest: TeamRunManifest, tasks: TeamTaskS
|
|
|
483
485
|
}
|
|
484
486
|
|
|
485
487
|
function cancelPlanTasks(tasks: TeamTaskState[], reason: string): TeamTaskState[] {
|
|
486
|
-
return tasks.map((task) => task.status === "queued" || task.status === "running" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: reason, graph: task.graph ? { ...task.graph, queue: "done" } : undefined } : task);
|
|
488
|
+
return tasks.map((task) => task.status === "queued" || task.status === "running" || task.status === "waiting" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: reason, graph: task.graph ? { ...task.graph, queue: "done" } : undefined } : task);
|
|
487
489
|
}
|
|
488
490
|
|
|
489
491
|
function hasPendingMutatingAdaptiveTask(tasks: TeamTaskState[]): boolean {
|
|
@@ -533,7 +535,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
533
535
|
|
|
534
536
|
while (tasks.some((task) => task.status === "queued")) {
|
|
535
537
|
if (input.signal?.aborted) {
|
|
536
|
-
tasks = tasks.map((task) => task.status === "queued" || task.status === "running" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: "Run cancelled." } : task);
|
|
538
|
+
tasks = tasks.map((task) => task.status === "queued" || task.status === "running" || task.status === "waiting" ? { ...task, status: "cancelled", finishedAt: new Date().toISOString(), error: "Run cancelled." } : task);
|
|
537
539
|
await saveRunTasksAsync(manifest, tasks);
|
|
538
540
|
manifest = updateRunStatus(manifest, "cancelled", "Run cancelled.");
|
|
539
541
|
return { manifest, tasks };
|
|
@@ -579,7 +581,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
579
581
|
async (task) => {
|
|
580
582
|
const step = findStep(workflow, task);
|
|
581
583
|
const agent = findAgent(input.agents, task);
|
|
582
|
-
const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits };
|
|
584
|
+
const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits, onJsonEvent: input.onJsonEvent };
|
|
583
585
|
if (input.reliability?.autoRetry !== true) return withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
|
|
584
586
|
let lastFailed: { manifest: TeamRunManifest; tasks: TeamTaskState[] } | undefined;
|
|
585
587
|
const attemptsSoFar: TaskAttemptState[] = [...(task.attempts ?? [])];
|
|
@@ -58,6 +58,7 @@ export const AgentOverrideSchema = Type.Object({
|
|
|
58
58
|
fallbackModels: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
|
|
59
59
|
thinking: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
|
|
60
60
|
tools: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
|
|
61
|
+
skills: Type.Optional(Type.Union([Type.Array(Type.String({ minLength: 1 })), Type.Literal(false)])),
|
|
61
62
|
}, { additionalProperties: false });
|
|
62
63
|
|
|
63
64
|
export const PiTeamsAgentsConfigSchema = Type.Object({
|
|
@@ -24,6 +24,7 @@ export const TeamToolParams = Type.Object({
|
|
|
24
24
|
Type.Literal("get"),
|
|
25
25
|
Type.Literal("cancel"),
|
|
26
26
|
Type.Literal("resume"),
|
|
27
|
+
Type.Literal("respond"),
|
|
27
28
|
Type.Literal("create"),
|
|
28
29
|
Type.Literal("update"),
|
|
29
30
|
Type.Literal("delete"),
|
|
@@ -59,6 +60,8 @@ export const TeamToolParams = Type.Object({
|
|
|
59
60
|
goal: Type.Optional(Type.String({ description: "High-level objective for a team run." })),
|
|
60
61
|
task: Type.Optional(Type.String({ description: "Concrete task text for direct role/agent execution." })),
|
|
61
62
|
runId: Type.Optional(Type.String({ description: "Run ID for status, cancel, or resume." })),
|
|
63
|
+
taskId: Type.Optional(Type.String({ description: "Task ID for respond action." })),
|
|
64
|
+
message: Type.Optional(Type.String({ description: "Message for respond action." })),
|
|
62
65
|
async: Type.Optional(Type.Boolean({ description: "Run in background when execution support is enabled." })),
|
|
63
66
|
workspaceMode: Type.Optional(Type.Union([
|
|
64
67
|
Type.Literal("single"),
|
|
@@ -85,7 +88,7 @@ export const TeamToolParams = Type.Object({
|
|
|
85
88
|
});
|
|
86
89
|
|
|
87
90
|
export interface TeamToolParamsValue {
|
|
88
|
-
action?: "run" | "plan" | "status" | "list" | "get" | "cancel" | "resume" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings";
|
|
91
|
+
action?: "run" | "plan" | "status" | "list" | "get" | "cancel" | "resume" | "respond" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings";
|
|
89
92
|
resource?: "agent" | "team" | "workflow";
|
|
90
93
|
team?: string;
|
|
91
94
|
workflow?: string;
|
|
@@ -94,6 +97,8 @@ export interface TeamToolParamsValue {
|
|
|
94
97
|
goal?: string;
|
|
95
98
|
task?: string;
|
|
96
99
|
runId?: string;
|
|
100
|
+
taskId?: string;
|
|
101
|
+
message?: string;
|
|
97
102
|
async?: boolean;
|
|
98
103
|
workspaceMode?: "single" | "worktree";
|
|
99
104
|
context?: "fresh" | "fork";
|
package/src/state/contracts.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export const TEAM_RUN_STATUSES = ["queued", "planning", "running", "blocked", "completed", "failed", "cancelled"] as const;
|
|
2
2
|
export type TeamRunStatus = typeof TEAM_RUN_STATUSES[number];
|
|
3
3
|
|
|
4
|
-
export const TEAM_TASK_STATUSES = ["queued", "running", "completed", "failed", "cancelled", "skipped"] as const;
|
|
4
|
+
export const TEAM_TASK_STATUSES = ["queued", "running", "waiting", "completed", "failed", "cancelled", "skipped"] as const;
|
|
5
5
|
export type TeamTaskStatus = typeof TEAM_TASK_STATUSES[number];
|
|
6
6
|
|
|
7
7
|
export const TEAM_TERMINAL_RUN_STATUSES: ReadonlySet<TeamRunStatus> = new Set(["blocked", "completed", "failed", "cancelled"]);
|
|
@@ -19,7 +19,8 @@ export const TEAM_RUN_STATUS_TRANSITIONS: Readonly<Record<TeamRunStatus, readonl
|
|
|
19
19
|
|
|
20
20
|
export const TEAM_TASK_STATUS_TRANSITIONS: Readonly<Record<TeamTaskStatus, readonly TeamTaskStatus[]>> = {
|
|
21
21
|
queued: ["running", "cancelled", "skipped", "failed"],
|
|
22
|
-
running: ["completed", "failed", "cancelled", "queued"],
|
|
22
|
+
running: ["completed", "failed", "cancelled", "queued", "waiting"],
|
|
23
|
+
waiting: ["running", "completed", "failed", "cancelled"],
|
|
23
24
|
completed: ["queued"],
|
|
24
25
|
failed: ["queued", "cancelled"],
|
|
25
26
|
cancelled: ["queued"],
|
|
@@ -59,6 +60,9 @@ export const TEAM_EVENT_TYPES = [
|
|
|
59
60
|
"async.completed",
|
|
60
61
|
"async.failed",
|
|
61
62
|
"async.stale",
|
|
63
|
+
"task.waiting",
|
|
64
|
+
"task.resumed",
|
|
65
|
+
"supervisor.contact",
|
|
62
66
|
] as const;
|
|
63
67
|
export type TeamEventType = typeof TEAM_EVENT_TYPES[number];
|
|
64
68
|
|
package/src/ui/crew-widget.ts
CHANGED
|
@@ -100,12 +100,12 @@ function agentsFor(run: TeamRunManifest): CrewAgentRecord[] {
|
|
|
100
100
|
}
|
|
101
101
|
}
|
|
102
102
|
|
|
103
|
-
export function activeWidgetRuns(cwd: string, manifestCache?: ManifestCache, snapshotCache?: RunSnapshotCache): WidgetRun[] {
|
|
104
|
-
const runs = manifestCache ? manifestCache.list(20) : listRecentRuns(cwd, 20);
|
|
103
|
+
export function activeWidgetRuns(cwd: string, manifestCache?: ManifestCache, snapshotCache?: RunSnapshotCache, preloadedManifests?: TeamRunManifest[]): WidgetRun[] {
|
|
104
|
+
const runs = preloadedManifests ?? (manifestCache ? manifestCache.list(20) : listRecentRuns(cwd, 20));
|
|
105
105
|
return runs
|
|
106
106
|
.map((run) => {
|
|
107
107
|
try {
|
|
108
|
-
const snapshot = snapshotCache?.refreshIfStale(run.runId);
|
|
108
|
+
const snapshot = snapshotCache?.get(run.runId) ?? snapshotCache?.refreshIfStale(run.runId);
|
|
109
109
|
return snapshot ? { run: snapshot.manifest, agents: snapshot.agents, snapshot } : { run, agents: agentsFor(run) };
|
|
110
110
|
} catch {
|
|
111
111
|
return { run, agents: agentsFor(run) };
|
|
@@ -279,11 +279,12 @@ export function updateCrewWidget(
|
|
|
279
279
|
config?: CrewUiConfig,
|
|
280
280
|
manifestCache?: ManifestCache,
|
|
281
281
|
snapshotCache?: RunSnapshotCache,
|
|
282
|
+
preloadedManifests?: TeamRunManifest[],
|
|
282
283
|
): void {
|
|
283
284
|
if (!ctx.hasUI) return;
|
|
284
285
|
state.frame += 1;
|
|
285
286
|
const maxLines = config?.widgetMaxLines ?? MAX_LINES_DEFAULT;
|
|
286
|
-
const runs = activeWidgetRuns(ctx.cwd, manifestCache, snapshotCache);
|
|
287
|
+
const runs = activeWidgetRuns(ctx.cwd, manifestCache, snapshotCache, preloadedManifests);
|
|
287
288
|
const lines = buildCrewWidgetLines(ctx.cwd, state.frame, maxLines, runs, state.notificationCount ?? 0);
|
|
288
289
|
const placement = config?.widgetPlacement ?? "aboveEditor";
|
|
289
290
|
ctx.ui.setStatus(STATUS_KEY, lines.length ? statusSummary(runs) : undefined);
|