pi-crew 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/package.json +1 -1
- package/src/agents/discover-agents.ts +354 -15
- package/src/config/config.ts +732 -208
- package/src/config/types.ts +34 -5
- package/src/extension/help.ts +1 -0
- package/src/extension/register.ts +1173 -257
- package/src/extension/registration/commands.ts +15 -2
- package/src/extension/registration/team-tool.ts +1 -1
- package/src/extension/session-summary.ts +11 -1
- package/src/extension/team-tool/api.ts +4 -1
- package/src/extension/team-tool/cache-control.ts +23 -0
- package/src/extension/team-tool/cancel.ts +15 -5
- package/src/extension/team-tool/context.ts +2 -0
- package/src/extension/team-tool/handle-settings.ts +2 -0
- package/src/extension/team-tool/health-monitor.ts +563 -0
- package/src/extension/team-tool/inspect.ts +10 -3
- package/src/extension/team-tool/respond.ts +5 -2
- package/src/extension/team-tool/status.ts +4 -1
- package/src/extension/team-tool-types.ts +2 -0
- package/src/extension/team-tool.ts +901 -177
- package/src/runtime/adaptive-plan.ts +1 -1
- package/src/runtime/foreground-watchdog.ts +129 -0
- package/src/runtime/manifest-cache.ts +4 -2
- package/src/runtime/run-tracker.ts +11 -0
- package/src/runtime/runtime-policy.ts +15 -2
- package/src/runtime/skill-instructions.ts +8 -2
- package/src/runtime/stale-reconciler.ts +322 -18
- package/src/runtime/task-packet.ts +48 -1
- package/src/runtime/task-runner.ts +6 -1
- package/src/schema/config-schema.ts +1 -0
- package/src/schema/team-tool-schema.ts +204 -76
- package/src/state/state-store.ts +9 -1
- package/src/teams/discover-teams.ts +2 -1
- package/src/ui/run-event-bus.ts +2 -1
- package/src/ui/settings-overlay.ts +2 -0
- package/src/workflows/discover-workflows.ts +5 -1
|
@@ -263,7 +263,7 @@ export interface InjectAdaptivePlanResult {
|
|
|
263
263
|
export function injectAdaptivePlanIfReady(input: InjectAdaptivePlanInput): InjectAdaptivePlanResult {
|
|
264
264
|
if (input.workflow.name !== "implementation") return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
|
|
265
265
|
if (input.tasks.some((task) => task.stepId?.startsWith("adaptive-"))) return { tasks: input.tasks, workflow: reconstructAdaptiveWorkflow(input.workflow, input.tasks), injected: false, missingPlan: false };
|
|
266
|
-
const completedAssess = input.tasks.find((task) => task.stepId === "assess" && task.status === "completed");
|
|
266
|
+
const completedAssess = input.tasks.find((task) => task.stepId === "assess" && (task.status === "completed" || task.status === "needs_attention"));
|
|
267
267
|
if (!completedAssess) return { tasks: input.tasks, workflow: input.workflow, injected: false, missingPlan: false };
|
|
268
268
|
if (!completedAssess.resultArtifact?.path) {
|
|
269
269
|
appendEvent(input.manifest.eventsPath, { type: "adaptive.plan_missing", runId: input.manifest.runId, taskId: completedAssess.id, message: "Adaptive planner result artifact is missing." });
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Foreground run watchdog — periodically checks that active foreground runs
|
|
3
|
+
* are making progress and auto-notifies the assistant if a run appears hung.
|
|
4
|
+
*
|
|
5
|
+
* Problem: foreground runs run in background via startForegroundRun(). The Pi
|
|
6
|
+
* assistant has no way to know when a run completes or gets stuck without
|
|
7
|
+
* manual polling. This watchdog monitors active runs and:
|
|
8
|
+
*
|
|
9
|
+
* 1. Detects hung runs (active status, no heartbeat update for >10 min)
|
|
10
|
+
* 2. Injects a followUp message via pi.sendUserMessage() so the assistant
|
|
11
|
+
* is automatically notified — no manual sleep+check needed.
|
|
12
|
+
* 3. Cleans up after itself when the run completes or the session ends.
|
|
13
|
+
*/
|
|
14
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
15
|
+
import { loadRunManifestById } from "../state/state-store.ts";
|
|
16
|
+
import { readCrewAgents } from "./crew-agent-records.ts";
|
|
17
|
+
import { isActiveRunStatus, isLikelyOrphanedActiveRun } from "./process-status.ts";
|
|
18
|
+
|
|
19
|
+
/** Inputs accepted by startForegroundWatchdog(). */
export interface WatchdogOptions {
	/** Extension API handle; the watchdog delivers notifications through `pi.sendUserMessage()`. */
	pi: ExtensionAPI;
	/** Working directory handed to `loadRunManifestById()` when the run manifest is reloaded. */
	cwd: string;
	/** Identifier of the run to monitor; also the key under which the timer is tracked. */
	runId: string;
	/** Delay between checks, in milliseconds. Defaults to 5 minutes. */
	checkIntervalMs?: number;
	/** Upper bound on total monitoring time, in milliseconds. Defaults to 2 hours. */
	maxMonitorMs?: number;
}
|
|
28
|
+
|
|
29
|
+
/** Default delay between watchdog checks: 5 minutes. */
const DEFAULT_CHECK_INTERVAL_MS = 5 * 60 * 1000;
/** Default total monitoring window: 2 hours. */
const DEFAULT_MAX_MONITOR_MS = 2 * 60 * 60 * 1000;

/** Pending watchdog timers, keyed by runId so they can be cancelled individually or all at once. */
const activeWatchdogs: Map<string, ReturnType<typeof setTimeout>> = new Map();
|
|
34
|
+
|
|
35
|
+
/**
 * Cancel the watchdog registered for `runId`, if there is one.
 *
 * Clears the pending timer and removes its entry from the registry.
 * Does nothing when no watchdog is registered for that run.
 */
export function stopWatchdog(runId: string): void {
	const pending = activeWatchdogs.get(runId);
	if (!pending) return;
	clearTimeout(pending);
	activeWatchdogs.delete(runId);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Cancel every registered watchdog and empty the registry.
 * Intended to be called on session shutdown.
 */
export function stopAllWatchdogs(): void {
	// Only the timer handles are needed; the runId keys are irrelevant here.
	for (const timer of activeWatchdogs.values()) {
		clearTimeout(timer);
	}
	activeWatchdogs.clear();
}
|
|
51
|
+
|
|
52
|
+
/**
 * Start a periodic watchdog for a foreground run.
 *
 * Every `checkIntervalMs` (first check after one full interval) the run
 * manifest is reloaded and inspected:
 *
 * - manifest not found: the watchdog stops silently;
 * - status no longer active (per `isActiveRunStatus`): one follow-up message
 *   announcing the final status is sent and the watchdog stops;
 * - run flagged by `isLikelyOrphanedActiveRun`: an "appears hung" follow-up
 *   message is sent and monitoring continues, so the assistant or user can
 *   intervene. The exact hang criteria live in that helper.
 *
 * The watchdog also stops once more than `maxMonitorMs` has elapsed since it
 * was started, or when its timer is cleared via stopWatchdog() /
 * stopAllWatchdogs(). Starting a watchdog for a runId that already has one is
 * a no-op.
 *
 * A hung run is reported once per distinct (status, updatedAt) pair rather
 * than on every interval, so an unchanged hung run does not flood the
 * conversation with identical follow-ups. It is reported again if the
 * manifest changes and the run is still (or again) considered hung.
 *
 * Errors raised while loading or inspecting the run are swallowed; the check
 * is simply retried at the next interval.
 *
 * @param opts - Run to monitor plus optional interval/duration overrides.
 */
export function startForegroundWatchdog(opts: WatchdogOptions): void {
	const { pi, cwd, runId } = opts;
	const checkIntervalMs = opts.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS;
	const maxMonitorMs = opts.maxMonitorMs ?? DEFAULT_MAX_MONITOR_MS;
	const startTime = Date.now();

	// Don't stack watchdogs for the same run.
	if (activeWatchdogs.has(runId)) return;

	// Identifies the manifest state most recently reported as hung, so the
	// same state is not announced again on the next interval.
	let lastHungReport: string | undefined;

	// Notifications are best-effort: a failing send must never break the loop.
	const notify = (message: string): void => {
		try {
			pi.sendUserMessage(message, { deliverAs: "followUp" });
		} catch {
			/* non-critical */
		}
	};

	const scheduleNext = (): void => {
		const timer = setTimeout(check, checkIntervalMs);
		timer.unref(); // Don't prevent process exit
		activeWatchdogs.set(runId, timer);
	};

	function check(): void {
		// Give up once the monitoring window is exhausted.
		if (Date.now() - startTime > maxMonitorMs) {
			activeWatchdogs.delete(runId);
			return;
		}

		try {
			const loaded = loadRunManifestById(cwd, runId);
			if (!loaded) {
				// Run not found — stop watchdog
				activeWatchdogs.delete(runId);
				return;
			}

			const { manifest } = loaded;

			// Terminal status — send completion notification and stop
			if (!isActiveRunStatus(manifest.status)) {
				const teamName = manifest.team ?? "unknown";
				notify(
					`pi-crew run ${manifest.status}: ${runId} (${teamName}/${manifest.workflow ?? "default"})`,
				);
				activeWatchdogs.delete(runId);
				return;
			}

			// Still active: check whether the run appears hung.
			const agents = readCrewAgents(manifest);
			if (isLikelyOrphanedActiveRun(manifest, agents, Date.now())) {
				const reportKey = `${manifest.status}|${manifest.updatedAt}`;
				if (reportKey !== lastHungReport) {
					lastHungReport = reportKey;
					const detail = `status=${manifest.status}, updatedAt=${manifest.updatedAt}, agents=${agents.length}`;
					notify(
						`pi-crew watchdog: run ${runId} appears hung (${detail}). Consider running team action='cancel' runId='${runId}' or team action='doctor'.`,
					);
				}
				// Don't stop — keep monitoring. The assistant or user may intervene.
			} else {
				// Run looks healthy again; a later hang should be reported anew.
				lastHungReport = undefined;
			}
		} catch {
			// Non-critical — skip this check
		}

		scheduleNext();
	}

	// First check after initial interval
	scheduleNext();
}
|
|
@@ -108,8 +108,10 @@ function parseManifestIfChanged(root: string, runId: string, filePath: string, p
|
|
|
108
108
|
|
|
109
109
|
/**
 * List the directories that may contain run state for `cwd`.
 *
 * The user-level runs directory is always included (per the original note,
 * fast-fix, direct-agent and similar runs write there). The project-level
 * runs directory is added only when `findRepoRoot(cwd)` finds a repository.
 * Duplicate paths are collapsed; the user-level root comes first.
 */
function listRunRoots(cwd: string): string[] {
	const runsSubdir = DEFAULT_PATHS.state.runsSubdir;
	const unique = new Set<string>([path.join(userCrewRoot(), runsSubdir)]);
	if (findRepoRoot(cwd)) {
		unique.add(path.join(projectCrewRoot(cwd), runsSubdir));
	}
	return Array.from(unique);
}
|
|
115
117
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
2
|
+
import * as fs from "node:fs";
|
|
3
|
+
import * as path from "node:path";
|
|
2
4
|
import { loadRunManifestById } from "../state/state-store.ts";
|
|
3
5
|
import { isFinishedRunStatus } from "./process-status.ts";
|
|
4
6
|
|
|
@@ -75,6 +77,15 @@ export async function waitForRun(
|
|
|
75
77
|
// Slow path: background run — poll with exponential backoff capped at pollIntervalMs
|
|
76
78
|
let attempt = 0;
|
|
77
79
|
while (Date.now() < deadline) {
|
|
80
|
+
if (attempt === 0) {
|
|
81
|
+
// Early exit: if the run directory doesn't exist, don't waste time polling
|
|
82
|
+
const runDir = path.join(cwd, ".crew", "state", "runs", runId);
|
|
83
|
+
if (!fs.existsSync(runDir)) {
|
|
84
|
+
throw new Error(
|
|
85
|
+
`Run ${runId} not found. No run directory at ${runDir}`,
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
}
|
|
78
89
|
const fresh = loadRunManifestById(cwd, runId);
|
|
79
90
|
if (fresh && isFinishedRunStatus(fresh.manifest.status)) {
|
|
80
91
|
return fresh;
|
|
@@ -9,12 +9,25 @@ import { currentCrewDepth } from "./pi-args.ts";
|
|
|
9
9
|
* - If the role appears in `isolationPolicy.isolatedRoles`, use child-process (crash isolation).
|
|
10
10
|
* - Otherwise, use `isolationPolicy.defaultRuntime` when configured, then fall back to globalKind.
|
|
11
11
|
*/
|
|
12
|
-
export function resolveTaskRuntimeKind(
|
|
12
|
+
export function resolveTaskRuntimeKind(
|
|
13
|
+
globalKind: CrewRuntimeKind,
|
|
14
|
+
role: string,
|
|
15
|
+
isolationPolicy: CrewRuntimeConfig["isolationPolicy"],
|
|
16
|
+
env: NodeJS.ProcessEnv = process.env,
|
|
17
|
+
): CrewRuntimeKind {
|
|
13
18
|
if (globalKind === "scaffold") return "scaffold";
|
|
14
19
|
// Safety: when already inside a pi-crew worker (depth > 0), never nest live-session.
|
|
15
20
|
// Live-session creates in-process Pi agent sessions, which would recursively
|
|
16
21
|
// try to use pi-crew, leading to "Cannot read properties of undefined" errors.
|
|
17
|
-
|
|
22
|
+
// Exception: when PI_CREW_MOCK_LIVE_SESSION is set, we're in a test harness
|
|
23
|
+
// that mocks the live-session path — forcing child-process would spawn a real
|
|
24
|
+
// pi process and hang the test.
|
|
25
|
+
if (
|
|
26
|
+
globalKind === "live-session" &&
|
|
27
|
+
currentCrewDepth(env) > 0 &&
|
|
28
|
+
env.PI_CREW_MOCK_LIVE_SESSION !== "success"
|
|
29
|
+
)
|
|
30
|
+
return "child-process";
|
|
18
31
|
const isolatedRoles = isolationPolicy?.isolatedRoles ?? [];
|
|
19
32
|
if (isolatedRoles.includes(role)) return "child-process";
|
|
20
33
|
return isolationPolicy?.defaultRuntime ?? globalKind;
|
|
@@ -91,10 +91,16 @@ export function resolveTaskSkillNames(input: ResolveTaskSkillsInput): string[] {
|
|
|
91
91
|
return collectTaskSkillNames(input).slice(0, MAX_SELECTED_SKILLS);
|
|
92
92
|
}
|
|
93
93
|
|
|
94
|
+
/**
 * Skill directories to search, in priority order.
 *
 * SEC-003 (see SECURITY-ISSUES.md): the package skills directory is listed
 * before the project's `<cwd>/skills` directory. According to the original
 * change note this ordering is meant to keep project-supplied skills from
 * overriding trusted package skills; the lookup that consumes this list is
 * not part of this block.
 */
function candidateSkillDirs(cwd: string): Array<{ root: string; source: "project" | "package" }> {
	// Trusted, shipped with the package — searched first.
	const packageDir = { root: PACKAGE_SKILLS_DIR, source: "package" as const };
	// Supplied by the project being worked on — searched second.
	const projectDir = { root: path.resolve(cwd, "skills"), source: "project" as const };
	return [packageDir, projectDir];
}
|
|
100
106
|
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
|
+
import * as os from "node:os";
|
|
2
3
|
import * as path from "node:path";
|
|
3
4
|
import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
4
|
-
import { checkProcessLiveness } from "./process-status.ts";
|
|
5
5
|
import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
|
|
6
|
+
import { checkProcessLiveness } from "./process-status.ts";
|
|
7
|
+
|
|
8
|
+
/** Minimum age (1 hour, in milliseconds) before an orphaned temp workspace directory may be removed. */
const ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS = 3_600_000;
|
|
6
10
|
|
|
7
11
|
/**
|
|
8
12
|
* Result of reconciling a single stale run.
|
|
@@ -10,7 +14,12 @@ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
|
|
|
10
14
|
export interface ReconcileResult {
|
|
11
15
|
runId: string;
|
|
12
16
|
/** What was found and what action was taken */
|
|
13
|
-
verdict:
|
|
17
|
+
verdict:
|
|
18
|
+
| "healthy"
|
|
19
|
+
| "result_exists"
|
|
20
|
+
| "pid_dead"
|
|
21
|
+
| "pid_alive_stale"
|
|
22
|
+
| "no_status";
|
|
14
23
|
/** Whether repair was applied */
|
|
15
24
|
repaired: boolean;
|
|
16
25
|
/** Human-readable detail */
|
|
@@ -21,6 +30,8 @@ export interface ReconcileResult {
|
|
|
21
30
|
|
|
22
31
|
/** Age of the manifest's last update beyond which a run is treated as stale: 24 hours. */
const STALE_ALIVE_PID_MS = 86_400_000;
/** Window within which a task heartbeat or progress update counts as recent activity: 5 minutes. */
const ACTIVE_EVIDENCE_TTL_MS = 300_000;
/**
 * For no-PID runs, repair when ALL running tasks have heartbeats stale beyond
 * this threshold: 5 minutes. The original note says this equals the
 * heartbeat-gradient `deadMs` value, which is defined outside this file.
 */
const NO_PID_HEARTBEAT_STALE_MS = 300_000;
|
|
24
35
|
|
|
25
36
|
/**
|
|
26
37
|
* Phase 1: Check if a result file already exists for the run.
|
|
@@ -31,14 +42,28 @@ function checkResultFile(
|
|
|
31
42
|
tasks: TeamTaskState[],
|
|
32
43
|
): { found: boolean; repaired: boolean } {
|
|
33
44
|
// Check if all tasks already have terminal status (result was written but manifest wasn't updated)
|
|
34
|
-
const allTerminal =
|
|
35
|
-
|
|
36
|
-
|
|
45
|
+
const allTerminal =
|
|
46
|
+
tasks.length > 0 &&
|
|
47
|
+
tasks.every(
|
|
48
|
+
(t) =>
|
|
49
|
+
t.status === "completed" ||
|
|
50
|
+
t.status === "failed" ||
|
|
51
|
+
t.status === "cancelled" ||
|
|
52
|
+
t.status === "skipped" ||
|
|
53
|
+
t.status === "needs_attention",
|
|
54
|
+
);
|
|
37
55
|
if (allTerminal) {
|
|
38
56
|
// Sync agent records even when tasks are already terminal
|
|
39
57
|
// (e.g., a previous reconcile fixed tasks but crashed before updating agents)
|
|
40
58
|
for (const task of tasks) {
|
|
41
|
-
try {
|
|
59
|
+
try {
|
|
60
|
+
upsertCrewAgent(
|
|
61
|
+
manifest,
|
|
62
|
+
recordFromTask(manifest, task, "scaffold"),
|
|
63
|
+
);
|
|
64
|
+
} catch {
|
|
65
|
+
/* non-critical */
|
|
66
|
+
}
|
|
42
67
|
}
|
|
43
68
|
return { found: true, repaired: false };
|
|
44
69
|
}
|
|
@@ -52,7 +77,10 @@ function checkResultFile(
|
|
|
52
77
|
* written, treat the PID as alive even if process.kill returns false
|
|
53
78
|
* (handles SIGKILL race where PID hasn't been recycled yet).
|
|
54
79
|
*/
|
|
55
|
-
function checkPidLiveness(
|
|
80
|
+
function checkPidLiveness(
|
|
81
|
+
pid: number | undefined,
|
|
82
|
+
stateRoot?: string,
|
|
83
|
+
): {
|
|
56
84
|
alive: boolean;
|
|
57
85
|
detail: string;
|
|
58
86
|
} {
|
|
@@ -67,13 +95,18 @@ function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
|
|
|
67
95
|
const heartbeatPath = path.join(stateRoot, "heartbeat.json");
|
|
68
96
|
try {
|
|
69
97
|
if (fs.existsSync(heartbeatPath)) {
|
|
70
|
-
const hb = JSON.parse(
|
|
98
|
+
const hb = JSON.parse(
|
|
99
|
+
fs.readFileSync(heartbeatPath, "utf-8"),
|
|
100
|
+
) as { pid?: number; at?: number };
|
|
71
101
|
if (hb?.pid === pid && hb?.at) {
|
|
72
102
|
const ageMs = Date.now() - hb.at;
|
|
73
103
|
// Heartbeat written < 5 min ago → process was alive recently.
|
|
74
104
|
// Don't repair yet; let the next reconciliation cycle catch it.
|
|
75
105
|
if (ageMs < 5 * 60_000) {
|
|
76
|
-
return {
|
|
106
|
+
return {
|
|
107
|
+
alive: true,
|
|
108
|
+
detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old`,
|
|
109
|
+
};
|
|
77
110
|
}
|
|
78
111
|
}
|
|
79
112
|
}
|
|
@@ -101,18 +134,76 @@ function evaluateStaleness(
|
|
|
101
134
|
return { stale: false, reason: "updated_at_invalid" };
|
|
102
135
|
}
|
|
103
136
|
if (now - updatedAt > STALE_ALIVE_PID_MS) {
|
|
104
|
-
return {
|
|
137
|
+
return {
|
|
138
|
+
stale: true,
|
|
139
|
+
reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h`,
|
|
140
|
+
};
|
|
105
141
|
}
|
|
106
142
|
return { stale: false, reason: "alive_and_recent" };
|
|
107
143
|
}
|
|
108
144
|
|
|
109
145
|
/**
 * Report whether any running or waiting task shows a recent sign of life.
 *
 * A task counts as evidence when either:
 * - its heartbeat is not explicitly marked dead (`alive !== false`) and
 *   `lastSeenAt` is a valid timestamp within ACTIVE_EVIDENCE_TTL_MS of `now`, or
 * - its `agentProgress.lastActivityAt` is a valid timestamp within the same window.
 *
 * Tasks in any other status are ignored.
 *
 * @param tasks - Task states of the run under inspection.
 * @param now - Reference time in epoch milliseconds.
 * @returns true as soon as one qualifying task is found, otherwise false.
 */
function hasRecentActiveEvidence(tasks: TeamTaskState[], now: number): boolean {
	for (const task of tasks) {
		const inFlight = task.status === "running" || task.status === "waiting";
		if (!inFlight) continue;

		// Missing or unparsable timestamps become NaN and fail the isFinite checks.
		const seenAt = task.heartbeat?.lastSeenAt;
		const heartbeatMs = seenAt ? new Date(seenAt).getTime() : Number.NaN;
		const heartbeatFresh =
			Number.isFinite(heartbeatMs) && now - heartbeatMs <= ACTIVE_EVIDENCE_TTL_MS;
		if (heartbeatFresh && task.heartbeat?.alive !== false) return true;

		const activeAt = task.agentProgress?.lastActivityAt;
		const activityMs = activeAt ? new Date(activeAt).getTime() : Number.NaN;
		if (Number.isFinite(activityMs) && now - activityMs <= ACTIVE_EVIDENCE_TTL_MS) {
			return true;
		}
	}
	return false;
}
|
|
167
|
+
|
|
168
|
+
/**
 * For no-PID runs: decide whether every running or waiting task has gone
 * quiet for longer than NO_PID_HEARTBEAT_STALE_MS.
 *
 * Per the original note, this targets zombie tasks whose worker died without
 * a PID ever being recorded (e.g. live-session temp workspaces).
 *
 * A task is considered stale when it has at least one valid timestamp
 * (heartbeat `lastSeenAt` or agent-progress `lastActivityAt`) and every valid
 * timestamp it has is older than the threshold. A task with neither timestamp
 * is treated as NOT stale, since it may be newly spawned and not yet reporting.
 *
 * @param tasks - Task states of the run under inspection.
 * @param now - Reference time in epoch milliseconds.
 * @returns true only if there is at least one running/waiting task and all of
 *   them are stale; false otherwise.
 */
function allRunningTasksHeartbeatStale(
	tasks: TeamTaskState[],
	now: number,
): boolean {
	const inFlight = tasks.filter(
		(candidate) => candidate.status === "running" || candidate.status === "waiting",
	);
	if (inFlight.length === 0) return false;

	for (const task of inFlight) {
		// Missing or unparsable timestamps become NaN.
		const seenAt = task.heartbeat?.lastSeenAt;
		const activeAt = task.agentProgress?.lastActivityAt;
		const heartbeatMs = seenAt ? new Date(seenAt).getTime() : Number.NaN;
		const activityMs = activeAt ? new Date(activeAt).getTime() : Number.NaN;
		const hasHeartbeat = Number.isFinite(heartbeatMs);
		const hasActivity = Number.isFinite(activityMs);

		// Nothing to judge by — assume the task is not stale.
		if (!hasHeartbeat && !hasActivity) return false;
		// A recent heartbeat means the task is still alive.
		if (hasHeartbeat && now - heartbeatMs <= NO_PID_HEARTBEAT_STALE_MS) return false;
		// Recent agent progress means the task is still alive.
		if (hasActivity && now - activityMs <= NO_PID_HEARTBEAT_STALE_MS) return false;
		// Every timestamp this task has is stale; keep checking the rest.
	}
	return true;
}
|
|
118
209
|
|
|
@@ -126,7 +217,11 @@ function repairStaleRun(
|
|
|
126
217
|
): TeamTaskState[] {
|
|
127
218
|
const now = new Date().toISOString();
|
|
128
219
|
const repairedTasks = tasks.map((task) => {
|
|
129
|
-
if (
|
|
220
|
+
if (
|
|
221
|
+
task.status === "running" ||
|
|
222
|
+
task.status === "queued" ||
|
|
223
|
+
task.status === "waiting"
|
|
224
|
+
) {
|
|
130
225
|
return {
|
|
131
226
|
...task,
|
|
132
227
|
status: "cancelled" as const,
|
|
@@ -138,7 +233,14 @@ function repairStaleRun(
|
|
|
138
233
|
});
|
|
139
234
|
// Update agent records so widget sees cancelled status immediately
|
|
140
235
|
for (const task of repairedTasks) {
|
|
141
|
-
try {
|
|
236
|
+
try {
|
|
237
|
+
upsertCrewAgent(
|
|
238
|
+
manifest,
|
|
239
|
+
recordFromTask(manifest, task, "scaffold"),
|
|
240
|
+
);
|
|
241
|
+
} catch {
|
|
242
|
+
/* non-critical */
|
|
243
|
+
}
|
|
142
244
|
}
|
|
143
245
|
return repairedTasks;
|
|
144
246
|
}
|
|
@@ -183,8 +285,31 @@ export function reconcileStaleRun(
|
|
|
183
285
|
detail: "No PID recorded, but recent task heartbeat/progress exists; not repairing",
|
|
184
286
|
};
|
|
185
287
|
}
|
|
288
|
+
// No PID and no recent activity. If ALL running tasks have stale heartbeats
|
|
289
|
+
// (beyond NO_PID_HEARTBEAT_STALE_MS = 5min), repair immediately — the worker
|
|
290
|
+
// process is dead but we have no PID to check. This handles /tmp/ live-session
|
|
291
|
+
// workspaces where agents exit without calling submit_result.
|
|
292
|
+
if (allRunningTasksHeartbeatStale(tasks, now)) {
|
|
293
|
+
const repaired = repairStaleRun(
|
|
294
|
+
manifest,
|
|
295
|
+
tasks,
|
|
296
|
+
"no_pid_heartbeat_stale",
|
|
297
|
+
);
|
|
298
|
+
return {
|
|
299
|
+
runId,
|
|
300
|
+
verdict: "no_status",
|
|
301
|
+
repaired: true,
|
|
302
|
+
detail: `No PID; all running task heartbeats stale >${Math.round(NO_PID_HEARTBEAT_STALE_MS / 60_000)}min; repaired ${repaired.filter((t) => t.status === "cancelled").length} tasks`,
|
|
303
|
+
repairedTasks: repaired,
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
// Fall through: no recent activity but not all tasks stale enough yet.
|
|
307
|
+
// Check the longer STALE_ALIVE_PID_MS threshold for very old runs.
|
|
186
308
|
const updatedAt = new Date(manifest.updatedAt).getTime();
|
|
187
|
-
if (
|
|
309
|
+
if (
|
|
310
|
+
Number.isFinite(updatedAt) &&
|
|
311
|
+
now - updatedAt > STALE_ALIVE_PID_MS
|
|
312
|
+
) {
|
|
188
313
|
const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
|
|
189
314
|
return {
|
|
190
315
|
runId,
|
|
@@ -223,3 +348,182 @@ export function reconcileStaleRun(
|
|
|
223
348
|
repairedTasks: repaired,
|
|
224
349
|
};
|
|
225
350
|
}
|
|
351
|
+
|
|
352
|
+
/** Outcome of one orphaned temp-workspace reconciliation pass. */
export interface OrphanReconcileResult {
	/** How many runs were repaired, i.e. had their manifest rewritten as cancelled. */
	repaired: number;
	/** How many `pi-crew-*` workspace directories were removed from the temp directory. */
	cleanedDirs: number;
}
|
|
361
|
+
|
|
362
|
+
/**
 * Scan the system temp directory for orphaned `pi-crew-*` workspaces and
 * reconcile any stale runs found in them. Per the original note, this catches
 * runs created by tests or crashed sessions that the per-CWD auto-repair
 * timer would miss.
 *
 * For each workspace containing `.crew/state/runs`, every run whose manifest
 * status is "running" is passed to reconcileStaleRun(). When that reports a
 * repair, the repaired tasks and a manifest with status "cancelled" are
 * written back and the agent records are refreshed.
 *
 * Unless `cleanupOrphanedTempDirs` is explicitly `false`, a workspace is then
 * deleted when no run in it is still running and the workspace directory's
 * own modification time is more than ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS
 * (1 hour) before `now`.
 *
 * All filesystem and parse errors are swallowed: the affected run, workspace,
 * or the whole scan is skipped instead of throwing.
 *
 * @param now - Reference time in epoch milliseconds; defaults to the current time.
 * @param options - `cleanupOrphanedTempDirs: false` disables directory removal.
 * @returns Number of runs repaired and number of workspace directories removed.
 */
export function reconcileOrphanedTempWorkspaces(
	now = Date.now(),
	options?: { cleanupOrphanedTempDirs?: boolean },
): OrphanReconcileResult {
	const tmpDir = getSafeTempDir();
	if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };

	const cleanupEnabled = options?.cleanupOrphanedTempDirs !== false;
	let repaired = 0;
	let cleanedDirs = 0;

	try {
		for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
			if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-")) continue;

			const workspaceDir = path.join(tmpDir, entry.name);
			const stateRunsDir = path.join(workspaceDir, ".crew", "state", "runs");
			if (!fs.existsSync(stateRunsDir)) continue;

			let hasRunning = false;
			try {
				for (const runDir of fs.readdirSync(stateRunsDir)) {
					const outcome = reconcileTempRun(stateRunsDir, runDir, now);
					if (outcome.repaired) repaired++;
					if (outcome.stillRunning) hasRunning = true;
				}
			} catch {
				/* skip unreadable dirs */
			}

			// The confirmation re-scan only influences the cleanup decision,
			// so it is skipped entirely when cleanup is disabled.
			if (
				cleanupEnabled &&
				!hasRunning &&
				!hasRunningManifest(stateRunsDir) &&
				removeWorkspaceIfExpired(workspaceDir, now)
			) {
				cleanedDirs++;
			}
		}
	} catch {
		/* skip if tmpdir unreadable */
	}

	return { repaired, cleanedDirs };
}

/**
 * Reconcile a single run directory inside a temp workspace.
 *
 * Runs that lack `manifest.json` or `tasks.json`, are not in status
 * "running", or cannot be read or parsed yield `{ repaired: false,
 * stillRunning: false }`.
 *
 * @returns `repaired` when the cancelled state was persisted; `stillRunning`
 *   when reconcileStaleRun judged the run healthy, or returned "no_status"
 *   without repairing, so the workspace must be preserved.
 */
function reconcileTempRun(
	stateRunsDir: string,
	runDir: string,
	now: number,
): { repaired: boolean; stillRunning: boolean } {
	const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
	const tasksPath = path.join(stateRunsDir, runDir, "tasks.json");
	if (!fs.existsSync(manifestPath) || !fs.existsSync(tasksPath)) {
		return { repaired: false, stillRunning: false };
	}

	try {
		const manifest: TeamRunManifest = JSON.parse(
			fs.readFileSync(manifestPath, "utf-8"),
		);
		if (manifest.status !== "running") {
			return { repaired: false, stillRunning: false };
		}
		const tasks: TeamTaskState[] = JSON.parse(
			fs.readFileSync(tasksPath, "utf-8"),
		);

		const result = reconcileStaleRun(manifest, tasks, now);
		let repaired = false;
		if (result.repaired && result.repairedTasks) {
			// Persist repaired tasks first, then the cancelled manifest.
			fs.writeFileSync(
				tasksPath,
				JSON.stringify(result.repairedTasks, null, 2),
			);
			const updated = {
				...manifest,
				status: "cancelled" as const,
				updatedAt: new Date(now).toISOString(),
				summary: `Stale run reconciled: ${result.detail}`,
			};
			fs.writeFileSync(manifestPath, JSON.stringify(updated, null, 2));
			// Refresh agent records; a failure for one task must not abort the rest.
			for (const task of result.repairedTasks) {
				try {
					upsertCrewAgent(updated, recordFromTask(updated, task, "scaffold"));
				} catch {
					/* non-critical */
				}
			}
			repaired = true;
		}

		const stillRunning =
			result.verdict === "healthy" ||
			(result.verdict === "no_status" && !result.repaired);
		return { repaired, stillRunning };
	} catch {
		/* skip corrupt manifests */
		return { repaired: false, stillRunning: false };
	}
}

/**
 * Re-read the manifests under `stateRunsDir` and report whether any of them
 * still has status "running". Missing, unreadable, or corrupt manifests are
 * skipped.
 */
function hasRunningManifest(stateRunsDir: string): boolean {
	if (!fs.existsSync(stateRunsDir)) return false;
	try {
		for (const runDir of fs.readdirSync(stateRunsDir)) {
			const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
			if (!fs.existsSync(manifestPath)) continue;
			try {
				const manifest: TeamRunManifest = JSON.parse(
					fs.readFileSync(manifestPath, "utf-8"),
				);
				if (manifest.status === "running") return true;
			} catch {
				/* skip corrupt */
			}
		}
	} catch {
		/* skip unreadable */
	}
	return false;
}

/**
 * Recursively delete `workspaceDir` when its modification time is more than
 * ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS before `now`.
 *
 * @returns true when the directory was removed; false when it is too recent
 *   or when stat/removal failed.
 */
function removeWorkspaceIfExpired(workspaceDir: string, now: number): boolean {
	try {
		const dirAge = now - fs.statSync(workspaceDir).mtimeMs;
		if (dirAge > ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS) {
			fs.rmSync(workspaceDir, { recursive: true, force: true });
			return true;
		}
	} catch {
		/* skip if stat or rm fails */
	}
	return false;
}
|
|
522
|
+
|
|
523
|
+
/**
 * Resolve the system temp directory, but only if it exists.
 *
 * @returns The path reported by `os.tmpdir()` when that path exists on disk;
 *   `undefined` when it does not exist or the lookup throws.
 */
function getSafeTempDir(): string | undefined {
	try {
		if (fs.existsSync(os.tmpdir())) {
			return os.tmpdir();
		}
	} catch {
		// Fall through to the "no usable temp dir" result.
	}
	return undefined;
}
|