pi-crew 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/package.json +1 -1
- package/src/agents/discover-agents.ts +354 -15
- package/src/config/config.ts +732 -208
- package/src/config/types.ts +34 -5
- package/src/extension/help.ts +1 -0
- package/src/extension/register.ts +1173 -257
- package/src/extension/registration/commands.ts +15 -2
- package/src/extension/registration/team-tool.ts +1 -1
- package/src/extension/session-summary.ts +11 -1
- package/src/extension/team-tool/api.ts +4 -1
- package/src/extension/team-tool/cache-control.ts +23 -0
- package/src/extension/team-tool/cancel.ts +15 -5
- package/src/extension/team-tool/context.ts +2 -0
- package/src/extension/team-tool/handle-settings.ts +2 -0
- package/src/extension/team-tool/health-monitor.ts +563 -0
- package/src/extension/team-tool/inspect.ts +10 -3
- package/src/extension/team-tool/respond.ts +5 -2
- package/src/extension/team-tool/status.ts +4 -1
- package/src/extension/team-tool-types.ts +2 -0
- package/src/extension/team-tool.ts +901 -177
- package/src/runtime/adaptive-plan.ts +1 -1
- package/src/runtime/foreground-watchdog.ts +129 -0
- package/src/runtime/manifest-cache.ts +4 -2
- package/src/runtime/run-tracker.ts +11 -0
- package/src/runtime/runtime-policy.ts +15 -2
- package/src/runtime/skill-instructions.ts +8 -2
- package/src/runtime/stale-reconciler.ts +322 -18
- package/src/runtime/task-packet.ts +48 -1
- package/src/runtime/task-runner.ts +6 -1
- package/src/schema/config-schema.ts +1 -0
- package/src/schema/team-tool-schema.ts +204 -76
- package/src/state/state-store.ts +9 -1
- package/src/teams/discover-teams.ts +2 -1
- package/src/ui/run-event-bus.ts +2 -1
- package/src/ui/settings-overlay.ts +2 -0
- package/src/workflows/discover-workflows.ts +5 -1
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
// watchdog monitored
|
|
2
|
+
// Auto-monitored by watchdog
|
|
3
|
+
// health monitor support
|
|
4
|
+
import * as fs from "node:fs";
|
|
5
|
+
import * as os from "node:os";
|
|
6
|
+
import * as path from "node:path";
|
|
7
|
+
import { listRuns } from "../run-index.ts";
|
|
8
|
+
import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
|
|
9
|
+
import {
|
|
10
|
+
isActiveRunStatus,
|
|
11
|
+
isFinishedRunStatus,
|
|
12
|
+
hasStaleAsyncProcess,
|
|
13
|
+
isLikelyOrphanedActiveRun,
|
|
14
|
+
} from "../../runtime/process-status.ts";
|
|
15
|
+
import { result, type TeamContext } from "./context.ts";
|
|
16
|
+
import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
|
|
17
|
+
import type { PiTeamsToolResult } from "../tool-result.ts";
|
|
18
|
+
import type { TeamRunManifest, TeamTaskState } from "../../state/types.ts";
|
|
19
|
+
|
|
20
|
+
// ── Types ────────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
/** A run flagged by buildHealthReport as a ghost or orphaned run. */
interface HealthEntry {
|
|
23
|
+
runId: string; // id of the flagged run
|
|
24
|
+
reason: string; // short code, e.g. "dead-cwd", "stale-async-pid", "stale-no-progress"
|
|
25
|
+
detail: string; // supporting context shown in parentheses in the report, e.g. `cwd=…` or `pid=…`
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/** A run whose state root or artifacts root does not exist on disk. */
interface CorruptEntry {
|
|
29
|
+
runId: string; // id of the flagged run
|
|
30
|
+
reason: string; // short code; buildHealthReport uses "missing-state-or-artifacts"
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/** A task still marked "running" whose newest timestamp is older than STUCK_TASK_THRESHOLD_MS. */
interface StuckTask {
|
|
34
|
+
runId: string; // run that owns the task
|
|
35
|
+
taskId: string; // copied from the task's `id`
|
|
36
|
+
staleMs: number; // `now` minus the newest valid task timestamp, in milliseconds
|
|
37
|
+
detail: string; // human-readable summary: staleness in minutes plus the last-activity ISO timestamp
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/** A `pi-crew-*` directory under the scanned temp dir that still holds run manifests. */
interface ZombieWorkspace {
|
|
41
|
+
dir: string; // scanned temp dir joined with the workspace directory name
|
|
42
|
+
runCount: number; // number of manifest.json files that parsed and carried a runId
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
 * Numeric summary returned with the health report.
 * `total` through `waiting` are per-status run counts (see countStatuses);
 * `stuck` through `corrupted` are the sizes of the issue lists built by buildHealthReport.
 */
export interface HealthCounts {
|
|
46
|
+
total: number; // number of runs scanned, regardless of status
|
|
47
|
+
running: number;
|
|
48
|
+
completed: number;
|
|
49
|
+
failed: number;
|
|
50
|
+
cancelled: number;
|
|
51
|
+
blocked: number;
|
|
52
|
+
queued: number;
|
|
53
|
+
planning: number;
|
|
54
|
+
waiting: number;
|
|
55
|
+
stuck: number; // counts tasks, not runs
|
|
56
|
+
zombie: number; // counts temp workspaces, not runs
|
|
57
|
+
ghost: number; // active runs whose cwd no longer exists
|
|
58
|
+
orphaned: number; // active runs with a stale async process or no recent progress
|
|
59
|
+
corrupted: number; // runs missing their state root or artifacts root
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/** How stale (ms) a task's heartbeat/activity must be before it's considered "stuck". */
|
|
63
|
+
export const STUCK_TASK_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes, in milliseconds; detectStuckTasks flags a task only when its staleness strictly exceeds this
|
|
64
|
+
|
|
65
|
+
// ── Helpers (exported for testability) ───────────────────────────────
|
|
66
|
+
|
|
67
|
+
/**
 * Read and parse `tasks.json` from a run's state root.
 *
 * Best-effort: returns an empty array when the file is missing, unreadable,
 * not valid JSON, or does not contain a JSON array, so callers can always
 * iterate the result. Array entries that are not objects (e.g. `null`) are
 * dropped so callers can safely read properties off every element.
 *
 * @param stateRoot Directory expected to contain `tasks.json`.
 * @returns The object entries of the parsed task list, or `[]` on any failure.
 */
function readRunTasks(stateRoot: string): TeamTaskState[] {
  try {
    // path.join throws on a non-string argument (possible when stateRoot came
    // from a hand-edited manifest), so it is kept inside the try block.
    const tasksPath = path.join(stateRoot, "tasks.json");
    const parsed: unknown = JSON.parse(fs.readFileSync(tasksPath, "utf-8"));
    if (!Array.isArray(parsed)) return [];
    return parsed.filter(
      (entry): entry is TeamTaskState =>
        typeof entry === "object" && entry !== null,
    );
  } catch {
    return [];
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Find tasks of a run that still report status "running" but have shown no
 * sign of life for longer than STUCK_TASK_THRESHOLD_MS.
 *
 * The reference point is the newest parseable value among
 * `agentProgress.lastActivityAt`, `heartbeat.lastSeenAt` and `startedAt`
 * (the maximum, not a priority order). Tasks with none of these timestamps
 * are skipped because there is nothing to judge them by.
 *
 * @param run Run id plus the state root whose tasks.json is inspected.
 * @param now Reference time in epoch milliseconds.
 * @returns One entry per stuck task, in tasks.json order.
 */
export function detectStuckTasks(
  run: { runId: string; stateRoot: string },
  now: number,
): StuckTask[] {
  // Missing/empty values become NaN so they drop out of the comparison below.
  const toEpochMs = (
    value: string | number | Date | null | undefined,
  ): number => (value ? new Date(value).getTime() : Number.NaN);

  const found: StuckTask[] = [];

  for (const task of readRunTasks(run.stateRoot)) {
    if (task.status !== "running") continue;

    const candidates = [
      toEpochMs(task.agentProgress?.lastActivityAt),
      toEpochMs(task.heartbeat?.lastSeenAt),
      toEpochMs(task.startedAt),
    ].filter((ts) => Number.isFinite(ts));

    if (candidates.length === 0) continue; // no timestamp to judge

    const lastSeen = Math.max(...candidates);
    const staleMs = now - lastSeen;
    if (staleMs > STUCK_TASK_THRESHOLD_MS) {
      const minutes = Math.round(staleMs / 60_000);
      const lastSeenIso = new Date(lastSeen).toISOString();
      found.push({
        runId: run.runId,
        taskId: task.id,
        staleMs,
        detail: `stale ${minutes}m (last activity: ${lastSeenIso})`,
      });
    }
  }

  return found;
}
|
|
132
|
+
|
|
133
|
+
/**
 * List `pi-crew-*` directories directly under `tmpDir` that hold at least one
 * readable run manifest, i.e. a `.crew/state/runs/<run>/manifest.json` that
 * parses as JSON and carries a truthy `runId`.
 *
 * Read-only: nothing on disk is modified. Unreadable directories and corrupt
 * manifests are skipped silently.
 * Pattern adapted from stale-reconciler.ts:reconcileOrphanedTempWorkspaces().
 *
 * @param tmpDir Directory to scan; an empty or non-existent path yields `[]`.
 * @param now Part of the signature but not read by this function.
 * @returns One entry per matching workspace with its count of valid manifests.
 */
export function scanZombieTempWorkspaces(
  tmpDir: string,
  now: number,
): ZombieWorkspace[] {
  if (!tmpDir || !fs.existsSync(tmpDir)) return [];

  // True when the manifest file parses and has a runId; corrupt files count as "no".
  const carriesRunId = (manifestPath: string): boolean => {
    try {
      const parsed = JSON.parse(fs.readFileSync(manifestPath, "utf-8")) as {
        runId?: string;
        status?: string;
      };
      return Boolean(parsed.runId);
    } catch {
      return false;
    }
  };

  // Number of run directories under runsDir with a valid manifest; 0 if unreadable.
  const countValidRuns = (runsDir: string): number => {
    let runNames: string[];
    try {
      runNames = fs.readdirSync(runsDir);
    } catch {
      return 0;
    }
    return runNames
      .map((name) => path.join(runsDir, name, "manifest.json"))
      .filter((file) => fs.existsSync(file) && carriesRunId(file)).length;
  };

  const found: ZombieWorkspace[] = [];
  try {
    for (const candidate of fs.readdirSync(tmpDir, { withFileTypes: true })) {
      const isWorkspace =
        candidate.isDirectory() && candidate.name.startsWith("pi-crew-");
      if (!isWorkspace) continue;

      const dir = path.join(tmpDir, candidate.name);
      const runsDir = path.join(dir, ".crew", "state", "runs");
      if (!fs.existsSync(runsDir)) continue;

      const runCount = countValidRuns(runsDir);
      if (runCount > 0) found.push({ dir, runCount });
    }
  } catch {
    /* tmpDir itself could not be read — report whatever was gathered */
  }

  return found;
}
|
|
193
|
+
|
|
194
|
+
/**
 * Collect runs from `pi-crew-*` workspaces directly under `tmpDir`.
 *
 * Returns ONLY the runs that are not already present in `primaryRuns`; the
 * caller is responsible for concatenating the two lists. Each runId appears
 * at most once in the result, even when the same run manifest exists in
 * several temp workspaces (the first one encountered wins).
 *
 * A manifest is accepted only when `runId`, `status`, `cwd`, `stateRoot`,
 * `artifactsRoot` and `updatedAt` are all non-empty strings. Unreadable
 * directories and corrupt or incomplete manifests are skipped silently.
 *
 * @param primaryRuns Runs already known; their ids are excluded from the result.
 * @param tmpDir Directory to scan; an empty or non-existent path yields `[]`.
 * @returns Manifest summaries of the additional runs found.
 */
export function collectTempWorkspaceRuns(
  primaryRuns: Array<{ runId: string }>,
  tmpDir: string,
): Array<{
  runId: string;
  status: string;
  cwd: string;
  stateRoot: string;
  artifactsRoot: string;
  async?: { pid?: number };
  updatedAt: string;
  summary?: string;
}> {
  if (!tmpDir || !fs.existsSync(tmpDir)) return [];

  // Seeded with the primary ids; every accepted temp run is added as well so
  // duplicates across temp workspaces are reported once.
  const seenIds = new Set(primaryRuns.map((r) => r.runId));
  const runs: ReturnType<typeof collectTempWorkspaceRuns> = [];

  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === "string" && value.length > 0;

  try {
    for (const entry of fs.readdirSync(tmpDir, { withFileTypes: true })) {
      if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-")) continue;

      const stateRunsDir = path.join(tmpDir, entry.name, ".crew", "state", "runs");
      if (!fs.existsSync(stateRunsDir)) continue;

      try {
        for (const runDir of fs.readdirSync(stateRunsDir)) {
          const manifestPath = path.join(stateRunsDir, runDir, "manifest.json");
          if (!fs.existsSync(manifestPath)) continue;

          try {
            const parsed: unknown = JSON.parse(
              fs.readFileSync(manifestPath, "utf-8"),
            );
            if (typeof parsed !== "object" || parsed === null) continue;
            const manifest = parsed as Record<string, unknown>;

            const { runId, status, cwd, stateRoot, artifactsRoot, updatedAt } =
              manifest;
            if (!isNonEmptyString(runId) || seenIds.has(runId)) continue;
            // Downstream code feeds these into fs/path calls, which throw on
            // non-strings, so type-check rather than merely truth-check.
            if (
              !isNonEmptyString(status) ||
              !isNonEmptyString(cwd) ||
              !isNonEmptyString(stateRoot) ||
              !isNonEmptyString(artifactsRoot) ||
              !isNonEmptyString(updatedAt)
            ) {
              continue;
            }

            const asyncInfo =
              typeof manifest.async === "object" && manifest.async !== null
                ? (manifest.async as { pid?: number })
                : undefined;
            const summary =
              typeof manifest.summary === "string" ? manifest.summary : undefined;

            seenIds.add(runId);
            runs.push({
              runId,
              status,
              cwd,
              stateRoot,
              artifactsRoot,
              async: asyncInfo,
              updatedAt,
              summary,
            });
          } catch {
            /* skip corrupt manifests */
          }
        }
      } catch {
        /* skip unreadable dirs */
      }
    }
  } catch {
    /* skip if tmpdir unreadable */
  }

  return runs;
}
|
|
294
|
+
|
|
295
|
+
/**
 * Tally runs by status.
 *
 * `total` is the length of the input; each of the eight recognised statuses
 * gets its own counter. Runs with any other status contribute to `total` only.
 *
 * @param runs Manifest-like objects; only `status` is read.
 * @returns The per-status counters plus `total`.
 */
export function countStatuses(
  runs: Array<{ status: string }>,
): Pick<
  HealthCounts,
  | "total"
  | "running"
  | "completed"
  | "failed"
  | "cancelled"
  | "blocked"
  | "queued"
  | "planning"
  | "waiting"
> {
  const recognised = [
    "running",
    "completed",
    "failed",
    "cancelled",
    "blocked",
    "queued",
    "planning",
    "waiting",
  ] as const;
  type Recognised = (typeof recognised)[number];

  // Explicit allow-list: "total" and inherited object keys must never be bumped.
  const isRecognised = (status: string): status is Recognised =>
    (recognised as readonly string[]).includes(status);

  const tally = {
    total: runs.length,
    running: 0,
    completed: 0,
    failed: 0,
    cancelled: 0,
    blocked: 0,
    queued: 0,
    planning: 0,
    waiting: 0,
  };

  runs.forEach(({ status }) => {
    if (isRecognised(status)) tally[status] += 1;
  });

  return tally;
}
|
|
355
|
+
|
|
356
|
+
// ── Internal type for merged run entries ──────────────────────────────
|
|
357
|
+
|
|
358
|
+
/**
 * Minimal manifest shape shared by the primary run list and the runs collected
 * from temp workspaces. buildHealthReport casts it to TeamRunManifest when
 * calling helpers that expect the full manifest type.
 */
interface RunLike {
|
|
359
|
+
runId: string;
|
|
360
|
+
status: string;
|
|
361
|
+
cwd: string; // checked with fs.existsSync for ghost detection
|
|
362
|
+
stateRoot: string; // directory whose tasks.json is read for stuck-task detection
|
|
363
|
+
artifactsRoot: string;
|
|
364
|
+
async?: { pid?: number }; // pid is echoed in the "stale-async-pid" detail
|
|
365
|
+
updatedAt: string;
|
|
366
|
+
summary?: string;
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// ── Core logic (separated for testability) ───────────────────────────
|
|
370
|
+
|
|
371
|
+
/** Optional overrides for buildHealthReport. */
export interface HealthMonitorOptions {
|
|
372
|
+
tmpDir?: string; // directory scanned for pi-crew-* workspaces; buildHealthReport falls back to os.tmpdir()
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
 * Build the health report for every run listed for `ctx.cwd` plus any extra
 * runs found in `pi-crew-*` workspaces under the temp directory.
 *
 * Each run is classified by the first rule that matches, in this order:
 *   1. ghost     — active status, but its cwd no longer exists;
 *   2. corrupted — state root or artifacts root missing;
 *   3. (active runs only) stuck tasks are collected, then the run is checked
 *      for a stale async process and, failing that, for lack of progress
 *      (both reported as orphaned).
 * Zombie temp workspaces are scanned independently of the run list.
 *
 * @param ctx Supplies `cwd` and `signal` for listing the primary runs.
 * @param _params Unused.
 * @param options `tmpDir` overrides the directory scanned for temp workspaces
 *   (defaults to `os.tmpdir()`).
 * @returns The rendered report text, the numeric counts, and whether any
 *   issue category is non-empty.
 */
export function buildHealthReport(
  ctx: TeamContext,
  _params: TeamToolParamsValue,
  options: HealthMonitorOptions = {},
): { text: string; counts: HealthCounts; hasIssues: boolean } {
  const now = Date.now();
  const tmpDir = options.tmpDir ?? os.tmpdir();

  // Temp-workspace runs already exclude ids present in the primary list.
  const primaryRuns = listRuns(ctx.cwd, ctx.signal);
  const allRuns: RunLike[] = [
    ...primaryRuns,
    ...collectTempWorkspaceRuns(primaryRuns, tmpDir),
  ];

  const ghost: HealthEntry[] = [];
  const orphaned: HealthEntry[] = [];
  const corrupted: CorruptEntry[] = [];
  const allStuck: StuckTask[] = [];

  for (const run of allRuns) {
    const active = isActiveRunStatus(run.status);

    // Ghost: active status but cwd no longer exists.
    if (active && run.cwd && !fs.existsSync(run.cwd)) {
      ghost.push({
        runId: run.runId,
        reason: "dead-cwd",
        detail: `cwd=${run.cwd}`,
      });
      continue;
    }

    // Corrupted: state root or artifacts root missing.
    if (!(fs.existsSync(run.stateRoot) && fs.existsSync(run.artifactsRoot))) {
      corrupted.push({
        runId: run.runId,
        reason: "missing-state-or-artifacts",
      });
      continue;
    }

    // Everything below applies to active runs only.
    if (!active) continue;

    allStuck.push(...detectStuckTasks(run, now));

    const manifest = run as TeamRunManifest;

    // Orphaned: stale async PID.
    if (hasStaleAsyncProcess(manifest, now)) {
      orphaned.push({
        runId: run.runId,
        reason: "stale-async-pid",
        detail: `pid=${run.async?.pid}`,
      });
      continue;
    }

    // Orphaned: active run with no recent update.
    if (isLikelyOrphanedActiveRun(manifest, readCrewAgents(manifest), now)) {
      orphaned.push({
        runId: run.runId,
        reason: "stale-no-progress",
        detail: `status=${run.status}`,
      });
    }
  }

  const statusCounts = countStatuses(allRuns);
  const zombieWorkspaces = scanZombieTempWorkspaces(tmpDir, now);

  const counts: HealthCounts = {
    ...statusCounts,
    stuck: allStuck.length,
    zombie: zombieWorkspaces.length,
    ghost: ghost.length,
    orphaned: orphaned.length,
    corrupted: corrupted.length,
  };

  // ── Render the text report ────────────────────────────────────────

  const lines: string[] = [];
  lines.push("pi-crew health report");
  lines.push(`Scanned: ${counts.total} runs`);
  lines.push("");
  lines.push(
    `Status: running=${counts.running} queued=${counts.queued} planning=${counts.planning} waiting=${counts.waiting} completed=${counts.completed} failed=${counts.failed} cancelled=${counts.cancelled} blocked=${counts.blocked}`,
  );
  lines.push("");
  lines.push(`Ghost (dead cwd): ${ghost.length}`);
  lines.push(`Orphaned (stale process): ${orphaned.length}`);
  lines.push(`Corrupted (missing state): ${corrupted.length}`);
  lines.push(`Stuck tasks (heartbeat >5min): ${allStuck.length}`);
  lines.push(`Zombie /tmp/ workspaces: ${zombieWorkspaces.length}`);
  lines.push("");

  // Emits "heading / one bullet per item / blank line"; nothing for an empty list.
  const appendSection = <T>(
    heading: string,
    items: T[],
    bullet: (item: T) => string,
  ): void => {
    if (items.length === 0) return;
    lines.push(heading, ...items.map(bullet), "");
  };

  const describeEntry = (entry: HealthEntry): string =>
    ` - ${entry.runId}: ${entry.reason} (${entry.detail})`;

  appendSection("Ghost runs:", ghost, describeEntry);
  appendSection("Orphaned runs:", orphaned, describeEntry);
  appendSection(
    "Corrupted runs:",
    corrupted,
    (entry) => ` - ${entry.runId}: ${entry.reason}`,
  );
  appendSection(
    "Stuck tasks:",
    allStuck,
    (s) => ` - ${s.runId}/${s.taskId}: ${s.detail}`,
  );
  appendSection(
    "Zombie /tmp/ workspaces:",
    zombieWorkspaces,
    (z) => ` - ${z.dir} (${z.runCount} run${z.runCount !== 1 ? "s" : ""})`,
  );

  const hasIssues = [
    ghost,
    orphaned,
    corrupted,
    allStuck,
    zombieWorkspaces,
  ].some((list) => list.length > 0);

  if (!hasIssues) lines.push("All runs healthy.");

  // Compact TUI summary line
  lines.push(
    "",
    `Summary: total=${counts.total} running=${counts.running} completed=${counts.completed} failed=${counts.failed} cancelled=${counts.cancelled} blocked=${counts.blocked} | stuck=${counts.stuck} zombie=${counts.zombie}`,
  );

  return { text: lines.join("\n"), counts, hasIssues };
}
|
|
545
|
+
|
|
546
|
+
// ── Main handler ─────────────────────────────────────────────────────
|
|
547
|
+
|
|
548
|
+
/**
 * Tool handler for the "health" action.
 *
 * Builds the health report with default options and wraps it in a tool
 * result: the report text as the message, the counts as `data`, and status
 * "error" when any issue was found ("ok" otherwise). `hasIssues` is also
 * forwarded as the third argument of `result`.
 *
 * @param ctx Team context passed through to buildHealthReport.
 * @param params Tool parameters passed through to buildHealthReport.
 * @returns The wrapped tool result.
 */
export function handleHealthMonitor(
  ctx: TeamContext,
  params: TeamToolParamsValue,
): PiTeamsToolResult {
  const report = buildHealthReport(ctx, params);
  const status = report.hasIssues ? "error" : "ok";
  const details = { action: "health", status, data: { ...report.counts } };
  return result(report.text, details, report.hasIssues);
}
|
|
@@ -3,11 +3,14 @@ import { readEvents } from "../../state/event-log.ts";
|
|
|
3
3
|
import { loadRunManifestById } from "../../state/state-store.ts";
|
|
4
4
|
import { aggregateUsage, formatUsage } from "../../state/usage.ts";
|
|
5
5
|
import type { PiTeamsToolResult } from "../tool-result.ts";
|
|
6
|
+
import { locateRunCwd } from "../team-tool.ts";
|
|
6
7
|
import { result, type TeamContext } from "./context.ts";
|
|
7
8
|
|
|
8
9
|
export function handleEvents(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
|
|
9
10
|
if (!params.runId) return result("Events requires runId.", { action: "events", status: "error" }, true);
|
|
10
|
-
const
|
|
11
|
+
const runCwd = locateRunCwd(params.runId, ctx.cwd);
|
|
12
|
+
if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "events", status: "error" }, true);
|
|
13
|
+
const loaded = loadRunManifestById(runCwd, params.runId);
|
|
11
14
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "events", status: "error" }, true);
|
|
12
15
|
const events = readEvents(loaded.manifest.eventsPath);
|
|
13
16
|
const lines = [`Events for ${loaded.manifest.runId}:`, ...(events.length ? events.map((event) => `${event.time} ${event.type}${event.taskId ? ` ${event.taskId}` : ""}${event.message ? `: ${event.message}` : ""}${event.data ? ` ${JSON.stringify(event.data)}` : ""}`) : ["(none)"])];
|
|
@@ -16,7 +19,9 @@ export function handleEvents(params: TeamToolParamsValue, ctx: TeamContext): PiT
|
|
|
16
19
|
|
|
17
20
|
export function handleArtifacts(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
|
|
18
21
|
if (!params.runId) return result("Artifacts requires runId.", { action: "artifacts", status: "error" }, true);
|
|
19
|
-
const
|
|
22
|
+
const runCwd = locateRunCwd(params.runId, ctx.cwd);
|
|
23
|
+
if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "artifacts", status: "error" }, true);
|
|
24
|
+
const loaded = loadRunManifestById(runCwd, params.runId);
|
|
20
25
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "artifacts", status: "error" }, true);
|
|
21
26
|
const lines = [`Artifacts for ${loaded.manifest.runId}:`, ...(loaded.manifest.artifacts.length ? loaded.manifest.artifacts.map((artifact) => `- ${artifact.kind}: ${artifact.path}${artifact.sizeBytes !== undefined ? ` (${artifact.sizeBytes} bytes)` : ""}${artifact.contentHash ? ` sha256=${artifact.contentHash.slice(0, 12)}` : ""}`) : ["- (none)"])];
|
|
22
27
|
return result(lines.join("\n"), { action: "artifacts", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
|
|
@@ -24,7 +29,9 @@ export function handleArtifacts(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
24
29
|
|
|
25
30
|
export function handleSummary(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
|
|
26
31
|
if (!params.runId) return result("Summary requires runId.", { action: "summary", status: "error" }, true);
|
|
27
|
-
const
|
|
32
|
+
const runCwd = locateRunCwd(params.runId, ctx.cwd);
|
|
33
|
+
if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "summary", status: "error" }, true);
|
|
34
|
+
const loaded = loadRunManifestById(runCwd, params.runId);
|
|
28
35
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "summary", status: "error" }, true);
|
|
29
36
|
const usage = aggregateUsage(loaded.tasks);
|
|
30
37
|
const lines = [
|
|
@@ -6,6 +6,7 @@ import { appendMailboxMessage, updateMailboxMessageReply } from "../../state/mai
|
|
|
6
6
|
import { readCrewAgents, saveCrewAgents, recordFromTask } from "../../runtime/crew-agent-records.ts";
|
|
7
7
|
import { logInternalError } from "../../utils/internal-error.ts";
|
|
8
8
|
import type { PiTeamsToolResult } from "../tool-result.ts";
|
|
9
|
+
import { locateRunCwd } from "../team-tool.ts";
|
|
9
10
|
import { result, type TeamContext } from "./context.ts";
|
|
10
11
|
|
|
11
12
|
/**
|
|
@@ -17,11 +18,13 @@ export function handleRespond(params: TeamToolParamsValue, ctx: TeamContext): Pi
|
|
|
17
18
|
if (!params.runId) return result("Respond requires runId.", { action: "respond", status: "error" }, true);
|
|
18
19
|
if (!params.message && !params.taskId) return result("Respond requires taskId and/or message.", { action: "respond", status: "error" }, true);
|
|
19
20
|
|
|
20
|
-
const
|
|
21
|
+
const runCwd = locateRunCwd(params.runId, ctx.cwd);
|
|
22
|
+
if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
|
|
23
|
+
const loaded = loadRunManifestById(runCwd, params.runId);
|
|
21
24
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
|
|
22
25
|
|
|
23
26
|
return withRunLockSync(loaded.manifest, () => {
|
|
24
|
-
const fresh = loadRunManifestById(
|
|
27
|
+
const fresh = loadRunManifestById(loaded.manifest.cwd, params.runId!);
|
|
25
28
|
if (!fresh) return result(`Run '${params.runId}' not found.`, { action: "respond", status: "error" }, true);
|
|
26
29
|
const foreignRun = typeof fresh.manifest.ownerSessionId === "string" && fresh.manifest.ownerSessionId !== ctx.sessionId;
|
|
27
30
|
if (foreignRun && !params.force) return result(`Run ${fresh.manifest.runId} belongs to another session. Use force: true to override.`, { action: "respond", status: "error", runId: fresh.manifest.runId }, true);
|
|
@@ -11,11 +11,14 @@ import { formatTaskGraphLines, waitingReason } from "../../runtime/task-display.
|
|
|
11
11
|
import { verifyTaskCompletion, formatOutputPreview } from "../../runtime/completion-guard.ts";
|
|
12
12
|
import { evaluateRunEffectiveness } from "../../runtime/effectiveness.ts";
|
|
13
13
|
import type { PiTeamsToolResult } from "../tool-result.ts";
|
|
14
|
+
import { locateRunCwd } from "../team-tool.ts";
|
|
14
15
|
import { result, type TeamContext } from "./context.ts";
|
|
15
16
|
|
|
16
17
|
export function handleStatus(params: TeamToolParamsValue, ctx: TeamContext): PiTeamsToolResult {
|
|
17
18
|
if (!params.runId) return result("Status requires runId.", { action: "status", status: "error" }, true);
|
|
18
|
-
const
|
|
19
|
+
const runCwd = locateRunCwd(params.runId, ctx.cwd);
|
|
20
|
+
if (!runCwd) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
|
|
21
|
+
const loaded = loadRunManifestById(runCwd, params.runId);
|
|
19
22
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "status", status: "error" }, true);
|
|
20
23
|
let { manifest, tasks } = loaded;
|
|
21
24
|
let asyncLivenessLine: string | undefined;
|