pi-crew 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/CHANGELOG.md +17 -0
  2. package/package.json +1 -1
  3. package/src/agents/discover-agents.ts +2 -1
  4. package/src/config/config.ts +760 -229
  5. package/src/config/types.ts +34 -5
  6. package/src/extension/help.ts +1 -0
  7. package/src/extension/management.ts +2 -1
  8. package/src/extension/register.ts +1176 -255
  9. package/src/extension/registration/commands.ts +15 -2
  10. package/src/extension/registration/team-tool.ts +1 -1
  11. package/src/extension/session-summary.ts +11 -1
  12. package/src/extension/team-tool/api.ts +4 -1
  13. package/src/extension/team-tool/cache-control.ts +23 -0
  14. package/src/extension/team-tool/cancel.ts +27 -16
  15. package/src/extension/team-tool/context.ts +2 -0
  16. package/src/extension/team-tool/handle-settings.ts +2 -0
  17. package/src/extension/team-tool/health-monitor.ts +563 -0
  18. package/src/extension/team-tool/inspect.ts +10 -3
  19. package/src/extension/team-tool/lifecycle-actions.ts +12 -5
  20. package/src/extension/team-tool/respond.ts +6 -3
  21. package/src/extension/team-tool/status.ts +4 -1
  22. package/src/extension/team-tool-types.ts +2 -0
  23. package/src/extension/team-tool.ts +901 -177
  24. package/src/runtime/adaptive-plan.ts +1 -1
  25. package/src/runtime/child-pi.ts +15 -2
  26. package/src/runtime/crash-recovery.ts +30 -0
  27. package/src/runtime/foreground-watchdog.ts +129 -0
  28. package/src/runtime/manifest-cache.ts +4 -2
  29. package/src/runtime/pi-args.ts +3 -2
  30. package/src/runtime/run-tracker.ts +11 -0
  31. package/src/runtime/runtime-policy.ts +15 -2
  32. package/src/runtime/skill-instructions.ts +11 -0
  33. package/src/runtime/stale-reconciler.ts +322 -18
  34. package/src/runtime/task-runner.ts +8 -1
  35. package/src/schema/config-schema.ts +1 -0
  36. package/src/schema/team-tool-schema.ts +204 -76
  37. package/src/state/atomic-write.ts +2 -2
  38. package/src/state/locks.ts +19 -0
  39. package/src/state/mailbox.ts +22 -5
  40. package/src/state/state-store.ts +13 -3
  41. package/src/teams/discover-teams.ts +2 -1
  42. package/src/ui/run-event-bus.ts +2 -1
  43. package/src/ui/settings-overlay.ts +2 -0
  44. package/src/workflows/discover-workflows.ts +5 -1
@@ -1,8 +1,12 @@
1
1
  import * as fs from "node:fs";
2
+ import * as os from "node:os";
2
3
  import * as path from "node:path";
3
4
  import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
4
- import { checkProcessLiveness } from "./process-status.ts";
5
5
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
+ import { checkProcessLiveness } from "./process-status.ts";
7
+
8
+ /** Age threshold for orphaned temp directory cleanup: 1 hour. */
9
+ const ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS = 60 * 60 * 1000;
6
10
 
7
11
  /**
8
12
  * Result of reconciling a single stale run.
@@ -10,7 +14,12 @@ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
10
14
  export interface ReconcileResult {
11
15
  runId: string;
12
16
  /** What was found and what action was taken */
13
- verdict: "healthy" | "result_exists" | "pid_dead" | "pid_alive_stale" | "no_status";
17
+ verdict:
18
+ | "healthy"
19
+ | "result_exists"
20
+ | "pid_dead"
21
+ | "pid_alive_stale"
22
+ | "no_status";
14
23
  /** Whether repair was applied */
15
24
  repaired: boolean;
16
25
  /** Human-readable detail */
@@ -21,6 +30,8 @@ export interface ReconcileResult {
21
30
 
22
31
  const STALE_ALIVE_PID_MS = 24 * 60 * 60 * 1000; // 24 hours
23
32
  const ACTIVE_EVIDENCE_TTL_MS = 5 * 60 * 1000;
33
+ /** For no-PID runs, repair when ALL running or waiting tasks have heartbeats stale beyond this threshold. */
34
+ const NO_PID_HEARTBEAT_STALE_MS = 5 * 60 * 1000; // 5 minutes — same as heartbeat-gradient deadMs
24
35
 
25
36
  /**
26
37
  * Phase 1: Check if a result file already exists for the run.
@@ -31,14 +42,28 @@ function checkResultFile(
31
42
  tasks: TeamTaskState[],
32
43
  ): { found: boolean; repaired: boolean } {
33
44
  // Check if all tasks already have terminal status (result was written but manifest wasn't updated)
34
- const allTerminal = tasks.length > 0 && tasks.every(
35
- (t) => t.status === "completed" || t.status === "failed" || t.status === "cancelled" || t.status === "skipped" || t.status === "needs_attention",
36
- );
45
+ const allTerminal =
46
+ tasks.length > 0 &&
47
+ tasks.every(
48
+ (t) =>
49
+ t.status === "completed" ||
50
+ t.status === "failed" ||
51
+ t.status === "cancelled" ||
52
+ t.status === "skipped" ||
53
+ t.status === "needs_attention",
54
+ );
37
55
  if (allTerminal) {
38
56
  // Sync agent records even when tasks are already terminal
39
57
  // (e.g., a previous reconcile fixed tasks but crashed before updating agents)
40
58
  for (const task of tasks) {
41
- try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
59
+ try {
60
+ upsertCrewAgent(
61
+ manifest,
62
+ recordFromTask(manifest, task, "scaffold"),
63
+ );
64
+ } catch {
65
+ /* non-critical */
66
+ }
42
67
  }
43
68
  return { found: true, repaired: false };
44
69
  }
@@ -52,7 +77,10 @@ function checkResultFile(
52
77
  * written, treat the PID as alive even if process.kill returns false
53
78
  * (handles SIGKILL race where PID hasn't been recycled yet).
54
79
  */
55
- function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
80
+ function checkPidLiveness(
81
+ pid: number | undefined,
82
+ stateRoot?: string,
83
+ ): {
56
84
  alive: boolean;
57
85
  detail: string;
58
86
  } {
@@ -67,13 +95,18 @@ function checkPidLiveness(pid: number | undefined, stateRoot?: string): {
67
95
  const heartbeatPath = path.join(stateRoot, "heartbeat.json");
68
96
  try {
69
97
  if (fs.existsSync(heartbeatPath)) {
70
- const hb = JSON.parse(fs.readFileSync(heartbeatPath, "utf-8")) as { pid?: number; at?: number };
98
+ const hb = JSON.parse(
99
+ fs.readFileSync(heartbeatPath, "utf-8"),
100
+ ) as { pid?: number; at?: number };
71
101
  if (hb?.pid === pid && hb?.at) {
72
102
  const ageMs = Date.now() - hb.at;
73
103
  // Heartbeat written < 5 min ago → process was alive recently.
74
104
  // Don't repair yet; let the next reconciliation cycle catch it.
75
105
  if (ageMs < 5 * 60_000) {
76
- return { alive: true, detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old` };
106
+ return {
107
+ alive: true,
108
+ detail: `process dead but heartbeat ${Math.round(ageMs / 1000)}s old`,
109
+ };
77
110
  }
78
111
  }
79
112
  }
@@ -101,18 +134,76 @@ function evaluateStaleness(
101
134
  return { stale: false, reason: "updated_at_invalid" };
102
135
  }
103
136
  if (now - updatedAt > STALE_ALIVE_PID_MS) {
104
- return { stale: true, reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h` };
137
+ return {
138
+ stale: true,
139
+ reason: `alive_but_stale_${Math.round((now - updatedAt) / 3600_000)}h`,
140
+ };
105
141
  }
106
142
  return { stale: false, reason: "alive_and_recent" };
107
143
  }
108
144
 
109
145
  function hasRecentActiveEvidence(tasks: TeamTaskState[], now: number): boolean {
110
146
  return tasks.some((task) => {
111
- if (task.status !== "running" && task.status !== "waiting") return false;
112
- const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
113
- if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS) return true;
114
- const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
115
- return Number.isFinite(activityAt) && now - activityAt <= ACTIVE_EVIDENCE_TTL_MS;
147
+ if (task.status !== "running" && task.status !== "waiting")
148
+ return false;
149
+ const heartbeatAt = task.heartbeat?.lastSeenAt
150
+ ? new Date(task.heartbeat.lastSeenAt).getTime()
151
+ : Number.NaN;
152
+ if (
153
+ task.heartbeat?.alive !== false &&
154
+ Number.isFinite(heartbeatAt) &&
155
+ now - heartbeatAt <= ACTIVE_EVIDENCE_TTL_MS
156
+ )
157
+ return true;
158
+ const activityAt = task.agentProgress?.lastActivityAt
159
+ ? new Date(task.agentProgress.lastActivityAt).getTime()
160
+ : Number.NaN;
161
+ return (
162
+ Number.isFinite(activityAt) &&
163
+ now - activityAt <= ACTIVE_EVIDENCE_TTL_MS
164
+ );
165
+ });
166
+ }
167
+
168
+ /**
169
+ * For no-PID runs: check if ALL running tasks have heartbeats stale beyond
170
+ * the no-PID heartbeat threshold. This detects zombie tasks where the worker
171
+ * process died but no PID was recorded (e.g. live-session /tmp/ workspaces).
172
+ * Tasks with no heartbeat AND no agent progress are considered NOT stale
173
+ * (they may be newly spawned and haven't reported yet).
174
+ */
175
+ function allRunningTasksHeartbeatStale(
176
+ tasks: TeamTaskState[],
177
+ now: number,
178
+ ): boolean {
179
+ const runningTasks = tasks.filter(
180
+ (t) => t.status === "running" || t.status === "waiting",
181
+ );
182
+ if (runningTasks.length === 0) return false;
183
+ return runningTasks.every((task) => {
184
+ const heartbeatAt = task.heartbeat?.lastSeenAt
185
+ ? new Date(task.heartbeat.lastSeenAt).getTime()
186
+ : Number.NaN;
187
+ const activityAt = task.agentProgress?.lastActivityAt
188
+ ? new Date(task.agentProgress.lastActivityAt).getTime()
189
+ : Number.NaN;
190
+ // If no heartbeat AND no activity, we can't determine staleness — assume not stale
191
+ if (!Number.isFinite(heartbeatAt) && !Number.isFinite(activityAt))
192
+ return false;
193
+ // If heartbeat is recent enough, not stale
194
+ if (
195
+ Number.isFinite(heartbeatAt) &&
196
+ now - heartbeatAt <= NO_PID_HEARTBEAT_STALE_MS
197
+ )
198
+ return false;
199
+ // If agent progress is recent enough, not stale
200
+ if (
201
+ Number.isFinite(activityAt) &&
202
+ now - activityAt <= NO_PID_HEARTBEAT_STALE_MS
203
+ )
204
+ return false;
205
+ // Both present and both stale → this task is stale
206
+ return true;
116
207
  });
117
208
  }
118
209
 
@@ -126,7 +217,11 @@ function repairStaleRun(
126
217
  ): TeamTaskState[] {
127
218
  const now = new Date().toISOString();
128
219
  const repairedTasks = tasks.map((task) => {
129
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
220
+ if (
221
+ task.status === "running" ||
222
+ task.status === "queued" ||
223
+ task.status === "waiting"
224
+ ) {
130
225
  return {
131
226
  ...task,
132
227
  status: "cancelled" as const,
@@ -138,7 +233,14 @@ function repairStaleRun(
138
233
  });
139
234
  // Update agent records so widget sees cancelled status immediately
140
235
  for (const task of repairedTasks) {
141
- try { upsertCrewAgent(manifest, recordFromTask(manifest, task, "scaffold")); } catch { /* non-critical */ }
236
+ try {
237
+ upsertCrewAgent(
238
+ manifest,
239
+ recordFromTask(manifest, task, "scaffold"),
240
+ );
241
+ } catch {
242
+ /* non-critical */
243
+ }
142
244
  }
143
245
  return repairedTasks;
144
246
  }
@@ -183,8 +285,31 @@ export function reconcileStaleRun(
183
285
  detail: "No PID recorded, but recent task heartbeat/progress exists; not repairing",
184
286
  };
185
287
  }
288
+ // No PID and no recent activity. If ALL running tasks have stale heartbeats
289
+ // (beyond NO_PID_HEARTBEAT_STALE_MS = 5min), repair immediately — the worker
290
+ // process is dead but we have no PID to check. This handles /tmp/ live-session
291
+ // workspaces where agents exit without calling submit_result.
292
+ if (allRunningTasksHeartbeatStale(tasks, now)) {
293
+ const repaired = repairStaleRun(
294
+ manifest,
295
+ tasks,
296
+ "no_pid_heartbeat_stale",
297
+ );
298
+ return {
299
+ runId,
300
+ verdict: "no_status",
301
+ repaired: true,
302
+ detail: `No PID; all running task heartbeats stale >${Math.round(NO_PID_HEARTBEAT_STALE_MS / 60_000)}min; repaired ${repaired.filter((t) => t.status === "cancelled").length} tasks`,
303
+ repairedTasks: repaired,
304
+ };
305
+ }
306
+ // Fall through: no recent activity but not all tasks stale enough yet.
307
+ // Check the longer STALE_ALIVE_PID_MS threshold for very old runs.
186
308
  const updatedAt = new Date(manifest.updatedAt).getTime();
187
- if (Number.isFinite(updatedAt) && now - updatedAt > STALE_ALIVE_PID_MS) {
309
+ if (
310
+ Number.isFinite(updatedAt) &&
311
+ now - updatedAt > STALE_ALIVE_PID_MS
312
+ ) {
188
313
  const repaired = repairStaleRun(manifest, tasks, "no_pid_stale");
189
314
  return {
190
315
  runId,
@@ -223,3 +348,182 @@ export function reconcileStaleRun(
223
348
  repairedTasks: repaired,
224
349
  };
225
350
  }
351
+
352
+ /**
353
+ * Result of orphaned temp workspace reconciliation.
354
+ */
355
+ export interface OrphanReconcileResult {
356
+ /** Number of runs repaired (manifests cancelled). */
357
+ repaired: number;
358
+ /** Number of /tmp/pi-crew-* directories removed. */
359
+ cleanedDirs: number;
360
+ }
361
+
362
+ /**
363
+ * Scan /tmp (os.tmpdir()) for orphaned pi-crew-* workspaces and reconcile
364
+ * any stale runs found. This catches runs created by tests or crashed sessions
365
+ * that the per-CWD auto-repair timer would miss.
366
+ *
367
+ * When `cleanupOrphanedTempDirs` is not explicitly set to `false`, directories
368
+ * older than 1 hour with no remaining running manifests are deleted after
369
+ * their runs are reconciled.
370
+ *
371
+ * @returns Number of runs repaired and directories cleaned.
372
+ */
373
+ export function reconcileOrphanedTempWorkspaces(
374
+ now = Date.now(),
375
+ options?: { cleanupOrphanedTempDirs?: boolean },
376
+ ): OrphanReconcileResult {
377
+ const tmpDir = getSafeTempDir();
378
+ if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
379
+ let repaired = 0;
380
+ let cleanedDirs = 0;
381
+ try {
382
+ const entries = fs.readdirSync(tmpDir, { withFileTypes: true });
383
+ for (const entry of entries) {
384
+ if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
385
+ continue;
386
+ const workspaceDir = path.join(tmpDir, entry.name);
387
+ const crewDir = path.join(workspaceDir, ".crew");
388
+ if (!fs.existsSync(crewDir)) continue;
389
+ const stateRunsDir = path.join(crewDir, "state", "runs");
390
+ if (!fs.existsSync(stateRunsDir)) continue;
391
+ let hasRunning = false;
392
+ try {
393
+ for (const runDir of fs.readdirSync(stateRunsDir)) {
394
+ const manifestPath = path.join(
395
+ stateRunsDir,
396
+ runDir,
397
+ "manifest.json",
398
+ );
399
+ const tasksPath = path.join(
400
+ stateRunsDir,
401
+ runDir,
402
+ "tasks.json",
403
+ );
404
+ if (
405
+ !fs.existsSync(manifestPath) ||
406
+ !fs.existsSync(tasksPath)
407
+ )
408
+ continue;
409
+ try {
410
+ const manifest: TeamRunManifest = JSON.parse(
411
+ fs.readFileSync(manifestPath, "utf-8"),
412
+ );
413
+ if (manifest.status !== "running") continue;
414
+ const tasks: TeamTaskState[] = JSON.parse(
415
+ fs.readFileSync(tasksPath, "utf-8"),
416
+ );
417
+ const result = reconcileStaleRun(manifest, tasks, now);
418
+ if (result.repaired && result.repairedTasks) {
419
+ // Persist repaired tasks
420
+ fs.writeFileSync(
421
+ tasksPath,
422
+ JSON.stringify(result.repairedTasks, null, 2),
423
+ );
424
+ // Update manifest status
425
+ const updated = {
426
+ ...manifest,
427
+ status: "cancelled" as const,
428
+ updatedAt: new Date(now).toISOString(),
429
+ summary: `Stale run reconciled: ${result.detail}`,
430
+ };
431
+ fs.writeFileSync(
432
+ manifestPath,
433
+ JSON.stringify(updated, null, 2),
434
+ );
435
+ // Update agent records
436
+ for (const task of result.repairedTasks) {
437
+ try {
438
+ upsertCrewAgent(
439
+ updated,
440
+ recordFromTask(
441
+ updated,
442
+ task,
443
+ "scaffold",
444
+ ),
445
+ );
446
+ } catch {
447
+ /* non-critical */
448
+ }
449
+ }
450
+ repaired++;
451
+ }
452
+ // If the run is still running after the reconciliation attempt, flag the workspace so its directory is preserved
453
+ if (
454
+ result.verdict === "healthy" ||
455
+ (result.verdict === "no_status" && !result.repaired)
456
+ ) {
457
+ hasRunning = true;
458
+ }
459
+ } catch {
460
+ /* skip corrupt manifests */
461
+ }
462
+ }
463
+ } catch {
464
+ /* skip unreadable dirs */
465
+ }
466
+
467
+ // Post-loop: check if this workspace dir can be cleaned up.
468
+ // Eligible when cleanup is enabled, no running manifests remain, and
469
+ // the directory is older than the age threshold.
470
+ if (!hasRunning) {
471
+ // Re-scan manifests to confirm no running runs remain
472
+ // (some may have been cancelled on a previous pass)
473
+ if (fs.existsSync(stateRunsDir)) {
474
+ try {
475
+ for (const runDir of fs.readdirSync(stateRunsDir)) {
476
+ const manifestPath = path.join(
477
+ stateRunsDir,
478
+ runDir,
479
+ "manifest.json",
480
+ );
481
+ if (!fs.existsSync(manifestPath)) continue;
482
+ try {
483
+ const manifest: TeamRunManifest = JSON.parse(
484
+ fs.readFileSync(manifestPath, "utf-8"),
485
+ );
486
+ if (manifest.status === "running") {
487
+ hasRunning = true;
488
+ break;
489
+ }
490
+ } catch {
491
+ /* skip corrupt */
492
+ }
493
+ }
494
+ } catch {
495
+ /* skip unreadable */
496
+ }
497
+ }
498
+ }
499
+
500
+ const cleanupEnabled = options?.cleanupOrphanedTempDirs !== false;
501
+ if (cleanupEnabled && !hasRunning) {
502
+ try {
503
+ const stat = fs.statSync(workspaceDir);
504
+ const dirAge = now - stat.mtimeMs;
505
+ if (dirAge > ORPHAN_TEMP_DIR_AGE_THRESHOLD_MS) {
506
+ fs.rmSync(workspaceDir, {
507
+ recursive: true,
508
+ force: true,
509
+ });
510
+ cleanedDirs++;
511
+ }
512
+ } catch {
513
+ /* skip if stat or rm fails */
514
+ }
515
+ }
516
+ }
517
+ } catch {
518
+ /* skip if tmpdir unreadable */
519
+ }
520
+ return { repaired, cleanedDirs };
521
+ }
522
+
523
+ function getSafeTempDir(): string | undefined {
524
+ try {
525
+ return fs.existsSync(os.tmpdir()) ? os.tmpdir() : undefined;
526
+ } catch {
527
+ return undefined;
528
+ }
529
+ }
@@ -416,6 +416,8 @@ export async function runTeamTask(
416
416
  skillPaths,
417
417
  maxTurns: input.runtimeConfig?.maxTurns,
418
418
  graceTurns: input.runtimeConfig?.graceTurns,
419
+ inheritContext: input.runtimeConfig?.inheritContext,
420
+ parentContext: input.parentContext,
419
421
  onSpawn: (pid) => {
420
422
  try {
421
423
  ({ task, tasks } = checkpointTask(
@@ -827,8 +829,13 @@ export async function runTeamTask(
827
829
  // _yieldResult: preserved for future use — yield completion contract not yet wired to task.result
828
830
  let _yieldResult: YieldResult | undefined;
829
831
  let noYield = false;
832
+ // Child-process workers do not have a submit_result tool — the yield contract
833
+ // only applies to live-session workers where submit_result is injected by the
834
+ // runtime. Skipping yield detection for child-process prevents every child
835
+ // worker from incorrectly being marked needs_attention.
830
836
  const yieldEnabled =
831
- input.runtimeConfig?.yield?.enabled ?? DEFAULT_YIELD_CONFIG.enabled;
837
+ runtimeKind !== "child-process" &&
838
+ (input.runtimeConfig?.yield?.enabled ?? DEFAULT_YIELD_CONFIG.enabled);
832
839
  if (yieldEnabled && collectedJsonEvents.length > 0) {
833
840
  if (hasYieldInOutput(collectedJsonEvents)) {
834
841
  const yieldEvent = collectedJsonEvents.find((e) =>
@@ -113,6 +113,7 @@ export const PiTeamsReliabilityConfigSchema = Type.Object({
113
113
  }, { additionalProperties: false })),
114
114
  autoRecover: Type.Optional(Type.Boolean()),
115
115
  deadletterThreshold: Type.Optional(Type.Integer({ minimum: 1 })),
116
+ cleanupOrphanedTempDirs: Type.Optional(Type.Boolean()),
116
117
  }, { additionalProperties: false });
117
118
 
118
119
  export const PiTeamsOtlpConfigSchema = Type.Object({