pi-crew 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354)
  1. package/AGENTS.md +57 -32
  2. package/CHANGELOG.md +466 -413
  3. package/LICENSE +21 -21
  4. package/NOTICE.md +16 -16
  5. package/README.md +323 -323
  6. package/docs/FEATURE_INTAKE.md +126 -0
  7. package/docs/HARNESS.md +86 -0
  8. package/docs/HARNESS_BACKLOG.md +41 -0
  9. package/docs/TEST_MATRIX.md +49 -0
  10. package/docs/actions-reference.md +595 -595
  11. package/docs/architecture.md +180 -180
  12. package/docs/code-review-2026-05-11.md +592 -0
  13. package/docs/commands-reference.md +347 -347
  14. package/docs/comparison-pi-subagents-vs-pi-crew.md +303 -0
  15. package/docs/decisions/0001-durable-state.md +41 -0
  16. package/docs/decisions/0002-child-process-for-async.md +42 -0
  17. package/docs/decisions/0003-depth-guard.md +36 -0
  18. package/docs/decisions/0004-execfile-over-exec.md +34 -0
  19. package/docs/decisions/0005-no-parameter-properties.md +49 -0
  20. package/docs/decisions/0006-publish-bundled-esm.md +63 -0
  21. package/docs/decisions/0007-active-run-binary-index.md +54 -0
  22. package/docs/decisions/0008-child-pi-warm-pool.md +61 -0
  23. package/docs/decisions/README.md +23 -0
  24. package/docs/followup-plan-2026-05-12.md +463 -0
  25. package/docs/followup-review-2026-05-12.md +297 -0
  26. package/docs/followup-review-round3-2026-05-12.md +342 -0
  27. package/docs/followup-review-round4-2026-05-13.md +107 -0
  28. package/docs/implementation-plan-top3.md +333 -0
  29. package/docs/live-mailbox-runtime.md +36 -36
  30. package/docs/next-upgrade-roadmap.md +808 -808
  31. package/docs/oh-my-pi-research.md +509 -0
  32. package/docs/perf/baseline-2026-05.md +113 -0
  33. package/docs/perf/final-report-2026-05.md +206 -0
  34. package/docs/perf/sprint-1-report.md +71 -0
  35. package/docs/perf/sprint-2-report.md +81 -0
  36. package/docs/perf/sprint-2.5-report.md +53 -0
  37. package/docs/perf/sprint-3-report.md +36 -0
  38. package/docs/perf/sprint-4-report.md +47 -0
  39. package/docs/perf/sprint-5-report.md +51 -0
  40. package/docs/perf/sprint-6-report.md +94 -0
  41. package/docs/perf/sprint-7-report.md +74 -0
  42. package/docs/perf/upgrade-plan-2026-05.md +147 -0
  43. package/docs/pi-subagents3-deep-analysis.md +508 -0
  44. package/docs/product/README.md +31 -0
  45. package/docs/product/platform.md +27 -0
  46. package/docs/product/runtime-safety.md +37 -0
  47. package/docs/product/team-run.md +39 -0
  48. package/docs/product/team-tool.md +37 -0
  49. package/docs/publishing.md +65 -65
  50. package/docs/resource-formats.md +134 -134
  51. package/docs/runtime-analysis-child-vs-live.md +171 -0
  52. package/docs/runtime-flow.md +148 -148
  53. package/docs/runtime-migration-in-process-analysis.md +250 -0
  54. package/docs/stories/README.md +30 -0
  55. package/docs/stories/backlog.md +36 -0
  56. package/docs/templates/decision.md +27 -0
  57. package/docs/templates/story.md +44 -0
  58. package/docs/templates/validation-report.md +32 -0
  59. package/docs/usage.md +238 -238
  60. package/index.ts +7 -6
  61. package/install.mjs +65 -65
  62. package/package.json +107 -99
  63. package/schema.json +222 -222
  64. package/skills/child-pi-spawning/SKILL.md +213 -0
  65. package/skills/context-artifact-hygiene/SKILL.md +32 -0
  66. package/skills/event-log-tracing/SKILL.md +299 -0
  67. package/skills/git-master/SKILL.md +225 -24
  68. package/skills/live-agent-lifecycle/SKILL.md +192 -0
  69. package/skills/mailbox-interactive/SKILL.md +300 -19
  70. package/skills/model-routing-context/SKILL.md +94 -0
  71. package/skills/multi-perspective-review/SKILL.md +88 -0
  72. package/skills/read-only-explorer/SKILL.md +250 -26
  73. package/skills/safe-bash/SKILL.md +307 -21
  74. package/skills/verification-before-done/SKILL.md +11 -2
  75. package/skills/widget-rendering/SKILL.md +258 -0
  76. package/skills/workspace-isolation/SKILL.md +202 -0
  77. package/skills/worktree-isolation/SKILL.md +202 -18
  78. package/src/adapters/claude-adapter.ts +25 -25
  79. package/src/adapters/codex-adapter.ts +21 -21
  80. package/src/adapters/cursor-adapter.ts +17 -17
  81. package/src/adapters/export-util.ts +137 -137
  82. package/src/adapters/index.ts +15 -15
  83. package/src/adapters/registry.ts +18 -18
  84. package/src/adapters/types.ts +23 -23
  85. package/src/agents/agent-config.ts +38 -38
  86. package/src/agents/agent-serializer.ts +38 -38
  87. package/src/agents/discover-agents.ts +121 -118
  88. package/src/config/config.ts +740 -858
  89. package/src/config/defaults.ts +96 -96
  90. package/src/config/drift-detector.ts +211 -211
  91. package/src/config/markers.ts +327 -327
  92. package/src/config/resilient-parser.ts +109 -108
  93. package/src/config/suggestions.ts +74 -74
  94. package/src/config/types.ts +199 -0
  95. package/src/extension/async-notifier.ts +123 -89
  96. package/src/extension/autonomous-policy.ts +169 -169
  97. package/src/extension/cross-extension-rpc.ts +104 -103
  98. package/src/extension/help.ts +47 -47
  99. package/src/extension/import-index.ts +69 -69
  100. package/src/extension/management.ts +395 -382
  101. package/src/extension/notification-router.ts +116 -116
  102. package/src/extension/notification-sink.ts +51 -51
  103. package/src/extension/project-init.ts +168 -168
  104. package/src/extension/register.ts +859 -668
  105. package/src/extension/registration/artifact-cleanup.ts +15 -15
  106. package/src/extension/registration/command-utils.ts +54 -54
  107. package/src/extension/registration/commands.ts +559 -452
  108. package/src/extension/registration/compaction-guard.ts +125 -125
  109. package/src/extension/registration/subagent-helpers.ts +102 -102
  110. package/src/extension/registration/subagent-tools.ts +220 -158
  111. package/src/extension/registration/team-tool.ts +159 -98
  112. package/src/extension/registration/viewers.ts +29 -0
  113. package/src/extension/result-watcher.ts +128 -128
  114. package/src/extension/run-bundle-schema.ts +89 -89
  115. package/src/extension/run-export.ts +73 -73
  116. package/src/extension/run-import.ts +84 -84
  117. package/src/extension/run-index.ts +94 -94
  118. package/src/extension/run-maintenance.ts +142 -142
  119. package/src/extension/session-summary.ts +8 -8
  120. package/src/extension/team-manager-command.ts +96 -95
  121. package/src/extension/team-recommendation.ts +188 -188
  122. package/src/extension/team-tool/api.ts +5 -2
  123. package/src/extension/team-tool/cancel.ts +224 -209
  124. package/src/extension/team-tool/config-patch.ts +36 -36
  125. package/src/extension/team-tool/context.ts +60 -60
  126. package/src/extension/team-tool/doctor.ts +242 -242
  127. package/src/extension/team-tool/handle-settings.ts +421 -195
  128. package/src/extension/team-tool/inspect.ts +41 -41
  129. package/src/extension/team-tool/lifecycle-actions.ts +139 -139
  130. package/src/extension/team-tool/parallel-dispatch.ts +156 -156
  131. package/src/extension/team-tool/plan.ts +19 -19
  132. package/src/extension/team-tool/respond.ts +112 -111
  133. package/src/extension/team-tool/run.ts +246 -228
  134. package/src/extension/team-tool/status.ts +110 -110
  135. package/src/extension/team-tool-types.ts +13 -13
  136. package/src/extension/team-tool.ts +16 -4
  137. package/src/extension/tool-result.ts +16 -16
  138. package/src/extension/validate-resources.ts +77 -77
  139. package/src/hooks/registry.ts +61 -61
  140. package/src/hooks/types.ts +40 -40
  141. package/src/i18n.ts +184 -184
  142. package/src/observability/correlation.ts +35 -35
  143. package/src/observability/event-to-metric.ts +68 -68
  144. package/src/observability/exporters/adapter.ts +30 -30
  145. package/src/observability/exporters/otlp-exporter.ts +106 -92
  146. package/src/observability/exporters/prometheus-exporter.ts +54 -54
  147. package/src/observability/metric-registry.ts +87 -87
  148. package/src/observability/metric-retention.ts +54 -54
  149. package/src/observability/metric-sink.ts +81 -56
  150. package/src/observability/metrics-primitives.ts +167 -167
  151. package/src/prompt/prompt-runtime.ts +72 -72
  152. package/src/runtime/adaptive-plan.ts +338 -0
  153. package/src/runtime/agent-control.ts +169 -169
  154. package/src/runtime/agent-memory.ts +72 -72
  155. package/src/runtime/agent-observability.ts +114 -114
  156. package/src/runtime/async-marker.ts +26 -26
  157. package/src/runtime/async-runner.ts +153 -79
  158. package/src/runtime/attention-events.ts +28 -28
  159. package/src/runtime/auto-resume.ts +100 -100
  160. package/src/runtime/background-runner.ts +122 -88
  161. package/src/runtime/cancellation.ts +61 -61
  162. package/src/runtime/capability-inventory.ts +116 -116
  163. package/src/runtime/child-pi-pool.ts +68 -0
  164. package/src/runtime/child-pi.ts +541 -463
  165. package/src/runtime/code-summary.ts +247 -247
  166. package/src/runtime/compaction-summary.ts +271 -271
  167. package/src/runtime/concurrency.ts +58 -58
  168. package/src/runtime/crash-recovery.ts +317 -301
  169. package/src/runtime/crew-agent-records.ts +379 -281
  170. package/src/runtime/crew-agent-runtime.ts +60 -60
  171. package/src/runtime/cross-extension-rpc.ts +72 -0
  172. package/src/runtime/custom-tools/irc-tool.ts +201 -201
  173. package/src/runtime/custom-tools/submit-result-tool.ts +90 -90
  174. package/src/runtime/deadletter.ts +47 -47
  175. package/src/runtime/delivery-coordinator.ts +176 -176
  176. package/src/runtime/delta-conflict.ts +360 -360
  177. package/src/runtime/diagnostic-export.ts +102 -102
  178. package/src/runtime/direct-run.ts +35 -35
  179. package/src/runtime/effectiveness.ts +82 -81
  180. package/src/runtime/errors/crew-errors.ts +166 -0
  181. package/src/runtime/event-stream-bridge.ts +92 -92
  182. package/src/runtime/foreground-control.ts +82 -82
  183. package/src/runtime/green-contract.ts +46 -46
  184. package/src/runtime/group-join.ts +234 -106
  185. package/src/runtime/heartbeat-watcher.ts +145 -124
  186. package/src/runtime/iteration-hooks.ts +267 -264
  187. package/src/runtime/live-agent-control.ts +88 -88
  188. package/src/runtime/live-agent-manager.ts +377 -179
  189. package/src/runtime/live-control-realtime.ts +36 -36
  190. package/src/runtime/live-session-runtime.ts +676 -599
  191. package/src/runtime/loop-gates.ts +129 -129
  192. package/src/runtime/manifest-cache.ts +263 -263
  193. package/src/runtime/mcp-proxy.ts +113 -113
  194. package/src/runtime/metric-parser.ts +40 -40
  195. package/src/runtime/model-fallback.ts +282 -274
  196. package/src/runtime/model-resolver.ts +118 -0
  197. package/src/runtime/output-validator.ts +187 -187
  198. package/src/runtime/overflow-recovery.ts +175 -175
  199. package/src/runtime/parallel-research.ts +44 -44
  200. package/src/runtime/parallel-utils.ts +156 -156
  201. package/src/runtime/parent-guard.ts +80 -80
  202. package/src/runtime/phase-progress.ts +217 -217
  203. package/src/runtime/pi-args.ts +165 -165
  204. package/src/runtime/pi-json-output.ts +111 -111
  205. package/src/runtime/pi-spawn.ts +167 -167
  206. package/src/runtime/policy-engine.ts +79 -79
  207. package/src/runtime/post-checks.ts +125 -122
  208. package/src/runtime/post-exit-stdio-guard.ts +86 -86
  209. package/src/runtime/process-status.ts +97 -73
  210. package/src/runtime/progress-event-coalescer.ts +43 -43
  211. package/src/runtime/recovery-recipes.ts +74 -74
  212. package/src/runtime/retry-executor.ts +81 -81
  213. package/src/runtime/role-permission.ts +39 -39
  214. package/src/runtime/run-tracker.ts +99 -0
  215. package/src/runtime/runtime-policy.ts +21 -0
  216. package/src/runtime/runtime-resolver.ts +94 -90
  217. package/src/runtime/scheduler.ts +294 -0
  218. package/src/runtime/semaphore.ts +131 -131
  219. package/src/runtime/sensitive-paths.ts +92 -92
  220. package/src/runtime/session-usage.ts +79 -79
  221. package/src/runtime/settings-store.ts +103 -0
  222. package/src/runtime/sidechain-output.ts +29 -29
  223. package/src/runtime/skill-instructions.ts +222 -222
  224. package/src/runtime/stale-reconciler.ts +198 -189
  225. package/src/runtime/streaming-output.ts +47 -0
  226. package/src/runtime/subagent-manager.ts +404 -395
  227. package/src/runtime/subprocess-tool-registry.ts +67 -67
  228. package/src/runtime/task-display.ts +38 -38
  229. package/src/runtime/task-graph-scheduler.ts +122 -122
  230. package/src/runtime/task-graph.ts +207 -207
  231. package/src/runtime/task-output-context.ts +177 -177
  232. package/src/runtime/task-packet.ts +93 -93
  233. package/src/runtime/task-quality.ts +207 -207
  234. package/src/runtime/task-runner/capabilities.ts +78 -78
  235. package/src/runtime/task-runner/live-executor.ts +131 -113
  236. package/src/runtime/task-runner/progress.ts +119 -119
  237. package/src/runtime/task-runner/prompt-builder.ts +139 -139
  238. package/src/runtime/task-runner/prompt-pipeline.ts +64 -64
  239. package/src/runtime/task-runner/result-utils.ts +14 -14
  240. package/src/runtime/task-runner/run-projection.ts +103 -103
  241. package/src/runtime/task-runner/state-helpers.ts +22 -22
  242. package/src/runtime/task-runner.ts +469 -458
  243. package/src/runtime/team-runner.ts +693 -945
  244. package/src/runtime/usage-tracker.ts +71 -0
  245. package/src/runtime/worker-heartbeat.ts +21 -21
  246. package/src/runtime/worker-startup.ts +57 -57
  247. package/src/runtime/workflow-state.ts +187 -187
  248. package/src/runtime/yield-handler.ts +190 -189
  249. package/src/schema/config-schema.ts +172 -168
  250. package/src/schema/team-tool-schema.ts +126 -125
  251. package/src/schema/validation-types.ts +151 -148
  252. package/src/skills/discover-skills.ts +67 -67
  253. package/src/skills/skill-templates.ts +374 -374
  254. package/src/state/active-run-registry.ts +227 -191
  255. package/src/state/artifact-store.ts +130 -129
  256. package/src/state/atomic-write.ts +262 -178
  257. package/src/state/blob-store.ts +116 -116
  258. package/src/state/contracts.ts +111 -111
  259. package/src/state/event-log-rotation.ts +161 -158
  260. package/src/state/event-log.ts +383 -240
  261. package/src/state/event-reconstructor.ts +217 -217
  262. package/src/state/jsonl-writer.ts +82 -82
  263. package/src/state/locks.ts +146 -148
  264. package/src/state/mailbox.ts +446 -405
  265. package/src/state/state-store.ts +364 -351
  266. package/src/state/task-claims.ts +44 -44
  267. package/src/state/types.ts +285 -285
  268. package/src/state/usage.ts +29 -29
  269. package/src/subagents/async-entry.ts +1 -1
  270. package/src/subagents/index.ts +3 -3
  271. package/src/subagents/live/control.ts +1 -1
  272. package/src/subagents/live/manager.ts +1 -1
  273. package/src/subagents/live/realtime.ts +1 -1
  274. package/src/subagents/live/session-runtime.ts +1 -1
  275. package/src/subagents/manager.ts +1 -1
  276. package/src/subagents/spawn.ts +1 -1
  277. package/src/teams/discover-teams.ts +116 -116
  278. package/src/teams/team-config.ts +27 -27
  279. package/src/teams/team-serializer.ts +38 -38
  280. package/src/types/diff.d.ts +18 -18
  281. package/src/ui/agent-management-overlay.ts +144 -144
  282. package/src/ui/crew-widget.ts +487 -370
  283. package/src/ui/dashboard-panes/agents-pane.ts +109 -28
  284. package/src/ui/dashboard-panes/cancellation-pane.ts +42 -42
  285. package/src/ui/dashboard-panes/capability-pane.ts +59 -59
  286. package/src/ui/dashboard-panes/health-pane.ts +30 -30
  287. package/src/ui/dashboard-panes/mailbox-pane.ts +35 -35
  288. package/src/ui/dashboard-panes/progress-pane.ts +30 -30
  289. package/src/ui/dashboard-panes/transcript-pane.ts +10 -10
  290. package/src/ui/heartbeat-aggregator.ts +63 -63
  291. package/src/ui/keybinding-map.ts +97 -94
  292. package/src/ui/live-conversation-overlay.ts +152 -0
  293. package/src/ui/live-run-sidebar.ts +180 -180
  294. package/src/ui/mascot.ts +442 -442
  295. package/src/ui/overlays/agent-picker-overlay.ts +57 -57
  296. package/src/ui/overlays/confirm-overlay.ts +58 -58
  297. package/src/ui/overlays/mailbox-compose-overlay.ts +144 -144
  298. package/src/ui/overlays/mailbox-compose-preview.ts +63 -63
  299. package/src/ui/overlays/mailbox-detail-overlay.ts +122 -122
  300. package/src/ui/pi-ui-compat.ts +57 -57
  301. package/src/ui/powerbar-publisher.ts +221 -197
  302. package/src/ui/render-scheduler.ts +216 -143
  303. package/src/ui/run-action-dispatcher.ts +118 -117
  304. package/src/ui/run-dashboard.ts +526 -464
  305. package/src/ui/run-event-bus.ts +208 -208
  306. package/src/ui/run-snapshot-cache.ts +826 -777
  307. package/src/ui/settings-overlay.ts +721 -0
  308. package/src/ui/snapshot-types.ts +86 -70
  309. package/src/ui/theme-adapter.ts +190 -190
  310. package/src/ui/tool-progress-formatter.ts +89 -0
  311. package/src/ui/transcript-cache.ts +94 -94
  312. package/src/ui/transcript-viewer.ts +335 -335
  313. package/src/utils/conflict-detect.ts +662 -0
  314. package/src/utils/env-filter.ts +30 -0
  315. package/src/utils/file-coalescer.ts +86 -86
  316. package/src/utils/frontmatter.ts +68 -68
  317. package/src/utils/fs-watch.ts +88 -31
  318. package/src/utils/gh-protocol.ts +479 -0
  319. package/src/utils/ids.ts +17 -17
  320. package/src/utils/incremental-reader.ts +104 -104
  321. package/src/utils/internal-error.ts +6 -6
  322. package/src/utils/names.ts +27 -27
  323. package/src/utils/paths.ts +102 -63
  324. package/src/utils/redaction.ts +44 -44
  325. package/src/utils/resolve-shell.ts +34 -0
  326. package/src/utils/safe-paths.ts +47 -47
  327. package/src/utils/scan-cache.ts +136 -136
  328. package/src/utils/sleep.ts +2 -1
  329. package/src/utils/sse-parser.ts +134 -134
  330. package/src/utils/task-name-generator.ts +337 -337
  331. package/src/utils/timings.ts +33 -33
  332. package/src/utils/visual.ts +243 -198
  333. package/src/workflows/discover-workflows.ts +139 -139
  334. package/src/workflows/validate-workflow.ts +40 -40
  335. package/src/workflows/workflow-config.ts +26 -26
  336. package/src/workflows/workflow-serializer.ts +32 -32
  337. package/src/worktree/branch-freshness.ts +45 -45
  338. package/src/worktree/cleanup.ts +75 -72
  339. package/src/worktree/worktree-manager.ts +188 -146
  340. package/teams/default.team.md +12 -12
  341. package/teams/fast-fix.team.md +11 -11
  342. package/teams/implementation.team.md +18 -18
  343. package/teams/parallel-research.team.md +14 -14
  344. package/teams/research.team.md +11 -11
  345. package/teams/review.team.md +12 -12
  346. package/tsconfig.json +19 -19
  347. package/workflows/default.workflow.md +30 -30
  348. package/workflows/fast-fix.workflow.md +23 -23
  349. package/workflows/implementation.workflow.md +43 -43
  350. package/workflows/parallel-research.workflow.md +46 -46
  351. package/workflows/research.workflow.md +22 -22
  352. package/workflows/review.workflow.md +30 -30
  353. package/skills/task-packet/SKILL.md +0 -28
  354. package/skills/verify-evidence/SKILL.md +0 -27
@@ -1,301 +1,317 @@
1
- import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
- import * as fs from "node:fs";
3
- import type { MetricRegistry } from "../observability/metric-registry.ts";
4
- import { appendEvent, scanSequence } from "../state/event-log.ts";
5
- import { withRunLockSync } from "../state/locks.ts";
6
- import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
7
- import type { TeamTaskState } from "../state/types.ts";
8
- import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
9
- import type { ManifestCache } from "./manifest-cache.ts";
10
- import { checkProcessLiveness } from "./process-status.ts";
11
- import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
12
- import { executeHook, appendHookEvent } from "../hooks/registry.ts";
13
- import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
14
- import { resolveRealContainedPath } from "../utils/safe-paths.ts";
15
- import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
16
-
17
- export interface RecoveryPlan {
18
- runId: string;
19
- resumableTasks: string[];
20
- preservedTasks: string[];
21
- lastEventSeq: number;
22
- }
23
-
24
- function isTerminalTask(task: TeamTaskState): boolean {
25
- return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped";
26
- }
27
-
28
- function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
29
- if (task.status !== "running") return false;
30
- if (!task.heartbeat) return true;
31
- return task.heartbeat.alive === false || isWorkerHeartbeatStale(task.heartbeat, deadMs);
32
- }
33
-
34
- export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
35
- const plans: RecoveryPlan[] = [];
36
- for (const manifest of manifestCache.list(50)) {
37
- if (manifest.status !== "running") continue;
38
- if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
39
- const loaded = loadRunManifestById(cwd, manifest.runId);
40
- if (!loaded) continue;
41
- const resumableTasks = loaded.tasks.filter((task) => shouldRecoverTask(task, deadMs)).map((task) => task.id);
42
- if (!resumableTasks.length) continue;
43
- plans.push({ runId: manifest.runId, resumableTasks, preservedTasks: loaded.tasks.filter(isTerminalTask).map((task) => task.id), lastEventSeq: scanSequence(loaded.manifest.eventsPath) });
44
- }
45
- return plans;
46
- }
47
-
48
- export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
49
- const loaded = loadRunManifestById(ctx.cwd, plan.runId);
50
- if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
51
-
52
- const hookReport = await executeHook("run_recovery", { runId: plan.runId, cwd: ctx.cwd });
53
- appendHookEvent(loaded.manifest, hookReport);
54
- if (hookReport.outcome === "block") {
55
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_blocked", runId: plan.runId, message: `Recovery blocked by hook: ${hookReport.reason ?? "run_recovery hook blocked the operation."}`, data: { hookOutcome: "block", reason: hookReport.reason } });
56
- return;
57
- }
58
-
59
- const reset = new Set(plan.resumableTasks);
60
- const tasks = loaded.tasks.map((task) => reset.has(task.id) ? { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined } : task);
61
- saveRunTasks(loaded.manifest, tasks);
62
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.resumed", runId: plan.runId, message: `Recovered ${plan.resumableTasks.length} interrupted task(s).`, data: { recoveredFromSeq: plan.lastEventSeq, resumableTasks: plan.resumableTasks } });
63
- registry?.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
64
- }
65
-
66
- export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
67
- const loaded = loadRunManifestById(ctx.cwd, plan.runId);
68
- if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
69
- // Log the event first — if appendEvent fails, state remains consistent.
70
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_declined", runId: plan.runId, message: "Interrupted run was not resumed.", data: { recoveredFromSeq: plan.lastEventSeq } });
71
- updateRunStatus(loaded.manifest, "cancelled", "interrupted-not-resumed");
72
- }
73
-
74
- /**
75
- * Run 3-phase stale reconciliation on all active runs.
76
- * Returns results for each reconciled run.
77
- */
78
- /**
79
- * Auto-cancel orphaned runs whose owner session no longer exists.
80
- *
81
- * When a Pi session dies (crash, force-close, Ctrl+C), `session_shutdown`
82
- * does not fire and child workers are not terminated. The next Pi session
83
- * must detect these orphaned runs and cancel them.
84
- *
85
- * Criteria for orphan detection:
86
- * 1. Manifest status is "running"
87
- * 2. Manifest has an `ownerSessionId` that is NOT the current session
88
- * 3. The owner session's process is no longer alive (PID check)
89
- * 4. No recent heartbeat activity (task heartbeat or agent progress within threshold)
90
- *
91
- * Returns the number of runs cancelled.
92
- */
93
- export function cancelOrphanedRuns(
94
- cwd: string,
95
- manifestCache: ManifestCache,
96
- currentSessionId: string,
97
- staleThresholdMs = 300_000,
98
- now = Date.now(),
99
- ): { cancelled: string[]; skipped: string[] } {
100
- const cancelled: string[] = [];
101
- const skipped: string[] = [];
102
-
103
- // Phase 1: Scan project-level manifests via manifestCache
104
- for (const manifest of manifestCache.list(50)) {
105
- if (manifest.status !== "running") continue;
106
-
107
- // Only consider runs owned by a different session
108
- const ownerId = manifest.ownerSessionId;
109
- if (!ownerId || ownerId === currentSessionId) continue;
110
-
111
- // Check if the owner process is still alive
112
- const ownerPid = manifest.async?.pid;
113
- if (ownerPid !== undefined && checkProcessLiveness(ownerPid).alive) {
114
- skipped.push(manifest.runId);
115
- continue;
116
- }
117
-
118
- // Check for recent heartbeat activity
119
- const loaded = loadRunManifestById(cwd, manifest.runId);
120
- if (!loaded) continue;
121
-
122
- const hasRecentActivity = loaded.tasks.some((task) => {
123
- if (task.status !== "running" && task.status !== "waiting") return false;
124
- const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
125
- if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= staleThresholdMs) return true;
126
- const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
127
- return Number.isFinite(activityAt) && now - activityAt <= staleThresholdMs;
128
- });
129
-
130
- if (hasRecentActivity) {
131
- skipped.push(manifest.runId);
132
- continue;
133
- }
134
-
135
- // Orphan confirmed — cancel all running tasks
136
- withRunLockSync(loaded.manifest, () => {
137
- const fresh = loadRunManifestById(cwd, manifest.runId);
138
- if (!fresh || fresh.manifest.status !== "running") return;
139
-
140
- const now_iso = new Date(now).toISOString();
141
- const repairedTasks = fresh.tasks.map((task) => {
142
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
143
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: `Orphaned run: owner session ${ownerId} no longer exists` };
144
- }
145
- return task;
146
- });
147
-
148
- saveRunTasks(fresh.manifest, repairedTasks);
149
- updateRunStatus(fresh.manifest, "cancelled", `Orphaned run: owner session ${ownerId} no longer exists`);
150
- appendEvent(fresh.manifest.eventsPath, { type: "crew.run.orphan_cancelled", runId: manifest.runId, message: `Auto-cancelled orphaned run (owner: ${ownerId})`, data: { ownerSessionId: ownerId, cancelledTasks: repairedTasks.filter((t) => t.status === "cancelled").length } });
151
- cancelled.push(manifest.runId);
152
- });
153
- }
154
-
155
- return { cancelled, skipped };
156
- }
157
-
158
- /**
159
- * Purge the global active-run-index of entries whose manifest is no longer active.
160
- *
161
- * This scans every entry in active-run-index.json and removes any whose:
162
- * - manifest file no longer exists, OR
163
- * - manifest status is terminal (completed/failed/cancelled/blocked), OR
164
- * - manifest cwd directory no longer exists (e.g. temp test dirs)
165
- *
166
- * Also removes entries where the manifest is still "running" but:
167
- * - The cwd has been deleted (temp dir cleanup)
168
- * - The async worker PID is dead AND no heartbeat for > threshold
169
- *
170
- * This is the **global** cleanup that cancelOrphanedRuns (project-scoped)
171
- * cannot reach.
172
- */
173
- /**
174
- * Best-effort removal of stateRoot and artifactsRoot directories for a purged run.
175
- * Uses resolveRealContainedPath to ensure we only delete paths that are safely
176
- * contained within a known crew root (project or user level).
177
- */
178
- function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): void {
179
- const roots = [projectCrewRoot(entry.cwd), userCrewRoot()];
180
- for (const root of roots) {
181
- try {
182
- resolveRealContainedPath(root, entry.stateRoot);
183
- // If we get here, stateRoot is safely contained — remove it
184
- fs.rmSync(entry.stateRoot, { recursive: true, force: true });
185
- break;
186
- } catch {
187
- // Not contained in this root, try next
188
- }
189
- }
190
- // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
191
- }
192
-
193
- /**
194
- * Purge the global active-run-index of entries whose manifest is no longer active.
195
- *
196
- * Note: This function only cleans user-level active run entries.
197
- * Project-level stale runs are handled by session_start auto-prune triggered during run creation.
198
- */
199
- export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.now()): { purged: string[]; kept: string[] } {
200
- const purged: string[] = [];
201
- const kept: string[] = [];
202
- const entries = readActiveRunRegistry();
203
-
204
- for (const entry of entries) {
205
- // 1. Manifest file gone → definitely stale
206
- if (!fs.existsSync(entry.manifestPath)) {
207
- unregisterActiveRun(entry.runId);
208
- tryRemoveRunDirectories(entry);
209
- purged.push(entry.runId);
210
- continue;
211
- }
212
-
213
- // 2. CWD gone → temp dir cleaned up
214
- if (!fs.existsSync(entry.cwd)) {
215
- unregisterActiveRun(entry.runId);
216
- tryRemoveRunDirectories(entry);
217
- purged.push(entry.runId);
218
- continue;
219
- }
220
-
221
- // 3. Read manifest status
222
- let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
223
- try {
224
- manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
225
- } catch {
226
- unregisterActiveRun(entry.runId);
227
- tryRemoveRunDirectories(entry);
228
- purged.push(entry.runId);
229
- continue;
230
- }
231
-
232
- // 4. Terminal status no longer active (just unregister, don't delete files)
233
- const terminalStatuses = new Set(["completed", "failed", "cancelled", "blocked"]);
234
- if (manifest && terminalStatuses.has(manifest.status ?? "")) {
235
- unregisterActiveRun(entry.runId);
236
- purged.push(entry.runId);
237
- continue;
238
- }
239
-
240
- // 5. Still "running" — check if worker PID is dead and no heartbeat
241
- if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
242
- const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
243
- if (!pidAlive) {
244
- // Check age — if manifest hasn't been updated in > threshold, it's stale
245
- const updatedAt = new Date(entry.updatedAt).getTime();
246
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
247
- // Dead PID + stale update → cancel the manifest and unregister
248
- try {
249
- const fullLoaded = loadRunManifestById(entry.cwd, entry.runId);
250
- if (fullLoaded) {
251
- const now_iso = new Date(now).toISOString();
252
- const repairedTasks = fullLoaded.tasks.map((task) => {
253
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
254
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
255
- }
256
- return task;
257
- });
258
- saveRunTasks(fullLoaded.manifest, repairedTasks);
259
- updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
260
- }
261
- } catch {
262
- // Best-effort manifest cleanup
263
- }
264
- unregisterActiveRun(entry.runId);
265
- tryRemoveRunDirectories(entry);
266
- purged.push(entry.runId);
267
- continue;
268
- }
269
- }
270
- }
271
-
272
- kept.push(entry.runId);
273
- }
274
-
275
- return { purged, kept };
276
- }
277
-
278
- export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache, now = Date.now()): ReconcileResult[] {
279
- const results: ReconcileResult[] = [];
280
- for (const manifest of manifestCache.list(50)) {
281
- if (manifest.status !== "running") continue;
282
- const loaded = loadRunManifestById(cwd, manifest.runId);
283
- if (!loaded) continue;
284
- // Use lock to prevent race with cancel/status handlers modifying the same run
285
- withRunLockSync(loaded.manifest, () => {
286
- // Re-read inside lock to get freshest data
287
- const fresh = loadRunManifestById(cwd, manifest.runId);
288
- if (!fresh || fresh.manifest.status !== "running") return;
289
- const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
290
- if (result.repaired) {
291
- if (result.repairedTasks) saveRunTasks(fresh.manifest, result.repairedTasks);
292
- updateRunStatus(fresh.manifest, "failed", `Stale run reconciled: ${result.detail}`);
293
- appendEvent(fresh.manifest.eventsPath, { type: "crew.run.reconciled_stale", runId: manifest.runId, message: result.detail, data: { verdict: result.verdict } });
294
- }
295
- if (result.verdict !== "healthy") {
296
- results.push(result);
297
- }
298
- });
299
- }
300
- return results;
301
- }
1
+ import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import * as fs from "node:fs";
3
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
4
+ import { appendEvent, scanSequence } from "../state/event-log.ts";
5
+ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
+ import { withRunLockSync } from "../state/locks.ts";
7
+ import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
+ import type { TeamTaskState } from "../state/types.ts";
9
+ import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
+ import type { ManifestCache } from "./manifest-cache.ts";
11
+ import { checkProcessLiveness } from "./process-status.ts";
12
+ import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
13
+ import { executeHook, appendHookEvent } from "../hooks/registry.ts";
14
+ import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
15
+ import { resolveRealContainedPath } from "../utils/safe-paths.ts";
16
+ import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
17
+ import { terminateLiveAgentsForRun } from "./live-agent-manager.ts";
18
+
19
+ export interface RecoveryPlan {
20
+ runId: string;
21
+ resumableTasks: string[];
22
+ preservedTasks: string[];
23
+ lastEventSeq: number;
24
+ }
25
+
26
+ function isTerminalTask(task: TeamTaskState): boolean {
27
+ return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped";
28
+ }
29
+
30
+ function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
31
+ if (task.status !== "running") return false;
32
+ if (!task.heartbeat) return true;
33
+ return task.heartbeat.alive === false || isWorkerHeartbeatStale(task.heartbeat, deadMs);
34
+ }
35
+
36
+ export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
37
+ const plans: RecoveryPlan[] = [];
38
+ for (const manifest of manifestCache.list(50)) {
39
+ if (manifest.status !== "running" && manifest.status !== "blocked") continue;
40
+ if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
41
+ const loaded = loadRunManifestById(cwd, manifest.runId);
42
+ if (!loaded) continue;
43
+ const resumableTasks = loaded.tasks.filter((task) => shouldRecoverTask(task, deadMs)).map((task) => task.id);
44
+ if (!resumableTasks.length) continue;
45
+ plans.push({ runId: manifest.runId, resumableTasks, preservedTasks: loaded.tasks.filter(isTerminalTask).map((task) => task.id), lastEventSeq: scanSequence(loaded.manifest.eventsPath) });
46
+ }
47
+ return plans;
48
+ }
49
+
50
+ export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
51
+ const loaded = loadRunManifestById(ctx.cwd, plan.runId);
52
+ if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
53
+
54
+ const hookReport = await executeHook("run_recovery", { runId: plan.runId, cwd: ctx.cwd });
55
+ appendHookEvent(loaded.manifest, hookReport);
56
+ if (hookReport.outcome === "block") {
57
+ appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_blocked", runId: plan.runId, message: `Recovery blocked by hook: ${hookReport.reason ?? "run_recovery hook blocked the operation."}`, data: { hookOutcome: "block", reason: hookReport.reason } });
58
+ return;
59
+ }
60
+
61
+ const reset = new Set(plan.resumableTasks);
62
+ const tasks = loaded.tasks.map((task) => reset.has(task.id) ? { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined } : task);
63
+ saveRunTasks(loaded.manifest, tasks);
64
+ appendEvent(loaded.manifest.eventsPath, { type: "crew.run.resumed", runId: plan.runId, message: `Recovered ${plan.resumableTasks.length} interrupted task(s).`, data: { recoveredFromSeq: plan.lastEventSeq, resumableTasks: plan.resumableTasks } });
65
+ registry?.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
66
+ }
67
+
68
+ export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
69
+ const loaded = loadRunManifestById(ctx.cwd, plan.runId);
70
+ if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
71
+ // Log the event first — if appendEvent fails, state remains consistent.
72
+ appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_declined", runId: plan.runId, message: "Interrupted run was not resumed.", data: { recoveredFromSeq: plan.lastEventSeq } });
73
+ updateRunStatus(loaded.manifest, "cancelled", "interrupted-not-resumed");
74
+ }
75
+
76
+ /**
77
+ * Run 3-phase stale reconciliation on all active runs.
78
+ * Returns results for each reconciled run.
79
+ */
80
+ /**
81
+ * Auto-cancel orphaned runs whose owner session no longer exists.
82
+ *
83
+ * When a Pi session dies (crash, force-close, Ctrl+C), `session_shutdown`
84
+ * does not fire and child workers are not terminated. The next Pi session
85
+ * must detect these orphaned runs and cancel them.
86
+ *
87
+ * Criteria for orphan detection:
88
+ * 1. Manifest status is "running" or "blocked"
89
+ * 2. Manifest has an `ownerSessionId` that is NOT the current session
90
+ * 3. The owner session's process is no longer alive (PID check)
91
+ * 4. No recent heartbeat activity (task heartbeat or agent progress within threshold)
92
+ *
93
+ * Returns the number of runs cancelled.
94
+ */
95
+ export function cancelOrphanedRuns(
96
+ cwd: string,
97
+ manifestCache: ManifestCache,
98
+ currentSessionId: string,
99
+ staleThresholdMs = 300_000,
100
+ now = Date.now(),
101
+ ): { cancelled: string[]; skipped: string[] } {
102
+ const cancelled: string[] = [];
103
+ const skipped: string[] = [];
104
+
105
+ // Phase 1: Scan project-level manifests via manifestCache
106
+ for (const manifest of manifestCache.list(50)) {
107
+ if (manifest.status !== "running" && manifest.status !== "blocked") continue;
108
+
109
+ // Only consider runs owned by a different session
110
+ const ownerId = manifest.ownerSessionId;
111
+ if (!ownerId || ownerId === currentSessionId) continue;
112
+
113
+ // Check if the owner process is still alive
114
+ const ownerPid = manifest.async?.pid;
115
+ if (ownerPid !== undefined && checkProcessLiveness(ownerPid).alive) {
116
+ skipped.push(manifest.runId);
117
+ continue;
118
+ }
119
+
120
+ // Check for recent heartbeat activity
121
+ const loaded = loadRunManifestById(cwd, manifest.runId);
122
+ if (!loaded) continue;
123
+
124
+ const hasRecentActivity = loaded.tasks.some((task) => {
125
+ if (task.status !== "running" && task.status !== "waiting") return false;
126
+ const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
127
+ if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= staleThresholdMs) return true;
128
+ const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
129
+ return Number.isFinite(activityAt) && now - activityAt <= staleThresholdMs;
130
+ });
131
+
132
+ if (hasRecentActivity) {
133
+ skipped.push(manifest.runId);
134
+ continue;
135
+ }
136
+
137
+ // Orphan confirmed — mark durable state terminal before best-effort live-agent abort.
138
+ // terminateLiveAgent unregisters handles before awaiting abort(), and live-executor's
139
+ // isCurrent() checks durable terminal state before writing progress.
140
+
141
+ // Orphan confirmed — cancel all running tasks
142
+ let cancelledRun = false;
143
+ withRunLockSync(loaded.manifest, () => {
144
+ const fresh = loadRunManifestById(cwd, manifest.runId);
145
+ if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
146
+
147
+ const now_iso = new Date(now).toISOString();
148
+ const repairedTasks = fresh.tasks.map((task) => {
149
+ if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
150
+ return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: `Orphaned run: owner session ${ownerId} no longer exists` };
151
+ }
152
+ return task;
153
+ });
154
+
155
+ saveRunTasks(fresh.manifest, repairedTasks);
156
+ for (const task of repairedTasks) { try { upsertCrewAgent(fresh.manifest, recordFromTask(fresh.manifest, task, "scaffold")); } catch { /* non-critical */ } }
157
+ updateRunStatus(fresh.manifest, "cancelled", `Orphaned run: owner session ${ownerId} no longer exists`);
158
+ appendEvent(fresh.manifest.eventsPath, { type: "crew.run.orphan_cancelled", runId: manifest.runId, message: `Auto-cancelled orphaned run (owner: ${ownerId})`, data: { ownerSessionId: ownerId, cancelledTasks: repairedTasks.filter((t) => t.status === "cancelled").length } });
159
+ cancelled.push(manifest.runId);
160
+ cancelledRun = true;
161
+ });
162
+ if (cancelledRun) void terminateLiveAgentsForRun(manifest.runId, "cancelled", appendEvent, loaded.manifest.eventsPath).catch(() => {});
163
+ }
164
+
165
+ return { cancelled, skipped };
166
+ }
167
+
168
+ /**
169
+ * Purge the global active-run-index of entries whose manifest is no longer active.
170
+ *
171
+ * This scans every entry in active-run-index.json and removes any whose:
172
+ * - manifest file no longer exists, OR
173
+ * - manifest status is terminal (completed/failed/cancelled/blocked), OR
174
+ * - manifest cwd directory no longer exists (e.g. temp test dirs)
175
+ *
176
+ * Also removes entries where the manifest is still "running" but:
177
+ * - The cwd has been deleted (temp dir cleanup)
178
+ * - The async worker PID is dead AND no heartbeat for > threshold
179
+ *
180
+ * This is the **global** cleanup that cancelOrphanedRuns (project-scoped)
181
+ * cannot reach.
182
+ */
183
+ /**
184
+ * Best-effort removal of stateRoot and artifactsRoot directories for a purged run.
185
+ * Uses resolveRealContainedPath to ensure we only delete paths that are safely
186
+ * contained within a known crew root (project or user level).
187
+ */
188
+ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): void {
189
+ const roots = [projectCrewRoot(entry.cwd), userCrewRoot()];
190
+ for (const root of roots) {
191
+ try {
192
+ resolveRealContainedPath(root, entry.stateRoot);
193
+ // If we get here, stateRoot is safely contained — remove it
194
+ fs.rmSync(entry.stateRoot, { recursive: true, force: true });
195
+ break;
196
+ } catch {
197
+ // Not contained in this root, try next
198
+ }
199
+ }
200
+ // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
201
+ }
202
+
203
+ /**
204
+ * Purge the global active-run-index of entries whose manifest is no longer active.
205
+ *
206
+ * Note: This function only cleans user-level active run entries.
207
+ * Project-level stale runs are handled by session_start auto-prune triggered during run creation.
208
+ */
209
+ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.now()): { purged: string[]; kept: string[] } {
210
+ const purged: string[] = [];
211
+ const kept: string[] = [];
212
+ const entries = readActiveRunRegistry();
213
+
214
+ for (const entry of entries) {
215
+ // 1. Manifest file gone → definitely stale
216
+ if (!fs.existsSync(entry.manifestPath)) {
217
+ unregisterActiveRun(entry.runId);
218
+ tryRemoveRunDirectories(entry);
219
+ purged.push(entry.runId);
220
+ continue;
221
+ }
222
+
223
+ // 2. CWD gone → temp dir cleaned up
224
+ if (!fs.existsSync(entry.cwd)) {
225
+ unregisterActiveRun(entry.runId);
226
+ tryRemoveRunDirectories(entry);
227
+ purged.push(entry.runId);
228
+ continue;
229
+ }
230
+
231
+ // 3. Read manifest status
232
+ let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
233
+ try {
234
+ manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
235
+ } catch {
236
+ unregisterActiveRun(entry.runId);
237
+ tryRemoveRunDirectories(entry);
238
+ purged.push(entry.runId);
239
+ continue;
240
+ }
241
+
242
+ // 4. Terminal status → no longer active (just unregister, don't delete files)
243
+ const terminalStatuses = new Set(["completed", "failed", "cancelled", "blocked"]);
244
+ if (manifest && terminalStatuses.has(manifest.status ?? "")) {
245
+ unregisterActiveRun(entry.runId);
246
+ purged.push(entry.runId);
247
+ continue;
248
+ }
249
+
250
+ // 5. Still "running" — check if worker PID is dead and no heartbeat
251
+ if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
252
+ const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
253
+ if (!pidAlive) {
254
+ // Check age — if manifest hasn't been updated in > threshold, it's stale
255
+ const updatedAt = new Date(entry.updatedAt).getTime();
256
+ if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
257
+ // Dead PID + stale update → cancel the manifest and unregister
258
+ try {
259
+ const fullLoaded = loadRunManifestById(entry.cwd, entry.runId);
260
+ if (fullLoaded) {
261
+ const now_iso = new Date(now).toISOString();
262
+ const repairedTasks = fullLoaded.tasks.map((task) => {
263
+ if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
264
+ return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
265
+ }
266
+ return task;
267
+ });
268
+ saveRunTasks(fullLoaded.manifest, repairedTasks);
269
+ for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
270
+ updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
271
+ void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch(() => {});
272
+ }
273
+ } catch {
274
+ // Best-effort manifest cleanup
275
+ }
276
+ unregisterActiveRun(entry.runId);
277
+ tryRemoveRunDirectories(entry);
278
+ purged.push(entry.runId);
279
+ continue;
280
+ }
281
+ }
282
+ }
283
+
284
+ kept.push(entry.runId);
285
+ }
286
+
287
+ return { purged, kept };
288
+ }
289
+
290
+ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache, now = Date.now()): ReconcileResult[] {
291
+ const results: ReconcileResult[] = [];
292
+ for (const manifest of manifestCache.list(50)) {
293
+ if (manifest.status !== "running" && manifest.status !== "blocked") continue;
294
+ const loaded = loadRunManifestById(cwd, manifest.runId);
295
+ if (!loaded) continue;
296
+ // Use lock to prevent race with cancel/status handlers modifying the same run
297
+ withRunLockSync(loaded.manifest, () => {
298
+ // Re-read inside lock to get freshest data
299
+ const fresh = loadRunManifestById(cwd, manifest.runId);
300
+ if (!fresh || (fresh.manifest.status !== "running" && fresh.manifest.status !== "blocked")) return;
301
+ const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
302
+ if (result.repaired || result.verdict === "result_exists") {
303
+ if (result.repairedTasks) {
304
+ saveRunTasks(fresh.manifest, result.repairedTasks);
305
+ for (const task of result.repairedTasks) { try { upsertCrewAgent(fresh.manifest, recordFromTask(fresh.manifest, task, "scaffold")); } catch { /* non-critical */ } }
306
+ }
307
+ updateRunStatus(fresh.manifest, "failed", `Stale run reconciled: ${result.detail}`);
308
+ void terminateLiveAgentsForRun(fresh.manifest.runId, "failed", appendEvent, fresh.manifest.eventsPath).catch(() => {});
309
+ appendEvent(fresh.manifest.eventsPath, { type: "crew.run.reconciled_stale", runId: manifest.runId, message: result.detail, data: { verdict: result.verdict } });
310
+ }
311
+ if (result.verdict !== "healthy") {
312
+ results.push(result);
313
+ }
314
+ });
315
+ }
316
+ return results;
317
+ }