pi-crew 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348) hide show
  1. package/AGENTS.md +57 -32
  2. package/CHANGELOG.md +466 -448
  3. package/LICENSE +21 -21
  4. package/NOTICE.md +16 -16
  5. package/README.md +323 -323
  6. package/docs/FEATURE_INTAKE.md +126 -0
  7. package/docs/HARNESS.md +86 -0
  8. package/docs/HARNESS_BACKLOG.md +41 -0
  9. package/docs/TEST_MATRIX.md +49 -0
  10. package/docs/actions-reference.md +595 -595
  11. package/docs/architecture.md +180 -180
  12. package/docs/code-review-2026-05-11.md +592 -592
  13. package/docs/commands-reference.md +347 -347
  14. package/docs/comparison-pi-subagents-vs-pi-crew.md +303 -0
  15. package/docs/decisions/0001-durable-state.md +41 -0
  16. package/docs/decisions/0002-child-process-for-async.md +42 -0
  17. package/docs/decisions/0003-depth-guard.md +36 -0
  18. package/docs/decisions/0004-execfile-over-exec.md +34 -0
  19. package/docs/decisions/0005-no-parameter-properties.md +49 -0
  20. package/docs/decisions/0006-publish-bundled-esm.md +63 -0
  21. package/docs/decisions/0007-active-run-binary-index.md +54 -0
  22. package/docs/decisions/0008-child-pi-warm-pool.md +61 -0
  23. package/docs/decisions/README.md +23 -0
  24. package/docs/followup-review-round4-2026-05-13.md +107 -0
  25. package/docs/implementation-plan-top3.md +333 -0
  26. package/docs/live-mailbox-runtime.md +36 -36
  27. package/docs/next-upgrade-roadmap.md +808 -808
  28. package/docs/oh-my-pi-research.md +509 -0
  29. package/docs/perf/baseline-2026-05.md +113 -0
  30. package/docs/perf/final-report-2026-05.md +206 -0
  31. package/docs/perf/sprint-1-report.md +71 -0
  32. package/docs/perf/sprint-2-report.md +81 -0
  33. package/docs/perf/sprint-2.5-report.md +53 -0
  34. package/docs/perf/sprint-3-report.md +36 -0
  35. package/docs/perf/sprint-4-report.md +47 -0
  36. package/docs/perf/sprint-5-report.md +51 -0
  37. package/docs/perf/sprint-6-report.md +94 -0
  38. package/docs/perf/sprint-7-report.md +74 -0
  39. package/docs/perf/upgrade-plan-2026-05.md +147 -0
  40. package/docs/pi-subagents3-deep-analysis.md +508 -0
  41. package/docs/product/README.md +31 -0
  42. package/docs/product/platform.md +27 -0
  43. package/docs/product/runtime-safety.md +37 -0
  44. package/docs/product/team-run.md +39 -0
  45. package/docs/product/team-tool.md +37 -0
  46. package/docs/publishing.md +65 -65
  47. package/docs/resource-formats.md +134 -134
  48. package/docs/runtime-analysis-child-vs-live.md +171 -0
  49. package/docs/runtime-flow.md +148 -148
  50. package/docs/runtime-migration-in-process-analysis.md +250 -0
  51. package/docs/stories/README.md +30 -0
  52. package/docs/stories/backlog.md +36 -0
  53. package/docs/templates/decision.md +27 -0
  54. package/docs/templates/story.md +44 -0
  55. package/docs/templates/validation-report.md +32 -0
  56. package/docs/usage.md +238 -238
  57. package/index.ts +7 -6
  58. package/install.mjs +65 -65
  59. package/package.json +107 -100
  60. package/schema.json +222 -222
  61. package/skills/child-pi-spawning/SKILL.md +213 -0
  62. package/skills/context-artifact-hygiene/SKILL.md +32 -0
  63. package/skills/event-log-tracing/SKILL.md +299 -0
  64. package/skills/git-master/SKILL.md +225 -24
  65. package/skills/live-agent-lifecycle/SKILL.md +192 -0
  66. package/skills/mailbox-interactive/SKILL.md +300 -19
  67. package/skills/model-routing-context/SKILL.md +94 -0
  68. package/skills/multi-perspective-review/SKILL.md +88 -0
  69. package/skills/read-only-explorer/SKILL.md +250 -26
  70. package/skills/safe-bash/SKILL.md +307 -21
  71. package/skills/verification-before-done/SKILL.md +11 -2
  72. package/skills/widget-rendering/SKILL.md +258 -0
  73. package/skills/workspace-isolation/SKILL.md +202 -0
  74. package/skills/worktree-isolation/SKILL.md +202 -18
  75. package/src/adapters/claude-adapter.ts +25 -25
  76. package/src/adapters/codex-adapter.ts +21 -21
  77. package/src/adapters/cursor-adapter.ts +17 -17
  78. package/src/adapters/export-util.ts +137 -137
  79. package/src/adapters/index.ts +15 -15
  80. package/src/adapters/registry.ts +18 -18
  81. package/src/adapters/types.ts +23 -23
  82. package/src/agents/agent-config.ts +38 -38
  83. package/src/agents/agent-serializer.ts +38 -38
  84. package/src/agents/discover-agents.ts +121 -118
  85. package/src/config/config.ts +740 -858
  86. package/src/config/defaults.ts +96 -96
  87. package/src/config/drift-detector.ts +211 -211
  88. package/src/config/markers.ts +327 -327
  89. package/src/config/resilient-parser.ts +109 -108
  90. package/src/config/suggestions.ts +74 -74
  91. package/src/config/types.ts +199 -0
  92. package/src/extension/async-notifier.ts +123 -89
  93. package/src/extension/autonomous-policy.ts +169 -169
  94. package/src/extension/cross-extension-rpc.ts +104 -104
  95. package/src/extension/help.ts +47 -47
  96. package/src/extension/import-index.ts +69 -69
  97. package/src/extension/management.ts +395 -382
  98. package/src/extension/notification-router.ts +116 -116
  99. package/src/extension/notification-sink.ts +51 -51
  100. package/src/extension/project-init.ts +168 -168
  101. package/src/extension/register.ts +859 -668
  102. package/src/extension/registration/artifact-cleanup.ts +15 -15
  103. package/src/extension/registration/command-utils.ts +54 -54
  104. package/src/extension/registration/commands.ts +559 -452
  105. package/src/extension/registration/compaction-guard.ts +125 -125
  106. package/src/extension/registration/subagent-helpers.ts +102 -102
  107. package/src/extension/registration/subagent-tools.ts +220 -159
  108. package/src/extension/registration/team-tool.ts +159 -99
  109. package/src/extension/registration/viewers.ts +29 -0
  110. package/src/extension/result-watcher.ts +128 -128
  111. package/src/extension/run-bundle-schema.ts +89 -89
  112. package/src/extension/run-export.ts +73 -73
  113. package/src/extension/run-import.ts +84 -84
  114. package/src/extension/run-index.ts +94 -94
  115. package/src/extension/run-maintenance.ts +142 -142
  116. package/src/extension/session-summary.ts +8 -8
  117. package/src/extension/team-manager-command.ts +96 -96
  118. package/src/extension/team-recommendation.ts +188 -188
  119. package/src/extension/team-tool/api.ts +5 -2
  120. package/src/extension/team-tool/cancel.ts +224 -209
  121. package/src/extension/team-tool/config-patch.ts +36 -36
  122. package/src/extension/team-tool/context.ts +60 -60
  123. package/src/extension/team-tool/doctor.ts +242 -242
  124. package/src/extension/team-tool/handle-settings.ts +421 -195
  125. package/src/extension/team-tool/inspect.ts +41 -41
  126. package/src/extension/team-tool/lifecycle-actions.ts +139 -139
  127. package/src/extension/team-tool/parallel-dispatch.ts +156 -156
  128. package/src/extension/team-tool/plan.ts +19 -19
  129. package/src/extension/team-tool/respond.ts +112 -111
  130. package/src/extension/team-tool/run.ts +246 -229
  131. package/src/extension/team-tool/status.ts +110 -110
  132. package/src/extension/team-tool-types.ts +13 -13
  133. package/src/extension/team-tool.ts +344 -344
  134. package/src/extension/tool-result.ts +16 -16
  135. package/src/extension/validate-resources.ts +77 -77
  136. package/src/hooks/registry.ts +61 -61
  137. package/src/hooks/types.ts +40 -40
  138. package/src/i18n.ts +184 -184
  139. package/src/observability/correlation.ts +35 -35
  140. package/src/observability/event-to-metric.ts +68 -68
  141. package/src/observability/exporters/adapter.ts +30 -30
  142. package/src/observability/exporters/otlp-exporter.ts +106 -92
  143. package/src/observability/exporters/prometheus-exporter.ts +54 -54
  144. package/src/observability/metric-registry.ts +87 -87
  145. package/src/observability/metric-retention.ts +54 -54
  146. package/src/observability/metric-sink.ts +81 -56
  147. package/src/observability/metrics-primitives.ts +167 -167
  148. package/src/prompt/prompt-runtime.ts +72 -72
  149. package/src/runtime/adaptive-plan.ts +338 -0
  150. package/src/runtime/agent-control.ts +169 -169
  151. package/src/runtime/agent-memory.ts +72 -72
  152. package/src/runtime/agent-observability.ts +114 -114
  153. package/src/runtime/async-marker.ts +26 -26
  154. package/src/runtime/async-runner.ts +153 -153
  155. package/src/runtime/attention-events.ts +28 -28
  156. package/src/runtime/auto-resume.ts +100 -100
  157. package/src/runtime/background-runner.ts +122 -89
  158. package/src/runtime/cancellation.ts +61 -61
  159. package/src/runtime/capability-inventory.ts +116 -116
  160. package/src/runtime/child-pi-pool.ts +68 -0
  161. package/src/runtime/child-pi.ts +541 -461
  162. package/src/runtime/code-summary.ts +247 -247
  163. package/src/runtime/compaction-summary.ts +271 -271
  164. package/src/runtime/concurrency.ts +58 -58
  165. package/src/runtime/crash-recovery.ts +317 -301
  166. package/src/runtime/crew-agent-records.ts +379 -281
  167. package/src/runtime/crew-agent-runtime.ts +60 -60
  168. package/src/runtime/cross-extension-rpc.ts +72 -0
  169. package/src/runtime/custom-tools/irc-tool.ts +201 -201
  170. package/src/runtime/custom-tools/submit-result-tool.ts +90 -90
  171. package/src/runtime/deadletter.ts +47 -47
  172. package/src/runtime/delivery-coordinator.ts +176 -176
  173. package/src/runtime/delta-conflict.ts +360 -360
  174. package/src/runtime/diagnostic-export.ts +102 -102
  175. package/src/runtime/direct-run.ts +35 -35
  176. package/src/runtime/effectiveness.ts +82 -81
  177. package/src/runtime/errors/crew-errors.ts +166 -0
  178. package/src/runtime/event-stream-bridge.ts +92 -92
  179. package/src/runtime/foreground-control.ts +82 -82
  180. package/src/runtime/green-contract.ts +46 -46
  181. package/src/runtime/group-join.ts +234 -106
  182. package/src/runtime/heartbeat-watcher.ts +145 -124
  183. package/src/runtime/iteration-hooks.ts +267 -267
  184. package/src/runtime/live-agent-control.ts +88 -88
  185. package/src/runtime/live-agent-manager.ts +377 -179
  186. package/src/runtime/live-control-realtime.ts +36 -36
  187. package/src/runtime/live-session-runtime.ts +676 -600
  188. package/src/runtime/loop-gates.ts +129 -129
  189. package/src/runtime/manifest-cache.ts +263 -263
  190. package/src/runtime/mcp-proxy.ts +113 -113
  191. package/src/runtime/metric-parser.ts +40 -40
  192. package/src/runtime/model-fallback.ts +282 -274
  193. package/src/runtime/model-resolver.ts +118 -0
  194. package/src/runtime/output-validator.ts +187 -187
  195. package/src/runtime/overflow-recovery.ts +175 -175
  196. package/src/runtime/parallel-research.ts +44 -44
  197. package/src/runtime/parallel-utils.ts +156 -156
  198. package/src/runtime/parent-guard.ts +80 -80
  199. package/src/runtime/phase-progress.ts +217 -217
  200. package/src/runtime/pi-args.ts +165 -165
  201. package/src/runtime/pi-json-output.ts +111 -111
  202. package/src/runtime/pi-spawn.ts +167 -167
  203. package/src/runtime/policy-engine.ts +79 -79
  204. package/src/runtime/post-checks.ts +125 -125
  205. package/src/runtime/post-exit-stdio-guard.ts +86 -86
  206. package/src/runtime/process-status.ts +97 -73
  207. package/src/runtime/progress-event-coalescer.ts +43 -43
  208. package/src/runtime/recovery-recipes.ts +74 -74
  209. package/src/runtime/retry-executor.ts +81 -81
  210. package/src/runtime/role-permission.ts +39 -39
  211. package/src/runtime/run-tracker.ts +99 -0
  212. package/src/runtime/runtime-policy.ts +21 -0
  213. package/src/runtime/runtime-resolver.ts +94 -91
  214. package/src/runtime/scheduler.ts +294 -0
  215. package/src/runtime/semaphore.ts +131 -131
  216. package/src/runtime/sensitive-paths.ts +92 -92
  217. package/src/runtime/session-usage.ts +79 -79
  218. package/src/runtime/settings-store.ts +103 -0
  219. package/src/runtime/sidechain-output.ts +29 -29
  220. package/src/runtime/skill-instructions.ts +222 -222
  221. package/src/runtime/stale-reconciler.ts +198 -189
  222. package/src/runtime/streaming-output.ts +47 -0
  223. package/src/runtime/subagent-manager.ts +404 -400
  224. package/src/runtime/subprocess-tool-registry.ts +67 -67
  225. package/src/runtime/task-display.ts +38 -38
  226. package/src/runtime/task-graph-scheduler.ts +122 -122
  227. package/src/runtime/task-graph.ts +207 -207
  228. package/src/runtime/task-output-context.ts +177 -177
  229. package/src/runtime/task-packet.ts +93 -93
  230. package/src/runtime/task-quality.ts +207 -207
  231. package/src/runtime/task-runner/capabilities.ts +78 -78
  232. package/src/runtime/task-runner/live-executor.ts +131 -113
  233. package/src/runtime/task-runner/progress.ts +119 -119
  234. package/src/runtime/task-runner/prompt-builder.ts +139 -139
  235. package/src/runtime/task-runner/prompt-pipeline.ts +64 -64
  236. package/src/runtime/task-runner/result-utils.ts +14 -14
  237. package/src/runtime/task-runner/run-projection.ts +103 -103
  238. package/src/runtime/task-runner/state-helpers.ts +22 -22
  239. package/src/runtime/task-runner.ts +469 -459
  240. package/src/runtime/team-runner.ts +693 -945
  241. package/src/runtime/usage-tracker.ts +71 -0
  242. package/src/runtime/worker-heartbeat.ts +21 -21
  243. package/src/runtime/worker-startup.ts +57 -57
  244. package/src/runtime/workflow-state.ts +187 -187
  245. package/src/runtime/yield-handler.ts +190 -190
  246. package/src/schema/config-schema.ts +172 -168
  247. package/src/schema/team-tool-schema.ts +126 -126
  248. package/src/schema/validation-types.ts +151 -148
  249. package/src/skills/discover-skills.ts +67 -67
  250. package/src/skills/skill-templates.ts +374 -374
  251. package/src/state/active-run-registry.ts +227 -191
  252. package/src/state/artifact-store.ts +130 -129
  253. package/src/state/atomic-write.ts +262 -195
  254. package/src/state/blob-store.ts +116 -116
  255. package/src/state/contracts.ts +111 -111
  256. package/src/state/event-log-rotation.ts +161 -158
  257. package/src/state/event-log.ts +383 -303
  258. package/src/state/event-reconstructor.ts +217 -217
  259. package/src/state/jsonl-writer.ts +82 -82
  260. package/src/state/locks.ts +146 -146
  261. package/src/state/mailbox.ts +446 -405
  262. package/src/state/state-store.ts +364 -351
  263. package/src/state/task-claims.ts +44 -44
  264. package/src/state/types.ts +285 -285
  265. package/src/state/usage.ts +29 -29
  266. package/src/subagents/async-entry.ts +1 -1
  267. package/src/subagents/index.ts +3 -3
  268. package/src/subagents/live/control.ts +1 -1
  269. package/src/subagents/live/manager.ts +1 -1
  270. package/src/subagents/live/realtime.ts +1 -1
  271. package/src/subagents/live/session-runtime.ts +1 -1
  272. package/src/subagents/manager.ts +1 -1
  273. package/src/subagents/spawn.ts +1 -1
  274. package/src/teams/discover-teams.ts +116 -116
  275. package/src/teams/team-config.ts +27 -27
  276. package/src/teams/team-serializer.ts +38 -38
  277. package/src/types/diff.d.ts +18 -18
  278. package/src/ui/agent-management-overlay.ts +144 -144
  279. package/src/ui/crew-widget.ts +487 -370
  280. package/src/ui/dashboard-panes/agents-pane.ts +109 -28
  281. package/src/ui/dashboard-panes/cancellation-pane.ts +42 -42
  282. package/src/ui/dashboard-panes/capability-pane.ts +59 -59
  283. package/src/ui/dashboard-panes/health-pane.ts +30 -30
  284. package/src/ui/dashboard-panes/mailbox-pane.ts +35 -35
  285. package/src/ui/dashboard-panes/progress-pane.ts +30 -30
  286. package/src/ui/dashboard-panes/transcript-pane.ts +10 -10
  287. package/src/ui/heartbeat-aggregator.ts +63 -63
  288. package/src/ui/keybinding-map.ts +97 -94
  289. package/src/ui/live-conversation-overlay.ts +152 -0
  290. package/src/ui/live-run-sidebar.ts +180 -180
  291. package/src/ui/mascot.ts +442 -442
  292. package/src/ui/overlays/agent-picker-overlay.ts +57 -57
  293. package/src/ui/overlays/confirm-overlay.ts +58 -58
  294. package/src/ui/overlays/mailbox-compose-overlay.ts +144 -144
  295. package/src/ui/overlays/mailbox-compose-preview.ts +63 -63
  296. package/src/ui/overlays/mailbox-detail-overlay.ts +122 -122
  297. package/src/ui/pi-ui-compat.ts +57 -57
  298. package/src/ui/powerbar-publisher.ts +221 -197
  299. package/src/ui/render-scheduler.ts +216 -143
  300. package/src/ui/run-action-dispatcher.ts +118 -118
  301. package/src/ui/run-dashboard.ts +526 -464
  302. package/src/ui/run-event-bus.ts +208 -208
  303. package/src/ui/run-snapshot-cache.ts +826 -777
  304. package/src/ui/settings-overlay.ts +721 -0
  305. package/src/ui/snapshot-types.ts +86 -70
  306. package/src/ui/theme-adapter.ts +190 -190
  307. package/src/ui/tool-progress-formatter.ts +89 -0
  308. package/src/ui/transcript-cache.ts +94 -94
  309. package/src/ui/transcript-viewer.ts +335 -335
  310. package/src/utils/conflict-detect.ts +662 -0
  311. package/src/utils/file-coalescer.ts +86 -86
  312. package/src/utils/frontmatter.ts +68 -68
  313. package/src/utils/fs-watch.ts +88 -31
  314. package/src/utils/gh-protocol.ts +479 -0
  315. package/src/utils/ids.ts +17 -17
  316. package/src/utils/incremental-reader.ts +104 -104
  317. package/src/utils/internal-error.ts +6 -6
  318. package/src/utils/names.ts +27 -27
  319. package/src/utils/paths.ts +102 -63
  320. package/src/utils/redaction.ts +44 -44
  321. package/src/utils/safe-paths.ts +47 -47
  322. package/src/utils/scan-cache.ts +136 -136
  323. package/src/utils/sse-parser.ts +134 -134
  324. package/src/utils/task-name-generator.ts +337 -337
  325. package/src/utils/timings.ts +33 -33
  326. package/src/utils/visual.ts +243 -198
  327. package/src/workflows/discover-workflows.ts +139 -139
  328. package/src/workflows/validate-workflow.ts +40 -40
  329. package/src/workflows/workflow-config.ts +26 -26
  330. package/src/workflows/workflow-serializer.ts +32 -32
  331. package/src/worktree/branch-freshness.ts +45 -45
  332. package/src/worktree/cleanup.ts +75 -75
  333. package/src/worktree/worktree-manager.ts +188 -188
  334. package/teams/default.team.md +12 -12
  335. package/teams/fast-fix.team.md +11 -11
  336. package/teams/implementation.team.md +18 -18
  337. package/teams/parallel-research.team.md +14 -14
  338. package/teams/research.team.md +11 -11
  339. package/teams/review.team.md +12 -12
  340. package/tsconfig.json +19 -19
  341. package/workflows/default.workflow.md +30 -30
  342. package/workflows/fast-fix.workflow.md +23 -23
  343. package/workflows/implementation.workflow.md +43 -43
  344. package/workflows/parallel-research.workflow.md +46 -46
  345. package/workflows/research.workflow.md +22 -22
  346. package/workflows/review.workflow.md +30 -30
  347. package/skills/task-packet/SKILL.md +0 -28
  348. package/skills/verify-evidence/SKILL.md +0 -27
@@ -1,301 +1,317 @@
1
- import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
- import * as fs from "node:fs";
3
- import type { MetricRegistry } from "../observability/metric-registry.ts";
4
- import { appendEvent, scanSequence } from "../state/event-log.ts";
5
- import { withRunLockSync } from "../state/locks.ts";
6
- import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
7
- import type { TeamTaskState } from "../state/types.ts";
8
- import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
9
- import type { ManifestCache } from "./manifest-cache.ts";
10
- import { checkProcessLiveness } from "./process-status.ts";
11
- import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
12
- import { executeHook, appendHookEvent } from "../hooks/registry.ts";
13
- import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
14
- import { resolveRealContainedPath } from "../utils/safe-paths.ts";
15
- import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
16
-
17
- export interface RecoveryPlan {
18
- runId: string;
19
- resumableTasks: string[];
20
- preservedTasks: string[];
21
- lastEventSeq: number;
22
- }
23
-
24
- function isTerminalTask(task: TeamTaskState): boolean {
25
- return task.status === "completed" || task.status === "failed" || task.status === "cancelled" || task.status === "skipped";
26
- }
27
-
28
- function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
29
- if (task.status !== "running") return false;
30
- if (!task.heartbeat) return true;
31
- return task.heartbeat.alive === false || isWorkerHeartbeatStale(task.heartbeat, deadMs);
32
- }
33
-
34
- export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
35
- const plans: RecoveryPlan[] = [];
36
- for (const manifest of manifestCache.list(50)) {
37
- if (manifest.status !== "running") continue;
38
- if (manifest.async?.pid !== undefined && checkProcessLiveness(manifest.async.pid).alive) continue;
39
- const loaded = loadRunManifestById(cwd, manifest.runId);
40
- if (!loaded) continue;
41
- const resumableTasks = loaded.tasks.filter((task) => shouldRecoverTask(task, deadMs)).map((task) => task.id);
42
- if (!resumableTasks.length) continue;
43
- plans.push({ runId: manifest.runId, resumableTasks, preservedTasks: loaded.tasks.filter(isTerminalTask).map((task) => task.id), lastEventSeq: scanSequence(loaded.manifest.eventsPath) });
44
- }
45
- return plans;
46
- }
47
-
48
- export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
49
- const loaded = loadRunManifestById(ctx.cwd, plan.runId);
50
- if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
51
-
52
- const hookReport = await executeHook("run_recovery", { runId: plan.runId, cwd: ctx.cwd });
53
- appendHookEvent(loaded.manifest, hookReport);
54
- if (hookReport.outcome === "block") {
55
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_blocked", runId: plan.runId, message: `Recovery blocked by hook: ${hookReport.reason ?? "run_recovery hook blocked the operation."}`, data: { hookOutcome: "block", reason: hookReport.reason } });
56
- return;
57
- }
58
-
59
- const reset = new Set(plan.resumableTasks);
60
- const tasks = loaded.tasks.map((task) => reset.has(task.id) ? { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined } : task);
61
- saveRunTasks(loaded.manifest, tasks);
62
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.resumed", runId: plan.runId, message: `Recovered ${plan.resumableTasks.length} interrupted task(s).`, data: { recoveredFromSeq: plan.lastEventSeq, resumableTasks: plan.resumableTasks } });
63
- registry?.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
64
- }
65
-
66
- export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
67
- const loaded = loadRunManifestById(ctx.cwd, plan.runId);
68
- if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);
69
- // Log the event first — if appendEvent fails, state remains consistent.
70
- appendEvent(loaded.manifest.eventsPath, { type: "crew.run.recovery_declined", runId: plan.runId, message: "Interrupted run was not resumed.", data: { recoveredFromSeq: plan.lastEventSeq } });
71
- updateRunStatus(loaded.manifest, "cancelled", "interrupted-not-resumed");
72
- }
73
-
74
- /**
75
- * Run 3-phase stale reconciliation on all active runs.
76
- * Returns results for each reconciled run.
77
- */
78
- /**
79
- * Auto-cancel orphaned runs whose owner session no longer exists.
80
- *
81
- * When a Pi session dies (crash, force-close, Ctrl+C), `session_shutdown`
82
- * does not fire and child workers are not terminated. The next Pi session
83
- * must detect these orphaned runs and cancel them.
84
- *
85
- * Criteria for orphan detection:
86
- * 1. Manifest status is "running"
87
- * 2. Manifest has an `ownerSessionId` that is NOT the current session
88
- * 3. The owner session's process is no longer alive (PID check)
89
- * 4. No recent heartbeat activity (task heartbeat or agent progress within threshold)
90
- *
91
- * Returns the number of runs cancelled.
92
- */
93
- export function cancelOrphanedRuns(
94
- cwd: string,
95
- manifestCache: ManifestCache,
96
- currentSessionId: string,
97
- staleThresholdMs = 300_000,
98
- now = Date.now(),
99
- ): { cancelled: string[]; skipped: string[] } {
100
- const cancelled: string[] = [];
101
- const skipped: string[] = [];
102
-
103
- // Phase 1: Scan project-level manifests via manifestCache
104
- for (const manifest of manifestCache.list(50)) {
105
- if (manifest.status !== "running") continue;
106
-
107
- // Only consider runs owned by a different session
108
- const ownerId = manifest.ownerSessionId;
109
- if (!ownerId || ownerId === currentSessionId) continue;
110
-
111
- // Check if the owner process is still alive
112
- const ownerPid = manifest.async?.pid;
113
- if (ownerPid !== undefined && checkProcessLiveness(ownerPid).alive) {
114
- skipped.push(manifest.runId);
115
- continue;
116
- }
117
-
118
- // Check for recent heartbeat activity
119
- const loaded = loadRunManifestById(cwd, manifest.runId);
120
- if (!loaded) continue;
121
-
122
- const hasRecentActivity = loaded.tasks.some((task) => {
123
- if (task.status !== "running" && task.status !== "waiting") return false;
124
- const heartbeatAt = task.heartbeat?.lastSeenAt ? new Date(task.heartbeat.lastSeenAt).getTime() : Number.NaN;
125
- if (task.heartbeat?.alive !== false && Number.isFinite(heartbeatAt) && now - heartbeatAt <= staleThresholdMs) return true;
126
- const activityAt = task.agentProgress?.lastActivityAt ? new Date(task.agentProgress.lastActivityAt).getTime() : Number.NaN;
127
- return Number.isFinite(activityAt) && now - activityAt <= staleThresholdMs;
128
- });
129
-
130
- if (hasRecentActivity) {
131
- skipped.push(manifest.runId);
132
- continue;
133
- }
134
-
135
- // Orphan confirmed — cancel all running tasks
136
- withRunLockSync(loaded.manifest, () => {
137
- const fresh = loadRunManifestById(cwd, manifest.runId);
138
- if (!fresh || fresh.manifest.status !== "running") return;
139
-
140
- const now_iso = new Date(now).toISOString();
141
- const repairedTasks = fresh.tasks.map((task) => {
142
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
143
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: `Orphaned run: owner session ${ownerId} no longer exists` };
144
- }
145
- return task;
146
- });
147
-
148
- saveRunTasks(fresh.manifest, repairedTasks);
149
- updateRunStatus(fresh.manifest, "cancelled", `Orphaned run: owner session ${ownerId} no longer exists`);
150
- appendEvent(fresh.manifest.eventsPath, { type: "crew.run.orphan_cancelled", runId: manifest.runId, message: `Auto-cancelled orphaned run (owner: ${ownerId})`, data: { ownerSessionId: ownerId, cancelledTasks: repairedTasks.filter((t) => t.status === "cancelled").length } });
151
- cancelled.push(manifest.runId);
152
- });
153
- }
154
-
155
- return { cancelled, skipped };
156
- }
157
-
158
- /**
159
- * Purge the global active-run-index of entries whose manifest is no longer active.
160
- *
161
- * This scans every entry in active-run-index.json and removes any whose:
162
- * - manifest file no longer exists, OR
163
- * - manifest status is terminal (completed/failed/cancelled/blocked), OR
164
- * - manifest cwd directory no longer exists (e.g. temp test dirs)
165
- *
166
- * Also removes entries where the manifest is still "running" but:
167
- * - The cwd has been deleted (temp dir cleanup)
168
- * - The async worker PID is dead AND no heartbeat for > threshold
169
- *
170
- * This is the **global** cleanup that cancelOrphanedRuns (project-scoped)
171
- * cannot reach.
172
- */
173
- /**
174
- * Best-effort removal of stateRoot and artifactsRoot directories for a purged run.
175
- * Uses resolveRealContainedPath to ensure we only delete paths that are safely
176
- * contained within a known crew root (project or user level).
177
- */
178
- function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): void {
179
- const roots = [projectCrewRoot(entry.cwd), userCrewRoot()];
180
- for (const root of roots) {
181
- try {
182
- resolveRealContainedPath(root, entry.stateRoot);
183
- // If we get here, stateRoot is safely contained — remove it
184
- fs.rmSync(entry.stateRoot, { recursive: true, force: true });
185
- break;
186
- } catch {
187
- // Not contained in this root, try next
188
- }
189
- }
190
- // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
191
- }
192
-
193
- /**
194
- * Purge the global active-run-index of entries whose manifest is no longer active.
195
- *
196
- * Note: This function only cleans user-level active run entries.
197
- * Project-level stale runs are handled by session_start auto-prune triggered during run creation.
198
- */
199
- export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.now()): { purged: string[]; kept: string[] } {
200
- const purged: string[] = [];
201
- const kept: string[] = [];
202
- const entries = readActiveRunRegistry();
203
-
204
- for (const entry of entries) {
205
- // 1. Manifest file gone → definitely stale
206
- if (!fs.existsSync(entry.manifestPath)) {
207
- unregisterActiveRun(entry.runId);
208
- tryRemoveRunDirectories(entry);
209
- purged.push(entry.runId);
210
- continue;
211
- }
212
-
213
- // 2. CWD gone → temp dir cleaned up
214
- if (!fs.existsSync(entry.cwd)) {
215
- unregisterActiveRun(entry.runId);
216
- tryRemoveRunDirectories(entry);
217
- purged.push(entry.runId);
218
- continue;
219
- }
220
-
221
- // 3. Read manifest status
222
- let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
223
- try {
224
- manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
225
- } catch {
226
- unregisterActiveRun(entry.runId);
227
- tryRemoveRunDirectories(entry);
228
- purged.push(entry.runId);
229
- continue;
230
- }
231
-
232
- // 4. Terminal status no longer active (just unregister, don't delete files)
233
- const terminalStatuses = new Set(["completed", "failed", "cancelled", "blocked"]);
234
- if (manifest && terminalStatuses.has(manifest.status ?? "")) {
235
- unregisterActiveRun(entry.runId);
236
- purged.push(entry.runId);
237
- continue;
238
- }
239
-
240
- // 5. Still "running" — check if worker PID is dead and no heartbeat
241
- if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
242
- const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
243
- if (!pidAlive) {
244
- // Check age — if manifest hasn't been updated in > threshold, it's stale
245
- const updatedAt = new Date(entry.updatedAt).getTime();
246
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
247
- // Dead PID + stale update → cancel the manifest and unregister
248
- try {
249
- const fullLoaded = loadRunManifestById(entry.cwd, entry.runId);
250
- if (fullLoaded) {
251
- const now_iso = new Date(now).toISOString();
252
- const repairedTasks = fullLoaded.tasks.map((task) => {
253
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
254
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
255
- }
256
- return task;
257
- });
258
- saveRunTasks(fullLoaded.manifest, repairedTasks);
259
- updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
260
- }
261
- } catch {
262
- // Best-effort manifest cleanup
263
- }
264
- unregisterActiveRun(entry.runId);
265
- tryRemoveRunDirectories(entry);
266
- purged.push(entry.runId);
267
- continue;
268
- }
269
- }
270
- }
271
-
272
- kept.push(entry.runId);
273
- }
274
-
275
- return { purged, kept };
276
- }
277
-
278
// Removed ("-") side of the diff: the previous revision of reconcileAllStaleRuns.
// It walks up to 50 cached manifests whose status is "running", re-loads each run
// under its run lock, and asks reconcileStaleRun for a verdict. A repaired run has
// its repaired tasks saved (when provided), is marked "failed", and gets a
// "crew.run.reconciled_stale" event. Every non-"healthy" result is returned.
- export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache, now = Date.now()): ReconcileResult[] {
279
- const results: ReconcileResult[] = [];
280
- for (const manifest of manifestCache.list(50)) {
281
- if (manifest.status !== "running") continue;
282
- const loaded = loadRunManifestById(cwd, manifest.runId);
283
- if (!loaded) continue;
284
- // Use lock to prevent race with cancel/status handlers modifying the same run
285
- withRunLockSync(loaded.manifest, () => {
286
- // Re-read inside lock to get freshest data
287
- const fresh = loadRunManifestById(cwd, manifest.runId);
288
- if (!fresh || fresh.manifest.status !== "running") return;
289
- const result = reconcileStaleRun(fresh.manifest, fresh.tasks, now);
290
- if (result.repaired) {
291
- if (result.repairedTasks) saveRunTasks(fresh.manifest, result.repairedTasks);
292
- updateRunStatus(fresh.manifest, "failed", `Stale run reconciled: ${result.detail}`);
293
- appendEvent(fresh.manifest.eventsPath, { type: "crew.run.reconciled_stale", runId: manifest.runId, message: result.detail, data: { verdict: result.verdict } });
294
- }
295
- if (result.verdict !== "healthy") {
296
- results.push(result);
297
- }
298
- });
299
- }
300
- return results;
301
- }
1
+ import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import * as fs from "node:fs";
3
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
4
+ import { appendEvent, scanSequence } from "../state/event-log.ts";
5
+ import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
+ import { withRunLockSync } from "../state/locks.ts";
7
+ import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
+ import type { TeamTaskState } from "../state/types.ts";
9
+ import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
+ import type { ManifestCache } from "./manifest-cache.ts";
11
+ import { checkProcessLiveness } from "./process-status.ts";
12
+ import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
13
+ import { executeHook, appendHookEvent } from "../hooks/registry.ts";
14
+ import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
15
+ import { resolveRealContainedPath } from "../utils/safe-paths.ts";
16
+ import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
17
+ import { terminateLiveAgentsForRun } from "./live-agent-manager.ts";
18
+
19
/**
 * Proposal for resuming one interrupted run. Built by `detectInterruptedRuns`
 * and consumed by `applyRecoveryPlan` / `declineRecoveryPlan`.
 */
export interface RecoveryPlan {
  /** Identifier of the interrupted run. */
  runId: string;
  /** Ids of the tasks that are reset to "queued" when the plan is applied. */
  resumableTasks: string[];
  /** Ids of the tasks that were already completed, failed, cancelled or skipped. */
  preservedTasks: string[];
  /** Event-log sequence value captured when the plan was built. */
  lastEventSeq: number;
}
25
+
26
/**
 * Reports whether a task's status is one of the finished states:
 * "completed", "failed", "cancelled" or "skipped".
 */
function isTerminalTask(task: TeamTaskState): boolean {
  switch (task.status) {
    case "completed":
    case "failed":
    case "cancelled":
    case "skipped":
      return true;
    default:
      return false;
  }
}
29
+
30
/**
 * Decides whether a task should be picked up by run recovery.
 *
 * Only "running" tasks qualify. Such a task is recoverable when it has no
 * heartbeat at all, when its heartbeat reports `alive === false`, or when
 * `isWorkerHeartbeatStale` judges the heartbeat stale for `deadMs`.
 *
 * @param task   Task state to inspect.
 * @param deadMs Staleness threshold forwarded to `isWorkerHeartbeatStale`.
 */
function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
  const { status, heartbeat } = task;
  if (status !== "running") return false;
  // A running task that never reported a heartbeat is treated as interrupted.
  if (!heartbeat) return true;
  if (heartbeat.alive === false) return true;
  return isWorkerHeartbeatStale(heartbeat, deadMs);
}
35
+
36
/**
 * Scans up to 50 cached manifests for runs that look interrupted and builds a
 * recovery plan for each one.
 *
 * A run is considered only when its status is "running" or "blocked" and its
 * recorded async worker pid (if any) is not alive. A plan is produced only
 * when at least one task passes `shouldRecoverTask`.
 *
 * @param cwd           Working directory used to load the full run state.
 * @param manifestCache Source of the manifest summaries to scan.
 * @param deadMs        Heartbeat staleness threshold in milliseconds.
 * @returns One plan per interrupted run, in cache order.
 */
export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
  const plans: RecoveryPlan[] = [];

  for (const summary of manifestCache.list(50)) {
    const isCandidate = summary.status === "running" || summary.status === "blocked";
    if (!isCandidate) continue;

    // A live async worker means the run is still being driven; leave it alone.
    const workerPid = summary.async?.pid;
    if (workerPid !== undefined && checkProcessLiveness(workerPid).alive) continue;

    const run = loadRunManifestById(cwd, summary.runId);
    if (!run) continue;

    const resumableTasks: string[] = [];
    for (const task of run.tasks) {
      if (shouldRecoverTask(task, deadMs)) resumableTasks.push(task.id);
    }
    if (resumableTasks.length === 0) continue;

    const preservedTasks = run.tasks.filter(isTerminalTask).map((task) => task.id);
    const lastEventSeq = scanSequence(run.manifest.eventsPath);
    plans.push({ runId: summary.runId, resumableTasks, preservedTasks, lastEventSeq });
  }

  return plans;
}
49
+
50
/**
 * Applies a recovery plan: every task listed in `plan.resumableTasks` is reset
 * to "queued" (start/finish times, error and heartbeat cleared) and a
 * "crew.run.resumed" event is appended.
 *
 * The "run_recovery" hook runs first and its report is always recorded. If the
 * hook's outcome is "block", a "crew.run.recovery_blocked" event is appended
 * and nothing else is changed.
 *
 * @param plan     Plan produced by `detectInterruptedRuns`.
 * @param ctx      Supplies the working directory used to load the run.
 * @param registry Optional metrics registry; when given, "crew.run.count" is
 *                 incremented with `status: "resumed"`.
 * @throws Error when the run named by the plan cannot be loaded.
 */
export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
  const run = loadRunManifestById(ctx.cwd, plan.runId);
  if (!run) throw new Error(`Run '${plan.runId}' not found.`);
  const runManifest = run.manifest;

  const hookReport = await executeHook("run_recovery", { runId: plan.runId, cwd: ctx.cwd });
  appendHookEvent(runManifest, hookReport);

  if (hookReport.outcome === "block") {
    const shownReason = hookReport.reason ?? "run_recovery hook blocked the operation.";
    appendEvent(runManifest.eventsPath, {
      type: "crew.run.recovery_blocked",
      runId: plan.runId,
      message: `Recovery blocked by hook: ${shownReason}`,
      data: { hookOutcome: "block", reason: hookReport.reason },
    });
    return;
  }

  const idsToReset = new Set(plan.resumableTasks);
  const tasks = run.tasks.map((task) => {
    if (!idsToReset.has(task.id)) return task;
    // Put the interrupted task back in the queue with its run-time fields cleared.
    return { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined };
  });
  saveRunTasks(runManifest, tasks);

  appendEvent(runManifest.eventsPath, {
    type: "crew.run.resumed",
    runId: plan.runId,
    message: `Recovered ${plan.resumableTasks.length} interrupted task(s).`,
    data: { recoveredFromSeq: plan.lastEventSeq, resumableTasks: plan.resumableTasks },
  });
  registry?.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
}
67
+
68
/**
 * Records that an interrupted run will not be resumed and marks it "cancelled".
 *
 * @param plan Plan produced by `detectInterruptedRuns`.
 * @param ctx  Supplies the working directory used to load the run.
 * @throws Error when the run named by the plan cannot be loaded.
 */
export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
  const run = loadRunManifestById(ctx.cwd, plan.runId);
  if (!run) throw new Error(`Run '${plan.runId}' not found.`);
  const { manifest: runManifest } = run;

  // The event is written before the status change, so a failing append leaves the status untouched.
  appendEvent(runManifest.eventsPath, {
    type: "crew.run.recovery_declined",
    runId: plan.runId,
    message: "Interrupted run was not resumed.",
    data: { recoveredFromSeq: plan.lastEventSeq },
  });
  updateRunStatus(runManifest, "cancelled", "interrupted-not-resumed");
}
75
+
76
+ /**
77
+ * reconcileAllStaleRuns (defined at the end of this file): run 3-phase stale reconciliation on all active runs.
78
+ * Returns results for each reconciled run.
79
+ */
80
/**
 * Auto-cancel orphaned runs whose owner session no longer exists.
 *
 * When a Pi session dies (crash, force-close, Ctrl+C), `session_shutdown`
 * does not fire and child workers are not terminated. The next Pi session
 * must detect these orphaned runs and cancel them.
 *
 * A run from the first 50 cached manifests is cancelled when all of these hold:
 *   1. Its status is "running" or "blocked".
 *   2. It has an `ownerSessionId` that differs from `currentSessionId`.
 *   3. Its recorded `async.pid`, if present, is not alive.
 *   4. No "running"/"waiting" task shows a heartbeat or agent activity within
 *      `staleThresholdMs` of `now`.
 *
 * Cancellation happens under the run lock after re-reading the run: unfinished
 * tasks are marked "cancelled", the run status becomes "cancelled", and a
 * "crew.run.orphan_cancelled" event is appended. Live agents for the run are
 * then terminated on a best-effort, fire-and-forget basis.
 *
 * @param cwd              Working directory used to load full run state.
 * @param manifestCache    Source of the manifest summaries to scan.
 * @param currentSessionId Session that must never be treated as an orphan owner.
 * @param staleThresholdMs Maximum age of activity that still counts as recent.
 * @param now              Reference time in epoch milliseconds.
 * @returns Ids of the runs that were cancelled, and of the runs skipped
 *          because their worker is alive or they showed recent activity.
 */
export function cancelOrphanedRuns(
  cwd: string,
  manifestCache: ManifestCache,
  currentSessionId: string,
  staleThresholdMs = 300_000,
  now = Date.now(),
): { cancelled: string[]; skipped: string[] } {
  const cancelled: string[] = [];
  const skipped: string[] = [];

  const isActiveStatus = (status: unknown): boolean => status === "running" || status === "blocked";

  // A timestamp is recent when it is present, parses, and is no older than the threshold.
  const isRecent = (stamp: string | number | Date | null | undefined): boolean => {
    if (!stamp) return false;
    const at = new Date(stamp).getTime();
    return Number.isFinite(at) && now - at <= staleThresholdMs;
  };

  // Phase 1: Scan project-level manifests via manifestCache
  for (const manifest of manifestCache.list(50)) {
    if (!isActiveStatus(manifest.status)) continue;

    // Only runs owned by some other session can be orphans.
    const ownerId = manifest.ownerSessionId;
    if (!ownerId || ownerId === currentSessionId) continue;

    // A live recorded worker pid means the run is still being driven.
    const ownerPid = manifest.async?.pid;
    if (ownerPid !== undefined && checkProcessLiveness(ownerPid).alive) {
      skipped.push(manifest.runId);
      continue;
    }

    const loaded = loadRunManifestById(cwd, manifest.runId);
    if (!loaded) continue;

    // Recent heartbeat or agent progress on an in-flight task keeps the run alive.
    const hasRecentActivity = loaded.tasks.some((task) => {
      const inFlight = task.status === "running" || task.status === "waiting";
      if (!inFlight) return false;
      if (task.heartbeat?.alive !== false && isRecent(task.heartbeat?.lastSeenAt)) return true;
      return isRecent(task.agentProgress?.lastActivityAt);
    });
    if (hasRecentActivity) {
      skipped.push(manifest.runId);
      continue;
    }

    // Orphan confirmed — mark durable state terminal before best-effort live-agent abort.
    // terminateLiveAgent unregisters handles before awaiting abort(), and live-executor's
    // isCurrent() checks durable terminal state before writing progress.
    const reason = `Orphaned run: owner session ${ownerId} no longer exists`;
    let cancelledRun = false;

    withRunLockSync(loaded.manifest, () => {
      // Re-read under the lock; another handler may have finished the run meanwhile.
      const fresh = loadRunManifestById(cwd, manifest.runId);
      if (!fresh || !isActiveStatus(fresh.manifest.status)) return;

      // Orphan confirmed — cancel every task that has not finished yet.
      const now_iso = new Date(now).toISOString();
      const repairedTasks = fresh.tasks.map((task) => {
        const unfinished = task.status === "running" || task.status === "queued" || task.status === "waiting";
        return unfinished ? { ...task, status: "cancelled" as const, finishedAt: now_iso, error: reason } : task;
      });

      saveRunTasks(fresh.manifest, repairedTasks);
      for (const task of repairedTasks) {
        try {
          upsertCrewAgent(fresh.manifest, recordFromTask(fresh.manifest, task, "scaffold"));
        } catch {
          /* non-critical */
        }
      }
      updateRunStatus(fresh.manifest, "cancelled", reason);

      const cancelledTasks = repairedTasks.filter((task) => task.status === "cancelled").length;
      appendEvent(fresh.manifest.eventsPath, {
        type: "crew.run.orphan_cancelled",
        runId: manifest.runId,
        message: `Auto-cancelled orphaned run (owner: ${ownerId})`,
        data: { ownerSessionId: ownerId, cancelledTasks },
      });
      cancelled.push(manifest.runId);
      cancelledRun = true;
    });

    if (cancelledRun) {
      void terminateLiveAgentsForRun(manifest.runId, "cancelled", appendEvent, loaded.manifest.eventsPath).catch(() => {});
    }
  }

  return { cancelled, skipped };
}
167
+
168
+ /**
169
+ * purgeStaleActiveRunIndex (defined below, after tryRemoveRunDirectories): purge the global active-run-index of entries whose manifest is no longer active.
170
+ *
171
+ * This scans every entry in active-run-index.json and removes any whose:
172
+ * - manifest file no longer exists, OR
173
+ * - manifest status is terminal (completed/failed/cancelled/blocked), OR
174
+ * - manifest cwd directory no longer exists (e.g. temp test dirs)
175
+ *
176
+ * Also removes entries where the manifest is still "running" but:
177
+ * - The cwd has been deleted (temp dir cleanup)
178
+ * - The async worker PID is dead AND no heartbeat for > threshold
179
+ *
180
+ * This is the **global** cleanup that cancelOrphanedRuns (project-scoped)
181
+ * cannot reach.
182
+ */
183
/**
 * Best-effort removal of the `stateRoot` directory of a purged run.
 *
 * The directory is deleted only after `resolveRealContainedPath` accepts it
 * for one of the known crew roots (project-level for `entry.cwd`, then
 * user-level). That helper is expected to throw when the path is not
 * contained. Any error for a root is swallowed and the next root is tried;
 * the first root that succeeds ends the search.
 *
 * @param entry Registry entry providing the run's `stateRoot` and `cwd`.
 */
function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): void {
  const candidateRoots = [projectCrewRoot(entry.cwd), userCrewRoot()];

  // `some` stops at the first root for which containment check and removal both succeed.
  candidateRoots.some((root) => {
    try {
      resolveRealContainedPath(root, entry.stateRoot);
      // Reaching this line means stateRoot is safely contained in `root`.
      fs.rmSync(entry.stateRoot, { recursive: true, force: true });
      return true;
    } catch {
      // Not contained in this root (or removal failed): try the next one.
      return false;
    }
  });
  // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
}
202
+
203
/**
 * Purge the global active-run-index of entries whose manifest is no longer active.
 *
 * Each entry returned by `readActiveRunRegistry` is handled as follows:
 *   1. Manifest file missing: unregister and remove the run's state directory.
 *   2. Entry `cwd` missing: same as 1.
 *   3. Manifest unreadable or not valid JSON: same as 1.
 *   4. Manifest status is completed/failed/cancelled/blocked: unregister only.
 *   5. Status "running" with a recorded async pid that is dead, and
 *      `entry.updatedAt` older than `staleThresholdMs`: best-effort cancel of
 *      the run and its unfinished tasks, then same as 1.
 * Every other entry is kept.
 *
 * Note (carried over from the earlier comment, not verifiable from this
 * function): this only cleans user-level active run entries. Project-level
 * stale runs are handled by the session_start auto-prune triggered during
 * run creation.
 *
 * @param staleThresholdMs Minimum age of `entry.updatedAt` for step 5.
 * @param now              Reference time in epoch milliseconds.
 * @returns Run ids removed from the index and run ids left in place.
 */
export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.now()): { purged: string[]; kept: string[] } {
  const purged: string[] = [];
  const kept: string[] = [];
  const terminalStatuses = new Set(["completed", "failed", "cancelled", "blocked"]);
  const orphanReason = "Orphaned run: worker process dead and no recent activity";

  for (const entry of readActiveRunRegistry()) {
    // Shared tail of every purge path; `removeFiles` decides whether the state directory goes too.
    const purge = (removeFiles: boolean): void => {
      unregisterActiveRun(entry.runId);
      if (removeFiles) tryRemoveRunDirectories(entry);
      purged.push(entry.runId);
    };

    // 1. Manifest file gone → definitely stale. 2. CWD gone → temp dir cleaned up.
    if (!fs.existsSync(entry.manifestPath) || !fs.existsSync(entry.cwd)) {
      purge(true);
      continue;
    }

    // 3. Read manifest status; an unreadable manifest is treated as stale.
    let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
    try {
      manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
    } catch {
      purge(true);
      continue;
    }

    // 4. Terminal status → no longer active (just unregister, don't delete files)
    if (manifest && terminalStatuses.has(manifest.status ?? "")) {
      purge(false);
      continue;
    }

    // 5. Still "running" — only a dead worker PID plus a stale index entry is purged.
    const workerPid = manifest?.status === "running" ? manifest.async?.pid : undefined;
    if (workerPid === undefined || checkProcessLiveness(workerPid).alive) {
      kept.push(entry.runId);
      continue;
    }

    // Check age — if the entry hasn't been updated in > threshold, it's stale.
    const updatedAt = new Date(entry.updatedAt).getTime();
    const isStale = Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs;
    if (!isStale) {
      kept.push(entry.runId);
      continue;
    }

    // Dead PID + stale update → cancel the manifest and unregister
    try {
      const run = loadRunManifestById(entry.cwd, entry.runId);
      if (run) {
        const now_iso = new Date(now).toISOString();
        const repairedTasks = run.tasks.map((task) => {
          const unfinished = task.status === "running" || task.status === "queued" || task.status === "waiting";
          return unfinished ? { ...task, status: "cancelled" as const, finishedAt: now_iso, error: orphanReason } : task;
        });
        saveRunTasks(run.manifest, repairedTasks);
        for (const task of repairedTasks) {
          try {
            upsertCrewAgent(run.manifest, recordFromTask(run.manifest, task, "scaffold"));
          } catch {
            /* non-critical */
          }
        }
        updateRunStatus(run.manifest, "cancelled", orphanReason);
        void terminateLiveAgentsForRun(run.manifest.runId, "cancelled", appendEvent, run.manifest.eventsPath).catch(() => {});
      }
    } catch {
      // Best-effort manifest cleanup
    }
    purge(true);
  }

  return { purged, kept };
}
289
+
290
/**
 * Runs stale-run reconciliation over up to 50 cached manifests whose status is
 * "running" or "blocked".
 *
 * Each candidate is re-read under its run lock and handed to
 * `reconcileStaleRun`. When the result is repaired, or its verdict is
 * "result_exists", any repaired tasks are saved, the run is marked "failed",
 * its live agents are terminated (fire-and-forget), and a
 * "crew.run.reconciled_stale" event is appended.
 *
 * @param cwd           Working directory used to load full run state.
 * @param manifestCache Source of the manifest summaries to scan.
 * @param now           Reference time in epoch milliseconds.
 * @returns Every reconcile result whose verdict is not "healthy".
 */
export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache, now = Date.now()): ReconcileResult[] {
  const results: ReconcileResult[] = [];
  const isActiveStatus = (status: unknown): boolean => status === "running" || status === "blocked";

  for (const summary of manifestCache.list(50)) {
    if (!isActiveStatus(summary.status)) continue;

    const loaded = loadRunManifestById(cwd, summary.runId);
    if (!loaded) continue;

    // Use lock to prevent race with cancel/status handlers modifying the same run
    withRunLockSync(loaded.manifest, () => {
      // Re-read inside lock to get freshest data
      const fresh = loadRunManifestById(cwd, summary.runId);
      if (!fresh || !isActiveStatus(fresh.manifest.status)) return;

      const { manifest: runManifest, tasks } = fresh;
      const result = reconcileStaleRun(runManifest, tasks, now);

      const needsFailing = result.repaired || result.verdict === "result_exists";
      if (needsFailing) {
        const { repairedTasks } = result;
        if (repairedTasks) {
          saveRunTasks(runManifest, repairedTasks);
          for (const task of repairedTasks) {
            try {
              upsertCrewAgent(runManifest, recordFromTask(runManifest, task, "scaffold"));
            } catch {
              /* non-critical */
            }
          }
        }
        updateRunStatus(runManifest, "failed", `Stale run reconciled: ${result.detail}`);
        void terminateLiveAgentsForRun(runManifest.runId, "failed", appendEvent, runManifest.eventsPath).catch(() => {});
        appendEvent(runManifest.eventsPath, {
          type: "crew.run.reconciled_stale",
          runId: summary.runId,
          message: result.detail,
          data: { verdict: result.verdict },
        });
      }

      if (result.verdict !== "healthy") results.push(result);
    });
  }

  return results;
}