@pi-agents/orchid 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/LICENSE +21 -0
  3. package/README.md +246 -0
  4. package/agents/AGENTS-MANIFEST.md +42 -0
  5. package/agents/brain.md +42 -0
  6. package/agents/context-builder.md +46 -0
  7. package/agents/delegate.md +12 -0
  8. package/agents/dev-1.md +42 -0
  9. package/agents/oracle.md +73 -0
  10. package/agents/planner.md +55 -0
  11. package/agents/researcher.md +52 -0
  12. package/agents/reviewer.md +79 -0
  13. package/agents/scout.md +50 -0
  14. package/agents/tester.md +45 -0
  15. package/agents/worker.md +55 -0
  16. package/extensions/ralph.ts +1 -0
  17. package/extensions/reviewer-extension.ts +125 -0
  18. package/extensions/task-orchestrator.ts +28 -0
  19. package/package.json +63 -0
  20. package/prompts/gather-context-and-clarify.md +13 -0
  21. package/prompts/parallel-cleanup.md +59 -0
  22. package/prompts/parallel-context-build.md +53 -0
  23. package/prompts/parallel-handoff-plan.md +59 -0
  24. package/prompts/parallel-research.md +50 -0
  25. package/prompts/parallel-review.md +54 -0
  26. package/prompts/review-loop.md +41 -0
  27. package/skills/orchid/SKILL.md +214 -0
  28. package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
  29. package/skills/orchid/orchid-converge/SKILL.md +124 -0
  30. package/skills/orchid/orchid-decompose/SKILL.md +201 -0
  31. package/skills/orchid/orchid-doctor/SKILL.md +162 -0
  32. package/skills/orchid/orchid-investigate/SKILL.md +102 -0
  33. package/skills/orchid/orchid-launch/SKILL.md +147 -0
  34. package/skills/ralph/SKILL.md +73 -0
  35. package/skills/subagents/pi-subagents/SKILL.md +813 -0
  36. package/src/index.ts +7 -0
  37. package/src/orchestrator/abort.ts +534 -0
  38. package/src/orchestrator/agent-bridge-extension.ts +1020 -0
  39. package/src/orchestrator/agent-host.ts +954 -0
  40. package/src/orchestrator/cleanup.ts +776 -0
  41. package/src/orchestrator/config-loader.ts +1412 -0
  42. package/src/orchestrator/config-schema.ts +690 -0
  43. package/src/orchestrator/config.ts +81 -0
  44. package/src/orchestrator/context-window.ts +66 -0
  45. package/src/orchestrator/diagnostic-reports.ts +475 -0
  46. package/src/orchestrator/diagnostics.ts +394 -0
  47. package/src/orchestrator/discovery.ts +1833 -0
  48. package/src/orchestrator/engine-worker.ts +415 -0
  49. package/src/orchestrator/engine.ts +5940 -0
  50. package/src/orchestrator/execution.ts +3104 -0
  51. package/src/orchestrator/extension.ts +5934 -0
  52. package/src/orchestrator/formatting.ts +785 -0
  53. package/src/orchestrator/git.ts +88 -0
  54. package/src/orchestrator/index.ts +28 -0
  55. package/src/orchestrator/lane-runner.ts +1787 -0
  56. package/src/orchestrator/mailbox.ts +780 -0
  57. package/src/orchestrator/merge.ts +3414 -0
  58. package/src/orchestrator/messages.ts +1062 -0
  59. package/src/orchestrator/migrations.ts +278 -0
  60. package/src/orchestrator/naming.ts +117 -0
  61. package/src/orchestrator/path-resolver.ts +275 -0
  62. package/src/orchestrator/persistence.ts +2625 -0
  63. package/src/orchestrator/process-registry.ts +452 -0
  64. package/src/orchestrator/quality-gate.ts +1085 -0
  65. package/src/orchestrator/resume.ts +3488 -0
  66. package/src/orchestrator/sessions.ts +57 -0
  67. package/src/orchestrator/settings-loader.ts +136 -0
  68. package/src/orchestrator/settings-tui.ts +2208 -0
  69. package/src/orchestrator/sidecar-telemetry.ts +267 -0
  70. package/src/orchestrator/supervisor.ts +4548 -0
  71. package/src/orchestrator/task-executor-core.ts +675 -0
  72. package/src/orchestrator/tmux-compat.ts +37 -0
  73. package/src/orchestrator/tool-allowlist-constants.ts +37 -0
  74. package/src/orchestrator/types.ts +4465 -0
  75. package/src/orchestrator/verification.ts +547 -0
  76. package/src/orchestrator/waves.ts +1564 -0
  77. package/src/orchestrator/workspace.ts +707 -0
  78. package/src/orchestrator/worktree.ts +2725 -0
  79. package/src/ralph/index.ts +825 -0
  80. package/src/subagents/agents/agent-management.ts +648 -0
  81. package/src/subagents/agents/agent-scope.ts +6 -0
  82. package/src/subagents/agents/agent-selection.ts +23 -0
  83. package/src/subagents/agents/agent-serializer.ts +86 -0
  84. package/src/subagents/agents/agents.ts +832 -0
  85. package/src/subagents/agents/chain-serializer.ts +137 -0
  86. package/src/subagents/agents/frontmatter.ts +29 -0
  87. package/src/subagents/agents/identity.ts +30 -0
  88. package/src/subagents/agents/skills.ts +632 -0
  89. package/src/subagents/extension/config.ts +16 -0
  90. package/src/subagents/extension/control-notices.ts +92 -0
  91. package/src/subagents/extension/doctor.ts +199 -0
  92. package/src/subagents/extension/fanout-child.ts +170 -0
  93. package/src/subagents/extension/index.ts +573 -0
  94. package/src/subagents/extension/schemas.ts +168 -0
  95. package/src/subagents/intercom/intercom-bridge.ts +379 -0
  96. package/src/subagents/intercom/result-intercom.ts +377 -0
  97. package/src/subagents/runs/background/async-execution.ts +712 -0
  98. package/src/subagents/runs/background/async-job-tracker.ts +310 -0
  99. package/src/subagents/runs/background/async-resume.ts +345 -0
  100. package/src/subagents/runs/background/async-status.ts +325 -0
  101. package/src/subagents/runs/background/completion-dedupe.ts +63 -0
  102. package/src/subagents/runs/background/notify.ts +108 -0
  103. package/src/subagents/runs/background/parallel-groups.ts +45 -0
  104. package/src/subagents/runs/background/result-watcher.ts +307 -0
  105. package/src/subagents/runs/background/run-id-resolver.ts +83 -0
  106. package/src/subagents/runs/background/run-status.ts +269 -0
  107. package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
  108. package/src/subagents/runs/background/subagent-runner.ts +1808 -0
  109. package/src/subagents/runs/background/top-level-async.ts +13 -0
  110. package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
  111. package/src/subagents/runs/foreground/chain-execution.ts +938 -0
  112. package/src/subagents/runs/foreground/execution.ts +918 -0
  113. package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
  114. package/src/subagents/runs/shared/completion-guard.ts +147 -0
  115. package/src/subagents/runs/shared/long-running-guard.ts +175 -0
  116. package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
  117. package/src/subagents/runs/shared/model-fallback.ts +103 -0
  118. package/src/subagents/runs/shared/nested-events.ts +819 -0
  119. package/src/subagents/runs/shared/nested-path.ts +52 -0
  120. package/src/subagents/runs/shared/nested-render.ts +115 -0
  121. package/src/subagents/runs/shared/parallel-utils.ts +109 -0
  122. package/src/subagents/runs/shared/pi-args.ts +220 -0
  123. package/src/subagents/runs/shared/pi-spawn.ts +115 -0
  124. package/src/subagents/runs/shared/run-history.ts +60 -0
  125. package/src/subagents/runs/shared/single-output.ts +164 -0
  126. package/src/subagents/runs/shared/subagent-control.ts +226 -0
  127. package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
  128. package/src/subagents/runs/shared/worktree.ts +577 -0
  129. package/src/subagents/shared/artifacts.ts +98 -0
  130. package/src/subagents/shared/atomic-json.ts +16 -0
  131. package/src/subagents/shared/file-coalescer.ts +40 -0
  132. package/src/subagents/shared/fork-context.ts +76 -0
  133. package/src/subagents/shared/formatters.ts +133 -0
  134. package/src/subagents/shared/jsonl-writer.ts +81 -0
  135. package/src/subagents/shared/model-info.ts +78 -0
  136. package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
  137. package/src/subagents/shared/session-identity.ts +10 -0
  138. package/src/subagents/shared/session-tokens.ts +44 -0
  139. package/src/subagents/shared/settings.ts +397 -0
  140. package/src/subagents/shared/status-format.ts +49 -0
  141. package/src/subagents/shared/types.ts +822 -0
  142. package/src/subagents/shared/utils.ts +450 -0
  143. package/src/subagents/slash/prompt-template-bridge.ts +397 -0
  144. package/src/subagents/slash/slash-bridge.ts +174 -0
  145. package/src/subagents/slash/slash-commands.ts +528 -0
  146. package/src/subagents/slash/slash-live-state.ts +292 -0
  147. package/src/subagents/tui/render-helpers.ts +80 -0
  148. package/src/subagents/tui/render.ts +1358 -0
  149. package/templates/agents/local/supervisor.md +33 -0
  150. package/templates/agents/local/task-merger.md +27 -0
  151. package/templates/agents/local/task-reviewer.md +30 -0
  152. package/templates/agents/local/task-worker.md +34 -0
  153. package/templates/agents/supervisor-routing.md +92 -0
  154. package/templates/agents/supervisor.md +229 -0
  155. package/templates/agents/task-merger.md +214 -0
  156. package/templates/agents/task-reviewer.md +260 -0
  157. package/templates/agents/task-worker-segment.md +44 -0
  158. package/templates/agents/task-worker.md +557 -0
  159. package/templates/tasks/CONTEXT.md +30 -0
  160. package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
  161. package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
  162. package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
  163. package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
@@ -0,0 +1,3488 @@
1
+ /**
2
+ * Resume logic for paused/interrupted batches
3
+ * @module orch/resume
4
+ */
5
+ import { existsSync } from "fs";
6
+ import { join } from "path";
7
+
8
+ import { assembleDiagnosticInput, emitDiagnosticReports } from "./diagnostic-reports.ts";
9
+ import { runDiscovery } from "./discovery.ts";
10
+ import {
11
+ executeOrchBatch,
12
+ resolveDisplayWaveNumber,
13
+ buildSpawnFailureAlertExtras,
14
+ } from "./engine.ts";
15
+ import {
16
+ buildReviewerEnv,
17
+ buildWorkerEnv,
18
+ buildWorkerExcludeEnv,
19
+ computeTransitiveDependents,
20
+ execLog,
21
+ executeLaneV2,
22
+ executeWave,
23
+ resolveCanonicalTaskPaths,
24
+ } from "./execution.ts";
25
+ import type { MonitorUpdateCallback, RuntimeBackend } from "./execution.ts";
26
+ import { selectRuntimeBackend } from "./engine.ts";
27
+ import { readRegistrySnapshot, isTerminalStatus, isProcessAlive } from "./process-registry.ts";
28
+
/**
 * TP-112: Terminate any alive V2 agents for a lane before re-execution.
 * Per Runtime V2 spec §7.3: detect + terminate + rehydrate.
 * Prevents duplicate concurrent agents for the same lane/task on resume.
 *
 * Looks up three registry keys derived from the lane session name
 * (`<session>-worker`, `<session>-reviewer`, and `<session>` itself) and
 * sends SIGTERM to each entry that is non-terminal and still alive.
 *
 * @param stateRoot - Root passed to `readRegistrySnapshot()` to locate the registry
 * @param batchId - Batch whose registry snapshot is read
 * @param sessionName - Lane session name used as the registry key prefix
 */
function terminateAliveV2Agents(stateRoot: string, batchId: string, sessionName: string): void {
	const snapshot = readRegistrySnapshot(stateRoot, batchId);
	if (!snapshot) return;

	const agentKeys = ["-worker", "-reviewer", ""].map((suffix) => `${sessionName}${suffix}`);
	for (const agentKey of agentKeys) {
		const entry = snapshot.agents[agentKey];
		if (!entry) continue;
		if (isTerminalStatus(entry.status)) continue;
		if (!isProcessAlive(entry.pid)) continue;

		try {
			process.kill(entry.pid, "SIGTERM");
			execLog("resume", agentKey, `terminated alive V2 agent (PID ${entry.pid}) before re-execute`);
		} catch {
			// Best-effort: a failure here is treated as "already dead"
			// (e.g. the process exited between the liveness check and the signal).
		}
	}
}
50
+ import { getCurrentBranch, runGit } from "./git.ts";
51
+ import { mergeWaveByRepo } from "./merge.ts";
52
+ import {
53
+ applyMergeRetryLoop,
54
+ computeCleanupGatePolicy,
55
+ computeMergeFailurePolicy,
56
+ extractFailedRepoId,
57
+ formatRepoMergeSummary,
58
+ ORCH_MESSAGES,
59
+ } from "./messages.ts";
60
+ import type { CleanupGateRepoFailure } from "./messages.ts";
61
+ import { resolveOperatorId } from "./naming.ts";
62
+ import {
63
+ applyPartialProgressToOutcomes,
64
+ deleteBatchState,
65
+ hasTaskDoneMarker,
66
+ loadBatchState,
67
+ persistRuntimeState,
68
+ reconstructBatchStateFromRuntime,
69
+ saveBatchState,
70
+ seedPendingOutcomesForAllocatedLanes,
71
+ syncTaskOutcomesFromMonitor,
72
+ upsertTaskOutcome,
73
+ } from "./persistence.ts";
74
+ import {
75
+ buildBatchProgressSnapshot,
76
+ buildSupervisorSegmentFrontierSnapshot,
77
+ defaultResilienceState,
78
+ StateFileError,
79
+ } from "./types.ts";
80
+ import type {
81
+ AllocatedLane,
82
+ AllocatedTask,
83
+ LaneExecutionResult,
84
+ LaneTaskOutcome,
85
+ LaneTaskStatus,
86
+ MergeWaveResult,
87
+ OrchBatchPhase,
88
+ OrchBatchRuntimeState,
89
+ OrchestratorConfig,
90
+ ParsedTask,
91
+ PersistedBatchState,
92
+ PersistedLaneRecord,
93
+ PersistedSegmentRecord,
94
+ ReconciledTaskState,
95
+ ResumeEligibility,
96
+ ResumePoint,
97
+ TaskRunnerConfig,
98
+ WaveExecutionResult,
99
+ WorkspaceConfig,
100
+ } from "./types.ts";
101
+ import { buildDependencyGraph, resolveBaseBranch, resolveRepoRoot } from "./waves.ts";
102
+ import {
103
+ deleteBranchBestEffort,
104
+ forceCleanupWorktree,
105
+ listWorktrees,
106
+ preserveFailedLaneProgress,
107
+ removeAllWorktrees,
108
+ removeWorktree,
109
+ safeResetWorktree,
110
+ sleepSync,
111
+ } from "./worktree.ts";
112
+
113
+ // ── Resume Repo Helpers ──────────────────────────────────────────────
114
+
/**
 * Collect the unique repo roots referenced by persisted lane records.
 *
 * Every lane's `repoId` is resolved through `resolveRepoRoot()`; the
 * default repo root is then always appended, so the result is never
 * empty. Duplicates are removed while keeping first-seen order (lane
 * order first, default root last unless a lane already resolved to it).
 *
 * Used by inter-wave worktree reset and terminal cleanup to operate
 * on worktrees across all repos in the batch.
 *
 * @param persistedState - Loaded batch state with lane records
 * @param defaultRepoRoot - Default/main repo root (cwd)
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Array of unique repo root paths
 */
export function collectRepoRoots(
	persistedState: PersistedBatchState,
	defaultRepoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string[] {
	const laneRoots = persistedState.lanes.map((lane) =>
		resolveRepoRoot(lane.repoId, defaultRepoRoot, workspaceConfig),
	);

	// The default root goes last so it covers repo mode and any lanes
	// without a repoId; the Set drops it if a lane already produced it.
	return Array.from(new Set<string>([...laneRoots, defaultRepoRoot]));
}
149
+
/**
 * Reverse-lookup a repoId from a resolved repo root path.
 *
 * Workspace config maps repoId → repo config; this scans those entries
 * and returns the first repoId whose configured `path` is strictly equal
 * to `repoRoot` (plain string comparison, no path normalization).
 *
 * Returns `undefined` when there is no workspace config or no entry
 * matches, which is the expected outcome for repo mode or the
 * primary/default repo.
 *
 * Used during cleanup to call `resolveBaseBranch()` per-repo with the
 * correct repoId.
 *
 * @param repoRoot - Resolved absolute path of the repo
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns The repoId or undefined if not found / not in workspace mode
 */
export function resolveRepoIdFromRoot(
	repoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string | undefined {
	if (!workspaceConfig) return undefined;

	const match = Array.from(workspaceConfig.repos).find(
		([, repoConfig]) => repoConfig.path === repoRoot,
	);
	return match ? match[0] : undefined;
}
180
+
/**
 * Reconstruct AllocatedLane[] from persisted lane records.
 *
 * Used during resume to preserve lane metadata (worktreePath, branch, repoId)
 * across persistence checkpoints. Without this, the first resume checkpoint
 * would serialize empty lanes, losing all lane context.
 *
 * When `persistedTasks` is provided, attribution fields from the matching
 * persisted task record are carried forward onto a minimal ParsedTask stub:
 *   - `repoId` → `promptRepoId`, and `resolvedRepoId` (only when defined)
 *   - `taskFolder` (always set; defaults to "")
 *   - `packetRepoId`, `packetTaskPath`, `segmentIds`, `activeSegmentId`
 *     (only when defined; `null` counts as defined and is copied)
 *
 * The stub is intentionally partial: only the fields above are populated,
 * everything else on ParsedTask is absent. Scheduling fields on the lane and
 * its tasks (`order`, `estimatedMinutes`, `estimatedLoad`, `strategy`) are
 * filled with fixed placeholder values.
 *
 * @param persistedLanes - Persisted lane records
 * @param persistedTasks - Optional persisted task records for field carry-forward
 * @returns Reconstructed AllocatedLane array with repo attribution preserved
 */
export function reconstructAllocatedLanes(
	persistedLanes: PersistedLaneRecord[],
	persistedTasks?: PersistedBatchState["tasks"],
): AllocatedLane[] {
	type PersistedTask = PersistedBatchState["tasks"][number];

	// Index persisted tasks by ID; a later duplicate ID overrides an earlier one.
	const taskLookup = new Map<string, PersistedTask>();
	for (const persistedTask of persistedTasks ?? []) {
		taskLookup.set(persistedTask.taskId, persistedTask);
	}

	// Fields copied verbatim (same name on both sides) when present.
	const passThroughFields = [
		"packetRepoId",
		"packetTaskPath",
		"segmentIds",
		"activeSegmentId",
	] as const;

	// Build the minimal ParsedTask stub for one task ID's persisted record.
	const buildTaskStub = (persistedTask: PersistedTask | undefined): ParsedTask => {
		const stub: Partial<ParsedTask> = {};
		if (persistedTask?.repoId !== undefined) {
			stub.promptRepoId = persistedTask.repoId;
		}
		if (persistedTask?.resolvedRepoId !== undefined) {
			stub.resolvedRepoId = persistedTask.resolvedRepoId;
		}
		// TP-169: Always set taskFolder, even when the persisted value is an
		// empty string or the record is missing. Leaving it `undefined` caused
		// crashes in code that reads `task.task.taskFolder` on
		// dynamically-expanded segments persisted with taskFolder="".
		stub.taskFolder = persistedTask?.taskFolder ?? "";

		// NOTE(review): the pass-through fields were accessed through `any` in
		// the previous implementation, presumably because they are not all
		// declared on both types — confirm and tighten if the types allow it.
		// eslint-disable-next-line @typescript-eslint/no-explicit-any
		const source = persistedTask as any;
		// eslint-disable-next-line @typescript-eslint/no-explicit-any
		const target = stub as any;
		for (const field of passThroughFields) {
			if (source?.[field] !== undefined) {
				target[field] = source[field];
			}
		}

		// Because taskFolder is always assigned above, the stub is never empty;
		// callers always receive an object here, never null.
		return stub as ParsedTask;
	};

	return persistedLanes.map((lr) => ({
		laneNumber: lr.laneNumber,
		laneId: lr.laneId,
		laneSessionId: lr.laneSessionId,
		worktreePath: lr.worktreePath,
		branch: lr.branch,
		tasks: lr.taskIds.map((taskId) => ({
			taskId,
			order: 0,
			task: buildTaskStub(taskLookup.get(taskId)),
			estimatedMinutes: 0,
		})),
		strategy: "round-robin" as const,
		estimatedLoad: 0,
		estimatedMinutes: 0,
		// Only emit repoId when the persisted lane had one (repo mode lanes omit it).
		...(lr.repoId !== undefined ? { repoId: lr.repoId } : {}),
	}));
}
261
+
/**
 * Collect unique repo roots from several lane collections at once.
 *
 * Unlike `collectRepoRoots()`, which only reads `persistedState.lanes`,
 * this variant accepts any number of lane arrays. This matters during
 * resumed execution, where new waves may allocate lanes in repos that
 * were not present in the original persisted state.
 *
 * Each lane's `repoId` is resolved through `resolveRepoRoot()`; the
 * default repo root is always included. First-seen order is preserved.
 *
 * @param laneSources - Array of lane arrays to collect repo roots from
 * @param defaultRepoRoot - Default/main repo root (cwd)
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Array of unique repo root paths
 */
export function collectAllRepoRoots(
	laneSources: Array<{ repoId?: string }[]>,
	defaultRepoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string[] {
	const resolvedRoots = laneSources.flatMap((lanes) =>
		lanes.map((lane) => resolveRepoRoot(lane.repoId, defaultRepoRoot, workspaceConfig)),
	);

	const uniqueRoots = new Set<string>(resolvedRoots);
	// Always include the default repo root (covers repo mode and any
	// lanes without repoId).
	uniqueRoots.add(defaultRepoRoot);

	return Array.from(uniqueRoots);
}
295
+
296
+ // ── Resume Pure Functions ────────────────────────────────────────────
297
+
/**
 * Report whether a task's persisted segment frontier is complete, i.e.
 * every persisted segment record belonging to the task has status
 * "succeeded" or "skipped".
 *
 * Returns:
 *   - `true` when the task has segment records and all are succeeded/skipped.
 *   - `true` when no segment records exist for the task (single-segment /
 *     legacy tasks — the guard does not apply and `.DONE` is authoritative).
 *   - `false` as soon as one segment record has any other status.
 *
 * Used by `collectDoneTaskIdsForResume` (TP-196 / #462) to refuse a stale or
 * premature `.DONE` from suppressing re-execution of remaining segments.
 *
 * @param persistedState - Loaded batch state (its `segments` list may be absent)
 * @param taskId - Task whose segment records are inspected
 */
function isSegmentFrontierCompleteForResume(
	persistedState: PersistedBatchState,
	taskId: string,
): boolean {
	for (const segment of persistedState.segments ?? []) {
		if (segment.taskId !== taskId) continue;
		const terminalSuccess = segment.status === "succeeded" || segment.status === "skipped";
		if (!terminalSuccess) return false;
	}
	// Either every matching segment passed, or there were none to check.
	return true;
}
320
+
/**
 * Collect task IDs with authoritative .DONE markers.
 *
 * A marker is looked for in two places, in order:
 *   1. the persisted `taskFolder`, via `hasTaskDoneMarker()`;
 *   2. the `donePath` that `resolveCanonicalTaskPaths()` computes for the
 *      first lane whose `taskIds` contain the task (requires that lane to
 *      have a `worktreePath`).
 * Tasks with an empty `taskFolder` can never match either location.
 *
 * For tasks WITHOUT persisted segment records (single-segment / legacy),
 * a found marker is always authoritative. For tasks WITH segment records
 * (multi-segment), TP-196 / #462 adds a resume guard: when `.DONE` exists
 * but at least one segment is not yet succeeded/skipped, the taskId is NOT
 * added to the done set and a WARN is logged so operators can spot the
 * inconsistency. This function never modifies the on-disk marker.
 *
 * @param persistedState - Loaded batch state (tasks, lanes, segments)
 * @param repoRoot - Repo root forwarded to `resolveCanonicalTaskPaths()`
 * @param workspaceConfig - Workspace configuration; only its presence is used
 * @returns Set of task IDs that may be treated as complete on resume
 */
export function collectDoneTaskIdsForResume(
	persistedState: PersistedBatchState,
	repoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): Set<string> {
	const isWorkspaceMode = !!workspaceConfig;

	// Returns where the .DONE marker was found, or null when there is none.
	const findDoneMarker = (task: PersistedBatchState["tasks"][number]): string | null => {
		const folder = task.taskFolder;
		if (!folder) return null;
		if (hasTaskDoneMarker(folder)) return folder;

		const owningLane = persistedState.lanes.find((lane) => lane.taskIds.includes(task.taskId));
		if (!owningLane?.worktreePath) return null;

		const { donePath } = resolveCanonicalTaskPaths(
			folder,
			owningLane.worktreePath,
			repoRoot,
			isWorkspaceMode,
		);
		return existsSync(donePath) ? donePath : null;
	};

	const doneTaskIds = new Set<string>();
	for (const task of persistedState.tasks) {
		const markerLocation = findDoneMarker(task);
		if (markerLocation === null) continue;

		// TP-196 / #462: Resume guard — refuse `.DONE` authority for multi-segment
		// tasks with an incomplete segment frontier.
		const frontierComplete = isSegmentFrontierCompleteForResume(persistedState, task.taskId);
		if (frontierComplete) {
			doneTaskIds.add(task.taskId);
			continue;
		}
		console.warn(
			`[resume] WARN: .DONE present for task ${task.taskId} at ${markerLocation} but segment frontier is incomplete — not marking complete (#462 guard). Task will re-reconcile.`,
		);
	}
	return doneTaskIds;
}
375
+
/**
 * Decide whether a persisted batch state may be resumed.
 *
 * Resume eligibility matrix:
 * | Phase     | Normal | --force | Reason                                      |
 * |-----------|--------|---------|---------------------------------------------|
 * | paused    | ✅     | ✅      | Batch was paused                            |
 * | executing | ✅     | ✅      | Batch was executing when orchestrator died  |
 * | merging   | ✅     | ✅      | Batch was merging when orchestrator died    |
 * | stopped   | ❌     | ✅      | Batch was stopped by failure policy         |
 * | failed    | ❌     | ✅      | Batch has terminal failure                  |
 * | completed | ❌     | ❌      | Batch already completed                     |
 * | idle      | ❌     | ❌      | Batch never started execution               |
 * | launching | ❌     | ❌      | Batch is currently launching                |
 * | planning  | ❌     | ❌      | Batch was still planning                    |
 * | (other)   | ❌     | ❌      | Unknown phase                               |
 *
 * Pure function — no process or filesystem access.
 *
 * @param state - Persisted batch state to check (only `phase` and `batchId` are read)
 * @param force - When true, `stopped` and `failed` phases become eligible
 * @returns Verdict carrying `eligible`, a human-readable `reason`, and the
 *          `phase` / `batchId` echoed back from `state`
 */
export function checkResumeEligibility(
	state: PersistedBatchState,
	force: boolean = false,
): ResumeEligibility {
	const { phase, batchId } = state;

	// Every verdict echoes phase and batchId; only eligibility and reason vary.
	const verdict = (eligible: boolean, reason: string): ResumeEligibility => ({
		eligible,
		reason,
		phase,
		batchId,
	});

	// Paused or interrupted mid-flight: always resumable.
	if (phase === "paused") {
		return verdict(true, `Batch ${batchId} is paused and can be resumed.`);
	}
	if (phase === "executing" || phase === "merging") {
		return verdict(
			true,
			`Batch ${batchId} was ${phase} when the orchestrator disconnected. Can be resumed.`,
		);
	}

	// Policy stop / terminal failure: resumable only with --force.
	if (phase === "stopped" || phase === "failed") {
		const cause = phase === "stopped" ? "was stopped by failure policy" : "has a terminal failure";
		return force
			? verdict(true, `Batch ${batchId} ${cause}. Force-resuming (--force).`)
			: verdict(
					false,
					`Batch ${batchId} ${cause}. Use --force to resume, or /orch-abort to clean up.`,
				);
	}

	// Everything below is never resumable, with or without --force.
	if (phase === "completed") {
		const forceNote = force ? "--force cannot resume a completed batch. " : "";
		return verdict(
			false,
			`Batch ${batchId} already completed. ${forceNote}Delete the state file or start a new batch.`,
		);
	}
	if (phase === "idle") {
		return verdict(false, `Batch ${batchId} never started execution. Start a new batch with /orch.`);
	}
	if (phase === "launching") {
		return verdict(
			false,
			`Batch ${batchId} is currently launching. Wait for it to start or use /orch-abort.`,
		);
	}
	if (phase === "planning") {
		return verdict(
			false,
			`Batch ${batchId} was still in planning phase. Start a new batch with /orch.`,
		);
	}

	return verdict(
		false,
		`Batch ${batchId} has unknown phase "${phase}". Delete the state file and start a new batch.`,
	);
}
500
+
501
+ interface SegmentFrontierResumeTaskState { // Per-task segment frontier summary, as built by reconstructSegmentFrontier().
502
+ taskId: string; // ID of the task these segments belong to.
503
+ completedSegmentIds: string[]; // Segments whose persisted status is "succeeded" or "skipped".
504
+ inFlightSegmentIds: string[]; // Segments whose persisted status is "running".
505
+ pendingSegmentIds: string[]; // Segments with any other status, or with no persisted record at all.
506
+ failedSegmentIds: string[]; // Segments whose persisted status is "failed" or "stalled".
507
+ nextSegmentId: string | null; // First in-flight segment, else first pending segment whose deps are all completed, else first pending; null if none.
508
+ allSucceeded: boolean; // True only when every listed segment has a record with status "succeeded" ("skipped" does not count).
509
+ dependencyBySegmentId: Map<string, string[]>; // Segment ID -> de-duplicated, sorted dependency IDs, restricted to this task's own segments.
510
+ }
511
+
/**
 * Collapse a persisted segment status into one of four resume buckets.
 *
 *   - "succeeded" / "skipped" → "completed"
 *   - "failed" / "stalled"    → "failed"
 *   - "running"               → "in-flight"
 *   - anything else, including `undefined` (no record) → "pending"
 *
 * @param status - Persisted segment status, or undefined when no record exists
 */
function classifySegmentStatus(
	status: PersistedSegmentRecord["status"] | undefined,
): "completed" | "failed" | "in-flight" | "pending" {
	switch (status) {
		case "succeeded":
		case "skipped":
			return "completed";
		case "failed":
		case "stalled":
			return "failed";
		case "running":
			return "in-flight";
		default:
			return "pending";
	}
}
520
+
/**
 * Reconstruct the per-task segment frontier from persisted segment records.
 *
 * Tasks without any `segmentIds` are skipped and get no entry in the result.
 * For every other task, each segment ID is bucketed by its persisted status
 * (see `classifySegmentStatus`; a missing record counts as pending) and its
 * dependencies are derived: the record's `dependsOnSegmentIds` when non-empty,
 * otherwise the preceding segment in `segmentIds` order. Dependencies outside
 * the task's own segment list are dropped; the rest are de-duplicated and sorted.
 *
 * Mutates persisted task records in-place — but only when at least one of
 * the task's segments has a concrete persisted record:
 *   - any failed/stalled segment → status "failed" (an existing "skipped" is kept), no active segment
 *   - else any running segment   → status "running", active = first running segment
 *   - else any pending segment   → status "pending", active = next segment to run
 *   - else all "succeeded"       → status "succeeded", no active segment
 *   - else (some only "skipped") → status "failed" (an existing "skipped" is kept), no active segment
 *
 * @param persistedState - Loaded batch state; its `tasks` entries may be mutated
 * @returns Map of taskId → frontier summary for every task that has segments
 */
export function reconstructSegmentFrontier(
	persistedState: PersistedBatchState,
): Map<string, SegmentFrontierResumeTaskState> {
	const recordsById = new Map<string, PersistedSegmentRecord>();
	for (const record of persistedState.segments ?? []) {
		recordsById.set(record.segmentId, record);
	}

	const frontierByTask = new Map<string, SegmentFrontierResumeTaskState>();

	for (const task of persistedState.tasks) {
		const taskSegmentIds = task.segmentIds ?? [];
		if (taskSegmentIds.length === 0) continue;

		const ownSegmentIds = new Set(taskSegmentIds);
		const dependencyBySegmentId = new Map<string, string[]>();
		const completedSegmentIds: string[] = [];
		const inFlightSegmentIds: string[] = [];
		const pendingSegmentIds: string[] = [];
		const failedSegmentIds: string[] = [];
		const bucketFor = {
			completed: completedSegmentIds,
			"in-flight": inFlightSegmentIds,
			failed: failedSegmentIds,
			pending: pendingSegmentIds,
		};

		taskSegmentIds.forEach((segmentId, position) => {
			const record = recordsById.get(segmentId);

			// Explicit dependencies win; otherwise assume a linear chain.
			const declaredDeps = record?.dependsOnSegmentIds ?? [];
			const impliedDeps = position > 0 ? [taskSegmentIds[position - 1]] : [];
			const inTaskDeps = (declaredDeps.length > 0 ? declaredDeps : impliedDeps).filter((dep) =>
				ownSegmentIds.has(dep),
			);
			dependencyBySegmentId.set(
				segmentId,
				Array.from(new Set(inTaskDeps)).sort((a, b) => a.localeCompare(b)),
			);

			bucketFor[classifySegmentStatus(record?.status)].push(segmentId);
		});

		// Next segment: a running one first, then a pending one whose
		// dependencies are all completed, then any pending one.
		const completedLookup = new Set(completedSegmentIds);
		const firstReadyPending = pendingSegmentIds.find((segmentId) =>
			(dependencyBySegmentId.get(segmentId) ?? []).every((dep) => completedLookup.has(dep)),
		);
		const nextSegmentId =
			inFlightSegmentIds[0] ?? firstReadyPending ?? pendingSegmentIds[0] ?? null;

		// Strict: "skipped" segments do not count as succeeded here.
		const allSucceeded = taskSegmentIds.every(
			(segmentId) => recordsById.get(segmentId)?.status === "succeeded",
		);

		// Only normalize the task when at least one segment is actually persisted.
		const hasConcreteSegmentRecord = taskSegmentIds.some((segmentId) =>
			recordsById.has(segmentId),
		);
		if (hasConcreteSegmentRecord) {
			const hasFailure = failedSegmentIds.length > 0;
			const hasInFlight = inFlightSegmentIds.length > 0;
			const hasPending = pendingSegmentIds.length > 0;

			if (!hasFailure && hasInFlight) {
				task.status = "running";
				task.activeSegmentId = inFlightSegmentIds[0];
			} else if (!hasFailure && hasPending) {
				task.status = "pending";
				task.activeSegmentId = nextSegmentId;
			} else if (!hasFailure && allSucceeded) {
				task.status = "succeeded";
				task.activeSegmentId = null;
			} else {
				// A failed segment, or a fully terminal frontier that is not all "succeeded".
				task.status = task.status === "skipped" ? "skipped" : "failed";
				task.activeSegmentId = null;
			}
		}

		frontierByTask.set(task.taskId, {
			taskId: task.taskId,
			completedSegmentIds,
			inFlightSegmentIds,
			pendingSegmentIds,
			failedSegmentIds,
			nextSegmentId,
			allSucceeded,
			dependencyBySegmentId,
		});
	}

	return frontierByTask;
}
623
+
624
/**
 * Reconcile persisted task states against live signals.
 *
 * For each persisted task, picks a resume action from three live signals:
 * whether its .DONE file exists, whether its lane session is alive, and
 * whether its worktree is present.
 *
 * Precedence rules (applied per task, first match wins):
 * 1. .DONE file found → "mark-complete" (even if the session is alive)
 * 2. Session alive → "reconnect"
 * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
 * 4. Worktree exists → "re-execute"
 * 5. Persisted status is "pending" → "pending" (re-queued, not failed)
 * 6. Anything else → "mark-failed"
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param aliveSessions - Set of lane session names currently alive
 * @param doneTaskIds - Set of task IDs whose .DONE files exist
 * @param existingWorktrees - Set of task IDs whose worktree exists (defaults to an empty set)
 * @returns Array of reconciled task states in persisted order
 */
export function reconcileTaskStates(
  persistedState: PersistedBatchState,
  aliveSessions: ReadonlySet<string>,
  doneTaskIds: ReadonlySet<string>,
  existingWorktrees: ReadonlySet<string> = new Set(),
): ReconciledTaskState[] {
  // Loop-invariant: built once instead of once per task.
  const terminalStatuses: readonly LaneTaskStatus[] = ["succeeded", "failed", "stalled", "skipped"];

  return persistedState.tasks.map((task): ReconciledTaskState => {
    const sessionAlive = aliveSessions.has(task.sessionName);
    const doneFileFound = doneTaskIds.has(task.taskId);
    const worktreeExists = existingWorktrees.has(task.taskId);

    // Every outcome reports the observed signals; only liveStatus/action differ.
    const resolve = (
      liveStatus: LaneTaskStatus,
      action: ReconciledTaskState["action"],
    ): ReconciledTaskState => ({
      taskId: task.taskId,
      persistedStatus: task.status,
      liveStatus,
      sessionAlive,
      doneFileFound,
      worktreeExists,
      action,
    });

    // Precedence 1: .DONE file found → task completed
    if (doneFileFound) return resolve("succeeded" as LaneTaskStatus, "mark-complete");

    // Precedence 2: Session alive → reconnect
    if (sessionAlive) return resolve("running" as LaneTaskStatus, "reconnect");

    // Precedence 3: Already terminal in persisted state → skip
    if (terminalStatuses.includes(task.status)) return resolve(task.status, "skip");

    // Precedence 4: Session dead + no .DONE + worktree exists → re-execute
    if (worktreeExists) return resolve("pending" as LaneTaskStatus, "re-execute");

    // Precedence 5: Pending task that was never started → remain pending.
    // The session is known to be dead and the worktree absent at this point,
    // so this covers both:
    //   (a) no session assigned at all (future-wave task never allocated)
    //   (b) a session assigned by a prior failed resume that never actually
    //       started the task (TP-037 bug #102b fix)
    // In both cases the task is re-queued for execution, not failed.
    if (task.status === "pending") return resolve("pending" as LaneTaskStatus, "pending");

    // Precedence 6: Dead session + not terminal + no .DONE + no worktree → failed
    // (Task was allocated and started but crashed without completing)
    return resolve("failed" as LaneTaskStatus, "mark-failed");
  });
}
741
+
742
/**
 * Look up the most recent merge status recorded for a wave (0-based index).
 *
 * The persisted merge results may hold several entries for one wave
 * (e.g., re-exec sentinel merges clamped to wave 0, or retry attempts).
 * Entries later in the array are newer, so the scan runs from the end and
 * stops at the first entry whose wave index matches.
 *
 * @param mergeResults - Persisted merge results array
 * @param waveIndex - 0-based wave index to look up
 * @returns The merge status ("succeeded" | "failed" | "partial") or null if no entry exists
 */
export function getMergeStatusForWave(
  mergeResults: ReadonlyArray<{ waveIndex: number; status: "succeeded" | "failed" | "partial" }>,
  waveIndex: number,
): "succeeded" | "failed" | "partial" | null {
  let cursor = mergeResults.length;
  while (cursor-- > 0) {
    const entry = mergeResults[cursor];
    if (entry.waveIndex === waveIndex) {
      return entry.status;
    }
  }
  return null;
}
766
+
767
/**
 * Expand the persisted wave plan with the continuation rounds that each
 * task's segment count requires.
 *
 * A task with more segments than scheduled wave appearances gets one extra
 * round per missing appearance. Extra rounds are grouped by the wave in which
 * the task last appears and inserted right after that wave, so tasks sharing
 * a last wave keep running together (`[A,B]`, then `[A]`, etc.). Tasks that
 * have segments but appear in no wave get their rounds appended at the end.
 *
 * @param persistedState - Batch state providing `wavePlan` and `tasks`
 * @returns A new wave plan; the persisted plan's arrays are copied, not mutated
 */
export function buildResumeRuntimeWavePlan(persistedState: PersistedBatchState): string[][] {
  // Turn "taskId → outstanding appearances" into rounds: each round lists, in
  // sorted order, every task that still has at least one appearance left.
  const toRounds = (missing: Map<string, number>): string[][] => {
    const outstanding = new Map(missing);
    const rounds: string[][] = [];
    for (;;) {
      const round = [...outstanding]
        .filter(([, left]) => left > 0)
        .map(([taskId]) => taskId)
        .sort((a, b) => a.localeCompare(b));
      if (round.length === 0) return rounds;
      rounds.push(round);
      for (const taskId of round) {
        outstanding.set(taskId, (outstanding.get(taskId) ?? 0) - 1);
      }
    }
  };

  const baseWavePlan = persistedState.wavePlan.map((wave) => [...wave]);

  // How often each task is already scheduled, and the last wave it appears in.
  const timesScheduled = new Map<string, number>();
  const lastWaveOf = new Map<string, number>();
  baseWavePlan.forEach((wave, waveIdx) => {
    for (const taskId of wave) {
      timesScheduled.set(taskId, (timesScheduled.get(taskId) ?? 0) + 1);
      lastWaveOf.set(taskId, waveIdx);
    }
  });

  // Group under-scheduled tasks by last wave; -1 marks tasks in no wave at all.
  const missingByLastWave = new Map<number, Map<string, number>>();
  for (const task of persistedState.tasks) {
    if (!Array.isArray(task.segmentIds) || task.segmentIds.length === 0) continue;
    const shortfall = task.segmentIds.length - (timesScheduled.get(task.taskId) ?? 0);
    if (shortfall <= 0) continue;
    const anchor = lastWaveOf.get(task.taskId) ?? -1;
    let group = missingByLastWave.get(anchor);
    if (!group) {
      group = new Map<string, number>();
      missingByLastWave.set(anchor, group);
    }
    group.set(task.taskId, shortfall);
  }

  // Emit every base wave followed directly by the rounds anchored to it.
  const runtimeWavePlan: string[][] = [];
  baseWavePlan.forEach((wave, waveIdx) => {
    runtimeWavePlan.push(wave);
    const anchored = missingByLastWave.get(waveIdx);
    if (anchored) runtimeWavePlan.push(...toRounds(anchored));
  });

  const dangling = missingByLastWave.get(-1);
  if (dangling) runtimeWavePlan.push(...toRounds(dangling));

  return runtimeWavePlan;
}
844
+
845
/**
 * Compute the resume point from reconciled task states and wave plan.
 *
 * Determines which wave to resume from by finding the first wave that
 * has any incomplete tasks. Skips fully completed waves only when
 * their merge also succeeded.
 *
 * Where a persisted segment record exists for a task's occurrence in a wave
 * (the n-th appearance of a task maps to its n-th segment ID), the segment's
 * status decides that occurrence; otherwise the task's reconciled action does.
 *
 * TP-037 (Bug #102): A wave where all tasks are terminal but the merge
 * is missing or failed is NOT skipped — it is flagged for merge retry
 * via `mergeRetryWaveIndexes`. The `resumeWaveIndex` is set to the
 * earliest such wave so the resume loop can process it.
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param reconciledTasks - Reconciled task states
 * @param wavePlan - Wave plan to walk; defaults to `persistedState.wavePlan`
 * @returns Resume point with wave index and categorized task IDs
 */
export function computeResumePoint(
  persistedState: PersistedBatchState,
  reconciledTasks: ReconciledTaskState[],
  wavePlan: string[][] = persistedState.wavePlan,
): ResumePoint {
  // Build lookup: taskId → reconciled state
  const reconciledMap = new Map<string, ReconciledTaskState>();
  for (const task of reconciledTasks) {
    reconciledMap.set(task.taskId, task);
  }

  // Build lookup: segmentId → persisted segment status
  const segmentStatusBySegmentId = new Map<string, PersistedSegmentRecord["status"]>();
  for (const segment of persistedState.segments ?? []) {
    segmentStatusBySegmentId.set(segment.segmentId, segment.status);
  }

  // Defensive: tolerate a state object without a tasks array.
  const persistedTasks = Array.isArray((persistedState as { tasks?: unknown }).tasks)
    ? persistedState.tasks
    : [];
  const segmentIdsByTaskId = new Map<string, string[]>();
  for (const task of persistedTasks) {
    if (task.segmentIds && task.segmentIds.length > 0) {
      segmentIdsByTaskId.set(task.taskId, task.segmentIds);
    }
  }

  // Build lookup: "<waveIdx>:<taskId>" → segment executed by that occurrence.
  const waveSegmentIdByTaskOccurrence = new Map<string, string>();
  const occurrenceByTaskId = new Map<string, number>();
  for (let waveIdx = 0; waveIdx < wavePlan.length; waveIdx++) {
    for (const taskId of wavePlan[waveIdx]) {
      const segmentIds = segmentIdsByTaskId.get(taskId);
      if (!segmentIds || segmentIds.length === 0) continue;
      const occurrence = occurrenceByTaskId.get(taskId) ?? 0;
      if (occurrence < segmentIds.length) {
        waveSegmentIdByTaskOccurrence.set(`${waveIdx}:${taskId}`, segmentIds[occurrence]);
      }
      occurrenceByTaskId.set(taskId, occurrence + 1);
    }
  }

  const isTerminal = (status: unknown): boolean =>
    status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";

  // Persisted segment record for a task's occurrence in a wave, or null when
  // none exists (the caller then falls back to the reconciled task state).
  const persistedSegmentFor = (
    waveIdx: number,
    taskId: string,
  ): { status: PersistedSegmentRecord["status"] | undefined } | null => {
    const segmentId = waveSegmentIdByTaskOccurrence.get(`${waveIdx}:${taskId}`);
    if (!segmentId || !segmentStatusBySegmentId.has(segmentId)) return null;
    return { status: segmentStatusBySegmentId.get(segmentId) };
  };

  // Categorize tasks
  const completedTaskIds: string[] = [];
  const failedTaskIds: string[] = [];
  const reconnectTaskIds: string[] = [];
  const reExecuteTaskIds: string[] = [];

  for (const task of reconciledTasks) {
    switch (task.action) {
      case "mark-complete":
        completedTaskIds.push(task.taskId);
        break;
      case "skip":
        if (task.liveStatus === "succeeded" || task.persistedStatus === "succeeded") {
          completedTaskIds.push(task.taskId);
        } else if (
          task.liveStatus === "failed" ||
          task.liveStatus === "stalled" ||
          task.persistedStatus === "failed" ||
          task.persistedStatus === "stalled"
        ) {
          failedTaskIds.push(task.taskId);
        }
        // persistedStatus === "skipped" → terminal but neither completed nor failed.
        // Not re-queued. Counted separately via batchState.skippedTasks (carried from persisted state).
        break;
      case "reconnect":
        reconnectTaskIds.push(task.taskId);
        break;
      case "re-execute":
        reExecuteTaskIds.push(task.taskId);
        break;
      case "mark-failed":
        failedTaskIds.push(task.taskId);
        break;
      case "pending":
        // Never-started tasks are not categorized here; the pending list is
        // derived per wave below, from the resume wave onward.
        break;
    }
  }

  // Find resume wave: first wave with any non-completed tasks OR missing/failed merge.
  // TP-037 (Bug #102): A wave where all tasks are terminal but the merge
  // hasn't succeeded is flagged for merge retry, not skipped.
  let resumeWaveIndex = wavePlan.length; // default: past end = all done
  const mergeRetryWaveIndexes: number[] = [];

  for (let i = 0; i < wavePlan.length; i++) {
    const waveTasks = wavePlan[i];
    const allDone = waveTasks.every((taskId) => {
      const segment = persistedSegmentFor(i, taskId);
      if (segment) return isTerminal(segment.status);

      const reconciled = reconciledMap.get(taskId);
      if (!reconciled) return false;
      // A task is "done" for wave-skip purposes if it's terminal:
      // mark-complete, mark-failed, or skip with any terminal status
      // (succeeded, failed, stalled, skipped)
      if (reconciled.action === "mark-complete" || reconciled.action === "mark-failed") {
        return true;
      }
      if (reconciled.action === "skip") {
        return isTerminal(reconciled.liveStatus ?? reconciled.persistedStatus);
      }
      return false;
    });

    if (!allDone) {
      // Only set resumeWaveIndex if not already set by a merge retry
      // (merge retry at an earlier wave takes precedence)
      if (resumeWaveIndex === wavePlan.length) {
        resumeWaveIndex = i;
      }
      break;
    }

    // TP-037 (Bug #102): All tasks are terminal — but did the merge succeed?
    // Only check merge status if the wave had any succeeded tasks (waves with
    // only failures/skips don't produce merges and can be safely skipped).
    const hasSucceededTasks = waveTasks.some((taskId) => {
      const segment = persistedSegmentFor(i, taskId);
      if (segment) return segment.status === "succeeded";

      const reconciled = reconciledMap.get(taskId);
      if (!reconciled) return false;
      if (reconciled.action === "mark-complete") return true;
      return (
        reconciled.action === "skip" &&
        (reconciled.liveStatus === "succeeded" || reconciled.persistedStatus === "succeeded")
      );
    });

    if (hasSucceededTasks && persistedState.mergeResults) {
      const mergeStatus = getMergeStatusForWave(persistedState.mergeResults, i);
      if (mergeStatus !== "succeeded") {
        // Merge missing or failed — flag for retry, don't skip past this wave
        mergeRetryWaveIndexes.push(i);
        if (resumeWaveIndex === wavePlan.length) {
          // This is the first wave needing attention — set resume point here
          resumeWaveIndex = i;
        }
      }
    }
  }

  // Determine pending tasks: tasks in resume wave and later that need execution
  const actualPendingTaskIds: string[] = [];
  for (let i = resumeWaveIndex; i < wavePlan.length; i++) {
    for (const taskId of wavePlan[i]) {
      const segment = persistedSegmentFor(i, taskId);
      if (segment) {
        if (segment.status === "running" || segment.status === "pending") {
          actualPendingTaskIds.push(taskId);
        }
        continue;
      }

      const reconciled = reconciledMap.get(taskId);
      if (!reconciled) {
        actualPendingTaskIds.push(taskId); // Unknown task — treat as pending
        continue;
      }

      // Actions are mutually exclusive, so at most one clause matches:
      //  - reconnect:  alive session, needs reconnection
      //  - re-execute: existing worktree, needs re-execution
      //  - pending:    never-started task, needs execution
      //  - skip with a persisted "pending" status: still needs execution
      const needsExecution =
        reconciled.action === "reconnect" ||
        reconciled.action === "re-execute" ||
        reconciled.action === "pending" ||
        (reconciled.action === "skip" && reconciled.persistedStatus === "pending");
      if (needsExecution) {
        actualPendingTaskIds.push(taskId);
      }
    }
  }

  return {
    resumeWaveIndex,
    completedTaskIds,
    pendingTaskIds: actualPendingTaskIds,
    failedTaskIds,
    reconnectTaskIds,
    reExecuteTaskIds,
    mergeRetryWaveIndexes,
  };
}
1066
+
1067
+ // ── Pre-Resume Diagnostics ───────────────────────────────────────────
1068
+
1069
/**
 * Outcome of one pre-resume diagnostic check.
 */
export interface DiagnosticCheckResult {
  /** Short identifier of the check (e.g. "state-coherence") */
  check: string;
  /** True when the check passed */
  passed: boolean;
  /** Human-readable explanation: why it failed, or what was confirmed */
  detail: string;
}
1080
+
1081
/**
 * Combined outcome of all pre-resume diagnostic checks.
 */
export interface PreResumeDiagnosticsResult {
  /** True when every check passed, i.e. resume may proceed */
  passed: boolean;
  /** Per-check results, in the order the checks were run */
  checks: DiagnosticCheckResult[];
  /** Operator-facing summary message */
  summary: string;
}
1092
+
1093
/**
 * Run the diagnostics that gate a force-resume.
 *
 * Checks, in the order they are recorded:
 * 1. **State coherence:** always recorded as passed — the caller has already
 *    loaded the state, so reaching this function means it was loadable.
 * 2. **Branch consistency:** for every repo root, `refs/heads/<orchBranch>`
 *    must resolve. Skipped entirely when the state has no orch branch.
 * 3. **Worktree health:** for every lane with a worktree path, the path must
 *    either be absent (passes) or contain a `.git` entry.
 *
 * Reads filesystem/git state but does not mutate anything.
 *
 * @param persistedState - Loaded batch state
 * @param repoRoot - Default repo root (cwd)
 * @param stateRoot - Root for state files (.pi/); not read by this function's body
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Diagnostics result with pass/fail and per-check details
 */
export function runPreResumeDiagnostics(
  persistedState: PersistedBatchState,
  repoRoot: string,
  stateRoot: string,
  workspaceConfig?: WorkspaceConfig | null,
): PreResumeDiagnosticsResult {
  // 1. State coherence — the state was loaded by the caller, so it is valid.
  const checks: DiagnosticCheckResult[] = [
    {
      check: "state-coherence",
      passed: true,
      detail: `Batch state loaded successfully (batchId: ${persistedState.batchId}, phase: ${persistedState.phase})`,
    },
  ];

  // 2. Branch consistency — the orch branch must exist in each repo.
  for (const root of collectRepoRoots(persistedState, repoRoot, workspaceConfig)) {
    const repoId = resolveRepoIdFromRoot(root, workspaceConfig);
    const label = repoId ? `repo:${repoId}` : "default-repo";
    if (!persistedState.orchBranch) continue;

    const branchFound = Boolean(
      runGit(["rev-parse", "--verify", `refs/heads/${persistedState.orchBranch}`], root).ok,
    );
    checks.push({
      check: `branch-consistency:${label}`,
      passed: branchFound,
      detail: branchFound
        ? `Orch branch "${persistedState.orchBranch}" exists in ${label}`
        : `Orch branch "${persistedState.orchBranch}" not found in ${label}. ` +
          `The branch may have been deleted or the repo is in an inconsistent state.`,
    });
  }

  // 3. Worktree health — inspect each persisted lane worktree.
  for (const lane of persistedState.lanes) {
    if (!lane.worktreePath) continue;
    const check = `worktree-health:lane-${lane.laneNumber}`;

    if (!existsSync(lane.worktreePath)) {
      // An absent worktree is fine — resume will re-create or skip it.
      checks.push({
        check,
        passed: true,
        detail: `Lane ${lane.laneNumber} worktree absent (will be re-created on resume)`,
      });
      continue;
    }

    // A usable git worktree has a .git file or directory at its root.
    const hasGitMarker = existsSync(join(lane.worktreePath, ".git"));
    checks.push({
      check,
      passed: hasGitMarker,
      detail: hasGitMarker
        ? `Lane ${lane.laneNumber} worktree exists and has valid .git marker`
        : `Lane ${lane.laneNumber} worktree exists at ${lane.worktreePath} but lacks .git marker (corrupted)`,
    });
  }

  const failed = checks.filter((c) => !c.passed);
  const passed = failed.length === 0;

  let summary: string;
  if (passed) {
    summary = `✅ Pre-resume diagnostics passed (${checks.length} checks)`;
  } else {
    const failureLines = failed.map((c) => ` • ${c.check}: ${c.detail}`).join("\n");
    summary =
      `❌ Pre-resume diagnostics failed (${failed.length}/${checks.length} checks failed):\n` +
      failureLines;
  }

  return { passed, checks, summary };
}
1190
+
1191
+ export async function resumeOrchBatch(
1192
+ orchConfig: OrchestratorConfig,
1193
+ runnerConfig: TaskRunnerConfig,
1194
+ cwd: string,
1195
+ batchState: OrchBatchRuntimeState,
1196
+ onNotify: (message: string, level: "info" | "warning" | "error") => void,
1197
+ onMonitorUpdate?: MonitorUpdateCallback,
1198
+ workspaceConfig?: WorkspaceConfig | null,
1199
+ workspaceRoot?: string,
1200
+ agentRoot?: string,
1201
+ force: boolean = false,
1202
+ onSupervisorAlert?: import("./types.ts").SupervisorAlertCallback | null,
1203
+ supervisorAutonomy: "interactive" | "supervised" | "autonomous" = "autonomous",
1204
+ /**
1205
+ * TP-187 (#538): Optional callback fired when a lane reaches a terminal
1206
+ * state during a resumed batch. Threaded through to executeWave so the
1207
+ * supervisor process keeps suppressing zombie alerts after resume too.
1208
+ */
1209
+ onLaneTerminated?: import("./types.ts").LaneTerminatedCallback | null,
1210
+ /**
1211
+ * TP-187 (#538): Optional callback fired when a lane is freshly
1212
+ * (re-)allocated during resume. The supervisor uses it to lift any
1213
+ * carried-over zombie-alert suppression.
1214
+ */
1215
+ onLaneRespawned?: ((laneNumber: number, agentId: string, batchId: string) => void) | null,
1216
+ ): Promise<void> {
1217
+ const repoRoot = cwd;
1218
+ // State files (.pi/batch-state.json, lane-state, etc.) belong in the workspace root,
1219
+ // which is where .pi/ config lives. In repo mode, stateRoot === repoRoot.
1220
+ const stateRoot = workspaceRoot ?? cwd;
1221
+
1222
+ // ── TP-076: Supervisor alert emission helper ─────────────────
1223
+ const emitAlert = (alert: import("./types.ts").SupervisorAlert): void => {
1224
+ if (onSupervisorAlert) {
1225
+ try {
1226
+ onSupervisorAlert(alert);
1227
+ } catch (err: unknown) {
1228
+ const msg = err instanceof Error ? err.message : String(err);
1229
+ execLog("resume", "unknown", `supervisor alert callback failed: ${msg}`, {
1230
+ alertCategory: alert.category,
1231
+ });
1232
+ }
1233
+ }
1234
+ };
1235
+
1236
+ // ── 1. Load persisted state ──────────────────────────────────
1237
+ let persistedState: PersistedBatchState | null;
1238
+ try {
1239
+ persistedState = loadBatchState(stateRoot);
1240
+ } catch (err: unknown) {
1241
+ if (err instanceof StateFileError) {
1242
+ onNotify(`❌ Cannot resume: ${err.message}`, "error");
1243
+ // ── TP-040 R006: Reset phase on pre-execution early return ──
1244
+ // The caller may have set batchState.phase = "launching" before
1245
+ // calling this function. Since we're returning without starting
1246
+ // any work, reset to "idle" so the batch isn't stuck.
1247
+ batchState.phase = "idle";
1248
+ return;
1249
+ }
1250
+ throw err;
1251
+ }
1252
+
1253
+ if (!persistedState) {
1254
+ if (!force) {
1255
+ onNotify(ORCH_MESSAGES.resumeNoState(), "error");
1256
+ // TP-040 R006: Reset phase on pre-execution early return
1257
+ batchState.phase = "idle";
1258
+ return;
1259
+ }
1260
+ // TP-187 (#539): On force-resume, attempt deterministic reconstruction
1261
+ // from .pi/runtime/<batchId>/ runtime artifacts (typically left intact
1262
+ // by `orch_abort()` even though `.pi/batch-state.json` is deleted).
1263
+ const reconstruction = reconstructBatchStateFromRuntime(stateRoot);
1264
+ if (!reconstruction.ok) {
1265
+ onNotify(ORCH_MESSAGES.resumeNoStateAfterAbort(reconstruction.error, null), "error");
1266
+ // TP-040 R006: Reset phase on pre-execution early return
1267
+ batchState.phase = "idle";
1268
+ return;
1269
+ }
1270
+ // Successful reconstruction: persist so the rest of resumeOrchBatch
1271
+ // proceeds with a normal on-disk batch-state.json picture.
1272
+ onNotify(
1273
+ ORCH_MESSAGES.resumeReconstructed(reconstruction.batchId, reconstruction.selectionNote),
1274
+ "warning",
1275
+ );
1276
+ try {
1277
+ saveBatchState(JSON.stringify(reconstruction.state, null, 2), stateRoot);
1278
+ } catch (err) {
1279
+ onNotify(
1280
+ ORCH_MESSAGES.resumeNoStateAfterAbort(
1281
+ `reconstructed state could not be persisted: ${err instanceof Error ? err.message : String(err)}`,
1282
+ reconstruction.batchId,
1283
+ ),
1284
+ "error",
1285
+ );
1286
+ // TP-040 R006: Reset phase on pre-execution early return
1287
+ batchState.phase = "idle";
1288
+ return;
1289
+ }
1290
+ persistedState = reconstruction.state;
1291
+ }
1292
+
1293
+ // ── 2. Check eligibility ─────────────────────────────────────
1294
+ const eligibility = checkResumeEligibility(persistedState, force);
1295
+ if (!eligibility.eligible) {
1296
+ onNotify(
1297
+ ORCH_MESSAGES.resumePhaseNotResumable(
1298
+ persistedState.batchId,
1299
+ persistedState.phase,
1300
+ eligibility.reason,
1301
+ ),
1302
+ "error",
1303
+ );
1304
+ // TP-040 R006: Reset phase on pre-execution early return
1305
+ batchState.phase = "idle";
1306
+ return;
1307
+ }
1308
+
1309
+ // ── 2b. Force-resume: pre-resume diagnostics & state mutation ──
1310
+ const isForceResume =
1311
+ force && (persistedState.phase === "stopped" || persistedState.phase === "failed");
1312
+ if (isForceResume) {
1313
+ onNotify(
1314
+ ORCH_MESSAGES.forceResumeStarting(persistedState.batchId, persistedState.phase),
1315
+ "warning",
1316
+ );
1317
+
1318
+ // Run pre-resume diagnostics before allowing force-resume
1319
+ const diagnostics = runPreResumeDiagnostics(persistedState, repoRoot, stateRoot, workspaceConfig);
1320
+ onNotify(diagnostics.summary, diagnostics.passed ? "info" : "error");
1321
+
1322
+ if (!diagnostics.passed) {
1323
+ onNotify(ORCH_MESSAGES.forceResumeDiagnosticsFailed(persistedState.batchId), "error");
1324
+ // TP-040 R006: Reset phase on pre-execution early return
1325
+ batchState.phase = "idle";
1326
+ return;
1327
+ }
1328
+
1329
+ // Record force intent in resilience state
1330
+ persistedState.resilience.resumeForced = true;
1331
+
1332
+ // Reset phase to paused so normal resume flow can proceed
1333
+ execLog(
1334
+ "resume",
1335
+ persistedState.batchId,
1336
+ `force-resume: phase ${persistedState.phase} → paused`,
1337
+ {
1338
+ diagnosticChecks: diagnostics.checks.length,
1339
+ diagnosticsPassed: diagnostics.passed,
1340
+ },
1341
+ );
1342
+ persistedState.phase = "paused";
1343
+ }
1344
+
1345
+ onNotify(ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase), "info");
1346
+
1347
+ const segmentFrontierByTask = reconstructSegmentFrontier(persistedState);
1348
+ if (segmentFrontierByTask.size > 0) {
1349
+ let completedSegments = 0;
1350
+ let inFlightSegments = 0;
1351
+ let pendingSegments = 0;
1352
+ for (const frontier of segmentFrontierByTask.values()) {
1353
+ completedSegments += frontier.completedSegmentIds.length;
1354
+ inFlightSegments += frontier.inFlightSegmentIds.length;
1355
+ pendingSegments += frontier.pendingSegmentIds.length;
1356
+ }
1357
+ execLog("resume", persistedState.batchId, `segment frontier reconstructed`, {
1358
+ tasks: segmentFrontierByTask.size,
1359
+ completedSegments,
1360
+ inFlightSegments,
1361
+ pendingSegments,
1362
+ });
1363
+ }
1364
+
1365
+ const runtimeWavePlan = buildResumeRuntimeWavePlan(persistedState);
1366
+ // TP-108/112: Runtime V2 backend selection for resumed batches.
1367
+ // MUST be computed before any backend-aware branch (section 3+).
1368
+ const resumeBackend: RuntimeBackend = selectRuntimeBackend(
1369
+ "all",
1370
+ runtimeWavePlan,
1371
+ workspaceConfig,
1372
+ ).backend;
1373
+ execLog("resume", batchState.batchId, `runtime backend for resumed execution: ${resumeBackend}`);
1374
+
1375
+ // ── 3. Discover live signals ─────────────────────────────────
1376
+ // TP-112/119: Runtime V2 session liveness check only.
1377
+ // Alive sessions are discovered from the process registry.
1378
+ const aliveSessions = new Set<string>();
1379
+ const registry = readRegistrySnapshot(stateRoot, persistedState.batchId);
1380
+ if (registry) {
1381
+ for (const manifest of Object.values(registry.agents)) {
1382
+ if (!isTerminalStatus(manifest.status) && isProcessAlive(manifest.pid)) {
1383
+ aliveSessions.add(manifest.agentId);
1384
+ // Also add lane session name (without role suffix) so reconciliation
1385
+ // matches persisted task.sessionName.
1386
+ // e.g., "orch-op-lane-1-worker" -> also add "orch-op-lane-1"
1387
+ const laneSession = manifest.agentId.replace(/-(worker|reviewer)$/, "");
1388
+ if (laneSession !== manifest.agentId) aliveSessions.add(laneSession);
1389
+ }
1390
+ }
1391
+ }
1392
+
1393
+ // Check .DONE files — check both original path and worktree-relative path.
1394
+ // TP-109: In workspace mode or V2 execution, .DONE is written in the worktree
1395
+ // at the resolved packet path, not the original discovery path. Resume must
1396
+ // check both locations for authoritative completion detection.
1397
+ const doneTaskIds = collectDoneTaskIdsForResume(persistedState, repoRoot, workspaceConfig);
1398
+
1399
+ // ── 3b. Detect existing worktrees ────────────────────────────
1400
+ const existingWorktreeTaskIds = new Set<string>();
1401
+ for (const task of persistedState.tasks) {
1402
+ const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
1403
+ if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
1404
+ existingWorktreeTaskIds.add(task.taskId);
1405
+ }
1406
+ }
1407
+
1408
+ // ── 4. Reconcile task states ─────────────────────────────────
1409
+ const reconciledTasks = reconcileTaskStates(
1410
+ persistedState,
1411
+ aliveSessions,
1412
+ doneTaskIds,
1413
+ existingWorktreeTaskIds,
1414
+ );
1415
+
1416
+ // ── 4b. Clear stale session allocation for tasks reconciled as pending ──
1417
+ // TP-037 (Bug #102b): Pending tasks that had a sessionName from a prior
1418
+ // failed resume but were never actually started need their allocation
1419
+ // metadata cleared so they can be freshly assigned to new lanes.
1420
+ // We also prune these tasks from persisted lane records so that
1421
+ // serializeBatchState() doesn't reintroduce stale sessionName via lane
1422
+ // fallback paths when outcome.sessionName is absent.
1423
+ const stalePendingTaskIds = new Set<string>();
1424
+ for (const reconciled of reconciledTasks) {
1425
+ if (reconciled.action === "pending") {
1426
+ const persistedTask = persistedState.tasks.find((t) => t.taskId === reconciled.taskId);
1427
+ if (persistedTask && persistedTask.sessionName) {
1428
+ execLog(
1429
+ "resume",
1430
+ persistedState.batchId,
1431
+ `clear-stale-session: ${reconciled.taskId} had stale session "${persistedTask.sessionName}" (lane ${persistedTask.laneNumber})`,
1432
+ );
1433
+ stalePendingTaskIds.add(reconciled.taskId);
1434
+ persistedTask.sessionName = "";
1435
+ persistedTask.laneNumber = 0;
1436
+ }
1437
+ }
1438
+ }
1439
+ // Prune stale-pending tasks from lane records so reconstructAllocatedLanes()
1440
+ // (and subsequent serializeBatchState()) won't map them back to the old lane.
1441
+ if (stalePendingTaskIds.size > 0) {
1442
+ for (const lane of persistedState.lanes) {
1443
+ lane.taskIds = lane.taskIds.filter((id) => !stalePendingTaskIds.has(id));
1444
+ }
1445
+ }
1446
+
1447
+ // ── 5. Compute resume point ──────────────────────────────────
1448
+ const resumePoint = computeResumePoint(persistedState, reconciledTasks, runtimeWavePlan);
1449
+ const completedTaskSet = new Set(resumePoint.completedTaskIds);
1450
+ const failedTaskSet = new Set(resumePoint.failedTaskIds);
1451
+ const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
1452
+ const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
1453
+
1454
+ onNotify(
1455
+ ORCH_MESSAGES.resumeReconciled(
1456
+ persistedState.batchId,
1457
+ resumePoint.completedTaskIds.length,
1458
+ resumePoint.pendingTaskIds.length,
1459
+ resumePoint.failedTaskIds.length,
1460
+ resumePoint.reconnectTaskIds.length,
1461
+ resumePoint.reExecuteTaskIds.length,
1462
+ ),
1463
+ "info",
1464
+ );
1465
+
1466
+ if (resumePoint.reconnectTaskIds.length > 0) {
1467
+ onNotify(ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length), "info");
1468
+ }
1469
+
1470
+ if (resumePoint.resumeWaveIndex > 0) {
1471
+ onNotify(ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex), "info");
1472
+ }
1473
+
1474
+ if (resumePoint.mergeRetryWaveIndexes.length > 0) {
1475
+ onNotify(
1476
+ `🔀 ${resumePoint.mergeRetryWaveIndexes.length} wave(s) need merge retry: ${resumePoint.mergeRetryWaveIndexes.map((i) => `W${i + 1}`).join(", ")}`,
1477
+ "warning",
1478
+ );
1479
+ }
1480
+
1481
+ // ── 6. Reconstruct runtime state ─────────────────────────────
1482
+
1483
+ // Guard: orchBranch must be present for routing. Persisted states from
1484
+ // pre-TP-022 runs may have orchBranch="" (TP-020 defaults).
1485
+ // Check BEFORE mutating batchState so phase/batchId remain idle on rejection,
1486
+ // allowing future /orch-resume or /orch-abort to proceed.
1487
+ if (!persistedState.orchBranch) {
1488
+ onNotify(
1489
+ `❌ Cannot resume batch ${persistedState.batchId}: persisted state has no orch branch. ` +
1490
+ `This batch was created before orch-branch routing was implemented. ` +
1491
+ `Use /orch-abort to clean up, then start a new batch.`,
1492
+ "error",
1493
+ );
1494
+ // TP-040 R006: Reset phase on pre-execution early return
1495
+ batchState.phase = "idle";
1496
+ return;
1497
+ }
1498
+
1499
+ batchState.phase = "executing";
1500
+ batchState.batchId = persistedState.batchId;
1501
+ batchState.baseBranch = persistedState.baseBranch || "";
1502
+ batchState.orchBranch = persistedState.orchBranch;
1503
+
1504
+ batchState.mode = persistedState.mode;
1505
+ batchState.startedAt = persistedState.startedAt;
1506
+ // Preserve pauseSignal if already set during "launching" phase (TP-040)
1507
+ if (!batchState.pauseSignal?.paused) batchState.pauseSignal = { paused: false };
1508
+ batchState.totalWaves = persistedState.totalWaves;
1509
+ // TP-166: Restore task-level wave metadata for correct display.
1510
+ // Normalize: fall back to totalWaves for pre-TP-166 state files.
1511
+ batchState.taskLevelWaveCount = persistedState.taskLevelWaveCount ?? persistedState.totalWaves;
1512
+ batchState.roundToTaskWave = persistedState.roundToTaskWave
1513
+ ? [...persistedState.roundToTaskWave]
1514
+ : undefined;
1515
+ batchState.totalTasks = persistedState.totalTasks;
1516
+ batchState.succeededTasks = resumePoint.completedTaskIds.length;
1517
+ batchState.failedTasks = resumePoint.failedTaskIds.length;
1518
+ batchState.skippedTasks = persistedState.skippedTasks;
1519
+ batchState.blockedTasks = persistedState.blockedTasks;
1520
+ batchState.blockedTaskIds = new Set(persistedState.blockedTaskIds);
1521
+ // Track persisted blocked IDs separately to avoid double-counting in wave loop.
1522
+ // Engine.ts counts blocked tasks per-wave when a wave is entered. If the prior
1523
+ // run paused before reaching a wave, tasks blocked for that wave are in
1524
+ // `blockedTaskIds` but NOT yet counted in `blockedTasks`. On resume, the
1525
+ // per-wave counting loop excludes `persistedBlockedTaskIds`, so those tasks
1526
+ // would never be counted. Fix: count persisted blocked tasks in future waves
1527
+ // (waves >= resumeWaveIndex) that were not yet counted.
1528
+ const persistedBlockedTaskIds = new Set(persistedState.blockedTaskIds);
1529
+
1530
+ // Count persisted-blocked tasks in unvisited waves (wave >= resumeWaveIndex).
1531
+ // These were added to blockedTaskIds in the prior run but their wave was never
1532
+ // entered, so they were never counted in blockedTasks.
1533
+ if (persistedBlockedTaskIds.size > 0) {
1534
+ let uncountedBlocked = 0;
1535
+ for (let wi = resumePoint.resumeWaveIndex; wi < runtimeWavePlan.length; wi++) {
1536
+ for (const taskId of runtimeWavePlan[wi]) {
1537
+ if (persistedBlockedTaskIds.has(taskId)) {
1538
+ uncountedBlocked++;
1539
+ }
1540
+ }
1541
+ }
1542
+ if (uncountedBlocked > 0) {
1543
+ batchState.blockedTasks += uncountedBlocked;
1544
+ execLog(
1545
+ "resume",
1546
+ persistedState.batchId,
1547
+ `blocked counter fix: ${uncountedBlocked} persisted-blocked task(s) in unvisited waves added to blockedTasks`,
1548
+ );
1549
+ }
1550
+ }
1551
+
1552
+ batchState.errors = [...persistedState.errors];
1553
+ batchState.endedAt = null;
1554
+ batchState.currentWaveIndex = resumePoint.resumeWaveIndex;
1555
+ batchState.waveResults = [];
1556
+
1557
+ // v3: Carry forward resilience and diagnostics from persisted state
1558
+ batchState.resilience = persistedState.resilience;
1559
+ batchState.diagnostics = persistedState.diagnostics;
1560
+ // v4: Carry forward segment records (including dynamically expanded segments)
1561
+ batchState.segments = [...(persistedState.segments ?? [])];
1562
+ // Carry forward unknown fields for roundtrip preservation
1563
+ if (persistedState._extraFields) {
1564
+ batchState._extraFields = persistedState._extraFields;
1565
+ }
1566
+
1567
+ // ── 6b. TP-169: Verify orch branch exists in all workspace repos ────
1568
+ // During the original batch start, the orch branch was created in every
1569
+ // workspace repo. On resume, we verify it still exists. If it's missing
1570
+ // in any repo (e.g., deleted by user, corrupted), re-create it from the
1571
+ // repo's current branch so that worktree creation doesn't silently fall
1572
+ // back to the base branch, bypassing orch branch isolation.
1573
+ if (workspaceConfig && batchState.orchBranch) {
1574
+ for (const [repoId, repoConf] of workspaceConfig.repos) {
1575
+ const rRoot = repoConf.path;
1576
+ const check = runGit(["rev-parse", "--verify", `refs/heads/${batchState.orchBranch}`], rRoot);
1577
+ if (!check.ok) {
1578
+ // Orch branch missing in this repo — re-create from current HEAD
1579
+ const repoBranch = getCurrentBranch(rRoot) || "HEAD";
1580
+ const createRes = runGit(["branch", batchState.orchBranch, repoBranch], rRoot);
1581
+ if (createRes.ok) {
1582
+ execLog("resume", batchState.batchId, `re-created missing orch branch in ${repoId}`, {
1583
+ orchBranch: batchState.orchBranch,
1584
+ base: repoBranch,
1585
+ });
1586
+ onNotify(
1587
+ `⚠️ Orch branch "${batchState.orchBranch}" was missing in repo "${repoId}" — re-created from ${repoBranch}`,
1588
+ "warning",
1589
+ );
1590
+ } else {
1591
+ const errMsg =
1592
+ `Failed to re-create orch branch "${batchState.orchBranch}" in repo "${repoId}": ${createRes.stderr}. ` +
1593
+ `Cannot resume without orch branch isolation.`;
1594
+ execLog("resume", batchState.batchId, errMsg, {
1595
+ orchBranch: batchState.orchBranch,
1596
+ error: createRes.stderr,
1597
+ });
1598
+ throw new Error(errMsg);
1599
+ }
1600
+ }
1601
+ }
1602
+ }
1603
+
1604
+ // ── 7. Re-run discovery for ParsedTask metadata ──────────────
1605
+ // We need fresh ParsedTask data (taskFolder, promptPath) for execution.
1606
+ // Use "all" to discover all areas.
1607
+ const discovery = runDiscovery("all", runnerConfig.task_areas, cwd, {
1608
+ refreshDependencies: false,
1609
+ dependencySource: orchConfig.dependencies.source,
1610
+ useDependencyCache: orchConfig.dependencies.cache,
1611
+ workspaceConfig: workspaceConfig ?? null,
1612
+ });
1613
+
1614
+ // Build dependency graph for skip-dependents policy
1615
+ const depGraph = buildDependencyGraph(discovery.pending, discovery.completed);
1616
+ batchState.dependencyGraph = depGraph;
1617
+
1618
+ // Rehydrate discovered tasks with persisted segment metadata.
1619
+ // Dynamically expanded segments may reference tasks that have segment-level
1620
+ // fields (segmentIds, activeSegmentId, packetRepoId, packetTaskPath) set
1621
+ // during the prior run. Merge these back into discovered ParsedTask records
1622
+ // so execution can resume with correct segment context.
1623
+ for (const persistedTask of persistedState.tasks) {
1624
+ const parsed = discovery.pending.get(persistedTask.taskId);
1625
+ if (!parsed) continue;
1626
+ if (persistedTask.segmentIds?.length) {
1627
+ parsed.segmentIds = persistedTask.segmentIds;
1628
+ }
1629
+ if (persistedTask.activeSegmentId !== undefined) {
1630
+ parsed.activeSegmentId = persistedTask.activeSegmentId;
1631
+ }
1632
+ if (persistedTask.packetRepoId) {
1633
+ parsed.packetRepoId = persistedTask.packetRepoId;
1634
+ }
1635
+ if (persistedTask.packetTaskPath) {
1636
+ parsed.packetTaskPath = persistedTask.packetTaskPath;
1637
+ }
1638
+ }
1639
+
1640
+ // ── 8. Handle alive sessions (reconnect) ─────────────────────
1641
+ // Tasks with alive sessions are not polled: their agents are terminated and
1642
+ // the lane is re-run via executeLaneV2, whose result decides the final status.
1643
+ const reconnectTasks = reconciledTasks.filter((t) => t.action === "reconnect");
1644
+ const reconnectFinalStatus = new Map<string, LaneTaskStatus>();
1645
+
1646
+ if (reconnectTasks.length > 0) {
1647
+ // Re-run each reconnect task's lane and record its final status
1648
+ for (const task of reconnectTasks) {
1649
+ const parsedTask = discovery.pending.get(task.taskId);
1650
+ if (!parsedTask) continue;
1651
+
1652
+ // Find the lane info from persisted state
1653
+ const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
1654
+ if (!laneRecord) continue;
1655
+
1656
+ // Build a minimal single-task AllocatedLane to pass to executeLaneV2
1657
+ const allocatedTask: AllocatedTask = {
1658
+ taskId: task.taskId,
1659
+ order: 0,
1660
+ task: parsedTask,
1661
+ estimatedMinutes: 0,
1662
+ };
1663
+ const lane: AllocatedLane = {
1664
+ laneNumber: laneRecord.laneNumber,
1665
+ laneId: laneRecord.laneId,
1666
+ laneSessionId: laneRecord.laneSessionId,
1667
+ worktreePath: laneRecord.worktreePath,
1668
+ branch: laneRecord.branch,
1669
+ tasks: [allocatedTask],
1670
+ strategy: "round-robin",
1671
+ estimatedLoad: 0,
1672
+ estimatedMinutes: 0,
1673
+ ...(laneRecord.repoId !== undefined ? { repoId: laneRecord.repoId } : {}),
1674
+ };
1675
+
1676
+ // Resolve per-lane repo root for workspace mode (v1/repo mode: falls back to repoRoot)
1677
+ const laneRepoRoot = resolveRepoRoot(laneRecord.repoId, repoRoot, workspaceConfig);
1678
+
1679
+ // TP-112: Runtime V2 reconnect.
1680
+ // Agent-host processes do not survive supervisor restart, so reconnect
1681
+ // uses terminate + rehydrate via executeLaneV2.
1682
+ execLog("resume", task.taskId, "V2 reconnect: terminate + rehydrate via lane-runner", {
1683
+ repoId: laneRecord.repoId ?? "(default)",
1684
+ });
1685
+ terminateAliveV2Agents(stateRoot, persistedState.batchId, laneRecord.laneSessionId);
1686
+ try {
1687
+ const laneResult = await executeLaneV2(
1688
+ lane,
1689
+ orchConfig,
1690
+ laneRepoRoot,
1691
+ batchState.pauseSignal,
1692
+ workspaceRoot,
1693
+ !!workspaceConfig,
1694
+ {
1695
+ ORCH_BATCH_ID: batchState.batchId,
1696
+ ...buildReviewerEnv(runnerConfig.reviewer),
1697
+ ...buildWorkerExcludeEnv(runnerConfig.workerExcludeExtensions),
1698
+ },
1699
+ emitAlert,
1700
+ );
1701
+ const taskResult = laneResult.tasks.find((t) => t.taskId === task.taskId);
1702
+ if (taskResult?.status === "succeeded") {
1703
+ reconnectFinalStatus.set(task.taskId, "succeeded");
1704
+ completedTaskSet.add(task.taskId);
1705
+ failedTaskSet.delete(task.taskId);
1706
+ reconnectTaskSet.delete(task.taskId);
1707
+ batchState.succeededTasks++;
1708
+ } else {
1709
+ reconnectFinalStatus.set(task.taskId, "failed");
1710
+ failedTaskSet.add(task.taskId);
1711
+ completedTaskSet.delete(task.taskId);
1712
+ reconnectTaskSet.delete(task.taskId);
1713
+ batchState.failedTasks++;
1714
+ }
1715
+ } catch (err: unknown) {
1716
+ reconnectFinalStatus.set(task.taskId, "failed");
1717
+ failedTaskSet.add(task.taskId);
1718
+ completedTaskSet.delete(task.taskId);
1719
+ reconnectTaskSet.delete(task.taskId);
1720
+ batchState.failedTasks++;
1721
+ execLog(
1722
+ "resume",
1723
+ task.taskId,
1724
+ `V2 reconnect error: ${err instanceof Error ? err.message : String(err)}`,
1725
+ );
1726
+ }
1727
+ }
1728
+ }
1729
+
1730
+ // ── 8b. Handle re-execute tasks (dead session + existing worktree) ──
1731
+ const reExecuteTasks = reconciledTasks.filter((t) => t.action === "re-execute");
1732
+ const reExecuteFinalStatus = new Map<string, LaneTaskStatus>();
1733
+ const reExecAllocatedLanes: AllocatedLane[] = [];
1734
+
1735
+ if (reExecuteTasks.length > 0) {
1736
+ onNotify(
1737
+ `🔄 Re-executing ${reExecuteTasks.length} interrupted task(s) in existing worktrees...`,
1738
+ "info",
1739
+ );
1740
+
1741
+ for (const task of reExecuteTasks) {
1742
+ const parsedTask = discovery.pending.get(task.taskId);
1743
+ if (!parsedTask) continue;
1744
+
1745
+ const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
1746
+ if (!laneRecord) continue;
1747
+
1748
+ const allocatedTask: AllocatedTask = {
1749
+ taskId: task.taskId,
1750
+ order: 0,
1751
+ task: parsedTask,
1752
+ estimatedMinutes: 0,
1753
+ };
1754
+ const lane: AllocatedLane = {
1755
+ laneNumber: laneRecord.laneNumber,
1756
+ laneId: laneRecord.laneId,
1757
+ laneSessionId: laneRecord.laneSessionId,
1758
+ worktreePath: laneRecord.worktreePath,
1759
+ branch: laneRecord.branch,
1760
+ tasks: [allocatedTask],
1761
+ strategy: "round-robin",
1762
+ estimatedLoad: 0,
1763
+ estimatedMinutes: 0,
1764
+ ...(laneRecord.repoId !== undefined ? { repoId: laneRecord.repoId } : {}),
1765
+ };
1766
+
1767
+ // Resolve per-lane repo root for workspace mode (v1/repo mode: falls back to repoRoot)
1768
+ const reExecRepoRoot = resolveRepoRoot(laneRecord.repoId, repoRoot, workspaceConfig);
1769
+
1770
+ execLog("resume", task.taskId, "re-executing interrupted task in existing worktree", {
1771
+ session: laneRecord.laneSessionId,
1772
+ worktree: laneRecord.worktreePath,
1773
+ repoId: laneRecord.repoId ?? "(default)",
1774
+ });
1775
+
1776
+ try {
1777
+ // TP-112: Runtime V2 re-execution.
1778
+ terminateAliveV2Agents(stateRoot, batchState.batchId, laneRecord.laneSessionId);
1779
+ const laneResult = await executeLaneV2(
1780
+ lane,
1781
+ orchConfig,
1782
+ reExecRepoRoot,
1783
+ batchState.pauseSignal,
1784
+ workspaceRoot,
1785
+ !!workspaceConfig,
1786
+ {
1787
+ ORCH_BATCH_ID: batchState.batchId,
1788
+ ...buildReviewerEnv(runnerConfig.reviewer),
1789
+ ...buildWorkerExcludeEnv(runnerConfig.workerExcludeExtensions),
1790
+ },
1791
+ emitAlert,
1792
+ );
1793
+ const taskResult = laneResult.tasks.find((t) => t.taskId === task.taskId);
1794
+ const pollResult: { status: LaneTaskStatus; exitReason: string; doneFileFound: boolean } = {
1795
+ status: taskResult?.status ?? "failed",
1796
+ exitReason: taskResult?.exitReason ?? "V2 re-execution completed",
1797
+ doneFileFound: taskResult?.doneFileFound ?? false,
1798
+ };
1799
+
1800
+ if (pollResult.status === "succeeded") {
1801
+ reExecuteFinalStatus.set(task.taskId, "succeeded");
1802
+ completedTaskSet.add(task.taskId);
1803
+ failedTaskSet.delete(task.taskId);
1804
+ reExecuteTaskSet.delete(task.taskId);
1805
+ batchState.succeededTasks++;
1806
+ reExecAllocatedLanes.push(lane);
1807
+ execLog("resume", task.taskId, "re-executed task succeeded");
1808
+ } else {
1809
+ reExecuteFinalStatus.set(task.taskId, "failed");
1810
+ failedTaskSet.add(task.taskId);
1811
+ completedTaskSet.delete(task.taskId);
1812
+ reExecuteTaskSet.delete(task.taskId);
1813
+ batchState.failedTasks++;
1814
+ execLog(
1815
+ "resume",
1816
+ task.taskId,
1817
+ `re-executed task ${pollResult.status}: ${pollResult.exitReason}`,
1818
+ );
1819
+ }
1820
+ } catch (err: unknown) {
1821
+ reExecuteFinalStatus.set(task.taskId, "failed");
1822
+ failedTaskSet.add(task.taskId);
1823
+ completedTaskSet.delete(task.taskId);
1824
+ reExecuteTaskSet.delete(task.taskId);
1825
+ batchState.failedTasks++;
1826
+ const msg = err instanceof Error ? err.message : String(err);
1827
+ execLog("resume", task.taskId, `re-execution error: ${msg}`);
1828
+ }
1829
+ }
1830
+ }
1831
+
1832
+ // ── 8c. Merge re-executed lane branches before cleanup ───────
1833
+ // Re-executed tasks completed outside the normal wave loop, so their
1834
+ // branches would not be merged by step 10. Merge them now.
1835
+ if (reExecAllocatedLanes.length > 0) {
1836
+ const succeededReExecTaskIds = [...reExecuteFinalStatus.entries()]
1837
+ .filter(([_, status]) => status === "succeeded")
1838
+ .map(([taskId]) => taskId);
1839
+
1840
+ if (succeededReExecTaskIds.length > 0) {
1841
+ onNotify(`🔀 Merging ${reExecAllocatedLanes.length} re-executed lane branch(es)...`, "info");
1842
+
1843
+ // Build synthetic WaveExecutionResult for mergeWaveByRepo()
1844
+ const syntheticLaneResults: LaneExecutionResult[] = reExecAllocatedLanes.map((lane) => ({
1845
+ laneNumber: lane.laneNumber,
1846
+ laneId: lane.laneId,
1847
+ tasks: lane.tasks.map((t) => ({
1848
+ taskId: t.taskId,
1849
+ status: "succeeded" as LaneTaskStatus,
1850
+ startTime: Date.now(),
1851
+ endTime: Date.now(),
1852
+ exitReason: "Re-executed task completed successfully",
1853
+ sessionName: lane.laneSessionId,
1854
+ doneFileFound: true,
1855
+ laneNumber: lane.laneNumber,
1856
+ })),
1857
+ overallStatus: "succeeded" as const,
1858
+ startTime: Date.now(),
1859
+ endTime: Date.now(),
1860
+ }));
1861
+
1862
+ // Use waveIndex -1 as a sentinel for "pre-wave-loop re-exec merge".
1863
+ // mergeWaveByRepo expects 1-indexed waveIndex; persistence normalizes
1864
+ // to 0-based via `mr.waveIndex - 1`. By passing -1 here:
1865
+ // - mergeWaveByRepo logs it as "W-1" (harmless)
1866
+ // - persistence normalizes to `Math.max(0, -1 - 1)` = 0 (valid)
1867
+ // - semantically distinguishes re-exec merges from wave 1 merges
1868
+ const RE_EXEC_WAVE_INDEX = -1;
1869
+
1870
+ const syntheticWaveResult: WaveExecutionResult = {
1871
+ waveIndex: RE_EXEC_WAVE_INDEX,
1872
+ startedAt: Date.now(),
1873
+ endedAt: Date.now(),
1874
+ laneResults: syntheticLaneResults,
1875
+ policyApplied: orchConfig.failure.on_task_failure,
1876
+ stoppedEarly: false,
1877
+ failedTaskIds: [],
1878
+ skippedTaskIds: [],
1879
+ succeededTaskIds: succeededReExecTaskIds,
1880
+ blockedTaskIds: [],
1881
+ laneCount: reExecAllocatedLanes.length,
1882
+ overallStatus: "succeeded",
1883
+ finalMonitorState: null,
1884
+ allocatedLanes: reExecAllocatedLanes,
1885
+ };
1886
+
1887
+ const reExecMergeResult = await mergeWaveByRepo(
1888
+ reExecAllocatedLanes,
1889
+ syntheticWaveResult,
1890
+ RE_EXEC_WAVE_INDEX,
1891
+ orchConfig,
1892
+ repoRoot,
1893
+ batchState.batchId,
1894
+ batchState.orchBranch,
1895
+ workspaceConfig,
1896
+ stateRoot,
1897
+ agentRoot,
1898
+ runnerConfig.testing_commands,
1899
+ undefined, // healthMonitor
1900
+ undefined, // forceMixedOutcome
1901
+ resumeBackend,
1902
+ );
1903
+
1904
+ if (reExecMergeResult.status === "succeeded") {
1905
+ onNotify(
1906
+ `✅ Re-executed branch merge complete: ${reExecMergeResult.laneResults.length} lane(s) merged`,
1907
+ "info",
1908
+ );
1909
+
1910
+ // Clean up merged branches (resolve per-lane repo root for workspace mode)
1911
+ // TP-032 R006-3: Exclude verification_new_failure lanes from branch cleanup
1912
+ for (const lr of reExecMergeResult.laneResults) {
1913
+ if (
1914
+ !lr.error &&
1915
+ (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
1916
+ ) {
1917
+ const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
1918
+ deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
1919
+ }
1920
+ }
1921
+ } else {
1922
+ onNotify(
1923
+ `⚠️ Re-executed branch merge ${reExecMergeResult.status}: ${reExecMergeResult.failureReason || "unknown"}`,
1924
+ "warning",
1925
+ );
1926
+ }
1927
+
1928
+ batchState.mergeResults.push(reExecMergeResult);
1929
+ }
1930
+ }
1931
+
1932
+ // ── 9. Persist state after reconciliation ────────────────────
1933
+ // Track state for persistence
1934
+ const wavePlan = runtimeWavePlan;
1935
+ persistedState.wavePlan = wavePlan;
1936
+ if (batchState.totalWaves < wavePlan.length) {
1937
+ batchState.totalWaves = wavePlan.length;
1938
+ }
1939
+ const allTaskOutcomes: LaneTaskOutcome[] = [];
1940
+
1941
+ // Initialize latestAllocatedLanes from persisted lane records so that
1942
+ // early persistence calls (before the first resumed wave) retain lane
1943
+ // records with repo attribution (laneNumber, laneId, branch, repoId).
1944
+ // Without this, the `resume-reconciliation` checkpoint would serialize
1945
+ // empty lanes[], losing all lane context until a new wave allocates.
1946
+ let latestAllocatedLanes: AllocatedLane[] = reconstructAllocatedLanes(
1947
+ persistedState.lanes,
1948
+ persistedState.tasks,
1949
+ );
1950
+
1951
+ // Track all repo roots encountered during execution (persisted + newly allocated).
1952
+ // Used by inter-wave reset and terminal cleanup to cover repos introduced
1953
+ // after resume starts (not present in persisted lanes).
1954
+ // Initialized from collectRepoRoots() helper for parity with other callers.
1955
+ const encounteredRepoRoots = new Set(collectRepoRoots(persistedState, repoRoot, workspaceConfig));
1956
+
1957
+ // Build outcomes from reconciled tasks
1958
+ for (const task of reconciledTasks) {
1959
+ const persistedTask = persistedState.tasks.find((t) => t.taskId === task.taskId);
1960
+ const reconnectStatus = reconnectFinalStatus.get(task.taskId);
1961
+ const reExecuteStatus = reExecuteFinalStatus.get(task.taskId);
1962
+ const status =
1963
+ task.action === "reconnect"
1964
+ ? reconnectStatus || "running"
1965
+ : task.action === "re-execute"
1966
+ ? reExecuteStatus || "pending"
1967
+ : task.liveStatus;
1968
+ const isTerminal =
1969
+ status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";
1970
+ allTaskOutcomes.push({
1971
+ taskId: task.taskId,
1972
+ status,
1973
+ startTime: persistedTask?.startedAt ?? null,
1974
+ endTime: isTerminal ? Date.now() : null,
1975
+ exitReason:
1976
+ task.action === "mark-complete"
1977
+ ? ".DONE file found on resume"
1978
+ : task.action === "mark-failed"
1979
+ ? "Session dead, no .DONE file, no worktree on resume"
1980
+ : task.action === "reconnect"
1981
+ ? status === "succeeded"
1982
+ ? "Reconnected task completed"
1983
+ : status === "failed"
1984
+ ? "Reconnected task failed"
1985
+ : "Reconnected to alive session"
1986
+ : task.action === "re-execute"
1987
+ ? status === "succeeded"
1988
+ ? "Re-executed task completed"
1989
+ : status === "failed"
1990
+ ? "Re-executed task failed"
1991
+ : "Re-executing in existing worktree"
1992
+ : (persistedTask?.exitReason ?? ""),
1993
+ sessionName: persistedTask?.sessionName ?? "",
1994
+ doneFileFound: status === "succeeded" ? true : task.doneFileFound,
1995
+ laneNumber: persistedTask?.laneNumber,
1996
+ // Carry forward partial progress from persisted state (TP-028)
1997
+ partialProgressCommits: persistedTask?.partialProgressCommits,
1998
+ partialProgressBranch: persistedTask?.partialProgressBranch,
1999
+ // v3: Carry forward exit diagnostic from persisted state (TP-030)
2000
+ exitDiagnostic: persistedTask?.exitDiagnostic,
2001
+ });
2002
+ }
2003
+
2004
+ // ── 9b. Seed blocked dependents from reconciled failures ─────
2005
+ // Under skip-dependents policy, failures discovered during reconciliation
2006
+ // (mark-failed) or resolved during reconnect/re-execute must propagate
2007
+ // to their transitive dependents BEFORE the wave loop begins.
2008
+ if (orchConfig.failure.on_task_failure === "skip-dependents" && failedTaskSet.size > 0) {
2009
+ const reconciledBlocked = computeTransitiveDependents(failedTaskSet, depGraph);
2010
+ for (const taskId of reconciledBlocked) {
2011
+ batchState.blockedTaskIds.add(taskId);
2012
+ }
2013
+ if (reconciledBlocked.size > 0) {
2014
+ execLog(
2015
+ "resume",
2016
+ batchState.batchId,
2017
+ `skip-dependents: ${reconciledBlocked.size} task(s) blocked from reconciled failures`,
2018
+ {
2019
+ blocked: [...reconciledBlocked].sort().join(","),
2020
+ sources: [...failedTaskSet].sort().join(","),
2021
+ },
2022
+ );
2023
+ }
2024
+ }
2025
+
2026
+ persistRuntimeState(
2027
+ "resume-reconciliation",
2028
+ batchState,
2029
+ wavePlan,
2030
+ latestAllocatedLanes,
2031
+ allTaskOutcomes,
2032
+ discovery ?? null,
2033
+ stateRoot,
2034
+ );
2035
+
2036
+ // ── 10. Continue wave execution ──────────────────────────────
2037
+ // We need to execute remaining waves starting from resumeWaveIndex.
2038
+ // For waves where some tasks are already done, we filter them out.
2039
+
2040
+ let preserveWorktreesForResume = false;
2041
+ const persistedStatusByTaskId = new Map(
2042
+ persistedState.tasks.map((task) => [task.taskId, task.status] as const),
2043
+ );
2044
+
2045
+ // TP-166: Use task-level wave metadata for correct display.
2046
+ const roundToTaskWave = batchState.roundToTaskWave;
2047
+ const taskLevelWaveCount = batchState.taskLevelWaveCount;
2048
+
2049
+ for (let waveIdx = resumePoint.resumeWaveIndex; waveIdx < wavePlan.length; waveIdx++) {
2050
+ // Check pause signal
2051
+ if (batchState.pauseSignal.paused) {
2052
+ batchState.phase = "paused";
2053
+ persistRuntimeState(
2054
+ "pause-before-wave",
2055
+ batchState,
2056
+ wavePlan,
2057
+ latestAllocatedLanes,
2058
+ allTaskOutcomes,
2059
+ discovery,
2060
+ stateRoot,
2061
+ );
2062
+ const { displayWave: pauseWave } = resolveDisplayWaveNumber(
2063
+ waveIdx,
2064
+ roundToTaskWave,
2065
+ taskLevelWaveCount,
2066
+ );
2067
+ onNotify(`⏸️ Batch paused before wave ${pauseWave}.`, "warning");
2068
+ break;
2069
+ }
2070
+
2071
+ batchState.currentWaveIndex = waveIdx;
2072
+ persistRuntimeState(
2073
+ "wave-index-change",
2074
+ batchState,
2075
+ wavePlan,
2076
+ latestAllocatedLanes,
2077
+ allTaskOutcomes,
2078
+ discovery,
2079
+ stateRoot,
2080
+ );
2081
+
2082
+ // Get wave tasks, filtering out completed/failed/skipped/blocked ones.
2083
+ // Persisted "skipped" tasks are terminal and must never be re-executed.
2084
+ let waveTasks = wavePlan[waveIdx].filter(
2085
+ (taskId) =>
2086
+ !completedTaskSet.has(taskId) &&
2087
+ !failedTaskSet.has(taskId) &&
2088
+ persistedStatusByTaskId.get(taskId) !== "skipped" &&
2089
+ !batchState.blockedTaskIds.has(taskId),
2090
+ );
2091
+
2092
+ // Also filter tasks where discovery doesn't have them as pending
2093
+ waveTasks = waveTasks.filter((taskId) => discovery.pending.has(taskId));
2094
+
2095
+ // Count only newly blocked tasks (not already persisted) to avoid double-counting.
2096
+ // persistedState.blockedTaskIds were already counted in persistedState.blockedTasks
2097
+ // which initialized batchState.blockedTasks.
2098
+ const blockedInWave = wavePlan[waveIdx].filter(
2099
+ (taskId) => batchState.blockedTaskIds.has(taskId) && !persistedBlockedTaskIds.has(taskId),
2100
+ );
2101
+ if (blockedInWave.length > 0) {
2102
+ batchState.blockedTasks += blockedInWave.length;
2103
+ }
2104
+
2105
+ if (waveTasks.length === 0) {
2106
+ // TP-037 Bug #102: Check if this wave needs merge retry.
2107
+ // All tasks are terminal but the merge may have failed/been interrupted.
2108
+ if (resumePoint.mergeRetryWaveIndexes.includes(waveIdx)) {
2109
+ execLog(
2110
+ "resume",
2111
+ batchState.batchId,
2112
+ `wave ${waveIdx + 1}: all tasks done but merge needs retry`,
2113
+ );
2114
+ onNotify(
2115
+ `🔀 Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave}: retrying merge (tasks already complete, merge was missing/failed)`,
2116
+ "info",
2117
+ );
2118
+
2119
+ // Reconstruct lanes for this wave from persisted state
2120
+ const waveTaskIds = new Set(wavePlan[waveIdx]);
2121
+ const waveLaneRecords = persistedState.lanes.filter((lane) =>
2122
+ lane.taskIds.some((tid) => waveTaskIds.has(tid)),
2123
+ );
2124
+ const mergeRetryLanes = reconstructAllocatedLanes(waveLaneRecords, persistedState.tasks);
2125
+
2126
+ // Build synthetic WaveExecutionResult from persisted terminal task states.
2127
+ // Crucial for orch_force_merge: tasks intentionally marked "skipped" must
2128
+ // remain skipped here (not failed), otherwise mixed-outcome detection would
2129
+ // trigger again and block the forced merge recovery path.
2130
+ const succeededTaskIds = wavePlan[waveIdx].filter((taskId) => completedTaskSet.has(taskId));
2131
+ const skippedTaskIds = wavePlan[waveIdx].filter(
2132
+ (taskId) => persistedStatusByTaskId.get(taskId) === "skipped",
2133
+ );
2134
+ const failedTaskIds = wavePlan[waveIdx].filter((taskId) => {
2135
+ const status = persistedStatusByTaskId.get(taskId);
2136
+ return status === "failed" || status === "stalled";
2137
+ });
2138
+
2139
+ const syntheticLaneResults: LaneExecutionResult[] = mergeRetryLanes.map((lane) => {
2140
+ const laneTasks = lane.tasks.map((t) => {
2141
+ const persistedStatus = persistedStatusByTaskId.get(t.taskId);
2142
+ let status: LaneTaskStatus;
2143
+ if (completedTaskSet.has(t.taskId) || persistedStatus === "succeeded") {
2144
+ status = "succeeded";
2145
+ } else if (persistedStatus === "skipped") {
2146
+ status = "skipped";
2147
+ } else if (persistedStatus === "failed") {
2148
+ status = "failed";
2149
+ } else if (persistedStatus === "stalled") {
2150
+ status = "stalled";
2151
+ } else {
2152
+ status = "failed";
2153
+ }
2154
+
2155
+ return {
2156
+ taskId: t.taskId,
2157
+ status,
2158
+ startTime: Date.now(),
2159
+ endTime: Date.now(),
2160
+ exitReason:
2161
+ status === "succeeded"
2162
+ ? "Task completed (merge retry)"
2163
+ : status === "skipped"
2164
+ ? "Task skipped (merge retry)"
2165
+ : status === "stalled"
2166
+ ? "Task stalled (merge retry)"
2167
+ : "Task failed (merge retry)",
2168
+ sessionName: lane.laneSessionId,
2169
+ doneFileFound: status === "succeeded",
2170
+ laneNumber: lane.laneNumber,
2171
+ };
2172
+ });
2173
+
2174
+ const laneHasHardFailure = laneTasks.some(
2175
+ (t) => t.status === "failed" || t.status === "stalled",
2176
+ );
2177
+ const laneHasSucceeded = laneTasks.some((t) => t.status === "succeeded");
2178
+ const overallStatus = laneHasHardFailure
2179
+ ? laneHasSucceeded
2180
+ ? "partial"
2181
+ : "failed"
2182
+ : "succeeded";
2183
+
2184
+ return {
2185
+ laneNumber: lane.laneNumber,
2186
+ laneId: lane.laneId,
2187
+ tasks: laneTasks,
2188
+ overallStatus,
2189
+ startTime: Date.now(),
2190
+ endTime: Date.now(),
2191
+ };
2192
+ });
2193
+
2194
+ const syntheticWaveResult: WaveExecutionResult = {
2195
+ waveIndex: waveIdx + 1,
2196
+ startedAt: Date.now(),
2197
+ endedAt: Date.now(),
2198
+ laneResults: syntheticLaneResults,
2199
+ policyApplied: orchConfig.failure.on_task_failure,
2200
+ stoppedEarly: false,
2201
+ failedTaskIds,
2202
+ skippedTaskIds,
2203
+ succeededTaskIds,
2204
+ blockedTaskIds: [],
2205
+ laneCount: mergeRetryLanes.length,
2206
+ overallStatus: "succeeded",
2207
+ finalMonitorState: null,
2208
+ allocatedLanes: mergeRetryLanes,
2209
+ };
2210
+
2211
+ batchState.phase = "merging";
2212
+ persistRuntimeState(
2213
+ "merge-retry-start",
2214
+ batchState,
2215
+ wavePlan,
2216
+ latestAllocatedLanes,
2217
+ allTaskOutcomes,
2218
+ discovery,
2219
+ stateRoot,
2220
+ );
2221
+
2222
+ const mergeRetryResult = await mergeWaveByRepo(
2223
+ mergeRetryLanes,
2224
+ syntheticWaveResult,
2225
+ waveIdx + 1,
2226
+ orchConfig,
2227
+ repoRoot,
2228
+ batchState.batchId,
2229
+ batchState.orchBranch,
2230
+ workspaceConfig,
2231
+ stateRoot,
2232
+ agentRoot,
2233
+ runnerConfig.testing_commands,
2234
+ undefined, // healthMonitor
2235
+ undefined, // forceMixedOutcome
2236
+ resumeBackend,
2237
+ );
2238
+ batchState.mergeResults.push(mergeRetryResult);
2239
+
2240
+ if (mergeRetryResult.status === "succeeded") {
2241
+ onNotify(
2242
+ `✅ Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave} merge retry succeeded`,
2243
+ "info",
2244
+ );
2245
+ // Clean up merged branches
2246
+ for (const lr of mergeRetryResult.laneResults) {
2247
+ if (
2248
+ !lr.error &&
2249
+ (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
2250
+ ) {
2251
+ const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
2252
+ deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
2253
+ }
2254
+ }
2255
+ } else {
2256
+ onNotify(
2257
+ `⚠️ Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave} merge retry ${mergeRetryResult.status}: ${mergeRetryResult.failureReason || "unknown"}`,
2258
+ "warning",
2259
+ );
2260
+ // Apply merge failure policy (same as normal wave merge failure)
2261
+ const policyResult = computeMergeFailurePolicy(mergeRetryResult, waveIdx, orchConfig);
2262
+ execLog(
2263
+ "batch",
2264
+ batchState.batchId,
2265
+ `merge retry failure — applying ${policyResult.policy} policy`,
2266
+ policyResult.logDetails,
2267
+ );
2268
+ batchState.phase = policyResult.targetPhase;
2269
+ batchState.errors.push(policyResult.errorMessage);
2270
+ persistRuntimeState(
2271
+ policyResult.persistTrigger,
2272
+ batchState,
2273
+ wavePlan,
2274
+ latestAllocatedLanes,
2275
+ allTaskOutcomes,
2276
+ discovery,
2277
+ stateRoot,
2278
+ );
2279
+ onNotify(policyResult.notifyMessage, policyResult.notifyLevel);
2280
+ preserveWorktreesForResume = true;
2281
+ break;
2282
+ }
2283
+
2284
+ batchState.phase = "executing";
2285
+ persistRuntimeState(
2286
+ "merge-retry-complete",
2287
+ batchState,
2288
+ wavePlan,
2289
+ latestAllocatedLanes,
2290
+ allTaskOutcomes,
2291
+ discovery,
2292
+ stateRoot,
2293
+ );
2294
+ } else {
2295
+ execLog(
2296
+ "resume",
2297
+ batchState.batchId,
2298
+ `wave ${waveIdx + 1}: no tasks to execute (all completed/blocked)`,
2299
+ );
2300
+ }
2301
+ continue;
2302
+ }
2303
+
2304
+ {
2305
+ const { displayWave, displayTotal } = resolveDisplayWaveNumber(
2306
+ waveIdx,
2307
+ roundToTaskWave,
2308
+ taskLevelWaveCount,
2309
+ );
2310
+ onNotify(
2311
+ ORCH_MESSAGES.orchWaveStart(
2312
+ displayWave,
2313
+ displayTotal,
2314
+ waveTasks.length,
2315
+ Math.min(waveTasks.length, orchConfig.orchestrator.max_lanes),
2316
+ ),
2317
+ "info",
2318
+ );
2319
+ }
2320
+
2321
+ const handleResumeMonitorUpdate: MonitorUpdateCallback = (monitorState) => { // Monitor callback handed to executeWave below: sync outcomes, persist on change, then forward the snapshot.
2322
+ const changed = syncTaskOutcomesFromMonitor(monitorState, allTaskOutcomes); // result is used as a "something changed" flag (presumably set when allTaskOutcomes was updated — confirm in helper)
2323
+ if (changed) { // skip the state write when the sync reported no change
2324
+ persistRuntimeState( // checkpoint the runtime state under the "task-transition" trigger
2325
+ "task-transition",
2326
+ batchState,
2327
+ wavePlan,
2328
+ latestAllocatedLanes,
2329
+ allTaskOutcomes,
2330
+ discovery,
2331
+ stateRoot,
2332
+ );
2333
+ }
2334
+ onMonitorUpdate?.(monitorState); // always forward the snapshot; the outer listener is optional, hence `?.`
2335
+ };
2336
+
2337
+ // Execute the wave
2338
+ const waveResult = await executeWave(
2339
+ waveTasks,
2340
+ waveIdx + 1,
2341
+ discovery.pending,
2342
+ orchConfig,
2343
+ repoRoot,
2344
+ batchState.batchId,
2345
+ batchState.pauseSignal,
2346
+ depGraph,
2347
+ batchState.orchBranch,
2348
+ handleResumeMonitorUpdate,
2349
+ (lanes) => {
2350
+ latestAllocatedLanes = lanes;
2351
+ batchState.currentLanes = lanes;
2352
+ // Track repos from newly allocated lanes for cleanup coverage
2353
+ for (const lane of lanes) {
2354
+ encounteredRepoRoots.add(resolveRepoRoot(lane.repoId, repoRoot, workspaceConfig));
2355
+ }
2356
+ if (seedPendingOutcomesForAllocatedLanes(lanes, allTaskOutcomes)) {
2357
+ persistRuntimeState(
2358
+ "wave-lanes-allocated",
2359
+ batchState,
2360
+ wavePlan,
2361
+ latestAllocatedLanes,
2362
+ allTaskOutcomes,
2363
+ discovery,
2364
+ stateRoot,
2365
+ );
2366
+ }
2367
+ },
2368
+ workspaceConfig,
2369
+ resumeBackend,
2370
+ emitAlert,
2371
+ supervisorAutonomy,
2372
+ runnerConfig.reviewer,
2373
+ runnerConfig.worker,
2374
+ runnerConfig.workerExcludeExtensions ?? [],
2375
+ onLaneTerminated ?? undefined,
2376
+ onLaneRespawned ?? undefined,
2377
+ );
2378
+
2379
+ batchState.waveResults.push(waveResult);
2380
+ batchState.currentLanes = [];
2381
+
2382
+ // Accumulate task outcomes
2383
+ latestAllocatedLanes = waveResult.allocatedLanes;
2384
+ for (const lr of waveResult.laneResults) {
2385
+ for (const taskOutcome of lr.tasks) {
2386
+ upsertTaskOutcome(allTaskOutcomes, taskOutcome);
2387
+ }
2388
+ }
2389
+
2390
+ // Accumulate results
2391
+ batchState.succeededTasks += waveResult.succeededTaskIds.length;
2392
+ batchState.failedTasks += waveResult.failedTaskIds.length;
2393
+ batchState.skippedTasks += waveResult.skippedTaskIds.length;
2394
+
2395
+ for (const taskId of waveResult.succeededTaskIds) {
2396
+ completedTaskSet.add(taskId);
2397
+ failedTaskSet.delete(taskId);
2398
+ reconnectTaskSet.delete(taskId);
2399
+ }
2400
+ for (const taskId of waveResult.failedTaskIds) {
2401
+ failedTaskSet.add(taskId);
2402
+ completedTaskSet.delete(taskId);
2403
+ reconnectTaskSet.delete(taskId);
2404
+ }
2405
+
2406
+ for (const blocked of waveResult.blockedTaskIds) {
2407
+ batchState.blockedTaskIds.add(blocked);
2408
+ }
2409
+
2410
+ // ── TP-076: Emit supervisor alerts for task failures ────
2411
+ for (const taskId of waveResult.failedTaskIds) {
2412
+ const outcome = allTaskOutcomes.find((o) => o.taskId === taskId);
2413
+ const laneForTask = latestAllocatedLanes.find((l) => l.tasks.some((t) => t.taskId === taskId));
2414
+ // TP-195: corrected the lookup to the real source of segment
2415
+ // metadata. `batchState.tasks` does not exist on
2416
+ // `OrchBatchRuntimeState` (it's on `PersistedBatchState`); the
2417
+ // previous read would have thrown `undefined.find is not a
2418
+ // function` if hit at runtime. The allocated lane carries the
2419
+ // `ParsedTask` payload via `AllocatedTask.task`, which has
2420
+ // `segmentIds`/`activeSegmentId` already populated by discovery.
2421
+ const taskRecord = laneForTask?.tasks.find((t) => t.taskId === taskId)?.task;
2422
+ const exitReason = outcome?.exitReason || "unknown";
2423
+ const hasPartialProgress = (outcome?.partialProgressCommits ?? 0) > 0;
2424
+ const segmentFrontier = buildSupervisorSegmentFrontierSnapshot(
2425
+ taskId,
2426
+ taskRecord?.segmentIds,
2427
+ taskRecord?.activeSegmentId,
2428
+ batchState.segments,
2429
+ outcome?.segmentId,
2430
+ );
2431
+ const segmentId =
2432
+ outcome?.segmentId ??
2433
+ taskRecord?.activeSegmentId ??
2434
+ segmentFrontier?.activeSegmentId ??
2435
+ undefined;
2436
+ const repoId = segmentId
2437
+ ? (segmentFrontier?.segments.find((segment) => segment.segmentId === segmentId)?.repoId ??
2438
+ laneForTask?.repoId)
2439
+ : laneForTask?.repoId;
2440
+ const segmentSummary = segmentId
2441
+ ? ` Segment: ${segmentId}${repoId ? ` (repo: ${repoId})` : ""}\n`
2442
+ : "";
2443
+ const frontierSummary = segmentFrontier
2444
+ ? ` Segment frontier: ${segmentFrontier.terminalSegments}/${segmentFrontier.totalSegments} terminal\n`
2445
+ : "";
2446
+ // TP-190 (#561): Mirror engine.ts emission — propagate the structured
2447
+ // exit category so /orch-resume task-failure alerts route through the
2448
+ // same supervisor playbook branches as /orch. Shared helper enforces
2449
+ // payload parity between the two emission sites.
2450
+ const { exitCategory, summaryLine: spawnFailureLine } = buildSpawnFailureAlertExtras(outcome);
2451
+ emitAlert({
2452
+ category: "task-failure",
2453
+ summary:
2454
+ `⚠️ Task failure: ${taskId}\n` +
2455
+ ` Exit reason: ${exitReason}\n` +
2456
+ spawnFailureLine +
2457
+ segmentSummary +
2458
+ frontierSummary +
2459
+ ` Lane: ${laneForTask?.laneId ?? "unknown"} (lane ${laneForTask?.laneNumber ?? "?"})\n` +
2460
+ ` Partial progress preserved: ${hasPartialProgress ? "yes" : "no"}\n` +
2461
+ ` Batch: wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave}/${taskLevelWaveCount ?? batchState.totalWaves}, ` +
2462
+ `${batchState.succeededTasks} succeeded, ${batchState.failedTasks} failed\n\n` +
2463
+ `Available actions:\n` +
2464
+ ` - orch_status() to inspect current state\n` +
2465
+ ` - orch_resume(force=true) to retry\n` +
2466
+ ` - Read STATUS.md and lane logs for diagnosis`,
2467
+ context: {
2468
+ taskId,
2469
+ segmentId,
2470
+ repoId,
2471
+ segmentFrontier,
2472
+ laneId: laneForTask?.laneId,
2473
+ laneNumber: laneForTask?.laneNumber,
2474
+ waveIndex: waveIdx,
2475
+ exitReason,
2476
+ exitCategory,
2477
+ partialProgress: hasPartialProgress,
2478
+ batchProgress: buildBatchProgressSnapshot(batchState),
2479
+ },
2480
+ });
2481
+ }
2482
+
2483
+ persistRuntimeState(
2484
+ "wave-execution-complete",
2485
+ batchState,
2486
+ wavePlan,
2487
+ latestAllocatedLanes,
2488
+ allTaskOutcomes,
2489
+ discovery,
2490
+ stateRoot,
2491
+ );
2492
+
2493
+ const elapsedSec = Math.round((waveResult.endedAt - waveResult.startedAt) / 1000);
2494
+ {
2495
+ const { displayWave: completeDisplayWave } = resolveDisplayWaveNumber(
2496
+ waveIdx,
2497
+ roundToTaskWave,
2498
+ taskLevelWaveCount,
2499
+ );
2500
+ onNotify(
2501
+ ORCH_MESSAGES.orchWaveComplete(
2502
+ completeDisplayWave,
2503
+ waveResult.succeededTaskIds.length,
2504
+ waveResult.failedTaskIds.length,
2505
+ waveResult.skippedTaskIds.length,
2506
+ elapsedSec,
2507
+ ),
2508
+ waveResult.failedTaskIds.length > 0 ? "warning" : "info",
2509
+ );
2510
+ }
2511
+
2512
+ // Check failure policy
2513
+ if (waveResult.stoppedEarly) {
2514
+ if (waveResult.policyApplied === "stop-all") {
2515
+ batchState.phase = "stopped";
2516
+ persistRuntimeState(
2517
+ "stop-all",
2518
+ batchState,
2519
+ wavePlan,
2520
+ latestAllocatedLanes,
2521
+ allTaskOutcomes,
2522
+ discovery,
2523
+ stateRoot,
2524
+ );
2525
+ onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-all"), "error");
2526
+ break;
2527
+ }
2528
+ if (waveResult.policyApplied === "stop-wave") {
2529
+ batchState.phase = "stopped";
2530
+ persistRuntimeState(
2531
+ "stop-wave",
2532
+ batchState,
2533
+ wavePlan,
2534
+ latestAllocatedLanes,
2535
+ allTaskOutcomes,
2536
+ discovery,
2537
+ stateRoot,
2538
+ );
2539
+ onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-wave"), "error");
2540
+ break;
2541
+ }
2542
+ }
2543
+
2544
+ // Merge handling (same as executeOrchBatch)
2545
+ let mergeResult: MergeWaveResult | null = null;
2546
+
2547
+ const laneOutcomeByNumber = new Map<number, LaneExecutionResult>();
2548
+ for (const lr of waveResult.laneResults) {
2549
+ laneOutcomeByNumber.set(lr.laneNumber, lr);
2550
+ }
2551
+ const mixedOutcomeLanes = waveResult.laneResults.filter((lr) => {
2552
+ const hasSucceeded = lr.tasks.some((t) => t.status === "succeeded");
2553
+ const hasHardFailure = lr.tasks.some((t) => t.status === "failed" || t.status === "stalled");
2554
+ return hasSucceeded && hasHardFailure;
2555
+ });
2556
+
2557
+ if (waveResult.succeededTaskIds.length > 0) {
2558
+ const mergeableLaneCount = waveResult.allocatedLanes.filter((lane) => {
2559
+ const outcome = laneOutcomeByNumber.get(lane.laneNumber);
2560
+ if (!outcome) return false;
2561
+ const hasSucceeded = outcome.tasks.some((t) => t.status === "succeeded");
2562
+ const hasHardFailure = outcome.tasks.some(
2563
+ (t) => t.status === "failed" || t.status === "stalled",
2564
+ );
2565
+ return hasSucceeded && !hasHardFailure;
2566
+ }).length;
2567
+
2568
+ if (mergeableLaneCount > 0) {
2569
+ batchState.phase = "merging";
2570
+ persistRuntimeState(
2571
+ "merge-start",
2572
+ batchState,
2573
+ wavePlan,
2574
+ latestAllocatedLanes,
2575
+ allTaskOutcomes,
2576
+ discovery,
2577
+ stateRoot,
2578
+ );
2579
+ onNotify(
2580
+ ORCH_MESSAGES.orchMergeStart(
2581
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2582
+ mergeableLaneCount,
2583
+ ),
2584
+ "info",
2585
+ );
2586
+
2587
+ mergeResult = await mergeWaveByRepo(
2588
+ waveResult.allocatedLanes,
2589
+ waveResult,
2590
+ waveIdx + 1,
2591
+ orchConfig,
2592
+ repoRoot,
2593
+ batchState.batchId,
2594
+ batchState.orchBranch,
2595
+ workspaceConfig,
2596
+ stateRoot,
2597
+ agentRoot,
2598
+ runnerConfig.testing_commands,
2599
+ undefined, // healthMonitor
2600
+ undefined, // forceMixedOutcome
2601
+ resumeBackend,
2602
+ );
2603
+ batchState.mergeResults.push(mergeResult);
2604
+
2605
+ // Emit per-lane merge notifications
2606
+ for (const lr of mergeResult.laneResults) {
2607
+ const durationSec = Math.round(lr.durationMs / 1000);
2608
+ // TP-032 R006-3: Check lr.error first — verification_new_failure lanes
2609
+ // have error set even though lr.result.status may be SUCCESS/CONFLICT_RESOLVED.
2610
+ if (lr.error) {
2611
+ onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error), "error");
2612
+ } else if (lr.result?.status === "SUCCESS") {
2613
+ onNotify(
2614
+ ORCH_MESSAGES.orchMergeLaneSuccess(lr.laneNumber, lr.result.merge_commit, durationSec),
2615
+ "info",
2616
+ );
2617
+ } else if (lr.result?.status === "CONFLICT_RESOLVED") {
2618
+ onNotify(
2619
+ ORCH_MESSAGES.orchMergeLaneConflictResolved(
2620
+ lr.laneNumber,
2621
+ lr.result.conflicts.length,
2622
+ durationSec,
2623
+ ),
2624
+ "info",
2625
+ );
2626
+ } else if (
2627
+ lr.result?.status === "CONFLICT_UNRESOLVED" ||
2628
+ lr.result?.status === "BUILD_FAILURE"
2629
+ ) {
2630
+ onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.result.status), "error");
2631
+ }
2632
+ }
2633
+
2634
+ if (mixedOutcomeLanes.length > 0) {
2635
+ const mixedIds = mixedOutcomeLanes.map((l) => `lane-${l.laneNumber}`).join(", ");
2636
+ const failureReason =
2637
+ `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
2638
+ `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`;
2639
+ mergeResult = {
2640
+ ...mergeResult,
2641
+ status: "partial",
2642
+ failedLane: mixedOutcomeLanes[0].laneNumber,
2643
+ failureReason,
2644
+ };
2645
+ // Update the already-pushed reference so persisted state reflects "partial"
2646
+ batchState.mergeResults[batchState.mergeResults.length - 1] = mergeResult;
2647
+ }
2648
+
2649
+ // TP-032 R006-3: Exclude verification_new_failure lanes from success count
2650
+ const mergedCount = mergeResult.laneResults.filter(
2651
+ (r) =>
2652
+ !r.error && (r.result?.status === "SUCCESS" || r.result?.status === "CONFLICT_RESOLVED"),
2653
+ ).length;
2654
+ const mergeTotalSec = Math.round(mergeResult.totalDurationMs / 1000);
2655
+
2656
+ if (mergeResult.status === "succeeded") {
2657
+ onNotify(
2658
+ ORCH_MESSAGES.orchMergeComplete(
2659
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2660
+ mergedCount,
2661
+ mergeTotalSec,
2662
+ ),
2663
+ "info",
2664
+ );
2665
+ } else {
2666
+ onNotify(
2667
+ ORCH_MESSAGES.orchMergeFailed(
2668
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2669
+ mergeResult.failedLane ?? 0,
2670
+ mergeResult.failureReason || "unknown",
2671
+ ),
2672
+ "error",
2673
+ );
2674
+
2675
+ // Emit repo-divergence summary when partial is caused by cross-repo outcome differences
2676
+ if (mergeResult.status === "partial") {
2677
+ const repoSummary = formatRepoMergeSummary(mergeResult);
2678
+ if (repoSummary) {
2679
+ onNotify(repoSummary, "warning");
2680
+ }
2681
+ }
2682
+ }
2683
+
2684
+ batchState.phase = "executing";
2685
+ persistRuntimeState(
2686
+ "merge-complete",
2687
+ batchState,
2688
+ wavePlan,
2689
+ latestAllocatedLanes,
2690
+ allTaskOutcomes,
2691
+ discovery,
2692
+ stateRoot,
2693
+ );
2694
+ } else if (mixedOutcomeLanes.length > 0) {
2695
+ const mixedIds = mixedOutcomeLanes.map((l) => `lane-${l.laneNumber}`).join(", ");
2696
+ mergeResult = {
2697
+ waveIndex: waveIdx + 1,
2698
+ status: "partial",
2699
+ laneResults: [],
2700
+ failedLane: mixedOutcomeLanes[0].laneNumber,
2701
+ failureReason:
2702
+ `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
2703
+ `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`,
2704
+ totalDurationMs: 0,
2705
+ };
2706
+ // Keep mergeResults in sync even when no mergeable lane exists.
2707
+ // Downstream retry/update paths assume the current wave has an entry.
2708
+ batchState.mergeResults.push(mergeResult);
2709
+ onNotify(
2710
+ ORCH_MESSAGES.orchMergeFailed(
2711
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2712
+ mergeResult.failedLane,
2713
+ mergeResult.failureReason || "unknown",
2714
+ ),
2715
+ "error",
2716
+ );
2717
+ } else {
2718
+ onNotify(
2719
+ ORCH_MESSAGES.orchMergeSkipped(
2720
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2721
+ ),
2722
+ "info",
2723
+ );
2724
+ }
2725
+ } else {
2726
+ onNotify(
2727
+ ORCH_MESSAGES.orchMergeSkipped(
2728
+ resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
2729
+ ),
2730
+ "info",
2731
+ );
2732
+ }
2733
+
2734
+ // ── TP-033: Safe-stop on rollback failure ─────────────────
2735
+ // When a verification rollback failed, force paused regardless of
2736
+ // on_merge_failure policy. The merge worktree and temp branch are
2737
+ // preserved for manual recovery using commands in the transaction record.
2738
+ if (mergeResult?.rollbackFailed) {
2739
+ // TP-033 R004-2: Include persistence error warning when transaction
2740
+ // record files may be missing, so operator knows to inspect manually
2741
+ const hasPersistErrors =
2742
+ mergeResult.persistenceErrors && mergeResult.persistenceErrors.length > 0;
2743
+ const persistWarning = hasPersistErrors
2744
+ ? ` WARNING: ${mergeResult.persistenceErrors!.length} transaction record(s) failed to persist — recovery file(s) may be missing.`
2745
+ : "";
2746
+
2747
+ execLog(
2748
+ "batch",
2749
+ batchState.batchId,
2750
+ "SAFE-STOP: verification rollback failed — forcing paused regardless of policy",
2751
+ {
2752
+ waveIndex: waveIdx,
2753
+ configPolicy: orchConfig.failure.on_merge_failure,
2754
+ ...(hasPersistErrors ? { persistenceErrors: mergeResult.persistenceErrors } : {}),
2755
+ },
2756
+ );
2757
+
2758
+ batchState.phase = "paused";
2759
+ batchState.errors.push(
2760
+ `Safe-stop at wave ${waveIdx + 1}: verification rollback failed. ` +
2761
+ `Merge worktree and temp branch preserved for recovery. ` +
2762
+ `Check transaction records in .pi/verification/ for recovery commands.` +
2763
+ persistWarning,
2764
+ );
2765
+ persistRuntimeState(
2766
+ "merge-rollback-safe-stop",
2767
+ batchState,
2768
+ wavePlan,
2769
+ latestAllocatedLanes,
2770
+ allTaskOutcomes,
2771
+ discovery,
2772
+ stateRoot,
2773
+ );
2774
+ onNotify(
2775
+ `🛑 Safe-stop: verification rollback failed at wave ${waveIdx + 1}. ` +
2776
+ `Batch force-paused. Merge worktree preserved for manual recovery. ` +
2777
+ `See .pi/verification/ transaction records for recovery commands.` +
2778
+ persistWarning,
2779
+ "error",
2780
+ );
2781
+
2782
+ // ── TP-076: Emit supervisor alert for rollback safe-stop ──
2783
+ const rollbackRepoId = extractFailedRepoId(mergeResult) ?? undefined;
2784
+ emitAlert({
2785
+ category: "merge-failure",
2786
+ summary:
2787
+ `⚠️ Merge failed for wave ${waveIdx + 1} — verification rollback failed\n` +
2788
+ ` Batch force-paused for manual recovery.\n` +
2789
+ ` Check .pi/verification/ for recovery commands.\n\n` +
2790
+ `Available actions:\n` +
2791
+ ` - Check .pi/verification/ transaction records\n` +
2792
+ ` - orch_status() to inspect current state\n` +
2793
+ ` - orch_resume(force=true) after manual recovery`,
2794
+ context: {
2795
+ waveIndex: waveIdx,
2796
+ laneNumber: mergeResult.failedLane ?? undefined,
2797
+ repoId: rollbackRepoId,
2798
+ mergeError: `Safe-stop: verification rollback failed at wave ${waveIdx + 1}`,
2799
+ batchProgress: buildBatchProgressSnapshot(batchState),
2800
+ },
2801
+ });
2802
+
2803
+ preserveWorktreesForResume = true;
2804
+ break;
2805
+ }
2806
+
2807
+ // Handle merge failure — TP-033 Step 2 (R006): Retry policy matrix via shared applyMergeRetryLoop.
2808
+ // Uses the same centralized loop as engine.ts for guaranteed parity.
2809
+ if (mergeResult && (mergeResult.status === "failed" || mergeResult.status === "partial")) {
2810
+ // Initialize resilience state if not yet present
2811
+ if (!batchState.resilience) {
2812
+ batchState.resilience = defaultResilienceState();
2813
+ }
2814
+
2815
+ const mergeRepoId = extractFailedRepoId(mergeResult) ?? undefined;
2816
+ const retryOutcome = await applyMergeRetryLoop(
2817
+ mergeResult,
2818
+ waveIdx,
2819
+ batchState.resilience.retryCountByScope,
2820
+ {
2821
+ performMerge: async () => {
2822
+ batchState.phase = "merging";
2823
+ return await mergeWaveByRepo(
2824
+ waveResult.allocatedLanes,
2825
+ waveResult,
2826
+ waveIdx + 1,
2827
+ orchConfig,
2828
+ repoRoot,
2829
+ batchState.batchId,
2830
+ batchState.orchBranch,
2831
+ workspaceConfig,
2832
+ stateRoot,
2833
+ agentRoot,
2834
+ runnerConfig.testing_commands,
2835
+ undefined, // healthMonitor
2836
+ undefined, // forceMixedOutcome
2837
+ resumeBackend,
2838
+ );
2839
+ },
2840
+ persist: (trigger) =>
2841
+ persistRuntimeState(
2842
+ trigger,
2843
+ batchState,
2844
+ wavePlan,
2845
+ latestAllocatedLanes,
2846
+ allTaskOutcomes,
2847
+ discovery,
2848
+ stateRoot,
2849
+ ),
2850
+ log: (message, details) => execLog("batch", batchState.batchId, message, details),
2851
+ notify: (message, level) => onNotify(message, level),
2852
+ updateMergeResult: (result) => {
2853
+ mergeResult = result;
2854
+ batchState.mergeResults[batchState.mergeResults.length - 1] = result;
2855
+ },
2856
+ sleep: sleepSync,
2857
+ },
2858
+ );
2859
+
2860
+ if (retryOutcome.kind === "retry_succeeded") {
2861
+ mergeResult = retryOutcome.mergeResult;
2862
+ batchState.phase = "executing";
2863
+ persistRuntimeState(
2864
+ "merge-retry-succeeded",
2865
+ batchState,
2866
+ wavePlan,
2867
+ latestAllocatedLanes,
2868
+ allTaskOutcomes,
2869
+ discovery,
2870
+ stateRoot,
2871
+ );
2872
+ // Fall through to normal post-merge flow
2873
+ } else if (retryOutcome.kind === "safe_stop") {
2874
+ mergeResult = retryOutcome.mergeResult;
2875
+ batchState.phase = "paused";
2876
+ batchState.errors.push(retryOutcome.errorMessage);
2877
+ persistRuntimeState(
2878
+ "merge-rollback-safe-stop",
2879
+ batchState,
2880
+ wavePlan,
2881
+ latestAllocatedLanes,
2882
+ allTaskOutcomes,
2883
+ discovery,
2884
+ stateRoot,
2885
+ );
2886
+ onNotify(retryOutcome.notifyMessage, "error");
2887
+
2888
+ // ── TP-076: Emit supervisor alert for merge safe-stop ──
2889
+ emitAlert({
2890
+ category: "merge-failure",
2891
+ summary:
2892
+ `⚠️ Merge failed for wave ${waveIdx + 1} — rollback failure\n` +
2893
+ ` Error: ${retryOutcome.errorMessage}\n\n` +
2894
+ `Available actions:\n` +
2895
+ ` - orch_status() to inspect current state\n` +
2896
+ ` - orch_resume(force=true) after manual recovery`,
2897
+ context: {
2898
+ waveIndex: waveIdx,
2899
+ laneNumber: mergeResult.failedLane ?? undefined,
2900
+ repoId: mergeRepoId,
2901
+ mergeError: retryOutcome.errorMessage,
2902
+ batchProgress: buildBatchProgressSnapshot(batchState),
2903
+ },
2904
+ });
2905
+
2906
+ preserveWorktreesForResume = true;
2907
+ break;
2908
+ } else if (retryOutcome.kind === "exhausted") {
2909
+ // TP-033 R006-2: Force paused regardless of on_merge_failure config.
2910
+ mergeResult = retryOutcome.mergeResult;
2911
+ const exhaustionMsg =
2912
+ retryOutcome.errorMessage +
2913
+ ` [${retryOutcome.classification ?? "unknown"} ${retryOutcome.lastDecision.currentAttempt}/${retryOutcome.lastDecision.maxAttempts}, scope=${retryOutcome.scopeKey}]`;
2914
+
2915
+ execLog("batch", batchState.batchId, `merge retry exhausted — forcing paused`, {
2916
+ classification: retryOutcome.classification,
2917
+ scopeKey: retryOutcome.scopeKey,
2918
+ attempts: retryOutcome.lastDecision.currentAttempt,
2919
+ maxAttempts: retryOutcome.lastDecision.maxAttempts,
2920
+ });
2921
+
2922
+ batchState.phase = "paused";
2923
+ batchState.errors.push(exhaustionMsg);
2924
+ persistRuntimeState(
2925
+ "merge-retry-exhausted",
2926
+ batchState,
2927
+ wavePlan,
2928
+ latestAllocatedLanes,
2929
+ allTaskOutcomes,
2930
+ discovery,
2931
+ stateRoot,
2932
+ );
2933
+ onNotify(retryOutcome.notifyMessage, "error");
2934
+
2935
+ // ── TP-076: Emit supervisor alert for merge retry exhausted ──
2936
+ emitAlert({
2937
+ category: "merge-failure",
2938
+ summary:
2939
+ `⚠️ Merge failed for wave ${waveIdx + 1} — retry exhausted\n` +
2940
+ ` Classification: ${retryOutcome.classification ?? "unknown"}\n` +
2941
+ ` Error: ${exhaustionMsg}\n\n` +
2942
+ `Available actions:\n` +
2943
+ ` - Investigate merge failure and retry manually\n` +
2944
+ ` - orch_status() to inspect current state\n` +
2945
+ ` - orch_resume(force=true) after fixing the issue`,
2946
+ context: {
2947
+ waveIndex: waveIdx,
2948
+ laneNumber: mergeResult.failedLane ?? undefined,
2949
+ repoId: mergeRepoId,
2950
+ mergeError: exhaustionMsg,
2951
+ batchProgress: buildBatchProgressSnapshot(batchState),
2952
+ },
2953
+ });
2954
+
2955
+ preserveWorktreesForResume = true;
2956
+ break;
2957
+ } else {
2958
+ // kind === "no_retry": fall through to standard on_merge_failure policy
2959
+ mergeResult = retryOutcome.mergeResult;
2960
+ const policyResult = computeMergeFailurePolicy(mergeResult, waveIdx, orchConfig);
2961
+ const classNote = retryOutcome.classification
2962
+ ? ` [not retriable: ${retryOutcome.classification}, scope=${retryOutcome.scopeKey}]`
2963
+ : "";
2964
+
2965
+ execLog(
2966
+ "batch",
2967
+ batchState.batchId,
2968
+ `merge failure — applying ${policyResult.policy} policy${classNote}`,
2969
+ policyResult.logDetails,
2970
+ );
2971
+
2972
+ batchState.phase = policyResult.targetPhase;
2973
+ batchState.errors.push(policyResult.errorMessage + classNote);
2974
+ persistRuntimeState(
2975
+ policyResult.persistTrigger,
2976
+ batchState,
2977
+ wavePlan,
2978
+ latestAllocatedLanes,
2979
+ allTaskOutcomes,
2980
+ discovery,
2981
+ stateRoot,
2982
+ );
2983
+ onNotify(policyResult.notifyMessage + classNote, policyResult.notifyLevel);
2984
+
2985
+ // ── TP-076: Emit supervisor alert for merge failure (no-retry policy) ──
2986
+ emitAlert({
2987
+ category: "merge-failure",
2988
+ summary:
2989
+ `⚠️ Merge failed for wave ${waveIdx + 1}\n` +
2990
+ ` Policy: ${policyResult.policy}${classNote}\n` +
2991
+ ` Error: ${mergeResult.failureReason || "unknown"}\n\n` +
2992
+ `Available actions:\n` +
2993
+ ` - Investigate failed merge\n` +
2994
+ ` - orch_status() to inspect current state\n` +
2995
+ ` - orch_resume(force=true) after fixing the issue`,
2996
+ context: {
2997
+ waveIndex: waveIdx,
2998
+ laneNumber: mergeResult.failedLane ?? undefined,
2999
+ repoId: mergeRepoId,
3000
+ mergeError: mergeResult.failureReason || "unknown",
3001
+ batchProgress: buildBatchProgressSnapshot(batchState),
3002
+ },
3003
+ });
3004
+
3005
+ preserveWorktreesForResume = true;
3006
+ break;
3007
+ }
3008
+ }
3009
+
3010
+ // Post-merge: reset worktrees for next wave
3011
+ // TP-032 R006-3: Exclude verification_new_failure lanes from branch cleanup
3012
+ if (mergeResult && mergeResult.status === "succeeded") {
3013
+ for (const lr of mergeResult.laneResults) {
3014
+ if (
3015
+ !lr.error &&
3016
+ (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
3017
+ ) {
3018
+ const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
3019
+ const ancestorCheck = runGit(
3020
+ ["merge-base", "--is-ancestor", lr.sourceBranch, lr.targetBranch],
3021
+ laneRepoRoot,
3022
+ );
3023
+ if (ancestorCheck.ok) {
3024
+ deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
3025
+ }
3026
+ }
3027
+ }
3028
+ }
3029
+
3030
+ // ── TP-028: Preserve partial progress before inter-wave reset ──
3031
+ // Hoisted outside the if-block so unsafeBranches is accessible to the
3032
+ // reset loop below — both blocks share the same guard condition.
3033
+ let ppUnsafeBranches = new Set<string>();
3034
+ if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
3035
+ const ppOpId = resolveOperatorId(orchConfig);
3036
+ const ppResult = preserveFailedLaneProgress(
3037
+ latestAllocatedLanes,
3038
+ allTaskOutcomes,
3039
+ ppOpId,
3040
+ batchState.batchId,
3041
+ (repoId) => {
3042
+ const perRepoRoot = resolveRepoRoot(repoId, repoRoot, workspaceConfig);
3043
+ let targetBranch = batchState.orchBranch;
3044
+ if (repoId && perRepoRoot !== repoRoot) {
3045
+ try {
3046
+ targetBranch = resolveBaseBranch(
3047
+ repoId,
3048
+ perRepoRoot,
3049
+ batchState.orchBranch,
3050
+ workspaceConfig,
3051
+ );
3052
+ } catch {
3053
+ /* fall back to orchBranch */
3054
+ }
3055
+ }
3056
+ return { repoRoot: perRepoRoot, targetBranch };
3057
+ },
3058
+ );
3059
+ ppUnsafeBranches = ppResult.unsafeBranches;
3060
+ if (ppResult.results.some((r) => r.saved)) {
3061
+ execLog(
3062
+ "batch",
3063
+ batchState.batchId,
3064
+ `preserved partial progress for ${ppResult.results.filter((r) => r.saved).length} failed task(s) before inter-wave reset`,
3065
+ );
3066
+ }
3067
+ // Log per-task warnings for failed preservation attempts
3068
+ for (const r of ppResult.results) {
3069
+ if (!r.saved && (r.commitCount > 0 || r.error)) {
3070
+ execLog(
3071
+ "batch",
3072
+ batchState.batchId,
3073
+ `WARNING: Failed to preserve partial progress for task ${r.taskId} ` +
3074
+ `(${r.commitCount} commit(s) at risk on lane branch)`,
3075
+ { taskId: r.taskId, commitCount: r.commitCount, error: r.error ?? "unknown" },
3076
+ );
3077
+ }
3078
+ }
3079
+ if (ppUnsafeBranches.size > 0) {
3080
+ execLog(
3081
+ "batch",
3082
+ batchState.batchId,
3083
+ `WARNING: ${ppUnsafeBranches.size} lane branch(es) could not be preserved — skipping reset for those lanes to prevent commit loss`,
3084
+ { unsafeBranches: [...ppUnsafeBranches] },
3085
+ );
3086
+ }
3087
+ // TP-028: Stamp task outcomes with partial progress data for persistence
3088
+ applyPartialProgressToOutcomes(ppResult, allTaskOutcomes);
3089
+ }
3090
+
3091
+ if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
3092
+ const wtPrefix = orchConfig.orchestrator.worktree_prefix;
3093
+ const resetOpId = resolveOperatorId(orchConfig);
3094
+ // TP-029 R006: Track worktrees that failed reset AND removal
3095
+ // so the cleanup gate only fires on true stale state, not
3096
+ // successfully-reset reusable worktrees. (Parity with engine.ts)
3097
+ const failedRemovalWorktrees = new Map<
3098
+ string,
3099
+ { repoId: string | undefined; paths: string[] }
3100
+ >();
3101
+
3102
+ // Use encounteredRepoRoots which includes both persisted lanes
3103
+ // AND newly allocated lanes from resumed waves, ensuring repos
3104
+ // introduced after resume starts are covered.
3105
+ // Per-repo target branch: primary repo uses orchBranch, secondary
3106
+ // repos resolve their own branch (same as cleanup — see section 11).
3107
+ for (const perRepoRoot of encounteredRepoRoots) {
3108
+ const existingWorktrees = listWorktrees(wtPrefix, perRepoRoot, resetOpId, batchState.batchId);
3109
+ if (existingWorktrees.length > 0) {
3110
+ let targetBranch: string;
3111
+ if (perRepoRoot === repoRoot) {
3112
+ targetBranch = batchState.orchBranch;
3113
+ } else {
3114
+ const repoId = resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
3115
+ try {
3116
+ targetBranch = resolveBaseBranch(
3117
+ repoId,
3118
+ perRepoRoot,
3119
+ batchState.orchBranch,
3120
+ workspaceConfig,
3121
+ );
3122
+ } catch {
3123
+ // If resolution fails, fall back to orchBranch (reset will
3124
+ // fail gracefully and trigger worktree removal)
3125
+ targetBranch = batchState.orchBranch;
3126
+ }
3127
+ }
3128
+ for (const wt of existingWorktrees) {
3129
+ // TP-028: Skip reset for worktrees whose lane branch has
3130
+ // unsaved partial progress (preservation failed with commits)
3131
+ if (ppUnsafeBranches.has(wt.branch)) {
3132
+ execLog(
3133
+ "batch",
3134
+ batchState.batchId,
3135
+ `skipping worktree reset for lane ${wt.laneNumber} — branch "${wt.branch}" has unsaved partial progress`,
3136
+ { path: wt.path, branch: wt.branch },
3137
+ );
3138
+ continue;
3139
+ }
3140
+
3141
+ const resetResult = safeResetWorktree(wt, targetBranch, perRepoRoot);
3142
+ if (!resetResult.success) {
3143
+ try {
3144
+ removeWorktree(wt, perRepoRoot);
3145
+ } catch {
3146
+ forceCleanupWorktree(wt, perRepoRoot, batchState.batchId);
3147
+ // Track this worktree for the cleanup gate — it may still be registered
3148
+ const perRepoId =
3149
+ perRepoRoot === repoRoot ? undefined : resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
3150
+ if (!failedRemovalWorktrees.has(perRepoRoot)) {
3151
+ failedRemovalWorktrees.set(perRepoRoot, { repoId: perRepoId, paths: [] });
3152
+ }
3153
+ failedRemovalWorktrees.get(perRepoRoot)!.paths.push(wt.path);
3154
+ }
3155
+ }
3156
+ }
3157
+ }
3158
+ }
3159
+
3160
+ // ── TP-029: Post-merge cleanup gate (parity with engine.ts) ──
3161
+ // Only gate on worktrees that the reset loop tried and failed
3162
+ // to remove. Successfully-reset reusable worktrees are expected
3163
+ // to remain registered — they will be reused in the next wave.
3164
+ // For each failed-removal worktree, verify it is still registered
3165
+ // before classifying it as truly stale.
3166
+ const cleanupGateFailures: CleanupGateRepoFailure[] = [];
3167
+ if (failedRemovalWorktrees.size > 0) {
3168
+ for (const [perRepoRoot, { repoId: perRepoId, paths: failedPaths }] of failedRemovalWorktrees) {
3169
+ const remaining = listWorktrees(wtPrefix, perRepoRoot, resetOpId, batchState.batchId);
3170
+ const remainingPaths = new Set(remaining.map((wt) => wt.path));
3171
+ // Only report worktrees that were targeted for removal but are still registered
3172
+ const stale = failedPaths.filter((p) => remainingPaths.has(p));
3173
+ if (stale.length > 0) {
3174
+ cleanupGateFailures.push({
3175
+ repoRoot: perRepoRoot,
3176
+ repoId: perRepoId,
3177
+ staleWorktrees: stale,
3178
+ });
3179
+ }
3180
+ }
3181
+ }
3182
+
3183
+ if (cleanupGateFailures.length > 0) {
3184
+ const gatePolicyResult = computeCleanupGatePolicy(waveIdx, cleanupGateFailures);
3185
+
3186
+ execLog(
3187
+ "batch",
3188
+ batchState.batchId,
3189
+ `cleanup gate failed — pausing batch`,
3190
+ gatePolicyResult.logDetails,
3191
+ );
3192
+
3193
+ batchState.phase = gatePolicyResult.targetPhase;
3194
+ batchState.errors.push(gatePolicyResult.errorMessage);
3195
+ persistRuntimeState(
3196
+ gatePolicyResult.persistTrigger,
3197
+ batchState,
3198
+ wavePlan,
3199
+ latestAllocatedLanes,
3200
+ allTaskOutcomes,
3201
+ discovery,
3202
+ stateRoot,
3203
+ );
3204
+ onNotify(gatePolicyResult.notifyMessage, gatePolicyResult.notifyLevel);
3205
+ preserveWorktreesForResume = true;
3206
+ break;
3207
+ }
3208
+ }
3209
+ }
3210
+
3211
+ // ── Pre-cleanup: Determine if worktrees should be preserved ──
3212
+ // TP-031 (R006): Parity with engine.ts — this check MUST run before cleanup
3213
+ // so that worktrees survive when failedTasks > 0. Without this, cleanup
3214
+ // deletes worktrees before the batch is marked "paused", breaking resumability.
3215
+ if (
3216
+ !preserveWorktreesForResume &&
3217
+ ((batchState.phase as OrchBatchPhase) === "executing" ||
3218
+ (batchState.phase as OrchBatchPhase) === "merging") &&
3219
+ batchState.failedTasks > 0
3220
+ ) {
3221
+ preserveWorktreesForResume = true;
3222
+ execLog(
3223
+ "resume",
3224
+ batchState.batchId,
3225
+ "pre-cleanup: failedTasks > 0 detected, preserving worktrees for resume",
3226
+ );
3227
+ }
3228
+
3229
+ // ── 11. Cleanup and terminal state ───────────────────────────
3230
+
3231
+ // ── TP-028: Preserve partial progress before terminal cleanup ──
3232
+ if (!preserveWorktreesForResume) {
3233
+ const ppOpId = resolveOperatorId(orchConfig);
3234
+ const ppResult = preserveFailedLaneProgress(
3235
+ latestAllocatedLanes,
3236
+ allTaskOutcomes,
3237
+ ppOpId,
3238
+ batchState.batchId,
3239
+ (repoId) => {
3240
+ const perRepoRoot = resolveRepoRoot(repoId, repoRoot, workspaceConfig);
3241
+ let targetBranch = batchState.orchBranch;
3242
+ if (repoId && perRepoRoot !== repoRoot) {
3243
+ try {
3244
+ targetBranch = resolveBaseBranch(repoId, perRepoRoot, batchState.orchBranch, workspaceConfig);
3245
+ } catch {
3246
+ /* fall back to orchBranch */
3247
+ }
3248
+ }
3249
+ return { repoRoot: perRepoRoot, targetBranch };
3250
+ },
3251
+ );
3252
+ if (ppResult.results.some((r) => r.saved)) {
3253
+ execLog(
3254
+ "batch",
3255
+ batchState.batchId,
3256
+ `preserved partial progress for ${ppResult.results.filter((r) => r.saved).length} failed task(s) before terminal cleanup`,
3257
+ );
3258
+ }
3259
+ // Log warnings for failed preservation attempts — at terminal cleanup
3260
+ // we cannot skip deletion (batch is ending), but operators need to know
3261
+ // that commits may remain reachable only via the reflog.
3262
+ for (const r of ppResult.results) {
3263
+ if (!r.saved && (r.commitCount > 0 || r.error)) {
3264
+ execLog(
3265
+ "batch",
3266
+ batchState.batchId,
3267
+ `WARNING: Failed to preserve partial progress for task ${r.taskId} ` +
3268
+ `(${r.commitCount} commit(s) may become unreachable after cleanup)`,
3269
+ { taskId: r.taskId, commitCount: r.commitCount, error: r.error ?? "unknown" },
3270
+ );
3271
+ }
3272
+ }
3273
+ // TP-028: Stamp task outcomes with partial progress data for persistence
3274
+ applyPartialProgressToOutcomes(ppResult, allTaskOutcomes);
3275
+ }
3276
+
3277
+ if (!preserveWorktreesForResume) {
3278
+ const wtPrefix = orchConfig.orchestrator.worktree_prefix;
3279
+ const cleanupOpId = resolveOperatorId(orchConfig);
3280
+
3281
+ // Use encounteredRepoRoots which includes both persisted lanes
3282
+ // AND newly allocated lanes from resumed waves, ensuring repos
3283
+ // introduced after resume starts are cleaned up.
3284
+ //
3285
+ // Per-repo target branch resolution (workspace-mode correctness):
3286
+ // In repo mode, orchBranch is the correct target for all worktrees.
3287
+ // In workspace mode, the orchBranch only exists in the primary repo.
3288
+ // Secondary repos were merged against their own resolved base branch
3289
+ // (via resolveBaseBranch in mergeWaveByRepo), so unmerged-branch
3290
+ // protection must compare against that same per-repo branch.
3291
+ for (const perRepoRoot of encounteredRepoRoots) {
3292
+ let targetBranch: string | undefined;
3293
+ if (perRepoRoot === repoRoot) {
3294
+ // Primary repo: lane branches were merged into orchBranch
3295
+ targetBranch = batchState.orchBranch;
3296
+ } else {
3297
+ // Secondary repo (workspace mode): resolve the repo's own branch
3298
+ // using the same logic as mergeWaveByRepo. Find repoId by matching
3299
+ // the resolved path back to workspace config.
3300
+ const repoId = resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
3301
+ try {
3302
+ targetBranch = resolveBaseBranch(repoId, perRepoRoot, batchState.orchBranch, workspaceConfig);
3303
+ } catch {
3304
+ // resolveBaseBranch may throw if HEAD is detached and no
3305
+ // defaultBranch is configured. Fall back to undefined which
3306
+ // skips branch protection (branches are deleted without
3307
+ // merge-status check — safe because successfully merged
3308
+ // branches were already cleaned up in post-merge steps).
3309
+ targetBranch = undefined;
3310
+ }
3311
+ }
3312
+ removeAllWorktrees(
3313
+ wtPrefix,
3314
+ perRepoRoot,
3315
+ cleanupOpId,
3316
+ targetBranch,
3317
+ batchState.batchId,
3318
+ orchConfig,
3319
+ );
3320
+ }
3321
+ }
3322
+
3323
+ batchState.endedAt = Date.now();
3324
+ const totalElapsedSec = Math.round((batchState.endedAt - batchState.startedAt) / 1000);
3325
+
3326
+ if (
3327
+ (batchState.phase as OrchBatchPhase) === "executing" ||
3328
+ (batchState.phase as OrchBatchPhase) === "merging"
3329
+ ) {
3330
+ if (batchState.failedTasks > 0) {
3331
+ // TP-031: Parity with engine.ts — default to "paused" so the batch is
3332
+ // resumable without --force. "failed" is reserved for unrecoverable
3333
+ // invariant violations after retry exhaustion.
3334
+ // NOTE: preserveWorktreesForResume was already set pre-cleanup to ensure
3335
+ // worktrees survive; this just sets the phase for state persistence.
3336
+ batchState.phase = "paused";
3337
+ } else {
3338
+ batchState.phase = "completed";
3339
+ }
3340
+ }
3341
+
3342
+ // ── Auto-Integration & Orch Branch Preservation (TP-022 Step 4) ──
3343
+ // Parity with engine.ts: defer to the supervisor for "supervised"/"auto" modes, else show manual guidance.
3344
+ // Gate: only run for terminal phases (completed/failed). Paused/stopped batches
3345
+ // are not yet done — integration would mutate refs prematurely.
3346
+ //
3347
+ // TP-043: "supervised" and "auto" integration modes are now owned by the
3348
+ // supervisor agent. Legacy engine fast-forward is removed — supervisor
3349
+ // handles all non-manual integration after batch_complete event.
3350
+ const mergedTaskCount = batchState.succeededTasks;
3351
+ // TP-195: hoist `batchState.phase` to a fresh local with the wide
3352
+ // `OrchBatchPhase` type. TypeScript's narrowing-on-property semantics
3353
+ // under `strict: false` carries assignments forward through the
3354
+ // function (visible in the `(batchState.phase as OrchBatchPhase) === ...`
3355
+ // pattern already used earlier in this function), which here narrows
3356
+ // `batchState.phase` to a subtype that excludes `"completed"` and
3357
+ // `"failed"`. Hoisting to a typed local breaks the narrowing chain so
3358
+ // the comparisons typecheck without a per-call cast. Runtime
3359
+ // evaluation is identical.
3360
+ const phaseAtTerminal = batchState.phase as OrchBatchPhase;
3361
+ const isTerminalPhase = phaseAtTerminal === "completed" || phaseAtTerminal === "failed";
3362
+ if (
3363
+ isTerminalPhase &&
3364
+ !preserveWorktreesForResume &&
3365
+ batchState.orchBranch &&
3366
+ mergedTaskCount > 0
3367
+ ) {
3368
+ if (
3369
+ orchConfig.orchestrator.integration === "supervised" ||
3370
+ orchConfig.orchestrator.integration === "auto"
3371
+ ) {
3372
+ // TP-043: Supervisor-managed integration modes. Defer to supervisor.
3373
+ execLog(
3374
+ "resume",
3375
+ batchState.batchId,
3376
+ `integration deferred to supervisor (mode: ${orchConfig.orchestrator.integration})`,
3377
+ );
3378
+ } else {
3379
+ // Manual mode (default): show integration guidance
3380
+ onNotify(
3381
+ ORCH_MESSAGES.orchIntegrationManual(
3382
+ batchState.orchBranch,
3383
+ batchState.baseBranch,
3384
+ mergedTaskCount,
3385
+ ),
3386
+ "info",
3387
+ );
3388
+ }
3389
+ }
3390
+
3391
+ persistRuntimeState(
3392
+ "batch-terminal",
3393
+ batchState,
3394
+ wavePlan,
3395
+ latestAllocatedLanes,
3396
+ allTaskOutcomes,
3397
+ discovery,
3398
+ stateRoot,
3399
+ );
3400
+
3401
+ // ── TP-076: Emit supervisor alert for batch completion ──────
3402
+ // TP-195: reuse the hoisted-typed phase to avoid the same narrowing
3403
+ // artifact as the `isTerminalPhase` check above.
3404
+ if (phaseAtTerminal === "completed" || phaseAtTerminal === "failed") {
3405
+ const batchDurationMs = batchState.endedAt ? batchState.endedAt - batchState.startedAt : 0;
3406
+ const durationStr =
3407
+ batchDurationMs > 0
3408
+ ? `${Math.floor(batchDurationMs / 60000)}m ${Math.round((batchDurationMs % 60000) / 1000)}s`
3409
+ : "unknown";
3410
+ if (batchState.phase === "completed" && batchState.failedTasks === 0) {
3411
+ emitAlert({
3412
+ category: "batch-complete",
3413
+ summary:
3414
+ `✅ Batch ${batchState.batchId} completed\n` +
3415
+ ` ${batchState.succeededTasks}/${batchState.totalTasks} tasks succeeded\n` +
3416
+ ` ${batchState.taskLevelWaveCount ?? batchState.totalWaves} wave(s), duration: ${durationStr}\n` +
3417
+ ` Merged to orch branch: ${batchState.orchBranch}\n\n` +
3418
+ `Ready for integration. Run orch_integrate() or review first.`,
3419
+ context: {
3420
+ batchProgress: buildBatchProgressSnapshot(batchState),
3421
+ batchDurationMs,
3422
+ },
3423
+ });
3424
+ } else {
3425
+ emitAlert({
3426
+ category: "batch-complete",
3427
+ summary:
3428
+ `⚠️ Batch ${batchState.batchId} finished with failures\n` +
3429
+ ` ${batchState.succeededTasks} succeeded, ${batchState.failedTasks} failed, ` +
3430
+ `${batchState.skippedTasks} skipped, ${batchState.blockedTasks} blocked\n` +
3431
+ ` Duration: ${durationStr}\n\n` +
3432
+ `Available actions:\n` +
3433
+ ` - orch_status() to review final state\n` +
3434
+ ` - orch_integrate() if succeeded work should be kept\n` +
3435
+ ` - orch_resume(force=true) to retry failed tasks`,
3436
+ context: {
3437
+ batchProgress: buildBatchProgressSnapshot(batchState),
3438
+ batchDurationMs,
3439
+ },
3440
+ });
3441
+ }
3442
+ }
3443
+
3444
+ // ── TP-031: Emit diagnostic reports (JSONL + markdown) ──
3445
+ // Non-fatal: errors are logged but never crash batch finalization.
3446
+ emitDiagnosticReports(
3447
+ assembleDiagnosticInput(
3448
+ orchConfig,
3449
+ batchState,
3450
+ wavePlan,
3451
+ latestAllocatedLanes,
3452
+ allTaskOutcomes,
3453
+ stateRoot,
3454
+ ),
3455
+ );
3456
+
3457
+ if (batchState.phase === "paused" || batchState.phase === "stopped") {
3458
+ execLog("resume", batchState.batchId, "resumed batch ended in non-terminal state", {
3459
+ phase: batchState.phase,
3460
+ });
3461
+ } else {
3462
+ onNotify(
3463
+ ORCH_MESSAGES.resumeComplete(
3464
+ batchState.batchId,
3465
+ batchState.succeededTasks,
3466
+ batchState.failedTasks,
3467
+ batchState.skippedTasks,
3468
+ batchState.blockedTasks,
3469
+ totalElapsedSec,
3470
+ ),
3471
+ batchState.failedTasks > 0 ? "warning" : "info",
3472
+ );
3473
+
3474
+ if (batchState.phase === "completed") {
3475
+ try {
3476
+ deleteBatchState(stateRoot);
3477
+ execLog("state", batchState.batchId, "state file deleted on clean resume completion");
3478
+ } catch {
3479
+ // Best-effort
3480
+ }
3481
+ }
3482
+ }
3483
+ }
3484
+
3485
+ // TP-043: attemptAutoIntegration is no longer called from engine.ts or resume.ts.
3486
+ // Supervisor-managed integration ("supervised" and "auto" modes) is handled by
3487
+ // the supervisor agent after batch_complete. The helper remains in merge.ts for
3488
+ // use by the supervisor's integration flow.