taskplane 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1092 +1,1096 @@
1
- /**
2
- * Resume logic for paused/interrupted batches
3
- * @module orch/resume
4
- */
5
- import { existsSync } from "fs";
6
- import { join } from "path";
7
-
8
- import { runDiscovery } from "./discovery.ts";
9
- import { executeOrchBatch } from "./engine.ts";
10
- import { execLog, executeWave, pollUntilTaskComplete, spawnLaneSession, tmuxHasSession } from "./execution.ts";
11
- import type { MonitorUpdateCallback } from "./execution.ts";
12
- import { runGit } from "./git.ts";
13
- import { mergeWave } from "./merge.ts";
14
- import { ORCH_MESSAGES } from "./messages.ts";
15
- import { deleteBatchState, hasTaskDoneMarker, loadBatchState, persistRuntimeState, seedPendingOutcomesForAllocatedLanes, syncTaskOutcomesFromMonitor, upsertTaskOutcome } from "./persistence.ts";
16
- import { StateFileError } from "./types.ts";
17
- import type { AllocatedLane, AllocatedTask, LaneExecutionResult, LaneTaskOutcome, LaneTaskStatus, MergeWaveResult, OrchBatchPhase, OrchBatchRuntimeState, OrchestratorConfig, ParsedTask, PersistedBatchState, ReconciledTaskState, ResumeEligibility, ResumePoint, TaskRunnerConfig, WaveExecutionResult } from "./types.ts";
18
- import { buildDependencyGraph } from "./waves.ts";
19
- import { deleteBranchBestEffort, listWorktrees, removeAllWorktrees, removeWorktree, safeResetWorktree } from "./worktree.ts";
20
-
21
- // ── Resume Pure Functions ────────────────────────────────────────────
22
-
23
/**
 * Decide from the persisted phase alone whether a batch may be resumed.
 *
 * Phase → verdict:
 * | Phase     | Eligible? | Why                                            |
 * |-----------|-----------|------------------------------------------------|
 * | paused    | yes       | Batch was paused and can continue              |
 * | executing | yes       | Orchestrator disconnected while executing      |
 * | merging   | yes       | Orchestrator disconnected while merging        |
 * | stopped   | no        | Stopped by failure policy                      |
 * | failed    | no        | Terminal failure                               |
 * | completed | no        | Nothing left to do                             |
 * | idle      | no        | Execution never started                        |
 * | planning  | no        | Still planning when persisted                  |
 * | (other)   | no        | Unknown phase                                  |
 *
 * Pure function: reads only `state.phase` and `state.batchId`.
 *
 * @param state - Persisted batch state to inspect
 * @returns Verdict carrying the human-readable reason plus the echoed phase and batch ID
 */
export function checkResumeEligibility(state: PersistedBatchState): ResumeEligibility {
	const id = state.batchId;
	const currentPhase = state.phase;

	// One row per known phase: [eligible, reason]. A Map (rather than an object
	// literal) keeps inherited keys such as "constructor" from matching.
	const verdicts = new Map<string, [boolean, string]>([
		["paused", [true, `Batch ${id} is paused and can be resumed.`]],
		["executing", [true, `Batch ${id} was executing when the orchestrator disconnected. Can be resumed.`]],
		["merging", [true, `Batch ${id} was merging when the orchestrator disconnected. Can be resumed.`]],
		["stopped", [false, `Batch ${id} was stopped by failure policy. Use /orch-abort to clean up, then start a new batch.`]],
		["failed", [false, `Batch ${id} has a terminal failure. Use /orch-abort to clean up, then start a new batch.`]],
		["completed", [false, `Batch ${id} already completed. Delete the state file or start a new batch.`]],
		["idle", [false, `Batch ${id} never started execution. Start a new batch with /orch.`]],
		["planning", [false, `Batch ${id} was still in planning phase. Start a new batch with /orch.`]],
	]);

	// Anything not in the table is reported as an unknown, non-resumable phase.
	const fallback: [boolean, string] = [
		false,
		`Batch ${id} has unknown phase "${currentPhase}". Delete the state file and start a new batch.`,
	];
	const [eligible, reason] = verdicts.get(currentPhase) ?? fallback;

	return { eligible, reason, phase: currentPhase, batchId: id };
}
117
-
118
/**
 * Reconcile persisted task states against live signals.
 *
 * For each task in the persisted state, picks the action to take on resume
 * from three signals supplied by the caller: whether the task's session is
 * alive, whether its .DONE marker exists, and whether its worktree exists.
 *
 * Precedence rules (applied per task, first match wins):
 *  1. .DONE found                                   → "mark-complete" (liveStatus "succeeded"),
 *     even if the session is still alive.
 *  2. Session alive, no .DONE                       → "reconnect" (liveStatus "running").
 *  3. Persisted status is terminal
 *     (succeeded/failed/stalled/skipped)            → "skip" (liveStatus = persisted status).
 *  4. Session dead, no .DONE, worktree exists       → "re-execute" (liveStatus "pending").
 *  5. Session dead, no .DONE, no worktree           → "mark-failed" (liveStatus "failed").
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded batch state; only `tasks` is read
 * @param aliveSessions - Session names currently alive (matched against `task.sessionName`)
 * @param doneTaskIds - Task IDs whose .DONE files exist
 * @param existingWorktrees - Task IDs whose worktree still exists; defaults to an
 *   empty set, in which case rule 4 never applies
 * @returns One reconciled entry per persisted task, in persisted order
 */
export function reconcileTaskStates(
	persistedState: PersistedBatchState,
	aliveSessions: ReadonlySet<string>,
	doneTaskIds: ReadonlySet<string>,
	existingWorktrees: ReadonlySet<string> = new Set(),
): ReconciledTaskState[] {
	// Built once per call instead of once per task.
	const terminalStatuses: ReadonlySet<LaneTaskStatus> = new Set<LaneTaskStatus>([
		"succeeded",
		"failed",
		"stalled",
		"skipped",
	]);

	return persistedState.tasks.map((task): ReconciledTaskState => {
		const sessionAlive = aliveSessions.has(task.sessionName);
		const doneFileFound = doneTaskIds.has(task.taskId);
		const worktreeExists = existingWorktrees.has(task.taskId);

		// Every rule reports the observed signals unchanged; only the live status
		// and the chosen action differ. Key order matches the persisted shape.
		const resolve = (
			liveStatus: LaneTaskStatus,
			action: ReconciledTaskState["action"],
		): ReconciledTaskState => ({
			taskId: task.taskId,
			persistedStatus: task.status,
			liveStatus,
			sessionAlive,
			doneFileFound,
			worktreeExists,
			action,
		});

		// Rule 1: the .DONE marker wins over everything else.
		if (doneFileFound) return resolve("succeeded", "mark-complete");

		// Rule 2: still running — attach to the live session.
		if (sessionAlive) return resolve("running", "reconnect");

		// Rule 3: already resolved in the original run.
		if (terminalStatuses.has(task.status)) return resolve(task.status, "skip");

		// Rule 4: interrupted, but the worktree survived — run it again there.
		if (worktreeExists) return resolve("pending", "re-execute");

		// Rule 5: interrupted with nothing left to pick up from.
		return resolve("failed", "mark-failed");
	});
}
215
-
216
/**
 * Compute the resume point from reconciled task states and the wave plan.
 *
 * Three things are derived:
 *  - Category lists. "mark-complete"/"skip" entries count as completed when
 *    either status is "succeeded", as failed when either status is "failed"
 *    or "stalled", and are otherwise (e.g. "skipped") left uncategorized.
 *    "reconnect", "re-execute" and "mark-failed" entries go to their own lists
 *    ("mark-failed" joins the failed list).
 *  - `resumeWaveIndex`: index of the first wave containing a task that is not
 *    settled. A task is settled when its action is "mark-complete", or when it
 *    is "skip" with a succeeded/failed/stalled status. Tasks missing from
 *    `reconciledTasks`, and "reconnect"/"re-execute"/"mark-failed" tasks, keep
 *    their wave open. If every wave is settled the index equals
 *    `wavePlan.length`.
 *  - `pendingTaskIds`: tasks in the resume wave or later that are unknown to
 *    reconciliation, need reconnection, need re-execution, or were skipped
 *    while their persisted status was "pending".
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded batch state; only `wavePlan` is read
 * @param reconciledTasks - Output of task reconciliation
 * @returns Resume point with wave index and categorized task IDs
 */
export function computeResumePoint(
	persistedState: PersistedBatchState,
	reconciledTasks: ReconciledTaskState[],
): ResumePoint {
	const { wavePlan } = persistedState;

	// Lookup: taskId → reconciled entry (a later duplicate overrides an earlier one).
	const reconciledById = new Map<string, ReconciledTaskState>();
	for (const entry of reconciledTasks) {
		reconciledById.set(entry.taskId, entry);
	}

	// True when either the live or the persisted status is one of `wanted`.
	const hasStatus = (entry: ReconciledTaskState, ...wanted: LaneTaskStatus[]): boolean =>
		wanted.some((status) => entry.liveStatus === status || entry.persistedStatus === status);

	// ── Categorize every reconciled task ─────────────────────────
	const completedTaskIds: string[] = [];
	const failedTaskIds: string[] = [];
	const reconnectTaskIds: string[] = [];
	const reExecuteTaskIds: string[] = [];

	for (const entry of reconciledTasks) {
		switch (entry.action) {
			case "mark-complete":
			case "skip":
				if (hasStatus(entry, "succeeded")) {
					completedTaskIds.push(entry.taskId);
				} else if (hasStatus(entry, "failed", "stalled")) {
					failedTaskIds.push(entry.taskId);
				}
				// Anything else (e.g. skipped in the original run) is neither completed nor failed.
				break;
			case "reconnect":
				reconnectTaskIds.push(entry.taskId);
				break;
			case "re-execute":
				reExecuteTaskIds.push(entry.taskId);
				break;
			case "mark-failed":
				failedTaskIds.push(entry.taskId);
				break;
		}
	}

	// ── Locate the first wave that still has unsettled work ──────
	const isSettled = (taskId: string): boolean => {
		const entry = reconciledById.get(taskId);
		if (entry === undefined) return false;
		if (entry.action === "mark-complete") return true;
		return entry.action === "skip" && hasStatus(entry, "succeeded", "failed", "stalled");
	};

	const firstOpenWave = wavePlan.findIndex((wave) => !wave.every(isSettled));
	// -1 means every wave is settled: point past the end of the plan.
	const resumeWaveIndex = firstOpenWave === -1 ? wavePlan.length : firstOpenWave;

	// ── Collect tasks that still need to run from that wave on ───
	const pendingTaskIds: string[] = [];
	for (const wave of wavePlan.slice(resumeWaveIndex)) {
		for (const taskId of wave) {
			const entry = reconciledById.get(taskId);
			const needsRun =
				entry === undefined || // unknown to reconciliation — treat as pending
				entry.action === "reconnect" ||
				entry.action === "re-execute" ||
				(entry.action === "skip" && entry.persistedStatus === "pending");
			if (needsRun) {
				pendingTaskIds.push(taskId);
			}
		}
	}

	return {
		resumeWaveIndex,
		completedTaskIds,
		pendingTaskIds,
		failedTaskIds,
		reconnectTaskIds,
		reExecuteTaskIds,
	};
}
329
-
330
-
331
- export async function resumeOrchBatch(
332
- orchConfig: OrchestratorConfig,
333
- runnerConfig: TaskRunnerConfig,
334
- cwd: string,
335
- batchState: OrchBatchRuntimeState,
336
- onNotify: (message: string, level: "info" | "warning" | "error") => void,
337
- onMonitorUpdate?: MonitorUpdateCallback,
338
- ): Promise<void> {
339
- const repoRoot = cwd;
340
- const prefix = orchConfig.orchestrator.tmux_prefix;
341
-
342
- // ── 1. Load persisted state ──────────────────────────────────
343
- let persistedState: PersistedBatchState | null;
344
- try {
345
- persistedState = loadBatchState(repoRoot);
346
- } catch (err: unknown) {
347
- if (err instanceof StateFileError) {
348
- onNotify(
349
- `❌ Cannot resume: ${err.message}`,
350
- "error",
351
- );
352
- return;
353
- }
354
- throw err;
355
- }
356
-
357
- if (!persistedState) {
358
- onNotify(
359
- ORCH_MESSAGES.resumeNoState(),
360
- "error",
361
- );
362
- return;
363
- }
364
-
365
- // ── 2. Check eligibility ─────────────────────────────────────
366
- const eligibility = checkResumeEligibility(persistedState);
367
- if (!eligibility.eligible) {
368
- onNotify(
369
- ORCH_MESSAGES.resumePhaseNotResumable(persistedState.batchId, persistedState.phase, eligibility.reason),
370
- "error",
371
- );
372
- return;
373
- }
374
-
375
- onNotify(
376
- ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase),
377
- "info",
378
- );
379
-
380
- // ── 3. Discover live signals ─────────────────────────────────
381
- // Check TMUX sessions
382
- const aliveSessions = new Set<string>();
383
- for (const task of persistedState.tasks) {
384
- if (task.sessionName && tmuxHasSession(task.sessionName)) {
385
- aliveSessions.add(task.sessionName);
386
- }
387
- }
388
-
389
- // Check .DONE files
390
- const doneTaskIds = new Set<string>();
391
- for (const task of persistedState.tasks) {
392
- if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
393
- doneTaskIds.add(task.taskId);
394
- }
395
- }
396
-
397
- // ── 3b. Detect existing worktrees ────────────────────────────
398
- const existingWorktreeTaskIds = new Set<string>();
399
- for (const task of persistedState.tasks) {
400
- const laneRecord = persistedState.lanes.find(l => l.taskIds.includes(task.taskId));
401
- if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
402
- existingWorktreeTaskIds.add(task.taskId);
403
- }
404
- }
405
-
406
- // ── 4. Reconcile task states ─────────────────────────────────
407
- const reconciledTasks = reconcileTaskStates(persistedState, aliveSessions, doneTaskIds, existingWorktreeTaskIds);
408
-
409
- // ── 5. Compute resume point ──────────────────────────────────
410
- const resumePoint = computeResumePoint(persistedState, reconciledTasks);
411
- const completedTaskSet = new Set(resumePoint.completedTaskIds);
412
- const failedTaskSet = new Set(resumePoint.failedTaskIds);
413
- const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
414
- const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
415
-
416
- onNotify(
417
- ORCH_MESSAGES.resumeReconciled(
418
- persistedState.batchId,
419
- resumePoint.completedTaskIds.length,
420
- resumePoint.pendingTaskIds.length,
421
- resumePoint.failedTaskIds.length,
422
- resumePoint.reconnectTaskIds.length,
423
- resumePoint.reExecuteTaskIds.length,
424
- ),
425
- "info",
426
- );
427
-
428
- if (resumePoint.reconnectTaskIds.length > 0) {
429
- onNotify(
430
- ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length),
431
- "info",
432
- );
433
- }
434
-
435
- if (resumePoint.resumeWaveIndex > 0) {
436
- onNotify(
437
- ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex),
438
- "info",
439
- );
440
- }
441
-
442
- // ── 6. Reconstruct runtime state ─────────────────────────────
443
- batchState.phase = "executing";
444
- batchState.batchId = persistedState.batchId;
445
- batchState.startedAt = persistedState.startedAt;
446
- batchState.pauseSignal = { paused: false };
447
- batchState.totalWaves = persistedState.totalWaves;
448
- batchState.totalTasks = persistedState.totalTasks;
449
- batchState.succeededTasks = resumePoint.completedTaskIds.length;
450
- batchState.failedTasks = resumePoint.failedTaskIds.length;
451
- batchState.skippedTasks = persistedState.skippedTasks;
452
- batchState.blockedTasks = persistedState.blockedTasks;
453
- batchState.blockedTaskIds = new Set(persistedState.blockedTaskIds);
454
- batchState.errors = [...persistedState.errors];
455
- batchState.endedAt = null;
456
- batchState.currentWaveIndex = resumePoint.resumeWaveIndex;
457
- batchState.waveResults = [];
458
-
459
- // ── 7. Re-run discovery for ParsedTask metadata ──────────────
460
- // We need fresh ParsedTask data (taskFolder, promptPath) for execution.
461
- // Use "all" to discover all areas.
462
- const discovery = runDiscovery("all", runnerConfig.task_areas, cwd, {
463
- refreshDependencies: false,
464
- dependencySource: orchConfig.dependencies.source,
465
- useDependencyCache: orchConfig.dependencies.cache,
466
- });
467
-
468
- // Build dependency graph for skip-dependents policy
469
- const depGraph = buildDependencyGraph(discovery.pending, discovery.completed);
470
- batchState.dependencyGraph = depGraph;
471
-
472
- // ── 8. Handle alive sessions (reconnect) ─────────────────────
473
- // For tasks with alive sessions, we need to wait for them to complete.
474
- // We poll each alive session's .DONE file.
475
- const reconnectTasks = reconciledTasks.filter(t => t.action === "reconnect");
476
- const reconnectFinalStatus = new Map<string, LaneTaskStatus>();
477
-
478
- if (reconnectTasks.length > 0) {
479
- // Wait for reconnected tasks to complete (poll .DONE files)
480
- for (const task of reconnectTasks) {
481
- const parsedTask = discovery.pending.get(task.taskId);
482
- if (!parsedTask) continue;
483
-
484
- // Find the lane info from persisted state
485
- const laneRecord = persistedState.lanes.find(
486
- l => l.taskIds.includes(task.taskId),
487
- );
488
- if (!laneRecord) continue;
489
-
490
- // Build a minimal AllocatedLane for polling
491
- const allocatedTask: AllocatedTask = {
492
- taskId: task.taskId,
493
- order: 0,
494
- task: parsedTask,
495
- estimatedMinutes: 0,
496
- };
497
- const lane: AllocatedLane = {
498
- laneNumber: laneRecord.laneNumber,
499
- laneId: laneRecord.laneId,
500
- tmuxSessionName: laneRecord.tmuxSessionName,
501
- worktreePath: laneRecord.worktreePath,
502
- branch: laneRecord.branch,
503
- tasks: [allocatedTask],
504
- strategy: "round-robin",
505
- estimatedLoad: 0,
506
- estimatedMinutes: 0,
507
- };
508
-
509
- execLog("resume", task.taskId, "reconnecting to alive session", {
510
- session: laneRecord.tmuxSessionName,
511
- });
512
-
513
- // Poll until task completes
514
- try {
515
- const pollResult = await pollUntilTaskComplete(
516
- lane,
517
- allocatedTask,
518
- orchConfig,
519
- repoRoot,
520
- batchState.pauseSignal,
521
- );
522
-
523
- if (pollResult.status === "succeeded") {
524
- reconnectFinalStatus.set(task.taskId, "succeeded");
525
- completedTaskSet.add(task.taskId);
526
- failedTaskSet.delete(task.taskId);
527
- reconnectTaskSet.delete(task.taskId);
528
- batchState.succeededTasks++;
529
- execLog("resume", task.taskId, "reconnected task succeeded");
530
- } else {
531
- reconnectFinalStatus.set(task.taskId, "failed");
532
- failedTaskSet.add(task.taskId);
533
- completedTaskSet.delete(task.taskId);
534
- reconnectTaskSet.delete(task.taskId);
535
- batchState.failedTasks++;
536
- execLog("resume", task.taskId, `reconnected task ${pollResult.status}: ${pollResult.exitReason}`);
537
- }
538
- } catch (err: unknown) {
539
- reconnectFinalStatus.set(task.taskId, "failed");
540
- failedTaskSet.add(task.taskId);
541
- completedTaskSet.delete(task.taskId);
542
- reconnectTaskSet.delete(task.taskId);
543
- batchState.failedTasks++;
544
- const msg = err instanceof Error ? err.message : String(err);
545
- execLog("resume", task.taskId, `reconnection error: ${msg}`);
546
- }
547
- }
548
- }
549
-
550
- // ── 8b. Handle re-execute tasks (dead session + existing worktree) ──
551
- const reExecuteTasks = reconciledTasks.filter(t => t.action === "re-execute");
552
- const reExecuteFinalStatus = new Map<string, LaneTaskStatus>();
553
- const reExecAllocatedLanes: AllocatedLane[] = [];
554
-
555
- if (reExecuteTasks.length > 0) {
556
- onNotify(
557
- `🔄 Re-executing ${reExecuteTasks.length} interrupted task(s) in existing worktrees...`,
558
- "info",
559
- );
560
-
561
- for (const task of reExecuteTasks) {
562
- const parsedTask = discovery.pending.get(task.taskId);
563
- if (!parsedTask) continue;
564
-
565
- const laneRecord = persistedState.lanes.find(
566
- l => l.taskIds.includes(task.taskId),
567
- );
568
- if (!laneRecord) continue;
569
-
570
- const allocatedTask: AllocatedTask = {
571
- taskId: task.taskId,
572
- order: 0,
573
- task: parsedTask,
574
- estimatedMinutes: 0,
575
- };
576
- const lane: AllocatedLane = {
577
- laneNumber: laneRecord.laneNumber,
578
- laneId: laneRecord.laneId,
579
- tmuxSessionName: laneRecord.tmuxSessionName,
580
- worktreePath: laneRecord.worktreePath,
581
- branch: laneRecord.branch,
582
- tasks: [allocatedTask],
583
- strategy: "round-robin",
584
- estimatedLoad: 0,
585
- estimatedMinutes: 0,
586
- };
587
-
588
- execLog("resume", task.taskId, "re-executing interrupted task in existing worktree", {
589
- session: laneRecord.tmuxSessionName,
590
- worktree: laneRecord.worktreePath,
591
- });
592
-
593
- try {
594
- spawnLaneSession(lane, allocatedTask, orchConfig, repoRoot);
595
- const pollResult = await pollUntilTaskComplete(
596
- lane,
597
- allocatedTask,
598
- orchConfig,
599
- repoRoot,
600
- batchState.pauseSignal,
601
- );
602
-
603
- if (pollResult.status === "succeeded") {
604
- reExecuteFinalStatus.set(task.taskId, "succeeded");
605
- completedTaskSet.add(task.taskId);
606
- failedTaskSet.delete(task.taskId);
607
- reExecuteTaskSet.delete(task.taskId);
608
- batchState.succeededTasks++;
609
- reExecAllocatedLanes.push(lane);
610
- execLog("resume", task.taskId, "re-executed task succeeded");
611
- } else {
612
- reExecuteFinalStatus.set(task.taskId, "failed");
613
- failedTaskSet.add(task.taskId);
614
- completedTaskSet.delete(task.taskId);
615
- reExecuteTaskSet.delete(task.taskId);
616
- batchState.failedTasks++;
617
- execLog("resume", task.taskId, `re-executed task ${pollResult.status}: ${pollResult.exitReason}`);
618
- }
619
- } catch (err: unknown) {
620
- reExecuteFinalStatus.set(task.taskId, "failed");
621
- failedTaskSet.add(task.taskId);
622
- completedTaskSet.delete(task.taskId);
623
- reExecuteTaskSet.delete(task.taskId);
624
- batchState.failedTasks++;
625
- const msg = err instanceof Error ? err.message : String(err);
626
- execLog("resume", task.taskId, `re-execution error: ${msg}`);
627
- }
628
- }
629
- }
630
-
631
- // ── 8c. Merge re-executed lane branches before cleanup ───────
632
- // Re-executed tasks completed outside the normal wave loop, so their
633
- // branches would not be merged by step 10. Merge them now.
634
- if (reExecAllocatedLanes.length > 0) {
635
- const succeededReExecTaskIds = [...reExecuteFinalStatus.entries()]
636
- .filter(([_, status]) => status === "succeeded")
637
- .map(([taskId]) => taskId);
638
-
639
- if (succeededReExecTaskIds.length > 0) {
640
- onNotify(
641
- `🔀 Merging ${reExecAllocatedLanes.length} re-executed lane branch(es)...`,
642
- "info",
643
- );
644
-
645
- // Build synthetic WaveExecutionResult for mergeWave()
646
- const syntheticLaneResults: LaneExecutionResult[] = reExecAllocatedLanes.map(lane => ({
647
- laneNumber: lane.laneNumber,
648
- laneId: lane.laneId,
649
- tasks: lane.tasks.map(t => ({
650
- taskId: t.taskId,
651
- status: "succeeded" as LaneTaskStatus,
652
- startTime: Date.now(),
653
- endTime: Date.now(),
654
- exitReason: "Re-executed task completed successfully",
655
- sessionName: lane.tmuxSessionName,
656
- doneFileFound: true,
657
- })),
658
- overallStatus: "succeeded" as const,
659
- startTime: Date.now(),
660
- endTime: Date.now(),
661
- }));
662
-
663
- const syntheticWaveResult: WaveExecutionResult = {
664
- waveIndex: 0,
665
- startedAt: Date.now(),
666
- endedAt: Date.now(),
667
- laneResults: syntheticLaneResults,
668
- policyApplied: orchConfig.failure.on_task_failure,
669
- stoppedEarly: false,
670
- failedTaskIds: [],
671
- skippedTaskIds: [],
672
- succeededTaskIds: succeededReExecTaskIds,
673
- blockedTaskIds: [],
674
- laneCount: reExecAllocatedLanes.length,
675
- overallStatus: "succeeded",
676
- finalMonitorState: null,
677
- allocatedLanes: reExecAllocatedLanes,
678
- };
679
-
680
- const reExecMergeResult = mergeWave(
681
- reExecAllocatedLanes,
682
- syntheticWaveResult,
683
- 0,
684
- orchConfig,
685
- repoRoot,
686
- batchState.batchId,
687
- );
688
-
689
- if (reExecMergeResult.status === "succeeded") {
690
- onNotify(
691
- `✅ Re-executed branch merge complete: ${reExecMergeResult.laneResults.length} lane(s) merged`,
692
- "info",
693
- );
694
-
695
- // Clean up merged branches
696
- const targetBranch = orchConfig.orchestrator.integration_branch;
697
- for (const lr of reExecMergeResult.laneResults) {
698
- if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
699
- deleteBranchBestEffort(lr.sourceBranch, repoRoot);
700
- }
701
- }
702
- } else {
703
- onNotify(
704
- `⚠️ Re-executed branch merge ${reExecMergeResult.status}: ${reExecMergeResult.failureReason || "unknown"}`,
705
- "warning",
706
- );
707
- }
708
-
709
- batchState.mergeResults.push(reExecMergeResult);
710
- }
711
- }
712
-
713
- // ── 9. Persist state after reconciliation ────────────────────
714
- // Track state for persistence
715
- const wavePlan = persistedState.wavePlan;
716
- const allTaskOutcomes: LaneTaskOutcome[] = [];
717
- let latestAllocatedLanes: AllocatedLane[] = [];
718
-
719
- // Build outcomes from reconciled tasks
720
- for (const task of reconciledTasks) {
721
- const persistedTask = persistedState.tasks.find(t => t.taskId === task.taskId);
722
- const reconnectStatus = reconnectFinalStatus.get(task.taskId);
723
- const reExecuteStatus = reExecuteFinalStatus.get(task.taskId);
724
- const status = task.action === "reconnect"
725
- ? (reconnectStatus || "running")
726
- : task.action === "re-execute"
727
- ? (reExecuteStatus || "pending")
728
- : task.liveStatus;
729
- const isTerminal = status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";
730
- allTaskOutcomes.push({
731
- taskId: task.taskId,
732
- status,
733
- startTime: persistedTask?.startedAt ?? null,
734
- endTime: isTerminal ? Date.now() : null,
735
- exitReason: task.action === "mark-complete" ? ".DONE file found on resume"
736
- : task.action === "mark-failed" ? "Session dead, no .DONE file, no worktree on resume"
737
- : task.action === "reconnect"
738
- ? (status === "succeeded" ? "Reconnected task completed" : status === "failed" ? "Reconnected task failed" : "Reconnected to alive session")
739
- : task.action === "re-execute"
740
- ? (status === "succeeded" ? "Re-executed task completed" : status === "failed" ? "Re-executed task failed" : "Re-executing in existing worktree")
741
- : persistedTask?.exitReason ?? "",
742
- sessionName: persistedTask?.sessionName ?? "",
743
- doneFileFound: status === "succeeded" ? true : task.doneFileFound,
744
- });
745
- }
746
-
747
- persistRuntimeState("resume-reconciliation", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery ?? null, repoRoot);
748
-
749
- // ── 10. Continue wave execution ──────────────────────────────
750
- // We need to execute remaining waves starting from resumeWaveIndex.
751
- // For waves where some tasks are already done, we filter them out.
752
-
753
- let preserveWorktreesForResume = false;
754
-
755
- for (let waveIdx = resumePoint.resumeWaveIndex; waveIdx < persistedState.wavePlan.length; waveIdx++) {
756
- // Check pause signal
757
- if (batchState.pauseSignal.paused) {
758
- batchState.phase = "paused";
759
- persistRuntimeState("pause-before-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
760
- onNotify(`⏸️ Batch paused before wave ${waveIdx + 1}.`, "warning");
761
- break;
762
- }
763
-
764
- batchState.currentWaveIndex = waveIdx;
765
- persistRuntimeState("wave-index-change", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
766
-
767
- // Get wave tasks, filtering out completed/failed/blocked ones.
768
- let waveTasks = persistedState.wavePlan[waveIdx].filter(
769
- taskId => !completedTaskSet.has(taskId) &&
770
- !failedTaskSet.has(taskId) &&
771
- !batchState.blockedTaskIds.has(taskId),
772
- );
773
-
774
- // Also filter tasks where discovery doesn't have them as pending
775
- waveTasks = waveTasks.filter(taskId => discovery.pending.has(taskId));
776
-
777
- const blockedInWave = persistedState.wavePlan[waveIdx].filter(
778
- taskId => batchState.blockedTaskIds.has(taskId),
779
- );
780
- if (blockedInWave.length > 0) {
781
- batchState.blockedTasks += blockedInWave.length;
782
- }
783
-
784
- if (waveTasks.length === 0) {
785
- execLog("resume", batchState.batchId, `wave ${waveIdx + 1}: no tasks to execute (all completed/blocked)`);
786
- continue;
787
- }
788
-
789
- onNotify(
790
- ORCH_MESSAGES.orchWaveStart(waveIdx + 1, persistedState.wavePlan.length, waveTasks.length, Math.min(waveTasks.length, orchConfig.orchestrator.max_lanes)),
791
- "info",
792
- );
793
-
794
- const handleResumeMonitorUpdate: MonitorUpdateCallback = (monitorState) => {
795
- const changed = syncTaskOutcomesFromMonitor(monitorState, allTaskOutcomes);
796
- if (changed) {
797
- persistRuntimeState("task-transition", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
798
- }
799
- onMonitorUpdate?.(monitorState);
800
- };
801
-
802
- // Execute the wave
803
- const waveResult = await executeWave(
804
- waveTasks,
805
- waveIdx + 1,
806
- discovery.pending,
807
- orchConfig,
808
- repoRoot,
809
- batchState.batchId,
810
- batchState.pauseSignal,
811
- depGraph,
812
- handleResumeMonitorUpdate,
813
- (lanes) => {
814
- latestAllocatedLanes = lanes;
815
- batchState.currentLanes = lanes;
816
- if (seedPendingOutcomesForAllocatedLanes(lanes, allTaskOutcomes)) {
817
- persistRuntimeState("wave-lanes-allocated", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
818
- }
819
- },
820
- );
821
-
822
- batchState.waveResults.push(waveResult);
823
- batchState.currentLanes = [];
824
-
825
- // Accumulate task outcomes
826
- latestAllocatedLanes = waveResult.allocatedLanes;
827
- for (const lr of waveResult.laneResults) {
828
- for (const taskOutcome of lr.tasks) {
829
- upsertTaskOutcome(allTaskOutcomes, taskOutcome);
830
- }
831
- }
832
-
833
- // Accumulate results
834
- batchState.succeededTasks += waveResult.succeededTaskIds.length;
835
- batchState.failedTasks += waveResult.failedTaskIds.length;
836
- batchState.skippedTasks += waveResult.skippedTaskIds.length;
837
-
838
- for (const taskId of waveResult.succeededTaskIds) {
839
- completedTaskSet.add(taskId);
840
- failedTaskSet.delete(taskId);
841
- reconnectTaskSet.delete(taskId);
842
- }
843
- for (const taskId of waveResult.failedTaskIds) {
844
- failedTaskSet.add(taskId);
845
- completedTaskSet.delete(taskId);
846
- reconnectTaskSet.delete(taskId);
847
- }
848
-
849
- for (const blocked of waveResult.blockedTaskIds) {
850
- batchState.blockedTaskIds.add(blocked);
851
- }
852
-
853
- persistRuntimeState("wave-execution-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
854
-
855
- const elapsedSec = Math.round((waveResult.endedAt - waveResult.startedAt) / 1000);
856
- onNotify(
857
- ORCH_MESSAGES.orchWaveComplete(
858
- waveIdx + 1,
859
- waveResult.succeededTaskIds.length,
860
- waveResult.failedTaskIds.length,
861
- waveResult.skippedTaskIds.length,
862
- elapsedSec,
863
- ),
864
- waveResult.failedTaskIds.length > 0 ? "warning" : "info",
865
- );
866
-
867
- // Check failure policy
868
- if (waveResult.stoppedEarly) {
869
- if (waveResult.policyApplied === "stop-all") {
870
- batchState.phase = "stopped";
871
- persistRuntimeState("stop-all", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
872
- onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-all"), "error");
873
- break;
874
- }
875
- if (waveResult.policyApplied === "stop-wave") {
876
- batchState.phase = "stopped";
877
- persistRuntimeState("stop-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
878
- onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-wave"), "error");
879
- break;
880
- }
881
- }
882
-
883
- // Merge handling (same as executeOrchBatch)
884
- let mergeResult: MergeWaveResult | null = null;
885
-
886
- const laneOutcomeByNumber = new Map<number, LaneExecutionResult>();
887
- for (const lr of waveResult.laneResults) {
888
- laneOutcomeByNumber.set(lr.laneNumber, lr);
889
- }
890
- const mixedOutcomeLanes = waveResult.laneResults.filter(lr => {
891
- const hasSucceeded = lr.tasks.some(t => t.status === "succeeded");
892
- const hasHardFailure = lr.tasks.some(
893
- t => t.status === "failed" || t.status === "stalled",
894
- );
895
- return hasSucceeded && hasHardFailure;
896
- });
897
-
898
- if (waveResult.succeededTaskIds.length > 0) {
899
- const mergeableLaneCount = waveResult.allocatedLanes.filter(lane => {
900
- const outcome = laneOutcomeByNumber.get(lane.laneNumber);
901
- if (!outcome) return false;
902
- const hasSucceeded = outcome.tasks.some(t => t.status === "succeeded");
903
- const hasHardFailure = outcome.tasks.some(
904
- t => t.status === "failed" || t.status === "stalled",
905
- );
906
- return hasSucceeded && !hasHardFailure;
907
- }).length;
908
-
909
- if (mergeableLaneCount > 0) {
910
- batchState.phase = "merging";
911
- persistRuntimeState("merge-start", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
912
- onNotify(ORCH_MESSAGES.orchMergeStart(waveIdx + 1, mergeableLaneCount), "info");
913
-
914
- mergeResult = mergeWave(
915
- waveResult.allocatedLanes,
916
- waveResult,
917
- waveIdx + 1,
918
- orchConfig,
919
- repoRoot,
920
- batchState.batchId,
921
- );
922
- batchState.mergeResults.push(mergeResult);
923
-
924
- // Emit per-lane merge notifications
925
- for (const lr of mergeResult.laneResults) {
926
- const durationSec = Math.round(lr.durationMs / 1000);
927
- if (lr.result?.status === "SUCCESS") {
928
- onNotify(ORCH_MESSAGES.orchMergeLaneSuccess(lr.laneNumber, lr.result.merge_commit, durationSec), "info");
929
- } else if (lr.result?.status === "CONFLICT_RESOLVED") {
930
- onNotify(ORCH_MESSAGES.orchMergeLaneConflictResolved(lr.laneNumber, lr.result.conflicts.length, durationSec), "info");
931
- } else if (lr.result?.status === "CONFLICT_UNRESOLVED" || lr.result?.status === "BUILD_FAILURE") {
932
- onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error || lr.result.status), "error");
933
- } else if (lr.error) {
934
- onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error), "error");
935
- }
936
- }
937
-
938
- if (mixedOutcomeLanes.length > 0) {
939
- const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
940
- const failureReason =
941
- `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
942
- `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`;
943
- mergeResult = { ...mergeResult, status: "partial", failedLane: mixedOutcomeLanes[0].laneNumber, failureReason };
944
- }
945
-
946
- const mergedCount = mergeResult.laneResults.filter(
947
- r => r.result?.status === "SUCCESS" || r.result?.status === "CONFLICT_RESOLVED",
948
- ).length;
949
- const mergeTotalSec = Math.round(mergeResult.totalDurationMs / 1000);
950
-
951
- if (mergeResult.status === "succeeded") {
952
- onNotify(ORCH_MESSAGES.orchMergeComplete(waveIdx + 1, mergedCount, mergeTotalSec), "info");
953
- } else {
954
- onNotify(
955
- ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane ?? 0, mergeResult.failureReason || "unknown"),
956
- "error",
957
- );
958
- }
959
-
960
- batchState.phase = "executing";
961
- persistRuntimeState("merge-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
962
- } else if (mixedOutcomeLanes.length > 0) {
963
- const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
964
- mergeResult = {
965
- waveIndex: waveIdx + 1,
966
- status: "partial",
967
- laneResults: [],
968
- failedLane: mixedOutcomeLanes[0].laneNumber,
969
- failureReason:
970
- `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
971
- `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`,
972
- totalDurationMs: 0,
973
- };
974
- onNotify(
975
- ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane, mergeResult.failureReason || "unknown"),
976
- "error",
977
- );
978
- } else {
979
- onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
980
- }
981
- } else {
982
- onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
983
- }
984
-
985
- // Handle merge failure
986
- if (mergeResult && (mergeResult.status === "failed" || mergeResult.status === "partial")) {
987
- const mergeFailurePolicy = orchConfig.failure.on_merge_failure;
988
-
989
- if (mergeFailurePolicy === "pause") {
990
- batchState.phase = "paused";
991
- batchState.errors.push(
992
- `Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
993
- `Batch paused. Resolve conflicts and use /orch-resume to continue.`,
994
- );
995
- persistRuntimeState("merge-failure-pause", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
996
- onNotify(
997
- `⏸️ Batch paused due to merge failure at wave ${waveIdx + 1}. ` +
998
- `Resolve conflicts and resume.`,
999
- "error",
1000
- );
1001
- preserveWorktreesForResume = true;
1002
- break;
1003
- } else {
1004
- batchState.phase = "stopped";
1005
- batchState.errors.push(
1006
- `Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
1007
- `Batch aborted by on_merge_failure policy.`,
1008
- );
1009
- persistRuntimeState("merge-failure-abort", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
1010
- onNotify(
1011
- `⛔ Batch aborted due to merge failure at wave ${waveIdx + 1}.`,
1012
- "error",
1013
- );
1014
- preserveWorktreesForResume = true;
1015
- break;
1016
- }
1017
- }
1018
-
1019
- // Post-merge: reset worktrees for next wave
1020
- if (mergeResult && mergeResult.status === "succeeded") {
1021
- const targetBranch = orchConfig.orchestrator.integration_branch;
1022
- for (const lr of mergeResult.laneResults) {
1023
- if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
1024
- const ancestorCheck = runGit(["merge-base", "--is-ancestor", lr.sourceBranch, targetBranch], repoRoot);
1025
- if (ancestorCheck.ok) {
1026
- deleteBranchBestEffort(lr.sourceBranch, repoRoot);
1027
- }
1028
- }
1029
- }
1030
- }
1031
-
1032
- if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
1033
- const wtPrefix = orchConfig.orchestrator.worktree_prefix;
1034
- const existingWorktrees = listWorktrees(wtPrefix, repoRoot);
1035
- if (existingWorktrees.length > 0) {
1036
- const targetBranch = orchConfig.orchestrator.integration_branch;
1037
- for (const wt of existingWorktrees) {
1038
- const resetResult = safeResetWorktree(wt, targetBranch, repoRoot);
1039
- if (!resetResult.success) {
1040
- try { removeWorktree(wt, repoRoot); } catch { /* best effort */ }
1041
- }
1042
- }
1043
- }
1044
- }
1045
- }
1046
-
1047
- // ── 11. Cleanup and terminal state ───────────────────────────
1048
- if (!preserveWorktreesForResume) {
1049
- const wtPrefix = orchConfig.orchestrator.worktree_prefix;
1050
- const targetBranch = orchConfig.orchestrator.integration_branch;
1051
- removeAllWorktrees(wtPrefix, repoRoot, targetBranch);
1052
- }
1053
-
1054
- batchState.endedAt = Date.now();
1055
- const totalElapsedSec = Math.round((batchState.endedAt - batchState.startedAt) / 1000);
1056
-
1057
- if ((batchState.phase as OrchBatchPhase) === "executing" || (batchState.phase as OrchBatchPhase) === "merging") {
1058
- if (batchState.failedTasks > 0) {
1059
- batchState.phase = "failed";
1060
- } else {
1061
- batchState.phase = "completed";
1062
- }
1063
- }
1064
-
1065
- persistRuntimeState("batch-terminal", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
1066
-
1067
- if (batchState.phase === "paused" || batchState.phase === "stopped") {
1068
- execLog("resume", batchState.batchId, "resumed batch ended in non-terminal state", { phase: batchState.phase });
1069
- } else {
1070
- onNotify(
1071
- ORCH_MESSAGES.resumeComplete(
1072
- batchState.batchId,
1073
- batchState.succeededTasks,
1074
- batchState.failedTasks,
1075
- batchState.skippedTasks,
1076
- batchState.blockedTasks,
1077
- totalElapsedSec,
1078
- ),
1079
- batchState.failedTasks > 0 ? "warning" : "info",
1080
- );
1081
-
1082
- if (batchState.phase === "completed") {
1083
- try {
1084
- deleteBatchState(repoRoot);
1085
- execLog("state", batchState.batchId, "state file deleted on clean resume completion");
1086
- } catch {
1087
- // Best-effort
1088
- }
1089
- }
1090
- }
1091
- }
1092
-
1
+ /**
2
+ * Resume logic for paused/interrupted batches
3
+ * @module orch/resume
4
+ */
5
+ import { existsSync } from "fs";
6
+ import { join } from "path";
7
+
8
+ import { runDiscovery } from "./discovery.ts";
9
+ import { executeOrchBatch } from "./engine.ts";
10
+ import { execLog, executeWave, pollUntilTaskComplete, spawnLaneSession, tmuxHasSession } from "./execution.ts";
11
+ import type { MonitorUpdateCallback } from "./execution.ts";
12
+ import { runGit } from "./git.ts";
13
+ import { mergeWave } from "./merge.ts";
14
+ import { ORCH_MESSAGES } from "./messages.ts";
15
+ import { deleteBatchState, hasTaskDoneMarker, loadBatchState, persistRuntimeState, seedPendingOutcomesForAllocatedLanes, syncTaskOutcomesFromMonitor, upsertTaskOutcome } from "./persistence.ts";
16
+ import { StateFileError } from "./types.ts";
17
+ import type { AllocatedLane, AllocatedTask, LaneExecutionResult, LaneTaskOutcome, LaneTaskStatus, MergeWaveResult, OrchBatchPhase, OrchBatchRuntimeState, OrchestratorConfig, ParsedTask, PersistedBatchState, ReconciledTaskState, ResumeEligibility, ResumePoint, TaskRunnerConfig, WaveExecutionResult } from "./types.ts";
18
+ import { buildDependencyGraph } from "./waves.ts";
19
+ import { deleteBranchBestEffort, listWorktrees, removeAllWorktrees, removeWorktree, safeResetWorktree } from "./worktree.ts";
20
+
21
+ // ── Resume Pure Functions ────────────────────────────────────────────
22
+
23
/**
 * Decide whether a persisted batch may be picked up again by resume.
 *
 * Only batches persisted in the "paused", "executing" or "merging" phase are
 * resumable. Every other phase — including one this function does not
 * recognize — is rejected with a reason telling the operator what to do next.
 *
 * | Phase     | Eligible? | Reason                                     |
 * |-----------|-----------|--------------------------------------------|
 * | paused    | ✅        | Batch was paused                           |
 * | executing | ✅        | Orchestrator disconnected while executing  |
 * | merging   | ✅        | Orchestrator disconnected while merging    |
 * | stopped   | ❌        | Batch was stopped by failure policy        |
 * | failed    | ❌        | Batch has a terminal failure               |
 * | completed | ❌        | Batch already completed                    |
 * | idle      | ❌        | Batch never started execution              |
 * | planning  | ❌        | Batch was still planning                   |
 * | (other)   | ❌        | Unknown phase                              |
 *
 * Pure function — no process or filesystem access.
 *
 * @param state - Persisted batch state; only `phase` and `batchId` are read
 * @returns Eligibility verdict that echoes `phase` and `batchId` back
 */
export function checkResumeEligibility(state: PersistedBatchState): ResumeEligibility {
	const { phase, batchId } = state;

	// Every verdict has the same shape; only the flag and the wording differ.
	const verdict = (eligible: boolean, reason: string): ResumeEligibility => ({
		eligible,
		reason,
		phase,
		batchId,
	});

	// Resumable phases.
	if (phase === "paused") {
		return verdict(true, `Batch ${batchId} is paused and can be resumed.`);
	}
	if (phase === "executing") {
		return verdict(true, `Batch ${batchId} was executing when the orchestrator disconnected. Can be resumed.`);
	}
	if (phase === "merging") {
		return verdict(true, `Batch ${batchId} was merging when the orchestrator disconnected. Can be resumed.`);
	}

	// Non-resumable phases.
	if (phase === "stopped") {
		return verdict(false, `Batch ${batchId} was stopped by failure policy. Use /orch-abort to clean up, then start a new batch.`);
	}
	if (phase === "failed") {
		return verdict(false, `Batch ${batchId} has a terminal failure. Use /orch-abort to clean up, then start a new batch.`);
	}
	if (phase === "completed") {
		return verdict(false, `Batch ${batchId} already completed. Delete the state file or start a new batch.`);
	}
	if (phase === "idle") {
		return verdict(false, `Batch ${batchId} never started execution. Start a new batch with /orch.`);
	}
	if (phase === "planning") {
		return verdict(false, `Batch ${batchId} was still in planning phase. Start a new batch with /orch.`);
	}

	// Anything else is a phase value this code does not know about.
	return verdict(false, `Batch ${batchId} has unknown phase "${phase}". Delete the state file and start a new batch.`);
}
117
+
118
/**
 * Reconcile persisted task states against live signals.
 *
 * For each task in the persisted state, determines the correct action from
 * three live signals: whether its TMUX session is alive, whether its .DONE
 * file exists, and whether its lane worktree still exists.
 *
 * Precedence rules (applied per-task, first match wins):
 * 1. .DONE file found → "mark-complete" (even if session is alive — task is done)
 * 2. Session alive + no .DONE → "reconnect" (task is still running)
 * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
 *    (already resolved in the original run; live status mirrors the persisted one)
 * 4. Session dead + no .DONE + not terminal + worktree exists → "re-execute"
 *    (task was interrupted but can be run again; live status becomes "pending")
 * 5. Session dead + no .DONE + not terminal + no worktree → "mark-failed"
 *    (task was interrupted and there is nothing left to resume from)
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param aliveSessions - Set of TMUX session names currently alive
 * @param doneTaskIds - Set of task IDs whose .DONE files exist
 * @param existingWorktrees - Set of task IDs whose worktree still exists.
 *   Defaults to an empty set, in which case rule 4 never matches.
 * @returns Array of reconciled task states in persisted order
 */
export function reconcileTaskStates(
	persistedState: PersistedBatchState,
	aliveSessions: ReadonlySet<string>,
	doneTaskIds: ReadonlySet<string>,
	existingWorktrees: ReadonlySet<string> = new Set(),
): ReconciledTaskState[] {
	// Loop-invariant: statuses that were already resolved by the original run.
	const terminalStatuses: ReadonlySet<LaneTaskStatus> = new Set<LaneTaskStatus>([
		"succeeded",
		"failed",
		"stalled",
		"skipped",
	]);

	return persistedState.tasks.map((task): ReconciledTaskState => {
		const sessionAlive = aliveSessions.has(task.sessionName);
		const doneFileFound = doneTaskIds.has(task.taskId);
		const worktreeExists = existingWorktrees.has(task.taskId);

		let liveStatus: LaneTaskStatus;
		let action: ReconciledTaskState["action"];

		if (doneFileFound) {
			// Precedence 1: .DONE wins over everything, including a live session.
			liveStatus = "succeeded";
			action = "mark-complete";
		} else if (sessionAlive) {
			// Precedence 2: still running — attach to it instead of restarting.
			liveStatus = "running";
			action = "reconnect";
		} else if (terminalStatuses.has(task.status)) {
			// Precedence 3: outcome was already recorded; carry it over unchanged.
			liveStatus = task.status;
			action = "skip";
		} else if (worktreeExists) {
			// Precedence 4: interrupted, but the worktree survives — run it again.
			liveStatus = "pending";
			action = "re-execute";
		} else {
			// Precedence 5: interrupted with nothing left to resume from.
			liveStatus = "failed";
			action = "mark-failed";
		}

		// The three signal flags are reported exactly as observed; each branch
		// above already implies the values they hold at that point.
		return {
			taskId: task.taskId,
			persistedStatus: task.status,
			liveStatus,
			sessionAlive,
			doneFileFound,
			worktreeExists,
			action,
		};
	});
}
215
+
216
/**
 * Compute the resume point from reconciled task states and wave plan.
 *
 * Tasks are first bucketed by their reconciled action:
 * - "mark-complete" / "skip": counted as completed when the live or persisted
 *   status is "succeeded", as failed when it is "failed" or "stalled", and
 *   left out of both buckets otherwise (e.g. tasks skipped in the original run)
 * - "reconnect" → reconnectTaskIds
 * - "re-execute" → reExecuteTaskIds
 * - "mark-failed" → failedTaskIds
 *
 * The resume wave is the first wave that is not finished. A wave is finished
 * only when every task in it is known to reconciliation and is either
 * "mark-complete" or a "skip" whose live/persisted status is succeeded,
 * failed or stalled. If every wave is finished, `resumeWaveIndex` equals
 * `wavePlan.length`.
 *
 * `pendingTaskIds` lists, for the resume wave and all later waves, the tasks
 * that are unknown to reconciliation, need "reconnect", need "re-execute", or
 * are "skip" with a persisted status of "pending".
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state; only `wavePlan` is read
 * @param reconciledTasks - Reconciled task states
 * @returns Resume point with wave index and categorized task IDs
 */
export function computeResumePoint(
	persistedState: PersistedBatchState,
	reconciledTasks: ReconciledTaskState[],
): ResumePoint {
	const { wavePlan } = persistedState;

	// Build lookup: taskId → reconciled state
	const reconciledById = new Map<string, ReconciledTaskState>();
	for (const task of reconciledTasks) {
		reconciledById.set(task.taskId, task);
	}

	// Shared status predicates, used both for bucketing and for the wave check.
	const succeeded = (t: ReconciledTaskState): boolean =>
		t.liveStatus === "succeeded" || t.persistedStatus === "succeeded";
	const hardFailed = (t: ReconciledTaskState): boolean =>
		t.liveStatus === "failed" ||
		t.liveStatus === "stalled" ||
		t.persistedStatus === "failed" ||
		t.persistedStatus === "stalled";

	// Categorize tasks
	const completedTaskIds: string[] = [];
	const failedTaskIds: string[] = [];
	const reconnectTaskIds: string[] = [];
	const reExecuteTaskIds: string[] = [];

	for (const task of reconciledTasks) {
		switch (task.action) {
			case "mark-complete":
			case "skip":
				if (succeeded(task)) {
					completedTaskIds.push(task.taskId);
				} else if (hardFailed(task)) {
					failedTaskIds.push(task.taskId);
				}
				// skipped tasks from original run don't count as completed or failed
				break;
			case "reconnect":
				reconnectTaskIds.push(task.taskId);
				break;
			case "re-execute":
				reExecuteTaskIds.push(task.taskId);
				break;
			case "mark-failed":
				failedTaskIds.push(task.taskId);
				break;
		}
	}

	// A task is "settled" for wave-skip purposes if it completed or failed terminally.
	const isSettled = (taskId: string): boolean => {
		const reconciled = reconciledById.get(taskId);
		if (!reconciled) return false;
		return (
			reconciled.action === "mark-complete" ||
			(reconciled.action === "skip" && (succeeded(reconciled) || hardFailed(reconciled)))
		);
	};

	// Find resume wave: first wave with any unsettled task (past end = all done).
	const firstOpenWave = wavePlan.findIndex((wave) => !wave.every((taskId) => isSettled(taskId)));
	const resumeWaveIndex = firstOpenWave === -1 ? wavePlan.length : firstOpenWave;

	// Determine pending tasks: tasks in resume wave and later that need execution.
	const pendingTaskIds: string[] = [];
	for (const wave of wavePlan.slice(resumeWaveIndex)) {
		for (const taskId of wave) {
			const reconciled = reconciledById.get(taskId);
			const needsWork =
				!reconciled || // Unknown task — treat as pending
				reconciled.action === "reconnect" || // alive session, still running
				reconciled.action === "re-execute" || // existing worktree, run again
				(reconciled.action === "skip" && reconciled.persistedStatus === "pending");
			if (needsWork) {
				pendingTaskIds.push(taskId);
			}
		}
	}

	return {
		resumeWaveIndex,
		completedTaskIds,
		pendingTaskIds,
		failedTaskIds,
		reconnectTaskIds,
		reExecuteTaskIds,
	};
}
329
+
330
+
331
+ export async function resumeOrchBatch(
332
+ orchConfig: OrchestratorConfig,
333
+ runnerConfig: TaskRunnerConfig,
334
+ cwd: string,
335
+ batchState: OrchBatchRuntimeState,
336
+ onNotify: (message: string, level: "info" | "warning" | "error") => void,
337
+ onMonitorUpdate?: MonitorUpdateCallback,
338
+ ): Promise<void> {
339
+ const repoRoot = cwd;
340
+ const prefix = orchConfig.orchestrator.tmux_prefix;
341
+
342
+ // ── 1. Load persisted state ──────────────────────────────────
343
+ let persistedState: PersistedBatchState | null;
344
+ try {
345
+ persistedState = loadBatchState(repoRoot);
346
+ } catch (err: unknown) {
347
+ if (err instanceof StateFileError) {
348
+ onNotify(
349
+ `❌ Cannot resume: ${err.message}`,
350
+ "error",
351
+ );
352
+ return;
353
+ }
354
+ throw err;
355
+ }
356
+
357
+ if (!persistedState) {
358
+ onNotify(
359
+ ORCH_MESSAGES.resumeNoState(),
360
+ "error",
361
+ );
362
+ return;
363
+ }
364
+
365
+ // ── 2. Check eligibility ─────────────────────────────────────
366
+ const eligibility = checkResumeEligibility(persistedState);
367
+ if (!eligibility.eligible) {
368
+ onNotify(
369
+ ORCH_MESSAGES.resumePhaseNotResumable(persistedState.batchId, persistedState.phase, eligibility.reason),
370
+ "error",
371
+ );
372
+ return;
373
+ }
374
+
375
+ onNotify(
376
+ ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase),
377
+ "info",
378
+ );
379
+
380
+ // ── 3. Discover live signals ─────────────────────────────────
381
+ // Check TMUX sessions
382
+ const aliveSessions = new Set<string>();
383
+ for (const task of persistedState.tasks) {
384
+ if (task.sessionName && tmuxHasSession(task.sessionName)) {
385
+ aliveSessions.add(task.sessionName);
386
+ }
387
+ }
388
+
389
+ // Check .DONE files
390
+ const doneTaskIds = new Set<string>();
391
+ for (const task of persistedState.tasks) {
392
+ if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
393
+ doneTaskIds.add(task.taskId);
394
+ }
395
+ }
396
+
397
+ // ── 3b. Detect existing worktrees ────────────────────────────
398
+ const existingWorktreeTaskIds = new Set<string>();
399
+ for (const task of persistedState.tasks) {
400
+ const laneRecord = persistedState.lanes.find(l => l.taskIds.includes(task.taskId));
401
+ if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
402
+ existingWorktreeTaskIds.add(task.taskId);
403
+ }
404
+ }
405
+
406
+ // ── 4. Reconcile task states ─────────────────────────────────
407
+ const reconciledTasks = reconcileTaskStates(persistedState, aliveSessions, doneTaskIds, existingWorktreeTaskIds);
408
+
409
+ // ── 5. Compute resume point ──────────────────────────────────
410
+ const resumePoint = computeResumePoint(persistedState, reconciledTasks);
411
+ const completedTaskSet = new Set(resumePoint.completedTaskIds);
412
+ const failedTaskSet = new Set(resumePoint.failedTaskIds);
413
+ const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
414
+ const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
415
+
416
+ onNotify(
417
+ ORCH_MESSAGES.resumeReconciled(
418
+ persistedState.batchId,
419
+ resumePoint.completedTaskIds.length,
420
+ resumePoint.pendingTaskIds.length,
421
+ resumePoint.failedTaskIds.length,
422
+ resumePoint.reconnectTaskIds.length,
423
+ resumePoint.reExecuteTaskIds.length,
424
+ ),
425
+ "info",
426
+ );
427
+
428
+ if (resumePoint.reconnectTaskIds.length > 0) {
429
+ onNotify(
430
+ ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length),
431
+ "info",
432
+ );
433
+ }
434
+
435
+ if (resumePoint.resumeWaveIndex > 0) {
436
+ onNotify(
437
+ ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex),
438
+ "info",
439
+ );
440
+ }
441
+
442
+ // ── 6. Reconstruct runtime state ─────────────────────────────
443
+ batchState.phase = "executing";
444
+ batchState.batchId = persistedState.batchId;
445
+ batchState.baseBranch = persistedState.baseBranch || "";
446
+ batchState.startedAt = persistedState.startedAt;
447
+ batchState.pauseSignal = { paused: false };
448
+ batchState.totalWaves = persistedState.totalWaves;
449
+ batchState.totalTasks = persistedState.totalTasks;
450
+ batchState.succeededTasks = resumePoint.completedTaskIds.length;
451
+ batchState.failedTasks = resumePoint.failedTaskIds.length;
452
+ batchState.skippedTasks = persistedState.skippedTasks;
453
+ batchState.blockedTasks = persistedState.blockedTasks;
454
+ batchState.blockedTaskIds = new Set(persistedState.blockedTaskIds);
455
+ batchState.errors = [...persistedState.errors];
456
+ batchState.endedAt = null;
457
+ batchState.currentWaveIndex = resumePoint.resumeWaveIndex;
458
+ batchState.waveResults = [];
459
+
460
+ // ── 7. Re-run discovery for ParsedTask metadata ──────────────
461
+ // We need fresh ParsedTask data (taskFolder, promptPath) for execution.
462
+ // Use "all" to discover all areas.
463
+ const discovery = runDiscovery("all", runnerConfig.task_areas, cwd, {
464
+ refreshDependencies: false,
465
+ dependencySource: orchConfig.dependencies.source,
466
+ useDependencyCache: orchConfig.dependencies.cache,
467
+ });
468
+
469
+ // Build dependency graph for skip-dependents policy
470
+ const depGraph = buildDependencyGraph(discovery.pending, discovery.completed);
471
+ batchState.dependencyGraph = depGraph;
472
+
473
+ // ── 8. Handle alive sessions (reconnect) ─────────────────────
474
+ // For tasks with alive sessions, we need to wait for them to complete.
475
+ // We poll each alive session's .DONE file.
476
+ const reconnectTasks = reconciledTasks.filter(t => t.action === "reconnect");
477
+ const reconnectFinalStatus = new Map<string, LaneTaskStatus>();
478
+
479
+ if (reconnectTasks.length > 0) {
480
+ // Wait for reconnected tasks to complete (poll .DONE files)
481
+ for (const task of reconnectTasks) {
482
+ const parsedTask = discovery.pending.get(task.taskId);
483
+ if (!parsedTask) continue;
484
+
485
+ // Find the lane info from persisted state
486
+ const laneRecord = persistedState.lanes.find(
487
+ l => l.taskIds.includes(task.taskId),
488
+ );
489
+ if (!laneRecord) continue;
490
+
491
+ // Build a minimal AllocatedLane for polling
492
+ const allocatedTask: AllocatedTask = {
493
+ taskId: task.taskId,
494
+ order: 0,
495
+ task: parsedTask,
496
+ estimatedMinutes: 0,
497
+ };
498
+ const lane: AllocatedLane = {
499
+ laneNumber: laneRecord.laneNumber,
500
+ laneId: laneRecord.laneId,
501
+ tmuxSessionName: laneRecord.tmuxSessionName,
502
+ worktreePath: laneRecord.worktreePath,
503
+ branch: laneRecord.branch,
504
+ tasks: [allocatedTask],
505
+ strategy: "round-robin",
506
+ estimatedLoad: 0,
507
+ estimatedMinutes: 0,
508
+ };
509
+
510
+ execLog("resume", task.taskId, "reconnecting to alive session", {
511
+ session: laneRecord.tmuxSessionName,
512
+ });
513
+
514
+ // Poll until task completes
515
+ try {
516
+ const pollResult = await pollUntilTaskComplete(
517
+ lane,
518
+ allocatedTask,
519
+ orchConfig,
520
+ repoRoot,
521
+ batchState.pauseSignal,
522
+ );
523
+
524
+ if (pollResult.status === "succeeded") {
525
+ reconnectFinalStatus.set(task.taskId, "succeeded");
526
+ completedTaskSet.add(task.taskId);
527
+ failedTaskSet.delete(task.taskId);
528
+ reconnectTaskSet.delete(task.taskId);
529
+ batchState.succeededTasks++;
530
+ execLog("resume", task.taskId, "reconnected task succeeded");
531
+ } else {
532
+ reconnectFinalStatus.set(task.taskId, "failed");
533
+ failedTaskSet.add(task.taskId);
534
+ completedTaskSet.delete(task.taskId);
535
+ reconnectTaskSet.delete(task.taskId);
536
+ batchState.failedTasks++;
537
+ execLog("resume", task.taskId, `reconnected task ${pollResult.status}: ${pollResult.exitReason}`);
538
+ }
539
+ } catch (err: unknown) {
540
+ reconnectFinalStatus.set(task.taskId, "failed");
541
+ failedTaskSet.add(task.taskId);
542
+ completedTaskSet.delete(task.taskId);
543
+ reconnectTaskSet.delete(task.taskId);
544
+ batchState.failedTasks++;
545
+ const msg = err instanceof Error ? err.message : String(err);
546
+ execLog("resume", task.taskId, `reconnection error: ${msg}`);
547
+ }
548
+ }
549
+ }
550
+
551
+ // ── 8b. Handle re-execute tasks (dead session + existing worktree) ──
552
+ const reExecuteTasks = reconciledTasks.filter(t => t.action === "re-execute");
553
+ const reExecuteFinalStatus = new Map<string, LaneTaskStatus>();
554
+ const reExecAllocatedLanes: AllocatedLane[] = [];
555
+
556
+ if (reExecuteTasks.length > 0) {
557
+ onNotify(
558
+ `🔄 Re-executing ${reExecuteTasks.length} interrupted task(s) in existing worktrees...`,
559
+ "info",
560
+ );
561
+
562
+ for (const task of reExecuteTasks) {
563
+ const parsedTask = discovery.pending.get(task.taskId);
564
+ if (!parsedTask) continue;
565
+
566
+ const laneRecord = persistedState.lanes.find(
567
+ l => l.taskIds.includes(task.taskId),
568
+ );
569
+ if (!laneRecord) continue;
570
+
571
+ const allocatedTask: AllocatedTask = {
572
+ taskId: task.taskId,
573
+ order: 0,
574
+ task: parsedTask,
575
+ estimatedMinutes: 0,
576
+ };
577
+ const lane: AllocatedLane = {
578
+ laneNumber: laneRecord.laneNumber,
579
+ laneId: laneRecord.laneId,
580
+ tmuxSessionName: laneRecord.tmuxSessionName,
581
+ worktreePath: laneRecord.worktreePath,
582
+ branch: laneRecord.branch,
583
+ tasks: [allocatedTask],
584
+ strategy: "round-robin",
585
+ estimatedLoad: 0,
586
+ estimatedMinutes: 0,
587
+ };
588
+
589
+ execLog("resume", task.taskId, "re-executing interrupted task in existing worktree", {
590
+ session: laneRecord.tmuxSessionName,
591
+ worktree: laneRecord.worktreePath,
592
+ });
593
+
594
+ try {
595
+ spawnLaneSession(lane, allocatedTask, orchConfig, repoRoot);
596
+ const pollResult = await pollUntilTaskComplete(
597
+ lane,
598
+ allocatedTask,
599
+ orchConfig,
600
+ repoRoot,
601
+ batchState.pauseSignal,
602
+ );
603
+
604
+ if (pollResult.status === "succeeded") {
605
+ reExecuteFinalStatus.set(task.taskId, "succeeded");
606
+ completedTaskSet.add(task.taskId);
607
+ failedTaskSet.delete(task.taskId);
608
+ reExecuteTaskSet.delete(task.taskId);
609
+ batchState.succeededTasks++;
610
+ reExecAllocatedLanes.push(lane);
611
+ execLog("resume", task.taskId, "re-executed task succeeded");
612
+ } else {
613
+ reExecuteFinalStatus.set(task.taskId, "failed");
614
+ failedTaskSet.add(task.taskId);
615
+ completedTaskSet.delete(task.taskId);
616
+ reExecuteTaskSet.delete(task.taskId);
617
+ batchState.failedTasks++;
618
+ execLog("resume", task.taskId, `re-executed task ${pollResult.status}: ${pollResult.exitReason}`);
619
+ }
620
+ } catch (err: unknown) {
621
+ reExecuteFinalStatus.set(task.taskId, "failed");
622
+ failedTaskSet.add(task.taskId);
623
+ completedTaskSet.delete(task.taskId);
624
+ reExecuteTaskSet.delete(task.taskId);
625
+ batchState.failedTasks++;
626
+ const msg = err instanceof Error ? err.message : String(err);
627
+ execLog("resume", task.taskId, `re-execution error: ${msg}`);
628
+ }
629
+ }
630
+ }
631
+
632
+ // ── 8c. Merge re-executed lane branches before cleanup ───────
633
+ // Re-executed tasks completed outside the normal wave loop, so their
634
+ // branches would not be merged by step 10. Merge them now.
635
+ if (reExecAllocatedLanes.length > 0) {
636
+ const succeededReExecTaskIds = [...reExecuteFinalStatus.entries()]
637
+ .filter(([_, status]) => status === "succeeded")
638
+ .map(([taskId]) => taskId);
639
+
640
+ if (succeededReExecTaskIds.length > 0) {
641
+ onNotify(
642
+ `🔀 Merging ${reExecAllocatedLanes.length} re-executed lane branch(es)...`,
643
+ "info",
644
+ );
645
+
646
+ // Build synthetic WaveExecutionResult for mergeWave()
647
+ const syntheticLaneResults: LaneExecutionResult[] = reExecAllocatedLanes.map(lane => ({
648
+ laneNumber: lane.laneNumber,
649
+ laneId: lane.laneId,
650
+ tasks: lane.tasks.map(t => ({
651
+ taskId: t.taskId,
652
+ status: "succeeded" as LaneTaskStatus,
653
+ startTime: Date.now(),
654
+ endTime: Date.now(),
655
+ exitReason: "Re-executed task completed successfully",
656
+ sessionName: lane.tmuxSessionName,
657
+ doneFileFound: true,
658
+ })),
659
+ overallStatus: "succeeded" as const,
660
+ startTime: Date.now(),
661
+ endTime: Date.now(),
662
+ }));
663
+
664
+ const syntheticWaveResult: WaveExecutionResult = {
665
+ waveIndex: 0,
666
+ startedAt: Date.now(),
667
+ endedAt: Date.now(),
668
+ laneResults: syntheticLaneResults,
669
+ policyApplied: orchConfig.failure.on_task_failure,
670
+ stoppedEarly: false,
671
+ failedTaskIds: [],
672
+ skippedTaskIds: [],
673
+ succeededTaskIds: succeededReExecTaskIds,
674
+ blockedTaskIds: [],
675
+ laneCount: reExecAllocatedLanes.length,
676
+ overallStatus: "succeeded",
677
+ finalMonitorState: null,
678
+ allocatedLanes: reExecAllocatedLanes,
679
+ };
680
+
681
+ const reExecMergeResult = mergeWave(
682
+ reExecAllocatedLanes,
683
+ syntheticWaveResult,
684
+ 0,
685
+ orchConfig,
686
+ repoRoot,
687
+ batchState.batchId,
688
+ batchState.baseBranch,
689
+ );
690
+
691
+ if (reExecMergeResult.status === "succeeded") {
692
+ onNotify(
693
+ `✅ Re-executed branch merge complete: ${reExecMergeResult.laneResults.length} lane(s) merged`,
694
+ "info",
695
+ );
696
+
697
+ // Clean up merged branches
698
+ const targetBranch = batchState.baseBranch;
699
+ for (const lr of reExecMergeResult.laneResults) {
700
+ if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
701
+ deleteBranchBestEffort(lr.sourceBranch, repoRoot);
702
+ }
703
+ }
704
+ } else {
705
+ onNotify(
706
+ `⚠️ Re-executed branch merge ${reExecMergeResult.status}: ${reExecMergeResult.failureReason || "unknown"}`,
707
+ "warning",
708
+ );
709
+ }
710
+
711
+ batchState.mergeResults.push(reExecMergeResult);
712
+ }
713
+ }
714
+
715
+ // ── 9. Persist state after reconciliation ────────────────────
716
+ // Track state for persistence
717
+ const wavePlan = persistedState.wavePlan;
718
+ const allTaskOutcomes: LaneTaskOutcome[] = [];
719
+ let latestAllocatedLanes: AllocatedLane[] = [];
720
+
721
+ // Build outcomes from reconciled tasks
722
+ for (const task of reconciledTasks) {
723
+ const persistedTask = persistedState.tasks.find(t => t.taskId === task.taskId);
724
+ const reconnectStatus = reconnectFinalStatus.get(task.taskId);
725
+ const reExecuteStatus = reExecuteFinalStatus.get(task.taskId);
726
+ const status = task.action === "reconnect"
727
+ ? (reconnectStatus || "running")
728
+ : task.action === "re-execute"
729
+ ? (reExecuteStatus || "pending")
730
+ : task.liveStatus;
731
+ const isTerminal = status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";
732
+ allTaskOutcomes.push({
733
+ taskId: task.taskId,
734
+ status,
735
+ startTime: persistedTask?.startedAt ?? null,
736
+ endTime: isTerminal ? Date.now() : null,
737
+ exitReason: task.action === "mark-complete" ? ".DONE file found on resume"
738
+ : task.action === "mark-failed" ? "Session dead, no .DONE file, no worktree on resume"
739
+ : task.action === "reconnect"
740
+ ? (status === "succeeded" ? "Reconnected task completed" : status === "failed" ? "Reconnected task failed" : "Reconnected to alive session")
741
+ : task.action === "re-execute"
742
+ ? (status === "succeeded" ? "Re-executed task completed" : status === "failed" ? "Re-executed task failed" : "Re-executing in existing worktree")
743
+ : persistedTask?.exitReason ?? "",
744
+ sessionName: persistedTask?.sessionName ?? "",
745
+ doneFileFound: status === "succeeded" ? true : task.doneFileFound,
746
+ });
747
+ }
748
+
749
+ persistRuntimeState("resume-reconciliation", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery ?? null, repoRoot);
750
+
751
+ // ── 10. Continue wave execution ──────────────────────────────
752
+ // We need to execute remaining waves starting from resumeWaveIndex.
753
+ // For waves where some tasks are already done, we filter them out.
754
+
755
+ let preserveWorktreesForResume = false;
756
+
757
+ for (let waveIdx = resumePoint.resumeWaveIndex; waveIdx < persistedState.wavePlan.length; waveIdx++) {
758
+ // Check pause signal
759
+ if (batchState.pauseSignal.paused) {
760
+ batchState.phase = "paused";
761
+ persistRuntimeState("pause-before-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
762
+ onNotify(`⏸️ Batch paused before wave ${waveIdx + 1}.`, "warning");
763
+ break;
764
+ }
765
+
766
+ batchState.currentWaveIndex = waveIdx;
767
+ persistRuntimeState("wave-index-change", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
768
+
769
+ // Get wave tasks, filtering out completed/failed/blocked ones.
770
+ let waveTasks = persistedState.wavePlan[waveIdx].filter(
771
+ taskId => !completedTaskSet.has(taskId) &&
772
+ !failedTaskSet.has(taskId) &&
773
+ !batchState.blockedTaskIds.has(taskId),
774
+ );
775
+
776
+ // Also filter tasks where discovery doesn't have them as pending
777
+ waveTasks = waveTasks.filter(taskId => discovery.pending.has(taskId));
778
+
779
+ const blockedInWave = persistedState.wavePlan[waveIdx].filter(
780
+ taskId => batchState.blockedTaskIds.has(taskId),
781
+ );
782
+ if (blockedInWave.length > 0) {
783
+ batchState.blockedTasks += blockedInWave.length;
784
+ }
785
+
786
+ if (waveTasks.length === 0) {
787
+ execLog("resume", batchState.batchId, `wave ${waveIdx + 1}: no tasks to execute (all completed/blocked)`);
788
+ continue;
789
+ }
790
+
791
+ onNotify(
792
+ ORCH_MESSAGES.orchWaveStart(waveIdx + 1, persistedState.wavePlan.length, waveTasks.length, Math.min(waveTasks.length, orchConfig.orchestrator.max_lanes)),
793
+ "info",
794
+ );
795
+
796
+ const handleResumeMonitorUpdate: MonitorUpdateCallback = (monitorState) => {
797
+ const changed = syncTaskOutcomesFromMonitor(monitorState, allTaskOutcomes);
798
+ if (changed) {
799
+ persistRuntimeState("task-transition", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
800
+ }
801
+ onMonitorUpdate?.(monitorState);
802
+ };
803
+
804
+ // Execute the wave
805
+ const waveResult = await executeWave(
806
+ waveTasks,
807
+ waveIdx + 1,
808
+ discovery.pending,
809
+ orchConfig,
810
+ repoRoot,
811
+ batchState.batchId,
812
+ batchState.pauseSignal,
813
+ depGraph,
814
+ batchState.baseBranch,
815
+ handleResumeMonitorUpdate,
816
+ (lanes) => {
817
+ latestAllocatedLanes = lanes;
818
+ batchState.currentLanes = lanes;
819
+ if (seedPendingOutcomesForAllocatedLanes(lanes, allTaskOutcomes)) {
820
+ persistRuntimeState("wave-lanes-allocated", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
821
+ }
822
+ },
823
+ );
824
+
825
+ batchState.waveResults.push(waveResult);
826
+ batchState.currentLanes = [];
827
+
828
+ // Accumulate task outcomes
829
+ latestAllocatedLanes = waveResult.allocatedLanes;
830
+ for (const lr of waveResult.laneResults) {
831
+ for (const taskOutcome of lr.tasks) {
832
+ upsertTaskOutcome(allTaskOutcomes, taskOutcome);
833
+ }
834
+ }
835
+
836
+ // Accumulate results
837
+ batchState.succeededTasks += waveResult.succeededTaskIds.length;
838
+ batchState.failedTasks += waveResult.failedTaskIds.length;
839
+ batchState.skippedTasks += waveResult.skippedTaskIds.length;
840
+
841
+ for (const taskId of waveResult.succeededTaskIds) {
842
+ completedTaskSet.add(taskId);
843
+ failedTaskSet.delete(taskId);
844
+ reconnectTaskSet.delete(taskId);
845
+ }
846
+ for (const taskId of waveResult.failedTaskIds) {
847
+ failedTaskSet.add(taskId);
848
+ completedTaskSet.delete(taskId);
849
+ reconnectTaskSet.delete(taskId);
850
+ }
851
+
852
+ for (const blocked of waveResult.blockedTaskIds) {
853
+ batchState.blockedTaskIds.add(blocked);
854
+ }
855
+
856
+ persistRuntimeState("wave-execution-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
857
+
858
+ const elapsedSec = Math.round((waveResult.endedAt - waveResult.startedAt) / 1000);
859
+ onNotify(
860
+ ORCH_MESSAGES.orchWaveComplete(
861
+ waveIdx + 1,
862
+ waveResult.succeededTaskIds.length,
863
+ waveResult.failedTaskIds.length,
864
+ waveResult.skippedTaskIds.length,
865
+ elapsedSec,
866
+ ),
867
+ waveResult.failedTaskIds.length > 0 ? "warning" : "info",
868
+ );
869
+
870
+ // Check failure policy
871
+ if (waveResult.stoppedEarly) {
872
+ if (waveResult.policyApplied === "stop-all") {
873
+ batchState.phase = "stopped";
874
+ persistRuntimeState("stop-all", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
875
+ onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-all"), "error");
876
+ break;
877
+ }
878
+ if (waveResult.policyApplied === "stop-wave") {
879
+ batchState.phase = "stopped";
880
+ persistRuntimeState("stop-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
881
+ onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-wave"), "error");
882
+ break;
883
+ }
884
+ }
885
+
886
+ // Merge handling (same as executeOrchBatch)
887
+ let mergeResult: MergeWaveResult | null = null;
888
+
889
+ const laneOutcomeByNumber = new Map<number, LaneExecutionResult>();
890
+ for (const lr of waveResult.laneResults) {
891
+ laneOutcomeByNumber.set(lr.laneNumber, lr);
892
+ }
893
+ const mixedOutcomeLanes = waveResult.laneResults.filter(lr => {
894
+ const hasSucceeded = lr.tasks.some(t => t.status === "succeeded");
895
+ const hasHardFailure = lr.tasks.some(
896
+ t => t.status === "failed" || t.status === "stalled",
897
+ );
898
+ return hasSucceeded && hasHardFailure;
899
+ });
900
+
901
+ if (waveResult.succeededTaskIds.length > 0) {
902
+ const mergeableLaneCount = waveResult.allocatedLanes.filter(lane => {
903
+ const outcome = laneOutcomeByNumber.get(lane.laneNumber);
904
+ if (!outcome) return false;
905
+ const hasSucceeded = outcome.tasks.some(t => t.status === "succeeded");
906
+ const hasHardFailure = outcome.tasks.some(
907
+ t => t.status === "failed" || t.status === "stalled",
908
+ );
909
+ return hasSucceeded && !hasHardFailure;
910
+ }).length;
911
+
912
+ if (mergeableLaneCount > 0) {
913
+ batchState.phase = "merging";
914
+ persistRuntimeState("merge-start", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
915
+ onNotify(ORCH_MESSAGES.orchMergeStart(waveIdx + 1, mergeableLaneCount), "info");
916
+
917
+ mergeResult = mergeWave(
918
+ waveResult.allocatedLanes,
919
+ waveResult,
920
+ waveIdx + 1,
921
+ orchConfig,
922
+ repoRoot,
923
+ batchState.batchId,
924
+ batchState.baseBranch,
925
+ );
926
+ batchState.mergeResults.push(mergeResult);
927
+
928
+ // Emit per-lane merge notifications
929
+ for (const lr of mergeResult.laneResults) {
930
+ const durationSec = Math.round(lr.durationMs / 1000);
931
+ if (lr.result?.status === "SUCCESS") {
932
+ onNotify(ORCH_MESSAGES.orchMergeLaneSuccess(lr.laneNumber, lr.result.merge_commit, durationSec), "info");
933
+ } else if (lr.result?.status === "CONFLICT_RESOLVED") {
934
+ onNotify(ORCH_MESSAGES.orchMergeLaneConflictResolved(lr.laneNumber, lr.result.conflicts.length, durationSec), "info");
935
+ } else if (lr.result?.status === "CONFLICT_UNRESOLVED" || lr.result?.status === "BUILD_FAILURE") {
936
+ onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error || lr.result.status), "error");
937
+ } else if (lr.error) {
938
+ onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error), "error");
939
+ }
940
+ }
941
+
942
+ if (mixedOutcomeLanes.length > 0) {
943
+ const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
944
+ const failureReason =
945
+ `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
946
+ `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`;
947
+ mergeResult = { ...mergeResult, status: "partial", failedLane: mixedOutcomeLanes[0].laneNumber, failureReason };
948
+ }
949
+
950
+ const mergedCount = mergeResult.laneResults.filter(
951
+ r => r.result?.status === "SUCCESS" || r.result?.status === "CONFLICT_RESOLVED",
952
+ ).length;
953
+ const mergeTotalSec = Math.round(mergeResult.totalDurationMs / 1000);
954
+
955
+ if (mergeResult.status === "succeeded") {
956
+ onNotify(ORCH_MESSAGES.orchMergeComplete(waveIdx + 1, mergedCount, mergeTotalSec), "info");
957
+ } else {
958
+ onNotify(
959
+ ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane ?? 0, mergeResult.failureReason || "unknown"),
960
+ "error",
961
+ );
962
+ }
963
+
964
+ batchState.phase = "executing";
965
+ persistRuntimeState("merge-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
966
+ } else if (mixedOutcomeLanes.length > 0) {
967
+ const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
968
+ mergeResult = {
969
+ waveIndex: waveIdx + 1,
970
+ status: "partial",
971
+ laneResults: [],
972
+ failedLane: mixedOutcomeLanes[0].laneNumber,
973
+ failureReason:
974
+ `Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
975
+ `Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`,
976
+ totalDurationMs: 0,
977
+ };
978
+ onNotify(
979
+ ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane, mergeResult.failureReason || "unknown"),
980
+ "error",
981
+ );
982
+ } else {
983
+ onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
984
+ }
985
+ } else {
986
+ onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
987
+ }
988
+
989
+ // Handle merge failure
990
+ if (mergeResult && (mergeResult.status === "failed" || mergeResult.status === "partial")) {
991
+ const mergeFailurePolicy = orchConfig.failure.on_merge_failure;
992
+
993
+ if (mergeFailurePolicy === "pause") {
994
+ batchState.phase = "paused";
995
+ batchState.errors.push(
996
+ `Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
997
+ `Batch paused. Resolve conflicts and use /orch-resume to continue.`,
998
+ );
999
+ persistRuntimeState("merge-failure-pause", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
1000
+ onNotify(
1001
+ `⏸️ Batch paused due to merge failure at wave ${waveIdx + 1}. ` +
1002
+ `Resolve conflicts and resume.`,
1003
+ "error",
1004
+ );
1005
+ preserveWorktreesForResume = true;
1006
+ break;
1007
+ } else {
1008
+ batchState.phase = "stopped";
1009
+ batchState.errors.push(
1010
+ `Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
1011
+ `Batch aborted by on_merge_failure policy.`,
1012
+ );
1013
+ persistRuntimeState("merge-failure-abort", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
1014
+ onNotify(
1015
+ `⛔ Batch aborted due to merge failure at wave ${waveIdx + 1}.`,
1016
+ "error",
1017
+ );
1018
+ preserveWorktreesForResume = true;
1019
+ break;
1020
+ }
1021
+ }
1022
+
1023
+ // Post-merge: delete lane branches already merged into the target branch, then reset worktrees for the next wave
1024
+ if (mergeResult && mergeResult.status === "succeeded") {
1025
+ const targetBranch = batchState.baseBranch;
1026
+ for (const lr of mergeResult.laneResults) {
1027
+ if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
1028
+ const ancestorCheck = runGit(["merge-base", "--is-ancestor", lr.sourceBranch, targetBranch], repoRoot);
1029
+ if (ancestorCheck.ok) {
1030
+ deleteBranchBestEffort(lr.sourceBranch, repoRoot);
1031
+ }
1032
+ }
1033
+ }
1034
+ }
1035
+
1036
+ if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
1037
+ const wtPrefix = orchConfig.orchestrator.worktree_prefix;
1038
+ const existingWorktrees = listWorktrees(wtPrefix, repoRoot);
1039
+ if (existingWorktrees.length > 0) {
1040
+ const targetBranch = batchState.baseBranch;
1041
+ for (const wt of existingWorktrees) {
1042
+ const resetResult = safeResetWorktree(wt, targetBranch, repoRoot);
1043
+ if (!resetResult.success) {
1044
+ try { removeWorktree(wt, repoRoot); } catch { /* best effort */ }
1045
+ }
1046
+ }
1047
+ }
1048
+ }
1049
+ }
1050
+
1051
+ // ── 11. Cleanup and terminal state ───────────────────────────
1052
+ if (!preserveWorktreesForResume) {
1053
+ const wtPrefix = orchConfig.orchestrator.worktree_prefix;
1054
+ const targetBranch = batchState.baseBranch;
1055
+ removeAllWorktrees(wtPrefix, repoRoot, targetBranch);
1056
+ }
1057
+
1058
+ batchState.endedAt = Date.now();
1059
+ const totalElapsedSec = Math.round((batchState.endedAt - batchState.startedAt) / 1000);
1060
+
1061
+ if ((batchState.phase as OrchBatchPhase) === "executing" || (batchState.phase as OrchBatchPhase) === "merging") {
1062
+ if (batchState.failedTasks > 0) {
1063
+ batchState.phase = "failed";
1064
+ } else {
1065
+ batchState.phase = "completed";
1066
+ }
1067
+ }
1068
+
1069
+ persistRuntimeState("batch-terminal", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
1070
+
1071
+ if (batchState.phase === "paused" || batchState.phase === "stopped") {
1072
+ execLog("resume", batchState.batchId, "resumed batch ended in non-terminal state", { phase: batchState.phase });
1073
+ } else {
1074
+ onNotify(
1075
+ ORCH_MESSAGES.resumeComplete(
1076
+ batchState.batchId,
1077
+ batchState.succeededTasks,
1078
+ batchState.failedTasks,
1079
+ batchState.skippedTasks,
1080
+ batchState.blockedTasks,
1081
+ totalElapsedSec,
1082
+ ),
1083
+ batchState.failedTasks > 0 ? "warning" : "info",
1084
+ );
1085
+
1086
+ if (batchState.phase === "completed") {
1087
+ try {
1088
+ deleteBatchState(repoRoot);
1089
+ execLog("state", batchState.batchId, "state file deleted on clean resume completion");
1090
+ } catch {
1091
+ // Best-effort
1092
+ }
1093
+ }
1094
+ }
1095
+ }
1096
+