taskplane 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/taskplane.mjs +0 -4
- package/extensions/taskplane/engine.ts +771 -758
- package/extensions/taskplane/execution.ts +4 -2
- package/extensions/taskplane/git.ts +25 -7
- package/extensions/taskplane/merge.ts +18 -16
- package/extensions/taskplane/persistence.ts +1136 -1121
- package/extensions/taskplane/resume.ts +1096 -1092
- package/extensions/taskplane/types.ts +5 -2
- package/extensions/taskplane/waves.ts +894 -900
- package/extensions/taskplane/worktree.ts +6 -5
- package/package.json +1 -1
- package/templates/config/task-orchestrator.yaml +86 -89
- package/templates/config/task-runner.yaml +95 -99
|
@@ -1,1092 +1,1096 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Resume logic for paused/interrupted batches
|
|
3
|
-
* @module orch/resume
|
|
4
|
-
*/
|
|
5
|
-
import { existsSync } from "fs";
|
|
6
|
-
import { join } from "path";
|
|
7
|
-
|
|
8
|
-
import { runDiscovery } from "./discovery.ts";
|
|
9
|
-
import { executeOrchBatch } from "./engine.ts";
|
|
10
|
-
import { execLog, executeWave, pollUntilTaskComplete, spawnLaneSession, tmuxHasSession } from "./execution.ts";
|
|
11
|
-
import type { MonitorUpdateCallback } from "./execution.ts";
|
|
12
|
-
import { runGit } from "./git.ts";
|
|
13
|
-
import { mergeWave } from "./merge.ts";
|
|
14
|
-
import { ORCH_MESSAGES } from "./messages.ts";
|
|
15
|
-
import { deleteBatchState, hasTaskDoneMarker, loadBatchState, persistRuntimeState, seedPendingOutcomesForAllocatedLanes, syncTaskOutcomesFromMonitor, upsertTaskOutcome } from "./persistence.ts";
|
|
16
|
-
import { StateFileError } from "./types.ts";
|
|
17
|
-
import type { AllocatedLane, AllocatedTask, LaneExecutionResult, LaneTaskOutcome, LaneTaskStatus, MergeWaveResult, OrchBatchPhase, OrchBatchRuntimeState, OrchestratorConfig, ParsedTask, PersistedBatchState, ReconciledTaskState, ResumeEligibility, ResumePoint, TaskRunnerConfig, WaveExecutionResult } from "./types.ts";
|
|
18
|
-
import { buildDependencyGraph } from "./waves.ts";
|
|
19
|
-
import { deleteBranchBestEffort, listWorktrees, removeAllWorktrees, removeWorktree, safeResetWorktree } from "./worktree.ts";
|
|
20
|
-
|
|
21
|
-
// ── Resume Pure Functions ────────────────────────────────────────────
|
|
22
|
-
|
|
23
|
-
/**
 * Check whether a persisted batch state is eligible for resume.
 *
 * Resume eligibility matrix:
 * | Phase     | Eligible? | Reason                                     |
 * |-----------|-----------|--------------------------------------------|
 * | paused    | ✅        | Batch was paused (user/merge-failure)      |
 * | executing | ✅        | Batch was executing when orchestrator died |
 * | merging   | ✅        | Batch was merging when orchestrator died   |
 * | stopped   | ❌        | Batch was stopped by policy                |
 * | failed    | ❌        | Batch has terminal failure                 |
 * | completed | ❌        | Batch already completed                    |
 * | idle      | ❌        | Batch never started execution              |
 * | planning  | ❌        | Batch was still planning                   |
 *
 * Any phase not listed above is reported as unknown and not eligible.
 *
 * Pure function — no process or filesystem access.
 *
 * @param state - Persisted batch state; only `phase` and `batchId` are read
 * @returns Eligibility verdict echoing `phase` and `batchId`, with a
 *          human-readable `reason`
 */
export function checkResumeEligibility(state: PersistedBatchState): ResumeEligibility {
  const { phase, batchId } = state;

  // Every branch returns the same shape; only the verdict and the message vary.
  const verdict = (eligible: boolean, reason: string): ResumeEligibility => ({
    eligible,
    reason,
    phase,
    batchId,
  });

  switch (phase) {
    case "paused":
      return verdict(true, `Batch ${batchId} is paused and can be resumed.`);

    case "executing":
      return verdict(true, `Batch ${batchId} was executing when the orchestrator disconnected. Can be resumed.`);

    case "merging":
      return verdict(true, `Batch ${batchId} was merging when the orchestrator disconnected. Can be resumed.`);

    case "stopped":
      return verdict(false, `Batch ${batchId} was stopped by failure policy. Use /orch-abort to clean up, then start a new batch.`);

    case "failed":
      return verdict(false, `Batch ${batchId} has a terminal failure. Use /orch-abort to clean up, then start a new batch.`);

    case "completed":
      return verdict(false, `Batch ${batchId} already completed. Delete the state file or start a new batch.`);

    case "idle":
      return verdict(false, `Batch ${batchId} never started execution. Start a new batch with /orch.`);

    case "planning":
      return verdict(false, `Batch ${batchId} was still in planning phase. Start a new batch with /orch.`);

    default:
      // A state file may carry a phase this version does not know about.
      return verdict(false, `Batch ${batchId} has unknown phase "${phase}". Delete the state file and start a new batch.`);
  }
}
|
|
117
|
-
|
|
118
|
-
/**
 * Reconcile persisted task states against live signals.
 *
 * For each task in the persisted state, determines the correct action
 * based on the supplied sets of alive TMUX sessions, found .DONE files
 * and existing worktrees.
 *
 * Precedence rules (applied per-task, first match wins):
 * 1. .DONE file found → "mark-complete" (even if session is alive — task is done)
 * 2. Session alive + no .DONE → "reconnect" (task is still running)
 * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
 *    (already resolved in the original run, no action needed)
 * 4. Session dead + no .DONE + not terminal + worktree exists → "re-execute"
 *    (reported with live status "pending")
 * 5. Session dead + no .DONE + not terminal + no worktree → "mark-failed"
 *    (task was interrupted and did not complete)
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param aliveSessions - Set of TMUX session names currently alive
 * @param doneTaskIds - Set of task IDs whose .DONE files exist
 * @param existingWorktrees - Set of task IDs whose worktree still exists
 *                            (defaults to empty, which disables rule 4)
 * @returns Array of reconciled task states in persisted order
 */
export function reconcileTaskStates(
  persistedState: PersistedBatchState,
  aliveSessions: ReadonlySet<string>,
  doneTaskIds: ReadonlySet<string>,
  existingWorktrees: ReadonlySet<string> = new Set(),
): ReconciledTaskState[] {
  // Invariant across tasks, so built once rather than per iteration.
  const terminalStatuses: readonly LaneTaskStatus[] = ["succeeded", "failed", "stalled", "skipped"];

  return persistedState.tasks.map((task): ReconciledTaskState => {
    const sessionAlive = aliveSessions.has(task.sessionName);
    const doneFileFound = doneTaskIds.has(task.taskId);
    const worktreeExists = existingWorktrees.has(task.taskId);

    // The three live-signal flags are the same whichever rule matches;
    // only the live status and the action differ.
    const resolve = (
      liveStatus: LaneTaskStatus,
      action: ReconciledTaskState["action"],
    ): ReconciledTaskState => ({
      taskId: task.taskId,
      persistedStatus: task.status,
      liveStatus,
      sessionAlive,
      doneFileFound,
      worktreeExists,
      action,
    });

    // Precedence 1: .DONE file found → task completed
    if (doneFileFound) return resolve("succeeded", "mark-complete");

    // Precedence 2: Session alive → reconnect
    if (sessionAlive) return resolve("running", "reconnect");

    // Precedence 3: Already terminal in persisted state → skip
    if (terminalStatuses.includes(task.status)) return resolve(task.status, "skip");

    // Precedence 4: Session dead + no .DONE + worktree exists → re-execute
    if (worktreeExists) return resolve("pending", "re-execute");

    // Precedence 5: Dead session + not terminal + no .DONE + no worktree → failed
    return resolve("failed", "mark-failed");
  });
}
|
|
215
|
-
|
|
216
|
-
/**
 * Compute the resume point from reconciled task states and wave plan.
 *
 * Determines which wave to resume from by finding the first wave that
 * has any unsettled task. A task counts as settled when its action is
 * "mark-complete", or its action is "skip" and its live or persisted
 * status is succeeded/failed/stalled. Fully settled waves are skipped.
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state (only `wavePlan` is read)
 * @param reconciledTasks - Reconciled task states
 * @returns Resume point with wave index and categorized task IDs;
 *          `resumeWaveIndex` equals `wavePlan.length` when every wave is settled
 */
export function computeResumePoint(
  persistedState: PersistedBatchState,
  reconciledTasks: ReconciledTaskState[],
): ResumePoint {
  const { wavePlan } = persistedState;

  // Build lookup: taskId → reconciled state
  const reconciledMap = new Map<string, ReconciledTaskState>();
  for (const task of reconciledTasks) {
    reconciledMap.set(task.taskId, task);
  }

  const isFailure = (status: LaneTaskStatus): boolean => status === "failed" || status === "stalled";
  const isSettled = (status: LaneTaskStatus): boolean => status === "succeeded" || isFailure(status);

  // Categorize tasks
  const completedTaskIds: string[] = [];
  const failedTaskIds: string[] = [];
  const reconnectTaskIds: string[] = [];
  const reExecuteTaskIds: string[] = [];

  for (const task of reconciledTasks) {
    switch (task.action) {
      case "mark-complete":
      case "skip":
        if (task.liveStatus === "succeeded" || task.persistedStatus === "succeeded") {
          completedTaskIds.push(task.taskId);
        } else if (isFailure(task.liveStatus) || isFailure(task.persistedStatus)) {
          failedTaskIds.push(task.taskId);
        }
        // skipped tasks from original run don't count as completed or failed
        break;
      case "reconnect":
        reconnectTaskIds.push(task.taskId);
        break;
      case "re-execute":
        reExecuteTaskIds.push(task.taskId);
        break;
      case "mark-failed":
        failedTaskIds.push(task.taskId);
        break;
    }
  }

  // A wave is settled only if every task in it is known and settled.
  const isWaveSettled = (wave: readonly string[]): boolean =>
    wave.every((taskId) => {
      const reconciled = reconciledMap.get(taskId);
      if (!reconciled) return false;
      return (
        reconciled.action === "mark-complete" ||
        (reconciled.action === "skip" &&
          (isSettled(reconciled.liveStatus) || isSettled(reconciled.persistedStatus)))
      );
    });

  // Find resume wave: first wave with any unsettled task (past end = all done)
  const firstOpenWave = wavePlan.findIndex((wave) => !isWaveSettled(wave));
  const resumeWaveIndex = firstOpenWave === -1 ? wavePlan.length : firstOpenWave;

  // Determine pending tasks: tasks in resume wave and later that need execution
  const pendingTaskIds: string[] = [];
  for (const wave of wavePlan.slice(resumeWaveIndex)) {
    for (const taskId of wave) {
      const reconciled = reconciledMap.get(taskId);
      const needsExecution =
        !reconciled || // Unknown task — treat as pending
        reconciled.action === "reconnect" || // alive session: reconnect, still pending
        reconciled.action === "re-execute" || // existing worktree: run again
        (reconciled.action === "skip" && reconciled.persistedStatus === "pending");
      if (needsExecution) {
        pendingTaskIds.push(taskId);
      }
    }
  }

  return {
    resumeWaveIndex,
    completedTaskIds,
    pendingTaskIds,
    failedTaskIds,
    reconnectTaskIds,
    reExecuteTaskIds,
  };
}
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
export async function resumeOrchBatch(
|
|
332
|
-
orchConfig: OrchestratorConfig,
|
|
333
|
-
runnerConfig: TaskRunnerConfig,
|
|
334
|
-
cwd: string,
|
|
335
|
-
batchState: OrchBatchRuntimeState,
|
|
336
|
-
onNotify: (message: string, level: "info" | "warning" | "error") => void,
|
|
337
|
-
onMonitorUpdate?: MonitorUpdateCallback,
|
|
338
|
-
): Promise<void> {
|
|
339
|
-
const repoRoot = cwd;
|
|
340
|
-
const prefix = orchConfig.orchestrator.tmux_prefix;
|
|
341
|
-
|
|
342
|
-
// ── 1. Load persisted state ──────────────────────────────────
|
|
343
|
-
let persistedState: PersistedBatchState | null;
|
|
344
|
-
try {
|
|
345
|
-
persistedState = loadBatchState(repoRoot);
|
|
346
|
-
} catch (err: unknown) {
|
|
347
|
-
if (err instanceof StateFileError) {
|
|
348
|
-
onNotify(
|
|
349
|
-
`❌ Cannot resume: ${err.message}`,
|
|
350
|
-
"error",
|
|
351
|
-
);
|
|
352
|
-
return;
|
|
353
|
-
}
|
|
354
|
-
throw err;
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
if (!persistedState) {
|
|
358
|
-
onNotify(
|
|
359
|
-
ORCH_MESSAGES.resumeNoState(),
|
|
360
|
-
"error",
|
|
361
|
-
);
|
|
362
|
-
return;
|
|
363
|
-
}
|
|
364
|
-
|
|
365
|
-
// ── 2. Check eligibility ─────────────────────────────────────
|
|
366
|
-
const eligibility = checkResumeEligibility(persistedState);
|
|
367
|
-
if (!eligibility.eligible) {
|
|
368
|
-
onNotify(
|
|
369
|
-
ORCH_MESSAGES.resumePhaseNotResumable(persistedState.batchId, persistedState.phase, eligibility.reason),
|
|
370
|
-
"error",
|
|
371
|
-
);
|
|
372
|
-
return;
|
|
373
|
-
}
|
|
374
|
-
|
|
375
|
-
onNotify(
|
|
376
|
-
ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase),
|
|
377
|
-
"info",
|
|
378
|
-
);
|
|
379
|
-
|
|
380
|
-
// ── 3. Discover live signals ─────────────────────────────────
|
|
381
|
-
// Check TMUX sessions
|
|
382
|
-
const aliveSessions = new Set<string>();
|
|
383
|
-
for (const task of persistedState.tasks) {
|
|
384
|
-
if (task.sessionName && tmuxHasSession(task.sessionName)) {
|
|
385
|
-
aliveSessions.add(task.sessionName);
|
|
386
|
-
}
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
// Check .DONE files
|
|
390
|
-
const doneTaskIds = new Set<string>();
|
|
391
|
-
for (const task of persistedState.tasks) {
|
|
392
|
-
if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
|
|
393
|
-
doneTaskIds.add(task.taskId);
|
|
394
|
-
}
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
// ── 3b. Detect existing worktrees ────────────────────────────
|
|
398
|
-
const existingWorktreeTaskIds = new Set<string>();
|
|
399
|
-
for (const task of persistedState.tasks) {
|
|
400
|
-
const laneRecord = persistedState.lanes.find(l => l.taskIds.includes(task.taskId));
|
|
401
|
-
if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
|
|
402
|
-
existingWorktreeTaskIds.add(task.taskId);
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
// ── 4. Reconcile task states ─────────────────────────────────
|
|
407
|
-
const reconciledTasks = reconcileTaskStates(persistedState, aliveSessions, doneTaskIds, existingWorktreeTaskIds);
|
|
408
|
-
|
|
409
|
-
// ── 5. Compute resume point ──────────────────────────────────
|
|
410
|
-
const resumePoint = computeResumePoint(persistedState, reconciledTasks);
|
|
411
|
-
const completedTaskSet = new Set(resumePoint.completedTaskIds);
|
|
412
|
-
const failedTaskSet = new Set(resumePoint.failedTaskIds);
|
|
413
|
-
const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
|
|
414
|
-
const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
|
|
415
|
-
|
|
416
|
-
onNotify(
|
|
417
|
-
ORCH_MESSAGES.resumeReconciled(
|
|
418
|
-
persistedState.batchId,
|
|
419
|
-
resumePoint.completedTaskIds.length,
|
|
420
|
-
resumePoint.pendingTaskIds.length,
|
|
421
|
-
resumePoint.failedTaskIds.length,
|
|
422
|
-
resumePoint.reconnectTaskIds.length,
|
|
423
|
-
resumePoint.reExecuteTaskIds.length,
|
|
424
|
-
),
|
|
425
|
-
"info",
|
|
426
|
-
);
|
|
427
|
-
|
|
428
|
-
if (resumePoint.reconnectTaskIds.length > 0) {
|
|
429
|
-
onNotify(
|
|
430
|
-
ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length),
|
|
431
|
-
"info",
|
|
432
|
-
);
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
if (resumePoint.resumeWaveIndex > 0) {
|
|
436
|
-
onNotify(
|
|
437
|
-
ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex),
|
|
438
|
-
"info",
|
|
439
|
-
);
|
|
440
|
-
}
|
|
441
|
-
|
|
442
|
-
// ── 6. Reconstruct runtime state ─────────────────────────────
|
|
443
|
-
batchState.phase = "executing";
|
|
444
|
-
batchState.batchId = persistedState.batchId;
|
|
445
|
-
batchState.
|
|
446
|
-
batchState.
|
|
447
|
-
batchState.
|
|
448
|
-
batchState.
|
|
449
|
-
batchState.
|
|
450
|
-
batchState.
|
|
451
|
-
batchState.
|
|
452
|
-
batchState.
|
|
453
|
-
batchState.
|
|
454
|
-
batchState.
|
|
455
|
-
batchState.
|
|
456
|
-
batchState.
|
|
457
|
-
batchState.
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
//
|
|
461
|
-
//
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
//
|
|
474
|
-
//
|
|
475
|
-
|
|
476
|
-
const
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
for (
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
const
|
|
553
|
-
const
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
//
|
|
633
|
-
//
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
.
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
const
|
|
724
|
-
const
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
? (
|
|
728
|
-
: task.
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
: task.action === "
|
|
740
|
-
? (status === "succeeded" ? "
|
|
741
|
-
:
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
//
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
const
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
);
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
const
|
|
904
|
-
|
|
905
|
-
);
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
onNotify(ORCH_MESSAGES.
|
|
933
|
-
} else if (lr.
|
|
934
|
-
onNotify(ORCH_MESSAGES.
|
|
935
|
-
}
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
);
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
batchState.
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Resume logic for paused/interrupted batches
|
|
3
|
+
* @module orch/resume
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync } from "fs";
|
|
6
|
+
import { join } from "path";
|
|
7
|
+
|
|
8
|
+
import { runDiscovery } from "./discovery.ts";
|
|
9
|
+
import { executeOrchBatch } from "./engine.ts";
|
|
10
|
+
import { execLog, executeWave, pollUntilTaskComplete, spawnLaneSession, tmuxHasSession } from "./execution.ts";
|
|
11
|
+
import type { MonitorUpdateCallback } from "./execution.ts";
|
|
12
|
+
import { runGit } from "./git.ts";
|
|
13
|
+
import { mergeWave } from "./merge.ts";
|
|
14
|
+
import { ORCH_MESSAGES } from "./messages.ts";
|
|
15
|
+
import { deleteBatchState, hasTaskDoneMarker, loadBatchState, persistRuntimeState, seedPendingOutcomesForAllocatedLanes, syncTaskOutcomesFromMonitor, upsertTaskOutcome } from "./persistence.ts";
|
|
16
|
+
import { StateFileError } from "./types.ts";
|
|
17
|
+
import type { AllocatedLane, AllocatedTask, LaneExecutionResult, LaneTaskOutcome, LaneTaskStatus, MergeWaveResult, OrchBatchPhase, OrchBatchRuntimeState, OrchestratorConfig, ParsedTask, PersistedBatchState, ReconciledTaskState, ResumeEligibility, ResumePoint, TaskRunnerConfig, WaveExecutionResult } from "./types.ts";
|
|
18
|
+
import { buildDependencyGraph } from "./waves.ts";
|
|
19
|
+
import { deleteBranchBestEffort, listWorktrees, removeAllWorktrees, removeWorktree, safeResetWorktree } from "./worktree.ts";
|
|
20
|
+
|
|
21
|
+
// ── Resume Pure Functions ────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
/**
 * Check whether a persisted batch state is eligible for resume.
 *
 * Resume eligibility matrix:
 * | Phase     | Eligible? | Reason                                     |
 * |-----------|-----------|--------------------------------------------|
 * | paused    | ✅        | Batch was paused (user/merge-failure)      |
 * | executing | ✅        | Batch was executing when orchestrator died |
 * | merging   | ✅        | Batch was merging when orchestrator died   |
 * | stopped   | ❌        | Batch was stopped by policy                |
 * | failed    | ❌        | Batch has terminal failure                 |
 * | completed | ❌        | Batch already completed                    |
 * | idle      | ❌        | Batch never started execution              |
 * | planning  | ❌        | Batch was still planning                   |
 *
 * Any phase not listed above is reported as unknown and not eligible.
 *
 * Pure function — no process or filesystem access.
 *
 * @param state - Persisted batch state; only `phase` and `batchId` are read
 * @returns Eligibility verdict echoing `phase` and `batchId`, with a
 *          human-readable `reason`
 */
export function checkResumeEligibility(state: PersistedBatchState): ResumeEligibility {
  const { phase, batchId } = state;

  // Every branch returns the same shape; only the verdict and the message vary.
  const verdict = (eligible: boolean, reason: string): ResumeEligibility => ({
    eligible,
    reason,
    phase,
    batchId,
  });

  switch (phase) {
    case "paused":
      return verdict(true, `Batch ${batchId} is paused and can be resumed.`);

    case "executing":
      return verdict(true, `Batch ${batchId} was executing when the orchestrator disconnected. Can be resumed.`);

    case "merging":
      return verdict(true, `Batch ${batchId} was merging when the orchestrator disconnected. Can be resumed.`);

    case "stopped":
      return verdict(false, `Batch ${batchId} was stopped by failure policy. Use /orch-abort to clean up, then start a new batch.`);

    case "failed":
      return verdict(false, `Batch ${batchId} has a terminal failure. Use /orch-abort to clean up, then start a new batch.`);

    case "completed":
      return verdict(false, `Batch ${batchId} already completed. Delete the state file or start a new batch.`);

    case "idle":
      return verdict(false, `Batch ${batchId} never started execution. Start a new batch with /orch.`);

    case "planning":
      return verdict(false, `Batch ${batchId} was still in planning phase. Start a new batch with /orch.`);

    default:
      // A state file may carry a phase this version does not know about.
      return verdict(false, `Batch ${batchId} has unknown phase "${phase}". Delete the state file and start a new batch.`);
  }
}
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Reconcile persisted task states against live signals.
|
|
120
|
+
*
|
|
121
|
+
* For each task in the persisted state, determines the correct action
|
|
122
|
+
* based on the current state of TMUX sessions and .DONE files.
|
|
123
|
+
*
|
|
124
|
+
* Precedence rules (applied per-task):
|
|
125
|
+
* 1. .DONE file found → "mark-complete" (even if session is alive — task is done)
|
|
126
|
+
* 2. Session alive + no .DONE → "reconnect" (task is still running)
|
|
127
|
+
* 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
|
|
128
|
+
* (already resolved in the original run, no action needed)
|
|
129
|
+
* 4. Session dead + no .DONE + not terminal + worktree exists → "re-execute"
|
|
130
|
+
* 5. Otherwise (was pending/running, no worktree) → "mark-failed" (task was interrupted and did not complete)
|
|
131
|
+
*
|
|
132
|
+
* Pure function — no process or filesystem access.
|
|
133
|
+
*
|
|
134
|
+
* @param persistedState - Loaded and validated batch state
|
|
135
|
+
* @param aliveSessions - Set of TMUX session names currently alive
|
|
136
|
+
* @param doneTaskIds - Set of task IDs whose .DONE files exist
|
|
137
|
+
* @returns Array of reconciled task states in persisted order
|
|
138
|
+
*/
|
|
139
|
+
export function reconcileTaskStates(
|
|
140
|
+
persistedState: PersistedBatchState,
|
|
141
|
+
aliveSessions: ReadonlySet<string>,
|
|
142
|
+
doneTaskIds: ReadonlySet<string>,
|
|
143
|
+
existingWorktrees: ReadonlySet<string> = new Set(),
|
|
144
|
+
): ReconciledTaskState[] {
|
|
145
|
+
return persistedState.tasks.map((task) => {
|
|
146
|
+
const sessionAlive = aliveSessions.has(task.sessionName);
|
|
147
|
+
const doneFileFound = doneTaskIds.has(task.taskId);
|
|
148
|
+
const worktreeExists = existingWorktrees.has(task.taskId);
|
|
149
|
+
|
|
150
|
+
// Precedence 1: .DONE file found → task completed
|
|
151
|
+
if (doneFileFound) {
|
|
152
|
+
return {
|
|
153
|
+
taskId: task.taskId,
|
|
154
|
+
persistedStatus: task.status,
|
|
155
|
+
liveStatus: "succeeded" as LaneTaskStatus,
|
|
156
|
+
sessionAlive,
|
|
157
|
+
doneFileFound: true,
|
|
158
|
+
worktreeExists,
|
|
159
|
+
action: "mark-complete" as const,
|
|
160
|
+
};
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Precedence 2: Session alive → reconnect
|
|
164
|
+
if (sessionAlive) {
|
|
165
|
+
return {
|
|
166
|
+
taskId: task.taskId,
|
|
167
|
+
persistedStatus: task.status,
|
|
168
|
+
liveStatus: "running" as LaneTaskStatus,
|
|
169
|
+
sessionAlive: true,
|
|
170
|
+
doneFileFound: false,
|
|
171
|
+
worktreeExists,
|
|
172
|
+
action: "reconnect" as const,
|
|
173
|
+
};
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Precedence 3: Already terminal in persisted state → skip
|
|
177
|
+
const terminalStatuses: LaneTaskStatus[] = ["succeeded", "failed", "stalled", "skipped"];
|
|
178
|
+
if (terminalStatuses.includes(task.status)) {
|
|
179
|
+
return {
|
|
180
|
+
taskId: task.taskId,
|
|
181
|
+
persistedStatus: task.status,
|
|
182
|
+
liveStatus: task.status,
|
|
183
|
+
sessionAlive: false,
|
|
184
|
+
doneFileFound: false,
|
|
185
|
+
worktreeExists,
|
|
186
|
+
action: "skip" as const,
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// Precedence 4: Session dead + no .DONE + worktree exists → re-execute
|
|
191
|
+
if (worktreeExists) {
|
|
192
|
+
return {
|
|
193
|
+
taskId: task.taskId,
|
|
194
|
+
persistedStatus: task.status,
|
|
195
|
+
liveStatus: "pending" as LaneTaskStatus,
|
|
196
|
+
sessionAlive: false,
|
|
197
|
+
doneFileFound: false,
|
|
198
|
+
worktreeExists: true,
|
|
199
|
+
action: "re-execute" as const,
|
|
200
|
+
};
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Precedence 5: Dead session + not terminal + no .DONE + no worktree → failed
|
|
204
|
+
return {
|
|
205
|
+
taskId: task.taskId,
|
|
206
|
+
persistedStatus: task.status,
|
|
207
|
+
liveStatus: "failed" as LaneTaskStatus,
|
|
208
|
+
sessionAlive: false,
|
|
209
|
+
doneFileFound: false,
|
|
210
|
+
worktreeExists: false,
|
|
211
|
+
action: "mark-failed" as const,
|
|
212
|
+
};
|
|
213
|
+
});
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/**
 * Compute the resume point from reconciled task states and wave plan.
 *
 * The resume wave is the first wave containing a task that is not yet
 * resolved. A task counts as resolved when its action is "mark-complete",
 * or when its action is "skip" and its live or persisted status is
 * succeeded, failed or stalled. Tasks that are missing from
 * `reconciledTasks`, or whose action is "reconnect", "re-execute" or
 * "mark-failed", or that were skipped with status "skipped", keep their
 * wave open. If every wave is resolved, the index equals the wave count.
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param reconciledTasks - Reconciled task states
 * @returns Resume point with wave index and categorized task IDs.
 *   `pendingTaskIds` only covers the resume wave and later waves; the other
 *   lists cover every reconciled task.
 */
export function computeResumePoint(
	persistedState: PersistedBatchState,
	reconciledTasks: ReconciledTaskState[],
): ResumePoint {
	// Build lookup: taskId → reconciled state (last entry wins on duplicates)
	const reconciledById = new Map<string, ReconciledTaskState>();
	for (const task of reconciledTasks) {
		reconciledById.set(task.taskId, task);
	}

	// Categorize tasks by the action reconciliation chose for them
	const completedTaskIds: string[] = [];
	const failedTaskIds: string[] = [];
	const reconnectTaskIds: string[] = [];
	const reExecuteTaskIds: string[] = [];

	for (const task of reconciledTasks) {
		switch (task.action) {
			case "mark-complete":
			case "skip":
				if (task.liveStatus === "succeeded" || task.persistedStatus === "succeeded") {
					completedTaskIds.push(task.taskId);
				} else if (
					task.liveStatus === "failed" ||
					task.liveStatus === "stalled" ||
					task.persistedStatus === "failed" ||
					task.persistedStatus === "stalled"
				) {
					failedTaskIds.push(task.taskId);
				}
				// skipped tasks from original run don't count as completed or failed
				break;
			case "reconnect":
				reconnectTaskIds.push(task.taskId);
				break;
			case "re-execute":
				reExecuteTaskIds.push(task.taskId);
				break;
			case "mark-failed":
				failedTaskIds.push(task.taskId);
				break;
		}
	}

	// A task is "resolved" for wave-skip purposes if it completed or failed terminally
	const isResolved = (taskId: string): boolean => {
		const reconciled = reconciledById.get(taskId);
		if (!reconciled) return false;
		if (reconciled.action === "mark-complete") return true;
		if (reconciled.action !== "skip") return false;
		return [reconciled.liveStatus, reconciled.persistedStatus].some(
			(status) => status === "succeeded" || status === "failed" || status === "stalled",
		);
	};

	// Find resume wave: first wave with any unresolved task (past end = all done)
	const firstOpenWave = persistedState.wavePlan.findIndex((wave) => !wave.every(isResolved));
	const resumeWaveIndex = firstOpenWave === -1 ? persistedState.wavePlan.length : firstOpenWave;

	// Determine pending tasks: tasks in resume wave and later that need execution
	const pendingTaskIds: string[] = [];
	for (const wave of persistedState.wavePlan.slice(resumeWaveIndex)) {
		for (const taskId of wave) {
			const reconciled = reconciledById.get(taskId);
			const needsExecution =
				// Unknown task — treat as pending
				!reconciled ||
				// Alive session: needs reconnection and remains pending
				reconciled.action === "reconnect" ||
				// Existing worktree: needs re-execution and remains pending
				reconciled.action === "re-execute" ||
				// Skipped tasks that were pending need execution
				(reconciled.action === "skip" && reconciled.persistedStatus === "pending");
			if (needsExecution) {
				pendingTaskIds.push(taskId);
			}
		}
	}

	return {
		resumeWaveIndex,
		completedTaskIds,
		pendingTaskIds,
		failedTaskIds,
		reconnectTaskIds,
		reExecuteTaskIds,
	};
}
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
export async function resumeOrchBatch(
|
|
332
|
+
orchConfig: OrchestratorConfig,
|
|
333
|
+
runnerConfig: TaskRunnerConfig,
|
|
334
|
+
cwd: string,
|
|
335
|
+
batchState: OrchBatchRuntimeState,
|
|
336
|
+
onNotify: (message: string, level: "info" | "warning" | "error") => void,
|
|
337
|
+
onMonitorUpdate?: MonitorUpdateCallback,
|
|
338
|
+
): Promise<void> {
|
|
339
|
+
const repoRoot = cwd;
|
|
340
|
+
const prefix = orchConfig.orchestrator.tmux_prefix;
|
|
341
|
+
|
|
342
|
+
// ── 1. Load persisted state ──────────────────────────────────
|
|
343
|
+
let persistedState: PersistedBatchState | null;
|
|
344
|
+
try {
|
|
345
|
+
persistedState = loadBatchState(repoRoot);
|
|
346
|
+
} catch (err: unknown) {
|
|
347
|
+
if (err instanceof StateFileError) {
|
|
348
|
+
onNotify(
|
|
349
|
+
`❌ Cannot resume: ${err.message}`,
|
|
350
|
+
"error",
|
|
351
|
+
);
|
|
352
|
+
return;
|
|
353
|
+
}
|
|
354
|
+
throw err;
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
if (!persistedState) {
|
|
358
|
+
onNotify(
|
|
359
|
+
ORCH_MESSAGES.resumeNoState(),
|
|
360
|
+
"error",
|
|
361
|
+
);
|
|
362
|
+
return;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
// ── 2. Check eligibility ─────────────────────────────────────
|
|
366
|
+
const eligibility = checkResumeEligibility(persistedState);
|
|
367
|
+
if (!eligibility.eligible) {
|
|
368
|
+
onNotify(
|
|
369
|
+
ORCH_MESSAGES.resumePhaseNotResumable(persistedState.batchId, persistedState.phase, eligibility.reason),
|
|
370
|
+
"error",
|
|
371
|
+
);
|
|
372
|
+
return;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
onNotify(
|
|
376
|
+
ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase),
|
|
377
|
+
"info",
|
|
378
|
+
);
|
|
379
|
+
|
|
380
|
+
// ── 3. Discover live signals ─────────────────────────────────
|
|
381
|
+
// Check TMUX sessions
|
|
382
|
+
const aliveSessions = new Set<string>();
|
|
383
|
+
for (const task of persistedState.tasks) {
|
|
384
|
+
if (task.sessionName && tmuxHasSession(task.sessionName)) {
|
|
385
|
+
aliveSessions.add(task.sessionName);
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
// Check .DONE files
|
|
390
|
+
const doneTaskIds = new Set<string>();
|
|
391
|
+
for (const task of persistedState.tasks) {
|
|
392
|
+
if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
|
|
393
|
+
doneTaskIds.add(task.taskId);
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// ── 3b. Detect existing worktrees ────────────────────────────
|
|
398
|
+
const existingWorktreeTaskIds = new Set<string>();
|
|
399
|
+
for (const task of persistedState.tasks) {
|
|
400
|
+
const laneRecord = persistedState.lanes.find(l => l.taskIds.includes(task.taskId));
|
|
401
|
+
if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
|
|
402
|
+
existingWorktreeTaskIds.add(task.taskId);
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// ── 4. Reconcile task states ─────────────────────────────────
|
|
407
|
+
const reconciledTasks = reconcileTaskStates(persistedState, aliveSessions, doneTaskIds, existingWorktreeTaskIds);
|
|
408
|
+
|
|
409
|
+
// ── 5. Compute resume point ──────────────────────────────────
|
|
410
|
+
const resumePoint = computeResumePoint(persistedState, reconciledTasks);
|
|
411
|
+
const completedTaskSet = new Set(resumePoint.completedTaskIds);
|
|
412
|
+
const failedTaskSet = new Set(resumePoint.failedTaskIds);
|
|
413
|
+
const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
|
|
414
|
+
const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
|
|
415
|
+
|
|
416
|
+
onNotify(
|
|
417
|
+
ORCH_MESSAGES.resumeReconciled(
|
|
418
|
+
persistedState.batchId,
|
|
419
|
+
resumePoint.completedTaskIds.length,
|
|
420
|
+
resumePoint.pendingTaskIds.length,
|
|
421
|
+
resumePoint.failedTaskIds.length,
|
|
422
|
+
resumePoint.reconnectTaskIds.length,
|
|
423
|
+
resumePoint.reExecuteTaskIds.length,
|
|
424
|
+
),
|
|
425
|
+
"info",
|
|
426
|
+
);
|
|
427
|
+
|
|
428
|
+
if (resumePoint.reconnectTaskIds.length > 0) {
|
|
429
|
+
onNotify(
|
|
430
|
+
ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length),
|
|
431
|
+
"info",
|
|
432
|
+
);
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
if (resumePoint.resumeWaveIndex > 0) {
|
|
436
|
+
onNotify(
|
|
437
|
+
ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex),
|
|
438
|
+
"info",
|
|
439
|
+
);
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
// ── 6. Reconstruct runtime state ─────────────────────────────
|
|
443
|
+
batchState.phase = "executing";
|
|
444
|
+
batchState.batchId = persistedState.batchId;
|
|
445
|
+
batchState.baseBranch = persistedState.baseBranch || "";
|
|
446
|
+
batchState.startedAt = persistedState.startedAt;
|
|
447
|
+
batchState.pauseSignal = { paused: false };
|
|
448
|
+
batchState.totalWaves = persistedState.totalWaves;
|
|
449
|
+
batchState.totalTasks = persistedState.totalTasks;
|
|
450
|
+
batchState.succeededTasks = resumePoint.completedTaskIds.length;
|
|
451
|
+
batchState.failedTasks = resumePoint.failedTaskIds.length;
|
|
452
|
+
batchState.skippedTasks = persistedState.skippedTasks;
|
|
453
|
+
batchState.blockedTasks = persistedState.blockedTasks;
|
|
454
|
+
batchState.blockedTaskIds = new Set(persistedState.blockedTaskIds);
|
|
455
|
+
batchState.errors = [...persistedState.errors];
|
|
456
|
+
batchState.endedAt = null;
|
|
457
|
+
batchState.currentWaveIndex = resumePoint.resumeWaveIndex;
|
|
458
|
+
batchState.waveResults = [];
|
|
459
|
+
|
|
460
|
+
// ── 7. Re-run discovery for ParsedTask metadata ──────────────
|
|
461
|
+
// We need fresh ParsedTask data (taskFolder, promptPath) for execution.
|
|
462
|
+
// Use "all" to discover all areas.
|
|
463
|
+
const discovery = runDiscovery("all", runnerConfig.task_areas, cwd, {
|
|
464
|
+
refreshDependencies: false,
|
|
465
|
+
dependencySource: orchConfig.dependencies.source,
|
|
466
|
+
useDependencyCache: orchConfig.dependencies.cache,
|
|
467
|
+
});
|
|
468
|
+
|
|
469
|
+
// Build dependency graph for skip-dependents policy
|
|
470
|
+
const depGraph = buildDependencyGraph(discovery.pending, discovery.completed);
|
|
471
|
+
batchState.dependencyGraph = depGraph;
|
|
472
|
+
|
|
473
|
+
// ── 8. Handle alive sessions (reconnect) ─────────────────────
|
|
474
|
+
// For tasks with alive sessions, we need to wait for them to complete.
|
|
475
|
+
// We poll each alive session's .DONE file.
|
|
476
|
+
const reconnectTasks = reconciledTasks.filter(t => t.action === "reconnect");
|
|
477
|
+
const reconnectFinalStatus = new Map<string, LaneTaskStatus>();
|
|
478
|
+
|
|
479
|
+
if (reconnectTasks.length > 0) {
|
|
480
|
+
// Wait for reconnected tasks to complete (poll .DONE files)
|
|
481
|
+
for (const task of reconnectTasks) {
|
|
482
|
+
const parsedTask = discovery.pending.get(task.taskId);
|
|
483
|
+
if (!parsedTask) continue;
|
|
484
|
+
|
|
485
|
+
// Find the lane info from persisted state
|
|
486
|
+
const laneRecord = persistedState.lanes.find(
|
|
487
|
+
l => l.taskIds.includes(task.taskId),
|
|
488
|
+
);
|
|
489
|
+
if (!laneRecord) continue;
|
|
490
|
+
|
|
491
|
+
// Build a minimal AllocatedLane for polling
|
|
492
|
+
const allocatedTask: AllocatedTask = {
|
|
493
|
+
taskId: task.taskId,
|
|
494
|
+
order: 0,
|
|
495
|
+
task: parsedTask,
|
|
496
|
+
estimatedMinutes: 0,
|
|
497
|
+
};
|
|
498
|
+
const lane: AllocatedLane = {
|
|
499
|
+
laneNumber: laneRecord.laneNumber,
|
|
500
|
+
laneId: laneRecord.laneId,
|
|
501
|
+
tmuxSessionName: laneRecord.tmuxSessionName,
|
|
502
|
+
worktreePath: laneRecord.worktreePath,
|
|
503
|
+
branch: laneRecord.branch,
|
|
504
|
+
tasks: [allocatedTask],
|
|
505
|
+
strategy: "round-robin",
|
|
506
|
+
estimatedLoad: 0,
|
|
507
|
+
estimatedMinutes: 0,
|
|
508
|
+
};
|
|
509
|
+
|
|
510
|
+
execLog("resume", task.taskId, "reconnecting to alive session", {
|
|
511
|
+
session: laneRecord.tmuxSessionName,
|
|
512
|
+
});
|
|
513
|
+
|
|
514
|
+
// Poll until task completes
|
|
515
|
+
try {
|
|
516
|
+
const pollResult = await pollUntilTaskComplete(
|
|
517
|
+
lane,
|
|
518
|
+
allocatedTask,
|
|
519
|
+
orchConfig,
|
|
520
|
+
repoRoot,
|
|
521
|
+
batchState.pauseSignal,
|
|
522
|
+
);
|
|
523
|
+
|
|
524
|
+
if (pollResult.status === "succeeded") {
|
|
525
|
+
reconnectFinalStatus.set(task.taskId, "succeeded");
|
|
526
|
+
completedTaskSet.add(task.taskId);
|
|
527
|
+
failedTaskSet.delete(task.taskId);
|
|
528
|
+
reconnectTaskSet.delete(task.taskId);
|
|
529
|
+
batchState.succeededTasks++;
|
|
530
|
+
execLog("resume", task.taskId, "reconnected task succeeded");
|
|
531
|
+
} else {
|
|
532
|
+
reconnectFinalStatus.set(task.taskId, "failed");
|
|
533
|
+
failedTaskSet.add(task.taskId);
|
|
534
|
+
completedTaskSet.delete(task.taskId);
|
|
535
|
+
reconnectTaskSet.delete(task.taskId);
|
|
536
|
+
batchState.failedTasks++;
|
|
537
|
+
execLog("resume", task.taskId, `reconnected task ${pollResult.status}: ${pollResult.exitReason}`);
|
|
538
|
+
}
|
|
539
|
+
} catch (err: unknown) {
|
|
540
|
+
reconnectFinalStatus.set(task.taskId, "failed");
|
|
541
|
+
failedTaskSet.add(task.taskId);
|
|
542
|
+
completedTaskSet.delete(task.taskId);
|
|
543
|
+
reconnectTaskSet.delete(task.taskId);
|
|
544
|
+
batchState.failedTasks++;
|
|
545
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
546
|
+
execLog("resume", task.taskId, `reconnection error: ${msg}`);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
// ── 8b. Handle re-execute tasks (dead session + existing worktree) ──
|
|
552
|
+
const reExecuteTasks = reconciledTasks.filter(t => t.action === "re-execute");
|
|
553
|
+
const reExecuteFinalStatus = new Map<string, LaneTaskStatus>();
|
|
554
|
+
const reExecAllocatedLanes: AllocatedLane[] = [];
|
|
555
|
+
|
|
556
|
+
if (reExecuteTasks.length > 0) {
|
|
557
|
+
onNotify(
|
|
558
|
+
`🔄 Re-executing ${reExecuteTasks.length} interrupted task(s) in existing worktrees...`,
|
|
559
|
+
"info",
|
|
560
|
+
);
|
|
561
|
+
|
|
562
|
+
for (const task of reExecuteTasks) {
|
|
563
|
+
const parsedTask = discovery.pending.get(task.taskId);
|
|
564
|
+
if (!parsedTask) continue;
|
|
565
|
+
|
|
566
|
+
const laneRecord = persistedState.lanes.find(
|
|
567
|
+
l => l.taskIds.includes(task.taskId),
|
|
568
|
+
);
|
|
569
|
+
if (!laneRecord) continue;
|
|
570
|
+
|
|
571
|
+
const allocatedTask: AllocatedTask = {
|
|
572
|
+
taskId: task.taskId,
|
|
573
|
+
order: 0,
|
|
574
|
+
task: parsedTask,
|
|
575
|
+
estimatedMinutes: 0,
|
|
576
|
+
};
|
|
577
|
+
const lane: AllocatedLane = {
|
|
578
|
+
laneNumber: laneRecord.laneNumber,
|
|
579
|
+
laneId: laneRecord.laneId,
|
|
580
|
+
tmuxSessionName: laneRecord.tmuxSessionName,
|
|
581
|
+
worktreePath: laneRecord.worktreePath,
|
|
582
|
+
branch: laneRecord.branch,
|
|
583
|
+
tasks: [allocatedTask],
|
|
584
|
+
strategy: "round-robin",
|
|
585
|
+
estimatedLoad: 0,
|
|
586
|
+
estimatedMinutes: 0,
|
|
587
|
+
};
|
|
588
|
+
|
|
589
|
+
execLog("resume", task.taskId, "re-executing interrupted task in existing worktree", {
|
|
590
|
+
session: laneRecord.tmuxSessionName,
|
|
591
|
+
worktree: laneRecord.worktreePath,
|
|
592
|
+
});
|
|
593
|
+
|
|
594
|
+
try {
|
|
595
|
+
spawnLaneSession(lane, allocatedTask, orchConfig, repoRoot);
|
|
596
|
+
const pollResult = await pollUntilTaskComplete(
|
|
597
|
+
lane,
|
|
598
|
+
allocatedTask,
|
|
599
|
+
orchConfig,
|
|
600
|
+
repoRoot,
|
|
601
|
+
batchState.pauseSignal,
|
|
602
|
+
);
|
|
603
|
+
|
|
604
|
+
if (pollResult.status === "succeeded") {
|
|
605
|
+
reExecuteFinalStatus.set(task.taskId, "succeeded");
|
|
606
|
+
completedTaskSet.add(task.taskId);
|
|
607
|
+
failedTaskSet.delete(task.taskId);
|
|
608
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
609
|
+
batchState.succeededTasks++;
|
|
610
|
+
reExecAllocatedLanes.push(lane);
|
|
611
|
+
execLog("resume", task.taskId, "re-executed task succeeded");
|
|
612
|
+
} else {
|
|
613
|
+
reExecuteFinalStatus.set(task.taskId, "failed");
|
|
614
|
+
failedTaskSet.add(task.taskId);
|
|
615
|
+
completedTaskSet.delete(task.taskId);
|
|
616
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
617
|
+
batchState.failedTasks++;
|
|
618
|
+
execLog("resume", task.taskId, `re-executed task ${pollResult.status}: ${pollResult.exitReason}`);
|
|
619
|
+
}
|
|
620
|
+
} catch (err: unknown) {
|
|
621
|
+
reExecuteFinalStatus.set(task.taskId, "failed");
|
|
622
|
+
failedTaskSet.add(task.taskId);
|
|
623
|
+
completedTaskSet.delete(task.taskId);
|
|
624
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
625
|
+
batchState.failedTasks++;
|
|
626
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
627
|
+
execLog("resume", task.taskId, `re-execution error: ${msg}`);
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
// ── 8c. Merge re-executed lane branches before cleanup ───────
|
|
633
|
+
// Re-executed tasks completed outside the normal wave loop, so their
|
|
634
|
+
// branches would not be merged by step 10. Merge them now.
|
|
635
|
+
if (reExecAllocatedLanes.length > 0) {
|
|
636
|
+
const succeededReExecTaskIds = [...reExecuteFinalStatus.entries()]
|
|
637
|
+
.filter(([_, status]) => status === "succeeded")
|
|
638
|
+
.map(([taskId]) => taskId);
|
|
639
|
+
|
|
640
|
+
if (succeededReExecTaskIds.length > 0) {
|
|
641
|
+
onNotify(
|
|
642
|
+
`🔀 Merging ${reExecAllocatedLanes.length} re-executed lane branch(es)...`,
|
|
643
|
+
"info",
|
|
644
|
+
);
|
|
645
|
+
|
|
646
|
+
// Build synthetic WaveExecutionResult for mergeWave()
|
|
647
|
+
const syntheticLaneResults: LaneExecutionResult[] = reExecAllocatedLanes.map(lane => ({
|
|
648
|
+
laneNumber: lane.laneNumber,
|
|
649
|
+
laneId: lane.laneId,
|
|
650
|
+
tasks: lane.tasks.map(t => ({
|
|
651
|
+
taskId: t.taskId,
|
|
652
|
+
status: "succeeded" as LaneTaskStatus,
|
|
653
|
+
startTime: Date.now(),
|
|
654
|
+
endTime: Date.now(),
|
|
655
|
+
exitReason: "Re-executed task completed successfully",
|
|
656
|
+
sessionName: lane.tmuxSessionName,
|
|
657
|
+
doneFileFound: true,
|
|
658
|
+
})),
|
|
659
|
+
overallStatus: "succeeded" as const,
|
|
660
|
+
startTime: Date.now(),
|
|
661
|
+
endTime: Date.now(),
|
|
662
|
+
}));
|
|
663
|
+
|
|
664
|
+
const syntheticWaveResult: WaveExecutionResult = {
|
|
665
|
+
waveIndex: 0,
|
|
666
|
+
startedAt: Date.now(),
|
|
667
|
+
endedAt: Date.now(),
|
|
668
|
+
laneResults: syntheticLaneResults,
|
|
669
|
+
policyApplied: orchConfig.failure.on_task_failure,
|
|
670
|
+
stoppedEarly: false,
|
|
671
|
+
failedTaskIds: [],
|
|
672
|
+
skippedTaskIds: [],
|
|
673
|
+
succeededTaskIds: succeededReExecTaskIds,
|
|
674
|
+
blockedTaskIds: [],
|
|
675
|
+
laneCount: reExecAllocatedLanes.length,
|
|
676
|
+
overallStatus: "succeeded",
|
|
677
|
+
finalMonitorState: null,
|
|
678
|
+
allocatedLanes: reExecAllocatedLanes,
|
|
679
|
+
};
|
|
680
|
+
|
|
681
|
+
const reExecMergeResult = mergeWave(
|
|
682
|
+
reExecAllocatedLanes,
|
|
683
|
+
syntheticWaveResult,
|
|
684
|
+
0,
|
|
685
|
+
orchConfig,
|
|
686
|
+
repoRoot,
|
|
687
|
+
batchState.batchId,
|
|
688
|
+
batchState.baseBranch,
|
|
689
|
+
);
|
|
690
|
+
|
|
691
|
+
if (reExecMergeResult.status === "succeeded") {
|
|
692
|
+
onNotify(
|
|
693
|
+
`✅ Re-executed branch merge complete: ${reExecMergeResult.laneResults.length} lane(s) merged`,
|
|
694
|
+
"info",
|
|
695
|
+
);
|
|
696
|
+
|
|
697
|
+
// Clean up merged branches
|
|
698
|
+
const targetBranch = batchState.baseBranch;
|
|
699
|
+
for (const lr of reExecMergeResult.laneResults) {
|
|
700
|
+
if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
|
|
701
|
+
deleteBranchBestEffort(lr.sourceBranch, repoRoot);
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
} else {
|
|
705
|
+
onNotify(
|
|
706
|
+
`⚠️ Re-executed branch merge ${reExecMergeResult.status}: ${reExecMergeResult.failureReason || "unknown"}`,
|
|
707
|
+
"warning",
|
|
708
|
+
);
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
batchState.mergeResults.push(reExecMergeResult);
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
// ── 9. Persist state after reconciliation ────────────────────
|
|
716
|
+
// Track state for persistence
|
|
717
|
+
const wavePlan = persistedState.wavePlan;
|
|
718
|
+
const allTaskOutcomes: LaneTaskOutcome[] = [];
|
|
719
|
+
let latestAllocatedLanes: AllocatedLane[] = [];
|
|
720
|
+
|
|
721
|
+
// Build outcomes from reconciled tasks
|
|
722
|
+
for (const task of reconciledTasks) {
|
|
723
|
+
const persistedTask = persistedState.tasks.find(t => t.taskId === task.taskId);
|
|
724
|
+
const reconnectStatus = reconnectFinalStatus.get(task.taskId);
|
|
725
|
+
const reExecuteStatus = reExecuteFinalStatus.get(task.taskId);
|
|
726
|
+
const status = task.action === "reconnect"
|
|
727
|
+
? (reconnectStatus || "running")
|
|
728
|
+
: task.action === "re-execute"
|
|
729
|
+
? (reExecuteStatus || "pending")
|
|
730
|
+
: task.liveStatus;
|
|
731
|
+
const isTerminal = status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";
|
|
732
|
+
allTaskOutcomes.push({
|
|
733
|
+
taskId: task.taskId,
|
|
734
|
+
status,
|
|
735
|
+
startTime: persistedTask?.startedAt ?? null,
|
|
736
|
+
endTime: isTerminal ? Date.now() : null,
|
|
737
|
+
exitReason: task.action === "mark-complete" ? ".DONE file found on resume"
|
|
738
|
+
: task.action === "mark-failed" ? "Session dead, no .DONE file, no worktree on resume"
|
|
739
|
+
: task.action === "reconnect"
|
|
740
|
+
? (status === "succeeded" ? "Reconnected task completed" : status === "failed" ? "Reconnected task failed" : "Reconnected to alive session")
|
|
741
|
+
: task.action === "re-execute"
|
|
742
|
+
? (status === "succeeded" ? "Re-executed task completed" : status === "failed" ? "Re-executed task failed" : "Re-executing in existing worktree")
|
|
743
|
+
: persistedTask?.exitReason ?? "",
|
|
744
|
+
sessionName: persistedTask?.sessionName ?? "",
|
|
745
|
+
doneFileFound: status === "succeeded" ? true : task.doneFileFound,
|
|
746
|
+
});
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
persistRuntimeState("resume-reconciliation", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery ?? null, repoRoot);
|
|
750
|
+
|
|
751
|
+
// ── 10. Continue wave execution ──────────────────────────────
|
|
752
|
+
// We need to execute remaining waves starting from resumeWaveIndex.
|
|
753
|
+
// For waves where some tasks are already done, we filter them out.
|
|
754
|
+
|
|
755
|
+
let preserveWorktreesForResume = false;
|
|
756
|
+
|
|
757
|
+
for (let waveIdx = resumePoint.resumeWaveIndex; waveIdx < persistedState.wavePlan.length; waveIdx++) {
|
|
758
|
+
// Check pause signal
|
|
759
|
+
if (batchState.pauseSignal.paused) {
|
|
760
|
+
batchState.phase = "paused";
|
|
761
|
+
persistRuntimeState("pause-before-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
762
|
+
onNotify(`⏸️ Batch paused before wave ${waveIdx + 1}.`, "warning");
|
|
763
|
+
break;
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
batchState.currentWaveIndex = waveIdx;
|
|
767
|
+
persistRuntimeState("wave-index-change", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
768
|
+
|
|
769
|
+
// Get wave tasks, filtering out completed/failed/blocked ones.
|
|
770
|
+
let waveTasks = persistedState.wavePlan[waveIdx].filter(
|
|
771
|
+
taskId => !completedTaskSet.has(taskId) &&
|
|
772
|
+
!failedTaskSet.has(taskId) &&
|
|
773
|
+
!batchState.blockedTaskIds.has(taskId),
|
|
774
|
+
);
|
|
775
|
+
|
|
776
|
+
// Also filter tasks where discovery doesn't have them as pending
|
|
777
|
+
waveTasks = waveTasks.filter(taskId => discovery.pending.has(taskId));
|
|
778
|
+
|
|
779
|
+
const blockedInWave = persistedState.wavePlan[waveIdx].filter(
|
|
780
|
+
taskId => batchState.blockedTaskIds.has(taskId),
|
|
781
|
+
);
|
|
782
|
+
if (blockedInWave.length > 0) {
|
|
783
|
+
batchState.blockedTasks += blockedInWave.length;
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
if (waveTasks.length === 0) {
|
|
787
|
+
execLog("resume", batchState.batchId, `wave ${waveIdx + 1}: no tasks to execute (all completed/blocked)`);
|
|
788
|
+
continue;
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
onNotify(
|
|
792
|
+
ORCH_MESSAGES.orchWaveStart(waveIdx + 1, persistedState.wavePlan.length, waveTasks.length, Math.min(waveTasks.length, orchConfig.orchestrator.max_lanes)),
|
|
793
|
+
"info",
|
|
794
|
+
);
|
|
795
|
+
|
|
796
|
+
const handleResumeMonitorUpdate: MonitorUpdateCallback = (monitorState) => {
|
|
797
|
+
const changed = syncTaskOutcomesFromMonitor(monitorState, allTaskOutcomes);
|
|
798
|
+
if (changed) {
|
|
799
|
+
persistRuntimeState("task-transition", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
800
|
+
}
|
|
801
|
+
onMonitorUpdate?.(monitorState);
|
|
802
|
+
};
|
|
803
|
+
|
|
804
|
+
// Execute the wave
|
|
805
|
+
const waveResult = await executeWave(
|
|
806
|
+
waveTasks,
|
|
807
|
+
waveIdx + 1,
|
|
808
|
+
discovery.pending,
|
|
809
|
+
orchConfig,
|
|
810
|
+
repoRoot,
|
|
811
|
+
batchState.batchId,
|
|
812
|
+
batchState.pauseSignal,
|
|
813
|
+
depGraph,
|
|
814
|
+
batchState.baseBranch,
|
|
815
|
+
handleResumeMonitorUpdate,
|
|
816
|
+
(lanes) => {
|
|
817
|
+
latestAllocatedLanes = lanes;
|
|
818
|
+
batchState.currentLanes = lanes;
|
|
819
|
+
if (seedPendingOutcomesForAllocatedLanes(lanes, allTaskOutcomes)) {
|
|
820
|
+
persistRuntimeState("wave-lanes-allocated", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
821
|
+
}
|
|
822
|
+
},
|
|
823
|
+
);
|
|
824
|
+
|
|
825
|
+
batchState.waveResults.push(waveResult);
|
|
826
|
+
batchState.currentLanes = [];
|
|
827
|
+
|
|
828
|
+
// Accumulate task outcomes
|
|
829
|
+
latestAllocatedLanes = waveResult.allocatedLanes;
|
|
830
|
+
for (const lr of waveResult.laneResults) {
|
|
831
|
+
for (const taskOutcome of lr.tasks) {
|
|
832
|
+
upsertTaskOutcome(allTaskOutcomes, taskOutcome);
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
// Accumulate results
|
|
837
|
+
batchState.succeededTasks += waveResult.succeededTaskIds.length;
|
|
838
|
+
batchState.failedTasks += waveResult.failedTaskIds.length;
|
|
839
|
+
batchState.skippedTasks += waveResult.skippedTaskIds.length;
|
|
840
|
+
|
|
841
|
+
for (const taskId of waveResult.succeededTaskIds) {
|
|
842
|
+
completedTaskSet.add(taskId);
|
|
843
|
+
failedTaskSet.delete(taskId);
|
|
844
|
+
reconnectTaskSet.delete(taskId);
|
|
845
|
+
}
|
|
846
|
+
for (const taskId of waveResult.failedTaskIds) {
|
|
847
|
+
failedTaskSet.add(taskId);
|
|
848
|
+
completedTaskSet.delete(taskId);
|
|
849
|
+
reconnectTaskSet.delete(taskId);
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
for (const blocked of waveResult.blockedTaskIds) {
|
|
853
|
+
batchState.blockedTaskIds.add(blocked);
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
persistRuntimeState("wave-execution-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
857
|
+
|
|
858
|
+
const elapsedSec = Math.round((waveResult.endedAt - waveResult.startedAt) / 1000);
|
|
859
|
+
onNotify(
|
|
860
|
+
ORCH_MESSAGES.orchWaveComplete(
|
|
861
|
+
waveIdx + 1,
|
|
862
|
+
waveResult.succeededTaskIds.length,
|
|
863
|
+
waveResult.failedTaskIds.length,
|
|
864
|
+
waveResult.skippedTaskIds.length,
|
|
865
|
+
elapsedSec,
|
|
866
|
+
),
|
|
867
|
+
waveResult.failedTaskIds.length > 0 ? "warning" : "info",
|
|
868
|
+
);
|
|
869
|
+
|
|
870
|
+
// Check failure policy
|
|
871
|
+
if (waveResult.stoppedEarly) {
|
|
872
|
+
if (waveResult.policyApplied === "stop-all") {
|
|
873
|
+
batchState.phase = "stopped";
|
|
874
|
+
persistRuntimeState("stop-all", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
875
|
+
onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-all"), "error");
|
|
876
|
+
break;
|
|
877
|
+
}
|
|
878
|
+
if (waveResult.policyApplied === "stop-wave") {
|
|
879
|
+
batchState.phase = "stopped";
|
|
880
|
+
persistRuntimeState("stop-wave", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
881
|
+
onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-wave"), "error");
|
|
882
|
+
break;
|
|
883
|
+
}
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
// Merge handling (same as executeOrchBatch)
|
|
887
|
+
let mergeResult: MergeWaveResult | null = null;
|
|
888
|
+
|
|
889
|
+
const laneOutcomeByNumber = new Map<number, LaneExecutionResult>();
|
|
890
|
+
for (const lr of waveResult.laneResults) {
|
|
891
|
+
laneOutcomeByNumber.set(lr.laneNumber, lr);
|
|
892
|
+
}
|
|
893
|
+
const mixedOutcomeLanes = waveResult.laneResults.filter(lr => {
|
|
894
|
+
const hasSucceeded = lr.tasks.some(t => t.status === "succeeded");
|
|
895
|
+
const hasHardFailure = lr.tasks.some(
|
|
896
|
+
t => t.status === "failed" || t.status === "stalled",
|
|
897
|
+
);
|
|
898
|
+
return hasSucceeded && hasHardFailure;
|
|
899
|
+
});
|
|
900
|
+
|
|
901
|
+
if (waveResult.succeededTaskIds.length > 0) {
|
|
902
|
+
const mergeableLaneCount = waveResult.allocatedLanes.filter(lane => {
|
|
903
|
+
const outcome = laneOutcomeByNumber.get(lane.laneNumber);
|
|
904
|
+
if (!outcome) return false;
|
|
905
|
+
const hasSucceeded = outcome.tasks.some(t => t.status === "succeeded");
|
|
906
|
+
const hasHardFailure = outcome.tasks.some(
|
|
907
|
+
t => t.status === "failed" || t.status === "stalled",
|
|
908
|
+
);
|
|
909
|
+
return hasSucceeded && !hasHardFailure;
|
|
910
|
+
}).length;
|
|
911
|
+
|
|
912
|
+
if (mergeableLaneCount > 0) {
|
|
913
|
+
batchState.phase = "merging";
|
|
914
|
+
persistRuntimeState("merge-start", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
915
|
+
onNotify(ORCH_MESSAGES.orchMergeStart(waveIdx + 1, mergeableLaneCount), "info");
|
|
916
|
+
|
|
917
|
+
mergeResult = mergeWave(
|
|
918
|
+
waveResult.allocatedLanes,
|
|
919
|
+
waveResult,
|
|
920
|
+
waveIdx + 1,
|
|
921
|
+
orchConfig,
|
|
922
|
+
repoRoot,
|
|
923
|
+
batchState.batchId,
|
|
924
|
+
batchState.baseBranch,
|
|
925
|
+
);
|
|
926
|
+
batchState.mergeResults.push(mergeResult);
|
|
927
|
+
|
|
928
|
+
// Emit per-lane merge notifications
|
|
929
|
+
for (const lr of mergeResult.laneResults) {
|
|
930
|
+
const durationSec = Math.round(lr.durationMs / 1000);
|
|
931
|
+
if (lr.result?.status === "SUCCESS") {
|
|
932
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneSuccess(lr.laneNumber, lr.result.merge_commit, durationSec), "info");
|
|
933
|
+
} else if (lr.result?.status === "CONFLICT_RESOLVED") {
|
|
934
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneConflictResolved(lr.laneNumber, lr.result.conflicts.length, durationSec), "info");
|
|
935
|
+
} else if (lr.result?.status === "CONFLICT_UNRESOLVED" || lr.result?.status === "BUILD_FAILURE") {
|
|
936
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error || lr.result.status), "error");
|
|
937
|
+
} else if (lr.error) {
|
|
938
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error), "error");
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
if (mixedOutcomeLanes.length > 0) {
|
|
943
|
+
const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
|
|
944
|
+
const failureReason =
|
|
945
|
+
`Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
|
|
946
|
+
`Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`;
|
|
947
|
+
mergeResult = { ...mergeResult, status: "partial", failedLane: mixedOutcomeLanes[0].laneNumber, failureReason };
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
const mergedCount = mergeResult.laneResults.filter(
|
|
951
|
+
r => r.result?.status === "SUCCESS" || r.result?.status === "CONFLICT_RESOLVED",
|
|
952
|
+
).length;
|
|
953
|
+
const mergeTotalSec = Math.round(mergeResult.totalDurationMs / 1000);
|
|
954
|
+
|
|
955
|
+
if (mergeResult.status === "succeeded") {
|
|
956
|
+
onNotify(ORCH_MESSAGES.orchMergeComplete(waveIdx + 1, mergedCount, mergeTotalSec), "info");
|
|
957
|
+
} else {
|
|
958
|
+
onNotify(
|
|
959
|
+
ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane ?? 0, mergeResult.failureReason || "unknown"),
|
|
960
|
+
"error",
|
|
961
|
+
);
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
batchState.phase = "executing";
|
|
965
|
+
persistRuntimeState("merge-complete", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
966
|
+
} else if (mixedOutcomeLanes.length > 0) {
|
|
967
|
+
const mixedIds = mixedOutcomeLanes.map(l => `lane-${l.laneNumber}`).join(", ");
|
|
968
|
+
mergeResult = {
|
|
969
|
+
waveIndex: waveIdx + 1,
|
|
970
|
+
status: "partial",
|
|
971
|
+
laneResults: [],
|
|
972
|
+
failedLane: mixedOutcomeLanes[0].laneNumber,
|
|
973
|
+
failureReason:
|
|
974
|
+
`Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
|
|
975
|
+
`Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`,
|
|
976
|
+
totalDurationMs: 0,
|
|
977
|
+
};
|
|
978
|
+
onNotify(
|
|
979
|
+
ORCH_MESSAGES.orchMergeFailed(waveIdx + 1, mergeResult.failedLane, mergeResult.failureReason || "unknown"),
|
|
980
|
+
"error",
|
|
981
|
+
);
|
|
982
|
+
} else {
|
|
983
|
+
onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
|
|
984
|
+
}
|
|
985
|
+
} else {
|
|
986
|
+
onNotify(ORCH_MESSAGES.orchMergeSkipped(waveIdx + 1), "info");
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
// Handle merge failure
|
|
990
|
+
if (mergeResult && (mergeResult.status === "failed" || mergeResult.status === "partial")) {
|
|
991
|
+
const mergeFailurePolicy = orchConfig.failure.on_merge_failure;
|
|
992
|
+
|
|
993
|
+
if (mergeFailurePolicy === "pause") {
|
|
994
|
+
batchState.phase = "paused";
|
|
995
|
+
batchState.errors.push(
|
|
996
|
+
`Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
|
|
997
|
+
`Batch paused. Resolve conflicts and use /orch-resume to continue.`,
|
|
998
|
+
);
|
|
999
|
+
persistRuntimeState("merge-failure-pause", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
1000
|
+
onNotify(
|
|
1001
|
+
`⏸️ Batch paused due to merge failure at wave ${waveIdx + 1}. ` +
|
|
1002
|
+
`Resolve conflicts and resume.`,
|
|
1003
|
+
"error",
|
|
1004
|
+
);
|
|
1005
|
+
preserveWorktreesForResume = true;
|
|
1006
|
+
break;
|
|
1007
|
+
} else {
|
|
1008
|
+
batchState.phase = "stopped";
|
|
1009
|
+
batchState.errors.push(
|
|
1010
|
+
`Merge failed at wave ${waveIdx + 1}: ${mergeResult.failureReason || "unknown"}. ` +
|
|
1011
|
+
`Batch aborted by on_merge_failure policy.`,
|
|
1012
|
+
);
|
|
1013
|
+
persistRuntimeState("merge-failure-abort", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
1014
|
+
onNotify(
|
|
1015
|
+
`⛔ Batch aborted due to merge failure at wave ${waveIdx + 1}.`,
|
|
1016
|
+
"error",
|
|
1017
|
+
);
|
|
1018
|
+
preserveWorktreesForResume = true;
|
|
1019
|
+
break;
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
|
|
1023
|
+
// Post-merge: reset worktrees for next wave
|
|
1024
|
+
if (mergeResult && mergeResult.status === "succeeded") {
|
|
1025
|
+
const targetBranch = batchState.baseBranch;
|
|
1026
|
+
for (const lr of mergeResult.laneResults) {
|
|
1027
|
+
if (lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED") {
|
|
1028
|
+
const ancestorCheck = runGit(["merge-base", "--is-ancestor", lr.sourceBranch, targetBranch], repoRoot);
|
|
1029
|
+
if (ancestorCheck.ok) {
|
|
1030
|
+
deleteBranchBestEffort(lr.sourceBranch, repoRoot);
|
|
1031
|
+
}
|
|
1032
|
+
}
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
|
|
1036
|
+
if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
|
|
1037
|
+
const wtPrefix = orchConfig.orchestrator.worktree_prefix;
|
|
1038
|
+
const existingWorktrees = listWorktrees(wtPrefix, repoRoot);
|
|
1039
|
+
if (existingWorktrees.length > 0) {
|
|
1040
|
+
const targetBranch = batchState.baseBranch;
|
|
1041
|
+
for (const wt of existingWorktrees) {
|
|
1042
|
+
const resetResult = safeResetWorktree(wt, targetBranch, repoRoot);
|
|
1043
|
+
if (!resetResult.success) {
|
|
1044
|
+
try { removeWorktree(wt, repoRoot); } catch { /* best effort */ }
|
|
1045
|
+
}
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
// ── 11. Cleanup and terminal state ───────────────────────────
|
|
1052
|
+
if (!preserveWorktreesForResume) {
|
|
1053
|
+
const wtPrefix = orchConfig.orchestrator.worktree_prefix;
|
|
1054
|
+
const targetBranch = batchState.baseBranch;
|
|
1055
|
+
removeAllWorktrees(wtPrefix, repoRoot, targetBranch);
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
batchState.endedAt = Date.now();
|
|
1059
|
+
const totalElapsedSec = Math.round((batchState.endedAt - batchState.startedAt) / 1000);
|
|
1060
|
+
|
|
1061
|
+
if ((batchState.phase as OrchBatchPhase) === "executing" || (batchState.phase as OrchBatchPhase) === "merging") {
|
|
1062
|
+
if (batchState.failedTasks > 0) {
|
|
1063
|
+
batchState.phase = "failed";
|
|
1064
|
+
} else {
|
|
1065
|
+
batchState.phase = "completed";
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
persistRuntimeState("batch-terminal", batchState, wavePlan, latestAllocatedLanes, allTaskOutcomes, discovery, repoRoot);
|
|
1070
|
+
|
|
1071
|
+
if (batchState.phase === "paused" || batchState.phase === "stopped") {
|
|
1072
|
+
execLog("resume", batchState.batchId, "resumed batch ended in non-terminal state", { phase: batchState.phase });
|
|
1073
|
+
} else {
|
|
1074
|
+
onNotify(
|
|
1075
|
+
ORCH_MESSAGES.resumeComplete(
|
|
1076
|
+
batchState.batchId,
|
|
1077
|
+
batchState.succeededTasks,
|
|
1078
|
+
batchState.failedTasks,
|
|
1079
|
+
batchState.skippedTasks,
|
|
1080
|
+
batchState.blockedTasks,
|
|
1081
|
+
totalElapsedSec,
|
|
1082
|
+
),
|
|
1083
|
+
batchState.failedTasks > 0 ? "warning" : "info",
|
|
1084
|
+
);
|
|
1085
|
+
|
|
1086
|
+
if (batchState.phase === "completed") {
|
|
1087
|
+
try {
|
|
1088
|
+
deleteBatchState(repoRoot);
|
|
1089
|
+
execLog("state", batchState.batchId, "state file deleted on clean resume completion");
|
|
1090
|
+
} catch {
|
|
1091
|
+
// Best-effort
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
|