taskplane 0.0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +2 -20
  3. package/bin/taskplane.mjs +706 -0
  4. package/dashboard/public/app.js +900 -0
  5. package/dashboard/public/index.html +92 -0
  6. package/dashboard/public/style.css +924 -0
  7. package/dashboard/server.cjs +531 -0
  8. package/extensions/task-orchestrator.ts +28 -0
  9. package/extensions/task-runner.ts +1923 -0
  10. package/extensions/taskplane/abort.ts +466 -0
  11. package/extensions/taskplane/config.ts +102 -0
  12. package/extensions/taskplane/discovery.ts +988 -0
  13. package/extensions/taskplane/engine.ts +758 -0
  14. package/extensions/taskplane/execution.ts +1752 -0
  15. package/extensions/taskplane/extension.ts +577 -0
  16. package/extensions/taskplane/formatting.ts +718 -0
  17. package/extensions/taskplane/git.ts +38 -0
  18. package/extensions/taskplane/index.ts +22 -0
  19. package/extensions/taskplane/merge.ts +795 -0
  20. package/extensions/taskplane/messages.ts +134 -0
  21. package/extensions/taskplane/persistence.ts +1121 -0
  22. package/extensions/taskplane/resume.ts +1092 -0
  23. package/extensions/taskplane/sessions.ts +92 -0
  24. package/extensions/taskplane/types.ts +1514 -0
  25. package/extensions/taskplane/waves.ts +900 -0
  26. package/extensions/taskplane/worktree.ts +1624 -0
  27. package/package.json +50 -4
  28. package/skills/create-taskplane-task/SKILL.md +326 -0
  29. package/skills/create-taskplane-task/references/context-template.md +78 -0
  30. package/skills/create-taskplane-task/references/prompt-template.md +246 -0
  31. package/templates/agents/task-merger.md +256 -0
  32. package/templates/agents/task-reviewer.md +81 -0
  33. package/templates/agents/task-worker.md +140 -0
  34. package/templates/config/task-orchestrator.yaml +89 -0
  35. package/templates/config/task-runner.yaml +99 -0
  36. package/templates/tasks/CONTEXT.md +31 -0
  37. package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +90 -0
  38. package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
@@ -0,0 +1,1752 @@
1
+ /**
2
+ * Lane execution, monitoring, wave execution loop
3
+ * @module orch/execution
4
+ */
5
+ import { readFileSync, existsSync, statSync, unlinkSync, mkdirSync } from "fs";
6
+ import { spawnSync } from "child_process";
7
+ import { join, dirname, resolve, delimiter as pathDelimiter } from "path";
8
+
9
+ import { DONE_GRACE_MS, EXECUTION_POLL_INTERVAL_MS, ExecutionError, SESSION_SPAWN_RETRY_MAX } from "./types.ts";
10
+ import type { AllocatedLane, AllocatedTask, DependencyGraph, LaneExecutionResult, LaneMonitorSnapshot, LaneTaskOutcome, LaneTaskStatus, MonitorState, MtimeTracker, OrchestratorConfig, ParsedTask, TaskMonitorSnapshot, WaveExecutionResult } from "./types.ts";
11
+ import { allocateLanes } from "./waves.ts";
12
+
13
+ // ── Execution Helpers ────────────────────────────────────────────────
14
+
15
/**
 * Write one structured lane-execution log line to stderr.
 *
 * Output shape: `[orch] {laneId}/{taskId}: {message}`. When `extra` is
 * supplied, its entries are appended as ` (key=value key=value ...)` in
 * insertion order.
 *
 * Only identifiers and paths are expected here — do not pass PII.
 *
 * @param laneId  - Lane identifier used as the first correlation field
 * @param taskId  - Task identifier used as the second correlation field
 * @param message - Human-readable message
 * @param extra   - Optional correlation fields rendered as `key=value` pairs
 */
export function execLog(
	laneId: string,
	taskId: string,
	message: string,
	extra?: Record<string, string | number | boolean>,
): void {
	const head = `[orch] ${laneId}/${taskId}: ${message}`;

	if (!extra) {
		console.error(head);
		return;
	}

	const pairs: string[] = [];
	for (const [key, value] of Object.entries(extra)) {
		pairs.push(`${key}=${value}`);
	}
	console.error(`${head} (${pairs.join(" ")})`);
}
39
+
40
/**
 * Report whether a TMUX session with the given name currently exists.
 *
 * Runs `tmux has-session -t <sessionName>` and treats exit status 0 as
 * "exists". Any other outcome (non-zero exit, or no exit status at all
 * because the process could not be run) is reported as `false`.
 *
 * @param sessionName - TMUX session name to check
 * @returns true if the session exists
 */
export function tmuxHasSession(sessionName: string): boolean {
	const { status } = spawnSync("tmux", ["has-session", "-t", sessionName]);
	return status === 0;
}
50
+
51
/**
 * Ensure a TMUX session is gone.
 *
 * Idempotent: a session that is already absent counts as success. When the
 * session is alive, `tmux kill-session` is issued and the result is decided
 * by re-checking liveness rather than by the kill command's exit status.
 *
 * @param sessionName - TMUX session name to kill
 * @returns true if the session is absent when this function returns
 */
export function tmuxKillSession(sessionName: string): boolean {
	if (tmuxHasSession(sessionName)) {
		spawnSync("tmux", ["kill-session", "-t", sessionName]);
		// Success means "no longer present", whatever kill-session reported.
		return !tmuxHasSession(sessionName);
	}

	// Nothing to kill — already absent.
	return true;
}
71
+
72
/**
 * Tear down a lane's TMUX session together with its child sessions.
 *
 * Child sessions are named by convention:
 * - `{sessionName}-worker`
 * - `{sessionName}-reviewer`
 *
 * Children are killed before the parent lane session.
 *
 * @param sessionName - Base lane session name (e.g., "orch-lane-1")
 */
export function killLaneAndChildren(sessionName: string): void {
	const killOrder = [
		`${sessionName}-worker`,
		`${sessionName}-reviewer`,
		sessionName, // parent goes last
	];
	for (const target of killOrder) {
		tmuxKillSession(target);
	}
}
88
+
89
/**
 * Assemble the environment variables handed to a lane's task-runner session.
 *
 * Variables produced (in this order):
 * - TASK_AUTOSTART: PROMPT.md path relative to the repo root, which is the
 *   same relative path inside the worktree because worktrees mirror the repo
 * - TASK_RUNNER_SPAWN_MODE: always "subprocess"
 * - TASK_RUNNER_TMUX_PREFIX: the lane's TMUX session name
 * - ORCH_SIDECAR_DIR: `<repoRoot>/.pi`
 * - NODE_PATH: `<repoRoot>/node_modules` followed by any inherited NODE_PATH
 *   entries, de-duplicated
 * - TERM: forced to "xterm-256color"
 *
 * @param lane - The allocated lane (provides the TMUX session name)
 * @param promptPath - Absolute path to the task's PROMPT.md in the main repo
 * @param repoRoot - Absolute path to the main repository root
 * @returns Map of env var name → value
 */
export function buildLaneEnvVars(
	lane: AllocatedLane,
	promptPath: string,
	repoRoot: string,
): Record<string, string> {
	const toForwardSlashes = (p: string): string => resolve(p).replace(/\\/g, "/");

	const rootPrefix = `${toForwardSlashes(repoRoot)}/`;
	const absolutePrompt = toForwardSlashes(promptPath);

	// Strip the repo-root prefix when the prompt lives inside the repo;
	// otherwise pass the path through untouched (not expected in normal use).
	const autostartPath = absolutePrompt.startsWith(rootPrefix)
		? absolutePrompt.slice(rootPrefix.length)
		: promptPath;

	const inheritedNodePath = process.env.NODE_PATH
		? process.env.NODE_PATH.split(pathDelimiter).filter(Boolean)
		: [];
	const uniqueNodePath = new Set<string>([join(repoRoot, "node_modules"), ...inheritedNodePath]);

	return {
		TASK_AUTOSTART: autostartPath,
		TASK_RUNNER_SPAWN_MODE: "subprocess",
		TASK_RUNNER_TMUX_PREFIX: lane.tmuxSessionName,
		ORCH_SIDECAR_DIR: join(repoRoot, ".pi"),
		NODE_PATH: Array.from(uniqueNodePath).join(pathDelimiter),
		// Pi's TUI (ink/react) hangs silently with TERM=tmux-256color (tmux default).
		// Force xterm-256color so pi can render and start execution.
		TERM: "xterm-256color",
	};
}
141
+
142
/**
 * Turn an absolute path into the POSIX-style form tmux expects.
 *
 * The path is resolved, backslashes become forward slashes, and a leading
 * Windows drive (`C:/...`) is rewritten to the MSYS/Git Bash form (`/c/...`).
 * Paths without a drive prefix are returned with only the slash
 * normalization applied.
 *
 * Why: handing `C:\...` to tmux under Git Bash/MSYS can silently fall back
 * to HOME, which breaks TASK_AUTOSTART path resolution.
 *
 * @param pathValue - Path to convert
 * @returns tmux-friendly path
 */
export function toTmuxPath(pathValue: string): string {
	const forward = resolve(pathValue).replace(/\\/g, "/");
	const drive = /^([A-Za-z]):\/(.*)$/.exec(forward);
	return drive ? `/${drive[1].toLowerCase()}/${drive[2]}` : forward;
}
157
+
158
/**
 * Build the `tmux new-session` argument list for spawning a lane.
 *
 * The final argument is a single shell command string that:
 * 1. `cd`s into the worktree (converted with toTmuxPath), then
 * 2. runs `pi --no-session -e <repoRoot>/extensions/task-runner.ts` with the
 *    given env vars set inline (`KEY=value KEY=value pi ...`).
 *
 * Shell escaping: every env value and path is passed through a quoting
 * helper. A value is left bare only when it consists entirely of characters
 * known to be inert in a POSIX shell; anything else — including the empty
 * string and glob/meta characters such as `[`, `]`, `^`, `*` — is wrapped in
 * single quotes with embedded single quotes escaped as `'\''`.
 *
 * @param sessionName - TMUX session name (e.g., "orch-lane-1")
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to main repo (for extension absolute path)
 * @param envVars - Environment variables to set
 * @param laneLogPath - Accepted for interface compatibility; currently unused
 *   because lane output is deliberately not redirected (see note in body)
 * @returns Array of arguments for spawnSync("tmux", args)
 */
export function buildTmuxSpawnArgs(
	sessionName: string,
	worktreePath: string,
	repoRoot: string,
	envVars: Record<string, string>,
	laneLogPath?: string,
): string[] {
	// Whitelist of characters that never need quoting in a POSIX shell word.
	// A whitelist is used (rather than a blacklist of "dangerous" characters)
	// so that unanticipated metacharacters and the empty string are quoted.
	const SAFE_SHELL_WORD = /^[A-Za-z0-9_@%+=:,.\/-]+$/;

	// Shell-quote a value for safe embedding in a command string.
	const shellQuote = (s: string): string =>
		SAFE_SHELL_WORD.test(s) ? s : `'${s.replace(/'/g, "'\\''")}'`;

	// Format: ENV_VAR1=value1 ENV_VAR2=value2 pi --no-session -e <ext path>
	const envParts = Object.entries(envVars)
		.map(([key, val]) => `${key}=${shellQuote(val)}`)
		.join(" ");

	const taskRunnerExtPath = join(resolve(repoRoot), "extensions", "task-runner.ts");

	// NOTE: Do not redirect lane output here. Shell redirection has proven
	// fragile across Windows + tmux environments and can prevent session spawn.
	// Diagnostics use tmux pane capture + STATUS tail in pollUntilTaskComplete().
	// This is why `laneLogPath` is intentionally not referenced.
	const piCommand = `${envParts} pi --no-session -e ${shellQuote(taskRunnerExtPath)}`;

	const wrappedCommand = `cd ${shellQuote(toTmuxPath(worktreePath))} && ${piCommand}`;

	return [
		"new-session", "-d",
		"-s", sessionName,
		wrappedCommand,
	];
}
214
+
215
/**
 * Compute the absolute lane session log path for one task execution.
 *
 * The log lives at `<worktree>/.pi/orch-logs/<session>-<taskId>.log`, so
 * per-lane execution artifacts stay next to the task state in the worktree.
 *
 * @param lane - Allocated lane (provides worktree path and session name)
 * @param task - Allocated task (provides the task ID)
 * @returns Path to the lane log file under the lane worktree
 */
export function resolveLaneLogPath(
	lane: AllocatedLane,
	task: AllocatedTask,
): string {
	const fileName = `${lane.tmuxSessionName}-${task.taskId}.log`;
	return join(lane.worktreePath, ".pi", "orch-logs", fileName);
}
227
+
228
/**
 * Compute the lane log path relative to the worktree root, using forward
 * slashes only.
 *
 * Intended for use inside the tmux shell command: a relative path avoids
 * Windows drive-letter parsing issues in shell redirection.
 *
 * @param lane - Allocated lane (provides the session name)
 * @param task - Allocated task (provides the task ID)
 * @returns `.pi/orch-logs/<session>-<taskId>.log`
 */
export function resolveLaneLogRelativePath(
	lane: AllocatedLane,
	task: AllocatedTask,
): string {
	const fileName = `${lane.tmuxSessionName}-${task.taskId}.log`;
	const nativeRelative = join(".pi", "orch-logs", fileName);
	// Normalize any backslashes (platform separators) to forward slashes.
	return nativeRelative.split("\\").join("/");
}
239
+
240
/**
 * Read a tail snippet from a lane log file for failure diagnostics.
 *
 * The file content is CRLF-normalized and trimmed *before* the last
 * `maxLines` lines are taken, so a trailing newline (or a run of trailing
 * blank lines) does not consume the line budget. The result is then capped
 * to the last `maxChars` characters.
 *
 * Best effort: a missing or unreadable file yields an empty string.
 *
 * @param logPath  - Path to the lane log file
 * @param maxLines - Maximum number of trailing lines to keep (default 40);
 *   a non-positive value yields an empty string
 * @param maxChars - Maximum number of trailing characters to keep
 *   (default 1200); a non-positive value yields an empty string
 * @returns The trimmed tail, or "" when nothing is available
 */
export function readLaneLogTail(
	logPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	// slice(-0) would return the whole input, so handle empty budgets up front.
	if (maxLines <= 0 || maxChars <= 0) return "";
	if (!existsSync(logPath)) return "";
	try {
		const raw = readFileSync(logPath, "utf-8").replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
		if (!tail) return "";
		return tail.length > maxChars ? tail.slice(-maxChars) : tail;
	} catch {
		// Diagnostics only — never let a read failure mask the real error.
		return "";
	}
}
258
+
259
/**
 * Grab the tail of a live TMUX pane's visible output for diagnostics.
 *
 * Runs `tmux capture-pane -p -t <sessionName>` with a 3 second timeout.
 * This works even when lane log redirection is disabled (Windows-safe
 * fallback). Any failure to capture yields an empty string.
 *
 * @param sessionName - TMUX session whose pane should be captured
 * @param maxLines - Number of trailing lines to keep (default 40)
 * @param maxChars - Number of trailing characters to keep (default 1200)
 * @returns The trimmed tail, or "" when nothing could be captured
 */
export function captureTmuxPaneTail(
	sessionName: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	const capture = spawnSync("tmux", ["capture-pane", "-p", "-t", sessionName], {
		encoding: "utf-8",
		timeout: 3000,
	});
	if (capture.status !== 0) {
		return "";
	}

	const text = (capture.stdout || "").replace(/\r\n/g, "\n").trim();
	if (!text) {
		return "";
	}

	const lastLines = text.split("\n").slice(-maxLines).join("\n").trim();
	if (!lastLines) {
		return "";
	}

	if (lastLines.length > maxChars) {
		return lastLines.slice(-maxChars);
	}
	return lastLines;
}
280
+
281
/**
 * Read the tail of a task's STATUS.md for failure diagnostics.
 *
 * Content is CRLF-normalized and trimmed, reduced to its last `maxLines`
 * lines, then capped to its last `maxChars` characters. A missing or
 * unreadable file yields an empty string.
 *
 * @param statusPath - Path to the STATUS.md file
 * @param maxLines - Number of trailing lines to keep (default 40)
 * @param maxChars - Number of trailing characters to keep (default 1200)
 * @returns The trimmed tail, or "" when nothing is available
 */
export function readTaskStatusTail(
	statusPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	if (!existsSync(statusPath)) {
		return "";
	}

	let text: string;
	try {
		text = readFileSync(statusPath, "utf-8").replace(/\r\n/g, "\n").trim();
	} catch {
		// Diagnostics are best effort.
		return "";
	}
	if (!text) {
		return "";
	}

	const lastLines = text.split("\n").slice(-maxLines).join("\n").trim();
	if (!lastLines) {
		return "";
	}
	if (lastLines.length > maxChars) {
		return lastLines.slice(-maxChars);
	}
	return lastLines;
}
300
+
301
/**
 * Locate a task's `.DONE` marker inside a lane worktree.
 *
 * The task folder from ParsedTask is an absolute path in the main repo; it
 * is re-rooted onto the worktree by stripping the repo-root prefix. Lookup
 * order:
 * 1. `<worktree>/<relative task folder>/.DONE`
 * 2. `<worktree>/<parent of task folder>/archive/<task dir>/.DONE` — the
 *    worker may have archived the folder during its delivery step
 *
 * When neither exists, the first (primary) path is returned so the caller
 * can keep checking it with existsSync.
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @returns Absolute path to the .DONE file in the worktree
 */
export function resolveTaskDonePath(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
): string {
	const rootNorm = resolve(repoRoot).replace(/\\/g, "/");
	const folderNorm = resolve(taskFolder).replace(/\\/g, "/");

	// Folder relative to the repo root; falls back to the raw input when the
	// folder is not under the repo root.
	const relFolder = folderNorm.startsWith(`${rootNorm}/`)
		? folderNorm.slice(rootNorm.length + 1)
		: taskFolder;

	const inPlace = join(worktreePath, relFolder, ".DONE");
	if (existsSync(inPlace)) {
		return inPlace;
	}

	// Archive sibling: .../<parent>/archive/<task dir>/.DONE
	const segments = relFolder.replace(/\\/g, "/").split("/");
	const taskDir = segments[segments.length - 1]; // e.g. "PM-011-template-seed-data-permissions"
	const parentSegments = segments.slice(0, -1); // e.g. [..., "tasks"]
	const archived = join(worktreePath, ...parentSegments, "archive", taskDir, ".DONE");

	return existsSync(archived) ? archived : inPlace;
}
341
+
342
/**
 * Spawn a TMUX session for a task in a lane.
 *
 * Handles:
 * - Worktree existence pre-check
 * - Fresh per-attempt lane log location (best effort)
 * - Stale session cleanup (kill if session name already exists)
 * - Retry on transient spawn failures (up to SESSION_SPAWN_RETRY_MAX retries,
 *   with a linear back-off of `attempt` seconds between attempts)
 * - Structured logging
 *
 * Delays are implemented in-process with `Atomics.wait` rather than by
 * spawning an external `sleep` binary, so the pauses actually happen on
 * platforms where no `sleep` command is available (e.g. plain Windows shells).
 *
 * @param lane - Allocated lane with worktree and session info
 * @param task - Task to execute
 * @param config - Orchestrator configuration (not consulted by this function)
 * @param repoRoot - Main repository root
 * @throws ExecutionError `EXEC_WORKTREE_MISSING` if the worktree path does not exist
 * @throws ExecutionError `EXEC_SPAWN_FAILED` if spawn fails after all attempts
 */
export function spawnLaneSession(
	lane: AllocatedLane,
	task: AllocatedTask,
	config: OrchestratorConfig,
	repoRoot: string,
): void {
	// Block the current thread for `ms` milliseconds without any child process.
	// The wait always times out because nothing ever notifies this private buffer.
	const sleepSync = (ms: number): void => {
		Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
	};

	const sessionName = lane.tmuxSessionName;
	const laneId = lane.laneId;

	execLog(laneId, task.taskId, "preparing to spawn TMUX session", {
		session: sessionName,
		worktree: lane.worktreePath,
		worktreeTmuxPath: toTmuxPath(lane.worktreePath),
		logPath: resolveLaneLogPath(lane, task),
	});

	// Pre-check: worktree exists
	if (!existsSync(lane.worktreePath)) {
		throw new ExecutionError(
			"EXEC_WORKTREE_MISSING",
			`Worktree path does not exist: ${lane.worktreePath}`,
			laneId,
			task.taskId,
		);
	}

	// Build env vars
	const envVars = buildLaneEnvVars(lane, task.task.promptPath, repoRoot);

	// Prepare per-task lane log path for post-mortem diagnostics
	const laneLogPath = resolveLaneLogPath(lane, task);
	const laneLogRelativePath = resolveLaneLogRelativePath(lane, task);
	try {
		mkdirSync(dirname(laneLogPath), { recursive: true });
		if (existsSync(laneLogPath)) {
			unlinkSync(laneLogPath); // fresh log per task attempt
		}
	} catch {
		// Best effort — session can still run without log file setup
	}

	// Build tmux args
	const tmuxArgs = buildTmuxSpawnArgs(sessionName, lane.worktreePath, repoRoot, envVars, laneLogRelativePath);

	// Clean up stale session if exists
	if (tmuxHasSession(sessionName)) {
		execLog(laneId, task.taskId, "killing stale TMUX session", { session: sessionName });
		killLaneAndChildren(sessionName);
		// Brief pause to let tmux clean up
		sleepSync(500);
	}

	// Attempt to spawn with retry: one initial attempt plus SESSION_SPAWN_RETRY_MAX retries
	const maxAttempts = SESSION_SPAWN_RETRY_MAX + 1;
	let lastError = "";
	for (let attempt = 1; attempt <= maxAttempts; attempt++) {
		const result = spawnSync("tmux", tmuxArgs);

		if (result.status === 0) {
			execLog(laneId, task.taskId, "TMUX session spawned successfully", {
				session: sessionName,
				attempt,
			});
			return;
		}

		lastError = result.stderr?.toString().trim() || "unknown spawn error";
		execLog(laneId, task.taskId, `spawn attempt ${attempt} failed: ${lastError}`, {
			session: sessionName,
		});

		if (attempt < maxAttempts) {
			// Linear back-off before the next attempt: 1s, 2s, ...
			sleepSync(attempt * 1000);
		}
	}

	throw new ExecutionError(
		"EXEC_SPAWN_FAILED",
		`Failed to create TMUX session '${sessionName}' after ${maxAttempts} attempts. Last error: ${lastError}`,
		laneId,
		task.taskId,
	);
}
440
+
441
/**
 * Poll until a task completes (or fails).
 *
 * Completion detection logic:
 * 1. Check for .DONE file → task succeeded (highest priority)
 * 2. Check TMUX session liveness via `tmux has-session`
 * 3. If session exits without .DONE → wait DONE_GRACE_MS (slow disk flush)
 * 4. After grace period, if still no .DONE → task failed
 *
 * Terminal-state precedence: .DONE found at any point = success,
 * regardless of session state.
 *
 * The .DONE location is re-resolved on every check rather than once up
 * front: the worker may move the task folder under `archive/` after polling
 * has started, and resolveTaskDonePath only finds the archived marker if it
 * is consulted at check time.
 *
 * @param lane - Allocated lane
 * @param task - Task being executed
 * @param config - Orchestrator configuration (not consulted by this function)
 * @param repoRoot - Main repository root
 * @param pauseSignal - Checked each poll cycle; if true, returns early with "skipped"
 * @returns Final status, a human-readable exit reason, and whether .DONE was found
 */
export async function pollUntilTaskComplete(
	lane: AllocatedLane,
	task: AllocatedTask,
	config: OrchestratorConfig,
	repoRoot: string,
	pauseSignal: { paused: boolean },
): Promise<{ status: LaneTaskStatus; exitReason: string; doneFileFound: boolean }> {
	const sessionName = lane.tmuxSessionName;
	const laneId = lane.laneId;

	// Resolves to the primary path, or the archived path if the marker lives there.
	const locateDone = (): string =>
		resolveTaskDonePath(task.task.taskFolder, lane.worktreePath, repoRoot);
	// True when a .DONE marker exists at either the primary or archived location.
	const doneExists = (): boolean => existsSync(locateDone());

	const donePath = locateDone();
	const statusPath = join(dirname(donePath), "STATUS.md");
	const laneLogPath = resolveLaneLogPath(lane, task);

	execLog(laneId, task.taskId, "polling for completion", {
		session: sessionName,
		donePath,
		statusPath,
		logPath: laneLogPath,
	});

	let lastPaneTail = "";

	// Main polling loop
	while (true) {
		// Check pause signal
		if (pauseSignal.paused) {
			execLog(laneId, task.taskId, "pause signal detected during poll");
			// Don't kill the session — let the current task-runner checkpoint
			// The calling code will handle marking as skipped
			return {
				status: "skipped",
				exitReason: "Paused by user (/orch-pause)",
				doneFileFound: false,
			};
		}

		// Capture live pane output for diagnostics (best effort).
		// Kept from the last successful capture because the pane is gone
		// once the session has exited.
		const paneTail = captureTmuxPaneTail(sessionName);
		if (paneTail) {
			lastPaneTail = paneTail;
		}

		// Priority 1: Check for .DONE file
		if (doneExists()) {
			execLog(laneId, task.taskId, ".DONE file found — task succeeded", {
				session: sessionName,
			});
			return {
				status: "succeeded",
				exitReason: ".DONE file created by task-runner",
				doneFileFound: true,
			};
		}

		// Priority 2: Check if TMUX session is still alive
		if (!tmuxHasSession(sessionName)) {
			// Session exited — start grace period for .DONE file
			execLog(laneId, task.taskId, "TMUX session exited, entering grace period", {
				session: sessionName,
				graceMs: DONE_GRACE_MS,
			});

			// Grace period: poll .DONE file at short intervals
			const graceStart = Date.now();
			while (Date.now() - graceStart < DONE_GRACE_MS) {
				await new Promise((r) => setTimeout(r, 500));

				if (doneExists()) {
					execLog(laneId, task.taskId, ".DONE file found during grace period — task succeeded", {
						session: sessionName,
					});
					return {
						status: "succeeded",
						exitReason: ".DONE file created (found during grace period)",
						doneFileFound: true,
					};
				}
			}

			// Grace period expired without .DONE → task failed
			const logTail = readLaneLogTail(laneLogPath);
			execLog(laneId, task.taskId, "grace period expired without .DONE — task failed", {
				session: sessionName,
				logPath: laneLogPath,
			});
			if (logTail) {
				execLog(laneId, task.taskId, `lane session output (tail):\n${logTail}`);
			}
			const statusTail = readTaskStatusTail(statusPath);
			const hasLogFile = existsSync(laneLogPath);
			// Prefer the log file, then the last pane capture, then STATUS.md.
			const outputForHint = logTail || lastPaneTail || statusTail;
			const logHint = outputForHint
				? ` Last output: ${outputForHint.replace(/\s+/g, " ").slice(-300)}`
				: "";
			const logLocation = hasLogFile ? ` Lane log: ${laneLogPath}.` : "";
			if (!logTail && lastPaneTail) {
				execLog(laneId, task.taskId, `lane session output from TMUX pane (tail):\n${lastPaneTail}`);
			}
			if (statusTail) {
				execLog(laneId, task.taskId, `task STATUS tail:\n${statusTail}`);
			}
			return {
				status: "failed",
				exitReason:
					`TMUX session '${sessionName}' exited without creating .DONE file ` +
					`(grace period ${DONE_GRACE_MS}ms expired).` +
					`${logLocation}${logHint}`,
				doneFileFound: false,
			};
		}

		// Session alive, no .DONE yet — keep polling
		await new Promise((r) => setTimeout(r, EXECUTION_POLL_INTERVAL_MS));
	}
}
575
+
576
/**
 * Run every task assigned to a lane, one after another.
 *
 * Per task (in lane order):
 * 1. Spawn a TMUX session for the task
 * 2. Poll until the task completes (or fails)
 * 3. Record the outcome
 * 4. On failure, stall, or pause, mark every remaining task as skipped
 *
 * The lane keeps one worktree and one TMUX session name for all its tasks;
 * each task gets a fresh session under that name.
 *
 * Overall status:
 * - "succeeded": no failures and every task in the lane succeeded
 * - "partial": at least one failure and at least one success
 * - "failed": everything else (including lanes cut short by a pause)
 *
 * Sessions are never killed here: a failed task's session, if still alive,
 * is left in place.
 *
 * @param lane - Fully allocated lane
 * @param config - Orchestrator configuration
 * @param repoRoot - Main repository root
 * @param pauseSignal - Shared signal for pause/abort (checked between tasks)
 * @returns LaneExecutionResult with per-task outcomes
 */
export async function executeLane(
	lane: AllocatedLane,
	config: OrchestratorConfig,
	repoRoot: string,
	pauseSignal: { paused: boolean },
): Promise<LaneExecutionResult> {
	const laneId = lane.laneId;
	const laneStartTime = Date.now();
	const outcomes: LaneTaskOutcome[] = [];
	let haltLane = false;

	execLog(laneId, "LANE", `starting execution of ${lane.tasks.length} task(s)`, {
		worktree: lane.worktreePath,
		session: lane.tmuxSessionName,
	});

	for (const task of lane.tasks) {
		// Anything after a failure or a pause is recorded as skipped, not run.
		if (haltLane || pauseSignal.paused) {
			const reason = pauseSignal.paused
				? "Skipped due to pause signal"
				: "Skipped due to prior task failure in lane";
			execLog(laneId, task.taskId, reason);
			outcomes.push({
				taskId: task.taskId,
				status: "skipped",
				startTime: null,
				endTime: null,
				exitReason: reason,
				sessionName: lane.tmuxSessionName,
				doneFileFound: false,
			});
			continue;
		}

		const taskStartTime = Date.now();
		let taskOutcome: LaneTaskOutcome;

		try {
			spawnLaneSession(lane, task, config, repoRoot);

			const pollResult = await pollUntilTaskComplete(
				lane,
				task,
				config,
				repoRoot,
				pauseSignal,
			);

			taskOutcome = {
				taskId: task.taskId,
				status: pollResult.status,
				startTime: taskStartTime,
				endTime: Date.now(),
				exitReason: pollResult.exitReason,
				sessionName: lane.tmuxSessionName,
				doneFileFound: pollResult.doneFileFound,
			};

			// Failure, stall, or a pause seen while polling all stop the lane.
			switch (pollResult.status) {
				case "failed":
				case "stalled":
				case "skipped":
					haltLane = true;
					break;
				default:
					break;
			}
		} catch (err: unknown) {
			// Spawn or polling error
			const errMsg = err instanceof Error ? err.message : String(err);
			execLog(laneId, task.taskId, `execution error: ${errMsg}`);

			taskOutcome = {
				taskId: task.taskId,
				status: "failed",
				startTime: taskStartTime,
				endTime: Date.now(),
				exitReason: errMsg,
				sessionName: lane.tmuxSessionName,
				doneFileFound: false,
			};
			haltLane = true;
		}

		const finishedAt = taskOutcome.endTime || Date.now();
		const elapsed = Math.round((finishedAt - taskStartTime) / 1000);
		execLog(laneId, task.taskId, `task ${taskOutcome.status}`, {
			elapsed: `${elapsed}s`,
			doneFile: taskOutcome.doneFileFound,
		});

		outcomes.push(taskOutcome);
	}

	const laneEndTime = Date.now();

	// Tally outcomes in one pass; "stalled" counts as a failure.
	let succeededCount = 0;
	let failedCount = 0;
	let skippedCount = 0;
	for (const outcome of outcomes) {
		if (outcome.status === "succeeded") {
			succeededCount += 1;
		} else if (outcome.status === "failed" || outcome.status === "stalled") {
			failedCount += 1;
		} else if (outcome.status === "skipped") {
			skippedCount += 1;
		}
	}

	let overallStatus: LaneExecutionResult["overallStatus"] = "failed";
	if (failedCount === 0 && succeededCount === lane.tasks.length) {
		overallStatus = "succeeded";
	} else if (failedCount > 0 && succeededCount > 0) {
		overallStatus = "partial";
	}

	const totalElapsed = Math.round((laneEndTime - laneStartTime) / 1000);
	execLog(laneId, "LANE", `execution complete: ${overallStatus}`, {
		succeeded: succeededCount,
		failed: failedCount,
		skipped: skippedCount,
		elapsed: `${totalElapsed}s`,
	});

	return {
		laneNumber: lane.laneNumber,
		laneId: lane.laneId,
		tasks: outcomes,
		overallStatus,
		startTime: laneStartTime,
		endTime: laneEndTime,
	};
}
727
+
728
+
729
+ // ── STATUS.md Parsing for Worktree ───────────────────────────────────
730
+
731
/**
 * Snapshot of a task's STATUS.md as read from a lane worktree.
 *
 * Produced for monitoring purposes; it mirrors the fields task-runner's own
 * STATUS.md parser extracts, without importing that parser.
 */
export interface ParsedWorktreeStatus {
	/** One entry per `### Step N: name` section, in file order */
	steps: Array<{
		/** Step number taken from the heading */
		number: number;
		/** Step name taken from the heading */
		name: string;
		/** Progress state derived from the step's Status line */
		status: "not-started" | "in-progress" | "complete";
		/** Count of checked checkboxes in the step */
		totalChecked: number;
		/** Count of all checkboxes in the step */
		totalItems: number;
	}>;
	/** Value of the "Review Counter" field in STATUS.md */
	reviewCounter: number;
	/** Value of the "Iteration" field in STATUS.md */
	iteration: number;
	/** Modification time of the STATUS.md file (epoch ms) */
	mtime: number;
}
753
+
754
/**
 * Locate and parse a task's STATUS.md as it exists inside a lane worktree.
 *
 * Path handling:
 * - `taskFolder` and `repoRoot` are resolved and compared with forward
 *   slashes. When the task folder lies under the repo root, the remainder is
 *   re-rooted under `worktreePath`; otherwise `taskFolder` is joined onto
 *   `worktreePath` as given.
 * - If STATUS.md is not at that location, `<parent>/archive/<taskDir>/STATUS.md`
 *   is tried before giving up.
 *
 * Parsing is line based (CRLF is normalised to LF first):
 * - `**Review Counter:** N` and `**Iteration:** N` set the counters (last match wins).
 * - `### Step N: Name` opens a new step with status "not-started".
 * - Inside a step, a `**Status:**` line switches the step to "complete" or
 *   "in-progress" based on emoji / keyword, and `- [ ]` / `- [x]` lines are
 *   counted as checkboxes.
 *
 * @param taskFolder - Task folder path as known in the main repository
 * @param worktreePath - Path to the lane worktree
 * @param repoRoot - Path to the main repository root
 * @returns `{ parsed, error: null }` on success, or `{ parsed: null, error }`
 *          when STATUS.md is missing or cannot be read
 */
export function parseWorktreeStatusMd(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
): { parsed: ParsedWorktreeStatus | null; error: string | null } {
	const toForwardSlashes = (p: string): string => p.replace(/\\/g, "/");

	// Re-root the task folder from the main repo into the worktree.
	const rootPrefix = toForwardSlashes(resolve(repoRoot)) + "/";
	const folderAbs = toForwardSlashes(resolve(taskFolder));
	const relativePath = folderAbs.startsWith(rootPrefix)
		? folderAbs.slice(rootPrefix.length)
		: taskFolder;

	let statusPath = join(worktreePath, relativePath, "STATUS.md");
	if (!existsSync(statusPath)) {
		// The task folder may have been moved under a sibling "archive" directory.
		const segments = toForwardSlashes(relativePath).split("/");
		const taskDirName = segments[segments.length - 1];
		const archivedPath = join(
			worktreePath,
			...segments.slice(0, -1),
			"archive",
			taskDirName,
			"STATUS.md",
		);
		if (!existsSync(archivedPath)) {
			return { parsed: null, error: `STATUS.md not found at ${statusPath}` };
		}
		statusPath = archivedPath;
	}

	let content: string;
	let mtime: number;
	try {
		content = readFileSync(statusPath, "utf-8");
		mtime = statSync(statusPath).mtimeMs;
	} catch (err: unknown) {
		const detail = err instanceof Error ? err.message : String(err);
		return { parsed: null, error: `Cannot read STATUS.md: ${detail}` };
	}

	// Steps are accumulated as drafts (with raw checkbox flags) and
	// summarised once the whole file has been scanned.
	type StepDraft = {
		number: number;
		name: string;
		status: "not-started" | "in-progress" | "complete";
		checkboxes: boolean[];
	};
	const drafts: StepDraft[] = [];
	let reviewCounter = 0;
	let iteration = 0;

	for (const line of content.replace(/\r\n/g, "\n").split("\n")) {
		const reviewHit = /\*\*Review Counter:\*\*\s*(\d+)/.exec(line);
		if (reviewHit) reviewCounter = parseInt(reviewHit[1], 10);

		const iterationHit = /\*\*Iteration:\*\*\s*(\d+)/.exec(line);
		if (iterationHit) iteration = parseInt(iterationHit[1], 10);

		const heading = /^###\s+Step\s+(\d+):\s*(.+)/.exec(line);
		if (heading) {
			drafts.push({
				number: parseInt(heading[1], 10),
				name: heading[2].trim(),
				status: "not-started",
				checkboxes: [],
			});
			continue;
		}

		// Status and checkbox lines only count once a step heading has been seen.
		const active = drafts.length > 0 ? drafts[drafts.length - 1] : null;
		if (!active) continue;

		const statusHit = /\*\*Status:\*\*\s*(.*)/.exec(line);
		if (statusHit) {
			const value = statusHit[1];
			const lowered = value.toLowerCase();
			if (value.includes("✅") || lowered.includes("complete")) {
				active.status = "complete";
			} else if (value.includes("🟨") || value.includes("🟡") || lowered.includes("progress")) {
				active.status = "in-progress";
			}
		}

		const checkboxHit = /^\s*-\s*\[([ xX])\]\s*(.*)/.exec(line);
		if (checkboxHit) {
			active.checkboxes.push(checkboxHit[1].toLowerCase() === "x");
		}
	}

	const steps: ParsedWorktreeStatus["steps"] = drafts.map(draft => ({
		number: draft.number,
		name: draft.name,
		status: draft.status,
		totalChecked: draft.checkboxes.filter(Boolean).length,
		totalItems: draft.checkboxes.length,
	}));

	return {
		parsed: { steps, reviewCounter, iteration, mtime },
		error: null,
	};
}
875
+
876
+
877
+ // ── State Resolution ─────────────────────────────────────────────────
878
+
879
/**
 * Combine liveness, completion and progress signals into one task snapshot.
 *
 * Signals read: `tmux` session liveness (queried first), existence of the
 * `.DONE` file, and the parsed STATUS.md (if any). When STATUS.md was parsed,
 * `tracker` is updated: the first sighting or any mtime change records the
 * new mtime and clears the stall timer; an unchanged mtime starts the stall
 * timer at `now` if it is not already running.
 *
 * Resolution order as implemented (first match wins):
 *   1. `.DONE` exists                                  → "succeeded"
 *   2. session alive, STATUS.md seen at least once, and the stall timer has
 *      been running for >= `stallTimeoutMs`            → "stalled"
 *      (side effect: the session and its children are killed)
 *   3. session not alive                               → "failed"
 *   4. otherwise (session alive)                       → "running"
 *
 * These four are the only statuses this function returns. A session that is
 * alive but has not produced a readable STATUS.md yet is reported as
 * "running" and cannot stall, because the stall timer only starts after
 * STATUS.md has been observed.
 *
 * @param taskId - Task identifier
 * @param donePath - Path to the task's `.DONE` file
 * @param sessionName - TMUX session name for the lane
 * @param statusResult - Result of parsing STATUS.md (`parsed` may be null)
 * @param tracker - Per-task mtime tracker; mutated by this call
 * @param stallTimeoutMs - Stall threshold in milliseconds
 * @param now - Current time (epoch ms), injected for deterministic tests
 * @returns Snapshot describing the task at `now`
 */
export function resolveTaskMonitorState(
	taskId: string,
	donePath: string,
	sessionName: string,
	statusResult: { parsed: ParsedWorktreeStatus | null; error: string | null },
	tracker: MtimeTracker,
	stallTimeoutMs: number,
	now: number,
): TaskMonitorSnapshot {
	const sessionAlive = tmuxHasSession(sessionName);
	const doneFileFound = existsSync(donePath);

	// ── Progress summary from STATUS.md ──────────────────────────
	const parsed = statusResult.parsed;
	const parseError = statusResult.error;
	let currentStepName: string | null = null;
	let currentStepNumber: number | null = null;
	let totalSteps = 0;
	let totalChecked = 0;
	let totalItems = 0;
	let iteration = 0;
	let reviewCounter = 0;

	if (parsed) {
		const steps = parsed.steps;
		totalSteps = steps.length;
		iteration = parsed.iteration;
		reviewCounter = parsed.reviewCounter;
		totalChecked = steps.reduce((sum, step) => sum + step.totalChecked, 0);
		totalItems = steps.reduce((sum, step) => sum + step.totalItems, 0);

		// Current step: first in-progress, else first not-started,
		// else the last step (everything complete).
		const focus =
			steps.find(step => step.status === "in-progress") ??
			steps.find(step => step.status === "not-started") ??
			(steps.length > 0 ? steps[steps.length - 1] : undefined);
		if (focus) {
			currentStepName = focus.name;
			currentStepNumber = focus.number;
		}

		// ── Stall bookkeeping ────────────────────────────────────
		const firstSighting = !tracker.statusFileSeenOnce;
		if (firstSighting || parsed.mtime !== tracker.lastMtime) {
			// New file or new mtime: treat as progress and clear the timer.
			tracker.statusFileSeenOnce = true;
			tracker.lastMtime = parsed.mtime;
			tracker.stallTimerStart = null;
		} else if (tracker.stallTimerStart === null) {
			// Unchanged since last poll: start timing the stall.
			tracker.stallTimerStart = now;
		}
	}

	// Every outcome shares the same progress fields; only these four vary.
	const snapshot = (
		status: TaskMonitorSnapshot["status"],
		alive: boolean,
		done: boolean,
		stallReason: string | null,
	): TaskMonitorSnapshot => ({
		taskId,
		status,
		currentStepName,
		currentStepNumber,
		totalSteps,
		totalChecked,
		totalItems,
		sessionAlive: alive,
		doneFileFound: done,
		stallReason,
		lastHeartbeat: tracker.lastMtime,
		observedAt: now,
		parseError,
		iteration,
		reviewCounter,
	});

	// ── 1: .DONE wins over everything ────────────────────────────
	if (doneFileFound) {
		return snapshot("succeeded", sessionAlive, true, null);
	}

	// ── 2: live session whose STATUS.md stopped changing ─────────
	const stallStart = tracker.stallTimerStart;
	if (
		sessionAlive &&
		tracker.statusFileSeenOnce &&
		stallStart !== null &&
		now - stallStart >= stallTimeoutMs
	) {
		const stallMinutes = Math.round((now - stallStart) / 60_000);
		const stallReason = `STATUS.md unchanged for ${stallMinutes} minutes (threshold: ${Math.round(stallTimeoutMs / 60_000)} min)`;

		execLog("monitor", taskId, `stall detected — killing session`, {
			session: sessionName,
			stallMinutes,
		});
		killLaneAndChildren(sessionName);

		// Reported as not alive because the session was just killed.
		return snapshot("stalled", false, false, stallReason);
	}

	// ── 3: session gone without .DONE ────────────────────────────
	if (!sessionAlive) {
		return snapshot("failed", false, false, null);
	}

	// ── 4: session alive, no stall ───────────────────────────────
	return snapshot("running", true, false, null);
}
1065
+
1066
+
1067
+ // ── Core Monitor Loop ────────────────────────────────────────────────
1068
+
1069
/**
 * Signature of the observer that receives a fresh `MonitorState` snapshot
 * from the monitoring loop (used for dashboard updates). The return value
 * is ignored.
 */
export type MonitorUpdateCallback = (state: MonitorState) => void;
1073
+
1074
/**
 * Poll all lanes of a wave until every task is terminal or a pause is requested.
 *
 * Each poll walks every lane's tasks in order. Tasks already recorded as
 * terminal are tallied from cache; the first non-terminal task is evaluated
 * with `resolveTaskMonitorState()` using:
 *   - STATUS.md in the lane worktree (step / checkbox progress, mtime)
 *   - the task's `.DONE` file
 *   - liveness of the lane's TMUX session
 * If that task turns out to be terminal, the walk continues with the next
 * task in the same poll; otherwise it becomes the lane's current task and
 * the rest are reported as remaining.
 *
 * This function never spawns sessions. The only session side effect comes
 * from `resolveTaskMonitorState()`, which kills a session it classifies as
 * stalled.
 *
 * Exit conditions:
 *   - all lanes terminal → returns the snapshot of that poll (`allTerminal: true`)
 *   - `pauseSignal.paused` observed at the top of a poll → returns a snapshot
 *     built from the terminal results gathered so far (`allTerminal: false`,
 *     no current task, every non-terminal task listed as remaining)
 *
 * @param lanes - Allocated lanes being executed
 * @param config - Orchestrator configuration (`monitoring.poll_interval` in
 *                 seconds, default 5; `failure.stall_timeout` in minutes, default 30)
 * @param repoRoot - Main repository root
 * @param pauseSignal - Shared signal; monitoring stops once `paused` is true
 * @param waveNumber - Wave number copied into the returned state
 * @param onUpdate - Optional callback invoked once per poll; exceptions it
 *                   throws are swallowed so they cannot stop monitoring
 * @returns Final MonitorState snapshot when monitoring completes or is paused
 */
export async function monitorLanes(
	lanes: AllocatedLane[],
	config: OrchestratorConfig,
	repoRoot: string,
	pauseSignal: { paused: boolean },
	waveNumber: number = 1,
	onUpdate?: MonitorUpdateCallback,
): Promise<MonitorState> {
	// `||` (not `??`) on purpose: a configured 0 falls back to the default
	// instead of producing a zero-delay poll loop or an instant stall.
	const pollIntervalMs = (config.monitoring.poll_interval || 5) * 1000;
	const stallTimeoutMs = (config.failure.stall_timeout || 30) * 60_000;

	// One tracker per taskId, so a lane advancing to its next task starts
	// with fresh stall-detection state.
	const mtimeTrackers = new Map<string, MtimeTracker>();

	function getOrCreateTracker(taskId: string, now: number): MtimeTracker {
		let tracker = mtimeTrackers.get(taskId);
		if (!tracker) {
			tracker = {
				taskId,
				firstObservedAt: now,
				statusFileSeenOnce: false,
				lastMtime: null,
				stallTimerStart: null,
			};
			mtimeTrackers.set(taskId, tracker);
		}
		return tracker;
	}

	// Terminal snapshots are cached so finished tasks are never re-evaluated.
	const terminalTasks = new Map<string, TaskMonitorSnapshot>();

	let pollCount = 0;
	let lastMonitorStateKey = "";

	const tasksTotal = lanes.reduce((sum, lane) => sum + lane.tasks.length, 0);

	execLog("monitor", "ALL", `starting monitoring for ${lanes.length} lane(s), ${tasksTotal} task(s)`, {
		pollIntervalMs,
		stallTimeoutMin: Math.round(stallTimeoutMs / 60_000),
	});

	while (true) {
		const now = Date.now();
		pollCount++;

		if (pauseSignal.paused) {
			execLog("monitor", "ALL", "pause signal detected — stopping monitoring");
			break;
		}

		const laneSnapshots: LaneMonitorSnapshot[] = [];
		let totalDone = 0;
		let totalFailed = 0;
		let allTerminal = true;

		for (const lane of lanes) {
			const completedTasks: string[] = [];
			const failedTasks: string[] = [];
			const remainingTasks: string[] = [];
			let currentTaskId: string | null = null;
			let currentTaskSnapshot: TaskMonitorSnapshot | null = null;

			for (let i = 0; i < lane.tasks.length; i++) {
				const task = lane.tasks[i];

				// Already terminal from an earlier poll: tally and move on.
				const knownTerminal = terminalTasks.get(task.taskId);
				if (knownTerminal) {
					if (knownTerminal.status === "succeeded") {
						completedTasks.push(task.taskId);
						totalDone++;
					} else {
						failedTasks.push(task.taskId);
						totalFailed++;
					}
					continue;
				}

				// First non-terminal task in the lane: observe it now.
				const tracker = getOrCreateTracker(task.taskId, now);
				const donePath = resolveTaskDonePath(task.task.taskFolder, lane.worktreePath, repoRoot);
				const statusResult = parseWorktreeStatusMd(task.task.taskFolder, lane.worktreePath, repoRoot);

				const snapshot = resolveTaskMonitorState(
					task.taskId,
					donePath,
					lane.tmuxSessionName,
					statusResult,
					tracker,
					stallTimeoutMs,
					now,
				);

				if (snapshot.status === "succeeded" || snapshot.status === "failed" || snapshot.status === "stalled") {
					// Just became terminal: cache it and keep walking so the
					// next task in this lane is evaluated in the same poll.
					terminalTasks.set(task.taskId, snapshot);
					if (snapshot.status === "succeeded") {
						completedTasks.push(task.taskId);
						totalDone++;
					} else {
						failedTasks.push(task.taskId);
						totalFailed++;
					}
					continue;
				}

				// Still in flight: this is the lane's current task and
				// everything after it is remaining.
				currentTaskId = task.taskId;
				currentTaskSnapshot = snapshot;
				allTerminal = false;
				for (let j = i + 1; j < lane.tasks.length; j++) {
					remainingTasks.push(lane.tasks[j].taskId);
				}
				break;
			}

			// Queried after the task walk so a stall-triggered kill is reflected.
			const sessionAlive = tmuxHasSession(lane.tmuxSessionName);

			laneSnapshots.push({
				laneId: lane.laneId,
				laneNumber: lane.laneNumber,
				sessionName: lane.tmuxSessionName,
				sessionAlive,
				currentTaskId,
				currentTaskSnapshot,
				completedTasks,
				failedTasks,
				remainingTasks,
			});
		}

		const monitorState: MonitorState = {
			lanes: laneSnapshots,
			tasksDone: totalDone,
			tasksFailed: totalFailed,
			tasksTotal,
			waveNumber,
			pollCount,
			lastPollTime: now,
			allTerminal,
		};

		if (onUpdate) {
			try {
				onUpdate(monitorState);
			} catch {
				// Deliberately best-effort: a broken observer must not stop monitoring.
			}
		}

		// Log a summary only when the done/failed tally changes — not every poll.
		const currentStateKey = `${totalDone}/${totalFailed}`;
		if (currentStateKey !== lastMonitorStateKey) {
			const activeLanes = laneSnapshots.filter(l => l.currentTaskId !== null);
			execLog("monitor", "ALL", `poll #${pollCount}: ${totalDone}/${tasksTotal} done, ${totalFailed} failed, ${activeLanes.length} active lane(s)`);
			lastMonitorStateKey = currentStateKey;
		}

		if (allTerminal) {
			execLog("monitor", "ALL", `all lanes terminal — monitoring complete`, {
				done: totalDone,
				failed: totalFailed,
				total: tasksTotal,
				polls: pollCount,
			});
			return monitorState;
		}

		await new Promise(r => setTimeout(r, pollIntervalMs));
	}

	// Paused: report what has been established so far. Tasks that reached a
	// terminal state keep their classification; all others are listed as
	// remaining (no fresh observation is made, so there is no current task).
	const now = Date.now();
	let tasksDone = 0;
	let tasksFailed = 0;
	const laneSnapshots: LaneMonitorSnapshot[] = lanes.map(lane => {
		const completedTasks: string[] = [];
		const failedTasks: string[] = [];
		const remainingTasks: string[] = [];

		for (const task of lane.tasks) {
			const terminal = terminalTasks.get(task.taskId);
			if (!terminal) {
				remainingTasks.push(task.taskId);
			} else if (terminal.status === "succeeded") {
				completedTasks.push(task.taskId);
				tasksDone++;
			} else {
				failedTasks.push(task.taskId);
				tasksFailed++;
			}
		}

		return {
			laneId: lane.laneId,
			laneNumber: lane.laneNumber,
			sessionName: lane.tmuxSessionName,
			sessionAlive: tmuxHasSession(lane.tmuxSessionName),
			currentTaskId: null,
			currentTaskSnapshot: null,
			completedTasks,
			failedTasks,
			remainingTasks,
		};
	});

	return {
		lanes: laneSnapshots,
		tasksDone,
		tasksFailed,
		tasksTotal,
		waveNumber,
		pollCount,
		lastPollTime: now,
		allTerminal: false,
	};
}
1328
+
1329
+
1330
+ // ── Transitive Dependent Computation ─────────────────────────────────
1331
+
1332
/**
 * Collect every task that is transitively blocked by a set of failed tasks.
 *
 * Performs a breadth-first walk over `dependencyGraph.dependents`
 * (task → tasks that depend on it), seeded with the failed task IDs.
 * Dependents of each visited task are expanded in sorted order so the
 * traversal — and therefore the insertion order of the result — is
 * deterministic.
 *
 * Example: A failed, B depends on A, C depends on B → result is {B, C}.
 *
 * @param failedTaskIds - Task IDs that failed (never included in the result)
 * @param dependencyGraph - Graph exposing a `dependents` map
 * @returns Task IDs blocked downstream of the failures
 */
export function computeTransitiveDependents(
	failedTaskIds: Set<string>,
	dependencyGraph: DependencyGraph,
): Set<string> {
	const blocked = new Set<string>();

	// `frontier` doubles as the BFS queue; `cursor` marks the next entry to expand.
	const frontier: string[] = Array.from(failedTaskIds);
	let cursor = 0;

	while (cursor < frontier.length) {
		const current = frontier[cursor++];
		const downstream = dependencyGraph.dependents.get(current) || [];

		for (const candidate of [...downstream].sort()) {
			// Skip tasks already recorded and tasks that failed themselves.
			if (blocked.has(candidate) || failedTaskIds.has(candidate)) continue;
			blocked.add(candidate);
			frontier.push(candidate); // expand its dependents later
		}
	}

	return blocked;
}
1372
+
1373
+
1374
+ // ── Wave Execution Core ──────────────────────────────────────────────
1375
+
1376
/**
 * Execute a single wave: allocate lanes, run them in parallel, monitor, apply failure policy.
 *
 * Flow:
 *   1. Allocate lanes via `allocateLanes()`. On allocation failure the wave
 *      returns immediately with every wave task reported as failed.
 *   2. Start `executeLane()` for every lane and `monitorLanes()` alongside them.
 *   3. Wait for all lane promises to settle (via `executeWithStopAll()` for
 *      the stop-all policy, `Promise.allSettled` otherwise).
 *   4. Wait for the monitor to finish and collect its final state.
 *   5. Classify task outcomes and build the WaveExecutionResult.
 *
 * Failure policy as applied by this function:
 *   - **skip-dependents**: the wave runs to completion; transitive dependents
 *     of failed/stalled tasks are returned in `blockedTaskIds`.
 *   - **stop-wave**: once all lanes have settled, `pauseSignal.paused` is set
 *     if any task failed or stalled (`stoppedEarly = true`).
 *   - **stop-all**: delegated to `executeWithStopAll()`; a wave with any
 *     failed/stalled task gets overall status "aborted".
 *
 * Notes:
 *   - `pauseSignal` is the caller's object and is shared with the lanes and
 *     the monitor; setting it here is visible to the caller.
 *   - A monitor failure is non-fatal: it is logged and `finalMonitorState`
 *     is null. The rejection handler is attached when the monitor starts so
 *     a failure while lanes are still running is never an unhandled rejection.
 *
 * @param waveTasks - Task IDs in this wave
 * @param waveIndex - Wave number (used for logging and the result)
 * @param pending - Full pending task map from discovery
 * @param config - Orchestrator configuration
 * @param repoRoot - Main repository root
 * @param batchId - Batch ID for naming
 * @param pauseSignal - Shared pause signal (may be set by the failure policy)
 * @param dependencyGraph - Dependency graph for computing transitive dependents
 * @param onMonitorUpdate - Optional callback for dashboard updates during monitoring
 * @param onLanesAllocated - Optional callback fired after lane allocation succeeds
 * @returns WaveExecutionResult with outcomes and blocked task IDs
 */
export async function executeWave(
	waveTasks: string[],
	waveIndex: number,
	pending: Map<string, ParsedTask>,
	config: OrchestratorConfig,
	repoRoot: string,
	batchId: string,
	pauseSignal: { paused: boolean },
	dependencyGraph: DependencyGraph,
	onMonitorUpdate?: MonitorUpdateCallback,
	onLanesAllocated?: (lanes: AllocatedLane[]) => void,
): Promise<WaveExecutionResult> {
	const startedAt = Date.now();
	const policy = config.failure.on_task_failure;

	execLog("wave", `W${waveIndex}`, `starting wave execution`, {
		tasks: waveTasks.length,
		policy,
		batchId,
	});

	// ── Stage 1: Allocate lanes ──────────────────────────────────
	const allocResult = allocateLanes(waveTasks, pending, config, repoRoot, batchId);

	if (!allocResult.success) {
		const errMsg = allocResult.error?.message || "Unknown allocation failure";
		execLog("wave", `W${waveIndex}`, `lane allocation failed: ${errMsg}`);

		return {
			waveIndex,
			startedAt,
			endedAt: Date.now(),
			laneResults: [],
			policyApplied: policy,
			stoppedEarly: true,
			// Nothing could run, so every task in the wave counts as failed.
			// Copied so the result does not alias the caller's array.
			failedTaskIds: [...waveTasks],
			skippedTaskIds: [],
			succeededTaskIds: [],
			blockedTaskIds: [...computeTransitiveDependents(new Set(waveTasks), dependencyGraph)],
			laneCount: 0,
			overallStatus: "failed",
			finalMonitorState: null,
			allocatedLanes: [],
		};
	}

	const lanes = allocResult.lanes;
	onLanesAllocated?.(lanes);

	execLog("wave", `W${waveIndex}`, `lanes allocated`, {
		laneCount: lanes.length,
		totalTasks: waveTasks.length,
	});

	// ── Stage 2+3: Start lanes in parallel + monitoring ──────────
	// Lanes and monitor share the caller's pauseSignal object, so a pause
	// from either side (external request or failure policy) reaches both.
	const lanePromises = lanes.map(lane =>
		executeLane(lane, config, repoRoot, pauseSignal),
	);

	// The monitor runs as a sibling loop. Its rejection handler is attached
	// right away: the promise is only awaited after every lane has settled,
	// and a rejection with no handler in that window would be reported as
	// an unhandled rejection.
	const monitorPromise: Promise<MonitorState | null> = monitorLanes(
		lanes,
		config,
		repoRoot,
		pauseSignal,
		waveIndex,
		onMonitorUpdate,
	).catch((err: unknown): null => {
		const errMsg = err instanceof Error ? err.message : String(err);
		execLog("wave", `W${waveIndex}`, `monitor promise error (non-fatal)`, { error: errMsg });
		return null;
	});

	// ── Stage 4: Wait for all lanes + apply policy ───────────────
	let laneResults: LaneExecutionResult[];

	if (policy === "stop-all") {
		// Reacts to the first lane reporting a failure by killing all sessions.
		laneResults = await executeWithStopAll(lanes, lanePromises, pauseSignal, waveIndex);
	} else {
		// skip-dependents and stop-wave: let every lane settle first.
		const settled = await Promise.allSettled(lanePromises);

		laneResults = settled.map((result, idx) => {
			if (result.status === "fulfilled") {
				return result.value;
			}
			// Rejection is not expected (executeLane is meant to catch its
			// own errors); report the whole lane as failed.
			const errMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);
			execLog("wave", `W${waveIndex}`, `lane ${lanes[idx].laneId} promise rejected: ${errMsg}`);
			return {
				laneNumber: lanes[idx].laneNumber,
				laneId: lanes[idx].laneId,
				tasks: lanes[idx].tasks.map(t => ({
					taskId: t.taskId,
					status: "failed" as LaneTaskStatus,
					startTime: null,
					endTime: null,
					exitReason: `Lane promise rejected: ${errMsg}`,
					sessionName: lanes[idx].tmuxSessionName,
					doneFileFound: false,
				})),
				overallStatus: "failed" as const,
				startTime: startedAt,
				endTime: Date.now(),
			};
		});

		// stop-wave: any failure pauses the batch so no next wave starts.
		if (policy === "stop-wave") {
			const hasFailure = laneResults.some(lr =>
				lr.tasks.some(t => t.status === "failed" || t.status === "stalled"),
			);
			if (hasFailure) {
				pauseSignal.paused = true;
				execLog("wave", `W${waveIndex}`, `stop-wave policy triggered — pausing after this wave`);
			}
		}
	}

	// The monitor ends on its own once all lanes are terminal or the pause
	// signal is set; null means it failed (already logged above).
	const finalMonitorState: MonitorState | null = await monitorPromise;

	// ── Stage 5: Build WaveExecutionResult ───────────────────────
	const failedTaskIds: string[] = [];
	const skippedTaskIds: string[] = [];
	const succeededTaskIds: string[] = [];

	for (const lr of laneResults) {
		for (const t of lr.tasks) {
			if (t.status === "succeeded") {
				succeededTaskIds.push(t.taskId);
			} else if (t.status === "failed" || t.status === "stalled") {
				failedTaskIds.push(t.taskId);
			} else if (t.status === "skipped") {
				skippedTaskIds.push(t.taskId);
			}
		}
	}

	// Sort for deterministic output
	failedTaskIds.sort();
	skippedTaskIds.sort();
	succeededTaskIds.sort();

	const hasFailures = failedTaskIds.length > 0;

	// Blocked tasks for future waves are only computed under skip-dependents.
	let blockedTaskIds: string[] = [];
	if (policy === "skip-dependents" && hasFailures) {
		const blocked = computeTransitiveDependents(
			new Set(failedTaskIds),
			dependencyGraph,
		);
		blockedTaskIds = [...blocked].sort();
		if (blockedTaskIds.length > 0) {
			execLog("wave", `W${waveIndex}`, `skip-dependents: ${blockedTaskIds.length} task(s) blocked for future waves`, {
				blocked: blockedTaskIds.join(","),
			});
		}
	}

	const stoppedEarly = hasFailures && (policy === "stop-all" || policy === "stop-wave");

	let overallStatus: WaveExecutionResult["overallStatus"];
	if (policy === "stop-all" && hasFailures) {
		overallStatus = "aborted";
	} else if (!hasFailures) {
		overallStatus = "succeeded";
	} else if (succeededTaskIds.length > 0) {
		overallStatus = "partial";
	} else {
		overallStatus = "failed";
	}

	const endedAt = Date.now();
	const elapsedSec = Math.round((endedAt - startedAt) / 1000);

	execLog("wave", `W${waveIndex}`, `wave execution complete: ${overallStatus}`, {
		succeeded: succeededTaskIds.length,
		failed: failedTaskIds.length,
		skipped: skippedTaskIds.length,
		blocked: blockedTaskIds.length,
		elapsed: `${elapsedSec}s`,
		stoppedEarly,
	});

	return {
		waveIndex,
		startedAt,
		endedAt,
		laneResults,
		policyApplied: policy,
		stoppedEarly,
		failedTaskIds,
		skippedTaskIds,
		succeededTaskIds,
		blockedTaskIds,
		laneCount: lanes.length,
		overallStatus,
		finalMonitorState,
		allocatedLanes: lanes,
	};
}
1634
+
1635
/**
 * Await already-started lanes under the stop-all failure policy.
 *
 * Each lane promise is observed individually. The first lane that either
 * resolves with a failed/stalled task or rejects triggers the abort exactly
 * once: `pauseSignal.paused` is set and every lane's TMUX session is killed.
 * The function then still waits for all lanes to settle before returning,
 * so the result always has one entry per lane, in lane order.
 *
 * Session teardown is best-effort and isolated per lane: an error while
 * killing one session is logged and does not prevent the remaining sessions
 * from being killed, nor does it replace the lane's real result.
 *
 * For logging, the triggering task within the failing lane is chosen by
 * earliest `startTime` (missing counts as 0), then by task ID.
 *
 * @param lanes - Allocated lanes (index-aligned with `lanePromises`)
 * @param lanePromises - Already-started lane execution promises
 * @param pauseSignal - Pause signal; set to paused when the abort triggers
 * @param waveIndex - Wave number for logging
 * @returns One LaneExecutionResult per lane; a rejected lane is reported
 *          with all of its tasks marked "failed"
 */
export async function executeWithStopAll(
	lanes: AllocatedLane[],
	lanePromises: Promise<LaneExecutionResult>[],
	pauseSignal: { paused: boolean },
	waveIndex: number,
): Promise<LaneExecutionResult[]> {
	const results: (LaneExecutionResult | null)[] = new Array(lanes.length).fill(null);
	let abortTriggered = false;

	// Kill every lane session. Each kill is guarded so one failure cannot
	// leave the other sessions running.
	const killAllLanes = (): void => {
		for (const lane of lanes) {
			try {
				killLaneAndChildren(lane.tmuxSessionName);
			} catch (killErr: unknown) {
				const killMsg = killErr instanceof Error ? killErr.message : String(killErr);
				execLog("wave", `W${waveIndex}`, `stop-all: failed to kill session for ${lane.laneId}: ${killMsg}`);
			}
		}
	};

	const isFailure = (t: { status: string }): boolean =>
		t.status === "failed" || t.status === "stalled";

	const wrappedPromises = lanePromises.map(async (promise, idx) => {
		let result: LaneExecutionResult;
		try {
			result = await promise;
		} catch (err: unknown) {
			// Lane promise rejection — should be rare
			const errMsg = err instanceof Error ? err.message : String(err);

			// Record a synthetic failed result first so it survives even if
			// the abort handling below throws.
			const timestamp = Date.now();
			const failedResult: LaneExecutionResult = {
				laneNumber: lanes[idx].laneNumber,
				laneId: lanes[idx].laneId,
				tasks: lanes[idx].tasks.map(t => ({
					taskId: t.taskId,
					status: "failed" as LaneTaskStatus,
					startTime: null,
					endTime: null,
					exitReason: `Lane aborted: ${errMsg}`,
					sessionName: lanes[idx].tmuxSessionName,
					doneFileFound: false,
				})),
				overallStatus: "failed",
				startTime: timestamp,
				endTime: timestamp,
			};
			results[idx] = failedResult;

			if (!abortTriggered) {
				abortTriggered = true;
				pauseSignal.paused = true;
				execLog("wave", `W${waveIndex}`, `stop-all triggered by lane error in ${lanes[idx].laneId}: ${errMsg}`);
				killAllLanes();
			}
			return failedResult;
		}

		// Record the genuine result before any abort handling, so nothing
		// that happens during teardown can replace it.
		results[idx] = result;

		if (!abortTriggered) {
			const failures = result.tasks.filter(isFailure);
			if (failures.length > 0) {
				abortTriggered = true;
				pauseSignal.paused = true;

				// Deterministic pick of the task named in the log line.
				const firstFailed = [...failures].sort((a, b) => {
					const timeA = a.startTime || 0;
					const timeB = b.startTime || 0;
					if (timeA !== timeB) return timeA - timeB;
					return a.taskId.localeCompare(b.taskId);
				})[0];

				execLog("wave", `W${waveIndex}`, `stop-all triggered by ${firstFailed?.taskId || "unknown"} in ${lanes[idx].laneId}`, {
					session: lanes[idx].tmuxSessionName,
				});

				killAllLanes();
			}
		}

		return result;
	});

	// Wait for all lanes to settle (they are expected to exit soon after
	// their sessions are killed).
	await Promise.allSettled(wrappedPromises);

	// Defensive: a slot can only still be null if a wrapper itself threw.
	return results.map((r, idx) => r || {
		laneNumber: lanes[idx].laneNumber,
		laneId: lanes[idx].laneId,
		tasks: [],
		overallStatus: "failed" as const,
		startTime: Date.now(),
		endTime: Date.now(),
	});
}
1750
+
1751
+ // ── /orch Command — Full Execution (Step 5) ─────────────────────────
1752
+