taskplane 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +2 -20
- package/bin/taskplane.mjs +706 -0
- package/dashboard/public/app.js +900 -0
- package/dashboard/public/index.html +92 -0
- package/dashboard/public/style.css +924 -0
- package/dashboard/server.cjs +531 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/extensions/task-runner.ts +1923 -0
- package/extensions/taskplane/abort.ts +466 -0
- package/extensions/taskplane/config.ts +102 -0
- package/extensions/taskplane/discovery.ts +988 -0
- package/extensions/taskplane/engine.ts +758 -0
- package/extensions/taskplane/execution.ts +1752 -0
- package/extensions/taskplane/extension.ts +577 -0
- package/extensions/taskplane/formatting.ts +718 -0
- package/extensions/taskplane/git.ts +38 -0
- package/extensions/taskplane/index.ts +22 -0
- package/extensions/taskplane/merge.ts +795 -0
- package/extensions/taskplane/messages.ts +134 -0
- package/extensions/taskplane/persistence.ts +1121 -0
- package/extensions/taskplane/resume.ts +1092 -0
- package/extensions/taskplane/sessions.ts +92 -0
- package/extensions/taskplane/types.ts +1514 -0
- package/extensions/taskplane/waves.ts +900 -0
- package/extensions/taskplane/worktree.ts +1624 -0
- package/package.json +48 -3
- package/skills/create-taskplane-task/SKILL.md +326 -0
- package/skills/create-taskplane-task/references/context-template.md +78 -0
- package/skills/create-taskplane-task/references/prompt-template.md +246 -0
- package/templates/agents/task-merger.md +256 -0
- package/templates/agents/task-reviewer.md +81 -0
- package/templates/agents/task-worker.md +140 -0
- package/templates/config/task-orchestrator.yaml +89 -0
- package/templates/config/task-runner.yaml +99 -0
- package/templates/tasks/CONTEXT.md +31 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +90 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
|
@@ -0,0 +1,1752 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lane execution, monitoring, wave execution loop
|
|
3
|
+
* @module orch/execution
|
|
4
|
+
*/
|
|
5
|
+
import { readFileSync, existsSync, statSync, unlinkSync, mkdirSync } from "fs";
|
|
6
|
+
import { spawnSync } from "child_process";
|
|
7
|
+
import { join, dirname, resolve, delimiter as pathDelimiter } from "path";
|
|
8
|
+
|
|
9
|
+
import { DONE_GRACE_MS, EXECUTION_POLL_INTERVAL_MS, ExecutionError, SESSION_SPAWN_RETRY_MAX } from "./types.ts";
|
|
10
|
+
import type { AllocatedLane, AllocatedTask, DependencyGraph, LaneExecutionResult, LaneMonitorSnapshot, LaneTaskOutcome, LaneTaskStatus, MonitorState, MtimeTracker, OrchestratorConfig, ParsedTask, TaskMonitorSnapshot, WaveExecutionResult } from "./types.ts";
|
|
11
|
+
import { allocateLanes } from "./waves.ts";
|
|
12
|
+
|
|
13
|
+
// ── Execution Helpers ────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Structured log helper for lane execution.
|
|
17
|
+
*
|
|
18
|
+
* All execution logs go to stderr (same pattern as task-runner.ts).
|
|
19
|
+
* Format: [orch] {laneId}/{taskId}: {message}
|
|
20
|
+
* Correlation fields: batchId, laneId, taskId, sessionName.
|
|
21
|
+
* No PII — only IDs and paths.
|
|
22
|
+
*/
|
|
23
|
+
export function execLog(
|
|
24
|
+
laneId: string,
|
|
25
|
+
taskId: string,
|
|
26
|
+
message: string,
|
|
27
|
+
extra?: Record<string, string | number | boolean>,
|
|
28
|
+
): void {
|
|
29
|
+
const prefix = `[orch] ${laneId}/${taskId}`;
|
|
30
|
+
if (extra) {
|
|
31
|
+
const fields = Object.entries(extra)
|
|
32
|
+
.map(([k, v]) => `${k}=${v}`)
|
|
33
|
+
.join(" ");
|
|
34
|
+
console.error(`${prefix}: ${message} (${fields})`);
|
|
35
|
+
} else {
|
|
36
|
+
console.error(`${prefix}: ${message}`);
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Check if a TMUX session exists (is alive).
|
|
42
|
+
*
|
|
43
|
+
* @param sessionName - TMUX session name to check
|
|
44
|
+
* @returns true if session exists
|
|
45
|
+
*/
|
|
46
|
+
export function tmuxHasSession(sessionName: string): boolean {
|
|
47
|
+
const result = spawnSync("tmux", ["has-session", "-t", sessionName]);
|
|
48
|
+
return result.status === 0;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Kill a TMUX session if it exists.
|
|
53
|
+
*
|
|
54
|
+
* Idempotent: returns true if session was killed or was already absent.
|
|
55
|
+
*
|
|
56
|
+
* @param sessionName - TMUX session name to kill
|
|
57
|
+
* @returns true if session is now absent
|
|
58
|
+
*/
|
|
59
|
+
export function tmuxKillSession(sessionName: string): boolean {
|
|
60
|
+
// Check liveness first so we can distinguish "already gone" from "kill failed".
|
|
61
|
+
const wasAlive = tmuxHasSession(sessionName);
|
|
62
|
+
if (!wasAlive) {
|
|
63
|
+
return true; // Already absent
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
spawnSync("tmux", ["kill-session", "-t", sessionName]);
|
|
67
|
+
|
|
68
|
+
// Consider success only if the session is now absent.
|
|
69
|
+
return !tmuxHasSession(sessionName);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Kill a lane session and its child sessions (worker, reviewer).
|
|
74
|
+
*
|
|
75
|
+
* Child session names follow the convention:
|
|
76
|
+
* - `{sessionName}-worker`
|
|
77
|
+
* - `{sessionName}-reviewer`
|
|
78
|
+
*
|
|
79
|
+
* @param sessionName - Base lane session name (e.g., "orch-lane-1")
|
|
80
|
+
*/
|
|
81
|
+
export function killLaneAndChildren(sessionName: string): void {
|
|
82
|
+
// Kill children first (they depend on the parent context)
|
|
83
|
+
tmuxKillSession(`${sessionName}-worker`);
|
|
84
|
+
tmuxKillSession(`${sessionName}-reviewer`);
|
|
85
|
+
// Then kill the parent lane session
|
|
86
|
+
tmuxKillSession(sessionName);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Build environment variables for a lane task execution.
|
|
91
|
+
*
|
|
92
|
+
* These env vars tell the task-runner extension inside the TMUX session
|
|
93
|
+
* how to behave:
|
|
94
|
+
* - TASK_AUTOSTART: relative path to PROMPT.md from worktree root
|
|
95
|
+
* - TASK_RUNNER_SPAWN_MODE: "tmux" for TMUX-based worker/reviewer spawning
|
|
96
|
+
* - TASK_RUNNER_TMUX_PREFIX: prefix for worker/reviewer session names
|
|
97
|
+
*
|
|
98
|
+
* @param lane - The allocated lane (provides session name and worktree path)
|
|
99
|
+
* @param taskId - Task ID for logging
|
|
100
|
+
* @param promptPath - Absolute path to the task's PROMPT.md in the main repo
|
|
101
|
+
* @param repoRoot - Absolute path to the main repository root
|
|
102
|
+
* @returns Map of env var name → value
|
|
103
|
+
*/
|
|
104
|
+
export function buildLaneEnvVars(
|
|
105
|
+
lane: AllocatedLane,
|
|
106
|
+
promptPath: string,
|
|
107
|
+
repoRoot: string,
|
|
108
|
+
): Record<string, string> {
|
|
109
|
+
// TASK_AUTOSTART needs a path relative to the worktree root.
|
|
110
|
+
// The promptPath is absolute (from the main repo). We need the
|
|
111
|
+
// relative portion from the repo root, which will be the same
|
|
112
|
+
// relative path in the worktree since worktrees mirror the repo structure.
|
|
113
|
+
const repoRootNorm = resolve(repoRoot).replace(/\\/g, "/");
|
|
114
|
+
const promptNorm = resolve(promptPath).replace(/\\/g, "/");
|
|
115
|
+
|
|
116
|
+
let relativePath: string;
|
|
117
|
+
if (promptNorm.startsWith(repoRootNorm + "/")) {
|
|
118
|
+
relativePath = promptNorm.slice(repoRootNorm.length + 1);
|
|
119
|
+
} else {
|
|
120
|
+
// Fallback: use the path as-is (shouldn't happen in normal use)
|
|
121
|
+
relativePath = promptPath;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
const nodePathEntries: string[] = [join(repoRoot, "node_modules")];
|
|
125
|
+
if (process.env.NODE_PATH) {
|
|
126
|
+
nodePathEntries.push(...process.env.NODE_PATH.split(pathDelimiter).filter(Boolean));
|
|
127
|
+
}
|
|
128
|
+
const nodePath = [...new Set(nodePathEntries)].join(pathDelimiter);
|
|
129
|
+
|
|
130
|
+
return {
|
|
131
|
+
TASK_AUTOSTART: relativePath,
|
|
132
|
+
TASK_RUNNER_SPAWN_MODE: "subprocess",
|
|
133
|
+
TASK_RUNNER_TMUX_PREFIX: lane.tmuxSessionName,
|
|
134
|
+
ORCH_SIDECAR_DIR: join(repoRoot, ".pi"),
|
|
135
|
+
NODE_PATH: nodePath,
|
|
136
|
+
// Pi's TUI (ink/react) hangs silently with TERM=tmux-256color (tmux default).
|
|
137
|
+
// Force xterm-256color so pi can render and start execution.
|
|
138
|
+
TERM: "xterm-256color",
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Convert a Windows absolute path to a tmux-friendly POSIX-style path.
|
|
144
|
+
*
|
|
145
|
+
* tmux `-c` expects POSIX paths when running under Git Bash/MSYS.
|
|
146
|
+
* Passing `C:\...` can silently fall back to HOME, causing TASK_AUTOSTART
|
|
147
|
+
* path resolution failures.
|
|
148
|
+
*/
|
|
149
|
+
export function toTmuxPath(pathValue: string): string {
|
|
150
|
+
const normalized = resolve(pathValue).replace(/\\/g, "/");
|
|
151
|
+
const driveMatch = normalized.match(/^([A-Za-z]):\/(.*)$/);
|
|
152
|
+
if (driveMatch) {
|
|
153
|
+
return `/${driveMatch[1].toLowerCase()}/${driveMatch[2]}`;
|
|
154
|
+
}
|
|
155
|
+
return normalized;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Build the tmux new-session command for spawning a lane.
|
|
160
|
+
*
|
|
161
|
+
* Constructs a properly escaped command that:
|
|
162
|
+
* 1. Sets env vars (TASK_AUTOSTART, TASK_RUNNER_SPAWN_MODE, TASK_RUNNER_TMUX_PREFIX)
|
|
163
|
+
* 2. Runs `pi --no-session -e extensions/task-runner.ts` in the worktree directory
|
|
164
|
+
*
|
|
165
|
+
* Shell escaping: env var values are single-quoted to prevent expansion.
|
|
166
|
+
* Path args are single-quoted to handle spaces and special characters.
|
|
167
|
+
*
|
|
168
|
+
* @param sessionName - TMUX session name (e.g., "orch-lane-1")
|
|
169
|
+
* @param worktreePath - Absolute path to the lane worktree
|
|
170
|
+
* @param repoRoot - Absolute path to main repo (for extension absolute path)
|
|
171
|
+
* @param envVars - Environment variables to set
|
|
172
|
+
* @param laneLogPath - Optional path to write lane session stdout/stderr
|
|
173
|
+
* @returns Array of arguments for spawnSync("tmux", args)
|
|
174
|
+
*/
|
|
175
|
+
export function buildTmuxSpawnArgs(
|
|
176
|
+
sessionName: string,
|
|
177
|
+
worktreePath: string,
|
|
178
|
+
repoRoot: string,
|
|
179
|
+
envVars: Record<string, string>,
|
|
180
|
+
laneLogPath?: string,
|
|
181
|
+
): string[] {
|
|
182
|
+
// Shell-quote a value for safe embedding in a command string.
|
|
183
|
+
// Wraps in single quotes, escaping any internal single quotes.
|
|
184
|
+
const shellQuote = (s: string): string => {
|
|
185
|
+
if (/[\s"'`$\\!&|;()<>{}#*?~]/.test(s)) {
|
|
186
|
+
return `'${s.replace(/'/g, "'\\''")}'`;
|
|
187
|
+
}
|
|
188
|
+
return s;
|
|
189
|
+
};
|
|
190
|
+
|
|
191
|
+
// Build the command string that runs inside the TMUX session.
|
|
192
|
+
// Format: ENV_VAR1=value1 ENV_VAR2=value2 pi --no-session -e extensions/task-runner.ts
|
|
193
|
+
const envParts = Object.entries(envVars)
|
|
194
|
+
.map(([key, val]) => `${key}=${shellQuote(val)}`)
|
|
195
|
+
.join(" ");
|
|
196
|
+
|
|
197
|
+
const taskRunnerExtPath = join(resolve(repoRoot), "extensions", "task-runner.ts");
|
|
198
|
+
const basePiCommand = `${envParts} pi --no-session -e ${shellQuote(taskRunnerExtPath)}`;
|
|
199
|
+
|
|
200
|
+
// NOTE: Do not redirect lane output here. Shell redirection has proven
|
|
201
|
+
// fragile across Windows + tmux environments and can prevent session spawn.
|
|
202
|
+
// Diagnostics use tmux pane capture + STATUS tail in pollUntilTaskComplete().
|
|
203
|
+
const piCommand = basePiCommand;
|
|
204
|
+
|
|
205
|
+
const tmuxWorktreePath = toTmuxPath(worktreePath);
|
|
206
|
+
const wrappedCommand = `cd ${shellQuote(tmuxWorktreePath)} && ${piCommand}`;
|
|
207
|
+
|
|
208
|
+
return [
|
|
209
|
+
"new-session", "-d",
|
|
210
|
+
"-s", sessionName,
|
|
211
|
+
wrappedCommand,
|
|
212
|
+
];
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/**
|
|
216
|
+
* Resolve the lane session log path for a task execution.
|
|
217
|
+
*
|
|
218
|
+
* Logs are written under the lane worktree to keep per-lane execution
|
|
219
|
+
* artifacts colocated with task state and available after failures.
|
|
220
|
+
*/
|
|
221
|
+
export function resolveLaneLogPath(
|
|
222
|
+
lane: AllocatedLane,
|
|
223
|
+
task: AllocatedTask,
|
|
224
|
+
): string {
|
|
225
|
+
return join(lane.worktreePath, ".pi", "orch-logs", `${lane.tmuxSessionName}-${task.taskId}.log`);
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/**
|
|
229
|
+
* Relative lane log path used inside the tmux shell command.
|
|
230
|
+
*
|
|
231
|
+
* Relative paths avoid Windows drive-letter parsing issues in shell redirection.
|
|
232
|
+
*/
|
|
233
|
+
export function resolveLaneLogRelativePath(
|
|
234
|
+
lane: AllocatedLane,
|
|
235
|
+
task: AllocatedTask,
|
|
236
|
+
): string {
|
|
237
|
+
return join(".pi", "orch-logs", `${lane.tmuxSessionName}-${task.taskId}.log`).replace(/\\/g, "/");
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/**
|
|
241
|
+
* Read a tail snippet from a lane log file for failure diagnostics.
|
|
242
|
+
*/
|
|
243
|
+
export function readLaneLogTail(
|
|
244
|
+
logPath: string,
|
|
245
|
+
maxLines: number = 40,
|
|
246
|
+
maxChars: number = 1200,
|
|
247
|
+
): string {
|
|
248
|
+
if (!existsSync(logPath)) return "";
|
|
249
|
+
try {
|
|
250
|
+
const raw = readFileSync(logPath, "utf-8").replace(/\r\n/g, "\n");
|
|
251
|
+
const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
|
|
252
|
+
if (!tail) return "";
|
|
253
|
+
return tail.length > maxChars ? tail.slice(-maxChars) : tail;
|
|
254
|
+
} catch {
|
|
255
|
+
return "";
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Capture tail output from a live TMUX pane for diagnostics.
|
|
261
|
+
*
|
|
262
|
+
* Works even when lane log redirection is disabled (Windows-safe fallback).
|
|
263
|
+
*/
|
|
264
|
+
export function captureTmuxPaneTail(
|
|
265
|
+
sessionName: string,
|
|
266
|
+
maxLines: number = 40,
|
|
267
|
+
maxChars: number = 1200,
|
|
268
|
+
): string {
|
|
269
|
+
const result = spawnSync("tmux", ["capture-pane", "-p", "-t", sessionName], {
|
|
270
|
+
encoding: "utf-8",
|
|
271
|
+
timeout: 3000,
|
|
272
|
+
});
|
|
273
|
+
if (result.status !== 0) return "";
|
|
274
|
+
const raw = (result.stdout || "").replace(/\r\n/g, "\n").trim();
|
|
275
|
+
if (!raw) return "";
|
|
276
|
+
const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
|
|
277
|
+
if (!tail) return "";
|
|
278
|
+
return tail.length > maxChars ? tail.slice(-maxChars) : tail;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Read a tail snippet from task STATUS.md for failure diagnostics.
|
|
283
|
+
*/
|
|
284
|
+
export function readTaskStatusTail(
|
|
285
|
+
statusPath: string,
|
|
286
|
+
maxLines: number = 40,
|
|
287
|
+
maxChars: number = 1200,
|
|
288
|
+
): string {
|
|
289
|
+
if (!existsSync(statusPath)) return "";
|
|
290
|
+
try {
|
|
291
|
+
const raw = readFileSync(statusPath, "utf-8").replace(/\r\n/g, "\n").trim();
|
|
292
|
+
if (!raw) return "";
|
|
293
|
+
const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
|
|
294
|
+
if (!tail) return "";
|
|
295
|
+
return tail.length > maxChars ? tail.slice(-maxChars) : tail;
|
|
296
|
+
} catch {
|
|
297
|
+
return "";
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
/**
|
|
302
|
+
* Resolve the path to a task's .DONE file inside a worktree.
|
|
303
|
+
*
|
|
304
|
+
* The task folder path from ParsedTask is absolute (main repo).
|
|
305
|
+
* We need to translate it to the equivalent path in the worktree.
|
|
306
|
+
*
|
|
307
|
+
* @param taskFolder - Absolute task folder path (from main repo)
|
|
308
|
+
* @param worktreePath - Absolute path to the lane worktree
|
|
309
|
+
* @param repoRoot - Absolute path to the main repository root
|
|
310
|
+
* @returns Absolute path to the .DONE file in the worktree
|
|
311
|
+
*/
|
|
312
|
+
export function resolveTaskDonePath(
|
|
313
|
+
taskFolder: string,
|
|
314
|
+
worktreePath: string,
|
|
315
|
+
repoRoot: string,
|
|
316
|
+
): string {
|
|
317
|
+
const repoRootNorm = resolve(repoRoot).replace(/\\/g, "/");
|
|
318
|
+
const folderNorm = resolve(taskFolder).replace(/\\/g, "/");
|
|
319
|
+
|
|
320
|
+
let relativePath: string;
|
|
321
|
+
if (folderNorm.startsWith(repoRootNorm + "/")) {
|
|
322
|
+
relativePath = folderNorm.slice(repoRootNorm.length + 1);
|
|
323
|
+
} else {
|
|
324
|
+
relativePath = taskFolder;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
const primaryPath = join(worktreePath, relativePath, ".DONE");
|
|
328
|
+
if (existsSync(primaryPath)) return primaryPath;
|
|
329
|
+
|
|
330
|
+
// Fallback: worker may have archived the task folder during the
|
|
331
|
+
// "Documentation & Delivery" step, moving it under `.../archive/TASK-ID/`.
|
|
332
|
+
// Check the archive sibling path.
|
|
333
|
+
const parts = relativePath.replace(/\\/g, "/").split("/");
|
|
334
|
+
const taskDirName = parts[parts.length - 1]; // e.g. "PM-011-template-seed-data-permissions"
|
|
335
|
+
const parentParts = parts.slice(0, -1); // e.g. [..., "tasks"]
|
|
336
|
+
const archivePath = join(worktreePath, ...parentParts, "archive", taskDirName, ".DONE");
|
|
337
|
+
if (existsSync(archivePath)) return archivePath;
|
|
338
|
+
|
|
339
|
+
return primaryPath; // Return primary even if missing (caller checks existsSync)
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/**
 * Spawn a TMUX session for a task in a lane.
 *
 * Handles:
 * - Stale session cleanup (kill if session name already exists)
 * - Retry on transient spawn failures (up to SESSION_SPAWN_RETRY_MAX)
 * - Structured logging
 *
 * Side effects: resets the per-task lane log file (best effort), may kill
 * pre-existing sessions with this lane's name, and spawns a detached tmux
 * session running the task-runner extension.
 *
 * @param lane - Allocated lane with worktree and session info
 * @param task - Task to execute
 * @param config - Orchestrator configuration
 *   (NOTE(review): not read in this function body — presumably kept for
 *   interface stability; confirm before removing)
 * @param repoRoot - Main repository root
 * @throws ExecutionError if the worktree is missing or spawn fails after retries
 */
export function spawnLaneSession(
  lane: AllocatedLane,
  task: AllocatedTask,
  config: OrchestratorConfig,
  repoRoot: string,
): void {
  const sessionName = lane.tmuxSessionName;
  const laneId = lane.laneId;

  execLog(laneId, task.taskId, "preparing to spawn TMUX session", {
    session: sessionName,
    worktree: lane.worktreePath,
    worktreeTmuxPath: toTmuxPath(lane.worktreePath),
    logPath: resolveLaneLogPath(lane, task),
  });

  // Pre-check: worktree exists — fail fast before touching tmux.
  if (!existsSync(lane.worktreePath)) {
    throw new ExecutionError(
      "EXEC_WORKTREE_MISSING",
      `Worktree path does not exist: ${lane.worktreePath}`,
      laneId,
      task.taskId,
    );
  }

  // Build env vars for the task-runner extension inside the session.
  const envVars = buildLaneEnvVars(lane, task.task.promptPath, repoRoot);

  // Prepare per-task lane log path for post-mortem diagnostics.
  const laneLogPath = resolveLaneLogPath(lane, task);
  const laneLogRelativePath = resolveLaneLogRelativePath(lane, task);
  try {
    mkdirSync(dirname(laneLogPath), { recursive: true });
    if (existsSync(laneLogPath)) {
      unlinkSync(laneLogPath); // fresh log per task attempt
    }
  } catch {
    // Best effort — session can still run without log file setup
  }

  // Build tmux args.
  // NOTE(review): laneLogRelativePath is passed through but the visible
  // buildTmuxSpawnArgs body ignores it (redirection intentionally disabled).
  const tmuxArgs = buildTmuxSpawnArgs(sessionName, lane.worktreePath, repoRoot, envVars, laneLogRelativePath);

  // Clean up stale session if exists — a leftover session with the same
  // name would make `tmux new-session` fail.
  if (tmuxHasSession(sessionName)) {
    execLog(laneId, task.taskId, "killing stale TMUX session", { session: sessionName });
    killLaneAndChildren(sessionName);
    // Brief pause to let tmux clean up
    spawnSync("sleep", ["0.5"], { shell: true, timeout: 3000 });
  }

  // Attempt to spawn with retry (SESSION_SPAWN_RETRY_MAX retries after the
  // first attempt, with linear backoff between attempts).
  let lastError = "";
  for (let attempt = 1; attempt <= SESSION_SPAWN_RETRY_MAX + 1; attempt++) {
    const result = spawnSync("tmux", tmuxArgs);

    if (result.status === 0) {
      execLog(laneId, task.taskId, "TMUX session spawned successfully", {
        session: sessionName,
        attempt,
      });
      return;
    }

    lastError = result.stderr?.toString().trim() || "unknown spawn error";
    execLog(laneId, task.taskId, `spawn attempt ${attempt} failed: ${lastError}`, {
      session: sessionName,
    });

    if (attempt <= SESSION_SPAWN_RETRY_MAX) {
      // Wait before retry (1s, 2s)
      const delayMs = attempt * 1000;
      spawnSync("sleep", [`${delayMs / 1000}`], { shell: true, timeout: delayMs + 2000 });
    }
  }

  // All attempts exhausted — surface the last tmux stderr for diagnosis.
  throw new ExecutionError(
    "EXEC_SPAWN_FAILED",
    `Failed to create TMUX session '${sessionName}' after ${SESSION_SPAWN_RETRY_MAX + 1} attempts. Last error: ${lastError}`,
    laneId,
    task.taskId,
  );
}
|
|
440
|
+
|
|
441
|
+
/**
 * Poll until a task completes (or fails).
 *
 * Completion detection logic:
 * 1. Check for .DONE file → task succeeded (highest priority)
 * 2. Check TMUX session liveness via `tmux has-session`
 * 3. If session exits without .DONE → wait DONE_GRACE_MS (slow disk flush)
 * 4. After grace period, if still no .DONE → task failed
 *
 * Terminal-state precedence: .DONE found at any point = success,
 * regardless of session state.
 *
 * @param lane - Allocated lane
 * @param task - Task being executed
 * @param config - Orchestrator configuration
 *   (NOTE(review): not read in this function body — confirm before removing)
 * @param repoRoot - Main repository root
 * @param pauseSignal - Checked each poll cycle; if true, returns early with "skipped"
 * @returns LaneTaskStatus indicating the final state, plus a human-readable
 *   exit reason and whether the .DONE marker was observed
 */
export async function pollUntilTaskComplete(
  lane: AllocatedLane,
  task: AllocatedTask,
  config: OrchestratorConfig,
  repoRoot: string,
  pauseSignal: { paused: boolean },
): Promise<{ status: LaneTaskStatus; exitReason: string; doneFileFound: boolean }> {
  const sessionName = lane.tmuxSessionName;
  const laneId = lane.laneId;
  // .DONE and STATUS.md live in the same task folder inside the worktree.
  const donePath = resolveTaskDonePath(task.task.taskFolder, lane.worktreePath, repoRoot);
  const statusPath = join(dirname(donePath), "STATUS.md");
  const laneLogPath = resolveLaneLogPath(lane, task);

  execLog(laneId, task.taskId, "polling for completion", {
    session: sessionName,
    donePath,
    statusPath,
    logPath: laneLogPath,
  });

  // Last non-empty tmux pane capture — kept as a diagnostics fallback for
  // the failure message after the session has already exited.
  let lastPaneTail = "";

  // Main polling loop
  while (true) {
    // Check pause signal
    if (pauseSignal.paused) {
      execLog(laneId, task.taskId, "pause signal detected during poll");
      // Don't kill the session — let the current task-runner checkpoint
      // The calling code will handle marking as skipped
      return {
        status: "skipped",
        exitReason: "Paused by user (/orch-pause)",
        doneFileFound: false,
      };
    }

    // Capture live pane output for diagnostics (best effort).
    const paneTail = captureTmuxPaneTail(sessionName);
    if (paneTail) {
      lastPaneTail = paneTail;
    }

    // Priority 1: Check for .DONE file
    if (existsSync(donePath)) {
      execLog(laneId, task.taskId, ".DONE file found — task succeeded", {
        session: sessionName,
      });
      return {
        status: "succeeded",
        exitReason: ".DONE file created by task-runner",
        doneFileFound: true,
      };
    }

    // Priority 2: Check if TMUX session is still alive
    if (!tmuxHasSession(sessionName)) {
      // Session exited — start grace period for .DONE file
      execLog(laneId, task.taskId, "TMUX session exited, entering grace period", {
        session: sessionName,
        graceMs: DONE_GRACE_MS,
      });

      // Grace period: poll .DONE file at short intervals (covers the case
      // where the session exited before a slow disk flush completed).
      const graceStart = Date.now();
      while (Date.now() - graceStart < DONE_GRACE_MS) {
        await new Promise((r) => setTimeout(r, 500));

        if (existsSync(donePath)) {
          execLog(laneId, task.taskId, ".DONE file found during grace period — task succeeded", {
            session: sessionName,
          });
          return {
            status: "succeeded",
            exitReason: ".DONE file created (found during grace period)",
            doneFileFound: true,
          };
        }
      }

      // Grace period expired without .DONE → task failed.
      // Gather diagnostics in preference order: lane log file, last live
      // pane capture, then STATUS.md tail.
      const logTail = readLaneLogTail(laneLogPath);
      execLog(laneId, task.taskId, "grace period expired without .DONE — task failed", {
        session: sessionName,
        logPath: laneLogPath,
      });
      if (logTail) {
        execLog(laneId, task.taskId, `lane session output (tail):\n${logTail}`);
      }
      const statusTail = readTaskStatusTail(statusPath);
      const hasLogFile = existsSync(laneLogPath);
      const outputForHint = logTail || lastPaneTail || statusTail;
      // Compress whitespace and cap at 300 chars so the hint fits in one line.
      const logHint = outputForHint
        ? ` Last output: ${outputForHint.replace(/\s+/g, " ").slice(-300)}`
        : "";
      const logLocation = hasLogFile ? ` Lane log: ${laneLogPath}.` : "";
      if (!logTail && lastPaneTail) {
        execLog(laneId, task.taskId, `lane session output from TMUX pane (tail):\n${lastPaneTail}`);
      }
      if (statusTail) {
        execLog(laneId, task.taskId, `task STATUS tail:\n${statusTail}`);
      }
      return {
        status: "failed",
        exitReason:
          `TMUX session '${sessionName}' exited without creating .DONE file ` +
          `(grace period ${DONE_GRACE_MS}ms expired).` +
          `${logLocation}${logHint}`,
        doneFileFound: false,
      };
    }

    // Session alive, no .DONE yet — keep polling
    await new Promise((r) => setTimeout(r, EXECUTION_POLL_INTERVAL_MS));
  }
}
|
|
575
|
+
|
|
576
|
+
/**
 * Execute all tasks in a lane sequentially.
 *
 * For each task in the lane (in order):
 * 1. Spawn a TMUX session with TASK_AUTOSTART pointing to the task's PROMPT.md
 * 2. Poll until the task completes (or fails)
 * 3. Record the outcome
 * 4. If the task failed, skip remaining tasks in the lane
 *
 * The lane reuses the same worktree and TMUX session name across tasks.
 * Each new task gets a fresh TMUX session (the previous one has exited).
 *
 * Cleanup policy:
 * - On success: session exits naturally, no cleanup needed
 * - On failure: session may have exited already; if alive, leave for debugging
 * - On pause: stop after current task, mark remaining as skipped
 * - On stall: handled by Step 3 (monitoring) — this function just polls
 *
 * @param lane - Fully allocated lane from Step 1
 * @param config - Orchestrator configuration (forwarded to spawn/poll)
 * @param repoRoot - Main repository root
 * @param pauseSignal - Shared signal for pause/abort (checked between tasks)
 * @returns LaneExecutionResult with per-task outcomes
 */
export async function executeLane(
  lane: AllocatedLane,
  config: OrchestratorConfig,
  repoRoot: string,
  pauseSignal: { paused: boolean },
): Promise<LaneExecutionResult> {
  const laneId = lane.laneId;
  const laneStartTime = Date.now();
  const outcomes: LaneTaskOutcome[] = [];
  // Set once a task fails/stalls/pauses; all later tasks become "skipped".
  let shouldSkipRemaining = false;

  execLog(laneId, "LANE", `starting execution of ${lane.tasks.length} task(s)`, {
    worktree: lane.worktreePath,
    session: lane.tmuxSessionName,
  });

  for (const task of lane.tasks) {
    // Check if remaining tasks should be skipped (prior failure or pause)
    if (shouldSkipRemaining || pauseSignal.paused) {
      const reason = pauseSignal.paused
        ? "Skipped due to pause signal"
        : "Skipped due to prior task failure in lane";
      execLog(laneId, task.taskId, reason);
      outcomes.push({
        taskId: task.taskId,
        status: "skipped",
        startTime: null,
        endTime: null,
        exitReason: reason,
        sessionName: lane.tmuxSessionName,
        doneFileFound: false,
      });
      continue;
    }

    // Execute this task
    const taskStartTime = Date.now();
    let taskOutcome: LaneTaskOutcome;

    try {
      // Spawn TMUX session
      spawnLaneSession(lane, task, config, repoRoot);

      // Poll until completion
      const pollResult = await pollUntilTaskComplete(
        lane,
        task,
        config,
        repoRoot,
        pauseSignal,
      );

      taskOutcome = {
        taskId: task.taskId,
        status: pollResult.status,
        startTime: taskStartTime,
        endTime: Date.now(),
        exitReason: pollResult.exitReason,
        sessionName: lane.tmuxSessionName,
        doneFileFound: pollResult.doneFileFound,
      };

      // If task failed or was paused, skip remaining tasks
      if (pollResult.status === "failed" || pollResult.status === "stalled") {
        shouldSkipRemaining = true;
      }
      if (pollResult.status === "skipped") {
        // Pause was signaled during poll — mark remaining as skipped too
        shouldSkipRemaining = true;
      }
    } catch (err: unknown) {
      // Spawn or polling error — recorded as a failed outcome rather than
      // aborting the whole lane result.
      const errMsg = err instanceof Error ? err.message : String(err);
      execLog(laneId, task.taskId, `execution error: ${errMsg}`);

      taskOutcome = {
        taskId: task.taskId,
        status: "failed",
        startTime: taskStartTime,
        endTime: Date.now(),
        exitReason: errMsg,
        sessionName: lane.tmuxSessionName,
        doneFileFound: false,
      };

      shouldSkipRemaining = true;
    }

    const elapsed = Math.round(((taskOutcome.endTime || Date.now()) - taskStartTime) / 1000);
    execLog(laneId, task.taskId, `task ${taskOutcome.status}`, {
      elapsed: `${elapsed}s`,
      doneFile: taskOutcome.doneFileFound,
    });

    outcomes.push(taskOutcome);
  }

  const laneEndTime = Date.now();
  const succeededCount = outcomes.filter((o) => o.status === "succeeded").length;
  // "stalled" counts as a failure for lane-level classification.
  const failedCount = outcomes.filter((o) => o.status === "failed" || o.status === "stalled").length;

  // Classify the lane: all succeeded → "succeeded"; a mix of successes and
  // failures → "partial"; everything else → "failed".
  // NOTE(review): a lane where every task was skipped (pause before the first
  // task) also lands in "failed" here — confirm that is the intended semantics.
  let overallStatus: LaneExecutionResult["overallStatus"];
  if (failedCount === 0 && succeededCount === lane.tasks.length) {
    overallStatus = "succeeded";
  } else if (failedCount > 0 && succeededCount > 0) {
    overallStatus = "partial";
  } else {
    overallStatus = "failed";
  }

  const totalElapsed = Math.round((laneEndTime - laneStartTime) / 1000);
  execLog(laneId, "LANE", `execution complete: ${overallStatus}`, {
    succeeded: succeededCount,
    failed: failedCount,
    skipped: outcomes.filter((o) => o.status === "skipped").length,
    elapsed: `${totalElapsed}s`,
  });

  return {
    laneNumber: lane.laneNumber,
    laneId: lane.laneId,
    tasks: outcomes,
    overallStatus,
    startTime: laneStartTime,
    endTime: laneEndTime,
  };
}
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
// ── STATUS.md Parsing for Worktree ───────────────────────────────────
|
|
730
|
+
|
|
731
|
+
/**
 * Normalized result from parsing a STATUS.md file in a worktree.
 *
 * Reuses the same regex patterns as task-runner's parseStatusMd but
 * adapted for monitoring context (no direct import — same file patterns).
 */
export interface ParsedWorktreeStatus {
  /** Parsed step info array, in document order (one entry per `### Step N:` heading). */
  steps: {
    /** Step number parsed from the `### Step N:` heading. */
    number: number;
    /** Step name — the trimmed text after the colon in the heading. */
    name: string;
    /** Step status derived from the step's `**Status:**` line (defaults to "not-started"). */
    status: "not-started" | "in-progress" | "complete";
    /** Count of checked (`[x]`/`[X]`) checkbox items under this step. */
    totalChecked: number;
    /** Total checkbox items under this step, checked or not. */
    totalItems: number;
  }[];
  /** Review counter from STATUS.md (`**Review Counter:** N`); 0 if absent. */
  reviewCounter: number;
  /** Iteration number from STATUS.md (`**Iteration:** N`); 0 if absent. */
  iteration: number;
  /** File modification time (epoch ms) — used by the monitor for stall detection. */
  mtime: number;
}
|
|
753
|
+
|
|
754
|
+
/**
|
|
755
|
+
* Parse STATUS.md from a task folder inside a worktree.
|
|
756
|
+
*
|
|
757
|
+
* Reads the STATUS.md file, parses step statuses and checkbox counts
|
|
758
|
+
* using the same regex patterns as task-runner's parseStatusMd.
|
|
759
|
+
*
|
|
760
|
+
* @param taskFolder - Absolute task folder path (from main repo)
|
|
761
|
+
* @param worktreePath - Absolute path to the lane worktree
|
|
762
|
+
* @param repoRoot - Absolute path to the main repository root
|
|
763
|
+
* @returns Parsed status or null with reason if unreadable
|
|
764
|
+
*/
|
|
765
|
+
export function parseWorktreeStatusMd(
|
|
766
|
+
taskFolder: string,
|
|
767
|
+
worktreePath: string,
|
|
768
|
+
repoRoot: string,
|
|
769
|
+
): { parsed: ParsedWorktreeStatus | null; error: string | null } {
|
|
770
|
+
// Translate the task folder path from main repo to worktree
|
|
771
|
+
const repoRootNorm = resolve(repoRoot).replace(/\\/g, "/");
|
|
772
|
+
const folderNorm = resolve(taskFolder).replace(/\\/g, "/");
|
|
773
|
+
|
|
774
|
+
let relativePath: string;
|
|
775
|
+
if (folderNorm.startsWith(repoRootNorm + "/")) {
|
|
776
|
+
relativePath = folderNorm.slice(repoRootNorm.length + 1);
|
|
777
|
+
} else {
|
|
778
|
+
relativePath = taskFolder;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
let statusPath = join(worktreePath, relativePath, "STATUS.md");
|
|
782
|
+
|
|
783
|
+
if (!existsSync(statusPath)) {
|
|
784
|
+
// Fallback: worker may have archived the task folder
|
|
785
|
+
const parts = relativePath.replace(/\\/g, "/").split("/");
|
|
786
|
+
const taskDirName = parts[parts.length - 1];
|
|
787
|
+
const parentParts = parts.slice(0, -1);
|
|
788
|
+
const archiveStatusPath = join(worktreePath, ...parentParts, "archive", taskDirName, "STATUS.md");
|
|
789
|
+
if (existsSync(archiveStatusPath)) {
|
|
790
|
+
statusPath = archiveStatusPath;
|
|
791
|
+
} else {
|
|
792
|
+
return { parsed: null, error: `STATUS.md not found at ${statusPath}` };
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
let content: string;
|
|
797
|
+
let mtime: number;
|
|
798
|
+
try {
|
|
799
|
+
content = readFileSync(statusPath, "utf-8");
|
|
800
|
+
mtime = statSync(statusPath).mtimeMs;
|
|
801
|
+
} catch (err: unknown) {
|
|
802
|
+
return { parsed: null, error: `Cannot read STATUS.md: ${err instanceof Error ? err.message : String(err)}` };
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
// Parse using same regex patterns as task-runner's parseStatusMd
|
|
806
|
+
const text = content.replace(/\r\n/g, "\n");
|
|
807
|
+
const steps: ParsedWorktreeStatus["steps"] = [];
|
|
808
|
+
let currentStep: {
|
|
809
|
+
number: number;
|
|
810
|
+
name: string;
|
|
811
|
+
status: "not-started" | "in-progress" | "complete";
|
|
812
|
+
checkboxes: boolean[];
|
|
813
|
+
} | null = null;
|
|
814
|
+
let reviewCounter = 0;
|
|
815
|
+
let iteration = 0;
|
|
816
|
+
|
|
817
|
+
for (const line of text.split("\n")) {
|
|
818
|
+
const rcMatch = line.match(/\*\*Review Counter:\*\*\s*(\d+)/);
|
|
819
|
+
if (rcMatch) reviewCounter = parseInt(rcMatch[1]);
|
|
820
|
+
const itMatch = line.match(/\*\*Iteration:\*\*\s*(\d+)/);
|
|
821
|
+
if (itMatch) iteration = parseInt(itMatch[1]);
|
|
822
|
+
|
|
823
|
+
const stepMatch = line.match(/^###\s+Step\s+(\d+):\s*(.+)/);
|
|
824
|
+
if (stepMatch) {
|
|
825
|
+
if (currentStep) {
|
|
826
|
+
const totalChecked = currentStep.checkboxes.filter(c => c).length;
|
|
827
|
+
steps.push({
|
|
828
|
+
number: currentStep.number,
|
|
829
|
+
name: currentStep.name,
|
|
830
|
+
status: currentStep.status,
|
|
831
|
+
totalChecked,
|
|
832
|
+
totalItems: currentStep.checkboxes.length,
|
|
833
|
+
});
|
|
834
|
+
}
|
|
835
|
+
currentStep = {
|
|
836
|
+
number: parseInt(stepMatch[1]),
|
|
837
|
+
name: stepMatch[2].trim(),
|
|
838
|
+
status: "not-started",
|
|
839
|
+
checkboxes: [],
|
|
840
|
+
};
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
843
|
+
if (currentStep) {
|
|
844
|
+
const ss = line.match(/\*\*Status:\*\*\s*(.*)/);
|
|
845
|
+
if (ss) {
|
|
846
|
+
const s = ss[1];
|
|
847
|
+
if (s.includes("✅") || s.toLowerCase().includes("complete")) {
|
|
848
|
+
currentStep.status = "complete";
|
|
849
|
+
} else if (s.includes("🟨") || s.includes("🟡") || s.toLowerCase().includes("progress")) {
|
|
850
|
+
currentStep.status = "in-progress";
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
const cb = line.match(/^\s*-\s*\[([ xX])\]\s*(.*)/);
|
|
854
|
+
if (cb) {
|
|
855
|
+
currentStep.checkboxes.push(cb[1].toLowerCase() === "x");
|
|
856
|
+
}
|
|
857
|
+
}
|
|
858
|
+
}
|
|
859
|
+
if (currentStep) {
|
|
860
|
+
const totalChecked = currentStep.checkboxes.filter(c => c).length;
|
|
861
|
+
steps.push({
|
|
862
|
+
number: currentStep.number,
|
|
863
|
+
name: currentStep.name,
|
|
864
|
+
status: currentStep.status,
|
|
865
|
+
totalChecked,
|
|
866
|
+
totalItems: currentStep.checkboxes.length,
|
|
867
|
+
});
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
return {
|
|
871
|
+
parsed: { steps, reviewCounter, iteration, mtime },
|
|
872
|
+
error: null,
|
|
873
|
+
};
|
|
874
|
+
}
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
// ── State Resolution ─────────────────────────────────────────────────
|
|
878
|
+
|
|
879
|
+
/**
|
|
880
|
+
* Resolve the monitoring state for a single task by combining signals.
|
|
881
|
+
*
|
|
882
|
+
* State-resolution precedence (deterministic):
|
|
883
|
+
* 1. `.DONE` file found → "succeeded" (highest priority, always wins)
|
|
884
|
+
* 2. Stall timeout reached (mtime unchanged for stall_timeout AND session alive) → "stalled"
|
|
885
|
+
* 3. TMUX session exited without .DONE → "failed"
|
|
886
|
+
* 4. Session alive + recent mtime (within stall_timeout) → "running"
|
|
887
|
+
* 5. Session alive + stale mtime but within startup grace → "running" (with no stall timer yet)
|
|
888
|
+
* 6. Session alive + no STATUS.md yet but within startup grace → "running"
|
|
889
|
+
* 7. No session, no .DONE, never observed running → "unknown"
|
|
890
|
+
*
|
|
891
|
+
* @param taskId - Task identifier
|
|
892
|
+
* @param donePath - Absolute path to the .DONE file in the worktree
|
|
893
|
+
* @param sessionName - TMUX session name for this lane
|
|
894
|
+
* @param statusResult - Parsed STATUS.md result (may be null)
|
|
895
|
+
* @param tracker - Mtime tracker for stall detection
|
|
896
|
+
* @param stallTimeoutMs - Stall timeout in milliseconds
|
|
897
|
+
* @param now - Current timestamp (epoch ms) for deterministic testing
|
|
898
|
+
*/
|
|
899
|
+
export function resolveTaskMonitorState(
|
|
900
|
+
taskId: string,
|
|
901
|
+
donePath: string,
|
|
902
|
+
sessionName: string,
|
|
903
|
+
statusResult: { parsed: ParsedWorktreeStatus | null; error: string | null },
|
|
904
|
+
tracker: MtimeTracker,
|
|
905
|
+
stallTimeoutMs: number,
|
|
906
|
+
now: number,
|
|
907
|
+
): TaskMonitorSnapshot {
|
|
908
|
+
const sessionAlive = tmuxHasSession(sessionName);
|
|
909
|
+
const doneFileFound = existsSync(donePath);
|
|
910
|
+
|
|
911
|
+
// Build base snapshot from parsed status
|
|
912
|
+
let currentStepName: string | null = null;
|
|
913
|
+
let currentStepNumber: number | null = null;
|
|
914
|
+
let totalSteps = 0;
|
|
915
|
+
let totalChecked = 0;
|
|
916
|
+
let totalItems = 0;
|
|
917
|
+
let iteration = 0;
|
|
918
|
+
let reviewCounter = 0;
|
|
919
|
+
let parseError = statusResult.error;
|
|
920
|
+
|
|
921
|
+
if (statusResult.parsed) {
|
|
922
|
+
const { steps } = statusResult.parsed;
|
|
923
|
+
totalSteps = steps.length;
|
|
924
|
+
iteration = statusResult.parsed.iteration;
|
|
925
|
+
reviewCounter = statusResult.parsed.reviewCounter;
|
|
926
|
+
|
|
927
|
+
for (const step of steps) {
|
|
928
|
+
totalChecked += step.totalChecked;
|
|
929
|
+
totalItems += step.totalItems;
|
|
930
|
+
}
|
|
931
|
+
|
|
932
|
+
// Find the current step (first in-progress, or first not-started after last complete)
|
|
933
|
+
const inProgress = steps.find(s => s.status === "in-progress");
|
|
934
|
+
if (inProgress) {
|
|
935
|
+
currentStepName = inProgress.name;
|
|
936
|
+
currentStepNumber = inProgress.number;
|
|
937
|
+
} else {
|
|
938
|
+
// Find first not-started step
|
|
939
|
+
const notStarted = steps.find(s => s.status === "not-started");
|
|
940
|
+
if (notStarted) {
|
|
941
|
+
currentStepName = notStarted.name;
|
|
942
|
+
currentStepNumber = notStarted.number;
|
|
943
|
+
} else if (steps.length > 0) {
|
|
944
|
+
// All complete
|
|
945
|
+
const last = steps[steps.length - 1];
|
|
946
|
+
currentStepName = last.name;
|
|
947
|
+
currentStepNumber = last.number;
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
// Update mtime tracker
|
|
952
|
+
if (!tracker.statusFileSeenOnce) {
|
|
953
|
+
tracker.statusFileSeenOnce = true;
|
|
954
|
+
tracker.lastMtime = statusResult.parsed.mtime;
|
|
955
|
+
tracker.stallTimerStart = null; // Reset stall timer on first read
|
|
956
|
+
} else if (statusResult.parsed.mtime !== tracker.lastMtime) {
|
|
957
|
+
// Mtime changed — progress is being made
|
|
958
|
+
tracker.lastMtime = statusResult.parsed.mtime;
|
|
959
|
+
tracker.stallTimerStart = null; // Reset stall timer
|
|
960
|
+
} else {
|
|
961
|
+
// Mtime unchanged — start or continue stall timer
|
|
962
|
+
if (tracker.stallTimerStart === null) {
|
|
963
|
+
tracker.stallTimerStart = now;
|
|
964
|
+
}
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
|
|
968
|
+
// ── Priority 1: .DONE file found → succeeded ────────────────
|
|
969
|
+
if (doneFileFound) {
|
|
970
|
+
return {
|
|
971
|
+
taskId,
|
|
972
|
+
status: "succeeded",
|
|
973
|
+
currentStepName,
|
|
974
|
+
currentStepNumber,
|
|
975
|
+
totalSteps,
|
|
976
|
+
totalChecked,
|
|
977
|
+
totalItems,
|
|
978
|
+
sessionAlive,
|
|
979
|
+
doneFileFound: true,
|
|
980
|
+
stallReason: null,
|
|
981
|
+
lastHeartbeat: tracker.lastMtime,
|
|
982
|
+
observedAt: now,
|
|
983
|
+
parseError,
|
|
984
|
+
iteration,
|
|
985
|
+
reviewCounter,
|
|
986
|
+
};
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
// ── Priority 2: Stall timeout reached ────────────────────────
|
|
990
|
+
if (
|
|
991
|
+
sessionAlive &&
|
|
992
|
+
tracker.statusFileSeenOnce &&
|
|
993
|
+
tracker.stallTimerStart !== null &&
|
|
994
|
+
(now - tracker.stallTimerStart) >= stallTimeoutMs
|
|
995
|
+
) {
|
|
996
|
+
const stallMinutes = Math.round((now - tracker.stallTimerStart) / 60_000);
|
|
997
|
+
const stallReason = `STATUS.md unchanged for ${stallMinutes} minutes (threshold: ${Math.round(stallTimeoutMs / 60_000)} min)`;
|
|
998
|
+
|
|
999
|
+
// Kill the session and children
|
|
1000
|
+
execLog("monitor", taskId, `stall detected — killing session`, {
|
|
1001
|
+
session: sessionName,
|
|
1002
|
+
stallMinutes,
|
|
1003
|
+
});
|
|
1004
|
+
killLaneAndChildren(sessionName);
|
|
1005
|
+
|
|
1006
|
+
return {
|
|
1007
|
+
taskId,
|
|
1008
|
+
status: "stalled",
|
|
1009
|
+
currentStepName,
|
|
1010
|
+
currentStepNumber,
|
|
1011
|
+
totalSteps,
|
|
1012
|
+
totalChecked,
|
|
1013
|
+
totalItems,
|
|
1014
|
+
sessionAlive: false, // We just killed it
|
|
1015
|
+
doneFileFound: false,
|
|
1016
|
+
stallReason,
|
|
1017
|
+
lastHeartbeat: tracker.lastMtime,
|
|
1018
|
+
observedAt: now,
|
|
1019
|
+
parseError,
|
|
1020
|
+
iteration,
|
|
1021
|
+
reviewCounter,
|
|
1022
|
+
};
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
// ── Priority 3: Session exited without .DONE → failed ────────
|
|
1026
|
+
if (!sessionAlive) {
|
|
1027
|
+
return {
|
|
1028
|
+
taskId,
|
|
1029
|
+
status: "failed",
|
|
1030
|
+
currentStepName,
|
|
1031
|
+
currentStepNumber,
|
|
1032
|
+
totalSteps,
|
|
1033
|
+
totalChecked,
|
|
1034
|
+
totalItems,
|
|
1035
|
+
sessionAlive: false,
|
|
1036
|
+
doneFileFound: false,
|
|
1037
|
+
stallReason: null,
|
|
1038
|
+
lastHeartbeat: tracker.lastMtime,
|
|
1039
|
+
observedAt: now,
|
|
1040
|
+
parseError,
|
|
1041
|
+
iteration,
|
|
1042
|
+
reviewCounter,
|
|
1043
|
+
};
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
// ── Priority 4-6: Session alive → running ────────────────────
|
|
1047
|
+
return {
|
|
1048
|
+
taskId,
|
|
1049
|
+
status: "running",
|
|
1050
|
+
currentStepName,
|
|
1051
|
+
currentStepNumber,
|
|
1052
|
+
totalSteps,
|
|
1053
|
+
totalChecked,
|
|
1054
|
+
totalItems,
|
|
1055
|
+
sessionAlive: true,
|
|
1056
|
+
doneFileFound: false,
|
|
1057
|
+
stallReason: null,
|
|
1058
|
+
lastHeartbeat: tracker.lastMtime,
|
|
1059
|
+
observedAt: now,
|
|
1060
|
+
parseError,
|
|
1061
|
+
iteration,
|
|
1062
|
+
reviewCounter,
|
|
1063
|
+
};
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
|
|
1067
|
+
// ── Core Monitor Loop ────────────────────────────────────────────────
|
|
1068
|
+
|
|
1069
|
+
/**
 * Callback type for dashboard updates during monitoring.
 *
 * Invoked once per poll cycle with the freshly built MonitorState.
 * The monitor loop catches and ignores any exception the callback throws,
 * so a faulty callback cannot kill monitoring.
 */
export type MonitorUpdateCallback = (state: MonitorState) => void;
|
|
1073
|
+
|
|
1074
|
+
/**
|
|
1075
|
+
* Monitor all lanes in a wave, polling for progress, completion, and stalls.
|
|
1076
|
+
*
|
|
1077
|
+
* This is the orchestrator's "air traffic control" — it does NOT attach
|
|
1078
|
+
* to TMUX sessions. It monitors via filesystem polling:
|
|
1079
|
+
* - STATUS.md in each worktree for step/checkbox progress
|
|
1080
|
+
* - .DONE files for task completion
|
|
1081
|
+
* - `tmux has-session` for session liveness
|
|
1082
|
+
* - STATUS.md mtime for stall detection
|
|
1083
|
+
*
|
|
1084
|
+
* The monitoring loop runs until all lanes reach terminal states
|
|
1085
|
+
* (all tasks succeeded/failed/stalled) or the pauseSignal is set.
|
|
1086
|
+
*
|
|
1087
|
+
* **Important:** This function monitors lanes that are being executed
|
|
1088
|
+
* concurrently by `executeLane()` in Step 2. It does NOT spawn sessions —
|
|
1089
|
+
* it only observes. Step 4 will coordinate calling both executeLane()
|
|
1090
|
+
* and monitorLanes() in parallel.
|
|
1091
|
+
*
|
|
1092
|
+
* @param lanes - Allocated lanes being executed
|
|
1093
|
+
* @param config - Orchestrator configuration (poll_interval, stall_timeout)
|
|
1094
|
+
* @param repoRoot - Main repository root
|
|
1095
|
+
* @param pauseSignal - Shared signal for pause/abort
|
|
1096
|
+
* @param waveNumber - Current wave number (for display)
|
|
1097
|
+
* @param onUpdate - Optional callback invoked on each poll cycle
|
|
1098
|
+
* @returns Final MonitorState snapshot when monitoring completes
|
|
1099
|
+
*/
|
|
1100
|
+
export async function monitorLanes(
|
|
1101
|
+
lanes: AllocatedLane[],
|
|
1102
|
+
config: OrchestratorConfig,
|
|
1103
|
+
repoRoot: string,
|
|
1104
|
+
pauseSignal: { paused: boolean },
|
|
1105
|
+
waveNumber: number = 1,
|
|
1106
|
+
onUpdate?: MonitorUpdateCallback,
|
|
1107
|
+
): Promise<MonitorState> {
|
|
1108
|
+
const pollIntervalMs = (config.monitoring.poll_interval || 5) * 1000;
|
|
1109
|
+
const stallTimeoutMs = (config.failure.stall_timeout || 30) * 60_000;
|
|
1110
|
+
|
|
1111
|
+
// Initialize mtime trackers for each lane's current task
|
|
1112
|
+
// We track per-taskId so a lane advancing to the next task gets a fresh tracker
|
|
1113
|
+
const mtimeTrackers = new Map<string, MtimeTracker>();
|
|
1114
|
+
|
|
1115
|
+
function getOrCreateTracker(taskId: string, now: number): MtimeTracker {
|
|
1116
|
+
let tracker = mtimeTrackers.get(taskId);
|
|
1117
|
+
if (!tracker) {
|
|
1118
|
+
tracker = {
|
|
1119
|
+
taskId,
|
|
1120
|
+
firstObservedAt: now,
|
|
1121
|
+
statusFileSeenOnce: false,
|
|
1122
|
+
lastMtime: null,
|
|
1123
|
+
stallTimerStart: null,
|
|
1124
|
+
};
|
|
1125
|
+
mtimeTrackers.set(taskId, tracker);
|
|
1126
|
+
}
|
|
1127
|
+
return tracker;
|
|
1128
|
+
}
|
|
1129
|
+
|
|
1130
|
+
// Track terminal states per task to avoid re-processing
|
|
1131
|
+
const terminalTasks = new Map<string, TaskMonitorSnapshot>();
|
|
1132
|
+
|
|
1133
|
+
// Track which task each lane is currently on
|
|
1134
|
+
// (determined by: first task in lane that hasn't reached terminal state)
|
|
1135
|
+
const laneTaskIndex = new Map<number, number>();
|
|
1136
|
+
for (const lane of lanes) {
|
|
1137
|
+
laneTaskIndex.set(lane.laneNumber, 0);
|
|
1138
|
+
}
|
|
1139
|
+
|
|
1140
|
+
let pollCount = 0;
|
|
1141
|
+
let lastMonitorStateKey = "";
|
|
1142
|
+
|
|
1143
|
+
// Build the total task count
|
|
1144
|
+
const tasksTotal = lanes.reduce((sum, lane) => sum + lane.tasks.length, 0);
|
|
1145
|
+
|
|
1146
|
+
execLog("monitor", "ALL", `starting monitoring for ${lanes.length} lane(s), ${tasksTotal} task(s)`, {
|
|
1147
|
+
pollIntervalMs,
|
|
1148
|
+
stallTimeoutMin: Math.round(stallTimeoutMs / 60_000),
|
|
1149
|
+
});
|
|
1150
|
+
|
|
1151
|
+
while (true) {
|
|
1152
|
+
const now = Date.now();
|
|
1153
|
+
pollCount++;
|
|
1154
|
+
|
|
1155
|
+
// Check pause signal
|
|
1156
|
+
if (pauseSignal.paused) {
|
|
1157
|
+
execLog("monitor", "ALL", "pause signal detected — stopping monitoring");
|
|
1158
|
+
break;
|
|
1159
|
+
}
|
|
1160
|
+
|
|
1161
|
+
const laneSnapshots: LaneMonitorSnapshot[] = [];
|
|
1162
|
+
let totalDone = 0;
|
|
1163
|
+
let totalFailed = 0;
|
|
1164
|
+
let allTerminal = true;
|
|
1165
|
+
|
|
1166
|
+
for (const lane of lanes) {
|
|
1167
|
+
const completedTasks: string[] = [];
|
|
1168
|
+
const failedTasks: string[] = [];
|
|
1169
|
+
const remainingTasks: string[] = [];
|
|
1170
|
+
let currentTaskId: string | null = null;
|
|
1171
|
+
let currentTaskSnapshot: TaskMonitorSnapshot | null = null;
|
|
1172
|
+
|
|
1173
|
+
// Walk through tasks in order to determine lane state
|
|
1174
|
+
for (let i = 0; i < lane.tasks.length; i++) {
|
|
1175
|
+
const task = lane.tasks[i];
|
|
1176
|
+
|
|
1177
|
+
// Check if we already know this task is terminal
|
|
1178
|
+
const existingTerminal = terminalTasks.get(task.taskId);
|
|
1179
|
+
if (existingTerminal) {
|
|
1180
|
+
if (existingTerminal.status === "succeeded") {
|
|
1181
|
+
completedTasks.push(task.taskId);
|
|
1182
|
+
totalDone++;
|
|
1183
|
+
} else {
|
|
1184
|
+
failedTasks.push(task.taskId);
|
|
1185
|
+
totalFailed++;
|
|
1186
|
+
}
|
|
1187
|
+
continue;
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
// This task hasn't reached terminal state yet
|
|
1191
|
+
if (currentTaskId === null) {
|
|
1192
|
+
// This is the current task being worked on
|
|
1193
|
+
currentTaskId = task.taskId;
|
|
1194
|
+
|
|
1195
|
+
const tracker = getOrCreateTracker(task.taskId, now);
|
|
1196
|
+
const donePath = resolveTaskDonePath(task.task.taskFolder, lane.worktreePath, repoRoot);
|
|
1197
|
+
const statusResult = parseWorktreeStatusMd(task.task.taskFolder, lane.worktreePath, repoRoot);
|
|
1198
|
+
|
|
1199
|
+
const snapshot = resolveTaskMonitorState(
|
|
1200
|
+
task.taskId,
|
|
1201
|
+
donePath,
|
|
1202
|
+
lane.tmuxSessionName,
|
|
1203
|
+
statusResult,
|
|
1204
|
+
tracker,
|
|
1205
|
+
stallTimeoutMs,
|
|
1206
|
+
now,
|
|
1207
|
+
);
|
|
1208
|
+
|
|
1209
|
+
currentTaskSnapshot = snapshot;
|
|
1210
|
+
|
|
1211
|
+
// Check if this task just became terminal
|
|
1212
|
+
if (snapshot.status === "succeeded" || snapshot.status === "failed" || snapshot.status === "stalled") {
|
|
1213
|
+
terminalTasks.set(task.taskId, snapshot);
|
|
1214
|
+
if (snapshot.status === "succeeded") {
|
|
1215
|
+
completedTasks.push(task.taskId);
|
|
1216
|
+
totalDone++;
|
|
1217
|
+
} else {
|
|
1218
|
+
failedTasks.push(task.taskId);
|
|
1219
|
+
totalFailed++;
|
|
1220
|
+
}
|
|
1221
|
+
// Move to next task — clear currentTaskId so next iteration picks up
|
|
1222
|
+
currentTaskId = null;
|
|
1223
|
+
currentTaskSnapshot = null;
|
|
1224
|
+
} else {
|
|
1225
|
+
// Task is still running — mark remaining and break
|
|
1226
|
+
allTerminal = false;
|
|
1227
|
+
// Remaining tasks are everything after this one
|
|
1228
|
+
for (let j = i + 1; j < lane.tasks.length; j++) {
|
|
1229
|
+
remainingTasks.push(lane.tasks[j].taskId);
|
|
1230
|
+
}
|
|
1231
|
+
break;
|
|
1232
|
+
}
|
|
1233
|
+
} else {
|
|
1234
|
+
// Shouldn't reach here since we break above, but defensive
|
|
1235
|
+
remainingTasks.push(task.taskId);
|
|
1236
|
+
}
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
// If we processed all tasks and currentTaskId is still null,
|
|
1240
|
+
// the lane is fully terminal (all tasks completed/failed)
|
|
1241
|
+
if (currentTaskId !== null) {
|
|
1242
|
+
allTerminal = false;
|
|
1243
|
+
}
|
|
1244
|
+
|
|
1245
|
+
const sessionAlive = tmuxHasSession(lane.tmuxSessionName);
|
|
1246
|
+
|
|
1247
|
+
laneSnapshots.push({
|
|
1248
|
+
laneId: lane.laneId,
|
|
1249
|
+
laneNumber: lane.laneNumber,
|
|
1250
|
+
sessionName: lane.tmuxSessionName,
|
|
1251
|
+
sessionAlive,
|
|
1252
|
+
currentTaskId,
|
|
1253
|
+
currentTaskSnapshot,
|
|
1254
|
+
completedTasks,
|
|
1255
|
+
failedTasks,
|
|
1256
|
+
remainingTasks,
|
|
1257
|
+
});
|
|
1258
|
+
}
|
|
1259
|
+
|
|
1260
|
+
const monitorState: MonitorState = {
|
|
1261
|
+
lanes: laneSnapshots,
|
|
1262
|
+
tasksDone: totalDone,
|
|
1263
|
+
tasksFailed: totalFailed,
|
|
1264
|
+
tasksTotal,
|
|
1265
|
+
waveNumber,
|
|
1266
|
+
pollCount,
|
|
1267
|
+
lastPollTime: now,
|
|
1268
|
+
allTerminal,
|
|
1269
|
+
};
|
|
1270
|
+
|
|
1271
|
+
// Invoke the dashboard update callback
|
|
1272
|
+
if (onUpdate) {
|
|
1273
|
+
try {
|
|
1274
|
+
onUpdate(monitorState);
|
|
1275
|
+
} catch {
|
|
1276
|
+
// Don't let callback errors kill the monitor loop
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
// Log summary only on state changes (lane completes or fails) — not every poll
|
|
1281
|
+
const currentStateKey = `${totalDone}/${totalFailed}`;
|
|
1282
|
+
if (currentStateKey !== lastMonitorStateKey) {
|
|
1283
|
+
const activeLanes = laneSnapshots.filter(l => l.currentTaskId !== null);
|
|
1284
|
+
execLog("monitor", "ALL", `poll #${pollCount}: ${totalDone}/${tasksTotal} done, ${totalFailed} failed, ${activeLanes.length} active lane(s)`);
|
|
1285
|
+
lastMonitorStateKey = currentStateKey;
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
// Exit conditions
|
|
1289
|
+
if (allTerminal) {
|
|
1290
|
+
execLog("monitor", "ALL", `all lanes terminal — monitoring complete`, {
|
|
1291
|
+
done: totalDone,
|
|
1292
|
+
failed: totalFailed,
|
|
1293
|
+
total: tasksTotal,
|
|
1294
|
+
polls: pollCount,
|
|
1295
|
+
});
|
|
1296
|
+
return monitorState;
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
// Wait for next poll cycle
|
|
1300
|
+
await new Promise(r => setTimeout(r, pollIntervalMs));
|
|
1301
|
+
}
|
|
1302
|
+
|
|
1303
|
+
// Reached here due to pause signal — return current state
|
|
1304
|
+
const now = Date.now();
|
|
1305
|
+
const laneSnapshots: LaneMonitorSnapshot[] = lanes.map(lane => ({
|
|
1306
|
+
laneId: lane.laneId,
|
|
1307
|
+
laneNumber: lane.laneNumber,
|
|
1308
|
+
sessionName: lane.tmuxSessionName,
|
|
1309
|
+
sessionAlive: tmuxHasSession(lane.tmuxSessionName),
|
|
1310
|
+
currentTaskId: null,
|
|
1311
|
+
currentTaskSnapshot: null,
|
|
1312
|
+
completedTasks: [],
|
|
1313
|
+
failedTasks: [],
|
|
1314
|
+
remainingTasks: lane.tasks.map(t => t.taskId),
|
|
1315
|
+
}));
|
|
1316
|
+
|
|
1317
|
+
return {
|
|
1318
|
+
lanes: laneSnapshots,
|
|
1319
|
+
tasksDone: 0,
|
|
1320
|
+
tasksFailed: 0,
|
|
1321
|
+
tasksTotal,
|
|
1322
|
+
waveNumber,
|
|
1323
|
+
pollCount,
|
|
1324
|
+
lastPollTime: now,
|
|
1325
|
+
allTerminal: false,
|
|
1326
|
+
};
|
|
1327
|
+
}
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
// ── Transitive Dependent Computation ─────────────────────────────────
|
|
1331
|
+
|
|
1332
|
+
/**
|
|
1333
|
+
* Compute transitive dependents of a set of failed task IDs.
|
|
1334
|
+
*
|
|
1335
|
+
* Uses BFS through the dependency graph's `dependents` map (task → tasks
|
|
1336
|
+
* that depend on it) to find all tasks transitively blocked by the failures.
|
|
1337
|
+
*
|
|
1338
|
+
* Example: if A failed, B depends on A, and C depends on B, then both B
|
|
1339
|
+
* and C are transitively blocked.
|
|
1340
|
+
*
|
|
1341
|
+
* The failed tasks themselves are NOT included in the output — only their
|
|
1342
|
+
* downstream dependents.
|
|
1343
|
+
*
|
|
1344
|
+
* @param failedTaskIds - Set of task IDs that failed
|
|
1345
|
+
* @param dependencyGraph - Dependency graph with dependents map
|
|
1346
|
+
* @returns Set of task IDs transitively blocked (excludes the failed tasks themselves)
|
|
1347
|
+
*/
|
|
1348
|
+
export function computeTransitiveDependents(
|
|
1349
|
+
failedTaskIds: Set<string>,
|
|
1350
|
+
dependencyGraph: DependencyGraph,
|
|
1351
|
+
): Set<string> {
|
|
1352
|
+
const blocked = new Set<string>();
|
|
1353
|
+
const queue = [...failedTaskIds];
|
|
1354
|
+
|
|
1355
|
+
while (queue.length > 0) {
|
|
1356
|
+
const current = queue.shift()!;
|
|
1357
|
+
const dependents = dependencyGraph.dependents.get(current) || [];
|
|
1358
|
+
|
|
1359
|
+
// Deterministic: sort dependents alphabetically
|
|
1360
|
+
const sortedDependents = [...dependents].sort();
|
|
1361
|
+
|
|
1362
|
+
for (const dep of sortedDependents) {
|
|
1363
|
+
if (blocked.has(dep)) continue;
|
|
1364
|
+
if (failedTaskIds.has(dep)) continue; // Don't re-add failed tasks
|
|
1365
|
+
blocked.add(dep);
|
|
1366
|
+
queue.push(dep); // Continue BFS for transitive closure
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1370
|
+
return blocked;
|
|
1371
|
+
}
|
|
1372
|
+
|
|
1373
|
+
|
|
1374
|
+
// ── Wave Execution Core ──────────────────────────────────────────────
|
|
1375
|
+
|
|
1376
|
+
/**
|
|
1377
|
+
* Execute a single wave: allocate lanes, run tasks in parallel, monitor, apply failure policy.
|
|
1378
|
+
*
|
|
1379
|
+
* Orchestration flow:
|
|
1380
|
+
* 1. Allocate lanes via allocateLanes() (worktree creation + task assignment)
|
|
1381
|
+
* 2. Start all lanes in parallel (each lane executes tasks sequentially)
|
|
1382
|
+
* 3. Start monitoring as a sibling async loop
|
|
1383
|
+
* 4. Wait for all lanes to complete (or policy-triggered early termination)
|
|
1384
|
+
* 5. Apply failure handling policy
|
|
1385
|
+
* 6. Build and return WaveExecutionResult
|
|
1386
|
+
*
|
|
1387
|
+
* Failure policy behavior:
|
|
1388
|
+
* - **skip-dependents**: In-flight tasks continue. Failed task's transitive
|
|
1389
|
+
* dependents are collected in blockedTaskIds for future wave pruning.
|
|
1390
|
+
* Current wave runs to completion.
|
|
1391
|
+
* - **stop-wave**: On first failure, pauseSignal is set. In-flight tasks
|
|
1392
|
+
* finish their current work, remaining tasks in lanes are skipped.
|
|
1393
|
+
* No next wave is started (stoppedEarly=true).
|
|
1394
|
+
* - **stop-all**: On first failure, all TMUX sessions are killed immediately.
|
|
1395
|
+
* Returns with aborted status.
|
|
1396
|
+
*
|
|
1397
|
+
* Concurrency model:
|
|
1398
|
+
* - Lane execution promises are NOT cancellable (tmux sessions run externally)
|
|
1399
|
+
* - stop-all kills sessions directly; executeLane() detects session death on next poll
|
|
1400
|
+
* - Monitoring stops when all lanes reach terminal state or pauseSignal is set
|
|
1401
|
+
*
|
|
1402
|
+
* @param waveTasks - Task IDs in this wave
|
|
1403
|
+
* @param waveIndex - Wave number (1-indexed)
|
|
1404
|
+
* @param pending - Full pending task map from discovery
|
|
1405
|
+
* @param config - Orchestrator configuration
|
|
1406
|
+
* @param repoRoot - Main repository root
|
|
1407
|
+
* @param batchId - Batch ID for naming
|
|
1408
|
+
* @param pauseSignal - Shared pause signal (mutated by stop-wave policy)
|
|
1409
|
+
* @param dependencyGraph - Dependency graph for computing transitive dependents
|
|
1410
|
+
* @param onMonitorUpdate - Optional callback for dashboard updates during monitoring
|
|
1411
|
+
* @param onLanesAllocated - Optional callback fired after lane allocation succeeds
|
|
1412
|
+
* @returns WaveExecutionResult with outcomes and blocked task IDs
|
|
1413
|
+
*/
|
|
1414
|
+
export async function executeWave(
|
|
1415
|
+
waveTasks: string[],
|
|
1416
|
+
waveIndex: number,
|
|
1417
|
+
pending: Map<string, ParsedTask>,
|
|
1418
|
+
config: OrchestratorConfig,
|
|
1419
|
+
repoRoot: string,
|
|
1420
|
+
batchId: string,
|
|
1421
|
+
pauseSignal: { paused: boolean },
|
|
1422
|
+
dependencyGraph: DependencyGraph,
|
|
1423
|
+
onMonitorUpdate?: MonitorUpdateCallback,
|
|
1424
|
+
onLanesAllocated?: (lanes: AllocatedLane[]) => void,
|
|
1425
|
+
): Promise<WaveExecutionResult> {
|
|
1426
|
+
const startedAt = Date.now();
|
|
1427
|
+
const policy = config.failure.on_task_failure;
|
|
1428
|
+
|
|
1429
|
+
execLog("wave", `W${waveIndex}`, `starting wave execution`, {
|
|
1430
|
+
tasks: waveTasks.length,
|
|
1431
|
+
policy,
|
|
1432
|
+
batchId,
|
|
1433
|
+
});
|
|
1434
|
+
|
|
1435
|
+
// ── Stage 1: Allocate lanes ──────────────────────────────────
|
|
1436
|
+
const allocResult = allocateLanes(waveTasks, pending, config, repoRoot, batchId);
|
|
1437
|
+
|
|
1438
|
+
if (!allocResult.success) {
|
|
1439
|
+
const errMsg = allocResult.error?.message || "Unknown allocation failure";
|
|
1440
|
+
execLog("wave", `W${waveIndex}`, `lane allocation failed: ${errMsg}`);
|
|
1441
|
+
|
|
1442
|
+
return {
|
|
1443
|
+
waveIndex,
|
|
1444
|
+
startedAt,
|
|
1445
|
+
endedAt: Date.now(),
|
|
1446
|
+
laneResults: [],
|
|
1447
|
+
policyApplied: policy,
|
|
1448
|
+
stoppedEarly: true,
|
|
1449
|
+
failedTaskIds: waveTasks, // All tasks in the wave are considered failed
|
|
1450
|
+
skippedTaskIds: [],
|
|
1451
|
+
succeededTaskIds: [],
|
|
1452
|
+
blockedTaskIds: [...computeTransitiveDependents(new Set(waveTasks), dependencyGraph)],
|
|
1453
|
+
laneCount: 0,
|
|
1454
|
+
overallStatus: "failed",
|
|
1455
|
+
finalMonitorState: null,
|
|
1456
|
+
allocatedLanes: [],
|
|
1457
|
+
};
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
const lanes = allocResult.lanes;
|
|
1461
|
+
onLanesAllocated?.(lanes);
|
|
1462
|
+
|
|
1463
|
+
execLog("wave", `W${waveIndex}`, `lanes allocated`, {
|
|
1464
|
+
laneCount: lanes.length,
|
|
1465
|
+
totalTasks: waveTasks.length,
|
|
1466
|
+
});
|
|
1467
|
+
|
|
1468
|
+
// ── Stage 2+3: Start lanes in parallel + monitoring ──────────
|
|
1469
|
+
// Create per-wave pause signal that can be triggered by policy
|
|
1470
|
+
// while preserving the external pauseSignal from /orch-pause
|
|
1471
|
+
const wavePauseSignal = pauseSignal;
|
|
1472
|
+
|
|
1473
|
+
// Start lane execution promises
|
|
1474
|
+
const lanePromises = lanes.map(lane =>
|
|
1475
|
+
executeLane(lane, config, repoRoot, wavePauseSignal),
|
|
1476
|
+
);
|
|
1477
|
+
|
|
1478
|
+
// Start monitoring as a sibling async loop
|
|
1479
|
+
// Monitor runs concurrently and stops when all lanes are terminal or paused
|
|
1480
|
+
const monitorPromise = monitorLanes(
|
|
1481
|
+
lanes,
|
|
1482
|
+
config,
|
|
1483
|
+
repoRoot,
|
|
1484
|
+
wavePauseSignal,
|
|
1485
|
+
waveIndex,
|
|
1486
|
+
onMonitorUpdate,
|
|
1487
|
+
);
|
|
1488
|
+
|
|
1489
|
+
// ── Stage 4: Wait for all lanes + apply policy ───────────────
|
|
1490
|
+
// We need to detect the first failure to apply policy.
|
|
1491
|
+
// Use Promise.allSettled on lanes, then check results.
|
|
1492
|
+
// For stop-all, we also need to react proactively.
|
|
1493
|
+
|
|
1494
|
+
let laneResults: LaneExecutionResult[];
|
|
1495
|
+
let finalMonitorState: MonitorState | null = null;
|
|
1496
|
+
|
|
1497
|
+
if (policy === "stop-all") {
|
|
1498
|
+
// For stop-all: race detection — as soon as any lane reports failure,
|
|
1499
|
+
// kill all sessions immediately.
|
|
1500
|
+
laneResults = await executeWithStopAll(lanes, lanePromises, wavePauseSignal, waveIndex);
|
|
1501
|
+
} else {
|
|
1502
|
+
// For skip-dependents and stop-wave:
|
|
1503
|
+
// Let all lanes run to completion (or until pauseSignal stops them).
|
|
1504
|
+
// For stop-wave, we set pauseSignal when we detect failure in results.
|
|
1505
|
+
const settled = await Promise.allSettled(lanePromises);
|
|
1506
|
+
|
|
1507
|
+
laneResults = settled.map((result, idx) => {
|
|
1508
|
+
if (result.status === "fulfilled") {
|
|
1509
|
+
return result.value;
|
|
1510
|
+
}
|
|
1511
|
+
// Rejected promise — shouldn't normally happen (executeLane catches errors)
|
|
1512
|
+
const errMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);
|
|
1513
|
+
execLog("wave", `W${waveIndex}`, `lane ${lanes[idx].laneId} promise rejected: ${errMsg}`);
|
|
1514
|
+
return {
|
|
1515
|
+
laneNumber: lanes[idx].laneNumber,
|
|
1516
|
+
laneId: lanes[idx].laneId,
|
|
1517
|
+
tasks: lanes[idx].tasks.map(t => ({
|
|
1518
|
+
taskId: t.taskId,
|
|
1519
|
+
status: "failed" as LaneTaskStatus,
|
|
1520
|
+
startTime: null,
|
|
1521
|
+
endTime: null,
|
|
1522
|
+
exitReason: `Lane promise rejected: ${errMsg}`,
|
|
1523
|
+
sessionName: lanes[idx].tmuxSessionName,
|
|
1524
|
+
doneFileFound: false,
|
|
1525
|
+
})),
|
|
1526
|
+
overallStatus: "failed" as const,
|
|
1527
|
+
startTime: startedAt,
|
|
1528
|
+
endTime: Date.now(),
|
|
1529
|
+
};
|
|
1530
|
+
});
|
|
1531
|
+
|
|
1532
|
+
// For stop-wave: if any task failed, set pause to prevent next wave
|
|
1533
|
+
if (policy === "stop-wave") {
|
|
1534
|
+
const hasFailure = laneResults.some(lr =>
|
|
1535
|
+
lr.tasks.some(t => t.status === "failed" || t.status === "stalled"),
|
|
1536
|
+
);
|
|
1537
|
+
if (hasFailure) {
|
|
1538
|
+
wavePauseSignal.paused = true;
|
|
1539
|
+
execLog("wave", `W${waveIndex}`, `stop-wave policy triggered — pausing after this wave`);
|
|
1540
|
+
}
|
|
1541
|
+
}
|
|
1542
|
+
}
|
|
1543
|
+
|
|
1544
|
+
// Stop the monitor (it should stop naturally when lanes are terminal,
|
|
1545
|
+
// but ensure it's stopped if we triggered pause)
|
|
1546
|
+
try {
|
|
1547
|
+
finalMonitorState = await monitorPromise;
|
|
1548
|
+
} catch {
|
|
1549
|
+
// Monitor error is non-fatal
|
|
1550
|
+
execLog("wave", `W${waveIndex}`, `monitor promise error (non-fatal)`);
|
|
1551
|
+
}
|
|
1552
|
+
|
|
1553
|
+
// ── Stage 5: Build WaveExecutionResult ───────────────────────
|
|
1554
|
+
const failedTaskIds: string[] = [];
|
|
1555
|
+
const skippedTaskIds: string[] = [];
|
|
1556
|
+
const succeededTaskIds: string[] = [];
|
|
1557
|
+
|
|
1558
|
+
for (const lr of laneResults) {
|
|
1559
|
+
for (const t of lr.tasks) {
|
|
1560
|
+
if (t.status === "succeeded") {
|
|
1561
|
+
succeededTaskIds.push(t.taskId);
|
|
1562
|
+
} else if (t.status === "failed" || t.status === "stalled") {
|
|
1563
|
+
failedTaskIds.push(t.taskId);
|
|
1564
|
+
} else if (t.status === "skipped") {
|
|
1565
|
+
skippedTaskIds.push(t.taskId);
|
|
1566
|
+
}
|
|
1567
|
+
}
|
|
1568
|
+
}
|
|
1569
|
+
|
|
1570
|
+
// Sort for deterministic output
|
|
1571
|
+
failedTaskIds.sort();
|
|
1572
|
+
skippedTaskIds.sort();
|
|
1573
|
+
succeededTaskIds.sort();
|
|
1574
|
+
|
|
1575
|
+
// Compute blocked tasks for future waves (skip-dependents policy)
|
|
1576
|
+
let blockedTaskIds: string[] = [];
|
|
1577
|
+
if (policy === "skip-dependents" && failedTaskIds.length > 0) {
|
|
1578
|
+
const blocked = computeTransitiveDependents(
|
|
1579
|
+
new Set(failedTaskIds),
|
|
1580
|
+
dependencyGraph,
|
|
1581
|
+
);
|
|
1582
|
+
blockedTaskIds = [...blocked].sort();
|
|
1583
|
+
if (blockedTaskIds.length > 0) {
|
|
1584
|
+
execLog("wave", `W${waveIndex}`, `skip-dependents: ${blockedTaskIds.length} task(s) blocked for future waves`, {
|
|
1585
|
+
blocked: blockedTaskIds.join(","),
|
|
1586
|
+
});
|
|
1587
|
+
}
|
|
1588
|
+
}
|
|
1589
|
+
|
|
1590
|
+
// Determine overall wave status
|
|
1591
|
+
const stoppedEarly = policy === "stop-all" && failedTaskIds.length > 0
|
|
1592
|
+
|| policy === "stop-wave" && failedTaskIds.length > 0;
|
|
1593
|
+
|
|
1594
|
+
let overallStatus: WaveExecutionResult["overallStatus"];
|
|
1595
|
+
if (policy === "stop-all" && failedTaskIds.length > 0) {
|
|
1596
|
+
overallStatus = "aborted";
|
|
1597
|
+
} else if (failedTaskIds.length === 0) {
|
|
1598
|
+
overallStatus = "succeeded";
|
|
1599
|
+
} else if (succeededTaskIds.length > 0) {
|
|
1600
|
+
overallStatus = "partial";
|
|
1601
|
+
} else {
|
|
1602
|
+
overallStatus = "failed";
|
|
1603
|
+
}
|
|
1604
|
+
|
|
1605
|
+
const endedAt = Date.now();
|
|
1606
|
+
const elapsedSec = Math.round((endedAt - startedAt) / 1000);
|
|
1607
|
+
|
|
1608
|
+
execLog("wave", `W${waveIndex}`, `wave execution complete: ${overallStatus}`, {
|
|
1609
|
+
succeeded: succeededTaskIds.length,
|
|
1610
|
+
failed: failedTaskIds.length,
|
|
1611
|
+
skipped: skippedTaskIds.length,
|
|
1612
|
+
blocked: blockedTaskIds.length,
|
|
1613
|
+
elapsed: `${elapsedSec}s`,
|
|
1614
|
+
stoppedEarly,
|
|
1615
|
+
});
|
|
1616
|
+
|
|
1617
|
+
return {
|
|
1618
|
+
waveIndex,
|
|
1619
|
+
startedAt,
|
|
1620
|
+
endedAt,
|
|
1621
|
+
laneResults,
|
|
1622
|
+
policyApplied: policy,
|
|
1623
|
+
stoppedEarly,
|
|
1624
|
+
failedTaskIds,
|
|
1625
|
+
skippedTaskIds,
|
|
1626
|
+
succeededTaskIds,
|
|
1627
|
+
blockedTaskIds,
|
|
1628
|
+
laneCount: lanes.length,
|
|
1629
|
+
overallStatus,
|
|
1630
|
+
finalMonitorState,
|
|
1631
|
+
allocatedLanes: lanes,
|
|
1632
|
+
};
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
/**
|
|
1636
|
+
* Execute lanes with stop-all failure policy.
|
|
1637
|
+
*
|
|
1638
|
+
* Starts all lanes, then monitors for the first failure.
|
|
1639
|
+
* On first failure: kills all TMUX sessions immediately and returns.
|
|
1640
|
+
*
|
|
1641
|
+
* Uses a race pattern: wraps each lane promise to signal on failure,
|
|
1642
|
+
* then kills all sessions when first failure is detected.
|
|
1643
|
+
*
|
|
1644
|
+
* Deterministic tie-break: when multiple failures happen simultaneously,
|
|
1645
|
+
* they are ordered by timestamp (startTime), then by task ID alphabetically.
|
|
1646
|
+
*
|
|
1647
|
+
* @param lanes - Allocated lanes
|
|
1648
|
+
* @param lanePromises - Already-started lane execution promises
|
|
1649
|
+
* @param pauseSignal - Pause signal to set on abort
|
|
1650
|
+
* @param waveIndex - Wave number for logging
|
|
1651
|
+
* @returns Lane execution results (may have aborted tasks)
|
|
1652
|
+
*/
|
|
1653
|
+
export async function executeWithStopAll(
|
|
1654
|
+
lanes: AllocatedLane[],
|
|
1655
|
+
lanePromises: Promise<LaneExecutionResult>[],
|
|
1656
|
+
pauseSignal: { paused: boolean },
|
|
1657
|
+
waveIndex: number,
|
|
1658
|
+
): Promise<LaneExecutionResult[]> {
|
|
1659
|
+
// Track results as they complete
|
|
1660
|
+
const results: (LaneExecutionResult | null)[] = new Array(lanes.length).fill(null);
|
|
1661
|
+
let abortTriggered = false;
|
|
1662
|
+
|
|
1663
|
+
// Create a promise that resolves when all lanes are done
|
|
1664
|
+
// but also detects first failure
|
|
1665
|
+
const wrappedPromises = lanePromises.map(async (promise, idx) => {
|
|
1666
|
+
try {
|
|
1667
|
+
const result = await promise;
|
|
1668
|
+
results[idx] = result;
|
|
1669
|
+
|
|
1670
|
+
// Check if any task failed
|
|
1671
|
+
if (!abortTriggered) {
|
|
1672
|
+
const hasFailure = result.tasks.some(
|
|
1673
|
+
t => t.status === "failed" || t.status === "stalled",
|
|
1674
|
+
);
|
|
1675
|
+
if (hasFailure) {
|
|
1676
|
+
// First failure detected — trigger stop-all
|
|
1677
|
+
abortTriggered = true;
|
|
1678
|
+
pauseSignal.paused = true;
|
|
1679
|
+
|
|
1680
|
+
// Determine which task failed first for logging
|
|
1681
|
+
const firstFailed = result.tasks
|
|
1682
|
+
.filter(t => t.status === "failed" || t.status === "stalled")
|
|
1683
|
+
.sort((a, b) => {
|
|
1684
|
+
// Sort by startTime, then by taskId for deterministic tie-break
|
|
1685
|
+
const timeA = a.startTime || 0;
|
|
1686
|
+
const timeB = b.startTime || 0;
|
|
1687
|
+
if (timeA !== timeB) return timeA - timeB;
|
|
1688
|
+
return a.taskId.localeCompare(b.taskId);
|
|
1689
|
+
})[0];
|
|
1690
|
+
|
|
1691
|
+
execLog("wave", `W${waveIndex}`, `stop-all triggered by ${firstFailed?.taskId || "unknown"} in ${lanes[idx].laneId}`, {
|
|
1692
|
+
session: lanes[idx].tmuxSessionName,
|
|
1693
|
+
});
|
|
1694
|
+
|
|
1695
|
+
// Kill ALL lane sessions immediately
|
|
1696
|
+
for (const lane of lanes) {
|
|
1697
|
+
killLaneAndChildren(lane.tmuxSessionName);
|
|
1698
|
+
}
|
|
1699
|
+
}
|
|
1700
|
+
}
|
|
1701
|
+
|
|
1702
|
+
return result;
|
|
1703
|
+
} catch (err) {
|
|
1704
|
+
// Lane promise rejection — should be rare
|
|
1705
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
1706
|
+
if (!abortTriggered) {
|
|
1707
|
+
abortTriggered = true;
|
|
1708
|
+
pauseSignal.paused = true;
|
|
1709
|
+
execLog("wave", `W${waveIndex}`, `stop-all triggered by lane error in ${lanes[idx].laneId}: ${errMsg}`);
|
|
1710
|
+
for (const lane of lanes) {
|
|
1711
|
+
killLaneAndChildren(lane.tmuxSessionName);
|
|
1712
|
+
}
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
// Build a failed result for this lane
|
|
1716
|
+
const failedResult: LaneExecutionResult = {
|
|
1717
|
+
laneNumber: lanes[idx].laneNumber,
|
|
1718
|
+
laneId: lanes[idx].laneId,
|
|
1719
|
+
tasks: lanes[idx].tasks.map(t => ({
|
|
1720
|
+
taskId: t.taskId,
|
|
1721
|
+
status: "failed" as LaneTaskStatus,
|
|
1722
|
+
startTime: null,
|
|
1723
|
+
endTime: null,
|
|
1724
|
+
exitReason: `Lane aborted: ${errMsg}`,
|
|
1725
|
+
sessionName: lanes[idx].tmuxSessionName,
|
|
1726
|
+
doneFileFound: false,
|
|
1727
|
+
})),
|
|
1728
|
+
overallStatus: "failed",
|
|
1729
|
+
startTime: Date.now(),
|
|
1730
|
+
endTime: Date.now(),
|
|
1731
|
+
};
|
|
1732
|
+
results[idx] = failedResult;
|
|
1733
|
+
return failedResult;
|
|
1734
|
+
}
|
|
1735
|
+
});
|
|
1736
|
+
|
|
1737
|
+
// Wait for all lanes to settle (they should exit quickly after session kill)
|
|
1738
|
+
await Promise.allSettled(wrappedPromises);
|
|
1739
|
+
|
|
1740
|
+
// Fill in any null results (shouldn't happen, but defensive)
|
|
1741
|
+
return results.map((r, idx) => r || {
|
|
1742
|
+
laneNumber: lanes[idx].laneNumber,
|
|
1743
|
+
laneId: lanes[idx].laneId,
|
|
1744
|
+
tasks: [],
|
|
1745
|
+
overallStatus: "failed" as const,
|
|
1746
|
+
startTime: Date.now(),
|
|
1747
|
+
endTime: Date.now(),
|
|
1748
|
+
});
|
|
1749
|
+
}
|
|
1750
|
+
|
|
1751
|
+
// ── /orch Command — Full Execution (Step 5) ─────────────────────────
|
|
1752
|
+
|