@pi-agents/orchid 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/LICENSE +21 -0
  3. package/README.md +246 -0
  4. package/agents/AGENTS-MANIFEST.md +42 -0
  5. package/agents/brain.md +42 -0
  6. package/agents/context-builder.md +46 -0
  7. package/agents/delegate.md +12 -0
  8. package/agents/dev-1.md +42 -0
  9. package/agents/oracle.md +73 -0
  10. package/agents/planner.md +55 -0
  11. package/agents/researcher.md +52 -0
  12. package/agents/reviewer.md +79 -0
  13. package/agents/scout.md +50 -0
  14. package/agents/tester.md +45 -0
  15. package/agents/worker.md +55 -0
  16. package/extensions/ralph.ts +1 -0
  17. package/extensions/reviewer-extension.ts +125 -0
  18. package/extensions/task-orchestrator.ts +28 -0
  19. package/package.json +63 -0
  20. package/prompts/gather-context-and-clarify.md +13 -0
  21. package/prompts/parallel-cleanup.md +59 -0
  22. package/prompts/parallel-context-build.md +53 -0
  23. package/prompts/parallel-handoff-plan.md +59 -0
  24. package/prompts/parallel-research.md +50 -0
  25. package/prompts/parallel-review.md +54 -0
  26. package/prompts/review-loop.md +41 -0
  27. package/skills/orchid/SKILL.md +214 -0
  28. package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
  29. package/skills/orchid/orchid-converge/SKILL.md +124 -0
  30. package/skills/orchid/orchid-decompose/SKILL.md +201 -0
  31. package/skills/orchid/orchid-doctor/SKILL.md +162 -0
  32. package/skills/orchid/orchid-investigate/SKILL.md +102 -0
  33. package/skills/orchid/orchid-launch/SKILL.md +147 -0
  34. package/skills/ralph/SKILL.md +73 -0
  35. package/skills/subagents/pi-subagents/SKILL.md +813 -0
  36. package/src/index.ts +7 -0
  37. package/src/orchestrator/abort.ts +534 -0
  38. package/src/orchestrator/agent-bridge-extension.ts +1020 -0
  39. package/src/orchestrator/agent-host.ts +954 -0
  40. package/src/orchestrator/cleanup.ts +776 -0
  41. package/src/orchestrator/config-loader.ts +1412 -0
  42. package/src/orchestrator/config-schema.ts +690 -0
  43. package/src/orchestrator/config.ts +81 -0
  44. package/src/orchestrator/context-window.ts +66 -0
  45. package/src/orchestrator/diagnostic-reports.ts +475 -0
  46. package/src/orchestrator/diagnostics.ts +394 -0
  47. package/src/orchestrator/discovery.ts +1833 -0
  48. package/src/orchestrator/engine-worker.ts +415 -0
  49. package/src/orchestrator/engine.ts +5940 -0
  50. package/src/orchestrator/execution.ts +3104 -0
  51. package/src/orchestrator/extension.ts +5934 -0
  52. package/src/orchestrator/formatting.ts +785 -0
  53. package/src/orchestrator/git.ts +88 -0
  54. package/src/orchestrator/index.ts +28 -0
  55. package/src/orchestrator/lane-runner.ts +1787 -0
  56. package/src/orchestrator/mailbox.ts +780 -0
  57. package/src/orchestrator/merge.ts +3414 -0
  58. package/src/orchestrator/messages.ts +1062 -0
  59. package/src/orchestrator/migrations.ts +278 -0
  60. package/src/orchestrator/naming.ts +117 -0
  61. package/src/orchestrator/path-resolver.ts +275 -0
  62. package/src/orchestrator/persistence.ts +2625 -0
  63. package/src/orchestrator/process-registry.ts +452 -0
  64. package/src/orchestrator/quality-gate.ts +1085 -0
  65. package/src/orchestrator/resume.ts +3488 -0
  66. package/src/orchestrator/sessions.ts +57 -0
  67. package/src/orchestrator/settings-loader.ts +136 -0
  68. package/src/orchestrator/settings-tui.ts +2208 -0
  69. package/src/orchestrator/sidecar-telemetry.ts +267 -0
  70. package/src/orchestrator/supervisor.ts +4548 -0
  71. package/src/orchestrator/task-executor-core.ts +675 -0
  72. package/src/orchestrator/tmux-compat.ts +37 -0
  73. package/src/orchestrator/tool-allowlist-constants.ts +37 -0
  74. package/src/orchestrator/types.ts +4465 -0
  75. package/src/orchestrator/verification.ts +547 -0
  76. package/src/orchestrator/waves.ts +1564 -0
  77. package/src/orchestrator/workspace.ts +707 -0
  78. package/src/orchestrator/worktree.ts +2725 -0
  79. package/src/ralph/index.ts +825 -0
  80. package/src/subagents/agents/agent-management.ts +648 -0
  81. package/src/subagents/agents/agent-scope.ts +6 -0
  82. package/src/subagents/agents/agent-selection.ts +23 -0
  83. package/src/subagents/agents/agent-serializer.ts +86 -0
  84. package/src/subagents/agents/agents.ts +832 -0
  85. package/src/subagents/agents/chain-serializer.ts +137 -0
  86. package/src/subagents/agents/frontmatter.ts +29 -0
  87. package/src/subagents/agents/identity.ts +30 -0
  88. package/src/subagents/agents/skills.ts +632 -0
  89. package/src/subagents/extension/config.ts +16 -0
  90. package/src/subagents/extension/control-notices.ts +92 -0
  91. package/src/subagents/extension/doctor.ts +199 -0
  92. package/src/subagents/extension/fanout-child.ts +170 -0
  93. package/src/subagents/extension/index.ts +573 -0
  94. package/src/subagents/extension/schemas.ts +168 -0
  95. package/src/subagents/intercom/intercom-bridge.ts +379 -0
  96. package/src/subagents/intercom/result-intercom.ts +377 -0
  97. package/src/subagents/runs/background/async-execution.ts +712 -0
  98. package/src/subagents/runs/background/async-job-tracker.ts +310 -0
  99. package/src/subagents/runs/background/async-resume.ts +345 -0
  100. package/src/subagents/runs/background/async-status.ts +325 -0
  101. package/src/subagents/runs/background/completion-dedupe.ts +63 -0
  102. package/src/subagents/runs/background/notify.ts +108 -0
  103. package/src/subagents/runs/background/parallel-groups.ts +45 -0
  104. package/src/subagents/runs/background/result-watcher.ts +307 -0
  105. package/src/subagents/runs/background/run-id-resolver.ts +83 -0
  106. package/src/subagents/runs/background/run-status.ts +269 -0
  107. package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
  108. package/src/subagents/runs/background/subagent-runner.ts +1808 -0
  109. package/src/subagents/runs/background/top-level-async.ts +13 -0
  110. package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
  111. package/src/subagents/runs/foreground/chain-execution.ts +938 -0
  112. package/src/subagents/runs/foreground/execution.ts +918 -0
  113. package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
  114. package/src/subagents/runs/shared/completion-guard.ts +147 -0
  115. package/src/subagents/runs/shared/long-running-guard.ts +175 -0
  116. package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
  117. package/src/subagents/runs/shared/model-fallback.ts +103 -0
  118. package/src/subagents/runs/shared/nested-events.ts +819 -0
  119. package/src/subagents/runs/shared/nested-path.ts +52 -0
  120. package/src/subagents/runs/shared/nested-render.ts +115 -0
  121. package/src/subagents/runs/shared/parallel-utils.ts +109 -0
  122. package/src/subagents/runs/shared/pi-args.ts +220 -0
  123. package/src/subagents/runs/shared/pi-spawn.ts +115 -0
  124. package/src/subagents/runs/shared/run-history.ts +60 -0
  125. package/src/subagents/runs/shared/single-output.ts +164 -0
  126. package/src/subagents/runs/shared/subagent-control.ts +226 -0
  127. package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
  128. package/src/subagents/runs/shared/worktree.ts +577 -0
  129. package/src/subagents/shared/artifacts.ts +98 -0
  130. package/src/subagents/shared/atomic-json.ts +16 -0
  131. package/src/subagents/shared/file-coalescer.ts +40 -0
  132. package/src/subagents/shared/fork-context.ts +76 -0
  133. package/src/subagents/shared/formatters.ts +133 -0
  134. package/src/subagents/shared/jsonl-writer.ts +81 -0
  135. package/src/subagents/shared/model-info.ts +78 -0
  136. package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
  137. package/src/subagents/shared/session-identity.ts +10 -0
  138. package/src/subagents/shared/session-tokens.ts +44 -0
  139. package/src/subagents/shared/settings.ts +397 -0
  140. package/src/subagents/shared/status-format.ts +49 -0
  141. package/src/subagents/shared/types.ts +822 -0
  142. package/src/subagents/shared/utils.ts +450 -0
  143. package/src/subagents/slash/prompt-template-bridge.ts +397 -0
  144. package/src/subagents/slash/slash-bridge.ts +174 -0
  145. package/src/subagents/slash/slash-commands.ts +528 -0
  146. package/src/subagents/slash/slash-live-state.ts +292 -0
  147. package/src/subagents/tui/render-helpers.ts +80 -0
  148. package/src/subagents/tui/render.ts +1358 -0
  149. package/templates/agents/local/supervisor.md +33 -0
  150. package/templates/agents/local/task-merger.md +27 -0
  151. package/templates/agents/local/task-reviewer.md +30 -0
  152. package/templates/agents/local/task-worker.md +34 -0
  153. package/templates/agents/supervisor-routing.md +92 -0
  154. package/templates/agents/supervisor.md +229 -0
  155. package/templates/agents/task-merger.md +214 -0
  156. package/templates/agents/task-reviewer.md +260 -0
  157. package/templates/agents/task-worker-segment.md +44 -0
  158. package/templates/agents/task-worker.md +557 -0
  159. package/templates/tasks/CONTEXT.md +30 -0
  160. package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
  161. package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
  162. package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
  163. package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
@@ -0,0 +1,3104 @@
1
+ /**
2
+ * Lane execution, monitoring, wave execution loop
3
+ * @module orch/execution
4
+ */
5
+ import {
6
+ readFileSync,
7
+ existsSync,
8
+ statSync,
9
+ unlinkSync,
10
+ mkdirSync,
11
+ writeFileSync,
12
+ copyFileSync,
13
+ } from "fs";
14
+ import { access as fsAccess, readFile as fsReadFile, stat as fsStat } from "fs/promises";
15
+ import { join, dirname, basename, resolve, relative, delimiter as pathDelimiter } from "path";
16
+ import { userInfo } from "os";
17
+
18
+ import {
19
+ DONE_GRACE_MS,
20
+ EXECUTION_POLL_INTERVAL_MS,
21
+ ExecutionError,
22
+ SESSION_SPAWN_RETRY_MAX,
23
+ } from "./types.ts";
24
+ import type {
25
+ AllocatedLane,
26
+ AllocatedTask,
27
+ DependencyGraph,
28
+ LaneExecutionResult,
29
+ LaneMonitorSnapshot,
30
+ LaneTaskOutcome,
31
+ LaneTaskStatus,
32
+ MonitorState,
33
+ MtimeTracker,
34
+ OrchestratorConfig,
35
+ ParsedTask,
36
+ TaskMonitorSnapshot,
37
+ WaveExecutionResult,
38
+ WorkspaceConfig,
39
+ ExecutionUnit,
40
+ PacketPaths,
41
+ RuntimeAgentId,
42
+ RuntimeAgentRole,
43
+ RuntimeLaneSnapshot,
44
+ RuntimeRegistry,
45
+ SupervisorAlertCallback,
46
+ } from "./types.ts";
47
+ import { resolvePacketPaths, buildRuntimeAgentId } from "./types.ts";
48
+ import type { TaskExitDiagnostic } from "./diagnostics.ts";
49
+ import {
50
+ readRegistrySnapshot,
51
+ readLaneSnapshot,
52
+ isTerminalStatus,
53
+ isProcessAlive,
54
+ detectOrphans,
55
+ markOrphansCrashed,
56
+ buildRegistrySnapshot,
57
+ writeRegistrySnapshot,
58
+ writeLaneSnapshot,
59
+ } from "./process-registry.ts";
60
+ import { allocateLanes } from "./waves.ts";
61
+ import { resolveOperatorId } from "./naming.ts";
62
+ import { runGit, runGitWithEnv } from "./git.ts";
63
+ import { resolveTaskplanePackageFile, resolveTaskplaneAgentTemplate } from "./path-resolver.ts";
64
+ import { resolvePointer, loadWorkspaceConfig } from "./workspace.ts";
65
+
66
+ // ── OrchID Package File Resolution ────────────────────────────────
67
+ // getNpmGlobalRoot() and resolveTaskplanePackageFile() consolidated in path-resolver.ts (TP-157)
68
+
69
+ // ── RPC Wrapper Path Resolution ──────────────────────────────────────
70
+
71
+ /*
72
+ * resolveRpcWrapperPath (which located rpc-wrapper.mjs for lane sessions) used to live here.
73
+ * For the package-file resolution order, see resolveTaskplanePackageFile in path-resolver.ts.
74
+ */
75
+ // resolveRpcWrapperPath removed (TP-120 remediation: legacy session-backend dead code)
76
+
77
+ // ── Telemetry Helpers ────────────────────────────────────────────────
78
+
79
+ // resolveTelemOpId removed (TP-120 remediation: only consumer was generateTelemetryPaths)
80
+
81
+ // sanitizeForFilename + generateTelemetryPaths removed (TP-120 remediation: legacy telemetry dead code)
82
+
83
+ // generateTelemetryPaths removed (TP-120 remediation: legacy telemetry sidecar dead code)
84
+
85
+ // ── Execution Helpers ────────────────────────────────────────────────
86
+
87
/**
 * Structured log helper for lane execution.
 *
 * Writes exactly one line to stderr (via `console.error`):
 *
 *   `[orch] {laneId}/{taskId}: {message}`
 *
 * and, when `extra` is supplied, appends ` (k1=v1 k2=v2 ...)`.
 *
 * Value rendering for `extra`:
 * - primitives and arrays of primitives use default string conversion
 *   (arrays therefore render comma-separated), exactly as before;
 * - objects whose default conversion would contain the uninformative
 *   `[object Object]` are rendered as JSON instead, so structured payloads
 *   stay readable in logs. If JSON encoding fails (e.g. circular
 *   references) the default text is kept — logging never throws.
 *
 * @param laneId - Lane identifier used in the log prefix
 * @param taskId - Task identifier used in the log prefix
 * @param message - Human-readable message
 * @param extra - Optional correlation fields appended as `key=value` pairs
 */
export function execLog(
	laneId: string,
	taskId: string,
	message: string,
	extra?: Record<string, unknown>,
): void {
	const prefix = `[orch] ${laneId}/${taskId}`;
	if (!extra) {
		console.error(`${prefix}: ${message}`);
		return;
	}

	const renderValue = (value: unknown): string => {
		let text: string;
		try {
			text = String(value);
		} catch {
			// e.g. objects without a usable toString (null-prototype objects)
			text = "[object Object]";
		}
		const isObject = typeof value === "object" && value !== null;
		if (!isObject || !text.includes("[object Object]")) return text;
		try {
			return JSON.stringify(value) ?? text;
		} catch {
			// Circular structure, BigInt member, throwing toJSON, ...
			return text;
		}
	};

	const fields = Object.entries(extra)
		.map(([key, value]) => `${key}=${renderValue(value)}`)
		.join(" ");
	console.error(`${prefix}: ${message} (${fields})`);
}
119
+
120
/**
 * TP-112: Liveness probe for a V2 agent, answered from the cached process
 * registry (`_v2LivenessRegistryCache`).
 *
 * An agent counts as alive when its registry entry has a non-terminal status
 * and `isProcessAlive` reports its PID as running. Lookup order:
 *   1. entry keyed by `agentIdOrSessionName` itself;
 *   2. entry keyed by `${agentIdOrSessionName}-worker` (the monitor passes a
 *      lane session name while the registry is keyed by agent ID);
 *   3. TP-148: when `laneNumber` is given, any entry with role "worker" on
 *      that lane number. Per the original note this covers workspace mode,
 *      where the session ID embeds a repoId and a local lane number
 *      (e.g. "orch-henry-api-lane-1") while the registry uses global lane
 *      numbers without repoId (e.g. "orch-henry-lane-3-worker").
 *
 * Returns false when no registry is cached or nothing matches.
 *
 * @param agentIdOrSessionName - Agent ID or lane session name to look up
 * @param _runtimeBackend - Not read by this function; kept in the signature for callers
 * @param laneNumber - Optional global lane number enabling the lane-number fallback
 * @returns true if a matching live agent was found
 * @since TP-112
 */
export function isV2AgentAlive(
	agentIdOrSessionName: string,
	_runtimeBackend?: RuntimeBackend,
	laneNumber?: number,
): boolean {
	const registry = _v2LivenessRegistryCache;
	if (!registry) return false;
	const agents = registry.agents;

	// Exact key first, then the "-worker" variant of the same name.
	for (const key of [agentIdOrSessionName, `${agentIdOrSessionName}-worker`]) {
		const manifest = agents[key];
		if (manifest && !isTerminalStatus(manifest.status) && isProcessAlive(manifest.pid)) {
			return true;
		}
	}

	// Without a lane number there is nothing left to try.
	if (laneNumber == null) return false;

	// TP-148 fallback: scan for a live worker on the given lane number.
	return Object.values(agents).some(
		(agent) =>
			agent.laneNumber === laneNumber &&
			agent.role === "worker" &&
			!isTerminalStatus(agent.status) &&
			isProcessAlive(agent.pid),
	);
}
169
+
170
/**
 * Registry snapshot shared by the V2 liveness and kill helpers in this module.
 * `null` means no snapshot has been provided (or it was cleared).
 * @since TP-112
 */
let _v2LivenessRegistryCache: RuntimeRegistry | null = null;

/**
 * Install (or clear, by passing `null`) the registry snapshot used for V2
 * liveness checks.
 *
 * Per the original note, the monitor calls this at the start of each poll so
 * the registry file is not re-read for every task.
 *
 * @param registry - Snapshot to cache, or `null` to clear the cache
 * @since TP-112
 */
export function setV2LivenessRegistryCache(registry: RuntimeRegistry | null): void {
	_v2LivenessRegistryCache = registry;
}
181
+
182
/**
 * TP-112: Send SIGTERM to the V2 agents of a lane, using PIDs from the registry.
 *
 * Registry source: the monitor cache when one is set; otherwise, if both
 * `options.stateRoot` and `options.batchId` are provided, a snapshot read via
 * `readRegistrySnapshot`. With no registry available the call is a no-op.
 *
 * Targets, in order:
 *   1. entries keyed `${sessionName}-worker`, `${sessionName}-reviewer`, and
 *      `sessionName` itself;
 *   2. TP-148: when `options.laneNumber` is given, every entry on that lane
 *      number (workspace-mode fallback for repoId / local-vs-global lane
 *      naming mismatches).
 *
 * Only entries with a non-terminal status and a live PID are signalled, and
 * each PID is signalled at most once. Errors from `process.kill` are ignored
 * (the process is assumed to be gone already).
 *
 * @param sessionName - Lane session name used to derive registry keys
 * @param options - Optional registry location (`stateRoot` + `batchId`),
 *   log context label (defaults to "monitor"), and global lane number
 * @since TP-112
 */
export function killV2LaneAgents(
	sessionName: string,
	options?: { stateRoot?: string; batchId?: string; logContext?: string; laneNumber?: number },
): void {
	const readFreshSnapshot = () =>
		options?.stateRoot && options?.batchId
			? readRegistrySnapshot(options.stateRoot, options.batchId)
			: null;
	const registry = _v2LivenessRegistryCache ?? readFreshSnapshot();
	if (!registry) return;

	const agents = registry.agents;
	const logContext = options?.logContext ?? "monitor";
	const killedPids = new Set<number>();

	// Signal one PID; record and log it only if the signal was delivered.
	const terminate = (pid: number, logId: string, logMessage: string): void => {
		try {
			process.kill(pid, "SIGTERM");
			killedPids.add(pid);
			execLog(logContext, logId, logMessage);
		} catch {
			/* already dead */
		}
	};

	const candidateKeys = [`${sessionName}-worker`, `${sessionName}-reviewer`, `${sessionName}`];
	for (const key of candidateKeys) {
		const manifest = agents[key];
		if (!manifest) continue;
		if (isTerminalStatus(manifest.status)) continue;
		if (!isProcessAlive(manifest.pid)) continue;
		if (killedPids.has(manifest.pid)) continue;
		terminate(manifest.pid, key, `killed V2 agent (PID ${manifest.pid})`);
	}

	// TP-148: Workspace-mode fallback — match by global lane number when
	// session name lookup misses (repoId/local-vs-global lane mismatch).
	const laneNumber = options?.laneNumber;
	if (laneNumber == null) return;
	for (const agent of Object.values(agents)) {
		if (agent.laneNumber !== laneNumber) continue;
		if (isTerminalStatus(agent.status)) continue;
		if (!isProcessAlive(agent.pid)) continue;
		if (killedPids.has(agent.pid)) continue;
		terminate(agent.pid, agent.agentId, `killed V2 agent by lane number (PID ${agent.pid})`);
	}
}
243
+
244
+ // ── Async File/Status Helpers (TP-070) ───────────────────────────────
245
+
246
/**
 * Async version of readTaskStatusTail — reads the tail of STATUS.md without
 * blocking the event loop.
 *
 * CRLF line endings are normalised to LF and surrounding whitespace is
 * trimmed. The last `maxLines` lines are kept; if that text is still longer
 * than `maxChars`, only its final `maxChars` characters are returned.
 *
 * A limit below 1 (or NaN) yields an empty string. Without that guard,
 * `slice(-0)` would behave like `slice(0)` and return the entire content.
 *
 * @param statusPath - Path to STATUS.md
 * @param maxLines - Maximum number of lines to return
 * @param maxChars - Maximum character count
 * @returns Promise resolving to status tail text (empty string if missing/unreadable)
 *
 * @since TP-070
 */
export async function readTaskStatusTailAsync(
	statusPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): Promise<string> {
	const lineLimit = Math.floor(maxLines);
	const charLimit = Math.floor(maxChars);
	if (!(lineLimit >= 1) || !(charLimit >= 1)) return "";

	try {
		// No separate access() probe: a missing or unreadable file makes
		// readFile reject, which lands in the same empty-string fallback.
		const raw = (await fsReadFile(statusPath, "utf-8")).replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-lineLimit).join("\n").trim();
		if (!tail) return "";
		return tail.length > charLimit ? tail.slice(-charLimit) : tail;
	} catch {
		return "";
	}
}
277
+
278
/*
 * Legacy lane environment-variable helper (buildLaneEnvVars) removed in TP-120
 * (remediation: legacy lane-session env var path, dead code).
 *
 * Runtime V2 lane execution now runs through lane-runner/agent-host and no
 * longer injects task-runner autostart/session env vars from this module.
 */

/**
 * Return the session identifier stored on a lane.
 *
 * Accepts any object exposing `laneSessionId`, so callers need not hold a
 * full `AllocatedLane`.
 */
function laneSessionIdOf(lane: Pick<AllocatedLane, "laneSessionId">): string {
	const { laneSessionId } = lane;
	return laneSessionId;
}
289
+
290
/**
 * Build the lane session log path for one task execution.
 *
 * The result is `<worktreePath>/.pi/orch-logs/<laneSessionId>-<taskId>.log`,
 * i.e. the log sits inside the lane worktree next to the task state.
 *
 * @param lane - Lane supplying the worktree path and session ID
 * @param task - Task supplying the task ID
 * @returns Log file path under the lane worktree
 */
export function resolveLaneLogPath(lane: AllocatedLane, task: AllocatedTask): string {
	const logFileName = `${laneSessionIdOf(lane)}-${task.taskId}.log`;
	return join(lane.worktreePath, ".pi", "orch-logs", logFileName);
}
299
+
300
/**
 * Worktree-relative lane log path (`.pi/orch-logs/<laneSessionId>-<taskId>.log`),
 * always written with forward slashes.
 *
 * Per the original note this form is used by the legacy shell-spawn path,
 * where a relative path avoids Windows drive-letter parsing issues in shell
 * redirection.
 *
 * @param lane - Lane supplying the session ID
 * @param task - Task supplying the task ID
 * @returns Relative log path using "/" separators
 */
export function resolveLaneLogRelativePath(lane: AllocatedLane, task: AllocatedTask): string {
	const logFileName = `${laneSessionIdOf(lane)}-${task.taskId}.log`;
	const nativeRelativePath = join(".pi", "orch-logs", logFileName);
	// join() emits backslashes on Windows; normalise them away.
	return nativeRelativePath.replace(/\\/g, "/");
}
308
+
309
/**
 * Read a tail snippet from a lane log file for failure diagnostics.
 *
 * CRLF line endings are normalised to LF and surrounding whitespace is
 * trimmed before the last `maxLines` lines are taken, so a trailing newline
 * at the end of the log does not use up one of the `maxLines` slots. If the
 * selected text is longer than `maxChars`, only its final `maxChars`
 * characters are returned.
 *
 * A limit below 1 (or NaN) yields an empty string. Without that guard,
 * `slice(-0)` would behave like `slice(0)` and return the entire content.
 *
 * @param logPath - Path to the lane log file
 * @param maxLines - Maximum number of lines to return
 * @param maxChars - Maximum character count
 * @returns Log tail text (empty string if missing, empty, or unreadable)
 */
export function readLaneLogTail(
	logPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	const lineLimit = Math.floor(maxLines);
	const charLimit = Math.floor(maxChars);
	if (!(lineLimit >= 1) || !(charLimit >= 1)) return "";

	if (!existsSync(logPath)) return "";
	try {
		const raw = readFileSync(logPath, "utf-8").replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-lineLimit).join("\n").trim();
		if (!tail) return "";
		return tail.length > charLimit ? tail.slice(-charLimit) : tail;
	} catch {
		return "";
	}
}
327
+
328
/**
 * Async version of readLaneLogTail — reads the lane log tail without
 * blocking the event loop.
 *
 * CRLF line endings are normalised to LF and surrounding whitespace is
 * trimmed before the last `maxLines` lines are taken, so a trailing newline
 * at the end of the log does not use up one of the `maxLines` slots. If the
 * selected text is longer than `maxChars`, only its final `maxChars`
 * characters are returned.
 *
 * A limit below 1 (or NaN) yields an empty string. Without that guard,
 * `slice(-0)` would behave like `slice(0)` and return the entire content.
 *
 * @param logPath - Path to the lane log file
 * @param maxLines - Maximum number of lines to return
 * @param maxChars - Maximum character count
 * @returns Promise resolving to log tail text (empty string if missing, empty, or unreadable)
 *
 * @since TP-070
 */
export async function readLaneLogTailAsync(
	logPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): Promise<string> {
	const lineLimit = Math.floor(maxLines);
	const charLimit = Math.floor(maxChars);
	if (!(lineLimit >= 1) || !(charLimit >= 1)) return "";

	try {
		// No separate access() probe: a missing or unreadable file makes
		// readFile reject, which lands in the same empty-string fallback.
		const raw = (await fsReadFile(logPath, "utf-8")).replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-lineLimit).join("\n").trim();
		if (!tail) return "";
		return tail.length > charLimit ? tail.slice(-charLimit) : tail;
	} catch {
		return "";
	}
}
353
+
354
/**
 * Non-blocking existence probe, intended as a replacement for `existsSync`
 * in polling paths.
 *
 * Resolves to `true` when `access()` succeeds for the path and to `false`
 * when it rejects for any reason; the promise itself never rejects.
 *
 * @param filePath - Path to check
 * @returns Promise resolving to true if the path is accessible
 *
 * @since TP-070
 */
export async function fileExistsAsync(filePath: string): Promise<boolean> {
	let accessible = true;
	try {
		await fsAccess(filePath);
	} catch {
		accessible = false;
	}
	return accessible;
}
371
+
372
/**
 * Read a tail snippet from task STATUS.md for failure diagnostics.
 *
 * CRLF line endings are normalised to LF and surrounding whitespace is
 * trimmed. The last `maxLines` lines are kept; if that text is still longer
 * than `maxChars`, only its final `maxChars` characters are returned.
 *
 * A limit below 1 (or NaN) yields an empty string. Without that guard,
 * `slice(-0)` would behave like `slice(0)` and return the entire content.
 *
 * @param statusPath - Path to STATUS.md
 * @param maxLines - Maximum number of lines to return
 * @param maxChars - Maximum character count
 * @returns Status tail text (empty string if missing, empty, or unreadable)
 */
export function readTaskStatusTail(
	statusPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	const lineLimit = Math.floor(maxLines);
	const charLimit = Math.floor(maxChars);
	if (!(lineLimit >= 1) || !(charLimit >= 1)) return "";

	if (!existsSync(statusPath)) return "";
	try {
		const raw = readFileSync(statusPath, "utf-8").replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-lineLimit).join("\n").trim();
		if (!tail) return "";
		return tail.length > charLimit ? tail.slice(-charLimit) : tail;
	} catch {
		return "";
	}
}
391
+
392
/**
 * Output of canonical task-folder path resolution.
 *
 * Bundles the resolved task folder together with the `.DONE` and `STATUS.md`
 * paths derived from it, so every caller probes the same locations instead of
 * re-deriving them independently.
 */
export interface ResolvedTaskPaths {
	/** Resolved task folder; may point into a worktree, an archive folder, or an external location. */
	taskFolderResolved: string;
	/** Path of the `.DONE` marker inside the resolved task folder. */
	donePath: string;
	/** Path of `STATUS.md` inside the resolved task folder. */
	statusPath: string;
}
406
+
407
/**
 * Canonical task-folder path resolver.
 *
 * Single source of truth for translating a task folder path (as stored in
 * ParsedTask) into the filesystem paths probed for `.DONE` and `STATUS.md`.
 *
 * Folder selection:
 *
 * 1. **Task folder strictly inside `repoRoot`** (either mode): the path
 *    relative to `repoRoot` is re-rooted under `worktreePath`, because the
 *    worktree mirrors the repo layout.
 *
 * 2. **Task folder outside `repoRoot`, workspace mode**: resolves to
 *    `<worktreePath>/.orchid-tasks/<taskDirName>/`. The original comment
 *    attributes copying task files there to `buildLaneEnvVars`, which has
 *    since been removed from this module — NOTE(review): confirm the current
 *    producer of that folder.
 *
 * 3. **Task folder outside `repoRoot`, repo mode**: the absolute task folder
 *    path is used as-is.
 *
 * Archive fallback: if neither `.DONE` nor `STATUS.md` exists in the selected
 * folder, `<parent>/archive/<taskDirName>/` is probed and returned when either
 * file exists there. If nothing exists anywhere, the primary (non-archive)
 * paths are returned so callers can keep probing them.
 *
 * @param taskFolder - Task folder path (from ParsedTask.taskFolder)
 * @param worktreePath - Path to the lane worktree
 * @param repoRoot - Path to the main repository root
 * @param isWorkspaceMode - Selects the `.orchid-tasks` mapping for task
 *   folders outside `repoRoot`
 * @returns Resolved paths for task folder, .DONE, and STATUS.md
 */
export function resolveCanonicalTaskPaths(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): ResolvedTaskPaths {
	// Compare on absolute, forward-slash paths so Windows separators match.
	const repoRootNorm = resolve(repoRoot).replace(/\\/g, "/");
	const folderNorm = resolve(taskFolder).replace(/\\/g, "/");

	// A filesystem root ("/" or "C:/") already ends with "/". Appending another
	// one would produce "//" and the prefix test could never succeed.
	const repoPrefix = repoRootNorm.endsWith("/") ? repoRootNorm : `${repoRootNorm}/`;
	const isInsideRepo =
		folderNorm.length > repoPrefix.length && folderNorm.startsWith(repoPrefix);

	let resolvedFolder: string;
	if (isInsideRepo) {
		// Same relative location, re-rooted in the lane worktree. The worker
		// writes .DONE and STATUS.md there, so that is where we must look.
		resolvedFolder = join(worktreePath, folderNorm.slice(repoPrefix.length));
	} else if (isWorkspaceMode) {
		// Cross-repo task in workspace mode: worktree-local copy under .orchid-tasks/.
		resolvedFolder = join(worktreePath, ".orchid-tasks", basename(resolve(taskFolder)));
	} else {
		// Fallback: use absolute path directly.
		resolvedFolder = resolve(taskFolder);
	}

	const pathsIn = (folder: string): ResolvedTaskPaths => ({
		taskFolderResolved: folder,
		donePath: join(folder, ".DONE"),
		statusPath: join(folder, "STATUS.md"),
	});
	const hasTaskFiles = (paths: ResolvedTaskPaths): boolean =>
		existsSync(paths.donePath) || existsSync(paths.statusPath);

	const primary = pathsIn(resolvedFolder);
	if (hasTaskFiles(primary)) return primary;

	// Archive fallback: worker may have archived the task folder during the
	// "Documentation & Delivery" step, moving it under `.../archive/TASK-ID/`.
	// dirname/basename keep the parent absolute even for root-level folders.
	const resolvedAbs = resolve(resolvedFolder);
	const archived = pathsIn(join(dirname(resolvedAbs), "archive", basename(resolvedAbs)));
	if (hasTaskFiles(archived)) return archived;

	// Return primary paths even if nothing exists yet (caller probes existsSync)
	return primary;
}
502
+
503
/**
 * Resolve the path of a task's `.DONE` marker.
 *
 * Thin convenience wrapper: all path logic lives in
 * `resolveCanonicalTaskPaths`; this returns only its `donePath`.
 *
 * @param taskFolder - Task folder path (from main repo)
 * @param worktreePath - Path to the lane worktree
 * @param repoRoot - Path to the main repository root
 * @param isWorkspaceMode - Forwarded unchanged to `resolveCanonicalTaskPaths`
 * @returns The `.DONE` path chosen by `resolveCanonicalTaskPaths`
 */
export function resolveTaskDonePath(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): string {
	const { donePath } = resolveCanonicalTaskPaths(
		taskFolder,
		worktreePath,
		repoRoot,
		isWorkspaceMode,
	);
	return donePath;
}
523
+
524
/*
 * Removed in TP-120 while decommissioning the legacy session backend.
 *
 * `pollUntilTaskComplete` remains as a test-compatibility stub only: it
 * ignores every argument and always resolves to a fixed "failed" result.
 * Runtime V2 completion detection now lives in lane-runner + agent-host.
 */
// pollUntilTaskComplete function body removed — was ~170 lines of legacy .DONE polling.
// @ts-ignore — export kept as stub for test compatibility
export async function pollUntilTaskComplete(
	_lane: AllocatedLane,
	_task: AllocatedTask,
	_config: OrchestratorConfig,
	_repoRoot: string,
	_pauseSignal: { paused: boolean },
	_isWorkspaceMode?: boolean,
): Promise<{ status: LaneTaskStatus; exitReason: string; doneFileFound: boolean }> {
	const stubOutcome: { status: LaneTaskStatus; exitReason: string; doneFileFound: boolean } = {
		status: "failed",
		exitReason: "Legacy pollUntilTaskComplete removed — use V2 lane-runner",
		doneFileFound: false,
	};
	return stubOutcome;
}
546
+
547
+ // ── Post-Task Commit ─────────────────────────────────────────────────
548
+
549
/**
 * Commit any uncommitted task artifacts to the lane branch after task completion.
 *
 * Per the original note, `.DONE` and `STATUS.md` are written to the worktree
 * without being committed; unless they are committed here they are lost when
 * the worktree is reset or removed and never reach the merge into the base
 * branch.
 *
 * Steps: `git status --porcelain` (return early when clean), `git add -A`
 * (stages every change in the worktree, not only the two artifact files),
 * then `git commit` with a checkpoint message carrying the task ID.
 *
 * Best-effort: every git failure — including a failing status check — is
 * logged via `execLog` and otherwise ignored; the task is never failed from
 * here. A "nothing to commit" outcome from `git commit` is treated as success
 * and not logged. Git reports that message on stdout, so both output streams
 * are inspected.
 *
 * @param lane - Allocated lane containing the worktree path
 * @param task - The task that just completed
 * @param laneId - Lane identifier for logging
 */
function commitTaskArtifacts(lane: AllocatedLane, task: AllocatedTask, laneId: string): void {
	const worktreePath = lane.worktreePath;

	// Check if there are any uncommitted changes in the worktree
	const statusResult = runGit(["status", "--porcelain"], worktreePath);
	if (!statusResult.ok) {
		execLog(
			laneId,
			task.taskId,
			`post-task status check failed (non-fatal): ${statusResult.stderr.slice(0, 200)}`,
		);
		return;
	}
	if (!statusResult.stdout.trim()) {
		// Nothing to commit (worker already committed everything)
		return;
	}

	// Stage all changes in the worktree
	const addResult = runGit(["add", "-A"], worktreePath);
	if (!addResult.ok) {
		execLog(
			laneId,
			task.taskId,
			`post-task stage failed (non-fatal): ${addResult.stderr.slice(0, 200)}`,
		);
		return;
	}

	// Commit with task ID for traceability
	const commitResult = runGit(
		["commit", "-m", `checkpoint: ${task.taskId} task artifacts (.DONE, STATUS.md)`],
		worktreePath,
	);
	if (!commitResult.ok) {
		// "nothing to commit" is not an error — worker may have already committed.
		const combinedOutput = `${commitResult.stdout}\n${commitResult.stderr}`;
		if (!combinedOutput.includes("nothing to commit")) {
			const detail = (commitResult.stderr.trim() || commitResult.stdout.trim()).slice(0, 200);
			execLog(laneId, task.taskId, `post-task commit failed (non-fatal): ${detail}`);
		}
		return;
	}

	execLog(laneId, task.taskId, `committed task artifacts to lane branch`, {
		commit: commitResult.stdout.trim().split("\n")[0],
	});
}
605
+
606
+ // ── STATUS.md Parsing for Worktree ───────────────────────────────────
607
+
608
/**
 * Result of parsing one STATUS.md file found in a lane worktree.
 *
 * The shape mirrors what task-runner's parseStatusMd extracts (same regex
 * patterns), but it is declared here for the monitoring code rather than
 * imported from the task-runner.
 */
export interface ParsedWorktreeStatus {
	/** One entry per `### Step N: <name>` heading, in the order found in the file */
	steps: Array<{
		/** Step number taken from the heading */
		number: number;
		/** Step title taken from the heading (trimmed) */
		name: string;
		/** Step state derived from the step's `**Status:**` line */
		status: "not-started" | "in-progress" | "complete";
		/** Count of checked (`[x]` / `[X]`) checkboxes under the step */
		totalChecked: number;
		/** Count of all checkboxes under the step */
		totalItems: number;
	}>;
	/** Value of the `**Review Counter:**` line (0 when absent) */
	reviewCounter: number;
	/** Value of the `**Iteration:**` line (0 when absent) */
	iteration: number;
	/** Modification time of the STATUS.md file, in epoch milliseconds */
	mtime: number;
}
630
+
631
/**
 * Pure STATUS.md parser shared by the sync and async readers.
 *
 * Uses the same regex patterns as task-runner's parseStatusMd:
 * - `**Review Counter:** N` and `**Iteration:** N` are read from any line
 *   (the last occurrence wins; both default to 0).
 * - `### Step N: name` opens a new step.
 * - Inside a step, a `**Status:**` line sets the step status and every
 *   `- [ ]` / `- [x]` line counts as one checkbox item.
 *
 * @param content - Raw file content (CRLF or LF line endings)
 * @param mtime - File modification time (epoch ms), passed through unchanged
 * @returns The parsed status; never throws
 */
function parseStatusMdText(content: string, mtime: number): ParsedWorktreeStatus {
	const reviewCounterRe = /\*\*Review Counter:\*\*\s*(\d+)/;
	const iterationRe = /\*\*Iteration:\*\*\s*(\d+)/;
	const stepHeadingRe = /^###\s+Step\s+(\d+):\s*(.+)/;
	const stepStatusRe = /\*\*Status:\*\*\s*(.*)/;
	const checkboxRe = /^\s*-\s*\[([ xX])\]\s*(.*)/;

	const steps: ParsedWorktreeStatus["steps"] = [];
	let openStep: ParsedWorktreeStatus["steps"][number] | null = null;
	let reviewCounter = 0;
	let iteration = 0;

	for (const line of content.replace(/\r\n/g, "\n").split("\n")) {
		const reviewMatch = reviewCounterRe.exec(line);
		if (reviewMatch) reviewCounter = Number.parseInt(reviewMatch[1], 10);
		const iterationMatch = iterationRe.exec(line);
		if (iterationMatch) iteration = Number.parseInt(iterationMatch[1], 10);

		const headingMatch = stepHeadingRe.exec(line);
		if (headingMatch) {
			// The step object is pushed immediately and updated in place as its
			// status line and checkboxes are encountered.
			openStep = {
				number: Number.parseInt(headingMatch[1], 10),
				name: headingMatch[2].trim(),
				status: "not-started",
				totalChecked: 0,
				totalItems: 0,
			};
			steps.push(openStep);
			continue;
		}
		if (!openStep) continue;

		const statusMatch = stepStatusRe.exec(line);
		if (statusMatch) {
			const statusText = statusMatch[1];
			const lowered = statusText.toLowerCase();
			// NOTE(review): substring match means "incomplete" also counts as
			// complete; kept as-is to stay in sync with task-runner's parser.
			if (statusText.includes("✅") || lowered.includes("complete")) {
				openStep.status = "complete";
			} else if (
				statusText.includes("🟨") ||
				statusText.includes("🟡") ||
				lowered.includes("progress")
			) {
				openStep.status = "in-progress";
			}
		}

		const checkboxMatch = checkboxRe.exec(line);
		if (checkboxMatch) {
			openStep.totalItems += 1;
			if (checkboxMatch[1].toLowerCase() === "x") openStep.totalChecked += 1;
		}
	}

	return { steps, reviewCounter, iteration, mtime };
}

/**
 * Convert a failed STATUS.md read into the `{ parsed: null, error }` result.
 *
 * A missing file (ENOENT / ENOTDIR) is reported as "not found"; any other
 * failure is reported as "Cannot read" with the underlying message.
 */
function describeStatusReadFailure(
	statusPath: string,
	err: unknown,
): { parsed: null; error: string } {
	const code =
		typeof err === "object" && err !== null && "code" in err
			? (err as { code?: unknown }).code
			: undefined;
	if (code === "ENOENT" || code === "ENOTDIR") {
		return { parsed: null, error: `STATUS.md not found at ${statusPath}` };
	}
	return {
		parsed: null,
		error: `Cannot read STATUS.md: ${err instanceof Error ? err.message : String(err)}`,
	};
}

/**
 * Parse STATUS.md from a task folder inside a worktree (synchronous).
 *
 * The file is read in a single attempt — there is no separate existence
 * check, so a file that disappears mid-call is still reported as "not found".
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Forwarded unchanged to resolveCanonicalTaskPaths
 * @returns Parsed status, or `parsed: null` with a reason if unreadable
 */
export function parseWorktreeStatusMd(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): { parsed: ParsedWorktreeStatus | null; error: string | null } {
	// Use canonical resolver for consistent path translation
	const { statusPath } = resolveCanonicalTaskPaths(
		taskFolder,
		worktreePath,
		repoRoot,
		isWorkspaceMode,
	);

	let content: string;
	let mtime: number;
	try {
		content = readFileSync(statusPath, "utf-8");
		mtime = statSync(statusPath).mtimeMs;
	} catch (err: unknown) {
		return describeStatusReadFailure(statusPath, err);
	}

	return { parsed: parseStatusMdText(content, mtime), error: null };
}

/**
 * Parse STATUS.md directly from a known absolute path.
 * Unlike parseWorktreeStatusMdAsync, this does NOT re-resolve the path —
 * it reads exactly the file you point it to. Use this when the caller
 * already has the authoritative statusPath (e.g., from buildExecutionUnit).
 *
 * @param statusPath - Absolute path of the STATUS.md file to read
 * @returns Parsed status, or `parsed: null` with a reason if unreadable
 * @since TP-501
 */
export async function parseStatusMdAtPath(
	statusPath: string,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	return parseStatusMdContent(statusPath);
}

/**
 * Async version of parseWorktreeStatusMd: resolves the STATUS.md path from
 * taskFolder + worktree context, then reads and parses it without blocking
 * the event loop. Use parseStatusMdAtPath instead when the caller already
 * has the authoritative path.
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Forwarded unchanged to resolveCanonicalTaskPaths
 * @returns Parsed status, or `parsed: null` with a reason if unreadable
 * @since TP-070
 */
export async function parseWorktreeStatusMdAsync(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	const { statusPath } = resolveCanonicalTaskPaths(
		taskFolder,
		worktreePath,
		repoRoot,
		isWorkspaceMode,
	);
	return parseStatusMdContent(statusPath);
}

/**
 * Shared async STATUS.md reader — reads and parses from a known path.
 * A missing file is reported through the `error` field, never thrown.
 */
async function parseStatusMdContent(
	statusPath: string,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	let content: string;
	let mtime: number;
	try {
		content = await fsReadFile(statusPath, "utf-8");
		mtime = (await fsStat(statusPath)).mtimeMs;
	} catch (err: unknown) {
		return describeStatusReadFailure(statusPath, err);
	}

	return { parsed: parseStatusMdText(content, mtime), error: null };
}
866
+
867
+ // ── State Resolution ─────────────────────────────────────────────────
868
+
869
/**
 * Combine liveness, `.DONE`, and STATUS.md signals into one task snapshot.
 *
 * Resolution order (first match wins):
 * 1. `.DONE` exists and is authoritative → "succeeded"
 * 2. Session alive, STATUS.md seen, mtime unchanged for >= stallTimeoutMs
 *    → agent is killed and the task is "stalled"
 * 3. Session not alive → "failed"
 * 4. Otherwise (session alive) → "running"
 *
 * Side effects: updates `tracker` (first-sighting flag, last mtime, stall
 * timer), writes monitor log entries, and kills the lane agents when a
 * stall is declared.
 *
 * @param taskId - Task identifier
 * @param donePath - Absolute path to the task's `.DONE` file
 * @param sessionName - Lane session name used for liveness probes and kills
 * @param statusResult - Result of parsing STATUS.md (`parsed` may be null)
 * @param tracker - Per-task mtime tracker used for stall detection (mutated)
 * @param stallTimeoutMs - Stall timeout in milliseconds
 * @param now - Current timestamp (epoch ms), injected for deterministic tests
 * @param runtimeBackend - When "v2" (together with `v2Context`), liveness is
 *                         derived from the lane snapshot file; otherwise the
 *                         agent registry probe is used directly
 * @param v2Context - Location of the lane snapshot (state root, batch, lane)
 * @param multiSegmentContext - Optional segment-authority context (TP-196 / #462).
 *                              When `isFinalSegment === false`, an existing
 *                              `.DONE` is logged and ignored instead of
 *                              short-circuiting the task to "succeeded".
 * @returns Snapshot describing the task's resolved monitoring state
 */
export async function resolveTaskMonitorState(
	taskId: string,
	donePath: string,
	sessionName: string,
	statusResult: { parsed: ParsedWorktreeStatus | null; error: string | null },
	tracker: MtimeTracker,
	stallTimeoutMs: number,
	now: number,
	runtimeBackend?: RuntimeBackend,
	v2Context?: { stateRoot: string; batchId: string; laneNumber: number },
	multiSegmentContext?: { isFinalSegment: boolean; segmentId: string },
): Promise<TaskMonitorSnapshot> {
	const startupGraceMs = 60_000;
	const staleSnapshotMs = 30_000;
	const observedForMs = now - tracker.firstObservedAt;
	// Registry-based liveness probe shared by every fallback path below.
	const registrySaysAlive = (): boolean =>
		isV2AgentAlive(sessionName, "v2", v2Context?.laneNumber);

	// ── Liveness (TP-115/TP-127: backend-aware) ──────────────────
	let sessionAlive: boolean;
	if (runtimeBackend !== "v2" || !v2Context) {
		// No lane snapshot available: ask the registry directly.
		sessionAlive = registrySaysAlive();
	} else {
		const laneSnap = readLaneSnapshot(
			v2Context.stateRoot,
			v2Context.batchId,
			v2Context.laneNumber,
		);
		if (laneSnap != null && laneSnap.taskId === taskId) {
			// Snapshot belongs to this task.
			// TP-159 (issue #461): ghost-worker fast-fail. If the snapshot has
			// been stale for more than half the stall timeout, startup grace has
			// elapsed, and the registry reports the agent dead, fail immediately
			// rather than waiting for a stall timer that may never start.
			if (
				laneSnap.updatedAt &&
				now - laneSnap.updatedAt > stallTimeoutMs / 2 &&
				observedForMs >= startupGraceMs &&
				!registrySaysAlive()
			) {
				execLog("monitor", taskId, "ghost worker fast-fail — dead PID + stale snapshot", {
					session: sessionName,
					snapStaleMs: now - laneSnap.updatedAt,
					trackerAgeMs: observedForMs,
					halfStallTimeoutMs: stallTimeoutMs / 2,
				});
				sessionAlive = false;
			} else {
				sessionAlive = laneSnap.status === "running";
			}
		} else {
			// Snapshot is missing, or still describes a previous task.
			const snapshotAgeMs = laneSnap?.updatedAt ? now - laneSnap.updatedAt : 0;
			if (snapshotAgeMs > staleSnapshotMs) {
				// Stale leftover snapshot: grant startup grace to a freshly
				// observed task, otherwise fall back to the registry.
				sessionAlive = observedForMs < startupGraceMs ? true : registrySaysAlive();
			} else if (laneSnap == null && observedForMs >= startupGraceMs) {
				// TP-190 (#561): no snapshot at all after the grace period —
				// consult the registry instead of assuming alive forever.
				sessionAlive = registrySaysAlive();
			} else {
				// Within grace, or the snapshot was updated recently: assume alive.
				sessionAlive = true;
			}
		}
	}

	const doneFileFound = await fileExistsAsync(donePath);

	// ── Progress figures from STATUS.md ──────────────────────────
	const parsedStatus = statusResult.parsed;
	const parseError = statusResult.error;
	const steps = parsedStatus ? parsedStatus.steps : [];
	const totalSteps = steps.length;
	const totalChecked = steps.reduce((sum, step) => sum + step.totalChecked, 0);
	const totalItems = steps.reduce((sum, step) => sum + step.totalItems, 0);
	const iteration = parsedStatus ? parsedStatus.iteration : 0;
	const reviewCounter = parsedStatus ? parsedStatus.reviewCounter : 0;

	// Current step: first in-progress, else first not-started, else the last
	// step (everything complete). Stays null when there are no steps.
	const focusStep =
		steps.find((step) => step.status === "in-progress") ??
		steps.find((step) => step.status === "not-started") ??
		(steps.length > 0 ? steps[steps.length - 1] : undefined);
	const currentStepName: string | null = focusStep ? focusStep.name : null;
	const currentStepNumber: number | null = focusStep ? focusStep.number : null;

	// ── Stall bookkeeping ────────────────────────────────────────
	if (parsedStatus) {
		const firstSighting = !tracker.statusFileSeenOnce;
		if (firstSighting || parsedStatus.mtime !== tracker.lastMtime) {
			// First read, or the file changed: record progress, clear the timer.
			tracker.statusFileSeenOnce = true;
			tracker.lastMtime = parsedStatus.mtime;
			tracker.stallTimerStart = null;
		} else if (tracker.stallTimerStart === null) {
			// Unchanged mtime: start the stall timer (if not already running).
			tracker.stallTimerStart = now;
		}
	}

	// Every return below shares these fields; only the four arguments vary.
	const snapshotFor = (
		status: TaskMonitorSnapshot["status"],
		alive: boolean,
		done: boolean,
		stallReason: string | null,
	): TaskMonitorSnapshot => ({
		taskId,
		status,
		currentStepName,
		currentStepNumber,
		totalSteps,
		totalChecked,
		totalItems,
		sessionAlive: alive,
		doneFileFound: done,
		stallReason,
		lastHeartbeat: tracker.lastMtime,
		observedAt: now,
		parseError,
		iteration,
		reviewCounter,
	});

	// ── Priority 1: authoritative .DONE → succeeded ──────────────
	// TP-196 / #462: for a non-final segment the file is not authoritative;
	// warn and continue with the lower priorities.
	if (doneFileFound) {
		const doneIgnored = multiSegmentContext?.isFinalSegment === false;
		if (!doneIgnored) {
			return snapshotFor("succeeded", sessionAlive, true, null);
		}
		execLog(
			"monitor",
			taskId,
			`WARN: .DONE present for non-final segment '${multiSegmentContext?.segmentId}' — ignoring (#462 guard)`,
			{
				session: sessionName,
				segmentId: multiSegmentContext?.segmentId,
				donePath,
			},
		);
	}

	// ── Priority 2: stall timeout reached → stalled ──────────────
	const stalledSince = tracker.stallTimerStart;
	if (
		sessionAlive &&
		tracker.statusFileSeenOnce &&
		stalledSince !== null &&
		now - stalledSince >= stallTimeoutMs
	) {
		const stallMinutes = Math.round((now - stalledSince) / 60_000);
		const stallReason = `STATUS.md unchanged for ${stallMinutes} minutes (threshold: ${Math.round(stallTimeoutMs / 60_000)} min)`;

		execLog("monitor", taskId, `stall detected — killing agent`, {
			session: sessionName,
			stallMinutes,
			backend: runtimeBackend ?? "legacy",
		});
		killV2LaneAgents(sessionName, { laneNumber: v2Context?.laneNumber });

		// sessionAlive is reported false because the agent was just killed.
		return snapshotFor("stalled", false, false, stallReason);
	}

	// ── Priority 3: session gone without .DONE → failed ──────────
	if (!sessionAlive) {
		return snapshotFor("failed", false, false, null);
	}

	// ── Priority 4-6: session alive → running ────────────────────
	return snapshotFor("running", true, false, null);
}
1163
+
1164
+ // ── Core Monitor Loop ────────────────────────────────────────────────
1165
+
1166
/**
 * Callback type for dashboard updates during monitoring.
 *
 * `monitorLanes` invokes it once per poll cycle with the freshly built
 * `MonitorState`; exceptions thrown by the callback are caught and ignored
 * there so they cannot stop the monitor loop.
 */
export type MonitorUpdateCallback = (state: MonitorState) => void;
1170
+
1171
/**
 * Monitor all lanes in a wave, polling for progress, completion, and stalls.
 *
 * This function only observes — it never spawns or attaches to lane
 * sessions. Each poll cycle it inspects, per lane, the first task that is
 * not yet terminal:
 * - STATUS.md for step/checkbox progress and (via mtime) stall detection
 * - the `.DONE` file for task completion
 * - backend liveness probes for session state
 *
 * On the v2 backend (with a batchId) each cycle also refreshes the liveness
 * registry cache and marks orphaned agents as crashed.
 *
 * The loop ends when every task of every lane is terminal
 * (succeeded/failed/stalled) or when `pauseSignal.paused` is set. On pause,
 * the returned state reports the tasks already known to be terminal; all
 * others are listed as remaining and no further liveness probes are made.
 *
 * @param lanes - Allocated lanes being executed
 * @param config - Orchestrator configuration (poll_interval in seconds,
 *                 stall_timeout in minutes; defaults 5 and 30)
 * @param repoRoot - Main repository root
 * @param pauseSignal - Shared signal for pause/abort
 * @param waveNumber - Current wave number (for display)
 * @param onUpdate - Optional callback invoked on each poll cycle
 * @param isWorkspaceMode - Forwarded unchanged to buildExecutionUnit
 * @param runtimeBackend - Runtime backend; "v2" enables registry handling
 * @param batchId - Batch identifier used for registry and lane snapshots
 * @param stateRootForRegistry - Registry state root (defaults to `repoRoot`)
 * @returns Final MonitorState snapshot when monitoring completes or pauses
 */
export async function monitorLanes(
	lanes: AllocatedLane[],
	config: OrchestratorConfig,
	repoRoot: string,
	pauseSignal: { paused: boolean },
	waveNumber: number = 1,
	onUpdate?: MonitorUpdateCallback,
	isWorkspaceMode?: boolean,
	runtimeBackend?: RuntimeBackend,
	batchId?: string,
	stateRootForRegistry?: string,
): Promise<MonitorState> {
	const pollIntervalMs = (config.monitoring.poll_interval || 5) * 1000;
	const stallTimeoutMs = (config.failure.stall_timeout || 30) * 60_000;
	const registryRoot = stateRootForRegistry ?? repoRoot;

	// One mtime tracker per taskId, so a lane advancing to its next task
	// starts with a fresh tracker.
	const mtimeTrackers = new Map<string, MtimeTracker>();

	function getOrCreateTracker(taskId: string, now: number): MtimeTracker {
		let tracker = mtimeTrackers.get(taskId);
		if (!tracker) {
			tracker = {
				taskId,
				firstObservedAt: now,
				statusFileSeenOnce: false,
				lastMtime: null,
				stallTimerStart: null,
			};
			mtimeTrackers.set(taskId, tracker);
		}
		return tracker;
	}

	// Terminal snapshots are remembered so finished tasks are never re-probed.
	const terminalTasks = new Map<string, TaskMonitorSnapshot>();

	let pollCount = 0;
	let lastMonitorStateKey = "";

	const tasksTotal = lanes.reduce((sum, lane) => sum + lane.tasks.length, 0);

	execLog(
		"monitor",
		"ALL",
		`starting monitoring for ${lanes.length} lane(s), ${tasksTotal} task(s)`,
		{
			pollIntervalMs,
			stallTimeoutMin: Math.round(stallTimeoutMs / 60_000),
		},
	);

	while (true) {
		const now = Date.now();
		pollCount++;

		if (runtimeBackend === "v2" && batchId) {
			// TP-112: Refresh the V2 liveness registry cache once per poll cycle.
			// The snapshot is read a single time and reused for the orphan scan.
			let registry: ReturnType<typeof readRegistrySnapshot> | null = null;
			try {
				registry = readRegistrySnapshot(registryRoot, batchId);
				setV2LivenessRegistryCache(registry);
			} catch {
				registry = null;
				setV2LivenessRegistryCache(null);
			}

			// TP-159: Detect and mark orphaned workers each poll cycle. A worker
			// that dies silently (OOM kill, segfault, parent crash) leaves its
			// manifest in a non-terminal status; scanning here makes the
			// registry reflect reality within one poll interval.
			if (registry) {
				try {
					const orphans = detectOrphans(registry);
					if (orphans.length > 0) {
						// markOrphansCrashed only updates per-agent manifests, so
						// the aggregate registry.json is rebuilt and rewritten,
						// and the cache is replaced with the fresh aggregate.
						markOrphansCrashed(registryRoot, batchId, orphans);
						const freshRegistry = buildRegistrySnapshot(registryRoot, batchId);
						writeRegistrySnapshot(registryRoot, freshRegistry);
						setV2LivenessRegistryCache(freshRegistry);
					}
				} catch {
					// Non-fatal — monitor loop must never throw
				}
			}
		} else {
			setV2LivenessRegistryCache(null);
		}

		// Check pause signal
		if (pauseSignal.paused) {
			execLog("monitor", "ALL", "pause signal detected — stopping monitoring");
			break;
		}

		const laneSnapshots: LaneMonitorSnapshot[] = [];
		let totalDone = 0;
		let totalFailed = 0;
		let allTerminal = true;

		for (const lane of lanes) {
			const sessionName = laneSessionIdOf(lane);
			const completedTasks: string[] = [];
			const failedTasks: string[] = [];
			const remainingTasks: string[] = [];
			let currentTaskId: string | null = null;
			let currentTaskSnapshot: TaskMonitorSnapshot | null = null;

			// Anything terminal that is not "succeeded" counts as failed.
			const recordTerminal = (taskId: string, status: TaskMonitorSnapshot["status"]): void => {
				if (status === "succeeded") {
					completedTasks.push(taskId);
					totalDone++;
				} else {
					failedTasks.push(taskId);
					totalFailed++;
				}
			};

			// Walk tasks in order; stop at the first one that is still running.
			for (const [index, task] of lane.tasks.entries()) {
				const knownTerminal = terminalTasks.get(task.taskId);
				if (knownTerminal) {
					recordTerminal(task.taskId, knownTerminal.status);
					continue;
				}

				const tracker = getOrCreateTracker(task.taskId, now);
				const unit = buildExecutionUnit(lane, task, repoRoot, isWorkspaceMode);
				const statusResult = await parseStatusMdAtPath(unit.packet.statusPath);

				// TP-196 / #462: Build multi-segment authority context so
				// `.DONE` from a non-final segment is not accepted as terminal.
				const taskSegmentIds = task.task.segmentIds ?? [];
				const taskActiveSegmentId = task.task.activeSegmentId ?? null;
				let multiSegmentContext: { isFinalSegment: boolean; segmentId: string } | undefined;
				if (taskSegmentIds.length > 1 && taskActiveSegmentId) {
					const finalSegmentId = taskSegmentIds[taskSegmentIds.length - 1];
					multiSegmentContext = {
						isFinalSegment: taskActiveSegmentId === finalSegmentId,
						segmentId: taskActiveSegmentId,
					};
				}

				const snapshot = await resolveTaskMonitorState(
					task.taskId,
					unit.packet.donePath,
					sessionName,
					statusResult,
					tracker,
					stallTimeoutMs,
					now,
					runtimeBackend,
					runtimeBackend === "v2" && batchId
						? {
								stateRoot: registryRoot,
								batchId,
								laneNumber: lane.laneNumber,
							}
						: undefined,
					multiSegmentContext,
				);

				if (
					snapshot.status === "succeeded" ||
					snapshot.status === "failed" ||
					snapshot.status === "stalled"
				) {
					// Task just became terminal — remember it and evaluate the
					// lane's next task within the same poll cycle.
					terminalTasks.set(task.taskId, snapshot);
					recordTerminal(task.taskId, snapshot.status);
					continue;
				}

				// Task is still running: it is the lane's current task and
				// everything after it is remaining.
				currentTaskId = task.taskId;
				currentTaskSnapshot = snapshot;
				allTerminal = false;
				for (const later of lane.tasks.slice(index + 1)) {
					remainingTasks.push(later.taskId);
				}
				break;
			}

			// TP-112: Backend-aware lane liveness for snapshot
			// TP-148: Pass global laneNumber for workspace-mode fallback lookup
			const sessionAlive = isV2AgentAlive(sessionName, "v2", lane.laneNumber);

			laneSnapshots.push({
				laneId: lane.laneId,
				laneNumber: lane.laneNumber,
				sessionName,
				sessionAlive,
				currentTaskId,
				currentTaskSnapshot,
				completedTasks,
				failedTasks,
				remainingTasks,
			});
		}

		const monitorState: MonitorState = {
			lanes: laneSnapshots,
			tasksDone: totalDone,
			tasksFailed: totalFailed,
			tasksTotal,
			waveNumber,
			pollCount,
			lastPollTime: now,
			allTerminal,
		};

		// Invoke the dashboard update callback
		if (onUpdate) {
			try {
				onUpdate(monitorState);
			} catch {
				// Don't let callback errors kill the monitor loop
			}
		}

		// Log a summary only when the done/failed counts change — not every poll
		const currentStateKey = `${totalDone}/${totalFailed}`;
		if (currentStateKey !== lastMonitorStateKey) {
			const activeLanes = laneSnapshots.filter((l) => l.currentTaskId !== null);
			execLog(
				"monitor",
				"ALL",
				`poll #${pollCount}: ${totalDone}/${tasksTotal} done, ${totalFailed} failed, ${activeLanes.length} active lane(s)`,
			);
			lastMonitorStateKey = currentStateKey;
		}

		// Exit conditions
		if (allTerminal) {
			execLog("monitor", "ALL", `all lanes terminal — monitoring complete`, {
				done: totalDone,
				failed: totalFailed,
				total: tasksTotal,
				polls: pollCount,
			});
			setV2LivenessRegistryCache(null);
			return monitorState;
		}

		// Wait for next poll cycle
		await new Promise<void>((resolve) => setTimeout(resolve, pollIntervalMs));
	}

	// Reached here due to pause signal. Report what is already known from
	// terminalTasks instead of discarding it; nothing is probed again.
	const pausedAt = Date.now();
	let pausedDone = 0;
	let pausedFailed = 0;
	const pausedLanes: LaneMonitorSnapshot[] = lanes.map((lane) => {
		const completedTasks: string[] = [];
		const failedTasks: string[] = [];
		const remainingTasks: string[] = [];
		for (const task of lane.tasks) {
			const knownTerminal = terminalTasks.get(task.taskId);
			if (!knownTerminal) {
				remainingTasks.push(task.taskId);
			} else if (knownTerminal.status === "succeeded") {
				completedTasks.push(task.taskId);
				pausedDone++;
			} else {
				failedTasks.push(task.taskId);
				pausedFailed++;
			}
		}
		return {
			laneId: lane.laneId,
			laneNumber: lane.laneNumber,
			sessionName: laneSessionIdOf(lane),
			sessionAlive: false, // Best-effort during pause — don't block on extra liveness probes
			currentTaskId: null,
			currentTaskSnapshot: null,
			completedTasks,
			failedTasks,
			remainingTasks,
		};
	});

	setV2LivenessRegistryCache(null);
	return {
		lanes: pausedLanes,
		tasksDone: pausedDone,
		tasksFailed: pausedFailed,
		tasksTotal,
		waveNumber,
		pollCount,
		lastPollTime: pausedAt,
		allTerminal: false,
	};
}
1509
+
1510
+ // ── Transitive Dependent Computation ─────────────────────────────────
1511
+
1512
/**
 * Collect every task that is transitively blocked by a set of failed tasks.
 *
 * Walks the dependency graph breadth-first along its `dependents` edges
 * (task → tasks that depend on it), starting from the failed tasks.
 *
 * Example: A failed, B depends on A, C depends on B → result is {B, C}.
 *
 * The failed tasks are never part of the result, even when one failed task
 * depends on another; only downstream, non-failed tasks are reported.
 *
 * @param failedTaskIds - Set of task IDs that failed
 * @param dependencyGraph - Dependency graph exposing a `dependents` map
 * @returns Set of transitively blocked task IDs (failed tasks excluded),
 *          in breadth-first discovery order
 */
export function computeTransitiveDependents(
  failedTaskIds: Set<string>,
  dependencyGraph: DependencyGraph,
): Set<string> {
  const reached = new Set<string>();
  // The frontier doubles as the BFS queue: entries before `cursor` are done.
  const frontier: string[] = Array.from(failedTaskIds);

  for (let cursor = 0; cursor < frontier.length; cursor++) {
    const taskId = frontier[cursor];
    const direct = dependencyGraph.dependents.get(taskId) || [];

    // Visit dependents alphabetically so the discovery order is deterministic.
    const ordered = Array.from(direct).sort();

    ordered.forEach((candidate) => {
      const alreadyHandled = reached.has(candidate) || failedTaskIds.has(candidate);
      if (!alreadyHandled) {
        reached.add(candidate);
        frontier.push(candidate); // expand later for the transitive closure
      }
    });
  }

  return reached;
}
1552
+
1553
+ // ── Pre-flight: Commit Untracked Task Files ─────────────────────────
1554
+
1555
/**
 * Ensure all task files for a wave are committed to git before worktree creation.
 *
 * Git worktrees only contain tracked (committed) files. If a user creates
 * task folders (PROMPT.md, STATUS.md) but doesn't commit them, the worktree
 * won't have those files and the worker will fail with "file not found".
 *
 * This function checks each wave task's folder for untracked or modified files
 * (`git status --porcelain`) and commits them. It must run BEFORE lane
 * allocation so that worktrees include the task files.
 *
 * Two strategies are used:
 *  1. When `orchBranch` is given, the task folders are committed directly onto
 *     that branch through a temporary index file (TP-169), leaving the
 *     checked-out branch and the real index untouched.
 *  2. Otherwise — or if any step of (1) fails — the folders are staged and
 *     committed on HEAD, and the orch branch (if any) is fast-forwarded or
 *     merged so it contains the staging commit.
 *
 * Only the task folders are staged AND committed: the legacy commit is limited
 * to those paths, so changes the user had already staged elsewhere are not
 * swept into the orchestrator's commit.
 *
 * @param waveTasks - Task IDs in this wave
 * @param pending - Full pending task map from discovery
 * @param repoRoot - Main repository root
 * @param waveIndex - Wave number, used in log scopes and the commit message
 * @param orchBranch - Optional orch branch that should receive the task files
 * @throws ExecutionError `EXEC_TASK_STAGE_FAILED` when `git add` fails and
 *         `EXEC_TASK_COMMIT_FAILED` when `git commit` fails on the HEAD path.
 *         Failures while updating the orch branch are logged, never thrown.
 */
export function ensureTaskFilesCommitted(
  waveTasks: string[],
  pending: Map<string, ParsedTask>,
  repoRoot: string,
  waveIndex: number,
  orchBranch?: string,
): void {
  const waveTag = `W${waveIndex}`;

  // Collect task folder paths (relative to the repo, forward slashes) for this wave
  const foldersToCheck: { taskId: string; relPath: string }[] = [];
  for (const taskId of waveTasks) {
    const task = pending.get(taskId);
    if (!task) continue;

    const absFolder = resolve(task.taskFolder);
    const relPath = relative(resolve(repoRoot), absFolder).replace(/\\/g, "/");

    // Skip if the path escapes the repo (shouldn't happen in normal use).
    // Match the ".." segment exactly so an in-repo folder whose name merely
    // begins with ".." is not mistaken for a parent traversal.
    if (relPath === ".." || relPath.startsWith("../")) {
      continue;
    }
    foldersToCheck.push({ taskId, relPath });
  }

  if (foldersToCheck.length === 0) return;

  // Check which folders have untracked or uncommitted files
  const foldersToStage: string[] = [];
  for (const { taskId, relPath } of foldersToCheck) {
    const status = runGit(["status", "--porcelain", "--", relPath], repoRoot);
    if (status.ok && status.stdout.trim()) {
      execLog("wave", waveTag, `task ${taskId} has uncommitted files, staging`, {
        folder: relPath,
        status: status.stdout.trim().split("\n").slice(0, 5).join("; "),
      });
      foldersToStage.push(relPath);
    }
  }

  if (foldersToStage.length === 0) return;

  // The last path segment of each folder is used as its label in the message.
  const taskIds = foldersToStage.map((f) => f.split("/").pop() || f).join(", ");
  const commitMsg = `chore: stage task files for orchestrator wave ${waveIndex} (${taskIds})`;

  // TP-169: prefer committing straight onto the orch branch so the repo's
  // current branch (e.g. main) is not polluted with orchestrator-internal
  // staging commits. Any failure falls through to the legacy path below.
  if (orchBranch && commitTaskFoldersOnOrchBranch(foldersToStage, repoRoot, waveIndex, orchBranch, commitMsg)) {
    return;
  }

  // Legacy fallback: commit on HEAD and sync orch branch.
  // Stage only the task folders
  for (const folder of foldersToStage) {
    const addResult = runGit(["add", "--", folder], repoRoot);
    if (!addResult.ok) {
      execLog("wave", waveTag, `failed to stage task files: ${addResult.stderr}`, { folder });
      throw new ExecutionError(
        "EXEC_TASK_STAGE_FAILED",
        `Failed to stage task files in "${folder}": ${addResult.stderr}`,
        "wave",
        folder,
      );
    }
  }

  // Commit only the task folders. Passing the pathspec makes git ignore
  // anything else that happens to be staged in the user's index.
  const commitResult = runGit(["commit", "-m", commitMsg, "--", ...foldersToStage], repoRoot);
  if (!commitResult.ok) {
    execLog("wave", waveTag, `failed to commit task files: ${commitResult.stderr}`);
    throw new ExecutionError(
      "EXEC_TASK_COMMIT_FAILED",
      `Failed to commit task files for wave ${waveIndex}: ${commitResult.stderr}`,
      "wave",
      waveTag,
    );
  }

  execLog("wave", waveTag, `committed ${foldersToStage.length} task folder(s) to ensure worktree visibility`, {
    folders: foldersToStage,
    commit: commitResult.stdout.trim().split("\n")[0],
  });

  // Bring the orch branch up to date so that worktrees — which branch from
  // orchBranch — see the new task files.
  if (orchBranch) {
    syncOrchBranchWithStagingCommit(repoRoot, waveIndex, orchBranch);
  }
}

/**
 * Commit the given folders directly onto `orchBranch` using git plumbing and a
 * temporary index file, without touching HEAD or the real index.
 *
 * Steps: read the branch tip's tree into a temp index → `git add` the folders
 * into that index → `write-tree` → `commit-tree` with the tip as parent →
 * compare-and-swap `update-ref`. The temp index is always removed.
 *
 * @returns `true` when the branch now points at the new commit; `false` when
 *          any step failed (already logged) and the caller should fall back.
 */
function commitTaskFoldersOnOrchBranch(
  foldersToStage: string[],
  repoRoot: string,
  waveIndex: number,
  orchBranch: string,
  commitMsg: string,
): boolean {
  const waveTag = `W${waveIndex}`;

  const orchTipRes = runGit(["rev-parse", `refs/heads/${orchBranch}`], repoRoot);
  if (!orchTipRes.ok) return false;

  const orchTip = orchTipRes.stdout.trim();
  const tmpIdx = join(repoRoot, ".git", `tmp-staging-idx-wave-${waveIndex}`);
  const indexEnv = { GIT_INDEX_FILE: tmpIdx };

  try {
    // Read orch branch tree into temporary index
    const readTreeRes = runGitWithEnv(["read-tree", orchTip], repoRoot, indexEnv);
    if (!readTreeRes.ok) {
      execLog("wave", waveTag, `orch branch staging: read-tree failed, falling back to HEAD commit`, {
        error: readTreeRes.stderr,
      });
      return false;
    }

    // Add task files to temporary index
    for (const folder of foldersToStage) {
      const addRes = runGitWithEnv(["add", "--", folder], repoRoot, indexEnv);
      if (!addRes.ok) {
        execLog("wave", waveTag, `orch branch staging: git add failed for ${folder}, falling back`, {
          error: addRes.stderr,
        });
        return false;
      }
    }

    // Write tree from temporary index
    const writeTreeRes = runGitWithEnv(["write-tree"], repoRoot, indexEnv);
    if (!writeTreeRes.ok) {
      execLog("wave", waveTag, `orch branch staging: write-tree failed, falling back`, {
        error: writeTreeRes.stderr,
      });
      return false;
    }
    const tree = writeTreeRes.stdout.trim();

    // Create commit directly on orch branch
    const commitTreeRes = runGit(["commit-tree", tree, "-p", orchTip, "-m", commitMsg], repoRoot);
    if (!commitTreeRes.ok) {
      execLog("wave", waveTag, `orch branch staging: commit-tree failed, falling back`, {
        error: commitTreeRes.stderr,
      });
      return false;
    }
    const newCommit = commitTreeRes.stdout.trim();

    // Passing the old tip makes update-ref refuse if the branch moved meanwhile.
    const refUpdateRes = runGit(["update-ref", `refs/heads/${orchBranch}`, newCommit, orchTip], repoRoot);
    if (!refUpdateRes.ok) {
      execLog("wave", waveTag, `orch branch staging: ref update failed, falling back`, {
        error: refUpdateRes.stderr,
      });
      return false;
    }

    execLog("wave", waveTag, `committed ${foldersToStage.length} task folder(s) directly on orch branch`, {
      orchBranch,
      folders: foldersToStage,
      from: orchTip.slice(0, 8),
      to: newCommit.slice(0, 8),
    });
    return true;
  } catch (err: unknown) {
    execLog("wave", waveTag, `orch branch staging: unexpected error, falling back to HEAD commit`, {
      error: err instanceof Error ? err.message : String(err),
    });
    return false;
  } finally {
    // Always clean up temp index (runs for every return above as well)
    try {
      unlinkSync(tmpIdx);
    } catch {
      /* best effort */
    }
  }
}

/**
 * Make `orchBranch` contain the staging commit that was just created on HEAD.
 *
 * Fast-forwards the branch when its tip is an ancestor of HEAD; otherwise
 * builds a merge commit (parents: orch tip, HEAD) from `git merge-tree
 * --write-tree` and moves the branch to it. Every failure is logged as a
 * non-fatal warning; this function never throws.
 */
function syncOrchBranchWithStagingCommit(repoRoot: string, waveIndex: number, orchBranch: string): void {
  const waveTag = `W${waveIndex}`;
  const orchRef = `refs/heads/${orchBranch}`;

  try {
    const headRes = runGit(["rev-parse", "HEAD"], repoRoot);
    const orchTipRes = runGit(["rev-parse", orchRef], repoRoot);

    if (!headRes.ok || !orchTipRes.ok) {
      execLog("wave", waveTag, `warning: could not resolve HEAD/orch branch for sync (non-fatal)`, {
        orchBranch,
        error: headRes.ok ? orchTipRes.stderr : headRes.stderr,
      });
      return;
    }

    const newHead = headRes.stdout.trim();
    const orchTip = orchTipRes.stdout.trim();

    const ancestorCheck = runGit(["merge-base", "--is-ancestor", orchTip, newHead], repoRoot);

    if (ancestorCheck.ok) {
      const ffResult = runGit(["update-ref", orchRef, newHead, orchTip], repoRoot);
      if (ffResult.ok) {
        execLog("wave", waveTag, `fast-forwarded orch branch to include staging commit`, {
          orchBranch,
          from: orchTip.slice(0, 8),
          to: newHead.slice(0, 8),
        });
      } else {
        execLog("wave", waveTag, `warning: failed to fast-forward orch branch (non-fatal)`, {
          orchBranch,
          error: ffResult.stderr,
        });
      }
      return;
    }

    // Histories diverged: merge without a working tree.
    const mergeTreeRes = runGit(["merge-tree", "--write-tree", orchTip, newHead], repoRoot);
    if (!mergeTreeRes.ok) {
      execLog("wave", waveTag, `warning: merge-tree failed while syncing orch branch (non-fatal)`, {
        orchBranch,
        error: mergeTreeRes.stderr,
      });
      return;
    }

    // First output line is the merged tree OID (40 hex for SHA-1, 64 for SHA-256).
    const mergedTree = mergeTreeRes.stdout.trim().split("\n")[0];
    if (!/^(?:[0-9a-f]{40}|[0-9a-f]{64})$/i.test(mergedTree)) {
      execLog("wave", waveTag, `warning: merge-tree returned an unexpected tree id (non-fatal)`, {
        orchBranch,
        output: mergedTree,
      });
      return;
    }

    const mergeMsg = `merge: include staged task files for wave ${waveIndex} into orch branch`;
    const commitTreeRes = runGit(
      ["commit-tree", mergedTree, "-p", orchTip, "-p", newHead, "-m", mergeMsg],
      repoRoot,
    );
    if (!commitTreeRes.ok) {
      execLog("wave", waveTag, `warning: commit-tree failed while syncing orch branch (non-fatal)`, {
        orchBranch,
        error: commitTreeRes.stderr,
      });
      return;
    }

    const mergeCommitSha = commitTreeRes.stdout.trim();
    const refUpdateRes = runGit(["update-ref", orchRef, mergeCommitSha, orchTip], repoRoot);
    if (refUpdateRes.ok) {
      execLog("wave", waveTag, `merged staging commit into orch branch (non-FF wave)`, {
        orchBranch,
        mergeCommit: mergeCommitSha.slice(0, 8),
      });
    } else {
      execLog("wave", waveTag, `warning: failed to move orch branch to merge commit (non-fatal)`, {
        orchBranch,
        error: refUpdateRes.stderr,
      });
    }
  } catch (refErr: unknown) {
    execLog("wave", waveTag, `warning: orch branch ref update threw unexpectedly (non-fatal)`, {
      orchBranch,
      error: refErr instanceof Error ? refErr.message : String(refErr),
    });
  }
}
1857
+
1858
+ // ── Wave Execution Core ──────────────────────────────────────────────
1859
+
1860
/**
 * Runtime backend selector for lane execution.
 *
 * - `"legacy"`: Session-backed path (spawnLaneSession, deprecated)
 * - `"v2"`: Direct-child path (lane-runner → agent-host → pi --mode rpc)
 *
 * @since TP-105
 */
export type RuntimeBackend = "legacy" | "v2";

/**
 * Execute a single wave: allocate lanes, run tasks in parallel, monitor, apply failure policy.
 *
 * Orchestration flow:
 * 0. Commit any uncommitted task files (ensureTaskFilesCommitted)
 * 1. Allocate lanes via allocateLanes() (worktree creation + task assignment)
 * 2. Start all lanes in parallel via executeLaneV2()
 * 3. Start monitoring as a sibling async loop (monitorLanes)
 * 4. Wait for all lanes to settle and apply the failure policy
 * 5. Build and return WaveExecutionResult
 *
 * If step 0 or step 1 fails, no lane is started and every task of the wave is
 * reported as failed (with their transitive dependents as blocked).
 *
 * Failure policy behavior as implemented here:
 * - **skip-dependents**: The wave runs to completion. Transitive dependents of
 *   failed/stalled tasks are returned in `blockedTaskIds` for future wave pruning.
 * - **stop-wave**: After all lanes have settled, `pauseSignal.paused` is set if
 *   any task failed or stalled, and `stoppedEarly` is true.
 *   NOTE(review): whether remaining tasks inside a lane are skipped earlier
 *   depends on executeLaneV2, which is not visible here.
 * - **stop-all**: Delegated to executeWithStopAll(), which pauses and kills all
 *   lane agents on the first failing lane. Overall status is `"aborted"`.
 *
 * @param waveTasks - Task IDs in this wave
 * @param waveIndex - Wave number (1-indexed)
 * @param pending - Full pending task map from discovery
 * @param config - Orchestrator configuration (`failure.on_task_failure` selects the policy)
 * @param repoRoot - Main repository root
 * @param batchId - Batch ID for naming
 * @param pauseSignal - Shared pause signal; the same object is handed to the lanes
 *                      and the monitor, and is mutated by stop-wave / stop-all
 * @param dependencyGraph - Dependency graph for computing transitive dependents
 * @param orchBranch - Orch branch to base worktrees on (and to update after staging commits)
 * @param onMonitorUpdate - Optional callback for dashboard updates during monitoring
 * @param onLanesAllocated - Optional callback fired after lane allocation succeeds
 * @param workspaceConfig - Workspace configuration for repo routing (null/undefined = repo mode)
 * @param runtimeBackend - Requested backend; anything other than `"v2"` is logged and ignored
 * @param onSupervisorAlert - Optional supervisor alert callback, forwarded to each lane
 * @param supervisorAutonomy - Exported to lanes as `TASKPLANE_SUPERVISOR_AUTONOMY`
 * @param reviewerConfig - Reviewer settings, turned into lane env vars by buildReviewerEnv()
 * @param workerConfig - Worker settings, turned into lane env vars by buildWorkerEnv()
 * @param workerExcludeExtensions - Turned into lane env vars by buildWorkerExcludeEnv()
 * @param onLaneTerminated - Optional callback forwarded to each lane
 * @param onLaneRespawned - Optional callback forwarded to each lane
 * @returns WaveExecutionResult with outcomes and blocked task IDs
 */
export async function executeWave(
  waveTasks: string[],
  waveIndex: number,
  pending: Map<string, ParsedTask>,
  config: OrchestratorConfig,
  repoRoot: string,
  batchId: string,
  pauseSignal: { paused: boolean },
  dependencyGraph: DependencyGraph,
  orchBranch: string,
  onMonitorUpdate?: MonitorUpdateCallback,
  onLanesAllocated?: (lanes: AllocatedLane[]) => void,
  workspaceConfig?: WorkspaceConfig | null,
  runtimeBackend?: RuntimeBackend,
  onSupervisorAlert?: SupervisorAlertCallback,
  supervisorAutonomy: "interactive" | "supervised" | "autonomous" = "autonomous",
  reviewerConfig?: {
    model?: string;
    thinking?: string;
    tools?: string;
    excludeExtensions?: string[];
  },
  workerConfig?: {
    model?: string;
    thinking?: string;
    tools?: string;
    excludeExtensions?: string[];
  } | null,
  workerExcludeExtensions?: string[],
  onLaneTerminated?: import("./types.ts").LaneTerminatedCallback,
  onLaneRespawned?: (laneNumber: number, agentId: string, batchId: string) => void,
): Promise<WaveExecutionResult> {
  const startedAt = Date.now();
  const policy = config.failure.on_task_failure;
  const waveTag = `W${waveIndex}`;

  execLog("wave", waveTag, `starting wave execution`, {
    tasks: waveTasks.length,
    policy,
    batchId,
  });

  // Result used when the wave cannot even start: every task counts as failed.
  // Arrays are copied and sorted so the caller's `waveTasks` is never aliased
  // and the output is deterministic, like the normal path below.
  const failEntireWave = (): WaveExecutionResult => ({
    waveIndex,
    startedAt,
    endedAt: Date.now(),
    laneResults: [],
    policyApplied: policy,
    stoppedEarly: true,
    failedTaskIds: [...waveTasks].sort(),
    skippedTaskIds: [],
    succeededTaskIds: [],
    blockedTaskIds: [...computeTransitiveDependents(new Set(waveTasks), dependencyGraph)].sort(),
    laneCount: 0,
    overallStatus: "failed",
    finalMonitorState: null,
    allocatedLanes: [],
  });

  // ── Stage 0: Ensure task files are committed ────────────────
  // Task folders may contain untracked files (PROMPT.md, STATUS.md) that
  // won't appear in worktrees unless committed. Stage and commit them now,
  // before worktree creation, so workers can find their task files.
  // Pass orchBranch so the staging commit is reflected in the orch branch
  // before worktrees are allocated from it.
  try {
    ensureTaskFilesCommitted(waveTasks, pending, repoRoot, waveIndex, orchBranch);
  } catch (err: unknown) {
    const errMsg = err instanceof Error ? err.message : String(err);
    execLog("wave", waveTag, `task file commit failed: ${errMsg}`);
    return failEntireWave();
  }

  // ── Stage 1: Allocate lanes ──────────────────────────────────
  const allocResult = allocateLanes(waveTasks, pending, config, repoRoot, batchId, orchBranch, workspaceConfig);

  if (!allocResult.success) {
    const errMsg = allocResult.error?.message || "Unknown allocation failure";
    execLog("wave", waveTag, `lane allocation failed: ${errMsg}`);
    return { ...failEntireWave(), allocationError: allocResult.error };
  }

  const lanes = allocResult.lanes;
  onLanesAllocated?.(lanes);

  execLog("wave", waveTag, `lanes allocated`, {
    laneCount: lanes.length,
    totalTasks: waveTasks.length,
  });

  // ── Stage 2+3: Start lanes in parallel + monitoring ──────────
  // NOTE: this is an alias of the caller's pauseSignal, not a copy. Setting
  // `.paused` below (stop-wave / stop-all) is therefore visible to the caller
  // as well as to the lanes and the monitor.
  const wavePauseSignal = pauseSignal;

  // In workspace mode, pass the workspace root so lane sessions can find .pi/ config.
  // configPath is .pi/orchid-workspace.yaml → parent of parent is workspace root.
  const wsRoot = workspaceConfig ? dirname(dirname(workspaceConfig.configPath)) : undefined;
  const isWsMode = !!workspaceConfig;
  const backend: RuntimeBackend = "v2";
  if (runtimeBackend && runtimeBackend !== "v2") {
    execLog(
      "wave",
      waveTag,
      `legacy runtime backend '${runtimeBackend}' requested but ignored; using Runtime V2`,
    );
  }
  execLog("wave", waveTag, "using Runtime V2 backend (executeLaneV2)");

  // Root under which `.pi/runtime/<batchId>/…` state lives; shared by the
  // snapshot cleanup and the monitor.
  const runtimeStateRoot = resolveRuntimeStateRoot(repoRoot, wsRoot);

  // Clear stale lane snapshots from prior waves before launching new workers.
  // Without this, the monitor reads a snapshot from wave N-1 (different taskId,
  // staleMs > 30s) and may falsely mark the new task as failed before the
  // worker has time to write its first snapshot.
  for (const lane of lanes) {
    try {
      const snapPath = join(runtimeStateRoot, ".pi", "runtime", batchId, "lanes", `lane-${lane.laneNumber}.json`);
      if (existsSync(snapPath)) unlinkSync(snapPath);
    } catch {
      /* best effort */
    }
  }

  // Start lane execution promises. Later spreads override earlier keys.
  const lanePromises = lanes.map((lane) =>
    executeLaneV2(
      lane,
      config,
      repoRoot,
      wavePauseSignal,
      wsRoot,
      isWsMode,
      {
        ORCH_BATCH_ID: batchId,
        TASKPLANE_SUPERVISOR_AUTONOMY: supervisorAutonomy,
        ...buildWorkerEnv(workerConfig),
        ...buildReviewerEnv(reviewerConfig),
        ...buildWorkerExcludeEnv(workerExcludeExtensions),
      },
      onSupervisorAlert,
      onLaneTerminated,
      onLaneRespawned,
    ),
  );

  // Start monitoring as a sibling async loop.
  // Monitor runs concurrently and stops when all lanes are terminal or paused.
  const monitorPromise = monitorLanes(
    lanes,
    config,
    repoRoot,
    wavePauseSignal,
    waveIndex,
    onMonitorUpdate,
    isWsMode,
    backend,
    batchId,
    runtimeStateRoot,
  );

  // ── Stage 4: Wait for all lanes + apply policy ───────────────
  let laneResults: LaneExecutionResult[];
  let finalMonitorState: MonitorState | null = null;

  if (policy === "stop-all") {
    // For stop-all: as soon as any lane reports failure, all lane agents are killed.
    laneResults = await executeWithStopAll(lanes, lanePromises, wavePauseSignal, waveIndex);
  } else {
    // For skip-dependents and stop-wave: let all lanes settle, then inspect results.
    const settled = await Promise.allSettled(lanePromises);

    laneResults = settled.map((result, idx) => {
      if (result.status === "fulfilled") {
        return result.value;
      }
      // Rejected promise — shouldn't normally happen (executeLane catches errors).
      // Synthesize a result that marks every task of the lane as failed.
      const errMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);
      execLog("wave", waveTag, `lane ${lanes[idx].laneId} promise rejected: ${errMsg}`);
      return {
        laneNumber: lanes[idx].laneNumber,
        laneId: lanes[idx].laneId,
        tasks: lanes[idx].tasks.map((t) => ({
          taskId: t.taskId,
          status: "failed" as LaneTaskStatus,
          startTime: null,
          endTime: null,
          exitReason: `Lane promise rejected: ${errMsg}`,
          sessionName: laneSessionIdOf(lanes[idx]),
          doneFileFound: false,
          laneNumber: lanes[idx].laneNumber,
        })),
        overallStatus: "failed" as const,
        startTime: startedAt,
        endTime: Date.now(),
      };
    });

    // For stop-wave: if any task failed, set pause to prevent next wave
    if (policy === "stop-wave") {
      const hasFailure = laneResults.some((lr) =>
        lr.tasks.some((t) => t.status === "failed" || t.status === "stalled"),
      );
      if (hasFailure) {
        wavePauseSignal.paused = true;
        execLog("wave", waveTag, `stop-wave policy triggered — pausing after this wave`);
      }
    }
  }

  // Collect the monitor's final state (it stops on its own once lanes are
  // terminal or the pause signal is set).
  try {
    finalMonitorState = await monitorPromise;
  } catch {
    // Monitor error is non-fatal
    execLog("wave", waveTag, `monitor promise error (non-fatal)`);
  }

  // ── Stage 5: Build WaveExecutionResult ───────────────────────
  const failedTaskIds: string[] = [];
  const skippedTaskIds: string[] = [];
  const succeededTaskIds: string[] = [];

  for (const lr of laneResults) {
    for (const t of lr.tasks) {
      if (t.status === "succeeded") {
        succeededTaskIds.push(t.taskId);
      } else if (t.status === "failed" || t.status === "stalled") {
        // Stalled tasks are treated as failures for policy purposes.
        failedTaskIds.push(t.taskId);
      } else if (t.status === "skipped") {
        skippedTaskIds.push(t.taskId);
      }
    }
  }

  // Sort for deterministic output
  failedTaskIds.sort();
  skippedTaskIds.sort();
  succeededTaskIds.sort();

  const anyFailed = failedTaskIds.length > 0;

  // Compute blocked tasks for future waves (skip-dependents policy)
  let blockedTaskIds: string[] = [];
  if (policy === "skip-dependents" && anyFailed) {
    const blocked = computeTransitiveDependents(new Set(failedTaskIds), dependencyGraph);
    blockedTaskIds = [...blocked].sort();
    if (blockedTaskIds.length > 0) {
      execLog("wave", waveTag, `skip-dependents: ${blockedTaskIds.length} task(s) blocked for future waves`, {
        blocked: blockedTaskIds.join(","),
      });
    }
  }

  // Determine overall wave status
  const stoppedEarly = anyFailed && (policy === "stop-all" || policy === "stop-wave");

  let overallStatus: WaveExecutionResult["overallStatus"];
  if (policy === "stop-all" && anyFailed) {
    overallStatus = "aborted";
  } else if (!anyFailed) {
    overallStatus = "succeeded";
  } else if (succeededTaskIds.length > 0) {
    overallStatus = "partial";
  } else {
    overallStatus = "failed";
  }

  const endedAt = Date.now();
  const elapsedSec = Math.round((endedAt - startedAt) / 1000);

  execLog("wave", waveTag, `wave execution complete: ${overallStatus}`, {
    succeeded: succeededTaskIds.length,
    failed: failedTaskIds.length,
    skipped: skippedTaskIds.length,
    blocked: blockedTaskIds.length,
    elapsed: `${elapsedSec}s`,
    stoppedEarly,
  });

  return {
    waveIndex,
    startedAt,
    endedAt,
    laneResults,
    policyApplied: policy,
    stoppedEarly,
    failedTaskIds,
    skippedTaskIds,
    succeededTaskIds,
    blockedTaskIds,
    laneCount: lanes.length,
    overallStatus,
    finalMonitorState,
    allocatedLanes: lanes,
  };
}
2250
+
2251
/**
 * Execute lanes with stop-all failure policy.
 *
 * Awaits the already-started lane promises. As soon as one lane settles with a
 * failed/stalled task — or rejects — the pause signal is set and the agents of
 * ALL lanes are killed. The function then waits for every lane to settle.
 *
 * Only the first failing lane triggers the abort; later failures are recorded
 * in the results but do not kill again. For logging, the triggering task is
 * the failed task of that lane with the earliest `startTime` (ties broken by
 * task ID).
 *
 * Robustness guarantees:
 * - An error thrown while killing one lane's agents is logged and does not
 *   stop the remaining lanes from being killed.
 * - Errors raised while handling an abort never replace a lane's real result;
 *   only a rejected lane promise produces a synthetic "Lane aborted" result.
 *
 * @param lanes - Allocated lanes (same order as `lanePromises`)
 * @param lanePromises - Already-started lane execution promises
 * @param pauseSignal - Pause signal to set on abort
 * @param waveIndex - Wave number for logging
 * @returns One result per lane, in lane order (may contain aborted/failed tasks)
 */
export async function executeWithStopAll(
  lanes: AllocatedLane[],
  lanePromises: Promise<LaneExecutionResult>[],
  pauseSignal: { paused: boolean },
  waveIndex: number,
): Promise<LaneExecutionResult[]> {
  // Track results as they complete
  const results: (LaneExecutionResult | null)[] = new Array(lanes.length).fill(null);
  let abortTriggered = false;

  // Kill the agents of every lane, isolating failures per lane so one bad
  // kill cannot leave the other lanes running.
  const killAllLanes = (): void => {
    for (const lane of lanes) {
      try {
        killV2LaneAgents(laneSessionIdOf(lane), { laneNumber: lane.laneNumber });
      } catch (killErr: unknown) {
        execLog("wave", `W${waveIndex}`, `stop-all: failed to kill agents for ${lane.laneId} (continuing)`, {
          error: killErr instanceof Error ? killErr.message : String(killErr),
        });
      }
    }
  };

  const wrappedPromises = lanePromises.map(async (promise, idx) => {
    let result: LaneExecutionResult;

    // Only the lane promise itself is guarded here: a rejection means the lane
    // crashed, which is different from an error in the abort handling below.
    try {
      result = await promise;
    } catch (err: unknown) {
      // Lane promise rejection — should be rare
      const errMsg = err instanceof Error ? err.message : String(err);

      // Build a failed result for this lane: every task is marked failed.
      const failedResult: LaneExecutionResult = {
        laneNumber: lanes[idx].laneNumber,
        laneId: lanes[idx].laneId,
        tasks: lanes[idx].tasks.map((t) => ({
          taskId: t.taskId,
          status: "failed" as LaneTaskStatus,
          startTime: null,
          endTime: null,
          exitReason: `Lane aborted: ${errMsg}`,
          sessionName: laneSessionIdOf(lanes[idx]),
          doneFileFound: false,
          laneNumber: lanes[idx].laneNumber,
        })),
        overallStatus: "failed",
        startTime: Date.now(),
        endTime: Date.now(),
      };
      results[idx] = failedResult;

      if (!abortTriggered) {
        abortTriggered = true;
        pauseSignal.paused = true;
        execLog("wave", `W${waveIndex}`, `stop-all triggered by lane error in ${lanes[idx].laneId}: ${errMsg}`);
        killAllLanes();
      }
      return failedResult;
    }

    // Record the real result before any abort handling so it can never be lost.
    results[idx] = result;
    if (abortTriggered) return result;

    const failedTasks = result.tasks.filter((t) => t.status === "failed" || t.status === "stalled");
    if (failedTasks.length === 0) return result;

    // First failure detected — trigger stop-all
    abortTriggered = true;
    pauseSignal.paused = true;

    // Determine which task failed first for logging:
    // sort by startTime, then by taskId for a deterministic tie-break.
    const firstFailed = failedTasks.sort((a, b) => {
      const timeA = a.startTime || 0;
      const timeB = b.startTime || 0;
      if (timeA !== timeB) return timeA - timeB;
      return a.taskId.localeCompare(b.taskId);
    })[0];

    execLog(
      "wave",
      `W${waveIndex}`,
      `stop-all triggered by ${firstFailed?.taskId || "unknown"} in ${lanes[idx].laneId}`,
      {
        session: laneSessionIdOf(lanes[idx]),
      },
    );

    // Kill ALL lane sessions immediately
    killAllLanes();
    return result;
  });

  // Wait for all lanes to settle (they should exit quickly after session kill)
  await Promise.allSettled(wrappedPromises);

  // Fill in any null results (shouldn't happen, but defensive)
  return results.map(
    (r, idx) =>
      r ?? {
        laneNumber: lanes[idx].laneNumber,
        laneId: lanes[idx].laneId,
        tasks: [],
        overallStatus: "failed" as const,
        startTime: Date.now(),
        endTime: Date.now(),
      },
  );
}
2377
+
2378
+ // ── Runtime V2 Bridge Helpers (TP-102) ─────────────────────────────────────
2379
+ //
2380
+ // These helpers bridge between existing legacy data structures
2381
+ // (AllocatedLane, AllocatedTask, resolveCanonicalTaskPaths) and
2382
+ // Runtime V2 contracts (ExecutionUnit, PacketPaths, RuntimeAgentId).
2383
+ //
2384
+ // They are additive — existing code paths continue to work.
2385
+ // Runtime V2 consumers can start using these to avoid coupling to
2386
+ // legacy lane-session naming, cwd-derived paths, or extension lifecycle assumptions.
2387
+ // ────────────────────────────────────────────────────────────────────────────
2388
+
2389
/**
 * Translate a legacy lane/task pair into a Runtime V2 `ExecutionUnit`.
 *
 * Packet paths are derived through `resolveCanonicalTaskPaths`, so the
 * existing resolution logic (worktree-relative, cross-repo copy, archive
 * fallback) stays the source of truth and is merely surfaced through the
 * Runtime V2 contract.
 *
 * **Cross-repo packet authority (TP-109):** in workspace mode, when the
 * task packet's home repo differs from the execution repo, the legacy path
 * copies packet files into the worktree under `.orchid-tasks/`. This is by
 * design: the worker reads/writes STATUS.md and creates .DONE in the
 * worktree, and resume checks both the worktree-relative path and the
 * original discovery path for .DONE detection.
 *
 * `packetHomeRepoId` names the repo that *owns* the task (discovery and
 * routing); `packet.taskFolder` is the authoritative *working* location
 * for artifacts during execution.
 *
 * @param lane - Allocated lane supplying the worktree path and repo id
 * @param task - Allocated task to wrap in an execution unit
 * @param repoRoot - Main repository root
 * @param isWorkspaceMode - Whether workspace mode is active
 * @returns The resolved ExecutionUnit
 * @throws ExecutionError `EXEC_MISSING_TASK_FOLDER` when the task has no
 *   (or an empty) `taskFolder`
 *
 * @since TP-102
 */
export function buildExecutionUnit(
  lane: AllocatedLane,
  task: AllocatedTask,
  repoRoot: string,
  isWorkspaceMode?: boolean,
): ExecutionUnit {
  // TP-169: task stubs rebuilt from persisted state may lack taskFolder
  // (e.g. dynamically-expanded segments whose persisted records had an
  // empty taskFolder). Fail loudly instead of resolving garbage paths.
  const taskFolder = task.task?.taskFolder;
  if (!taskFolder) {
    throw new ExecutionError(
      "EXEC_MISSING_TASK_FOLDER",
      `Cannot build execution unit for task ${task.taskId}: taskFolder is ${taskFolder === "" ? "empty" : "undefined"}. ` +
        `This typically means the task's persisted record was not enriched with discovery data. ` +
        `Re-run discovery or check that the task exists in the task area.`,
      "execution",
      task.taskId,
    );
  }

  // Always resolve the canonical (worktree-side) paths first, exactly as
  // the legacy path does, even if the absolute packet path wins below.
  const canonical = resolveCanonicalTaskPaths(
    taskFolder,
    lane.worktreePath,
    repoRoot,
    isWorkspaceMode,
  );

  const parsedTask = task.task;
  const executionRepoId = lane.repoId ?? "default";
  const packetHomeRepoId = parsedTask.packetRepoId ?? executionRepoId;

  // Segment executions are identified by their segment id; plain task
  // executions fall back to the task id.
  const segmentId = parsedTask.activeSegmentId ?? null;

  // The absolute packetTaskPath is authoritative ONLY for cross-repo
  // segments. Within a single repo the packet must resolve inside the
  // worktree so .DONE, STATUS.md etc. are written there rather than into
  // the original checkout.
  const isCrossRepoPacket = parsedTask.packetTaskPath && packetHomeRepoId !== executionRepoId;
  const workingFolder = canonical.taskFolderResolved;

  const packet = isCrossRepoPacket
    ? resolvePacketPaths(parsedTask.packetTaskPath!)
    : {
        promptPath: `${workingFolder}/PROMPT.md`,
        statusPath: canonical.statusPath,
        donePath: canonical.donePath,
        reviewsDir: `${workingFolder}/.reviews`,
        taskFolder: workingFolder,
      };

  return {
    id: segmentId ?? task.taskId,
    taskId: task.taskId,
    segmentId,
    executionRepoId,
    packetHomeRepoId,
    worktreePath: lane.worktreePath,
    packet,
    task: parsedTask,
  };
}
2484
+
2485
/**
 * Derive a Runtime V2 agent id for one of a lane's agents.
 *
 * The lane session id (e.g. "orch-henrylach-lane-1") is the base:
 * - `merger` with a merge index → the `-lane-N` suffix is stripped and
 *   `-merge-<index>` appended;
 * - `lane-runner` → the lane session id itself;
 * - any other role (including `merger` without an index) →
 *   `<laneSessionId>-<role>`.
 *
 * @param lane - Allocated lane whose session id is the naming base
 * @param role - Agent role
 * @param mergeIndex - Merge wave index (only meaningful for merge agents)
 * @returns Canonical agent id
 *
 * @since TP-102
 */
export function buildAgentIdFromLane(
  lane: AllocatedLane,
  role: RuntimeAgentRole,
  mergeIndex?: number,
): RuntimeAgentId {
  const sessionId = laneSessionIdOf(lane);

  // Merge agents are named after the session prefix, not the lane.
  if (role === "merger" && mergeIndex != null) {
    const sessionPrefix = sessionId.replace(/-lane-\d+$/, "");
    return `${sessionPrefix}-merge-${mergeIndex}`;
  }

  return role === "lane-runner" ? sessionId : `${sessionId}-${role}`;
}
2517
+
2518
+ /**
2519
+ * Resolve the Runtime V2 state root from available context.
2520
+ *
2521
+ * The state root is where `.pi/runtime/` artifacts live. In workspace
2522
+ * mode this is the workspace root; in repo mode it's the repo root.
2523
+ *
2524
+ * This centralizes the resolution so Runtime V2 code doesn't need
2525
+ * to repeat the workspace-vs-repo logic.
2526
+ *
2527
+ * @param repoRoot - Main repository root
2528
+ * @param workspaceRoot - Workspace root (undefined in repo mode)
2529
+ * @returns Absolute path to use as the state root for .pi/ artifacts
2530
+ *
2531
+ * @since TP-102
2532
+ */
2533
/**
 * Parse an agent `.md` file into its frontmatter map and markdown body.
 *
 * Frontmatter is recognised only when the file's first non-blank line is a
 * `---` fence and a later line consisting solely of `---` closes it. This
 * keeps a markdown horizontal rule in a frontmatter-less file, or a `---`
 * embedded in a frontmatter value, from being mistaken for a fence.
 *
 * Frontmatter lines are matched as flat `key: value` pairs (keys made of
 * word characters and `-`); lines that do not match are ignored. Values
 * are kept verbatim apart from trimming — quotes are not stripped.
 *
 * @param filePath - Path of the agent definition file
 * @returns `{ fm, body }`; `fm` is empty when the file has no (or an
 *   unterminated) frontmatter block. `null` when the file does not exist
 *   or cannot be read.
 * @since TP-117
 */
function parseAgentFile(filePath: string): { fm: Record<string, string>; body: string } | null {
  try {
    if (!existsSync(filePath)) return null;
    const raw = readFileSync(filePath, "utf-8");

    // trimStart() also drops a UTF-8 BOM, so a BOM or leading blank lines
    // before the opening fence are tolerated.
    const text = raw.trimStart();
    const opener = /^---[ \t]*\r?\n/.exec(text);
    if (!opener) return { fm: {}, body: raw.trim() };

    // The closing fence must occupy a whole line; with the `m` flag `$`
    // also matches before a `\r`, so CRLF files work.
    const afterOpener = text.slice(opener[0].length);
    const closer = /^---[ \t]*$/m.exec(afterOpener);
    if (!closer) return { fm: {}, body: raw.trim() };

    const fm: Record<string, string> = {};
    for (const line of afterOpener.slice(0, closer.index).trim().split("\n")) {
      const m = line.match(/^([\w-]+)\s*:\s*(.+)/);
      if (m) fm[m[1]] = m[2].trim();
    }
    return { fm, body: afterOpener.slice(closer.index + closer[0].length).trim() };
  } catch {
    // Unreadable file (permissions, directory, ...) is reported like a
    // missing one.
    return null;
  }
}
2555
+
2556
/**
 * Load the body of a packaged (base) agent template.
 *
 * The template path comes from `resolveTaskplaneAgentTemplate`, which —
 * per the original notes here — handles the various npm setups (nvm,
 * Homebrew, volta, Windows, ...) via `npm root -g` caching and well-known
 * fallback paths (see path-resolver.ts, TP-157), so that a resolvable
 * template is not silently turned into "" (which would cause the worker
 * to skip reviews).
 *
 * @param agentName - Template name, e.g. "task-worker"
 * @returns The template body, or "" when the template is missing, has an
 *   empty body, or resolution throws
 * @since TP-117
 */
function loadBaseAgentPrompt(agentName: string): string {
  try {
    const templatePath = resolveTaskplaneAgentTemplate(agentName);
    if (!existsSync(templatePath)) return "";
    return parseAgentFile(templatePath)?.body || "";
  } catch {
    // Resolution failure is treated the same as "no template".
    return "";
  }
}
2576
+
2577
/**
 * Load the project-local prompt for an agent.
 *
 * Candidates are probed in order: `<stateRoot>/.pi/agents/<name>.md`, then
 * `<stateRoot>/agents/<name>.md`. The first parsed file that is either
 * marked `standalone: true` or has a non-empty body decides the result;
 * a parsed file with an empty body and no standalone flag is skipped.
 *
 * NOTE(review): the standalone and non-standalone cases both return just
 * the body, so the caller cannot tell from the return value whether the
 * local file was meant to replace the base prompt.
 *
 * @param stateRoot - Directory searched for local agent files
 * @param agentName - Agent name, e.g. "task-worker"
 * @returns The local prompt body, or "" when nothing usable is found
 * @since TP-117
 */
function loadLocalAgentPrompt(stateRoot: string, agentName: string): string {
  const fileName = `${agentName}.md`;
  const candidates = [join(stateRoot, ".pi", "agents", fileName), join(stateRoot, "agents", fileName)];

  for (const candidate of candidates) {
    const parsed = parseAgentFile(candidate);
    if (!parsed) continue;
    // standalone: true → body is returned as-is, even when empty;
    // otherwise only a non-empty body ends the search.
    if (parsed.fm.standalone === "true" || parsed.body) return parsed.body;
  }
  return "";
}
2598
+
2599
+ // ── Agent Definition Loading ─────────────────────────────────────────
2600
+
2601
/**
 * Latch recording that an agent-pointer warning has already been printed,
 * so the warning is emitted at most once until the latch is reset.
 */
let _execPointerWarningLogged = false;

/**
 * Clear the pointer-warning latch so the next warning is logged again.
 * Intended for tests.
 * @since TP-161
 */
export function resetPointerWarning(): void {
  _execPointerWarningLogged = false;
}
2611
+
2612
/**
 * Resolve the agent root advertised by the workspace pointer.
 *
 * Only active when `TASKPLANE_WORKSPACE_ROOT` is set (workspace mode). A
 * warning carried by the pointer result is written to stderr once, guarded
 * by the module-level latch.
 *
 * @returns The pointer's `agentRoot`, or null when the env var is unset,
 *   the pointer yields no agent root, or loading/resolution throws
 */
function resolveAgentPointerRoot(): string | null {
  const workspaceRoot = process.env.TASKPLANE_WORKSPACE_ROOT;
  if (!workspaceRoot) return null;

  try {
    const pointer = resolvePointer(workspaceRoot, loadWorkspaceConfig(workspaceRoot));
    if (pointer?.warning) {
      if (!_execPointerWarningLogged) {
        _execPointerWarningLogged = true;
        console.error(`[execution] pointer: ${pointer.warning}`);
      }
    }
    return pointer?.agentRoot ?? null;
  } catch {
    // Any config/pointer failure degrades to "no pointer root".
    return null;
  }
}
2631
+
2632
/**
 * Load a complete agent definition (systemPrompt + tools + model) by name.
 *
 * Local candidates, first parsed file wins:
 * 1. `<cwd>/.pi/agents/<name>.md`
 * 2. `<cwd>/agents/<name>.md`
 * 3. `<pointer.agentRoot>/<name>.md` (only when a pointer root resolves)
 *
 * The packaged base template is loaded independently via
 * `resolveTaskplaneAgentTemplate`.
 *
 * Composition:
 * - local file with `standalone: true` → the local file alone is used;
 * - otherwise the local body (if any) is appended to the base body under
 *   a "Project-Specific Guidance" heading, and local frontmatter values
 *   for `tools` / `model` take precedence over the base's.
 *
 * `tools` defaults to "read,grep,find,ls" and `model` to "".
 *
 * @param cwd - Working directory (project root) searched for local files
 * @param name - Agent name (e.g., "task-worker", "task-reviewer")
 * @returns The composed definition, or null when neither a base nor a
 *   local file could be parsed
 * @since TP-161
 */
export function loadAgentDef(
  cwd: string,
  name: string,
): { systemPrompt: string; tools: string; model: string } | null {
  const fileName = `${name}.md`;
  const candidates = [join(cwd, ".pi", "agents", fileName), join(cwd, "agents", fileName)];

  // Workspace mode: the pointer-resolved agent root is the last fallback.
  const pointerRoot = resolveAgentPointerRoot();
  if (pointerRoot) candidates.push(join(pointerRoot, fileName));

  // Packaged base template (optional).
  let base: { fm: Record<string, string>; body: string } | null = null;
  try {
    const templatePath = resolveTaskplaneAgentTemplate(name);
    if (existsSync(templatePath)) base = parseAgentFile(templatePath);
  } catch {
    /* no base template available */
  }

  // First local candidate that parses wins.
  let local: { fm: Record<string, string>; body: string } | null = null;
  for (const candidate of candidates) {
    const parsed = parseAgentFile(candidate);
    if (parsed) {
      local = parsed;
      break;
    }
  }

  if (!base && !local) return null;

  // Standalone local definition: the base is ignored entirely.
  if (local?.fm.standalone === "true") {
    return {
      systemPrompt: local.body,
      tools: local.fm.tools || "read,grep,find,ls",
      model: local.fm.model || "",
    };
  }

  const baseBody = base?.body || "";
  const localBody = local?.body || "";
  let systemPrompt = baseBody;
  if (localBody) {
    systemPrompt = `${baseBody}\n\n---\n\n## Project-Specific Guidance\n\n${localBody}`;
  }

  return {
    systemPrompt: systemPrompt.trim(),
    // Local frontmatter overrides base frontmatter.
    tools: local?.fm.tools || base?.fm.tools || "read,grep,find,ls",
    model: local?.fm.model || base?.fm.model || "",
  };
}
2704
+
2705
/**
 * Pick the Runtime V2 state root: the workspace root when one is supplied
 * (workspace mode), otherwise the repo root (repo mode).
 *
 * Centralised so Runtime V2 code does not repeat the workspace-vs-repo
 * decision. Only a missing workspace root falls back — an empty string is
 * returned as given.
 *
 * @param repoRoot - Main repository root
 * @param workspaceRoot - Workspace root (undefined in repo mode)
 * @returns The path to use as the state root
 *
 * @since TP-102
 */
export function resolveRuntimeStateRoot(repoRoot: string, workspaceRoot?: string): string {
  if (workspaceRoot == null) {
    return repoRoot;
  }
  return workspaceRoot;
}
2708
+
2709
+ // ── Runtime V2 Lane Execution (TP-105) ────────────────────────────
2710
+
2711
+ import { executeTaskV2, type LaneRunnerConfig, type LaneRunnerTaskResult } from "./lane-runner.ts";
2712
+ import { DEFAULT_WORKER_USER_TOOLS } from "./agent-host.ts";
2713
+
2714
+ /**
2715
+ * Execute a lane using the Runtime V2 headless backend.
2716
+ *
2717
+ * This replaces the legacy session-backed `executeLane()` for lanes that
2718
+ * should run on the new direct-child architecture. It uses the
2719
+ * lane-runner module which spawns workers via agent-host.ts instead
2720
+ * of terminal-session-backed workers.
2721
+ *
2722
+ * The function signature is deliberately close to the legacy
2723
+ * `executeLane()` to minimize integration churn in the engine.
2724
+ * The key difference: no legacy lane sessions are created.
2725
+ *
2726
+ * @since TP-105
2727
+ */
2728
+
2729
+ /**
2730
+ * Build reviewer env vars from a TaskRunnerConfig or reviewer config object.
2731
+ * Used to ensure reviewer config is consistently passed to executeLaneV2
2732
+ * across all call sites (initial waves, resume, retries).
2733
+ *
2734
+ * Returns only the keys that have non-empty values, so that empty/inherit
2735
+ * config does not override inherited env vars from the parent process.
2736
+ *
2737
+ * @since TP-160
2738
+ */
2739
/**
 * Decode an env-var value expected to hold a JSON array of strings.
 *
 * Non-string array members are dropped. Anything that is missing, empty,
 * not valid JSON, or not an array yields an empty list.
 *
 * @param value - Raw env-var value, if any
 * @returns The string members of the decoded array
 * @since TP-180
 */
function parseJsonArrayEnv(value?: string): string[] {
  if (!value) return [];

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is treated as "no entries".
    return [];
  }
  if (!Array.isArray(decoded)) return [];

  const strings: string[] = [];
  for (const entry of decoded) {
    if (typeof entry === "string") strings.push(entry);
  }
  return strings;
}
2754
+
2755
/**
 * Translate reviewer config into `TASKPLANE_REVIEWER_*` env vars.
 *
 * Only non-empty settings produce a key, so an empty/inherit config never
 * overrides a value inherited from the parent process. A non-empty
 * `excludeExtensions` list is forwarded JSON-encoded (TP-180).
 *
 * @param reviewerConfig - Reviewer settings; null/undefined yields `{}`
 * @returns Env-var map containing only the populated settings
 * @since TP-160
 */
export function buildReviewerEnv(
  reviewerConfig?: {
    model?: string;
    thinking?: string;
    tools?: string;
    excludeExtensions?: string[];
  } | null,
): Record<string, string> {
  const env: Record<string, string> = {};
  if (!reviewerConfig) return env;

  const { model, thinking, tools, excludeExtensions } = reviewerConfig;
  if (model) env.TASKPLANE_REVIEWER_MODEL = model;
  if (thinking) env.TASKPLANE_REVIEWER_THINKING = thinking;
  if (tools) env.TASKPLANE_REVIEWER_TOOLS = tools;
  if (excludeExtensions?.length) {
    env.TASKPLANE_REVIEWER_EXCLUDE_EXTENSIONS = JSON.stringify(excludeExtensions);
  }
  return env;
}
2773
+
2774
/**
 * Translate worker config into `TASKPLANE_WORKER_*` env vars, mirroring
 * `buildReviewerEnv`.
 *
 * Only `model`, `thinking` and `tools` are forwarded, and only when
 * non-empty. `excludeExtensions` is accepted in the parameter type but is
 * not emitted here — `buildWorkerExcludeEnv` handles extension exclusions.
 *
 * @param workerConfig - Worker settings; null/undefined yields `{}`
 * @returns Env-var map containing only the populated settings
 * @since TP-181
 */
export function buildWorkerEnv(
  workerConfig?: {
    model?: string;
    thinking?: string;
    tools?: string;
    excludeExtensions?: string[];
  } | null,
): Record<string, string> {
  const env: Record<string, string> = {};
  if (!workerConfig) return env;

  const { model, thinking, tools } = workerConfig;
  if (model) env.TASKPLANE_WORKER_MODEL = model;
  if (thinking) env.TASKPLANE_WORKER_THINKING = thinking;
  if (tools) env.TASKPLANE_WORKER_TOOLS = tools;
  return env;
}
2797
+
2798
/**
 * Encode the worker extension-exclusion list as an env var.
 *
 * @param workerExcludeExtensions - Extensions to exclude for workers
 * @returns `{ TASKPLANE_WORKER_EXCLUDE_EXTENSIONS: <JSON array> }` when the
 *   list is non-empty, otherwise `{}`
 * @since TP-180
 */
export function buildWorkerExcludeEnv(
  workerExcludeExtensions?: string[] | null,
): Record<string, string> {
  if (!workerExcludeExtensions?.length) {
    return {};
  }
  return { TASKPLANE_WORKER_EXCLUDE_EXTENSIONS: JSON.stringify(workerExcludeExtensions) };
}
2811
+
2812
/**
 * Execute a lane's tasks sequentially on the Runtime V2 headless backend.
 *
 * Counterpart of the legacy session-backed `executeLane()`: each task is
 * handed to `executeTaskV2` from the lane-runner module instead of a
 * terminal-session-backed worker, and no legacy lane session is created.
 *
 * Per task:
 * - when the pause signal is set, or an earlier task in this lane failed
 *   or stalled, the task is recorded as `skipped`;
 * - otherwise an execution unit is built and run; on success the task's
 *   artifacts are committed and, unless it was the lane's last task, the
 *   worktree is reset (`git checkout -- .` + `git clean -fd`);
 * - if building the unit or running the task throws, the task is recorded
 *   as `failed` with a `spawn_failure` diagnostic, a terminal lane snapshot
 *   is written (best effort), and the remaining tasks are skipped.
 *
 * The returned promise is not expected to reject for per-task failures;
 * they are reported through the task outcomes.
 *
 * @param lane - Lane to execute (worktree, branch, ordered tasks)
 * @param config - Orchestrator configuration
 * @param repoRoot - Main repository root
 * @param pauseSignal - Shared flag; checked before each task and passed to
 *   the task runner
 * @param workspaceRoot - Workspace root (undefined in repo mode)
 * @param isWorkspaceMode - Whether workspace mode is active
 * @param extraEnvVars - `TASKPLANE_*` / `ORCH_BATCH_ID` settings used to
 *   fill the lane-runner config
 * @param onSupervisorAlert - Forwarded to the lane runner
 * @param onLaneTerminated - Forwarded to the lane runner
 * @param onLaneRespawned - See parameter comment below
 * @returns Lane result: `succeeded` when every outcome succeeded (also for
 *   an empty lane), `failed` when any task failed or stalled, otherwise
 *   `partial`
 *
 * @since TP-105
 */
export async function executeLaneV2(
  lane: AllocatedLane,
  config: OrchestratorConfig,
  repoRoot: string,
  pauseSignal: { paused: boolean },
  workspaceRoot?: string,
  isWorkspaceMode?: boolean,
  extraEnvVars?: Record<string, string>,
  onSupervisorAlert?: SupervisorAlertCallback,
  onLaneTerminated?: import("./types.ts").LaneTerminatedCallback,
  /**
   * TP-187 (#538): Optional callback fired BEFORE the first task of this
   * lane begins. The supervisor process uses it to lift any zombie-alert
   * suppression that was applied when this lane number was previously
   * terminated (e.g., in a prior wave).
   */
  onLaneRespawned?: (laneNumber: number, agentId: string, batchId: string) => void,
): Promise<LaneExecutionResult> {
  const laneId = lane.laneId;
  const laneStartTime = Date.now();
  const outcomes: LaneTaskOutcome[] = [];
  let shouldSkipRemaining = false;

  const stateRoot = resolveRuntimeStateRoot(repoRoot, workspaceRoot);
  const batchId = config.orchestrator?.batchId || extraEnvVars?.ORCH_BATCH_ID || String(Date.now());

  // Build agent ID prefix — must match the wave planner's naming (TP-115).
  // Uses resolveOperatorId() so agent registry keys align with lane session IDs.
  const sessionPrefix = config.orchestrator?.sessionPrefix ?? "orch";
  const opId = resolveOperatorId(config);
  const agentIdPrefix = `${sessionPrefix}-${opId}`;

  // Load worker agent definition: compose base template + local project guidance.
  // The base template (templates/agents/task-worker.md) contains critical behavioral
  // rules: checkpoint discipline, STATUS.md resume algorithm, review_step instructions.
  // The local file (.pi/agents/task-worker.md) adds project-specific guidance.
  let workerSystemPrompt =
    "You are a task execution agent. Read STATUS.md first, find unchecked items, work on them, checkpoint after each.";
  let workerSegmentPrompt = "";
  try {
    const basePrompt = loadBaseAgentPrompt("task-worker");
    const localPrompt = loadLocalAgentPrompt(stateRoot, "task-worker");
    if (basePrompt && localPrompt) {
      workerSystemPrompt = basePrompt + "\n\n---\n\n## Project-Specific Guidance\n\n" + localPrompt;
    } else if (basePrompt) {
      workerSystemPrompt = basePrompt;
    } else if (localPrompt) {
      workerSystemPrompt = localPrompt;
    }
    // Load segment-scoped prompt overlay (appended when isSegmentScoped)
    const segPrompt = loadBaseAgentPrompt("task-worker-segment");
    if (segPrompt) workerSegmentPrompt = segPrompt;
  } catch {
    /* use default */
  }

  execLog(laneId, "LANE", `starting Runtime V2 execution of ${lane.tasks.length} task(s)`, {
    worktree: lane.worktreePath,
    agentPrefix: agentIdPrefix,
  });

  // Lane-wide values that do not change from task to task.
  const workerAgentId = buildRuntimeAgentId(agentIdPrefix, lane.laneNumber, "worker");
  const repoId = lane.repoId ?? "default";
  const rawAutonomy = String(extraEnvVars?.TASKPLANE_SUPERVISOR_AUTONOMY ?? "autonomous").toLowerCase();
  const supervisorAutonomy: LaneRunnerConfig["supervisorAutonomy"] =
    rawAutonomy === "interactive" || rawAutonomy === "supervised" || rawAutonomy === "autonomous"
      ? (rawAutonomy as LaneRunnerConfig["supervisorAutonomy"])
      : "autonomous";

  // TP-187 (#538): Lane is freshly starting — emit lane-respawned so any
  // zombie-alert suppression carried over from a prior wave's termination of
  // this lane number is lifted before new alerts begin to flow.
  if (onLaneRespawned) {
    try {
      onLaneRespawned(lane.laneNumber, workerAgentId, batchId);
    } catch (err) {
      execLog(
        laneId,
        "LANE",
        `lane-respawned callback failed: ${err instanceof Error ? err.message : String(err)}`,
      );
    }
  }

  for (const [taskIndex, task] of lane.tasks.entries()) {
    const taskSegmentId = task.task.activeSegmentId ?? null;
    if (shouldSkipRemaining || pauseSignal.paused) {
      const reason = pauseSignal.paused
        ? "Skipped due to pause signal"
        : "Skipped due to prior task failure in lane";
      outcomes.push({
        taskId: task.taskId,
        status: "skipped",
        segmentId: taskSegmentId,
        startTime: null,
        endTime: null,
        exitReason: reason,
        sessionName: workerAgentId,
        doneFileFound: false,
        laneNumber: lane.laneNumber,
      });
      continue;
    }

    // A fresh config object per task, so nothing the task runner might do
    // to it can leak into the next task.
    const laneRunnerConfig: LaneRunnerConfig = {
      batchId,
      agentIdPrefix,
      laneNumber: lane.laneNumber,
      worktreePath: lane.worktreePath,
      branch: lane.branch,
      repoId,
      stateRoot,
      workerModel: extraEnvVars?.TASKPLANE_WORKER_MODEL || "",
      // TP-184: This is the user-tools default. Engine bridge tools are NOT
      // added here — buildWorkerToolsAllowlist() at the lane-runner spawn
      // site appends ENGINE_BRIDGE_TOOLS exactly once, regardless of source.
      workerTools: extraEnvVars?.TASKPLANE_WORKER_TOOLS || DEFAULT_WORKER_USER_TOOLS,
      workerThinking: extraEnvVars?.TASKPLANE_WORKER_THINKING || "",
      workerSystemPrompt,
      workerSegmentPrompt,
      reviewerModel: extraEnvVars?.TASKPLANE_REVIEWER_MODEL || "",
      reviewerThinking: extraEnvVars?.TASKPLANE_REVIEWER_THINKING || "",
      reviewerTools: extraEnvVars?.TASKPLANE_REVIEWER_TOOLS || "",
      // TP-180: Extension exclusion lists from config
      workerExcludeExtensions: parseJsonArrayEnv(extraEnvVars?.TASKPLANE_WORKER_EXCLUDE_EXTENSIONS),
      reviewerExcludeExtensions: parseJsonArrayEnv(
        extraEnvVars?.TASKPLANE_REVIEWER_EXCLUDE_EXTENSIONS,
      ),
      supervisorAutonomy,
      // TP-195: replaced `config.project?.name` (no `project` field on
      // `OrchestratorConfig`; always undefined) with the env-var read
      // already used elsewhere in the codebase (lane-runner.ts:668 sets
      // `TASKPLANE_PROJECT_NAME` from the same source). When the env
      // var is unset, falls through to the same `"project"` literal as
      // before — behavior-neutral.
      projectName: extraEnvVars?.TASKPLANE_PROJECT_NAME || "project",
      maxIterations: 20,
      noProgressLimit: 3,
      // TP-195: read the canonical `max_worker_minutes` field (snake_case
      // per `OrchestratorConfig.failure` in types.ts). The previous code
      // read a non-existent `maxWorkerMinutes` camelCase alias — always
      // undefined — silently ignoring any operator-set value. Honoring
      // the config is the intended behavior; default of 120 preserved
      // when the field is unset.
      maxWorkerMinutes: config.failure?.max_worker_minutes || 120,
      warnPercent: 85,
      killPercent: 95,
      onSupervisorAlert,
      onLaneTerminated,
    };

    try {
      // Built inside the try on purpose: buildExecutionUnit throws when the
      // task has no taskFolder (TP-169). Handling that here records the
      // task as failed and keeps the outcomes of earlier tasks, instead of
      // rejecting the whole lane promise.
      const unit = buildExecutionUnit(lane, task, repoRoot, isWorkspaceMode);

      const result = await executeTaskV2(unit, laneRunnerConfig, pauseSignal);
      outcomes.push({
        ...result.outcome,
        laneNumber: result.outcome.laneNumber ?? lane.laneNumber,
      });

      // Commit artifacts after success (same as legacy path)
      if (result.outcome.status === "succeeded") {
        commitTaskArtifacts(lane, task, laneId);
        // Reset worktree for next task
        if (taskIndex < lane.tasks.length - 1) {
          runGit(["checkout", "--", "."], lane.worktreePath);
          runGit(["clean", "-fd"], lane.worktreePath);
        }
      }

      if (result.outcome.status === "failed" || result.outcome.status === "stalled") {
        shouldSkipRemaining = true;
      }
    } catch (err: unknown) {
      const errMsg = err instanceof Error ? err.message : String(err);
      execLog(laneId, task.taskId, `Runtime V2 execution error: ${errMsg}`);

      // TP-190 (#561): Spawn-stage failures (Pi CLI not findable, worktree
      // provisioning failure, etc.) reach this catch synchronously —
      // `spawnAgent()` calls `resolvePiCliPath()` and other resolvers that
      // throw before any process is registered. Execution-unit construction
      // failures land here as well. Tag the outcome with the
      // `spawn_failure` ExitClassification so:
      //   1. The retry classifier (TIER0_RETRYABLE_CLASSIFICATIONS) excludes
      //      it deterministically — spawn errors are never transient.
      //   2. The supervisor `task-failure` IPC alert can carry
      //      `context.exitCategory = "spawn_failure"` so the playbook can
      //      escalate immediately rather than retrying.
      //   3. The engine's post-wave logic can transition `phase` to
      //      `"failed"` when every lane in a wave spawn-failed.
      const spawnExitDiagnostic: TaskExitDiagnostic = {
        classification: "spawn_failure",
        exitCode: null,
        errorMessage: errMsg,
        tokensUsed: null,
        contextPct: null,
        partialProgressCommits: 0,
        partialProgressBranch: null,
        durationSec: 0,
        lastKnownStep: null,
        lastKnownCheckbox: null,
        repoId,
      };
      // One timestamp for both ends: the task never actually ran.
      const failedAt = Date.now();
      outcomes.push({
        taskId: task.taskId,
        status: "failed",
        segmentId: taskSegmentId,
        startTime: failedAt,
        endTime: failedAt,
        exitReason: `spawn failure: ${errMsg}`,
        sessionName: workerAgentId,
        doneFileFound: false,
        laneNumber: lane.laneNumber,
        exitDiagnostic: spawnExitDiagnostic,
      });

      // TP-190 (#561): Write a synthetic terminal lane snapshot so the
      // monitor (`monitorLanes` → `resolveTaskMonitorState`) reads
      // `snap.taskId === taskId` AND `snap.status === "failed"`, which sets
      // `sessionAlive = false` and triggers Priority 3 ("Session exited
      // without .DONE → failed"). Without this, the monitor's
      // `snap == null` startup-grace branch keeps `sessionAlive = true`
      // indefinitely and `executeWave` blocks forever on `await
      // monitorPromise`. Use the full `RuntimeLaneSnapshot` shape so
      // dashboard consumers stay schema-consistent.
      try {
        const spawnFailureSnapshot: RuntimeLaneSnapshot = {
          batchId,
          laneNumber: lane.laneNumber,
          laneId: `lane-${lane.laneNumber}`,
          repoId,
          taskId: task.taskId,
          segmentId: taskSegmentId,
          status: "failed",
          worker: {
            agentId: workerAgentId,
            status: "crashed",
            elapsedMs: 0,
            toolCalls: 0,
            contextPct: 0,
            costUsd: 0,
            lastTool: "",
            inputTokens: 0,
            outputTokens: 0,
            cacheReadTokens: 0,
            cacheWriteTokens: 0,
          },
          reviewer: null,
          progress: null,
          updatedAt: Date.now(),
        };
        writeLaneSnapshot(
          stateRoot,
          batchId,
          lane.laneNumber,
          spawnFailureSnapshot as unknown as Record<string, unknown>,
        );
      } catch (snapErr) {
        // Best effort — if the snapshot write fails, the monitor's
        // 30s-staleness fallback (snap with old updatedAt) eventually
        // kicks in via the registry liveness check. Log so this is
        // visible in operator diagnostics, but do NOT throw.
        execLog(
          laneId,
          task.taskId,
          `spawn-failure snapshot write failed (non-fatal): ${snapErr instanceof Error ? snapErr.message : String(snapErr)}`,
        );
      }

      shouldSkipRemaining = true;
    }
  }

  const endTime = Date.now();
  const succeeded = outcomes.every((o) => o.status === "succeeded");
  const failed = outcomes.some((o) => o.status === "failed" || o.status === "stalled");

  return {
    laneNumber: lane.laneNumber,
    laneId,
    tasks: outcomes,
    overallStatus: succeeded ? "succeeded" : failed ? "failed" : "partial",
    startTime: laneStartTime,
    endTime,
  };
}
3103
+
3104
+ // ── /orch Command — Full Execution (Step 5) ─────────────────────────