@pi-agents/orchid 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/agents/AGENTS-MANIFEST.md +42 -0
- package/agents/brain.md +42 -0
- package/agents/context-builder.md +46 -0
- package/agents/delegate.md +12 -0
- package/agents/dev-1.md +42 -0
- package/agents/oracle.md +73 -0
- package/agents/planner.md +55 -0
- package/agents/researcher.md +52 -0
- package/agents/reviewer.md +79 -0
- package/agents/scout.md +50 -0
- package/agents/tester.md +45 -0
- package/agents/worker.md +55 -0
- package/extensions/ralph.ts +1 -0
- package/extensions/reviewer-extension.ts +125 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/package.json +63 -0
- package/prompts/gather-context-and-clarify.md +13 -0
- package/prompts/parallel-cleanup.md +59 -0
- package/prompts/parallel-context-build.md +53 -0
- package/prompts/parallel-handoff-plan.md +59 -0
- package/prompts/parallel-research.md +50 -0
- package/prompts/parallel-review.md +54 -0
- package/prompts/review-loop.md +41 -0
- package/skills/orchid/SKILL.md +214 -0
- package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
- package/skills/orchid/orchid-converge/SKILL.md +124 -0
- package/skills/orchid/orchid-decompose/SKILL.md +201 -0
- package/skills/orchid/orchid-doctor/SKILL.md +162 -0
- package/skills/orchid/orchid-investigate/SKILL.md +102 -0
- package/skills/orchid/orchid-launch/SKILL.md +147 -0
- package/skills/ralph/SKILL.md +73 -0
- package/skills/subagents/pi-subagents/SKILL.md +813 -0
- package/src/index.ts +7 -0
- package/src/orchestrator/abort.ts +534 -0
- package/src/orchestrator/agent-bridge-extension.ts +1020 -0
- package/src/orchestrator/agent-host.ts +954 -0
- package/src/orchestrator/cleanup.ts +776 -0
- package/src/orchestrator/config-loader.ts +1412 -0
- package/src/orchestrator/config-schema.ts +690 -0
- package/src/orchestrator/config.ts +81 -0
- package/src/orchestrator/context-window.ts +66 -0
- package/src/orchestrator/diagnostic-reports.ts +475 -0
- package/src/orchestrator/diagnostics.ts +394 -0
- package/src/orchestrator/discovery.ts +1833 -0
- package/src/orchestrator/engine-worker.ts +415 -0
- package/src/orchestrator/engine.ts +5940 -0
- package/src/orchestrator/execution.ts +3104 -0
- package/src/orchestrator/extension.ts +5934 -0
- package/src/orchestrator/formatting.ts +785 -0
- package/src/orchestrator/git.ts +88 -0
- package/src/orchestrator/index.ts +28 -0
- package/src/orchestrator/lane-runner.ts +1787 -0
- package/src/orchestrator/mailbox.ts +780 -0
- package/src/orchestrator/merge.ts +3414 -0
- package/src/orchestrator/messages.ts +1062 -0
- package/src/orchestrator/migrations.ts +278 -0
- package/src/orchestrator/naming.ts +117 -0
- package/src/orchestrator/path-resolver.ts +275 -0
- package/src/orchestrator/persistence.ts +2625 -0
- package/src/orchestrator/process-registry.ts +452 -0
- package/src/orchestrator/quality-gate.ts +1085 -0
- package/src/orchestrator/resume.ts +3488 -0
- package/src/orchestrator/sessions.ts +57 -0
- package/src/orchestrator/settings-loader.ts +136 -0
- package/src/orchestrator/settings-tui.ts +2208 -0
- package/src/orchestrator/sidecar-telemetry.ts +267 -0
- package/src/orchestrator/supervisor.ts +4548 -0
- package/src/orchestrator/task-executor-core.ts +675 -0
- package/src/orchestrator/tmux-compat.ts +37 -0
- package/src/orchestrator/tool-allowlist-constants.ts +37 -0
- package/src/orchestrator/types.ts +4465 -0
- package/src/orchestrator/verification.ts +547 -0
- package/src/orchestrator/waves.ts +1564 -0
- package/src/orchestrator/workspace.ts +707 -0
- package/src/orchestrator/worktree.ts +2725 -0
- package/src/ralph/index.ts +825 -0
- package/src/subagents/agents/agent-management.ts +648 -0
- package/src/subagents/agents/agent-scope.ts +6 -0
- package/src/subagents/agents/agent-selection.ts +23 -0
- package/src/subagents/agents/agent-serializer.ts +86 -0
- package/src/subagents/agents/agents.ts +832 -0
- package/src/subagents/agents/chain-serializer.ts +137 -0
- package/src/subagents/agents/frontmatter.ts +29 -0
- package/src/subagents/agents/identity.ts +30 -0
- package/src/subagents/agents/skills.ts +632 -0
- package/src/subagents/extension/config.ts +16 -0
- package/src/subagents/extension/control-notices.ts +92 -0
- package/src/subagents/extension/doctor.ts +199 -0
- package/src/subagents/extension/fanout-child.ts +170 -0
- package/src/subagents/extension/index.ts +573 -0
- package/src/subagents/extension/schemas.ts +168 -0
- package/src/subagents/intercom/intercom-bridge.ts +379 -0
- package/src/subagents/intercom/result-intercom.ts +377 -0
- package/src/subagents/runs/background/async-execution.ts +712 -0
- package/src/subagents/runs/background/async-job-tracker.ts +310 -0
- package/src/subagents/runs/background/async-resume.ts +345 -0
- package/src/subagents/runs/background/async-status.ts +325 -0
- package/src/subagents/runs/background/completion-dedupe.ts +63 -0
- package/src/subagents/runs/background/notify.ts +108 -0
- package/src/subagents/runs/background/parallel-groups.ts +45 -0
- package/src/subagents/runs/background/result-watcher.ts +307 -0
- package/src/subagents/runs/background/run-id-resolver.ts +83 -0
- package/src/subagents/runs/background/run-status.ts +269 -0
- package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
- package/src/subagents/runs/background/subagent-runner.ts +1808 -0
- package/src/subagents/runs/background/top-level-async.ts +13 -0
- package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
- package/src/subagents/runs/foreground/chain-execution.ts +938 -0
- package/src/subagents/runs/foreground/execution.ts +918 -0
- package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
- package/src/subagents/runs/shared/completion-guard.ts +147 -0
- package/src/subagents/runs/shared/long-running-guard.ts +175 -0
- package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
- package/src/subagents/runs/shared/model-fallback.ts +103 -0
- package/src/subagents/runs/shared/nested-events.ts +819 -0
- package/src/subagents/runs/shared/nested-path.ts +52 -0
- package/src/subagents/runs/shared/nested-render.ts +115 -0
- package/src/subagents/runs/shared/parallel-utils.ts +109 -0
- package/src/subagents/runs/shared/pi-args.ts +220 -0
- package/src/subagents/runs/shared/pi-spawn.ts +115 -0
- package/src/subagents/runs/shared/run-history.ts +60 -0
- package/src/subagents/runs/shared/single-output.ts +164 -0
- package/src/subagents/runs/shared/subagent-control.ts +226 -0
- package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
- package/src/subagents/runs/shared/worktree.ts +577 -0
- package/src/subagents/shared/artifacts.ts +98 -0
- package/src/subagents/shared/atomic-json.ts +16 -0
- package/src/subagents/shared/file-coalescer.ts +40 -0
- package/src/subagents/shared/fork-context.ts +76 -0
- package/src/subagents/shared/formatters.ts +133 -0
- package/src/subagents/shared/jsonl-writer.ts +81 -0
- package/src/subagents/shared/model-info.ts +78 -0
- package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
- package/src/subagents/shared/session-identity.ts +10 -0
- package/src/subagents/shared/session-tokens.ts +44 -0
- package/src/subagents/shared/settings.ts +397 -0
- package/src/subagents/shared/status-format.ts +49 -0
- package/src/subagents/shared/types.ts +822 -0
- package/src/subagents/shared/utils.ts +450 -0
- package/src/subagents/slash/prompt-template-bridge.ts +397 -0
- package/src/subagents/slash/slash-bridge.ts +174 -0
- package/src/subagents/slash/slash-commands.ts +528 -0
- package/src/subagents/slash/slash-live-state.ts +292 -0
- package/src/subagents/tui/render-helpers.ts +80 -0
- package/src/subagents/tui/render.ts +1358 -0
- package/templates/agents/local/supervisor.md +33 -0
- package/templates/agents/local/task-merger.md +27 -0
- package/templates/agents/local/task-reviewer.md +30 -0
- package/templates/agents/local/task-worker.md +34 -0
- package/templates/agents/supervisor-routing.md +92 -0
- package/templates/agents/supervisor.md +229 -0
- package/templates/agents/task-merger.md +214 -0
- package/templates/agents/task-reviewer.md +260 -0
- package/templates/agents/task-worker-segment.md +44 -0
- package/templates/agents/task-worker.md +557 -0
- package/templates/tasks/CONTEXT.md +30 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
|
@@ -0,0 +1,3104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lane execution, monitoring, wave execution loop
|
|
3
|
+
* @module orch/execution
|
|
4
|
+
*/
|
|
5
|
+
import {
|
|
6
|
+
readFileSync,
|
|
7
|
+
existsSync,
|
|
8
|
+
statSync,
|
|
9
|
+
unlinkSync,
|
|
10
|
+
mkdirSync,
|
|
11
|
+
writeFileSync,
|
|
12
|
+
copyFileSync,
|
|
13
|
+
} from "fs";
|
|
14
|
+
import { access as fsAccess, readFile as fsReadFile, stat as fsStat } from "fs/promises";
|
|
15
|
+
import { join, dirname, basename, resolve, relative, delimiter as pathDelimiter } from "path";
|
|
16
|
+
import { userInfo } from "os";
|
|
17
|
+
|
|
18
|
+
import {
|
|
19
|
+
DONE_GRACE_MS,
|
|
20
|
+
EXECUTION_POLL_INTERVAL_MS,
|
|
21
|
+
ExecutionError,
|
|
22
|
+
SESSION_SPAWN_RETRY_MAX,
|
|
23
|
+
} from "./types.ts";
|
|
24
|
+
import type {
|
|
25
|
+
AllocatedLane,
|
|
26
|
+
AllocatedTask,
|
|
27
|
+
DependencyGraph,
|
|
28
|
+
LaneExecutionResult,
|
|
29
|
+
LaneMonitorSnapshot,
|
|
30
|
+
LaneTaskOutcome,
|
|
31
|
+
LaneTaskStatus,
|
|
32
|
+
MonitorState,
|
|
33
|
+
MtimeTracker,
|
|
34
|
+
OrchestratorConfig,
|
|
35
|
+
ParsedTask,
|
|
36
|
+
TaskMonitorSnapshot,
|
|
37
|
+
WaveExecutionResult,
|
|
38
|
+
WorkspaceConfig,
|
|
39
|
+
ExecutionUnit,
|
|
40
|
+
PacketPaths,
|
|
41
|
+
RuntimeAgentId,
|
|
42
|
+
RuntimeAgentRole,
|
|
43
|
+
RuntimeLaneSnapshot,
|
|
44
|
+
RuntimeRegistry,
|
|
45
|
+
SupervisorAlertCallback,
|
|
46
|
+
} from "./types.ts";
|
|
47
|
+
import { resolvePacketPaths, buildRuntimeAgentId } from "./types.ts";
|
|
48
|
+
import type { TaskExitDiagnostic } from "./diagnostics.ts";
|
|
49
|
+
import {
|
|
50
|
+
readRegistrySnapshot,
|
|
51
|
+
readLaneSnapshot,
|
|
52
|
+
isTerminalStatus,
|
|
53
|
+
isProcessAlive,
|
|
54
|
+
detectOrphans,
|
|
55
|
+
markOrphansCrashed,
|
|
56
|
+
buildRegistrySnapshot,
|
|
57
|
+
writeRegistrySnapshot,
|
|
58
|
+
writeLaneSnapshot,
|
|
59
|
+
} from "./process-registry.ts";
|
|
60
|
+
import { allocateLanes } from "./waves.ts";
|
|
61
|
+
import { resolveOperatorId } from "./naming.ts";
|
|
62
|
+
import { runGit, runGitWithEnv } from "./git.ts";
|
|
63
|
+
import { resolveTaskplanePackageFile, resolveTaskplaneAgentTemplate } from "./path-resolver.ts";
|
|
64
|
+
import { resolvePointer, loadWorkspaceConfig } from "./workspace.ts";
|
|
65
|
+
|
|
66
|
+
// ── OrchID Package File Resolution ────────────────────────────────
|
|
67
|
+
// getNpmGlobalRoot() and resolveTaskplanePackageFile() consolidated in path-resolver.ts (TP-157)
|
|
68
|
+
|
|
69
|
+
// ── RPC Wrapper Path Resolution ──────────────────────────────────────
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Find the rpc-wrapper.mjs path for lane sessions.
|
|
73
|
+
* @see resolveTaskplanePackageFile for resolution order
|
|
74
|
+
*/
|
|
75
|
+
// resolveRpcWrapperPath removed (TP-120 remediation: legacy session-backend dead code)
|
|
76
|
+
|
|
77
|
+
// ── Telemetry Helpers ────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
// resolveTelemOpId removed (TP-120 remediation: only consumer was generateTelemetryPaths)
|
|
80
|
+
|
|
81
|
+
// sanitizeForFilename + generateTelemetryPaths removed (TP-120 remediation: legacy telemetry dead code)
|
|
82
|
+
|
|
83
|
+
// generateTelemetryPaths removed (TP-120 remediation: legacy telemetry sidecar dead code)
|
|
84
|
+
|
|
85
|
+
// ── Execution Helpers ────────────────────────────────────────────────
|
|
86
|
+
|
|
87
|
+
/**
 * Write one structured lane-execution log line to stderr.
 *
 * Line shape: `[orch] {laneId}/{taskId}: {message}`; when `extra` is given,
 * ` (key=value key=value ...)` is appended, with each value rendered through
 * template-literal stringification.
 *
 * Convention for callers: pass only correlation IDs and paths (batchId,
 * laneId, taskId, sessionName) — no PII.
 *
 * @param laneId  - Lane identifier (first half of the log prefix)
 * @param taskId  - Task identifier (second half of the log prefix)
 * @param message - Human-readable log message
 * @param extra   - Optional key/value fields appended in parentheses
 */
export function execLog(
	laneId: string,
	taskId: string,
	message: string,
	// TP-195: typed as `Record<string, unknown>` (previously limited to
	// string|number|boolean values) so callers can hand over structured
	// values without TS errors. Rendering is plain `${value}` coercion:
	// primitives print as-is, arrays print comma-separated, and plain
	// objects print as `[object Object]`.
	extra?: Record<string, unknown>,
): void {
	let line = `[orch] ${laneId}/${taskId}: ${message}`;
	if (extra) {
		const pairs: string[] = [];
		for (const [key, value] of Object.entries(extra)) {
			pairs.push(`${key}=${value}`);
		}
		// An empty `extra` object still yields a trailing " ()".
		line += ` (${pairs.join(" ")})`;
	}
	console.error(line);
}
|
|
119
|
+
|
|
120
|
+
/**
 * TP-112: Liveness check for a V2 agent, answered from the cached process
 * registry (see `setV2LivenessRegistryCache`).
 *
 * An entry counts as alive when its status is non-terminal AND its PID is
 * reported running by `isProcessAlive`. Lookups are attempted in this order:
 *   1. registry key equal to `agentIdOrSessionName`;
 *   2. registry key `${agentIdOrSessionName}-worker` (the monitor passes the
 *      lane session name while the registry is keyed by agent ID);
 *   3. when `laneNumber` is provided, any entry with role "worker" on that
 *      lane number.
 *
 * Returns false when no registry is cached or nothing matches.
 *
 * @param agentIdOrSessionName - Agent ID or lane session name to look up
 * @param _runtimeBackend - Not read by this function; kept in the signature for callers
 * @param laneNumber - Optional global lane number used for the fallback scan
 * @returns true if a matching agent is alive
 * @since TP-112
 */
export function isV2AgentAlive(
	agentIdOrSessionName: string,
	_runtimeBackend?: RuntimeBackend,
	laneNumber?: number,
): boolean {
	const registry = _v2LivenessRegistryCache;
	if (!registry) return false;
	const { agents } = registry;

	// Keyed lookups: exact key first, then the "-worker" suffixed key.
	const candidateKeys = [agentIdOrSessionName, `${agentIdOrSessionName}-worker`];
	for (const key of candidateKeys) {
		const entry = agents[key];
		if (entry && !isTerminalStatus(entry.status) && isProcessAlive(entry.pid)) {
			return true;
		}
	}

	// TP-148: in workspace mode the lane session ID carries a repoId and a
	// repo-local lane number (e.g. "orch-henry-api-lane-1"), whereas registry
	// keys use global lane numbers without repoId (e.g.
	// "orch-henry-lane-3-worker"), so keyed lookups miss. Scan by global lane
	// number instead when the caller supplied one.
	if (laneNumber == null) return false;
	return Object.values(agents).some(
		(agent) =>
			agent.laneNumber === laneNumber &&
			agent.role === "worker" &&
			!isTerminalStatus(agent.status) &&
			isProcessAlive(agent.pid),
	);
}
|
|
169
|
+
|
|
170
|
+
/**
 * Registry snapshot shared by the V2 liveness/kill helpers in this module.
 * `null` means no snapshot is currently cached.
 * @since TP-112
 */
let _v2LivenessRegistryCache: RuntimeRegistry | null = null;

/**
 * Install (or clear, by passing `null`) the registry snapshot used by the V2
 * liveness helpers.
 *
 * Intended to be called once at the start of each monitor poll so per-task
 * checks reuse one snapshot instead of re-reading the registry file.
 *
 * @param registry - Snapshot to cache, or `null` to clear the cache
 * @since TP-112
 */
export function setV2LivenessRegistryCache(registry: RuntimeRegistry | null): void {
	_v2LivenessRegistryCache = registry;
}
|
|
181
|
+
|
|
182
|
+
/**
 * TP-112: Send SIGTERM to the live V2 agents that belong to a lane.
 *
 * Registry source: the monitor's cached snapshot when one is set; otherwise a
 * fresh snapshot is read, but only when both `stateRoot` and `batchId` are
 * supplied. With no registry available the call is a no-op.
 *
 * Targets, in order:
 *   1. registry keys `${sessionName}-worker`, `${sessionName}-reviewer`, and
 *      `${sessionName}` itself;
 *   2. when `options.laneNumber` is given, every entry on that lane number
 *      (any role).
 * Only entries with a non-terminal status and a running PID are signalled,
 * and each PID is signalled at most once per call. Errors raised while
 * signalling are swallowed (the process is assumed to be gone already).
 *
 * @param sessionName - Lane session name used to derive registry keys
 * @param options - Optional registry location (`stateRoot` + `batchId`),
 *   log prefix (`logContext`, default "monitor"), and global `laneNumber`
 * @since TP-112
 */
export function killV2LaneAgents(
	sessionName: string,
	options?: { stateRoot?: string; batchId?: string; logContext?: string; laneNumber?: number },
): void {
	// Only touch the filesystem when the monitor cache is empty.
	const readFreshSnapshot = () =>
		options?.stateRoot && options?.batchId
			? readRegistrySnapshot(options.stateRoot, options.batchId)
			: null;
	const registry = _v2LivenessRegistryCache ?? readFreshSnapshot();
	if (!registry) return;

	const { agents } = registry;
	const logContext = options?.logContext ?? "monitor";
	const signalledPids = new Set<number>();

	// Best-effort SIGTERM; a PID is recorded (and logged) only if kill succeeded.
	const terminate = (pid: number, label: string, note: string): void => {
		try {
			process.kill(pid, "SIGTERM");
			signalledPids.add(pid);
			execLog(logContext, label, note);
		} catch {
			/* already dead */
		}
	};

	// Pass 1: entries addressed by session-name-derived keys.
	const keys = ["-worker", "-reviewer", ""].map((suffix) => `${sessionName}${suffix}`);
	for (const key of keys) {
		const manifest = agents[key];
		if (!manifest) continue;
		if (isTerminalStatus(manifest.status)) continue;
		if (!isProcessAlive(manifest.pid)) continue;
		if (signalledPids.has(manifest.pid)) continue;
		terminate(manifest.pid, key, `killed V2 agent (PID ${manifest.pid})`);
	}

	// Pass 2 (TP-148): workspace-mode fallback. Session-name keys can miss
	// because of the repoId / local-vs-global lane number mismatch, so also
	// match on the global lane number when provided.
	if (options?.laneNumber == null) return;
	for (const agent of Object.values(agents)) {
		if (agent.laneNumber !== options.laneNumber) continue;
		if (isTerminalStatus(agent.status)) continue;
		if (!isProcessAlive(agent.pid)) continue;
		if (signalledPids.has(agent.pid)) continue;
		terminate(agent.pid, agent.agentId, `killed V2 agent by lane number (PID ${agent.pid})`);
	}
}
|
|
243
|
+
|
|
244
|
+
// ── Async File/Status Helpers (TP-070) ───────────────────────────────
|
|
245
|
+
|
|
246
|
+
/**
 * Async version of readTaskStatusTail — reads the tail of STATUS.md without
 * blocking the event loop.
 *
 * Processing: CRLF is normalized to LF and the whole content is trimmed; the
 * last `maxLines` lines are kept and trimmed again; if the result is still
 * longer than `maxChars`, only its last `maxChars` characters are returned.
 *
 * @param statusPath - Path to STATUS.md
 * @param maxLines - Maximum number of lines to return (non-positive → "")
 * @param maxChars - Maximum character count (non-positive → "")
 * @returns Promise resolving to status tail text (empty string if the file is
 *   missing, unreadable, empty, or a limit is not a positive number)
 *
 * @since TP-070
 */
export async function readTaskStatusTailAsync(
	statusPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): Promise<string> {
	// A zero/negative/NaN limit means "nothing requested". Without this guard
	// `slice(-0)` is `slice(0)`, which would return the ENTIRE content.
	if (!(maxLines > 0) || !(maxChars > 0)) return "";
	try {
		// No separate access() probe: a missing/unreadable file makes readFile
		// reject, which lands in the same catch and yields "".
		const raw = (await fsReadFile(statusPath, "utf-8")).replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
		if (!tail) return "";
		return tail.length > maxChars ? tail.slice(-maxChars) : tail;
	} catch {
		return "";
	}
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Legacy lane environment-variable helper removed in TP-120.
|
|
280
|
+
*
|
|
281
|
+
* Runtime V2 lane execution now runs through lane-runner/agent-host and no
|
|
282
|
+
* longer injects task-runner autostart/session env vars from this module.
|
|
283
|
+
*/
|
|
284
|
+
// buildLaneEnvVars removed (TP-120 remediation: legacy lane-session env var path, dead code)
|
|
285
|
+
|
|
286
|
+
/**
 * Accessor for a lane's session identifier.
 *
 * @param lane - Any object exposing `laneSessionId`
 * @returns The lane's `laneSessionId`, unchanged
 */
function laneSessionIdOf(lane: Pick<AllocatedLane, "laneSessionId">): string {
	const { laneSessionId } = lane;
	return laneSessionId;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Build the lane session log path for one task execution.
 *
 * The log lives inside the lane worktree at
 * `<worktreePath>/.pi/orch-logs/<laneSessionId>-<taskId>.log`, keeping
 * per-lane execution artifacts next to task state so they remain available
 * after failures.
 *
 * @param lane - Lane providing `worktreePath` and the session ID
 * @param task - Task providing `taskId`
 * @returns Log file path under the lane worktree
 */
export function resolveLaneLogPath(lane: AllocatedLane, task: AllocatedTask): string {
	const logFileName = `${laneSessionIdOf(lane)}-${task.taskId}.log`;
	return join(lane.worktreePath, ".pi", "orch-logs", logFileName);
}
|
|
299
|
+
|
|
300
|
+
/**
 * Worktree-relative lane log path (`.pi/orch-logs/<laneSessionId>-<taskId>.log`),
 * used by the legacy shell-spawn path.
 *
 * The result always uses forward slashes; a relative path sidesteps Windows
 * drive-letter parsing issues in shell redirection.
 *
 * @param lane - Lane providing the session ID
 * @param task - Task providing `taskId`
 * @returns Forward-slash relative log path
 */
export function resolveLaneLogRelativePath(lane: AllocatedLane, task: AllocatedTask): string {
	const logFileName = `${laneSessionIdOf(lane)}-${task.taskId}.log`;
	const nativePath = join(".pi", "orch-logs", logFileName);
	// path.join emits backslashes on Windows; normalize for shell use.
	return nativePath.replace(/\\/g, "/");
}
|
|
308
|
+
|
|
309
|
+
/**
 * Read a tail snippet from a lane log file for failure diagnostics.
 *
 * Processing: CRLF is normalized to LF; the last `maxLines` lines are kept
 * and trimmed; if the result is longer than `maxChars`, only its last
 * `maxChars` characters are returned. Note that a trailing newline in the
 * file counts as a final (empty) line for the purposes of `maxLines`.
 *
 * @param logPath - Path to the lane log file
 * @param maxLines - Maximum number of lines to keep (non-positive → "")
 * @param maxChars - Maximum character count (non-positive → "")
 * @returns Tail text, or "" if the file is missing, unreadable, blank, or a
 *   limit is not a positive number
 */
export function readLaneLogTail(
	logPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	// A zero/negative/NaN limit means "nothing requested". Without this guard
	// `slice(-0)` is `slice(0)`, which would return the ENTIRE log.
	if (!(maxLines > 0) || !(maxChars > 0)) return "";
	try {
		// No existsSync probe: a missing file makes readFileSync throw, which
		// is handled by the same catch and yields "".
		const raw = readFileSync(logPath, "utf-8").replace(/\r\n/g, "\n");
		const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
		if (!tail) return "";
		return tail.length > maxChars ? tail.slice(-maxChars) : tail;
	} catch {
		return "";
	}
}
|
|
327
|
+
|
|
328
|
+
/**
 * Async version of readLaneLogTail — reads the lane log tail without
 * blocking the event loop.
 *
 * Processing: CRLF is normalized to LF; the last `maxLines` lines are kept
 * and trimmed; if the result is longer than `maxChars`, only its last
 * `maxChars` characters are returned.
 *
 * @param logPath - Path to the lane log file
 * @param maxLines - Maximum number of lines to keep (non-positive → "")
 * @param maxChars - Maximum character count (non-positive → "")
 * @returns Promise resolving to the tail text, or "" if the file is missing,
 *   unreadable, blank, or a limit is not a positive number
 *
 * @since TP-070
 */
export async function readLaneLogTailAsync(
	logPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): Promise<string> {
	// A zero/negative/NaN limit means "nothing requested". Without this guard
	// `slice(-0)` is `slice(0)`, which would return the ENTIRE log.
	if (!(maxLines > 0) || !(maxChars > 0)) return "";
	try {
		// No separate access() probe: a missing/unreadable file makes readFile
		// reject, which lands in the same catch and yields "".
		const raw = (await fsReadFile(logPath, "utf-8")).replace(/\r\n/g, "\n");
		const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
		if (!tail) return "";
		return tail.length > maxChars ? tail.slice(-maxChars) : tail;
	} catch {
		return "";
	}
}
|
|
353
|
+
|
|
354
|
+
/**
 * Non-blocking existence probe, meant to replace existsSync in polling paths.
 *
 * Implemented on top of `fs/promises.access`: the promise resolves to true
 * when the access check succeeds and to false when it rejects for any reason.
 *
 * @param filePath - Path to check
 * @returns Promise resolving to true if the path is accessible
 *
 * @since TP-070
 */
export async function fileExistsAsync(filePath: string): Promise<boolean> {
	return fsAccess(filePath).then(
		() => true,
		() => false,
	);
}
|
|
371
|
+
|
|
372
|
+
/**
 * Read a tail snippet from a task's STATUS.md for failure diagnostics.
 *
 * Processing: CRLF is normalized to LF and the whole content is trimmed; the
 * last `maxLines` lines are kept and trimmed again; if the result is still
 * longer than `maxChars`, only its last `maxChars` characters are returned.
 *
 * @param statusPath - Path to STATUS.md
 * @param maxLines - Maximum number of lines to keep (non-positive → "")
 * @param maxChars - Maximum character count (non-positive → "")
 * @returns Tail text, or "" if the file is missing, unreadable, empty, or a
 *   limit is not a positive number
 */
export function readTaskStatusTail(
	statusPath: string,
	maxLines: number = 40,
	maxChars: number = 1200,
): string {
	// A zero/negative/NaN limit means "nothing requested". Without this guard
	// `slice(-0)` is `slice(0)`, which would return the ENTIRE content.
	if (!(maxLines > 0) || !(maxChars > 0)) return "";
	try {
		// No existsSync probe: a missing file makes readFileSync throw, which
		// is handled by the same catch and yields "".
		const raw = readFileSync(statusPath, "utf-8").replace(/\r\n/g, "\n").trim();
		if (!raw) return "";
		const tail = raw.split("\n").slice(-maxLines).join("\n").trim();
		if (!tail) return "";
		return tail.length > maxChars ? tail.slice(-maxChars) : tail;
	} catch {
		return "";
	}
}
|
|
391
|
+
|
|
392
|
+
/**
 * Output of canonical task-folder path resolution.
 *
 * Bundles the resolved task folder with the two marker-file paths derived
 * from it, so every caller probes the same locations instead of re-deriving
 * them with diverging logic.
 */
export interface ResolvedTaskPaths {
	/** Resolved task folder — may sit inside a lane worktree or outside it. */
	taskFolderResolved: string;
	/** Path of the `.DONE` marker file inside the resolved folder. */
	donePath: string;
	/** Path of `STATUS.md` inside the resolved folder. */
	statusPath: string;
}
|
|
406
|
+
|
|
407
|
+
/**
 * Canonical task-folder path resolver.
 *
 * Single source of truth for translating a task folder path (as stored in
 * ParsedTask) into the filesystem paths probed for `.DONE` and `STATUS.md`.
 * The folder is chosen as follows:
 *
 * 1. **Task folder inside repoRoot** (repo mode and workspace mode alike):
 *    the path relative to repoRoot is re-rooted under `worktreePath`, since
 *    the worktree mirrors the repo layout and the worker writes its marker
 *    files there.
 *
 * 2. **Task folder outside repoRoot, workspace mode**:
 *    `<worktreePath>/.orchid-tasks/<taskDirName>/` is used.
 *    NOTE(review): this relies on the task files having been staged into that
 *    directory; the original comment credited `buildLaneEnvVars`, which this
 *    module records as removed in TP-120 — confirm the current staging step.
 *
 * 3. **Task folder outside repoRoot, not workspace mode**:
 *    the absolute task folder path is used directly.
 *
 * Archive fallback: when neither `.DONE` nor `STATUS.md` exists in the chosen
 * folder, `<parent>/archive/<taskDirName>/` is probed (the worker may have
 * archived the folder during its "Documentation & Delivery" step). If the
 * archive has neither file either, the primary paths are returned so the
 * caller can keep probing them.
 *
 * @param taskFolder - Absolute task folder path (from ParsedTask.taskFolder)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Selects case 2 over case 3 for external task folders
 * @returns Resolved paths for task folder, .DONE, and STATUS.md
 */
export function resolveCanonicalTaskPaths(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): ResolvedTaskPaths {
	// Compare on absolute, forward-slash paths so Windows separators don't
	// defeat the prefix test.
	const toPosix = (p: string): string => resolve(p).replace(/\\/g, "/");
	const repoRootNorm = toPosix(repoRoot);
	const folderNorm = toPosix(taskFolder);
	// A repo at a filesystem/drive root already ends in "/"; appending another
	// would produce "//" and never match.
	const repoPrefix = repoRootNorm.endsWith("/") ? repoRootNorm : `${repoRootNorm}/`;

	let resolvedFolder: string;
	if (folderNorm.startsWith(repoPrefix)) {
		// Inside the repo (either mode): mirror the relative path in the worktree.
		resolvedFolder = join(worktreePath, folderNorm.slice(repoPrefix.length));
	} else if (isWorkspaceMode) {
		// Cross-repo task in workspace mode: look under the worktree's
		// .orchid-tasks/<taskDirName>/ staging directory.
		resolvedFolder = join(worktreePath, ".orchid-tasks", basename(resolve(taskFolder)));
	} else {
		// Fallback: use the absolute path directly.
		resolvedFolder = resolve(taskFolder);
	}

	const pathsIn = (folder: string): ResolvedTaskPaths => ({
		taskFolderResolved: folder,
		donePath: join(folder, ".DONE"),
		statusPath: join(folder, "STATUS.md"),
	});
	const hasMarker = (paths: ResolvedTaskPaths): boolean =>
		existsSync(paths.donePath) || existsSync(paths.statusPath);

	// Primary location wins whenever either marker file is present.
	const primary = pathsIn(resolvedFolder);
	if (hasMarker(primary)) return primary;

	// Archive fallback: `<parent>/archive/<taskDirName>/`. dirname/basename
	// keep the parent absolute even for a folder directly under the root,
	// where manual split("/")/join("/") would yield a relative path.
	const absoluteFolder = resolve(resolvedFolder);
	const archived = pathsIn(
		join(dirname(absoluteFolder), "archive", basename(absoluteFolder)),
	);
	if (hasMarker(archived)) return archived;

	// Nothing exists yet — hand back the primary paths for the caller to probe.
	return primary;
}
|
|
502
|
+
|
|
503
|
+
/**
 * Convenience accessor for a task's `.DONE` marker path.
 *
 * Runs the full `resolveCanonicalTaskPaths` resolution (including its archive
 * fallback) and returns only the `donePath` component, so `.DONE` lookups
 * stay consistent with every other caller of the canonical resolver.
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Forwarded unchanged to the canonical resolver
 * @returns Path to the `.DONE` file as chosen by the canonical resolver
 */
export function resolveTaskDonePath(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): string {
	const { donePath } = resolveCanonicalTaskPaths(
		taskFolder,
		worktreePath,
		repoRoot,
		isWorkspaceMode,
	);
	return donePath;
}
|
|
523
|
+
|
|
524
|
+
/*
|
|
525
|
+
* Removed in TP-120 while decommissioning the legacy session backend.
|
|
526
|
+
*
|
|
527
|
+
* `pollUntilTaskComplete` remains as a test-compatibility stub only.
|
|
528
|
+
* Runtime V2 completion detection now lives in lane-runner + agent-host.
|
|
529
|
+
*/
|
|
530
|
+
// pollUntilTaskComplete function body removed — was ~170 lines of legacy .DONE polling.
|
|
531
|
+
// @ts-ignore — export kept as stub for test compatibility
|
|
532
|
+
export async function pollUntilTaskComplete(
	_lane: AllocatedLane,
	_task: AllocatedTask,
	_config: OrchestratorConfig,
	_repoRoot: string,
	_pauseSignal: { paused: boolean },
	_isWorkspaceMode?: boolean,
): Promise<{ status: LaneTaskStatus; exitReason: string; doneFileFound: boolean }> {
	// Compatibility stub: every argument is ignored and the same "failed"
	// outcome is reported on each call. No polling happens here.
	const removedOutcome: { status: LaneTaskStatus; exitReason: string; doneFileFound: boolean } = {
		status: "failed",
		exitReason: "Legacy pollUntilTaskComplete removed — use V2 lane-runner",
		doneFileFound: false,
	};
	return removedOutcome;
}
|
|
546
|
+
|
|
547
|
+
// ── Post-Task Commit ─────────────────────────────────────────────────
|
|
548
|
+
|
|
549
|
+
/**
 * Commit any uncommitted task artifacts to the lane branch after task completion.
 *
 * The task-runner creates `.DONE` and updates `STATUS.md` via `writeFileSync`,
 * but these changes are never committed to git by the task-runner or the worker.
 * Without this commit, these files are lost when the worktree is reset or removed,
 * and they don't appear in the merge to the base branch.
 *
 * Best-effort: a failed git step is reported through `execLog` and the function
 * returns early; a failed git result never fails the task (the work is already done).
 *
 * @param lane - Allocated lane containing the worktree path
 * @param task - The task that just completed
 * @param laneId - Lane identifier for logging
 */
function commitTaskArtifacts(lane: AllocatedLane, task: AllocatedTask, laneId: string): void {
	const worktreePath = lane.worktreePath;

	// Check if there are any uncommitted changes in the worktree
	const statusResult = runGit(["status", "--porcelain"], worktreePath);
	if (!statusResult.ok) {
		// Previously swallowed silently; log it so a broken worktree is diagnosable.
		execLog(
			laneId,
			task.taskId,
			`post-task status check failed (non-fatal): ${statusResult.stderr.slice(0, 200)}`,
		);
		return;
	}
	if (!statusResult.stdout.trim()) {
		// Clean worktree — the worker already committed everything
		return;
	}

	// Stage all changes in the worktree
	const addResult = runGit(["add", "-A"], worktreePath);
	if (!addResult.ok) {
		execLog(
			laneId,
			task.taskId,
			`post-task stage failed (non-fatal): ${addResult.stderr.slice(0, 200)}`,
		);
		return;
	}

	// Commit with task ID for traceability
	const commitResult = runGit(
		["commit", "-m", `checkpoint: ${task.taskId} task artifacts (.DONE, STATUS.md)`],
		worktreePath,
	);
	if (!commitResult.ok) {
		// "nothing to commit" is not an error — worker may have already committed.
		// git normally prints that message on stdout rather than stderr, so both
		// streams are inspected.
		const commitOutput = `${commitResult.stdout}\n${commitResult.stderr}`;
		if (!commitOutput.includes("nothing to commit")) {
			// Fall back to stdout when stderr is empty so the log line is never blank.
			const detail = commitResult.stderr || commitResult.stdout;
			execLog(
				laneId,
				task.taskId,
				`post-task commit failed (non-fatal): ${detail.slice(0, 200)}`,
			);
		}
		return;
	}

	// First line of git's commit output identifies the new commit in the log.
	execLog(laneId, task.taskId, `committed task artifacts to lane branch`, {
		commit: commitResult.stdout.trim().split("\n")[0],
	});
}
|
|
605
|
+
|
|
606
|
+
// ── STATUS.md Parsing for Worktree ───────────────────────────────────
|
|
607
|
+
|
|
608
|
+
/**
 * Shape of the data extracted from a STATUS.md file in a worktree.
 *
 * The parsers in this file produce it with the same regex patterns as
 * task-runner's parseStatusMd, adapted for monitoring (no direct import —
 * the patterns are repeated here).
 */
export interface ParsedWorktreeStatus {
	/** One summary per parsed step, in the order the steps appear in the file */
	steps: Array<{
		number: number;
		name: string;
		status: "not-started" | "in-progress" | "complete";
		totalChecked: number;
		totalItems: number;
	}>;
	/** Value of the review counter recorded in STATUS.md */
	reviewCounter: number;
	/** Value of the iteration number recorded in STATUS.md */
	iteration: number;
	/** Modification time of the file, in epoch milliseconds */
	mtime: number;
}
|
|
630
|
+
|
|
631
|
+
/**
 * Turn the raw text of a STATUS.md file into a `ParsedWorktreeStatus`.
 *
 * Single implementation shared by the synchronous and asynchronous readers
 * below, so the two cannot drift apart. Uses the same regex patterns as
 * task-runner's parseStatusMd.
 *
 * Recognised lines:
 *   - `**Review Counter:** N` and `**Iteration:** N` anywhere in the file
 *     (the last occurrence wins)
 *   - `### Step N: name` starts a new step
 *   - `**Status:** ...` and `- [ ]` / `- [x]` lines are attributed to the
 *     most recent step; they are ignored before the first step heading
 *
 * @param content - Raw file content (CRLF line endings are normalised)
 * @param mtime - File modification time (epoch ms), copied into the result
 * @returns Parsed steps, counters and the supplied mtime
 */
function parseWorktreeStatusText(content: string, mtime: number): ParsedWorktreeStatus {
	type StepAccumulator = {
		number: number;
		name: string;
		status: "not-started" | "in-progress" | "complete";
		checkboxes: boolean[];
	};

	// Patterns are built once per call rather than once per line.
	const reviewCounterPattern = /\*\*Review Counter:\*\*\s*(\d+)/;
	const iterationPattern = /\*\*Iteration:\*\*\s*(\d+)/;
	const stepHeadingPattern = /^###\s+Step\s+(\d+):\s*(.+)/;
	const statusPattern = /\*\*Status:\*\*\s*(.*)/;
	const checkboxPattern = /^\s*-\s*\[([ xX])\]\s*(.*)/;

	// Collapse a step's checkbox list into checked/total counts.
	const summarize = (step: StepAccumulator): ParsedWorktreeStatus["steps"][number] => ({
		number: step.number,
		name: step.name,
		status: step.status,
		totalChecked: step.checkboxes.filter((checked) => checked).length,
		totalItems: step.checkboxes.length,
	});

	const steps: ParsedWorktreeStatus["steps"] = [];
	let currentStep: StepAccumulator | null = null;
	let reviewCounter = 0;
	let iteration = 0;

	for (const line of content.replace(/\r\n/g, "\n").split("\n")) {
		const rcMatch = line.match(reviewCounterPattern);
		if (rcMatch) reviewCounter = Number.parseInt(rcMatch[1], 10);
		const itMatch = line.match(iterationPattern);
		if (itMatch) iteration = Number.parseInt(itMatch[1], 10);

		const stepMatch = line.match(stepHeadingPattern);
		if (stepMatch) {
			// A new heading closes the previous step.
			if (currentStep) steps.push(summarize(currentStep));
			currentStep = {
				number: Number.parseInt(stepMatch[1], 10),
				name: stepMatch[2].trim(),
				status: "not-started",
				checkboxes: [],
			};
			continue;
		}

		// Status and checkbox lines only count once a step has been opened.
		if (!currentStep) continue;

		const statusMatch = line.match(statusPattern);
		if (statusMatch) {
			const statusText = statusMatch[1];
			const lowered = statusText.toLowerCase();
			// NOTE(review): substring match — a status containing "incomplete" is also
			// classified as complete. Kept as-is to stay aligned with the existing patterns.
			if (statusText.includes("✅") || lowered.includes("complete")) {
				currentStep.status = "complete";
			} else if (statusText.includes("🟨") || statusText.includes("🟡") || lowered.includes("progress")) {
				currentStep.status = "in-progress";
			}
		}

		const checkboxMatch = line.match(checkboxPattern);
		if (checkboxMatch) {
			currentStep.checkboxes.push(checkboxMatch[1].toLowerCase() === "x");
		}
	}

	// The last step has no following heading to close it.
	if (currentStep) steps.push(summarize(currentStep));

	return { steps, reviewCounter, iteration, mtime };
}

/**
 * Parse STATUS.md from a task folder inside a worktree (synchronous).
 *
 * Resolves the file location with `resolveCanonicalTaskPaths`, reads it and
 * parses step statuses and checkbox counts. Never throws for a missing or
 * unreadable file; the problem is reported in the `error` field instead.
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Optional flag passed through unchanged to `resolveCanonicalTaskPaths`
 * @returns `{ parsed, error: null }` on success, `{ parsed: null, error }` otherwise
 */
export function parseWorktreeStatusMd(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): { parsed: ParsedWorktreeStatus | null; error: string | null } {
	// Use canonical resolver for consistent path translation
	const { statusPath } = resolveCanonicalTaskPaths(taskFolder, worktreePath, repoRoot, isWorkspaceMode);

	if (!existsSync(statusPath)) {
		return { parsed: null, error: `STATUS.md not found at ${statusPath}` };
	}

	let content: string;
	let mtime: number;
	try {
		content = readFileSync(statusPath, "utf-8");
		mtime = statSync(statusPath).mtimeMs;
	} catch (err: unknown) {
		// The file can still vanish or be unreadable after the existence check.
		return {
			parsed: null,
			error: `Cannot read STATUS.md: ${err instanceof Error ? err.message : String(err)}`,
		};
	}

	return { parsed: parseWorktreeStatusText(content, mtime), error: null };
}

/**
 * Parse STATUS.md directly from a known absolute path.
 * Unlike parseWorktreeStatusMdAsync, this does NOT re-resolve the path —
 * it reads exactly the file you point it to. Use this when the caller
 * already has the authoritative statusPath (e.g., from buildExecutionUnit).
 *
 * @param statusPath - Absolute path of the STATUS.md file to read
 * @returns `{ parsed, error: null }` on success, `{ parsed: null, error }` otherwise
 * @since TP-501
 */
export async function parseStatusMdAtPath(
	statusPath: string,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	return parseStatusMdContent(statusPath);
}

/**
 * Async version of parseWorktreeStatusMd — resolves the path from
 * taskFolder + worktree context, then reads and parses STATUS.md without
 * blocking the event loop. Used in monitoring poll loops.
 * Use parseStatusMdAtPath instead when the caller already has the authoritative path.
 *
 * @param taskFolder - Absolute task folder path (from main repo)
 * @param worktreePath - Absolute path to the lane worktree
 * @param repoRoot - Absolute path to the main repository root
 * @param isWorkspaceMode - Optional flag passed through unchanged to `resolveCanonicalTaskPaths`
 * @returns `{ parsed, error: null }` on success, `{ parsed: null, error }` otherwise
 * @since TP-070
 */
export async function parseWorktreeStatusMdAsync(
	taskFolder: string,
	worktreePath: string,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	const { statusPath } = resolveCanonicalTaskPaths(taskFolder, worktreePath, repoRoot, isWorkspaceMode);
	return parseStatusMdContent(statusPath);
}

/**
 * Shared async STATUS.md reader — reads and parses from a known path.
 * Handles file-not-found and read failures by returning an `error` string
 * instead of throwing.
 *
 * @param statusPath - Absolute path of the STATUS.md file to read
 * @returns `{ parsed, error: null }` on success, `{ parsed: null, error }` otherwise
 */
async function parseStatusMdContent(
	statusPath: string,
): Promise<{ parsed: ParsedWorktreeStatus | null; error: string | null }> {
	if (!(await fileExistsAsync(statusPath))) {
		return { parsed: null, error: `STATUS.md not found at ${statusPath}` };
	}

	let content: string;
	let mtime: number;
	try {
		content = await fsReadFile(statusPath, "utf-8");
		mtime = (await fsStat(statusPath)).mtimeMs;
	} catch (err: unknown) {
		// The file can still vanish or be unreadable after the existence check.
		return {
			parsed: null,
			error: `Cannot read STATUS.md: ${err instanceof Error ? err.message : String(err)}`,
		};
	}

	return { parsed: parseWorktreeStatusText(content, mtime), error: null };
}
|
|
866
|
+
|
|
867
|
+
// ── State Resolution ─────────────────────────────────────────────────
|
|
868
|
+
|
|
869
|
+
/**
 * Combine liveness, `.DONE` and STATUS.md signals into one monitoring
 * snapshot for a single task.
 *
 * Decision order (first match wins):
 *   1. `.DONE` exists and is accepted as authoritative → "succeeded"
 *   2. Session alive, STATUS.md seen at least once, and its mtime unchanged
 *      for at least `stallTimeoutMs` → lane agents are killed, "stalled"
 *   3. Session not alive → "failed"
 *   4. Otherwise → "running"
 *
 * This function itself only ever reports one of those four statuses.
 *
 * Side effects: updates `tracker` (first sighting, last mtime, stall timer)
 * whenever a parsed STATUS.md is supplied, writes log lines through
 * `execLog`, and calls `killV2LaneAgents` when a stall is declared.
 *
 * @param taskId - Task identifier
 * @param donePath - Absolute path to the .DONE file in the worktree
 * @param sessionName - Lane session name for this lane
 * @param statusResult - Parsed STATUS.md result (`parsed` may be null)
 * @param tracker - Mtime tracker for stall detection (mutated)
 * @param stallTimeoutMs - Stall timeout in milliseconds
 * @param now - Current timestamp (epoch ms) for deterministic testing
 * @param runtimeBackend - When "v2" and `v2Context` is given, liveness is
 *                         derived from the lane snapshot; otherwise
 *                         `isV2AgentAlive` is consulted directly
 * @param v2Context - State root, batch id and lane number used to read the
 *                    lane snapshot
 * @param multiSegmentContext - Optional segment-authority context (TP-196 / #462).
 *                              When provided AND `isFinalSegment === false`,
 *                              `.DONE` is logged as a warning and ignored, so a
 *                              stale or premature marker from a non-final segment
 *                              cannot short-circuit the task to succeeded.
 * @returns Snapshot describing the task's status and progress at `now`
 */
export async function resolveTaskMonitorState(
	taskId: string,
	donePath: string,
	sessionName: string,
	statusResult: { parsed: ParsedWorktreeStatus | null; error: string | null },
	tracker: MtimeTracker,
	stallTimeoutMs: number,
	now: number,
	runtimeBackend?: RuntimeBackend,
	v2Context?: { stateRoot: string; batchId: string; laneNumber: number },
	multiSegmentContext?: { isFinalSegment: boolean; segmentId: string },
): Promise<TaskMonitorSnapshot> {
	// ── Liveness (TP-115/TP-127: backend-aware) ──────────────────
	let sessionAlive: boolean;
	if (runtimeBackend === "v2" && v2Context) {
		const snap = readLaneSnapshot(v2Context.stateRoot, v2Context.batchId, v2Context.laneNumber);
		// How long this tracker has existed; 60s is treated as the startup grace boundary.
		const trackerAgeMs = now - tracker.firstObservedAt;

		if (snap == null || snap.taskId !== taskId) {
			// Snapshot missing, or still describing a previous task on this lane.
			const staleMs = snap?.updatedAt ? now - snap.updatedAt : 0;
			if (staleMs > 30_000) {
				// Stale for 30s+: trust startup grace for a young tracker, otherwise
				// ask the registry so a dead lane-runner is not reported as running forever.
				sessionAlive =
					trackerAgeMs < 60_000 || isV2AgentAlive(sessionName, runtimeBackend, v2Context?.laneNumber);
			} else if (snap == null && trackerAgeMs >= 60_000) {
				// TP-190 (#561): no snapshot at all after the grace period — fall back
				// to the registry instead of assuming alive indefinitely.
				sessionAlive = isV2AgentAlive(sessionName, runtimeBackend, v2Context?.laneNumber);
			} else {
				sessionAlive = true;
			}
		} else {
			// TP-159 (#461): ghost-worker fast-fail. The snapshot is for this task,
			// but it stopped updating, grace has elapsed and the agent is reported dead.
			const ghostWorker =
				snap.updatedAt &&
				now - snap.updatedAt > stallTimeoutMs / 2 &&
				trackerAgeMs >= 60_000 &&
				!isV2AgentAlive(sessionName, runtimeBackend, v2Context?.laneNumber);
			if (ghostWorker) {
				execLog("monitor", taskId, "ghost worker fast-fail — dead PID + stale snapshot", {
					session: sessionName,
					snapStaleMs: now - snap.updatedAt,
					trackerAgeMs,
					halfStallTimeoutMs: stallTimeoutMs / 2,
				});
				sessionAlive = false;
			} else {
				sessionAlive = snap.status === "running";
			}
		}
	} else {
		sessionAlive = isV2AgentAlive(sessionName, "v2", v2Context?.laneNumber);
	}

	const doneFileFound = await fileExistsAsync(donePath);

	// ── Progress figures from the parsed STATUS.md ───────────────
	let currentStepName: string | null = null;
	let currentStepNumber: number | null = null;
	let totalSteps = 0;
	let totalChecked = 0;
	let totalItems = 0;
	let iteration = 0;
	let reviewCounter = 0;
	const parseError = statusResult.error;

	const parsedStatus = statusResult.parsed;
	if (parsedStatus) {
		const { steps } = parsedStatus;
		totalSteps = steps.length;
		iteration = parsedStatus.iteration;
		reviewCounter = parsedStatus.reviewCounter;
		totalChecked = steps.reduce((sum, step) => sum + step.totalChecked, 0);
		totalItems = steps.reduce((sum, step) => sum + step.totalItems, 0);

		// Current step: first in-progress, else first not-started, else the last step.
		const focusStep =
			steps.find((step) => step.status === "in-progress") ??
			steps.find((step) => step.status === "not-started") ??
			steps[steps.length - 1];
		if (focusStep) {
			currentStepName = focusStep.name;
			currentStepNumber = focusStep.number;
		}

		// Mtime tracker: a first read or a changed mtime clears the stall timer;
		// an unchanged mtime starts the timer if it is not already running.
		const observedMtime = parsedStatus.mtime;
		if (!tracker.statusFileSeenOnce || observedMtime !== tracker.lastMtime) {
			tracker.statusFileSeenOnce = true;
			tracker.lastMtime = observedMtime;
			tracker.stallTimerStart = null;
		} else if (tracker.stallTimerStart === null) {
			tracker.stallTimerStart = now;
		}
	}

	// Every outcome shares the same fields; only these four vary.
	const buildSnapshot = (
		status: TaskMonitorSnapshot["status"],
		alive: boolean,
		doneFound: boolean,
		stallReason: string | null,
	): TaskMonitorSnapshot => ({
		taskId,
		status,
		currentStepName,
		currentStepNumber,
		totalSteps,
		totalChecked,
		totalItems,
		sessionAlive: alive,
		doneFileFound: doneFound,
		stallReason,
		lastHeartbeat: tracker.lastMtime,
		observedAt: now,
		parseError,
		iteration,
		reviewCounter,
	});

	// ── Priority 1: .DONE file found → succeeded ────────────────
	// TP-196 / #462: for a non-final segment the marker is not authoritative;
	// warn and fall through so the task stays in a non-terminal state.
	if (doneFileFound) {
		const nonFinalSegment = multiSegmentContext?.isFinalSegment === false;
		if (!nonFinalSegment) {
			return buildSnapshot("succeeded", sessionAlive, true, null);
		}
		execLog(
			"monitor",
			taskId,
			`WARN: .DONE present for non-final segment '${multiSegmentContext?.segmentId}' — ignoring (#462 guard)`,
			{
				session: sessionName,
				segmentId: multiSegmentContext?.segmentId,
				donePath,
			},
		);
	}

	// ── Priority 2: Stall timeout reached ────────────────────────
	if (
		sessionAlive &&
		tracker.statusFileSeenOnce &&
		tracker.stallTimerStart !== null &&
		now - tracker.stallTimerStart >= stallTimeoutMs
	) {
		const stallMinutes = Math.round((now - tracker.stallTimerStart) / 60_000);
		const stallReason = `STATUS.md unchanged for ${stallMinutes} minutes (threshold: ${Math.round(stallTimeoutMs / 60_000)} min)`;

		// Kill the agent (backend-aware)
		execLog("monitor", taskId, `stall detected — killing agent`, {
			session: sessionName,
			stallMinutes,
			backend: runtimeBackend ?? "legacy",
		});
		killV2LaneAgents(sessionName, { laneNumber: v2Context?.laneNumber });

		// Reported as not alive because the agent was just killed.
		return buildSnapshot("stalled", false, false, stallReason);
	}

	// ── Priority 3: Session exited without .DONE → failed ────────
	// ── Priority 4-6: Session alive → running ────────────────────
	return buildSnapshot(sessionAlive ? "running" : "failed", sessionAlive, false, null);
}
|
|
1163
|
+
|
|
1164
|
+
// ── Core Monitor Loop ────────────────────────────────────────────────
|
|
1165
|
+
|
|
1166
|
+
/**
 * Signature of the listener that receives dashboard updates during monitoring.
 * It is handed the current `MonitorState`; nothing is expected back.
 */
export type MonitorUpdateCallback = {
	(state: MonitorState): void;
};
|
|
1170
|
+
|
|
1171
|
+
/**
|
|
1172
|
+
* Monitor all lanes in a wave, polling for progress, completion, and stalls.
|
|
1173
|
+
*
|
|
1174
|
+
* This is the orchestrator's "air traffic control" — it does NOT attach
|
|
1175
|
+
* to lane sessions directly. It monitors via filesystem polling:
|
|
1176
|
+
* - STATUS.md in each worktree for step/checkbox progress
|
|
1177
|
+
* - .DONE files for task completion
|
|
1178
|
+
* - backend liveness probes for session state
|
|
1179
|
+
* - STATUS.md mtime for stall detection
|
|
1180
|
+
*
|
|
1181
|
+
* The monitoring loop runs until all lanes reach terminal states
|
|
1182
|
+
* (all tasks succeeded/failed/stalled) or the pauseSignal is set.
|
|
1183
|
+
*
|
|
1184
|
+
* **Important:** This function monitors lanes that are being executed
|
|
1185
|
+
* concurrently by `executeLane()` in Step 2. It does NOT spawn sessions —
|
|
1186
|
+
* it only observes. Step 4 will coordinate calling both executeLane()
|
|
1187
|
+
* and monitorLanes() in parallel.
|
|
1188
|
+
*
|
|
1189
|
+
* @param lanes - Allocated lanes being executed
|
|
1190
|
+
* @param config - Orchestrator configuration (poll_interval, stall_timeout)
|
|
1191
|
+
* @param repoRoot - Main repository root
|
|
1192
|
+
* @param pauseSignal - Shared signal for pause/abort
|
|
1193
|
+
* @param waveNumber - Current wave number (for display)
|
|
1194
|
+
* @param onUpdate - Optional callback invoked on each poll cycle
|
|
1195
|
+
* @returns Final MonitorState snapshot when monitoring completes
|
|
1196
|
+
*/
|
|
1197
|
+
export async function monitorLanes(
|
|
1198
|
+
lanes: AllocatedLane[],
|
|
1199
|
+
config: OrchestratorConfig,
|
|
1200
|
+
repoRoot: string,
|
|
1201
|
+
pauseSignal: { paused: boolean },
|
|
1202
|
+
waveNumber: number = 1,
|
|
1203
|
+
onUpdate?: MonitorUpdateCallback,
|
|
1204
|
+
isWorkspaceMode?: boolean,
|
|
1205
|
+
runtimeBackend?: RuntimeBackend,
|
|
1206
|
+
batchId?: string,
|
|
1207
|
+
stateRootForRegistry?: string,
|
|
1208
|
+
): Promise<MonitorState> {
|
|
1209
|
+
const pollIntervalMs = (config.monitoring.poll_interval || 5) * 1000;
|
|
1210
|
+
const stallTimeoutMs = (config.failure.stall_timeout || 30) * 60_000;
|
|
1211
|
+
|
|
1212
|
+
// Initialize mtime trackers for each lane's current task
|
|
1213
|
+
// We track per-taskId so a lane advancing to the next task gets a fresh tracker
|
|
1214
|
+
const mtimeTrackers = new Map<string, MtimeTracker>();
|
|
1215
|
+
|
|
1216
|
+
function getOrCreateTracker(taskId: string, now: number): MtimeTracker {
|
|
1217
|
+
let tracker = mtimeTrackers.get(taskId);
|
|
1218
|
+
if (!tracker) {
|
|
1219
|
+
tracker = {
|
|
1220
|
+
taskId,
|
|
1221
|
+
firstObservedAt: now,
|
|
1222
|
+
statusFileSeenOnce: false,
|
|
1223
|
+
lastMtime: null,
|
|
1224
|
+
stallTimerStart: null,
|
|
1225
|
+
};
|
|
1226
|
+
mtimeTrackers.set(taskId, tracker);
|
|
1227
|
+
}
|
|
1228
|
+
return tracker;
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
// Track terminal states per task to avoid re-processing
|
|
1232
|
+
const terminalTasks = new Map<string, TaskMonitorSnapshot>();
|
|
1233
|
+
|
|
1234
|
+
// Track which task each lane is currently on
|
|
1235
|
+
// (determined by: first task in lane that hasn't reached terminal state)
|
|
1236
|
+
const laneTaskIndex = new Map<number, number>();
|
|
1237
|
+
for (const lane of lanes) {
|
|
1238
|
+
laneTaskIndex.set(lane.laneNumber, 0);
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
let pollCount = 0;
|
|
1242
|
+
let lastMonitorStateKey = "";
|
|
1243
|
+
|
|
1244
|
+
// Build the total task count
|
|
1245
|
+
const tasksTotal = lanes.reduce((sum, lane) => sum + lane.tasks.length, 0);
|
|
1246
|
+
|
|
1247
|
+
execLog(
|
|
1248
|
+
"monitor",
|
|
1249
|
+
"ALL",
|
|
1250
|
+
`starting monitoring for ${lanes.length} lane(s), ${tasksTotal} task(s)`,
|
|
1251
|
+
{
|
|
1252
|
+
pollIntervalMs,
|
|
1253
|
+
stallTimeoutMin: Math.round(stallTimeoutMs / 60_000),
|
|
1254
|
+
},
|
|
1255
|
+
);
|
|
1256
|
+
|
|
1257
|
+
while (true) {
|
|
1258
|
+
const now = Date.now();
|
|
1259
|
+
pollCount++;
|
|
1260
|
+
|
|
1261
|
+
// TP-112: Refresh V2 liveness registry cache once per poll cycle
|
|
1262
|
+
if (runtimeBackend === "v2" && batchId) {
|
|
1263
|
+
try {
|
|
1264
|
+
setV2LivenessRegistryCache(readRegistrySnapshot(stateRootForRegistry ?? repoRoot, batchId));
|
|
1265
|
+
} catch {
|
|
1266
|
+
setV2LivenessRegistryCache(null);
|
|
1267
|
+
}
|
|
1268
|
+
} else {
|
|
1269
|
+
setV2LivenessRegistryCache(null);
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// TP-159: Detect and mark orphaned workers each poll cycle.
|
|
1273
|
+
// When a worker subprocess dies silently (OOM kill, segfault, parent
|
|
1274
|
+
// crash) without going through the normal completion handshake, its
|
|
1275
|
+
// registry manifest stays in a non-terminal status indefinitely.
|
|
1276
|
+
// Scanning for dead PIDs here ensures list_active_agents, read_agent_status,
|
|
1277
|
+
// and the dashboard all reflect reality within one poll interval.
|
|
1278
|
+
if (runtimeBackend === "v2" && batchId) {
|
|
1279
|
+
try {
|
|
1280
|
+
const registry = readRegistrySnapshot(stateRootForRegistry ?? repoRoot, batchId);
|
|
1281
|
+
if (registry) {
|
|
1282
|
+
const orphans = detectOrphans(registry);
|
|
1283
|
+
if (orphans.length > 0) {
|
|
1284
|
+
// Mark individual agent manifests as crashed
|
|
1285
|
+
markOrphansCrashed(stateRootForRegistry ?? repoRoot, batchId, orphans);
|
|
1286
|
+
// Rebuild and write registry.json from the updated individual manifests.
|
|
1287
|
+
// markOrphansCrashed only updates per-agent files; registry.json is a
|
|
1288
|
+
// cached aggregate that must be explicitly rebuilt so readRegistrySnapshot()
|
|
1289
|
+
// and the dashboard see the crashed status within this poll cycle.
|
|
1290
|
+
const freshRegistry = buildRegistrySnapshot(stateRootForRegistry ?? repoRoot, batchId);
|
|
1291
|
+
writeRegistrySnapshot(stateRootForRegistry ?? repoRoot, freshRegistry);
|
|
1292
|
+
setV2LivenessRegistryCache(freshRegistry);
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
} catch {
|
|
1296
|
+
// Non-fatal — monitor loop must never throw
|
|
1297
|
+
}
|
|
1298
|
+
}
|
|
1299
|
+
|
|
1300
|
+
// Check pause signal
|
|
1301
|
+
if (pauseSignal.paused) {
|
|
1302
|
+
execLog("monitor", "ALL", "pause signal detected — stopping monitoring");
|
|
1303
|
+
break;
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1306
|
+
const laneSnapshots: LaneMonitorSnapshot[] = [];
|
|
1307
|
+
let totalDone = 0;
|
|
1308
|
+
let totalFailed = 0;
|
|
1309
|
+
let allTerminal = true;
|
|
1310
|
+
|
|
1311
|
+
for (const lane of lanes) {
|
|
1312
|
+
const completedTasks: string[] = [];
|
|
1313
|
+
const failedTasks: string[] = [];
|
|
1314
|
+
const remainingTasks: string[] = [];
|
|
1315
|
+
let currentTaskId: string | null = null;
|
|
1316
|
+
let currentTaskSnapshot: TaskMonitorSnapshot | null = null;
|
|
1317
|
+
|
|
1318
|
+
// Walk through tasks in order to determine lane state
|
|
1319
|
+
for (let i = 0; i < lane.tasks.length; i++) {
|
|
1320
|
+
const task = lane.tasks[i];
|
|
1321
|
+
|
|
1322
|
+
// Check if we already know this task is terminal
|
|
1323
|
+
const existingTerminal = terminalTasks.get(task.taskId);
|
|
1324
|
+
if (existingTerminal) {
|
|
1325
|
+
if (existingTerminal.status === "succeeded") {
|
|
1326
|
+
completedTasks.push(task.taskId);
|
|
1327
|
+
totalDone++;
|
|
1328
|
+
} else {
|
|
1329
|
+
failedTasks.push(task.taskId);
|
|
1330
|
+
totalFailed++;
|
|
1331
|
+
}
|
|
1332
|
+
continue;
|
|
1333
|
+
}
|
|
1334
|
+
|
|
1335
|
+
// This task hasn't reached terminal state yet
|
|
1336
|
+
if (currentTaskId === null) {
|
|
1337
|
+
// This is the current task being worked on
|
|
1338
|
+
currentTaskId = task.taskId;
|
|
1339
|
+
|
|
1340
|
+
const tracker = getOrCreateTracker(task.taskId, now);
|
|
1341
|
+
const unit = buildExecutionUnit(lane, task, repoRoot, isWorkspaceMode);
|
|
1342
|
+
const donePath = unit.packet.donePath;
|
|
1343
|
+
const statusPath = unit.packet.statusPath;
|
|
1344
|
+
const statusResult = await parseStatusMdAtPath(statusPath);
|
|
1345
|
+
|
|
1346
|
+
// TP-196 / #462: Build multi-segment authority context so
|
|
1347
|
+
// `.DONE` from a non-final segment is not accepted as terminal.
|
|
1348
|
+
const taskSegmentIds = task.task.segmentIds ?? [];
|
|
1349
|
+
const taskActiveSegmentId = task.task.activeSegmentId ?? null;
|
|
1350
|
+
let multiSegmentContext: { isFinalSegment: boolean; segmentId: string } | undefined;
|
|
1351
|
+
if (taskSegmentIds.length > 1 && taskActiveSegmentId) {
|
|
1352
|
+
const finalSegmentId = taskSegmentIds[taskSegmentIds.length - 1];
|
|
1353
|
+
multiSegmentContext = {
|
|
1354
|
+
isFinalSegment: taskActiveSegmentId === finalSegmentId,
|
|
1355
|
+
segmentId: taskActiveSegmentId,
|
|
1356
|
+
};
|
|
1357
|
+
}
|
|
1358
|
+
|
|
1359
|
+
const snapshot = await resolveTaskMonitorState(
|
|
1360
|
+
task.taskId,
|
|
1361
|
+
donePath,
|
|
1362
|
+
laneSessionIdOf(lane),
|
|
1363
|
+
statusResult,
|
|
1364
|
+
tracker,
|
|
1365
|
+
stallTimeoutMs,
|
|
1366
|
+
now,
|
|
1367
|
+
runtimeBackend,
|
|
1368
|
+
runtimeBackend === "v2" && batchId
|
|
1369
|
+
? {
|
|
1370
|
+
stateRoot: stateRootForRegistry ?? repoRoot,
|
|
1371
|
+
batchId,
|
|
1372
|
+
laneNumber: lane.laneNumber,
|
|
1373
|
+
}
|
|
1374
|
+
: undefined,
|
|
1375
|
+
multiSegmentContext,
|
|
1376
|
+
);
|
|
1377
|
+
|
|
1378
|
+
currentTaskSnapshot = snapshot;
|
|
1379
|
+
|
|
1380
|
+
// Check if this task just became terminal
|
|
1381
|
+
if (
|
|
1382
|
+
snapshot.status === "succeeded" ||
|
|
1383
|
+
snapshot.status === "failed" ||
|
|
1384
|
+
snapshot.status === "stalled"
|
|
1385
|
+
) {
|
|
1386
|
+
terminalTasks.set(task.taskId, snapshot);
|
|
1387
|
+
if (snapshot.status === "succeeded") {
|
|
1388
|
+
completedTasks.push(task.taskId);
|
|
1389
|
+
totalDone++;
|
|
1390
|
+
} else {
|
|
1391
|
+
failedTasks.push(task.taskId);
|
|
1392
|
+
totalFailed++;
|
|
1393
|
+
}
|
|
1394
|
+
// Move to next task — clear currentTaskId so next iteration picks up
|
|
1395
|
+
currentTaskId = null;
|
|
1396
|
+
currentTaskSnapshot = null;
|
|
1397
|
+
} else {
|
|
1398
|
+
// Task is still running — mark remaining and break
|
|
1399
|
+
allTerminal = false;
|
|
1400
|
+
// Remaining tasks are everything after this one
|
|
1401
|
+
for (let j = i + 1; j < lane.tasks.length; j++) {
|
|
1402
|
+
remainingTasks.push(lane.tasks[j].taskId);
|
|
1403
|
+
}
|
|
1404
|
+
break;
|
|
1405
|
+
}
|
|
1406
|
+
} else {
|
|
1407
|
+
// Shouldn't reach here since we break above, but defensive
|
|
1408
|
+
remainingTasks.push(task.taskId);
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
|
|
1412
|
+
// If we processed all tasks and currentTaskId is still null,
|
|
1413
|
+
// the lane is fully terminal (all tasks completed/failed)
|
|
1414
|
+
if (currentTaskId !== null) {
|
|
1415
|
+
allTerminal = false;
|
|
1416
|
+
}
|
|
1417
|
+
|
|
1418
|
+
// TP-112: Backend-aware lane liveness for snapshot
|
|
1419
|
+
// TP-148: Pass global laneNumber for workspace-mode fallback lookup
|
|
1420
|
+
const sessionAlive = isV2AgentAlive(laneSessionIdOf(lane), "v2", lane.laneNumber);
|
|
1421
|
+
|
|
1422
|
+
laneSnapshots.push({
|
|
1423
|
+
laneId: lane.laneId,
|
|
1424
|
+
laneNumber: lane.laneNumber,
|
|
1425
|
+
sessionName: laneSessionIdOf(lane),
|
|
1426
|
+
sessionAlive,
|
|
1427
|
+
currentTaskId,
|
|
1428
|
+
currentTaskSnapshot,
|
|
1429
|
+
completedTasks,
|
|
1430
|
+
failedTasks,
|
|
1431
|
+
remainingTasks,
|
|
1432
|
+
});
|
|
1433
|
+
}
|
|
1434
|
+
|
|
1435
|
+
const monitorState: MonitorState = {
|
|
1436
|
+
lanes: laneSnapshots,
|
|
1437
|
+
tasksDone: totalDone,
|
|
1438
|
+
tasksFailed: totalFailed,
|
|
1439
|
+
tasksTotal,
|
|
1440
|
+
waveNumber,
|
|
1441
|
+
pollCount,
|
|
1442
|
+
lastPollTime: now,
|
|
1443
|
+
allTerminal,
|
|
1444
|
+
};
|
|
1445
|
+
|
|
1446
|
+
// Invoke the dashboard update callback
|
|
1447
|
+
if (onUpdate) {
|
|
1448
|
+
try {
|
|
1449
|
+
onUpdate(monitorState);
|
|
1450
|
+
} catch {
|
|
1451
|
+
// Don't let callback errors kill the monitor loop
|
|
1452
|
+
}
|
|
1453
|
+
}
|
|
1454
|
+
|
|
1455
|
+
// Log summary only on state changes (lane completes or fails) — not every poll
|
|
1456
|
+
const currentStateKey = `${totalDone}/${totalFailed}`;
|
|
1457
|
+
if (currentStateKey !== lastMonitorStateKey) {
|
|
1458
|
+
const activeLanes = laneSnapshots.filter((l) => l.currentTaskId !== null);
|
|
1459
|
+
execLog(
|
|
1460
|
+
"monitor",
|
|
1461
|
+
"ALL",
|
|
1462
|
+
`poll #${pollCount}: ${totalDone}/${tasksTotal} done, ${totalFailed} failed, ${activeLanes.length} active lane(s)`,
|
|
1463
|
+
);
|
|
1464
|
+
lastMonitorStateKey = currentStateKey;
|
|
1465
|
+
}
|
|
1466
|
+
|
|
1467
|
+
// Exit conditions
|
|
1468
|
+
if (allTerminal) {
|
|
1469
|
+
execLog("monitor", "ALL", `all lanes terminal — monitoring complete`, {
|
|
1470
|
+
done: totalDone,
|
|
1471
|
+
failed: totalFailed,
|
|
1472
|
+
total: tasksTotal,
|
|
1473
|
+
polls: pollCount,
|
|
1474
|
+
});
|
|
1475
|
+
setV2LivenessRegistryCache(null);
|
|
1476
|
+
return monitorState;
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1479
|
+
// Wait for next poll cycle
|
|
1480
|
+
await new Promise((r) => setTimeout(r, pollIntervalMs));
|
|
1481
|
+
}
|
|
1482
|
+
|
|
1483
|
+
// Reached here due to pause signal — return current state
|
|
1484
|
+
const now = Date.now();
|
|
1485
|
+
const laneSnapshots: LaneMonitorSnapshot[] = lanes.map((lane) => ({
|
|
1486
|
+
laneId: lane.laneId,
|
|
1487
|
+
laneNumber: lane.laneNumber,
|
|
1488
|
+
sessionName: laneSessionIdOf(lane),
|
|
1489
|
+
sessionAlive: false, // Best-effort during pause — don't block on extra liveness probes
|
|
1490
|
+
currentTaskId: null,
|
|
1491
|
+
currentTaskSnapshot: null,
|
|
1492
|
+
completedTasks: [],
|
|
1493
|
+
failedTasks: [],
|
|
1494
|
+
remainingTasks: lane.tasks.map((t) => t.taskId),
|
|
1495
|
+
}));
|
|
1496
|
+
|
|
1497
|
+
setV2LivenessRegistryCache(null);
|
|
1498
|
+
return {
|
|
1499
|
+
lanes: laneSnapshots,
|
|
1500
|
+
tasksDone: 0,
|
|
1501
|
+
tasksFailed: 0,
|
|
1502
|
+
tasksTotal,
|
|
1503
|
+
waveNumber,
|
|
1504
|
+
pollCount,
|
|
1505
|
+
lastPollTime: now,
|
|
1506
|
+
allTerminal: false,
|
|
1507
|
+
};
|
|
1508
|
+
}
|
|
1509
|
+
|
|
1510
|
+
// ── Transitive Dependent Computation ─────────────────────────────────
|
|
1511
|
+
|
|
1512
|
+
/**
 * Compute the transitive dependents of a set of failed task IDs.
 *
 * Performs a breadth-first walk over the dependency graph's `dependents`
 * map (task → tasks that depend on it) and collects every task that is
 * reachable from a failed task, i.e. every task transitively blocked by
 * the failures.
 *
 * Example: if A failed, B depends on A, and C depends on B, then both B
 * and C are returned.
 *
 * The failed tasks themselves are never part of the result, even when one
 * failed task depends on another. Tasks with no entry in the `dependents`
 * map are treated as having no dependents. Dependents of each task are
 * visited in sorted order so the insertion order of the returned set is
 * deterministic.
 *
 * @param failedTaskIds - Set of task IDs that failed (not mutated)
 * @param dependencyGraph - Dependency graph with a `dependents` map
 * @returns Set of task IDs transitively blocked (excludes the failed tasks themselves)
 */
export function computeTransitiveDependents(
	failedTaskIds: Set<string>,
	dependencyGraph: DependencyGraph,
): Set<string> {
	const blocked = new Set<string>();

	// The work list doubles as the BFS queue. Advancing a cursor instead of
	// calling shift() keeps each dequeue O(1) while preserving FIFO order.
	const workList = [...failedTaskIds];

	for (let cursor = 0; cursor < workList.length; cursor++) {
		const current = workList[cursor];
		const dependents = dependencyGraph.dependents.get(current) ?? [];

		// Deterministic: visit dependents in sorted order (copy first — sort mutates).
		for (const dep of [...dependents].sort()) {
			// Skip tasks already recorded, and never report a failed task as blocked.
			if (blocked.has(dep) || failedTaskIds.has(dep)) continue;
			blocked.add(dep);
			workList.push(dep); // Continue BFS for transitive closure
		}
	}

	return blocked;
}
|
|
1552
|
+
|
|
1553
|
+
// ── Pre-flight: Commit Untracked Task Files ─────────────────────────
|
|
1554
|
+
|
|
1555
|
+
/**
 * Build the commit message used for orchestrator task-file staging commits.
 * The task label for each folder is the last path segment of the folder.
 */
function buildTaskStagingCommitMessage(foldersToStage: string[], waveIndex: number): string {
	const taskIds = foldersToStage.map((f) => f.split("/").pop() || f).join(", ");
	return `chore: stage task files for orchestrator wave ${waveIndex} (${taskIds})`;
}

/**
 * Commit the given task folders directly on the orch branch using a
 * temporary git index, leaving HEAD and the repository's real index alone.
 *
 * Steps: read the orch branch tree into a temporary index, add the task
 * folders to it, write the tree, create a commit whose parent is the orch
 * branch tip, then move the orch branch ref (guarded by the old tip value).
 *
 * @returns true when the orch branch now contains the task files; false when
 *          any step failed and the caller should fall back to committing on HEAD.
 */
function commitTaskFoldersOnOrchBranch(
	foldersToStage: string[],
	repoRoot: string,
	waveIndex: number,
	orchBranch: string,
): boolean {
	const waveTag = `W${waveIndex}`;
	const orchRef = `refs/heads/${orchBranch}`;

	const orchTipRes = runGit(["rev-parse", orchRef], repoRoot);
	if (!orchTipRes.ok) return false;

	const orchTip = orchTipRes.stdout.trim();
	// NOTE(review): assumes `<repoRoot>/.git` is a directory. In a linked
	// worktree or submodule it is a file, read-tree fails, and we fall back.
	const tmpIdx = join(repoRoot, ".git", `tmp-staging-idx-wave-${waveIndex}`);
	const gitWithTempIndex = (args: string[]) => runGitWithEnv(args, repoRoot, { GIT_INDEX_FILE: tmpIdx });

	try {
		// Read orch branch tree into temporary index
		const readTreeRes = gitWithTempIndex(["read-tree", orchTip]);
		if (!readTreeRes.ok) {
			execLog("wave", waveTag, `orch branch staging: read-tree failed, falling back to HEAD commit`, {
				error: readTreeRes.stderr,
			});
			return false;
		}

		// Add task files to temporary index
		for (const folder of foldersToStage) {
			const addRes = gitWithTempIndex(["add", "--", folder]);
			if (!addRes.ok) {
				execLog("wave", waveTag, `orch branch staging: git add failed for ${folder}, falling back`, {
					error: addRes.stderr,
				});
				return false;
			}
		}

		// Write tree from temporary index
		const writeTreeRes = gitWithTempIndex(["write-tree"]);
		if (!writeTreeRes.ok) {
			execLog("wave", waveTag, `orch branch staging: write-tree failed, falling back`, {
				error: writeTreeRes.stderr,
			});
			return false;
		}

		// Create commit directly on orch branch
		const tree = writeTreeRes.stdout.trim();
		const commitMsg = buildTaskStagingCommitMessage(foldersToStage, waveIndex);
		const commitTreeRes = runGit(["commit-tree", tree, "-p", orchTip, "-m", commitMsg], repoRoot);
		if (!commitTreeRes.ok) {
			execLog("wave", waveTag, `orch branch staging: commit-tree failed, falling back`, {
				error: commitTreeRes.stderr,
			});
			return false;
		}

		// Passing the old tip makes update-ref refuse if the branch moved meanwhile.
		const newCommit = commitTreeRes.stdout.trim();
		const refUpdateRes = runGit(["update-ref", orchRef, newCommit, orchTip], repoRoot);
		if (!refUpdateRes.ok) {
			execLog("wave", waveTag, `orch branch staging: ref update failed, falling back`, {
				error: refUpdateRes.stderr,
			});
			return false;
		}

		execLog("wave", waveTag, `committed ${foldersToStage.length} task folder(s) directly on orch branch`, {
			orchBranch,
			folders: foldersToStage,
			from: orchTip.slice(0, 8),
			to: newCommit.slice(0, 8),
		});
		return true;
	} catch (err: unknown) {
		execLog("wave", waveTag, `orch branch staging: unexpected error, falling back to HEAD commit`, {
			error: err instanceof Error ? err.message : String(err),
		});
		return false;
	} finally {
		// Always clean up temp index (single cleanup point for every exit path)
		try {
			unlinkSync(tmpIdx);
		} catch {
			/* best effort */
		}
	}
}

/**
 * Bring the orch branch up to date with HEAD after a staging commit on HEAD.
 *
 * Fast-forwards the orch branch when its tip is an ancestor of HEAD; otherwise
 * creates a merge commit (parents: orch tip, HEAD) from `git merge-tree
 * --write-tree` and points the orch branch at it. Every failure is logged as a
 * warning and swallowed — this step never throws.
 */
function syncOrchBranchWithHead(repoRoot: string, waveIndex: number, orchBranch: string): void {
	const waveTag = `W${waveIndex}`;
	const orchRef = `refs/heads/${orchBranch}`;
	// Full object id: 40 hex chars (SHA-1) or 64 hex chars (SHA-256 repositories).
	const objectIdPattern = /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/i;
	const warnMergeFailed = (step: string, error: string) =>
		execLog("wave", waveTag, `warning: failed to merge staging commit into orch branch (non-fatal)`, {
			orchBranch,
			step,
			error,
		});

	try {
		const headRes = runGit(["rev-parse", "HEAD"], repoRoot);
		const orchTipRes = runGit(["rev-parse", orchRef], repoRoot);
		if (!headRes.ok || !orchTipRes.ok) {
			execLog("wave", waveTag, `warning: could not resolve HEAD or orch branch; orch branch not updated (non-fatal)`, {
				orchBranch,
				error: headRes.ok ? orchTipRes.stderr : headRes.stderr,
			});
			return;
		}

		const newHead = headRes.stdout.trim();
		const orchTip = orchTipRes.stdout.trim();

		const ancestorCheck = runGit(["merge-base", "--is-ancestor", orchTip, newHead], repoRoot);
		if (ancestorCheck.ok) {
			const ffResult = runGit(["update-ref", orchRef, newHead, orchTip], repoRoot);
			if (ffResult.ok) {
				execLog("wave", waveTag, `fast-forwarded orch branch to include staging commit`, {
					orchBranch,
					from: orchTip.slice(0, 8),
					to: newHead.slice(0, 8),
				});
			} else {
				execLog("wave", waveTag, `warning: failed to fast-forward orch branch (non-fatal)`, {
					orchBranch,
					error: ffResult.stderr,
				});
			}
			return;
		}

		// Orch branch has diverged from HEAD: merge without touching the working tree.
		const mergeTreeRes = runGit(["merge-tree", "--write-tree", orchTip, newHead], repoRoot);
		if (!mergeTreeRes.ok) {
			warnMergeFailed("merge-tree", mergeTreeRes.stderr);
			return;
		}
		// The first output line of `merge-tree --write-tree` is the merged tree id.
		const mergedTree = mergeTreeRes.stdout.trim().split("\n")[0];
		if (!objectIdPattern.test(mergedTree)) {
			warnMergeFailed("merge-tree", `unexpected merge-tree output: ${mergedTree}`);
			return;
		}

		const mergeMsg = `merge: include staged task files for wave ${waveIndex} into orch branch`;
		const commitTreeRes = runGit(
			["commit-tree", mergedTree, "-p", orchTip, "-p", newHead, "-m", mergeMsg],
			repoRoot,
		);
		if (!commitTreeRes.ok) {
			warnMergeFailed("commit-tree", commitTreeRes.stderr);
			return;
		}

		const mergeCommitSha = commitTreeRes.stdout.trim();
		const refUpdateRes = runGit(["update-ref", orchRef, mergeCommitSha, orchTip], repoRoot);
		if (!refUpdateRes.ok) {
			warnMergeFailed("update-ref", refUpdateRes.stderr);
			return;
		}

		execLog("wave", waveTag, `merged staging commit into orch branch (non-FF wave)`, {
			orchBranch,
			mergeCommit: mergeCommitSha.slice(0, 8),
		});
	} catch (refErr: unknown) {
		execLog("wave", waveTag, `warning: orch branch ref update threw unexpectedly (non-fatal)`, {
			orchBranch,
			error: refErr instanceof Error ? refErr.message : String(refErr),
		});
	}
}

/**
 * Ensure all task files for a wave are committed to git before worktree creation.
 *
 * Git worktrees only contain tracked (committed) files. If a user creates
 * task folders (PROMPT.md, STATUS.md) but doesn't commit them, the worktree
 * won't have those files and the worker will fail with "file not found".
 *
 * This function checks each wave task's folder for untracked or modified files
 * and commits them. It must run BEFORE allocateLanes() so that worktrees
 * include the task files.
 *
 * Commit strategy:
 * - TP-169: when `orchBranch` is provided, the task folders are committed
 *   directly on the orch branch through a temporary index, so the repo's
 *   current branch (e.g. main) is not polluted with orchestrator-internal
 *   staging commits.
 * - Fallback (no `orchBranch`, or the plumbing path failed): the folders are
 *   staged and committed on HEAD, then the orch branch (if any) is
 *   fast-forwarded or merged to include that commit.
 *
 * Only task-specific folders are committed — the HEAD commit is restricted to
 * the task folder paths, so unrelated changes (including changes the user had
 * already staged) are left untouched.
 *
 * Tasks missing from `pending` and task folders outside `repoRoot` are skipped.
 *
 * @param waveTasks - Task IDs in this wave
 * @param pending - Full pending task map from discovery
 * @param repoRoot - Main repository root
 * @param waveIndex - Wave number for commit message
 * @param orchBranch - Optional orch branch that should receive the task files
 * @throws ExecutionError `EXEC_TASK_STAGE_FAILED` when staging a folder on HEAD fails
 * @throws ExecutionError `EXEC_TASK_COMMIT_FAILED` when the HEAD commit fails
 */
export function ensureTaskFilesCommitted(
	waveTasks: string[],
	pending: Map<string, ParsedTask>,
	repoRoot: string,
	waveIndex: number,
	orchBranch?: string,
): void {
	const waveTag = `W${waveIndex}`;
	const repoRootAbs = resolve(repoRoot);

	// Collect task folder paths for this wave
	const foldersToCheck: { taskId: string; relPath: string }[] = [];
	for (const taskId of waveTasks) {
		const task = pending.get(taskId);
		if (!task) continue;

		const relPath = relative(repoRootAbs, resolve(task.taskFolder)).replace(/\\/g, "/");

		// Skip if path escapes the repo (shouldn't happen in normal use).
		// A leading ".." path segment means "outside the repo"; a folder merely
		// named "..something" does not. relative() yields an absolute path when
		// the two locations share no common root (e.g. different Windows drives).
		const escapesRepo =
			relPath === ".." || relPath.startsWith("../") || /^(?:[A-Za-z]:)?\//.test(relPath);
		if (escapesRepo) continue;

		foldersToCheck.push({ taskId, relPath });
	}

	if (foldersToCheck.length === 0) return;

	// Check which folders have untracked or uncommitted files
	const foldersToStage: string[] = [];
	for (const { taskId, relPath } of foldersToCheck) {
		const status = runGit(["status", "--porcelain", "--", relPath], repoRoot);
		if (status.ok && status.stdout.trim()) {
			execLog("wave", waveTag, `task ${taskId} has uncommitted files, staging`, {
				folder: relPath,
				status: status.stdout.trim().split("\n").slice(0, 5).join("; "),
			});
			foldersToStage.push(relPath);
		}
	}

	if (foldersToStage.length === 0) return;

	// Preferred path: commit straight onto the orch branch, leaving HEAD alone.
	if (orchBranch && commitTaskFoldersOnOrchBranch(foldersToStage, repoRoot, waveIndex, orchBranch)) {
		return;
	}

	// Legacy fallback: commit on HEAD and sync orch branch.
	// This path is used when orchBranch is not provided, or when the
	// plumbing-based approach above failed.

	// Stage only the task folders
	for (const folder of foldersToStage) {
		const addResult = runGit(["add", "--", folder], repoRoot);
		if (!addResult.ok) {
			execLog("wave", waveTag, `failed to stage task files: ${addResult.stderr}`, { folder });
			throw new ExecutionError(
				"EXEC_TASK_STAGE_FAILED",
				`Failed to stage task files in "${folder}": ${addResult.stderr}`,
				"wave",
				folder,
			);
		}
	}

	// Commit only the task folders. The pathspec keeps anything else that
	// happens to be staged in the index out of the orchestrator's commit.
	const commitMsg = buildTaskStagingCommitMessage(foldersToStage, waveIndex);
	const commitResult = runGit(["commit", "-m", commitMsg, "--", ...foldersToStage], repoRoot);
	if (!commitResult.ok) {
		execLog("wave", waveTag, `failed to commit task files: ${commitResult.stderr}`);
		throw new ExecutionError(
			"EXEC_TASK_COMMIT_FAILED",
			`Failed to commit task files for wave ${waveIndex}: ${commitResult.stderr}`,
			"wave",
			waveTag,
		);
	}

	execLog("wave", waveTag, `committed ${foldersToStage.length} task folder(s) to ensure worktree visibility`, {
		folders: foldersToStage,
		commit: commitResult.stdout.trim().split("\n")[0],
	});

	// Fast-forward (or merge) the orch branch to include the staging commit so
	// that worktrees—which branch from orchBranch—see the new task files and
	// workers can find their PROMPT.md / STATUS.md without an ENOENT crash.
	if (orchBranch) {
		syncOrchBranchWithHead(repoRoot, waveIndex, orchBranch);
	}
}
|
|
1857
|
+
|
|
1858
|
+
// ── Wave Execution Core ──────────────────────────────────────────────
|
|
1859
|
+
|
|
1860
|
+
/**
 * Runtime backend selector for lane execution.
 *
 * - `"legacy"`: Session-backed path (spawnLaneSession, deprecated)
 * - `"v2"`: Direct-child path (lane-runner → agent-host → pi --mode rpc)
 *
 * @since TP-105
 */
export type RuntimeBackend = "legacy" | "v2";

/**
 * Execute a single wave: allocate lanes, run tasks in parallel, monitor, apply failure policy.
 *
 * Orchestration flow:
 * 1. Allocate lanes via allocateLanes() (worktree creation + task assignment)
 * 2. Start all lanes in parallel (each lane executes tasks sequentially)
 * 3. Start monitoring as a sibling async loop
 * 4. Wait for all lanes to complete (or policy-triggered early termination)
 * 5. Apply failure handling policy
 * 6. Build and return WaveExecutionResult
 *
 * Failure policy behavior:
 * - **skip-dependents**: In-flight tasks continue. Failed task's transitive
 *   dependents are collected in blockedTaskIds for future wave pruning.
 *   Current wave runs to completion.
 * - **stop-wave**: On first failure, pauseSignal is set. In-flight tasks
 *   finish their current work, remaining tasks in lanes are skipped.
 *   No next wave is started (stoppedEarly=true).
 * - **stop-all**: On first failure, all active lane sessions are killed immediately.
 *   Returns with aborted status.
 *
 * Concurrency model:
 * - Lane execution promises are NOT cancellable (lane sessions run externally)
 * - stop-all kills sessions directly; executeLane() detects session death on next poll
 * - Monitoring stops when all lanes reach terminal state or pauseSignal is set
 *
 * @param waveTasks - Task IDs in this wave
 * @param waveIndex - Wave number (1-indexed)
 * @param pending - Full pending task map from discovery
 * @param config - Orchestrator configuration
 * @param repoRoot - Main repository root
 * @param batchId - Batch ID for naming
 * @param pauseSignal - Shared pause signal (mutated by stop-wave policy)
 * @param dependencyGraph - Dependency graph for computing transitive dependents
 * @param orchBranch - Orch branch to base worktrees on (and to update after staging commits)
 * @param onMonitorUpdate - Optional callback for dashboard updates during monitoring
 * @param onLanesAllocated - Optional callback fired after lane allocation succeeds
 * @param workspaceConfig - Workspace configuration for repo routing (null/undefined = repo mode)
 * @param runtimeBackend - Requested backend; any value other than `"v2"` is logged and ignored (Runtime V2 is always used)
 * @returns WaveExecutionResult with outcomes and blocked task IDs
 */
|
|
1909
|
+
|
|
1910
|
+
export async function executeWave(
|
|
1911
|
+
waveTasks: string[],
|
|
1912
|
+
waveIndex: number,
|
|
1913
|
+
pending: Map<string, ParsedTask>,
|
|
1914
|
+
config: OrchestratorConfig,
|
|
1915
|
+
repoRoot: string,
|
|
1916
|
+
batchId: string,
|
|
1917
|
+
pauseSignal: { paused: boolean },
|
|
1918
|
+
dependencyGraph: DependencyGraph,
|
|
1919
|
+
orchBranch: string,
|
|
1920
|
+
onMonitorUpdate?: MonitorUpdateCallback,
|
|
1921
|
+
onLanesAllocated?: (lanes: AllocatedLane[]) => void,
|
|
1922
|
+
workspaceConfig?: WorkspaceConfig | null,
|
|
1923
|
+
runtimeBackend?: RuntimeBackend,
|
|
1924
|
+
onSupervisorAlert?: SupervisorAlertCallback,
|
|
1925
|
+
supervisorAutonomy: "interactive" | "supervised" | "autonomous" = "autonomous",
|
|
1926
|
+
reviewerConfig?: {
|
|
1927
|
+
model?: string;
|
|
1928
|
+
thinking?: string;
|
|
1929
|
+
tools?: string;
|
|
1930
|
+
excludeExtensions?: string[];
|
|
1931
|
+
},
|
|
1932
|
+
workerConfig?: {
|
|
1933
|
+
model?: string;
|
|
1934
|
+
thinking?: string;
|
|
1935
|
+
tools?: string;
|
|
1936
|
+
excludeExtensions?: string[];
|
|
1937
|
+
} | null,
|
|
1938
|
+
workerExcludeExtensions?: string[],
|
|
1939
|
+
onLaneTerminated?: import("./types.ts").LaneTerminatedCallback,
|
|
1940
|
+
onLaneRespawned?: (laneNumber: number, agentId: string, batchId: string) => void,
|
|
1941
|
+
): Promise<WaveExecutionResult> {
|
|
1942
|
+
const startedAt = Date.now();
|
|
1943
|
+
const policy = config.failure.on_task_failure;
|
|
1944
|
+
|
|
1945
|
+
execLog("wave", `W${waveIndex}`, `starting wave execution`, {
|
|
1946
|
+
tasks: waveTasks.length,
|
|
1947
|
+
policy,
|
|
1948
|
+
batchId,
|
|
1949
|
+
});
|
|
1950
|
+
|
|
1951
|
+
// ── Stage 0: Ensure task files are committed ────────────────
|
|
1952
|
+
// Task folders may contain untracked files (PROMPT.md, STATUS.md) that
|
|
1953
|
+
// won't appear in worktrees unless committed. Stage and commit them now,
|
|
1954
|
+
// before worktree creation, so workers can find their task files.
|
|
1955
|
+
// Pass orchBranch so the staging commit is reflected in the orch branch
|
|
1956
|
+
// before worktrees are allocated from it.
|
|
1957
|
+
try {
|
|
1958
|
+
ensureTaskFilesCommitted(waveTasks, pending, repoRoot, waveIndex, orchBranch);
|
|
1959
|
+
} catch (err: unknown) {
|
|
1960
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
1961
|
+
execLog("wave", `W${waveIndex}`, `task file commit failed: ${errMsg}`);
|
|
1962
|
+
|
|
1963
|
+
return {
|
|
1964
|
+
waveIndex,
|
|
1965
|
+
startedAt,
|
|
1966
|
+
endedAt: Date.now(),
|
|
1967
|
+
laneResults: [],
|
|
1968
|
+
policyApplied: policy,
|
|
1969
|
+
stoppedEarly: true,
|
|
1970
|
+
failedTaskIds: waveTasks,
|
|
1971
|
+
skippedTaskIds: [],
|
|
1972
|
+
succeededTaskIds: [],
|
|
1973
|
+
blockedTaskIds: [...computeTransitiveDependents(new Set(waveTasks), dependencyGraph)],
|
|
1974
|
+
laneCount: 0,
|
|
1975
|
+
overallStatus: "failed",
|
|
1976
|
+
finalMonitorState: null,
|
|
1977
|
+
allocatedLanes: [],
|
|
1978
|
+
};
|
|
1979
|
+
}
|
|
1980
|
+
|
|
1981
|
+
// ── Stage 1: Allocate lanes ──────────────────────────────────
|
|
1982
|
+
const allocResult = allocateLanes(
|
|
1983
|
+
waveTasks,
|
|
1984
|
+
pending,
|
|
1985
|
+
config,
|
|
1986
|
+
repoRoot,
|
|
1987
|
+
batchId,
|
|
1988
|
+
orchBranch,
|
|
1989
|
+
workspaceConfig,
|
|
1990
|
+
);
|
|
1991
|
+
|
|
1992
|
+
if (!allocResult.success) {
|
|
1993
|
+
const errMsg = allocResult.error?.message || "Unknown allocation failure";
|
|
1994
|
+
execLog("wave", `W${waveIndex}`, `lane allocation failed: ${errMsg}`);
|
|
1995
|
+
|
|
1996
|
+
return {
|
|
1997
|
+
waveIndex,
|
|
1998
|
+
startedAt,
|
|
1999
|
+
endedAt: Date.now(),
|
|
2000
|
+
laneResults: [],
|
|
2001
|
+
policyApplied: policy,
|
|
2002
|
+
stoppedEarly: true,
|
|
2003
|
+
failedTaskIds: waveTasks, // All tasks in the wave are considered failed
|
|
2004
|
+
skippedTaskIds: [],
|
|
2005
|
+
succeededTaskIds: [],
|
|
2006
|
+
blockedTaskIds: [...computeTransitiveDependents(new Set(waveTasks), dependencyGraph)],
|
|
2007
|
+
laneCount: 0,
|
|
2008
|
+
overallStatus: "failed",
|
|
2009
|
+
finalMonitorState: null,
|
|
2010
|
+
allocatedLanes: [],
|
|
2011
|
+
allocationError: allocResult.error,
|
|
2012
|
+
};
|
|
2013
|
+
}
|
|
2014
|
+
|
|
2015
|
+
const lanes = allocResult.lanes;
|
|
2016
|
+
onLanesAllocated?.(lanes);
|
|
2017
|
+
|
|
2018
|
+
execLog("wave", `W${waveIndex}`, `lanes allocated`, {
|
|
2019
|
+
laneCount: lanes.length,
|
|
2020
|
+
totalTasks: waveTasks.length,
|
|
2021
|
+
});
|
|
2022
|
+
|
|
2023
|
+
// ── Stage 2+3: Start lanes in parallel + monitoring ──────────
|
|
2024
|
+
// Create per-wave pause signal that can be triggered by policy
|
|
2025
|
+
// while preserving the external pauseSignal from /orch-pause
|
|
2026
|
+
const wavePauseSignal = pauseSignal;
|
|
2027
|
+
|
|
2028
|
+
// Start lane execution promises
|
|
2029
|
+
// In workspace mode, pass the workspace root so lane sessions can find .pi/ config.
|
|
2030
|
+
// configPath is .pi/orchid-workspace.yaml → parent of parent is workspace root.
|
|
2031
|
+
const wsRoot = workspaceConfig ? dirname(dirname(workspaceConfig.configPath)) : undefined;
|
|
2032
|
+
const isWsMode = !!workspaceConfig;
|
|
2033
|
+
const backend: RuntimeBackend = "v2";
|
|
2034
|
+
if (runtimeBackend && runtimeBackend !== "v2") {
|
|
2035
|
+
execLog(
|
|
2036
|
+
"wave",
|
|
2037
|
+
`W${waveIndex}`,
|
|
2038
|
+
`legacy runtime backend '${runtimeBackend}' requested but ignored; using Runtime V2`,
|
|
2039
|
+
);
|
|
2040
|
+
}
|
|
2041
|
+
execLog("wave", `W${waveIndex}`, "using Runtime V2 backend (executeLaneV2)");
|
|
2042
|
+
|
|
2043
|
+
// Clear stale lane snapshots from prior waves before launching new workers.
|
|
2044
|
+
// Without this, the monitor reads a snapshot from wave N-1 (different taskId,
|
|
2045
|
+
// staleMs > 30s) and may falsely mark the new task as failed before the
|
|
2046
|
+
// worker has time to write its first snapshot.
|
|
2047
|
+
const snapshotStateRoot = resolveRuntimeStateRoot(repoRoot, wsRoot);
|
|
2048
|
+
for (const lane of lanes) {
|
|
2049
|
+
try {
|
|
2050
|
+
const snapPath = join(
|
|
2051
|
+
snapshotStateRoot,
|
|
2052
|
+
".pi",
|
|
2053
|
+
"runtime",
|
|
2054
|
+
batchId,
|
|
2055
|
+
"lanes",
|
|
2056
|
+
`lane-${lane.laneNumber}.json`,
|
|
2057
|
+
);
|
|
2058
|
+
if (existsSync(snapPath)) unlinkSync(snapPath);
|
|
2059
|
+
} catch {
|
|
2060
|
+
/* best effort */
|
|
2061
|
+
}
|
|
2062
|
+
}
|
|
2063
|
+
|
|
2064
|
+
const lanePromises = lanes.map((lane) =>
|
|
2065
|
+
executeLaneV2(
|
|
2066
|
+
lane,
|
|
2067
|
+
config,
|
|
2068
|
+
repoRoot,
|
|
2069
|
+
wavePauseSignal,
|
|
2070
|
+
wsRoot,
|
|
2071
|
+
isWsMode,
|
|
2072
|
+
{
|
|
2073
|
+
ORCH_BATCH_ID: batchId,
|
|
2074
|
+
TASKPLANE_SUPERVISOR_AUTONOMY: supervisorAutonomy,
|
|
2075
|
+
...buildWorkerEnv(workerConfig),
|
|
2076
|
+
...buildReviewerEnv(reviewerConfig),
|
|
2077
|
+
...buildWorkerExcludeEnv(workerExcludeExtensions),
|
|
2078
|
+
},
|
|
2079
|
+
onSupervisorAlert,
|
|
2080
|
+
onLaneTerminated,
|
|
2081
|
+
onLaneRespawned,
|
|
2082
|
+
),
|
|
2083
|
+
);
|
|
2084
|
+
|
|
2085
|
+
// Start monitoring as a sibling async loop
|
|
2086
|
+
// Monitor runs concurrently and stops when all lanes are terminal or paused
|
|
2087
|
+
const monitorStateRoot = resolveRuntimeStateRoot(repoRoot, wsRoot);
|
|
2088
|
+
const monitorPromise = monitorLanes(
|
|
2089
|
+
lanes,
|
|
2090
|
+
config,
|
|
2091
|
+
repoRoot,
|
|
2092
|
+
wavePauseSignal,
|
|
2093
|
+
waveIndex,
|
|
2094
|
+
onMonitorUpdate,
|
|
2095
|
+
isWsMode,
|
|
2096
|
+
backend,
|
|
2097
|
+
batchId,
|
|
2098
|
+
monitorStateRoot,
|
|
2099
|
+
);
|
|
2100
|
+
|
|
2101
|
+
// ── Stage 4: Wait for all lanes + apply policy ───────────────
|
|
2102
|
+
// We need to detect the first failure to apply policy.
|
|
2103
|
+
// Use Promise.allSettled on lanes, then check results.
|
|
2104
|
+
// For stop-all, we also need to react proactively.
|
|
2105
|
+
|
|
2106
|
+
let laneResults: LaneExecutionResult[];
|
|
2107
|
+
let finalMonitorState: MonitorState | null = null;
|
|
2108
|
+
|
|
2109
|
+
if (policy === "stop-all") {
|
|
2110
|
+
// For stop-all: race detection — as soon as any lane reports failure,
|
|
2111
|
+
// kill all sessions immediately.
|
|
2112
|
+
laneResults = await executeWithStopAll(lanes, lanePromises, wavePauseSignal, waveIndex);
|
|
2113
|
+
} else {
|
|
2114
|
+
// For skip-dependents and stop-wave:
|
|
2115
|
+
// Let all lanes run to completion (or until pauseSignal stops them).
|
|
2116
|
+
// For stop-wave, we set pauseSignal when we detect failure in results.
|
|
2117
|
+
const settled = await Promise.allSettled(lanePromises);
|
|
2118
|
+
|
|
2119
|
+
laneResults = settled.map((result, idx) => {
|
|
2120
|
+
if (result.status === "fulfilled") {
|
|
2121
|
+
return result.value;
|
|
2122
|
+
}
|
|
2123
|
+
// Rejected promise — shouldn't normally happen (executeLane catches errors)
|
|
2124
|
+
const errMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);
|
|
2125
|
+
execLog("wave", `W${waveIndex}`, `lane ${lanes[idx].laneId} promise rejected: ${errMsg}`);
|
|
2126
|
+
return {
|
|
2127
|
+
laneNumber: lanes[idx].laneNumber,
|
|
2128
|
+
laneId: lanes[idx].laneId,
|
|
2129
|
+
tasks: lanes[idx].tasks.map((t) => ({
|
|
2130
|
+
taskId: t.taskId,
|
|
2131
|
+
status: "failed" as LaneTaskStatus,
|
|
2132
|
+
startTime: null,
|
|
2133
|
+
endTime: null,
|
|
2134
|
+
exitReason: `Lane promise rejected: ${errMsg}`,
|
|
2135
|
+
sessionName: laneSessionIdOf(lanes[idx]),
|
|
2136
|
+
doneFileFound: false,
|
|
2137
|
+
laneNumber: lanes[idx].laneNumber,
|
|
2138
|
+
})),
|
|
2139
|
+
overallStatus: "failed" as const,
|
|
2140
|
+
startTime: startedAt,
|
|
2141
|
+
endTime: Date.now(),
|
|
2142
|
+
};
|
|
2143
|
+
});
|
|
2144
|
+
|
|
2145
|
+
// For stop-wave: if any task failed, set pause to prevent next wave
|
|
2146
|
+
if (policy === "stop-wave") {
|
|
2147
|
+
const hasFailure = laneResults.some((lr) =>
|
|
2148
|
+
lr.tasks.some((t) => t.status === "failed" || t.status === "stalled"),
|
|
2149
|
+
);
|
|
2150
|
+
if (hasFailure) {
|
|
2151
|
+
wavePauseSignal.paused = true;
|
|
2152
|
+
execLog("wave", `W${waveIndex}`, `stop-wave policy triggered — pausing after this wave`);
|
|
2153
|
+
}
|
|
2154
|
+
}
|
|
2155
|
+
}
|
|
2156
|
+
|
|
2157
|
+
// Stop the monitor (it should stop naturally when lanes are terminal,
|
|
2158
|
+
// but ensure it's stopped if we triggered pause)
|
|
2159
|
+
try {
|
|
2160
|
+
finalMonitorState = await monitorPromise;
|
|
2161
|
+
} catch {
|
|
2162
|
+
// Monitor error is non-fatal
|
|
2163
|
+
execLog("wave", `W${waveIndex}`, `monitor promise error (non-fatal)`);
|
|
2164
|
+
}
|
|
2165
|
+
|
|
2166
|
+
// ── Stage 5: Build WaveExecutionResult ───────────────────────
|
|
2167
|
+
const failedTaskIds: string[] = [];
|
|
2168
|
+
const skippedTaskIds: string[] = [];
|
|
2169
|
+
const succeededTaskIds: string[] = [];
|
|
2170
|
+
|
|
2171
|
+
for (const lr of laneResults) {
|
|
2172
|
+
for (const t of lr.tasks) {
|
|
2173
|
+
if (t.status === "succeeded") {
|
|
2174
|
+
succeededTaskIds.push(t.taskId);
|
|
2175
|
+
} else if (t.status === "failed" || t.status === "stalled") {
|
|
2176
|
+
failedTaskIds.push(t.taskId);
|
|
2177
|
+
} else if (t.status === "skipped") {
|
|
2178
|
+
skippedTaskIds.push(t.taskId);
|
|
2179
|
+
}
|
|
2180
|
+
}
|
|
2181
|
+
}
|
|
2182
|
+
|
|
2183
|
+
// Sort for deterministic output
|
|
2184
|
+
failedTaskIds.sort();
|
|
2185
|
+
skippedTaskIds.sort();
|
|
2186
|
+
succeededTaskIds.sort();
|
|
2187
|
+
|
|
2188
|
+
// Compute blocked tasks for future waves (skip-dependents policy)
|
|
2189
|
+
let blockedTaskIds: string[] = [];
|
|
2190
|
+
if (policy === "skip-dependents" && failedTaskIds.length > 0) {
|
|
2191
|
+
const blocked = computeTransitiveDependents(new Set(failedTaskIds), dependencyGraph);
|
|
2192
|
+
blockedTaskIds = [...blocked].sort();
|
|
2193
|
+
if (blockedTaskIds.length > 0) {
|
|
2194
|
+
execLog(
|
|
2195
|
+
"wave",
|
|
2196
|
+
`W${waveIndex}`,
|
|
2197
|
+
`skip-dependents: ${blockedTaskIds.length} task(s) blocked for future waves`,
|
|
2198
|
+
{
|
|
2199
|
+
blocked: blockedTaskIds.join(","),
|
|
2200
|
+
},
|
|
2201
|
+
);
|
|
2202
|
+
}
|
|
2203
|
+
}
|
|
2204
|
+
|
|
2205
|
+
// Determine overall wave status
|
|
2206
|
+
const stoppedEarly =
|
|
2207
|
+
(policy === "stop-all" && failedTaskIds.length > 0) ||
|
|
2208
|
+
(policy === "stop-wave" && failedTaskIds.length > 0);
|
|
2209
|
+
|
|
2210
|
+
let overallStatus: WaveExecutionResult["overallStatus"];
|
|
2211
|
+
if (policy === "stop-all" && failedTaskIds.length > 0) {
|
|
2212
|
+
overallStatus = "aborted";
|
|
2213
|
+
} else if (failedTaskIds.length === 0) {
|
|
2214
|
+
overallStatus = "succeeded";
|
|
2215
|
+
} else if (succeededTaskIds.length > 0) {
|
|
2216
|
+
overallStatus = "partial";
|
|
2217
|
+
} else {
|
|
2218
|
+
overallStatus = "failed";
|
|
2219
|
+
}
|
|
2220
|
+
|
|
2221
|
+
const endedAt = Date.now();
|
|
2222
|
+
const elapsedSec = Math.round((endedAt - startedAt) / 1000);
|
|
2223
|
+
|
|
2224
|
+
execLog("wave", `W${waveIndex}`, `wave execution complete: ${overallStatus}`, {
|
|
2225
|
+
succeeded: succeededTaskIds.length,
|
|
2226
|
+
failed: failedTaskIds.length,
|
|
2227
|
+
skipped: skippedTaskIds.length,
|
|
2228
|
+
blocked: blockedTaskIds.length,
|
|
2229
|
+
elapsed: `${elapsedSec}s`,
|
|
2230
|
+
stoppedEarly,
|
|
2231
|
+
});
|
|
2232
|
+
|
|
2233
|
+
return {
|
|
2234
|
+
waveIndex,
|
|
2235
|
+
startedAt,
|
|
2236
|
+
endedAt,
|
|
2237
|
+
laneResults,
|
|
2238
|
+
policyApplied: policy,
|
|
2239
|
+
stoppedEarly,
|
|
2240
|
+
failedTaskIds,
|
|
2241
|
+
skippedTaskIds,
|
|
2242
|
+
succeededTaskIds,
|
|
2243
|
+
blockedTaskIds,
|
|
2244
|
+
laneCount: lanes.length,
|
|
2245
|
+
overallStatus,
|
|
2246
|
+
finalMonitorState,
|
|
2247
|
+
allocatedLanes: lanes,
|
|
2248
|
+
};
|
|
2249
|
+
}
|
|
2250
|
+
|
|
2251
|
+
/**
 * Await already-started lanes under the stop-all failure policy.
 *
 * Every lane promise is observed individually. The first lane that either
 * resolves with a task in the "failed" or "stalled" state, or rejects,
 * flips `pauseSignal.paused` to true and kills the agents of every lane
 * (including its own). Later failures are recorded but do not trigger a
 * second kill round.
 *
 * When the triggering lane reports several failed tasks, the one named in
 * the log is chosen deterministically: earliest `startTime` first (a
 * missing start time counts as 0), then `taskId` in locale order.
 *
 * @param lanes - Allocated lanes, index-aligned with `lanePromises`
 * @param lanePromises - Already-started lane execution promises
 * @param pauseSignal - Shared pause flag; set to true when stop-all fires
 * @param waveIndex - Wave number, used only for log prefixes
 * @returns One result per lane, in lane order. A rejected lane is replaced
 *          by a synthetic result with every task marked "failed".
 */
export async function executeWithStopAll(
	lanes: AllocatedLane[],
	lanePromises: Promise<LaneExecutionResult>[],
	pauseSignal: { paused: boolean },
	waveIndex: number,
): Promise<LaneExecutionResult[]> {
	// Slot per lane, filled in as each lane settles.
	const collected: (LaneExecutionResult | null)[] = Array.from({ length: lanes.length }, () => null);
	let stopAllFired = false;

	const isFailureStatus = (status: LaneTaskStatus): boolean => status === "failed" || status === "stalled";

	// Terminates the agents of every lane in the wave, not just the failing one.
	const killEveryLane = (): void => {
		for (const lane of lanes) {
			killV2LaneAgents(laneSessionIdOf(lane), { laneNumber: lane.laneNumber });
		}
	};

	const watchLane = async (lanePromise: Promise<LaneExecutionResult>, idx: number): Promise<LaneExecutionResult> => {
		try {
			const outcome = await lanePromise;
			collected[idx] = outcome;

			// Someone else already triggered the abort — just record the outcome.
			if (stopAllFired) return outcome;

			const failures = outcome.tasks.filter((t) => isFailureStatus(t.status));
			if (failures.length === 0) return outcome;

			// First failure in the wave: latch the abort and pause the batch.
			stopAllFired = true;
			pauseSignal.paused = true;

			// Pick the task to name in the log (startTime, then taskId).
			failures.sort((a, b) => {
				const startA = a.startTime || 0;
				const startB = b.startTime || 0;
				return startA !== startB ? startA - startB : a.taskId.localeCompare(b.taskId);
			});
			const culprit = failures[0];

			execLog(
				"wave",
				`W${waveIndex}`,
				`stop-all triggered by ${culprit?.taskId || "unknown"} in ${lanes[idx].laneId}`,
				{
					session: laneSessionIdOf(lanes[idx]),
				},
			);

			killEveryLane();
			return outcome;
		} catch (err) {
			// A rejected lane promise is rare; treat it as a failure of the whole lane.
			const reason = err instanceof Error ? err.message : String(err);

			if (!stopAllFired) {
				stopAllFired = true;
				pauseSignal.paused = true;
				execLog("wave", `W${waveIndex}`, `stop-all triggered by lane error in ${lanes[idx].laneId}: ${reason}`);
				killEveryLane();
			}

			// Synthesize a result so the caller still gets one entry per lane.
			const synthetic: LaneExecutionResult = {
				laneNumber: lanes[idx].laneNumber,
				laneId: lanes[idx].laneId,
				tasks: lanes[idx].tasks.map((t) => ({
					taskId: t.taskId,
					status: "failed" as LaneTaskStatus,
					startTime: null,
					endTime: null,
					exitReason: `Lane aborted: ${reason}`,
					sessionName: laneSessionIdOf(lanes[idx]),
					doneFileFound: false,
					laneNumber: lanes[idx].laneNumber,
				})),
				overallStatus: "failed",
				startTime: Date.now(),
				endTime: Date.now(),
			};
			collected[idx] = synthetic;
			return synthetic;
		}
	};

	// Let every lane settle; killed lanes are expected to exit promptly.
	await Promise.allSettled(lanePromises.map((p, idx) => watchLane(p, idx)));

	// Defensive: any slot still empty becomes a task-less failed result.
	return collected.map((entry, idx) => {
		if (entry) return entry;
		return {
			laneNumber: lanes[idx].laneNumber,
			laneId: lanes[idx].laneId,
			tasks: [],
			overallStatus: "failed" as const,
			startTime: Date.now(),
			endTime: Date.now(),
		};
	});
}
|
|
2377
|
+
|
|
2378
|
+
// ── Runtime V2 Bridge Helpers (TP-102) ─────────────────────────────────────
|
|
2379
|
+
//
|
|
2380
|
+
// These helpers bridge between existing legacy data structures
|
|
2381
|
+
// (AllocatedLane, AllocatedTask, resolveCanonicalTaskPaths) and
|
|
2382
|
+
// Runtime V2 contracts (ExecutionUnit, PacketPaths, RuntimeAgentId).
|
|
2383
|
+
//
|
|
2384
|
+
// They are additive — existing code paths continue to work.
|
|
2385
|
+
// Runtime V2 consumers can start using these to avoid coupling to
|
|
2386
|
+
// legacy lane-session naming, cwd-derived paths, or extension lifecycle assumptions.
|
|
2387
|
+
// ────────────────────────────────────────────────────────────────────────────
|
|
2388
|
+
|
|
2389
|
+
/**
 * Translate a legacy AllocatedLane + AllocatedTask pair into a Runtime V2
 * ExecutionUnit.
 *
 * Packet paths are derived in one of two ways:
 *
 * - **Cross-repo** (the task carries a `packetTaskPath` and its
 *   `packetRepoId` differs from the lane's repo): the absolute
 *   `packetTaskPath` is handed to `resolvePacketPaths`.
 * - **Same repo** (everything else): paths come from
 *   `resolveCanonicalTaskPaths`, i.e. they are resolved relative to the
 *   lane worktree, so STATUS.md / .DONE / .reviews live in the worktree.
 *
 * The unit `id` is the task's `activeSegmentId` when one is set, otherwise
 * the plain task ID. A lane without a `repoId` is reported as repo
 * "default"; a task without a `packetRepoId` inherits the execution repo.
 *
 * @param lane - Allocated lane supplying the worktree path and repo ID
 * @param task - Allocated task to build an execution unit for
 * @param repoRoot - Main repository root
 * @param isWorkspaceMode - Whether workspace mode is active
 * @returns The assembled ExecutionUnit
 * @throws ExecutionError with code "EXEC_MISSING_TASK_FOLDER" when the task has
 *         no (or an empty) `taskFolder` (TP-169)
 *
 * @since TP-102
 */
export function buildExecutionUnit(
	lane: AllocatedLane,
	task: AllocatedTask,
	repoRoot: string,
	isWorkspaceMode?: boolean,
): ExecutionUnit {
	// TP-169: task stubs rebuilt from persisted state may lack a taskFolder;
	// fail loudly here rather than resolving paths from an empty folder.
	const folder = task.task?.taskFolder;
	if (!folder) {
		const folderState = folder === "" ? "empty" : "undefined";
		const message = [
			`Cannot build execution unit for task ${task.taskId}: taskFolder is ${folderState}. `,
			`This typically means the task's persisted record was not enriched with discovery data. `,
			`Re-run discovery or check that the task exists in the task area.`,
		].join("");
		throw new ExecutionError("EXEC_MISSING_TASK_FOLDER", message, "execution", task.taskId);
	}

	const resolved = resolveCanonicalTaskPaths(folder, lane.worktreePath, repoRoot, isWorkspaceMode);

	const executionRepoId = lane.repoId ?? "default";
	const packetHomeRepoId = task.task.packetRepoId ?? executionRepoId;

	// Segment executions are identified by their segment ID; plain tasks by task ID.
	const segmentId = task.task.activeSegmentId ?? null;
	const id = segmentId ?? task.taskId;

	// Only a genuine cross-repo packet uses its absolute path. Same-repo
	// packets resolve inside the worktree so artifacts are written there.
	const isCrossRepoPacket = Boolean(task.task.packetTaskPath) && packetHomeRepoId !== executionRepoId;

	let packet: ExecutionUnit["packet"];
	if (isCrossRepoPacket) {
		packet = resolvePacketPaths(task.task.packetTaskPath!);
	} else {
		const base = resolved.taskFolderResolved;
		packet = {
			promptPath: `${base}/PROMPT.md`,
			statusPath: resolved.statusPath,
			donePath: resolved.donePath,
			reviewsDir: `${base}/.reviews`,
			taskFolder: base,
		};
	}

	return {
		id,
		taskId: task.taskId,
		segmentId,
		executionRepoId,
		packetHomeRepoId,
		worktreePath: lane.worktreePath,
		packet,
		task: task.task,
	};
}
|
|
2484
|
+
|
|
2485
|
+
/**
 * Derive a Runtime V2 agent ID for a lane from the lane's session ID.
 *
 * Naming rules, all based on `laneSessionIdOf(lane)`:
 * - role "merger" with a `mergeIndex`: the trailing `-lane-<n>` is stripped
 *   and `-merge-<mergeIndex>` is appended;
 * - role "lane-runner": the session ID itself;
 * - any other role (including "merger" without an index): session ID plus
 *   `-<role>`.
 *
 * @param lane - Allocated lane whose session ID seeds the agent ID
 * @param role - Agent role
 * @param mergeIndex - Merge wave index (only meaningful for merge agents)
 * @returns The agent ID
 *
 * @since TP-102
 */
export function buildAgentIdFromLane(
	lane: AllocatedLane,
	role: RuntimeAgentRole,
	mergeIndex?: number,
): RuntimeAgentId {
	const sessionId = laneSessionIdOf(lane);

	// Merge agents are batch-scoped rather than lane-scoped, so drop the lane suffix.
	if (role === "merger" && mergeIndex != null) {
		const batchPrefix = sessionId.replace(/-lane-\d+$/, "");
		return `${batchPrefix}-merge-${mergeIndex}`;
	}

	return role === "lane-runner" ? sessionId : `${sessionId}-${role}`;
}
|
|
2517
|
+
|
|
2518
|
+
/**
|
|
2519
|
+
* Resolve the Runtime V2 state root from available context.
|
|
2520
|
+
*
|
|
2521
|
+
* The state root is where `.pi/runtime/` artifacts live. In workspace
|
|
2522
|
+
* mode this is the workspace root; in repo mode it's the repo root.
|
|
2523
|
+
*
|
|
2524
|
+
* This centralizes the resolution so Runtime V2 code doesn't need
|
|
2525
|
+
* to repeat the workspace-vs-repo logic.
|
|
2526
|
+
*
|
|
2527
|
+
* @param repoRoot - Main repository root
|
|
2528
|
+
* @param workspaceRoot - Workspace root (undefined in repo mode)
|
|
2529
|
+
* @returns Absolute path to use as the state root for .pi/ artifacts
|
|
2530
|
+
*
|
|
2531
|
+
* @since TP-102
|
|
2532
|
+
*/
|
|
2533
|
+
/**
 * Parse an agent .md file into its frontmatter map and body text.
 *
 * Frontmatter is recognized only when the file (ignoring leading
 * whitespace / BOM) opens with a `---` fence and a later line starts with
 * a closing `---` fence. Each frontmatter line of the form `key: value` is
 * stored as a string; other lines are ignored. When no frontmatter is
 * recognized the whole file is returned as the body with an empty map, so
 * a `---` horizontal rule inside a plain markdown file is never mistaken
 * for a fence.
 *
 * @param filePath - Path of the agent definition file
 * @returns `{ fm, body }`, or null if the file does not exist or reading it throws
 * @since TP-117
 */
function parseAgentFile(filePath: string): { fm: Record<string, string>; body: string } | null {
	try {
		if (!existsSync(filePath)) return null;
		const raw = readFileSync(filePath, "utf-8");

		// trimStart() also drops a UTF-8 BOM, which readFileSync leaves in place.
		const content = raw.trimStart();
		if (!content.startsWith("---")) return { fm: {}, body: raw.trim() };

		// The closing fence must begin a line; searching from offset 3 lets an
		// immediately-following fence ("---\n---") count as empty frontmatter.
		const fmEnd = content.indexOf("\n---", 3);
		if (fmEnd < 0) return { fm: {}, body: raw.trim() };

		const fmBlock = content.slice(3, fmEnd).trim();
		const fm: Record<string, string> = {};
		for (const line of fmBlock.split("\n")) {
			const m = line.match(/^([\w-]+)\s*:\s*(.+)/);
			if (m) fm[m[1]] = m[2].trim();
		}
		// Skip the newline plus the three fence characters.
		return { fm, body: content.slice(fmEnd + 4).trim() };
	} catch {
		return null;
	}
}
|
|
2555
|
+
|
|
2556
|
+
/**
 * Load the body of the packaged (base) agent template for `agentName`.
 *
 * The template path comes from `resolveTaskplaneAgentTemplate`. The result is
 * the parsed file's body, or "" when the template is missing, has an empty
 * body, or anything along the way throws. Callers therefore cannot
 * distinguish "no template" from "empty template".
 *
 * @param agentName - Agent name used to locate the template
 * @returns The template body, or "" if unavailable
 * @since TP-117
 */
function loadBaseAgentPrompt(agentName: string): string {
	let prompt = "";
	try {
		const templatePath = resolveTaskplaneAgentTemplate(agentName);
		const parsed = existsSync(templatePath) ? parseAgentFile(templatePath) : null;
		if (parsed?.body) {
			prompt = parsed.body;
		}
	} catch {
		/* resolution or parsing failed — keep the empty fallback */
	}
	return prompt;
}
|
|
2576
|
+
|
|
2577
|
+
/**
 * Load the project-local prompt for `agentName`.
 *
 * Looks at `<stateRoot>/.pi/agents/<agentName>.md` first, then
 * `<stateRoot>/agents/<agentName>.md`. For the first candidate that parses:
 * - with `standalone: true` in its frontmatter, its body is returned as-is
 *   (even when empty);
 * - otherwise its body is returned if non-empty; an empty body falls
 *   through to the next candidate.
 *
 * @param stateRoot - Directory that contains `.pi/agents` or `agents`
 * @param agentName - Agent name (file stem)
 * @returns The local prompt body, or "" when nothing usable is found
 * @since TP-117
 */
function loadLocalAgentPrompt(stateRoot: string, agentName: string): string {
	const fileName = `${agentName}.md`;
	const candidates = [join(stateRoot, ".pi", "agents", fileName), join(stateRoot, "agents", fileName)];

	for (const candidate of candidates) {
		const parsed = parseAgentFile(candidate);
		if (!parsed) continue;

		// Standalone files win outright; otherwise only a non-empty body counts.
		const isStandalone = parsed.fm.standalone === "true";
		if (isStandalone || parsed.body) {
			return parsed.body;
		}
	}
	return "";
}
|
|
2598
|
+
|
|
2599
|
+
// ── Agent Definition Loading ─────────────────────────────────────────
|
|
2600
|
+
|
|
2601
|
+
/**
 * Log-once latch for the agent pointer warning: true once the warning has
 * been printed in this process.
 */
let _execPointerWarningLogged = false;

/**
 * Clear the pointer-warning latch so the warning can be logged again.
 * Intended for tests that need to observe the warning more than once.
 *
 * @since TP-161
 */
export function resetPointerWarning(): void {
	_execPointerWarningLogged = false;
}
|
|
2611
|
+
|
|
2612
|
+
/**
 * Resolve the agent root directory from the workspace pointer.
 *
 * The workspace root is read from the `TASKPLANE_WORKSPACE_ROOT` environment
 * variable; when it is unset or empty the function returns null without
 * touching the workspace config. If pointer resolution reports a warning,
 * it is written to stderr at most once per process (until
 * `_execPointerWarningLogged` is reset).
 *
 * @returns The pointer's `agentRoot`, or null when there is no workspace
 *          root, the pointer has no agent root, or resolution throws
 */
function resolveAgentPointerRoot(): string | null {
	const workspaceRoot = process.env.TASKPLANE_WORKSPACE_ROOT;
	if (!workspaceRoot) {
		return null;
	}

	try {
		const pointer = resolvePointer(workspaceRoot, loadWorkspaceConfig(workspaceRoot));

		const warning = pointer?.warning;
		if (warning && !_execPointerWarningLogged) {
			// Latch first so the message is emitted only once.
			_execPointerWarningLogged = true;
			console.error(`[execution] pointer: ${warning}`);
		}

		return pointer?.agentRoot ?? null;
	} catch {
		// Any config/pointer failure is treated the same as "no pointer".
		return null;
	}
}
|
|
2631
|
+
|
|
2632
|
+
/**
 * Load a complete agent definition (systemPrompt + tools + model) by name.
 *
 * Two sources are consulted:
 *
 * - **Local file** — the first of these that parses:
 *   1. `<cwd>/.pi/agents/<name>.md`
 *   2. `<cwd>/agents/<name>.md`
 *   3. `<agentRoot>/<name>.md`, where `agentRoot` comes from the workspace
 *      pointer (only present when the pointer resolves)
 * - **Base template** — the packaged template located via
 *   `resolveTaskplaneAgentTemplate(name)`; failures to resolve it are ignored.
 *
 * Composition:
 * - A local file with `standalone: true` is used alone; the base is ignored.
 * - Otherwise the prompt is the base body, followed (when the local body is
 *   non-empty) by a "Project-Specific Guidance" section holding the local
 *   body. The composed prompt is trimmed.
 * - `tools` and `model` take the local frontmatter value first, then the
 *   base value, then the defaults `"read,grep,find,ls"` and `""`.
 *
 * @param cwd - Working directory (project root) to search for local agent files
 * @param name - Agent name (e.g., "task-worker", "task-reviewer")
 * @returns The agent definition, or null if neither a base nor a local file was found
 * @since TP-161
 */
export function loadAgentDef(
	cwd: string,
	name: string,
): { systemPrompt: string; tools: string; model: string } | null {
	type ParsedAgent = { fm: Record<string, string>; body: string };
	const fallbackTools = "read,grep,find,ls";
	const fileName = `${name}.md`;

	// Candidate local files, highest priority first.
	const candidates = [join(cwd, ".pi", "agents", fileName), join(cwd, "agents", fileName)];
	const pointerRoot = resolveAgentPointerRoot();
	if (pointerRoot) {
		candidates.push(join(pointerRoot, fileName));
	}

	// Packaged base template (optional).
	let base: ParsedAgent | null = null;
	try {
		const templatePath = resolveTaskplaneAgentTemplate(name);
		if (existsSync(templatePath)) {
			base = parseAgentFile(templatePath);
		}
	} catch {
		/* no base template available */
	}

	// First local candidate that parses wins.
	let local: ParsedAgent | null = null;
	for (const candidate of candidates) {
		const parsed = parseAgentFile(candidate);
		if (parsed) {
			local = parsed;
			break;
		}
	}

	if (!base && !local) {
		return null;
	}

	// Standalone local definitions bypass the base entirely.
	if (local?.fm.standalone === "true") {
		return {
			systemPrompt: local.body,
			tools: local.fm.tools || fallbackTools,
			model: local.fm.model || "",
		};
	}

	const basePrompt = base?.body || "";
	const localPrompt = local?.body || "";
	let systemPrompt = basePrompt;
	if (localPrompt) {
		systemPrompt = `${basePrompt}\n\n---\n\n## Project-Specific Guidance\n\n${localPrompt}`;
	}

	return {
		systemPrompt: systemPrompt.trim(),
		// Local frontmatter overrides base frontmatter.
		tools: local?.fm.tools || base?.fm.tools || fallbackTools,
		model: local?.fm.model || base?.fm.model || "",
	};
}
|
|
2704
|
+
|
|
2705
|
+
/**
 * Pick the directory used as the Runtime V2 state root.
 *
 * @param repoRoot - Main repository root
 * @param workspaceRoot - Workspace root, if any
 * @returns `workspaceRoot` when it is neither null nor undefined, otherwise `repoRoot`
 */
export function resolveRuntimeStateRoot(repoRoot: string, workspaceRoot?: string): string {
	if (workspaceRoot == null) {
		return repoRoot;
	}
	return workspaceRoot;
}
|
|
2708
|
+
|
|
2709
|
+
// ── Runtime V2 Lane Execution (TP-105) ────────────────────────────
|
|
2710
|
+
|
|
2711
|
+
import { executeTaskV2, type LaneRunnerConfig, type LaneRunnerTaskResult } from "./lane-runner.ts";
|
|
2712
|
+
import { DEFAULT_WORKER_USER_TOOLS } from "./agent-host.ts";
|
|
2713
|
+
|
|
2714
|
+
/**
|
|
2715
|
+
* Execute a lane using the Runtime V2 headless backend.
|
|
2716
|
+
*
|
|
2717
|
+
* This replaces the legacy session-backed `executeLane()` for lanes that
|
|
2718
|
+
* should run on the new direct-child architecture. It uses the
|
|
2719
|
+
* lane-runner module which spawns workers via agent-host.ts instead
|
|
2720
|
+
* of terminal-session-backed workers.
|
|
2721
|
+
*
|
|
2722
|
+
* The function signature is deliberately close to the legacy
|
|
2723
|
+
* `executeLane()` to minimize integration churn in the engine.
|
|
2724
|
+
* The key difference: no legacy lane sessions are created.
|
|
2725
|
+
*
|
|
2726
|
+
* @since TP-105
|
|
2727
|
+
*/
|
|
2728
|
+
|
|
2729
|
+
/**
|
|
2730
|
+
* Build reviewer env vars from a TaskRunnerConfig or reviewer config object.
|
|
2731
|
+
* Used to ensure reviewer config is consistently passed to executeLaneV2
|
|
2732
|
+
* across all call sites (initial waves, resume, retries).
|
|
2733
|
+
*
|
|
2734
|
+
* Returns only the keys that have non-empty values, so that empty/inherit
|
|
2735
|
+
* config does not override inherited env vars from the parent process.
|
|
2736
|
+
*
|
|
2737
|
+
* @since TP-160
|
|
2738
|
+
*/
|
|
2739
|
+
/**
 * Decode an env var value that is expected to hold a JSON array of strings.
 *
 * Non-string array entries are dropped. A missing/empty value, malformed
 * JSON, or JSON that is not an array all yield an empty array.
 *
 * @param value - Raw env var value
 * @returns The string entries of the decoded array, in order
 * @since TP-180
 */
function parseJsonArrayEnv(value?: string): string[] {
	if (!value) return [];

	let decoded: unknown;
	try {
		decoded = JSON.parse(value);
	} catch {
		/* malformed JSON is treated as "nothing configured" */
		return [];
	}

	if (!Array.isArray(decoded)) return [];

	const strings: string[] = [];
	for (const entry of decoded) {
		if (typeof entry === "string") strings.push(entry);
	}
	return strings;
}
|
|
2754
|
+
|
|
2755
|
+
/**
 * Translate a reviewer config object into `TASKPLANE_REVIEWER_*` env vars.
 *
 * Only truthy fields are emitted, so an empty or missing config produces an
 * empty record. `excludeExtensions` is forwarded as a JSON-encoded array and
 * only when it has at least one entry (TP-180).
 *
 * @param reviewerConfig - Reviewer settings; may be omitted or null.
 * @returns Env var record containing only the configured keys.
 */
export function buildReviewerEnv(
	reviewerConfig?: {
		model?: string;
		thinking?: string;
		tools?: string;
		excludeExtensions?: string[];
	} | null,
): Record<string, string> {
	const env: Record<string, string> = {};
	if (!reviewerConfig) return env;

	const { model, thinking, tools, excludeExtensions } = reviewerConfig;
	if (model) {
		env.TASKPLANE_REVIEWER_MODEL = model;
	}
	if (thinking) {
		env.TASKPLANE_REVIEWER_THINKING = thinking;
	}
	if (tools) {
		env.TASKPLANE_REVIEWER_TOOLS = tools;
	}
	// TP-180: reviewer extension exclusions travel as a JSON array string.
	if (excludeExtensions && excludeExtensions.length > 0) {
		env.TASKPLANE_REVIEWER_EXCLUDE_EXTENSIONS = JSON.stringify(excludeExtensions);
	}
	return env;
}
|
|
2773
|
+
|
|
2774
|
+
/**
 * Translate a worker config object into `TASKPLANE_WORKER_*` env vars.
 *
 * Emits model/thinking/tools only when they are truthy, following the same
 * pattern as `buildReviewerEnv`. The `excludeExtensions` field is accepted
 * in the parameter type but is not emitted by this function.
 *
 * @param workerConfig - Worker settings; may be omitted or null.
 * @returns Env var record containing only the configured keys.
 * @since TP-181
 */
export function buildWorkerEnv(
	workerConfig?: {
		model?: string;
		thinking?: string;
		tools?: string;
		excludeExtensions?: string[];
	} | null,
): Record<string, string> {
	const env: Record<string, string> = {};
	if (!workerConfig) return env;

	const { model, thinking, tools } = workerConfig;
	if (model) {
		env.TASKPLANE_WORKER_MODEL = model;
	}
	if (thinking) {
		env.TASKPLANE_WORKER_THINKING = thinking;
	}
	if (tools) {
		env.TASKPLANE_WORKER_TOOLS = tools;
	}
	return env;
}
|
|
2797
|
+
|
|
2798
|
+
/**
 * Encode the worker extension exclusion list as an env var.
 *
 * @param workerExcludeExtensions - Extension names to exclude; may be
 *   omitted, null, or empty.
 * @returns A record holding `TASKPLANE_WORKER_EXCLUDE_EXTENSIONS` as a JSON
 *   array string when the list is non-empty, otherwise an empty record.
 * @since TP-180
 */
export function buildWorkerExcludeEnv(
	workerExcludeExtensions?: string[] | null,
): Record<string, string> {
	const hasExclusions = Boolean(workerExcludeExtensions) && workerExcludeExtensions!.length > 0;
	if (!hasExclusions) {
		return {};
	}
	return {
		TASKPLANE_WORKER_EXCLUDE_EXTENSIONS: JSON.stringify(workerExcludeExtensions),
	};
}
|
|
2811
|
+
|
|
2812
|
+
/**
 * Run every task of a lane, in order, through the Runtime V2 lane runner.
 *
 * For each task the function builds an execution unit and a
 * `LaneRunnerConfig`, then awaits `executeTaskV2`. After a successful task
 * its artifacts are committed and, unless it was the last task, the worktree
 * is reset (`git checkout -- .` + `git clean -fd`). Once a task fails or
 * stalls, or once `pauseSignal.paused` is observed, every remaining task is
 * recorded as "skipped". If the task attempt throws, the task is recorded as
 * failed with a `spawn_failure` exit diagnostic and a terminal lane snapshot
 * is written (best effort).
 *
 * @param lane - Allocated lane (tasks, worktree path, branch, lane number).
 * @param config - Orchestrator config; supplies batch id, session prefix and
 *   `failure.max_worker_minutes`.
 * @param repoRoot - Repository root; also the state root when no workspace
 *   root is given.
 * @param pauseSignal - Shared flag checked before each task.
 * @param workspaceRoot - Optional workspace root, preferred as state root.
 * @param isWorkspaceMode - Forwarded to `buildExecutionUnit`.
 * @param extraEnvVars - `TASKPLANE_*` / `ORCH_BATCH_ID` values that configure
 *   worker, reviewer, autonomy and project name.
 * @param onSupervisorAlert - Forwarded to the lane runner config.
 * @param onLaneTerminated - Forwarded to the lane runner config.
 * @param onLaneRespawned - Invoked once before the first task; errors it
 *   throws are logged and swallowed.
 * @returns Per-task outcomes plus an overall status: "succeeded" when every
 *   outcome succeeded, "failed" when any failed or stalled, else "partial".
 */
export async function executeLaneV2(
	lane: AllocatedLane,
	config: OrchestratorConfig,
	repoRoot: string,
	pauseSignal: { paused: boolean },
	workspaceRoot?: string,
	isWorkspaceMode?: boolean,
	extraEnvVars?: Record<string, string>,
	onSupervisorAlert?: SupervisorAlertCallback,
	onLaneTerminated?: import("./types.ts").LaneTerminatedCallback,
	/**
	 * TP-187 (#538): Optional callback fired BEFORE the first task of this
	 * lane begins. The supervisor process uses it to lift any zombie-alert
	 * suppression that was applied when this lane number was previously
	 * terminated (e.g., in a prior wave).
	 */
	onLaneRespawned?: (laneNumber: number, agentId: string, batchId: string) => void,
): Promise<LaneExecutionResult> {
	const laneId = lane.laneId;
	const laneStartTime = Date.now();
	const outcomes: LaneTaskOutcome[] = [];
	let shouldSkipRemaining = false;

	const stateRoot = resolveRuntimeStateRoot(repoRoot, workspaceRoot);
	const batchId = config.orchestrator?.batchId || extraEnvVars?.ORCH_BATCH_ID || String(Date.now());

	// Build agent ID prefix — must match the wave planner's naming (TP-115).
	// Uses resolveOperatorId() so agent registry keys align with lane session IDs.
	const sessionPrefix = config.orchestrator?.sessionPrefix ?? "orch";
	const opId = resolveOperatorId(config);
	const agentIdPrefix = `${sessionPrefix}-${opId}`;

	// Load worker agent definition: compose base template + local project guidance.
	// The base template (templates/agents/task-worker.md) contains critical behavioral
	// rules: checkpoint discipline, STATUS.md resume algorithm, review_step instructions.
	// The local file (.pi/agents/task-worker.md) adds project-specific guidance.
	let workerSystemPrompt =
		"You are a task execution agent. Read STATUS.md first, find unchecked items, work on them, checkpoint after each.";
	let workerSegmentPrompt = "";
	try {
		const basePrompt = loadBaseAgentPrompt("task-worker");
		const localPrompt = loadLocalAgentPrompt(stateRoot, "task-worker");
		if (basePrompt && localPrompt) {
			workerSystemPrompt = basePrompt + "\n\n---\n\n## Project-Specific Guidance\n\n" + localPrompt;
		} else if (basePrompt) {
			workerSystemPrompt = basePrompt;
		} else if (localPrompt) {
			workerSystemPrompt = localPrompt;
		}
		// Load segment-scoped prompt overlay (appended when isSegmentScoped)
		const segPrompt = loadBaseAgentPrompt("task-worker-segment");
		if (segPrompt) workerSegmentPrompt = segPrompt;
	} catch {
		/* use default */
	}

	execLog(laneId, "LANE", `starting Runtime V2 execution of ${lane.tasks.length} task(s)`, {
		worktree: lane.worktreePath,
		agentPrefix: agentIdPrefix,
	});

	// TP-187 (#538): Lane is freshly starting — emit lane-respawned so any
	// zombie-alert suppression carried over from a prior wave's termination of
	// this lane number is lifted before new alerts begin to flow.
	if (onLaneRespawned) {
		try {
			onLaneRespawned(
				lane.laneNumber,
				buildRuntimeAgentId(agentIdPrefix, lane.laneNumber, "worker"),
				batchId,
			);
		} catch (err) {
			execLog(
				laneId,
				"LANE",
				`lane-respawned callback failed: ${err instanceof Error ? err.message : String(err)}`,
			);
		}
	}

	// Supervisor autonomy does not depend on the task, so resolve it once.
	// Unknown values fall back to "autonomous".
	const rawAutonomy = String(
		extraEnvVars?.TASKPLANE_SUPERVISOR_AUTONOMY ?? "autonomous",
	).toLowerCase();
	const supervisorAutonomy: LaneRunnerConfig["supervisorAutonomy"] =
		rawAutonomy === "interactive" || rawAutonomy === "supervised" || rawAutonomy === "autonomous"
			? (rawAutonomy as LaneRunnerConfig["supervisorAutonomy"])
			: "autonomous";

	const lastTaskIndex = lane.tasks.length - 1;

	for (const [taskIndex, task] of lane.tasks.entries()) {
		const taskSegmentId = task.task.activeSegmentId ?? null;
		if (shouldSkipRemaining || pauseSignal.paused) {
			const reason = pauseSignal.paused
				? "Skipped due to pause signal"
				: "Skipped due to prior task failure in lane";
			outcomes.push({
				taskId: task.taskId,
				status: "skipped",
				segmentId: taskSegmentId,
				startTime: null,
				endTime: null,
				exitReason: reason,
				sessionName: buildRuntimeAgentId(agentIdPrefix, lane.laneNumber, "worker"),
				doneFileFound: false,
				laneNumber: lane.laneNumber,
			});
			continue;
		}

		// Build execution unit
		const unit = buildExecutionUnit(lane, task, repoRoot, isWorkspaceMode);

		const laneRunnerConfig: LaneRunnerConfig = {
			batchId,
			agentIdPrefix,
			laneNumber: lane.laneNumber,
			worktreePath: lane.worktreePath,
			branch: lane.branch,
			repoId: lane.repoId ?? "default",
			stateRoot,
			workerModel: extraEnvVars?.TASKPLANE_WORKER_MODEL || "",
			// TP-184: This is the user-tools default. Engine bridge tools are NOT
			// added here — buildWorkerToolsAllowlist() at the lane-runner spawn
			// site appends ENGINE_BRIDGE_TOOLS exactly once, regardless of source.
			workerTools: extraEnvVars?.TASKPLANE_WORKER_TOOLS || DEFAULT_WORKER_USER_TOOLS,
			workerThinking: extraEnvVars?.TASKPLANE_WORKER_THINKING || "",
			workerSystemPrompt,
			workerSegmentPrompt,
			reviewerModel: extraEnvVars?.TASKPLANE_REVIEWER_MODEL || "",
			reviewerThinking: extraEnvVars?.TASKPLANE_REVIEWER_THINKING || "",
			reviewerTools: extraEnvVars?.TASKPLANE_REVIEWER_TOOLS || "",
			// TP-180: Extension exclusion lists from config
			workerExcludeExtensions: parseJsonArrayEnv(extraEnvVars?.TASKPLANE_WORKER_EXCLUDE_EXTENSIONS),
			reviewerExcludeExtensions: parseJsonArrayEnv(
				extraEnvVars?.TASKPLANE_REVIEWER_EXCLUDE_EXTENSIONS,
			),
			supervisorAutonomy,
			// TP-195: replaced `config.project?.name` (no `project` field on
			// `OrchestratorConfig`; always undefined) with the env-var read
			// already used elsewhere in the codebase (lane-runner.ts:668 sets
			// `TASKPLANE_PROJECT_NAME` from the same source). When the env
			// var is unset, falls through to the same `"project"` literal as
			// before — behavior-neutral.
			projectName: extraEnvVars?.TASKPLANE_PROJECT_NAME || "project",
			maxIterations: 20,
			noProgressLimit: 3,
			// TP-195: read the canonical `max_worker_minutes` field (snake_case
			// per `OrchestratorConfig.failure` in types.ts). The previous code
			// read a non-existent `maxWorkerMinutes` camelCase alias — always
			// undefined — silently ignoring any operator-set value. Honoring
			// the config is the intended behavior; default of 120 preserved
			// when the field is unset.
			maxWorkerMinutes: config.failure?.max_worker_minutes || 120,
			warnPercent: 85,
			killPercent: 95,
			onSupervisorAlert,
			onLaneTerminated,
		};

		// NOTE(review): the catch below also receives errors thrown by the
		// post-run commit/reset steps and tags them as `spawn_failure` —
		// confirm whether those helpers can throw and whether that is intended.
		try {
			const result = await executeTaskV2(unit, laneRunnerConfig, pauseSignal);
			outcomes.push({
				...result.outcome,
				laneNumber: result.outcome.laneNumber ?? lane.laneNumber,
			});

			// Commit artifacts after success (same as legacy path)
			if (result.outcome.status === "succeeded") {
				commitTaskArtifacts(lane, task, laneId);
				// Reset worktree for next task. The loop index is used rather
				// than indexOf() so the check is O(1) and position-accurate.
				if (taskIndex < lastTaskIndex) {
					runGit(["checkout", "--", "."], lane.worktreePath);
					runGit(["clean", "-fd"], lane.worktreePath);
				}
			}

			if (result.outcome.status === "failed" || result.outcome.status === "stalled") {
				shouldSkipRemaining = true;
			}
		} catch (err: unknown) {
			const errMsg = err instanceof Error ? err.message : String(err);
			execLog(laneId, task.taskId, `Runtime V2 execution error: ${errMsg}`);

			// TP-190 (#561): Spawn-stage failures (Pi CLI not findable, worktree
			// provisioning failure, etc.) reach this catch synchronously —
			// `spawnAgent()` calls `resolvePiCliPath()` and other resolvers that
			// throw before any process is registered. Tag the outcome with the
			// `spawn_failure` ExitClassification so:
			//   1. The retry classifier (TIER0_RETRYABLE_CLASSIFICATIONS) excludes
			//      it deterministically — spawn errors are never transient.
			//   2. The supervisor `task-failure` IPC alert can carry
			//      `context.exitCategory = "spawn_failure"` so the playbook can
			//      escalate immediately rather than retrying.
			//   3. The engine's post-wave logic can transition `phase` to
			//      `"failed"` when every lane in a wave spawn-failed.
			const spawnExitDiagnostic: TaskExitDiagnostic = {
				classification: "spawn_failure",
				exitCode: null,
				errorMessage: errMsg,
				tokensUsed: null,
				contextPct: null,
				partialProgressCommits: 0,
				partialProgressBranch: null,
				durationSec: 0,
				lastKnownStep: null,
				lastKnownCheckbox: null,
				repoId: lane.repoId ?? "default",
			};
			const workerAgentId = buildRuntimeAgentId(agentIdPrefix, lane.laneNumber, "worker");
			// One timestamp for both fields so the zero-length attempt never
			// reports endTime !== startTime.
			const failedAt = Date.now();
			outcomes.push({
				taskId: task.taskId,
				status: "failed",
				segmentId: taskSegmentId,
				startTime: failedAt,
				endTime: failedAt,
				exitReason: `spawn failure: ${errMsg}`,
				sessionName: workerAgentId,
				doneFileFound: false,
				laneNumber: lane.laneNumber,
				exitDiagnostic: spawnExitDiagnostic,
			});

			// TP-190 (#561): Write a synthetic terminal lane snapshot so the
			// monitor (`monitorLanes` → `resolveTaskMonitorState`) reads
			// `snap.taskId === taskId` AND `snap.status === "failed"`, which sets
			// `sessionAlive = false` and triggers Priority 3 ("Session exited
			// without .DONE → failed"). Without this, the monitor's
			// `snap == null` startup-grace branch keeps `sessionAlive = true`
			// indefinitely and `executeWave` blocks forever on `await
			// monitorPromise`. Use the full `RuntimeLaneSnapshot` shape so
			// dashboard consumers stay schema-consistent.
			try {
				const spawnFailureSnapshot: RuntimeLaneSnapshot = {
					batchId,
					laneNumber: lane.laneNumber,
					laneId: `lane-${lane.laneNumber}`,
					repoId: lane.repoId ?? "default",
					taskId: task.taskId,
					segmentId: taskSegmentId,
					status: "failed",
					worker: {
						agentId: workerAgentId,
						status: "crashed",
						elapsedMs: 0,
						toolCalls: 0,
						contextPct: 0,
						costUsd: 0,
						lastTool: "",
						inputTokens: 0,
						outputTokens: 0,
						cacheReadTokens: 0,
						cacheWriteTokens: 0,
					},
					reviewer: null,
					progress: null,
					updatedAt: Date.now(),
				};
				writeLaneSnapshot(
					stateRoot,
					batchId,
					lane.laneNumber,
					spawnFailureSnapshot as unknown as Record<string, unknown>,
				);
			} catch (snapErr) {
				// Best effort — if the snapshot write fails, the monitor's
				// 30s-staleness fallback (snap with old updatedAt) eventually
				// kicks in via the registry liveness check. Log so this is
				// visible in operator diagnostics, but do NOT throw.
				execLog(
					laneId,
					task.taskId,
					`spawn-failure snapshot write failed (non-fatal): ${snapErr instanceof Error ? snapErr.message : String(snapErr)}`,
				);
			}

			shouldSkipRemaining = true;
		}
	}

	const endTime = Date.now();
	const succeeded = outcomes.every((o) => o.status === "succeeded");
	const failed = outcomes.some((o) => o.status === "failed" || o.status === "stalled");

	return {
		laneNumber: lane.laneNumber,
		laneId,
		tasks: outcomes,
		overallStatus: succeeded ? "succeeded" : failed ? "failed" : "partial",
		startTime: laneStartTime,
		endTime,
	};
}
|
|
3103
|
+
|
|
3104
|
+
// ── /orch Command — Full Execution (Step 5) ─────────────────────────
|