@pi-agents/orchid 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/agents/AGENTS-MANIFEST.md +42 -0
- package/agents/brain.md +42 -0
- package/agents/context-builder.md +46 -0
- package/agents/delegate.md +12 -0
- package/agents/dev-1.md +42 -0
- package/agents/oracle.md +73 -0
- package/agents/planner.md +55 -0
- package/agents/researcher.md +52 -0
- package/agents/reviewer.md +79 -0
- package/agents/scout.md +50 -0
- package/agents/tester.md +45 -0
- package/agents/worker.md +55 -0
- package/extensions/ralph.ts +1 -0
- package/extensions/reviewer-extension.ts +125 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/package.json +63 -0
- package/prompts/gather-context-and-clarify.md +13 -0
- package/prompts/parallel-cleanup.md +59 -0
- package/prompts/parallel-context-build.md +53 -0
- package/prompts/parallel-handoff-plan.md +59 -0
- package/prompts/parallel-research.md +50 -0
- package/prompts/parallel-review.md +54 -0
- package/prompts/review-loop.md +41 -0
- package/skills/orchid/SKILL.md +214 -0
- package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
- package/skills/orchid/orchid-converge/SKILL.md +124 -0
- package/skills/orchid/orchid-decompose/SKILL.md +201 -0
- package/skills/orchid/orchid-doctor/SKILL.md +162 -0
- package/skills/orchid/orchid-investigate/SKILL.md +102 -0
- package/skills/orchid/orchid-launch/SKILL.md +147 -0
- package/skills/ralph/SKILL.md +73 -0
- package/skills/subagents/pi-subagents/SKILL.md +813 -0
- package/src/index.ts +7 -0
- package/src/orchestrator/abort.ts +534 -0
- package/src/orchestrator/agent-bridge-extension.ts +1020 -0
- package/src/orchestrator/agent-host.ts +954 -0
- package/src/orchestrator/cleanup.ts +776 -0
- package/src/orchestrator/config-loader.ts +1412 -0
- package/src/orchestrator/config-schema.ts +690 -0
- package/src/orchestrator/config.ts +81 -0
- package/src/orchestrator/context-window.ts +66 -0
- package/src/orchestrator/diagnostic-reports.ts +475 -0
- package/src/orchestrator/diagnostics.ts +394 -0
- package/src/orchestrator/discovery.ts +1833 -0
- package/src/orchestrator/engine-worker.ts +415 -0
- package/src/orchestrator/engine.ts +5940 -0
- package/src/orchestrator/execution.ts +3104 -0
- package/src/orchestrator/extension.ts +5934 -0
- package/src/orchestrator/formatting.ts +785 -0
- package/src/orchestrator/git.ts +88 -0
- package/src/orchestrator/index.ts +28 -0
- package/src/orchestrator/lane-runner.ts +1787 -0
- package/src/orchestrator/mailbox.ts +780 -0
- package/src/orchestrator/merge.ts +3414 -0
- package/src/orchestrator/messages.ts +1062 -0
- package/src/orchestrator/migrations.ts +278 -0
- package/src/orchestrator/naming.ts +117 -0
- package/src/orchestrator/path-resolver.ts +275 -0
- package/src/orchestrator/persistence.ts +2625 -0
- package/src/orchestrator/process-registry.ts +452 -0
- package/src/orchestrator/quality-gate.ts +1085 -0
- package/src/orchestrator/resume.ts +3488 -0
- package/src/orchestrator/sessions.ts +57 -0
- package/src/orchestrator/settings-loader.ts +136 -0
- package/src/orchestrator/settings-tui.ts +2208 -0
- package/src/orchestrator/sidecar-telemetry.ts +267 -0
- package/src/orchestrator/supervisor.ts +4548 -0
- package/src/orchestrator/task-executor-core.ts +675 -0
- package/src/orchestrator/tmux-compat.ts +37 -0
- package/src/orchestrator/tool-allowlist-constants.ts +37 -0
- package/src/orchestrator/types.ts +4465 -0
- package/src/orchestrator/verification.ts +547 -0
- package/src/orchestrator/waves.ts +1564 -0
- package/src/orchestrator/workspace.ts +707 -0
- package/src/orchestrator/worktree.ts +2725 -0
- package/src/ralph/index.ts +825 -0
- package/src/subagents/agents/agent-management.ts +648 -0
- package/src/subagents/agents/agent-scope.ts +6 -0
- package/src/subagents/agents/agent-selection.ts +23 -0
- package/src/subagents/agents/agent-serializer.ts +86 -0
- package/src/subagents/agents/agents.ts +832 -0
- package/src/subagents/agents/chain-serializer.ts +137 -0
- package/src/subagents/agents/frontmatter.ts +29 -0
- package/src/subagents/agents/identity.ts +30 -0
- package/src/subagents/agents/skills.ts +632 -0
- package/src/subagents/extension/config.ts +16 -0
- package/src/subagents/extension/control-notices.ts +92 -0
- package/src/subagents/extension/doctor.ts +199 -0
- package/src/subagents/extension/fanout-child.ts +170 -0
- package/src/subagents/extension/index.ts +573 -0
- package/src/subagents/extension/schemas.ts +168 -0
- package/src/subagents/intercom/intercom-bridge.ts +379 -0
- package/src/subagents/intercom/result-intercom.ts +377 -0
- package/src/subagents/runs/background/async-execution.ts +712 -0
- package/src/subagents/runs/background/async-job-tracker.ts +310 -0
- package/src/subagents/runs/background/async-resume.ts +345 -0
- package/src/subagents/runs/background/async-status.ts +325 -0
- package/src/subagents/runs/background/completion-dedupe.ts +63 -0
- package/src/subagents/runs/background/notify.ts +108 -0
- package/src/subagents/runs/background/parallel-groups.ts +45 -0
- package/src/subagents/runs/background/result-watcher.ts +307 -0
- package/src/subagents/runs/background/run-id-resolver.ts +83 -0
- package/src/subagents/runs/background/run-status.ts +269 -0
- package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
- package/src/subagents/runs/background/subagent-runner.ts +1808 -0
- package/src/subagents/runs/background/top-level-async.ts +13 -0
- package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
- package/src/subagents/runs/foreground/chain-execution.ts +938 -0
- package/src/subagents/runs/foreground/execution.ts +918 -0
- package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
- package/src/subagents/runs/shared/completion-guard.ts +147 -0
- package/src/subagents/runs/shared/long-running-guard.ts +175 -0
- package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
- package/src/subagents/runs/shared/model-fallback.ts +103 -0
- package/src/subagents/runs/shared/nested-events.ts +819 -0
- package/src/subagents/runs/shared/nested-path.ts +52 -0
- package/src/subagents/runs/shared/nested-render.ts +115 -0
- package/src/subagents/runs/shared/parallel-utils.ts +109 -0
- package/src/subagents/runs/shared/pi-args.ts +220 -0
- package/src/subagents/runs/shared/pi-spawn.ts +115 -0
- package/src/subagents/runs/shared/run-history.ts +60 -0
- package/src/subagents/runs/shared/single-output.ts +164 -0
- package/src/subagents/runs/shared/subagent-control.ts +226 -0
- package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
- package/src/subagents/runs/shared/worktree.ts +577 -0
- package/src/subagents/shared/artifacts.ts +98 -0
- package/src/subagents/shared/atomic-json.ts +16 -0
- package/src/subagents/shared/file-coalescer.ts +40 -0
- package/src/subagents/shared/fork-context.ts +76 -0
- package/src/subagents/shared/formatters.ts +133 -0
- package/src/subagents/shared/jsonl-writer.ts +81 -0
- package/src/subagents/shared/model-info.ts +78 -0
- package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
- package/src/subagents/shared/session-identity.ts +10 -0
- package/src/subagents/shared/session-tokens.ts +44 -0
- package/src/subagents/shared/settings.ts +397 -0
- package/src/subagents/shared/status-format.ts +49 -0
- package/src/subagents/shared/types.ts +822 -0
- package/src/subagents/shared/utils.ts +450 -0
- package/src/subagents/slash/prompt-template-bridge.ts +397 -0
- package/src/subagents/slash/slash-bridge.ts +174 -0
- package/src/subagents/slash/slash-commands.ts +528 -0
- package/src/subagents/slash/slash-live-state.ts +292 -0
- package/src/subagents/tui/render-helpers.ts +80 -0
- package/src/subagents/tui/render.ts +1358 -0
- package/templates/agents/local/supervisor.md +33 -0
- package/templates/agents/local/task-merger.md +27 -0
- package/templates/agents/local/task-reviewer.md +30 -0
- package/templates/agents/local/task-worker.md +34 -0
- package/templates/agents/supervisor-routing.md +92 -0
- package/templates/agents/supervisor.md +229 -0
- package/templates/agents/task-merger.md +214 -0
- package/templates/agents/task-reviewer.md +260 -0
- package/templates/agents/task-worker-segment.md +44 -0
- package/templates/agents/task-worker.md +557 -0
- package/templates/tasks/CONTEXT.md +30 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
|
@@ -0,0 +1,3488 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resume logic for paused/interrupted batches
|
|
3
|
+
* @module orch/resume
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync } from "fs";
|
|
6
|
+
import { join } from "path";
|
|
7
|
+
|
|
8
|
+
import { assembleDiagnosticInput, emitDiagnosticReports } from "./diagnostic-reports.ts";
|
|
9
|
+
import { runDiscovery } from "./discovery.ts";
|
|
10
|
+
import {
|
|
11
|
+
executeOrchBatch,
|
|
12
|
+
resolveDisplayWaveNumber,
|
|
13
|
+
buildSpawnFailureAlertExtras,
|
|
14
|
+
} from "./engine.ts";
|
|
15
|
+
import {
|
|
16
|
+
buildReviewerEnv,
|
|
17
|
+
buildWorkerEnv,
|
|
18
|
+
buildWorkerExcludeEnv,
|
|
19
|
+
computeTransitiveDependents,
|
|
20
|
+
execLog,
|
|
21
|
+
executeLaneV2,
|
|
22
|
+
executeWave,
|
|
23
|
+
resolveCanonicalTaskPaths,
|
|
24
|
+
} from "./execution.ts";
|
|
25
|
+
import type { MonitorUpdateCallback, RuntimeBackend } from "./execution.ts";
|
|
26
|
+
import { selectRuntimeBackend } from "./engine.ts";
|
|
27
|
+
import { readRegistrySnapshot, isTerminalStatus, isProcessAlive } from "./process-registry.ts";
|
|
28
|
+
|
|
29
|
+
/**
 * TP-112: Terminate any alive V2 agents for a lane before re-execution.
 * Per Runtime V2 spec §7.3: detect + terminate + rehydrate.
 * Prevents duplicate concurrent agents for the same lane/task on resume.
 *
 * Reads the batch's process-registry snapshot and, for the lane's
 * `-worker`, `-reviewer`, and bare session keys, sends SIGTERM to every
 * agent whose manifest status is non-terminal and whose PID is still alive.
 *
 * Best-effort: this function never throws. A signal failure caused by the
 * process having already exited (ESRCH) is ignored; any other failure
 * (e.g. EPERM) is logged so a surviving duplicate agent is not mistaken
 * for a dead one.
 *
 * @param stateRoot - Root passed to `readRegistrySnapshot()`
 * @param batchId - Batch whose registry snapshot is inspected
 * @param sessionName - Lane session name from which registry keys are derived
 */
function terminateAliveV2Agents(stateRoot: string, batchId: string, sessionName: string): void {
	const registry = readRegistrySnapshot(stateRoot, batchId);
	if (!registry) return;
	for (const suffix of ["-worker", "-reviewer", ""]) {
		const key = `${sessionName}${suffix}`;
		const manifest = registry.agents[key];
		if (!manifest || isTerminalStatus(manifest.status) || !isProcessAlive(manifest.pid)) {
			continue;
		}
		try {
			process.kill(manifest.pid, "SIGTERM");
			execLog("resume", key, `terminated alive V2 agent (PID ${manifest.pid}) before re-execute`);
		} catch (err: unknown) {
			const code =
				typeof err === "object" && err !== null && "code" in err
					? String((err as { code?: unknown }).code)
					: undefined;
			// ESRCH: the process exited between the liveness check and the signal.
			if (code === "ESRCH") continue;
			// Anything else means the agent may still be running — surface it.
			execLog(
				"resume",
				key,
				`failed to terminate V2 agent (PID ${manifest.pid}) before re-execute: ${code ?? String(err)}`,
			);
		}
	}
}
|
|
50
|
+
import { getCurrentBranch, runGit } from "./git.ts";
|
|
51
|
+
import { mergeWaveByRepo } from "./merge.ts";
|
|
52
|
+
import {
|
|
53
|
+
applyMergeRetryLoop,
|
|
54
|
+
computeCleanupGatePolicy,
|
|
55
|
+
computeMergeFailurePolicy,
|
|
56
|
+
extractFailedRepoId,
|
|
57
|
+
formatRepoMergeSummary,
|
|
58
|
+
ORCH_MESSAGES,
|
|
59
|
+
} from "./messages.ts";
|
|
60
|
+
import type { CleanupGateRepoFailure } from "./messages.ts";
|
|
61
|
+
import { resolveOperatorId } from "./naming.ts";
|
|
62
|
+
import {
|
|
63
|
+
applyPartialProgressToOutcomes,
|
|
64
|
+
deleteBatchState,
|
|
65
|
+
hasTaskDoneMarker,
|
|
66
|
+
loadBatchState,
|
|
67
|
+
persistRuntimeState,
|
|
68
|
+
reconstructBatchStateFromRuntime,
|
|
69
|
+
saveBatchState,
|
|
70
|
+
seedPendingOutcomesForAllocatedLanes,
|
|
71
|
+
syncTaskOutcomesFromMonitor,
|
|
72
|
+
upsertTaskOutcome,
|
|
73
|
+
} from "./persistence.ts";
|
|
74
|
+
import {
|
|
75
|
+
buildBatchProgressSnapshot,
|
|
76
|
+
buildSupervisorSegmentFrontierSnapshot,
|
|
77
|
+
defaultResilienceState,
|
|
78
|
+
StateFileError,
|
|
79
|
+
} from "./types.ts";
|
|
80
|
+
import type {
|
|
81
|
+
AllocatedLane,
|
|
82
|
+
AllocatedTask,
|
|
83
|
+
LaneExecutionResult,
|
|
84
|
+
LaneTaskOutcome,
|
|
85
|
+
LaneTaskStatus,
|
|
86
|
+
MergeWaveResult,
|
|
87
|
+
OrchBatchPhase,
|
|
88
|
+
OrchBatchRuntimeState,
|
|
89
|
+
OrchestratorConfig,
|
|
90
|
+
ParsedTask,
|
|
91
|
+
PersistedBatchState,
|
|
92
|
+
PersistedLaneRecord,
|
|
93
|
+
PersistedSegmentRecord,
|
|
94
|
+
ReconciledTaskState,
|
|
95
|
+
ResumeEligibility,
|
|
96
|
+
ResumePoint,
|
|
97
|
+
TaskRunnerConfig,
|
|
98
|
+
WaveExecutionResult,
|
|
99
|
+
WorkspaceConfig,
|
|
100
|
+
} from "./types.ts";
|
|
101
|
+
import { buildDependencyGraph, resolveBaseBranch, resolveRepoRoot } from "./waves.ts";
|
|
102
|
+
import {
|
|
103
|
+
deleteBranchBestEffort,
|
|
104
|
+
forceCleanupWorktree,
|
|
105
|
+
listWorktrees,
|
|
106
|
+
preserveFailedLaneProgress,
|
|
107
|
+
removeAllWorktrees,
|
|
108
|
+
removeWorktree,
|
|
109
|
+
safeResetWorktree,
|
|
110
|
+
sleepSync,
|
|
111
|
+
} from "./worktree.ts";
|
|
112
|
+
|
|
113
|
+
// ── Resume Repo Helpers ──────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
/**
 * Gather the distinct repo roots referenced by a persisted batch.
 *
 * Every persisted lane's `repoId` is resolved through `resolveRepoRoot()`;
 * the default repo root is always appended so that repo mode (lanes with
 * no repoId) is covered as well. Duplicates are removed while keeping
 * first-seen order.
 *
 * Callers use the result for inter-wave worktree reset and terminal
 * cleanup across all repos in the batch.
 *
 * @param persistedState - Loaded batch state with lane records
 * @param defaultRepoRoot - Default/main repo root (cwd)
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Array of unique absolute repo root paths
 */
export function collectRepoRoots(
	persistedState: PersistedBatchState,
	defaultRepoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string[] {
	const laneRoots = persistedState.lanes.map((laneRecord) =>
		resolveRepoRoot(laneRecord.repoId, defaultRepoRoot, workspaceConfig),
	);

	// The default root goes last; the Set drops it if a lane already produced it.
	return Array.from(new Set<string>([...laneRoots, defaultRepoRoot]));
}
|
|
149
|
+
|
|
150
|
+
/**
 * Reverse-lookup a repoId from an already-resolved repo root path.
 *
 * Workspace config maps repoId → repo config; this scans those entries for
 * the first one whose `path` is exactly equal to `repoRoot` and returns its
 * id. With no workspace config (repo mode), or when nothing matches (e.g.
 * the primary/default repo), the result is `undefined`.
 *
 * Cleanup uses this to call `resolveBaseBranch()` with the right repoId so
 * unmerged-branch protection checks the correct target branch.
 *
 * @param repoRoot - Resolved absolute path of the repo
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns The repoId or undefined if not found / not in workspace mode
 */
export function resolveRepoIdFromRoot(
	repoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string | undefined {
	if (!workspaceConfig) {
		return undefined;
	}

	const matchingEntry = [...workspaceConfig.repos].find(
		([, candidate]) => candidate.path === repoRoot,
	);
	return matchingEntry?.[0];
}
|
|
180
|
+
|
|
181
|
+
/**
 * Reconstruct AllocatedLane[] from persisted lane records.
 *
 * Used during resume to preserve lane metadata (worktreePath, branch, repoId)
 * across persistence checkpoints. Without this, the first resume checkpoint
 * would serialize empty lanes, losing all lane context.
 *
 * Each task id on a lane becomes an allocated task whose `task` is a minimal
 * ParsedTask stub. When `persistedTasks` is provided, the stub carries forward:
 * - `repoId` (stored on the stub as `promptRepoId`) and `resolvedRepoId`
 * - `packetRepoId`, `packetTaskPath`, `segmentIds`, `activeSegmentId`
 *   (copied only when defined on the persisted record)
 *
 * `taskFolder` is ALWAYS set on the stub, defaulting to `""` (TP-169), so
 * the stub is never empty and `task` is never `null`. This lets
 * `serializeBatchState()` emit repo fields for tasks not in
 * `discovery.pending` (e.g., completed/failed tasks that have been archived).
 *
 * Scheduling fields (`order`, `estimatedMinutes`, `estimatedLoad`) are
 * zeroed and `strategy` is fixed to "round-robin"; they are placeholders,
 * not recovered values.
 *
 * @param persistedLanes - Persisted lane records
 * @param persistedTasks - Optional persisted task records for repo field carry-forward
 * @returns Reconstructed AllocatedLane array with repo attribution preserved
 */
export function reconstructAllocatedLanes(
	persistedLanes: PersistedLaneRecord[],
	persistedTasks?: PersistedBatchState["tasks"],
): AllocatedLane[] {
	// Build task lookup for repo field carry-forward
	const taskLookup = new Map<string, PersistedBatchState["tasks"][number]>();
	for (const t of persistedTasks ?? []) {
		taskLookup.set(t.taskId, t);
	}

	// Optional fields copied verbatim from the persisted record when defined.
	const optionalCarryForwardFields = [
		"packetRepoId",
		"packetTaskPath",
		"segmentIds",
		"activeSegmentId",
	] as const;

	return persistedLanes.map((lr) => ({
		laneNumber: lr.laneNumber,
		laneId: lr.laneId,
		laneSessionId: lr.laneSessionId,
		worktreePath: lr.worktreePath,
		branch: lr.branch,
		tasks: lr.taskIds.map((taskId) => {
			const persistedTask = taskLookup.get(taskId);
			const taskStub: Partial<ParsedTask> = {};
			if (persistedTask?.repoId !== undefined) {
				taskStub.promptRepoId = persistedTask.repoId;
			}
			if (persistedTask?.resolvedRepoId !== undefined) {
				taskStub.resolvedRepoId = persistedTask.resolvedRepoId;
			}
			// TP-169: Always set taskFolder on the stub, even if empty string.
			// A falsy check here used to leave taskFolder `undefined`, which
			// crashed buildExecutionUnit and merge code on dynamically-expanded
			// segments whose persisted records had taskFolder="".
			taskStub.taskFolder = persistedTask?.taskFolder ?? "";
			for (const field of optionalCarryForwardFields) {
				// These fields are not reachable through the declared types here,
				// hence the narrowly scoped `any` access.
				const value = (persistedTask as any)?.[field];
				if (value !== undefined) {
					(taskStub as any)[field] = value;
				}
			}
			return {
				taskId,
				order: 0,
				// The stub always has at least `taskFolder`, so it is never null.
				task: taskStub as ParsedTask,
				estimatedMinutes: 0,
			};
		}),
		strategy: "round-robin" as const,
		estimatedLoad: 0,
		estimatedMinutes: 0,
		...(lr.repoId !== undefined ? { repoId: lr.repoId } : {}),
	}));
}
|
|
261
|
+
|
|
262
|
+
/**
 * Gather the distinct repo roots referenced by several lane collections.
 *
 * `collectRepoRoots()` only looks at `persistedState.lanes`; this variant
 * accepts any number of lane arrays and merges them. That matters during
 * resumed execution, where newly allocated waves may place lanes in repos
 * that the original persisted state never mentioned.
 *
 * Each lane's `repoId` is resolved via `resolveRepoRoot()`. The default repo
 * root is always included (repo mode, lanes without repoId). Duplicates are
 * removed while keeping first-seen order.
 *
 * @param laneSources - Array of lane arrays to collect repo roots from
 * @param defaultRepoRoot - Default/main repo root (cwd)
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Array of unique absolute repo root paths
 */
export function collectAllRepoRoots(
	laneSources: Array<{ repoId?: string }[]>,
	defaultRepoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): string[] {
	const uniqueRoots = new Set<string>();

	laneSources.forEach((laneGroup) => {
		laneGroup.forEach((laneEntry) => {
			uniqueRoots.add(resolveRepoRoot(laneEntry.repoId, defaultRepoRoot, workspaceConfig));
		});
	});

	// Added last so it only appears once even when a lane resolved to it.
	uniqueRoots.add(defaultRepoRoot);

	return Array.from(uniqueRoots);
}
|
|
295
|
+
|
|
296
|
+
// ── Resume Pure Functions ────────────────────────────────────────────
|
|
297
|
+
|
|
298
|
+
/**
 * Report whether a task's persisted segment frontier is complete, meaning
 * every segment recorded for the task has a terminal-success status
 * ("succeeded" or "skipped").
 *
 * Result:
 * - `true`  — the task has segment records and all are terminal-success.
 * - `true`  — the task has no segment records at all (single-segment /
 *             legacy tasks; the guard does not apply and `.DONE` stays
 *             authoritative).
 * - `false` — at least one of the task's segments has any other status.
 *
 * `collectDoneTaskIdsForResume` (TP-196 / #462) relies on this to stop a
 * stale or premature `.DONE` from suppressing re-execution of the
 * remaining segments.
 *
 * @param persistedState - Loaded batch state (its `segments` may be absent)
 * @param taskId - Task whose segments are inspected
 */
function isSegmentFrontierCompleteForResume(
	persistedState: PersistedBatchState,
	taskId: string,
): boolean {
	for (const segment of persistedState.segments ?? []) {
		if (segment.taskId !== taskId) continue;
		const isTerminalSuccess = segment.status === "succeeded" || segment.status === "skipped";
		if (!isTerminalSuccess) return false;
	}
	// Either no segment belongs to this task or all of them succeeded/skipped.
	return true;
}
|
|
320
|
+
|
|
321
|
+
/**
 * Collect the ids of persisted tasks that have an authoritative `.DONE` marker.
 *
 * Marker lookup, per task (tasks with an empty `taskFolder` never match):
 * 1. `hasTaskDoneMarker(task.taskFolder)`.
 * 2. Otherwise, if a persisted lane lists the task and has a `worktreePath`,
 *    the `donePath` from `resolveCanonicalTaskPaths()` is checked on disk.
 *
 * Resume guard (TP-196 / #462): for tasks WITH persisted segment records,
 * a `.DONE` marker is ignored while the segment frontier is incomplete (at
 * least one segment not yet succeeded/skipped). Such a task is left out of
 * the result so it gets re-reconciled, and a WARN is logged for operators.
 * The on-disk marker itself is not touched. Tasks WITHOUT segment records
 * (single-segment / legacy) are unaffected by the guard.
 *
 * @param persistedState - Loaded batch state (tasks, lanes, segments)
 * @param repoRoot - Repo root forwarded to `resolveCanonicalTaskPaths()`
 * @param workspaceConfig - Workspace configuration; only its presence is used
 * @returns Set of task ids considered done for resume purposes
 */
export function collectDoneTaskIdsForResume(
	persistedState: PersistedBatchState,
	repoRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): Set<string> {
	const isWorkspaceMode = !!workspaceConfig;

	// Returns where the .DONE marker was found, or null when there is none.
	const locateDoneMarker = (task: PersistedBatchState["tasks"][number]): string | null => {
		const folder = task.taskFolder;
		if (!folder) return null;
		if (hasTaskDoneMarker(folder)) return folder;

		const owningLane = persistedState.lanes.find((lane) => lane.taskIds.includes(task.taskId));
		if (!owningLane?.worktreePath) return null;

		const { donePath } = resolveCanonicalTaskPaths(
			folder,
			owningLane.worktreePath,
			repoRoot,
			isWorkspaceMode,
		);
		return existsSync(donePath) ? donePath : null;
	};

	const completed = new Set<string>();
	for (const task of persistedState.tasks) {
		const markerLocation = locateDoneMarker(task);
		if (markerLocation === null) continue;

		// TP-196 / #462: a marker alone is not enough for multi-segment tasks
		// whose segment frontier is still incomplete.
		if (!isSegmentFrontierCompleteForResume(persistedState, task.taskId)) {
			console.warn(
				`[resume] WARN: .DONE present for task ${task.taskId} at ${markerLocation} but segment frontier is incomplete — not marking complete (#462 guard). Task will re-reconcile.`,
			);
			continue;
		}
		completed.add(task.taskId);
	}
	return completed;
}
|
|
375
|
+
|
|
376
|
+
/**
 * Decide whether a persisted batch may be resumed, based solely on its phase.
 *
 * Resume eligibility matrix:
 * | Phase     | Normal | --force | Reason                                     |
 * |-----------|--------|---------|--------------------------------------------|
 * | paused    | ✅     | ✅      | Batch was paused (user/merge-failure)      |
 * | executing | ✅     | ✅      | Batch was executing when orchestrator died |
 * | merging   | ✅     | ✅      | Batch was merging when orchestrator died   |
 * | stopped   | ❌     | ✅      | Batch was stopped by policy                |
 * | failed    | ❌     | ✅      | Batch has terminal failure                 |
 * | completed | ❌     | ❌      | Batch already completed                    |
 * | idle      | ❌     | ❌      | Batch never started execution              |
 * | launching | ❌     | ❌      | Batch is currently launching               |
 * | planning  | ❌     | ❌      | Batch was still planning                   |
 *
 * Any phase not listed above is reported as ineligible with an
 * "unknown phase" reason.
 *
 * Pure function — no process or filesystem access.
 *
 * @param state - Persisted batch state to check
 * @param force - When true, `stopped` and `failed` phases become eligible
 * @returns Eligibility verdict echoing the batch's `phase` and `batchId`
 */
export function checkResumeEligibility(
	state: PersistedBatchState,
	force: boolean = false,
): ResumeEligibility {
	const { phase, batchId } = state;

	// Every verdict shares the same shape; only eligibility and wording vary.
	const verdict = (eligible: boolean, reason: string): ResumeEligibility => ({
		eligible,
		reason,
		phase,
		batchId,
	});

	switch (phase) {
		case "paused":
			return verdict(true, `Batch ${batchId} is paused and can be resumed.`);

		case "executing":
		case "merging":
			// Both interrupted phases use the same sentence with the phase name inlined.
			return verdict(
				true,
				`Batch ${batchId} was ${phase} when the orchestrator disconnected. Can be resumed.`,
			);

		case "stopped":
			return force
				? verdict(true, `Batch ${batchId} was stopped by failure policy. Force-resuming (--force).`)
				: verdict(
						false,
						`Batch ${batchId} was stopped by failure policy. Use --force to resume, or /orch-abort to clean up.`,
					);

		case "failed":
			return force
				? verdict(true, `Batch ${batchId} has a terminal failure. Force-resuming (--force).`)
				: verdict(
						false,
						`Batch ${batchId} has a terminal failure. Use --force to resume, or /orch-abort to clean up.`,
					);

		case "completed":
			return verdict(
				false,
				`Batch ${batchId} already completed. ${force ? "--force cannot resume a completed batch. " : ""}Delete the state file or start a new batch.`,
			);

		case "idle":
			return verdict(false, `Batch ${batchId} never started execution. Start a new batch with /orch.`);

		case "launching":
			return verdict(
				false,
				`Batch ${batchId} is currently launching. Wait for it to start or use /orch-abort.`,
			);

		case "planning":
			return verdict(
				false,
				`Batch ${batchId} was still in planning phase. Start a new batch with /orch.`,
			);

		default:
			return verdict(
				false,
				`Batch ${batchId} has unknown phase "${phase}". Delete the state file and start a new batch.`,
			);
	}
}
|
|
500
|
+
|
|
501
|
+
/**
 * Per-task view of the segment frontier rebuilt from persisted segment
 * records during resume. Produced by `reconstructSegmentFrontier()`.
 */
interface SegmentFrontierResumeTaskState {
	/** Task the segment lists below belong to. */
	taskId: string;
	/** Segments whose persisted status is "succeeded" or "skipped". */
	completedSegmentIds: string[];
	/** Segments whose persisted status is "running". */
	inFlightSegmentIds: string[];
	/** Segments with any other status, including segments with no persisted record. */
	pendingSegmentIds: string[];
	/** Segments whose persisted status is "failed" or "stalled". */
	failedSegmentIds: string[];
	/**
	 * Segment to pick up next: the first in-flight segment, else the first
	 * pending segment whose dependencies are all completed, else the first
	 * pending segment, else `null`.
	 */
	nextSegmentId: string | null;
	/** NOTE(review): presumably true when every segment is completed — confirm where this is assigned. */
	allSucceeded: boolean;
	/**
	 * Dependencies per segment, limited to this task's own segment ids,
	 * de-duplicated and sorted. When a record declares no dependencies, the
	 * preceding entry in the task's `segmentIds` is used as the dependency.
	 */
	dependencyBySegmentId: Map<string, string[]>;
}
|
|
511
|
+
|
|
512
|
+
/**
 * Bucket a persisted segment status into one of four resume categories.
 *
 * - "succeeded" / "skipped" → "completed"
 * - "failed" / "stalled"    → "failed"
 * - "running"               → "in-flight"
 * - anything else, including `undefined` (no record) → "pending"
 *
 * @param status - Persisted segment status, or undefined when no record exists
 * @returns The resume category for that status
 */
function classifySegmentStatus(
	status: PersistedSegmentRecord["status"] | undefined,
): "completed" | "failed" | "in-flight" | "pending" {
	switch (status) {
		case "succeeded":
		case "skipped":
			return "completed";
		case "failed":
		case "stalled":
			return "failed";
		case "running":
			return "in-flight";
		default:
			return "pending";
	}
}
|
|
520
|
+
|
|
521
|
+
/**
 * Rebuild the segment frontier of every segmented task from persisted segment records.
 *
 * For each task that lists at least one segment ID, every segment is placed in a
 * completed / in-flight / pending / failed bucket via `classifySegmentStatus`, and
 * its dependencies are resolved: the record's `dependsOnSegmentIds` when non-empty,
 * otherwise the preceding segment in the task's own list. Dependencies that are not
 * segments of the same task are dropped.
 *
 * Side effect: when at least one of the task's segments has a persisted record,
 * the task record inside `persistedState` is mutated in place:
 * - any failed segment     → status "failed" (an existing "skipped" is kept), no active segment
 * - any in-flight segment  → status "running", active segment = first in-flight segment
 * - any pending segment    → status "pending", active segment = next segment to run
 * - all segments succeeded → status "succeeded", no active segment
 * - otherwise              → status "failed" (an existing "skipped" is kept), no active segment
 * Tasks none of whose segments has a persisted record are left unmodified.
 *
 * @param persistedState - Batch state whose `tasks` and `segments` are read
 * @returns Frontier snapshot keyed by task ID; tasks without segment IDs are omitted
 */
export function reconstructSegmentFrontier(
	persistedState: PersistedBatchState,
): Map<string, SegmentFrontierResumeTaskState> {
	const recordsById = new Map<string, PersistedSegmentRecord>();
	for (const record of persistedState.segments ?? []) {
		recordsById.set(record.segmentId, record);
	}

	const frontierByTask = new Map<string, SegmentFrontierResumeTaskState>();

	for (const task of persistedState.tasks) {
		const ownSegmentIds = task.segmentIds ?? [];
		if (ownSegmentIds.length === 0) continue;

		const ownSegmentIdSet = new Set<string>(ownSegmentIds);
		const dependencyBySegmentId = new Map<string, string[]>();
		// One list per classification; the classifier's return value selects the list.
		const buckets: Record<"completed" | "failed" | "in-flight" | "pending", string[]> = {
			completed: [],
			failed: [],
			"in-flight": [],
			pending: [],
		};

		ownSegmentIds.forEach((segmentId, position) => {
			const record = recordsById.get(segmentId);

			// Explicit dependencies win; otherwise segments are assumed to run in list order.
			const declared = record?.dependsOnSegmentIds ?? [];
			const previous = position > 0 ? [ownSegmentIds[position - 1]] : [];
			const withinTask = (declared.length > 0 ? declared : previous).filter((dep) =>
				ownSegmentIdSet.has(dep),
			);
			dependencyBySegmentId.set(
				segmentId,
				Array.from(new Set(withinTask)).sort((a, b) => a.localeCompare(b)),
			);

			buckets[classifySegmentStatus(record?.status)].push(segmentId);
		});

		const completedSegmentIds = buckets.completed;
		const inFlightSegmentIds = buckets["in-flight"];
		const pendingSegmentIds = buckets.pending;
		const failedSegmentIds = buckets.failed;

		// Pending segments whose dependencies have all completed.
		const completedLookup = new Set(completedSegmentIds);
		const runnable = pendingSegmentIds.filter((segmentId) =>
			(dependencyBySegmentId.get(segmentId) ?? []).every((dep) => completedLookup.has(dep)),
		);

		const nextSegmentId = inFlightSegmentIds[0] ?? runnable[0] ?? pendingSegmentIds[0] ?? null;
		const allSucceeded = ownSegmentIds.every(
			(segmentId) => recordsById.get(segmentId)?.status === "succeeded",
		);

		// Only normalize the task when its segments are actually backed by persisted records.
		const hasPersistedRecord = ownSegmentIds.some((segmentId) => recordsById.has(segmentId));
		if (hasPersistedRecord) {
			const failedUnlessSkipped = task.status === "skipped" ? "skipped" : "failed";
			if (failedSegmentIds.length > 0) {
				task.status = failedUnlessSkipped;
				task.activeSegmentId = null;
			} else if (inFlightSegmentIds.length > 0) {
				task.status = "running";
				task.activeSegmentId = inFlightSegmentIds[0];
			} else if (pendingSegmentIds.length > 0) {
				task.status = "pending";
				task.activeSegmentId = nextSegmentId;
			} else if (allSucceeded) {
				task.status = "succeeded";
				task.activeSegmentId = null;
			} else {
				task.status = failedUnlessSkipped;
				task.activeSegmentId = null;
			}
		}

		frontierByTask.set(task.taskId, {
			taskId: task.taskId,
			completedSegmentIds,
			inFlightSegmentIds,
			pendingSegmentIds,
			failedSegmentIds,
			nextSegmentId,
			allSucceeded,
			dependencyBySegmentId,
		});
	}

	return frontierByTask;
}
|
|
623
|
+
|
|
624
|
+
/**
 * Reconcile persisted task states against live signals.
 *
 * For each task in the persisted state, determines the resume action from
 * lane-session liveness, .DONE markers, and worktree presence.
 *
 * Precedence rules (applied per-task, first match wins):
 * 1. .DONE file found → "mark-complete" (even if session is alive — task is done)
 * 2. Session alive + no .DONE → "reconnect" (task is still running)
 * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
 *    (already resolved in the original run, no action needed)
 * 4. Session dead + no .DONE + worktree exists → "re-execute"
 *    (live status is reset to "pending")
 * 5. Persisted status "pending" + session dead + no .DONE + no worktree → "pending"
 *    (never actually started; re-queued instead of failed — TP-037 bug #102b fix)
 * 6. Anything else (non-terminal, not pending, session dead, no .DONE,
 *    no worktree) → "mark-failed"
 *    (task was allocated and started but crashed without completing)
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param aliveSessions - Set of lane session names currently alive
 * @param doneTaskIds - Set of task IDs whose .DONE files exist
 * @param existingWorktrees - Set of task IDs that are looked up to decide whether a
 *   task's worktree still exists; defaults to an empty set (no worktrees)
 * @returns Array of reconciled task states in persisted order
 */
export function reconcileTaskStates(
	persistedState: PersistedBatchState,
	aliveSessions: ReadonlySet<string>,
	doneTaskIds: ReadonlySet<string>,
	existingWorktrees: ReadonlySet<string> = new Set(),
): ReconciledTaskState[] {
	// Built once per call rather than once per task.
	const terminalStatuses: ReadonlySet<LaneTaskStatus> = new Set<LaneTaskStatus>([
		"succeeded",
		"failed",
		"stalled",
		"skipped",
	]);

	return persistedState.tasks.map((task) => {
		const sessionAlive = aliveSessions.has(task.sessionName);
		const doneFileFound = doneTaskIds.has(task.taskId);
		const worktreeExists = existingWorktrees.has(task.taskId);

		// Every outcome reports the same observed signals; only the live status
		// and the chosen action differ. Field order matches the persisted shape.
		const resolve = (
			liveStatus: LaneTaskStatus,
			action: ReconciledTaskState["action"],
		): ReconciledTaskState => ({
			taskId: task.taskId,
			persistedStatus: task.status,
			liveStatus,
			sessionAlive,
			doneFileFound,
			worktreeExists,
			action,
		});

		// Precedence 1: .DONE file found → task completed
		if (doneFileFound) return resolve("succeeded", "mark-complete");

		// Precedence 2: Session alive → reconnect
		if (sessionAlive) return resolve("running", "reconnect");

		// Precedence 3: Already terminal in persisted state → skip
		if (terminalStatuses.has(task.status)) return resolve(task.status, "skip");

		// Precedence 4: Session dead + no .DONE + worktree exists → re-execute
		if (worktreeExists) return resolve("pending", "re-execute");

		// Precedence 5: Pending task that was never started → remain pending.
		// Reaching this point already implies a dead session and no worktree, so
		// this covers both a task that never had a session assigned (future-wave
		// task) and a task whose session was assigned by a prior failed resume but
		// never actually started (TP-037 bug #102b fix). Either way the task is
		// re-queued for execution, not failed.
		if (task.status === "pending") return resolve("pending", "pending");

		// Precedence 6: Dead session + not terminal + no .DONE + no worktree → failed
		// (Task was allocated and started but crashed without completing)
		return resolve("failed", "mark-failed");
	});
}
|
|
741
|
+
|
|
742
|
+
/**
 * Look up the most recent merge status recorded for a wave (0-based index).
 *
 * The persisted merge results may hold several entries for one wave
 * (e.g., re-exec sentinel merges clamped to wave 0, or retry attempts).
 * Entries later in the array are treated as more recent, so the search
 * runs from the end of the array and stops at the first match.
 *
 * @param mergeResults - Persisted merge results array
 * @param waveIndex - 0-based wave index to look up
 * @returns The merge status ("succeeded" | "failed" | "partial") or null if no entry exists
 */
export function getMergeStatusForWave(
	mergeResults: ReadonlyArray<{ waveIndex: number; status: "succeeded" | "failed" | "partial" }>,
	waveIndex: number,
): "succeeded" | "failed" | "partial" | null {
	let cursor = mergeResults.length;
	while (cursor-- > 0) {
		const entry = mergeResults[cursor];
		if (entry.waveIndex === waveIndex) {
			return entry.status;
		}
	}
	return null;
}
|
|
766
|
+
|
|
767
|
+
/**
 * Expand the persisted wave plan with the continuation rounds implied by segment counts.
 *
 * A task with N segments needs N appearances in the plan. Tasks that appear
 * fewer times get extra rounds inserted directly after the last wave in which
 * they were originally scheduled; tasks sharing that last wave are grouped so
 * resumed execution keeps multi-task round concurrency (`[A,B]`, then `[A]`, etc.).
 * Tasks with segments that never appear in the plan get their rounds appended
 * at the very end. The persisted `wavePlan` itself is not modified.
 *
 * @param persistedState - Batch state providing `wavePlan` and per-task `segmentIds`
 * @returns A new wave plan including the continuation rounds
 */
export function buildResumeRuntimeWavePlan(persistedState: PersistedBatchState): string[][] {
	// Turn "task → number of missing rounds" into rounds: each round lists, in
	// sorted order, every task that still has rounds left.
	const toRounds = (missing: ReadonlyMap<string, number>): string[][] => {
		const left = new Map(missing);
		const rounds: string[][] = [];
		for (;;) {
			const round = [...left]
				.filter(([, count]) => count > 0)
				.map(([taskId]) => taskId)
				.sort((a, b) => a.localeCompare(b));
			if (round.length === 0) return rounds;
			rounds.push(round);
			for (const taskId of round) {
				left.set(taskId, (left.get(taskId) ?? 0) - 1);
			}
		}
	};

	const runtimeWavePlan = persistedState.wavePlan.map((wave) => [...wave]);
	const baseWaveCount = runtimeWavePlan.length;

	// How many rounds each segmented task needs in total.
	const requiredRounds = new Map<string, number>();
	for (const task of persistedState.tasks) {
		if (Array.isArray(task.segmentIds) && task.segmentIds.length > 0) {
			requiredRounds.set(task.taskId, task.segmentIds.length);
		}
	}

	// How many rounds each task already has, and the last wave it appears in.
	const scheduledRounds = new Map<string, number>();
	const lastWaveOf = new Map<string, number>();
	runtimeWavePlan.forEach((wave, waveIdx) => {
		for (const taskId of wave) {
			scheduledRounds.set(taskId, (scheduledRounds.get(taskId) ?? 0) + 1);
			lastWaveOf.set(taskId, waveIdx);
		}
	});

	// Group the shortfall by last wave; -1 collects tasks absent from the plan.
	const shortfallByLastWave = new Map<number, Map<string, number>>();
	for (const [taskId, required] of requiredRounds) {
		const scheduled = scheduledRounds.get(taskId) ?? 0;
		if (required <= scheduled) continue;
		const anchor = lastWaveOf.get(taskId) ?? -1;
		let group = shortfallByLastWave.get(anchor);
		if (!group) {
			group = new Map<string, number>();
			shortfallByLastWave.set(anchor, group);
		}
		group.set(taskId, required - scheduled);
	}

	// Insert back-to-front so earlier insertion points never shift.
	for (let waveIdx = baseWaveCount - 1; waveIdx >= 0; waveIdx--) {
		const shortfall = shortfallByLastWave.get(waveIdx);
		if (!shortfall || shortfall.size === 0) continue;
		runtimeWavePlan.splice(waveIdx + 1, 0, ...toRounds(shortfall));
	}

	const unscheduled = shortfallByLastWave.get(-1);
	if (unscheduled && unscheduled.size > 0) {
		runtimeWavePlan.push(...toRounds(unscheduled));
	}

	return runtimeWavePlan;
}
|
|
844
|
+
|
|
845
|
+
/**
 * Compute the resume point from reconciled task states and wave plan.
 *
 * Determines which wave to resume from by finding the first wave that
 * has any incomplete tasks. Skips fully completed waves only when
 * their merge also succeeded.
 *
 * When a task has segment IDs, its Nth appearance in the wave plan is matched
 * to its Nth segment; if that segment has a persisted record, the segment's
 * status is used for that wave instead of the task-level reconciled state.
 *
 * TP-037 (Bug #102): A wave where all tasks are terminal but the merge
 * is missing or failed is NOT skipped — it is flagged for merge retry
 * via `mergeRetryWaveIndexes`. The `resumeWaveIndex` is set to the
 * earliest such wave so the resume loop can process it. The merge check is
 * only made when `persistedState.mergeResults` is present and the wave has
 * at least one succeeded task.
 *
 * Pure function — no process or filesystem access.
 *
 * @param persistedState - Loaded and validated batch state
 * @param reconciledTasks - Reconciled task states
 * @param wavePlan - Wave plan to evaluate; defaults to `persistedState.wavePlan`
 * @returns Resume point with wave index and categorized task IDs
 */
export function computeResumePoint(
	persistedState: PersistedBatchState,
	reconciledTasks: ReconciledTaskState[],
	wavePlan: string[][] = persistedState.wavePlan,
): ResumePoint {
	// Lookup: taskId → reconciled state
	const reconciledMap = new Map<string, ReconciledTaskState>();
	for (const task of reconciledTasks) {
		reconciledMap.set(task.taskId, task);
	}

	// Lookup: segmentId → persisted segment status
	const segmentStatusBySegmentId = new Map<string, PersistedSegmentRecord["status"]>();
	for (const segment of persistedState.segments ?? []) {
		segmentStatusBySegmentId.set(segment.segmentId, segment.status);
	}

	// Tolerate a state object that carries no `tasks` array.
	const persistedTasks = Array.isArray((persistedState as { tasks?: unknown }).tasks)
		? persistedState.tasks
		: [];
	const segmentIdsByTaskId = new Map<string, string[]>();
	for (const task of persistedTasks) {
		if (task.segmentIds && task.segmentIds.length > 0) {
			segmentIdsByTaskId.set(task.taskId, task.segmentIds);
		}
	}

	// The Nth appearance of a task in the wave plan maps to its Nth segment.
	const waveSegmentIdByTaskOccurrence = new Map<string, string>();
	const occurrenceByTaskId = new Map<string, number>();
	for (let waveIdx = 0; waveIdx < wavePlan.length; waveIdx++) {
		for (const taskId of wavePlan[waveIdx]) {
			const segmentIds = segmentIdsByTaskId.get(taskId);
			if (!segmentIds || segmentIds.length === 0) continue;
			const occurrence = occurrenceByTaskId.get(taskId) ?? 0;
			if (occurrence < segmentIds.length) {
				waveSegmentIdByTaskOccurrence.set(`${waveIdx}:${taskId}`, segmentIds[occurrence]);
			}
			occurrenceByTaskId.set(taskId, occurrence + 1);
		}
	}

	// Segment ID for a (wave, task) occurrence, but only when that segment has a
	// persisted record; otherwise callers fall back to the reconciled task state.
	const knownSegmentIdFor = (waveIdx: number, taskId: string): string | undefined => {
		const segmentId = waveSegmentIdByTaskOccurrence.get(`${waveIdx}:${taskId}`);
		return segmentId && segmentStatusBySegmentId.has(segmentId) ? segmentId : undefined;
	};

	const isTerminalStatus = (status: unknown): boolean =>
		status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";

	// Categorize tasks
	const completedTaskIds: string[] = [];
	const failedTaskIds: string[] = [];
	const reconnectTaskIds: string[] = [];
	const reExecuteTaskIds: string[] = [];

	for (const task of reconciledTasks) {
		switch (task.action) {
			case "mark-complete":
				completedTaskIds.push(task.taskId);
				break;
			case "skip":
				if (task.liveStatus === "succeeded" || task.persistedStatus === "succeeded") {
					completedTaskIds.push(task.taskId);
				} else if (
					task.liveStatus === "failed" ||
					task.liveStatus === "stalled" ||
					task.persistedStatus === "failed" ||
					task.persistedStatus === "stalled"
				) {
					failedTaskIds.push(task.taskId);
				}
				// persistedStatus === "skipped" → terminal but neither completed nor failed.
				// Not re-queued. Counted separately via batchState.skippedTasks (carried from persisted state).
				break;
			case "reconnect":
				reconnectTaskIds.push(task.taskId);
				break;
			case "re-execute":
				reExecuteTaskIds.push(task.taskId);
				break;
			case "mark-failed":
				failedTaskIds.push(task.taskId);
				break;
			case "pending":
				// Never-started tasks are not categorized here; the wave walk below
				// adds them to the returned pending list.
				break;
		}
	}

	// Find resume wave: first wave with any non-completed tasks OR missing/failed merge.
	// TP-037 (Bug #102): A wave where all tasks are terminal but the merge
	// hasn't succeeded is flagged for merge retry, not skipped.
	let resumeWaveIndex = wavePlan.length; // default: past end = all done
	const mergeRetryWaveIndexes: number[] = [];

	for (let i = 0; i < wavePlan.length; i++) {
		const waveTasks = wavePlan[i];

		const allDone = waveTasks.every((taskId) => {
			const segmentId = knownSegmentIdFor(i, taskId);
			if (segmentId !== undefined) {
				return isTerminalStatus(segmentStatusBySegmentId.get(segmentId));
			}
			const reconciled = reconciledMap.get(taskId);
			if (!reconciled) return false;
			// A task is "done" for wave-skip purposes if it's terminal:
			// mark-complete, mark-failed, or skip with any terminal status
			// (succeeded, failed, stalled, skipped)
			if (reconciled.action === "mark-complete" || reconciled.action === "mark-failed") {
				return true;
			}
			if (reconciled.action === "skip") {
				return isTerminalStatus(reconciled.liveStatus ?? reconciled.persistedStatus);
			}
			return false;
		});

		if (!allDone) {
			// Only set resumeWaveIndex if not already set by a merge retry
			// (merge retry at an earlier wave takes precedence)
			if (resumeWaveIndex === wavePlan.length) {
				resumeWaveIndex = i;
			}
			break;
		}

		// TP-037 (Bug #102): All tasks are terminal — but did the merge succeed?
		// Only check merge status if the wave had any succeeded tasks (waves with
		// only failures/skips don't produce merges and can be safely skipped).
		const hasSucceededTasks = waveTasks.some((taskId) => {
			const segmentId = knownSegmentIdFor(i, taskId);
			if (segmentId !== undefined) {
				return segmentStatusBySegmentId.get(segmentId) === "succeeded";
			}
			const reconciled = reconciledMap.get(taskId);
			if (!reconciled) return false;
			if (reconciled.action === "mark-complete") return true;
			return (
				reconciled.action === "skip" &&
				(reconciled.liveStatus === "succeeded" || reconciled.persistedStatus === "succeeded")
			);
		});

		if (hasSucceededTasks && persistedState.mergeResults) {
			const mergeStatus = getMergeStatusForWave(persistedState.mergeResults, i);
			if (mergeStatus !== "succeeded") {
				// Merge missing or failed — flag for retry, don't skip past this wave
				mergeRetryWaveIndexes.push(i);
				if (resumeWaveIndex === wavePlan.length) {
					// This is the first wave needing attention — set resume point here
					resumeWaveIndex = i;
				}
			}
		}
	}

	// Determine pending tasks: tasks in resume wave and later that need execution.
	// NOTE(review): a task that appears in several waves can be listed more than
	// once here — presumably consumers tolerate duplicates; confirm against callers.
	const actualPendingTaskIds: string[] = [];
	for (let i = resumeWaveIndex; i < wavePlan.length; i++) {
		for (const taskId of wavePlan[i]) {
			const segmentId = knownSegmentIdFor(i, taskId);
			if (segmentId !== undefined) {
				const segmentStatus = segmentStatusBySegmentId.get(segmentId);
				if (segmentStatus === "running" || segmentStatus === "pending") {
					actualPendingTaskIds.push(taskId);
				}
				continue;
			}

			const reconciled = reconciledMap.get(taskId);
			if (!reconciled) {
				actualPendingTaskIds.push(taskId); // Unknown task — treat as pending
				continue;
			}

			// reconnect  → alive session, needs reconnection and remains pending
			// re-execute → existing worktree, needs re-execution and remains pending
			// pending    → never-started task that still needs execution
			// skip with a persisted "pending" status → still needs execution
			const needsExecution =
				reconciled.action === "reconnect" ||
				reconciled.action === "re-execute" ||
				reconciled.action === "pending" ||
				(reconciled.action === "skip" && reconciled.persistedStatus === "pending");
			if (needsExecution) {
				actualPendingTaskIds.push(taskId);
			}
		}
	}

	return {
		resumeWaveIndex,
		completedTaskIds,
		pendingTaskIds: actualPendingTaskIds,
		failedTaskIds,
		reconnectTaskIds,
		reExecuteTaskIds,
		mergeRetryWaveIndexes,
	};
}
|
|
1066
|
+
|
|
1067
|
+
// ── Pre-Resume Diagnostics ───────────────────────────────────────────
|
|
1068
|
+
|
|
1069
|
+
/**
 * Outcome of one pre-resume diagnostic check.
 */
export interface DiagnosticCheckResult {
	/** Short identifier of the check, e.g. "state-coherence" or "worktree-health:lane-1" */
	check: string;
	/** True when the check succeeded */
	passed: boolean;
	/** Human-readable explanation: why the check failed, or what was confirmed */
	detail: string;
}
|
|
1080
|
+
|
|
1081
|
+
/**
 * Combined outcome of all pre-resume diagnostic checks.
 */
export interface PreResumeDiagnosticsResult {
	/** True only when every individual check passed, i.e. resume may proceed */
	passed: boolean;
	/** Results of the individual checks, in the order they were run */
	checks: DiagnosticCheckResult[];
	/** Operator-facing summary message */
	summary: string;
}
|
|
1092
|
+
|
|
1093
|
+
/**
 * Run the diagnostics that gate a force-resume.
 *
 * Checks performed:
 * 1. **State coherence:** always recorded as passed — the caller has already
 *    loaded the batch state, so reaching this function implies it is valid
 * 2. **Branch consistency:** when the state names an orch branch, that branch
 *    must resolve under `refs/heads/` in every repo root returned by
 *    `collectRepoRoots`
 * 3. **Worktree health:** every persisted lane worktree path must either be
 *    absent or contain a `.git` entry
 *
 * Reads filesystem/git state but does not write anything itself.
 *
 * @param persistedState - Loaded batch state
 * @param repoRoot - Default repo root (cwd)
 * @param stateRoot - Root for state files (.pi/); not read by the current implementation
 * @param workspaceConfig - Workspace configuration (null in repo mode)
 * @returns Diagnostics result with overall pass/fail, per-check details, and a summary
 */
export function runPreResumeDiagnostics(
	persistedState: PersistedBatchState,
	repoRoot: string,
	stateRoot: string,
	workspaceConfig?: WorkspaceConfig | null,
): PreResumeDiagnosticsResult {
	const checks: DiagnosticCheckResult[] = [];
	const record = (check: string, passed: boolean, detail: string): void => {
		checks.push({ check, passed, detail });
	};

	// 1. State coherence — the state was loaded by the caller, so it is well-formed.
	record(
		"state-coherence",
		true,
		`Batch state loaded successfully (batchId: ${persistedState.batchId}, phase: ${persistedState.phase})`,
	);

	// 2. Branch consistency — the orch branch must exist in each repo.
	for (const root of collectRepoRoots(persistedState, repoRoot, workspaceConfig)) {
		const repoId = resolveRepoIdFromRoot(root, workspaceConfig);
		const label = repoId ? `repo:${repoId}` : "default-repo";

		if (!persistedState.orchBranch) continue;

		const branchLookup = runGit(
			["rev-parse", "--verify", `refs/heads/${persistedState.orchBranch}`],
			root,
		);
		const branchDetail = branchLookup.ok
			? `Orch branch "${persistedState.orchBranch}" exists in ${label}`
			: `Orch branch "${persistedState.orchBranch}" not found in ${label}. ` +
				`The branch may have been deleted or the repo is in an inconsistent state.`;
		record(`branch-consistency:${label}`, branchLookup.ok ? true : false, branchDetail);
	}

	// 3. Worktree health — inspect each persisted lane worktree.
	for (const lane of persistedState.lanes) {
		if (!lane.worktreePath) continue;

		const checkName = `worktree-health:lane-${lane.laneNumber}`;

		if (!existsSync(lane.worktreePath)) {
			// Absent worktree is OK — resume will re-create or skip
			record(checkName, true, `Lane ${lane.laneNumber} worktree absent (will be re-created on resume)`);
			continue;
		}

		// A usable git worktree carries a .git file or directory at its root.
		const hasGitMarker = existsSync(join(lane.worktreePath, ".git"));
		record(
			checkName,
			hasGitMarker,
			hasGitMarker
				? `Lane ${lane.laneNumber} worktree exists and has valid .git marker`
				: `Lane ${lane.laneNumber} worktree exists at ${lane.worktreePath} but lacks .git marker (corrupted)`,
		);
	}

	const failed = checks.filter((c) => !c.passed);
	const passed = failed.length === 0;

	let summary: string;
	if (passed) {
		summary = `✅ Pre-resume diagnostics passed (${checks.length} checks)`;
	} else {
		summary =
			`❌ Pre-resume diagnostics failed (${failed.length}/${checks.length} checks failed):\n` +
			failed.map((c) => ` • ${c.check}: ${c.detail}`).join("\n");
	}

	return { passed, checks, summary };
}
|
|
1190
|
+
|
|
1191
|
+
export async function resumeOrchBatch(
|
|
1192
|
+
orchConfig: OrchestratorConfig,
|
|
1193
|
+
runnerConfig: TaskRunnerConfig,
|
|
1194
|
+
cwd: string,
|
|
1195
|
+
batchState: OrchBatchRuntimeState,
|
|
1196
|
+
onNotify: (message: string, level: "info" | "warning" | "error") => void,
|
|
1197
|
+
onMonitorUpdate?: MonitorUpdateCallback,
|
|
1198
|
+
workspaceConfig?: WorkspaceConfig | null,
|
|
1199
|
+
workspaceRoot?: string,
|
|
1200
|
+
agentRoot?: string,
|
|
1201
|
+
force: boolean = false,
|
|
1202
|
+
onSupervisorAlert?: import("./types.ts").SupervisorAlertCallback | null,
|
|
1203
|
+
supervisorAutonomy: "interactive" | "supervised" | "autonomous" = "autonomous",
|
|
1204
|
+
/**
|
|
1205
|
+
* TP-187 (#538): Optional callback fired when a lane reaches a terminal
|
|
1206
|
+
* state during a resumed batch. Threaded through to executeWave so the
|
|
1207
|
+
* supervisor process keeps suppressing zombie alerts after resume too.
|
|
1208
|
+
*/
|
|
1209
|
+
onLaneTerminated?: import("./types.ts").LaneTerminatedCallback | null,
|
|
1210
|
+
/**
|
|
1211
|
+
* TP-187 (#538): Optional callback fired when a lane is freshly
|
|
1212
|
+
* (re-)allocated during resume. The supervisor uses it to lift any
|
|
1213
|
+
* carried-over zombie-alert suppression.
|
|
1214
|
+
*/
|
|
1215
|
+
onLaneRespawned?: ((laneNumber: number, agentId: string, batchId: string) => void) | null,
|
|
1216
|
+
): Promise<void> {
|
|
1217
|
+
const repoRoot = cwd;
|
|
1218
|
+
// State files (.pi/batch-state.json, lane-state, etc.) belong in the workspace root,
|
|
1219
|
+
// which is where .pi/ config lives. In repo mode, stateRoot === repoRoot.
|
|
1220
|
+
const stateRoot = workspaceRoot ?? cwd;
|
|
1221
|
+
|
|
1222
|
+
// ── TP-076: Supervisor alert emission helper ─────────────────
|
|
1223
|
+
const emitAlert = (alert: import("./types.ts").SupervisorAlert): void => {
|
|
1224
|
+
if (onSupervisorAlert) {
|
|
1225
|
+
try {
|
|
1226
|
+
onSupervisorAlert(alert);
|
|
1227
|
+
} catch (err: unknown) {
|
|
1228
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1229
|
+
execLog("resume", "unknown", `supervisor alert callback failed: ${msg}`, {
|
|
1230
|
+
alertCategory: alert.category,
|
|
1231
|
+
});
|
|
1232
|
+
}
|
|
1233
|
+
}
|
|
1234
|
+
};
|
|
1235
|
+
|
|
1236
|
+
// ── 1. Load persisted state ──────────────────────────────────
|
|
1237
|
+
let persistedState: PersistedBatchState | null;
|
|
1238
|
+
try {
|
|
1239
|
+
persistedState = loadBatchState(stateRoot);
|
|
1240
|
+
} catch (err: unknown) {
|
|
1241
|
+
if (err instanceof StateFileError) {
|
|
1242
|
+
onNotify(`❌ Cannot resume: ${err.message}`, "error");
|
|
1243
|
+
// ── TP-040 R006: Reset phase on pre-execution early return ──
|
|
1244
|
+
// The caller may have set batchState.phase = "launching" before
|
|
1245
|
+
// calling this function. Since we're returning without starting
|
|
1246
|
+
// any work, reset to "idle" so the batch isn't stuck.
|
|
1247
|
+
batchState.phase = "idle";
|
|
1248
|
+
return;
|
|
1249
|
+
}
|
|
1250
|
+
throw err;
|
|
1251
|
+
}
|
|
1252
|
+
|
|
1253
|
+
if (!persistedState) {
|
|
1254
|
+
if (!force) {
|
|
1255
|
+
onNotify(ORCH_MESSAGES.resumeNoState(), "error");
|
|
1256
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1257
|
+
batchState.phase = "idle";
|
|
1258
|
+
return;
|
|
1259
|
+
}
|
|
1260
|
+
// TP-187 (#539): On force-resume, attempt deterministic reconstruction
|
|
1261
|
+
// from .pi/runtime/<batchId>/ runtime artifacts (typically left intact
|
|
1262
|
+
// by `orch_abort()` even though `.pi/batch-state.json` is deleted).
|
|
1263
|
+
const reconstruction = reconstructBatchStateFromRuntime(stateRoot);
|
|
1264
|
+
if (!reconstruction.ok) {
|
|
1265
|
+
onNotify(ORCH_MESSAGES.resumeNoStateAfterAbort(reconstruction.error, null), "error");
|
|
1266
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1267
|
+
batchState.phase = "idle";
|
|
1268
|
+
return;
|
|
1269
|
+
}
|
|
1270
|
+
// Successful reconstruction: persist so the rest of resumeOrchBatch
|
|
1271
|
+
// proceeds with a normal on-disk batch-state.json picture.
|
|
1272
|
+
onNotify(
|
|
1273
|
+
ORCH_MESSAGES.resumeReconstructed(reconstruction.batchId, reconstruction.selectionNote),
|
|
1274
|
+
"warning",
|
|
1275
|
+
);
|
|
1276
|
+
try {
|
|
1277
|
+
saveBatchState(JSON.stringify(reconstruction.state, null, 2), stateRoot);
|
|
1278
|
+
} catch (err) {
|
|
1279
|
+
onNotify(
|
|
1280
|
+
ORCH_MESSAGES.resumeNoStateAfterAbort(
|
|
1281
|
+
`reconstructed state could not be persisted: ${err instanceof Error ? err.message : String(err)}`,
|
|
1282
|
+
reconstruction.batchId,
|
|
1283
|
+
),
|
|
1284
|
+
"error",
|
|
1285
|
+
);
|
|
1286
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1287
|
+
batchState.phase = "idle";
|
|
1288
|
+
return;
|
|
1289
|
+
}
|
|
1290
|
+
persistedState = reconstruction.state;
|
|
1291
|
+
}
|
|
1292
|
+
|
|
1293
|
+
// ── 2. Check eligibility ─────────────────────────────────────
|
|
1294
|
+
const eligibility = checkResumeEligibility(persistedState, force);
|
|
1295
|
+
if (!eligibility.eligible) {
|
|
1296
|
+
onNotify(
|
|
1297
|
+
ORCH_MESSAGES.resumePhaseNotResumable(
|
|
1298
|
+
persistedState.batchId,
|
|
1299
|
+
persistedState.phase,
|
|
1300
|
+
eligibility.reason,
|
|
1301
|
+
),
|
|
1302
|
+
"error",
|
|
1303
|
+
);
|
|
1304
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1305
|
+
batchState.phase = "idle";
|
|
1306
|
+
return;
|
|
1307
|
+
}
|
|
1308
|
+
|
|
1309
|
+
// ── 2b. Force-resume: pre-resume diagnostics & state mutation ──
|
|
1310
|
+
const isForceResume =
|
|
1311
|
+
force && (persistedState.phase === "stopped" || persistedState.phase === "failed");
|
|
1312
|
+
if (isForceResume) {
|
|
1313
|
+
onNotify(
|
|
1314
|
+
ORCH_MESSAGES.forceResumeStarting(persistedState.batchId, persistedState.phase),
|
|
1315
|
+
"warning",
|
|
1316
|
+
);
|
|
1317
|
+
|
|
1318
|
+
// Run pre-resume diagnostics before allowing force-resume
|
|
1319
|
+
const diagnostics = runPreResumeDiagnostics(persistedState, repoRoot, stateRoot, workspaceConfig);
|
|
1320
|
+
onNotify(diagnostics.summary, diagnostics.passed ? "info" : "error");
|
|
1321
|
+
|
|
1322
|
+
if (!diagnostics.passed) {
|
|
1323
|
+
onNotify(ORCH_MESSAGES.forceResumeDiagnosticsFailed(persistedState.batchId), "error");
|
|
1324
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1325
|
+
batchState.phase = "idle";
|
|
1326
|
+
return;
|
|
1327
|
+
}
|
|
1328
|
+
|
|
1329
|
+
// Record force intent in resilience state
|
|
1330
|
+
persistedState.resilience.resumeForced = true;
|
|
1331
|
+
|
|
1332
|
+
// Reset phase to paused so normal resume flow can proceed
|
|
1333
|
+
execLog(
|
|
1334
|
+
"resume",
|
|
1335
|
+
persistedState.batchId,
|
|
1336
|
+
`force-resume: phase ${persistedState.phase} → paused`,
|
|
1337
|
+
{
|
|
1338
|
+
diagnosticChecks: diagnostics.checks.length,
|
|
1339
|
+
diagnosticsPassed: diagnostics.passed,
|
|
1340
|
+
},
|
|
1341
|
+
);
|
|
1342
|
+
persistedState.phase = "paused";
|
|
1343
|
+
}
|
|
1344
|
+
|
|
1345
|
+
onNotify(ORCH_MESSAGES.resumeStarting(persistedState.batchId, persistedState.phase), "info");
|
|
1346
|
+
|
|
1347
|
+
const segmentFrontierByTask = reconstructSegmentFrontier(persistedState);
|
|
1348
|
+
if (segmentFrontierByTask.size > 0) {
|
|
1349
|
+
let completedSegments = 0;
|
|
1350
|
+
let inFlightSegments = 0;
|
|
1351
|
+
let pendingSegments = 0;
|
|
1352
|
+
for (const frontier of segmentFrontierByTask.values()) {
|
|
1353
|
+
completedSegments += frontier.completedSegmentIds.length;
|
|
1354
|
+
inFlightSegments += frontier.inFlightSegmentIds.length;
|
|
1355
|
+
pendingSegments += frontier.pendingSegmentIds.length;
|
|
1356
|
+
}
|
|
1357
|
+
execLog("resume", persistedState.batchId, `segment frontier reconstructed`, {
|
|
1358
|
+
tasks: segmentFrontierByTask.size,
|
|
1359
|
+
completedSegments,
|
|
1360
|
+
inFlightSegments,
|
|
1361
|
+
pendingSegments,
|
|
1362
|
+
});
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1365
|
+
const runtimeWavePlan = buildResumeRuntimeWavePlan(persistedState);
|
|
1366
|
+
// TP-108/112: Runtime V2 backend selection for resumed batches.
|
|
1367
|
+
// MUST be computed before any backend-aware branch (section 3+).
|
|
1368
|
+
const resumeBackend: RuntimeBackend = selectRuntimeBackend(
|
|
1369
|
+
"all",
|
|
1370
|
+
runtimeWavePlan,
|
|
1371
|
+
workspaceConfig,
|
|
1372
|
+
).backend;
|
|
1373
|
+
execLog("resume", batchState.batchId, `runtime backend for resumed execution: ${resumeBackend}`);
|
|
1374
|
+
|
|
1375
|
+
// ── 3. Discover live signals ─────────────────────────────────
|
|
1376
|
+
// TP-112/119: Runtime V2 session liveness check only.
|
|
1377
|
+
// Alive sessions are discovered from the process registry.
|
|
1378
|
+
const aliveSessions = new Set<string>();
|
|
1379
|
+
const registry = readRegistrySnapshot(stateRoot, persistedState.batchId);
|
|
1380
|
+
if (registry) {
|
|
1381
|
+
for (const manifest of Object.values(registry.agents)) {
|
|
1382
|
+
if (!isTerminalStatus(manifest.status) && isProcessAlive(manifest.pid)) {
|
|
1383
|
+
aliveSessions.add(manifest.agentId);
|
|
1384
|
+
// Also add lane session name (without role suffix) so reconciliation
|
|
1385
|
+
// matches persisted task.sessionName.
|
|
1386
|
+
// e.g., "orch-op-lane-1-worker" -> also add "orch-op-lane-1"
|
|
1387
|
+
const laneSession = manifest.agentId.replace(/-(worker|reviewer)$/, "");
|
|
1388
|
+
if (laneSession !== manifest.agentId) aliveSessions.add(laneSession);
|
|
1389
|
+
}
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
|
|
1393
|
+
// Check .DONE files — check both original path and worktree-relative path.
|
|
1394
|
+
// TP-109: In workspace mode or V2 execution, .DONE is written in the worktree
|
|
1395
|
+
// at the resolved packet path, not the original discovery path. Resume must
|
|
1396
|
+
// check both locations for authoritative completion detection.
|
|
1397
|
+
const doneTaskIds = collectDoneTaskIdsForResume(persistedState, repoRoot, workspaceConfig);
|
|
1398
|
+
|
|
1399
|
+
// ── 3b. Detect existing worktrees ────────────────────────────
|
|
1400
|
+
const existingWorktreeTaskIds = new Set<string>();
|
|
1401
|
+
for (const task of persistedState.tasks) {
|
|
1402
|
+
const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
|
|
1403
|
+
if (laneRecord && laneRecord.worktreePath && existsSync(laneRecord.worktreePath)) {
|
|
1404
|
+
existingWorktreeTaskIds.add(task.taskId);
|
|
1405
|
+
}
|
|
1406
|
+
}
|
|
1407
|
+
|
|
1408
|
+
// ── 4. Reconcile task states ─────────────────────────────────
|
|
1409
|
+
const reconciledTasks = reconcileTaskStates(
|
|
1410
|
+
persistedState,
|
|
1411
|
+
aliveSessions,
|
|
1412
|
+
doneTaskIds,
|
|
1413
|
+
existingWorktreeTaskIds,
|
|
1414
|
+
);
|
|
1415
|
+
|
|
1416
|
+
// ── 4b. Clear stale session allocation for tasks reconciled as pending ──
|
|
1417
|
+
// TP-037 (Bug #102b): Pending tasks that had a sessionName from a prior
|
|
1418
|
+
// failed resume but were never actually started need their allocation
|
|
1419
|
+
// metadata cleared so they can be freshly assigned to new lanes.
|
|
1420
|
+
// We also prune these tasks from persisted lane records so that
|
|
1421
|
+
// serializeBatchState() doesn't reintroduce stale sessionName via lane
|
|
1422
|
+
// fallback paths when outcome.sessionName is absent.
|
|
1423
|
+
const stalePendingTaskIds = new Set<string>();
|
|
1424
|
+
for (const reconciled of reconciledTasks) {
|
|
1425
|
+
if (reconciled.action === "pending") {
|
|
1426
|
+
const persistedTask = persistedState.tasks.find((t) => t.taskId === reconciled.taskId);
|
|
1427
|
+
if (persistedTask && persistedTask.sessionName) {
|
|
1428
|
+
execLog(
|
|
1429
|
+
"resume",
|
|
1430
|
+
persistedState.batchId,
|
|
1431
|
+
`clear-stale-session: ${reconciled.taskId} had stale session "${persistedTask.sessionName}" (lane ${persistedTask.laneNumber})`,
|
|
1432
|
+
);
|
|
1433
|
+
stalePendingTaskIds.add(reconciled.taskId);
|
|
1434
|
+
persistedTask.sessionName = "";
|
|
1435
|
+
persistedTask.laneNumber = 0;
|
|
1436
|
+
}
|
|
1437
|
+
}
|
|
1438
|
+
}
|
|
1439
|
+
// Prune stale-pending tasks from lane records so reconstructAllocatedLanes()
|
|
1440
|
+
// (and subsequent serializeBatchState()) won't map them back to the old lane.
|
|
1441
|
+
if (stalePendingTaskIds.size > 0) {
|
|
1442
|
+
for (const lane of persistedState.lanes) {
|
|
1443
|
+
lane.taskIds = lane.taskIds.filter((id) => !stalePendingTaskIds.has(id));
|
|
1444
|
+
}
|
|
1445
|
+
}
|
|
1446
|
+
|
|
1447
|
+
// ── 5. Compute resume point ──────────────────────────────────
|
|
1448
|
+
const resumePoint = computeResumePoint(persistedState, reconciledTasks, runtimeWavePlan);
|
|
1449
|
+
const completedTaskSet = new Set(resumePoint.completedTaskIds);
|
|
1450
|
+
const failedTaskSet = new Set(resumePoint.failedTaskIds);
|
|
1451
|
+
const reconnectTaskSet = new Set(resumePoint.reconnectTaskIds);
|
|
1452
|
+
const reExecuteTaskSet = new Set(resumePoint.reExecuteTaskIds);
|
|
1453
|
+
|
|
1454
|
+
onNotify(
|
|
1455
|
+
ORCH_MESSAGES.resumeReconciled(
|
|
1456
|
+
persistedState.batchId,
|
|
1457
|
+
resumePoint.completedTaskIds.length,
|
|
1458
|
+
resumePoint.pendingTaskIds.length,
|
|
1459
|
+
resumePoint.failedTaskIds.length,
|
|
1460
|
+
resumePoint.reconnectTaskIds.length,
|
|
1461
|
+
resumePoint.reExecuteTaskIds.length,
|
|
1462
|
+
),
|
|
1463
|
+
"info",
|
|
1464
|
+
);
|
|
1465
|
+
|
|
1466
|
+
if (resumePoint.reconnectTaskIds.length > 0) {
|
|
1467
|
+
onNotify(ORCH_MESSAGES.resumeReconnecting(resumePoint.reconnectTaskIds.length), "info");
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1470
|
+
if (resumePoint.resumeWaveIndex > 0) {
|
|
1471
|
+
onNotify(ORCH_MESSAGES.resumeSkippedWaves(resumePoint.resumeWaveIndex), "info");
|
|
1472
|
+
}
|
|
1473
|
+
|
|
1474
|
+
if (resumePoint.mergeRetryWaveIndexes.length > 0) {
|
|
1475
|
+
onNotify(
|
|
1476
|
+
`🔀 ${resumePoint.mergeRetryWaveIndexes.length} wave(s) need merge retry: ${resumePoint.mergeRetryWaveIndexes.map((i) => `W${i + 1}`).join(", ")}`,
|
|
1477
|
+
"warning",
|
|
1478
|
+
);
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1481
|
+
// ── 6. Reconstruct runtime state ─────────────────────────────
|
|
1482
|
+
|
|
1483
|
+
// Guard: orchBranch must be present for routing. Persisted states from
|
|
1484
|
+
// pre-TP-022 runs may have orchBranch="" (TP-020 defaults).
|
|
1485
|
+
// Check BEFORE mutating batchState so phase/batchId remain idle on rejection,
|
|
1486
|
+
// allowing future /orch-resume or /orch-abort to proceed.
|
|
1487
|
+
if (!persistedState.orchBranch) {
|
|
1488
|
+
onNotify(
|
|
1489
|
+
`❌ Cannot resume batch ${persistedState.batchId}: persisted state has no orch branch. ` +
|
|
1490
|
+
`This batch was created before orch-branch routing was implemented. ` +
|
|
1491
|
+
`Use /orch-abort to clean up, then start a new batch.`,
|
|
1492
|
+
"error",
|
|
1493
|
+
);
|
|
1494
|
+
// TP-040 R006: Reset phase on pre-execution early return
|
|
1495
|
+
batchState.phase = "idle";
|
|
1496
|
+
return;
|
|
1497
|
+
}
|
|
1498
|
+
|
|
1499
|
+
batchState.phase = "executing";
|
|
1500
|
+
batchState.batchId = persistedState.batchId;
|
|
1501
|
+
batchState.baseBranch = persistedState.baseBranch || "";
|
|
1502
|
+
batchState.orchBranch = persistedState.orchBranch;
|
|
1503
|
+
|
|
1504
|
+
batchState.mode = persistedState.mode;
|
|
1505
|
+
batchState.startedAt = persistedState.startedAt;
|
|
1506
|
+
// Preserve pauseSignal if already set during "launching" phase (TP-040)
|
|
1507
|
+
if (!batchState.pauseSignal?.paused) batchState.pauseSignal = { paused: false };
|
|
1508
|
+
batchState.totalWaves = persistedState.totalWaves;
|
|
1509
|
+
// TP-166: Restore task-level wave metadata for correct display.
|
|
1510
|
+
// Normalize: fall back to totalWaves for pre-TP-166 state files.
|
|
1511
|
+
batchState.taskLevelWaveCount = persistedState.taskLevelWaveCount ?? persistedState.totalWaves;
|
|
1512
|
+
batchState.roundToTaskWave = persistedState.roundToTaskWave
|
|
1513
|
+
? [...persistedState.roundToTaskWave]
|
|
1514
|
+
: undefined;
|
|
1515
|
+
batchState.totalTasks = persistedState.totalTasks;
|
|
1516
|
+
batchState.succeededTasks = resumePoint.completedTaskIds.length;
|
|
1517
|
+
batchState.failedTasks = resumePoint.failedTaskIds.length;
|
|
1518
|
+
batchState.skippedTasks = persistedState.skippedTasks;
|
|
1519
|
+
batchState.blockedTasks = persistedState.blockedTasks;
|
|
1520
|
+
batchState.blockedTaskIds = new Set(persistedState.blockedTaskIds);
|
|
1521
|
+
// Track persisted blocked IDs separately to avoid double-counting in wave loop.
|
|
1522
|
+
// Engine.ts counts blocked tasks per-wave when a wave is entered. If the prior
|
|
1523
|
+
// run paused before reaching a wave, tasks blocked for that wave are in
|
|
1524
|
+
// `blockedTaskIds` but NOT yet counted in `blockedTasks`. On resume, the
|
|
1525
|
+
// per-wave counting loop excludes `persistedBlockedTaskIds`, so those tasks
|
|
1526
|
+
// would never be counted. Fix: count persisted blocked tasks in future waves
|
|
1527
|
+
// (waves >= resumeWaveIndex) that were not yet counted.
|
|
1528
|
+
const persistedBlockedTaskIds = new Set(persistedState.blockedTaskIds);
|
|
1529
|
+
|
|
1530
|
+
// Count persisted-blocked tasks in unvisited waves (wave >= resumeWaveIndex).
|
|
1531
|
+
// These were added to blockedTaskIds in the prior run but their wave was never
|
|
1532
|
+
// entered, so they were never counted in blockedTasks.
|
|
1533
|
+
if (persistedBlockedTaskIds.size > 0) {
|
|
1534
|
+
let uncountedBlocked = 0;
|
|
1535
|
+
for (let wi = resumePoint.resumeWaveIndex; wi < runtimeWavePlan.length; wi++) {
|
|
1536
|
+
for (const taskId of runtimeWavePlan[wi]) {
|
|
1537
|
+
if (persistedBlockedTaskIds.has(taskId)) {
|
|
1538
|
+
uncountedBlocked++;
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
}
|
|
1542
|
+
if (uncountedBlocked > 0) {
|
|
1543
|
+
batchState.blockedTasks += uncountedBlocked;
|
|
1544
|
+
execLog(
|
|
1545
|
+
"resume",
|
|
1546
|
+
persistedState.batchId,
|
|
1547
|
+
`blocked counter fix: ${uncountedBlocked} persisted-blocked task(s) in unvisited waves added to blockedTasks`,
|
|
1548
|
+
);
|
|
1549
|
+
}
|
|
1550
|
+
}
|
|
1551
|
+
|
|
1552
|
+
batchState.errors = [...persistedState.errors];
|
|
1553
|
+
batchState.endedAt = null;
|
|
1554
|
+
batchState.currentWaveIndex = resumePoint.resumeWaveIndex;
|
|
1555
|
+
batchState.waveResults = [];
|
|
1556
|
+
|
|
1557
|
+
// v3: Carry forward resilience and diagnostics from persisted state
|
|
1558
|
+
batchState.resilience = persistedState.resilience;
|
|
1559
|
+
batchState.diagnostics = persistedState.diagnostics;
|
|
1560
|
+
// v4: Carry forward segment records (including dynamically expanded segments)
|
|
1561
|
+
batchState.segments = [...(persistedState.segments ?? [])];
|
|
1562
|
+
// Carry forward unknown fields for roundtrip preservation
|
|
1563
|
+
if (persistedState._extraFields) {
|
|
1564
|
+
batchState._extraFields = persistedState._extraFields;
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
// ── 6b. TP-169: Verify orch branch exists in all workspace repos ────
|
|
1568
|
+
// During the original batch start, the orch branch was created in every
|
|
1569
|
+
// workspace repo. On resume, we verify it still exists. If it's missing
|
|
1570
|
+
// in any repo (e.g., deleted by user, corrupted), re-create it from the
|
|
1571
|
+
// repo's current branch so that worktree creation doesn't silently fall
|
|
1572
|
+
// back to the base branch, bypassing orch branch isolation.
|
|
1573
|
+
if (workspaceConfig && batchState.orchBranch) {
|
|
1574
|
+
for (const [repoId, repoConf] of workspaceConfig.repos) {
|
|
1575
|
+
const rRoot = repoConf.path;
|
|
1576
|
+
const check = runGit(["rev-parse", "--verify", `refs/heads/${batchState.orchBranch}`], rRoot);
|
|
1577
|
+
if (!check.ok) {
|
|
1578
|
+
// Orch branch missing in this repo — re-create from current HEAD
|
|
1579
|
+
const repoBranch = getCurrentBranch(rRoot) || "HEAD";
|
|
1580
|
+
const createRes = runGit(["branch", batchState.orchBranch, repoBranch], rRoot);
|
|
1581
|
+
if (createRes.ok) {
|
|
1582
|
+
execLog("resume", batchState.batchId, `re-created missing orch branch in ${repoId}`, {
|
|
1583
|
+
orchBranch: batchState.orchBranch,
|
|
1584
|
+
base: repoBranch,
|
|
1585
|
+
});
|
|
1586
|
+
onNotify(
|
|
1587
|
+
`⚠️ Orch branch "${batchState.orchBranch}" was missing in repo "${repoId}" — re-created from ${repoBranch}`,
|
|
1588
|
+
"warning",
|
|
1589
|
+
);
|
|
1590
|
+
} else {
|
|
1591
|
+
const errMsg =
|
|
1592
|
+
`Failed to re-create orch branch "${batchState.orchBranch}" in repo "${repoId}": ${createRes.stderr}. ` +
|
|
1593
|
+
`Cannot resume without orch branch isolation.`;
|
|
1594
|
+
execLog("resume", batchState.batchId, errMsg, {
|
|
1595
|
+
orchBranch: batchState.orchBranch,
|
|
1596
|
+
error: createRes.stderr,
|
|
1597
|
+
});
|
|
1598
|
+
throw new Error(errMsg);
|
|
1599
|
+
}
|
|
1600
|
+
}
|
|
1601
|
+
}
|
|
1602
|
+
}
|
|
1603
|
+
|
|
1604
|
+
// ── 7. Re-run discovery for ParsedTask metadata ──────────────
|
|
1605
|
+
// We need fresh ParsedTask data (taskFolder, promptPath) for execution.
|
|
1606
|
+
// Use "all" to discover all areas.
|
|
1607
|
+
const discovery = runDiscovery("all", runnerConfig.task_areas, cwd, {
|
|
1608
|
+
refreshDependencies: false,
|
|
1609
|
+
dependencySource: orchConfig.dependencies.source,
|
|
1610
|
+
useDependencyCache: orchConfig.dependencies.cache,
|
|
1611
|
+
workspaceConfig: workspaceConfig ?? null,
|
|
1612
|
+
});
|
|
1613
|
+
|
|
1614
|
+
// Build dependency graph for skip-dependents policy
|
|
1615
|
+
const depGraph = buildDependencyGraph(discovery.pending, discovery.completed);
|
|
1616
|
+
batchState.dependencyGraph = depGraph;
|
|
1617
|
+
|
|
1618
|
+
// Rehydrate discovered tasks with persisted segment metadata.
|
|
1619
|
+
// Dynamically expanded segments may reference tasks that have segment-level
|
|
1620
|
+
// fields (segmentIds, activeSegmentId, packetRepoId, packetTaskPath) set
|
|
1621
|
+
// during the prior run. Merge these back into discovered ParsedTask records
|
|
1622
|
+
// so execution can resume with correct segment context.
|
|
1623
|
+
for (const persistedTask of persistedState.tasks) {
|
|
1624
|
+
const parsed = discovery.pending.get(persistedTask.taskId);
|
|
1625
|
+
if (!parsed) continue;
|
|
1626
|
+
if (persistedTask.segmentIds?.length) {
|
|
1627
|
+
parsed.segmentIds = persistedTask.segmentIds;
|
|
1628
|
+
}
|
|
1629
|
+
if (persistedTask.activeSegmentId !== undefined) {
|
|
1630
|
+
parsed.activeSegmentId = persistedTask.activeSegmentId;
|
|
1631
|
+
}
|
|
1632
|
+
if (persistedTask.packetRepoId) {
|
|
1633
|
+
parsed.packetRepoId = persistedTask.packetRepoId;
|
|
1634
|
+
}
|
|
1635
|
+
if (persistedTask.packetTaskPath) {
|
|
1636
|
+
parsed.packetTaskPath = persistedTask.packetTaskPath;
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
|
|
1640
|
+
// ── 8. Handle alive sessions (reconnect) ─────────────────────
|
|
1641
|
+
// For tasks with alive sessions, we need to wait for them to complete.
|
|
1642
|
+
// Under Runtime V2 each such session is terminated and its task re-run via executeLaneV2 (see the TP-112 note below).
|
|
1643
|
+
const reconnectTasks = reconciledTasks.filter((t) => t.action === "reconnect");
|
|
1644
|
+
const reconnectFinalStatus = new Map<string, LaneTaskStatus>();
|
|
1645
|
+
|
|
1646
|
+
if (reconnectTasks.length > 0) {
|
|
1647
|
+
// Run each reconnect task to completion (terminate stale agents, then rehydrate via executeLaneV2)
|
|
1648
|
+
for (const task of reconnectTasks) {
|
|
1649
|
+
const parsedTask = discovery.pending.get(task.taskId);
|
|
1650
|
+
if (!parsedTask) continue;
|
|
1651
|
+
|
|
1652
|
+
// Find the lane info from persisted state
|
|
1653
|
+
const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
|
|
1654
|
+
if (!laneRecord) continue;
|
|
1655
|
+
|
|
1656
|
+
// Build a minimal AllocatedLane for polling
|
|
1657
|
+
const allocatedTask: AllocatedTask = {
|
|
1658
|
+
taskId: task.taskId,
|
|
1659
|
+
order: 0,
|
|
1660
|
+
task: parsedTask,
|
|
1661
|
+
estimatedMinutes: 0,
|
|
1662
|
+
};
|
|
1663
|
+
const lane: AllocatedLane = {
|
|
1664
|
+
laneNumber: laneRecord.laneNumber,
|
|
1665
|
+
laneId: laneRecord.laneId,
|
|
1666
|
+
laneSessionId: laneRecord.laneSessionId,
|
|
1667
|
+
worktreePath: laneRecord.worktreePath,
|
|
1668
|
+
branch: laneRecord.branch,
|
|
1669
|
+
tasks: [allocatedTask],
|
|
1670
|
+
strategy: "round-robin",
|
|
1671
|
+
estimatedLoad: 0,
|
|
1672
|
+
estimatedMinutes: 0,
|
|
1673
|
+
...(laneRecord.repoId !== undefined ? { repoId: laneRecord.repoId } : {}),
|
|
1674
|
+
};
|
|
1675
|
+
|
|
1676
|
+
// Resolve per-lane repo root for workspace mode (v1/repo mode: falls back to repoRoot)
|
|
1677
|
+
const laneRepoRoot = resolveRepoRoot(laneRecord.repoId, repoRoot, workspaceConfig);
|
|
1678
|
+
|
|
1679
|
+
// TP-112: Runtime V2 reconnect.
|
|
1680
|
+
// Agent-host processes do not survive supervisor restart, so reconnect
|
|
1681
|
+
// uses terminate + rehydrate via executeLaneV2.
|
|
1682
|
+
execLog("resume", task.taskId, "V2 reconnect: terminate + rehydrate via lane-runner", {
|
|
1683
|
+
repoId: laneRecord.repoId ?? "(default)",
|
|
1684
|
+
});
|
|
1685
|
+
terminateAliveV2Agents(stateRoot, persistedState.batchId, laneRecord.laneSessionId);
|
|
1686
|
+
try {
|
|
1687
|
+
const laneResult = await executeLaneV2(
|
|
1688
|
+
lane,
|
|
1689
|
+
orchConfig,
|
|
1690
|
+
laneRepoRoot,
|
|
1691
|
+
batchState.pauseSignal,
|
|
1692
|
+
workspaceRoot,
|
|
1693
|
+
!!workspaceConfig,
|
|
1694
|
+
{
|
|
1695
|
+
ORCH_BATCH_ID: batchState.batchId,
|
|
1696
|
+
...buildReviewerEnv(runnerConfig.reviewer),
|
|
1697
|
+
...buildWorkerExcludeEnv(runnerConfig.workerExcludeExtensions),
|
|
1698
|
+
},
|
|
1699
|
+
emitAlert,
|
|
1700
|
+
);
|
|
1701
|
+
const taskResult = laneResult.tasks.find((t) => t.taskId === task.taskId);
|
|
1702
|
+
if (taskResult?.status === "succeeded") {
|
|
1703
|
+
reconnectFinalStatus.set(task.taskId, "succeeded");
|
|
1704
|
+
completedTaskSet.add(task.taskId);
|
|
1705
|
+
failedTaskSet.delete(task.taskId);
|
|
1706
|
+
reconnectTaskSet.delete(task.taskId);
|
|
1707
|
+
batchState.succeededTasks++;
|
|
1708
|
+
} else {
|
|
1709
|
+
reconnectFinalStatus.set(task.taskId, "failed");
|
|
1710
|
+
failedTaskSet.add(task.taskId);
|
|
1711
|
+
completedTaskSet.delete(task.taskId);
|
|
1712
|
+
reconnectTaskSet.delete(task.taskId);
|
|
1713
|
+
batchState.failedTasks++;
|
|
1714
|
+
}
|
|
1715
|
+
} catch (err: unknown) {
|
|
1716
|
+
reconnectFinalStatus.set(task.taskId, "failed");
|
|
1717
|
+
failedTaskSet.add(task.taskId);
|
|
1718
|
+
completedTaskSet.delete(task.taskId);
|
|
1719
|
+
reconnectTaskSet.delete(task.taskId);
|
|
1720
|
+
batchState.failedTasks++;
|
|
1721
|
+
execLog(
|
|
1722
|
+
"resume",
|
|
1723
|
+
task.taskId,
|
|
1724
|
+
`V2 reconnect error: ${err instanceof Error ? err.message : String(err)}`,
|
|
1725
|
+
);
|
|
1726
|
+
}
|
|
1727
|
+
}
|
|
1728
|
+
}
|
|
1729
|
+
|
|
1730
|
+
// ── 8b. Handle re-execute tasks (dead session + existing worktree) ──
|
|
1731
|
+
const reExecuteTasks = reconciledTasks.filter((t) => t.action === "re-execute");
|
|
1732
|
+
const reExecuteFinalStatus = new Map<string, LaneTaskStatus>();
|
|
1733
|
+
const reExecAllocatedLanes: AllocatedLane[] = [];
|
|
1734
|
+
|
|
1735
|
+
if (reExecuteTasks.length > 0) {
|
|
1736
|
+
onNotify(
|
|
1737
|
+
`🔄 Re-executing ${reExecuteTasks.length} interrupted task(s) in existing worktrees...`,
|
|
1738
|
+
"info",
|
|
1739
|
+
);
|
|
1740
|
+
|
|
1741
|
+
for (const task of reExecuteTasks) {
|
|
1742
|
+
const parsedTask = discovery.pending.get(task.taskId);
|
|
1743
|
+
if (!parsedTask) continue;
|
|
1744
|
+
|
|
1745
|
+
const laneRecord = persistedState.lanes.find((l) => l.taskIds.includes(task.taskId));
|
|
1746
|
+
if (!laneRecord) continue;
|
|
1747
|
+
|
|
1748
|
+
const allocatedTask: AllocatedTask = {
|
|
1749
|
+
taskId: task.taskId,
|
|
1750
|
+
order: 0,
|
|
1751
|
+
task: parsedTask,
|
|
1752
|
+
estimatedMinutes: 0,
|
|
1753
|
+
};
|
|
1754
|
+
const lane: AllocatedLane = {
|
|
1755
|
+
laneNumber: laneRecord.laneNumber,
|
|
1756
|
+
laneId: laneRecord.laneId,
|
|
1757
|
+
laneSessionId: laneRecord.laneSessionId,
|
|
1758
|
+
worktreePath: laneRecord.worktreePath,
|
|
1759
|
+
branch: laneRecord.branch,
|
|
1760
|
+
tasks: [allocatedTask],
|
|
1761
|
+
strategy: "round-robin",
|
|
1762
|
+
estimatedLoad: 0,
|
|
1763
|
+
estimatedMinutes: 0,
|
|
1764
|
+
...(laneRecord.repoId !== undefined ? { repoId: laneRecord.repoId } : {}),
|
|
1765
|
+
};
|
|
1766
|
+
|
|
1767
|
+
// Resolve per-lane repo root for workspace mode (v1/repo mode: falls back to repoRoot)
|
|
1768
|
+
const reExecRepoRoot = resolveRepoRoot(laneRecord.repoId, repoRoot, workspaceConfig);
|
|
1769
|
+
|
|
1770
|
+
execLog("resume", task.taskId, "re-executing interrupted task in existing worktree", {
|
|
1771
|
+
session: laneRecord.laneSessionId,
|
|
1772
|
+
worktree: laneRecord.worktreePath,
|
|
1773
|
+
repoId: laneRecord.repoId ?? "(default)",
|
|
1774
|
+
});
|
|
1775
|
+
|
|
1776
|
+
try {
|
|
1777
|
+
// TP-112: Runtime V2 re-execution.
|
|
1778
|
+
terminateAliveV2Agents(stateRoot, batchState.batchId, laneRecord.laneSessionId);
|
|
1779
|
+
const laneResult = await executeLaneV2(
|
|
1780
|
+
lane,
|
|
1781
|
+
orchConfig,
|
|
1782
|
+
reExecRepoRoot,
|
|
1783
|
+
batchState.pauseSignal,
|
|
1784
|
+
workspaceRoot,
|
|
1785
|
+
!!workspaceConfig,
|
|
1786
|
+
{
|
|
1787
|
+
ORCH_BATCH_ID: batchState.batchId,
|
|
1788
|
+
...buildReviewerEnv(runnerConfig.reviewer),
|
|
1789
|
+
...buildWorkerExcludeEnv(runnerConfig.workerExcludeExtensions),
|
|
1790
|
+
},
|
|
1791
|
+
emitAlert,
|
|
1792
|
+
);
|
|
1793
|
+
const taskResult = laneResult.tasks.find((t) => t.taskId === task.taskId);
|
|
1794
|
+
const pollResult: { status: LaneTaskStatus; exitReason: string; doneFileFound: boolean } = {
|
|
1795
|
+
status: taskResult?.status ?? "failed",
|
|
1796
|
+
exitReason: taskResult?.exitReason ?? "V2 re-execution completed",
|
|
1797
|
+
doneFileFound: taskResult?.doneFileFound ?? false,
|
|
1798
|
+
};
|
|
1799
|
+
|
|
1800
|
+
if (pollResult.status === "succeeded") {
|
|
1801
|
+
reExecuteFinalStatus.set(task.taskId, "succeeded");
|
|
1802
|
+
completedTaskSet.add(task.taskId);
|
|
1803
|
+
failedTaskSet.delete(task.taskId);
|
|
1804
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
1805
|
+
batchState.succeededTasks++;
|
|
1806
|
+
reExecAllocatedLanes.push(lane);
|
|
1807
|
+
execLog("resume", task.taskId, "re-executed task succeeded");
|
|
1808
|
+
} else {
|
|
1809
|
+
reExecuteFinalStatus.set(task.taskId, "failed");
|
|
1810
|
+
failedTaskSet.add(task.taskId);
|
|
1811
|
+
completedTaskSet.delete(task.taskId);
|
|
1812
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
1813
|
+
batchState.failedTasks++;
|
|
1814
|
+
execLog(
|
|
1815
|
+
"resume",
|
|
1816
|
+
task.taskId,
|
|
1817
|
+
`re-executed task ${pollResult.status}: ${pollResult.exitReason}`,
|
|
1818
|
+
);
|
|
1819
|
+
}
|
|
1820
|
+
} catch (err: unknown) {
|
|
1821
|
+
reExecuteFinalStatus.set(task.taskId, "failed");
|
|
1822
|
+
failedTaskSet.add(task.taskId);
|
|
1823
|
+
completedTaskSet.delete(task.taskId);
|
|
1824
|
+
reExecuteTaskSet.delete(task.taskId);
|
|
1825
|
+
batchState.failedTasks++;
|
|
1826
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1827
|
+
execLog("resume", task.taskId, `re-execution error: ${msg}`);
|
|
1828
|
+
}
|
|
1829
|
+
}
|
|
1830
|
+
}
|
|
1831
|
+
|
|
1832
|
+
// ── 8c. Merge re-executed lane branches before cleanup ───────
|
|
1833
|
+
// Re-executed tasks completed outside the normal wave loop, so their
|
|
1834
|
+
// branches would not be merged by step 10. Merge them now.
|
|
1835
|
+
if (reExecAllocatedLanes.length > 0) {
|
|
1836
|
+
const succeededReExecTaskIds = [...reExecuteFinalStatus.entries()]
|
|
1837
|
+
.filter(([_, status]) => status === "succeeded")
|
|
1838
|
+
.map(([taskId]) => taskId);
|
|
1839
|
+
|
|
1840
|
+
if (succeededReExecTaskIds.length > 0) {
|
|
1841
|
+
onNotify(`🔀 Merging ${reExecAllocatedLanes.length} re-executed lane branch(es)...`, "info");
|
|
1842
|
+
|
|
1843
|
+
// Build synthetic WaveExecutionResult for mergeWaveByRepo()
|
|
1844
|
+
const syntheticLaneResults: LaneExecutionResult[] = reExecAllocatedLanes.map((lane) => ({
|
|
1845
|
+
laneNumber: lane.laneNumber,
|
|
1846
|
+
laneId: lane.laneId,
|
|
1847
|
+
tasks: lane.tasks.map((t) => ({
|
|
1848
|
+
taskId: t.taskId,
|
|
1849
|
+
status: "succeeded" as LaneTaskStatus,
|
|
1850
|
+
startTime: Date.now(),
|
|
1851
|
+
endTime: Date.now(),
|
|
1852
|
+
exitReason: "Re-executed task completed successfully",
|
|
1853
|
+
sessionName: lane.laneSessionId,
|
|
1854
|
+
doneFileFound: true,
|
|
1855
|
+
laneNumber: lane.laneNumber,
|
|
1856
|
+
})),
|
|
1857
|
+
overallStatus: "succeeded" as const,
|
|
1858
|
+
startTime: Date.now(),
|
|
1859
|
+
endTime: Date.now(),
|
|
1860
|
+
}));
|
|
1861
|
+
|
|
1862
|
+
// Use waveIndex -1 as a sentinel for "pre-wave-loop re-exec merge".
|
|
1863
|
+
// mergeWaveByRepo expects 1-indexed waveIndex; persistence normalizes
|
|
1864
|
+
// to 0-based via `mr.waveIndex - 1`. By passing -1 here:
|
|
1865
|
+
// - mergeWaveByRepo logs it as "W-1" (harmless)
|
|
1866
|
+
// - persistence normalizes to `Math.max(0, -1 - 1)` = 0 (valid)
|
|
1867
|
+
// - semantically distinguishes re-exec merges from wave 1 merges
|
|
1868
|
+
const RE_EXEC_WAVE_INDEX = -1;
|
|
1869
|
+
|
|
1870
|
+
const syntheticWaveResult: WaveExecutionResult = {
|
|
1871
|
+
waveIndex: RE_EXEC_WAVE_INDEX,
|
|
1872
|
+
startedAt: Date.now(),
|
|
1873
|
+
endedAt: Date.now(),
|
|
1874
|
+
laneResults: syntheticLaneResults,
|
|
1875
|
+
policyApplied: orchConfig.failure.on_task_failure,
|
|
1876
|
+
stoppedEarly: false,
|
|
1877
|
+
failedTaskIds: [],
|
|
1878
|
+
skippedTaskIds: [],
|
|
1879
|
+
succeededTaskIds: succeededReExecTaskIds,
|
|
1880
|
+
blockedTaskIds: [],
|
|
1881
|
+
laneCount: reExecAllocatedLanes.length,
|
|
1882
|
+
overallStatus: "succeeded",
|
|
1883
|
+
finalMonitorState: null,
|
|
1884
|
+
allocatedLanes: reExecAllocatedLanes,
|
|
1885
|
+
};
|
|
1886
|
+
|
|
1887
|
+
const reExecMergeResult = await mergeWaveByRepo(
|
|
1888
|
+
reExecAllocatedLanes,
|
|
1889
|
+
syntheticWaveResult,
|
|
1890
|
+
RE_EXEC_WAVE_INDEX,
|
|
1891
|
+
orchConfig,
|
|
1892
|
+
repoRoot,
|
|
1893
|
+
batchState.batchId,
|
|
1894
|
+
batchState.orchBranch,
|
|
1895
|
+
workspaceConfig,
|
|
1896
|
+
stateRoot,
|
|
1897
|
+
agentRoot,
|
|
1898
|
+
runnerConfig.testing_commands,
|
|
1899
|
+
undefined, // healthMonitor
|
|
1900
|
+
undefined, // forceMixedOutcome
|
|
1901
|
+
resumeBackend,
|
|
1902
|
+
);
|
|
1903
|
+
|
|
1904
|
+
if (reExecMergeResult.status === "succeeded") {
|
|
1905
|
+
onNotify(
|
|
1906
|
+
`✅ Re-executed branch merge complete: ${reExecMergeResult.laneResults.length} lane(s) merged`,
|
|
1907
|
+
"info",
|
|
1908
|
+
);
|
|
1909
|
+
|
|
1910
|
+
// Clean up merged branches (resolve per-lane repo root for workspace mode)
|
|
1911
|
+
// TP-032 R006-3: Exclude verification_new_failure lanes from branch cleanup
|
|
1912
|
+
for (const lr of reExecMergeResult.laneResults) {
|
|
1913
|
+
if (
|
|
1914
|
+
!lr.error &&
|
|
1915
|
+
(lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
|
|
1916
|
+
) {
|
|
1917
|
+
const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
|
|
1918
|
+
deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
|
|
1919
|
+
}
|
|
1920
|
+
}
|
|
1921
|
+
} else {
|
|
1922
|
+
onNotify(
|
|
1923
|
+
`⚠️ Re-executed branch merge ${reExecMergeResult.status}: ${reExecMergeResult.failureReason || "unknown"}`,
|
|
1924
|
+
"warning",
|
|
1925
|
+
);
|
|
1926
|
+
}
|
|
1927
|
+
|
|
1928
|
+
batchState.mergeResults.push(reExecMergeResult);
|
|
1929
|
+
}
|
|
1930
|
+
}
|
|
1931
|
+
|
|
1932
|
+
// ── 9. Persist state after reconciliation ────────────────────
|
|
1933
|
+
// Track state for persistence
|
|
1934
|
+
const wavePlan = runtimeWavePlan;
|
|
1935
|
+
persistedState.wavePlan = wavePlan;
|
|
1936
|
+
if (batchState.totalWaves < wavePlan.length) {
|
|
1937
|
+
batchState.totalWaves = wavePlan.length;
|
|
1938
|
+
}
|
|
1939
|
+
const allTaskOutcomes: LaneTaskOutcome[] = [];
|
|
1940
|
+
|
|
1941
|
+
// Initialize latestAllocatedLanes from persisted lane records so that
|
|
1942
|
+
// early persistence calls (before the first resumed wave) retain lane
|
|
1943
|
+
// records with repo attribution (laneNumber, laneId, branch, repoId).
|
|
1944
|
+
// Without this, the `resume-reconciliation` checkpoint would serialize
|
|
1945
|
+
// empty lanes[], losing all lane context until a new wave allocates.
|
|
1946
|
+
let latestAllocatedLanes: AllocatedLane[] = reconstructAllocatedLanes(
|
|
1947
|
+
persistedState.lanes,
|
|
1948
|
+
persistedState.tasks,
|
|
1949
|
+
);
|
|
1950
|
+
|
|
1951
|
+
// Track all repo roots encountered during execution (persisted + newly allocated).
|
|
1952
|
+
// Used by inter-wave reset and terminal cleanup to cover repos introduced
|
|
1953
|
+
// after resume starts (not present in persisted lanes).
|
|
1954
|
+
// Initialized from collectRepoRoots() helper for parity with other callers.
|
|
1955
|
+
const encounteredRepoRoots = new Set(collectRepoRoots(persistedState, repoRoot, workspaceConfig));
|
|
1956
|
+
|
|
1957
|
+
// Build outcomes from reconciled tasks
|
|
1958
|
+
for (const task of reconciledTasks) {
|
|
1959
|
+
const persistedTask = persistedState.tasks.find((t) => t.taskId === task.taskId);
|
|
1960
|
+
const reconnectStatus = reconnectFinalStatus.get(task.taskId);
|
|
1961
|
+
const reExecuteStatus = reExecuteFinalStatus.get(task.taskId);
|
|
1962
|
+
const status =
|
|
1963
|
+
task.action === "reconnect"
|
|
1964
|
+
? reconnectStatus || "running"
|
|
1965
|
+
: task.action === "re-execute"
|
|
1966
|
+
? reExecuteStatus || "pending"
|
|
1967
|
+
: task.liveStatus;
|
|
1968
|
+
const isTerminal =
|
|
1969
|
+
status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";
|
|
1970
|
+
allTaskOutcomes.push({
|
|
1971
|
+
taskId: task.taskId,
|
|
1972
|
+
status,
|
|
1973
|
+
startTime: persistedTask?.startedAt ?? null,
|
|
1974
|
+
endTime: isTerminal ? Date.now() : null,
|
|
1975
|
+
exitReason:
|
|
1976
|
+
task.action === "mark-complete"
|
|
1977
|
+
? ".DONE file found on resume"
|
|
1978
|
+
: task.action === "mark-failed"
|
|
1979
|
+
? "Session dead, no .DONE file, no worktree on resume"
|
|
1980
|
+
: task.action === "reconnect"
|
|
1981
|
+
? status === "succeeded"
|
|
1982
|
+
? "Reconnected task completed"
|
|
1983
|
+
: status === "failed"
|
|
1984
|
+
? "Reconnected task failed"
|
|
1985
|
+
: "Reconnected to alive session"
|
|
1986
|
+
: task.action === "re-execute"
|
|
1987
|
+
? status === "succeeded"
|
|
1988
|
+
? "Re-executed task completed"
|
|
1989
|
+
: status === "failed"
|
|
1990
|
+
? "Re-executed task failed"
|
|
1991
|
+
: "Re-executing in existing worktree"
|
|
1992
|
+
: (persistedTask?.exitReason ?? ""),
|
|
1993
|
+
sessionName: persistedTask?.sessionName ?? "",
|
|
1994
|
+
doneFileFound: status === "succeeded" ? true : task.doneFileFound,
|
|
1995
|
+
laneNumber: persistedTask?.laneNumber,
|
|
1996
|
+
// Carry forward partial progress from persisted state (TP-028)
|
|
1997
|
+
partialProgressCommits: persistedTask?.partialProgressCommits,
|
|
1998
|
+
partialProgressBranch: persistedTask?.partialProgressBranch,
|
|
1999
|
+
// v3: Carry forward exit diagnostic from persisted state (TP-030)
|
|
2000
|
+
exitDiagnostic: persistedTask?.exitDiagnostic,
|
|
2001
|
+
});
|
|
2002
|
+
}
|
|
2003
|
+
|
|
2004
|
+
// ── 9b. Seed blocked dependents from reconciled failures ─────
|
|
2005
|
+
// Under skip-dependents policy, failures discovered during reconciliation
|
|
2006
|
+
// (mark-failed) or resolved during reconnect/re-execute must propagate
|
|
2007
|
+
// to their transitive dependents BEFORE the wave loop begins.
|
|
2008
|
+
if (orchConfig.failure.on_task_failure === "skip-dependents" && failedTaskSet.size > 0) {
|
|
2009
|
+
const reconciledBlocked = computeTransitiveDependents(failedTaskSet, depGraph);
|
|
2010
|
+
for (const taskId of reconciledBlocked) {
|
|
2011
|
+
batchState.blockedTaskIds.add(taskId);
|
|
2012
|
+
}
|
|
2013
|
+
if (reconciledBlocked.size > 0) {
|
|
2014
|
+
execLog(
|
|
2015
|
+
"resume",
|
|
2016
|
+
batchState.batchId,
|
|
2017
|
+
`skip-dependents: ${reconciledBlocked.size} task(s) blocked from reconciled failures`,
|
|
2018
|
+
{
|
|
2019
|
+
blocked: [...reconciledBlocked].sort().join(","),
|
|
2020
|
+
sources: [...failedTaskSet].sort().join(","),
|
|
2021
|
+
},
|
|
2022
|
+
);
|
|
2023
|
+
}
|
|
2024
|
+
}
|
|
2025
|
+
|
|
2026
|
+
persistRuntimeState(
|
|
2027
|
+
"resume-reconciliation",
|
|
2028
|
+
batchState,
|
|
2029
|
+
wavePlan,
|
|
2030
|
+
latestAllocatedLanes,
|
|
2031
|
+
allTaskOutcomes,
|
|
2032
|
+
discovery ?? null,
|
|
2033
|
+
stateRoot,
|
|
2034
|
+
);
|
|
2035
|
+
|
|
2036
|
+
// ── 10. Continue wave execution ──────────────────────────────
|
|
2037
|
+
// We need to execute remaining waves starting from resumeWaveIndex.
|
|
2038
|
+
// For waves where some tasks are already done, we filter them out.
|
|
2039
|
+
|
|
2040
|
+
let preserveWorktreesForResume = false;
|
|
2041
|
+
const persistedStatusByTaskId = new Map(
|
|
2042
|
+
persistedState.tasks.map((task) => [task.taskId, task.status] as const),
|
|
2043
|
+
);
|
|
2044
|
+
|
|
2045
|
+
// TP-166: Use task-level wave metadata for correct display.
|
|
2046
|
+
const roundToTaskWave = batchState.roundToTaskWave;
|
|
2047
|
+
const taskLevelWaveCount = batchState.taskLevelWaveCount;
|
|
2048
|
+
|
|
2049
|
+
for (let waveIdx = resumePoint.resumeWaveIndex; waveIdx < wavePlan.length; waveIdx++) {
|
|
2050
|
+
// Check pause signal
|
|
2051
|
+
if (batchState.pauseSignal.paused) {
|
|
2052
|
+
batchState.phase = "paused";
|
|
2053
|
+
persistRuntimeState(
|
|
2054
|
+
"pause-before-wave",
|
|
2055
|
+
batchState,
|
|
2056
|
+
wavePlan,
|
|
2057
|
+
latestAllocatedLanes,
|
|
2058
|
+
allTaskOutcomes,
|
|
2059
|
+
discovery,
|
|
2060
|
+
stateRoot,
|
|
2061
|
+
);
|
|
2062
|
+
const { displayWave: pauseWave } = resolveDisplayWaveNumber(
|
|
2063
|
+
waveIdx,
|
|
2064
|
+
roundToTaskWave,
|
|
2065
|
+
taskLevelWaveCount,
|
|
2066
|
+
);
|
|
2067
|
+
onNotify(`⏸️ Batch paused before wave ${pauseWave}.`, "warning");
|
|
2068
|
+
break;
|
|
2069
|
+
}
|
|
2070
|
+
|
|
2071
|
+
batchState.currentWaveIndex = waveIdx;
|
|
2072
|
+
persistRuntimeState(
|
|
2073
|
+
"wave-index-change",
|
|
2074
|
+
batchState,
|
|
2075
|
+
wavePlan,
|
|
2076
|
+
latestAllocatedLanes,
|
|
2077
|
+
allTaskOutcomes,
|
|
2078
|
+
discovery,
|
|
2079
|
+
stateRoot,
|
|
2080
|
+
);
|
|
2081
|
+
|
|
2082
|
+
// Get wave tasks, filtering out completed/failed/skipped/blocked ones.
|
|
2083
|
+
// Persisted "skipped" tasks are terminal and must never be re-executed.
|
|
2084
|
+
let waveTasks = wavePlan[waveIdx].filter(
|
|
2085
|
+
(taskId) =>
|
|
2086
|
+
!completedTaskSet.has(taskId) &&
|
|
2087
|
+
!failedTaskSet.has(taskId) &&
|
|
2088
|
+
persistedStatusByTaskId.get(taskId) !== "skipped" &&
|
|
2089
|
+
!batchState.blockedTaskIds.has(taskId),
|
|
2090
|
+
);
|
|
2091
|
+
|
|
2092
|
+
// Also filter tasks where discovery doesn't have them as pending
|
|
2093
|
+
waveTasks = waveTasks.filter((taskId) => discovery.pending.has(taskId));
|
|
2094
|
+
|
|
2095
|
+
// Count only newly blocked tasks (not already persisted) to avoid double-counting.
|
|
2096
|
+
// persistedState.blockedTaskIds were already counted in persistedState.blockedTasks
|
|
2097
|
+
// which initialized batchState.blockedTasks.
|
|
2098
|
+
const blockedInWave = wavePlan[waveIdx].filter(
|
|
2099
|
+
(taskId) => batchState.blockedTaskIds.has(taskId) && !persistedBlockedTaskIds.has(taskId),
|
|
2100
|
+
);
|
|
2101
|
+
if (blockedInWave.length > 0) {
|
|
2102
|
+
batchState.blockedTasks += blockedInWave.length;
|
|
2103
|
+
}
|
|
2104
|
+
|
|
2105
|
+
if (waveTasks.length === 0) {
|
|
2106
|
+
// TP-037 Bug #102: Check if this wave needs merge retry.
|
|
2107
|
+
// All tasks are terminal but the merge may have failed/been interrupted.
|
|
2108
|
+
if (resumePoint.mergeRetryWaveIndexes.includes(waveIdx)) {
|
|
2109
|
+
execLog(
|
|
2110
|
+
"resume",
|
|
2111
|
+
batchState.batchId,
|
|
2112
|
+
`wave ${waveIdx + 1}: all tasks done but merge needs retry`,
|
|
2113
|
+
);
|
|
2114
|
+
onNotify(
|
|
2115
|
+
`🔀 Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave}: retrying merge (tasks already complete, merge was missing/failed)`,
|
|
2116
|
+
"info",
|
|
2117
|
+
);
|
|
2118
|
+
|
|
2119
|
+
// Reconstruct lanes for this wave from persisted state
|
|
2120
|
+
const waveTaskIds = new Set(wavePlan[waveIdx]);
|
|
2121
|
+
const waveLaneRecords = persistedState.lanes.filter((lane) =>
|
|
2122
|
+
lane.taskIds.some((tid) => waveTaskIds.has(tid)),
|
|
2123
|
+
);
|
|
2124
|
+
const mergeRetryLanes = reconstructAllocatedLanes(waveLaneRecords, persistedState.tasks);
|
|
2125
|
+
|
|
2126
|
+
// Build synthetic WaveExecutionResult from persisted terminal task states.
|
|
2127
|
+
// Crucial for orch_force_merge: tasks intentionally marked "skipped" must
|
|
2128
|
+
// remain skipped here (not failed), otherwise mixed-outcome detection would
|
|
2129
|
+
// trigger again and block the forced merge recovery path.
|
|
2130
|
+
const succeededTaskIds = wavePlan[waveIdx].filter((taskId) => completedTaskSet.has(taskId));
|
|
2131
|
+
const skippedTaskIds = wavePlan[waveIdx].filter(
|
|
2132
|
+
(taskId) => persistedStatusByTaskId.get(taskId) === "skipped",
|
|
2133
|
+
);
|
|
2134
|
+
const failedTaskIds = wavePlan[waveIdx].filter((taskId) => {
|
|
2135
|
+
const status = persistedStatusByTaskId.get(taskId);
|
|
2136
|
+
return status === "failed" || status === "stalled";
|
|
2137
|
+
});
|
|
2138
|
+
|
|
2139
|
+
const syntheticLaneResults: LaneExecutionResult[] = mergeRetryLanes.map((lane) => {
|
|
2140
|
+
const laneTasks = lane.tasks.map((t) => {
|
|
2141
|
+
const persistedStatus = persistedStatusByTaskId.get(t.taskId);
|
|
2142
|
+
let status: LaneTaskStatus;
|
|
2143
|
+
if (completedTaskSet.has(t.taskId) || persistedStatus === "succeeded") {
|
|
2144
|
+
status = "succeeded";
|
|
2145
|
+
} else if (persistedStatus === "skipped") {
|
|
2146
|
+
status = "skipped";
|
|
2147
|
+
} else if (persistedStatus === "failed") {
|
|
2148
|
+
status = "failed";
|
|
2149
|
+
} else if (persistedStatus === "stalled") {
|
|
2150
|
+
status = "stalled";
|
|
2151
|
+
} else {
|
|
2152
|
+
status = "failed";
|
|
2153
|
+
}
|
|
2154
|
+
|
|
2155
|
+
return {
|
|
2156
|
+
taskId: t.taskId,
|
|
2157
|
+
status,
|
|
2158
|
+
startTime: Date.now(),
|
|
2159
|
+
endTime: Date.now(),
|
|
2160
|
+
exitReason:
|
|
2161
|
+
status === "succeeded"
|
|
2162
|
+
? "Task completed (merge retry)"
|
|
2163
|
+
: status === "skipped"
|
|
2164
|
+
? "Task skipped (merge retry)"
|
|
2165
|
+
: status === "stalled"
|
|
2166
|
+
? "Task stalled (merge retry)"
|
|
2167
|
+
: "Task failed (merge retry)",
|
|
2168
|
+
sessionName: lane.laneSessionId,
|
|
2169
|
+
doneFileFound: status === "succeeded",
|
|
2170
|
+
laneNumber: lane.laneNumber,
|
|
2171
|
+
};
|
|
2172
|
+
});
|
|
2173
|
+
|
|
2174
|
+
const laneHasHardFailure = laneTasks.some(
|
|
2175
|
+
(t) => t.status === "failed" || t.status === "stalled",
|
|
2176
|
+
);
|
|
2177
|
+
const laneHasSucceeded = laneTasks.some((t) => t.status === "succeeded");
|
|
2178
|
+
const overallStatus = laneHasHardFailure
|
|
2179
|
+
? laneHasSucceeded
|
|
2180
|
+
? "partial"
|
|
2181
|
+
: "failed"
|
|
2182
|
+
: "succeeded";
|
|
2183
|
+
|
|
2184
|
+
return {
|
|
2185
|
+
laneNumber: lane.laneNumber,
|
|
2186
|
+
laneId: lane.laneId,
|
|
2187
|
+
tasks: laneTasks,
|
|
2188
|
+
overallStatus,
|
|
2189
|
+
startTime: Date.now(),
|
|
2190
|
+
endTime: Date.now(),
|
|
2191
|
+
};
|
|
2192
|
+
});
|
|
2193
|
+
|
|
2194
|
+
const syntheticWaveResult: WaveExecutionResult = {
|
|
2195
|
+
waveIndex: waveIdx + 1,
|
|
2196
|
+
startedAt: Date.now(),
|
|
2197
|
+
endedAt: Date.now(),
|
|
2198
|
+
laneResults: syntheticLaneResults,
|
|
2199
|
+
policyApplied: orchConfig.failure.on_task_failure,
|
|
2200
|
+
stoppedEarly: false,
|
|
2201
|
+
failedTaskIds,
|
|
2202
|
+
skippedTaskIds,
|
|
2203
|
+
succeededTaskIds,
|
|
2204
|
+
blockedTaskIds: [],
|
|
2205
|
+
laneCount: mergeRetryLanes.length,
|
|
2206
|
+
overallStatus: "succeeded",
|
|
2207
|
+
finalMonitorState: null,
|
|
2208
|
+
allocatedLanes: mergeRetryLanes,
|
|
2209
|
+
};
|
|
2210
|
+
|
|
2211
|
+
batchState.phase = "merging";
|
|
2212
|
+
persistRuntimeState(
|
|
2213
|
+
"merge-retry-start",
|
|
2214
|
+
batchState,
|
|
2215
|
+
wavePlan,
|
|
2216
|
+
latestAllocatedLanes,
|
|
2217
|
+
allTaskOutcomes,
|
|
2218
|
+
discovery,
|
|
2219
|
+
stateRoot,
|
|
2220
|
+
);
|
|
2221
|
+
|
|
2222
|
+
const mergeRetryResult = await mergeWaveByRepo(
|
|
2223
|
+
mergeRetryLanes,
|
|
2224
|
+
syntheticWaveResult,
|
|
2225
|
+
waveIdx + 1,
|
|
2226
|
+
orchConfig,
|
|
2227
|
+
repoRoot,
|
|
2228
|
+
batchState.batchId,
|
|
2229
|
+
batchState.orchBranch,
|
|
2230
|
+
workspaceConfig,
|
|
2231
|
+
stateRoot,
|
|
2232
|
+
agentRoot,
|
|
2233
|
+
runnerConfig.testing_commands,
|
|
2234
|
+
undefined, // healthMonitor
|
|
2235
|
+
undefined, // forceMixedOutcome
|
|
2236
|
+
resumeBackend,
|
|
2237
|
+
);
|
|
2238
|
+
batchState.mergeResults.push(mergeRetryResult);
|
|
2239
|
+
|
|
2240
|
+
if (mergeRetryResult.status === "succeeded") {
|
|
2241
|
+
onNotify(
|
|
2242
|
+
`✅ Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave} merge retry succeeded`,
|
|
2243
|
+
"info",
|
|
2244
|
+
);
|
|
2245
|
+
// Clean up merged branches
|
|
2246
|
+
for (const lr of mergeRetryResult.laneResults) {
|
|
2247
|
+
if (
|
|
2248
|
+
!lr.error &&
|
|
2249
|
+
(lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
|
|
2250
|
+
) {
|
|
2251
|
+
const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
|
|
2252
|
+
deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
|
|
2253
|
+
}
|
|
2254
|
+
}
|
|
2255
|
+
} else {
|
|
2256
|
+
onNotify(
|
|
2257
|
+
`⚠️ Wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave} merge retry ${mergeRetryResult.status}: ${mergeRetryResult.failureReason || "unknown"}`,
|
|
2258
|
+
"warning",
|
|
2259
|
+
);
|
|
2260
|
+
// Apply merge failure policy (same as normal wave merge failure)
|
|
2261
|
+
const policyResult = computeMergeFailurePolicy(mergeRetryResult, waveIdx, orchConfig);
|
|
2262
|
+
execLog(
|
|
2263
|
+
"batch",
|
|
2264
|
+
batchState.batchId,
|
|
2265
|
+
`merge retry failure — applying ${policyResult.policy} policy`,
|
|
2266
|
+
policyResult.logDetails,
|
|
2267
|
+
);
|
|
2268
|
+
batchState.phase = policyResult.targetPhase;
|
|
2269
|
+
batchState.errors.push(policyResult.errorMessage);
|
|
2270
|
+
persistRuntimeState(
|
|
2271
|
+
policyResult.persistTrigger,
|
|
2272
|
+
batchState,
|
|
2273
|
+
wavePlan,
|
|
2274
|
+
latestAllocatedLanes,
|
|
2275
|
+
allTaskOutcomes,
|
|
2276
|
+
discovery,
|
|
2277
|
+
stateRoot,
|
|
2278
|
+
);
|
|
2279
|
+
onNotify(policyResult.notifyMessage, policyResult.notifyLevel);
|
|
2280
|
+
preserveWorktreesForResume = true;
|
|
2281
|
+
break;
|
|
2282
|
+
}
|
|
2283
|
+
|
|
2284
|
+
batchState.phase = "executing";
|
|
2285
|
+
persistRuntimeState(
|
|
2286
|
+
"merge-retry-complete",
|
|
2287
|
+
batchState,
|
|
2288
|
+
wavePlan,
|
|
2289
|
+
latestAllocatedLanes,
|
|
2290
|
+
allTaskOutcomes,
|
|
2291
|
+
discovery,
|
|
2292
|
+
stateRoot,
|
|
2293
|
+
);
|
|
2294
|
+
} else {
|
|
2295
|
+
execLog(
|
|
2296
|
+
"resume",
|
|
2297
|
+
batchState.batchId,
|
|
2298
|
+
`wave ${waveIdx + 1}: no tasks to execute (all completed/blocked)`,
|
|
2299
|
+
);
|
|
2300
|
+
}
|
|
2301
|
+
continue;
|
|
2302
|
+
}
|
|
2303
|
+
|
|
2304
|
+
{
|
|
2305
|
+
const { displayWave, displayTotal } = resolveDisplayWaveNumber(
|
|
2306
|
+
waveIdx,
|
|
2307
|
+
roundToTaskWave,
|
|
2308
|
+
taskLevelWaveCount,
|
|
2309
|
+
);
|
|
2310
|
+
onNotify(
|
|
2311
|
+
ORCH_MESSAGES.orchWaveStart(
|
|
2312
|
+
displayWave,
|
|
2313
|
+
displayTotal,
|
|
2314
|
+
waveTasks.length,
|
|
2315
|
+
Math.min(waveTasks.length, orchConfig.orchestrator.max_lanes),
|
|
2316
|
+
),
|
|
2317
|
+
"info",
|
|
2318
|
+
);
|
|
2319
|
+
}
|
|
2320
|
+
|
|
2321
|
+
// Monitor callback handed to executeWave() for the resumed wave.
// It passes each monitor snapshot to syncTaskOutcomesFromMonitor() together
// with allTaskOutcomes; when that call reports a change, a "task-transition"
// checkpoint is persisted. The snapshot is then always forwarded to the
// optional onMonitorUpdate listener, whether or not anything changed.
const handleResumeMonitorUpdate: MonitorUpdateCallback = (monitorState) => {
  // Checkpoint only on a reported change so unchanged polls do not rewrite state.
  if (syncTaskOutcomesFromMonitor(monitorState, allTaskOutcomes)) {
    persistRuntimeState(
      "task-transition",
      batchState,
      wavePlan,
      latestAllocatedLanes,
      allTaskOutcomes,
      discovery,
      stateRoot,
    );
  }

  // The listener is optional; `?.` makes this a no-op when none was supplied.
  onMonitorUpdate?.(monitorState);
};
|
|
2336
|
+
|
|
2337
|
+
// Execute the wave
|
|
2338
|
+
const waveResult = await executeWave(
|
|
2339
|
+
waveTasks,
|
|
2340
|
+
waveIdx + 1,
|
|
2341
|
+
discovery.pending,
|
|
2342
|
+
orchConfig,
|
|
2343
|
+
repoRoot,
|
|
2344
|
+
batchState.batchId,
|
|
2345
|
+
batchState.pauseSignal,
|
|
2346
|
+
depGraph,
|
|
2347
|
+
batchState.orchBranch,
|
|
2348
|
+
handleResumeMonitorUpdate,
|
|
2349
|
+
(lanes) => {
|
|
2350
|
+
latestAllocatedLanes = lanes;
|
|
2351
|
+
batchState.currentLanes = lanes;
|
|
2352
|
+
// Track repos from newly allocated lanes for cleanup coverage
|
|
2353
|
+
for (const lane of lanes) {
|
|
2354
|
+
encounteredRepoRoots.add(resolveRepoRoot(lane.repoId, repoRoot, workspaceConfig));
|
|
2355
|
+
}
|
|
2356
|
+
if (seedPendingOutcomesForAllocatedLanes(lanes, allTaskOutcomes)) {
|
|
2357
|
+
persistRuntimeState(
|
|
2358
|
+
"wave-lanes-allocated",
|
|
2359
|
+
batchState,
|
|
2360
|
+
wavePlan,
|
|
2361
|
+
latestAllocatedLanes,
|
|
2362
|
+
allTaskOutcomes,
|
|
2363
|
+
discovery,
|
|
2364
|
+
stateRoot,
|
|
2365
|
+
);
|
|
2366
|
+
}
|
|
2367
|
+
},
|
|
2368
|
+
workspaceConfig,
|
|
2369
|
+
resumeBackend,
|
|
2370
|
+
emitAlert,
|
|
2371
|
+
supervisorAutonomy,
|
|
2372
|
+
runnerConfig.reviewer,
|
|
2373
|
+
runnerConfig.worker,
|
|
2374
|
+
runnerConfig.workerExcludeExtensions ?? [],
|
|
2375
|
+
onLaneTerminated ?? undefined,
|
|
2376
|
+
onLaneRespawned ?? undefined,
|
|
2377
|
+
);
|
|
2378
|
+
|
|
2379
|
+
batchState.waveResults.push(waveResult);
|
|
2380
|
+
batchState.currentLanes = [];
|
|
2381
|
+
|
|
2382
|
+
// Accumulate task outcomes
|
|
2383
|
+
latestAllocatedLanes = waveResult.allocatedLanes;
|
|
2384
|
+
for (const lr of waveResult.laneResults) {
|
|
2385
|
+
for (const taskOutcome of lr.tasks) {
|
|
2386
|
+
upsertTaskOutcome(allTaskOutcomes, taskOutcome);
|
|
2387
|
+
}
|
|
2388
|
+
}
|
|
2389
|
+
|
|
2390
|
+
// Accumulate results
|
|
2391
|
+
batchState.succeededTasks += waveResult.succeededTaskIds.length;
|
|
2392
|
+
batchState.failedTasks += waveResult.failedTaskIds.length;
|
|
2393
|
+
batchState.skippedTasks += waveResult.skippedTaskIds.length;
|
|
2394
|
+
|
|
2395
|
+
for (const taskId of waveResult.succeededTaskIds) {
|
|
2396
|
+
completedTaskSet.add(taskId);
|
|
2397
|
+
failedTaskSet.delete(taskId);
|
|
2398
|
+
reconnectTaskSet.delete(taskId);
|
|
2399
|
+
}
|
|
2400
|
+
for (const taskId of waveResult.failedTaskIds) {
|
|
2401
|
+
failedTaskSet.add(taskId);
|
|
2402
|
+
completedTaskSet.delete(taskId);
|
|
2403
|
+
reconnectTaskSet.delete(taskId);
|
|
2404
|
+
}
|
|
2405
|
+
|
|
2406
|
+
for (const blocked of waveResult.blockedTaskIds) {
|
|
2407
|
+
batchState.blockedTaskIds.add(blocked);
|
|
2408
|
+
}
|
|
2409
|
+
|
|
2410
|
+
// ── TP-076: Emit supervisor alerts for task failures ────
|
|
2411
|
+
for (const taskId of waveResult.failedTaskIds) {
|
|
2412
|
+
const outcome = allTaskOutcomes.find((o) => o.taskId === taskId);
|
|
2413
|
+
const laneForTask = latestAllocatedLanes.find((l) => l.tasks.some((t) => t.taskId === taskId));
|
|
2414
|
+
// TP-195: corrected the lookup to the real source of segment
|
|
2415
|
+
// metadata. `batchState.tasks` does not exist on
|
|
2416
|
+
// `OrchBatchRuntimeState` (it's on `PersistedBatchState`); the
|
|
2417
|
+
// previous read would have thrown `undefined.find is not a
|
|
2418
|
+
// function` if hit at runtime. The allocated lane carries the
|
|
2419
|
+
// `ParsedTask` payload via `AllocatedTask.task`, which has
|
|
2420
|
+
// `segmentIds`/`activeSegmentId` already populated by discovery.
|
|
2421
|
+
const taskRecord = laneForTask?.tasks.find((t) => t.taskId === taskId)?.task;
|
|
2422
|
+
const exitReason = outcome?.exitReason || "unknown";
|
|
2423
|
+
const hasPartialProgress = (outcome?.partialProgressCommits ?? 0) > 0;
|
|
2424
|
+
const segmentFrontier = buildSupervisorSegmentFrontierSnapshot(
|
|
2425
|
+
taskId,
|
|
2426
|
+
taskRecord?.segmentIds,
|
|
2427
|
+
taskRecord?.activeSegmentId,
|
|
2428
|
+
batchState.segments,
|
|
2429
|
+
outcome?.segmentId,
|
|
2430
|
+
);
|
|
2431
|
+
const segmentId =
|
|
2432
|
+
outcome?.segmentId ??
|
|
2433
|
+
taskRecord?.activeSegmentId ??
|
|
2434
|
+
segmentFrontier?.activeSegmentId ??
|
|
2435
|
+
undefined;
|
|
2436
|
+
const repoId = segmentId
|
|
2437
|
+
? (segmentFrontier?.segments.find((segment) => segment.segmentId === segmentId)?.repoId ??
|
|
2438
|
+
laneForTask?.repoId)
|
|
2439
|
+
: laneForTask?.repoId;
|
|
2440
|
+
const segmentSummary = segmentId
|
|
2441
|
+
? ` Segment: ${segmentId}${repoId ? ` (repo: ${repoId})` : ""}\n`
|
|
2442
|
+
: "";
|
|
2443
|
+
const frontierSummary = segmentFrontier
|
|
2444
|
+
? ` Segment frontier: ${segmentFrontier.terminalSegments}/${segmentFrontier.totalSegments} terminal\n`
|
|
2445
|
+
: "";
|
|
2446
|
+
// TP-190 (#561): Mirror engine.ts emission — propagate the structured
|
|
2447
|
+
// exit category so /orch-resume task-failure alerts route through the
|
|
2448
|
+
// same supervisor playbook branches as /orch. Shared helper enforces
|
|
2449
|
+
// payload parity between the two emission sites.
|
|
2450
|
+
const { exitCategory, summaryLine: spawnFailureLine } = buildSpawnFailureAlertExtras(outcome);
|
|
2451
|
+
emitAlert({
|
|
2452
|
+
category: "task-failure",
|
|
2453
|
+
summary:
|
|
2454
|
+
`⚠️ Task failure: ${taskId}\n` +
|
|
2455
|
+
` Exit reason: ${exitReason}\n` +
|
|
2456
|
+
spawnFailureLine +
|
|
2457
|
+
segmentSummary +
|
|
2458
|
+
frontierSummary +
|
|
2459
|
+
` Lane: ${laneForTask?.laneId ?? "unknown"} (lane ${laneForTask?.laneNumber ?? "?"})\n` +
|
|
2460
|
+
` Partial progress preserved: ${hasPartialProgress ? "yes" : "no"}\n` +
|
|
2461
|
+
` Batch: wave ${resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave}/${taskLevelWaveCount ?? batchState.totalWaves}, ` +
|
|
2462
|
+
`${batchState.succeededTasks} succeeded, ${batchState.failedTasks} failed\n\n` +
|
|
2463
|
+
`Available actions:\n` +
|
|
2464
|
+
` - orch_status() to inspect current state\n` +
|
|
2465
|
+
` - orch_resume(force=true) to retry\n` +
|
|
2466
|
+
` - Read STATUS.md and lane logs for diagnosis`,
|
|
2467
|
+
context: {
|
|
2468
|
+
taskId,
|
|
2469
|
+
segmentId,
|
|
2470
|
+
repoId,
|
|
2471
|
+
segmentFrontier,
|
|
2472
|
+
laneId: laneForTask?.laneId,
|
|
2473
|
+
laneNumber: laneForTask?.laneNumber,
|
|
2474
|
+
waveIndex: waveIdx,
|
|
2475
|
+
exitReason,
|
|
2476
|
+
exitCategory,
|
|
2477
|
+
partialProgress: hasPartialProgress,
|
|
2478
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
2479
|
+
},
|
|
2480
|
+
});
|
|
2481
|
+
}
|
|
2482
|
+
|
|
2483
|
+
persistRuntimeState(
|
|
2484
|
+
"wave-execution-complete",
|
|
2485
|
+
batchState,
|
|
2486
|
+
wavePlan,
|
|
2487
|
+
latestAllocatedLanes,
|
|
2488
|
+
allTaskOutcomes,
|
|
2489
|
+
discovery,
|
|
2490
|
+
stateRoot,
|
|
2491
|
+
);
|
|
2492
|
+
|
|
2493
|
+
const elapsedSec = Math.round((waveResult.endedAt - waveResult.startedAt) / 1000);
|
|
2494
|
+
{
|
|
2495
|
+
const { displayWave: completeDisplayWave } = resolveDisplayWaveNumber(
|
|
2496
|
+
waveIdx,
|
|
2497
|
+
roundToTaskWave,
|
|
2498
|
+
taskLevelWaveCount,
|
|
2499
|
+
);
|
|
2500
|
+
onNotify(
|
|
2501
|
+
ORCH_MESSAGES.orchWaveComplete(
|
|
2502
|
+
completeDisplayWave,
|
|
2503
|
+
waveResult.succeededTaskIds.length,
|
|
2504
|
+
waveResult.failedTaskIds.length,
|
|
2505
|
+
waveResult.skippedTaskIds.length,
|
|
2506
|
+
elapsedSec,
|
|
2507
|
+
),
|
|
2508
|
+
waveResult.failedTaskIds.length > 0 ? "warning" : "info",
|
|
2509
|
+
);
|
|
2510
|
+
}
|
|
2511
|
+
|
|
2512
|
+
// Check failure policy
|
|
2513
|
+
if (waveResult.stoppedEarly) {
|
|
2514
|
+
if (waveResult.policyApplied === "stop-all") {
|
|
2515
|
+
batchState.phase = "stopped";
|
|
2516
|
+
persistRuntimeState(
|
|
2517
|
+
"stop-all",
|
|
2518
|
+
batchState,
|
|
2519
|
+
wavePlan,
|
|
2520
|
+
latestAllocatedLanes,
|
|
2521
|
+
allTaskOutcomes,
|
|
2522
|
+
discovery,
|
|
2523
|
+
stateRoot,
|
|
2524
|
+
);
|
|
2525
|
+
onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-all"), "error");
|
|
2526
|
+
break;
|
|
2527
|
+
}
|
|
2528
|
+
if (waveResult.policyApplied === "stop-wave") {
|
|
2529
|
+
batchState.phase = "stopped";
|
|
2530
|
+
persistRuntimeState(
|
|
2531
|
+
"stop-wave",
|
|
2532
|
+
batchState,
|
|
2533
|
+
wavePlan,
|
|
2534
|
+
latestAllocatedLanes,
|
|
2535
|
+
allTaskOutcomes,
|
|
2536
|
+
discovery,
|
|
2537
|
+
stateRoot,
|
|
2538
|
+
);
|
|
2539
|
+
onNotify(ORCH_MESSAGES.orchBatchStopped(batchState.batchId, "stop-wave"), "error");
|
|
2540
|
+
break;
|
|
2541
|
+
}
|
|
2542
|
+
}
|
|
2543
|
+
|
|
2544
|
+
// Merge handling (same as executeOrchBatch)
|
|
2545
|
+
let mergeResult: MergeWaveResult | null = null;
|
|
2546
|
+
|
|
2547
|
+
const laneOutcomeByNumber = new Map<number, LaneExecutionResult>();
|
|
2548
|
+
for (const lr of waveResult.laneResults) {
|
|
2549
|
+
laneOutcomeByNumber.set(lr.laneNumber, lr);
|
|
2550
|
+
}
|
|
2551
|
+
const mixedOutcomeLanes = waveResult.laneResults.filter((lr) => {
|
|
2552
|
+
const hasSucceeded = lr.tasks.some((t) => t.status === "succeeded");
|
|
2553
|
+
const hasHardFailure = lr.tasks.some((t) => t.status === "failed" || t.status === "stalled");
|
|
2554
|
+
return hasSucceeded && hasHardFailure;
|
|
2555
|
+
});
|
|
2556
|
+
|
|
2557
|
+
if (waveResult.succeededTaskIds.length > 0) {
|
|
2558
|
+
const mergeableLaneCount = waveResult.allocatedLanes.filter((lane) => {
|
|
2559
|
+
const outcome = laneOutcomeByNumber.get(lane.laneNumber);
|
|
2560
|
+
if (!outcome) return false;
|
|
2561
|
+
const hasSucceeded = outcome.tasks.some((t) => t.status === "succeeded");
|
|
2562
|
+
const hasHardFailure = outcome.tasks.some(
|
|
2563
|
+
(t) => t.status === "failed" || t.status === "stalled",
|
|
2564
|
+
);
|
|
2565
|
+
return hasSucceeded && !hasHardFailure;
|
|
2566
|
+
}).length;
|
|
2567
|
+
|
|
2568
|
+
if (mergeableLaneCount > 0) {
|
|
2569
|
+
batchState.phase = "merging";
|
|
2570
|
+
persistRuntimeState(
|
|
2571
|
+
"merge-start",
|
|
2572
|
+
batchState,
|
|
2573
|
+
wavePlan,
|
|
2574
|
+
latestAllocatedLanes,
|
|
2575
|
+
allTaskOutcomes,
|
|
2576
|
+
discovery,
|
|
2577
|
+
stateRoot,
|
|
2578
|
+
);
|
|
2579
|
+
onNotify(
|
|
2580
|
+
ORCH_MESSAGES.orchMergeStart(
|
|
2581
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2582
|
+
mergeableLaneCount,
|
|
2583
|
+
),
|
|
2584
|
+
"info",
|
|
2585
|
+
);
|
|
2586
|
+
|
|
2587
|
+
mergeResult = await mergeWaveByRepo(
|
|
2588
|
+
waveResult.allocatedLanes,
|
|
2589
|
+
waveResult,
|
|
2590
|
+
waveIdx + 1,
|
|
2591
|
+
orchConfig,
|
|
2592
|
+
repoRoot,
|
|
2593
|
+
batchState.batchId,
|
|
2594
|
+
batchState.orchBranch,
|
|
2595
|
+
workspaceConfig,
|
|
2596
|
+
stateRoot,
|
|
2597
|
+
agentRoot,
|
|
2598
|
+
runnerConfig.testing_commands,
|
|
2599
|
+
undefined, // healthMonitor
|
|
2600
|
+
undefined, // forceMixedOutcome
|
|
2601
|
+
resumeBackend,
|
|
2602
|
+
);
|
|
2603
|
+
batchState.mergeResults.push(mergeResult);
|
|
2604
|
+
|
|
2605
|
+
// Emit per-lane merge notifications
|
|
2606
|
+
for (const lr of mergeResult.laneResults) {
|
|
2607
|
+
const durationSec = Math.round(lr.durationMs / 1000);
|
|
2608
|
+
// TP-032 R006-3: Check lr.error first — verification_new_failure lanes
|
|
2609
|
+
// have error set even though lr.result.status may be SUCCESS/CONFLICT_RESOLVED.
|
|
2610
|
+
if (lr.error) {
|
|
2611
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.error), "error");
|
|
2612
|
+
} else if (lr.result?.status === "SUCCESS") {
|
|
2613
|
+
onNotify(
|
|
2614
|
+
ORCH_MESSAGES.orchMergeLaneSuccess(lr.laneNumber, lr.result.merge_commit, durationSec),
|
|
2615
|
+
"info",
|
|
2616
|
+
);
|
|
2617
|
+
} else if (lr.result?.status === "CONFLICT_RESOLVED") {
|
|
2618
|
+
onNotify(
|
|
2619
|
+
ORCH_MESSAGES.orchMergeLaneConflictResolved(
|
|
2620
|
+
lr.laneNumber,
|
|
2621
|
+
lr.result.conflicts.length,
|
|
2622
|
+
durationSec,
|
|
2623
|
+
),
|
|
2624
|
+
"info",
|
|
2625
|
+
);
|
|
2626
|
+
} else if (
|
|
2627
|
+
lr.result?.status === "CONFLICT_UNRESOLVED" ||
|
|
2628
|
+
lr.result?.status === "BUILD_FAILURE"
|
|
2629
|
+
) {
|
|
2630
|
+
onNotify(ORCH_MESSAGES.orchMergeLaneFailed(lr.laneNumber, lr.result.status), "error");
|
|
2631
|
+
}
|
|
2632
|
+
}
|
|
2633
|
+
|
|
2634
|
+
if (mixedOutcomeLanes.length > 0) {
|
|
2635
|
+
const mixedIds = mixedOutcomeLanes.map((l) => `lane-${l.laneNumber}`).join(", ");
|
|
2636
|
+
const failureReason =
|
|
2637
|
+
`Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
|
|
2638
|
+
`Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`;
|
|
2639
|
+
mergeResult = {
|
|
2640
|
+
...mergeResult,
|
|
2641
|
+
status: "partial",
|
|
2642
|
+
failedLane: mixedOutcomeLanes[0].laneNumber,
|
|
2643
|
+
failureReason,
|
|
2644
|
+
};
|
|
2645
|
+
// Update the already-pushed reference so persisted state reflects "partial"
|
|
2646
|
+
batchState.mergeResults[batchState.mergeResults.length - 1] = mergeResult;
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
// TP-032 R006-3: Exclude verification_new_failure lanes from success count
|
|
2650
|
+
const mergedCount = mergeResult.laneResults.filter(
|
|
2651
|
+
(r) =>
|
|
2652
|
+
!r.error && (r.result?.status === "SUCCESS" || r.result?.status === "CONFLICT_RESOLVED"),
|
|
2653
|
+
).length;
|
|
2654
|
+
const mergeTotalSec = Math.round(mergeResult.totalDurationMs / 1000);
|
|
2655
|
+
|
|
2656
|
+
if (mergeResult.status === "succeeded") {
|
|
2657
|
+
onNotify(
|
|
2658
|
+
ORCH_MESSAGES.orchMergeComplete(
|
|
2659
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2660
|
+
mergedCount,
|
|
2661
|
+
mergeTotalSec,
|
|
2662
|
+
),
|
|
2663
|
+
"info",
|
|
2664
|
+
);
|
|
2665
|
+
} else {
|
|
2666
|
+
onNotify(
|
|
2667
|
+
ORCH_MESSAGES.orchMergeFailed(
|
|
2668
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2669
|
+
mergeResult.failedLane ?? 0,
|
|
2670
|
+
mergeResult.failureReason || "unknown",
|
|
2671
|
+
),
|
|
2672
|
+
"error",
|
|
2673
|
+
);
|
|
2674
|
+
|
|
2675
|
+
// Emit repo-divergence summary when partial is caused by cross-repo outcome differences
|
|
2676
|
+
if (mergeResult.status === "partial") {
|
|
2677
|
+
const repoSummary = formatRepoMergeSummary(mergeResult);
|
|
2678
|
+
if (repoSummary) {
|
|
2679
|
+
onNotify(repoSummary, "warning");
|
|
2680
|
+
}
|
|
2681
|
+
}
|
|
2682
|
+
}
|
|
2683
|
+
|
|
2684
|
+
batchState.phase = "executing";
|
|
2685
|
+
persistRuntimeState(
|
|
2686
|
+
"merge-complete",
|
|
2687
|
+
batchState,
|
|
2688
|
+
wavePlan,
|
|
2689
|
+
latestAllocatedLanes,
|
|
2690
|
+
allTaskOutcomes,
|
|
2691
|
+
discovery,
|
|
2692
|
+
stateRoot,
|
|
2693
|
+
);
|
|
2694
|
+
} else if (mixedOutcomeLanes.length > 0) {
|
|
2695
|
+
const mixedIds = mixedOutcomeLanes.map((l) => `lane-${l.laneNumber}`).join(", ");
|
|
2696
|
+
mergeResult = {
|
|
2697
|
+
waveIndex: waveIdx + 1,
|
|
2698
|
+
status: "partial",
|
|
2699
|
+
laneResults: [],
|
|
2700
|
+
failedLane: mixedOutcomeLanes[0].laneNumber,
|
|
2701
|
+
failureReason:
|
|
2702
|
+
`Lane(s) ${mixedIds} contain both succeeded and failed tasks. ` +
|
|
2703
|
+
`Automatic partial-branch merge is disabled to avoid dropping succeeded commits.`,
|
|
2704
|
+
totalDurationMs: 0,
|
|
2705
|
+
};
|
|
2706
|
+
// Keep mergeResults in sync even when no mergeable lane exists.
|
|
2707
|
+
// Downstream retry/update paths assume the current wave has an entry.
|
|
2708
|
+
batchState.mergeResults.push(mergeResult);
|
|
2709
|
+
onNotify(
|
|
2710
|
+
ORCH_MESSAGES.orchMergeFailed(
|
|
2711
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2712
|
+
mergeResult.failedLane,
|
|
2713
|
+
mergeResult.failureReason || "unknown",
|
|
2714
|
+
),
|
|
2715
|
+
"error",
|
|
2716
|
+
);
|
|
2717
|
+
} else {
|
|
2718
|
+
onNotify(
|
|
2719
|
+
ORCH_MESSAGES.orchMergeSkipped(
|
|
2720
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2721
|
+
),
|
|
2722
|
+
"info",
|
|
2723
|
+
);
|
|
2724
|
+
}
|
|
2725
|
+
} else {
|
|
2726
|
+
onNotify(
|
|
2727
|
+
ORCH_MESSAGES.orchMergeSkipped(
|
|
2728
|
+
resolveDisplayWaveNumber(waveIdx, roundToTaskWave, taskLevelWaveCount).displayWave,
|
|
2729
|
+
),
|
|
2730
|
+
"info",
|
|
2731
|
+
);
|
|
2732
|
+
}
|
|
2733
|
+
|
|
2734
|
+
// ── TP-033: Safe-stop on rollback failure ─────────────────
|
|
2735
|
+
// When a verification rollback failed, force paused regardless of
|
|
2736
|
+
// on_merge_failure policy. The merge worktree and temp branch are
|
|
2737
|
+
// preserved for manual recovery using commands in the transaction record.
|
|
2738
|
+
if (mergeResult?.rollbackFailed) {
|
|
2739
|
+
// TP-033 R004-2: Include persistence error warning when transaction
|
|
2740
|
+
// record files may be missing, so operator knows to inspect manually
|
|
2741
|
+
const hasPersistErrors =
|
|
2742
|
+
mergeResult.persistenceErrors && mergeResult.persistenceErrors.length > 0;
|
|
2743
|
+
const persistWarning = hasPersistErrors
|
|
2744
|
+
? ` WARNING: ${mergeResult.persistenceErrors!.length} transaction record(s) failed to persist — recovery file(s) may be missing.`
|
|
2745
|
+
: "";
|
|
2746
|
+
|
|
2747
|
+
execLog(
|
|
2748
|
+
"batch",
|
|
2749
|
+
batchState.batchId,
|
|
2750
|
+
"SAFE-STOP: verification rollback failed — forcing paused regardless of policy",
|
|
2751
|
+
{
|
|
2752
|
+
waveIndex: waveIdx,
|
|
2753
|
+
configPolicy: orchConfig.failure.on_merge_failure,
|
|
2754
|
+
...(hasPersistErrors ? { persistenceErrors: mergeResult.persistenceErrors } : {}),
|
|
2755
|
+
},
|
|
2756
|
+
);
|
|
2757
|
+
|
|
2758
|
+
batchState.phase = "paused";
|
|
2759
|
+
batchState.errors.push(
|
|
2760
|
+
`Safe-stop at wave ${waveIdx + 1}: verification rollback failed. ` +
|
|
2761
|
+
`Merge worktree and temp branch preserved for recovery. ` +
|
|
2762
|
+
`Check transaction records in .pi/verification/ for recovery commands.` +
|
|
2763
|
+
persistWarning,
|
|
2764
|
+
);
|
|
2765
|
+
persistRuntimeState(
|
|
2766
|
+
"merge-rollback-safe-stop",
|
|
2767
|
+
batchState,
|
|
2768
|
+
wavePlan,
|
|
2769
|
+
latestAllocatedLanes,
|
|
2770
|
+
allTaskOutcomes,
|
|
2771
|
+
discovery,
|
|
2772
|
+
stateRoot,
|
|
2773
|
+
);
|
|
2774
|
+
onNotify(
|
|
2775
|
+
`🛑 Safe-stop: verification rollback failed at wave ${waveIdx + 1}. ` +
|
|
2776
|
+
`Batch force-paused. Merge worktree preserved for manual recovery. ` +
|
|
2777
|
+
`See .pi/verification/ transaction records for recovery commands.` +
|
|
2778
|
+
persistWarning,
|
|
2779
|
+
"error",
|
|
2780
|
+
);
|
|
2781
|
+
|
|
2782
|
+
// ── TP-076: Emit supervisor alert for rollback safe-stop ──
|
|
2783
|
+
const rollbackRepoId = extractFailedRepoId(mergeResult) ?? undefined;
|
|
2784
|
+
emitAlert({
|
|
2785
|
+
category: "merge-failure",
|
|
2786
|
+
summary:
|
|
2787
|
+
`⚠️ Merge failed for wave ${waveIdx + 1} — verification rollback failed\n` +
|
|
2788
|
+
` Batch force-paused for manual recovery.\n` +
|
|
2789
|
+
` Check .pi/verification/ for recovery commands.\n\n` +
|
|
2790
|
+
`Available actions:\n` +
|
|
2791
|
+
` - Check .pi/verification/ transaction records\n` +
|
|
2792
|
+
` - orch_status() to inspect current state\n` +
|
|
2793
|
+
` - orch_resume(force=true) after manual recovery`,
|
|
2794
|
+
context: {
|
|
2795
|
+
waveIndex: waveIdx,
|
|
2796
|
+
laneNumber: mergeResult.failedLane ?? undefined,
|
|
2797
|
+
repoId: rollbackRepoId,
|
|
2798
|
+
mergeError: `Safe-stop: verification rollback failed at wave ${waveIdx + 1}`,
|
|
2799
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
2800
|
+
},
|
|
2801
|
+
});
|
|
2802
|
+
|
|
2803
|
+
preserveWorktreesForResume = true;
|
|
2804
|
+
break;
|
|
2805
|
+
}
|
|
2806
|
+
|
|
2807
|
+
// Handle merge failure — TP-033 Step 2 (R006): Retry policy matrix via shared applyMergeRetryLoop.
|
|
2808
|
+
// Uses the same centralized loop as engine.ts for guaranteed parity.
|
|
2809
|
+
if (mergeResult && (mergeResult.status === "failed" || mergeResult.status === "partial")) {
|
|
2810
|
+
// Initialize resilience state if not yet present
|
|
2811
|
+
if (!batchState.resilience) {
|
|
2812
|
+
batchState.resilience = defaultResilienceState();
|
|
2813
|
+
}
|
|
2814
|
+
|
|
2815
|
+
const mergeRepoId = extractFailedRepoId(mergeResult) ?? undefined;
|
|
2816
|
+
const retryOutcome = await applyMergeRetryLoop(
|
|
2817
|
+
mergeResult,
|
|
2818
|
+
waveIdx,
|
|
2819
|
+
batchState.resilience.retryCountByScope,
|
|
2820
|
+
{
|
|
2821
|
+
performMerge: async () => {
|
|
2822
|
+
batchState.phase = "merging";
|
|
2823
|
+
return await mergeWaveByRepo(
|
|
2824
|
+
waveResult.allocatedLanes,
|
|
2825
|
+
waveResult,
|
|
2826
|
+
waveIdx + 1,
|
|
2827
|
+
orchConfig,
|
|
2828
|
+
repoRoot,
|
|
2829
|
+
batchState.batchId,
|
|
2830
|
+
batchState.orchBranch,
|
|
2831
|
+
workspaceConfig,
|
|
2832
|
+
stateRoot,
|
|
2833
|
+
agentRoot,
|
|
2834
|
+
runnerConfig.testing_commands,
|
|
2835
|
+
undefined, // healthMonitor
|
|
2836
|
+
undefined, // forceMixedOutcome
|
|
2837
|
+
resumeBackend,
|
|
2838
|
+
);
|
|
2839
|
+
},
|
|
2840
|
+
persist: (trigger) =>
|
|
2841
|
+
persistRuntimeState(
|
|
2842
|
+
trigger,
|
|
2843
|
+
batchState,
|
|
2844
|
+
wavePlan,
|
|
2845
|
+
latestAllocatedLanes,
|
|
2846
|
+
allTaskOutcomes,
|
|
2847
|
+
discovery,
|
|
2848
|
+
stateRoot,
|
|
2849
|
+
),
|
|
2850
|
+
log: (message, details) => execLog("batch", batchState.batchId, message, details),
|
|
2851
|
+
notify: (message, level) => onNotify(message, level),
|
|
2852
|
+
updateMergeResult: (result) => {
|
|
2853
|
+
mergeResult = result;
|
|
2854
|
+
batchState.mergeResults[batchState.mergeResults.length - 1] = result;
|
|
2855
|
+
},
|
|
2856
|
+
sleep: sleepSync,
|
|
2857
|
+
},
|
|
2858
|
+
);
|
|
2859
|
+
|
|
2860
|
+
if (retryOutcome.kind === "retry_succeeded") {
|
|
2861
|
+
mergeResult = retryOutcome.mergeResult;
|
|
2862
|
+
batchState.phase = "executing";
|
|
2863
|
+
persistRuntimeState(
|
|
2864
|
+
"merge-retry-succeeded",
|
|
2865
|
+
batchState,
|
|
2866
|
+
wavePlan,
|
|
2867
|
+
latestAllocatedLanes,
|
|
2868
|
+
allTaskOutcomes,
|
|
2869
|
+
discovery,
|
|
2870
|
+
stateRoot,
|
|
2871
|
+
);
|
|
2872
|
+
// Fall through to normal post-merge flow
|
|
2873
|
+
} else if (retryOutcome.kind === "safe_stop") {
|
|
2874
|
+
mergeResult = retryOutcome.mergeResult;
|
|
2875
|
+
batchState.phase = "paused";
|
|
2876
|
+
batchState.errors.push(retryOutcome.errorMessage);
|
|
2877
|
+
persistRuntimeState(
|
|
2878
|
+
"merge-rollback-safe-stop",
|
|
2879
|
+
batchState,
|
|
2880
|
+
wavePlan,
|
|
2881
|
+
latestAllocatedLanes,
|
|
2882
|
+
allTaskOutcomes,
|
|
2883
|
+
discovery,
|
|
2884
|
+
stateRoot,
|
|
2885
|
+
);
|
|
2886
|
+
onNotify(retryOutcome.notifyMessage, "error");
|
|
2887
|
+
|
|
2888
|
+
// ── TP-076: Emit supervisor alert for merge safe-stop ──
|
|
2889
|
+
emitAlert({
|
|
2890
|
+
category: "merge-failure",
|
|
2891
|
+
summary:
|
|
2892
|
+
`⚠️ Merge failed for wave ${waveIdx + 1} — rollback failure\n` +
|
|
2893
|
+
` Error: ${retryOutcome.errorMessage}\n\n` +
|
|
2894
|
+
`Available actions:\n` +
|
|
2895
|
+
` - orch_status() to inspect current state\n` +
|
|
2896
|
+
` - orch_resume(force=true) after manual recovery`,
|
|
2897
|
+
context: {
|
|
2898
|
+
waveIndex: waveIdx,
|
|
2899
|
+
laneNumber: mergeResult.failedLane ?? undefined,
|
|
2900
|
+
repoId: mergeRepoId,
|
|
2901
|
+
mergeError: retryOutcome.errorMessage,
|
|
2902
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
2903
|
+
},
|
|
2904
|
+
});
|
|
2905
|
+
|
|
2906
|
+
preserveWorktreesForResume = true;
|
|
2907
|
+
break;
|
|
2908
|
+
} else if (retryOutcome.kind === "exhausted") {
|
|
2909
|
+
// TP-033 R006-2: Force paused regardless of on_merge_failure config.
|
|
2910
|
+
mergeResult = retryOutcome.mergeResult;
|
|
2911
|
+
const exhaustionMsg =
|
|
2912
|
+
retryOutcome.errorMessage +
|
|
2913
|
+
` [${retryOutcome.classification ?? "unknown"} ${retryOutcome.lastDecision.currentAttempt}/${retryOutcome.lastDecision.maxAttempts}, scope=${retryOutcome.scopeKey}]`;
|
|
2914
|
+
|
|
2915
|
+
execLog("batch", batchState.batchId, `merge retry exhausted — forcing paused`, {
|
|
2916
|
+
classification: retryOutcome.classification,
|
|
2917
|
+
scopeKey: retryOutcome.scopeKey,
|
|
2918
|
+
attempts: retryOutcome.lastDecision.currentAttempt,
|
|
2919
|
+
maxAttempts: retryOutcome.lastDecision.maxAttempts,
|
|
2920
|
+
});
|
|
2921
|
+
|
|
2922
|
+
batchState.phase = "paused";
|
|
2923
|
+
batchState.errors.push(exhaustionMsg);
|
|
2924
|
+
persistRuntimeState(
|
|
2925
|
+
"merge-retry-exhausted",
|
|
2926
|
+
batchState,
|
|
2927
|
+
wavePlan,
|
|
2928
|
+
latestAllocatedLanes,
|
|
2929
|
+
allTaskOutcomes,
|
|
2930
|
+
discovery,
|
|
2931
|
+
stateRoot,
|
|
2932
|
+
);
|
|
2933
|
+
onNotify(retryOutcome.notifyMessage, "error");
|
|
2934
|
+
|
|
2935
|
+
// ── TP-076: Emit supervisor alert for merge retry exhausted ──
|
|
2936
|
+
emitAlert({
|
|
2937
|
+
category: "merge-failure",
|
|
2938
|
+
summary:
|
|
2939
|
+
`⚠️ Merge failed for wave ${waveIdx + 1} — retry exhausted\n` +
|
|
2940
|
+
` Classification: ${retryOutcome.classification ?? "unknown"}\n` +
|
|
2941
|
+
` Error: ${exhaustionMsg}\n\n` +
|
|
2942
|
+
`Available actions:\n` +
|
|
2943
|
+
` - Investigate merge failure and retry manually\n` +
|
|
2944
|
+
` - orch_status() to inspect current state\n` +
|
|
2945
|
+
` - orch_resume(force=true) after fixing the issue`,
|
|
2946
|
+
context: {
|
|
2947
|
+
waveIndex: waveIdx,
|
|
2948
|
+
laneNumber: mergeResult.failedLane ?? undefined,
|
|
2949
|
+
repoId: mergeRepoId,
|
|
2950
|
+
mergeError: exhaustionMsg,
|
|
2951
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
2952
|
+
},
|
|
2953
|
+
});
|
|
2954
|
+
|
|
2955
|
+
preserveWorktreesForResume = true;
|
|
2956
|
+
break;
|
|
2957
|
+
} else {
|
|
2958
|
+
// kind === "no_retry": fall through to standard on_merge_failure policy
|
|
2959
|
+
mergeResult = retryOutcome.mergeResult;
|
|
2960
|
+
const policyResult = computeMergeFailurePolicy(mergeResult, waveIdx, orchConfig);
|
|
2961
|
+
const classNote = retryOutcome.classification
|
|
2962
|
+
? ` [not retriable: ${retryOutcome.classification}, scope=${retryOutcome.scopeKey}]`
|
|
2963
|
+
: "";
|
|
2964
|
+
|
|
2965
|
+
execLog(
|
|
2966
|
+
"batch",
|
|
2967
|
+
batchState.batchId,
|
|
2968
|
+
`merge failure — applying ${policyResult.policy} policy${classNote}`,
|
|
2969
|
+
policyResult.logDetails,
|
|
2970
|
+
);
|
|
2971
|
+
|
|
2972
|
+
batchState.phase = policyResult.targetPhase;
|
|
2973
|
+
batchState.errors.push(policyResult.errorMessage + classNote);
|
|
2974
|
+
persistRuntimeState(
|
|
2975
|
+
policyResult.persistTrigger,
|
|
2976
|
+
batchState,
|
|
2977
|
+
wavePlan,
|
|
2978
|
+
latestAllocatedLanes,
|
|
2979
|
+
allTaskOutcomes,
|
|
2980
|
+
discovery,
|
|
2981
|
+
stateRoot,
|
|
2982
|
+
);
|
|
2983
|
+
onNotify(policyResult.notifyMessage + classNote, policyResult.notifyLevel);
|
|
2984
|
+
|
|
2985
|
+
// ── TP-076: Emit supervisor alert for merge failure (no-retry policy) ──
|
|
2986
|
+
emitAlert({
|
|
2987
|
+
category: "merge-failure",
|
|
2988
|
+
summary:
|
|
2989
|
+
`⚠️ Merge failed for wave ${waveIdx + 1}\n` +
|
|
2990
|
+
` Policy: ${policyResult.policy}${classNote}\n` +
|
|
2991
|
+
` Error: ${mergeResult.failureReason || "unknown"}\n\n` +
|
|
2992
|
+
`Available actions:\n` +
|
|
2993
|
+
` - Investigate failed merge\n` +
|
|
2994
|
+
` - orch_status() to inspect current state\n` +
|
|
2995
|
+
` - orch_resume(force=true) after fixing the issue`,
|
|
2996
|
+
context: {
|
|
2997
|
+
waveIndex: waveIdx,
|
|
2998
|
+
laneNumber: mergeResult.failedLane ?? undefined,
|
|
2999
|
+
repoId: mergeRepoId,
|
|
3000
|
+
mergeError: mergeResult.failureReason || "unknown",
|
|
3001
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
3002
|
+
},
|
|
3003
|
+
});
|
|
3004
|
+
|
|
3005
|
+
preserveWorktreesForResume = true;
|
|
3006
|
+
break;
|
|
3007
|
+
}
|
|
3008
|
+
}
|
|
3009
|
+
|
|
3010
|
+
// Post-merge: reset worktrees for next wave
|
|
3011
|
+
// TP-032 R006-3: Exclude verification_new_failure lanes from branch cleanup
|
|
3012
|
+
if (mergeResult && mergeResult.status === "succeeded") {
|
|
3013
|
+
for (const lr of mergeResult.laneResults) {
|
|
3014
|
+
if (
|
|
3015
|
+
!lr.error &&
|
|
3016
|
+
(lr.result?.status === "SUCCESS" || lr.result?.status === "CONFLICT_RESOLVED")
|
|
3017
|
+
) {
|
|
3018
|
+
const laneRepoRoot = resolveRepoRoot(lr.repoId, repoRoot, workspaceConfig);
|
|
3019
|
+
const ancestorCheck = runGit(
|
|
3020
|
+
["merge-base", "--is-ancestor", lr.sourceBranch, lr.targetBranch],
|
|
3021
|
+
laneRepoRoot,
|
|
3022
|
+
);
|
|
3023
|
+
if (ancestorCheck.ok) {
|
|
3024
|
+
deleteBranchBestEffort(lr.sourceBranch, laneRepoRoot);
|
|
3025
|
+
}
|
|
3026
|
+
}
|
|
3027
|
+
}
|
|
3028
|
+
}
|
|
3029
|
+
|
|
3030
|
+
// ── TP-028: Preserve partial progress before inter-wave reset ──
|
|
3031
|
+
// Hoisted outside the if-block so unsafeBranches is accessible to the
|
|
3032
|
+
// reset loop below — both blocks share the same guard condition.
|
|
3033
|
+
let ppUnsafeBranches = new Set<string>();
|
|
3034
|
+
if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
|
|
3035
|
+
const ppOpId = resolveOperatorId(orchConfig);
|
|
3036
|
+
const ppResult = preserveFailedLaneProgress(
|
|
3037
|
+
latestAllocatedLanes,
|
|
3038
|
+
allTaskOutcomes,
|
|
3039
|
+
ppOpId,
|
|
3040
|
+
batchState.batchId,
|
|
3041
|
+
(repoId) => {
|
|
3042
|
+
const perRepoRoot = resolveRepoRoot(repoId, repoRoot, workspaceConfig);
|
|
3043
|
+
let targetBranch = batchState.orchBranch;
|
|
3044
|
+
if (repoId && perRepoRoot !== repoRoot) {
|
|
3045
|
+
try {
|
|
3046
|
+
targetBranch = resolveBaseBranch(
|
|
3047
|
+
repoId,
|
|
3048
|
+
perRepoRoot,
|
|
3049
|
+
batchState.orchBranch,
|
|
3050
|
+
workspaceConfig,
|
|
3051
|
+
);
|
|
3052
|
+
} catch {
|
|
3053
|
+
/* fall back to orchBranch */
|
|
3054
|
+
}
|
|
3055
|
+
}
|
|
3056
|
+
return { repoRoot: perRepoRoot, targetBranch };
|
|
3057
|
+
},
|
|
3058
|
+
);
|
|
3059
|
+
ppUnsafeBranches = ppResult.unsafeBranches;
|
|
3060
|
+
if (ppResult.results.some((r) => r.saved)) {
|
|
3061
|
+
execLog(
|
|
3062
|
+
"batch",
|
|
3063
|
+
batchState.batchId,
|
|
3064
|
+
`preserved partial progress for ${ppResult.results.filter((r) => r.saved).length} failed task(s) before inter-wave reset`,
|
|
3065
|
+
);
|
|
3066
|
+
}
|
|
3067
|
+
// Log per-task warnings for failed preservation attempts
|
|
3068
|
+
for (const r of ppResult.results) {
|
|
3069
|
+
if (!r.saved && (r.commitCount > 0 || r.error)) {
|
|
3070
|
+
execLog(
|
|
3071
|
+
"batch",
|
|
3072
|
+
batchState.batchId,
|
|
3073
|
+
`WARNING: Failed to preserve partial progress for task ${r.taskId} ` +
|
|
3074
|
+
`(${r.commitCount} commit(s) at risk on lane branch)`,
|
|
3075
|
+
{ taskId: r.taskId, commitCount: r.commitCount, error: r.error ?? "unknown" },
|
|
3076
|
+
);
|
|
3077
|
+
}
|
|
3078
|
+
}
|
|
3079
|
+
if (ppUnsafeBranches.size > 0) {
|
|
3080
|
+
execLog(
|
|
3081
|
+
"batch",
|
|
3082
|
+
batchState.batchId,
|
|
3083
|
+
`WARNING: ${ppUnsafeBranches.size} lane branch(es) could not be preserved — skipping reset for those lanes to prevent commit loss`,
|
|
3084
|
+
{ unsafeBranches: [...ppUnsafeBranches] },
|
|
3085
|
+
);
|
|
3086
|
+
}
|
|
3087
|
+
// TP-028: Stamp task outcomes with partial progress data for persistence
|
|
3088
|
+
applyPartialProgressToOutcomes(ppResult, allTaskOutcomes);
|
|
3089
|
+
}
|
|
3090
|
+
|
|
3091
|
+
if (waveIdx < persistedState.wavePlan.length - 1 && !batchState.pauseSignal.paused) {
|
|
3092
|
+
const wtPrefix = orchConfig.orchestrator.worktree_prefix;
|
|
3093
|
+
const resetOpId = resolveOperatorId(orchConfig);
|
|
3094
|
+
// TP-029 R006: Track worktrees that failed reset AND removal
|
|
3095
|
+
// so the cleanup gate only fires on true stale state, not
|
|
3096
|
+
// successfully-reset reusable worktrees. (Parity with engine.ts)
|
|
3097
|
+
const failedRemovalWorktrees = new Map<
|
|
3098
|
+
string,
|
|
3099
|
+
{ repoId: string | undefined; paths: string[] }
|
|
3100
|
+
>();
|
|
3101
|
+
|
|
3102
|
+
// Use encounteredRepoRoots which includes both persisted lanes
|
|
3103
|
+
// AND newly allocated lanes from resumed waves, ensuring repos
|
|
3104
|
+
// introduced after resume starts are covered.
|
|
3105
|
+
// Per-repo target branch: primary repo uses orchBranch, secondary
|
|
3106
|
+
// repos resolve their own branch (same as cleanup — see section 11).
|
|
3107
|
+
for (const perRepoRoot of encounteredRepoRoots) {
|
|
3108
|
+
const existingWorktrees = listWorktrees(wtPrefix, perRepoRoot, resetOpId, batchState.batchId);
|
|
3109
|
+
if (existingWorktrees.length > 0) {
|
|
3110
|
+
let targetBranch: string;
|
|
3111
|
+
if (perRepoRoot === repoRoot) {
|
|
3112
|
+
targetBranch = batchState.orchBranch;
|
|
3113
|
+
} else {
|
|
3114
|
+
const repoId = resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
|
|
3115
|
+
try {
|
|
3116
|
+
targetBranch = resolveBaseBranch(
|
|
3117
|
+
repoId,
|
|
3118
|
+
perRepoRoot,
|
|
3119
|
+
batchState.orchBranch,
|
|
3120
|
+
workspaceConfig,
|
|
3121
|
+
);
|
|
3122
|
+
} catch {
|
|
3123
|
+
// If resolution fails, fall back to orchBranch (reset will
|
|
3124
|
+
// fail gracefully and trigger worktree removal)
|
|
3125
|
+
targetBranch = batchState.orchBranch;
|
|
3126
|
+
}
|
|
3127
|
+
}
|
|
3128
|
+
for (const wt of existingWorktrees) {
|
|
3129
|
+
// TP-028: Skip reset for worktrees whose lane branch has
|
|
3130
|
+
// unsaved partial progress (preservation failed with commits)
|
|
3131
|
+
if (ppUnsafeBranches.has(wt.branch)) {
|
|
3132
|
+
execLog(
|
|
3133
|
+
"batch",
|
|
3134
|
+
batchState.batchId,
|
|
3135
|
+
`skipping worktree reset for lane ${wt.laneNumber} — branch "${wt.branch}" has unsaved partial progress`,
|
|
3136
|
+
{ path: wt.path, branch: wt.branch },
|
|
3137
|
+
);
|
|
3138
|
+
continue;
|
|
3139
|
+
}
|
|
3140
|
+
|
|
3141
|
+
const resetResult = safeResetWorktree(wt, targetBranch, perRepoRoot);
|
|
3142
|
+
if (!resetResult.success) {
|
|
3143
|
+
try {
|
|
3144
|
+
removeWorktree(wt, perRepoRoot);
|
|
3145
|
+
} catch {
|
|
3146
|
+
forceCleanupWorktree(wt, perRepoRoot, batchState.batchId);
|
|
3147
|
+
// Track this worktree for the cleanup gate — it may still be registered
|
|
3148
|
+
const perRepoId =
|
|
3149
|
+
perRepoRoot === repoRoot ? undefined : resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
|
|
3150
|
+
if (!failedRemovalWorktrees.has(perRepoRoot)) {
|
|
3151
|
+
failedRemovalWorktrees.set(perRepoRoot, { repoId: perRepoId, paths: [] });
|
|
3152
|
+
}
|
|
3153
|
+
failedRemovalWorktrees.get(perRepoRoot)!.paths.push(wt.path);
|
|
3154
|
+
}
|
|
3155
|
+
}
|
|
3156
|
+
}
|
|
3157
|
+
}
|
|
3158
|
+
}
|
|
3159
|
+
|
|
3160
|
+
// ── TP-029: Post-merge cleanup gate (parity with engine.ts) ──
|
|
3161
|
+
// Only gate on worktrees that the reset loop tried and failed
|
|
3162
|
+
// to remove. Successfully-reset reusable worktrees are expected
|
|
3163
|
+
// to remain registered — they will be reused in the next wave.
|
|
3164
|
+
// For each failed-removal worktree, verify it is still registered
|
|
3165
|
+
// before classifying it as truly stale.
|
|
3166
|
+
const cleanupGateFailures: CleanupGateRepoFailure[] = [];
|
|
3167
|
+
if (failedRemovalWorktrees.size > 0) {
|
|
3168
|
+
for (const [perRepoRoot, { repoId: perRepoId, paths: failedPaths }] of failedRemovalWorktrees) {
|
|
3169
|
+
const remaining = listWorktrees(wtPrefix, perRepoRoot, resetOpId, batchState.batchId);
|
|
3170
|
+
const remainingPaths = new Set(remaining.map((wt) => wt.path));
|
|
3171
|
+
// Only report worktrees that were targeted for removal but are still registered
|
|
3172
|
+
const stale = failedPaths.filter((p) => remainingPaths.has(p));
|
|
3173
|
+
if (stale.length > 0) {
|
|
3174
|
+
cleanupGateFailures.push({
|
|
3175
|
+
repoRoot: perRepoRoot,
|
|
3176
|
+
repoId: perRepoId,
|
|
3177
|
+
staleWorktrees: stale,
|
|
3178
|
+
});
|
|
3179
|
+
}
|
|
3180
|
+
}
|
|
3181
|
+
}
|
|
3182
|
+
|
|
3183
|
+
if (cleanupGateFailures.length > 0) {
|
|
3184
|
+
const gatePolicyResult = computeCleanupGatePolicy(waveIdx, cleanupGateFailures);
|
|
3185
|
+
|
|
3186
|
+
execLog(
|
|
3187
|
+
"batch",
|
|
3188
|
+
batchState.batchId,
|
|
3189
|
+
`cleanup gate failed — pausing batch`,
|
|
3190
|
+
gatePolicyResult.logDetails,
|
|
3191
|
+
);
|
|
3192
|
+
|
|
3193
|
+
batchState.phase = gatePolicyResult.targetPhase;
|
|
3194
|
+
batchState.errors.push(gatePolicyResult.errorMessage);
|
|
3195
|
+
persistRuntimeState(
|
|
3196
|
+
gatePolicyResult.persistTrigger,
|
|
3197
|
+
batchState,
|
|
3198
|
+
wavePlan,
|
|
3199
|
+
latestAllocatedLanes,
|
|
3200
|
+
allTaskOutcomes,
|
|
3201
|
+
discovery,
|
|
3202
|
+
stateRoot,
|
|
3203
|
+
);
|
|
3204
|
+
onNotify(gatePolicyResult.notifyMessage, gatePolicyResult.notifyLevel);
|
|
3205
|
+
preserveWorktreesForResume = true;
|
|
3206
|
+
break;
|
|
3207
|
+
}
|
|
3208
|
+
}
|
|
3209
|
+
}
|
|
3210
|
+
|
|
3211
|
+
// ── Pre-cleanup: Determine if worktrees should be preserved ──
|
|
3212
|
+
// TP-031 (R006): Parity with engine.ts — this check MUST run before cleanup
|
|
3213
|
+
// so that worktrees survive when failedTasks > 0. Without this, cleanup
|
|
3214
|
+
// deletes worktrees before the batch is marked "paused", breaking resumability.
|
|
3215
|
+
if (
|
|
3216
|
+
!preserveWorktreesForResume &&
|
|
3217
|
+
((batchState.phase as OrchBatchPhase) === "executing" ||
|
|
3218
|
+
(batchState.phase as OrchBatchPhase) === "merging") &&
|
|
3219
|
+
batchState.failedTasks > 0
|
|
3220
|
+
) {
|
|
3221
|
+
preserveWorktreesForResume = true;
|
|
3222
|
+
execLog(
|
|
3223
|
+
"resume",
|
|
3224
|
+
batchState.batchId,
|
|
3225
|
+
"pre-cleanup: failedTasks > 0 detected, preserving worktrees for resume",
|
|
3226
|
+
);
|
|
3227
|
+
}
|
|
3228
|
+
|
|
3229
|
+
// ── 11. Cleanup and terminal state ───────────────────────────
|
|
3230
|
+
|
|
3231
|
+
// ── TP-028: Preserve partial progress before terminal cleanup ──
|
|
3232
|
+
if (!preserveWorktreesForResume) {
|
|
3233
|
+
const ppOpId = resolveOperatorId(orchConfig);
|
|
3234
|
+
const ppResult = preserveFailedLaneProgress(
|
|
3235
|
+
latestAllocatedLanes,
|
|
3236
|
+
allTaskOutcomes,
|
|
3237
|
+
ppOpId,
|
|
3238
|
+
batchState.batchId,
|
|
3239
|
+
(repoId) => {
|
|
3240
|
+
const perRepoRoot = resolveRepoRoot(repoId, repoRoot, workspaceConfig);
|
|
3241
|
+
let targetBranch = batchState.orchBranch;
|
|
3242
|
+
if (repoId && perRepoRoot !== repoRoot) {
|
|
3243
|
+
try {
|
|
3244
|
+
targetBranch = resolveBaseBranch(repoId, perRepoRoot, batchState.orchBranch, workspaceConfig);
|
|
3245
|
+
} catch {
|
|
3246
|
+
/* fall back to orchBranch */
|
|
3247
|
+
}
|
|
3248
|
+
}
|
|
3249
|
+
return { repoRoot: perRepoRoot, targetBranch };
|
|
3250
|
+
},
|
|
3251
|
+
);
|
|
3252
|
+
if (ppResult.results.some((r) => r.saved)) {
|
|
3253
|
+
execLog(
|
|
3254
|
+
"batch",
|
|
3255
|
+
batchState.batchId,
|
|
3256
|
+
`preserved partial progress for ${ppResult.results.filter((r) => r.saved).length} failed task(s) before terminal cleanup`,
|
|
3257
|
+
);
|
|
3258
|
+
}
|
|
3259
|
+
// Log warnings for failed preservation attempts — at terminal cleanup
|
|
3260
|
+
// we cannot skip deletion (batch is ending), but operators need to know
|
|
3261
|
+
// that commits may end up reachable only via the reflog.
|
|
3262
|
+
for (const r of ppResult.results) {
|
|
3263
|
+
if (!r.saved && (r.commitCount > 0 || r.error)) {
|
|
3264
|
+
execLog(
|
|
3265
|
+
"batch",
|
|
3266
|
+
batchState.batchId,
|
|
3267
|
+
`WARNING: Failed to preserve partial progress for task ${r.taskId} ` +
|
|
3268
|
+
`(${r.commitCount} commit(s) may become unreachable after cleanup)`,
|
|
3269
|
+
{ taskId: r.taskId, commitCount: r.commitCount, error: r.error ?? "unknown" },
|
|
3270
|
+
);
|
|
3271
|
+
}
|
|
3272
|
+
}
|
|
3273
|
+
// TP-028: Stamp task outcomes with partial progress data for persistence
|
|
3274
|
+
applyPartialProgressToOutcomes(ppResult, allTaskOutcomes);
|
|
3275
|
+
}
|
|
3276
|
+
|
|
3277
|
+
if (!preserveWorktreesForResume) {
|
|
3278
|
+
const wtPrefix = orchConfig.orchestrator.worktree_prefix;
|
|
3279
|
+
const cleanupOpId = resolveOperatorId(orchConfig);
|
|
3280
|
+
|
|
3281
|
+
// Use encounteredRepoRoots which includes both persisted lanes
|
|
3282
|
+
// AND newly allocated lanes from resumed waves, ensuring repos
|
|
3283
|
+
// introduced after resume starts are cleaned up.
|
|
3284
|
+
//
|
|
3285
|
+
// Per-repo target branch resolution (workspace-mode correctness):
|
|
3286
|
+
// In repo mode, orchBranch is the correct target for all worktrees.
|
|
3287
|
+
// In workspace mode, the orchBranch only exists in the primary repo.
|
|
3288
|
+
// Secondary repos were merged against their own resolved base branch
|
|
3289
|
+
// (via resolveBaseBranch in mergeWaveByRepo), so unmerged-branch
|
|
3290
|
+
// protection must compare against that same per-repo branch.
|
|
3291
|
+
for (const perRepoRoot of encounteredRepoRoots) {
|
|
3292
|
+
let targetBranch: string | undefined;
|
|
3293
|
+
if (perRepoRoot === repoRoot) {
|
|
3294
|
+
// Primary repo: lane branches were merged into orchBranch
|
|
3295
|
+
targetBranch = batchState.orchBranch;
|
|
3296
|
+
} else {
|
|
3297
|
+
// Secondary repo (workspace mode): resolve the repo's own branch
|
|
3298
|
+
// using the same logic as mergeWaveByRepo. Find repoId by matching
|
|
3299
|
+
// the resolved path back to workspace config.
|
|
3300
|
+
const repoId = resolveRepoIdFromRoot(perRepoRoot, workspaceConfig);
|
|
3301
|
+
try {
|
|
3302
|
+
targetBranch = resolveBaseBranch(repoId, perRepoRoot, batchState.orchBranch, workspaceConfig);
|
|
3303
|
+
} catch {
|
|
3304
|
+
// resolveBaseBranch may throw if HEAD is detached and no
|
|
3305
|
+
// defaultBranch is configured. Fall back to undefined which
|
|
3306
|
+
// skips branch protection (branches are deleted without
|
|
3307
|
+
// merge-status check — safe because successfully merged
|
|
3308
|
+
// branches were already cleaned up in post-merge steps).
|
|
3309
|
+
targetBranch = undefined;
|
|
3310
|
+
}
|
|
3311
|
+
}
|
|
3312
|
+
removeAllWorktrees(
|
|
3313
|
+
wtPrefix,
|
|
3314
|
+
perRepoRoot,
|
|
3315
|
+
cleanupOpId,
|
|
3316
|
+
targetBranch,
|
|
3317
|
+
batchState.batchId,
|
|
3318
|
+
orchConfig,
|
|
3319
|
+
);
|
|
3320
|
+
}
|
|
3321
|
+
}
|
|
3322
|
+
|
|
3323
|
+
batchState.endedAt = Date.now();
|
|
3324
|
+
const totalElapsedSec = Math.round((batchState.endedAt - batchState.startedAt) / 1000);
|
|
3325
|
+
|
|
3326
|
+
if (
|
|
3327
|
+
(batchState.phase as OrchBatchPhase) === "executing" ||
|
|
3328
|
+
(batchState.phase as OrchBatchPhase) === "merging"
|
|
3329
|
+
) {
|
|
3330
|
+
if (batchState.failedTasks > 0) {
|
|
3331
|
+
// TP-031: Parity with engine.ts — default to "paused" so the batch is
|
|
3332
|
+
// resumable without --force. "failed" is reserved for unrecoverable
|
|
3333
|
+
// invariant violations after retry exhaustion.
|
|
3334
|
+
// NOTE: preserveWorktreesForResume was already set pre-cleanup to ensure
|
|
3335
|
+
// worktrees survive; this just sets the phase for state persistence.
|
|
3336
|
+
batchState.phase = "paused";
|
|
3337
|
+
} else {
|
|
3338
|
+
batchState.phase = "completed";
|
|
3339
|
+
}
|
|
3340
|
+
}
|
|
3341
|
+
|
|
3342
|
+
// ── Auto-Integration & Orch Branch Preservation (TP-022 Step 4) ──
|
|
3343
|
+
// Parity with engine.ts: auto-integrate if configured, else show manual guidance.
|
|
3344
|
+
// Gate: only run for terminal phases (completed/failed). Paused/stopped batches
|
|
3345
|
+
// are not yet done — integration would mutate refs prematurely.
|
|
3346
|
+
//
|
|
3347
|
+
// TP-043: "supervised" and "auto" integration modes are now owned by the
|
|
3348
|
+
// supervisor agent. Legacy engine fast-forward is removed — supervisor
|
|
3349
|
+
// handles all non-manual integration after batch_complete event.
|
|
3350
|
+
const mergedTaskCount = batchState.succeededTasks;
|
|
3351
|
+
// TP-195: hoist `batchState.phase` to a fresh local with the wide
|
|
3352
|
+
// `OrchBatchPhase` type. TypeScript's narrowing-on-property semantics
|
|
3353
|
+
// under `strict: false` carries assignments forward through the
|
|
3354
|
+
// function (visible in the `(batchState.phase as OrchBatchPhase) === ...`
|
|
3355
|
+
// pattern already used in the pre-cleanup and terminal-phase checks above), which here narrows
|
|
3356
|
+
// `batchState.phase` to a subtype that excludes `"completed"` and
|
|
3357
|
+
// `"failed"`. Hoisting to a typed local breaks the narrowing chain so
|
|
3358
|
+
// the comparisons typecheck without a per-call cast. Runtime
|
|
3359
|
+
// evaluation is identical.
|
|
3360
|
+
const phaseAtTerminal = batchState.phase as OrchBatchPhase;
|
|
3361
|
+
const isTerminalPhase = phaseAtTerminal === "completed" || phaseAtTerminal === "failed";
|
|
3362
|
+
if (
|
|
3363
|
+
isTerminalPhase &&
|
|
3364
|
+
!preserveWorktreesForResume &&
|
|
3365
|
+
batchState.orchBranch &&
|
|
3366
|
+
mergedTaskCount > 0
|
|
3367
|
+
) {
|
|
3368
|
+
if (
|
|
3369
|
+
orchConfig.orchestrator.integration === "supervised" ||
|
|
3370
|
+
orchConfig.orchestrator.integration === "auto"
|
|
3371
|
+
) {
|
|
3372
|
+
// TP-043: Supervisor-managed integration modes. Defer to supervisor.
|
|
3373
|
+
execLog(
|
|
3374
|
+
"resume",
|
|
3375
|
+
batchState.batchId,
|
|
3376
|
+
`integration deferred to supervisor (mode: ${orchConfig.orchestrator.integration})`,
|
|
3377
|
+
);
|
|
3378
|
+
} else {
|
|
3379
|
+
// Manual mode (default): show integration guidance
|
|
3380
|
+
onNotify(
|
|
3381
|
+
ORCH_MESSAGES.orchIntegrationManual(
|
|
3382
|
+
batchState.orchBranch,
|
|
3383
|
+
batchState.baseBranch,
|
|
3384
|
+
mergedTaskCount,
|
|
3385
|
+
),
|
|
3386
|
+
"info",
|
|
3387
|
+
);
|
|
3388
|
+
}
|
|
3389
|
+
}
|
|
3390
|
+
|
|
3391
|
+
persistRuntimeState(
|
|
3392
|
+
"batch-terminal",
|
|
3393
|
+
batchState,
|
|
3394
|
+
wavePlan,
|
|
3395
|
+
latestAllocatedLanes,
|
|
3396
|
+
allTaskOutcomes,
|
|
3397
|
+
discovery,
|
|
3398
|
+
stateRoot,
|
|
3399
|
+
);
|
|
3400
|
+
|
|
3401
|
+
// ── TP-076: Emit supervisor alert for batch completion ──────
|
|
3402
|
+
// TP-195: reuse the hoisted-typed phase to avoid the same narrowing
|
|
3403
|
+
// artifact as the `isTerminalPhase` check above.
|
|
3404
|
+
if (phaseAtTerminal === "completed" || phaseAtTerminal === "failed") {
|
|
3405
|
+
const batchDurationMs = batchState.endedAt ? batchState.endedAt - batchState.startedAt : 0;
|
|
3406
|
+
const durationStr =
|
|
3407
|
+
batchDurationMs > 0
|
|
3408
|
+
? `${Math.floor(batchDurationMs / 60000)}m ${Math.round((batchDurationMs % 60000) / 1000)}s`
|
|
3409
|
+
: "unknown";
|
|
3410
|
+
if (batchState.phase === "completed" && batchState.failedTasks === 0) {
|
|
3411
|
+
emitAlert({
|
|
3412
|
+
category: "batch-complete",
|
|
3413
|
+
summary:
|
|
3414
|
+
`✅ Batch ${batchState.batchId} completed\n` +
|
|
3415
|
+
` ${batchState.succeededTasks}/${batchState.totalTasks} tasks succeeded\n` +
|
|
3416
|
+
` ${batchState.taskLevelWaveCount ?? batchState.totalWaves} wave(s), duration: ${durationStr}\n` +
|
|
3417
|
+
` Merged to orch branch: ${batchState.orchBranch}\n\n` +
|
|
3418
|
+
`Ready for integration. Run orch_integrate() or review first.`,
|
|
3419
|
+
context: {
|
|
3420
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
3421
|
+
batchDurationMs,
|
|
3422
|
+
},
|
|
3423
|
+
});
|
|
3424
|
+
} else {
|
|
3425
|
+
emitAlert({
|
|
3426
|
+
category: "batch-complete",
|
|
3427
|
+
summary:
|
|
3428
|
+
`⚠️ Batch ${batchState.batchId} finished with failures\n` +
|
|
3429
|
+
` ${batchState.succeededTasks} succeeded, ${batchState.failedTasks} failed, ` +
|
|
3430
|
+
`${batchState.skippedTasks} skipped, ${batchState.blockedTasks} blocked\n` +
|
|
3431
|
+
` Duration: ${durationStr}\n\n` +
|
|
3432
|
+
`Available actions:\n` +
|
|
3433
|
+
` - orch_status() to review final state\n` +
|
|
3434
|
+
` - orch_integrate() if succeeded work should be kept\n` +
|
|
3435
|
+
` - orch_resume(force=true) to retry failed tasks`,
|
|
3436
|
+
context: {
|
|
3437
|
+
batchProgress: buildBatchProgressSnapshot(batchState),
|
|
3438
|
+
batchDurationMs,
|
|
3439
|
+
},
|
|
3440
|
+
});
|
|
3441
|
+
}
|
|
3442
|
+
}
|
|
3443
|
+
|
|
3444
|
+
// ── TP-031: Emit diagnostic reports (JSONL + markdown) ──
|
|
3445
|
+
// Non-fatal: errors are logged but never crash batch finalization.
|
|
3446
|
+
emitDiagnosticReports(
|
|
3447
|
+
assembleDiagnosticInput(
|
|
3448
|
+
orchConfig,
|
|
3449
|
+
batchState,
|
|
3450
|
+
wavePlan,
|
|
3451
|
+
latestAllocatedLanes,
|
|
3452
|
+
allTaskOutcomes,
|
|
3453
|
+
stateRoot,
|
|
3454
|
+
),
|
|
3455
|
+
);
|
|
3456
|
+
|
|
3457
|
+
if (batchState.phase === "paused" || batchState.phase === "stopped") {
|
|
3458
|
+
execLog("resume", batchState.batchId, "resumed batch ended in non-terminal state", {
|
|
3459
|
+
phase: batchState.phase,
|
|
3460
|
+
});
|
|
3461
|
+
} else {
|
|
3462
|
+
onNotify(
|
|
3463
|
+
ORCH_MESSAGES.resumeComplete(
|
|
3464
|
+
batchState.batchId,
|
|
3465
|
+
batchState.succeededTasks,
|
|
3466
|
+
batchState.failedTasks,
|
|
3467
|
+
batchState.skippedTasks,
|
|
3468
|
+
batchState.blockedTasks,
|
|
3469
|
+
totalElapsedSec,
|
|
3470
|
+
),
|
|
3471
|
+
batchState.failedTasks > 0 ? "warning" : "info",
|
|
3472
|
+
);
|
|
3473
|
+
|
|
3474
|
+
if (batchState.phase === "completed") {
|
|
3475
|
+
try {
|
|
3476
|
+
deleteBatchState(stateRoot);
|
|
3477
|
+
execLog("state", batchState.batchId, "state file deleted on clean resume completion");
|
|
3478
|
+
} catch {
|
|
3479
|
+
// Best-effort
|
|
3480
|
+
}
|
|
3481
|
+
}
|
|
3482
|
+
}
|
|
3483
|
+
}
|
|
3484
|
+
|
|
3485
|
+
// TP-043: attemptAutoIntegration is no longer called from engine.ts or resume.ts.
|
|
3486
|
+
// Supervisor-managed integration ("supervised" and "auto" modes) is handled by
|
|
3487
|
+
// the supervisor agent after batch_complete. The helper remains in merge.ts for
|
|
3488
|
+
// use by the supervisor's integration flow.
|