@pi-agents/orchid 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/LICENSE +21 -0
  3. package/README.md +246 -0
  4. package/agents/AGENTS-MANIFEST.md +42 -0
  5. package/agents/brain.md +42 -0
  6. package/agents/context-builder.md +46 -0
  7. package/agents/delegate.md +12 -0
  8. package/agents/dev-1.md +42 -0
  9. package/agents/oracle.md +73 -0
  10. package/agents/planner.md +55 -0
  11. package/agents/researcher.md +52 -0
  12. package/agents/reviewer.md +79 -0
  13. package/agents/scout.md +50 -0
  14. package/agents/tester.md +45 -0
  15. package/agents/worker.md +55 -0
  16. package/extensions/ralph.ts +1 -0
  17. package/extensions/reviewer-extension.ts +125 -0
  18. package/extensions/task-orchestrator.ts +28 -0
  19. package/package.json +63 -0
  20. package/prompts/gather-context-and-clarify.md +13 -0
  21. package/prompts/parallel-cleanup.md +59 -0
  22. package/prompts/parallel-context-build.md +53 -0
  23. package/prompts/parallel-handoff-plan.md +59 -0
  24. package/prompts/parallel-research.md +50 -0
  25. package/prompts/parallel-review.md +54 -0
  26. package/prompts/review-loop.md +41 -0
  27. package/skills/orchid/SKILL.md +214 -0
  28. package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
  29. package/skills/orchid/orchid-converge/SKILL.md +124 -0
  30. package/skills/orchid/orchid-decompose/SKILL.md +201 -0
  31. package/skills/orchid/orchid-doctor/SKILL.md +162 -0
  32. package/skills/orchid/orchid-investigate/SKILL.md +102 -0
  33. package/skills/orchid/orchid-launch/SKILL.md +147 -0
  34. package/skills/ralph/SKILL.md +73 -0
  35. package/skills/subagents/pi-subagents/SKILL.md +813 -0
  36. package/src/index.ts +7 -0
  37. package/src/orchestrator/abort.ts +534 -0
  38. package/src/orchestrator/agent-bridge-extension.ts +1020 -0
  39. package/src/orchestrator/agent-host.ts +954 -0
  40. package/src/orchestrator/cleanup.ts +776 -0
  41. package/src/orchestrator/config-loader.ts +1412 -0
  42. package/src/orchestrator/config-schema.ts +690 -0
  43. package/src/orchestrator/config.ts +81 -0
  44. package/src/orchestrator/context-window.ts +66 -0
  45. package/src/orchestrator/diagnostic-reports.ts +475 -0
  46. package/src/orchestrator/diagnostics.ts +394 -0
  47. package/src/orchestrator/discovery.ts +1833 -0
  48. package/src/orchestrator/engine-worker.ts +415 -0
  49. package/src/orchestrator/engine.ts +5940 -0
  50. package/src/orchestrator/execution.ts +3104 -0
  51. package/src/orchestrator/extension.ts +5934 -0
  52. package/src/orchestrator/formatting.ts +785 -0
  53. package/src/orchestrator/git.ts +88 -0
  54. package/src/orchestrator/index.ts +28 -0
  55. package/src/orchestrator/lane-runner.ts +1787 -0
  56. package/src/orchestrator/mailbox.ts +780 -0
  57. package/src/orchestrator/merge.ts +3414 -0
  58. package/src/orchestrator/messages.ts +1062 -0
  59. package/src/orchestrator/migrations.ts +278 -0
  60. package/src/orchestrator/naming.ts +117 -0
  61. package/src/orchestrator/path-resolver.ts +275 -0
  62. package/src/orchestrator/persistence.ts +2625 -0
  63. package/src/orchestrator/process-registry.ts +452 -0
  64. package/src/orchestrator/quality-gate.ts +1085 -0
  65. package/src/orchestrator/resume.ts +3488 -0
  66. package/src/orchestrator/sessions.ts +57 -0
  67. package/src/orchestrator/settings-loader.ts +136 -0
  68. package/src/orchestrator/settings-tui.ts +2208 -0
  69. package/src/orchestrator/sidecar-telemetry.ts +267 -0
  70. package/src/orchestrator/supervisor.ts +4548 -0
  71. package/src/orchestrator/task-executor-core.ts +675 -0
  72. package/src/orchestrator/tmux-compat.ts +37 -0
  73. package/src/orchestrator/tool-allowlist-constants.ts +37 -0
  74. package/src/orchestrator/types.ts +4465 -0
  75. package/src/orchestrator/verification.ts +547 -0
  76. package/src/orchestrator/waves.ts +1564 -0
  77. package/src/orchestrator/workspace.ts +707 -0
  78. package/src/orchestrator/worktree.ts +2725 -0
  79. package/src/ralph/index.ts +825 -0
  80. package/src/subagents/agents/agent-management.ts +648 -0
  81. package/src/subagents/agents/agent-scope.ts +6 -0
  82. package/src/subagents/agents/agent-selection.ts +23 -0
  83. package/src/subagents/agents/agent-serializer.ts +86 -0
  84. package/src/subagents/agents/agents.ts +832 -0
  85. package/src/subagents/agents/chain-serializer.ts +137 -0
  86. package/src/subagents/agents/frontmatter.ts +29 -0
  87. package/src/subagents/agents/identity.ts +30 -0
  88. package/src/subagents/agents/skills.ts +632 -0
  89. package/src/subagents/extension/config.ts +16 -0
  90. package/src/subagents/extension/control-notices.ts +92 -0
  91. package/src/subagents/extension/doctor.ts +199 -0
  92. package/src/subagents/extension/fanout-child.ts +170 -0
  93. package/src/subagents/extension/index.ts +573 -0
  94. package/src/subagents/extension/schemas.ts +168 -0
  95. package/src/subagents/intercom/intercom-bridge.ts +379 -0
  96. package/src/subagents/intercom/result-intercom.ts +377 -0
  97. package/src/subagents/runs/background/async-execution.ts +712 -0
  98. package/src/subagents/runs/background/async-job-tracker.ts +310 -0
  99. package/src/subagents/runs/background/async-resume.ts +345 -0
  100. package/src/subagents/runs/background/async-status.ts +325 -0
  101. package/src/subagents/runs/background/completion-dedupe.ts +63 -0
  102. package/src/subagents/runs/background/notify.ts +108 -0
  103. package/src/subagents/runs/background/parallel-groups.ts +45 -0
  104. package/src/subagents/runs/background/result-watcher.ts +307 -0
  105. package/src/subagents/runs/background/run-id-resolver.ts +83 -0
  106. package/src/subagents/runs/background/run-status.ts +269 -0
  107. package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
  108. package/src/subagents/runs/background/subagent-runner.ts +1808 -0
  109. package/src/subagents/runs/background/top-level-async.ts +13 -0
  110. package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
  111. package/src/subagents/runs/foreground/chain-execution.ts +938 -0
  112. package/src/subagents/runs/foreground/execution.ts +918 -0
  113. package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
  114. package/src/subagents/runs/shared/completion-guard.ts +147 -0
  115. package/src/subagents/runs/shared/long-running-guard.ts +175 -0
  116. package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
  117. package/src/subagents/runs/shared/model-fallback.ts +103 -0
  118. package/src/subagents/runs/shared/nested-events.ts +819 -0
  119. package/src/subagents/runs/shared/nested-path.ts +52 -0
  120. package/src/subagents/runs/shared/nested-render.ts +115 -0
  121. package/src/subagents/runs/shared/parallel-utils.ts +109 -0
  122. package/src/subagents/runs/shared/pi-args.ts +220 -0
  123. package/src/subagents/runs/shared/pi-spawn.ts +115 -0
  124. package/src/subagents/runs/shared/run-history.ts +60 -0
  125. package/src/subagents/runs/shared/single-output.ts +164 -0
  126. package/src/subagents/runs/shared/subagent-control.ts +226 -0
  127. package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
  128. package/src/subagents/runs/shared/worktree.ts +577 -0
  129. package/src/subagents/shared/artifacts.ts +98 -0
  130. package/src/subagents/shared/atomic-json.ts +16 -0
  131. package/src/subagents/shared/file-coalescer.ts +40 -0
  132. package/src/subagents/shared/fork-context.ts +76 -0
  133. package/src/subagents/shared/formatters.ts +133 -0
  134. package/src/subagents/shared/jsonl-writer.ts +81 -0
  135. package/src/subagents/shared/model-info.ts +78 -0
  136. package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
  137. package/src/subagents/shared/session-identity.ts +10 -0
  138. package/src/subagents/shared/session-tokens.ts +44 -0
  139. package/src/subagents/shared/settings.ts +397 -0
  140. package/src/subagents/shared/status-format.ts +49 -0
  141. package/src/subagents/shared/types.ts +822 -0
  142. package/src/subagents/shared/utils.ts +450 -0
  143. package/src/subagents/slash/prompt-template-bridge.ts +397 -0
  144. package/src/subagents/slash/slash-bridge.ts +174 -0
  145. package/src/subagents/slash/slash-commands.ts +528 -0
  146. package/src/subagents/slash/slash-live-state.ts +292 -0
  147. package/src/subagents/tui/render-helpers.ts +80 -0
  148. package/src/subagents/tui/render.ts +1358 -0
  149. package/templates/agents/local/supervisor.md +33 -0
  150. package/templates/agents/local/task-merger.md +27 -0
  151. package/templates/agents/local/task-reviewer.md +30 -0
  152. package/templates/agents/local/task-worker.md +34 -0
  153. package/templates/agents/supervisor-routing.md +92 -0
  154. package/templates/agents/supervisor.md +229 -0
  155. package/templates/agents/task-merger.md +214 -0
  156. package/templates/agents/task-reviewer.md +260 -0
  157. package/templates/agents/task-worker-segment.md +44 -0
  158. package/templates/agents/task-worker.md +557 -0
  159. package/templates/tasks/CONTEXT.md +30 -0
  160. package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
  161. package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
  162. package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
  163. package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
@@ -0,0 +1,2625 @@
1
+ /**
2
+ * State persistence, serialization, orphan detection
3
+ * @module orch/persistence
4
+ */
5
+ import {
6
+ readFileSync,
7
+ writeFileSync,
8
+ existsSync,
9
+ unlinkSync,
10
+ renameSync,
11
+ mkdirSync,
12
+ appendFileSync,
13
+ readdirSync,
14
+ statSync,
15
+ } from "fs";
16
+ import { join, dirname, basename } from "path";
17
+
18
+ import { execLog } from "./execution.ts";
19
+ import {
20
+ BATCH_STATE_SCHEMA_VERSION,
21
+ StateFileError,
22
+ batchStatePath,
23
+ BATCH_HISTORY_MAX_ENTRIES,
24
+ defaultResilienceState,
25
+ defaultBatchDiagnostics,
26
+ runtimeRoot,
27
+ runtimeManifestPath,
28
+ } from "./types.ts";
29
+ import type { BatchHistorySummary, RuntimeAgentManifest } from "./types.ts";
30
+ import type {
31
+ AllocatedLane,
32
+ DiscoveryResult,
33
+ EngineEvent,
34
+ EscalationContext,
35
+ LaneTaskOutcome,
36
+ LaneTaskStatus,
37
+ MonitorState,
38
+ OrchBatchPhase,
39
+ OrchBatchRuntimeState,
40
+ PersistedBatchState,
41
+ PersistedLaneRecord,
42
+ PersistedMergeResult,
43
+ PersistedSegmentRecord,
44
+ PersistedTaskRecord,
45
+ TaskMonitorSnapshot,
46
+ Tier0RecoveryPattern,
47
+ WorkspaceMode,
48
+ } from "./types.ts";
49
+ import { sleepSync } from "./worktree.ts";
50
+ import type { PreserveFailedLaneProgressResult } from "./worktree.ts";
51
+ import { normalizeLaneSessionAlias, readLaneSessionAliases } from "./tmux-compat.ts";
52
+
53
+ // ── State Persistence Helper (TS-009 Step 2) ────────────────────────
54
+
55
/**
 * List every path where a task's `.DONE` marker may live.
 *
 * Task-runner archives a completed task by moving its folder:
 *   tasks/<task-folder>/ → tasks/archive/<task-folder>/
 *
 * Resume and orphan detection therefore need both the active and the
 * archived location.
 *
 * @param taskFolder - Path to the task folder (active or already archived)
 * @returns Candidate `.DONE` paths, the in-place location first
 */
export function getTaskDoneFileCandidates(taskFolder: string): string[] {
	const inPlaceMarker = join(taskFolder, ".DONE");
	const containingDir = dirname(taskFolder);

	// A folder that already sits directly under "archive" has no second location.
	const alreadyArchived = basename(containingDir).toLowerCase() === "archive";
	if (alreadyArchived) {
		return [inPlaceMarker];
	}

	const archivedMarker = join(containingDir, "archive", basename(taskFolder), ".DONE");
	return [inPlaceMarker, archivedMarker];
}
75
+
76
/**
 * Report whether a `.DONE` marker exists for the task, in either the
 * active or the archived location.
 *
 * @param taskFolder - Path to the task folder
 * @returns true as soon as any candidate marker path exists
 */
export function hasTaskDoneMarker(taskFolder: string): boolean {
	return getTaskDoneFileCandidates(taskFolder).some((candidate) => {
		try {
			return existsSync(candidate);
		} catch {
			// Ignore filesystem errors here; caller handles partial visibility.
			return false;
		}
	});
}
89
+
90
/**
 * Field-by-field equality for the optional telemetry embedded in an outcome.
 *
 * Two absent values are equal; one absent and one present are not.
 */
function sameOutcomeTelemetry(
	a: LaneTaskOutcome["telemetry"],
	b: LaneTaskOutcome["telemetry"],
): boolean {
	if (!a || !b) {
		// At least one side is missing: equal only when both are.
		return !a && !b;
	}

	const left = a;
	const right = b;
	const comparedFields = [
		"inputTokens",
		"outputTokens",
		"cacheReadTokens",
		"cacheWriteTokens",
		"costUsd",
		"toolCalls",
		"durationMs",
	] as const;
	return comparedFields.every((field) => left[field] === right[field]);
}
109
+
110
/**
 * Insert or replace the outcome for `next.taskId` inside `outcomes`.
 *
 * When an entry already exists, `laneNumber` and `telemetry` fall back to
 * the stored values if `next` leaves them null/undefined. The stored entry
 * is only replaced when one of the compared fields actually differs.
 *
 * @param outcomes - Mutable outcome list, updated in-place
 * @param next - Outcome to insert or merge
 * @returns true if the list was modified, false if nothing changed
 */
export function upsertTaskOutcome(outcomes: LaneTaskOutcome[], next: LaneTaskOutcome): boolean {
	const position = outcomes.findIndex((entry) => entry.taskId === next.taskId);
	if (position === -1) {
		outcomes.push(next);
		return true;
	}

	const stored = outcomes[position];
	const candidate: LaneTaskOutcome = {
		...next,
		laneNumber: next.laneNumber ?? stored.laneNumber,
		telemetry: next.telemetry ?? stored.telemetry,
	};

	// Fields compared with strict (identity) equality.
	const identityFields = [
		"status",
		"startTime",
		"endTime",
		"exitReason",
		"sessionName",
		"doneFileFound",
		"laneNumber",
		"partialProgressCommits",
		"partialProgressBranch",
		"exitDiagnostic",
	] as const;

	const differs =
		identityFields.some((field) => stored[field] !== candidate[field]) ||
		!sameOutcomeTelemetry(stored.telemetry, candidate.telemetry);
	if (!differs) {
		return false;
	}

	outcomes[position] = candidate;
	return true;
}
145
+
146
/**
 * Stamp preserved partial progress onto task outcomes (TP-028).
 *
 * Call this after `preserveFailedLaneProgress()` so that each task whose
 * progress was saved carries the saved branch name and commit count. The
 * data then reaches persistence and diagnostics through the normal
 * outcome → serialization path.
 *
 * Results that were not saved, have no saved branch, or reference a task
 * with no outcome entry are skipped.
 *
 * @param ppResult - Result from `preserveFailedLaneProgress()`
 * @param outcomes - Mutable array of task outcomes to update in-place
 * @returns Number of outcomes that were updated
 */
export function applyPartialProgressToOutcomes(
	ppResult: PreserveFailedLaneProgressResult,
	outcomes: LaneTaskOutcome[],
): number {
	let stampedCount = 0;

	for (const preserved of ppResult.results) {
		if (!preserved.saved || !preserved.savedBranch) {
			continue;
		}

		const target = outcomes.find((entry) => entry.taskId === preserved.taskId);
		if (!target) {
			continue;
		}

		target.partialProgressCommits = preserved.commitCount;
		target.partialProgressBranch = preserved.savedBranch;
		stampedCount += 1;
	}

	return stampedCount;
}
174
+
175
/**
 * Register a "pending" outcome for every task of the given lanes that has
 * no outcome entry yet.
 *
 * This gives the persisted state a complete task registry (including lane
 * and session assignment) as soon as a wave starts, before any task has
 * finished. Tasks that already have an outcome are left untouched.
 *
 * @param lanes - Newly allocated lanes
 * @param outcomes - Mutable outcome list, extended in-place
 * @returns true if at least one outcome was added
 */
export function seedPendingOutcomesForAllocatedLanes(
	lanes: AllocatedLane[],
	outcomes: LaneTaskOutcome[],
): boolean {
	let seededAny = false;

	for (const lane of lanes) {
		for (const laneTask of lane.tasks) {
			const alreadyTracked = outcomes.some((entry) => entry.taskId === laneTask.taskId);
			if (alreadyTracked) {
				continue;
			}

			const inserted = upsertTaskOutcome(outcomes, {
				taskId: laneTask.taskId,
				status: "pending",
				startTime: null,
				endTime: null,
				exitReason: "Pending execution",
				sessionName: lane.laneSessionId,
				doneFileFound: false,
				laneNumber: lane.laneNumber,
			});
			seededAny = inserted || seededAny;
		}
	}

	return seededAny;
}
205
+
206
/**
 * Fold the latest monitor snapshot into the accumulated task outcomes.
 *
 * Captures in-wave transitions (pending → running → terminal) so persisted
 * state does not lag until the wave completes. For each lane, in order:
 * remaining tasks are recorded as pending, completed tasks as succeeded,
 * failed tasks as failed, and finally the current task according to its
 * snapshot status. Values already stored on an outcome (start/end time,
 * exit reason, session, lane, telemetry, partial progress, diagnostic)
 * take precedence over values derived from the monitor.
 *
 * @param monitorState - Latest monitor poll result
 * @param outcomes - Mutable outcome list, updated in-place
 * @returns true if any outcome was added or modified
 */
export function syncTaskOutcomesFromMonitor(
	monitorState: MonitorState,
	outcomes: LaneTaskOutcome[],
): boolean {
	let anyChanged = false;

	const findOutcome = (taskId: LaneTaskOutcome["taskId"]): LaneTaskOutcome | undefined =>
		outcomes.find((entry) => entry.taskId === taskId);

	for (const lane of monitorState.lanes) {
		// Upsert one task for this lane, carrying over everything the stored
		// outcome already knows and falling back to lane-level values.
		const record = (update: {
			taskId: LaneTaskOutcome["taskId"];
			prior: LaneTaskOutcome | undefined;
			status: LaneTaskStatus;
			startTime: LaneTaskOutcome["startTime"];
			endTime: LaneTaskOutcome["endTime"];
			fallbackReason: string;
			doneFileFound: LaneTaskOutcome["doneFileFound"];
		}): void => {
			const { prior } = update;
			const updated = upsertTaskOutcome(outcomes, {
				taskId: update.taskId,
				status: update.status,
				startTime: update.startTime,
				endTime: update.endTime,
				exitReason: prior?.exitReason || update.fallbackReason,
				sessionName: prior?.sessionName || lane.sessionName,
				doneFileFound: update.doneFileFound,
				laneNumber: prior?.laneNumber ?? lane.laneNumber,
				telemetry: prior?.telemetry,
				partialProgressCommits: prior?.partialProgressCommits,
				partialProgressBranch: prior?.partialProgressBranch,
				exitDiagnostic: prior?.exitDiagnostic,
			});
			anyChanged = updated || anyChanged;
		};

		// Remaining tasks => pending (unless already succeeded/failed/stalled).
		for (const taskId of lane.remainingTasks) {
			const prior = findOutcome(taskId);
			const alreadySettled =
				prior !== undefined &&
				(prior.status === "succeeded" || prior.status === "failed" || prior.status === "stalled");
			if (alreadySettled) {
				continue;
			}
			record({
				taskId,
				prior,
				status: "pending",
				startTime: prior?.startTime ?? null,
				endTime: null,
				fallbackReason: "Pending execution",
				doneFileFound: false,
			});
		}

		// Completed tasks => succeeded.
		// Reuse a stored endTime when present — otherwise every poll tick would
		// report a change (lastPollTime differs each tick, causing persist log spam).
		for (const taskId of lane.completedTasks) {
			const prior = findOutcome(taskId);
			record({
				taskId,
				prior,
				status: "succeeded",
				startTime: prior?.startTime ?? null,
				endTime: prior?.endTime ?? monitorState.lastPollTime,
				fallbackReason: ".DONE file created by task-runner",
				doneFileFound: true,
			});
		}

		// Failed tasks => failed.
		for (const taskId of lane.failedTasks) {
			const prior = findOutcome(taskId);
			record({
				taskId,
				prior,
				status: "failed",
				startTime: prior?.startTime ?? null,
				endTime: prior?.endTime ?? monitorState.lastPollTime,
				fallbackReason: "Task failed or stalled",
				doneFileFound: false,
			});
		}

		// Current task snapshot => running/stalled/succeeded/failed/skipped.
		const currentTaskId = lane.currentTaskId;
		const snapshot = lane.currentTaskSnapshot;
		if (currentTaskId && snapshot) {
			const prior = findOutcome(currentTaskId);

			// An "unknown" snapshot keeps the stored status, defaulting to "running".
			const statusBySnapshot: Record<TaskMonitorSnapshot["status"], LaneTaskStatus> = {
				pending: "pending",
				running: "running",
				succeeded: "succeeded",
				failed: "failed",
				stalled: "stalled",
				skipped: "skipped",
				unknown: prior?.status || "running",
			};
			const status = statusBySnapshot[snapshot.status];
			const isTerminal =
				status === "succeeded" || status === "failed" || status === "stalled" || status === "skipped";

			// TP-051: Use snapshot.observedAt (Date.now() from monitor poll) instead of
			// snapshot.lastHeartbeat (STATUS.md mtime) for task start time. The mtime
			// reflects when STATUS.md was last edited, which may be long before
			// actual execution started (e.g., during task staging).
			record({
				taskId: currentTaskId,
				prior,
				status,
				startTime: prior?.startTime ?? snapshot.observedAt,
				endTime: isTerminal ? (prior?.endTime ?? snapshot.observedAt) : null,
				fallbackReason:
					status === "running"
						? "Task in progress"
						: snapshot.stallReason || "Task reached terminal state",
				doneFileFound: snapshot.doneFileFound,
			});
		}
	}

	return anyChanged;
}
337
+
338
/**
 * Persist current runtime state to `.pi/batch-state.json`.
 *
 * Serializes the runtime state, enriches task records with folder paths and
 * repo/segment fields from discovery (when available), and hands the JSON to
 * `saveBatchState()`. Each successful write is logged with the reason,
 * batchId, phase, and waveIndex.
 *
 * Write failures are non-fatal: they are logged and appended to
 * `batchState.errors`, but never thrown to the caller.
 *
 * @param reason - Human-readable reason for this state write (e.g., "batch-start", "wave-index-change")
 * @param batchState - Current runtime batch state
 * @param wavePlan - Wave plan (array of arrays of task IDs)
 * @param lanes - Currently allocated lanes (latest wave's lanes)
 * @param allTaskOutcomes - All task outcomes accumulated across completed waves
 * @param discovery - Discovery result (for enriching taskFolder paths), or null to skip enrichment
 * @param repoRoot - Absolute path to the repository root
 */
export function persistRuntimeState(
	reason: string,
	batchState: OrchBatchRuntimeState,
	wavePlan: string[][],
	lanes: AllocatedLane[],
	allTaskOutcomes: LaneTaskOutcome[],
	discovery: DiscoveryResult | null,
	repoRoot: string,
): void {
	try {
		let payload = serializeBatchState(batchState, wavePlan, lanes, allTaskOutcomes);

		if (discovery) {
			// Enrich task records with folder paths and repo fields from discovery.
			const state = JSON.parse(payload) as PersistedBatchState;
			for (const taskRecord of state.tasks) {
				const discovered = discovery.pending.get(taskRecord.taskId);
				if (!discovered) {
					continue;
				}

				taskRecord.taskFolder = discovered.taskFolder;

				// v2+: fill fields for tasks not yet allocated (pending in future waves).
				// A field is only written when the record lacks it and discovery has it.
				const backfill: ReadonlyArray<readonly [string, unknown]> = [
					["repoId", discovered.promptRepoId],
					["resolvedRepoId", discovered.resolvedRepoId],
					["packetRepoId", discovered.packetRepoId],
					["packetTaskPath", discovered.packetTaskPath],
					["segmentIds", discovered.segmentIds],
					["activeSegmentId", discovered.activeSegmentId],
				];
				// Several of these keys are accessed untyped, as in the original code.
				const looseRecord = taskRecord as any;
				for (const [key, value] of backfill) {
					if (looseRecord[key] === undefined && value !== undefined) {
						looseRecord[key] = value;
					}
				}
			}
			payload = JSON.stringify(state, null, 2);
		}

		saveBatchState(payload, repoRoot);

		execLog("state", batchState.batchId, `persisted: ${reason}`, {
			phase: batchState.phase,
			waveIndex: batchState.currentWaveIndex,
		});
	} catch (err: unknown) {
		let msg: string;
		if (err instanceof StateFileError) {
			msg = `[${err.code}] ${err.message}`;
		} else if (err instanceof Error) {
			msg = err.message;
		} else {
			msg = String(err);
		}

		execLog("state", batchState.batchId, `write failed: ${msg}`, {
			reason,
			phase: batchState.phase,
		});
		batchState.errors.push(`State persistence failed (${reason}): ${msg}`);
	}
}
426
+
427
+ // ── State Validation ─────────────────────────────────────────────────
428
+
429
/**
 * Every OrchBatchPhase value accepted when validating persisted state.
 * Member order is kept stable because it is echoed in validation error messages.
 */
export const VALID_BATCH_PHASES: ReadonlySet<string> = new Set<string>([
	"idle",
	"launching",
	"planning",
	"executing",
	"merging",
	"paused",
	"stopped",
	"completed",
	"failed",
]);

/** Every LaneTaskStatus value accepted when validating persisted state. */
export const VALID_TASK_STATUSES: ReadonlySet<string> = new Set<string>([
	"pending",
	"running",
	"succeeded",
	"failed",
	"stalled",
	"skipped",
]);

/** Every merge result status accepted in persisted state. */
export const VALID_PERSISTED_MERGE_STATUSES: ReadonlySet<string> = new Set<string>([
	"succeeded",
	"failed",
	"partial",
]);
458
+
459
/**
 * Upconvert a v1 state object to v2 in-memory.
 *
 * Per the module's migration notes, `validatePersistedState()` applies this
 * when a v1 file is loaded. Only the passed object is mutated — nothing is
 * written to disk here.
 *
 * v1→v2 field defaults:
 * - `schemaVersion`: bumped to 2
 * - `baseBranch`: "" when missing or falsy
 * - `mode`: "repo" when missing or falsy (v1 was always single-repo)
 * - `tasks[].repoId`, `tasks[].resolvedRepoId`: left undefined (repo mode
 *   has no repo routing)
 * - `lanes[].repoId`: left as-is if present
 *
 * Idempotent: an object whose schemaVersion is already >= 2 is not touched.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV1toV2(obj: Record<string, unknown>): void {
	const alreadyMigrated = (obj.schemaVersion as number) >= 2;
	if (alreadyMigrated) {
		return;
	}

	obj.schemaVersion = 2;
	obj.baseBranch = obj.baseBranch || "";
	obj.mode = obj.mode || "repo";
	// Task and lane records need no mutation: the optional v2 fields are
	// simply absent on v1 objects, which is their default.
}
486
+
487
/**
 * Upconvert a v2 state object to v3 by adding the resilience and
 * diagnostics sections.
 *
 * Added fields (only when missing or falsy):
 * - `resilience`: value of `defaultResilienceState()`
 * - `diagnostics`: value of `defaultBatchDiagnostics()`
 *
 * Idempotent: an object whose schemaVersion is already >= 3 is not touched.
 * In particular, a native v3 object that lacks these sections is NOT
 * patched here — it is left for validation to reject as malformed.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV2toV3(obj: Record<string, unknown>): void {
	const alreadyMigrated = (obj.schemaVersion as number) >= 3;
	if (alreadyMigrated) {
		return;
	}

	obj.schemaVersion = 3;
	// Backfill happens only on a genuine v1/v2 → v3 migration.
	obj.resilience = obj.resilience || defaultResilienceState();
	obj.diagnostics = obj.diagnostics || defaultBatchDiagnostics();
}
508
+
509
/**
 * Upconvert a v3 state object to v4 by adding the `segments` array.
 *
 * Added fields:
 * - `segments`: empty array when missing or falsy
 *
 * The optional task-level segment fields (`packetRepoId`, `packetTaskPath`,
 * `segmentIds`, `activeSegmentId`) are deliberately not backfilled: their
 * values come from runtime discovery, not from migration defaults.
 *
 * Idempotent: an object whose schemaVersion is already >= 4 is not touched.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV3toV4(obj: Record<string, unknown>): void {
	const alreadyMigrated = (obj.schemaVersion as number) >= 4;
	if (alreadyMigrated) {
		return;
	}

	obj.schemaVersion = 4;
	// Backfill happens only on a genuine v3 → v4 migration.
	obj.segments = obj.segments || [];
}
531
+
532
+ /**
533
+ * Validate a parsed JSON object as a PersistedBatchState.
534
+ *
535
+ * Checks:
536
+ * 1. Schema version is 1, 2, or 3 (each auto-upconverted to v4), or 4 (current)
537
+ * 2. All required fields are present with correct types
538
+ * 3. Enum fields contain valid values (phase, task statuses, merge statuses)
539
+ * 4. Arrays contain valid sub-records
540
+ * 5. v2 optional fields (repoId, resolvedRepoId, mode) are valid when present
541
+ *
542
+ * @param data - Parsed JSON (unknown type)
543
+ * @returns Validated PersistedBatchState (always the current schema version, even if input was v1/v2/v3)
544
+ * @throws StateFileError with STATE_SCHEMA_INVALID on any validation failure
545
+ */
546
/**
 * Validate a parsed batch-state document and return it typed as
 * PersistedBatchState.
 *
 * The input object is validated AND mutated in place; the returned value is
 * the same object reference, cast to PersistedBatchState:
 *   - `orchBranch` is defaulted to "" when absent,
 *   - lane session aliases are passed through `normalizeLaneSessionAlias`,
 *   - the v1→v2→v3→v4 upconverters are applied (in memory only; the on-disk
 *     file is NOT rewritten),
 *   - unknown top-level keys are collected into `_extraFields` so they
 *     survive a later serialization roundtrip.
 *
 * Checks run in a fixed order and the first violation wins, so the order of
 * the helper calls below is part of the observable behaviour.
 *
 * @param data - Arbitrary value, typically the result of JSON.parse.
 * @returns The same object, validated and upconverted to the current schema.
 * @throws StateFileError with code "STATE_SCHEMA_INVALID" on the first
 *         schema violation encountered.
 */
export function validatePersistedState(data: unknown): PersistedBatchState {
  if (!data || typeof data !== "object") {
    throw persistedSchemaError("Batch state must be a non-null object");
  }
  const obj = data as Record<string, unknown>;

  checkPersistedTopLevelFields(obj);
  checkPersistedWavePlan(obj.wavePlan as unknown[]);

  // Keep this reference: the post-upconversion task pass further down walks
  // the very same array that was validated here.
  const tasks = obj.tasks as unknown[];
  for (let i = 0; i < tasks.length; i++) {
    checkPersistedTaskRecord(tasks[i], i);
  }

  const lanes = obj.lanes as unknown[];
  let sawLegacyTmuxSessionName = false;
  for (let i = 0; i < lanes.length; i++) {
    if (checkPersistedLaneRecord(lanes[i], i)) {
      sawLegacyTmuxSessionName = true;
    }
  }
  // Warn only once, and only after every lane passed validation.
  if (sawLegacyTmuxSessionName) {
    console.error(
      "[orchid] migration: detected legacy lanes[].tmuxSessionName in .pi/batch-state.json; " +
        "normalized to lanes[].laneSessionId for this release. Re-save state (or re-run /orch-resume) to persist canonical fields.",
    );
  }

  const mergeResults = obj.mergeResults as unknown[];
  for (let i = 0; i < mergeResults.length; i++) {
    checkPersistedMergeResult(mergeResults[i], i);
  }

  checkPersistedErrorFields(obj);

  // ── v1→v2→v3→v4 upconversion ─────────────────────────────────
  // Apply defaults for fields that may be absent in older state files.
  // Chain: v1→v2 then v2→v3 then v3→v4 (each is expected to be a no-op when
  // the state is already at its target version).
  upconvertV1toV2(obj);
  upconvertV2toV3(obj);
  upconvertV3toV4(obj);

  // Sections below are validated AFTER upconversion, so they must be present
  // regardless of the schema version that was read.
  checkPersistedResilience(obj.resilience);
  checkPersistedDiagnostics(obj.diagnostics);

  for (let i = 0; i < tasks.length; i++) {
    checkPersistedTaskPostUpconvert(tasks[i] as Record<string, unknown>, i);
  }

  checkPersistedSegments(obj.segments);
  capturePersistedExtraFields(obj);

  return obj as unknown as PersistedBatchState;
}

/** Build the canonical schema-violation error (callers `throw` the result). */
function persistedSchemaError(message: string): StateFileError {
  return new StateFileError("STATE_SCHEMA_INVALID", message);
}

/**
 * Reject `value` when it is present (not undefined) but not of the expected
 * primitive type. Message shape: `<label> is not a <type> (got <typeof>)`.
 */
function checkOptionalPersistedType(
  value: unknown,
  label: string,
  expected: "string" | "number",
): void {
  if (value !== undefined && typeof value !== expected) {
    throw persistedSchemaError(`${label} is not a ${expected} (got ${typeof value})`);
  }
}

/**
 * Reject `value` unless it is of the expected primitive type.
 * Message shape: `<label> must be a <type> (got <typeof>)`.
 */
function requirePersistedType(
  value: unknown,
  label: string,
  expected: "string" | "number" | "boolean",
): void {
  if (typeof value !== expected) {
    throw persistedSchemaError(`${label} must be a ${expected} (got ${typeof value})`);
  }
}

/**
 * Validate schema version and all scalar / array-typed top-level fields.
 * Also defaults `orchBranch` to "" when it is absent.
 */
function checkPersistedTopLevelFields(obj: Record<string, unknown>): void {
  // ── Schema version ───────────────────────────────────────────
  const version = obj.schemaVersion;
  if (typeof version !== "number") {
    throw persistedSchemaError(
      `Missing or invalid "schemaVersion" field (expected number, got ${typeof version})`,
    );
  }
  // Accept v1, v2, v3 (all upconverted later) and the current version.
  // Reject anything else — including future versions from newer runtimes.
  const acceptedVersions = [1, 2, 3, BATCH_STATE_SCHEMA_VERSION];
  if (!acceptedVersions.includes(version)) {
    throw persistedSchemaError(
      `Unsupported schema version ${version} (expected ${BATCH_STATE_SCHEMA_VERSION}). ` +
        `Upgrade orchid to a version that supports schema v${version}, ` +
        `or delete .pi/batch-state.json and re-run the batch.`,
    );
  }

  // ── Required string fields ───────────────────────────────────
  for (const field of ["phase", "batchId"] as const) {
    if (typeof obj[field] !== "string") {
      throw persistedSchemaError(
        `Missing or invalid "${field}" field (expected string, got ${typeof obj[field]})`,
      );
    }
  }

  // ── Optional string fields (backward-compatible) ─────────────
  // Both may be absent in older files; only their type is checked here.
  for (const field of ["baseBranch", "orchBranch"] as const) {
    if (obj[field] !== undefined && typeof obj[field] !== "string") {
      throw persistedSchemaError(
        `Invalid "${field}" field (expected string, got ${typeof obj[field]})`,
      );
    }
  }
  // Only orchBranch is defaulted at this point; baseBranch is left as-is.
  if (obj.orchBranch === undefined) {
    obj.orchBranch = "";
  }

  // ── mode ─────────────────────────────────────────────────────
  // Required for every accepted version except v1. The error text below says
  // "schema v2" for all of them and is kept verbatim.
  const mode = obj.mode;
  if (version !== 1 && mode === undefined) {
    throw persistedSchemaError(
      `Missing required "mode" field in schema v2 (expected "repo" or "workspace")`,
    );
  }
  if (mode !== undefined) {
    if (typeof mode !== "string") {
      throw persistedSchemaError(`Invalid "mode" field (expected string, got ${typeof mode})`);
    }
    if (mode !== "repo" && mode !== "workspace") {
      throw persistedSchemaError(
        `Invalid "mode" value "${mode}" (expected "repo" or "workspace")`,
      );
    }
  }

  // ── Phase enum validation ────────────────────────────────────
  if (!VALID_BATCH_PHASES.has(obj.phase as string)) {
    throw persistedSchemaError(
      `Invalid "phase" value "${obj.phase}" (expected one of: ${[...VALID_BATCH_PHASES].join(", ")})`,
    );
  }

  // ── Required number fields ───────────────────────────────────
  for (const field of [
    "startedAt",
    "updatedAt",
    "currentWaveIndex",
    "totalWaves",
    "totalTasks",
    "succeededTasks",
    "failedTasks",
    "skippedTasks",
    "blockedTasks",
  ] as const) {
    if (typeof obj[field] !== "number") {
      throw persistedSchemaError(
        `Missing or invalid "${field}" field (expected number, got ${typeof obj[field]})`,
      );
    }
  }

  // ── Nullable number: endedAt (undefined is NOT accepted) ─────
  if (obj.endedAt !== null && typeof obj.endedAt !== "number") {
    throw persistedSchemaError(
      `Invalid "endedAt" field (expected number or null, got ${typeof obj.endedAt})`,
    );
  }

  // ── Required arrays ──────────────────────────────────────────
  for (const field of [
    "wavePlan",
    "lanes",
    "tasks",
    "mergeResults",
    "blockedTaskIds",
    "errors",
  ] as const) {
    if (!Array.isArray(obj[field])) {
      throw persistedSchemaError(
        `Missing or invalid "${field}" field (expected array, got ${typeof obj[field]})`,
      );
    }
  }
}

/** Validate that wavePlan is an array of arrays of strings. */
function checkPersistedWavePlan(wavePlan: unknown[]): void {
  for (let i = 0; i < wavePlan.length; i++) {
    const wave = wavePlan[i];
    if (!Array.isArray(wave)) {
      throw persistedSchemaError(`wavePlan[${i}] is not an array`);
    }
    for (const taskId of wave as unknown[]) {
      if (typeof taskId !== "string") {
        throw persistedSchemaError(`wavePlan[${i}] contains non-string value: ${typeof taskId}`);
      }
    }
  }
}

/** Validate one tasks[] entry (fields checked before upconversion). */
function checkPersistedTaskRecord(entry: unknown, i: number): void {
  const t = entry as Record<string, unknown>;
  if (!t || typeof t !== "object") {
    throw persistedSchemaError(`tasks[${i}] is not an object`);
  }
  for (const field of ["taskId", "sessionName", "taskFolder", "exitReason"] as const) {
    if (typeof t[field] !== "string") {
      throw persistedSchemaError(`tasks[${i}].${field} is missing or not a string`);
    }
  }
  if (typeof t.laneNumber !== "number") {
    throw persistedSchemaError(`tasks[${i}].laneNumber is missing or not a number`);
  }
  const status = t.status;
  if (typeof status !== "string" || !VALID_TASK_STATUSES.has(status)) {
    throw persistedSchemaError(
      `tasks[${i}].status is invalid: "${status}" (expected one of: ${[...VALID_TASK_STATUSES].join(", ")})`,
    );
  }
  // Timestamps must be explicitly null or a number (undefined is rejected).
  for (const field of ["startedAt", "endedAt"] as const) {
    if (t[field] !== null && typeof t[field] !== "number") {
      throw persistedSchemaError(`tasks[${i}].${field} is not a number or null`);
    }
  }
  if (typeof t.doneFileFound !== "boolean") {
    throw persistedSchemaError(`tasks[${i}].doneFileFound is missing or not a boolean`);
  }

  // v2 optional fields: repoId, resolvedRepoId (string | undefined)
  checkOptionalPersistedType(t.repoId, `tasks[${i}].repoId`, "string");
  checkOptionalPersistedType(t.resolvedRepoId, `tasks[${i}].resolvedRepoId`, "string");
  // TP-028 optional fields: partial progress commit count / branch name
  checkOptionalPersistedType(t.partialProgressCommits, `tasks[${i}].partialProgressCommits`, "number");
  checkOptionalPersistedType(t.partialProgressBranch, `tasks[${i}].partialProgressBranch`, "string");

  // TP-026 optional field: exitDiagnostic (plain object with a string classification)
  const diagnostic = t.exitDiagnostic;
  if (diagnostic !== undefined) {
    if (typeof diagnostic !== "object" || diagnostic === null || Array.isArray(diagnostic)) {
      throw persistedSchemaError(
        `tasks[${i}].exitDiagnostic is not a plain object (got ${Array.isArray(diagnostic) ? "array" : typeof diagnostic})`,
      );
    }
    const classification = (diagnostic as Record<string, unknown>).classification;
    if (typeof classification !== "string") {
      throw persistedSchemaError(
        `tasks[${i}].exitDiagnostic.classification is not a string (got ${typeof classification})`,
      );
    }
  }
}

/**
 * Validate one lanes[] entry and normalize its session alias.
 *
 * @returns true when the lane carried a string `tmuxSessionName` (legacy
 *          alias), so the caller can emit the migration warning.
 */
function checkPersistedLaneRecord(entry: unknown, i: number): boolean {
  const l = entry as Record<string, unknown>;
  if (!l || typeof l !== "object") {
    throw persistedSchemaError(`lanes[${i}] is not an object`);
  }
  for (const field of ["laneId", "worktreePath", "branch"] as const) {
    if (typeof l[field] !== "string") {
      throw persistedSchemaError(`lanes[${i}].${field} is missing or not a string`);
    }
  }

  const { laneSessionId, tmuxSessionName } = readLaneSessionAliases(l);
  checkOptionalPersistedType(laneSessionId, `lanes[${i}].laneSessionId`, "string");
  checkOptionalPersistedType(tmuxSessionName, `lanes[${i}].tmuxSessionName`, "string");

  const hasLegacyAlias = typeof tmuxSessionName === "string";
  if (typeof laneSessionId !== "string" && !hasLegacyAlias) {
    throw persistedSchemaError(
      `lanes[${i}] must include either laneSessionId or tmuxSessionName as a string`,
    );
  }

  // NOTE(review): presumably rewrites the legacy alias onto laneSessionId in
  // place (per the migration warning text) — confirm against the helper.
  normalizeLaneSessionAlias(l);

  if (typeof l.laneNumber !== "number") {
    throw persistedSchemaError(`lanes[${i}].laneNumber is missing or not a number`);
  }
  if (!Array.isArray(l.taskIds)) {
    throw persistedSchemaError(`lanes[${i}].taskIds is missing or not an array`);
  }
  // v2 optional field: repoId (string | undefined)
  checkOptionalPersistedType(l.repoId, `lanes[${i}].repoId`, "string");

  return hasLegacyAlias;
}

/** Validate one mergeResults[] entry, including optional per-repo results. */
function checkPersistedMergeResult(entry: unknown, i: number): void {
  const m = entry as Record<string, unknown>;
  if (!m || typeof m !== "object") {
    throw persistedSchemaError(`mergeResults[${i}] is not an object`);
  }
  if (typeof m.waveIndex !== "number") {
    throw persistedSchemaError(`mergeResults[${i}].waveIndex is missing or not a number`);
  }
  const status = m.status;
  if (typeof status !== "string" || !VALID_PERSISTED_MERGE_STATUSES.has(status)) {
    throw persistedSchemaError(
      `mergeResults[${i}].status is invalid: "${status}" (expected one of: ${[...VALID_PERSISTED_MERGE_STATUSES].join(", ")})`,
    );
  }

  // v2 optional field: repoResults (array | undefined)
  if (m.repoResults === undefined) {
    return;
  }
  if (!Array.isArray(m.repoResults)) {
    throw persistedSchemaError(
      `mergeResults[${i}].repoResults is not an array (got ${typeof m.repoResults})`,
    );
  }
  const repoResults = m.repoResults as unknown[];
  for (let j = 0; j < repoResults.length; j++) {
    const rr = repoResults[j] as Record<string, unknown>;
    if (!rr || typeof rr !== "object") {
      throw persistedSchemaError(`mergeResults[${i}].repoResults[${j}] is not an object`);
    }
    const repoStatus = rr.status;
    if (typeof repoStatus !== "string" || !VALID_PERSISTED_MERGE_STATUSES.has(repoStatus)) {
      throw persistedSchemaError(
        `mergeResults[${i}].repoResults[${j}].status is invalid: "${repoStatus}"`,
      );
    }
    if (!Array.isArray(rr.laneNumbers)) {
      throw persistedSchemaError(
        `mergeResults[${i}].repoResults[${j}].laneNumbers is not an array`,
      );
    }
  }
}

/** Validate lastError (object | null), blockedTaskIds and errors (string[]). */
function checkPersistedErrorFields(obj: Record<string, unknown>): void {
  // lastError must be explicitly null or an object with string code/message;
  // an absent (undefined) lastError is rejected by the typeof check.
  if (obj.lastError !== null) {
    if (typeof obj.lastError !== "object") {
      throw persistedSchemaError(`lastError is not an object or null`);
    }
    const le = obj.lastError as Record<string, unknown>;
    if (typeof le.code !== "string" || typeof le.message !== "string") {
      throw persistedSchemaError(
        `lastError must have "code" (string) and "message" (string) fields`,
      );
    }
  }

  for (const id of obj.blockedTaskIds as unknown[]) {
    if (typeof id !== "string") {
      throw persistedSchemaError(`blockedTaskIds contains non-string value: ${typeof id}`);
    }
  }

  for (const err of obj.errors as unknown[]) {
    if (typeof err !== "string") {
      throw persistedSchemaError(`errors array contains non-string value: ${typeof err}`);
    }
  }
}

/** Validate the v3 `resilience` section (expected to exist after upconversion). */
function checkPersistedResilience(section: unknown): void {
  if (!section || typeof section !== "object") {
    throw persistedSchemaError(
      `Missing or invalid "resilience" section (expected object, got ${typeof section})`,
    );
  }
  const res = section as Record<string, unknown>;

  requirePersistedType(res.resumeForced, "resilience.resumeForced", "boolean");

  const retryCounts = res.retryCountByScope;
  if (!retryCounts || typeof retryCounts !== "object" || Array.isArray(retryCounts)) {
    throw persistedSchemaError(
      `resilience.retryCountByScope must be an object (got ${typeof retryCounts})`,
    );
  }
  // Deep-validate retryCountByScope: all values must be numbers.
  for (const [scope, count] of Object.entries(retryCounts as Record<string, unknown>)) {
    requirePersistedType(count, `resilience.retryCountByScope["${scope}"]`, "number");
  }

  if (res.lastFailureClass !== null && typeof res.lastFailureClass !== "string") {
    throw persistedSchemaError(
      `resilience.lastFailureClass must be a string or null (got ${typeof res.lastFailureClass})`,
    );
  }

  if (!Array.isArray(res.repairHistory)) {
    throw persistedSchemaError(
      `resilience.repairHistory must be an array (got ${typeof res.repairHistory})`,
    );
  }
  // Built once per call rather than once per history entry.
  const repairStatuses = new Set(["succeeded", "failed", "skipped"]);
  const history = res.repairHistory as unknown[];
  for (let i = 0; i < history.length; i++) {
    const rec = history[i];
    if (!rec || typeof rec !== "object") {
      throw persistedSchemaError(
        `resilience.repairHistory[${i}] must be an object (got ${typeof rec})`,
      );
    }
    const r = rec as Record<string, unknown>;
    const label = `resilience.repairHistory[${i}]`;
    requirePersistedType(r.id, `${label}.id`, "string");
    requirePersistedType(r.strategy, `${label}.strategy`, "string");
    if (typeof r.status !== "string" || !repairStatuses.has(r.status)) {
      throw persistedSchemaError(
        `resilience.repairHistory[${i}].status must be "succeeded"|"failed"|"skipped" (got ${JSON.stringify(r.status)})`,
      );
    }
    requirePersistedType(r.startedAt, `${label}.startedAt`, "number");
    requirePersistedType(r.endedAt, `${label}.endedAt`, "number");
    // repoId is optional — validate type only if present.
    if (r.repoId !== undefined && typeof r.repoId !== "string") {
      throw persistedSchemaError(
        `resilience.repairHistory[${i}].repoId must be a string when present (got ${typeof r.repoId})`,
      );
    }
  }
}

/** Validate the v3 `diagnostics` section (expected to exist after upconversion). */
function checkPersistedDiagnostics(section: unknown): void {
  if (!section || typeof section !== "object") {
    throw persistedSchemaError(
      `Missing or invalid "diagnostics" section (expected object, got ${typeof section})`,
    );
  }
  const diag = section as Record<string, unknown>;

  const taskExits = diag.taskExits;
  if (!taskExits || typeof taskExits !== "object" || Array.isArray(taskExits)) {
    throw persistedSchemaError(
      `diagnostics.taskExits must be an object (got ${typeof taskExits})`,
    );
  }
  // Deep-validate taskExits entries.
  for (const [taskId, entry] of Object.entries(taskExits as Record<string, unknown>)) {
    if (!entry || typeof entry !== "object") {
      throw persistedSchemaError(
        `diagnostics.taskExits["${taskId}"] must be an object (got ${typeof entry})`,
      );
    }
    const te = entry as Record<string, unknown>;
    const label = `diagnostics.taskExits["${taskId}"]`;
    requirePersistedType(te.classification, `${label}.classification`, "string");
    requirePersistedType(te.cost, `${label}.cost`, "number");
    requirePersistedType(te.durationSec, `${label}.durationSec`, "number");
    // retries is optional — validate type only if present.
    if (te.retries !== undefined && typeof te.retries !== "number") {
      throw persistedSchemaError(
        `diagnostics.taskExits["${taskId}"].retries must be a number when present (got ${typeof te.retries})`,
      );
    }
  }

  requirePersistedType(diag.batchCost, "diagnostics.batchCost", "number");
}

/**
 * Validate task fields that are checked only after upconversion:
 * exitDiagnostic (re-checked, since the upconverters run in between) and the
 * v4 optional fields packetRepoId, packetTaskPath, segmentIds, activeSegmentId.
 */
function checkPersistedTaskPostUpconvert(t: Record<string, unknown>, i: number): void {
  if (t.exitDiagnostic !== undefined) {
    if (!t.exitDiagnostic || typeof t.exitDiagnostic !== "object") {
      throw persistedSchemaError(
        `tasks[${i}].exitDiagnostic must be an object when present (got ${typeof t.exitDiagnostic})`,
      );
    }
    const ed = t.exitDiagnostic as Record<string, unknown>;
    requirePersistedType(ed.classification, `tasks[${i}].exitDiagnostic.classification`, "string");
  }

  // v4 optional fields: packetRepoId, packetTaskPath (string | undefined)
  checkOptionalPersistedType(t.packetRepoId, `tasks[${i}].packetRepoId`, "string");
  checkOptionalPersistedType(t.packetTaskPath, `tasks[${i}].packetTaskPath`, "string");

  // v4 optional field: segmentIds (string[] | undefined)
  if (t.segmentIds !== undefined) {
    if (!Array.isArray(t.segmentIds)) {
      throw persistedSchemaError(
        `tasks[${i}].segmentIds is not an array (got ${typeof t.segmentIds})`,
      );
    }
    const segmentIds = t.segmentIds as unknown[];
    for (let j = 0; j < segmentIds.length; j++) {
      if (typeof segmentIds[j] !== "string") {
        throw persistedSchemaError(`tasks[${i}].segmentIds[${j}] is not a string`);
      }
    }
  }

  // v4 optional field: activeSegmentId (string | null | undefined)
  const active = t.activeSegmentId;
  if (active !== undefined && active !== null && typeof active !== "string") {
    throw persistedSchemaError(
      `tasks[${i}].activeSegmentId is not a string or null (got ${typeof active})`,
    );
  }
}

/** Validate the v4 `segments` array and every segment record inside it. */
function checkPersistedSegments(value: unknown): void {
  if (!Array.isArray(value)) {
    throw persistedSchemaError(
      `Missing or invalid "segments" field (expected array, got ${typeof value})`,
    );
  }
  const segments = value as unknown[];
  for (let i = 0; i < segments.length; i++) {
    const s = segments[i] as Record<string, unknown>;
    if (!s || typeof s !== "object") {
      throw persistedSchemaError(`segments[${i}] is not an object`);
    }

    // Required string fields
    for (const field of [
      "segmentId",
      "taskId",
      "repoId",
      "laneId",
      "sessionName",
      "worktreePath",
      "branch",
      "exitReason",
    ] as const) {
      if (typeof s[field] !== "string") {
        throw persistedSchemaError(
          `segments[${i}].${field} is missing or not a string (got ${typeof s[field]})`,
        );
      }
    }

    // Required status field (same valid values as task status)
    const status = s.status;
    if (typeof status !== "string" || !VALID_TASK_STATUSES.has(status)) {
      throw persistedSchemaError(
        `segments[${i}].status is invalid: "${status}" (expected one of: ${[...VALID_TASK_STATUSES].join(", ")})`,
      );
    }

    // Nullable number fields: startedAt, endedAt
    for (const field of ["startedAt", "endedAt"] as const) {
      if (s[field] !== null && typeof s[field] !== "number") {
        throw persistedSchemaError(
          `segments[${i}].${field} is not a number or null (got ${typeof s[field]})`,
        );
      }
    }

    // Required number: retries
    if (typeof s.retries !== "number") {
      throw persistedSchemaError(
        `segments[${i}].retries is not a number (got ${typeof s.retries})`,
      );
    }

    // Required array of strings: dependsOnSegmentIds
    if (!Array.isArray(s.dependsOnSegmentIds)) {
      throw persistedSchemaError(
        `segments[${i}].dependsOnSegmentIds is not an array (got ${typeof s.dependsOnSegmentIds})`,
      );
    }
    const dependsOn = s.dependsOnSegmentIds as unknown[];
    for (let j = 0; j < dependsOn.length; j++) {
      if (typeof dependsOn[j] !== "string") {
        throw persistedSchemaError(`segments[${i}].dependsOnSegmentIds[${j}] is not a string`);
      }
    }

    // Optional strings: expandedFrom, expansionRequestId
    for (const field of ["expandedFrom", "expansionRequestId"] as const) {
      if (s[field] !== undefined && typeof s[field] !== "string") {
        throw persistedSchemaError(
          `segments[${i}].${field} is not a string when present (got ${typeof s[field]})`,
        );
      }
    }

    // Optional exitDiagnostic (plain object with a string classification)
    const diagnostic = s.exitDiagnostic;
    if (diagnostic !== undefined) {
      if (!diagnostic || typeof diagnostic !== "object" || Array.isArray(diagnostic)) {
        throw persistedSchemaError(
          `segments[${i}].exitDiagnostic is not a plain object (got ${Array.isArray(diagnostic) ? "array" : typeof diagnostic})`,
        );
      }
      if (typeof (diagnostic as Record<string, unknown>).classification !== "string") {
        throw persistedSchemaError(
          `segments[${i}].exitDiagnostic.classification is not a string`,
        );
      }
    }
  }
}

/**
 * Collect top-level keys that are not part of the known schema into
 * `obj._extraFields` so they survive serialization. `_extraFields` is only
 * (over)written when at least one unknown key is found.
 */
function capturePersistedExtraFields(obj: Record<string, unknown>): void {
  const knownTopLevelFields = new Set([
    "schemaVersion",
    "phase",
    "batchId",
    "baseBranch",
    "orchBranch",
    "mode",
    "startedAt",
    "updatedAt",
    "endedAt",
    "currentWaveIndex",
    "totalWaves",
    "wavePlan",
    "lanes",
    "tasks",
    "mergeResults",
    "totalTasks",
    "succeededTasks",
    "failedTasks",
    "skippedTasks",
    "blockedTasks",
    "blockedTaskIds",
    "lastError",
    "errors",
    "resilience",
    "diagnostics",
    "segments",
    "_extraFields",
  ]);
  const extraFields: Record<string, unknown> = {};
  for (const key of Object.keys(obj)) {
    if (!knownTopLevelFields.has(key)) {
      extraFields[key] = obj[key];
    }
  }
  if (Object.keys(extraFields).length > 0) {
    obj._extraFields = extraFields;
  }
}
1314
+
1315
+ // ── Serialization ────────────────────────────────────────────────────
1316
+
1317
+ /**
1318
+ * Serialize runtime batch state to a PersistedBatchState JSON string.
1319
+ *
1320
+ * Pure function: extracts the serializable subset from OrchBatchRuntimeState
1321
+ * and its associated wave results, enriches with schema version and timestamps.
1322
+ *
1323
+ * @param state - Current runtime batch state
1324
+ * @param wavePlan - Wave plan (array of arrays of task IDs)
1325
+ * @param lanes - Currently allocated lanes (latest wave's lanes)
1326
+ * @param allTaskOutcomes - All task outcomes across completed waves + current
1327
+ * @returns JSON string (pretty-printed for debuggability)
1328
+ */
1329
+ export function serializeBatchState(
1330
+ state: OrchBatchRuntimeState,
1331
+ wavePlan: string[][],
1332
+ lanes: AllocatedLane[],
1333
+ allTaskOutcomes: LaneTaskOutcome[],
1334
+ ): string {
1335
+ const now = Date.now();
1336
+
1337
+ // Build lookup maps for fast per-task enrichment.
1338
+ const laneByTaskId = new Map<string, AllocatedLane>();
1339
+ for (const lane of lanes) {
1340
+ for (const task of lane.tasks) {
1341
+ laneByTaskId.set(task.taskId, lane);
1342
+ }
1343
+ }
1344
+
1345
+ // Latest outcome wins (allTaskOutcomes is append/replace ordered by time).
1346
+ const outcomeByTaskId = new Map<string, LaneTaskOutcome>();
1347
+ for (const outcome of allTaskOutcomes) {
1348
+ outcomeByTaskId.set(outcome.taskId, outcome);
1349
+ }
1350
+
1351
+ // Build full task registry from wave plan + any outcomes seen so far.
1352
+ const taskIdSet = new Set<string>();
1353
+ for (const wave of wavePlan) {
1354
+ for (const taskId of wave) taskIdSet.add(taskId);
1355
+ }
1356
+ for (const outcome of allTaskOutcomes) {
1357
+ taskIdSet.add(outcome.taskId);
1358
+ }
1359
+
1360
+ // Build a lookup from taskId → AllocatedTask (which holds the ParsedTask with repo fields).
1361
+ const allocatedTaskByTaskId = new Map<
1362
+ string,
1363
+ { allocatedTask: import("./types.ts").AllocatedTask; lane: AllocatedLane }
1364
+ >();
1365
+ for (const lane of lanes) {
1366
+ for (const allocTask of lane.tasks) {
1367
+ allocatedTaskByTaskId.set(allocTask.taskId, { allocatedTask: allocTask, lane });
1368
+ }
1369
+ }
1370
+
1371
+ const taskRecords: PersistedTaskRecord[] = [...taskIdSet].sort().map((taskId) => {
1372
+ const lane = laneByTaskId.get(taskId);
1373
+ const outcome = outcomeByTaskId.get(taskId);
1374
+ const allocated = allocatedTaskByTaskId.get(taskId);
1375
+
1376
+ const record: PersistedTaskRecord = {
1377
+ taskId,
1378
+ laneNumber: lane?.laneNumber ?? outcome?.laneNumber ?? 0,
1379
+ sessionName: outcome?.sessionName || lane?.laneSessionId || "",
1380
+ status: outcome?.status ?? "pending",
1381
+ taskFolder: "", // Enriched by caller from discovery
1382
+ startedAt: outcome?.startTime ?? null,
1383
+ endedAt: outcome?.endTime ?? null,
1384
+ doneFileFound: outcome?.doneFileFound ?? false,
1385
+ exitReason: outcome?.exitReason ?? "",
1386
+ };
1387
+
1388
+ // v2: Serialize repo-aware fields from the ParsedTask
1389
+ if (allocated?.allocatedTask.task?.promptRepoId !== undefined) {
1390
+ record.repoId = allocated.allocatedTask.task.promptRepoId;
1391
+ }
1392
+ if (allocated?.allocatedTask.task?.resolvedRepoId !== undefined) {
1393
+ record.resolvedRepoId = allocated.allocatedTask.task.resolvedRepoId;
1394
+ }
1395
+
1396
+ // TP-028: Serialize partial progress fields from task outcome
1397
+ if (outcome?.partialProgressCommits !== undefined) {
1398
+ record.partialProgressCommits = outcome.partialProgressCommits;
1399
+ }
1400
+ if (outcome?.partialProgressBranch !== undefined) {
1401
+ record.partialProgressBranch = outcome.partialProgressBranch;
1402
+ }
1403
+
1404
+ // TP-030 v3: Serialize exit diagnostic from task outcome
1405
+ if (outcome?.exitDiagnostic !== undefined) {
1406
+ record.exitDiagnostic = outcome.exitDiagnostic;
1407
+ }
1408
+
1409
+ // TP-081 v4: Serialize segment-level fields from ParsedTask or existing state
1410
+ if (allocated?.allocatedTask.task?.packetRepoId !== undefined) {
1411
+ (record as any).packetRepoId = allocated.allocatedTask.task.packetRepoId;
1412
+ }
1413
+ if (allocated?.allocatedTask.task?.packetTaskPath !== undefined) {
1414
+ (record as any).packetTaskPath = allocated.allocatedTask.task.packetTaskPath;
1415
+ }
1416
+ if (allocated?.allocatedTask.task?.segmentIds !== undefined) {
1417
+ (record as any).segmentIds = allocated.allocatedTask.task.segmentIds;
1418
+ }
1419
+ if (allocated?.allocatedTask.task?.activeSegmentId !== undefined) {
1420
+ (record as any).activeSegmentId = allocated.allocatedTask.task.activeSegmentId;
1421
+ }
1422
+
1423
+ return record;
1424
+ });
1425
+
1426
+ // Build lane records
1427
+ const laneRecords: PersistedLaneRecord[] = lanes.map((lane) => {
1428
+ const record: PersistedLaneRecord = {
1429
+ laneNumber: lane.laneNumber,
1430
+ laneId: lane.laneId,
1431
+ laneSessionId: lane.laneSessionId,
1432
+ worktreePath: lane.worktreePath,
1433
+ branch: lane.branch,
1434
+ taskIds: lane.tasks.map((t) => t.taskId),
1435
+ };
1436
+ if (lane.repoId !== undefined) {
1437
+ record.repoId = lane.repoId;
1438
+ }
1439
+ return record;
1440
+ });
1441
+
1442
+ // Build merge results from actual merge outcomes (accumulated on batchState).
1443
+ // MergeWaveResult.waveIndex is 1-based (from merge module); normalize to
1444
+ // 0-based for PersistedMergeResult (dashboard renders as "Wave N+1").
1445
+ // Clamp to 0 minimum: resume re-exec merges use sentinel waveIndex -1,
1446
+ // which would produce -2 without clamping.
1447
+ const mergeResults: PersistedMergeResult[] = (state.mergeResults || []).map((mr) => {
1448
+ const record: PersistedMergeResult = {
1449
+ waveIndex: Math.max(0, mr.waveIndex - 1),
1450
+ status: mr.status,
1451
+ failedLane: mr.failedLane,
1452
+ failureReason: mr.failureReason,
1453
+ };
1454
+ // v2 (TP-009): Serialize per-repo merge outcomes when available (workspace mode).
1455
+ if (mr.repoResults && mr.repoResults.length > 0) {
1456
+ record.repoResults = mr.repoResults.map((rr) => ({
1457
+ repoId: rr.repoId,
1458
+ status: rr.status,
1459
+ laneNumbers: rr.laneResults.map((lr) => lr.laneNumber),
1460
+ failedLane: rr.failedLane,
1461
+ failureReason: rr.failureReason,
1462
+ }));
1463
+ }
1464
+ return record;
1465
+ });
1466
+
1467
+ const persisted: PersistedBatchState = {
1468
+ schemaVersion: BATCH_STATE_SCHEMA_VERSION,
1469
+ phase: state.phase,
1470
+ batchId: state.batchId,
1471
+ baseBranch: state.baseBranch,
1472
+ orchBranch: state.orchBranch ?? "",
1473
+ mode: state.mode ?? "repo",
1474
+ startedAt: state.startedAt,
1475
+ updatedAt: now,
1476
+ endedAt: state.endedAt,
1477
+ currentWaveIndex: state.currentWaveIndex,
1478
+ totalWaves: state.totalWaves,
1479
+ // TP-166: Persist task-level wave metadata for correct display after resume
1480
+ ...(state.taskLevelWaveCount != null ? { taskLevelWaveCount: state.taskLevelWaveCount } : {}),
1481
+ ...(state.roundToTaskWave != null ? { roundToTaskWave: [...state.roundToTaskWave] } : {}),
1482
+ wavePlan,
1483
+ lanes: laneRecords,
1484
+ tasks: taskRecords,
1485
+ mergeResults,
1486
+ totalTasks: state.totalTasks,
1487
+ succeededTasks: state.succeededTasks,
1488
+ failedTasks: state.failedTasks,
1489
+ skippedTasks: state.skippedTasks,
1490
+ blockedTasks: state.blockedTasks,
1491
+ blockedTaskIds: [...state.blockedTaskIds],
1492
+ lastError:
1493
+ state.errors.length > 0
1494
+ ? { code: "BATCH_ERROR", message: state.errors[state.errors.length - 1] }
1495
+ : null,
1496
+ errors: [...state.errors],
1497
+ resilience: state.resilience ?? defaultResilienceState(),
1498
+ diagnostics: state.diagnostics ?? defaultBatchDiagnostics(),
1499
+ segments: state.segments ?? [],
1500
+ };
1501
+
1502
+ // Merge unknown fields from loaded state to preserve roundtrip fidelity.
1503
+ // Extra fields are placed at the end of the object (after known schema fields)
1504
+ // and will not overwrite any known field.
1505
+ if (state._extraFields) {
1506
+ // TP-195: 2-step `as unknown as` widening. PersistedBatchState is
1507
+ // structurally a string-keyed record at runtime; the cast lets us
1508
+ // add unknown extra fields for serialization roundtrip fidelity
1509
+ // without TypeScript requiring sufficient type overlap.
1510
+ const output = persisted as unknown as Record<string, unknown>;
1511
+ for (const [key, value] of Object.entries(state._extraFields)) {
1512
+ if (!(key in output)) {
1513
+ output[key] = value;
1514
+ }
1515
+ }
1516
+ }
1517
+
1518
+ return JSON.stringify(persisted, null, 2);
1519
+ }
1520
+
1521
+ // ── File Operations ──────────────────────────────────────────────────
1522
+
1523
/**
 * Upper bound on rename attempts performed by the atomic state write.
 * Retries exist because a rename can fail on Windows while the target
 * file is locked.
 */
export const STATE_WRITE_MAX_RETRIES = 3;

/** Pause between two consecutive write retries, in milliseconds. */
export const STATE_WRITE_RETRY_DELAY_MS = 500;
1528
+
1529
/**
 * Atomically persist serialized batch state.
 *
 * The JSON is first written to a sibling `<state file>.tmp` and then renamed
 * over the final path, so a crash mid-write cannot leave a truncated state
 * file behind.
 *
 * A rename may fail on Windows while another process holds a handle on the
 * target, so it is attempted up to STATE_WRITE_MAX_RETRIES times with
 * STATE_WRITE_RETRY_DELAY_MS between attempts. When every attempt fails the
 * temp file is removed (best-effort) before the error is raised.
 *
 * @param json - JSON string to write (from serializeBatchState)
 * @param repoRoot - Absolute path to the repository root
 * @throws StateFileError with STATE_FILE_IO_ERROR on failure
 */
export function saveBatchState(json: string, repoRoot: string): void {
  const finalPath = batchStatePath(repoRoot);
  const tmpPath = `${finalPath}.tmp`;
  const dir = dirname(finalPath);

  // Make sure the parent directory is there before writing anything.
  if (!existsSync(dir)) {
    try {
      mkdirSync(dir, { recursive: true });
    } catch (err: unknown) {
      throw new StateFileError(
        "STATE_FILE_IO_ERROR",
        `Failed to create directory "${dir}": ${(err as Error).message}`,
      );
    }
  }

  // Stage the payload in the temp file.
  try {
    writeFileSync(tmpPath, json, "utf-8");
  } catch (err: unknown) {
    throw new StateFileError(
      "STATE_FILE_IO_ERROR",
      `Failed to write temp state file "${tmpPath}": ${(err as Error).message}`,
    );
  }

  // Promote the temp file; retry because the target may be briefly locked.
  let lastError: Error | null = null;
  let attempt = 0;
  while (attempt < STATE_WRITE_MAX_RETRIES) {
    attempt += 1;
    try {
      renameSync(tmpPath, finalPath);
      return;
    } catch (err: unknown) {
      lastError = err as Error;
    }
    // No point sleeping after the final failed attempt.
    if (attempt < STATE_WRITE_MAX_RETRIES) {
      sleepSync(STATE_WRITE_RETRY_DELAY_MS);
    }
  }

  // Out of attempts: drop the staged file, ignoring any cleanup failure.
  try {
    unlinkSync(tmpPath);
  } catch {
    /* ignore cleanup errors */
  }

  throw new StateFileError(
    "STATE_FILE_IO_ERROR",
    `Failed to atomically save state file "${finalPath}" after ` +
      `${STATE_WRITE_MAX_RETRIES} attempts: ${lastError?.message ?? "unknown error"}`,
  );
}
1598
+
1599
/**
 * Load and validate the persisted batch state file.
 *
 * A missing file is a normal condition and yields `null`. This includes the
 * case where the file disappears between the existence check and the read
 * (for example a concurrent delete), mirroring the race tolerance of
 * `deleteBatchState`.
 *
 * @param repoRoot - Absolute path to the repository root
 * @returns Validated PersistedBatchState, or null if file doesn't exist
 * @throws StateFileError with STATE_FILE_IO_ERROR if an existing file cannot be read
 * @throws StateFileError with STATE_FILE_PARSE_ERROR if file contains invalid JSON
 * @throws StateFileError with STATE_SCHEMA_INVALID if JSON fails validation
 */
export function loadBatchState(repoRoot: string): PersistedBatchState | null {
  const filePath = batchStatePath(repoRoot);

  if (!existsSync(filePath)) {
    return null;
  }

  let raw: string;
  try {
    raw = readFileSync(filePath, "utf-8");
  } catch (err: unknown) {
    // If the file was removed after the existence check, report "missing"
    // rather than surfacing a spurious I/O error.
    if (!existsSync(filePath)) return null;
    throw new StateFileError(
      "STATE_FILE_IO_ERROR",
      `Failed to read state file "${filePath}": ${(err as Error).message}`,
    );
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    throw new StateFileError(
      "STATE_FILE_PARSE_ERROR",
      `State file "${filePath}" contains invalid JSON: ${(err as Error).message}`,
    );
  }

  // Schema validation is delegated; it is the source of STATE_SCHEMA_INVALID.
  return validatePersistedState(parsed);
}
1636
+
1637
/**
 * Remove the batch state file if it is present.
 *
 * Safe to call repeatedly: an absent file is not an error, and neither is a
 * file that disappears while the deletion is in progress.
 *
 * @param repoRoot - Absolute path to the repository root
 * @throws StateFileError with STATE_FILE_IO_ERROR on unexpected deletion failure
 */
export function deleteBatchState(repoRoot: string): void {
  const target = batchStatePath(repoRoot);

  if (existsSync(target)) {
    try {
      unlinkSync(target);
    } catch (err: unknown) {
      // Only a failure that leaves the file in place is a real error;
      // if it is gone now, someone else deleted it first.
      const stillPresent = existsSync(target);
      if (stillPresent) {
        throw new StateFileError(
          "STATE_FILE_IO_ERROR",
          `Failed to delete state file "${target}": ${(err as Error).message}`,
        );
      }
    }
  }
}
1661
+
1662
+ // ── Orphan Detection (TS-009 Step 3) ─────────────────────────────────
1663
+
1664
/**
 * Outcome of trying to load the persisted batch state file.
 *
 * - "valid"    — the file was found, parsed, and passed validation
 * - "missing"  — there is no file (expected on a fresh start)
 * - "invalid"  — the file is present but failed parsing or schema validation
 * - "io-error" — the file could not be read because of an I/O failure
 */
export type OrphanStateStatus =
  | "valid"
  | "missing"
  | "invalid"
  | "io-error";
1673
+
1674
/**
 * Action recommended by the startup/orphan analysis.
 *
 * - "resume"         — usable state exists and the batch can continue
 *                      (orphan sessions + valid state, or no orphans + valid
 *                      state with unfinished tasks): suggest /orch-resume
 * - "abort-orphans"  — orphan sessions exist but there is no usable state:
 *                      suggest /orch-abort
 * - "cleanup-stale"  — no orphans and the valid state is finished or stale:
 *                      the state file can be removed and a fresh batch started
 * - "paused-corrupt" — no orphans and the state file is corrupt or unreadable:
 *                      do NOT auto-delete; ask the user to inspect or remove it
 * - "start-fresh"    — no orphans and no state file: proceed normally
 */
export type OrphanRecommendedAction =
  | "resume"
  | "abort-orphans"
  | "cleanup-stale"
  | "paused-corrupt"
  | "start-fresh";
1689
+
1690
/**
 * Full report produced by the orphan/startup analysis.
 *
 * The structured fields are meant for programmatic handling, while
 * `userMessage` carries a ready-to-display summary of the same decision.
 */
export interface OrphanDetectionResult {
  /** Names of live TMUX sessions that matched the orchestrator prefix. */
  orphanSessions: string[];
  /** How loading the persisted batch state file turned out. */
  stateStatus: OrphanStateStatus;
  /** The validated batch state; null when it is missing, invalid, or unreadable. */
  loadedState: PersistedBatchState | null;
  /** Description of the state-loading failure, or null when there was none. */
  stateError: string | null;
  /** The deterministic action the caller should take. */
  recommendedAction: OrphanRecommendedAction;
  /** Human-readable text suitable for notifying the user. */
  userMessage: string;
}
1710
+
1711
/**
 * Extract orchestrator session names from the output of
 * `tmux list-sessions -F "#{session_name}"`.
 *
 * A name is kept when, after trimming surrounding whitespace, it starts with
 * `<prefix>-` (so prefix "orch" matches "orch-lane-1" but not "orchestra").
 * Empty output and blank lines produce no entries.
 *
 * Pure function — no process or filesystem access.
 *
 * @param stdout - Raw stdout from `tmux list-sessions -F "#{session_name}"`
 * @param prefix - Session name prefix to filter by (e.g., "orch")
 * @returns Matching session names in default (code-unit) sort order
 */
export function parseOrchSessionNames(stdout: string, prefix: string): string[] {
  if (!stdout || stdout.trim() === "") return [];

  const wanted = `${prefix}-`;
  const matches: string[] = [];

  for (const rawLine of stdout.split("\n")) {
    const name = rawLine.trim();
    if (name.length > 0 && name.startsWith(wanted)) {
      matches.push(name);
    }
  }

  matches.sort();
  return matches;
}
1734
+
1735
/**
 * Decide what `/orch` should do at startup — pure, deterministic logic.
 *
 * Inputs are the orphan sessions found, the result of loading the batch
 * state file, and the set of tasks that already have completion markers.
 *
 * Decision matrix (as implemented):
 * | Orphans? | State Status | Tasks done / phase              | Action         |
 * |----------|--------------|---------------------------------|----------------|
 * | Yes      | valid        | —                               | resume         |
 * | Yes      | missing      | —                               | abort-orphans  |
 * | Yes      | invalid      | —                               | abort-orphans  |
 * | Yes      | io-error     | —                               | abort-orphans  |
 * | No       | missing      | —                               | start-fresh    |
 * | No       | valid        | all done (at least one task)    | cleanup-stale  |
 * | No       | valid        | not all done, resumable phase   | resume         |
 * | No       | valid        | not all done, other phase       | cleanup-stale  |
 * | No       | invalid      | —                               | paused-corrupt |
 * | No       | io-error     | —                               | paused-corrupt |
 *
 * Resumable phases are "paused", "executing" and "merging". A "valid" status
 * paired with a null `loadedState` is treated like an unusable state.
 *
 * Pure function — no process or filesystem access.
 *
 * @param orphanSessions - TMUX sessions matching the orch prefix
 * @param stateStatus - Status of the batch state file
 * @param loadedState - Validated batch state (null if unavailable)
 * @param stateError - Error message from state loading (null if no error)
 * @param doneTaskIds - Set of task IDs whose .DONE files were found
 * @returns OrphanDetectionResult with recommended action
 */
export function analyzeOrchestratorStartupState(
  orphanSessions: string[],
  stateStatus: OrphanStateStatus,
  loadedState: PersistedBatchState | null,
  stateError: string | null,
  doneTaskIds: ReadonlySet<string>,
): OrphanDetectionResult {
  // State is only usable when it both validated and was actually loaded.
  const usableState = stateStatus === "valid" && loadedState ? loadedState : null;

  // ── Orphan sessions exist ────────────────────────────────────
  if (orphanSessions.length > 0) {
    const sessionList = orphanSessions.join(", ");

    if (usableState) {
      return {
        orphanSessions,
        stateStatus,
        loadedState: usableState,
        stateError,
        recommendedAction: "resume",
        userMessage:
          `🔄 Found ${orphanSessions.length} running orchestrator session(s): ${sessionList}\n` +
          ` Batch ${usableState.batchId} (${usableState.phase}) has persisted state.\n` +
          ` Use /orch-resume to continue, or /orch-abort to clean up.`,
      };
    }

    // Sessions are alive but nothing usable is on disk.
    const errorCtx = stateError ? `\n State error: ${stateError}` : "";
    return {
      orphanSessions,
      stateStatus,
      loadedState: null,
      stateError,
      recommendedAction: "abort-orphans",
      userMessage:
        `⚠️ Found ${orphanSessions.length} orphan orchestrator session(s): ${sessionList}\n` +
        ` No usable batch state file (status: ${stateStatus}).${errorCtx}\n` +
        ` Use /orch-abort to clean up before starting a new batch.`,
    };
  }

  // ── No orphan sessions ───────────────────────────────────────

  if (stateStatus === "missing") {
    // Clean start: nothing to report.
    return {
      orphanSessions: [],
      stateStatus,
      loadedState: null,
      stateError,
      recommendedAction: "start-fresh",
      userMessage: "",
    };
  }

  if (!usableState) {
    // Invalid or unreadable state. Never auto-delete: the user must inspect
    // the file and decide whether to recover or remove it.
    return {
      orphanSessions: [],
      stateStatus,
      loadedState: null,
      stateError,
      recommendedAction: "paused-corrupt",
      userMessage:
        `⚠️ Batch state file is corrupt or unreadable (${stateStatus}).\n` +
        (stateError ? ` Error: ${stateError}\n` : "") +
        ` The file has NOT been deleted. Inspect .pi/batch-state.json manually,\n` +
        ` then either fix it or delete it and run /orch again.`,
    };
  }

  const taskIds = usableState.tasks.map((t) => t.taskId);
  const completedCount = taskIds.filter((id) => doneTaskIds.has(id)).length;
  const everyTaskDone = taskIds.length > 0 && completedCount === taskIds.length;

  if (everyTaskDone) {
    return {
      orphanSessions: [],
      stateStatus,
      loadedState: usableState,
      stateError,
      recommendedAction: "cleanup-stale",
      userMessage:
        `🧹 Found stale batch state file from batch ${usableState.batchId}.\n` +
        ` All ${taskIds.length} task(s) have .DONE files. Cleaning up state file.`,
    };
  }

  // The batch was interrupted. Only these phases are treated as resumable;
  // any other phase (e.g. failed / stopped / idle / planning) is cleaned up
  // so /orch can start fresh without a detour through /orch-abort.
  const resumablePhases = new Set<string>(["paused", "executing", "merging"]);
  const isResumable = resumablePhases.has(usableState.phase);

  if (isResumable) {
    return {
      orphanSessions: [],
      stateStatus,
      loadedState: usableState,
      stateError,
      recommendedAction: "resume",
      userMessage:
        `🔄 Found interrupted batch ${usableState.batchId} (${usableState.phase}).\n` +
        ` ${completedCount}/${taskIds.length} task(s) completed.\n` +
        ` Use /orch-resume to continue, or /orch-abort to clean up.`,
    };
  }

  // Non-resumable: same action either way, but the message distinguishes
  // "nothing ever ran" from "some tasks finished".
  const staleMessage =
    completedCount === 0
      ? `🧹 Found non-resumable batch state (${usableState.batchId}, phase=${usableState.phase}, 0 tasks ran).\n` +
        ` Cleaning up stale state file so a fresh batch can start.`
      : `🧹 Found non-resumable batch state (${usableState.batchId}, phase=${usableState.phase}).\n` +
        ` ${completedCount}/${taskIds.length} task(s) completed. Cleaning up state file.`;

  return {
    orphanSessions: [],
    stateStatus,
    loadedState: usableState,
    stateError,
    recommendedAction: "cleanup-stale",
    userMessage: staleMessage,
  };
}
1890
+
1891
/**
 * Detect orphan orchestrator state and analyze startup recovery action.
 *
 * Runtime V2 no longer relies on TMUX session discovery, so the orphan
 * session list passed to the analysis is always empty. The decision is
 * driven by the persisted batch state plus task .DONE markers.
 *
 * State-loading failures are classified as follows:
 * - STATE_FILE_PARSE_ERROR / STATE_SCHEMA_INVALID → "invalid"
 * - any other failure (including STATE_FILE_IO_ERROR, an unrecognised
 *   StateFileError code, or a non-StateFileError exception) → "io-error"
 *
 * A failure is never reported as "missing", so an unreadable state file
 * cannot be mistaken for a clean start.
 *
 * @param prefix - Legacy orchestrator session prefix (unused in Runtime V2)
 * @param repoRoot - Absolute path to the repository root
 * @returns OrphanDetectionResult with recommended action
 */
export function detectOrphanSessions(prefix: string, repoRoot: string): OrphanDetectionResult {
  void prefix;

  // Runtime V2 uses persisted state as the source of truth for orphan analysis.
  const orphanSessions: string[] = [];

  // ── 1. Load batch state file ─────────────────────────────────
  let stateStatus: OrphanStateStatus = "missing";
  let loadedState: PersistedBatchState | null = null;
  let stateError: string | null = null;

  try {
    loadedState = loadBatchState(repoRoot);
    stateStatus = loadedState ? "valid" : "missing";
  } catch (err: unknown) {
    if (err instanceof StateFileError) {
      const isContentProblem =
        err.code === "STATE_FILE_PARSE_ERROR" || err.code === "STATE_SCHEMA_INVALID";
      // Anything that is not a content problem is treated as unreadable, so
      // an unexpected error code can never fall through as "missing".
      stateStatus = isContentProblem ? "invalid" : "io-error";
      stateError = `[${err.code}] ${err.message}`;
    } else {
      stateStatus = "io-error";
      stateError = err instanceof Error ? err.message : String(err);
    }
  }

  // ── 2. Check .DONE files for stale state detection ───────────
  const doneTaskIds = new Set<string>();
  if (loadedState && orphanSessions.length === 0) {
    // Tasks without a recorded folder cannot be checked and count as not done.
    for (const task of loadedState.tasks) {
      if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
        doneTaskIds.add(task.taskId);
      }
    }
  }

  // ── 3. Analyze and return ────────────────────────────────────
  return analyzeOrchestratorStartupState(
    orphanSessions,
    stateStatus,
    loadedState,
    stateError,
    doneTaskIds,
  );
}
1955
+
1956
+ // ── Batch History ────────────────────────────────────────────────────
1957
+
1958
/** Resolve the location of `.pi/batch-history.json` under the given root. */
function batchHistoryPath(repoRoot: string): string {
  const segments = [repoRoot, ".pi", "batch-history.json"];
  return join(...segments);
}
1962
+
1963
/**
 * Load existing batch history entries from disk.
 *
 * Returns an empty array if the file doesn't exist, cannot be read, is not
 * valid JSON, or does not contain a top-level array.
 *
 * Array elements that are not plain objects (null, primitives, nested
 * arrays) are dropped, so callers can safely read properties such as
 * `batchId` on every returned entry. Individual fields are NOT validated.
 *
 * @param repoRoot - Absolute path to the repository root
 * @returns History entries in file order
 */
export function loadBatchHistory(repoRoot: string): BatchHistorySummary[] {
  const filePath = batchHistoryPath(repoRoot);
  try {
    if (!existsSync(filePath)) return [];
    const data: unknown = JSON.parse(readFileSync(filePath, "utf-8"));
    if (!Array.isArray(data)) return [];
    // Shape check only: guards against entries that would throw on property
    // access; it does not prove the entry has every summary field.
    return data.filter(
      (entry): entry is BatchHistorySummary =>
        typeof entry === "object" && entry !== null && !Array.isArray(entry),
    );
  } catch {
    // History is best-effort; an unreadable file is treated as empty.
    return [];
  }
}
1979
+
1980
/**
 * Record a batch summary in the history file.
 *
 * The summary replaces any existing entry with the same batchId, is placed
 * first (newest first), and the list is truncated to
 * BATCH_HISTORY_MAX_ENTRIES. The file is written via a `.tmp` sibling that
 * is then renamed into place.
 *
 * Best-effort: every failure is logged and swallowed.
 *
 * @param repoRoot - Absolute path to the repository root
 * @param summary - Summary of the batch to store
 */
export function saveBatchHistory(repoRoot: string, summary: BatchHistorySummary): void {
  const filePath = batchHistoryPath(repoRoot);
  try {
    // Drop the earlier entry for this batch (e.g. from before a resume) so
    // the history never holds duplicates, then put the new one in front.
    const others = loadBatchHistory(repoRoot).filter(
      (entry) => entry.batchId !== summary.batchId,
    );
    const nextHistory = [summary, ...others];

    // Enforce the size cap by cutting off the oldest entries.
    if (nextHistory.length > BATCH_HISTORY_MAX_ENTRIES) {
      nextHistory.length = BATCH_HISTORY_MAX_ENTRIES;
    }

    const dir = dirname(filePath);
    if (!existsSync(dir)) {
      mkdirSync(dir, { recursive: true });
    }

    const tmpPath = filePath + ".tmp";
    const payload = JSON.stringify(nextHistory, null, 2);
    writeFileSync(tmpPath, payload);
    renameSync(tmpPath, filePath);

    execLog("batch", "history", `saved batch summary (${nextHistory.length} entries)`);
  } catch (err) {
    execLog("batch", "history", `failed to save batch history: ${err}`);
  }
}
2007
+
2008
/**
 * Stamp a batch history entry with its integration time.
 *
 * Looks up the entry by batchId and sets its `integratedAt`, then rewrites
 * the history file via a `.tmp` sibling and rename. When no entry matches,
 * nothing is written and the skip is logged.
 *
 * Best-effort: every failure is logged and swallowed.
 *
 * @param repoRoot - Absolute path to the repository root
 * @param batchId - Identifier of the batch whose entry should be updated
 * @param integratedAt - Integration timestamp to store on the entry
 *
 * @since TP-179
 */
export function updateBatchHistoryIntegration(
  repoRoot: string,
  batchId: string,
  integratedAt: number,
): void {
  const filePath = batchHistoryPath(repoRoot);
  try {
    const history = loadBatchHistory(repoRoot);
    const index = history.findIndex((e) => e.batchId === batchId);

    if (index === -1) {
      execLog(
        "batch",
        "history",
        `no history entry found for batchId=${batchId}, skipping integratedAt update`,
      );
      return;
    }

    history[index].integratedAt = integratedAt;

    const dir = dirname(filePath);
    if (!existsSync(dir)) {
      mkdirSync(dir, { recursive: true });
    }

    const tmpPath = filePath + ".tmp";
    const payload = JSON.stringify(history, null, 2);
    writeFileSync(tmpPath, payload);
    renameSync(tmpPath, filePath);

    execLog("batch", "history", `updated integratedAt for batchId=${batchId}`);
  } catch (err) {
    execLog("batch", "history", `failed to update integratedAt: ${err}`);
  }
}
2044
+
2045
+ // ── Tier 0 Supervisor Event Logging (TP-039 Step 2) ─────────────────
2046
+
2047
/**
 * Kinds of events produced by Tier 0 recovery actions.
 *
 * - `tier0_recovery_attempt`   — a recovery action is about to be tried
 * - `tier0_recovery_success`   — the recovery action worked
 * - `tier0_recovery_exhausted` — the retry budget is used up; escalation is needed
 * - `tier0_escalation`         — hand-off to the supervisor (emitted alongside exhausted)
 *
 * @since TP-039
 */
export type Tier0EventType =
  | "tier0_recovery_attempt"
  | "tier0_recovery_success"
  | "tier0_recovery_exhausted"
  | "tier0_escalation";
2062
+
2063
/**
 * One structured record in `.pi/supervisor/events.jsonl`.
 *
 * The required fields identify what was attempted and where; the optional
 * fields add context so the supervisor agent (Tier 1) can understand the
 * situation and choose a follow-up.
 *
 * @since TP-039
 */
export interface Tier0Event {
  /** When the event was created, as an ISO 8601 string. */
  timestamp: string;
  /** Which kind of Tier 0 event this is. */
  type: Tier0EventType;
  /** Identifier of the batch the event belongs to. */
  batchId: string;
  /** Wave the event refers to (0-based). */
  waveIndex: number;
  /** The recovery pattern in play. */
  pattern: Tier0RecoveryPattern | "merge_timeout";
  /** Number of the current attempt (1-based). */
  attempt: number;
  /** Highest attempt number that is permitted. */
  maxAttempts: number;
  /** Task concerned, for task-scoped patterns such as worker_crash. */
  taskId?: string;
  /** Lane concerned, for lane-scoped patterns. */
  laneNumber?: number;
  /** Repo attribution in workspace mode; null/undefined in repo mode. */
  repoId?: string | null;
  /** Exit classification or error type. */
  classification?: string;
  /** Error text (exhausted events). */
  error?: string;
  /** How the problem was resolved (success events). */
  resolution?: string;
  /** Cooldown/timeout before the retry, in milliseconds (attempt events). */
  cooldownMs?: number;
  /** Key under which the retry counter is tracked. */
  scopeKey?: string;
  /** Tasks affected, as escalation context (exhausted events). */
  affectedTaskIds?: string[];
  /** Proposed remediation (exhausted events). */
  suggestion?: string;
  /** Typed escalation payload; present only on `tier0_escalation` events. */
  escalation?: EscalationContext;
}
2109
+
2110
/**
 * Assemble the mandatory fields shared by every Tier 0 event.
 *
 * Centralising this keeps the event shape identical across emit sites. The
 * timestamp is taken from the current clock at call time; all other fields
 * are passed through unchanged.
 *
 * @param type - Event type
 * @param batchId - Batch identifier
 * @param waveIndex - Wave index
 * @param pattern - Recovery pattern being applied
 * @param attempt - Current attempt number
 * @param maxAttempts - Maximum attempts allowed
 * @returns The required base fields of a Tier0Event
 *
 * @since TP-039 R004
 */
export function buildTier0EventBase(
  type: Tier0EventType,
  batchId: string,
  waveIndex: number,
  pattern: Tier0RecoveryPattern | "merge_timeout",
  attempt: number,
  maxAttempts: number,
): Pick<
  Tier0Event,
  "timestamp" | "type" | "batchId" | "waveIndex" | "pattern" | "attempt" | "maxAttempts"
> {
  const timestamp = new Date().toISOString();
  return { timestamp, type, batchId, waveIndex, pattern, attempt, maxAttempts };
}
2139
+
2140
/**
 * Append a Tier 0 event to `.pi/supervisor/events.jsonl`.
 *
 * The supervisor directory is created on demand and the event is written as
 * one JSON document followed by a newline. Best-effort: a write failure is
 * logged and swallowed so it can never crash the batch.
 *
 * @param stateRoot - Root directory for state files (workspace root or repo root)
 * @param event - The event to emit
 *
 * @since TP-039
 */
export function emitTier0Event(stateRoot: string, event: Tier0Event): void {
  try {
    const supervisorDir = join(stateRoot, ".pi", "supervisor");
    const dirMissing = !existsSync(supervisorDir);
    if (dirMissing) {
      mkdirSync(supervisorDir, { recursive: true });
    }
    appendFileSync(join(supervisorDir, "events.jsonl"), JSON.stringify(event) + "\n");
  } catch (err: unknown) {
    // Event logging must not take the batch down; record the failure only.
    const msg = err instanceof Error ? err.message : String(err);
    execLog("batch", event.batchId, `tier0 event write failed: ${msg}`, {
      eventType: event.type,
      pattern: event.pattern,
    });
  }
}
2169
+
2170
+ // ── Engine Event Logging (TP-040) ───────────────────────────────────
2171
+
2172
/**
 * Publish an engine lifecycle event.
 *
 * Two things happen, each isolated from the other's failures:
 * 1. The event is appended as a single JSONL line to
 *    `.pi/supervisor/events.jsonl` — the same file Tier 0 events go to, so
 *    the supervisor agent reads one unified stream. The directory is created
 *    on demand.
 * 2. If a callback is supplied, it is invoked with the event for in-process
 *    consumers (command handler, dashboard).
 *
 * Best-effort throughout: a failed write or a throwing callback is logged
 * and never crashes the batch. The callback runs even if the write failed.
 *
 * @param stateRoot - Root directory for state files (workspace root or repo root)
 * @param event - The engine event to emit
 * @param callback - Optional in-process event callback
 *
 * @since TP-040
 */
export function emitEngineEvent(
  stateRoot: string,
  event: EngineEvent,
  callback?: ((event: EngineEvent) => void) | null,
): void {
  // Step 1: persist to the shared JSONL stream.
  try {
    const supervisorDir = join(stateRoot, ".pi", "supervisor");
    const dirMissing = !existsSync(supervisorDir);
    if (dirMissing) {
      mkdirSync(supervisorDir, { recursive: true });
    }
    appendFileSync(join(supervisorDir, "events.jsonl"), JSON.stringify(event) + "\n");
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    execLog("batch", event.batchId, `engine event write failed: ${msg}`, {
      eventType: event.type,
    });
  }

  // Step 2: notify the in-process listener, if there is one.
  if (!callback) return;
  try {
    callback(event);
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    execLog("batch", event.batchId, `engine event callback failed: ${msg}`, {
      eventType: event.type,
    });
  }
}
2225
+
2226
+ // ── TP-187 (#539): Batch-Meta Runtime Artifact ─────────────────────
2227
+ //
2228
+ // Small JSON file written at batch-start to `.pi/runtime/<batchId>/batch-meta.json`.
2229
+ // Captures the wave plan and the few non-recoverable scalars (baseBranch,
2230
+ // orchBranch, mode, startedAt, totalWaves) so that `orch_resume(force=true)`
2231
+ // can deterministically reconstruct a validator-compliant PersistedBatchState
2232
+ // after `orch_abort()` deletes `.pi/batch-state.json`.
2233
+ //
2234
+ // Without this artifact the wave topology is unrecoverable from the surviving
2235
+ // runtime registry alone (manifests don't carry wave info) and a flattened
2236
+ // "single wave with all surviving tasks" reconstruction can violate DAG
2237
+ // dependency ordering. See R003 plan review.
2238
+
2239
/**
 * Shape of the `batch-meta.json` runtime artifact: the wave plan plus the
 * core batch scalars, tagged with a schema version.
 *
 * @since TP-187 (#539)
 */
export interface BatchMetaArtifact {
  /** Version tag of this artifact's schema. */
  schemaVersion: 1;
  /** Identifier of the batch this artifact describes. */
  batchId: string;
  /** The wave plan: one inner array of strings per wave. */
  wavePlan: string[][];
  /** Base branch of the batch. */
  baseBranch: string;
  /** Orchestrator branch of the batch. */
  orchBranch: string;
  /** Workspace mode of the batch. */
  mode: WorkspaceMode;
  /** Batch start time (numeric timestamp). */
  startedAt: number;
  /** Total number of waves. */
  totalWaves: number;
}
2254
+
2255
/**
 * Resolve the location of `batch-meta.json` inside a batch's runtime directory.
 *
 * @param stateRoot - Root directory passed through to `runtimeRoot`.
 * @param batchId - Batch whose artifact path is wanted.
 * @returns Path of the batch-meta artifact file.
 */
function batchMetaPath(stateRoot: string, batchId: string): string {
  const batchRuntimeDir = runtimeRoot(stateRoot, batchId);
  return join(batchRuntimeDir, "batch-meta.json");
}
2259
+
2260
/**
 * Write the batch-meta artifact (wave plan + core batch scalars) into the
 * batch's runtime directory.
 *
 * The JSON is first written to a sibling `.tmp` file and then renamed over the
 * final path. The operation is best-effort: any failure is reported through
 * `execLog` and swallowed so that the batch keeps running.
 *
 * Intended to be called at batch-start once the wave plan is final, and again
 * whenever the wave plan changes (segment expansion).
 *
 * @param stateRoot - Root directory used to locate the batch runtime directory.
 * @param artifact - Metadata to persist; `artifact.batchId` selects the directory.
 * @since TP-187 (#539)
 */
export function saveBatchMetaRuntimeArtifact(stateRoot: string, artifact: BatchMetaArtifact): void {
  try {
    const targetPath = batchMetaPath(stateRoot, artifact.batchId);
    mkdirSync(dirname(targetPath), { recursive: true });

    // Stage into a temp file, then rename into place.
    const stagingPath = `${targetPath}.tmp`;
    const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
    writeFileSync(stagingPath, serialized, "utf-8");
    renameSync(stagingPath, targetPath);

    let taskCount = 0;
    for (const wave of artifact.wavePlan) {
      taskCount += wave.length;
    }
    execLog("state", artifact.batchId, "persisted batch-meta runtime artifact", {
      waves: artifact.wavePlan.length,
      tasks: taskCount,
    });
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    execLog("state", artifact.batchId, `batch-meta write failed: ${reason}`);
  }
}
2288
+
2289
/**
 * Read and validate the batch-meta artifact for `batchId`.
 *
 * Returns `null` when the file does not exist, cannot be read or parsed, or
 * fails any shape check: `schemaVersion` must be `1`, `batchId` must match the
 * requested ID, `wavePlan` must be an array of string arrays, the branch
 * fields must be strings, `mode` must be `"repo"` or `"workspace"`, and
 * `startedAt` / `totalWaves` must be numbers.
 *
 * @param stateRoot - Root directory used to locate the batch runtime directory.
 * @param batchId - Batch whose artifact should be loaded.
 * @returns The validated artifact, or `null` if missing/invalid.
 * @since TP-187 (#539)
 */
export function loadBatchMetaRuntimeArtifact(
  stateRoot: string,
  batchId: string,
): BatchMetaArtifact | null {
  const artifactPath = batchMetaPath(stateRoot, batchId);
  if (!existsSync(artifactPath)) return null;

  let decoded: unknown;
  try {
    decoded = JSON.parse(readFileSync(artifactPath, "utf-8"));
  } catch {
    // Unreadable file or malformed JSON: treat as "no artifact".
    return null;
  }
  if (!decoded || typeof decoded !== "object") return null;

  const fields = decoded as Record<string, unknown>;
  const plan = fields.wavePlan;
  const isStringMatrix =
    Array.isArray(plan) &&
    plan.every((wave) => Array.isArray(wave) && wave.every((taskId) => typeof taskId === "string"));

  const isValid =
    fields.schemaVersion === 1 &&
    typeof fields.batchId === "string" &&
    fields.batchId === batchId &&
    isStringMatrix &&
    typeof fields.baseBranch === "string" &&
    typeof fields.orchBranch === "string" &&
    (fields.mode === "repo" || fields.mode === "workspace") &&
    typeof fields.startedAt === "number" &&
    typeof fields.totalWaves === "number";

  return isValid ? (fields as unknown as BatchMetaArtifact) : null;
}
2324
+
2325
+ // ── TP-187 (#539): Reconstruct PersistedBatchState from runtime artifacts ──
2326
+
2327
/**
 * Outcome of `reconstructBatchStateFromRuntime`.
 *
 * - Success (`ok: true`): carries the validator-compliant state, the batch ID
 *   that was selected, and a human-readable note describing how that batch
 *   was chosen (used by resume's onNotify output).
 * - Failure (`ok: false`): `error` names the missing or corrupt artifact so
 *   the caller can report it loudly.
 *
 * TP-195: the success member declares `error?: undefined` so the union is a
 * well-formed discriminated union under `strict: false`. Without it,
 * `if (!result.ok)` does not narrow `error`, because non-strict narrowing
 * requires every member of the union to share the discriminating fields.
 * Runtime semantics are unchanged — the success branch never carries an error.
 *
 * @since TP-187 (#539)
 */
export type ReconstructResult =
  | { ok: false; error: string }
  | {
      ok: true;
      state: PersistedBatchState;
      batchId: string;
      selectionNote: string;
      error?: undefined;
    };
2349
+
2350
/**
 * List candidate `.pi/runtime/<batchId>/` directories, newest first by mtime.
 *
 * Entries with identical mtimes are ordered lexicographically largest-first
 * using plain code-unit string comparison. A locale-aware comparison
 * (`localeCompare`) is deliberately avoided: its result depends on the
 * runtime's locale/ICU data, which would make the tie-break — and therefore
 * which batch gets selected — vary between environments.
 *
 * @param stateRoot - Directory that contains the `.pi/runtime/` tree.
 * @returns One entry per subdirectory (non-directories and entries that cannot
 *   be stat'ed are skipped); empty if the runtime root is missing or unreadable.
 */
function listRuntimeBatchDirs(stateRoot: string): { batchId: string; mtimeMs: number }[] {
  const runtimeDir = join(stateRoot, ".pi", "runtime");
  if (!existsSync(runtimeDir)) return [];

  let names: string[];
  try {
    names = readdirSync(runtimeDir);
  } catch {
    return [];
  }

  const candidates: { batchId: string; mtimeMs: number }[] = [];
  for (const name of names) {
    try {
      const st = statSync(join(runtimeDir, name));
      if (st.isDirectory()) {
        candidates.push({ batchId: name, mtimeMs: st.mtimeMs });
      }
    } catch {
      // Entry vanished or is unreadable between readdir and stat: skip it.
    }
  }

  candidates.sort((a, b) => {
    if (a.mtimeMs !== b.mtimeMs) return b.mtimeMs - a.mtimeMs;
    // Deterministic, locale-independent descending order on the batch ID.
    if (a.batchId === b.batchId) return 0;
    return a.batchId < b.batchId ? 1 : -1;
  });
  return candidates;
}
2380
+
2381
/**
 * Collect the manifests of all worker agents recorded under
 * `.pi/runtime/<batchId>/agents/`.
 *
 * Each entry of the agents directory is treated as an agent ID; its manifest
 * is located via `runtimeManifestPath`. Manifests that are missing, cannot be
 * read/parsed, or whose `role` is not `"worker"` are skipped.
 *
 * @param stateRoot - Root directory used to locate the batch runtime directory.
 * @param batchId - Batch whose agent manifests should be read.
 * @returns Parsed worker manifests; empty if the agents directory is missing
 *   or cannot be listed.
 */
function readWorkerManifests(stateRoot: string, batchId: string): RuntimeAgentManifest[] {
  const agentsDir = join(runtimeRoot(stateRoot, batchId), "agents");
  if (!existsSync(agentsDir)) return [];

  let agentIds: string[];
  try {
    agentIds = readdirSync(agentsDir);
  } catch {
    return [];
  }

  const workers: RuntimeAgentManifest[] = [];
  for (const agentId of agentIds) {
    const manifestFile = runtimeManifestPath(stateRoot, batchId, agentId);
    if (existsSync(manifestFile)) {
      try {
        const manifest = JSON.parse(readFileSync(manifestFile, "utf-8")) as RuntimeAgentManifest;
        const isWorker = manifest && typeof manifest === "object" && manifest.role === "worker";
        if (isWorker) workers.push(manifest);
      } catch {
        // Unreadable or malformed manifest: ignore this agent.
      }
    }
  }
  return workers;
}
2411
+
2412
/** Per-lane rollup of worker manifests used while rebuilding lane and task records. */
type ReconstructedLaneAggregate = {
  laneNumber: number;
  agentId: string;
  worktreePath: string;
  repoId: string;
  taskIds: string[];
};

/** Gather the distinct non-empty `repoId` values carried by the given manifests. */
function collectReconstructRepoIds(manifests: RuntimeAgentManifest[]): Set<string> {
  const repoIds = new Set<string>();
  for (const m of manifests) {
    if (typeof m.repoId === "string" && m.repoId.length > 0) {
      repoIds.add(m.repoId);
    }
  }
  return repoIds;
}

/**
 * Group manifests by lane number. The first manifest seen for a lane supplies
 * its agentId/worktree/repoId; task IDs are accumulated without duplicates.
 * Manifests without a numeric `laneNumber` are ignored.
 */
function aggregateReconstructLanes(
  manifests: RuntimeAgentManifest[],
): Map<number, ReconstructedLaneAggregate> {
  const laneMap = new Map<number, ReconstructedLaneAggregate>();
  for (const m of manifests) {
    if (typeof m.laneNumber !== "number") continue;
    const lane = laneMap.get(m.laneNumber) ?? {
      laneNumber: m.laneNumber,
      agentId: m.agentId,
      worktreePath: m.cwd,
      repoId: m.repoId ?? "default",
      taskIds: [] as string[],
    };
    if (typeof m.taskId === "string" && m.taskId && !lane.taskIds.includes(m.taskId)) {
      lane.taskIds.push(m.taskId);
    }
    laneMap.set(m.laneNumber, lane);
  }
  return laneMap;
}

/**
 * Build one pending task record per known task: every task seen on a lane
 * (in lane order) followed by wave-plan tasks that have no lane yet.
 */
function buildReconstructTaskRecords(
  manifests: RuntimeAgentManifest[],
  laneMap: Map<number, ReconstructedLaneAggregate>,
  wavePlan: string[][],
): PersistedTaskRecord[] {
  // Tasks: union of taskIds across all lanes, plus any wavePlan tasks that
  // are not represented (they are pending, not yet executed).
  const knownTaskIds = new Set<string>();
  for (const lane of laneMap.values()) {
    for (const tid of lane.taskIds) knownTaskIds.add(tid);
  }
  for (const wave of wavePlan) {
    for (const tid of wave) knownTaskIds.add(tid);
  }

  // When several manifests name the same task, the last one read wins.
  const manifestByTaskId = new Map<string, RuntimeAgentManifest>();
  for (const m of manifests) {
    if (typeof m.taskId === "string" && m.taskId) {
      manifestByTaskId.set(m.taskId, m);
    }
  }

  // Conservative defaults; resume's reconciliation pass will re-detect
  // succeeded tasks via `.DONE` markers and STATUS.md.
  const tasks: PersistedTaskRecord[] = [];
  for (const taskId of knownTaskIds) {
    const m = manifestByTaskId.get(taskId);
    const lane = m ? laneMap.get(m.laneNumber) : undefined;
    // TP-195: no `taskName` here (not on the `PersistedTaskRecord` schema),
    // and no reads of `m.packet.packetRepoId` / `.packetTaskPath` (those
    // fields do not exist on `PacketPaths`, so they were always undefined).
    const taskRecord: PersistedTaskRecord = {
      taskId,
      taskFolder: m?.packet?.taskFolder ?? "",
      status: "pending",
      sessionName: m?.agentId ?? "",
      laneNumber: lane?.laneNumber ?? 0,
      startedAt: typeof m?.startedAt === "number" ? m.startedAt : null,
      endedAt: null,
      exitReason: "",
      doneFileFound: false,
    };
    if (m?.repoId) taskRecord.repoId = m.repoId;
    tasks.push(taskRecord);
  }
  return tasks;
}

/** Turn lane aggregates into persisted lane records, ordered by lane number. */
function buildReconstructLaneRecords(
  laneMap: Map<number, ReconstructedLaneAggregate>,
  orchBranch: string,
): PersistedLaneRecord[] {
  return Array.from(laneMap.values())
    .sort((a, b) => a.laneNumber - b.laneNumber)
    .map((l) => {
      // The lane session ID is the agent ID minus its role suffix.
      const sessionId = l.agentId.replace(/-(worker|reviewer)$/, "");
      const rec: PersistedLaneRecord = {
        laneId: `lane-${l.laneNumber}`,
        laneNumber: l.laneNumber,
        laneSessionId: sessionId,
        worktreePath: l.worktreePath,
        branch: orchBranch ? `${orchBranch}-lane-${l.laneNumber}` : `lane-${l.laneNumber}`,
        taskIds: [...l.taskIds],
      };
      if (l.repoId && l.repoId !== "default") rec.repoId = l.repoId;
      return rec;
    });
}

/**
 * Deterministically reconstruct a validator-compliant `PersistedBatchState`
 * from the surviving runtime artifacts after `.pi/batch-state.json` has been
 * deleted (typically by `orch_abort()`).
 *
 * Candidates under `.pi/runtime/` are tried newest-first. A candidate is
 * usable only if its `batch-meta.json` loads cleanly, it has at least one
 * worker manifest whose worktree (`cwd`) still exists on disk, it is not a
 * multi-repo batch, at least one manifest carries a lane number, and the
 * rebuilt state passes `validatePersistedState`. The first usable candidate
 * wins; otherwise a fail-loud error summarising the first few rejections is
 * returned so the caller can surface a clear "no resumable state" message
 * instead of silently producing an invalid state.
 *
 * @param stateRoot - Directory that contains the `.pi/runtime/` tree.
 * @returns The reconstructed state plus selection details, or an error.
 * @since TP-187 (#539)
 */
export function reconstructBatchStateFromRuntime(stateRoot: string): ReconstructResult {
  const candidates = listRuntimeBatchDirs(stateRoot);
  if (candidates.length === 0) {
    return { ok: false, error: "no .pi/runtime/ directory or no batch subdirectories" };
  }

  // Try the newest batch first; if its required artifacts are missing, fall
  // through to the next candidate. We stop at the first batch with a parseable
  // batch-meta + at least one viable worker manifest.
  const failures: string[] = [];
  for (const [idx, cand] of candidates.entries()) {
    const meta = loadBatchMetaRuntimeArtifact(stateRoot, cand.batchId);
    if (!meta) {
      failures.push(`${cand.batchId}: batch-meta.json missing or invalid`);
      continue;
    }

    const manifests = readWorkerManifests(stateRoot, cand.batchId);
    if (manifests.length === 0) {
      failures.push(`${cand.batchId}: no worker manifests`);
      continue;
    }

    const workerManifestsWithWorktree = manifests.filter(
      (m) => typeof m.cwd === "string" && m.cwd.length > 0 && existsSync(m.cwd),
    );
    if (workerManifestsWithWorktree.length === 0) {
      failures.push(`${cand.batchId}: worktree paths from manifests no longer exist on disk`);
      continue;
    }

    // TP-187 (#539): refuse reconstruction when the runtime artifacts indicate
    // this batch was multi-repo (segment expansion). Reconstruction hardcodes
    // `segments: []` and cannot recover the per-segment topology that lives
    // only in the deleted batch-state; resuming that way could re-execute
    // already-done segments or fail dependency checks for cross-repo waves.
    // Heuristic: more than one distinct repoId across worker manifests.
    // The check covers ALL worker manifests, not only those whose worktree
    // still exists — a multi-repo batch stays multi-repo even if only one
    // repo's worktrees survived. Single-repo batches are unaffected.
    const distinctRepoIds = collectReconstructRepoIds(manifests);
    if (distinctRepoIds.size > 1) {
      failures.push(
        `${cand.batchId}: multi-repo batch detected (${distinctRepoIds.size} distinct repoIds: ` +
          `${[...distinctRepoIds].slice(0, 4).join(", ")}` +
          `${distinctRepoIds.size > 4 ? ", ..." : ""}); reconstruction would lose segment ` +
          `expansion state and is refused. Restore .pi/batch-state.json from backup or start a new batch.`,
      );
      continue;
    }

    // Build per-lane aggregation from worker manifests.
    const laneMap = aggregateReconstructLanes(workerManifestsWithWorktree);
    if (laneMap.size === 0) {
      failures.push(`${cand.batchId}: no lane numbers in manifests`);
      continue;
    }

    const tasks = buildReconstructTaskRecords(workerManifestsWithWorktree, laneMap, meta.wavePlan);
    const lanes = buildReconstructLaneRecords(laneMap, meta.orchBranch);

    const reconstructed: PersistedBatchState = {
      schemaVersion: BATCH_STATE_SCHEMA_VERSION,
      batchId: meta.batchId,
      phase: "stopped",
      baseBranch: meta.baseBranch,
      orchBranch: meta.orchBranch,
      mode: meta.mode,
      startedAt: meta.startedAt,
      endedAt: null,
      updatedAt: Date.now(),
      currentWaveIndex: 0,
      totalWaves: meta.totalWaves,
      totalTasks: tasks.length,
      succeededTasks: 0,
      failedTasks: 0,
      skippedTasks: 0,
      blockedTasks: 0,
      wavePlan: meta.wavePlan.map((wave) => [...wave]),
      lanes,
      tasks,
      mergeResults: [],
      blockedTaskIds: [],
      errors: [],
      segments: [],
      lastError: null,
      resilience: { ...defaultResilienceState(), resumeForced: true },
      diagnostics: defaultBatchDiagnostics(),
    } as PersistedBatchState;

    // Validate the reconstructed shape against the on-disk schema gate, using
    // a JSON round-trip so the validator sees what would be read from disk.
    try {
      validatePersistedState(JSON.parse(JSON.stringify(reconstructed)));
    } catch (err) {
      failures.push(
        `${cand.batchId}: reconstructed state failed validation: ${err instanceof Error ? err.message : String(err)}`,
      );
      continue;
    }

    const totalCandidates = candidates.length;
    const selectionNote =
      totalCandidates === 1
        ? `single batch in .pi/runtime/`
        : `selected from ${totalCandidates} candidate(s) by mtime newest-first (skipped ${idx} earlier candidate(s))`;
    return { ok: true, state: reconstructed, batchId: meta.batchId, selectionNote };
  }

  return {
    ok: false,
    error: `no reconstructable batch found in .pi/runtime/ (${failures.length} candidate(s) inspected: ${failures.slice(0, 3).join("; ")}${failures.length > 3 ? "; ..." : ""})`,
  };
}