@pi-agents/orchid 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/agents/AGENTS-MANIFEST.md +42 -0
- package/agents/brain.md +42 -0
- package/agents/context-builder.md +46 -0
- package/agents/delegate.md +12 -0
- package/agents/dev-1.md +42 -0
- package/agents/oracle.md +73 -0
- package/agents/planner.md +55 -0
- package/agents/researcher.md +52 -0
- package/agents/reviewer.md +79 -0
- package/agents/scout.md +50 -0
- package/agents/tester.md +45 -0
- package/agents/worker.md +55 -0
- package/extensions/ralph.ts +1 -0
- package/extensions/reviewer-extension.ts +125 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/package.json +63 -0
- package/prompts/gather-context-and-clarify.md +13 -0
- package/prompts/parallel-cleanup.md +59 -0
- package/prompts/parallel-context-build.md +53 -0
- package/prompts/parallel-handoff-plan.md +59 -0
- package/prompts/parallel-research.md +50 -0
- package/prompts/parallel-review.md +54 -0
- package/prompts/review-loop.md +41 -0
- package/skills/orchid/SKILL.md +214 -0
- package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
- package/skills/orchid/orchid-converge/SKILL.md +124 -0
- package/skills/orchid/orchid-decompose/SKILL.md +201 -0
- package/skills/orchid/orchid-doctor/SKILL.md +162 -0
- package/skills/orchid/orchid-investigate/SKILL.md +102 -0
- package/skills/orchid/orchid-launch/SKILL.md +147 -0
- package/skills/ralph/SKILL.md +73 -0
- package/skills/subagents/pi-subagents/SKILL.md +813 -0
- package/src/index.ts +7 -0
- package/src/orchestrator/abort.ts +534 -0
- package/src/orchestrator/agent-bridge-extension.ts +1020 -0
- package/src/orchestrator/agent-host.ts +954 -0
- package/src/orchestrator/cleanup.ts +776 -0
- package/src/orchestrator/config-loader.ts +1412 -0
- package/src/orchestrator/config-schema.ts +690 -0
- package/src/orchestrator/config.ts +81 -0
- package/src/orchestrator/context-window.ts +66 -0
- package/src/orchestrator/diagnostic-reports.ts +475 -0
- package/src/orchestrator/diagnostics.ts +394 -0
- package/src/orchestrator/discovery.ts +1833 -0
- package/src/orchestrator/engine-worker.ts +415 -0
- package/src/orchestrator/engine.ts +5940 -0
- package/src/orchestrator/execution.ts +3104 -0
- package/src/orchestrator/extension.ts +5934 -0
- package/src/orchestrator/formatting.ts +785 -0
- package/src/orchestrator/git.ts +88 -0
- package/src/orchestrator/index.ts +28 -0
- package/src/orchestrator/lane-runner.ts +1787 -0
- package/src/orchestrator/mailbox.ts +780 -0
- package/src/orchestrator/merge.ts +3414 -0
- package/src/orchestrator/messages.ts +1062 -0
- package/src/orchestrator/migrations.ts +278 -0
- package/src/orchestrator/naming.ts +117 -0
- package/src/orchestrator/path-resolver.ts +275 -0
- package/src/orchestrator/persistence.ts +2625 -0
- package/src/orchestrator/process-registry.ts +452 -0
- package/src/orchestrator/quality-gate.ts +1085 -0
- package/src/orchestrator/resume.ts +3488 -0
- package/src/orchestrator/sessions.ts +57 -0
- package/src/orchestrator/settings-loader.ts +136 -0
- package/src/orchestrator/settings-tui.ts +2208 -0
- package/src/orchestrator/sidecar-telemetry.ts +267 -0
- package/src/orchestrator/supervisor.ts +4548 -0
- package/src/orchestrator/task-executor-core.ts +675 -0
- package/src/orchestrator/tmux-compat.ts +37 -0
- package/src/orchestrator/tool-allowlist-constants.ts +37 -0
- package/src/orchestrator/types.ts +4465 -0
- package/src/orchestrator/verification.ts +547 -0
- package/src/orchestrator/waves.ts +1564 -0
- package/src/orchestrator/workspace.ts +707 -0
- package/src/orchestrator/worktree.ts +2725 -0
- package/src/ralph/index.ts +825 -0
- package/src/subagents/agents/agent-management.ts +648 -0
- package/src/subagents/agents/agent-scope.ts +6 -0
- package/src/subagents/agents/agent-selection.ts +23 -0
- package/src/subagents/agents/agent-serializer.ts +86 -0
- package/src/subagents/agents/agents.ts +832 -0
- package/src/subagents/agents/chain-serializer.ts +137 -0
- package/src/subagents/agents/frontmatter.ts +29 -0
- package/src/subagents/agents/identity.ts +30 -0
- package/src/subagents/agents/skills.ts +632 -0
- package/src/subagents/extension/config.ts +16 -0
- package/src/subagents/extension/control-notices.ts +92 -0
- package/src/subagents/extension/doctor.ts +199 -0
- package/src/subagents/extension/fanout-child.ts +170 -0
- package/src/subagents/extension/index.ts +573 -0
- package/src/subagents/extension/schemas.ts +168 -0
- package/src/subagents/intercom/intercom-bridge.ts +379 -0
- package/src/subagents/intercom/result-intercom.ts +377 -0
- package/src/subagents/runs/background/async-execution.ts +712 -0
- package/src/subagents/runs/background/async-job-tracker.ts +310 -0
- package/src/subagents/runs/background/async-resume.ts +345 -0
- package/src/subagents/runs/background/async-status.ts +325 -0
- package/src/subagents/runs/background/completion-dedupe.ts +63 -0
- package/src/subagents/runs/background/notify.ts +108 -0
- package/src/subagents/runs/background/parallel-groups.ts +45 -0
- package/src/subagents/runs/background/result-watcher.ts +307 -0
- package/src/subagents/runs/background/run-id-resolver.ts +83 -0
- package/src/subagents/runs/background/run-status.ts +269 -0
- package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
- package/src/subagents/runs/background/subagent-runner.ts +1808 -0
- package/src/subagents/runs/background/top-level-async.ts +13 -0
- package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
- package/src/subagents/runs/foreground/chain-execution.ts +938 -0
- package/src/subagents/runs/foreground/execution.ts +918 -0
- package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
- package/src/subagents/runs/shared/completion-guard.ts +147 -0
- package/src/subagents/runs/shared/long-running-guard.ts +175 -0
- package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
- package/src/subagents/runs/shared/model-fallback.ts +103 -0
- package/src/subagents/runs/shared/nested-events.ts +819 -0
- package/src/subagents/runs/shared/nested-path.ts +52 -0
- package/src/subagents/runs/shared/nested-render.ts +115 -0
- package/src/subagents/runs/shared/parallel-utils.ts +109 -0
- package/src/subagents/runs/shared/pi-args.ts +220 -0
- package/src/subagents/runs/shared/pi-spawn.ts +115 -0
- package/src/subagents/runs/shared/run-history.ts +60 -0
- package/src/subagents/runs/shared/single-output.ts +164 -0
- package/src/subagents/runs/shared/subagent-control.ts +226 -0
- package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
- package/src/subagents/runs/shared/worktree.ts +577 -0
- package/src/subagents/shared/artifacts.ts +98 -0
- package/src/subagents/shared/atomic-json.ts +16 -0
- package/src/subagents/shared/file-coalescer.ts +40 -0
- package/src/subagents/shared/fork-context.ts +76 -0
- package/src/subagents/shared/formatters.ts +133 -0
- package/src/subagents/shared/jsonl-writer.ts +81 -0
- package/src/subagents/shared/model-info.ts +78 -0
- package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
- package/src/subagents/shared/session-identity.ts +10 -0
- package/src/subagents/shared/session-tokens.ts +44 -0
- package/src/subagents/shared/settings.ts +397 -0
- package/src/subagents/shared/status-format.ts +49 -0
- package/src/subagents/shared/types.ts +822 -0
- package/src/subagents/shared/utils.ts +450 -0
- package/src/subagents/slash/prompt-template-bridge.ts +397 -0
- package/src/subagents/slash/slash-bridge.ts +174 -0
- package/src/subagents/slash/slash-commands.ts +528 -0
- package/src/subagents/slash/slash-live-state.ts +292 -0
- package/src/subagents/tui/render-helpers.ts +80 -0
- package/src/subagents/tui/render.ts +1358 -0
- package/templates/agents/local/supervisor.md +33 -0
- package/templates/agents/local/task-merger.md +27 -0
- package/templates/agents/local/task-reviewer.md +30 -0
- package/templates/agents/local/task-worker.md +34 -0
- package/templates/agents/supervisor-routing.md +92 -0
- package/templates/agents/supervisor.md +229 -0
- package/templates/agents/task-merger.md +214 -0
- package/templates/agents/task-reviewer.md +260 -0
- package/templates/agents/task-worker-segment.md +44 -0
- package/templates/agents/task-worker.md +557 -0
- package/templates/tasks/CONTEXT.md +30 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
|
@@ -0,0 +1,2625 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* State persistence, serialization, orphan detection
|
|
3
|
+
* @module orch/persistence
|
|
4
|
+
*/
|
|
5
|
+
import {
|
|
6
|
+
readFileSync,
|
|
7
|
+
writeFileSync,
|
|
8
|
+
existsSync,
|
|
9
|
+
unlinkSync,
|
|
10
|
+
renameSync,
|
|
11
|
+
mkdirSync,
|
|
12
|
+
appendFileSync,
|
|
13
|
+
readdirSync,
|
|
14
|
+
statSync,
|
|
15
|
+
} from "fs";
|
|
16
|
+
import { join, dirname, basename } from "path";
|
|
17
|
+
|
|
18
|
+
import { execLog } from "./execution.ts";
|
|
19
|
+
import {
|
|
20
|
+
BATCH_STATE_SCHEMA_VERSION,
|
|
21
|
+
StateFileError,
|
|
22
|
+
batchStatePath,
|
|
23
|
+
BATCH_HISTORY_MAX_ENTRIES,
|
|
24
|
+
defaultResilienceState,
|
|
25
|
+
defaultBatchDiagnostics,
|
|
26
|
+
runtimeRoot,
|
|
27
|
+
runtimeManifestPath,
|
|
28
|
+
} from "./types.ts";
|
|
29
|
+
import type { BatchHistorySummary, RuntimeAgentManifest } from "./types.ts";
|
|
30
|
+
import type {
|
|
31
|
+
AllocatedLane,
|
|
32
|
+
DiscoveryResult,
|
|
33
|
+
EngineEvent,
|
|
34
|
+
EscalationContext,
|
|
35
|
+
LaneTaskOutcome,
|
|
36
|
+
LaneTaskStatus,
|
|
37
|
+
MonitorState,
|
|
38
|
+
OrchBatchPhase,
|
|
39
|
+
OrchBatchRuntimeState,
|
|
40
|
+
PersistedBatchState,
|
|
41
|
+
PersistedLaneRecord,
|
|
42
|
+
PersistedMergeResult,
|
|
43
|
+
PersistedSegmentRecord,
|
|
44
|
+
PersistedTaskRecord,
|
|
45
|
+
TaskMonitorSnapshot,
|
|
46
|
+
Tier0RecoveryPattern,
|
|
47
|
+
WorkspaceMode,
|
|
48
|
+
} from "./types.ts";
|
|
49
|
+
import { sleepSync } from "./worktree.ts";
|
|
50
|
+
import type { PreserveFailedLaneProgressResult } from "./worktree.ts";
|
|
51
|
+
import { normalizeLaneSessionAlias, readLaneSessionAliases } from "./tmux-compat.ts";
|
|
52
|
+
|
|
53
|
+
// ── State Persistence Helper (TS-009 Step 2) ────────────────────────
|
|
54
|
+
|
|
55
|
+
/**
 * List every path at which a task's `.DONE` marker may live.
 *
 * The task-runner archives a finished task by moving its folder from
 * `tasks/<task-folder>/` to `tasks/archive/<task-folder>/`, so resume and
 * orphan detection have to probe both the active and the archived location.
 *
 * @param taskFolder - Path of the task folder (active or already archived)
 * @returns The in-place marker path, followed by the archived marker path
 *          unless the folder's parent directory is already named "archive"
 */
export function getTaskDoneFileCandidates(taskFolder: string): string[] {
	const inPlaceMarker = join(taskFolder, ".DONE");
	const containerDir = dirname(taskFolder);

	// A folder that already sits under archive/ has only one possible marker,
	// so do not emit a duplicate candidate for it.
	const alreadyArchived = basename(containerDir).toLowerCase() === "archive";
	if (alreadyArchived) {
		return [inPlaceMarker];
	}

	const archivedMarker = join(containerDir, "archive", basename(taskFolder), ".DONE");
	return [inPlaceMarker, archivedMarker];
}
|
|
75
|
+
|
|
76
|
+
/**
 * Report whether a `.DONE` marker exists for the task, looking at every
 * candidate path returned by `getTaskDoneFileCandidates()` (active location
 * first, then the archived one).
 *
 * @param taskFolder - Path of the task folder
 * @returns `true` as soon as one candidate marker exists, otherwise `false`
 */
export function hasTaskDoneMarker(taskFolder: string): boolean {
	return getTaskDoneFileCandidates(taskFolder).some((markerPath) => {
		try {
			return existsSync(markerPath);
		} catch {
			// Ignore filesystem errors here; caller handles partial visibility.
			return false;
		}
	});
}
|
|
89
|
+
|
|
90
|
+
/**
 * Field-by-field equality check for the optional telemetry embedded in a
 * task outcome.
 *
 * Two absent values are equal; one absent and one present are not. When both
 * are present, every numeric counter must match exactly.
 */
function sameOutcomeTelemetry(
	a: LaneTaskOutcome["telemetry"],
	b: LaneTaskOutcome["telemetry"],
): boolean {
	if (!a || !b) {
		// Equal only when both sides are missing.
		return !a && !b;
	}

	const left = a;
	const right = b;
	const counters = [
		"inputTokens",
		"outputTokens",
		"cacheReadTokens",
		"cacheWriteTokens",
		"costUsd",
		"toolCalls",
		"durationMs",
	] as const;
	return counters.every((field) => left[field] === right[field]);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Insert or replace the outcome for `next.taskId` inside `outcomes`.
 *
 * A new task ID is appended. For a known task ID the stored record is
 * replaced only when at least one compared field differs; `laneNumber` and
 * `telemetry` fall back to the stored values when `next` does not carry them.
 *
 * @param outcomes - Outcome list, mutated in place
 * @param next - Outcome to insert or merge over the stored record
 * @returns `true` if the list was modified, `false` if nothing changed
 */
export function upsertTaskOutcome(outcomes: LaneTaskOutcome[], next: LaneTaskOutcome): boolean {
	const slot = outcomes.findIndex((entry) => entry.taskId === next.taskId);
	if (slot === -1) {
		outcomes.push(next);
		return true;
	}

	const current = outcomes[slot];
	// Lane assignment and telemetry are kept from the stored record when the
	// incoming outcome leaves them unset.
	const candidate: LaneTaskOutcome = {
		...next,
		laneNumber: next.laneNumber ?? current.laneNumber,
		telemetry: next.telemetry ?? current.telemetry,
	};

	const comparedFields = [
		"status",
		"startTime",
		"endTime",
		"exitReason",
		"sessionName",
		"doneFileFound",
		"laneNumber",
		"partialProgressCommits",
		"partialProgressBranch",
		"exitDiagnostic",
	] as const;
	const differs =
		comparedFields.some((field) => current[field] !== candidate[field]) ||
		!sameOutcomeTelemetry(current.telemetry, candidate.telemetry);

	if (!differs) {
		return false;
	}
	outcomes[slot] = candidate;
	return true;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Stamp partial-progress preservation results onto task outcomes (TP-028).
 *
 * Call this after `preserveFailedLaneProgress()` so that every task whose
 * work was saved records the saved branch name and commit count. The data
 * then reaches persistence and diagnostics through the regular
 * outcome → serialization path.
 *
 * Results that were not saved, have no saved branch, or name a task with no
 * matching outcome are ignored.
 *
 * @param ppResult - Result from `preserveFailedLaneProgress()`
 * @param outcomes - Mutable array of task outcomes to update in-place
 * @returns Number of outcomes that were updated
 */
export function applyPartialProgressToOutcomes(
	ppResult: PreserveFailedLaneProgressResult,
	outcomes: LaneTaskOutcome[],
): number {
	let stamped = 0;

	for (const entry of ppResult.results) {
		if (!(entry.saved && entry.savedBranch)) {
			continue;
		}

		const target = outcomes.find((candidate) => candidate.taskId === entry.taskId);
		if (!target) {
			continue;
		}

		target.partialProgressCommits = entry.commitCount;
		target.partialProgressBranch = entry.savedBranch;
		stamped += 1;
	}

	return stamped;
}
|
|
174
|
+
|
|
175
|
+
/**
 * Register a "pending" outcome for every task of the newly allocated lanes
 * that is not tracked yet.
 *
 * This gives the persisted state a complete task registry — including the
 * lane and session each task was assigned to — from the moment a wave
 * starts, before any task has finished. Tasks that already have an outcome
 * are left untouched.
 *
 * @param lanes - Lanes allocated for the wave
 * @param outcomes - Outcome list, mutated in place
 * @returns `true` if at least one outcome was added
 */
export function seedPendingOutcomesForAllocatedLanes(
	lanes: AllocatedLane[],
	outcomes: LaneTaskOutcome[],
): boolean {
	let seeded = false;

	for (const lane of lanes) {
		for (const laneTask of lane.tasks) {
			const alreadyTracked = outcomes.some((entry) => entry.taskId === laneTask.taskId);
			if (alreadyTracked) {
				continue;
			}

			const placeholder: LaneTaskOutcome = {
				taskId: laneTask.taskId,
				status: "pending",
				startTime: null,
				endTime: null,
				exitReason: "Pending execution",
				sessionName: lane.laneSessionId,
				doneFileFound: false,
				laneNumber: lane.laneNumber,
			};
			if (upsertTaskOutcome(outcomes, placeholder)) {
				seeded = true;
			}
		}
	}

	return seeded;
}
|
|
205
|
+
|
|
206
|
+
/**
 * Bring the accumulated task outcomes in line with the latest monitor
 * snapshot.
 *
 * In-wave transitions (pending → running → terminal) are recorded as they
 * are observed, so persisted state does not lag until the wave completes.
 * For each lane the function processes, in order: remaining tasks, completed
 * tasks, failed tasks, and finally the current task snapshot.
 *
 * @param monitorState - Latest monitor state (lanes plus last poll time)
 * @param outcomes - Outcome list, mutated in place
 * @returns `true` if any outcome was added or modified
 */
export function syncTaskOutcomesFromMonitor(
	monitorState: MonitorState,
	outcomes: LaneTaskOutcome[],
): boolean {
	let dirty = false;

	const priorFor = (taskId: string): LaneTaskOutcome | undefined =>
		outcomes.find((entry) => entry.taskId === taskId);

	for (const lane of monitorState.lanes) {
		// Build the outcome for one task of this lane and upsert it. Session,
		// lane number, telemetry, partial-progress data and exit diagnostic are
		// always carried over from the previously stored outcome when present.
		const record = (
			prior: LaneTaskOutcome | undefined,
			core: Pick<
				LaneTaskOutcome,
				"taskId" | "status" | "startTime" | "endTime" | "exitReason" | "doneFileFound"
			>,
		): void => {
			const candidate: LaneTaskOutcome = {
				taskId: core.taskId,
				status: core.status,
				startTime: core.startTime,
				endTime: core.endTime,
				exitReason: core.exitReason,
				sessionName: prior?.sessionName || lane.sessionName,
				doneFileFound: core.doneFileFound,
				laneNumber: prior?.laneNumber ?? lane.laneNumber,
				telemetry: prior?.telemetry,
				partialProgressCommits: prior?.partialProgressCommits,
				partialProgressBranch: prior?.partialProgressBranch,
				exitDiagnostic: prior?.exitDiagnostic,
			};
			if (upsertTaskOutcome(outcomes, candidate)) {
				dirty = true;
			}
		};

		// Remaining tasks => pending, unless the stored outcome is already
		// succeeded, failed or stalled.
		for (const taskId of lane.remainingTasks) {
			const prior = priorFor(taskId);
			if (prior && ["succeeded", "failed", "stalled"].includes(prior.status)) {
				continue;
			}
			record(prior, {
				taskId,
				status: "pending",
				startTime: prior?.startTime ?? null,
				endTime: null,
				exitReason: prior?.exitReason || "Pending execution",
				doneFileFound: false,
			});
		}

		// Completed tasks => succeeded.
		// An endTime that is already stored is reused: lastPollTime differs on
		// every poll tick, so overwriting it would report a change each tick
		// and spam the persist log.
		for (const taskId of lane.completedTasks) {
			const prior = priorFor(taskId);
			record(prior, {
				taskId,
				status: "succeeded",
				startTime: prior?.startTime ?? null,
				endTime: prior?.endTime ?? monitorState.lastPollTime,
				exitReason: prior?.exitReason || ".DONE file created by task-runner",
				doneFileFound: true,
			});
		}

		// Failed tasks => failed.
		for (const taskId of lane.failedTasks) {
			const prior = priorFor(taskId);
			record(prior, {
				taskId,
				status: "failed",
				startTime: prior?.startTime ?? null,
				endTime: prior?.endTime ?? monitorState.lastPollTime,
				exitReason: prior?.exitReason || "Task failed or stalled",
				doneFileFound: false,
			});
		}

		// Current task snapshot => running/stalled/succeeded/failed/skipped.
		const activeTaskId = lane.currentTaskId;
		const snap = lane.currentTaskSnapshot;
		if (!activeTaskId || !snap) {
			continue;
		}

		const prior = priorFor(activeTaskId);
		// An "unknown" monitor status keeps whatever status is already stored,
		// or counts as running when nothing is stored yet.
		const statusTable: Record<TaskMonitorSnapshot["status"], LaneTaskStatus> = {
			pending: "pending",
			running: "running",
			succeeded: "succeeded",
			failed: "failed",
			stalled: "stalled",
			skipped: "skipped",
			unknown: prior?.status || "running",
		};
		const laneStatus = statusTable[snap.status];
		const reachedEnd = ["succeeded", "failed", "stalled", "skipped"].includes(laneStatus);

		let fallbackReason: string;
		if (laneStatus === "running") {
			fallbackReason = "Task in progress";
		} else {
			fallbackReason = snap.stallReason || "Task reached terminal state";
		}

		// TP-051: Use snap.observedAt (Date.now() from monitor poll) instead of
		// snap.lastHeartbeat (STATUS.md mtime) for task start time. The mtime
		// reflects when STATUS.md was last edited, which may be long before
		// actual execution started (e.g., during task staging).
		record(prior, {
			taskId: activeTaskId,
			status: laneStatus,
			startTime: prior?.startTime ?? snap.observedAt,
			endTime: reachedEnd ? (prior?.endTime ?? snap.observedAt) : null,
			exitReason: prior?.exitReason || fallbackReason,
			doneFileFound: snap.doneFileFound,
		});
	}

	return dirty;
}
|
|
337
|
+
|
|
338
|
+
/**
 * Write the current runtime state to `.pi/batch-state.json`.
 *
 * The state is serialized with `serializeBatchState()`. When a discovery
 * result is available, each task record that discovery knows about gets its
 * `taskFolder` set and any still-undefined repo/segment fields filled in.
 * The result is handed to `saveBatchState()`. Every write is logged with its
 * reason, batchId, phase and waveIndex.
 *
 * Failures are non-fatal: they are logged and appended to
 * `batchState.errors`, and the function returns normally so batch execution
 * continues.
 *
 * @param reason - Human-readable reason for this state write (e.g., "batch-start", "wave-index-change")
 * @param batchState - Current runtime batch state
 * @param wavePlan - Wave plan (array of arrays of task IDs)
 * @param lanes - Currently allocated lanes (latest wave's lanes)
 * @param allTaskOutcomes - All task outcomes accumulated across completed waves
 * @param discovery - Discovery result (for enriching taskFolder paths), or null to skip enrichment
 * @param repoRoot - Absolute path to the repository root
 */
export function persistRuntimeState(
	reason: string,
	batchState: OrchBatchRuntimeState,
	wavePlan: string[][],
	lanes: AllocatedLane[],
	allTaskOutcomes: LaneTaskOutcome[],
	discovery: DiscoveryResult | null,
	repoRoot: string,
): void {
	try {
		let payload = serializeBatchState(batchState, wavePlan, lanes, allTaskOutcomes);

		// Enrich task records with folder paths and repo fields from discovery.
		if (discovery) {
			const snapshot = JSON.parse(payload) as PersistedBatchState;

			for (const taskRecord of snapshot.tasks) {
				const discovered = discovery.pending.get(taskRecord.taskId);
				if (!discovered) {
					continue;
				}

				taskRecord.taskFolder = discovered.taskFolder;

				// v2: Enrich repo fields for tasks not yet allocated (pending in future waves).
				if (taskRecord.repoId === undefined && discovered.promptRepoId !== undefined) {
					taskRecord.repoId = discovered.promptRepoId;
				}
				if (taskRecord.resolvedRepoId === undefined && discovered.resolvedRepoId !== undefined) {
					taskRecord.resolvedRepoId = discovered.resolvedRepoId;
				}

				// Packet/segment fields are written through an untyped view of
				// the record; each is copied only while it is still undefined.
				const looseRecord = taskRecord as any;
				const segmentFields = [
					"packetRepoId",
					"packetTaskPath",
					"segmentIds",
					"activeSegmentId",
				] as const;
				for (const field of segmentFields) {
					if (looseRecord[field] === undefined && discovered[field] !== undefined) {
						looseRecord[field] = discovered[field];
					}
				}
			}

			payload = JSON.stringify(snapshot, null, 2);
		}

		saveBatchState(payload, repoRoot);

		execLog("state", batchState.batchId, `persisted: ${reason}`, {
			phase: batchState.phase,
			waveIndex: batchState.currentWaveIndex,
		});
	} catch (err: unknown) {
		let msg: string;
		if (err instanceof StateFileError) {
			msg = `[${err.code}] ${err.message}`;
		} else if (err instanceof Error) {
			msg = err.message;
		} else {
			msg = String(err);
		}

		execLog("state", batchState.batchId, `write failed: ${msg}`, {
			reason,
			phase: batchState.phase,
		});
		batchState.errors.push(`State persistence failed (${reason}): ${msg}`);
	}
}
|
|
426
|
+
|
|
427
|
+
// ── State Validation ─────────────────────────────────────────────────
|
|
428
|
+
|
|
429
|
+
/** Phase names accepted for a persisted batch (the OrchBatchPhase values). */
export const VALID_BATCH_PHASES: ReadonlySet<string> = new Set<string>([
	"idle",
	"launching",
	"planning",
	"executing",
	"merging",
	"paused",
	"stopped",
	"completed",
	"failed",
]);

/** Status names accepted for a persisted task (the LaneTaskStatus values). */
export const VALID_TASK_STATUSES: ReadonlySet<string> = new Set<string>([
	"pending",
	"running",
	"succeeded",
	"failed",
	"stalled",
	"skipped",
]);

/** Status names accepted for a merge result stored in persisted state. */
export const VALID_PERSISTED_MERGE_STATUSES: ReadonlySet<string> = new Set<string>([
	"succeeded",
	"failed",
	"partial",
]);
|
|
458
|
+
|
|
459
|
+
/**
 * Migrate a v1 state object to v2, in memory only.
 *
 * `validatePersistedState()` applies this automatically when it loads a v1
 * file. The file on disk is NOT rewritten.
 *
 * What changes:
 *   - `schemaVersion` becomes 2
 *   - `baseBranch` is set to "" when missing or falsy
 *   - `mode` is set to "repo" when missing or falsy (v1 was always single-repo)
 *
 * What is left alone:
 *   - `tasks[].repoId` / `tasks[].resolvedRepoId` stay undefined (repo mode
 *     has no repo routing)
 *   - `lanes[].repoId` is preserved if present (it was already serialized in
 *     v1 when workspace mode was partially implemented)
 *
 * Idempotent: an object whose `schemaVersion` is already 2 or higher is
 * returned untouched.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV1toV2(obj: Record<string, unknown>): void {
	const version = obj.schemaVersion as number;
	if (version >= 2) {
		return;
	}

	obj.schemaVersion = 2;

	// Task and lane records need no mutation: the optional v2 fields are simply
	// absent, which is already how they look in a v1 object.
	const topLevelDefaults = [
		["baseBranch", ""],
		["mode", "repo"],
	] as const;
	for (const [field, fallback] of topLevelDefaults) {
		if (!obj[field]) {
			obj[field] = fallback;
		}
	}
}
|
|
486
|
+
|
|
487
|
+
/**
 * Migrate a v2 state object to v3 by adding the resilience and diagnostics
 * sections.
 *
 * Added when missing:
 *   - `resilience`: the value of `defaultResilienceState()`
 *   - `diagnostics`: the value of `defaultBatchDiagnostics()`
 *
 * Idempotent: an object whose `schemaVersion` is already 3 or higher is
 * returned untouched.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV2toV3(obj: Record<string, unknown>): void {
	const version = obj.schemaVersion as number;
	if (version >= 3) {
		return;
	}

	obj.schemaVersion = 3;

	// Defaults are backfilled only on this genuine v1/v2→v3 migration path.
	// A native v3 file that lacks these sections is malformed and must be
	// rejected by validation — not silently patched.
	if (!obj.resilience) {
		obj.resilience = defaultResilienceState();
	}
	if (!obj.diagnostics) {
		obj.diagnostics = defaultBatchDiagnostics();
	}
}
|
|
508
|
+
|
|
509
|
+
/**
 * Migrate a v3 state object to v4 by adding the `segments` array.
 *
 * Added when missing:
 *   - `segments`: an empty array (pre-v4 state has no segment records)
 *
 * The optional task-level segment fields (`packetRepoId`, `packetTaskPath`,
 * `segmentIds`, `activeSegmentId`) are NOT backfilled: their values come
 * from runtime discovery, not from migration defaults, so they stay
 * undefined (omitted from JSON).
 *
 * Idempotent: an object whose `schemaVersion` is already 4 or higher is
 * returned untouched.
 *
 * @param obj - Parsed state object (mutated in-place)
 */
export function upconvertV3toV4(obj: Record<string, unknown>): void {
	const version = obj.schemaVersion as number;
	if (version >= 4) {
		return;
	}

	obj.schemaVersion = 4;

	// The empty array is backfilled only on this genuine v3→v4 migration path.
	if (!obj.segments) {
		obj.segments = [];
	}
}
|
|
531
|
+
|
|
532
|
+
/**
|
|
533
|
+
* Validate a parsed JSON object as a PersistedBatchState.
|
|
534
|
+
*
|
|
535
|
+
* Checks:
|
|
536
|
+
* 1. Schema version is 1, 2, or 3 (each auto-upconverted in memory to v4) or 4 (current)
|
|
537
|
+
* 2. All required fields are present with correct types
|
|
538
|
+
* 3. Enum fields contain valid values (phase, task statuses, merge statuses)
|
|
539
|
+
* 4. Arrays contain valid sub-records
|
|
540
|
+
* 5. v2 optional fields (repoId, resolvedRepoId, mode) are valid when present
|
|
541
|
+
*
|
|
542
|
+
* @param data - Parsed JSON (unknown type)
|
|
543
|
+
* @returns Validated PersistedBatchState (always at the current schema version, v4, even if input was v1/v2/v3)
|
|
544
|
+
* @throws StateFileError with STATE_SCHEMA_INVALID on any validation failure
|
|
545
|
+
*/
|
|
546
|
+
export function validatePersistedState(data: unknown): PersistedBatchState {
|
|
547
|
+
if (!data || typeof data !== "object") {
|
|
548
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", "Batch state must be a non-null object");
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
const obj = data as Record<string, unknown>;
|
|
552
|
+
|
|
553
|
+
// ── Schema version ───────────────────────────────────────────
|
|
554
|
+
if (typeof obj.schemaVersion !== "number") {
|
|
555
|
+
throw new StateFileError(
|
|
556
|
+
"STATE_SCHEMA_INVALID",
|
|
557
|
+
`Missing or invalid "schemaVersion" field (expected number, got ${typeof obj.schemaVersion})`,
|
|
558
|
+
);
|
|
559
|
+
}
|
|
560
|
+
// Accept v1 (auto-upconvert to v2→v3→v4), v2 (upconvert to v3→v4), v3 (upconvert to v4), and v4 (current).
|
|
561
|
+
// Reject anything else — including future versions from newer runtimes.
|
|
562
|
+
const ACCEPTED_VERSIONS = [1, 2, 3, BATCH_STATE_SCHEMA_VERSION];
|
|
563
|
+
if (!ACCEPTED_VERSIONS.includes(obj.schemaVersion as number)) {
|
|
564
|
+
throw new StateFileError(
|
|
565
|
+
"STATE_SCHEMA_INVALID",
|
|
566
|
+
`Unsupported schema version ${obj.schemaVersion} (expected ${BATCH_STATE_SCHEMA_VERSION}). ` +
|
|
567
|
+
`Upgrade orchid to a version that supports schema v${obj.schemaVersion}, ` +
|
|
568
|
+
`or delete .pi/batch-state.json and re-run the batch.`,
|
|
569
|
+
);
|
|
570
|
+
}
|
|
571
|
+
const isV1 = obj.schemaVersion === 1;
|
|
572
|
+
|
|
573
|
+
// ── Required string fields ───────────────────────────────────
|
|
574
|
+
for (const field of ["phase", "batchId"] as const) {
|
|
575
|
+
if (typeof obj[field] !== "string") {
|
|
576
|
+
throw new StateFileError(
|
|
577
|
+
"STATE_SCHEMA_INVALID",
|
|
578
|
+
`Missing or invalid "${field}" field (expected string, got ${typeof obj[field]})`,
|
|
579
|
+
);
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
// ── Optional string fields (backward-compatible) ─────────────
|
|
584
|
+
// baseBranch was added after schema v1; default to empty string if missing
|
|
585
|
+
if (obj.baseBranch !== undefined && typeof obj.baseBranch !== "string") {
|
|
586
|
+
throw new StateFileError(
|
|
587
|
+
"STATE_SCHEMA_INVALID",
|
|
588
|
+
`Invalid "baseBranch" field (expected string, got ${typeof obj.baseBranch})`,
|
|
589
|
+
);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
// ── Optional string fields: orchBranch ───────────────────────
|
|
593
|
+
// orchBranch was added after schema v2 shipped; default to "" if missing.
|
|
594
|
+
if (obj.orchBranch !== undefined && typeof obj.orchBranch !== "string") {
|
|
595
|
+
throw new StateFileError(
|
|
596
|
+
"STATE_SCHEMA_INVALID",
|
|
597
|
+
`Invalid "orchBranch" field (expected string, got ${typeof obj.orchBranch})`,
|
|
598
|
+
);
|
|
599
|
+
}
|
|
600
|
+
if (obj.orchBranch === undefined) {
|
|
601
|
+
obj.orchBranch = "";
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
// ── v2: mode field ───────────────────────────────────────────
|
|
605
|
+
// mode is required in v2, absent in v1 (defaults to "repo" via upconvert).
|
|
606
|
+
if (!isV1 && obj.mode === undefined) {
|
|
607
|
+
throw new StateFileError(
|
|
608
|
+
"STATE_SCHEMA_INVALID",
|
|
609
|
+
`Missing required "mode" field in schema v2 (expected "repo" or "workspace")`,
|
|
610
|
+
);
|
|
611
|
+
}
|
|
612
|
+
if (obj.mode !== undefined && typeof obj.mode !== "string") {
|
|
613
|
+
throw new StateFileError(
|
|
614
|
+
"STATE_SCHEMA_INVALID",
|
|
615
|
+
`Invalid "mode" field (expected string, got ${typeof obj.mode})`,
|
|
616
|
+
);
|
|
617
|
+
}
|
|
618
|
+
if (obj.mode !== undefined && obj.mode !== "repo" && obj.mode !== "workspace") {
|
|
619
|
+
throw new StateFileError(
|
|
620
|
+
"STATE_SCHEMA_INVALID",
|
|
621
|
+
`Invalid "mode" value "${obj.mode}" (expected "repo" or "workspace")`,
|
|
622
|
+
);
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
// ── Phase enum validation ────────────────────────────────────
|
|
626
|
+
if (!VALID_BATCH_PHASES.has(obj.phase as string)) {
|
|
627
|
+
throw new StateFileError(
|
|
628
|
+
"STATE_SCHEMA_INVALID",
|
|
629
|
+
`Invalid "phase" value "${obj.phase}" (expected one of: ${[...VALID_BATCH_PHASES].join(", ")})`,
|
|
630
|
+
);
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
// ── Required number fields ───────────────────────────────────
|
|
634
|
+
for (const field of [
|
|
635
|
+
"startedAt",
|
|
636
|
+
"updatedAt",
|
|
637
|
+
"currentWaveIndex",
|
|
638
|
+
"totalWaves",
|
|
639
|
+
"totalTasks",
|
|
640
|
+
"succeededTasks",
|
|
641
|
+
"failedTasks",
|
|
642
|
+
"skippedTasks",
|
|
643
|
+
"blockedTasks",
|
|
644
|
+
] as const) {
|
|
645
|
+
if (typeof obj[field] !== "number") {
|
|
646
|
+
throw new StateFileError(
|
|
647
|
+
"STATE_SCHEMA_INVALID",
|
|
648
|
+
`Missing or invalid "${field}" field (expected number, got ${typeof obj[field]})`,
|
|
649
|
+
);
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
// ── Nullable number: endedAt ─────────────────────────────────
|
|
654
|
+
if (obj.endedAt !== null && typeof obj.endedAt !== "number") {
|
|
655
|
+
throw new StateFileError(
|
|
656
|
+
"STATE_SCHEMA_INVALID",
|
|
657
|
+
`Invalid "endedAt" field (expected number or null, got ${typeof obj.endedAt})`,
|
|
658
|
+
);
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
// ── Required arrays ──────────────────────────────────────────
|
|
662
|
+
for (const field of [
|
|
663
|
+
"wavePlan",
|
|
664
|
+
"lanes",
|
|
665
|
+
"tasks",
|
|
666
|
+
"mergeResults",
|
|
667
|
+
"blockedTaskIds",
|
|
668
|
+
"errors",
|
|
669
|
+
] as const) {
|
|
670
|
+
if (!Array.isArray(obj[field])) {
|
|
671
|
+
throw new StateFileError(
|
|
672
|
+
"STATE_SCHEMA_INVALID",
|
|
673
|
+
`Missing or invalid "${field}" field (expected array, got ${typeof obj[field]})`,
|
|
674
|
+
);
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
// ── Validate wavePlan: array of arrays of strings ────────────
|
|
679
|
+
const wavePlan = obj.wavePlan as unknown[];
|
|
680
|
+
for (let i = 0; i < wavePlan.length; i++) {
|
|
681
|
+
if (!Array.isArray(wavePlan[i])) {
|
|
682
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `wavePlan[${i}] is not an array`);
|
|
683
|
+
}
|
|
684
|
+
for (const taskId of wavePlan[i] as unknown[]) {
|
|
685
|
+
if (typeof taskId !== "string") {
|
|
686
|
+
throw new StateFileError(
|
|
687
|
+
"STATE_SCHEMA_INVALID",
|
|
688
|
+
`wavePlan[${i}] contains non-string value: ${typeof taskId}`,
|
|
689
|
+
);
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
// ── Validate task records ────────────────────────────────────
|
|
695
|
+
const tasks = obj.tasks as unknown[];
|
|
696
|
+
for (let i = 0; i < tasks.length; i++) {
|
|
697
|
+
const t = tasks[i] as Record<string, unknown>;
|
|
698
|
+
if (!t || typeof t !== "object") {
|
|
699
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `tasks[${i}] is not an object`);
|
|
700
|
+
}
|
|
701
|
+
for (const field of ["taskId", "sessionName", "taskFolder", "exitReason"] as const) {
|
|
702
|
+
if (typeof t[field] !== "string") {
|
|
703
|
+
throw new StateFileError(
|
|
704
|
+
"STATE_SCHEMA_INVALID",
|
|
705
|
+
`tasks[${i}].${field} is missing or not a string`,
|
|
706
|
+
);
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
if (typeof t.laneNumber !== "number") {
|
|
710
|
+
throw new StateFileError(
|
|
711
|
+
"STATE_SCHEMA_INVALID",
|
|
712
|
+
`tasks[${i}].laneNumber is missing or not a number`,
|
|
713
|
+
);
|
|
714
|
+
}
|
|
715
|
+
if (typeof t.status !== "string" || !VALID_TASK_STATUSES.has(t.status)) {
|
|
716
|
+
throw new StateFileError(
|
|
717
|
+
"STATE_SCHEMA_INVALID",
|
|
718
|
+
`tasks[${i}].status is invalid: "${t.status}" (expected one of: ${[...VALID_TASK_STATUSES].join(", ")})`,
|
|
719
|
+
);
|
|
720
|
+
}
|
|
721
|
+
if (t.startedAt !== null && typeof t.startedAt !== "number") {
|
|
722
|
+
throw new StateFileError(
|
|
723
|
+
"STATE_SCHEMA_INVALID",
|
|
724
|
+
`tasks[${i}].startedAt is not a number or null`,
|
|
725
|
+
);
|
|
726
|
+
}
|
|
727
|
+
if (t.endedAt !== null && typeof t.endedAt !== "number") {
|
|
728
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `tasks[${i}].endedAt is not a number or null`);
|
|
729
|
+
}
|
|
730
|
+
if (typeof t.doneFileFound !== "boolean") {
|
|
731
|
+
throw new StateFileError(
|
|
732
|
+
"STATE_SCHEMA_INVALID",
|
|
733
|
+
`tasks[${i}].doneFileFound is missing or not a boolean`,
|
|
734
|
+
);
|
|
735
|
+
}
|
|
736
|
+
// v2 optional fields: repoId, resolvedRepoId (string | undefined)
|
|
737
|
+
if (t.repoId !== undefined && typeof t.repoId !== "string") {
|
|
738
|
+
throw new StateFileError(
|
|
739
|
+
"STATE_SCHEMA_INVALID",
|
|
740
|
+
`tasks[${i}].repoId is not a string (got ${typeof t.repoId})`,
|
|
741
|
+
);
|
|
742
|
+
}
|
|
743
|
+
if (t.resolvedRepoId !== undefined && typeof t.resolvedRepoId !== "string") {
|
|
744
|
+
throw new StateFileError(
|
|
745
|
+
"STATE_SCHEMA_INVALID",
|
|
746
|
+
`tasks[${i}].resolvedRepoId is not a string (got ${typeof t.resolvedRepoId})`,
|
|
747
|
+
);
|
|
748
|
+
}
|
|
749
|
+
// TP-028 optional fields: partialProgressCommits (number | undefined), partialProgressBranch (string | undefined)
|
|
750
|
+
if (t.partialProgressCommits !== undefined && typeof t.partialProgressCommits !== "number") {
|
|
751
|
+
throw new StateFileError(
|
|
752
|
+
"STATE_SCHEMA_INVALID",
|
|
753
|
+
`tasks[${i}].partialProgressCommits is not a number (got ${typeof t.partialProgressCommits})`,
|
|
754
|
+
);
|
|
755
|
+
}
|
|
756
|
+
if (t.partialProgressBranch !== undefined && typeof t.partialProgressBranch !== "string") {
|
|
757
|
+
throw new StateFileError(
|
|
758
|
+
"STATE_SCHEMA_INVALID",
|
|
759
|
+
`tasks[${i}].partialProgressBranch is not a string (got ${typeof t.partialProgressBranch})`,
|
|
760
|
+
);
|
|
761
|
+
}
|
|
762
|
+
// TP-026 optional field: exitDiagnostic (object with classification string | undefined)
|
|
763
|
+
if (t.exitDiagnostic !== undefined) {
|
|
764
|
+
if (
|
|
765
|
+
typeof t.exitDiagnostic !== "object" ||
|
|
766
|
+
t.exitDiagnostic === null ||
|
|
767
|
+
Array.isArray(t.exitDiagnostic)
|
|
768
|
+
) {
|
|
769
|
+
throw new StateFileError(
|
|
770
|
+
"STATE_SCHEMA_INVALID",
|
|
771
|
+
`tasks[${i}].exitDiagnostic is not a plain object (got ${Array.isArray(t.exitDiagnostic) ? "array" : typeof t.exitDiagnostic})`,
|
|
772
|
+
);
|
|
773
|
+
}
|
|
774
|
+
if (typeof (t.exitDiagnostic as any).classification !== "string") {
|
|
775
|
+
throw new StateFileError(
|
|
776
|
+
"STATE_SCHEMA_INVALID",
|
|
777
|
+
`tasks[${i}].exitDiagnostic.classification is not a string (got ${typeof (t.exitDiagnostic as any).classification})`,
|
|
778
|
+
);
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
// ── Validate lane records ────────────────────────────────────
|
|
784
|
+
const lanes = obj.lanes as unknown[];
|
|
785
|
+
const legacyTmuxSessionLaneIndexes: number[] = [];
|
|
786
|
+
for (let i = 0; i < lanes.length; i++) {
|
|
787
|
+
const l = lanes[i] as Record<string, unknown>;
|
|
788
|
+
if (!l || typeof l !== "object") {
|
|
789
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `lanes[${i}] is not an object`);
|
|
790
|
+
}
|
|
791
|
+
for (const field of ["laneId", "worktreePath", "branch"] as const) {
|
|
792
|
+
if (typeof l[field] !== "string") {
|
|
793
|
+
throw new StateFileError(
|
|
794
|
+
"STATE_SCHEMA_INVALID",
|
|
795
|
+
`lanes[${i}].${field} is missing or not a string`,
|
|
796
|
+
);
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
const { laneSessionId, tmuxSessionName } = readLaneSessionAliases(l);
|
|
801
|
+
if (laneSessionId !== undefined && typeof laneSessionId !== "string") {
|
|
802
|
+
throw new StateFileError(
|
|
803
|
+
"STATE_SCHEMA_INVALID",
|
|
804
|
+
`lanes[${i}].laneSessionId is not a string (got ${typeof laneSessionId})`,
|
|
805
|
+
);
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
if (tmuxSessionName !== undefined && typeof tmuxSessionName !== "string") {
|
|
809
|
+
throw new StateFileError(
|
|
810
|
+
"STATE_SCHEMA_INVALID",
|
|
811
|
+
`lanes[${i}].tmuxSessionName is not a string (got ${typeof tmuxSessionName})`,
|
|
812
|
+
);
|
|
813
|
+
}
|
|
814
|
+
|
|
815
|
+
if (typeof laneSessionId !== "string" && typeof tmuxSessionName !== "string") {
|
|
816
|
+
throw new StateFileError(
|
|
817
|
+
"STATE_SCHEMA_INVALID",
|
|
818
|
+
`lanes[${i}] must include either laneSessionId or tmuxSessionName as a string`,
|
|
819
|
+
);
|
|
820
|
+
}
|
|
821
|
+
|
|
822
|
+
if (typeof tmuxSessionName === "string") {
|
|
823
|
+
legacyTmuxSessionLaneIndexes.push(i);
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
normalizeLaneSessionAlias(l);
|
|
827
|
+
|
|
828
|
+
if (typeof l.laneNumber !== "number") {
|
|
829
|
+
throw new StateFileError(
|
|
830
|
+
"STATE_SCHEMA_INVALID",
|
|
831
|
+
`lanes[${i}].laneNumber is missing or not a number`,
|
|
832
|
+
);
|
|
833
|
+
}
|
|
834
|
+
if (!Array.isArray(l.taskIds)) {
|
|
835
|
+
throw new StateFileError(
|
|
836
|
+
"STATE_SCHEMA_INVALID",
|
|
837
|
+
`lanes[${i}].taskIds is missing or not an array`,
|
|
838
|
+
);
|
|
839
|
+
}
|
|
840
|
+
// v2 optional field: repoId (string | undefined)
|
|
841
|
+
if (l.repoId !== undefined && typeof l.repoId !== "string") {
|
|
842
|
+
throw new StateFileError(
|
|
843
|
+
"STATE_SCHEMA_INVALID",
|
|
844
|
+
`lanes[${i}].repoId is not a string (got ${typeof l.repoId})`,
|
|
845
|
+
);
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
|
|
849
|
+
if (legacyTmuxSessionLaneIndexes.length > 0) {
|
|
850
|
+
console.error(
|
|
851
|
+
"[orchid] migration: detected legacy lanes[].tmuxSessionName in .pi/batch-state.json; " +
|
|
852
|
+
"normalized to lanes[].laneSessionId for this release. Re-save state (or re-run /orch-resume) to persist canonical fields.",
|
|
853
|
+
);
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
// ── Validate merge results ───────────────────────────────────
|
|
857
|
+
const mergeResults = obj.mergeResults as unknown[];
|
|
858
|
+
for (let i = 0; i < mergeResults.length; i++) {
|
|
859
|
+
const m = mergeResults[i] as Record<string, unknown>;
|
|
860
|
+
if (!m || typeof m !== "object") {
|
|
861
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `mergeResults[${i}] is not an object`);
|
|
862
|
+
}
|
|
863
|
+
if (typeof m.waveIndex !== "number") {
|
|
864
|
+
throw new StateFileError(
|
|
865
|
+
"STATE_SCHEMA_INVALID",
|
|
866
|
+
`mergeResults[${i}].waveIndex is missing or not a number`,
|
|
867
|
+
);
|
|
868
|
+
}
|
|
869
|
+
if (typeof m.status !== "string" || !VALID_PERSISTED_MERGE_STATUSES.has(m.status)) {
|
|
870
|
+
throw new StateFileError(
|
|
871
|
+
"STATE_SCHEMA_INVALID",
|
|
872
|
+
`mergeResults[${i}].status is invalid: "${m.status}" (expected one of: ${[...VALID_PERSISTED_MERGE_STATUSES].join(", ")})`,
|
|
873
|
+
);
|
|
874
|
+
}
|
|
875
|
+
// v2 optional field: repoResults (array | undefined)
|
|
876
|
+
if (m.repoResults !== undefined) {
|
|
877
|
+
if (!Array.isArray(m.repoResults)) {
|
|
878
|
+
throw new StateFileError(
|
|
879
|
+
"STATE_SCHEMA_INVALID",
|
|
880
|
+
`mergeResults[${i}].repoResults is not an array (got ${typeof m.repoResults})`,
|
|
881
|
+
);
|
|
882
|
+
}
|
|
883
|
+
for (let j = 0; j < (m.repoResults as unknown[]).length; j++) {
|
|
884
|
+
const rr = (m.repoResults as unknown[])[j] as Record<string, unknown>;
|
|
885
|
+
if (!rr || typeof rr !== "object") {
|
|
886
|
+
throw new StateFileError(
|
|
887
|
+
"STATE_SCHEMA_INVALID",
|
|
888
|
+
`mergeResults[${i}].repoResults[${j}] is not an object`,
|
|
889
|
+
);
|
|
890
|
+
}
|
|
891
|
+
if (typeof rr.status !== "string" || !VALID_PERSISTED_MERGE_STATUSES.has(rr.status)) {
|
|
892
|
+
throw new StateFileError(
|
|
893
|
+
"STATE_SCHEMA_INVALID",
|
|
894
|
+
`mergeResults[${i}].repoResults[${j}].status is invalid: "${rr.status}"`,
|
|
895
|
+
);
|
|
896
|
+
}
|
|
897
|
+
if (!Array.isArray(rr.laneNumbers)) {
|
|
898
|
+
throw new StateFileError(
|
|
899
|
+
"STATE_SCHEMA_INVALID",
|
|
900
|
+
`mergeResults[${i}].repoResults[${j}].laneNumbers is not an array`,
|
|
901
|
+
);
|
|
902
|
+
}
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
// ── Validate lastError ───────────────────────────────────────
|
|
908
|
+
if (obj.lastError !== null) {
|
|
909
|
+
if (typeof obj.lastError !== "object") {
|
|
910
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `lastError is not an object or null`);
|
|
911
|
+
}
|
|
912
|
+
const le = obj.lastError as Record<string, unknown>;
|
|
913
|
+
if (typeof le.code !== "string" || typeof le.message !== "string") {
|
|
914
|
+
throw new StateFileError(
|
|
915
|
+
"STATE_SCHEMA_INVALID",
|
|
916
|
+
`lastError must have "code" (string) and "message" (string) fields`,
|
|
917
|
+
);
|
|
918
|
+
}
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
// ── Validate blockedTaskIds: array of strings ────────────────
|
|
922
|
+
for (const id of obj.blockedTaskIds as unknown[]) {
|
|
923
|
+
if (typeof id !== "string") {
|
|
924
|
+
throw new StateFileError(
|
|
925
|
+
"STATE_SCHEMA_INVALID",
|
|
926
|
+
`blockedTaskIds contains non-string value: ${typeof id}`,
|
|
927
|
+
);
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
// ── Validate errors: array of strings ────────────────────────
|
|
932
|
+
for (const err of obj.errors as unknown[]) {
|
|
933
|
+
if (typeof err !== "string") {
|
|
934
|
+
throw new StateFileError(
|
|
935
|
+
"STATE_SCHEMA_INVALID",
|
|
936
|
+
`errors array contains non-string value: ${typeof err}`,
|
|
937
|
+
);
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
|
|
941
|
+
// ── v1→v2→v3→v4 upconversion ─────────────────────────────────
|
|
942
|
+
// Apply defaults for fields that may be absent in older state files.
|
|
943
|
+
// The on-disk file is NOT rewritten; upconversion is in-memory only.
|
|
944
|
+
// Chain: v1→v2 then v2→v3 then v3→v4 (each is idempotent / no-op if already at target).
|
|
945
|
+
upconvertV1toV2(obj);
|
|
946
|
+
upconvertV2toV3(obj);
|
|
947
|
+
upconvertV3toV4(obj);
|
|
948
|
+
|
|
949
|
+
// ── Validate v3 resilience section ───────────────────────────
|
|
950
|
+
// After upconversion, resilience must be a valid object with correct types.
|
|
951
|
+
if (!obj.resilience || typeof obj.resilience !== "object") {
|
|
952
|
+
throw new StateFileError(
|
|
953
|
+
"STATE_SCHEMA_INVALID",
|
|
954
|
+
`Missing or invalid "resilience" section (expected object, got ${typeof obj.resilience})`,
|
|
955
|
+
);
|
|
956
|
+
}
|
|
957
|
+
const res = obj.resilience as Record<string, unknown>;
|
|
958
|
+
if (typeof res.resumeForced !== "boolean") {
|
|
959
|
+
throw new StateFileError(
|
|
960
|
+
"STATE_SCHEMA_INVALID",
|
|
961
|
+
`resilience.resumeForced must be a boolean (got ${typeof res.resumeForced})`,
|
|
962
|
+
);
|
|
963
|
+
}
|
|
964
|
+
if (
|
|
965
|
+
!res.retryCountByScope ||
|
|
966
|
+
typeof res.retryCountByScope !== "object" ||
|
|
967
|
+
Array.isArray(res.retryCountByScope)
|
|
968
|
+
) {
|
|
969
|
+
throw new StateFileError(
|
|
970
|
+
"STATE_SCHEMA_INVALID",
|
|
971
|
+
`resilience.retryCountByScope must be an object (got ${typeof res.retryCountByScope})`,
|
|
972
|
+
);
|
|
973
|
+
}
|
|
974
|
+
// Deep-validate retryCountByScope: all values must be numbers
|
|
975
|
+
for (const [scope, count] of Object.entries(res.retryCountByScope as Record<string, unknown>)) {
|
|
976
|
+
if (typeof count !== "number") {
|
|
977
|
+
throw new StateFileError(
|
|
978
|
+
"STATE_SCHEMA_INVALID",
|
|
979
|
+
`resilience.retryCountByScope["${scope}"] must be a number (got ${typeof count})`,
|
|
980
|
+
);
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
if (res.lastFailureClass !== null && typeof res.lastFailureClass !== "string") {
|
|
984
|
+
throw new StateFileError(
|
|
985
|
+
"STATE_SCHEMA_INVALID",
|
|
986
|
+
`resilience.lastFailureClass must be a string or null (got ${typeof res.lastFailureClass})`,
|
|
987
|
+
);
|
|
988
|
+
}
|
|
989
|
+
if (!Array.isArray(res.repairHistory)) {
|
|
990
|
+
throw new StateFileError(
|
|
991
|
+
"STATE_SCHEMA_INVALID",
|
|
992
|
+
`resilience.repairHistory must be an array (got ${typeof res.repairHistory})`,
|
|
993
|
+
);
|
|
994
|
+
}
|
|
995
|
+
// Deep-validate repairHistory entries
|
|
996
|
+
for (let i = 0; i < (res.repairHistory as unknown[]).length; i++) {
|
|
997
|
+
const rec = (res.repairHistory as unknown[])[i];
|
|
998
|
+
if (!rec || typeof rec !== "object") {
|
|
999
|
+
throw new StateFileError(
|
|
1000
|
+
"STATE_SCHEMA_INVALID",
|
|
1001
|
+
`resilience.repairHistory[${i}] must be an object (got ${typeof rec})`,
|
|
1002
|
+
);
|
|
1003
|
+
}
|
|
1004
|
+
const r = rec as Record<string, unknown>;
|
|
1005
|
+
if (typeof r.id !== "string") {
|
|
1006
|
+
throw new StateFileError(
|
|
1007
|
+
"STATE_SCHEMA_INVALID",
|
|
1008
|
+
`resilience.repairHistory[${i}].id must be a string (got ${typeof r.id})`,
|
|
1009
|
+
);
|
|
1010
|
+
}
|
|
1011
|
+
if (typeof r.strategy !== "string") {
|
|
1012
|
+
throw new StateFileError(
|
|
1013
|
+
"STATE_SCHEMA_INVALID",
|
|
1014
|
+
`resilience.repairHistory[${i}].strategy must be a string (got ${typeof r.strategy})`,
|
|
1015
|
+
);
|
|
1016
|
+
}
|
|
1017
|
+
const VALID_REPAIR_STATUSES = new Set(["succeeded", "failed", "skipped"]);
|
|
1018
|
+
if (typeof r.status !== "string" || !VALID_REPAIR_STATUSES.has(r.status)) {
|
|
1019
|
+
throw new StateFileError(
|
|
1020
|
+
"STATE_SCHEMA_INVALID",
|
|
1021
|
+
`resilience.repairHistory[${i}].status must be "succeeded"|"failed"|"skipped" (got ${JSON.stringify(r.status)})`,
|
|
1022
|
+
);
|
|
1023
|
+
}
|
|
1024
|
+
if (typeof r.startedAt !== "number") {
|
|
1025
|
+
throw new StateFileError(
|
|
1026
|
+
"STATE_SCHEMA_INVALID",
|
|
1027
|
+
`resilience.repairHistory[${i}].startedAt must be a number (got ${typeof r.startedAt})`,
|
|
1028
|
+
);
|
|
1029
|
+
}
|
|
1030
|
+
if (typeof r.endedAt !== "number") {
|
|
1031
|
+
throw new StateFileError(
|
|
1032
|
+
"STATE_SCHEMA_INVALID",
|
|
1033
|
+
`resilience.repairHistory[${i}].endedAt must be a number (got ${typeof r.endedAt})`,
|
|
1034
|
+
);
|
|
1035
|
+
}
|
|
1036
|
+
// repoId is optional — validate type only if present
|
|
1037
|
+
if (r.repoId !== undefined && typeof r.repoId !== "string") {
|
|
1038
|
+
throw new StateFileError(
|
|
1039
|
+
"STATE_SCHEMA_INVALID",
|
|
1040
|
+
`resilience.repairHistory[${i}].repoId must be a string when present (got ${typeof r.repoId})`,
|
|
1041
|
+
);
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
|
|
1045
|
+
// ── Validate v3 diagnostics section ──────────────────────────
|
|
1046
|
+
// After upconversion, diagnostics must be a valid object with correct types.
|
|
1047
|
+
if (!obj.diagnostics || typeof obj.diagnostics !== "object") {
|
|
1048
|
+
throw new StateFileError(
|
|
1049
|
+
"STATE_SCHEMA_INVALID",
|
|
1050
|
+
`Missing or invalid "diagnostics" section (expected object, got ${typeof obj.diagnostics})`,
|
|
1051
|
+
);
|
|
1052
|
+
}
|
|
1053
|
+
const diag = obj.diagnostics as Record<string, unknown>;
|
|
1054
|
+
if (!diag.taskExits || typeof diag.taskExits !== "object" || Array.isArray(diag.taskExits)) {
|
|
1055
|
+
throw new StateFileError(
|
|
1056
|
+
"STATE_SCHEMA_INVALID",
|
|
1057
|
+
`diagnostics.taskExits must be an object (got ${typeof diag.taskExits})`,
|
|
1058
|
+
);
|
|
1059
|
+
}
|
|
1060
|
+
// Deep-validate taskExits entries
|
|
1061
|
+
for (const [taskId, entry] of Object.entries(diag.taskExits as Record<string, unknown>)) {
|
|
1062
|
+
if (!entry || typeof entry !== "object") {
|
|
1063
|
+
throw new StateFileError(
|
|
1064
|
+
"STATE_SCHEMA_INVALID",
|
|
1065
|
+
`diagnostics.taskExits["${taskId}"] must be an object (got ${typeof entry})`,
|
|
1066
|
+
);
|
|
1067
|
+
}
|
|
1068
|
+
const te = entry as Record<string, unknown>;
|
|
1069
|
+
if (typeof te.classification !== "string") {
|
|
1070
|
+
throw new StateFileError(
|
|
1071
|
+
"STATE_SCHEMA_INVALID",
|
|
1072
|
+
`diagnostics.taskExits["${taskId}"].classification must be a string (got ${typeof te.classification})`,
|
|
1073
|
+
);
|
|
1074
|
+
}
|
|
1075
|
+
if (typeof te.cost !== "number") {
|
|
1076
|
+
throw new StateFileError(
|
|
1077
|
+
"STATE_SCHEMA_INVALID",
|
|
1078
|
+
`diagnostics.taskExits["${taskId}"].cost must be a number (got ${typeof te.cost})`,
|
|
1079
|
+
);
|
|
1080
|
+
}
|
|
1081
|
+
if (typeof te.durationSec !== "number") {
|
|
1082
|
+
throw new StateFileError(
|
|
1083
|
+
"STATE_SCHEMA_INVALID",
|
|
1084
|
+
`diagnostics.taskExits["${taskId}"].durationSec must be a number (got ${typeof te.durationSec})`,
|
|
1085
|
+
);
|
|
1086
|
+
}
|
|
1087
|
+
// retries is optional — validate type only if present
|
|
1088
|
+
if (te.retries !== undefined && typeof te.retries !== "number") {
|
|
1089
|
+
throw new StateFileError(
|
|
1090
|
+
"STATE_SCHEMA_INVALID",
|
|
1091
|
+
`diagnostics.taskExits["${taskId}"].retries must be a number when present (got ${typeof te.retries})`,
|
|
1092
|
+
);
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
if (typeof diag.batchCost !== "number") {
|
|
1096
|
+
throw new StateFileError(
|
|
1097
|
+
"STATE_SCHEMA_INVALID",
|
|
1098
|
+
`diagnostics.batchCost must be a number (got ${typeof diag.batchCost})`,
|
|
1099
|
+
);
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1102
|
+
// ── Validate exitDiagnostic on task records (optional) ───────
|
|
1103
|
+
for (let i = 0; i < tasks.length; i++) {
|
|
1104
|
+
const t = tasks[i] as Record<string, unknown>;
|
|
1105
|
+
if (t.exitDiagnostic !== undefined) {
|
|
1106
|
+
if (!t.exitDiagnostic || typeof t.exitDiagnostic !== "object") {
|
|
1107
|
+
throw new StateFileError(
|
|
1108
|
+
"STATE_SCHEMA_INVALID",
|
|
1109
|
+
`tasks[${i}].exitDiagnostic must be an object when present (got ${typeof t.exitDiagnostic})`,
|
|
1110
|
+
);
|
|
1111
|
+
}
|
|
1112
|
+
const ed = t.exitDiagnostic as Record<string, unknown>;
|
|
1113
|
+
if (typeof ed.classification !== "string") {
|
|
1114
|
+
throw new StateFileError(
|
|
1115
|
+
"STATE_SCHEMA_INVALID",
|
|
1116
|
+
`tasks[${i}].exitDiagnostic.classification must be a string (got ${typeof ed.classification})`,
|
|
1117
|
+
);
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
1120
|
+
// v4 optional fields: packetRepoId, packetTaskPath (string | undefined)
|
|
1121
|
+
if (t.packetRepoId !== undefined && typeof t.packetRepoId !== "string") {
|
|
1122
|
+
throw new StateFileError(
|
|
1123
|
+
"STATE_SCHEMA_INVALID",
|
|
1124
|
+
`tasks[${i}].packetRepoId is not a string (got ${typeof t.packetRepoId})`,
|
|
1125
|
+
);
|
|
1126
|
+
}
|
|
1127
|
+
if (t.packetTaskPath !== undefined && typeof t.packetTaskPath !== "string") {
|
|
1128
|
+
throw new StateFileError(
|
|
1129
|
+
"STATE_SCHEMA_INVALID",
|
|
1130
|
+
`tasks[${i}].packetTaskPath is not a string (got ${typeof t.packetTaskPath})`,
|
|
1131
|
+
);
|
|
1132
|
+
}
|
|
1133
|
+
// v4 optional field: segmentIds (string[] | undefined)
|
|
1134
|
+
if (t.segmentIds !== undefined) {
|
|
1135
|
+
if (!Array.isArray(t.segmentIds)) {
|
|
1136
|
+
throw new StateFileError(
|
|
1137
|
+
"STATE_SCHEMA_INVALID",
|
|
1138
|
+
`tasks[${i}].segmentIds is not an array (got ${typeof t.segmentIds})`,
|
|
1139
|
+
);
|
|
1140
|
+
}
|
|
1141
|
+
for (let j = 0; j < (t.segmentIds as unknown[]).length; j++) {
|
|
1142
|
+
if (typeof (t.segmentIds as unknown[])[j] !== "string") {
|
|
1143
|
+
throw new StateFileError(
|
|
1144
|
+
"STATE_SCHEMA_INVALID",
|
|
1145
|
+
`tasks[${i}].segmentIds[${j}] is not a string`,
|
|
1146
|
+
);
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
}
|
|
1150
|
+
// v4 optional field: activeSegmentId (string | null | undefined)
|
|
1151
|
+
if (
|
|
1152
|
+
t.activeSegmentId !== undefined &&
|
|
1153
|
+
t.activeSegmentId !== null &&
|
|
1154
|
+
typeof t.activeSegmentId !== "string"
|
|
1155
|
+
) {
|
|
1156
|
+
throw new StateFileError(
|
|
1157
|
+
"STATE_SCHEMA_INVALID",
|
|
1158
|
+
`tasks[${i}].activeSegmentId is not a string or null (got ${typeof t.activeSegmentId})`,
|
|
1159
|
+
);
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
|
|
1163
|
+
// ── Validate v4 segments array ───────────────────────────────
|
|
1164
|
+
if (!Array.isArray(obj.segments)) {
|
|
1165
|
+
throw new StateFileError(
|
|
1166
|
+
"STATE_SCHEMA_INVALID",
|
|
1167
|
+
`Missing or invalid "segments" field (expected array, got ${typeof obj.segments})`,
|
|
1168
|
+
);
|
|
1169
|
+
}
|
|
1170
|
+
const segments = obj.segments as unknown[];
|
|
1171
|
+
for (let i = 0; i < segments.length; i++) {
|
|
1172
|
+
const s = segments[i] as Record<string, unknown>;
|
|
1173
|
+
if (!s || typeof s !== "object") {
|
|
1174
|
+
throw new StateFileError("STATE_SCHEMA_INVALID", `segments[${i}] is not an object`);
|
|
1175
|
+
}
|
|
1176
|
+
// Required string fields
|
|
1177
|
+
for (const field of [
|
|
1178
|
+
"segmentId",
|
|
1179
|
+
"taskId",
|
|
1180
|
+
"repoId",
|
|
1181
|
+
"laneId",
|
|
1182
|
+
"sessionName",
|
|
1183
|
+
"worktreePath",
|
|
1184
|
+
"branch",
|
|
1185
|
+
"exitReason",
|
|
1186
|
+
] as const) {
|
|
1187
|
+
if (typeof s[field] !== "string") {
|
|
1188
|
+
throw new StateFileError(
|
|
1189
|
+
"STATE_SCHEMA_INVALID",
|
|
1190
|
+
`segments[${i}].${field} is missing or not a string (got ${typeof s[field]})`,
|
|
1191
|
+
);
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
// Required status field (same valid values as task status)
|
|
1195
|
+
if (typeof s.status !== "string" || !VALID_TASK_STATUSES.has(s.status)) {
|
|
1196
|
+
throw new StateFileError(
|
|
1197
|
+
"STATE_SCHEMA_INVALID",
|
|
1198
|
+
`segments[${i}].status is invalid: "${s.status}" (expected one of: ${[...VALID_TASK_STATUSES].join(", ")})`,
|
|
1199
|
+
);
|
|
1200
|
+
}
|
|
1201
|
+
// Nullable number fields: startedAt, endedAt
|
|
1202
|
+
if (s.startedAt !== null && typeof s.startedAt !== "number") {
|
|
1203
|
+
throw new StateFileError(
|
|
1204
|
+
"STATE_SCHEMA_INVALID",
|
|
1205
|
+
`segments[${i}].startedAt is not a number or null (got ${typeof s.startedAt})`,
|
|
1206
|
+
);
|
|
1207
|
+
}
|
|
1208
|
+
if (s.endedAt !== null && typeof s.endedAt !== "number") {
|
|
1209
|
+
throw new StateFileError(
|
|
1210
|
+
"STATE_SCHEMA_INVALID",
|
|
1211
|
+
`segments[${i}].endedAt is not a number or null (got ${typeof s.endedAt})`,
|
|
1212
|
+
);
|
|
1213
|
+
}
|
|
1214
|
+
// Required number: retries
|
|
1215
|
+
if (typeof s.retries !== "number") {
|
|
1216
|
+
throw new StateFileError(
|
|
1217
|
+
"STATE_SCHEMA_INVALID",
|
|
1218
|
+
`segments[${i}].retries is not a number (got ${typeof s.retries})`,
|
|
1219
|
+
);
|
|
1220
|
+
}
|
|
1221
|
+
// Required array: dependsOnSegmentIds
|
|
1222
|
+
if (!Array.isArray(s.dependsOnSegmentIds)) {
|
|
1223
|
+
throw new StateFileError(
|
|
1224
|
+
"STATE_SCHEMA_INVALID",
|
|
1225
|
+
`segments[${i}].dependsOnSegmentIds is not an array (got ${typeof s.dependsOnSegmentIds})`,
|
|
1226
|
+
);
|
|
1227
|
+
}
|
|
1228
|
+
for (let j = 0; j < (s.dependsOnSegmentIds as unknown[]).length; j++) {
|
|
1229
|
+
if (typeof (s.dependsOnSegmentIds as unknown[])[j] !== "string") {
|
|
1230
|
+
throw new StateFileError(
|
|
1231
|
+
"STATE_SCHEMA_INVALID",
|
|
1232
|
+
`segments[${i}].dependsOnSegmentIds[${j}] is not a string`,
|
|
1233
|
+
);
|
|
1234
|
+
}
|
|
1235
|
+
}
|
|
1236
|
+
if (s.expandedFrom !== undefined && typeof s.expandedFrom !== "string") {
|
|
1237
|
+
throw new StateFileError(
|
|
1238
|
+
"STATE_SCHEMA_INVALID",
|
|
1239
|
+
`segments[${i}].expandedFrom is not a string when present (got ${typeof s.expandedFrom})`,
|
|
1240
|
+
);
|
|
1241
|
+
}
|
|
1242
|
+
if (s.expansionRequestId !== undefined && typeof s.expansionRequestId !== "string") {
|
|
1243
|
+
throw new StateFileError(
|
|
1244
|
+
"STATE_SCHEMA_INVALID",
|
|
1245
|
+
`segments[${i}].expansionRequestId is not a string when present (got ${typeof s.expansionRequestId})`,
|
|
1246
|
+
);
|
|
1247
|
+
}
|
|
1248
|
+
// Optional exitDiagnostic
|
|
1249
|
+
if (s.exitDiagnostic !== undefined) {
|
|
1250
|
+
if (
|
|
1251
|
+
!s.exitDiagnostic ||
|
|
1252
|
+
typeof s.exitDiagnostic !== "object" ||
|
|
1253
|
+
Array.isArray(s.exitDiagnostic)
|
|
1254
|
+
) {
|
|
1255
|
+
throw new StateFileError(
|
|
1256
|
+
"STATE_SCHEMA_INVALID",
|
|
1257
|
+
`segments[${i}].exitDiagnostic is not a plain object (got ${Array.isArray(s.exitDiagnostic) ? "array" : typeof s.exitDiagnostic})`,
|
|
1258
|
+
);
|
|
1259
|
+
}
|
|
1260
|
+
if (typeof (s.exitDiagnostic as Record<string, unknown>).classification !== "string") {
|
|
1261
|
+
throw new StateFileError(
|
|
1262
|
+
"STATE_SCHEMA_INVALID",
|
|
1263
|
+
`segments[${i}].exitDiagnostic.classification is not a string`,
|
|
1264
|
+
);
|
|
1265
|
+
}
|
|
1266
|
+
}
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1269
|
+
// ── Capture unknown top-level fields for roundtrip preservation ──
|
|
1270
|
+
// Any fields not in the known schema are preserved so they survive
|
|
1271
|
+
// serialization. This protects against data loss from future schema
|
|
1272
|
+
// extensions or external tools writing additional fields.
|
|
1273
|
+
const KNOWN_TOP_LEVEL_FIELDS = new Set([
|
|
1274
|
+
"schemaVersion",
|
|
1275
|
+
"phase",
|
|
1276
|
+
"batchId",
|
|
1277
|
+
"baseBranch",
|
|
1278
|
+
"orchBranch",
|
|
1279
|
+
"mode",
|
|
1280
|
+
"startedAt",
|
|
1281
|
+
"updatedAt",
|
|
1282
|
+
"endedAt",
|
|
1283
|
+
"currentWaveIndex",
|
|
1284
|
+
"totalWaves",
|
|
1285
|
+
"wavePlan",
|
|
1286
|
+
"lanes",
|
|
1287
|
+
"tasks",
|
|
1288
|
+
"mergeResults",
|
|
1289
|
+
"totalTasks",
|
|
1290
|
+
"succeededTasks",
|
|
1291
|
+
"failedTasks",
|
|
1292
|
+
"skippedTasks",
|
|
1293
|
+
"blockedTasks",
|
|
1294
|
+
"blockedTaskIds",
|
|
1295
|
+
"lastError",
|
|
1296
|
+
"errors",
|
|
1297
|
+
"resilience",
|
|
1298
|
+
"diagnostics",
|
|
1299
|
+
"segments",
|
|
1300
|
+
"_extraFields",
|
|
1301
|
+
]);
|
|
1302
|
+
const extraFields: Record<string, unknown> = {};
|
|
1303
|
+
for (const key of Object.keys(obj)) {
|
|
1304
|
+
if (!KNOWN_TOP_LEVEL_FIELDS.has(key)) {
|
|
1305
|
+
extraFields[key] = obj[key];
|
|
1306
|
+
}
|
|
1307
|
+
}
|
|
1308
|
+
if (Object.keys(extraFields).length > 0) {
|
|
1309
|
+
obj._extraFields = extraFields;
|
|
1310
|
+
}
|
|
1311
|
+
|
|
1312
|
+
return obj as unknown as PersistedBatchState;
|
|
1313
|
+
}
|
|
1314
|
+
|
|
1315
|
+
// ── Serialization ────────────────────────────────────────────────────
|
|
1316
|
+
|
|
1317
|
+
/**
 * Serialize runtime batch state to a PersistedBatchState JSON string.
 *
 * Builds the persisted snapshot from the runtime state, the wave plan, the
 * currently allocated lanes and the accumulated task outcomes, stamps it with
 * the schema version and an `updatedAt` timestamp taken from `Date.now()`,
 * and returns it as JSON. Nothing is written to disk here.
 *
 * One task record is emitted for every task ID found in the wave plan or in
 * the outcomes, sorted by task ID. `taskFolder` is always written as "" by
 * this function.
 *
 * Unknown top-level fields carried on `state._extraFields` are appended after
 * the known fields and never replace a field this function wrote itself.
 *
 * @param state - Current runtime batch state
 * @param wavePlan - Wave plan (array of arrays of task IDs)
 * @param lanes - Currently allocated lanes (latest wave's lanes)
 * @param allTaskOutcomes - All task outcomes across completed waves + current
 * @returns JSON string (pretty-printed with 2-space indentation)
 */
export function serializeBatchState(
	state: OrchBatchRuntimeState,
	wavePlan: string[][],
	lanes: AllocatedLane[],
	allTaskOutcomes: LaneTaskOutcome[],
): string {
	const now = Date.now();

	// Single lookup from taskId → its allocation: the AllocatedTask (which holds
	// the ParsedTask with repo fields) plus the lane that owns it. If a task ID
	// appears in more than one lane, the last lane in `lanes` wins.
	const allocationByTaskId = new Map<
		string,
		{ allocatedTask: import("./types.ts").AllocatedTask; lane: AllocatedLane }
	>();
	for (const lane of lanes) {
		for (const allocatedTask of lane.tasks) {
			allocationByTaskId.set(allocatedTask.taskId, { allocatedTask, lane });
		}
	}

	// The last outcome listed for a task ID wins.
	const outcomeByTaskId = new Map<string, LaneTaskOutcome>();
	for (const outcome of allTaskOutcomes) {
		outcomeByTaskId.set(outcome.taskId, outcome);
	}

	// Full task registry: every ID in the wave plan plus any ID seen in outcomes.
	const taskIdSet = new Set<string>();
	for (const wave of wavePlan) {
		for (const taskId of wave) taskIdSet.add(taskId);
	}
	for (const outcome of allTaskOutcomes) {
		taskIdSet.add(outcome.taskId);
	}

	const taskRecords: PersistedTaskRecord[] = [...taskIdSet].sort().map((taskId) => {
		const allocation = allocationByTaskId.get(taskId);
		const lane = allocation?.lane;
		const parsedTask = allocation?.allocatedTask.task;
		const outcome = outcomeByTaskId.get(taskId);

		const record: PersistedTaskRecord = {
			taskId,
			laneNumber: lane?.laneNumber ?? outcome?.laneNumber ?? 0,
			// `||` (not `??`) on purpose: an empty session name falls through to the lane's.
			sessionName: outcome?.sessionName || lane?.laneSessionId || "",
			status: outcome?.status ?? "pending",
			taskFolder: "", // Enriched by caller from discovery
			startedAt: outcome?.startTime ?? null,
			endedAt: outcome?.endTime ?? null,
			doneFileFound: outcome?.doneFileFound ?? false,
			exitReason: outcome?.exitReason ?? "",
		};

		// v2: Serialize repo-aware fields from the ParsedTask
		if (parsedTask?.promptRepoId !== undefined) {
			record.repoId = parsedTask.promptRepoId;
		}
		if (parsedTask?.resolvedRepoId !== undefined) {
			record.resolvedRepoId = parsedTask.resolvedRepoId;
		}

		// TP-028: Serialize partial progress fields from task outcome
		if (outcome?.partialProgressCommits !== undefined) {
			record.partialProgressCommits = outcome.partialProgressCommits;
		}
		if (outcome?.partialProgressBranch !== undefined) {
			record.partialProgressBranch = outcome.partialProgressBranch;
		}

		// TP-030 v3: Serialize exit diagnostic from task outcome
		if (outcome?.exitDiagnostic !== undefined) {
			record.exitDiagnostic = outcome.exitDiagnostic;
		}

		// TP-081 v4: Serialize segment-level fields from the ParsedTask
		if (parsedTask?.packetRepoId !== undefined) {
			(record as any).packetRepoId = parsedTask.packetRepoId;
		}
		if (parsedTask?.packetTaskPath !== undefined) {
			(record as any).packetTaskPath = parsedTask.packetTaskPath;
		}
		if (parsedTask?.segmentIds !== undefined) {
			(record as any).segmentIds = parsedTask.segmentIds;
		}
		if (parsedTask?.activeSegmentId !== undefined) {
			(record as any).activeSegmentId = parsedTask.activeSegmentId;
		}

		return record;
	});

	// Build lane records
	const laneRecords: PersistedLaneRecord[] = lanes.map((lane) => {
		const record: PersistedLaneRecord = {
			laneNumber: lane.laneNumber,
			laneId: lane.laneId,
			laneSessionId: lane.laneSessionId,
			worktreePath: lane.worktreePath,
			branch: lane.branch,
			taskIds: lane.tasks.map((t) => t.taskId),
		};
		if (lane.repoId !== undefined) {
			record.repoId = lane.repoId;
		}
		return record;
	});

	// Build merge results from actual merge outcomes (accumulated on batchState).
	// MergeWaveResult.waveIndex is 1-based (from merge module); normalize to
	// 0-based for PersistedMergeResult (dashboard renders as "Wave N+1").
	// Clamp to 0 minimum: resume re-exec merges use sentinel waveIndex -1,
	// which would produce -2 without clamping.
	const mergeResults: PersistedMergeResult[] = (state.mergeResults || []).map((mr) => {
		const record: PersistedMergeResult = {
			waveIndex: Math.max(0, mr.waveIndex - 1),
			status: mr.status,
			failedLane: mr.failedLane,
			failureReason: mr.failureReason,
		};
		// v2 (TP-009): Serialize per-repo merge outcomes when available (workspace mode).
		if (mr.repoResults && mr.repoResults.length > 0) {
			record.repoResults = mr.repoResults.map((rr) => ({
				repoId: rr.repoId,
				status: rr.status,
				laneNumbers: rr.laneResults.map((lr) => lr.laneNumber),
				failedLane: rr.failedLane,
				failureReason: rr.failureReason,
			}));
		}
		return record;
	});

	const persisted: PersistedBatchState = {
		schemaVersion: BATCH_STATE_SCHEMA_VERSION,
		phase: state.phase,
		batchId: state.batchId,
		baseBranch: state.baseBranch,
		orchBranch: state.orchBranch ?? "",
		mode: state.mode ?? "repo",
		startedAt: state.startedAt,
		updatedAt: now,
		endedAt: state.endedAt,
		currentWaveIndex: state.currentWaveIndex,
		totalWaves: state.totalWaves,
		// TP-166: Persist task-level wave metadata for correct display after resume
		...(state.taskLevelWaveCount != null ? { taskLevelWaveCount: state.taskLevelWaveCount } : {}),
		...(state.roundToTaskWave != null ? { roundToTaskWave: [...state.roundToTaskWave] } : {}),
		wavePlan,
		lanes: laneRecords,
		tasks: taskRecords,
		mergeResults,
		totalTasks: state.totalTasks,
		succeededTasks: state.succeededTasks,
		failedTasks: state.failedTasks,
		skippedTasks: state.skippedTasks,
		blockedTasks: state.blockedTasks,
		blockedTaskIds: [...state.blockedTaskIds],
		lastError:
			state.errors.length > 0
				? { code: "BATCH_ERROR", message: state.errors[state.errors.length - 1] }
				: null,
		errors: [...state.errors],
		resilience: state.resilience ?? defaultResilienceState(),
		diagnostics: state.diagnostics ?? defaultBatchDiagnostics(),
		segments: state.segments ?? [],
	};

	// Merge unknown fields from loaded state to preserve roundtrip fidelity.
	// Extra fields are placed at the end of the object (after known schema fields)
	// and will not overwrite any known field.
	if (state._extraFields) {
		// TP-195: 2-step `as unknown as` widening. PersistedBatchState is
		// structurally a string-keyed record at runtime; the cast lets us
		// add unknown extra fields for serialization roundtrip fidelity
		// without TypeScript requiring sufficient type overlap.
		const output = persisted as unknown as Record<string, unknown>;
		for (const [key, value] of Object.entries(state._extraFields)) {
			// Assigning to "__proto__" would swap the prototype instead of adding
			// a serializable field, so it is never copied.
			if (key === "__proto__") continue;
			// Own-property check rather than `in`: `in` also matches inherited
			// names such as "constructor" or "toString", which would silently
			// drop extra fields that happen to use those names.
			if (!Object.prototype.hasOwnProperty.call(output, key)) {
				output[key] = value;
			}
		}
	}

	return JSON.stringify(persisted, null, 2);
}
|
|
1520
|
+
|
|
1521
|
+
// ── File Operations ──────────────────────────────────────────────────
|
|
1522
|
+
|
|
1523
|
+
/**
 * Total number of rename attempts `saveBatchState` makes before giving up
 * (the first try counts as attempt 1). Retrying exists because of Windows
 * file locking.
 */
export const STATE_WRITE_MAX_RETRIES = 3;

/** Pause, in milliseconds, between two consecutive rename attempts. */
export const STATE_WRITE_RETRY_DELAY_MS = 500;
|
|
1528
|
+
|
|
1529
|
+
/**
 * Save batch state to `.pi/batch-state.json` with atomic write.
 *
 * Strategy: write to a temp file (`.pi/batch-state.json.tmp`), then
 * rename to the final path. This prevents partial writes from corrupting
 * the state file.
 *
 * On Windows, rename can fail if another process holds a handle on the
 * target file. The rename is attempted up to STATE_WRITE_MAX_RETRIES times,
 * sleeping STATE_WRITE_RETRY_DELAY_MS between attempts.
 *
 * The temp file is removed (best effort) whenever the save fails, whether
 * the failure happened while writing it or while renaming it.
 *
 * @param json - JSON string to write (from serializeBatchState)
 * @param repoRoot - Absolute path to the repository root
 * @throws StateFileError with STATE_FILE_IO_ERROR on failure
 */
export function saveBatchState(json: string, repoRoot: string): void {
	const finalPath = batchStatePath(repoRoot);
	const tmpPath = `${finalPath}.tmp`;
	const dir = dirname(finalPath);

	// Ensure .pi directory exists
	if (!existsSync(dir)) {
		try {
			mkdirSync(dir, { recursive: true });
		} catch (err: unknown) {
			throw new StateFileError(
				"STATE_FILE_IO_ERROR",
				`Failed to create directory "${dir}": ${(err as Error).message}`,
			);
		}
	}

	// Write to temp file
	try {
		writeFileSync(tmpPath, json, "utf-8");
	} catch (err: unknown) {
		// A failed write may leave a truncated temp file behind; remove it so
		// this path cleans up the same way the rename-exhausted path does.
		try {
			unlinkSync(tmpPath);
		} catch {
			/* ignore cleanup errors */
		}
		throw new StateFileError(
			"STATE_FILE_IO_ERROR",
			`Failed to write temp state file "${tmpPath}": ${(err as Error).message}`,
		);
	}

	// Atomic rename with retry for Windows file locking
	let lastError: Error | null = null;
	for (let attempt = 1; attempt <= STATE_WRITE_MAX_RETRIES; attempt++) {
		try {
			renameSync(tmpPath, finalPath);
			return; // Success
		} catch (err: unknown) {
			lastError = err as Error;
			// No point sleeping after the final attempt.
			if (attempt < STATE_WRITE_MAX_RETRIES) {
				sleepSync(STATE_WRITE_RETRY_DELAY_MS);
			}
		}
	}

	// All retries exhausted — clean up temp file if possible
	try {
		unlinkSync(tmpPath);
	} catch {
		/* ignore cleanup errors */
	}

	throw new StateFileError(
		"STATE_FILE_IO_ERROR",
		`Failed to atomically save state file "${finalPath}" after ` +
			`${STATE_WRITE_MAX_RETRIES} attempts: ${lastError?.message ?? "unknown error"}`,
	);
}
|
|
1598
|
+
|
|
1599
|
+
/**
 * Load and validate batch state from `.pi/batch-state.json`.
 *
 * A file that does not exist — including one that disappears between the
 * existence check and the read — is reported as `null`, not as an error.
 *
 * @param repoRoot - Absolute path to the repository root
 * @returns Validated PersistedBatchState, or null if file doesn't exist
 * @throws StateFileError with STATE_FILE_IO_ERROR if the file exists but cannot be read
 * @throws StateFileError with STATE_FILE_PARSE_ERROR if file contains invalid JSON
 * @throws StateFileError with STATE_SCHEMA_INVALID if JSON fails validation
 */
export function loadBatchState(repoRoot: string): PersistedBatchState | null {
	const filePath = batchStatePath(repoRoot);

	if (!existsSync(filePath)) {
		return null;
	}

	let raw: string;
	try {
		raw = readFileSync(filePath, "utf-8");
	} catch (err: unknown) {
		// If the file was removed between our check and the read, it is simply
		// missing — same race handling as deleteBatchState.
		if (!existsSync(filePath)) return null;
		throw new StateFileError(
			"STATE_FILE_IO_ERROR",
			`Failed to read state file "${filePath}": ${(err as Error).message}`,
		);
	}

	let parsed: unknown;
	try {
		parsed = JSON.parse(raw);
	} catch (err: unknown) {
		throw new StateFileError(
			"STATE_FILE_PARSE_ERROR",
			`State file "${filePath}" contains invalid JSON: ${(err as Error).message}`,
		);
	}

	return validatePersistedState(parsed);
}
|
|
1636
|
+
|
|
1637
|
+
/**
 * Remove the batch state file for a repository.
 *
 * Safe to call repeatedly: a file that is already absent (before the call,
 * or by the time the unlink fails) is treated as success.
 *
 * @param repoRoot - Absolute path to the repository root
 * @throws StateFileError with STATE_FILE_IO_ERROR when the file still exists after a failed unlink
 */
export function deleteBatchState(repoRoot: string): void {
	const target = batchStatePath(repoRoot);

	if (existsSync(target)) {
		try {
			unlinkSync(target);
		} catch (err: unknown) {
			// Someone else may have removed it in the meantime; only a file that
			// is still present counts as a real failure.
			const stillPresent = existsSync(target);
			if (stillPresent) {
				throw new StateFileError(
					"STATE_FILE_IO_ERROR",
					`Failed to delete state file "${target}": ${(err as Error).message}`,
				);
			}
		}
	}
}
|
|
1661
|
+
|
|
1662
|
+
// ── Orphan Detection (TS-009 Step 3) ─────────────────────────────────
|
|
1663
|
+
|
|
1664
|
+
/**
 * Outcome of trying to load the persisted batch state file.
 *
 * - "valid"    — the file exists and passed parsing and validation
 * - "missing"  — there is no file (the normal case on a fresh start)
 * - "invalid"  — the file exists but failed JSON parsing or schema validation
 * - "io-error" — the file could not be read because of an I/O error
 */
export type OrphanStateStatus =
	| "valid"
	| "missing"
	| "invalid"
	| "io-error";
|
|
1673
|
+
|
|
1674
|
+
/**
 * What the startup analysis recommends doing next.
 *
 * - "resume"         — usable state exists and the batch can continue: suggest /orch-resume
 * - "abort-orphans"  — orphan sessions exist but there is no usable state: suggest /orch-abort
 * - "cleanup-stale"  — no orphans and the valid state is finished or non-resumable: delete it and start fresh
 * - "paused-corrupt" — no orphans and the state file is corrupt or unreadable: do NOT auto-delete; ask the user to inspect or remove it
 * - "start-fresh"    — no orphans and no state file: proceed normally
 */
export type OrphanRecommendedAction = "resume" | "abort-orphans" | "cleanup-stale" | "paused-corrupt" | "start-fresh";
|
|
1689
|
+
|
|
1690
|
+
/**
 * Everything the startup analysis produces in one record.
 *
 * The structured fields are meant for automated handling; `userMessage`
 * carries the same conclusion as display-ready text.
 */
export interface OrphanDetectionResult {
	/** Names of the live sessions that matched the orchestrator prefix (may be empty). */
	orphanSessions: string[];

	/** How loading the persisted batch state file went. */
	stateStatus: OrphanStateStatus;

	/** The validated batch state, or null when it is missing, invalid, or unreadable. */
	loadedState: PersistedBatchState | null;

	/** Description of the state-loading failure, or null when there was none. */
	stateError: string | null;

	/** The action chosen deterministically from the inputs. */
	recommendedAction: OrphanRecommendedAction;

	/** Text to show the user; empty when there is nothing to report. */
	userMessage: string;
}
|
|
1710
|
+
|
|
1711
|
+
/**
 * Extract orchestrator session names from the output of
 * TMUX `list-sessions -F "#{session_name}"`.
 *
 * Each line is trimmed; a name is kept only when it begins with
 * `<prefix>-` (so prefix "orch" keeps "orch-lane-1" but not "orchestra").
 * Empty output and blank lines yield nothing.
 *
 * Pure function — no process or filesystem access.
 *
 * @param stdout - Raw stdout from `tmux list-sessions -F "#{session_name}"`
 * @param prefix - Session name prefix to filter by (e.g., "orch")
 * @returns Matching session names in default `Array.prototype.sort` order
 */
export function parseOrchSessionNames(stdout: string, prefix: string): string[] {
	if (!stdout || stdout.trim() === "") {
		return [];
	}

	const requiredStart = `${prefix}-`;
	const matches: string[] = [];

	for (const rawLine of stdout.split("\n")) {
		const candidate = rawLine.trim();
		if (candidate.length === 0) continue;
		if (candidate.startsWith(requiredStart)) {
			matches.push(candidate);
		}
	}

	matches.sort();
	return matches;
}
|
|
1734
|
+
|
|
1735
|
+
/**
 * Decide what `/orch` should do at startup — pure, deterministic logic.
 *
 * State counts as usable only when `stateStatus` is "valid" AND
 * `loadedState` is non-null. A phase is resumable when it is "paused",
 * "executing" or "merging".
 *
 * Decision matrix:
 * | Orphans? | State                        | Tasks with .DONE | Action          |
 * |----------|------------------------------|------------------|-----------------|
 * | Yes      | usable                       | —                | resume          |
 * | Yes      | anything else                | —                | abort-orphans   |
 * | No       | missing                      | —                | start-fresh     |
 * | No       | usable                       | all (and ≥ 1)    | cleanup-stale   |
 * | No       | usable, resumable phase      | not all          | resume          |
 * | No       | usable, non-resumable phase  | not all          | cleanup-stale   |
 * | No       | anything else                | —                | paused-corrupt  |
 *
 * Pure function — no process or filesystem access.
 *
 * @param orphanSessions - TMUX sessions matching the orch prefix
 * @param stateStatus - Status of the batch state file
 * @param loadedState - Validated batch state (null if unavailable)
 * @param stateError - Error message from state loading (null if no error)
 * @param doneTaskIds - Set of task IDs whose .DONE files were found
 * @returns OrphanDetectionResult with recommended action
 */
export function analyzeOrchestratorStartupState(
	orphanSessions: string[],
	stateStatus: OrphanStateStatus,
	loadedState: PersistedBatchState | null,
	stateError: string | null,
	doneTaskIds: ReadonlySet<string>,
): OrphanDetectionResult {
	const usableState = stateStatus === "valid" && loadedState ? loadedState : null;

	// ── Orphan sessions exist ────────────────────────────────────
	if (orphanSessions.length > 0) {
		const sessionCount = orphanSessions.length;
		const sessionNames = orphanSessions.join(", ");

		if (usableState) {
			return {
				orphanSessions,
				stateStatus,
				loadedState: usableState,
				stateError,
				recommendedAction: "resume",
				userMessage: [
					`🔄 Found ${sessionCount} running orchestrator session(s): ${sessionNames}`,
					` Batch ${usableState.batchId} (${usableState.phase}) has persisted state.`,
					` Use /orch-resume to continue, or /orch-abort to clean up.`,
				].join("\n"),
			};
		}

		// Orphans without usable state (missing, invalid, or io-error)
		const errorSuffix = stateError ? `\n State error: ${stateError}` : "";
		return {
			orphanSessions,
			stateStatus,
			loadedState: null,
			stateError,
			recommendedAction: "abort-orphans",
			userMessage: [
				`⚠️ Found ${sessionCount} orphan orchestrator session(s): ${sessionNames}`,
				` No usable batch state file (status: ${stateStatus}).${errorSuffix}`,
				` Use /orch-abort to clean up before starting a new batch.`,
			].join("\n"),
		};
	}

	// ── No orphan sessions ───────────────────────────────────────

	if (stateStatus === "missing") {
		// Clean start: nothing to tell the user.
		return {
			orphanSessions: [],
			stateStatus,
			loadedState: null,
			stateError,
			recommendedAction: "start-fresh",
			userMessage: "",
		};
	}

	if (usableState === null) {
		// Invalid or io-error state with no orphans — corrupt state.
		// Never auto-delete: the user gets to inspect the file and decide
		// whether to recover it or remove it.
		const corruptLines = [`⚠️ Batch state file is corrupt or unreadable (${stateStatus}).`];
		if (stateError) {
			corruptLines.push(` Error: ${stateError}`);
		}
		corruptLines.push(
			` The file has NOT been deleted. Inspect .pi/batch-state.json manually,`,
			` then either fix it or delete it and run /orch again.`,
		);
		return {
			orphanSessions: [],
			stateStatus,
			loadedState: null,
			stateError,
			recommendedAction: "paused-corrupt",
			userMessage: corruptLines.join("\n"),
		};
	}

	// Usable state, no orphans: compare the task list against the .DONE markers.
	const taskIds = usableState.tasks.map((task) => task.taskId);
	const totalCount = taskIds.length;
	let completedCount = 0;
	for (const id of taskIds) {
		if (doneTaskIds.has(id)) completedCount += 1;
	}

	const common = {
		orphanSessions: [] as string[],
		stateStatus,
		loadedState: usableState,
		stateError,
	};

	// "All done" requires at least one task; an empty task list is never stale-complete.
	if (totalCount > 0 && completedCount === totalCount) {
		return {
			...common,
			recommendedAction: "cleanup-stale",
			userMessage:
				`🧹 Found stale batch state file from batch ${usableState.batchId}.\n` +
				` All ${totalCount} task(s) have .DONE files. Cleaning up state file.`,
		};
	}

	// Not all tasks done — the batch was interrupted. Only phases the resume
	// path can handle get "resume"; every other phase is cleaned up so /orch
	// can start fresh without a detour through /orch-abort.
	const resumablePhases = new Set<string>(["paused", "executing", "merging"]);
	if (resumablePhases.has(usableState.phase)) {
		return {
			...common,
			recommendedAction: "resume",
			userMessage:
				`🔄 Found interrupted batch ${usableState.batchId} (${usableState.phase}).\n` +
				` ${completedCount}/${totalCount} task(s) completed.\n` +
				` Use /orch-resume to continue, or /orch-abort to clean up.`,
		};
	}

	if (completedCount === 0) {
		// Nothing ran at all: the state file is pure noise.
		return {
			...common,
			recommendedAction: "cleanup-stale",
			userMessage:
				`🧹 Found non-resumable batch state (${usableState.batchId}, phase=${usableState.phase}, 0 tasks ran).\n` +
				` Cleaning up stale state file so a fresh batch can start.`,
		};
	}

	return {
		...common,
		recommendedAction: "cleanup-stale",
		userMessage:
			`🧹 Found non-resumable batch state (${usableState.batchId}, phase=${usableState.phase}).\n` +
			` ${completedCount}/${totalCount} task(s) completed. Cleaning up state file.`,
	};
}
|
|
1890
|
+
|
|
1891
|
+
/**
 * Detect orphan orchestrator state and analyze startup recovery action.
 *
 * Runtime V2 no longer relies on TMUX session discovery. Startup decisions
 * are based on persisted batch state plus task .DONE markers, so the orphan
 * session list handed to the analysis is always empty.
 *
 * Any failure while loading the state file is turned into a status instead
 * of being thrown: parse and schema failures become "invalid"; every other
 * failure (including a StateFileError with a code not listed here, or a
 * non-StateFileError) becomes "io-error".
 *
 * @param prefix - Legacy orchestrator session prefix (unused in Runtime V2)
 * @param repoRoot - Absolute path to the repository root
 * @returns OrphanDetectionResult with recommended action
 */
export function detectOrphanSessions(prefix: string, repoRoot: string): OrphanDetectionResult {
	void prefix;

	// Runtime V2 uses persisted state as the source of truth for orphan analysis.
	const orphanSessions: string[] = [];

	// ── 1. Load batch state file ─────────────────────────────────
	let stateStatus: OrphanStateStatus = "missing";
	let loadedState: PersistedBatchState | null = null;
	let stateError: string | null = null;

	try {
		loadedState = loadBatchState(repoRoot);
		stateStatus = loadedState ? "valid" : "missing";
	} catch (err: unknown) {
		if (err instanceof StateFileError) {
			stateError = `[${err.code}] ${err.message}`;
			switch (err.code) {
				case "STATE_FILE_PARSE_ERROR":
				case "STATE_SCHEMA_INVALID":
					stateStatus = "invalid";
					break;
				case "STATE_FILE_IO_ERROR":
				default:
					// An unrecognised code must not fall through as "missing":
					// that would recommend start-fresh while hiding a load failure.
					stateStatus = "io-error";
					break;
			}
		} else {
			stateStatus = "io-error";
			stateError = err instanceof Error ? err.message : String(err);
		}
	}

	// ── 2. Check .DONE files for stale state detection ───────────
	const doneTaskIds = new Set<string>();
	if (loadedState && orphanSessions.length === 0) {
		// Only check .DONE files when we have state but no orphans
		// (stale state scenario — sessions finished while orchestrator was disconnected).
		// Tasks with an empty taskFolder are skipped and therefore never count as done.
		for (const task of loadedState.tasks) {
			if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
				doneTaskIds.add(task.taskId);
			}
		}
	}

	// ── 3. Analyze and return ────────────────────────────────────
	return analyzeOrchestratorStartupState(
		orphanSessions,
		stateStatus,
		loadedState,
		stateError,
		doneTaskIds,
	);
}
|
|
1955
|
+
|
|
1956
|
+
// ── Batch History ────────────────────────────────────────────────────
|
|
1957
|
+
|
|
1958
|
+
/**
 * Location of the batch history file for a repository.
 *
 * @param repoRoot - Repository root the `.pi` directory lives under
 * @returns `<repoRoot>/.pi/batch-history.json`, joined with the platform separator
 */
function batchHistoryPath(repoRoot: string): string {
	const segments = [repoRoot, ".pi", "batch-history.json"];
	return join(...segments);
}
|
|
1962
|
+
|
|
1963
|
+
/**
|
|
1964
|
+
* Load existing batch history entries from disk.
|
|
1965
|
+
* Returns empty array if file doesn't exist or is invalid.
|
|
1966
|
+
*/
|
|
1967
|
+
export function loadBatchHistory(repoRoot: string): BatchHistorySummary[] {
|
|
1968
|
+
const filePath = batchHistoryPath(repoRoot);
|
|
1969
|
+
try {
|
|
1970
|
+
if (!existsSync(filePath)) return [];
|
|
1971
|
+
const raw = readFileSync(filePath, "utf-8");
|
|
1972
|
+
const data = JSON.parse(raw);
|
|
1973
|
+
if (!Array.isArray(data)) return [];
|
|
1974
|
+
return data;
|
|
1975
|
+
} catch {
|
|
1976
|
+
return [];
|
|
1977
|
+
}
|
|
1978
|
+
}
|
|
1979
|
+
|
|
1980
|
+
/**
 * Record a batch summary in the on-disk history.
 *
 * The summary replaces any existing entry with the same `batchId` and is
 * placed first; the list is then capped at `BATCH_HISTORY_MAX_ENTRIES`.
 * The JSON is written to a `.tmp` sibling and renamed over the real file.
 * Any failure is logged through `execLog` and swallowed.
 *
 * @param repoRoot - Directory that contains the `.pi` folder
 * @param summary - Entry to insert (or to replace an earlier one with)
 */
export function saveBatchHistory(repoRoot: string, summary: BatchHistorySummary): void {
  const filePath = batchHistoryPath(repoRoot);
  try {
    // Upsert: drop the earlier (possibly partial) entry of a resumed batch,
    // then put the fresh summary at the head so the list stays newest-first.
    const others = loadBatchHistory(repoRoot).filter((entry) => entry.batchId !== summary.batchId);
    const merged = [summary, ...others];
    if (merged.length > BATCH_HISTORY_MAX_ENTRIES) {
      merged.length = BATCH_HISTORY_MAX_ENTRIES;
    }

    const parentDir = dirname(filePath);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }

    const stagingPath = `${filePath}.tmp`;
    writeFileSync(stagingPath, JSON.stringify(merged, null, 2));
    renameSync(stagingPath, filePath);
    execLog("batch", "history", `saved batch summary (${merged.length} entries)`);
  } catch (err) {
    execLog("batch", "history", `failed to save batch history: ${err}`);
  }
}
|
|
2007
|
+
|
|
2008
|
+
/**
 * Stamp an existing batch history entry with its integration time.
 *
 * Looks up the entry by `batchId`, sets `integratedAt` on it, and rewrites
 * the history file through a `.tmp` sibling plus rename. When no entry
 * matches, nothing is written and a log line is emitted instead — the batch
 * may predate the history feature. Failures are logged and swallowed.
 *
 * @param repoRoot - Directory that contains the `.pi` folder
 * @param batchId - Identifier of the history entry to update
 * @param integratedAt - Value stored in the entry's `integratedAt` field
 *
 * @since TP-179
 */
export function updateBatchHistoryIntegration(
  repoRoot: string,
  batchId: string,
  integratedAt: number,
): void {
  const filePath = batchHistoryPath(repoRoot);
  try {
    const entries = loadBatchHistory(repoRoot);
    const match = entries.find((candidate) => candidate.batchId === batchId);
    if (!match) {
      const notice = `no history entry found for batchId=${batchId}, skipping integratedAt update`;
      execLog("batch", "history", notice);
      return;
    }

    // `match` is a live element of `entries`, so serialising `entries`
    // below persists the new timestamp.
    match.integratedAt = integratedAt;

    const parentDir = dirname(filePath);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }

    const stagingPath = `${filePath}.tmp`;
    writeFileSync(stagingPath, JSON.stringify(entries, null, 2));
    renameSync(stagingPath, filePath);
    execLog("batch", "history", `updated integratedAt for batchId=${batchId}`);
  } catch (err) {
    execLog("batch", "history", `failed to update integratedAt: ${err}`);
  }
}
|
|
2044
|
+
|
|
2045
|
+
// ── Tier 0 Supervisor Event Logging (TP-039 Step 2) ─────────────────
|
|
2046
|
+
|
|
2047
|
+
/**
 * Discriminator values for events produced by Tier 0 recovery actions.
 *
 * | Value                      | Meaning                                             |
 * | -------------------------- | --------------------------------------------------- |
 * | `tier0_recovery_attempt`   | A recovery action is being tried                    |
 * | `tier0_recovery_success`   | Recovery succeeded                                  |
 * | `tier0_recovery_exhausted` | Retry budget exhausted, escalation needed           |
 * | `tier0_escalation`         | Escalation to supervisor (emitted with "exhausted") |
 *
 * @since TP-039
 */
export type Tier0EventType =
  | "tier0_recovery_attempt"
  | "tier0_recovery_success"
  | "tier0_recovery_exhausted"
  | "tier0_escalation";
|
|
2062
|
+
|
|
2063
|
+
/**
 * One structured record in `.pi/supervisor/events.jsonl`.
 *
 * The first seven fields are always present; the optional ones are filled in
 * depending on the event type and recovery pattern, giving the supervisor
 * agent (Tier 1) enough context to understand what happened and decide what
 * to do next.
 *
 * @since TP-039
 */
export interface Tier0Event {
  // ── Always present ──────────────────────────────────────────────

  /** When the event was created, as an ISO 8601 string. */
  timestamp: string;
  /** Which kind of Tier 0 event this is. */
  type: Tier0EventType;
  /** Identifier of the batch the event belongs to. */
  batchId: string;
  /** Zero-based index of the wave. */
  waveIndex: number;
  /** Recovery pattern being applied. */
  pattern: Tier0RecoveryPattern | "merge_timeout";
  /** One-based number of the current attempt. */
  attempt: number;
  /** Upper bound on attempts. */
  maxAttempts: number;

  // ── Attribution (pattern-dependent) ─────────────────────────────

  /** Task the event concerns; used by task-scoped patterns such as worker_crash. */
  taskId?: string;
  /** Lane the event concerns; used by lane-scoped patterns. */
  laneNumber?: number;
  /** Repo attribution in workspace mode; null/undefined in repo mode. */
  repoId?: string | null;

  // ── Outcome details (event-type-dependent) ──────────────────────

  /** Exit classification or error type. */
  classification?: string;
  /** Error message; used on exhausted events. */
  error?: string;
  /** Description of how the problem was resolved; used on success events. */
  resolution?: string;
  /** Cooldown/timeout before the retry, in milliseconds; used on attempt events. */
  cooldownMs?: number;
  /** Key under which the retry counter is tracked. */
  scopeKey?: string;
  /** Tasks affected; escalation context on exhausted events. */
  affectedTaskIds?: string[];
  /** Suggested remediation; used on exhausted events. */
  suggestion?: string;
  /** Typed escalation payload; present only on `tier0_escalation` events. */
  escalation?: EscalationContext;
}
|
|
2109
|
+
|
|
2110
|
+
/**
 * Assemble the mandatory fields shared by every Tier 0 event.
 *
 * Centralising this keeps all emit sites populating the same fields in the
 * same order, so supervisor consumers see a deterministic event shape.
 * `timestamp` is taken from the clock at call time; every other field is
 * copied from the arguments unchanged.
 *
 * @param type - Event discriminator
 * @param batchId - Batch identifier
 * @param waveIndex - Wave index (0-based)
 * @param pattern - Recovery pattern being applied
 * @param attempt - Current attempt number (1-based)
 * @param maxAttempts - Maximum attempts allowed
 * @returns The required subset of {@link Tier0Event}
 *
 * @since TP-039 R004
 */
export function buildTier0EventBase(
  type: Tier0EventType,
  batchId: string,
  waveIndex: number,
  pattern: Tier0RecoveryPattern | "merge_timeout",
  attempt: number,
  maxAttempts: number,
): Pick<
  Tier0Event,
  "timestamp" | "type" | "batchId" | "waveIndex" | "pattern" | "attempt" | "maxAttempts"
> {
  const timestamp = new Date().toISOString();
  // Property order is kept stable because it is the order JSON.stringify emits.
  return { timestamp, type, batchId, waveIndex, pattern, attempt, maxAttempts };
}
|
|
2139
|
+
|
|
2140
|
+
/**
 * Append one Tier 0 event to `.pi/supervisor/events.jsonl`.
 *
 * Best-effort: the supervisor directory is created on demand and the event
 * is appended as a single JSON line. If anything goes wrong the failure is
 * reported through `execLog` and the function returns normally, so event
 * logging can never crash the batch.
 *
 * @param stateRoot - Root directory for state files (workspace root or repo root)
 * @param event - The event to emit
 *
 * @since TP-039
 */
export function emitTier0Event(stateRoot: string, event: Tier0Event): void {
  try {
    const supervisorDir = join(stateRoot, ".pi", "supervisor");
    if (!existsSync(supervisorDir)) {
      mkdirSync(supervisorDir, { recursive: true });
    }
    appendFileSync(join(supervisorDir, "events.jsonl"), `${JSON.stringify(event)}\n`);
  } catch (err: unknown) {
    // Swallow on purpose: losing an event is preferable to failing the batch.
    const reason = err instanceof Error ? err.message : String(err);
    const details = { eventType: event.type, pattern: event.pattern };
    execLog("batch", event.batchId, `tier0 event write failed: ${reason}`, details);
  }
}
|
|
2169
|
+
|
|
2170
|
+
// ── Engine Event Logging (TP-040) ───────────────────────────────────
|
|
2171
|
+
|
|
2172
|
+
/**
 * Publish an engine lifecycle event.
 *
 * Two things happen, independently of each other:
 *
 * 1. The event is appended as one JSON line to
 *    `.pi/supervisor/events.jsonl` — the same file Tier 0 events go to, so
 *    the supervisor agent consumes a single stream. Engine events cover
 *    batch lifecycle transitions (wave start/end, task completion, merge
 *    phases, batch terminal states).
 * 2. The optional in-process callback (command handler, dashboard) is
 *    invoked with the event. This happens even when the file write failed.
 *
 * Both steps are best-effort: a failure in either is reported through
 * `execLog` and never propagates to the caller.
 *
 * @param stateRoot - Root directory for state files (workspace root or repo root)
 * @param event - The engine event to emit
 * @param callback - Optional in-process event callback
 *
 * @since TP-040
 */
export function emitEngineEvent(
  stateRoot: string,
  event: EngineEvent,
  callback?: ((event: EngineEvent) => void) | null,
): void {
  // Step 1: persist to the shared JSONL stream.
  try {
    const supervisorDir = join(stateRoot, ".pi", "supervisor");
    if (!existsSync(supervisorDir)) {
      mkdirSync(supervisorDir, { recursive: true });
    }
    appendFileSync(join(supervisorDir, "events.jsonl"), `${JSON.stringify(event)}\n`);
  } catch (err: unknown) {
    // Swallow on purpose: losing an event is preferable to failing the batch.
    const reason = err instanceof Error ? err.message : String(err);
    execLog("batch", event.batchId, `engine event write failed: ${reason}`, {
      eventType: event.type,
    });
  }

  // Step 2: notify the in-process listener, if there is one.
  if (!callback) return;
  try {
    callback(event);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    execLog("batch", event.batchId, `engine event callback failed: ${reason}`, {
      eventType: event.type,
    });
  }
}
|
|
2225
|
+
|
|
2226
|
+
// ── TP-187 (#539): Batch-Meta Runtime Artifact ─────────────────────
|
|
2227
|
+
//
|
|
2228
|
+
// Small JSON file written at batch-start to `.pi/runtime/<batchId>/batch-meta.json`.
|
|
2229
|
+
// Captures the wave plan and the few non-recoverable scalars (baseBranch,
|
|
2230
|
+
// orchBranch, mode, startedAt, totalWaves) so that `orch_resume(force=true)`
|
|
2231
|
+
// can deterministically reconstruct a validator-compliant PersistedBatchState
|
|
2232
|
+
// after `orch_abort()` deletes `.pi/batch-state.json`.
|
|
2233
|
+
//
|
|
2234
|
+
// Without this artifact the wave topology is unrecoverable from the surviving
|
|
2235
|
+
// runtime registry alone (manifests don't carry wave info) and a flattened
|
|
2236
|
+
// "single wave with all surviving tasks" reconstruction can violate DAG
|
|
2237
|
+
// dependency ordering. See R003 plan review.
|
|
2238
|
+
|
|
2239
|
+
/**
 * Shape of the schema-tagged `batch-meta.json` file persisted alongside the
 * per-batch runtime state.
 *
 * @since TP-187 (#539)
 */
export interface BatchMetaArtifact {
  /** Schema tag; the loader rejects any value other than `1`. */
  schemaVersion: 1;
  /** Batch this artifact belongs to; also selects the runtime directory it is stored in. */
  batchId: string;
  /** Wave plan: one inner array of task IDs per wave. */
  wavePlan: string[][];
  /** Base branch recorded for the batch. */
  baseBranch: string;
  /** Orchestrator branch recorded for the batch. */
  orchBranch: string;
  /** Workspace mode recorded for the batch. */
  mode: WorkspaceMode;
  /** Batch start time (numeric; presumably epoch milliseconds — confirm against the writer). */
  startedAt: number;
  /** Total number of waves recorded for the batch. */
  totalWaves: number;
}
|
|
2254
|
+
|
|
2255
|
+
/** Location of `batch-meta.json` inside the runtime directory of the given batch. */
function batchMetaPath(stateRoot: string, batchId: string): string {
  const batchRuntimeDir = runtimeRoot(stateRoot, batchId);
  return join(batchRuntimeDir, "batch-meta.json");
}
|
|
2259
|
+
|
|
2260
|
+
/**
 * Write the wave plan and core batch metadata to the batch's runtime
 * artifact directory.
 *
 * Best-effort: the parent directory is created if needed, the JSON is
 * written to a `.tmp` sibling and renamed into place, and any failure is
 * logged without being rethrown, so it does NOT crash the batch.
 *
 * Intended to be called once at batch start (after the wave plan is
 * finalized) and again whenever the wave plan mutates (segment expansion).
 *
 * @param stateRoot - Root directory for state files
 * @param artifact - Metadata to persist; `artifact.batchId` selects the target directory
 *
 * @since TP-187 (#539)
 */
export function saveBatchMetaRuntimeArtifact(stateRoot: string, artifact: BatchMetaArtifact): void {
  try {
    const targetPath = batchMetaPath(stateRoot, artifact.batchId);
    mkdirSync(dirname(targetPath), { recursive: true });

    const stagingPath = `${targetPath}.tmp`;
    writeFileSync(stagingPath, `${JSON.stringify(artifact, null, 2)}\n`, "utf-8");
    renameSync(stagingPath, targetPath);

    // Summary numbers for the log line only.
    const { wavePlan } = artifact;
    const waveCount = wavePlan.length;
    let taskCount = 0;
    for (const wave of wavePlan) {
      taskCount += wave.length;
    }
    execLog("state", artifact.batchId, "persisted batch-meta runtime artifact", {
      waves: waveCount,
      tasks: taskCount,
    });
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    execLog("state", artifact.batchId, `batch-meta write failed: ${reason}`);
  }
}
|
|
2288
|
+
|
|
2289
|
+
/**
 * Read and validate the batch-meta artifact of a batch.
 *
 * Returns `null` when the file is absent, unreadable, not valid JSON, or
 * fails any of the structural checks below; otherwise returns the parsed
 * object.
 *
 * Checks performed: `schemaVersion` is exactly `1`; `batchId` is a string
 * equal to the requested one; `wavePlan` is an array of arrays of strings;
 * `baseBranch` and `orchBranch` are strings; `mode` is `"repo"` or
 * `"workspace"`; `startedAt` and `totalWaves` are numbers.
 *
 * @param stateRoot - Root directory for state files
 * @param batchId - Batch whose artifact should be loaded
 *
 * @since TP-187 (#539)
 */
export function loadBatchMetaRuntimeArtifact(
  stateRoot: string,
  batchId: string,
): BatchMetaArtifact | null {
  const path = batchMetaPath(stateRoot, batchId);
  if (!existsSync(path)) return null;

  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    if (!parsed || typeof parsed !== "object") return null;
    const obj = parsed as Record<string, unknown>;

    const hasExpectedIdentity =
      obj.schemaVersion === 1 && typeof obj.batchId === "string" && obj.batchId === batchId;

    const hasWellFormedWavePlan =
      Array.isArray(obj.wavePlan) &&
      obj.wavePlan.every(
        (wave) => Array.isArray(wave) && wave.every((taskId) => typeof taskId === "string"),
      );

    const hasWellFormedScalars =
      typeof obj.baseBranch === "string" &&
      typeof obj.orchBranch === "string" &&
      (obj.mode === "repo" || obj.mode === "workspace") &&
      typeof obj.startedAt === "number" &&
      typeof obj.totalWaves === "number";

    if (!(hasExpectedIdentity && hasWellFormedWavePlan && hasWellFormedScalars)) return null;
    return obj as unknown as BatchMetaArtifact;
  } catch {
    // Read or parse failure: treated the same as an invalid artifact.
    return null;
  }
}
|
|
2324
|
+
|
|
2325
|
+
// ── TP-187 (#539): Reconstruct PersistedBatchState from runtime artifacts ──
|
|
2326
|
+
|
|
2327
|
+
/**
 * Outcome of `reconstructBatchStateFromRuntime`.
 *
 * - Success (`ok: true`): carries the validator-compliant state, the
 *   selected batchId, and a human-readable note describing how the batch was
 *   selected (used by resume's onNotify output).
 * - Failure (`ok: false`): `error` names the missing or corrupt artifact for
 *   fail-loud reporting.
 *
 * TP-195: the success member declares `error?: undefined` so that the union
 * is a well-formed discriminated union under `strict: false`. Non-strict
 * narrowing requires every member to share the discriminating fields;
 * without the declaration, `if (!result.ok)` does not narrow `error`.
 * Runtime semantics are unchanged — the success branch never carries an
 * error.
 *
 * @since TP-187 (#539)
 */
export type ReconstructResult =
  | {
      ok: true;
      state: PersistedBatchState;
      batchId: string;
      selectionNote: string;
      error?: undefined;
    }
  | {
      ok: false;
      error: string;
    };
|
|
2349
|
+
|
|
2350
|
+
/**
 * Enumerate the `.pi/runtime/<batchId>/` directories under `stateRoot`.
 *
 * The result is ordered newest-first by directory mtime; ties are broken by
 * putting the lexicographically larger batchId first, which keeps the order
 * deterministic. Entries that are not directories, or that cannot be
 * stat'ed, are left out. A missing or unreadable runtime directory yields
 * an empty list.
 */
function listRuntimeBatchDirs(stateRoot: string): { batchId: string; mtimeMs: number }[] {
  const runtimeDir = join(stateRoot, ".pi", "runtime");
  if (!existsSync(runtimeDir)) return [];

  let names: string[];
  try {
    names = readdirSync(runtimeDir);
  } catch {
    return [];
  }

  const found = names.flatMap((batchId) => {
    try {
      const info = statSync(join(runtimeDir, batchId));
      return info.isDirectory() ? [{ batchId, mtimeMs: info.mtimeMs }] : [];
    } catch {
      // Entry vanished or is inaccessible: ignore it.
      return [];
    }
  });

  return found.sort((left, right) =>
    left.mtimeMs === right.mtimeMs
      ? right.batchId.localeCompare(left.batchId)
      : right.mtimeMs - left.mtimeMs,
  );
}
|
|
2380
|
+
|
|
2381
|
+
/**
 * Collect the worker manifests stored under `.pi/runtime/<batchId>/agents/`.
 *
 * Each entry of the agents directory is treated as an agent ID whose
 * manifest path is resolved with `runtimeManifestPath`. Only manifests that
 * parse as JSON objects with `role === "worker"` are returned; missing,
 * unreadable, or malformed manifests are skipped.
 *
 * Returns an empty array if the agents directory is missing or cannot be
 * listed.
 */
function readWorkerManifests(stateRoot: string, batchId: string): RuntimeAgentManifest[] {
  const agentsDir = join(runtimeRoot(stateRoot, batchId), "agents");
  if (!existsSync(agentsDir)) return [];

  let agentIds: string[];
  try {
    agentIds = readdirSync(agentsDir);
  } catch {
    return [];
  }

  const workers: RuntimeAgentManifest[] = [];
  for (const agentId of agentIds) {
    const manifestPath = runtimeManifestPath(stateRoot, batchId, agentId);
    if (!existsSync(manifestPath)) continue;

    try {
      const candidate = JSON.parse(readFileSync(manifestPath, "utf-8")) as RuntimeAgentManifest;
      const isWorker =
        Boolean(candidate) && typeof candidate === "object" && candidate.role === "worker";
      if (isWorker) {
        workers.push(candidate);
      }
    } catch {
      // Unreadable or malformed manifest: skip this agent.
    }
  }
  return workers;
}
|
|
2411
|
+
|
|
2412
|
+
/**
 * Deterministically reconstruct a validator-compliant `PersistedBatchState`
 * from the surviving runtime artifacts after `.pi/batch-state.json` has been
 * deleted (typically by `orch_abort()`).
 *
 * Candidate batches under `.pi/runtime/` are tried newest-first. The first
 * candidate that satisfies ALL of the following is used:
 *
 *   1. its `batch-meta.json` loads and validates;
 *   2. it has at least one worker manifest whose `cwd` still exists on disk;
 *   3. its worker manifests (ALL of them, including those whose worktree is
 *      already gone) name at most one distinct `repoId` — multi-repo batches
 *      are refused because segment topology cannot be recovered;
 *   4. at least one usable manifest carries a numeric `laneNumber`;
 *   5. the rebuilt state passes `validatePersistedState`.
 *
 * The rebuilt state is deliberately conservative: phase `"stopped"`, every
 * task `"pending"`, all counters zero, `currentWaveIndex` 0, no segments,
 * and `resilience.resumeForced` set to `true`.
 *
 * Anything else returns a fail-loud error so the caller can surface a clear
 * "no resumable state" message instead of silently producing an invalid
 * state.
 *
 * @param stateRoot - Root directory that contains `.pi/runtime/`
 * @returns On success the state, its batchId and a note on how the batch was
 *   selected; on failure an error string listing up to three per-candidate
 *   rejection reasons
 *
 * @since TP-187 (#539)
 */
export function reconstructBatchStateFromRuntime(stateRoot: string): ReconstructResult {
  const candidates = listRuntimeBatchDirs(stateRoot);
  if (candidates.length === 0) {
    return { ok: false, error: "no .pi/runtime/ directory or no batch subdirectories" };
  }

  // Try the newest batch first; if its required artifacts are missing, fall
  // through to the next candidate. We stop at the first batch with a parseable
  // batch-meta + at least one viable worker manifest.
  const failures: string[] = [];
  for (let idx = 0; idx < candidates.length; idx++) {
    const cand = candidates[idx];
    const meta = loadBatchMetaRuntimeArtifact(stateRoot, cand.batchId);
    if (!meta) {
      failures.push(`${cand.batchId}: batch-meta.json missing or invalid`);
      continue;
    }
    const manifests = readWorkerManifests(stateRoot, cand.batchId);
    if (manifests.length === 0) {
      failures.push(`${cand.batchId}: no worker manifests`);
      continue;
    }
    const workerManifestsWithWorktree = manifests.filter(
      (m) => typeof m.cwd === "string" && m.cwd.length > 0 && existsSync(m.cwd),
    );
    if (workerManifestsWithWorktree.length === 0) {
      failures.push(`${cand.batchId}: worktree paths from manifests no longer exist on disk`);
      continue;
    }

    // TP-187 (#539) — sage post-integration follow-up: refuse reconstruction
    // when the runtime artifacts indicate this batch was multi-repo (segment
    // expansion). Reconstruction hardcodes `segments: []` and cannot recover
    // the per-segment topology that lives only in the deleted batch-state.
    // Resuming with `segments: []` for a multi-repo batch would silently lose
    // the expansion state and could re-execute already-done segments OR fail
    // dependency checks for cross-repo waves. Detection heuristic: if worker
    // manifests carry more than one distinct repoId, segment expansion was
    // active. Single-repo batches (the common case, including OrchID's
    // own self-orchestration) are unaffected.
    //
    // The scan covers every worker manifest of the batch, not only those
    // whose worktree still exists: whether a batch was multi-repo does not
    // depend on which worktrees happen to survive, and a multi-repo batch
    // with one repo's worktrees already removed must still be refused.
    {
      const distinctRepoIds = new Set<string>();
      for (const m of manifests) {
        if (typeof m.repoId === "string" && m.repoId.length > 0) {
          distinctRepoIds.add(m.repoId);
        }
      }
      if (distinctRepoIds.size > 1) {
        failures.push(
          `${cand.batchId}: multi-repo batch detected (${distinctRepoIds.size} distinct repoIds: ` +
            `${[...distinctRepoIds].slice(0, 4).join(", ")}` +
            `${distinctRepoIds.size > 4 ? ", ..." : ""}); reconstruction would lose segment ` +
            `expansion state and is refused. Restore .pi/batch-state.json from backup or start a new batch.`,
        );
        continue;
      }
    }

    // Build per-lane aggregation from worker manifests. The first manifest
    // seen for a lane supplies its agentId / worktree / repoId; later ones
    // only contribute additional task IDs.
    const laneMap = new Map<
      number,
      { laneNumber: number; agentId: string; worktreePath: string; repoId: string; taskIds: string[] }
    >();
    for (const m of workerManifestsWithWorktree) {
      if (typeof m.laneNumber !== "number") continue;
      const lane = laneMap.get(m.laneNumber) ?? {
        laneNumber: m.laneNumber,
        agentId: m.agentId,
        worktreePath: m.cwd,
        repoId: m.repoId ?? "default",
        taskIds: [] as string[],
      };
      if (typeof m.taskId === "string" && m.taskId && !lane.taskIds.includes(m.taskId)) {
        lane.taskIds.push(m.taskId);
      }
      laneMap.set(m.laneNumber, lane);
    }
    if (laneMap.size === 0) {
      failures.push(`${cand.batchId}: no lane numbers in manifests`);
      continue;
    }

    // Tasks: union of taskIds across all lanes, plus any wavePlan tasks that
    // are not represented (they are pending, not yet executed).
    const knownTaskIds = new Set<string>();
    for (const lane of laneMap.values()) {
      for (const tid of lane.taskIds) knownTaskIds.add(tid);
    }
    for (const wave of meta.wavePlan) {
      for (const tid of wave) knownTaskIds.add(tid);
    }

    // Build task records with conservative defaults; resume's reconciliation
    // pass will re-detect succeeded tasks via `.DONE` markers and STATUS.md.
    const tasks: PersistedTaskRecord[] = [];
    // When several manifests name the same task, the last one wins.
    const manifestByTaskId = new Map<string, RuntimeAgentManifest>();
    for (const m of workerManifestsWithWorktree) {
      if (typeof m.taskId === "string" && m.taskId) {
        manifestByTaskId.set(m.taskId, m);
      }
    }
    for (const taskId of knownTaskIds) {
      const m = manifestByTaskId.get(taskId);
      const lane = m ? laneMap.get(m.laneNumber) : undefined;
      // TP-195: dropped `taskName: taskId` — not on `PersistedTaskRecord`
      // schema; no consumer reads `.taskName` from persisted records
      // (only from `ParsedTask`). Was being added via untyped property
      // bag cast that the Step 0 typecheck inventory flagged.
      const taskRecord: PersistedTaskRecord = {
        taskId,
        taskFolder: m?.packet?.taskFolder ?? "",
        status: "pending",
        sessionName: m?.agentId ?? "",
        // Tasks known only from the wave plan have no manifest and fall back to lane 0.
        laneNumber: lane?.laneNumber ?? 0,
        startedAt: typeof m?.startedAt === "number" ? m.startedAt : null,
        endedAt: null,
        exitReason: "",
        doneFileFound: false,
      };
      if (m?.repoId) taskRecord.repoId = m.repoId;
      // TP-195: dropped dead reads of `m.packet.packetRepoId` /
      // `.packetTaskPath`. `m.packet` is `PacketPaths` which has only
      // `promptPath`/`statusPath`/`donePath`/`reviewsDir`/`taskFolder`
      // — the `packetRepoId`/`packetTaskPath` fields exist on
      // `PersistedTaskRecord` and `ParsedTask`, not on `PacketPaths`,
      // so these reads always returned undefined and the if-branches
      // never fired. Removed under the no-behavior-change guarantee.
      tasks.push(taskRecord);
    }

    // Build lane records, ordered by lane number.
    const lanes: PersistedLaneRecord[] = Array.from(laneMap.values())
      .sort((a, b) => a.laneNumber - b.laneNumber)
      .map((l) => {
        // The lane session ID is the agent ID without its role suffix.
        const sessionId = l.agentId.replace(/-(worker|reviewer)$/, "");
        const rec: PersistedLaneRecord = {
          laneId: `lane-${l.laneNumber}`,
          laneNumber: l.laneNumber,
          laneSessionId: sessionId,
          worktreePath: l.worktreePath,
          branch: meta.orchBranch ? `${meta.orchBranch}-lane-${l.laneNumber}` : `lane-${l.laneNumber}`,
          taskIds: [...l.taskIds],
        };
        if (l.repoId && l.repoId !== "default") rec.repoId = l.repoId;
        return rec;
      });

    const now = Date.now();
    const reconstructed: PersistedBatchState = {
      schemaVersion: BATCH_STATE_SCHEMA_VERSION,
      batchId: meta.batchId,
      phase: "stopped",
      baseBranch: meta.baseBranch,
      orchBranch: meta.orchBranch,
      mode: meta.mode,
      startedAt: meta.startedAt,
      endedAt: null,
      updatedAt: now,
      currentWaveIndex: 0,
      totalWaves: meta.totalWaves,
      totalTasks: tasks.length,
      succeededTasks: 0,
      failedTasks: 0,
      skippedTasks: 0,
      blockedTasks: 0,
      wavePlan: meta.wavePlan.map((wave) => [...wave]),
      lanes,
      tasks,
      mergeResults: [],
      blockedTaskIds: [],
      errors: [],
      segments: [],
      lastError: null,
      resilience: { ...defaultResilienceState(), resumeForced: true },
      diagnostics: defaultBatchDiagnostics(),
    } as PersistedBatchState;

    // Validate the reconstructed shape against the on-disk schema gate.
    // The JSON round-trip makes the validator see exactly what a later
    // load from disk would see.
    try {
      const json = JSON.stringify(reconstructed);
      validatePersistedState(JSON.parse(json));
    } catch (err) {
      failures.push(
        `${cand.batchId}: reconstructed state failed validation: ${err instanceof Error ? err.message : String(err)}`,
      );
      continue;
    }

    const totalCandidates = candidates.length;
    const selectionNote =
      totalCandidates === 1
        ? `single batch in .pi/runtime/`
        : `selected from ${totalCandidates} candidate(s) by mtime newest-first (skipped ${idx} earlier candidate(s))`;
    return { ok: true, state: reconstructed, batchId: meta.batchId, selectionNote };
  }

  // Every candidate was rejected; report the first few reasons.
  return {
    ok: false,
    error: `no reconstructable batch found in .pi/runtime/ (${failures.length} candidate(s) inspected: ${failures.slice(0, 3).join("; ")}${failures.length > 3 ? "; ..." : ""})`,
  };
}
|