taskplane 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +2 -20
- package/bin/taskplane.mjs +706 -0
- package/dashboard/public/app.js +900 -0
- package/dashboard/public/index.html +92 -0
- package/dashboard/public/style.css +924 -0
- package/dashboard/server.cjs +531 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/extensions/task-runner.ts +1923 -0
- package/extensions/taskplane/abort.ts +466 -0
- package/extensions/taskplane/config.ts +102 -0
- package/extensions/taskplane/discovery.ts +988 -0
- package/extensions/taskplane/engine.ts +758 -0
- package/extensions/taskplane/execution.ts +1752 -0
- package/extensions/taskplane/extension.ts +577 -0
- package/extensions/taskplane/formatting.ts +718 -0
- package/extensions/taskplane/git.ts +38 -0
- package/extensions/taskplane/index.ts +22 -0
- package/extensions/taskplane/merge.ts +795 -0
- package/extensions/taskplane/messages.ts +134 -0
- package/extensions/taskplane/persistence.ts +1121 -0
- package/extensions/taskplane/resume.ts +1092 -0
- package/extensions/taskplane/sessions.ts +92 -0
- package/extensions/taskplane/types.ts +1514 -0
- package/extensions/taskplane/waves.ts +900 -0
- package/extensions/taskplane/worktree.ts +1624 -0
- package/package.json +48 -3
- package/skills/create-taskplane-task/SKILL.md +326 -0
- package/skills/create-taskplane-task/references/context-template.md +78 -0
- package/skills/create-taskplane-task/references/prompt-template.md +246 -0
- package/templates/agents/task-merger.md +256 -0
- package/templates/agents/task-reviewer.md +81 -0
- package/templates/agents/task-worker.md +140 -0
- package/templates/config/task-orchestrator.yaml +89 -0
- package/templates/config/task-runner.yaml +99 -0
- package/templates/tasks/CONTEXT.md +31 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +90 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
|
@@ -0,0 +1,1121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* State persistence, serialization, orphan detection
|
|
3
|
+
* @module orch/persistence
|
|
4
|
+
*/
|
|
5
|
+
import { readFileSync, writeFileSync, existsSync, unlinkSync, renameSync, mkdirSync } from "fs";
|
|
6
|
+
import { execSync } from "child_process";
|
|
7
|
+
import { join, dirname, basename } from "path";
|
|
8
|
+
|
|
9
|
+
import { execLog } from "./execution.ts";
|
|
10
|
+
import { BATCH_STATE_SCHEMA_VERSION, StateFileError, batchStatePath, BATCH_HISTORY_MAX_ENTRIES } from "./types.ts";
|
|
11
|
+
import type { BatchHistorySummary } from "./types.ts";
|
|
12
|
+
import type { AllocatedLane, DiscoveryResult, LaneTaskOutcome, LaneTaskStatus, MonitorState, OrchBatchPhase, OrchBatchRuntimeState, PersistedBatchState, PersistedLaneRecord, PersistedMergeResult, PersistedTaskRecord, TaskMonitorSnapshot } from "./types.ts";
|
|
13
|
+
import { sleepSync } from "./worktree.ts";
|
|
14
|
+
|
|
15
|
+
// ── State Persistence Helper (TS-009 Step 2) ────────────────────────
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Candidate .DONE file locations for a task folder.
|
|
19
|
+
*
|
|
20
|
+
* Task-runner archives completed tasks by moving:
|
|
21
|
+
* tasks/<task-folder>/ → tasks/archive/<task-folder>/
|
|
22
|
+
*
|
|
23
|
+
* During resume/orphan detection we must check both locations.
|
|
24
|
+
*/
|
|
25
|
+
export function getTaskDoneFileCandidates(taskFolder: string): string[] {
|
|
26
|
+
const candidates = [join(taskFolder, ".DONE")];
|
|
27
|
+
const parent = dirname(taskFolder);
|
|
28
|
+
const taskFolderName = basename(taskFolder);
|
|
29
|
+
|
|
30
|
+
// If already in archive, avoid duplicate candidate.
|
|
31
|
+
if (basename(parent).toLowerCase() !== "archive") {
|
|
32
|
+
candidates.push(join(parent, "archive", taskFolderName, ".DONE"));
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return candidates;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Check whether a task has a .DONE marker in active or archived location.
|
|
40
|
+
*/
|
|
41
|
+
export function hasTaskDoneMarker(taskFolder: string): boolean {
|
|
42
|
+
for (const donePath of getTaskDoneFileCandidates(taskFolder)) {
|
|
43
|
+
try {
|
|
44
|
+
if (existsSync(donePath)) return true;
|
|
45
|
+
} catch {
|
|
46
|
+
// Ignore filesystem errors here; caller handles partial visibility.
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
return false;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Upsert a task outcome in-place. Returns true if changed.
|
|
54
|
+
*/
|
|
55
|
+
export function upsertTaskOutcome(outcomes: LaneTaskOutcome[], next: LaneTaskOutcome): boolean {
|
|
56
|
+
const idx = outcomes.findIndex(o => o.taskId === next.taskId);
|
|
57
|
+
if (idx < 0) {
|
|
58
|
+
outcomes.push(next);
|
|
59
|
+
return true;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const prev = outcomes[idx];
|
|
63
|
+
const changed =
|
|
64
|
+
prev.status !== next.status ||
|
|
65
|
+
prev.startTime !== next.startTime ||
|
|
66
|
+
prev.endTime !== next.endTime ||
|
|
67
|
+
prev.exitReason !== next.exitReason ||
|
|
68
|
+
prev.sessionName !== next.sessionName ||
|
|
69
|
+
prev.doneFileFound !== next.doneFileFound;
|
|
70
|
+
|
|
71
|
+
if (changed) {
|
|
72
|
+
outcomes[idx] = next;
|
|
73
|
+
}
|
|
74
|
+
return changed;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Seed pending outcomes for all tasks in newly allocated lanes.
|
|
79
|
+
*
|
|
80
|
+
* Ensures the persisted state has a full task registry as soon as a wave starts,
|
|
81
|
+
* including lane/session assignment, even before tasks finish.
|
|
82
|
+
*/
|
|
83
|
+
export function seedPendingOutcomesForAllocatedLanes(
|
|
84
|
+
lanes: AllocatedLane[],
|
|
85
|
+
outcomes: LaneTaskOutcome[],
|
|
86
|
+
): boolean {
|
|
87
|
+
let changed = false;
|
|
88
|
+
for (const lane of lanes) {
|
|
89
|
+
for (const laneTask of lane.tasks) {
|
|
90
|
+
const existing = outcomes.find(o => o.taskId === laneTask.taskId);
|
|
91
|
+
if (existing) continue;
|
|
92
|
+
changed = upsertTaskOutcome(outcomes, {
|
|
93
|
+
taskId: laneTask.taskId,
|
|
94
|
+
status: "pending",
|
|
95
|
+
startTime: null,
|
|
96
|
+
endTime: null,
|
|
97
|
+
exitReason: "Pending execution",
|
|
98
|
+
sessionName: lane.tmuxSessionName,
|
|
99
|
+
doneFileFound: false,
|
|
100
|
+
}) || changed;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
return changed;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
 * Sync accumulated task outcomes from monitor snapshots.
 *
 * This captures in-wave task transitions (pending → running → terminal)
 * so state persistence does not lag until wave completion.
 *
 * Lanes are scanned in four passes: remaining → pending, completed →
 * succeeded, failed → failed, and finally the current-task snapshot, which
 * can map to any status.
 *
 * @param monitorState - Latest monitor poll result for all lanes
 * @param outcomes - Accumulated outcomes, mutated in-place via upsert
 * @returns true if any outcome was added or replaced
 */
export function syncTaskOutcomesFromMonitor(
  monitorState: MonitorState,
  outcomes: LaneTaskOutcome[],
): boolean {
  let changed = false;

  for (const lane of monitorState.lanes) {
    // Remaining tasks => pending
    for (const taskId of lane.remainingTasks) {
      const existing = outcomes.find(o => o.taskId === taskId);
      // Never downgrade an already-terminal outcome back to pending.
      if (existing && (existing.status === "succeeded" || existing.status === "failed" || existing.status === "stalled")) {
        continue;
      }
      // Note: upsert is the left operand of || so it always executes even
      // when `changed` is already true.
      changed = upsertTaskOutcome(outcomes, {
        taskId,
        status: "pending",
        startTime: existing?.startTime ?? null,
        endTime: null,
        exitReason: existing?.exitReason || "Pending execution",
        sessionName: existing?.sessionName || lane.sessionName,
        doneFileFound: false,
      }) || changed;
    }

    // Completed tasks => succeeded
    for (const taskId of lane.completedTasks) {
      const existing = outcomes.find(o => o.taskId === taskId);
      // endTime uses the poll timestamp — an approximation of the actual
      // completion time, accurate to one poll interval.
      changed = upsertTaskOutcome(outcomes, {
        taskId,
        status: "succeeded",
        startTime: existing?.startTime ?? null,
        endTime: monitorState.lastPollTime,
        exitReason: existing?.exitReason || ".DONE file created by task-runner",
        sessionName: existing?.sessionName || lane.sessionName,
        doneFileFound: true,
      }) || changed;
    }

    // Failed tasks => failed
    for (const taskId of lane.failedTasks) {
      const existing = outcomes.find(o => o.taskId === taskId);
      changed = upsertTaskOutcome(outcomes, {
        taskId,
        status: "failed",
        startTime: existing?.startTime ?? null,
        endTime: monitorState.lastPollTime,
        exitReason: existing?.exitReason || "Task failed or stalled",
        sessionName: existing?.sessionName || lane.sessionName,
        doneFileFound: false,
      }) || changed;
    }

    // Current task snapshot => running/stalled/succeeded/failed/skipped
    if (lane.currentTaskId && lane.currentTaskSnapshot) {
      const snap = lane.currentTaskSnapshot;
      const existing = outcomes.find(o => o.taskId === lane.currentTaskId);
      // Map monitor-level statuses onto lane-task statuses. "unknown" keeps
      // whatever we already recorded, defaulting to "running".
      const monitorToLane: Record<TaskMonitorSnapshot["status"], LaneTaskStatus> = {
        pending: "pending",
        running: "running",
        succeeded: "succeeded",
        failed: "failed",
        stalled: "stalled",
        skipped: "skipped",
        unknown: existing?.status || "running",
      };
      const mappedStatus = monitorToLane[snap.status];
      const terminal = mappedStatus === "succeeded" || mappedStatus === "failed" || mappedStatus === "stalled" || mappedStatus === "skipped";

      // startTime falls back to the snapshot heartbeat/observation time —
      // presumably close to the actual start; NOTE(review): confirm the
      // monitor's first heartbeat approximates task start.
      changed = upsertTaskOutcome(outcomes, {
        taskId: lane.currentTaskId,
        status: mappedStatus,
        startTime: existing?.startTime ?? snap.lastHeartbeat ?? snap.observedAt,
        endTime: terminal ? (existing?.endTime ?? snap.observedAt) : null,
        exitReason: existing?.exitReason || (mappedStatus === "running" ? "Task in progress" : (snap.stallReason || "Task reached terminal state")),
        sessionName: existing?.sessionName || lane.sessionName,
        doneFileFound: snap.doneFileFound,
      }) || changed;
    }
  }

  return changed;
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Persist current runtime state to `.pi/batch-state.json`.
|
|
197
|
+
*
|
|
198
|
+
* Centralized helper that serializes runtime state, enriches task records
|
|
199
|
+
* with folder paths from discovery, and writes atomically. Logs the reason,
|
|
200
|
+
* batchId, phase, and waveIndex for each write.
|
|
201
|
+
*
|
|
202
|
+
* Write failures are non-fatal: logged as errors and added to
|
|
203
|
+
* batchState.errors, but do NOT crash the batch execution.
|
|
204
|
+
*
|
|
205
|
+
* @param reason - Human-readable reason for this state write (e.g., "batch-start", "wave-index-change")
|
|
206
|
+
* @param batchState - Current runtime batch state
|
|
207
|
+
* @param wavePlan - Wave plan (array of arrays of task IDs)
|
|
208
|
+
* @param lanes - Currently allocated lanes (latest wave's lanes)
|
|
209
|
+
* @param allTaskOutcomes - All task outcomes accumulated across completed waves
|
|
210
|
+
* @param discovery - Discovery result (for enriching taskFolder paths)
|
|
211
|
+
* @param repoRoot - Absolute path to the repository root
|
|
212
|
+
*/
|
|
213
|
+
export function persistRuntimeState(
|
|
214
|
+
reason: string,
|
|
215
|
+
batchState: OrchBatchRuntimeState,
|
|
216
|
+
wavePlan: string[][],
|
|
217
|
+
lanes: AllocatedLane[],
|
|
218
|
+
allTaskOutcomes: LaneTaskOutcome[],
|
|
219
|
+
discovery: DiscoveryResult | null,
|
|
220
|
+
repoRoot: string,
|
|
221
|
+
): void {
|
|
222
|
+
try {
|
|
223
|
+
const json = serializeBatchState(batchState, wavePlan, lanes, allTaskOutcomes);
|
|
224
|
+
|
|
225
|
+
// Enrich task records with folder paths from discovery
|
|
226
|
+
if (discovery) {
|
|
227
|
+
const parsed = JSON.parse(json) as PersistedBatchState;
|
|
228
|
+
for (const taskRecord of parsed.tasks) {
|
|
229
|
+
const parsedTask = discovery.pending.get(taskRecord.taskId);
|
|
230
|
+
if (parsedTask) {
|
|
231
|
+
taskRecord.taskFolder = parsedTask.taskFolder;
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
const enrichedJson = JSON.stringify(parsed, null, 2);
|
|
235
|
+
saveBatchState(enrichedJson, repoRoot);
|
|
236
|
+
} else {
|
|
237
|
+
saveBatchState(json, repoRoot);
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
execLog("state", batchState.batchId, `persisted: ${reason}`, {
|
|
241
|
+
phase: batchState.phase,
|
|
242
|
+
waveIndex: batchState.currentWaveIndex,
|
|
243
|
+
});
|
|
244
|
+
} catch (err: unknown) {
|
|
245
|
+
const msg = err instanceof StateFileError
|
|
246
|
+
? `[${err.code}] ${err.message}`
|
|
247
|
+
: (err instanceof Error ? err.message : String(err));
|
|
248
|
+
execLog("state", batchState.batchId, `write failed: ${msg}`, {
|
|
249
|
+
reason,
|
|
250
|
+
phase: batchState.phase,
|
|
251
|
+
});
|
|
252
|
+
batchState.errors.push(`State persistence failed (${reason}): ${msg}`);
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
// ── State Validation ─────────────────────────────────────────────────
|
|
258
|
+
|
|
259
|
+
/** All valid OrchBatchPhase values for validation. */
|
|
260
|
+
export const VALID_BATCH_PHASES: ReadonlySet<string> = new Set([
|
|
261
|
+
"idle", "planning", "executing", "merging", "paused", "stopped", "completed", "failed",
|
|
262
|
+
]);
|
|
263
|
+
|
|
264
|
+
/** All valid LaneTaskStatus values for validation. */
|
|
265
|
+
export const VALID_TASK_STATUSES: ReadonlySet<string> = new Set([
|
|
266
|
+
"pending", "running", "succeeded", "failed", "stalled", "skipped",
|
|
267
|
+
]);
|
|
268
|
+
|
|
269
|
+
/** All valid merge result statuses for persisted state. */
|
|
270
|
+
export const VALID_PERSISTED_MERGE_STATUSES: ReadonlySet<string> = new Set([
|
|
271
|
+
"succeeded", "failed", "partial",
|
|
272
|
+
]);
|
|
273
|
+
|
|
274
|
+
/**
|
|
275
|
+
* Validate a parsed JSON object as a PersistedBatchState.
|
|
276
|
+
*
|
|
277
|
+
* Checks:
|
|
278
|
+
* 1. Schema version matches BATCH_STATE_SCHEMA_VERSION
|
|
279
|
+
* 2. All required fields are present with correct types
|
|
280
|
+
* 3. Enum fields contain valid values (phase, task statuses, merge statuses)
|
|
281
|
+
* 4. Arrays contain valid sub-records
|
|
282
|
+
*
|
|
283
|
+
* @param data - Parsed JSON (unknown type)
|
|
284
|
+
* @returns Validated PersistedBatchState
|
|
285
|
+
* @throws StateFileError with STATE_SCHEMA_INVALID on any validation failure
|
|
286
|
+
*/
|
|
287
|
+
export function validatePersistedState(data: unknown): PersistedBatchState {
|
|
288
|
+
if (!data || typeof data !== "object") {
|
|
289
|
+
throw new StateFileError(
|
|
290
|
+
"STATE_SCHEMA_INVALID",
|
|
291
|
+
"Batch state must be a non-null object",
|
|
292
|
+
);
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
const obj = data as Record<string, unknown>;
|
|
296
|
+
|
|
297
|
+
// ── Schema version ───────────────────────────────────────────
|
|
298
|
+
if (typeof obj.schemaVersion !== "number") {
|
|
299
|
+
throw new StateFileError(
|
|
300
|
+
"STATE_SCHEMA_INVALID",
|
|
301
|
+
`Missing or invalid "schemaVersion" field (expected number, got ${typeof obj.schemaVersion})`,
|
|
302
|
+
);
|
|
303
|
+
}
|
|
304
|
+
if (obj.schemaVersion !== BATCH_STATE_SCHEMA_VERSION) {
|
|
305
|
+
throw new StateFileError(
|
|
306
|
+
"STATE_SCHEMA_INVALID",
|
|
307
|
+
`Unsupported schema version ${obj.schemaVersion} (expected ${BATCH_STATE_SCHEMA_VERSION}). ` +
|
|
308
|
+
`Delete .pi/batch-state.json and re-run the batch.`,
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
// ── Required string fields ───────────────────────────────────
|
|
313
|
+
for (const field of ["phase", "batchId"] as const) {
|
|
314
|
+
if (typeof obj[field] !== "string") {
|
|
315
|
+
throw new StateFileError(
|
|
316
|
+
"STATE_SCHEMA_INVALID",
|
|
317
|
+
`Missing or invalid "${field}" field (expected string, got ${typeof obj[field]})`,
|
|
318
|
+
);
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// ── Phase enum validation ────────────────────────────────────
|
|
323
|
+
if (!VALID_BATCH_PHASES.has(obj.phase as string)) {
|
|
324
|
+
throw new StateFileError(
|
|
325
|
+
"STATE_SCHEMA_INVALID",
|
|
326
|
+
`Invalid "phase" value "${obj.phase}" (expected one of: ${[...VALID_BATCH_PHASES].join(", ")})`,
|
|
327
|
+
);
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// ── Required number fields ───────────────────────────────────
|
|
331
|
+
for (const field of [
|
|
332
|
+
"startedAt", "updatedAt", "currentWaveIndex", "totalWaves",
|
|
333
|
+
"totalTasks", "succeededTasks", "failedTasks", "skippedTasks", "blockedTasks",
|
|
334
|
+
] as const) {
|
|
335
|
+
if (typeof obj[field] !== "number") {
|
|
336
|
+
throw new StateFileError(
|
|
337
|
+
"STATE_SCHEMA_INVALID",
|
|
338
|
+
`Missing or invalid "${field}" field (expected number, got ${typeof obj[field]})`,
|
|
339
|
+
);
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
// ── Nullable number: endedAt ─────────────────────────────────
|
|
344
|
+
if (obj.endedAt !== null && typeof obj.endedAt !== "number") {
|
|
345
|
+
throw new StateFileError(
|
|
346
|
+
"STATE_SCHEMA_INVALID",
|
|
347
|
+
`Invalid "endedAt" field (expected number or null, got ${typeof obj.endedAt})`,
|
|
348
|
+
);
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
// ── Required arrays ──────────────────────────────────────────
|
|
352
|
+
for (const field of ["wavePlan", "lanes", "tasks", "mergeResults", "blockedTaskIds", "errors"] as const) {
|
|
353
|
+
if (!Array.isArray(obj[field])) {
|
|
354
|
+
throw new StateFileError(
|
|
355
|
+
"STATE_SCHEMA_INVALID",
|
|
356
|
+
`Missing or invalid "${field}" field (expected array, got ${typeof obj[field]})`,
|
|
357
|
+
);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// ── Validate wavePlan: array of arrays of strings ────────────
|
|
362
|
+
const wavePlan = obj.wavePlan as unknown[];
|
|
363
|
+
for (let i = 0; i < wavePlan.length; i++) {
|
|
364
|
+
if (!Array.isArray(wavePlan[i])) {
|
|
365
|
+
throw new StateFileError(
|
|
366
|
+
"STATE_SCHEMA_INVALID",
|
|
367
|
+
`wavePlan[${i}] is not an array`,
|
|
368
|
+
);
|
|
369
|
+
}
|
|
370
|
+
for (const taskId of wavePlan[i] as unknown[]) {
|
|
371
|
+
if (typeof taskId !== "string") {
|
|
372
|
+
throw new StateFileError(
|
|
373
|
+
"STATE_SCHEMA_INVALID",
|
|
374
|
+
`wavePlan[${i}] contains non-string value: ${typeof taskId}`,
|
|
375
|
+
);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// ── Validate task records ────────────────────────────────────
|
|
381
|
+
const tasks = obj.tasks as unknown[];
|
|
382
|
+
for (let i = 0; i < tasks.length; i++) {
|
|
383
|
+
const t = tasks[i] as Record<string, unknown>;
|
|
384
|
+
if (!t || typeof t !== "object") {
|
|
385
|
+
throw new StateFileError(
|
|
386
|
+
"STATE_SCHEMA_INVALID",
|
|
387
|
+
`tasks[${i}] is not an object`,
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
for (const field of ["taskId", "sessionName", "taskFolder", "exitReason"] as const) {
|
|
391
|
+
if (typeof t[field] !== "string") {
|
|
392
|
+
throw new StateFileError(
|
|
393
|
+
"STATE_SCHEMA_INVALID",
|
|
394
|
+
`tasks[${i}].${field} is missing or not a string`,
|
|
395
|
+
);
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
if (typeof t.laneNumber !== "number") {
|
|
399
|
+
throw new StateFileError(
|
|
400
|
+
"STATE_SCHEMA_INVALID",
|
|
401
|
+
`tasks[${i}].laneNumber is missing or not a number`,
|
|
402
|
+
);
|
|
403
|
+
}
|
|
404
|
+
if (typeof t.status !== "string" || !VALID_TASK_STATUSES.has(t.status)) {
|
|
405
|
+
throw new StateFileError(
|
|
406
|
+
"STATE_SCHEMA_INVALID",
|
|
407
|
+
`tasks[${i}].status is invalid: "${t.status}" (expected one of: ${[...VALID_TASK_STATUSES].join(", ")})`,
|
|
408
|
+
);
|
|
409
|
+
}
|
|
410
|
+
if (t.startedAt !== null && typeof t.startedAt !== "number") {
|
|
411
|
+
throw new StateFileError(
|
|
412
|
+
"STATE_SCHEMA_INVALID",
|
|
413
|
+
`tasks[${i}].startedAt is not a number or null`,
|
|
414
|
+
);
|
|
415
|
+
}
|
|
416
|
+
if (t.endedAt !== null && typeof t.endedAt !== "number") {
|
|
417
|
+
throw new StateFileError(
|
|
418
|
+
"STATE_SCHEMA_INVALID",
|
|
419
|
+
`tasks[${i}].endedAt is not a number or null`,
|
|
420
|
+
);
|
|
421
|
+
}
|
|
422
|
+
if (typeof t.doneFileFound !== "boolean") {
|
|
423
|
+
throw new StateFileError(
|
|
424
|
+
"STATE_SCHEMA_INVALID",
|
|
425
|
+
`tasks[${i}].doneFileFound is missing or not a boolean`,
|
|
426
|
+
);
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// ── Validate lane records ────────────────────────────────────
|
|
431
|
+
const lanes = obj.lanes as unknown[];
|
|
432
|
+
for (let i = 0; i < lanes.length; i++) {
|
|
433
|
+
const l = lanes[i] as Record<string, unknown>;
|
|
434
|
+
if (!l || typeof l !== "object") {
|
|
435
|
+
throw new StateFileError(
|
|
436
|
+
"STATE_SCHEMA_INVALID",
|
|
437
|
+
`lanes[${i}] is not an object`,
|
|
438
|
+
);
|
|
439
|
+
}
|
|
440
|
+
for (const field of ["laneId", "tmuxSessionName", "worktreePath", "branch"] as const) {
|
|
441
|
+
if (typeof l[field] !== "string") {
|
|
442
|
+
throw new StateFileError(
|
|
443
|
+
"STATE_SCHEMA_INVALID",
|
|
444
|
+
`lanes[${i}].${field} is missing or not a string`,
|
|
445
|
+
);
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
if (typeof l.laneNumber !== "number") {
|
|
449
|
+
throw new StateFileError(
|
|
450
|
+
"STATE_SCHEMA_INVALID",
|
|
451
|
+
`lanes[${i}].laneNumber is missing or not a number`,
|
|
452
|
+
);
|
|
453
|
+
}
|
|
454
|
+
if (!Array.isArray(l.taskIds)) {
|
|
455
|
+
throw new StateFileError(
|
|
456
|
+
"STATE_SCHEMA_INVALID",
|
|
457
|
+
`lanes[${i}].taskIds is missing or not an array`,
|
|
458
|
+
);
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
// ── Validate merge results ───────────────────────────────────
|
|
463
|
+
const mergeResults = obj.mergeResults as unknown[];
|
|
464
|
+
for (let i = 0; i < mergeResults.length; i++) {
|
|
465
|
+
const m = mergeResults[i] as Record<string, unknown>;
|
|
466
|
+
if (!m || typeof m !== "object") {
|
|
467
|
+
throw new StateFileError(
|
|
468
|
+
"STATE_SCHEMA_INVALID",
|
|
469
|
+
`mergeResults[${i}] is not an object`,
|
|
470
|
+
);
|
|
471
|
+
}
|
|
472
|
+
if (typeof m.waveIndex !== "number") {
|
|
473
|
+
throw new StateFileError(
|
|
474
|
+
"STATE_SCHEMA_INVALID",
|
|
475
|
+
`mergeResults[${i}].waveIndex is missing or not a number`,
|
|
476
|
+
);
|
|
477
|
+
}
|
|
478
|
+
if (typeof m.status !== "string" || !VALID_PERSISTED_MERGE_STATUSES.has(m.status)) {
|
|
479
|
+
throw new StateFileError(
|
|
480
|
+
"STATE_SCHEMA_INVALID",
|
|
481
|
+
`mergeResults[${i}].status is invalid: "${m.status}" (expected one of: ${[...VALID_PERSISTED_MERGE_STATUSES].join(", ")})`,
|
|
482
|
+
);
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
// ── Validate lastError ───────────────────────────────────────
|
|
487
|
+
if (obj.lastError !== null) {
|
|
488
|
+
if (typeof obj.lastError !== "object") {
|
|
489
|
+
throw new StateFileError(
|
|
490
|
+
"STATE_SCHEMA_INVALID",
|
|
491
|
+
`lastError is not an object or null`,
|
|
492
|
+
);
|
|
493
|
+
}
|
|
494
|
+
const le = obj.lastError as Record<string, unknown>;
|
|
495
|
+
if (typeof le.code !== "string" || typeof le.message !== "string") {
|
|
496
|
+
throw new StateFileError(
|
|
497
|
+
"STATE_SCHEMA_INVALID",
|
|
498
|
+
`lastError must have "code" (string) and "message" (string) fields`,
|
|
499
|
+
);
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
// ── Validate blockedTaskIds: array of strings ────────────────
|
|
504
|
+
for (const id of obj.blockedTaskIds as unknown[]) {
|
|
505
|
+
if (typeof id !== "string") {
|
|
506
|
+
throw new StateFileError(
|
|
507
|
+
"STATE_SCHEMA_INVALID",
|
|
508
|
+
`blockedTaskIds contains non-string value: ${typeof id}`,
|
|
509
|
+
);
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
// ── Validate errors: array of strings ────────────────────────
|
|
514
|
+
for (const err of obj.errors as unknown[]) {
|
|
515
|
+
if (typeof err !== "string") {
|
|
516
|
+
throw new StateFileError(
|
|
517
|
+
"STATE_SCHEMA_INVALID",
|
|
518
|
+
`errors array contains non-string value: ${typeof err}`,
|
|
519
|
+
);
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
return obj as unknown as PersistedBatchState;
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
// ── Serialization ────────────────────────────────────────────────────
|
|
527
|
+
|
|
528
|
+
/**
|
|
529
|
+
* Serialize runtime batch state to a PersistedBatchState JSON string.
|
|
530
|
+
*
|
|
531
|
+
* Pure function: extracts the serializable subset from OrchBatchRuntimeState
|
|
532
|
+
* and its associated wave results, enriches with schema version and timestamps.
|
|
533
|
+
*
|
|
534
|
+
* @param state - Current runtime batch state
|
|
535
|
+
* @param wavePlan - Wave plan (array of arrays of task IDs)
|
|
536
|
+
* @param lanes - Currently allocated lanes (latest wave's lanes)
|
|
537
|
+
* @param allTaskOutcomes - All task outcomes across completed waves + current
|
|
538
|
+
* @returns JSON string (pretty-printed for debuggability)
|
|
539
|
+
*/
|
|
540
|
+
export function serializeBatchState(
|
|
541
|
+
state: OrchBatchRuntimeState,
|
|
542
|
+
wavePlan: string[][],
|
|
543
|
+
lanes: AllocatedLane[],
|
|
544
|
+
allTaskOutcomes: LaneTaskOutcome[],
|
|
545
|
+
): string {
|
|
546
|
+
const now = Date.now();
|
|
547
|
+
|
|
548
|
+
// Build lookup maps for fast per-task enrichment.
|
|
549
|
+
const laneByTaskId = new Map<string, AllocatedLane>();
|
|
550
|
+
for (const lane of lanes) {
|
|
551
|
+
for (const task of lane.tasks) {
|
|
552
|
+
laneByTaskId.set(task.taskId, lane);
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Latest outcome wins (allTaskOutcomes is append/replace ordered by time).
|
|
557
|
+
const outcomeByTaskId = new Map<string, LaneTaskOutcome>();
|
|
558
|
+
for (const outcome of allTaskOutcomes) {
|
|
559
|
+
outcomeByTaskId.set(outcome.taskId, outcome);
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
// Build full task registry from wave plan + any outcomes seen so far.
|
|
563
|
+
const taskIdSet = new Set<string>();
|
|
564
|
+
for (const wave of wavePlan) {
|
|
565
|
+
for (const taskId of wave) taskIdSet.add(taskId);
|
|
566
|
+
}
|
|
567
|
+
for (const outcome of allTaskOutcomes) {
|
|
568
|
+
taskIdSet.add(outcome.taskId);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
const taskRecords: PersistedTaskRecord[] = [...taskIdSet]
|
|
572
|
+
.sort()
|
|
573
|
+
.map((taskId) => {
|
|
574
|
+
const lane = laneByTaskId.get(taskId);
|
|
575
|
+
const outcome = outcomeByTaskId.get(taskId);
|
|
576
|
+
|
|
577
|
+
return {
|
|
578
|
+
taskId,
|
|
579
|
+
laneNumber: lane?.laneNumber ?? 0,
|
|
580
|
+
sessionName: outcome?.sessionName || lane?.tmuxSessionName || "",
|
|
581
|
+
status: outcome?.status ?? "pending",
|
|
582
|
+
taskFolder: "", // Enriched by caller from discovery
|
|
583
|
+
startedAt: outcome?.startTime ?? null,
|
|
584
|
+
endedAt: outcome?.endTime ?? null,
|
|
585
|
+
doneFileFound: outcome?.doneFileFound ?? false,
|
|
586
|
+
exitReason: outcome?.exitReason ?? "",
|
|
587
|
+
};
|
|
588
|
+
});
|
|
589
|
+
|
|
590
|
+
// Build lane records
|
|
591
|
+
const laneRecords: PersistedLaneRecord[] = lanes.map((lane) => ({
|
|
592
|
+
laneNumber: lane.laneNumber,
|
|
593
|
+
laneId: lane.laneId,
|
|
594
|
+
tmuxSessionName: lane.tmuxSessionName,
|
|
595
|
+
worktreePath: lane.worktreePath,
|
|
596
|
+
branch: lane.branch,
|
|
597
|
+
taskIds: lane.tasks.map((t) => t.taskId),
|
|
598
|
+
}));
|
|
599
|
+
|
|
600
|
+
// Build merge results from actual merge outcomes (accumulated on batchState).
|
|
601
|
+
// MergeWaveResult.waveIndex is 1-based (from merge module); normalize to
|
|
602
|
+
// 0-based for PersistedMergeResult (dashboard renders as "Wave N+1").
|
|
603
|
+
const mergeResults: PersistedMergeResult[] = (state.mergeResults || [])
|
|
604
|
+
.map((mr) => ({
|
|
605
|
+
waveIndex: mr.waveIndex - 1,
|
|
606
|
+
status: mr.status,
|
|
607
|
+
failedLane: mr.failedLane,
|
|
608
|
+
failureReason: mr.failureReason,
|
|
609
|
+
}));
|
|
610
|
+
|
|
611
|
+
const persisted: PersistedBatchState = {
|
|
612
|
+
schemaVersion: BATCH_STATE_SCHEMA_VERSION,
|
|
613
|
+
phase: state.phase,
|
|
614
|
+
batchId: state.batchId,
|
|
615
|
+
startedAt: state.startedAt,
|
|
616
|
+
updatedAt: now,
|
|
617
|
+
endedAt: state.endedAt,
|
|
618
|
+
currentWaveIndex: state.currentWaveIndex,
|
|
619
|
+
totalWaves: state.totalWaves,
|
|
620
|
+
wavePlan,
|
|
621
|
+
lanes: laneRecords,
|
|
622
|
+
tasks: taskRecords,
|
|
623
|
+
mergeResults,
|
|
624
|
+
totalTasks: state.totalTasks,
|
|
625
|
+
succeededTasks: state.succeededTasks,
|
|
626
|
+
failedTasks: state.failedTasks,
|
|
627
|
+
skippedTasks: state.skippedTasks,
|
|
628
|
+
blockedTasks: state.blockedTasks,
|
|
629
|
+
blockedTaskIds: [...state.blockedTaskIds],
|
|
630
|
+
lastError: state.errors.length > 0
|
|
631
|
+
? { code: "BATCH_ERROR", message: state.errors[state.errors.length - 1] }
|
|
632
|
+
: null,
|
|
633
|
+
errors: [...state.errors],
|
|
634
|
+
};
|
|
635
|
+
|
|
636
|
+
return JSON.stringify(persisted, null, 2);
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
// ── File Operations ──────────────────────────────────────────────────

/**
 * Maximum retries for the atomic rename in saveBatchState. On Windows the
 * rename can fail transiently when another process holds a handle on the
 * target file.
 */
export const STATE_WRITE_MAX_RETRIES = 3;

/** Delay between write retries (ms). */
export const STATE_WRITE_RETRY_DELAY_MS = 500;
|
|
646
|
+
|
|
647
|
+
/**
|
|
648
|
+
* Save batch state to `.pi/batch-state.json` with atomic write.
|
|
649
|
+
*
|
|
650
|
+
* Strategy: write to a temp file (`.pi/batch-state.json.tmp`), then
|
|
651
|
+
* rename to the final path. This prevents partial writes from corrupting
|
|
652
|
+
* the state file.
|
|
653
|
+
*
|
|
654
|
+
* On Windows, rename can fail if another process holds a handle on the
|
|
655
|
+
* target file. We retry up to STATE_WRITE_MAX_RETRIES times with a
|
|
656
|
+
* short delay.
|
|
657
|
+
*
|
|
658
|
+
* @param json - JSON string to write (from serializeBatchState)
|
|
659
|
+
* @param repoRoot - Absolute path to the repository root
|
|
660
|
+
* @throws StateFileError with STATE_FILE_IO_ERROR on failure
|
|
661
|
+
*/
|
|
662
|
+
export function saveBatchState(json: string, repoRoot: string): void {
|
|
663
|
+
const finalPath = batchStatePath(repoRoot);
|
|
664
|
+
const tmpPath = `${finalPath}.tmp`;
|
|
665
|
+
const dir = dirname(finalPath);
|
|
666
|
+
|
|
667
|
+
// Ensure .pi directory exists
|
|
668
|
+
if (!existsSync(dir)) {
|
|
669
|
+
try {
|
|
670
|
+
mkdirSync(dir, { recursive: true });
|
|
671
|
+
} catch (err: unknown) {
|
|
672
|
+
throw new StateFileError(
|
|
673
|
+
"STATE_FILE_IO_ERROR",
|
|
674
|
+
`Failed to create directory "${dir}": ${(err as Error).message}`,
|
|
675
|
+
);
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
// Write to temp file
|
|
680
|
+
try {
|
|
681
|
+
writeFileSync(tmpPath, json, "utf-8");
|
|
682
|
+
} catch (err: unknown) {
|
|
683
|
+
throw new StateFileError(
|
|
684
|
+
"STATE_FILE_IO_ERROR",
|
|
685
|
+
`Failed to write temp state file "${tmpPath}": ${(err as Error).message}`,
|
|
686
|
+
);
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
// Atomic rename with retry for Windows file locking
|
|
690
|
+
let lastError: Error | null = null;
|
|
691
|
+
for (let attempt = 1; attempt <= STATE_WRITE_MAX_RETRIES; attempt++) {
|
|
692
|
+
try {
|
|
693
|
+
renameSync(tmpPath, finalPath);
|
|
694
|
+
return; // Success
|
|
695
|
+
} catch (err: unknown) {
|
|
696
|
+
lastError = err as Error;
|
|
697
|
+
if (attempt < STATE_WRITE_MAX_RETRIES) {
|
|
698
|
+
sleepSync(STATE_WRITE_RETRY_DELAY_MS);
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
// All retries exhausted — clean up temp file if possible
|
|
704
|
+
try { unlinkSync(tmpPath); } catch { /* ignore cleanup errors */ }
|
|
705
|
+
|
|
706
|
+
throw new StateFileError(
|
|
707
|
+
"STATE_FILE_IO_ERROR",
|
|
708
|
+
`Failed to atomically save state file "${finalPath}" after ` +
|
|
709
|
+
`${STATE_WRITE_MAX_RETRIES} attempts: ${lastError?.message ?? "unknown error"}`,
|
|
710
|
+
);
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
/**
|
|
714
|
+
* Load and validate batch state from `.pi/batch-state.json`.
|
|
715
|
+
*
|
|
716
|
+
* @param repoRoot - Absolute path to the repository root
|
|
717
|
+
* @returns Validated PersistedBatchState, or null if file doesn't exist
|
|
718
|
+
* @throws StateFileError with STATE_FILE_PARSE_ERROR if file contains invalid JSON
|
|
719
|
+
* @throws StateFileError with STATE_SCHEMA_INVALID if JSON fails validation
|
|
720
|
+
*/
|
|
721
|
+
export function loadBatchState(repoRoot: string): PersistedBatchState | null {
|
|
722
|
+
const filePath = batchStatePath(repoRoot);
|
|
723
|
+
|
|
724
|
+
if (!existsSync(filePath)) {
|
|
725
|
+
return null;
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
let raw: string;
|
|
729
|
+
try {
|
|
730
|
+
raw = readFileSync(filePath, "utf-8");
|
|
731
|
+
} catch (err: unknown) {
|
|
732
|
+
throw new StateFileError(
|
|
733
|
+
"STATE_FILE_IO_ERROR",
|
|
734
|
+
`Failed to read state file "${filePath}": ${(err as Error).message}`,
|
|
735
|
+
);
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
let parsed: unknown;
|
|
739
|
+
try {
|
|
740
|
+
parsed = JSON.parse(raw);
|
|
741
|
+
} catch (err: unknown) {
|
|
742
|
+
throw new StateFileError(
|
|
743
|
+
"STATE_FILE_PARSE_ERROR",
|
|
744
|
+
`State file "${filePath}" contains invalid JSON: ${(err as Error).message}`,
|
|
745
|
+
);
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
return validatePersistedState(parsed);
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
/**
|
|
752
|
+
* Delete the batch state file. Idempotent: no error if file doesn't exist.
|
|
753
|
+
*
|
|
754
|
+
* @param repoRoot - Absolute path to the repository root
|
|
755
|
+
* @throws StateFileError with STATE_FILE_IO_ERROR on unexpected deletion failure
|
|
756
|
+
*/
|
|
757
|
+
export function deleteBatchState(repoRoot: string): void {
|
|
758
|
+
const filePath = batchStatePath(repoRoot);
|
|
759
|
+
|
|
760
|
+
if (!existsSync(filePath)) {
|
|
761
|
+
return; // Already gone — idempotent
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
try {
|
|
765
|
+
unlinkSync(filePath);
|
|
766
|
+
} catch (err: unknown) {
|
|
767
|
+
// If file was deleted between our check and unlink, that's fine
|
|
768
|
+
if (!existsSync(filePath)) return;
|
|
769
|
+
throw new StateFileError(
|
|
770
|
+
"STATE_FILE_IO_ERROR",
|
|
771
|
+
`Failed to delete state file "${filePath}": ${(err as Error).message}`,
|
|
772
|
+
);
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
// ── Orphan Detection (TS-009 Step 3) ─────────────────────────────────

/**
 * Status of the persisted batch state file.
 *
 * - "valid" — File exists, parsed, and validated successfully
 * - "missing" — File does not exist (normal for fresh start)
 * - "invalid" — File exists but has parse or schema errors
 * - "io-error" — File could not be read due to I/O error
 */
export type OrphanStateStatus = "valid" | "missing" | "invalid" | "io-error";

/**
 * Recommended action based on orphan detection analysis.
 *
 * - "resume" — Orphan sessions + valid state, or no orphans + valid state with incomplete tasks: suggest /orch-resume
 * - "abort-orphans" — Orphan sessions without usable state: suggest /orch-abort
 * - "cleanup-stale" — No orphans + stale/invalid state file: auto-delete and start fresh
 * - "start-fresh" — No orphans, no state file: proceed normally
 */
export type OrphanRecommendedAction = "resume" | "abort-orphans" | "cleanup-stale" | "start-fresh";

/**
 * Result of orphan detection analysis.
 *
 * Machine-usable fields enable both automated handling and user notification.
 * The `userMessage` provides a human-readable summary for display.
 */
export interface OrphanDetectionResult {
  /** TMUX sessions matching the orchestrator prefix that were found alive */
  orphanSessions: string[];
  /** Status of the persisted batch state file */
  stateStatus: OrphanStateStatus;
  /** Loaded and validated batch state (null if missing, invalid, or io-error) */
  loadedState: PersistedBatchState | null;
  /** Error message if state loading failed (null otherwise) */
  stateError: string | null;
  /** Deterministic recommended action */
  recommendedAction: OrphanRecommendedAction;
  /** Human-readable message for user notification (empty string when no message is needed, e.g. a clean "start-fresh") */
  userMessage: string;
}
|
|
819
|
+
|
|
820
|
+
/**
|
|
821
|
+
* Parse TMUX `list-sessions -F "#{session_name}"` output.
|
|
822
|
+
*
|
|
823
|
+
* Filters session names by the given prefix (e.g., "orch" matches "orch-lane-1").
|
|
824
|
+
* Handles empty output, blank lines, and whitespace-padded names gracefully.
|
|
825
|
+
*
|
|
826
|
+
* Pure function — no process or filesystem access.
|
|
827
|
+
*
|
|
828
|
+
* @param stdout - Raw stdout from `tmux list-sessions -F "#{session_name}"`
|
|
829
|
+
* @param prefix - Session name prefix to filter by (e.g., "orch")
|
|
830
|
+
* @returns Sorted array of matching session names
|
|
831
|
+
*/
|
|
832
|
+
export function parseOrchSessionNames(stdout: string, prefix: string): string[] {
|
|
833
|
+
if (!stdout || !stdout.trim()) return [];
|
|
834
|
+
|
|
835
|
+
const filterPrefix = `${prefix}-`;
|
|
836
|
+
|
|
837
|
+
return stdout
|
|
838
|
+
.split("\n")
|
|
839
|
+
.map(line => line.trim())
|
|
840
|
+
.filter(name => name.length > 0 && name.startsWith(filterPrefix))
|
|
841
|
+
.sort();
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
/**
 * Analyze orchestrator startup state — pure deterministic decision logic.
 *
 * Given the current state of TMUX sessions, batch state file, and task
 * completion markers, returns a deterministic recommendation for what
 * the `/orch` command should do.
 *
 * Decision matrix:
 * | Orphans? | State Status | Done? | Action |
 * |----------|-------------|-------|-----------------|
 * | Yes | valid | — | resume |
 * | Yes | missing | — | abort-orphans |
 * | Yes | invalid | — | abort-orphans |
 * | Yes | io-error | — | abort-orphans |
 * | No | valid | all | cleanup-stale |
 * | No | valid | !all | resume (resumable phase) or cleanup-stale (non-resumable phase) |
 * | No | missing | — | start-fresh |
 * | No | invalid | — | cleanup-stale |
 * | No | io-error | — | cleanup-stale |
 *
 * "Resumable phase" means one of "paused" / "executing" / "merging"; other
 * phases fall back to cleanup-stale (see resumablePhases below).
 *
 * Pure function — no process or filesystem access.
 *
 * @param orphanSessions - TMUX sessions matching the orch prefix
 * @param stateStatus - Status of the batch state file
 * @param loadedState - Validated batch state (null if unavailable)
 * @param stateError - Error message from state loading (null if no error)
 * @param doneTaskIds - Set of task IDs whose .DONE files were found
 * @returns OrphanDetectionResult with recommended action
 */
export function analyzeOrchestratorStartupState(
  orphanSessions: string[],
  stateStatus: OrphanStateStatus,
  loadedState: PersistedBatchState | null,
  stateError: string | null,
  doneTaskIds: ReadonlySet<string>,
): OrphanDetectionResult {
  const hasOrphans = orphanSessions.length > 0;
  const sessionList = orphanSessions.join(", ");

  // ── Orphan sessions exist ────────────────────────────────────
  if (hasOrphans) {
    // Live sessions plus a valid state file: the batch can be picked up.
    if (stateStatus === "valid" && loadedState) {
      return {
        orphanSessions,
        stateStatus,
        loadedState,
        stateError,
        recommendedAction: "resume",
        userMessage:
          `🔄 Found ${orphanSessions.length} running orchestrator session(s): ${sessionList}\n` +
          ` Batch ${loadedState.batchId} (${loadedState.phase}) has persisted state.\n` +
          ` Use /orch-resume to continue, or /orch-abort to clean up.`,
      };
    }

    // Orphans without usable state (missing, invalid, or io-error)
    const errorCtx = stateError ? `\n State error: ${stateError}` : "";
    return {
      orphanSessions,
      stateStatus,
      // loadedState is forced to null here: even if loading partially
      // succeeded, the status above says it is not usable.
      loadedState: null,
      stateError,
      recommendedAction: "abort-orphans",
      userMessage:
        `⚠️ Found ${orphanSessions.length} orphan orchestrator session(s): ${sessionList}\n` +
        ` No usable batch state file (status: ${stateStatus}).${errorCtx}\n` +
        ` Use /orch-abort to clean up before starting a new batch.`,
    };
  }

  // ── No orphan sessions ───────────────────────────────────────

  if (stateStatus === "missing") {
    return {
      orphanSessions: [],
      stateStatus,
      loadedState: null,
      stateError,
      recommendedAction: "start-fresh",
      userMessage: "", // No message needed for clean start
    };
  }

  if (stateStatus === "valid" && loadedState) {
    // Check if all tasks completed (all have .DONE files)
    const allTaskIds = loadedState.tasks.map(t => t.taskId);
    const allDone = allTaskIds.length > 0 && allTaskIds.every(id => doneTaskIds.has(id));

    if (allDone) {
      return {
        orphanSessions: [],
        stateStatus,
        loadedState,
        stateError,
        recommendedAction: "cleanup-stale",
        userMessage:
          `🧹 Found stale batch state file from batch ${loadedState.batchId}.\n` +
          ` All ${allTaskIds.length} task(s) have .DONE files. Cleaning up state file.`,
      };
    }

    // Not all tasks done — batch was interrupted (crashed orchestrator)
    const completedCount = allTaskIds.filter(id => doneTaskIds.has(id)).length;

    // Only phases that resumeOrchBatch can actually handle should get "resume".
    // "failed" / "stopped" / "idle" / "planning" are non-resumable — if nothing
    // ran yet (completedCount === 0) the state file is pure noise; auto-clean it
    // so /orch can start fresh without forcing the user through /orch-abort first.
    const resumablePhases: OrchBatchPhase[] = ["paused", "executing", "merging"];
    const isResumable = resumablePhases.includes(loadedState.phase as OrchBatchPhase);

    if (!isResumable && completedCount === 0) {
      return {
        orphanSessions: [],
        stateStatus,
        loadedState,
        stateError,
        recommendedAction: "cleanup-stale",
        userMessage:
          `🧹 Found non-resumable batch state (${loadedState.batchId}, phase=${loadedState.phase}, 0 tasks ran).\n` +
          ` Cleaning up stale state file so a fresh batch can start.`,
      };
    }

    // Resumable phase → suggest /orch-resume; non-resumable with partial
    // progress → clean up (the completed work is recorded via .DONE files).
    return {
      orphanSessions: [],
      stateStatus,
      loadedState,
      stateError,
      recommendedAction: isResumable ? "resume" : "cleanup-stale",
      userMessage: isResumable
        ? `🔄 Found interrupted batch ${loadedState.batchId} (${loadedState.phase}).\n` +
          ` ${completedCount}/${allTaskIds.length} task(s) completed.\n` +
          ` Use /orch-resume to continue, or /orch-abort to clean up.`
        : `🧹 Found non-resumable batch state (${loadedState.batchId}, phase=${loadedState.phase}).\n` +
          ` ${completedCount}/${allTaskIds.length} task(s) completed. Cleaning up state file.`,
    };
  }

  // Invalid or io-error state with no orphans — safe to clean up
  return {
    orphanSessions: [],
    stateStatus,
    loadedState: null,
    stateError,
    recommendedAction: "cleanup-stale",
    userMessage:
      `🧹 Found unusable batch state file (${stateStatus}).\n` +
      (stateError ? ` Error: ${stateError}\n` : "") +
      ` Cleaning up state file before starting fresh.`,
  };
}
|
|
996
|
+
|
|
997
|
+
/**
|
|
998
|
+
* Detect orphan TMUX sessions and analyze startup state.
|
|
999
|
+
*
|
|
1000
|
+
* Combines session discovery (via tmux), state file loading (with typed
|
|
1001
|
+
* error handling), and .DONE file checking into a single result.
|
|
1002
|
+
*
|
|
1003
|
+
* Non-blocking: detection failures (e.g., tmux not running) are handled
|
|
1004
|
+
* gracefully and do NOT crash `/orch` startup.
|
|
1005
|
+
*
|
|
1006
|
+
* @param prefix - TMUX session prefix to search for (e.g., "orch")
|
|
1007
|
+
* @param repoRoot - Absolute path to the repository root
|
|
1008
|
+
* @returns OrphanDetectionResult with recommended action
|
|
1009
|
+
*/
|
|
1010
|
+
export function detectOrphanSessions(prefix: string, repoRoot: string): OrphanDetectionResult {
|
|
1011
|
+
// ── 1. Discover TMUX sessions ────────────────────────────────
|
|
1012
|
+
let orphanSessions: string[] = [];
|
|
1013
|
+
try {
|
|
1014
|
+
const stdout = execSync('tmux list-sessions -F "#{session_name}"', {
|
|
1015
|
+
encoding: "utf-8",
|
|
1016
|
+
timeout: 5000,
|
|
1017
|
+
});
|
|
1018
|
+
orphanSessions = parseOrchSessionNames(stdout, prefix);
|
|
1019
|
+
} catch {
|
|
1020
|
+
// tmux not available or no sessions — proceed with empty orphan list
|
|
1021
|
+
}
|
|
1022
|
+
|
|
1023
|
+
// ── 2. Load batch state file ─────────────────────────────────
|
|
1024
|
+
let stateStatus: OrphanStateStatus = "missing";
|
|
1025
|
+
let loadedState: PersistedBatchState | null = null;
|
|
1026
|
+
let stateError: string | null = null;
|
|
1027
|
+
|
|
1028
|
+
try {
|
|
1029
|
+
loadedState = loadBatchState(repoRoot);
|
|
1030
|
+
stateStatus = loadedState ? "valid" : "missing";
|
|
1031
|
+
} catch (err: unknown) {
|
|
1032
|
+
if (err instanceof StateFileError) {
|
|
1033
|
+
switch (err.code) {
|
|
1034
|
+
case "STATE_FILE_PARSE_ERROR":
|
|
1035
|
+
case "STATE_SCHEMA_INVALID":
|
|
1036
|
+
stateStatus = "invalid";
|
|
1037
|
+
stateError = `[${err.code}] ${err.message}`;
|
|
1038
|
+
break;
|
|
1039
|
+
case "STATE_FILE_IO_ERROR":
|
|
1040
|
+
stateStatus = "io-error";
|
|
1041
|
+
stateError = `[${err.code}] ${err.message}`;
|
|
1042
|
+
break;
|
|
1043
|
+
}
|
|
1044
|
+
} else {
|
|
1045
|
+
stateStatus = "io-error";
|
|
1046
|
+
stateError = err instanceof Error ? err.message : String(err);
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
// ── 3. Check .DONE files for stale state detection ───────────
|
|
1051
|
+
const doneTaskIds = new Set<string>();
|
|
1052
|
+
if (loadedState && orphanSessions.length === 0) {
|
|
1053
|
+
// Only check .DONE files when we have state but no orphans
|
|
1054
|
+
// (stale state scenario — sessions finished while orchestrator was disconnected)
|
|
1055
|
+
for (const task of loadedState.tasks) {
|
|
1056
|
+
if (task.taskFolder && hasTaskDoneMarker(task.taskFolder)) {
|
|
1057
|
+
doneTaskIds.add(task.taskId);
|
|
1058
|
+
}
|
|
1059
|
+
}
|
|
1060
|
+
}
|
|
1061
|
+
|
|
1062
|
+
// ── 4. Analyze and return ────────────────────────────────────
|
|
1063
|
+
return analyzeOrchestratorStartupState(
|
|
1064
|
+
orphanSessions,
|
|
1065
|
+
stateStatus,
|
|
1066
|
+
loadedState,
|
|
1067
|
+
stateError,
|
|
1068
|
+
doneTaskIds,
|
|
1069
|
+
);
|
|
1070
|
+
}
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
// ── Batch History ────────────────────────────────────────────────────
|
|
1074
|
+
|
|
1075
|
+
/** Path to the batch history file. */
|
|
1076
|
+
function batchHistoryPath(repoRoot: string): string {
|
|
1077
|
+
return join(repoRoot, ".pi", "batch-history.json");
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
/**
|
|
1081
|
+
* Load existing batch history entries from disk.
|
|
1082
|
+
* Returns empty array if file doesn't exist or is invalid.
|
|
1083
|
+
*/
|
|
1084
|
+
export function loadBatchHistory(repoRoot: string): BatchHistorySummary[] {
|
|
1085
|
+
const filePath = batchHistoryPath(repoRoot);
|
|
1086
|
+
try {
|
|
1087
|
+
if (!existsSync(filePath)) return [];
|
|
1088
|
+
const raw = readFileSync(filePath, "utf-8");
|
|
1089
|
+
const data = JSON.parse(raw);
|
|
1090
|
+
if (!Array.isArray(data)) return [];
|
|
1091
|
+
return data;
|
|
1092
|
+
} catch {
|
|
1093
|
+
return [];
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1097
|
+
/**
|
|
1098
|
+
* Append a batch summary to history and trim to max entries.
|
|
1099
|
+
* Writes atomically via tmp+rename pattern.
|
|
1100
|
+
*/
|
|
1101
|
+
export function saveBatchHistory(repoRoot: string, summary: BatchHistorySummary): void {
|
|
1102
|
+
const filePath = batchHistoryPath(repoRoot);
|
|
1103
|
+
try {
|
|
1104
|
+
const history = loadBatchHistory(repoRoot);
|
|
1105
|
+
// Prepend newest first
|
|
1106
|
+
history.unshift(summary);
|
|
1107
|
+
// Trim to max
|
|
1108
|
+
if (history.length > BATCH_HISTORY_MAX_ENTRIES) {
|
|
1109
|
+
history.length = BATCH_HISTORY_MAX_ENTRIES;
|
|
1110
|
+
}
|
|
1111
|
+
const dir = dirname(filePath);
|
|
1112
|
+
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
|
1113
|
+
const tmpPath = filePath + ".tmp";
|
|
1114
|
+
writeFileSync(tmpPath, JSON.stringify(history, null, 2));
|
|
1115
|
+
renameSync(tmpPath, filePath);
|
|
1116
|
+
execLog("batch", "history", `saved batch summary (${history.length} entries)`);
|
|
1117
|
+
} catch (err) {
|
|
1118
|
+
execLog("batch", "history", `failed to save batch history: ${err}`);
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
|