@pi-agents/orchid 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/LICENSE +21 -0
  3. package/README.md +246 -0
  4. package/agents/AGENTS-MANIFEST.md +42 -0
  5. package/agents/brain.md +42 -0
  6. package/agents/context-builder.md +46 -0
  7. package/agents/delegate.md +12 -0
  8. package/agents/dev-1.md +42 -0
  9. package/agents/oracle.md +73 -0
  10. package/agents/planner.md +55 -0
  11. package/agents/researcher.md +52 -0
  12. package/agents/reviewer.md +79 -0
  13. package/agents/scout.md +50 -0
  14. package/agents/tester.md +45 -0
  15. package/agents/worker.md +55 -0
  16. package/extensions/ralph.ts +1 -0
  17. package/extensions/reviewer-extension.ts +125 -0
  18. package/extensions/task-orchestrator.ts +28 -0
  19. package/package.json +63 -0
  20. package/prompts/gather-context-and-clarify.md +13 -0
  21. package/prompts/parallel-cleanup.md +59 -0
  22. package/prompts/parallel-context-build.md +53 -0
  23. package/prompts/parallel-handoff-plan.md +59 -0
  24. package/prompts/parallel-research.md +50 -0
  25. package/prompts/parallel-review.md +54 -0
  26. package/prompts/review-loop.md +41 -0
  27. package/skills/orchid/SKILL.md +214 -0
  28. package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
  29. package/skills/orchid/orchid-converge/SKILL.md +124 -0
  30. package/skills/orchid/orchid-decompose/SKILL.md +201 -0
  31. package/skills/orchid/orchid-doctor/SKILL.md +162 -0
  32. package/skills/orchid/orchid-investigate/SKILL.md +102 -0
  33. package/skills/orchid/orchid-launch/SKILL.md +147 -0
  34. package/skills/ralph/SKILL.md +73 -0
  35. package/skills/subagents/pi-subagents/SKILL.md +813 -0
  36. package/src/index.ts +7 -0
  37. package/src/orchestrator/abort.ts +534 -0
  38. package/src/orchestrator/agent-bridge-extension.ts +1020 -0
  39. package/src/orchestrator/agent-host.ts +954 -0
  40. package/src/orchestrator/cleanup.ts +776 -0
  41. package/src/orchestrator/config-loader.ts +1412 -0
  42. package/src/orchestrator/config-schema.ts +690 -0
  43. package/src/orchestrator/config.ts +81 -0
  44. package/src/orchestrator/context-window.ts +66 -0
  45. package/src/orchestrator/diagnostic-reports.ts +475 -0
  46. package/src/orchestrator/diagnostics.ts +394 -0
  47. package/src/orchestrator/discovery.ts +1833 -0
  48. package/src/orchestrator/engine-worker.ts +415 -0
  49. package/src/orchestrator/engine.ts +5940 -0
  50. package/src/orchestrator/execution.ts +3104 -0
  51. package/src/orchestrator/extension.ts +5934 -0
  52. package/src/orchestrator/formatting.ts +785 -0
  53. package/src/orchestrator/git.ts +88 -0
  54. package/src/orchestrator/index.ts +28 -0
  55. package/src/orchestrator/lane-runner.ts +1787 -0
  56. package/src/orchestrator/mailbox.ts +780 -0
  57. package/src/orchestrator/merge.ts +3414 -0
  58. package/src/orchestrator/messages.ts +1062 -0
  59. package/src/orchestrator/migrations.ts +278 -0
  60. package/src/orchestrator/naming.ts +117 -0
  61. package/src/orchestrator/path-resolver.ts +275 -0
  62. package/src/orchestrator/persistence.ts +2625 -0
  63. package/src/orchestrator/process-registry.ts +452 -0
  64. package/src/orchestrator/quality-gate.ts +1085 -0
  65. package/src/orchestrator/resume.ts +3488 -0
  66. package/src/orchestrator/sessions.ts +57 -0
  67. package/src/orchestrator/settings-loader.ts +136 -0
  68. package/src/orchestrator/settings-tui.ts +2208 -0
  69. package/src/orchestrator/sidecar-telemetry.ts +267 -0
  70. package/src/orchestrator/supervisor.ts +4548 -0
  71. package/src/orchestrator/task-executor-core.ts +675 -0
  72. package/src/orchestrator/tmux-compat.ts +37 -0
  73. package/src/orchestrator/tool-allowlist-constants.ts +37 -0
  74. package/src/orchestrator/types.ts +4465 -0
  75. package/src/orchestrator/verification.ts +547 -0
  76. package/src/orchestrator/waves.ts +1564 -0
  77. package/src/orchestrator/workspace.ts +707 -0
  78. package/src/orchestrator/worktree.ts +2725 -0
  79. package/src/ralph/index.ts +825 -0
  80. package/src/subagents/agents/agent-management.ts +648 -0
  81. package/src/subagents/agents/agent-scope.ts +6 -0
  82. package/src/subagents/agents/agent-selection.ts +23 -0
  83. package/src/subagents/agents/agent-serializer.ts +86 -0
  84. package/src/subagents/agents/agents.ts +832 -0
  85. package/src/subagents/agents/chain-serializer.ts +137 -0
  86. package/src/subagents/agents/frontmatter.ts +29 -0
  87. package/src/subagents/agents/identity.ts +30 -0
  88. package/src/subagents/agents/skills.ts +632 -0
  89. package/src/subagents/extension/config.ts +16 -0
  90. package/src/subagents/extension/control-notices.ts +92 -0
  91. package/src/subagents/extension/doctor.ts +199 -0
  92. package/src/subagents/extension/fanout-child.ts +170 -0
  93. package/src/subagents/extension/index.ts +573 -0
  94. package/src/subagents/extension/schemas.ts +168 -0
  95. package/src/subagents/intercom/intercom-bridge.ts +379 -0
  96. package/src/subagents/intercom/result-intercom.ts +377 -0
  97. package/src/subagents/runs/background/async-execution.ts +712 -0
  98. package/src/subagents/runs/background/async-job-tracker.ts +310 -0
  99. package/src/subagents/runs/background/async-resume.ts +345 -0
  100. package/src/subagents/runs/background/async-status.ts +325 -0
  101. package/src/subagents/runs/background/completion-dedupe.ts +63 -0
  102. package/src/subagents/runs/background/notify.ts +108 -0
  103. package/src/subagents/runs/background/parallel-groups.ts +45 -0
  104. package/src/subagents/runs/background/result-watcher.ts +307 -0
  105. package/src/subagents/runs/background/run-id-resolver.ts +83 -0
  106. package/src/subagents/runs/background/run-status.ts +269 -0
  107. package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
  108. package/src/subagents/runs/background/subagent-runner.ts +1808 -0
  109. package/src/subagents/runs/background/top-level-async.ts +13 -0
  110. package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
  111. package/src/subagents/runs/foreground/chain-execution.ts +938 -0
  112. package/src/subagents/runs/foreground/execution.ts +918 -0
  113. package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
  114. package/src/subagents/runs/shared/completion-guard.ts +147 -0
  115. package/src/subagents/runs/shared/long-running-guard.ts +175 -0
  116. package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
  117. package/src/subagents/runs/shared/model-fallback.ts +103 -0
  118. package/src/subagents/runs/shared/nested-events.ts +819 -0
  119. package/src/subagents/runs/shared/nested-path.ts +52 -0
  120. package/src/subagents/runs/shared/nested-render.ts +115 -0
  121. package/src/subagents/runs/shared/parallel-utils.ts +109 -0
  122. package/src/subagents/runs/shared/pi-args.ts +220 -0
  123. package/src/subagents/runs/shared/pi-spawn.ts +115 -0
  124. package/src/subagents/runs/shared/run-history.ts +60 -0
  125. package/src/subagents/runs/shared/single-output.ts +164 -0
  126. package/src/subagents/runs/shared/subagent-control.ts +226 -0
  127. package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
  128. package/src/subagents/runs/shared/worktree.ts +577 -0
  129. package/src/subagents/shared/artifacts.ts +98 -0
  130. package/src/subagents/shared/atomic-json.ts +16 -0
  131. package/src/subagents/shared/file-coalescer.ts +40 -0
  132. package/src/subagents/shared/fork-context.ts +76 -0
  133. package/src/subagents/shared/formatters.ts +133 -0
  134. package/src/subagents/shared/jsonl-writer.ts +81 -0
  135. package/src/subagents/shared/model-info.ts +78 -0
  136. package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
  137. package/src/subagents/shared/session-identity.ts +10 -0
  138. package/src/subagents/shared/session-tokens.ts +44 -0
  139. package/src/subagents/shared/settings.ts +397 -0
  140. package/src/subagents/shared/status-format.ts +49 -0
  141. package/src/subagents/shared/types.ts +822 -0
  142. package/src/subagents/shared/utils.ts +450 -0
  143. package/src/subagents/slash/prompt-template-bridge.ts +397 -0
  144. package/src/subagents/slash/slash-bridge.ts +174 -0
  145. package/src/subagents/slash/slash-commands.ts +528 -0
  146. package/src/subagents/slash/slash-live-state.ts +292 -0
  147. package/src/subagents/tui/render-helpers.ts +80 -0
  148. package/src/subagents/tui/render.ts +1358 -0
  149. package/templates/agents/local/supervisor.md +33 -0
  150. package/templates/agents/local/task-merger.md +27 -0
  151. package/templates/agents/local/task-reviewer.md +30 -0
  152. package/templates/agents/local/task-worker.md +34 -0
  153. package/templates/agents/supervisor-routing.md +92 -0
  154. package/templates/agents/supervisor.md +229 -0
  155. package/templates/agents/task-merger.md +214 -0
  156. package/templates/agents/task-reviewer.md +260 -0
  157. package/templates/agents/task-worker-segment.md +44 -0
  158. package/templates/agents/task-worker.md +557 -0
  159. package/templates/tasks/CONTEXT.md +30 -0
  160. package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
  161. package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
  162. package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
  163. package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
@@ -0,0 +1,4465 @@
1
+ /**
2
+ * All types, interfaces, error classes, constants, and defaults
3
+ * @module orch/types
4
+ */
5
+ import { join } from "path";
6
+ import type { ExitClassification, TaskExitDiagnostic } from "./diagnostics.js";
7
+ // TP-189 (Cluster B): single source of truth for the worker user-tools
8
+ // default literal. The constants module is import-free so this does NOT
9
+ // create a cycle (types.ts -> tool-allowlist-constants.ts is a leaf).
10
+ import { DEFAULT_WORKER_USER_TOOLS } from "./tool-allowlist-constants.ts";
11
+
12
+ // ── Types ────────────────────────────────────────────────────────────
13
+
14
+ /** Configuration from .pi/task-orchestrator.yaml. Shipped defaults for the required fields are in `DEFAULT_ORCHESTRATOR_CONFIG`. */
15
+ export interface OrchestratorConfig {
16
+ orchestrator: {
17
+ max_lanes: number;
18
+ worktree_location: "sibling" | "subdirectory";
19
+ worktree_prefix: string;
20
+ batch_id_format: "timestamp" | "sequential";
21
+ spawn_mode: "subprocess"; // single-literal type: no other spawn mode is representable here
22
+ sessionPrefix: string;
23
+ /** Optional operator identifier. Auto-detected from OS username if empty. */
24
+ operator_id: string;
25
+ /** How completed batches are integrated. manual = user runs /orch-integrate. supervised = supervisor proposes plan, asks confirmation. auto = supervisor executes without asking. */
26
+ integration: "manual" | "supervised" | "auto";
27
+ /**
28
+ * Optional pre-resolved batch ID injected by callers that already
29
+ * know the batch identity (e.g., resumed orchestrations). When
30
+ * absent, callers fall back to the `ORCH_BATCH_ID` env var or a
31
+ * timestamp. Read by `executeLaneV2` (execution.ts).
32
+ *
33
+ * @since TP-195 (#TBD) — documented field that was already being
34
+ * read at runtime via `config.orchestrator?.batchId` and asserted
35
+ * by the source-grep invariant in `runtime-model-fallback.test.ts`.
36
+ */
37
+ batchId?: string;
38
+ };
39
+ dependencies: {
40
+ source: "prompt" | "agent";
41
+ cache: boolean;
42
+ };
43
+ assignment: {
44
+ strategy: "affinity-first" | "round-robin" | "load-balanced";
45
+ size_weights: Record<string, number>; // shipped defaults key this by "S" / "M" / "L"
46
+ };
47
+ pre_warm: {
48
+ auto_detect: boolean;
49
+ commands: Record<string, string>;
50
+ always: string[];
51
+ };
52
+ merge: {
53
+ model: string;
54
+ tools: string;
55
+ /** Merge-agent thinking mode (empty = inherit session thinking) */
56
+ thinking: string;
57
+ verify: string[];
58
+ order: "fewest-files-first" | "sequential";
59
+ /** Merge agent timeout in minutes. `DEFAULT_ORCHESTRATOR_CONFIG` ships 90. Increase for large batches. */
60
+ timeout_minutes: number;
61
+ /** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
62
+ exclude_extensions?: string[];
63
+ };
64
+ failure: { // NOTE(review): units of stall_timeout / abort_grace_period are not stated in this file — confirm before changing
65
+ on_task_failure: "skip-dependents" | "stop-wave" | "stop-all";
66
+ on_merge_failure: "pause" | "abort";
67
+ stall_timeout: number;
68
+ max_worker_minutes: number;
69
+ abort_grace_period: number;
70
+ };
71
+ monitoring: {
72
+ poll_interval: number; // NOTE(review): unit not stated in this file — confirm
73
+ };
74
+ /** Verification baseline fingerprinting settings (TP-032). */
75
+ verification: {
76
+ enabled: boolean;
77
+ mode: "strict" | "permissive";
78
+ flaky_reruns: number;
79
+ };
80
+ }
81
+
82
+ /**
83
+ * Stable segment identifier, shaped `<taskId>::<repoId>` with an optional numeric `::<N>` suffix (build values with `buildSegmentId`).
84
+ *
85
+ * SegmentId is opaque — never parse by string-splitting.
86
+ * Use structured node/record fields (`repoId`, `taskId`) instead.
87
+ */
88
+ export type SegmentId = `${string}::${string}` | `${string}::${string}::${number}`;
89
+
90
+ /** How an intra-task segment edge was produced (for observability/debugging). NOTE(review): presumably `explicit` = declared in prompt metadata and `inferred` = added by fallback inference — confirm against the planner. */
91
+ export type SegmentEdgeProvenance = "explicit" | "inferred";
92
+
93
+ /** Directed repo-to-repo edge parsed from optional `## Segment DAG` prompt metadata. */
94
+ export interface PromptSegmentDagEdge {
95
+ fromRepoId: string; // repo ID the edge starts from
96
+ toRepoId: string; // repo ID the edge points to
97
+ }
98
+
99
+ /** Optional explicit segment metadata parsed from PROMPT.md (exposed on `ParsedTask.explicitSegmentDag`). */
100
+ export interface PromptSegmentDagMetadata {
101
+ /** Repo IDs participating in this task's segment graph, first-seen order. */
102
+ repoIds: string[];
103
+ /** Directed repo-level edges, sorted by `fromRepoId` then `toRepoId`. */
104
+ edges: PromptSegmentDagEdge[];
105
+ }
106
+
107
+ /** A parsed task from PROMPT.md, enriched for orchestrator use. Fields marked `?` may be absent; see each field for when it is populated. */
108
+ export interface ParsedTask {
109
+ taskId: string;
110
+ taskName: string;
111
+ reviewLevel: number;
112
+ size: string; // size label; NOTE(review): presumably looked up in `assignment.size_weights` (defaults: S, M, L) — confirm
113
+ dependencies: string[]; // NOTE(review): presumably IDs of the tasks this one depends on — confirm against discovery
114
+ fileScope: string[];
115
+ taskFolder: string;
116
+ promptPath: string;
117
+ areaName: string;
118
+ status: "pending" | "complete";
119
+ /** Repo ID declared in the PROMPT metadata (e.g., "api", "frontend"). Undefined if not declared. */
120
+ promptRepoId?: string;
121
+ /** Resolved repo ID after routing precedence (workspace mode only). Undefined in repo mode. */
122
+ resolvedRepoId?: string;
123
+ /** Optional explicit segment DAG metadata from `## Segment DAG`. */
124
+ explicitSegmentDag?: PromptSegmentDagMetadata;
125
+ /**
126
+ * Repo ID that owns task packet files (v4, TP-081).
127
+ * Populated by execution engine in workspace mode. Undefined in repo mode.
128
+ */
129
+ packetRepoId?: string;
130
+ /**
131
+ * Absolute path to task folder in the packet repo worktree (v4, TP-081).
132
+ * Populated by execution engine. Undefined if not yet resolved.
133
+ */
134
+ packetTaskPath?: string;
135
+ /**
136
+ * Segment IDs for this task (v4, TP-081).
137
+ * Populated from TaskSegmentPlan during execution. Typed as plain `string[]`, not `SegmentId[]`.
138
+ */
139
+ segmentIds?: string[];
140
+ /**
141
+ * Currently active segment ID (v4, TP-081).
142
+ * Null when no segment is active; the property itself may also be absent.
143
+ */
144
+ activeSegmentId?: string | null;
145
+ /**
146
+ * Step-to-segment checkbox mapping parsed from PROMPT.md `#### Segment:` markers.
147
+ * Populated by discovery (Phase A, TP-173). Undefined if not yet parsed.
148
+ */
149
+ stepSegmentMap?: StepSegmentMapping[];
150
+ }
151
+
152
/**
 * Compose the stable segment ID for a task/repo pair.
 *
 * The result is `<taskId>::<repoId>`. A `::<N>` suffix is appended only when
 * `sequence` is a finite number of at least 2; fractional values are floored.
 *
 * @param taskId - Task identifier placed first in the ID.
 * @param repoId - Repo identifier placed second in the ID.
 * @param sequence - Optional ordinal. Omitted, non-finite, or below 2 yields the unsuffixed form.
 * @returns The composed segment ID.
 */
export function buildSegmentId(taskId: string, repoId: string, sequence?: number): SegmentId {
	const unsuffixed = `${taskId}::${repoId}`;
	// Guard clause: anything that is not a usable ordinal gets the plain two-part ID.
	if (typeof sequence !== "number" || !Number.isFinite(sequence) || sequence < 2) {
		return unsuffixed as SegmentId;
	}
	return `${unsuffixed}::${Math.floor(sequence)}` as SegmentId;
}
159
+
160
/**
 * Return the repo ID carried on a structured segment record.
 *
 * A SegmentId is opaque and must never be split as a string; this accessor
 * reads the `repoId` field instead.
 *
 * @param segment - Any record exposing a `repoId` string.
 * @returns The record's `repoId`, unchanged.
 */
export function parseSegmentIdRepo(segment: { repoId: string }): string {
	const { repoId } = segment;
	return repoId;
}
168
+
169
/**
 * Build a dynamic segment expansion request ID (`exp-{timestamp}-{random5}`).
 *
 * @param timestamp - Value for the timestamp part; defaults to `Date.now()`.
 *   Finite values are floored; non-finite values are replaced by `Date.now()`.
 * @returns An ID whose last part is five characters derived from
 *   `Math.random()` in base 36, right-padded with `0` when fewer are available.
 */
export function buildExpansionRequestId(timestamp = Date.now()): string {
	const stamp = Number.isFinite(timestamp) ? Math.floor(timestamp) : Date.now();
	// Base-36 fraction digits of a random number, with the leading "0." sliced off.
	const fraction = Math.random().toString(36).slice(2);
	const sanitized = fraction.toLowerCase().replace(/[^a-z0-9]/g, "");
	const random5 = sanitized.padEnd(5, "0").slice(0, 5);
	return `exp-${stamp}-${random5}`;
}
180
+
181
+ // ── Step-Segment Mapping (Phase A: segment-scoped worker visibility) ────
182
+
183
+ /**
184
+ * Authoritative segment-scope mode for a single worker iteration (exactly one of the two literals below).
185
+ *
186
+ * - `FULL_TASK`: the worker sees the entire PROMPT.md, all steps, all checkboxes.
187
+ * No `Active segment ID` / `Your checkboxes for this step` prose is injected.
188
+ * Segment-related environment variables (`TASKPLANE_ACTIVE_SEGMENT_ID`,
189
+ * `TASKPLANE_SEGMENT_ID`) are hard-cleared so that runtime tools keyed on
190
+ * them (e.g., `request_segment_expansion`) cannot accidentally register.
191
+ *
192
+ * - `SEGMENT_SCOPED`: the worker is iterating a specific segment of a
193
+ * multi-segment task. Only that segment's steps and checkboxes are shown;
194
+ * `Active segment ID` is announced; segment-related env vars carry the
195
+ * active `segmentId`; the segment-overlay system prompt is appended.
196
+ *
197
+ * This is the single authoritative flag for the segment-scope decision
198
+ * (TP-196 / #502). Call sites should derive their behaviour from this mode
199
+ * rather than re-evaluating the underlying boolean conditions, which prevents
200
+ * the multiple branches from drifting out of sync.
201
+ *
202
+ * @since TP-196
203
+ */
204
+ export type SegmentScopeMode = "FULL_TASK" | "SEGMENT_SCOPED";
205
+
206
+ /** A group of checkboxes scoped to a single repo within a step (element type of `StepSegmentMapping.segments`). */
207
+ export interface SegmentCheckboxGroup {
208
+ repoId: string; // repo the checkboxes in this group belong to
209
+ checkboxes: string[]; // checkbox entries as strings; exact text format is not shown in this file
210
+ }
211
+
212
+ /** Maps a step to its repo-scoped checkbox groups (stored on `ParsedTask.stepSegmentMap`). */
213
+ export interface StepSegmentMapping {
214
+ stepNumber: number;
215
+ stepName: string;
216
+ segments: SegmentCheckboxGroup[]; // one entry per repo-scoped checkbox group in this step
217
+ }
218
+
219
+ /** One repo-scoped segment node for a task (element type of `TaskSegmentPlan.segments`). */
220
+ export interface TaskSegmentNode {
221
+ segmentId: SegmentId;
222
+ taskId: string;
223
+ repoId: string; // read this field rather than parsing `segmentId`, which is opaque
224
+ /**
225
+ * Deterministic segment order within a task (0-indexed).
226
+ * Stable tie-break: repoId lexical order.
227
+ */
228
+ order: number;
229
+ }
230
+
231
+ /** Directed edge between two segment nodes in the same task (element type of `TaskSegmentPlan.edges`). */
232
+ export interface TaskSegmentEdge {
233
+ fromSegmentId: SegmentId; // segment the edge starts from
234
+ toSegmentId: SegmentId; // segment the edge points to
235
+ provenance: SegmentEdgeProvenance;
236
+ /** Optional explanation of why this edge exists (debug/telemetry aid). */
237
+ reason?: string;
238
+ }
239
+
240
+ /**
241
+ * Deterministic segment plan for one task: its segment nodes, the edges between them, and how the plan was derived.
242
+ *
243
+ * Ordering contract:
244
+ * - `segments`: sorted by `order`, then `repoId`
245
+ * - `edges`: sorted by `fromSegmentId`, then `toSegmentId`
246
+ */
247
+ export interface TaskSegmentPlan {
248
+ taskId: string;
249
+ segments: TaskSegmentNode[];
250
+ edges: TaskSegmentEdge[];
251
+ /**
252
+ * explicit-dag: parsed from prompt metadata
253
+ * inferred-sequential: deterministic fallback inference
254
+ * repo-singleton: repo mode fallback (`resolvedRepoId ?? "default"`)
255
+ */
256
+ mode: "explicit-dag" | "inferred-sequential" | "repo-singleton";
257
+ }
258
+
259
+ /** Directed edge between repos requested in a dynamic segment expansion (element type of `SegmentExpansionRequest.edges`). */
260
+ export interface SegmentExpansionEdge {
261
+ from: string; // repo the edge starts from
262
+ to: string; // repo the edge points to
263
+ }
264
+
265
+ /**
266
+ * File IPC payload for worker-initiated dynamic segment expansion requests.
267
+ *
268
+ * Written to: `.pi/mailbox/{batchId}/{agentId}/outbox/segment-expansion-{requestId}.json`
269
+ */
270
+ export interface SegmentExpansionRequest {
271
+ /** Unique request ID: `exp-{timestamp}-{random5}` (the format produced by `buildExpansionRequestId`). */
272
+ requestId: string;
273
+ /** Task ID making the expansion request. */
274
+ taskId: string;
275
+ /** Segment active when the request was emitted. */
276
+ fromSegmentId: SegmentId;
277
+ /** Repo IDs the worker is requesting the engine to add. */
278
+ requestedRepoIds: string[];
279
+ /** Human rationale from the worker. */
280
+ rationale: string;
281
+ /** Placement directive for inserting new segments. */
282
+ placement: "after-current" | "end";
283
+ /** Inter-request ordering edges. The field is required by this type; NOTE(review): presumably an empty array means no ordering constraints — confirm. */
284
+ edges: SegmentExpansionEdge[];
285
+ /** Epoch milliseconds when the request was emitted. */
286
+ timestamp: number;
287
+ }
288
+
289
+ /**
290
+ * TaskId-keyed segment plans (key: task ID string, value: that task's `TaskSegmentPlan`).
291
+ * Iteration order must be deterministic: sort task IDs lexicographically.
292
+ */
293
+ export type TaskSegmentPlanMap = Map<string, TaskSegmentPlan>;
294
+
295
+ /** A wave: a group of tasks whose dependencies are all satisfied (element type of `BatchState.waves`). */
296
+ export interface WaveAssignment {
297
+ waveNumber: number; // NOTE(review): whether this is 0- or 1-based is not stated in this file — confirm
298
+ tasks: LaneAssignment[]; // each task in the wave paired with the lane it runs on
299
+ }
300
+
301
+ /** A task assigned to a specific lane within a wave (element type of `WaveAssignment.tasks`). */
302
+ export interface LaneAssignment {
303
+ taskId: string;
304
+ lane: number;
305
+ task: ParsedTask; // full parsed task record carried alongside `taskId`
306
+ /** Repo ID this task targets (workspace mode only). Undefined in repo mode. */
307
+ repoId?: string;
308
+ }
309
+
310
+ /** Runtime state of the entire batch execution. `freshBatchState()` returns the empty starting value (phase "idle", zeroed counters). */
311
+ export interface BatchState {
312
+ phase: "idle" | "planning" | "running" | "paused" | "merging" | "complete" | "error" | "aborted";
313
+ batchId: string; // empty string in a fresh state
314
+ waves: WaveAssignment[];
315
+ currentWave: number;
316
+ tasksTotal: number;
317
+ tasksComplete: number;
318
+ tasksFailed: number;
319
+ laneCount: number;
320
+ laneStatuses: Map<number, LaneStatus>; // NOTE(review): presumably keyed by lane number — confirm
321
+ startTime: number; // 0 in a fresh state; NOTE(review): presumably epoch milliseconds once running — confirm
322
+ errors: string[];
323
+ }
324
+
325
+ /** Per-lane runtime status (value type of `BatchState.laneStatuses`). */
326
+ export interface LaneStatus {
327
+ lane: number;
328
+ taskId: string | null; // NOTE(review): presumably null while no task is assigned to the lane — confirm
329
+ status: "idle" | "running" | "complete" | "failed" | "stalled";
330
+ stepProgress: string;
331
+ iteration: number;
332
+ elapsed: number; // NOTE(review): unit not stated in this file — confirm
333
+ tmuxSession: string;
334
+ }
335
+
336
+ /** Task area definition from task-runner.yaml (value type of `TaskRunnerConfig.task_areas`). */
337
+ export interface TaskArea {
338
+ path: string;
339
+ prefix: string;
340
+ context: string;
341
+ /** Optional repo ID for routing tasks in this area (workspace mode only). */
342
+ repoId?: string;
343
+ }
344
+
345
+ /** Subset of task-runner.yaml that the orchestrator needs. Shipped defaults are in `DEFAULT_TASK_RUNNER_CONFIG`. */
346
+ export interface TaskRunnerConfig {
347
+ task_areas: Record<string, TaskArea>;
348
+ reference_docs: Record<string, string>;
349
+ /** Named testing/verification commands (e.g., { test: "node --test tests/*.test.ts" }). Used for baseline fingerprinting (TP-032). */
350
+ testing_commands?: Record<string, string>;
351
+ /**
352
+ * Model fallback behavior when a configured model becomes unavailable mid-batch.
353
+ * - `"inherit"` (default): Retry without explicit model (session model fallback).
354
+ * - `"fail"`: No model substitution — normal failure path.
355
+ * @since TP-055
356
+ */
357
+ model_fallback?: "inherit" | "fail";
358
+ /**
359
+ * Reviewer agent model/thinking/tools configuration.
360
+ * Threaded through to `spawnReviewer()` via env vars.
361
+ * @since TP-160
362
+ */
363
+ reviewer?: {
364
+ /** Model string (empty = inherit session default) */
365
+ model: string;
366
+ /** Thinking mode ("on" | "off" | budget string, empty = inherit) */
367
+ thinking: string;
368
+ /** Comma-separated tool allowlist */
369
+ tools: string;
370
+ /** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
371
+ excludeExtensions?: string[];
372
+ };
373
+ /**
374
+ * Worker agent model/thinking/tools configuration.
375
+ * Threaded through to `spawnAgent()` via env vars.
376
+ * @since TP-181
377
+ */
378
+ worker?: {
379
+ /** Model string (empty = inherit session default) */
380
+ model: string;
381
+ /** Thinking mode ("on" | "off" | budget string, empty = inherit) */
382
+ thinking: string;
383
+ /** Comma-separated tool allowlist */
384
+ tools: string;
385
+ /** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
386
+ excludeExtensions?: string[];
387
+ };
388
+ /** Worker agent extension exclusion list. NOTE(review): overlaps with `worker.excludeExtensions`; which one takes precedence is not visible in this file — confirm. @since TP-180 */
389
+ workerExcludeExtensions?: string[];
390
+ }
391
+
392
+ /** Result of a preflight check run: an overall verdict plus the individual checks behind it. */
393
+ export interface PreflightResult {
394
+ passed: boolean; // NOTE(review): how "warn" checks affect this flag is not visible in this file — confirm
395
+ checks: PreflightCheck[];
396
+ }
397
+
398
+ /** Individual preflight check (element type of `PreflightResult.checks`). */
399
+ export interface PreflightCheck {
400
+ name: string;
401
+ status: "pass" | "fail" | "warn";
402
+ message: string;
403
+ hint?: string; // optional extra text; NOTE(review): presumably remediation advice — confirm
404
+ }
405
+
406
+ // ── Defaults ─────────────────────────────────────────────────────────
407
+
408
+ export const DEFAULT_ORCHESTRATOR_CONFIG: OrchestratorConfig = { // shipped defaults for every required `OrchestratorConfig` field
409
+ orchestrator: { // optional `batchId` is intentionally left unset here
410
+ max_lanes: 3,
411
+ worktree_location: "subdirectory",
412
+ worktree_prefix: "orchid-wt",
413
+ batch_id_format: "timestamp",
414
+ spawn_mode: "subprocess",
415
+ sessionPrefix: "orch",
416
+ operator_id: "", // empty string: per the interface docs, auto-detected from the OS username
417
+ integration: "manual",
418
+ },
419
+ dependencies: {
420
+ source: "prompt",
421
+ cache: true,
422
+ },
423
+ assignment: {
424
+ strategy: "affinity-first",
425
+ size_weights: { S: 1, M: 2, L: 4 },
426
+ },
427
+ pre_warm: {
428
+ auto_detect: false,
429
+ commands: {},
430
+ always: [],
431
+ },
432
+ merge: { // optional `exclude_extensions` is left unset here
433
+ model: "",
434
+ // TP-189 (Cluster B): merge default sourced from the import-free
435
+ // `tool-allowlist-constants.ts` module. The previous concern about
436
+ // importing from `agent-host.ts` (which DOES depend on types.ts and
437
+ // would create a cycle) no longer applies because the constant
438
+ // lives in a leaf module that imports nothing.
439
+ tools: DEFAULT_WORKER_USER_TOOLS,
440
+ thinking: "off",
441
+ verify: [],
442
+ order: "fewest-files-first",
443
+ timeout_minutes: 90, // minutes, per the `OrchestratorConfig.merge.timeout_minutes` docs
444
+ },
445
+ failure: {
446
+ on_task_failure: "skip-dependents",
447
+ on_merge_failure: "pause",
448
+ stall_timeout: 30,
449
+ max_worker_minutes: 30,
450
+ abort_grace_period: 60,
451
+ },
452
+ monitoring: {
453
+ poll_interval: 5,
454
+ },
455
+ verification: {
456
+ enabled: false,
457
+ mode: "permissive",
458
+ flaky_reruns: 1,
459
+ },
460
+ };
461
+
462
+ export const DEFAULT_TASK_RUNNER_CONFIG: TaskRunnerConfig = { // shipped defaults; every other optional field is left unset
463
+ task_areas: {},
464
+ reference_docs: {},
465
+ model_fallback: "inherit", // retry without an explicit model, per the `TaskRunnerConfig.model_fallback` docs
466
+ };
467
+
468
+ // ── Helpers ──────────────────────────────────────────────────────────
469
+
470
/**
 * Create the empty starting state for a batch.
 *
 * Every call returns a new object with its own arrays and `Map`, so callers
 * can mutate the result without affecting other states.
 *
 * @returns A `BatchState` in phase `"idle"` with an empty batch ID, no waves,
 *   no lane statuses, no errors, and all counters and `startTime` set to 0.
 */
export function freshBatchState(): BatchState {
	const initial: BatchState = {
		phase: "idle",
		batchId: "",
		waves: [],
		currentWave: 0,
		tasksTotal: 0,
		tasksComplete: 0,
		tasksFailed: 0,
		laneCount: 0,
		laneStatuses: new Map(),
		startTime: 0,
		errors: [],
	};
	return initial;
}
485
+
486
+ // ── Worktree Types ───────────────────────────────────────────────────
487
+
488
+ /** Information about a created worktree: where it lives, which branch it has checked out, and which lane owns it. Returned by createWorktree(). */
489
+ export interface WorktreeInfo {
490
+ /** Absolute filesystem path to the worktree directory */
491
+ path: string;
492
+ /** Branch name checked out in the worktree (e.g. task/lane-1-20260308T111750) */
493
+ branch: string;
494
+ /** Lane number (1-indexed) this worktree is assigned to */
495
+ laneNumber: number;
496
+ }
497
+
498
+ /** Options for createWorktree(). All fields are required except `config`. */
499
+ export interface CreateWorktreeOptions {
500
+ /** Lane number (1-indexed) */
501
+ laneNumber: number;
502
+ /** Batch ID timestamp (e.g. "20260308T111750") */
503
+ batchId: string;
504
+ /** Branch to base the worktree on (e.g. "develop") */
505
+ baseBranch: string;
506
+ /** Worktree directory prefix (e.g. "orchid-wt") */
507
+ prefix: string;
508
+ /** Operator identifier (sanitized, e.g., "henrylach") */
509
+ opId: string;
510
+ /** Full orchestrator config (optional; used for worktree_location) */
511
+ config?: OrchestratorConfig;
512
+ }
513
+
514
+ /**
515
+ * Stable error codes for worktree operations (carried on `WorktreeError.code`; `BulkWorktreeError.code` additionally allows "UNKNOWN").
516
+ *
517
+ * - WORKTREE_PATH_IS_WORKTREE: path already registered as a git worktree
518
+ * - WORKTREE_PATH_NOT_EMPTY: path exists and is a non-empty non-worktree dir
519
+ * - WORKTREE_BRANCH_EXISTS: branch name already exists (checked out elsewhere)
520
+ * - WORKTREE_INVALID_BASE: base branch does not exist
521
+ * - WORKTREE_GIT_ERROR: unexpected git command failure
522
+ * - WORKTREE_VERIFY_FAILED: post-creation/reset verification failed
523
+ * - WORKTREE_REMOVE_FAILED: worktree removal failed (even after retries)
524
+ * - WORKTREE_REMOVE_RETRY_EXHAUSTED: all retry attempts for worktree removal exhausted (Windows file locking)
525
+ * - WORKTREE_BRANCH_DELETE_FAILED: branch deletion failed after successful worktree removal
526
+ * - WORKTREE_NOT_FOUND: worktree path does not exist on disk
527
+ * - WORKTREE_NOT_REGISTERED: path exists but is not a registered git worktree
528
+ * - WORKTREE_DIRTY: worktree has uncommitted changes (cannot reset)
529
+ * - WORKTREE_RESET_FAILED: git checkout -B reset command failed
530
+ */
531
+ export type WorktreeErrorCode =
532
+ | "WORKTREE_PATH_IS_WORKTREE"
533
+ | "WORKTREE_PATH_NOT_EMPTY"
534
+ | "WORKTREE_BRANCH_EXISTS"
535
+ | "WORKTREE_INVALID_BASE"
536
+ | "WORKTREE_GIT_ERROR"
537
+ | "WORKTREE_VERIFY_FAILED"
538
+ | "WORKTREE_REMOVE_FAILED"
539
+ | "WORKTREE_REMOVE_RETRY_EXHAUSTED"
540
+ | "WORKTREE_BRANCH_DELETE_FAILED"
541
+ | "WORKTREE_NOT_FOUND"
542
+ | "WORKTREE_NOT_REGISTERED"
543
+ | "WORKTREE_DIRTY"
544
+ | "WORKTREE_RESET_FAILED";
545
+
546
/**
 * Error thrown by worktree operations, tagged with a stable machine-readable code.
 *
 * Callers can branch on `code` instead of matching on `message` text.
 */
export class WorktreeError extends Error {
	/** Stable code identifying which worktree failure occurred. */
	public code: WorktreeErrorCode;

	/**
	 * @param code - Stable error code for this failure.
	 * @param message - Human-readable description, passed to `Error`.
	 */
	public constructor(code: WorktreeErrorCode, message: string) {
		super(message);
		// `name` is assigned before `code`, matching the established own-property order.
		this.name = "WorktreeError";
		this.code = code;
	}
}
556
+
557
+ /**
558
+ * Result of a removeWorktree() operation.
559
+ *
560
+ * Provides status flags so callers can branch on outcome without
561
+ * catching errors for expected idempotent scenarios (e.g. the worktree was already gone).
562
+ */
563
+ export interface RemoveWorktreeResult {
564
+ /** Whether the worktree directory was removed in this call */
565
+ removed: boolean;
566
+ /** Whether the worktree was already absent (idempotent no-op) */
567
+ alreadyRemoved: boolean;
568
+ /** Whether the lane branch was deleted (or was already absent) */
569
+ branchDeleted: boolean;
570
+ /** Whether the lane branch was preserved (unmerged commits detected) */
571
+ branchPreserved: boolean;
572
+ /** The saved branch name (if preserved) */
573
+ savedBranch?: string;
574
+ /** Number of unmerged commits (if preserved) */
575
+ unmergedCount?: number;
576
+ }
577
+
578
+ // ── Bulk Operation Types ─────────────────────────────────────────────
579
+
580
+ /** Error from a single worktree within a bulk operation (a plain record, not an `Error` instance). */
581
+ export interface BulkWorktreeError {
582
+ /** Lane number that failed */
583
+ laneNumber: number;
584
+ /** Error code from WorktreeError (if available); "UNKNOWN" otherwise */
585
+ code: WorktreeErrorCode | "UNKNOWN";
586
+ /** Human-readable error message */
587
+ message: string;
588
+ }
589
+
590
+ /**
591
+ * Result of createLaneWorktrees() bulk creation (reported as data via `success`, `errors`, and `rollbackErrors`).
592
+ *
593
+ * On success: `success=true`, `worktrees` contains all created WorktreeInfos.
594
+ * On failure: `success=false`, `errors` lists per-lane failures,
595
+ * `rolledBack` indicates whether cleanup of partial state succeeded.
596
+ */
597
+ export interface CreateLaneWorktreesResult {
598
+ /** Whether all lane worktrees were created successfully */
599
+ success: boolean;
600
+ /** Created worktrees (sorted by laneNumber). Empty on failure if rolled back. */
601
+ worktrees: WorktreeInfo[];
602
+ /** Per-lane errors encountered during creation */
603
+ errors: BulkWorktreeError[];
604
+ /** Whether rollback of partially-created worktrees succeeded (only relevant on failure) */
605
+ rolledBack: boolean;
606
+ /** Errors encountered during rollback (if any) */
607
+ rollbackErrors: BulkWorktreeError[];
608
+ }
609
+
/**
 * What happened to one worktree during removeAllWorktrees().
 */
export interface RemoveWorktreeOutcome {
  /** The worktree that removal was attempted on. */
  worktree: WorktreeInfo;
  /** Removal result, or null when the removal threw. */
  result: RemoveWorktreeResult | null;
  /** The error raised during removal, or null on success. */
  error: BulkWorktreeError | null;
}
621
+
/**
 * Outcome of the bulk removeAllWorktrees() operation.
 *
 * Removal is best-effort: an error on one worktree does not stop the
 * remaining ones from being processed (no fail-fast).
 */
export interface RemoveAllWorktreesResult {
  /** How many worktrees matched the prefix and were attempted. */
  totalAttempted: number;
  /** Worktrees that were removed, or were already removed. */
  removed: WorktreeInfo[];
  /** Outcomes for the worktrees that could not be removed. */
  failed: RemoveWorktreeOutcome[];
  /** Every per-worktree outcome, in processing order. */
  outcomes: RemoveWorktreeOutcome[];
  /** Branches that were kept because they had unmerged commits. */
  preserved: {
    branch: string;
    savedBranch: string;
    laneNumber: number;
    unmergedCount?: number;
  }[];
}
644
+
645
+ // ── Discovery Types ──────────────────────────────────────────────────
646
+
/** Structured discovery-phase error carrying diagnostic context. */
export interface DiscoveryError {
  /** Machine-readable classification of the problem. */
  code:
    | "PARSE_MISSING_ID"
    | "PARSE_MALFORMED"
    | "DUPLICATE_ID"
    | "UNKNOWN_ARG"
    | "SCAN_ERROR"
    | "DEP_UNRESOLVED"
    | "DEP_PENDING"
    | "DEP_AMBIGUOUS"
    | "DEP_SOURCE_FALLBACK"
    | "TASK_REPO_UNRESOLVED"
    | "TASK_REPO_UNKNOWN"
    | "TASK_ROUTING_STRICT"
    | "SEGMENT_DAG_INVALID"
    | "SEGMENT_REPO_UNKNOWN"
    | "SEGMENT_STEP_DUPLICATE_REPO"
    | "SEGMENT_STEP_EMPTY"
    | "SEGMENT_STEP_REPO_INVALID";
  /** Human-readable description of the problem. */
  message: string;
  /** Path of the task the error relates to, when known. */
  taskPath?: string;
  /** ID of the task the error relates to, when known. */
  taskId?: string;
}

/**
 * The subset of discovery error codes treated as fatal, i.e. codes that
 * block planning/execution.
 *
 * Shared by formatDiscoveryResults, extension.ts, and engine.ts so that
 * all of them classify fatal errors the same way. Keep this list in sync
 * with the `DiscoveryError["code"]` union.
 */
export const FATAL_DISCOVERY_CODES: readonly DiscoveryError["code"][] = [
  "DUPLICATE_ID",
  "DEP_UNRESOLVED",
  "DEP_PENDING",
  "DEP_AMBIGUOUS",
  "PARSE_MISSING_ID",
  "TASK_REPO_UNRESOLVED",
  "TASK_REPO_UNKNOWN",
  "TASK_ROUTING_STRICT",
  "SEGMENT_DAG_INVALID",
  "SEGMENT_REPO_UNKNOWN",
  "SEGMENT_STEP_DUPLICATE_REPO",
] as const;
692
+
/** Output of the complete discovery pipeline. */
export interface DiscoveryResult {
  /** Pending tasks, keyed by string (presumably the task ID — confirm against the producer). */
  pending: Map<string, ParsedTask>;
  /** Set of completed entries (presumably task IDs — confirm against the producer). */
  completed: Set<string>;
  /** Errors collected during discovery. */
  errors: DiscoveryError[];
}
699
+
700
+ // ── Wave Computation Types ───────────────────────────────────────────
701
+
/** Task dependency graph stored as adjacency lists (task → tasks it depends on). */
export interface DependencyGraph {
  /** Task ID → IDs of the tasks it depends on (its predecessors). */
  dependencies: Map<string, string[]>;
  /** Task ID → IDs of the tasks that depend on it (its successors). */
  dependents: Map<string, string[]>;
  /** Every task ID in the graph; pending tasks only, completed ones are excluded. */
  nodes: Set<string>;
}
711
+
/** Outcome of validating a dependency graph. */
export interface GraphValidationResult {
  /** Whether the graph passed validation. */
  valid: boolean;
  /** Errors reported by validation. */
  errors: DiscoveryError[];
}
717
+
/** Outcome of computing execution waves. */
export interface WaveComputationResult {
  /** The computed wave assignments. */
  waves: WaveAssignment[];
  /** Errors reported during wave computation. */
  errors: DiscoveryError[];
  /** Optional task→segment planning map (TP-080, additive contract). */
  segmentPlans?: TaskSegmentPlanMap;
}
725
+
726
+ // ── Lane Allocation (Phase 3) ────────────────────────────────────────
727
+
/**
 * Error codes that lane allocation can produce.
 *
 * - ALLOC_INVALID_CONFIG: configuration validation failed
 * - ALLOC_EMPTY_WAVE: no tasks were provided for allocation
 * - ALLOC_WORKTREE_FAILED: worktree creation failed (includes rollback info)
 * - ALLOC_TASK_NOT_FOUND: a task ID from the wave was not found in the pending map
 */
export type AllocationErrorCode =
  | "ALLOC_INVALID_CONFIG"
  | "ALLOC_EMPTY_WAVE"
  | "ALLOC_WORKTREE_FAILED"
  | "ALLOC_TASK_NOT_FOUND";

/**
 * Typed error thrown for lane allocation failures.
 *
 * `name` is always "AllocationError", so the error can be recognised
 * even where `instanceof` is not usable.
 */
export class AllocationError extends Error {
  /** Machine-readable failure classification. */
  code: AllocationErrorCode;
  /** Optional extra diagnostic text supplied by the thrower. */
  details?: string;

  /**
   * @param code - Allocation failure classification.
   * @param message - Human-readable message, forwarded to `Error`.
   * @param details - Optional additional diagnostic text.
   */
  constructor(code: AllocationErrorCode, message: string, details?: string) {
    super(message);
    this.name = "AllocationError";
    this.code = code;
    this.details = details;
  }
}
754
+
/**
 * One task placed in a lane, together with its position in that lane.
 *
 * A lane runs its tasks sequentially by ascending `order`. The ordering
 * is deterministic for the same input.
 */
export interface AllocatedTask {
  /** Task ID (e.g., "TO-014"). */
  taskId: string;
  /** Position within the lane's execution sequence (0-indexed). */
  order: number;
  /** The full parsed task metadata. */
  task: ParsedTask;
  /** Estimated duration, in minutes. */
  estimatedMinutes: number;
}
771
+
/**
 * A lane that has been fully allocated and is ready to execute.
 *
 * This is the contract between Step 1 (allocation) and Step 2
 * (execution): it carries everything Steps 2-3 need to run lane
 * sessions, monitor progress, and identify the lane.
 */
export interface AllocatedLane {
  /** Lane number (1-indexed, deterministic, globally unique across repos). */
  laneNumber: number;
  /** Lane identifier used for display and logging (e.g., "lane-1"). */
  laneId: string;
  /** Lane session identifier (e.g., "orch-lane-1") — used by Step 2. */
  laneSessionId: string;
  /** Absolute path to this lane's worktree directory. */
  worktreePath: string;
  /** Git branch checked out in the worktree. */
  branch: string;
  /** Tasks assigned to the lane, ordered for sequential execution. */
  tasks: AllocatedTask[];
  /** The assignment strategy that was used (for diagnostics). */
  strategy: "affinity-first" | "round-robin" | "load-balanced";
  /** Total estimated load (sum of task weights). */
  estimatedLoad: number;
  /** Total estimated duration in minutes (sum of task durations). */
  estimatedMinutes: number;
  /** Repo ID the lane targets (workspace mode only); undefined in repo mode. */
  repoId?: string;
}
801
+
802
+ // ── Execution Types & Contracts ──────────────────────────────────────
803
+
/**
 * Lifecycle status of one task during lane execution.
 *
 * State machine:
 *   pending → running → succeeded
 *                     → failed
 *                     → stalled
 *   pending → skipped  (pause/abort before the task starts, or a prior task failed)
 */
export type LaneTaskStatus =
  | "pending"
  | "running"
  | "succeeded"
  | "failed"
  | "stalled"
  | "skipped";
814
+
/**
 * Telemetry embedded in a lane task outcome.
 *
 * The Runtime V2 lane-runner fills this in at emission time so that
 * downstream consumers (batch history, diagnostics) can read
 * authoritative usage figures without rebuilding task↔lane joins from
 * snapshot keys.
 */
export interface LaneTaskOutcomeTelemetry {
  /** Total input tokens for this task outcome. */
  inputTokens: number;
  /** Total output tokens for this task outcome. */
  outputTokens: number;
  /** Total cache-read tokens for this task outcome. */
  cacheReadTokens: number;
  /** Total cache-write tokens for this task outcome. */
  cacheWriteTokens: number;
  /** Cumulative cost in USD for this task outcome. */
  costUsd: number;
  /** Number of tool calls made while producing this outcome. */
  toolCalls: number;
  /** End-to-end duration of this outcome, in milliseconds. */
  durationMs: number;
}
838
+
/**
 * Result of executing one task inside a lane.
 *
 * `executeLane()` produces one of these per task in the lane's task
 * list. Step 3 (monitoring) and Step 4 (wave policy logic) consume them.
 */
export interface LaneTaskOutcome {
  /** Task identifier (e.g., "TO-014"). */
  taskId: string;
  /** Final status of the task. */
  status: LaneTaskStatus;
  /** Segment identifier for segment-aware execution (null for whole-task units). */
  segmentId?: string | null;
  /** Epoch ms when execution started; null if it never started (skipped). */
  startTime: number | null;
  /** Epoch ms when execution ended; null if still pending. */
  endTime: number | null;
  /** Human-readable reason for the outcome. */
  exitReason: string;
  /** Lane session name used for this task (e.g., "orch-lane-1"). */
  sessionName: string;
  /** Whether the .DONE file was found. */
  doneFileFound: boolean;
  /**
   * Number (1-indexed) of the lane that produced this outcome.
   *
   * Optional so that persisted state written before TP-116 still loads.
   */
  laneNumber?: number;
  /**
   * Task-level telemetry embedded in the outcome (authoritative for
   * Runtime V2).
   *
   * Optional for backward compatibility and for non-agent outcomes
   * (for example skipped tasks).
   */
  telemetry?: LaneTaskOutcomeTelemetry;
  /**
   * How many commits were preserved as partial progress for a failed task.
   * 0 when nothing was saved (succeeded tasks, no commits, etc.).
   * Optional for backward compatibility — treated as 0 when absent.
   */
  partialProgressCommits?: number;
  /**
   * Name of the saved branch that holds partial progress for a failed task.
   * Undefined when no partial progress was saved.
   * Optional for backward compatibility.
   */
  partialProgressBranch?: string;
  /**
   * Structured exit diagnostic for this task (v3, TP-030).
   *
   * This is the canonical structured exit data and, when present, is
   * preferred over the legacy `exitReason` string. It is produced by
   * `classifyExit()` after the session ends and then enriched with
   * progress/context metadata.
   *
   * Optional: absent for tasks that have not exited yet, and for
   * backward compatibility with pre-v3 code paths. Consumers should
   * check `exitDiagnostic` first and fall back to `exitReason` for
   * display.
   */
  exitDiagnostic?: TaskExitDiagnostic;
}
901
+
/**
 * Aggregate result of running every task in a lane.
 *
 * A lane runs its tasks sequentially. When a task fails and the lane
 * still has tasks left, those remaining tasks are marked as `skipped`.
 */
export interface LaneExecutionResult {
  /** Lane number (1-indexed). */
  laneNumber: number;
  /** Lane identifier for display (e.g., "lane-1"). */
  laneId: string;
  /** Per-task outcomes, in execution order. */
  tasks: LaneTaskOutcome[];
  /**
   * Aggregate lane status: `succeeded` if all tasks succeeded, `failed`
   * if any failed.
   *
   * NOTE(review): `partial` is also a declared value but its criteria are
   * not stated here — presumably a mix of succeeded and non-succeeded
   * tasks; confirm against executeLane().
   */
  overallStatus: "succeeded" | "failed" | "partial";
  /** Epoch ms when lane execution started. */
  startTime: number;
  /** Epoch ms when lane execution ended. */
  endTime: number;
}
922
+
923
+ // ── Execution Constants ──────────────────────────────────────────────
924
+
/**
 * How long (ms) to wait after a lane session exits before declaring
 * failure. Gives slow filesystems time to flush the .DONE file to disk.
 */
export const DONE_GRACE_MS = 5000;
930
+
/**
 * How often (ms) session liveness and the .DONE file are polled.
 */
export const EXECUTION_POLL_INTERVAL_MS = 2000;
935
+
/**
 * Retry ceiling for legacy lane-session spawn failures.
 * Retries apply only to transient failures (session name collision).
 */
export const SESSION_SPAWN_RETRY_MAX = 2;
941
+
942
+ // ── Execution Error Types ────────────────────────────────────────────
943
+
/**
 * Error codes for lane execution failures.
 *
 * - EXEC_SPAWN_FAILED: lane session could not be created after retries
 * - EXEC_TASK_FAILED: task completed without .DONE (non-zero exit)
 * - EXEC_TASK_STALLED: STATUS.md unchanged for stall_timeout (handled by Step 3)
 * - EXEC_TASK_STAGE_FAILED: git add failed for task files
 * - EXEC_TASK_COMMIT_FAILED: git commit failed for staged task files
 * - EXEC_TMUX_NOT_AVAILABLE: legacy `tmux` binary not found (compat path)
 * - EXEC_WORKTREE_MISSING: lane worktree path doesn't exist
 * - EXEC_MISSING_TASK_FOLDER: NOTE(review) previously undocumented —
 *   presumably the task's folder could not be found; confirm at the
 *   throw sites
 */
export type ExecutionErrorCode =
  | "EXEC_SPAWN_FAILED"
  | "EXEC_TASK_FAILED"
  | "EXEC_TASK_STALLED"
  | "EXEC_TASK_STAGE_FAILED"
  | "EXEC_TASK_COMMIT_FAILED"
  | "EXEC_TMUX_NOT_AVAILABLE"
  | "EXEC_WORKTREE_MISSING"
  | "EXEC_MISSING_TASK_FOLDER";

/**
 * Typed error for lane execution failures.
 *
 * `name` is always "ExecutionError". `laneId` and `taskId` identify
 * where the failure happened when the thrower supplies them.
 */
export class ExecutionError extends Error {
  /** Machine-readable failure classification. */
  code: ExecutionErrorCode;
  /** Lane the failure occurred in, when supplied. */
  laneId?: string;
  /** Task the failure relates to, when supplied. */
  taskId?: string;

  /**
   * @param code - Execution failure classification.
   * @param message - Human-readable message, forwarded to `Error`.
   * @param laneId - Optional lane identifier.
   * @param taskId - Optional task identifier.
   */
  constructor(code: ExecutionErrorCode, message: string, laneId?: string, taskId?: string) {
    super(message);
    this.name = "ExecutionError";
    this.code = code;
    this.laneId = laneId;
    this.taskId = taskId;
  }
}
979
+
980
+ // ── Monitoring Types & Contracts ─────────────────────────────────────
981
+
/**
 * Point-in-time view of one task's monitored state.
 *
 * `resolveTaskMonitorState()` builds it by combining:
 * - .DONE file presence
 * - lane-session liveness
 * - STATUS.md parse results
 * - STATUS.md mtime (for stall detection)
 */
export interface TaskMonitorSnapshot {
  /** Task ID (e.g., "TO-014"). */
  taskId: string;
  /** Resolved monitoring status. */
  status:
    | "pending"
    | "running"
    | "succeeded"
    | "failed"
    | "stalled"
    | "skipped"
    | "unknown";
  /** Name of the current step (e.g., "Implement Service Layer"); null if not parsed. */
  currentStepName: string | null;
  /** Number of the current step; null if not parsed. */
  currentStepNumber: number | null;
  /** Total number of steps in the task. */
  totalSteps: number;
  /** Count of checked checkboxes across all steps. */
  totalChecked: number;
  /** Count of all checkboxes across all steps. */
  totalItems: number;
  /** Whether the lane session is alive. */
  sessionAlive: boolean;
  /** Whether the .DONE file was found. */
  doneFileFound: boolean;
  /** Why the task is considered stalled; null if not stalled. */
  stallReason: string | null;
  /** Epoch ms of the last known STATUS.md modification. */
  lastHeartbeat: number | null;
  /** Epoch ms at which this snapshot was taken. */
  observedAt: number;
  /** Reason string when STATUS.md could not be read. */
  parseError: string | null;
  /** Worker iteration number read from STATUS.md. */
  iteration: number;
  /** Review counter read from STATUS.md. */
  reviewCounter: number;
}
1023
+
/**
 * Monitoring snapshot for one lane, aggregating its task-level snapshots.
 */
export interface LaneMonitorSnapshot {
  /** Lane identifier (e.g., "lane-1"). */
  laneId: string;
  /** Lane number (1-indexed). */
  laneNumber: number;
  /** Lane session name (e.g., "orch-lane-1"). */
  sessionName: string;
  /** Whether the lane session is alive right now. */
  sessionAlive: boolean;
  /** ID of the task currently executing; null if the lane is idle/complete. */
  currentTaskId: string | null;
  /** Snapshot of the current task; null if there is no current task. */
  currentTaskSnapshot: TaskMonitorSnapshot | null;
  /** IDs of tasks that have completed (succeeded). */
  completedTasks: string[];
  /** IDs of tasks that failed or stalled. */
  failedTasks: string[];
  /** IDs of tasks that have not started yet. */
  remainingTasks: string[];
}
1047
+
/**
 * Monitoring state aggregated across every lane.
 *
 * Primary data contract consumed by:
 * - Step 4 (wave execution loop), for failure policy decisions
 * - Step 6 (dashboard widget), for rendering
 */
export interface MonitorState {
  /** One snapshot per lane. */
  lanes: LaneMonitorSnapshot[];
  /** Overall progress: tasks done (out of `tasksTotal`). */
  tasksDone: number;
  /** Overall progress: tasks failed. */
  tasksFailed: number;
  /** Overall progress: total task count. */
  tasksTotal: number;
  /** Current wave number. */
  waveNumber: number;
  /** Number of poll cycles completed so far. */
  pollCount: number;
  /** Epoch ms of the last poll. */
  lastPollTime: number;
  /** Whether all lanes have reached a terminal state. */
  allTerminal: boolean;
}
1071
+
/**
 * Per-task mtime bookkeeping used for stall detection.
 *
 * Records when the task was first observed (for the startup grace
 * period), the last known STATUS.md mtime, and the stall timer state.
 */
export interface MtimeTracker {
  /** Task ID. */
  taskId: string;
  /** Epoch ms when this task was first observed running. */
  firstObservedAt: number;
  /** Whether STATUS.md has been read successfully at least once. */
  statusFileSeenOnce: boolean;
  /** Last known STATUS.md mtime (epoch ms); null if never read. */
  lastMtime: number | null;
  /** Epoch ms when the stall timer started (mtime stopped changing). */
  stallTimerStart: number | null;
}
1090
+
1091
+ // ── Wave Execution Types & Contracts ─────────────────────────────────
1092
+
1093
+ /**
1094
+ * Failure policy action matrix.
1095
+ *
1096
+ * Defines what happens to tasks in different states when a failure occurs,
1097
+ * depending on the configured failure policy.
1098
+ *
1099
+ * | Task State | skip-dependents | stop-wave | stop-all |
1100
+ * |---------------|--------------------------|------------------------|---------------------------|
1101
+ * | In-flight | Continue running | Continue running | Kill immediately |
1102
+ * | Queued (lane) | Continue if not dependent| Skip remaining in lane | Skip remaining in lane |
1103
+ * | Future waves | Prune transitive deps | Don't start next wave | Don't start any more |
1104
+ *
1105
+ * Ownership contract:
1106
+ * - executeLane() is source-of-truth for terminal task status
1107
+ * - monitorLanes() runs as sibling async loop, can kill stalled sessions
1108
+ * - executeWave() coordinates both and applies policy
1109
+ * - Monitor's stall-kill does NOT conflict with executeLane() because
1110
+ * executeLane() polls session liveness and will see the killed session
1111
+ */
1112
+
/**
 * Outcome of executing one wave.
 *
 * Consumed by:
 * - Step 5 (/orch command), for wave-to-wave progression decisions
 * - Step 6 (dashboard widget), for rendering wave summaries
 */
export interface WaveExecutionResult {
  /** Wave number (1-indexed). */
  waveIndex: number;
  /** Epoch ms when wave execution started. */
  startedAt: number;
  /** Epoch ms when wave execution ended. */
  endedAt: number;
  /** Execution result of each lane. */
  laneResults: LaneExecutionResult[];
  /** The failure policy that was configured. */
  policyApplied: "skip-dependents" | "stop-wave" | "stop-all";
  /** Whether the policy caused the wave to stop early. */
  stoppedEarly: boolean;
  /** IDs of tasks that failed (including stalled). */
  failedTaskIds: string[];
  /** IDs of tasks that were skipped (due to pause, prior failure, or policy). */
  skippedTaskIds: string[];
  /** IDs of tasks that succeeded. */
  succeededTaskIds: string[];
  /** IDs of tasks blocked for future waves (transitive dependents of failed tasks). */
  blockedTaskIds: string[];
  /** Number of lanes used. */
  laneCount: number;
  /** Overall status of the wave. */
  overallStatus: "succeeded" | "failed" | "partial" | "aborted";
  /** Final monitor state snapshot; null if monitoring was never started. */
  finalMonitorState: MonitorState | null;
  /** Lanes allocated for this wave (preserved for merge and cleanup). */
  allocatedLanes: AllocatedLane[];
  /**
   * Structured allocation error, set when lane provisioning failed.
   * Null when allocation succeeded or the wave failed for other reasons.
   * Tier 0 uses it to detect stale worktree failures and retry.
   * @since TP-039
   */
  allocationError?: {
    code: AllocationErrorCode;
    message: string;
    details?: string;
  } | null;
}
1161
+
1162
+ // ── Orchestrator Runtime State ───────────────────────────────────────
1163
+
/**
 * Runtime phase of an orchestrator batch execution.
 *
 * State machine:
 *   idle → planning → executing → completed
 *                               → failed
 *                               → stopped  (stop-wave/stop-all policy triggered)
 *                               → paused   (via /orch-pause)
 *   Any active state → idle  (via cleanup after completion/failure)
 *
 * NOTE(review): `launching` and `merging` are declared phases that the
 * diagram above does not place. Presumably `launching` precedes
 * `planning` and `merging` follows `executing` — confirm against the
 * engine before relying on that.
 */
export type OrchBatchPhase =
  | "idle"
  | "launching"
  | "planning"
  | "executing"
  | "merging"
  | "paused"
  | "stopped"
  | "completed"
  | "failed";
1184
+
/**
 * Runtime state of one batch execution.
 *
 * The primary state object. It:
 * - tracks progress across waves for the /orch command
 * - is consumed by Step 6 (dashboard widget) for rendering
 * - carries the pauseSignal used by /orch-pause
 * - accumulates wave results for the summary
 */
export interface OrchBatchRuntimeState {
  /** Current execution phase. */
  phase: OrchBatchPhase;
  /** Unique batch identifier (timestamp format, e.g., "20260308T214300"). */
  batchId: string;
  /** Branch that was active when /orch started — base for worktrees and merge target. */
  baseBranch: string;
  /** Orchestrator-managed branch name (e.g., 'orch/henry-20260318T140000'). Empty = legacy mode (merge into baseBranch directly). */
  orchBranch: string;
  /** Workspace execution mode (v2). Defaults to "repo" for backward compatibility. */
  mode: WorkspaceMode;
  /** Shared pause signal — set by /orch-pause, read by executeLane/executeWave. */
  pauseSignal: { paused: boolean };
  /** Every wave result so far, in order (grows as waves complete). */
  waveResults: WaveExecutionResult[];
  /** Current wave index (0-based into the waves array, -1 if not started). */
  currentWaveIndex: number;
  /** Total number of waves planned (segment rounds — internal). */
  totalWaves: number;
  /**
   * Number of dependency-driven task-level waves (TP-166).
   * Drives the operator-facing "Wave X of Y" display. When undefined,
   * `totalWaves` is used instead for backward compatibility.
   */
  taskLevelWaveCount?: number;
  /**
   * Maps each segment round index (0-based) to the index (0-based) of
   * its parent task-level wave. Updated when continuation rounds are
   * inserted. Used with `resolveDisplayWaveNumber()` for correct
   * display. (TP-166)
   */
  roundToTaskWave?: number[];
  /** Task IDs blocked for future waves (from the skip-dependents policy). */
  blockedTaskIds: Set<string>;
  /** Epoch ms when the batch started. */
  startedAt: number;
  /** Epoch ms when the batch ended (null if still running). */
  endedAt: number | null;
  /** Total number of tasks in the batch. */
  totalTasks: number;
  /** Number of tasks completed successfully. */
  succeededTasks: number;
  /** Number of tasks that failed. */
  failedTasks: number;
  /** Number of tasks skipped. */
  skippedTasks: number;
  /** Number of tasks blocked (transitive dependents of failures). */
  blockedTasks: number;
  /** Error messages for display. */
  errors: string[];
  /** Lanes allocated for the current wave (for the session registry). */
  currentLanes: AllocatedLane[];
  /** Dependency graph of the batch (for skip-dependents computation). */
  dependencyGraph: DependencyGraph | null;
  /** Merge results accumulated across all waves. */
  mergeResults: MergeWaveResult[];
  /**
   * v3 resilience state carried forward across resume cycles.
   * Populated from persisted state on resume; defaults are used for new batches.
   */
  resilience?: ResilienceState;
  /**
   * v3 diagnostics state carried forward across resume cycles.
   * Populated from persisted state on resume; defaults are used for new batches.
   */
  diagnostics?: BatchDiagnostics;
  /**
   * v4 segment records carried forward across resume cycles (TP-081).
   * Populated from persisted state on resume; empty for new batches
   * and repo-mode batches.
   */
  segments?: PersistedSegmentRecord[];
  /**
   * Unknown top-level fields found in loaded persisted state.
   * Carried forward so they survive serialization roundtrips.
   */
  _extraFields?: Record<string, unknown>;
}
1271
+
/**
 * One entry of the session registry used by the /orch-sessions command.
 */
export interface OrchestratorSessionEntry {
  /** Lane session name (e.g., "orch-lane-1"). */
  sessionName: string;
  /** Lane ID (e.g., "lane-1"). */
  laneId: string;
  /** ID of the task currently running, if tracked. */
  taskId: string | null;
  /** Session status. */
  status: "alive" | "dead";
  /** Path of the lane's worktree. */
  worktreePath: string;
  /** Command the user can run to attach to the session. */
  attachCmd: string;
}
1289
+
/**
 * Session registry: session name → metadata for that session.
 */
export type OrchestratorSessionRegistry = Map<string, OrchestratorSessionEntry>;
1294
+
1295
+ // ── Batch ID Generation ──────────────────────────────────────────────
1296
+
/**
 * Generate a batch ID from a timestamp.
 *
 * Format: "YYYYMMDDTHHMMSS" (e.g., "20260308T214300"), built from the
 * local-time fields of `now`. The ID has one-second resolution, so two
 * calls within the same second return the same ID.
 *
 * @param now - Timestamp to encode. Defaults to the current time; pass
 *   an explicit value for deterministic output (e.g. in tests).
 * @returns The formatted batch ID.
 */
export function generateBatchId(now: Date = new Date()): string {
  const two = (n: number): string => String(n).padStart(2, "0");
  // Pad the year as well so the ID keeps its fixed 15-character width.
  const year = String(now.getFullYear()).padStart(4, "0");
  const datePart = `${year}${two(now.getMonth() + 1)}${two(now.getDate())}`;
  const timePart = `${two(now.getHours())}${two(now.getMinutes())}${two(now.getSeconds())}`;
  return `${datePart}T${timePart}`;
}
1306
+
/**
 * Build a brand-new batch runtime state.
 *
 * Every call returns a new object with its own collections: phase
 * "idle", mode "repo", empty identifiers, zeroed counters, nothing
 * started (`currentWaveIndex` is -1) and no dependency graph.
 *
 * @returns A fresh, independent OrchBatchRuntimeState.
 */
export function freshOrchBatchState(): OrchBatchRuntimeState {
  const initial: OrchBatchRuntimeState = {
    // Identity and mode
    phase: "idle",
    batchId: "",
    baseBranch: "",
    orchBranch: "",
    mode: "repo",
    pauseSignal: { paused: false },
    // Wave progress
    waveResults: [],
    currentWaveIndex: -1,
    totalWaves: 0,
    blockedTaskIds: new Set<string>(),
    // Timing
    startedAt: 0,
    endedAt: null,
    // Task counters
    totalTasks: 0,
    succeededTasks: 0,
    failedTasks: 0,
    skippedTasks: 0,
    blockedTasks: 0,
    // Accumulated diagnostics and results
    errors: [],
    currentLanes: [],
    dependencyGraph: null,
    mergeResults: [],
  };
  return initial;
}
1335
+
1336
+ // ── Merge Types ──────────────────────────────────────────────────────
1337
+
/**
 * The statuses a merge result may carry.
 * Matches the contract in .pi/agents/task-merger.md.
 */
export type MergeResultStatus =
  | "SUCCESS"
  | "CONFLICT_RESOLVED"
  | "CONFLICT_UNRESOLVED"
  | "BUILD_FAILURE";

/** Every valid status string, for validating values at runtime. */
export const VALID_MERGE_STATUSES: ReadonlySet<string> = new Set<string>([
  "SUCCESS",
  "CONFLICT_RESOLVED",
  "CONFLICT_UNRESOLVED",
  "BUILD_FAILURE",
]);
1355
+
/** One conflict entry reported in a merge result. */
export interface MergeConflict {
  /** The conflicting file. */
  file: string;
  /** Conflict type, as reported in the merge result. */
  type: string;
  /** Whether the conflict was resolved. */
  resolved: boolean;
  /** Optional description of the resolution. */
  resolution?: string;
}
1363
+
/** Verification outcome reported in a merge result. */
export interface MergeVerification {
  /** Whether verification was run. */
  ran: boolean;
  /** Whether verification passed. */
  passed: boolean;
  /** Output captured from verification. */
  output: string;
}
1370
+
/**
 * The merge result JSON that the merge agent writes.
 * Matches the schema in .pi/agents/task-merger.md § Result File Format.
 * Field names are snake_case because they mirror that JSON schema.
 */
export interface MergeResult {
  /** Overall merge status. */
  status: MergeResultStatus;
  /** Branch that was merged from. */
  source_branch: string;
  /** Branch that was merged into. */
  target_branch: string;
  /** The merge commit, as reported by the merge agent. */
  merge_commit: string;
  /** Conflicts reported for the merge. */
  conflicts: MergeConflict[];
  /** Verification outcome. */
  verification: MergeVerification;
}
1383
+
/**
 * Orchestrator-side result of comparing one lane's verification run
 * against the baseline. Populated when verification baseline
 * fingerprinting is enabled (testing.commands configured).
 */
export interface VerificationBaselineResult {
  /** Whether the baseline comparison was performed. */
  performed: boolean;
  /** Number of new failures (not present in the baseline). */
  newFailureCount: number;
  /** Number of pre-existing failures (also present in the baseline). */
  preExistingCount: number;
  /** Number of failures that disappeared (fixed by the merge). */
  fixedCount: number;
  /** Classification: "pass" (no new failures), "verification_new_failure", "flaky_suspected". */
  classification: "pass" | "verification_new_failure" | "flaky_suspected";
  /** Human-readable summary of the new failures (truncated). */
  newFailureSummary: string;
  /** Whether a flaky re-run was performed. */
  flakyRerunPerformed: boolean;
}
1404
+
/** Merge outcome for one lane, enriched by the orchestrator. */
export interface MergeLaneResult {
  /** Number of the lane that was merged. */
  laneNumber: number;
  /** Identifier of the lane that was merged. */
  laneId: string;
  /** Branch merged from. */
  sourceBranch: string;
  /** Branch merged into. */
  targetBranch: string;
  /** Merge result, or null when none is available. */
  result: MergeResult | null;
  /** Error text, or null when there was no error. */
  error: string | null;
  /** Duration of the lane merge, in milliseconds. */
  durationMs: number;
  /** Repo ID this lane targeted (workspace mode only); undefined in repo mode. */
  repoId?: string;
  /**
   * Orchestrator-side verification baseline result (TP-032).
   * Populated when baseline fingerprinting is enabled and a successful
   * merge occurred. Undefined when fingerprinting is not enabled or the
   * merge failed before verification.
   */
  verificationBaseline?: VerificationBaselineResult;
}
1423
+
/** Merge outcome for a whole wave. */
export interface MergeWaveResult {
  /** Index of the wave that was merged. */
  waveIndex: number;
  /** Aggregate merge status for the wave. */
  status: "succeeded" | "failed" | "partial";
  /** Merge results, one per lane. */
  laneResults: MergeLaneResult[];
  /** Number of the lane that failed, or null. */
  failedLane: number | null;
  /** Reason for the failure, or null. */
  failureReason: string | null;
  /** Total merge duration, in milliseconds. */
  totalDurationMs: number;
  /** Per-repo merge outcomes (populated in workspace mode; empty in repo mode). */
  repoResults?: RepoMergeOutcome[];
  /**
   * TP-033: True when a verification rollback failed and safe-stop was
   * triggered. The engine MUST force the `paused` phase regardless of the
   * `on_merge_failure` config, and must preserve all merge
   * worktrees/branches for manual recovery.
   */
  rollbackFailed?: boolean;
  /**
   * TP-033: Transaction records for each lane merge attempt in this wave.
   * Populated when the transactional envelope is active.
   */
  transactionRecords?: TransactionRecord[];
  /**
   * TP-033 R004-2: Errors encountered while persisting transaction
   * records. When non-empty, recovery commands in the transaction
   * records may reference files that don't exist on disk; the operator
   * should check `.pi/verification/` manually.
   */
  persistenceErrors?: string[];
}
1453
+
1454
/** Per-repo merge outcome within a wave merge. */
export interface RepoMergeOutcome {
	/** Repo ID (undefined in repo mode default group). */
	repoId: string | undefined;
	/** Merge status for this repo. */
	status: "succeeded" | "failed" | "partial";
	/** Lane results belonging to this repo. */
	laneResults: MergeLaneResult[];
	/** Failed lane number within this repo (null if all succeeded). */
	failedLane: number | null;
	/** Failure reason within this repo (null if all succeeded). */
	failureReason: string | null;
}
1467
+
1468
+ // ── Merge Transaction Types (TP-033) ─────────────────────────────────
1469
+
1470
/**
 * Status of a transactional merge attempt for a single lane.
 *
 * - `committed`: Merge succeeded, verification passed, refs advanced.
 * - `rolled_back`: Verification failed, merge commit rolled back to baseHEAD.
 * - `rollback_failed`: Rollback attempted but failed — safe-stop triggered.
 * - `merge_failed`: Merge itself failed (conflict, crash, etc.) before verification.
 *
 * @since TP-033
 */
export type TransactionStatus = "committed" | "rolled_back" | "rollback_failed" | "merge_failed";

/**
 * Transactional record for a single lane merge attempt.
 *
 * Persisted as JSON at:
 * `.pi/verification/{opId}/txn-b{batchId}-repo-{repoId}-wave-{n}-lane-{k}.json`
 *
 * Captures the complete ref state before and after merge, rollback outcome,
 * and recovery commands for safe-stop scenarios.
 *
 * @since TP-033
 */
export interface TransactionRecord {
	/** Operator ID for this batch run */
	opId: string;
	/** Batch identifier */
	batchId: string;
	/** Wave index (0-based) */
	waveIndex: number;
	/** Lane number within the wave */
	laneNumber: number;
	/** Repo ID (null in repo mode, string in workspace mode) */
	repoId: string | null;
	/** HEAD of temp branch before this lane's merge commit (rollback target) */
	baseHEAD: string;
	/** HEAD of the lane's source branch (commit being merged in) */
	laneHEAD: string;
	/** HEAD of temp branch after merge commit (null if merge failed before commit) */
	mergedHEAD: string | null;
	/** Transaction outcome */
	status: TransactionStatus;
	/** Whether a rollback was attempted */
	rollbackAttempted: boolean;
	/** Rollback outcome detail (null if rollback not attempted) */
	rollbackResult: string | null;
	/** Recovery commands emitted on rollback failure (empty array otherwise) */
	recoveryCommands: string[];
	/** ISO timestamp when transaction started */
	startedAt: string;
	/** ISO timestamp when transaction completed */
	completedAt: string;
}
1523
+
1524
+ // ── Merge Error Types ────────────────────────────────────────────────
1525
+
1526
/**
 * Error codes for merge operations.
 *
 * - MERGE_SPAWN_FAILED: Could not create merge-agent session
 * - MERGE_TIMEOUT: Merge agent did not produce result within timeout
 * - MERGE_SESSION_DIED: Merge-agent session exited without writing result
 * - MERGE_RESULT_INVALID: Result file exists but contains invalid JSON
 * - MERGE_RESULT_MISSING_FIELDS: Result JSON missing required fields
 * - MERGE_UNKNOWN_STATUS: Result has an unrecognized status value
 * - MERGE_GIT_ERROR: Git command failure during merge setup
 */
export type MergeErrorCode =
	| "MERGE_SPAWN_FAILED"
	| "MERGE_TIMEOUT"
	| "MERGE_SESSION_DIED"
	| "MERGE_RESULT_INVALID"
	| "MERGE_RESULT_MISSING_FIELDS"
	| "MERGE_UNKNOWN_STATUS"
	| "MERGE_GIT_ERROR";

/**
 * Typed error class for merge operations.
 *
 * Carries a machine-readable {@link MergeErrorCode} alongside the usual
 * human-readable message so callers can branch on `code`.
 */
export class MergeError extends Error {
	/** Machine-readable failure code */
	code: MergeErrorCode;

	/**
	 * @param code - Classification of the merge failure
	 * @param message - Human-readable description passed to `Error`
	 */
	constructor(code: MergeErrorCode, message: string) {
		super(message);
		// Override the default "Error" name so logs and stack traces identify the type.
		this.name = "MergeError";
		this.code = code;
	}
}
1556
+
1557
+ // ── Merge Constants ──────────────────────────────────────────────────
1558
+
1559
/**
 * Default timeout for merge agent execution (ms).
 *
 * Merge agents typically complete in 10-60 seconds. The 90-minute default
 * is deliberately generous so that verification (e.g. go build) on large
 * codebases does not trip the timeout.
 * Use config.merge.timeout_minutes to override.
 */
export const MERGE_TIMEOUT_MS = 90 * 60 * 1000;

/**
 * Polling interval for merge result file (ms).
 * Merge agents are fast; poll aggressively.
 */
export const MERGE_POLL_INTERVAL_MS = 2_000;

/**
 * Grace period after a merge-agent session exits before declaring failure (ms).
 * Allows for slow disk flush of the result file.
 */
export const MERGE_RESULT_GRACE_MS = 3_000;

/**
 * Maximum retries for reading a partially-written result file.
 * If JSON parse fails, wait and retry in case the file is still being written.
 */
export const MERGE_RESULT_READ_RETRIES = 3;

/**
 * Delay between result file read retries (ms).
 */
export const MERGE_RESULT_READ_RETRY_DELAY_MS = 1_000;

/**
 * Maximum retries for merge-agent session spawn.
 */
export const MERGE_SPAWN_RETRY_MAX = 2;

/**
 * Maximum retries for merge agent timeout (TP-038).
 *
 * When a merge agent times out, the orchestrator retries with 2× the
 * previous timeout. This allows recovery from transient slowness without
 * operator intervention.
 *
 * Retry 0: original timeout (e.g., 10 min)
 * Retry 1: 2× original (e.g., 20 min)
 * Retry 2: 4× original (e.g., 40 min)
 */
export const MERGE_TIMEOUT_MAX_RETRIES = 2;
1607
+
1608
+ // ── Merge Health Monitoring Constants (TP-056) ───────────────────────
1609
+
1610
/**
 * Polling interval for merge health monitor (ms).
 * Independent of the merge result poll — runs on its own cadence.
 * @since TP-056
 */
export const MERGE_HEALTH_POLL_INTERVAL_MS = 2 * 60 * 1000; // 2 minutes

/**
 * Threshold (ms) after which a merge session with no new output
 * is classified as "possibly stalled" and a warning event is emitted.
 * @since TP-056
 */
export const MERGE_HEALTH_WARNING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes

/**
 * Threshold (ms) after which a merge session with no new output
 * is classified as "stuck" and a stuck event is emitted.
 * @since TP-056
 */
export const MERGE_HEALTH_STUCK_THRESHOLD_MS = 20 * 60 * 1000; // 20 minutes

/**
 * Number of lines to capture from recent merge output snapshots
 * for activity detection via snapshot comparison.
 * @since TP-056
 */
export const MERGE_HEALTH_CAPTURE_LINES = 10;
1637
+
1638
+ // ── Persistent Reviewer Constants (TP-057) ───────────────────────────
1639
+
1640
/**
 * Polling interval (ms) for the `wait_for_review` tool to check for signal files.
 * Reviews take minutes; 3s latency is invisible to the user.
 * @since TP-057
 */
export const REVIEWER_POLL_INTERVAL_MS = 3_000;

/**
 * Maximum time (ms) for the `wait_for_review` tool to wait for a review signal.
 * 30 minutes — generous for long-running code reviews.
 * @since TP-057
 */
export const REVIEWER_WAIT_TIMEOUT_MS = 30 * 60 * 1000;

/**
 * Grace period (ms) after writing shutdown signal before killing the reviewer session.
 * Allows the reviewer to exit cleanly after receiving the shutdown signal.
 * @since TP-057
 */
export const REVIEWER_SHUTDOWN_GRACE_MS = 10_000;

/**
 * Signal file prefix for review requests. Full name: `.review-signal-{NNN}`
 * @since TP-057
 */
export const REVIEWER_SIGNAL_PREFIX = ".review-signal-";

/**
 * Shutdown signal filename written to .reviews/ when the task is complete.
 * @since TP-057
 */
export const REVIEWER_SHUTDOWN_SIGNAL = ".review-shutdown";
1672
+
1673
+ // ── Merge Health Event Types (TP-056) ────────────────────────────────
1674
+
1675
/**
 * Health classification for a merge session.
 *
 * - `healthy`: Session alive, output changing
 * - `warning`: Session alive, no new output for MERGE_HEALTH_WARNING_THRESHOLD_MS
 * - `dead`: Session gone, no result file
 * - `stuck`: Session alive, no new output for MERGE_HEALTH_STUCK_THRESHOLD_MS
 *
 * @since TP-056
 */
export type MergeHealthStatus = "healthy" | "warning" | "dead" | "stuck";

/**
 * Engine event types for merge health monitoring.
 *
 * These extend the EngineEventType union and are emitted to the
 * unified events.jsonl for supervisor consumption.
 *
 * @since TP-056
 */
export type MergeHealthEventType =
	| "merge_health_warning"
	| "merge_health_dead"
	| "merge_health_stuck";

/**
 * Snapshot of a merge session's pane output at a point in time.
 * Used for activity detection by comparing successive snapshots.
 *
 * @since TP-056
 */
export interface MergeSessionSnapshot {
	/** Captured pane content (last N lines) */
	content: string;
	/** Epoch ms when the snapshot was taken */
	capturedAt: number;
}

/**
 * Per-session health tracking state.
 *
 * @since TP-056
 */
export interface MergeSessionHealthState {
	/** Merge session name */
	sessionName: string;
	/** Lane number this session belongs to */
	laneNumber: number;
	/** Last captured pane snapshot */
	lastSnapshot: MergeSessionSnapshot | null;
	/** Epoch ms when the last output change was detected */
	lastActivityAt: number;
	/** Current health classification */
	status: MergeHealthStatus;
	/** Whether a warning event has been emitted (prevent duplicates) */
	warningEmitted: boolean;
	/** Whether a stuck event has been emitted (prevent duplicates) */
	stuckEmitted: boolean;
	/** Whether a dead event has been emitted (prevent duplicates) */
	deadEmitted: boolean;
}
1736
+
1737
+ // ── Merge Retry Policy Matrix (TP-033 Step 2) ───────────────────────
1738
+
1739
/**
 * Merge-related failure classifications for the retry policy matrix.
 *
 * These are the merge-phase failure classes from the resilience roadmap §4c.
 * Task-execution classes (api_error, context_overflow, etc.) are out of scope
 * for TP-033 and handled separately in Phase 1/3.
 *
 * @since TP-033
 */
export type MergeFailureClassification =
	| "verification_new_failure"
	| "merge_conflict_unresolved"
	| "cleanup_post_merge_failed"
	| "git_worktree_dirty"
	| "git_lock_file";

/**
 * Retry policy for a single merge failure classification.
 *
 * Defines whether a failure class is retriable, the maximum retry attempts,
 * cooldown between retries (in milliseconds), and what happens on exhaustion.
 *
 * @since TP-033
 */
export interface MergeRetryPolicy {
	/** Whether this failure class can be retried automatically */
	retriable: boolean;
	/** Maximum number of retry attempts (0 for non-retriable) */
	maxAttempts: number;
	/** Cooldown delay between retries in milliseconds (0 for immediate) */
	cooldownMs: number;
	/** Action when retries are exhausted or class is non-retriable */
	exhaustionAction: "pause" | "pause_wave_gate" | "pause_escalation";
}

/**
 * Centralized retry policy matrix for merge-related failure classes.
 *
 * This is the **single source of truth** for retry behavior. Both engine.ts
 * and resume.ts consume this table through `computeMergeRetryDecision()` to
 * guarantee parity.
 *
 * Values from resilience roadmap §4c:
 *
 * | Classification              | Retry? | Max | Cooldown | Exhaustion          |
 * |-----------------------------|--------|-----|----------|---------------------|
 * | verification_new_failure    | ✅     | 1   | 0ms      | pause + diagnostic  |
 * | merge_conflict_unresolved   | ❌     | 0   | —        | pause + escalation  |
 * | cleanup_post_merge_failed   | ✅     | 1   | 2000ms   | pause (wave gate)   |
 * | git_worktree_dirty          | ✅     | 1   | 2000ms   | pause               |
 * | git_lock_file               | ✅     | 2   | 3000ms   | pause               |
 *
 * @since TP-033
 */
export const MERGE_RETRY_POLICY_MATRIX: Readonly<
	Record<MergeFailureClassification, MergeRetryPolicy>
> = {
	verification_new_failure: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 0,
		exhaustionAction: "pause",
	},
	merge_conflict_unresolved: {
		retriable: false,
		maxAttempts: 0,
		cooldownMs: 0,
		exhaustionAction: "pause_escalation",
	},
	cleanup_post_merge_failed: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 2_000,
		exhaustionAction: "pause_wave_gate",
	},
	git_worktree_dirty: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 2_000,
		exhaustionAction: "pause",
	},
	git_lock_file: {
		retriable: true,
		maxAttempts: 2,
		cooldownMs: 3_000,
		exhaustionAction: "pause",
	},
};

/**
 * All merge failure classifications as a readonly array, for iteration/validation.
 * @since TP-033
 */
export const MERGE_FAILURE_CLASSIFICATIONS: readonly MergeFailureClassification[] = [
	"verification_new_failure",
	"merge_conflict_unresolved",
	"cleanup_post_merge_failed",
	"git_worktree_dirty",
	"git_lock_file",
] as const;
1839
+
1840
+ // ── Tier 0 Watchdog Recovery Types (TP-039) ──────────────────────────
1841
+
1842
/**
 * Tier 0 recovery pattern identifiers.
 *
 * Each pattern corresponds to a failure class that the engine can
 * handle automatically without supervisor intervention.
 *
 * @since TP-039
 */
export type Tier0RecoveryPattern =
	| "worker_crash"
	| "stale_worktree"
	| "cleanup_gate"
	| "model_fallback";

/**
 * Exit classifications that are eligible for automatic Tier 0 retry.
 *
 * These are transient failures where re-running the task has a reasonable
 * chance of success. Classifications NOT in this set (e.g., user_killed,
 * stall_timeout, context_overflow, spawn_failure) indicate persistent
 * problems that won't be fixed by retrying.
 *
 * **TP-190 (#561):** `spawn_failure` is intentionally excluded — spawn-stage
 * errors (Pi CLI not findable, worktree provisioning failure, branch
 * collision) are never transient and require operator action. Retrying
 * silently would just burn the retry budget and delay the alert.
 *
 * @since TP-039
 */
export const TIER0_RETRYABLE_CLASSIFICATIONS: ReadonlySet<string> = new Set([
	"api_error",
	"model_access_error",
	"process_crash",
	"session_vanished",
]);

/**
 * Retry budget for Tier 0 recovery patterns.
 *
 * Defines max retries, cooldown between attempts, and backoff
 * multiplier for each pattern. Values from spec §5.3.
 *
 * @since TP-039
 */
export interface Tier0RetryBudget {
	/** Maximum number of retry attempts */
	maxRetries: number;
	/** Cooldown delay between retries in milliseconds */
	cooldownMs: number;
	/** Multiplier applied to cooldown on each subsequent retry */
	backoffMultiplier: number;
}

/**
 * Centralized retry budgets for Tier 0 recovery patterns.
 *
 * These are the defaults from spec §5.3. They are NOT configurable
 * via user config in Tier 0 — the supervisor (Tier 1) can override
 * them in future iterations.
 *
 * @since TP-039
 */
export const TIER0_RETRY_BUDGETS: Readonly<Record<Tier0RecoveryPattern, Tier0RetryBudget>> = {
	worker_crash: {
		maxRetries: 1,
		cooldownMs: 5_000,
		backoffMultiplier: 1.0,
	},
	stale_worktree: {
		maxRetries: 1,
		cooldownMs: 2_000,
		backoffMultiplier: 1.0,
	},
	cleanup_gate: {
		maxRetries: 1,
		cooldownMs: 2_000,
		backoffMultiplier: 1.0,
	},
	model_fallback: {
		maxRetries: 1,
		cooldownMs: 3_000,
		backoffMultiplier: 1.0,
	},
};

/**
 * All Tier 0 escalation-eligible pattern identifiers.
 *
 * Extends `Tier0RecoveryPattern` with `merge_timeout` so that
 * `EscalationContext` can describe escalations from every exhaustion
 * path, including the merge retry loop (which uses its own retry
 * matrix but still triggers Tier 0 escalation on exhaustion).
 *
 * @since TP-039
 */
export type Tier0EscalationPattern = Tier0RecoveryPattern | "merge_timeout";
// Note: model_fallback is already included via Tier0RecoveryPattern

/**
 * Context payload emitted when Tier 0 retries are exhausted and the
 * engine must escalate to the supervisor (future TP-041).
 *
 * This is the structured data that a Tier 1 supervisor agent uses to
 * decide what to do next. In Tier 0, escalation simply falls through
 * to the existing pause behaviour.
 *
 * @since TP-039
 */
export interface EscalationContext {
	/** Which recovery pattern was attempted */
	pattern: Tier0EscalationPattern;
	/** Number of retry attempts that were made (1-based) */
	attempts: number;
	/** Maximum attempts that were allowed */
	maxAttempts: number;
	/** Human-readable last error / failure reason */
	lastError: string;
	/** Task IDs affected by this failure */
	affectedTasks: string[];
	/** Suggested remediation for an operator or supervisor */
	suggestion: string;
}

/**
 * Build the task-level scope key for Tier 0 (non-merge) retry counters.
 *
 * Format: `t0:{pattern}:{taskId}:w{waveIndex}`
 * This namespace prevents collisions with merge retry scope keys
 * (which use `{taskId}:w{waveIndex}:l{laneNumber}`).
 *
 * @param pattern - Recovery pattern the counter belongs to
 * @param taskId - Task the retry counter is scoped to
 * @param waveIndex - Wave index embedded in the key
 * @returns The formatted scope key
 *
 * @since TP-039
 */
export function tier0ScopeKey(
	pattern: Tier0RecoveryPattern,
	taskId: string,
	waveIndex: number,
): string {
	return `t0:${pattern}:${taskId}:w${waveIndex}`;
}

/**
 * Wave-level scope key for Tier 0 patterns that operate at wave granularity
 * (stale_worktree, cleanup_gate).
 *
 * Format: `t0:{pattern}:w{waveIndex}`
 *
 * @param pattern - Recovery pattern the counter belongs to
 * @param waveIndex - Wave index embedded in the key
 * @returns The formatted scope key
 *
 * @since TP-039
 */
export function tier0WaveScopeKey(pattern: Tier0RecoveryPattern, waveIndex: number): string {
	return `t0:${pattern}:w${waveIndex}`;
}
1993
+
1994
+ // ── Engine Event Types (TP-040) ──────────────────────────────────────
1995
+
1996
/**
 * Engine lifecycle event types emitted during batch execution.
 *
 * These events are the primary coordination mechanism between the
 * non-blocking engine and external consumers (supervisor agent,
 * dashboard, command handlers).
 *
 * Event semantics (from spec §7.3):
 * - `wave_start` — Wave execution begins
 * - `task_complete` — Task .DONE detected (succeeded)
 * - `task_failed` — Task failed or stalled
 * - `merge_start` — Wave merge begins
 * - `merge_success` — Merge and verification pass
 * - `merge_failed` — Merge or verification fails
 * - `batch_complete` — All waves done (terminal)
 * - `batch_paused` — Batch paused (failure or manual)
 *
 * The union also carries the merge health monitoring events
 * (`merge_health_warning`, `merge_health_dead`, `merge_health_stuck`).
 *
 * Tier 0 recovery events (`tier0_recovery_attempt`, `tier0_recovery_success`,
 * `tier0_recovery_exhausted`, `tier0_escalation`) continue to use the
 * existing `Tier0EventType` from persistence.ts and share the same JSONL
 * file. Engine events extend the same stream with lifecycle context.
 *
 * @since TP-040
 */
export type EngineEventType =
	| "wave_start"
	| "task_complete"
	| "task_failed"
	| "merge_start"
	| "merge_success"
	| "merge_failed"
	| "merge_health_warning"
	| "merge_health_dead"
	| "merge_health_stuck"
	| "batch_complete"
	| "batch_paused";
2032
+
2033
/**
 * Structured engine event written to `.pi/supervisor/events.jsonl`.
 *
 * Shares the same JSONL file as Tier 0 events, with a consistent
 * base payload (`timestamp`, `batchId`, `waveIndex`) for uniform
 * consumption by the supervisor agent.
 *
 * Design: follows reviewer suggestion (R001) to use a shared base
 * payload and extend the existing event-writing infrastructure rather
 * than introducing a parallel writer.
 *
 * @since TP-040
 */
export interface EngineEvent {
	/** ISO 8601 timestamp */
	timestamp: string;
	/** Engine event type */
	type: EngineEventType;
	/** Batch identifier */
	batchId: string;
	/** Wave index (0-based, -1 if not wave-scoped) */
	waveIndex: number;
	/** Current batch phase at event emission time */
	phase: OrchBatchPhase;

	// ── Event-specific fields (all optional) ─────────────────────

	/** Task IDs in the wave (for wave_start) */
	taskIds?: string[];
	/** Number of lanes used (for wave_start, merge_start) */
	laneCount?: number;
	/** Task ID (for task_complete, task_failed) */
	taskId?: string;
	/** Task execution duration in milliseconds (for task_complete, task_failed) */
	durationMs?: number;
	/** Task outcome summary (for task_complete) */
	outcome?: string;
	/** Failure reason (for task_failed, merge_failed, batch_paused) */
	reason?: string;
	/** Whether partial progress was preserved (for task_failed) */
	partialProgress?: boolean;
	/** Lane number (for merge_failed) */
	laneNumber?: number;
	/** Merge error details (for merge_failed) */
	error?: string;
	/** Number of merge test verifications (for merge_success) */
	testCount?: number;
	/** Wave count for total waves (for merge_success) */
	totalWaves?: number;

	// ── Batch summary fields (for batch_complete, batch_paused) ──

	/** Total succeeded tasks (for batch_complete) */
	succeededTasks?: number;
	/** Total failed tasks (for batch_complete, batch_paused) */
	failedTasks?: number;
	/** Total skipped tasks (for batch_complete) */
	skippedTasks?: number;
	/** Total blocked tasks (for batch_complete) */
	blockedTasks?: number;
	/** Batch duration in milliseconds (for batch_complete) */
	batchDurationMs?: number;

	// ── Merge health monitoring fields (TP-056) ──────────────────

	/** Merge session name (for merge_health_* events) */
	sessionName?: string;
	/** Merge health status classification (for merge_health_* events) */
	healthStatus?: MergeHealthStatus;
	/** Minutes since last activity (for merge_health_warning, merge_health_stuck) */
	stalledMinutes?: number;
}
2105
+
2106
/**
 * Callback type for engine event consumers.
 *
 * The command handler (extension.ts) subscribes to this to receive
 * real-time engine state transitions. In the non-blocking architecture
 * (Step 2), this is the primary way the caller observes engine progress
 * instead of awaiting the return value.
 *
 * The callback is invoked synchronously in the engine's event loop.
 * Consumers MUST NOT perform blocking I/O in the callback.
 *
 * @param event - The engine event being delivered
 *
 * @since TP-040
 */
export type EngineEventCallback = (event: EngineEvent) => void;
2120
+
2121
+ // ── Supervisor Alert Types (TP-076) ──────────────────────────────────
2122
+
2123
/**
 * Alert category for supervisor notifications.
 *
 * Matches the alert categories in the autonomous supervisor spec:
 * - `task-failure`: A task failed after deterministic recovery was exhausted
 * - `merge-failure`: Wave merge failed and batch paused
 * - `batch-complete`: Batch finished (all waves done)
 * - `agent-message`: Runtime mailbox reply/escalation from a running agent
 * - `worker-exit-intercept`: NOTE(review): semantics not documented here —
 *   presumably raised when a worker exit is intercepted; confirm against
 *   the emitter before relying on this description
 * - `segment-expansion-requested`: Worker requested dynamic segment expansion
 * - `segment-expansion-approved`: Engine approved an expansion request
 * - `segment-expansion-rejected`: Engine rejected/discarded an expansion request
 *
 * Note: `stall` detection is deferred to a future phase (requires
 * last-activity tracking not yet built).
 *
 * @since TP-076
 */
export type SupervisorAlertCategory =
	| "task-failure"
	| "merge-failure"
	| "batch-complete"
	| "agent-message"
	| "worker-exit-intercept"
	| "segment-expansion-requested"
	| "segment-expansion-approved"
	| "segment-expansion-rejected";
2149
+
2150
/**
 * Task-level snapshot of a task's ordered segments and their statuses.
 *
 * Carried on supervisor alerts through
 * `SupervisorAlertContext.segmentFrontier` for task-failure diagnosis.
 */
export interface SupervisorSegmentFrontierSnapshot {
	/** Parent task identifier */
	taskId: string;
	/** Total number of ordered segments for the task */
	totalSegments: number;
	/** Number of segments that reached a terminal status */
	terminalSegments: number;
	/** Active (or most recently active) segment ID */
	activeSegmentId: string | null;
	/** Segment-level execution snapshot in deterministic order */
	segments: Array<{
		segmentId: string;
		repoId: string;
		status: PersistedSegmentStatus;
		dependsOnSegmentIds: string[];
	}>;
}

/**
 * Structured context payload for supervisor alerts.
 *
 * All fields are IPC-serializable (no functions, no circular refs, no Maps/Sets).
 * Each alert category populates the relevant subset of optional fields.
 *
 * @since TP-076
 */
export interface SupervisorAlertContext {
	/** Task ID (for task-failure alerts) */
	taskId?: string;
	/** Segment ID (for segment-aware task-failure alerts) */
	segmentId?: string;
	/** Repo ID associated with the failure (task segment or merge target) */
	repoId?: string;
	/** Lane ID, e.g., "lane-1" (for task-failure alerts) */
	laneId?: string;
	/** Lane number (for task-failure and merge-failure alerts) */
	laneNumber?: number;
	/** Wave index, 0-based (for merge-failure and batch-complete alerts) */
	waveIndex?: number;
	/** Exit reason string (for task-failure alerts) */
	exitReason?: string;
	/**
	 * Structured exit category for task-failure alerts.
	 *
	 * Mirrors `LaneTaskOutcome.exitDiagnostic.classification` for IPC
	 * consumption by the supervisor. Optional for backward compatibility
	 * — absent when the engine produces a task-failure alert without
	 * structured diagnostic data.
	 *
	 * Notable values consumed by the supervisor playbook:
	 * - `"spawn_failure"` (TP-190, #561): worker process never spawned
	 *   (Pi CLI not findable, worktree provisioning error, etc.). Never
	 *   transient — the playbook MUST escalate immediately rather than
	 *   retry. When the post-wave phase-transition logic detects an
	 *   all-spawn-failed wave it also flips `batchState.phase` to
	 *   `"failed"`; that transition is independent of this alert.
	 *
	 * @since TP-190 (#561)
	 */
	exitCategory?: ExitClassification;
	/** Segment frontier snapshot for task-failure diagnosis */
	segmentFrontier?: SupervisorSegmentFrontierSnapshot;
	/** Agent ID (for agent-message alerts) */
	agentId?: string;
	/** Mailbox message ID (for agent-message alerts) */
	messageId?: string;
	/** Segment expansion request ID (for segment-expansion alerts) */
	expansionRequestId?: string;
	/** Whether partial progress was preserved (for task-failure alerts) */
	partialProgress?: boolean;
	/** Batch progress summary */
	batchProgress?: {
		succeededTasks: number;
		failedTasks: number;
		skippedTasks: number;
		blockedTasks: number;
		totalTasks: number;
		currentWave: number;
		totalWaves: number;
	};
	/** Merge failure reason (for merge-failure alerts) */
	mergeError?: string;
	/** Batch duration in milliseconds (for batch-complete alerts) */
	batchDurationMs?: number;
}
2235
+
2236
/**
 * Structured supervisor alert message.
 *
 * Emitted by the engine (child process) via IPC when the supervisor
 * needs to be notified of an event requiring attention or acknowledgement.
 *
 * Design:
 * - All fields are plain JSON-serializable values (IPC-safe).
 * - `category` determines the alert type and which `context` fields are populated.
 * - `summary` is a pre-formatted, human-readable string suitable for direct
 *   display to the supervisor LLM as a conversation message.
 * - `context` provides structured data for programmatic consumption.
 *
 * @since TP-076
 */
export interface SupervisorAlert {
	/** Alert category — determines handling behavior */
	category: SupervisorAlertCategory;
	/** Human-readable summary suitable for display as a chat message */
	summary: string;
	/** Structured context data (all fields IPC-serializable) */
	context: SupervisorAlertContext;
}
2259
+
2260
/**
 * Callback type for supervisor alert emission.
 *
 * The engine (child process) calls this when it needs to alert the
 * supervisor about a significant event. The main thread handler
 * converts the alert into a `sendUserMessage` call to wake the
 * supervisor LLM.
 *
 * @param alert - The alert being emitted
 *
 * @since TP-076
 */
export type SupervisorAlertCallback = (alert: SupervisorAlert) => void;
2271
+
2272
/**
 * Describes a lane that has just entered a terminal state.
 *
 * Produced at the no-progress kill and hard-fail decision points. The
 * supervisor process uses it to mark the lane as terminated and to discard
 * any alerts still queued for that lane (see {@link LaneTerminatedCallback}).
 *
 * @since TP-187 (#538)
 */
export interface LaneTerminatedInfo {
	/** Number of the lane that terminated. */
	laneNumber: number;
	/** Identifier of the agent associated with the lane. */
	agentId: string;
	/** Identifier of the batch the lane belongs to. */
	batchId: string;
	/** Time of termination. NOTE(review): presumably epoch ms like other timestamps in this file — confirm. */
	terminatedAt: number;
	/** Why the lane was terminated. */
	reason: "no-progress-kill" | "hard-fail" | "supervisor-takeover";
}
2288
+
2289
/**
 * Handler called once a lane has reached a terminal state.
 *
 * @since TP-187 (#538)
 */
export type LaneTerminatedCallback = (info: LaneTerminatedInfo) => void;
2295
+
2296
/**
 * Capture the batch's current progress counters as a plain snapshot.
 *
 * Pure function: it only reads counters from the given runtime state and
 * copies them into the IPC-serializable shape used by
 * `SupervisorAlertContext.batchProgress`.
 *
 * @param batchState - Runtime state to read the counters from.
 * @returns Snapshot of the task counters and wave position; `currentWave`
 *   is the 1-based form of `currentWaveIndex`.
 *
 * @since TP-076
 */
export function buildBatchProgressSnapshot(
	batchState: OrchBatchRuntimeState,
): NonNullable<SupervisorAlertContext["batchProgress"]> {
	const {
		succeededTasks,
		failedTasks,
		skippedTasks,
		blockedTasks,
		totalTasks,
		currentWaveIndex,
		totalWaves,
	} = batchState;

	return {
		succeededTasks,
		failedTasks,
		skippedTasks,
		blockedTasks,
		totalTasks,
		// The runtime index is 0-based; the snapshot is meant for display.
		currentWave: currentWaveIndex + 1,
		totalWaves,
	};
}
2318
+
2319
/**
 * Assemble the segment frontier of one task for supervisor failure alerts.
 *
 * @param taskId - Task whose segments are being summarized.
 * @param segmentIds - Ordered segment IDs of the task; entries that are not
 *   non-blank strings are ignored.
 * @param activeSegmentId - Segment believed to be executing; used only when
 *   it appears in `segmentIds`.
 * @param persistedSegments - Persisted segment records; only records whose
 *   `taskId` matches are consulted.
 * @param preferredSegmentId - Fallback active segment, used when
 *   `activeSegmentId` is missing or not one of the task's segments.
 * @returns The snapshot, or `undefined` when the task has no usable segment IDs.
 */
export function buildSupervisorSegmentFrontierSnapshot(
	taskId: string,
	segmentIds: string[] | undefined,
	activeSegmentId: string | null | undefined,
	persistedSegments: PersistedSegmentRecord[] | undefined,
	preferredSegmentId?: string | null,
): SupervisorSegmentFrontierSnapshot | undefined {
	if (!Array.isArray(segmentIds)) return undefined;

	// Keep only well-formed IDs, preserving the caller's ordering.
	const knownIds: string[] = [];
	for (const candidate of segmentIds) {
		if (typeof candidate === "string" && candidate.trim().length > 0) {
			knownIds.push(candidate);
		}
	}
	if (knownIds.length === 0) return undefined;

	// Index this task's persisted records by segment ID (a later record wins).
	const recordsById = new Map<string, PersistedSegmentRecord>();
	(persistedSegments ?? []).forEach((record) => {
		if (record && record.taskId === taskId) {
			recordsById.set(record.segmentId, record);
		}
	});

	// The explicit active segment takes precedence over the preferred one;
	// either is honoured only if it belongs to this task's segment list.
	let currentSegmentId: string | null = null;
	if (activeSegmentId && knownIds.includes(activeSegmentId)) {
		currentSegmentId = activeSegmentId;
	} else if (preferredSegmentId && knownIds.includes(preferredSegmentId)) {
		currentSegmentId = preferredSegmentId;
	}

	let terminalSegments = 0;
	const segments = knownIds.map((segmentId) => {
		const record = recordsById.get(segmentId);

		// Without a persisted status, infer it from the resolved active segment.
		let status: PersistedSegmentStatus;
		if (record?.status != null) {
			status = record.status;
		} else {
			status = currentSegmentId === segmentId ? "running" : "pending";
		}

		const entry = {
			segmentId,
			repoId: record ? parseSegmentIdRepo(record) : "unknown",
			status,
			dependsOnSegmentIds: record?.dependsOnSegmentIds ?? [],
		};

		switch (status) {
			case "succeeded":
			case "failed":
			case "stalled":
			case "skipped":
				terminalSegments += 1;
				break;
			default:
				break;
		}
		return entry;
	});

	return {
		taskId,
		totalSegments: segments.length,
		terminalSegments,
		activeSegmentId: currentSegmentId,
		segments,
	};
}
2381
+
2382
/**
 * Produce the fields shared by every engine event.
 *
 * Centralizing this keeps field population uniform across emit sites; it is
 * the engine-event counterpart of `buildTier0EventBase()` for Tier 0 events.
 *
 * @param type - Engine event type.
 * @param batchId - Batch the event belongs to.
 * @param waveIndex - Wave index to record on the event.
 * @param phase - Batch phase to record on the event.
 * @returns The common event fields, stamped with the current time as an
 *   ISO-8601 string.
 *
 * @since TP-040
 */
export function buildEngineEventBase(
	type: EngineEventType,
	batchId: string,
	waveIndex: number,
	phase: OrchBatchPhase,
): Pick<EngineEvent, "timestamp" | "type" | "batchId" | "waveIndex" | "phase"> {
	const timestamp = new Date().toISOString();
	return { timestamp, type, batchId, waveIndex, phase };
}
2404
+
2405
/**
 * Result produced by the merge retry policy evaluator.
 *
 * Plain data only: callers inspect it to choose between retrying, waiting,
 * or escalating to paused.
 *
 * @since TP-033
 */
export interface MergeRetryDecision {
	/** True when the merge should be attempted again. */
	shouldRetry: boolean;
	/** Milliseconds to wait before the retry (0 when not retrying or retrying immediately). */
	cooldownMs: number;
	/** Human-readable explanation of the decision. */
	reason: string;
	/** Retry count for this scope (already incremented when a retry is granted). */
	currentAttempt: number;
	/** Upper bound on attempts for the evaluated classification. */
	maxAttempts: number;
	/** The failure classification this decision was made for. */
	classification: MergeFailureClassification;
	/** What to do once retries are exhausted (relevant when not retrying). */
	exhaustionAction: MergeRetryPolicy["exhaustionAction"];
}
2429
+
2430
/**
 * What happened during a merge retry cycle.
 *
 * `applyMergeRetryLoop()` returns one of these variants so its caller can
 * pick the matching follow-up action (continue, break, force-pause, ...).
 * Discriminate on `kind`.
 *
 * @since TP-033 R006
 */
export type MergeRetryLoopOutcome =
	| {
			/** A retry succeeded; the caller should proceed with the normal post-merge flow. */
			kind: "retry_succeeded";
			mergeResult: MergeWaveResult;
			/** Classification of the failure that was retried. */
			classification: MergeFailureClassification | null;
			/** Scope key under which the retry counter is tracked. */
			scopeKey: string;
			/** Most recent retry decision (supplies attempt/maxAttempts for event emission). */
			lastDecision: MergeRetryDecision;
	  }
	| {
			/** A safe-stop fired while retrying; the caller should break out of the wave loop. */
			kind: "safe_stop";
			mergeResult: MergeWaveResult;
			/** Classification of the failure that was retried. */
			classification: MergeFailureClassification | null;
			/** Scope key under which the retry counter is tracked. */
			scopeKey: string;
			/** Most recent retry decision (supplies attempt/maxAttempts for event emission). */
			lastDecision: MergeRetryDecision;
			errorMessage: string;
			notifyMessage: string;
	  }
	| {
			/**
			 * Retries ran out, or the failure cannot be retried; the caller should
			 * force `paused` no matter what on_merge_failure is configured to.
			 */
			kind: "exhausted";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
			lastDecision: MergeRetryDecision;
			errorMessage: string;
			notifyMessage: string;
	  }
	| {
			/**
			 * Nothing was retried (unclassifiable, or non-retriable with 0 attempts).
			 * The caller should fall through to the standard on_merge_failure policy.
			 */
			kind: "no_retry";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
	  };
2485
+
2486
/**
 * Side-effect hooks handed to `applyMergeRetryLoop()`.
 *
 * They exist because engine.ts and resume.ts need different behavior for
 * these steps.
 *
 * @since TP-033 R006
 */
export interface MergeRetryCallbacks {
	/** Run mergeWaveByRepo again and yield its new result (sync or async). */
	performMerge: () => MergeWaveResult | Promise<MergeWaveResult>;
	/** Persist the batch state, tagged with a trigger label. */
	persist: (trigger: string) => void;
	/** Write a log message, optionally with structured details. */
	log: (message: string, details?: Record<string, unknown>) => void;
	/** Send a notification at the given severity. */
	notify: (message: string, level: "info" | "warning" | "error") => void;
	/** Replace the merge result held in the tracking arrays. */
	updateMergeResult: (result: MergeWaveResult) => void;
	/** Wait out a cooldown; injectable so tests can avoid real delays. */
	sleep: (ms: number) => void | Promise<void>;
	/**
	 * Optional hook invoked just before a retry attempt runs.
	 * Receives the retry decision (classification, attempt count, cooldown) so
	 * callers can emit structured Tier 0 events at the right moment.
	 * @since TP-039 R004
	 */
	onRetryAttempt?: (decision: MergeRetryDecision) => void;
}
2513
+
2514
+ // ── View-Model Types ─────────────────────────────────────────────────
2515
+
2516
/**
 * Aggregate task counts shown on the orchestrator dashboard.
 * Data only; contains no rendering logic.
 */
export interface OrchSummaryCounts {
	completed: number;
	running: number;
	queued: number;
	failed: number;
	blocked: number;
	stalled: number;
	total: number;
}
2529
+
2530
/**
 * Render-ready data for one lane card on the dashboard.
 * Built from MonitorState LaneMonitorSnapshot plus AllocatedLane metadata.
 */
export interface OrchLaneCardData {
	laneNumber: number;
	laneId: string;
	sessionName: string;
	sessionAlive: boolean;
	currentTaskId: string | null;
	currentStepName: string | null;
	totalChecked: number;
	totalItems: number;
	completedTasks: number;
	totalLaneTasks: number;
	status: "idle" | "running" | "succeeded" | "failed" | "stalled";
	stallReason: string | null;
}
2548
+
2549
/**
 * View-model for the dashboard: runtime state mapped to render-ready data.
 *
 * It is the one data contract between OrchBatchRuntimeState + MonitorState
 * on one side and the widget rendering function on the other.
 */
export interface OrchDashboardViewModel {
	phase: OrchBatchPhase;
	batchId: string;
	/** Merge target branch, e.g. "orch/henry-20260318T140000". */
	orchBranch: string;
	/** Wave position, e.g. "2/3". */
	waveProgress: string;
	/** Elapsed time, e.g. "2m 14s". */
	elapsed: string;
	summary: OrchSummaryCounts;
	laneCards: OrchLaneCardData[];
	/** Attach instructions, e.g. "Attach via the current runtime session tool". */
	attachHint: string;
	errors: string[];
	/** Failure policy that stopped the batch, e.g. "stop-wave"; null otherwise. */
	failurePolicy: string | null;
}
2567
+
2568
+ // ── State Persistence Types (TS-009) ─────────────────────────────────
2569
+
2570
+ // ── v3 Resilience & Diagnostics Sections (TP-030) ────────────────────
2571
+
2572
/**
 * One automated repair action performed by the orchestrator.
 *
 * Repairs are deterministic strategies run when a known failure class is
 * detected (for example stale worktree cleanup or lock file removal).
 * Entries are never modified after being written; the history only grows.
 *
 * @since v3 (TP-030)
 */
export interface PersistedRepairRecord {
	/** Unique ID of the repair, e.g. "r-20260319-001". */
	id: string;
	/** Name of the applied strategy, e.g. "stale-worktree-cleanup" or "lock-file-removal". */
	strategy: string;
	/** How the repair ended. */
	status: "succeeded" | "failed" | "skipped";
	/** Repo the repair targeted (undefined in repo mode). */
	repoId?: string;
	/** Start of the repair, in epoch ms. */
	startedAt: number;
	/** End of the repair, in epoch ms. */
	endedAt: number;
}
2595
+
2596
/**
 * The resilience section of batch-state.json.
 *
 * Holds retry and repair metadata that lets the orchestrator reason about
 * retries, force-resume, and failure escalation.
 *
 * Every field is required in a canonical v3 state. When migrating from
 * v1/v2, conservative defaults are filled in (no retries, no repairs, no
 * forced resume).
 *
 * @since v3 (TP-030)
 */
export interface ResilienceState {
	/** True when the most recent resume was a --force resume. */
	resumeForced: boolean;
	/**
	 * Number of retries attempted, keyed by scope string.
	 * Scope format: `{taskId}:w{waveIndex}:l{laneNumber}` (e.g., "TP-001:w0:l1").
	 */
	retryCountByScope: Record<string, number>;
	/**
	 * Exit classification of the latest failure, or null if nothing has failed.
	 * Shares the `ExitClassification` union from diagnostics.ts.
	 */
	lastFailureClass: ExitClassification | null;
	/** Automated repair actions in chronological order. Append-only. */
	repairHistory: PersistedRepairRecord[];
}
2624
+
2625
/**
 * Compact, persisted summary of how a single task exited.
 *
 * This is the short form kept in `diagnostics.taskExits`. The complete
 * diagnostic (tokens, progress, etc.) lives in the `exitDiagnostic` field of
 * `PersistedTaskRecord`.
 *
 * The classification type is the canonical `ExitClassification` from
 * diagnostics.ts, so nothing is duplicated here.
 *
 * @since v3 (TP-030)
 */
export interface PersistedTaskExitSummary {
	/** Deterministic classification of the exit. */
	classification: ExitClassification;
	/** Estimated cost of executing the task, in USD. */
	cost: number;
	/** Wall-clock task duration, in seconds. */
	durationSec: number;
	/** Retry attempts made (0 if never retried); the field may be absent. */
	retries?: number;
}
2647
+
2648
/**
 * The batch-wide diagnostics section of batch-state.json.
 *
 * Collects per-task exit summaries and the overall batch cost, for use by
 * the dashboard and in post-mortem analysis.
 *
 * Every field is required in a canonical v3 state. When migrating from
 * v1/v2, conservative defaults are filled in (empty taskExits, zero batchCost).
 *
 * @since v3 (TP-030)
 */
export interface BatchDiagnostics {
	/**
	 * Exit summary of each task, keyed by task ID.
	 * Filled in as tasks finish during execution.
	 */
	taskExits: Record<string, PersistedTaskExitSummary>;
	/** Total batch cost in USD, accumulated over all tasks. */
	batchCost: number;
}
2668
+
2669
/**
 * Build a fresh ResilienceState holding conservative starting values.
 * Used both for migrating v1/v2 states to v3 and when a new batch is created.
 *
 * @returns A new object on every call: no forced resume, no retry counts,
 *   no recorded failure class, and an empty repair history.
 */
export function defaultResilienceState(): ResilienceState {
	const initial: ResilienceState = {
		resumeForced: false,
		retryCountByScope: {},
		lastFailureClass: null,
		repairHistory: [],
	};
	return initial;
}
2681
+
2682
/**
 * Build a fresh BatchDiagnostics holding empty/zero starting values.
 * Used both for migrating v1/v2 states to v3 and when a new batch is created.
 *
 * @returns A new object on every call, with no task exits and a batch cost of 0.
 */
export function defaultBatchDiagnostics(): BatchDiagnostics {
	const initial: BatchDiagnostics = {
		taskExits: {},
		batchCost: 0,
	};
	return initial;
}
2692
+
2693
+ // ── Schema Version & Constants ───────────────────────────────────────
2694
+
2695
/**
 * Schema version currently written to batch-state.json.
 * Bump it whenever the persisted schema changes incompatibly.
 *
 * Version history:
 *   v1 — Original schema (TS-009). Task records carry no repo-aware fields.
 *        Lane records had an optional `repoId`, but it was not validated.
 *   v2 — Repo-aware records (TP-006). Task records gain `repoId` and
 *        `resolvedRepoId`; `repoId` on lane records is formalized; a `mode`
 *        field is added to the top-level state.
 *   v3 — Resilience & diagnostics (TP-030). Adds an optional `resilience`
 *        section (retry counters, force-resume, failure classification,
 *        repair history) and an optional `diagnostics` section (per-task
 *        exit summaries, batch cost). Task records gain an optional
 *        `exitDiagnostic` next to the legacy `exitReason`.
 *        Both new sections are optional for v1/v2 migration paths.
 *   v4 — Segment execution (TP-081). Adds an optional `segments` array that
 *        persists per-segment runtime state. Task records gain optional
 *        `packetRepoId`, `packetTaskPath`, `segmentIds`, and
 *        `activeSegmentId` fields. Every v4-specific field is optional, for
 *        backward compatibility with v1/v2/v3 migration paths.
 *        When migrating from v3, `segments` defaults to `[]` and the
 *        task-level segment fields default to `undefined`.
 *
 * Compatibility policy:
 *   - loadBatchState() accepts v1, v2, v3, and v4 files; older versions are
 *     upconverted in memory through the chain v1→v2→v3→v4.
 *     Loading does NOT rewrite the file on disk.
 *   - saveBatchState() always writes v4.
 *   - Schema versions > 4 are rejected with STATE_SCHEMA_INVALID.
 */
export const BATCH_STATE_SCHEMA_VERSION = 4;
2727
+
2728
/**
 * Canonical file name of the persisted batch state.
 * The file lives at `.pi/batch-state.json`, relative to the repository root.
 */
export const BATCH_STATE_FILENAME = "batch-state.json";
2733
+
2734
/**
 * Compute where the batch state file lives for a given repository.
 *
 * @param repoRoot - Absolute path to the repository root
 * @returns `repoRoot` joined with `.pi` and the batch state file name
 */
export function batchStatePath(repoRoot: string): string {
	const parts = [repoRoot, ".pi", BATCH_STATE_FILENAME];
	return join(...parts);
}
2741
+
2742
/**
 * Codes identifying why a state persistence operation failed.
 *
 * - STATE_FILE_IO_ERROR: a filesystem read, write, or rename failed
 * - STATE_FILE_PARSE_ERROR: the file exists but does not contain valid JSON
 * - STATE_SCHEMA_INVALID: the JSON parses but fails schema validation
 *   (missing required fields, unknown enum values, version mismatch)
 */
export type StateFileErrorCode =
	| "STATE_FILE_IO_ERROR"
	| "STATE_FILE_PARSE_ERROR"
	| "STATE_SCHEMA_INVALID";
2754
+
2755
/**
 * Error raised by state file operations.
 *
 * Carries a machine-readable `code` next to the usual message, and reports
 * its `name` as "StateFileError".
 */
export class StateFileError extends Error {
	/** Machine-readable failure category. */
	code: StateFileErrorCode;

	/**
	 * @param code - Failure category to attach to the error.
	 * @param message - Human-readable description, passed on to `Error`.
	 */
	constructor(code: StateFileErrorCode, message: string) {
		super(message);
		this.code = code;
		this.name = "StateFileError";
	}
}
2765
+
2766
/**
 * Persisted execution state of one task.
 *
 * Holds everything `/orch-resume` needs in order to rebuild task progress
 * without running discovery again.
 *
 * Repo-aware fields (v2):
 *   `repoId` and `resolvedRepoId` record which repo a task is attributed to,
 *   so resume can rebuild repo routing without re-running discovery.
 *
 *   Mode semantics:
 *   - **repo mode**: both fields are `undefined`. Tasks implicitly target the
 *     single repository (cwd), so no repo routing is needed.
 *   - **workspace mode**: `repoId` is the repo ID declared in PROMPT.md (it
 *     may be `undefined` if the task declared none). `resolvedRepoId` is the
 *     final repo ID after the routing precedence chain has been applied
 *     (prompt → area → workspace default). It is always a non-empty string
 *     in workspace mode for tasks that passed routing validation.
 *
 *   Source of truth:
 *   - Allocated tasks: taken from `ParsedTask.promptRepoId` and
 *     `ParsedTask.resolvedRepoId` via `serializeBatchState()`.
 *   - Unallocated/pending tasks: taken from the same ParsedTask fields via
 *     discovery enrichment in `persistRuntimeState()`.
 */
export interface PersistedTaskRecord {
	/** Identifier of the task, e.g. "TO-014". */
	taskId: string;
	/** Lane the task was assigned to (1-indexed). */
	laneNumber: number;
	/** Name of the lane session that was used, e.g. "orch-lane-1". */
	sessionName: string;
	/** Status the task is currently in. */
	status: LaneTaskStatus;
	/** Absolute path of the task folder (holds PROMPT.md, STATUS.md). */
	taskFolder: string;
	/** Start time in epoch ms; null if the task never started. */
	startedAt: number | null;
	/** End time in epoch ms; null while the task is pending or running. */
	endedAt: number | null;
	/** True when a .DONE file was found for the task. */
	doneFileFound: boolean;
	/** Human-readable exit reason (for completed/failed tasks). */
	exitReason: string;
	/**
	 * Repo ID declared in the task's PROMPT.md metadata (v2).
	 * Undefined in repo mode, or when the task declared no repo.
	 */
	repoId?: string;
	/**
	 * Repo ID after routing precedence has been applied (v2).
	 * Undefined in repo mode. In workspace mode it is the final repo target
	 * after the prompt → area → workspace-default fallback.
	 */
	resolvedRepoId?: string;
	/**
	 * How many commits were preserved as partial progress of a failed task (TP-028).
	 * Undefined when nothing was saved (succeeded tasks, no commits, etc.).
	 * Optional so that pre-TP-028 state files remain loadable.
	 */
	partialProgressCommits?: number;
	/**
	 * Name of the saved branch that holds a failed task's partial progress (TP-028).
	 * Undefined when nothing was saved.
	 * Optional so that pre-TP-028 state files remain loadable.
	 */
	partialProgressBranch?: string;
	/**
	 * Structured exit diagnostic of the task (v3, TP-030).
	 *
	 * The canonical structured exit data; when present it is preferred over
	 * the legacy `exitReason` string. It carries deterministic classification,
	 * cost, timing, and progress metadata.
	 *
	 * Optional for backward compatibility with v1/v2 state files and for
	 * tasks that have not exited yet. Consumers should look at
	 * `exitDiagnostic` first and fall back to `exitReason` for display.
	 */
	exitDiagnostic?: TaskExitDiagnostic;
	/**
	 * ID of the repo that owns the task packet files
	 * (PROMPT.md/STATUS.md/.DONE) (v4, TP-081).
	 *
	 * In workspace mode this is the `taskPacketRepo` from routing config.
	 * Undefined in repo mode and for pre-v4 state files.
	 */
	packetRepoId?: string;
	/**
	 * Absolute path of the task folder inside the packet repo worktree (v4, TP-081).
	 *
	 * Lets resume find the packet files without re-running discovery.
	 * Undefined in repo mode and for pre-v4 state files.
	 */
	packetTaskPath?: string;
	/**
	 * IDs of the segments that belong to this task (v4, TP-081).
	 *
	 * Each entry is a segment ID string (`<taskId>::<repoId>`).
	 * Empty array for repo-mode tasks or single-repo tasks.
	 * Undefined for pre-v4 state files.
	 */
	segmentIds?: string[];
	/**
	 * ID of the segment that is executing right now (v4, TP-081).
	 *
	 * Null when no segment is active (all completed or not started).
	 * Undefined for pre-v4 state files.
	 */
	activeSegmentId?: string | null;
}
2875
+
2876
+ // ── Segment-Level Persisted State (v4, TP-081) ──────────────────────
2877
+
2878
/**
 * Execution status of a segment inside a batch.
 *
 * The state machine follows `LaneTaskStatus`, applied per segment:
 *   pending → running → succeeded
 *                     → failed
 *                     → stalled
 *   pending → skipped (prior segment failed, or task skipped)
 *
 * @since v4 (TP-081)
 */
export type PersistedSegmentStatus =
	| "pending"
	| "running"
	| "succeeded"
	| "failed"
	| "stalled"
	| "skipped";
2896
+
2897
/**
 * Persisted execution state of one segment.
 *
 * A segment is the repo-scoped unit of execution within a task; a task has
 * one or more of them (one for each repo it touches).
 *
 * Holds everything `/orch-resume` needs in order to rebuild segment-level
 * progress without running discovery again.
 *
 * @since v4 (TP-081)
 */
export interface PersistedSegmentRecord {
	/** Stable ID of the segment (`<taskId>::<repoId>`, e.g., "TP-002::api"). */
	segmentId: string;
	/** ID of the task the segment belongs to. */
	taskId: string;
	/** ID of the repo the segment targets. */
	repoId: string;
	/** Status the segment is currently in. */
	status: PersistedSegmentStatus;
	/** ID of the lane the segment ran on (e.g., "lane-1"); empty if not yet assigned. */
	laneId: string;
	/** Name of the lane session used for the segment. */
	sessionName: string;
	/** Absolute path of the worktree used for the segment. */
	worktreePath: string;
	/** Name of the Git branch checked out for the segment. */
	branch: string;
	/** Start of execution in epoch ms; null if not yet started. */
	startedAt: number | null;
	/** End of execution in epoch ms; null while pending or running. */
	endedAt: number | null;
	/** How many times the segment has been retried. */
	retries: number;
	/**
	 * IDs of the segments this one depends on (intra-task DAG edges).
	 * Empty for a task's first segment and for tasks without intra-task deps.
	 */
	dependsOnSegmentIds: string[];
	/**
	 * Structured exit diagnostic of the segment.
	 * Optional: not present until the segment has exited.
	 * Has the same `TaskExitDiagnostic` shape as in diagnostics.ts.
	 */
	exitDiagnostic?: TaskExitDiagnostic;
	/** Human-readable exit reason (legacy compat, same as at task level). */
	exitReason: string;
	/** ID of the anchor segment this one was dynamically expanded from, if any. */
	expandedFrom?: string;
	/** ID of the segment expansion request that created this segment, if any. */
	expansionRequestId?: string;
}
2949
+
2950
/**
 * Persisted configuration of one lane.
 *
 * Records the worktree/branch assignment so `/orch-resume` can reattach to
 * existing worktrees instead of allocating them again.
 *
 * Repo-aware contract (v2):
 *   `repoId` records which repository the lane targets.
 *
 *   Mode semantics:
 *   - **repo mode**: `repoId` is `undefined`. The lane's worktree is created
 *     from the single repository (cwd), and all lanes implicitly share that
 *     repo.
 *   - **workspace mode**: `repoId` is a non-empty string matching a key in
 *     `WorkspaceConfig.repos`. Every task assigned to the lane targets the
 *     same repo; lane allocation guarantees repo affinity (no lane mixes
 *     tasks from different repos).
 *
 *   Source of truth: taken from `AllocatedLane.repoId` during serialization
 *   in `serializeBatchState()`.
 */
export interface PersistedLaneRecord {
	/** Number of the lane (1-indexed). */
	laneNumber: number;
	/** Identifier of the lane, e.g. "lane-1". */
	laneId: string;
	/** Identifier of the lane session, e.g. "orch-lane-1". */
	laneSessionId: string;
	/** Absolute path of the lane's worktree directory. */
	worktreePath: string;
	/** Name of the Git branch checked out in the worktree. */
	branch: string;
	/** IDs of the tasks assigned to the lane, in execution order. */
	taskIds: string[];
	/**
	 * ID of the repo the lane targets (v2).
	 * Undefined in repo mode. In workspace mode, a non-empty string matching
	 * a key in `WorkspaceConfig.repos`.
	 */
	repoId?: string;
}
2991
+
2992
/**
 * Persisted summary of one wave's merge result.
 * Keeps only the part of MergeWaveResult that resume decisions rely on.
 */
export interface PersistedMergeResult {
	/** Index of the wave (0-based). */
	waveIndex: number;
	/** Outcome of the merge. */
	status: "succeeded" | "failed" | "partial";
	/** The lane that failed; null if all succeeded. */
	failedLane: number | null;
	/** Why the merge failed; null if all succeeded. */
	failureReason: string | null;
	/**
	 * Merge outcomes per repo (v2, TP-009).
	 * Filled in workspace mode when MergeWaveResult.repoResults is available.
	 * Undefined/absent in repo mode and in older state files; the dashboard
	 * interprets absence as a single-repo merge.
	 */
	repoResults?: PersistedRepoMergeOutcome[];
}
3013
+
3014
/**
 * Persisted merge outcome of a single repo within a wave merge.
 * A serializable subset of RepoMergeOutcome: the full MergeLaneResult objects
 * (which hold detailed merge agent result JSON) are left out so the state
 * file stays compact.
 */
export interface PersistedRepoMergeOutcome {
	/** ID of the repo. Undefined for the default group in repo mode. */
	repoId: string | undefined;
	/** Outcome of the merge for this repo. */
	status: "succeeded" | "failed" | "partial";
	/** Numbers of the lanes that took part in this repo's merge. */
	laneNumbers: number[];
	/** The lane that failed within this repo; null if all succeeded. */
	failedLane: number | null;
	/** Why the merge failed within this repo; null if all succeeded. */
	failureReason: string | null;
}
3031
+
3032
+ /**
3033
+ * Persisted batch state written to `.pi/batch-state.json`.
3034
+ *
3035
+ * This is the serialization contract for batch state persistence.
3036
+ * It captures enough information for `/orch-resume` to reconstruct
3037
+ * the orchestrator state after a terminal disconnect.
3038
+ *
3039
+ * Design decisions:
3040
+ * - `schemaVersion` enables forward-compatible rejection of old formats
3041
+ * - Phase uses the same `OrchBatchPhase` literal union as runtime state
3042
+ * - Per-task records include folder paths and session names for resume
3043
+ * - Merge results are summarized (not full MergeWaveResult) for size
3044
+ * - `updatedAt` is monotonic (epoch ms) for staleness detection
3045
+ * - `lastError` captures most recent error without PII
3046
+ *
3047
+ * v2 additions (TP-006):
3048
+ * - `mode` field captures workspace vs repo mode at batch start
3049
+ * - Task records include `repoId` and `resolvedRepoId` for repo attribution
3050
+ * - Lane records formalize `repoId` contract per mode
3051
+ * - v1 files are auto-upconverted: `mode` defaults to "repo", task/lane
3052
+ * `repoId` fields default to `undefined` (omitted from JSON)
3053
+ *
3054
+ * v3 additions (TP-030):
3055
+ * - `resilience` section (required): retry counters, force-resume intent,
3056
+ * failure classification, and repair history for automated recovery.
3057
+ * - `diagnostics` section (required): per-task exit summaries and batch cost.
3058
+ * - Task records gain optional `exitDiagnostic` (canonical structured exit
3059
+ * data alongside legacy `exitReason` string).
3060
+ * - Both sections are required in v3. Migration from v1/v2 fills
3061
+ * conservative defaults (see `defaultResilienceState()` / `defaultBatchDiagnostics()`).
3062
+ *
3063
+ * v4 additions (TP-081):
3064
+ * - `segments` array (required): per-segment execution records for multi-repo
3065
+ * task execution. Empty array in repo mode or for pre-v4 migration.
3066
+ * - Task records gain optional `packetRepoId`, `packetTaskPath`, `segmentIds`,
3067
+ * and `activeSegmentId` for segment-level tracking.
3068
+ * - Migration from v3 fills `segments` as `[]` and leaves task-level segment
3069
+ * fields as `undefined`.
3070
+ */
3071
+ export interface PersistedBatchState {
3072
+ /** Schema version — must equal BATCH_STATE_SCHEMA_VERSION (currently 4) */
3073
+ schemaVersion: number;
3074
+ /** Current batch execution phase */
3075
+ phase: OrchBatchPhase;
3076
+ /** Unique batch identifier (timestamp format) */
3077
+ batchId: string;
3078
+ /** Branch that was active when /orch started — used as base for worktrees and merge target */
3079
+ baseBranch: string;
3080
+ /** Orchestrator-managed branch name (e.g., 'orch/henry-20260318T140000'). Empty = legacy mode (merge into baseBranch directly). */
3081
+ orchBranch: string;
3082
+ /**
3083
+ * Workspace execution mode at batch start (v2).
3084
+ * - "repo": Single-repo mode (default, backward-compatible).
3085
+ * - "workspace": Multi-repo workspace mode.
3086
+ * Defaults to "repo" when loading v1 state files.
3087
+ */
3088
+ mode: WorkspaceMode;
3089
+ /** Epoch ms when batch started */
3090
+ startedAt: number;
3091
+ /** Epoch ms when state was last written */
3092
+ updatedAt: number;
3093
+ /** Epoch ms when batch ended (null if still active) */
3094
+ endedAt: number | null;
3095
+ /** Current wave index (0-based, -1 if not started) */
3096
+ currentWaveIndex: number;
3097
+ /** Total number of waves in the plan */
3098
+ totalWaves: number;
3099
+ /**
3100
+ * Number of dependency-driven task-level waves (TP-166).
3101
+ * Undefined for batches created before TP-166; falls back to totalWaves.
3102
+ */
3103
+ taskLevelWaveCount?: number;
3104
+ /**
3105
+ * Maps segment round index (0-based) to parent task-level wave (0-based).
3106
+ * Undefined for batches created before TP-166.
3107
+ */
3108
+ roundToTaskWave?: number[];
3109
+ /** Wave plan: array of arrays of task IDs per wave */
3110
+ wavePlan: string[][];
3111
+ /** Per-lane configuration records */
3112
+ lanes: PersistedLaneRecord[];
3113
+ /** Per-task execution records (all tasks across all waves) */
3114
+ tasks: PersistedTaskRecord[];
3115
+ /** Merge results for completed waves */
3116
+ mergeResults: PersistedMergeResult[];
3117
+ /** Summary counters */
3118
+ totalTasks: number;
3119
+ succeededTasks: number;
3120
+ failedTasks: number;
3121
+ skippedTasks: number;
3122
+ blockedTasks: number;
3123
+ /** Task IDs blocked for future waves (from skip-dependents) */
3124
+ blockedTaskIds: string[];
3125
+ /** Most recent error (code + message, no PII) */
3126
+ lastError: { code: string; message: string } | null;
3127
+ /** Accumulated error messages */
3128
+ errors: string[];
3129
+ /**
3130
+ * Resilience state for retry/recovery tracking (v3, TP-030).
3131
+ * Required in v3+. Migration from v1/v2 fills conservative defaults.
3132
+ */
3133
+ resilience: ResilienceState;
3134
+ /**
3135
+ * Batch-level diagnostics for cost tracking and exit summaries (v3, TP-030).
3136
+ * Required in v3+. Migration from v1/v2 fills conservative defaults.
3137
+ */
3138
+ diagnostics: BatchDiagnostics;
3139
+ /**
3140
+ * Per-segment execution records for multi-repo task execution (v4, TP-081).
3141
+ *
3142
+ * Each entry represents one repo-scoped segment of a task. In repo mode
3143
+ * or for single-repo tasks, this array is empty (segment tracking is
3144
+ * implicit via task records).
3145
+ *
3146
+ * Required in v4. Migration from v1/v2/v3 fills empty array.
3147
+ */
3148
+ segments: PersistedSegmentRecord[];
3149
+ /**
3150
+ * Unknown top-level fields captured during deserialization.
3151
+ * Preserved on roundtrip to avoid data loss from future schema extensions
3152
+ * or external tools writing additional fields.
3153
+ * Not serialized directly — merged back by `serializeBatchState()`.
3154
+ */
3155
+ _extraFields?: Record<string, unknown>;
3156
+ }
3157
+
3158
+ // ── Resume (TS-009 Step 4) ───────────────────────────────────────────
3159
+
3160
/**
 * Stable failure codes reported by the /orch-resume command.
 *
 * | Code                         | Meaning                                                      |
 * |------------------------------|--------------------------------------------------------------|
 * | `RESUME_NO_STATE`            | No batch-state.json found on disk                            |
 * | `RESUME_INVALID_STATE`       | State file exists but cannot be parsed/validated             |
 * | `RESUME_SCHEMA_MISMATCH`     | State file has an incompatible schema version                |
 * | `RESUME_PHASE_NOT_RESUMABLE` | Persisted phase does not allow resume                        |
 * | `RESUME_TMUX_UNAVAILABLE`    | Legacy session backend is unavailable for reconnection       |
 * | `RESUME_EXECUTION_FAILED`    | Resume reconciliation succeeded but execution failed         |
 */
export type ResumeErrorCode =
  | "RESUME_NO_STATE"
  | "RESUME_INVALID_STATE"
  | "RESUME_SCHEMA_MISMATCH"
  | "RESUME_PHASE_NOT_RESUMABLE"
  | "RESUME_TMUX_UNAVAILABLE"
  | "RESUME_EXECUTION_FAILED";

/**
 * Error raised for resume failures.
 *
 * Carries a {@link ResumeErrorCode} so callers can branch on `code`
 * instead of parsing the human-readable `message`.
 */
export class ResumeError extends Error {
  /** Machine-readable failure code. */
  code: ResumeErrorCode;

  /**
   * @param code - Which resume failure occurred
   * @param message - Human-readable description, forwarded to `Error`
   */
  constructor(code: ResumeErrorCode, message: string) {
    super(message);
    // Override the default "Error" name so logs identify the error class.
    this.name = "ResumeError";
    this.code = code;
  }
}
3188
+
3189
+ /**
3190
+ * Result of reconciling a single task's persisted state against live signals.
3191
+ *
3192
+ * Combines persisted status, lane-session liveness, and .DONE file presence
3193
+ * into a deterministic action for the resume engine.
3194
+ *
3195
+ * Reconciliation precedence (highest → lowest):
3196
+ * 1. .DONE file found → "mark-complete" (regardless of session state)
3197
+ * 2. Session alive + no .DONE → "reconnect" (task is still running)
3198
+ * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
3199
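+ * 4. Session dead + no .DONE + was running → "mark-failed"
+ *
+ * NOTE(review): `action` also allows "re-execute" and "pending", which this
+ * list does not rank. Presumably "re-execute" covers a dead session whose
+ * worktree still exists (compare `ResumePoint.reExecuteTaskIds`) and "pending"
+ * covers tasks that never started — confirm against the reconciliation code.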
3200
+ */
3201
+ export interface ReconciledTaskState {
3202
+ /** Task identifier */
3203
+ taskId: string;
3204
+ /** Status from the persisted state file */
3205
+ persistedStatus: LaneTaskStatus;
3206
+ /** Reconciled live status after checking signals */
3207
+ liveStatus: LaneTaskStatus;
3208
+ /** Whether the lane session is alive right now */
3209
+ sessionAlive: boolean;
3210
+ /** Whether the .DONE file was found */
3211
+ doneFileFound: boolean;
3212
+ /** Whether the lane worktree still exists on disk */
3213
+ worktreeExists: boolean;
3214
+ /** Action the resume engine should take */
3215
+ action: "reconnect" | "mark-complete" | "mark-failed" | "re-execute" | "skip" | "pending";
3216
+ }
3217
+
3218
+ /**
3219
+ * Result of resume eligibility check.
3220
+ *
3221
+ * Determines whether a persisted batch state can be resumed based on its phase.
3222
+ */
3223
+ export interface ResumeEligibility {
3224
+ /** Whether the batch can be resumed */
3225
+ eligible: boolean;
3226
+ /** Human-readable reason (for both eligible and ineligible) */
3227
+ reason: string;
3228
+ /** Persisted phase */
3229
+ phase: OrchBatchPhase;
3230
+ /** Batch ID */
3231
+ batchId: string;
3232
+ }
3233
+
3234
+ /**
3235
+ * Resume point computed from reconciled task states.
3236
+ *
3237
+ * Tells the resume engine where to start in the wave plan.
3238
+ */
3239
+ export interface ResumePoint {
3240
+ /** Wave index to resume from (0-based) */
3241
+ resumeWaveIndex: number;
3242
+ /** Task IDs confirmed completed (via .DONE or prior succeeded) */
3243
+ completedTaskIds: string[];
3244
+ /** Task IDs that still need execution */
3245
+ pendingTaskIds: string[];
3246
+ /** Task IDs confirmed failed (dead session, no .DONE) */
3247
+ failedTaskIds: string[];
3248
+ /** Task IDs with alive sessions that need reconnection */
3249
+ reconnectTaskIds: string[];
3250
+ /** Task IDs with dead sessions but existing worktrees that need re-execution */
3251
+ reExecuteTaskIds: string[];
3252
+ /**
3253
+ * Wave indexes (0-based) where all tasks are terminal but the merge
3254
+ * is missing or failed. These waves should be retried for merge only
3255
+ * (no task re-execution). Empty when all completed waves have
3256
+ * successful merges. (TP-037, Bug #102)
3257
+ */
3258
+ mergeRetryWaveIndexes: number[];
3259
+ }
3260
+
3261
+ // ── Abort (TS-009 Step 5) ────────────────────────────────────────────
3262
+
3263
+ /**
3264
+ * Abort mode: graceful (checkpoint + wait + force-kill) or hard (immediate kill).
3265
+ */
3266
+ export type AbortMode = "graceful" | "hard";
3267
+
3268
+ /**
3269
+ * Error codes for abort operations.
3270
+ *
3271
+ * - ABORT_TMUX_LIST_FAILED: Could not list legacy session records
3272
+ * - ABORT_WRAPUP_WRITE_FAILED: Failed to write wrap-up signal file(s)
3273
+ * - ABORT_KILL_FAILED: Failed to kill one or more lane sessions
3274
+ * - ABORT_STATE_DELETE_FAILED: Failed to delete batch-state.json
3275
+ */
3276
+ export type AbortErrorCode =
3277
+ | "ABORT_TMUX_LIST_FAILED"
3278
+ | "ABORT_WRAPUP_WRITE_FAILED"
3279
+ | "ABORT_KILL_FAILED"
3280
+ | "ABORT_STATE_DELETE_FAILED";
3281
+
3282
+ /**
3283
+ * Per-lane result from an abort operation.
3284
+ */
3285
+ export interface AbortLaneResult {
3286
+ /** Lane session name */
3287
+ sessionName: string;
3288
+ /** Lane ID (e.g., "lane-1") or "unknown" */
3289
+ laneId: string;
3290
+ /** Task ID if known */
3291
+ taskId: string | null;
3292
+ /** Task folder path in the worktree (for wrap-up file writing) */
3293
+ taskFolderInWorktree: string | null;
3294
+ /** Whether wrap-up files were written (graceful only) */
3295
+ wrapUpWritten: boolean;
3296
+ /** Wrap-up write error if any */
3297
+ wrapUpError: string | null;
3298
+ /** Whether the session was killed */
3299
+ sessionKilled: boolean;
3300
+ /** Whether the session exited gracefully (before force-kill) */
3301
+ exitedGracefully: boolean;
3302
+ }
3303
+
3304
+ /**
3305
+ * Overall result from an abort operation.
3306
+ */
3307
+ export interface AbortResult {
3308
+ /** Abort mode used */
3309
+ mode: AbortMode;
3310
+ /** Number of sessions found to abort */
3311
+ sessionsFound: number;
3312
+ /** Number of sessions actually killed (force-killed or graceful exit) */
3313
+ sessionsKilled: number;
3314
+ /** Number of sessions that exited gracefully (before timeout) */
3315
+ gracefulExits: number;
3316
+ /** Per-lane results */
3317
+ laneResults: AbortLaneResult[];
3318
+ /** Number of wrap-up write failures (graceful only) */
3319
+ wrapUpFailures: number;
3320
+ /** Whether batch state file was deleted */
3321
+ stateDeleted: boolean;
3322
+ /** Aggregated errors */
3323
+ errors: Array<{ code: AbortErrorCode; message: string }>;
3324
+ /** Duration of the abort operation in milliseconds */
3325
+ durationMs: number;
3326
+ }
3327
+
3328
+ /**
3329
+ * Action step in an abort plan.
3330
+ */
3331
+ export type AbortActionStep =
3332
+ | { type: "write-wrapup" }
3333
+ | { type: "poll-wait"; gracePeriodMs: number; pollIntervalMs: number }
3334
+ | { type: "kill-remaining" }
3335
+ | { type: "kill-all" };
3336
+
3337
+ /**
3338
+ * Target session with enrichment from persisted state.
3339
+ */
3340
+ export interface AbortTargetSession {
3341
+ /** Lane session name */
3342
+ sessionName: string;
3343
+ /** Lane ID from persisted state or "unknown" */
3344
+ laneId: string;
3345
+ /** Task ID from persisted state or null */
3346
+ taskId: string | null;
3347
+ /** Task folder path resolved in the worktree (for wrap-up files), or null */
3348
+ taskFolderInWorktree: string | null;
3349
+ /** Worktree path from persisted state or batch state */
3350
+ worktreePath: string | null;
3351
+ }
3352
+
3353
+ // ── Size-to-Duration Mapping ─────────────────────────────────────────
3354
+
3355
/**
 * Default duration mapping (size → minutes).
 *
 * | Size | Weight | Duration |
 * |------|--------|----------|
 * | S    | 1      | 30 min   |
 * | M    | 2      | 60 min   |
 * | L    | 4      | 120 min  |
 */
export const SIZE_DURATION_MINUTES: Record<string, number> = {
  S: 30,
  M: 60,
  L: 120,
};

/** Minutes represented by one unit of size weight (used by the fallback path). */
export const DURATION_BASE_MINUTES = 30;

/**
 * Get the estimated duration in minutes for a task size.
 *
 * Resolution order:
 * 1. An explicit entry in {@link SIZE_DURATION_MINUTES}.
 * 2. `sizeWeights[size]` × {@link DURATION_BASE_MINUTES}.
 * 3. `sizeWeights["M"]` × base, then a hard-coded weight of 2.
 *
 * Only own properties of the lookup tables are consulted. A plain
 * `table[size]` read would also see `Object.prototype` members, so a size
 * such as `"toString"` or `"constructor"` would yield a function instead of
 * a number.
 *
 * @param size - Task size label (e.g. "S", "M", "L", or a custom label)
 * @param sizeWeights - Configured size → weight table
 * @returns Estimated duration in minutes
 */
export function getTaskDurationMinutes(size: string, sizeWeights: Record<string, number>): number {
  // Own-property read: ignores anything inherited from Object.prototype.
  const ownValue = (table: Record<string, number>, key: string): number | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  const explicitMinutes = ownValue(SIZE_DURATION_MINUTES, size);
  if (explicitMinutes !== undefined) {
    return explicitMinutes;
  }

  // `||` (not `??`) is kept deliberately: a zero/NaN weight falls through to
  // the next fallback, exactly as before.
  const weight = ownValue(sizeWeights, size) || ownValue(sizeWeights, "M") || 2;
  return weight * DURATION_BASE_MINUTES;
}
3382
+
3383
+ // ── Batch History ────────────────────────────────────────────────────
3384
+
3385
+ /** Token counts for a task, wave, or batch. */
3386
+ export interface TokenCounts {
3387
+ input: number;
3388
+ output: number;
3389
+ cacheRead: number;
3390
+ cacheWrite: number;
3391
+ costUsd: number;
3392
+ }
3393
+
3394
+ /** Per-task summary for history. */
3395
+ export interface BatchTaskSummary {
3396
+ taskId: string;
3397
+ taskName: string;
3398
+ status: "succeeded" | "failed" | "skipped" | "blocked" | "stalled" | "pending";
3399
+ wave: number; // 1-based
3400
+ lane: number; // 1-based
3401
+ durationMs: number;
3402
+ tokens: TokenCounts;
3403
+ exitReason: string | null;
3404
+ }
3405
+
3406
+ /** Per-wave summary for history. */
3407
+ export interface BatchWaveSummary {
3408
+ wave: number; // 1-based
3409
+ tasks: string[]; // task IDs
3410
+ mergeStatus: "succeeded" | "failed" | "partial" | "skipped";
3411
+ durationMs: number;
3412
+ tokens: TokenCounts;
3413
+ }
3414
+
3415
+ /** Complete batch history entry — written after Phase 3 cleanup. */
3416
+ export interface BatchHistorySummary {
3417
+ batchId: string;
3418
+ status: "completed" | "partial" | "failed" | "aborted";
3419
+ startedAt: number;
3420
+ endedAt: number;
3421
+ durationMs: number;
3422
+ totalWaves: number;
3423
+ totalTasks: number;
3424
+ succeededTasks: number;
3425
+ failedTasks: number;
3426
+ skippedTasks: number;
3427
+ blockedTasks: number;
3428
+ tokens: TokenCounts;
3429
+ tasks: BatchTaskSummary[];
3430
+ waves: BatchWaveSummary[];
3431
+ /** Timestamp (ms since epoch) when the batch was integrated. Set by orch-integrate. */
3432
+ integratedAt?: number;
3433
+ }
3434
+
3435
+ /** Max number of batch history entries to retain. */
3436
+ export const BATCH_HISTORY_MAX_ENTRIES = 100;
3437
+
3438
+ // ── Workspace Mode Types ─────────────────────────────────────────────
3439
+
3440
+ /**
3441
+ * Workspace execution mode.
3442
+ *
3443
+ * Mode behavior contract:
3444
+ * - **"repo"** (default): No workspace config file present. The orchestrator
3445
+ * treats `cwd` as both the workspace root and the single repo root.
3446
+ * All existing monorepo behavior is preserved unchanged.
3447
+ * - **"workspace"**: A `.pi/orchid-workspace.yaml` file is present and
3448
+ * valid. The orchestrator runs from a non-git workspace root that
3449
+ * coordinates multiple repos and a shared task root.
3450
+ *
3451
+ * Mode determination rules:
3452
+ * 1. Workspace config file present + invalid → fatal error with actionable
3453
+ * `WorkspaceConfigError` (never silently falls back to repo mode).
3454
+ * 2. Workspace config file present + valid → workspace mode.
3455
+ * 3. No workspace config + cwd is a git repo → repo mode.
3456
+ * 4. No workspace config + cwd is not a git repo → `WORKSPACE_SETUP_REQUIRED`.
3457
+ */
3458
+ export type WorkspaceMode = "repo" | "workspace";
3459
+
3460
+ /**
3461
+ * Configuration for a single repository within a workspace.
3462
+ *
3463
+ * Each repo is identified by a stable ID (e.g., "api", "frontend")
3464
+ * that is used for routing tasks to repos and for display purposes.
3465
+ */
3466
+ export interface WorkspaceRepoConfig {
3467
+ /** Stable identifier for this repo (e.g., "api", "frontend") */
3468
+ id: string;
3469
+ /** Absolute filesystem path to the repo root (must be a git repo) */
3470
+ path: string;
3471
+ /** Optional default branch override (e.g., "develop", "main"). Falls back to repo HEAD. */
3472
+ defaultBranch?: string;
3473
+ }
3474
+
3475
+ /**
3476
+ * Routing configuration for workspace mode.
3477
+ *
3478
+ * Controls where tasks are discovered and which repo receives
3479
+ * unqualified operations.
3480
+ */
3481
+ export interface WorkspaceRoutingConfig {
3482
+ /**
3483
+ * Absolute path to the shared tasks root directory.
3484
+ * All task areas are resolved relative to this path.
3485
+ * Must exist on disk.
3486
+ */
3487
+ tasksRoot: string;
3488
+ /**
3489
+ * Default repo ID for operations that don't specify a repo.
3490
+ * Must reference a valid key in `WorkspaceConfig.repos`.
3491
+ */
3492
+ defaultRepo: string;
3493
+ /**
3494
+ * Repo ID that owns task packet files (PROMPT.md/STATUS.md/.DONE/.reviews).
3495
+ *
3496
+ * Required at runtime. Legacy workspace YAML without this field is
3497
+ * compatibility-mapped to `defaultRepo` during load with a warning.
3498
+ *
3499
+ * Invariant: `tasksRoot` must resolve inside `repos[taskPacketRepo].path`.
3500
+ */
3501
+ taskPacketRepo: string;
3502
+ /**
3503
+ * When true, every task MUST declare an explicit execution target
3504
+ * (via `## Execution Target` section or inline `**Repo:**` in PROMPT.md).
3505
+ * Area-level and workspace-default fallbacks are still used for
3506
+ * validation (unknown-repo checks) but NOT for automatic resolution.
3507
+ *
3508
+ * This prevents accidental misrouting in large multi-team workspaces
3509
+ * where task authors must be intentional about which repo a task targets.
3510
+ *
3511
+ * Default: false (permissive — existing precedence chain applies).
3512
+ * Only meaningful in workspace mode.
3513
+ */
3514
+ strict?: boolean;
3515
+ }
3516
+
3517
+ /**
3518
+ * Top-level workspace configuration.
3519
+ *
3520
+ * Loaded from `.pi/orchid-workspace.yaml` when present.
3521
+ * Immutable after initial validation — never mutated at runtime.
3522
+ */
3523
+ export interface WorkspaceConfig {
3524
+ /** Active workspace mode */
3525
+ mode: WorkspaceMode;
3526
+ /** Map of repo ID → repo configuration. At least one repo required in workspace mode. */
3527
+ repos: Map<string, WorkspaceRepoConfig>;
3528
+ /** Routing configuration (tasks root, default repo, task-packet repo, strict flag) */
3529
+ routing: WorkspaceRoutingConfig;
3530
+ /** Absolute path to the workspace config file that was loaded */
3531
+ configPath: string;
3532
+ }
3533
+
3534
+ /**
3535
+ * Canonical execution context for the orchestrator.
3536
+ *
3537
+ * This is the primary runtime context threaded through orchestrator
3538
+ * entry points. It replaces the previous pattern of passing raw `cwd`
3539
+ * as the sole repo root.
3540
+ *
3541
+ * In repo mode, `workspaceRoot` and `repoRoot` are the same directory.
3542
+ * In workspace mode, `workspaceRoot` is the non-git coordination root
3543
+ * and `repoRoot` is the default repo from the workspace config.
3544
+ *
3545
+ * Design rationale:
3546
+ * - Step 2 (wire orchestrator startup) will construct this from config
3547
+ * loading results and thread it into `executeOrchBatch()` and friends.
3548
+ * - `repoRoot` is always a git repository, preserving the invariant
3549
+ * that git operations (worktree, branch, merge) have a valid target.
3550
+ * - `workspaceConfig` is null in repo mode (no workspace file loaded).
3551
+ */
3552
+ export interface ExecutionContext {
3553
+ /** Absolute path to the workspace root (cwd in repo mode, workspace dir in workspace mode) */
3554
+ workspaceRoot: string;
3555
+ /** Absolute path to the default/primary git repo root */
3556
+ repoRoot: string;
3557
+ /** Active workspace mode */
3558
+ mode: WorkspaceMode;
3559
+ /** Workspace configuration (null in repo mode) */
3560
+ workspaceConfig: WorkspaceConfig | null;
3561
+ /** Loaded task runner configuration */
3562
+ taskRunnerConfig: TaskRunnerConfig;
3563
+ /** Loaded orchestrator configuration */
3564
+ orchestratorConfig: OrchestratorConfig;
3565
+ /**
3566
+ * Resolved pointer for config/agent paths (null in repo mode).
3567
+ *
3568
+ * When present, `pointer.configRoot` and `pointer.agentRoot` point to
3569
+ * the config repo's config directory. State/sidecar paths are NOT
3570
+ * affected — they always live at `<workspaceRoot>/.pi/`.
3571
+ */
3572
+ pointer: PointerResolution | null;
3573
+ }
3574
+
3575
+ // ── Workspace Validation Error Types ─────────────────────────────────
3576
+
3577
/**
 * Error codes for workspace configuration validation failures.
 *
 * Each code corresponds to one deterministic validation rule of the workspace
 * config loading pipeline. Codes are stable and machine-branchable.
 *
 * - WORKSPACE_FILE_READ_ERROR: Config file exists but cannot be read (permissions, encoding)
 * - WORKSPACE_FILE_PARSE_ERROR: Config file contains invalid YAML
 * - WORKSPACE_MISSING_REPOS: No repos defined in workspace config (at least one required)
 * - WORKSPACE_REPO_PATH_MISSING: A repo entry has no `path` field
 * - WORKSPACE_REPO_PATH_NOT_FOUND: A repo's `path` does not exist on disk
 * - WORKSPACE_REPO_NOT_GIT: A repo's `path` exists but is not a git repository
 * - WORKSPACE_MISSING_TASKS_ROOT: `routing.tasks_root` is missing or empty
 * - WORKSPACE_TASKS_ROOT_NOT_FOUND: `routing.tasks_root` path does not exist on disk
 * - WORKSPACE_MISSING_DEFAULT_REPO: `routing.default_repo` is missing or empty
 * - WORKSPACE_DEFAULT_REPO_NOT_FOUND: `routing.default_repo` references a repo ID not in the repos map
 * - WORKSPACE_TASK_PACKET_REPO_NOT_FOUND: `routing.task_packet_repo` references a repo ID not in the repos map
 * - WORKSPACE_TASKS_ROOT_OUTSIDE_PACKET_REPO: `routing.tasks_root` resolves outside `repos[routing.task_packet_repo].path`
 * - WORKSPACE_TASK_AREA_OUTSIDE_TASKS_ROOT: A configured task-area path resolves outside `routing.tasks_root`
 * - WORKSPACE_SETUP_REQUIRED: No workspace config and cwd is not a git repository
 * - WORKSPACE_DUPLICATE_REPO_PATH: Two or more repos share the same filesystem path
 * - WORKSPACE_SCHEMA_INVALID: Config file has valid YAML but missing/invalid top-level structure
 */
export type WorkspaceConfigErrorCode =
  | "WORKSPACE_FILE_READ_ERROR"
  | "WORKSPACE_FILE_PARSE_ERROR"
  | "WORKSPACE_MISSING_REPOS"
  | "WORKSPACE_REPO_PATH_MISSING"
  | "WORKSPACE_REPO_PATH_NOT_FOUND"
  | "WORKSPACE_REPO_NOT_GIT"
  | "WORKSPACE_MISSING_TASKS_ROOT"
  | "WORKSPACE_TASKS_ROOT_NOT_FOUND"
  | "WORKSPACE_MISSING_DEFAULT_REPO"
  | "WORKSPACE_DEFAULT_REPO_NOT_FOUND"
  | "WORKSPACE_TASK_PACKET_REPO_NOT_FOUND"
  | "WORKSPACE_TASKS_ROOT_OUTSIDE_PACKET_REPO"
  | "WORKSPACE_TASK_AREA_OUTSIDE_TASKS_ROOT"
  | "WORKSPACE_SETUP_REQUIRED"
  | "WORKSPACE_DUPLICATE_REPO_PATH"
  | "WORKSPACE_SCHEMA_INVALID";

/**
 * Typed error class for workspace configuration failures.
 *
 * Thrown during workspace config loading/validation when the config file
 * is present but invalid.
 *
 * NOTE(review): the code list above also contains `WORKSPACE_SETUP_REQUIRED`
 * ("no workspace config and cwd is not a git repository"), so presumably this
 * error can also surface when no config file exists and cwd is not a git
 * repo; only the "no config + git repo" case falls back to repo mode —
 * confirm against the loader.
 *
 * Follows the established pattern of typed error classes in this module
 * (WorktreeError, ExecutionError, MergeError, StateFileError, ResumeError).
 */
export class WorkspaceConfigError extends Error {
  /** Machine-readable validation failure code. */
  code: WorkspaceConfigErrorCode;
  /** Repo ID that triggered the error, when the failure is repo-specific. */
  repoId?: string;
  /** Filesystem path related to the error, when one applies. */
  relatedPath?: string;

  /**
   * @param code - Which validation rule failed
   * @param message - Human-readable description, forwarded to `Error`
   * @param repoId - Optional repo ID associated with the failure
   * @param relatedPath - Optional filesystem path associated with the failure
   */
  constructor(code: WorkspaceConfigErrorCode, message: string, repoId?: string, relatedPath?: string) {
    super(message);
    this.name = "WorkspaceConfigError";
    this.code = code;
    // Both optional fields are always assigned, even when undefined.
    this.repoId = repoId;
    this.relatedPath = relatedPath;
  }
}
3646
+
3647
+ // ── Pointer Resolution Types ─────────────────────────────────────────
3648
+
3649
/**
 * File name of the workspace pointer file.
 * Its full location is `<workspace-root>/.pi/orchid-pointer.json`.
 *
 * Created by `orchid init` in workspace mode. Points to the config
 * repo and the config path inside it. Not committed to git — each user
 * creates it during onboarding.
 */
export const POINTER_FILENAME = "orchid-pointer.json";

/**
 * Build the path to the pointer file under a workspace root.
 *
 * @param workspaceRoot - Absolute path to the workspace root
 * @returns `workspaceRoot`, `.pi` and {@link POINTER_FILENAME} joined with `join`
 */
export function pointerFilePath(workspaceRoot: string): string {
  const segments = [workspaceRoot, ".pi", POINTER_FILENAME];
  return join(...segments);
}
3666
+
3667
+ /**
3668
+ * Result of resolving the workspace pointer file.
3669
+ *
3670
+ * This is the primary contract for downstream consumers (task-runner,
3671
+ * orchestrator, merge agent, dashboard). All pointer failures are
3672
+ * non-fatal: when the pointer cannot be resolved, `used` is false and
3673
+ * `configRoot`/`agentRoot` fall back to workspace-root paths.
3674
+ *
3675
+ * State/sidecar paths are NOT affected by the pointer — they always
3676
+ * live at `<workspace-root>/.pi/` regardless of pointer resolution.
3677
+ *
3678
+ * In repo mode, `resolvePointer()` returns null (pointer is ignored
3679
+ * entirely, even if a file happens to exist).
3680
+ */
3681
+ export interface PointerResolution {
3682
+ /**
3683
+ * Whether the pointer was successfully resolved.
3684
+ * - true: pointer file was found, parsed, and config_repo resolved
3685
+ * to a known repo in WorkspaceConfig.repos.
3686
+ * - false: pointer was missing, malformed, or referenced an unknown
3687
+ * repo. Fallback paths are used instead.
3688
+ */
3689
+ used: boolean;
3690
+
3691
+ /**
3692
+ * Resolved config root directory.
3693
+ * - When used=true: `<config-repo-path>/<config_path>/`
3694
+ * - When used=false: `<workspace-root>/.pi/` (existing fallback)
3695
+ */
3696
+ configRoot: string;
3697
+
3698
+ /**
3699
+ * Resolved agent overrides directory.
3700
+ * - When used=true: `<config-repo-path>/<config_path>/agents/`
3701
+ * - When used=false: `<workspace-root>/.pi/agents/` (existing fallback)
3702
+ */
3703
+ agentRoot: string;
3704
+
3705
+ /**
3706
+ * Warning message when pointer resolution fell back.
3707
+ * - undefined when used=true (no warning)
3708
+ * - Human-readable reason string when used=false
3709
+ */
3710
+ warning?: string;
3711
+ }
3712
+
3713
+ // ── Workspace Defaults ───────────────────────────────────────────────
3714
+
3715
/**
 * File name of the workspace configuration file.
 * Resolved relative to the workspace root as `.pi/orchid-workspace.yaml`.
 */
export const WORKSPACE_CONFIG_FILENAME = "orchid-workspace.yaml";

/**
 * Build the path to the workspace config file under a workspace root.
 *
 * @param workspaceRoot - Absolute path to the workspace root
 * @returns `workspaceRoot`, `.pi` and {@link WORKSPACE_CONFIG_FILENAME} joined with `join`
 */
export function workspaceConfigPath(workspaceRoot: string): string {
  const segments = [workspaceRoot, ".pi", WORKSPACE_CONFIG_FILENAME];
  return join(...segments);
}
3728
+
3729
/**
 * Build the default {@link ExecutionContext} for repo mode.
 *
 * Intended for the case where no workspace config file is present: `cwd`
 * serves as both the workspace root and the repo root, and the
 * workspace-only fields (`workspaceConfig`, `pointer`) are null.
 *
 * @param cwd - Current working directory (used as workspace root and repo root)
 * @param taskRunnerConfig - Loaded task runner config (or defaults)
 * @param orchestratorConfig - Loaded orchestrator config (or defaults)
 * @returns A new context object with `mode` set to "repo"
 */
export function createRepoModeContext(
  cwd: string,
  taskRunnerConfig: TaskRunnerConfig,
  orchestratorConfig: OrchestratorConfig,
): ExecutionContext {
  const repoModeContext: ExecutionContext = {
    // Single-repo layout: both roots are the same directory.
    workspaceRoot: cwd,
    repoRoot: cwd,
    mode: "repo",
    // No workspace file is loaded in repo mode.
    workspaceConfig: null,
    taskRunnerConfig: taskRunnerConfig,
    orchestratorConfig: orchestratorConfig,
    // Pointer resolution does not apply in repo mode.
    pointer: null,
  };
  return repoModeContext;
}
3755
+
3756
+ // ── Agent Mailbox Types (TP-089) ─────────────────────────────────────
3757
+
3758
+ /**
3759
+ * Mailbox directory name under .pi/.
3760
+ * @since TP-089
3761
+ */
3762
+ export const MAILBOX_DIR_NAME = "mailbox";
3763
+
3764
+ /**
3765
+ * Maximum content size in UTF-8 bytes.
3766
+ * Steering messages should be concise directives; larger context should be
3767
+ * written to a separate file and referenced by path.
3768
+ * @since TP-089
3769
+ */
3770
+ export const MAILBOX_MAX_CONTENT_BYTES = 4096;
3771
+
3772
/**
 * Message types for the agent mailbox system.
 *
 * | Type       | Direction           | Purpose                                     |
 * |------------|---------------------|---------------------------------------------|
 * | `steer`    | supervisor → agent  | Course correction. Agent must follow.       |
 * | `query`    | supervisor → agent  | Request for status/info. Agent replies.     |
 * | `abort`    | supervisor → agent  | Graceful stop. Agent wraps up and exits.    |
 * | `info`     | supervisor → agent  | FYI context. No action required.            |
 * | `reply`    | agent → supervisor  | Response to query or steer acknowledgment.  |
 * | `escalate` | agent → supervisor  | Agent-initiated: blocked or needs guidance. |
 *
 * @since TP-089
 */
export type MailboxMessageType =
  | "steer"
  | "query"
  | "abort"
  | "info"
  | "reply"
  | "escalate";

/**
 * Runtime lookup set holding every {@link MailboxMessageType} value.
 *
 * Typed as `ReadonlySet<string>` so arbitrary (unvalidated) strings can be
 * passed to `.has()` without a cast.
 *
 * @since TP-089
 */
export const MAILBOX_MESSAGE_TYPES: ReadonlySet<string> = new Set<MailboxMessageType>(
  ["steer", "query", "abort", "info", "reply", "escalate"],
);
3800
+
3801
+ /**
3802
+ * Message format for the file-based agent mailbox.
3803
+ *
3804
+ * Messages are written as JSON files in batch-scoped, session-scoped
3805
+ * directories. The rpc-wrapper checks the inbox on every `message_end`
3806
+ * event and injects pending messages into the agent's LLM context via
3807
+ * pi's `steer` RPC command.
3808
+ *
3809
+ * @see docs/specifications/orchid/agent-mailbox-steering.md
3810
+ * @since TP-089
3811
+ */
3812
+ export interface MailboxMessage {
3813
+ /** Unique message ID: `{timestamp}-{5char-hex-nonce}` */
3814
+ id: string;
3815
+ /** Batch ID — must match current batch for validation */
3816
+ batchId: string;
3817
+ /** Sender identifier: `"supervisor"` or session name */
3818
+ from: string;
3819
+ /** Target session name or `"_broadcast"` */
3820
+ to: string;
3821
+ /** Epoch milliseconds (Date.now()) */
3822
+ timestamp: number;
3823
+ /** Message type */
3824
+ type: MailboxMessageType;
3825
+ /** Message body (max 4KB UTF-8 bytes) */
3826
+ content: string;
3827
+ /** Whether the sender expects a reply (default: false) */
3828
+ expectsReply?: boolean;
3829
+ /** Reference to a previous message ID for threading (default: null) */
3830
+ replyTo?: string | null;
3831
+ }
3832
+
3833
+ /**
3834
+ * Input options for writeMailboxMessage.
3835
+ *
3836
+ * The caller provides these fields; the utility generates `id`, `batchId`,
3837
+ * `to`, and `timestamp` from its own arguments.
3838
+ *
3839
+ * @since TP-089
3840
+ */
3841
+ export interface WriteMailboxMessageOpts {
3842
+ /** Sender identifier: `"supervisor"` or session name */
3843
+ from: string;
3844
+ /** Message type */
3845
+ type: MailboxMessageType;
3846
+ /** Message body (max 4KB UTF-8 bytes) */
3847
+ content: string;
3848
+ /** Whether the sender expects a reply (default: false) */
3849
+ expectsReply?: boolean;
3850
+ /** Reference to a previous message ID for threading (default: null) */
3851
+ replyTo?: string | null;
3852
+ }
3853
+
3854
+ // ── Runtime V2 Contracts (TP-102) ────────────────────────────────────
3855
+ //
3856
+ // These types define the foundational contracts for backend-neutral Runtime V2
3857
+ // architecture. They are additive — existing runtime paths continue to work
3858
+ // while Runtime V2 is incrementally adopted.
3859
+ //
3860
+ // Design principles:
3861
+ // 1. Agent identity is a stable runtime ID, not a legacy session name.
3862
+ // 2. Packet-path authority is explicit, never inferred from cwd.
3863
+ // 3. Process ownership uses a registry, not terminal session discovery.
3864
+ // 4. Normalized events flow directly from child to parent.
3865
+ //
3866
+ // See: docs/specifications/framework/orchid-runtime-v2/
3867
+ // ─────────────────────────────────────────────────────────────────────
3868
+
3869
/**
 * The canonical set of agent roles in the Runtime V2 process model.
 *
 * Each spawned agent process is assigned exactly one of these roles, and
 * the role governs that process's responsibilities, tools, and lifecycle
 * semantics.
 *
 * @since TP-102
 */
export type RuntimeAgentRole =
	| "worker"
	| "reviewer"
	| "merger"
	| "lane-runner";
3878
+
3879
/**
 * Lifecycle states an agent can occupy in the process registry.
 *
 * State machine:
 *   spawning → running → wrapping_up → exited
 * with `crashed`, `timed_out`, and `killed` as the alternative end states.
 *
 * @since TP-102
 */
export type RuntimeAgentStatus =
	// In-flight states
	| "spawning"
	| "running"
	| "wrapping_up"
	// End states
	| "exited"
	| "crashed"
	| "timed_out"
	| "killed";
3898
+
3899
/**
 * Agent statuses that are terminal, i.e. the process is no longer alive.
 *
 * @since TP-102
 */
export const TERMINAL_AGENT_STATUSES: ReadonlySet<RuntimeAgentStatus> =
	new Set<RuntimeAgentStatus>(["exited", "crashed", "timed_out", "killed"]);
3906
+
3907
/**
 * Stable identity of a spawned agent process in Runtime V2.
 *
 * Supersedes legacy session names as the canonical agent identifier. The
 * string format intentionally stays compatible with existing naming
 * conventions (e.g., "orch-henrylach-lane-1-worker") so that supervisor
 * tools, the dashboard, and mailbox addressing see minimal churn.
 *
 * Semantically this is a **runtime process ID**, not a terminal session
 * label: code must not assume terminal-session probes apply to a
 * RuntimeAgentId.
 *
 * @since TP-102
 */
export type RuntimeAgentId = string;
3921
+
3922
/**
 * Explicitly resolved locations of a task's packet files.
 *
 * In workspace mode the packet home (where PROMPT.md / STATUS.md / .DONE
 * live) may be a different place from the execution cwd (the active
 * segment repo worktree). Runtime V2 therefore requires these paths to be
 * resolved up front and handed through the execution chain rather than
 * inferred from cwd.
 *
 * In repo mode (single repo) every path points into the same filesystem
 * tree; the contract is identical and the values are simply co-located.
 *
 * @since TP-102
 */
export interface PacketPaths {
	/** Absolute path to the task folder that holds the packet files */
	taskFolder: string;
	/** Absolute path to the task's PROMPT.md */
	promptPath: string;
	/** Absolute path to the task's STATUS.md */
	statusPath: string;
	/** Absolute path to the task's .DONE marker */
	donePath: string;
	/** Absolute path to the task's .reviews/ directory */
	reviewsDir: string;
}
3947
+
3948
/**
 * Derive the full set of packet paths for a task folder.
 *
 * Pure string construction: nothing is read from disk and no existence
 * checks are made. Intended for building authoritative paths once the task
 * folder location has already been resolved.
 *
 * @param taskFolder - Absolute path to the task folder
 * @returns PacketPaths whose file and directory entries sit directly under
 *   `taskFolder`, plus `taskFolder` itself
 *
 * @since TP-102
 */
export function resolvePacketPaths(taskFolder: string): PacketPaths {
	// Every packet entry is a direct child of the task folder.
	const child = (name: string): string => `${taskFolder}/${name}`;

	const paths: PacketPaths = {
		promptPath: child("PROMPT.md"),
		statusPath: child("STATUS.md"),
		donePath: child(".DONE"),
		reviewsDir: child(".reviews"),
		taskFolder,
	};
	return paths;
}
3969
+
3970
/**
 * One unit of work scheduled onto one lane in Runtime V2.
 *
 * A unit is either a whole task (repo mode / single-segment workspace
 * mode) or a single segment of a multi-repo task.
 *
 * It is the hand-off contract from the engine (which decides what to run)
 * to the lane-runner (which runs it), and carries everything the
 * lane-runner needs so it never has to re-derive paths from cwd or session
 * state.
 *
 * @since TP-102
 */
export interface ExecutionUnit {
	/** Unique identifier: taskId for whole-task units, `taskId::repoId` for segments */
	id: string;
	/** Identifier of the parent task */
	taskId: string;
	/** Segment identifier, or null when the whole task is executed */
	segmentId: string | null;
	/** ID of the repo where execution happens (cwd of the worker) */
	executionRepoId: string;
	/** ID of the repo that owns the packet files (may differ in workspace mode) */
	packetHomeRepoId: string;
	/** Absolute path to the execution worktree */
	worktreePath: string;
	/** Authoritative packet file paths */
	packet: PacketPaths;
	/** Full parsed task metadata */
	task: ParsedTask;
}
4001
+
4002
/**
 * Manifest describing a single agent process in the runtime registry.
 *
 * The agent's parent process (lane-runner or engine) writes it before the
 * agent is considered visible, updates it on status transitions, and it is
 * cleaned up on batch completion.
 *
 * It takes over from legacy session discovery as the source of truth for
 * agent liveness, identity, and attribution.
 *
 * File location: `.pi/runtime/{batchId}/agents/{agentId}/manifest.json`
 *
 * @since TP-102
 */
export interface RuntimeAgentManifest {
	/** Batch the agent belongs to */
	batchId: string;
	/** Stable agent identity (e.g., "orch-henrylach-lane-1-worker") */
	agentId: RuntimeAgentId;
	/** Role of the agent */
	role: RuntimeAgentRole;
	/** Lane number, or null for merge agents */
	laneNumber: number | null;
	/** ID of the task currently being executed, or null before first assignment */
	taskId: string | null;
	/** ID of the repo the agent is operating in */
	repoId: string;
	/** OS process ID of the agent host process */
	pid: number;
	/** OS process ID of the parent (lane-runner or engine) */
	parentPid: number;
	/** Spawn time in epoch milliseconds */
	startedAt: number;
	/** Current lifecycle status */
	status: RuntimeAgentStatus;
	/** Absolute path to the agent's working directory */
	cwd: string;
	/** Authoritative packet paths, or null for merge agents or pre-assignment */
	packet: PacketPaths | null;
}
4042
+
4043
/**
 * Snapshot of the runtime registry for a single batch.
 *
 * Holds every active and recently-exited agent of the batch and is the
 * authoritative record of which agents exist, taking the place of legacy
 * session discovery.
 *
 * File location: `.pi/runtime/{batchId}/registry.json`
 *
 * @since TP-102
 */
export interface RuntimeRegistry {
	/** ID of the batch this registry belongs to */
	batchId: string;
	/** Time of the last registry update, in epoch milliseconds */
	updatedAt: number;
	/** Every known agent, keyed by agentId for fast lookup in JSON form */
	agents: Record<RuntimeAgentId, RuntimeAgentManifest>;
}
4062
+
4063
/**
 * Snapshot of a lane's execution state, emitted by the lane-runner.
 *
 * A first-class contract that supersedes the current `lane-state-*.json`
 * sidecar. The lane-runner writes it directly, rather than a sibling
 * process tailing sidecar files.
 *
 * File location: `.pi/runtime/{batchId}/lanes/lane-{N}.json`
 *
 * @since TP-102
 */
export interface RuntimeLaneSnapshot {
	/** Batch the lane belongs to */
	batchId: string;
	/** Lane number (1-indexed) */
	laneNumber: number;
	/** Lane identifier (e.g., "lane-1") */
	laneId: string;
	/** ID of the repo this lane targets */
	repoId: string;
	/** ID of the task currently being executed */
	taskId: string | null;
	/** Current segment ID, or null for whole-task execution */
	segmentId: string | null;
	/** Execution status of the lane */
	status: "idle" | "running" | "complete" | "failed";
	/** Worker agent snapshot, or null when no worker is active */
	worker: RuntimeAgentTelemetrySnapshot | null;
	/** Reviewer agent snapshot, or null when no reviewer is active */
	reviewer: RuntimeAgentTelemetrySnapshot | null;
	/** Task progress derived from STATUS.md */
	progress: RuntimeTaskProgress | null;
	/** Time of the last snapshot update, in epoch milliseconds */
	updatedAt: number;
}
4098
+
4099
/**
 * Point-in-time telemetry for one agent within a lane.
 *
 * @since TP-102
 */
export interface RuntimeAgentTelemetrySnapshot {
	/** ID of the agent */
	agentId: RuntimeAgentId;
	/** Lifecycle status of the agent */
	status: RuntimeAgentStatus;
	/** Time elapsed, in milliseconds */
	elapsedMs: number;
	/** Count of tool calls made */
	toolCalls: number;
	/** Description of the last tool call */
	lastTool: string;
	/** Context window utilization as a percentage (0-100) */
	contextPct: number;
	/** Cumulative cost in USD */
	costUsd: number;
	/** Input tokens consumed */
	inputTokens: number;
	/** Output tokens generated */
	outputTokens: number;
	/** Cache read tokens */
	cacheReadTokens: number;
	/** Cache write tokens */
	cacheWriteTokens: number;
}
4128
+
4129
/**
 * Progress of a task, as derived from parsing STATUS.md.
 *
 * @since TP-102
 */
export interface RuntimeTaskProgress {
	/** Human-readable label of the current step */
	currentStep: string;
	/** Count of checked checkboxes across all steps */
	checked: number;
	/** Count of all checkboxes across all steps */
	total: number;
	/** Iteration number of the current worker */
	iteration: number;
	/** Count of reviews performed */
	reviews: number;
}
4146
+
4147
/**
 * A normalized event emitted by an agent host.
 *
 * This is the canonical telemetry/conversation event shape for Runtime V2.
 * Agent hosts append these to per-agent event logs and also stream them to
 * their parent process via IPC.
 *
 * File location: `.pi/runtime/{batchId}/agents/{agentId}/events.jsonl`
 *
 * @since TP-102
 */
export interface RuntimeAgentEvent {
	/** ID of the batch */
	batchId: string;
	/** Agent that produced the event */
	agentId: RuntimeAgentId;
	/** Role of that agent */
	role: RuntimeAgentRole;
	/** Lane number, or null for merge agents */
	laneNumber: number | null;
	/** ID of the task being executed when the event was produced */
	taskId: string | null;
	/** ID of the repo */
	repoId: string;
	/** Timestamp in epoch milliseconds */
	ts: number;
	/** Type of the event */
	type: RuntimeAgentEventType;
	/** Payload specific to the event type */
	payload: Record<string, unknown>;
}
4178
+
4179
/**
 * The normalized event types carried on the Runtime V2 agent event stream,
 * grouped by concern.
 *
 * @since TP-102
 */
export type RuntimeAgentEventType =
	// ── Lifecycle ──
	| "agent_started"
	| "agent_exited"
	| "agent_killed"
	| "agent_crashed"
	| "agent_timeout"
	// ── Conversation ──
	| "prompt_sent"
	| "assistant_message"
	| "tool_call"
	| "tool_result"
	// ── Telemetry ──
	| "usage_delta"
	| "context_usage"
	| "retry_started"
	| "retry_finished"
	| "compaction_started"
	| "compaction_finished"
	// ── Steering ──
	| "message_delivered"
	| "reply_sent"
	| "escalation_sent"
	// ── Review / bridge ──
	| "review_requested"
	| "review_completed"
	| "review_failed"
	// ── Exit interception (TP-172) ──
	| "exit_intercepted";
4213
+
4214
+ // ── Runtime V2 Path Helpers (TP-102) ─────────────────────────────────
4215
+
4216
/**
 * Locate the directory that holds all Runtime V2 artifacts of one batch.
 *
 * @param stateRoot - Root directory containing .pi/ (workspace root or repo root)
 * @param batchId - Batch identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}` (no trailing slash)
 *
 * @since TP-102
 */
export function runtimeRoot(stateRoot: string, batchId: string): string {
	const runtimeBase = `${stateRoot}/.pi/runtime`;
	return `${runtimeBase}/${batchId}`;
}
4228
+
4229
/**
 * Locate the runtime directory of one agent within a batch.
 *
 * @param stateRoot - Root directory containing .pi/
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}/agents/{agentId}` (no trailing slash)
 *
 * @since TP-102
 */
export function runtimeAgentDir(
	stateRoot: string,
	batchId: string,
	agentId: RuntimeAgentId,
): string {
	const agentsDir = `${stateRoot}/.pi/runtime/${batchId}/agents`;
	return `${agentsDir}/${agentId}`;
}
4246
+
4247
/**
 * Locate the manifest file of one agent.
 *
 * @param stateRoot - Root directory passed through to runtimeAgentDir
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns Path of `manifest.json` inside the directory returned by runtimeAgentDir
 *
 * @since TP-102
 */
export function runtimeManifestPath(
	stateRoot: string,
	batchId: string,
	agentId: RuntimeAgentId,
): string {
	const agentDir = runtimeAgentDir(stateRoot, batchId, agentId);
	return `${agentDir}/manifest.json`;
}
4259
+
4260
/**
 * Locate the event log of one agent.
 *
 * @param stateRoot - Root directory passed through to runtimeAgentDir
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns Path of `events.jsonl` inside the directory returned by runtimeAgentDir
 *
 * @since TP-102
 */
export function runtimeAgentEventsPath(
	stateRoot: string,
	batchId: string,
	agentId: RuntimeAgentId,
): string {
	const agentDir = runtimeAgentDir(stateRoot, batchId, agentId);
	return `${agentDir}/events.jsonl`;
}
4272
+
4273
/**
 * Locate the snapshot file of one lane.
 *
 * @param stateRoot - Root directory containing .pi/
 * @param batchId - Batch identifier
 * @param laneNumber - Lane number used in the `lane-{N}.json` filename
 * @returns `{stateRoot}/.pi/runtime/{batchId}/lanes/lane-{laneNumber}.json`
 *
 * @since TP-102
 */
export function runtimeLaneSnapshotPath(
	stateRoot: string,
	batchId: string,
	laneNumber: number,
): string {
	const lanesDir = `${stateRoot}/.pi/runtime/${batchId}/lanes`;
	const fileName = `lane-${laneNumber}.json`;
	return `${lanesDir}/${fileName}`;
}
4285
+
4286
/**
 * Telemetry snapshot for a merge agent.
 *
 * Written to `.pi/runtime/{batchId}/lanes/merge-w{waveIndex}-{mergeNumber}.json`
 * (the filename produced by runtimeMergeSnapshotPath) alongside lane
 * snapshots so the dashboard can display live merge-phase telemetry.
 * Follows the same file-backed pattern as {@link RuntimeLaneSnapshot} but is
 * simpler — merge agents have no reviewer, progress tracking, or repoId.
 *
 * @since TP-164
 */
export interface RuntimeMergeSnapshot {
	/** Batch this merge agent belongs to */
	batchId: string;
	/** 1-indexed merge agent number (e.g. 1 for "orch-henry-merge-1") */
	mergeNumber: number;
	/** Stable agent session name (e.g. "orch-henry-merge-1") */
	sessionName: string;
	/** Wave index this merge agent is processing (0-indexed, 0 when unknown) */
	waveIndex: number;
	/** Merge agent lifecycle status */
	status: "running" | "complete" | "failed";
	/** Live telemetry snapshot for the merge agent (null when not yet started) */
	agent: RuntimeAgentTelemetrySnapshot | null;
	/** Epoch ms when this snapshot was last updated */
	updatedAt: number;
}
4312
+
4313
/**
 * Resolve the path for a merge agent snapshot file.
 *
 * Snapshots are stored alongside lane snapshots in the `lanes/` directory so
 * the dashboard server's directory scan picks them up automatically.
 *
 * The filename includes BOTH `waveIndex` and `mergeNumber` because lane
 * numbers (and therefore the legacy `mergeNumber`-only filename) repeat
 * across waves — a wave-2 lane-1 merge would overwrite the wave-1 lane-1
 * snapshot before the dashboard's next poll could read it. Per-wave
 * namespacing keeps each merge's snapshot durable until the runtime
 * directory itself is cleaned up at end-of-batch. See #509.
 *
 * @param stateRoot - Repository root (where `.pi/` lives)
 * @param batchId - Current batch identifier
 * @param waveIndex - 0-based wave index for the merge
 * @param mergeNumber - 1-based merge agent number (derived from lane number)
 * @returns `{stateRoot}/.pi/runtime/{batchId}/lanes/merge-w{waveIndex}-{mergeNumber}.json`
 *
 * @since TP-164
 */
export function runtimeMergeSnapshotPath(
	stateRoot: string,
	batchId: string,
	waveIndex: number,
	mergeNumber: number,
): string {
	return `${stateRoot}/.pi/runtime/${batchId}/lanes/merge-w${waveIndex}-${mergeNumber}.json`;
}
4347
+
4348
/**
 * Locate the runtime registry file of one batch.
 *
 * @param stateRoot - Root directory containing .pi/
 * @param batchId - Batch identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}/registry.json`
 *
 * @since TP-102
 */
export function runtimeRegistryPath(stateRoot: string, batchId: string): string {
	const batchDir = `${stateRoot}/.pi/runtime/${batchId}`;
	return `${batchDir}/registry.json`;
}
4356
+
4357
/**
 * Assemble a canonical RuntimeAgentId from its components.
 *
 * The resulting IDs stay compatible with the existing naming convention
 * (e.g., "orch-henrylach-lane-1-worker") while being semantically
 * decoupled from legacy session names.
 *
 * Shapes produced, in order of precedence:
 *   - merger with a merge index      → `{prefix}-merge-{mergeIndex}`
 *   - lane-runner with a lane number → `{prefix}-lane-{laneNumber}`
 *   - any role with a lane number    → `{prefix}-lane-{laneNumber}-{role}`
 *   - otherwise                      → `{prefix}-{role}`
 *
 * @param prefix - Operator/batch prefix (e.g., "orch-henrylach")
 * @param laneNumber - Lane number (null for merge agents)
 * @param role - Agent role
 * @param mergeIndex - Merge wave index (only for merge agents)
 * @returns Canonical agent ID string
 *
 * @since TP-102
 */
export function buildRuntimeAgentId(
	prefix: string,
	laneNumber: number | null,
	role: RuntimeAgentRole,
	mergeIndex?: number,
): RuntimeAgentId {
	// Mergers with a known index take precedence over any lane-based naming.
	const isIndexedMerger = role === "merger" && mergeIndex != null;
	if (isIndexedMerger) {
		return `${prefix}-merge-${mergeIndex}`;
	}

	// Without a lane there is nothing to scope the role under.
	if (laneNumber == null) {
		return `${prefix}-${role}`;
	}

	// The lane-runner owns the bare lane ID; other roles are suffixed onto it.
	const laneBase = `${prefix}-lane-${laneNumber}`;
	return role === "lane-runner" ? laneBase : `${laneBase}-${role}`;
}
4389
+
4390
/**
 * Check a candidate RuntimeAgentManifest for required fields and sane values.
 *
 * Fields are checked in a fixed order (batchId, agentId, role, pid,
 * parentPid, startedAt, status, cwd, repoId) and one message is collected
 * per failing field. A non-object input short-circuits with a single error.
 *
 * @param manifest - Untrusted value to validate
 * @returns Validation error strings; an empty array means the manifest is valid
 *
 * @since TP-102
 */
export function validateAgentManifest(manifest: unknown): string[] {
	if (typeof manifest !== "object" || manifest === null) {
		return ["manifest must be a non-null object"];
	}

	const fields = manifest as Record<string, unknown>;
	const problems: string[] = [];

	const isFilledString = (value: unknown): boolean =>
		typeof value === "string" && value.length > 0;
	const isFiniteNumber = (value: unknown): value is number =>
		typeof value === "number" && Number.isFinite(value);

	// Fields that must be a non-empty string.
	const requireFilledString = (key: string): void => {
		if (!isFilledString(fields[key])) problems.push(`${key} must be a non-empty string`);
	};
	// Fields that must be a finite number strictly greater than zero.
	const requirePositiveNumber = (key: string): void => {
		const value = fields[key];
		if (!isFiniteNumber(value) || value <= 0) {
			problems.push(`${key} must be a positive finite number`);
		}
	};
	// Fields that must be a string drawn from a fixed list of literals.
	const requireOneOf = (key: string, allowed: readonly string[]): void => {
		const value = fields[key];
		if (typeof value !== "string") {
			problems.push(`${key} must be a string`);
		} else if (!allowed.includes(value)) {
			problems.push(`${key} must be one of: ${allowed.join(", ")}`);
		}
	};

	requireFilledString("batchId");
	requireFilledString("agentId");
	requireOneOf("role", ["worker", "reviewer", "merger", "lane-runner"]);
	requirePositiveNumber("pid");
	requirePositiveNumber("parentPid");
	if (!isFiniteNumber(fields.startedAt)) {
		problems.push("startedAt must be a finite number");
	}
	requireOneOf("status", [
		"spawning",
		"running",
		"wrapping_up",
		"exited",
		"crashed",
		"timed_out",
		"killed",
	]);
	requireFilledString("cwd");
	// repoId only has to be a string; an empty string is accepted.
	if (typeof fields.repoId !== "string") {
		problems.push("repoId must be a string");
	}

	return problems;
}
4437
+
4438
/**
 * Check that a candidate PacketPaths object carries every required field.
 *
 * Each of `promptPath`, `statusPath`, `donePath`, `reviewsDir`, and
 * `taskFolder` must be a non-empty string; one message is reported per
 * failing field, in that order. A non-object input short-circuits with a
 * single error.
 *
 * @param packet - Untrusted value to validate
 * @returns Validation error strings; an empty array means the packet is valid
 *
 * @since TP-102
 */
export function validatePacketPaths(packet: unknown): string[] {
	if (typeof packet !== "object" || packet === null) {
		return ["packet must be a non-null object"];
	}

	const candidate = packet as Record<string, unknown>;
	const requiredFields = [
		"promptPath",
		"statusPath",
		"donePath",
		"reviewsDir",
		"taskFolder",
	] as const;

	return requiredFields
		.filter((name) => {
			const value = candidate[name];
			return !(typeof value === "string" && value.length > 0);
		})
		.map((name) => `${name} must be a non-empty string`);
}