@pi-agents/orchid 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/agents/AGENTS-MANIFEST.md +42 -0
- package/agents/brain.md +42 -0
- package/agents/context-builder.md +46 -0
- package/agents/delegate.md +12 -0
- package/agents/dev-1.md +42 -0
- package/agents/oracle.md +73 -0
- package/agents/planner.md +55 -0
- package/agents/researcher.md +52 -0
- package/agents/reviewer.md +79 -0
- package/agents/scout.md +50 -0
- package/agents/tester.md +45 -0
- package/agents/worker.md +55 -0
- package/extensions/ralph.ts +1 -0
- package/extensions/reviewer-extension.ts +125 -0
- package/extensions/task-orchestrator.ts +28 -0
- package/package.json +63 -0
- package/prompts/gather-context-and-clarify.md +13 -0
- package/prompts/parallel-cleanup.md +59 -0
- package/prompts/parallel-context-build.md +53 -0
- package/prompts/parallel-handoff-plan.md +59 -0
- package/prompts/parallel-research.md +50 -0
- package/prompts/parallel-review.md +54 -0
- package/prompts/review-loop.md +41 -0
- package/skills/orchid/SKILL.md +214 -0
- package/skills/orchid/orchid-cleanup/SKILL.md +122 -0
- package/skills/orchid/orchid-converge/SKILL.md +124 -0
- package/skills/orchid/orchid-decompose/SKILL.md +201 -0
- package/skills/orchid/orchid-doctor/SKILL.md +162 -0
- package/skills/orchid/orchid-investigate/SKILL.md +102 -0
- package/skills/orchid/orchid-launch/SKILL.md +147 -0
- package/skills/ralph/SKILL.md +73 -0
- package/skills/subagents/pi-subagents/SKILL.md +813 -0
- package/src/index.ts +7 -0
- package/src/orchestrator/abort.ts +534 -0
- package/src/orchestrator/agent-bridge-extension.ts +1020 -0
- package/src/orchestrator/agent-host.ts +954 -0
- package/src/orchestrator/cleanup.ts +776 -0
- package/src/orchestrator/config-loader.ts +1412 -0
- package/src/orchestrator/config-schema.ts +690 -0
- package/src/orchestrator/config.ts +81 -0
- package/src/orchestrator/context-window.ts +66 -0
- package/src/orchestrator/diagnostic-reports.ts +475 -0
- package/src/orchestrator/diagnostics.ts +394 -0
- package/src/orchestrator/discovery.ts +1833 -0
- package/src/orchestrator/engine-worker.ts +415 -0
- package/src/orchestrator/engine.ts +5940 -0
- package/src/orchestrator/execution.ts +3104 -0
- package/src/orchestrator/extension.ts +5934 -0
- package/src/orchestrator/formatting.ts +785 -0
- package/src/orchestrator/git.ts +88 -0
- package/src/orchestrator/index.ts +28 -0
- package/src/orchestrator/lane-runner.ts +1787 -0
- package/src/orchestrator/mailbox.ts +780 -0
- package/src/orchestrator/merge.ts +3414 -0
- package/src/orchestrator/messages.ts +1062 -0
- package/src/orchestrator/migrations.ts +278 -0
- package/src/orchestrator/naming.ts +117 -0
- package/src/orchestrator/path-resolver.ts +275 -0
- package/src/orchestrator/persistence.ts +2625 -0
- package/src/orchestrator/process-registry.ts +452 -0
- package/src/orchestrator/quality-gate.ts +1085 -0
- package/src/orchestrator/resume.ts +3488 -0
- package/src/orchestrator/sessions.ts +57 -0
- package/src/orchestrator/settings-loader.ts +136 -0
- package/src/orchestrator/settings-tui.ts +2208 -0
- package/src/orchestrator/sidecar-telemetry.ts +267 -0
- package/src/orchestrator/supervisor.ts +4548 -0
- package/src/orchestrator/task-executor-core.ts +675 -0
- package/src/orchestrator/tmux-compat.ts +37 -0
- package/src/orchestrator/tool-allowlist-constants.ts +37 -0
- package/src/orchestrator/types.ts +4465 -0
- package/src/orchestrator/verification.ts +547 -0
- package/src/orchestrator/waves.ts +1564 -0
- package/src/orchestrator/workspace.ts +707 -0
- package/src/orchestrator/worktree.ts +2725 -0
- package/src/ralph/index.ts +825 -0
- package/src/subagents/agents/agent-management.ts +648 -0
- package/src/subagents/agents/agent-scope.ts +6 -0
- package/src/subagents/agents/agent-selection.ts +23 -0
- package/src/subagents/agents/agent-serializer.ts +86 -0
- package/src/subagents/agents/agents.ts +832 -0
- package/src/subagents/agents/chain-serializer.ts +137 -0
- package/src/subagents/agents/frontmatter.ts +29 -0
- package/src/subagents/agents/identity.ts +30 -0
- package/src/subagents/agents/skills.ts +632 -0
- package/src/subagents/extension/config.ts +16 -0
- package/src/subagents/extension/control-notices.ts +92 -0
- package/src/subagents/extension/doctor.ts +199 -0
- package/src/subagents/extension/fanout-child.ts +170 -0
- package/src/subagents/extension/index.ts +573 -0
- package/src/subagents/extension/schemas.ts +168 -0
- package/src/subagents/intercom/intercom-bridge.ts +379 -0
- package/src/subagents/intercom/result-intercom.ts +377 -0
- package/src/subagents/runs/background/async-execution.ts +712 -0
- package/src/subagents/runs/background/async-job-tracker.ts +310 -0
- package/src/subagents/runs/background/async-resume.ts +345 -0
- package/src/subagents/runs/background/async-status.ts +325 -0
- package/src/subagents/runs/background/completion-dedupe.ts +63 -0
- package/src/subagents/runs/background/notify.ts +108 -0
- package/src/subagents/runs/background/parallel-groups.ts +45 -0
- package/src/subagents/runs/background/result-watcher.ts +307 -0
- package/src/subagents/runs/background/run-id-resolver.ts +83 -0
- package/src/subagents/runs/background/run-status.ts +269 -0
- package/src/subagents/runs/background/stale-run-reconciler.ts +336 -0
- package/src/subagents/runs/background/subagent-runner.ts +1808 -0
- package/src/subagents/runs/background/top-level-async.ts +13 -0
- package/src/subagents/runs/foreground/chain-clarify.ts +1333 -0
- package/src/subagents/runs/foreground/chain-execution.ts +938 -0
- package/src/subagents/runs/foreground/execution.ts +918 -0
- package/src/subagents/runs/foreground/subagent-executor.ts +2527 -0
- package/src/subagents/runs/shared/completion-guard.ts +147 -0
- package/src/subagents/runs/shared/long-running-guard.ts +175 -0
- package/src/subagents/runs/shared/mcp-direct-tool-allowlist.ts +365 -0
- package/src/subagents/runs/shared/model-fallback.ts +103 -0
- package/src/subagents/runs/shared/nested-events.ts +819 -0
- package/src/subagents/runs/shared/nested-path.ts +52 -0
- package/src/subagents/runs/shared/nested-render.ts +115 -0
- package/src/subagents/runs/shared/parallel-utils.ts +109 -0
- package/src/subagents/runs/shared/pi-args.ts +220 -0
- package/src/subagents/runs/shared/pi-spawn.ts +115 -0
- package/src/subagents/runs/shared/run-history.ts +60 -0
- package/src/subagents/runs/shared/single-output.ts +164 -0
- package/src/subagents/runs/shared/subagent-control.ts +226 -0
- package/src/subagents/runs/shared/subagent-prompt-runtime.ts +170 -0
- package/src/subagents/runs/shared/worktree.ts +577 -0
- package/src/subagents/shared/artifacts.ts +98 -0
- package/src/subagents/shared/atomic-json.ts +16 -0
- package/src/subagents/shared/file-coalescer.ts +40 -0
- package/src/subagents/shared/fork-context.ts +76 -0
- package/src/subagents/shared/formatters.ts +133 -0
- package/src/subagents/shared/jsonl-writer.ts +81 -0
- package/src/subagents/shared/model-info.ts +78 -0
- package/src/subagents/shared/post-exit-stdio-guard.ts +85 -0
- package/src/subagents/shared/session-identity.ts +10 -0
- package/src/subagents/shared/session-tokens.ts +44 -0
- package/src/subagents/shared/settings.ts +397 -0
- package/src/subagents/shared/status-format.ts +49 -0
- package/src/subagents/shared/types.ts +822 -0
- package/src/subagents/shared/utils.ts +450 -0
- package/src/subagents/slash/prompt-template-bridge.ts +397 -0
- package/src/subagents/slash/slash-bridge.ts +174 -0
- package/src/subagents/slash/slash-commands.ts +528 -0
- package/src/subagents/slash/slash-live-state.ts +292 -0
- package/src/subagents/tui/render-helpers.ts +80 -0
- package/src/subagents/tui/render.ts +1358 -0
- package/templates/agents/local/supervisor.md +33 -0
- package/templates/agents/local/task-merger.md +27 -0
- package/templates/agents/local/task-reviewer.md +30 -0
- package/templates/agents/local/task-worker.md +34 -0
- package/templates/agents/supervisor-routing.md +92 -0
- package/templates/agents/supervisor.md +229 -0
- package/templates/agents/task-merger.md +214 -0
- package/templates/agents/task-reviewer.md +260 -0
- package/templates/agents/task-worker-segment.md +44 -0
- package/templates/agents/task-worker.md +557 -0
- package/templates/tasks/CONTEXT.md +30 -0
- package/templates/tasks/EXAMPLE-001-hello-world/PROMPT.md +98 -0
- package/templates/tasks/EXAMPLE-001-hello-world/STATUS.md +73 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/PROMPT.md +97 -0
- package/templates/tasks/EXAMPLE-002-parallel-smoke/STATUS.md +73 -0
|
@@ -0,0 +1,4465 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* All types, interfaces, error classes, constants, and defaults
|
|
3
|
+
* @module orch/types
|
|
4
|
+
*/
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
import type { ExitClassification, TaskExitDiagnostic } from "./diagnostics.js";
|
|
7
|
+
// TP-189 (Cluster B): single source of truth for the worker user-tools
|
|
8
|
+
// default literal. The constants module is import-free so this does NOT
|
|
9
|
+
// create a cycle (types.ts -> tool-allowlist-constants.ts is a leaf).
|
|
10
|
+
import { DEFAULT_WORKER_USER_TOOLS } from "./tool-allowlist-constants.ts";
|
|
11
|
+
|
|
12
|
+
// ── Types ────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/**
 * Configuration from .pi/task-orchestrator.yaml
 *
 * Key naming is mixed: most keys are snake_case, while `sessionPrefix` and
 * `batchId` are camelCase. Baseline values for the required fields are
 * declared in `DEFAULT_ORCHESTRATOR_CONFIG` in this module.
 */
export interface OrchestratorConfig {
	orchestrator: {
		max_lanes: number;
		worktree_location: "sibling" | "subdirectory";
		worktree_prefix: string;
		batch_id_format: "timestamp" | "sequential";
		/** Only one spawn mode is admitted by the type. */
		spawn_mode: "subprocess";
		sessionPrefix: string;
		/** Optional operator identifier. Auto-detected from OS username if empty. */
		operator_id: string;
		/** How completed batches are integrated. manual = user runs /orch-integrate. supervised = supervisor proposes plan, asks confirmation. auto = supervisor executes without asking. */
		integration: "manual" | "supervised" | "auto";
		/**
		 * Optional pre-resolved batch ID injected by callers that already
		 * know the batch identity (e.g., resumed orchestrations). When
		 * absent, callers fall back to the `ORCH_BATCH_ID` env var or a
		 * timestamp. Read by `executeLaneV2` (execution.ts).
		 *
		 * @since TP-195 (#TBD) — documented field that was already being
		 * read at runtime via `config.orchestrator?.batchId` and asserted
		 * by the source-grep invariant in `runtime-model-fallback.test.ts`.
		 */
		batchId?: string;
	};
	dependencies: {
		source: "prompt" | "agent";
		cache: boolean;
	};
	assignment: {
		strategy: "affinity-first" | "round-robin" | "load-balanced";
		/** Weight per task size label; `DEFAULT_ORCHESTRATOR_CONFIG` defines S, M and L. */
		size_weights: Record<string, number>;
	};
	pre_warm: {
		auto_detect: boolean;
		commands: Record<string, string>;
		always: string[];
	};
	merge: {
		model: string;
		tools: string;
		/** Merge-agent thinking mode (empty = inherit session thinking) */
		thinking: string;
		verify: string[];
		order: "fewest-files-first" | "sequential";
		/** Merge agent timeout in minutes. `DEFAULT_ORCHESTRATOR_CONFIG` sets 90. Increase for large batches. */
		timeout_minutes: number;
		/** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
		exclude_extensions?: string[];
	};
	failure: {
		on_task_failure: "skip-dependents" | "stop-wave" | "stop-all";
		on_merge_failure: "pause" | "abort";
		// NOTE(review): units for the three numeric fields below are not stated in this file — confirm against their consumers.
		stall_timeout: number;
		max_worker_minutes: number;
		abort_grace_period: number;
	};
	monitoring: {
		poll_interval: number;
	};
	/** Verification baseline fingerprinting settings (TP-032). */
	verification: {
		enabled: boolean;
		mode: "strict" | "permissive";
		flaky_reruns: number;
	};
}
|
|
81
|
+
|
|
82
|
+
/**
 * Stable segment identifier.
 *
 * The type admits two shapes: `<string>::<string>`, and the same with a
 * trailing `::<number>`. `buildSegmentId` in this module produces both
 * shapes from a task ID and a repo ID.
 *
 * SegmentId is opaque — never parse by string-splitting.
 * Use structured node/record fields (`repoId`, `taskId`) instead.
 */
export type SegmentId = `${string}::${string}` | `${string}::${string}::${number}`;
|
|
89
|
+
|
|
90
|
+
/**
 * How an intra-task segment edge was produced (for observability/debugging).
 * Stored on `TaskSegmentEdge.provenance`.
 */
export type SegmentEdgeProvenance = "explicit" | "inferred";
|
|
92
|
+
|
|
93
|
+
/** Repo-scoped edge parsed from optional `## Segment DAG` prompt metadata. */
export interface PromptSegmentDagEdge {
	/** Repo ID on the source side of the edge. */
	fromRepoId: string;
	/** Repo ID on the target side of the edge. */
	toRepoId: string;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Optional explicit segment metadata parsed from PROMPT.md.
 * Attached to a task as `ParsedTask.explicitSegmentDag`.
 */
export interface PromptSegmentDagMetadata {
	/** Repo IDs participating in this task's segment graph, first-seen order. */
	repoIds: string[];
	/** Directed repo-level edges, sorted by `fromRepoId` then `toRepoId`. */
	edges: PromptSegmentDagEdge[];
}
|
|
106
|
+
|
|
107
|
+
/**
 * A parsed task from PROMPT.md, enriched for orchestrator use.
 *
 * Fields up to and including `status` are required. Every field after it is
 * optional and is filled in by a later stage, as noted on each field.
 */
export interface ParsedTask {
	taskId: string;
	taskName: string;
	reviewLevel: number;
	/** NOTE(review): presumably a key of `assignment.size_weights` (defaults: S, M, L) — confirm against the assignment code. */
	size: string;
	/** NOTE(review): presumably the IDs of the tasks this task depends on — confirm against discovery. */
	dependencies: string[];
	fileScope: string[];
	taskFolder: string;
	promptPath: string;
	areaName: string;
	status: "pending" | "complete";
	/** Repo ID declared in the PROMPT metadata (e.g., "api", "frontend"). Undefined if not declared. */
	promptRepoId?: string;
	/** Resolved repo ID after routing precedence (workspace mode only). Undefined in repo mode. */
	resolvedRepoId?: string;
	/** Optional explicit segment DAG metadata from `## Segment DAG`. */
	explicitSegmentDag?: PromptSegmentDagMetadata;
	/**
	 * Repo ID that owns task packet files (v4, TP-081).
	 * Populated by execution engine in workspace mode. Undefined in repo mode.
	 */
	packetRepoId?: string;
	/**
	 * Absolute path to task folder in the packet repo worktree (v4, TP-081).
	 * Populated by execution engine. Undefined if not yet resolved.
	 */
	packetTaskPath?: string;
	/**
	 * Segment IDs for this task (v4, TP-081).
	 * Populated from TaskSegmentPlan during execution.
	 * Typed as plain `string[]` here rather than `SegmentId[]`.
	 */
	segmentIds?: string[];
	/**
	 * Currently active segment ID (v4, TP-081).
	 * Null when no segment is active.
	 * Typed as a plain `string` here rather than `SegmentId`.
	 */
	activeSegmentId?: string | null;
	/**
	 * Step-to-segment checkbox mapping parsed from PROMPT.md `#### Segment:` markers.
	 * Populated by discovery (Phase A, TP-173). Undefined if not yet parsed.
	 */
	stepSegmentMap?: StepSegmentMapping[];
}
|
|
151
|
+
|
|
152
|
+
/**
 * Build a stable segment ID from task + repo identity (`<taskId>::<repoId>[::N]`).
 *
 * The `::N` suffix is appended only when `sequence` is a finite number of at
 * least 2; it is floored before being written. Any other `sequence` (omitted,
 * NaN, infinite, or below 2) yields the plain two-part ID.
 *
 * @param taskId - Task identifier placed first in the ID.
 * @param repoId - Repo identifier placed second in the ID.
 * @param sequence - Optional ordinal; only values >= 2 produce a suffix.
 * @returns The composed segment ID.
 */
export function buildSegmentId(taskId: string, repoId: string, sequence?: number): SegmentId {
	const suffix =
		typeof sequence === "number" && Number.isFinite(sequence) && sequence >= 2
			? `::${Math.floor(sequence)}`
			: "";
	return `${taskId}::${repoId}${suffix}` as SegmentId;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Return the repo ID carried by a structured segment record.
 *
 * This reads the `repoId` field directly; it does not inspect or split a
 * SegmentId string, because SegmentId is opaque.
 *
 * @param segment - Any record exposing a `repoId` field.
 * @returns The record's `repoId`, unchanged.
 */
export function parseSegmentIdRepo(segment: { repoId: string }): string {
	const { repoId } = segment;
	return repoId;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Build a dynamic segment expansion request ID (`exp-{timestamp}-{random5}`).
 *
 * The timestamp part is the floored `timestamp` argument, or `Date.now()`
 * when the argument is not finite. The trailing part is five lowercase
 * base-36 characters taken from `Math.random()` and right-padded with `0`
 * when fewer than five are available.
 *
 * @param timestamp - Epoch value to embed; defaults to the current time.
 * @returns The composed request ID.
 */
export function buildExpansionRequestId(timestamp = Date.now()): string {
	let stamp: number;
	if (Number.isFinite(timestamp)) {
		stamp = Math.floor(timestamp);
	} else {
		stamp = Date.now();
	}

	// Drop the leading "0." of the base-36 rendering, then keep only [a-z0-9].
	const digits = Math.random().toString(36).slice(2);
	const cleaned = digits.toLowerCase().replace(/[^a-z0-9]/g, "");
	const random5 = `${cleaned}00000`.slice(0, 5);

	return ["exp", String(stamp), random5].join("-");
}
|
|
180
|
+
|
|
181
|
+
// ── Step-Segment Mapping (Phase A: segment-scoped worker visibility) ────
|
|
182
|
+
|
|
183
|
+
/**
 * Authoritative segment-scope mode for a single worker iteration.
 *
 * - `FULL_TASK`: the worker sees the entire PROMPT.md, all steps, all checkboxes.
 *   No `Active segment ID` / `Your checkboxes for this step` prose is injected.
 *   Segment-related environment variables (`TASKPLANE_ACTIVE_SEGMENT_ID`,
 *   `TASKPLANE_SEGMENT_ID`) are hard-cleared so that runtime tools keyed on
 *   them (e.g., `request_segment_expansion`) cannot accidentally register.
 *
 * - `SEGMENT_SCOPED`: the worker is iterating a specific segment of a
 *   multi-segment task. Only that segment's steps and checkboxes are shown;
 *   `Active segment ID` is announced; segment-related env vars carry the
 *   active `segmentId`; the segment-overlay system prompt is appended.
 *
 * This is the single authoritative flag for the segment-scope decision
 * (TP-196 / #502). Call sites should derive their behaviour from this mode
 * rather than re-evaluating the underlying boolean conditions, which keeps
 * the separate branches from drifting out of sync.
 *
 * @since TP-196
 */
export type SegmentScopeMode = "FULL_TASK" | "SEGMENT_SCOPED";
|
|
205
|
+
|
|
206
|
+
/** A group of checkboxes scoped to a single repo within a step. */
export interface SegmentCheckboxGroup {
	/** Repo the checkboxes in this group are scoped to. */
	repoId: string;
	/** NOTE(review): presumably the checkbox text as written in PROMPT.md — confirm against the discovery parser. */
	checkboxes: string[];
}
|
|
211
|
+
|
|
212
|
+
/**
 * Maps a step to its repo-scoped checkbox groups.
 * Entries of this type make up `ParsedTask.stepSegmentMap`.
 */
export interface StepSegmentMapping {
	stepNumber: number;
	stepName: string;
	/** One checkbox group per repo that has checkboxes in this step. */
	segments: SegmentCheckboxGroup[];
}
|
|
218
|
+
|
|
219
|
+
/**
 * One repo-scoped segment node for a task.
 *
 * `taskId` and `repoId` are carried as structured fields so consumers never
 * need to parse them out of the opaque `segmentId`.
 */
export interface TaskSegmentNode {
	segmentId: SegmentId;
	taskId: string;
	repoId: string;
	/**
	 * Deterministic segment order within a task (0-indexed).
	 * Stable tie-break: repoId lexical order.
	 */
	order: number;
}
|
|
230
|
+
|
|
231
|
+
/** Directed edge between two segment nodes in the same task. */
export interface TaskSegmentEdge {
	/** Segment on the source side of the edge. */
	fromSegmentId: SegmentId;
	/** Segment on the target side of the edge. */
	toSegmentId: SegmentId;
	/** Whether the edge was declared explicitly or inferred. */
	provenance: SegmentEdgeProvenance;
	/** Optional explanation of why this edge exists (debug/telemetry aid). */
	reason?: string;
}
|
|
239
|
+
|
|
240
|
+
/**
 * Deterministic segment plan for one task.
 *
 * Ordering contract:
 * - `segments`: sorted by `order`, then `repoId`
 * - `edges`: sorted by `fromSegmentId`, then `toSegmentId`
 */
export interface TaskSegmentPlan {
	/** Task this plan belongs to. */
	taskId: string;
	segments: TaskSegmentNode[];
	edges: TaskSegmentEdge[];
	/**
	 * How the plan was derived:
	 * explicit-dag: parsed from prompt metadata
	 * inferred-sequential: deterministic fallback inference
	 * repo-singleton: repo mode fallback (`resolvedRepoId ?? "default"`)
	 */
	mode: "explicit-dag" | "inferred-sequential" | "repo-singleton";
}
|
|
258
|
+
|
|
259
|
+
/**
 * Directed edge between repos requested in a dynamic segment expansion.
 * Used as the element type of `SegmentExpansionRequest.edges`.
 */
export interface SegmentExpansionEdge {
	/** Source side of the edge. NOTE(review): presumably a repo ID — confirm against the request handler. */
	from: string;
	/** Target side of the edge. NOTE(review): presumably a repo ID — confirm against the request handler. */
	to: string;
}
|
|
264
|
+
|
|
265
|
+
/**
 * File IPC payload for worker-initiated dynamic segment expansion requests.
 *
 * Written to: `.pi/mailbox/{batchId}/{agentId}/outbox/segment-expansion-{requestId}.json`
 */
export interface SegmentExpansionRequest {
	/** Unique request ID: `exp-{timestamp}-{random5}` (the format `buildExpansionRequestId` produces). */
	requestId: string;
	/** Task ID making the expansion request. */
	taskId: string;
	/** Segment active when the request was emitted. */
	fromSegmentId: SegmentId;
	/** Repo IDs the worker is requesting the engine to add. */
	requestedRepoIds: string[];
	/** Human rationale from the worker. */
	rationale: string;
	/** Placement directive for inserting new segments. */
	placement: "after-current" | "end";
	/**
	 * Optional inter-request ordering edges.
	 * The property itself is required by this type.
	 * NOTE(review): presumably an empty array means "no ordering" — confirm against the request handler.
	 */
	edges: SegmentExpansionEdge[];
	/** Epoch milliseconds when the request was emitted. */
	timestamp: number;
}
|
|
288
|
+
|
|
289
|
+
/**
 * TaskId-keyed segment plans.
 * Iteration order must be deterministic: sort task IDs lexicographically.
 *
 * A `Map` iterates in insertion order, so the type itself does not enforce
 * this; consumers have to sort the keys (or insert them pre-sorted).
 */
export type TaskSegmentPlanMap = Map<string, TaskSegmentPlan>;
|
|
294
|
+
|
|
295
|
+
/** A wave: a group of tasks whose dependencies are all satisfied */
export interface WaveAssignment {
	waveNumber: number;
	/** The wave's tasks, each paired with the lane it is assigned to. */
	tasks: LaneAssignment[];
}
|
|
300
|
+
|
|
301
|
+
/** A task assigned to a specific lane within a wave */
export interface LaneAssignment {
	/** NOTE(review): presumably equal to `task.taskId` — confirm where assignments are built. */
	taskId: string;
	/** Lane the task is assigned to. */
	lane: number;
	/** The full parsed task record. */
	task: ParsedTask;
	/** Repo ID this task targets (workspace mode only). Undefined in repo mode. */
	repoId?: string;
}
|
|
309
|
+
|
|
310
|
+
/**
 * Runtime state of the entire batch execution.
 *
 * `freshBatchState()` in this module returns the initial value: phase
 * `"idle"`, empty ID and collections, and zeroed counters.
 */
export interface BatchState {
	phase: "idle" | "planning" | "running" | "paused" | "merging" | "complete" | "error" | "aborted";
	batchId: string;
	waves: WaveAssignment[];
	currentWave: number;
	tasksTotal: number;
	tasksComplete: number;
	tasksFailed: number;
	laneCount: number;
	/** Per-lane status. NOTE(review): the numeric key is presumably the lane number — confirm against the writer. */
	laneStatuses: Map<number, LaneStatus>;
	startTime: number;
	errors: string[];
}
|
|
324
|
+
|
|
325
|
+
/** Per-lane runtime status; the value type of `BatchState.laneStatuses`. */
export interface LaneStatus {
	lane: number;
	/** Task currently associated with the lane; the type allows `null`. */
	taskId: string | null;
	status: "idle" | "running" | "complete" | "failed" | "stalled";
	stepProgress: string;
	iteration: number;
	elapsed: number;
	/** NOTE(review): `spawn_mode` in `OrchestratorConfig` only admits "subprocess", so confirm what this tmux-named field holds today. */
	tmuxSession: string;
}
|
|
335
|
+
|
|
336
|
+
/**
 * Task area definition from task-runner.yaml.
 * Used as the value type of `TaskRunnerConfig.task_areas`.
 */
export interface TaskArea {
	path: string;
	prefix: string;
	context: string;
	/** Optional repo ID for routing tasks in this area (workspace mode only). */
	repoId?: string;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Subset of task-runner.yaml that the orchestrator needs.
 *
 * Only `task_areas` and `reference_docs` are required. Baseline values are
 * declared in `DEFAULT_TASK_RUNNER_CONFIG` in this module.
 */
export interface TaskRunnerConfig {
	task_areas: Record<string, TaskArea>;
	reference_docs: Record<string, string>;
	/** Named testing/verification commands (e.g., { test: "node --test tests/*.test.ts" }). Used for baseline fingerprinting (TP-032). */
	testing_commands?: Record<string, string>;
	/**
	 * Model fallback behavior when a configured model becomes unavailable mid-batch.
	 * - `"inherit"` (default): Retry without explicit model (session model fallback).
	 * - `"fail"`: No model substitution — normal failure path.
	 * @since TP-055
	 */
	model_fallback?: "inherit" | "fail";
	/**
	 * Reviewer agent model/thinking/tools configuration.
	 * Threaded through to `spawnReviewer()` via env vars.
	 * @since TP-160
	 */
	reviewer?: {
		/** Model string (empty = inherit session default) */
		model: string;
		/** Thinking mode ("on" | "off" | budget string, empty = inherit) */
		thinking: string;
		/** Comma-separated tool allowlist */
		tools: string;
		/** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
		excludeExtensions?: string[];
	};
	/**
	 * Worker agent model/thinking/tools configuration.
	 * Threaded through to `spawnAgent()` via env vars.
	 * @since TP-181
	 */
	worker?: {
		/** Model string (empty = inherit session default) */
		model: string;
		/** Thinking mode ("on" | "off" | budget string, empty = inherit) */
		thinking: string;
		/** Comma-separated tool allowlist */
		tools: string;
		/** Package specifiers to exclude from extension forwarding (exact match). @since TP-180 */
		excludeExtensions?: string[];
	};
	/**
	 * Worker agent extension exclusion list. @since TP-180
	 * NOTE(review): `worker.excludeExtensions` above is a second place for worker exclusions — confirm how the two are combined.
	 */
	workerExcludeExtensions?: string[];
}
|
|
391
|
+
|
|
392
|
+
/** Result of a preflight run: an overall verdict plus the individual checks. */
export interface PreflightResult {
	/** Overall verdict. NOTE(review): how `warn` checks affect this flag is decided by the producer — confirm there. */
	passed: boolean;
	checks: PreflightCheck[];
}
|
|
397
|
+
|
|
398
|
+
/** Individual preflight check; the element type of `PreflightResult.checks`. */
export interface PreflightCheck {
	name: string;
	status: "pass" | "fail" | "warn";
	message: string;
	/** Optional extra text accompanying `message`. */
	hint?: string;
}
|
|
405
|
+
|
|
406
|
+
// ── Defaults ─────────────────────────────────────────────────────────
|
|
407
|
+
|
|
408
|
+
/**
 * Baseline `OrchestratorConfig` values.
 *
 * The optional fields `orchestrator.batchId` and `merge.exclude_extensions`
 * are intentionally left unset here.
 * NOTE(review): presumably merged underneath the user's YAML by the config
 * loader — confirm there.
 */
export const DEFAULT_ORCHESTRATOR_CONFIG: OrchestratorConfig = {
	orchestrator: {
		max_lanes: 3,
		worktree_location: "subdirectory",
		worktree_prefix: "orchid-wt",
		batch_id_format: "timestamp",
		spawn_mode: "subprocess",
		sessionPrefix: "orch",
		operator_id: "",
		integration: "manual",
	},
	dependencies: {
		source: "prompt",
		cache: true,
	},
	assignment: {
		strategy: "affinity-first",
		size_weights: { S: 1, M: 2, L: 4 },
	},
	pre_warm: {
		auto_detect: false,
		commands: {},
		always: [],
	},
	merge: {
		model: "",
		// TP-189 (Cluster B): merge default sourced from the import-free
		// `tool-allowlist-constants.ts` module. The previous concern about
		// importing from `agent-host.ts` (which DOES depend on types.ts and
		// would create a cycle) no longer applies because the constant
		// lives in a leaf module that imports nothing.
		tools: DEFAULT_WORKER_USER_TOOLS,
		thinking: "off",
		verify: [],
		order: "fewest-files-first",
		// Minutes, per the `timeout_minutes` field documentation.
		timeout_minutes: 90,
	},
	failure: {
		on_task_failure: "skip-dependents",
		on_merge_failure: "pause",
		stall_timeout: 30,
		max_worker_minutes: 30,
		abort_grace_period: 60,
	},
	monitoring: {
		poll_interval: 5,
	},
	verification: {
		enabled: false,
		mode: "permissive",
		flaky_reruns: 1,
	},
};
|
|
461
|
+
|
|
462
|
+
/**
 * Baseline `TaskRunnerConfig` values: no task areas, no reference docs, and
 * `"inherit"` model fallback. All other optional fields are left unset.
 */
export const DEFAULT_TASK_RUNNER_CONFIG: TaskRunnerConfig = {
	task_areas: {},
	reference_docs: {},
	model_fallback: "inherit",
};
|
|
467
|
+
|
|
468
|
+
// ── Helpers ──────────────────────────────────────────────────────────
|
|
469
|
+
|
|
470
|
+
/**
 * Create a pristine batch state.
 *
 * Every call builds new `waves`, `errors` and `laneStatuses` containers, so
 * two returned states never share mutable collections.
 *
 * @returns A state in phase `"idle"` with an empty batch ID, empty
 * collections, and all counters (including `startTime`) set to 0.
 */
export function freshBatchState(): BatchState {
	const initial: BatchState = {
		phase: "idle",
		batchId: "",
		waves: [],
		currentWave: 0,
		tasksTotal: 0,
		tasksComplete: 0,
		tasksFailed: 0,
		laneCount: 0,
		laneStatuses: new Map(),
		startTime: 0,
		errors: [],
	};
	return initial;
}
|
|
485
|
+
|
|
486
|
+
// ── Worktree Types ───────────────────────────────────────────────────
|
|
487
|
+
|
|
488
|
+
/**
 * Information about a created worktree. Returned by createWorktree().
 * The matching input type is `CreateWorktreeOptions`.
 */
export interface WorktreeInfo {
	/** Absolute filesystem path to the worktree directory */
	path: string;
	/** Branch name checked out in the worktree (e.g. task/lane-1-20260308T111750) */
	branch: string;
	/** Lane number (1-indexed) this worktree is assigned to */
	laneNumber: number;
}
|
|
497
|
+
|
|
498
|
+
/** Options for createWorktree(). Every field except `config` is required. */
export interface CreateWorktreeOptions {
	/** Lane number (1-indexed) */
	laneNumber: number;
	/** Batch ID timestamp (e.g. "20260308T111750") */
	batchId: string;
	/** Branch to base the worktree on (e.g. "develop") */
	baseBranch: string;
	/** Worktree directory prefix (e.g. "orchid-wt") */
	prefix: string;
	/** Operator identifier (sanitized, e.g., "henrylach") */
	opId: string;
	/** Full orchestrator config (optional; used for worktree_location) */
	config?: OrchestratorConfig;
}
|
|
513
|
+
|
|
514
|
+
/**
 * Stable error codes for worktree operations.
 * Carried on `WorktreeError.code`.
 *
 * - WORKTREE_PATH_IS_WORKTREE: path already registered as a git worktree
 * - WORKTREE_PATH_NOT_EMPTY: path exists and is a non-empty non-worktree dir
 * - WORKTREE_BRANCH_EXISTS: branch name already exists (checked out elsewhere)
 * - WORKTREE_INVALID_BASE: base branch does not exist
 * - WORKTREE_GIT_ERROR: unexpected git command failure
 * - WORKTREE_VERIFY_FAILED: post-creation/reset verification failed
 * - WORKTREE_REMOVE_FAILED: worktree removal failed (even after retries)
 * - WORKTREE_REMOVE_RETRY_EXHAUSTED: all retry attempts for worktree removal exhausted (Windows file locking)
 * - WORKTREE_BRANCH_DELETE_FAILED: branch deletion failed after successful worktree removal
 * - WORKTREE_NOT_FOUND: worktree path does not exist on disk
 * - WORKTREE_NOT_REGISTERED: path exists but is not a registered git worktree
 * - WORKTREE_DIRTY: worktree has uncommitted changes (cannot reset)
 * - WORKTREE_RESET_FAILED: git checkout -B reset command failed
 */
export type WorktreeErrorCode =
	| "WORKTREE_PATH_IS_WORKTREE"
	| "WORKTREE_PATH_NOT_EMPTY"
	| "WORKTREE_BRANCH_EXISTS"
	| "WORKTREE_INVALID_BASE"
	| "WORKTREE_GIT_ERROR"
	| "WORKTREE_VERIFY_FAILED"
	| "WORKTREE_REMOVE_FAILED"
	| "WORKTREE_REMOVE_RETRY_EXHAUSTED"
	| "WORKTREE_BRANCH_DELETE_FAILED"
	| "WORKTREE_NOT_FOUND"
	| "WORKTREE_NOT_REGISTERED"
	| "WORKTREE_DIRTY"
	| "WORKTREE_RESET_FAILED";
|
|
545
|
+
|
|
546
|
+
/** Typed error class for worktree operations with stable error codes. */
|
|
547
|
+
export class WorktreeError extends Error {
|
|
548
|
+
code: WorktreeErrorCode;
|
|
549
|
+
|
|
550
|
+
constructor(code: WorktreeErrorCode, message: string) {
|
|
551
|
+
super(message);
|
|
552
|
+
this.name = "WorktreeError";
|
|
553
|
+
this.code = code;
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
/**
 * Result of a removeWorktree() operation.
 *
 * Provides status flags so callers can branch on outcome without
 * catching errors for expected idempotent scenarios.
 */
export interface RemoveWorktreeResult {
  /** Whether the worktree directory was removed in this call */
  removed: boolean;
  /** Whether the worktree was already absent (idempotent no-op) */
  alreadyRemoved: boolean;
  /** Whether the lane branch was deleted (or was already absent) */
  branchDeleted: boolean;
  /** Whether the lane branch was preserved (unmerged commits detected) */
  branchPreserved: boolean;
  /** The saved branch name (if preserved) */
  savedBranch?: string;
  /** Number of unmerged commits (if preserved) */
  unmergedCount?: number;
}
|
|
577
|
+
|
|
578
|
+
// ── Bulk Operation Types ─────────────────────────────────────────────
|
|
579
|
+
|
|
580
|
+
/** Error from a single worktree within a bulk operation. */
export interface BulkWorktreeError {
  /** Lane number that failed */
  laneNumber: number;
  /** Error code from WorktreeError (if available) */
  code: WorktreeErrorCode | "UNKNOWN";
  /** Human-readable error message */
  message: string;
}
|
|
589
|
+
|
|
590
|
+
/**
 * Result of createLaneWorktrees() bulk creation.
 *
 * On success: `success=true`, `worktrees` contains all created WorktreeInfos.
 * On failure: `success=false`, `errors` lists per-lane failures,
 * `rolledBack` indicates whether cleanup of partial state succeeded.
 */
export interface CreateLaneWorktreesResult {
  /** Whether all lane worktrees were created successfully */
  success: boolean;
  /** Created worktrees (sorted by laneNumber). Empty on failure if rolled back. */
  worktrees: WorktreeInfo[];
  /** Per-lane errors encountered during creation */
  errors: BulkWorktreeError[];
  /** Whether rollback of partially-created worktrees succeeded (only relevant on failure) */
  rolledBack: boolean;
  /** Errors encountered during rollback (if any) */
  rollbackErrors: BulkWorktreeError[];
}
|
|
609
|
+
|
|
610
|
+
/**
 * Per-worktree outcome within removeAllWorktrees().
 */
export interface RemoveWorktreeOutcome {
  /** The worktree that was targeted for removal */
  worktree: WorktreeInfo;
  /** The removal result (null if removal threw an error) */
  result: RemoveWorktreeResult | null;
  /** Error encountered during removal (null on success) */
  error: BulkWorktreeError | null;
}
|
|
621
|
+
|
|
622
|
+
/**
 * Result of removeAllWorktrees() bulk removal.
 *
 * Best-effort: continues on per-worktree errors (does not fail-fast).
 */
export interface RemoveAllWorktreesResult {
  /** Total worktrees found matching the prefix */
  totalAttempted: number;
  /** Successfully removed (or already removed) worktrees */
  removed: WorktreeInfo[];
  /** Worktrees that failed to remove */
  failed: RemoveWorktreeOutcome[];
  /** All per-worktree outcomes in order */
  outcomes: RemoveWorktreeOutcome[];
  /** Branches preserved (had unmerged commits) */
  preserved: Array<{
    branch: string;
    savedBranch: string;
    laneNumber: number;
    unmergedCount?: number;
  }>;
}
|
|
644
|
+
|
|
645
|
+
// ── Discovery Types ──────────────────────────────────────────────────
|
|
646
|
+
|
|
647
|
+
/** Structured error from the discovery phase with diagnostic context */
export interface DiscoveryError {
  /** Stable discovery error code. */
  code:
    | "PARSE_MISSING_ID"
    | "PARSE_MALFORMED"
    | "DUPLICATE_ID"
    | "UNKNOWN_ARG"
    | "SCAN_ERROR"
    | "DEP_UNRESOLVED"
    | "DEP_PENDING"
    | "DEP_AMBIGUOUS"
    | "DEP_SOURCE_FALLBACK"
    | "TASK_REPO_UNRESOLVED"
    | "TASK_REPO_UNKNOWN"
    | "TASK_ROUTING_STRICT"
    | "SEGMENT_DAG_INVALID"
    | "SEGMENT_REPO_UNKNOWN"
    | "SEGMENT_STEP_DUPLICATE_REPO"
    | "SEGMENT_STEP_EMPTY"
    | "SEGMENT_STEP_REPO_INVALID";
  /** Human-readable error message. */
  message: string;
  /** Path of the task the error relates to, when one is known. */
  taskPath?: string;
  /** ID of the task the error relates to, when one is known. */
  taskId?: string;
}
|
|
671
|
+
|
|
672
|
+
/**
 * Discovery error codes that are fatal (block planning/execution).
 *
 * Used by formatDiscoveryResults, extension.ts, and engine.ts for
 * consistent fatal-error classification. Keep in sync with the
 * DiscoveryError.code union above.
 *
 * NOTE(review): several codes in that union (for example "SEGMENT_STEP_EMPTY"
 * and "SEGMENT_STEP_REPO_INVALID") are not listed here and are therefore
 * classified as non-fatal by this list — confirm that is intentional.
 */
export const FATAL_DISCOVERY_CODES: ReadonlyArray<DiscoveryError["code"]> = [
  "DUPLICATE_ID",
  "DEP_UNRESOLVED",
  "DEP_PENDING",
  "DEP_AMBIGUOUS",
  "PARSE_MISSING_ID",
  "TASK_REPO_UNRESOLVED",
  "TASK_REPO_UNKNOWN",
  "TASK_ROUTING_STRICT",
  "SEGMENT_DAG_INVALID",
  "SEGMENT_REPO_UNKNOWN",
  "SEGMENT_STEP_DUPLICATE_REPO",
] as const;
|
|
692
|
+
|
|
693
|
+
/** Result of the full discovery pipeline */
export interface DiscoveryResult {
  /** Pending tasks as a map of ParsedTask values with string keys (presumably task IDs — confirm). */
  pending: Map<string, ParsedTask>;
  /** Set of completed-task strings (presumably task IDs — confirm). */
  completed: Set<string>;
  /** Errors collected during discovery. */
  errors: DiscoveryError[];
}
|
|
699
|
+
|
|
700
|
+
// ── Wave Computation Types ───────────────────────────────────────────
|
|
701
|
+
|
|
702
|
+
/** Dependency graph: adjacency list (task → tasks it depends on) */
export interface DependencyGraph {
  /** Map from task ID to list of task IDs it depends on (predecessors) */
  dependencies: Map<string, string[]>;
  /** Map from task ID to list of task IDs that depend on it (successors) */
  dependents: Map<string, string[]>;
  /** All task IDs in the graph (pending only, not completed) */
  nodes: Set<string>;
}
|
|
711
|
+
|
|
712
|
+
/** Result of graph validation */
export interface GraphValidationResult {
  /** Whether the graph passed validation. */
  valid: boolean;
  /** Errors reported by validation. */
  errors: DiscoveryError[];
}
|
|
717
|
+
|
|
718
|
+
/** Result of wave computation */
export interface WaveComputationResult {
  /** Computed wave assignments. */
  waves: WaveAssignment[];
  /** Errors reported by wave computation. */
  errors: DiscoveryError[];
  /** Optional task→segment planning map (TP-080, additive contract). */
  segmentPlans?: TaskSegmentPlanMap;
}
|
|
725
|
+
|
|
726
|
+
// ── Lane Allocation (Phase 3) ────────────────────────────────────────
|
|
727
|
+
|
|
728
|
+
/**
 * Error codes specific to lane allocation.
 *
 * - ALLOC_INVALID_CONFIG: configuration validation failed
 * - ALLOC_EMPTY_WAVE: no tasks provided for allocation
 * - ALLOC_WORKTREE_FAILED: worktree creation failed (includes rollback info)
 * - ALLOC_TASK_NOT_FOUND: task ID from wave not found in pending map
 */
export type AllocationErrorCode =
  | "ALLOC_INVALID_CONFIG"
  | "ALLOC_EMPTY_WAVE"
  | "ALLOC_WORKTREE_FAILED"
  | "ALLOC_TASK_NOT_FOUND";

/** Typed error for lane allocation failures. */
export class AllocationError extends Error {
  /** Stable code identifying which allocation failure occurred. */
  code: AllocationErrorCode;
  /** Optional extra detail text supplied by the thrower. */
  details?: string;

  /**
   * @param code - Stable allocation error code.
   * @param message - Human-readable description, forwarded to `Error`.
   * @param details - Optional additional detail; stored as-is (undefined when omitted).
   */
  constructor(code: AllocationErrorCode, message: string, details?: string) {
    super(message);
    // Set explicitly so the error reports as "AllocationError" rather than "Error".
    this.name = "AllocationError";
    this.code = code;
    this.details = details;
  }
}
|
|
754
|
+
|
|
755
|
+
/**
 * A task assigned within a lane, with its ordering position.
 *
 * Tasks within a lane execute sequentially in `order` (ascending).
 * The ordering is deterministic given the same input.
 */
export interface AllocatedTask {
  /** Task ID (e.g., "TO-014") */
  taskId: string;
  /** Execution order within the lane (0-indexed) */
  order: number;
  /** Full parsed task metadata */
  task: ParsedTask;
  /** Estimated duration in minutes */
  estimatedMinutes: number;
}
|
|
771
|
+
|
|
772
|
+
/**
 * A fully-allocated lane ready for execution.
 *
 * Contains everything Steps 2-3 need to run lane sessions,
 * monitor progress, and identify the lane. This is the contract
 * between Step 1 (allocation) and Step 2 (execution).
 */
export interface AllocatedLane {
  /** Lane number (1-indexed, deterministic, globally unique across repos) */
  laneNumber: number;
  /** Lane identifier for display and logging (e.g., "lane-1") */
  laneId: string;
  /** Lane session identifier (e.g., "orch-lane-1") — used by Step 2 */
  laneSessionId: string;
  /** Absolute path to the lane's worktree directory */
  worktreePath: string;
  /** Git branch name checked out in the worktree */
  branch: string;
  /** Tasks assigned to this lane, ordered for sequential execution */
  tasks: AllocatedTask[];
  /** Assignment strategy that was used (for diagnostics) */
  strategy: "affinity-first" | "round-robin" | "load-balanced";
  /** Total estimated load (sum of task weights) */
  estimatedLoad: number;
  /** Total estimated duration in minutes (sum of task durations) */
  estimatedMinutes: number;
  /** Repo ID this lane targets (workspace mode only). Undefined in repo mode. */
  repoId?: string;
}
|
|
801
|
+
|
|
802
|
+
// ── Execution Types & Contracts ──────────────────────────────────────
|
|
803
|
+
|
|
804
|
+
/**
 * Lifecycle status for a single task within lane execution.
 *
 * State machine:
 *   pending → running → succeeded
 *                     → failed
 *                     → stalled
 *   pending → skipped (pause/abort before task starts, or prior task failed)
 */
export type LaneTaskStatus = "pending" | "running" | "succeeded" | "failed" | "stalled" | "skipped";
|
|
814
|
+
|
|
815
|
+
/**
 * Embedded telemetry attached to a lane task outcome.
 *
 * Populated by Runtime V2 lane-runner at emission time so downstream
 * consumers (batch history, diagnostics) can read authoritative usage
 * without reconstructing task↔lane joins from snapshot keys.
 */
export interface LaneTaskOutcomeTelemetry {
  /** Total input tokens for this task outcome. */
  inputTokens: number;
  /** Total output tokens for this task outcome. */
  outputTokens: number;
  /** Total cache-read tokens for this task outcome. */
  cacheReadTokens: number;
  /** Total cache-write tokens for this task outcome. */
  cacheWriteTokens: number;
  /** Cumulative cost in USD for this task outcome. */
  costUsd: number;
  /** Number of tool calls made while producing this outcome. */
  toolCalls: number;
  /** End-to-end duration in milliseconds for this outcome. */
  durationMs: number;
}
|
|
838
|
+
|
|
839
|
+
/**
 * Outcome of a single task execution within a lane.
 *
 * Produced by `executeLane()` for each task in the lane's task list.
 * Consumed by Step 3 (monitoring) and Step 4 (wave policy logic).
 */
export interface LaneTaskOutcome {
  /** Task identifier (e.g., "TO-014") */
  taskId: string;
  /** Final task status */
  status: LaneTaskStatus;
  /** Segment identifier for segment-aware execution (null for whole-task units). */
  segmentId?: string | null;
  /** When execution started (epoch ms), null if never started (skipped) */
  startTime: number | null;
  /** When execution ended (epoch ms), null if still pending */
  endTime: number | null;
  /** Human-readable reason for the outcome */
  exitReason: string;
  /** Lane session name used for this task (e.g., "orch-lane-1") */
  sessionName: string;
  /** Whether .DONE file was found */
  doneFileFound: boolean;
  /**
   * Lane number that produced this task outcome (1-indexed).
   *
   * Optional for backward compatibility with pre-TP-116 persisted state.
   */
  laneNumber?: number;
  /**
   * Embedded task-level telemetry (authoritative for Runtime V2).
   *
   * Optional for backward compatibility and non-agent outcomes
   * (for example skipped tasks).
   */
  telemetry?: LaneTaskOutcomeTelemetry;
  /**
   * Number of commits preserved as partial progress for a failed task.
   * 0 when no partial progress was saved (succeeded tasks, no commits, etc.).
   * Optional for backward compatibility — defaults to 0 when absent.
   */
  partialProgressCommits?: number;
  /**
   * Saved branch name holding partial progress for a failed task.
   * Undefined when no partial progress was saved.
   * Optional for backward compatibility.
   */
  partialProgressBranch?: string;
  /**
   * Structured exit diagnostic for this task (v3, TP-030).
   *
   * Canonical structured exit data — preferred over the legacy `exitReason`
   * string when present. Produced by `classifyExit()` after session ends,
   * then enriched with progress/context metadata.
   *
   * Optional: absent for tasks that haven't exited yet, and for
   * backward compatibility with pre-v3 code paths.
   * Consumers should check `exitDiagnostic` first, falling back to
   * `exitReason` for display.
   */
  exitDiagnostic?: TaskExitDiagnostic;
}
|
|
901
|
+
|
|
902
|
+
/**
 * Overall result of executing all tasks in a lane.
 *
 * The lane runs tasks sequentially. If a task fails and the lane
 * has remaining tasks, those remaining tasks are marked as `skipped`.
 */
export interface LaneExecutionResult {
  /** Lane number (1-indexed) */
  laneNumber: number;
  /** Lane identifier for display (e.g., "lane-1") */
  laneId: string;
  /** Per-task outcomes in execution order */
  tasks: LaneTaskOutcome[];
  /**
   * Aggregate lane status: succeeded if all tasks succeeded, failed if any failed.
   * NOTE(review): the type also allows "partial"; its exact condition is not
   * stated here — confirm against executeLane().
   */
  overallStatus: "succeeded" | "failed" | "partial";
  /** When lane execution started (epoch ms) */
  startTime: number;
  /** When lane execution ended (epoch ms) */
  endTime: number;
}
|
|
922
|
+
|
|
923
|
+
// ── Execution Constants ──────────────────────────────────────────────
|
|
924
|
+
|
|
925
|
+
/**
 * Grace period (ms) after a lane session exits before declaring failure.
 * Allows time for .DONE file to be flushed to disk on slow filesystems.
 */
export const DONE_GRACE_MS = 5_000;

/**
 * Polling interval (ms) for checking session liveness and .DONE file.
 */
export const EXECUTION_POLL_INTERVAL_MS = 2_000;

/**
 * Maximum retries for legacy lane-session spawn failures.
 * Only transient failures (session name collision) are retried.
 */
export const SESSION_SPAWN_RETRY_MAX = 2;
|
|
941
|
+
|
|
942
|
+
// ── Execution Error Types ────────────────────────────────────────────
|
|
943
|
+
|
|
944
|
+
/**
 * Error codes for lane execution failures.
 *
 * - EXEC_SPAWN_FAILED: Lane session could not be created after retries
 * - EXEC_TASK_FAILED: task completed without .DONE (non-zero exit)
 * - EXEC_TASK_STALLED: STATUS.md unchanged for stall_timeout (handled by Step 3)
 * - EXEC_TASK_STAGE_FAILED: git add failed for task files
 * - EXEC_TASK_COMMIT_FAILED: git commit failed for staged task files
 * - EXEC_TMUX_NOT_AVAILABLE: Legacy `tmux` binary not found (compat path)
 * - EXEC_WORKTREE_MISSING: lane worktree path doesn't exist
 * - EXEC_MISSING_TASK_FOLDER: task folder is missing
 *   (NOTE(review): meaning inferred from the name — confirm the exact trigger)
 */
export type ExecutionErrorCode =
  | "EXEC_SPAWN_FAILED"
  | "EXEC_TASK_FAILED"
  | "EXEC_TASK_STALLED"
  | "EXEC_TASK_STAGE_FAILED"
  | "EXEC_TASK_COMMIT_FAILED"
  | "EXEC_TMUX_NOT_AVAILABLE"
  | "EXEC_WORKTREE_MISSING"
  | "EXEC_MISSING_TASK_FOLDER";

/** Typed error for lane execution failures. */
export class ExecutionError extends Error {
  /** Stable code identifying which execution failure occurred. */
  code: ExecutionErrorCode;
  /** Lane identifier the failure relates to, when supplied. */
  laneId?: string;
  /** Task identifier the failure relates to, when supplied. */
  taskId?: string;

  /**
   * @param code - Stable execution error code.
   * @param message - Human-readable description, forwarded to `Error`.
   * @param laneId - Optional lane identifier; stored as-is.
   * @param taskId - Optional task identifier; stored as-is.
   */
  constructor(code: ExecutionErrorCode, message: string, laneId?: string, taskId?: string) {
    super(message);
    // Set explicitly so the error reports as "ExecutionError" rather than "Error".
    this.name = "ExecutionError";
    this.code = code;
    this.laneId = laneId;
    this.taskId = taskId;
  }
}
|
|
979
|
+
|
|
980
|
+
// ── Monitoring Types & Contracts ─────────────────────────────────────
|
|
981
|
+
|
|
982
|
+
/**
 * Snapshot of a single task's monitored state at a point in time.
 *
 * Produced by `resolveTaskMonitorState()` from combining:
 * - .DONE file presence
 * - Lane-session liveness
 * - STATUS.md parse results
 * - STATUS.md mtime for stall detection
 */
export interface TaskMonitorSnapshot {
  /** Task ID (e.g., "TO-014") */
  taskId: string;
  /** Resolved monitoring status */
  status: "pending" | "running" | "succeeded" | "failed" | "stalled" | "skipped" | "unknown";
  /** Current step name (e.g., "Implement Service Layer"), null if not parsed */
  currentStepName: string | null;
  /** Current step number, null if not parsed */
  currentStepNumber: number | null;
  /** Total steps in the task */
  totalSteps: number;
  /** Checked checkbox count across all steps */
  totalChecked: number;
  /** Total checkbox count across all steps */
  totalItems: number;
  /** Whether the lane session is alive */
  sessionAlive: boolean;
  /** Whether the .DONE file was found */
  doneFileFound: boolean;
  /** Stall reason (null if not stalled) */
  stallReason: string | null;
  /** Epoch ms of last known STATUS.md modification */
  lastHeartbeat: number | null;
  /** Epoch ms when this snapshot was taken */
  observedAt: number;
  /** Reason string if STATUS.md couldn't be read */
  parseError: string | null;
  /** Worker iteration number from STATUS.md */
  iteration: number;
  /** Review counter from STATUS.md */
  reviewCounter: number;
}
|
|
1023
|
+
|
|
1024
|
+
/**
 * Per-lane monitoring snapshot aggregating task-level snapshots.
 */
export interface LaneMonitorSnapshot {
  /** Lane identifier (e.g., "lane-1") */
  laneId: string;
  /** Lane number (1-indexed) */
  laneNumber: number;
  /** Lane session name (e.g., "orch-lane-1") */
  sessionName: string;
  /** Whether the lane session is alive right now */
  sessionAlive: boolean;
  /** Current task being executed (null if lane is idle/complete) */
  currentTaskId: string | null;
  /** Snapshot of the current task (null if no current task) */
  currentTaskSnapshot: TaskMonitorSnapshot | null;
  /** Task IDs that have completed (succeeded) */
  completedTasks: string[];
  /** Task IDs that failed or stalled */
  failedTasks: string[];
  /** Task IDs not yet started */
  remainingTasks: string[];
}
|
|
1047
|
+
|
|
1048
|
+
/**
 * Aggregate monitoring state across all lanes.
 *
 * This is the primary data contract consumed by:
 * - Step 4 (wave execution loop) for failure policy decisions
 * - Step 6 (dashboard widget) for rendering
 */
export interface MonitorState {
  /** Per-lane snapshots */
  lanes: LaneMonitorSnapshot[];
  /** Overall progress: tasks done / total */
  tasksDone: number;
  /** Count of failed tasks. */
  tasksFailed: number;
  /** Total task count used as the progress denominator. */
  tasksTotal: number;
  /** Current wave number */
  waveNumber: number;
  /** Number of poll cycles completed */
  pollCount: number;
  /** Epoch ms of last poll */
  lastPollTime: number;
  /** Whether all lanes have reached terminal state */
  allTerminal: boolean;
}
|
|
1071
|
+
|
|
1072
|
+
/**
 * Per-task mtime tracker for stall detection.
 *
 * Tracks when we first observed the task (for startup grace),
 * last known STATUS.md mtime, and stall timer state.
 */
export interface MtimeTracker {
  /** Task ID */
  taskId: string;
  /** Epoch ms when we first observed this task running */
  firstObservedAt: number;
  /** Whether we've successfully read STATUS.md at least once */
  statusFileSeenOnce: boolean;
  /** Last known STATUS.md mtime (epoch ms), null if never read */
  lastMtime: number | null;
  /** Epoch ms when the stall timer started (mtime stopped changing) */
  stallTimerStart: number | null;
}
|
|
1090
|
+
|
|
1091
|
+
// ── Wave Execution Types & Contracts ─────────────────────────────────
|
|
1092
|
+
|
|
1093
|
+
/**
 * Failure policy action matrix.
 *
 * Defines what happens to tasks in different states when a failure occurs,
 * depending on the configured failure policy.
 *
 * | Task State    | skip-dependents           | stop-wave              | stop-all               |
 * |---------------|---------------------------|------------------------|------------------------|
 * | In-flight     | Continue running          | Continue running       | Kill immediately       |
 * | Queued (lane) | Continue if not dependent | Skip remaining in lane | Skip remaining in lane |
 * | Future waves  | Prune transitive deps     | Don't start next wave  | Don't start any more   |
 *
 * Ownership contract:
 * - executeLane() is source-of-truth for terminal task status
 * - monitorLanes() runs as sibling async loop, can kill stalled sessions
 * - executeWave() coordinates both and applies policy
 * - Monitor's stall-kill does NOT conflict with executeLane() because
 *   executeLane() polls session liveness and will see the killed session
 */
|
|
1112
|
+
|
|
1113
|
+
/**
 * Result of executing a single wave.
 *
 * Consumed by:
 * - Step 5 (/orch command) for wave-to-wave progression decisions
 * - Step 6 (dashboard widget) for rendering wave summaries
 */
export interface WaveExecutionResult {
  /** Wave number (1-indexed) */
  waveIndex: number;
  /** Epoch ms when wave execution started */
  startedAt: number;
  /** Epoch ms when wave execution ended */
  endedAt: number;
  /** Per-lane execution results */
  laneResults: LaneExecutionResult[];
  /** Which failure policy was configured */
  policyApplied: "skip-dependents" | "stop-wave" | "stop-all";
  /** Whether the wave was stopped early due to policy */
  stoppedEarly: boolean;
  /** Task IDs that failed (including stalled) */
  failedTaskIds: string[];
  /** Task IDs that were skipped (due to pause, prior failure, or policy) */
  skippedTaskIds: string[];
  /** Task IDs that succeeded */
  succeededTaskIds: string[];
  /** Task IDs blocked for future waves (transitive dependents of failed tasks) */
  blockedTaskIds: string[];
  /** Number of lanes used */
  laneCount: number;
  /** Overall wave status */
  overallStatus: "succeeded" | "failed" | "partial" | "aborted";
  /** Final monitor state snapshot (null if monitoring wasn't started) */
  finalMonitorState: MonitorState | null;
  /** Allocated lanes used in this wave (preserved for merge and cleanup) */
  allocatedLanes: AllocatedLane[];
  /**
   * Structured allocation error when lane provisioning failed.
   * Null when allocation succeeded or wave failed for other reasons.
   * Used by Tier 0 to detect stale worktree failures and retry.
   * @since TP-039
   */
  allocationError?: {
    code: AllocationErrorCode;
    message: string;
    details?: string;
  } | null;
}
|
|
1161
|
+
|
|
1162
|
+
// ── Orchestrator Runtime State ───────────────────────────────────────
|
|
1163
|
+
|
|
1164
|
+
/**
 * Runtime phase of the orchestrator batch execution.
 *
 * State machine:
 *   idle → planning → executing → completed
 *                               → failed
 *                               → stopped (stop-wave/stop-all policy triggered)
 *                               → paused (via /orch-pause)
 *   Any active state → idle (via cleanup after completion/failure)
 *
 * NOTE(review): the union also contains "launching" and "merging", which the
 * diagram above does not show — confirm where they sit in the lifecycle.
 */
export type OrchBatchPhase =
  | "idle"
  | "launching"
  | "planning"
  | "executing"
  | "merging"
  | "paused"
  | "stopped"
  | "completed"
  | "failed";
|
|
1184
|
+
|
|
1185
|
+
/**
 * Runtime state for a batch execution.
 *
 * This is the primary state object that:
 * - Tracks progress across waves for the /orch command
 * - Is consumed by Step 6 (dashboard widget) for rendering
 * - Tracks pauseSignal for /orch-pause
 * - Accumulates wave results for summary
 */
export interface OrchBatchRuntimeState {
  /** Current execution phase */
  phase: OrchBatchPhase;
  /** Unique batch identifier (timestamp format, e.g., "20260308T214300") */
  batchId: string;
  /** Branch that was active when /orch started — used as base for worktrees and merge target */
  baseBranch: string;
  /** Orchestrator-managed branch name (e.g., 'orch/henry-20260318T140000'). Empty = legacy mode (merge into baseBranch directly). */
  orchBranch: string;
  /** Workspace execution mode (v2). Defaults to "repo" for backward compatibility. */
  mode: WorkspaceMode;
  /** Shared pause signal — set by /orch-pause, read by executeLane/executeWave */
  pauseSignal: { paused: boolean };
  /** All wave results in order (grows as waves complete) */
  waveResults: WaveExecutionResult[];
  /** Current wave index (0-based into waves array, -1 if not started) */
  currentWaveIndex: number;
  /** Total number of waves planned (segment rounds — internal) */
  totalWaves: number;
  /**
   * Number of dependency-driven task-level waves (TP-166).
   * Used for operator-facing "Wave X of Y" display. When undefined,
   * falls back to `totalWaves` for backward compatibility.
   */
  taskLevelWaveCount?: number;
  /**
   * Maps each segment round index (0-based) to its parent task-level
   * wave index (0-based). Updated when continuation rounds are inserted.
   * Used with `resolveDisplayWaveNumber()` for correct display. (TP-166)
   */
  roundToTaskWave?: number[];
  /** Set of task IDs blocked for future waves (from skip-dependents policy) */
  blockedTaskIds: Set<string>;
  /** Epoch ms when batch started */
  startedAt: number;
  /** Epoch ms when batch ended (null if still running) */
  endedAt: number | null;
  /** Total tasks in batch */
  totalTasks: number;
  /** Tasks completed successfully */
  succeededTasks: number;
  /** Tasks that failed */
  failedTasks: number;
  /** Tasks skipped */
  skippedTasks: number;
  /** Tasks blocked (transitive dependents of failures) */
  blockedTasks: number;
  /** Error messages for display */
  errors: string[];
  /** Allocated lanes from current wave (for session registry) */
  currentLanes: AllocatedLane[];
  /** Dependency graph for the batch (for skip-dependents computation) */
  dependencyGraph: DependencyGraph | null;
  /** Accumulated merge results across all waves */
  mergeResults: MergeWaveResult[];
  /**
   * v3 resilience state carried forward across resume cycles.
   * Populated from persisted state on resume; defaults used for new batches.
   */
  resilience?: ResilienceState;
  /**
   * v3 diagnostics state carried forward across resume cycles.
   * Populated from persisted state on resume; defaults used for new batches.
   */
  diagnostics?: BatchDiagnostics;
  /**
   * v4 segment records carried forward across resume cycles (TP-081).
   * Populated from persisted state on resume; empty for new batches
   * and repo-mode batches.
   */
  segments?: PersistedSegmentRecord[];
  /**
   * Unknown top-level fields from loaded persisted state.
   * Carried forward so they survive serialization roundtrips.
   */
  _extraFields?: Record<string, unknown>;
}
|
|
1271
|
+
|
|
1272
|
+
/**
 * Session registry entry for /orch-sessions command.
 */
export interface OrchestratorSessionEntry {
  /** Lane session name (e.g., "orch-lane-1") */
  sessionName: string;
  /** Lane ID (e.g., "lane-1") */
  laneId: string;
  /** Task ID currently running (if tracked) */
  taskId: string | null;
  /** Session status */
  status: "alive" | "dead";
  /** Worktree path */
  worktreePath: string;
  /** Attach command for user */
  attachCmd: string;
}
|
|
1289
|
+
|
|
1290
|
+
/**
 * Session registry: maps session names to their metadata.
 */
export type OrchestratorSessionRegistry = Map<string, OrchestratorSessionEntry>;
|
|
1294
|
+
|
|
1295
|
+
// ── Batch ID Generation ──────────────────────────────────────────────
|
|
1296
|
+
|
|
1297
|
+
/**
 * Generate a batch ID from the current timestamp.
 * Format: "YYYYMMDDTHHMMSS" (e.g., "20260308T214300")
 *
 * The components come from the local-time `Date` getters, so the ID reflects
 * the host's time zone. Resolution is one second: two calls within the same
 * second return the same ID.
 *
 * @returns The timestamp-derived batch ID.
 */
export function generateBatchId(): string {
	const now = new Date();
	// Zero-pad single-digit components so every field has a fixed width.
	const pad = (n: number): string => String(n).padStart(2, "0");
	const datePart = `${now.getFullYear()}${pad(now.getMonth() + 1)}${pad(now.getDate())}`;
	const timePart = `${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`;
	return `${datePart}T${timePart}`;
}
|
|
1306
|
+
|
|
1307
|
+
/**
 * Create a fresh batch runtime state.
 *
 * Every call builds new array/Set/object instances, so the returned state
 * shares no mutable containers with any previously returned state.
 *
 * @returns A runtime state in the "idle" phase with zeroed counters,
 *          no waves, and empty collections.
 */
export function freshOrchBatchState(): OrchBatchRuntimeState {
	return {
		phase: "idle",
		batchId: "",
		baseBranch: "",
		orchBranch: "",
		mode: "repo",
		pauseSignal: { paused: false },
		waveResults: [],
		// -1 marks "no wave started yet".
		currentWaveIndex: -1,
		totalWaves: 0,
		blockedTaskIds: new Set(),
		startedAt: 0,
		endedAt: null,
		totalTasks: 0,
		succeededTasks: 0,
		failedTasks: 0,
		skippedTasks: 0,
		blockedTasks: 0,
		errors: [],
		currentLanes: [],
		dependencyGraph: null,
		mergeResults: [],
	};
}
|
|
1335
|
+
|
|
1336
|
+
// ── Merge Types ──────────────────────────────────────────────────────
|
|
1337
|
+
|
|
1338
|
+
/**
 * Valid merge result statuses.
 * Matches the contract in .pi/agents/task-merger.md.
 */
export type MergeResultStatus =
	| "SUCCESS"
	| "CONFLICT_RESOLVED"
	| "CONFLICT_UNRESOLVED"
	| "BUILD_FAILURE";
|
|
1347
|
+
|
|
1348
|
+
/** All valid status strings for runtime validation. */
export const VALID_MERGE_STATUSES: ReadonlySet<string> = new Set([
	"SUCCESS",
	"CONFLICT_RESOLVED",
	"CONFLICT_UNRESOLVED",
	"BUILD_FAILURE",
]);
|
|
1355
|
+
|
|
1356
|
+
/** A single conflict entry in the merge result. */
export interface MergeConflict {
	file: string;
	type: string;
	resolved: boolean;
	resolution?: string;
}
|
|
1363
|
+
|
|
1364
|
+
/** Verification outcome in the merge result. */
export interface MergeVerification {
	ran: boolean;
	passed: boolean;
	output: string;
}
|
|
1370
|
+
|
|
1371
|
+
/**
 * Merge result JSON written by the merge agent.
 * Matches the schema in .pi/agents/task-merger.md § Result File Format.
 */
export interface MergeResult {
	status: MergeResultStatus;
	source_branch: string;
	target_branch: string;
	merge_commit: string;
	conflicts: MergeConflict[];
	verification: MergeVerification;
}
|
|
1383
|
+
|
|
1384
|
+
/**
 * Orchestrator-side verification baseline comparison result for a single lane.
 * Populated when verification baseline fingerprinting is enabled (testing.commands configured).
 */
export interface VerificationBaselineResult {
	/** Whether baseline comparison was performed */
	performed: boolean;
	/** Number of new failures (not in baseline) */
	newFailureCount: number;
	/** Number of pre-existing failures (also in baseline) */
	preExistingCount: number;
	/** Number of failures that disappeared (fixed by the merge) */
	fixedCount: number;
	/** Classification: "pass" (no new failures), "verification_new_failure", "flaky_suspected" */
	classification: "pass" | "verification_new_failure" | "flaky_suspected";
	/** Human-readable summary of new failures (truncated) */
	newFailureSummary: string;
	/** Whether a flaky re-run was performed */
	flakyRerunPerformed: boolean;
}
|
|
1404
|
+
|
|
1405
|
+
/** Per-lane merge outcome, enriched by the orchestrator. */
export interface MergeLaneResult {
	laneNumber: number;
	laneId: string;
	sourceBranch: string;
	targetBranch: string;
	result: MergeResult | null;
	error: string | null;
	durationMs: number;
	/** Repo ID this lane targeted (workspace mode only). Undefined in repo mode. */
	repoId?: string;
	/**
	 * Orchestrator-side verification baseline result (TP-032).
	 * Populated when baseline fingerprinting is enabled and a successful merge occurred.
	 * Undefined when fingerprinting is not enabled or merge failed before verification.
	 */
	verificationBaseline?: VerificationBaselineResult;
}
|
|
1423
|
+
|
|
1424
|
+
/** Overall wave merge outcome. */
export interface MergeWaveResult {
	waveIndex: number;
	status: "succeeded" | "failed" | "partial";
	laneResults: MergeLaneResult[];
	failedLane: number | null;
	failureReason: string | null;
	totalDurationMs: number;
	/** Per-repo merge outcomes (populated in workspace mode; empty in repo mode). */
	repoResults?: RepoMergeOutcome[];
	/**
	 * TP-033: True when a verification rollback failed and safe-stop was triggered.
	 * Engine MUST force `paused` phase regardless of `on_merge_failure` config,
	 * and preserve all merge worktrees/branches for manual recovery.
	 */
	rollbackFailed?: boolean;
	/**
	 * TP-033: Transaction records for each lane merge attempt in this wave.
	 * Populated when transactional envelope is active.
	 */
	transactionRecords?: TransactionRecord[];
	/**
	 * TP-033 R004-2: Errors encountered while persisting transaction records.
	 * When non-empty, recovery commands in transaction records may reference
	 * files that don't exist on disk. Operator should check `.pi/verification/`
	 * manually.
	 */
	persistenceErrors?: string[];
}
|
|
1453
|
+
|
|
1454
|
+
/** Per-repo merge outcome within a wave merge. */
export interface RepoMergeOutcome {
	/** Repo ID (undefined in repo mode default group). */
	repoId: string | undefined;
	/** Merge status for this repo. */
	status: "succeeded" | "failed" | "partial";
	/** Lane results belonging to this repo. */
	laneResults: MergeLaneResult[];
	/** Failed lane number within this repo (null if all succeeded). */
	failedLane: number | null;
	/** Failure reason within this repo (null if all succeeded). */
	failureReason: string | null;
}
|
|
1467
|
+
|
|
1468
|
+
// ── Merge Transaction Types (TP-033) ─────────────────────────────────
|
|
1469
|
+
|
|
1470
|
+
/**
 * Status of a transactional merge attempt for a single lane.
 *
 * - `committed`: Merge succeeded, verification passed, refs advanced.
 * - `rolled_back`: Verification failed, merge commit rolled back to baseHEAD.
 * - `rollback_failed`: Rollback attempted but failed — safe-stop triggered.
 * - `merge_failed`: Merge itself failed (conflict, crash, etc.) before verification.
 *
 * @since TP-033
 */
export type TransactionStatus = "committed" | "rolled_back" | "rollback_failed" | "merge_failed";
|
|
1481
|
+
|
|
1482
|
+
/**
 * Transactional record for a single lane merge attempt.
 *
 * Persisted as JSON at:
 *   `.pi/verification/{opId}/txn-b{batchId}-repo-{repoId}-wave-{n}-lane-{k}.json`
 *
 * Captures the complete ref state before and after merge, rollback outcome,
 * and recovery commands for safe-stop scenarios.
 *
 * @since TP-033
 */
export interface TransactionRecord {
	/** Operator ID for this batch run */
	opId: string;
	/** Batch identifier */
	batchId: string;
	/** Wave index (0-based) */
	waveIndex: number;
	/** Lane number within the wave */
	laneNumber: number;
	/** Repo ID (null in repo mode, string in workspace mode) */
	repoId: string | null;
	/** HEAD of temp branch before this lane's merge commit (rollback target) */
	baseHEAD: string;
	/** HEAD of the lane's source branch (commit being merged in) */
	laneHEAD: string;
	/** HEAD of temp branch after merge commit (null if merge failed before commit) */
	mergedHEAD: string | null;
	/** Transaction outcome */
	status: TransactionStatus;
	/** Whether a rollback was attempted */
	rollbackAttempted: boolean;
	/** Rollback outcome detail (null if rollback not attempted) */
	rollbackResult: string | null;
	/** Recovery commands emitted on rollback failure (empty array otherwise) */
	recoveryCommands: string[];
	/** ISO timestamp when transaction started */
	startedAt: string;
	/** ISO timestamp when transaction completed */
	completedAt: string;
}
|
|
1523
|
+
|
|
1524
|
+
// ── Merge Error Types ────────────────────────────────────────────────
|
|
1525
|
+
|
|
1526
|
+
/**
 * Error codes for merge operations.
 *
 * - MERGE_SPAWN_FAILED: Could not create merge-agent session
 * - MERGE_TIMEOUT: Merge agent did not produce result within timeout
 * - MERGE_SESSION_DIED: Merge-agent session exited without writing result
 * - MERGE_RESULT_INVALID: Result file exists but contains invalid JSON
 * - MERGE_RESULT_MISSING_FIELDS: Result JSON missing required fields
 * - MERGE_UNKNOWN_STATUS: Result has an unrecognized status value
 * - MERGE_GIT_ERROR: Git command failure during merge setup
 */
export type MergeErrorCode =
	| "MERGE_SPAWN_FAILED"
	| "MERGE_TIMEOUT"
	| "MERGE_SESSION_DIED"
	| "MERGE_RESULT_INVALID"
	| "MERGE_RESULT_MISSING_FIELDS"
	| "MERGE_UNKNOWN_STATUS"
	| "MERGE_GIT_ERROR";
|
|
1545
|
+
|
|
1546
|
+
/**
 * Typed error class for merge operations.
 *
 * Carries a machine-readable `code` alongside the human-readable message so
 * callers can branch on the failure kind.
 */
export class MergeError extends Error {
	/** Machine-readable failure code. */
	code: MergeErrorCode;

	/**
	 * @param code - Failure code identifying what went wrong.
	 * @param message - Human-readable description of the failure.
	 */
	constructor(code: MergeErrorCode, message: string) {
		super(message);
		this.name = "MergeError";
		this.code = code;
	}
}
|
|
1556
|
+
|
|
1557
|
+
// ── Merge Constants ──────────────────────────────────────────────────
|
|
1558
|
+
|
|
1559
|
+
/**
 * Default timeout for merge agent execution (ms): 90 minutes.
 *
 * The default is deliberately generous so that verification
 * (e.g. go build) on large codebases fits within it.
 * Use config.merge.timeout_minutes to override.
 */
export const MERGE_TIMEOUT_MS = 90 * 60 * 1000;
|
|
1566
|
+
|
|
1567
|
+
/**
 * Polling interval for merge result file (ms).
 * Merge agents are fast; poll aggressively.
 */
export const MERGE_POLL_INTERVAL_MS = 2_000;

/**
 * Grace period after a merge-agent session exits before declaring failure (ms).
 * Allows for slow disk flush of the result file.
 */
export const MERGE_RESULT_GRACE_MS = 3_000;

/**
 * Maximum retries for reading a partially-written result file.
 * If JSON parse fails, wait and retry in case the file is still being written.
 */
export const MERGE_RESULT_READ_RETRIES = 3;

/**
 * Delay between result file read retries (ms).
 */
export const MERGE_RESULT_READ_RETRY_DELAY_MS = 1_000;

/**
 * Maximum retries for merge-agent session spawn.
 */
export const MERGE_SPAWN_RETRY_MAX = 2;

/**
 * Maximum retries for merge agent timeout (TP-038).
 *
 * When a merge agent times out, the orchestrator retries with 2× the
 * previous timeout. This allows recovery from transient slowness without
 * operator intervention.
 *
 * Retry 0: original timeout (e.g., 10 min)
 * Retry 1: 2× original (e.g., 20 min)
 * Retry 2: 4× original (e.g., 40 min)
 */
export const MERGE_TIMEOUT_MAX_RETRIES = 2;
|
|
1607
|
+
|
|
1608
|
+
// ── Merge Health Monitoring Constants (TP-056) ───────────────────────
|
|
1609
|
+
|
|
1610
|
+
/**
 * Polling interval for merge health monitor (ms).
 * Independent of the merge result poll — runs on its own cadence.
 * @since TP-056
 */
export const MERGE_HEALTH_POLL_INTERVAL_MS = 2 * 60 * 1000; // 2 minutes

/**
 * Threshold (ms) after which a merge session with no new output
 * is classified as "possibly stalled" and a warning event is emitted.
 * @since TP-056
 */
export const MERGE_HEALTH_WARNING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes

/**
 * Threshold (ms) after which a merge session with no new output
 * is classified as "stuck" and a stuck event is emitted.
 * @since TP-056
 */
export const MERGE_HEALTH_STUCK_THRESHOLD_MS = 20 * 60 * 1000; // 20 minutes

/**
 * Number of lines to capture from recent merge output snapshots
 * for activity detection via snapshot comparison.
 * @since TP-056
 */
export const MERGE_HEALTH_CAPTURE_LINES = 10;
|
|
1637
|
+
|
|
1638
|
+
// ── Persistent Reviewer Constants (TP-057) ───────────────────────────
|
|
1639
|
+
|
|
1640
|
+
/**
 * Polling interval (ms) for the `wait_for_review` tool to check for signal files.
 * Reviews take minutes; 3s latency is invisible to the user.
 * @since TP-057
 */
export const REVIEWER_POLL_INTERVAL_MS = 3_000;

/**
 * Maximum time (ms) for the `wait_for_review` tool to wait for a review signal.
 * 30 minutes — generous for long-running code reviews.
 * @since TP-057
 */
export const REVIEWER_WAIT_TIMEOUT_MS = 30 * 60 * 1000;

/**
 * Grace period (ms) after writing shutdown signal before killing the reviewer session.
 * Allows the reviewer to exit cleanly after receiving the shutdown signal.
 * @since TP-057
 */
export const REVIEWER_SHUTDOWN_GRACE_MS = 10_000;

/**
 * Signal file prefix for review requests. Full name: `.review-signal-{NNN}`
 * @since TP-057
 */
export const REVIEWER_SIGNAL_PREFIX = ".review-signal-";

/**
 * Shutdown signal filename written to .reviews/ when the task is complete.
 * @since TP-057
 */
export const REVIEWER_SHUTDOWN_SIGNAL = ".review-shutdown";
|
|
1672
|
+
|
|
1673
|
+
// ── Merge Health Event Types (TP-056) ────────────────────────────────
|
|
1674
|
+
|
|
1675
|
+
/**
 * Health classification for a merge session.
 *
 * - `healthy`: Session alive, output changing
 * - `warning`: Session alive, no new output for MERGE_HEALTH_WARNING_THRESHOLD_MS
 * - `dead`: Session gone, no result file
 * - `stuck`: Session alive, no new output for MERGE_HEALTH_STUCK_THRESHOLD_MS
 *
 * @since TP-056
 */
export type MergeHealthStatus = "healthy" | "warning" | "dead" | "stuck";
|
|
1686
|
+
|
|
1687
|
+
/**
 * Engine event types for merge health monitoring.
 *
 * These extend the EngineEventType union and are emitted to the
 * unified events.jsonl for supervisor consumption.
 *
 * @since TP-056
 */
export type MergeHealthEventType =
	| "merge_health_warning"
	| "merge_health_dead"
	| "merge_health_stuck";
|
|
1699
|
+
|
|
1700
|
+
/**
 * Snapshot of a merge session's pane output at a point in time.
 * Used for activity detection by comparing successive snapshots.
 *
 * @since TP-056
 */
export interface MergeSessionSnapshot {
	/** Captured pane content (last N lines) */
	content: string;
	/** Epoch ms when the snapshot was taken */
	capturedAt: number;
}
|
|
1712
|
+
|
|
1713
|
+
/**
 * Per-session health tracking state.
 *
 * @since TP-056
 */
export interface MergeSessionHealthState {
	/** Merge session name */
	sessionName: string;
	/** Lane number this session belongs to */
	laneNumber: number;
	/** Last captured pane snapshot */
	lastSnapshot: MergeSessionSnapshot | null;
	/** Epoch ms when the last output change was detected */
	lastActivityAt: number;
	/** Current health classification */
	status: MergeHealthStatus;
	/** Whether a warning event has been emitted (prevent duplicates) */
	warningEmitted: boolean;
	/** Whether a stuck event has been emitted (prevent duplicates) */
	stuckEmitted: boolean;
	/** Whether a dead event has been emitted (prevent duplicates) */
	deadEmitted: boolean;
}
|
|
1736
|
+
|
|
1737
|
+
// ── Merge Retry Policy Matrix (TP-033 Step 2) ───────────────────────
|
|
1738
|
+
|
|
1739
|
+
/**
 * Merge-related failure classifications for the retry policy matrix.
 *
 * These are the merge-phase failure classes from the resilience roadmap §4c.
 * Task-execution classes (api_error, context_overflow, etc.) are out of scope
 * for TP-033 and handled separately in Phase 1/3.
 *
 * @since TP-033
 */
export type MergeFailureClassification =
	| "verification_new_failure"
	| "merge_conflict_unresolved"
	| "cleanup_post_merge_failed"
	| "git_worktree_dirty"
	| "git_lock_file";
|
|
1754
|
+
|
|
1755
|
+
/**
 * Retry policy for a single merge failure classification.
 *
 * Defines whether a failure class is retriable, the maximum retry attempts,
 * cooldown between retries (in milliseconds), and what happens on exhaustion.
 *
 * @since TP-033
 */
export interface MergeRetryPolicy {
	/** Whether this failure class can be retried automatically */
	retriable: boolean;
	/** Maximum number of retry attempts (0 for non-retriable) */
	maxAttempts: number;
	/** Cooldown delay between retries in milliseconds (0 for immediate) */
	cooldownMs: number;
	/** Action when retries are exhausted or class is non-retriable */
	exhaustionAction: "pause" | "pause_wave_gate" | "pause_escalation";
}
|
|
1773
|
+
|
|
1774
|
+
/**
 * Centralized retry policy matrix for merge-related failure classes.
 *
 * This is the **single source of truth** for retry behavior. Both engine.ts
 * and resume.ts consume this table through `computeMergeRetryDecision()` to
 * guarantee parity.
 *
 * Values from resilience roadmap §4c:
 *
 * | Classification              | Retry? | Max | Cooldown | Exhaustion          |
 * |-----------------------------|--------|-----|----------|---------------------|
 * | verification_new_failure    | ✅     | 1   | 0ms      | pause + diagnostic  |
 * | merge_conflict_unresolved   | ❌     | 0   | —        | pause + escalation  |
 * | cleanup_post_merge_failed   | ✅     | 1   | 2000ms   | pause (wave gate)   |
 * | git_worktree_dirty          | ✅     | 1   | 2000ms   | pause               |
 * | git_lock_file               | ✅     | 2   | 3000ms   | pause               |
 *
 * @since TP-033
 */
export const MERGE_RETRY_POLICY_MATRIX: Readonly<
	Record<MergeFailureClassification, MergeRetryPolicy>
> = {
	verification_new_failure: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 0,
		exhaustionAction: "pause",
	},
	merge_conflict_unresolved: {
		retriable: false,
		maxAttempts: 0,
		cooldownMs: 0,
		exhaustionAction: "pause_escalation",
	},
	cleanup_post_merge_failed: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 2_000,
		exhaustionAction: "pause_wave_gate",
	},
	git_worktree_dirty: {
		retriable: true,
		maxAttempts: 1,
		cooldownMs: 2_000,
		exhaustionAction: "pause",
	},
	git_lock_file: {
		retriable: true,
		maxAttempts: 2,
		cooldownMs: 3_000,
		exhaustionAction: "pause",
	},
};
|
|
1827
|
+
|
|
1828
|
+
/**
 * All merge failure classifications as a readonly array, for iteration/validation.
 * @since TP-033
 */
export const MERGE_FAILURE_CLASSIFICATIONS: readonly MergeFailureClassification[] = [
	"verification_new_failure",
	"merge_conflict_unresolved",
	"cleanup_post_merge_failed",
	"git_worktree_dirty",
	"git_lock_file",
] as const;
|
|
1839
|
+
|
|
1840
|
+
// ── Tier 0 Watchdog Recovery Types (TP-039) ──────────────────────────
|
|
1841
|
+
|
|
1842
|
+
/**
 * Tier 0 recovery pattern identifiers.
 *
 * Each pattern corresponds to a failure class that the engine can
 * handle automatically without supervisor intervention.
 *
 * @since TP-039
 */
export type Tier0RecoveryPattern =
	| "worker_crash"
	| "stale_worktree"
	| "cleanup_gate"
	| "model_fallback";
|
|
1855
|
+
|
|
1856
|
+
/**
 * Exit classifications that are eligible for automatic Tier 0 retry.
 *
 * These are transient failures where re-running the task has a reasonable
 * chance of success. Classifications NOT in this set (e.g., user_killed,
 * stall_timeout, context_overflow, spawn_failure) indicate persistent
 * problems that won't be fixed by retrying.
 *
 * **TP-190 (#561):** `spawn_failure` is intentionally excluded — spawn-stage
 * errors (Pi CLI not findable, worktree provisioning failure, branch
 * collision) are never transient and require operator action. Retrying
 * silently would just burn the retry budget and delay the alert.
 *
 * @since TP-039
 */
export const TIER0_RETRYABLE_CLASSIFICATIONS: ReadonlySet<string> = new Set([
	"api_error",
	"model_access_error",
	"process_crash",
	"session_vanished",
]);
|
|
1877
|
+
|
|
1878
|
+
/**
 * Retry budget for Tier 0 recovery patterns.
 *
 * Defines max retries, cooldown between attempts, and backoff
 * multiplier for each pattern. Values from spec §5.3.
 *
 * @since TP-039
 */
export interface Tier0RetryBudget {
	/** Maximum number of retry attempts */
	maxRetries: number;
	/** Cooldown delay between retries in milliseconds */
	cooldownMs: number;
	/** Multiplier applied to cooldown on each subsequent retry */
	backoffMultiplier: number;
}
|
|
1894
|
+
|
|
1895
|
+
/**
 * Centralized retry budgets for Tier 0 recovery patterns.
 *
 * These are the defaults from spec §5.3. They are NOT configurable
 * via user config in Tier 0 — the supervisor (Tier 1) can override
 * them in future iterations.
 *
 * @since TP-039
 */
export const TIER0_RETRY_BUDGETS: Readonly<Record<Tier0RecoveryPattern, Tier0RetryBudget>> = {
	worker_crash: {
		maxRetries: 1,
		cooldownMs: 5_000,
		backoffMultiplier: 1.0,
	},
	stale_worktree: {
		maxRetries: 1,
		cooldownMs: 2_000,
		backoffMultiplier: 1.0,
	},
	cleanup_gate: {
		maxRetries: 1,
		cooldownMs: 2_000,
		backoffMultiplier: 1.0,
	},
	model_fallback: {
		maxRetries: 1,
		cooldownMs: 3_000,
		backoffMultiplier: 1.0,
	},
};
|
|
1926
|
+
|
|
1927
|
+
/**
 * All Tier 0 escalation-eligible pattern identifiers.
 *
 * Extends `Tier0RecoveryPattern` with `merge_timeout` so that
 * `EscalationContext` can describe escalations from every exhaustion
 * path, including the merge retry loop (which uses its own retry
 * matrix but still triggers Tier 0 escalation on exhaustion).
 *
 * @since TP-039
 */
export type Tier0EscalationPattern = Tier0RecoveryPattern | "merge_timeout";
// Note: model_fallback is already included via Tier0RecoveryPattern
|
|
1939
|
+
|
|
1940
|
+
/**
 * Context payload emitted when Tier 0 retries are exhausted and the
 * engine must escalate to the supervisor (future TP-041).
 *
 * This is the structured data that a Tier 1 supervisor agent uses to
 * decide what to do next. In Tier 0, escalation simply falls through
 * to the existing pause behaviour.
 *
 * @since TP-039
 */
export interface EscalationContext {
	/** Which recovery pattern was attempted */
	pattern: Tier0EscalationPattern;
	/** Number of retry attempts that were made (1-based) */
	attempts: number;
	/** Maximum attempts that were allowed */
	maxAttempts: number;
	/** Human-readable last error / failure reason */
	lastError: string;
	/** Task IDs affected by this failure */
	affectedTasks: string[];
	/** Suggested remediation for an operator or supervisor */
	suggestion: string;
}
|
|
1964
|
+
|
|
1965
|
+
/**
 * Build the task-level scope key for Tier 0 (non-merge) retry counters.
 *
 * Format: `t0:{pattern}:{taskId}:w{waveIndex}`
 * The `t0:` namespace prevents collisions with merge retry scope keys
 * (which use `{taskId}:w{waveIndex}:l{laneNumber}`).
 *
 * @param pattern - Recovery pattern the counter belongs to.
 * @param taskId - Task the retry counter is scoped to.
 * @param waveIndex - Wave index the retry counter is scoped to.
 * @returns The namespaced scope key string.
 *
 * @since TP-039
 */
export function tier0ScopeKey(
	pattern: Tier0RecoveryPattern,
	taskId: string,
	waveIndex: number,
): string {
	return `t0:${pattern}:${taskId}:w${waveIndex}`;
}
|
|
1981
|
+
|
|
1982
|
+
/**
 * Wave-level scope key for Tier 0 patterns that operate at wave granularity
 * (stale_worktree, cleanup_gate).
 *
 * Format: `t0:{pattern}:w{waveIndex}`
 *
 * @param pattern - Recovery pattern the counter belongs to.
 * @param waveIndex - Wave index the retry counter is scoped to.
 * @returns The namespaced wave-level scope key string.
 *
 * @since TP-039
 */
export function tier0WaveScopeKey(pattern: Tier0RecoveryPattern, waveIndex: number): string {
	return `t0:${pattern}:w${waveIndex}`;
}
|
|
1993
|
+
|
|
1994
|
+
// ── Engine Event Types (TP-040) ──────────────────────────────────────
|
|
1995
|
+
|
|
1996
|
+
/**
 * Engine lifecycle event types emitted during batch execution.
 *
 * These events are the primary coordination mechanism between the
 * non-blocking engine and external consumers (supervisor agent,
 * dashboard, command handlers).
 *
 * Event semantics (from spec §7.3):
 * - `wave_start`     — Wave execution begins
 * - `task_complete`  — Task .DONE detected (succeeded)
 * - `task_failed`    — Task failed or stalled
 * - `merge_start`    — Wave merge begins
 * - `merge_success`  — Merge and verification pass
 * - `merge_failed`   — Merge or verification fails
 * - `batch_complete` — All waves done (terminal)
 * - `batch_paused`   — Batch paused (failure or manual)
 *
 * Tier 0 recovery events (`tier0_recovery_attempt`, `tier0_recovery_success`,
 * `tier0_recovery_exhausted`, `tier0_escalation`) continue to use the
 * existing `Tier0EventType` from persistence.ts and share the same JSONL
 * file. Engine events extend the same stream with lifecycle context.
 *
 * @since TP-040
 */
export type EngineEventType =
	| "wave_start"
	| "task_complete"
	| "task_failed"
	| "merge_start"
	| "merge_success"
	| "merge_failed"
	| "merge_health_warning"
	| "merge_health_dead"
	| "merge_health_stuck"
	| "batch_complete"
	| "batch_paused";
|
|
2032
|
+
|
|
2033
|
+
/**
 * One structured engine event, as written to `.pi/supervisor/events.jsonl`.
 *
 * Engine events live in the same JSONL file as Tier 0 events and carry the
 * same base payload (`timestamp`, `batchId`, `waveIndex`) so the supervisor
 * agent can consume both uniformly.
 *
 * Design: per reviewer suggestion (R001), a shared base payload is used and
 * the existing event-writing infrastructure is extended instead of adding a
 * second, parallel writer.
 *
 * Every field after `phase` is optional; which ones are present depends on
 * the event `type` (noted on each field).
 *
 * @since TP-040
 */
export interface EngineEvent {
	/** Emission time as an ISO 8601 string */
	timestamp: string;
	/** Which engine event this is */
	type: EngineEventType;
	/** Identifier of the batch that emitted the event */
	batchId: string;
	/** 0-based wave index; -1 when the event is not tied to a wave */
	waveIndex: number;
	/** Batch phase at the moment the event was emitted */
	phase: OrchBatchPhase;

	// ── Event-specific fields (all optional) ─────────────────────

	/** wave_start: IDs of the tasks in the wave */
	taskIds?: string[];
	/** wave_start, merge_start: number of lanes used */
	laneCount?: number;
	/** task_complete, task_failed: ID of the task */
	taskId?: string;
	/** task_complete, task_failed: task execution time in milliseconds */
	durationMs?: number;
	/** task_complete: summary of the task outcome */
	outcome?: string;
	/** task_failed, merge_failed, batch_paused: reason for the failure */
	reason?: string;
	/** task_failed: whether partial progress was preserved */
	partialProgress?: boolean;
	/** merge_failed: lane number */
	laneNumber?: number;
	/** merge_failed: merge error details */
	error?: string;
	/** merge_success: number of merge test verifications */
	testCount?: number;
	/** merge_success: total number of waves in the batch */
	totalWaves?: number;

	// ── Batch summary fields (for batch_complete, batch_paused) ──

	/** batch_complete: total succeeded tasks */
	succeededTasks?: number;
	/** batch_complete, batch_paused: total failed tasks */
	failedTasks?: number;
	/** batch_complete: total skipped tasks */
	skippedTasks?: number;
	/** batch_complete: total blocked tasks */
	blockedTasks?: number;
	/** batch_complete: batch duration in milliseconds */
	batchDurationMs?: number;

	// ── Merge health monitoring fields (TP-056) ──────────────────

	/** merge_health_*: name of the merge session */
	sessionName?: string;
	/** merge_health_*: merge health status classification */
	healthStatus?: MergeHealthStatus;
	/** merge_health_warning, merge_health_stuck: minutes since last activity */
	stalledMinutes?: number;
}
|
|
2105
|
+
|
|
2106
|
+
/**
 * Signature of a listener that receives engine events.
 *
 * The command handler (extension.ts) registers one of these to follow engine
 * state transitions in real time. In the non-blocking architecture (Step 2)
 * the caller observes progress through this callback rather than by awaiting
 * the engine's return value.
 *
 * The engine invokes the callback synchronously in its own event loop, so
 * consumers MUST NOT perform blocking I/O inside it.
 *
 * @since TP-040
 */
export type EngineEventCallback = (event: EngineEvent) => void;
|
|
2120
|
+
|
|
2121
|
+
// ── Supervisor Alert Types (TP-076) ──────────────────────────────────
|
|
2122
|
+
|
|
2123
|
+
/**
 * Alert category for supervisor notifications.
 *
 * Matches the alert categories in the autonomous supervisor spec:
 * - `task-failure`: A task failed after deterministic recovery was exhausted
 * - `merge-failure`: Wave merge failed and batch paused
 * - `batch-complete`: Batch finished (all waves done)
 * - `agent-message`: Runtime mailbox reply/escalation from a running agent
 * - `worker-exit-intercept`: NOTE(review) — present in the union but never
 *   described here; presumably raised when a worker's exit is intercepted.
 *   Confirm the exact meaning against the emitting code.
 * - `segment-expansion-requested`: Worker requested dynamic segment expansion
 * - `segment-expansion-approved`: Engine approved an expansion request
 * - `segment-expansion-rejected`: Engine rejected/discarded an expansion request
 *
 * Note: `stall` detection is deferred to a future phase (requires
 * last-activity tracking not yet built).
 *
 * @since TP-076
 */
export type SupervisorAlertCategory =
	| "task-failure"
	| "merge-failure"
	| "batch-complete"
	| "agent-message"
	| "worker-exit-intercept"
	| "segment-expansion-requested"
	| "segment-expansion-approved"
	| "segment-expansion-rejected";
|
|
2149
|
+
|
|
2150
|
+
/**
 * Task-level snapshot of segment execution progress.
 *
 * Carried on supervisor alerts via `SupervisorAlertContext.segmentFrontier`
 * to help diagnose task failures.
 */
export interface SupervisorSegmentFrontierSnapshot {
	/** Parent task identifier */
	taskId: string;
	/** Total number of ordered segments for the task */
	totalSegments: number;
	/** Number of segments that reached a terminal status */
	terminalSegments: number;
	/** Active (or most recently active) segment ID */
	activeSegmentId: string | null;
	/** Segment-level execution snapshot in deterministic order */
	segments: Array<{
		segmentId: string;
		repoId: string;
		status: PersistedSegmentStatus;
		dependsOnSegmentIds: string[];
	}>;
}

/**
 * Structured context payload for supervisor alerts.
 *
 * All fields are IPC-serializable (no functions, no circular refs, no Maps/Sets).
 * Each alert category populates the relevant subset of optional fields.
 *
 * @since TP-076
 */
export interface SupervisorAlertContext {
	/** Task ID (for task-failure alerts) */
	taskId?: string;
	/** Segment ID (for segment-aware task-failure alerts) */
	segmentId?: string;
	/** Repo ID associated with the failure (task segment or merge target) */
	repoId?: string;
	/** Lane ID, e.g., "lane-1" (for task-failure alerts) */
	laneId?: string;
	/** Lane number (for task-failure and merge-failure alerts) */
	laneNumber?: number;
	/** Wave index, 0-based (for merge-failure and batch-complete alerts) */
	waveIndex?: number;
	/** Exit reason string (for task-failure alerts) */
	exitReason?: string;
	/**
	 * Structured exit category for task-failure alerts.
	 *
	 * Mirrors `LaneTaskOutcome.exitDiagnostic.classification` for IPC
	 * consumption by the supervisor. Optional for backward compatibility
	 * — absent when the engine produces a task-failure alert without
	 * structured diagnostic data.
	 *
	 * Notable values consumed by the supervisor playbook:
	 *   - `"spawn_failure"` (TP-190, #561): worker process never spawned
	 *     (Pi CLI not findable, worktree provisioning error, etc.). Never
	 *     transient — the playbook MUST escalate immediately rather than
	 *     retry. When the post-wave phase-transition logic detects an
	 *     all-spawn-failed wave it also flips `batchState.phase` to
	 *     `"failed"`; that transition is independent of this alert.
	 *
	 * @since TP-190 (#561)
	 */
	exitCategory?: ExitClassification;
	/** Segment frontier snapshot for task-failure diagnosis */
	segmentFrontier?: SupervisorSegmentFrontierSnapshot;
	/** Agent ID (for agent-message alerts) */
	agentId?: string;
	/** Mailbox message ID (for agent-message alerts) */
	messageId?: string;
	/** Segment expansion request ID (for segment-expansion alerts) */
	expansionRequestId?: string;
	/** Whether partial progress was preserved (for task-failure alerts) */
	partialProgress?: boolean;
	/** Batch progress summary */
	batchProgress?: {
		succeededTasks: number;
		failedTasks: number;
		skippedTasks: number;
		blockedTasks: number;
		totalTasks: number;
		currentWave: number;
		totalWaves: number;
	};
	/** Merge failure reason (for merge-failure alerts) */
	mergeError?: string;
	/** Batch duration in milliseconds (for batch-complete alerts) */
	batchDurationMs?: number;
}
|
|
2235
|
+
|
|
2236
|
+
/**
 * A structured alert sent to the supervisor.
 *
 * The engine (child process) emits one over IPC whenever the supervisor has
 * to be told about an event that needs attention or acknowledgement.
 *
 * Design notes:
 * - Every field is a plain JSON-serializable value, so the alert is IPC-safe.
 * - `category` selects the alert type and decides which `context` fields are
 *   filled in.
 * - `summary` is already formatted for humans and can be shown to the
 *   supervisor LLM directly as a conversation message.
 * - `context` holds the same information as structured data for programmatic
 *   consumers.
 *
 * @since TP-076
 */
export interface SupervisorAlert {
	/** Kind of alert; drives how it is handled */
	category: SupervisorAlertCategory;
	/** Human-readable text, ready to display as a chat message */
	summary: string;
	/** Structured details; every field is IPC-serializable */
	context: SupervisorAlertContext;
}
|
|
2259
|
+
|
|
2260
|
+
/**
 * Signature of the function used to emit a supervisor alert.
 *
 * The engine (child process) calls it when a significant event has to reach
 * the supervisor. On the main thread, the handler turns the alert into a
 * `sendUserMessage` call, which wakes the supervisor LLM.
 *
 * @since TP-076
 */
export type SupervisorAlertCallback = (alert: SupervisorAlert) => void;
|
|
2271
|
+
|
|
2272
|
+
/**
 * Describes a lane that has just reached a terminal state.
 *
 * Produced at the no-progress kill and hard-fail decision points. The
 * supervisor process uses it to mark the lane as terminated and to discard
 * any alerts still queued for that lane (see {@link LaneTerminatedCallback}).
 *
 * @since TP-187 (#538)
 */
export interface LaneTerminatedInfo {
	/** Number of the lane that terminated */
	laneNumber: number;
	/** ID of the agent associated with the lane */
	agentId: string;
	/** Batch the lane belongs to */
	batchId: string;
	/** When the lane terminated — presumably epoch milliseconds; TODO confirm at the emit sites */
	terminatedAt: number;
	/** Why the lane was terminated */
	reason: "no-progress-kill" | "hard-fail" | "supervisor-takeover";
}
|
|
2288
|
+
|
|
2289
|
+
/**
 * Signature of the listener notified when a lane reaches a terminal state.
 *
 * @since TP-187 (#538)
 */
export type LaneTerminatedCallback = (info: LaneTerminatedInfo) => void;
|
|
2295
|
+
|
|
2296
|
+
/**
 * Copy the progress counters out of the runtime batch state.
 *
 * Pure function: it reads the counters from `OrchBatchRuntimeState` and
 * returns them in the IPC-serializable shape used by
 * `SupervisorAlertContext.batchProgress`. The wave index is converted from
 * 0-based to 1-based for display.
 *
 * @param batchState - Runtime state to read the counters from
 * @returns A fresh progress object
 * @since TP-076
 */
export function buildBatchProgressSnapshot(
	batchState: OrchBatchRuntimeState,
): NonNullable<SupervisorAlertContext["batchProgress"]> {
	const {
		succeededTasks,
		failedTasks,
		skippedTasks,
		blockedTasks,
		totalTasks,
		currentWaveIndex,
		totalWaves,
	} = batchState;

	return {
		succeededTasks,
		failedTasks,
		skippedTasks,
		blockedTasks,
		totalTasks,
		// Stored 0-based; reported 1-based for display.
		currentWave: currentWaveIndex + 1,
		totalWaves,
	};
}
|
|
2318
|
+
|
|
2319
|
+
/**
 * Build a task-level segment frontier snapshot for supervisor failure alerts.
 *
 * Segments are reported in the order given by `segmentIds`; entries that are
 * not non-blank strings are dropped. Persisted records are matched by segment
 * ID, and records belonging to a different task are ignored. A segment with
 * no persisted record is reported as `"running"` when it is the resolved
 * active segment and `"pending"` otherwise, with repo `"unknown"` and no
 * dependencies.
 *
 * The returned snapshot does not share arrays with `persistedSegments`:
 * `dependsOnSegmentIds` is copied, so later changes to persisted state cannot
 * alter a snapshot that has already been built.
 *
 * @param taskId - Parent task identifier
 * @param segmentIds - Ordered segment IDs of the task
 * @param activeSegmentId - Currently active segment; used only if it appears
 *   in `segmentIds`
 * @param persistedSegments - Persisted segment records to take status, repo
 *   and dependencies from
 * @param preferredSegmentId - Fallback active segment, used when
 *   `activeSegmentId` is missing or unknown and this ID appears in `segmentIds`
 * @returns The snapshot, or `undefined` when the task has no segment metadata
 */
export function buildSupervisorSegmentFrontierSnapshot(
	taskId: string,
	segmentIds: string[] | undefined,
	activeSegmentId: string | null | undefined,
	persistedSegments: PersistedSegmentRecord[] | undefined,
	preferredSegmentId?: string | null,
): SupervisorSegmentFrontierSnapshot | undefined {
	if (!Array.isArray(segmentIds)) return undefined;

	const orderedSegmentIds = segmentIds.filter(
		(segmentId): segmentId is string =>
			typeof segmentId === "string" && segmentId.trim().length > 0,
	);
	if (orderedSegmentIds.length === 0) return undefined;

	// Index this task's persisted records by segment ID; a later duplicate wins.
	const bySegmentId = new Map<string, PersistedSegmentRecord>();
	for (const segment of persistedSegments ?? []) {
		if (segment && segment.taskId === taskId) {
			bySegmentId.set(segment.segmentId, segment);
		}
	}

	// The explicit active segment takes precedence over the preferred one;
	// either counts only when it is one of this task's segments.
	const knownSegmentIds = new Set(orderedSegmentIds);
	const resolvedActiveSegmentId =
		activeSegmentId && knownSegmentIds.has(activeSegmentId)
			? activeSegmentId
			: preferredSegmentId && knownSegmentIds.has(preferredSegmentId)
				? preferredSegmentId
				: null;

	const terminalStatuses = new Set<PersistedSegmentStatus>([
		"succeeded",
		"failed",
		"stalled",
		"skipped",
	]);
	let terminalSegments = 0;

	const segments = orderedSegmentIds.map((segmentId) => {
		const persisted = bySegmentId.get(segmentId);
		const status: PersistedSegmentStatus =
			persisted?.status ?? (resolvedActiveSegmentId === segmentId ? "running" : "pending");
		if (terminalStatuses.has(status)) terminalSegments += 1;

		const persistedDeps = persisted?.dependsOnSegmentIds;
		return {
			segmentId,
			repoId: persisted ? parseSegmentIdRepo(persisted) : "unknown",
			status,
			// Copy so the snapshot is detached from live persisted state.
			dependsOnSegmentIds: Array.isArray(persistedDeps) ? [...persistedDeps] : [],
		};
	});

	return {
		taskId,
		totalSegments: segments.length,
		terminalSegments,
		activeSegmentId: resolvedActiveSegmentId,
		segments,
	};
}
|
|
2381
|
+
|
|
2382
|
+
/**
 * Produce the fields every engine event shares.
 *
 * Centralising this keeps field population identical at all emit sites. It is
 * the engine-event counterpart of `buildTier0EventBase()` for Tier 0 events.
 * The timestamp is taken from the current time when the function is called.
 *
 * @param type - Engine event type
 * @param batchId - Batch identifier
 * @param waveIndex - Wave index to record on the event
 * @param phase - Batch phase at emission time
 * @returns The base payload: `timestamp`, `type`, `batchId`, `waveIndex`, `phase`
 * @since TP-040
 */
export function buildEngineEventBase(
	type: EngineEventType,
	batchId: string,
	waveIndex: number,
	phase: OrchBatchPhase,
): Pick<EngineEvent, "timestamp" | "type" | "batchId" | "waveIndex" | "phase"> {
	const timestamp = new Date().toISOString();
	const base: Pick<EngineEvent, "timestamp" | "type" | "batchId" | "waveIndex" | "phase"> = {
		timestamp,
		type,
		batchId,
		waveIndex,
		phase,
	};
	return base;
}
|
|
2404
|
+
|
|
2405
|
+
/**
 * Result produced by the merge retry policy evaluator.
 *
 * Plain data only. Callers inspect it to choose between retrying, waiting,
 * and escalating to paused.
 *
 * @since TP-033
 */
export interface MergeRetryDecision {
	/** True when the merge should be attempted again */
	shouldRetry: boolean;
	/** How long to wait before retrying; 0 when not retrying or retrying immediately */
	cooldownMs: number;
	/** Explanation of the decision, for humans */
	reason: string;
	/** Retry count for this scope (already incremented when retrying) */
	currentAttempt: number;
	/** Attempt ceiling for this classification */
	maxAttempts: number;
	/** The failure classification the decision was made for */
	classification: MergeFailureClassification;
	/** What to do once retries are not (or no longer) an option */
	exhaustionAction: MergeRetryPolicy["exhaustionAction"];
}
|
|
2429
|
+
|
|
2430
|
+
/**
 * What happened during a merge retry cycle.
 *
 * `applyMergeRetryLoop()` returns one of these variants, discriminated by
 * `kind`, so the caller can pick the matching follow-up (continue, break,
 * force-pause, etc.).
 *
 * Fields shared by every variant:
 * - `mergeResult`    — the merge result associated with the outcome
 * - `classification` — classification of the failure that was retried
 *   (may be `null`)
 * - `scopeKey`       — scope key used for retry counter tracking
 *
 * All variants except `no_retry` also carry `lastDecision`, the last retry
 * decision (it holds attempt/maxAttempts for event emission).
 *
 * @since TP-033 R006
 */
export type MergeRetryLoopOutcome =
	| {
			/** A retry succeeded; the caller continues the normal post-merge flow. */
			kind: "retry_succeeded";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
			lastDecision: MergeRetryDecision;
	  }
	| {
			/** Safe-stop was triggered during retry; the caller should break the wave loop. */
			kind: "safe_stop";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
			lastDecision: MergeRetryDecision;
			errorMessage: string;
			notifyMessage: string;
	  }
	| {
			/**
			 * Retries are exhausted or the failure is non-retriable; the caller
			 * should force `paused` regardless of the on_merge_failure config.
			 */
			kind: "exhausted";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
			lastDecision: MergeRetryDecision;
			errorMessage: string;
			notifyMessage: string;
	  }
	| {
			/**
			 * No retry was attempted (unclassifiable, or non-retriable with
			 * 0 attempts); the caller falls through to the standard
			 * on_merge_failure policy.
			 */
			kind: "no_retry";
			mergeResult: MergeWaveResult;
			classification: MergeFailureClassification | null;
			scopeKey: string;
	  };
|
|
2485
|
+
|
|
2486
|
+
/**
 * Side-effect hooks handed to `applyMergeRetryLoop()`.
 *
 * engine.ts and resume.ts need different side effects, so each supplies its
 * own implementation of these callbacks.
 *
 * @since TP-033 R006
 */
export interface MergeRetryCallbacks {
	/** Runs mergeWaveByRepo again and yields the new result (sync or async) */
	performMerge: () => MergeWaveResult | Promise<MergeWaveResult>;
	/** Persists batch state, tagged with a trigger label */
	persist: (trigger: string) => void;
	/** Writes a log message, optionally with structured details */
	log: (message: string, details?: Record<string, unknown>) => void;
	/** Emits a notification at the given level */
	notify: (message: string, level: "info" | "warning" | "error") => void;
	/** Replaces the merge result held in the tracking arrays */
	updateMergeResult: (result: MergeWaveResult) => void;
	/** Waits out a cooldown; injectable so tests can avoid real delays */
	sleep: (ms: number) => void | Promise<void>;
	/**
	 * Optional hook invoked just before a retry attempt is executed.
	 * It receives the retry decision (classification, attempt count, cooldown)
	 * so callers can emit structured Tier 0 events at the right moment.
	 * @since TP-039 R004
	 */
	onRetryAttempt?: (decision: MergeRetryDecision) => void;
}
|
|
2513
|
+
|
|
2514
|
+
// ── View-Model Types ─────────────────────────────────────────────────
|
|
2515
|
+
|
|
2516
|
+
/**
 * Task totals shown on the orchestrator dashboard, one counter per state
 * plus the overall total.
 *
 * Data only; contains no rendering logic.
 */
export interface OrchSummaryCounts {
	completed: number;
	running: number;
	queued: number;
	failed: number;
	blocked: number;
	stalled: number;
	total: number;
}
|
|
2529
|
+
|
|
2530
|
+
/**
 * Render-ready data for one lane card on the dashboard.
 *
 * Built from the MonitorState `LaneMonitorSnapshot` combined with
 * `AllocatedLane` metadata.
 *
 * NOTE(review): the individual counters (`totalChecked` / `totalItems`,
 * `completedTasks` / `totalLaneTasks`) are not described in this file;
 * confirm their exact meaning against the code that populates them.
 */
export interface OrchLaneCardData {
	laneNumber: number;
	laneId: string;
	sessionName: string;
	sessionAlive: boolean;
	/** `null` when the lane has no current task */
	currentTaskId: string | null;
	/** `null` when no step name is available */
	currentStepName: string | null;
	totalChecked: number;
	totalItems: number;
	completedTasks: number;
	totalLaneTasks: number;
	status: "idle" | "running" | "succeeded" | "failed" | "stalled";
	/** `null` when there is no stall reason to report */
	stallReason: string | null;
}
|
|
2548
|
+
|
|
2549
|
+
/**
 * View-model for the dashboard: runtime state mapped to render-ready data.
 *
 * It is the single data contract between `OrchBatchRuntimeState` +
 * `MonitorState` on one side and the widget rendering function on the other.
 */
export interface OrchDashboardViewModel {
	phase: OrchBatchPhase;
	batchId: string;
	/** Merge target branch, e.g., "orch/henry-20260318T140000" */
	orchBranch: string;
	/** Wave progress text, e.g., "2/3" */
	waveProgress: string;
	/** Elapsed time text, e.g., "2m 14s" */
	elapsed: string;
	summary: OrchSummaryCounts;
	laneCards: OrchLaneCardData[];
	/** Attach hint text, e.g., "Attach via the current runtime session tool" */
	attachHint: string;
	errors: string[];
	/** Failure policy name, e.g., "stop-wave" if stopped by policy */
	failurePolicy: string | null;
}
|
|
2567
|
+
|
|
2568
|
+
// ── State Persistence Types (TS-009) ─────────────────────────────────
|
|
2569
|
+
|
|
2570
|
+
// ── v3 Resilience & Diagnostics Sections (TP-030) ────────────────────
|
|
2571
|
+
|
|
2572
|
+
/**
 * One automated repair action performed by the orchestrator.
 *
 * Repairs are deterministic strategies run when a known failure class is
 * detected (e.g., stale worktree cleanup, lock file removal). A record is
 * immutable once written, and the history it belongs to is append-only.
 *
 * @since v3 (TP-030)
 */
export interface PersistedRepairRecord {
	/** Unique ID of the repair (e.g., "r-20260319-001") */
	id: string;
	/** Name of the applied strategy (e.g., "stale-worktree-cleanup", "lock-file-removal") */
	strategy: string;
	/** How the repair ended */
	status: "succeeded" | "failed" | "skipped";
	/** Repo the repair targeted; undefined in repo mode */
	repoId?: string;
	/** Start of the repair, epoch ms */
	startedAt: number;
	/** End of the repair, epoch ms */
	endedAt: number;
}
|
|
2595
|
+
|
|
2596
|
+
/**
 * The resilience section of batch-state.json.
 *
 * Holds retry and repair metadata that the orchestrator consults when
 * deciding on retries, force-resume, and failure escalation.
 *
 * A canonical v3 state has every field present. When a v1/v2 state is
 * migrated, conservative defaults are filled in (no retries, no repairs,
 * no forced resume).
 *
 * @since v3 (TP-030)
 */
export interface ResilienceState {
	/** True when the most recent resume used --force */
	resumeForced: boolean;
	/**
	 * Number of retries attempted, keyed by scope string.
	 * Scope format: `{taskId}:w{waveIndex}:l{laneNumber}` (e.g., "TP-001:w0:l1").
	 */
	retryCountByScope: Record<string, number>;
	/**
	 * Exit classification of the latest failure, or null when nothing has
	 * failed. Reuses the `ExitClassification` union from diagnostics.ts.
	 */
	lastFailureClass: ExitClassification | null;
	/** Automated repair actions in chronological order. Append-only. */
	repairHistory: PersistedRepairRecord[];
}
|
|
2624
|
+
|
|
2625
|
+
/**
 * Compact, persisted summary of how a single task exited.
 *
 * Stored under `diagnostics.taskExits`. The full diagnostic (tokens,
 * progress, etc.) lives in the `exitDiagnostic` field of
 * `PersistedTaskRecord`.
 *
 * The classification type is `ExitClassification` from diagnostics.ts, the
 * canonical definition, so nothing is duplicated here.
 *
 * @since v3 (TP-030)
 */
export interface PersistedTaskExitSummary {
	/** Deterministic classification of the exit */
	classification: ExitClassification;
	/** Estimated cost of running this task, in USD */
	cost: number;
	/** Wall-clock task duration, in seconds */
	durationSec: number;
	/** Retry attempts made (0 if never retried) */
	retries?: number;
}
|
|
2647
|
+
|
|
2648
|
+
/**
 * The batch-level diagnostics section of batch-state.json.
 *
 * Collects per-task exit summaries together with the batch-wide cost, for
 * dashboard display and post-mortem analysis.
 *
 * A canonical v3 state has every field present. When a v1/v2 state is
 * migrated, conservative defaults are filled in (empty taskExits, zero
 * batchCost).
 *
 * @since v3 (TP-030)
 */
export interface BatchDiagnostics {
	/**
	 * Exit summary of each task, keyed by task ID.
	 * Filled in as tasks complete during execution.
	 */
	taskExits: Record<string, PersistedTaskExitSummary>;
	/** Batch cost in USD, accumulated across all tasks */
	batchCost: number;
}
|
|
2668
|
+
|
|
2669
|
+
/**
 * Build a ResilienceState holding conservative starting values: no forced
 * resume, no retry counts, no recorded failure class, empty repair history.
 *
 * Used both when a v1/v2 state is migrated to v3 and when a new batch is
 * created. Each call returns a new object with new containers.
 */
export function defaultResilienceState(): ResilienceState {
	const initial: ResilienceState = {
		resumeForced: false,
		retryCountByScope: {},
		lastFailureClass: null,
		repairHistory: [],
	};
	return initial;
}
|
|
2681
|
+
|
|
2682
|
+
/**
 * Build a BatchDiagnostics holding empty/zero starting values: no task exit
 * summaries and a batch cost of 0.
 *
 * Used both when a v1/v2 state is migrated to v3 and when a new batch is
 * created. Each call returns a new object.
 */
export function defaultBatchDiagnostics(): BatchDiagnostics {
	const initial: BatchDiagnostics = {
		taskExits: {},
		batchCost: 0,
	};
	return initial;
}
|
|
2692
|
+
|
|
2693
|
+
// ── Schema Version & Constants ───────────────────────────────────────
|
|
2694
|
+
|
|
2695
|
+
/**
 * Schema version currently written to batch-state.json.
 * Bump it whenever the persisted schema changes incompatibly.
 *
 * Version history:
 * - v1 — Original schema (TS-009). Task records had no repo-aware fields.
 *   Lane records had an optional `repoId`, but it was not validated.
 * - v2 — Repo-aware records (TP-006). Task records gain `repoId` and
 *   `resolvedRepoId`; `repoId` on lane records is formalized; the top-level
 *   state gains a `mode` field.
 * - v3 — Resilience & diagnostics (TP-030). Adds an optional `resilience`
 *   section (retry counters, force-resume, failure classification, repair
 *   history) and an optional `diagnostics` section (per-task exit summaries,
 *   batch cost). Task records gain an optional `exitDiagnostic` next to the
 *   legacy `exitReason`. Both new sections are optional for v1/v2 migration
 *   paths.
 * - v4 — Segment execution (TP-081). Adds an optional `segments` array that
 *   persists per-segment runtime state. Task records gain optional
 *   `packetRepoId`, `packetTaskPath`, `segmentIds`, and `activeSegmentId`
 *   fields. Every v4-specific field is optional, for backward compatibility
 *   with v1/v2/v3 migration paths. When migrating from v3, `segments`
 *   defaults to `[]` and the task-level segment fields default to
 *   `undefined`.
 *
 * Compatibility policy:
 * - loadBatchState() accepts v1, v2, v3, and v4 files. Older files are
 *   auto-upconverted in memory through the chain v1→v2→v3→v4; the on-disk
 *   file is NOT rewritten during load.
 * - saveBatchState() always writes v4.
 * - Schema versions > 4 are rejected with STATE_SCHEMA_INVALID.
 */
export const BATCH_STATE_SCHEMA_VERSION = 4;
|
|
2727
|
+
|
|
2728
|
+
/**
 * File name of the persisted batch state.
 * The file lives under `.pi/` in the repository root, i.e.
 * `.pi/batch-state.json`; use `batchStatePath()` to get the full path.
 */
export const BATCH_STATE_FILENAME = "batch-state.json";

/**
 * Compute the location of the batch state file for a repository.
 *
 * @param repoRoot - Absolute path to the repository root
 * @returns `repoRoot` joined with `.pi` and the batch state file name
 */
export function batchStatePath(repoRoot: string): string {
	const pathSegments = [repoRoot, ".pi", BATCH_STATE_FILENAME];
	return join(...pathSegments);
}
|
|
2741
|
+
|
|
2742
|
+
/**
 * Error codes for state persistence operations.
 *
 * - STATE_FILE_IO_ERROR: Filesystem read/write/rename failure
 * - STATE_FILE_PARSE_ERROR: File exists but contains invalid JSON
 * - STATE_SCHEMA_INVALID: JSON is valid but fails schema validation
 *   (missing required fields, unknown enum values, version mismatch)
 */
export type StateFileErrorCode =
  | "STATE_FILE_IO_ERROR"
  | "STATE_FILE_PARSE_ERROR"
  | "STATE_SCHEMA_INVALID";

/**
 * Typed error class for state file operations.
 *
 * Carries a stable `code` so callers can branch on the failure category
 * instead of parsing the message.
 */
export class StateFileError extends Error {
  /** Failure category for this error. */
  code: StateFileErrorCode;

  /**
   * @param code - Failure category
   * @param message - Human-readable description of the failure
   */
  constructor(code: StateFileErrorCode, message: string) {
    super(message);
    // Restore the prototype chain so `instanceof StateFileError` also holds
    // when the class is compiled to a target that does not natively support
    // subclassing built-ins such as Error.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "StateFileError";
    this.code = code;
  }
}
|
|
2765
|
+
|
|
2766
|
+
/**
 * Persisted record of a single task's execution state.
 *
 * Contains everything `/orch-resume` needs to reconstruct
 * task progress without re-running discovery.
 *
 * Repo-aware fields (v2):
 * `repoId` and `resolvedRepoId` capture task-to-repo attribution
 * so resume can reconstruct repo routing without re-running discovery.
 *
 * Mode semantics:
 * - **repo mode**: Both fields are `undefined`. Tasks implicitly target
 *   the single repository (cwd). No repo routing needed.
 * - **workspace mode**: `repoId` is the repo ID declared in PROMPT.md
 *   (may be `undefined` if the task didn't declare one). `resolvedRepoId`
 *   is the final repo ID after applying the routing precedence chain
 *   (prompt → area → workspace default). Always a non-empty string in
 *   workspace mode for tasks that passed routing validation.
 *
 * Source of truth:
 * - For allocated tasks: derived from `ParsedTask.promptRepoId` and
 *   `ParsedTask.resolvedRepoId` via `serializeBatchState()`.
 * - For unallocated/pending tasks: derived from the same ParsedTask
 *   fields via discovery enrichment in `persistRuntimeState()`.
 */
export interface PersistedTaskRecord {
  /** Task identifier (e.g., "TO-014") */
  taskId: string;
  /** Lane number the task was assigned to (1-indexed) */
  laneNumber: number;
  /** Lane session name used (e.g., "orch-lane-1") */
  sessionName: string;
  /** Current task status */
  status: LaneTaskStatus;
  /** Absolute path to the task's folder (contains PROMPT.md, STATUS.md) */
  taskFolder: string;
  /** Epoch ms when task started (null if never started) */
  startedAt: number | null;
  /** Epoch ms when task ended (null if still pending/running) */
  endedAt: number | null;
  /** Whether .DONE file was found for this task */
  doneFileFound: boolean;
  /** Human-readable exit reason (if completed/failed) */
  exitReason: string;
  /**
   * Repo ID declared in the task's PROMPT.md metadata (v2).
   * Undefined in repo mode or if the task didn't declare a repo.
   */
  repoId?: string;
  /**
   * Resolved repo ID after applying routing precedence (v2).
   * Undefined in repo mode. In workspace mode, this is the final
   * repo target after prompt → area → workspace-default fallback.
   */
  resolvedRepoId?: string;
  /**
   * Number of commits preserved as partial progress for a failed task (TP-028).
   * Undefined when no partial progress was saved (succeeded tasks, no commits, etc.).
   * Optional for backward compatibility with pre-TP-028 state files.
   */
  partialProgressCommits?: number;
  /**
   * Saved branch name holding partial progress for a failed task (TP-028).
   * Undefined when no partial progress was saved.
   * Optional for backward compatibility with pre-TP-028 state files.
   */
  partialProgressBranch?: string;
  /**
   * Structured exit diagnostic for this task (v3, TP-030).
   *
   * Canonical structured exit data — preferred over the legacy `exitReason`
   * string when present. Contains deterministic classification, cost, timing,
   * and progress metadata.
   *
   * Optional for backward compatibility with v1/v2 state files and tasks
   * that haven't exited yet. Consumers should check `exitDiagnostic` first,
   * falling back to `exitReason` for display.
   */
  exitDiagnostic?: TaskExitDiagnostic;
  /**
   * Repo ID that owns task packet files (PROMPT.md/STATUS.md/.DONE) (v4, TP-081).
   *
   * In workspace mode, this is the `taskPacketRepo` from routing config.
   * Undefined in repo mode or for pre-v4 state files.
   */
  packetRepoId?: string;
  /**
   * Absolute path to the task folder in the packet repo worktree (v4, TP-081).
   *
   * Used by resume to locate packet files without re-running discovery.
   * Undefined in repo mode or for pre-v4 state files.
   */
  packetTaskPath?: string;
  /**
   * Segment IDs belonging to this task (v4, TP-081).
   *
   * Array of segment ID strings (`<taskId>::<repoId>`).
   * Empty array for repo-mode tasks or single-repo tasks.
   * Undefined for pre-v4 state files.
   */
  segmentIds?: string[];
  /**
   * Currently executing segment ID (v4, TP-081).
   *
   * Null when no segment is active (all completed or not started).
   * Undefined for pre-v4 state files.
   */
  activeSegmentId?: string | null;
}
|
|
2875
|
+
|
|
2876
|
+
// ── Segment-Level Persisted State (v4, TP-081) ──────────────────────
|
|
2877
|
+
|
|
2878
|
+
/**
 * Segment execution status within a batch.
 *
 * State machine mirrors `LaneTaskStatus` but applies at segment granularity:
 *   pending → running → succeeded
 *                     → failed
 *                     → stalled
 *   pending → skipped (prior segment failed, or task skipped)
 *
 * @since v4 (TP-081)
 */
export type PersistedSegmentStatus =
  | "pending"
  | "running"
  | "succeeded"
  | "failed"
  | "stalled"
  | "skipped";
|
|
2896
|
+
|
|
2897
|
+
/**
 * Persisted record of a single segment's execution state.
 *
 * A segment is a repo-scoped execution unit within a task. Each task
 * may have one or more segments (one per repo the task touches).
 *
 * Contains everything `/orch-resume` needs to reconstruct segment-level
 * progress without re-running discovery.
 *
 * @since v4 (TP-081)
 */
export interface PersistedSegmentRecord {
  /** Stable segment identifier (`<taskId>::<repoId>`, e.g., "TP-002::api") */
  segmentId: string;
  /** Parent task identifier */
  taskId: string;
  /** Repo ID this segment targets */
  repoId: string;
  /** Segment execution status */
  status: PersistedSegmentStatus;
  /** Lane ID the segment executed on (e.g., "lane-1"), empty if not yet assigned */
  laneId: string;
  /** Lane session name used for this segment */
  sessionName: string;
  /** Absolute path to the worktree used for this segment */
  worktreePath: string;
  /** Git branch name checked out for this segment */
  branch: string;
  /** Epoch ms when segment execution started (null if not yet started) */
  startedAt: number | null;
  /** Epoch ms when segment execution ended (null if still pending/running) */
  endedAt: number | null;
  /** Number of retry attempts for this segment */
  retries: number;
  /**
   * Segment IDs this segment depends on (intra-task DAG edges).
   * Empty array for the first segment in a task or for tasks with no intra-task deps.
   */
  dependsOnSegmentIds: string[];
  /**
   * Structured exit diagnostic for this segment.
   * Optional: absent for segments that haven't exited yet.
   * Uses the same `TaskExitDiagnostic` shape from diagnostics.ts.
   */
  exitDiagnostic?: TaskExitDiagnostic;
  /** Human-readable exit reason (legacy compat, same as task-level) */
  exitReason: string;
  /** Anchor segment ID this segment was dynamically expanded from (if any). */
  expandedFrom?: string;
  /** Segment expansion request ID that created this segment (if any). */
  expansionRequestId?: string;
}
|
|
2949
|
+
|
|
2950
|
+
/**
 * Persisted record of a lane's configuration.
 *
 * Captures worktree/branch assignment so `/orch-resume` can
 * reconnect to existing worktrees without re-allocation.
 *
 * Repo-aware contract (v2):
 * `repoId` captures which repository this lane targets.
 *
 * Mode semantics:
 * - **repo mode**: `repoId` is `undefined`. The lane's worktree is
 *   created from the single repository (cwd). All lanes share the
 *   same repo implicitly.
 * - **workspace mode**: `repoId` is a non-empty string matching a
 *   key in `WorkspaceConfig.repos`. All tasks assigned to this lane
 *   target the same repo. Lane allocation guarantees repo affinity
 *   (no lane mixes tasks from different repos).
 *
 * Source of truth: derived from `AllocatedLane.repoId` during
 * serialization in `serializeBatchState()`.
 */
export interface PersistedLaneRecord {
  /** Lane number (1-indexed) */
  laneNumber: number;
  /** Lane identifier (e.g., "lane-1") */
  laneId: string;
  /** Lane session identifier (e.g., "orch-lane-1") */
  laneSessionId: string;
  /** Absolute path to the lane's worktree directory */
  worktreePath: string;
  /** Git branch name checked out in the worktree */
  branch: string;
  /** Task IDs assigned to this lane in execution order */
  taskIds: string[];
  /**
   * Repo ID this lane targets (v2).
   * Undefined in repo mode. Non-empty string in workspace mode,
   * matching a key in `WorkspaceConfig.repos`.
   */
  repoId?: string;
}
|
|
2991
|
+
|
|
2992
|
+
/**
 * Persisted summary of a wave merge result.
 * Minimal subset of MergeWaveResult needed for resume decisions.
 */
export interface PersistedMergeResult {
  /** Wave index (0-based) */
  waveIndex: number;
  /** Merge status */
  status: "succeeded" | "failed" | "partial";
  /** Which lane failed (null if all succeeded) */
  failedLane: number | null;
  /** Failure reason (null if all succeeded) */
  failureReason: string | null;
  /**
   * Per-repo merge outcomes (v2, TP-009).
   * Populated in workspace mode when MergeWaveResult.repoResults is available.
   * Undefined/absent in repo mode or for older state files. Dashboard treats
   * absence as single-repo merge.
   */
  repoResults?: PersistedRepoMergeOutcome[];
}
|
|
3013
|
+
|
|
3014
|
+
/**
 * Persisted per-repo merge outcome within a wave merge.
 * Serializable subset of RepoMergeOutcome — excludes full MergeLaneResult
 * objects (which contain detailed merge agent result JSON) to keep state file compact.
 */
export interface PersistedRepoMergeOutcome {
  /** Repo ID. Undefined for the default group in repo mode. */
  repoId: string | undefined;
  /** Merge status for this repo. */
  status: "succeeded" | "failed" | "partial";
  /** Lane numbers involved in this repo's merge. */
  laneNumbers: number[];
  /** Failed lane number within this repo (null if all succeeded). */
  failedLane: number | null;
  /** Failure reason within this repo (null if all succeeded). */
  failureReason: string | null;
}
|
|
3031
|
+
|
|
3032
|
+
/**
 * Persisted batch state written to `.pi/batch-state.json`.
 *
 * This is the serialization contract for batch state persistence.
 * It captures enough information for `/orch-resume` to reconstruct
 * the orchestrator state after a terminal disconnect.
 *
 * Design decisions:
 * - `schemaVersion` lets the loader recognise older files (which are
 *   upconverted in memory) and reject unsupported newer versions
 * - Phase uses the same `OrchBatchPhase` literal union as runtime state
 * - Per-task records include folder paths and session names for resume
 * - Merge results are summarized (not full MergeWaveResult) for size
 * - `updatedAt` is monotonic (epoch ms) for staleness detection
 * - `lastError` captures most recent error without PII
 *
 * v2 additions (TP-006):
 * - `mode` field captures workspace vs repo mode at batch start
 * - Task records include `repoId` and `resolvedRepoId` for repo attribution
 * - Lane records formalize `repoId` contract per mode
 * - v1 files are auto-upconverted: `mode` defaults to "repo", task/lane
 *   `repoId` fields default to `undefined` (omitted from JSON)
 *
 * v3 additions (TP-030):
 * - `resilience` section (required): retry counters, force-resume intent,
 *   failure classification, and repair history for automated recovery.
 * - `diagnostics` section (required): per-task exit summaries and batch cost.
 * - Task records gain optional `exitDiagnostic` (canonical structured exit
 *   data alongside legacy `exitReason` string).
 * - Both sections are required in v3. Migration from v1/v2 fills
 *   conservative defaults (see `defaultResilienceState()` / `defaultBatchDiagnostics()`).
 *
 * v4 additions (TP-081):
 * - `segments` array (required): per-segment execution records for multi-repo
 *   task execution. Empty array in repo mode or for pre-v4 migration.
 * - Task records gain optional `packetRepoId`, `packetTaskPath`, `segmentIds`,
 *   and `activeSegmentId` for segment-level tracking.
 * - Migration from v3 fills `segments` as `[]` and leaves task-level segment
 *   fields as `undefined`.
 */
export interface PersistedBatchState {
  /** Schema version — must equal BATCH_STATE_SCHEMA_VERSION (currently 4) */
  schemaVersion: number;
  /** Current batch execution phase */
  phase: OrchBatchPhase;
  /** Unique batch identifier (timestamp format) */
  batchId: string;
  /** Branch that was active when /orch started — used as base for worktrees and merge target */
  baseBranch: string;
  /** Orchestrator-managed branch name (e.g., 'orch/henry-20260318T140000'). Empty = legacy mode (merge into baseBranch directly). */
  orchBranch: string;
  /**
   * Workspace execution mode at batch start (v2).
   * - "repo": Single-repo mode (default, backward-compatible).
   * - "workspace": Multi-repo workspace mode.
   * Defaults to "repo" when loading v1 state files.
   */
  mode: WorkspaceMode;
  /** Epoch ms when batch started */
  startedAt: number;
  /** Epoch ms when state was last written */
  updatedAt: number;
  /** Epoch ms when batch ended (null if still active) */
  endedAt: number | null;
  /** Current wave index (0-based, -1 if not started) */
  currentWaveIndex: number;
  /** Total number of waves in the plan */
  totalWaves: number;
  /**
   * Number of dependency-driven task-level waves (TP-166).
   * Undefined for batches created before TP-166; falls back to totalWaves.
   */
  taskLevelWaveCount?: number;
  /**
   * Maps segment round index (0-based) to parent task-level wave (0-based).
   * Undefined for batches created before TP-166.
   */
  roundToTaskWave?: number[];
  /** Wave plan: array of arrays of task IDs per wave */
  wavePlan: string[][];
  /** Per-lane configuration records */
  lanes: PersistedLaneRecord[];
  /** Per-task execution records (all tasks across all waves) */
  tasks: PersistedTaskRecord[];
  /** Merge results for completed waves */
  mergeResults: PersistedMergeResult[];
  /** Summary counter: total number of tasks */
  totalTasks: number;
  /** Summary counter: succeeded tasks */
  succeededTasks: number;
  /** Summary counter: failed tasks */
  failedTasks: number;
  /** Summary counter: skipped tasks */
  skippedTasks: number;
  /** Summary counter: blocked tasks */
  blockedTasks: number;
  /** Task IDs blocked for future waves (from skip-dependents) */
  blockedTaskIds: string[];
  /** Most recent error (code + message, no PII) */
  lastError: { code: string; message: string } | null;
  /** Accumulated error messages */
  errors: string[];
  /**
   * Resilience state for retry/recovery tracking (v3, TP-030).
   * Required in v3+. Migration from v1/v2 fills conservative defaults.
   */
  resilience: ResilienceState;
  /**
   * Batch-level diagnostics for cost tracking and exit summaries (v3, TP-030).
   * Required in v3+. Migration from v1/v2 fills conservative defaults.
   */
  diagnostics: BatchDiagnostics;
  /**
   * Per-segment execution records for multi-repo task execution (v4, TP-081).
   *
   * Each entry represents one repo-scoped segment of a task. In repo mode
   * or for single-repo tasks, this array is empty (segment tracking is
   * implicit via task records).
   *
   * Required in v4. Migration from v1/v2/v3 fills empty array.
   */
  segments: PersistedSegmentRecord[];
  /**
   * Unknown top-level fields captured during deserialization.
   * Preserved on roundtrip to avoid data loss from future schema extensions
   * or external tools writing additional fields.
   * Not serialized directly — merged back by `serializeBatchState()`.
   */
  _extraFields?: Record<string, unknown>;
}
|
|
3157
|
+
|
|
3158
|
+
// ── Resume (TS-009 Step 4) ───────────────────────────────────────────
|
|
3159
|
+
|
|
3160
|
+
/**
 * Error codes for /orch-resume command failures.
 *
 * - RESUME_NO_STATE: No batch-state.json found on disk
 * - RESUME_INVALID_STATE: State file exists but cannot be parsed/validated
 * - RESUME_SCHEMA_MISMATCH: State file has incompatible schema version
 * - RESUME_PHASE_NOT_RESUMABLE: Persisted phase does not allow resume
 * - RESUME_TMUX_UNAVAILABLE: Legacy session backend is unavailable for reconnection
 * - RESUME_EXECUTION_FAILED: Resume reconciliation succeeded but execution failed
 */
export type ResumeErrorCode =
  | "RESUME_NO_STATE"
  | "RESUME_INVALID_STATE"
  | "RESUME_SCHEMA_MISMATCH"
  | "RESUME_PHASE_NOT_RESUMABLE"
  | "RESUME_TMUX_UNAVAILABLE"
  | "RESUME_EXECUTION_FAILED";

/**
 * Typed error class for resume failures with stable error codes.
 *
 * Callers can branch on `code` instead of parsing the message.
 */
export class ResumeError extends Error {
  /** Failure category for this error. */
  code: ResumeErrorCode;

  /**
   * @param code - Failure category
   * @param message - Human-readable description of the failure
   */
  constructor(code: ResumeErrorCode, message: string) {
    super(message);
    // Restore the prototype chain so `instanceof ResumeError` also holds
    // when the class is compiled to a target that does not natively support
    // subclassing built-ins such as Error.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "ResumeError";
    this.code = code;
  }
}
|
|
3188
|
+
|
|
3189
|
+
/**
 * Result of reconciling a single task's persisted state against live signals.
 *
 * Combines persisted status, lane-session liveness, and .DONE file presence
 * into a deterministic action for the resume engine.
 *
 * Reconciliation precedence (highest → lowest):
 * 1. .DONE file found → "mark-complete" (regardless of session state)
 * 2. Session alive + no .DONE → "reconnect" (task is still running)
 * 3. Persisted status is terminal (succeeded/failed/stalled/skipped) → "skip"
 * 4. Session dead + no .DONE + was running → "mark-failed"
 *
 * NOTE(review): the `action` union also contains "re-execute" and "pending",
 * which the precedence list above does not cover — confirm their conditions
 * against the reconciliation implementation.
 */
export interface ReconciledTaskState {
  /** Task identifier */
  taskId: string;
  /** Status from the persisted state file */
  persistedStatus: LaneTaskStatus;
  /** Reconciled live status after checking signals */
  liveStatus: LaneTaskStatus;
  /** Whether the lane session is alive right now */
  sessionAlive: boolean;
  /** Whether the .DONE file was found */
  doneFileFound: boolean;
  /** Whether the lane worktree still exists on disk */
  worktreeExists: boolean;
  /** Action the resume engine should take */
  action: "reconnect" | "mark-complete" | "mark-failed" | "re-execute" | "skip" | "pending";
}
|
|
3217
|
+
|
|
3218
|
+
/**
 * Result of resume eligibility check.
 *
 * Determines whether a persisted batch state can be resumed based on its phase.
 */
export interface ResumeEligibility {
  /** Whether the batch can be resumed */
  eligible: boolean;
  /** Human-readable reason (for both eligible and ineligible) */
  reason: string;
  /** Persisted phase */
  phase: OrchBatchPhase;
  /** Batch ID */
  batchId: string;
}
|
|
3233
|
+
|
|
3234
|
+
/**
 * Resume point computed from reconciled task states.
 *
 * Tells the resume engine where to start in the wave plan.
 */
export interface ResumePoint {
  /** Wave index to resume from (0-based) */
  resumeWaveIndex: number;
  /** Task IDs confirmed completed (via .DONE or prior succeeded) */
  completedTaskIds: string[];
  /** Task IDs that still need execution */
  pendingTaskIds: string[];
  /** Task IDs confirmed failed (dead session, no .DONE) */
  failedTaskIds: string[];
  /** Task IDs with alive sessions that need reconnection */
  reconnectTaskIds: string[];
  /** Task IDs with dead sessions but existing worktrees that need re-execution */
  reExecuteTaskIds: string[];
  /**
   * Wave indexes (0-based) where all tasks are terminal but the merge
   * is missing or failed. These waves should be retried for merge only
   * (no task re-execution). Empty when all completed waves have
   * successful merges. (TP-037, Bug #102)
   */
  mergeRetryWaveIndexes: number[];
}
|
|
3260
|
+
|
|
3261
|
+
// ── Abort (TS-009 Step 5) ────────────────────────────────────────────
|
|
3262
|
+
|
|
3263
|
+
/**
 * Abort mode: graceful (checkpoint + wait + force-kill) or hard (immediate kill).
 */
export type AbortMode = "graceful" | "hard";
|
|
3267
|
+
|
|
3268
|
+
/**
 * Error codes for abort operations.
 *
 * - ABORT_TMUX_LIST_FAILED: Could not list legacy session records
 * - ABORT_WRAPUP_WRITE_FAILED: Failed to write wrap-up signal file(s)
 * - ABORT_KILL_FAILED: Failed to kill one or more lane sessions
 * - ABORT_STATE_DELETE_FAILED: Failed to delete batch-state.json
 */
export type AbortErrorCode =
  | "ABORT_TMUX_LIST_FAILED"
  | "ABORT_WRAPUP_WRITE_FAILED"
  | "ABORT_KILL_FAILED"
  | "ABORT_STATE_DELETE_FAILED";
|
|
3281
|
+
|
|
3282
|
+
/**
 * Per-lane result from an abort operation.
 */
export interface AbortLaneResult {
  /** Lane session name */
  sessionName: string;
  /** Lane ID (e.g., "lane-1") or "unknown" */
  laneId: string;
  /** Task ID if known */
  taskId: string | null;
  /** Task folder path in the worktree (for wrap-up file writing) */
  taskFolderInWorktree: string | null;
  /** Whether wrap-up files were written (graceful only) */
  wrapUpWritten: boolean;
  /** Wrap-up write error if any */
  wrapUpError: string | null;
  /** Whether the session was killed */
  sessionKilled: boolean;
  /** Whether the session exited gracefully (before force-kill) */
  exitedGracefully: boolean;
}
|
|
3303
|
+
|
|
3304
|
+
/**
 * Overall result from an abort operation.
 */
export interface AbortResult {
  /** Abort mode used */
  mode: AbortMode;
  /** Number of sessions found to abort */
  sessionsFound: number;
  /** Number of sessions actually killed (force-killed or graceful exit) */
  sessionsKilled: number;
  /** Number of sessions that exited gracefully (before timeout) */
  gracefulExits: number;
  /** Per-lane results */
  laneResults: AbortLaneResult[];
  /** Number of wrap-up write failures (graceful only) */
  wrapUpFailures: number;
  /** Whether batch state file was deleted */
  stateDeleted: boolean;
  /** Aggregated errors */
  errors: Array<{ code: AbortErrorCode; message: string }>;
  /** Duration of the abort operation in milliseconds */
  durationMs: number;
}
|
|
3327
|
+
|
|
3328
|
+
/**
 * Action step in an abort plan.
 *
 * Discriminated on `type`. Only the "poll-wait" step carries data: the
 * grace period and the polling interval, both in milliseconds.
 */
export type AbortActionStep =
  | { type: "write-wrapup" }
  | { type: "poll-wait"; gracePeriodMs: number; pollIntervalMs: number }
  | { type: "kill-remaining" }
  | { type: "kill-all" };
|
|
3336
|
+
|
|
3337
|
+
/**
 * Target session with enrichment from persisted state.
 */
export interface AbortTargetSession {
  /** Lane session name */
  sessionName: string;
  /** Lane ID from persisted state or "unknown" */
  laneId: string;
  /** Task ID from persisted state or null */
  taskId: string | null;
  /** Task folder path resolved in the worktree (for wrap-up files), or null */
  taskFolderInWorktree: string | null;
  /** Worktree path from persisted state or batch state */
  worktreePath: string | null;
}
|
|
3352
|
+
|
|
3353
|
+
// ── Size-to-Duration Mapping ─────────────────────────────────────────
|
|
3354
|
+
|
|
3355
|
+
/**
 * Default duration mapping (size → minutes).
 *
 * | Size | Weight | Duration |
 * |------|--------|----------|
 * | S    | 1      | 30 min   |
 * | M    | 2      | 60 min   |
 * | L    | 4      | 120 min  |
 */
export const SIZE_DURATION_MINUTES: Record<string, number> = {
  S: 30,
  M: 60,
  L: 120,
};

/** Minutes per unit of size weight, used when a size has no explicit mapping. */
export const DURATION_BASE_MINUTES = 30;

/**
 * Get estimated duration in minutes for a task size.
 *
 * Uses the explicit `SIZE_DURATION_MINUTES` mapping when the size is listed
 * there; otherwise falls back to weight × `DURATION_BASE_MINUTES`, where the
 * weight is `sizeWeights[size]`, then `sizeWeights["M"]`, then 2.
 *
 * Only own properties of the lookup tables are consulted, so size strings
 * that collide with inherited `Object.prototype` members (e.g. "toString",
 * "constructor") are treated as unknown sizes instead of yielding a
 * function or NaN.
 *
 * @param size - Task size label (e.g., "S", "M", "L")
 * @param sizeWeights - Configured size → weight table
 * @returns Estimated duration in minutes
 */
export function getTaskDurationMinutes(size: string, sizeWeights: Record<string, number>): number {
  const ownValue = (table: Record<string, number>, key: string): number | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  const explicitMinutes = ownValue(SIZE_DURATION_MINUTES, size);
  if (explicitMinutes !== undefined) {
    return explicitMinutes;
  }

  // `||` is deliberate: a missing or zero weight falls through to the next
  // candidate, matching the original fallback chain.
  const weight = ownValue(sizeWeights, size) || ownValue(sizeWeights, "M") || 2;
  return weight * DURATION_BASE_MINUTES;
}
|
|
3382
|
+
|
|
3383
|
+
// ── Batch History ────────────────────────────────────────────────────
|
|
3384
|
+
|
|
3385
|
+
/** Token counts for a task, wave, or batch. */
export interface TokenCounts {
  /** Input token count */
  input: number;
  /** Output token count */
  output: number;
  /** Cache-read token count */
  cacheRead: number;
  /** Cache-write token count */
  cacheWrite: number;
  /** Cost in USD */
  costUsd: number;
}
|
|
3393
|
+
|
|
3394
|
+
/** Per-task summary for history. */
export interface BatchTaskSummary {
  /** Task identifier */
  taskId: string;
  /** Task name */
  taskName: string;
  /** Task status recorded in the history entry */
  status: "succeeded" | "failed" | "skipped" | "blocked" | "stalled" | "pending";
  /** Wave number (1-based) */
  wave: number;
  /** Lane number (1-based) */
  lane: number;
  /** Task duration in milliseconds */
  durationMs: number;
  /** Token counts for this task */
  tokens: TokenCounts;
  /** Exit reason, or null when there is none */
  exitReason: string | null;
}
|
|
3405
|
+
|
|
3406
|
+
/** Per-wave summary for history. */
export interface BatchWaveSummary {
  /** Wave number (1-based) */
  wave: number;
  /** Task IDs in this wave */
  tasks: string[];
  /** Merge status for this wave */
  mergeStatus: "succeeded" | "failed" | "partial" | "skipped";
  /** Wave duration in milliseconds */
  durationMs: number;
  /** Token counts for this wave */
  tokens: TokenCounts;
}
|
|
3414
|
+
|
|
3415
|
+
/** Complete batch history entry — written after Phase 3 cleanup. */
export interface BatchHistorySummary {
  /** Batch identifier */
  batchId: string;
  /** Final batch status */
  status: "completed" | "partial" | "failed" | "aborted";
  /** Batch start timestamp */
  startedAt: number;
  /** Batch end timestamp */
  endedAt: number;
  /** Batch duration in milliseconds */
  durationMs: number;
  /** Total number of waves */
  totalWaves: number;
  /** Total number of tasks */
  totalTasks: number;
  /** Number of succeeded tasks */
  succeededTasks: number;
  /** Number of failed tasks */
  failedTasks: number;
  /** Number of skipped tasks */
  skippedTasks: number;
  /** Number of blocked tasks */
  blockedTasks: number;
  /** Token counts for the batch */
  tokens: TokenCounts;
  /** Per-task summaries */
  tasks: BatchTaskSummary[];
  /** Per-wave summaries */
  waves: BatchWaveSummary[];
  /** Timestamp (ms since epoch) when the batch was integrated. Set by orch-integrate. */
  integratedAt?: number;
}
|
|
3434
|
+
|
|
3435
|
+
/** Max number of batch history entries to retain. */
|
|
3436
|
+
export const BATCH_HISTORY_MAX_ENTRIES = 100;
|
|
3437
|
+
|
|
3438
|
+
// ── Workspace Mode Types ─────────────────────────────────────────────
|
|
3439
|
+
|
|
3440
|
+
/**
|
|
3441
|
+
* Workspace execution mode.
|
|
3442
|
+
*
|
|
3443
|
+
* Mode behavior contract:
|
|
3444
|
+
* - **"repo"** (default): No workspace config file present. The orchestrator
|
|
3445
|
+
* treats `cwd` as both the workspace root and the single repo root.
|
|
3446
|
+
* All existing monorepo behavior is preserved unchanged.
|
|
3447
|
+
* - **"workspace"**: A `.pi/orchid-workspace.yaml` file is present and
|
|
3448
|
+
* valid. The orchestrator runs from a non-git workspace root that
|
|
3449
|
+
* coordinates multiple repos and a shared task root.
|
|
3450
|
+
*
|
|
3451
|
+
* Mode determination rules:
|
|
3452
|
+
* 1. Workspace config file present + invalid → fatal error with actionable
|
|
3453
|
+
* `WorkspaceConfigError` (never silently falls back to repo mode).
|
|
3454
|
+
* 2. Workspace config file present + valid → workspace mode.
|
|
3455
|
+
* 3. No workspace config + cwd is a git repo → repo mode.
|
|
3456
|
+
* 4. No workspace config + cwd is not a git repo → `WORKSPACE_SETUP_REQUIRED`.
|
|
3457
|
+
*/
|
|
3458
|
+
export type WorkspaceMode = "repo" | "workspace";
|
|
3459
|
+
|
|
3460
|
+
/**
 * Configuration for one repository inside a workspace.
 *
 * A repo is addressed by a stable ID (e.g., "api", "frontend"); the ID
 * is used both to route tasks to repos and for display purposes.
 */
export interface WorkspaceRepoConfig {
	/** Stable identifier for this repo (e.g., "api", "frontend") */
	id: string;
	/** Absolute filesystem path to the repo root (must be a git repo) */
	path: string;
	/**
	 * Optional default branch override (e.g., "develop", "main").
	 * Falls back to repo HEAD when omitted.
	 */
	defaultBranch?: string;
}
|
|
3474
|
+
|
|
3475
|
+
/**
 * Routing configuration for workspace mode.
 *
 * Determines where tasks are discovered and which repo receives
 * operations that do not name a repo.
 */
export interface WorkspaceRoutingConfig {
	/**
	 * Absolute path to the shared tasks root directory.
	 * Every task area is resolved relative to this path, and the
	 * directory must exist on disk.
	 */
	tasksRoot: string;
	/**
	 * Repo ID used for operations that don't specify a repo.
	 * Must reference a valid key in `WorkspaceConfig.repos`.
	 */
	defaultRepo: string;
	/**
	 * Repo ID that owns task packet files (PROMPT.md/STATUS.md/.DONE/.reviews).
	 *
	 * Required at runtime. Legacy workspace YAML that omits this field is
	 * compatibility-mapped to `defaultRepo` during load, with a warning.
	 *
	 * Invariant: `tasksRoot` must resolve inside `repos[taskPacketRepo].path`.
	 */
	taskPacketRepo: string;
	/**
	 * Strict routing switch.
	 *
	 * When true, every task MUST declare an explicit execution target
	 * (via a `## Execution Target` section or an inline `**Repo:**` in
	 * PROMPT.md). Area-level and workspace-default fallbacks still take
	 * part in validation (unknown-repo checks) but are NOT used for
	 * automatic resolution.
	 *
	 * The purpose is to prevent accidental misrouting in large multi-team
	 * workspaces, where task authors must be intentional about which repo
	 * a task targets.
	 *
	 * Default: false (permissive — existing precedence chain applies).
	 * Only meaningful in workspace mode.
	 */
	strict?: boolean;
}
|
|
3516
|
+
|
|
3517
|
+
/**
 * Top-level workspace configuration.
 *
 * Loaded from `.pi/orchid-workspace.yaml` when that file is present.
 * Treated as immutable once initial validation has passed — it is never
 * mutated at runtime.
 */
export interface WorkspaceConfig {
	/** Active workspace mode */
	mode: WorkspaceMode;
	/**
	 * Map of repo ID → repo configuration.
	 * At least one repo is required in workspace mode.
	 */
	repos: Map<string, WorkspaceRepoConfig>;
	/** Routing configuration (tasks root, default repo) */
	routing: WorkspaceRoutingConfig;
	/** Absolute path to the workspace config file that was loaded */
	configPath: string;
}
|
|
3533
|
+
|
|
3534
|
+
/**
 * Canonical execution context for the orchestrator.
 *
 * The primary runtime context threaded through orchestrator entry
 * points. It replaces the earlier pattern of passing a raw `cwd` as the
 * sole repo root.
 *
 * - Repo mode: `workspaceRoot` and `repoRoot` are the same directory.
 * - Workspace mode: `workspaceRoot` is the non-git coordination root and
 *   `repoRoot` is the default repo from the workspace config.
 *
 * Design rationale:
 * - Step 2 (wire orchestrator startup) will construct this from config
 *   loading results and thread it into `executeOrchBatch()` and friends.
 * - `repoRoot` is always a git repository, which preserves the invariant
 *   that git operations (worktree, branch, merge) have a valid target.
 * - `workspaceConfig` is null in repo mode (no workspace file loaded).
 */
export interface ExecutionContext {
	/** Absolute path to the workspace root (cwd in repo mode, workspace dir in workspace mode) */
	workspaceRoot: string;
	/** Absolute path to the default/primary git repo root */
	repoRoot: string;
	/** Active workspace mode */
	mode: WorkspaceMode;
	/** Workspace configuration (null in repo mode) */
	workspaceConfig: WorkspaceConfig | null;
	/** Loaded task runner configuration */
	taskRunnerConfig: TaskRunnerConfig;
	/** Loaded orchestrator configuration */
	orchestratorConfig: OrchestratorConfig;
	/**
	 * Resolved pointer for config/agent paths (null in repo mode).
	 *
	 * When present, `pointer.configRoot` and `pointer.agentRoot` point to
	 * the config repo's config directory. State/sidecar paths are NOT
	 * affected by the pointer — they always live at `<workspaceRoot>/.pi/`.
	 */
	pointer: PointerResolution | null;
}
|
|
3574
|
+
|
|
3575
|
+
// ── Workspace Validation Error Types ─────────────────────────────────
|
|
3576
|
+
|
|
3577
|
+
/**
|
|
3578
|
+
* Error codes for workspace configuration validation failures.
|
|
3579
|
+
*
|
|
3580
|
+
* Each code maps to a deterministic validation rule from the workspace
|
|
3581
|
+
* config loading pipeline. Codes are stable and machine-branchable.
|
|
3582
|
+
*
|
|
3583
|
+
* - WORKSPACE_FILE_READ_ERROR: Config file exists but cannot be read (permissions, encoding)
|
|
3584
|
+
* - WORKSPACE_FILE_PARSE_ERROR: Config file contains invalid YAML
|
|
3585
|
+
* - WORKSPACE_MISSING_REPOS: No repos defined in workspace config (at least one required)
|
|
3586
|
+
* - WORKSPACE_REPO_PATH_MISSING: A repo entry has no `path` field
|
|
3587
|
+
* - WORKSPACE_REPO_PATH_NOT_FOUND: A repo's `path` does not exist on disk
|
|
3588
|
+
* - WORKSPACE_REPO_NOT_GIT: A repo's `path` exists but is not a git repository
|
|
3589
|
+
* - WORKSPACE_MISSING_TASKS_ROOT: `routing.tasks_root` is missing or empty
|
|
3590
|
+
* - WORKSPACE_TASKS_ROOT_NOT_FOUND: `routing.tasks_root` path does not exist on disk
|
|
3591
|
+
* - WORKSPACE_MISSING_DEFAULT_REPO: `routing.default_repo` is missing or empty
|
|
3592
|
+
* - WORKSPACE_DEFAULT_REPO_NOT_FOUND: `routing.default_repo` references a repo ID not in the repos map
|
|
3593
|
+
* - WORKSPACE_TASK_PACKET_REPO_NOT_FOUND: `routing.task_packet_repo` references a repo ID not in the repos map
|
|
3594
|
+
* - WORKSPACE_TASKS_ROOT_OUTSIDE_PACKET_REPO: `routing.tasks_root` resolves outside `repos[routing.task_packet_repo].path`
|
|
3595
|
+
* - WORKSPACE_TASK_AREA_OUTSIDE_TASKS_ROOT: A configured task-area path resolves outside `routing.tasks_root`
|
|
3596
|
+
* - WORKSPACE_SETUP_REQUIRED: No workspace config and cwd is not a git repository
|
|
3597
|
+
* - WORKSPACE_DUPLICATE_REPO_PATH: Two or more repos share the same filesystem path
|
|
3598
|
+
* - WORKSPACE_SCHEMA_INVALID: Config file has valid YAML but missing/invalid top-level structure
|
|
3599
|
+
*/
|
|
3600
|
+
export type WorkspaceConfigErrorCode =
	| "WORKSPACE_FILE_READ_ERROR"
	| "WORKSPACE_FILE_PARSE_ERROR"
	| "WORKSPACE_MISSING_REPOS"
	| "WORKSPACE_REPO_PATH_MISSING"
	| "WORKSPACE_REPO_PATH_NOT_FOUND"
	| "WORKSPACE_REPO_NOT_GIT"
	| "WORKSPACE_MISSING_TASKS_ROOT"
	| "WORKSPACE_TASKS_ROOT_NOT_FOUND"
	| "WORKSPACE_MISSING_DEFAULT_REPO"
	| "WORKSPACE_DEFAULT_REPO_NOT_FOUND"
	| "WORKSPACE_TASK_PACKET_REPO_NOT_FOUND"
	| "WORKSPACE_TASKS_ROOT_OUTSIDE_PACKET_REPO"
	| "WORKSPACE_TASK_AREA_OUTSIDE_TASKS_ROOT"
	| "WORKSPACE_SETUP_REQUIRED"
	| "WORKSPACE_DUPLICATE_REPO_PATH"
	| "WORKSPACE_SCHEMA_INVALID";

/**
 * Typed error class for workspace configuration failures.
 *
 * Thrown during workspace config loading/validation when the config file
 * is present but invalid. A missing config file is not an error when cwd
 * is a git repository (that case falls back to repo mode); the
 * `WORKSPACE_SETUP_REQUIRED` code covers the remaining case of no config
 * file and a cwd that is not a git repository.
 *
 * Follows the established pattern of typed error classes in this module
 * (WorktreeError, ExecutionError, MergeError, StateFileError, ResumeError).
 */
export class WorkspaceConfigError extends Error {
	/** Stable, machine-branchable error code */
	code: WorkspaceConfigErrorCode;
	/** Optional repo ID that triggered the error (for repo-specific validation failures) */
	repoId?: string;
	/** Optional filesystem path related to the error */
	relatedPath?: string;

	/**
	 * @param code - Error code identifying the failed validation rule
	 * @param message - Human-readable description, forwarded to `Error`
	 * @param repoId - Repo ID the failure relates to, if any
	 * @param relatedPath - Filesystem path the failure relates to, if any
	 */
	constructor(
		code: WorkspaceConfigErrorCode,
		message: string,
		repoId?: string,
		relatedPath?: string,
	) {
		super(message);
		this.name = "WorkspaceConfigError";
		this.code = code;
		this.repoId = repoId;
		this.relatedPath = relatedPath;
	}
}
|
|
3646
|
+
|
|
3647
|
+
// ── Pointer Resolution Types ─────────────────────────────────────────
|
|
3648
|
+
|
|
3649
|
+
/**
 * Canonical filename for the workspace pointer file.
 * The file lives at `<workspace-root>/.pi/orchid-pointer.json`.
 *
 * `orchid init` creates it in workspace mode. It points to the config
 * repo and the config path within that repo. It is not committed to
 * git — each user creates it during onboarding.
 */
export const POINTER_FILENAME = "orchid-pointer.json";

/**
 * Resolve the absolute path to the pointer file.
 *
 * @param workspaceRoot - Absolute path to the workspace root
 * @returns `workspaceRoot` joined with `.pi` and `POINTER_FILENAME`
 */
export function pointerFilePath(workspaceRoot: string): string {
	const segments = [workspaceRoot, ".pi", POINTER_FILENAME];
	return join(...segments);
}
|
|
3666
|
+
|
|
3667
|
+
/**
 * Result of resolving the workspace pointer file.
 *
 * Primary contract for downstream consumers (task-runner, orchestrator,
 * merge agent, dashboard). Pointer failures are never fatal: if the
 * pointer cannot be resolved, `used` is false and `configRoot` /
 * `agentRoot` fall back to workspace-root paths.
 *
 * State/sidecar paths are NOT affected by the pointer — they always
 * live at `<workspace-root>/.pi/` regardless of pointer resolution.
 *
 * In repo mode, `resolvePointer()` returns null (the pointer is ignored
 * entirely, even if a file happens to exist).
 */
export interface PointerResolution {
	/**
	 * Whether the pointer was successfully resolved.
	 * - true: the pointer file was found and parsed, and config_repo
	 *   resolved to a known repo in WorkspaceConfig.repos.
	 * - false: the pointer was missing, malformed, or referenced an
	 *   unknown repo. Fallback paths are used instead.
	 */
	used: boolean;

	/**
	 * Resolved config root directory.
	 * - used=true: `<config-repo-path>/<config_path>/`
	 * - used=false: `<workspace-root>/.pi/` (existing fallback)
	 */
	configRoot: string;

	/**
	 * Resolved agent overrides directory.
	 * - used=true: `<config-repo-path>/<config_path>/agents/`
	 * - used=false: `<workspace-root>/.pi/agents/` (existing fallback)
	 */
	agentRoot: string;

	/**
	 * Warning message when pointer resolution fell back.
	 * - used=true: undefined (no warning)
	 * - used=false: human-readable reason string
	 */
	warning?: string;
}
|
|
3712
|
+
|
|
3713
|
+
// ── Workspace Defaults ───────────────────────────────────────────────
|
|
3714
|
+
|
|
3715
|
+
/**
 * Canonical filename for workspace configuration.
 * Relative to the workspace root it resolves to `.pi/orchid-workspace.yaml`.
 */
export const WORKSPACE_CONFIG_FILENAME = "orchid-workspace.yaml";

/**
 * Resolve the absolute path to the workspace config file.
 *
 * @param workspaceRoot - Absolute path to the workspace root
 * @returns `workspaceRoot` joined with `.pi` and `WORKSPACE_CONFIG_FILENAME`
 */
export function workspaceConfigPath(workspaceRoot: string): string {
	const segments = [workspaceRoot, ".pi", WORKSPACE_CONFIG_FILENAME];
	return join(...segments);
}
|
|
3728
|
+
|
|
3729
|
+
/**
 * Build the default ExecutionContext for repo mode.
 *
 * Used when no workspace config file is present. `cwd` serves as both
 * the workspace root and the repo root, which preserves existing
 * monorepo behavior exactly. `workspaceConfig` and `pointer` are null.
 *
 * @param cwd - Current working directory (treated as both workspace and repo root)
 * @param taskRunnerConfig - Loaded task runner config (or defaults)
 * @param orchestratorConfig - Loaded orchestrator config (or defaults)
 * @returns A repo-mode execution context
 */
export function createRepoModeContext(
	cwd: string,
	taskRunnerConfig: TaskRunnerConfig,
	orchestratorConfig: OrchestratorConfig,
): ExecutionContext {
	const repoModeContext: ExecutionContext = {
		workspaceRoot: cwd,
		repoRoot: cwd,
		mode: "repo",
		workspaceConfig: null,
		taskRunnerConfig: taskRunnerConfig,
		orchestratorConfig: orchestratorConfig,
		pointer: null,
	};
	return repoModeContext;
}
|
|
3755
|
+
|
|
3756
|
+
// ── Agent Mailbox Types (TP-089) ─────────────────────────────────────
|
|
3757
|
+
|
|
3758
|
+
/**
|
|
3759
|
+
* Mailbox directory name under .pi/.
|
|
3760
|
+
* @since TP-089
|
|
3761
|
+
*/
|
|
3762
|
+
export const MAILBOX_DIR_NAME = "mailbox";
|
|
3763
|
+
|
|
3764
|
+
/**
|
|
3765
|
+
* Maximum content size in UTF-8 bytes.
|
|
3766
|
+
* Steering messages should be concise directives; larger context should be
|
|
3767
|
+
* written to a separate file and referenced by path.
|
|
3768
|
+
* @since TP-089
|
|
3769
|
+
*/
|
|
3770
|
+
export const MAILBOX_MAX_CONTENT_BYTES = 4096;
|
|
3771
|
+
|
|
3772
|
+
/**
 * Single source of truth for mailbox message type names.
 * Both `MailboxMessageType` and `MAILBOX_MESSAGE_TYPES` are derived from
 * this tuple so the compile-time union and the runtime set cannot drift.
 */
const MAILBOX_MESSAGE_TYPE_VALUES = [
	"steer",
	"query",
	"abort",
	"info",
	"reply",
	"escalate",
] as const;

/**
 * Message types for the agent mailbox system.
 *
 * | Type       | Direction           | Purpose                                     |
 * |------------|---------------------|---------------------------------------------|
 * | `steer`    | supervisor → agent  | Course correction. Agent must follow.       |
 * | `query`    | supervisor → agent  | Request for status/info. Agent replies.     |
 * | `abort`    | supervisor → agent  | Graceful stop. Agent wraps up and exits.    |
 * | `info`     | supervisor → agent  | FYI context. No action required.            |
 * | `reply`    | agent → supervisor  | Response to query or steer acknowledgment.  |
 * | `escalate` | agent → supervisor  | Agent-initiated: blocked or needs guidance. |
 *
 * @since TP-089
 */
export type MailboxMessageType = (typeof MAILBOX_MESSAGE_TYPE_VALUES)[number];

/**
 * Set of valid mailbox message types for runtime validation.
 * @since TP-089
 */
export const MAILBOX_MESSAGE_TYPES: ReadonlySet<string> = new Set<MailboxMessageType>(
	MAILBOX_MESSAGE_TYPE_VALUES,
);
|
|
3800
|
+
|
|
3801
|
+
/**
 * Message format for the file-based agent mailbox.
 *
 * Each message is written as a JSON file in a batch-scoped,
 * session-scoped directory. On every `message_end` event the
 * rpc-wrapper checks the inbox and injects pending messages into the
 * agent's LLM context via pi's `steer` RPC command.
 *
 * @see docs/specifications/orchid/agent-mailbox-steering.md
 * @since TP-089
 */
export interface MailboxMessage {
	/** Unique message ID: `{timestamp}-{5char-hex-nonce}` */
	id: string;
	/** Batch ID — must match current batch for validation */
	batchId: string;
	/** Sender identifier: `"supervisor"` or session name */
	from: string;
	/** Target session name or `"_broadcast"` */
	to: string;
	/** Epoch milliseconds (Date.now()) */
	timestamp: number;
	/** Message type */
	type: MailboxMessageType;
	/** Message body (max 4KB UTF-8 bytes) */
	content: string;
	/** Whether the sender expects a reply (default: false) */
	expectsReply?: boolean;
	/** Reference to a previous message ID for threading (default: null) */
	replyTo?: string | null;
}
|
|
3832
|
+
|
|
3833
|
+
/**
 * Input options for writeMailboxMessage.
 *
 * Callers supply only the fields below; `id`, `batchId`, `to`, and
 * `timestamp` are generated by the utility from its own arguments.
 *
 * @since TP-089
 */
export interface WriteMailboxMessageOpts {
	/** Sender identifier: `"supervisor"` or session name */
	from: string;
	/** Message type */
	type: MailboxMessageType;
	/** Message body (max 4KB UTF-8 bytes) */
	content: string;
	/** Whether the sender expects a reply (default: false) */
	expectsReply?: boolean;
	/** Reference to a previous message ID for threading (default: null) */
	replyTo?: string | null;
}
|
|
3853
|
+
|
|
3854
|
+
// ── Runtime V2 Contracts (TP-102) ────────────────────────────────────
|
|
3855
|
+
//
|
|
3856
|
+
// These types define the foundational contracts for backend-neutral Runtime V2
|
|
3857
|
+
// architecture. They are additive — existing runtime paths continue to work
|
|
3858
|
+
// while Runtime V2 is incrementally adopted.
|
|
3859
|
+
//
|
|
3860
|
+
// Design principles:
|
|
3861
|
+
// 1. Agent identity is a stable runtime ID, not a legacy session name.
|
|
3862
|
+
// 2. Packet-path authority is explicit, never inferred from cwd.
|
|
3863
|
+
// 3. Process ownership uses a registry, not terminal session discovery.
|
|
3864
|
+
// 4. Normalized events flow directly from child to parent.
|
|
3865
|
+
//
|
|
3866
|
+
// See: docs/specifications/framework/orchid-runtime-v2/
|
|
3867
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
3868
|
+
|
|
3869
|
+
/**
|
|
3870
|
+
* Canonical agent roles in the Runtime V2 process model.
|
|
3871
|
+
*
|
|
3872
|
+
* Every spawned agent process has exactly one role. The role determines
|
|
3873
|
+
* the process's responsibilities, tools, and lifecycle semantics.
|
|
3874
|
+
*
|
|
3875
|
+
* @since TP-102
|
|
3876
|
+
*/
|
|
3877
|
+
export type RuntimeAgentRole = "worker" | "reviewer" | "merger" | "lane-runner";
|
|
3878
|
+
|
|
3879
|
+
/**
 * Agent lifecycle states in the process registry.
 *
 * State machine:
 *   spawning → running → wrapping_up → exited
 *                                    → crashed
 *                                    → timed_out
 *                                    → killed
 *
 * @since TP-102
 */
export type RuntimeAgentStatus =
	| "spawning"
	| "running"
	| "wrapping_up"
	| "exited"
	| "crashed"
	| "timed_out"
	| "killed";

/**
 * Set of terminal agent statuses (process is no longer alive).
 * @since TP-102
 */
export const TERMINAL_AGENT_STATUSES: ReadonlySet<RuntimeAgentStatus> = new Set<RuntimeAgentStatus>([
	"exited",
	"crashed",
	"timed_out",
	"killed",
]);
|
|
3906
|
+
|
|
3907
|
+
/**
|
|
3908
|
+
* Stable agent identity for Runtime V2.
|
|
3909
|
+
*
|
|
3910
|
+
* This replaces legacy session names as the canonical identifier for a
|
|
3911
|
+
* spawned agent process. The string format is deliberately compatible
|
|
3912
|
+
* with existing naming conventions (e.g., "orch-henrylach-lane-1-worker")
|
|
3913
|
+
* to minimize churn in supervisor tools, dashboard, and mailbox addressing.
|
|
3914
|
+
*
|
|
3915
|
+
* The key semantic change: this is a **runtime process ID**, not a terminal
|
|
3916
|
+
* session label. Code must not assume terminal-session probes apply to RuntimeAgentId.
|
|
3917
|
+
*
|
|
3918
|
+
* @since TP-102
|
|
3919
|
+
*/
|
|
3920
|
+
export type RuntimeAgentId = string;
|
|
3921
|
+
|
|
3922
|
+
/**
 * Explicit packet-path authority for a task execution.
 *
 * In workspace mode the packet home (where PROMPT.md / STATUS.md / .DONE
 * live) may differ from the execution cwd (the active segment repo
 * worktree). Runtime V2 requires these paths to be resolved explicitly
 * and passed through the execution chain — never inferred from cwd.
 *
 * In repo mode (single repo) all paths point into the same filesystem
 * tree. The contract is unchanged; the values are simply co-located.
 *
 * @since TP-102
 */
export interface PacketPaths {
	/** Absolute path to the task's PROMPT.md */
	promptPath: string;
	/** Absolute path to the task's STATUS.md */
	statusPath: string;
	/** Absolute path to the task's .DONE marker */
	donePath: string;
	/** Absolute path to the task's .reviews/ directory */
	reviewsDir: string;
	/** Absolute path to the task folder containing packet files */
	taskFolder: string;
}
|
|
3947
|
+
|
|
3948
|
+
/**
 * Resolve a PacketPaths object from a task folder path.
 *
 * This is a pure helper — it does not check whether the files exist.
 * Consumers should use it to build authoritative paths from an
 * already-resolved task folder location.
 *
 * Derived paths are built with `join`, matching the other path helpers
 * in this module, so a trailing separator on `taskFolder` does not
 * produce a doubled separator and the platform separator is used
 * consistently. `taskFolder` itself is returned exactly as given.
 *
 * @param taskFolder - Absolute path to the task folder
 * @returns Complete PacketPaths with all derived paths
 *
 * @since TP-102
 */
export function resolvePacketPaths(taskFolder: string): PacketPaths {
	return {
		promptPath: join(taskFolder, "PROMPT.md"),
		statusPath: join(taskFolder, "STATUS.md"),
		donePath: join(taskFolder, ".DONE"),
		reviewsDir: join(taskFolder, ".reviews"),
		taskFolder,
	};
}
|
|
3969
|
+
|
|
3970
|
+
/**
 * A single execution unit in Runtime V2.
 *
 * One unit of work to be executed in one lane: either a whole task
 * (repo mode / single-segment workspace mode) or one segment of a
 * multi-repo task.
 *
 * This is the contract between the engine (which decides what to run)
 * and the lane-runner (which runs it). It carries everything the
 * lane-runner needs, so nothing has to be re-derived from cwd or
 * session state.
 *
 * @since TP-102
 */
export interface ExecutionUnit {
	/** Unique identifier: taskId for whole-task units, `taskId::repoId` for segments */
	id: string;
	/** Parent task identifier */
	taskId: string;
	/** Segment identifier (null for whole-task execution) */
	segmentId: string | null;
	/** Repo ID where execution happens (cwd of the worker) */
	executionRepoId: string;
	/** Repo ID that owns the packet files (may differ in workspace mode) */
	packetHomeRepoId: string;
	/** Absolute path to the execution worktree */
	worktreePath: string;
	/** Authoritative packet file paths */
	packet: PacketPaths;
	/** Full parsed task metadata */
	task: ParsedTask;
}
|
|
4001
|
+
|
|
4002
|
+
/**
 * Per-agent process manifest for the runtime registry.
 *
 * The agent's parent process (lane-runner or engine) writes the manifest
 * before the agent is considered visible. It is updated on status
 * transitions and cleaned up on batch completion.
 *
 * Replaces legacy session discovery as the source of truth for agent
 * liveness, identity, and attribution.
 *
 * File location: `.pi/runtime/{batchId}/agents/{agentId}/manifest.json`
 *
 * @since TP-102
 */
export interface RuntimeAgentManifest {
	/** Batch this agent belongs to */
	batchId: string;
	/** Stable agent identity (e.g., "orch-henrylach-lane-1-worker") */
	agentId: RuntimeAgentId;
	/** Agent role */
	role: RuntimeAgentRole;
	/** Lane number (null for merge agents) */
	laneNumber: number | null;
	/** Current task ID being executed (null before first assignment) */
	taskId: string | null;
	/** Repo ID the agent is operating in */
	repoId: string;
	/** OS process ID of the agent host process */
	pid: number;
	/** OS process ID of the parent (lane-runner or engine) */
	parentPid: number;
	/** Epoch ms when the agent was spawned */
	startedAt: number;
	/** Current lifecycle status */
	status: RuntimeAgentStatus;
	/** Absolute path to the agent's working directory */
	cwd: string;
	/** Authoritative packet paths (null for merge agents or pre-assignment) */
	packet: PacketPaths | null;
}
|
|
4042
|
+
|
|
4043
|
+
/**
 * Batch-level runtime registry snapshot.
 *
 * Holds every active and recently-exited agent for one batch. It is the
 * authoritative source of truth for which agents exist, replacing
 * legacy session discovery.
 *
 * File location: `.pi/runtime/{batchId}/registry.json`
 *
 * @since TP-102
 */
export interface RuntimeRegistry {
	/** Batch ID this registry belongs to */
	batchId: string;
	/** Epoch ms when the registry was last updated */
	updatedAt: number;
	/** All known agents (keyed by agentId for fast lookup in JSON form) */
	agents: Record<RuntimeAgentId, RuntimeAgentManifest>;
}
|
|
4062
|
+
|
|
4063
|
+
/**
 * Lane execution snapshot emitted by the lane-runner.
 *
 * A first-class contract that replaces the current `lane-state-*.json`
 * sidecar. The lane-runner writes it directly (not by tailing sidecar
 * files from a sibling process).
 *
 * File location: `.pi/runtime/{batchId}/lanes/lane-{N}.json`
 *
 * @since TP-102
 */
export interface RuntimeLaneSnapshot {
	/** Batch this lane belongs to */
	batchId: string;
	/** Lane number (1-indexed) */
	laneNumber: number;
	/** Lane identifier (e.g., "lane-1") */
	laneId: string;
	/** Repo ID this lane targets */
	repoId: string;
	/** Current task ID being executed */
	taskId: string | null;
	/** Current segment ID (null for whole-task execution) */
	segmentId: string | null;
	/** Lane execution status */
	status:
		| "idle"
		| "running"
		| "complete"
		| "failed";
	/** Worker agent snapshot (null when no worker is active) */
	worker: RuntimeAgentTelemetrySnapshot | null;
	/** Reviewer agent snapshot (null when no reviewer is active) */
	reviewer: RuntimeAgentTelemetrySnapshot | null;
	/** Task progress derived from STATUS.md */
	progress: RuntimeTaskProgress | null;
	/** Epoch ms when this snapshot was last updated */
	updatedAt: number;
}
|
|
4098
|
+
|
|
4099
|
+
/**
 * Telemetry snapshot for a single agent within a lane.
 *
 * @since TP-102
 */
export interface RuntimeAgentTelemetrySnapshot {
	/** Agent ID */
	agentId: RuntimeAgentId;
	/** Agent lifecycle status */
	status: RuntimeAgentStatus;
	/** Elapsed time in milliseconds */
	elapsedMs: number;
	/** Number of tool calls made */
	toolCalls: number;
	/** Context window utilization percentage (0-100) */
	contextPct: number;
	/** Cumulative cost in USD */
	costUsd: number;
	/** Last tool call description */
	lastTool: string;
	/** Input tokens consumed */
	inputTokens: number;
	/** Output tokens generated */
	outputTokens: number;
	/** Cache read tokens */
	cacheReadTokens: number;
	/** Cache write tokens */
	cacheWriteTokens: number;
}
|
|
4128
|
+
|
|
4129
|
+
/**
 * Progress of a task, obtained by parsing STATUS.md.
 *
 * @since TP-102
 */
export interface RuntimeTaskProgress {
  /** Human-readable label of the step currently in progress. */
  currentStep: string;
  /** Count of checked checkboxes, summed over all steps. */
  checked: number;
  /** Count of all checkboxes, summed over all steps. */
  total: number;
  /** Number of the current worker iteration. */
  iteration: number;
  /** How many reviews have been performed. */
  reviews: number;
}
|
|
4146
|
+
|
|
4147
|
+
/**
 * A normalized event produced by an agent host.
 *
 * This is the canonical shape for Runtime V2 telemetry and conversation
 * events. Agent hosts append these to a per-agent event log and also stream
 * them to their parent process over IPC.
 *
 * File location: `.pi/runtime/{batchId}/agents/{agentId}/events.jsonl`
 *
 * @since TP-102
 */
export interface RuntimeAgentEvent {
  /** Identifier of the batch. */
  batchId: string;
  /** Identifier of the agent that produced the event. */
  agentId: RuntimeAgentId;
  /** Role of the producing agent. */
  role: RuntimeAgentRole;
  /** Lane number; null for merge agents. */
  laneNumber: number | null;
  /** ID of the task being executed when the event was produced (nullable). */
  taskId: string | null;
  /** Identifier of the repo. */
  repoId: string;
  /** Timestamp in epoch milliseconds. */
  ts: number;
  /** Kind of event. */
  type: RuntimeAgentEventType;
  /** Payload whose contents depend on the event type. */
  payload: Record<string, unknown>;
}
|
|
4178
|
+
|
|
4179
|
+
/**
 * The set of normalized event type names used in the Runtime V2 agent
 * event stream, grouped below by concern.
 *
 * @since TP-102
 */
export type RuntimeAgentEventType =
  // Agent lifecycle
  | "agent_started"
  | "agent_exited"
  | "agent_killed"
  | "agent_crashed"
  | "agent_timeout"
  // Conversation traffic
  | "prompt_sent"
  | "assistant_message"
  | "tool_call"
  | "tool_result"
  // Telemetry
  | "usage_delta"
  | "context_usage"
  | "retry_started"
  | "retry_finished"
  | "compaction_started"
  | "compaction_finished"
  // Steering
  | "message_delivered"
  | "reply_sent"
  | "escalation_sent"
  // Review / bridge
  | "review_requested"
  | "review_completed"
  | "review_failed"
  // Exit interception (TP-172)
  | "exit_intercepted";
|
|
4213
|
+
|
|
4214
|
+
// ── Runtime V2 Path Helpers (TP-102) ─────────────────────────────────
|
|
4215
|
+
|
|
4216
|
+
/**
 * Compute the directory that holds all Runtime V2 artifacts of one batch.
 *
 * The result is built by plain string concatenation with `/` separators and
 * carries no trailing slash.
 *
 * @param stateRoot - Root directory containing `.pi/` (workspace root or repo root)
 * @param batchId - Batch identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}`
 *
 * @since TP-102
 */
export function runtimeRoot(stateRoot: string, batchId: string): string {
  const runtimeBase = `${stateRoot}/.pi/runtime`;
  return `${runtimeBase}/${batchId}`;
}
|
|
4228
|
+
|
|
4229
|
+
/**
 * Compute the runtime directory that belongs to one agent of a batch.
 *
 * The result is built by plain string concatenation with `/` separators and
 * carries no trailing slash.
 *
 * @param stateRoot - Root directory containing `.pi/`
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}/agents/{agentId}`
 *
 * @since TP-102
 */
export function runtimeAgentDir(
  stateRoot: string,
  batchId: string,
  agentId: RuntimeAgentId,
): string {
  const batchDir = `${stateRoot}/.pi/runtime/${batchId}`;
  return `${batchDir}/agents/${agentId}`;
}
|
|
4246
|
+
|
|
4247
|
+
/**
 * Compute the path of the manifest file for one agent.
 *
 * The file is named `manifest.json` and sits directly inside the directory
 * returned by `runtimeAgentDir` for the same arguments.
 *
 * @param stateRoot - Root directory containing `.pi/`
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns Path to the agent's `manifest.json`
 *
 * @since TP-102
 */
export function runtimeManifestPath(
  stateRoot: string,
  batchId: string,
  agentId: RuntimeAgentId,
): string {
  const agentDir = runtimeAgentDir(stateRoot, batchId, agentId);
  return `${agentDir}/manifest.json`;
}
|
|
4259
|
+
|
|
4260
|
+
/**
 * Compute the path of the event log for one agent.
 *
 * The file is named `events.jsonl` and sits directly inside the directory
 * returned by `runtimeAgentDir` for the same arguments.
 *
 * @param stateRoot - Root directory containing `.pi/`
 * @param batchId - Batch identifier
 * @param agentId - Runtime agent identifier
 * @returns Path to the agent's `events.jsonl`
 *
 * @since TP-102
 */
export function runtimeAgentEventsPath(
  stateRoot: string,
  batchId: string,
  agentId: RuntimeAgentId,
): string {
  const agentDir = runtimeAgentDir(stateRoot, batchId, agentId);
  return `${agentDir}/events.jsonl`;
}
|
|
4272
|
+
|
|
4273
|
+
/**
 * Compute the path of the snapshot file for one lane.
 *
 * @param stateRoot - Root directory containing `.pi/`
 * @param batchId - Batch identifier
 * @param laneNumber - Lane number, interpolated verbatim into the filename
 * @returns `{stateRoot}/.pi/runtime/{batchId}/lanes/lane-{laneNumber}.json`
 *
 * @since TP-102
 */
export function runtimeLaneSnapshotPath(
  stateRoot: string,
  batchId: string,
  laneNumber: number,
): string {
  const lanesDir = `${stateRoot}/.pi/runtime/${batchId}/lanes`;
  return `${lanesDir}/lane-${laneNumber}.json`;
}
|
|
4285
|
+
|
|
4286
|
+
/**
 * Telemetry snapshot for a merge agent.
 *
 * Written to `.pi/runtime/{batchId}/lanes/merge-w{waveIndex}-{mergeNumber}.json`
 * (the path produced by `runtimeMergeSnapshotPath`) alongside lane snapshots
 * so the dashboard can display live merge-phase telemetry. Follows the same
 * file-backed pattern as {@link RuntimeLaneSnapshot} but is simpler — merge
 * agents have no reviewer, progress tracking, or repoId.
 *
 * @since TP-164
 */
export interface RuntimeMergeSnapshot {
  /** Batch this merge agent belongs to */
  batchId: string;
  /** 1-indexed merge agent number (e.g. 1 for "orch-henry-merge-1") */
  mergeNumber: number;
  /** Stable agent session name (e.g. "orch-henry-merge-1") */
  sessionName: string;
  /** Wave index this merge agent is processing (0-indexed, 0 when unknown) */
  waveIndex: number;
  /** Merge agent lifecycle status */
  status: "running" | "complete" | "failed";
  /** Live telemetry snapshot for the merge agent (null when not yet started) */
  agent: RuntimeAgentTelemetrySnapshot | null;
  /** Epoch ms when this snapshot was last updated */
  updatedAt: number;
}
|
|
4312
|
+
|
|
4313
|
+
/**
 * Resolve the path for a merge agent snapshot file.
 *
 * Snapshots are stored alongside lane snapshots in the `lanes/` directory so
 * the dashboard server's directory scan picks them up automatically.
 *
 * The filename includes BOTH `waveIndex` and `mergeNumber` because lane
 * numbers (and therefore the legacy `mergeNumber`-only filename) repeat
 * across waves — a wave-2 lane-1 merge would overwrite the wave-1 lane-1
 * snapshot before the dashboard's next poll could read it. Per-wave
 * namespacing keeps each merge's snapshot durable until the runtime
 * directory itself is cleaned up at end-of-batch. See #509.
 *
 * @param stateRoot - Repository root (where `.pi/` lives)
 * @param batchId - Current batch identifier
 * @param waveIndex - 0-based wave index for the merge
 * @param mergeNumber - 1-based merge agent number (derived from lane number)
 * @returns `{stateRoot}/.pi/runtime/{batchId}/lanes/merge-w{waveIndex}-{mergeNumber}.json`
 *
 * @since TP-164
 */
export function runtimeMergeSnapshotPath(
  stateRoot: string,
  batchId: string,
  waveIndex: number,
  mergeNumber: number,
): string {
  return `${stateRoot}/.pi/runtime/${batchId}/lanes/merge-w${waveIndex}-${mergeNumber}.json`;
}
|
|
4347
|
+
|
|
4348
|
+
/**
 * Compute the path of the runtime registry file for a batch.
 *
 * @param stateRoot - Root directory containing `.pi/`
 * @param batchId - Batch identifier
 * @returns `{stateRoot}/.pi/runtime/{batchId}/registry.json`
 *
 * @since TP-102
 */
export function runtimeRegistryPath(stateRoot: string, batchId: string): string {
  const batchDir = `${stateRoot}/.pi/runtime/${batchId}`;
  return `${batchDir}/registry.json`;
}
|
|
4356
|
+
|
|
4357
|
+
/**
 * Assemble a canonical RuntimeAgentId from its parts.
 *
 * The IDs stay compatible with the existing naming convention
 * (e.g., "orch-henrylach-lane-1-worker") while being semantically
 * decoupled from legacy session names.
 *
 * Resolution order:
 * 1. role "merger" with a `mergeIndex` → `{prefix}-merge-{mergeIndex}`
 * 2. no lane number                    → `{prefix}-{role}`
 * 3. role "lane-runner" on a lane      → `{prefix}-lane-{laneNumber}`
 * 4. any other role on a lane          → `{prefix}-lane-{laneNumber}-{role}`
 *
 * @param prefix - Operator/batch prefix (e.g., "orch-henrylach")
 * @param laneNumber - Lane number (null for merge agents)
 * @param role - Agent role
 * @param mergeIndex - Merge wave index (only for merge agents)
 * @returns Canonical agent ID string
 *
 * @since TP-102
 */
export function buildRuntimeAgentId(
  prefix: string,
  laneNumber: number | null,
  role: RuntimeAgentRole,
  mergeIndex?: number,
): RuntimeAgentId {
  // Merge agents with an explicit index take precedence over any lane number.
  const isIndexedMerger = role === "merger" && mergeIndex != null;
  if (isIndexedMerger) {
    return `${prefix}-merge-${mergeIndex}`;
  }

  // Without a lane, the role alone distinguishes the agent.
  if (laneNumber == null) {
    return `${prefix}-${role}`;
  }

  // The lane-runner owns the bare lane ID; other roles are suffixed onto it.
  const laneStem = `${prefix}-lane-${laneNumber}`;
  return role === "lane-runner" ? laneStem : `${laneStem}-${role}`;
}
|
|
4389
|
+
|
|
4390
|
+
/**
 * Check a candidate RuntimeAgentManifest for required fields and sane values.
 *
 * A non-object (or null) input short-circuits with a single error. Otherwise
 * every field is checked independently and all problems are reported, in the
 * order: batchId, agentId, role, pid, parentPid, startedAt, status, cwd,
 * repoId.
 *
 * @param manifest - Value to validate (typically parsed JSON)
 * @returns Validation error strings; an empty array means the manifest is valid
 *
 * @since TP-102
 */
export function validateAgentManifest(manifest: unknown): string[] {
  if (typeof manifest !== "object" || !manifest) {
    return ["manifest must be a non-null object"];
  }

  const fields = manifest as Record<string, unknown>;
  const problems: string[] = [];

  const isNonEmptyString = (value: unknown): boolean =>
    typeof value === "string" && value !== "";
  const isPositiveFinite = (value: unknown): boolean =>
    typeof value === "number" && Number.isFinite(value) && value > 0;
  // Reports either a type error or a membership error for an enum-like string field.
  const requireOneOf = (name: string, allowed: readonly string[]): void => {
    const value = fields[name];
    if (typeof value !== "string") {
      problems.push(`${name} must be a string`);
    } else if (!allowed.includes(value)) {
      problems.push(`${name} must be one of: ${allowed.join(", ")}`);
    }
  };

  if (!isNonEmptyString(fields.batchId)) {
    problems.push("batchId must be a non-empty string");
  }
  if (!isNonEmptyString(fields.agentId)) {
    problems.push("agentId must be a non-empty string");
  }
  requireOneOf("role", ["worker", "reviewer", "merger", "lane-runner"]);
  if (!isPositiveFinite(fields.pid)) {
    problems.push("pid must be a positive finite number");
  }
  if (!isPositiveFinite(fields.parentPid)) {
    problems.push("parentPid must be a positive finite number");
  }
  // startedAt only needs to be finite; zero and negative values are accepted.
  if (typeof fields.startedAt !== "number" || !Number.isFinite(fields.startedAt)) {
    problems.push("startedAt must be a finite number");
  }
  requireOneOf("status", [
    "spawning",
    "running",
    "wrapping_up",
    "exited",
    "crashed",
    "timed_out",
    "killed",
  ]);
  if (!isNonEmptyString(fields.cwd)) {
    problems.push("cwd must be a non-empty string");
  }
  // repoId may be the empty string; only its type is enforced.
  if (typeof fields.repoId !== "string") {
    problems.push("repoId must be a string");
  }

  return problems;
}
|
|
4437
|
+
|
|
4438
|
+
/**
 * Check that a candidate PacketPaths object carries every required path.
 *
 * Each of `promptPath`, `statusPath`, `donePath`, `reviewsDir` and
 * `taskFolder` must be a non-empty string. A non-object (or null) input
 * short-circuits with a single error.
 *
 * @param packet - Value to validate
 * @returns Validation error strings; an empty array means the packet is valid
 *
 * @since TP-102
 */
export function validatePacketPaths(packet: unknown): string[] {
  if (typeof packet !== "object" || !packet) {
    return ["packet must be a non-null object"];
  }

  const record = packet as Record<string, unknown>;
  const requiredFields = [
    "promptPath",
    "statusPath",
    "donePath",
    "reviewsDir",
    "taskFolder",
  ] as const;

  // Keep only the fields that fail the check, then turn each into its message.
  return requiredFields
    .filter((field) => {
      const value = record[field];
      return typeof value !== "string" || value === "";
    })
    .map((field) => `${field} must be a non-empty string`);
}
|