@buihongduc132/pi-acp-agents 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,966 @@
1
+ /**
2
+ * DagExecutor — Wave-based parallel execution of DAG steps.
3
+ *
4
+ * This is the orchestration core of the `acp-dag-delegation` change
5
+ * (design.md D2). It owns the topological-sort → wave loop: for each wave,
6
+ * every step whose dependencies are satisfied is dispatched in parallel via
7
+ * the existing {@link AgentCoordinator.delegate()} method (the executor
8
+ * manages the wave loop directly — it does NOT hand dispatch off to
9
+ * `AsyncExecutor`, per task 5.3). Outputs and errors are captured per step
10
+ * and persisted through {@link DagStore} so the run survives pi restart.
11
+ *
12
+ * The executor is wired (design.md "Integration with existing
13
+ * infrastructure"; task 7.1) with the existing infrastructure singletons
14
+ * from `index.ts`:
15
+ *
16
+ * - {@link AgentCoordinator} — one short-lived `delegate()` call per step
17
+ * - {@link AcpCircuitBreaker} — consulted before every dispatch; an open
18
+ * circuit fails the step immediately with
19
+ * `Agent "<name>" is unavailable (circuit breaker open)` (task 5.7)
20
+ * - {@link TemplateResolver} — expands `{<step>.output}` / `{<step>.status}`
21
+ * / `{dag.args.*}` in each step's prompt before dispatch (task 5.3)
22
+ * - {@link DagStore} — the persistence layer for DAG + step state
23
+ *
24
+ * Task 5.1 scope: create the class with a constructor that wires up these
25
+ * dependencies. The execution surface — `topologicalSort()`, `execute()`,
26
+ * wave dispatch, gate evaluation, failFast, circuit-breaker check,
27
+ * completion detection, `cancel()`, resume, stale detection, and retry — is
28
+ * implemented by the subsequent tasks 5.2–5.13.
29
+ */
30
+
31
+ import type { DagStore } from "./dag-store.js";
32
+ import type {
33
+ DagRecord,
34
+ DagStatus,
35
+ DagStepRecord,
36
+ DagStepStatus,
37
+ DagTaskDefinition,
38
+ } from "../config/types.js";
39
+ import type { TemplateResolver } from "./template-resolver.js";
40
+ import type { AgentCoordinator } from "../coordination/coordinator.js";
41
+ import type { AcpCircuitBreaker } from "../core/circuit-breaker.js";
42
+ import type { Logger } from "../logger.js";
43
+ import { createNoopLogger } from "../logger.js";
44
+
45
/**
 * Dependencies handed to the {@link DagExecutor} constructor.
 *
 * `store`, `resolver`, `coordinator` and `circuitBreaker` are required;
 * `asyncExecutor`, `logger` and `eventLog` may be omitted.
 */
export interface DagExecutorOptions {
  /** Persistence layer holding each DAG record and its per-step state. */
  store: DagStore;
  /** Expands template variables in a step prompt before it is dispatched. */
  resolver: TemplateResolver;
  /** Coordinator whose `delegate()` is invoked once per dispatched step. */
  coordinator: AgentCoordinator;
  /** Per-agent circuit breaker checked before every dispatch. */
  circuitBreaker: AcpCircuitBreaker;
  /**
   * Optional async executor. The constructor only stores it on the
   * instance for integration wiring (task 7.1); the wave loop itself is
   * driven by the executor (task 5.3). Left `undefined` when not supplied.
   */
  asyncExecutor?: unknown;
  /** Logger; when omitted the executor falls back to a no-op logger. */
  logger?: Logger;
  /**
   * Optional sink for step lifecycle events (task 7.4). Intended to
   * receive a "dag-step" event per step status transition (running,
   * completed, failed, skipped, cancelled) carrying dagId, stepId, agent,
   * status and durationMs.
   */
  eventLog?: { append(type: string, data: Record<string, unknown>): void };
}
71
+
72
/**
 * Shared fallback logger used when `DagExecutorOptions.logger` is omitted,
 * so a `DagExecutor` can be constructed without supplying one.
 */
const noopLogger: Logger = createNoopLogger();
74
+
75
/**
 * Result of {@link DagExecutor.cancel} (specs/dag-monitoring): a tally of
 * step states as they were persisted at the moment of cancellation.
 */
export interface DagCancelSummary {
  /** Number of steps already `completed` when cancel was requested. */
  completed: number;
  /** Number of steps that were `running` (in-flight) and were aborted. */
  aborted: number;
  /** Number of steps that were still `pending` and were marked `cancelled`. */
  cancelled: number;
}
84
+
85
+ export class DagExecutor {
86
+ /** File-backed DAG + step state persistence. */
87
+ readonly store: DagStore;
88
+ /** Template variable interpolation for step prompts. */
89
+ readonly resolver: TemplateResolver;
90
+ /** Existing agent coordinator used for per-step `delegate()` dispatch. */
91
+ readonly coordinator: AgentCoordinator;
92
+ /** Existing per-agent circuit breaker consulted before each dispatch. */
93
+ readonly circuitBreaker: AcpCircuitBreaker;
94
+ /** Optional async executor wired from `index.ts` (task 7.1). */
95
+ readonly asyncExecutor: unknown;
96
+ /** Logger for step lifecycle / wave / resume events. */
97
+ protected readonly logger: Logger;
98
+ /**
99
+ * Optional event log for recording step lifecycle transitions (task 7.4).
100
+ * Appends "dag-step" events for each step status change with data including
101
+ * dagId, stepId, agent, status, and durationMs.
102
+ */
103
+ protected readonly eventLog?: { append(type: string, data: Record<string, unknown>): void };
104
+ /**
105
+ * In-flight abort controllers keyed by `dagId` → `stepId`. Registered by
106
+ * {@link DagExecutor.dispatchStep} before each dispatch so {@link
107
+ * DagExecutor.cancel} (task 5.9) can abort in-flight agent sessions.
108
+ *
109
+ * This registry is SHARED across all DagExecutor instances (module-level
110
+ * singleton, see {@link SHARED_ABORT_CONTROLLERS}). In-flight agent
111
+ * sessions exist independent of which executor instance dispatched them
112
+ * or processes the cancel — `index.ts` constructs a fresh DagExecutor per
113
+ * tool call (task 7.1 wiring), so a per-instance map would leave
114
+ * `acp_dag_cancel` unable to abort sessions dispatched by the
115
+ * `acp_dag_submit` executor. Sharing the registry keeps cancellation
116
+ * working end-to-end (specs/dag-monitoring "DAG cancellation").
117
+ */
118
+ protected readonly abortControllers = SHARED_ABORT_CONTROLLERS;
119
+
120
/**
 * Partition DAG tasks into ordered execution waves (design.md D2 / task 5.2).
 *
 * Uses longest-path layering: a task with no dependencies sits in wave 0,
 * and any other task sits one wave after the deepest wave among its
 * dependencies. Within a wave, ids keep the order in which the tasks were
 * declared in `tasks`, so the result is deterministic.
 *
 * The input array and its task objects are never mutated; each dependency
 * list is copied before use.
 *
 * @param tasks Task definitions, expected to be already validated (no
 *   cycles, no references to unknown ids).
 * @returns Waves in execution order, each an array of step ids. An empty
 *   input produces an empty array.
 * @throws Error when some task can never be placed because a dependency
 *   is never assigned a wave (a cycle or a dangling reference).
 */
topologicalSort(tasks: readonly DagTaskDefinition[]): string[][] {
  if (tasks.length === 0) return [];

  // Step id → private copy of its dependency list.
  const dependencies = new Map<string, string[]>(
    tasks.map((task): [string, string[]] => [
      task.id,
      [...(task.dependsOn ?? [])],
    ]),
  );

  const layerOf = new Map<string, number>();
  let deepestLayer = 0;
  let unplaced = [...dependencies.keys()];
  let advanced = true;

  // Repeatedly sweep the unplaced ids, placing every task whose
  // dependencies all have a layer already. Stop when everything is placed
  // or a full sweep places nothing.
  while (unplaced.length > 0 && advanced) {
    advanced = false;
    const stillBlocked: string[] = [];
    for (const id of unplaced) {
      const deps = dependencies.get(id) ?? [];
      if (deps.some((dep) => !layerOf.has(dep))) {
        stillBlocked.push(id);
        continue;
      }
      let layer = 0;
      for (const dep of deps) {
        layer = Math.max(layer, (layerOf.get(dep) ?? 0) + 1);
      }
      layerOf.set(id, layer);
      deepestLayer = Math.max(deepestLayer, layer);
      advanced = true;
    }
    unplaced = stillBlocked;
  }

  if (unplaced.length > 0) {
    // Not expected for a validated DAG; reported rather than looping forever.
    throw new Error(
      `DagExecutor.topologicalSort: unresolved dependencies for steps: ${unplaced.join(", ")}`,
    );
  }

  // Emit ids in declaration order inside each wave.
  const waves: string[][] = [];
  for (let i = 0; i <= deepestLayer; i += 1) waves.push([]);
  for (const task of tasks) {
    const layer = layerOf.get(task.id);
    if (layer !== undefined) waves[layer].push(task.id);
  }
  return waves;
}
183
+
184
/**
 * Store the supplied dependencies on the instance. A missing `logger`
 * is replaced with the module-level no-op logger; `asyncExecutor` and
 * `eventLog` stay `undefined` when not provided.
 */
constructor(options: DagExecutorOptions) {
  const {
    store,
    resolver,
    coordinator,
    circuitBreaker,
    asyncExecutor,
    logger,
    eventLog,
  } = options;

  this.store = store;
  this.resolver = resolver;
  this.coordinator = coordinator;
  this.circuitBreaker = circuitBreaker;
  this.asyncExecutor = asyncExecutor;
  this.logger = logger ?? noopLogger;
  this.eventLog = eventLog;
}
193
+
194
/**
 * Run a persisted DAG wave by wave until it finishes or is cancelled
 * (task 5.3).
 *
 * Sequence:
 *  1. Load the DAG from the store (throws if absent).
 *  2. Copy every persisted step record into an in-memory snapshot.
 *  3. Compute the waves with {@link DagExecutor.topologicalSort}, then
 *     mark the DAG `running`.
 *  4. For each wave in order, await {@link DagExecutor.runWave}, which
 *     updates the snapshot. After each wave, re-read the DAG from the
 *     store and stop immediately if it is `cancelled`, so the status set
 *     by `cancel()` is not overwritten.
 *  5. After the final wave, ask {@link DagExecutor.detectCompletion} for
 *     the DAG-level outcome and persist it when it is not `null`.
 *
 * @param dagId DAG to execute.
 * @param options Optional flags, forwarded unchanged to `runWave`.
 *   `skipTerminal` (task 5.10) leaves already-terminal steps untouched;
 *   {@link DagExecutor.resume} uses it.
 * @throws Error when no DAG with `dagId` exists in the store.
 */
async execute(
  dagId: string,
  options?: { skipTerminal?: boolean },
): Promise<void> {
  const dag = this.store.get(dagId);
  if (!dag) {
    throw new Error(`DAG "${dagId}" not found`);
  }

  // Working copy of the step states; runWave mutates this, not `dag.steps`.
  const snapshot: Record<string, DagStepRecord> = {};
  for (const [stepId, persisted] of Object.entries(dag.steps)) {
    snapshot[stepId] = { ...persisted };
  }

  // Sort first: if it throws, the DAG status is left as it was.
  const waves = this.topologicalSort(dag.tasks);
  this.store.updateDagStatus(dagId, "running");

  for (const waveStepIds of waves) {
    await this.runWave(dagId, dag, snapshot, waveStepIds, options);

    // A cancel that landed during this wave owns the final status.
    const latest = this.store.get(dagId);
    if (latest?.status === "cancelled") return;
  }

  // Task 5.8: only persist an outcome once every step is terminal.
  const outcome = this.detectCompletion(snapshot);
  if (outcome === null) return;
  this.store.updateDagStatus(dagId, outcome);
}
252
+
253
/**
 * Resume an interrupted DAG (task 5.10, specs/dag-resume).
 *
 * Any step persisted as `running` was interrupted with an unknown
 * outcome, so it is first reset to `pending` with `startedAt` cleared.
 * The wave loop is then re-run through {@link DagExecutor.execute} with
 * `skipTerminal: true`, so steps that already reached a terminal state
 * are not dispatched again.
 *
 * @param dagId DAG to resume.
 * @throws Error when no DAG with `dagId` exists in the store.
 */
async resume(dagId: string): Promise<void> {
  const dag = this.store.get(dagId);
  if (!dag) {
    throw new Error(`DAG "${dagId}" not found`);
  }

  // Ids of steps that were mid-flight when the process stopped.
  const interruptedIds = Object.entries(dag.steps)
    .filter(([, step]) => step.status === "running")
    .map(([stepId]) => stepId);

  for (const stepId of interruptedIds) {
    this.store.updateStep(dagId, stepId, (current) => ({
      ...current,
      status: "pending",
      startedAt: undefined,
    }));
  }

  // execute() re-reads the record from the store, so it sees the resets.
  await this.execute(dagId, { skipTerminal: true });
}
300
+
301
/**
 * Resume every DAG the store reports as `running` (task 5.10, task 7.3).
 *
 * DAGs are resumed one after another; each {@link DagExecutor.resume}
 * call is awaited before the next starts. A failure while resuming one
 * DAG is logged through the logger and does not stop the pass.
 *
 * @returns The ids of all DAGs for which a resume was attempted,
 *   including those whose resume failed.
 */
async resumeAll(): Promise<string[]> {
  const attempted: string[] = [];

  for (const { dagId } of this.store.findRunning()) {
    try {
      await this.resume(dagId);
    } catch (err) {
      // Isolate the failure so the remaining DAGs still get their turn.
      const detail = err instanceof Error ? err.message : String(err);
      this.logger.error(
        `DagExecutor.resumeAll: failed to resume DAG "${dagId}": ${detail}`,
      );
    }
    attempted.push(dagId);
  }

  return attempted;
}
331
+
332
/**
 * Derive the DAG-level outcome from a set of step records (task 5.8).
 *
 * - `null` if any step is not yet terminal (as judged by `isTerminal`).
 * - `"failed"` if all steps are terminal and at least one is `failed`,
 *   `skipped` or `cancelled`.
 * - `"completed"` otherwise, which includes the case of no steps at all.
 *
 * Only the supplied map is read; the store is neither consulted nor
 * modified.
 *
 * @param steps Step records keyed by step id.
 * @returns The terminal DAG status, or `null` while work remains.
 */
detectCompletion(
  steps: Record<string, DagStepRecord>,
): DagStatus | null {
  let sawNonSuccess = false;

  for (const { status } of Object.values(steps)) {
    // One unfinished step is enough to say the DAG is not done.
    if (!isTerminal(status)) return null;

    switch (status) {
      case "failed":
      case "skipped":
      case "cancelled":
        sawNonSuccess = true;
        break;
      default:
        break;
    }
  }

  return sawNonSuccess ? "failed" : "completed";
}
363
+
364
/**
 * Dispatch every step of one wave in parallel, wait for all of them to
 * settle, and copy the resulting step records into `steps`.
 *
 * Per step, in order:
 *  1. Unknown step id → ignored.
 *  2. `options.skipTerminal` set and the step already terminal → left
 *     untouched (resume path, task 5.10).
 *  3. Gate check via {@link DagExecutor.gateAllowsDispatch} (task 5.5),
 *     using the DAG's `failFast` option (task 5.6; treated as `true`
 *     unless explicitly `false`). A blocked step is marked `skipped`.
 *  4. The prompt is expanded by the resolver. If the resolver reports
 *     leftover template variables, the step is persisted as `failed`
 *     without dispatching.
 *  5. Otherwise the step goes to {@link DagExecutor.dispatchStep} with the
 *     DAG's `maxRetries` option (default 0).
 *
 * A dispatch whose promise rejects (for example because the store or the
 * resolver threw) is logged and the step is marked `failed`. Without
 * this the step would stay non-terminal in the snapshot and the DAG
 * would never leave `running`.
 *
 * @param dagId DAG being executed.
 * @param record Persisted DAG record; read for `args` and `options`.
 * @param steps Working snapshot of step records; mutated in place.
 * @param waveStepIds Ids of the steps that make up this wave.
 * @param options Optional flags; only `skipTerminal` is consulted.
 */
private async runWave(
  dagId: string,
  record: DagRecord,
  steps: Record<string, DagStepRecord>,
  waveStepIds: string[],
  options?: { skipTerminal?: boolean },
): Promise<void> {
  // Template context is built once, before anything in this wave runs,
  // so every parallel dispatch resolves against the same inputs.
  const outputs = collectOutputs(steps);
  const statuses = collectStatuses(steps);
  const dagArgs = record.args ?? {};

  // DAG-level options do not vary per step; read them once.
  const failFast = record.options?.failFast !== false;
  const maxRetries = record.options?.maxRetries ?? 0;

  const dispatches = waveStepIds.map(async (stepId) => {
    const step = steps[stepId];
    if (!step) return undefined;

    if (options?.skipTerminal && isTerminal(step.status)) {
      return undefined;
    }

    if (!this.gateAllowsDispatch(step, steps, failFast)) {
      return this.skipStep(dagId, step);
    }

    const resolvedPrompt = this.resolver.resolve(
      step.prompt,
      outputs,
      statuses,
      dagArgs,
    );

    if (this.resolver.hasUnresolvedTemplates(resolvedPrompt)) {
      // List the leftover `{...}` tokens. Fall back to a placeholder so
      // the message never contains the literal text "undefined".
      const leftovers =
        resolvedPrompt.match(/\{[^}]+\}/g)?.join(", ") ?? "<unknown>";
      const patch = {
        status: "failed" as const,
        output: null,
        error: `Unresolved template variable in prompt: ${leftovers}`,
        completedAt: new Date().toISOString(),
      };
      const failedStep = this.store.updateStep(dagId, step.id, (s) => ({
        ...s,
        ...patch,
      }));
      this.logStepEvent(dagId, step, "failed", 0);
      return failedStep ?? { ...step, ...patch };
    }

    return this.dispatchStep(dagId, step, resolvedPrompt, { maxRetries });
  });

  const settled = await Promise.allSettled(dispatches);

  settled.forEach((result, index) => {
    const stepId = waveStepIds[index];

    if (result.status === "fulfilled") {
      // `undefined` means "nothing changed" (unknown or skipped-terminal).
      if (result.value) steps[stepId] = result.value;
      return;
    }

    // The dispatch itself blew up. Surface it instead of dropping it.
    const reason =
      result.reason instanceof Error
        ? result.reason.message
        : String(result.reason);
    this.logger.error(
      `DagExecutor.runWave: step "${stepId}" of DAG "${dagId}" rejected: ${reason}`,
    );

    const current = steps[stepId];
    if (!current || isTerminal(current.status)) return;

    const patch = {
      status: "failed" as const,
      output: null,
      error: reason,
      completedAt: new Date().toISOString(),
    };
    let persisted: DagStepRecord | undefined;
    try {
      persisted =
        this.store.updateStep(dagId, stepId, (s) => ({ ...s, ...patch })) ??
        undefined;
    } catch (persistErr) {
      // The store may be what failed in the first place; the in-memory
      // snapshot is still updated so the DAG can reach a terminal status.
      const detail =
        persistErr instanceof Error ? persistErr.message : String(persistErr);
      this.logger.error(
        `DagExecutor.runWave: could not persist failure of step "${stepId}" of DAG "${dagId}": ${detail}`,
      );
    }
    steps[stepId] = persisted ?? { ...current, ...patch };
  });
}
453
+
454
/**
 * Decide whether a step's gate permits dispatch (task 5.5, design.md D4).
 *
 * - No dependencies: always allowed.
 * - `after` gate: allowed once every dependency is terminal, whatever
 *   its outcome.
 * - Any other gate (the default, `needs`): allowed only when every
 *   dependency is `completed`. If `failFast` is `false`, it behaves like
 *   `after` instead (task 5.6, design.md D5).
 *
 * A dependency id missing from `steps` has no status. It is then judged
 * by `isTerminal(undefined)` on the terminal path and never counts as
 * `completed` on the strict path.
 *
 * @param step Step whose gate is evaluated.
 * @param steps Current step records keyed by id.
 * @param failFast DAG-level failFast flag; defaults to `true`.
 * @returns `true` when the step may be dispatched.
 */
gateAllowsDispatch(
  step: DagStepRecord,
  steps: Record<string, DagStepRecord>,
  failFast = true,
): boolean {
  const dependencyIds = step.dependsOn ?? [];
  if (dependencyIds.length === 0) return true;

  // Both `after` and failFast=false only care that dependencies finished.
  const anyOutcomeAccepted = step.gate === "after" || !failFast;

  for (const depId of dependencyIds) {
    const depStatus = steps[depId]?.status;
    const satisfied = anyOutcomeAccepted
      ? isTerminal(depStatus)
      : depStatus === "completed";
    if (!satisfied) return false;
  }
  return true;
}
490
+
491
/**
 * Persist a step as `skipped` without dispatching it, and log a
 * `skipped` step event with a duration of 0.
 *
 * @param dagId DAG the step belongs to.
 * @param step Step to skip.
 * @returns The record returned by the store, or, if the store returns
 *   nothing, a local copy of `step` with the same changes applied.
 */
private skipStep(dagId: string, step: DagStepRecord): DagStepRecord {
  const patch = {
    status: "skipped" as const,
    output: null,
    completedAt: new Date().toISOString(),
  };

  const persisted = this.store.updateStep(dagId, step.id, (current) => ({
    ...current,
    ...patch,
  }));
  this.logStepEvent(dagId, step, "skipped", 0);

  return persisted ?? { ...step, ...patch };
}
515
+
516
/**
 * Dispatch one step through {@link AgentCoordinator.delegate} and persist
 * its terminal state.
 *
 * Flow:
 *  1. Circuit breaker (task 5.7, design.md R3): if the agent is reported
 *     unhealthy, the step is persisted as `failed` with
 *     `Agent "<name>" is unavailable (circuit breaker open)` and
 *     `delegate` is not called.
 *  2. The step is persisted as `running`, and an abort controller is
 *     registered so `cancel(dagId)` (task 5.9) can abort the session.
 *  3. Success → `completed` with the result text as output.
 *  4. Failure:
 *     - If the error is an abort error, or this dispatch's own signal has
 *       been aborted, the step becomes `cancelled`. Checking the signal
 *       as well as the error shape stops a cancelled DAG from being
 *       retried when the abort surfaces as some other error.
 *     - Otherwise, while `retryCount` is below `maxRetries` (task 5.12),
 *       the retry is recorded and the same resolved prompt is dispatched
 *       again.
 *     - Otherwise the step becomes `failed` with the error message.
 *  5. The abort controller entry is always released.
 *
 * An exception thrown while persisting the `completed` state is handled
 * by the same failure path as a failed `delegate` call.
 *
 * @param dagId DAG the step belongs to.
 * @param step Step record to dispatch.
 * @param resolvedPrompt Prompt with template variables already expanded.
 * @param retryOptions `maxRetries` budget for this step; defaults to 0.
 * @returns The step record in its terminal state.
 */
private async dispatchStep(
  dagId: string,
  step: DagStepRecord,
  resolvedPrompt: string,
  retryOptions?: { maxRetries?: number },
): Promise<DagStepRecord> {
  // Persist a terminal transition, log it, and return the resulting
  // record. `startedAt` is omitted when the step never started, in which
  // case the duration is 0.
  const settle = (
    status: "completed" | "failed" | "cancelled",
    output: DagStepRecord["output"],
    error: DagStepRecord["error"],
    startedAt?: string,
  ): DagStepRecord => {
    const completedAt = new Date().toISOString();
    const durationMs =
      startedAt === undefined
        ? 0
        : Date.parse(completedAt) - Date.parse(startedAt);
    const patch = { status, output, error, completedAt, durationMs };
    const updated = this.store.updateStep(dagId, step.id, (s) => ({
      ...s,
      ...patch,
    }));
    this.logStepEvent(dagId, step, status, durationMs);
    return updated ?? { ...step, ...patch };
  };

  // Do not send work to an agent the circuit breaker reports unhealthy.
  if (!this.circuitBreaker.isHealthy(step.agent)) {
    return settle(
      "failed",
      null,
      `Agent "${step.agent}" is unavailable (circuit breaker open)`,
    );
  }

  const startedAt = new Date().toISOString();
  this.store.updateStep(dagId, step.id, (s) => ({
    ...s,
    status: "running",
    startedAt,
  }));
  this.logStepEvent(dagId, step, "running");

  // Registered so cancel(dagId) can abort this in-flight session.
  const controller = this.registerAbortController(dagId, step.id);
  const signal = controller.signal;

  try {
    const result = await this.coordinator.delegate(
      step.agent,
      resolvedPrompt,
      undefined,
      undefined,
      signal,
    );
    return settle("completed", result.text, undefined, startedAt);
  } catch (err) {
    // Release before a possible retry registers a fresh controller for
    // the same dagId/stepId.
    this.unregisterAbortController(dagId, step.id);

    // Cancellation wins over failure, and must never be retried.
    if (isAbortError(err) || signal.aborted) {
      return settle("cancelled", null, undefined, startedAt);
    }

    const message = err instanceof Error ? err.message : String(err);

    const maxRetries = retryOptions?.maxRetries ?? 0;
    const currentRetries = step.retryCount ?? 0;
    if (maxRetries > 0 && currentRetries < maxRetries) {
      const retriedStep = this.recordRetry(dagId, step, currentRetries);
      return await this.dispatchStep(dagId, retriedStep, resolvedPrompt, {
        maxRetries,
      });
    }

    return settle("failed", null, message, startedAt);
  } finally {
    // Always drop the controller entry once this dispatch has settled.
    this.unregisterAbortController(dagId, step.id);
  }
}
681
+
682
/**
 * Persist the start of a retry attempt for a step (task 5.12).
 *
 * Sets the step back to `running` with `retryCount` at
 * `currentRetries + 1` and a fresh `startedAt`. It also clears `error`,
 * `completedAt` and `durationMs`, and sets `output` to `null`.
 *
 * @param dagId DAG the step belongs to.
 * @param step Step being retried.
 * @param currentRetries Retries already used before this attempt.
 * @returns The record returned by the store, or, if the store returns
 *   nothing, a local copy of `step` with the same changes applied.
 */
protected recordRetry(
  dagId: string,
  step: DagStepRecord,
  currentRetries: number,
): DagStepRecord {
  const patch = {
    status: "running" as const,
    retryCount: currentRetries + 1,
    startedAt: new Date().toISOString(),
    error: undefined,
    output: null,
    completedAt: undefined,
    durationMs: undefined,
  };

  const persisted = this.store.updateStep(dagId, step.id, (current) => ({
    ...current,
    ...patch,
  }));

  return persisted ?? { ...step, ...patch };
}
718
+
719
/**
 * Cancel a DAG (task 5.9, specs/dag-monitoring "DAG cancellation").
 *
 * Steps, in order:
 *  1. Load the DAG; refuse if it is missing or already `completed`,
 *     `failed` or `cancelled`.
 *  2. Tally the persisted step states: `completed` steps, `running` steps
 *     (reported as `aborted`) and `pending` steps (reported as
 *     `cancelled`). Other states are not counted.
 *  3. Abort the in-flight sessions registered for this DAG.
 *  4. Persist every `pending` or `running` step as `cancelled`, all with
 *     the same `completedAt`, and log a `cancelled` event for each.
 *  5. Mark the DAG `cancelled`.
 *
 * The tally reflects the record as loaded in step 1. An in-flight step
 * that settles afterwards may still persist its own outcome
 * (best-effort, per specs/dag-monitoring).
 *
 * @param dagId DAG to cancel.
 * @returns Counts of completed, aborted and cancelled steps.
 * @throws Error when the DAG does not exist or is already terminal.
 */
async cancel(dagId: string): Promise<DagCancelSummary> {
  const dag = this.store.get(dagId);
  if (!dag) {
    throw new Error(`DAG "${dagId}" not found`);
  }

  switch (dag.status) {
    case "completed":
    case "failed":
    case "cancelled":
      throw new Error(
        `DAG "${dagId}" is already ${dag.status} and cannot be cancelled`,
      );
    default:
      break;
  }

  // Count before touching anything so the numbers describe cancel time.
  const summary: DagCancelSummary = { completed: 0, aborted: 0, cancelled: 0 };
  for (const { status } of Object.values(dag.steps)) {
    if (status === "completed") summary.completed += 1;
    else if (status === "running") summary.aborted += 1;
    else if (status === "pending") summary.cancelled += 1;
  }

  // Best-effort abort of whatever is currently in flight for this DAG.
  this.abortInFlight(dagId);

  const completedAt = new Date().toISOString();
  for (const [stepId, step] of Object.entries(dag.steps)) {
    if (step.status !== "pending" && step.status !== "running") continue;

    this.store.updateStep(dagId, stepId, (current) => ({
      ...current,
      status: "cancelled",
      output: null,
      error: undefined,
      completedAt,
    }));
    this.logStepEvent(dagId, step, "cancelled", 0);
  }

  this.store.updateDagStatus(dagId, "cancelled");

  return summary;
}
787
+
788
/**
 * Stale DAG detection (task 5.11, specs/dag-resume "Stale DAG cleanup").
 *
 * Asks the store for the currently running DAGs and transitions to `stale`
 * every one whose `updatedAt` timestamp is strictly older than
 * `Date.now() - timeoutMs`. Each transition goes through
 * `store.updateDagStatus()` and is reported via `logger.error`.
 *
 * Only records returned by `store.findRunning()` are considered, so DAGs in
 * any other state are never touched by this method. Records whose
 * `updatedAt` cannot be parsed as a date are skipped.
 *
 * @param timeoutMs Stale threshold in milliseconds. There is no default at
 *   this level; callers are expected to pass the configured value
 *   (presumably `dagStaleTimeoutMs` — confirm at the call site). A `NaN`
 *   value is rejected: it is logged and nothing is marked.
 * @returns The IDs of the DAGs newly marked `stale` by this call; empty
 *   when none crossed the threshold.
 */
markStale(timeoutMs: number): string[] {
  // Guard against a NaN threshold (e.g. an unparsed config value). With a
  // NaN cutoff the `updatedAtMs >= cutoff` test below is always false, which
  // would mark EVERY running DAG as stale.
  if (Number.isNaN(timeoutMs)) {
    this.logger.error(
      `DagExecutor.markStale: ignoring invalid timeoutMs (${timeoutMs})`,
    );
    return [];
  }

  // The original author notes that `findRunning()` reads the per-DAG
  // records (the source of truth for `updatedAt`) rather than the index
  // summary, so an out-of-sync index cannot mask a stale DAG.
  const running = this.store.findRunning();
  const cutoff = Date.now() - timeoutMs;
  const marked: string[] = [];

  for (const record of running) {
    const updatedAtMs = Date.parse(record.updatedAt);
    // Unparsable timestamp: cannot decide staleness, leave the DAG alone.
    if (Number.isNaN(updatedAtMs)) continue;
    // Touched within the timeout window: still considered live.
    if (updatedAtMs >= cutoff) continue;

    // Transition to `stale` via the store so the index reflects it.
    this.store.updateDagStatus(record.dagId, "stale");
    marked.push(record.dagId);
    this.logger.error(
      `DagExecutor.markStale: DAG "${record.dagId}" marked stale (no transitions for >${timeoutMs}ms)`,
    );
  }

  return marked;
}
831
+
832
/**
 * Append one `dag-step` lifecycle event to the wired event log (task 7.4,
 * specs/dag-monitoring "Event logging for DAG steps").
 *
 * The event payload always carries `dagId`, `stepId`, `agent`, `status` and
 * an ISO-8601 `timestamp` taken at call time; `durationMs` is added only
 * when the caller supplies a number.
 *
 * Does nothing when no event log is wired on this executor.
 *
 * @param dagId      DAG the step belongs to.
 * @param step       Step identity (`id` and `agent`) to record.
 * @param status     Status the step has transitioned to.
 * @param durationMs Optional duration to attach to the event.
 */
protected logStepEvent(
  dagId: string,
  step: { id: string; agent: string },
  status: DagStepStatus,
  durationMs?: number,
): void {
  const sink = this.eventLog;
  if (!sink) return;

  const payload: Record<string, unknown> = {
    dagId,
    stepId: step.id,
    agent: step.agent,
    status,
    timestamp: new Date().toISOString(),
    // Only present when a numeric duration was actually provided.
    ...(typeof durationMs === "number" ? { durationMs } : {}),
  };
  sink.append("dag-step", payload);
}
861
+
862
/**
 * Create a fresh {@link AbortController} for a step dispatch and store it
 * in the abort-controller registry under `dagId` → `stepId`, replacing any
 * controller previously stored for the same pair.
 *
 * @param dagId  DAG the dispatch belongs to.
 * @param stepId Step being dispatched.
 * @returns The newly created controller, so the caller can pass its
 *   `signal` on to the dispatch.
 */
protected registerAbortController(
  dagId: string,
  stepId: string,
): AbortController {
  const controller = new AbortController();
  const forDag = this.abortControllers.get(dagId);
  if (forDag) {
    forDag.set(stepId, controller);
  } else {
    // First registered step for this DAG: create its per-step map.
    this.abortControllers.set(dagId, new Map([[stepId, controller]]));
  }
  return controller;
}
880
+
881
/**
 * Drop the registry entry for `dagId` → `stepId`. When that was the last
 * entry for the DAG, the DAG's (now empty) per-step map is removed too.
 * Unknown DAG IDs are ignored.
 */
protected unregisterAbortController(dagId: string, stepId: string): void {
  const forDag = this.abortControllers.get(dagId);
  if (forDag === undefined) return;

  forDag.delete(stepId);
  if (forDag.size > 0) return;

  // Nothing left for this DAG: do not keep an empty map around.
  this.abortControllers.delete(dagId);
}
888
+
889
/**
 * Call `abort()` on every controller currently registered for `dagId`
 * (task 5.9). Best-effort: an exception thrown by one `abort()` call is
 * swallowed so the remaining controllers are still signalled. Entries are
 * not removed from the registry here.
 *
 * Does nothing when no controller is registered for the DAG.
 */
protected abortInFlight(dagId: string): void {
  this.abortControllers.get(dagId)?.forEach((controller) => {
    try {
      controller.abort();
    } catch {
      /* best-effort — a controller that already aborted is a no-op */
    }
  });
}
907
+ }
908
+
909
/**
 * Module-level registry of abort controllers, organised as
 * `dagId` → (`stepId` → controller).
 *
 * It lives at module scope (rather than on an instance) so that it can be
 * shared by every DagExecutor created in this module — the stated intent
 * being that an executor built for `acp_dag_cancel` can abort sessions
 * dispatched by another executor built for `acp_dag_submit` (task 7.1 wires
 * a fresh DagExecutor per tool call). The two-level key keeps concurrent
 * DAGs from colliding.
 */
const SHARED_ABORT_CONTROLLERS: Map<string, Map<string, AbortController>> = new Map();
918
+
919
/**
 * Build an `{id → text}` snapshot from a set of step records.
 *
 * A step contributes its `output` whenever that output is a string. A step
 * without a string output still contributes when it is `failed` and has a
 * non-empty `error`: the error text stands in for the output, so that
 * `{<failed-step>.output}` can surface it to `after`-gate /
 * failFast=false downstream steps. All other steps are omitted.
 *
 * Kept as a module function so the wave loop reads from a plain snapshot.
 */
function collectOutputs(steps: Record<string, DagStepRecord>): Record<string, string> {
  const snapshot: Record<string, string> = {};
  for (const stepId of Object.keys(steps)) {
    const record = steps[stepId];
    if (typeof record.output === "string") {
      snapshot[stepId] = record.output;
      continue;
    }
    if (record.status === "failed" && record.error) {
      snapshot[stepId] = record.error;
    }
  }
  return snapshot;
}
936
+
937
/**
 * Build an `{id → status}` snapshot covering every step in `steps`,
 * whatever its current status.
 */
function collectStatuses(steps: Record<string, DagStepRecord>): Record<string, string> {
  const snapshot: Record<string, string> = {};
  for (const stepId of Object.keys(steps)) {
    snapshot[stepId] = steps[stepId].status;
  }
  return snapshot;
}
945
+
946
/**
 * Tell whether a step status is one of the four terminal statuses
 * (`completed`, `failed`, `skipped`, `cancelled`). Any other value,
 * including `undefined`, is reported as non-terminal.
 */
function isTerminal(status: DagStepStatus | undefined): boolean {
  switch (status) {
    case "completed":
    case "failed":
    case "skipped":
    case "cancelled":
      return true;
    default:
      return false;
  }
}
955
+
956
/**
 * Tell whether `err` is an object whose `name` property is exactly
 * `"AbortError"`. Non-objects (including `null`, `undefined`, strings and
 * functions) are never treated as abort errors.
 *
 * Per the original author, this matches the `DOMException` the coordinator
 * raises when an in-flight agent session is aborted, and lets
 * {@link DagExecutor.dispatchStep} map an aborted dispatch to a `cancelled`
 * terminal status (task 5.9).
 */
function isAbortError(err: unknown): boolean {
  const isObject = typeof err === "object" && err !== null;
  return isObject && (err as { name?: unknown }).name === "AbortError";
}