@buihongduc132/pi-acp-agents 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +311 -211
- package/index.ts +309 -41
- package/package.json +1 -1
- package/src/acp-widget.ts +197 -0
- package/src/config/config.ts +9 -0
- package/src/config/types.ts +96 -0
- package/src/dag/dag-executor.ts +966 -0
- package/src/dag/dag-store.ts +408 -0
- package/src/dag/dag-validator.ts +202 -0
- package/src/dag/template-resolver.ts +174 -0
- package/src/management/governance-store.ts +10 -3
- package/src/management/legacy-migration.ts +79 -0
- package/src/management/mailbox-manager.ts +10 -3
- package/src/management/runtime-paths.ts +18 -7
- package/src/management/session-archive-store.ts +1 -1
- package/src/management/session-store-factory.ts +58 -0
- package/src/management/task-store.ts +10 -3
- package/src/management/worker-store.ts +10 -3
- package/src/settings/config.ts +3 -0
|
@@ -0,0 +1,966 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DagExecutor — Wave-based parallel execution of DAG steps.
|
|
3
|
+
*
|
|
4
|
+
* This is the orchestration core of the `acp-dag-delegation` change
|
|
5
|
+
* (design.md D2). It owns the topological-sort → wave loop: for each wave,
|
|
6
|
+
* every step whose dependencies are satisfied is dispatched in parallel via
|
|
7
|
+
* the existing {@link AgentCoordinator.delegate()} method (the executor
|
|
8
|
+
* manages the wave loop directly — it does NOT hand dispatch off to
|
|
9
|
+
* `AsyncExecutor`, per task 5.3). Outputs and errors are captured per step
|
|
10
|
+
* and persisted through {@link DagStore} so the run survives pi restart.
|
|
11
|
+
*
|
|
12
|
+
* The executor is wired (design.md "Integration with existing
|
|
13
|
+
* infrastructure"; task 7.1) with the existing infrastructure singletons
|
|
14
|
+
* from `index.ts`:
|
|
15
|
+
*
|
|
16
|
+
* - {@link AgentCoordinator} — one short-lived `delegate()` call per step
|
|
17
|
+
* - {@link AcpCircuitBreaker} — consulted before every dispatch; an open
|
|
18
|
+
* circuit fails the step immediately with
|
|
19
|
+
* `Agent "<name>" is unavailable (circuit breaker open)` (task 5.7)
|
|
20
|
+
* - {@link TemplateResolver} — expands `{<step>.output}` / `{<step>.status}`
|
|
21
|
+
* / `{dag.args.*}` in each step's prompt before dispatch (task 5.3)
|
|
22
|
+
* - {@link DagStore} — the persistence layer for DAG + step state
|
|
23
|
+
*
|
|
24
|
+
* Task 5.1 scope: create the class with a constructor that wires up these
|
|
25
|
+
* dependencies. The execution surface — `topologicalSort()`, `execute()`,
|
|
26
|
+
* wave dispatch, gate evaluation, failFast, circuit-breaker check,
|
|
27
|
+
* completion detection, `cancel()`, resume, stale detection, and retry — is
|
|
28
|
+
* implemented by the subsequent tasks 5.2–5.13.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import type { DagStore } from "./dag-store.js";
|
|
32
|
+
import type {
|
|
33
|
+
DagRecord,
|
|
34
|
+
DagStatus,
|
|
35
|
+
DagStepRecord,
|
|
36
|
+
DagStepStatus,
|
|
37
|
+
DagTaskDefinition,
|
|
38
|
+
} from "../config/types.js";
|
|
39
|
+
import type { TemplateResolver } from "./template-resolver.js";
|
|
40
|
+
import type { AgentCoordinator } from "../coordination/coordinator.js";
|
|
41
|
+
import type { AcpCircuitBreaker } from "../core/circuit-breaker.js";
|
|
42
|
+
import type { Logger } from "../logger.js";
|
|
43
|
+
import { createNoopLogger } from "../logger.js";
|
|
44
|
+
|
|
45
|
+
/**
 * Dependencies handed to the {@link DagExecutor} constructor. The four
 * required members are stored on the instance as-is; `logger` falls back to
 * a no-op logger and the remaining optional members may be left undefined.
 */
export interface DagExecutorOptions {
  /** Persistence layer for DAG records and their per-step state. */
  store: DagStore;
  /** Expands template variables inside step prompts before dispatch. */
  resolver: TemplateResolver;
  /** Coordinator whose `delegate()` is invoked once per dispatched step. */
  coordinator: AgentCoordinator;
  /** Per-agent circuit breaker checked before each step dispatch. */
  circuitBreaker: AcpCircuitBreaker;
  /**
   * Optional async executor kept on the instance purely for integration
   * wiring (task 7.1). The wave loop does not use it (task 5.3). Undefined
   * when omitted.
   */
  asyncExecutor?: unknown;
  /** Optional logger; a no-op logger is substituted when omitted. */
  logger?: Logger;
  /**
   * Optional sink for step lifecycle events (task 7.4). When supplied, the
   * executor appends a "dag-step" event per step status transition
   * (running, completed, failed, skipped, cancelled) carrying dagId,
   * stepId, agent, status, and durationMs.
   */
  eventLog?: { append(type: string, data: Record<string, unknown>): void };
}
|
|
71
|
+
|
|
72
|
+
/** No-op default logger so the executor is safe to build without one. */
|
|
73
|
+
const noopLogger = createNoopLogger();
|
|
74
|
+
|
|
75
|
+
/**
 * Per-category step counts reported by {@link DagExecutor.cancel}
 * (specs/dag-monitoring). Each count reflects step state at cancel time.
 */
export interface DagCancelSummary {
  /** Number of steps that were already `completed`. */
  completed: number;
  /** Number of in-flight (`running`) steps that were aborted. */
  aborted: number;
  /** Number of `pending` steps that were marked `cancelled`. */
  cancelled: number;
}
|
|
84
|
+
|
|
85
|
+
export class DagExecutor {
|
|
86
|
+
/** File-backed DAG + step state persistence. */
|
|
87
|
+
readonly store: DagStore;
|
|
88
|
+
/** Template variable interpolation for step prompts. */
|
|
89
|
+
readonly resolver: TemplateResolver;
|
|
90
|
+
/** Existing agent coordinator used for per-step `delegate()` dispatch. */
|
|
91
|
+
readonly coordinator: AgentCoordinator;
|
|
92
|
+
/** Existing per-agent circuit breaker consulted before each dispatch. */
|
|
93
|
+
readonly circuitBreaker: AcpCircuitBreaker;
|
|
94
|
+
/** Optional async executor wired from `index.ts` (task 7.1). */
|
|
95
|
+
readonly asyncExecutor: unknown;
|
|
96
|
+
/** Logger for step lifecycle / wave / resume events. */
|
|
97
|
+
protected readonly logger: Logger;
|
|
98
|
+
/**
|
|
99
|
+
* Optional event log for recording step lifecycle transitions (task 7.4).
|
|
100
|
+
* Appends "dag-step" events for each step status change with data including
|
|
101
|
+
* dagId, stepId, agent, status, and durationMs.
|
|
102
|
+
*/
|
|
103
|
+
protected readonly eventLog?: { append(type: string, data: Record<string, unknown>): void };
|
|
104
|
+
/**
|
|
105
|
+
* In-flight abort controllers keyed by `dagId` → `stepId`. Registered by
|
|
106
|
+
* {@link DagExecutor.dispatchStep} before each dispatch so {@link
|
|
107
|
+
* DagExecutor.cancel} (task 5.9) can abort in-flight agent sessions.
|
|
108
|
+
*
|
|
109
|
+
* This registry is SHARED across all DagExecutor instances (module-level
|
|
110
|
+
* singleton, see {@link SHARED_ABORT_CONTROLLERS}). In-flight agent
|
|
111
|
+
* sessions exist independent of which executor instance dispatched them
|
|
112
|
+
* or processes the cancel — `index.ts` constructs a fresh DagExecutor per
|
|
113
|
+
* tool call (task 7.1 wiring), so a per-instance map would leave
|
|
114
|
+
* `acp_dag_cancel` unable to abort sessions dispatched by the
|
|
115
|
+
* `acp_dag_submit` executor. Sharing the registry keeps cancellation
|
|
116
|
+
* working end-to-end (specs/dag-monitoring "DAG cancellation").
|
|
117
|
+
*/
|
|
118
|
+
protected readonly abortControllers = SHARED_ABORT_CONTROLLERS;
|
|
119
|
+
|
|
120
|
+
/**
 * Partition DAG tasks into ordered execution waves (design.md D2 / task 5.2).
 *
 * A task with no dependencies lands in wave 0. Any other task lands in the
 * wave directly after the deepest wave among its dependencies (longest-path
 * layering). Tasks that share a wave index are returned together, in the
 * order they were declared in `tasks`.
 *
 * The input array and its task objects are only read, never mutated.
 *
 * @param tasks Declarative task definitions. Expected to be validated
 *   already (no cycles, no dangling references).
 * @returns Waves in execution order, each an array of step IDs. An empty
 *   input produces an empty array.
 * @throws Error when some task can never be placed because a dependency
 *   is never assigned a wave (cycle or unknown dependency id).
 */
topologicalSort(tasks: readonly DagTaskDefinition[]): string[][] {
  if (tasks.length === 0) return [];

  // Step id → private copy of its dependency list.
  const dependencies = new Map<string, string[]>();
  for (const task of tasks) {
    dependencies.set(task.id, [...(task.dependsOn ?? [])]);
  }

  const layerById = new Map<string, number>();
  const unplaced = new Set(dependencies.keys());
  let deepestLayer = 0;

  // Repeatedly sweep the unplaced tasks, placing each one whose
  // dependencies all have a layer already. Stop when everything is placed
  // or a full sweep places nothing.
  let placedThisSweep: boolean;
  do {
    placedThisSweep = false;
    for (const id of unplaced) {
      let layer = 0;
      let ready = true;
      for (const dep of dependencies.get(id)!) {
        const depLayer = layerById.get(dep);
        if (depLayer === undefined) {
          ready = false;
          break;
        }
        layer = Math.max(layer, depLayer + 1);
      }
      if (!ready) continue;

      layerById.set(id, layer);
      unplaced.delete(id);
      placedThisSweep = true;
      if (layer > deepestLayer) deepestLayer = layer;
    }
  } while (unplaced.size > 0 && placedThisSweep);

  if (unplaced.size > 0) {
    // Defensive: only reachable when the input was not validated.
    const stuck = [...unplaced].join(", ");
    throw new Error(
      `DagExecutor.topologicalSort: unresolved dependencies for steps: ${stuck}`,
    );
  }

  // Fill the waves in declaration order so the result is deterministic.
  const waves: string[][] = [];
  for (let i = 0; i <= deepestLayer; i += 1) {
    waves.push([]);
  }
  for (const task of tasks) {
    waves[layerById.get(task.id)!].push(task.id);
  }
  return waves;
}
|
|
183
|
+
|
|
184
|
+
/**
 * Store the injected collaborators on the instance. A missing `logger`
 * is replaced by the module-level no-op logger; the other optional
 * members are stored as given (possibly undefined).
 */
constructor(options: DagExecutorOptions) {
  const {
    store,
    resolver,
    coordinator,
    circuitBreaker,
    asyncExecutor,
    logger,
    eventLog,
  } = options;

  this.store = store;
  this.resolver = resolver;
  this.coordinator = coordinator;
  this.circuitBreaker = circuitBreaker;
  this.asyncExecutor = asyncExecutor;
  this.logger = logger ?? noopLogger;
  this.eventLog = eventLog;
}
|
|
193
|
+
|
|
194
|
+
/**
 * Run a persisted DAG wave by wave (task 5.3).
 *
 * Sequence:
 *  1. Load the DAG record from the store (throws when absent).
 *  2. Copy the persisted step records into a working snapshot.
 *  3. Compute the waves with {@link DagExecutor.topologicalSort} and mark
 *     the DAG `running`.
 *  4. Run each wave in order via `runWave`, which updates the snapshot.
 *     After every wave the DAG is re-read from the store; if it has become
 *     `cancelled`, execution stops without touching the DAG status again.
 *  5. Once all waves ran, ask {@link DagExecutor.detectCompletion} for a
 *     terminal status and persist it when there is one.
 *
 * @param dagId DAG to execute.
 * @param options Optional flags forwarded to each wave. `skipTerminal`
 *   (task 5.10) leaves already-terminal steps untouched; used by
 *   {@link DagExecutor.resume}.
 * @throws Error when no DAG with `dagId` exists in the store.
 */
async execute(
  dagId: string,
  options?: { skipTerminal?: boolean },
): Promise<void> {
  const record = this.store.get(dagId);
  if (!record) {
    throw new Error(`DAG "${dagId}" not found`);
  }

  // Working snapshot: shallow copies of the persisted step records, so the
  // wave loop can update entries without touching the loaded record.
  const steps: Record<string, DagStepRecord> = {};
  for (const [stepId, persisted] of Object.entries(record.steps)) {
    steps[stepId] = { ...persisted };
  }

  const waves = this.topologicalSort(record.tasks);
  this.store.updateDagStatus(dagId, "running");

  for (const waveStepIds of waves) {
    await this.runWave(dagId, record, steps, waveStepIds, options);

    // A cancel that landed while the wave was running owns the DAG status
    // (task 5.9): stop here and leave `cancelled` in place.
    const latest = this.store.get(dagId);
    if (latest?.status === "cancelled") return;
  }

  // Completion detection (task 5.8): persist a DAG-level status only when
  // every step in the snapshot is terminal.
  const outcome = this.detectCompletion(steps);
  if (outcome === null) return;
  this.store.updateDagStatus(dagId, outcome);
}
|
|
252
|
+
|
|
253
|
+
/**
 * Resume an interrupted DAG (task 5.10, specs/dag-resume).
 *
 * Two phases:
 *  1. Every step persisted as `running` is reset to `pending` with
 *     `startedAt` cleared. Such a step was interrupted mid-flight, so its
 *     outcome is unknown and it has to run again.
 *  2. The wave loop is re-entered through {@link DagExecutor.execute} with
 *     `skipTerminal: true`, so steps already in a terminal state are not
 *     dispatched again.
 *
 * @param dagId DAG to resume.
 * @throws Error when no DAG with `dagId` exists in the store.
 */
async resume(dagId: string): Promise<void> {
  const record = this.store.get(dagId);
  if (!record) {
    throw new Error(`DAG "${dagId}" not found`);
  }

  // Phase 1: find the steps that were in flight when the run stopped.
  const interruptedIds = Object.keys(record.steps).filter(
    (stepId) => record.steps[stepId].status === "running",
  );
  for (const stepId of interruptedIds) {
    this.store.updateStep(dagId, stepId, (s) => ({
      ...s,
      status: "pending",
      startedAt: undefined,
    }));
  }

  // Phase 2: `execute` loads the record from the store again, so it works
  // from the state written above.
  await this.execute(dagId, { skipTerminal: true });
}
|
|
300
|
+
|
|
301
|
+
/**
 * Resume every DAG the store reports as `running` (task 5.10, task 7.3).
 *
 * DAGs come from {@link DagStore.findRunning} and are resumed one after
 * another through {@link DagExecutor.resume}. A failure while resuming one
 * DAG is logged and does not stop the remaining DAGs from being resumed.
 *
 * @returns The IDs of all DAGs for which a resume was attempted, whether
 *   or not the attempt succeeded.
 */
async resumeAll(): Promise<string[]> {
  const attempted: string[] = [];

  for (const { dagId } of this.store.findRunning()) {
    attempted.push(dagId);
    try {
      await this.resume(dagId);
    } catch (err) {
      // Keep going: one broken DAG must not block the others.
      const message = err instanceof Error ? err.message : String(err);
      this.logger.error(
        `DagExecutor.resumeAll: failed to resume DAG "${dagId}": ${message}`,
      );
    }
  }

  return attempted;
}
|
|
331
|
+
|
|
332
|
+
/**
 * Derive the DAG-level terminal status from a step map (task 5.8).
 *
 * - `null` when at least one step is not yet in a terminal state.
 * - `"failed"` when all steps are terminal and at least one of them is
 *   `failed`, `skipped`, or `cancelled`.
 * - `"completed"` otherwise, which includes the empty step map.
 *
 * Only the supplied map is inspected; the store is neither read nor
 * written.
 *
 * @param steps Step records keyed by step id.
 * @returns The terminal DAG status, or `null` while the DAG is unfinished.
 */
detectCompletion(
  steps: Record<string, DagStepRecord>,
): DagStatus | null {
  let sawNonSuccess = false;

  for (const { status } of Object.values(steps)) {
    // Any non-terminal step means the DAG as a whole is not done yet.
    if (!isTerminal(status)) return null;

    if (
      status === "failed" ||
      status === "skipped" ||
      status === "cancelled"
    ) {
      sawNonSuccess = true;
    }
  }

  return sawNonSuccess ? "failed" : "completed";
}
|
|
363
|
+
|
|
364
|
+
/**
 * Run one wave: start every step of the wave in parallel, wait until all
 * of them have settled, then copy the resulting step records back into
 * the working snapshot `steps` (which is mutated).
 *
 * Per step, in order:
 *  - Unknown step ids are ignored.
 *  - With `options.skipTerminal`, a step already in a terminal state is
 *    left untouched (resume, task 5.10).
 *  - Gate evaluation (task 5.5) and failFast (task 5.6): when
 *    {@link DagExecutor.gateAllowsDispatch} rejects the step, it is marked
 *    `skipped` and not dispatched.
 *  - The prompt is resolved against the outputs/statuses captured at the
 *    start of the wave plus the DAG args. If template variables remain
 *    unresolved, the step is marked `failed` and not dispatched.
 *  - Otherwise the step is dispatched with the DAG's `maxRetries` budget
 *    (default 0).
 *
 * A dispatch that rejects unexpectedly (for example because the resolver
 * or the store threw) is logged and the step is marked `failed`, so the
 * wave always ends with its dispatched steps in a terminal state instead
 * of leaving the DAG stuck in `running`.
 *
 * @param dagId DAG being executed.
 * @param record DAG record loaded by the caller; read for `args`/`options`.
 * @param steps Working snapshot of step records; updated in place.
 * @param waveStepIds IDs of the steps that make up this wave.
 * @param options Optional flags; see `skipTerminal` above.
 */
private async runWave(
  dagId: string,
  record: DagRecord,
  steps: Record<string, DagStepRecord>,
  waveStepIds: string[],
  options?: { skipTerminal?: boolean },
): Promise<void> {
  // Template context is computed once, before any dispatch, so every step
  // in the wave resolves against the same view of earlier waves.
  const outputs = collectOutputs(steps);
  const statuses = collectStatuses(steps);
  const dagArgs = record.args ?? {};

  // DAG-level options are the same for every step in the wave.
  const failFast = record.options?.failFast !== false;
  const maxRetries = record.options?.maxRetries ?? 0;

  const dispatches = waveStepIds.map(async (stepId) => {
    const step = steps[stepId];
    if (!step) return undefined;

    // Resume (task 5.10): terminal steps keep their stored result.
    if (options?.skipTerminal && isTerminal(step.status)) {
      return undefined;
    }

    // Gate evaluation (task 5.5) + failFast (task 5.6, design.md D5). A
    // `needs` gate whose dependency did not complete blocks the step,
    // unless the DAG runs with `failFast: false`, in which case the gate
    // behaves like `after` and the step still dispatches.
    if (!this.gateAllowsDispatch(step, steps, failFast)) {
      return this.skipStep(dagId, step);
    }

    const resolvedPrompt = this.resolver.resolve(
      step.prompt,
      outputs,
      statuses,
      dagArgs,
    );

    // Leftover template variables (unknown step id, missing DAG arg, typo)
    // fail the step instead of sending a broken prompt to the agent.
    if (this.resolver.hasUnresolvedTemplates(resolvedPrompt)) {
      const error = `Unresolved template variable in prompt: ${resolvedPrompt.match(/\{[^}]+\}/g)?.join(", ")}`;
      const completedAt = new Date().toISOString();
      const failedStep = this.store.updateStep(dagId, step.id, (s) => ({
        ...s,
        status: "failed" as const,
        output: null,
        error,
        completedAt,
      }));
      this.logStepEvent(dagId, step, "failed", 0);
      // The fallback mirrors the persisted fields so the snapshot carries
      // the same error/completedAt even when the store returns nothing.
      return (
        failedStep ?? {
          ...step,
          status: "failed" as const,
          output: null,
          error,
          completedAt,
        }
      );
    }

    return this.dispatchStep(dagId, step, resolvedPrompt, { maxRetries });
  });

  const settled = await Promise.allSettled(dispatches);

  // Mirror the results back into the working snapshot so the next wave's
  // gate checks and template resolution see them.
  settled.forEach((result, i) => {
    const stepId = waveStepIds[i];

    if (result.status === "fulfilled") {
      if (result.value) {
        steps[stepId] = result.value;
      }
      return;
    }

    // The dispatch itself blew up. Do not drop the error silently: log it
    // and move the step to `failed` so the DAG can still reach a terminal
    // status.
    const reason: unknown = result.reason;
    const message = reason instanceof Error ? reason.message : String(reason);
    this.logger.error(
      `DagExecutor.runWave: step "${stepId}" of DAG "${dagId}" rejected unexpectedly: ${message}`,
    );

    const current = steps[stepId];
    if (!current) return;

    const patch = {
      status: "failed" as const,
      output: null,
      error: message,
      completedAt: new Date().toISOString(),
    };
    let persisted: DagStepRecord | null | undefined;
    try {
      persisted = this.store.updateStep(dagId, stepId, (s) => ({
        ...s,
        ...patch,
      }));
      this.logStepEvent(dagId, current, "failed", 0);
    } catch (persistErr) {
      // Persisting is best-effort here: the store may be what failed.
      const persistMessage =
        persistErr instanceof Error ? persistErr.message : String(persistErr);
      this.logger.error(
        `DagExecutor.runWave: could not persist failure of step "${stepId}" of DAG "${dagId}": ${persistMessage}`,
      );
    }
    steps[stepId] = persisted ?? { ...current, ...patch };
  });
}
|
|
453
|
+
|
|
454
|
+
/**
 * Decide whether a step's gate permits dispatching it now (task 5.5,
 * design.md D4).
 *
 * - No dependencies: always allowed.
 * - `after` gate: allowed once every dependency is in a terminal state,
 *   whatever its outcome.
 * - Any other gate (the default, `needs`): allowed only when every
 *   dependency is `completed`. With `failFast` set to `false` the check
 *   is relaxed to the same rule as `after` (task 5.6, design.md D5).
 *
 * A dependency id missing from `steps` is evaluated with an undefined
 * status.
 *
 * @param step Step whose gate is evaluated.
 * @param steps Current step records keyed by step id.
 * @param failFast DAG-level failFast flag; defaults to `true`.
 * @returns `true` when the step may be dispatched.
 */
gateAllowsDispatch(
  step: DagStepRecord,
  steps: Record<string, DagStepRecord>,
  failFast = true,
): boolean {
  const deps = step.dependsOn ?? [];
  if (deps.length === 0) return true;

  // Both the `after` gate and a `needs` gate under failFast=false only
  // require the dependencies to have finished, not to have succeeded.
  const anyTerminalOutcomeIsEnough = step.gate === "after" || !failFast;

  return deps.every((depId) => {
    const depStatus = steps[depId]?.status;
    return anyTerminalOutcomeIsEnough
      ? isTerminal(depStatus)
      : depStatus === "completed";
  });
}
|
|
490
|
+
|
|
491
|
+
/**
 * Mark a step `skipped` without dispatching it (task 5.5). The transition
 * (status, `output: null`, `completedAt`) is persisted through
 * {@link DagStore.updateStep} and a "skipped" lifecycle event with a
 * duration of 0 is logged.
 *
 * @param dagId DAG the step belongs to.
 * @param step Step to skip.
 * @returns The record returned by the store, or, when the store returns
 *   nothing, the given step with the same fields applied.
 */
private skipStep(dagId: string, step: DagStepRecord): DagStepRecord {
  const skippedFields = {
    status: "skipped" as const,
    output: null,
    completedAt: new Date().toISOString(),
  };

  const persisted = this.store.updateStep(dagId, step.id, (s) => ({
    ...s,
    ...skippedFields,
  }));
  this.logStepEvent(dagId, step, "skipped", 0);

  return persisted ?? { ...step, ...skippedFields };
}
|
|
515
|
+
|
|
516
|
+
/**
 * Dispatch one step through {@link AgentCoordinator.delegate} and persist
 * its outcome via {@link DagStore.updateStep}.
 *
 * Flow:
 *  1. Circuit-breaker check (task 5.7, design.md R3): when
 *     `circuitBreaker.isHealthy(step.agent)` is false the step is failed
 *     immediately with
 *     `Agent "<name>" is unavailable (circuit breaker open)` and
 *     `delegate` is not called.
 *  2. The step is persisted as `running` and an abort controller is
 *     registered for it so `cancel(dagId)` (task 5.9) can abort the call.
 *  3. On success the result text is stored and the step is `completed`.
 *  4. On error:
 *     - if the controller's signal is aborted, or the error is an abort
 *       error, the step becomes `cancelled` and is never retried;
 *     - otherwise, while `retryCount` is below `maxRetries`, a retry is
 *       recorded and the same resolved prompt is dispatched again
 *       (task 5.12);
 *     - otherwise the step becomes `failed` with the error message.
 *  5. The abort controller entry is always released when the dispatch
 *     settles.
 *
 * @param dagId DAG the step belongs to.
 * @param step Step to dispatch.
 * @param resolvedPrompt Prompt with template variables already expanded.
 * @param retryOptions Optional retry budget; `maxRetries` defaults to 0.
 * @returns The step record in its terminal state. This is the record
 *   returned by the store, or a locally built equivalent when the store
 *   returns nothing.
 */
private async dispatchStep(
  dagId: string,
  step: DagStepRecord,
  resolvedPrompt: string,
  retryOptions?: { maxRetries?: number },
): Promise<DagStepRecord> {
  // Persist a terminal transition, log the lifecycle event, and return the
  // resulting record. `startedAtMs` is omitted when the step never started
  // (duration 0).
  const settle = (
    status: DagStepStatus,
    fields: Pick<DagStepRecord, "output" | "error">,
    startedAtMs?: number,
  ): DagStepRecord => {
    const completedAtMs = Date.now();
    const durationMs =
      startedAtMs === undefined ? 0 : completedAtMs - startedAtMs;
    const patch = {
      ...fields,
      status,
      completedAt: new Date(completedAtMs).toISOString(),
      durationMs,
    };
    const updated = this.store.updateStep(dagId, step.id, (s) => ({
      ...s,
      ...patch,
    }));
    this.logStepEvent(dagId, step, status, durationMs);
    return updated ?? { ...step, ...patch };
  };

  // Circuit breaker check (task 5.7): do not hammer an agent that is
  // already known to be unhealthy.
  if (!this.circuitBreaker.isHealthy(step.agent)) {
    return settle("failed", {
      output: null,
      error: `Agent "${step.agent}" is unavailable (circuit breaker open)`,
    });
  }

  const startedAtMs = Date.now();
  const startedAt = new Date(startedAtMs).toISOString();
  this.store.updateStep(dagId, step.id, (s) => ({
    ...s,
    status: "running",
    startedAt,
  }));
  this.logStepEvent(dagId, step, "running");

  // Register an AbortController so `cancel(dagId)` (task 5.9) can abort
  // this in-flight call; its signal is handed to the coordinator.
  const controller = this.registerAbortController(dagId, step.id);
  const signal = controller.signal;

  try {
    const result = await this.coordinator.delegate(
      step.agent,
      resolvedPrompt,
      undefined,
      undefined,
      signal,
    );
    return settle(
      "completed",
      { output: result.text, error: undefined },
      startedAtMs,
    );
  } catch (err) {
    // Cancellation (task 5.9). Checking the signal as well as the error
    // covers aborts that surface as some other error type; those must end
    // as `cancelled` and must never be retried.
    if (signal.aborted || isAbortError(err)) {
      return settle(
        "cancelled",
        { output: null, error: undefined },
        startedAtMs,
      );
    }

    const message = err instanceof Error ? err.message : String(err);

    // Step retry (task 5.12, design.md D5): re-dispatch the same resolved
    // prompt while the retry budget is not exhausted.
    const maxRetries = retryOptions?.maxRetries ?? 0;
    const currentRetries = step.retryCount ?? 0;
    if (maxRetries > 0 && currentRetries < maxRetries) {
      // Release this attempt's controller before the next attempt
      // registers its own for the same step.
      this.unregisterAbortController(dagId, step.id);
      const retriedStep = this.recordRetry(dagId, step, currentRetries);
      return await this.dispatchStep(dagId, retriedStep, resolvedPrompt, {
        maxRetries,
      });
    }

    return settle("failed", { output: null, error: message }, startedAtMs);
  } finally {
    // Always release the in-flight controller entry once the dispatch
    // settles, whatever the outcome.
    this.unregisterAbortController(dagId, step.id);
  }
}
|
|
681
|
+
|
|
682
|
+
/**
 * Persist the start of a new retry attempt for a step whose previous
 * dispatch failed (task 5.12).
 *
 * The step is written back through {@link DagStore.updateStep} with status
 * `running`, a fresh `startedAt`, `retryCount` set to `currentRetries + 1`,
 * and the previous attempt's `error`, `output`, `completedAt` and
 * `durationMs` cleared.
 *
 * @param dagId DAG that owns the step.
 * @param step Step record as seen by the caller; supplies the step id and
 *   is the base of the fallback record.
 * @param currentRetries Number of retries already consumed before this one.
 * @returns The record handed back by the store, or — when the store returns
 *   a nullish value — the caller's `step` with the same reset applied, so
 *   the caller can always chain the re-dispatch.
 */
protected recordRetry(
  dagId: string,
  step: DagStepRecord,
  currentRetries: number,
): DagStepRecord {
  // One definition of the reset, shared by the persisted update and the
  // in-memory fallback so the two can never drift apart.
  const attemptReset = {
    status: "running" as const,
    retryCount: currentRetries + 1,
    startedAt: new Date().toISOString(),
    error: undefined,
    output: null,
    completedAt: undefined,
    durationMs: undefined,
  };

  const persisted = this.store.updateStep(dagId, step.id, (current) => ({
    ...current,
    ...attemptReset,
  }));
  if (persisted != null) {
    return persisted;
  }
  return { ...step, ...attemptReset };
}
|
|
718
|
+
|
|
719
|
+
/**
 * Cancel a DAG that has not yet reached a terminal state (task 5.9,
 * specs/dag-monitoring "DAG cancellation").
 *
 * Steps, in order:
 *  1. Load the DAG record and reject unknown or already-terminal DAGs.
 *  2. Fire `abort()` on every controller registered for this DAG via
 *     {@link DagExecutor.abortInFlight} (best-effort).
 *  3. Persist every step that was `pending` or `running` in the loaded
 *     snapshot as `cancelled`, and emit one `cancelled` step event each.
 *  4. Transition the DAG itself to `cancelled`.
 *
 * The returned counts describe the snapshot loaded in step 1:
 *  - `completed` — steps already `completed` (left untouched)
 *  - `aborted`   — steps that were `running`
 *  - `cancelled` — steps that were `pending`
 *
 * For steps that were `running`, `durationMs` is derived from the step's
 * `startedAt` (matching how the dispatch path computes it) and is both
 * persisted and reported in the event. Steps that never started, or whose
 * `startedAt` is missing/unparseable, report a duration of `0` in the event
 * and get no persisted `durationMs`.
 *
 * @param dagId Identifier of the DAG to cancel.
 * @returns Summary counts of completed / aborted / cancelled steps.
 * @throws Error when the DAG does not exist, or its status is already
 *   `completed`, `failed` or `cancelled`.
 */
async cancel(dagId: string): Promise<DagCancelSummary> {
  const record = this.store.get(dagId);
  if (!record) {
    throw new Error(`DAG "${dagId}" not found`);
  }
  if (
    record.status === "completed" ||
    record.status === "failed" ||
    record.status === "cancelled"
  ) {
    throw new Error(
      `DAG "${dagId}" is already ${record.status} and cannot be cancelled`,
    );
  }

  // Abort every in-flight agent session for this DAG (best-effort).
  this.abortInFlight(dagId);

  const completedAt = new Date().toISOString();
  const cancelledAtMs = Date.parse(completedAt);

  let completed = 0;
  let aborted = 0;
  let cancelled = 0;

  // Counts and transitions are both driven by the snapshot loaded above.
  for (const [stepId, step] of Object.entries(record.steps)) {
    if (step.status === "completed") {
      completed += 1;
      continue;
    }
    if (step.status !== "pending" && step.status !== "running") {
      continue;
    }

    const wasRunning = step.status === "running";
    if (wasRunning) aborted += 1;
    else cancelled += 1;

    // Only a step that actually started has a meaningful elapsed time.
    const startedAtMs =
      wasRunning && step.startedAt ? Date.parse(step.startedAt) : Number.NaN;
    const durationMs = Number.isNaN(startedAtMs)
      ? undefined
      : cancelledAtMs - startedAtMs;

    this.store.updateStep(dagId, stepId, (s) => ({
      ...s,
      status: "cancelled",
      output: null,
      error: undefined,
      completedAt,
      // Leave `durationMs` untouched when no elapsed time could be derived.
      ...(durationMs === undefined ? {} : { durationMs }),
    }));
    this.logStepEvent(dagId, step, "cancelled", durationMs ?? 0);
  }

  this.store.updateDagStatus(dagId, "cancelled");

  return { completed, aborted, cancelled };
}
|
|
787
|
+
|
|
788
|
+
/**
 * Stale DAG detection (task 5.11, specs/dag-resume "Stale DAG cleanup").
 *
 * Asks the store for every DAG it reports as running and transitions to
 * `stale` each one whose `updatedAt` timestamp is strictly older than
 * `now - timeoutMs`. Records whose `updatedAt` cannot be parsed are skipped.
 * Each transition goes through {@link DagStore.updateDagStatus} and is
 * reported via `this.logger.error`.
 *
 * Only DAGs returned by `findRunning()` are considered, so DAGs in any
 * other status are not touched by this method.
 *
 * @param timeoutMs Stale threshold in milliseconds. This method applies no
 *   default of its own; callers are expected to pass the configured
 *   `dagStaleTimeoutMs`. A value that does not yield a numeric cutoff
 *   (`NaN`, or `undefined` slipping through at runtime) is rejected: nothing
 *   is marked and an error is logged, because a `NaN` cutoff would otherwise
 *   make every running DAG compare as stale.
 * @returns The IDs of the DAGs newly marked `stale` by this call; empty when
 *   none crossed the threshold.
 */
markStale(timeoutMs: number): string[] {
  const cutoff = Date.now() - timeoutMs;
  if (Number.isNaN(cutoff)) {
    // `updatedAtMs >= NaN` is always false, so without this guard an invalid
    // threshold would mark every running DAG as stale.
    this.logger.error(
      `DagExecutor.markStale: invalid timeoutMs (${String(timeoutMs)}); no DAGs were marked stale`,
    );
    return [];
  }

  // `findRunning()` scans the per-DAG `<dagId>.json` files directly
  // (the source of truth for `updatedAt`), not the index summary, so a
  // backdated or out-of-sync index cannot mask a stale DAG.
  const running = this.store.findRunning();
  const marked: string[] = [];

  for (const record of running) {
    const updatedAtMs = Date.parse(record.updatedAt);
    if (Number.isNaN(updatedAtMs)) continue;
    if (updatedAtMs >= cutoff) continue;

    // Transition to `stale` via the store so the index reflects it.
    this.store.updateDagStatus(record.dagId, "stale");
    marked.push(record.dagId);
    this.logger.error(
      `DagExecutor.markStale: DAG "${record.dagId}" marked stale (no transitions for >${timeoutMs}ms)`,
    );
  }

  return marked;
}
|
|
831
|
+
|
|
832
|
+
/**
 * Append one `dag-step` lifecycle event to `this.eventLog` (task 7.4,
 * specs/dag-monitoring "Event logging for DAG steps").
 *
 * The event payload carries `dagId`, `stepId`, `agent`, `status` and an ISO
 * `timestamp` taken at call time; `durationMs` is added only when a number
 * was supplied. When no event log is wired the call does nothing, which
 * keeps executors constructed without one working.
 *
 * @param dagId DAG the step belongs to.
 * @param step Anything exposing the step's `id` and `agent`.
 * @param status Status the step has just transitioned to.
 * @param durationMs Optional elapsed time to attach to the event.
 */
protected logStepEvent(
  dagId: string,
  step: { id: string; agent: string },
  status: DagStepStatus,
  durationMs?: number,
): void {
  if (this.eventLog) {
    const payload: Record<string, unknown> = {
      dagId,
      stepId: step.id,
      agent: step.agent,
      status,
      timestamp: new Date().toISOString(),
      // Only numeric durations are recorded; `undefined` leaves the key out.
      ...(typeof durationMs === "number" ? { durationMs } : {}),
    };
    this.eventLog.append("dag-step", payload);
  }
}
|
|
861
|
+
|
|
862
|
+
/**
 * Create an {@link AbortController} for a step dispatch and store it in
 * `this.abortControllers` under `dagId` → `stepId`, creating the per-DAG
 * map on first use. An existing entry for the same step is replaced.
 *
 * @param dagId DAG the dispatch belongs to.
 * @param stepId Step being dispatched.
 * @returns The new controller, so the caller can pass its `signal` on to
 *   the dispatch and {@link DagExecutor.abortInFlight} can abort it later.
 */
protected registerAbortController(
  dagId: string,
  stepId: string,
): AbortController {
  const controller = new AbortController();
  const stepControllers = this.abortControllers.get(dagId);
  if (stepControllers) {
    stepControllers.set(stepId, controller);
  } else {
    this.abortControllers.set(dagId, new Map([[stepId, controller]]));
  }
  return controller;
}
|
|
880
|
+
|
|
881
|
+
/**
 * Drop the abort-controller entry for a step whose dispatch has settled.
 * When that was the last entry for the DAG, the now-empty per-DAG map is
 * removed as well. Safe to call for a DAG or step that has no entry.
 */
protected unregisterAbortController(dagId: string, stepId: string): void {
  const stepControllers = this.abortControllers.get(dagId);
  if (stepControllers) {
    stepControllers.delete(stepId);
    if (stepControllers.size === 0) {
      this.abortControllers.delete(dagId);
    }
  }
}
|
|
888
|
+
|
|
889
|
+
/**
 * Call `abort()` on every controller currently registered for `dagId`
 * (task 5.9). Best-effort: an exception thrown by one `abort()` call is
 * swallowed so the remaining controllers are still aborted. Does nothing
 * when the DAG has no registered controllers.
 *
 * NOTE(review): the effect of the abort on the underlying agent session
 * (cancel + dispose, rejection with an `AbortError`) is implemented by the
 * coordinator, outside this method — confirm there.
 */
protected abortInFlight(dagId: string): void {
  this.abortControllers.get(dagId)?.forEach((controller) => {
    try {
      controller.abort();
    } catch {
      /* best-effort — a controller that already aborted is a no-op */
    }
  });
}
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
/**
 * Module-level registry of in-flight abort controllers, keyed by
 * `dagId` → `stepId` → controller.
 *
 * Because it lives at module scope there is a single map per module load,
 * which is what lets an executor built for `acp_dag_cancel` reach sessions
 * dispatched by a different executor built for `acp_dag_submit` (task 7.1
 * wires a fresh DagExecutor per tool call). The two-level key keeps
 * concurrent DAGs from colliding.
 *
 * NOTE(review): how executors obtain this map (presumably as their
 * `abortControllers` field) is set up outside this excerpt — confirm.
 */
const SHARED_ABORT_CONTROLLERS: Map<string, Map<string, AbortController>> = new Map();
|
|
918
|
+
|
|
919
|
+
/**
 * Build an `{id → text}` snapshot of step results for the wave loop.
 *
 * A step contributes its `output` whenever that is a string (regardless of
 * status). Otherwise, a `failed` step with a non-empty `error` contributes
 * the error text, so `{<failed-step>.output}` can surface the failure to
 * `after`-gated / failFast=false downstream steps. Every other step is
 * left out of the result.
 */
function collectOutputs(steps: Record<string, DagStepRecord>): Record<string, string> {
  const textById: Record<string, string> = {};
  Object.entries(steps).forEach(([stepId, { output, status, error }]) => {
    if (typeof output === "string") {
      textById[stepId] = output;
      return;
    }
    // No text output: fall back to the error message of a failed step.
    if (status === "failed" && error) {
      textById[stepId] = error;
    }
  });
  return textById;
}
|
|
936
|
+
|
|
937
|
+
/**
 * Build an `{id → status}` snapshot covering every step in `steps`,
 * whatever its current status.
 */
function collectStatuses(steps: Record<string, DagStepRecord>): Record<string, string> {
  const statusById: Record<string, string> = {};
  Object.entries(steps).forEach(([stepId, { status }]) => {
    statusById[stepId] = status;
  });
  return statusById;
}
|
|
945
|
+
|
|
946
|
+
/**
 * Report whether a step status is terminal, i.e. one of `completed`,
 * `failed`, `skipped` or `cancelled`. Any other value, including
 * `undefined`, is non-terminal.
 */
function isTerminal(status: DagStepStatus | undefined): boolean {
  switch (status) {
    case "completed":
    case "failed":
    case "skipped":
    case "cancelled":
      return true;
    default:
      return false;
  }
}
|
|
955
|
+
|
|
956
|
+
/**
 * Report whether a caught value looks like an abort rejection: a non-null
 * object whose `name` property is exactly `"AbortError"`. Primitives,
 * `null`, `undefined` and functions never match.
 *
 * Used by {@link DagExecutor.dispatchStep} to map an aborted dispatch to a
 * `cancelled` terminal status rather than `failed` (task 5.9).
 */
function isAbortError(err: unknown): boolean {
  return (
    typeof err === "object" &&
    err !== null &&
    (err as { name?: unknown }).name === "AbortError"
  );
}
|