@dogpile/sdk 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +201 -0
  2. package/README.md +1 -0
  3. package/dist/browser/index.js +2328 -237
  4. package/dist/browser/index.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +1 -0
  8. package/dist/index.js.map +1 -1
  9. package/dist/providers/openai-compatible.d.ts +11 -0
  10. package/dist/providers/openai-compatible.d.ts.map +1 -1
  11. package/dist/providers/openai-compatible.js +88 -2
  12. package/dist/providers/openai-compatible.js.map +1 -1
  13. package/dist/runtime/audit.d.ts +42 -0
  14. package/dist/runtime/audit.d.ts.map +1 -0
  15. package/dist/runtime/audit.js +73 -0
  16. package/dist/runtime/audit.js.map +1 -0
  17. package/dist/runtime/broadcast.d.ts.map +1 -1
  18. package/dist/runtime/broadcast.js +39 -36
  19. package/dist/runtime/broadcast.js.map +1 -1
  20. package/dist/runtime/cancellation.d.ts +26 -0
  21. package/dist/runtime/cancellation.d.ts.map +1 -1
  22. package/dist/runtime/cancellation.js +38 -1
  23. package/dist/runtime/cancellation.js.map +1 -1
  24. package/dist/runtime/coordinator.d.ts +79 -1
  25. package/dist/runtime/coordinator.d.ts.map +1 -1
  26. package/dist/runtime/coordinator.js +979 -61
  27. package/dist/runtime/coordinator.js.map +1 -1
  28. package/dist/runtime/decisions.d.ts +25 -3
  29. package/dist/runtime/decisions.d.ts.map +1 -1
  30. package/dist/runtime/decisions.js +241 -3
  31. package/dist/runtime/decisions.js.map +1 -1
  32. package/dist/runtime/defaults.d.ts +37 -1
  33. package/dist/runtime/defaults.d.ts.map +1 -1
  34. package/dist/runtime/defaults.js +359 -4
  35. package/dist/runtime/defaults.js.map +1 -1
  36. package/dist/runtime/engine.d.ts +17 -4
  37. package/dist/runtime/engine.d.ts.map +1 -1
  38. package/dist/runtime/engine.js +770 -35
  39. package/dist/runtime/engine.js.map +1 -1
  40. package/dist/runtime/health.d.ts +51 -0
  41. package/dist/runtime/health.d.ts.map +1 -0
  42. package/dist/runtime/health.js +85 -0
  43. package/dist/runtime/health.js.map +1 -0
  44. package/dist/runtime/introspection.d.ts +96 -0
  45. package/dist/runtime/introspection.d.ts.map +1 -0
  46. package/dist/runtime/introspection.js +31 -0
  47. package/dist/runtime/introspection.js.map +1 -0
  48. package/dist/runtime/metrics.d.ts +44 -0
  49. package/dist/runtime/metrics.d.ts.map +1 -0
  50. package/dist/runtime/metrics.js +12 -0
  51. package/dist/runtime/metrics.js.map +1 -0
  52. package/dist/runtime/model.d.ts.map +1 -1
  53. package/dist/runtime/model.js +34 -7
  54. package/dist/runtime/model.js.map +1 -1
  55. package/dist/runtime/provenance.d.ts +25 -0
  56. package/dist/runtime/provenance.d.ts.map +1 -0
  57. package/dist/runtime/provenance.js +13 -0
  58. package/dist/runtime/provenance.js.map +1 -0
  59. package/dist/runtime/sequential.d.ts.map +1 -1
  60. package/dist/runtime/sequential.js +47 -37
  61. package/dist/runtime/sequential.js.map +1 -1
  62. package/dist/runtime/shared.d.ts.map +1 -1
  63. package/dist/runtime/shared.js +39 -36
  64. package/dist/runtime/shared.js.map +1 -1
  65. package/dist/runtime/tracing.d.ts +31 -0
  66. package/dist/runtime/tracing.d.ts.map +1 -0
  67. package/dist/runtime/tracing.js +18 -0
  68. package/dist/runtime/tracing.js.map +1 -0
  69. package/dist/runtime/validation.d.ts +10 -0
  70. package/dist/runtime/validation.d.ts.map +1 -1
  71. package/dist/runtime/validation.js +73 -0
  72. package/dist/runtime/validation.js.map +1 -1
  73. package/dist/types/events.d.ts +339 -12
  74. package/dist/types/events.d.ts.map +1 -1
  75. package/dist/types/replay.d.ts +7 -1
  76. package/dist/types/replay.d.ts.map +1 -1
  77. package/dist/types.d.ts +255 -6
  78. package/dist/types.d.ts.map +1 -1
  79. package/dist/types.js.map +1 -1
  80. package/package.json +39 -1
  81. package/src/index.ts +15 -0
  82. package/src/providers/openai-compatible.ts +83 -3
  83. package/src/runtime/audit.ts +121 -0
  84. package/src/runtime/broadcast.ts +40 -37
  85. package/src/runtime/cancellation.ts +59 -1
  86. package/src/runtime/coordinator.ts +1221 -61
  87. package/src/runtime/decisions.ts +307 -4
  88. package/src/runtime/defaults.ts +389 -4
  89. package/src/runtime/engine.ts +1004 -35
  90. package/src/runtime/health.ts +136 -0
  91. package/src/runtime/introspection.ts +122 -0
  92. package/src/runtime/metrics.ts +45 -0
  93. package/src/runtime/model.ts +38 -6
  94. package/src/runtime/provenance.ts +43 -0
  95. package/src/runtime/sequential.ts +49 -38
  96. package/src/runtime/shared.ts +40 -37
  97. package/src/runtime/tracing.ts +35 -0
  98. package/src/runtime/validation.ts +81 -0
  99. package/src/types/events.ts +369 -12
  100. package/src/types/replay.ts +14 -1
  101. package/src/types.ts +279 -4
@@ -1,22 +1,31 @@
1
+ import { DogpileError } from "../types.js";
1
2
  import type {
2
3
  AgentSpec,
3
4
  ConfiguredModelProvider,
4
5
  CoordinatorProtocolConfig,
5
6
  CostSummary,
7
+ DelegateAgentDecision,
6
8
  DogpileOptions,
7
9
  JsonObject,
8
10
  JsonValue,
9
11
  ModelRequest,
10
12
  ModelResponse,
13
+ ProtocolSelection,
11
14
  ReplayTraceProtocolDecision,
12
15
  ReplayTraceProviderCall,
13
16
  RuntimeTool,
14
17
  RuntimeToolExecutor,
15
18
  RunEvent,
16
19
  RunResult,
20
+ SubRunBudgetClampedEvent,
21
+ SubRunConcurrencyClampedEvent,
22
+ SubRunFailedEvent,
23
+ SubRunQueuedEvent,
24
+ SubRunParentAbortedEvent,
17
25
  TerminationCondition,
18
26
  TerminationStopRecord,
19
27
  Tier,
28
+ Trace,
20
29
  TranscriptEntry
21
30
  } from "../types.js";
22
31
  import { createRunId, elapsedMs, nowMs, providerCallIdFor } from "./ids.js";
@@ -34,15 +43,68 @@ import {
34
43
  createRunUsage,
35
44
  createTranscriptLink,
36
45
  emptyCost,
46
+ lastCostBearingEventCost,
37
47
  nextProviderCallId
38
48
  } from "./defaults.js";
39
- import { throwIfAborted } from "./cancellation.js";
40
- import { parseAgentDecision } from "./decisions.js";
49
+ import { computeHealth, DEFAULT_HEALTH_THRESHOLDS } from "./health.js";
50
+ import {
51
+ classifyAbortReason,
52
+ classifyChildTimeoutSource,
53
+ createAbortErrorFromSignal,
54
+ createEngineDeadlineTimeoutError,
55
+ throwIfAborted
56
+ } from "./cancellation.js";
57
+ import { assertDepthWithinLimit, parseAgentDecision } from "./decisions.js";
41
58
  import { generateModelTurn } from "./model.js";
42
59
  import { evaluateTerminationStop, warnOnProtocolTerminationMisconfiguration } from "./termination.js";
43
60
  import { createRuntimeToolExecutor, executeModelResponseToolRequests, runtimeToolAvailability } from "./tools.js";
44
61
  import { createWrapUpHintController } from "./wrap-up.js";
45
62
 
63
/**
 * Signature of the engine's `runProtocol` switch as seen by the coordinator.
 * `engine.ts` hands this callback in so the coordinator can start child runs
 * without importing the engine (which would create a circular import).
 */
export type RunProtocolFn = (input: {
  /**
   * Child run id planned ahead of dispatch and already emitted on sub-run
   * lifecycle events. The engine callback uses it to find the matching
   * sub-run span.
   */
  readonly runId: string;
  /** Intent (task text) for the child run. */
  readonly intent: string;
  /** Protocol the child run executes. */
  readonly protocol: ProtocolSelection;
  readonly tier: Tier;
  readonly model: ConfiguredModelProvider;
  readonly agents: readonly AgentSpec[];
  readonly tools: readonly RuntimeTool<JsonObject, JsonValue>[];
  readonly temperature: number;
  readonly budget?: DogpileOptions["budget"];
  readonly seed?: string | number;
  /** Abort signal for the child run. */
  readonly signal?: AbortSignal;
  readonly terminate?: TerminationCondition;
  readonly wrapUpHint?: DogpileOptions["wrapUpHint"];
  /** Sink for events produced by the child run. */
  readonly emit?: (event: RunEvent) => void;
  readonly streamEvents?: boolean;
  /** Recursion depth assigned to the child run. */
  readonly currentDepth?: number;
  /** Max recursion depth resolved at run start. */
  readonly effectiveMaxDepth?: number;
  readonly effectiveMaxConcurrentChildren?: number;
  readonly onChildFailure?: DogpileOptions["onChildFailure"];
  /**
   * Deadline of the ROOT run, in epoch milliseconds. A child's default timeout
   * window is `parentDeadlineMs - now()`, so a depth-N child is bounded by the
   * root deadline rather than by a value freshly computed by its immediate
   * parent (BUDGET-02 / D-12).
   */
  readonly parentDeadlineMs?: number;
  /**
   * Engine-level fallback timeout for sub-runs (BUDGET-02 / D-14). Only used
   * when neither the parent nor the decision provides a `budget.timeoutMs`.
   */
  readonly defaultSubRunTimeoutMs?: number;
  /** Hook through which the callee can register its abort-drain callback. */
  readonly registerAbortDrain?: (drain: AbortDrainFn) => void;
  /** Shared map of `DogpileError` instances keyed by child run id. */
  readonly failureInstancesByChildRunId?: Map<string, DogpileError>;
}) => Promise<RunResult>;
105
+
106
/**
 * Callback a coordinator run hands to `registerAbortDrain`. It receives an
 * optional abort reason of unknown shape and returns nothing.
 */
export type AbortDrainFn = (reason?: unknown) => void;
107
+
46
108
  interface CoordinatorRunOptions {
47
109
  readonly intent: string;
48
110
  readonly protocol: CoordinatorProtocolConfig;
@@ -57,6 +119,121 @@ interface CoordinatorRunOptions {
57
119
  readonly terminate?: TerminationCondition;
58
120
  readonly wrapUpHint?: DogpileOptions["wrapUpHint"];
59
121
  readonly emit?: (event: RunEvent) => void;
122
+ readonly streamEvents?: boolean;
123
+ /**
124
+ * Recursion depth of this coordinator run. Top-level callers pass 0; child
125
+ * sub-runs receive parent depth + 1 from the dispatch loop.
126
+ */
127
+ readonly currentDepth?: number;
128
+ /**
129
+ * Effective max recursion depth resolved at run start. Plan 04 enforces;
130
+ * Plan 03 only plumbs the value.
131
+ */
132
+ readonly effectiveMaxDepth?: number;
133
+ readonly effectiveMaxConcurrentChildren?: number;
134
+ readonly onChildFailure?: DogpileOptions["onChildFailure"];
135
+ /**
136
+ * Engine `runProtocol` callback used by the delegate dispatch loop to
137
+ * recursively run a child protocol. Optional so unit tests that exercise
138
+ * the coordinator without the engine wrapper still typecheck — when omitted,
139
+ * delegate dispatch falls back to throwing `invalid-configuration`.
140
+ */
141
+ readonly runProtocol?: RunProtocolFn;
142
+ /**
143
+ * Root-run deadline (epoch ms) threaded through every recursive coordinator
144
+ * dispatch (BUDGET-02 / D-12). When set, sub-run dispatches compute their
145
+ * `remainingMs = parentDeadlineMs - Date.now()` against this deadline rather
146
+ * than the parent's full `budget.timeoutMs` window.
147
+ */
148
+ readonly parentDeadlineMs?: number;
149
+ /**
150
+ * Engine-level fallback sub-run timeout (BUDGET-02 / D-14). Applied only when
151
+ * neither the parent nor the decision specifies a `budget.timeoutMs`.
152
+ */
153
+ readonly defaultSubRunTimeoutMs?: number;
154
+ readonly registerAbortDrain?: (drain: AbortDrainFn) => void;
155
+ readonly failureInstancesByChildRunId?: Map<string, DogpileError>;
156
+ }
157
+
158
/**
 * Loop guard for delegate dispatch during the coordinator plan turn. The
 * dispatch loop keeps a running count of delegated children; when a new wave
 * would push that count above this value the coordinator throws
 * `invalid-configuration` (T-03-01). Hard-coded on purpose — not a public
 * option.
 */
const MAX_DISPATCH_PER_TURN = 8;

/**
 * Concurrency limit for child runs used when the run options do not carry an
 * `effectiveMaxConcurrentChildren` value.
 */
const DEFAULT_MAX_CONCURRENT_CHILDREN = 4;
165
+
166
/**
 * Summary of one failed child run inside a dispatch wave: which child failed,
 * what it was asked to do, the error it reported, and what it had spent.
 */
type DispatchWaveFailure = {
  /** Run id of the child that failed. */
  readonly childRunId: string;
  /** Intent the child was dispatched with. */
  readonly intent: string;
  /** Error description: code, message, and an optional reason detail. */
  readonly error: {
    readonly code: string;
    readonly message: string;
    readonly detail?: { readonly reason?: string };
  };
  /** Cost recorded for the child before it failed, in USD. */
  readonly partialCost: { readonly usd: number };
};
176
+
177
/**
 * Minimal counting semaphore used to bound how many delegated child runs are
 * in flight at the same time.
 */
interface Semaphore {
  /** Resolves once a slot is held; resolves immediately when one is free. */
  acquire(): Promise<void>;
  /** Gives a slot back and hands it to the oldest waiter, if there is one. */
  release(): void;
  /** Number of slots currently held. */
  readonly inFlight: number;
  /** Number of `acquire()` calls still waiting for a slot. */
  readonly queued: number;
}

/**
 * Create a FIFO counting semaphore.
 *
 * @param maxConcurrent Maximum number of concurrently held slots. Values that
 *   are not at least 1 (0, negatives, `NaN`) are treated as 1: with such a
 *   limit no `acquire()` could ever succeed, so every caller would wait
 *   forever.
 * @returns A semaphore whose `acquire()` promises resolve in request order.
 */
function createSemaphore(maxConcurrent: number): Semaphore {
  // `NaN >= 1` is false, so this also normalizes NaN. Values >= 1 (including
  // Infinity) pass through untouched.
  const limit = maxConcurrent >= 1 ? maxConcurrent : 1;
  let inFlight = 0;
  const waiters: Array<() => void> = [];
  return {
    acquire(): Promise<void> {
      if (inFlight < limit) {
        inFlight += 1;
        return Promise.resolve();
      }
      return new Promise<void>((resolve) => {
        // The slot is counted when the waiter is woken, not when it queues.
        waiters.push(() => {
          inFlight += 1;
          resolve();
        });
      });
    },
    release(): void {
      // Guard against an unbalanced release pushing the counter negative,
      // which would silently raise the effective concurrency limit.
      if (inFlight > 0) {
        inFlight -= 1;
      }
      const next = waiters.shift();
      if (next !== undefined) {
        next();
      }
    },
    get inFlight() {
      return inFlight;
    },
    get queued() {
      return waiters.length;
    }
  };
}
215
+
216
/**
 * Return the first provider in the coordinator's active provider set whose
 * `metadata.locality` is `"local"`, or `undefined` when there is none.
 *
 * Providers are checked in this order: `options.model`, then each entry of
 * `options.agents` in declaration order. `AgentSpec` declares no `model`
 * field today (Phase 3 D-11 forward-compat scaffolding), so the agent pass
 * reads it through a cast with optional chaining and effectively does nothing
 * until a later phase adds `AgentSpec.model`.
 */
function findFirstLocalProvider(options: CoordinatorRunOptions): ConfiguredModelProvider | undefined {
  const rootModel = options.model;
  if (rootModel.metadata?.locality === "local") {
    return rootModel;
  }

  // Forward-compat: AgentSpec.model not yet declared (Phase 3 D-11). The walk
  // is a no-op today; it is ready for caller-defined trees in a future milestone.
  for (const candidate of options.agents) {
    const { model } = candidate as { readonly model?: ConfiguredModelProvider };
    if (model?.metadata?.locality !== "local") {
      continue;
    }
    return model;
  }

  return undefined;
}
61
238
 
62
239
  export async function runCoordinator(options: CoordinatorRunOptions): Promise<RunResult> {
@@ -65,13 +242,16 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
65
242
  const transcript: TranscriptEntry[] = [];
66
243
  const protocolDecisions: ReplayTraceProtocolDecision[] = [];
67
244
  const providerCalls: ReplayTraceProviderCall[] = [];
245
+ const dispatchedChildren = new Map<string, DispatchedChild>();
68
246
  let totalCost = emptyCost();
247
+ let concurrencyClampEmitted = false; // D-12: emit once per run, never per-engine.
69
248
  const maxTurns = options.protocol.maxTurns ?? options.agents.length;
70
249
  const activeAgents = options.agents.slice(0, maxTurns);
71
250
  const coordinator = activeAgents[0];
72
251
  const startedAtMs = nowMs();
73
252
  let stopped = false;
74
253
  let termination: TerminationStopRecord | undefined;
254
+ let triggeringFailureForAbortMode: DispatchWaveFailure | undefined;
75
255
  const wrapUpHint = createWrapUpHintController({
76
256
  protocol: options.protocol,
77
257
  tier: options.tier,
@@ -96,6 +276,63 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
96
276
  );
97
277
  };
98
278
 
279
+ const drainOnParentAbort = (reasonSource?: unknown): void => {
280
+ const reason = classifyAbortReason(reasonSource);
281
+ for (const child of dispatchedChildren.values()) {
282
+ if (child.closed) {
283
+ continue;
284
+ }
285
+ const partialCost = child.started
286
+ ? lastCostBearingEventCost(child.childEvents) ?? emptyCost()
287
+ : emptyCost();
288
+ const partialTrace = buildPartialTrace({
289
+ childRunId: child.childRunId,
290
+ events: [...child.childEvents],
291
+ startedAtMs: child.startedAtMs,
292
+ protocol: child.decision.protocol,
293
+ tier: options.tier,
294
+ modelProviderId: options.model.id,
295
+ agents: options.agents,
296
+ intent: child.decision.intent,
297
+ temperature: options.temperature,
298
+ ...(child.childTimeoutMs !== undefined ? { childTimeoutMs: child.childTimeoutMs } : {}),
299
+ ...(options.seed !== undefined ? { seed: options.seed } : {})
300
+ });
301
+ const failedEvent: SubRunFailedEvent = {
302
+ type: "sub-run-failed",
303
+ runId,
304
+ at: new Date().toISOString(),
305
+ childRunId: child.childRunId,
306
+ parentRunId: runId,
307
+ parentDecisionId: child.parentDecisionId,
308
+ parentDecisionArrayIndex: child.parentDecisionArrayIndex,
309
+ error: child.started
310
+ ? {
311
+ code: "aborted",
312
+ message: "Parent run aborted.",
313
+ detail: {
314
+ reason
315
+ }
316
+ }
317
+ : {
318
+ code: "aborted",
319
+ message: "Sibling delegate failed; queued delegate never started.",
320
+ detail: {
321
+ reason: "sibling-failed"
322
+ }
323
+ },
324
+ partialTrace,
325
+ partialCost
326
+ };
327
+ child.closed = true;
328
+ totalCost = addCost(totalCost, partialCost);
329
+ emit(failedEvent);
330
+ recordProtocolDecision(failedEvent);
331
+ }
332
+ };
333
+
334
+ options.registerAbortDrain?.(drainOnParentAbort);
335
+
99
336
  const toolExecutor = createRuntimeToolExecutor({
100
337
  runId,
101
338
  protocol: "coordinator",
@@ -126,24 +363,280 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
126
363
 
127
364
  if (coordinator) {
128
365
  if (!stopIfNeeded()) {
129
- totalCost = await runCoordinatorTurn({
130
- agent: coordinator,
131
- coordinator,
132
- input: buildCoordinatorPlanInput(options.intent, coordinator),
133
- phase: "plan",
134
- options,
135
- runId,
136
- transcript,
137
- totalCost,
138
- providerCalls,
139
- toolExecutor,
140
- toolAvailability,
141
- events,
142
- startedAtMs,
143
- wrapUpHint,
144
- emit,
145
- recordProtocolDecision
146
- });
366
+ // Delegate dispatch loop (D-11/D-16/D-17/D-18). Phase 1 limits delegation
367
+ // to the coordinator's plan turn; workers cannot delegate. The loop
368
+ // re-issues the coordinator plan turn after each successful sub-run with
369
+ // the projected D-17 result tagged into the next prompt and a synthetic
370
+ // D-18 transcript entry already appended. `partialTrace` for failed
371
+ // sub-runs is captured via a tee'd emit buffer locally — `runProtocol`'s
372
+ // error contract is unchanged.
373
+ let dispatchInput = buildCoordinatorPlanInput(options.intent, coordinator);
374
+ let dispatchCount = 0;
375
+ while (true) {
376
+ const turnOutcome = await runCoordinatorTurn({
377
+ agent: coordinator,
378
+ coordinator,
379
+ input: dispatchInput,
380
+ phase: "plan",
381
+ options,
382
+ runId,
383
+ transcript,
384
+ totalCost,
385
+ providerCalls,
386
+ toolExecutor,
387
+ toolAvailability,
388
+ events,
389
+ startedAtMs,
390
+ wrapUpHint,
391
+ emit,
392
+ recordProtocolDecision
393
+ });
394
+ totalCost = turnOutcome.totalCost;
395
+
396
+ if (turnOutcome.decision === undefined) {
397
+ break;
398
+ }
399
+
400
+ const delegates = Array.isArray(turnOutcome.decision)
401
+ ? turnOutcome.decision
402
+ : turnOutcome.decision.type === "delegate"
403
+ ? [turnOutcome.decision]
404
+ : [];
405
+ if (delegates.length === 0) {
406
+ break;
407
+ }
408
+
409
+ if (dispatchCount + delegates.length > MAX_DISPATCH_PER_TURN) {
410
+ throw new DogpileError({
411
+ code: "invalid-configuration",
412
+ message: `Coordinator plan turn delegated ${delegates.length} more children after ${dispatchCount}; max is ${MAX_DISPATCH_PER_TURN}.`,
413
+ retryable: false,
414
+ detail: {
415
+ kind: "delegate-validation",
416
+ path: "decision",
417
+ reason: "loop-guard-exceeded",
418
+ maxDispatchPerTurn: MAX_DISPATCH_PER_TURN
419
+ }
420
+ });
421
+ }
422
+
423
+ const parentDecisionId = String(events.length - 1);
424
+ const parentDepth = options.currentDepth ?? 0;
425
+ const decisionMax = delegates.reduce(
426
+ (max, delegate) => Math.min(max, delegate.maxConcurrentChildren ?? Number.POSITIVE_INFINITY),
427
+ Number.POSITIVE_INFINITY
428
+ );
429
+ let effectiveForTurn = Math.min(
430
+ options.effectiveMaxConcurrentChildren ?? DEFAULT_MAX_CONCURRENT_CHILDREN,
431
+ decisionMax
432
+ );
433
+ const requestedMax = effectiveForTurn;
434
+ const localProvider = findFirstLocalProvider(options);
435
+ if (localProvider !== undefined) {
436
+ effectiveForTurn = 1;
437
+ if (!concurrencyClampEmitted) {
438
+ const clampEvent: SubRunConcurrencyClampedEvent = {
439
+ type: "sub-run-concurrency-clamped",
440
+ runId,
441
+ at: new Date().toISOString(),
442
+ requestedMax,
443
+ effectiveMax: 1,
444
+ reason: "local-provider-detected",
445
+ providerId: localProvider.id
446
+ };
447
+ emit(clampEvent);
448
+ recordProtocolDecision(clampEvent);
449
+ concurrencyClampEmitted = true;
450
+ }
451
+ }
452
+ const semaphore = createSemaphore(effectiveForTurn);
453
+ const childRunIds = delegates.map(() => createRunId());
454
+ const dispatchedForTurn = delegates.map((delegate, index): DispatchedChild => {
455
+ const childRunId = childRunIds[index];
456
+ if (childRunId === undefined) {
457
+ throw new Error("missing child run id");
458
+ }
459
+ const dispatchedChild: DispatchedChild = {
460
+ childRunId,
461
+ decision: delegate,
462
+ parentDecisionId,
463
+ parentDecisionArrayIndex: index,
464
+ parentDepth,
465
+ controller: new AbortController(),
466
+ removeParentListener: undefined,
467
+ childEvents: [],
468
+ started: false,
469
+ closed: false,
470
+ startedAtMs: Date.now(),
471
+ childTimeoutMs: undefined,
472
+ failure: undefined
473
+ };
474
+ dispatchedChildren.set(childRunId, dispatchedChild);
475
+ return dispatchedChild;
476
+ });
477
+ const dispatchResults: Array<{ readonly index: number; readonly result: DispatchDelegateResult }> = [];
478
+ let firstFailureIndex: number | undefined;
479
+
480
+ const tasks = delegates.map(async (delegate, index) => {
481
+ const childRunId = childRunIds[index];
482
+ if (childRunId === undefined) {
483
+ throw new Error("missing child run id");
484
+ }
485
+ if (semaphore.inFlight >= effectiveForTurn) {
486
+ const queuedEvent: SubRunQueuedEvent = {
487
+ type: "sub-run-queued",
488
+ runId,
489
+ at: new Date().toISOString(),
490
+ childRunId,
491
+ parentRunId: runId,
492
+ parentDecisionId,
493
+ parentDecisionArrayIndex: index,
494
+ protocol: delegate.protocol,
495
+ intent: delegate.intent,
496
+ depth: parentDepth + 1,
497
+ queuePosition: semaphore.queued
498
+ };
499
+ emit(queuedEvent);
500
+ recordProtocolDecision(queuedEvent);
501
+ }
502
+
503
+ await semaphore.acquire();
504
+ try {
505
+ const dispatchedChild = dispatchedForTurn[index];
506
+ if (!dispatchedChild) {
507
+ throw new Error("missing dispatched child");
508
+ }
509
+ if (firstFailureIndex !== undefined) {
510
+ if (dispatchedChild.closed) {
511
+ dispatchResults.push({
512
+ index,
513
+ result: {
514
+ nextInput: "",
515
+ taggedText: `[sub-run ${childRunId}]: skipped because the parent run aborted`,
516
+ completedAtMs: Date.now()
517
+ }
518
+ });
519
+ return;
520
+ }
521
+ const partialCost = emptyCost();
522
+ const partialTrace = buildPartialTrace({
523
+ childRunId,
524
+ events: [],
525
+ startedAtMs: Date.now(),
526
+ protocol: delegate.protocol,
527
+ tier: options.tier,
528
+ modelProviderId: options.model.id,
529
+ agents: options.agents,
530
+ intent: delegate.intent,
531
+ temperature: options.temperature,
532
+ ...(options.seed !== undefined ? { seed: options.seed } : {})
533
+ });
534
+ const failedEvent: SubRunFailedEvent = {
535
+ type: "sub-run-failed",
536
+ runId,
537
+ at: new Date().toISOString(),
538
+ childRunId,
539
+ parentRunId: runId,
540
+ parentDecisionId,
541
+ parentDecisionArrayIndex: index,
542
+ error: {
543
+ code: "aborted",
544
+ message: "Sibling delegate failed; queued delegate never started.",
545
+ detail: {
546
+ reason: "sibling-failed"
547
+ }
548
+ },
549
+ partialTrace,
550
+ partialCost
551
+ };
552
+ emit(failedEvent);
553
+ recordProtocolDecision(failedEvent);
554
+ dispatchedChild.closed = true;
555
+ dispatchResults.push({
556
+ index,
557
+ result: {
558
+ nextInput: "",
559
+ taggedText: `[sub-run ${childRunId}]: skipped because a sibling delegate failed`,
560
+ completedAtMs: Date.now()
561
+ }
562
+ });
563
+ return;
564
+ }
565
+ const result = await dispatchDelegate({
566
+ decision: delegate,
567
+ childRunId,
568
+ parentDecisionId,
569
+ parentDecisionArrayIndex: index,
570
+ parentDepth,
571
+ parentRunId: runId,
572
+ options,
573
+ transcript,
574
+ emit,
575
+ recordProtocolDecision,
576
+ recordSubRunCost: (cost: CostSummary): void => {
577
+ totalCost = addCost(totalCost, cost);
578
+ },
579
+ dispatchedChild
580
+ });
581
+ dispatchResults.push({ index, result });
582
+ } catch (error) {
583
+ firstFailureIndex ??= index;
584
+ const dispatchedChild = dispatchedForTurn[index];
585
+ const failure = dispatchedChild?.failure;
586
+ if (
587
+ delegates.length === 1 &&
588
+ (options.onChildFailure === "abort" || failure === undefined || isDelegateValidationError(error))
589
+ ) {
590
+ throw error;
591
+ }
592
+ const failureMessage = error instanceof Error ? error.message : String(error);
593
+ let taggedText = `[sub-run ${childRunId} failed]: ${failureMessage}`;
594
+ if (failure) {
595
+ const error = failure.error;
596
+ taggedText = `[sub-run ${childRunId} failed | code=${error.code} | spent=$${failure.partialCost.usd.toFixed(3)}]: ${error.message}`;
597
+ }
598
+ dispatchResults.push({
599
+ index,
600
+ result: {
601
+ nextInput: "",
602
+ taggedText,
603
+ completedAtMs: Date.now()
604
+ }
605
+ });
606
+ } finally {
607
+ semaphore.release();
608
+ }
609
+ });
610
+ const settled = await Promise.allSettled(tasks);
611
+ const firstRejected = settled.find((result) => result.status === "rejected");
612
+ if (
613
+ firstRejected?.status === "rejected" &&
614
+ delegates.length === 1 &&
615
+ (options.onChildFailure === "abort" || dispatchResults.length === 0)
616
+ ) {
617
+ throw firstRejected.reason;
618
+ }
619
+
620
+ dispatchResults.sort((a, b) => a.result.completedAtMs - b.result.completedAtMs);
621
+ const taggedResults = dispatchResults.map((entry) => entry.result.taggedText).join("\n\n");
622
+ const currentWaveFailures = dispatchedForTurn
623
+ .map((child) => child.failure)
624
+ .filter((failure): failure is DispatchWaveFailure => failure !== undefined);
625
+ if (options.onChildFailure === "abort" && currentWaveFailures.length > 0) {
626
+ triggeringFailureForAbortMode ??= currentWaveFailures[0];
627
+ break;
628
+ }
629
+ const failuresSection = buildFailuresSection(currentWaveFailures);
630
+ const coordinatorAgent = options.agents[0] ?? { id: "coordinator", role: "coordinator" };
631
+ const baseInput = buildCoordinatorPlanInput(options.intent, coordinatorAgent);
632
+ dispatchInput = [
633
+ baseInput,
634
+ taggedResults,
635
+ failuresSection,
636
+ "Using the sub-run results above, decide the next step (participate or delegate)."
637
+ ].filter((section): section is string => Boolean(section)).join("\n\n");
638
+ dispatchCount += delegates.length;
639
+ }
147
640
  stopIfNeeded();
148
641
  }
149
642
 
@@ -209,7 +702,7 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
209
702
  }
210
703
 
211
704
  if (!stopIfNeeded()) {
212
- totalCost = await runCoordinatorTurn({
705
+ const synthesisOutcome = await runCoordinatorTurn({
213
706
  agent: coordinator,
214
707
  coordinator,
215
708
  input: buildFinalSynthesisInput(options.intent, transcript, coordinator),
@@ -227,6 +720,20 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
227
720
  emit,
228
721
  recordProtocolDecision
229
722
  });
723
+ totalCost = synthesisOutcome.totalCost;
724
+ // Phase 1: final-synthesis turn cannot delegate.
725
+ if (Array.isArray(synthesisOutcome.decision) || synthesisOutcome.decision?.type === "delegate") {
726
+ throw new DogpileError({
727
+ code: "invalid-configuration",
728
+ message: "Coordinator final-synthesis turn cannot emit a delegate decision in Phase 1",
729
+ retryable: false,
730
+ detail: {
731
+ kind: "delegate-validation",
732
+ path: "decision",
733
+ phase: "final-synthesis"
734
+ }
735
+ });
736
+ }
230
737
  stopIfNeeded();
231
738
  }
232
739
  }
@@ -247,45 +754,47 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
247
754
  transcriptEntryCount: transcript.length
248
755
  });
249
756
  const finalEvent = events.at(-1);
757
+ const trace: Trace = {
758
+ schemaVersion: "1.0",
759
+ runId,
760
+ protocol: "coordinator",
761
+ tier: options.tier,
762
+ modelProviderId: options.model.id,
763
+ agentsUsed: activeAgents,
764
+ inputs: createReplayTraceRunInputs({
765
+ intent: options.intent,
766
+ protocol: options.protocol,
767
+ tier: options.tier,
768
+ modelProviderId: options.model.id,
769
+ agents: activeAgents,
770
+ temperature: options.temperature
771
+ }),
772
+ budget: createReplayTraceBudget({
773
+ tier: options.tier,
774
+ ...(options.budget ? { caps: options.budget } : {}),
775
+ ...(options.terminate ? { termination: options.terminate } : {})
776
+ }),
777
+ budgetStateChanges: createReplayTraceBudgetStateChanges(events),
778
+ seed: createReplayTraceSeed(options.seed),
779
+ protocolDecisions,
780
+ providerCalls,
781
+ finalOutput: createReplayTraceFinalOutput(output, finalEvent ?? {
782
+ type: "final",
783
+ runId,
784
+ at: "",
785
+ output,
786
+ cost: totalCost,
787
+ transcript: createTranscriptLink(transcript)
788
+ }),
789
+ ...(triggeringFailureForAbortMode !== undefined ? { triggeringFailureForAbortMode } : {}),
790
+ events,
791
+ transcript
792
+ };
250
793
 
251
794
  return {
252
795
  output,
253
796
  eventLog: createRunEventLog(runId, "coordinator", events),
254
- trace: {
255
- schemaVersion: "1.0",
256
- runId,
257
- protocol: "coordinator",
258
- tier: options.tier,
259
- modelProviderId: options.model.id,
260
- agentsUsed: activeAgents,
261
- inputs: createReplayTraceRunInputs({
262
- intent: options.intent,
263
- protocol: options.protocol,
264
- tier: options.tier,
265
- modelProviderId: options.model.id,
266
- agents: activeAgents,
267
- temperature: options.temperature
268
- }),
269
- budget: createReplayTraceBudget({
270
- tier: options.tier,
271
- ...(options.budget ? { caps: options.budget } : {}),
272
- ...(options.terminate ? { termination: options.terminate } : {})
273
- }),
274
- budgetStateChanges: createReplayTraceBudgetStateChanges(events),
275
- seed: createReplayTraceSeed(options.seed),
276
- protocolDecisions,
277
- providerCalls,
278
- finalOutput: createReplayTraceFinalOutput(output, finalEvent ?? {
279
- type: "final",
280
- runId,
281
- at: "",
282
- output,
283
- cost: totalCost,
284
- transcript: createTranscriptLink(transcript)
285
- }),
286
- events,
287
- transcript
288
- },
797
+ trace,
289
798
  transcript,
290
799
  usage: createRunUsage(totalCost),
291
800
  metadata: createRunMetadata({
@@ -303,7 +812,8 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
303
812
  cost: totalCost,
304
813
  events
305
814
  }),
306
- cost: totalCost
815
+ cost: totalCost,
816
+ health: computeHealth(trace, DEFAULT_HEALTH_THRESHOLDS)
307
817
  };
308
818
 
309
819
  function stopIfNeeded(): boolean {
@@ -358,6 +868,11 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
358
868
  }
359
869
  }
360
870
 
871
/**
 * Report whether `error` is a `DogpileError` with code
 * `"invalid-configuration"` whose `detail.kind` is `"delegate-validation"`.
 */
function isDelegateValidationError(error: unknown): boolean {
  if (!DogpileError.isInstance(error)) {
    return false;
  }
  if (error.code !== "invalid-configuration") {
    return false;
  }
  return error.detail?.["kind"] === "delegate-validation";
}
875
+
361
876
  interface CoordinatorTurnOptions {
362
877
  readonly agent: AgentSpec;
363
878
  readonly coordinator: AgentSpec;
@@ -380,7 +895,12 @@ interface CoordinatorTurnOptions {
380
895
  ) => void;
381
896
  }
382
897
 
383
- async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CostSummary> {
898
/** Outcome of a single coordinator turn. */
interface CoordinatorTurnResult {
  /** Running cost after this turn's model response has been added. */
  readonly totalCost: CostSummary;
  /** Whatever `parseAgentDecision` produced for this turn's response text. */
  readonly decision: ReturnType<typeof parseAgentDecision>;
}
902
+
903
+ async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CoordinatorTurnResult> {
384
904
  throwIfAborted(turn.options.signal, turn.options.model.id);
385
905
 
386
906
  const request: ModelRequest = {
@@ -430,7 +950,11 @@ async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CostSum
430
950
  turn.providerCalls.push(call);
431
951
  }
432
952
  });
433
- const decision = parseAgentDecision(response.text);
953
+ const decision = parseAgentDecision(response.text, {
954
+ parentProviderId: turn.options.model.id,
955
+ currentDepth: turn.options.currentDepth ?? 0,
956
+ maxDepth: turn.options.effectiveMaxDepth ?? Number.POSITIVE_INFINITY
957
+ });
434
958
  const totalCost = addCost(turn.totalCost, responseCost(response));
435
959
  const toolCalls = await executeModelResponseToolRequests({
436
960
  response,
@@ -471,7 +995,7 @@ async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CostSum
471
995
  transcriptEntryCount: turn.transcript.length
472
996
  });
473
997
 
474
- return totalCost;
998
+ return { totalCost, decision };
475
999
  }
476
1000
 
477
1001
  interface CoordinatorWorkerTurnOptions {
@@ -553,7 +1077,23 @@ async function runCoordinatorWorkerTurn(turn: CoordinatorWorkerTurnOptions): Pro
553
1077
  turn.providerCallSlots[turn.providerCallIndex] = call;
554
1078
  }
555
1079
  });
556
- const decision = parseAgentDecision(response.text);
1080
+ const decision = parseAgentDecision(response.text, {
1081
+ parentProviderId: turn.options.model.id,
1082
+ currentDepth: turn.options.currentDepth ?? 0,
1083
+ maxDepth: turn.options.effectiveMaxDepth ?? Number.POSITIVE_INFINITY
1084
+ });
1085
+ if (Array.isArray(decision) || decision?.type === "delegate") {
1086
+ throw new DogpileError({
1087
+ code: "invalid-configuration",
1088
+ message: "Workers cannot emit delegate decisions in Phase 1",
1089
+ retryable: false,
1090
+ detail: {
1091
+ kind: "delegate-validation",
1092
+ path: "decision",
1093
+ phase: "worker"
1094
+ }
1095
+ });
1096
+ }
557
1097
  const toolCalls = await executeModelResponseToolRequests({
558
1098
  response,
559
1099
  executor: turn.toolExecutor,
@@ -589,6 +1129,39 @@ function buildCoordinatorPlanInput(intent: string, coordinator: AgentSpec): stri
589
1129
  return `Mission: ${intent}\nCoordinator ${coordinator.id}: assign the work, name the plan, and provide the first contribution.`;
590
1130
  }
591
1131
 
1132
/**
 * Renders sub-run failures as a markdown section: a heading, a blank line and
 * a fenced `json` block holding the failures pretty-printed with two-space
 * indentation.
 *
 * @param failures Failures to report.
 * @returns The section text, or `null` when `failures` is empty.
 */
function buildFailuresSection(failures: readonly DispatchWaveFailure[]): string | null {
  if (failures.length > 0) {
    const payload = JSON.stringify(failures, null, 2);
    return `## Sub-run failures since last decision\n\n\`\`\`json\n${payload}\n\`\`\``;
  }
  return null;
}
1144
+
1145
/**
 * Reduces a `sub-run-failed` event to the compact failure record.
 *
 * Only `error.code`, `error.message`, a string `error.detail.reason` (when
 * present) and `partialCost.usd` are carried over.
 *
 * @param intent Intent of the delegate decision that produced the sub-run.
 * @param event The failure event to summarize.
 * @returns The failure record, or `undefined` when the reason is
 *   `"sibling-failed"` or `"parent-aborted"`.
 */
function dispatchWaveFailureFromEvent(
  intent: string,
  event: SubRunFailedEvent
): DispatchWaveFailure | undefined {
  const { error: failure, childRunId, partialCost } = event;
  const rawReason = failure.detail?.["reason"];
  const reason = typeof rawReason === "string" ? rawReason : undefined;

  switch (reason) {
    case "sibling-failed":
    case "parent-aborted":
      return undefined;
    default:
      break;
  }

  return {
    childRunId,
    intent,
    error: {
      code: failure.code,
      message: failure.message,
      ...(reason === undefined ? {} : { detail: { reason } })
    },
    partialCost: { usd: partialCost.usd }
  };
}
1164
+
592
1165
  function buildWorkerInput(
593
1166
  intent: string,
594
1167
  transcript: readonly TranscriptEntry[],
@@ -620,3 +1193,590 @@ function responseCost(response: ModelResponse): CostSummary {
620
1193
  };
621
1194
  }
622
1195
 
1196
/** Everything `dispatchDelegate` needs to run one delegate decision as a sub-run. */
interface DispatchDelegateOptions {
  // --- What to dispatch ---------------------------------------------------

  /** The delegate decision to execute (protocol, intent, optional model/budget). */
  readonly decision: DelegateAgentDecision;
  /** Pre-assigned child run id; when omitted a fresh id is generated. */
  readonly childRunId?: string;
  /** Mutable per-child bookkeeping record updated while the sub-run progresses. */
  readonly dispatchedChild: DispatchedChild;

  // --- Parent context -----------------------------------------------------

  /** Run id of the parent; used as `runId`/`parentRunId` on emitted sub-run events. */
  readonly parentRunId: string;
  /** Id of the parent decision, copied onto emitted sub-run events. */
  readonly parentDecisionId: string;
  /** Index of this decision within the parent decision, copied onto sub-run events. */
  readonly parentDecisionArrayIndex: number;
  /** Depth of the parent run; the child runs at `parentDepth + 1`. */
  readonly parentDepth: number;
  /** The parent coordinator's run options. */
  readonly options: CoordinatorRunOptions;
  /** Parent transcript; a `delegate-result` entry is pushed on success. */
  readonly transcript: TranscriptEntry[];

  // --- Callbacks ----------------------------------------------------------

  /** Emits an event on the parent run. */
  readonly emit: (event: RunEvent) => void;
  /** Records an emitted sub-run event as a protocol decision on the parent. */
  readonly recordProtocolDecision: (
    event: RunEvent,
    decisionOptions?: { readonly transcriptEntryCount?: number }
  ) => void;
  /**
   * BUDGET-03 / D-01 seam: adds the child's cost (`subResult.cost` on
   * success, the partial cost on failure) to the parent's `totalCost`
   * accumulator. Called BEFORE the `sub-run-completed` / `sub-run-failed`
   * event is emitted so the "last cost-bearing event === final.cost"
   * invariant keeps holding.
   */
  readonly recordSubRunCost: (cost: CostSummary) => void;
}
1220
+
1221
/** Outcome of a successfully completed `dispatchDelegate` call. */
interface DispatchDelegateResult {
  /** Tagged result block rendered from the child run (also stored in the transcript). */
  readonly taggedText: string;
  /** Next coordinator prompt: the base plan input followed by `taggedText` and a follow-up instruction. */
  readonly nextInput: string;
  /** `Date.now()` captured when the result object was built. */
  readonly completedAtMs: number;
}
1226
+
1227
/**
 * Per-child bookkeeping record shared between the dispatcher and its caller.
 * The non-`readonly` members are written by `dispatchDelegate` as the sub-run
 * moves through its lifecycle.
 */
interface DispatchedChild {
  // --- Identity (fixed at creation) ----------------------------------------

  readonly childRunId: string;
  readonly decision: DelegateAgentDecision;
  readonly parentDecisionId: string;
  readonly parentDecisionArrayIndex: number;
  readonly parentDepth: number;

  // --- Cancellation --------------------------------------------------------

  /** Controller whose signal is handed to the child run; aborted on parent abort or child deadline. */
  readonly controller: AbortController;
  /** Detaches the parent-signal abort listener; `undefined` when none was attached. */
  removeParentListener: (() => void) | undefined;

  // --- Lifecycle state -----------------------------------------------------

  /** Buffer of every event the child emitted. */
  readonly childEvents: RunEvent[];
  /** Set to `true` by `dispatchDelegate` once `sub-run-started` has been emitted. */
  started: boolean;
  /** Set to `true` after `sub-run-completed` or `sub-run-failed`; child events stop being forwarded. */
  closed: boolean;
  /** `Date.now()` captured by the dispatcher before the start event. */
  startedAtMs: number;
  /** Timeout resolved for the child, if any. */
  childTimeoutMs: number | undefined;
  /** Failure summary recorded when the child failed, if one was produced. */
  failure: DispatchWaveFailure | undefined;

  /** STREAM-03 hook (Phase 4). Reserved; do not use. */
  readonly streamHandle?: never;
}
1244
+
1245
/**
 * Executes one delegate decision as a child run and reports it to the parent.
 *
 * Order of operations:
 *  1. Pre-flight checks that throw before any event is emitted: the depth
 *     gate (D-14), the elapsed-parent-deadline check (BUDGET-02 / D-12) and
 *     the `options.runProtocol` presence check.
 *  2. Optional `sub-run-budget-clamped` event, then `sub-run-started`
 *     (`recursive: true` when the delegated protocol is `coordinator`, D-16).
 *  3. The child runs through `options.runProtocol` with the parent's provider
 *     object (D-11), a per-child abort signal and a tee'd `emit`.
 *  4. Success: child cost is recorded, `sub-run-completed` is emitted, a
 *     synthetic transcript entry is pushed (D-18) and the tagged result text
 *     is appended to the next coordinator prompt (D-17).
 *  5. Failure: a partial trace is assembled from the tee'd child events (the
 *     `runProtocol` error contract is untouched), partial cost is recorded,
 *     `sub-run-failed` is emitted and an error is thrown.
 *
 * @param input Decision, parent context and callbacks.
 * @returns The next coordinator prompt, the tagged result block and the
 *   completion timestamp.
 * @throws A `DogpileError` for pre-flight failures raised here, for child
 *   failures, and when the parent signal is found aborted after the child
 *   completed. If the child record was already closed when the child threw,
 *   a non-`DogpileError` is rethrown unchanged.
 */
async function dispatchDelegate(input: DispatchDelegateOptions): Promise<DispatchDelegateResult> {
  const { decision, options } = input;
  const child = input.dispatchedChild;

  // D-14 dispatcher-time depth gate: same error shape as the parser, guarding
  // against state changes between parse and dispatch. Runs before
  // sub-run-started so a rejected dispatch never appears half-started.
  const maxDepth = options.effectiveMaxDepth;
  if (maxDepth !== undefined) {
    assertDepthWithinLimit(input.parentDepth, maxDepth);
  }

  const childRunId = input.childRunId ?? createRunId();
  const recursive = decision.protocol === "coordinator";
  const decisionTimeoutMs = decision.budget?.timeoutMs;
  const parentDeadlineMs = options.parentDeadlineMs;

  // BUDGET-02 / D-12: children inherit the time left until the parent's
  // deadline rather than a static parent timeout.
  const remainingMs =
    parentDeadlineMs === undefined ? undefined : Math.max(0, parentDeadlineMs - Date.now());

  // `remainingMs` is a number only when a parent deadline exists, so zero
  // means that deadline has already passed.
  if (remainingMs === 0) {
    throw new DogpileError({
      code: "aborted",
      message: "Parent deadline elapsed before sub-run dispatch.",
      retryable: false,
      providerId: options.model.id,
      detail: { reason: "timeout" }
    });
  }

  // Child timeout precedence (D-12 / D-14):
  //   decision.budget.timeoutMs > parent's remaining > defaultSubRunTimeoutMs > none.
  // A decision timeout above the parent's remaining time is clamped (not
  // rejected); `clampedFrom` remembers the requested value for the clamp event.
  let childTimeoutMs: number | undefined;
  let clampedFrom: number | undefined;
  if (remainingMs === undefined) {
    if (decisionTimeoutMs !== undefined) {
      childTimeoutMs = decisionTimeoutMs;
    } else if (options.defaultSubRunTimeoutMs !== undefined) {
      childTimeoutMs = options.defaultSubRunTimeoutMs;
    }
  } else if (decisionTimeoutMs === undefined) {
    childTimeoutMs = remainingMs;
  } else if (decisionTimeoutMs > remainingMs) {
    clampedFrom = decisionTimeoutMs;
    childTimeoutMs = remainingMs;
  } else {
    childTimeoutMs = decisionTimeoutMs;
  }

  if (!options.runProtocol) {
    throw new DogpileError({
      code: "invalid-configuration",
      message:
        "Coordinator delegate dispatch requires the engine `runProtocol` callback. " +
        "Use `Dogpile.run` / `createEngine` rather than calling `runCoordinator` directly when delegate is in play.",
      retryable: false,
      detail: {
        kind: "delegate-validation",
        path: "runProtocol"
      }
    });
  }

  // Tee: every child event is buffered for partial-trace capture; it is also
  // forwarded to the parent stream (with this run prepended to the ancestry
  // chain) while the child is still open and streaming is enabled.
  const childEvents = child.childEvents;
  const parentEmit = input.emit;
  const teedEmit = (event: RunEvent): void => {
    childEvents.push(event);
    if (child.closed) {
      return;
    }
    if (!options.streamEvents || !options.emit) {
      return;
    }
    const inbound = (event as { readonly parentRunIds?: readonly string[] }).parentRunIds;
    options.emit({
      ...event,
      parentRunIds: [input.parentRunId, ...(inbound ?? [])]
    } as RunEvent);
  };

  const childStartedAt = Date.now();
  child.startedAtMs = childStartedAt;

  // BUDGET-02 / D-12: the clamp event precedes sub-run-started so the trace
  // shows the requested timeout was reduced to fit the parent's remaining
  // deadline. Nothing is emitted when no clamp happened.
  if (clampedFrom !== undefined && childTimeoutMs !== undefined) {
    const clampEvent: SubRunBudgetClampedEvent = {
      type: "sub-run-budget-clamped",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      parentDecisionId: input.parentDecisionId,
      requestedTimeoutMs: clampedFrom,
      clampedTimeoutMs: childTimeoutMs,
      reason: "exceeded-parent-remaining"
    };
    input.emit(clampEvent);
    input.recordProtocolDecision(clampEvent);
  }

  const startEvent: RunEvent = {
    type: "sub-run-started",
    runId: input.parentRunId,
    at: new Date().toISOString(),
    childRunId,
    parentRunId: input.parentRunId,
    parentDecisionId: input.parentDecisionId,
    parentDecisionArrayIndex: input.parentDecisionArrayIndex,
    protocol: decision.protocol,
    intent: decision.intent,
    depth: input.parentDepth + 1,
    ...(recursive ? { recursive: true } : {})
  };
  parentEmit(startEvent);
  input.recordProtocolDecision(startEvent);

  // BUDGET-01 / D-07: the child gets its own AbortController. The parent's
  // abort reason is forwarded verbatim so downstream classification
  // (parent-aborted vs timeout) is preserved.
  // Phase 4 STREAM-03 hook: per-child cancel handle attaches here.
  const parentSignal = options.signal;
  let removeParentAbortListener: (() => void) | undefined;
  if (parentSignal?.aborted) {
    child.controller.abort(parentSignal.reason);
  } else if (parentSignal !== undefined) {
    const onParentAbort = (): void => {
      child.controller.abort(parentSignal.reason);
    };
    parentSignal.addEventListener("abort", onParentAbort, { once: true });
    removeParentAbortListener = (): void => {
      parentSignal.removeEventListener("abort", onParentAbort);
    };
  }
  child.removeParentListener = removeParentAbortListener;
  child.started = true;
  child.childTimeoutMs = childTimeoutMs;

  // A local deadline timer is armed only when the child has a timeout and no
  // parent deadline is in effect.
  const childDeadlineReason =
    childTimeoutMs === undefined || parentDeadlineMs !== undefined
      ? undefined
      : createEngineDeadlineTimeoutError(options.model.id, childTimeoutMs);
  const childDeadlineTimer =
    childDeadlineReason === undefined
      ? undefined
      : setTimeout(() => {
          child.controller.abort(childDeadlineReason);
        }, childTimeoutMs);

  // Shared teardown for both the success and failure paths.
  const releaseChildResources = (): void => {
    if (childDeadlineTimer !== undefined) {
      clearTimeout(childDeadlineTimer);
    }
    removeParentAbortListener?.();
  };

  // JSON form of the delegate decision, used in the failure payload and in
  // the synthetic transcript entry.
  const describeDecision = (): JsonObject => ({
    type: "delegate",
    protocol: decision.protocol,
    intent: decision.intent,
    ...(decision.model !== undefined ? { model: decision.model } : {}),
    ...(decision.budget !== undefined ? { budget: decision.budget as unknown as JsonValue } : {})
  });

  const childOptions = {
    runId: childRunId,
    intent: decision.intent,
    protocol: decision.protocol,
    tier: options.tier,
    model: options.model, // D-11: same provider instance verbatim
    agents: options.agents,
    tools: options.tools,
    temperature: options.temperature,
    ...(childTimeoutMs !== undefined ? { budget: { timeoutMs: childTimeoutMs } } : {}),
    signal: child.controller.signal,
    emit: teedEmit,
    ...(options.streamEvents !== undefined ? { streamEvents: options.streamEvents } : {}),
    currentDepth: input.parentDepth + 1,
    ...(options.effectiveMaxDepth !== undefined ? { effectiveMaxDepth: options.effectiveMaxDepth } : {}),
    ...(options.effectiveMaxConcurrentChildren !== undefined
      ? { effectiveMaxConcurrentChildren: options.effectiveMaxConcurrentChildren }
      : {}),
    ...(options.onChildFailure !== undefined ? { onChildFailure: options.onChildFailure } : {}),
    // BUDGET-02 / D-12: pass the ROOT deadline down so deeper descendants see
    // the same `parentDeadlineMs` instead of a fresh per-level snapshot.
    ...(parentDeadlineMs !== undefined ? { parentDeadlineMs } : {}),
    ...(options.defaultSubRunTimeoutMs !== undefined
      ? { defaultSubRunTimeoutMs: options.defaultSubRunTimeoutMs }
      : {})
  };

  let subResult: RunResult;
  try {
    subResult = await options.runProtocol(childOptions);
  } catch (error) {
    releaseChildResources();

    // The child record was closed elsewhere: skip failure reporting and only
    // surface the (possibly enriched) error.
    if (child.closed) {
      const enrichedAfterClose = enrichAbortErrorWithParentReason(error, parentSignal);
      throw DogpileError.isInstance(enrichedAfterClose) ? enrichedAfterClose : error;
    }

    const failedDecision = describeDecision();

    const partialTrace: Trace = buildPartialTrace({
      childRunId,
      events: childEvents,
      startedAtMs: childStartedAt,
      protocol: decision.protocol,
      tier: options.tier,
      modelProviderId: options.model.id,
      agents: options.agents,
      intent: decision.intent,
      temperature: options.temperature,
      ...(childTimeoutMs !== undefined ? { childTimeoutMs } : {}),
      ...(options.seed !== undefined ? { seed: options.seed } : {})
    });

    // BUDGET-01 / D-08: if the child aborted because the parent signal
    // aborted, pin detail.reason on the surfaced error. Upstream wrapping may
    // attach its own detail.status; detail.reason lets consumers tell
    // parent-aborted from timeout regardless of which path produced the error.
    const enrichedError = enrichProviderTimeoutSource(
      enrichAbortErrorWithParentReason(error, parentSignal),
      {
        ...(decisionTimeoutMs !== undefined ? { decisionTimeoutMs } : {}),
        ...(options.defaultSubRunTimeoutMs !== undefined
          ? { engineDefaultTimeoutMs: options.defaultSubRunTimeoutMs }
          : {})
      }
    );
    const surfacedIsDogpileError = DogpileError.isInstance(enrichedError);
    if (surfacedIsDogpileError) {
      options.failureInstancesByChildRunId?.set(childRunId, enrichedError);
    }
    const errorPayload = errorPayloadFromUnknown(enrichedError, failedDecision);

    // BUDGET-03 / D-02: fold the spend observed before the throw into the
    // parent's totalCost BEFORE sub-run-failed is emitted.
    const partialCost = lastCostBearingEventCost(childEvents) ?? emptyCost();
    input.recordSubRunCost(partialCost);

    const failEvent: SubRunFailedEvent = {
      type: "sub-run-failed",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      parentDecisionId: input.parentDecisionId,
      parentDecisionArrayIndex: input.parentDecisionArrayIndex,
      error: errorPayload,
      partialTrace,
      partialCost
    };
    parentEmit(failEvent);
    input.recordProtocolDecision(failEvent);
    child.closed = true;
    child.failure = dispatchWaveFailureFromEvent(decision.intent, failEvent);

    // Always surface a typed error to the parent run.
    if (surfacedIsDogpileError) {
      throw enrichedError;
    }
    throw new DogpileError({
      code: "invalid-configuration",
      message: error instanceof Error ? error.message : String(error),
      retryable: false,
      detail: {
        kind: "delegate-validation",
        path: "decision",
        reason: "child-run-failed"
      }
    });
  }

  releaseChildResources();

  // BUDGET-03 / D-01: fold the child's full cost into the parent's totalCost
  // BEFORE sub-run-completed is emitted, keeping the
  // "last cost-bearing event === final.cost" invariant intact.
  input.recordSubRunCost(subResult.cost);

  const completedEvent: RunEvent = {
    type: "sub-run-completed",
    runId: input.parentRunId,
    at: new Date().toISOString(),
    childRunId,
    parentRunId: input.parentRunId,
    parentDecisionId: input.parentDecisionId,
    parentDecisionArrayIndex: input.parentDecisionArrayIndex,
    subResult
  };
  parentEmit(completedEvent);
  input.recordProtocolDecision(completedEvent);
  child.closed = true;

  // BUDGET-01 / D-10: the parent signal aborted after the child completed but
  // before the next coordinator turn. Emit a marker so streaming subscribers
  // can see the parent gave up after this sub-run, then throw the parent's
  // abort. Per the original note, non-streaming `run()` rejects with the
  // thrown error and does not preserve the marker.
  if (parentSignal?.aborted) {
    const abortMarker: SubRunParentAbortedEvent = {
      type: "sub-run-parent-aborted",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      reason: "parent-aborted"
    };
    parentEmit(abortMarker);
    input.recordProtocolDecision(abortMarker);
    throw enrichAbortErrorWithParentReason(
      createAbortErrorFromSignal(parentSignal, options.model.id),
      parentSignal
    );
  }

  // D-18: synthetic transcript entry recording the delegate decision and its
  // tagged result.
  const decisionAsJson = describeDecision();
  const taggedText = renderSubRunResult(childRunId, subResult);
  input.transcript.push({
    agentId: `sub-run:${childRunId}`,
    role: "delegate-result",
    input: JSON.stringify(decisionAsJson),
    output: taggedText
  });

  // D-17: the next coordinator prompt is the base plan input followed by the
  // tagged result block. Falls back to a default coordinator spec when the
  // agent list is empty.
  const coordinatorAgent = options.agents[0];
  const baseInput = buildCoordinatorPlanInput(input.options.intent, coordinatorAgent ?? {
    id: "coordinator",
    role: "coordinator"
  });
  return {
    nextInput: `${baseInput}\n\n${taggedText}\n\nUsing the sub-run result above, decide the next step (participate or delegate).`,
    taggedText,
    completedAtMs: Date.now()
  };
}
1612
+
1613
/**
 * D-17 prompt-injection helper. Renders a child `RunResult` as the canonical
 * tagged-result block injected into the parent coordinator's next prompt.
 *
 * Format:
 *   `[sub-run <childRunId>]: <output>`
 *   `[sub-run <childRunId> stats]: turns=<N> costUsd=<X> durationMs=<Y>`
 *
 * The stats line is a soft contract — field names stable, ordering stable.
 *
 * - `turns` is the child's transcript length.
 * - `costUsd` is `cost.usd`, or `0` when that is null/undefined.
 * - `durationMs` is the gap between the timestamps of the first and last trace
 *   events, never negative. It is `0` when either timestamp is missing, empty,
 *   or cannot be parsed as a date, so the stats line never carries `NaN`.
 *
 * @param childRunId Run id of the child, used in both tags.
 * @param subResult Completed child run result.
 * @returns The two-line tagged block joined with `\n`.
 */
function renderSubRunResult(childRunId: string, subResult: RunResult): string {
  const turns = subResult.transcript.length;
  const costUsd = subResult.cost.usd ?? 0;
  const startedAt = eventTimestamp(subResult.trace.events[0]);
  const endedAt = eventTimestamp(subResult.trace.events.at(-1));
  const elapsedMs = startedAt && endedAt ? Date.parse(endedAt) - Date.parse(startedAt) : 0;
  // Date.parse returns NaN for a malformed timestamp and Math.max(0, NaN) is
  // NaN, so non-finite results are mapped to 0 explicitly.
  const durationMs = Number.isFinite(elapsedMs) ? Math.max(0, elapsedMs) : 0;
  return [
    `[sub-run ${childRunId}]: ${subResult.output}`,
    `[sub-run ${childRunId} stats]: turns=${turns} costUsd=${costUsd} durationMs=${durationMs}`
  ].join("\n");
}

/**
 * Picks the timestamp string off an event.
 *
 * Events that have an `at` property use it. Otherwise a `model-response`
 * event yields `completedAt` and any other event yields `startedAt`.
 *
 * @param event Event to inspect; may be `undefined` (e.g. an empty event list).
 * @returns The timestamp, or `undefined` when `event` is `undefined`.
 */
function eventTimestamp(event: RunEvent | undefined): string | undefined {
  if (event === undefined) return undefined;
  if ("at" in event) return event.at;
  return event.type === "model-response" ? event.completedAt : event.startedAt;
}
1643
+
1644
/**
 * Assembles the JSON-serializable {@link Trace} attached to
 * `sub-run-failed.partialTrace` from the buffered child events, so the
 * `runProtocol` error contract does not have to change (Plan 03 step 8).
 *
 * Only `events`, the derived budget state changes and the run inputs carry
 * real data. `protocolDecisions`, `providerCalls` and `transcript` are empty,
 * and `finalOutput` is a placeholder with empty output, empty cost and the
 * current time as `completedAt`.
 *
 * @param input Child run identity, buffered events and the settings the child
 *   was started with.
 * @returns A trace with `schemaVersion: "1.0"` for the failed child run.
 */
function buildPartialTrace(input: {
  readonly childRunId: string;
  readonly events: readonly RunEvent[];
  readonly startedAtMs: number;
  readonly protocol: ProtocolSelection;
  readonly tier: Tier;
  readonly modelProviderId: string;
  readonly agents: readonly AgentSpec[];
  readonly intent: string;
  readonly temperature: number;
  readonly childTimeoutMs?: number;
  readonly seed?: string | number;
}): Trace {
  type ReplayProtocolInput = Parameters<typeof createReplayTraceRunInputs>[0]["protocol"];

  const { protocol, tier, modelProviderId, agents, events } = input;

  // A protocol given by name is expanded to `{ kind }` for the replay inputs;
  // an object selection is passed through untouched.
  const protocolName = typeof protocol === "string" ? protocol : protocol.kind;
  const protocolConfig =
    typeof protocol === "string" ? ({ kind: protocol } as unknown as ReplayProtocolInput) : protocol;

  const inputs = createReplayTraceRunInputs({
    intent: input.intent,
    protocol: protocolConfig,
    tier,
    modelProviderId,
    agents,
    temperature: input.temperature
  });
  const budget = createReplayTraceBudget({
    tier,
    ...(input.childTimeoutMs !== undefined ? { caps: { timeoutMs: input.childTimeoutMs } } : {})
  });
  const budgetStateChanges = createReplayTraceBudgetStateChanges(events);
  const seed = createReplayTraceSeed(input.seed);

  return {
    schemaVersion: "1.0",
    runId: input.childRunId,
    protocol: protocolName,
    tier,
    modelProviderId,
    agentsUsed: agents,
    inputs,
    budget,
    budgetStateChanges,
    seed,
    protocolDecisions: [],
    providerCalls: [],
    finalOutput: {
      kind: "replay-trace-final-output",
      output: "",
      cost: emptyCost(),
      completedAt: new Date().toISOString(),
      transcript: createTranscriptLink([])
    },
    events,
    transcript: []
  };
}
1701
+
1702
/**
 * BUDGET-01 / D-08: stamps `detail.reason` onto a `code: "aborted"` error that
 * surfaced while the parent's signal was aborted. The reason comes from
 * `classifyAbortReason(parentSignal.reason)`; existing detail keys (for
 * example a `detail.status` set upstream) are carried over.
 *
 * The input is returned untouched when any of these holds:
 *   - `parentSignal` is undefined or not aborted,
 *   - `error` is not a `DogpileError` with `code: "aborted"`,
 *   - `error.detail.reason` is already set.
 *
 * @param error The value thrown by the child run.
 * @param parentSignal The parent's abort signal, if any.
 * @returns A new `DogpileError` with `detail.reason`, or `error` itself.
 */
function enrichAbortErrorWithParentReason(error: unknown, parentSignal: AbortSignal | undefined): unknown {
  if (parentSignal?.aborted && DogpileError.isInstance(error) && error.code === "aborted") {
    const priorDetail = error.detail ?? {};
    if (priorDetail["reason"] === undefined) {
      const reason = classifyAbortReason(parentSignal.reason);
      return new DogpileError({
        code: "aborted",
        message: error.message,
        retryable: error.retryable ?? false,
        ...(error.providerId !== undefined ? { providerId: error.providerId } : {}),
        detail: { ...priorDetail, reason },
        ...(error.cause !== undefined ? { cause: error.cause } : {})
      });
    }
  }
  return error;
}
1734
+
1735
/**
 * Stamps `detail.source` onto a `code: "provider-timeout"` error, using
 * `classifyChildTimeoutSource` with the supplied timeout context plus
 * `isProviderError: true`. Existing detail keys are carried over.
 *
 * The input is returned untouched when it is not a `DogpileError` with
 * `code: "provider-timeout"`, or when `detail.source` is already set.
 *
 * @param error The value thrown by the child run.
 * @param context Decision-level and engine-default timeouts, when known.
 * @returns A new `DogpileError` with `detail.source`, or `error` itself.
 */
function enrichProviderTimeoutSource(
  error: unknown,
  context: {
    readonly decisionTimeoutMs?: number;
    readonly engineDefaultTimeoutMs?: number;
  }
): unknown {
  if (DogpileError.isInstance(error) && error.code === "provider-timeout") {
    const priorDetail = error.detail ?? {};
    if (priorDetail["source"] === undefined) {
      const source = classifyChildTimeoutSource(error, {
        ...context,
        isProviderError: true
      });
      return new DogpileError({
        code: "provider-timeout",
        message: error.message,
        retryable: error.retryable ?? true,
        ...(error.providerId !== undefined ? { providerId: error.providerId } : {}),
        detail: { ...priorDetail, source },
        ...(error.cause !== undefined ? { cause: error.cause } : {})
      });
    }
  }
  return error;
}
1762
+
1763
/**
 * Converts a thrown value into the `error` payload of a `sub-run-failed`
 * event, always attaching the failed delegate decision under
 * `detail.failedDecision`.
 *
 * A `DogpileError` keeps its code, message, optional `providerId` and its
 * existing detail keys. Any other value is reported with code
 * `"invalid-configuration"` and a message taken from `Error.message` or
 * `String(error)`.
 *
 * @param error The thrown value.
 * @param failedDecision JSON form of the delegate decision that failed.
 * @returns The serializable error payload.
 */
function errorPayloadFromUnknown(error: unknown, failedDecision: JsonObject): SubRunFailedEvent["error"] {
  if (!DogpileError.isInstance(error)) {
    return {
      code: "invalid-configuration",
      message: error instanceof Error ? error.message : String(error),
      detail: { failedDecision }
    };
  }

  const mergedDetail: JsonObject = {
    ...(error.detail ?? {}),
    failedDecision
  };
  return {
    code: error.code,
    message: error.message,
    ...(error.providerId !== undefined ? { providerId: error.providerId } : {}),
    detail: mergedDetail
  };
}