@dogpile/sdk 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/CHANGELOG.md +201 -0
  2. package/README.md +1 -0
  3. package/dist/browser/index.js +2328 -237
  4. package/dist/browser/index.js.map +1 -1
  5. package/dist/index.d.ts +3 -1
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +1 -0
  8. package/dist/index.js.map +1 -1
  9. package/dist/providers/openai-compatible.d.ts +11 -0
  10. package/dist/providers/openai-compatible.d.ts.map +1 -1
  11. package/dist/providers/openai-compatible.js +88 -2
  12. package/dist/providers/openai-compatible.js.map +1 -1
  13. package/dist/runtime/audit.d.ts +42 -0
  14. package/dist/runtime/audit.d.ts.map +1 -0
  15. package/dist/runtime/audit.js +73 -0
  16. package/dist/runtime/audit.js.map +1 -0
  17. package/dist/runtime/broadcast.d.ts.map +1 -1
  18. package/dist/runtime/broadcast.js +39 -36
  19. package/dist/runtime/broadcast.js.map +1 -1
  20. package/dist/runtime/cancellation.d.ts +26 -0
  21. package/dist/runtime/cancellation.d.ts.map +1 -1
  22. package/dist/runtime/cancellation.js +38 -1
  23. package/dist/runtime/cancellation.js.map +1 -1
  24. package/dist/runtime/coordinator.d.ts +79 -1
  25. package/dist/runtime/coordinator.d.ts.map +1 -1
  26. package/dist/runtime/coordinator.js +979 -61
  27. package/dist/runtime/coordinator.js.map +1 -1
  28. package/dist/runtime/decisions.d.ts +25 -3
  29. package/dist/runtime/decisions.d.ts.map +1 -1
  30. package/dist/runtime/decisions.js +241 -3
  31. package/dist/runtime/decisions.js.map +1 -1
  32. package/dist/runtime/defaults.d.ts +37 -1
  33. package/dist/runtime/defaults.d.ts.map +1 -1
  34. package/dist/runtime/defaults.js +359 -4
  35. package/dist/runtime/defaults.js.map +1 -1
  36. package/dist/runtime/engine.d.ts +17 -4
  37. package/dist/runtime/engine.d.ts.map +1 -1
  38. package/dist/runtime/engine.js +770 -35
  39. package/dist/runtime/engine.js.map +1 -1
  40. package/dist/runtime/health.d.ts +51 -0
  41. package/dist/runtime/health.d.ts.map +1 -0
  42. package/dist/runtime/health.js +85 -0
  43. package/dist/runtime/health.js.map +1 -0
  44. package/dist/runtime/introspection.d.ts +96 -0
  45. package/dist/runtime/introspection.d.ts.map +1 -0
  46. package/dist/runtime/introspection.js +31 -0
  47. package/dist/runtime/introspection.js.map +1 -0
  48. package/dist/runtime/metrics.d.ts +44 -0
  49. package/dist/runtime/metrics.d.ts.map +1 -0
  50. package/dist/runtime/metrics.js +12 -0
  51. package/dist/runtime/metrics.js.map +1 -0
  52. package/dist/runtime/model.d.ts.map +1 -1
  53. package/dist/runtime/model.js +34 -7
  54. package/dist/runtime/model.js.map +1 -1
  55. package/dist/runtime/provenance.d.ts +25 -0
  56. package/dist/runtime/provenance.d.ts.map +1 -0
  57. package/dist/runtime/provenance.js +13 -0
  58. package/dist/runtime/provenance.js.map +1 -0
  59. package/dist/runtime/sequential.d.ts.map +1 -1
  60. package/dist/runtime/sequential.js +47 -37
  61. package/dist/runtime/sequential.js.map +1 -1
  62. package/dist/runtime/shared.d.ts.map +1 -1
  63. package/dist/runtime/shared.js +39 -36
  64. package/dist/runtime/shared.js.map +1 -1
  65. package/dist/runtime/tracing.d.ts +31 -0
  66. package/dist/runtime/tracing.d.ts.map +1 -0
  67. package/dist/runtime/tracing.js +18 -0
  68. package/dist/runtime/tracing.js.map +1 -0
  69. package/dist/runtime/validation.d.ts +10 -0
  70. package/dist/runtime/validation.d.ts.map +1 -1
  71. package/dist/runtime/validation.js +73 -0
  72. package/dist/runtime/validation.js.map +1 -1
  73. package/dist/types/events.d.ts +339 -12
  74. package/dist/types/events.d.ts.map +1 -1
  75. package/dist/types/replay.d.ts +7 -1
  76. package/dist/types/replay.d.ts.map +1 -1
  77. package/dist/types.d.ts +255 -6
  78. package/dist/types.d.ts.map +1 -1
  79. package/dist/types.js.map +1 -1
  80. package/package.json +39 -1
  81. package/src/index.ts +15 -0
  82. package/src/providers/openai-compatible.ts +83 -3
  83. package/src/runtime/audit.ts +121 -0
  84. package/src/runtime/broadcast.ts +40 -37
  85. package/src/runtime/cancellation.ts +59 -1
  86. package/src/runtime/coordinator.ts +1221 -61
  87. package/src/runtime/decisions.ts +307 -4
  88. package/src/runtime/defaults.ts +389 -4
  89. package/src/runtime/engine.ts +1004 -35
  90. package/src/runtime/health.ts +136 -0
  91. package/src/runtime/introspection.ts +122 -0
  92. package/src/runtime/metrics.ts +45 -0
  93. package/src/runtime/model.ts +38 -6
  94. package/src/runtime/provenance.ts +43 -0
  95. package/src/runtime/sequential.ts +49 -38
  96. package/src/runtime/shared.ts +40 -37
  97. package/src/runtime/tracing.ts +35 -0
  98. package/src/runtime/validation.ts +81 -0
  99. package/src/types/events.ts +369 -12
  100. package/src/types/replay.ts +14 -1
  101. package/src/types.ts +279 -4
@@ -1,16 +1,24 @@
1
1
  import { DogpileError } from "../types.js";
2
2
  import type {
3
+ AbortedEvent,
4
+ BudgetStopEvent,
3
5
  BudgetTier,
6
+ CostSummary,
7
+ DogpileErrorCode,
4
8
  DogpileOptions,
5
9
  Engine,
6
10
  EngineOptions,
7
11
  FinalEvent,
8
12
  JsonObject,
9
13
  JsonValue,
14
+ ModelRequestEvent,
10
15
  ProtocolSelection,
16
+ RunCallOptions,
11
17
  RunEvaluation,
12
18
  RunEvent,
13
19
  RunResult,
20
+ ReplayTraceProviderCall,
21
+ SubRunFailedEvent,
14
22
  StreamErrorEvent,
15
23
  StreamEvent,
16
24
  StreamEventSubscriber,
@@ -19,8 +27,9 @@ import type {
19
27
  Trace
20
28
  } from "../types.js";
21
29
  import { runBroadcast } from "./broadcast.js";
22
- import { runCoordinator } from "./coordinator.js";
30
+ import { runCoordinator, type AbortDrainFn } from "./coordinator.js";
23
31
  import {
32
+ addCost,
24
33
  createReplayTraceFinalOutput,
25
34
  createReplayTraceBudgetStateChanges,
26
35
  canonicalizeRunResult,
@@ -30,15 +39,36 @@ import {
30
39
  createRunMetadata,
31
40
  createRunUsage,
32
41
  defaultAgents,
42
+ emptyCost,
33
43
  normalizeProtocol,
34
44
  orderAgentsForTemperature,
45
+ recomputeAccountingFromTrace,
46
+ resolveOnChildFailure,
35
47
  tierTemperature
36
48
  } from "./defaults.js";
49
+ import { computeHealth, DEFAULT_HEALTH_THRESHOLDS } from "./health.js";
37
50
  import { runSequential } from "./sequential.js";
38
51
  import { runShared } from "./shared.js";
39
- import { createAbortErrorFromSignal, createTimeoutError } from "./cancellation.js";
52
+ import {
53
+ classifyChildTimeoutSource,
54
+ createAbortErrorFromSignal,
55
+ createEngineDeadlineTimeoutError,
56
+ createTimeoutError
57
+ } from "./cancellation.js";
40
58
  import { budget as budgetCondition } from "./termination.js";
41
- import { validateDogpileOptions, validateEngineOptions, validateMissionIntent } from "./validation.js";
59
+ import {
60
+ validateDogpileOptions,
61
+ validateEngineOptions,
62
+ validateMissionIntent,
63
+ validateProviderLocality,
64
+ validateRunCallOptions
65
+ } from "./validation.js";
66
+ import { DOGPILE_SPAN_NAMES, type DogpileSpan, type DogpileTracer } from "./tracing.js";
67
+ import type { Logger } from "./logger.js";
68
+ import type { MetricsHook, RunMetricsSnapshot } from "./metrics.js";
69
+
70
+ const DEFAULT_MAX_DEPTH = 4;
71
+ const DEFAULT_MAX_CONCURRENT_CHILDREN = 4;
42
72
 
43
73
  const defaultHighLevelProtocol = "sequential";
44
74
  const defaultHighLevelTier = "balanced";
@@ -67,10 +97,34 @@ export function createEngine(options: EngineOptions): Engine {
67
97
  const temperature = options.temperature ?? tierTemperature(options.tier);
68
98
  const agents = orderAgentsForTemperature(options.agents ?? defaultAgents(), temperature, options.seed);
69
99
  const terminate = options.terminate ?? (options.budget ? conditionFromBudget(options.budget) : undefined);
100
+ const engineMaxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
101
+ const engineMaxConcurrentChildren = options.maxConcurrentChildren ?? DEFAULT_MAX_CONCURRENT_CHILDREN;
102
+ const engineOnChildFailure = options.onChildFailure;
70
103
 
71
104
  return {
72
- run(intent: string): Promise<RunResult> {
105
+ run(intent: string, runOptions?: RunCallOptions): Promise<RunResult> {
73
106
  validateMissionIntent(intent);
107
+ validateRunCallOptions(runOptions);
108
+ validateProviderLocality(options.model, "model");
109
+
110
+ const effectiveMaxDepth = Math.min(
111
+ engineMaxDepth,
112
+ runOptions?.maxDepth ?? Number.POSITIVE_INFINITY
113
+ );
114
+ assertRunDoesNotRaiseEngineMax(
115
+ "maxConcurrentChildren",
116
+ runOptions?.maxConcurrentChildren,
117
+ engineMaxConcurrentChildren
118
+ );
119
+ const effectiveMaxConcurrentChildren = Math.min(
120
+ engineMaxConcurrentChildren,
121
+ runOptions?.maxConcurrentChildren ?? Number.POSITIVE_INFINITY
122
+ );
123
+ const onChildFailure = resolveOnChildFailure(runOptions?.onChildFailure, engineOnChildFailure);
124
+
125
+ const startedAtMs = Date.now();
126
+ const parentDeadlineMs =
127
+ options.budget?.timeoutMs !== undefined ? startedAtMs + options.budget.timeoutMs : undefined;
74
128
 
75
129
  return runNonStreamingProtocol({
76
130
  intent,
@@ -85,12 +139,40 @@ export function createEngine(options: EngineOptions): Engine {
85
139
  ...(options.signal !== undefined ? { signal: options.signal } : {}),
86
140
  ...(terminate ? { terminate } : {}),
87
141
  ...(options.wrapUpHint ? { wrapUpHint: options.wrapUpHint } : {}),
88
- ...(options.evaluate ? { evaluate: options.evaluate } : {})
142
+ ...(options.evaluate ? { evaluate: options.evaluate } : {}),
143
+ ...(options.tracer ? { tracer: options.tracer } : {}),
144
+ ...(options.metricsHook ? { metricsHook: options.metricsHook } : {}),
145
+ ...(options.logger ? { logger: options.logger } : {}),
146
+ currentDepth: 0,
147
+ effectiveMaxDepth,
148
+ effectiveMaxConcurrentChildren,
149
+ onChildFailure,
150
+ ...(parentDeadlineMs !== undefined ? { parentDeadlineMs } : {}),
151
+ ...(options.defaultSubRunTimeoutMs !== undefined
152
+ ? { defaultSubRunTimeoutMs: options.defaultSubRunTimeoutMs }
153
+ : {})
89
154
  });
90
155
  },
91
156
 
92
- stream(intent: string): StreamHandle {
157
+ stream(intent: string, runOptions?: RunCallOptions): StreamHandle {
93
158
  validateMissionIntent(intent);
159
+ validateRunCallOptions(runOptions);
160
+ validateProviderLocality(options.model, "model");
161
+
162
+ const effectiveMaxDepth = Math.min(
163
+ engineMaxDepth,
164
+ runOptions?.maxDepth ?? Number.POSITIVE_INFINITY
165
+ );
166
+ assertRunDoesNotRaiseEngineMax(
167
+ "maxConcurrentChildren",
168
+ runOptions?.maxConcurrentChildren,
169
+ engineMaxConcurrentChildren
170
+ );
171
+ const effectiveMaxConcurrentChildren = Math.min(
172
+ engineMaxConcurrentChildren,
173
+ runOptions?.maxConcurrentChildren ?? Number.POSITIVE_INFINITY
174
+ );
175
+ const onChildFailure = resolveOnChildFailure(runOptions?.onChildFailure, engineOnChildFailure);
94
176
 
95
177
  const pendingEvents: StreamEvent[] = [];
96
178
  const pendingResolvers: Array<(value: IteratorResult<StreamEvent>) => void> = [];
@@ -105,7 +187,10 @@ export function createEngine(options: EngineOptions): Engine {
105
187
  const abortRace = createAbortRace(abortController.signal, options.model.id);
106
188
  let complete = false;
107
189
  let lastRunId = "";
190
+ let rootRunId: string | undefined;
108
191
  let pendingFinalEvent: FinalEvent | undefined;
192
+ let activeAbortDrain: AbortDrainFn | undefined;
193
+ const failureInstancesByChildRunId = new Map<string, DogpileError>();
109
194
  let status: StreamHandleStatus = "running";
110
195
  let resolveResult!: (result: RunResult) => void;
111
196
  let rejectResult!: (error: unknown) => void;
@@ -163,6 +248,9 @@ export function createEngine(options: EngineOptions): Engine {
163
248
  }
164
249
 
165
250
  try {
251
+ const streamStartedAtMs = Date.now();
252
+ const streamParentDeadlineMs =
253
+ options.budget?.timeoutMs !== undefined ? streamStartedAtMs + options.budget.timeoutMs : undefined;
166
254
  const baseResult = await abortRace.run(runProtocol({
167
255
  intent,
168
256
  protocol,
@@ -175,22 +263,47 @@ export function createEngine(options: EngineOptions): Engine {
175
263
  ...(options.seed !== undefined ? { seed: options.seed } : {}),
176
264
  signal: abortController.signal,
177
265
  ...(terminate ? { terminate } : {}),
266
+ currentDepth: 0,
267
+ effectiveMaxDepth,
268
+ effectiveMaxConcurrentChildren,
269
+ onChildFailure,
270
+ ...(streamParentDeadlineMs !== undefined ? { parentDeadlineMs: streamParentDeadlineMs } : {}),
271
+ ...(options.defaultSubRunTimeoutMs !== undefined
272
+ ? { defaultSubRunTimeoutMs: options.defaultSubRunTimeoutMs }
273
+ : {}),
274
+ ...(options.tracer ? { tracer: options.tracer } : {}),
275
+ ...(options.metricsHook ? { metricsHook: options.metricsHook } : {}),
276
+ ...(options.logger ? { logger: options.logger } : {}),
277
+ streamEvents: true,
178
278
  emit(event: RunEvent): void {
179
279
  if (status !== "running") {
180
280
  return;
181
281
  }
182
282
 
283
+ const parentRunIds = (event as { readonly parentRunIds?: readonly string[] }).parentRunIds;
284
+ if (rootRunId === undefined && parentRunIds === undefined) {
285
+ rootRunId = event.runId;
286
+ }
287
+
183
288
  lastRunId = event.runId;
184
- if (event.type === "final") {
289
+ if (event.type === "final" && event.runId === rootRunId) {
185
290
  pendingFinalEvent = event;
186
291
  return;
187
292
  }
188
293
  publish(event);
189
- }
294
+ },
295
+ registerAbortDrain(drain: AbortDrainFn): void {
296
+ activeAbortDrain = drain;
297
+ },
298
+ failureInstancesByChildRunId
190
299
  }));
191
300
  if (status !== "running") {
192
301
  return;
193
302
  }
303
+ const terminalThrow = resolveRuntimeTerminalThrow(baseResult.trace, failureInstancesByChildRunId);
304
+ if (terminalThrow) {
305
+ throw terminalThrow;
306
+ }
194
307
 
195
308
  const finalizedResult = await abortRace.run(applyRunEvaluation(baseResult, options.evaluate));
196
309
  if (status !== "running") {
@@ -213,6 +326,10 @@ export function createEngine(options: EngineOptions): Engine {
213
326
 
214
327
  const runtimeError = timeoutLifecycle.translateError(error);
215
328
  status = isCancellationError(runtimeError) ? "cancelled" : "failed";
329
+ if (shouldPublishAborted(runtimeError)) {
330
+ activeAbortDrain?.(runtimeError);
331
+ publish(createStreamAbortedEvent(runtimeError, lastRunId));
332
+ }
216
333
  publish(createStreamErrorEvent(runtimeError, lastRunId));
217
334
  closeStream();
218
335
  rejectResult(runtimeError);
@@ -225,9 +342,11 @@ export function createEngine(options: EngineOptions): Engine {
225
342
  }
226
343
 
227
344
  const error = createStreamCancellationError(options.model.id, cause);
228
- status = "cancelled";
229
345
  abortController.abort(error);
346
+ activeAbortDrain?.(error);
347
+ publish(createStreamAbortedEvent(error, lastRunId));
230
348
  publish(createStreamErrorEvent(error, lastRunId));
349
+ status = "cancelled";
231
350
  closeStream();
232
351
  rejectResult(error);
233
352
  }
@@ -238,6 +357,7 @@ export function createEngine(options: EngineOptions): Engine {
238
357
  }
239
358
 
240
359
  complete = true;
360
+ failureInstancesByChildRunId.clear();
241
361
  removeCallerAbortListener();
242
362
  timeoutLifecycle.cleanup();
243
363
  abortRace.cleanup();
@@ -303,6 +423,7 @@ function createNonStreamingAbortLifecycle(options: {
303
423
  readonly callerSignal?: AbortSignal | undefined;
304
424
  readonly timeoutMs?: number | undefined;
305
425
  readonly providerId: string;
426
+ readonly timeoutErrorSource?: "runtime" | "engine";
306
427
  }): AbortLifecycle {
307
428
  if (options.timeoutMs === undefined) {
308
429
  return {
@@ -321,7 +442,8 @@ function createNonStreamingAbortLifecycle(options: {
321
442
  const timeoutLifecycle = createTimeoutAbortLifecycle({
322
443
  abortController,
323
444
  timeoutMs: options.timeoutMs,
324
- providerId: options.providerId
445
+ providerId: options.providerId,
446
+ timeoutErrorSource: options.timeoutErrorSource ?? "runtime"
325
447
  });
326
448
  const abortRace = createAbortRace(abortController.signal, options.providerId);
327
449
  const removeCallerAbortListener = wireCallerAbortSignal(options.callerSignal, abortController, () => {
@@ -348,6 +470,7 @@ function createTimeoutAbortLifecycle(options: {
348
470
  readonly abortController: AbortController;
349
471
  readonly timeoutMs?: number | undefined;
350
472
  readonly providerId: string;
473
+ readonly timeoutErrorSource?: "runtime" | "engine";
351
474
  }): TimeoutAbortLifecycle {
352
475
  if (options.timeoutMs === undefined) {
353
476
  return {
@@ -358,7 +481,14 @@ function createTimeoutAbortLifecycle(options: {
358
481
  };
359
482
  }
360
483
 
361
- const timeoutError = createTimeoutError(options.providerId, options.timeoutMs);
484
+ const timeoutSource = classifyChildTimeoutSource(undefined, {
485
+ ...(options.timeoutErrorSource === "engine" ? { engineDefaultTimeoutMs: options.timeoutMs } : {}),
486
+ isProviderError: false
487
+ });
488
+ const timeoutError =
489
+ options.timeoutErrorSource === "engine" && timeoutSource === "engine"
490
+ ? createEngineDeadlineTimeoutError(options.providerId, options.timeoutMs)
491
+ : createTimeoutError(options.providerId, options.timeoutMs);
362
492
  const timeoutId = setTimeout(() => {
363
493
  options.abortController.abort(timeoutError);
364
494
  }, options.timeoutMs);
@@ -454,6 +584,28 @@ function readAbortSignalReason(signal: AbortSignal | undefined): unknown {
454
584
  return signal?.aborted ? signal.reason : undefined;
455
585
  }
456
586
 
587
/**
 * Creates the `aborted` event published on a stream that is being torn down.
 *
 * @param error - The error that ended the stream; `streamAbortedReason` maps it
 *   to the event's `reason`.
 * @param runId - Run id stamped on the event.
 * @returns An `AbortedEvent` timestamped with the current time as an ISO string.
 */
function createStreamAbortedEvent(error: unknown, runId: string): AbortedEvent {
  const at = new Date().toISOString();
  const reason = streamAbortedReason(error);
  const abortedEvent: AbortedEvent = { type: "aborted", runId, at, reason };
  return abortedEvent;
}
595
+
596
/**
 * Decides whether a failure should be announced with an `aborted` event.
 *
 * @param error - The value that terminated the run.
 * @returns `true` only for a `DogpileError` whose code is `"aborted"` or
 *   `"timeout"`; `false` for every other value.
 */
function shouldPublishAborted(error: unknown): boolean {
  if (!DogpileError.isInstance(error)) {
    return false;
  }
  return error.code === "aborted" || error.code === "timeout";
}
599
+
600
/**
 * Maps a terminating error onto the `reason` carried by an `aborted` event.
 *
 * @param error - The value that terminated the run.
 * @returns `"timeout"` when `error` is a `DogpileError` whose code is
 *   `"timeout"` or whose `detail.reason` is `"timeout"`; otherwise
 *   `"parent-aborted"`.
 */
function streamAbortedReason(error: unknown): AbortedEvent["reason"] {
  const timedOut =
    DogpileError.isInstance(error) &&
    (error.code === "timeout" || error.detail?.["reason"] === "timeout");
  return timedOut ? "timeout" : "parent-aborted";
}
608
+
457
609
  function createStreamErrorEvent(error: unknown, runId: string): StreamErrorEvent {
458
610
  if (DogpileError.isInstance(error)) {
459
611
  return {
@@ -519,15 +671,528 @@ interface RunProtocolOptions {
519
671
  readonly terminate?: EngineOptions["terminate"];
520
672
  readonly wrapUpHint?: EngineOptions["wrapUpHint"];
521
673
  readonly emit?: (event: RunEvent) => void;
674
+ readonly streamEvents?: boolean;
675
+ /**
676
+ * Current recursion depth. Top-level runs use 0; the coordinator dispatch
677
+ * loop increments before invoking {@link runProtocol} for a child run.
678
+ * Plan 04 will wire `effectiveMaxDepth` validation around this value.
679
+ */
680
+ readonly currentDepth?: number;
681
+ /**
682
+ * Effective max recursion depth. Plan 04 enforces; Plan 03 plumbs the param.
683
+ */
684
+ readonly effectiveMaxDepth?: number;
685
+ /** Effective max delegated child concurrency resolved at run start. */
686
+ readonly effectiveMaxConcurrentChildren?: number;
687
+ readonly onChildFailure?: EngineOptions["onChildFailure"];
688
+ /**
689
+ * Root-run deadline (epoch ms) threaded through every recursive coordinator
690
+ * dispatch (BUDGET-02 / D-12). Children inherit `parentDeadlineMs - now()`
691
+ * as their default timeout window.
692
+ */
693
+ readonly parentDeadlineMs?: number;
694
+ /**
695
+ * Engine-level fallback sub-run timeout (BUDGET-02 / D-14). Applied only when
696
+ * neither the parent nor the decision specifies a `budget.timeoutMs`.
697
+ */
698
+ readonly defaultSubRunTimeoutMs?: number;
699
+ readonly registerAbortDrain?: (drain: AbortDrainFn) => void;
700
+ readonly failureInstancesByChildRunId?: Map<string, DogpileError>;
701
+ readonly tracer?: EngineOptions["tracer"];
702
+ readonly metricsHook?: EngineOptions["metricsHook"];
703
+ readonly logger?: EngineOptions["logger"];
704
+ /**
705
+ * Optional parent span for the next runProtocol invocation. Threaded by the
706
+ * coordinator when dispatching child runs so that the child's `dogpile.run`
707
+ * span is correctly nested under its parent's `dogpile.sub-run` span.
708
+ * Internal-only; not part of the public surface.
709
+ */
710
+ readonly parentSpan?: DogpileSpan;
711
+ /**
712
+ * Per-child sub-run span lookup, keyed by childRunId. Populated by the
713
+ * parent's emit closure on `sub-run-started`. The coordinator dispatcher
714
+ * reads this to thread the correct per-child span as parent for the
715
+ * recursive runProtocol call. Internal-only.
716
+ */
717
+ readonly subRunSpansByChildId?: ReadonlyMap<string, DogpileSpan>;
522
718
  }
523
719
 
524
720
  type NonStreamingProtocolOptions = Omit<RunProtocolOptions, "emit"> & Pick<EngineOptions, "evaluate">;
525
721
 
722
/**
 * Mutable bookkeeping for one traced run. Created by `openRunTracing` and
 * updated by `handleTracingEvent` as run events arrive.
 */
interface TracingState {
  /** Tracer used to start every span for this run. */
  readonly tracer: DogpileTracer;
  /** Span started with `DOGPILE_SPAN_NAMES.RUN` when tracing was opened. */
  readonly runSpan: DogpileSpan;
  /**
   * Sub-run spans. `handleTracingEvent` looks entries up by `event.runId` when
   * choosing a parent span. NOTE(review): where entries are inserted is not
   * visible in this hunk — confirm the key.
   */
  readonly subRunSpans: Map<string, DogpileSpan>;
  /** Open agent-turn spans, keyed by agent id. */
  readonly agentTurnSpans: Map<string, DogpileSpan>;
  /** Open model-call spans, keyed by call id; removed on `model-response`. */
  readonly modelCallSpans: Map<string, DogpileSpan>;
  /** `model-request` events awaiting a response, keyed by call id. */
  readonly pendingModelRequests: Map<string, ModelRequestEvent>;
  /** Per-agent turn number, incremented each time a new turn span is opened. */
  readonly agentTurnCounters: Map<string, number>;
  /** Per-agent token/cost totals accumulated from model responses. */
  readonly turnAccumByAgent: Map<string, TurnAccum>;
  /** Agent ids seen in `model-request` and `agent-turn` events. */
  readonly agentIds: Set<string>;
  /** Run id, captured from the first event that carries no `parentRunIds`. */
  runId?: string;
  /** Number of `agent-turn` events handled so far. */
  turnCount: number;
  /** Latest cost: grown by each model response, replaced by each `agent-turn` event's cost. */
  lastCost: CostSummary;
}
736
+
737
/** Running token and cost totals accumulated per agent from model responses. */
interface TurnAccum {
  /** Sum of input tokens reported by model responses. */
  inputTokens: number;
  /** Sum of output tokens reported by model responses. */
  outputTokens: number;
  /** Sum of response costs in USD (a response without a cost counts as 0). */
  costUsd: number;
}
742
+
743
/**
 * Starts the run-level span and allocates empty tracing bookkeeping.
 *
 * @param options.tracer - Tracer to use; when absent, tracing is disabled.
 * @param options.parentSpan - Optional span to nest the run span under.
 * @param options.intent - Mission intent; only its first 200 characters are
 *   recorded on the span.
 * @param options.protocolKind - Recorded as `dogpile.run.protocol`.
 * @param options.tier - Recorded (stringified) as `dogpile.run.tier`.
 * @returns Fresh tracing state, or `undefined` when no tracer was supplied.
 */
function openRunTracing(options: {
  readonly tracer?: DogpileTracer;
  readonly parentSpan?: DogpileSpan;
  readonly intent: string;
  readonly protocolKind: string;
  readonly tier: unknown;
}): TracingState | undefined {
  const { tracer, parentSpan, intent, protocolKind, tier } = options;
  if (!tracer) {
    return undefined;
  }

  const attributes = {
    "dogpile.run.protocol": protocolKind,
    "dogpile.run.tier": String(tier),
    "dogpile.run.intent": intent.slice(0, 200)
  };
  // Only pass `parent` when one exists, so the key is absent rather than undefined.
  const runSpan = parentSpan
    ? tracer.startSpan(DOGPILE_SPAN_NAMES.RUN, { parent: parentSpan, attributes })
    : tracer.startSpan(DOGPILE_SPAN_NAMES.RUN, { attributes });

  const state: TracingState = {
    tracer,
    runSpan,
    subRunSpans: new Map(),
    agentTurnSpans: new Map(),
    modelCallSpans: new Map(),
    pendingModelRequests: new Map(),
    agentTurnCounters: new Map(),
    turnAccumByAgent: new Map(),
    agentIds: new Set(),
    turnCount: 0,
    lastCost: emptyCost()
  };
  return state;
}
777
+
778
/**
 * Mutable bookkeeping for one run's metrics. Created by `openRunMetrics`,
 * updated by `handleMetricsEvent`, and consumed by `closeRunMetrics`.
 */
interface MetricsState {
  /** Hook whose `onRunComplete` / `onSubRunComplete` callbacks receive snapshots. */
  readonly metricsHook: MetricsHook;
  /** Receives errors thrown by the hook; `console.error` is used when absent. */
  readonly logger: Logger | undefined;
  /** `Date.now()` captured when metrics were opened; used to compute run duration. */
  readonly startedAtMs: number;
  /** `Date.now()` at each `sub-run-started`, keyed by child run id. */
  readonly subRunStartTimes: Map<string, number>;
  /** Latest known cost for the run, including cost added from sub-runs. */
  totalCost: CostSummary;
  /** Cost contributed by completed and failed sub-runs only. */
  nestedCost: CostSummary;
  /** Number of `agent-turn` events handled so far. */
  turns: number;
}
787
+
788
/**
 * Allocates metrics bookkeeping for a run.
 *
 * @param options.metricsHook - Hook to report to; when absent, metrics are disabled.
 * @param options.logger - Optional logger stored for routing hook failures.
 * @returns Zeroed metrics state stamped with the current time, or `undefined`
 *   when no hook was supplied.
 */
function openRunMetrics(options: {
  readonly metricsHook?: MetricsHook;
  readonly logger?: Logger;
}): MetricsState | undefined {
  const { metricsHook, logger } = options;
  if (!metricsHook) {
    return undefined;
  }

  const state: MetricsState = {
    metricsHook,
    logger,
    startedAtMs: Date.now(),
    subRunStartTimes: new Map(),
    totalCost: emptyCost(),
    nestedCost: emptyCost(),
    turns: 0
  };
  return state;
}
806
+
807
/**
 * Reports a failure raised by a metrics hook. This function never throws.
 *
 * The failure is sent to `logger.error` when a logger is supplied and to
 * `console.error` otherwise.
 *
 * @param err - The value thrown or rejected by the hook.
 * @param logger - Optional logger; a logger that itself throws is ignored.
 */
function routeMetricsError(err: unknown, logger: Logger | undefined): void {
  let msg: string;
  try {
    msg = err instanceof Error ? err.message : String(err);
  } catch {
    // String() throws for values with no usable toString (for example
    // Object.create(null)), and a `message` getter can throw. Reporting a hook
    // failure must not raise a new error, so fall back to a placeholder.
    msg = "[unprintable error]";
  }

  try {
    if (logger !== undefined) {
      logger.error("dogpile:metricsHook threw", { error: msg });
    } else {
      console.error("dogpile:metricsHook threw", { error: msg });
    }
  } catch {
    // A logger that throws from error() cannot be helped.
  }
}
819
+
820
/**
 * Invokes a metrics callback and routes any failure to `routeMetricsError`.
 *
 * Synchronous throws are caught. When the callback returns a value with a
 * `catch` method, a rejection handler is attached and the result is not awaited.
 *
 * @param callback - Hook callback to invoke; nothing happens when it is absent.
 * @param snapshot - Metrics snapshot passed to the callback.
 * @param logger - Optional logger forwarded to `routeMetricsError`.
 */
function fireHook(
  callback: ((snapshot: RunMetricsSnapshot) => void | Promise<void>) | undefined,
  snapshot: RunMetricsSnapshot,
  logger: Logger | undefined
): void {
  if (!callback) {
    return;
  }

  const report = (err: unknown): void => {
    routeMetricsError(err, logger);
  };

  try {
    const pending = callback(snapshot);
    if (!pending) {
      return;
    }
    if (typeof (pending as Promise<void>).catch !== "function") {
      return;
    }
    (pending as Promise<void>).catch(report);
  } catch (err: unknown) {
    report(err);
  }
}
840
+
841
/** Snapshot fields that are derived purely from a run result (everything but duration). */
type ResultDerivedSnapshot = Omit<RunMetricsSnapshot, "durationMs">;

/**
 * Derives the outcome, own/total token and cost figures, and turn count from a
 * run result. Shared by {@link buildRunSnapshot} and {@link buildSubRunSnapshot}.
 *
 * "Own" figures are the result's totals minus the costs of its nested sub-runs
 * as reported by `nestedSubRunCosts`.
 */
function summarizeResultForSnapshot(result: RunResult): ResultDerivedSnapshot {
  // One pass over the nested costs instead of one reduce per field.
  let nestedInputTokens = 0;
  let nestedOutputTokens = 0;
  let nestedCostUsd = 0;
  for (const cost of nestedSubRunCosts(result)) {
    nestedInputTokens += cost.inputTokens;
    nestedOutputTokens += cost.outputTokens;
    nestedCostUsd += cost.usd;
  }

  const budgetStopEvent = result.trace.events.find(
    (event): event is BudgetStopEvent => event.type === "budget-stop"
  );
  const outcome: RunMetricsSnapshot["outcome"] =
    budgetStopEvent !== undefined ? "budget-stopped" : "completed";

  const totalInputTokens = result.cost.inputTokens;
  const totalOutputTokens = result.cost.outputTokens;
  const totalCostUsd = result.cost.usd;
  const turns = result.trace.events.filter((event) => event.type === "agent-turn").length;

  return {
    outcome,
    inputTokens: totalInputTokens - nestedInputTokens,
    outputTokens: totalOutputTokens - nestedOutputTokens,
    costUsd: totalCostUsd - nestedCostUsd,
    totalInputTokens,
    totalOutputTokens,
    totalCostUsd,
    turns
  };
}

/**
 * Builds the metrics snapshot for a finished top-level run.
 *
 * @param result - The completed run result.
 * @param startedAtMs - `Date.now()` value captured when the run started.
 * @returns A snapshot whose `durationMs` is measured from `startedAtMs` to now
 *   and whose outcome is `"budget-stopped"` when the trace contains a
 *   `budget-stop` event, `"completed"` otherwise.
 */
function buildRunSnapshot(
  result: RunResult,
  startedAtMs: number
): RunMetricsSnapshot {
  return {
    ...summarizeResultForSnapshot(result),
    durationMs: Date.now() - startedAtMs
  };
}

/**
 * Builds the metrics snapshot for a completed sub-run.
 *
 * @param subResult - The sub-run's result.
 * @param durationMs - Duration already measured by the caller.
 * @returns A snapshot with the same derived fields as {@link buildRunSnapshot}
 *   and the supplied `durationMs`.
 */
function buildSubRunSnapshot(
  subResult: RunResult,
  durationMs: number
): RunMetricsSnapshot {
  return {
    ...summarizeResultForSnapshot(subResult),
    durationMs
  };
}
902
+
903
/**
 * Collects the cost of every sub-run recorded in a result's trace.
 *
 * @param result - Run result whose trace events are scanned in order.
 * @returns One entry per `sub-run-completed` event (its `subResult.cost`) and
 *   per `sub-run-failed` event (its `partialCost`); other events contribute nothing.
 */
function nestedSubRunCosts(result: RunResult): CostSummary[] {
  const costs: CostSummary[] = [];
  for (const event of result.trace.events) {
    if (event.type === "sub-run-completed") {
      costs.push(event.subResult.cost);
    } else if (event.type === "sub-run-failed") {
      costs.push(event.partialCost);
    }
  }
  return costs;
}
914
+
915
/**
 * Field-wise difference of two cost summaries.
 *
 * @param total - Minuend.
 * @param nested - Subtrahend.
 * @returns A new summary holding `total - nested` for `usd`, `inputTokens`,
 *   `outputTokens`, and `totalTokens`.
 */
function subtractCost(total: CostSummary, nested: CostSummary): CostSummary {
  const usd = total.usd - nested.usd;
  const inputTokens = total.inputTokens - nested.inputTokens;
  const outputTokens = total.outputTokens - nested.outputTokens;
  const totalTokens = total.totalTokens - nested.totalTokens;
  return { usd, inputTokens, outputTokens, totalTokens };
}
923
+
924
/**
 * Folds one run event into the metrics state.
 *
 * Events that carry `parentRunIds` are ignored. For the rest:
 * - `agent-turn` replaces the total cost and counts a turn;
 * - `broadcast`, `budget-stop`, and `final` replace the total cost;
 * - `sub-run-started` records the child's start time;
 * - `sub-run-completed` adds the child's cost to the total and nested costs and
 *   fires `onSubRunComplete` with a snapshot (duration 0 when no start time was
 *   recorded);
 * - `sub-run-failed` adds the child's partial cost and forgets its start time.
 *
 * @param state - Metrics state to update in place.
 * @param event - The run event to account for.
 */
function handleMetricsEvent(state: MetricsState, event: RunEvent): void {
  const { parentRunIds } = event as { readonly parentRunIds?: readonly string[] };
  if (parentRunIds !== undefined) {
    return;
  }

  if (event.type === "agent-turn") {
    state.totalCost = event.cost;
    state.turns += 1;
    return;
  }

  if (event.type === "broadcast" || event.type === "budget-stop" || event.type === "final") {
    state.totalCost = event.cost;
    return;
  }

  if (event.type === "sub-run-started") {
    state.subRunStartTimes.set(event.childRunId, Date.now());
    return;
  }

  if (event.type === "sub-run-completed") {
    state.totalCost = addCost(state.totalCost, event.subResult.cost);
    state.nestedCost = addCost(state.nestedCost, event.subResult.cost);

    const startedAt = state.subRunStartTimes.get(event.childRunId);
    const elapsedMs = startedAt === undefined ? 0 : Date.now() - startedAt;
    state.subRunStartTimes.delete(event.childRunId);

    const childSnapshot = buildSubRunSnapshot(event.subResult, elapsedMs);
    fireHook(state.metricsHook.onSubRunComplete, childSnapshot, state.logger);
    return;
  }

  if (event.type === "sub-run-failed") {
    state.totalCost = addCost(state.totalCost, event.partialCost);
    state.nestedCost = addCost(state.nestedCost, event.partialCost);
    state.subRunStartTimes.delete(event.childRunId);
  }
}
966
+
967
/**
 * Fires `onRunComplete` with the final snapshot for a run.
 *
 * @param state - Metrics state accumulated during the run.
 * @param result - The run result when one exists. When `undefined`, an
 *   `"aborted"` snapshot is built from the accumulated state, with own cost
 *   computed as total cost minus nested cost.
 */
function closeRunMetrics(state: MetricsState, result: RunResult | undefined): void {
  let snapshot: RunMetricsSnapshot;

  if (result === undefined) {
    const ownCost = subtractCost(state.totalCost, state.nestedCost);
    snapshot = {
      outcome: "aborted",
      inputTokens: ownCost.inputTokens,
      outputTokens: ownCost.outputTokens,
      costUsd: ownCost.usd,
      totalInputTokens: state.totalCost.inputTokens,
      totalOutputTokens: state.totalCost.outputTokens,
      totalCostUsd: state.totalCost.usd,
      turns: state.turns,
      durationMs: Date.now() - state.startedAtMs
    };
  } else {
    snapshot = buildRunSnapshot(result, state.startedAtMs);
  }

  fireHook(state.metricsHook.onRunComplete, snapshot, state.logger);
}
988
+
989
/**
 * Applies one run event to the tracing state.
 *
 * Events that carry `parentRunIds` are ignored. The first handled event
 * supplies the run id recorded on the run span. A model request opens an
 * agent-turn span (when the agent has none open) and a model-call span; a
 * model response closes the model-call span and accumulates token/cost totals;
 * an agent turn closes the agent-turn span. Sub-run lifecycle events open and
 * close sub-run spans. Any other event type leaves the state untouched.
 *
 * @param state - Mutable tracing bookkeeping for the current run.
 * @param event - The run event to apply.
 */
function handleTracingEvent(state: TracingState, event: RunEvent): void {
  const { parentRunIds } = event as { readonly parentRunIds?: readonly string[] };
  if (parentRunIds !== undefined) {
    return;
  }

  // The first event that gets this far decides the run id shown on the run span.
  if (state.runId === undefined) {
    state.runId = event.runId;
    state.runSpan.setAttribute("dogpile.run.id", event.runId);
  }

  switch (event.type) {
    case "model-request": {
      const { agentId, callId } = event;
      state.pendingModelRequests.set(callId, event);
      state.agentIds.add(agentId);

      // Open a turn span lazily: only when this agent has no turn in flight.
      if (!state.agentTurnSpans.has(agentId)) {
        const turnNumber = (state.agentTurnCounters.get(agentId) ?? 0) + 1;
        state.agentTurnCounters.set(agentId, turnNumber);
        const openedTurnSpan = state.tracer.startSpan(DOGPILE_SPAN_NAMES.AGENT_TURN, {
          parent: state.subRunSpans.get(event.runId) ?? state.runSpan,
          attributes: {
            "dogpile.agent.id": agentId,
            "dogpile.agent.role": event.role,
            "dogpile.turn.number": turnNumber,
            "dogpile.model.id": event.modelId
          }
        });
        state.agentTurnSpans.set(agentId, openedTurnSpan);
      }

      // Parent preference: the agent's turn span, then a sub-run span, then the run span.
      const modelCallSpan = state.tracer.startSpan(DOGPILE_SPAN_NAMES.MODEL_CALL, {
        parent:
          state.agentTurnSpans.get(agentId) ??
          state.subRunSpans.get(event.runId) ??
          state.runSpan,
        attributes: {
          "dogpile.model.id": event.modelId,
          "dogpile.call.id": callId,
          "dogpile.provider.id": event.providerId
        }
      });
      state.modelCallSpans.set(callId, modelCallSpan);
      break;
    }
    case "model-response": {
      const { agentId, callId, response } = event;
      const callSpan = state.modelCallSpans.get(callId);
      if (callSpan) {
        const usage = response.usage;
        const inputTokens = usage?.inputTokens ?? 0;
        const outputTokens = usage?.outputTokens ?? 0;
        const responseCost: CostSummary = {
          usd: response.costUsd ?? 0,
          inputTokens,
          outputTokens,
          totalTokens: usage?.totalTokens ?? inputTokens + outputTokens
        };

        callSpan.setAttribute("dogpile.model.input_tokens", inputTokens);
        callSpan.setAttribute("dogpile.model.output_tokens", outputTokens);
        // The cost attribute is only written when the response reported a cost.
        if (response.costUsd !== undefined) {
          callSpan.setAttribute("dogpile.model.cost_usd", response.costUsd);
        }
        callSpan.setStatus("ok");
        callSpan.end();
        state.modelCallSpans.delete(callId);

        // Fold this call into the per-agent totals reported when the turn closes.
        const running = state.turnAccumByAgent.get(agentId) ?? {
          inputTokens: 0,
          outputTokens: 0,
          costUsd: 0
        };
        running.inputTokens += inputTokens;
        running.outputTokens += outputTokens;
        running.costUsd += responseCost.usd;
        state.turnAccumByAgent.set(agentId, running);
        state.lastCost = addCost(state.lastCost, responseCost);
      }
      state.pendingModelRequests.delete(callId);
      break;
    }
    case "agent-turn": {
      const { agentId } = event;
      state.agentIds.add(agentId);
      state.turnCount += 1;
      state.lastCost = event.cost;

      const openTurnSpan = state.agentTurnSpans.get(agentId);
      if (openTurnSpan) {
        openTurnSpan.setAttribute("dogpile.agent.role", event.role);
        const totals = state.turnAccumByAgent.get(agentId);
        openTurnSpan.setAttribute("dogpile.turn.cost_usd", totals?.costUsd ?? 0);
        openTurnSpan.setAttribute("dogpile.turn.input_tokens", totals?.inputTokens ?? 0);
        openTurnSpan.setAttribute("dogpile.turn.output_tokens", totals?.outputTokens ?? 0);
        openTurnSpan.setStatus("ok");
        openTurnSpan.end();
        state.agentTurnSpans.delete(agentId);
      }
      // Totals are reset whether or not a span was open for this agent.
      state.turnAccumByAgent.delete(agentId);
      break;
    }
    case "broadcast":
    case "budget-stop":
    case "final": {
      state.lastCost = event.cost;
      break;
    }
    case "sub-run-started": {
      const subRunSpan = state.tracer.startSpan(DOGPILE_SPAN_NAMES.SUB_RUN, {
        parent: state.runSpan,
        attributes: {
          "dogpile.sub_run.child_run_id": event.childRunId,
          "dogpile.sub_run.parent_run_id": event.parentRunId,
          "dogpile.sub_run.depth": event.depth
        }
      });
      state.subRunSpans.set(event.childRunId, subRunSpan);
      break;
    }
    case "sub-run-completed": {
      const finishedSpan = state.subRunSpans.get(event.childRunId);
      if (finishedSpan) {
        finishedSpan.setStatus("ok");
        finishedSpan.end();
        state.subRunSpans.delete(event.childRunId);
      }
      break;
    }
    case "sub-run-failed": {
      const failedSpan = state.subRunSpans.get(event.childRunId);
      if (failedSpan) {
        failedSpan.setStatus("error", event.error.message);
        failedSpan.end();
        state.subRunSpans.delete(event.childRunId);
      }
      break;
    }
    default:
      break;
  }
}
1127
+
1128
/**
 * Finalizes the run span and ends any spans that are still open.
 *
 * - With `error` set: records the counters gathered from events, marks the
 *   outcome as `"aborted"` and sets an error status.
 * - With neither `error` nor `result`: ends the spans without adding attributes.
 * - With `result`: records totals taken from the result, marks the outcome as
 *   `"budget-stopped"` when the trace contains a `budget-stop` event
 *   (otherwise `"completed"`), and sets an ok status.
 *
 * @param state - Tracing bookkeeping for the run being closed.
 * @param result - The run result, when the run produced one.
 * @param error - The failure that ended the run, if any.
 */
function closeRunTracing(state: TracingState, result: RunResult | undefined, error?: unknown): void {
  const runSpan = state.runSpan;
  const endAllSpans = (): void => {
    closeOpenTracingSpans(state);
    runSpan.end();
  };

  if (error !== undefined) {
    // No result is available on failure, so report what the event stream showed.
    if (state.runId !== undefined) {
      runSpan.setAttribute("dogpile.run.id", state.runId);
    }
    runSpan.setAttribute("dogpile.run.agent_count", state.agentIds.size);
    runSpan.setAttribute("dogpile.run.turn_count", state.turnCount);
    runSpan.setAttribute("dogpile.run.cost_usd", state.lastCost.usd);
    runSpan.setAttribute("dogpile.run.input_tokens", state.lastCost.inputTokens);
    runSpan.setAttribute("dogpile.run.output_tokens", state.lastCost.outputTokens);
    runSpan.setAttribute("dogpile.run.outcome", "aborted");
    const failureMessage = error instanceof Error ? error.message : String(error);
    runSpan.setStatus("error", failureMessage);
    endAllSpans();
    return;
  }

  if (result === undefined) {
    endAllSpans();
    return;
  }

  const { trace, cost } = result;
  const budgetStop = trace.events.find(
    (candidate): candidate is BudgetStopEvent => candidate.type === "budget-stop"
  );
  const terminationReason = budgetStop?.reason;
  const agentTurnCount = trace.events.filter((candidate) => candidate.type === "agent-turn").length;

  runSpan.setAttribute("dogpile.run.id", trace.runId);
  runSpan.setAttribute("dogpile.run.agent_count", trace.agentsUsed.length);
  runSpan.setAttribute("dogpile.run.turn_count", agentTurnCount);
  runSpan.setAttribute("dogpile.run.cost_usd", cost.usd);
  runSpan.setAttribute("dogpile.run.input_tokens", cost.inputTokens);
  runSpan.setAttribute("dogpile.run.output_tokens", cost.outputTokens);
  runSpan.setAttribute(
    "dogpile.run.outcome",
    terminationReason !== undefined ? "budget-stopped" : "completed"
  );
  if (terminationReason !== undefined) {
    runSpan.setAttribute("dogpile.run.termination_reason", terminationReason);
  }
  runSpan.setStatus("ok");
  endAllSpans();
}
1168
+
1169
/**
 * Ends every span still tracked in the state and empties the tracking maps.
 *
 * Spans are ended in a fixed order: model-call spans first, then agent-turn
 * spans, then sub-run spans.
 *
 * @param state - Tracing bookkeeping whose open spans should be ended.
 */
function closeOpenTracingSpans(state: TracingState): void {
  const trackedSpanMaps = [state.modelCallSpans, state.agentTurnSpans, state.subRunSpans];

  for (const openSpans of trackedSpanMaps) {
    for (const openSpan of openSpans.values()) {
      openSpan.end();
    }
    openSpans.clear();
  }
}
1183
+
526
1184
  async function runNonStreamingProtocol(options: NonStreamingProtocolOptions): Promise<RunResult> {
1185
+ const failureInstancesByChildRunId = new Map<string, DogpileError>();
527
1186
  const abortLifecycle = createNonStreamingAbortLifecycle({
528
1187
  callerSignal: options.signal,
529
1188
  timeoutMs: runtimeTimeoutMs(options),
530
- providerId: options.model.id
1189
+ providerId: options.model.id,
1190
+ timeoutErrorSource:
1191
+ options.currentDepth !== undefined &&
1192
+ options.currentDepth > 0 &&
1193
+ options.parentDeadlineMs === undefined
1194
+ ? "engine"
1195
+ : "runtime"
531
1196
  });
532
1197
 
533
1198
  try {
@@ -537,7 +1202,8 @@ async function runNonStreamingProtocol(options: NonStreamingProtocolOptions): Pr
537
1202
  ...(abortLifecycle.signal !== undefined ? { signal: abortLifecycle.signal } : {}),
538
1203
  emit(event: RunEvent): void {
539
1204
  emittedEvents.push(event);
540
- }
1205
+ },
1206
+ failureInstancesByChildRunId
541
1207
  }));
542
1208
  const events = emittedEvents.length > 0 ? emittedEvents : result.trace.events;
543
1209
  const trace = {
@@ -557,12 +1223,18 @@ async function runNonStreamingProtocol(options: NonStreamingProtocolOptions): Pr
557
1223
  events
558
1224
  }),
559
1225
  eventLog: createRunEventLog(trace.runId, trace.protocol, events),
560
- trace
1226
+ trace,
1227
+ health: computeHealth(trace, DEFAULT_HEALTH_THRESHOLDS)
561
1228
  };
1229
+ const terminalThrow = resolveRuntimeTerminalThrow(runResult.trace, failureInstancesByChildRunId);
1230
+ if (terminalThrow) {
1231
+ throw terminalThrow;
1232
+ }
562
1233
  return canonicalizeRunResult(await abortLifecycle.run(applyRunEvaluation(runResult, options.evaluate)));
563
1234
  } catch (error: unknown) {
564
1235
  throw abortLifecycle.translateError(error);
565
1236
  } finally {
1237
+ failureInstancesByChildRunId.clear();
566
1238
  abortLifecycle.cleanup();
567
1239
  }
568
1240
  }
@@ -605,7 +1277,61 @@ function finalEventWithEvaluation(event: FinalEvent, evaluation: RunEvaluation):
605
1277
  };
606
1278
  }
607
1279
 
608
- function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
1280
/**
 * Runs a protocol with optional tracing and metrics wrapped around it.
 *
 * Opens tracing and metrics from the supplied options, routes every emitted
 * event through both before forwarding it to the caller's `emit`, and closes
 * tracing/metrics once the protocol settles. Metrics are only closed when
 * `currentDepth` is `0` or unset. Failures are rethrown unchanged after the
 * close calls.
 *
 * @param options - Protocol options, optionally carrying tracer, parent span,
 *   metrics hook, logger and an event sink.
 * @returns The result produced by the selected protocol.
 */
async function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
  const { tracer, parentSpan, metricsHook, logger } = options;

  const tracing = openRunTracing({
    ...(tracer ? { tracer } : {}),
    ...(parentSpan ? { parentSpan } : {}),
    intent: options.intent,
    protocolKind: options.protocol.kind,
    tier: options.tier
  });
  const metrics = openRunMetrics({
    ...(metricsHook ? { metricsHook } : {}),
    ...(logger ? { logger } : {})
  });

  // Only install an event sink when something actually wants the events.
  let emitForProtocol: ((event: RunEvent) => void) | undefined;
  if (tracing || metrics || options.emit) {
    emitForProtocol = (event: RunEvent): void => {
      if (tracing) {
        handleTracingEvent(tracing, event);
      }
      if (metrics) {
        handleMetricsEvent(metrics, event);
      }
      options.emit?.(event);
    };
  }

  // Share the sub-run span map with the protocol so child runs can find their parent span.
  const protocolOptions = tracing
    ? { ...options, subRunSpansByChildId: tracing.subRunSpans }
    : options;

  const isRootRun = (): boolean => options.currentDepth === undefined || options.currentDepth === 0;

  try {
    const result = await runProtocolInner(protocolOptions, emitForProtocol);
    if (tracing) {
      closeRunTracing(tracing, result);
    }
    if (metrics && isRootRun()) {
      closeRunMetrics(metrics, result);
    }
    return result;
  } catch (error) {
    if (tracing) {
      closeRunTracing(tracing, undefined, error);
    }
    if (metrics && isRootRun()) {
      closeRunMetrics(metrics, undefined);
    }
    throw error;
  }
}
1330
+
1331
+ function runProtocolInner(
1332
+ options: RunProtocolOptions,
1333
+ emitForProtocol?: (event: RunEvent) => void
1334
+ ): Promise<RunResult> {
609
1335
  switch (options.protocol.kind) {
610
1336
  case "sequential":
611
1337
  return runSequential({
@@ -621,7 +1347,7 @@ function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
621
1347
  ...(options.signal !== undefined ? { signal: options.signal } : {}),
622
1348
  ...(options.terminate ? { terminate: options.terminate } : {}),
623
1349
  ...(options.wrapUpHint ? { wrapUpHint: options.wrapUpHint } : {}),
624
- ...(options.emit ? { emit: options.emit } : {})
1350
+ ...(emitForProtocol ? { emit: emitForProtocol } : {})
625
1351
  });
626
1352
  case "broadcast":
627
1353
  return runBroadcast({
@@ -637,7 +1363,7 @@ function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
637
1363
  ...(options.signal !== undefined ? { signal: options.signal } : {}),
638
1364
  ...(options.terminate ? { terminate: options.terminate } : {}),
639
1365
  ...(options.wrapUpHint ? { wrapUpHint: options.wrapUpHint } : {}),
640
- ...(options.emit ? { emit: options.emit } : {})
1366
+ ...(emitForProtocol ? { emit: emitForProtocol } : {})
641
1367
  });
642
1368
  case "coordinator":
643
1369
  return runCoordinator({
@@ -653,7 +1379,31 @@ function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
653
1379
  ...(options.signal !== undefined ? { signal: options.signal } : {}),
654
1380
  ...(options.terminate ? { terminate: options.terminate } : {}),
655
1381
  ...(options.wrapUpHint ? { wrapUpHint: options.wrapUpHint } : {}),
656
- ...(options.emit ? { emit: options.emit } : {})
1382
+ ...(emitForProtocol ? { emit: emitForProtocol } : {}),
1383
+ ...(options.streamEvents !== undefined ? { streamEvents: options.streamEvents } : {}),
1384
+ currentDepth: options.currentDepth ?? 0,
1385
+ effectiveMaxDepth: options.effectiveMaxDepth ?? Infinity,
1386
+ effectiveMaxConcurrentChildren: options.effectiveMaxConcurrentChildren ?? DEFAULT_MAX_CONCURRENT_CHILDREN,
1387
+ onChildFailure: options.onChildFailure ?? "continue",
1388
+ ...(options.parentDeadlineMs !== undefined ? { parentDeadlineMs: options.parentDeadlineMs } : {}),
1389
+ ...(options.defaultSubRunTimeoutMs !== undefined
1390
+ ? { defaultSubRunTimeoutMs: options.defaultSubRunTimeoutMs }
1391
+ : {}),
1392
+ ...(options.registerAbortDrain !== undefined ? { registerAbortDrain: options.registerAbortDrain } : {}),
1393
+ ...(options.failureInstancesByChildRunId !== undefined
1394
+ ? { failureInstancesByChildRunId: options.failureInstancesByChildRunId }
1395
+ : {}),
1396
+ runProtocol: (childInput) => {
1397
+ const { runId: childRunId, ...childProtocolInput } = childInput;
1398
+ const childParent = options.subRunSpansByChildId?.get(childRunId) ?? options.parentSpan;
1399
+ return runProtocol({
1400
+ ...childProtocolInput,
1401
+ protocol: normalizeProtocol(childProtocolInput.protocol),
1402
+ ...(options.tracer ? { tracer: options.tracer } : {}),
1403
+ ...(childParent ? { parentSpan: childParent } : {}),
1404
+ ...(options.logger ? { logger: options.logger } : {})
1405
+ });
1406
+ }
657
1407
  });
658
1408
  case "shared":
659
1409
  return runShared({
@@ -669,7 +1419,7 @@ function runProtocol(options: RunProtocolOptions): Promise<RunResult> {
669
1419
  ...(options.signal !== undefined ? { signal: options.signal } : {}),
670
1420
  ...(options.terminate ? { terminate: options.terminate } : {}),
671
1421
  ...(options.wrapUpHint ? { wrapUpHint: options.wrapUpHint } : {}),
672
- ...(options.emit ? { emit: options.emit } : {})
1422
+ ...(emitForProtocol ? { emit: emitForProtocol } : {})
673
1423
  });
674
1424
  }
675
1425
  }
@@ -722,13 +1472,33 @@ export function stream(options: DogpileOptions): StreamHandle {
722
1472
  * the ergonomic {@link RunResult} wrapper from the JSON-serializable
723
1473
  * {@link Trace} returned by a previous `run()`, `stream()`, or
724
1474
  * `Dogpile.pile()` call.
1475
+ *
1476
+ * Tracing and metrics: replay is intentionally tracing-free and metrics-free.
1477
+ * Even when an engine instance has been configured with a `tracer` or
1478
+ * `metricsHook` on its `EngineOptions`, calling this function emits no spans
1479
+ * or callbacks — replaying historical events with current timestamps would
1480
+ * confuse observability backends. See `docs/developer-usage.md`.
725
1481
  */
1482
+ // Tracing/metrics-free: replay never uses EngineOptions tracer or metricsHook.
726
1483
  export function replay(trace: Trace): RunResult {
727
1484
  const cost = trace.finalOutput.cost;
728
1485
  const lastEvent = trace.events.at(-1);
1486
+ // D-08 / D-10: rebuild accounting recursively from the saved trace and
1487
+ // verify every embedded sub-run's recorded accounting matches what the
1488
+ // child trace recomputes. Mismatches throw `invalid-configuration` with
1489
+ // `detail.reason: "trace-accounting-mismatch"`. No provider invocation.
1490
+ const accounting = recomputeAccountingFromTrace(trace);
1491
+ const replayThrow = resolveReplayTerminalThrow(trace);
1492
+ if (replayThrow) {
1493
+ throw replayThrow;
1494
+ }
729
1495
  const baseResult = {
730
1496
  output: trace.finalOutput.output,
731
- eventLog: createRunEventLog(trace.runId, trace.protocol, trace.events),
1497
+ eventLog: createRunEventLog(
1498
+ trace.runId,
1499
+ trace.protocol,
1500
+ synthesizeProviderEvents(trace, trace.providerCalls)
1501
+ ),
732
1502
  trace,
733
1503
  transcript: trace.transcript,
734
1504
  usage: createRunUsage(cost),
@@ -740,14 +1510,9 @@ export function replay(trace: Trace): RunResult {
740
1510
  agentsUsed: trace.agentsUsed,
741
1511
  events: trace.events
742
1512
  }),
743
- accounting: createRunAccounting({
744
- tier: trace.tier,
745
- ...(trace.budget.caps ? { budget: trace.budget.caps } : {}),
746
- ...(trace.budget.termination ? { termination: trace.budget.termination } : {}),
747
- cost,
748
- events: trace.events
749
- }),
750
- cost
1513
+ accounting,
1514
+ cost,
1515
+ health: computeHealth(trace, DEFAULT_HEALTH_THRESHOLDS)
751
1516
  };
752
1517
 
753
1518
  if (lastEvent?.type !== "final") {
@@ -761,17 +1526,178 @@ export function replay(trace: Trace): RunResult {
761
1526
  };
762
1527
  }
763
1528
 
1529
/**
 * Returns the event sequence for a replayed trace, adding model
 * request/response events when the saved trace contains none.
 *
 * If the trace already holds any `model-request` or `model-response` event,
 * `trace.events` is returned as-is. Otherwise a `model-request` /
 * `model-response` pair is inserted immediately before each `agent-turn`
 * event, built from the provider call at the same position (the n-th turn
 * uses `providerCalls[n]`). Turns without a matching provider call are left
 * without a synthesized pair.
 *
 * @param trace - The saved trace being replayed.
 * @param providerCalls - Recorded provider calls, matched to turns by index.
 * @returns The original events, or a new array with synthesized events added.
 */
function synthesizeProviderEvents(
  trace: Trace,
  providerCalls: readonly ReplayTraceProviderCall[]
): readonly RunEvent[] {
  const hasLiveProvenance = trace.events.some(
    (event) => event.type === "model-request" || event.type === "model-response"
  );
  if (hasLiveProvenance) {
    return trace.events;
  }

  // Past this point the trace holds no model-request/model-response events,
  // so the events can be walked directly without filtering them out first.
  const result: RunEvent[] = [];
  let turnCount = 0;

  for (const event of trace.events) {
    if (event.type === "agent-turn") {
      const call = providerCalls[turnCount];
      if (call !== undefined) {
        // Fall back to the provider id when the call has no usable model id.
        const modelId =
          typeof call.modelId === "string" && call.modelId.length > 0 ? call.modelId : call.providerId;
        result.push({
          type: "model-request",
          runId: trace.runId,
          callId: call.callId,
          providerId: call.providerId,
          modelId,
          startedAt: call.startedAt,
          agentId: call.agentId,
          role: call.role,
          request: call.request
        });
        result.push({
          type: "model-response",
          runId: trace.runId,
          callId: call.callId,
          providerId: call.providerId,
          modelId,
          startedAt: call.startedAt,
          completedAt: call.completedAt,
          agentId: call.agentId,
          role: call.role,
          response: call.response
        });
      }
      // The turn index advances even when no provider call matched.
      turnCount += 1;
    }
    result.push(event);
  }

  return result;
}
1582
+
1583
/**
 * Decides whether a finished live run should throw, and with which error.
 *
 * When the trace records `triggeringFailureForAbortMode`, the live error
 * instance registered for that child run is returned (or `null` when none was
 * registered). Otherwise an error is returned only when the last event is a
 * `final` event with a termination, a registered sub-run failure exists, and
 * no final-synthesis decision follows that failure.
 *
 * @param trace - The trace of the run that just finished.
 * @param failureInstancesByChildRunId - Live error instances keyed by child run id.
 * @returns The error to throw, or `null` when the run should return normally.
 */
function resolveRuntimeTerminalThrow(
  trace: Trace,
  failureInstancesByChildRunId: ReadonlyMap<string, DogpileError>
): DogpileError | null {
  const trigger = trace.triggeringFailureForAbortMode;
  if (trigger !== undefined) {
    return failureInstancesByChildRunId.get(trigger.childRunId) ?? null;
  }

  const tail = trace.events.at(-1);
  const endedWithTermination = tail?.type === "final" && tail.termination !== undefined;
  if (!endedWithTermination) {
    return null;
  }

  const failure = findLastRealFailure(trace.events, failureInstancesByChildRunId);
  if (failure === null || hasFinalSynthesisAfterEvent(trace, failure.eventIndex)) {
    return null;
  }
  return failure.error;
}
1605
+
1606
/**
 * Finds the most recent `sub-run-failed` event that has a registered live
 * error instance.
 *
 * Events are scanned from the end; failed sub-runs without an entry in the
 * map are skipped.
 *
 * @param events - Events of the run, in emission order.
 * @param failureInstancesByChildRunId - Live error instances keyed by child run id.
 * @returns The error and the index of its event, or `null` when none matches.
 */
function findLastRealFailure(
  events: readonly RunEvent[],
  failureInstancesByChildRunId: ReadonlyMap<string, DogpileError>
): { readonly error: DogpileError; readonly eventIndex: number } | null {
  let cursor = events.length;

  while (cursor > 0) {
    cursor -= 1;
    const candidate = events[cursor];
    if (candidate?.type === "sub-run-failed") {
      const registered = failureInstancesByChildRunId.get(candidate.childRunId);
      if (registered) {
        return { error: registered, eventIndex: cursor };
      }
    }
  }

  return null;
}
1622
+
1623
/**
 * Decides whether replaying a saved trace should throw, and with which error.
 *
 * When the trace records `triggeringFailureForAbortMode`, an error is rebuilt
 * from its serialized payload. Otherwise an error is returned only when the
 * last event is a `final` event with a termination, a non-synthetic sub-run
 * failure exists, and no final-synthesis decision follows that failure.
 *
 * @param trace - The saved trace being replayed.
 * @returns The error to throw, or `null` when replay should return normally.
 */
function resolveReplayTerminalThrow(trace: Trace): DogpileError | null {
  const trigger = trace.triggeringFailureForAbortMode;
  if (trigger !== undefined) {
    return dogpileErrorFromSerializedPayload(trigger.error);
  }

  const tail = trace.events.at(-1);
  const endedWithTermination = tail?.type === "final" && tail.termination !== undefined;
  if (!endedWithTermination) {
    return null;
  }

  const failure = reconstructLastRealFailure(trace.events);
  if (failure === null || hasFinalSynthesisAfterEvent(trace, failure.eventIndex)) {
    return null;
  }
  return failure.error;
}
1642
+
1643
/**
 * Finds the most recent non-synthetic `sub-run-failed` event and rebuilds its
 * error from the serialized payload stored on the event.
 *
 * @param events - Events of the saved trace, in emission order.
 * @returns The rebuilt error and the index of its event, or `null` when the
 *   trace holds no non-synthetic sub-run failure.
 */
function reconstructLastRealFailure(
  events: readonly RunEvent[]
): { readonly error: DogpileError; readonly eventIndex: number } | null {
  let cursor = events.length;

  while (cursor > 0) {
    cursor -= 1;
    const candidate = events[cursor];
    if (candidate?.type === "sub-run-failed" && !isSyntheticSubRunFailure(candidate)) {
      return { error: dogpileErrorFromSerializedPayload(candidate.error), eventIndex: cursor };
    }
  }

  return null;
}
1655
+
1656
/**
 * Reports whether the trace holds a `final-synthesis` protocol decision whose
 * event index is strictly greater than `eventIndex`.
 *
 * @param trace - The trace whose protocol decisions are inspected.
 * @param eventIndex - The event index the decision must come after.
 * @returns `true` when such a decision exists, otherwise `false`.
 */
function hasFinalSynthesisAfterEvent(trace: Trace, eventIndex: number): boolean {
  for (const decision of trace.protocolDecisions) {
    if (decision.phase === "final-synthesis" && decision.eventIndex > eventIndex) {
      return true;
    }
  }
  return false;
}
1661
+
1662
/**
 * Reports whether a sub-run failure's error detail carries the reason
 * `"sibling-failed"` or `"parent-aborted"`.
 *
 * @param event - The `sub-run-failed` event to classify.
 * @returns `true` for those two reasons, `false` for anything else or when no
 *   detail is present.
 */
function isSyntheticSubRunFailure(event: SubRunFailedEvent): boolean {
  switch (event.error.detail?.["reason"]) {
    case "sibling-failed":
    case "parent-aborted":
      return true;
    default:
      return false;
  }
}
1666
+
1667
/**
 * Builds a `DogpileError` from a serialized error payload.
 *
 * `code` and `message` are always forwarded; `providerId` and `detail` are
 * only passed along when the payload defines them. The code is taken on trust:
 * it is cast to `DogpileErrorCode` without being validated.
 *
 * @param input - Serialized error fields.
 * @returns A new `DogpileError` carrying those fields.
 */
function dogpileErrorFromSerializedPayload(input: {
  readonly code: string;
  readonly message: string;
  readonly providerId?: string;
  readonly detail?: JsonObject;
}): DogpileError {
  const { code, message, providerId, detail } = input;
  const providerPart = providerId === undefined ? {} : { providerId };
  const detailPart = detail === undefined ? {} : { detail };

  return new DogpileError({
    code: code as DogpileErrorCode,
    message,
    ...providerPart,
    ...detailPart
  });
}
1680
+
764
1681
  /**
765
1682
  * Replay a saved completed trace as a stream without invoking a model provider.
766
1683
  *
767
1684
  * @remarks
768
- * This is the streaming counterpart to {@link replay}. It yields the exact
769
- * saved {@link Trace.events} in order and resolves {@link StreamHandle.result}
770
- * to the rehydrated {@link RunResult}. Since all data comes from the trace,
771
- * replay remains storage-free and provider-free.
1685
+ * This is the streaming counterpart to {@link replay}. It yields the same
1686
+ * event sequence exposed by the replayed result event log, including legacy
1687
+ * provenance synthesis when a saved trace predates model request/response
1688
+ * events. Since all data comes from the trace, replay remains storage-free and
1689
+ * provider-free.
1690
+ *
1691
+ * Tracing and metrics: replayStream is intentionally tracing-free and
1692
+ * metrics-free. Even when an engine instance has been configured with a
1693
+ * `tracer` or `metricsHook` on its `EngineOptions`, calling this function
1694
+ * emits no spans or callbacks — replaying historical events with current
1695
+ * timestamps would confuse observability backends. See `docs/developer-usage.md`.
772
1696
  */
1697
+ // Tracing/metrics-free: replayStream never uses EngineOptions tracer or metricsHook.
773
1698
  export function replayStream(trace: Trace): StreamHandle {
774
1699
  const result = Promise.resolve(replay(trace));
1700
+ const replayEvents = replayStreamEvents(trace);
775
1701
 
776
1702
  return {
777
1703
  get status(): StreamHandleStatus {
@@ -782,7 +1708,7 @@ export function replayStream(trace: Trace): StreamHandle {
782
1708
  // Replay streams are already completed snapshots, so cancellation is a no-op.
783
1709
  },
784
1710
  subscribe(subscriber: StreamEventSubscriber) {
785
- for (const event of trace.events) {
1711
+ for (const event of replayEvents) {
786
1712
  subscriber(event);
787
1713
  }
788
1714
 
@@ -797,7 +1723,7 @@ export function replayStream(trace: Trace): StreamHandle {
797
1723
 
798
1724
  return {
799
1725
  next(): Promise<IteratorResult<StreamEvent>> {
800
- const event = trace.events[index];
1726
+ const event = replayEvents[index];
801
1727
  if (event) {
802
1728
  index += 1;
803
1729
  return Promise.resolve({ done: false, value: event });
@@ -810,6 +1736,31 @@ export function replayStream(trace: Trace): StreamHandle {
810
1736
  };
811
1737
  }
812
1738
 
1739
/**
 * Flattens a saved trace into the event sequence a replayed stream yields.
 *
 * Events come from `synthesizeProviderEvents`. For every `sub-run-completed`
 * event, the embedded child trace is expanded recursively and its events are
 * placed immediately before the `sub-run-completed` event itself. Each event
 * is passed through `wrapReplayStreamEvent` with the ancestor run ids of the
 * trace it belongs to.
 *
 * Child events are appended one at a time rather than spread into a single
 * `push(...)` call: spreading turns every element into a call argument, which
 * throws a `RangeError` once a child trace is large enough.
 *
 * @param trace - The saved trace to flatten.
 * @param parentRunIds - Run ids of the ancestors of `trace`, outermost first.
 * @returns The flattened events in replay order.
 */
function replayStreamEvents(trace: Trace, parentRunIds: readonly string[] = []): StreamEvent[] {
  const events: StreamEvent[] = [];

  for (const event of synthesizeProviderEvents(trace, trace.providerCalls)) {
    if (event.type === "sub-run-completed") {
      const childEvents = replayStreamEvents(event.subResult.trace, [...parentRunIds, trace.runId]);
      for (const childEvent of childEvents) {
        events.push(childEvent);
      }
    }
    events.push(wrapReplayStreamEvent(event, parentRunIds));
  }

  return events;
}
1751
+
1752
/**
 * Tags an event with the run ids of its ancestor runs.
 *
 * With no ancestors the event is returned unchanged (same object). Otherwise
 * a shallow copy is returned whose `parentRunIds` is the given ancestors
 * followed by any `parentRunIds` the event already carried.
 *
 * @param event - The event to tag.
 * @param parentRunIds - Ancestor run ids, outermost first.
 * @returns The original event, or a tagged copy.
 */
function wrapReplayStreamEvent(event: RunEvent, parentRunIds: readonly string[]): StreamEvent {
  if (parentRunIds.length === 0) {
    return event;
  }

  const alreadyTagged = (event as { readonly parentRunIds?: readonly string[] }).parentRunIds ?? [];
  const lineage = [...parentRunIds, ...alreadyTagged];
  return { ...event, parentRunIds: lineage } as StreamEvent;
}
1763
+
813
1764
  function wireCallerAbortSignal(
814
1765
  callerSignal: AbortSignal | undefined,
815
1766
  abortController: AbortController,
@@ -844,7 +1795,8 @@ function createStreamCancellationError(providerId: string, cause?: unknown): Dog
844
1795
  providerId,
845
1796
  ...(cause !== undefined ? { cause } : {}),
846
1797
  detail: {
847
- status: "cancelled"
1798
+ status: "cancelled",
1799
+ reason: "parent-aborted"
848
1800
  }
849
1801
  });
850
1802
  }
@@ -865,6 +1817,23 @@ function withHighLevelDefaults(options: DogpileOptions): NormalizedDogpileOption
865
1817
  };
866
1818
  }
867
1819
 
1820
/**
 * Rejects a per-run limit that is higher than the engine-level limit.
 *
 * Does nothing when `runValue` is `undefined` or is less than or equal to
 * `engineValue`.
 *
 * @param path - Option path reported in the error message and detail.
 * @param runValue - The per-run value, if one was supplied.
 * @param engineValue - The engine-level ceiling.
 * @throws DogpileError with code `"invalid-configuration"` when `runValue` is
 *   defined and not less than or equal to `engineValue`.
 */
function assertRunDoesNotRaiseEngineMax(path: string, runValue: number | undefined, engineValue: number): void {
  // Written as a negated `<=` so that a NaN run value is still rejected.
  const raisesCeiling = runValue !== undefined && !(runValue <= engineValue);
  if (!raisesCeiling) {
    return;
  }

  const detail = {
    kind: "configuration-validation",
    path,
    expected: `integer <= ${engineValue}`,
    actual: runValue
  };
  throw new DogpileError({
    code: "invalid-configuration",
    message: `${path} cannot raise the engine ceiling (${engineValue}).`,
    retryable: false,
    detail
  });
}
1836
+
868
1837
  /**
869
1838
  * Branded high-level SDK namespace.
870
1839
  *