@dogpile/sdk 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +201 -0
- package/README.md +1 -0
- package/dist/browser/index.js +2328 -237
- package/dist/browser/index.js.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/providers/openai-compatible.d.ts +11 -0
- package/dist/providers/openai-compatible.d.ts.map +1 -1
- package/dist/providers/openai-compatible.js +88 -2
- package/dist/providers/openai-compatible.js.map +1 -1
- package/dist/runtime/audit.d.ts +42 -0
- package/dist/runtime/audit.d.ts.map +1 -0
- package/dist/runtime/audit.js +73 -0
- package/dist/runtime/audit.js.map +1 -0
- package/dist/runtime/broadcast.d.ts.map +1 -1
- package/dist/runtime/broadcast.js +39 -36
- package/dist/runtime/broadcast.js.map +1 -1
- package/dist/runtime/cancellation.d.ts +26 -0
- package/dist/runtime/cancellation.d.ts.map +1 -1
- package/dist/runtime/cancellation.js +38 -1
- package/dist/runtime/cancellation.js.map +1 -1
- package/dist/runtime/coordinator.d.ts +79 -1
- package/dist/runtime/coordinator.d.ts.map +1 -1
- package/dist/runtime/coordinator.js +979 -61
- package/dist/runtime/coordinator.js.map +1 -1
- package/dist/runtime/decisions.d.ts +25 -3
- package/dist/runtime/decisions.d.ts.map +1 -1
- package/dist/runtime/decisions.js +241 -3
- package/dist/runtime/decisions.js.map +1 -1
- package/dist/runtime/defaults.d.ts +37 -1
- package/dist/runtime/defaults.d.ts.map +1 -1
- package/dist/runtime/defaults.js +359 -4
- package/dist/runtime/defaults.js.map +1 -1
- package/dist/runtime/engine.d.ts +17 -4
- package/dist/runtime/engine.d.ts.map +1 -1
- package/dist/runtime/engine.js +770 -35
- package/dist/runtime/engine.js.map +1 -1
- package/dist/runtime/health.d.ts +51 -0
- package/dist/runtime/health.d.ts.map +1 -0
- package/dist/runtime/health.js +85 -0
- package/dist/runtime/health.js.map +1 -0
- package/dist/runtime/introspection.d.ts +96 -0
- package/dist/runtime/introspection.d.ts.map +1 -0
- package/dist/runtime/introspection.js +31 -0
- package/dist/runtime/introspection.js.map +1 -0
- package/dist/runtime/metrics.d.ts +44 -0
- package/dist/runtime/metrics.d.ts.map +1 -0
- package/dist/runtime/metrics.js +12 -0
- package/dist/runtime/metrics.js.map +1 -0
- package/dist/runtime/model.d.ts.map +1 -1
- package/dist/runtime/model.js +34 -7
- package/dist/runtime/model.js.map +1 -1
- package/dist/runtime/provenance.d.ts +25 -0
- package/dist/runtime/provenance.d.ts.map +1 -0
- package/dist/runtime/provenance.js +13 -0
- package/dist/runtime/provenance.js.map +1 -0
- package/dist/runtime/sequential.d.ts.map +1 -1
- package/dist/runtime/sequential.js +47 -37
- package/dist/runtime/sequential.js.map +1 -1
- package/dist/runtime/shared.d.ts.map +1 -1
- package/dist/runtime/shared.js +39 -36
- package/dist/runtime/shared.js.map +1 -1
- package/dist/runtime/tracing.d.ts +31 -0
- package/dist/runtime/tracing.d.ts.map +1 -0
- package/dist/runtime/tracing.js +18 -0
- package/dist/runtime/tracing.js.map +1 -0
- package/dist/runtime/validation.d.ts +10 -0
- package/dist/runtime/validation.d.ts.map +1 -1
- package/dist/runtime/validation.js +73 -0
- package/dist/runtime/validation.js.map +1 -1
- package/dist/types/events.d.ts +339 -12
- package/dist/types/events.d.ts.map +1 -1
- package/dist/types/replay.d.ts +7 -1
- package/dist/types/replay.d.ts.map +1 -1
- package/dist/types.d.ts +255 -6
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +39 -1
- package/src/index.ts +15 -0
- package/src/providers/openai-compatible.ts +83 -3
- package/src/runtime/audit.ts +121 -0
- package/src/runtime/broadcast.ts +40 -37
- package/src/runtime/cancellation.ts +59 -1
- package/src/runtime/coordinator.ts +1221 -61
- package/src/runtime/decisions.ts +307 -4
- package/src/runtime/defaults.ts +389 -4
- package/src/runtime/engine.ts +1004 -35
- package/src/runtime/health.ts +136 -0
- package/src/runtime/introspection.ts +122 -0
- package/src/runtime/metrics.ts +45 -0
- package/src/runtime/model.ts +38 -6
- package/src/runtime/provenance.ts +43 -0
- package/src/runtime/sequential.ts +49 -38
- package/src/runtime/shared.ts +40 -37
- package/src/runtime/tracing.ts +35 -0
- package/src/runtime/validation.ts +81 -0
- package/src/types/events.ts +369 -12
- package/src/types/replay.ts +14 -1
- package/src/types.ts +279 -4
|
@@ -1,22 +1,31 @@
|
|
|
1
|
+
import { DogpileError } from "../types.js";
|
|
1
2
|
import type {
|
|
2
3
|
AgentSpec,
|
|
3
4
|
ConfiguredModelProvider,
|
|
4
5
|
CoordinatorProtocolConfig,
|
|
5
6
|
CostSummary,
|
|
7
|
+
DelegateAgentDecision,
|
|
6
8
|
DogpileOptions,
|
|
7
9
|
JsonObject,
|
|
8
10
|
JsonValue,
|
|
9
11
|
ModelRequest,
|
|
10
12
|
ModelResponse,
|
|
13
|
+
ProtocolSelection,
|
|
11
14
|
ReplayTraceProtocolDecision,
|
|
12
15
|
ReplayTraceProviderCall,
|
|
13
16
|
RuntimeTool,
|
|
14
17
|
RuntimeToolExecutor,
|
|
15
18
|
RunEvent,
|
|
16
19
|
RunResult,
|
|
20
|
+
SubRunBudgetClampedEvent,
|
|
21
|
+
SubRunConcurrencyClampedEvent,
|
|
22
|
+
SubRunFailedEvent,
|
|
23
|
+
SubRunQueuedEvent,
|
|
24
|
+
SubRunParentAbortedEvent,
|
|
17
25
|
TerminationCondition,
|
|
18
26
|
TerminationStopRecord,
|
|
19
27
|
Tier,
|
|
28
|
+
Trace,
|
|
20
29
|
TranscriptEntry
|
|
21
30
|
} from "../types.js";
|
|
22
31
|
import { createRunId, elapsedMs, nowMs, providerCallIdFor } from "./ids.js";
|
|
@@ -34,15 +43,68 @@ import {
|
|
|
34
43
|
createRunUsage,
|
|
35
44
|
createTranscriptLink,
|
|
36
45
|
emptyCost,
|
|
46
|
+
lastCostBearingEventCost,
|
|
37
47
|
nextProviderCallId
|
|
38
48
|
} from "./defaults.js";
|
|
39
|
-
import {
|
|
40
|
-
import {
|
|
49
|
+
import { computeHealth, DEFAULT_HEALTH_THRESHOLDS } from "./health.js";
|
|
50
|
+
import {
|
|
51
|
+
classifyAbortReason,
|
|
52
|
+
classifyChildTimeoutSource,
|
|
53
|
+
createAbortErrorFromSignal,
|
|
54
|
+
createEngineDeadlineTimeoutError,
|
|
55
|
+
throwIfAborted
|
|
56
|
+
} from "./cancellation.js";
|
|
57
|
+
import { assertDepthWithinLimit, parseAgentDecision } from "./decisions.js";
|
|
41
58
|
import { generateModelTurn } from "./model.js";
|
|
42
59
|
import { evaluateTerminationStop, warnOnProtocolTerminationMisconfiguration } from "./termination.js";
|
|
43
60
|
import { createRuntimeToolExecutor, executeModelResponseToolRequests, runtimeToolAvailability } from "./tools.js";
|
|
44
61
|
import { createWrapUpHintController } from "./wrap-up.js";
|
|
45
62
|
|
|
63
|
+
/**
 * Callback to invoke a child run via the engine's `runProtocol` switch. Passed
 * in by `engine.ts` so coordinator avoids a circular import.
 *
 * Takes a single options object describing the child run and resolves with
 * that run's `RunResult`.
 */
export type RunProtocolFn = (input: {
  /**
   * Planned child run id emitted on sub-run lifecycle events before dispatch.
   * The engine callback uses this to look up the matching sub-run span.
   */
  readonly runId: string;
  readonly intent: string;
  readonly protocol: ProtocolSelection;
  readonly tier: Tier;
  readonly model: ConfiguredModelProvider;
  readonly agents: readonly AgentSpec[];
  readonly tools: readonly RuntimeTool<JsonObject, JsonValue>[];
  readonly temperature: number;
  readonly budget?: DogpileOptions["budget"];
  readonly seed?: string | number;
  readonly signal?: AbortSignal;
  readonly terminate?: TerminationCondition;
  readonly wrapUpHint?: DogpileOptions["wrapUpHint"];
  readonly emit?: (event: RunEvent) => void;
  readonly streamEvents?: boolean;
  // NOTE(review): the three fields below mirror the same-named
  // CoordinatorRunOptions fields; presumably forwarded unchanged — confirm in engine.ts.
  readonly currentDepth?: number;
  readonly effectiveMaxDepth?: number;
  readonly effectiveMaxConcurrentChildren?: number;
  readonly onChildFailure?: DogpileOptions["onChildFailure"];
  /**
   * Root-run deadline (epoch ms). Children inherit `parentDeadlineMs - now()`
   * as their default timeout window so a depth-N child sees the ROOT's deadline,
   * not its immediate parent's freshly-computed value (BUDGET-02 / D-12).
   */
  readonly parentDeadlineMs?: number;
  /**
   * Engine-level fallback sub-run timeout (BUDGET-02 / D-14). Applied only when
   * neither the parent nor the decision specifies a `budget.timeoutMs`.
   */
  readonly defaultSubRunTimeoutMs?: number;
  /**
   * Hook through which the invoked run can hand back an `AbortDrainFn`.
   * `runCoordinator` calls it once, if provided, with its parent-abort drain.
   */
  readonly registerAbortDrain?: (drain: AbortDrainFn) => void;
  // NOTE(review): presumably maps a child run id to the original DogpileError
  // instance raised by that child — confirm against the engine implementation.
  readonly failureInstancesByChildRunId?: Map<string, DogpileError>;
}) => Promise<RunResult>;
|
|
105
|
+
|
|
106
|
+
/**
 * Shape of a drain callback passed to `registerAbortDrain`. It receives an
 * optional, untyped abort reason and returns nothing. The coordinator's own
 * drain classifies that reason and emits a `sub-run-failed` event for every
 * dispatched child that has not been closed yet.
 */
export type AbortDrainFn = (reason?: unknown) => void;
|
|
107
|
+
|
|
46
108
|
interface CoordinatorRunOptions {
|
|
47
109
|
readonly intent: string;
|
|
48
110
|
readonly protocol: CoordinatorProtocolConfig;
|
|
@@ -57,6 +119,121 @@ interface CoordinatorRunOptions {
|
|
|
57
119
|
readonly terminate?: TerminationCondition;
|
|
58
120
|
readonly wrapUpHint?: DogpileOptions["wrapUpHint"];
|
|
59
121
|
readonly emit?: (event: RunEvent) => void;
|
|
122
|
+
readonly streamEvents?: boolean;
|
|
123
|
+
/**
|
|
124
|
+
* Recursion depth of this coordinator run. Top-level callers pass 0; child
|
|
125
|
+
* sub-runs receive parent depth + 1 from the dispatch loop.
|
|
126
|
+
*/
|
|
127
|
+
readonly currentDepth?: number;
|
|
128
|
+
/**
|
|
129
|
+
* Effective max recursion depth resolved at run start. Plan 04 enforces;
|
|
130
|
+
* Plan 03 only plumbs the value.
|
|
131
|
+
*/
|
|
132
|
+
readonly effectiveMaxDepth?: number;
|
|
133
|
+
readonly effectiveMaxConcurrentChildren?: number;
|
|
134
|
+
readonly onChildFailure?: DogpileOptions["onChildFailure"];
|
|
135
|
+
/**
|
|
136
|
+
* Engine `runProtocol` callback used by the delegate dispatch loop to
|
|
137
|
+
* recursively run a child protocol. Optional so unit tests that exercise
|
|
138
|
+
* the coordinator without the engine wrapper still typecheck — when omitted,
|
|
139
|
+
* delegate dispatch falls back to throwing `invalid-configuration`.
|
|
140
|
+
*/
|
|
141
|
+
readonly runProtocol?: RunProtocolFn;
|
|
142
|
+
/**
|
|
143
|
+
* Root-run deadline (epoch ms) threaded through every recursive coordinator
|
|
144
|
+
* dispatch (BUDGET-02 / D-12). When set, sub-run dispatches compute their
|
|
145
|
+
* `remainingMs = parentDeadlineMs - Date.now()` against this deadline rather
|
|
146
|
+
* than the parent's full `budget.timeoutMs` window.
|
|
147
|
+
*/
|
|
148
|
+
readonly parentDeadlineMs?: number;
|
|
149
|
+
/**
|
|
150
|
+
* Engine-level fallback sub-run timeout (BUDGET-02 / D-14). Applied only when
|
|
151
|
+
* neither the parent nor the decision specifies a `budget.timeoutMs`.
|
|
152
|
+
*/
|
|
153
|
+
readonly defaultSubRunTimeoutMs?: number;
|
|
154
|
+
readonly registerAbortDrain?: (drain: AbortDrainFn) => void;
|
|
155
|
+
readonly failureInstancesByChildRunId?: Map<string, DogpileError>;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
 * Hard-coded loop guard for the delegate dispatch in the coordinator plan
 * turn. The dispatch loop keeps a running count of children delegated across
 * re-issued plan turns; when a new decision would push that count above this
 * value the coordinator throws `invalid-configuration` with
 * `reason: "loop-guard-exceeded"` (T-03-01). Not a public option.
 */
const MAX_DISPATCH_PER_TURN = 8;
/**
 * Fallback cap on concurrently running child sub-runs, used when the run
 * options do not provide `effectiveMaxConcurrentChildren`.
 */
const DEFAULT_MAX_CONCURRENT_CHILDREN = 4;
|
|
165
|
+
|
|
166
|
+
/**
 * Record of one failed child sub-run within a dispatch wave. The coordinator
 * reads these from each dispatched child's `failure` slot, folds `error.code`,
 * `error.message` and `partialCost.usd` into the tagged text fed back to the
 * next plan turn, and — when `onChildFailure` is `"abort"` — keeps the first
 * one of the wave as the trace's `triggeringFailureForAbortMode`.
 */
type DispatchWaveFailure = {
  readonly childRunId: string;
  readonly intent: string;
  /** Serializable error summary; `detail.reason` is optional. */
  readonly error: {
    readonly code: string;
    readonly message: string;
    readonly detail?: { readonly reason?: string };
  };
  /** Spend attributed to the child before it failed; rendered with three decimals in the tagged text. */
  readonly partialCost: { readonly usd: number };
};
|
|
176
|
+
|
|
177
|
+
/**
 * Minimal counting semaphore used to bound how many child sub-runs execute at
 * the same time within one dispatch wave.
 */
interface Semaphore {
  /** Resolves once the caller holds a slot. Pair every `acquire` with exactly one `release`. */
  acquire(): Promise<void>;
  /** Gives the caller's slot back and hands it to the oldest waiter, if any. */
  release(): void;
  /** Number of slots currently held. */
  readonly inFlight: number;
  /** Number of acquirers still waiting for a slot. */
  readonly queued: number;
}

/**
 * Create a FIFO counting semaphore that admits at most `maxConcurrent` holders.
 *
 * The limit is floored at 1. With a limit of 0, a negative number, or NaN the
 * first `acquire()` would be parked with no holder able to release it, hanging
 * the caller forever; admitting one holder at a time is the safest fallback.
 * `Number.POSITIVE_INFINITY` is kept as-is and means "unbounded".
 *
 * @param maxConcurrent Maximum number of simultaneous holders.
 * @returns A semaphore whose waiters are resumed in arrival order.
 */
function createSemaphore(maxConcurrent: number): Semaphore {
  // Math.max(1, NaN) is NaN, so NaN has to be handled explicitly.
  const limit = Number.isNaN(maxConcurrent) ? 1 : Math.max(1, maxConcurrent);
  let inFlight = 0;
  const waiters: Array<() => void> = [];
  return {
    acquire(): Promise<void> {
      if (inFlight < limit) {
        inFlight += 1;
        return Promise.resolve();
      }
      return new Promise<void>((resolve) => {
        // The slot is counted when the waiter is resumed, not when it is queued.
        waiters.push(() => {
          inFlight += 1;
          resolve();
        });
      });
    },
    release(): void {
      // Ignore an unmatched release instead of letting the counter go negative,
      // which would silently raise the effective concurrency limit.
      if (inFlight > 0) {
        inFlight -= 1;
      }
      const next = waiters.shift();
      if (next !== undefined) {
        next();
      }
    },
    get inFlight(): number {
      return inFlight;
    },
    get queued(): number {
      return waiters.length;
    }
  };
}
|
|
215
|
+
|
|
216
|
+
/**
 * Return the first provider in the coordinator's active set whose
 * `metadata.locality` is `"local"`, or `undefined` when there is none.
 *
 * Lookup order: `options.model` is checked first, then each entry of
 * `options.agents` in declaration order. `AgentSpec` does not declare a
 * `model` field today (Phase 3 D-11 forward-compat scaffolding), so the agent
 * pass reads it through a structural cast and finds nothing until a future
 * phase adds `AgentSpec.model`.
 */
function findFirstLocalProvider(options: CoordinatorRunOptions): ConfiguredModelProvider | undefined {
  const primary = options.model;
  if (primary.metadata?.locality === "local") {
    return primary;
  }

  // Forward-compat: AgentSpec.model is not declared yet (Phase 3 D-11), so this
  // pass is a no-op today and becomes live once agents can carry their own model.
  for (const agent of options.agents) {
    const { model: candidate } = agent as { readonly model?: ConfiguredModelProvider };
    if (candidate?.metadata?.locality === "local") {
      return candidate;
    }
  }

  return undefined;
}
|
|
61
238
|
|
|
62
239
|
export async function runCoordinator(options: CoordinatorRunOptions): Promise<RunResult> {
|
|
@@ -65,13 +242,16 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
65
242
|
const transcript: TranscriptEntry[] = [];
|
|
66
243
|
const protocolDecisions: ReplayTraceProtocolDecision[] = [];
|
|
67
244
|
const providerCalls: ReplayTraceProviderCall[] = [];
|
|
245
|
+
const dispatchedChildren = new Map<string, DispatchedChild>();
|
|
68
246
|
let totalCost = emptyCost();
|
|
247
|
+
let concurrencyClampEmitted = false; // D-12: emit once per run, never per-engine.
|
|
69
248
|
const maxTurns = options.protocol.maxTurns ?? options.agents.length;
|
|
70
249
|
const activeAgents = options.agents.slice(0, maxTurns);
|
|
71
250
|
const coordinator = activeAgents[0];
|
|
72
251
|
const startedAtMs = nowMs();
|
|
73
252
|
let stopped = false;
|
|
74
253
|
let termination: TerminationStopRecord | undefined;
|
|
254
|
+
let triggeringFailureForAbortMode: DispatchWaveFailure | undefined;
|
|
75
255
|
const wrapUpHint = createWrapUpHintController({
|
|
76
256
|
protocol: options.protocol,
|
|
77
257
|
tier: options.tier,
|
|
@@ -96,6 +276,63 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
96
276
|
);
|
|
97
277
|
};
|
|
98
278
|
|
|
279
|
+
const drainOnParentAbort = (reasonSource?: unknown): void => {
|
|
280
|
+
const reason = classifyAbortReason(reasonSource);
|
|
281
|
+
for (const child of dispatchedChildren.values()) {
|
|
282
|
+
if (child.closed) {
|
|
283
|
+
continue;
|
|
284
|
+
}
|
|
285
|
+
const partialCost = child.started
|
|
286
|
+
? lastCostBearingEventCost(child.childEvents) ?? emptyCost()
|
|
287
|
+
: emptyCost();
|
|
288
|
+
const partialTrace = buildPartialTrace({
|
|
289
|
+
childRunId: child.childRunId,
|
|
290
|
+
events: [...child.childEvents],
|
|
291
|
+
startedAtMs: child.startedAtMs,
|
|
292
|
+
protocol: child.decision.protocol,
|
|
293
|
+
tier: options.tier,
|
|
294
|
+
modelProviderId: options.model.id,
|
|
295
|
+
agents: options.agents,
|
|
296
|
+
intent: child.decision.intent,
|
|
297
|
+
temperature: options.temperature,
|
|
298
|
+
...(child.childTimeoutMs !== undefined ? { childTimeoutMs: child.childTimeoutMs } : {}),
|
|
299
|
+
...(options.seed !== undefined ? { seed: options.seed } : {})
|
|
300
|
+
});
|
|
301
|
+
const failedEvent: SubRunFailedEvent = {
|
|
302
|
+
type: "sub-run-failed",
|
|
303
|
+
runId,
|
|
304
|
+
at: new Date().toISOString(),
|
|
305
|
+
childRunId: child.childRunId,
|
|
306
|
+
parentRunId: runId,
|
|
307
|
+
parentDecisionId: child.parentDecisionId,
|
|
308
|
+
parentDecisionArrayIndex: child.parentDecisionArrayIndex,
|
|
309
|
+
error: child.started
|
|
310
|
+
? {
|
|
311
|
+
code: "aborted",
|
|
312
|
+
message: "Parent run aborted.",
|
|
313
|
+
detail: {
|
|
314
|
+
reason
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
: {
|
|
318
|
+
code: "aborted",
|
|
319
|
+
message: "Sibling delegate failed; queued delegate never started.",
|
|
320
|
+
detail: {
|
|
321
|
+
reason: "sibling-failed"
|
|
322
|
+
}
|
|
323
|
+
},
|
|
324
|
+
partialTrace,
|
|
325
|
+
partialCost
|
|
326
|
+
};
|
|
327
|
+
child.closed = true;
|
|
328
|
+
totalCost = addCost(totalCost, partialCost);
|
|
329
|
+
emit(failedEvent);
|
|
330
|
+
recordProtocolDecision(failedEvent);
|
|
331
|
+
}
|
|
332
|
+
};
|
|
333
|
+
|
|
334
|
+
options.registerAbortDrain?.(drainOnParentAbort);
|
|
335
|
+
|
|
99
336
|
const toolExecutor = createRuntimeToolExecutor({
|
|
100
337
|
runId,
|
|
101
338
|
protocol: "coordinator",
|
|
@@ -126,24 +363,280 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
126
363
|
|
|
127
364
|
if (coordinator) {
|
|
128
365
|
if (!stopIfNeeded()) {
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
366
|
+
// Delegate dispatch loop (D-11/D-16/D-17/D-18). Phase 1 limits delegation
|
|
367
|
+
// to the coordinator's plan turn; workers cannot delegate. The loop
|
|
368
|
+
// re-issues the coordinator plan turn after each successful sub-run with
|
|
369
|
+
// the projected D-17 result tagged into the next prompt and a synthetic
|
|
370
|
+
// D-18 transcript entry already appended. `partialTrace` for failed
|
|
371
|
+
// sub-runs is captured via a tee'd emit buffer locally — `runProtocol`'s
|
|
372
|
+
// error contract is unchanged.
|
|
373
|
+
let dispatchInput = buildCoordinatorPlanInput(options.intent, coordinator);
|
|
374
|
+
let dispatchCount = 0;
|
|
375
|
+
while (true) {
|
|
376
|
+
const turnOutcome = await runCoordinatorTurn({
|
|
377
|
+
agent: coordinator,
|
|
378
|
+
coordinator,
|
|
379
|
+
input: dispatchInput,
|
|
380
|
+
phase: "plan",
|
|
381
|
+
options,
|
|
382
|
+
runId,
|
|
383
|
+
transcript,
|
|
384
|
+
totalCost,
|
|
385
|
+
providerCalls,
|
|
386
|
+
toolExecutor,
|
|
387
|
+
toolAvailability,
|
|
388
|
+
events,
|
|
389
|
+
startedAtMs,
|
|
390
|
+
wrapUpHint,
|
|
391
|
+
emit,
|
|
392
|
+
recordProtocolDecision
|
|
393
|
+
});
|
|
394
|
+
totalCost = turnOutcome.totalCost;
|
|
395
|
+
|
|
396
|
+
if (turnOutcome.decision === undefined) {
|
|
397
|
+
break;
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
const delegates = Array.isArray(turnOutcome.decision)
|
|
401
|
+
? turnOutcome.decision
|
|
402
|
+
: turnOutcome.decision.type === "delegate"
|
|
403
|
+
? [turnOutcome.decision]
|
|
404
|
+
: [];
|
|
405
|
+
if (delegates.length === 0) {
|
|
406
|
+
break;
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
if (dispatchCount + delegates.length > MAX_DISPATCH_PER_TURN) {
|
|
410
|
+
throw new DogpileError({
|
|
411
|
+
code: "invalid-configuration",
|
|
412
|
+
message: `Coordinator plan turn delegated ${delegates.length} more children after ${dispatchCount}; max is ${MAX_DISPATCH_PER_TURN}.`,
|
|
413
|
+
retryable: false,
|
|
414
|
+
detail: {
|
|
415
|
+
kind: "delegate-validation",
|
|
416
|
+
path: "decision",
|
|
417
|
+
reason: "loop-guard-exceeded",
|
|
418
|
+
maxDispatchPerTurn: MAX_DISPATCH_PER_TURN
|
|
419
|
+
}
|
|
420
|
+
});
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
const parentDecisionId = String(events.length - 1);
|
|
424
|
+
const parentDepth = options.currentDepth ?? 0;
|
|
425
|
+
const decisionMax = delegates.reduce(
|
|
426
|
+
(max, delegate) => Math.min(max, delegate.maxConcurrentChildren ?? Number.POSITIVE_INFINITY),
|
|
427
|
+
Number.POSITIVE_INFINITY
|
|
428
|
+
);
|
|
429
|
+
let effectiveForTurn = Math.min(
|
|
430
|
+
options.effectiveMaxConcurrentChildren ?? DEFAULT_MAX_CONCURRENT_CHILDREN,
|
|
431
|
+
decisionMax
|
|
432
|
+
);
|
|
433
|
+
const requestedMax = effectiveForTurn;
|
|
434
|
+
const localProvider = findFirstLocalProvider(options);
|
|
435
|
+
if (localProvider !== undefined) {
|
|
436
|
+
effectiveForTurn = 1;
|
|
437
|
+
if (!concurrencyClampEmitted) {
|
|
438
|
+
const clampEvent: SubRunConcurrencyClampedEvent = {
|
|
439
|
+
type: "sub-run-concurrency-clamped",
|
|
440
|
+
runId,
|
|
441
|
+
at: new Date().toISOString(),
|
|
442
|
+
requestedMax,
|
|
443
|
+
effectiveMax: 1,
|
|
444
|
+
reason: "local-provider-detected",
|
|
445
|
+
providerId: localProvider.id
|
|
446
|
+
};
|
|
447
|
+
emit(clampEvent);
|
|
448
|
+
recordProtocolDecision(clampEvent);
|
|
449
|
+
concurrencyClampEmitted = true;
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
const semaphore = createSemaphore(effectiveForTurn);
|
|
453
|
+
const childRunIds = delegates.map(() => createRunId());
|
|
454
|
+
const dispatchedForTurn = delegates.map((delegate, index): DispatchedChild => {
|
|
455
|
+
const childRunId = childRunIds[index];
|
|
456
|
+
if (childRunId === undefined) {
|
|
457
|
+
throw new Error("missing child run id");
|
|
458
|
+
}
|
|
459
|
+
const dispatchedChild: DispatchedChild = {
|
|
460
|
+
childRunId,
|
|
461
|
+
decision: delegate,
|
|
462
|
+
parentDecisionId,
|
|
463
|
+
parentDecisionArrayIndex: index,
|
|
464
|
+
parentDepth,
|
|
465
|
+
controller: new AbortController(),
|
|
466
|
+
removeParentListener: undefined,
|
|
467
|
+
childEvents: [],
|
|
468
|
+
started: false,
|
|
469
|
+
closed: false,
|
|
470
|
+
startedAtMs: Date.now(),
|
|
471
|
+
childTimeoutMs: undefined,
|
|
472
|
+
failure: undefined
|
|
473
|
+
};
|
|
474
|
+
dispatchedChildren.set(childRunId, dispatchedChild);
|
|
475
|
+
return dispatchedChild;
|
|
476
|
+
});
|
|
477
|
+
const dispatchResults: Array<{ readonly index: number; readonly result: DispatchDelegateResult }> = [];
|
|
478
|
+
let firstFailureIndex: number | undefined;
|
|
479
|
+
|
|
480
|
+
const tasks = delegates.map(async (delegate, index) => {
|
|
481
|
+
const childRunId = childRunIds[index];
|
|
482
|
+
if (childRunId === undefined) {
|
|
483
|
+
throw new Error("missing child run id");
|
|
484
|
+
}
|
|
485
|
+
if (semaphore.inFlight >= effectiveForTurn) {
|
|
486
|
+
const queuedEvent: SubRunQueuedEvent = {
|
|
487
|
+
type: "sub-run-queued",
|
|
488
|
+
runId,
|
|
489
|
+
at: new Date().toISOString(),
|
|
490
|
+
childRunId,
|
|
491
|
+
parentRunId: runId,
|
|
492
|
+
parentDecisionId,
|
|
493
|
+
parentDecisionArrayIndex: index,
|
|
494
|
+
protocol: delegate.protocol,
|
|
495
|
+
intent: delegate.intent,
|
|
496
|
+
depth: parentDepth + 1,
|
|
497
|
+
queuePosition: semaphore.queued
|
|
498
|
+
};
|
|
499
|
+
emit(queuedEvent);
|
|
500
|
+
recordProtocolDecision(queuedEvent);
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
await semaphore.acquire();
|
|
504
|
+
try {
|
|
505
|
+
const dispatchedChild = dispatchedForTurn[index];
|
|
506
|
+
if (!dispatchedChild) {
|
|
507
|
+
throw new Error("missing dispatched child");
|
|
508
|
+
}
|
|
509
|
+
if (firstFailureIndex !== undefined) {
|
|
510
|
+
if (dispatchedChild.closed) {
|
|
511
|
+
dispatchResults.push({
|
|
512
|
+
index,
|
|
513
|
+
result: {
|
|
514
|
+
nextInput: "",
|
|
515
|
+
taggedText: `[sub-run ${childRunId}]: skipped because the parent run aborted`,
|
|
516
|
+
completedAtMs: Date.now()
|
|
517
|
+
}
|
|
518
|
+
});
|
|
519
|
+
return;
|
|
520
|
+
}
|
|
521
|
+
const partialCost = emptyCost();
|
|
522
|
+
const partialTrace = buildPartialTrace({
|
|
523
|
+
childRunId,
|
|
524
|
+
events: [],
|
|
525
|
+
startedAtMs: Date.now(),
|
|
526
|
+
protocol: delegate.protocol,
|
|
527
|
+
tier: options.tier,
|
|
528
|
+
modelProviderId: options.model.id,
|
|
529
|
+
agents: options.agents,
|
|
530
|
+
intent: delegate.intent,
|
|
531
|
+
temperature: options.temperature,
|
|
532
|
+
...(options.seed !== undefined ? { seed: options.seed } : {})
|
|
533
|
+
});
|
|
534
|
+
const failedEvent: SubRunFailedEvent = {
|
|
535
|
+
type: "sub-run-failed",
|
|
536
|
+
runId,
|
|
537
|
+
at: new Date().toISOString(),
|
|
538
|
+
childRunId,
|
|
539
|
+
parentRunId: runId,
|
|
540
|
+
parentDecisionId,
|
|
541
|
+
parentDecisionArrayIndex: index,
|
|
542
|
+
error: {
|
|
543
|
+
code: "aborted",
|
|
544
|
+
message: "Sibling delegate failed; queued delegate never started.",
|
|
545
|
+
detail: {
|
|
546
|
+
reason: "sibling-failed"
|
|
547
|
+
}
|
|
548
|
+
},
|
|
549
|
+
partialTrace,
|
|
550
|
+
partialCost
|
|
551
|
+
};
|
|
552
|
+
emit(failedEvent);
|
|
553
|
+
recordProtocolDecision(failedEvent);
|
|
554
|
+
dispatchedChild.closed = true;
|
|
555
|
+
dispatchResults.push({
|
|
556
|
+
index,
|
|
557
|
+
result: {
|
|
558
|
+
nextInput: "",
|
|
559
|
+
taggedText: `[sub-run ${childRunId}]: skipped because a sibling delegate failed`,
|
|
560
|
+
completedAtMs: Date.now()
|
|
561
|
+
}
|
|
562
|
+
});
|
|
563
|
+
return;
|
|
564
|
+
}
|
|
565
|
+
const result = await dispatchDelegate({
|
|
566
|
+
decision: delegate,
|
|
567
|
+
childRunId,
|
|
568
|
+
parentDecisionId,
|
|
569
|
+
parentDecisionArrayIndex: index,
|
|
570
|
+
parentDepth,
|
|
571
|
+
parentRunId: runId,
|
|
572
|
+
options,
|
|
573
|
+
transcript,
|
|
574
|
+
emit,
|
|
575
|
+
recordProtocolDecision,
|
|
576
|
+
recordSubRunCost: (cost: CostSummary): void => {
|
|
577
|
+
totalCost = addCost(totalCost, cost);
|
|
578
|
+
},
|
|
579
|
+
dispatchedChild
|
|
580
|
+
});
|
|
581
|
+
dispatchResults.push({ index, result });
|
|
582
|
+
} catch (error) {
|
|
583
|
+
firstFailureIndex ??= index;
|
|
584
|
+
const dispatchedChild = dispatchedForTurn[index];
|
|
585
|
+
const failure = dispatchedChild?.failure;
|
|
586
|
+
if (
|
|
587
|
+
delegates.length === 1 &&
|
|
588
|
+
(options.onChildFailure === "abort" || failure === undefined || isDelegateValidationError(error))
|
|
589
|
+
) {
|
|
590
|
+
throw error;
|
|
591
|
+
}
|
|
592
|
+
const failureMessage = error instanceof Error ? error.message : String(error);
|
|
593
|
+
let taggedText = `[sub-run ${childRunId} failed]: ${failureMessage}`;
|
|
594
|
+
if (failure) {
|
|
595
|
+
const error = failure.error;
|
|
596
|
+
taggedText = `[sub-run ${childRunId} failed | code=${error.code} | spent=$${failure.partialCost.usd.toFixed(3)}]: ${error.message}`;
|
|
597
|
+
}
|
|
598
|
+
dispatchResults.push({
|
|
599
|
+
index,
|
|
600
|
+
result: {
|
|
601
|
+
nextInput: "",
|
|
602
|
+
taggedText,
|
|
603
|
+
completedAtMs: Date.now()
|
|
604
|
+
}
|
|
605
|
+
});
|
|
606
|
+
} finally {
|
|
607
|
+
semaphore.release();
|
|
608
|
+
}
|
|
609
|
+
});
|
|
610
|
+
const settled = await Promise.allSettled(tasks);
|
|
611
|
+
const firstRejected = settled.find((result) => result.status === "rejected");
|
|
612
|
+
if (
|
|
613
|
+
firstRejected?.status === "rejected" &&
|
|
614
|
+
delegates.length === 1 &&
|
|
615
|
+
(options.onChildFailure === "abort" || dispatchResults.length === 0)
|
|
616
|
+
) {
|
|
617
|
+
throw firstRejected.reason;
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
dispatchResults.sort((a, b) => a.result.completedAtMs - b.result.completedAtMs);
|
|
621
|
+
const taggedResults = dispatchResults.map((entry) => entry.result.taggedText).join("\n\n");
|
|
622
|
+
const currentWaveFailures = dispatchedForTurn
|
|
623
|
+
.map((child) => child.failure)
|
|
624
|
+
.filter((failure): failure is DispatchWaveFailure => failure !== undefined);
|
|
625
|
+
if (options.onChildFailure === "abort" && currentWaveFailures.length > 0) {
|
|
626
|
+
triggeringFailureForAbortMode ??= currentWaveFailures[0];
|
|
627
|
+
break;
|
|
628
|
+
}
|
|
629
|
+
const failuresSection = buildFailuresSection(currentWaveFailures);
|
|
630
|
+
const coordinatorAgent = options.agents[0] ?? { id: "coordinator", role: "coordinator" };
|
|
631
|
+
const baseInput = buildCoordinatorPlanInput(options.intent, coordinatorAgent);
|
|
632
|
+
dispatchInput = [
|
|
633
|
+
baseInput,
|
|
634
|
+
taggedResults,
|
|
635
|
+
failuresSection,
|
|
636
|
+
"Using the sub-run results above, decide the next step (participate or delegate)."
|
|
637
|
+
].filter((section): section is string => Boolean(section)).join("\n\n");
|
|
638
|
+
dispatchCount += delegates.length;
|
|
639
|
+
}
|
|
147
640
|
stopIfNeeded();
|
|
148
641
|
}
|
|
149
642
|
|
|
@@ -209,7 +702,7 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
209
702
|
}
|
|
210
703
|
|
|
211
704
|
if (!stopIfNeeded()) {
|
|
212
|
-
|
|
705
|
+
const synthesisOutcome = await runCoordinatorTurn({
|
|
213
706
|
agent: coordinator,
|
|
214
707
|
coordinator,
|
|
215
708
|
input: buildFinalSynthesisInput(options.intent, transcript, coordinator),
|
|
@@ -227,6 +720,20 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
227
720
|
emit,
|
|
228
721
|
recordProtocolDecision
|
|
229
722
|
});
|
|
723
|
+
totalCost = synthesisOutcome.totalCost;
|
|
724
|
+
// Phase 1: final-synthesis turn cannot delegate.
|
|
725
|
+
if (Array.isArray(synthesisOutcome.decision) || synthesisOutcome.decision?.type === "delegate") {
|
|
726
|
+
throw new DogpileError({
|
|
727
|
+
code: "invalid-configuration",
|
|
728
|
+
message: "Coordinator final-synthesis turn cannot emit a delegate decision in Phase 1",
|
|
729
|
+
retryable: false,
|
|
730
|
+
detail: {
|
|
731
|
+
kind: "delegate-validation",
|
|
732
|
+
path: "decision",
|
|
733
|
+
phase: "final-synthesis"
|
|
734
|
+
}
|
|
735
|
+
});
|
|
736
|
+
}
|
|
230
737
|
stopIfNeeded();
|
|
231
738
|
}
|
|
232
739
|
}
|
|
@@ -247,45 +754,47 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
247
754
|
transcriptEntryCount: transcript.length
|
|
248
755
|
});
|
|
249
756
|
const finalEvent = events.at(-1);
|
|
757
|
+
const trace: Trace = {
|
|
758
|
+
schemaVersion: "1.0",
|
|
759
|
+
runId,
|
|
760
|
+
protocol: "coordinator",
|
|
761
|
+
tier: options.tier,
|
|
762
|
+
modelProviderId: options.model.id,
|
|
763
|
+
agentsUsed: activeAgents,
|
|
764
|
+
inputs: createReplayTraceRunInputs({
|
|
765
|
+
intent: options.intent,
|
|
766
|
+
protocol: options.protocol,
|
|
767
|
+
tier: options.tier,
|
|
768
|
+
modelProviderId: options.model.id,
|
|
769
|
+
agents: activeAgents,
|
|
770
|
+
temperature: options.temperature
|
|
771
|
+
}),
|
|
772
|
+
budget: createReplayTraceBudget({
|
|
773
|
+
tier: options.tier,
|
|
774
|
+
...(options.budget ? { caps: options.budget } : {}),
|
|
775
|
+
...(options.terminate ? { termination: options.terminate } : {})
|
|
776
|
+
}),
|
|
777
|
+
budgetStateChanges: createReplayTraceBudgetStateChanges(events),
|
|
778
|
+
seed: createReplayTraceSeed(options.seed),
|
|
779
|
+
protocolDecisions,
|
|
780
|
+
providerCalls,
|
|
781
|
+
finalOutput: createReplayTraceFinalOutput(output, finalEvent ?? {
|
|
782
|
+
type: "final",
|
|
783
|
+
runId,
|
|
784
|
+
at: "",
|
|
785
|
+
output,
|
|
786
|
+
cost: totalCost,
|
|
787
|
+
transcript: createTranscriptLink(transcript)
|
|
788
|
+
}),
|
|
789
|
+
...(triggeringFailureForAbortMode !== undefined ? { triggeringFailureForAbortMode } : {}),
|
|
790
|
+
events,
|
|
791
|
+
transcript
|
|
792
|
+
};
|
|
250
793
|
|
|
251
794
|
return {
|
|
252
795
|
output,
|
|
253
796
|
eventLog: createRunEventLog(runId, "coordinator", events),
|
|
254
|
-
trace
|
|
255
|
-
schemaVersion: "1.0",
|
|
256
|
-
runId,
|
|
257
|
-
protocol: "coordinator",
|
|
258
|
-
tier: options.tier,
|
|
259
|
-
modelProviderId: options.model.id,
|
|
260
|
-
agentsUsed: activeAgents,
|
|
261
|
-
inputs: createReplayTraceRunInputs({
|
|
262
|
-
intent: options.intent,
|
|
263
|
-
protocol: options.protocol,
|
|
264
|
-
tier: options.tier,
|
|
265
|
-
modelProviderId: options.model.id,
|
|
266
|
-
agents: activeAgents,
|
|
267
|
-
temperature: options.temperature
|
|
268
|
-
}),
|
|
269
|
-
budget: createReplayTraceBudget({
|
|
270
|
-
tier: options.tier,
|
|
271
|
-
...(options.budget ? { caps: options.budget } : {}),
|
|
272
|
-
...(options.terminate ? { termination: options.terminate } : {})
|
|
273
|
-
}),
|
|
274
|
-
budgetStateChanges: createReplayTraceBudgetStateChanges(events),
|
|
275
|
-
seed: createReplayTraceSeed(options.seed),
|
|
276
|
-
protocolDecisions,
|
|
277
|
-
providerCalls,
|
|
278
|
-
finalOutput: createReplayTraceFinalOutput(output, finalEvent ?? {
|
|
279
|
-
type: "final",
|
|
280
|
-
runId,
|
|
281
|
-
at: "",
|
|
282
|
-
output,
|
|
283
|
-
cost: totalCost,
|
|
284
|
-
transcript: createTranscriptLink(transcript)
|
|
285
|
-
}),
|
|
286
|
-
events,
|
|
287
|
-
transcript
|
|
288
|
-
},
|
|
797
|
+
trace,
|
|
289
798
|
transcript,
|
|
290
799
|
usage: createRunUsage(totalCost),
|
|
291
800
|
metadata: createRunMetadata({
|
|
@@ -303,7 +812,8 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
303
812
|
cost: totalCost,
|
|
304
813
|
events
|
|
305
814
|
}),
|
|
306
|
-
cost: totalCost
|
|
815
|
+
cost: totalCost,
|
|
816
|
+
health: computeHealth(trace, DEFAULT_HEALTH_THRESHOLDS)
|
|
307
817
|
};
|
|
308
818
|
|
|
309
819
|
function stopIfNeeded(): boolean {
|
|
@@ -358,6 +868,11 @@ export async function runCoordinator(options: CoordinatorRunOptions): Promise<Ru
|
|
|
358
868
|
}
|
|
359
869
|
}
|
|
360
870
|
|
|
871
|
+
/**
 * Reports whether `error` is a delegate-validation failure: a DogpileError
 * whose code is "invalid-configuration" and whose detail carries
 * kind "delegate-validation".
 *
 * @param error - Any caught value.
 * @returns `true` only when all three conditions hold.
 */
function isDelegateValidationError(error: unknown): boolean {
  if (!DogpileError.isInstance(error)) {
    return false;
  }
  if (error.code !== "invalid-configuration") {
    return false;
  }
  return error.detail?.["kind"] === "delegate-validation";
}
|
|
875
|
+
|
|
361
876
|
interface CoordinatorTurnOptions {
|
|
362
877
|
readonly agent: AgentSpec;
|
|
363
878
|
readonly coordinator: AgentSpec;
|
|
@@ -380,7 +895,12 @@ interface CoordinatorTurnOptions {
|
|
|
380
895
|
) => void;
|
|
381
896
|
}
|
|
382
897
|
|
|
383
|
-
|
|
898
|
+
/** Value returned by `runCoordinatorTurn` for a single coordinator turn. */
interface CoordinatorTurnResult {
  /** Incoming cost accumulator plus the cost of this turn's model response. */
  readonly totalCost: CostSummary;
  /** Result of `parseAgentDecision` applied to the turn's response text. */
  readonly decision: ReturnType<typeof parseAgentDecision>;
}
|
|
902
|
+
|
|
903
|
+
async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CoordinatorTurnResult> {
|
|
384
904
|
throwIfAborted(turn.options.signal, turn.options.model.id);
|
|
385
905
|
|
|
386
906
|
const request: ModelRequest = {
|
|
@@ -430,7 +950,11 @@ async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CostSum
|
|
|
430
950
|
turn.providerCalls.push(call);
|
|
431
951
|
}
|
|
432
952
|
});
|
|
433
|
-
const decision = parseAgentDecision(response.text
|
|
953
|
+
const decision = parseAgentDecision(response.text, {
|
|
954
|
+
parentProviderId: turn.options.model.id,
|
|
955
|
+
currentDepth: turn.options.currentDepth ?? 0,
|
|
956
|
+
maxDepth: turn.options.effectiveMaxDepth ?? Number.POSITIVE_INFINITY
|
|
957
|
+
});
|
|
434
958
|
const totalCost = addCost(turn.totalCost, responseCost(response));
|
|
435
959
|
const toolCalls = await executeModelResponseToolRequests({
|
|
436
960
|
response,
|
|
@@ -471,7 +995,7 @@ async function runCoordinatorTurn(turn: CoordinatorTurnOptions): Promise<CostSum
|
|
|
471
995
|
transcriptEntryCount: turn.transcript.length
|
|
472
996
|
});
|
|
473
997
|
|
|
474
|
-
return totalCost;
|
|
998
|
+
return { totalCost, decision };
|
|
475
999
|
}
|
|
476
1000
|
|
|
477
1001
|
interface CoordinatorWorkerTurnOptions {
|
|
@@ -553,7 +1077,23 @@ async function runCoordinatorWorkerTurn(turn: CoordinatorWorkerTurnOptions): Pro
|
|
|
553
1077
|
turn.providerCallSlots[turn.providerCallIndex] = call;
|
|
554
1078
|
}
|
|
555
1079
|
});
|
|
556
|
-
const decision = parseAgentDecision(response.text
|
|
1080
|
+
const decision = parseAgentDecision(response.text, {
|
|
1081
|
+
parentProviderId: turn.options.model.id,
|
|
1082
|
+
currentDepth: turn.options.currentDepth ?? 0,
|
|
1083
|
+
maxDepth: turn.options.effectiveMaxDepth ?? Number.POSITIVE_INFINITY
|
|
1084
|
+
});
|
|
1085
|
+
if (Array.isArray(decision) || decision?.type === "delegate") {
|
|
1086
|
+
throw new DogpileError({
|
|
1087
|
+
code: "invalid-configuration",
|
|
1088
|
+
message: "Workers cannot emit delegate decisions in Phase 1",
|
|
1089
|
+
retryable: false,
|
|
1090
|
+
detail: {
|
|
1091
|
+
kind: "delegate-validation",
|
|
1092
|
+
path: "decision",
|
|
1093
|
+
phase: "worker"
|
|
1094
|
+
}
|
|
1095
|
+
});
|
|
1096
|
+
}
|
|
557
1097
|
const toolCalls = await executeModelResponseToolRequests({
|
|
558
1098
|
response,
|
|
559
1099
|
executor: turn.toolExecutor,
|
|
@@ -589,6 +1129,39 @@ function buildCoordinatorPlanInput(intent: string, coordinator: AgentSpec): stri
|
|
|
589
1129
|
return `Mission: ${intent}\nCoordinator ${coordinator.id}: assign the work, name the plan, and provide the first contribution.`;
|
|
590
1130
|
}
|
|
591
1131
|
|
|
1132
|
+
/**
 * Renders the sub-run failures as a markdown section with a fenced,
 * pretty-printed JSON payload.
 *
 * @param failures - Failures to report.
 * @returns The section text, or `null` when there is nothing to report.
 */
function buildFailuresSection(failures: readonly DispatchWaveFailure[]): string | null {
  if (failures.length > 0) {
    const heading = "## Sub-run failures since last decision";
    const openFence = "```json";
    const closeFence = "```";
    const payload = JSON.stringify(failures, null, 2);
    // Heading, blank line, then the fenced JSON block.
    return `${heading}\n\n${openFence}\n${payload}\n${closeFence}`;
  }
  return null;
}
|
|
1144
|
+
|
|
1145
|
+
/**
 * Condenses a `sub-run-failed` event into the compact failure record that is
 * reported back to the coordinator.
 *
 * Failures whose `error.detail.reason` is "sibling-failed" or "parent-aborted"
 * are dropped (the function returns `undefined`).
 *
 * @param intent - Intent of the delegate decision that spawned the child.
 * @param event - The failure event emitted for the child.
 * @returns The summary (code, message, optional string reason, USD cost only),
 *   or `undefined` for the two filtered reasons.
 */
function dispatchWaveFailureFromEvent(
  intent: string,
  event: SubRunFailedEvent
): DispatchWaveFailure | undefined {
  const { childRunId, error, partialCost } = event;
  const rawReason = error.detail?.["reason"];
  // Only string reasons are recognised; anything else is treated as absent.
  const reason = typeof rawReason === "string" ? rawReason : undefined;

  switch (reason) {
    case "sibling-failed":
    case "parent-aborted":
      return undefined;
    default:
      break;
  }

  const errorSummary =
    reason === undefined
      ? { code: error.code, message: error.message }
      : { code: error.code, message: error.message, detail: { reason } };

  return {
    childRunId,
    intent,
    error: errorSummary,
    partialCost: { usd: partialCost.usd }
  };
}
|
|
1164
|
+
|
|
592
1165
|
function buildWorkerInput(
|
|
593
1166
|
intent: string,
|
|
594
1167
|
transcript: readonly TranscriptEntry[],
|
|
@@ -620,3 +1193,590 @@ function responseCost(response: ModelResponse): CostSummary {
|
|
|
620
1193
|
};
|
|
621
1194
|
}
|
|
622
1195
|
|
|
1196
|
+
/** Arguments accepted by `dispatchDelegate` for one delegate decision. */
interface DispatchDelegateOptions {
  /** The delegate decision to run; its protocol, intent, model and budget are read. */
  readonly decision: DelegateAgentDecision;
  /** Run id for the child. When omitted, `dispatchDelegate` calls `createRunId()`. */
  readonly childRunId?: string;
  /** Copied onto every sub-run lifecycle event emitted for this child. */
  readonly parentDecisionId: string;
  /** Copied onto the started / completed / failed events for this child. */
  readonly parentDecisionArrayIndex: number;
  /** Depth of the dispatching run; the child runs at `parentDepth + 1`. */
  readonly parentDepth: number;
  /** Used as `runId` and `parentRunId` on the emitted sub-run events. */
  readonly parentRunId: string;
  /** The dispatching run's options (model, agents, tools, limits, `runProtocol`, ...). */
  readonly options: CoordinatorRunOptions;
  /** Transcript that receives a "delegate-result" entry when the child succeeds. */
  readonly transcript: TranscriptEntry[];
  /** Sink for the sub-run lifecycle events produced during dispatch. */
  readonly emit: (event: RunEvent) => void;
  /** Called with each lifecycle event right after it has been emitted. */
  readonly recordProtocolDecision: (
    event: RunEvent,
    decisionOptions?: { readonly transcriptEntryCount?: number }
  ) => void;
  /**
   * BUDGET-03 / D-01 seam: closure-mutation callback that adds child cost
   * (subResult.cost on success, partialCost on failure) into the parent's
   * `totalCost` accumulator. Invoked BEFORE `parentEmit(completedEvent)` /
   * `parentEmit(failEvent)` so the existing "last cost-bearing event ===
   * final.cost" invariant survives unchanged.
   */
  readonly recordSubRunCost: (cost: CostSummary) => void;
  /** Bookkeeping record for this child; `dispatchDelegate` mutates it in place. */
  readonly dispatchedChild: DispatchedChild;
}
|
|
1220
|
+
|
|
1221
|
+
/** Value returned by `dispatchDelegate` after a child run completes successfully. */
interface DispatchDelegateResult {
  /** Coordinator plan input, then the tagged result, then the "decide the next step" instruction. */
  readonly nextInput: string;
  /** Output of `renderSubRunResult` for the child; also stored in the transcript entry. */
  readonly taggedText: string;
  /** `Date.now()` captured when the result object is built. */
  readonly completedAtMs: number;
}
|
|
1226
|
+
|
|
1227
|
+
/**
 * Mutable bookkeeping record for one dispatched child run. `dispatchDelegate`
 * updates the non-readonly fields as the child moves through its lifecycle.
 */
interface DispatchedChild {
  // NOTE(review): the next five fields are not read by dispatchDelegate (it takes
  // the equivalents from DispatchDelegateOptions); presumably they mirror those
  // values for the caller's own tracking — confirm against the dispatch loop.
  readonly childRunId: string;
  readonly decision: DelegateAgentDecision;
  readonly parentDecisionId: string;
  readonly parentDecisionArrayIndex: number;
  readonly parentDepth: number;
  /** Controller whose signal is handed to the child run; aborted on parent abort or child deadline. */
  readonly controller: AbortController;
  /** Detaches the parent-signal abort listener; undefined when no listener was attached. */
  removeParentListener: (() => void) | undefined;
  /** Buffer that receives every event the child emits (used to build the partial trace). */
  readonly childEvents: RunEvent[];
  /** Set to true by `dispatchDelegate` once `sub-run-started` has been emitted. */
  started: boolean;
  /**
   * Set to true after `sub-run-completed` / `sub-run-failed` has been emitted.
   * While true, child events are still buffered but no longer forwarded upstream.
   */
  closed: boolean;
  /** `Date.now()` captured just before the child is started. */
  startedAtMs: number;
  /** Timeout resolved for the child, or undefined when none applies. */
  childTimeoutMs: number | undefined;
  /** Failure summary recorded when the child fails; see `dispatchWaveFailureFromEvent`. */
  failure: DispatchWaveFailure | undefined;
  /** STREAM-03 hook (Phase 4). Reserved; do not use. */
  readonly streamHandle?: never;
}
|
|
1244
|
+
|
|
1245
|
+
/**
 * Dispatch a single delegate decision as a recursive sub-run.
 *
 * D-11: child reuses the parent provider object verbatim.
 * D-16: `recursive: true` flag set when both parent and child protocol are
 * `coordinator`.
 * D-17: tagged result text appended to the next coordinator prompt.
 * D-18: synthetic transcript entry pushed for replay/provenance.
 *
 * On thrown error from the child engine, builds `partialTrace` from a locally
 * tee'd `childEvents` buffer — `runProtocol`'s error contract is unchanged.
 *
 * Order of observable effects on the success path: optional
 * `sub-run-budget-clamped`, `sub-run-started`, child run, cost roll-up,
 * `sub-run-completed`, transcript entry. On the failure path the cost roll-up
 * and `sub-run-failed` replace the last three steps and the error is re-thrown.
 *
 * @param input - Decision, parent context, sinks and the per-child bookkeeping
 *   record (mutated in place).
 * @returns The next coordinator prompt, the tagged result text and a
 *   completion timestamp.
 * @throws DogpileError `aborted` (detail.reason "timeout") when the parent
 *   deadline has already elapsed; `invalid-configuration` when
 *   `options.runProtocol` is missing; the (enriched) child error when the child
 *   run rejects, wrapped as `invalid-configuration` if it is not a
 *   DogpileError; an abort error when the parent signal is aborted after the
 *   child completed. `assertDepthWithinLimit` presumably throws when the depth
 *   limit is exceeded — confirm against its definition.
 */
async function dispatchDelegate(input: DispatchDelegateOptions): Promise<DispatchDelegateResult> {
  const { decision, options } = input;

  // Dispatcher-time depth gate (D-14). Same error shape as the parser; this
  // is the TOCTOU defense for any state mutation between parse and dispatch.
  // Fires BEFORE sub-run-started is emitted so failed dispatches do not show
  // up in the trace as half-started sub-runs.
  if (options.effectiveMaxDepth !== undefined) {
    assertDepthWithinLimit(input.parentDepth, options.effectiveMaxDepth);
  }

  const childRunId = input.childRunId ?? createRunId();
  const recursive = decision.protocol === "coordinator";
  const decisionTimeoutMs = decision.budget?.timeoutMs;
  const parentDeadlineMs = options.parentDeadlineMs;

  // BUDGET-02 / D-12: deadline-based remaining-time math. Children inherit
  // `parentDeadlineMs - now()`, not a static `parent.budget.timeoutMs`. If the
  // parent's deadline has already elapsed, throw `code: "aborted"` with
  // `detail.reason: "timeout"` BEFORE `sub-run-started` is emitted.
  const remainingMs =
    parentDeadlineMs !== undefined ? Math.max(0, parentDeadlineMs - Date.now()) : undefined;

  if (parentDeadlineMs !== undefined && remainingMs === 0) {
    throw new DogpileError({
      code: "aborted",
      message: "Parent deadline elapsed before sub-run dispatch.",
      retryable: false,
      providerId: options.model.id,
      detail: { reason: "timeout" }
    });
  }

  // Resolve child timeout with precedence (D-12 / D-14):
  //   decision.budget.timeoutMs > parent's remaining > defaultSubRunTimeoutMs > undefined.
  // When the decision-level timeout exceeds the parent's remaining, CLAMP
  // (no longer throw) and emit a `sub-run-budget-clamped` event below.
  let childTimeoutMs: number | undefined;
  let clampedFrom: number | undefined;
  if (remainingMs !== undefined) {
    if (decisionTimeoutMs !== undefined) {
      if (decisionTimeoutMs > remainingMs) {
        clampedFrom = decisionTimeoutMs;
        childTimeoutMs = remainingMs;
      } else {
        childTimeoutMs = decisionTimeoutMs;
      }
    } else {
      childTimeoutMs = remainingMs;
    }
  } else if (decisionTimeoutMs !== undefined) {
    childTimeoutMs = decisionTimeoutMs;
  } else if (options.defaultSubRunTimeoutMs !== undefined) {
    childTimeoutMs = options.defaultSubRunTimeoutMs;
  }

  // Without the engine callback there is no way to start a child run.
  if (!options.runProtocol) {
    throw new DogpileError({
      code: "invalid-configuration",
      message:
        "Coordinator delegate dispatch requires the engine `runProtocol` callback. " +
        "Use `Dogpile.run` / `createEngine` rather than calling `runCoordinator` directly when delegate is in play.",
      retryable: false,
      detail: {
        kind: "delegate-validation",
        path: "runProtocol"
      }
    });
  }

  // Buffered tee for partialTrace capture — see Plan 03 step 8.
  const childEvents = input.dispatchedChild.childEvents;
  const parentEmit = input.emit;
  const teedEmit = (event: RunEvent): void => {
    // Always buffer; forwarding upstream stops once the child is closed.
    childEvents.push(event);
    if (input.dispatchedChild.closed) {
      return;
    }
    if (options.streamEvents && options.emit) {
      // Prepend this run's id so the forwarded event carries its ancestry.
      const inbound = (event as { readonly parentRunIds?: readonly string[] }).parentRunIds;
      options.emit({
        ...event,
        parentRunIds: [input.parentRunId, ...(inbound ?? [])]
      } as RunEvent);
    }
  };
  const childStartedAt = Date.now();
  input.dispatchedChild.startedAtMs = childStartedAt;

  // BUDGET-02 / D-12: emit clamp event BEFORE sub-run-started so the trace
  // records "this child's requested timeout was reduced to fit parent's
  // remaining deadline." Skipped on the happy path (no clamp, no event).
  if (clampedFrom !== undefined && childTimeoutMs !== undefined) {
    const clampEvent: SubRunBudgetClampedEvent = {
      type: "sub-run-budget-clamped",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      parentDecisionId: input.parentDecisionId,
      requestedTimeoutMs: clampedFrom,
      clampedTimeoutMs: childTimeoutMs,
      reason: "exceeded-parent-remaining"
    };
    input.emit(clampEvent);
    input.recordProtocolDecision(clampEvent);
  }

  const startEvent: RunEvent = {
    type: "sub-run-started",
    runId: input.parentRunId,
    at: new Date().toISOString(),
    childRunId,
    parentRunId: input.parentRunId,
    parentDecisionId: input.parentDecisionId,
    parentDecisionArrayIndex: input.parentDecisionArrayIndex,
    protocol: decision.protocol,
    intent: decision.intent,
    depth: input.parentDepth + 1,
    ...(recursive ? { recursive: true } : {})
  };
  parentEmit(startEvent);
  input.recordProtocolDecision(startEvent);

  // BUDGET-01 / D-07: derive a per-child AbortController so child engines see
  // their own signal. Listener forwards parent.signal.reason verbatim, so
  // detail.reason classification (parent-aborted vs timeout) is preserved.
  // Phase 4 STREAM-03 hook: per-child cancel handle attaches here.
  const parentSignal = options.signal;
  let removeParentAbortListener: (() => void) | undefined;
  if (parentSignal !== undefined) {
    if (parentSignal.aborted) {
      // Parent is already aborted: abort the child immediately, no listener needed.
      input.dispatchedChild.controller.abort(parentSignal.reason);
    } else {
      const handler = (): void => {
        input.dispatchedChild.controller.abort(parentSignal.reason);
      };
      parentSignal.addEventListener("abort", handler, { once: true });
      removeParentAbortListener = (): void => {
        parentSignal.removeEventListener("abort", handler);
      };
    }
  }
  input.dispatchedChild.removeParentListener = removeParentAbortListener;
  input.dispatchedChild.started = true;
  input.dispatchedChild.childTimeoutMs = childTimeoutMs;
  // A local deadline timer is armed only when there is a child timeout and no
  // parent deadline is in play.
  const childDeadlineReason =
    childTimeoutMs !== undefined && parentDeadlineMs === undefined
      ? createEngineDeadlineTimeoutError(options.model.id, childTimeoutMs)
      : undefined;
  const childDeadlineTimer =
    childDeadlineReason !== undefined
      ? setTimeout(() => {
          input.dispatchedChild.controller.abort(childDeadlineReason);
        }, childTimeoutMs)
      : undefined;

  const childOptions = {
    runId: childRunId,
    intent: decision.intent,
    protocol: decision.protocol,
    tier: options.tier,
    model: options.model, // D-11: same provider instance verbatim
    agents: options.agents,
    tools: options.tools,
    temperature: options.temperature,
    ...(childTimeoutMs !== undefined ? { budget: { timeoutMs: childTimeoutMs } } : {}),
    signal: input.dispatchedChild.controller.signal,
    emit: teedEmit,
    ...(options.streamEvents !== undefined ? { streamEvents: options.streamEvents } : {}),
    currentDepth: input.parentDepth + 1,
    ...(options.effectiveMaxDepth !== undefined ? { effectiveMaxDepth: options.effectiveMaxDepth } : {}),
    ...(options.effectiveMaxConcurrentChildren !== undefined
      ? { effectiveMaxConcurrentChildren: options.effectiveMaxConcurrentChildren }
      : {}),
    ...(options.onChildFailure !== undefined ? { onChildFailure: options.onChildFailure } : {}),
    // BUDGET-02 / D-12: forward the ROOT deadline so depth-N grandchildren
    // see the same `parentDeadlineMs` rather than a fresh per-level snapshot.
    ...(parentDeadlineMs !== undefined ? { parentDeadlineMs } : {}),
    ...(options.defaultSubRunTimeoutMs !== undefined
      ? { defaultSubRunTimeoutMs: options.defaultSubRunTimeoutMs }
      : {})
  };

  let subResult: RunResult;
  try {
    subResult = await options.runProtocol(childOptions);
  } catch (error) {
    // Release the timer and the parent listener before any further work.
    if (childDeadlineTimer !== undefined) {
      clearTimeout(childDeadlineTimer);
    }
    removeParentAbortListener?.();
    // Already closed elsewhere: re-throw without emitting a second terminal event.
    if (input.dispatchedChild.closed) {
      const enrichedError = enrichAbortErrorWithParentReason(error, parentSignal);
      if (DogpileError.isInstance(enrichedError)) {
        throw enrichedError;
      }
      throw error;
    }

    const failedDecision: JsonObject = {
      type: "delegate",
      protocol: decision.protocol,
      intent: decision.intent,
      ...(decision.model !== undefined ? { model: decision.model } : {}),
      ...(decision.budget !== undefined ? { budget: decision.budget as unknown as JsonValue } : {})
    };

    const partialTrace: Trace = buildPartialTrace({
      childRunId,
      events: childEvents,
      startedAtMs: childStartedAt,
      protocol: decision.protocol,
      tier: options.tier,
      modelProviderId: options.model.id,
      agents: options.agents,
      intent: decision.intent,
      temperature: options.temperature,
      ...(childTimeoutMs !== undefined ? { childTimeoutMs } : {}),
      ...(options.seed !== undefined ? { seed: options.seed } : {})
    });

    // BUDGET-01 / D-08: when the child aborted because the parent.signal
    // aborted, lock detail.reason on the surfaced error. Upstream engine
    // wrapping (e.g., createStreamCancellationError) attaches its own
    // detail.status; we add detail.reason so consumers can discriminate
    // parent-aborted vs timeout regardless of which engine path produced the
    // abort error.
    const enrichedError = enrichProviderTimeoutSource(
      enrichAbortErrorWithParentReason(error, parentSignal),
      {
        ...(decisionTimeoutMs !== undefined ? { decisionTimeoutMs } : {}),
        ...(options.defaultSubRunTimeoutMs !== undefined
          ? { engineDefaultTimeoutMs: options.defaultSubRunTimeoutMs }
          : {})
      }
    );
    if (DogpileError.isInstance(enrichedError)) {
      options.failureInstancesByChildRunId?.set(childRunId, enrichedError);
    }
    const errorPayload = errorPayloadFromUnknown(enrichedError, failedDecision);
    // BUDGET-03 / D-02: capture real provider spend before the throw and
    // roll it into the parent's totalCost BEFORE emitting sub-run-failed.
    const partialCost = lastCostBearingEventCost(childEvents) ?? emptyCost();
    input.recordSubRunCost(partialCost);
    const failEvent: SubRunFailedEvent = {
      type: "sub-run-failed",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      parentDecisionId: input.parentDecisionId,
      parentDecisionArrayIndex: input.parentDecisionArrayIndex,
      error: errorPayload,
      partialTrace,
      partialCost
    };
    parentEmit(failEvent);
    input.recordProtocolDecision(failEvent);
    input.dispatchedChild.closed = true;
    input.dispatchedChild.failure = dispatchWaveFailureFromEvent(decision.intent, failEvent);

    // Re-throw a DogpileError so the parent run terminates with a typed error.
    if (DogpileError.isInstance(enrichedError)) {
      throw enrichedError;
    }
    throw new DogpileError({
      code: "invalid-configuration",
      message: error instanceof Error ? error.message : String(error),
      retryable: false,
      detail: {
        kind: "delegate-validation",
        path: "decision",
        reason: "child-run-failed"
      }
    });
  }

  // Success path: same cleanup as in the catch block.
  if (childDeadlineTimer !== undefined) {
    clearTimeout(childDeadlineTimer);
  }
  removeParentAbortListener?.();

  // BUDGET-03 / D-01: roll child's full cost into the parent's totalCost
  // BEFORE emitting sub-run-completed. The next agent-turn / final event will
  // read totalCost from the closure scope, preserving the existing
  // "last cost-bearing event === final.cost" invariant.
  input.recordSubRunCost(subResult.cost);

  const completedEvent: RunEvent = {
    type: "sub-run-completed",
    runId: input.parentRunId,
    at: new Date().toISOString(),
    childRunId,
    parentRunId: input.parentRunId,
    parentDecisionId: input.parentDecisionId,
    parentDecisionArrayIndex: input.parentDecisionArrayIndex,
    subResult
  };
  parentEmit(completedEvent);
  input.recordProtocolDecision(completedEvent);
  input.dispatchedChild.closed = true;

  // BUDGET-01 / D-10: parent.signal aborted AFTER the child completed but
  // before we advance to the next coordinator turn. Emit a marker event so
  // streaming subscribers see "parent gave up after sub-run" provenance,
  // then re-throw the parent's abort reason. Non-streaming run() rejects with
  // the thrown error and does NOT preserve the marker — engine.ts does not
  // attach the parent events array to the rejected error (verified at
  // engine.ts:230-239). Streaming-subscriber observability is the contract.
  if (parentSignal?.aborted) {
    const abortMarker: SubRunParentAbortedEvent = {
      type: "sub-run-parent-aborted",
      runId: input.parentRunId,
      at: new Date().toISOString(),
      childRunId,
      parentRunId: input.parentRunId,
      reason: "parent-aborted"
    };
    parentEmit(abortMarker);
    input.recordProtocolDecision(abortMarker);
    throw enrichAbortErrorWithParentReason(
      createAbortErrorFromSignal(parentSignal, options.model.id),
      parentSignal
    );
  }

  // D-18 synthetic transcript entry.
  const decisionAsJson: JsonObject = {
    type: "delegate",
    protocol: decision.protocol,
    intent: decision.intent,
    ...(decision.model !== undefined ? { model: decision.model } : {}),
    ...(decision.budget !== undefined ? { budget: decision.budget as unknown as JsonValue } : {})
  };
  const taggedText = renderSubRunResult(childRunId, subResult);
  input.transcript.push({
    agentId: `sub-run:${childRunId}`,
    role: "delegate-result",
    input: JSON.stringify(decisionAsJson),
    output: taggedText
  });

  // Build the next coordinator prompt by appending the D-17 tagged block.
  const coordinatorAgent = options.agents[0];
  const baseInput = buildCoordinatorPlanInput(input.options.intent, coordinatorAgent ?? {
    id: "coordinator",
    role: "coordinator"
  });
  return {
    nextInput: `${baseInput}\n\n${taggedText}\n\nUsing the sub-run result above, decide the next step (participate or delegate).`,
    taggedText,
    completedAtMs: Date.now()
  };
}
|
|
1612
|
+
|
|
1613
|
+
/**
 * D-17 prompt-injection helper. Renders a child `RunResult` as the canonical
 * tagged-result block injected into the parent coordinator's next prompt.
 *
 * Format:
 *   `[sub-run <childRunId>]: <output>`
 *   `[sub-run <childRunId> stats]: turns=<N> costUsd=<X> durationMs=<Y>`
 *
 * The stats line is a soft contract — field names stable, ordering stable.
 *
 * `durationMs` is the gap between the first and last trace event timestamps.
 * It is reported as 0 when either timestamp is missing, empty or unparseable,
 * or when the gap is negative, so the stats line never contains `NaN`.
 *
 * @param childRunId - Run id of the child, used in both tags.
 * @param subResult - The child's completed run result.
 * @returns The two-line tagged block.
 */
function renderSubRunResult(childRunId: string, subResult: RunResult): string {
  const turns = subResult.transcript.length;
  const costUsd = subResult.cost.usd ?? 0;
  const startedAt = eventTimestamp(subResult.trace.events[0]);
  const endedAt = eventTimestamp(subResult.trace.events.at(-1));
  // Date.parse yields NaN for unparseable input; treat that like a missing stamp.
  const elapsedMs = startedAt && endedAt ? Date.parse(endedAt) - Date.parse(startedAt) : 0;
  const durationMs = Number.isFinite(elapsedMs) ? Math.max(0, elapsedMs) : 0;
  return [
    `[sub-run ${childRunId}]: ${subResult.output}`,
    `[sub-run ${childRunId} stats]: turns=${turns} costUsd=${costUsd} durationMs=${durationMs}`
  ].join("\n");
}

/**
 * Picks the timestamp string carried by an event: `at` when the event has
 * one, otherwise `completedAt` for "model-response" events and `startedAt`
 * for the remaining kinds.
 *
 * @param event - Event to inspect; may be undefined (e.g. empty event list).
 * @returns The timestamp string, or `undefined` when no event was given.
 */
function eventTimestamp(event: RunEvent | undefined): string | undefined {
  if (event === undefined) return undefined;
  if ("at" in event) return event.at;
  return event.type === "model-response" ? event.completedAt : event.startedAt;
}
|
|
1643
|
+
|
|
1644
|
+
/**
 * Build a JSON-serializable {@link Trace} for `sub-run-failed.partialTrace`
 * from a buffered tee of child emits. Keeps `runProtocol`'s error contract
 * unchanged — Plan 03 step 8.
 *
 * The event buffer is copied when the trace is built. The tee that fills the
 * buffer keeps appending to it, so storing the live array would let an
 * already-emitted trace grow afterwards and disagree with the
 * `budgetStateChanges` computed here.
 *
 * `protocolDecisions`, `providerCalls` and `transcript` are empty, and
 * `finalOutput` is a placeholder (empty output, `emptyCost()`, current time):
 * this helper only receives the event buffer.
 *
 * @param input - Child run identity, configuration and buffered events.
 *   `startedAtMs` is accepted but not used by this function.
 * @returns A trace whose `events` is a snapshot of `input.events`.
 */
function buildPartialTrace(input: {
  readonly childRunId: string;
  readonly events: readonly RunEvent[];
  readonly startedAtMs: number;
  readonly protocol: ProtocolSelection;
  readonly tier: Tier;
  readonly modelProviderId: string;
  readonly agents: readonly AgentSpec[];
  readonly intent: string;
  readonly temperature: number;
  readonly childTimeoutMs?: number;
  readonly seed?: string | number;
}): Trace {
  // Freeze the view of the buffer so later child emits cannot alter this trace.
  const eventsSnapshot: readonly RunEvent[] = [...input.events];
  const protocolName = typeof input.protocol === "string" ? input.protocol : input.protocol.kind;
  // A bare protocol name is wrapped into the config shape the replay helper expects.
  const protocolConfig =
    typeof input.protocol === "string"
      ? ({ kind: input.protocol } as unknown as Parameters<typeof createReplayTraceRunInputs>[0]["protocol"])
      : input.protocol;
  return {
    schemaVersion: "1.0",
    runId: input.childRunId,
    protocol: protocolName,
    tier: input.tier,
    modelProviderId: input.modelProviderId,
    agentsUsed: input.agents,
    inputs: createReplayTraceRunInputs({
      intent: input.intent,
      protocol: protocolConfig,
      tier: input.tier,
      modelProviderId: input.modelProviderId,
      agents: input.agents,
      temperature: input.temperature
    }),
    budget: createReplayTraceBudget({
      tier: input.tier,
      ...(input.childTimeoutMs !== undefined ? { caps: { timeoutMs: input.childTimeoutMs } } : {})
    }),
    budgetStateChanges: createReplayTraceBudgetStateChanges(eventsSnapshot),
    seed: createReplayTraceSeed(input.seed),
    protocolDecisions: [],
    providerCalls: [],
    finalOutput: {
      kind: "replay-trace-final-output",
      output: "",
      cost: emptyCost(),
      completedAt: new Date().toISOString(),
      transcript: createTranscriptLink([])
    },
    events: eventsSnapshot,
    transcript: []
  };
}
|
|
1701
|
+
|
|
1702
|
+
/**
 * BUDGET-01 / D-08: if a child sub-run threw because the parent's signal was
 * aborted, stamp the `detail.reason` discriminator onto the resulting
 * `code: "aborted"` error. Existing detail keys are carried over (e.g.,
 * `detail.status: "cancelled"` attached by `createStreamCancellationError`).
 *
 * The input error is returned untouched when:
 *   - `parentSignal` is undefined or has not aborted (unrelated child failure)
 *   - the error is not a DogpileError with `code: "aborted"`
 *   - the error already carries a `detail.reason` (upstream classification wins)
 *
 * Otherwise a new DogpileError is returned with the same message, retryable
 * flag (defaulting to `false`), providerId and cause, plus the added reason.
 */
function enrichAbortErrorWithParentReason(error: unknown, parentSignal: AbortSignal | undefined): unknown {
  if (!parentSignal?.aborted) {
    return error;
  }
  if (!(DogpileError.isInstance(error) && error.code === "aborted")) {
    return error;
  }

  const priorDetail = error.detail ?? {};
  if (priorDetail["reason"] !== undefined) {
    return error;
  }

  const reason = classifyAbortReason(parentSignal.reason);
  const providerPart = error.providerId !== undefined ? { providerId: error.providerId } : {};
  const causePart = error.cause !== undefined ? { cause: error.cause } : {};
  return new DogpileError({
    code: "aborted",
    message: error.message,
    retryable: error.retryable ?? false,
    ...providerPart,
    detail: { ...priorDetail, reason },
    ...causePart
  });
}
|
|
1734
|
+
|
|
1735
|
+
/**
 * Attach a `detail.source` classification to a provider-timeout error.
 *
 * The input error is returned untouched when it is not a DogpileError with
 * `code: "provider-timeout"`, or when it already has a `detail.source`.
 * Otherwise a new DogpileError is returned with the same message, retryable
 * flag (defaulting to `true`), providerId and cause, and with `detail.source`
 * set to the result of `classifyChildTimeoutSource`, which is called with the
 * supplied context plus `isProviderError: true`.
 *
 * @param error - The thrown value to inspect.
 * @param context - Optional timeout values forwarded to the classifier.
 */
function enrichProviderTimeoutSource(
  error: unknown,
  context: {
    readonly decisionTimeoutMs?: number;
    readonly engineDefaultTimeoutMs?: number;
  }
): unknown {
  if (!(DogpileError.isInstance(error) && error.code === "provider-timeout")) {
    return error;
  }

  const priorDetail = error.detail ?? {};
  if (priorDetail["source"] !== undefined) {
    return error;
  }

  const classifierContext = { ...context, isProviderError: true };
  const source = classifyChildTimeoutSource(error, classifierContext);
  const providerPart = error.providerId !== undefined ? { providerId: error.providerId } : {};
  const causePart = error.cause !== undefined ? { cause: error.cause } : {};
  return new DogpileError({
    code: "provider-timeout",
    message: error.message,
    retryable: error.retryable ?? true,
    ...providerPart,
    detail: { ...priorDetail, source },
    ...causePart
  });
}
|
|
1762
|
+
|
|
1763
|
+
/**
 * Convert an arbitrary thrown value into the `error` payload of a
 * `SubRunFailedEvent`.
 *
 * - DogpileError: keeps its code, message and (when set) providerId; its
 *   detail is copied and `failedDecision` is added, overriding any existing
 *   `failedDecision` key.
 * - Anything else: reported as `"invalid-configuration"` with the `Error`
 *   message, or `String(error)` for non-Error values, and a detail holding
 *   only `failedDecision`.
 *
 * @param error - The value that was thrown.
 * @param failedDecision - The decision recorded under `detail.failedDecision`.
 */
function errorPayloadFromUnknown(error: unknown, failedDecision: JsonObject): SubRunFailedEvent["error"] {
  if (!DogpileError.isInstance(error)) {
    return {
      code: "invalid-configuration",
      message: error instanceof Error ? error.message : String(error),
      detail: { failedDecision }
    };
  }

  const mergedDetail: JsonObject = { ...(error.detail ?? {}), failedDecision };
  const providerPart = error.providerId !== undefined ? { providerId: error.providerId } : {};
  return {
    code: error.code,
    message: error.message,
    ...providerPart,
    detail: mergedDetail
  };
}
|