deepline 0.1.79 → 0.1.80
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +68 -31
- package/dist/cli/index.mjs +68 -31
- package/dist/index.d.mts +9 -1
- package/dist/index.d.ts +9 -1
- package/dist/index.js +7 -4
- package/dist/index.mjs +7 -4
- package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1102 -1616
- package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +506 -654
- package/dist/repo/apps/play-runner-workers/src/entry.ts +896 -354
- package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +8 -2
- package/dist/repo/sdk/src/client.ts +9 -2
- package/dist/repo/sdk/src/release.ts +2 -2
- package/dist/repo/sdk/src/types.ts +5 -0
- package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
- package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
- package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
- package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
- package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
- package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
- package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
- package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
- package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
- package/dist/repo/shared_libs/plays/static-pipeline.ts +56 -3
- package/dist/repo/shared_libs/temporal/constants.ts +38 -0
- package/package.json +1 -1
|
@@ -44,6 +44,22 @@ import {
|
|
|
44
44
|
type ChunkExecutionResult,
|
|
45
45
|
} from '../../../shared_libs/play-runtime/batch-runtime';
|
|
46
46
|
import { getDefaultPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/default-batch-strategies';
|
|
47
|
+
import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
|
|
48
|
+
import {
|
|
49
|
+
createPlayExecutionGovernor,
|
|
50
|
+
type GovernanceSnapshot,
|
|
51
|
+
type PlayExecutionGovernor,
|
|
52
|
+
} from '../../../shared_libs/play-runtime/governor/governor';
|
|
53
|
+
import {
|
|
54
|
+
CoordinatorRateStateBackend,
|
|
55
|
+
type CoordinatorRatePort,
|
|
56
|
+
} from '../../../shared_libs/play-runtime/governor/coordinator-rate-state-backend';
|
|
57
|
+
import type { PacingRule } from '../../../shared_libs/play-runtime/governor/rate-state-backend';
|
|
58
|
+
import {
|
|
59
|
+
awaitChildTerminal,
|
|
60
|
+
type ChildPlayTerminalWaitResult,
|
|
61
|
+
type WorkflowStepLike,
|
|
62
|
+
} from './child-play-await';
|
|
47
63
|
import type { AnyBatchOperationStrategy } from '../../../shared_libs/play-runtime/batching-types';
|
|
48
64
|
import {
|
|
49
65
|
createToolBatchExecutor,
|
|
@@ -201,6 +217,8 @@ type RunRequest = {
|
|
|
201
217
|
/** Internal ctx.runPlay lineage. Public SDK/users never see this. */
|
|
202
218
|
playCallGovernance?: PlayCallGovernanceSnapshot | null;
|
|
203
219
|
preloadedDbSessions?: PreloadedRuntimeDbSession[] | null;
|
|
220
|
+
/** Coordinator already created the child run row before invoking /run-inline. */
|
|
221
|
+
inlineChildRunRegistered?: boolean | null;
|
|
204
222
|
/** Cloudflare coordinator URL for direct Workflow control-plane signals. */
|
|
205
223
|
coordinatorUrl?: string | null;
|
|
206
224
|
/** Request-scoped coordinator auth token for preview/dev direct control calls. */
|
|
@@ -253,11 +271,21 @@ type WorkerEnv = {
|
|
|
253
271
|
* `/api/v2/plays/runtime-tools/*`) skip the public callback URL and route
|
|
254
272
|
* directly through the coordinator's process to the configured app — saves
|
|
255
273
|
* the *.workers.dev → CF edge → cloudflared → localhost chain on every
|
|
256
|
-
* runtime callback.
|
|
257
|
-
*
|
|
274
|
+
* runtime callback. Required for workers_edge; missing binding is an infra
|
|
275
|
+
* error instead of a transport fallback.
|
|
258
276
|
*/
|
|
259
277
|
RUNTIME_API?: {
|
|
260
|
-
|
|
278
|
+
runtimeApiCall(input: {
|
|
279
|
+
executorToken: string;
|
|
280
|
+
path: string;
|
|
281
|
+
body: unknown;
|
|
282
|
+
headers?: Record<string, string>;
|
|
283
|
+
timeoutMs?: number;
|
|
284
|
+
}): Promise<{
|
|
285
|
+
status: number;
|
|
286
|
+
headers?: Record<string, string>;
|
|
287
|
+
body: string;
|
|
288
|
+
}>;
|
|
261
289
|
};
|
|
262
290
|
/**
|
|
263
291
|
* Loopback RPC binding into the coordinator Worker. Used for CF-to-CF
|
|
@@ -280,6 +308,20 @@ type WorkerEnv = {
|
|
|
280
308
|
logs?: string[];
|
|
281
309
|
timings?: Array<{ phase: string; ms: number }>;
|
|
282
310
|
}>;
|
|
311
|
+
submitWorkflowChild?(
|
|
312
|
+
parentRunId: string,
|
|
313
|
+
body: Record<string, unknown>,
|
|
314
|
+
): Promise<{
|
|
315
|
+
workflowId?: string;
|
|
316
|
+
runId?: string;
|
|
317
|
+
status?: string;
|
|
318
|
+
mode?: string;
|
|
319
|
+
output?: unknown;
|
|
320
|
+
result?: unknown;
|
|
321
|
+
error?: unknown;
|
|
322
|
+
logs?: string[];
|
|
323
|
+
timings?: Array<{ phase: string; ms: number }>;
|
|
324
|
+
}>;
|
|
283
325
|
signal(
|
|
284
326
|
runId: string,
|
|
285
327
|
body: Record<string, unknown>,
|
|
@@ -292,6 +334,26 @@ type WorkerEnv = {
|
|
|
292
334
|
runId: string,
|
|
293
335
|
event: Record<string, unknown>,
|
|
294
336
|
): Promise<void>;
|
|
337
|
+
readTerminalState?(runId: string): Promise<Record<string, unknown> | null>;
|
|
338
|
+
readChildTerminalState?(
|
|
339
|
+
parentRunId: string,
|
|
340
|
+
eventKey: string,
|
|
341
|
+
timeoutMs?: number,
|
|
342
|
+
): Promise<Record<string, unknown> | null>;
|
|
343
|
+
/**
|
|
344
|
+
* Distributed Rate State Backend RPC. Routes to the per-(org,provider)
|
|
345
|
+
* rate-bucket Durable Object so the request window is global across
|
|
346
|
+
* isolates. See CoordinatorRateStateBackend + dedup-do.ts.
|
|
347
|
+
*/
|
|
348
|
+
rateAcquire?(input: {
|
|
349
|
+
bucketId: string;
|
|
350
|
+
rules: PacingRule[];
|
|
351
|
+
requested: number;
|
|
352
|
+
}): Promise<{ granted: number; waitMs: number }>;
|
|
353
|
+
ratePenalize?(input: {
|
|
354
|
+
bucketId: string;
|
|
355
|
+
cooldownMs: number;
|
|
356
|
+
}): Promise<void>;
|
|
295
357
|
};
|
|
296
358
|
/**
|
|
297
359
|
* Required service binding to the long-lived Play Harness Worker
|
|
@@ -379,10 +441,9 @@ async function probeHarnessOnce(
|
|
|
379
441
|
}
|
|
380
442
|
}
|
|
381
443
|
/**
|
|
382
|
-
* Routes runtime API requests through the in-process RUNTIME_API
|
|
383
|
-
*
|
|
384
|
-
*
|
|
385
|
-
* fetch transport so the play still reaches the same authenticated handler.
|
|
444
|
+
* Routes runtime API requests through the in-process RUNTIME_API service
|
|
445
|
+
* binding. workers_edge treats a missing binding as infrastructure failure
|
|
446
|
+
* instead of falling back to public HTTP.
|
|
386
447
|
*/
|
|
387
448
|
const RUNTIME_API_TIMEOUT_MS = 30_000;
|
|
388
449
|
const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
|
|
@@ -390,7 +451,6 @@ const RUNTIME_API_INTEGRATION_EXECUTE_TIMEOUT_MS = 180_000;
|
|
|
390
451
|
const RUNTIME_API_RETRY_DELAYS_MS = [
|
|
391
452
|
250, 750, 1500, 3000, 5000, 10000,
|
|
392
453
|
] as const;
|
|
393
|
-
let loggedMissingRuntimeApiBinding = false;
|
|
394
454
|
|
|
395
455
|
async function fetchRuntimeApi(
|
|
396
456
|
baseUrl: string,
|
|
@@ -418,37 +478,25 @@ async function fetchRuntimeApi(
|
|
|
418
478
|
try {
|
|
419
479
|
const mergedInit: RequestInit = {
|
|
420
480
|
...init,
|
|
421
|
-
headers: runtimeApiHeaders(init.headers,
|
|
481
|
+
headers: runtimeApiHeaders(init.headers, false),
|
|
422
482
|
signal: controller.signal,
|
|
423
483
|
};
|
|
424
484
|
if (!cachedRuntimeApiBinding) {
|
|
425
|
-
|
|
426
|
-
loggedMissingRuntimeApiBinding = true;
|
|
427
|
-
console.warn(
|
|
428
|
-
`[play-harness] RUNTIME_API binding missing; using public runtime API transport. path=${path}`,
|
|
429
|
-
);
|
|
430
|
-
}
|
|
431
|
-
return await Promise.race([
|
|
432
|
-
fetch(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
|
|
433
|
-
timeoutPromise,
|
|
434
|
-
]);
|
|
485
|
+
throw new Error('[play-harness] RUNTIME_API service binding is required');
|
|
435
486
|
}
|
|
436
|
-
const responsePromise =
|
|
437
|
-
|
|
487
|
+
const responsePromise = callRuntimeApiRpcBinding(
|
|
488
|
+
cachedRuntimeApiBinding,
|
|
489
|
+
mergedInit,
|
|
490
|
+
{
|
|
491
|
+
path,
|
|
492
|
+
timeoutMs,
|
|
493
|
+
},
|
|
438
494
|
);
|
|
439
495
|
const response = await Promise.race([responsePromise, timeoutPromise]);
|
|
440
|
-
if (await
|
|
441
|
-
|
|
442
|
-
`[play-harness] RUNTIME_API binding
|
|
496
|
+
if (await isRuntimeApiBindingNotFoundResponse(response)) {
|
|
497
|
+
throw new Error(
|
|
498
|
+
`[play-harness] RUNTIME_API service binding could not route ${path}; coordinator returned not found.`,
|
|
443
499
|
);
|
|
444
|
-
return await Promise.race([
|
|
445
|
-
fetch(`${baseUrl.replace(/\/$/, '')}${path}`, {
|
|
446
|
-
...init,
|
|
447
|
-
headers: runtimeApiHeaders(init.headers, true),
|
|
448
|
-
signal: controller.signal,
|
|
449
|
-
}),
|
|
450
|
-
timeoutPromise,
|
|
451
|
-
]);
|
|
452
500
|
}
|
|
453
501
|
return response;
|
|
454
502
|
} catch (err) {
|
|
@@ -463,7 +511,33 @@ async function fetchRuntimeApi(
|
|
|
463
511
|
}
|
|
464
512
|
}
|
|
465
513
|
|
|
466
|
-
async function
|
|
514
|
+
/**
 * Calls the RUNTIME_API service binding's `runtimeApiCall` RPC and adapts the
 * result into a fetch-style `Response`.
 *
 * Only three things are forwarded from `init`: the bearer token (taken from the
 * `authorization` header), the tool-metadata header, and the response-contract
 * header. The RPC input has no method field, so `init.method` is not forwarded.
 * A string `init.body` is parsed as JSON; any other body (or none) is sent as
 * `{}`.
 *
 * @param binding RUNTIME_API service binding exposing `runtimeApiCall`.
 * @param init Fetch-style request init supplying headers and body.
 * @param input Runtime API path plus the timeout forwarded to the RPC.
 * @returns A `Response` built from the RPC's status, headers, and body.
 * @throws SyntaxError when `init.body` is a non-empty string that is not JSON.
 */
async function callRuntimeApiRpcBinding(
  binding: NonNullable<WorkerEnv['RUNTIME_API']>,
  init: RequestInit,
  input: { path: string; timeoutMs: number },
): Promise<Response> {
  const requestHeaders = new Headers(init.headers);
  const authorization = requestHeaders.get('authorization') ?? '';

  // Forward only the headers the RPC contract knows about.
  const headers: Record<string, string> = {};
  const metadata = requestHeaders.get(EXECUTE_TOOL_METADATA_HEADER);
  if (metadata) headers[EXECUTE_TOOL_METADATA_HEADER] = metadata;
  const contract = requestHeaders.get(EXECUTE_RESPONSE_CONTRACT_HEADER);
  if (contract) headers[EXECUTE_RESPONSE_CONTRACT_HEADER] = contract;

  const rawBody = typeof init.body === 'string' ? init.body : '';
  const result = await binding.runtimeApiCall({
    executorToken: authorization.replace(/^Bearer\s+/i, '').trim(),
    path: input.path,
    body: rawBody ? JSON.parse(rawBody) : {},
    headers,
    timeoutMs: input.timeoutMs,
  });

  // The Response constructor throws a TypeError when a null-body status is
  // paired with a non-null body — and the RPC always returns a string body,
  // even for e.g. 204. Drop the body for those statuses instead of throwing.
  const isNullBodyStatus = [101, 103, 204, 205, 304].includes(result.status);
  return new Response(isNullBodyStatus ? null : result.body, {
    status: result.status,
    headers: result.headers ?? {},
  });
}
|
|
539
|
+
|
|
540
|
+
async function isRuntimeApiBindingNotFoundResponse(
|
|
467
541
|
response: Response,
|
|
468
542
|
): Promise<boolean> {
|
|
469
543
|
if (response.status !== 404) {
|
|
@@ -494,13 +568,6 @@ function cachedVercelProtectionBypassToken(): string | null {
|
|
|
494
568
|
return cachedRuntimeApiVercelBypassToken;
|
|
495
569
|
}
|
|
496
570
|
|
|
497
|
-
const WORKER_PLAY_CALL_LIMITS = {
|
|
498
|
-
maxPlayCallDepth: 6,
|
|
499
|
-
maxPlayCallCount: 1_000,
|
|
500
|
-
maxChildPlayCallsPerParent: 1_000,
|
|
501
|
-
maxConcurrentPlayCalls: 16,
|
|
502
|
-
};
|
|
503
|
-
|
|
504
571
|
type RunnerEvent =
|
|
505
572
|
| {
|
|
506
573
|
type: 'log';
|
|
@@ -520,6 +587,12 @@ type WorkflowRunOutput = {
|
|
|
520
587
|
durationMs: number;
|
|
521
588
|
};
|
|
522
589
|
|
|
590
|
+
type InlineRunTiming = {
|
|
591
|
+
phase: string;
|
|
592
|
+
ms: number;
|
|
593
|
+
extra?: Record<string, unknown>;
|
|
594
|
+
};
|
|
595
|
+
|
|
523
596
|
type WorkerCtxCallbacks = {
|
|
524
597
|
onNodeProgress?: (input: {
|
|
525
598
|
nodeId: string;
|
|
@@ -658,9 +731,8 @@ async function postRuntimeApi<T>(
|
|
|
658
731
|
executorToken: string,
|
|
659
732
|
body: unknown,
|
|
660
733
|
): Promise<T> {
|
|
661
|
-
// Routes through the in-process RUNTIME_API binding
|
|
662
|
-
//
|
|
663
|
-
// hits the same handler with the same auth — only the transport changes.
|
|
734
|
+
// Routes through the in-process RUNTIME_API service binding. Missing binding
|
|
735
|
+
// is an infra error in workers_edge, not a reason to fall back to public HTTP.
|
|
664
736
|
const serializedBody = JSON.stringify(body);
|
|
665
737
|
let lastError: unknown = null;
|
|
666
738
|
for (
|
|
@@ -797,6 +869,15 @@ async function submitChildPlayThroughCoordinator(input: {
|
|
|
797
869
|
}
|
|
798
870
|
return cachedCoordinatorBinding.submitChild(input.req.runId, input.body);
|
|
799
871
|
}
|
|
872
|
+
if (cachedCoordinatorBinding?.submitWorkflowChild) {
|
|
873
|
+
if (!isRecord(input.body)) {
|
|
874
|
+
throw new Error('ctx.runPlay child submit requires an object body.');
|
|
875
|
+
}
|
|
876
|
+
return cachedCoordinatorBinding.submitWorkflowChild(
|
|
877
|
+
input.req.runId,
|
|
878
|
+
input.body,
|
|
879
|
+
);
|
|
880
|
+
}
|
|
800
881
|
const coordinatorUrl = input.req.coordinatorUrl?.trim();
|
|
801
882
|
if (coordinatorUrl) {
|
|
802
883
|
// Keep child plays on the same coordinator/Workflow submit path as
|
|
@@ -924,46 +1005,6 @@ function workflowTimeoutFromMs(timeoutMs: number): string {
|
|
|
924
1005
|
return `${seconds} second${seconds === 1 ? '' : 's'}`;
|
|
925
1006
|
}
|
|
926
1007
|
|
|
927
|
-
async function waitForChildPlayTerminalEvent(input: {
|
|
928
|
-
req: RunRequest;
|
|
929
|
-
workflowStep?: WorkflowStep;
|
|
930
|
-
workflowId: string;
|
|
931
|
-
playName: string;
|
|
932
|
-
key: string;
|
|
933
|
-
timeoutMs: number;
|
|
934
|
-
}): Promise<unknown> {
|
|
935
|
-
if (!input.workflowStep) {
|
|
936
|
-
throw new Error(
|
|
937
|
-
'ctx.runPlay child waits require the cf-workflows runtime event scheduler.',
|
|
938
|
-
);
|
|
939
|
-
}
|
|
940
|
-
const eventKey = await childPlayEventKey({
|
|
941
|
-
key: input.key,
|
|
942
|
-
workflowId: input.workflowId,
|
|
943
|
-
});
|
|
944
|
-
const event = (await (
|
|
945
|
-
input.workflowStep.waitForEvent as unknown as (
|
|
946
|
-
name: string,
|
|
947
|
-
options: { type: string; timeout: string },
|
|
948
|
-
) => Promise<{ payload: unknown }>
|
|
949
|
-
)(`child_play_terminal:${eventKey}`, {
|
|
950
|
-
type: integrationEventType(eventKey),
|
|
951
|
-
timeout: workflowTimeoutFromMs(input.timeoutMs),
|
|
952
|
-
})) as { payload: unknown };
|
|
953
|
-
const rawPayload = isRecord(event.payload) ? event.payload : {};
|
|
954
|
-
const payload = isRecord(rawPayload.data) ? rawPayload.data : rawPayload;
|
|
955
|
-
const status = String(payload.status ?? '').toLowerCase();
|
|
956
|
-
if (status === 'completed') {
|
|
957
|
-
return extractChildPlayOutput(payload);
|
|
958
|
-
}
|
|
959
|
-
const error = isRecord(payload.error) ? payload.error : null;
|
|
960
|
-
const message =
|
|
961
|
-
(typeof error?.message === 'string' && error.message.trim()) ||
|
|
962
|
-
(typeof payload.error === 'string' && payload.error.trim()) ||
|
|
963
|
-
`Child play ${input.playName} (${input.workflowId}) finished with status ${status || 'unknown'}.`;
|
|
964
|
-
throw new Error(message);
|
|
965
|
-
}
|
|
966
|
-
|
|
967
1008
|
async function signalParentPlayTerminal(input: {
|
|
968
1009
|
req: RunRequest;
|
|
969
1010
|
status: 'completed' | 'failed' | 'cancelled';
|
|
@@ -1045,6 +1086,8 @@ async function executeTool(
|
|
|
1045
1086
|
req: RunRequest,
|
|
1046
1087
|
args: { id: string; toolId: string; input: Record<string, unknown> },
|
|
1047
1088
|
workflowStep?: WorkflowStep,
|
|
1089
|
+
onProviderBackpressure?: (retryAfterMs: number) => void,
|
|
1090
|
+
onRetryAttempt?: () => void,
|
|
1048
1091
|
): Promise<ToolExecuteResult> {
|
|
1049
1092
|
if (args.toolId === 'test_wait_for_event' && workflowStep) {
|
|
1050
1093
|
const result = await waitForSyntheticIntegrationEvent(
|
|
@@ -1059,7 +1102,7 @@ async function executeTool(
|
|
|
1059
1102
|
// service bindings, NOT through HTTP from this worker. Removing the
|
|
1060
1103
|
// dispatcher-side coordinatorUrl plumbing intentionally turns the old
|
|
1061
1104
|
// HTTP-based dedup helpers into dead code.
|
|
1062
|
-
return callToolDirect(req, args);
|
|
1105
|
+
return callToolDirect(req, args, onProviderBackpressure, onRetryAttempt);
|
|
1063
1106
|
}
|
|
1064
1107
|
|
|
1065
1108
|
async function executeToolWithLifecycle(
|
|
@@ -1193,6 +1236,13 @@ async function waitForSyntheticIntegrationEvent(
|
|
|
1193
1236
|
async function callToolDirect(
|
|
1194
1237
|
req: RunRequest,
|
|
1195
1238
|
args: { id: string; toolId: string; input: Record<string, unknown> },
|
|
1239
|
+
onProviderBackpressure?: (retryAfterMs: number) => void,
|
|
1240
|
+
// Invoked once per in-process retry attempt (429 / retryable 5xx / synthetic
|
|
1241
|
+
// transient) so the Governor charges chargeBudget('retry') per attempt — the
|
|
1242
|
+
// same runaway guard the cjs runner applies (context.ts charges retry on each
|
|
1243
|
+
// 429 / transient-5xx retry). Without this the worker substrate would leave
|
|
1244
|
+
// policy.budgets.maxRetryCount effectively unenforced.
|
|
1245
|
+
onRetryAttempt?: () => void,
|
|
1196
1246
|
): Promise<ToolExecuteResult> {
|
|
1197
1247
|
const { id, toolId, input } = args;
|
|
1198
1248
|
if (toolId === 'test_rate_limit') {
|
|
@@ -1233,6 +1283,8 @@ async function callToolDirect(
|
|
|
1233
1283
|
if (attempt >= maxAttempts) {
|
|
1234
1284
|
throw lastError;
|
|
1235
1285
|
}
|
|
1286
|
+
// Charge the retry budget per attempt, matching the cjs runner.
|
|
1287
|
+
onRetryAttempt?.();
|
|
1236
1288
|
await new Promise((resolve) => setTimeout(resolve, 1_000));
|
|
1237
1289
|
continue;
|
|
1238
1290
|
}
|
|
@@ -1273,17 +1325,26 @@ async function callToolDirect(
|
|
|
1273
1325
|
maxAttempts,
|
|
1274
1326
|
bodyText: text,
|
|
1275
1327
|
});
|
|
1328
|
+
const retryAfterSeconds = Number(res.headers.get('retry-after'));
|
|
1329
|
+
const retryAfterMs =
|
|
1330
|
+
Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
|
|
1331
|
+
? Math.ceil(retryAfterSeconds * 1000)
|
|
1332
|
+
: 0;
|
|
1333
|
+
if (res.status === 429) {
|
|
1334
|
+
// Feed the provider's backpressure into the shared pacer even on the
|
|
1335
|
+
// final attempt so the (org, provider) bucket backs off across isolates.
|
|
1336
|
+
onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
|
|
1337
|
+
}
|
|
1276
1338
|
const retryable =
|
|
1277
1339
|
(res.status === 429 && !isHardBillingToolHttpError(lastError)) ||
|
|
1278
1340
|
(res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
|
|
1279
1341
|
if (!retryable || attempt >= maxAttempts) {
|
|
1280
1342
|
throw lastError;
|
|
1281
1343
|
}
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
: 1_000;
|
|
1344
|
+
// Charge the retry budget per attempt, matching the cjs runner's
|
|
1345
|
+
// chargeBudget('retry') on every 429 / retryable-5xx retry.
|
|
1346
|
+
onRetryAttempt?.();
|
|
1347
|
+
const delayMs = retryAfterMs > 0 ? Math.min(5_000, retryAfterMs) : 1_000;
|
|
1287
1348
|
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
1288
1349
|
}
|
|
1289
1350
|
|
|
@@ -1731,6 +1792,11 @@ type WorkerToolBatchRequest = {
|
|
|
1731
1792
|
};
|
|
1732
1793
|
|
|
1733
1794
|
const WORKER_TOOL_BATCH_GRACE_MS = 15;
|
|
1795
|
+
// Fallback batch-chunk parallelism when a tool declares no provider rate hints.
|
|
1796
|
+
// Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
|
|
1797
|
+
// their previous batching behavior; declared providers tighten via the
|
|
1798
|
+
// Governor's suggestedParallelism.
|
|
1799
|
+
const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
|
|
1734
1800
|
const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
|
|
1735
1801
|
|
|
1736
1802
|
function stepProgramColumnName(parentField: string, stepId: string): string {
|
|
@@ -1741,7 +1807,31 @@ class WorkerToolBatchScheduler {
|
|
|
1741
1807
|
private queue: WorkerToolBatchRequest[] = [];
|
|
1742
1808
|
private scheduled = false;
|
|
1743
1809
|
|
|
1744
|
-
constructor(
|
|
1810
|
+
constructor(
|
|
1811
|
+
private readonly req: RunRequest,
|
|
1812
|
+
private readonly governor: PlayExecutionGovernor,
|
|
1813
|
+
private readonly resolvePacing: WorkerPacingResolver,
|
|
1814
|
+
private readonly abortSignal?: AbortSignal,
|
|
1815
|
+
) {}
|
|
1816
|
+
|
|
1817
|
+
/**
 * Report a provider 429 / Retry-After back into the Governor so later
 * acquires for the same provider can back off. The provider is looked up
 * through the same pacing resolver the Governor was built with, so callers
 * pass only the toolId.
 *
 * This is fire-and-forget and best-effort: it never throws and never rejects.
 * A failed pacing lookup is ignored; a failure while reporting is logged.
 *
 * @param toolId Tool whose provider signalled backpressure.
 * @param retryAfterMs Suggested cooldown in milliseconds; non-positive or NaN
 *   values are ignored.
 */
private reportBackpressure(toolId: string, retryAfterMs: number): void {
  // `!(x > 0)` also rejects NaN, which `x <= 0` would let through.
  if (!(retryAfterMs > 0)) return;
  void (async () => {
    const pacing = await this.resolvePacing(toolId).catch(() => null);
    if (!pacing?.provider) return;
    try {
      // Awaited so both a synchronous throw and a rejected promise are caught
      // here instead of surfacing as an unhandled rejection.
      await this.governor.reportProviderBackpressure({
        provider: pacing.provider,
        retryAfterMs,
      });
    } catch (error) {
      console.warn(
        `[play-harness] failed to report provider backpressure. toolId=${toolId}`,
        error,
      );
    }
  })();
}
|
|
1745
1835
|
|
|
1746
1836
|
execute(
|
|
1747
1837
|
id: string,
|
|
@@ -1824,16 +1914,26 @@ class WorkerToolBatchScheduler {
|
|
|
1824
1914
|
const groupStartedAt = nowMs();
|
|
1825
1915
|
await Promise.all(
|
|
1826
1916
|
requests.map(async (request) => {
|
|
1917
|
+
// Each unbatched provider call takes its own tool slot: the Governor
|
|
1918
|
+
// charges tool budget, holds a global tool-concurrency slot, and
|
|
1919
|
+
// applies per-(org,provider) pacing before the call runs.
|
|
1920
|
+
const slot = await this.governor.acquireToolSlot(toolId, {
|
|
1921
|
+
signal: this.abortSignal,
|
|
1922
|
+
});
|
|
1827
1923
|
try {
|
|
1828
1924
|
request.resolve(
|
|
1829
1925
|
await executeTool(
|
|
1830
1926
|
this.req,
|
|
1831
1927
|
{ id: request.id, toolId, input: request.input },
|
|
1832
1928
|
request.workflowStep,
|
|
1929
|
+
(retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
|
|
1930
|
+
() => this.governor.chargeBudget('retry'),
|
|
1833
1931
|
),
|
|
1834
1932
|
);
|
|
1835
1933
|
} catch (error) {
|
|
1836
1934
|
request.reject(error);
|
|
1935
|
+
} finally {
|
|
1936
|
+
slot.release();
|
|
1837
1937
|
}
|
|
1838
1938
|
}),
|
|
1839
1939
|
);
|
|
@@ -1851,6 +1951,14 @@ class WorkerToolBatchScheduler {
|
|
|
1851
1951
|
req: this.req,
|
|
1852
1952
|
requests,
|
|
1853
1953
|
strategy,
|
|
1954
|
+
governor: this.governor,
|
|
1955
|
+
suggestedParallelism: await this.governor.suggestedParallelism(
|
|
1956
|
+
toolId,
|
|
1957
|
+
WORKER_TOOL_BATCH_DEFAULT_PARALLELISM,
|
|
1958
|
+
),
|
|
1959
|
+
abortSignal: this.abortSignal,
|
|
1960
|
+
reportBackpressure: (retryAfterMs) =>
|
|
1961
|
+
this.reportBackpressure(toolId, retryAfterMs),
|
|
1854
1962
|
});
|
|
1855
1963
|
recordRunnerPerfTrace({
|
|
1856
1964
|
req: this.req,
|
|
@@ -1880,6 +1988,10 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1880
1988
|
req: RunRequest;
|
|
1881
1989
|
requests: WorkerToolBatchRequest[];
|
|
1882
1990
|
strategy: AnyBatchOperationStrategy;
|
|
1991
|
+
governor: PlayExecutionGovernor;
|
|
1992
|
+
suggestedParallelism: number;
|
|
1993
|
+
abortSignal?: AbortSignal;
|
|
1994
|
+
reportBackpressure: (retryAfterMs: number) => void;
|
|
1883
1995
|
}): Promise<void> {
|
|
1884
1996
|
const compiledBatches = compileRequestsWithStrategy({
|
|
1885
1997
|
requests: input.requests,
|
|
@@ -1889,13 +2001,34 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1889
2001
|
|
|
1890
2002
|
await executeChunkedRequests({
|
|
1891
2003
|
requests: compiledBatches,
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
2004
|
+
// Chunk parallelism is the Governor's per-tool suggestion (provider rate
|
|
2005
|
+
// hints tightened to the policy ceiling), bounded by the batch count.
|
|
2006
|
+
batchSize: Math.max(
|
|
2007
|
+
1,
|
|
2008
|
+
Math.min(input.suggestedParallelism, compiledBatches.length || 1),
|
|
2009
|
+
),
|
|
2010
|
+
execute: async (batch) => {
|
|
2011
|
+
// One provider call per batch → one tool slot (budget + global
|
|
2012
|
+
// concurrency + per-(org,provider) pacing) around the whole batch.
|
|
2013
|
+
const slot = await input.governor.acquireToolSlot(batch.batchOperation, {
|
|
2014
|
+
signal: input.abortSignal,
|
|
2015
|
+
});
|
|
2016
|
+
try {
|
|
2017
|
+
return await executeTool(
|
|
2018
|
+
input.req,
|
|
2019
|
+
{
|
|
2020
|
+
id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
|
|
2021
|
+
toolId: batch.batchOperation,
|
|
2022
|
+
input: batch.batchPayload,
|
|
2023
|
+
},
|
|
2024
|
+
undefined,
|
|
2025
|
+
input.reportBackpressure,
|
|
2026
|
+
() => input.governor.chargeBudget('retry'),
|
|
2027
|
+
);
|
|
2028
|
+
} finally {
|
|
2029
|
+
slot.release();
|
|
2030
|
+
}
|
|
2031
|
+
},
|
|
1899
2032
|
onChunkComplete: async (
|
|
1900
2033
|
chunkResults: Array<
|
|
1901
2034
|
ChunkExecutionResult<(typeof compiledBatches)[number], unknown>
|
|
@@ -3164,9 +3297,23 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
|
|
|
3164
3297
|
/**
 * Reports whether a child play's static pipeline touches a ctx dataset.
 *
 * True when the pipeline itself declares a non-empty `tableNamespace` or a
 * truthy `sheetContract`, or when any flattened substep is of type `dataset`
 * or carries either of those markers. A missing pipeline yields false.
 */
function childPipelineUsesCtxDataset(
  pipeline: PlayStaticPipeline | null | undefined,
): boolean {
  if (!pipeline) {
    return false;
  }

  const hasOwnNamespace =
    typeof pipeline.tableNamespace === 'string' && pipeline.tableNamespace;
  if (hasOwnNamespace || pipeline.sheetContract) {
    return true;
  }

  for (const step of flattenStaticPipeline(pipeline)) {
    if (step.type === 'dataset') {
      return true;
    }
    if (!isRecord(step)) {
      continue;
    }
    if (
      'tableNamespace' in step &&
      typeof step.tableNamespace === 'string' &&
      step.tableNamespace.length > 0
    ) {
      return true;
    }
    if ('sheetContract' in step && Boolean(step.sheetContract)) {
      return true;
    }
  }
  return false;
}
|
|
3171
3318
|
|
|
3172
3319
|
function childPipelineNeedsWorkflowScheduler(
|
|
@@ -3181,16 +3328,160 @@ function childPipelineNeedsWorkflowScheduler(
|
|
|
3181
3328
|
);
|
|
3182
3329
|
}
|
|
3183
3330
|
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3331
|
+
/**
 * Creates the rate port that the distributed Rate State Backend uses to reach
 * the coordinator's `rateAcquire` / `ratePenalize` RPCs.
 *
 * The coordinator binding is read on every call rather than captured once.
 * If the binding or the specific RPC is missing, the port fails open:
 * `rateAcquire` grants everything requested with no wait, and `ratePenalize`
 * does nothing. Errors thrown by an RPC that does exist are not caught here.
 */
function createCoordinatorRatePort(): CoordinatorRatePort {
  return {
    rateAcquire: async (input) => {
      const coordinator = cachedCoordinatorBinding;
      // Call through the binding object so the RPC keeps its receiver.
      return coordinator?.rateAcquire
        ? await coordinator.rateAcquire(input)
        : { granted: input.requested, waitMs: 0 };
    },
    ratePenalize: async (input) => {
      const coordinator = cachedCoordinatorBinding;
      if (coordinator?.ratePenalize) {
        await coordinator.ratePenalize(input);
      }
    },
  };
}
|
|
3353
|
+
|
|
3354
|
+
/**
 * Resolves a tool's provider and pacing rules. A `null` result means "no
 * pacing information available" for that tool.
 */
type WorkerPacingResolver = (
  toolId: string,
) => Promise<{ provider: string; rules: PacingRule[] } | null>;

/**
 * Builds a pacing resolver backed by the runtime tool-metadata endpoint
 * (`/api/v2/plays/runtime-tools/:toolId`), fetched through `fetchRuntimeApi`.
 * The existing note says this is the same endpoint the cjs_node20 runner uses
 * for `getToolQueueHints`.
 *
 * Results are memoized per resolver instance, keyed by the trimmed tool id, so
 * concurrent callers share a single in-flight lookup. Definitive answers
 * (valid hints, no hints, non-retryable HTTP errors) are cached for the
 * resolver's lifetime. Transient failures (transport error, 408, 429, 5xx)
 * are evicted so a later call can retry, up to `MAX_TRANSIENT_LOOKUPS`
 * attempts per tool; after that `null` stays cached so a persistently failing
 * endpoint cannot add a lookup to every tool call.
 *
 * A queue hint becomes a rule only when `ruleId` is a string and both
 * `requestsPerWindow` and `windowMs` are finite numbers greater than zero.
 *
 * @param req Run request supplying the runtime API base URL and executor token.
 * @returns A resolver that never rejects; it resolves to `null` when the tool
 *   id is blank, the lookup fails, or no usable provider/rules are returned.
 */
function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
  type Pacing = { provider: string; rules: PacingRule[] };
  type Lookup = { pacing: Pacing | null; transient: boolean };

  const MAX_TRANSIENT_LOOKUPS = 3;
  const cache = new Map<string, Promise<Pacing | null>>();
  const transientFailures = new Map<string, number>();

  const isPositiveFinite = (value: unknown): value is number =>
    typeof value === 'number' && Number.isFinite(value) && value > 0;

  // Convert raw queue hints into pacing rules, dropping malformed entries.
  const toRules = (queueHints: unknown[]): PacingRule[] =>
    queueHints.flatMap((hint) => {
      if (!hint || typeof hint !== 'object') return [];
      const { ruleId, requestsPerWindow, windowMs, maxConcurrency } =
        hint as Record<string, unknown>;
      if (
        typeof ruleId !== 'string' ||
        !isPositiveFinite(requestsPerWindow) ||
        !isPositiveFinite(windowMs)
      ) {
        return [];
      }
      return [
        {
          ruleId,
          requestsPerWindow,
          windowMs,
          maxConcurrency:
            typeof maxConcurrency === 'number' ? maxConcurrency : null,
        } satisfies PacingRule,
      ];
    });

  // One metadata fetch; reports whether a null result is worth retrying.
  const lookup = async (toolId: string): Promise<Lookup> => {
    let res: Response;
    try {
      res = await fetchRuntimeApi(
        req.baseUrl,
        `/api/v2/plays/runtime-tools/${encodeURIComponent(toolId)}`,
        {
          method: 'GET',
          headers: { authorization: `Bearer ${req.executorToken}` },
        },
      );
    } catch {
      return { pacing: null, transient: true };
    }
    if (!res || !res.ok) {
      const status = res ? res.status : 0;
      return {
        pacing: null,
        transient: !res || status === 408 || status === 429 || status >= 500,
      };
    }
    const body = (await res.json().catch(() => null)) as {
      provider?: unknown;
      queueHints?: unknown;
    } | null;
    if (!body) return { pacing: null, transient: false };
    const provider =
      typeof body.provider === 'string' && body.provider.trim()
        ? body.provider.trim()
        : null;
    if (!provider || !Array.isArray(body.queueHints)) {
      return { pacing: null, transient: false };
    }
    const rules = toRules(body.queueHints);
    return {
      pacing: rules.length === 0 ? null : { provider, rules },
      transient: false,
    };
  };

  return (toolId: string) => {
    const normalized = String(toolId || '').trim();
    if (!normalized) return Promise.resolve(null);
    const cached = cache.get(normalized);
    if (cached) return cached;
    const pending = lookup(normalized).then(({ pacing, transient }) => {
      if (transient) {
        const failures = (transientFailures.get(normalized) ?? 0) + 1;
        transientFailures.set(normalized, failures);
        // Evict so the next caller retries, until the retry cap is reached.
        if (failures < MAX_TRANSIENT_LOOKUPS) cache.delete(normalized);
      } else {
        transientFailures.delete(normalized);
      }
      return pacing;
    });
    cache.set(normalized, pending);
    return pending;
  };
}
|
|
3424
|
+
|
|
3425
|
+
/**
 * Builds the Governor's starting lineage snapshot for this worker from the
 * request's inherited `playCallGovernance` (if any).
 *
 * - `rootRunId` is the inherited root when it is non-empty, otherwise this
 *   run's id.
 * - `ancestryPlayIds` is a copy of the inherited chain, with `req.playName`
 *   appended unless the chain already ends with it; with no inherited chain it
 *   is just `[req.playName]`.
 * - `ancestryRunIds` is `[rootRunId, req.runId]`, collapsed to one entry when
 *   this run is the root.
 * - `callDepth` and the budget counters (play call, tool call, retry,
 *   descendant, waterfall step) are taken from the inherited snapshot and fall
 *   back to 0 when it does not carry them.
 * - `parentChildCalls` always starts empty.
 *
 * The current play id is always `req.playName`.
 */
function resumeGovernanceFromRequest(req: RunRequest): GovernanceSnapshot {
  const parent = req.playCallGovernance;
  const rootRunId = parent?.rootRunId || req.runId;

  // Make sure the play chain ends with the currently executing play.
  const inheritedPlayIds = parent?.ancestryPlayIds ?? [];
  const endsWithCurrentPlay =
    inheritedPlayIds.length > 0 &&
    inheritedPlayIds[inheritedPlayIds.length - 1] === req.playName;
  const ancestryPlayIds = endsWithCurrentPlay
    ? [...inheritedPlayIds]
    : [...inheritedPlayIds, req.playName];

  const ancestryRunIds = [rootRunId];
  if (rootRunId !== req.runId) {
    ancestryRunIds.push(req.runId);
  }

  return {
    rootRunId,
    currentRunId: req.runId,
    currentPlayId: req.playName,
    ancestryPlayIds,
    ancestryRunIds,
    callDepth: parent?.callDepth ?? 0,
    // Counters carry over from the parent; 0 when the caller sent none.
    playCallCount: parent?.playCallCount ?? 0,
    toolCallCount: parent?.toolCallCount ?? 0,
    retryCount: parent?.retryCount ?? 0,
    descendantCount: parent?.descendantCount ?? 0,
    waterfallStepExecutions: parent?.waterfallStepExecutions ?? 0,
    parentChildCalls: {},
  };
}
|
|
3468
|
+
|
|
3469
|
+
/**
 * Build the execution Governor for one run, together with the pacing
 * resolver it was configured with.
 *
 * The resume snapshot is computed once via `resumeGovernanceFromRequest` and
 * its `rootRunId` is reused for the Governor scope, so the scope and the
 * resumed lineage always name the same root run. (Previously the scope used
 * `??` while the snapshot used `||`, so an empty-string inherited root id
 * produced two different roots.)
 *
 * @param req Run request the Governor is created for.
 * @returns The Governor and the pacing resolver handed to it; the resolver is
 *   returned separately so callers can reuse the same instance.
 */
function createGovernorForRun(req: RunRequest): {
  governor: PlayExecutionGovernor;
  resolvePacing: WorkerPacingResolver;
} {
  const resolvePacing = createWorkerPacingResolver(req);
  // Single source of truth for the lineage root used by both scope and resume.
  const resume = resumeGovernanceFromRequest(req);
  const governor = createPlayExecutionGovernor({
    adapter: 'esm_workers',
    scope: {
      orgId: req.orgId,
      rootRunId: resume.rootRunId,
    },
    rateState: new CoordinatorRateStateBackend(createCoordinatorRatePort()),
    resolvePacing,
    resume,
  });
  return { governor, resolvePacing };
}
|
|
3195
3486
|
|
|
3196
3487
|
function createMinimalWorkerCtx(
|
|
@@ -3201,12 +3492,12 @@ function createMinimalWorkerCtx(
|
|
|
3201
3492
|
abortSignal?: AbortSignal,
|
|
3202
3493
|
callbacks?: WorkerCtxCallbacks,
|
|
3203
3494
|
): unknown {
|
|
3204
|
-
|
|
3205
|
-
|
|
3495
|
+
const { governor, resolvePacing: resolveToolPacing } =
|
|
3496
|
+
createGovernorForRun(req);
|
|
3497
|
+
// Play-call depth/count/per-parent budgets, child-play concurrency, and the
|
|
3498
|
+
// lineage snapshot are owned by the Governor (createGovernorForRun above).
|
|
3499
|
+
// The worker keeps only substrate mechanism here.
|
|
3206
3500
|
const stepCallCounts: Record<string, number> = {};
|
|
3207
|
-
const inFlightChildCallsByPlayName: Record<string, number> = {};
|
|
3208
|
-
let inFlightChildPlayCalls = 0;
|
|
3209
|
-
const childPlaySlotWaiters: Array<() => void> = [];
|
|
3210
3501
|
const secretRedactor = createSecretRedactionContext();
|
|
3211
3502
|
|
|
3212
3503
|
const resolveSecretAuth = async (auth?: SecretAuth) => {
|
|
@@ -3245,38 +3536,6 @@ function createMinimalWorkerCtx(
|
|
|
3245
3536
|
: { [auth.header.toLowerCase()]: value };
|
|
3246
3537
|
};
|
|
3247
3538
|
|
|
3248
|
-
const acquireChildPlaySlot = async (): Promise<() => void> => {
|
|
3249
|
-
while (
|
|
3250
|
-
inFlightChildPlayCalls >= WORKER_PLAY_CALL_LIMITS.maxConcurrentPlayCalls
|
|
3251
|
-
) {
|
|
3252
|
-
await new Promise<void>((resolve, reject) => {
|
|
3253
|
-
const waiter = () => {
|
|
3254
|
-
abortSignal?.removeEventListener('abort', onAbort);
|
|
3255
|
-
resolve();
|
|
3256
|
-
};
|
|
3257
|
-
const onAbort = () => {
|
|
3258
|
-
const index = childPlaySlotWaiters.indexOf(waiter);
|
|
3259
|
-
if (index >= 0) childPlaySlotWaiters.splice(index, 1);
|
|
3260
|
-
reject(
|
|
3261
|
-
abortSignal?.reason instanceof Error
|
|
3262
|
-
? abortSignal.reason
|
|
3263
|
-
: new WorkflowAbortError(),
|
|
3264
|
-
);
|
|
3265
|
-
};
|
|
3266
|
-
childPlaySlotWaiters.push(waiter);
|
|
3267
|
-
abortSignal?.addEventListener('abort', onAbort, { once: true });
|
|
3268
|
-
});
|
|
3269
|
-
assertNotAborted(abortSignal);
|
|
3270
|
-
}
|
|
3271
|
-
inFlightChildPlayCalls += 1;
|
|
3272
|
-
let released = false;
|
|
3273
|
-
return () => {
|
|
3274
|
-
if (released) return;
|
|
3275
|
-
released = true;
|
|
3276
|
-
inFlightChildPlayCalls = Math.max(0, inFlightChildPlayCalls - 1);
|
|
3277
|
-
childPlaySlotWaiters.shift()?.();
|
|
3278
|
-
};
|
|
3279
|
-
};
|
|
3280
3539
|
const rootGovernance = req.playCallGovernance;
|
|
3281
3540
|
const rootRunId = rootGovernance?.rootRunId ?? req.runId;
|
|
3282
3541
|
const receiptStore = createHarnessWorkerReceiptStore({
|
|
@@ -3577,7 +3836,12 @@ function createMinimalWorkerCtx(
|
|
|
3577
3836
|
0,
|
|
3578
3837
|
prepared.skipped - missingPreparedRows.length,
|
|
3579
3838
|
);
|
|
3580
|
-
|
|
3839
|
+
// Row concurrency comes from the Governor: an explicit map concurrency is
|
|
3840
|
+
// clamped to the policy row-max, otherwise the policy default. Each row
|
|
3841
|
+
// body additionally acquires a global row slot (the Governor's rowMax
|
|
3842
|
+
// semaphore) so total in-flight rows across all maps in this isolate stay
|
|
3843
|
+
// bounded even when several maps run at once.
|
|
3844
|
+
const concurrency = governor.resolveRowConcurrency();
|
|
3581
3845
|
const executedRows: Array<T & Record<string, unknown>> = new Array(
|
|
3582
3846
|
rowsToExecute.length,
|
|
3583
3847
|
);
|
|
@@ -3594,7 +3858,12 @@ function createMinimalWorkerCtx(
|
|
|
3594
3858
|
>
|
|
3595
3859
|
| undefined
|
|
3596
3860
|
> = new Array(rowsToExecute.length);
|
|
3597
|
-
const toolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3861
|
+
const toolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3862
|
+
req,
|
|
3863
|
+
governor,
|
|
3864
|
+
resolveToolPacing,
|
|
3865
|
+
abortSignal,
|
|
3866
|
+
);
|
|
3598
3867
|
const generatedOutputFields = new Set<string>();
|
|
3599
3868
|
let idx = 0;
|
|
3600
3869
|
const workers: Array<Promise<void>> = [];
|
|
@@ -3605,143 +3874,152 @@ function createMinimalWorkerCtx(
|
|
|
3605
3874
|
if (abortSignal?.aborted) return;
|
|
3606
3875
|
const myIndex = idx++;
|
|
3607
3876
|
if (myIndex >= rowsToExecute.length) return;
|
|
3608
|
-
const
|
|
3609
|
-
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
|
|
3614
|
-
|
|
3615
|
-
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
3623
|
-
|
|
3624
|
-
|
|
3625
|
-
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
|
|
3642
|
-
|
|
3877
|
+
const rowSlot = await governor.acquireRowSlot({
|
|
3878
|
+
signal: abortSignal,
|
|
3879
|
+
});
|
|
3880
|
+
try {
|
|
3881
|
+
const entry = uniqueRowsToExecuteEntries[myIndex]!;
|
|
3882
|
+
const row = pendingRowsByKey.has(entry.rowKey)
|
|
3883
|
+
? ({
|
|
3884
|
+
...entry.row,
|
|
3885
|
+
...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
|
|
3886
|
+
} as T & Record<string, unknown>)
|
|
3887
|
+
: entry.row;
|
|
3888
|
+
const absoluteIndex = entry.absoluteIndex;
|
|
3889
|
+
const enriched: Record<string, unknown> =
|
|
3890
|
+
cloneCsvAliasedRow(row);
|
|
3891
|
+
const fieldOutputs: Record<string, unknown> = {};
|
|
3892
|
+
const cellMetaPatch: Record<
|
|
3893
|
+
string,
|
|
3894
|
+
{
|
|
3895
|
+
status: 'cached' | 'skipped' | 'completed';
|
|
3896
|
+
stage?: string | null;
|
|
3897
|
+
reused?: boolean;
|
|
3898
|
+
runId?: string;
|
|
3899
|
+
completedAt?: number;
|
|
3900
|
+
}
|
|
3901
|
+
> = {};
|
|
3902
|
+
const waterfallOutputs: RecordedWaterfallOutput[] = [];
|
|
3903
|
+
const stepProgramOutputs: RecordedStepProgramOutput[] = [];
|
|
3904
|
+
const rowCtx = {
|
|
3905
|
+
...(ctx as Record<string, unknown>),
|
|
3906
|
+
tools: {
|
|
3907
|
+
...((ctx as { tools?: Record<string, unknown> }).tools ??
|
|
3908
|
+
{}),
|
|
3909
|
+
execute: async (requestArg: unknown): Promise<unknown> => {
|
|
3910
|
+
assertNotAborted(abortSignal);
|
|
3911
|
+
const request = normalizeToolExecuteArgs(requestArg);
|
|
3912
|
+
return await toolBatchScheduler.execute(
|
|
3913
|
+
request.id,
|
|
3914
|
+
request.toolId,
|
|
3915
|
+
request.input,
|
|
3916
|
+
workflowStep,
|
|
3917
|
+
);
|
|
3918
|
+
},
|
|
3643
3919
|
},
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
|
|
3663
|
-
|
|
3664
|
-
|
|
3665
|
-
|
|
3666
|
-
|
|
3667
|
-
|
|
3668
|
-
|
|
3669
|
-
|
|
3670
|
-
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
|
|
3675
|
-
|
|
3676
|
-
|
|
3677
|
-
|
|
3678
|
-
|
|
3679
|
-
|
|
3680
|
-
|
|
3681
|
-
|
|
3682
|
-
|
|
3683
|
-
|
|
3684
|
-
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
|
|
3688
|
-
|
|
3689
|
-
|
|
3920
|
+
waterfall: (
|
|
3921
|
+
toolNameOrSpec: string | WorkerInlineWaterfallSpec,
|
|
3922
|
+
waterfallInput: Record<string, unknown>,
|
|
3923
|
+
waterfallOpts?: WorkerWaterfallOptions,
|
|
3924
|
+
) =>
|
|
3925
|
+
executeWorkerWaterfall(
|
|
3926
|
+
req,
|
|
3927
|
+
waterfallOutputs,
|
|
3928
|
+
toolNameOrSpec,
|
|
3929
|
+
waterfallInput,
|
|
3930
|
+
waterfallOpts,
|
|
3931
|
+
callbacks,
|
|
3932
|
+
workflowStep,
|
|
3933
|
+
),
|
|
3934
|
+
};
|
|
3935
|
+
for (const [key, value] of fieldEntries) {
|
|
3936
|
+
const rawCellMeta =
|
|
3937
|
+
enriched[DEEPLINE_CELL_META_FIELD] &&
|
|
3938
|
+
typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
|
|
3939
|
+
? (
|
|
3940
|
+
enriched[DEEPLINE_CELL_META_FIELD] as Record<
|
|
3941
|
+
string,
|
|
3942
|
+
unknown
|
|
3943
|
+
>
|
|
3944
|
+
)[key]
|
|
3945
|
+
: null;
|
|
3946
|
+
const reuseDecision = shouldRecomputeCell({
|
|
3947
|
+
hasValue: isCompletedWorkerFieldValue(enriched[key]),
|
|
3948
|
+
meta:
|
|
3949
|
+
rawCellMeta && typeof rawCellMeta === 'object'
|
|
3950
|
+
? (rawCellMeta as {
|
|
3951
|
+
status?: string;
|
|
3952
|
+
completedAt?: number;
|
|
3953
|
+
})
|
|
3954
|
+
: null,
|
|
3955
|
+
policy: cellPolicies?.[key],
|
|
3956
|
+
});
|
|
3957
|
+
if (reuseDecision.action === 'reuse') {
|
|
3958
|
+
cellMetaPatch[key] = {
|
|
3959
|
+
status: 'cached',
|
|
3960
|
+
stage: key,
|
|
3961
|
+
reused: true,
|
|
3962
|
+
runId: req.runId,
|
|
3963
|
+
};
|
|
3964
|
+
continue;
|
|
3965
|
+
}
|
|
3966
|
+
const resolved = await executeWorkerStepResolver(
|
|
3967
|
+
value,
|
|
3968
|
+
enriched,
|
|
3969
|
+
rowCtx,
|
|
3970
|
+
absoluteIndex,
|
|
3971
|
+
isWorkerStepProgram(value)
|
|
3972
|
+
? {
|
|
3973
|
+
parentField: key,
|
|
3974
|
+
path: [],
|
|
3975
|
+
outputs: stepProgramOutputs,
|
|
3976
|
+
}
|
|
3977
|
+
: undefined,
|
|
3978
|
+
);
|
|
3979
|
+
enriched[key] = resolved.value;
|
|
3980
|
+
fieldOutputs[key] = resolved.value;
|
|
3981
|
+
if (resolved.status === 'skipped') {
|
|
3982
|
+
cellMetaPatch[key] = {
|
|
3983
|
+
status: 'skipped',
|
|
3984
|
+
stage: key,
|
|
3985
|
+
runId: req.runId,
|
|
3986
|
+
};
|
|
3987
|
+
} else {
|
|
3988
|
+
cellMetaPatch[key] = {
|
|
3989
|
+
status: 'completed',
|
|
3990
|
+
stage: key,
|
|
3991
|
+
runId: req.runId,
|
|
3992
|
+
completedAt: nowMs(),
|
|
3993
|
+
};
|
|
3994
|
+
}
|
|
3690
3995
|
}
|
|
3691
|
-
const
|
|
3692
|
-
value
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
: undefined,
|
|
3703
|
-
);
|
|
3704
|
-
enriched[key] = resolved.value;
|
|
3705
|
-
fieldOutputs[key] = resolved.value;
|
|
3706
|
-
if (resolved.status === 'skipped') {
|
|
3707
|
-
cellMetaPatch[key] = {
|
|
3708
|
-
status: 'skipped',
|
|
3709
|
-
stage: key,
|
|
3710
|
-
runId: req.runId,
|
|
3711
|
-
};
|
|
3712
|
-
} else {
|
|
3713
|
-
cellMetaPatch[key] = {
|
|
3714
|
-
status: 'completed',
|
|
3715
|
-
stage: key,
|
|
3716
|
-
runId: req.runId,
|
|
3717
|
-
completedAt: nowMs(),
|
|
3718
|
-
};
|
|
3996
|
+
for (const stepOutput of stepProgramOutputs) {
|
|
3997
|
+
enriched[stepOutput.columnName] = stepOutput.value;
|
|
3998
|
+
fieldOutputs[stepOutput.columnName] = stepOutput.value;
|
|
3999
|
+
generatedOutputFields.add(stepOutput.columnName);
|
|
4000
|
+
if (stepOutput.status === 'skipped') {
|
|
4001
|
+
cellMetaPatch[stepOutput.columnName] = {
|
|
4002
|
+
status: 'skipped',
|
|
4003
|
+
stage: stepOutput.stepId,
|
|
4004
|
+
runId: req.runId,
|
|
4005
|
+
};
|
|
4006
|
+
}
|
|
3719
4007
|
}
|
|
3720
|
-
|
|
3721
|
-
|
|
3722
|
-
|
|
3723
|
-
|
|
3724
|
-
|
|
3725
|
-
|
|
3726
|
-
cellMetaPatch[stepOutput.columnName] = {
|
|
3727
|
-
status: 'skipped',
|
|
3728
|
-
stage: stepOutput.stepId,
|
|
3729
|
-
runId: req.runId,
|
|
3730
|
-
};
|
|
4008
|
+
for (const waterfallOutput of waterfallOutputs) {
|
|
4009
|
+
const columnName =
|
|
4010
|
+
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
4011
|
+
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
4012
|
+
enriched[columnName] = waterfallOutput.value;
|
|
4013
|
+
generatedOutputFields.add(columnName);
|
|
3731
4014
|
}
|
|
4015
|
+
executedCellMetaPatches[myIndex] =
|
|
4016
|
+
Object.keys(cellMetaPatch).length > 0
|
|
4017
|
+
? cellMetaPatch
|
|
4018
|
+
: undefined;
|
|
4019
|
+
executedRows[myIndex] = enriched as T & Record<string, unknown>;
|
|
4020
|
+
} finally {
|
|
4021
|
+
rowSlot.release();
|
|
3732
4022
|
}
|
|
3733
|
-
for (const waterfallOutput of waterfallOutputs) {
|
|
3734
|
-
const columnName =
|
|
3735
|
-
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
3736
|
-
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
3737
|
-
enriched[columnName] = waterfallOutput.value;
|
|
3738
|
-
generatedOutputFields.add(columnName);
|
|
3739
|
-
}
|
|
3740
|
-
executedCellMetaPatches[myIndex] =
|
|
3741
|
-
Object.keys(cellMetaPatch).length > 0
|
|
3742
|
-
? cellMetaPatch
|
|
3743
|
-
: undefined;
|
|
3744
|
-
executedRows[myIndex] = enriched as T & Record<string, unknown>;
|
|
3745
4023
|
}
|
|
3746
4024
|
})(),
|
|
3747
4025
|
);
|
|
@@ -4410,33 +4688,20 @@ function createMinimalWorkerCtx(
|
|
|
4410
4688
|
childPlayName: resolvedName,
|
|
4411
4689
|
input,
|
|
4412
4690
|
})}${staleRuntimeSuffix(options?.staleAfterSeconds)}`;
|
|
4413
|
-
if (ancestryPlayIds.includes(resolvedName)) {
|
|
4414
|
-
const chain = [...ancestryPlayIds, resolvedName].join(' -> ');
|
|
4415
|
-
throw new Error(`Recursive play graph detected: ${chain}`);
|
|
4416
|
-
}
|
|
4417
|
-
const nextDepth = callDepth + 1;
|
|
4418
|
-
if (nextDepth > WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth) {
|
|
4419
|
-
throw new Error(
|
|
4420
|
-
`Play-call depth exceeded (${nextDepth}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth}) while calling ${resolvedName}.`,
|
|
4421
|
-
);
|
|
4422
|
-
}
|
|
4423
|
-
const nextPlayCallCount = playCallCount + 1;
|
|
4424
|
-
if (nextPlayCallCount > WORKER_PLAY_CALL_LIMITS.maxPlayCallCount) {
|
|
4425
|
-
throw new Error(
|
|
4426
|
-
`Root play-call budget exceeded (${nextPlayCallCount}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallCount}).`,
|
|
4427
|
-
);
|
|
4428
|
-
}
|
|
4429
|
-
const nextParentCalls = (parentChildCalls[req.playName] ?? 0) + 1;
|
|
4430
|
-
if (
|
|
4431
|
-
nextParentCalls > WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent
|
|
4432
|
-
) {
|
|
4433
|
-
throw new Error(
|
|
4434
|
-
`Child play-call cap exceeded for ${req.playName} (${nextParentCalls}/${WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent}).`,
|
|
4435
|
-
);
|
|
4436
|
-
}
|
|
4437
4691
|
return await executeWithRuntimeReceipt(receiptKey, async () => {
|
|
4438
|
-
|
|
4439
|
-
|
|
4692
|
+
// The Governor owns the play-call lineage: forkChild does the cycle
|
|
4693
|
+
// guard, depth/per-parent/playCall/descendant budget charges, and
|
|
4694
|
+
// returns the snapshot to thread into the child so budgets accumulate
|
|
4695
|
+
// across isolates. Charged inside the receipt boundary so a replay
|
|
4696
|
+
// (cache hit) never double-charges.
|
|
4697
|
+
const childRunId = `${req.runId}:child:${normalizedKey}`;
|
|
4698
|
+
const childGovernance = governor.forkChild({
|
|
4699
|
+
childPlayName: resolvedName,
|
|
4700
|
+
childRunId,
|
|
4701
|
+
});
|
|
4702
|
+
const nextDepth = childGovernance.callDepth;
|
|
4703
|
+
const nextParentCalls =
|
|
4704
|
+
governor.snapshot().parentChildCalls[req.playName] ?? 0;
|
|
4440
4705
|
|
|
4441
4706
|
emitEvent({
|
|
4442
4707
|
type: 'log',
|
|
@@ -4456,31 +4721,47 @@ function createMinimalWorkerCtx(
|
|
|
4456
4721
|
const childNeedsWorkflowScheduler = childPipelineNeedsWorkflowScheduler(
|
|
4457
4722
|
childManifest.staticPipeline,
|
|
4458
4723
|
);
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4466
|
-
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4470
|
-
|
|
4471
|
-
|
|
4472
|
-
|
|
4473
|
-
|
|
4724
|
+
console.info('[play.runtime.span]', {
|
|
4725
|
+
event: 'play.runtime.span',
|
|
4726
|
+
phase: 'child_route',
|
|
4727
|
+
runId: req.runId,
|
|
4728
|
+
parentRunId: req.runId,
|
|
4729
|
+
playName: resolvedName,
|
|
4730
|
+
graphHash: req.graphHash ?? null,
|
|
4731
|
+
depth: nextDepth,
|
|
4732
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4733
|
+
childIsDatasetBacked,
|
|
4734
|
+
childNeedsWorkflowScheduler,
|
|
4735
|
+
hasStaticPipeline: Boolean(childManifest.staticPipeline),
|
|
4736
|
+
childTableNamespace:
|
|
4737
|
+
typeof childManifest.staticPipeline?.tableNamespace === 'string'
|
|
4738
|
+
? childManifest.staticPipeline.tableNamespace
|
|
4739
|
+
: null,
|
|
4740
|
+
childStageCount: Array.isArray(childManifest.staticPipeline?.stages)
|
|
4741
|
+
? childManifest.staticPipeline.stages.length
|
|
4742
|
+
: null,
|
|
4743
|
+
childSubstepCount: Array.isArray(
|
|
4744
|
+
childManifest.staticPipeline?.substeps,
|
|
4745
|
+
)
|
|
4746
|
+
? childManifest.staticPipeline.substeps.length
|
|
4747
|
+
: null,
|
|
4748
|
+
});
|
|
4749
|
+
let childPlaySlot: { release(): void } | null = null;
|
|
4474
4750
|
try {
|
|
4475
|
-
|
|
4751
|
+
childPlaySlot = await governor.acquireChildPlaySlot({
|
|
4752
|
+
signal: abortSignal,
|
|
4753
|
+
});
|
|
4476
4754
|
const childSubmitStartedAt = nowMs();
|
|
4477
4755
|
let started: {
|
|
4478
4756
|
workflowId?: string;
|
|
4479
4757
|
runId?: string;
|
|
4480
4758
|
status?: string;
|
|
4759
|
+
mode?: string;
|
|
4481
4760
|
output?: unknown;
|
|
4482
4761
|
result?: unknown;
|
|
4483
4762
|
error?: unknown;
|
|
4763
|
+
logs?: string[];
|
|
4764
|
+
timings?: Array<{ phase: string; ms: number }>;
|
|
4484
4765
|
};
|
|
4485
4766
|
try {
|
|
4486
4767
|
started = await submitChildPlayThroughCoordinator({
|
|
@@ -4507,6 +4788,17 @@ function createMinimalWorkerCtx(
|
|
|
4507
4788
|
// executor token's play name (the parent making this call).
|
|
4508
4789
|
ancestryPlayIds,
|
|
4509
4790
|
callDepth: nextDepth,
|
|
4791
|
+
// Cumulative lineage-global budget counters (incl. this
|
|
4792
|
+
// launch's play/descendant charges) so the child seeds its
|
|
4793
|
+
// budgets from the lineage total instead of resetting to 0 in
|
|
4794
|
+
// its isolate. Threading descendantCount in particular keeps
|
|
4795
|
+
// fan-out descendant accounting lineage-global, matching cjs.
|
|
4796
|
+
playCallCount: childGovernance.playCallCount,
|
|
4797
|
+
toolCallCount: childGovernance.toolCallCount,
|
|
4798
|
+
retryCount: childGovernance.retryCount,
|
|
4799
|
+
descendantCount: childGovernance.descendantCount,
|
|
4800
|
+
waterfallStepExecutions:
|
|
4801
|
+
childGovernance.waterfallStepExecutions,
|
|
4510
4802
|
description:
|
|
4511
4803
|
typeof options?.description === 'string'
|
|
4512
4804
|
? options.description
|
|
@@ -4528,6 +4820,21 @@ function createMinimalWorkerCtx(
|
|
|
4528
4820
|
status: 'failed',
|
|
4529
4821
|
errorCode: 'CHILD_SUBMIT_FAILED',
|
|
4530
4822
|
});
|
|
4823
|
+
recordRunnerPerfTrace({
|
|
4824
|
+
req,
|
|
4825
|
+
phase: 'ctx_run_play.child_submit',
|
|
4826
|
+
ms: nowMs() - childSubmitStartedAt,
|
|
4827
|
+
extra: {
|
|
4828
|
+
status: 'failed',
|
|
4829
|
+
errorCode: 'CHILD_SUBMIT_FAILED',
|
|
4830
|
+
playName: resolvedName,
|
|
4831
|
+
key: normalizedKey,
|
|
4832
|
+
depth: nextDepth,
|
|
4833
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4834
|
+
childIsDatasetBacked,
|
|
4835
|
+
childNeedsWorkflowScheduler,
|
|
4836
|
+
},
|
|
4837
|
+
});
|
|
4531
4838
|
throw error;
|
|
4532
4839
|
}
|
|
4533
4840
|
const workflowId = started.workflowId ?? started.runId;
|
|
@@ -4558,6 +4865,26 @@ function createMinimalWorkerCtx(
|
|
|
4558
4865
|
ms: nowMs() - childSubmitStartedAt,
|
|
4559
4866
|
status: 'ok',
|
|
4560
4867
|
});
|
|
4868
|
+
recordRunnerPerfTrace({
|
|
4869
|
+
req,
|
|
4870
|
+
phase: 'ctx_run_play.child_submit',
|
|
4871
|
+
ms: nowMs() - childSubmitStartedAt,
|
|
4872
|
+
extra: {
|
|
4873
|
+
status: 'ok',
|
|
4874
|
+
childRunId: workflowId,
|
|
4875
|
+
startedStatus: started.status ?? null,
|
|
4876
|
+
mode: started.mode ?? null,
|
|
4877
|
+
coordinatorTimings: Array.isArray(started.timings)
|
|
4878
|
+
? started.timings
|
|
4879
|
+
: null,
|
|
4880
|
+
playName: resolvedName,
|
|
4881
|
+
key: normalizedKey,
|
|
4882
|
+
depth: nextDepth,
|
|
4883
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4884
|
+
childIsDatasetBacked,
|
|
4885
|
+
childNeedsWorkflowScheduler,
|
|
4886
|
+
},
|
|
4887
|
+
});
|
|
4561
4888
|
const startedStatus = String(started.status ?? '').toLowerCase();
|
|
4562
4889
|
if (startedStatus === 'completed') {
|
|
4563
4890
|
emitEvent({
|
|
@@ -4580,11 +4907,16 @@ function createMinimalWorkerCtx(
|
|
|
4580
4907
|
throw new Error(startedErrorMessage);
|
|
4581
4908
|
}
|
|
4582
4909
|
const childWaitStartedAt = nowMs();
|
|
4583
|
-
let
|
|
4910
|
+
let waitResult: ChildPlayTerminalWaitResult;
|
|
4584
4911
|
try {
|
|
4585
|
-
|
|
4586
|
-
req,
|
|
4587
|
-
|
|
4912
|
+
waitResult = await awaitChildTerminal({
|
|
4913
|
+
parentRunId: req.runId,
|
|
4914
|
+
// CF's WorkflowStep.waitForEvent generic signature is wider than
|
|
4915
|
+
// the small structural shape ChildPlayAwait needs; bridge it the
|
|
4916
|
+
// same way the inline implementation did.
|
|
4917
|
+
workflowStep: workflowStep as unknown as
|
|
4918
|
+
| WorkflowStepLike
|
|
4919
|
+
| undefined,
|
|
4588
4920
|
workflowId,
|
|
4589
4921
|
playName: resolvedName,
|
|
4590
4922
|
key: normalizedKey,
|
|
@@ -4592,6 +4924,22 @@ function createMinimalWorkerCtx(
|
|
|
4592
4924
|
1_000,
|
|
4593
4925
|
Math.min(options?.timeoutMs ?? 5 * 60_000, 30 * 60_000),
|
|
4594
4926
|
),
|
|
4927
|
+
coordinator: cachedCoordinatorBinding?.readChildTerminalState
|
|
4928
|
+
? {
|
|
4929
|
+
readChildTerminalState: (
|
|
4930
|
+
parentRunId,
|
|
4931
|
+
eventKey,
|
|
4932
|
+
timeoutMs,
|
|
4933
|
+
) =>
|
|
4934
|
+
cachedCoordinatorBinding!.readChildTerminalState!(
|
|
4935
|
+
parentRunId,
|
|
4936
|
+
eventKey,
|
|
4937
|
+
timeoutMs,
|
|
4938
|
+
),
|
|
4939
|
+
}
|
|
4940
|
+
: null,
|
|
4941
|
+
now: nowMs,
|
|
4942
|
+
hashJson,
|
|
4595
4943
|
});
|
|
4596
4944
|
} catch (error) {
|
|
4597
4945
|
console.info('[play.runtime.span]', {
|
|
@@ -4608,6 +4956,22 @@ function createMinimalWorkerCtx(
|
|
|
4608
4956
|
status: 'failed',
|
|
4609
4957
|
errorCode: 'CHILD_WAIT_FAILED',
|
|
4610
4958
|
});
|
|
4959
|
+
recordRunnerPerfTrace({
|
|
4960
|
+
req,
|
|
4961
|
+
phase: 'ctx_run_play.child_wait',
|
|
4962
|
+
ms: nowMs() - childWaitStartedAt,
|
|
4963
|
+
extra: {
|
|
4964
|
+
status: 'failed',
|
|
4965
|
+
errorCode: 'CHILD_WAIT_FAILED',
|
|
4966
|
+
childRunId: workflowId,
|
|
4967
|
+
playName: resolvedName,
|
|
4968
|
+
key: normalizedKey,
|
|
4969
|
+
depth: nextDepth,
|
|
4970
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4971
|
+
childIsDatasetBacked,
|
|
4972
|
+
childNeedsWorkflowScheduler,
|
|
4973
|
+
},
|
|
4974
|
+
});
|
|
4611
4975
|
throw error;
|
|
4612
4976
|
}
|
|
4613
4977
|
console.info('[play.runtime.span]', {
|
|
@@ -4622,6 +4986,27 @@ function createMinimalWorkerCtx(
|
|
|
4622
4986
|
fanoutIndex: nextParentCalls - 1,
|
|
4623
4987
|
ms: nowMs() - childWaitStartedAt,
|
|
4624
4988
|
status: 'ok',
|
|
4989
|
+
waitSource: waitResult.source,
|
|
4990
|
+
waitAttempts: waitResult.attempts ?? null,
|
|
4991
|
+
reportedWaitMs: waitResult.waitMs,
|
|
4992
|
+
});
|
|
4993
|
+
recordRunnerPerfTrace({
|
|
4994
|
+
req,
|
|
4995
|
+
phase: 'ctx_run_play.child_wait',
|
|
4996
|
+
ms: nowMs() - childWaitStartedAt,
|
|
4997
|
+
extra: {
|
|
4998
|
+
status: 'ok',
|
|
4999
|
+
childRunId: workflowId,
|
|
5000
|
+
playName: resolvedName,
|
|
5001
|
+
key: normalizedKey,
|
|
5002
|
+
depth: nextDepth,
|
|
5003
|
+
fanoutIndex: nextParentCalls - 1,
|
|
5004
|
+
childIsDatasetBacked,
|
|
5005
|
+
childNeedsWorkflowScheduler,
|
|
5006
|
+
waitSource: waitResult.source,
|
|
5007
|
+
waitAttempts: waitResult.attempts ?? null,
|
|
5008
|
+
reportedWaitMs: waitResult.waitMs,
|
|
5009
|
+
},
|
|
4625
5010
|
});
|
|
4626
5011
|
emitEvent({
|
|
4627
5012
|
type: 'log',
|
|
@@ -4629,15 +5014,9 @@ function createMinimalWorkerCtx(
|
|
|
4629
5014
|
message: `Completed child play ${resolvedName} (${normalizedKey})`,
|
|
4630
5015
|
ts: nowMs(),
|
|
4631
5016
|
});
|
|
4632
|
-
return
|
|
5017
|
+
return waitResult.output;
|
|
4633
5018
|
} finally {
|
|
4634
|
-
|
|
4635
|
-
if (childConcurrencyAcquired) {
|
|
4636
|
-
releaseChildPlayConcurrency(
|
|
4637
|
-
inFlightChildCallsByPlayName,
|
|
4638
|
-
resolvedName,
|
|
4639
|
-
);
|
|
4640
|
-
}
|
|
5019
|
+
childPlaySlot?.release();
|
|
4641
5020
|
}
|
|
4642
5021
|
});
|
|
4643
5022
|
},
|
|
@@ -4813,6 +5192,135 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
|
|
|
4813
5192
|
});
|
|
4814
5193
|
}
|
|
4815
5194
|
|
|
5195
|
+
/**
 * HTTP handler that executes a run request inline and reports the outcome in
 * the JSON response body.
 *
 * Flow: parse the body as a RunRequest (HTTP 400 on malformed JSON), capture
 * the coordinator / runtime-API / harness bindings from `env`, probe the
 * harness, register the run as an inline child unless
 * `req.inlineChildRunRegistered` is set, then call `executeRunRequest` with
 * `persistResultDatasets: true`. Runner events are buffered into `events` and
 * per-phase wall-clock durations into `timings`; both are returned on success
 * and on failure.
 *
 * Execution failures are reported in-band: the response carries
 * `status: 'failed'` plus the error message/stack without a custom HTTP
 * status, so callers must inspect the body.
 *
 * @param request Incoming request whose JSON body is a RunRequest.
 * @param env Worker environment bindings.
 * @returns JSON response whose `status` is 'completed' or 'failed'.
 */
async function handleRunInline(
  request: Request,
  env: WorkerEnv,
): Promise<Response> {
  let req: RunRequest;
  try {
    req = (await request.json()) as RunRequest;
  } catch {
    return Response.json(
      {
        status: 'failed',
        error: { message: 'invalid JSON body' },
      },
      { status: 400 },
    );
  }

  const events: RunnerEvent[] = [];
  const timings: InlineRunTiming[] = [];
  // Record how long a phase took, measured from `phaseStartedAt` to now.
  const traceInline = (
    phase: string,
    phaseStartedAt: number,
    extra?: Record<string, unknown>,
  ): void => {
    timings.push({
      phase,
      ms: nowMs() - phaseStartedAt,
      ...(extra ? { extra } : {}),
    });
  };
  const inlineStartedAt = nowMs();
  try {
    const runPrefix = `[deepline-run:${req.runId}]`;
    captureCoordinatorBinding(env);
    captureRuntimeApiBinding(env);
    captureHarnessBinding(env);
    const probeStartedAt = nowMs();
    await probeHarnessOnce(env, runPrefix);
    traceInline('inline.probe_harness', probeStartedAt);
    if (!req.inlineChildRunRegistered) {
      const registerStartedAt = nowMs();
      await registerInlineChildRun(req);
      traceInline('inline.register_child_run', registerStartedAt);
    } else {
      // Still emit the phase so the timing list has a stable shape.
      traceInline('inline.register_child_run', nowMs(), { skipped: true });
    }
    const executeStartedAt = nowMs();
    const output = await executeRunRequest(
      req,
      env,
      (event) => {
        events.push(event);
      },
      undefined,
      {
        persistResultDatasets: true,
      },
    );
    traceInline('inline.execute_run_request', executeStartedAt, {
      durationMs: output.durationMs,
      outputRows: output.outputRows,
    });
    traceInline('inline.total', inlineStartedAt);
    return Response.json({
      status: 'completed',
      result: output.result,
      outputRows: output.outputRows,
      durationMs: output.durationMs,
      events,
      timings,
    });
  } catch (error) {
    // Anything can be thrown (Error, string abort reason, null, ...). Narrow
    // before reading properties so the handler itself never throws here and
    // the structured `failed` response is always produced.
    const thrown =
      typeof error === 'object' && error !== null
        ? (error as { message?: unknown; stack?: unknown })
        : {};
    const message =
      typeof thrown.message === 'string' ? thrown.message : String(error);
    const stack = typeof thrown.stack === 'string' ? thrown.stack : undefined;
    events.push({
      type: 'error',
      message,
      stack,
      ts: nowMs(),
    });
    return Response.json({
      status: 'failed',
      error: {
        message,
        stack,
      },
      events,
      timings,
    });
  }
}
|
|
5285
|
+
|
|
5286
|
+
/**
 * Announce this run to the runtime API as an inline child run by posting a
 * `start_inline_child_run` action.
 *
 * Artifact identifiers (`storageKey`, `artifactHash`, `graphHash`) are read
 * from `req.contractSnapshot.artifactMetadata` and sent only when they are
 * strings. The workflow family key is the inherited root run id, falling back
 * to the parent run id and then to this run's own id. `source` is forwarded
 * when it is one of 'published' | 'ad_hoc' | 'draft' and defaults to
 * 'published' otherwise.
 *
 * @param req Run request being registered.
 * @returns Resolves once the runtime API call has completed.
 */
async function registerInlineChildRun(req: RunRequest): Promise<void> {
  // Keep a value only when it is a string; anything else is omitted.
  const stringOrUndefined = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined;

  const contract = isRecord(req.contractSnapshot) ? req.contractSnapshot : {};
  const artifact = isRecord(contract.artifactMetadata)
    ? contract.artifactMetadata
    : {};
  const lineage = req.playCallGovernance;

  // Unknown or missing sources are treated as 'published'.
  const declaredSource = contract.source;
  const source =
    declaredSource === 'ad_hoc' || declaredSource === 'draft'
      ? declaredSource
      : 'published';

  await postRuntimeApi(req.baseUrl, req.executorToken, {
    action: 'start_inline_child_run',
    playName: req.playName,
    runId: req.runId,
    workflowFamilyKey: lineage?.rootRunId ?? lineage?.parentRunId ?? req.runId,
    artifactStorageKey: stringOrUndefined(artifact.storageKey),
    artifactHash: stringOrUndefined(artifact.artifactHash),
    graphHash: stringOrUndefined(artifact.graphHash),
    runtimeBackend: 'workers_edge',
    schedulerBackend: 'inline_child',
    executionProfile: 'workers_edge',
    maxCreditsPerRun: extractMaxCreditsPerRun(req.contractSnapshot),
    staticPipeline: contract.staticPipeline ?? null,
    source,
  });
}
|
|
5323
|
+
|
|
4816
5324
|
/** Cap on run log lines retained in the terminal output compatibility shape. */
|
|
4817
5325
|
const RUN_LOG_BUFFER_LIMIT = 500;
|
|
4818
5326
|
/** Min wall-clock interval between live run-ledger flushes during a run. */
|
|
@@ -5081,6 +5589,20 @@ async function executeRunRequest(
|
|
|
5081
5589
|
abortSignal,
|
|
5082
5590
|
workerCallbacks,
|
|
5083
5591
|
);
|
|
5592
|
+
// Hard wall-clock cap on active user-code runtime. CF Workflows does not
|
|
5593
|
+
// impose a play-level execution ceiling on this substrate, so without this a
|
|
5594
|
+
// runaway play (infinite loop, stuck await) would only stop when the executor
|
|
5595
|
+
// token expires. Aborting the controller surfaces cooperatively through the
|
|
5596
|
+
// same assertNotAborted checks used for harness cancellation.
|
|
5597
|
+
let runtimeLimitExceeded = false;
|
|
5598
|
+
const runtimeDeadlineTimer = setTimeout(() => {
|
|
5599
|
+
runtimeLimitExceeded = true;
|
|
5600
|
+
if (!abortSignal.aborted) {
|
|
5601
|
+
abortController.abort(
|
|
5602
|
+
`Play runtime limit exceeded after ${STANDARD_PLAY_RUNTIME_LIMIT_SECONDS}s.`,
|
|
5603
|
+
);
|
|
5604
|
+
}
|
|
5605
|
+
}, STANDARD_PLAY_RUNTIME_LIMIT_SECONDS * 1000);
|
|
5084
5606
|
try {
|
|
5085
5607
|
const playStartedAt = nowMs();
|
|
5086
5608
|
const result = await (
|
|
@@ -5102,6 +5624,33 @@ async function executeRunRequest(
|
|
|
5102
5624
|
phase: 'runner.serialize_result',
|
|
5103
5625
|
ms: nowMs() - serializeStartedAt,
|
|
5104
5626
|
});
|
|
5627
|
+
const terminalResult = trimResultForStatus(serializedResult);
|
|
5628
|
+
let parentSignalPromise: Promise<void> | null = null;
|
|
5629
|
+
const startParentTerminalSignal = (): Promise<void> => {
|
|
5630
|
+
if (!parentSignalPromise) {
|
|
5631
|
+
const parentSignalStartedAt = nowMs();
|
|
5632
|
+
parentSignalPromise = signalParentPlayTerminal({
|
|
5633
|
+
req,
|
|
5634
|
+
status: 'completed',
|
|
5635
|
+
result: terminalResult as Record<string, unknown>,
|
|
5636
|
+
})
|
|
5637
|
+
.catch((error) => {
|
|
5638
|
+
console.error(
|
|
5639
|
+
`[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
|
|
5640
|
+
error instanceof Error ? error.message : String(error)
|
|
5641
|
+
}`,
|
|
5642
|
+
);
|
|
5643
|
+
})
|
|
5644
|
+
.finally(() => {
|
|
5645
|
+
recordRunnerPerfTrace({
|
|
5646
|
+
req,
|
|
5647
|
+
phase: 'runner.parent_terminal_signal',
|
|
5648
|
+
ms: nowMs() - parentSignalStartedAt,
|
|
5649
|
+
});
|
|
5650
|
+
});
|
|
5651
|
+
}
|
|
5652
|
+
return parentSignalPromise;
|
|
5653
|
+
};
|
|
5105
5654
|
if (options?.persistResultDatasets) {
|
|
5106
5655
|
const ledgerFlushWaitStartedAt = nowMs();
|
|
5107
5656
|
await ledgerFlushInFlight;
|
|
@@ -5117,7 +5666,7 @@ async function executeRunRequest(
|
|
|
5117
5666
|
phase: 'runner.persist_result_datasets',
|
|
5118
5667
|
ms: nowMs() - resultDatasetStartedAt,
|
|
5119
5668
|
});
|
|
5120
|
-
const
|
|
5669
|
+
const parentSignal = startParentTerminalSignal();
|
|
5121
5670
|
const terminalOccurredAt = nowMs();
|
|
5122
5671
|
const terminalUpdateStartedAt = nowMs();
|
|
5123
5672
|
await flushTerminalLedgerEvents({
|
|
@@ -5161,24 +5710,9 @@ async function executeRunRequest(
|
|
|
5161
5710
|
await nonBlockingBillingPromise;
|
|
5162
5711
|
}
|
|
5163
5712
|
}
|
|
5713
|
+
await parentSignal;
|
|
5164
5714
|
}
|
|
5165
|
-
|
|
5166
|
-
await signalParentPlayTerminal({
|
|
5167
|
-
req,
|
|
5168
|
-
status: 'completed',
|
|
5169
|
-
result: trimResultForStatus(serializedResult) as Record<string, unknown>,
|
|
5170
|
-
}).catch((error) => {
|
|
5171
|
-
console.error(
|
|
5172
|
-
`[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
|
|
5173
|
-
error instanceof Error ? error.message : String(error)
|
|
5174
|
-
}`,
|
|
5175
|
-
);
|
|
5176
|
-
});
|
|
5177
|
-
recordRunnerPerfTrace({
|
|
5178
|
-
req,
|
|
5179
|
-
phase: 'runner.parent_terminal_signal',
|
|
5180
|
-
ms: nowMs() - parentSignalStartedAt,
|
|
5181
|
-
});
|
|
5715
|
+
await startParentTerminalSignal();
|
|
5182
5716
|
recordRunnerPerfTrace({
|
|
5183
5717
|
req,
|
|
5184
5718
|
phase: 'runner.execute_total',
|
|
@@ -5194,7 +5728,10 @@ async function executeRunRequest(
|
|
|
5194
5728
|
};
|
|
5195
5729
|
} catch (error) {
|
|
5196
5730
|
stepLifecycle?.markStartedFailed(nowMs());
|
|
5197
|
-
|
|
5731
|
+
// A runtime-limit abort is a timeout failure, not a user cancellation, so
|
|
5732
|
+
// it should be reported as run.failed with the limit message rather than
|
|
5733
|
+
// run.cancelled.
|
|
5734
|
+
const aborted = isAbortLikeError(error) && !runtimeLimitExceeded;
|
|
5198
5735
|
if (aborted) {
|
|
5199
5736
|
// Flip the controller so any concurrent user code observes the abort
|
|
5200
5737
|
// through ctx.signal. We mark the run cancelled instead of failed.
|
|
@@ -5253,6 +5790,8 @@ async function executeRunRequest(
|
|
|
5253
5790
|
error: message,
|
|
5254
5791
|
}).catch(() => null);
|
|
5255
5792
|
throw error;
|
|
5793
|
+
} finally {
|
|
5794
|
+
clearTimeout(runtimeDeadlineTimer);
|
|
5256
5795
|
}
|
|
5257
5796
|
}
|
|
5258
5797
|
|
|
@@ -5851,6 +6390,9 @@ const workerEntrypoint = {
|
|
|
5851
6390
|
},
|
|
5852
6391
|
});
|
|
5853
6392
|
}
|
|
6393
|
+
if (request.method === 'POST' && url.pathname === '/run-inline') {
|
|
6394
|
+
return handleRunInline(request, env);
|
|
6395
|
+
}
|
|
5854
6396
|
if (request.method === 'POST' && url.pathname === '/run') {
|
|
5855
6397
|
return handleRun(request, env);
|
|
5856
6398
|
}
|