deepline 0.1.79 → 0.1.81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli/index.js +76 -42
- package/dist/cli/index.mjs +76 -42
- package/dist/index.d.mts +9 -1
- package/dist/index.d.ts +9 -1
- package/dist/index.js +13 -10
- package/dist/index.mjs +13 -10
- package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1103 -1617
- package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +506 -654
- package/dist/repo/apps/play-runner-workers/src/entry.ts +1148 -598
- package/dist/repo/apps/play-runner-workers/src/runtime/tool-http-errors.ts +43 -1
- package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +8 -2
- package/dist/repo/sdk/src/client.ts +15 -8
- package/dist/repo/sdk/src/release.ts +2 -2
- package/dist/repo/sdk/src/types.ts +5 -0
- package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
- package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
- package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
- package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
- package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
- package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
- package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
- package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
- package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
- package/dist/repo/shared_libs/plays/static-pipeline.ts +56 -3
- package/dist/repo/shared_libs/temporal/constants.ts +38 -0
- package/package.json +1 -1
- package/dist/repo/shared_libs/play-runtime/tool-batch-executor.ts +0 -149
|
@@ -44,11 +44,23 @@ import {
|
|
|
44
44
|
type ChunkExecutionResult,
|
|
45
45
|
} from '../../../shared_libs/play-runtime/batch-runtime';
|
|
46
46
|
import { getDefaultPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/default-batch-strategies';
|
|
47
|
-
import
|
|
47
|
+
import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
|
|
48
|
+
import {
|
|
49
|
+
createPlayExecutionGovernor,
|
|
50
|
+
type GovernanceSnapshot,
|
|
51
|
+
type PlayExecutionGovernor,
|
|
52
|
+
} from '../../../shared_libs/play-runtime/governor/governor';
|
|
48
53
|
import {
|
|
49
|
-
|
|
50
|
-
type
|
|
51
|
-
} from '../../../shared_libs/play-runtime/
|
|
54
|
+
CoordinatorRateStateBackend,
|
|
55
|
+
type CoordinatorRatePort,
|
|
56
|
+
} from '../../../shared_libs/play-runtime/governor/coordinator-rate-state-backend';
|
|
57
|
+
import type { PacingRule } from '../../../shared_libs/play-runtime/governor/rate-state-backend';
|
|
58
|
+
import {
|
|
59
|
+
awaitChildTerminal,
|
|
60
|
+
type ChildPlayTerminalWaitResult,
|
|
61
|
+
type WorkflowStepLike,
|
|
62
|
+
} from './child-play-await';
|
|
63
|
+
import type { AnyBatchOperationStrategy } from '../../../shared_libs/play-runtime/batching-types';
|
|
52
64
|
import {
|
|
53
65
|
adaptV2ExecuteResponseToToolResult,
|
|
54
66
|
createToolExecuteResult,
|
|
@@ -121,7 +133,6 @@ import {
|
|
|
121
133
|
import { createHarnessWorkerReceiptStore } from './runtime/harness-receipt-store';
|
|
122
134
|
import {
|
|
123
135
|
applyCsvRenameProjection,
|
|
124
|
-
stripCsvProjectedFields,
|
|
125
136
|
stripCsvProjectionMetadata,
|
|
126
137
|
cloneCsvAliasedRow,
|
|
127
138
|
type CsvRenameOptions,
|
|
@@ -146,7 +157,6 @@ import type {
|
|
|
146
157
|
LiveNodeProgressSnapshot,
|
|
147
158
|
} from './runtime/live-progress';
|
|
148
159
|
import {
|
|
149
|
-
ToolHttpError,
|
|
150
160
|
extractErrorBilling,
|
|
151
161
|
isHardBillingToolHttpError,
|
|
152
162
|
normalizeToolHttpErrorMessage,
|
|
@@ -201,6 +211,8 @@ type RunRequest = {
|
|
|
201
211
|
/** Internal ctx.runPlay lineage. Public SDK/users never see this. */
|
|
202
212
|
playCallGovernance?: PlayCallGovernanceSnapshot | null;
|
|
203
213
|
preloadedDbSessions?: PreloadedRuntimeDbSession[] | null;
|
|
214
|
+
/** Coordinator already created the child run row before invoking /run-inline. */
|
|
215
|
+
inlineChildRunRegistered?: boolean | null;
|
|
204
216
|
/** Cloudflare coordinator URL for direct Workflow control-plane signals. */
|
|
205
217
|
coordinatorUrl?: string | null;
|
|
206
218
|
/** Request-scoped coordinator auth token for preview/dev direct control calls. */
|
|
@@ -253,11 +265,21 @@ type WorkerEnv = {
|
|
|
253
265
|
* `/api/v2/plays/runtime-tools/*`) skip the public callback URL and route
|
|
254
266
|
* directly through the coordinator's process to the configured app — saves
|
|
255
267
|
* the *.workers.dev → CF edge → cloudflared → localhost chain on every
|
|
256
|
-
* runtime callback.
|
|
257
|
-
*
|
|
268
|
+
* runtime callback. Required for workers_edge; missing binding is an infra
|
|
269
|
+
* error instead of a transport fallback.
|
|
258
270
|
*/
|
|
259
271
|
RUNTIME_API?: {
|
|
260
|
-
|
|
272
|
+
runtimeApiCall(input: {
|
|
273
|
+
executorToken: string;
|
|
274
|
+
path: string;
|
|
275
|
+
body: unknown;
|
|
276
|
+
headers?: Record<string, string>;
|
|
277
|
+
timeoutMs?: number;
|
|
278
|
+
}): Promise<{
|
|
279
|
+
status: number;
|
|
280
|
+
headers?: Record<string, string>;
|
|
281
|
+
body: string;
|
|
282
|
+
}>;
|
|
261
283
|
};
|
|
262
284
|
/**
|
|
263
285
|
* Loopback RPC binding into the coordinator Worker. Used for CF-to-CF
|
|
@@ -280,6 +302,20 @@ type WorkerEnv = {
|
|
|
280
302
|
logs?: string[];
|
|
281
303
|
timings?: Array<{ phase: string; ms: number }>;
|
|
282
304
|
}>;
|
|
305
|
+
submitWorkflowChild?(
|
|
306
|
+
parentRunId: string,
|
|
307
|
+
body: Record<string, unknown>,
|
|
308
|
+
): Promise<{
|
|
309
|
+
workflowId?: string;
|
|
310
|
+
runId?: string;
|
|
311
|
+
status?: string;
|
|
312
|
+
mode?: string;
|
|
313
|
+
output?: unknown;
|
|
314
|
+
result?: unknown;
|
|
315
|
+
error?: unknown;
|
|
316
|
+
logs?: string[];
|
|
317
|
+
timings?: Array<{ phase: string; ms: number }>;
|
|
318
|
+
}>;
|
|
283
319
|
signal(
|
|
284
320
|
runId: string,
|
|
285
321
|
body: Record<string, unknown>,
|
|
@@ -292,6 +328,26 @@ type WorkerEnv = {
|
|
|
292
328
|
runId: string,
|
|
293
329
|
event: Record<string, unknown>,
|
|
294
330
|
): Promise<void>;
|
|
331
|
+
readTerminalState?(runId: string): Promise<Record<string, unknown> | null>;
|
|
332
|
+
readChildTerminalState?(
|
|
333
|
+
parentRunId: string,
|
|
334
|
+
eventKey: string,
|
|
335
|
+
timeoutMs?: number,
|
|
336
|
+
): Promise<Record<string, unknown> | null>;
|
|
337
|
+
/**
|
|
338
|
+
* Distributed Rate State Backend RPC. Routes to the per-(org,provider)
|
|
339
|
+
* rate-bucket Durable Object so the request window is global across
|
|
340
|
+
* isolates. See CoordinatorRateStateBackend + dedup-do.ts.
|
|
341
|
+
*/
|
|
342
|
+
rateAcquire?(input: {
|
|
343
|
+
bucketId: string;
|
|
344
|
+
rules: PacingRule[];
|
|
345
|
+
requested: number;
|
|
346
|
+
}): Promise<{ granted: number; waitMs: number }>;
|
|
347
|
+
ratePenalize?(input: {
|
|
348
|
+
bucketId: string;
|
|
349
|
+
cooldownMs: number;
|
|
350
|
+
}): Promise<void>;
|
|
295
351
|
};
|
|
296
352
|
/**
|
|
297
353
|
* Required service binding to the long-lived Play Harness Worker
|
|
@@ -379,10 +435,9 @@ async function probeHarnessOnce(
|
|
|
379
435
|
}
|
|
380
436
|
}
|
|
381
437
|
/**
|
|
382
|
-
* Routes runtime API requests through the in-process RUNTIME_API
|
|
383
|
-
*
|
|
384
|
-
*
|
|
385
|
-
* fetch transport so the play still reaches the same authenticated handler.
|
|
438
|
+
* Routes runtime API requests through the in-process RUNTIME_API service
|
|
439
|
+
* binding. workers_edge treats a missing binding as infrastructure failure
|
|
440
|
+
* instead of falling back to public HTTP.
|
|
386
441
|
*/
|
|
387
442
|
const RUNTIME_API_TIMEOUT_MS = 30_000;
|
|
388
443
|
const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
|
|
@@ -390,7 +445,6 @@ const RUNTIME_API_INTEGRATION_EXECUTE_TIMEOUT_MS = 180_000;
|
|
|
390
445
|
const RUNTIME_API_RETRY_DELAYS_MS = [
|
|
391
446
|
250, 750, 1500, 3000, 5000, 10000,
|
|
392
447
|
] as const;
|
|
393
|
-
let loggedMissingRuntimeApiBinding = false;
|
|
394
448
|
|
|
395
449
|
async function fetchRuntimeApi(
|
|
396
450
|
baseUrl: string,
|
|
@@ -418,37 +472,25 @@ async function fetchRuntimeApi(
|
|
|
418
472
|
try {
|
|
419
473
|
const mergedInit: RequestInit = {
|
|
420
474
|
...init,
|
|
421
|
-
headers: runtimeApiHeaders(init.headers,
|
|
475
|
+
headers: runtimeApiHeaders(init.headers, false),
|
|
422
476
|
signal: controller.signal,
|
|
423
477
|
};
|
|
424
478
|
if (!cachedRuntimeApiBinding) {
|
|
425
|
-
|
|
426
|
-
loggedMissingRuntimeApiBinding = true;
|
|
427
|
-
console.warn(
|
|
428
|
-
`[play-harness] RUNTIME_API binding missing; using public runtime API transport. path=${path}`,
|
|
429
|
-
);
|
|
430
|
-
}
|
|
431
|
-
return await Promise.race([
|
|
432
|
-
fetch(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
|
|
433
|
-
timeoutPromise,
|
|
434
|
-
]);
|
|
479
|
+
throw new Error('[play-harness] RUNTIME_API service binding is required');
|
|
435
480
|
}
|
|
436
|
-
const responsePromise =
|
|
437
|
-
|
|
481
|
+
const responsePromise = callRuntimeApiRpcBinding(
|
|
482
|
+
cachedRuntimeApiBinding,
|
|
483
|
+
mergedInit,
|
|
484
|
+
{
|
|
485
|
+
path,
|
|
486
|
+
timeoutMs,
|
|
487
|
+
},
|
|
438
488
|
);
|
|
439
489
|
const response = await Promise.race([responsePromise, timeoutPromise]);
|
|
440
|
-
if (await
|
|
441
|
-
|
|
442
|
-
`[play-harness] RUNTIME_API binding
|
|
490
|
+
if (await isRuntimeApiBindingNotFoundResponse(response)) {
|
|
491
|
+
throw new Error(
|
|
492
|
+
`[play-harness] RUNTIME_API service binding could not route ${path}; coordinator returned not found.`,
|
|
443
493
|
);
|
|
444
|
-
return await Promise.race([
|
|
445
|
-
fetch(`${baseUrl.replace(/\/$/, '')}${path}`, {
|
|
446
|
-
...init,
|
|
447
|
-
headers: runtimeApiHeaders(init.headers, true),
|
|
448
|
-
signal: controller.signal,
|
|
449
|
-
}),
|
|
450
|
-
timeoutPromise,
|
|
451
|
-
]);
|
|
452
494
|
}
|
|
453
495
|
return response;
|
|
454
496
|
} catch (err) {
|
|
@@ -463,7 +505,33 @@ async function fetchRuntimeApi(
|
|
|
463
505
|
}
|
|
464
506
|
}
|
|
465
507
|
|
|
466
|
-
async function
|
|
508
|
+
/**
 * HTTP statuses for which the Fetch `Response` constructor rejects any
 * non-null body (it throws a TypeError, even for an empty string).
 */
const RUNTIME_API_NULL_BODY_STATUSES: ReadonlySet<number> = new Set([
  204, 205, 304,
]);

/**
 * Performs one runtime API call through the RUNTIME_API service binding and
 * re-wraps the RPC result as a standard `Response`.
 *
 * @param binding - The RUNTIME_API binding exposing `runtimeApiCall`.
 * @param init - Fetch-style init. Only `headers` and `body` are read here.
 * @param input - Target `path` and the `timeoutMs` forwarded to the binding.
 * @returns A `Response` carrying the status, headers and body text returned
 *   by the binding.
 * @throws SyntaxError when `init.body` is a non-empty string that is not
 *   valid JSON; also propagates any rejection from `runtimeApiCall`.
 */
async function callRuntimeApiRpcBinding(
  binding: NonNullable<WorkerEnv['RUNTIME_API']>,
  init: RequestInit,
  input: { path: string; timeoutMs: number },
): Promise<Response> {
  const requestHeaders = new Headers(init.headers);
  const authorization = requestHeaders.get('authorization') ?? '';

  // Only the two execute-contract headers are forwarded over RPC; every other
  // request header is dropped on purpose.
  const headers: Record<string, string> = {};
  const metadata = requestHeaders.get(EXECUTE_TOOL_METADATA_HEADER);
  if (metadata) headers[EXECUTE_TOOL_METADATA_HEADER] = metadata;
  const contract = requestHeaders.get(EXECUTE_RESPONSE_CONTRACT_HEADER);
  if (contract) headers[EXECUTE_RESPONSE_CONTRACT_HEADER] = contract;

  // NOTE(review): a non-string body (stream, ArrayBuffer, FormData, ...) is
  // treated as empty and sent as `{}`. Visible callers serialize JSON strings;
  // confirm no caller relies on another body type.
  const rawBody = typeof init.body === 'string' ? init.body : '';

  const result = await binding.runtimeApiCall({
    // The binding takes the bare token, so strip the `Bearer ` scheme prefix.
    executorToken: authorization.replace(/^Bearer\s+/i, '').trim(),
    path: input.path,
    body: rawBody ? JSON.parse(rawBody) : {},
    headers,
    timeoutMs: input.timeoutMs,
  });

  // The binding always returns `body` as a string, but `new Response('', {
  // status: 204 })` throws. Use a null body for null-body statuses so a
  // legitimate no-content reply is not turned into a transport failure.
  const responseBody = RUNTIME_API_NULL_BODY_STATUSES.has(result.status)
    ? null
    : result.body;

  return new Response(responseBody, {
    status: result.status,
    headers: result.headers ?? {},
  });
}
|
|
533
|
+
|
|
534
|
+
async function isRuntimeApiBindingNotFoundResponse(
|
|
467
535
|
response: Response,
|
|
468
536
|
): Promise<boolean> {
|
|
469
537
|
if (response.status !== 404) {
|
|
@@ -494,13 +562,6 @@ function cachedVercelProtectionBypassToken(): string | null {
|
|
|
494
562
|
return cachedRuntimeApiVercelBypassToken;
|
|
495
563
|
}
|
|
496
564
|
|
|
497
|
-
const WORKER_PLAY_CALL_LIMITS = {
|
|
498
|
-
maxPlayCallDepth: 6,
|
|
499
|
-
maxPlayCallCount: 1_000,
|
|
500
|
-
maxChildPlayCallsPerParent: 1_000,
|
|
501
|
-
maxConcurrentPlayCalls: 16,
|
|
502
|
-
};
|
|
503
|
-
|
|
504
565
|
type RunnerEvent =
|
|
505
566
|
| {
|
|
506
567
|
type: 'log';
|
|
@@ -520,10 +581,17 @@ type WorkflowRunOutput = {
|
|
|
520
581
|
durationMs: number;
|
|
521
582
|
};
|
|
522
583
|
|
|
584
|
+
/**
 * One timing sample recorded for an inline run.
 *
 * NOTE(review): `ms` is presumably a duration in milliseconds, matching the
 * `timings?: Array<{ phase: string; ms: number }>` shape used by the
 * coordinator binding — confirm against the producer.
 */
type InlineRunTiming = {
  /** Label of the measured phase. */
  phase: string;
  /** Measured value for the phase. */
  ms: number;
  /** Optional free-form details attached to the sample. */
  extra?: Record<string, unknown>;
};
|
|
589
|
+
|
|
523
590
|
type WorkerCtxCallbacks = {
|
|
524
591
|
onNodeProgress?: (input: {
|
|
525
592
|
nodeId: string;
|
|
526
593
|
progress: LiveNodeProgressSnapshot;
|
|
594
|
+
forceFlush?: boolean;
|
|
527
595
|
}) => void;
|
|
528
596
|
onMapStarted?: (nodeId: string, at?: number) => void;
|
|
529
597
|
onMapCompleted?: (nodeId: string, at?: number) => void;
|
|
@@ -612,12 +680,17 @@ function makeRequestId(): string {
|
|
|
612
680
|
}
|
|
613
681
|
|
|
614
682
|
/**
 * Builds the row shape exposed to play code for a CSV input row.
 *
 * The row is first passed through `stripCsvProjectionMetadata`. Every own
 * property of that result (string or symbol keyed, enumerable or not) is then
 * copied by descriptor onto a fresh object, except string keys that start
 * with `__deepline`.
 *
 * @param row - Row as carried internally by the runtime.
 * @returns A new object holding only the non-`__deepline` properties, with
 *   the original property descriptors preserved.
 */
function publicCsvInputRow<T extends Record<string, unknown>>(row: T): T {
  const source = stripCsvProjectionMetadata(row) as Record<string, unknown>;
  const isRuntimeKey = (key: string | symbol): boolean =>
    typeof key === 'string' && key.startsWith('__deepline');

  const visible: Record<string, unknown> = {};
  Reflect.ownKeys(source)
    .filter((key) => !isRuntimeKey(key))
    .forEach((key) => {
      // Copy the descriptor itself so accessors and non-enumerable
      // properties keep their exact shape on the public row.
      const property = Object.getOwnPropertyDescriptor(source, key);
      if (property) {
        Object.defineProperty(visible, key, property);
      }
    });
  return visible as T;
}
|
|
622
695
|
|
|
623
696
|
function publicCsvOutputRow<T extends Record<string, unknown>>(row: T): T {
|
|
@@ -634,6 +707,27 @@ function publicCsvOutputRow<T extends Record<string, unknown>>(row: T): T {
|
|
|
634
707
|
return publicRow as T;
|
|
635
708
|
}
|
|
636
709
|
|
|
710
|
+
/**
 * Builds the row shape that is persisted for a CSV row.
 *
 * Starts from `publicCsvInputRow(row)` and flattens every own string-keyed
 * property into a plain data property: data descriptors contribute their
 * stored value, accessor descriptors are read once. Symbol keys are dropped.
 * The runtime fields `__deeplineRowKey` and `__deeplineCellMetaPatch` are then
 * copied back from the original `row` when it owns them.
 *
 * @param row - Row as carried internally by the runtime.
 * @returns A new plain object suitable for storage.
 */
function publicCsvStorageRow<T extends Record<string, unknown>>(row: T): T {
  const visible = publicCsvInputRow(row) as Record<string, unknown>;
  const flattened: Record<string, unknown> = {};

  // getOwnPropertyNames yields exactly the own string keys (including
  // non-enumerable ones) in the same order as Reflect.ownKeys.
  Object.getOwnPropertyNames(visible).forEach((name) => {
    const property = Object.getOwnPropertyDescriptor(visible, name);
    if (property === undefined) return;
    if ('value' in property) {
      flattened[name] = property.value;
    } else {
      flattened[name] = visible[name];
    }
  });

  // These two runtime fields are stripped from the public row but are
  // restored here from the original row.
  const preservedRuntimeFields = ['__deeplineRowKey', '__deeplineCellMetaPatch'];
  for (const runtimeField of preservedRuntimeFields) {
    if (!Object.prototype.hasOwnProperty.call(row, runtimeField)) continue;
    flattened[runtimeField] = row[runtimeField];
  }
  return flattened as T;
}
|
|
730
|
+
|
|
637
731
|
/**
|
|
638
732
|
* Strip credentials and JWT-shaped tokens from any string before it lands in
|
|
639
733
|
* a log buffer or upstream error message. The harness routinely echoes
|
|
@@ -658,9 +752,8 @@ async function postRuntimeApi<T>(
|
|
|
658
752
|
executorToken: string,
|
|
659
753
|
body: unknown,
|
|
660
754
|
): Promise<T> {
|
|
661
|
-
// Routes through the in-process RUNTIME_API binding
|
|
662
|
-
//
|
|
663
|
-
// hits the same handler with the same auth — only the transport changes.
|
|
755
|
+
// Routes through the in-process RUNTIME_API service binding. Missing binding
|
|
756
|
+
// is an infra error in workers_edge, not a reason to fall back to public HTTP.
|
|
664
757
|
const serializedBody = JSON.stringify(body);
|
|
665
758
|
let lastError: unknown = null;
|
|
666
759
|
for (
|
|
@@ -797,6 +890,15 @@ async function submitChildPlayThroughCoordinator(input: {
|
|
|
797
890
|
}
|
|
798
891
|
return cachedCoordinatorBinding.submitChild(input.req.runId, input.body);
|
|
799
892
|
}
|
|
893
|
+
if (cachedCoordinatorBinding?.submitWorkflowChild) {
|
|
894
|
+
if (!isRecord(input.body)) {
|
|
895
|
+
throw new Error('ctx.runPlay child submit requires an object body.');
|
|
896
|
+
}
|
|
897
|
+
return cachedCoordinatorBinding.submitWorkflowChild(
|
|
898
|
+
input.req.runId,
|
|
899
|
+
input.body,
|
|
900
|
+
);
|
|
901
|
+
}
|
|
800
902
|
const coordinatorUrl = input.req.coordinatorUrl?.trim();
|
|
801
903
|
if (coordinatorUrl) {
|
|
802
904
|
// Keep child plays on the same coordinator/Workflow submit path as
|
|
@@ -924,46 +1026,6 @@ function workflowTimeoutFromMs(timeoutMs: number): string {
|
|
|
924
1026
|
return `${seconds} second${seconds === 1 ? '' : 's'}`;
|
|
925
1027
|
}
|
|
926
1028
|
|
|
927
|
-
async function waitForChildPlayTerminalEvent(input: {
|
|
928
|
-
req: RunRequest;
|
|
929
|
-
workflowStep?: WorkflowStep;
|
|
930
|
-
workflowId: string;
|
|
931
|
-
playName: string;
|
|
932
|
-
key: string;
|
|
933
|
-
timeoutMs: number;
|
|
934
|
-
}): Promise<unknown> {
|
|
935
|
-
if (!input.workflowStep) {
|
|
936
|
-
throw new Error(
|
|
937
|
-
'ctx.runPlay child waits require the cf-workflows runtime event scheduler.',
|
|
938
|
-
);
|
|
939
|
-
}
|
|
940
|
-
const eventKey = await childPlayEventKey({
|
|
941
|
-
key: input.key,
|
|
942
|
-
workflowId: input.workflowId,
|
|
943
|
-
});
|
|
944
|
-
const event = (await (
|
|
945
|
-
input.workflowStep.waitForEvent as unknown as (
|
|
946
|
-
name: string,
|
|
947
|
-
options: { type: string; timeout: string },
|
|
948
|
-
) => Promise<{ payload: unknown }>
|
|
949
|
-
)(`child_play_terminal:${eventKey}`, {
|
|
950
|
-
type: integrationEventType(eventKey),
|
|
951
|
-
timeout: workflowTimeoutFromMs(input.timeoutMs),
|
|
952
|
-
})) as { payload: unknown };
|
|
953
|
-
const rawPayload = isRecord(event.payload) ? event.payload : {};
|
|
954
|
-
const payload = isRecord(rawPayload.data) ? rawPayload.data : rawPayload;
|
|
955
|
-
const status = String(payload.status ?? '').toLowerCase();
|
|
956
|
-
if (status === 'completed') {
|
|
957
|
-
return extractChildPlayOutput(payload);
|
|
958
|
-
}
|
|
959
|
-
const error = isRecord(payload.error) ? payload.error : null;
|
|
960
|
-
const message =
|
|
961
|
-
(typeof error?.message === 'string' && error.message.trim()) ||
|
|
962
|
-
(typeof payload.error === 'string' && payload.error.trim()) ||
|
|
963
|
-
`Child play ${input.playName} (${input.workflowId}) finished with status ${status || 'unknown'}.`;
|
|
964
|
-
throw new Error(message);
|
|
965
|
-
}
|
|
966
|
-
|
|
967
1029
|
async function signalParentPlayTerminal(input: {
|
|
968
1030
|
req: RunRequest;
|
|
969
1031
|
status: 'completed' | 'failed' | 'cancelled';
|
|
@@ -1045,6 +1107,8 @@ async function executeTool(
|
|
|
1045
1107
|
req: RunRequest,
|
|
1046
1108
|
args: { id: string; toolId: string; input: Record<string, unknown> },
|
|
1047
1109
|
workflowStep?: WorkflowStep,
|
|
1110
|
+
onProviderBackpressure?: (retryAfterMs: number) => void,
|
|
1111
|
+
onRetryAttempt?: () => void,
|
|
1048
1112
|
): Promise<ToolExecuteResult> {
|
|
1049
1113
|
if (args.toolId === 'test_wait_for_event' && workflowStep) {
|
|
1050
1114
|
const result = await waitForSyntheticIntegrationEvent(
|
|
@@ -1059,7 +1123,7 @@ async function executeTool(
|
|
|
1059
1123
|
// service bindings, NOT through HTTP from this worker. Removing the
|
|
1060
1124
|
// dispatcher-side coordinatorUrl plumbing intentionally turns the old
|
|
1061
1125
|
// HTTP-based dedup helpers into dead code.
|
|
1062
|
-
return callToolDirect(req, args);
|
|
1126
|
+
return callToolDirect(req, args, onProviderBackpressure, onRetryAttempt);
|
|
1063
1127
|
}
|
|
1064
1128
|
|
|
1065
1129
|
async function executeToolWithLifecycle(
|
|
@@ -1193,50 +1257,20 @@ async function waitForSyntheticIntegrationEvent(
|
|
|
1193
1257
|
async function callToolDirect(
|
|
1194
1258
|
req: RunRequest,
|
|
1195
1259
|
args: { id: string; toolId: string; input: Record<string, unknown> },
|
|
1260
|
+
onProviderBackpressure?: (retryAfterMs: number) => void,
|
|
1261
|
+
// Invoked once per in-process retry attempt (429 / retryable 5xx / synthetic
|
|
1262
|
+
// transient) so the Governor charges chargeBudget('retry') per attempt — the
|
|
1263
|
+
// same runaway guard the cjs runner applies (context.ts charges retry on each
|
|
1264
|
+
// 429 / transient-5xx retry). Without this the worker substrate would leave
|
|
1265
|
+
// policy.budgets.maxRetryCount effectively unenforced.
|
|
1266
|
+
onRetryAttempt?: () => void,
|
|
1196
1267
|
): Promise<ToolExecuteResult> {
|
|
1197
1268
|
const { id, toolId, input } = args;
|
|
1198
|
-
if (toolId === 'test_rate_limit') {
|
|
1199
|
-
return wrapWorkerToolResult(
|
|
1200
|
-
toolId,
|
|
1201
|
-
executeSyntheticTestRateLimit(input),
|
|
1202
|
-
syntheticToolMetadata(toolId),
|
|
1203
|
-
);
|
|
1204
|
-
}
|
|
1205
|
-
if (toolId === 'test_batch_rate_limit') {
|
|
1206
|
-
return wrapWorkerToolResult(
|
|
1207
|
-
toolId,
|
|
1208
|
-
await executeSyntheticTestRateLimitBatch(req, input),
|
|
1209
|
-
syntheticToolMetadata(toolId),
|
|
1210
|
-
);
|
|
1211
|
-
}
|
|
1212
1269
|
const path = `/api/v2/integrations/${encodeURIComponent(toolId)}/execute`;
|
|
1213
1270
|
const maxAttempts = 3;
|
|
1214
1271
|
let lastError: Error | null = null;
|
|
1215
1272
|
|
|
1216
1273
|
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
|
|
1217
|
-
if (toolId === 'test_transient_500' || toolId === 'test_transient_429') {
|
|
1218
|
-
const syntheticResult = executeSyntheticTransientRetry(
|
|
1219
|
-
toolId,
|
|
1220
|
-
input,
|
|
1221
|
-
attempt,
|
|
1222
|
-
);
|
|
1223
|
-
if (syntheticResult.ok) {
|
|
1224
|
-
return wrapWorkerToolResult(
|
|
1225
|
-
toolId,
|
|
1226
|
-
syntheticResult.result,
|
|
1227
|
-
syntheticToolMetadata(toolId),
|
|
1228
|
-
);
|
|
1229
|
-
}
|
|
1230
|
-
lastError = new Error(
|
|
1231
|
-
`tool ${toolId} ${syntheticResult.status} attempt ${attempt}/${maxAttempts}: ${syntheticResult.message}`,
|
|
1232
|
-
);
|
|
1233
|
-
if (attempt >= maxAttempts) {
|
|
1234
|
-
throw lastError;
|
|
1235
|
-
}
|
|
1236
|
-
await new Promise((resolve) => setTimeout(resolve, 1_000));
|
|
1237
|
-
continue;
|
|
1238
|
-
}
|
|
1239
|
-
|
|
1240
1274
|
const res = await fetchRuntimeApi(req.baseUrl, path, {
|
|
1241
1275
|
method: 'POST',
|
|
1242
1276
|
headers: {
|
|
@@ -1273,17 +1307,26 @@ async function callToolDirect(
|
|
|
1273
1307
|
maxAttempts,
|
|
1274
1308
|
bodyText: text,
|
|
1275
1309
|
});
|
|
1310
|
+
const retryAfterSeconds = Number(res.headers.get('retry-after'));
|
|
1311
|
+
const retryAfterMs =
|
|
1312
|
+
Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
|
|
1313
|
+
? Math.ceil(retryAfterSeconds * 1000)
|
|
1314
|
+
: 0;
|
|
1315
|
+
if (res.status === 429) {
|
|
1316
|
+
// Feed the provider's backpressure into the shared pacer even on the
|
|
1317
|
+
// final attempt so the (org, provider) bucket backs off across isolates.
|
|
1318
|
+
onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
|
|
1319
|
+
}
|
|
1276
1320
|
const retryable =
|
|
1277
1321
|
(res.status === 429 && !isHardBillingToolHttpError(lastError)) ||
|
|
1278
1322
|
(res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
|
|
1279
1323
|
if (!retryable || attempt >= maxAttempts) {
|
|
1280
1324
|
throw lastError;
|
|
1281
1325
|
}
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
: 1_000;
|
|
1326
|
+
// Charge the retry budget per attempt, matching the cjs runner's
|
|
1327
|
+
// chargeBudget('retry') on every 429 / retryable-5xx retry.
|
|
1328
|
+
onRetryAttempt?.();
|
|
1329
|
+
const delayMs = retryAfterMs > 0 ? Math.min(5_000, retryAfterMs) : 1_000;
|
|
1287
1330
|
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
1288
1331
|
}
|
|
1289
1332
|
|
|
@@ -1405,7 +1448,7 @@ function parseStringArray(value: unknown): string[] {
|
|
|
1405
1448
|
.filter(Boolean);
|
|
1406
1449
|
}
|
|
1407
1450
|
|
|
1408
|
-
function
|
|
1451
|
+
function toolMetadataFallback(toolId: string): ToolResultMetadataInput {
|
|
1409
1452
|
if (toolId === 'test_rate_limit') {
|
|
1410
1453
|
return {
|
|
1411
1454
|
toolId,
|
|
@@ -1450,193 +1493,6 @@ function wrapWorkerToolResult(
|
|
|
1450
1493
|
});
|
|
1451
1494
|
}
|
|
1452
1495
|
|
|
1453
|
-
async function executeSyntheticTestRateLimitBatch(
|
|
1454
|
-
req: RunRequest,
|
|
1455
|
-
input: Record<string, unknown>,
|
|
1456
|
-
): Promise<Record<string, unknown>> {
|
|
1457
|
-
const delayMs =
|
|
1458
|
-
typeof input.simulated_delay_ms === 'number' &&
|
|
1459
|
-
Number.isInteger(input.simulated_delay_ms) &&
|
|
1460
|
-
input.simulated_delay_ms > 0
|
|
1461
|
-
? input.simulated_delay_ms
|
|
1462
|
-
: 0;
|
|
1463
|
-
if (delayMs > 0) {
|
|
1464
|
-
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
1465
|
-
}
|
|
1466
|
-
const rawItems = Array.isArray(input.items) ? input.items : [];
|
|
1467
|
-
const items = rawItems
|
|
1468
|
-
.filter((item): item is Record<string, unknown> =>
|
|
1469
|
-
Boolean(item && typeof item === 'object' && !Array.isArray(item)),
|
|
1470
|
-
)
|
|
1471
|
-
.map((item, index) => {
|
|
1472
|
-
const itemKey =
|
|
1473
|
-
typeof item.itemKey === 'string' && item.itemKey.trim()
|
|
1474
|
-
? item.itemKey.trim()
|
|
1475
|
-
: `item-${index}`;
|
|
1476
|
-
const payload =
|
|
1477
|
-
item.payload &&
|
|
1478
|
-
typeof item.payload === 'object' &&
|
|
1479
|
-
!Array.isArray(item.payload)
|
|
1480
|
-
? (item.payload as Record<string, unknown>)
|
|
1481
|
-
: {};
|
|
1482
|
-
return { itemKey, payload };
|
|
1483
|
-
});
|
|
1484
|
-
const batchRequest: ToolBatchRequest = {
|
|
1485
|
-
runId: req.runId,
|
|
1486
|
-
orgId: req.orgId,
|
|
1487
|
-
toolId: 'test_rate_limit',
|
|
1488
|
-
operation: 'test_batch_rate_limit',
|
|
1489
|
-
provider: 'test',
|
|
1490
|
-
items,
|
|
1491
|
-
waterfallId:
|
|
1492
|
-
typeof input.waterfall_id === 'string' ? input.waterfall_id : null,
|
|
1493
|
-
stageId: typeof input.stage === 'string' ? input.stage : null,
|
|
1494
|
-
fieldName: typeof input.field_name === 'string' ? input.field_name : null,
|
|
1495
|
-
mapName: typeof input.map_name === 'string' ? input.map_name : null,
|
|
1496
|
-
chunkIndex:
|
|
1497
|
-
typeof input.chunk_index === 'number' ? input.chunk_index : null,
|
|
1498
|
-
userProvidedRateLimitKey:
|
|
1499
|
-
typeof input.rate_limit_key === 'string' ? input.rate_limit_key : null,
|
|
1500
|
-
providerBatchSize: 200,
|
|
1501
|
-
};
|
|
1502
|
-
const executor = createToolBatchExecutor({
|
|
1503
|
-
async executeProviderBatch({ items: providerItems }) {
|
|
1504
|
-
return providerItems.map((item) => ({
|
|
1505
|
-
itemKey: item.itemKey,
|
|
1506
|
-
result: executeSyntheticTestRateLimit(item.payload),
|
|
1507
|
-
}));
|
|
1508
|
-
},
|
|
1509
|
-
});
|
|
1510
|
-
const result = await executor.executeToolBatch(batchRequest);
|
|
1511
|
-
return {
|
|
1512
|
-
status: 'completed',
|
|
1513
|
-
key: String(input.key ?? 'batch'),
|
|
1514
|
-
provider: 'test',
|
|
1515
|
-
batch: true,
|
|
1516
|
-
batch_size: result.itemCount,
|
|
1517
|
-
provider_batch_count: result.batchCount,
|
|
1518
|
-
items: result.results.map((item) => ({
|
|
1519
|
-
itemKey: item.itemKey,
|
|
1520
|
-
result: item.result,
|
|
1521
|
-
})),
|
|
1522
|
-
};
|
|
1523
|
-
}
|
|
1524
|
-
|
|
1525
|
-
type SyntheticTransientRetryResult =
|
|
1526
|
-
| { ok: true; result: Record<string, unknown> }
|
|
1527
|
-
| { ok: false; status: number; message: string };
|
|
1528
|
-
|
|
1529
|
-
function executeSyntheticTransientRetry(
|
|
1530
|
-
toolId: string,
|
|
1531
|
-
input: Record<string, unknown>,
|
|
1532
|
-
attempt: number,
|
|
1533
|
-
): SyntheticTransientRetryResult {
|
|
1534
|
-
const failuresBeforeSuccess =
|
|
1535
|
-
typeof input.failures_before_success === 'number' &&
|
|
1536
|
-
Number.isInteger(input.failures_before_success) &&
|
|
1537
|
-
input.failures_before_success >= 0
|
|
1538
|
-
? input.failures_before_success
|
|
1539
|
-
: 1;
|
|
1540
|
-
if (attempt <= failuresBeforeSuccess) {
|
|
1541
|
-
const status = toolId === 'test_transient_429' ? 429 : 502;
|
|
1542
|
-
return {
|
|
1543
|
-
ok: false,
|
|
1544
|
-
status,
|
|
1545
|
-
message: `Synthetic transient ${status} for attempt ${attempt}`,
|
|
1546
|
-
};
|
|
1547
|
-
}
|
|
1548
|
-
return {
|
|
1549
|
-
ok: true,
|
|
1550
|
-
result: {
|
|
1551
|
-
status: 'completed',
|
|
1552
|
-
provider: 'test',
|
|
1553
|
-
key: String(input.key ?? 'transient'),
|
|
1554
|
-
attempts: attempt,
|
|
1555
|
-
recovered: attempt > 1,
|
|
1556
|
-
},
|
|
1557
|
-
};
|
|
1558
|
-
}
|
|
1559
|
-
|
|
1560
|
-
function executeSyntheticTestRateLimit(
|
|
1561
|
-
input: Record<string, unknown>,
|
|
1562
|
-
): Record<string, unknown> {
|
|
1563
|
-
if (
|
|
1564
|
-
typeof input.key === 'string' &&
|
|
1565
|
-
input.key.startsWith('public-error-message-regression')
|
|
1566
|
-
) {
|
|
1567
|
-
throw new ToolHttpError(
|
|
1568
|
-
[
|
|
1569
|
-
'tool test_rate_limit 422 attempt 1/1:',
|
|
1570
|
-
'Synthetic public test error with a redacted token=[REDACTED].',
|
|
1571
|
-
'code=TEST_PUBLIC_ERROR.',
|
|
1572
|
-
'failure_description=The fake test provider intentionally raised a typed public error so V2 runner output preserves actionable details.',
|
|
1573
|
-
'operator_hint=Use this no-bill test provider fixture when verifying play runner error rendering.',
|
|
1574
|
-
].join(' '),
|
|
1575
|
-
null,
|
|
1576
|
-
);
|
|
1577
|
-
}
|
|
1578
|
-
const rowNumber =
|
|
1579
|
-
typeof input.row_number === 'number' && Number.isInteger(input.row_number)
|
|
1580
|
-
? input.row_number
|
|
1581
|
-
: null;
|
|
1582
|
-
const leadId = typeof input.lead_id === 'string' ? input.lead_id : null;
|
|
1583
|
-
const matchedDomain =
|
|
1584
|
-
typeof input.matched_domain === 'string' && input.matched_domain.trim()
|
|
1585
|
-
? input.matched_domain.trim()
|
|
1586
|
-
: 'example.com';
|
|
1587
|
-
const matchedPrefix =
|
|
1588
|
-
typeof input.matched_prefix === 'string' && input.matched_prefix.trim()
|
|
1589
|
-
? input.matched_prefix.trim()
|
|
1590
|
-
: (leadId ??
|
|
1591
|
-
(rowNumber !== null
|
|
1592
|
-
? `row${String(rowNumber).padStart(3, '0')}`
|
|
1593
|
-
: 'match'));
|
|
1594
|
-
const matched = syntheticMatchWindow(input, rowNumber);
|
|
1595
|
-
const matchedEmail = matched ? `${matchedPrefix}@${matchedDomain}` : null;
|
|
1596
|
-
const securityGateway =
|
|
1597
|
-
input.emit_security_gateway === true
|
|
1598
|
-
? { email_status: 'valid', mx_security_gateway: true }
|
|
1599
|
-
: {};
|
|
1600
|
-
return {
|
|
1601
|
-
status: 'completed',
|
|
1602
|
-
key: String(input.key || ''),
|
|
1603
|
-
provider: 'test',
|
|
1604
|
-
lead_id: leadId,
|
|
1605
|
-
row_number: rowNumber,
|
|
1606
|
-
matched_result: matchedEmail,
|
|
1607
|
-
email: matchedEmail,
|
|
1608
|
-
value: matchedEmail,
|
|
1609
|
-
batch: false,
|
|
1610
|
-
...securityGateway,
|
|
1611
|
-
};
|
|
1612
|
-
}
|
|
1613
|
-
|
|
1614
|
-
/**
 * Decide whether a synthetic test row counts as a "match" for the configured
 * match window.
 *
 * Window options are read from `input`:
 * - `match_rows_min` / `match_rows_max`: inclusive row-number bounds. NaN is
 *   treated as "not configured" so it behaves the same for numbered and
 *   un-numbered rows.
 * - `match_modulo_base` (> 0) with `match_modulo_equals` (array of numbers):
 *   the row matches only when its residue modulo the base equals the residue
 *   of one of the listed entries. Non-number entries are ignored.
 *
 * @param input - Raw tool input carrying the optional window settings.
 * @param rowNumber - Row number of the request, or `null` when unknown.
 * @returns `true` when the row is inside the window. A `null` row number
 *   matches only when no row bounds are configured.
 */
function syntheticMatchWindow(
  input: Record<string, unknown>,
  rowNumber: number | null,
): boolean {
  // `typeof NaN === 'number'`, so NaN must be filtered explicitly; otherwise it
  // is ignored by the comparisons below but still blocks `rowNumber === null`.
  const readBound = (value: unknown): number | null =>
    typeof value === 'number' && !Number.isNaN(value) ? value : null;

  const min = readBound(input.match_rows_min);
  const max = readBound(input.match_rows_max);
  if (rowNumber === null) return min === null && max === null;
  if (min !== null && rowNumber < min) return false;
  if (max !== null && rowNumber > max) return false;

  const rawBase = input.match_modulo_base;
  if (typeof rawBase !== 'number' || !(rawBase > 0)) return true;
  const moduloBase = rawBase;

  // JavaScript's `%` keeps the sign of the dividend; shift negative remainders
  // into [0, base) so entries such as -1 land in the same residue class as
  // base - 1. Non-negative values are left exactly as `%` produced them.
  const residue = (value: number): number => {
    const remainder = value % moduloBase;
    return remainder < 0 ? remainder + moduloBase : remainder;
  };

  const residues = Array.isArray(input.match_modulo_equals)
    ? input.match_modulo_equals
        .filter((entry): entry is number => typeof entry === 'number')
        .map(residue)
    : [];
  return residues.includes(residue(rowNumber));
}
|
|
1639
|
-
|
|
1640
1496
|
/**
 * Type guard for "plain record-like" values: any non-null object that is not
 * an array. Primitives, functions, `null` and `undefined` are rejected.
 */
function isRecordLike(value: unknown): value is Record<string, unknown> {
  if (value === null || value === undefined) return false;
  if (typeof value !== 'object') return false;
  return !Array.isArray(value);
}
|
|
@@ -1730,7 +1586,12 @@ type WorkerToolBatchRequest = {
|
|
|
1730
1586
|
reject: (error: unknown) => void;
|
|
1731
1587
|
};
|
|
1732
1588
|
|
|
1733
|
-
const WORKER_TOOL_BATCH_GRACE_MS =
|
|
1589
|
+
const WORKER_TOOL_BATCH_GRACE_MS = 250;
|
|
1590
|
+
// Fallback batch-chunk parallelism when a tool declares no provider rate hints.
|
|
1591
|
+
// Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
|
|
1592
|
+
// their previous batching behavior; declared providers tighten via the
|
|
1593
|
+
// Governor's suggestedParallelism.
|
|
1594
|
+
const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
|
|
1734
1595
|
const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
|
|
1735
1596
|
|
|
1736
1597
|
function stepProgramColumnName(parentField: string, stepId: string): string {
|
|
@@ -1741,7 +1602,32 @@ class WorkerToolBatchScheduler {
|
|
|
1741
1602
|
private queue: WorkerToolBatchRequest[] = [];
|
|
1742
1603
|
private scheduled = false;
|
|
1743
1604
|
|
|
1744
|
-
constructor(
|
|
1605
|
+
// Parameter-property constructor: each argument becomes a private readonly
// member. `req` is the run request passed to executeTool, `governor` supplies
// tool slots / budgets / backpressure reporting, `resolvePacing` maps a toolId
// to its provider, `abortSignal` (optional) is forwarded to slot acquisition,
// and `onRequestsSettled` (optional) is called with the number of requests
// that just settled.
constructor(
|
|
1606
|
+
private readonly req: RunRequest,
|
|
1607
|
+
private readonly governor: PlayExecutionGovernor,
|
|
1608
|
+
private readonly resolvePacing: WorkerPacingResolver,
|
|
1609
|
+
private readonly abortSignal?: AbortSignal,
|
|
1610
|
+
private readonly onRequestsSettled?: (count: number) => void,
|
|
1611
|
+
) {}
|
|
1612
|
+
|
|
1613
|
+
/**
 * Report a provider 429 / Retry-After back into the Governor's pacer so
 * future acquires for this provider back off. The provider is looked up via
 * `resolvePacing`, so callers pass only the toolId.
 *
 * The report is advisory and fire-and-forget: the method returns immediately,
 * and any failure while resolving the provider or reporting to the Governor
 * is swallowed so it can never surface as an unhandled rejection or fail the
 * tool call that triggered it.
 *
 * @param toolId - Tool whose provider signalled backpressure.
 * @param retryAfterMs - Suggested back-off in milliseconds. Values that are
 *   not finite and strictly positive (0, negatives, NaN, Infinity) are ignored.
 */
private reportBackpressure(toolId: string, retryAfterMs: number): void {
  // `retryAfterMs <= 0` alone lets NaN through (every comparison with NaN is
  // false), so require an explicitly finite, positive delay.
  if (!Number.isFinite(retryAfterMs) || retryAfterMs <= 0) return;
  void (async () => {
    try {
      const pacing = await this.resolvePacing(toolId);
      if (pacing?.provider) {
        // Awaited so a rejected promise (if the Governor returns one) is
        // caught here; awaiting a non-promise value is a no-op.
        await this.governor.reportProviderBackpressure({
          provider: pacing.provider,
          retryAfterMs,
        });
      }
    } catch {
      // Best-effort: losing one backpressure report only delays the back-off.
    }
  })();
}
|
|
1745
1631
|
|
|
1746
1632
|
execute(
|
|
1747
1633
|
id: string,
|
|
@@ -1824,16 +1710,27 @@ class WorkerToolBatchScheduler {
|
|
|
1824
1710
|
const groupStartedAt = nowMs();
|
|
1825
1711
|
await Promise.all(
|
|
1826
1712
|
requests.map(async (request) => {
|
|
1713
|
+
// Each unbatched provider call takes its own tool slot: the Governor
|
|
1714
|
+
// charges tool budget, holds a global tool-concurrency slot, and
|
|
1715
|
+
// applies per-(org,provider) pacing before the call runs.
|
|
1716
|
+
const slot = await this.governor.acquireToolSlot(toolId, {
|
|
1717
|
+
signal: this.abortSignal,
|
|
1718
|
+
});
|
|
1827
1719
|
try {
|
|
1828
1720
|
request.resolve(
|
|
1829
1721
|
await executeTool(
|
|
1830
1722
|
this.req,
|
|
1831
1723
|
{ id: request.id, toolId, input: request.input },
|
|
1832
1724
|
request.workflowStep,
|
|
1725
|
+
(retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
|
|
1726
|
+
() => this.governor.chargeBudget('retry'),
|
|
1833
1727
|
),
|
|
1834
1728
|
);
|
|
1835
1729
|
} catch (error) {
|
|
1836
1730
|
request.reject(error);
|
|
1731
|
+
} finally {
|
|
1732
|
+
this.onRequestsSettled?.(1);
|
|
1733
|
+
slot.release();
|
|
1837
1734
|
}
|
|
1838
1735
|
}),
|
|
1839
1736
|
);
|
|
@@ -1851,6 +1748,15 @@ class WorkerToolBatchScheduler {
|
|
|
1851
1748
|
req: this.req,
|
|
1852
1749
|
requests,
|
|
1853
1750
|
strategy,
|
|
1751
|
+
governor: this.governor,
|
|
1752
|
+
suggestedParallelism: await this.governor.suggestedParallelism(
|
|
1753
|
+
toolId,
|
|
1754
|
+
WORKER_TOOL_BATCH_DEFAULT_PARALLELISM,
|
|
1755
|
+
),
|
|
1756
|
+
abortSignal: this.abortSignal,
|
|
1757
|
+
reportBackpressure: (retryAfterMs) =>
|
|
1758
|
+
this.reportBackpressure(toolId, retryAfterMs),
|
|
1759
|
+
onRequestsSettled: this.onRequestsSettled,
|
|
1854
1760
|
});
|
|
1855
1761
|
recordRunnerPerfTrace({
|
|
1856
1762
|
req: this.req,
|
|
@@ -1880,22 +1786,60 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1880
1786
|
req: RunRequest;
|
|
1881
1787
|
requests: WorkerToolBatchRequest[];
|
|
1882
1788
|
strategy: AnyBatchOperationStrategy;
|
|
1789
|
+
governor: PlayExecutionGovernor;
|
|
1790
|
+
suggestedParallelism: number;
|
|
1791
|
+
abortSignal?: AbortSignal;
|
|
1792
|
+
reportBackpressure: (retryAfterMs: number) => void;
|
|
1793
|
+
onRequestsSettled?: (count: number) => void;
|
|
1883
1794
|
}): Promise<void> {
|
|
1884
1795
|
const compiledBatches = compileRequestsWithStrategy({
|
|
1885
1796
|
requests: input.requests,
|
|
1886
1797
|
strategy: input.strategy,
|
|
1887
1798
|
getPayload: (request) => request.input,
|
|
1888
1799
|
});
|
|
1800
|
+
recordRunnerPerfTrace({
|
|
1801
|
+
req: input.req,
|
|
1802
|
+
phase: 'runner.tool.batch.compile',
|
|
1803
|
+
ms: 0,
|
|
1804
|
+
extra: {
|
|
1805
|
+
sourceOperation: input.strategy.sourceOperation,
|
|
1806
|
+
batchOperation: input.strategy.batchOperation,
|
|
1807
|
+
requests: input.requests.length,
|
|
1808
|
+
batches: compiledBatches.length,
|
|
1809
|
+
batchSizes: compiledBatches.map((batch) => batch.memberRequests.length),
|
|
1810
|
+
},
|
|
1811
|
+
});
|
|
1889
1812
|
|
|
1890
1813
|
await executeChunkedRequests({
|
|
1891
1814
|
requests: compiledBatches,
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1815
|
+
// Chunk parallelism is the Governor's per-tool suggestion (provider rate
|
|
1816
|
+
// hints tightened to the policy ceiling), bounded by the batch count.
|
|
1817
|
+
batchSize: Math.max(
|
|
1818
|
+
1,
|
|
1819
|
+
Math.min(input.suggestedParallelism, compiledBatches.length || 1),
|
|
1820
|
+
),
|
|
1821
|
+
execute: async (batch) => {
|
|
1822
|
+
// One provider call per batch → one tool slot (budget + global
|
|
1823
|
+
// concurrency + per-(org,provider) pacing) around the whole batch.
|
|
1824
|
+
const slot = await input.governor.acquireToolSlot(batch.batchOperation, {
|
|
1825
|
+
signal: input.abortSignal,
|
|
1826
|
+
});
|
|
1827
|
+
try {
|
|
1828
|
+
return await executeTool(
|
|
1829
|
+
input.req,
|
|
1830
|
+
{
|
|
1831
|
+
id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
|
|
1832
|
+
toolId: batch.batchOperation,
|
|
1833
|
+
input: batch.batchPayload,
|
|
1834
|
+
},
|
|
1835
|
+
undefined,
|
|
1836
|
+
input.reportBackpressure,
|
|
1837
|
+
() => input.governor.chargeBudget('retry'),
|
|
1838
|
+
);
|
|
1839
|
+
} finally {
|
|
1840
|
+
slot.release();
|
|
1841
|
+
}
|
|
1842
|
+
},
|
|
1899
1843
|
onChunkComplete: async (
|
|
1900
1844
|
chunkResults: Array<
|
|
1901
1845
|
ChunkExecutionResult<(typeof compiledBatches)[number], unknown>
|
|
@@ -1919,11 +1863,18 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1919
1863
|
wrapWorkerToolResult(
|
|
1920
1864
|
request.toolId,
|
|
1921
1865
|
splitResults[index] ?? null,
|
|
1922
|
-
|
|
1866
|
+
toolMetadataFallback(request.toolId),
|
|
1923
1867
|
),
|
|
1924
1868
|
);
|
|
1925
1869
|
}
|
|
1926
1870
|
}
|
|
1871
|
+
const settledMembers = chunkResults.reduce(
|
|
1872
|
+
(total, entry) => total + entry.request.memberRequests.length,
|
|
1873
|
+
0,
|
|
1874
|
+
);
|
|
1875
|
+
if (settledMembers > 0) {
|
|
1876
|
+
input.onRequestsSettled?.(settledMembers);
|
|
1877
|
+
}
|
|
1927
1878
|
},
|
|
1928
1879
|
}).catch((error) => {
|
|
1929
1880
|
for (const request of input.requests) {
|
|
@@ -3039,10 +2990,10 @@ async function persistCompletedMapRows(input: {
|
|
|
3039
2990
|
tableNamespace: input.tableNamespace,
|
|
3040
2991
|
sheetContract: augmentSheetContractWithDatasetFields({
|
|
3041
2992
|
contract: requireSheetContract(input.req, input.tableNamespace),
|
|
3042
|
-
rows: input.rows,
|
|
2993
|
+
rows: input.rows.map((row) => publicCsvStorageRow(row)),
|
|
3043
2994
|
outputFields,
|
|
3044
2995
|
}),
|
|
3045
|
-
rows: input.rows,
|
|
2996
|
+
rows: input.rows.map((row) => publicCsvStorageRow(row)),
|
|
3046
2997
|
outputFields,
|
|
3047
2998
|
runId: input.req.runId,
|
|
3048
2999
|
userEmail: input.req.userEmail,
|
|
@@ -3073,10 +3024,10 @@ async function prepareMapRows(input: {
|
|
|
3073
3024
|
tableNamespace: input.tableNamespace,
|
|
3074
3025
|
sheetContract: augmentSheetContractWithDatasetFields({
|
|
3075
3026
|
contract: requireSheetContract(input.req, input.tableNamespace),
|
|
3076
|
-
rows: input.rows,
|
|
3027
|
+
rows: input.rows.map((row) => publicCsvStorageRow(row)),
|
|
3077
3028
|
outputFields: input.outputFields,
|
|
3078
3029
|
}),
|
|
3079
|
-
rows: input.rows.map((row) => (
|
|
3030
|
+
rows: input.rows.map((row) => publicCsvStorageRow(row)),
|
|
3080
3031
|
runId: input.req.runId,
|
|
3081
3032
|
userEmail: input.req.userEmail,
|
|
3082
3033
|
cellPolicies: input.cellPolicies,
|
|
@@ -3164,9 +3115,23 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
|
|
|
3164
3115
|
/**
 * Report whether a child play's static pipeline references a ctx dataset.
 *
 * Returns `true` when the pipeline itself carries a non-empty string
 * `tableNamespace` or a truthy `sheetContract`, or when any substep returned
 * by `flattenStaticPipeline` is of type `'dataset'` or carries its own
 * non-empty `tableNamespace` / truthy `sheetContract`. A missing pipeline
 * yields `false`.
 */
function childPipelineUsesCtxDataset(
  pipeline: PlayStaticPipeline | null | undefined,
): boolean {
  if (!pipeline) return false;

  const pipelineHasNamespace =
    typeof pipeline.tableNamespace === 'string' &&
    Boolean(pipeline.tableNamespace);
  if (pipelineHasNamespace) return true;
  if (pipeline.sheetContract) return true;

  for (const substep of flattenStaticPipeline(pipeline)) {
    if (substep.type === 'dataset') return true;
    if (!isRecord(substep)) continue;
    const substepHasNamespace =
      'tableNamespace' in substep &&
      typeof substep.tableNamespace === 'string' &&
      substep.tableNamespace.length > 0;
    const substepHasContract =
      'sheetContract' in substep && Boolean(substep.sheetContract);
    if (substepHasNamespace || substepHasContract) return true;
  }
  return false;
}
|
|
3171
3136
|
|
|
3172
3137
|
function childPipelineNeedsWorkflowScheduler(
|
|
@@ -3181,16 +3146,207 @@ function childPipelineNeedsWorkflowScheduler(
|
|
|
3181
3146
|
);
|
|
3182
3147
|
}
|
|
3183
3148
|
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3149
|
+
/**
 * Build the rate port handed to the Coordinator rate-state backend.
 *
 * Each operation prefers the in-process coordinator binding
 * (`cachedCoordinatorBinding.rateAcquire` / `.ratePenalize`). When the binding
 * or that RPC is absent, it falls back to an HTTP POST against
 * `req.coordinatorUrl` (`/rate-acquire` or `/rate-penalize`), sending the
 * coordinator request headers plus a fresh `x-deepline-request-id`.
 *
 * Failure behaviour of this port:
 * - `rateAcquire` throws when neither the binding nor a coordinator URL is
 *   available, and when the HTTP call returns a non-2xx status.
 * - `ratePenalize` silently does nothing when neither transport is available,
 *   and throws on a non-2xx HTTP status.
 *
 * NOTE(review): an earlier comment described this path as failing OPEN (grant
 * immediately). Nothing in this function grants on failure, so that behaviour
 * presumably lives in the caller that wraps this port; confirm.
 *
 * @param req - Run request supplying `runId`, `coordinatorUrl` and
 *   `coordinatorInternalToken`.
 * @returns The port with `rateAcquire` and `ratePenalize`.
 */
function createCoordinatorRatePort(req: RunRequest): CoordinatorRatePort {
  // Shared HTTP fallback for both rate operations; throws on non-2xx.
  const postToCoordinator = async (
    coordinatorUrl: string,
    operation: 'acquire' | 'penalize',
    payload: unknown,
  ): Promise<Response> => {
    const res = await fetch(
      `${coordinatorUrl.replace(/\/$/, '')}/rate-${operation}`,
      {
        method: 'POST',
        headers: {
          'x-deepline-request-id': makeRequestId(),
          ...coordinatorRequestHeaders({
            runId: req.runId,
            contentType: 'application/json',
            internalToken: req.coordinatorInternalToken,
          }),
        },
        body: JSON.stringify(payload),
      },
    );
    if (!res.ok) {
      // The body is only diagnostic; never let reading it mask the status.
      const text = await res.text().catch(() => '');
      throw new Error(
        `Coordinator rate ${operation} failed (${res.status}): ${text}`,
      );
    }
    return res;
  };

  return {
    async rateAcquire(input) {
      const binding = cachedCoordinatorBinding;
      if (binding?.rateAcquire) {
        return await binding.rateAcquire(input);
      }
      const coordinatorUrl = req.coordinatorUrl?.trim();
      if (!coordinatorUrl) {
        throw new Error('Coordinator rate acquire is unavailable.');
      }
      const res = await postToCoordinator(coordinatorUrl, 'acquire', input);
      return (await res.json()) as { granted: number; waitMs: number };
    },
    async ratePenalize(input) {
      const binding = cachedCoordinatorBinding;
      if (binding?.ratePenalize) {
        await binding.ratePenalize(input);
        return;
      }
      const coordinatorUrl = req.coordinatorUrl?.trim();
      // No transport at all: penalties are advisory, so drop this one.
      if (!coordinatorUrl) return;
      await postToCoordinator(coordinatorUrl, 'penalize', input);
    },
  };
}
|
|
3218
|
+
|
|
3219
|
+
/**
 * Resolves a tool id to its provider and pacing rules, or `null` when the
 * tool has no usable pacing metadata.
 */
type WorkerPacingResolver = (
  toolId: string,
) => Promise<{ provider: string; rules: PacingRule[] } | null>;

/**
 * Create a memoizing pacing resolver for one run request.
 *
 * For each tool id the resolver issues
 * `GET /api/v2/plays/runtime-tools/<toolId>` through `fetchRuntimeApi`,
 * authenticated with the request's executor token, and converts the
 * response's `provider` and `queueHints` into pacing rules.
 *
 * Caching: results are memoized per resolver instance (the cache lives in
 * this closure), keyed by the trimmed tool id. Concurrent callers share one
 * in-flight lookup. Definitive answers, including "no hints" (`null`), stay
 * cached. Transient failures (the fetch throws, or the endpoint answers 5xx
 * or 429) resolve to `null` for the current caller but are evicted, so a
 * single blip does not disable pacing for the tool for the rest of the run.
 *
 * Validation: a hint is kept only when `ruleId` is a string and both
 * `requestsPerWindow` and `windowMs` are finite numbers greater than zero;
 * `maxConcurrency` is passed through when it is a number, else `null`.
 *
 * NOTE(review): assumes `fetchRuntimeApi` resolves to a fetch `Response`
 * (uses `ok`, `status`, `json()`); confirm.
 *
 * @param req - Run request supplying `baseUrl` and `executorToken`.
 * @returns A resolver that never rejects; failures surface as `null`.
 */
function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
  type ToolPacing = { provider: string; rules: PacingRule[] };
  type Lookup = { pacing: ToolPacing | null; transient: boolean };

  const cache = new Map<string, Promise<ToolPacing | null>>();

  const isPositiveFinite = (value: unknown): value is number =>
    typeof value === 'number' && Number.isFinite(value) && value > 0;

  // Convert one raw hint into zero or one rule (flatMap-friendly).
  const toRules = (hint: unknown): PacingRule[] => {
    if (!hint || typeof hint !== 'object') return [];
    const { ruleId, requestsPerWindow, windowMs, maxConcurrency } =
      hint as Record<string, unknown>;
    if (
      typeof ruleId !== 'string' ||
      !isPositiveFinite(requestsPerWindow) ||
      !isPositiveFinite(windowMs)
    ) {
      return [];
    }
    return [
      {
        ruleId,
        requestsPerWindow,
        windowMs,
        maxConcurrency: typeof maxConcurrency === 'number' ? maxConcurrency : null,
      } satisfies PacingRule,
    ];
  };

  // Fetch and parse the metadata for one tool; never rejects.
  const lookup = async (toolId: string): Promise<Lookup> => {
    try {
      const res = await fetchRuntimeApi(
        req.baseUrl,
        `/api/v2/plays/runtime-tools/${encodeURIComponent(toolId)}`,
        {
          method: 'GET',
          headers: { authorization: `Bearer ${req.executorToken}` },
        },
      );
      if (!res) return { pacing: null, transient: true };
      if (!res.ok) {
        // Server-side or throttling errors may clear up; other statuses
        // (e.g. unknown tool) are treated as a definitive "no hints".
        return {
          pacing: null,
          transient: res.status >= 500 || res.status === 429,
        };
      }
      const body = (await res.json().catch(() => null)) as {
        provider?: unknown;
        queueHints?: unknown;
      } | null;
      if (!body) return { pacing: null, transient: false };
      const provider =
        typeof body.provider === 'string' && body.provider.trim()
          ? body.provider.trim()
          : null;
      if (!provider || !Array.isArray(body.queueHints)) {
        return { pacing: null, transient: false };
      }
      const rules = body.queueHints.flatMap(toRules);
      return {
        pacing: rules.length === 0 ? null : { provider, rules },
        transient: false,
      };
    } catch {
      // Network / binding failure: report "no pacing" now, retry next time.
      return { pacing: null, transient: true };
    }
  };

  return (toolId: string) => {
    const normalized = String(toolId || '').trim();
    if (!normalized) return Promise.resolve(null);
    const cached = cache.get(normalized);
    if (cached) return cached;
    const pending: Promise<ToolPacing | null> = lookup(normalized).then(
      ({ pacing, transient }) => {
        // Evict only our own entry so a newer lookup is never clobbered.
        if (transient && cache.get(normalized) === pending) {
          cache.delete(normalized);
        }
        return pacing;
      },
    );
    cache.set(normalized, pending);
    return pending;
  };
}
|
|
3289
|
+
|
|
3290
|
+
/**
 * Build the Governor's lineage snapshot for this worker from the inherited
 * `req.playCallGovernance` (if any).
 *
 * - `rootRunId` is the inherited root, falling back to this run's id.
 * - `currentRunId` / `currentPlayId` are always this request's run and play.
 * - `ancestryPlayIds` is the inherited chain, extended with `req.playName`
 *   unless the chain already ends with it; with no inherited chain it is just
 *   `[req.playName]`.
 * - `ancestryRunIds` is `[rootRunId, runId]`, or `[runId]` when this run is
 *   the root.
 * - `callDepth` and every budget counter (`playCallCount`, `toolCallCount`,
 *   `retryCount`, `descendantCount`, `waterfallStepExecutions`) are seeded
 *   from the inherited snapshot and default to 0 when it does not carry them
 *   (e.g. older callers).
 * - `parentChildCalls` always starts empty.
 */
function resumeGovernanceFromRequest(req: RunRequest): GovernanceSnapshot {
  const parent = req.playCallGovernance;
  const rootRunId = parent?.rootRunId || req.runId;

  // Make sure the play chain ends with the currently executing play.
  const inheritedChain = parent?.ancestryPlayIds;
  let ancestryPlayIds: string[];
  if (!inheritedChain?.length) {
    ancestryPlayIds = [req.playName];
  } else if (inheritedChain[inheritedChain.length - 1] === req.playName) {
    ancestryPlayIds = [...inheritedChain];
  } else {
    ancestryPlayIds = [...inheritedChain, req.playName];
  }

  const ancestryRunIds =
    rootRunId !== req.runId ? [rootRunId, req.runId] : [req.runId];

  // Inherited counters carry over; anything missing starts at zero.
  const seed = (value: number | null | undefined): number => value ?? 0;

  return {
    rootRunId,
    currentRunId: req.runId,
    currentPlayId: req.playName,
    ancestryPlayIds,
    ancestryRunIds,
    callDepth: seed(parent?.callDepth),
    playCallCount: seed(parent?.playCallCount),
    toolCallCount: seed(parent?.toolCallCount),
    retryCount: seed(parent?.retryCount),
    descendantCount: seed(parent?.descendantCount),
    waterfallStepExecutions: seed(parent?.waterfallStepExecutions),
    parentChildCalls: {},
  };
}
|
|
3333
|
+
|
|
3334
|
+
/**
 * Create the play-execution Governor for one run, together with the pacing
 * resolver it uses.
 *
 * The Governor is built with the `'esm_workers'` adapter, a rate-state backend
 * wrapping the coordinator rate port for this request, the pacing resolver,
 * and the lineage snapshot from `resumeGovernanceFromRequest`.
 *
 * The scope's `rootRunId` is taken from that same snapshot rather than being
 * recomputed, so the scope and the resumed lineage can never disagree about
 * the root (previously one used `??` and the other `||`, which differ when
 * the inherited `rootRunId` is an empty string).
 *
 * @param req - Run request for the executing play.
 * @returns The Governor and the resolver, so callers can reuse the resolver.
 */
function createGovernorForRun(req: RunRequest): {
  governor: PlayExecutionGovernor;
  resolvePacing: WorkerPacingResolver;
} {
  const resolvePacing = createWorkerPacingResolver(req);
  const resume = resumeGovernanceFromRequest(req);
  const governor = createPlayExecutionGovernor({
    adapter: 'esm_workers',
    scope: {
      orgId: req.orgId,
      rootRunId: resume.rootRunId,
    },
    rateState: new CoordinatorRateStateBackend(createCoordinatorRatePort(req)),
    resolvePacing,
    resume,
  });
  return { governor, resolvePacing };
}
|
|
3195
3351
|
|
|
3196
3352
|
function createMinimalWorkerCtx(
|
|
@@ -3201,12 +3357,12 @@ function createMinimalWorkerCtx(
|
|
|
3201
3357
|
abortSignal?: AbortSignal,
|
|
3202
3358
|
callbacks?: WorkerCtxCallbacks,
|
|
3203
3359
|
): unknown {
|
|
3204
|
-
|
|
3205
|
-
|
|
3360
|
+
const { governor, resolvePacing: resolveToolPacing } =
|
|
3361
|
+
createGovernorForRun(req);
|
|
3362
|
+
// Play-call depth/count/per-parent budgets, child-play concurrency, and the
|
|
3363
|
+
// lineage snapshot are owned by the Governor (createGovernorForRun above).
|
|
3364
|
+
// The worker keeps only substrate mechanism here.
|
|
3206
3365
|
const stepCallCounts: Record<string, number> = {};
|
|
3207
|
-
const inFlightChildCallsByPlayName: Record<string, number> = {};
|
|
3208
|
-
let inFlightChildPlayCalls = 0;
|
|
3209
|
-
const childPlaySlotWaiters: Array<() => void> = [];
|
|
3210
3366
|
const secretRedactor = createSecretRedactionContext();
|
|
3211
3367
|
|
|
3212
3368
|
const resolveSecretAuth = async (auth?: SecretAuth) => {
|
|
@@ -3245,38 +3401,6 @@ function createMinimalWorkerCtx(
|
|
|
3245
3401
|
: { [auth.header.toLowerCase()]: value };
|
|
3246
3402
|
};
|
|
3247
3403
|
|
|
3248
|
-
const acquireChildPlaySlot = async (): Promise<() => void> => {
|
|
3249
|
-
while (
|
|
3250
|
-
inFlightChildPlayCalls >= WORKER_PLAY_CALL_LIMITS.maxConcurrentPlayCalls
|
|
3251
|
-
) {
|
|
3252
|
-
await new Promise<void>((resolve, reject) => {
|
|
3253
|
-
const waiter = () => {
|
|
3254
|
-
abortSignal?.removeEventListener('abort', onAbort);
|
|
3255
|
-
resolve();
|
|
3256
|
-
};
|
|
3257
|
-
const onAbort = () => {
|
|
3258
|
-
const index = childPlaySlotWaiters.indexOf(waiter);
|
|
3259
|
-
if (index >= 0) childPlaySlotWaiters.splice(index, 1);
|
|
3260
|
-
reject(
|
|
3261
|
-
abortSignal?.reason instanceof Error
|
|
3262
|
-
? abortSignal.reason
|
|
3263
|
-
: new WorkflowAbortError(),
|
|
3264
|
-
);
|
|
3265
|
-
};
|
|
3266
|
-
childPlaySlotWaiters.push(waiter);
|
|
3267
|
-
abortSignal?.addEventListener('abort', onAbort, { once: true });
|
|
3268
|
-
});
|
|
3269
|
-
assertNotAborted(abortSignal);
|
|
3270
|
-
}
|
|
3271
|
-
inFlightChildPlayCalls += 1;
|
|
3272
|
-
let released = false;
|
|
3273
|
-
return () => {
|
|
3274
|
-
if (released) return;
|
|
3275
|
-
released = true;
|
|
3276
|
-
inFlightChildPlayCalls = Math.max(0, inFlightChildPlayCalls - 1);
|
|
3277
|
-
childPlaySlotWaiters.shift()?.();
|
|
3278
|
-
};
|
|
3279
|
-
};
|
|
3280
3404
|
const rootGovernance = req.playCallGovernance;
|
|
3281
3405
|
const rootRunId = rootGovernance?.rootRunId ?? req.runId;
|
|
3282
3406
|
const receiptStore = createHarnessWorkerReceiptStore({
|
|
@@ -3401,6 +3525,7 @@ function createMinimalWorkerCtx(
|
|
|
3401
3525
|
...progress,
|
|
3402
3526
|
updatedAt: progress.updatedAt ?? nowMs(),
|
|
3403
3527
|
},
|
|
3528
|
+
forceFlush: true,
|
|
3404
3529
|
});
|
|
3405
3530
|
};
|
|
3406
3531
|
const formatMapProgressMessage = (completed: number, total?: number) =>
|
|
@@ -3530,6 +3655,18 @@ function createMinimalWorkerCtx(
|
|
|
3530
3655
|
completedRows: prepared.completedRows.length,
|
|
3531
3656
|
},
|
|
3532
3657
|
});
|
|
3658
|
+
updateMapProgress({
|
|
3659
|
+
completed: prepared.completedRows.length,
|
|
3660
|
+
total: chunkRows.length,
|
|
3661
|
+
startedAt: mapStartedAt,
|
|
3662
|
+
message:
|
|
3663
|
+
prepared.pendingRows.length > 0
|
|
3664
|
+
? `${prepared.pendingRows.length.toLocaleString()} rows queued`
|
|
3665
|
+
: formatMapProgressMessage(
|
|
3666
|
+
prepared.completedRows.length,
|
|
3667
|
+
chunkRows.length,
|
|
3668
|
+
),
|
|
3669
|
+
});
|
|
3533
3670
|
const pendingKeys = new Set<string>();
|
|
3534
3671
|
const pendingRowsByKey = new Map<string, Record<string, unknown>>();
|
|
3535
3672
|
const completedKeys = new Set<string>();
|
|
@@ -3577,7 +3714,40 @@ function createMinimalWorkerCtx(
|
|
|
3577
3714
|
0,
|
|
3578
3715
|
prepared.skipped - missingPreparedRows.length,
|
|
3579
3716
|
);
|
|
3580
|
-
|
|
3717
|
+
let settledToolRequests = 0;
|
|
3718
|
+
let lastToolProgressAt = 0;
|
|
3719
|
+
const reportSettledToolRequests = (count: number) => {
|
|
3720
|
+
if (count <= 0) return;
|
|
3721
|
+
settledToolRequests += count;
|
|
3722
|
+
const now = nowMs();
|
|
3723
|
+
const estimatedCompleted = Math.min(
|
|
3724
|
+
chunkRows.length,
|
|
3725
|
+
prepared.completedRows.length + settledToolRequests,
|
|
3726
|
+
);
|
|
3727
|
+
const isTerminalEstimate = estimatedCompleted >= chunkRows.length;
|
|
3728
|
+
if (
|
|
3729
|
+
!isTerminalEstimate &&
|
|
3730
|
+
now - lastToolProgressAt < RUN_LEDGER_FLUSH_INTERVAL_MS
|
|
3731
|
+
) {
|
|
3732
|
+
return;
|
|
3733
|
+
}
|
|
3734
|
+
lastToolProgressAt = now;
|
|
3735
|
+
updateMapProgress({
|
|
3736
|
+
completed: estimatedCompleted,
|
|
3737
|
+
total: chunkRows.length,
|
|
3738
|
+
startedAt: mapStartedAt,
|
|
3739
|
+
message: formatMapProgressMessage(
|
|
3740
|
+
estimatedCompleted,
|
|
3741
|
+
chunkRows.length,
|
|
3742
|
+
),
|
|
3743
|
+
});
|
|
3744
|
+
};
|
|
3745
|
+
// Row concurrency comes from the Governor: an explicit map concurrency is
|
|
3746
|
+
// clamped to the policy row-max, otherwise the policy default. Each row
|
|
3747
|
+
// body additionally acquires a global row slot (the Governor's rowMax
|
|
3748
|
+
// semaphore) so total in-flight rows across all maps in this isolate stay
|
|
3749
|
+
// bounded even when several maps run at once.
|
|
3750
|
+
const concurrency = governor.resolveRowConcurrency();
|
|
3581
3751
|
const executedRows: Array<T & Record<string, unknown>> = new Array(
|
|
3582
3752
|
rowsToExecute.length,
|
|
3583
3753
|
);
|
|
@@ -3594,7 +3764,13 @@ function createMinimalWorkerCtx(
|
|
|
3594
3764
|
>
|
|
3595
3765
|
| undefined
|
|
3596
3766
|
> = new Array(rowsToExecute.length);
|
|
3597
|
-
const toolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3767
|
+
const toolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3768
|
+
req,
|
|
3769
|
+
governor,
|
|
3770
|
+
resolveToolPacing,
|
|
3771
|
+
abortSignal,
|
|
3772
|
+
reportSettledToolRequests,
|
|
3773
|
+
);
|
|
3598
3774
|
const generatedOutputFields = new Set<string>();
|
|
3599
3775
|
let idx = 0;
|
|
3600
3776
|
const workers: Array<Promise<void>> = [];
|
|
@@ -3605,143 +3781,152 @@ function createMinimalWorkerCtx(
|
|
|
3605
3781
|
if (abortSignal?.aborted) return;
|
|
3606
3782
|
const myIndex = idx++;
|
|
3607
3783
|
if (myIndex >= rowsToExecute.length) return;
|
|
3608
|
-
const
|
|
3609
|
-
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
|
|
3614
|
-
|
|
3615
|
-
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
3623
|
-
|
|
3624
|
-
|
|
3625
|
-
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
|
|
3642
|
-
|
|
3784
|
+
const rowSlot = await governor.acquireRowSlot({
|
|
3785
|
+
signal: abortSignal,
|
|
3786
|
+
});
|
|
3787
|
+
try {
|
|
3788
|
+
const entry = uniqueRowsToExecuteEntries[myIndex]!;
|
|
3789
|
+
const row = pendingRowsByKey.has(entry.rowKey)
|
|
3790
|
+
? ({
|
|
3791
|
+
...entry.row,
|
|
3792
|
+
...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
|
|
3793
|
+
} as T & Record<string, unknown>)
|
|
3794
|
+
: entry.row;
|
|
3795
|
+
const absoluteIndex = entry.absoluteIndex;
|
|
3796
|
+
const enriched: Record<string, unknown> =
|
|
3797
|
+
cloneCsvAliasedRow(row);
|
|
3798
|
+
const fieldOutputs: Record<string, unknown> = {};
|
|
3799
|
+
const cellMetaPatch: Record<
|
|
3800
|
+
string,
|
|
3801
|
+
{
|
|
3802
|
+
status: 'cached' | 'skipped' | 'completed';
|
|
3803
|
+
stage?: string | null;
|
|
3804
|
+
reused?: boolean;
|
|
3805
|
+
runId?: string;
|
|
3806
|
+
completedAt?: number;
|
|
3807
|
+
}
|
|
3808
|
+
> = {};
|
|
3809
|
+
const waterfallOutputs: RecordedWaterfallOutput[] = [];
|
|
3810
|
+
const stepProgramOutputs: RecordedStepProgramOutput[] = [];
|
|
3811
|
+
const rowCtx = {
|
|
3812
|
+
...(ctx as Record<string, unknown>),
|
|
3813
|
+
tools: {
|
|
3814
|
+
...((ctx as { tools?: Record<string, unknown> }).tools ??
|
|
3815
|
+
{}),
|
|
3816
|
+
execute: async (requestArg: unknown): Promise<unknown> => {
|
|
3817
|
+
assertNotAborted(abortSignal);
|
|
3818
|
+
const request = normalizeToolExecuteArgs(requestArg);
|
|
3819
|
+
return await toolBatchScheduler.execute(
|
|
3820
|
+
request.id,
|
|
3821
|
+
request.toolId,
|
|
3822
|
+
request.input,
|
|
3823
|
+
workflowStep,
|
|
3824
|
+
);
|
|
3825
|
+
},
|
|
3643
3826
|
},
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
|
|
3663
|
-
|
|
3664
|
-
|
|
3665
|
-
|
|
3666
|
-
|
|
3667
|
-
|
|
3668
|
-
|
|
3669
|
-
|
|
3670
|
-
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
|
|
3675
|
-
|
|
3676
|
-
|
|
3677
|
-
|
|
3678
|
-
|
|
3679
|
-
|
|
3680
|
-
|
|
3681
|
-
|
|
3682
|
-
|
|
3683
|
-
|
|
3684
|
-
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
|
|
3688
|
-
|
|
3689
|
-
|
|
3827
|
+
waterfall: (
|
|
3828
|
+
toolNameOrSpec: string | WorkerInlineWaterfallSpec,
|
|
3829
|
+
waterfallInput: Record<string, unknown>,
|
|
3830
|
+
waterfallOpts?: WorkerWaterfallOptions,
|
|
3831
|
+
) =>
|
|
3832
|
+
executeWorkerWaterfall(
|
|
3833
|
+
req,
|
|
3834
|
+
waterfallOutputs,
|
|
3835
|
+
toolNameOrSpec,
|
|
3836
|
+
waterfallInput,
|
|
3837
|
+
waterfallOpts,
|
|
3838
|
+
callbacks,
|
|
3839
|
+
workflowStep,
|
|
3840
|
+
),
|
|
3841
|
+
};
|
|
3842
|
+
for (const [key, value] of fieldEntries) {
|
|
3843
|
+
const rawCellMeta =
|
|
3844
|
+
enriched[DEEPLINE_CELL_META_FIELD] &&
|
|
3845
|
+
typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
|
|
3846
|
+
? (
|
|
3847
|
+
enriched[DEEPLINE_CELL_META_FIELD] as Record<
|
|
3848
|
+
string,
|
|
3849
|
+
unknown
|
|
3850
|
+
>
|
|
3851
|
+
)[key]
|
|
3852
|
+
: null;
|
|
3853
|
+
const reuseDecision = shouldRecomputeCell({
|
|
3854
|
+
hasValue: isCompletedWorkerFieldValue(enriched[key]),
|
|
3855
|
+
meta:
|
|
3856
|
+
rawCellMeta && typeof rawCellMeta === 'object'
|
|
3857
|
+
? (rawCellMeta as {
|
|
3858
|
+
status?: string;
|
|
3859
|
+
completedAt?: number;
|
|
3860
|
+
})
|
|
3861
|
+
: null,
|
|
3862
|
+
policy: cellPolicies?.[key],
|
|
3863
|
+
});
|
|
3864
|
+
if (reuseDecision.action === 'reuse') {
|
|
3865
|
+
cellMetaPatch[key] = {
|
|
3866
|
+
status: 'cached',
|
|
3867
|
+
stage: key,
|
|
3868
|
+
reused: true,
|
|
3869
|
+
runId: req.runId,
|
|
3870
|
+
};
|
|
3871
|
+
continue;
|
|
3872
|
+
}
|
|
3873
|
+
const resolved = await executeWorkerStepResolver(
|
|
3874
|
+
value,
|
|
3875
|
+
enriched,
|
|
3876
|
+
rowCtx,
|
|
3877
|
+
absoluteIndex,
|
|
3878
|
+
isWorkerStepProgram(value)
|
|
3879
|
+
? {
|
|
3880
|
+
parentField: key,
|
|
3881
|
+
path: [],
|
|
3882
|
+
outputs: stepProgramOutputs,
|
|
3883
|
+
}
|
|
3884
|
+
: undefined,
|
|
3885
|
+
);
|
|
3886
|
+
enriched[key] = resolved.value;
|
|
3887
|
+
fieldOutputs[key] = resolved.value;
|
|
3888
|
+
if (resolved.status === 'skipped') {
|
|
3889
|
+
cellMetaPatch[key] = {
|
|
3890
|
+
status: 'skipped',
|
|
3891
|
+
stage: key,
|
|
3892
|
+
runId: req.runId,
|
|
3893
|
+
};
|
|
3894
|
+
} else {
|
|
3895
|
+
cellMetaPatch[key] = {
|
|
3896
|
+
status: 'completed',
|
|
3897
|
+
stage: key,
|
|
3898
|
+
runId: req.runId,
|
|
3899
|
+
completedAt: nowMs(),
|
|
3900
|
+
};
|
|
3901
|
+
}
|
|
3690
3902
|
}
|
|
3691
|
-
const
|
|
3692
|
-
value
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
: undefined,
|
|
3703
|
-
);
|
|
3704
|
-
enriched[key] = resolved.value;
|
|
3705
|
-
fieldOutputs[key] = resolved.value;
|
|
3706
|
-
if (resolved.status === 'skipped') {
|
|
3707
|
-
cellMetaPatch[key] = {
|
|
3708
|
-
status: 'skipped',
|
|
3709
|
-
stage: key,
|
|
3710
|
-
runId: req.runId,
|
|
3711
|
-
};
|
|
3712
|
-
} else {
|
|
3713
|
-
cellMetaPatch[key] = {
|
|
3714
|
-
status: 'completed',
|
|
3715
|
-
stage: key,
|
|
3716
|
-
runId: req.runId,
|
|
3717
|
-
completedAt: nowMs(),
|
|
3718
|
-
};
|
|
3903
|
+
for (const stepOutput of stepProgramOutputs) {
|
|
3904
|
+
enriched[stepOutput.columnName] = stepOutput.value;
|
|
3905
|
+
fieldOutputs[stepOutput.columnName] = stepOutput.value;
|
|
3906
|
+
generatedOutputFields.add(stepOutput.columnName);
|
|
3907
|
+
if (stepOutput.status === 'skipped') {
|
|
3908
|
+
cellMetaPatch[stepOutput.columnName] = {
|
|
3909
|
+
status: 'skipped',
|
|
3910
|
+
stage: stepOutput.stepId,
|
|
3911
|
+
runId: req.runId,
|
|
3912
|
+
};
|
|
3913
|
+
}
|
|
3719
3914
|
}
|
|
3720
|
-
|
|
3721
|
-
|
|
3722
|
-
|
|
3723
|
-
|
|
3724
|
-
|
|
3725
|
-
|
|
3726
|
-
cellMetaPatch[stepOutput.columnName] = {
|
|
3727
|
-
status: 'skipped',
|
|
3728
|
-
stage: stepOutput.stepId,
|
|
3729
|
-
runId: req.runId,
|
|
3730
|
-
};
|
|
3915
|
+
for (const waterfallOutput of waterfallOutputs) {
|
|
3916
|
+
const columnName =
|
|
3917
|
+
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
3918
|
+
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
3919
|
+
enriched[columnName] = waterfallOutput.value;
|
|
3920
|
+
generatedOutputFields.add(columnName);
|
|
3731
3921
|
}
|
|
3922
|
+
executedCellMetaPatches[myIndex] =
|
|
3923
|
+
Object.keys(cellMetaPatch).length > 0
|
|
3924
|
+
? cellMetaPatch
|
|
3925
|
+
: undefined;
|
|
3926
|
+
executedRows[myIndex] = enriched as T & Record<string, unknown>;
|
|
3927
|
+
} finally {
|
|
3928
|
+
rowSlot.release();
|
|
3732
3929
|
}
|
|
3733
|
-
for (const waterfallOutput of waterfallOutputs) {
|
|
3734
|
-
const columnName =
|
|
3735
|
-
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
3736
|
-
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
3737
|
-
enriched[columnName] = waterfallOutput.value;
|
|
3738
|
-
generatedOutputFields.add(columnName);
|
|
3739
|
-
}
|
|
3740
|
-
executedCellMetaPatches[myIndex] =
|
|
3741
|
-
Object.keys(cellMetaPatch).length > 0
|
|
3742
|
-
? cellMetaPatch
|
|
3743
|
-
: undefined;
|
|
3744
|
-
executedRows[myIndex] = enriched as T & Record<string, unknown>;
|
|
3745
3930
|
}
|
|
3746
3931
|
})(),
|
|
3747
3932
|
);
|
|
@@ -4410,33 +4595,20 @@ function createMinimalWorkerCtx(
|
|
|
4410
4595
|
childPlayName: resolvedName,
|
|
4411
4596
|
input,
|
|
4412
4597
|
})}${staleRuntimeSuffix(options?.staleAfterSeconds)}`;
|
|
4413
|
-
if (ancestryPlayIds.includes(resolvedName)) {
|
|
4414
|
-
const chain = [...ancestryPlayIds, resolvedName].join(' -> ');
|
|
4415
|
-
throw new Error(`Recursive play graph detected: ${chain}`);
|
|
4416
|
-
}
|
|
4417
|
-
const nextDepth = callDepth + 1;
|
|
4418
|
-
if (nextDepth > WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth) {
|
|
4419
|
-
throw new Error(
|
|
4420
|
-
`Play-call depth exceeded (${nextDepth}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth}) while calling ${resolvedName}.`,
|
|
4421
|
-
);
|
|
4422
|
-
}
|
|
4423
|
-
const nextPlayCallCount = playCallCount + 1;
|
|
4424
|
-
if (nextPlayCallCount > WORKER_PLAY_CALL_LIMITS.maxPlayCallCount) {
|
|
4425
|
-
throw new Error(
|
|
4426
|
-
`Root play-call budget exceeded (${nextPlayCallCount}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallCount}).`,
|
|
4427
|
-
);
|
|
4428
|
-
}
|
|
4429
|
-
const nextParentCalls = (parentChildCalls[req.playName] ?? 0) + 1;
|
|
4430
|
-
if (
|
|
4431
|
-
nextParentCalls > WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent
|
|
4432
|
-
) {
|
|
4433
|
-
throw new Error(
|
|
4434
|
-
`Child play-call cap exceeded for ${req.playName} (${nextParentCalls}/${WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent}).`,
|
|
4435
|
-
);
|
|
4436
|
-
}
|
|
4437
4598
|
return await executeWithRuntimeReceipt(receiptKey, async () => {
|
|
4438
|
-
|
|
4439
|
-
|
|
4599
|
+
// The Governor owns the play-call lineage: forkChild does the cycle
|
|
4600
|
+
// guard, depth/per-parent/playCall/descendant budget charges, and
|
|
4601
|
+
// returns the snapshot to thread into the child so budgets accumulate
|
|
4602
|
+
// across isolates. Charged inside the receipt boundary so a replay
|
|
4603
|
+
// (cache hit) never double-charges.
|
|
4604
|
+
const childRunId = `${req.runId}:child:${normalizedKey}`;
|
|
4605
|
+
const childGovernance = governor.forkChild({
|
|
4606
|
+
childPlayName: resolvedName,
|
|
4607
|
+
childRunId,
|
|
4608
|
+
});
|
|
4609
|
+
const nextDepth = childGovernance.callDepth;
|
|
4610
|
+
const nextParentCalls =
|
|
4611
|
+
governor.snapshot().parentChildCalls[req.playName] ?? 0;
|
|
4440
4612
|
|
|
4441
4613
|
emitEvent({
|
|
4442
4614
|
type: 'log',
|
|
@@ -4456,31 +4628,47 @@ function createMinimalWorkerCtx(
|
|
|
4456
4628
|
const childNeedsWorkflowScheduler = childPipelineNeedsWorkflowScheduler(
|
|
4457
4629
|
childManifest.staticPipeline,
|
|
4458
4630
|
);
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4466
|
-
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4470
|
-
|
|
4471
|
-
|
|
4472
|
-
|
|
4473
|
-
|
|
4631
|
+
console.info('[play.runtime.span]', {
|
|
4632
|
+
event: 'play.runtime.span',
|
|
4633
|
+
phase: 'child_route',
|
|
4634
|
+
runId: req.runId,
|
|
4635
|
+
parentRunId: req.runId,
|
|
4636
|
+
playName: resolvedName,
|
|
4637
|
+
graphHash: req.graphHash ?? null,
|
|
4638
|
+
depth: nextDepth,
|
|
4639
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4640
|
+
childIsDatasetBacked,
|
|
4641
|
+
childNeedsWorkflowScheduler,
|
|
4642
|
+
hasStaticPipeline: Boolean(childManifest.staticPipeline),
|
|
4643
|
+
childTableNamespace:
|
|
4644
|
+
typeof childManifest.staticPipeline?.tableNamespace === 'string'
|
|
4645
|
+
? childManifest.staticPipeline.tableNamespace
|
|
4646
|
+
: null,
|
|
4647
|
+
childStageCount: Array.isArray(childManifest.staticPipeline?.stages)
|
|
4648
|
+
? childManifest.staticPipeline.stages.length
|
|
4649
|
+
: null,
|
|
4650
|
+
childSubstepCount: Array.isArray(
|
|
4651
|
+
childManifest.staticPipeline?.substeps,
|
|
4652
|
+
)
|
|
4653
|
+
? childManifest.staticPipeline.substeps.length
|
|
4654
|
+
: null,
|
|
4655
|
+
});
|
|
4656
|
+
let childPlaySlot: { release(): void } | null = null;
|
|
4474
4657
|
try {
|
|
4475
|
-
|
|
4658
|
+
childPlaySlot = await governor.acquireChildPlaySlot({
|
|
4659
|
+
signal: abortSignal,
|
|
4660
|
+
});
|
|
4476
4661
|
const childSubmitStartedAt = nowMs();
|
|
4477
4662
|
let started: {
|
|
4478
4663
|
workflowId?: string;
|
|
4479
4664
|
runId?: string;
|
|
4480
4665
|
status?: string;
|
|
4666
|
+
mode?: string;
|
|
4481
4667
|
output?: unknown;
|
|
4482
4668
|
result?: unknown;
|
|
4483
4669
|
error?: unknown;
|
|
4670
|
+
logs?: string[];
|
|
4671
|
+
timings?: Array<{ phase: string; ms: number }>;
|
|
4484
4672
|
};
|
|
4485
4673
|
try {
|
|
4486
4674
|
started = await submitChildPlayThroughCoordinator({
|
|
@@ -4507,6 +4695,17 @@ function createMinimalWorkerCtx(
|
|
|
4507
4695
|
// executor token's play name (the parent making this call).
|
|
4508
4696
|
ancestryPlayIds,
|
|
4509
4697
|
callDepth: nextDepth,
|
|
4698
|
+
// Cumulative lineage-global budget counters (incl. this
|
|
4699
|
+
// launch's play/descendant charges) so the child seeds its
|
|
4700
|
+
// budgets from the lineage total instead of resetting to 0 in
|
|
4701
|
+
// its isolate. Threading descendantCount in particular keeps
|
|
4702
|
+
// fan-out descendant accounting lineage-global, matching cjs.
|
|
4703
|
+
playCallCount: childGovernance.playCallCount,
|
|
4704
|
+
toolCallCount: childGovernance.toolCallCount,
|
|
4705
|
+
retryCount: childGovernance.retryCount,
|
|
4706
|
+
descendantCount: childGovernance.descendantCount,
|
|
4707
|
+
waterfallStepExecutions:
|
|
4708
|
+
childGovernance.waterfallStepExecutions,
|
|
4510
4709
|
description:
|
|
4511
4710
|
typeof options?.description === 'string'
|
|
4512
4711
|
? options.description
|
|
@@ -4528,6 +4727,21 @@ function createMinimalWorkerCtx(
|
|
|
4528
4727
|
status: 'failed',
|
|
4529
4728
|
errorCode: 'CHILD_SUBMIT_FAILED',
|
|
4530
4729
|
});
|
|
4730
|
+
recordRunnerPerfTrace({
|
|
4731
|
+
req,
|
|
4732
|
+
phase: 'ctx_run_play.child_submit',
|
|
4733
|
+
ms: nowMs() - childSubmitStartedAt,
|
|
4734
|
+
extra: {
|
|
4735
|
+
status: 'failed',
|
|
4736
|
+
errorCode: 'CHILD_SUBMIT_FAILED',
|
|
4737
|
+
playName: resolvedName,
|
|
4738
|
+
key: normalizedKey,
|
|
4739
|
+
depth: nextDepth,
|
|
4740
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4741
|
+
childIsDatasetBacked,
|
|
4742
|
+
childNeedsWorkflowScheduler,
|
|
4743
|
+
},
|
|
4744
|
+
});
|
|
4531
4745
|
throw error;
|
|
4532
4746
|
}
|
|
4533
4747
|
const workflowId = started.workflowId ?? started.runId;
|
|
@@ -4558,6 +4772,26 @@ function createMinimalWorkerCtx(
|
|
|
4558
4772
|
ms: nowMs() - childSubmitStartedAt,
|
|
4559
4773
|
status: 'ok',
|
|
4560
4774
|
});
|
|
4775
|
+
recordRunnerPerfTrace({
|
|
4776
|
+
req,
|
|
4777
|
+
phase: 'ctx_run_play.child_submit',
|
|
4778
|
+
ms: nowMs() - childSubmitStartedAt,
|
|
4779
|
+
extra: {
|
|
4780
|
+
status: 'ok',
|
|
4781
|
+
childRunId: workflowId,
|
|
4782
|
+
startedStatus: started.status ?? null,
|
|
4783
|
+
mode: started.mode ?? null,
|
|
4784
|
+
coordinatorTimings: Array.isArray(started.timings)
|
|
4785
|
+
? started.timings
|
|
4786
|
+
: null,
|
|
4787
|
+
playName: resolvedName,
|
|
4788
|
+
key: normalizedKey,
|
|
4789
|
+
depth: nextDepth,
|
|
4790
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4791
|
+
childIsDatasetBacked,
|
|
4792
|
+
childNeedsWorkflowScheduler,
|
|
4793
|
+
},
|
|
4794
|
+
});
|
|
4561
4795
|
const startedStatus = String(started.status ?? '').toLowerCase();
|
|
4562
4796
|
if (startedStatus === 'completed') {
|
|
4563
4797
|
emitEvent({
|
|
@@ -4580,11 +4814,16 @@ function createMinimalWorkerCtx(
|
|
|
4580
4814
|
throw new Error(startedErrorMessage);
|
|
4581
4815
|
}
|
|
4582
4816
|
const childWaitStartedAt = nowMs();
|
|
4583
|
-
let
|
|
4817
|
+
let waitResult: ChildPlayTerminalWaitResult;
|
|
4584
4818
|
try {
|
|
4585
|
-
|
|
4586
|
-
req,
|
|
4587
|
-
|
|
4819
|
+
waitResult = await awaitChildTerminal({
|
|
4820
|
+
parentRunId: req.runId,
|
|
4821
|
+
// CF's WorkflowStep.waitForEvent generic signature is wider than
|
|
4822
|
+
// the small structural shape ChildPlayAwait needs; bridge it the
|
|
4823
|
+
// same way the inline implementation did.
|
|
4824
|
+
workflowStep: workflowStep as unknown as
|
|
4825
|
+
| WorkflowStepLike
|
|
4826
|
+
| undefined,
|
|
4588
4827
|
workflowId,
|
|
4589
4828
|
playName: resolvedName,
|
|
4590
4829
|
key: normalizedKey,
|
|
@@ -4592,6 +4831,22 @@ function createMinimalWorkerCtx(
|
|
|
4592
4831
|
1_000,
|
|
4593
4832
|
Math.min(options?.timeoutMs ?? 5 * 60_000, 30 * 60_000),
|
|
4594
4833
|
),
|
|
4834
|
+
coordinator: cachedCoordinatorBinding?.readChildTerminalState
|
|
4835
|
+
? {
|
|
4836
|
+
readChildTerminalState: (
|
|
4837
|
+
parentRunId,
|
|
4838
|
+
eventKey,
|
|
4839
|
+
timeoutMs,
|
|
4840
|
+
) =>
|
|
4841
|
+
cachedCoordinatorBinding!.readChildTerminalState!(
|
|
4842
|
+
parentRunId,
|
|
4843
|
+
eventKey,
|
|
4844
|
+
timeoutMs,
|
|
4845
|
+
),
|
|
4846
|
+
}
|
|
4847
|
+
: null,
|
|
4848
|
+
now: nowMs,
|
|
4849
|
+
hashJson,
|
|
4595
4850
|
});
|
|
4596
4851
|
} catch (error) {
|
|
4597
4852
|
console.info('[play.runtime.span]', {
|
|
@@ -4608,6 +4863,22 @@ function createMinimalWorkerCtx(
|
|
|
4608
4863
|
status: 'failed',
|
|
4609
4864
|
errorCode: 'CHILD_WAIT_FAILED',
|
|
4610
4865
|
});
|
|
4866
|
+
recordRunnerPerfTrace({
|
|
4867
|
+
req,
|
|
4868
|
+
phase: 'ctx_run_play.child_wait',
|
|
4869
|
+
ms: nowMs() - childWaitStartedAt,
|
|
4870
|
+
extra: {
|
|
4871
|
+
status: 'failed',
|
|
4872
|
+
errorCode: 'CHILD_WAIT_FAILED',
|
|
4873
|
+
childRunId: workflowId,
|
|
4874
|
+
playName: resolvedName,
|
|
4875
|
+
key: normalizedKey,
|
|
4876
|
+
depth: nextDepth,
|
|
4877
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4878
|
+
childIsDatasetBacked,
|
|
4879
|
+
childNeedsWorkflowScheduler,
|
|
4880
|
+
},
|
|
4881
|
+
});
|
|
4611
4882
|
throw error;
|
|
4612
4883
|
}
|
|
4613
4884
|
console.info('[play.runtime.span]', {
|
|
@@ -4622,6 +4893,27 @@ function createMinimalWorkerCtx(
|
|
|
4622
4893
|
fanoutIndex: nextParentCalls - 1,
|
|
4623
4894
|
ms: nowMs() - childWaitStartedAt,
|
|
4624
4895
|
status: 'ok',
|
|
4896
|
+
waitSource: waitResult.source,
|
|
4897
|
+
waitAttempts: waitResult.attempts ?? null,
|
|
4898
|
+
reportedWaitMs: waitResult.waitMs,
|
|
4899
|
+
});
|
|
4900
|
+
recordRunnerPerfTrace({
|
|
4901
|
+
req,
|
|
4902
|
+
phase: 'ctx_run_play.child_wait',
|
|
4903
|
+
ms: nowMs() - childWaitStartedAt,
|
|
4904
|
+
extra: {
|
|
4905
|
+
status: 'ok',
|
|
4906
|
+
childRunId: workflowId,
|
|
4907
|
+
playName: resolvedName,
|
|
4908
|
+
key: normalizedKey,
|
|
4909
|
+
depth: nextDepth,
|
|
4910
|
+
fanoutIndex: nextParentCalls - 1,
|
|
4911
|
+
childIsDatasetBacked,
|
|
4912
|
+
childNeedsWorkflowScheduler,
|
|
4913
|
+
waitSource: waitResult.source,
|
|
4914
|
+
waitAttempts: waitResult.attempts ?? null,
|
|
4915
|
+
reportedWaitMs: waitResult.waitMs,
|
|
4916
|
+
},
|
|
4625
4917
|
});
|
|
4626
4918
|
emitEvent({
|
|
4627
4919
|
type: 'log',
|
|
@@ -4629,15 +4921,9 @@ function createMinimalWorkerCtx(
|
|
|
4629
4921
|
message: `Completed child play ${resolvedName} (${normalizedKey})`,
|
|
4630
4922
|
ts: nowMs(),
|
|
4631
4923
|
});
|
|
4632
|
-
return
|
|
4924
|
+
return waitResult.output;
|
|
4633
4925
|
} finally {
|
|
4634
|
-
|
|
4635
|
-
if (childConcurrencyAcquired) {
|
|
4636
|
-
releaseChildPlayConcurrency(
|
|
4637
|
-
inFlightChildCallsByPlayName,
|
|
4638
|
-
resolvedName,
|
|
4639
|
-
);
|
|
4640
|
-
}
|
|
4926
|
+
childPlaySlot?.release();
|
|
4641
4927
|
}
|
|
4642
4928
|
});
|
|
4643
4929
|
},
|
|
@@ -4813,6 +5099,135 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
|
|
|
4813
5099
|
});
|
|
4814
5100
|
}
|
|
4815
5101
|
|
|
5102
|
+
async function handleRunInline(
|
|
5103
|
+
request: Request,
|
|
5104
|
+
env: WorkerEnv,
|
|
5105
|
+
): Promise<Response> {
|
|
5106
|
+
let req: RunRequest;
|
|
5107
|
+
try {
|
|
5108
|
+
req = (await request.json()) as RunRequest;
|
|
5109
|
+
} catch {
|
|
5110
|
+
return Response.json(
|
|
5111
|
+
{
|
|
5112
|
+
status: 'failed',
|
|
5113
|
+
error: { message: 'invalid JSON body' },
|
|
5114
|
+
},
|
|
5115
|
+
{ status: 400 },
|
|
5116
|
+
);
|
|
5117
|
+
}
|
|
5118
|
+
|
|
5119
|
+
const events: RunnerEvent[] = [];
|
|
5120
|
+
const timings: InlineRunTiming[] = [];
|
|
5121
|
+
const traceInline = (
|
|
5122
|
+
phase: string,
|
|
5123
|
+
phaseStartedAt: number,
|
|
5124
|
+
extra?: Record<string, unknown>,
|
|
5125
|
+
): void => {
|
|
5126
|
+
timings.push({
|
|
5127
|
+
phase,
|
|
5128
|
+
ms: nowMs() - phaseStartedAt,
|
|
5129
|
+
...(extra ? { extra } : {}),
|
|
5130
|
+
});
|
|
5131
|
+
};
|
|
5132
|
+
const inlineStartedAt = nowMs();
|
|
5133
|
+
try {
|
|
5134
|
+
const runPrefix = `[deepline-run:${req.runId}]`;
|
|
5135
|
+
captureCoordinatorBinding(env);
|
|
5136
|
+
captureRuntimeApiBinding(env);
|
|
5137
|
+
captureHarnessBinding(env);
|
|
5138
|
+
const probeStartedAt = nowMs();
|
|
5139
|
+
await probeHarnessOnce(env, runPrefix);
|
|
5140
|
+
traceInline('inline.probe_harness', probeStartedAt);
|
|
5141
|
+
if (!req.inlineChildRunRegistered) {
|
|
5142
|
+
const registerStartedAt = nowMs();
|
|
5143
|
+
await registerInlineChildRun(req);
|
|
5144
|
+
traceInline('inline.register_child_run', registerStartedAt);
|
|
5145
|
+
} else {
|
|
5146
|
+
traceInline('inline.register_child_run', nowMs(), { skipped: true });
|
|
5147
|
+
}
|
|
5148
|
+
const executeStartedAt = nowMs();
|
|
5149
|
+
const output = await executeRunRequest(
|
|
5150
|
+
req,
|
|
5151
|
+
env,
|
|
5152
|
+
(event) => {
|
|
5153
|
+
events.push(event);
|
|
5154
|
+
},
|
|
5155
|
+
undefined,
|
|
5156
|
+
{
|
|
5157
|
+
persistResultDatasets: true,
|
|
5158
|
+
},
|
|
5159
|
+
);
|
|
5160
|
+
traceInline('inline.execute_run_request', executeStartedAt, {
|
|
5161
|
+
durationMs: output.durationMs,
|
|
5162
|
+
outputRows: output.outputRows,
|
|
5163
|
+
});
|
|
5164
|
+
traceInline('inline.total', inlineStartedAt);
|
|
5165
|
+
return Response.json({
|
|
5166
|
+
status: 'completed',
|
|
5167
|
+
result: output.result,
|
|
5168
|
+
outputRows: output.outputRows,
|
|
5169
|
+
durationMs: output.durationMs,
|
|
5170
|
+
events,
|
|
5171
|
+
timings,
|
|
5172
|
+
});
|
|
5173
|
+
} catch (error) {
|
|
5174
|
+
const err = error as Error;
|
|
5175
|
+
events.push({
|
|
5176
|
+
type: 'error',
|
|
5177
|
+
message: err.message ?? String(err),
|
|
5178
|
+
stack: err.stack,
|
|
5179
|
+
ts: nowMs(),
|
|
5180
|
+
});
|
|
5181
|
+
return Response.json({
|
|
5182
|
+
status: 'failed',
|
|
5183
|
+
error: {
|
|
5184
|
+
message: err.message ?? String(err),
|
|
5185
|
+
stack: err.stack,
|
|
5186
|
+
},
|
|
5187
|
+
events,
|
|
5188
|
+
timings,
|
|
5189
|
+
});
|
|
5190
|
+
}
|
|
5191
|
+
}
|
|
5192
|
+
|
|
5193
|
+
async function registerInlineChildRun(req: RunRequest): Promise<void> {
|
|
5194
|
+
const snapshot = isRecord(req.contractSnapshot) ? req.contractSnapshot : {};
|
|
5195
|
+
const artifactMetadata = isRecord(snapshot.artifactMetadata)
|
|
5196
|
+
? snapshot.artifactMetadata
|
|
5197
|
+
: {};
|
|
5198
|
+
const governance = req.playCallGovernance;
|
|
5199
|
+
await postRuntimeApi(req.baseUrl, req.executorToken, {
|
|
5200
|
+
action: 'start_inline_child_run',
|
|
5201
|
+
playName: req.playName,
|
|
5202
|
+
runId: req.runId,
|
|
5203
|
+
workflowFamilyKey:
|
|
5204
|
+
governance?.rootRunId ?? governance?.parentRunId ?? req.runId,
|
|
5205
|
+
artifactStorageKey:
|
|
5206
|
+
typeof artifactMetadata.storageKey === 'string'
|
|
5207
|
+
? artifactMetadata.storageKey
|
|
5208
|
+
: undefined,
|
|
5209
|
+
artifactHash:
|
|
5210
|
+
typeof artifactMetadata.artifactHash === 'string'
|
|
5211
|
+
? artifactMetadata.artifactHash
|
|
5212
|
+
: undefined,
|
|
5213
|
+
graphHash:
|
|
5214
|
+
typeof artifactMetadata.graphHash === 'string'
|
|
5215
|
+
? artifactMetadata.graphHash
|
|
5216
|
+
: undefined,
|
|
5217
|
+
runtimeBackend: 'workers_edge',
|
|
5218
|
+
schedulerBackend: 'inline_child',
|
|
5219
|
+
executionProfile: 'workers_edge',
|
|
5220
|
+
maxCreditsPerRun: extractMaxCreditsPerRun(req.contractSnapshot),
|
|
5221
|
+
staticPipeline: snapshot.staticPipeline ?? null,
|
|
5222
|
+
source:
|
|
5223
|
+
snapshot.source === 'published' ||
|
|
5224
|
+
snapshot.source === 'ad_hoc' ||
|
|
5225
|
+
snapshot.source === 'draft'
|
|
5226
|
+
? snapshot.source
|
|
5227
|
+
: 'published',
|
|
5228
|
+
});
|
|
5229
|
+
}
|
|
5230
|
+
|
|
4816
5231
|
/** Cap on run log lines retained in the terminal output compatibility shape. */
|
|
4817
5232
|
const RUN_LOG_BUFFER_LIMIT = 500;
|
|
4818
5233
|
/** Min wall-clock interval between live run-ledger flushes during a run. */
|
|
@@ -4890,6 +5305,100 @@ async function executeRunRequest(
|
|
|
4890
5305
|
|
|
4891
5306
|
const stepProgressSnapshot = () => ({ ...stepProgressByNodeId });
|
|
4892
5307
|
|
|
5308
|
+
const publishCoordinatorProgressEvent = async (
|
|
5309
|
+
occurredAt: number,
|
|
5310
|
+
): Promise<void> => {
|
|
5311
|
+
const coordinatorUrl = req.coordinatorUrl?.trim();
|
|
5312
|
+
if (!coordinatorUrl) {
|
|
5313
|
+
recordRunnerPerfTrace({
|
|
5314
|
+
req,
|
|
5315
|
+
phase: 'runner.coordinator_progress_publish',
|
|
5316
|
+
ms: 0,
|
|
5317
|
+
extra: { status: 'skipped_no_url' },
|
|
5318
|
+
});
|
|
5319
|
+
return;
|
|
5320
|
+
}
|
|
5321
|
+
const publishStartedAt = nowMs();
|
|
5322
|
+
const liveNodeProgress = stepProgressSnapshot();
|
|
5323
|
+
const activeEntry =
|
|
5324
|
+
Object.entries(liveNodeProgress).find(
|
|
5325
|
+
([, progress]) => typeof progress.completedAt !== 'number',
|
|
5326
|
+
) ?? Object.entries(liveNodeProgress).at(-1);
|
|
5327
|
+
const activeNodeId = activeEntry?.[0] ?? null;
|
|
5328
|
+
const activeProgress = activeEntry?.[1] ?? null;
|
|
5329
|
+
const activeArtifactTableNamespace =
|
|
5330
|
+
typeof activeProgress?.artifactTableNamespace === 'string'
|
|
5331
|
+
? activeProgress.artifactTableNamespace
|
|
5332
|
+
: null;
|
|
5333
|
+
const activeCompleted =
|
|
5334
|
+
typeof activeProgress?.completed === 'number'
|
|
5335
|
+
? activeProgress.completed
|
|
5336
|
+
: null;
|
|
5337
|
+
const activeTotal =
|
|
5338
|
+
typeof activeProgress?.total === 'number' ? activeProgress.total : null;
|
|
5339
|
+
const activeMessage =
|
|
5340
|
+
typeof activeProgress?.message === 'string'
|
|
5341
|
+
? activeProgress.message
|
|
5342
|
+
: null;
|
|
5343
|
+
const response = await fetch(
|
|
5344
|
+
`${coordinatorUrl.replace(/\/$/, '')}/dedup/${encodeURIComponent(
|
|
5345
|
+
req.runId,
|
|
5346
|
+
)}/event-add`,
|
|
5347
|
+
{
|
|
5348
|
+
method: 'POST',
|
|
5349
|
+
headers: {
|
|
5350
|
+
'x-deepline-request-id': makeRequestId(),
|
|
5351
|
+
...coordinatorRequestHeaders({
|
|
5352
|
+
runId: req.runId,
|
|
5353
|
+
contentType: 'application/json',
|
|
5354
|
+
internalToken: req.coordinatorInternalToken,
|
|
5355
|
+
}),
|
|
5356
|
+
},
|
|
5357
|
+
body: JSON.stringify({
|
|
5358
|
+
runId: req.runId,
|
|
5359
|
+
type: 'progress',
|
|
5360
|
+
status: 'running',
|
|
5361
|
+
ts: occurredAt,
|
|
5362
|
+
logs: runLogBuffer,
|
|
5363
|
+
activeNodeId,
|
|
5364
|
+
activeArtifactTableNamespace,
|
|
5365
|
+
updatedAt: occurredAt,
|
|
5366
|
+
liveNodeProgress,
|
|
5367
|
+
}),
|
|
5368
|
+
},
|
|
5369
|
+
);
|
|
5370
|
+
if (!response.ok) {
|
|
5371
|
+
recordRunnerPerfTrace({
|
|
5372
|
+
req,
|
|
5373
|
+
phase: 'runner.coordinator_progress_publish',
|
|
5374
|
+
ms: nowMs() - publishStartedAt,
|
|
5375
|
+
extra: {
|
|
5376
|
+
status: 'failed',
|
|
5377
|
+
httpStatus: response.status,
|
|
5378
|
+
activeNodeId,
|
|
5379
|
+
activeArtifactTableNamespace,
|
|
5380
|
+
activeCompleted,
|
|
5381
|
+
activeTotal,
|
|
5382
|
+
activeMessage,
|
|
5383
|
+
},
|
|
5384
|
+
});
|
|
5385
|
+
throw new Error(`coordinator progress event failed ${response.status}`);
|
|
5386
|
+
}
|
|
5387
|
+
recordRunnerPerfTrace({
|
|
5388
|
+
req,
|
|
5389
|
+
phase: 'runner.coordinator_progress_publish',
|
|
5390
|
+
ms: nowMs() - publishStartedAt,
|
|
5391
|
+
extra: {
|
|
5392
|
+
status: 'ok',
|
|
5393
|
+
activeNodeId,
|
|
5394
|
+
activeArtifactTableNamespace,
|
|
5395
|
+
activeCompleted,
|
|
5396
|
+
activeTotal,
|
|
5397
|
+
activeMessage,
|
|
5398
|
+
},
|
|
5399
|
+
});
|
|
5400
|
+
};
|
|
5401
|
+
|
|
4893
5402
|
const appendStepLifecycleEvent = (event: PlayStepLifecycleEvent) => {
|
|
4894
5403
|
updateStepProgress({
|
|
4895
5404
|
nodeId: event.nodeId,
|
|
@@ -4957,6 +5466,12 @@ async function executeRunRequest(
|
|
|
4957
5466
|
progress.artifactTableNamespace === null
|
|
4958
5467
|
? { artifactTableNamespace: progress.artifactTableNamespace }
|
|
4959
5468
|
: {}),
|
|
5469
|
+
...(typeof progress.startedAt === 'number'
|
|
5470
|
+
? { startedAt: progress.startedAt }
|
|
5471
|
+
: {}),
|
|
5472
|
+
...(typeof progress.completedAt === 'number'
|
|
5473
|
+
? { completedAt: progress.completedAt }
|
|
5474
|
+
: {}),
|
|
4960
5475
|
updatedAt:
|
|
4961
5476
|
typeof progress.updatedAt === 'number'
|
|
4962
5477
|
? progress.updatedAt
|
|
@@ -5005,6 +5520,7 @@ async function executeRunRequest(
|
|
|
5005
5520
|
pendingLedgerEvents = [...events, ...pendingLedgerEvents];
|
|
5006
5521
|
throw new Error('runtime run-ledger append failed');
|
|
5007
5522
|
}
|
|
5523
|
+
await publishCoordinatorProgressEvent(now).catch(() => undefined);
|
|
5008
5524
|
})
|
|
5009
5525
|
.catch(() => undefined);
|
|
5010
5526
|
};
|
|
@@ -5048,7 +5564,7 @@ async function executeRunRequest(
|
|
|
5048
5564
|
const workerCallbacks: WorkerCtxCallbacks = {
|
|
5049
5565
|
onNodeProgress: (input) => {
|
|
5050
5566
|
updateStepProgress(input);
|
|
5051
|
-
flushLedgerEvents(
|
|
5567
|
+
flushLedgerEvents(Boolean(input.forceFlush));
|
|
5052
5568
|
},
|
|
5053
5569
|
onMapStarted: (nodeId, at) => stepLifecycle?.onMapStarted(nodeId, at),
|
|
5054
5570
|
onMapCompleted: (nodeId, at) => stepLifecycle?.onMapCompleted(nodeId, at),
|
|
@@ -5081,6 +5597,20 @@ async function executeRunRequest(
|
|
|
5081
5597
|
abortSignal,
|
|
5082
5598
|
workerCallbacks,
|
|
5083
5599
|
);
|
|
5600
|
+
// Hard wall-clock cap on active user-code runtime. CF Workflows does not
|
|
5601
|
+
// impose a play-level execution ceiling on this substrate, so without this a
|
|
5602
|
+
// runaway play (infinite loop, stuck await) would only stop when the executor
|
|
5603
|
+
// token expires. Aborting the controller surfaces cooperatively through the
|
|
5604
|
+
// same assertNotAborted checks used for harness cancellation.
|
|
5605
|
+
let runtimeLimitExceeded = false;
|
|
5606
|
+
const runtimeDeadlineTimer = setTimeout(() => {
|
|
5607
|
+
runtimeLimitExceeded = true;
|
|
5608
|
+
if (!abortSignal.aborted) {
|
|
5609
|
+
abortController.abort(
|
|
5610
|
+
`Play runtime limit exceeded after ${STANDARD_PLAY_RUNTIME_LIMIT_SECONDS}s.`,
|
|
5611
|
+
);
|
|
5612
|
+
}
|
|
5613
|
+
}, STANDARD_PLAY_RUNTIME_LIMIT_SECONDS * 1000);
|
|
5084
5614
|
try {
|
|
5085
5615
|
const playStartedAt = nowMs();
|
|
5086
5616
|
const result = await (
|
|
@@ -5102,6 +5632,33 @@ async function executeRunRequest(
|
|
|
5102
5632
|
phase: 'runner.serialize_result',
|
|
5103
5633
|
ms: nowMs() - serializeStartedAt,
|
|
5104
5634
|
});
|
|
5635
|
+
const terminalResult = trimResultForStatus(serializedResult);
|
|
5636
|
+
let parentSignalPromise: Promise<void> | null = null;
|
|
5637
|
+
const startParentTerminalSignal = (): Promise<void> => {
|
|
5638
|
+
if (!parentSignalPromise) {
|
|
5639
|
+
const parentSignalStartedAt = nowMs();
|
|
5640
|
+
parentSignalPromise = signalParentPlayTerminal({
|
|
5641
|
+
req,
|
|
5642
|
+
status: 'completed',
|
|
5643
|
+
result: terminalResult as Record<string, unknown>,
|
|
5644
|
+
})
|
|
5645
|
+
.catch((error) => {
|
|
5646
|
+
console.error(
|
|
5647
|
+
`[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
|
|
5648
|
+
error instanceof Error ? error.message : String(error)
|
|
5649
|
+
}`,
|
|
5650
|
+
);
|
|
5651
|
+
})
|
|
5652
|
+
.finally(() => {
|
|
5653
|
+
recordRunnerPerfTrace({
|
|
5654
|
+
req,
|
|
5655
|
+
phase: 'runner.parent_terminal_signal',
|
|
5656
|
+
ms: nowMs() - parentSignalStartedAt,
|
|
5657
|
+
});
|
|
5658
|
+
});
|
|
5659
|
+
}
|
|
5660
|
+
return parentSignalPromise;
|
|
5661
|
+
};
|
|
5105
5662
|
if (options?.persistResultDatasets) {
|
|
5106
5663
|
const ledgerFlushWaitStartedAt = nowMs();
|
|
5107
5664
|
await ledgerFlushInFlight;
|
|
@@ -5117,7 +5674,7 @@ async function executeRunRequest(
|
|
|
5117
5674
|
phase: 'runner.persist_result_datasets',
|
|
5118
5675
|
ms: nowMs() - resultDatasetStartedAt,
|
|
5119
5676
|
});
|
|
5120
|
-
const
|
|
5677
|
+
const parentSignal = startParentTerminalSignal();
|
|
5121
5678
|
const terminalOccurredAt = nowMs();
|
|
5122
5679
|
const terminalUpdateStartedAt = nowMs();
|
|
5123
5680
|
await flushTerminalLedgerEvents({
|
|
@@ -5161,24 +5718,9 @@ async function executeRunRequest(
|
|
|
5161
5718
|
await nonBlockingBillingPromise;
|
|
5162
5719
|
}
|
|
5163
5720
|
}
|
|
5721
|
+
await parentSignal;
|
|
5164
5722
|
}
|
|
5165
|
-
|
|
5166
|
-
await signalParentPlayTerminal({
|
|
5167
|
-
req,
|
|
5168
|
-
status: 'completed',
|
|
5169
|
-
result: trimResultForStatus(serializedResult) as Record<string, unknown>,
|
|
5170
|
-
}).catch((error) => {
|
|
5171
|
-
console.error(
|
|
5172
|
-
`[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
|
|
5173
|
-
error instanceof Error ? error.message : String(error)
|
|
5174
|
-
}`,
|
|
5175
|
-
);
|
|
5176
|
-
});
|
|
5177
|
-
recordRunnerPerfTrace({
|
|
5178
|
-
req,
|
|
5179
|
-
phase: 'runner.parent_terminal_signal',
|
|
5180
|
-
ms: nowMs() - parentSignalStartedAt,
|
|
5181
|
-
});
|
|
5723
|
+
await startParentTerminalSignal();
|
|
5182
5724
|
recordRunnerPerfTrace({
|
|
5183
5725
|
req,
|
|
5184
5726
|
phase: 'runner.execute_total',
|
|
@@ -5194,7 +5736,10 @@ async function executeRunRequest(
|
|
|
5194
5736
|
};
|
|
5195
5737
|
} catch (error) {
|
|
5196
5738
|
stepLifecycle?.markStartedFailed(nowMs());
|
|
5197
|
-
|
|
5739
|
+
// A runtime-limit abort is a timeout failure, not a user cancellation, so
|
|
5740
|
+
// it should be reported as run.failed with the limit message rather than
|
|
5741
|
+
// run.cancelled.
|
|
5742
|
+
const aborted = isAbortLikeError(error) && !runtimeLimitExceeded;
|
|
5198
5743
|
if (aborted) {
|
|
5199
5744
|
// Flip the controller so any concurrent user code observes the abort
|
|
5200
5745
|
// through ctx.signal. We mark the run cancelled instead of failed.
|
|
@@ -5253,6 +5798,8 @@ async function executeRunRequest(
|
|
|
5253
5798
|
error: message,
|
|
5254
5799
|
}).catch(() => null);
|
|
5255
5800
|
throw error;
|
|
5801
|
+
} finally {
|
|
5802
|
+
clearTimeout(runtimeDeadlineTimer);
|
|
5256
5803
|
}
|
|
5257
5804
|
}
|
|
5258
5805
|
|
|
@@ -5851,6 +6398,9 @@ const workerEntrypoint = {
|
|
|
5851
6398
|
},
|
|
5852
6399
|
});
|
|
5853
6400
|
}
|
|
6401
|
+
if (request.method === 'POST' && url.pathname === '/run-inline') {
|
|
6402
|
+
return handleRunInline(request, env);
|
|
6403
|
+
}
|
|
5854
6404
|
if (request.method === 'POST' && url.pathname === '/run') {
|
|
5855
6405
|
return handleRun(request, env);
|
|
5856
6406
|
}
|