deepline 0.1.79 → 0.1.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. package/README.md +2 -1
  2. package/dist/cli/index.js +76 -42
  3. package/dist/cli/index.mjs +76 -42
  4. package/dist/index.d.mts +9 -1
  5. package/dist/index.d.ts +9 -1
  6. package/dist/index.js +13 -10
  7. package/dist/index.mjs +13 -10
  8. package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
  9. package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1103 -1617
  10. package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +506 -654
  11. package/dist/repo/apps/play-runner-workers/src/entry.ts +1148 -598
  12. package/dist/repo/apps/play-runner-workers/src/runtime/tool-http-errors.ts +43 -1
  13. package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +8 -2
  14. package/dist/repo/sdk/src/client.ts +15 -8
  15. package/dist/repo/sdk/src/release.ts +2 -2
  16. package/dist/repo/sdk/src/types.ts +5 -0
  17. package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
  18. package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
  19. package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
  20. package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
  21. package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
  22. package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
  23. package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
  24. package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
  25. package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
  26. package/dist/repo/shared_libs/plays/static-pipeline.ts +56 -3
  27. package/dist/repo/shared_libs/temporal/constants.ts +38 -0
  28. package/package.json +1 -1
  29. package/dist/repo/shared_libs/play-runtime/tool-batch-executor.ts +0 -149
@@ -44,11 +44,23 @@ import {
44
44
  type ChunkExecutionResult,
45
45
  } from '../../../shared_libs/play-runtime/batch-runtime';
46
46
  import { getDefaultPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/default-batch-strategies';
47
- import type { AnyBatchOperationStrategy } from '../../../shared_libs/play-runtime/batching-types';
47
+ import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
48
+ import {
49
+ createPlayExecutionGovernor,
50
+ type GovernanceSnapshot,
51
+ type PlayExecutionGovernor,
52
+ } from '../../../shared_libs/play-runtime/governor/governor';
48
53
  import {
49
- createToolBatchExecutor,
50
- type ToolBatchRequest,
51
- } from '../../../shared_libs/play-runtime/tool-batch-executor';
54
+ CoordinatorRateStateBackend,
55
+ type CoordinatorRatePort,
56
+ } from '../../../shared_libs/play-runtime/governor/coordinator-rate-state-backend';
57
+ import type { PacingRule } from '../../../shared_libs/play-runtime/governor/rate-state-backend';
58
+ import {
59
+ awaitChildTerminal,
60
+ type ChildPlayTerminalWaitResult,
61
+ type WorkflowStepLike,
62
+ } from './child-play-await';
63
+ import type { AnyBatchOperationStrategy } from '../../../shared_libs/play-runtime/batching-types';
52
64
  import {
53
65
  adaptV2ExecuteResponseToToolResult,
54
66
  createToolExecuteResult,
@@ -121,7 +133,6 @@ import {
121
133
  import { createHarnessWorkerReceiptStore } from './runtime/harness-receipt-store';
122
134
  import {
123
135
  applyCsvRenameProjection,
124
- stripCsvProjectedFields,
125
136
  stripCsvProjectionMetadata,
126
137
  cloneCsvAliasedRow,
127
138
  type CsvRenameOptions,
@@ -146,7 +157,6 @@ import type {
146
157
  LiveNodeProgressSnapshot,
147
158
  } from './runtime/live-progress';
148
159
  import {
149
- ToolHttpError,
150
160
  extractErrorBilling,
151
161
  isHardBillingToolHttpError,
152
162
  normalizeToolHttpErrorMessage,
@@ -201,6 +211,8 @@ type RunRequest = {
201
211
  /** Internal ctx.runPlay lineage. Public SDK/users never see this. */
202
212
  playCallGovernance?: PlayCallGovernanceSnapshot | null;
203
213
  preloadedDbSessions?: PreloadedRuntimeDbSession[] | null;
214
+ /** Coordinator already created the child run row before invoking /run-inline. */
215
+ inlineChildRunRegistered?: boolean | null;
204
216
  /** Cloudflare coordinator URL for direct Workflow control-plane signals. */
205
217
  coordinatorUrl?: string | null;
206
218
  /** Request-scoped coordinator auth token for preview/dev direct control calls. */
@@ -253,11 +265,21 @@ type WorkerEnv = {
253
265
  * `/api/v2/plays/runtime-tools/*`) skip the public callback URL and route
254
266
  * directly through the coordinator's process to the configured app — saves
255
267
  * the *.workers.dev → CF edge → cloudflared → localhost chain on every
256
- * runtime callback. Absent on legacy coordinator deploys; the fetch
257
- * helpers fall back to `globalThis.fetch(req.baseUrl + path)`.
268
+ * runtime callback. Required for workers_edge; missing binding is an infra
269
+ * error instead of a transport fallback.
258
270
  */
259
271
  RUNTIME_API?: {
260
- fetch(input: Request): Promise<Response>;
272
+ runtimeApiCall(input: {
273
+ executorToken: string;
274
+ path: string;
275
+ body: unknown;
276
+ headers?: Record<string, string>;
277
+ timeoutMs?: number;
278
+ }): Promise<{
279
+ status: number;
280
+ headers?: Record<string, string>;
281
+ body: string;
282
+ }>;
261
283
  };
262
284
  /**
263
285
  * Loopback RPC binding into the coordinator Worker. Used for CF-to-CF
@@ -280,6 +302,20 @@ type WorkerEnv = {
280
302
  logs?: string[];
281
303
  timings?: Array<{ phase: string; ms: number }>;
282
304
  }>;
305
+ submitWorkflowChild?(
306
+ parentRunId: string,
307
+ body: Record<string, unknown>,
308
+ ): Promise<{
309
+ workflowId?: string;
310
+ runId?: string;
311
+ status?: string;
312
+ mode?: string;
313
+ output?: unknown;
314
+ result?: unknown;
315
+ error?: unknown;
316
+ logs?: string[];
317
+ timings?: Array<{ phase: string; ms: number }>;
318
+ }>;
283
319
  signal(
284
320
  runId: string,
285
321
  body: Record<string, unknown>,
@@ -292,6 +328,26 @@ type WorkerEnv = {
292
328
  runId: string,
293
329
  event: Record<string, unknown>,
294
330
  ): Promise<void>;
331
+ readTerminalState?(runId: string): Promise<Record<string, unknown> | null>;
332
+ readChildTerminalState?(
333
+ parentRunId: string,
334
+ eventKey: string,
335
+ timeoutMs?: number,
336
+ ): Promise<Record<string, unknown> | null>;
337
+ /**
338
+ * Distributed Rate State Backend RPC. Routes to the per-(org,provider)
339
+ * rate-bucket Durable Object so the request window is global across
340
+ * isolates. See CoordinatorRateStateBackend + dedup-do.ts.
341
+ */
342
+ rateAcquire?(input: {
343
+ bucketId: string;
344
+ rules: PacingRule[];
345
+ requested: number;
346
+ }): Promise<{ granted: number; waitMs: number }>;
347
+ ratePenalize?(input: {
348
+ bucketId: string;
349
+ cooldownMs: number;
350
+ }): Promise<void>;
295
351
  };
296
352
  /**
297
353
  * Required service binding to the long-lived Play Harness Worker
@@ -379,10 +435,9 @@ async function probeHarnessOnce(
379
435
  }
380
436
  }
381
437
  /**
382
- * Routes runtime API requests through the in-process RUNTIME_API binding when
383
- * Cloudflare exposes the coordinator WorkerEntrypoint export. Some workflow
384
- * execution paths do not expose those exports; there we keep the older public
385
- * fetch transport so the play still reaches the same authenticated handler.
438
+ * Routes runtime API requests through the in-process RUNTIME_API service
439
+ * binding. workers_edge treats a missing binding as infrastructure failure
440
+ * instead of falling back to public HTTP.
386
441
  */
387
442
  const RUNTIME_API_TIMEOUT_MS = 30_000;
388
443
  const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
@@ -390,7 +445,6 @@ const RUNTIME_API_INTEGRATION_EXECUTE_TIMEOUT_MS = 180_000;
390
445
  const RUNTIME_API_RETRY_DELAYS_MS = [
391
446
  250, 750, 1500, 3000, 5000, 10000,
392
447
  ] as const;
393
- let loggedMissingRuntimeApiBinding = false;
394
448
 
395
449
  async function fetchRuntimeApi(
396
450
  baseUrl: string,
@@ -418,37 +472,25 @@ async function fetchRuntimeApi(
418
472
  try {
419
473
  const mergedInit: RequestInit = {
420
474
  ...init,
421
- headers: runtimeApiHeaders(init.headers, cachedRuntimeApiBinding == null),
475
+ headers: runtimeApiHeaders(init.headers, false),
422
476
  signal: controller.signal,
423
477
  };
424
478
  if (!cachedRuntimeApiBinding) {
425
- if (!loggedMissingRuntimeApiBinding) {
426
- loggedMissingRuntimeApiBinding = true;
427
- console.warn(
428
- `[play-harness] RUNTIME_API binding missing; using public runtime API transport. path=${path}`,
429
- );
430
- }
431
- return await Promise.race([
432
- fetch(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
433
- timeoutPromise,
434
- ]);
479
+ throw new Error('[play-harness] RUNTIME_API service binding is required');
435
480
  }
436
- const responsePromise = cachedRuntimeApiBinding.fetch(
437
- new Request(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
481
+ const responsePromise = callRuntimeApiRpcBinding(
482
+ cachedRuntimeApiBinding,
483
+ mergedInit,
484
+ {
485
+ path,
486
+ timeoutMs,
487
+ },
438
488
  );
439
489
  const response = await Promise.race([responsePromise, timeoutPromise]);
440
- if (await shouldFallbackRuntimeApiBindingResponse(response)) {
441
- console.warn(
442
- `[play-harness] RUNTIME_API binding returned coordinator not found; using public runtime API transport. path=${path}`,
490
+ if (await isRuntimeApiBindingNotFoundResponse(response)) {
491
+ throw new Error(
492
+ `[play-harness] RUNTIME_API service binding could not route ${path}; coordinator returned not found.`,
443
493
  );
444
- return await Promise.race([
445
- fetch(`${baseUrl.replace(/\/$/, '')}${path}`, {
446
- ...init,
447
- headers: runtimeApiHeaders(init.headers, true),
448
- signal: controller.signal,
449
- }),
450
- timeoutPromise,
451
- ]);
452
494
  }
453
495
  return response;
454
496
  } catch (err) {
@@ -463,7 +505,33 @@ async function fetchRuntimeApi(
463
505
  }
464
506
  }
465
507
 
466
- async function shouldFallbackRuntimeApiBindingResponse(
508
+ async function callRuntimeApiRpcBinding(
509
+ binding: NonNullable<WorkerEnv['RUNTIME_API']>,
510
+ init: RequestInit,
511
+ input: { path: string; timeoutMs: number },
512
+ ): Promise<Response> {
513
+ const h = new Headers(init.headers);
514
+ const authorization = h.get('authorization') ?? '';
515
+ const headers: Record<string, string> = {};
516
+ const metadata = h.get(EXECUTE_TOOL_METADATA_HEADER);
517
+ if (metadata) headers[EXECUTE_TOOL_METADATA_HEADER] = metadata;
518
+ const contract = h.get(EXECUTE_RESPONSE_CONTRACT_HEADER);
519
+ if (contract) headers[EXECUTE_RESPONSE_CONTRACT_HEADER] = contract;
520
+ const rawBody = typeof init.body === 'string' ? init.body : '';
521
+ const result = await binding.runtimeApiCall({
522
+ executorToken: authorization.replace(/^Bearer\s+/i, '').trim(),
523
+ path: input.path,
524
+ body: rawBody ? JSON.parse(rawBody) : {},
525
+ headers,
526
+ timeoutMs: input.timeoutMs,
527
+ });
528
+ return new Response(result.body, {
529
+ status: result.status,
530
+ headers: result.headers ?? {},
531
+ });
532
+ }
533
+
534
+ async function isRuntimeApiBindingNotFoundResponse(
467
535
  response: Response,
468
536
  ): Promise<boolean> {
469
537
  if (response.status !== 404) {
@@ -494,13 +562,6 @@ function cachedVercelProtectionBypassToken(): string | null {
494
562
  return cachedRuntimeApiVercelBypassToken;
495
563
  }
496
564
 
497
- const WORKER_PLAY_CALL_LIMITS = {
498
- maxPlayCallDepth: 6,
499
- maxPlayCallCount: 1_000,
500
- maxChildPlayCallsPerParent: 1_000,
501
- maxConcurrentPlayCalls: 16,
502
- };
503
-
504
565
  type RunnerEvent =
505
566
  | {
506
567
  type: 'log';
@@ -520,10 +581,17 @@ type WorkflowRunOutput = {
520
581
  durationMs: number;
521
582
  };
522
583
 
584
+ type InlineRunTiming = {
585
+ phase: string;
586
+ ms: number;
587
+ extra?: Record<string, unknown>;
588
+ };
589
+
523
590
  type WorkerCtxCallbacks = {
524
591
  onNodeProgress?: (input: {
525
592
  nodeId: string;
526
593
  progress: LiveNodeProgressSnapshot;
594
+ forceFlush?: boolean;
527
595
  }) => void;
528
596
  onMapStarted?: (nodeId: string, at?: number) => void;
529
597
  onMapCompleted?: (nodeId: string, at?: number) => void;
@@ -612,12 +680,17 @@ function makeRequestId(): string {
612
680
  }
613
681
 
614
682
  function publicCsvInputRow<T extends Record<string, unknown>>(row: T): T {
615
- const stripped = stripCsvProjectedFields(row) as Record<string, unknown>;
616
- return Object.fromEntries(
617
- Object.entries(stripped).filter(
618
- ([fieldName]) => !fieldName.startsWith('__deepline'),
619
- ),
620
- ) as T;
683
+ const restored = stripCsvProjectionMetadata(row) as Record<string, unknown>;
684
+ const publicRow: Record<string, unknown> = {};
685
+ for (const fieldName of Reflect.ownKeys(restored)) {
686
+ if (typeof fieldName === 'string' && fieldName.startsWith('__deepline')) {
687
+ continue;
688
+ }
689
+ const descriptor = Object.getOwnPropertyDescriptor(restored, fieldName);
690
+ if (!descriptor) continue;
691
+ Object.defineProperty(publicRow, fieldName, descriptor);
692
+ }
693
+ return publicRow as T;
621
694
  }
622
695
 
623
696
  function publicCsvOutputRow<T extends Record<string, unknown>>(row: T): T {
@@ -634,6 +707,27 @@ function publicCsvOutputRow<T extends Record<string, unknown>>(row: T): T {
634
707
  return publicRow as T;
635
708
  }
636
709
 
710
+ function publicCsvStorageRow<T extends Record<string, unknown>>(row: T): T {
711
+ const publicRow = publicCsvInputRow(row) as Record<string, unknown>;
712
+ const storageRow: Record<string, unknown> = {};
713
+ for (const fieldName of Reflect.ownKeys(publicRow)) {
714
+ if (typeof fieldName !== 'string') continue;
715
+ const descriptor = Object.getOwnPropertyDescriptor(publicRow, fieldName);
716
+ if (!descriptor) continue;
717
+ storageRow[fieldName] =
718
+ 'value' in descriptor ? descriptor.value : publicRow[fieldName];
719
+ }
720
+ for (const runtimeField of [
721
+ '__deeplineRowKey',
722
+ '__deeplineCellMetaPatch',
723
+ ]) {
724
+ if (Object.prototype.hasOwnProperty.call(row, runtimeField)) {
725
+ storageRow[runtimeField] = row[runtimeField];
726
+ }
727
+ }
728
+ return storageRow as T;
729
+ }
730
+
637
731
  /**
638
732
  * Strip credentials and JWT-shaped tokens from any string before it lands in
639
733
  * a log buffer or upstream error message. The harness routinely echoes
@@ -658,9 +752,8 @@ async function postRuntimeApi<T>(
658
752
  executorToken: string,
659
753
  body: unknown,
660
754
  ): Promise<T> {
661
- // Routes through the in-process RUNTIME_API binding when present; otherwise
662
- // falls back to a public fetch against `${baseUrl}${path}`. Either path
663
- // hits the same handler with the same auth — only the transport changes.
755
+ // Routes through the in-process RUNTIME_API service binding. Missing binding
756
+ // is an infra error in workers_edge, not a reason to fall back to public HTTP.
664
757
  const serializedBody = JSON.stringify(body);
665
758
  let lastError: unknown = null;
666
759
  for (
@@ -797,6 +890,15 @@ async function submitChildPlayThroughCoordinator(input: {
797
890
  }
798
891
  return cachedCoordinatorBinding.submitChild(input.req.runId, input.body);
799
892
  }
893
+ if (cachedCoordinatorBinding?.submitWorkflowChild) {
894
+ if (!isRecord(input.body)) {
895
+ throw new Error('ctx.runPlay child submit requires an object body.');
896
+ }
897
+ return cachedCoordinatorBinding.submitWorkflowChild(
898
+ input.req.runId,
899
+ input.body,
900
+ );
901
+ }
800
902
  const coordinatorUrl = input.req.coordinatorUrl?.trim();
801
903
  if (coordinatorUrl) {
802
904
  // Keep child plays on the same coordinator/Workflow submit path as
@@ -924,46 +1026,6 @@ function workflowTimeoutFromMs(timeoutMs: number): string {
924
1026
  return `${seconds} second${seconds === 1 ? '' : 's'}`;
925
1027
  }
926
1028
 
927
- async function waitForChildPlayTerminalEvent(input: {
928
- req: RunRequest;
929
- workflowStep?: WorkflowStep;
930
- workflowId: string;
931
- playName: string;
932
- key: string;
933
- timeoutMs: number;
934
- }): Promise<unknown> {
935
- if (!input.workflowStep) {
936
- throw new Error(
937
- 'ctx.runPlay child waits require the cf-workflows runtime event scheduler.',
938
- );
939
- }
940
- const eventKey = await childPlayEventKey({
941
- key: input.key,
942
- workflowId: input.workflowId,
943
- });
944
- const event = (await (
945
- input.workflowStep.waitForEvent as unknown as (
946
- name: string,
947
- options: { type: string; timeout: string },
948
- ) => Promise<{ payload: unknown }>
949
- )(`child_play_terminal:${eventKey}`, {
950
- type: integrationEventType(eventKey),
951
- timeout: workflowTimeoutFromMs(input.timeoutMs),
952
- })) as { payload: unknown };
953
- const rawPayload = isRecord(event.payload) ? event.payload : {};
954
- const payload = isRecord(rawPayload.data) ? rawPayload.data : rawPayload;
955
- const status = String(payload.status ?? '').toLowerCase();
956
- if (status === 'completed') {
957
- return extractChildPlayOutput(payload);
958
- }
959
- const error = isRecord(payload.error) ? payload.error : null;
960
- const message =
961
- (typeof error?.message === 'string' && error.message.trim()) ||
962
- (typeof payload.error === 'string' && payload.error.trim()) ||
963
- `Child play ${input.playName} (${input.workflowId}) finished with status ${status || 'unknown'}.`;
964
- throw new Error(message);
965
- }
966
-
967
1029
  async function signalParentPlayTerminal(input: {
968
1030
  req: RunRequest;
969
1031
  status: 'completed' | 'failed' | 'cancelled';
@@ -1045,6 +1107,8 @@ async function executeTool(
1045
1107
  req: RunRequest,
1046
1108
  args: { id: string; toolId: string; input: Record<string, unknown> },
1047
1109
  workflowStep?: WorkflowStep,
1110
+ onProviderBackpressure?: (retryAfterMs: number) => void,
1111
+ onRetryAttempt?: () => void,
1048
1112
  ): Promise<ToolExecuteResult> {
1049
1113
  if (args.toolId === 'test_wait_for_event' && workflowStep) {
1050
1114
  const result = await waitForSyntheticIntegrationEvent(
@@ -1059,7 +1123,7 @@ async function executeTool(
1059
1123
  // service bindings, NOT through HTTP from this worker. Removing the
1060
1124
  // dispatcher-side coordinatorUrl plumbing intentionally turns the old
1061
1125
  // HTTP-based dedup helpers into dead code.
1062
- return callToolDirect(req, args);
1126
+ return callToolDirect(req, args, onProviderBackpressure, onRetryAttempt);
1063
1127
  }
1064
1128
 
1065
1129
  async function executeToolWithLifecycle(
@@ -1193,50 +1257,20 @@ async function waitForSyntheticIntegrationEvent(
1193
1257
  async function callToolDirect(
1194
1258
  req: RunRequest,
1195
1259
  args: { id: string; toolId: string; input: Record<string, unknown> },
1260
+ onProviderBackpressure?: (retryAfterMs: number) => void,
1261
+ // Invoked once per in-process retry attempt (429 / retryable 5xx / synthetic
1262
+ // transient) so the Governor charges chargeBudget('retry') per attempt — the
1263
+ // same runaway guard the cjs runner applies (context.ts charges retry on each
1264
+ // 429 / transient-5xx retry). Without this the worker substrate would leave
1265
+ // policy.budgets.maxRetryCount effectively unenforced.
1266
+ onRetryAttempt?: () => void,
1196
1267
  ): Promise<ToolExecuteResult> {
1197
1268
  const { id, toolId, input } = args;
1198
- if (toolId === 'test_rate_limit') {
1199
- return wrapWorkerToolResult(
1200
- toolId,
1201
- executeSyntheticTestRateLimit(input),
1202
- syntheticToolMetadata(toolId),
1203
- );
1204
- }
1205
- if (toolId === 'test_batch_rate_limit') {
1206
- return wrapWorkerToolResult(
1207
- toolId,
1208
- await executeSyntheticTestRateLimitBatch(req, input),
1209
- syntheticToolMetadata(toolId),
1210
- );
1211
- }
1212
1269
  const path = `/api/v2/integrations/${encodeURIComponent(toolId)}/execute`;
1213
1270
  const maxAttempts = 3;
1214
1271
  let lastError: Error | null = null;
1215
1272
 
1216
1273
  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
1217
- if (toolId === 'test_transient_500' || toolId === 'test_transient_429') {
1218
- const syntheticResult = executeSyntheticTransientRetry(
1219
- toolId,
1220
- input,
1221
- attempt,
1222
- );
1223
- if (syntheticResult.ok) {
1224
- return wrapWorkerToolResult(
1225
- toolId,
1226
- syntheticResult.result,
1227
- syntheticToolMetadata(toolId),
1228
- );
1229
- }
1230
- lastError = new Error(
1231
- `tool ${toolId} ${syntheticResult.status} attempt ${attempt}/${maxAttempts}: ${syntheticResult.message}`,
1232
- );
1233
- if (attempt >= maxAttempts) {
1234
- throw lastError;
1235
- }
1236
- await new Promise((resolve) => setTimeout(resolve, 1_000));
1237
- continue;
1238
- }
1239
-
1240
1274
  const res = await fetchRuntimeApi(req.baseUrl, path, {
1241
1275
  method: 'POST',
1242
1276
  headers: {
@@ -1273,17 +1307,26 @@ async function callToolDirect(
1273
1307
  maxAttempts,
1274
1308
  bodyText: text,
1275
1309
  });
1310
+ const retryAfterSeconds = Number(res.headers.get('retry-after'));
1311
+ const retryAfterMs =
1312
+ Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
1313
+ ? Math.ceil(retryAfterSeconds * 1000)
1314
+ : 0;
1315
+ if (res.status === 429) {
1316
+ // Feed the provider's backpressure into the shared pacer even on the
1317
+ // final attempt so the (org, provider) bucket backs off across isolates.
1318
+ onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
1319
+ }
1276
1320
  const retryable =
1277
1321
  (res.status === 429 && !isHardBillingToolHttpError(lastError)) ||
1278
1322
  (res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
1279
1323
  if (!retryable || attempt >= maxAttempts) {
1280
1324
  throw lastError;
1281
1325
  }
1282
- const retryAfterSeconds = Number(res.headers.get('retry-after'));
1283
- const delayMs =
1284
- Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
1285
- ? Math.min(5_000, Math.ceil(retryAfterSeconds * 1000))
1286
- : 1_000;
1326
+ // Charge the retry budget per attempt, matching the cjs runner's
1327
+ // chargeBudget('retry') on every 429 / retryable-5xx retry.
1328
+ onRetryAttempt?.();
1329
+ const delayMs = retryAfterMs > 0 ? Math.min(5_000, retryAfterMs) : 1_000;
1287
1330
  await new Promise((resolve) => setTimeout(resolve, delayMs));
1288
1331
  }
1289
1332
 
@@ -1405,7 +1448,7 @@ function parseStringArray(value: unknown): string[] {
1405
1448
  .filter(Boolean);
1406
1449
  }
1407
1450
 
1408
- function syntheticToolMetadata(toolId: string): ToolResultMetadataInput {
1451
+ function toolMetadataFallback(toolId: string): ToolResultMetadataInput {
1409
1452
  if (toolId === 'test_rate_limit') {
1410
1453
  return {
1411
1454
  toolId,
@@ -1450,193 +1493,6 @@ function wrapWorkerToolResult(
1450
1493
  });
1451
1494
  }
1452
1495
 
1453
- async function executeSyntheticTestRateLimitBatch(
1454
- req: RunRequest,
1455
- input: Record<string, unknown>,
1456
- ): Promise<Record<string, unknown>> {
1457
- const delayMs =
1458
- typeof input.simulated_delay_ms === 'number' &&
1459
- Number.isInteger(input.simulated_delay_ms) &&
1460
- input.simulated_delay_ms > 0
1461
- ? input.simulated_delay_ms
1462
- : 0;
1463
- if (delayMs > 0) {
1464
- await new Promise((resolve) => setTimeout(resolve, delayMs));
1465
- }
1466
- const rawItems = Array.isArray(input.items) ? input.items : [];
1467
- const items = rawItems
1468
- .filter((item): item is Record<string, unknown> =>
1469
- Boolean(item && typeof item === 'object' && !Array.isArray(item)),
1470
- )
1471
- .map((item, index) => {
1472
- const itemKey =
1473
- typeof item.itemKey === 'string' && item.itemKey.trim()
1474
- ? item.itemKey.trim()
1475
- : `item-${index}`;
1476
- const payload =
1477
- item.payload &&
1478
- typeof item.payload === 'object' &&
1479
- !Array.isArray(item.payload)
1480
- ? (item.payload as Record<string, unknown>)
1481
- : {};
1482
- return { itemKey, payload };
1483
- });
1484
- const batchRequest: ToolBatchRequest = {
1485
- runId: req.runId,
1486
- orgId: req.orgId,
1487
- toolId: 'test_rate_limit',
1488
- operation: 'test_batch_rate_limit',
1489
- provider: 'test',
1490
- items,
1491
- waterfallId:
1492
- typeof input.waterfall_id === 'string' ? input.waterfall_id : null,
1493
- stageId: typeof input.stage === 'string' ? input.stage : null,
1494
- fieldName: typeof input.field_name === 'string' ? input.field_name : null,
1495
- mapName: typeof input.map_name === 'string' ? input.map_name : null,
1496
- chunkIndex:
1497
- typeof input.chunk_index === 'number' ? input.chunk_index : null,
1498
- userProvidedRateLimitKey:
1499
- typeof input.rate_limit_key === 'string' ? input.rate_limit_key : null,
1500
- providerBatchSize: 200,
1501
- };
1502
- const executor = createToolBatchExecutor({
1503
- async executeProviderBatch({ items: providerItems }) {
1504
- return providerItems.map((item) => ({
1505
- itemKey: item.itemKey,
1506
- result: executeSyntheticTestRateLimit(item.payload),
1507
- }));
1508
- },
1509
- });
1510
- const result = await executor.executeToolBatch(batchRequest);
1511
- return {
1512
- status: 'completed',
1513
- key: String(input.key ?? 'batch'),
1514
- provider: 'test',
1515
- batch: true,
1516
- batch_size: result.itemCount,
1517
- provider_batch_count: result.batchCount,
1518
- items: result.results.map((item) => ({
1519
- itemKey: item.itemKey,
1520
- result: item.result,
1521
- })),
1522
- };
1523
- }
1524
-
1525
- type SyntheticTransientRetryResult =
1526
- | { ok: true; result: Record<string, unknown> }
1527
- | { ok: false; status: number; message: string };
1528
-
1529
- function executeSyntheticTransientRetry(
1530
- toolId: string,
1531
- input: Record<string, unknown>,
1532
- attempt: number,
1533
- ): SyntheticTransientRetryResult {
1534
- const failuresBeforeSuccess =
1535
- typeof input.failures_before_success === 'number' &&
1536
- Number.isInteger(input.failures_before_success) &&
1537
- input.failures_before_success >= 0
1538
- ? input.failures_before_success
1539
- : 1;
1540
- if (attempt <= failuresBeforeSuccess) {
1541
- const status = toolId === 'test_transient_429' ? 429 : 502;
1542
- return {
1543
- ok: false,
1544
- status,
1545
- message: `Synthetic transient ${status} for attempt ${attempt}`,
1546
- };
1547
- }
1548
- return {
1549
- ok: true,
1550
- result: {
1551
- status: 'completed',
1552
- provider: 'test',
1553
- key: String(input.key ?? 'transient'),
1554
- attempts: attempt,
1555
- recovered: attempt > 1,
1556
- },
1557
- };
1558
- }
1559
-
1560
- function executeSyntheticTestRateLimit(
1561
- input: Record<string, unknown>,
1562
- ): Record<string, unknown> {
1563
- if (
1564
- typeof input.key === 'string' &&
1565
- input.key.startsWith('public-error-message-regression')
1566
- ) {
1567
- throw new ToolHttpError(
1568
- [
1569
- 'tool test_rate_limit 422 attempt 1/1:',
1570
- 'Synthetic public test error with a redacted token=[REDACTED].',
1571
- 'code=TEST_PUBLIC_ERROR.',
1572
- 'failure_description=The fake test provider intentionally raised a typed public error so V2 runner output preserves actionable details.',
1573
- 'operator_hint=Use this no-bill test provider fixture when verifying play runner error rendering.',
1574
- ].join(' '),
1575
- null,
1576
- );
1577
- }
1578
- const rowNumber =
1579
- typeof input.row_number === 'number' && Number.isInteger(input.row_number)
1580
- ? input.row_number
1581
- : null;
1582
- const leadId = typeof input.lead_id === 'string' ? input.lead_id : null;
1583
- const matchedDomain =
1584
- typeof input.matched_domain === 'string' && input.matched_domain.trim()
1585
- ? input.matched_domain.trim()
1586
- : 'example.com';
1587
- const matchedPrefix =
1588
- typeof input.matched_prefix === 'string' && input.matched_prefix.trim()
1589
- ? input.matched_prefix.trim()
1590
- : (leadId ??
1591
- (rowNumber !== null
1592
- ? `row${String(rowNumber).padStart(3, '0')}`
1593
- : 'match'));
1594
- const matched = syntheticMatchWindow(input, rowNumber);
1595
- const matchedEmail = matched ? `${matchedPrefix}@${matchedDomain}` : null;
1596
- const securityGateway =
1597
- input.emit_security_gateway === true
1598
- ? { email_status: 'valid', mx_security_gateway: true }
1599
- : {};
1600
- return {
1601
- status: 'completed',
1602
- key: String(input.key || ''),
1603
- provider: 'test',
1604
- lead_id: leadId,
1605
- row_number: rowNumber,
1606
- matched_result: matchedEmail,
1607
- email: matchedEmail,
1608
- value: matchedEmail,
1609
- batch: false,
1610
- ...securityGateway,
1611
- };
1612
- }
1613
-
1614
- function syntheticMatchWindow(
1615
- input: Record<string, unknown>,
1616
- rowNumber: number | null,
1617
- ): boolean {
1618
- const min =
1619
- typeof input.match_rows_min === 'number' ? input.match_rows_min : null;
1620
- const max =
1621
- typeof input.match_rows_max === 'number' ? input.match_rows_max : null;
1622
- if (rowNumber === null) return min === null && max === null;
1623
- if (min !== null && rowNumber < min) return false;
1624
- if (max !== null && rowNumber > max) return false;
1625
- const moduloBase =
1626
- typeof input.match_modulo_base === 'number' && input.match_modulo_base > 0
1627
- ? input.match_modulo_base
1628
- : null;
1629
- if (moduloBase !== null) {
1630
- const equals = Array.isArray(input.match_modulo_equals)
1631
- ? input.match_modulo_equals
1632
- .filter((entry): entry is number => typeof entry === 'number')
1633
- .map((entry) => entry % moduloBase)
1634
- : [];
1635
- return equals.length > 0 && equals.includes(rowNumber % moduloBase);
1636
- }
1637
- return true;
1638
- }
1639
-
1640
1496
  function isRecordLike(value: unknown): value is Record<string, unknown> {
1641
1497
  return value != null && typeof value === 'object' && !Array.isArray(value);
1642
1498
  }
@@ -1730,7 +1586,12 @@ type WorkerToolBatchRequest = {
1730
1586
  reject: (error: unknown) => void;
1731
1587
  };
1732
1588
 
1733
- const WORKER_TOOL_BATCH_GRACE_MS = 15;
1589
+ const WORKER_TOOL_BATCH_GRACE_MS = 250;
1590
+ // Fallback batch-chunk parallelism when a tool declares no provider rate hints.
1591
+ // Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
1592
+ // their previous batching behavior; declared providers tighten via the
1593
+ // Governor's suggestedParallelism.
1594
+ const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
1734
1595
  const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
1735
1596
 
1736
1597
  function stepProgramColumnName(parentField: string, stepId: string): string {
@@ -1741,7 +1602,32 @@ class WorkerToolBatchScheduler {
1741
1602
  private queue: WorkerToolBatchRequest[] = [];
1742
1603
  private scheduled = false;
1743
1604
 
1744
- constructor(private readonly req: RunRequest) {}
1605
+ constructor(
1606
+ private readonly req: RunRequest,
1607
+ private readonly governor: PlayExecutionGovernor,
1608
+ private readonly resolvePacing: WorkerPacingResolver,
1609
+ private readonly abortSignal?: AbortSignal,
1610
+ private readonly onRequestsSettled?: (count: number) => void,
1611
+ ) {}
1612
+
1613
+ /**
1614
+ * Report a provider 429 / Retry-After back into the Governor's shared pacer
1615
+ * so future acquires for this (org, provider) bucket back off across all
1616
+ * isolates. Provider comes from the same pacing resolver the Governor uses
1617
+ * (the worker has no local catalog), so callers pass only the toolId.
1618
+ */
1619
+ private reportBackpressure(toolId: string, retryAfterMs: number): void {
1620
+ if (retryAfterMs <= 0) return;
1621
+ void (async () => {
1622
+ const pacing = await this.resolvePacing(toolId).catch(() => null);
1623
+ if (pacing?.provider) {
1624
+ this.governor.reportProviderBackpressure({
1625
+ provider: pacing.provider,
1626
+ retryAfterMs,
1627
+ });
1628
+ }
1629
+ })();
1630
+ }
1745
1631
 
1746
1632
  execute(
1747
1633
  id: string,
@@ -1824,16 +1710,27 @@ class WorkerToolBatchScheduler {
1824
1710
  const groupStartedAt = nowMs();
1825
1711
  await Promise.all(
1826
1712
  requests.map(async (request) => {
1713
+ // Each unbatched provider call takes its own tool slot: the Governor
1714
+ // charges tool budget, holds a global tool-concurrency slot, and
1715
+ // applies per-(org,provider) pacing before the call runs.
1716
+ const slot = await this.governor.acquireToolSlot(toolId, {
1717
+ signal: this.abortSignal,
1718
+ });
1827
1719
  try {
1828
1720
  request.resolve(
1829
1721
  await executeTool(
1830
1722
  this.req,
1831
1723
  { id: request.id, toolId, input: request.input },
1832
1724
  request.workflowStep,
1725
+ (retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
1726
+ () => this.governor.chargeBudget('retry'),
1833
1727
  ),
1834
1728
  );
1835
1729
  } catch (error) {
1836
1730
  request.reject(error);
1731
+ } finally {
1732
+ this.onRequestsSettled?.(1);
1733
+ slot.release();
1837
1734
  }
1838
1735
  }),
1839
1736
  );
@@ -1851,6 +1748,15 @@ class WorkerToolBatchScheduler {
1851
1748
  req: this.req,
1852
1749
  requests,
1853
1750
  strategy,
1751
+ governor: this.governor,
1752
+ suggestedParallelism: await this.governor.suggestedParallelism(
1753
+ toolId,
1754
+ WORKER_TOOL_BATCH_DEFAULT_PARALLELISM,
1755
+ ),
1756
+ abortSignal: this.abortSignal,
1757
+ reportBackpressure: (retryAfterMs) =>
1758
+ this.reportBackpressure(toolId, retryAfterMs),
1759
+ onRequestsSettled: this.onRequestsSettled,
1854
1760
  });
1855
1761
  recordRunnerPerfTrace({
1856
1762
  req: this.req,
@@ -1880,22 +1786,60 @@ async function executeBatchedWorkerToolGroup(input: {
1880
1786
  req: RunRequest;
1881
1787
  requests: WorkerToolBatchRequest[];
1882
1788
  strategy: AnyBatchOperationStrategy;
1789
+ governor: PlayExecutionGovernor;
1790
+ suggestedParallelism: number;
1791
+ abortSignal?: AbortSignal;
1792
+ reportBackpressure: (retryAfterMs: number) => void;
1793
+ onRequestsSettled?: (count: number) => void;
1883
1794
  }): Promise<void> {
1884
1795
  const compiledBatches = compileRequestsWithStrategy({
1885
1796
  requests: input.requests,
1886
1797
  strategy: input.strategy,
1887
1798
  getPayload: (request) => request.input,
1888
1799
  });
1800
+ recordRunnerPerfTrace({
1801
+ req: input.req,
1802
+ phase: 'runner.tool.batch.compile',
1803
+ ms: 0,
1804
+ extra: {
1805
+ sourceOperation: input.strategy.sourceOperation,
1806
+ batchOperation: input.strategy.batchOperation,
1807
+ requests: input.requests.length,
1808
+ batches: compiledBatches.length,
1809
+ batchSizes: compiledBatches.map((batch) => batch.memberRequests.length),
1810
+ },
1811
+ });
1889
1812
 
1890
1813
  await executeChunkedRequests({
1891
1814
  requests: compiledBatches,
1892
- batchSize: Math.max(1, Math.min(4, compiledBatches.length || 1)),
1893
- execute: async (batch) =>
1894
- await executeTool(input.req, {
1895
- id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
1896
- toolId: batch.batchOperation,
1897
- input: batch.batchPayload,
1898
- }),
1815
+ // Chunk parallelism is the Governor's per-tool suggestion (provider rate
1816
+ // hints tightened to the policy ceiling), bounded by the batch count.
1817
+ batchSize: Math.max(
1818
+ 1,
1819
+ Math.min(input.suggestedParallelism, compiledBatches.length || 1),
1820
+ ),
1821
+ execute: async (batch) => {
1822
+ // One provider call per batch → one tool slot (budget + global
1823
+ // concurrency + per-(org,provider) pacing) around the whole batch.
1824
+ const slot = await input.governor.acquireToolSlot(batch.batchOperation, {
1825
+ signal: input.abortSignal,
1826
+ });
1827
+ try {
1828
+ return await executeTool(
1829
+ input.req,
1830
+ {
1831
+ id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
1832
+ toolId: batch.batchOperation,
1833
+ input: batch.batchPayload,
1834
+ },
1835
+ undefined,
1836
+ input.reportBackpressure,
1837
+ () => input.governor.chargeBudget('retry'),
1838
+ );
1839
+ } finally {
1840
+ slot.release();
1841
+ }
1842
+ },
1899
1843
  onChunkComplete: async (
1900
1844
  chunkResults: Array<
1901
1845
  ChunkExecutionResult<(typeof compiledBatches)[number], unknown>
@@ -1919,11 +1863,18 @@ async function executeBatchedWorkerToolGroup(input: {
1919
1863
  wrapWorkerToolResult(
1920
1864
  request.toolId,
1921
1865
  splitResults[index] ?? null,
1922
- syntheticToolMetadata(request.toolId),
1866
+ toolMetadataFallback(request.toolId),
1923
1867
  ),
1924
1868
  );
1925
1869
  }
1926
1870
  }
1871
+ const settledMembers = chunkResults.reduce(
1872
+ (total, entry) => total + entry.request.memberRequests.length,
1873
+ 0,
1874
+ );
1875
+ if (settledMembers > 0) {
1876
+ input.onRequestsSettled?.(settledMembers);
1877
+ }
1927
1878
  },
1928
1879
  }).catch((error) => {
1929
1880
  for (const request of input.requests) {
@@ -3039,10 +2990,10 @@ async function persistCompletedMapRows(input: {
3039
2990
  tableNamespace: input.tableNamespace,
3040
2991
  sheetContract: augmentSheetContractWithDatasetFields({
3041
2992
  contract: requireSheetContract(input.req, input.tableNamespace),
3042
- rows: input.rows,
2993
+ rows: input.rows.map((row) => publicCsvStorageRow(row)),
3043
2994
  outputFields,
3044
2995
  }),
3045
- rows: input.rows,
2996
+ rows: input.rows.map((row) => publicCsvStorageRow(row)),
3046
2997
  outputFields,
3047
2998
  runId: input.req.runId,
3048
2999
  userEmail: input.req.userEmail,
@@ -3073,10 +3024,10 @@ async function prepareMapRows(input: {
3073
3024
  tableNamespace: input.tableNamespace,
3074
3025
  sheetContract: augmentSheetContractWithDatasetFields({
3075
3026
  contract: requireSheetContract(input.req, input.tableNamespace),
3076
- rows: input.rows,
3027
+ rows: input.rows.map((row) => publicCsvStorageRow(row)),
3077
3028
  outputFields: input.outputFields,
3078
3029
  }),
3079
- rows: input.rows.map((row) => ({ ...row })),
3030
+ rows: input.rows.map((row) => publicCsvStorageRow(row)),
3080
3031
  runId: input.req.runId,
3081
3032
  userEmail: input.req.userEmail,
3082
3033
  cellPolicies: input.cellPolicies,
@@ -3164,9 +3115,23 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
3164
3115
  function childPipelineUsesCtxDataset(
3165
3116
  pipeline: PlayStaticPipeline | null | undefined,
3166
3117
  ): boolean {
3167
- return getCompiledPipelineSubsteps(pipeline).some(
3168
- (substep) => substep.type === 'dataset',
3169
- );
3118
+ if (!pipeline) return false;
3119
+ if (typeof pipeline.tableNamespace === 'string' && pipeline.tableNamespace) {
3120
+ return true;
3121
+ }
3122
+ if (pipeline.sheetContract) {
3123
+ return true;
3124
+ }
3125
+ return flattenStaticPipeline(pipeline).some((substep) => {
3126
+ if (substep.type === 'dataset') return true;
3127
+ if (!isRecord(substep)) return false;
3128
+ return (
3129
+ ('tableNamespace' in substep &&
3130
+ typeof substep.tableNamespace === 'string' &&
3131
+ substep.tableNamespace.length > 0) ||
3132
+ ('sheetContract' in substep && Boolean(substep.sheetContract))
3133
+ );
3134
+ });
3170
3135
  }
3171
3136
 
3172
3137
  function childPipelineNeedsWorkflowScheduler(
@@ -3181,16 +3146,207 @@ function childPipelineNeedsWorkflowScheduler(
3181
3146
  );
3182
3147
  }
3183
3148
 
3184
- function releaseChildPlayConcurrency(
3185
- inFlightByPlayName: Record<string, number>,
3186
- playName: string,
3187
- ): void {
3188
- const next = Math.max(0, (inFlightByPlayName[playName] ?? 0) - 1);
3189
- if (next === 0) {
3190
- delete inFlightByPlayName[playName];
3191
- return;
3192
- }
3193
- inFlightByPlayName[playName] = next;
3149
+ /**
3150
+ * Build the per-(org,provider) rate port the distributed Rate State Backend
3151
+ * RPCs through. If the coordinator binding lacks a rate RPC, fall back to the
3152
+ * coordinator HTTP endpoint. This port does not fail open itself: without a
3153
+ * coordinatorUrl rateAcquire throws (ratePenalize no-ops); non-2xx throws too.
3154
+ */
3155
+ function createCoordinatorRatePort(req: RunRequest): CoordinatorRatePort {
3156
+ return {
3157
+ async rateAcquire(input) {
3158
+ const binding = cachedCoordinatorBinding;
3159
+ if (!binding?.rateAcquire) {
3160
+ const coordinatorUrl = req.coordinatorUrl?.trim();
3161
+ if (!coordinatorUrl) {
3162
+ throw new Error('Coordinator rate acquire is unavailable.');
3163
+ }
3164
+ const res = await fetch(`${coordinatorUrl.replace(/\/$/, '')}/rate-acquire`, {
3165
+ method: 'POST',
3166
+ headers: {
3167
+ 'x-deepline-request-id': makeRequestId(),
3168
+ ...coordinatorRequestHeaders({
3169
+ runId: req.runId,
3170
+ contentType: 'application/json',
3171
+ internalToken: req.coordinatorInternalToken,
3172
+ }),
3173
+ },
3174
+ body: JSON.stringify(input),
3175
+ });
3176
+ if (!res.ok) {
3177
+ const text = await res.text().catch(() => '');
3178
+ throw new Error(
3179
+ `Coordinator rate acquire failed (${res.status}): ${text}`,
3180
+ );
3181
+ }
3182
+ return (await res.json()) as { granted: number; waitMs: number };
3183
+ }
3184
+ return await binding.rateAcquire(input);
3185
+ },
3186
+ async ratePenalize(input) {
3187
+ const binding = cachedCoordinatorBinding;
3188
+ if (!binding?.ratePenalize) {
3189
+ const coordinatorUrl = req.coordinatorUrl?.trim();
3190
+ if (!coordinatorUrl) return;
3191
+ const res = await fetch(
3192
+ `${coordinatorUrl.replace(/\/$/, '')}/rate-penalize`,
3193
+ {
3194
+ method: 'POST',
3195
+ headers: {
3196
+ 'x-deepline-request-id': makeRequestId(),
3197
+ ...coordinatorRequestHeaders({
3198
+ runId: req.runId,
3199
+ contentType: 'application/json',
3200
+ internalToken: req.coordinatorInternalToken,
3201
+ }),
3202
+ },
3203
+ body: JSON.stringify(input),
3204
+ },
3205
+ );
3206
+ if (!res.ok) {
3207
+ const text = await res.text().catch(() => '');
3208
+ throw new Error(
3209
+ `Coordinator rate penalize failed (${res.status}): ${text}`,
3210
+ );
3211
+ }
3212
+ return;
3213
+ }
3214
+ await binding.ratePenalize(input);
3215
+ },
3216
+ };
3217
+ }
3218
+
3219
+ /**
3220
+ * Resolve a tool's provider + pacing rules from the same runtime tool-metadata
3221
+ * endpoint the cjs_node20 runner uses (`getToolQueueHints`). The worker has no
3222
+ * local catalog, so this is an HTTP fetch through the runtime API binding,
3223
+ * memoized per resolver instance, failures included. No hints → null (pacing
3224
+ * is a no-op; the Governor's global tool-concurrency slot still applies).
3225
+ */
3226
+ type WorkerPacingResolver = (
3227
+ toolId: string,
3228
+ ) => Promise<{ provider: string; rules: PacingRule[] } | null>;
3229
+
3230
+ function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
3231
+ const cache = new Map<
3232
+ string,
3233
+ Promise<{ provider: string; rules: PacingRule[] } | null>
3234
+ >();
3235
+ return (toolId: string) => {
3236
+ const normalized = String(toolId || '').trim();
3237
+ if (!normalized) return Promise.resolve(null);
3238
+ const cached = cache.get(normalized);
3239
+ if (cached) return cached;
3240
+ const promise = (async () => {
3241
+ const res = await fetchRuntimeApi(
3242
+ req.baseUrl,
3243
+ `/api/v2/plays/runtime-tools/${encodeURIComponent(normalized)}`,
3244
+ {
3245
+ method: 'GET',
3246
+ headers: { authorization: `Bearer ${req.executorToken}` },
3247
+ },
3248
+ ).catch(() => null);
3249
+ if (!res || !res.ok) return null;
3250
+ const body = (await res.json().catch(() => null)) as {
3251
+ provider?: unknown;
3252
+ queueHints?: unknown;
3253
+ } | null;
3254
+ if (!body) return null;
3255
+ const provider =
3256
+ typeof body.provider === 'string' && body.provider.trim()
3257
+ ? body.provider.trim()
3258
+ : null;
3259
+ if (!provider || !Array.isArray(body.queueHints)) return null;
3260
+ const rules: PacingRule[] = body.queueHints.flatMap((hint) => {
3261
+ if (!hint || typeof hint !== 'object') return [];
3262
+ const record = hint as Record<string, unknown>;
3263
+ if (
3264
+ typeof record.ruleId !== 'string' ||
3265
+ typeof record.requestsPerWindow !== 'number' ||
3266
+ typeof record.windowMs !== 'number'
3267
+ ) {
3268
+ return [];
3269
+ }
3270
+ return [
3271
+ {
3272
+ ruleId: record.ruleId,
3273
+ requestsPerWindow: record.requestsPerWindow,
3274
+ windowMs: record.windowMs,
3275
+ maxConcurrency:
3276
+ typeof record.maxConcurrency === 'number'
3277
+ ? record.maxConcurrency
3278
+ : null,
3279
+ } satisfies PacingRule,
3280
+ ];
3281
+ });
3282
+ if (rules.length === 0) return null;
3283
+ return { provider, rules };
3284
+ })();
3285
+ cache.set(normalized, promise);
3286
+ return promise;
3287
+ };
3288
+ }
3289
+
3290
+ /**
3291
+ * Build the Governor's lineage snapshot for this worker, seeded from the
3292
+ * inherited PlayCallGovernanceSnapshot (threaded via internalRunPlay) so play-
3293
+ * call budgets accumulate down the dispatch tree across isolates. The current
3294
+ * play id is always `req.playName` so the per-parent child-call counter keys off
3295
+ * the executing play (matching the prior worker behavior). The play/tool/retry/
3296
+ * descendant/waterfall counters are likewise seeded from the inherited snapshot
3297
+ * (0 when absent, e.g. older callers); only parentChildCalls starts empty in
3298
+ * each worker.
3299
+ */
3300
+ function resumeGovernanceFromRequest(req: RunRequest): GovernanceSnapshot {
3301
+ const inherited = req.playCallGovernance;
3302
+ const rootRunId = inherited?.rootRunId || req.runId;
3303
+ const ancestryPlayIds = inherited?.ancestryPlayIds?.length
3304
+ ? // Per the lineage validator the inherited tail equals the parent; ensure
3305
+ // the chain ends with the currently-executing play for the cycle guard.
3306
+ inherited.ancestryPlayIds[inherited.ancestryPlayIds.length - 1] ===
3307
+ req.playName
3308
+ ? [...inherited.ancestryPlayIds]
3309
+ : [...inherited.ancestryPlayIds, req.playName]
3310
+ : [req.playName];
3311
+ const ancestryRunIds =
3312
+ rootRunId === req.runId ? [req.runId] : [rootRunId, req.runId];
3313
+ return {
3314
+ rootRunId,
3315
+ currentRunId: req.runId,
3316
+ currentPlayId: req.playName,
3317
+ ancestryPlayIds,
3318
+ ancestryRunIds,
3319
+ callDepth: inherited?.callDepth ?? 0,
3320
+ // Seed every lineage-global budget counter from the inherited snapshot so
3321
+ // descendant/tool/retry/waterfall budgets accumulate across isolates exactly
3322
+ // as they do across the cjs forkChild lineage. Without this they would reset
3323
+ // to 0 in each isolate and become per-worker — contradicting the Governor's
3324
+ // lineage-global budget contract. Fail-safe to 0 for older callers.
3325
+ playCallCount: inherited?.playCallCount ?? 0,
3326
+ toolCallCount: inherited?.toolCallCount ?? 0,
3327
+ retryCount: inherited?.retryCount ?? 0,
3328
+ descendantCount: inherited?.descendantCount ?? 0,
3329
+ waterfallStepExecutions: inherited?.waterfallStepExecutions ?? 0,
3330
+ parentChildCalls: {},
3331
+ };
3332
+ }
3333
+
3334
+ function createGovernorForRun(req: RunRequest): {
3335
+ governor: PlayExecutionGovernor;
3336
+ resolvePacing: WorkerPacingResolver;
3337
+ } {
3338
+ const resolvePacing = createWorkerPacingResolver(req);
3339
+ const governor = createPlayExecutionGovernor({
3340
+ adapter: 'esm_workers',
3341
+ scope: {
3342
+ orgId: req.orgId,
3343
+ rootRunId: req.playCallGovernance?.rootRunId ?? req.runId,
3344
+ },
3345
+ rateState: new CoordinatorRateStateBackend(createCoordinatorRatePort(req)),
3346
+ resolvePacing,
3347
+ resume: resumeGovernanceFromRequest(req),
3348
+ });
3349
+ return { governor, resolvePacing };
3194
3350
  }
3195
3351
 
3196
3352
  function createMinimalWorkerCtx(
@@ -3201,12 +3357,12 @@ function createMinimalWorkerCtx(
3201
3357
  abortSignal?: AbortSignal,
3202
3358
  callbacks?: WorkerCtxCallbacks,
3203
3359
  ): unknown {
3204
- let playCallCount = 0;
3205
- const parentChildCalls: Record<string, number> = {};
3360
+ const { governor, resolvePacing: resolveToolPacing } =
3361
+ createGovernorForRun(req);
3362
+ // Play-call depth/count/per-parent budgets, child-play concurrency, and the
3363
+ // lineage snapshot are owned by the Governor (createGovernorForRun above).
3364
+ // The worker keeps only substrate mechanism here.
3206
3365
  const stepCallCounts: Record<string, number> = {};
3207
- const inFlightChildCallsByPlayName: Record<string, number> = {};
3208
- let inFlightChildPlayCalls = 0;
3209
- const childPlaySlotWaiters: Array<() => void> = [];
3210
3366
  const secretRedactor = createSecretRedactionContext();
3211
3367
 
3212
3368
  const resolveSecretAuth = async (auth?: SecretAuth) => {
@@ -3245,38 +3401,6 @@ function createMinimalWorkerCtx(
3245
3401
  : { [auth.header.toLowerCase()]: value };
3246
3402
  };
3247
3403
 
3248
- const acquireChildPlaySlot = async (): Promise<() => void> => {
3249
- while (
3250
- inFlightChildPlayCalls >= WORKER_PLAY_CALL_LIMITS.maxConcurrentPlayCalls
3251
- ) {
3252
- await new Promise<void>((resolve, reject) => {
3253
- const waiter = () => {
3254
- abortSignal?.removeEventListener('abort', onAbort);
3255
- resolve();
3256
- };
3257
- const onAbort = () => {
3258
- const index = childPlaySlotWaiters.indexOf(waiter);
3259
- if (index >= 0) childPlaySlotWaiters.splice(index, 1);
3260
- reject(
3261
- abortSignal?.reason instanceof Error
3262
- ? abortSignal.reason
3263
- : new WorkflowAbortError(),
3264
- );
3265
- };
3266
- childPlaySlotWaiters.push(waiter);
3267
- abortSignal?.addEventListener('abort', onAbort, { once: true });
3268
- });
3269
- assertNotAborted(abortSignal);
3270
- }
3271
- inFlightChildPlayCalls += 1;
3272
- let released = false;
3273
- return () => {
3274
- if (released) return;
3275
- released = true;
3276
- inFlightChildPlayCalls = Math.max(0, inFlightChildPlayCalls - 1);
3277
- childPlaySlotWaiters.shift()?.();
3278
- };
3279
- };
3280
3404
  const rootGovernance = req.playCallGovernance;
3281
3405
  const rootRunId = rootGovernance?.rootRunId ?? req.runId;
3282
3406
  const receiptStore = createHarnessWorkerReceiptStore({
@@ -3401,6 +3525,7 @@ function createMinimalWorkerCtx(
3401
3525
  ...progress,
3402
3526
  updatedAt: progress.updatedAt ?? nowMs(),
3403
3527
  },
3528
+ forceFlush: true,
3404
3529
  });
3405
3530
  };
3406
3531
  const formatMapProgressMessage = (completed: number, total?: number) =>
@@ -3530,6 +3655,18 @@ function createMinimalWorkerCtx(
3530
3655
  completedRows: prepared.completedRows.length,
3531
3656
  },
3532
3657
  });
3658
+ updateMapProgress({
3659
+ completed: prepared.completedRows.length,
3660
+ total: chunkRows.length,
3661
+ startedAt: mapStartedAt,
3662
+ message:
3663
+ prepared.pendingRows.length > 0
3664
+ ? `${prepared.pendingRows.length.toLocaleString()} rows queued`
3665
+ : formatMapProgressMessage(
3666
+ prepared.completedRows.length,
3667
+ chunkRows.length,
3668
+ ),
3669
+ });
3533
3670
  const pendingKeys = new Set<string>();
3534
3671
  const pendingRowsByKey = new Map<string, Record<string, unknown>>();
3535
3672
  const completedKeys = new Set<string>();
@@ -3577,7 +3714,40 @@ function createMinimalWorkerCtx(
3577
3714
  0,
3578
3715
  prepared.skipped - missingPreparedRows.length,
3579
3716
  );
3580
- const concurrency = Math.max(1, Math.min(opts?.concurrency ?? 10, 100));
3717
+ let settledToolRequests = 0;
3718
+ let lastToolProgressAt = 0;
3719
+ const reportSettledToolRequests = (count: number) => {
3720
+ if (count <= 0) return;
3721
+ settledToolRequests += count;
3722
+ const now = nowMs();
3723
+ const estimatedCompleted = Math.min(
3724
+ chunkRows.length,
3725
+ prepared.completedRows.length + settledToolRequests,
3726
+ );
3727
+ const isTerminalEstimate = estimatedCompleted >= chunkRows.length;
3728
+ if (
3729
+ !isTerminalEstimate &&
3730
+ now - lastToolProgressAt < RUN_LEDGER_FLUSH_INTERVAL_MS
3731
+ ) {
3732
+ return;
3733
+ }
3734
+ lastToolProgressAt = now;
3735
+ updateMapProgress({
3736
+ completed: estimatedCompleted,
3737
+ total: chunkRows.length,
3738
+ startedAt: mapStartedAt,
3739
+ message: formatMapProgressMessage(
3740
+ estimatedCompleted,
3741
+ chunkRows.length,
3742
+ ),
3743
+ });
3744
+ };
3745
+ // Row concurrency comes from the Governor. NOTE(review): opts?.concurrency is
3746
+ // no longer passed, so an explicit map concurrency looks ignored — confirm
3747
+ // resolveRowConcurrency() needs no argument. Each row body also acquires a
3748
+ // global row slot (the Governor's rowMax semaphore) so in-flight rows across
3749
+ // all maps in this isolate stay bounded even when several maps run at once.
3750
+ const concurrency = governor.resolveRowConcurrency();
3581
3751
  const executedRows: Array<T & Record<string, unknown>> = new Array(
3582
3752
  rowsToExecute.length,
3583
3753
  );
@@ -3594,7 +3764,13 @@ function createMinimalWorkerCtx(
3594
3764
  >
3595
3765
  | undefined
3596
3766
  > = new Array(rowsToExecute.length);
3597
- const toolBatchScheduler = new WorkerToolBatchScheduler(req);
3767
+ const toolBatchScheduler = new WorkerToolBatchScheduler(
3768
+ req,
3769
+ governor,
3770
+ resolveToolPacing,
3771
+ abortSignal,
3772
+ reportSettledToolRequests,
3773
+ );
3598
3774
  const generatedOutputFields = new Set<string>();
3599
3775
  let idx = 0;
3600
3776
  const workers: Array<Promise<void>> = [];
@@ -3605,143 +3781,152 @@ function createMinimalWorkerCtx(
3605
3781
  if (abortSignal?.aborted) return;
3606
3782
  const myIndex = idx++;
3607
3783
  if (myIndex >= rowsToExecute.length) return;
3608
- const entry = uniqueRowsToExecuteEntries[myIndex]!;
3609
- const row = pendingRowsByKey.has(entry.rowKey)
3610
- ? ({
3611
- ...entry.row,
3612
- ...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
3613
- } as T & Record<string, unknown>)
3614
- : entry.row;
3615
- const absoluteIndex = entry.absoluteIndex;
3616
- const enriched: Record<string, unknown> = cloneCsvAliasedRow(row);
3617
- const fieldOutputs: Record<string, unknown> = {};
3618
- const cellMetaPatch: Record<
3619
- string,
3620
- {
3621
- status: 'cached' | 'skipped' | 'completed';
3622
- stage?: string | null;
3623
- reused?: boolean;
3624
- runId?: string;
3625
- completedAt?: number;
3626
- }
3627
- > = {};
3628
- const waterfallOutputs: RecordedWaterfallOutput[] = [];
3629
- const stepProgramOutputs: RecordedStepProgramOutput[] = [];
3630
- const rowCtx = {
3631
- ...(ctx as Record<string, unknown>),
3632
- tools: {
3633
- ...((ctx as { tools?: Record<string, unknown> }).tools ?? {}),
3634
- execute: async (requestArg: unknown): Promise<unknown> => {
3635
- assertNotAborted(abortSignal);
3636
- const request = normalizeToolExecuteArgs(requestArg);
3637
- return await toolBatchScheduler.execute(
3638
- request.id,
3639
- request.toolId,
3640
- request.input,
3641
- workflowStep,
3642
- );
3784
+ const rowSlot = await governor.acquireRowSlot({
3785
+ signal: abortSignal,
3786
+ });
3787
+ try {
3788
+ const entry = uniqueRowsToExecuteEntries[myIndex]!;
3789
+ const row = pendingRowsByKey.has(entry.rowKey)
3790
+ ? ({
3791
+ ...entry.row,
3792
+ ...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
3793
+ } as T & Record<string, unknown>)
3794
+ : entry.row;
3795
+ const absoluteIndex = entry.absoluteIndex;
3796
+ const enriched: Record<string, unknown> =
3797
+ cloneCsvAliasedRow(row);
3798
+ const fieldOutputs: Record<string, unknown> = {};
3799
+ const cellMetaPatch: Record<
3800
+ string,
3801
+ {
3802
+ status: 'cached' | 'skipped' | 'completed';
3803
+ stage?: string | null;
3804
+ reused?: boolean;
3805
+ runId?: string;
3806
+ completedAt?: number;
3807
+ }
3808
+ > = {};
3809
+ const waterfallOutputs: RecordedWaterfallOutput[] = [];
3810
+ const stepProgramOutputs: RecordedStepProgramOutput[] = [];
3811
+ const rowCtx = {
3812
+ ...(ctx as Record<string, unknown>),
3813
+ tools: {
3814
+ ...((ctx as { tools?: Record<string, unknown> }).tools ??
3815
+ {}),
3816
+ execute: async (requestArg: unknown): Promise<unknown> => {
3817
+ assertNotAborted(abortSignal);
3818
+ const request = normalizeToolExecuteArgs(requestArg);
3819
+ return await toolBatchScheduler.execute(
3820
+ request.id,
3821
+ request.toolId,
3822
+ request.input,
3823
+ workflowStep,
3824
+ );
3825
+ },
3643
3826
  },
3644
- },
3645
- waterfall: (
3646
- toolNameOrSpec: string | WorkerInlineWaterfallSpec,
3647
- waterfallInput: Record<string, unknown>,
3648
- waterfallOpts?: WorkerWaterfallOptions,
3649
- ) =>
3650
- executeWorkerWaterfall(
3651
- req,
3652
- waterfallOutputs,
3653
- toolNameOrSpec,
3654
- waterfallInput,
3655
- waterfallOpts,
3656
- callbacks,
3657
- workflowStep,
3658
- ),
3659
- };
3660
- for (const [key, value] of fieldEntries) {
3661
- const rawCellMeta =
3662
- enriched[DEEPLINE_CELL_META_FIELD] &&
3663
- typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
3664
- ? (
3665
- enriched[DEEPLINE_CELL_META_FIELD] as Record<
3666
- string,
3667
- unknown
3668
- >
3669
- )[key]
3670
- : null;
3671
- const reuseDecision = shouldRecomputeCell({
3672
- hasValue: isCompletedWorkerFieldValue(enriched[key]),
3673
- meta:
3674
- rawCellMeta && typeof rawCellMeta === 'object'
3675
- ? (rawCellMeta as {
3676
- status?: string;
3677
- completedAt?: number;
3678
- })
3679
- : null,
3680
- policy: cellPolicies?.[key],
3681
- });
3682
- if (reuseDecision.action === 'reuse') {
3683
- cellMetaPatch[key] = {
3684
- status: 'cached',
3685
- stage: key,
3686
- reused: true,
3687
- runId: req.runId,
3688
- };
3689
- continue;
3827
+ waterfall: (
3828
+ toolNameOrSpec: string | WorkerInlineWaterfallSpec,
3829
+ waterfallInput: Record<string, unknown>,
3830
+ waterfallOpts?: WorkerWaterfallOptions,
3831
+ ) =>
3832
+ executeWorkerWaterfall(
3833
+ req,
3834
+ waterfallOutputs,
3835
+ toolNameOrSpec,
3836
+ waterfallInput,
3837
+ waterfallOpts,
3838
+ callbacks,
3839
+ workflowStep,
3840
+ ),
3841
+ };
3842
+ for (const [key, value] of fieldEntries) {
3843
+ const rawCellMeta =
3844
+ enriched[DEEPLINE_CELL_META_FIELD] &&
3845
+ typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
3846
+ ? (
3847
+ enriched[DEEPLINE_CELL_META_FIELD] as Record<
3848
+ string,
3849
+ unknown
3850
+ >
3851
+ )[key]
3852
+ : null;
3853
+ const reuseDecision = shouldRecomputeCell({
3854
+ hasValue: isCompletedWorkerFieldValue(enriched[key]),
3855
+ meta:
3856
+ rawCellMeta && typeof rawCellMeta === 'object'
3857
+ ? (rawCellMeta as {
3858
+ status?: string;
3859
+ completedAt?: number;
3860
+ })
3861
+ : null,
3862
+ policy: cellPolicies?.[key],
3863
+ });
3864
+ if (reuseDecision.action === 'reuse') {
3865
+ cellMetaPatch[key] = {
3866
+ status: 'cached',
3867
+ stage: key,
3868
+ reused: true,
3869
+ runId: req.runId,
3870
+ };
3871
+ continue;
3872
+ }
3873
+ const resolved = await executeWorkerStepResolver(
3874
+ value,
3875
+ enriched,
3876
+ rowCtx,
3877
+ absoluteIndex,
3878
+ isWorkerStepProgram(value)
3879
+ ? {
3880
+ parentField: key,
3881
+ path: [],
3882
+ outputs: stepProgramOutputs,
3883
+ }
3884
+ : undefined,
3885
+ );
3886
+ enriched[key] = resolved.value;
3887
+ fieldOutputs[key] = resolved.value;
3888
+ if (resolved.status === 'skipped') {
3889
+ cellMetaPatch[key] = {
3890
+ status: 'skipped',
3891
+ stage: key,
3892
+ runId: req.runId,
3893
+ };
3894
+ } else {
3895
+ cellMetaPatch[key] = {
3896
+ status: 'completed',
3897
+ stage: key,
3898
+ runId: req.runId,
3899
+ completedAt: nowMs(),
3900
+ };
3901
+ }
3690
3902
  }
3691
- const resolved = await executeWorkerStepResolver(
3692
- value,
3693
- enriched,
3694
- rowCtx,
3695
- absoluteIndex,
3696
- isWorkerStepProgram(value)
3697
- ? {
3698
- parentField: key,
3699
- path: [],
3700
- outputs: stepProgramOutputs,
3701
- }
3702
- : undefined,
3703
- );
3704
- enriched[key] = resolved.value;
3705
- fieldOutputs[key] = resolved.value;
3706
- if (resolved.status === 'skipped') {
3707
- cellMetaPatch[key] = {
3708
- status: 'skipped',
3709
- stage: key,
3710
- runId: req.runId,
3711
- };
3712
- } else {
3713
- cellMetaPatch[key] = {
3714
- status: 'completed',
3715
- stage: key,
3716
- runId: req.runId,
3717
- completedAt: nowMs(),
3718
- };
3903
+ for (const stepOutput of stepProgramOutputs) {
3904
+ enriched[stepOutput.columnName] = stepOutput.value;
3905
+ fieldOutputs[stepOutput.columnName] = stepOutput.value;
3906
+ generatedOutputFields.add(stepOutput.columnName);
3907
+ if (stepOutput.status === 'skipped') {
3908
+ cellMetaPatch[stepOutput.columnName] = {
3909
+ status: 'skipped',
3910
+ stage: stepOutput.stepId,
3911
+ runId: req.runId,
3912
+ };
3913
+ }
3719
3914
  }
3720
- }
3721
- for (const stepOutput of stepProgramOutputs) {
3722
- enriched[stepOutput.columnName] = stepOutput.value;
3723
- fieldOutputs[stepOutput.columnName] = stepOutput.value;
3724
- generatedOutputFields.add(stepOutput.columnName);
3725
- if (stepOutput.status === 'skipped') {
3726
- cellMetaPatch[stepOutput.columnName] = {
3727
- status: 'skipped',
3728
- stage: stepOutput.stepId,
3729
- runId: req.runId,
3730
- };
3915
+ for (const waterfallOutput of waterfallOutputs) {
3916
+ const columnName =
3917
+ `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
3918
+ sqlishIdentifierPart(waterfallOutput.stepId);
3919
+ enriched[columnName] = waterfallOutput.value;
3920
+ generatedOutputFields.add(columnName);
3731
3921
  }
3922
+ executedCellMetaPatches[myIndex] =
3923
+ Object.keys(cellMetaPatch).length > 0
3924
+ ? cellMetaPatch
3925
+ : undefined;
3926
+ executedRows[myIndex] = enriched as T & Record<string, unknown>;
3927
+ } finally {
3928
+ rowSlot.release();
3732
3929
  }
3733
- for (const waterfallOutput of waterfallOutputs) {
3734
- const columnName =
3735
- `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
3736
- sqlishIdentifierPart(waterfallOutput.stepId);
3737
- enriched[columnName] = waterfallOutput.value;
3738
- generatedOutputFields.add(columnName);
3739
- }
3740
- executedCellMetaPatches[myIndex] =
3741
- Object.keys(cellMetaPatch).length > 0
3742
- ? cellMetaPatch
3743
- : undefined;
3744
- executedRows[myIndex] = enriched as T & Record<string, unknown>;
3745
3930
  }
3746
3931
  })(),
3747
3932
  );
@@ -4410,33 +4595,20 @@ function createMinimalWorkerCtx(
4410
4595
  childPlayName: resolvedName,
4411
4596
  input,
4412
4597
  })}${staleRuntimeSuffix(options?.staleAfterSeconds)}`;
4413
- if (ancestryPlayIds.includes(resolvedName)) {
4414
- const chain = [...ancestryPlayIds, resolvedName].join(' -> ');
4415
- throw new Error(`Recursive play graph detected: ${chain}`);
4416
- }
4417
- const nextDepth = callDepth + 1;
4418
- if (nextDepth > WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth) {
4419
- throw new Error(
4420
- `Play-call depth exceeded (${nextDepth}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth}) while calling ${resolvedName}.`,
4421
- );
4422
- }
4423
- const nextPlayCallCount = playCallCount + 1;
4424
- if (nextPlayCallCount > WORKER_PLAY_CALL_LIMITS.maxPlayCallCount) {
4425
- throw new Error(
4426
- `Root play-call budget exceeded (${nextPlayCallCount}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallCount}).`,
4427
- );
4428
- }
4429
- const nextParentCalls = (parentChildCalls[req.playName] ?? 0) + 1;
4430
- if (
4431
- nextParentCalls > WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent
4432
- ) {
4433
- throw new Error(
4434
- `Child play-call cap exceeded for ${req.playName} (${nextParentCalls}/${WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent}).`,
4435
- );
4436
- }
4437
4598
  return await executeWithRuntimeReceipt(receiptKey, async () => {
4438
- playCallCount = nextPlayCallCount;
4439
- parentChildCalls[req.playName] = nextParentCalls;
4599
+ // The Governor owns the play-call lineage: forkChild does the cycle
4600
+ // guard, depth/per-parent/playCall/descendant budget charges, and
4601
+ // returns the snapshot to thread into the child so budgets accumulate
4602
+ // across isolates. Charged inside the receipt boundary so a replay
4603
+ // (cache hit) never double-charges.
4604
+ const childRunId = `${req.runId}:child:${normalizedKey}`;
4605
+ const childGovernance = governor.forkChild({
4606
+ childPlayName: resolvedName,
4607
+ childRunId,
4608
+ });
4609
+ const nextDepth = childGovernance.callDepth;
4610
+ const nextParentCalls =
4611
+ governor.snapshot().parentChildCalls[req.playName] ?? 0;
4440
4612
 
4441
4613
  emitEvent({
4442
4614
  type: 'log',
@@ -4456,31 +4628,47 @@ function createMinimalWorkerCtx(
4456
4628
  const childNeedsWorkflowScheduler = childPipelineNeedsWorkflowScheduler(
4457
4629
  childManifest.staticPipeline,
4458
4630
  );
4459
- let childConcurrencyAcquired = false;
4460
- let releaseChildPlaySlot: (() => void) | null = null;
4461
- if (childIsDatasetBacked) {
4462
- const nextInFlight =
4463
- (inFlightChildCallsByPlayName[resolvedName] ?? 0) + 1;
4464
- if (nextInFlight > 1) {
4465
- throw new Error(
4466
- `Concurrent dataset-backed play call blocked for ${resolvedName}. ` +
4467
- 'A child play that uses ctx.dataset() cannot run more than once at the same time because its dataset tables share durable row identity. ' +
4468
- 'Run these child play calls sequentially, or give each concurrent branch a different child play/table contract.',
4469
- );
4470
- }
4471
- inFlightChildCallsByPlayName[resolvedName] = nextInFlight;
4472
- childConcurrencyAcquired = true;
4473
- }
4631
+ console.info('[play.runtime.span]', {
4632
+ event: 'play.runtime.span',
4633
+ phase: 'child_route',
4634
+ runId: req.runId,
4635
+ parentRunId: req.runId,
4636
+ playName: resolvedName,
4637
+ graphHash: req.graphHash ?? null,
4638
+ depth: nextDepth,
4639
+ fanoutIndex: nextParentCalls - 1,
4640
+ childIsDatasetBacked,
4641
+ childNeedsWorkflowScheduler,
4642
+ hasStaticPipeline: Boolean(childManifest.staticPipeline),
4643
+ childTableNamespace:
4644
+ typeof childManifest.staticPipeline?.tableNamespace === 'string'
4645
+ ? childManifest.staticPipeline.tableNamespace
4646
+ : null,
4647
+ childStageCount: Array.isArray(childManifest.staticPipeline?.stages)
4648
+ ? childManifest.staticPipeline.stages.length
4649
+ : null,
4650
+ childSubstepCount: Array.isArray(
4651
+ childManifest.staticPipeline?.substeps,
4652
+ )
4653
+ ? childManifest.staticPipeline.substeps.length
4654
+ : null,
4655
+ });
4656
+ let childPlaySlot: { release(): void } | null = null;
4474
4657
  try {
4475
- releaseChildPlaySlot = await acquireChildPlaySlot();
4658
+ childPlaySlot = await governor.acquireChildPlaySlot({
4659
+ signal: abortSignal,
4660
+ });
4476
4661
  const childSubmitStartedAt = nowMs();
4477
4662
  let started: {
4478
4663
  workflowId?: string;
4479
4664
  runId?: string;
4480
4665
  status?: string;
4666
+ mode?: string;
4481
4667
  output?: unknown;
4482
4668
  result?: unknown;
4483
4669
  error?: unknown;
4670
+ logs?: string[];
4671
+ timings?: Array<{ phase: string; ms: number }>;
4484
4672
  };
4485
4673
  try {
4486
4674
  started = await submitChildPlayThroughCoordinator({
@@ -4507,6 +4695,17 @@ function createMinimalWorkerCtx(
4507
4695
  // executor token's play name (the parent making this call).
4508
4696
  ancestryPlayIds,
4509
4697
  callDepth: nextDepth,
4698
+ // Cumulative lineage-global budget counters (incl. this
4699
+ // launch's play/descendant charges) so the child seeds its
4700
+ // budgets from the lineage total instead of resetting to 0 in
4701
+ // its isolate. Threading descendantCount in particular keeps
4702
+ // fan-out descendant accounting lineage-global, matching cjs.
4703
+ playCallCount: childGovernance.playCallCount,
4704
+ toolCallCount: childGovernance.toolCallCount,
4705
+ retryCount: childGovernance.retryCount,
4706
+ descendantCount: childGovernance.descendantCount,
4707
+ waterfallStepExecutions:
4708
+ childGovernance.waterfallStepExecutions,
4510
4709
  description:
4511
4710
  typeof options?.description === 'string'
4512
4711
  ? options.description
@@ -4528,6 +4727,21 @@ function createMinimalWorkerCtx(
4528
4727
  status: 'failed',
4529
4728
  errorCode: 'CHILD_SUBMIT_FAILED',
4530
4729
  });
4730
+ recordRunnerPerfTrace({
4731
+ req,
4732
+ phase: 'ctx_run_play.child_submit',
4733
+ ms: nowMs() - childSubmitStartedAt,
4734
+ extra: {
4735
+ status: 'failed',
4736
+ errorCode: 'CHILD_SUBMIT_FAILED',
4737
+ playName: resolvedName,
4738
+ key: normalizedKey,
4739
+ depth: nextDepth,
4740
+ fanoutIndex: nextParentCalls - 1,
4741
+ childIsDatasetBacked,
4742
+ childNeedsWorkflowScheduler,
4743
+ },
4744
+ });
4531
4745
  throw error;
4532
4746
  }
4533
4747
  const workflowId = started.workflowId ?? started.runId;
@@ -4558,6 +4772,26 @@ function createMinimalWorkerCtx(
4558
4772
  ms: nowMs() - childSubmitStartedAt,
4559
4773
  status: 'ok',
4560
4774
  });
4775
+ recordRunnerPerfTrace({
4776
+ req,
4777
+ phase: 'ctx_run_play.child_submit',
4778
+ ms: nowMs() - childSubmitStartedAt,
4779
+ extra: {
4780
+ status: 'ok',
4781
+ childRunId: workflowId,
4782
+ startedStatus: started.status ?? null,
4783
+ mode: started.mode ?? null,
4784
+ coordinatorTimings: Array.isArray(started.timings)
4785
+ ? started.timings
4786
+ : null,
4787
+ playName: resolvedName,
4788
+ key: normalizedKey,
4789
+ depth: nextDepth,
4790
+ fanoutIndex: nextParentCalls - 1,
4791
+ childIsDatasetBacked,
4792
+ childNeedsWorkflowScheduler,
4793
+ },
4794
+ });
4561
4795
  const startedStatus = String(started.status ?? '').toLowerCase();
4562
4796
  if (startedStatus === 'completed') {
4563
4797
  emitEvent({
@@ -4580,11 +4814,16 @@ function createMinimalWorkerCtx(
4580
4814
  throw new Error(startedErrorMessage);
4581
4815
  }
4582
4816
  const childWaitStartedAt = nowMs();
4583
- let result: unknown;
4817
+ let waitResult: ChildPlayTerminalWaitResult;
4584
4818
  try {
4585
- result = await waitForChildPlayTerminalEvent({
4586
- req,
4587
- workflowStep,
4819
+ waitResult = await awaitChildTerminal({
4820
+ parentRunId: req.runId,
4821
+ // CF's WorkflowStep.waitForEvent generic signature is wider than
4822
+ // the small structural shape ChildPlayAwait needs; bridge it the
4823
+ // same way the inline implementation did.
4824
+ workflowStep: workflowStep as unknown as
4825
+ | WorkflowStepLike
4826
+ | undefined,
4588
4827
  workflowId,
4589
4828
  playName: resolvedName,
4590
4829
  key: normalizedKey,
@@ -4592,6 +4831,22 @@ function createMinimalWorkerCtx(
4592
4831
  1_000,
4593
4832
  Math.min(options?.timeoutMs ?? 5 * 60_000, 30 * 60_000),
4594
4833
  ),
4834
+ coordinator: cachedCoordinatorBinding?.readChildTerminalState
4835
+ ? {
4836
+ readChildTerminalState: (
4837
+ parentRunId,
4838
+ eventKey,
4839
+ timeoutMs,
4840
+ ) =>
4841
+ cachedCoordinatorBinding!.readChildTerminalState!(
4842
+ parentRunId,
4843
+ eventKey,
4844
+ timeoutMs,
4845
+ ),
4846
+ }
4847
+ : null,
4848
+ now: nowMs,
4849
+ hashJson,
4595
4850
  });
4596
4851
  } catch (error) {
4597
4852
  console.info('[play.runtime.span]', {
@@ -4608,6 +4863,22 @@ function createMinimalWorkerCtx(
4608
4863
  status: 'failed',
4609
4864
  errorCode: 'CHILD_WAIT_FAILED',
4610
4865
  });
4866
+ recordRunnerPerfTrace({
4867
+ req,
4868
+ phase: 'ctx_run_play.child_wait',
4869
+ ms: nowMs() - childWaitStartedAt,
4870
+ extra: {
4871
+ status: 'failed',
4872
+ errorCode: 'CHILD_WAIT_FAILED',
4873
+ childRunId: workflowId,
4874
+ playName: resolvedName,
4875
+ key: normalizedKey,
4876
+ depth: nextDepth,
4877
+ fanoutIndex: nextParentCalls - 1,
4878
+ childIsDatasetBacked,
4879
+ childNeedsWorkflowScheduler,
4880
+ },
4881
+ });
4611
4882
  throw error;
4612
4883
  }
4613
4884
  console.info('[play.runtime.span]', {
@@ -4622,6 +4893,27 @@ function createMinimalWorkerCtx(
4622
4893
  fanoutIndex: nextParentCalls - 1,
4623
4894
  ms: nowMs() - childWaitStartedAt,
4624
4895
  status: 'ok',
4896
+ waitSource: waitResult.source,
4897
+ waitAttempts: waitResult.attempts ?? null,
4898
+ reportedWaitMs: waitResult.waitMs,
4899
+ });
4900
+ recordRunnerPerfTrace({
4901
+ req,
4902
+ phase: 'ctx_run_play.child_wait',
4903
+ ms: nowMs() - childWaitStartedAt,
4904
+ extra: {
4905
+ status: 'ok',
4906
+ childRunId: workflowId,
4907
+ playName: resolvedName,
4908
+ key: normalizedKey,
4909
+ depth: nextDepth,
4910
+ fanoutIndex: nextParentCalls - 1,
4911
+ childIsDatasetBacked,
4912
+ childNeedsWorkflowScheduler,
4913
+ waitSource: waitResult.source,
4914
+ waitAttempts: waitResult.attempts ?? null,
4915
+ reportedWaitMs: waitResult.waitMs,
4916
+ },
4625
4917
  });
4626
4918
  emitEvent({
4627
4919
  type: 'log',
@@ -4629,15 +4921,9 @@ function createMinimalWorkerCtx(
4629
4921
  message: `Completed child play ${resolvedName} (${normalizedKey})`,
4630
4922
  ts: nowMs(),
4631
4923
  });
4632
- return result;
4924
+ return waitResult.output;
4633
4925
  } finally {
4634
- releaseChildPlaySlot?.();
4635
- if (childConcurrencyAcquired) {
4636
- releaseChildPlayConcurrency(
4637
- inFlightChildCallsByPlayName,
4638
- resolvedName,
4639
- );
4640
- }
4926
+ childPlaySlot?.release();
4641
4927
  }
4642
4928
  });
4643
4929
  },
@@ -4813,6 +5099,135 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
4813
5099
  });
4814
5100
  }
4815
5101
 
/**
 * Runs one play request inline and answers with the whole outcome in a single
 * JSON body. The worker entrypoint routes `POST /run-inline` to this handler.
 *
 * Events emitted during execution are buffered in `events` and per-phase
 * wall-clock samples in `timings`; both arrays are returned in the success
 * payload and in the failure payload.
 *
 * @param request Incoming request; its body is parsed as JSON and cast to `RunRequest`.
 * @param env Worker environment passed to the binding-capture helpers, the
 *   harness probe and `executeRunRequest`.
 * @returns A JSON response with `status: 'completed'` (result, outputRows,
 *   durationMs, events, timings) or `status: 'failed'` (error, events,
 *   timings). Only the invalid-JSON branch sets an explicit HTTP status (400).
 */
5102
+ async function handleRunInline(
5103
+ request: Request,
5104
+ env: WorkerEnv,
5105
+ ): Promise<Response> {
5106
+ let req: RunRequest;
5107
+ try {
5108
+ req = (await request.json()) as RunRequest;
5109
+ } catch {
5110
+ return Response.json(
5111
+ {
5112
+ status: 'failed',
5113
+ error: { message: 'invalid JSON body' },
5114
+ },
5115
+ { status: 400 },
5116
+ );
5117
+ }
5118
+
5119
+ const events: RunnerEvent[] = [];
5120
+ const timings: InlineRunTiming[] = [];
// Appends one timing sample: elapsed ms since `phaseStartedAt`, plus `extra`
// when the caller supplies it.
5121
+ const traceInline = (
5122
+ phase: string,
5123
+ phaseStartedAt: number,
5124
+ extra?: Record<string, unknown>,
5125
+ ): void => {
5126
+ timings.push({
5127
+ phase,
5128
+ ms: nowMs() - phaseStartedAt,
5129
+ ...(extra ? { extra } : {}),
5130
+ });
5131
+ };
5132
+ const inlineStartedAt = nowMs();
// Everything from here on reports failure through the JSON body built in the
// catch block below rather than by throwing to the caller.
5133
+ try {
5134
+ const runPrefix = `[deepline-run:${req.runId}]`;
5135
+ captureCoordinatorBinding(env);
5136
+ captureRuntimeApiBinding(env);
5137
+ captureHarnessBinding(env);
5138
+ const probeStartedAt = nowMs();
5139
+ await probeHarnessOnce(env, runPrefix);
5140
+ traceInline('inline.probe_harness', probeStartedAt);
// Register the child run only when the request does not flag it as already
// registered.
5141
+ if (!req.inlineChildRunRegistered) {
5142
+ const registerStartedAt = nowMs();
5143
+ await registerInlineChildRun(req);
5144
+ traceInline('inline.register_child_run', registerStartedAt);
5145
+ } else {
// Registration skipped: the phase is still recorded, measured from `nowMs()`
// so its duration is ~0, and tagged `skipped: true`.
5146
+ traceInline('inline.register_child_run', nowMs(), { skipped: true });
5147
+ }
5148
+ const executeStartedAt = nowMs();
// The third argument buffers every emitted event into `events`; the fourth
// argument is passed as `undefined`.
5149
+ const output = await executeRunRequest(
5150
+ req,
5151
+ env,
5152
+ (event) => {
5153
+ events.push(event);
5154
+ },
5155
+ undefined,
5156
+ {
5157
+ persistResultDatasets: true,
5158
+ },
5159
+ );
5160
+ traceInline('inline.execute_run_request', executeStartedAt, {
5161
+ durationMs: output.durationMs,
5162
+ outputRows: output.outputRows,
5163
+ });
5164
+ traceInline('inline.total', inlineStartedAt);
5165
+ return Response.json({
5166
+ status: 'completed',
5167
+ result: output.result,
5168
+ outputRows: output.outputRows,
5169
+ durationMs: output.durationMs,
5170
+ events,
5171
+ timings,
5172
+ });
5173
+ } catch (error) {
// NOTE(review): `error` is cast, not narrowed. A thrown non-Error value falls
// back to String(err) for the message; a thrown null/undefined would make the
// `err.message` access itself throw inside this handler.
5174
+ const err = error as Error;
5175
+ events.push({
5176
+ type: 'error',
5177
+ message: err.message ?? String(err),
5178
+ stack: err.stack,
5179
+ ts: nowMs(),
5180
+ });
// No `status` init is passed here (unlike the 400 above), so the failure is
// carried by the body's `status: 'failed'`.
// NOTE(review): confirm callers inspect the body rather than the HTTP status.
5181
+ return Response.json({
5182
+ status: 'failed',
5183
+ error: {
5184
+ message: err.message ?? String(err),
5185
+ stack: err.stack,
5186
+ },
5187
+ events,
5188
+ timings,
5189
+ });
5190
+ }
5191
+ }
5192
+
/**
 * Posts a `start_inline_child_run` action to the runtime API for `req`.
 *
 * Fields are read defensively from `req.contractSnapshot`: a snapshot or
 * `artifactMetadata` value that fails `isRecord` is treated as `{}`, and
 * artifact fields that are not strings are sent as `undefined`.
 *
 * @param req Run request supplying `baseUrl`, `executorToken`, `playName`,
 *   `runId`, and the optional `playCallGovernance` / `contractSnapshot`.
 * @returns Resolves once `postRuntimeApi` resolves; its result is discarded.
 */
5193
+ async function registerInlineChildRun(req: RunRequest): Promise<void> {
5194
+ const snapshot = isRecord(req.contractSnapshot) ? req.contractSnapshot : {};
5195
+ const artifactMetadata = isRecord(snapshot.artifactMetadata)
5196
+ ? snapshot.artifactMetadata
5197
+ : {};
5198
+ const governance = req.playCallGovernance;
5199
+ await postRuntimeApi(req.baseUrl, req.executorToken, {
5200
+ action: 'start_inline_child_run',
5201
+ playName: req.playName,
5202
+ runId: req.runId,
// Family key preference: governance root run id, then governance parent run
// id, then this run's own id.
5203
+ workflowFamilyKey:
5204
+ governance?.rootRunId ?? governance?.parentRunId ?? req.runId,
5205
+ artifactStorageKey:
5206
+ typeof artifactMetadata.storageKey === 'string'
5207
+ ? artifactMetadata.storageKey
5208
+ : undefined,
5209
+ artifactHash:
5210
+ typeof artifactMetadata.artifactHash === 'string'
5211
+ ? artifactMetadata.artifactHash
5212
+ : undefined,
5213
+ graphHash:
5214
+ typeof artifactMetadata.graphHash === 'string'
5215
+ ? artifactMetadata.graphHash
5216
+ : undefined,
// Backend/profile identifiers are fixed literals for inline child runs.
5217
+ runtimeBackend: 'workers_edge',
5218
+ schedulerBackend: 'inline_child',
5219
+ executionProfile: 'workers_edge',
// Note: this reads the raw `req.contractSnapshot`, not the `isRecord`-guarded
// `snapshot` local used for the other fields.
5220
+ maxCreditsPerRun: extractMaxCreditsPerRun(req.contractSnapshot),
5221
+ staticPipeline: snapshot.staticPipeline ?? null,
// Any `source` other than the three recognised literals is sent as
// 'published'.
5222
+ source:
5223
+ snapshot.source === 'published' ||
5224
+ snapshot.source === 'ad_hoc' ||
5225
+ snapshot.source === 'draft'
5226
+ ? snapshot.source
5227
+ : 'published',
5228
+ });
5229
+ }
5230
+
4816
5231
  /** Cap on run log lines retained in the terminal output compatibility shape. */
4817
5232
  const RUN_LOG_BUFFER_LIMIT = 500;
4818
5233
  /** Min wall-clock interval between live run-ledger flushes during a run. */
@@ -4890,6 +5305,100 @@ async function executeRunRequest(
4890
5305
 
4891
5306
  const stepProgressSnapshot = () => ({ ...stepProgressByNodeId });
4892
5307
 
/**
 * Publishes a `progress` event for this run to the coordinator's
 * `/dedup/{runId}/event-add` endpoint.
 *
 * Reads `req`, `runLogBuffer` and `stepProgressSnapshot` from the enclosing
 * scope. A `runner.coordinator_progress_publish` perf trace is recorded for
 * the skipped, failed and ok outcomes; a rejected `fetch` propagates without
 * one.
 *
 * @param occurredAt Timestamp sent as both `ts` and `updatedAt` in the event body.
 * @throws Error when the coordinator answers with a non-OK HTTP status.
 */
5308
+ const publishCoordinatorProgressEvent = async (
5309
+ occurredAt: number,
5310
+ ): Promise<void> => {
5311
+ const coordinatorUrl = req.coordinatorUrl?.trim();
// A missing or blank coordinator URL is a no-op, traced with ms: 0.
5312
+ if (!coordinatorUrl) {
5313
+ recordRunnerPerfTrace({
5314
+ req,
5315
+ phase: 'runner.coordinator_progress_publish',
5316
+ ms: 0,
5317
+ extra: { status: 'skipped_no_url' },
5318
+ });
5319
+ return;
5320
+ }
5321
+ const publishStartedAt = nowMs();
5322
+ const liveNodeProgress = stepProgressSnapshot();
// "Active" node: the first entry without a numeric `completedAt`; when every
// entry has one, the last entry; when there are no entries, undefined.
5323
+ const activeEntry =
5324
+ Object.entries(liveNodeProgress).find(
5325
+ ([, progress]) => typeof progress.completedAt !== 'number',
5326
+ ) ?? Object.entries(liveNodeProgress).at(-1);
5327
+ const activeNodeId = activeEntry?.[0] ?? null;
5328
+ const activeProgress = activeEntry?.[1] ?? null;
5329
+ const activeArtifactTableNamespace =
5330
+ typeof activeProgress?.artifactTableNamespace === 'string'
5331
+ ? activeProgress.artifactTableNamespace
5332
+ : null;
// `activeCompleted`, `activeTotal` and `activeMessage` are used only in the
// perf-trace extras below; they are not part of the POST body.
5333
+ const activeCompleted =
5334
+ typeof activeProgress?.completed === 'number'
5335
+ ? activeProgress.completed
5336
+ : null;
5337
+ const activeTotal =
5338
+ typeof activeProgress?.total === 'number' ? activeProgress.total : null;
5339
+ const activeMessage =
5340
+ typeof activeProgress?.message === 'string'
5341
+ ? activeProgress.message
5342
+ : null;
// One trailing slash is stripped from the coordinator URL and the run id is
// URI-encoded into the path.
// NOTE(review): there is no try/catch around this fetch, so a network-level
// rejection leaves this function without a 'failed' perf trace.
5343
+ const response = await fetch(
5344
+ `${coordinatorUrl.replace(/\/$/, '')}/dedup/${encodeURIComponent(
5345
+ req.runId,
5346
+ )}/event-add`,
5347
+ {
5348
+ method: 'POST',
5349
+ headers: {
5350
+ 'x-deepline-request-id': makeRequestId(),
5351
+ ...coordinatorRequestHeaders({
5352
+ runId: req.runId,
5353
+ contentType: 'application/json',
5354
+ internalToken: req.coordinatorInternalToken,
5355
+ }),
5356
+ },
5357
+ body: JSON.stringify({
5358
+ runId: req.runId,
5359
+ type: 'progress',
5360
+ status: 'running',
5361
+ ts: occurredAt,
5362
+ logs: runLogBuffer,
5363
+ activeNodeId,
5364
+ activeArtifactTableNamespace,
5365
+ updatedAt: occurredAt,
5366
+ liveNodeProgress,
5367
+ }),
5368
+ },
5369
+ );
// Non-OK response: trace the failure with the HTTP status, then throw.
5370
+ if (!response.ok) {
5371
+ recordRunnerPerfTrace({
5372
+ req,
5373
+ phase: 'runner.coordinator_progress_publish',
5374
+ ms: nowMs() - publishStartedAt,
5375
+ extra: {
5376
+ status: 'failed',
5377
+ httpStatus: response.status,
5378
+ activeNodeId,
5379
+ activeArtifactTableNamespace,
5380
+ activeCompleted,
5381
+ activeTotal,
5382
+ activeMessage,
5383
+ },
5384
+ });
5385
+ throw new Error(`coordinator progress event failed ${response.status}`);
5386
+ }
5387
+ recordRunnerPerfTrace({
5388
+ req,
5389
+ phase: 'runner.coordinator_progress_publish',
5390
+ ms: nowMs() - publishStartedAt,
5391
+ extra: {
5392
+ status: 'ok',
5393
+ activeNodeId,
5394
+ activeArtifactTableNamespace,
5395
+ activeCompleted,
5396
+ activeTotal,
5397
+ activeMessage,
5398
+ },
5399
+ });
5400
+ };
5401
+
4893
5402
  const appendStepLifecycleEvent = (event: PlayStepLifecycleEvent) => {
4894
5403
  updateStepProgress({
4895
5404
  nodeId: event.nodeId,
@@ -4957,6 +5466,12 @@ async function executeRunRequest(
4957
5466
  progress.artifactTableNamespace === null
4958
5467
  ? { artifactTableNamespace: progress.artifactTableNamespace }
4959
5468
  : {}),
5469
+ ...(typeof progress.startedAt === 'number'
5470
+ ? { startedAt: progress.startedAt }
5471
+ : {}),
5472
+ ...(typeof progress.completedAt === 'number'
5473
+ ? { completedAt: progress.completedAt }
5474
+ : {}),
4960
5475
  updatedAt:
4961
5476
  typeof progress.updatedAt === 'number'
4962
5477
  ? progress.updatedAt
@@ -5005,6 +5520,7 @@ async function executeRunRequest(
5005
5520
  pendingLedgerEvents = [...events, ...pendingLedgerEvents];
5006
5521
  throw new Error('runtime run-ledger append failed');
5007
5522
  }
5523
+ await publishCoordinatorProgressEvent(now).catch(() => undefined);
5008
5524
  })
5009
5525
  .catch(() => undefined);
5010
5526
  };
@@ -5048,7 +5564,7 @@ async function executeRunRequest(
5048
5564
  const workerCallbacks: WorkerCtxCallbacks = {
5049
5565
  onNodeProgress: (input) => {
5050
5566
  updateStepProgress(input);
5051
- flushLedgerEvents(false);
5567
+ flushLedgerEvents(Boolean(input.forceFlush));
5052
5568
  },
5053
5569
  onMapStarted: (nodeId, at) => stepLifecycle?.onMapStarted(nodeId, at),
5054
5570
  onMapCompleted: (nodeId, at) => stepLifecycle?.onMapCompleted(nodeId, at),
@@ -5081,6 +5597,20 @@ async function executeRunRequest(
5081
5597
  abortSignal,
5082
5598
  workerCallbacks,
5083
5599
  );
5600
+ // Hard wall-clock cap on active user-code runtime. CF Workflows does not
5601
+ // impose a play-level execution ceiling on this substrate, so without this a
5602
+ // runaway play (infinite loop, stuck await) would only stop when the executor
5603
+ // token expires. Aborting the controller surfaces cooperatively through the
5604
+ // same assertNotAborted checks used for harness cancellation.
5605
+ let runtimeLimitExceeded = false;
5606
+ const runtimeDeadlineTimer = setTimeout(() => {
5607
+ runtimeLimitExceeded = true;
5608
+ if (!abortSignal.aborted) {
5609
+ abortController.abort(
5610
+ `Play runtime limit exceeded after ${STANDARD_PLAY_RUNTIME_LIMIT_SECONDS}s.`,
5611
+ );
5612
+ }
5613
+ }, STANDARD_PLAY_RUNTIME_LIMIT_SECONDS * 1000);
5084
5614
  try {
5085
5615
  const playStartedAt = nowMs();
5086
5616
  const result = await (
@@ -5102,6 +5632,33 @@ async function executeRunRequest(
5102
5632
  phase: 'runner.serialize_result',
5103
5633
  ms: nowMs() - serializeStartedAt,
5104
5634
  });
5635
+ const terminalResult = trimResultForStatus(serializedResult);
5636
+ let parentSignalPromise: Promise<void> | null = null;
5637
+ const startParentTerminalSignal = (): Promise<void> => {
5638
+ if (!parentSignalPromise) {
5639
+ const parentSignalStartedAt = nowMs();
5640
+ parentSignalPromise = signalParentPlayTerminal({
5641
+ req,
5642
+ status: 'completed',
5643
+ result: terminalResult as Record<string, unknown>,
5644
+ })
5645
+ .catch((error) => {
5646
+ console.error(
5647
+ `[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
5648
+ error instanceof Error ? error.message : String(error)
5649
+ }`,
5650
+ );
5651
+ })
5652
+ .finally(() => {
5653
+ recordRunnerPerfTrace({
5654
+ req,
5655
+ phase: 'runner.parent_terminal_signal',
5656
+ ms: nowMs() - parentSignalStartedAt,
5657
+ });
5658
+ });
5659
+ }
5660
+ return parentSignalPromise;
5661
+ };
5105
5662
  if (options?.persistResultDatasets) {
5106
5663
  const ledgerFlushWaitStartedAt = nowMs();
5107
5664
  await ledgerFlushInFlight;
@@ -5117,7 +5674,7 @@ async function executeRunRequest(
5117
5674
  phase: 'runner.persist_result_datasets',
5118
5675
  ms: nowMs() - resultDatasetStartedAt,
5119
5676
  });
5120
- const terminalResult = trimResultForStatus(serializedResult);
5677
+ const parentSignal = startParentTerminalSignal();
5121
5678
  const terminalOccurredAt = nowMs();
5122
5679
  const terminalUpdateStartedAt = nowMs();
5123
5680
  await flushTerminalLedgerEvents({
@@ -5161,24 +5718,9 @@ async function executeRunRequest(
5161
5718
  await nonBlockingBillingPromise;
5162
5719
  }
5163
5720
  }
5721
+ await parentSignal;
5164
5722
  }
5165
- const parentSignalStartedAt = nowMs();
5166
- await signalParentPlayTerminal({
5167
- req,
5168
- status: 'completed',
5169
- result: trimResultForStatus(serializedResult) as Record<string, unknown>,
5170
- }).catch((error) => {
5171
- console.error(
5172
- `[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
5173
- error instanceof Error ? error.message : String(error)
5174
- }`,
5175
- );
5176
- });
5177
- recordRunnerPerfTrace({
5178
- req,
5179
- phase: 'runner.parent_terminal_signal',
5180
- ms: nowMs() - parentSignalStartedAt,
5181
- });
5723
+ await startParentTerminalSignal();
5182
5724
  recordRunnerPerfTrace({
5183
5725
  req,
5184
5726
  phase: 'runner.execute_total',
@@ -5194,7 +5736,10 @@ async function executeRunRequest(
5194
5736
  };
5195
5737
  } catch (error) {
5196
5738
  stepLifecycle?.markStartedFailed(nowMs());
5197
- const aborted = isAbortLikeError(error);
5739
+ // A runtime-limit abort is a timeout failure, not a user cancellation, so
5740
+ // it should be reported as run.failed with the limit message rather than
5741
+ // run.cancelled.
5742
+ const aborted = isAbortLikeError(error) && !runtimeLimitExceeded;
5198
5743
  if (aborted) {
5199
5744
  // Flip the controller so any concurrent user code observes the abort
5200
5745
  // through ctx.signal. We mark the run cancelled instead of failed.
@@ -5253,6 +5798,8 @@ async function executeRunRequest(
5253
5798
  error: message,
5254
5799
  }).catch(() => null);
5255
5800
  throw error;
5801
+ } finally {
5802
+ clearTimeout(runtimeDeadlineTimer);
5256
5803
  }
5257
5804
  }
5258
5805
 
@@ -5851,6 +6398,9 @@ const workerEntrypoint = {
5851
6398
  },
5852
6399
  });
5853
6400
  }
6401
+ if (request.method === 'POST' && url.pathname === '/run-inline') {
6402
+ return handleRunInline(request, env);
6403
+ }
5854
6404
  if (request.method === 'POST' && url.pathname === '/run') {
5855
6405
  return handleRun(request, env);
5856
6406
  }