deepline 0.1.79 → 0.1.80

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/dist/cli/index.js +68 -31
  2. package/dist/cli/index.mjs +68 -31
  3. package/dist/index.d.mts +9 -1
  4. package/dist/index.d.ts +9 -1
  5. package/dist/index.js +7 -4
  6. package/dist/index.mjs +7 -4
  7. package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
  8. package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1102 -1616
  9. package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +506 -654
  10. package/dist/repo/apps/play-runner-workers/src/entry.ts +896 -354
  11. package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +8 -2
  12. package/dist/repo/sdk/src/client.ts +9 -2
  13. package/dist/repo/sdk/src/release.ts +2 -2
  14. package/dist/repo/sdk/src/types.ts +5 -0
  15. package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
  16. package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
  17. package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
  18. package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
  19. package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
  20. package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
  21. package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
  22. package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
  23. package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
  24. package/dist/repo/shared_libs/plays/static-pipeline.ts +56 -3
  25. package/dist/repo/shared_libs/temporal/constants.ts +38 -0
  26. package/package.json +1 -1
@@ -44,6 +44,22 @@ import {
44
44
  type ChunkExecutionResult,
45
45
  } from '../../../shared_libs/play-runtime/batch-runtime';
46
46
  import { getDefaultPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/default-batch-strategies';
47
+ import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
48
+ import {
49
+ createPlayExecutionGovernor,
50
+ type GovernanceSnapshot,
51
+ type PlayExecutionGovernor,
52
+ } from '../../../shared_libs/play-runtime/governor/governor';
53
+ import {
54
+ CoordinatorRateStateBackend,
55
+ type CoordinatorRatePort,
56
+ } from '../../../shared_libs/play-runtime/governor/coordinator-rate-state-backend';
57
+ import type { PacingRule } from '../../../shared_libs/play-runtime/governor/rate-state-backend';
58
+ import {
59
+ awaitChildTerminal,
60
+ type ChildPlayTerminalWaitResult,
61
+ type WorkflowStepLike,
62
+ } from './child-play-await';
47
63
  import type { AnyBatchOperationStrategy } from '../../../shared_libs/play-runtime/batching-types';
48
64
  import {
49
65
  createToolBatchExecutor,
@@ -201,6 +217,8 @@ type RunRequest = {
201
217
  /** Internal ctx.runPlay lineage. Public SDK/users never see this. */
202
218
  playCallGovernance?: PlayCallGovernanceSnapshot | null;
203
219
  preloadedDbSessions?: PreloadedRuntimeDbSession[] | null;
220
+ /** Coordinator already created the child run row before invoking /run-inline. */
221
+ inlineChildRunRegistered?: boolean | null;
204
222
  /** Cloudflare coordinator URL for direct Workflow control-plane signals. */
205
223
  coordinatorUrl?: string | null;
206
224
  /** Request-scoped coordinator auth token for preview/dev direct control calls. */
@@ -253,11 +271,21 @@ type WorkerEnv = {
253
271
  * `/api/v2/plays/runtime-tools/*`) skip the public callback URL and route
254
272
  * directly through the coordinator's process to the configured app — saves
255
273
  * the *.workers.dev → CF edge → cloudflared → localhost chain on every
256
- * runtime callback. Absent on legacy coordinator deploys; the fetch
257
- * helpers fall back to `globalThis.fetch(req.baseUrl + path)`.
274
+ * runtime callback. Required for workers_edge; missing binding is an infra
275
+ * error instead of a transport fallback.
258
276
  */
259
277
  RUNTIME_API?: {
260
- fetch(input: Request): Promise<Response>;
278
+ runtimeApiCall(input: {
279
+ executorToken: string;
280
+ path: string;
281
+ body: unknown;
282
+ headers?: Record<string, string>;
283
+ timeoutMs?: number;
284
+ }): Promise<{
285
+ status: number;
286
+ headers?: Record<string, string>;
287
+ body: string;
288
+ }>;
261
289
  };
262
290
  /**
263
291
  * Loopback RPC binding into the coordinator Worker. Used for CF-to-CF
@@ -280,6 +308,20 @@ type WorkerEnv = {
280
308
  logs?: string[];
281
309
  timings?: Array<{ phase: string; ms: number }>;
282
310
  }>;
311
+ submitWorkflowChild?(
312
+ parentRunId: string,
313
+ body: Record<string, unknown>,
314
+ ): Promise<{
315
+ workflowId?: string;
316
+ runId?: string;
317
+ status?: string;
318
+ mode?: string;
319
+ output?: unknown;
320
+ result?: unknown;
321
+ error?: unknown;
322
+ logs?: string[];
323
+ timings?: Array<{ phase: string; ms: number }>;
324
+ }>;
283
325
  signal(
284
326
  runId: string,
285
327
  body: Record<string, unknown>,
@@ -292,6 +334,26 @@ type WorkerEnv = {
292
334
  runId: string,
293
335
  event: Record<string, unknown>,
294
336
  ): Promise<void>;
337
+ readTerminalState?(runId: string): Promise<Record<string, unknown> | null>;
338
+ readChildTerminalState?(
339
+ parentRunId: string,
340
+ eventKey: string,
341
+ timeoutMs?: number,
342
+ ): Promise<Record<string, unknown> | null>;
343
+ /**
344
+ * Distributed Rate State Backend RPC. Routes to the per-(org,provider)
345
+ * rate-bucket Durable Object so the request window is global across
346
+ * isolates. See CoordinatorRateStateBackend + dedup-do.ts.
347
+ */
348
+ rateAcquire?(input: {
349
+ bucketId: string;
350
+ rules: PacingRule[];
351
+ requested: number;
352
+ }): Promise<{ granted: number; waitMs: number }>;
353
+ ratePenalize?(input: {
354
+ bucketId: string;
355
+ cooldownMs: number;
356
+ }): Promise<void>;
295
357
  };
296
358
  /**
297
359
  * Required service binding to the long-lived Play Harness Worker
@@ -379,10 +441,9 @@ async function probeHarnessOnce(
379
441
  }
380
442
  }
381
443
  /**
382
- * Routes runtime API requests through the in-process RUNTIME_API binding when
383
- * Cloudflare exposes the coordinator WorkerEntrypoint export. Some workflow
384
- * execution paths do not expose those exports; there we keep the older public
385
- * fetch transport so the play still reaches the same authenticated handler.
444
+ * Routes runtime API requests through the in-process RUNTIME_API service
445
+ * binding. workers_edge treats a missing binding as infrastructure failure
446
+ * instead of falling back to public HTTP.
386
447
  */
387
448
  const RUNTIME_API_TIMEOUT_MS = 30_000;
388
449
  const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
@@ -390,7 +451,6 @@ const RUNTIME_API_INTEGRATION_EXECUTE_TIMEOUT_MS = 180_000;
390
451
  const RUNTIME_API_RETRY_DELAYS_MS = [
391
452
  250, 750, 1500, 3000, 5000, 10000,
392
453
  ] as const;
393
- let loggedMissingRuntimeApiBinding = false;
394
454
 
395
455
  async function fetchRuntimeApi(
396
456
  baseUrl: string,
@@ -418,37 +478,25 @@ async function fetchRuntimeApi(
418
478
  try {
419
479
  const mergedInit: RequestInit = {
420
480
  ...init,
421
- headers: runtimeApiHeaders(init.headers, cachedRuntimeApiBinding == null),
481
+ headers: runtimeApiHeaders(init.headers, false),
422
482
  signal: controller.signal,
423
483
  };
424
484
  if (!cachedRuntimeApiBinding) {
425
- if (!loggedMissingRuntimeApiBinding) {
426
- loggedMissingRuntimeApiBinding = true;
427
- console.warn(
428
- `[play-harness] RUNTIME_API binding missing; using public runtime API transport. path=${path}`,
429
- );
430
- }
431
- return await Promise.race([
432
- fetch(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
433
- timeoutPromise,
434
- ]);
485
+ throw new Error('[play-harness] RUNTIME_API service binding is required');
435
486
  }
436
- const responsePromise = cachedRuntimeApiBinding.fetch(
437
- new Request(`${baseUrl.replace(/\/$/, '')}${path}`, mergedInit),
487
+ const responsePromise = callRuntimeApiRpcBinding(
488
+ cachedRuntimeApiBinding,
489
+ mergedInit,
490
+ {
491
+ path,
492
+ timeoutMs,
493
+ },
438
494
  );
439
495
  const response = await Promise.race([responsePromise, timeoutPromise]);
440
- if (await shouldFallbackRuntimeApiBindingResponse(response)) {
441
- console.warn(
442
- `[play-harness] RUNTIME_API binding returned coordinator not found; using public runtime API transport. path=${path}`,
496
+ if (await isRuntimeApiBindingNotFoundResponse(response)) {
497
+ throw new Error(
498
+ `[play-harness] RUNTIME_API service binding could not route ${path}; coordinator returned not found.`,
443
499
  );
444
- return await Promise.race([
445
- fetch(`${baseUrl.replace(/\/$/, '')}${path}`, {
446
- ...init,
447
- headers: runtimeApiHeaders(init.headers, true),
448
- signal: controller.signal,
449
- }),
450
- timeoutPromise,
451
- ]);
452
500
  }
453
501
  return response;
454
502
  } catch (err) {
@@ -463,7 +511,33 @@ async function fetchRuntimeApi(
463
511
  }
464
512
  }
465
513
 
466
- async function shouldFallbackRuntimeApiBindingResponse(
514
+ async function callRuntimeApiRpcBinding(
515
+ binding: NonNullable<WorkerEnv['RUNTIME_API']>,
516
+ init: RequestInit,
517
+ input: { path: string; timeoutMs: number },
518
+ ): Promise<Response> {
519
+ const h = new Headers(init.headers);
520
+ const authorization = h.get('authorization') ?? '';
521
+ const headers: Record<string, string> = {};
522
+ const metadata = h.get(EXECUTE_TOOL_METADATA_HEADER);
523
+ if (metadata) headers[EXECUTE_TOOL_METADATA_HEADER] = metadata;
524
+ const contract = h.get(EXECUTE_RESPONSE_CONTRACT_HEADER);
525
+ if (contract) headers[EXECUTE_RESPONSE_CONTRACT_HEADER] = contract;
526
+ const rawBody = typeof init.body === 'string' ? init.body : '';
527
+ const result = await binding.runtimeApiCall({
528
+ executorToken: authorization.replace(/^Bearer\s+/i, '').trim(),
529
+ path: input.path,
530
+ body: rawBody ? JSON.parse(rawBody) : {},
531
+ headers,
532
+ timeoutMs: input.timeoutMs,
533
+ });
534
+ return new Response(result.body, {
535
+ status: result.status,
536
+ headers: result.headers ?? {},
537
+ });
538
+ }
539
+
540
+ async function isRuntimeApiBindingNotFoundResponse(
467
541
  response: Response,
468
542
  ): Promise<boolean> {
469
543
  if (response.status !== 404) {
@@ -494,13 +568,6 @@ function cachedVercelProtectionBypassToken(): string | null {
494
568
  return cachedRuntimeApiVercelBypassToken;
495
569
  }
496
570
 
497
- const WORKER_PLAY_CALL_LIMITS = {
498
- maxPlayCallDepth: 6,
499
- maxPlayCallCount: 1_000,
500
- maxChildPlayCallsPerParent: 1_000,
501
- maxConcurrentPlayCalls: 16,
502
- };
503
-
504
571
  type RunnerEvent =
505
572
  | {
506
573
  type: 'log';
@@ -520,6 +587,12 @@ type WorkflowRunOutput = {
520
587
  durationMs: number;
521
588
  };
522
589
 
590
+ type InlineRunTiming = {
591
+ phase: string;
592
+ ms: number;
593
+ extra?: Record<string, unknown>;
594
+ };
595
+
523
596
  type WorkerCtxCallbacks = {
524
597
  onNodeProgress?: (input: {
525
598
  nodeId: string;
@@ -658,9 +731,8 @@ async function postRuntimeApi<T>(
658
731
  executorToken: string,
659
732
  body: unknown,
660
733
  ): Promise<T> {
661
- // Routes through the in-process RUNTIME_API binding when present; otherwise
662
- // falls back to a public fetch against `${baseUrl}${path}`. Either path
663
- // hits the same handler with the same auth — only the transport changes.
734
+ // Routes through the in-process RUNTIME_API service binding. Missing binding
735
+ // is an infra error in workers_edge, not a reason to fall back to public HTTP.
664
736
  const serializedBody = JSON.stringify(body);
665
737
  let lastError: unknown = null;
666
738
  for (
@@ -797,6 +869,15 @@ async function submitChildPlayThroughCoordinator(input: {
797
869
  }
798
870
  return cachedCoordinatorBinding.submitChild(input.req.runId, input.body);
799
871
  }
872
+ if (cachedCoordinatorBinding?.submitWorkflowChild) {
873
+ if (!isRecord(input.body)) {
874
+ throw new Error('ctx.runPlay child submit requires an object body.');
875
+ }
876
+ return cachedCoordinatorBinding.submitWorkflowChild(
877
+ input.req.runId,
878
+ input.body,
879
+ );
880
+ }
800
881
  const coordinatorUrl = input.req.coordinatorUrl?.trim();
801
882
  if (coordinatorUrl) {
802
883
  // Keep child plays on the same coordinator/Workflow submit path as
@@ -924,46 +1005,6 @@ function workflowTimeoutFromMs(timeoutMs: number): string {
924
1005
  return `${seconds} second${seconds === 1 ? '' : 's'}`;
925
1006
  }
926
1007
 
927
- async function waitForChildPlayTerminalEvent(input: {
928
- req: RunRequest;
929
- workflowStep?: WorkflowStep;
930
- workflowId: string;
931
- playName: string;
932
- key: string;
933
- timeoutMs: number;
934
- }): Promise<unknown> {
935
- if (!input.workflowStep) {
936
- throw new Error(
937
- 'ctx.runPlay child waits require the cf-workflows runtime event scheduler.',
938
- );
939
- }
940
- const eventKey = await childPlayEventKey({
941
- key: input.key,
942
- workflowId: input.workflowId,
943
- });
944
- const event = (await (
945
- input.workflowStep.waitForEvent as unknown as (
946
- name: string,
947
- options: { type: string; timeout: string },
948
- ) => Promise<{ payload: unknown }>
949
- )(`child_play_terminal:${eventKey}`, {
950
- type: integrationEventType(eventKey),
951
- timeout: workflowTimeoutFromMs(input.timeoutMs),
952
- })) as { payload: unknown };
953
- const rawPayload = isRecord(event.payload) ? event.payload : {};
954
- const payload = isRecord(rawPayload.data) ? rawPayload.data : rawPayload;
955
- const status = String(payload.status ?? '').toLowerCase();
956
- if (status === 'completed') {
957
- return extractChildPlayOutput(payload);
958
- }
959
- const error = isRecord(payload.error) ? payload.error : null;
960
- const message =
961
- (typeof error?.message === 'string' && error.message.trim()) ||
962
- (typeof payload.error === 'string' && payload.error.trim()) ||
963
- `Child play ${input.playName} (${input.workflowId}) finished with status ${status || 'unknown'}.`;
964
- throw new Error(message);
965
- }
966
-
967
1008
  async function signalParentPlayTerminal(input: {
968
1009
  req: RunRequest;
969
1010
  status: 'completed' | 'failed' | 'cancelled';
@@ -1045,6 +1086,8 @@ async function executeTool(
1045
1086
  req: RunRequest,
1046
1087
  args: { id: string; toolId: string; input: Record<string, unknown> },
1047
1088
  workflowStep?: WorkflowStep,
1089
+ onProviderBackpressure?: (retryAfterMs: number) => void,
1090
+ onRetryAttempt?: () => void,
1048
1091
  ): Promise<ToolExecuteResult> {
1049
1092
  if (args.toolId === 'test_wait_for_event' && workflowStep) {
1050
1093
  const result = await waitForSyntheticIntegrationEvent(
@@ -1059,7 +1102,7 @@ async function executeTool(
1059
1102
  // service bindings, NOT through HTTP from this worker. Removing the
1060
1103
  // dispatcher-side coordinatorUrl plumbing intentionally turns the old
1061
1104
  // HTTP-based dedup helpers into dead code.
1062
- return callToolDirect(req, args);
1105
+ return callToolDirect(req, args, onProviderBackpressure, onRetryAttempt);
1063
1106
  }
1064
1107
 
1065
1108
  async function executeToolWithLifecycle(
@@ -1193,6 +1236,13 @@ async function waitForSyntheticIntegrationEvent(
1193
1236
  async function callToolDirect(
1194
1237
  req: RunRequest,
1195
1238
  args: { id: string; toolId: string; input: Record<string, unknown> },
1239
+ onProviderBackpressure?: (retryAfterMs: number) => void,
1240
+ // Invoked once per in-process retry attempt (429 / retryable 5xx / synthetic
1241
+ // transient) so the Governor charges chargeBudget('retry') per attempt — the
1242
+ // same runaway guard the cjs runner applies (context.ts charges retry on each
1243
+ // 429 / transient-5xx retry). Without this the worker substrate would leave
1244
+ // policy.budgets.maxRetryCount effectively unenforced.
1245
+ onRetryAttempt?: () => void,
1196
1246
  ): Promise<ToolExecuteResult> {
1197
1247
  const { id, toolId, input } = args;
1198
1248
  if (toolId === 'test_rate_limit') {
@@ -1233,6 +1283,8 @@ async function callToolDirect(
1233
1283
  if (attempt >= maxAttempts) {
1234
1284
  throw lastError;
1235
1285
  }
1286
+ // Charge the retry budget per attempt, matching the cjs runner.
1287
+ onRetryAttempt?.();
1236
1288
  await new Promise((resolve) => setTimeout(resolve, 1_000));
1237
1289
  continue;
1238
1290
  }
@@ -1273,17 +1325,26 @@ async function callToolDirect(
1273
1325
  maxAttempts,
1274
1326
  bodyText: text,
1275
1327
  });
1328
+ const retryAfterSeconds = Number(res.headers.get('retry-after'));
1329
+ const retryAfterMs =
1330
+ Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
1331
+ ? Math.ceil(retryAfterSeconds * 1000)
1332
+ : 0;
1333
+ if (res.status === 429) {
1334
+ // Feed the provider's backpressure into the shared pacer even on the
1335
+ // final attempt so the (org, provider) bucket backs off across isolates.
1336
+ onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
1337
+ }
1276
1338
  const retryable =
1277
1339
  (res.status === 429 && !isHardBillingToolHttpError(lastError)) ||
1278
1340
  (res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
1279
1341
  if (!retryable || attempt >= maxAttempts) {
1280
1342
  throw lastError;
1281
1343
  }
1282
- const retryAfterSeconds = Number(res.headers.get('retry-after'));
1283
- const delayMs =
1284
- Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
1285
- ? Math.min(5_000, Math.ceil(retryAfterSeconds * 1000))
1286
- : 1_000;
1344
+ // Charge the retry budget per attempt, matching the cjs runner's
1345
+ // chargeBudget('retry') on every 429 / retryable-5xx retry.
1346
+ onRetryAttempt?.();
1347
+ const delayMs = retryAfterMs > 0 ? Math.min(5_000, retryAfterMs) : 1_000;
1287
1348
  await new Promise((resolve) => setTimeout(resolve, delayMs));
1288
1349
  }
1289
1350
 
@@ -1731,6 +1792,11 @@ type WorkerToolBatchRequest = {
1731
1792
  };
1732
1793
 
1733
1794
  const WORKER_TOOL_BATCH_GRACE_MS = 15;
1795
+ // Fallback batch-chunk parallelism when a tool declares no provider rate hints.
1796
+ // Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
1797
+ // their previous batching behavior; declared providers tighten via the
1798
+ // Governor's suggestedParallelism.
1799
+ const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
1734
1800
  const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
1735
1801
 
1736
1802
  function stepProgramColumnName(parentField: string, stepId: string): string {
@@ -1741,7 +1807,31 @@ class WorkerToolBatchScheduler {
1741
1807
  private queue: WorkerToolBatchRequest[] = [];
1742
1808
  private scheduled = false;
1743
1809
 
1744
- constructor(private readonly req: RunRequest) {}
1810
+ constructor(
1811
+ private readonly req: RunRequest,
1812
+ private readonly governor: PlayExecutionGovernor,
1813
+ private readonly resolvePacing: WorkerPacingResolver,
1814
+ private readonly abortSignal?: AbortSignal,
1815
+ ) {}
1816
+
1817
+ /**
1818
+ * Report a provider 429 / Retry-After back into the Governor's shared pacer
1819
+ * so future acquires for this (org, provider) bucket back off across all
1820
+ * isolates. Provider comes from the same pacing resolver the Governor uses
1821
+ * (the worker has no local catalog), so callers pass only the toolId.
1822
+ */
1823
+ private reportBackpressure(toolId: string, retryAfterMs: number): void {
1824
+ if (retryAfterMs <= 0) return;
1825
+ void (async () => {
1826
+ const pacing = await this.resolvePacing(toolId).catch(() => null);
1827
+ if (pacing?.provider) {
1828
+ this.governor.reportProviderBackpressure({
1829
+ provider: pacing.provider,
1830
+ retryAfterMs,
1831
+ });
1832
+ }
1833
+ })();
1834
+ }
1745
1835
 
1746
1836
  execute(
1747
1837
  id: string,
@@ -1824,16 +1914,26 @@ class WorkerToolBatchScheduler {
1824
1914
  const groupStartedAt = nowMs();
1825
1915
  await Promise.all(
1826
1916
  requests.map(async (request) => {
1917
+ // Each unbatched provider call takes its own tool slot: the Governor
1918
+ // charges tool budget, holds a global tool-concurrency slot, and
1919
+ // applies per-(org,provider) pacing before the call runs.
1920
+ const slot = await this.governor.acquireToolSlot(toolId, {
1921
+ signal: this.abortSignal,
1922
+ });
1827
1923
  try {
1828
1924
  request.resolve(
1829
1925
  await executeTool(
1830
1926
  this.req,
1831
1927
  { id: request.id, toolId, input: request.input },
1832
1928
  request.workflowStep,
1929
+ (retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
1930
+ () => this.governor.chargeBudget('retry'),
1833
1931
  ),
1834
1932
  );
1835
1933
  } catch (error) {
1836
1934
  request.reject(error);
1935
+ } finally {
1936
+ slot.release();
1837
1937
  }
1838
1938
  }),
1839
1939
  );
@@ -1851,6 +1951,14 @@ class WorkerToolBatchScheduler {
1851
1951
  req: this.req,
1852
1952
  requests,
1853
1953
  strategy,
1954
+ governor: this.governor,
1955
+ suggestedParallelism: await this.governor.suggestedParallelism(
1956
+ toolId,
1957
+ WORKER_TOOL_BATCH_DEFAULT_PARALLELISM,
1958
+ ),
1959
+ abortSignal: this.abortSignal,
1960
+ reportBackpressure: (retryAfterMs) =>
1961
+ this.reportBackpressure(toolId, retryAfterMs),
1854
1962
  });
1855
1963
  recordRunnerPerfTrace({
1856
1964
  req: this.req,
@@ -1880,6 +1988,10 @@ async function executeBatchedWorkerToolGroup(input: {
1880
1988
  req: RunRequest;
1881
1989
  requests: WorkerToolBatchRequest[];
1882
1990
  strategy: AnyBatchOperationStrategy;
1991
+ governor: PlayExecutionGovernor;
1992
+ suggestedParallelism: number;
1993
+ abortSignal?: AbortSignal;
1994
+ reportBackpressure: (retryAfterMs: number) => void;
1883
1995
  }): Promise<void> {
1884
1996
  const compiledBatches = compileRequestsWithStrategy({
1885
1997
  requests: input.requests,
@@ -1889,13 +2001,34 @@ async function executeBatchedWorkerToolGroup(input: {
1889
2001
 
1890
2002
  await executeChunkedRequests({
1891
2003
  requests: compiledBatches,
1892
- batchSize: Math.max(1, Math.min(4, compiledBatches.length || 1)),
1893
- execute: async (batch) =>
1894
- await executeTool(input.req, {
1895
- id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
1896
- toolId: batch.batchOperation,
1897
- input: batch.batchPayload,
1898
- }),
2004
+ // Chunk parallelism is the Governor's per-tool suggestion (provider rate
2005
+ // hints tightened to the policy ceiling), bounded by the batch count.
2006
+ batchSize: Math.max(
2007
+ 1,
2008
+ Math.min(input.suggestedParallelism, compiledBatches.length || 1),
2009
+ ),
2010
+ execute: async (batch) => {
2011
+ // One provider call per batch → one tool slot (budget + global
2012
+ // concurrency + per-(org,provider) pacing) around the whole batch.
2013
+ const slot = await input.governor.acquireToolSlot(batch.batchOperation, {
2014
+ signal: input.abortSignal,
2015
+ });
2016
+ try {
2017
+ return await executeTool(
2018
+ input.req,
2019
+ {
2020
+ id: `batch:${batch.memberRequests.map((request) => request.id).join('|')}`,
2021
+ toolId: batch.batchOperation,
2022
+ input: batch.batchPayload,
2023
+ },
2024
+ undefined,
2025
+ input.reportBackpressure,
2026
+ () => input.governor.chargeBudget('retry'),
2027
+ );
2028
+ } finally {
2029
+ slot.release();
2030
+ }
2031
+ },
1899
2032
  onChunkComplete: async (
1900
2033
  chunkResults: Array<
1901
2034
  ChunkExecutionResult<(typeof compiledBatches)[number], unknown>
@@ -3164,9 +3297,23 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
3164
3297
  function childPipelineUsesCtxDataset(
3165
3298
  pipeline: PlayStaticPipeline | null | undefined,
3166
3299
  ): boolean {
3167
- return getCompiledPipelineSubsteps(pipeline).some(
3168
- (substep) => substep.type === 'dataset',
3169
- );
3300
+ if (!pipeline) return false;
3301
+ if (typeof pipeline.tableNamespace === 'string' && pipeline.tableNamespace) {
3302
+ return true;
3303
+ }
3304
+ if (pipeline.sheetContract) {
3305
+ return true;
3306
+ }
3307
+ return flattenStaticPipeline(pipeline).some((substep) => {
3308
+ if (substep.type === 'dataset') return true;
3309
+ if (!isRecord(substep)) return false;
3310
+ return (
3311
+ ('tableNamespace' in substep &&
3312
+ typeof substep.tableNamespace === 'string' &&
3313
+ substep.tableNamespace.length > 0) ||
3314
+ ('sheetContract' in substep && Boolean(substep.sheetContract))
3315
+ );
3316
+ });
3170
3317
  }
3171
3318
 
3172
3319
  function childPipelineNeedsWorkflowScheduler(
@@ -3181,16 +3328,160 @@ function childPipelineNeedsWorkflowScheduler(
3181
3328
  );
3182
3329
  }
3183
3330
 
3184
- function releaseChildPlayConcurrency(
3185
- inFlightByPlayName: Record<string, number>,
3186
- playName: string,
3187
- ): void {
3188
- const next = Math.max(0, (inFlightByPlayName[playName] ?? 0) - 1);
3189
- if (next === 0) {
3190
- delete inFlightByPlayName[playName];
3191
- return;
3192
- }
3193
- inFlightByPlayName[playName] = next;
3331
+ /**
3332
+ * Build the per-(org,provider) rate port the distributed Rate State Backend
3333
+ * RPCs through. When the coordinator binding (or its rate RPCs) is absent we
3334
+ * fail OPEN — grant immediately — matching customer-rate-limiter semantics so a
3335
+ * miswired binding degrades pacing without stalling the run.
3336
+ */
3337
+ function createCoordinatorRatePort(): CoordinatorRatePort {
3338
+ return {
3339
+ async rateAcquire(input) {
3340
+ const binding = cachedCoordinatorBinding;
3341
+ if (!binding?.rateAcquire) {
3342
+ return { granted: input.requested, waitMs: 0 };
3343
+ }
3344
+ return await binding.rateAcquire(input);
3345
+ },
3346
+ async ratePenalize(input) {
3347
+ const binding = cachedCoordinatorBinding;
3348
+ if (!binding?.ratePenalize) return;
3349
+ await binding.ratePenalize(input);
3350
+ },
3351
+ };
3352
+ }
3353
+
3354
+ /**
3355
+ * Resolve a tool's provider + pacing rules from the same runtime tool-metadata
3356
+ * endpoint the cjs_node20 runner uses (`getToolQueueHints`). The worker has no
3357
+ * local catalog, so this is an HTTP fetch through the runtime API binding,
3358
+ * memoized per isolate. No hints → null (pacing is a no-op; the Governor's
3359
+ * global tool-concurrency slot still applies).
3360
+ */
3361
+ type WorkerPacingResolver = (
3362
+ toolId: string,
3363
+ ) => Promise<{ provider: string; rules: PacingRule[] } | null>;
3364
+
3365
+ function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
3366
+ const cache = new Map<
3367
+ string,
3368
+ Promise<{ provider: string; rules: PacingRule[] } | null>
3369
+ >();
3370
+ return (toolId: string) => {
3371
+ const normalized = String(toolId || '').trim();
3372
+ if (!normalized) return Promise.resolve(null);
3373
+ const cached = cache.get(normalized);
3374
+ if (cached) return cached;
3375
+ const promise = (async () => {
3376
+ const res = await fetchRuntimeApi(
3377
+ req.baseUrl,
3378
+ `/api/v2/plays/runtime-tools/${encodeURIComponent(normalized)}`,
3379
+ {
3380
+ method: 'GET',
3381
+ headers: { authorization: `Bearer ${req.executorToken}` },
3382
+ },
3383
+ ).catch(() => null);
3384
+ if (!res || !res.ok) return null;
3385
+ const body = (await res.json().catch(() => null)) as {
3386
+ provider?: unknown;
3387
+ queueHints?: unknown;
3388
+ } | null;
3389
+ if (!body) return null;
3390
+ const provider =
3391
+ typeof body.provider === 'string' && body.provider.trim()
3392
+ ? body.provider.trim()
3393
+ : null;
3394
+ if (!provider || !Array.isArray(body.queueHints)) return null;
3395
+ const rules: PacingRule[] = body.queueHints.flatMap((hint) => {
3396
+ if (!hint || typeof hint !== 'object') return [];
3397
+ const record = hint as Record<string, unknown>;
3398
+ if (
3399
+ typeof record.ruleId !== 'string' ||
3400
+ typeof record.requestsPerWindow !== 'number' ||
3401
+ typeof record.windowMs !== 'number'
3402
+ ) {
3403
+ return [];
3404
+ }
3405
+ return [
3406
+ {
3407
+ ruleId: record.ruleId,
3408
+ requestsPerWindow: record.requestsPerWindow,
3409
+ windowMs: record.windowMs,
3410
+ maxConcurrency:
3411
+ typeof record.maxConcurrency === 'number'
3412
+ ? record.maxConcurrency
3413
+ : null,
3414
+ } satisfies PacingRule,
3415
+ ];
3416
+ });
3417
+ if (rules.length === 0) return null;
3418
+ return { provider, rules };
3419
+ })();
3420
+ cache.set(normalized, promise);
3421
+ return promise;
3422
+ };
3423
+ }
3424
+
3425
+ /**
3426
+ * Build the Governor's lineage snapshot for this worker, seeded from the
3427
+ * inherited PlayCallGovernanceSnapshot (threaded via internalRunPlay) so play-
3428
+ * call budgets accumulate down the dispatch tree across isolates. The current
3429
+ * play id is always `req.playName` so the per-parent child-call counter keys off
3430
+ * the executing play (matching the prior worker behavior). Lineage-global
3431
+ * counters (play-call/tool/retry/descendant/waterfall) are seeded from the
3432
+ * inherited snapshot and fall back to 0 when it is absent; only the
3433
+ * per-parent `parentChildCalls` map starts empty in each worker.
3434
+ */
3435
+ function resumeGovernanceFromRequest(req: RunRequest): GovernanceSnapshot {
3436
+ const inherited = req.playCallGovernance;
3437
+ const rootRunId = inherited?.rootRunId || req.runId;
3438
+ const ancestryPlayIds = inherited?.ancestryPlayIds?.length
3439
+ ? // Per the lineage validator the inherited tail equals the parent; ensure
3440
+ // the chain ends with the currently-executing play for the cycle guard.
3441
+ inherited.ancestryPlayIds[inherited.ancestryPlayIds.length - 1] ===
3442
+ req.playName
3443
+ ? [...inherited.ancestryPlayIds]
3444
+ : [...inherited.ancestryPlayIds, req.playName]
3445
+ : [req.playName];
3446
+ const ancestryRunIds =
3447
+ rootRunId === req.runId ? [req.runId] : [rootRunId, req.runId];
3448
+ return {
3449
+ rootRunId,
3450
+ currentRunId: req.runId,
3451
+ currentPlayId: req.playName,
3452
+ ancestryPlayIds,
3453
+ ancestryRunIds,
3454
+ callDepth: inherited?.callDepth ?? 0,
3455
+ // Seed every lineage-global budget counter from the inherited snapshot so
3456
+ // descendant/tool/retry/waterfall budgets accumulate across isolates exactly
3457
+ // as they do across the cjs forkChild lineage. Without this they would reset
3458
+ // to 0 in each isolate and become per-worker — contradicting the Governor's
3459
+ // lineage-global budget contract. Fail-safe to 0 for older callers.
3460
+ playCallCount: inherited?.playCallCount ?? 0,
3461
+ toolCallCount: inherited?.toolCallCount ?? 0,
3462
+ retryCount: inherited?.retryCount ?? 0,
3463
+ descendantCount: inherited?.descendantCount ?? 0,
3464
+ waterfallStepExecutions: inherited?.waterfallStepExecutions ?? 0,
3465
+ parentChildCalls: {},
3466
+ };
3467
+ }
3468
+
3469
+ function createGovernorForRun(req: RunRequest): {
3470
+ governor: PlayExecutionGovernor;
3471
+ resolvePacing: WorkerPacingResolver;
3472
+ } {
3473
+ const resolvePacing = createWorkerPacingResolver(req);
3474
+ const governor = createPlayExecutionGovernor({
3475
+ adapter: 'esm_workers',
3476
+ scope: {
3477
+ orgId: req.orgId,
3478
+ rootRunId: req.playCallGovernance?.rootRunId ?? req.runId,
3479
+ },
3480
+ rateState: new CoordinatorRateStateBackend(createCoordinatorRatePort()),
3481
+ resolvePacing,
3482
+ resume: resumeGovernanceFromRequest(req),
3483
+ });
3484
+ return { governor, resolvePacing };
3194
3485
  }
3195
3486
 
3196
3487
  function createMinimalWorkerCtx(
@@ -3201,12 +3492,12 @@ function createMinimalWorkerCtx(
3201
3492
  abortSignal?: AbortSignal,
3202
3493
  callbacks?: WorkerCtxCallbacks,
3203
3494
  ): unknown {
3204
- let playCallCount = 0;
3205
- const parentChildCalls: Record<string, number> = {};
3495
+ const { governor, resolvePacing: resolveToolPacing } =
3496
+ createGovernorForRun(req);
3497
+ // Play-call depth/count/per-parent budgets, child-play concurrency, and the
3498
+ // lineage snapshot are owned by the Governor (createGovernorForRun above).
3499
+ // The worker keeps only substrate mechanism here.
3206
3500
  const stepCallCounts: Record<string, number> = {};
3207
- const inFlightChildCallsByPlayName: Record<string, number> = {};
3208
- let inFlightChildPlayCalls = 0;
3209
- const childPlaySlotWaiters: Array<() => void> = [];
3210
3501
  const secretRedactor = createSecretRedactionContext();
3211
3502
 
3212
3503
  const resolveSecretAuth = async (auth?: SecretAuth) => {
@@ -3245,38 +3536,6 @@ function createMinimalWorkerCtx(
3245
3536
  : { [auth.header.toLowerCase()]: value };
3246
3537
  };
3247
3538
 
3248
- const acquireChildPlaySlot = async (): Promise<() => void> => {
3249
- while (
3250
- inFlightChildPlayCalls >= WORKER_PLAY_CALL_LIMITS.maxConcurrentPlayCalls
3251
- ) {
3252
- await new Promise<void>((resolve, reject) => {
3253
- const waiter = () => {
3254
- abortSignal?.removeEventListener('abort', onAbort);
3255
- resolve();
3256
- };
3257
- const onAbort = () => {
3258
- const index = childPlaySlotWaiters.indexOf(waiter);
3259
- if (index >= 0) childPlaySlotWaiters.splice(index, 1);
3260
- reject(
3261
- abortSignal?.reason instanceof Error
3262
- ? abortSignal.reason
3263
- : new WorkflowAbortError(),
3264
- );
3265
- };
3266
- childPlaySlotWaiters.push(waiter);
3267
- abortSignal?.addEventListener('abort', onAbort, { once: true });
3268
- });
3269
- assertNotAborted(abortSignal);
3270
- }
3271
- inFlightChildPlayCalls += 1;
3272
- let released = false;
3273
- return () => {
3274
- if (released) return;
3275
- released = true;
3276
- inFlightChildPlayCalls = Math.max(0, inFlightChildPlayCalls - 1);
3277
- childPlaySlotWaiters.shift()?.();
3278
- };
3279
- };
3280
3539
  const rootGovernance = req.playCallGovernance;
3281
3540
  const rootRunId = rootGovernance?.rootRunId ?? req.runId;
3282
3541
  const receiptStore = createHarnessWorkerReceiptStore({
@@ -3577,7 +3836,12 @@ function createMinimalWorkerCtx(
3577
3836
  0,
3578
3837
  prepared.skipped - missingPreparedRows.length,
3579
3838
  );
3580
- const concurrency = Math.max(1, Math.min(opts?.concurrency ?? 10, 100));
3839
+ // Row concurrency comes from the Governor: an explicit map concurrency is
3840
+ // clamped to the policy row-max, otherwise the policy default. Each row
3841
+ // body additionally acquires a global row slot (the Governor's rowMax
3842
+ // semaphore) so total in-flight rows across all maps in this isolate stay
3843
+ // bounded even when several maps run at once.
3844
+ const concurrency = governor.resolveRowConcurrency();
3581
3845
  const executedRows: Array<T & Record<string, unknown>> = new Array(
3582
3846
  rowsToExecute.length,
3583
3847
  );
@@ -3594,7 +3858,12 @@ function createMinimalWorkerCtx(
3594
3858
  >
3595
3859
  | undefined
3596
3860
  > = new Array(rowsToExecute.length);
3597
- const toolBatchScheduler = new WorkerToolBatchScheduler(req);
3861
+ const toolBatchScheduler = new WorkerToolBatchScheduler(
3862
+ req,
3863
+ governor,
3864
+ resolveToolPacing,
3865
+ abortSignal,
3866
+ );
3598
3867
  const generatedOutputFields = new Set<string>();
3599
3868
  let idx = 0;
3600
3869
  const workers: Array<Promise<void>> = [];
@@ -3605,143 +3874,152 @@ function createMinimalWorkerCtx(
3605
3874
  if (abortSignal?.aborted) return;
3606
3875
  const myIndex = idx++;
3607
3876
  if (myIndex >= rowsToExecute.length) return;
3608
- const entry = uniqueRowsToExecuteEntries[myIndex]!;
3609
- const row = pendingRowsByKey.has(entry.rowKey)
3610
- ? ({
3611
- ...entry.row,
3612
- ...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
3613
- } as T & Record<string, unknown>)
3614
- : entry.row;
3615
- const absoluteIndex = entry.absoluteIndex;
3616
- const enriched: Record<string, unknown> = cloneCsvAliasedRow(row);
3617
- const fieldOutputs: Record<string, unknown> = {};
3618
- const cellMetaPatch: Record<
3619
- string,
3620
- {
3621
- status: 'cached' | 'skipped' | 'completed';
3622
- stage?: string | null;
3623
- reused?: boolean;
3624
- runId?: string;
3625
- completedAt?: number;
3626
- }
3627
- > = {};
3628
- const waterfallOutputs: RecordedWaterfallOutput[] = [];
3629
- const stepProgramOutputs: RecordedStepProgramOutput[] = [];
3630
- const rowCtx = {
3631
- ...(ctx as Record<string, unknown>),
3632
- tools: {
3633
- ...((ctx as { tools?: Record<string, unknown> }).tools ?? {}),
3634
- execute: async (requestArg: unknown): Promise<unknown> => {
3635
- assertNotAborted(abortSignal);
3636
- const request = normalizeToolExecuteArgs(requestArg);
3637
- return await toolBatchScheduler.execute(
3638
- request.id,
3639
- request.toolId,
3640
- request.input,
3641
- workflowStep,
3642
- );
3877
+ const rowSlot = await governor.acquireRowSlot({
3878
+ signal: abortSignal,
3879
+ });
3880
+ try {
3881
+ const entry = uniqueRowsToExecuteEntries[myIndex]!;
3882
+ const row = pendingRowsByKey.has(entry.rowKey)
3883
+ ? ({
3884
+ ...entry.row,
3885
+ ...publicCsvInputRow(pendingRowsByKey.get(entry.rowKey)!),
3886
+ } as T & Record<string, unknown>)
3887
+ : entry.row;
3888
+ const absoluteIndex = entry.absoluteIndex;
3889
+ const enriched: Record<string, unknown> =
3890
+ cloneCsvAliasedRow(row);
3891
+ const fieldOutputs: Record<string, unknown> = {};
3892
+ const cellMetaPatch: Record<
3893
+ string,
3894
+ {
3895
+ status: 'cached' | 'skipped' | 'completed';
3896
+ stage?: string | null;
3897
+ reused?: boolean;
3898
+ runId?: string;
3899
+ completedAt?: number;
3900
+ }
3901
+ > = {};
3902
+ const waterfallOutputs: RecordedWaterfallOutput[] = [];
3903
+ const stepProgramOutputs: RecordedStepProgramOutput[] = [];
3904
+ const rowCtx = {
3905
+ ...(ctx as Record<string, unknown>),
3906
+ tools: {
3907
+ ...((ctx as { tools?: Record<string, unknown> }).tools ??
3908
+ {}),
3909
+ execute: async (requestArg: unknown): Promise<unknown> => {
3910
+ assertNotAborted(abortSignal);
3911
+ const request = normalizeToolExecuteArgs(requestArg);
3912
+ return await toolBatchScheduler.execute(
3913
+ request.id,
3914
+ request.toolId,
3915
+ request.input,
3916
+ workflowStep,
3917
+ );
3918
+ },
3643
3919
  },
3644
- },
3645
- waterfall: (
3646
- toolNameOrSpec: string | WorkerInlineWaterfallSpec,
3647
- waterfallInput: Record<string, unknown>,
3648
- waterfallOpts?: WorkerWaterfallOptions,
3649
- ) =>
3650
- executeWorkerWaterfall(
3651
- req,
3652
- waterfallOutputs,
3653
- toolNameOrSpec,
3654
- waterfallInput,
3655
- waterfallOpts,
3656
- callbacks,
3657
- workflowStep,
3658
- ),
3659
- };
3660
- for (const [key, value] of fieldEntries) {
3661
- const rawCellMeta =
3662
- enriched[DEEPLINE_CELL_META_FIELD] &&
3663
- typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
3664
- ? (
3665
- enriched[DEEPLINE_CELL_META_FIELD] as Record<
3666
- string,
3667
- unknown
3668
- >
3669
- )[key]
3670
- : null;
3671
- const reuseDecision = shouldRecomputeCell({
3672
- hasValue: isCompletedWorkerFieldValue(enriched[key]),
3673
- meta:
3674
- rawCellMeta && typeof rawCellMeta === 'object'
3675
- ? (rawCellMeta as {
3676
- status?: string;
3677
- completedAt?: number;
3678
- })
3679
- : null,
3680
- policy: cellPolicies?.[key],
3681
- });
3682
- if (reuseDecision.action === 'reuse') {
3683
- cellMetaPatch[key] = {
3684
- status: 'cached',
3685
- stage: key,
3686
- reused: true,
3687
- runId: req.runId,
3688
- };
3689
- continue;
3920
+ waterfall: (
3921
+ toolNameOrSpec: string | WorkerInlineWaterfallSpec,
3922
+ waterfallInput: Record<string, unknown>,
3923
+ waterfallOpts?: WorkerWaterfallOptions,
3924
+ ) =>
3925
+ executeWorkerWaterfall(
3926
+ req,
3927
+ waterfallOutputs,
3928
+ toolNameOrSpec,
3929
+ waterfallInput,
3930
+ waterfallOpts,
3931
+ callbacks,
3932
+ workflowStep,
3933
+ ),
3934
+ };
3935
+ for (const [key, value] of fieldEntries) {
3936
+ const rawCellMeta =
3937
+ enriched[DEEPLINE_CELL_META_FIELD] &&
3938
+ typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
3939
+ ? (
3940
+ enriched[DEEPLINE_CELL_META_FIELD] as Record<
3941
+ string,
3942
+ unknown
3943
+ >
3944
+ )[key]
3945
+ : null;
3946
+ const reuseDecision = shouldRecomputeCell({
3947
+ hasValue: isCompletedWorkerFieldValue(enriched[key]),
3948
+ meta:
3949
+ rawCellMeta && typeof rawCellMeta === 'object'
3950
+ ? (rawCellMeta as {
3951
+ status?: string;
3952
+ completedAt?: number;
3953
+ })
3954
+ : null,
3955
+ policy: cellPolicies?.[key],
3956
+ });
3957
+ if (reuseDecision.action === 'reuse') {
3958
+ cellMetaPatch[key] = {
3959
+ status: 'cached',
3960
+ stage: key,
3961
+ reused: true,
3962
+ runId: req.runId,
3963
+ };
3964
+ continue;
3965
+ }
3966
+ const resolved = await executeWorkerStepResolver(
3967
+ value,
3968
+ enriched,
3969
+ rowCtx,
3970
+ absoluteIndex,
3971
+ isWorkerStepProgram(value)
3972
+ ? {
3973
+ parentField: key,
3974
+ path: [],
3975
+ outputs: stepProgramOutputs,
3976
+ }
3977
+ : undefined,
3978
+ );
3979
+ enriched[key] = resolved.value;
3980
+ fieldOutputs[key] = resolved.value;
3981
+ if (resolved.status === 'skipped') {
3982
+ cellMetaPatch[key] = {
3983
+ status: 'skipped',
3984
+ stage: key,
3985
+ runId: req.runId,
3986
+ };
3987
+ } else {
3988
+ cellMetaPatch[key] = {
3989
+ status: 'completed',
3990
+ stage: key,
3991
+ runId: req.runId,
3992
+ completedAt: nowMs(),
3993
+ };
3994
+ }
3690
3995
  }
3691
- const resolved = await executeWorkerStepResolver(
3692
- value,
3693
- enriched,
3694
- rowCtx,
3695
- absoluteIndex,
3696
- isWorkerStepProgram(value)
3697
- ? {
3698
- parentField: key,
3699
- path: [],
3700
- outputs: stepProgramOutputs,
3701
- }
3702
- : undefined,
3703
- );
3704
- enriched[key] = resolved.value;
3705
- fieldOutputs[key] = resolved.value;
3706
- if (resolved.status === 'skipped') {
3707
- cellMetaPatch[key] = {
3708
- status: 'skipped',
3709
- stage: key,
3710
- runId: req.runId,
3711
- };
3712
- } else {
3713
- cellMetaPatch[key] = {
3714
- status: 'completed',
3715
- stage: key,
3716
- runId: req.runId,
3717
- completedAt: nowMs(),
3718
- };
3996
+ for (const stepOutput of stepProgramOutputs) {
3997
+ enriched[stepOutput.columnName] = stepOutput.value;
3998
+ fieldOutputs[stepOutput.columnName] = stepOutput.value;
3999
+ generatedOutputFields.add(stepOutput.columnName);
4000
+ if (stepOutput.status === 'skipped') {
4001
+ cellMetaPatch[stepOutput.columnName] = {
4002
+ status: 'skipped',
4003
+ stage: stepOutput.stepId,
4004
+ runId: req.runId,
4005
+ };
4006
+ }
3719
4007
  }
3720
- }
3721
- for (const stepOutput of stepProgramOutputs) {
3722
- enriched[stepOutput.columnName] = stepOutput.value;
3723
- fieldOutputs[stepOutput.columnName] = stepOutput.value;
3724
- generatedOutputFields.add(stepOutput.columnName);
3725
- if (stepOutput.status === 'skipped') {
3726
- cellMetaPatch[stepOutput.columnName] = {
3727
- status: 'skipped',
3728
- stage: stepOutput.stepId,
3729
- runId: req.runId,
3730
- };
4008
+ for (const waterfallOutput of waterfallOutputs) {
4009
+ const columnName =
4010
+ `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
4011
+ sqlishIdentifierPart(waterfallOutput.stepId);
4012
+ enriched[columnName] = waterfallOutput.value;
4013
+ generatedOutputFields.add(columnName);
3731
4014
  }
4015
+ executedCellMetaPatches[myIndex] =
4016
+ Object.keys(cellMetaPatch).length > 0
4017
+ ? cellMetaPatch
4018
+ : undefined;
4019
+ executedRows[myIndex] = enriched as T & Record<string, unknown>;
4020
+ } finally {
4021
+ rowSlot.release();
3732
4022
  }
3733
- for (const waterfallOutput of waterfallOutputs) {
3734
- const columnName =
3735
- `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
3736
- sqlishIdentifierPart(waterfallOutput.stepId);
3737
- enriched[columnName] = waterfallOutput.value;
3738
- generatedOutputFields.add(columnName);
3739
- }
3740
- executedCellMetaPatches[myIndex] =
3741
- Object.keys(cellMetaPatch).length > 0
3742
- ? cellMetaPatch
3743
- : undefined;
3744
- executedRows[myIndex] = enriched as T & Record<string, unknown>;
3745
4023
  }
3746
4024
  })(),
3747
4025
  );
@@ -4410,33 +4688,20 @@ function createMinimalWorkerCtx(
4410
4688
  childPlayName: resolvedName,
4411
4689
  input,
4412
4690
  })}${staleRuntimeSuffix(options?.staleAfterSeconds)}`;
4413
- if (ancestryPlayIds.includes(resolvedName)) {
4414
- const chain = [...ancestryPlayIds, resolvedName].join(' -> ');
4415
- throw new Error(`Recursive play graph detected: ${chain}`);
4416
- }
4417
- const nextDepth = callDepth + 1;
4418
- if (nextDepth > WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth) {
4419
- throw new Error(
4420
- `Play-call depth exceeded (${nextDepth}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallDepth}) while calling ${resolvedName}.`,
4421
- );
4422
- }
4423
- const nextPlayCallCount = playCallCount + 1;
4424
- if (nextPlayCallCount > WORKER_PLAY_CALL_LIMITS.maxPlayCallCount) {
4425
- throw new Error(
4426
- `Root play-call budget exceeded (${nextPlayCallCount}/${WORKER_PLAY_CALL_LIMITS.maxPlayCallCount}).`,
4427
- );
4428
- }
4429
- const nextParentCalls = (parentChildCalls[req.playName] ?? 0) + 1;
4430
- if (
4431
- nextParentCalls > WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent
4432
- ) {
4433
- throw new Error(
4434
- `Child play-call cap exceeded for ${req.playName} (${nextParentCalls}/${WORKER_PLAY_CALL_LIMITS.maxChildPlayCallsPerParent}).`,
4435
- );
4436
- }
4437
4691
  return await executeWithRuntimeReceipt(receiptKey, async () => {
4438
- playCallCount = nextPlayCallCount;
4439
- parentChildCalls[req.playName] = nextParentCalls;
4692
+ // The Governor owns the play-call lineage: forkChild does the cycle
4693
+ // guard, depth/per-parent/playCall/descendant budget charges, and
4694
+ // returns the snapshot to thread into the child so budgets accumulate
4695
+ // across isolates. Charged inside the receipt boundary so a replay
4696
+ // (cache hit) never double-charges.
4697
+ const childRunId = `${req.runId}:child:${normalizedKey}`;
4698
+ const childGovernance = governor.forkChild({
4699
+ childPlayName: resolvedName,
4700
+ childRunId,
4701
+ });
4702
+ const nextDepth = childGovernance.callDepth;
4703
+ const nextParentCalls =
4704
+ governor.snapshot().parentChildCalls[req.playName] ?? 0;
4440
4705
 
4441
4706
  emitEvent({
4442
4707
  type: 'log',
@@ -4456,31 +4721,47 @@ function createMinimalWorkerCtx(
4456
4721
  const childNeedsWorkflowScheduler = childPipelineNeedsWorkflowScheduler(
4457
4722
  childManifest.staticPipeline,
4458
4723
  );
4459
- let childConcurrencyAcquired = false;
4460
- let releaseChildPlaySlot: (() => void) | null = null;
4461
- if (childIsDatasetBacked) {
4462
- const nextInFlight =
4463
- (inFlightChildCallsByPlayName[resolvedName] ?? 0) + 1;
4464
- if (nextInFlight > 1) {
4465
- throw new Error(
4466
- `Concurrent dataset-backed play call blocked for ${resolvedName}. ` +
4467
- 'A child play that uses ctx.dataset() cannot run more than once at the same time because its dataset tables share durable row identity. ' +
4468
- 'Run these child play calls sequentially, or give each concurrent branch a different child play/table contract.',
4469
- );
4470
- }
4471
- inFlightChildCallsByPlayName[resolvedName] = nextInFlight;
4472
- childConcurrencyAcquired = true;
4473
- }
4724
+ console.info('[play.runtime.span]', {
4725
+ event: 'play.runtime.span',
4726
+ phase: 'child_route',
4727
+ runId: req.runId,
4728
+ parentRunId: req.runId,
4729
+ playName: resolvedName,
4730
+ graphHash: req.graphHash ?? null,
4731
+ depth: nextDepth,
4732
+ fanoutIndex: nextParentCalls - 1,
4733
+ childIsDatasetBacked,
4734
+ childNeedsWorkflowScheduler,
4735
+ hasStaticPipeline: Boolean(childManifest.staticPipeline),
4736
+ childTableNamespace:
4737
+ typeof childManifest.staticPipeline?.tableNamespace === 'string'
4738
+ ? childManifest.staticPipeline.tableNamespace
4739
+ : null,
4740
+ childStageCount: Array.isArray(childManifest.staticPipeline?.stages)
4741
+ ? childManifest.staticPipeline.stages.length
4742
+ : null,
4743
+ childSubstepCount: Array.isArray(
4744
+ childManifest.staticPipeline?.substeps,
4745
+ )
4746
+ ? childManifest.staticPipeline.substeps.length
4747
+ : null,
4748
+ });
4749
+ let childPlaySlot: { release(): void } | null = null;
4474
4750
  try {
4475
- releaseChildPlaySlot = await acquireChildPlaySlot();
4751
+ childPlaySlot = await governor.acquireChildPlaySlot({
4752
+ signal: abortSignal,
4753
+ });
4476
4754
  const childSubmitStartedAt = nowMs();
4477
4755
  let started: {
4478
4756
  workflowId?: string;
4479
4757
  runId?: string;
4480
4758
  status?: string;
4759
+ mode?: string;
4481
4760
  output?: unknown;
4482
4761
  result?: unknown;
4483
4762
  error?: unknown;
4763
+ logs?: string[];
4764
+ timings?: Array<{ phase: string; ms: number }>;
4484
4765
  };
4485
4766
  try {
4486
4767
  started = await submitChildPlayThroughCoordinator({
@@ -4507,6 +4788,17 @@ function createMinimalWorkerCtx(
4507
4788
  // executor token's play name (the parent making this call).
4508
4789
  ancestryPlayIds,
4509
4790
  callDepth: nextDepth,
4791
+ // Cumulative lineage-global budget counters (incl. this
4792
+ // launch's play/descendant charges) so the child seeds its
4793
+ // budgets from the lineage total instead of resetting to 0 in
4794
+ // its isolate. Threading descendantCount in particular keeps
4795
+ // fan-out descendant accounting lineage-global, matching cjs.
4796
+ playCallCount: childGovernance.playCallCount,
4797
+ toolCallCount: childGovernance.toolCallCount,
4798
+ retryCount: childGovernance.retryCount,
4799
+ descendantCount: childGovernance.descendantCount,
4800
+ waterfallStepExecutions:
4801
+ childGovernance.waterfallStepExecutions,
4510
4802
  description:
4511
4803
  typeof options?.description === 'string'
4512
4804
  ? options.description
@@ -4528,6 +4820,21 @@ function createMinimalWorkerCtx(
4528
4820
  status: 'failed',
4529
4821
  errorCode: 'CHILD_SUBMIT_FAILED',
4530
4822
  });
4823
+ recordRunnerPerfTrace({
4824
+ req,
4825
+ phase: 'ctx_run_play.child_submit',
4826
+ ms: nowMs() - childSubmitStartedAt,
4827
+ extra: {
4828
+ status: 'failed',
4829
+ errorCode: 'CHILD_SUBMIT_FAILED',
4830
+ playName: resolvedName,
4831
+ key: normalizedKey,
4832
+ depth: nextDepth,
4833
+ fanoutIndex: nextParentCalls - 1,
4834
+ childIsDatasetBacked,
4835
+ childNeedsWorkflowScheduler,
4836
+ },
4837
+ });
4531
4838
  throw error;
4532
4839
  }
4533
4840
  const workflowId = started.workflowId ?? started.runId;
@@ -4558,6 +4865,26 @@ function createMinimalWorkerCtx(
4558
4865
  ms: nowMs() - childSubmitStartedAt,
4559
4866
  status: 'ok',
4560
4867
  });
4868
+ recordRunnerPerfTrace({
4869
+ req,
4870
+ phase: 'ctx_run_play.child_submit',
4871
+ ms: nowMs() - childSubmitStartedAt,
4872
+ extra: {
4873
+ status: 'ok',
4874
+ childRunId: workflowId,
4875
+ startedStatus: started.status ?? null,
4876
+ mode: started.mode ?? null,
4877
+ coordinatorTimings: Array.isArray(started.timings)
4878
+ ? started.timings
4879
+ : null,
4880
+ playName: resolvedName,
4881
+ key: normalizedKey,
4882
+ depth: nextDepth,
4883
+ fanoutIndex: nextParentCalls - 1,
4884
+ childIsDatasetBacked,
4885
+ childNeedsWorkflowScheduler,
4886
+ },
4887
+ });
4561
4888
  const startedStatus = String(started.status ?? '').toLowerCase();
4562
4889
  if (startedStatus === 'completed') {
4563
4890
  emitEvent({
@@ -4580,11 +4907,16 @@ function createMinimalWorkerCtx(
4580
4907
  throw new Error(startedErrorMessage);
4581
4908
  }
4582
4909
  const childWaitStartedAt = nowMs();
4583
- let result: unknown;
4910
+ let waitResult: ChildPlayTerminalWaitResult;
4584
4911
  try {
4585
- result = await waitForChildPlayTerminalEvent({
4586
- req,
4587
- workflowStep,
4912
+ waitResult = await awaitChildTerminal({
4913
+ parentRunId: req.runId,
4914
+ // CF's WorkflowStep.waitForEvent generic signature is wider than
4915
+ // the small structural shape ChildPlayAwait needs; bridge it the
4916
+ // same way the inline implementation did.
4917
+ workflowStep: workflowStep as unknown as
4918
+ | WorkflowStepLike
4919
+ | undefined,
4588
4920
  workflowId,
4589
4921
  playName: resolvedName,
4590
4922
  key: normalizedKey,
@@ -4592,6 +4924,22 @@ function createMinimalWorkerCtx(
4592
4924
  1_000,
4593
4925
  Math.min(options?.timeoutMs ?? 5 * 60_000, 30 * 60_000),
4594
4926
  ),
4927
+ coordinator: cachedCoordinatorBinding?.readChildTerminalState
4928
+ ? {
4929
+ readChildTerminalState: (
4930
+ parentRunId,
4931
+ eventKey,
4932
+ timeoutMs,
4933
+ ) =>
4934
+ cachedCoordinatorBinding!.readChildTerminalState!(
4935
+ parentRunId,
4936
+ eventKey,
4937
+ timeoutMs,
4938
+ ),
4939
+ }
4940
+ : null,
4941
+ now: nowMs,
4942
+ hashJson,
4595
4943
  });
4596
4944
  } catch (error) {
4597
4945
  console.info('[play.runtime.span]', {
@@ -4608,6 +4956,22 @@ function createMinimalWorkerCtx(
4608
4956
  status: 'failed',
4609
4957
  errorCode: 'CHILD_WAIT_FAILED',
4610
4958
  });
4959
+ recordRunnerPerfTrace({
4960
+ req,
4961
+ phase: 'ctx_run_play.child_wait',
4962
+ ms: nowMs() - childWaitStartedAt,
4963
+ extra: {
4964
+ status: 'failed',
4965
+ errorCode: 'CHILD_WAIT_FAILED',
4966
+ childRunId: workflowId,
4967
+ playName: resolvedName,
4968
+ key: normalizedKey,
4969
+ depth: nextDepth,
4970
+ fanoutIndex: nextParentCalls - 1,
4971
+ childIsDatasetBacked,
4972
+ childNeedsWorkflowScheduler,
4973
+ },
4974
+ });
4611
4975
  throw error;
4612
4976
  }
4613
4977
  console.info('[play.runtime.span]', {
@@ -4622,6 +4986,27 @@ function createMinimalWorkerCtx(
4622
4986
  fanoutIndex: nextParentCalls - 1,
4623
4987
  ms: nowMs() - childWaitStartedAt,
4624
4988
  status: 'ok',
4989
+ waitSource: waitResult.source,
4990
+ waitAttempts: waitResult.attempts ?? null,
4991
+ reportedWaitMs: waitResult.waitMs,
4992
+ });
4993
+ recordRunnerPerfTrace({
4994
+ req,
4995
+ phase: 'ctx_run_play.child_wait',
4996
+ ms: nowMs() - childWaitStartedAt,
4997
+ extra: {
4998
+ status: 'ok',
4999
+ childRunId: workflowId,
5000
+ playName: resolvedName,
5001
+ key: normalizedKey,
5002
+ depth: nextDepth,
5003
+ fanoutIndex: nextParentCalls - 1,
5004
+ childIsDatasetBacked,
5005
+ childNeedsWorkflowScheduler,
5006
+ waitSource: waitResult.source,
5007
+ waitAttempts: waitResult.attempts ?? null,
5008
+ reportedWaitMs: waitResult.waitMs,
5009
+ },
4625
5010
  });
4626
5011
  emitEvent({
4627
5012
  type: 'log',
@@ -4629,15 +5014,9 @@ function createMinimalWorkerCtx(
4629
5014
  message: `Completed child play ${resolvedName} (${normalizedKey})`,
4630
5015
  ts: nowMs(),
4631
5016
  });
4632
- return result;
5017
+ return waitResult.output;
4633
5018
  } finally {
4634
- releaseChildPlaySlot?.();
4635
- if (childConcurrencyAcquired) {
4636
- releaseChildPlayConcurrency(
4637
- inFlightChildCallsByPlayName,
4638
- resolvedName,
4639
- );
4640
- }
5019
+ childPlaySlot?.release();
4641
5020
  }
4642
5021
  });
4643
5022
  },
@@ -4813,6 +5192,135 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
4813
5192
  });
4814
5193
  }
4815
5194
 
5195
/**
 * Handles an inline run request: parses the JSON body as a `RunRequest`,
 * executes it in this isolate via `executeRunRequest`, and replies with the
 * outcome, the runner events collected along the way, and per-phase timings.
 *
 * Response shapes:
 * - Unparseable body: `{ status: 'failed', error }` with HTTP 400.
 * - Success: `{ status: 'completed', result, outputRows, durationMs, events,
 *   timings }`.
 * - Execution failure: `{ status: 'failed', error, events, timings }`. No
 *   explicit HTTP status is set on this response, so callers must inspect the
 *   body's `status` field rather than the HTTP status.
 *
 * @param request - Incoming HTTP request whose body is the JSON run request.
 * @param env - Worker environment used to capture bindings and run the play.
 * @returns A JSON response describing the run outcome; execution errors are
 *   reported in the body rather than thrown.
 */
async function handleRunInline(
  request: Request,
  env: WorkerEnv,
): Promise<Response> {
  let req: RunRequest;
  try {
    req = (await request.json()) as RunRequest;
  } catch {
    return Response.json(
      {
        status: 'failed',
        error: { message: 'invalid JSON body' },
      },
      { status: 400 },
    );
  }

  const events: RunnerEvent[] = [];
  const timings: InlineRunTiming[] = [];
  // Appends one timing entry measured from `phaseStartedAt` until now.
  const traceInline = (
    phase: string,
    phaseStartedAt: number,
    extra?: Record<string, unknown>,
  ): void => {
    timings.push({
      phase,
      ms: nowMs() - phaseStartedAt,
      ...(extra ? { extra } : {}),
    });
  };
  const inlineStartedAt = nowMs();
  try {
    const runPrefix = `[deepline-run:${req.runId}]`;
    captureCoordinatorBinding(env);
    captureRuntimeApiBinding(env);
    captureHarnessBinding(env);
    const probeStartedAt = nowMs();
    await probeHarnessOnce(env, runPrefix);
    traceInline('inline.probe_harness', probeStartedAt);
    if (!req.inlineChildRunRegistered) {
      const registerStartedAt = nowMs();
      await registerInlineChildRun(req);
      traceInline('inline.register_child_run', registerStartedAt);
    } else {
      // Registration is skipped when the request is flagged as already
      // registered; the phase is still recorded so every response carries it.
      traceInline('inline.register_child_run', nowMs(), { skipped: true });
    }
    const executeStartedAt = nowMs();
    const output = await executeRunRequest(
      req,
      env,
      (event) => {
        events.push(event);
      },
      undefined,
      {
        persistResultDatasets: true,
      },
    );
    traceInline('inline.execute_run_request', executeStartedAt, {
      durationMs: output.durationMs,
      outputRows: output.outputRows,
    });
    traceInline('inline.total', inlineStartedAt);
    return Response.json({
      status: 'completed',
      result: output.result,
      outputRows: output.outputRows,
      durationMs: output.durationMs,
      events,
      timings,
    });
  } catch (error) {
    // A `throw` can carry any value, including `null`/`undefined`. Reading
    // `.message` off such a value would raise inside this handler and turn a
    // reportable run failure into an unhandled rejection, so only treat the
    // thrown value as error-like after checking it is a non-null object.
    const thrown =
      typeof error === 'object' && error !== null
        ? (error as { message?: unknown; stack?: unknown })
        : null;
    const message = String(thrown?.message ?? error);
    const stack = typeof thrown?.stack === 'string' ? thrown.stack : undefined;
    events.push({
      type: 'error',
      message,
      stack,
      ts: nowMs(),
    });
    return Response.json({
      status: 'failed',
      error: {
        message,
        stack,
      },
      events,
      timings,
    });
  }
}
5285
+
5286
/**
 * Posts a `start_inline_child_run` action to the runtime API for this run.
 *
 * Artifact fields are read from `contractSnapshot.artifactMetadata` and are
 * sent only when they are strings. The workflow family key falls back from
 * the governance root run id, to the parent run id, to this run's own id.
 * A snapshot `source` other than `'published'`, `'ad_hoc'` or `'draft'` is
 * sent as `'published'`.
 *
 * @param req - The inline run request to register.
 */
async function registerInlineChildRun(req: RunRequest): Promise<void> {
  // Passes string values through and maps everything else to `undefined`.
  const optionalString = (candidate: unknown): string | undefined =>
    typeof candidate === 'string' ? candidate : undefined;

  const snapshot = isRecord(req.contractSnapshot) ? req.contractSnapshot : {};
  const artifactMetadata = isRecord(snapshot.artifactMetadata)
    ? snapshot.artifactMetadata
    : {};
  const lineage = req.playCallGovernance;
  const declaredSource = snapshot.source;
  const source =
    declaredSource === 'ad_hoc' || declaredSource === 'draft'
      ? declaredSource
      : 'published';

  await postRuntimeApi(req.baseUrl, req.executorToken, {
    action: 'start_inline_child_run',
    playName: req.playName,
    runId: req.runId,
    workflowFamilyKey: lineage?.rootRunId ?? lineage?.parentRunId ?? req.runId,
    artifactStorageKey: optionalString(artifactMetadata.storageKey),
    artifactHash: optionalString(artifactMetadata.artifactHash),
    graphHash: optionalString(artifactMetadata.graphHash),
    runtimeBackend: 'workers_edge',
    schedulerBackend: 'inline_child',
    executionProfile: 'workers_edge',
    maxCreditsPerRun: extractMaxCreditsPerRun(req.contractSnapshot),
    staticPipeline: snapshot.staticPipeline ?? null,
    source,
  });
}
5323
+
4816
5324
  /** Cap on run log lines retained in the terminal output compatibility shape. */
4817
5325
  const RUN_LOG_BUFFER_LIMIT = 500;
4818
5326
  /** Min wall-clock interval between live run-ledger flushes during a run. */
@@ -5081,6 +5589,20 @@ async function executeRunRequest(
5081
5589
  abortSignal,
5082
5590
  workerCallbacks,
5083
5591
  );
5592
+ // Hard wall-clock cap on active user-code runtime. CF Workflows does not
5593
+ // impose a play-level execution ceiling on this substrate, so without this a
5594
+ // runaway play (infinite loop, stuck await) would only stop when the executor
5595
+ // token expires. Aborting the controller surfaces cooperatively through the
5596
+ // same assertNotAborted checks used for harness cancellation.
5597
+ let runtimeLimitExceeded = false;
5598
+ const runtimeDeadlineTimer = setTimeout(() => {
5599
+ runtimeLimitExceeded = true;
5600
+ if (!abortSignal.aborted) {
5601
+ abortController.abort(
5602
+ `Play runtime limit exceeded after ${STANDARD_PLAY_RUNTIME_LIMIT_SECONDS}s.`,
5603
+ );
5604
+ }
5605
+ }, STANDARD_PLAY_RUNTIME_LIMIT_SECONDS * 1000);
5084
5606
  try {
5085
5607
  const playStartedAt = nowMs();
5086
5608
  const result = await (
@@ -5102,6 +5624,33 @@ async function executeRunRequest(
5102
5624
  phase: 'runner.serialize_result',
5103
5625
  ms: nowMs() - serializeStartedAt,
5104
5626
  });
5627
+ const terminalResult = trimResultForStatus(serializedResult);
5628
+ let parentSignalPromise: Promise<void> | null = null;
5629
+ const startParentTerminalSignal = (): Promise<void> => {
5630
+ if (!parentSignalPromise) {
5631
+ const parentSignalStartedAt = nowMs();
5632
+ parentSignalPromise = signalParentPlayTerminal({
5633
+ req,
5634
+ status: 'completed',
5635
+ result: terminalResult as Record<string, unknown>,
5636
+ })
5637
+ .catch((error) => {
5638
+ console.error(
5639
+ `[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
5640
+ error instanceof Error ? error.message : String(error)
5641
+ }`,
5642
+ );
5643
+ })
5644
+ .finally(() => {
5645
+ recordRunnerPerfTrace({
5646
+ req,
5647
+ phase: 'runner.parent_terminal_signal',
5648
+ ms: nowMs() - parentSignalStartedAt,
5649
+ });
5650
+ });
5651
+ }
5652
+ return parentSignalPromise;
5653
+ };
5105
5654
  if (options?.persistResultDatasets) {
5106
5655
  const ledgerFlushWaitStartedAt = nowMs();
5107
5656
  await ledgerFlushInFlight;
@@ -5117,7 +5666,7 @@ async function executeRunRequest(
5117
5666
  phase: 'runner.persist_result_datasets',
5118
5667
  ms: nowMs() - resultDatasetStartedAt,
5119
5668
  });
5120
- const terminalResult = trimResultForStatus(serializedResult);
5669
+ const parentSignal = startParentTerminalSignal();
5121
5670
  const terminalOccurredAt = nowMs();
5122
5671
  const terminalUpdateStartedAt = nowMs();
5123
5672
  await flushTerminalLedgerEvents({
@@ -5161,24 +5710,9 @@ async function executeRunRequest(
5161
5710
  await nonBlockingBillingPromise;
5162
5711
  }
5163
5712
  }
5713
+ await parentSignal;
5164
5714
  }
5165
- const parentSignalStartedAt = nowMs();
5166
- await signalParentPlayTerminal({
5167
- req,
5168
- status: 'completed',
5169
- result: trimResultForStatus(serializedResult) as Record<string, unknown>,
5170
- }).catch((error) => {
5171
- console.error(
5172
- `[play-harness] non-fatal parent completion signal failed runId=${req.runId}: ${
5173
- error instanceof Error ? error.message : String(error)
5174
- }`,
5175
- );
5176
- });
5177
- recordRunnerPerfTrace({
5178
- req,
5179
- phase: 'runner.parent_terminal_signal',
5180
- ms: nowMs() - parentSignalStartedAt,
5181
- });
5715
+ await startParentTerminalSignal();
5182
5716
  recordRunnerPerfTrace({
5183
5717
  req,
5184
5718
  phase: 'runner.execute_total',
@@ -5194,7 +5728,10 @@ async function executeRunRequest(
5194
5728
  };
5195
5729
  } catch (error) {
5196
5730
  stepLifecycle?.markStartedFailed(nowMs());
5197
- const aborted = isAbortLikeError(error);
5731
+ // A runtime-limit abort is a timeout failure, not a user cancellation, so
5732
+ // it should be reported as run.failed with the limit message rather than
5733
+ // run.cancelled.
5734
+ const aborted = isAbortLikeError(error) && !runtimeLimitExceeded;
5198
5735
  if (aborted) {
5199
5736
  // Flip the controller so any concurrent user code observes the abort
5200
5737
  // through ctx.signal. We mark the run cancelled instead of failed.
@@ -5253,6 +5790,8 @@ async function executeRunRequest(
5253
5790
  error: message,
5254
5791
  }).catch(() => null);
5255
5792
  throw error;
5793
+ } finally {
5794
+ clearTimeout(runtimeDeadlineTimer);
5256
5795
  }
5257
5796
  }
5258
5797
 
@@ -5851,6 +6390,9 @@ const workerEntrypoint = {
5851
6390
  },
5852
6391
  });
5853
6392
  }
6393
+ if (request.method === 'POST' && url.pathname === '/run-inline') {
6394
+ return handleRunInline(request, env);
6395
+ }
5854
6396
  if (request.method === 'POST' && url.pathname === '/run') {
5855
6397
  return handleRun(request, env);
5856
6398
  }