deepline 0.1.79 → 0.1.80

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/dist/cli/index.js +68 -31
  2. package/dist/cli/index.mjs +68 -31
  3. package/dist/index.d.mts +9 -1
  4. package/dist/index.d.ts +9 -1
  5. package/dist/index.js +7 -4
  6. package/dist/index.mjs +7 -4
  7. package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
  8. package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1102 -1616
  9. package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +506 -654
  10. package/dist/repo/apps/play-runner-workers/src/entry.ts +896 -354
  11. package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +8 -2
  12. package/dist/repo/sdk/src/client.ts +9 -2
  13. package/dist/repo/sdk/src/release.ts +2 -2
  14. package/dist/repo/sdk/src/types.ts +5 -0
  15. package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
  16. package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
  17. package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
  18. package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
  19. package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
  20. package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
  21. package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
  22. package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
  23. package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
  24. package/dist/repo/shared_libs/plays/static-pipeline.ts +56 -3
  25. package/dist/repo/shared_libs/temporal/constants.ts +38 -0
  26. package/package.json +1 -1
@@ -112,6 +112,7 @@ export type PlayWorkflowParams = {
112
112
  totalRows?: number;
113
113
  coordinatorUrl?: string | null;
114
114
  coordinatorInternalToken?: string | null;
115
+ submittedAt?: number | null;
115
116
  };
116
117
 
117
118
  type InlineChildSubmitResult =
@@ -175,6 +176,12 @@ type CoordinatorTerminalState = {
175
176
  completedAt?: number;
176
177
  };
177
178
 
179
+ type CoordinatorChildTerminalState = {
180
+ eventKey: string;
181
+ data: unknown;
182
+ storedAt: number;
183
+ };
184
+
178
185
  type CoordinatorRunEvent =
179
186
  | {
180
187
  seq?: number;
@@ -224,6 +231,11 @@ type InlineWorkerRunResponse = {
224
231
  outputRows?: number;
225
232
  durationMs?: number;
226
233
  parseMs?: number;
234
+ timings?: Array<{
235
+ phase?: unknown;
236
+ ms?: unknown;
237
+ extra?: unknown;
238
+ }>;
227
239
  events?: Array<
228
240
  | { type: 'log'; message?: string; level?: string; ts?: number }
229
241
  | { type: 'result'; result?: unknown; outputRows?: number; ts?: number }
@@ -238,7 +250,7 @@ function isRecord(value: unknown): value is Record<string, unknown> {
238
250
  }
239
251
 
240
252
  interface CoordinatorEnv {
241
- PLAY_WORKFLOW: Workflow<PlayWorkflowParams | PooledWorkflowBootstrapPayload>;
253
+ PLAY_WORKFLOW: Workflow<PlayWorkflowParams>;
242
254
  PLAY_DEDUP: DurableObjectNamespace;
243
255
  LOADER?: {
244
256
  get(
@@ -531,6 +543,67 @@ async function readCoordinatorTerminalState(
531
543
  return state as CoordinatorTerminalState;
532
544
  }
533
545
 
546
+ async function writeCoordinatorChildTerminalState(input: {
547
+ env: CoordinatorEnv;
548
+ parentRunId: string;
549
+ eventKey: string;
550
+ data: unknown;
551
+ }): Promise<void> {
552
+ const stub = input.env.PLAY_DEDUP.get(
553
+ input.env.PLAY_DEDUP.idFromName(input.parentRunId),
554
+ );
555
+ const response = await stub.fetch(
556
+ 'https://deepline.dedup.internal/child-terminal-set',
557
+ {
558
+ method: 'POST',
559
+ headers: { 'content-type': 'application/json' },
560
+ body: JSON.stringify({
561
+ eventKey: input.eventKey,
562
+ data: input.data,
563
+ storedAt: Date.now(),
564
+ }),
565
+ },
566
+ );
567
+ if (!response.ok) {
568
+ throw new Error(`coordinator child terminal set failed ${response.status}`);
569
+ }
570
+ }
571
+
572
+ async function readCoordinatorChildTerminalState(input: {
573
+ env: CoordinatorEnv;
574
+ parentRunId: string;
575
+ eventKey: string;
576
+ timeoutMs?: number;
577
+ }): Promise<CoordinatorChildTerminalState | null> {
578
+ const stub = input.env.PLAY_DEDUP.get(
579
+ input.env.PLAY_DEDUP.idFromName(input.parentRunId),
580
+ );
581
+ const endpoint =
582
+ input.timeoutMs && input.timeoutMs > 0
583
+ ? 'child-terminal-await'
584
+ : 'child-terminal-get';
585
+ const timeoutParam =
586
+ input.timeoutMs && input.timeoutMs > 0
587
+ ? `&timeoutMs=${encodeURIComponent(String(Math.floor(input.timeoutMs)))}`
588
+ : '';
589
+ const response = await stub.fetch(
590
+ `https://deepline.dedup.internal/${endpoint}?eventKey=${encodeURIComponent(
591
+ input.eventKey,
592
+ )}${timeoutParam}`,
593
+ );
594
+ if (!response.ok) {
595
+ throw new Error(
596
+ `coordinator child terminal ${endpoint} failed ${response.status}`,
597
+ );
598
+ }
599
+ const body = (await response.json().catch(() => ({}))) as {
600
+ state?: unknown;
601
+ };
602
+ const state = body.state;
603
+ if (!isRecord(state) || state.eventKey !== input.eventKey) return null;
604
+ return state as CoordinatorChildTerminalState;
605
+ }
606
+
534
607
  function workflowEventType(name: string): string {
535
608
  const normalized = name
536
609
  .trim()
@@ -555,38 +628,9 @@ type DynamicWorkflowMetadata = {
555
628
  }> | null;
556
629
  };
557
630
 
558
- type DispatcherEnvelope = {
559
- __dispatcherMetadata: DynamicWorkflowMetadata;
560
- params: PlayWorkflowParams;
561
- };
562
-
563
- type PooledWorkflowBootstrapPayload = {
564
- __deeplinePooledWorkflow: true;
565
- poolId: string;
566
- createdAt: number;
567
- };
568
-
569
- const WORKFLOW_POOL_PROTOCOL_VERSION =
570
- 'pooled-workflow-wait-v14-ready-signal-http-storage';
571
- const WORKFLOW_POOL_DO_NAME = 'workflow-pool:v2';
572
- const WORKFLOW_POOL_START_EVENT_TYPE = 'play_start';
573
- const WORKFLOW_POOL_TTL_MS = 8 * 60 * 1000;
574
- const WORKFLOW_POOL_TARGET_SIZE = 4;
575
- const WORKFLOW_POOL_READY_TIMEOUT_MS = 1_500;
576
- const WORKFLOW_POOL_READY_POLL_MS = 250;
577
- const WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS = 2_500;
578
- const WORKFLOW_POOL_REFILL_ON_MISS_MIN_AVAILABLE = 4;
579
- const WORKFLOW_POOL_CONTROL_TIMEOUT_MS = 750;
580
- const WORKFLOW_POOL_START_ACK_TIMEOUT_MS = 750;
581
- const WORKFLOW_POOL_START_ACK_POLL_MS = 25;
582
- const WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS = 3_000;
583
- const WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS = 50;
584
631
  const SUBMIT_INITIAL_STATE_MAX_WAIT_MS = 0;
585
632
  const SUBMIT_INITIAL_STATE_POLL_MS = 50;
586
633
  const WORKFLOW_RETRY_STATE_TTL_MS = 60 * 60 * 1000;
587
- const WORKFLOW_POOL_PREWARM_ESCALATE_TARGET_AFTER_MS = 250;
588
- const WORKFLOW_POOL_SCHEDULED_REFILL_MIN_AVAILABLE = 1;
589
- const WORKFLOW_POOL_SCHEDULED_REFILL_TIMEOUT_MS = 10_000;
590
634
 
591
635
  function buildDynamicWorkflowMetadata(
592
636
  params: PlayWorkflowParams,
@@ -601,35 +645,11 @@ function buildDynamicWorkflowMetadata(
601
645
  };
602
646
  }
603
647
 
604
- function buildDispatcherEnvelope(
605
- params: PlayWorkflowParams,
606
- ): DispatcherEnvelope {
607
- // Mirrors @cloudflare/dynamic-workflows' envelope. We need to send the
608
- // dispatcher payload via Workflow sendEvent for prewarmed instances; the
609
- // public wrapper only applies this envelope to create() params.
610
- return {
611
- __dispatcherMetadata: buildDynamicWorkflowMetadata(params),
612
- params,
613
- };
614
- }
615
-
616
- function isPooledWorkflowBootstrapPayload(
617
- value: unknown,
618
- ): value is PooledWorkflowBootstrapPayload {
619
- return (
620
- Boolean(value) &&
621
- typeof value === 'object' &&
622
- !Array.isArray(value) &&
623
- (value as Record<string, unknown>).__deeplinePooledWorkflow === true &&
624
- typeof (value as Record<string, unknown>).poolId === 'string'
625
- );
626
- }
627
-
628
648
  function readWorkflowTraceContext(event: unknown): {
629
649
  runId: string;
630
650
  graphHash: string | null;
631
651
  instanceId: string | null;
632
- pooledBootstrap: boolean;
652
+ submittedAt: number | null;
633
653
  } {
634
654
  const record = isRecord(event) ? event : {};
635
655
  const payload = isRecord(record.payload) ? record.payload : {};
@@ -637,17 +657,15 @@ function readWorkflowTraceContext(event: unknown): {
637
657
  const metadata = isRecord(payload.__dispatcherMetadata)
638
658
  ? payload.__dispatcherMetadata
639
659
  : null;
640
- const pooled = isPooledWorkflowBootstrapPayload(payload);
641
660
  const runId =
642
661
  (typeof params?.runId === 'string' && params.runId) ||
643
662
  (typeof metadata?.runId === 'string' && metadata.runId) ||
644
- (pooled && typeof payload.poolId === 'string' ? payload.poolId : null) ||
645
663
  (typeof record.instanceId === 'string' && record.instanceId) ||
646
664
  'unknown-workflow-run';
647
665
  const graphHash =
648
666
  (typeof params?.graphHash === 'string' && params.graphHash) ||
649
667
  (typeof metadata?.graphHash === 'string' && metadata.graphHash) ||
650
- (pooled ? 'workflow-pool' : null);
668
+ null;
651
669
  return {
652
670
  runId,
653
671
  graphHash,
@@ -655,18 +673,14 @@ function readWorkflowTraceContext(event: unknown): {
655
673
  typeof record.instanceId === 'string' && record.instanceId
656
674
  ? record.instanceId
657
675
  : null,
658
- pooledBootstrap: pooled,
676
+ submittedAt:
677
+ typeof params?.submittedAt === 'number' &&
678
+ Number.isFinite(params.submittedAt)
679
+ ? params.submittedAt
680
+ : null,
659
681
  };
660
682
  }
661
683
 
662
- function workflowPoolEnabled(): boolean {
663
- return WORKFLOW_POOL_TARGET_SIZE > 0;
664
- }
665
-
666
- function workflowPoolTargetSize(): number {
667
- return WORKFLOW_POOL_TARGET_SIZE;
668
- }
669
-
670
684
  async function waitForSubmitInitialState(input: {
671
685
  instance: WorkflowInstance;
672
686
  runId: string;
@@ -714,87 +728,110 @@ async function createDynamicWorkflowInstance(input: {
714
728
  });
715
729
  }
716
730
 
731
+ function runScopedDurableObject(
732
+ env: CoordinatorEnv,
733
+ runId: string,
734
+ ): DurableObjectStub {
735
+ return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(runId));
736
+ }
737
+
717
738
  /**
718
- * Returns the slug-rooted Durable Scope key for the workflow pool DO.
719
- *
720
- * The pool DO name is a *shared* (non-runId) key, so without slug-scoping
721
- * a single Cloudflare account running multiple PR previews would collide
722
- * on `workflow-pool:v2`. The slug isolates the pool per-preview.
723
- *
724
- * See docs/adr/0005-durable-scope.md.
739
+ * Address the rate-state Durable Object for a single `<orgId>:<provider>`
740
+ * bucket. Keying the DO by the bucket id (not the run id) makes one
741
+ * single-threaded instance own that bucket's request window across every
742
+ * isolate of every run in the org, which is exactly what the distributed Rate
743
+ * State Backend needs. Reuses the PlayDedup namespace (the DO already hosts the
744
+ * token-bucket handlers) so no extra binding is required.
725
745
  */
726
- function workflowPoolDurableObjectName(env: CoordinatorEnv): string {
727
- const slug = env.DEEPLINE_PLAY_PREVIEW_SLUG?.trim();
728
- return slug ? `${slug}:${WORKFLOW_POOL_DO_NAME}` : WORKFLOW_POOL_DO_NAME;
746
+ function rateBucketDurableObject(
747
+ env: CoordinatorEnv,
748
+ bucketId: string,
749
+ ): DurableObjectStub {
750
+ return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(`rate:${bucketId}`));
729
751
  }
730
752
 
731
- function workflowPoolDurableObject(env: CoordinatorEnv): DurableObjectStub {
732
- return env.PLAY_DEDUP.get(
733
- env.PLAY_DEDUP.idFromName(workflowPoolDurableObjectName(env)),
753
+ async function callRateBucketControl<T>(
754
+ env: CoordinatorEnv,
755
+ bucketId: string,
756
+ path: string,
757
+ body: unknown,
758
+ ): Promise<T> {
759
+ const response = await rateBucketDurableObject(env, bucketId).fetch(
760
+ `https://deepline.rate-state.internal${path}`,
761
+ {
762
+ method: 'POST',
763
+ headers: { 'content-type': 'application/json' },
764
+ body: JSON.stringify(body),
765
+ },
734
766
  );
767
+ if (!response.ok) {
768
+ throw new Error(
769
+ `rate state ${path} failed ${response.status}: ${(
770
+ await response.text().catch(() => '')
771
+ ).slice(0, 400)}`,
772
+ );
773
+ }
774
+ return (await response.json()) as T;
735
775
  }
736
776
 
737
- function runScopedDurableObject(
777
+ async function callRunScopedControl<T>(
738
778
  env: CoordinatorEnv,
739
779
  runId: string,
740
- ): DurableObjectStub {
741
- return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(runId));
742
- }
743
-
744
- async function callWorkflowPool<T>(
745
- env: CoordinatorEnv,
746
780
  path: string,
747
- init?: RequestInit & { timeoutMs?: number },
781
+ init?: RequestInit,
748
782
  ): Promise<T> {
749
- const timeoutMs = Math.max(
750
- 1,
751
- Math.floor(init?.timeoutMs ?? WORKFLOW_POOL_CONTROL_TIMEOUT_MS),
783
+ const response = await runScopedDurableObject(env, runId).fetch(
784
+ `https://deepline.run-state.internal${path}`,
785
+ {
786
+ ...(init ?? {}),
787
+ headers: {
788
+ 'content-type': 'application/json',
789
+ ...(init?.headers ?? {}),
790
+ },
791
+ },
752
792
  );
753
- let timeoutId: ReturnType<typeof setTimeout> | null = null;
754
- try {
755
- const fetchInit: RequestInit = { ...(init ?? {}) };
756
- delete (fetchInit as { timeoutMs?: number }).timeoutMs;
757
- delete fetchInit.signal;
758
- const response = await Promise.race([
759
- workflowPoolDurableObject(env).fetch(
760
- `https://deepline.workflow-pool.internal${path}`,
761
- {
762
- ...fetchInit,
763
- headers: {
764
- 'content-type': 'application/json',
765
- ...(init?.headers ?? {}),
766
- },
767
- },
768
- ),
769
- new Promise<Response>((_, reject) => {
770
- timeoutId = setTimeout(
771
- () =>
772
- reject(
773
- new Error(`workflow pool ${path} timed out after ${timeoutMs}ms`),
774
- ),
775
- timeoutMs,
776
- );
777
- }),
778
- ]);
779
- if (!response.ok) {
780
- throw new Error(
781
- `workflow pool ${path} failed ${response.status}: ${(
782
- await response.text().catch(() => '')
783
- ).slice(0, 400)}`,
784
- );
785
- }
786
- return (await response.json()) as T;
787
- } catch (error) {
788
- if (
789
- error instanceof Error &&
790
- (error.name === 'AbortError' || error.message.includes('aborted'))
791
- ) {
792
- throw new Error(`workflow pool ${path} timed out after ${timeoutMs}ms`);
793
- }
794
- throw error;
795
- } finally {
796
- if (timeoutId) clearTimeout(timeoutId);
793
+ if (!response.ok) {
794
+ throw new Error(
795
+ `run state ${path} failed ${response.status}: ${(
796
+ await response.text().catch(() => '')
797
+ ).slice(0, 400)}`,
798
+ );
797
799
  }
800
+ return (await response.json()) as T;
801
+ }
802
+
803
+ async function recordWorkflowInstanceId(input: {
804
+ env: CoordinatorEnv;
805
+ runId: string;
806
+ instanceId: string;
807
+ }): Promise<void> {
808
+ await callRunScopedControl<{ ok?: unknown }>(
809
+ input.env,
810
+ input.runId,
811
+ '/workflow-instance-put',
812
+ {
813
+ method: 'POST',
814
+ body: JSON.stringify({
815
+ runId: input.runId,
816
+ instanceId: input.instanceId,
817
+ ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
818
+ }),
819
+ },
820
+ );
821
+ }
822
+
823
+ async function resolveWorkflowInstanceIdForRun(
824
+ env: CoordinatorEnv,
825
+ runId: string,
826
+ ): Promise<string> {
827
+ const body = await callRunScopedControl<{ instanceId?: unknown }>(
828
+ env,
829
+ runId,
830
+ `/workflow-instance-get?runId=${encodeURIComponent(runId)}`,
831
+ ).catch(() => ({ instanceId: null }));
832
+ return typeof body.instanceId === 'string' && body.instanceId
833
+ ? body.instanceId
834
+ : workflowInstanceId(runId);
798
835
  }
799
836
 
800
837
  function assertEncryptedPreloadedDbSessions(
@@ -814,49 +851,6 @@ function assertEncryptedPreloadedDbSessions(
814
851
  }
815
852
  }
816
853
 
817
- async function persistWorkflowDbSessions(input: {
818
- env: CoordinatorEnv;
819
- runId: string;
820
- sessions: PreloadedRuntimeDbSession[];
821
- }): Promise<NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>> {
822
- assertEncryptedPreloadedDbSessions(input.sessions);
823
- const response = await runScopedDurableObject(input.env, input.runId).fetch(
824
- 'https://deepline.dedup.internal/db-sessions-put',
825
- {
826
- method: 'POST',
827
- headers: { 'content-type': 'application/json' },
828
- body: JSON.stringify({
829
- runId: input.runId,
830
- sessions: input.sessions,
831
- ttlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
832
- }),
833
- },
834
- );
835
- if (!response.ok) {
836
- throw new Error(
837
- `workflow db session storage failed ${response.status}: ${(
838
- await response.text().catch(() => '')
839
- ).slice(0, 400)}`,
840
- );
841
- }
842
- const body = (await response.json().catch(() => ({}))) as {
843
- sessionCount?: unknown;
844
- expiresAt?: unknown;
845
- };
846
- return {
847
- runId: input.runId,
848
- sessionCount:
849
- typeof body.sessionCount === 'number' &&
850
- Number.isFinite(body.sessionCount)
851
- ? body.sessionCount
852
- : input.sessions.length,
853
- expiresAt:
854
- typeof body.expiresAt === 'number' && Number.isFinite(body.expiresAt)
855
- ? body.expiresAt
856
- : Date.now() + DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
857
- };
858
- }
859
-
860
854
  async function readWorkflowDbSessions(input: {
861
855
  env: CoordinatorEnv;
862
856
  ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>;
@@ -887,34 +881,56 @@ async function readWorkflowDbSessions(input: {
887
881
  return sessions;
888
882
  }
889
883
 
890
- async function externalizeWorkflowDbSessions(input: {
884
+ async function readWorkflowDbSessionsWithRetry(input: {
891
885
  env: CoordinatorEnv;
886
+ ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>;
887
+ }): Promise<PreloadedRuntimeDbSession[]> {
888
+ const delays = [25, 50, 100, 200] as const;
889
+ let lastError: unknown = null;
890
+ for (let attempt = 0; attempt <= delays.length; attempt += 1) {
891
+ try {
892
+ return await readWorkflowDbSessions(input);
893
+ } catch (error) {
894
+ lastError = error;
895
+ const message = error instanceof Error ? error.message : String(error);
896
+ if (
897
+ !message.includes('workflow db session lookup failed 404') ||
898
+ attempt >= delays.length
899
+ ) {
900
+ throw error;
901
+ }
902
+ await sleep(delays[attempt]);
903
+ }
904
+ }
905
+ throw lastError instanceof Error ? lastError : new Error(String(lastError));
906
+ }
907
+
908
+ function externalizedWorkflowDbSessionParams(input: {
892
909
  params: PlayWorkflowParams;
893
- recordSubmitTiming?: (timing: CoordinatorTiming) => void;
894
- }): Promise<PlayWorkflowParams> {
910
+ }): {
911
+ params: PlayWorkflowParams;
912
+ sessions: PreloadedRuntimeDbSession[];
913
+ ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']> | null;
914
+ } {
895
915
  const sessions = Array.isArray(input.params.preloadedDbSessions)
896
916
  ? input.params.preloadedDbSessions
897
917
  : [];
898
- if (sessions.length === 0) return input.params;
899
- const startedAt = Date.now();
900
- const ref = await persistWorkflowDbSessions({
901
- env: input.env,
918
+ if (sessions.length === 0) {
919
+ return { params: input.params, sessions, ref: null };
920
+ }
921
+ const ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']> = {
902
922
  runId: input.params.runId,
903
- sessions,
904
- });
905
- input.recordSubmitTiming?.({
906
- phase: 'coordinator.workflow_db_sessions_externalized',
907
- ms: Date.now() - startedAt,
908
- graphHash: input.params.graphHash ?? null,
909
- extra: {
910
- sessions: sessions.length,
911
- expiresAt: ref.expiresAt,
912
- },
913
- });
923
+ sessionCount: sessions.length,
924
+ expiresAt: Date.now() + DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
925
+ };
914
926
  return {
915
- ...input.params,
916
- preloadedDbSessions: null,
917
- preloadedDbSessionRef: ref,
927
+ params: {
928
+ ...input.params,
929
+ preloadedDbSessions: null,
930
+ preloadedDbSessionRef: ref,
931
+ },
932
+ sessions,
933
+ ref,
918
934
  };
919
935
  }
920
936
 
@@ -951,7 +967,10 @@ async function hydrateWorkflowDbSessions(input: {
951
967
  const ref = readPreloadedDbSessionRef(params.preloadedDbSessionRef);
952
968
  if (!ref) return input.event;
953
969
  const startedAt = Date.now();
954
- const sessions = await readWorkflowDbSessions({ env: input.env, ref });
970
+ const sessions = await readWorkflowDbSessionsWithRetry({
971
+ env: input.env,
972
+ ref,
973
+ });
955
974
  input.trace({
956
975
  runId: ref.runId,
957
976
  phase: 'coordinator.workflow_db_sessions_hydrated',
@@ -979,331 +998,99 @@ async function hydrateWorkflowDbSessions(input: {
979
998
  };
980
999
  }
981
1000
 
982
- type WorkflowPoolCounts = {
983
- available: number;
984
- warming: number;
985
- };
986
-
987
- type WorkflowPoolRefillResult = WorkflowPoolCounts & {
988
- target: number;
989
- created: number;
990
- promoted: number;
991
- removed: number;
992
- waitedMs: number;
993
- waitIterations: number;
994
- };
995
-
996
- type WorkflowPoolListEntry = {
997
- id: string;
998
- state: string;
999
- createdAt: number;
1000
- readyAt: number | null;
1001
- expiresAt: number;
1002
- };
1003
-
1004
- async function workflowPoolCount(
1005
- env: CoordinatorEnv,
1006
- ): Promise<WorkflowPoolCounts> {
1007
- const body = await callWorkflowPool<{
1008
- available?: unknown;
1009
- warming?: unknown;
1010
- }>(
1011
- env,
1012
- `/pool-count?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
1013
- );
1014
- return {
1015
- available: typeof body.available === 'number' ? body.available : 0,
1016
- warming: typeof body.warming === 'number' ? body.warming : 0,
1017
- };
1018
- }
1019
-
1020
1001
  function sleep(ms: number): Promise<void> {
1021
1002
  return new Promise((resolve) => setTimeout(resolve, ms));
1022
1003
  }
1023
1004
 
1024
- async function listWorkflowPoolEntries(
1025
- env: CoordinatorEnv,
1026
- ): Promise<WorkflowPoolListEntry[]> {
1027
- const body = await callWorkflowPool<{ entries?: unknown }>(
1028
- env,
1029
- `/pool-list?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
1030
- );
1031
- if (!Array.isArray(body.entries)) return [];
1032
- return body.entries
1033
- .filter((entry): entry is Record<string, unknown> =>
1034
- Boolean(entry && typeof entry === 'object' && !Array.isArray(entry)),
1035
- )
1036
- .map((entry) => ({
1037
- id: typeof entry.id === 'string' ? entry.id : '',
1038
- state: typeof entry.state === 'string' ? entry.state : '',
1039
- createdAt:
1040
- typeof entry.createdAt === 'number' && Number.isFinite(entry.createdAt)
1041
- ? entry.createdAt
1042
- : 0,
1043
- readyAt:
1044
- typeof entry.readyAt === 'number' && Number.isFinite(entry.readyAt)
1045
- ? entry.readyAt
1046
- : null,
1047
- expiresAt:
1048
- typeof entry.expiresAt === 'number' && Number.isFinite(entry.expiresAt)
1049
- ? entry.expiresAt
1050
- : 0,
1051
- }))
1052
- .filter((entry) => entry.id);
1005
+ function readWorkflowPayload(event: unknown): Record<string, unknown> | null {
1006
+ if (!isRecord(event)) return null;
1007
+ const payload = event.payload;
1008
+ if (!isRecord(payload)) return null;
1009
+ return isRecord(payload.params) ? payload.params : payload;
1053
1010
  }
1054
1011
 
1055
- async function addWorkflowPoolIds(
1056
- env: CoordinatorEnv,
1057
- ids: string[],
1058
- options?: { ready?: boolean },
1059
- ): Promise<void> {
1060
- if (ids.length === 0) return;
1061
- await callWorkflowPool(env, '/pool-add', {
1062
- method: 'POST',
1063
- body: JSON.stringify({
1064
- ids,
1065
- ttlMs: WORKFLOW_POOL_TTL_MS,
1066
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1067
- ready: options?.ready === true,
1068
- ...(options?.ready === true ? { readyAt: Date.now() } : {}),
1069
- }),
1012
+ async function markWorkflowRuntimeFailure(input: {
1013
+ env: CoordinatorEnv;
1014
+ event: unknown;
1015
+ error: unknown;
1016
+ }): Promise<void> {
1017
+ const payload = readWorkflowPayload(input.event);
1018
+ if (!payload) return;
1019
+ const runId = typeof payload.runId === 'string' ? payload.runId : null;
1020
+ const baseUrl = typeof payload.baseUrl === 'string' ? payload.baseUrl : null;
1021
+ const executorToken =
1022
+ typeof payload.executorToken === 'string' ? payload.executorToken : null;
1023
+ if (!runId || !baseUrl || !executorToken) return;
1024
+ const errorName =
1025
+ input.error instanceof Error && input.error.name
1026
+ ? input.error.name
1027
+ : 'Error';
1028
+ const errorMessage =
1029
+ input.error instanceof Error ? input.error.message : String(input.error);
1030
+ const errorStack =
1031
+ input.error instanceof Error && typeof input.error.stack === 'string'
1032
+ ? input.error.stack.split('\n').slice(0, 12).join('\n')
1033
+ : null;
1034
+ const headers = new Headers({
1035
+ authorization: `Bearer ${executorToken}`,
1036
+ 'content-type': 'application/json',
1070
1037
  });
1071
- }
1072
-
1073
- async function markWorkflowPoolIdReady(
1074
- env: CoordinatorEnv,
1075
- poolId: string,
1076
- ): Promise<boolean> {
1077
- const body = await callWorkflowPool<{ ready?: unknown }>(env, '/pool-ready', {
1078
- method: 'POST',
1079
- body: JSON.stringify({
1080
- poolId,
1081
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1082
- }),
1083
- });
1084
- return body.ready === true;
1085
- }
1086
-
1087
- async function promoteWorkflowPoolIds(
1088
- env: CoordinatorEnv,
1089
- ids: string[],
1090
- ): Promise<void> {
1091
- if (ids.length === 0) return;
1092
- await callWorkflowPool(env, '/pool-promote', {
1093
- method: 'POST',
1094
- body: JSON.stringify({
1095
- ids,
1096
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1097
- }),
1098
- });
1099
- }
1100
-
1101
- async function deleteWorkflowPoolIds(
1102
- env: CoordinatorEnv,
1103
- ids: string[],
1104
- ): Promise<void> {
1105
- if (ids.length === 0) return;
1106
- await callWorkflowPool(env, '/pool-delete', {
1107
- method: 'POST',
1108
- body: JSON.stringify({
1109
- ids,
1110
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1111
- }),
1112
- });
1113
- }
1114
-
1115
- async function leaseWorkflowPoolId(
1116
- env: CoordinatorEnv,
1117
- runId: string,
1118
- ): Promise<string | null> {
1119
- const body = await callWorkflowPool<{ id?: unknown }>(
1120
- env,
1121
- `/pool-claim?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
1122
- {
1123
- method: 'POST',
1124
- body: JSON.stringify({ runId }),
1125
- },
1126
- );
1127
- return typeof body.id === 'string' && body.id ? body.id : null;
1128
- }
1129
-
1130
- async function leaseWorkflowPoolIdWithMissRecovery(input: {
1131
- env: CoordinatorEnv;
1132
- runId: string;
1133
- recordSubmitTiming: (timing: CoordinatorTiming) => void;
1134
- graphHash?: string | null;
1135
- }): Promise<{
1136
- pooledInstanceId: string | null;
1137
- missCounts: WorkflowPoolCounts | null;
1138
- leaseError: string | null;
1139
- }> {
1140
- let leaseError: string | null = null;
1141
- let pooledInstanceId = await leaseWorkflowPoolId(
1142
- input.env,
1143
- input.runId,
1144
- ).catch((error) => {
1145
- leaseError = error instanceof Error ? error.message : String(error);
1146
- return null;
1147
- });
1148
- let missCounts = pooledInstanceId
1149
- ? null
1150
- : await workflowPoolCount(input.env).catch(() => null);
1151
- if (
1152
- pooledInstanceId ||
1153
- leaseError ||
1154
- !missCounts ||
1155
- missCounts.available + missCounts.warming <= 0
1156
- ) {
1157
- return { pooledInstanceId, missCounts, leaseError };
1158
- }
1159
-
1160
- const recoveryStartedAt = Date.now();
1161
- const refill = await refillWorkflowPool(input.env, {
1162
- minAvailable: 1,
1163
- waitReady: true,
1164
- waitTimeoutMs: WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS,
1165
- }).catch((error) => {
1166
- input.recordSubmitTiming({
1167
- phase: 'coordinator.workflow_pool_refill_on_miss',
1168
- ms: Date.now() - recoveryStartedAt,
1169
- graphHash: input.graphHash ?? null,
1170
- extra: {
1171
- status: 'failed',
1172
- error: error instanceof Error ? error.message : String(error),
1173
- available: missCounts?.available ?? null,
1174
- warming: missCounts?.warming ?? null,
1175
- },
1176
- });
1177
- return null;
1038
+ const bypass = input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
1039
+ if (bypass) headers.set('x-vercel-protection-bypass', bypass);
1040
+ const body = JSON.stringify({
1041
+ action: 'append_run_events',
1042
+ playId: runId,
1043
+ events: [
1044
+ {
1045
+ type: 'run.failed',
1046
+ runId,
1047
+ source: 'coordinator',
1048
+ occurredAt: Date.now(),
1049
+ error: `DynamicWorkflow runner failed: ${errorName}: ${errorMessage}${
1050
+ errorStack ? `\n${errorStack}` : ''
1051
+ }`,
1052
+ } satisfies PlayRunLedgerEvent,
1053
+ ],
1178
1054
  });
1179
- if (refill) {
1180
- input.recordSubmitTiming({
1181
- phase: 'coordinator.workflow_pool_refill_on_miss',
1182
- ms: Date.now() - recoveryStartedAt,
1183
- graphHash: input.graphHash ?? null,
1184
- extra: {
1185
- status: 'ok',
1186
- available: refill.available,
1187
- warming: refill.warming,
1188
- created: refill.created,
1189
- promoted: refill.promoted,
1190
- removed: refill.removed,
1191
- waitedMs: refill.waitedMs,
1192
- waitIterations: refill.waitIterations,
1193
- },
1194
- });
1195
- }
1196
-
1197
- let retryCount = 0;
1198
- const retryStartedAt = Date.now();
1199
- while (
1200
- Date.now() - retryStartedAt <
1201
- WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS
1202
- ) {
1203
- retryCount += 1;
1204
- pooledInstanceId = await leaseWorkflowPoolId(input.env, input.runId).catch(
1205
- (error) => {
1206
- leaseError = error instanceof Error ? error.message : String(error);
1207
- return null;
1208
- },
1209
- );
1210
- if (pooledInstanceId || leaseError) {
1211
- break;
1055
+ const url = `${baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/runtime`;
1056
+ const backoffMs = [200, 500, 1500];
1057
+ let lastError: unknown = null;
1058
+ for (let attempt = 0; attempt <= backoffMs.length; attempt += 1) {
1059
+ try {
1060
+ const response = await fetch(url, { method: 'POST', headers, body });
1061
+ if (response.ok) return;
1062
+ lastError = new Error(
1063
+ `runtime API responded ${response.status}: ${(await response.text().catch(() => '')).slice(0, 400)}`,
1064
+ );
1065
+ if (
1066
+ response.status >= 400 &&
1067
+ response.status < 500 &&
1068
+ response.status !== 408 &&
1069
+ response.status !== 429
1070
+ ) {
1071
+ break;
1072
+ }
1073
+ } catch (error) {
1074
+ lastError = error;
1212
1075
  }
1213
- missCounts = await workflowPoolCount(input.env).catch(() => missCounts);
1214
- if (!missCounts || missCounts.available + missCounts.warming <= 0) {
1215
- break;
1076
+ if (attempt < backoffMs.length) {
1077
+ await new Promise((resolve) => setTimeout(resolve, backoffMs[attempt]));
1216
1078
  }
1217
- await sleep(WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS);
1218
1079
  }
1219
- input.recordSubmitTiming({
1220
- phase: 'coordinator.workflow_pool_claim_retry',
1221
- ms: Date.now() - retryStartedAt,
1222
- graphHash: input.graphHash ?? null,
1223
- extra: {
1224
- pooled: Boolean(pooledInstanceId),
1225
- retries: retryCount,
1226
- ...(leaseError ? { error: leaseError } : {}),
1227
- ...(missCounts
1228
- ? {
1229
- availableAfterRetry: missCounts.available,
1230
- warmingAfterRetry: missCounts.warming,
1231
- }
1232
- : {}),
1233
- },
1080
+ console.error('[coordinator] failed to mark workflow runtime failure', {
1081
+ runId,
1082
+ message: lastError instanceof Error ? lastError.message : String(lastError),
1234
1083
  });
1235
-
1236
- return { pooledInstanceId, missCounts, leaseError };
1237
- }
1238
-
1239
- async function mapRunToWorkflowInstance(input: {
1240
- env: CoordinatorEnv;
1241
- runId: string;
1242
- instanceId: string;
1243
- started?: boolean;
1244
- }): Promise<boolean> {
1245
- const body = await callWorkflowPool<{ mapped?: unknown }>(
1246
- input.env,
1247
- '/pool-map-run',
1248
- {
1249
- method: 'POST',
1250
- body: JSON.stringify({
1251
- runId: input.runId,
1252
- instanceId: input.instanceId,
1253
- started: input.started === true,
1254
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1255
- }),
1256
- },
1257
- );
1258
- return body.mapped !== false;
1259
1084
  }
1260
1085
 
1261
- async function blockWorkflowPoolRun(input: {
1262
- env: CoordinatorEnv;
1263
- runId: string;
1264
- instanceId: string;
1265
- }): Promise<{ blocked: boolean; started: boolean }> {
1266
- const body = await callWorkflowPool<{
1267
- blocked?: unknown;
1268
- started?: unknown;
1269
- }>(input.env, '/pool-block-run', {
1270
- method: 'POST',
1271
- body: JSON.stringify({
1272
- runId: input.runId,
1273
- instanceId: input.instanceId,
1274
- version: WORKFLOW_POOL_PROTOCOL_VERSION,
1275
- }),
1276
- });
1277
- return {
1278
- blocked: body.blocked === true,
1279
- started: body.started === true,
1086
+ type StoredPlayArtifactPayload = {
1087
+ artifact?: {
1088
+ bundledCode?: string;
1089
+ artifactKind?: string;
1280
1090
  };
1281
- }
1091
+ };
1282
1092
 
1283
- async function readWorkflowPoolRunMapping(input: {
1284
- env: CoordinatorEnv;
1285
- runId: string;
1286
- }): Promise<{ instanceId: string | null; startedAt: number | null }> {
1287
- const body = await callWorkflowPool<{
1288
- instanceId?: unknown;
1289
- startedAt?: unknown;
1290
- }>(
1291
- input.env,
1292
- `/pool-resolve-run?runId=${encodeURIComponent(input.runId)}&version=${encodeURIComponent(
1293
- WORKFLOW_POOL_PROTOCOL_VERSION,
1294
- )}`,
1295
- ).catch(() => ({ instanceId: null, startedAt: null }));
1296
- return {
1297
- instanceId:
1298
- typeof body.instanceId === 'string' && body.instanceId
1299
- ? body.instanceId
1300
- : null,
1301
- startedAt:
1302
- typeof body.startedAt === 'number' && Number.isFinite(body.startedAt)
1303
- ? body.startedAt
1304
- : null,
1305
- };
1306
- }
1093
+ const DYNAMIC_WORKER_COMPATIBILITY_DATE = '2026-05-01';
1307
1094
 
1308
1095
  async function persistWorkflowRetryState(input: {
1309
1096
  env: CoordinatorEnv;
@@ -1354,10 +1141,103 @@ async function persistWorkflowRetryState(input: {
1354
1141
  ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
1355
1142
  };
1356
1143
  }
1357
- await callWorkflowPool<{ ok?: unknown }>(input.env, '/run-retry-state-put', {
1144
+ await callRunScopedControl<{ ok?: unknown }>(
1145
+ input.env,
1146
+ input.runId,
1147
+ '/run-retry-state-put',
1148
+ {
1149
+ method: 'POST',
1150
+ body: JSON.stringify(body),
1151
+ },
1152
+ );
1153
+ }
1154
+
1155
+ async function persistWorkflowLaunchState(input: {
1156
+ env: CoordinatorEnv;
1157
+ runId: string;
1158
+ params: PlayWorkflowParams;
1159
+ sessions: PreloadedRuntimeDbSession[];
1160
+ }): Promise<{
1161
+ retryExpiresAt?: number;
1162
+ dbSessionsExpiresAt?: number;
1163
+ sessionCount?: number;
1164
+ }> {
1165
+ if (input.sessions.length === 0) {
1166
+ await persistWorkflowRetryState({
1167
+ env: input.env,
1168
+ runId: input.runId,
1169
+ params: input.params,
1170
+ });
1171
+ return {};
1172
+ }
1173
+ const retryParams = buildWorkflowRetryParams(input.params);
1174
+ const paramsBytes = jsonByteLength(retryParams);
1175
+ let body: {
1176
+ runId: string;
1177
+ params?: PlayWorkflowParams;
1178
+ paramsRef?: WorkflowRetryParamsRef;
1179
+ paramsBytes: number;
1180
+ sessions: PreloadedRuntimeDbSession[];
1181
+ retryTtlMs: number;
1182
+ dbSessionsTtlMs: number;
1183
+ };
1184
+ if (paramsBytes > WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES) {
1185
+ const serialized = JSON.stringify(retryParams);
1186
+ const hash = stableHash(serialized);
1187
+ const storageKey = workflowRetryParamsStorageKey({
1188
+ runId: input.runId,
1189
+ hash,
1190
+ });
1191
+ await input.env.PLAYS_BUCKET.put(storageKey, serialized, {
1192
+ httpMetadata: { contentType: 'application/json' },
1193
+ });
1194
+ body = {
1195
+ runId: input.runId,
1196
+ paramsRef: {
1197
+ storageKind: 'r2',
1198
+ storageKey,
1199
+ bytes: paramsBytes,
1200
+ hash,
1201
+ expiresAt: Date.now() + WORKFLOW_RETRY_STATE_TTL_MS,
1202
+ },
1203
+ paramsBytes,
1204
+ sessions: input.sessions,
1205
+ retryTtlMs: WORKFLOW_RETRY_STATE_TTL_MS,
1206
+ dbSessionsTtlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
1207
+ };
1208
+ } else {
1209
+ body = {
1210
+ runId: input.runId,
1211
+ params: retryParams,
1212
+ paramsBytes,
1213
+ sessions: input.sessions,
1214
+ retryTtlMs: WORKFLOW_RETRY_STATE_TTL_MS,
1215
+ dbSessionsTtlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
1216
+ };
1217
+ }
1218
+ const response = await callRunScopedControl<{
1219
+ ok?: unknown;
1220
+ retryExpiresAt?: unknown;
1221
+ dbSessionsExpiresAt?: unknown;
1222
+ sessionCount?: unknown;
1223
+ }>(input.env, input.runId, '/run-launch-state-put', {
1358
1224
  method: 'POST',
1359
1225
  body: JSON.stringify(body),
1360
1226
  });
1227
+ return {
1228
+ retryExpiresAt:
1229
+ typeof response.retryExpiresAt === 'number'
1230
+ ? response.retryExpiresAt
1231
+ : undefined,
1232
+ dbSessionsExpiresAt:
1233
+ typeof response.dbSessionsExpiresAt === 'number'
1234
+ ? response.dbSessionsExpiresAt
1235
+ : undefined,
1236
+ sessionCount:
1237
+ typeof response.sessionCount === 'number'
1238
+ ? response.sessionCount
1239
+ : undefined,
1240
+ };
1361
1241
  }
1362
1242
 
1363
1243
  async function hydrateWorkflowRetryParams(input: {
@@ -1434,12 +1314,12 @@ async function claimWorkflowPlatformRetry(input: {
1434
1314
  attempts: number;
1435
1315
  params: PlayWorkflowParams | null;
1436
1316
  }> {
1437
- const body = await callWorkflowPool<{
1317
+ const body = await callRunScopedControl<{
1438
1318
  claimed?: unknown;
1439
1319
  attempts?: unknown;
1440
1320
  params?: unknown;
1441
1321
  paramsRef?: unknown;
1442
- }>(input.env, '/run-retry-claim', {
1322
+ }>(input.env, input.runId, '/run-retry-claim', {
1443
1323
  method: 'POST',
1444
1324
  body: JSON.stringify({
1445
1325
  runId: input.runId,
@@ -1506,11 +1386,10 @@ async function restartWorkflowAfterPlatformReset(input: {
1506
1386
  id: retryInstanceId,
1507
1387
  params: claim.params,
1508
1388
  });
1509
- await mapRunToWorkflowInstance({
1389
+ await recordWorkflowInstanceId({
1510
1390
  env: input.env,
1511
1391
  runId: input.runId,
1512
1392
  instanceId: retryInstance.id,
1513
- started: true,
1514
1393
  });
1515
1394
  input.ctx?.waitUntil(input.oldInstance.terminate().catch(() => undefined));
1516
1395
  recordCoordinatorPerfTraceBuffered(input.env, input.ctx, {
@@ -1544,609 +1423,8 @@ async function restartWorkflowAfterPlatformReset(input: {
1544
1423
  }
1545
1424
  }
1546
1425
 
1547
- async function waitForWorkflowPoolStartAck(input: {
1548
- env: CoordinatorEnv;
1549
- runId: string;
1550
- instanceId: string;
1551
- timeoutMs: number;
1552
- }): Promise<{
1553
- acknowledged: boolean;
1554
- ms: number;
1555
- polls: number;
1556
- startedAt: number | null;
1557
- mappedInstanceId: string | null;
1558
- }> {
1559
- const startedAt = Date.now();
1560
- let polls = 0;
1561
- let latestMapping: { instanceId: string | null; startedAt: number | null } = {
1562
- instanceId: null,
1563
- startedAt: null,
1564
- };
1565
- while (Date.now() - startedAt < input.timeoutMs) {
1566
- polls += 1;
1567
- latestMapping = await readWorkflowPoolRunMapping({
1568
- env: input.env,
1569
- runId: input.runId,
1570
- });
1571
- if (
1572
- latestMapping.instanceId === input.instanceId &&
1573
- latestMapping.startedAt !== null
1574
- ) {
1575
- return {
1576
- acknowledged: true,
1577
- ms: Date.now() - startedAt,
1578
- polls,
1579
- startedAt: latestMapping.startedAt,
1580
- mappedInstanceId: latestMapping.instanceId,
1581
- };
1582
- }
1583
- await sleep(WORKFLOW_POOL_START_ACK_POLL_MS);
1584
- }
1585
- return {
1586
- acknowledged: false,
1587
- ms: Date.now() - startedAt,
1588
- polls,
1589
- startedAt: latestMapping.startedAt,
1590
- mappedInstanceId: latestMapping.instanceId,
1591
- };
1592
- }
1593
-
1594
- async function resolveWorkflowInstanceIdForRun(
1595
- env: CoordinatorEnv,
1596
- runId: string,
1597
- ): Promise<string> {
1598
- if (!workflowPoolEnabled()) {
1599
- return workflowInstanceId(runId);
1600
- }
1601
- const mapping = await readWorkflowPoolRunMapping({ env, runId });
1602
- return mapping.instanceId ? mapping.instanceId : workflowInstanceId(runId);
1603
- }
1604
-
1605
- async function clearWorkflowPool(env: CoordinatorEnv): Promise<number> {
1606
- const entries = await listWorkflowPoolEntries(env).catch(() => []);
1607
- const body = await callWorkflowPool<{ deleted?: unknown }>(
1608
- env,
1609
- `/pool-clear?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
1610
- { method: 'POST', body: '{}' },
1611
- );
1612
- await Promise.all(
1613
- entries.map(async (entry) => {
1614
- const instance = await getWorkflowPoolInstance(env, entry.id);
1615
- if (!instance) {
1616
- return;
1617
- }
1618
- try {
1619
- await instance.terminate().catch(() => undefined);
1620
- } finally {
1621
- disposeRpcStub(instance);
1622
- }
1623
- }),
1624
- );
1625
- return typeof body.deleted === 'number' ? body.deleted : 0;
1626
- }
1627
-
1628
- function workflowStatusName(status: InstanceStatus | null): string {
1629
- return typeof status?.status === 'string' ? status.status : 'unknown';
1630
- }
1631
-
1632
- function isWorkflowInstanceNotFoundError(error: unknown): boolean {
1633
- const message = error instanceof Error ? error.message : String(error);
1634
- return /not[ _]found|not_found|does not exist|no such instance|404/i.test(
1635
- message,
1636
- );
1637
- }
1638
-
1639
- async function getWorkflowPoolInstance(
1640
- env: CoordinatorEnv,
1641
- instanceId: string,
1642
- ): Promise<WorkflowInstance | null> {
1643
- try {
1644
- return await env.PLAY_WORKFLOW.get(instanceId);
1645
- } catch (error) {
1646
- if (isWorkflowInstanceNotFoundError(error)) {
1647
- return null;
1648
- }
1649
- throw error;
1650
- }
1651
- }
1652
-
1653
- function workflowPoolStatusIsReady(statusName: string): boolean {
1654
- // This is only a liveness guard. Readiness itself comes from the pooled
1655
- // Workflow calling /pool-ready after waitForEvent("play_start") has been
1656
- // created, because Cloudflare may report an armed wait as "running".
1657
- return statusName === 'running' || statusName === 'waiting';
1658
- }
1659
-
1660
- async function waitForWorkflowPoolReadySignal(input: {
1661
- env: CoordinatorEnv;
1662
- instance: WorkflowInstance;
1663
- poolId: string;
1664
- }): Promise<{
1665
- ready: boolean;
1666
- status: string;
1667
- ms: number;
1668
- polls: number;
1669
- }> {
1670
- const startedAt = Date.now();
1671
- let lastStatusName = 'unknown';
1672
- let polls = 0;
1673
- while (Date.now() - startedAt < WORKFLOW_POOL_READY_TIMEOUT_MS) {
1674
- polls += 1;
1675
- const [entry, status] = await Promise.all([
1676
- listWorkflowPoolEntries(input.env)
1677
- .then((entries) =>
1678
- entries.find((candidate) => candidate.id === input.poolId),
1679
- )
1680
- .catch(() => undefined),
1681
- input.instance.status().catch(() => null),
1682
- ]);
1683
- const statusName = workflowStatusName(status);
1684
- lastStatusName = statusName;
1685
- if (entry?.state === 'ready' && entry.readyAt !== null) {
1686
- return {
1687
- ready: true,
1688
- status: statusName,
1689
- ms: Date.now() - startedAt,
1690
- polls,
1691
- };
1692
- }
1693
- if (
1694
- statusName === 'complete' ||
1695
- statusName === 'errored' ||
1696
- statusName === 'terminated' ||
1697
- statusName === 'unknown'
1698
- ) {
1699
- return {
1700
- ready: false,
1701
- status: statusName,
1702
- ms: Date.now() - startedAt,
1703
- polls,
1704
- };
1705
- }
1706
- await sleep(WORKFLOW_POOL_READY_POLL_MS);
1707
- }
1708
- return {
1709
- ready: false,
1710
- status: lastStatusName,
1711
- ms: Date.now() - startedAt,
1712
- polls,
1713
- };
1714
- }
1715
-
1716
- async function refillWorkflowPoolOnce(
1717
- env: CoordinatorEnv,
1718
- ): Promise<Omit<WorkflowPoolRefillResult, 'waitedMs' | 'waitIterations'>> {
1719
- if (!workflowPoolEnabled()) {
1720
- return {
1721
- available: 0,
1722
- warming: 0,
1723
- target: 0,
1724
- created: 0,
1725
- promoted: 0,
1726
- removed: 0,
1727
- };
1728
- }
1729
- const target = workflowPoolTargetSize();
1730
- const entries = await listWorkflowPoolEntries(env);
1731
- const warmingEntries = entries.filter((entry) => entry.readyAt === null);
1732
- const promotedIds: string[] = [];
1733
- const removedIds: string[] = [];
1734
- for (const entry of warmingEntries) {
1735
- const instance = await getWorkflowPoolInstance(env, entry.id);
1736
- if (!instance) {
1737
- removedIds.push(entry.id);
1738
- continue;
1739
- }
1740
- try {
1741
- if (entry.state === 'ready' && entry.readyAt !== null) {
1742
- promotedIds.push(entry.id);
1743
- continue;
1744
- }
1745
- const status = await instance.status().catch(() => null);
1746
- const statusName = workflowStatusName(status);
1747
- if (
1748
- statusName === 'complete' ||
1749
- statusName === 'errored' ||
1750
- statusName === 'terminated' ||
1751
- statusName === 'unknown'
1752
- ) {
1753
- removedIds.push(entry.id);
1754
- }
1755
- } finally {
1756
- disposeRpcStub(instance);
1757
- }
1758
- }
1759
- await Promise.all([
1760
- promoteWorkflowPoolIds(env, promotedIds),
1761
- deleteWorkflowPoolIds(env, removedIds),
1762
- ]);
1763
- const counts = await workflowPoolCount(env);
1764
- const totalTracked = counts.available + counts.warming;
1765
- const needed = Math.max(0, target - totalTracked);
1766
- if (needed === 0) {
1767
- return {
1768
- available: counts.available,
1769
- warming: counts.warming,
1770
- target,
1771
- created: 0,
1772
- promoted: promotedIds.length,
1773
- removed: removedIds.length,
1774
- };
1775
- }
1776
- const created = await Promise.all(
1777
- Array.from({ length: needed }, async () => {
1778
- const poolId = `pool-v2-${Date.now().toString(36)}-${crypto.randomUUID().slice(0, 12)}`;
1779
- await addWorkflowPoolIds(env, [poolId], { ready: false });
1780
- const instance = await env.PLAY_WORKFLOW.create({
1781
- id: poolId,
1782
- params: {
1783
- __deeplinePooledWorkflow: true,
1784
- poolId,
1785
- createdAt: Date.now(),
1786
- } satisfies PooledWorkflowBootstrapPayload,
1787
- });
1788
- try {
1789
- const readiness = await waitForWorkflowPoolReadySignal({
1790
- env,
1791
- instance,
1792
- poolId,
1793
- });
1794
- recordCoordinatorPerfTrace({
1795
- runId: poolId,
1796
- phase: 'coordinator.workflow_pool_ready',
1797
- ms: readiness.ms,
1798
- graphHash: 'workflow-pool',
1799
- extra: {
1800
- ready: readiness.ready,
1801
- status: readiness.status,
1802
- polls: readiness.polls,
1803
- },
1804
- });
1805
- if (readiness.ready) {
1806
- return { id: poolId, state: 'ready' as const };
1807
- }
1808
- if (
1809
- readiness.status === 'complete' ||
1810
- readiness.status === 'errored' ||
1811
- readiness.status === 'terminated' ||
1812
- readiness.status === 'unknown'
1813
- ) {
1814
- await instance.terminate().catch(() => undefined);
1815
- return { id: poolId, state: 'removed' as const };
1816
- }
1817
- return { id: poolId, state: 'warming' as const };
1818
- } finally {
1819
- disposeRpcStub(instance);
1820
- }
1821
- }),
1822
- );
1823
- const readyCreatedIds = created
1824
- .filter((entry) => entry.state === 'ready')
1825
- .map((entry) => entry.id);
1826
- const warmingCreatedIds = created
1827
- .filter((entry) => entry.state === 'warming')
1828
- .map((entry) => entry.id);
1829
- removedIds.push(
1830
- ...created
1831
- .filter((entry) => entry.state === 'removed')
1832
- .map((entry) => entry.id),
1833
- );
1834
- await Promise.all([
1835
- addWorkflowPoolIds(env, readyCreatedIds, { ready: true }),
1836
- addWorkflowPoolIds(env, warmingCreatedIds, { ready: false }),
1837
- ]);
1838
- const finalCounts = await workflowPoolCount(env);
1839
- return {
1840
- available: finalCounts.available,
1841
- warming: finalCounts.warming,
1842
- target,
1843
- created: readyCreatedIds.length + warmingCreatedIds.length,
1844
- promoted: promotedIds.length,
1845
- removed: removedIds.length,
1846
- };
1847
- }
1848
-
1849
- async function refillWorkflowPool(
1850
- env: CoordinatorEnv,
1851
- options?: {
1852
- minAvailable?: number;
1853
- waitReady?: boolean;
1854
- waitTimeoutMs?: number;
1855
- },
1856
- ): Promise<WorkflowPoolRefillResult> {
1857
- const startedAt = Date.now();
1858
- const minAvailable = Math.max(1, Math.floor(options?.minAvailable ?? 1));
1859
- const waitReady = options?.waitReady === true;
1860
- const waitTimeoutMs =
1861
- typeof options?.waitTimeoutMs === 'number' &&
1862
- Number.isFinite(options.waitTimeoutMs) &&
1863
- options.waitTimeoutMs > 0
1864
- ? Math.min(Math.floor(options.waitTimeoutMs), 15_000)
1865
- : 4_000;
1866
- let totals = await refillWorkflowPoolOnce(env);
1867
- let iterations = 0;
1868
- const readyWaitStartedAt = Date.now();
1869
-
1870
- while (
1871
- workflowPoolEnabled() &&
1872
- waitReady &&
1873
- totals.available < minAvailable &&
1874
- Date.now() - readyWaitStartedAt < waitTimeoutMs
1875
- ) {
1876
- iterations += 1;
1877
- await sleep(WORKFLOW_POOL_READY_POLL_MS);
1878
- const next = await refillWorkflowPoolOnce(env);
1879
- totals = {
1880
- ...next,
1881
- created: totals.created + next.created,
1882
- promoted: totals.promoted + next.promoted,
1883
- removed: totals.removed + next.removed,
1884
- };
1885
- }
1886
-
1887
- const result: WorkflowPoolRefillResult = {
1888
- ...totals,
1889
- waitedMs: Date.now() - startedAt,
1890
- waitIterations: iterations,
1891
- };
1892
- recordCoordinatorPerfTrace({
1893
- runId: 'workflow-pool',
1894
- phase: 'coordinator.workflow_pool_refill',
1895
- ms: result.waitedMs,
1896
- graphHash: 'workflow-pool',
1897
- extra: result,
1898
- });
1899
- return result;
1900
- }
1901
-
1902
- async function submitViaPooledWorkflow(input: {
1903
- env: CoordinatorEnv;
1904
- params: PlayWorkflowParams;
1905
- recordSubmitTiming: (timing: CoordinatorTiming) => void;
1906
- }): Promise<WorkflowInstance | null> {
1907
- if (!workflowPoolEnabled()) {
1908
- return null;
1909
- }
1910
- const leaseStartedAt = Date.now();
1911
- const { pooledInstanceId, missCounts, leaseError } =
1912
- await leaseWorkflowPoolIdWithMissRecovery({
1913
- env: input.env,
1914
- runId: input.params.runId,
1915
- recordSubmitTiming: input.recordSubmitTiming,
1916
- graphHash: input.params.graphHash ?? null,
1917
- });
1918
- input.recordSubmitTiming({
1919
- phase: 'coordinator.workflow_pool_lease',
1920
- ms: Date.now() - leaseStartedAt,
1921
- graphHash: input.params.graphHash ?? null,
1922
- extra: {
1923
- pooled: Boolean(pooledInstanceId),
1924
- ...(leaseError ? { error: leaseError } : {}),
1925
- ...(missCounts
1926
- ? {
1927
- availableAfterMiss: missCounts.available,
1928
- warmingAfterMiss: missCounts.warming,
1929
- }
1930
- : {}),
1931
- },
1932
- });
1933
-
1934
- if (!pooledInstanceId) {
1935
- return null;
1936
- }
1937
-
1938
- const instance = await getWorkflowPoolInstance(input.env, pooledInstanceId);
1939
- if (!instance) {
1940
- await blockWorkflowPoolRun({
1941
- env: input.env,
1942
- runId: input.params.runId,
1943
- instanceId: pooledInstanceId,
1944
- }).catch(() => undefined);
1945
- input.recordSubmitTiming({
1946
- phase: 'coordinator.workflow_pool_ready_check',
1947
- ms: Date.now() - leaseStartedAt,
1948
- graphHash: input.params.graphHash ?? null,
1949
- extra: { instanceId: pooledInstanceId, status: 'missing' },
1950
- });
1951
- return null;
1952
- }
1953
- const readyCheckStartedAt = Date.now();
1954
- const status = await instance.status().catch(() => null);
1955
- const statusName = workflowStatusName(status);
1956
- input.recordSubmitTiming({
1957
- phase: 'coordinator.workflow_pool_ready_check',
1958
- ms: Date.now() - readyCheckStartedAt,
1959
- graphHash: input.params.graphHash ?? null,
1960
- extra: { instanceId: pooledInstanceId, status: statusName },
1961
- });
1962
- if (!workflowPoolStatusIsReady(statusName)) {
1963
- await blockWorkflowPoolRun({
1964
- env: input.env,
1965
- runId: input.params.runId,
1966
- instanceId: pooledInstanceId,
1967
- }).catch(() => undefined);
1968
- await instance.terminate().catch(() => undefined);
1969
- disposeRpcStub(instance);
1970
- return null;
1971
- }
1972
- const sendStartedAt = Date.now();
1973
- try {
1974
- await instance.sendEvent({
1975
- type: WORKFLOW_POOL_START_EVENT_TYPE,
1976
- payload: buildDispatcherEnvelope(input.params),
1977
- });
1978
- } catch (error) {
1979
- await blockWorkflowPoolRun({
1980
- env: input.env,
1981
- runId: input.params.runId,
1982
- instanceId: pooledInstanceId,
1983
- }).catch(() => undefined);
1984
- disposeRpcStub(instance);
1985
- console.warn('[coordinator.workflow_pool] sendEvent failed; falling back', {
1986
- runId: input.params.runId,
1987
- pooledInstanceId,
1988
- error: error instanceof Error ? error.message : String(error),
1989
- });
1990
- return null;
1991
- }
1992
- input.recordSubmitTiming({
1993
- phase: 'coordinator.workflow_pool_send_event',
1994
- ms: Date.now() - sendStartedAt,
1995
- graphHash: input.params.graphHash ?? null,
1996
- extra: { instanceId: pooledInstanceId },
1997
- });
1998
- const ack = await waitForWorkflowPoolStartAck({
1999
- env: input.env,
2000
- runId: input.params.runId,
2001
- instanceId: pooledInstanceId,
2002
- timeoutMs: WORKFLOW_POOL_START_ACK_TIMEOUT_MS,
2003
- });
2004
- if (ack.acknowledged) {
2005
- input.recordSubmitTiming({
2006
- phase: 'coordinator.workflow_pool_start_ack',
2007
- ms: ack.ms,
2008
- graphHash: input.params.graphHash ?? null,
2009
- extra: {
2010
- acknowledged: true,
2011
- instanceId: pooledInstanceId,
2012
- polls: ack.polls,
2013
- startedAt: ack.startedAt,
2014
- },
2015
- });
2016
- return instance;
2017
- }
2018
-
2019
- const blockStartedAt = Date.now();
2020
- const block = await blockWorkflowPoolRun({
2021
- env: input.env,
2022
- runId: input.params.runId,
2023
- instanceId: pooledInstanceId,
2024
- }).catch(() => ({ blocked: false, started: false }));
2025
- input.recordSubmitTiming({
2026
- phase: 'coordinator.workflow_pool_start_ack',
2027
- ms: ack.ms,
2028
- graphHash: input.params.graphHash ?? null,
2029
- extra: {
2030
- acknowledged: block.started,
2031
- instanceId: pooledInstanceId,
2032
- polls: ack.polls,
2033
- startedAt: ack.startedAt,
2034
- mappedInstanceId: ack.mappedInstanceId,
2035
- blocked: block.blocked,
2036
- blockMs: Date.now() - blockStartedAt,
2037
- },
2038
- });
2039
- if (block.started) {
2040
- return instance;
2041
- }
2042
- await instance.terminate().catch(() => undefined);
2043
- disposeRpcStub(instance);
2044
- input.recordSubmitTiming({
2045
- phase: 'coordinator.workflow_pool_fallback',
2046
- ms: Date.now() - sendStartedAt,
2047
- graphHash: input.params.graphHash ?? null,
2048
- extra: {
2049
- reason: 'start_ack_timeout',
2050
- instanceId: pooledInstanceId,
2051
- ackTimeoutMs: WORKFLOW_POOL_START_ACK_TIMEOUT_MS,
2052
- },
2053
- });
2054
- return null;
2055
- }
2056
-
2057
- function readWorkflowPayload(event: unknown): Record<string, unknown> | null {
2058
- if (!isRecord(event)) return null;
2059
- const payload = event.payload;
2060
- if (!isRecord(payload)) return null;
2061
- return isRecord(payload.params) ? payload.params : payload;
2062
- }
2063
-
2064
- async function markWorkflowRuntimeFailure(input: {
2065
- env: CoordinatorEnv;
2066
- event: unknown;
2067
- error: unknown;
2068
- }): Promise<void> {
2069
- const payload = readWorkflowPayload(input.event);
2070
- if (!payload) return;
2071
- const runId = typeof payload.runId === 'string' ? payload.runId : null;
2072
- const baseUrl = typeof payload.baseUrl === 'string' ? payload.baseUrl : null;
2073
- const executorToken =
2074
- typeof payload.executorToken === 'string' ? payload.executorToken : null;
2075
- if (!runId || !baseUrl || !executorToken) return;
2076
- const errorName =
2077
- input.error instanceof Error && input.error.name
2078
- ? input.error.name
2079
- : 'Error';
2080
- const errorMessage =
2081
- input.error instanceof Error ? input.error.message : String(input.error);
2082
- const errorStack =
2083
- input.error instanceof Error && typeof input.error.stack === 'string'
2084
- ? input.error.stack.split('\n').slice(0, 12).join('\n')
2085
- : null;
2086
- const headers = new Headers({
2087
- authorization: `Bearer ${executorToken}`,
2088
- 'content-type': 'application/json',
2089
- });
2090
- const bypass = input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
2091
- if (bypass) headers.set('x-vercel-protection-bypass', bypass);
2092
- const body = JSON.stringify({
2093
- action: 'append_run_events',
2094
- playId: runId,
2095
- events: [
2096
- {
2097
- type: 'run.failed',
2098
- runId,
2099
- source: 'coordinator',
2100
- occurredAt: Date.now(),
2101
- error: `DynamicWorkflow runner failed: ${errorName}: ${errorMessage}${
2102
- errorStack ? `\n${errorStack}` : ''
2103
- }`,
2104
- } satisfies PlayRunLedgerEvent,
2105
- ],
2106
- });
2107
- const url = `${baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/runtime`;
2108
- const backoffMs = [200, 500, 1500];
2109
- let lastError: unknown = null;
2110
- for (let attempt = 0; attempt <= backoffMs.length; attempt += 1) {
2111
- try {
2112
- const response = await fetch(url, { method: 'POST', headers, body });
2113
- if (response.ok) return;
2114
- lastError = new Error(
2115
- `runtime API responded ${response.status}: ${(await response.text().catch(() => '')).slice(0, 400)}`,
2116
- );
2117
- if (
2118
- response.status >= 400 &&
2119
- response.status < 500 &&
2120
- response.status !== 408 &&
2121
- response.status !== 429
2122
- ) {
2123
- break;
2124
- }
2125
- } catch (error) {
2126
- lastError = error;
2127
- }
2128
- if (attempt < backoffMs.length) {
2129
- await new Promise((resolve) => setTimeout(resolve, backoffMs[attempt]));
2130
- }
2131
- }
2132
- console.error('[coordinator] failed to mark workflow runtime failure', {
2133
- runId,
2134
- message: lastError instanceof Error ? lastError.message : String(lastError),
2135
- });
2136
- }
2137
-
2138
- type StoredPlayArtifactPayload = {
2139
- artifact?: {
2140
- bundledCode?: string;
2141
- artifactKind?: string;
2142
- };
2143
- };
2144
-
2145
- const DYNAMIC_WORKER_COMPATIBILITY_DATE = '2026-05-01';
2146
-
2147
1426
  async function mintChildWorkflowExecutorToken(input: {
2148
1427
  env: CoordinatorEnv;
2149
- baseUrl: string;
2150
1428
  parentExecutorToken: string;
2151
1429
  parentRunId: string;
2152
1430
  parentPlayName: string;
@@ -2154,37 +1432,27 @@ async function mintChildWorkflowExecutorToken(input: {
2154
1432
  childPlayName: string;
2155
1433
  maxCreditsPerRun?: number | null;
2156
1434
  }): Promise<string> {
2157
- const url = `${input.baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/child-executor-token`;
2158
- const headers = new Headers({
2159
- authorization: `Bearer ${input.parentExecutorToken}`,
2160
- 'content-type': 'application/json',
2161
- 'x-deepline-request-id': crypto.randomUUID(),
2162
- });
2163
- if (input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim()) {
2164
- headers.set(
2165
- 'x-vercel-protection-bypass',
2166
- input.env.VERCEL_PROTECTION_BYPASS_TOKEN.trim(),
2167
- );
2168
- }
2169
- const response = await fetch(url, {
2170
- method: 'POST',
2171
- headers,
2172
- body: JSON.stringify({
1435
+ const response = await input.env.HARNESS.runtimeApiCall({
1436
+ executorToken: input.parentExecutorToken,
1437
+ path: '/api/v2/plays/internal/child-executor-token',
1438
+ headers: { 'x-deepline-request-id': crypto.randomUUID() },
1439
+ timeoutMs: 15_000,
1440
+ body: {
2173
1441
  parentRunId: input.parentRunId,
2174
1442
  parentPlayName: input.parentPlayName,
2175
1443
  childRunId: input.childRunId,
2176
1444
  childPlayName: input.childPlayName,
2177
1445
  maxCreditsPerRun: input.maxCreditsPerRun ?? null,
2178
- }),
1446
+ },
2179
1447
  });
2180
- const text = await response.text().catch(() => '');
1448
+ const text = response.body;
2181
1449
  let parsed: Record<string, unknown> = {};
2182
1450
  try {
2183
1451
  parsed = text ? (JSON.parse(text) as Record<string, unknown>) : {};
2184
1452
  } catch {
2185
1453
  parsed = {};
2186
1454
  }
2187
- if (!response.ok) {
1455
+ if (response.status < 200 || response.status >= 300) {
2188
1456
  const error = isRecord(parsed.error) ? parsed.error : null;
2189
1457
  const message =
2190
1458
  (typeof error?.message === 'string' && error.message.trim()) ||
@@ -2267,7 +1535,6 @@ async function reencryptChildDbSessionForExecutor(input: {
2267
1535
 
2268
1536
  async function createChildRuntimeDbSession(input: {
2269
1537
  env: CoordinatorEnv;
2270
- baseUrl: string;
2271
1538
  childExecutorToken: string;
2272
1539
  childPlayName: string;
2273
1540
  requirement: RuntimeDbSessionRequirement;
@@ -2275,22 +1542,12 @@ async function createChildRuntimeDbSession(input: {
2275
1542
  orgId: string;
2276
1543
  }): Promise<CreateDbSessionResponse> {
2277
1544
  const decryptionKey = await generateDbSessionPostgresUrlDecryptionKey();
2278
- const url = `${input.baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/runtime`;
2279
- const headers = new Headers({
2280
- authorization: `Bearer ${input.childExecutorToken}`,
2281
- 'content-type': 'application/json',
2282
- 'x-deepline-request-id': crypto.randomUUID(),
2283
- });
2284
- if (input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim()) {
2285
- headers.set(
2286
- 'x-vercel-protection-bypass',
2287
- input.env.VERCEL_PROTECTION_BYPASS_TOKEN.trim(),
2288
- );
2289
- }
2290
- const response = await fetch(url, {
2291
- method: 'POST',
2292
- headers,
2293
- body: JSON.stringify({
1545
+ const response = await input.env.HARNESS.runtimeApiCall({
1546
+ executorToken: input.childExecutorToken,
1547
+ path: '/api/v2/plays/internal/runtime',
1548
+ headers: { 'x-deepline-request-id': crypto.randomUUID() },
1549
+ timeoutMs: 15_000,
1550
+ body: {
2294
1551
  action: 'create_db_session',
2295
1552
  playName: input.childPlayName,
2296
1553
  target: {
@@ -2303,16 +1560,16 @@ async function createChildRuntimeDbSession(input: {
2303
1560
  ttlSeconds: DB_SESSION_DEFAULT_TTL_SECONDS,
2304
1561
  userEmail: input.userEmail,
2305
1562
  postgresUrlEncryption: decryptionKey.request,
2306
- }),
1563
+ },
2307
1564
  });
2308
- const text = await response.text().catch(() => '');
1565
+ const text = response.body;
2309
1566
  let parsed: unknown = {};
2310
1567
  try {
2311
1568
  parsed = text ? JSON.parse(text) : {};
2312
1569
  } catch {
2313
1570
  parsed = {};
2314
1571
  }
2315
- if (!response.ok) {
1572
+ if (response.status < 200 || response.status >= 300) {
2316
1573
  const error =
2317
1574
  isRecord(parsed) && isRecord(parsed.error) ? parsed.error : {};
2318
1575
  const message =
@@ -2342,7 +1599,6 @@ async function createChildRuntimeDbSession(input: {
2342
1599
 
2343
1600
  async function preloadChildRuntimeDbSessions(input: {
2344
1601
  env: CoordinatorEnv;
2345
- baseUrl: string;
2346
1602
  childExecutorToken: string;
2347
1603
  childRunId: string;
2348
1604
  childPlayName: string;
@@ -2362,7 +1618,6 @@ async function preloadChildRuntimeDbSessions(input: {
2362
1618
  ...(requirement.limits ? { limits: requirement.limits } : {}),
2363
1619
  session: await createChildRuntimeDbSession({
2364
1620
  env: input.env,
2365
- baseUrl: input.baseUrl,
2366
1621
  childExecutorToken: input.childExecutorToken,
2367
1622
  childPlayName: input.childPlayName,
2368
1623
  requirement,
@@ -2381,6 +1636,180 @@ async function preloadChildRuntimeDbSessions(input: {
2381
1636
  return sessions;
2382
1637
  }
2383
1638
 
1639
+ async function registerInlineChildRunWithRuntime(input: {
1640
+ env: CoordinatorEnv;
1641
+ childExecutorToken: string;
1642
+ childRunId: string;
1643
+ childPlayName: string;
1644
+ manifest: PlayRuntimeManifest;
1645
+ governance: PlayCallGovernanceSnapshot;
1646
+ }): Promise<void> {
1647
+ const response = await input.env.HARNESS.runtimeApiCall({
1648
+ executorToken: input.childExecutorToken,
1649
+ path: '/api/v2/plays/internal/runtime',
1650
+ headers: { 'x-deepline-request-id': crypto.randomUUID() },
1651
+ timeoutMs: 15_000,
1652
+ body: {
1653
+ action: 'start_inline_child_run',
1654
+ playName: input.childPlayName,
1655
+ runId: input.childRunId,
1656
+ workflowFamilyKey:
1657
+ input.governance.rootRunId ??
1658
+ input.governance.parentRunId ??
1659
+ input.childRunId,
1660
+ artifactStorageKey: input.manifest.artifactStorageKey,
1661
+ artifactHash: input.manifest.artifactHash,
1662
+ graphHash: input.manifest.graphHash,
1663
+ runtimeBackend: 'workers_edge',
1664
+ schedulerBackend: 'inline_child',
1665
+ executionProfile: 'workers_edge',
1666
+ ...(typeof input.manifest.maxCreditsPerRun === 'number'
1667
+ ? { maxCreditsPerRun: input.manifest.maxCreditsPerRun }
1668
+ : {}),
1669
+ staticPipeline: input.manifest.staticPipeline ?? null,
1670
+ source: 'published',
1671
+ },
1672
+ });
1673
+ if (response.status < 200 || response.status >= 300) {
1674
+ const text = response.body ?? '';
1675
+ throw new Error(
1676
+ `Inline child run registration failed ${response.status}: ${text.slice(0, 800)}`,
1677
+ );
1678
+ }
1679
+ }
1680
+
1681
+ type CoordinatorRuntimeApiTiming = {
1682
+ phase: string;
1683
+ ms: number;
1684
+ bytes?: number;
1685
+ };
1686
+
1687
+ async function callRuntimeApiFromCoordinator(input: {
1688
+ env: CoordinatorEnv;
1689
+ executorToken: string;
1690
+ body: unknown;
1691
+ }): Promise<{
1692
+ status: number;
1693
+ body: string;
1694
+ timings: CoordinatorRuntimeApiTiming[];
1695
+ }> {
1696
+ const timings: CoordinatorRuntimeApiTiming[] = [];
1697
+ const totalStartedAt = Date.now();
1698
+ const recordTiming = (
1699
+ phase: string,
1700
+ startedAt: number,
1701
+ extra?: { bytes?: number },
1702
+ ): void => {
1703
+ timings.push({
1704
+ phase,
1705
+ ms: Date.now() - startedAt,
1706
+ ...(extra?.bytes !== undefined ? { bytes: extra.bytes } : {}),
1707
+ });
1708
+ };
1709
+
1710
+ const buildStartedAt = Date.now();
1711
+ const body = input.body ?? {};
1712
+ const serializedBody = JSON.stringify(body);
1713
+ recordTiming('coordinator.runtime_api.build_request', buildStartedAt, {
1714
+ bytes: serializedBody.length,
1715
+ });
1716
+
1717
+ const fetchStartedAt = Date.now();
1718
+ const response = await input.env.HARNESS.runtimeApiCall({
1719
+ executorToken: input.executorToken,
1720
+ path: '/api/v2/plays/internal/runtime',
1721
+ body,
1722
+ headers: {
1723
+ 'x-deepline-request-id': crypto.randomUUID(),
1724
+ },
1725
+ });
1726
+ recordTiming('coordinator.runtime_api.fetch', fetchStartedAt);
1727
+
1728
+ const bodyStartedAt = Date.now();
1729
+ const responseBody = response.body;
1730
+ recordTiming('coordinator.runtime_api.body', bodyStartedAt, {
1731
+ bytes: responseBody.length,
1732
+ });
1733
+ recordTiming('coordinator.runtime_api.total', totalStartedAt);
1734
+ return {
1735
+ status: response.status,
1736
+ body: responseBody,
1737
+ timings,
1738
+ };
1739
+ }
1740
+
1741
+ async function prepareInlineChildRunWithRuntime(input: {
1742
+ env: CoordinatorEnv;
1743
+ parentExecutorToken: string;
1744
+ parentRunId: string;
1745
+ parentPlayName: string;
1746
+ childRunId: string;
1747
+ childPlayName: string;
1748
+ manifest: PlayRuntimeManifest;
1749
+ governance: PlayCallGovernanceSnapshot;
1750
+ userEmail: string;
1751
+ }): Promise<{
1752
+ childToken: string;
1753
+ preloadedDbSessions: PreloadedRuntimeDbSession[];
1754
+ prepareTimings: unknown[];
1755
+ transportTimings: unknown[];
1756
+ }> {
1757
+ const response = await callRuntimeApiFromCoordinator({
1758
+ env: input.env,
1759
+ executorToken: input.parentExecutorToken,
1760
+ body: {
1761
+ action: 'prepare_inline_child_run',
1762
+ parentRunId: input.parentRunId,
1763
+ parentPlayName: input.parentPlayName,
1764
+ childRunId: input.childRunId,
1765
+ childPlayName: input.childPlayName,
1766
+ workflowFamilyKey:
1767
+ input.governance.rootRunId ??
1768
+ input.governance.parentRunId ??
1769
+ input.childRunId,
1770
+ artifactStorageKey: input.manifest.artifactStorageKey,
1771
+ artifactHash: input.manifest.artifactHash,
1772
+ graphHash: input.manifest.graphHash,
1773
+ runtimeBackend: 'workers_edge',
1774
+ schedulerBackend: 'inline_child',
1775
+ executionProfile: 'workers_edge',
1776
+ ...(typeof input.manifest.maxCreditsPerRun === 'number'
1777
+ ? { maxCreditsPerRun: input.manifest.maxCreditsPerRun }
1778
+ : {}),
1779
+ staticPipeline: input.manifest.staticPipeline ?? null,
1780
+ source: 'published',
1781
+ userEmail: input.userEmail,
1782
+ },
1783
+ });
1784
+ const text = response.body;
1785
+ let parsed: unknown = {};
1786
+ try {
1787
+ parsed = text ? JSON.parse(text) : {};
1788
+ } catch {
1789
+ parsed = {};
1790
+ }
1791
+ if (response.status < 200 || response.status >= 300) {
1792
+ throw new Error(
1793
+ `Inline child prepare failed ${response.status}: ${text.slice(0, 800)}`,
1794
+ );
1795
+ }
1796
+ if (!isRecord(parsed) || typeof parsed.executorToken !== 'string') {
1797
+ throw new Error('Inline child prepare response was missing executorToken.');
1798
+ }
1799
+ const preloadedDbSessions = Array.isArray(parsed.preloadedDbSessions)
1800
+ ? (parsed.preloadedDbSessions as PreloadedRuntimeDbSession[])
1801
+ : [];
1802
+ const prepareTimings = Array.isArray(parsed.prepareTimings)
1803
+ ? parsed.prepareTimings
1804
+ : [];
1805
+ return {
1806
+ childToken: parsed.executorToken,
1807
+ preloadedDbSessions,
1808
+ prepareTimings,
1809
+ transportTimings: response.timings,
1810
+ };
1811
+ }
1812
+
2384
1813
  function buildChildRunId(playName: string): string {
2385
1814
  const slug =
2386
1815
  playName
@@ -2637,6 +2066,8 @@ function runRequestFromPlayWorkflowParams(
2637
2066
  childPlayManifests: params.childPlayManifests ?? null,
2638
2067
  playCallGovernance: params.playCallGovernance ?? null,
2639
2068
  preloadedDbSessions: params.preloadedDbSessions ?? null,
2069
+ inlineChildRunRegistered:
2070
+ params.runtimeBackend === 'cf_workflows_dynamic_worker_inline_child',
2640
2071
  coordinatorUrl: params.coordinatorUrl ?? null,
2641
2072
  totalRows: params.totalRows,
2642
2073
  };
@@ -2767,38 +2198,41 @@ async function executeChildInline(input: {
2767
2198
  },
2768
2199
  });
2769
2200
 
2770
- const tokenStartedAt = Date.now();
2771
- const baseUrl = resolveRuntimeBaseUrl(input.env, input.body);
2772
- const childToken = await mintChildWorkflowExecutorToken({
2773
- env: input.env,
2774
- baseUrl,
2775
- parentExecutorToken,
2776
- parentRunId: input.parentRunId,
2777
- parentPlayName:
2778
- typeof input.body.parentPlayName === 'string' &&
2779
- input.body.parentPlayName.trim()
2780
- ? input.body.parentPlayName.trim()
2781
- : governance.parentPlayName,
2782
- childRunId,
2783
- childPlayName,
2784
- maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
2201
+ const loaderStartedAt = Date.now();
2202
+ const stub = loadDynamicPlayWorker(input.env, {
2203
+ runId: childRunId,
2204
+ graphHash: manifest.graphHash,
2205
+ artifactStorageKey: manifest.artifactStorageKey,
2206
+ artifactHash: manifest.artifactHash,
2207
+ dynamicWorkerCode:
2208
+ typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
2209
+ packagedFiles: null,
2785
2210
  });
2786
- trace('coordinator.inline_child_token', tokenStartedAt);
2211
+ trace('coordinator.inline_child_loader_get', loaderStartedAt);
2787
2212
 
2788
- const dbSessionStartedAt = Date.now();
2789
- const preloadedDbSessions = await preloadChildRuntimeDbSessions({
2790
- env: input.env,
2791
- baseUrl,
2792
- childExecutorToken: childToken,
2793
- childRunId,
2794
- childPlayName,
2795
- manifest,
2796
- orgId,
2797
- userEmail:
2798
- typeof input.body.userEmail === 'string' ? input.body.userEmail : '',
2799
- });
2800
- trace('coordinator.inline_child_db_session_preload', dbSessionStartedAt, {
2213
+ const prepareStartedAt = Date.now();
2214
+ const parentPlayName =
2215
+ typeof input.body.parentPlayName === 'string' &&
2216
+ input.body.parentPlayName.trim()
2217
+ ? input.body.parentPlayName.trim()
2218
+ : governance.parentPlayName;
2219
+ const { childToken, preloadedDbSessions, prepareTimings, transportTimings } =
2220
+ await prepareInlineChildRunWithRuntime({
2221
+ env: input.env,
2222
+ parentExecutorToken,
2223
+ parentRunId: input.parentRunId,
2224
+ parentPlayName,
2225
+ childRunId,
2226
+ childPlayName,
2227
+ manifest,
2228
+ governance,
2229
+ userEmail:
2230
+ typeof input.body.userEmail === 'string' ? input.body.userEmail : '',
2231
+ });
2232
+ trace('coordinator.inline_child_prepare', prepareStartedAt, {
2801
2233
  sessions: preloadedDbSessions.length,
2234
+ prepareTimings,
2235
+ transportTimings,
2802
2236
  });
2803
2237
 
2804
2238
  const params = buildChildWorkflowParams({
@@ -2816,17 +2250,6 @@ async function executeChildInline(input: {
2816
2250
  preloadedDbSessions:
2817
2251
  preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
2818
2252
  });
2819
- const loaderStartedAt = Date.now();
2820
- const stub = loadDynamicPlayWorker(input.env, {
2821
- runId: childRunId,
2822
- graphHash: manifest.graphHash,
2823
- artifactStorageKey: manifest.artifactStorageKey,
2824
- artifactHash: manifest.artifactHash,
2825
- dynamicWorkerCode:
2826
- typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
2827
- packagedFiles: null,
2828
- });
2829
- trace('coordinator.inline_child_loader_get', loaderStartedAt);
2830
2253
 
2831
2254
  let entrypoint: ReturnType<Awaited<typeof stub>['getEntrypoint']> | null =
2832
2255
  null;
@@ -2837,44 +2260,54 @@ async function executeChildInline(input: {
2837
2260
  entrypoint = awaitedStub.getEntrypoint();
2838
2261
  trace('coordinator.inline_child_get_entrypoint', entrypointStartedAt);
2839
2262
  const fetchStartedAt = Date.now();
2840
- response = await entrypoint.fetch(
2263
+ const inlineResponse = await entrypoint.fetch(
2841
2264
  new Request('https://deepline.dynamic.internal/run-inline', {
2842
2265
  method: 'POST',
2843
2266
  headers: { 'content-type': 'application/json' },
2844
2267
  body: JSON.stringify(runRequestFromPlayWorkflowParams(params)),
2845
2268
  }),
2846
2269
  );
2270
+ if (!inlineResponse) {
2271
+ throw new Error('Inline child Worker returned no response.');
2272
+ }
2273
+ let workerResponse = inlineResponse as Response;
2274
+ response = workerResponse;
2847
2275
  trace('coordinator.inline_child_worker_fetch', fetchStartedAt, {
2848
- status: response.status,
2276
+ status: workerResponse.status,
2849
2277
  endpoint: '/run-inline',
2850
2278
  });
2851
2279
  let usedLegacyRunStream = false;
2852
- if (response.status === 404) {
2853
- disposeRpcStub(response);
2280
+ if (workerResponse.status === 404) {
2281
+ disposeRpcStub(workerResponse);
2854
2282
  const legacyFetchStartedAt = Date.now();
2855
- response = await entrypoint.fetch(
2283
+ const legacyResponse = await entrypoint.fetch(
2856
2284
  new Request('https://deepline.dynamic.internal/run', {
2857
2285
  method: 'POST',
2858
2286
  headers: { 'content-type': 'application/json' },
2859
2287
  body: JSON.stringify(runRequestFromPlayWorkflowParams(params)),
2860
2288
  }),
2861
2289
  );
2290
+ if (!legacyResponse) {
2291
+ throw new Error('Legacy inline child Worker returned no response.');
2292
+ }
2293
+ workerResponse = legacyResponse as Response;
2294
+ response = workerResponse;
2862
2295
  usedLegacyRunStream = true;
2863
2296
  trace('coordinator.inline_child_worker_fetch', legacyFetchStartedAt, {
2864
- status: response.status,
2297
+ status: workerResponse.status,
2865
2298
  endpoint: '/run',
2866
2299
  compatibility: 'legacy_stream',
2867
2300
  });
2868
2301
  }
2869
- if (!response.ok) {
2870
- const text = await response.text().catch(() => '');
2302
+ if (!workerResponse.ok) {
2303
+ const text = await workerResponse.text().catch(() => '');
2871
2304
  throw new Error(
2872
- `Inline child Worker failed ${response.status}: ${text.slice(0, 800)}`,
2305
+ `Inline child Worker failed ${workerResponse.status}: ${text.slice(0, 800)}`,
2873
2306
  );
2874
2307
  }
2875
2308
  const responseStartedAt = Date.now();
2876
2309
  const parsed: InlineWorkerRunResponse = usedLegacyRunStream
2877
- ? await readLegacyRunStream(response).then((legacy) => ({
2310
+ ? await readLegacyRunStream(workerResponse).then((legacy) => ({
2878
2311
  status: legacy.error ? 'failed' : 'completed',
2879
2312
  result: legacy.result,
2880
2313
  outputRows: legacy.outputRows ?? undefined,
@@ -2884,7 +2317,7 @@ async function executeChildInline(input: {
2884
2317
  })),
2885
2318
  error: legacy.error ?? undefined,
2886
2319
  }))
2887
- : ((await response.json()) as InlineWorkerRunResponse);
2320
+ : ((await workerResponse.json()) as InlineWorkerRunResponse);
2888
2321
  const logs = (parsed.events ?? []).flatMap((event) => {
2889
2322
  if (
2890
2323
  event &&
@@ -2905,6 +2338,34 @@ async function executeChildInline(input: {
2905
2338
  durationMs:
2906
2339
  typeof parsed.durationMs === 'number' ? parsed.durationMs : null,
2907
2340
  });
2341
+ for (const timing of parsed.timings ?? []) {
2342
+ if (
2343
+ !timing ||
2344
+ typeof timing !== 'object' ||
2345
+ typeof timing.phase !== 'string' ||
2346
+ typeof timing.ms !== 'number' ||
2347
+ !Number.isFinite(timing.ms)
2348
+ ) {
2349
+ continue;
2350
+ }
2351
+ recordCoordinatorPerfTrace({
2352
+ runId: childRunId,
2353
+ phase: `dynamic_worker.${timing.phase}`,
2354
+ ms: Math.max(0, Math.round(timing.ms)),
2355
+ graphHash: manifest.graphHash,
2356
+ extra: {
2357
+ parentRunId: input.parentRunId,
2358
+ mode: 'inline_dynamic_worker',
2359
+ ...(isRecord(timing.extra) ? timing.extra : {}),
2360
+ },
2361
+ });
2362
+ timings.push({
2363
+ phase: `dynamic_worker.${timing.phase}`,
2364
+ ms: Math.max(0, Math.round(timing.ms)),
2365
+ graphHash: manifest.graphHash,
2366
+ ...(isRecord(timing.extra) ? { extra: timing.extra } : {}),
2367
+ });
2368
+ }
2908
2369
  trace('coordinator.inline_child_total', startedAt);
2909
2370
  if (parsed.status === 'failed' || parsed.error) {
2910
2371
  const error = {
@@ -2949,29 +2410,176 @@ async function executeChildInline(input: {
2949
2410
  action: 'completed',
2950
2411
  mode: 'inline_dynamic_worker',
2951
2412
  },
2952
- });
2953
- return {
2954
- workflowId: childRunId,
2955
- runId: childRunId,
2956
- status: 'completed',
2957
- mode: 'inline_dynamic_worker',
2958
- result: parsed.result,
2959
- output: parsed.result,
2960
- logs,
2961
- timings,
2962
- };
2963
- } finally {
2964
- disposeRpcStub(response);
2965
- disposeRpcStub(entrypoint);
2966
- disposeRpcStub(await stub.catch(() => null));
2967
- }
2413
+ });
2414
+ return {
2415
+ workflowId: childRunId,
2416
+ runId: childRunId,
2417
+ status: 'completed',
2418
+ mode: 'inline_dynamic_worker',
2419
+ result: parsed.result,
2420
+ output: parsed.result,
2421
+ logs,
2422
+ timings,
2423
+ };
2424
+ } finally {
2425
+ disposeRpcStub(response);
2426
+ disposeRpcStub(entrypoint);
2427
+ disposeRpcStub(await stub.catch(() => null));
2428
+ }
2429
+ }
2430
+
2431
+ async function submitChildWorkflowThroughCoordinator(input: {
2432
+ env: CoordinatorEnv;
2433
+ parentRunId: string;
2434
+ body: Record<string, unknown>;
2435
+ coordinatorUrl: string | null;
2436
+ }): Promise<{
2437
+ response: Response;
2438
+ responseText: string;
2439
+ childRunId: string;
2440
+ childPlayName: string;
2441
+ startedAt: number;
2442
+ timings: CoordinatorTiming[];
2443
+ }> {
2444
+ const startedAt = Date.now();
2445
+ const timings: CoordinatorTiming[] = [];
2446
+ const trace = (
2447
+ phase: string,
2448
+ phaseStartedAt: number,
2449
+ graphHash?: string | null,
2450
+ extra?: Record<string, unknown>,
2451
+ ): void => {
2452
+ const timing: CoordinatorTiming = {
2453
+ phase,
2454
+ ms: Date.now() - phaseStartedAt,
2455
+ ...(graphHash ? { graphHash } : {}),
2456
+ ...(extra ? { extra } : {}),
2457
+ };
2458
+ timings.push(timing);
2459
+ recordCoordinatorPerfTrace({
2460
+ runId: input.parentRunId,
2461
+ phase,
2462
+ ms: timing.ms,
2463
+ graphHash: graphHash ?? undefined,
2464
+ extra,
2465
+ });
2466
+ };
2467
+ const validated = validateChildSubmitBody({
2468
+ parentRunId: input.parentRunId,
2469
+ body: input.body,
2470
+ });
2471
+ if (!validated.ok) {
2472
+ return {
2473
+ response: Response.json(
2474
+ { error: validated.error },
2475
+ { status: validated.status },
2476
+ ),
2477
+ responseText: '',
2478
+ childRunId: '',
2479
+ childPlayName: '',
2480
+ startedAt,
2481
+ timings,
2482
+ };
2483
+ }
2484
+ const { manifest, governance, childPlayName, orgId, parentExecutorToken } =
2485
+ validated;
2486
+ const childRunId = buildChildRunId(childPlayName);
2487
+ const baseUrl = resolveRuntimeBaseUrl(input.env, input.body);
2488
+
2489
+ const tokenStartedAt = Date.now();
2490
+ const childToken = await mintChildWorkflowExecutorToken({
2491
+ env: input.env,
2492
+ parentExecutorToken,
2493
+ parentRunId: input.parentRunId,
2494
+ parentPlayName:
2495
+ typeof input.body.parentPlayName === 'string' &&
2496
+ input.body.parentPlayName.trim()
2497
+ ? input.body.parentPlayName.trim()
2498
+ : governance.parentPlayName,
2499
+ childRunId,
2500
+ childPlayName,
2501
+ maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
2502
+ });
2503
+ trace('coordinator.child_submit_token', tokenStartedAt, manifest.graphHash, {
2504
+ childRunId,
2505
+ childPlayName,
2506
+ });
2507
+
2508
+ const dbSessionStartedAt = Date.now();
2509
+ const preloadedDbSessions = await preloadChildRuntimeDbSessions({
2510
+ env: input.env,
2511
+ childExecutorToken: childToken,
2512
+ childRunId,
2513
+ childPlayName,
2514
+ manifest,
2515
+ orgId,
2516
+ userEmail:
2517
+ typeof input.body.userEmail === 'string' ? input.body.userEmail : '',
2518
+ });
2519
+ trace(
2520
+ 'coordinator.child_submit_db_session_preload',
2521
+ dbSessionStartedAt,
2522
+ manifest.graphHash,
2523
+ { childRunId, sessions: preloadedDbSessions.length },
2524
+ );
2525
+
2526
+ const params = buildChildWorkflowParams({
2527
+ env: input.env,
2528
+ body: input.body,
2529
+ manifest,
2530
+ governance,
2531
+ childRunId,
2532
+ childPlayName,
2533
+ childToken,
2534
+ orgId,
2535
+ coordinatorUrl: input.coordinatorUrl,
2536
+ runtimeBackend: 'cf_workflows_dynamic_worker',
2537
+ dynamicWorkerCode:
2538
+ typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
2539
+ preloadedDbSessions:
2540
+ preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
2541
+ });
2542
+
2543
+ const workflowSubmitStartedAt = Date.now();
2544
+ const response = await handleWorkflowRoute({
2545
+ runId: childRunId,
2546
+ action: 'submit',
2547
+ request: new Request(
2548
+ `https://deepline.coordinator.internal/workflow/${encodeURIComponent(
2549
+ childRunId,
2550
+ )}/submit`,
2551
+ {
2552
+ method: 'POST',
2553
+ headers: { 'content-type': 'application/json' },
2554
+ body: JSON.stringify(params),
2555
+ },
2556
+ ),
2557
+ env: input.env,
2558
+ });
2559
+ trace(
2560
+ 'coordinator.child_submit_workflow',
2561
+ workflowSubmitStartedAt,
2562
+ manifest.graphHash,
2563
+ { childRunId, status: response.status },
2564
+ );
2565
+ const responseText = await response.text().catch(() => '');
2566
+ return {
2567
+ response,
2568
+ responseText,
2569
+ childRunId,
2570
+ childPlayName,
2571
+ startedAt,
2572
+ timings,
2573
+ };
2968
2574
  }
2969
2575
 
2970
2576
  /**
2971
2577
  * In-process Fetcher handed to each per-graphHash play Worker as
2972
- * `env.RUNTIME_API`. Runs in the coordinator's isolate (not the play's), so
2973
- * `fetch(target)` here can reach `http://localhost:3000` directly in dev —
2974
- * no public *.workers.dev CF edge cloudflared localhost chain.
2578
+ * `env.RUNTIME_API`. Runs in the coordinator's isolate. Forwards runtime
2579
+ * callbacks to DEEPLINE_API_BASE_URL: in dev (the only mode deployed CF
2580
+ * coordinator + local app) that is the cloudflared tunnel URL exposing the
2581
+ * laptop's app; in prod it is the deployed app URL. There is no
2582
+ * direct-to-localhost path (the local-workerd dev mode was removed).
2975
2583
  *
2976
2584
  * Has to be a `WorkerEntrypoint` (not a plain closure) because closures
2977
2585
  * containing captured state aren't structured-cloneable, and Cloudflare
@@ -3043,6 +2651,49 @@ export class CoordinatorControl extends WorkerEntrypoint<
3043
2651
  });
3044
2652
  }
3045
2653
 
2654
+ async submitWorkflowChild(
2655
+ parentRunId: string,
2656
+ body: Record<string, unknown>,
2657
+ ): Promise<{
2658
+ workflowId?: string;
2659
+ runId?: string;
2660
+ status?: string;
2661
+ mode?: string;
2662
+ timings?: CoordinatorTiming[];
2663
+ coordinator?: unknown;
2664
+ error?: unknown;
2665
+ }> {
2666
+ const { response, responseText, childRunId, timings } =
2667
+ await submitChildWorkflowThroughCoordinator({
2668
+ env: this.env,
2669
+ parentRunId,
2670
+ body,
2671
+ coordinatorUrl: null,
2672
+ });
2673
+ let parsed: unknown = {};
2674
+ try {
2675
+ parsed = responseText ? JSON.parse(responseText) : {};
2676
+ } catch {
2677
+ parsed = { error: responseText };
2678
+ }
2679
+ if (!response.ok) {
2680
+ return {
2681
+ runId: childRunId || undefined,
2682
+ workflowId: childRunId || undefined,
2683
+ status: 'failed',
2684
+ error: isRecord(parsed) ? (parsed.error ?? parsed) : parsed,
2685
+ };
2686
+ }
2687
+ return {
2688
+ workflowId: childRunId,
2689
+ runId: childRunId,
2690
+ status: 'started',
2691
+ mode: 'workflow_rpc',
2692
+ coordinator: parsed,
2693
+ timings,
2694
+ };
2695
+ }
2696
+
3046
2697
  async signal(
3047
2698
  runId: string,
3048
2699
  body: Record<string, unknown>,
@@ -3103,6 +2754,75 @@ export class CoordinatorControl extends WorkerEntrypoint<
3103
2754
  }
3104
2755
  await appendCoordinatorRunEvent(this.env, event);
3105
2756
  }
2757
+
2758
+ async readTerminalState(
2759
+ runId: string,
2760
+ ): Promise<CoordinatorTerminalState | null> {
2761
+ if (!runId) {
2762
+ throw new Error('runId is required.');
2763
+ }
2764
+ return await readCoordinatorTerminalState(this.env, runId);
2765
+ }
2766
+
2767
+ async readChildTerminalState(
2768
+ parentRunId: string,
2769
+ eventKey: string,
2770
+ timeoutMs?: number,
2771
+ ): Promise<CoordinatorChildTerminalState | null> {
2772
+ if (!parentRunId || !eventKey) {
2773
+ throw new Error('parentRunId and eventKey are required.');
2774
+ }
2775
+ return await readCoordinatorChildTerminalState({
2776
+ env: this.env,
2777
+ parentRunId,
2778
+ eventKey,
2779
+ timeoutMs:
2780
+ typeof timeoutMs === 'number' && Number.isFinite(timeoutMs)
2781
+ ? Math.max(0, Math.min(Math.floor(timeoutMs), 30_000))
2782
+ : undefined,
2783
+ });
2784
+ }
2785
+
2786
+ /**
2787
+ * Distributed Rate State Backend acquire: lease up to `requested` request-
2788
+ * window permits for `bucketId` (`<orgId>:<provider>`) from the per-bucket
2789
+ * rate-state Durable Object. See CoordinatorRateStateBackend + dedup-do.ts.
2790
+ */
2791
+ async rateAcquire(input: {
2792
+ bucketId: string;
2793
+ rules: Array<{
2794
+ ruleId: string;
2795
+ requestsPerWindow: number;
2796
+ windowMs: number;
2797
+ maxConcurrency: number | null;
2798
+ }>;
2799
+ requested: number;
2800
+ }): Promise<{ granted: number; waitMs: number }> {
2801
+ if (!input.bucketId || !input.bucketId.trim()) {
2802
+ throw new Error('bucketId is required.');
2803
+ }
2804
+ return await callRateBucketControl<{ granted: number; waitMs: number }>(
2805
+ this.env,
2806
+ input.bucketId,
2807
+ '/rate-acquire',
2808
+ input,
2809
+ );
2810
+ }
2811
+
2812
+ async ratePenalize(input: {
2813
+ bucketId: string;
2814
+ cooldownMs: number;
2815
+ }): Promise<void> {
2816
+ if (!input.bucketId || !input.bucketId.trim()) {
2817
+ throw new Error('bucketId is required.');
2818
+ }
2819
+ await callRateBucketControl<{ ok?: unknown }>(
2820
+ this.env,
2821
+ input.bucketId,
2822
+ '/rate-penalize',
2823
+ input,
2824
+ );
2825
+ }
3106
2826
  }
3107
2827
 
3108
2828
  /**
@@ -3132,80 +2852,20 @@ export class DynamicWorkflow extends WorkflowEntrypoint<
3132
2852
  graphHash: entryTrace.graphHash,
3133
2853
  extra: {
3134
2854
  instanceId: entryTrace.instanceId,
3135
- pooledBootstrap: entryTrace.pooledBootstrap,
3136
2855
  },
3137
2856
  });
3138
- let dispatchedEvent = event;
3139
- if (isPooledWorkflowBootstrapPayload(workflowEvent.payload)) {
3140
- const pooledPayload = workflowEvent.payload;
3141
- const waitingStep = step as {
3142
- waitForEvent<T>(
3143
- name: string,
3144
- options: { type: string; timeout?: string | number },
3145
- ): Promise<{ payload: Readonly<T>; timestamp: Date; type: string }>;
3146
- };
3147
- const waitStartedAt = Date.now();
3148
- const startEventPromise = waitingStep.waitForEvent<DispatcherEnvelope>(
3149
- 'wait for pooled play start',
3150
- { type: WORKFLOW_POOL_START_EVENT_TYPE, timeout: '10 minutes' },
3151
- );
3152
- await markWorkflowPoolIdReady(this.env, pooledPayload.poolId).catch(
3153
- (error) => {
3154
- console.warn('[coordinator.workflow_pool] ready signal failed', {
3155
- poolId: pooledPayload.poolId,
3156
- message: error instanceof Error ? error.message : String(error),
3157
- });
3158
- },
3159
- );
3160
- const startEvent = await startEventPromise;
3161
- dispatchedEvent = {
3162
- payload: startEvent.payload,
3163
- timestamp: startEvent.timestamp,
3164
- instanceId: workflowEvent.instanceId ?? pooledPayload.poolId,
3165
- };
3166
- const dispatchedTrace = readWorkflowTraceContext(dispatchedEvent);
3167
- const mapped = await mapRunToWorkflowInstance({
3168
- env: this.env,
3169
- runId: dispatchedTrace.runId,
3170
- instanceId: pooledPayload.poolId,
3171
- started: true,
3172
- }).catch((error) => {
3173
- console.warn('[coordinator.workflow_pool] start ack failed', {
3174
- poolId: pooledPayload.poolId,
3175
- runId: dispatchedTrace.runId,
3176
- message: error instanceof Error ? error.message : String(error),
3177
- });
3178
- return false;
3179
- });
3180
- if (!mapped) {
3181
- trace({
3182
- runId: dispatchedTrace.runId,
3183
- phase: 'coordinator.workflow_pool_start_blocked',
3184
- ms: 0,
3185
- graphHash: dispatchedTrace.graphHash,
3186
- extra: {
3187
- instanceId: pooledPayload.poolId,
3188
- eventType: startEvent.type,
3189
- },
3190
- });
3191
- return { ok: false, blocked: true, runId: dispatchedTrace.runId };
3192
- }
3193
- const eventDeliveryMs = Math.max(
3194
- 0,
3195
- Date.now() - startEvent.timestamp.getTime(),
3196
- );
2857
+ if (entryTrace.submittedAt !== null) {
3197
2858
  trace({
3198
- runId: dispatchedTrace.runId,
3199
- phase: 'coordinator.workflow_pool_start_event',
3200
- ms: eventDeliveryMs,
3201
- graphHash: dispatchedTrace.graphHash,
2859
+ runId: entryTrace.runId,
2860
+ phase: 'coordinator.workflow_start_gap',
2861
+ ms: Math.max(0, Date.now() - entryTrace.submittedAt),
2862
+ graphHash: entryTrace.graphHash,
3202
2863
  extra: {
3203
- instanceId: dispatchedTrace.instanceId,
3204
- eventType: startEvent.type,
3205
- poolWaitAgeMs: Date.now() - waitStartedAt,
2864
+ instanceId: entryTrace.instanceId,
3206
2865
  },
3207
2866
  });
3208
2867
  }
2868
+ let dispatchedEvent = event;
3209
2869
  dispatchedEvent = await hydrateWorkflowDbSessions({
3210
2870
  env: this.env,
3211
2871
  event: dispatchedEvent,
@@ -3219,7 +2879,6 @@ export class DynamicWorkflow extends WorkflowEntrypoint<
3219
2879
  graphHash: dispatchTrace.graphHash,
3220
2880
  extra: {
3221
2881
  instanceId: dispatchTrace.instanceId,
3222
- pooledBootstrap: dispatchTrace.pooledBootstrap,
3223
2882
  },
3224
2883
  });
3225
2884
 
@@ -3373,9 +3032,6 @@ const coordinatorEntrypoint = {
3373
3032
  ): Promise<Response> {
3374
3033
  const url = new URL(request.url);
3375
3034
  if (url.pathname === '/health') {
3376
- if (workflowPoolEnabled()) {
3377
- ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
3378
- }
3379
3035
  return new Response('ok', { status: 200 });
3380
3036
  }
3381
3037
  if (url.pathname === '/warmup/submit') {
@@ -3411,100 +3067,6 @@ const coordinatorEntrypoint = {
3411
3067
  if (authError) return authError;
3412
3068
  return await handleStagedFilePut(request, env);
3413
3069
  }
3414
- if (url.pathname === '/workflow-pool/refill') {
3415
- const internalAuthError = authorizeCoordinatorControlRequest({
3416
- request,
3417
- env,
3418
- });
3419
- if (internalAuthError) return internalAuthError;
3420
- const warmupToken = env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
3421
- if (
3422
- warmupToken &&
3423
- request.headers.get('x-vercel-protection-bypass') !== warmupToken
3424
- ) {
3425
- return new Response('unauthorized', { status: 401 });
3426
- }
3427
- const startedAt = Date.now();
3428
- const minAvailableRaw = Number(
3429
- url.searchParams.get('minAvailable') ?? '',
3430
- );
3431
- const waitTimeoutMsRaw = Number(
3432
- url.searchParams.get('waitTimeoutMs') ?? '',
3433
- );
3434
- const result = await refillWorkflowPool(env, {
3435
- waitReady: url.searchParams.get('waitReady') === '1',
3436
- minAvailable:
3437
- Number.isFinite(minAvailableRaw) && minAvailableRaw > 0
3438
- ? minAvailableRaw
3439
- : undefined,
3440
- waitTimeoutMs:
3441
- Number.isFinite(waitTimeoutMsRaw) && waitTimeoutMsRaw > 0
3442
- ? waitTimeoutMsRaw
3443
- : undefined,
3444
- });
3445
- return Response.json({
3446
- ok: true,
3447
- ...result,
3448
- ms: Date.now() - startedAt,
3449
- });
3450
- }
3451
- if (url.pathname === '/workflow-pool/clear') {
3452
- const internalAuthError = authorizeCoordinatorControlRequest({
3453
- request,
3454
- env,
3455
- });
3456
- if (internalAuthError) return internalAuthError;
3457
- const warmupToken = env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
3458
- if (
3459
- warmupToken &&
3460
- request.headers.get('x-vercel-protection-bypass') !== warmupToken
3461
- ) {
3462
- return new Response('unauthorized', { status: 401 });
3463
- }
3464
- const startedAt = Date.now();
3465
- const deleted = await clearWorkflowPool(env);
3466
- return Response.json({
3467
- ok: true,
3468
- deleted,
3469
- ms: Date.now() - startedAt,
3470
- });
3471
- }
3472
- if (url.pathname === '/workflow-pool/debug') {
3473
- const internalAuthError = authorizeCoordinatorControlRequest({
3474
- request,
3475
- env,
3476
- });
3477
- if (internalAuthError) return internalAuthError;
3478
- const entries = await listWorkflowPoolEntries(env);
3479
- const detailed = [];
3480
- for (const entry of entries) {
3481
- const instance = await getWorkflowPoolInstance(env, entry.id);
3482
- if (!instance) {
3483
- detailed.push({
3484
- ...entry,
3485
- status: 'missing',
3486
- mappedStatus: 'failed',
3487
- });
3488
- continue;
3489
- }
3490
- try {
3491
- const status = await instance.status().catch(() => null);
3492
- detailed.push({
3493
- ...entry,
3494
- status: workflowStatusName(status),
3495
- mappedStatus: status ? mapWorkflowStatus(status) : 'running',
3496
- });
3497
- } finally {
3498
- disposeRpcStub(instance);
3499
- }
3500
- }
3501
- return Response.json({
3502
- ok: true,
3503
- enabled: workflowPoolEnabled(),
3504
- entries: detailed,
3505
- });
3506
- }
3507
-
3508
3070
  // Workflow routes: /workflow/{runId}/{action}
3509
3071
  const wfMatch = url.pathname.match(/^\/workflow\/([^/]+)(?:\/(.+))?$/);
3510
3072
  if (wfMatch) {
@@ -3552,12 +3114,9 @@ const coordinatorEntrypoint = {
3552
3114
  },
3553
3115
  // `scheduled` handler on coordinatorEntrypoint. In this hunk the removed (-)
  // body called refillWorkflowPool(env) through ctx.waitUntil whenever
  // workflowPoolEnabled() returned true; the added (+) replacement has an empty
  // body, so the handler now does nothing. The underscore-prefixed parameter
  // names mark all three arguments as deliberately unused.
  async scheduled(
3554
3116
  _controller: unknown,
3555
- env: CoordinatorEnv,
3556
- ctx?: ExecutionContext,
3557
- ): Promise<void> {
3558
- if (!workflowPoolEnabled()) return;
3559
- ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
3560
- },
3117
+ _env: CoordinatorEnv,
3118
+ _ctx?: ExecutionContext,
3119
+ ): Promise<void> {},
3561
3120
  };
3562
3121
 
3563
3122
  export default coordinatorEntrypoint;
@@ -3762,22 +3321,42 @@ async function handleWorkflowRoute(input: {
3762
3321
  });
3763
3322
  input.ctx?.waitUntil(prewarmPromise);
3764
3323
  }
3765
- const workflowParams = await externalizeWorkflowDbSessions({
3766
- env,
3324
+ const dbSessionExternalization = externalizedWorkflowDbSessionParams({
3767
3325
  params,
3768
- recordSubmitTiming,
3769
3326
  });
3327
+ const workflowParams = dbSessionExternalization.params;
3770
3328
  try {
3771
3329
  const retryStateStartedAt = Date.now();
3772
- await persistWorkflowRetryState({
3330
+ const launchState = await persistWorkflowLaunchState({
3773
3331
  env,
3774
3332
  runId: submittedRunId,
3775
3333
  params: workflowParams,
3334
+ sessions: dbSessionExternalization.sessions,
3776
3335
  });
3336
+ const persistedAt = Date.now();
3337
+ if (dbSessionExternalization.sessions.length > 0) {
3338
+ recordSubmitTiming({
3339
+ phase: 'coordinator.workflow_db_sessions_externalized',
3340
+ ms: persistedAt - retryStateStartedAt,
3341
+ graphHash: params.graphHash ?? null,
3342
+ extra: {
3343
+ sessions:
3344
+ launchState.sessionCount ??
3345
+ dbSessionExternalization.sessions.length,
3346
+ expiresAt:
3347
+ launchState.dbSessionsExpiresAt ??
3348
+ dbSessionExternalization.ref?.expiresAt,
3349
+ combinedLaunchState: true,
3350
+ },
3351
+ });
3352
+ }
3777
3353
  recordSubmitTiming({
3778
3354
  phase: 'coordinator.retry_state_persistence',
3779
- ms: Date.now() - retryStateStartedAt,
3355
+ ms: persistedAt - retryStateStartedAt,
3780
3356
  graphHash: params.graphHash ?? null,
3357
+ extra: {
3358
+ combinedLaunchState: dbSessionExternalization.sessions.length > 0,
3359
+ },
3781
3360
  });
3782
3361
  } catch (error) {
3783
3362
  const errorMessage =
@@ -3801,59 +3380,42 @@ async function handleWorkflowRoute(input: {
3801
3380
  error,
3802
3381
  });
3803
3382
  }
3383
+ workflowParams.submittedAt = Date.now();
3804
3384
  let instance: WorkflowInstance | null = null;
3805
3385
  try {
3806
- const statusEventStartedAt = Date.now();
3807
- await appendCoordinatorRunEvent(env, {
3808
- runId: submittedRunId,
3809
- type: 'status',
3810
- status: 'running',
3811
- ts: Date.now(),
3812
- });
3813
- recordSubmitTiming({
3814
- phase: 'coordinator.submit_status_event',
3815
- ms: Date.now() - statusEventStartedAt,
3816
- graphHash: params.graphHash ?? null,
3817
- });
3818
3386
  const dispatchStartedAt = Date.now();
3819
- const poolAttemptStartedAt = Date.now();
3820
- instance = await submitViaPooledWorkflow({
3387
+ const createStartedAt = Date.now();
3388
+ instance = await createDynamicWorkflowInstance({
3821
3389
  env,
3390
+ id: defaultInstanceId,
3822
3391
  params: workflowParams,
3823
- recordSubmitTiming,
3824
3392
  });
3393
+ const workflowCreatedAt = Date.now();
3825
3394
  recordSubmitTiming({
3826
- phase: 'coordinator.workflow_pool_attempt',
3827
- ms: Date.now() - poolAttemptStartedAt,
3395
+ phase: 'coordinator.workflow_create',
3396
+ ms: workflowCreatedAt - createStartedAt,
3828
3397
  graphHash: params.graphHash ?? null,
3829
- extra: {
3830
- usedPool: Boolean(instance),
3831
- enabled: workflowPoolEnabled(),
3832
- },
3398
+ extra: { instanceId: instance.id },
3833
3399
  });
3834
- if (!instance) {
3835
- const createStartedAt = Date.now();
3836
- instance = await createDynamicWorkflowInstance({
3837
- env,
3838
- id: defaultInstanceId,
3839
- params: workflowParams,
3840
- });
3841
- recordSubmitTiming({
3842
- phase: 'coordinator.workflow_create',
3843
- ms: Date.now() - createStartedAt,
3844
- graphHash: params.graphHash ?? null,
3845
- extra: { instanceId: instance.id },
3400
+ const instanceIdRecord = recordWorkflowInstanceId({
3401
+ env,
3402
+ runId: submittedRunId,
3403
+ instanceId: instance.id,
3404
+ }).catch((error) => {
3405
+ console.warn('[coordinator] workflow instance id record failed', {
3406
+ runId: submittedRunId,
3407
+ instanceId: instance?.id ?? null,
3408
+ error: error instanceof Error ? error.message : String(error),
3846
3409
  });
3847
- }
3410
+ });
3411
+ input.ctx?.waitUntil(instanceIdRecord);
3848
3412
  recordSubmitTiming({
3849
3413
  phase: 'coordinator.dispatch_workflow',
3850
3414
  ms: Date.now() - dispatchStartedAt,
3851
3415
  graphHash: params.graphHash ?? null,
3852
3416
  extra: {
3853
- startMode:
3854
- instance.id === defaultInstanceId
3855
- ? 'direct_workflow_create'
3856
- : 'pooled_workflow_start_event',
3417
+ startMode: 'direct_workflow_create',
3418
+ instanceIdRecord: 'waitUntil',
3857
3419
  },
3858
3420
  });
3859
3421
  const initialWaitMsRaw = Number(
@@ -3888,9 +3450,6 @@ async function handleWorkflowRoute(input: {
3888
3450
  ms: totalMs,
3889
3451
  graphHash: params.graphHash ?? null,
3890
3452
  });
3891
- if (workflowPoolEnabled() && instance.id === defaultInstanceId) {
3892
- input.ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
3893
- }
3894
3453
  return Response.json({
3895
3454
  runId,
3896
3455
  status: 'submitted',
@@ -3923,126 +3482,17 @@ async function handleWorkflowRoute(input: {
3923
3482
  { status: 400 },
3924
3483
  );
3925
3484
  }
3926
- const manifest = body.manifest as PlayRuntimeManifest | undefined;
3927
- const governance = body.internalRunPlay as
3928
- | PlayCallGovernanceSnapshot
3929
- | undefined;
3930
- const childPlayName =
3931
- typeof body.name === 'string' && body.name.trim()
3932
- ? body.name.trim()
3933
- : manifest?.playName?.trim();
3934
- if (
3935
- !manifest ||
3936
- !childPlayName ||
3937
- !manifest.artifactStorageKey ||
3938
- !manifest.artifactHash ||
3939
- !manifest.graphHash ||
3940
- !governance
3941
- ) {
3942
- return Response.json(
3943
- {
3944
- error: {
3945
- code: 'CHILD_MANIFEST_REQUIRED',
3946
- message:
3947
- 'submit-child requires a trusted child manifest and lineage.',
3948
- phase: 'coordinator_child_submit',
3949
- parentRunId: runId,
3950
- },
3951
- },
3952
- { status: 400 },
3953
- );
3954
- }
3955
- const childRunId = buildChildRunId(childPlayName);
3956
- const orgId = typeof body.orgId === 'string' ? body.orgId : '';
3957
- if (!orgId) {
3958
- return Response.json(
3959
- {
3960
- error: {
3961
- code: 'CHILD_ORG_REQUIRED',
3962
- message: 'submit-child requires orgId from the parent runtime.',
3963
- phase: 'coordinator_child_submit',
3964
- parentRunId: runId,
3965
- },
3966
- },
3967
- { status: 400 },
3968
- );
3969
- }
3970
- const parentExecutorToken =
3971
- typeof body.parentExecutorToken === 'string'
3972
- ? body.parentExecutorToken.trim()
3973
- : '';
3974
- if (!parentExecutorToken) {
3975
- return Response.json(
3976
- {
3977
- error: {
3978
- code: 'PARENT_EXECUTOR_TOKEN_REQUIRED',
3979
- message:
3980
- 'submit-child requires the parent executor token for origin-scoped child token minting.',
3981
- phase: 'coordinator_child_submit',
3982
- parentRunId: runId,
3983
- },
3984
- },
3985
- { status: 400 },
3986
- );
3987
- }
3988
- const baseUrl = resolveRuntimeBaseUrl(env, body);
3989
- const childToken = await mintChildWorkflowExecutorToken({
3990
- env,
3991
- baseUrl,
3992
- parentExecutorToken,
3993
- parentRunId: runId,
3994
- parentPlayName:
3995
- typeof body.parentPlayName === 'string' && body.parentPlayName.trim()
3996
- ? body.parentPlayName.trim()
3997
- : governance.parentPlayName,
3998
- childRunId,
3999
- childPlayName,
4000
- maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
4001
- });
4002
- const preloadedDbSessions = await preloadChildRuntimeDbSessions({
4003
- env,
4004
- baseUrl,
4005
- childExecutorToken: childToken,
3485
+ const {
3486
+ response: submitResponse,
3487
+ responseText,
4006
3488
  childRunId,
4007
3489
  childPlayName,
4008
- manifest,
4009
- orgId,
4010
- userEmail: typeof body.userEmail === 'string' ? body.userEmail : '',
4011
- });
4012
- const params = buildChildWorkflowParams({
3490
+ } = await submitChildWorkflowThroughCoordinator({
4013
3491
  env,
3492
+ parentRunId: runId,
4014
3493
  body,
4015
- manifest,
4016
- governance,
4017
- childRunId,
4018
- childPlayName,
4019
- childToken,
4020
- orgId,
4021
3494
  coordinatorUrl: new URL(request.url).origin,
4022
- runtimeBackend: 'cf_workflows_dynamic_worker',
4023
- dynamicWorkerCode:
4024
- typeof manifest.bundledCode === 'string'
4025
- ? manifest.bundledCode
4026
- : null,
4027
- preloadedDbSessions:
4028
- preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
4029
- });
4030
- const submitResponse = await handleWorkflowRoute({
4031
- runId: childRunId,
4032
- action: 'submit',
4033
- request: new Request(
4034
- `https://deepline.coordinator.internal/workflow/${encodeURIComponent(
4035
- childRunId,
4036
- )}/submit`,
4037
- {
4038
- method: 'POST',
4039
- headers: { 'content-type': 'application/json' },
4040
- body: JSON.stringify(params),
4041
- },
4042
- ),
4043
- env,
4044
3495
  });
4045
- const responseText = await submitResponse.text().catch(() => '');
4046
3496
  recordCoordinatorPerfTrace({
4047
3497
  runId,
4048
3498
  phase: 'coordinator.child_submit',
@@ -4194,7 +3644,8 @@ async function handleWorkflowRoute(input: {
4194
3644
  .get('instanceId')
4195
3645
  ?.trim();
4196
3646
  const instanceId =
4197
- requestedInstanceId && !isWorkflowMutatingAction(action)
3647
+ requestedInstanceId &&
3648
+ isWorkflowInstanceIdForRun(runId, requestedInstanceId)
4198
3649
  ? requestedInstanceId
4199
3650
  : await resolveWorkflowInstanceIdForRun(env, runId);
4200
3651
  instance = await env.PLAY_WORKFLOW.get(instanceId);
@@ -4254,6 +3705,20 @@ async function handleWorkflowRoute(input: {
4254
3705
  : eventKey
4255
3706
  ? `integration_event_${eventKey}`
4256
3707
  : 'integration_event';
3708
+ if (body.signal === 'integration_event' && eventKey) {
3709
+ await writeCoordinatorChildTerminalState({
3710
+ env,
3711
+ parentRunId: runId,
3712
+ eventKey,
3713
+ data: body.data ?? body,
3714
+ }).catch((error: unknown) => {
3715
+ console.warn('[coordinator] child terminal cache write failed', {
3716
+ runId,
3717
+ eventKey,
3718
+ error: error instanceof Error ? error.message : String(error),
3719
+ });
3720
+ });
3721
+ }
4257
3722
  await instance.sendEvent({
4258
3723
  type: workflowEventType(eventType),
4259
3724
  payload: body,
@@ -4390,6 +3855,16 @@ function workflowInstanceId(runId: string): string {
4390
3855
  return `run-${stableHash(runId)}`;
4391
3856
  }
4392
3857
 
3858
/**
 * Checks whether `instanceId` belongs to the run identified by `runId`.
 *
 * An id is accepted when it is exactly the canonical id returned by
 * `workflowInstanceId(runId)`, or when it starts with that canonical id
 * followed by `-retry-`. Whatever comes after the `-retry-` prefix is not
 * inspected.
 *
 * handleWorkflowRoute uses this to decide whether a requested instance id is
 * honored; when the check fails it falls back to
 * `resolveWorkflowInstanceIdForRun(env, runId)`.
 *
 * @param runId - Run whose canonical workflow instance id is derived.
 * @param instanceId - Candidate instance id to validate against that run.
 * @returns `true` if the candidate is the canonical id or a `-retry-` variant.
 */
+ function isWorkflowInstanceIdForRun(
3859
+ runId: string,
3860
+ instanceId: string,
3861
+ ): boolean {
3862
+ const canonical = workflowInstanceId(runId);
3863
+ return (
3864
+ instanceId === canonical || instanceId.startsWith(`${canonical}-retry-`)
3865
+ );
3866
+ }
3867
+
4393
3868
  function stableHash(value: string): string {
4394
3869
  let hash = 2166136261;
4395
3870
  for (let index = 0; index < value.length; index += 1) {
@@ -4513,10 +3988,11 @@ function loadDynamicPlayWorkerSync(
4513
3988
  // miswired environments fail before user code starts.
4514
3989
  HARNESS: env.HARNESS,
4515
3990
  VERCEL_PROTECTION_BYPASS_TOKEN: env.VERCEL_PROTECTION_BYPASS_TOKEN,
4516
- // In-process runtime API bridge used by the play harness for status,
4517
- // tool execution, DB session, and artifact callbacks. This avoids a
4518
- // public fetch hop when Cloudflare exposes the RuntimeApi export.
4519
- ...makeRuntimeApiEnvBinding(),
3991
+ // Runtime API bridge used by the play harness for status, tool
3992
+ // execution, DB session, and artifact callbacks. This uses the
3993
+ // long-lived HARNESS service binding, avoiding public callback HTTP
3994
+ // without relying on dynamic-worker access to named exports.
3995
+ ...makeRuntimeApiEnvBinding(env),
4520
3996
  // In-process coordinator control bridge used by ctx.runPlay and
4521
3997
  // parent terminal signals. This keeps scalar child plays inline with
4522
3998
  // the parent instead of round-tripping through nested Workflow waits.
@@ -4601,7 +4077,7 @@ async function loadDynamicPlayWorker(
4601
4077
  // HARNESS, and child workflow control uses the COORDINATOR binding.
4602
4078
  HARNESS: env.HARNESS,
4603
4079
  VERCEL_PROTECTION_BYPASS_TOKEN: env.VERCEL_PROTECTION_BYPASS_TOKEN,
4604
- ...makeRuntimeApiEnvBinding(),
4080
+ ...makeRuntimeApiEnvBinding(env),
4605
4081
  ...makeCoordinatorControlBinding(),
4606
4082
  },
4607
4083
  };
@@ -5051,20 +4527,6 @@ async function handleCoordinatorWarmup(
5051
4527
  graphHash: params.graphHash,
5052
4528
  extra: { status: response.status, label },
5053
4529
  });
5054
- const poolRefillPromise = refillWorkflowPool(env, {
5055
- waitReady: true,
5056
- minAvailable: 1,
5057
- }).catch(() => ({
5058
- available: 0,
5059
- warming: 0,
5060
- target: 0,
5061
- created: 0,
5062
- promoted: 0,
5063
- removed: 0,
5064
- waitedMs: 0,
5065
- waitIterations: 0,
5066
- }));
5067
- ctx?.waitUntil(poolRefillPromise.then(() => undefined));
5068
4530
  let body: unknown = null;
5069
4531
  try {
5070
4532
  body = text ? JSON.parse(text) : null;
@@ -5087,54 +4549,26 @@ async function handleCoordinatorWarmup(
5087
4549
  status: response.status,
5088
4550
  body,
5089
4551
  terminalState,
5090
- workflowPool: await poolRefillPromise,
5091
4552
  },
5092
4553
  { status: responseStatus },
5093
4554
  );
5094
4555
  }
5095
4556
 
5096
4557
  /**
5097
- * Returns a structured-cloneable `Fetcher` stub for the `RuntimeApi`
5098
- * WorkerEntrypoint. The stub goes into the per-graphHash play Worker's
5099
- * `env.RUNTIME_API`. When the harness calls `env.RUNTIME_API.fetch(req)`,
5100
- * the request is RPC-dispatched into the `RuntimeApi.fetch` method on the
5101
- * coordinator side, which path-allowlists it and forwards to
5102
- * `DEEPLINE_API_BASE_URL` directly. Skips the public *.workers.dev CF
5103
- * edge cloudflared localhost chain that the harness's old
5104
- * `fetch(req.baseUrl + path)` path traverses.
5105
- *
5106
- * Implemented as a WorkerEntrypoint (not a plain closure) because Cloudflare
5107
- * Workflows serializes the dynamic Worker's env when persisting workflow
5108
- * state, and closures containing captured locals aren't
5109
- * structured-cloneable. WorkerEntrypoint stubs ARE cloneable — same trick
5110
- * `makePlayAssetsBinding` already uses.
5111
- *
5112
- * Falls back transparently when Cloudflare does not expose module exports in
5113
- * the current execution path: if the binding is omitted from `env`, the play
5114
- * worker uses its existing `fetch(req.baseUrl + path)` transport.
4558
+ * Returns a structured-cloneable runtime API binding for the per-graphHash
4559
+ * play Worker's `env.RUNTIME_API`. We intentionally pass the long-lived
4560
+ * HARNESS WorkerEntrypoint service binding instead of a plain closure: the
4561
+ * dynamic Worker env is serialized by Cloudflare Workflows, and service
4562
+ * bindings are cloneable while closures are not. The per-play runtime accepts
4563
+ * this binding via `runtimeApiCall(...)`, so callbacks still stay on
4564
+ * service bindings and never fall back to public HTTP.
5115
4565
  */
5116
- let loggedMissingRuntimeApiExport = false;
5117
4566
  let loggedMissingCoordinatorControlExport = false;
5118
4567
 
5119
- function makeRuntimeApiEnvBinding():
5120
- | { RUNTIME_API: { fetch(req: Request): Promise<Response> } }
5121
- | Record<string, never> {
5122
- const exports = workersExports as unknown as {
5123
- RuntimeApi?: (init: { props: undefined }) => {
5124
- fetch(req: Request): Promise<Response>;
5125
- };
5126
- };
5127
- const ctor = exports.RuntimeApi;
5128
- if (typeof ctor !== 'function') {
5129
- if (!loggedMissingRuntimeApiExport) {
5130
- loggedMissingRuntimeApiExport = true;
5131
- console.warn(
5132
- '[coordinator] RuntimeApi is not registered on cloudflare:workers exports; using public runtime API transport.',
5133
- );
5134
- }
5135
- return {};
5136
- }
5137
- return { RUNTIME_API: ctor({ props: undefined }) };
4568
// Builds the `RUNTIME_API` entry for a play Worker env: a one-key object whose
// value is the coordinator's own `env.HARNESS` binding. Callers spread the
// result into the env they assemble (loadDynamicPlayWorkerSync and
// loadDynamicPlayWorker). The return type has no empty-object variant, so the
// key is always present.
+ function makeRuntimeApiEnvBinding(env: CoordinatorEnv): {
4569
+ RUNTIME_API: CoordinatorEnv['HARNESS'];
4570
+ } {
4571
+ return { RUNTIME_API: env.HARNESS };
5138
4572
  }
5139
4573
 
5140
4574
  function makeCoordinatorControlBinding():
@@ -5144,6 +4578,10 @@ function makeCoordinatorControlBinding():
5144
4578
  parentRunId: string,
5145
4579
  body: Record<string, unknown>,
5146
4580
  ): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
4581
+ submitWorkflowChild(
4582
+ parentRunId: string,
4583
+ body: Record<string, unknown>,
4584
+ ): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
5147
4585
  signal(
5148
4586
  runId: string,
5149
4587
  body: Record<string, unknown>,
@@ -5156,6 +4594,28 @@ function makeCoordinatorControlBinding():
5156
4594
  runId: string,
5157
4595
  event: CoordinatorRunEvent,
5158
4596
  ): Promise<void>;
4597
+ readTerminalState(
4598
+ runId: string,
4599
+ ): Promise<CoordinatorTerminalState | null>;
4600
+ readChildTerminalState(
4601
+ parentRunId: string,
4602
+ eventKey: string,
4603
+ timeoutMs?: number,
4604
+ ): Promise<CoordinatorChildTerminalState | null>;
4605
+ rateAcquire(input: {
4606
+ bucketId: string;
4607
+ rules: Array<{
4608
+ ruleId: string;
4609
+ requestsPerWindow: number;
4610
+ windowMs: number;
4611
+ maxConcurrency: number | null;
4612
+ }>;
4613
+ requested: number;
4614
+ }): Promise<{ granted: number; waitMs: number }>;
4615
+ ratePenalize(input: {
4616
+ bucketId: string;
4617
+ cooldownMs: number;
4618
+ }): Promise<void>;
5159
4619
  };
5160
4620
  }
5161
4621
  | Record<string, never> {
@@ -5165,6 +4625,10 @@ function makeCoordinatorControlBinding():
5165
4625
  parentRunId: string,
5166
4626
  body: Record<string, unknown>,
5167
4627
  ): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
4628
+ submitWorkflowChild(
4629
+ parentRunId: string,
4630
+ body: Record<string, unknown>,
4631
+ ): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
5168
4632
  signal(
5169
4633
  runId: string,
5170
4634
  body: Record<string, unknown>,
@@ -5174,6 +4638,28 @@ function makeCoordinatorControlBinding():
5174
4638
  payload: CoordinatorPerfTracePayload,
5175
4639
  ): Promise<void>;
5176
4640
  recordRunEvent(runId: string, event: CoordinatorRunEvent): Promise<void>;
4641
+ readTerminalState(
4642
+ runId: string,
4643
+ ): Promise<CoordinatorTerminalState | null>;
4644
+ readChildTerminalState(
4645
+ parentRunId: string,
4646
+ eventKey: string,
4647
+ timeoutMs?: number,
4648
+ ): Promise<CoordinatorChildTerminalState | null>;
4649
+ rateAcquire(input: {
4650
+ bucketId: string;
4651
+ rules: Array<{
4652
+ ruleId: string;
4653
+ requestsPerWindow: number;
4654
+ windowMs: number;
4655
+ maxConcurrency: number | null;
4656
+ }>;
4657
+ requested: number;
4658
+ }): Promise<{ granted: number; waitMs: number }>;
4659
+ ratePenalize(input: {
4660
+ bucketId: string;
4661
+ cooldownMs: number;
4662
+ }): Promise<void>;
5177
4663
  };
5178
4664
  };
5179
4665
  const ctor = exports.CoordinatorControl;