deepline 0.1.78 → 0.1.80
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +69 -37
- package/dist/cli/index.mjs +69 -37
- package/dist/index.d.mts +32 -1
- package/dist/index.d.ts +32 -1
- package/dist/index.js +7 -4
- package/dist/index.mjs +7 -4
- package/dist/repo/apps/play-runner-workers/src/child-play-await.ts +192 -0
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +1320 -1644
- package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +515 -648
- package/dist/repo/apps/play-runner-workers/src/entry.ts +896 -354
- package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +209 -0
- package/dist/repo/sdk/src/client.ts +9 -2
- package/dist/repo/sdk/src/release.ts +2 -2
- package/dist/repo/sdk/src/types.ts +5 -0
- package/dist/repo/shared_libs/play-runtime/governor/coordinator-rate-state-backend.ts +231 -0
- package/dist/repo/shared_libs/play-runtime/governor/governor.ts +376 -0
- package/dist/repo/shared_libs/play-runtime/governor/policy.ts +179 -0
- package/dist/repo/shared_libs/play-runtime/governor/rate-state-backend.ts +87 -0
- package/dist/repo/shared_libs/play-runtime/run-failure.ts +12 -0
- package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +24 -0
- package/dist/repo/shared_libs/play-runtime/submit-limits.ts +35 -0
- package/dist/repo/shared_libs/plays/bundling/index.ts +4 -12
- package/dist/repo/shared_libs/plays/bundling/limits.ts +29 -0
- package/dist/repo/shared_libs/plays/static-pipeline.ts +314 -1
- package/dist/repo/shared_libs/temporal/constants.ts +38 -0
- package/package.json +1 -1
|
@@ -53,6 +53,14 @@ import {
|
|
|
53
53
|
decideWorkflowPlatformRetry,
|
|
54
54
|
PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
|
|
55
55
|
} from './workflow-retry';
|
|
56
|
+
import {
|
|
57
|
+
WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES,
|
|
58
|
+
WORKFLOW_RETRY_PARAMS_MAX_BYTES,
|
|
59
|
+
buildWorkflowRetryParams,
|
|
60
|
+
jsonByteLength,
|
|
61
|
+
workflowRetryParamsStorageKey,
|
|
62
|
+
type WorkflowRetryParamsRef,
|
|
63
|
+
} from './workflow-retry-state';
|
|
56
64
|
import { sanitizeLiveLogLines } from './runtime/live-progress';
|
|
57
65
|
|
|
58
66
|
export { DynamicWorkflowBinding };
|
|
@@ -104,6 +112,7 @@ export type PlayWorkflowParams = {
|
|
|
104
112
|
totalRows?: number;
|
|
105
113
|
coordinatorUrl?: string | null;
|
|
106
114
|
coordinatorInternalToken?: string | null;
|
|
115
|
+
submittedAt?: number | null;
|
|
107
116
|
};
|
|
108
117
|
|
|
109
118
|
type InlineChildSubmitResult =
|
|
@@ -167,6 +176,12 @@ type CoordinatorTerminalState = {
|
|
|
167
176
|
completedAt?: number;
|
|
168
177
|
};
|
|
169
178
|
|
|
179
|
+
type CoordinatorChildTerminalState = {
|
|
180
|
+
eventKey: string;
|
|
181
|
+
data: unknown;
|
|
182
|
+
storedAt: number;
|
|
183
|
+
};
|
|
184
|
+
|
|
170
185
|
type CoordinatorRunEvent =
|
|
171
186
|
| {
|
|
172
187
|
seq?: number;
|
|
@@ -216,6 +231,11 @@ type InlineWorkerRunResponse = {
|
|
|
216
231
|
outputRows?: number;
|
|
217
232
|
durationMs?: number;
|
|
218
233
|
parseMs?: number;
|
|
234
|
+
timings?: Array<{
|
|
235
|
+
phase?: unknown;
|
|
236
|
+
ms?: unknown;
|
|
237
|
+
extra?: unknown;
|
|
238
|
+
}>;
|
|
219
239
|
events?: Array<
|
|
220
240
|
| { type: 'log'; message?: string; level?: string; ts?: number }
|
|
221
241
|
| { type: 'result'; result?: unknown; outputRows?: number; ts?: number }
|
|
@@ -230,7 +250,7 @@ function isRecord(value: unknown): value is Record<string, unknown> {
|
|
|
230
250
|
}
|
|
231
251
|
|
|
232
252
|
interface CoordinatorEnv {
|
|
233
|
-
PLAY_WORKFLOW: Workflow<PlayWorkflowParams
|
|
253
|
+
PLAY_WORKFLOW: Workflow<PlayWorkflowParams>;
|
|
234
254
|
PLAY_DEDUP: DurableObjectNamespace;
|
|
235
255
|
LOADER?: {
|
|
236
256
|
get(
|
|
@@ -523,6 +543,67 @@ async function readCoordinatorTerminalState(
|
|
|
523
543
|
return state as CoordinatorTerminalState;
|
|
524
544
|
}
|
|
525
545
|
|
|
546
|
+
async function writeCoordinatorChildTerminalState(input: {
|
|
547
|
+
env: CoordinatorEnv;
|
|
548
|
+
parentRunId: string;
|
|
549
|
+
eventKey: string;
|
|
550
|
+
data: unknown;
|
|
551
|
+
}): Promise<void> {
|
|
552
|
+
const stub = input.env.PLAY_DEDUP.get(
|
|
553
|
+
input.env.PLAY_DEDUP.idFromName(input.parentRunId),
|
|
554
|
+
);
|
|
555
|
+
const response = await stub.fetch(
|
|
556
|
+
'https://deepline.dedup.internal/child-terminal-set',
|
|
557
|
+
{
|
|
558
|
+
method: 'POST',
|
|
559
|
+
headers: { 'content-type': 'application/json' },
|
|
560
|
+
body: JSON.stringify({
|
|
561
|
+
eventKey: input.eventKey,
|
|
562
|
+
data: input.data,
|
|
563
|
+
storedAt: Date.now(),
|
|
564
|
+
}),
|
|
565
|
+
},
|
|
566
|
+
);
|
|
567
|
+
if (!response.ok) {
|
|
568
|
+
throw new Error(`coordinator child terminal set failed ${response.status}`);
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
async function readCoordinatorChildTerminalState(input: {
|
|
573
|
+
env: CoordinatorEnv;
|
|
574
|
+
parentRunId: string;
|
|
575
|
+
eventKey: string;
|
|
576
|
+
timeoutMs?: number;
|
|
577
|
+
}): Promise<CoordinatorChildTerminalState | null> {
|
|
578
|
+
const stub = input.env.PLAY_DEDUP.get(
|
|
579
|
+
input.env.PLAY_DEDUP.idFromName(input.parentRunId),
|
|
580
|
+
);
|
|
581
|
+
const endpoint =
|
|
582
|
+
input.timeoutMs && input.timeoutMs > 0
|
|
583
|
+
? 'child-terminal-await'
|
|
584
|
+
: 'child-terminal-get';
|
|
585
|
+
const timeoutParam =
|
|
586
|
+
input.timeoutMs && input.timeoutMs > 0
|
|
587
|
+
? `&timeoutMs=${encodeURIComponent(String(Math.floor(input.timeoutMs)))}`
|
|
588
|
+
: '';
|
|
589
|
+
const response = await stub.fetch(
|
|
590
|
+
`https://deepline.dedup.internal/${endpoint}?eventKey=${encodeURIComponent(
|
|
591
|
+
input.eventKey,
|
|
592
|
+
)}${timeoutParam}`,
|
|
593
|
+
);
|
|
594
|
+
if (!response.ok) {
|
|
595
|
+
throw new Error(
|
|
596
|
+
`coordinator child terminal ${endpoint} failed ${response.status}`,
|
|
597
|
+
);
|
|
598
|
+
}
|
|
599
|
+
const body = (await response.json().catch(() => ({}))) as {
|
|
600
|
+
state?: unknown;
|
|
601
|
+
};
|
|
602
|
+
const state = body.state;
|
|
603
|
+
if (!isRecord(state) || state.eventKey !== input.eventKey) return null;
|
|
604
|
+
return state as CoordinatorChildTerminalState;
|
|
605
|
+
}
|
|
606
|
+
|
|
526
607
|
function workflowEventType(name: string): string {
|
|
527
608
|
const normalized = name
|
|
528
609
|
.trim()
|
|
@@ -547,36 +628,9 @@ type DynamicWorkflowMetadata = {
|
|
|
547
628
|
}> | null;
|
|
548
629
|
};
|
|
549
630
|
|
|
550
|
-
type DispatcherEnvelope = {
|
|
551
|
-
__dispatcherMetadata: DynamicWorkflowMetadata;
|
|
552
|
-
params: PlayWorkflowParams;
|
|
553
|
-
};
|
|
554
|
-
|
|
555
|
-
type PooledWorkflowBootstrapPayload = {
|
|
556
|
-
__deeplinePooledWorkflow: true;
|
|
557
|
-
poolId: string;
|
|
558
|
-
createdAt: number;
|
|
559
|
-
};
|
|
560
|
-
|
|
561
|
-
const WORKFLOW_POOL_PROTOCOL_VERSION =
|
|
562
|
-
'pooled-workflow-wait-v14-ready-signal-http-storage';
|
|
563
|
-
const WORKFLOW_POOL_DO_NAME = 'workflow-pool:v2';
|
|
564
|
-
const WORKFLOW_POOL_START_EVENT_TYPE = 'play_start';
|
|
565
|
-
const WORKFLOW_POOL_TTL_MS = 8 * 60 * 1000;
|
|
566
|
-
const WORKFLOW_POOL_TARGET_SIZE = 0;
|
|
567
|
-
const WORKFLOW_POOL_READY_TIMEOUT_MS = 1_500;
|
|
568
|
-
const WORKFLOW_POOL_READY_POLL_MS = 250;
|
|
569
|
-
const WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS = 2_500;
|
|
570
|
-
const WORKFLOW_POOL_REFILL_ON_MISS_MIN_AVAILABLE = 4;
|
|
571
|
-
const WORKFLOW_POOL_CONTROL_TIMEOUT_MS = 750;
|
|
572
|
-
const WORKFLOW_POOL_START_ACK_TIMEOUT_MS = 750;
|
|
573
|
-
const WORKFLOW_POOL_START_ACK_POLL_MS = 25;
|
|
574
631
|
const SUBMIT_INITIAL_STATE_MAX_WAIT_MS = 0;
|
|
575
632
|
const SUBMIT_INITIAL_STATE_POLL_MS = 50;
|
|
576
633
|
const WORKFLOW_RETRY_STATE_TTL_MS = 60 * 60 * 1000;
|
|
577
|
-
const WORKFLOW_POOL_PREWARM_ESCALATE_TARGET_AFTER_MS = 250;
|
|
578
|
-
const WORKFLOW_POOL_SCHEDULED_REFILL_MIN_AVAILABLE = 1;
|
|
579
|
-
const WORKFLOW_POOL_SCHEDULED_REFILL_TIMEOUT_MS = 10_000;
|
|
580
634
|
|
|
581
635
|
function buildDynamicWorkflowMetadata(
|
|
582
636
|
params: PlayWorkflowParams,
|
|
@@ -591,35 +645,11 @@ function buildDynamicWorkflowMetadata(
|
|
|
591
645
|
};
|
|
592
646
|
}
|
|
593
647
|
|
|
594
|
-
function buildDispatcherEnvelope(
|
|
595
|
-
params: PlayWorkflowParams,
|
|
596
|
-
): DispatcherEnvelope {
|
|
597
|
-
// Mirrors @cloudflare/dynamic-workflows' envelope. We need to send the
|
|
598
|
-
// dispatcher payload via Workflow sendEvent for prewarmed instances; the
|
|
599
|
-
// public wrapper only applies this envelope to create() params.
|
|
600
|
-
return {
|
|
601
|
-
__dispatcherMetadata: buildDynamicWorkflowMetadata(params),
|
|
602
|
-
params,
|
|
603
|
-
};
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
function isPooledWorkflowBootstrapPayload(
|
|
607
|
-
value: unknown,
|
|
608
|
-
): value is PooledWorkflowBootstrapPayload {
|
|
609
|
-
return (
|
|
610
|
-
Boolean(value) &&
|
|
611
|
-
typeof value === 'object' &&
|
|
612
|
-
!Array.isArray(value) &&
|
|
613
|
-
(value as Record<string, unknown>).__deeplinePooledWorkflow === true &&
|
|
614
|
-
typeof (value as Record<string, unknown>).poolId === 'string'
|
|
615
|
-
);
|
|
616
|
-
}
|
|
617
|
-
|
|
618
648
|
function readWorkflowTraceContext(event: unknown): {
|
|
619
649
|
runId: string;
|
|
620
650
|
graphHash: string | null;
|
|
621
651
|
instanceId: string | null;
|
|
622
|
-
|
|
652
|
+
submittedAt: number | null;
|
|
623
653
|
} {
|
|
624
654
|
const record = isRecord(event) ? event : {};
|
|
625
655
|
const payload = isRecord(record.payload) ? record.payload : {};
|
|
@@ -627,17 +657,15 @@ function readWorkflowTraceContext(event: unknown): {
|
|
|
627
657
|
const metadata = isRecord(payload.__dispatcherMetadata)
|
|
628
658
|
? payload.__dispatcherMetadata
|
|
629
659
|
: null;
|
|
630
|
-
const pooled = isPooledWorkflowBootstrapPayload(payload);
|
|
631
660
|
const runId =
|
|
632
661
|
(typeof params?.runId === 'string' && params.runId) ||
|
|
633
662
|
(typeof metadata?.runId === 'string' && metadata.runId) ||
|
|
634
|
-
(pooled && typeof payload.poolId === 'string' ? payload.poolId : null) ||
|
|
635
663
|
(typeof record.instanceId === 'string' && record.instanceId) ||
|
|
636
664
|
'unknown-workflow-run';
|
|
637
665
|
const graphHash =
|
|
638
666
|
(typeof params?.graphHash === 'string' && params.graphHash) ||
|
|
639
667
|
(typeof metadata?.graphHash === 'string' && metadata.graphHash) ||
|
|
640
|
-
|
|
668
|
+
null;
|
|
641
669
|
return {
|
|
642
670
|
runId,
|
|
643
671
|
graphHash,
|
|
@@ -645,18 +673,14 @@ function readWorkflowTraceContext(event: unknown): {
|
|
|
645
673
|
typeof record.instanceId === 'string' && record.instanceId
|
|
646
674
|
? record.instanceId
|
|
647
675
|
: null,
|
|
648
|
-
|
|
676
|
+
submittedAt:
|
|
677
|
+
typeof params?.submittedAt === 'number' &&
|
|
678
|
+
Number.isFinite(params.submittedAt)
|
|
679
|
+
? params.submittedAt
|
|
680
|
+
: null,
|
|
649
681
|
};
|
|
650
682
|
}
|
|
651
683
|
|
|
652
|
-
function workflowPoolEnabled(): boolean {
|
|
653
|
-
return WORKFLOW_POOL_TARGET_SIZE > 0;
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
function workflowPoolTargetSize(): number {
|
|
657
|
-
return WORKFLOW_POOL_TARGET_SIZE;
|
|
658
|
-
}
|
|
659
|
-
|
|
660
684
|
async function waitForSubmitInitialState(input: {
|
|
661
685
|
instance: WorkflowInstance;
|
|
662
686
|
runId: string;
|
|
@@ -704,87 +728,110 @@ async function createDynamicWorkflowInstance(input: {
|
|
|
704
728
|
});
|
|
705
729
|
}
|
|
706
730
|
|
|
731
|
+
function runScopedDurableObject(
|
|
732
|
+
env: CoordinatorEnv,
|
|
733
|
+
runId: string,
|
|
734
|
+
): DurableObjectStub {
|
|
735
|
+
return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(runId));
|
|
736
|
+
}
|
|
737
|
+
|
|
707
738
|
/**
|
|
708
|
-
*
|
|
709
|
-
*
|
|
710
|
-
*
|
|
711
|
-
*
|
|
712
|
-
*
|
|
713
|
-
*
|
|
714
|
-
* See docs/adr/0005-durable-scope.md.
|
|
739
|
+
* Address the rate-state Durable Object for a single `<orgId>:<provider>`
|
|
740
|
+
* bucket. Keying the DO by the bucket id (not the run id) makes one
|
|
741
|
+
* single-threaded instance own that bucket's request window across every
|
|
742
|
+
* isolate of every run in the org — which is exactly what the distributed Rate
|
|
743
|
+
* State Backend needs. Reuses the PlayDedup namespace (the DO already hosts the
|
|
744
|
+
* token-bucket handlers) so no extra binding is required.
|
|
715
745
|
*/
|
|
716
|
-
function
|
|
717
|
-
|
|
718
|
-
|
|
746
|
+
function rateBucketDurableObject(
|
|
747
|
+
env: CoordinatorEnv,
|
|
748
|
+
bucketId: string,
|
|
749
|
+
): DurableObjectStub {
|
|
750
|
+
return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(`rate:${bucketId}`));
|
|
719
751
|
}
|
|
720
752
|
|
|
721
|
-
function
|
|
722
|
-
|
|
723
|
-
|
|
753
|
+
async function callRateBucketControl<T>(
|
|
754
|
+
env: CoordinatorEnv,
|
|
755
|
+
bucketId: string,
|
|
756
|
+
path: string,
|
|
757
|
+
body: unknown,
|
|
758
|
+
): Promise<T> {
|
|
759
|
+
const response = await rateBucketDurableObject(env, bucketId).fetch(
|
|
760
|
+
`https://deepline.rate-state.internal${path}`,
|
|
761
|
+
{
|
|
762
|
+
method: 'POST',
|
|
763
|
+
headers: { 'content-type': 'application/json' },
|
|
764
|
+
body: JSON.stringify(body),
|
|
765
|
+
},
|
|
724
766
|
);
|
|
767
|
+
if (!response.ok) {
|
|
768
|
+
throw new Error(
|
|
769
|
+
`rate state ${path} failed ${response.status}: ${(
|
|
770
|
+
await response.text().catch(() => '')
|
|
771
|
+
).slice(0, 400)}`,
|
|
772
|
+
);
|
|
773
|
+
}
|
|
774
|
+
return (await response.json()) as T;
|
|
725
775
|
}
|
|
726
776
|
|
|
727
|
-
function
|
|
777
|
+
async function callRunScopedControl<T>(
|
|
728
778
|
env: CoordinatorEnv,
|
|
729
779
|
runId: string,
|
|
730
|
-
): DurableObjectStub {
|
|
731
|
-
return env.PLAY_DEDUP.get(env.PLAY_DEDUP.idFromName(runId));
|
|
732
|
-
}
|
|
733
|
-
|
|
734
|
-
async function callWorkflowPool<T>(
|
|
735
|
-
env: CoordinatorEnv,
|
|
736
780
|
path: string,
|
|
737
|
-
init?: RequestInit
|
|
781
|
+
init?: RequestInit,
|
|
738
782
|
): Promise<T> {
|
|
739
|
-
const
|
|
740
|
-
|
|
741
|
-
|
|
783
|
+
const response = await runScopedDurableObject(env, runId).fetch(
|
|
784
|
+
`https://deepline.run-state.internal${path}`,
|
|
785
|
+
{
|
|
786
|
+
...(init ?? {}),
|
|
787
|
+
headers: {
|
|
788
|
+
'content-type': 'application/json',
|
|
789
|
+
...(init?.headers ?? {}),
|
|
790
|
+
},
|
|
791
|
+
},
|
|
742
792
|
);
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
workflowPoolDurableObject(env).fetch(
|
|
750
|
-
`https://deepline.workflow-pool.internal${path}`,
|
|
751
|
-
{
|
|
752
|
-
...fetchInit,
|
|
753
|
-
headers: {
|
|
754
|
-
'content-type': 'application/json',
|
|
755
|
-
...(init?.headers ?? {}),
|
|
756
|
-
},
|
|
757
|
-
},
|
|
758
|
-
),
|
|
759
|
-
new Promise<Response>((_, reject) => {
|
|
760
|
-
timeoutId = setTimeout(
|
|
761
|
-
() =>
|
|
762
|
-
reject(
|
|
763
|
-
new Error(`workflow pool ${path} timed out after ${timeoutMs}ms`),
|
|
764
|
-
),
|
|
765
|
-
timeoutMs,
|
|
766
|
-
);
|
|
767
|
-
}),
|
|
768
|
-
]);
|
|
769
|
-
if (!response.ok) {
|
|
770
|
-
throw new Error(
|
|
771
|
-
`workflow pool ${path} failed ${response.status}: ${(
|
|
772
|
-
await response.text().catch(() => '')
|
|
773
|
-
).slice(0, 400)}`,
|
|
774
|
-
);
|
|
775
|
-
}
|
|
776
|
-
return (await response.json()) as T;
|
|
777
|
-
} catch (error) {
|
|
778
|
-
if (
|
|
779
|
-
error instanceof Error &&
|
|
780
|
-
(error.name === 'AbortError' || error.message.includes('aborted'))
|
|
781
|
-
) {
|
|
782
|
-
throw new Error(`workflow pool ${path} timed out after ${timeoutMs}ms`);
|
|
783
|
-
}
|
|
784
|
-
throw error;
|
|
785
|
-
} finally {
|
|
786
|
-
if (timeoutId) clearTimeout(timeoutId);
|
|
793
|
+
if (!response.ok) {
|
|
794
|
+
throw new Error(
|
|
795
|
+
`run state ${path} failed ${response.status}: ${(
|
|
796
|
+
await response.text().catch(() => '')
|
|
797
|
+
).slice(0, 400)}`,
|
|
798
|
+
);
|
|
787
799
|
}
|
|
800
|
+
return (await response.json()) as T;
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
async function recordWorkflowInstanceId(input: {
|
|
804
|
+
env: CoordinatorEnv;
|
|
805
|
+
runId: string;
|
|
806
|
+
instanceId: string;
|
|
807
|
+
}): Promise<void> {
|
|
808
|
+
await callRunScopedControl<{ ok?: unknown }>(
|
|
809
|
+
input.env,
|
|
810
|
+
input.runId,
|
|
811
|
+
'/workflow-instance-put',
|
|
812
|
+
{
|
|
813
|
+
method: 'POST',
|
|
814
|
+
body: JSON.stringify({
|
|
815
|
+
runId: input.runId,
|
|
816
|
+
instanceId: input.instanceId,
|
|
817
|
+
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
818
|
+
}),
|
|
819
|
+
},
|
|
820
|
+
);
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
async function resolveWorkflowInstanceIdForRun(
|
|
824
|
+
env: CoordinatorEnv,
|
|
825
|
+
runId: string,
|
|
826
|
+
): Promise<string> {
|
|
827
|
+
const body = await callRunScopedControl<{ instanceId?: unknown }>(
|
|
828
|
+
env,
|
|
829
|
+
runId,
|
|
830
|
+
`/workflow-instance-get?runId=${encodeURIComponent(runId)}`,
|
|
831
|
+
).catch(() => ({ instanceId: null }));
|
|
832
|
+
return typeof body.instanceId === 'string' && body.instanceId
|
|
833
|
+
? body.instanceId
|
|
834
|
+
: workflowInstanceId(runId);
|
|
788
835
|
}
|
|
789
836
|
|
|
790
837
|
function assertEncryptedPreloadedDbSessions(
|
|
@@ -804,49 +851,6 @@ function assertEncryptedPreloadedDbSessions(
|
|
|
804
851
|
}
|
|
805
852
|
}
|
|
806
853
|
|
|
807
|
-
async function persistWorkflowDbSessions(input: {
|
|
808
|
-
env: CoordinatorEnv;
|
|
809
|
-
runId: string;
|
|
810
|
-
sessions: PreloadedRuntimeDbSession[];
|
|
811
|
-
}): Promise<NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>> {
|
|
812
|
-
assertEncryptedPreloadedDbSessions(input.sessions);
|
|
813
|
-
const response = await runScopedDurableObject(input.env, input.runId).fetch(
|
|
814
|
-
'https://deepline.dedup.internal/db-sessions-put',
|
|
815
|
-
{
|
|
816
|
-
method: 'POST',
|
|
817
|
-
headers: { 'content-type': 'application/json' },
|
|
818
|
-
body: JSON.stringify({
|
|
819
|
-
runId: input.runId,
|
|
820
|
-
sessions: input.sessions,
|
|
821
|
-
ttlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
|
|
822
|
-
}),
|
|
823
|
-
},
|
|
824
|
-
);
|
|
825
|
-
if (!response.ok) {
|
|
826
|
-
throw new Error(
|
|
827
|
-
`workflow db session storage failed ${response.status}: ${(
|
|
828
|
-
await response.text().catch(() => '')
|
|
829
|
-
).slice(0, 400)}`,
|
|
830
|
-
);
|
|
831
|
-
}
|
|
832
|
-
const body = (await response.json().catch(() => ({}))) as {
|
|
833
|
-
sessionCount?: unknown;
|
|
834
|
-
expiresAt?: unknown;
|
|
835
|
-
};
|
|
836
|
-
return {
|
|
837
|
-
runId: input.runId,
|
|
838
|
-
sessionCount:
|
|
839
|
-
typeof body.sessionCount === 'number' &&
|
|
840
|
-
Number.isFinite(body.sessionCount)
|
|
841
|
-
? body.sessionCount
|
|
842
|
-
: input.sessions.length,
|
|
843
|
-
expiresAt:
|
|
844
|
-
typeof body.expiresAt === 'number' && Number.isFinite(body.expiresAt)
|
|
845
|
-
? body.expiresAt
|
|
846
|
-
: Date.now() + DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
|
|
847
|
-
};
|
|
848
|
-
}
|
|
849
|
-
|
|
850
854
|
async function readWorkflowDbSessions(input: {
|
|
851
855
|
env: CoordinatorEnv;
|
|
852
856
|
ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>;
|
|
@@ -877,34 +881,56 @@ async function readWorkflowDbSessions(input: {
|
|
|
877
881
|
return sessions;
|
|
878
882
|
}
|
|
879
883
|
|
|
880
|
-
async function
|
|
884
|
+
async function readWorkflowDbSessionsWithRetry(input: {
|
|
881
885
|
env: CoordinatorEnv;
|
|
886
|
+
ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']>;
|
|
887
|
+
}): Promise<PreloadedRuntimeDbSession[]> {
|
|
888
|
+
const delays = [25, 50, 100, 200] as const;
|
|
889
|
+
let lastError: unknown = null;
|
|
890
|
+
for (let attempt = 0; attempt <= delays.length; attempt += 1) {
|
|
891
|
+
try {
|
|
892
|
+
return await readWorkflowDbSessions(input);
|
|
893
|
+
} catch (error) {
|
|
894
|
+
lastError = error;
|
|
895
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
896
|
+
if (
|
|
897
|
+
!message.includes('workflow db session lookup failed 404') ||
|
|
898
|
+
attempt >= delays.length
|
|
899
|
+
) {
|
|
900
|
+
throw error;
|
|
901
|
+
}
|
|
902
|
+
await sleep(delays[attempt]);
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
throw lastError instanceof Error ? lastError : new Error(String(lastError));
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
function externalizedWorkflowDbSessionParams(input: {
|
|
882
909
|
params: PlayWorkflowParams;
|
|
883
|
-
|
|
884
|
-
|
|
910
|
+
}): {
|
|
911
|
+
params: PlayWorkflowParams;
|
|
912
|
+
sessions: PreloadedRuntimeDbSession[];
|
|
913
|
+
ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']> | null;
|
|
914
|
+
} {
|
|
885
915
|
const sessions = Array.isArray(input.params.preloadedDbSessions)
|
|
886
916
|
? input.params.preloadedDbSessions
|
|
887
917
|
: [];
|
|
888
|
-
if (sessions.length === 0)
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
918
|
+
if (sessions.length === 0) {
|
|
919
|
+
return { params: input.params, sessions, ref: null };
|
|
920
|
+
}
|
|
921
|
+
const ref: NonNullable<PlayWorkflowParams['preloadedDbSessionRef']> = {
|
|
892
922
|
runId: input.params.runId,
|
|
893
|
-
sessions,
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
phase: 'coordinator.workflow_db_sessions_externalized',
|
|
897
|
-
ms: Date.now() - startedAt,
|
|
898
|
-
graphHash: input.params.graphHash ?? null,
|
|
899
|
-
extra: {
|
|
900
|
-
sessions: sessions.length,
|
|
901
|
-
expiresAt: ref.expiresAt,
|
|
902
|
-
},
|
|
903
|
-
});
|
|
923
|
+
sessionCount: sessions.length,
|
|
924
|
+
expiresAt: Date.now() + DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
|
|
925
|
+
};
|
|
904
926
|
return {
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
927
|
+
params: {
|
|
928
|
+
...input.params,
|
|
929
|
+
preloadedDbSessions: null,
|
|
930
|
+
preloadedDbSessionRef: ref,
|
|
931
|
+
},
|
|
932
|
+
sessions,
|
|
933
|
+
ref,
|
|
908
934
|
};
|
|
909
935
|
}
|
|
910
936
|
|
|
@@ -941,7 +967,10 @@ async function hydrateWorkflowDbSessions(input: {
|
|
|
941
967
|
const ref = readPreloadedDbSessionRef(params.preloadedDbSessionRef);
|
|
942
968
|
if (!ref) return input.event;
|
|
943
969
|
const startedAt = Date.now();
|
|
944
|
-
const sessions = await
|
|
970
|
+
const sessions = await readWorkflowDbSessionsWithRetry({
|
|
971
|
+
env: input.env,
|
|
972
|
+
ref,
|
|
973
|
+
});
|
|
945
974
|
input.trace({
|
|
946
975
|
runId: ref.runId,
|
|
947
976
|
phase: 'coordinator.workflow_db_sessions_hydrated',
|
|
@@ -969,278 +998,312 @@ async function hydrateWorkflowDbSessions(input: {
|
|
|
969
998
|
};
|
|
970
999
|
}
|
|
971
1000
|
|
|
972
|
-
type WorkflowPoolCounts = {
|
|
973
|
-
available: number;
|
|
974
|
-
warming: number;
|
|
975
|
-
};
|
|
976
|
-
|
|
977
|
-
type WorkflowPoolRefillResult = WorkflowPoolCounts & {
|
|
978
|
-
target: number;
|
|
979
|
-
created: number;
|
|
980
|
-
promoted: number;
|
|
981
|
-
removed: number;
|
|
982
|
-
waitedMs: number;
|
|
983
|
-
waitIterations: number;
|
|
984
|
-
};
|
|
985
|
-
|
|
986
|
-
type WorkflowPoolListEntry = {
|
|
987
|
-
id: string;
|
|
988
|
-
state: string;
|
|
989
|
-
createdAt: number;
|
|
990
|
-
readyAt: number | null;
|
|
991
|
-
expiresAt: number;
|
|
992
|
-
};
|
|
993
|
-
|
|
994
|
-
async function workflowPoolCount(
|
|
995
|
-
env: CoordinatorEnv,
|
|
996
|
-
): Promise<WorkflowPoolCounts> {
|
|
997
|
-
const body = await callWorkflowPool<{
|
|
998
|
-
available?: unknown;
|
|
999
|
-
warming?: unknown;
|
|
1000
|
-
}>(
|
|
1001
|
-
env,
|
|
1002
|
-
`/pool-count?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
|
|
1003
|
-
);
|
|
1004
|
-
return {
|
|
1005
|
-
available: typeof body.available === 'number' ? body.available : 0,
|
|
1006
|
-
warming: typeof body.warming === 'number' ? body.warming : 0,
|
|
1007
|
-
};
|
|
1008
|
-
}
|
|
1009
|
-
|
|
1010
1001
|
function sleep(ms: number): Promise<void> {
|
|
1011
1002
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
1012
1003
|
}
|
|
1013
1004
|
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
`/pool-list?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
|
|
1020
|
-
);
|
|
1021
|
-
if (!Array.isArray(body.entries)) return [];
|
|
1022
|
-
return body.entries
|
|
1023
|
-
.filter((entry): entry is Record<string, unknown> =>
|
|
1024
|
-
Boolean(entry && typeof entry === 'object' && !Array.isArray(entry)),
|
|
1025
|
-
)
|
|
1026
|
-
.map((entry) => ({
|
|
1027
|
-
id: typeof entry.id === 'string' ? entry.id : '',
|
|
1028
|
-
state: typeof entry.state === 'string' ? entry.state : '',
|
|
1029
|
-
createdAt:
|
|
1030
|
-
typeof entry.createdAt === 'number' && Number.isFinite(entry.createdAt)
|
|
1031
|
-
? entry.createdAt
|
|
1032
|
-
: 0,
|
|
1033
|
-
readyAt:
|
|
1034
|
-
typeof entry.readyAt === 'number' && Number.isFinite(entry.readyAt)
|
|
1035
|
-
? entry.readyAt
|
|
1036
|
-
: null,
|
|
1037
|
-
expiresAt:
|
|
1038
|
-
typeof entry.expiresAt === 'number' && Number.isFinite(entry.expiresAt)
|
|
1039
|
-
? entry.expiresAt
|
|
1040
|
-
: 0,
|
|
1041
|
-
}))
|
|
1042
|
-
.filter((entry) => entry.id);
|
|
1005
|
+
function readWorkflowPayload(event: unknown): Record<string, unknown> | null {
|
|
1006
|
+
if (!isRecord(event)) return null;
|
|
1007
|
+
const payload = event.payload;
|
|
1008
|
+
if (!isRecord(payload)) return null;
|
|
1009
|
+
return isRecord(payload.params) ? payload.params : payload;
|
|
1043
1010
|
}
|
|
1044
1011
|
|
|
1045
|
-
async function
|
|
1046
|
-
env: CoordinatorEnv
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
): Promise<void> {
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1012
|
+
async function markWorkflowRuntimeFailure(input: {
|
|
1013
|
+
env: CoordinatorEnv;
|
|
1014
|
+
event: unknown;
|
|
1015
|
+
error: unknown;
|
|
1016
|
+
}): Promise<void> {
|
|
1017
|
+
const payload = readWorkflowPayload(input.event);
|
|
1018
|
+
if (!payload) return;
|
|
1019
|
+
const runId = typeof payload.runId === 'string' ? payload.runId : null;
|
|
1020
|
+
const baseUrl = typeof payload.baseUrl === 'string' ? payload.baseUrl : null;
|
|
1021
|
+
const executorToken =
|
|
1022
|
+
typeof payload.executorToken === 'string' ? payload.executorToken : null;
|
|
1023
|
+
if (!runId || !baseUrl || !executorToken) return;
|
|
1024
|
+
const errorName =
|
|
1025
|
+
input.error instanceof Error && input.error.name
|
|
1026
|
+
? input.error.name
|
|
1027
|
+
: 'Error';
|
|
1028
|
+
const errorMessage =
|
|
1029
|
+
input.error instanceof Error ? input.error.message : String(input.error);
|
|
1030
|
+
const errorStack =
|
|
1031
|
+
input.error instanceof Error && typeof input.error.stack === 'string'
|
|
1032
|
+
? input.error.stack.split('\n').slice(0, 12).join('\n')
|
|
1033
|
+
: null;
|
|
1034
|
+
const headers = new Headers({
|
|
1035
|
+
authorization: `Bearer ${executorToken}`,
|
|
1036
|
+
'content-type': 'application/json',
|
|
1060
1037
|
});
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1038
|
+
const bypass = input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
|
|
1039
|
+
if (bypass) headers.set('x-vercel-protection-bypass', bypass);
|
|
1040
|
+
const body = JSON.stringify({
|
|
1041
|
+
action: 'append_run_events',
|
|
1042
|
+
playId: runId,
|
|
1043
|
+
events: [
|
|
1044
|
+
{
|
|
1045
|
+
type: 'run.failed',
|
|
1046
|
+
runId,
|
|
1047
|
+
source: 'coordinator',
|
|
1048
|
+
occurredAt: Date.now(),
|
|
1049
|
+
error: `DynamicWorkflow runner failed: ${errorName}: ${errorMessage}${
|
|
1050
|
+
errorStack ? `\n${errorStack}` : ''
|
|
1051
|
+
}`,
|
|
1052
|
+
} satisfies PlayRunLedgerEvent,
|
|
1053
|
+
],
|
|
1073
1054
|
});
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
)
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1055
|
+
const url = `${baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/runtime`;
|
|
1056
|
+
const backoffMs = [200, 500, 1500];
|
|
1057
|
+
let lastError: unknown = null;
|
|
1058
|
+
for (let attempt = 0; attempt <= backoffMs.length; attempt += 1) {
|
|
1059
|
+
try {
|
|
1060
|
+
const response = await fetch(url, { method: 'POST', headers, body });
|
|
1061
|
+
if (response.ok) return;
|
|
1062
|
+
lastError = new Error(
|
|
1063
|
+
`runtime API responded ${response.status}: ${(await response.text().catch(() => '')).slice(0, 400)}`,
|
|
1064
|
+
);
|
|
1065
|
+
if (
|
|
1066
|
+
response.status >= 400 &&
|
|
1067
|
+
response.status < 500 &&
|
|
1068
|
+
response.status !== 408 &&
|
|
1069
|
+
response.status !== 429
|
|
1070
|
+
) {
|
|
1071
|
+
break;
|
|
1072
|
+
}
|
|
1073
|
+
} catch (error) {
|
|
1074
|
+
lastError = error;
|
|
1075
|
+
}
|
|
1076
|
+
if (attempt < backoffMs.length) {
|
|
1077
|
+
await new Promise((resolve) => setTimeout(resolve, backoffMs[attempt]));
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
console.error('[coordinator] failed to mark workflow runtime failure', {
|
|
1081
|
+
runId,
|
|
1082
|
+
message: lastError instanceof Error ? lastError.message : String(lastError),
|
|
1088
1083
|
});
|
|
1089
1084
|
}
|
|
1090
1085
|
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
method: 'POST',
|
|
1098
|
-
body: JSON.stringify({
|
|
1099
|
-
ids,
|
|
1100
|
-
version: WORKFLOW_POOL_PROTOCOL_VERSION,
|
|
1101
|
-
}),
|
|
1102
|
-
});
|
|
1103
|
-
}
|
|
1086
|
+
type StoredPlayArtifactPayload = {
|
|
1087
|
+
artifact?: {
|
|
1088
|
+
bundledCode?: string;
|
|
1089
|
+
artifactKind?: string;
|
|
1090
|
+
};
|
|
1091
|
+
};
|
|
1104
1092
|
|
|
1105
|
-
|
|
1106
|
-
env: CoordinatorEnv,
|
|
1107
|
-
runId: string,
|
|
1108
|
-
): Promise<string | null> {
|
|
1109
|
-
const body = await callWorkflowPool<{ id?: unknown }>(
|
|
1110
|
-
env,
|
|
1111
|
-
`/pool-claim?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
|
|
1112
|
-
{
|
|
1113
|
-
method: 'POST',
|
|
1114
|
-
body: JSON.stringify({ runId }),
|
|
1115
|
-
},
|
|
1116
|
-
);
|
|
1117
|
-
return typeof body.id === 'string' && body.id ? body.id : null;
|
|
1118
|
-
}
|
|
1093
|
+
const DYNAMIC_WORKER_COMPATIBILITY_DATE = '2026-05-01';
|
|
1119
1094
|
|
|
1120
|
-
async function
|
|
1095
|
+
async function persistWorkflowRetryState(input: {
|
|
1121
1096
|
env: CoordinatorEnv;
|
|
1122
1097
|
runId: string;
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
const
|
|
1098
|
+
params: PlayWorkflowParams;
|
|
1099
|
+
}): Promise<void> {
|
|
1100
|
+
const retryParams = buildWorkflowRetryParams(input.params);
|
|
1101
|
+
const paramsBytes = jsonByteLength(retryParams);
|
|
1102
|
+
if (paramsBytes > WORKFLOW_RETRY_PARAMS_MAX_BYTES) {
|
|
1103
|
+
throw new Error(
|
|
1104
|
+
`workflow retry params too large: ${paramsBytes} bytes exceeds ${WORKFLOW_RETRY_PARAMS_MAX_BYTES}. Pass large payloads as staged files or ctx.csv inputs instead of inline JSON.`,
|
|
1105
|
+
);
|
|
1106
|
+
}
|
|
1107
|
+
let body: {
|
|
1108
|
+
runId: string;
|
|
1109
|
+
params?: PlayWorkflowParams;
|
|
1110
|
+
paramsRef?: WorkflowRetryParamsRef;
|
|
1111
|
+
paramsBytes: number;
|
|
1112
|
+
ttlMs: number;
|
|
1113
|
+
};
|
|
1114
|
+
if (paramsBytes > WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES) {
|
|
1115
|
+
const serialized = JSON.stringify(retryParams);
|
|
1116
|
+
const hash = stableHash(serialized);
|
|
1117
|
+
const storageKey = workflowRetryParamsStorageKey({
|
|
1118
|
+
runId: input.runId,
|
|
1119
|
+
hash,
|
|
1120
|
+
});
|
|
1121
|
+
await input.env.PLAYS_BUCKET.put(storageKey, serialized, {
|
|
1122
|
+
httpMetadata: { contentType: 'application/json' },
|
|
1123
|
+
});
|
|
1124
|
+
body = {
|
|
1125
|
+
runId: input.runId,
|
|
1126
|
+
paramsRef: {
|
|
1127
|
+
storageKind: 'r2',
|
|
1128
|
+
storageKey,
|
|
1129
|
+
bytes: paramsBytes,
|
|
1130
|
+
hash,
|
|
1131
|
+
expiresAt: Date.now() + WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1132
|
+
},
|
|
1133
|
+
paramsBytes,
|
|
1134
|
+
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1135
|
+
};
|
|
1136
|
+
} else {
|
|
1137
|
+
body = {
|
|
1138
|
+
runId: input.runId,
|
|
1139
|
+
params: retryParams,
|
|
1140
|
+
paramsBytes,
|
|
1141
|
+
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1142
|
+
};
|
|
1143
|
+
}
|
|
1144
|
+
await callRunScopedControl<{ ok?: unknown }>(
|
|
1127
1145
|
input.env,
|
|
1128
|
-
|
|
1146
|
+
input.runId,
|
|
1147
|
+
'/run-retry-state-put',
|
|
1129
1148
|
{
|
|
1130
1149
|
method: 'POST',
|
|
1131
|
-
body: JSON.stringify(
|
|
1132
|
-
runId: input.runId,
|
|
1133
|
-
instanceId: input.instanceId,
|
|
1134
|
-
started: input.started === true,
|
|
1135
|
-
version: WORKFLOW_POOL_PROTOCOL_VERSION,
|
|
1136
|
-
}),
|
|
1150
|
+
body: JSON.stringify(body),
|
|
1137
1151
|
},
|
|
1138
1152
|
);
|
|
1139
|
-
return body.mapped !== false;
|
|
1140
1153
|
}
|
|
1141
1154
|
|
|
1142
|
-
async function
|
|
1155
|
+
async function persistWorkflowLaunchState(input: {
|
|
1143
1156
|
env: CoordinatorEnv;
|
|
1144
1157
|
runId: string;
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1158
|
+
params: PlayWorkflowParams;
|
|
1159
|
+
sessions: PreloadedRuntimeDbSession[];
|
|
1160
|
+
}): Promise<{
|
|
1161
|
+
retryExpiresAt?: number;
|
|
1162
|
+
dbSessionsExpiresAt?: number;
|
|
1163
|
+
sessionCount?: number;
|
|
1164
|
+
}> {
|
|
1165
|
+
if (input.sessions.length === 0) {
|
|
1166
|
+
await persistWorkflowRetryState({
|
|
1167
|
+
env: input.env,
|
|
1153
1168
|
runId: input.runId,
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
}
|
|
1169
|
+
params: input.params,
|
|
1170
|
+
});
|
|
1171
|
+
return {};
|
|
1172
|
+
}
|
|
1173
|
+
const retryParams = buildWorkflowRetryParams(input.params);
|
|
1174
|
+
const paramsBytes = jsonByteLength(retryParams);
|
|
1175
|
+
let body: {
|
|
1176
|
+
runId: string;
|
|
1177
|
+
params?: PlayWorkflowParams;
|
|
1178
|
+
paramsRef?: WorkflowRetryParamsRef;
|
|
1179
|
+
paramsBytes: number;
|
|
1180
|
+
sessions: PreloadedRuntimeDbSession[];
|
|
1181
|
+
retryTtlMs: number;
|
|
1182
|
+
dbSessionsTtlMs: number;
|
|
1183
|
+
};
|
|
1184
|
+
if (paramsBytes > WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES) {
|
|
1185
|
+
const serialized = JSON.stringify(retryParams);
|
|
1186
|
+
const hash = stableHash(serialized);
|
|
1187
|
+
const storageKey = workflowRetryParamsStorageKey({
|
|
1188
|
+
runId: input.runId,
|
|
1189
|
+
hash,
|
|
1190
|
+
});
|
|
1191
|
+
await input.env.PLAYS_BUCKET.put(storageKey, serialized, {
|
|
1192
|
+
httpMetadata: { contentType: 'application/json' },
|
|
1193
|
+
});
|
|
1194
|
+
body = {
|
|
1195
|
+
runId: input.runId,
|
|
1196
|
+
paramsRef: {
|
|
1197
|
+
storageKind: 'r2',
|
|
1198
|
+
storageKey,
|
|
1199
|
+
bytes: paramsBytes,
|
|
1200
|
+
hash,
|
|
1201
|
+
expiresAt: Date.now() + WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1202
|
+
},
|
|
1203
|
+
paramsBytes,
|
|
1204
|
+
sessions: input.sessions,
|
|
1205
|
+
retryTtlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1206
|
+
dbSessionsTtlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
|
|
1207
|
+
};
|
|
1208
|
+
} else {
|
|
1209
|
+
body = {
|
|
1210
|
+
runId: input.runId,
|
|
1211
|
+
params: retryParams,
|
|
1212
|
+
paramsBytes,
|
|
1213
|
+
sessions: input.sessions,
|
|
1214
|
+
retryTtlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1215
|
+
dbSessionsTtlMs: DB_SESSION_DEFAULT_TTL_SECONDS * 1000,
|
|
1216
|
+
};
|
|
1217
|
+
}
|
|
1218
|
+
const response = await callRunScopedControl<{
|
|
1219
|
+
ok?: unknown;
|
|
1220
|
+
retryExpiresAt?: unknown;
|
|
1221
|
+
dbSessionsExpiresAt?: unknown;
|
|
1222
|
+
sessionCount?: unknown;
|
|
1223
|
+
}>(input.env, input.runId, '/run-launch-state-put', {
|
|
1224
|
+
method: 'POST',
|
|
1225
|
+
body: JSON.stringify(body),
|
|
1157
1226
|
});
|
|
1158
1227
|
return {
|
|
1159
|
-
|
|
1160
|
-
|
|
1228
|
+
retryExpiresAt:
|
|
1229
|
+
typeof response.retryExpiresAt === 'number'
|
|
1230
|
+
? response.retryExpiresAt
|
|
1231
|
+
: undefined,
|
|
1232
|
+
dbSessionsExpiresAt:
|
|
1233
|
+
typeof response.dbSessionsExpiresAt === 'number'
|
|
1234
|
+
? response.dbSessionsExpiresAt
|
|
1235
|
+
: undefined,
|
|
1236
|
+
sessionCount:
|
|
1237
|
+
typeof response.sessionCount === 'number'
|
|
1238
|
+
? response.sessionCount
|
|
1239
|
+
: undefined,
|
|
1161
1240
|
};
|
|
1162
1241
|
}
|
|
1163
1242
|
|
|
1164
|
-
async function
|
|
1243
|
+
async function hydrateWorkflowRetryParams(input: {
|
|
1165
1244
|
env: CoordinatorEnv;
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
}
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1245
|
+
params: unknown;
|
|
1246
|
+
paramsRef: unknown;
|
|
1247
|
+
}): Promise<PlayWorkflowParams | null> {
|
|
1248
|
+
if (isRecord(input.params)) {
|
|
1249
|
+
return input.params as PlayWorkflowParams;
|
|
1250
|
+
}
|
|
1251
|
+
if (!isRecord(input.paramsRef)) {
|
|
1252
|
+
return null;
|
|
1253
|
+
}
|
|
1254
|
+
const storageKind = input.paramsRef.storageKind;
|
|
1255
|
+
const storageKey = input.paramsRef.storageKey;
|
|
1256
|
+
const expectedBytes = input.paramsRef.bytes;
|
|
1257
|
+
const expectedHash = input.paramsRef.hash;
|
|
1258
|
+
if (
|
|
1259
|
+
storageKind !== 'r2' ||
|
|
1260
|
+
typeof storageKey !== 'string' ||
|
|
1261
|
+
!storageKey.startsWith('plays/workflow-retry-params/') ||
|
|
1262
|
+
typeof expectedBytes !== 'number' ||
|
|
1263
|
+
!Number.isFinite(expectedBytes) ||
|
|
1264
|
+
typeof expectedHash !== 'string' ||
|
|
1265
|
+
!expectedHash
|
|
1266
|
+
) {
|
|
1267
|
+
throw new Error('Invalid workflow retry params reference.');
|
|
1268
|
+
}
|
|
1269
|
+
const object = await input.env.PLAYS_BUCKET.get(storageKey);
|
|
1270
|
+
if (!object) {
|
|
1271
|
+
throw new Error(`Workflow retry params missing from R2: ${storageKey}`);
|
|
1272
|
+
}
|
|
1273
|
+
const text = await object.text();
|
|
1274
|
+
const actualBytes = new TextEncoder().encode(text).length;
|
|
1275
|
+
if (actualBytes !== expectedBytes) {
|
|
1276
|
+
throw new Error(
|
|
1277
|
+
`Workflow retry params byte length mismatch: expected ${expectedBytes}, got ${actualBytes}.`,
|
|
1278
|
+
);
|
|
1279
|
+
}
|
|
1280
|
+
const actualHash = stableHash(text);
|
|
1281
|
+
if (actualHash !== expectedHash) {
|
|
1282
|
+
throw new Error('Workflow retry params hash mismatch.');
|
|
1283
|
+
}
|
|
1284
|
+
const parsed = JSON.parse(text) as unknown;
|
|
1285
|
+
return isRecord(parsed) ? (parsed as PlayWorkflowParams) : null;
|
|
1187
1286
|
}
|
|
1188
1287
|
|
|
1189
|
-
|
|
1190
|
-
env: CoordinatorEnv;
|
|
1288
|
+
function workflowRetryStatePersistenceErrorResponse(input: {
|
|
1191
1289
|
runId: string;
|
|
1192
|
-
|
|
1193
|
-
}):
|
|
1194
|
-
const
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
await callWorkflowPool<{ ok?: unknown }>(input.env, '/run-retry-state-put', {
|
|
1210
|
-
method: 'POST',
|
|
1211
|
-
body: JSON.stringify({
|
|
1212
|
-
runId: input.runId,
|
|
1213
|
-
params: retryParams,
|
|
1214
|
-
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1215
|
-
}),
|
|
1216
|
-
}).catch((error) => {
|
|
1217
|
-
console.warn('[coordinator] workflow retry state persistence skipped', {
|
|
1218
|
-
runId: input.runId,
|
|
1219
|
-
error: error instanceof Error ? error.message : String(error),
|
|
1220
|
-
});
|
|
1221
|
-
});
|
|
1222
|
-
}
|
|
1223
|
-
|
|
1224
|
-
function stripRetrySourceSnapshot(snapshot: unknown): unknown {
|
|
1225
|
-
if (!isRecord(snapshot)) return snapshot;
|
|
1226
|
-
const rest = { ...snapshot };
|
|
1227
|
-
delete rest.sourceCode;
|
|
1228
|
-
delete rest.sourceFiles;
|
|
1229
|
-
return rest;
|
|
1230
|
-
}
|
|
1231
|
-
|
|
1232
|
-
function stripRetryChildManifestCode(
|
|
1233
|
-
manifests: PlayRuntimeManifestMap | null | undefined,
|
|
1234
|
-
): PlayRuntimeManifestMap | null {
|
|
1235
|
-
if (!manifests) return null;
|
|
1236
|
-
const stripped: PlayRuntimeManifestMap = {};
|
|
1237
|
-
for (const [key, manifest] of Object.entries(manifests)) {
|
|
1238
|
-
const rest = { ...manifest };
|
|
1239
|
-
delete rest.bundledCode;
|
|
1240
|
-
delete rest.sourceCode;
|
|
1241
|
-
stripped[key] = rest;
|
|
1242
|
-
}
|
|
1243
|
-
return stripped;
|
|
1290
|
+
error: unknown;
|
|
1291
|
+
}): Response {
|
|
1292
|
+
const message =
|
|
1293
|
+
input.error instanceof Error ? input.error.message : String(input.error);
|
|
1294
|
+
return Response.json(
|
|
1295
|
+
{
|
|
1296
|
+
error: {
|
|
1297
|
+
code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
|
|
1298
|
+
message:
|
|
1299
|
+
'Failed to persist workflow retry state before dispatching the play run.',
|
|
1300
|
+
phase: 'coordinator_retry_state_persistence',
|
|
1301
|
+
runId: input.runId,
|
|
1302
|
+
cause: message,
|
|
1303
|
+
},
|
|
1304
|
+
},
|
|
1305
|
+
{ status: 503 },
|
|
1306
|
+
);
|
|
1244
1307
|
}
|
|
1245
1308
|
|
|
1246
1309
|
async function claimWorkflowPlatformRetry(input: {
|
|
@@ -1251,21 +1314,27 @@ async function claimWorkflowPlatformRetry(input: {
|
|
|
1251
1314
|
attempts: number;
|
|
1252
1315
|
params: PlayWorkflowParams | null;
|
|
1253
1316
|
}> {
|
|
1254
|
-
const body = await
|
|
1317
|
+
const body = await callRunScopedControl<{
|
|
1255
1318
|
claimed?: unknown;
|
|
1256
1319
|
attempts?: unknown;
|
|
1257
1320
|
params?: unknown;
|
|
1258
|
-
|
|
1321
|
+
paramsRef?: unknown;
|
|
1322
|
+
}>(input.env, input.runId, '/run-retry-claim', {
|
|
1259
1323
|
method: 'POST',
|
|
1260
1324
|
body: JSON.stringify({
|
|
1261
1325
|
runId: input.runId,
|
|
1262
1326
|
maxAttempts: PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
|
|
1263
1327
|
}),
|
|
1264
1328
|
});
|
|
1329
|
+
const params = await hydrateWorkflowRetryParams({
|
|
1330
|
+
env: input.env,
|
|
1331
|
+
params: body.params,
|
|
1332
|
+
paramsRef: body.paramsRef,
|
|
1333
|
+
});
|
|
1265
1334
|
return {
|
|
1266
1335
|
claimed: body.claimed === true,
|
|
1267
1336
|
attempts: typeof body.attempts === 'number' ? body.attempts : 0,
|
|
1268
|
-
params
|
|
1337
|
+
params,
|
|
1269
1338
|
};
|
|
1270
1339
|
}
|
|
1271
1340
|
|
|
@@ -1302,690 +1371,60 @@ async function restartWorkflowAfterPlatformReset(input: {
|
|
|
1302
1371
|
console.warn('[coordinator] workflow platform retry claim failed', {
|
|
1303
1372
|
runId: input.runId,
|
|
1304
1373
|
error: error instanceof Error ? error.message : String(error),
|
|
1305
|
-
});
|
|
1306
|
-
return null;
|
|
1307
|
-
});
|
|
1308
|
-
if (!claim?.claimed || !claim.params) {
|
|
1309
|
-
return { retried: false, result: null };
|
|
1310
|
-
}
|
|
1311
|
-
const retryInstanceId = workflowRetryInstanceId(input.runId, claim.attempts);
|
|
1312
|
-
const retryStartedAt = Date.now();
|
|
1313
|
-
let retryInstance: WorkflowInstance | null = null;
|
|
1314
|
-
try {
|
|
1315
|
-
retryInstance = await createDynamicWorkflowInstance({
|
|
1316
|
-
env: input.env,
|
|
1317
|
-
id: retryInstanceId,
|
|
1318
|
-
params: claim.params,
|
|
1319
|
-
});
|
|
1320
|
-
await mapRunToWorkflowInstance({
|
|
1321
|
-
env: input.env,
|
|
1322
|
-
runId: input.runId,
|
|
1323
|
-
instanceId: retryInstance.id,
|
|
1324
|
-
started: true,
|
|
1325
|
-
});
|
|
1326
|
-
input.ctx?.waitUntil(input.oldInstance.terminate().catch(() => undefined));
|
|
1327
|
-
recordCoordinatorPerfTraceBuffered(input.env, input.ctx, {
|
|
1328
|
-
runId: input.runId,
|
|
1329
|
-
phase: 'coordinator.platform_deploy_retry',
|
|
1330
|
-
ms: Date.now() - retryStartedAt,
|
|
1331
|
-
graphHash: claim.params.graphHash ?? null,
|
|
1332
|
-
extra: {
|
|
1333
|
-
retryAttempt: claim.attempts,
|
|
1334
|
-
retryInstanceId: retryInstance.id,
|
|
1335
|
-
reason: decision.reason,
|
|
1336
|
-
},
|
|
1337
|
-
});
|
|
1338
|
-
return {
|
|
1339
|
-
retried: true,
|
|
1340
|
-
result: {
|
|
1341
|
-
runId: input.runId,
|
|
1342
|
-
playName: claim.params.playName,
|
|
1343
|
-
status: 'running',
|
|
1344
|
-
result: null,
|
|
1345
|
-
error: null,
|
|
1346
|
-
retry: {
|
|
1347
|
-
reason: decision.reason,
|
|
1348
|
-
attempt: claim.attempts,
|
|
1349
|
-
message: decision.message,
|
|
1350
|
-
},
|
|
1351
|
-
},
|
|
1352
|
-
};
|
|
1353
|
-
} finally {
|
|
1354
|
-
disposeRpcStub(retryInstance);
|
|
1355
|
-
}
|
|
1356
|
-
}
|
|
1357
|
-
|
|
1358
|
-
async function waitForWorkflowPoolStartAck(input: {
|
|
1359
|
-
env: CoordinatorEnv;
|
|
1360
|
-
runId: string;
|
|
1361
|
-
instanceId: string;
|
|
1362
|
-
timeoutMs: number;
|
|
1363
|
-
}): Promise<{
|
|
1364
|
-
acknowledged: boolean;
|
|
1365
|
-
ms: number;
|
|
1366
|
-
polls: number;
|
|
1367
|
-
startedAt: number | null;
|
|
1368
|
-
mappedInstanceId: string | null;
|
|
1369
|
-
}> {
|
|
1370
|
-
const startedAt = Date.now();
|
|
1371
|
-
let polls = 0;
|
|
1372
|
-
let latestMapping: { instanceId: string | null; startedAt: number | null } = {
|
|
1373
|
-
instanceId: null,
|
|
1374
|
-
startedAt: null,
|
|
1375
|
-
};
|
|
1376
|
-
while (Date.now() - startedAt < input.timeoutMs) {
|
|
1377
|
-
polls += 1;
|
|
1378
|
-
latestMapping = await readWorkflowPoolRunMapping({
|
|
1379
|
-
env: input.env,
|
|
1380
|
-
runId: input.runId,
|
|
1381
|
-
});
|
|
1382
|
-
if (
|
|
1383
|
-
latestMapping.instanceId === input.instanceId &&
|
|
1384
|
-
latestMapping.startedAt !== null
|
|
1385
|
-
) {
|
|
1386
|
-
return {
|
|
1387
|
-
acknowledged: true,
|
|
1388
|
-
ms: Date.now() - startedAt,
|
|
1389
|
-
polls,
|
|
1390
|
-
startedAt: latestMapping.startedAt,
|
|
1391
|
-
mappedInstanceId: latestMapping.instanceId,
|
|
1392
|
-
};
|
|
1393
|
-
}
|
|
1394
|
-
await sleep(WORKFLOW_POOL_START_ACK_POLL_MS);
|
|
1395
|
-
}
|
|
1396
|
-
return {
|
|
1397
|
-
acknowledged: false,
|
|
1398
|
-
ms: Date.now() - startedAt,
|
|
1399
|
-
polls,
|
|
1400
|
-
startedAt: latestMapping.startedAt,
|
|
1401
|
-
mappedInstanceId: latestMapping.instanceId,
|
|
1402
|
-
};
|
|
1403
|
-
}
|
|
1404
|
-
|
|
1405
|
-
async function resolveWorkflowInstanceIdForRun(
|
|
1406
|
-
env: CoordinatorEnv,
|
|
1407
|
-
runId: string,
|
|
1408
|
-
): Promise<string> {
|
|
1409
|
-
if (!workflowPoolEnabled()) {
|
|
1410
|
-
return workflowInstanceId(runId);
|
|
1411
|
-
}
|
|
1412
|
-
const mapping = await readWorkflowPoolRunMapping({ env, runId });
|
|
1413
|
-
return mapping.instanceId ? mapping.instanceId : workflowInstanceId(runId);
|
|
1414
|
-
}
|
|
1415
|
-
|
|
1416
|
-
async function clearWorkflowPool(env: CoordinatorEnv): Promise<number> {
|
|
1417
|
-
const entries = await listWorkflowPoolEntries(env).catch(() => []);
|
|
1418
|
-
const body = await callWorkflowPool<{ deleted?: unknown }>(
|
|
1419
|
-
env,
|
|
1420
|
-
`/pool-clear?version=${encodeURIComponent(WORKFLOW_POOL_PROTOCOL_VERSION)}`,
|
|
1421
|
-
{ method: 'POST', body: '{}' },
|
|
1422
|
-
);
|
|
1423
|
-
await Promise.all(
|
|
1424
|
-
entries.map(async (entry) => {
|
|
1425
|
-
const instance = await getWorkflowPoolInstance(env, entry.id);
|
|
1426
|
-
if (!instance) {
|
|
1427
|
-
return;
|
|
1428
|
-
}
|
|
1429
|
-
try {
|
|
1430
|
-
await instance.terminate().catch(() => undefined);
|
|
1431
|
-
} finally {
|
|
1432
|
-
disposeRpcStub(instance);
|
|
1433
|
-
}
|
|
1434
|
-
}),
|
|
1435
|
-
);
|
|
1436
|
-
return typeof body.deleted === 'number' ? body.deleted : 0;
|
|
1437
|
-
}
|
|
1438
|
-
|
|
1439
|
-
function workflowStatusName(status: InstanceStatus | null): string {
|
|
1440
|
-
return typeof status?.status === 'string' ? status.status : 'unknown';
|
|
1441
|
-
}
|
|
1442
|
-
|
|
1443
|
-
function isWorkflowInstanceNotFoundError(error: unknown): boolean {
|
|
1444
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1445
|
-
return /not[ _]found|not_found|does not exist|no such instance|404/i.test(
|
|
1446
|
-
message,
|
|
1447
|
-
);
|
|
1448
|
-
}
|
|
1449
|
-
|
|
1450
|
-
async function getWorkflowPoolInstance(
|
|
1451
|
-
env: CoordinatorEnv,
|
|
1452
|
-
instanceId: string,
|
|
1453
|
-
): Promise<WorkflowInstance | null> {
|
|
1454
|
-
try {
|
|
1455
|
-
return await env.PLAY_WORKFLOW.get(instanceId);
|
|
1456
|
-
} catch (error) {
|
|
1457
|
-
if (isWorkflowInstanceNotFoundError(error)) {
|
|
1458
|
-
return null;
|
|
1459
|
-
}
|
|
1460
|
-
throw error;
|
|
1461
|
-
}
|
|
1462
|
-
}
|
|
1463
|
-
|
|
1464
|
-
function workflowPoolStatusIsReady(statusName: string): boolean {
|
|
1465
|
-
// This is only a liveness guard. Readiness itself comes from the pooled
|
|
1466
|
-
// Workflow calling /pool-ready after waitForEvent("play_start") has been
|
|
1467
|
-
// created, because Cloudflare may report an armed wait as "running".
|
|
1468
|
-
return statusName === 'running' || statusName === 'waiting';
|
|
1469
|
-
}
|
|
1470
|
-
|
|
1471
|
-
async function waitForWorkflowPoolReadySignal(input: {
|
|
1472
|
-
env: CoordinatorEnv;
|
|
1473
|
-
instance: WorkflowInstance;
|
|
1474
|
-
poolId: string;
|
|
1475
|
-
}): Promise<{
|
|
1476
|
-
ready: boolean;
|
|
1477
|
-
status: string;
|
|
1478
|
-
ms: number;
|
|
1479
|
-
polls: number;
|
|
1480
|
-
}> {
|
|
1481
|
-
const startedAt = Date.now();
|
|
1482
|
-
let lastStatusName = 'unknown';
|
|
1483
|
-
let polls = 0;
|
|
1484
|
-
while (Date.now() - startedAt < WORKFLOW_POOL_READY_TIMEOUT_MS) {
|
|
1485
|
-
polls += 1;
|
|
1486
|
-
const [entry, status] = await Promise.all([
|
|
1487
|
-
listWorkflowPoolEntries(input.env)
|
|
1488
|
-
.then((entries) =>
|
|
1489
|
-
entries.find((candidate) => candidate.id === input.poolId),
|
|
1490
|
-
)
|
|
1491
|
-
.catch(() => undefined),
|
|
1492
|
-
input.instance.status().catch(() => null),
|
|
1493
|
-
]);
|
|
1494
|
-
const statusName = workflowStatusName(status);
|
|
1495
|
-
lastStatusName = statusName;
|
|
1496
|
-
if (entry?.state === 'ready' && entry.readyAt !== null) {
|
|
1497
|
-
return {
|
|
1498
|
-
ready: true,
|
|
1499
|
-
status: statusName,
|
|
1500
|
-
ms: Date.now() - startedAt,
|
|
1501
|
-
polls,
|
|
1502
|
-
};
|
|
1503
|
-
}
|
|
1504
|
-
if (
|
|
1505
|
-
statusName === 'complete' ||
|
|
1506
|
-
statusName === 'errored' ||
|
|
1507
|
-
statusName === 'terminated' ||
|
|
1508
|
-
statusName === 'unknown'
|
|
1509
|
-
) {
|
|
1510
|
-
return {
|
|
1511
|
-
ready: false,
|
|
1512
|
-
status: statusName,
|
|
1513
|
-
ms: Date.now() - startedAt,
|
|
1514
|
-
polls,
|
|
1515
|
-
};
|
|
1516
|
-
}
|
|
1517
|
-
await sleep(WORKFLOW_POOL_READY_POLL_MS);
|
|
1518
|
-
}
|
|
1519
|
-
return {
|
|
1520
|
-
ready: false,
|
|
1521
|
-
status: lastStatusName,
|
|
1522
|
-
ms: Date.now() - startedAt,
|
|
1523
|
-
polls,
|
|
1524
|
-
};
|
|
1525
|
-
}
|
|
1526
|
-
|
|
1527
|
-
async function refillWorkflowPoolOnce(
|
|
1528
|
-
env: CoordinatorEnv,
|
|
1529
|
-
): Promise<Omit<WorkflowPoolRefillResult, 'waitedMs' | 'waitIterations'>> {
|
|
1530
|
-
if (!workflowPoolEnabled()) {
|
|
1531
|
-
return {
|
|
1532
|
-
available: 0,
|
|
1533
|
-
warming: 0,
|
|
1534
|
-
target: 0,
|
|
1535
|
-
created: 0,
|
|
1536
|
-
promoted: 0,
|
|
1537
|
-
removed: 0,
|
|
1538
|
-
};
|
|
1539
|
-
}
|
|
1540
|
-
const target = workflowPoolTargetSize();
|
|
1541
|
-
const entries = await listWorkflowPoolEntries(env);
|
|
1542
|
-
const warmingEntries = entries.filter((entry) => entry.readyAt === null);
|
|
1543
|
-
const promotedIds: string[] = [];
|
|
1544
|
-
const removedIds: string[] = [];
|
|
1545
|
-
for (const entry of warmingEntries) {
|
|
1546
|
-
const instance = await getWorkflowPoolInstance(env, entry.id);
|
|
1547
|
-
if (!instance) {
|
|
1548
|
-
removedIds.push(entry.id);
|
|
1549
|
-
continue;
|
|
1550
|
-
}
|
|
1551
|
-
try {
|
|
1552
|
-
if (entry.state === 'ready' && entry.readyAt !== null) {
|
|
1553
|
-
promotedIds.push(entry.id);
|
|
1554
|
-
continue;
|
|
1555
|
-
}
|
|
1556
|
-
const status = await instance.status().catch(() => null);
|
|
1557
|
-
const statusName = workflowStatusName(status);
|
|
1558
|
-
if (
|
|
1559
|
-
statusName === 'complete' ||
|
|
1560
|
-
statusName === 'errored' ||
|
|
1561
|
-
statusName === 'terminated' ||
|
|
1562
|
-
statusName === 'unknown'
|
|
1563
|
-
) {
|
|
1564
|
-
removedIds.push(entry.id);
|
|
1565
|
-
}
|
|
1566
|
-
} finally {
|
|
1567
|
-
disposeRpcStub(instance);
|
|
1568
|
-
}
|
|
1569
|
-
}
|
|
1570
|
-
await Promise.all([
|
|
1571
|
-
promoteWorkflowPoolIds(env, promotedIds),
|
|
1572
|
-
deleteWorkflowPoolIds(env, removedIds),
|
|
1573
|
-
]);
|
|
1574
|
-
const counts = await workflowPoolCount(env);
|
|
1575
|
-
const totalTracked = counts.available + counts.warming;
|
|
1576
|
-
const needed = Math.max(0, target - totalTracked);
|
|
1577
|
-
if (needed === 0) {
|
|
1578
|
-
return {
|
|
1579
|
-
available: counts.available,
|
|
1580
|
-
warming: counts.warming,
|
|
1581
|
-
target,
|
|
1582
|
-
created: 0,
|
|
1583
|
-
promoted: promotedIds.length,
|
|
1584
|
-
removed: removedIds.length,
|
|
1585
|
-
};
|
|
1586
|
-
}
|
|
1587
|
-
const created = await Promise.all(
|
|
1588
|
-
Array.from({ length: needed }, async () => {
|
|
1589
|
-
const poolId = `pool-v2-${Date.now().toString(36)}-${crypto.randomUUID().slice(0, 12)}`;
|
|
1590
|
-
await addWorkflowPoolIds(env, [poolId], { ready: false });
|
|
1591
|
-
const instance = await env.PLAY_WORKFLOW.create({
|
|
1592
|
-
id: poolId,
|
|
1593
|
-
params: {
|
|
1594
|
-
__deeplinePooledWorkflow: true,
|
|
1595
|
-
poolId,
|
|
1596
|
-
createdAt: Date.now(),
|
|
1597
|
-
} satisfies PooledWorkflowBootstrapPayload,
|
|
1598
|
-
});
|
|
1599
|
-
try {
|
|
1600
|
-
const readiness = await waitForWorkflowPoolReadySignal({
|
|
1601
|
-
env,
|
|
1602
|
-
instance,
|
|
1603
|
-
poolId,
|
|
1604
|
-
});
|
|
1605
|
-
recordCoordinatorPerfTrace({
|
|
1606
|
-
runId: poolId,
|
|
1607
|
-
phase: 'coordinator.workflow_pool_ready',
|
|
1608
|
-
ms: readiness.ms,
|
|
1609
|
-
graphHash: 'workflow-pool',
|
|
1610
|
-
extra: {
|
|
1611
|
-
ready: readiness.ready,
|
|
1612
|
-
status: readiness.status,
|
|
1613
|
-
polls: readiness.polls,
|
|
1614
|
-
},
|
|
1615
|
-
});
|
|
1616
|
-
if (readiness.ready) {
|
|
1617
|
-
return { id: poolId, state: 'ready' as const };
|
|
1618
|
-
}
|
|
1619
|
-
if (
|
|
1620
|
-
readiness.status === 'complete' ||
|
|
1621
|
-
readiness.status === 'errored' ||
|
|
1622
|
-
readiness.status === 'terminated' ||
|
|
1623
|
-
readiness.status === 'unknown'
|
|
1624
|
-
) {
|
|
1625
|
-
await instance.terminate().catch(() => undefined);
|
|
1626
|
-
return { id: poolId, state: 'removed' as const };
|
|
1627
|
-
}
|
|
1628
|
-
return { id: poolId, state: 'warming' as const };
|
|
1629
|
-
} finally {
|
|
1630
|
-
disposeRpcStub(instance);
|
|
1631
|
-
}
|
|
1632
|
-
}),
|
|
1633
|
-
);
|
|
1634
|
-
const readyCreatedIds = created
|
|
1635
|
-
.filter((entry) => entry.state === 'ready')
|
|
1636
|
-
.map((entry) => entry.id);
|
|
1637
|
-
const warmingCreatedIds = created
|
|
1638
|
-
.filter((entry) => entry.state === 'warming')
|
|
1639
|
-
.map((entry) => entry.id);
|
|
1640
|
-
removedIds.push(
|
|
1641
|
-
...created
|
|
1642
|
-
.filter((entry) => entry.state === 'removed')
|
|
1643
|
-
.map((entry) => entry.id),
|
|
1644
|
-
);
|
|
1645
|
-
await Promise.all([
|
|
1646
|
-
addWorkflowPoolIds(env, readyCreatedIds, { ready: true }),
|
|
1647
|
-
addWorkflowPoolIds(env, warmingCreatedIds, { ready: false }),
|
|
1648
|
-
]);
|
|
1649
|
-
const finalCounts = await workflowPoolCount(env);
|
|
1650
|
-
return {
|
|
1651
|
-
available: finalCounts.available,
|
|
1652
|
-
warming: finalCounts.warming,
|
|
1653
|
-
target,
|
|
1654
|
-
created: readyCreatedIds.length + warmingCreatedIds.length,
|
|
1655
|
-
promoted: promotedIds.length,
|
|
1656
|
-
removed: removedIds.length,
|
|
1657
|
-
};
|
|
1658
|
-
}
|
|
1659
|
-
|
|
1660
|
-
async function refillWorkflowPool(
|
|
1661
|
-
env: CoordinatorEnv,
|
|
1662
|
-
options?: {
|
|
1663
|
-
minAvailable?: number;
|
|
1664
|
-
waitReady?: boolean;
|
|
1665
|
-
waitTimeoutMs?: number;
|
|
1666
|
-
},
|
|
1667
|
-
): Promise<WorkflowPoolRefillResult> {
|
|
1668
|
-
const startedAt = Date.now();
|
|
1669
|
-
const minAvailable = Math.max(1, Math.floor(options?.minAvailable ?? 1));
|
|
1670
|
-
const waitReady = options?.waitReady === true;
|
|
1671
|
-
const waitTimeoutMs =
|
|
1672
|
-
typeof options?.waitTimeoutMs === 'number' &&
|
|
1673
|
-
Number.isFinite(options.waitTimeoutMs) &&
|
|
1674
|
-
options.waitTimeoutMs > 0
|
|
1675
|
-
? Math.min(Math.floor(options.waitTimeoutMs), 15_000)
|
|
1676
|
-
: 4_000;
|
|
1677
|
-
let totals = await refillWorkflowPoolOnce(env);
|
|
1678
|
-
let iterations = 0;
|
|
1679
|
-
const readyWaitStartedAt = Date.now();
|
|
1680
|
-
|
|
1681
|
-
while (
|
|
1682
|
-
workflowPoolEnabled() &&
|
|
1683
|
-
waitReady &&
|
|
1684
|
-
totals.available < minAvailable &&
|
|
1685
|
-
Date.now() - readyWaitStartedAt < waitTimeoutMs
|
|
1686
|
-
) {
|
|
1687
|
-
iterations += 1;
|
|
1688
|
-
await sleep(WORKFLOW_POOL_READY_POLL_MS);
|
|
1689
|
-
const next = await refillWorkflowPoolOnce(env);
|
|
1690
|
-
totals = {
|
|
1691
|
-
...next,
|
|
1692
|
-
created: totals.created + next.created,
|
|
1693
|
-
promoted: totals.promoted + next.promoted,
|
|
1694
|
-
removed: totals.removed + next.removed,
|
|
1695
|
-
};
|
|
1696
|
-
}
|
|
1697
|
-
|
|
1698
|
-
const result: WorkflowPoolRefillResult = {
|
|
1699
|
-
...totals,
|
|
1700
|
-
waitedMs: Date.now() - startedAt,
|
|
1701
|
-
waitIterations: iterations,
|
|
1702
|
-
};
|
|
1703
|
-
recordCoordinatorPerfTrace({
|
|
1704
|
-
runId: 'workflow-pool',
|
|
1705
|
-
phase: 'coordinator.workflow_pool_refill',
|
|
1706
|
-
ms: result.waitedMs,
|
|
1707
|
-
graphHash: 'workflow-pool',
|
|
1708
|
-
extra: result,
|
|
1709
|
-
});
|
|
1710
|
-
return result;
|
|
1711
|
-
}
|
|
1712
|
-
|
|
1713
|
-
async function submitViaPooledWorkflow(input: {
|
|
1714
|
-
env: CoordinatorEnv;
|
|
1715
|
-
params: PlayWorkflowParams;
|
|
1716
|
-
recordSubmitTiming: (timing: CoordinatorTiming) => void;
|
|
1717
|
-
}): Promise<WorkflowInstance | null> {
|
|
1718
|
-
if (!workflowPoolEnabled()) {
|
|
1719
|
-
return null;
|
|
1720
|
-
}
|
|
1721
|
-
const leaseStartedAt = Date.now();
|
|
1722
|
-
let leaseError: string | null = null;
|
|
1723
|
-
const pooledInstanceId = await leaseWorkflowPoolId(
|
|
1724
|
-
input.env,
|
|
1725
|
-
input.params.runId,
|
|
1726
|
-
).catch((error) => {
|
|
1727
|
-
leaseError = error instanceof Error ? error.message : String(error);
|
|
1728
|
-
return null;
|
|
1729
|
-
});
|
|
1730
|
-
const missCounts = pooledInstanceId
|
|
1731
|
-
? null
|
|
1732
|
-
: await workflowPoolCount(input.env).catch(() => null);
|
|
1733
|
-
input.recordSubmitTiming({
|
|
1734
|
-
phase: 'coordinator.workflow_pool_lease',
|
|
1735
|
-
ms: Date.now() - leaseStartedAt,
|
|
1736
|
-
graphHash: input.params.graphHash ?? null,
|
|
1737
|
-
extra: {
|
|
1738
|
-
pooled: Boolean(pooledInstanceId),
|
|
1739
|
-
...(leaseError ? { error: leaseError } : {}),
|
|
1740
|
-
...(missCounts
|
|
1741
|
-
? {
|
|
1742
|
-
availableAfterMiss: missCounts.available,
|
|
1743
|
-
warmingAfterMiss: missCounts.warming,
|
|
1744
|
-
}
|
|
1745
|
-
: {}),
|
|
1746
|
-
},
|
|
1747
|
-
});
|
|
1748
|
-
|
|
1749
|
-
if (!pooledInstanceId) {
|
|
1750
|
-
// A pool miss must not block the user path. Refilling is handled by the
|
|
1751
|
-
// caller's waitUntil after submit, so fall through to cold create now.
|
|
1752
|
-
const counts =
|
|
1753
|
-
missCounts ?? (await workflowPoolCount(input.env).catch(() => null));
|
|
1754
|
-
input.recordSubmitTiming({
|
|
1755
|
-
phase: 'coordinator.workflow_pool_refill_on_miss',
|
|
1756
|
-
ms: 0,
|
|
1757
|
-
graphHash: input.params.graphHash ?? null,
|
|
1758
|
-
extra: {
|
|
1759
|
-
skipped: true,
|
|
1760
|
-
reason: 'pool_miss_does_not_block_submit',
|
|
1761
|
-
...(counts
|
|
1762
|
-
? {
|
|
1763
|
-
available: counts.available,
|
|
1764
|
-
warming: counts.warming,
|
|
1765
|
-
waitedMs: 0,
|
|
1766
|
-
waitIterations: 0,
|
|
1767
|
-
}
|
|
1768
|
-
: {}),
|
|
1769
|
-
},
|
|
1770
|
-
});
|
|
1771
|
-
}
|
|
1772
|
-
|
|
1773
|
-
if (!pooledInstanceId) {
|
|
1774
|
-
return null;
|
|
1775
|
-
}
|
|
1776
|
-
|
|
1777
|
-
const instance = await getWorkflowPoolInstance(input.env, pooledInstanceId);
|
|
1778
|
-
if (!instance) {
|
|
1779
|
-
await blockWorkflowPoolRun({
|
|
1780
|
-
env: input.env,
|
|
1781
|
-
runId: input.params.runId,
|
|
1782
|
-
instanceId: pooledInstanceId,
|
|
1783
|
-
}).catch(() => undefined);
|
|
1784
|
-
input.recordSubmitTiming({
|
|
1785
|
-
phase: 'coordinator.workflow_pool_ready_check',
|
|
1786
|
-
ms: Date.now() - leaseStartedAt,
|
|
1787
|
-
graphHash: input.params.graphHash ?? null,
|
|
1788
|
-
extra: { instanceId: pooledInstanceId, status: 'missing' },
|
|
1789
|
-
});
|
|
1790
|
-
return null;
|
|
1791
|
-
}
|
|
1792
|
-
const readyCheckStartedAt = Date.now();
|
|
1793
|
-
const status = await instance.status().catch(() => null);
|
|
1794
|
-
const statusName = workflowStatusName(status);
|
|
1795
|
-
input.recordSubmitTiming({
|
|
1796
|
-
phase: 'coordinator.workflow_pool_ready_check',
|
|
1797
|
-
ms: Date.now() - readyCheckStartedAt,
|
|
1798
|
-
graphHash: input.params.graphHash ?? null,
|
|
1799
|
-
extra: { instanceId: pooledInstanceId, status: statusName },
|
|
1800
|
-
});
|
|
1801
|
-
if (!workflowPoolStatusIsReady(statusName)) {
|
|
1802
|
-
await blockWorkflowPoolRun({
|
|
1803
|
-
env: input.env,
|
|
1804
|
-
runId: input.params.runId,
|
|
1805
|
-
instanceId: pooledInstanceId,
|
|
1806
|
-
}).catch(() => undefined);
|
|
1807
|
-
await instance.terminate().catch(() => undefined);
|
|
1808
|
-
disposeRpcStub(instance);
|
|
1374
|
+
});
|
|
1809
1375
|
return null;
|
|
1376
|
+
});
|
|
1377
|
+
if (!claim?.claimed || !claim.params) {
|
|
1378
|
+
return { retried: false, result: null };
|
|
1810
1379
|
}
|
|
1811
|
-
const
|
|
1380
|
+
const retryInstanceId = workflowRetryInstanceId(input.runId, claim.attempts);
|
|
1381
|
+
const retryStartedAt = Date.now();
|
|
1382
|
+
let retryInstance: WorkflowInstance | null = null;
|
|
1812
1383
|
try {
|
|
1813
|
-
await
|
|
1814
|
-
|
|
1815
|
-
|
|
1384
|
+
retryInstance = await createDynamicWorkflowInstance({
|
|
1385
|
+
env: input.env,
|
|
1386
|
+
id: retryInstanceId,
|
|
1387
|
+
params: claim.params,
|
|
1816
1388
|
});
|
|
1817
|
-
|
|
1818
|
-
await blockWorkflowPoolRun({
|
|
1389
|
+
await recordWorkflowInstanceId({
|
|
1819
1390
|
env: input.env,
|
|
1820
|
-
runId: input.
|
|
1821
|
-
instanceId:
|
|
1822
|
-
}).catch(() => undefined);
|
|
1823
|
-
disposeRpcStub(instance);
|
|
1824
|
-
console.warn('[coordinator.workflow_pool] sendEvent failed; falling back', {
|
|
1825
|
-
runId: input.params.runId,
|
|
1826
|
-
pooledInstanceId,
|
|
1827
|
-
error: error instanceof Error ? error.message : String(error),
|
|
1391
|
+
runId: input.runId,
|
|
1392
|
+
instanceId: retryInstance.id,
|
|
1828
1393
|
});
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
extra: { instanceId: pooledInstanceId },
|
|
1836
|
-
});
|
|
1837
|
-
const ack = await waitForWorkflowPoolStartAck({
|
|
1838
|
-
env: input.env,
|
|
1839
|
-
runId: input.params.runId,
|
|
1840
|
-
instanceId: pooledInstanceId,
|
|
1841
|
-
timeoutMs: WORKFLOW_POOL_START_ACK_TIMEOUT_MS,
|
|
1842
|
-
});
|
|
1843
|
-
if (ack.acknowledged) {
|
|
1844
|
-
input.recordSubmitTiming({
|
|
1845
|
-
phase: 'coordinator.workflow_pool_start_ack',
|
|
1846
|
-
ms: ack.ms,
|
|
1847
|
-
graphHash: input.params.graphHash ?? null,
|
|
1394
|
+
input.ctx?.waitUntil(input.oldInstance.terminate().catch(() => undefined));
|
|
1395
|
+
recordCoordinatorPerfTraceBuffered(input.env, input.ctx, {
|
|
1396
|
+
runId: input.runId,
|
|
1397
|
+
phase: 'coordinator.platform_deploy_retry',
|
|
1398
|
+
ms: Date.now() - retryStartedAt,
|
|
1399
|
+
graphHash: claim.params.graphHash ?? null,
|
|
1848
1400
|
extra: {
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
startedAt: ack.startedAt,
|
|
1401
|
+
retryAttempt: claim.attempts,
|
|
1402
|
+
retryInstanceId: retryInstance.id,
|
|
1403
|
+
reason: decision.reason,
|
|
1853
1404
|
},
|
|
1854
1405
|
});
|
|
1855
|
-
return
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
startedAt: ack.startedAt,
|
|
1873
|
-
mappedInstanceId: ack.mappedInstanceId,
|
|
1874
|
-
blocked: block.blocked,
|
|
1875
|
-
blockMs: Date.now() - blockStartedAt,
|
|
1876
|
-
},
|
|
1877
|
-
});
|
|
1878
|
-
if (block.started) {
|
|
1879
|
-
return instance;
|
|
1880
|
-
}
|
|
1881
|
-
await instance.terminate().catch(() => undefined);
|
|
1882
|
-
disposeRpcStub(instance);
|
|
1883
|
-
input.recordSubmitTiming({
|
|
1884
|
-
phase: 'coordinator.workflow_pool_fallback',
|
|
1885
|
-
ms: Date.now() - sendStartedAt,
|
|
1886
|
-
graphHash: input.params.graphHash ?? null,
|
|
1887
|
-
extra: {
|
|
1888
|
-
reason: 'start_ack_timeout',
|
|
1889
|
-
instanceId: pooledInstanceId,
|
|
1890
|
-
ackTimeoutMs: WORKFLOW_POOL_START_ACK_TIMEOUT_MS,
|
|
1891
|
-
},
|
|
1892
|
-
});
|
|
1893
|
-
return null;
|
|
1894
|
-
}
|
|
1895
|
-
|
|
1896
|
-
function readWorkflowPayload(event: unknown): Record<string, unknown> | null {
|
|
1897
|
-
if (!isRecord(event)) return null;
|
|
1898
|
-
const payload = event.payload;
|
|
1899
|
-
if (!isRecord(payload)) return null;
|
|
1900
|
-
return isRecord(payload.params) ? payload.params : payload;
|
|
1901
|
-
}
|
|
1902
|
-
|
|
1903
|
-
async function markWorkflowRuntimeFailure(input: {
|
|
1904
|
-
env: CoordinatorEnv;
|
|
1905
|
-
event: unknown;
|
|
1906
|
-
error: unknown;
|
|
1907
|
-
}): Promise<void> {
|
|
1908
|
-
const payload = readWorkflowPayload(input.event);
|
|
1909
|
-
if (!payload) return;
|
|
1910
|
-
const runId = typeof payload.runId === 'string' ? payload.runId : null;
|
|
1911
|
-
const baseUrl = typeof payload.baseUrl === 'string' ? payload.baseUrl : null;
|
|
1912
|
-
const executorToken =
|
|
1913
|
-
typeof payload.executorToken === 'string' ? payload.executorToken : null;
|
|
1914
|
-
if (!runId || !baseUrl || !executorToken) return;
|
|
1915
|
-
const errorName =
|
|
1916
|
-
input.error instanceof Error && input.error.name
|
|
1917
|
-
? input.error.name
|
|
1918
|
-
: 'Error';
|
|
1919
|
-
const errorMessage =
|
|
1920
|
-
input.error instanceof Error ? input.error.message : String(input.error);
|
|
1921
|
-
const errorStack =
|
|
1922
|
-
input.error instanceof Error && typeof input.error.stack === 'string'
|
|
1923
|
-
? input.error.stack.split('\n').slice(0, 12).join('\n')
|
|
1924
|
-
: null;
|
|
1925
|
-
const headers = new Headers({
|
|
1926
|
-
authorization: `Bearer ${executorToken}`,
|
|
1927
|
-
'content-type': 'application/json',
|
|
1928
|
-
});
|
|
1929
|
-
const bypass = input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
|
|
1930
|
-
if (bypass) headers.set('x-vercel-protection-bypass', bypass);
|
|
1931
|
-
const body = JSON.stringify({
|
|
1932
|
-
action: 'append_run_events',
|
|
1933
|
-
playId: runId,
|
|
1934
|
-
events: [
|
|
1935
|
-
{
|
|
1936
|
-
type: 'run.failed',
|
|
1937
|
-
runId,
|
|
1938
|
-
source: 'coordinator',
|
|
1939
|
-
occurredAt: Date.now(),
|
|
1940
|
-
error: `DynamicWorkflow runner failed: ${errorName}: ${errorMessage}${
|
|
1941
|
-
errorStack ? `\n${errorStack}` : ''
|
|
1942
|
-
}`,
|
|
1943
|
-
} satisfies PlayRunLedgerEvent,
|
|
1944
|
-
],
|
|
1945
|
-
});
|
|
1946
|
-
const url = `${baseUrl.replace(/\/$/, '')}/api/v2/plays/internal/runtime`;
|
|
1947
|
-
const backoffMs = [200, 500, 1500];
|
|
1948
|
-
let lastError: unknown = null;
|
|
1949
|
-
for (let attempt = 0; attempt <= backoffMs.length; attempt += 1) {
|
|
1950
|
-
try {
|
|
1951
|
-
const response = await fetch(url, { method: 'POST', headers, body });
|
|
1952
|
-
if (response.ok) return;
|
|
1953
|
-
lastError = new Error(
|
|
1954
|
-
`runtime API responded ${response.status}: ${(await response.text().catch(() => '')).slice(0, 400)}`,
|
|
1955
|
-
);
|
|
1956
|
-
if (
|
|
1957
|
-
response.status >= 400 &&
|
|
1958
|
-
response.status < 500 &&
|
|
1959
|
-
response.status !== 408 &&
|
|
1960
|
-
response.status !== 429
|
|
1961
|
-
) {
|
|
1962
|
-
break;
|
|
1963
|
-
}
|
|
1964
|
-
} catch (error) {
|
|
1965
|
-
lastError = error;
|
|
1966
|
-
}
|
|
1967
|
-
if (attempt < backoffMs.length) {
|
|
1968
|
-
await new Promise((resolve) => setTimeout(resolve, backoffMs[attempt]));
|
|
1969
|
-
}
|
|
1406
|
+
return {
|
|
1407
|
+
retried: true,
|
|
1408
|
+
result: {
|
|
1409
|
+
runId: input.runId,
|
|
1410
|
+
playName: claim.params.playName,
|
|
1411
|
+
status: 'running',
|
|
1412
|
+
result: null,
|
|
1413
|
+
error: null,
|
|
1414
|
+
retry: {
|
|
1415
|
+
reason: decision.reason,
|
|
1416
|
+
attempt: claim.attempts,
|
|
1417
|
+
message: decision.message,
|
|
1418
|
+
},
|
|
1419
|
+
},
|
|
1420
|
+
};
|
|
1421
|
+
} finally {
|
|
1422
|
+
disposeRpcStub(retryInstance);
|
|
1970
1423
|
}
|
|
1971
|
-
console.error('[coordinator] failed to mark workflow runtime failure', {
|
|
1972
|
-
runId,
|
|
1973
|
-
message: lastError instanceof Error ? lastError.message : String(lastError),
|
|
1974
|
-
});
|
|
1975
1424
|
}
|
|
1976
1425
|
|
|
1977
|
-
type StoredPlayArtifactPayload = {
|
|
1978
|
-
artifact?: {
|
|
1979
|
-
bundledCode?: string;
|
|
1980
|
-
artifactKind?: string;
|
|
1981
|
-
};
|
|
1982
|
-
};
|
|
1983
|
-
|
|
1984
|
-
const DYNAMIC_WORKER_COMPATIBILITY_DATE = '2026-05-01';
|
|
1985
|
-
|
|
1986
1426
|
async function mintChildWorkflowExecutorToken(input: {
|
|
1987
1427
|
env: CoordinatorEnv;
|
|
1988
|
-
baseUrl: string;
|
|
1989
1428
|
parentExecutorToken: string;
|
|
1990
1429
|
parentRunId: string;
|
|
1991
1430
|
parentPlayName: string;
|
|
@@ -1993,37 +1432,27 @@ async function mintChildWorkflowExecutorToken(input: {
|
|
|
1993
1432
|
childPlayName: string;
|
|
1994
1433
|
maxCreditsPerRun?: number | null;
|
|
1995
1434
|
}): Promise<string> {
|
|
1996
|
-
const
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
'
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
if (input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim()) {
|
|
2003
|
-
headers.set(
|
|
2004
|
-
'x-vercel-protection-bypass',
|
|
2005
|
-
input.env.VERCEL_PROTECTION_BYPASS_TOKEN.trim(),
|
|
2006
|
-
);
|
|
2007
|
-
}
|
|
2008
|
-
const response = await fetch(url, {
|
|
2009
|
-
method: 'POST',
|
|
2010
|
-
headers,
|
|
2011
|
-
body: JSON.stringify({
|
|
1435
|
+
const response = await input.env.HARNESS.runtimeApiCall({
|
|
1436
|
+
executorToken: input.parentExecutorToken,
|
|
1437
|
+
path: '/api/v2/plays/internal/child-executor-token',
|
|
1438
|
+
headers: { 'x-deepline-request-id': crypto.randomUUID() },
|
|
1439
|
+
timeoutMs: 15_000,
|
|
1440
|
+
body: {
|
|
2012
1441
|
parentRunId: input.parentRunId,
|
|
2013
1442
|
parentPlayName: input.parentPlayName,
|
|
2014
1443
|
childRunId: input.childRunId,
|
|
2015
1444
|
childPlayName: input.childPlayName,
|
|
2016
1445
|
maxCreditsPerRun: input.maxCreditsPerRun ?? null,
|
|
2017
|
-
}
|
|
1446
|
+
},
|
|
2018
1447
|
});
|
|
2019
|
-
const text =
|
|
1448
|
+
const text = response.body;
|
|
2020
1449
|
let parsed: Record<string, unknown> = {};
|
|
2021
1450
|
try {
|
|
2022
1451
|
parsed = text ? (JSON.parse(text) as Record<string, unknown>) : {};
|
|
2023
1452
|
} catch {
|
|
2024
1453
|
parsed = {};
|
|
2025
1454
|
}
|
|
2026
|
-
if (
|
|
1455
|
+
if (response.status < 200 || response.status >= 300) {
|
|
2027
1456
|
const error = isRecord(parsed.error) ? parsed.error : null;
|
|
2028
1457
|
const message =
|
|
2029
1458
|
(typeof error?.message === 'string' && error.message.trim()) ||
|
|
@@ -2106,7 +1535,6 @@ async function reencryptChildDbSessionForExecutor(input: {
|
|
|
2106
1535
|
|
|
2107
1536
|
async function createChildRuntimeDbSession(input: {
|
|
2108
1537
|
env: CoordinatorEnv;
|
|
2109
|
-
baseUrl: string;
|
|
2110
1538
|
childExecutorToken: string;
|
|
2111
1539
|
childPlayName: string;
|
|
2112
1540
|
requirement: RuntimeDbSessionRequirement;
|
|
@@ -2114,22 +1542,12 @@ async function createChildRuntimeDbSession(input: {
|
|
|
2114
1542
|
orgId: string;
|
|
2115
1543
|
}): Promise<CreateDbSessionResponse> {
|
|
2116
1544
|
const decryptionKey = await generateDbSessionPostgresUrlDecryptionKey();
|
|
2117
|
-
const
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
'
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
if (input.env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim()) {
|
|
2124
|
-
headers.set(
|
|
2125
|
-
'x-vercel-protection-bypass',
|
|
2126
|
-
input.env.VERCEL_PROTECTION_BYPASS_TOKEN.trim(),
|
|
2127
|
-
);
|
|
2128
|
-
}
|
|
2129
|
-
const response = await fetch(url, {
|
|
2130
|
-
method: 'POST',
|
|
2131
|
-
headers,
|
|
2132
|
-
body: JSON.stringify({
|
|
1545
|
+
const response = await input.env.HARNESS.runtimeApiCall({
|
|
1546
|
+
executorToken: input.childExecutorToken,
|
|
1547
|
+
path: '/api/v2/plays/internal/runtime',
|
|
1548
|
+
headers: { 'x-deepline-request-id': crypto.randomUUID() },
|
|
1549
|
+
timeoutMs: 15_000,
|
|
1550
|
+
body: {
|
|
2133
1551
|
action: 'create_db_session',
|
|
2134
1552
|
playName: input.childPlayName,
|
|
2135
1553
|
target: {
|
|
@@ -2142,16 +1560,16 @@ async function createChildRuntimeDbSession(input: {
|
|
|
2142
1560
|
ttlSeconds: DB_SESSION_DEFAULT_TTL_SECONDS,
|
|
2143
1561
|
userEmail: input.userEmail,
|
|
2144
1562
|
postgresUrlEncryption: decryptionKey.request,
|
|
2145
|
-
}
|
|
1563
|
+
},
|
|
2146
1564
|
});
|
|
2147
|
-
const text =
|
|
1565
|
+
const text = response.body;
|
|
2148
1566
|
let parsed: unknown = {};
|
|
2149
1567
|
try {
|
|
2150
1568
|
parsed = text ? JSON.parse(text) : {};
|
|
2151
1569
|
} catch {
|
|
2152
1570
|
parsed = {};
|
|
2153
1571
|
}
|
|
2154
|
-
if (
|
|
1572
|
+
if (response.status < 200 || response.status >= 300) {
|
|
2155
1573
|
const error =
|
|
2156
1574
|
isRecord(parsed) && isRecord(parsed.error) ? parsed.error : {};
|
|
2157
1575
|
const message =
|
|
@@ -2172,52 +1590,224 @@ async function createChildRuntimeDbSession(input: {
|
|
|
2172
1590
|
orgId: input.orgId,
|
|
2173
1591
|
childPlayName: input.childPlayName,
|
|
2174
1592
|
});
|
|
2175
|
-
return await reencryptChildDbSessionForExecutor({
|
|
2176
|
-
session,
|
|
2177
|
-
decryptionKey,
|
|
2178
|
-
childExecutorToken: input.childExecutorToken,
|
|
1593
|
+
return await reencryptChildDbSessionForExecutor({
|
|
1594
|
+
session,
|
|
1595
|
+
decryptionKey,
|
|
1596
|
+
childExecutorToken: input.childExecutorToken,
|
|
1597
|
+
});
|
|
1598
|
+
}
|
|
1599
|
+
|
|
1600
|
+
/**
 * Creates one runtime DB session per requirement planned from the child
 * manifest's static pipeline, then records how long the preload took.
 *
 * Requirements are obtained from `planRuntimeDbSessionRequirements`, which is
 * given the manifest's `staticPipeline` (or `null` when it is absent). All
 * sessions are requested concurrently through `Promise.all`, so the first
 * rejected `createChildRuntimeDbSession` call rejects the whole preload.
 *
 * @param input.env - Coordinator bindings forwarded to session creation.
 * @param input.childExecutorToken - Token used when creating each session.
 * @param input.childRunId - Run id the perf trace is attributed to.
 * @param input.childPlayName - Play name forwarded to session creation.
 * @param input.manifest - Supplies `staticPipeline` and `graphHash`.
 * @param input.orgId - Forwarded to session creation.
 * @param input.userEmail - Forwarded to session creation.
 * @returns One entry per requirement, in requirement order, each carrying the
 *   requirement's table coordinates, operations, optional limits and session.
 */
async function preloadChildRuntimeDbSessions(input: {
  env: CoordinatorEnv;
  childExecutorToken: string;
  childRunId: string;
  childPlayName: string;
  manifest: PlayRuntimeManifest;
  orgId: string;
  userEmail: string;
}): Promise<PreloadedRuntimeDbSession[]> {
  const preloadStart = Date.now();
  const pipeline = input.manifest.staticPipeline ?? null;
  const planned = planRuntimeDbSessionRequirements(pipeline);

  // Kick off every session request up front; they are awaited together below.
  const pending = planned.map(async (requirement) => {
    const { tableNamespace, logicalTable, operations, limits } = requirement;
    const session = await createChildRuntimeDbSession({
      env: input.env,
      childExecutorToken: input.childExecutorToken,
      childPlayName: input.childPlayName,
      requirement,
      userEmail: input.userEmail,
      orgId: input.orgId,
    });
    return {
      tableNamespace,
      logicalTable,
      operations,
      // `limits` is only present on the entry when the requirement has one.
      ...(limits ? { limits } : {}),
      session,
    };
  });
  const preloaded = await Promise.all(pending);

  recordCoordinatorPerfTrace({
    runId: input.childRunId,
    phase: 'coordinator.child_db_session_preload',
    ms: Date.now() - preloadStart,
    graphHash: input.manifest.graphHash,
    extra: { sessions: preloaded.length },
  });
  return preloaded;
}
|
|
1638
|
+
|
|
1639
|
+
/**
 * Posts a `start_inline_child_run` action to the internal runtime API using
 * the child's executor token.
 *
 * The workflow family key falls back from the governance root run id to the
 * governance parent run id and finally to the child run id itself.
 * `maxCreditsPerRun` is only sent when the manifest holds a number for it.
 *
 * @param input.env - Coordinator bindings; `HARNESS.runtimeApiCall` is used.
 * @param input.childExecutorToken - Token the call is made with.
 * @param input.childRunId - Run id being registered.
 * @param input.childPlayName - Play name being registered.
 * @param input.manifest - Source of artifact keys, hashes and static pipeline.
 * @param input.governance - Source of `rootRunId` / `parentRunId`.
 * @throws Error when the response status is below 200 or at/above 300; the
 *   message carries the status and the first 800 characters of the body.
 */
async function registerInlineChildRunWithRuntime(input: {
  env: CoordinatorEnv;
  childExecutorToken: string;
  childRunId: string;
  childPlayName: string;
  manifest: PlayRuntimeManifest;
  governance: PlayCallGovernanceSnapshot;
}): Promise<void> {
  const { manifest, governance, childRunId } = input;
  const requestId = crypto.randomUUID();

  const familyKey =
    governance.rootRunId ?? governance.parentRunId ?? childRunId;
  const creditCap =
    typeof manifest.maxCreditsPerRun === 'number'
      ? { maxCreditsPerRun: manifest.maxCreditsPerRun }
      : {};

  const reply = await input.env.HARNESS.runtimeApiCall({
    executorToken: input.childExecutorToken,
    path: '/api/v2/plays/internal/runtime',
    headers: { 'x-deepline-request-id': requestId },
    timeoutMs: 15_000,
    body: {
      action: 'start_inline_child_run',
      playName: input.childPlayName,
      runId: childRunId,
      workflowFamilyKey: familyKey,
      artifactStorageKey: manifest.artifactStorageKey,
      artifactHash: manifest.artifactHash,
      graphHash: manifest.graphHash,
      runtimeBackend: 'workers_edge',
      schedulerBackend: 'inline_child',
      executionProfile: 'workers_edge',
      ...creditCap,
      staticPipeline: manifest.staticPipeline ?? null,
      source: 'published',
    },
  });

  const outsideSuccessRange = reply.status < 200 || reply.status >= 300;
  if (!outsideSuccessRange) {
    return;
  }
  const bodyText = reply.body ?? '';
  throw new Error(
    `Inline child run registration failed ${reply.status}: ${bodyText.slice(0, 800)}`,
  );
}
|
|
1680
|
+
|
|
1681
|
+
/**
 * One timing sample captured while the coordinator calls the runtime API.
 * `bytes`, when present, is the UTF-8 encoded size of the measured payload.
 */
type CoordinatorRuntimeApiTiming = {
  phase: string;
  ms: number;
  bytes?: number;
};

/**
 * Calls the internal runtime API through `env.HARNESS.runtimeApiCall` and
 * returns the response together with per-phase timings.
 *
 * Four samples are recorded, in this order:
 * `coordinator.runtime_api.build_request`, `coordinator.runtime_api.fetch`,
 * `coordinator.runtime_api.body` and `coordinator.runtime_api.total`.
 *
 * The request body is serialized here only to measure its size; the
 * unserialized value is what gets handed to `runtimeApiCall`.
 *
 * @param input.env - Coordinator bindings; `HARNESS.runtimeApiCall` is used.
 * @param input.executorToken - Token the call is made with.
 * @param input.body - Request payload; `null`/`undefined` is sent as `{}`.
 * @returns The response status, the response body text and the timings.
 */
async function callRuntimeApiFromCoordinator(input: {
  env: CoordinatorEnv;
  executorToken: string;
  body: unknown;
}): Promise<{
  status: number;
  body: string;
  timings: CoordinatorRuntimeApiTiming[];
}> {
  const timings: CoordinatorRuntimeApiTiming[] = [];
  const totalStartedAt = Date.now();

  // `String.prototype.length` counts UTF-16 code units, which under-reports
  // any non-ASCII payload. The samples are labelled `bytes`, so measure the
  // UTF-8 encoded size instead.
  const utf8 = new TextEncoder();
  const utf8ByteLength = (text: string): number =>
    utf8.encode(text).byteLength;

  const recordTiming = (
    phase: string,
    startedAt: number,
    extra?: { bytes?: number },
  ): void => {
    timings.push({
      phase,
      ms: Date.now() - startedAt,
      ...(extra?.bytes !== undefined ? { bytes: extra.bytes } : {}),
    });
  };

  const buildStartedAt = Date.now();
  const body = input.body ?? {};
  const serializedBody = JSON.stringify(body);
  recordTiming('coordinator.runtime_api.build_request', buildStartedAt, {
    bytes: utf8ByteLength(serializedBody),
  });

  const fetchStartedAt = Date.now();
  // NOTE(review): the other runtimeApiCall sites visible in this file pass
  // `timeoutMs: 15_000`; this one passes none — confirm that is intentional.
  const response = await input.env.HARNESS.runtimeApiCall({
    executorToken: input.executorToken,
    path: '/api/v2/plays/internal/runtime',
    body,
    headers: {
      'x-deepline-request-id': crypto.randomUUID(),
    },
  });
  recordTiming('coordinator.runtime_api.fetch', fetchStartedAt);

  const bodyStartedAt = Date.now();
  const responseBody = response.body;
  recordTiming('coordinator.runtime_api.body', bodyStartedAt, {
    bytes: utf8ByteLength(responseBody),
  });
  recordTiming('coordinator.runtime_api.total', totalStartedAt);
  return {
    status: response.status,
    body: responseBody,
    timings,
  };
}
|
|
2181
1740
|
|
|
2182
|
-
async function
|
|
1741
|
+
/**
 * Sends a single `prepare_inline_child_run` action to the runtime API with
 * the parent's executor token and unpacks the reply.
 *
 * The workflow family key falls back from the governance root run id to the
 * governance parent run id and finally to the child run id.
 * `maxCreditsPerRun` is only sent when the manifest holds a number for it.
 *
 * @param input.env - Coordinator bindings forwarded to the API helper.
 * @param input.parentExecutorToken - Token the call is made with.
 * @param input.parentRunId - Sent as `parentRunId`.
 * @param input.parentPlayName - Sent as `parentPlayName`.
 * @param input.childRunId - Sent as `childRunId`; last family-key fallback.
 * @param input.childPlayName - Sent as `childPlayName`.
 * @param input.manifest - Source of artifact keys, hashes and static pipeline.
 * @param input.governance - Source of `rootRunId` / `parentRunId`.
 * @param input.userEmail - Sent as `userEmail`.
 * @returns The `executorToken` from the reply as `childToken`, the reply's
 *   `preloadedDbSessions` and `prepareTimings` arrays (empty when the reply
 *   does not carry an array), and the transport timings of the call itself.
 * @throws Error when the status is below 200 or at/above 300, or when the
 *   parsed reply has no string `executorToken`.
 */
async function prepareInlineChildRunWithRuntime(input: {
  env: CoordinatorEnv;
  parentExecutorToken: string;
  parentRunId: string;
  parentPlayName: string;
  childRunId: string;
  childPlayName: string;
  manifest: PlayRuntimeManifest;
  governance: PlayCallGovernanceSnapshot;
  userEmail: string;
}): Promise<{
  childToken: string;
  preloadedDbSessions: PreloadedRuntimeDbSession[];
  prepareTimings: unknown[];
  transportTimings: unknown[];
}> {
  const { manifest, governance, childRunId } = input;
  const familyKey =
    governance.rootRunId ?? governance.parentRunId ?? childRunId;
  const creditCap =
    typeof manifest.maxCreditsPerRun === 'number'
      ? { maxCreditsPerRun: manifest.maxCreditsPerRun }
      : {};

  const reply = await callRuntimeApiFromCoordinator({
    env: input.env,
    executorToken: input.parentExecutorToken,
    body: {
      action: 'prepare_inline_child_run',
      parentRunId: input.parentRunId,
      parentPlayName: input.parentPlayName,
      childRunId,
      childPlayName: input.childPlayName,
      workflowFamilyKey: familyKey,
      artifactStorageKey: manifest.artifactStorageKey,
      artifactHash: manifest.artifactHash,
      graphHash: manifest.graphHash,
      runtimeBackend: 'workers_edge',
      schedulerBackend: 'inline_child',
      executionProfile: 'workers_edge',
      ...creditCap,
      staticPipeline: manifest.staticPipeline ?? null,
      source: 'published',
      userEmail: input.userEmail,
    },
  });

  const rawBody = reply.body;
  if (reply.status < 200 || reply.status >= 300) {
    throw new Error(
      `Inline child prepare failed ${reply.status}: ${rawBody.slice(0, 800)}`,
    );
  }

  // An empty or unparseable body is treated as an empty object, which then
  // fails the executorToken check below.
  let payload: unknown = {};
  if (rawBody) {
    try {
      payload = JSON.parse(rawBody);
    } catch {
      payload = {};
    }
  }
  if (!isRecord(payload) || typeof payload.executorToken !== 'string') {
    throw new Error('Inline child prepare response was missing executorToken.');
  }

  const sessions = Array.isArray(payload.preloadedDbSessions)
    ? (payload.preloadedDbSessions as PreloadedRuntimeDbSession[])
    : [];
  const serverTimings = Array.isArray(payload.prepareTimings)
    ? payload.prepareTimings
    : [];

  return {
    childToken: payload.executorToken,
    preloadedDbSessions: sessions,
    prepareTimings: serverTimings,
    transportTimings: reply.timings,
  };
}
|
|
2222
1812
|
|
|
2223
1813
|
function buildChildRunId(playName: string): string {
|
|
@@ -2476,6 +2066,8 @@ function runRequestFromPlayWorkflowParams(
|
|
|
2476
2066
|
childPlayManifests: params.childPlayManifests ?? null,
|
|
2477
2067
|
playCallGovernance: params.playCallGovernance ?? null,
|
|
2478
2068
|
preloadedDbSessions: params.preloadedDbSessions ?? null,
|
|
2069
|
+
inlineChildRunRegistered:
|
|
2070
|
+
params.runtimeBackend === 'cf_workflows_dynamic_worker_inline_child',
|
|
2479
2071
|
coordinatorUrl: params.coordinatorUrl ?? null,
|
|
2480
2072
|
totalRows: params.totalRows,
|
|
2481
2073
|
};
|
|
@@ -2606,38 +2198,41 @@ async function executeChildInline(input: {
|
|
|
2606
2198
|
},
|
|
2607
2199
|
});
|
|
2608
2200
|
|
|
2609
|
-
const
|
|
2610
|
-
const
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
input.body.parentPlayName.trim()
|
|
2619
|
-
? input.body.parentPlayName.trim()
|
|
2620
|
-
: governance.parentPlayName,
|
|
2621
|
-
childRunId,
|
|
2622
|
-
childPlayName,
|
|
2623
|
-
maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
|
|
2201
|
+
const loaderStartedAt = Date.now();
|
|
2202
|
+
const stub = loadDynamicPlayWorker(input.env, {
|
|
2203
|
+
runId: childRunId,
|
|
2204
|
+
graphHash: manifest.graphHash,
|
|
2205
|
+
artifactStorageKey: manifest.artifactStorageKey,
|
|
2206
|
+
artifactHash: manifest.artifactHash,
|
|
2207
|
+
dynamicWorkerCode:
|
|
2208
|
+
typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
|
|
2209
|
+
packagedFiles: null,
|
|
2624
2210
|
});
|
|
2625
|
-
trace('coordinator.
|
|
2211
|
+
trace('coordinator.inline_child_loader_get', loaderStartedAt);
|
|
2626
2212
|
|
|
2627
|
-
const
|
|
2628
|
-
const
|
|
2629
|
-
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
|
|
2633
|
-
|
|
2634
|
-
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2213
|
+
const prepareStartedAt = Date.now();
|
|
2214
|
+
const parentPlayName =
|
|
2215
|
+
typeof input.body.parentPlayName === 'string' &&
|
|
2216
|
+
input.body.parentPlayName.trim()
|
|
2217
|
+
? input.body.parentPlayName.trim()
|
|
2218
|
+
: governance.parentPlayName;
|
|
2219
|
+
const { childToken, preloadedDbSessions, prepareTimings, transportTimings } =
|
|
2220
|
+
await prepareInlineChildRunWithRuntime({
|
|
2221
|
+
env: input.env,
|
|
2222
|
+
parentExecutorToken,
|
|
2223
|
+
parentRunId: input.parentRunId,
|
|
2224
|
+
parentPlayName,
|
|
2225
|
+
childRunId,
|
|
2226
|
+
childPlayName,
|
|
2227
|
+
manifest,
|
|
2228
|
+
governance,
|
|
2229
|
+
userEmail:
|
|
2230
|
+
typeof input.body.userEmail === 'string' ? input.body.userEmail : '',
|
|
2231
|
+
});
|
|
2232
|
+
trace('coordinator.inline_child_prepare', prepareStartedAt, {
|
|
2640
2233
|
sessions: preloadedDbSessions.length,
|
|
2234
|
+
prepareTimings,
|
|
2235
|
+
transportTimings,
|
|
2641
2236
|
});
|
|
2642
2237
|
|
|
2643
2238
|
const params = buildChildWorkflowParams({
|
|
@@ -2655,17 +2250,6 @@ async function executeChildInline(input: {
|
|
|
2655
2250
|
preloadedDbSessions:
|
|
2656
2251
|
preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
|
|
2657
2252
|
});
|
|
2658
|
-
const loaderStartedAt = Date.now();
|
|
2659
|
-
const stub = loadDynamicPlayWorker(input.env, {
|
|
2660
|
-
runId: childRunId,
|
|
2661
|
-
graphHash: manifest.graphHash,
|
|
2662
|
-
artifactStorageKey: manifest.artifactStorageKey,
|
|
2663
|
-
artifactHash: manifest.artifactHash,
|
|
2664
|
-
dynamicWorkerCode:
|
|
2665
|
-
typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
|
|
2666
|
-
packagedFiles: null,
|
|
2667
|
-
});
|
|
2668
|
-
trace('coordinator.inline_child_loader_get', loaderStartedAt);
|
|
2669
2253
|
|
|
2670
2254
|
let entrypoint: ReturnType<Awaited<typeof stub>['getEntrypoint']> | null =
|
|
2671
2255
|
null;
|
|
@@ -2676,44 +2260,54 @@ async function executeChildInline(input: {
|
|
|
2676
2260
|
entrypoint = awaitedStub.getEntrypoint();
|
|
2677
2261
|
trace('coordinator.inline_child_get_entrypoint', entrypointStartedAt);
|
|
2678
2262
|
const fetchStartedAt = Date.now();
|
|
2679
|
-
|
|
2263
|
+
const inlineResponse = await entrypoint.fetch(
|
|
2680
2264
|
new Request('https://deepline.dynamic.internal/run-inline', {
|
|
2681
2265
|
method: 'POST',
|
|
2682
2266
|
headers: { 'content-type': 'application/json' },
|
|
2683
2267
|
body: JSON.stringify(runRequestFromPlayWorkflowParams(params)),
|
|
2684
2268
|
}),
|
|
2685
2269
|
);
|
|
2270
|
+
if (!inlineResponse) {
|
|
2271
|
+
throw new Error('Inline child Worker returned no response.');
|
|
2272
|
+
}
|
|
2273
|
+
let workerResponse = inlineResponse as Response;
|
|
2274
|
+
response = workerResponse;
|
|
2686
2275
|
trace('coordinator.inline_child_worker_fetch', fetchStartedAt, {
|
|
2687
|
-
status:
|
|
2276
|
+
status: workerResponse.status,
|
|
2688
2277
|
endpoint: '/run-inline',
|
|
2689
2278
|
});
|
|
2690
2279
|
let usedLegacyRunStream = false;
|
|
2691
|
-
if (
|
|
2692
|
-
disposeRpcStub(
|
|
2280
|
+
if (workerResponse.status === 404) {
|
|
2281
|
+
disposeRpcStub(workerResponse);
|
|
2693
2282
|
const legacyFetchStartedAt = Date.now();
|
|
2694
|
-
|
|
2283
|
+
const legacyResponse = await entrypoint.fetch(
|
|
2695
2284
|
new Request('https://deepline.dynamic.internal/run', {
|
|
2696
2285
|
method: 'POST',
|
|
2697
2286
|
headers: { 'content-type': 'application/json' },
|
|
2698
2287
|
body: JSON.stringify(runRequestFromPlayWorkflowParams(params)),
|
|
2699
2288
|
}),
|
|
2700
2289
|
);
|
|
2290
|
+
if (!legacyResponse) {
|
|
2291
|
+
throw new Error('Legacy inline child Worker returned no response.');
|
|
2292
|
+
}
|
|
2293
|
+
workerResponse = legacyResponse as Response;
|
|
2294
|
+
response = workerResponse;
|
|
2701
2295
|
usedLegacyRunStream = true;
|
|
2702
2296
|
trace('coordinator.inline_child_worker_fetch', legacyFetchStartedAt, {
|
|
2703
|
-
status:
|
|
2297
|
+
status: workerResponse.status,
|
|
2704
2298
|
endpoint: '/run',
|
|
2705
2299
|
compatibility: 'legacy_stream',
|
|
2706
2300
|
});
|
|
2707
2301
|
}
|
|
2708
|
-
if (!
|
|
2709
|
-
const text = await
|
|
2302
|
+
if (!workerResponse.ok) {
|
|
2303
|
+
const text = await workerResponse.text().catch(() => '');
|
|
2710
2304
|
throw new Error(
|
|
2711
|
-
`Inline child Worker failed ${
|
|
2305
|
+
`Inline child Worker failed ${workerResponse.status}: ${text.slice(0, 800)}`,
|
|
2712
2306
|
);
|
|
2713
2307
|
}
|
|
2714
2308
|
const responseStartedAt = Date.now();
|
|
2715
2309
|
const parsed: InlineWorkerRunResponse = usedLegacyRunStream
|
|
2716
|
-
? await readLegacyRunStream(
|
|
2310
|
+
? await readLegacyRunStream(workerResponse).then((legacy) => ({
|
|
2717
2311
|
status: legacy.error ? 'failed' : 'completed',
|
|
2718
2312
|
result: legacy.result,
|
|
2719
2313
|
outputRows: legacy.outputRows ?? undefined,
|
|
@@ -2723,7 +2317,7 @@ async function executeChildInline(input: {
|
|
|
2723
2317
|
})),
|
|
2724
2318
|
error: legacy.error ?? undefined,
|
|
2725
2319
|
}))
|
|
2726
|
-
: ((await
|
|
2320
|
+
: ((await workerResponse.json()) as InlineWorkerRunResponse);
|
|
2727
2321
|
const logs = (parsed.events ?? []).flatMap((event) => {
|
|
2728
2322
|
if (
|
|
2729
2323
|
event &&
|
|
@@ -2744,6 +2338,34 @@ async function executeChildInline(input: {
|
|
|
2744
2338
|
durationMs:
|
|
2745
2339
|
typeof parsed.durationMs === 'number' ? parsed.durationMs : null,
|
|
2746
2340
|
});
|
|
2341
|
+
for (const timing of parsed.timings ?? []) {
|
|
2342
|
+
if (
|
|
2343
|
+
!timing ||
|
|
2344
|
+
typeof timing !== 'object' ||
|
|
2345
|
+
typeof timing.phase !== 'string' ||
|
|
2346
|
+
typeof timing.ms !== 'number' ||
|
|
2347
|
+
!Number.isFinite(timing.ms)
|
|
2348
|
+
) {
|
|
2349
|
+
continue;
|
|
2350
|
+
}
|
|
2351
|
+
recordCoordinatorPerfTrace({
|
|
2352
|
+
runId: childRunId,
|
|
2353
|
+
phase: `dynamic_worker.${timing.phase}`,
|
|
2354
|
+
ms: Math.max(0, Math.round(timing.ms)),
|
|
2355
|
+
graphHash: manifest.graphHash,
|
|
2356
|
+
extra: {
|
|
2357
|
+
parentRunId: input.parentRunId,
|
|
2358
|
+
mode: 'inline_dynamic_worker',
|
|
2359
|
+
...(isRecord(timing.extra) ? timing.extra : {}),
|
|
2360
|
+
},
|
|
2361
|
+
});
|
|
2362
|
+
timings.push({
|
|
2363
|
+
phase: `dynamic_worker.${timing.phase}`,
|
|
2364
|
+
ms: Math.max(0, Math.round(timing.ms)),
|
|
2365
|
+
graphHash: manifest.graphHash,
|
|
2366
|
+
...(isRecord(timing.extra) ? { extra: timing.extra } : {}),
|
|
2367
|
+
});
|
|
2368
|
+
}
|
|
2747
2369
|
trace('coordinator.inline_child_total', startedAt);
|
|
2748
2370
|
if (parsed.status === 'failed' || parsed.error) {
|
|
2749
2371
|
const error = {
|
|
@@ -2788,29 +2410,176 @@ async function executeChildInline(input: {
|
|
|
2788
2410
|
action: 'completed',
|
|
2789
2411
|
mode: 'inline_dynamic_worker',
|
|
2790
2412
|
},
|
|
2791
|
-
});
|
|
2792
|
-
return {
|
|
2793
|
-
workflowId: childRunId,
|
|
2794
|
-
runId: childRunId,
|
|
2795
|
-
status: 'completed',
|
|
2796
|
-
mode: 'inline_dynamic_worker',
|
|
2797
|
-
result: parsed.result,
|
|
2798
|
-
output: parsed.result,
|
|
2799
|
-
logs,
|
|
2800
|
-
timings,
|
|
2801
|
-
};
|
|
2802
|
-
} finally {
|
|
2803
|
-
disposeRpcStub(response);
|
|
2804
|
-
disposeRpcStub(entrypoint);
|
|
2805
|
-
disposeRpcStub(await stub.catch(() => null));
|
|
2806
|
-
}
|
|
2413
|
+
});
|
|
2414
|
+
return {
|
|
2415
|
+
workflowId: childRunId,
|
|
2416
|
+
runId: childRunId,
|
|
2417
|
+
status: 'completed',
|
|
2418
|
+
mode: 'inline_dynamic_worker',
|
|
2419
|
+
result: parsed.result,
|
|
2420
|
+
output: parsed.result,
|
|
2421
|
+
logs,
|
|
2422
|
+
timings,
|
|
2423
|
+
};
|
|
2424
|
+
} finally {
|
|
2425
|
+
disposeRpcStub(response);
|
|
2426
|
+
disposeRpcStub(entrypoint);
|
|
2427
|
+
disposeRpcStub(await stub.catch(() => null));
|
|
2428
|
+
}
|
|
2429
|
+
}
|
|
2430
|
+
|
|
2431
|
+
async function submitChildWorkflowThroughCoordinator(input: {
|
|
2432
|
+
env: CoordinatorEnv;
|
|
2433
|
+
parentRunId: string;
|
|
2434
|
+
body: Record<string, unknown>;
|
|
2435
|
+
coordinatorUrl: string | null;
|
|
2436
|
+
}): Promise<{
|
|
2437
|
+
response: Response;
|
|
2438
|
+
responseText: string;
|
|
2439
|
+
childRunId: string;
|
|
2440
|
+
childPlayName: string;
|
|
2441
|
+
startedAt: number;
|
|
2442
|
+
timings: CoordinatorTiming[];
|
|
2443
|
+
}> {
|
|
2444
|
+
const startedAt = Date.now();
|
|
2445
|
+
const timings: CoordinatorTiming[] = [];
|
|
2446
|
+
const trace = (
|
|
2447
|
+
phase: string,
|
|
2448
|
+
phaseStartedAt: number,
|
|
2449
|
+
graphHash?: string | null,
|
|
2450
|
+
extra?: Record<string, unknown>,
|
|
2451
|
+
): void => {
|
|
2452
|
+
const timing: CoordinatorTiming = {
|
|
2453
|
+
phase,
|
|
2454
|
+
ms: Date.now() - phaseStartedAt,
|
|
2455
|
+
...(graphHash ? { graphHash } : {}),
|
|
2456
|
+
...(extra ? { extra } : {}),
|
|
2457
|
+
};
|
|
2458
|
+
timings.push(timing);
|
|
2459
|
+
recordCoordinatorPerfTrace({
|
|
2460
|
+
runId: input.parentRunId,
|
|
2461
|
+
phase,
|
|
2462
|
+
ms: timing.ms,
|
|
2463
|
+
graphHash: graphHash ?? undefined,
|
|
2464
|
+
extra,
|
|
2465
|
+
});
|
|
2466
|
+
};
|
|
2467
|
+
const validated = validateChildSubmitBody({
|
|
2468
|
+
parentRunId: input.parentRunId,
|
|
2469
|
+
body: input.body,
|
|
2470
|
+
});
|
|
2471
|
+
if (!validated.ok) {
|
|
2472
|
+
return {
|
|
2473
|
+
response: Response.json(
|
|
2474
|
+
{ error: validated.error },
|
|
2475
|
+
{ status: validated.status },
|
|
2476
|
+
),
|
|
2477
|
+
responseText: '',
|
|
2478
|
+
childRunId: '',
|
|
2479
|
+
childPlayName: '',
|
|
2480
|
+
startedAt,
|
|
2481
|
+
timings,
|
|
2482
|
+
};
|
|
2483
|
+
}
|
|
2484
|
+
const { manifest, governance, childPlayName, orgId, parentExecutorToken } =
|
|
2485
|
+
validated;
|
|
2486
|
+
const childRunId = buildChildRunId(childPlayName);
|
|
2487
|
+
const baseUrl = resolveRuntimeBaseUrl(input.env, input.body);
|
|
2488
|
+
|
|
2489
|
+
const tokenStartedAt = Date.now();
|
|
2490
|
+
const childToken = await mintChildWorkflowExecutorToken({
|
|
2491
|
+
env: input.env,
|
|
2492
|
+
parentExecutorToken,
|
|
2493
|
+
parentRunId: input.parentRunId,
|
|
2494
|
+
parentPlayName:
|
|
2495
|
+
typeof input.body.parentPlayName === 'string' &&
|
|
2496
|
+
input.body.parentPlayName.trim()
|
|
2497
|
+
? input.body.parentPlayName.trim()
|
|
2498
|
+
: governance.parentPlayName,
|
|
2499
|
+
childRunId,
|
|
2500
|
+
childPlayName,
|
|
2501
|
+
maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
|
|
2502
|
+
});
|
|
2503
|
+
trace('coordinator.child_submit_token', tokenStartedAt, manifest.graphHash, {
|
|
2504
|
+
childRunId,
|
|
2505
|
+
childPlayName,
|
|
2506
|
+
});
|
|
2507
|
+
|
|
2508
|
+
const dbSessionStartedAt = Date.now();
|
|
2509
|
+
const preloadedDbSessions = await preloadChildRuntimeDbSessions({
|
|
2510
|
+
env: input.env,
|
|
2511
|
+
childExecutorToken: childToken,
|
|
2512
|
+
childRunId,
|
|
2513
|
+
childPlayName,
|
|
2514
|
+
manifest,
|
|
2515
|
+
orgId,
|
|
2516
|
+
userEmail:
|
|
2517
|
+
typeof input.body.userEmail === 'string' ? input.body.userEmail : '',
|
|
2518
|
+
});
|
|
2519
|
+
trace(
|
|
2520
|
+
'coordinator.child_submit_db_session_preload',
|
|
2521
|
+
dbSessionStartedAt,
|
|
2522
|
+
manifest.graphHash,
|
|
2523
|
+
{ childRunId, sessions: preloadedDbSessions.length },
|
|
2524
|
+
);
|
|
2525
|
+
|
|
2526
|
+
const params = buildChildWorkflowParams({
|
|
2527
|
+
env: input.env,
|
|
2528
|
+
body: input.body,
|
|
2529
|
+
manifest,
|
|
2530
|
+
governance,
|
|
2531
|
+
childRunId,
|
|
2532
|
+
childPlayName,
|
|
2533
|
+
childToken,
|
|
2534
|
+
orgId,
|
|
2535
|
+
coordinatorUrl: input.coordinatorUrl,
|
|
2536
|
+
runtimeBackend: 'cf_workflows_dynamic_worker',
|
|
2537
|
+
dynamicWorkerCode:
|
|
2538
|
+
typeof manifest.bundledCode === 'string' ? manifest.bundledCode : null,
|
|
2539
|
+
preloadedDbSessions:
|
|
2540
|
+
preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
|
|
2541
|
+
});
|
|
2542
|
+
|
|
2543
|
+
const workflowSubmitStartedAt = Date.now();
|
|
2544
|
+
const response = await handleWorkflowRoute({
|
|
2545
|
+
runId: childRunId,
|
|
2546
|
+
action: 'submit',
|
|
2547
|
+
request: new Request(
|
|
2548
|
+
`https://deepline.coordinator.internal/workflow/${encodeURIComponent(
|
|
2549
|
+
childRunId,
|
|
2550
|
+
)}/submit`,
|
|
2551
|
+
{
|
|
2552
|
+
method: 'POST',
|
|
2553
|
+
headers: { 'content-type': 'application/json' },
|
|
2554
|
+
body: JSON.stringify(params),
|
|
2555
|
+
},
|
|
2556
|
+
),
|
|
2557
|
+
env: input.env,
|
|
2558
|
+
});
|
|
2559
|
+
trace(
|
|
2560
|
+
'coordinator.child_submit_workflow',
|
|
2561
|
+
workflowSubmitStartedAt,
|
|
2562
|
+
manifest.graphHash,
|
|
2563
|
+
{ childRunId, status: response.status },
|
|
2564
|
+
);
|
|
2565
|
+
const responseText = await response.text().catch(() => '');
|
|
2566
|
+
return {
|
|
2567
|
+
response,
|
|
2568
|
+
responseText,
|
|
2569
|
+
childRunId,
|
|
2570
|
+
childPlayName,
|
|
2571
|
+
startedAt,
|
|
2572
|
+
timings,
|
|
2573
|
+
};
|
|
2807
2574
|
}
|
|
2808
2575
|
|
|
2809
2576
|
/**
|
|
2810
2577
|
* In-process Fetcher handed to each per-graphHash play Worker as
|
|
2811
|
-
* `env.RUNTIME_API`. Runs in the coordinator's isolate
|
|
2812
|
-
*
|
|
2813
|
-
*
|
|
2578
|
+
* `env.RUNTIME_API`. Runs in the coordinator's isolate. Forwards runtime
|
|
2579
|
+
* callbacks to DEEPLINE_API_BASE_URL: in dev (the only mode — deployed CF
|
|
2580
|
+
* coordinator + local app) that is the cloudflared tunnel URL exposing the
|
|
2581
|
+
* laptop's app; in prod it is the deployed app URL. There is no
|
|
2582
|
+
* direct-to-localhost path (the local-workerd dev mode was removed).
|
|
2814
2583
|
*
|
|
2815
2584
|
* Has to be a `WorkerEntrypoint` (not a plain closure) because closures
|
|
2816
2585
|
* containing captured state aren't structured-cloneable, and Cloudflare
|
|
@@ -2882,6 +2651,49 @@ export class CoordinatorControl extends WorkerEntrypoint<
|
|
|
2882
2651
|
});
|
|
2883
2652
|
}
|
|
2884
2653
|
|
|
2654
|
+
async submitWorkflowChild(
|
|
2655
|
+
parentRunId: string,
|
|
2656
|
+
body: Record<string, unknown>,
|
|
2657
|
+
): Promise<{
|
|
2658
|
+
workflowId?: string;
|
|
2659
|
+
runId?: string;
|
|
2660
|
+
status?: string;
|
|
2661
|
+
mode?: string;
|
|
2662
|
+
timings?: CoordinatorTiming[];
|
|
2663
|
+
coordinator?: unknown;
|
|
2664
|
+
error?: unknown;
|
|
2665
|
+
}> {
|
|
2666
|
+
const { response, responseText, childRunId, timings } =
|
|
2667
|
+
await submitChildWorkflowThroughCoordinator({
|
|
2668
|
+
env: this.env,
|
|
2669
|
+
parentRunId,
|
|
2670
|
+
body,
|
|
2671
|
+
coordinatorUrl: null,
|
|
2672
|
+
});
|
|
2673
|
+
let parsed: unknown = {};
|
|
2674
|
+
try {
|
|
2675
|
+
parsed = responseText ? JSON.parse(responseText) : {};
|
|
2676
|
+
} catch {
|
|
2677
|
+
parsed = { error: responseText };
|
|
2678
|
+
}
|
|
2679
|
+
if (!response.ok) {
|
|
2680
|
+
return {
|
|
2681
|
+
runId: childRunId || undefined,
|
|
2682
|
+
workflowId: childRunId || undefined,
|
|
2683
|
+
status: 'failed',
|
|
2684
|
+
error: isRecord(parsed) ? (parsed.error ?? parsed) : parsed,
|
|
2685
|
+
};
|
|
2686
|
+
}
|
|
2687
|
+
return {
|
|
2688
|
+
workflowId: childRunId,
|
|
2689
|
+
runId: childRunId,
|
|
2690
|
+
status: 'started',
|
|
2691
|
+
mode: 'workflow_rpc',
|
|
2692
|
+
coordinator: parsed,
|
|
2693
|
+
timings,
|
|
2694
|
+
};
|
|
2695
|
+
}
|
|
2696
|
+
|
|
2885
2697
|
async signal(
|
|
2886
2698
|
runId: string,
|
|
2887
2699
|
body: Record<string, unknown>,
|
|
@@ -2942,6 +2754,75 @@ export class CoordinatorControl extends WorkerEntrypoint<
|
|
|
2942
2754
|
}
|
|
2943
2755
|
await appendCoordinatorRunEvent(this.env, event);
|
|
2944
2756
|
}
|
|
2757
|
+
|
|
2758
|
+
async readTerminalState(
|
|
2759
|
+
runId: string,
|
|
2760
|
+
): Promise<CoordinatorTerminalState | null> {
|
|
2761
|
+
if (!runId) {
|
|
2762
|
+
throw new Error('runId is required.');
|
|
2763
|
+
}
|
|
2764
|
+
return await readCoordinatorTerminalState(this.env, runId);
|
|
2765
|
+
}
|
|
2766
|
+
|
|
2767
|
+
async readChildTerminalState(
|
|
2768
|
+
parentRunId: string,
|
|
2769
|
+
eventKey: string,
|
|
2770
|
+
timeoutMs?: number,
|
|
2771
|
+
): Promise<CoordinatorChildTerminalState | null> {
|
|
2772
|
+
if (!parentRunId || !eventKey) {
|
|
2773
|
+
throw new Error('parentRunId and eventKey are required.');
|
|
2774
|
+
}
|
|
2775
|
+
return await readCoordinatorChildTerminalState({
|
|
2776
|
+
env: this.env,
|
|
2777
|
+
parentRunId,
|
|
2778
|
+
eventKey,
|
|
2779
|
+
timeoutMs:
|
|
2780
|
+
typeof timeoutMs === 'number' && Number.isFinite(timeoutMs)
|
|
2781
|
+
? Math.max(0, Math.min(Math.floor(timeoutMs), 30_000))
|
|
2782
|
+
: undefined,
|
|
2783
|
+
});
|
|
2784
|
+
}
|
|
2785
|
+
|
|
2786
|
+
/**
|
|
2787
|
+
* Distributed Rate State Backend acquire: lease up to `requested` request-
|
|
2788
|
+
* window permits for `bucketId` (`<orgId>:<provider>`) from the per-bucket
|
|
2789
|
+
* rate-state Durable Object. See CoordinatorRateStateBackend + dedup-do.ts.
|
|
2790
|
+
*/
|
|
2791
|
+
async rateAcquire(input: {
|
|
2792
|
+
bucketId: string;
|
|
2793
|
+
rules: Array<{
|
|
2794
|
+
ruleId: string;
|
|
2795
|
+
requestsPerWindow: number;
|
|
2796
|
+
windowMs: number;
|
|
2797
|
+
maxConcurrency: number | null;
|
|
2798
|
+
}>;
|
|
2799
|
+
requested: number;
|
|
2800
|
+
}): Promise<{ granted: number; waitMs: number }> {
|
|
2801
|
+
if (!input.bucketId || !input.bucketId.trim()) {
|
|
2802
|
+
throw new Error('bucketId is required.');
|
|
2803
|
+
}
|
|
2804
|
+
return await callRateBucketControl<{ granted: number; waitMs: number }>(
|
|
2805
|
+
this.env,
|
|
2806
|
+
input.bucketId,
|
|
2807
|
+
'/rate-acquire',
|
|
2808
|
+
input,
|
|
2809
|
+
);
|
|
2810
|
+
}
|
|
2811
|
+
|
|
2812
|
+
async ratePenalize(input: {
|
|
2813
|
+
bucketId: string;
|
|
2814
|
+
cooldownMs: number;
|
|
2815
|
+
}): Promise<void> {
|
|
2816
|
+
if (!input.bucketId || !input.bucketId.trim()) {
|
|
2817
|
+
throw new Error('bucketId is required.');
|
|
2818
|
+
}
|
|
2819
|
+
await callRateBucketControl<{ ok?: unknown }>(
|
|
2820
|
+
this.env,
|
|
2821
|
+
input.bucketId,
|
|
2822
|
+
'/rate-penalize',
|
|
2823
|
+
input,
|
|
2824
|
+
);
|
|
2825
|
+
}
|
|
2945
2826
|
}
|
|
2946
2827
|
|
|
2947
2828
|
/**
|
|
@@ -2971,80 +2852,20 @@ export class DynamicWorkflow extends WorkflowEntrypoint<
|
|
|
2971
2852
|
graphHash: entryTrace.graphHash,
|
|
2972
2853
|
extra: {
|
|
2973
2854
|
instanceId: entryTrace.instanceId,
|
|
2974
|
-
pooledBootstrap: entryTrace.pooledBootstrap,
|
|
2975
2855
|
},
|
|
2976
2856
|
});
|
|
2977
|
-
|
|
2978
|
-
if (isPooledWorkflowBootstrapPayload(workflowEvent.payload)) {
|
|
2979
|
-
const pooledPayload = workflowEvent.payload;
|
|
2980
|
-
const waitingStep = step as {
|
|
2981
|
-
waitForEvent<T>(
|
|
2982
|
-
name: string,
|
|
2983
|
-
options: { type: string; timeout?: string | number },
|
|
2984
|
-
): Promise<{ payload: Readonly<T>; timestamp: Date; type: string }>;
|
|
2985
|
-
};
|
|
2986
|
-
const waitStartedAt = Date.now();
|
|
2987
|
-
const startEventPromise = waitingStep.waitForEvent<DispatcherEnvelope>(
|
|
2988
|
-
'wait for pooled play start',
|
|
2989
|
-
{ type: WORKFLOW_POOL_START_EVENT_TYPE, timeout: '10 minutes' },
|
|
2990
|
-
);
|
|
2991
|
-
await markWorkflowPoolIdReady(this.env, pooledPayload.poolId).catch(
|
|
2992
|
-
(error) => {
|
|
2993
|
-
console.warn('[coordinator.workflow_pool] ready signal failed', {
|
|
2994
|
-
poolId: pooledPayload.poolId,
|
|
2995
|
-
message: error instanceof Error ? error.message : String(error),
|
|
2996
|
-
});
|
|
2997
|
-
},
|
|
2998
|
-
);
|
|
2999
|
-
const startEvent = await startEventPromise;
|
|
3000
|
-
dispatchedEvent = {
|
|
3001
|
-
payload: startEvent.payload,
|
|
3002
|
-
timestamp: startEvent.timestamp,
|
|
3003
|
-
instanceId: workflowEvent.instanceId ?? pooledPayload.poolId,
|
|
3004
|
-
};
|
|
3005
|
-
const dispatchedTrace = readWorkflowTraceContext(dispatchedEvent);
|
|
3006
|
-
const mapped = await mapRunToWorkflowInstance({
|
|
3007
|
-
env: this.env,
|
|
3008
|
-
runId: dispatchedTrace.runId,
|
|
3009
|
-
instanceId: pooledPayload.poolId,
|
|
3010
|
-
started: true,
|
|
3011
|
-
}).catch((error) => {
|
|
3012
|
-
console.warn('[coordinator.workflow_pool] start ack failed', {
|
|
3013
|
-
poolId: pooledPayload.poolId,
|
|
3014
|
-
runId: dispatchedTrace.runId,
|
|
3015
|
-
message: error instanceof Error ? error.message : String(error),
|
|
3016
|
-
});
|
|
3017
|
-
return false;
|
|
3018
|
-
});
|
|
3019
|
-
if (!mapped) {
|
|
3020
|
-
trace({
|
|
3021
|
-
runId: dispatchedTrace.runId,
|
|
3022
|
-
phase: 'coordinator.workflow_pool_start_blocked',
|
|
3023
|
-
ms: 0,
|
|
3024
|
-
graphHash: dispatchedTrace.graphHash,
|
|
3025
|
-
extra: {
|
|
3026
|
-
instanceId: pooledPayload.poolId,
|
|
3027
|
-
eventType: startEvent.type,
|
|
3028
|
-
},
|
|
3029
|
-
});
|
|
3030
|
-
return { ok: false, blocked: true, runId: dispatchedTrace.runId };
|
|
3031
|
-
}
|
|
3032
|
-
const eventDeliveryMs = Math.max(
|
|
3033
|
-
0,
|
|
3034
|
-
Date.now() - startEvent.timestamp.getTime(),
|
|
3035
|
-
);
|
|
2857
|
+
if (entryTrace.submittedAt !== null) {
|
|
3036
2858
|
trace({
|
|
3037
|
-
runId:
|
|
3038
|
-
phase: 'coordinator.
|
|
3039
|
-
ms:
|
|
3040
|
-
graphHash:
|
|
2859
|
+
runId: entryTrace.runId,
|
|
2860
|
+
phase: 'coordinator.workflow_start_gap',
|
|
2861
|
+
ms: Math.max(0, Date.now() - entryTrace.submittedAt),
|
|
2862
|
+
graphHash: entryTrace.graphHash,
|
|
3041
2863
|
extra: {
|
|
3042
|
-
instanceId:
|
|
3043
|
-
eventType: startEvent.type,
|
|
3044
|
-
poolWaitAgeMs: Date.now() - waitStartedAt,
|
|
2864
|
+
instanceId: entryTrace.instanceId,
|
|
3045
2865
|
},
|
|
3046
2866
|
});
|
|
3047
2867
|
}
|
|
2868
|
+
let dispatchedEvent = event;
|
|
3048
2869
|
dispatchedEvent = await hydrateWorkflowDbSessions({
|
|
3049
2870
|
env: this.env,
|
|
3050
2871
|
event: dispatchedEvent,
|
|
@@ -3058,7 +2879,6 @@ export class DynamicWorkflow extends WorkflowEntrypoint<
|
|
|
3058
2879
|
graphHash: dispatchTrace.graphHash,
|
|
3059
2880
|
extra: {
|
|
3060
2881
|
instanceId: dispatchTrace.instanceId,
|
|
3061
|
-
pooledBootstrap: dispatchTrace.pooledBootstrap,
|
|
3062
2882
|
},
|
|
3063
2883
|
});
|
|
3064
2884
|
|
|
@@ -3212,9 +3032,6 @@ const coordinatorEntrypoint = {
|
|
|
3212
3032
|
): Promise<Response> {
|
|
3213
3033
|
const url = new URL(request.url);
|
|
3214
3034
|
if (url.pathname === '/health') {
|
|
3215
|
-
if (workflowPoolEnabled()) {
|
|
3216
|
-
ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
|
|
3217
|
-
}
|
|
3218
3035
|
return new Response('ok', { status: 200 });
|
|
3219
3036
|
}
|
|
3220
3037
|
if (url.pathname === '/warmup/submit') {
|
|
@@ -3250,100 +3067,6 @@ const coordinatorEntrypoint = {
|
|
|
3250
3067
|
if (authError) return authError;
|
|
3251
3068
|
return await handleStagedFilePut(request, env);
|
|
3252
3069
|
}
|
|
3253
|
-
if (url.pathname === '/workflow-pool/refill') {
|
|
3254
|
-
const internalAuthError = authorizeCoordinatorControlRequest({
|
|
3255
|
-
request,
|
|
3256
|
-
env,
|
|
3257
|
-
});
|
|
3258
|
-
if (internalAuthError) return internalAuthError;
|
|
3259
|
-
const warmupToken = env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
|
|
3260
|
-
if (
|
|
3261
|
-
warmupToken &&
|
|
3262
|
-
request.headers.get('x-vercel-protection-bypass') !== warmupToken
|
|
3263
|
-
) {
|
|
3264
|
-
return new Response('unauthorized', { status: 401 });
|
|
3265
|
-
}
|
|
3266
|
-
const startedAt = Date.now();
|
|
3267
|
-
const minAvailableRaw = Number(
|
|
3268
|
-
url.searchParams.get('minAvailable') ?? '',
|
|
3269
|
-
);
|
|
3270
|
-
const waitTimeoutMsRaw = Number(
|
|
3271
|
-
url.searchParams.get('waitTimeoutMs') ?? '',
|
|
3272
|
-
);
|
|
3273
|
-
const result = await refillWorkflowPool(env, {
|
|
3274
|
-
waitReady: url.searchParams.get('waitReady') === '1',
|
|
3275
|
-
minAvailable:
|
|
3276
|
-
Number.isFinite(minAvailableRaw) && minAvailableRaw > 0
|
|
3277
|
-
? minAvailableRaw
|
|
3278
|
-
: undefined,
|
|
3279
|
-
waitTimeoutMs:
|
|
3280
|
-
Number.isFinite(waitTimeoutMsRaw) && waitTimeoutMsRaw > 0
|
|
3281
|
-
? waitTimeoutMsRaw
|
|
3282
|
-
: undefined,
|
|
3283
|
-
});
|
|
3284
|
-
return Response.json({
|
|
3285
|
-
ok: true,
|
|
3286
|
-
...result,
|
|
3287
|
-
ms: Date.now() - startedAt,
|
|
3288
|
-
});
|
|
3289
|
-
}
|
|
3290
|
-
if (url.pathname === '/workflow-pool/clear') {
|
|
3291
|
-
const internalAuthError = authorizeCoordinatorControlRequest({
|
|
3292
|
-
request,
|
|
3293
|
-
env,
|
|
3294
|
-
});
|
|
3295
|
-
if (internalAuthError) return internalAuthError;
|
|
3296
|
-
const warmupToken = env.VERCEL_PROTECTION_BYPASS_TOKEN?.trim();
|
|
3297
|
-
if (
|
|
3298
|
-
warmupToken &&
|
|
3299
|
-
request.headers.get('x-vercel-protection-bypass') !== warmupToken
|
|
3300
|
-
) {
|
|
3301
|
-
return new Response('unauthorized', { status: 401 });
|
|
3302
|
-
}
|
|
3303
|
-
const startedAt = Date.now();
|
|
3304
|
-
const deleted = await clearWorkflowPool(env);
|
|
3305
|
-
return Response.json({
|
|
3306
|
-
ok: true,
|
|
3307
|
-
deleted,
|
|
3308
|
-
ms: Date.now() - startedAt,
|
|
3309
|
-
});
|
|
3310
|
-
}
|
|
3311
|
-
if (url.pathname === '/workflow-pool/debug') {
|
|
3312
|
-
const internalAuthError = authorizeCoordinatorControlRequest({
|
|
3313
|
-
request,
|
|
3314
|
-
env,
|
|
3315
|
-
});
|
|
3316
|
-
if (internalAuthError) return internalAuthError;
|
|
3317
|
-
const entries = await listWorkflowPoolEntries(env);
|
|
3318
|
-
const detailed = [];
|
|
3319
|
-
for (const entry of entries) {
|
|
3320
|
-
const instance = await getWorkflowPoolInstance(env, entry.id);
|
|
3321
|
-
if (!instance) {
|
|
3322
|
-
detailed.push({
|
|
3323
|
-
...entry,
|
|
3324
|
-
status: 'missing',
|
|
3325
|
-
mappedStatus: 'failed',
|
|
3326
|
-
});
|
|
3327
|
-
continue;
|
|
3328
|
-
}
|
|
3329
|
-
try {
|
|
3330
|
-
const status = await instance.status().catch(() => null);
|
|
3331
|
-
detailed.push({
|
|
3332
|
-
...entry,
|
|
3333
|
-
status: workflowStatusName(status),
|
|
3334
|
-
mappedStatus: status ? mapWorkflowStatus(status) : 'running',
|
|
3335
|
-
});
|
|
3336
|
-
} finally {
|
|
3337
|
-
disposeRpcStub(instance);
|
|
3338
|
-
}
|
|
3339
|
-
}
|
|
3340
|
-
return Response.json({
|
|
3341
|
-
ok: true,
|
|
3342
|
-
enabled: workflowPoolEnabled(),
|
|
3343
|
-
entries: detailed,
|
|
3344
|
-
});
|
|
3345
|
-
}
|
|
3346
|
-
|
|
3347
3070
|
// Workflow routes: /workflow/{runId}/{action}
|
|
3348
3071
|
const wfMatch = url.pathname.match(/^\/workflow\/([^/]+)(?:\/(.+))?$/);
|
|
3349
3072
|
if (wfMatch) {
|
|
@@ -3391,12 +3114,9 @@ const coordinatorEntrypoint = {
|
|
|
3391
3114
|
},
|
|
3392
3115
|
async scheduled(
|
|
3393
3116
|
_controller: unknown,
|
|
3394
|
-
|
|
3395
|
-
|
|
3396
|
-
): Promise<void> {
|
|
3397
|
-
if (!workflowPoolEnabled()) return;
|
|
3398
|
-
ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
|
|
3399
|
-
},
|
|
3117
|
+
_env: CoordinatorEnv,
|
|
3118
|
+
_ctx?: ExecutionContext,
|
|
3119
|
+
): Promise<void> {},
|
|
3400
3120
|
};
|
|
3401
3121
|
|
|
3402
3122
|
export default coordinatorEntrypoint;
|
|
@@ -3601,69 +3321,101 @@ async function handleWorkflowRoute(input: {
|
|
|
3601
3321
|
});
|
|
3602
3322
|
input.ctx?.waitUntil(prewarmPromise);
|
|
3603
3323
|
}
|
|
3604
|
-
const
|
|
3605
|
-
env,
|
|
3324
|
+
const dbSessionExternalization = externalizedWorkflowDbSessionParams({
|
|
3606
3325
|
params,
|
|
3607
|
-
recordSubmitTiming,
|
|
3608
|
-
});
|
|
3609
|
-
await persistWorkflowRetryState({
|
|
3610
|
-
env,
|
|
3611
|
-
runId: submittedRunId,
|
|
3612
|
-
params: workflowParams,
|
|
3613
3326
|
});
|
|
3614
|
-
|
|
3327
|
+
const workflowParams = dbSessionExternalization.params;
|
|
3615
3328
|
try {
|
|
3616
|
-
const
|
|
3617
|
-
await
|
|
3329
|
+
const retryStateStartedAt = Date.now();
|
|
3330
|
+
const launchState = await persistWorkflowLaunchState({
|
|
3331
|
+
env,
|
|
3618
3332
|
runId: submittedRunId,
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3333
|
+
params: workflowParams,
|
|
3334
|
+
sessions: dbSessionExternalization.sessions,
|
|
3335
|
+
});
|
|
3336
|
+
const persistedAt = Date.now();
|
|
3337
|
+
if (dbSessionExternalization.sessions.length > 0) {
|
|
3338
|
+
recordSubmitTiming({
|
|
3339
|
+
phase: 'coordinator.workflow_db_sessions_externalized',
|
|
3340
|
+
ms: persistedAt - retryStateStartedAt,
|
|
3341
|
+
graphHash: params.graphHash ?? null,
|
|
3342
|
+
extra: {
|
|
3343
|
+
sessions:
|
|
3344
|
+
launchState.sessionCount ??
|
|
3345
|
+
dbSessionExternalization.sessions.length,
|
|
3346
|
+
expiresAt:
|
|
3347
|
+
launchState.dbSessionsExpiresAt ??
|
|
3348
|
+
dbSessionExternalization.ref?.expiresAt,
|
|
3349
|
+
combinedLaunchState: true,
|
|
3350
|
+
},
|
|
3351
|
+
});
|
|
3352
|
+
}
|
|
3353
|
+
recordSubmitTiming({
|
|
3354
|
+
phase: 'coordinator.retry_state_persistence',
|
|
3355
|
+
ms: persistedAt - retryStateStartedAt,
|
|
3356
|
+
graphHash: params.graphHash ?? null,
|
|
3357
|
+
extra: {
|
|
3358
|
+
combinedLaunchState: dbSessionExternalization.sessions.length > 0,
|
|
3359
|
+
},
|
|
3360
|
+
});
|
|
3361
|
+
} catch (error) {
|
|
3362
|
+
const errorMessage =
|
|
3363
|
+
error instanceof Error ? error.message : String(error);
|
|
3364
|
+
console.error('[coordinator] workflow retry state persistence failed', {
|
|
3365
|
+
code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
|
|
3366
|
+
runId: submittedRunId,
|
|
3367
|
+
error: errorMessage,
|
|
3622
3368
|
});
|
|
3623
3369
|
recordSubmitTiming({
|
|
3624
|
-
phase: 'coordinator.
|
|
3625
|
-
ms:
|
|
3370
|
+
phase: 'coordinator.retry_state_persistence',
|
|
3371
|
+
ms: 0,
|
|
3626
3372
|
graphHash: params.graphHash ?? null,
|
|
3373
|
+
extra: {
|
|
3374
|
+
status: 'failed',
|
|
3375
|
+
error: errorMessage,
|
|
3376
|
+
},
|
|
3377
|
+
});
|
|
3378
|
+
return workflowRetryStatePersistenceErrorResponse({
|
|
3379
|
+
runId: submittedRunId,
|
|
3380
|
+
error,
|
|
3627
3381
|
});
|
|
3382
|
+
}
|
|
3383
|
+
workflowParams.submittedAt = Date.now();
|
|
3384
|
+
let instance: WorkflowInstance | null = null;
|
|
3385
|
+
try {
|
|
3628
3386
|
const dispatchStartedAt = Date.now();
|
|
3629
|
-
const
|
|
3630
|
-
instance = await
|
|
3387
|
+
const createStartedAt = Date.now();
|
|
3388
|
+
instance = await createDynamicWorkflowInstance({
|
|
3631
3389
|
env,
|
|
3390
|
+
id: defaultInstanceId,
|
|
3632
3391
|
params: workflowParams,
|
|
3633
|
-
recordSubmitTiming,
|
|
3634
3392
|
});
|
|
3393
|
+
const workflowCreatedAt = Date.now();
|
|
3635
3394
|
recordSubmitTiming({
|
|
3636
|
-
phase: 'coordinator.
|
|
3637
|
-
ms:
|
|
3395
|
+
phase: 'coordinator.workflow_create',
|
|
3396
|
+
ms: workflowCreatedAt - createStartedAt,
|
|
3638
3397
|
graphHash: params.graphHash ?? null,
|
|
3639
|
-
extra: {
|
|
3640
|
-
usedPool: Boolean(instance),
|
|
3641
|
-
enabled: workflowPoolEnabled(),
|
|
3642
|
-
},
|
|
3398
|
+
extra: { instanceId: instance.id },
|
|
3643
3399
|
});
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
ms: Date.now() - createStartedAt,
|
|
3654
|
-
graphHash: params.graphHash ?? null,
|
|
3655
|
-
extra: { instanceId: instance.id },
|
|
3400
|
+
const instanceIdRecord = recordWorkflowInstanceId({
|
|
3401
|
+
env,
|
|
3402
|
+
runId: submittedRunId,
|
|
3403
|
+
instanceId: instance.id,
|
|
3404
|
+
}).catch((error) => {
|
|
3405
|
+
console.warn('[coordinator] workflow instance id record failed', {
|
|
3406
|
+
runId: submittedRunId,
|
|
3407
|
+
instanceId: instance?.id ?? null,
|
|
3408
|
+
error: error instanceof Error ? error.message : String(error),
|
|
3656
3409
|
});
|
|
3657
|
-
}
|
|
3410
|
+
});
|
|
3411
|
+
input.ctx?.waitUntil(instanceIdRecord);
|
|
3658
3412
|
recordSubmitTiming({
|
|
3659
3413
|
phase: 'coordinator.dispatch_workflow',
|
|
3660
3414
|
ms: Date.now() - dispatchStartedAt,
|
|
3661
3415
|
graphHash: params.graphHash ?? null,
|
|
3662
3416
|
extra: {
|
|
3663
|
-
startMode:
|
|
3664
|
-
|
|
3665
|
-
? 'direct_workflow_create'
|
|
3666
|
-
: 'pooled_workflow_start_event',
|
|
3417
|
+
startMode: 'direct_workflow_create',
|
|
3418
|
+
instanceIdRecord: 'waitUntil',
|
|
3667
3419
|
},
|
|
3668
3420
|
});
|
|
3669
3421
|
const initialWaitMsRaw = Number(
|
|
@@ -3698,9 +3450,6 @@ async function handleWorkflowRoute(input: {
|
|
|
3698
3450
|
ms: totalMs,
|
|
3699
3451
|
graphHash: params.graphHash ?? null,
|
|
3700
3452
|
});
|
|
3701
|
-
if (workflowPoolEnabled() && instance.id === defaultInstanceId) {
|
|
3702
|
-
input.ctx?.waitUntil(refillWorkflowPool(env).catch(() => undefined));
|
|
3703
|
-
}
|
|
3704
3453
|
return Response.json({
|
|
3705
3454
|
runId,
|
|
3706
3455
|
status: 'submitted',
|
|
@@ -3733,126 +3482,17 @@ async function handleWorkflowRoute(input: {
|
|
|
3733
3482
|
{ status: 400 },
|
|
3734
3483
|
);
|
|
3735
3484
|
}
|
|
3736
|
-
const
|
|
3737
|
-
|
|
3738
|
-
|
|
3739
|
-
| undefined;
|
|
3740
|
-
const childPlayName =
|
|
3741
|
-
typeof body.name === 'string' && body.name.trim()
|
|
3742
|
-
? body.name.trim()
|
|
3743
|
-
: manifest?.playName?.trim();
|
|
3744
|
-
if (
|
|
3745
|
-
!manifest ||
|
|
3746
|
-
!childPlayName ||
|
|
3747
|
-
!manifest.artifactStorageKey ||
|
|
3748
|
-
!manifest.artifactHash ||
|
|
3749
|
-
!manifest.graphHash ||
|
|
3750
|
-
!governance
|
|
3751
|
-
) {
|
|
3752
|
-
return Response.json(
|
|
3753
|
-
{
|
|
3754
|
-
error: {
|
|
3755
|
-
code: 'CHILD_MANIFEST_REQUIRED',
|
|
3756
|
-
message:
|
|
3757
|
-
'submit-child requires a trusted child manifest and lineage.',
|
|
3758
|
-
phase: 'coordinator_child_submit',
|
|
3759
|
-
parentRunId: runId,
|
|
3760
|
-
},
|
|
3761
|
-
},
|
|
3762
|
-
{ status: 400 },
|
|
3763
|
-
);
|
|
3764
|
-
}
|
|
3765
|
-
const childRunId = buildChildRunId(childPlayName);
|
|
3766
|
-
const orgId = typeof body.orgId === 'string' ? body.orgId : '';
|
|
3767
|
-
if (!orgId) {
|
|
3768
|
-
return Response.json(
|
|
3769
|
-
{
|
|
3770
|
-
error: {
|
|
3771
|
-
code: 'CHILD_ORG_REQUIRED',
|
|
3772
|
-
message: 'submit-child requires orgId from the parent runtime.',
|
|
3773
|
-
phase: 'coordinator_child_submit',
|
|
3774
|
-
parentRunId: runId,
|
|
3775
|
-
},
|
|
3776
|
-
},
|
|
3777
|
-
{ status: 400 },
|
|
3778
|
-
);
|
|
3779
|
-
}
|
|
3780
|
-
const parentExecutorToken =
|
|
3781
|
-
typeof body.parentExecutorToken === 'string'
|
|
3782
|
-
? body.parentExecutorToken.trim()
|
|
3783
|
-
: '';
|
|
3784
|
-
if (!parentExecutorToken) {
|
|
3785
|
-
return Response.json(
|
|
3786
|
-
{
|
|
3787
|
-
error: {
|
|
3788
|
-
code: 'PARENT_EXECUTOR_TOKEN_REQUIRED',
|
|
3789
|
-
message:
|
|
3790
|
-
'submit-child requires the parent executor token for origin-scoped child token minting.',
|
|
3791
|
-
phase: 'coordinator_child_submit',
|
|
3792
|
-
parentRunId: runId,
|
|
3793
|
-
},
|
|
3794
|
-
},
|
|
3795
|
-
{ status: 400 },
|
|
3796
|
-
);
|
|
3797
|
-
}
|
|
3798
|
-
const baseUrl = resolveRuntimeBaseUrl(env, body);
|
|
3799
|
-
const childToken = await mintChildWorkflowExecutorToken({
|
|
3800
|
-
env,
|
|
3801
|
-
baseUrl,
|
|
3802
|
-
parentExecutorToken,
|
|
3803
|
-
parentRunId: runId,
|
|
3804
|
-
parentPlayName:
|
|
3805
|
-
typeof body.parentPlayName === 'string' && body.parentPlayName.trim()
|
|
3806
|
-
? body.parentPlayName.trim()
|
|
3807
|
-
: governance.parentPlayName,
|
|
3808
|
-
childRunId,
|
|
3809
|
-
childPlayName,
|
|
3810
|
-
maxCreditsPerRun: manifest.maxCreditsPerRun ?? null,
|
|
3811
|
-
});
|
|
3812
|
-
const preloadedDbSessions = await preloadChildRuntimeDbSessions({
|
|
3813
|
-
env,
|
|
3814
|
-
baseUrl,
|
|
3815
|
-
childExecutorToken: childToken,
|
|
3485
|
+
const {
|
|
3486
|
+
response: submitResponse,
|
|
3487
|
+
responseText,
|
|
3816
3488
|
childRunId,
|
|
3817
3489
|
childPlayName,
|
|
3818
|
-
|
|
3819
|
-
orgId,
|
|
3820
|
-
userEmail: typeof body.userEmail === 'string' ? body.userEmail : '',
|
|
3821
|
-
});
|
|
3822
|
-
const params = buildChildWorkflowParams({
|
|
3490
|
+
} = await submitChildWorkflowThroughCoordinator({
|
|
3823
3491
|
env,
|
|
3492
|
+
parentRunId: runId,
|
|
3824
3493
|
body,
|
|
3825
|
-
manifest,
|
|
3826
|
-
governance,
|
|
3827
|
-
childRunId,
|
|
3828
|
-
childPlayName,
|
|
3829
|
-
childToken,
|
|
3830
|
-
orgId,
|
|
3831
3494
|
coordinatorUrl: new URL(request.url).origin,
|
|
3832
|
-
runtimeBackend: 'cf_workflows_dynamic_worker',
|
|
3833
|
-
dynamicWorkerCode:
|
|
3834
|
-
typeof manifest.bundledCode === 'string'
|
|
3835
|
-
? manifest.bundledCode
|
|
3836
|
-
: null,
|
|
3837
|
-
preloadedDbSessions:
|
|
3838
|
-
preloadedDbSessions.length > 0 ? preloadedDbSessions : null,
|
|
3839
|
-
});
|
|
3840
|
-
const submitResponse = await handleWorkflowRoute({
|
|
3841
|
-
runId: childRunId,
|
|
3842
|
-
action: 'submit',
|
|
3843
|
-
request: new Request(
|
|
3844
|
-
`https://deepline.coordinator.internal/workflow/${encodeURIComponent(
|
|
3845
|
-
childRunId,
|
|
3846
|
-
)}/submit`,
|
|
3847
|
-
{
|
|
3848
|
-
method: 'POST',
|
|
3849
|
-
headers: { 'content-type': 'application/json' },
|
|
3850
|
-
body: JSON.stringify(params),
|
|
3851
|
-
},
|
|
3852
|
-
),
|
|
3853
|
-
env,
|
|
3854
3495
|
});
|
|
3855
|
-
const responseText = await submitResponse.text().catch(() => '');
|
|
3856
3496
|
recordCoordinatorPerfTrace({
|
|
3857
3497
|
runId,
|
|
3858
3498
|
phase: 'coordinator.child_submit',
|
|
@@ -4004,7 +3644,8 @@ async function handleWorkflowRoute(input: {
|
|
|
4004
3644
|
.get('instanceId')
|
|
4005
3645
|
?.trim();
|
|
4006
3646
|
const instanceId =
|
|
4007
|
-
requestedInstanceId &&
|
|
3647
|
+
requestedInstanceId &&
|
|
3648
|
+
isWorkflowInstanceIdForRun(runId, requestedInstanceId)
|
|
4008
3649
|
? requestedInstanceId
|
|
4009
3650
|
: await resolveWorkflowInstanceIdForRun(env, runId);
|
|
4010
3651
|
instance = await env.PLAY_WORKFLOW.get(instanceId);
|
|
@@ -4064,6 +3705,20 @@ async function handleWorkflowRoute(input: {
|
|
|
4064
3705
|
: eventKey
|
|
4065
3706
|
? `integration_event_${eventKey}`
|
|
4066
3707
|
: 'integration_event';
|
|
3708
|
+
if (body.signal === 'integration_event' && eventKey) {
|
|
3709
|
+
await writeCoordinatorChildTerminalState({
|
|
3710
|
+
env,
|
|
3711
|
+
parentRunId: runId,
|
|
3712
|
+
eventKey,
|
|
3713
|
+
data: body.data ?? body,
|
|
3714
|
+
}).catch((error: unknown) => {
|
|
3715
|
+
console.warn('[coordinator] child terminal cache write failed', {
|
|
3716
|
+
runId,
|
|
3717
|
+
eventKey,
|
|
3718
|
+
error: error instanceof Error ? error.message : String(error),
|
|
3719
|
+
});
|
|
3720
|
+
});
|
|
3721
|
+
}
|
|
4067
3722
|
await instance.sendEvent({
|
|
4068
3723
|
type: workflowEventType(eventType),
|
|
4069
3724
|
payload: body,
|
|
@@ -4200,6 +3855,16 @@ function workflowInstanceId(runId: string): string {
|
|
|
4200
3855
|
return `run-${stableHash(runId)}`;
|
|
4201
3856
|
}
|
|
4202
3857
|
|
|
3858
|
+
function isWorkflowInstanceIdForRun(
|
|
3859
|
+
runId: string,
|
|
3860
|
+
instanceId: string,
|
|
3861
|
+
): boolean {
|
|
3862
|
+
const canonical = workflowInstanceId(runId);
|
|
3863
|
+
return (
|
|
3864
|
+
instanceId === canonical || instanceId.startsWith(`${canonical}-retry-`)
|
|
3865
|
+
);
|
|
3866
|
+
}
|
|
3867
|
+
|
|
4203
3868
|
function stableHash(value: string): string {
|
|
4204
3869
|
let hash = 2166136261;
|
|
4205
3870
|
for (let index = 0; index < value.length; index += 1) {
|
|
@@ -4323,10 +3988,11 @@ function loadDynamicPlayWorkerSync(
|
|
|
4323
3988
|
// miswired environments fail before user code starts.
|
|
4324
3989
|
HARNESS: env.HARNESS,
|
|
4325
3990
|
VERCEL_PROTECTION_BYPASS_TOKEN: env.VERCEL_PROTECTION_BYPASS_TOKEN,
|
|
4326
|
-
//
|
|
4327
|
-
//
|
|
4328
|
-
//
|
|
4329
|
-
|
|
3991
|
+
// Runtime API bridge used by the play harness for status, tool
|
|
3992
|
+
// execution, DB session, and artifact callbacks. This uses the
|
|
3993
|
+
// long-lived HARNESS service binding, avoiding public callback HTTP
|
|
3994
|
+
// without relying on dynamic-worker access to named exports.
|
|
3995
|
+
...makeRuntimeApiEnvBinding(env),
|
|
4330
3996
|
// In-process coordinator control bridge used by ctx.runPlay and
|
|
4331
3997
|
// parent terminal signals. This keeps scalar child plays inline with
|
|
4332
3998
|
// the parent instead of round-tripping through nested Workflow waits.
|
|
@@ -4411,7 +4077,7 @@ async function loadDynamicPlayWorker(
|
|
|
4411
4077
|
// HARNESS, and child workflow control uses the COORDINATOR binding.
|
|
4412
4078
|
HARNESS: env.HARNESS,
|
|
4413
4079
|
VERCEL_PROTECTION_BYPASS_TOKEN: env.VERCEL_PROTECTION_BYPASS_TOKEN,
|
|
4414
|
-
...makeRuntimeApiEnvBinding(),
|
|
4080
|
+
...makeRuntimeApiEnvBinding(env),
|
|
4415
4081
|
...makeCoordinatorControlBinding(),
|
|
4416
4082
|
},
|
|
4417
4083
|
};
|
|
@@ -4861,20 +4527,6 @@ async function handleCoordinatorWarmup(
|
|
|
4861
4527
|
graphHash: params.graphHash,
|
|
4862
4528
|
extra: { status: response.status, label },
|
|
4863
4529
|
});
|
|
4864
|
-
const poolRefillPromise = refillWorkflowPool(env, {
|
|
4865
|
-
waitReady: true,
|
|
4866
|
-
minAvailable: 1,
|
|
4867
|
-
}).catch(() => ({
|
|
4868
|
-
available: 0,
|
|
4869
|
-
warming: 0,
|
|
4870
|
-
target: 0,
|
|
4871
|
-
created: 0,
|
|
4872
|
-
promoted: 0,
|
|
4873
|
-
removed: 0,
|
|
4874
|
-
waitedMs: 0,
|
|
4875
|
-
waitIterations: 0,
|
|
4876
|
-
}));
|
|
4877
|
-
ctx?.waitUntil(poolRefillPromise.then(() => undefined));
|
|
4878
4530
|
let body: unknown = null;
|
|
4879
4531
|
try {
|
|
4880
4532
|
body = text ? JSON.parse(text) : null;
|
|
@@ -4897,54 +4549,26 @@ async function handleCoordinatorWarmup(
|
|
|
4897
4549
|
status: response.status,
|
|
4898
4550
|
body,
|
|
4899
4551
|
terminalState,
|
|
4900
|
-
workflowPool: await poolRefillPromise,
|
|
4901
4552
|
},
|
|
4902
4553
|
{ status: responseStatus },
|
|
4903
4554
|
);
|
|
4904
4555
|
}
|
|
4905
4556
|
|
|
4906
4557
|
/**
|
|
4907
|
-
* Returns a structured-cloneable
|
|
4908
|
-
*
|
|
4909
|
-
*
|
|
4910
|
-
*
|
|
4911
|
-
*
|
|
4912
|
-
*
|
|
4913
|
-
*
|
|
4914
|
-
* `fetch(req.baseUrl + path)` path traverses.
|
|
4915
|
-
*
|
|
4916
|
-
* Implemented as a WorkerEntrypoint (not a plain closure) because Cloudflare
|
|
4917
|
-
* Workflows serializes the dynamic Worker's env when persisting workflow
|
|
4918
|
-
* state, and closures containing captured locals aren't
|
|
4919
|
-
* structured-cloneable. WorkerEntrypoint stubs ARE cloneable — same trick
|
|
4920
|
-
* `makePlayAssetsBinding` already uses.
|
|
4921
|
-
*
|
|
4922
|
-
* Falls back transparently when Cloudflare does not expose module exports in
|
|
4923
|
-
* the current execution path: if the binding is omitted from `env`, the play
|
|
4924
|
-
* worker uses its existing `fetch(req.baseUrl + path)` transport.
|
|
4558
|
+
* Returns a structured-cloneable runtime API binding for the per-graphHash
|
|
4559
|
+
* play Worker's `env.RUNTIME_API`. We intentionally pass the long-lived
|
|
4560
|
+
* HARNESS WorkerEntrypoint service binding instead of a plain closure: the
|
|
4561
|
+
* dynamic Worker env is serialized by Cloudflare Workflows, and service
|
|
4562
|
+
* bindings are cloneable while closures are not. The per-play runtime accepts
|
|
4563
|
+
* this binding via `runtimeApiCall(...)`, so callbacks still stay on
|
|
4564
|
+
* service bindings and never fall back to public HTTP.
|
|
4925
4565
|
*/
|
|
4926
|
-
let loggedMissingRuntimeApiExport = false;
|
|
4927
4566
|
let loggedMissingCoordinatorControlExport = false;
|
|
4928
4567
|
|
|
4929
|
-
function makeRuntimeApiEnvBinding():
|
|
4930
|
-
|
|
4931
|
-
|
|
4932
|
-
|
|
4933
|
-
RuntimeApi?: (init: { props: undefined }) => {
|
|
4934
|
-
fetch(req: Request): Promise<Response>;
|
|
4935
|
-
};
|
|
4936
|
-
};
|
|
4937
|
-
const ctor = exports.RuntimeApi;
|
|
4938
|
-
if (typeof ctor !== 'function') {
|
|
4939
|
-
if (!loggedMissingRuntimeApiExport) {
|
|
4940
|
-
loggedMissingRuntimeApiExport = true;
|
|
4941
|
-
console.warn(
|
|
4942
|
-
'[coordinator] RuntimeApi is not registered on cloudflare:workers exports; using public runtime API transport.',
|
|
4943
|
-
);
|
|
4944
|
-
}
|
|
4945
|
-
return {};
|
|
4946
|
-
}
|
|
4947
|
-
return { RUNTIME_API: ctor({ props: undefined }) };
|
|
4568
|
+
function makeRuntimeApiEnvBinding(env: CoordinatorEnv): {
|
|
4569
|
+
RUNTIME_API: CoordinatorEnv['HARNESS'];
|
|
4570
|
+
} {
|
|
4571
|
+
return { RUNTIME_API: env.HARNESS };
|
|
4948
4572
|
}
|
|
4949
4573
|
|
|
4950
4574
|
function makeCoordinatorControlBinding():
|
|
@@ -4954,6 +4578,10 @@ function makeCoordinatorControlBinding():
|
|
|
4954
4578
|
parentRunId: string,
|
|
4955
4579
|
body: Record<string, unknown>,
|
|
4956
4580
|
): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
|
|
4581
|
+
submitWorkflowChild(
|
|
4582
|
+
parentRunId: string,
|
|
4583
|
+
body: Record<string, unknown>,
|
|
4584
|
+
): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
|
|
4957
4585
|
signal(
|
|
4958
4586
|
runId: string,
|
|
4959
4587
|
body: Record<string, unknown>,
|
|
@@ -4966,6 +4594,28 @@ function makeCoordinatorControlBinding():
|
|
|
4966
4594
|
runId: string,
|
|
4967
4595
|
event: CoordinatorRunEvent,
|
|
4968
4596
|
): Promise<void>;
|
|
4597
|
+
readTerminalState(
|
|
4598
|
+
runId: string,
|
|
4599
|
+
): Promise<CoordinatorTerminalState | null>;
|
|
4600
|
+
readChildTerminalState(
|
|
4601
|
+
parentRunId: string,
|
|
4602
|
+
eventKey: string,
|
|
4603
|
+
timeoutMs?: number,
|
|
4604
|
+
): Promise<CoordinatorChildTerminalState | null>;
|
|
4605
|
+
rateAcquire(input: {
|
|
4606
|
+
bucketId: string;
|
|
4607
|
+
rules: Array<{
|
|
4608
|
+
ruleId: string;
|
|
4609
|
+
requestsPerWindow: number;
|
|
4610
|
+
windowMs: number;
|
|
4611
|
+
maxConcurrency: number | null;
|
|
4612
|
+
}>;
|
|
4613
|
+
requested: number;
|
|
4614
|
+
}): Promise<{ granted: number; waitMs: number }>;
|
|
4615
|
+
ratePenalize(input: {
|
|
4616
|
+
bucketId: string;
|
|
4617
|
+
cooldownMs: number;
|
|
4618
|
+
}): Promise<void>;
|
|
4969
4619
|
};
|
|
4970
4620
|
}
|
|
4971
4621
|
| Record<string, never> {
|
|
@@ -4975,6 +4625,10 @@ function makeCoordinatorControlBinding():
|
|
|
4975
4625
|
parentRunId: string,
|
|
4976
4626
|
body: Record<string, unknown>,
|
|
4977
4627
|
): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
|
|
4628
|
+
submitWorkflowChild(
|
|
4629
|
+
parentRunId: string,
|
|
4630
|
+
body: Record<string, unknown>,
|
|
4631
|
+
): Promise<{ workflowId?: string; runId?: string; error?: unknown }>;
|
|
4978
4632
|
signal(
|
|
4979
4633
|
runId: string,
|
|
4980
4634
|
body: Record<string, unknown>,
|
|
@@ -4984,6 +4638,28 @@ function makeCoordinatorControlBinding():
|
|
|
4984
4638
|
payload: CoordinatorPerfTracePayload,
|
|
4985
4639
|
): Promise<void>;
|
|
4986
4640
|
recordRunEvent(runId: string, event: CoordinatorRunEvent): Promise<void>;
|
|
4641
|
+
readTerminalState(
|
|
4642
|
+
runId: string,
|
|
4643
|
+
): Promise<CoordinatorTerminalState | null>;
|
|
4644
|
+
readChildTerminalState(
|
|
4645
|
+
parentRunId: string,
|
|
4646
|
+
eventKey: string,
|
|
4647
|
+
timeoutMs?: number,
|
|
4648
|
+
): Promise<CoordinatorChildTerminalState | null>;
|
|
4649
|
+
rateAcquire(input: {
|
|
4650
|
+
bucketId: string;
|
|
4651
|
+
rules: Array<{
|
|
4652
|
+
ruleId: string;
|
|
4653
|
+
requestsPerWindow: number;
|
|
4654
|
+
windowMs: number;
|
|
4655
|
+
maxConcurrency: number | null;
|
|
4656
|
+
}>;
|
|
4657
|
+
requested: number;
|
|
4658
|
+
}): Promise<{ granted: number; waitMs: number }>;
|
|
4659
|
+
ratePenalize(input: {
|
|
4660
|
+
bucketId: string;
|
|
4661
|
+
cooldownMs: number;
|
|
4662
|
+
}): Promise<void>;
|
|
4987
4663
|
};
|
|
4988
4664
|
};
|
|
4989
4665
|
const ctor = exports.CoordinatorControl;
|