deepline 0.1.32 → 0.1.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,10 @@ import {
33
33
  COORDINATOR_INTERNAL_TOKEN_HEADER,
34
34
  COORDINATOR_RUN_SCOPE_HEADER,
35
35
  } from '../../../shared_libs/play-runtime/coordinator-headers';
36
+ import {
37
+ decideWorkflowPlatformRetry,
38
+ PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
39
+ } from './workflow-retry';
36
40
  import { sanitizeLiveLogLines } from './runtime/live-progress';
37
41
 
38
42
  export { DynamicWorkflowBinding };
@@ -551,6 +555,11 @@ const WORKFLOW_POOL_START_ACK_TIMEOUT_MS = 750;
551
555
  const WORKFLOW_POOL_START_ACK_POLL_MS = 25;
552
556
  const SUBMIT_INITIAL_STATE_MAX_WAIT_MS = 0;
553
557
  const SUBMIT_INITIAL_STATE_POLL_MS = 50;
558
+ const WORKFLOW_RETRY_STATE_TTL_MS = 60 * 60 * 1000;
559
+ const WORKFLOW_POOL_PREWARM_ESCALATE_TARGET_AFTER_MS = 250;
560
+ const WORKFLOW_POOL_SCHEDULED_REFILL_MIN_AVAILABLE = 1;
561
+ const WORKFLOW_POOL_SCHEDULED_REFILL_TIMEOUT_MS = 10_000;
562
+
554
563
  function buildDynamicWorkflowMetadata(
555
564
  params: PlayWorkflowParams,
556
565
  ): DynamicWorkflowMetadata {
@@ -954,6 +963,176 @@ async function readWorkflowPoolRunMapping(input: {
954
963
  };
955
964
  }
956
965
 
966
/**
 * Stores a trimmed copy of a run's workflow params in the workflow pool so the
 * run can later be restarted from them.
 *
 * The stored copy nulls `dynamicWorkerCode`, passes the contract snapshot and
 * child manifests through the strip helpers, and keeps only `playPath`,
 * `storageKey`, `contentType` and `bytes` for each packaged file.
 *
 * Best effort: a rejected pool call is logged as a warning and not rethrown.
 *
 * @param input.env    Coordinator environment handed to the pool call.
 * @param input.runId  Run the retry state is stored under.
 * @param input.params Params the workflow was submitted with.
 */
async function persistWorkflowRetryState(input: {
  env: CoordinatorEnv;
  runId: string;
  params: PlayWorkflowParams;
}): Promise<void> {
  const { env, runId, params } = input;

  const contractSnapshot = stripRetrySourceSnapshot(params.contractSnapshot);
  const childPlayManifests = stripRetryChildManifestCode(
    params.childPlayManifests,
  );
  const packagedFiles =
    params.packagedFiles?.map(
      ({ playPath, storageKey, contentType, bytes }) => ({
        playPath,
        storageKey,
        contentType,
        bytes,
      }),
    ) ?? null;

  const retryParams: PlayWorkflowParams = {
    ...params,
    dynamicWorkerCode: null,
    contractSnapshot,
    childPlayManifests,
    packagedFiles,
  };

  // Start the request outside the try block so that only a rejected pool call
  // is swallowed; anything thrown while building the request still propagates.
  const pending = callWorkflowPool<{ ok?: unknown }>(
    env,
    '/run-retry-state-put',
    {
      method: 'POST',
      body: JSON.stringify({
        runId,
        params: retryParams,
        ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
      }),
    },
  );
  try {
    await pending;
  } catch (error) {
    console.warn('[coordinator] workflow retry state persistence skipped', {
      runId,
      error: error instanceof Error ? error.message : String(error),
    });
  }
}
999
+
1000
/**
 * Returns a shallow copy of `snapshot` without its `sourceCode` and
 * `sourceFiles` properties. Values that `isRecord` rejects are returned
 * unchanged. The input object is never mutated.
 */
function stripRetrySourceSnapshot(snapshot: unknown): unknown {
  if (isRecord(snapshot)) {
    // Rest destructuring copies every remaining own enumerable property.
    const {
      sourceCode: _sourceCode,
      sourceFiles: _sourceFiles,
      ...retained
    } = snapshot;
    return retained;
  }
  return snapshot;
}
1007
+
1008
/**
 * Builds a new manifest map in which every manifest is a shallow copy with
 * `bundledCode` and `sourceCode` removed.
 *
 * @param manifests Manifest map to slim down; may be null or undefined.
 * @returns The slimmed map, or `null` when `manifests` is falsy.
 */
function stripRetryChildManifestCode(
  manifests: PlayRuntimeManifestMap | null | undefined,
): PlayRuntimeManifestMap | null {
  if (!manifests) {
    return null;
  }
  const slimmed: PlayRuntimeManifestMap = {};
  Object.entries(manifests).forEach(([name, fullManifest]) => {
    // Copy first so the caller's manifest objects are left untouched.
    const copy = { ...fullManifest };
    delete copy.sourceCode;
    delete copy.bundledCode;
    slimmed[name] = copy;
  });
  return slimmed;
}
1021
+
1022
/**
 * Asks the workflow pool to claim one platform-retry attempt for `runId`,
 * capped at `PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT` attempts.
 *
 * The pool's reply is treated as untrusted and normalized field by field.
 *
 * @returns `claimed` is true only when the reply says exactly `true`;
 *          `attempts` falls back to 0 when the reply has no numeric value;
 *          `params` is null unless the reply carries a record.
 * @throws Whatever `callWorkflowPool` rejects with; callers decide how to
 *         handle a failed claim.
 */
async function claimWorkflowPlatformRetry(input: {
  env: CoordinatorEnv;
  runId: string;
}): Promise<{
  claimed: boolean;
  attempts: number;
  params: PlayWorkflowParams | null;
}> {
  const requestBody = JSON.stringify({
    runId: input.runId,
    maxAttempts: PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
  });
  const reply = await callWorkflowPool<{
    claimed?: unknown;
    attempts?: unknown;
    params?: unknown;
  }>(input.env, '/run-retry-claim', {
    method: 'POST',
    body: requestBody,
  });

  const claimed = reply.claimed === true;
  const attempts = typeof reply.attempts === 'number' ? reply.attempts : 0;
  const params = isRecord(reply.params)
    ? (reply.params as PlayWorkflowParams)
    : null;
  return { claimed, attempts, params };
}
1047
+
1048
/**
 * Derives the workflow instance id used for retry number `attempt` of a run.
 *
 * The id is `<base>-retry-<attempt>-<hash8>`, where `<base>` is the run's
 * regular instance id and `<hash8>` is the first 8 characters of
 * `stableHash("<runId>:<attempt>")`. The base is truncated so the result stays
 * within 100 characters (presumably the platform's instance-id limit — confirm),
 * while always keeping at least one base character.
 */
function workflowRetryInstanceId(runId: string, attempt: number): string {
  const hashPrefix = stableHash(`${runId}:${attempt}`).slice(0, 8);
  const retrySuffix = ['retry', String(attempt), hashPrefix].join('-');
  const baseId = workflowInstanceId(runId);
  // Reserve room for the suffix plus the joining dash.
  const baseBudget = Math.max(1, 100 - retrySuffix.length - 1);
  return [baseId.slice(0, baseBudget), retrySuffix].join('-');
}
1054
+
1055
/**
 * Restarts a run on a fresh workflow instance when its current instance
 * failed in a way `decideWorkflowPlatformRetry` classifies as retryable.
 *
 * Steps, in order:
 *  1. Classify the failure; stop unless the decision is `'retry'`.
 *  2. Claim a retry attempt from the workflow pool; stop if the claim fails,
 *     is refused, or comes back without stored params.
 *  3. Create the retry instance from the stored params, point the run at it,
 *     schedule termination of the old instance (only when `ctx` is present),
 *     and record a perf-trace entry.
 *
 * The classifier is always called with `retryAttempts: 0`; the attempt cap is
 * sent to the pool as `maxAttempts` on the claim instead.
 *
 * @returns `{ retried: false, result: null }` when no retry was started,
 *          otherwise `retried: true` with a `'running'` result payload that
 *          describes the retry.
 * @throws Errors from creating or mapping the retry instance propagate; the
 *         retry instance stub is disposed either way.
 */
async function restartWorkflowAfterPlatformReset(input: {
  env: CoordinatorEnv;
  ctx?: ExecutionContext;
  runId: string;
  oldInstance: WorkflowInstance;
  error: string;
  status: InstanceStatus;
}): Promise<{
  retried: boolean;
  result: Record<string, unknown> | null;
}> {
  const { env, ctx, runId, oldInstance } = input;

  const decision = decideWorkflowPlatformRetry({
    workflowStatus: String(input.status.status ?? ''),
    error: input.error,
    retryAttempts: 0,
  });
  if (decision.action !== 'retry') {
    return { retried: false, result: null };
  }

  let claim: Awaited<ReturnType<typeof claimWorkflowPlatformRetry>> | null;
  try {
    claim = await claimWorkflowPlatformRetry({ env, runId });
  } catch (error) {
    console.warn('[coordinator] workflow platform retry claim failed', {
      runId,
      error: error instanceof Error ? error.message : String(error),
    });
    claim = null;
  }
  if (!claim?.claimed || !claim.params) {
    return { retried: false, result: null };
  }

  const { attempts: retryAttempt, params: retryParams } = claim;
  const retryInstanceId = workflowRetryInstanceId(runId, retryAttempt);
  const retryStartedAt = Date.now();
  let retryInstance: WorkflowInstance | null = null;
  try {
    retryInstance = await createDynamicWorkflowInstance({
      env,
      id: retryInstanceId,
      params: retryParams,
    });
    await mapRunToWorkflowInstance({
      env,
      runId,
      instanceId: retryInstance.id,
      started: true,
    });
    // Termination is fire-and-forget and is only attempted when an execution
    // context is available to keep the promise alive.
    ctx?.waitUntil(oldInstance.terminate().catch(() => undefined));
    recordCoordinatorPerfTraceBuffered(env, ctx, {
      runId,
      phase: 'coordinator.platform_deploy_retry',
      ms: Date.now() - retryStartedAt,
      graphHash: retryParams.graphHash ?? null,
      extra: {
        retryAttempt,
        retryInstanceId: retryInstance.id,
        reason: decision.reason,
      },
    });

    const result: Record<string, unknown> = {
      runId,
      playName: retryParams.playName,
      status: 'running',
      result: null,
      error: null,
      retry: {
        reason: decision.reason,
        attempt: retryAttempt,
        message: decision.message,
      },
    };
    return { retried: true, result };
  } finally {
    disposeRpcStub(retryInstance);
  }
}
1135
+
957
1136
  async function waitForWorkflowPoolStartAck(input: {
958
1137
  env: CoordinatorEnv;
959
1138
  runId: string;
@@ -2979,9 +3158,14 @@ async function handleWorkflowRoute(input: {
2979
3158
  error: error instanceof Error ? error.message : String(error),
2980
3159
  },
2981
3160
  });
2982
- });
3161
+ });
2983
3162
  input.ctx?.waitUntil(prewarmPromise);
2984
3163
  }
3164
+ await persistWorkflowRetryState({
3165
+ env,
3166
+ runId: submittedRunId,
3167
+ params,
3168
+ });
2985
3169
  let instance: WorkflowInstance | null = null;
2986
3170
  try {
2987
3171
  const statusEventStartedAt = Date.now();
@@ -3500,6 +3684,33 @@ async function handleWorkflowRoute(input: {
3500
3684
  });
3501
3685
  }
3502
3686
  const status = await instance.status();
3687
+ const workflowError = readWorkflowError(status);
3688
+ if (workflowError) {
3689
+ const retry = await restartWorkflowAfterPlatformReset({
3690
+ env,
3691
+ ctx: input.ctx,
3692
+ runId,
3693
+ oldInstance: instance,
3694
+ error: workflowError,
3695
+ status,
3696
+ });
3697
+ if (retry.retried && retry.result) {
3698
+ const coordinatorTrace = includeTrace
3699
+ ? await listCoordinatorPerfTrace(env, runId).catch(() => [])
3700
+ : [];
3701
+ return Response.json({
3702
+ ...retry.result,
3703
+ coordinatorObserve: {
3704
+ ms: Date.now() - statusStartedAt,
3705
+ waitMs: 0,
3706
+ workflowStatus: 'retrying',
3707
+ statusPolls: 1,
3708
+ instanceId: instance.id,
3709
+ },
3710
+ ...(includeTrace ? { coordinatorTrace } : {}),
3711
+ });
3712
+ }
3713
+ }
3503
3714
  const result = mapWorkflowResult(runId, status);
3504
3715
  const observeMs = Date.now() - statusStartedAt;
3505
3716
  // If we forced a permanent-error fail-fast (status='failed' even
@@ -75,6 +75,14 @@ type WorkflowRunMapping = {
75
75
  expiresAt: number;
76
76
  };
77
77
 
78
/** Per-run retry record stored under the `r:` key prefix. */
type WorkflowRunRetryState = {
  /** Run this record belongs to. */
  runId: string;
  /** Workflow params as received on the put request; handed back on claim. */
  params: unknown;
  /** Number of retry attempts claimed so far; incremented by each claim. */
  retryAttempts: number;
  /** Epoch milliseconds of the last put or successful claim. */
  updatedAt: number;
  /** Epoch milliseconds at or after which the record counts as expired. */
  expiresAt: number;
};
85
+
78
86
  type CoordinatorTraceEntry = {
79
87
  ts: number;
80
88
  source: 'coordinator' | 'dynamic_worker';
@@ -157,6 +165,7 @@ type WorkflowPoolCounts = {
157
165
  const DEDUP_KEY_PREFIX = 'd:';
158
166
  const WORKFLOW_POOL_KEY_PREFIX = 'p:';
159
167
  const WORKFLOW_POOL_RUN_KEY_PREFIX = 'm:';
168
+ const WORKFLOW_RUN_RETRY_KEY_PREFIX = 'r:';
160
169
  const COORDINATOR_TRACE_KEY_PREFIX = 't:';
161
170
  const COORDINATOR_RUN_EVENT_KEY_PREFIX = 'e:';
162
171
  const COORDINATOR_TERMINAL_KEY = 'terminal';
@@ -167,6 +176,11 @@ const FINISH_ALARM_DELAY_MS = 60_000; // self-evict 1 min after finish() called
167
176
  const WORKFLOW_POOL_DEFAULT_TTL_MS = 8 * 60 * 1000;
168
177
  const WORKFLOW_POOL_RUN_MAPPING_TTL_MS = 60 * 60 * 1000;
169
178
  const WORKFLOW_POOL_READY_MAX_AGE_MS = 7 * 60_000;
179
+ const WORKFLOW_RUN_RETRY_STATE_MAX_BYTES = 110_000;
180
+
181
/**
 * Size in bytes of `value` once serialized with `JSON.stringify` and encoded
 * as UTF-8.
 */
function jsonByteLength(value: unknown): number {
  const encoder = new TextEncoder();
  const serialized = JSON.stringify(value);
  const utf8 = encoder.encode(serialized);
  return utf8.byteLength;
}
170
184
 
171
185
  interface DedupEnv {
172
186
  PLAY_DEDUP: DurableObjectNamespace;
@@ -234,6 +248,10 @@ export class PlayDedup implements DurableObject {
234
248
  return await this.handlePoolBlockRun(req);
235
249
  case '/pool-resolve-run':
236
250
  return await this.handlePoolResolveRun(req);
251
+ case '/run-retry-state-put':
252
+ return await this.handleRunRetryStatePut(req);
253
+ case '/run-retry-claim':
254
+ return await this.handleRunRetryClaim(req);
237
255
  case '/pool-clear':
238
256
  return await this.handlePoolClear(req);
239
257
  case '/trace-add':
@@ -466,13 +484,16 @@ export class PlayDedup implements DurableObject {
466
484
  now = Date.now(),
467
485
  version?: string,
468
486
  ): Promise<void> {
469
- const [pool, mappings] = await Promise.all([
487
+ const [pool, mappings, retries] = await Promise.all([
470
488
  this.state.storage.list<WorkflowPoolEntry>({
471
489
  prefix: WORKFLOW_POOL_KEY_PREFIX,
472
490
  }),
473
491
  this.state.storage.list<WorkflowRunMapping>({
474
492
  prefix: WORKFLOW_POOL_RUN_KEY_PREFIX,
475
493
  }),
494
+ this.state.storage.list<WorkflowRunRetryState>({
495
+ prefix: WORKFLOW_RUN_RETRY_KEY_PREFIX,
496
+ }),
476
497
  ]);
477
498
  const expiredKeys: string[] = [];
478
499
  for (const [key, entry] of pool) {
@@ -496,6 +517,11 @@ export class PlayDedup implements DurableObject {
496
517
  expiredKeys.push(key);
497
518
  }
498
519
  }
520
+ for (const [key, retryState] of retries) {
521
+ if (!retryState || retryState.expiresAt <= now) {
522
+ expiredKeys.push(key);
523
+ }
524
+ }
499
525
  if (expiredKeys.length > 0) {
500
526
  await this.state.storage.delete(expiredKeys);
501
527
  }
@@ -963,15 +989,115 @@ export class PlayDedup implements DurableObject {
963
989
  );
964
990
  }
965
991
 
992
/**
 * Handles `/run-retry-state-put`: stores (or refreshes) the retry record for
 * a run.
 *
 * Request body: `{ runId: string, params: unknown, ttlMs?: number }`.
 *  - `ttlMs` is clamped to between 60 seconds and
 *    `WORKFLOW_POOL_RUN_MAPPING_TTL_MS`; a missing or non-finite value uses
 *    that maximum.
 *  - An existing record for the same run keeps its `retryAttempts` counter,
 *    so re-persisting params does not reset the retry budget.
 *
 * Responses:
 *  - 400 when `runId` or `params` is missing.
 *  - 413 when the serialized record exceeds
 *    `WORKFLOW_RUN_RETRY_STATE_MAX_BYTES`; nothing is stored.
 *  - 200 `{ ok: true }` once the record is stored.
 */
private async handleRunRetryStatePut(req: Request): Promise<Response> {
  const body = (await req.json().catch(() => null)) as {
    runId?: unknown;
    params?: unknown;
    ttlMs?: unknown;
  } | null;
  const runId = typeof body?.runId === 'string' ? body.runId : '';
  if (!runId || !body || !('params' in body)) {
    return new Response('runId and params are required', { status: 400 });
  }
  const now = Date.now();
  const ttlMs =
    typeof body.ttlMs === 'number' && Number.isFinite(body.ttlMs)
      ? Math.max(
          60_000,
          Math.min(body.ttlMs, WORKFLOW_POOL_RUN_MAPPING_TTL_MS),
        )
      : WORKFLOW_POOL_RUN_MAPPING_TTL_MS;
  const key = `${WORKFLOW_RUN_RETRY_KEY_PREFIX}${runId}`;

  // An exception escaping a blockConcurrencyWhile callback terminates and
  // resets the whole Durable Object, so an oversized payload is reported out
  // of the callback as a value instead of being thrown.
  const oversizeBytes = await this.state.blockConcurrencyWhile(
    async (): Promise<number | null> => {
      const existing = await this.state.storage.get<WorkflowRunRetryState>(key);
      const retryState = {
        runId,
        params: body.params,
        retryAttempts:
          existing?.runId === runId &&
          typeof existing.retryAttempts === 'number'
            ? existing.retryAttempts
            : 0,
        updatedAt: now,
        expiresAt: now + ttlMs,
      } satisfies WorkflowRunRetryState;
      const bytes = jsonByteLength(retryState);
      if (bytes > WORKFLOW_RUN_RETRY_STATE_MAX_BYTES) {
        return bytes;
      }
      await this.state.storage.put(key, retryState);
      return null;
    },
  );
  if (oversizeBytes !== null) {
    return new Response(
      `workflow retry state too large: ${oversizeBytes} bytes exceeds ${WORKFLOW_RUN_RETRY_STATE_MAX_BYTES}`,
      { status: 413 },
    );
  }
  return new Response(JSON.stringify({ ok: true }), {
    headers: { 'content-type': 'application/json' },
  });
}
1036
+
1037
/**
 * Handles `/run-retry-claim`: claims one retry attempt for a run.
 *
 * Request body: `{ runId: string, maxAttempts?: number }`. `maxAttempts` is
 * floored and clamped to be non-negative; a missing or non-finite value
 * means 1.
 *
 * Response body is `{ claimed, attempts, params }`:
 *  - no record, or record expired: `claimed: false`, `attempts: 0`,
 *    `params: null`;
 *  - attempts already at the cap: `claimed: false` with the stored attempts
 *    and params;
 *  - otherwise the counter is incremented, the expiry is pushed out by
 *    `WORKFLOW_POOL_RUN_MAPPING_TTL_MS`, and `claimed: true` is returned with
 *    the new attempt number and the stored params.
 *
 * Returns 400 when `runId` is missing.
 */
private async handleRunRetryClaim(req: Request): Promise<Response> {
  const body = (await req.json().catch(() => null)) as {
    runId?: unknown;
    maxAttempts?: unknown;
  } | null;
  const runId = typeof body?.runId === 'string' ? body.runId : '';
  if (!runId) {
    return new Response('runId is required', { status: 400 });
  }

  let maxAttempts = 1;
  if (
    typeof body?.maxAttempts === 'number' &&
    Number.isFinite(body.maxAttempts)
  ) {
    maxAttempts = Math.max(0, Math.floor(body.maxAttempts));
  }

  const now = Date.now();
  const key = `${WORKFLOW_RUN_RETRY_KEY_PREFIX}${runId}`;
  let response: Record<string, unknown> = {
    claimed: false,
    attempts: 0,
    params: null,
  };

  // The read-check-increment sequence runs with other events blocked so two
  // concurrent claims cannot both take the same attempt.
  await this.state.blockConcurrencyWhile(async () => {
    const stored = await this.state.storage.get<WorkflowRunRetryState>(key);
    if (!stored || stored.expiresAt <= now) {
      response = { claimed: false, attempts: 0, params: null };
      return;
    }

    const { retryAttempts: usedAttempts, params: storedParams } = stored;
    if (usedAttempts >= maxAttempts) {
      response = {
        claimed: false,
        attempts: usedAttempts,
        params: storedParams,
      };
      return;
    }

    const claimedAttempt = usedAttempts + 1;
    const updated = {
      ...stored,
      retryAttempts: claimedAttempt,
      updatedAt: now,
      expiresAt: now + WORKFLOW_POOL_RUN_MAPPING_TTL_MS,
    } satisfies WorkflowRunRetryState;
    await this.state.storage.put(key, updated);
    response = {
      claimed: true,
      attempts: claimedAttempt,
      params: storedParams,
    };
  });

  return new Response(JSON.stringify(response), {
    headers: { 'content-type': 'application/json' },
  });
}
1088
+
966
1089
  private async handlePoolClear(req: Request): Promise<Response> {
967
1090
  const version = this.workflowPoolVersion(req);
968
- const [pool, mappings] = await Promise.all([
1091
+ const [pool, mappings, retries] = await Promise.all([
969
1092
  this.state.storage.list<WorkflowPoolEntry>({
970
1093
  prefix: WORKFLOW_POOL_KEY_PREFIX,
971
1094
  }),
972
1095
  this.state.storage.list<WorkflowRunMapping>({
973
1096
  prefix: WORKFLOW_POOL_RUN_KEY_PREFIX,
974
1097
  }),
1098
+ this.state.storage.list<WorkflowRunRetryState>({
1099
+ prefix: WORKFLOW_RUN_RETRY_KEY_PREFIX,
1100
+ }),
975
1101
  ]);
976
1102
  const keys = [
977
1103
  ...[...pool.entries()]
@@ -980,6 +1106,7 @@ export class PlayDedup implements DurableObject {
980
1106
  ...[...mappings.entries()]
981
1107
  .filter(([, entry]) => !version || entry.version === version)
982
1108
  .map(([key]) => key),
1109
+ ...[...retries.keys()],
983
1110
  ];
984
1111
  if (keys.length > 0) {
985
1112
  await this.state.storage.delete(keys);