deepline 0.1.77 → 0.1.79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -16,6 +16,7 @@ type PlayArtifactKind = (typeof PLAY_ARTIFACT_KINDS)[keyof typeof PLAY_ARTIFACT_
16
16
  interface PlayStaticPipelineSnapshot {
17
17
  tableNamespace?: string;
18
18
  inputFields?: string[];
19
+ rowKeyFields?: string[];
19
20
  csvArg?: string;
20
21
  hasInlineData?: boolean;
21
22
  csvDescription?: string;
@@ -39,11 +40,30 @@ interface PlaySheetColumnContractSnapshot {
39
40
  outputSqlName?: string;
40
41
  stepId?: string;
41
42
  toolId?: string;
43
+ isRowKey?: boolean;
42
44
  }
43
45
  interface PlaySheetContractSnapshot {
44
46
  tableNamespace: string;
45
47
  columns: PlaySheetColumnContractSnapshot[];
46
48
  }
49
+ type PlayStaticColumnProducerKindSnapshot = 'tool' | 'waterfall' | 'stepProgram' | 'playCall' | 'transform';
50
+ interface PlayStaticColumnProducerSnapshot {
51
+ id: string;
52
+ kind: PlayStaticColumnProducerKindSnapshot;
53
+ field: string;
54
+ toolId?: string;
55
+ playId?: string;
56
+ conditional?: boolean;
57
+ sourceRange?: PlayStaticSourceRangeSnapshot;
58
+ steps?: PlayStaticColumnProducerSnapshot[];
59
+ substep: PlayStaticSubstepSnapshot;
60
+ }
61
+ interface PlayStaticDatasetColumnSnapshot {
62
+ id: string;
63
+ source: PlaySheetColumnSourceSnapshot;
64
+ sqlName?: string;
65
+ producers: PlayStaticColumnProducerSnapshot[];
66
+ }
47
67
  interface PlayStaticSourceRangeSnapshot {
48
68
  sourcePath?: string;
49
69
  startLine: number;
@@ -68,8 +88,11 @@ type PlayStaticSubstepSnapshot = PlayStaticSubstepMetadataSnapshot & ({
68
88
  name?: string;
69
89
  tableNamespace?: string;
70
90
  inputFields?: string[];
91
+ rowKeyFields?: string[];
71
92
  outputFields?: string[];
93
+ columns?: PlayStaticDatasetColumnSnapshot[];
72
94
  waterfallIds?: string[];
95
+ steps?: PlayStaticSubstepSnapshot[];
73
96
  sheetContract?: PlaySheetContractSnapshot | null;
74
97
  description?: string;
75
98
  sourceRange?: PlayStaticSourceRangeSnapshot;
@@ -775,6 +798,9 @@ interface PlayListItem {
775
798
  isDraftDirty?: boolean;
776
799
  inputSchema?: Record<string, unknown> | null;
777
800
  outputSchema?: Record<string, unknown> | null;
801
+ staticPipeline?: unknown;
802
+ currentRevision?: PlayRevisionSummary | null;
803
+ liveRevision?: PlayRevisionSummary | null;
778
804
  aliases?: string[];
779
805
  }
780
806
  interface PlayDescription {
@@ -788,6 +814,7 @@ interface PlayDescription {
788
814
  aliases: string[];
789
815
  inputSchema?: Record<string, unknown> | null;
790
816
  outputSchema?: Record<string, unknown> | null;
817
+ staticPipeline?: Record<string, unknown> | null;
791
818
  csvInput?: Record<string, unknown> | null;
792
819
  rowOutputSchema?: Record<string, unknown> | null;
793
820
  runCommand: string;
@@ -1953,7 +1980,9 @@ type PlayDatasetTransformOptions = {
1953
1980
  * Deepline keeps row progress, retries, memory use, and table output under
1954
1981
  * runtime control. Use `count()` and `peek()` for bounded inspection. Use
1955
1982
  * `materialize(limit)` or async iteration only when the dataset is intentionally
1956
- * small and bounded.
1983
+ * small and bounded. `PlayDataset` intentionally does not expose `.rows`,
1984
+ * `.toArray()`, or other array aliases; those hide the runtime cost of loading
1985
+ * persisted rows into memory.
1957
1986
  */
1958
1987
  interface PlayDataset<T> extends AsyncIterable<T> {
1959
1988
  readonly [PLAY_DATASET_BRAND]: true;
@@ -2467,7 +2496,7 @@ interface DeeplinePlayRuntimeContext {
2467
2496
  * @param options - Run options.
2468
2497
  * @returns Program output.
2469
2498
  */
2470
- runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, unknown, TOutput>, input: TInput, options?: {
2499
+ runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, any, TOutput>, input: TInput, options?: {
2471
2500
  description?: string;
2472
2501
  }): Promise<TOutput>;
2473
2502
  /**
package/dist/index.d.ts CHANGED
@@ -16,6 +16,7 @@ type PlayArtifactKind = (typeof PLAY_ARTIFACT_KINDS)[keyof typeof PLAY_ARTIFACT_
16
16
  interface PlayStaticPipelineSnapshot {
17
17
  tableNamespace?: string;
18
18
  inputFields?: string[];
19
+ rowKeyFields?: string[];
19
20
  csvArg?: string;
20
21
  hasInlineData?: boolean;
21
22
  csvDescription?: string;
@@ -39,11 +40,30 @@ interface PlaySheetColumnContractSnapshot {
39
40
  outputSqlName?: string;
40
41
  stepId?: string;
41
42
  toolId?: string;
43
+ isRowKey?: boolean;
42
44
  }
43
45
  interface PlaySheetContractSnapshot {
44
46
  tableNamespace: string;
45
47
  columns: PlaySheetColumnContractSnapshot[];
46
48
  }
49
+ type PlayStaticColumnProducerKindSnapshot = 'tool' | 'waterfall' | 'stepProgram' | 'playCall' | 'transform';
50
+ interface PlayStaticColumnProducerSnapshot {
51
+ id: string;
52
+ kind: PlayStaticColumnProducerKindSnapshot;
53
+ field: string;
54
+ toolId?: string;
55
+ playId?: string;
56
+ conditional?: boolean;
57
+ sourceRange?: PlayStaticSourceRangeSnapshot;
58
+ steps?: PlayStaticColumnProducerSnapshot[];
59
+ substep: PlayStaticSubstepSnapshot;
60
+ }
61
+ interface PlayStaticDatasetColumnSnapshot {
62
+ id: string;
63
+ source: PlaySheetColumnSourceSnapshot;
64
+ sqlName?: string;
65
+ producers: PlayStaticColumnProducerSnapshot[];
66
+ }
47
67
  interface PlayStaticSourceRangeSnapshot {
48
68
  sourcePath?: string;
49
69
  startLine: number;
@@ -68,8 +88,11 @@ type PlayStaticSubstepSnapshot = PlayStaticSubstepMetadataSnapshot & ({
68
88
  name?: string;
69
89
  tableNamespace?: string;
70
90
  inputFields?: string[];
91
+ rowKeyFields?: string[];
71
92
  outputFields?: string[];
93
+ columns?: PlayStaticDatasetColumnSnapshot[];
72
94
  waterfallIds?: string[];
95
+ steps?: PlayStaticSubstepSnapshot[];
73
96
  sheetContract?: PlaySheetContractSnapshot | null;
74
97
  description?: string;
75
98
  sourceRange?: PlayStaticSourceRangeSnapshot;
@@ -775,6 +798,9 @@ interface PlayListItem {
775
798
  isDraftDirty?: boolean;
776
799
  inputSchema?: Record<string, unknown> | null;
777
800
  outputSchema?: Record<string, unknown> | null;
801
+ staticPipeline?: unknown;
802
+ currentRevision?: PlayRevisionSummary | null;
803
+ liveRevision?: PlayRevisionSummary | null;
778
804
  aliases?: string[];
779
805
  }
780
806
  interface PlayDescription {
@@ -788,6 +814,7 @@ interface PlayDescription {
788
814
  aliases: string[];
789
815
  inputSchema?: Record<string, unknown> | null;
790
816
  outputSchema?: Record<string, unknown> | null;
817
+ staticPipeline?: Record<string, unknown> | null;
791
818
  csvInput?: Record<string, unknown> | null;
792
819
  rowOutputSchema?: Record<string, unknown> | null;
793
820
  runCommand: string;
@@ -1953,7 +1980,9 @@ type PlayDatasetTransformOptions = {
1953
1980
  * Deepline keeps row progress, retries, memory use, and table output under
1954
1981
  * runtime control. Use `count()` and `peek()` for bounded inspection. Use
1955
1982
  * `materialize(limit)` or async iteration only when the dataset is intentionally
1956
- * small and bounded.
1983
+ * small and bounded. `PlayDataset` intentionally does not expose `.rows`,
1984
+ * `.toArray()`, or other array aliases; those hide the runtime cost of loading
1985
+ * persisted rows into memory.
1957
1986
  */
1958
1987
  interface PlayDataset<T> extends AsyncIterable<T> {
1959
1988
  readonly [PLAY_DATASET_BRAND]: true;
@@ -2467,7 +2496,7 @@ interface DeeplinePlayRuntimeContext {
2467
2496
  * @param options - Run options.
2468
2497
  * @returns Program output.
2469
2498
  */
2470
- runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, unknown, TOutput>, input: TInput, options?: {
2499
+ runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, any, TOutput>, input: TInput, options?: {
2471
2500
  description?: string;
2472
2501
  }): Promise<TOutput>;
2473
2502
  /**
package/dist/index.js CHANGED
@@ -241,10 +241,10 @@ var import_node_path2 = require("path");
241
241
 
242
242
  // src/release.ts
243
243
  var SDK_RELEASE = {
244
- version: "0.1.77",
244
+ version: "0.1.79",
245
245
  apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
246
246
  supportPolicy: {
247
- latest: "0.1.77",
247
+ latest: "0.1.79",
248
248
  minimumSupported: "0.1.53",
249
249
  deprecatedBelow: "0.1.53"
250
250
  }
@@ -820,6 +820,7 @@ var DeeplineClient = class {
820
820
  aliases,
821
821
  inputSchema: options?.compact ? this.compactSchema(play.inputSchema) : play.inputSchema ?? null,
822
822
  outputSchema: options?.compact ? this.compactSchema(play.outputSchema) : play.outputSchema ?? null,
823
+ staticPipeline: isRecord(play.staticPipeline) ? play.staticPipeline : isRecord(play.currentRevision?.staticPipeline) ? play.currentRevision.staticPipeline : isRecord(play.liveRevision?.staticPipeline) ? play.liveRevision.staticPipeline : null,
823
824
  ...csvInput ? { csvInput } : {},
824
825
  ...rowOutputSchema ? { rowOutputSchema } : {},
825
826
  runCommand,
package/dist/index.mjs CHANGED
@@ -179,10 +179,10 @@ import { join as join2 } from "path";
179
179
 
180
180
  // src/release.ts
181
181
  var SDK_RELEASE = {
182
- version: "0.1.77",
182
+ version: "0.1.79",
183
183
  apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
184
184
  supportPolicy: {
185
- latest: "0.1.77",
185
+ latest: "0.1.79",
186
186
  minimumSupported: "0.1.53",
187
187
  deprecatedBelow: "0.1.53"
188
188
  }
@@ -758,6 +758,7 @@ var DeeplineClient = class {
758
758
  aliases,
759
759
  inputSchema: options?.compact ? this.compactSchema(play.inputSchema) : play.inputSchema ?? null,
760
760
  outputSchema: options?.compact ? this.compactSchema(play.outputSchema) : play.outputSchema ?? null,
761
+ staticPipeline: isRecord(play.staticPipeline) ? play.staticPipeline : isRecord(play.currentRevision?.staticPipeline) ? play.currentRevision.staticPipeline : isRecord(play.liveRevision?.staticPipeline) ? play.liveRevision.staticPipeline : null,
761
762
  ...csvInput ? { csvInput } : {},
762
763
  ...rowOutputSchema ? { rowOutputSchema } : {},
763
764
  runCommand,
@@ -53,6 +53,14 @@ import {
53
53
  decideWorkflowPlatformRetry,
54
54
  PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
55
55
  } from './workflow-retry';
56
+ import {
57
+ WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES,
58
+ WORKFLOW_RETRY_PARAMS_MAX_BYTES,
59
+ buildWorkflowRetryParams,
60
+ jsonByteLength,
61
+ workflowRetryParamsStorageKey,
62
+ type WorkflowRetryParamsRef,
63
+ } from './workflow-retry-state';
56
64
  import { sanitizeLiveLogLines } from './runtime/live-progress';
57
65
 
58
66
  export { DynamicWorkflowBinding };
@@ -563,7 +571,7 @@ const WORKFLOW_POOL_PROTOCOL_VERSION =
563
571
  const WORKFLOW_POOL_DO_NAME = 'workflow-pool:v2';
564
572
  const WORKFLOW_POOL_START_EVENT_TYPE = 'play_start';
565
573
  const WORKFLOW_POOL_TTL_MS = 8 * 60 * 1000;
566
- const WORKFLOW_POOL_TARGET_SIZE = 0;
574
+ const WORKFLOW_POOL_TARGET_SIZE = 4;
567
575
  const WORKFLOW_POOL_READY_TIMEOUT_MS = 1_500;
568
576
  const WORKFLOW_POOL_READY_POLL_MS = 250;
569
577
  const WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS = 2_500;
@@ -571,6 +579,8 @@ const WORKFLOW_POOL_REFILL_ON_MISS_MIN_AVAILABLE = 4;
571
579
  const WORKFLOW_POOL_CONTROL_TIMEOUT_MS = 750;
572
580
  const WORKFLOW_POOL_START_ACK_TIMEOUT_MS = 750;
573
581
  const WORKFLOW_POOL_START_ACK_POLL_MS = 25;
582
+ const WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS = 3_000;
583
+ const WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS = 50;
574
584
  const SUBMIT_INITIAL_STATE_MAX_WAIT_MS = 0;
575
585
  const SUBMIT_INITIAL_STATE_POLL_MS = 50;
576
586
  const WORKFLOW_RETRY_STATE_TTL_MS = 60 * 60 * 1000;
@@ -1117,6 +1127,115 @@ async function leaseWorkflowPoolId(
1117
1127
  return typeof body.id === 'string' && body.id ? body.id : null;
1118
1128
  }
1119
1129
 
1130
+ async function leaseWorkflowPoolIdWithMissRecovery(input: {
1131
+ env: CoordinatorEnv;
1132
+ runId: string;
1133
+ recordSubmitTiming: (timing: CoordinatorTiming) => void;
1134
+ graphHash?: string | null;
1135
+ }): Promise<{
1136
+ pooledInstanceId: string | null;
1137
+ missCounts: WorkflowPoolCounts | null;
1138
+ leaseError: string | null;
1139
+ }> {
1140
+ let leaseError: string | null = null;
1141
+ let pooledInstanceId = await leaseWorkflowPoolId(
1142
+ input.env,
1143
+ input.runId,
1144
+ ).catch((error) => {
1145
+ leaseError = error instanceof Error ? error.message : String(error);
1146
+ return null;
1147
+ });
1148
+ let missCounts = pooledInstanceId
1149
+ ? null
1150
+ : await workflowPoolCount(input.env).catch(() => null);
1151
+ if (
1152
+ pooledInstanceId ||
1153
+ leaseError ||
1154
+ !missCounts ||
1155
+ missCounts.available + missCounts.warming <= 0
1156
+ ) {
1157
+ return { pooledInstanceId, missCounts, leaseError };
1158
+ }
1159
+
1160
+ const recoveryStartedAt = Date.now();
1161
+ const refill = await refillWorkflowPool(input.env, {
1162
+ minAvailable: 1,
1163
+ waitReady: true,
1164
+ waitTimeoutMs: WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS,
1165
+ }).catch((error) => {
1166
+ input.recordSubmitTiming({
1167
+ phase: 'coordinator.workflow_pool_refill_on_miss',
1168
+ ms: Date.now() - recoveryStartedAt,
1169
+ graphHash: input.graphHash ?? null,
1170
+ extra: {
1171
+ status: 'failed',
1172
+ error: error instanceof Error ? error.message : String(error),
1173
+ available: missCounts?.available ?? null,
1174
+ warming: missCounts?.warming ?? null,
1175
+ },
1176
+ });
1177
+ return null;
1178
+ });
1179
+ if (refill) {
1180
+ input.recordSubmitTiming({
1181
+ phase: 'coordinator.workflow_pool_refill_on_miss',
1182
+ ms: Date.now() - recoveryStartedAt,
1183
+ graphHash: input.graphHash ?? null,
1184
+ extra: {
1185
+ status: 'ok',
1186
+ available: refill.available,
1187
+ warming: refill.warming,
1188
+ created: refill.created,
1189
+ promoted: refill.promoted,
1190
+ removed: refill.removed,
1191
+ waitedMs: refill.waitedMs,
1192
+ waitIterations: refill.waitIterations,
1193
+ },
1194
+ });
1195
+ }
1196
+
1197
+ let retryCount = 0;
1198
+ const retryStartedAt = Date.now();
1199
+ while (
1200
+ Date.now() - retryStartedAt <
1201
+ WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS
1202
+ ) {
1203
+ retryCount += 1;
1204
+ pooledInstanceId = await leaseWorkflowPoolId(input.env, input.runId).catch(
1205
+ (error) => {
1206
+ leaseError = error instanceof Error ? error.message : String(error);
1207
+ return null;
1208
+ },
1209
+ );
1210
+ if (pooledInstanceId || leaseError) {
1211
+ break;
1212
+ }
1213
+ missCounts = await workflowPoolCount(input.env).catch(() => missCounts);
1214
+ if (!missCounts || missCounts.available + missCounts.warming <= 0) {
1215
+ break;
1216
+ }
1217
+ await sleep(WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS);
1218
+ }
1219
+ input.recordSubmitTiming({
1220
+ phase: 'coordinator.workflow_pool_claim_retry',
1221
+ ms: Date.now() - retryStartedAt,
1222
+ graphHash: input.graphHash ?? null,
1223
+ extra: {
1224
+ pooled: Boolean(pooledInstanceId),
1225
+ retries: retryCount,
1226
+ ...(leaseError ? { error: leaseError } : {}),
1227
+ ...(missCounts
1228
+ ? {
1229
+ availableAfterRetry: missCounts.available,
1230
+ warmingAfterRetry: missCounts.warming,
1231
+ }
1232
+ : {}),
1233
+ },
1234
+ });
1235
+
1236
+ return { pooledInstanceId, missCounts, leaseError };
1237
+ }
1238
+
1120
1239
  async function mapRunToWorkflowInstance(input: {
1121
1240
  env: CoordinatorEnv;
1122
1241
  runId: string;
@@ -1191,56 +1310,120 @@ async function persistWorkflowRetryState(input: {
1191
1310
  runId: string;
1192
1311
  params: PlayWorkflowParams;
1193
1312
  }): Promise<void> {
1194
- const retryParams: PlayWorkflowParams = {
1195
- ...input.params,
1196
- dynamicWorkerCode: null,
1197
- contractSnapshot: stripRetrySourceSnapshot(input.params.contractSnapshot),
1198
- childPlayManifests: stripRetryChildManifestCode(
1199
- input.params.childPlayManifests,
1200
- ),
1201
- packagedFiles:
1202
- input.params.packagedFiles?.map((file) => ({
1203
- playPath: file.playPath,
1204
- storageKey: file.storageKey,
1205
- contentType: file.contentType,
1206
- bytes: file.bytes,
1207
- })) ?? null,
1313
+ const retryParams = buildWorkflowRetryParams(input.params);
1314
+ const paramsBytes = jsonByteLength(retryParams);
1315
+ if (paramsBytes > WORKFLOW_RETRY_PARAMS_MAX_BYTES) {
1316
+ throw new Error(
1317
+ `workflow retry params too large: ${paramsBytes} bytes exceeds ${WORKFLOW_RETRY_PARAMS_MAX_BYTES}. Pass large payloads as staged files or ctx.csv inputs instead of inline JSON.`,
1318
+ );
1319
+ }
1320
+ let body: {
1321
+ runId: string;
1322
+ params?: PlayWorkflowParams;
1323
+ paramsRef?: WorkflowRetryParamsRef;
1324
+ paramsBytes: number;
1325
+ ttlMs: number;
1208
1326
  };
1209
- await callWorkflowPool<{ ok?: unknown }>(input.env, '/run-retry-state-put', {
1210
- method: 'POST',
1211
- body: JSON.stringify({
1327
+ if (paramsBytes > WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES) {
1328
+ const serialized = JSON.stringify(retryParams);
1329
+ const hash = stableHash(serialized);
1330
+ const storageKey = workflowRetryParamsStorageKey({
1212
1331
  runId: input.runId,
1213
- params: retryParams,
1332
+ hash,
1333
+ });
1334
+ await input.env.PLAYS_BUCKET.put(storageKey, serialized, {
1335
+ httpMetadata: { contentType: 'application/json' },
1336
+ });
1337
+ body = {
1338
+ runId: input.runId,
1339
+ paramsRef: {
1340
+ storageKind: 'r2',
1341
+ storageKey,
1342
+ bytes: paramsBytes,
1343
+ hash,
1344
+ expiresAt: Date.now() + WORKFLOW_RETRY_STATE_TTL_MS,
1345
+ },
1346
+ paramsBytes,
1214
1347
  ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
1215
- }),
1216
- }).catch((error) => {
1217
- console.warn('[coordinator] workflow retry state persistence skipped', {
1348
+ };
1349
+ } else {
1350
+ body = {
1218
1351
  runId: input.runId,
1219
- error: error instanceof Error ? error.message : String(error),
1220
- });
1352
+ params: retryParams,
1353
+ paramsBytes,
1354
+ ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
1355
+ };
1356
+ }
1357
+ await callWorkflowPool<{ ok?: unknown }>(input.env, '/run-retry-state-put', {
1358
+ method: 'POST',
1359
+ body: JSON.stringify(body),
1221
1360
  });
1222
1361
  }
1223
1362
 
1224
- function stripRetrySourceSnapshot(snapshot: unknown): unknown {
1225
- if (!isRecord(snapshot)) return snapshot;
1226
- const rest = { ...snapshot };
1227
- delete rest.sourceCode;
1228
- delete rest.sourceFiles;
1229
- return rest;
1363
+ async function hydrateWorkflowRetryParams(input: {
1364
+ env: CoordinatorEnv;
1365
+ params: unknown;
1366
+ paramsRef: unknown;
1367
+ }): Promise<PlayWorkflowParams | null> {
1368
+ if (isRecord(input.params)) {
1369
+ return input.params as PlayWorkflowParams;
1370
+ }
1371
+ if (!isRecord(input.paramsRef)) {
1372
+ return null;
1373
+ }
1374
+ const storageKind = input.paramsRef.storageKind;
1375
+ const storageKey = input.paramsRef.storageKey;
1376
+ const expectedBytes = input.paramsRef.bytes;
1377
+ const expectedHash = input.paramsRef.hash;
1378
+ if (
1379
+ storageKind !== 'r2' ||
1380
+ typeof storageKey !== 'string' ||
1381
+ !storageKey.startsWith('plays/workflow-retry-params/') ||
1382
+ typeof expectedBytes !== 'number' ||
1383
+ !Number.isFinite(expectedBytes) ||
1384
+ typeof expectedHash !== 'string' ||
1385
+ !expectedHash
1386
+ ) {
1387
+ throw new Error('Invalid workflow retry params reference.');
1388
+ }
1389
+ const object = await input.env.PLAYS_BUCKET.get(storageKey);
1390
+ if (!object) {
1391
+ throw new Error(`Workflow retry params missing from R2: ${storageKey}`);
1392
+ }
1393
+ const text = await object.text();
1394
+ const actualBytes = new TextEncoder().encode(text).length;
1395
+ if (actualBytes !== expectedBytes) {
1396
+ throw new Error(
1397
+ `Workflow retry params byte length mismatch: expected ${expectedBytes}, got ${actualBytes}.`,
1398
+ );
1399
+ }
1400
+ const actualHash = stableHash(text);
1401
+ if (actualHash !== expectedHash) {
1402
+ throw new Error('Workflow retry params hash mismatch.');
1403
+ }
1404
+ const parsed = JSON.parse(text) as unknown;
1405
+ return isRecord(parsed) ? (parsed as PlayWorkflowParams) : null;
1230
1406
  }
1231
1407
 
1232
- function stripRetryChildManifestCode(
1233
- manifests: PlayRuntimeManifestMap | null | undefined,
1234
- ): PlayRuntimeManifestMap | null {
1235
- if (!manifests) return null;
1236
- const stripped: PlayRuntimeManifestMap = {};
1237
- for (const [key, manifest] of Object.entries(manifests)) {
1238
- const rest = { ...manifest };
1239
- delete rest.bundledCode;
1240
- delete rest.sourceCode;
1241
- stripped[key] = rest;
1242
- }
1243
- return stripped;
1408
+ function workflowRetryStatePersistenceErrorResponse(input: {
1409
+ runId: string;
1410
+ error: unknown;
1411
+ }): Response {
1412
+ const message =
1413
+ input.error instanceof Error ? input.error.message : String(input.error);
1414
+ return Response.json(
1415
+ {
1416
+ error: {
1417
+ code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
1418
+ message:
1419
+ 'Failed to persist workflow retry state before dispatching the play run.',
1420
+ phase: 'coordinator_retry_state_persistence',
1421
+ runId: input.runId,
1422
+ cause: message,
1423
+ },
1424
+ },
1425
+ { status: 503 },
1426
+ );
1244
1427
  }
1245
1428
 
1246
1429
  async function claimWorkflowPlatformRetry(input: {
@@ -1255,6 +1438,7 @@ async function claimWorkflowPlatformRetry(input: {
1255
1438
  claimed?: unknown;
1256
1439
  attempts?: unknown;
1257
1440
  params?: unknown;
1441
+ paramsRef?: unknown;
1258
1442
  }>(input.env, '/run-retry-claim', {
1259
1443
  method: 'POST',
1260
1444
  body: JSON.stringify({
@@ -1262,10 +1446,15 @@ async function claimWorkflowPlatformRetry(input: {
1262
1446
  maxAttempts: PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
1263
1447
  }),
1264
1448
  });
1449
+ const params = await hydrateWorkflowRetryParams({
1450
+ env: input.env,
1451
+ params: body.params,
1452
+ paramsRef: body.paramsRef,
1453
+ });
1265
1454
  return {
1266
1455
  claimed: body.claimed === true,
1267
1456
  attempts: typeof body.attempts === 'number' ? body.attempts : 0,
1268
- params: isRecord(body.params) ? (body.params as PlayWorkflowParams) : null,
1457
+ params,
1269
1458
  };
1270
1459
  }
1271
1460
 
@@ -1719,17 +1908,13 @@ async function submitViaPooledWorkflow(input: {
1719
1908
  return null;
1720
1909
  }
1721
1910
  const leaseStartedAt = Date.now();
1722
- let leaseError: string | null = null;
1723
- const pooledInstanceId = await leaseWorkflowPoolId(
1724
- input.env,
1725
- input.params.runId,
1726
- ).catch((error) => {
1727
- leaseError = error instanceof Error ? error.message : String(error);
1728
- return null;
1729
- });
1730
- const missCounts = pooledInstanceId
1731
- ? null
1732
- : await workflowPoolCount(input.env).catch(() => null);
1911
+ const { pooledInstanceId, missCounts, leaseError } =
1912
+ await leaseWorkflowPoolIdWithMissRecovery({
1913
+ env: input.env,
1914
+ runId: input.params.runId,
1915
+ recordSubmitTiming: input.recordSubmitTiming,
1916
+ graphHash: input.params.graphHash ?? null,
1917
+ });
1733
1918
  input.recordSubmitTiming({
1734
1919
  phase: 'coordinator.workflow_pool_lease',
1735
1920
  ms: Date.now() - leaseStartedAt,
@@ -1746,30 +1931,6 @@ async function submitViaPooledWorkflow(input: {
1746
1931
  },
1747
1932
  });
1748
1933
 
1749
- if (!pooledInstanceId) {
1750
- // A pool miss must not block the user path. Refilling is handled by the
1751
- // caller's waitUntil after submit, so fall through to cold create now.
1752
- const counts =
1753
- missCounts ?? (await workflowPoolCount(input.env).catch(() => null));
1754
- input.recordSubmitTiming({
1755
- phase: 'coordinator.workflow_pool_refill_on_miss',
1756
- ms: 0,
1757
- graphHash: input.params.graphHash ?? null,
1758
- extra: {
1759
- skipped: true,
1760
- reason: 'pool_miss_does_not_block_submit',
1761
- ...(counts
1762
- ? {
1763
- available: counts.available,
1764
- warming: counts.warming,
1765
- waitedMs: 0,
1766
- waitIterations: 0,
1767
- }
1768
- : {}),
1769
- },
1770
- });
1771
- }
1772
-
1773
1934
  if (!pooledInstanceId) {
1774
1935
  return null;
1775
1936
  }
@@ -3606,11 +3767,40 @@ async function handleWorkflowRoute(input: {
3606
3767
  params,
3607
3768
  recordSubmitTiming,
3608
3769
  });
3609
- await persistWorkflowRetryState({
3610
- env,
3611
- runId: submittedRunId,
3612
- params: workflowParams,
3613
- });
3770
+ try {
3771
+ const retryStateStartedAt = Date.now();
3772
+ await persistWorkflowRetryState({
3773
+ env,
3774
+ runId: submittedRunId,
3775
+ params: workflowParams,
3776
+ });
3777
+ recordSubmitTiming({
3778
+ phase: 'coordinator.retry_state_persistence',
3779
+ ms: Date.now() - retryStateStartedAt,
3780
+ graphHash: params.graphHash ?? null,
3781
+ });
3782
+ } catch (error) {
3783
+ const errorMessage =
3784
+ error instanceof Error ? error.message : String(error);
3785
+ console.error('[coordinator] workflow retry state persistence failed', {
3786
+ code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
3787
+ runId: submittedRunId,
3788
+ error: errorMessage,
3789
+ });
3790
+ recordSubmitTiming({
3791
+ phase: 'coordinator.retry_state_persistence',
3792
+ ms: 0,
3793
+ graphHash: params.graphHash ?? null,
3794
+ extra: {
3795
+ status: 'failed',
3796
+ error: errorMessage,
3797
+ },
3798
+ });
3799
+ return workflowRetryStatePersistenceErrorResponse({
3800
+ runId: submittedRunId,
3801
+ error,
3802
+ });
3803
+ }
3614
3804
  let instance: WorkflowInstance | null = null;
3615
3805
  try {
3616
3806
  const statusEventStartedAt = Date.now();