deepline 0.1.77 → 0.1.79
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -8
- package/dist/cli/index.js +525 -355
- package/dist/cli/index.mjs +538 -368
- package/dist/index.d.mts +31 -2
- package/dist/index.d.ts +31 -2
- package/dist/index.js +3 -2
- package/dist/index.mjs +3 -2
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +273 -83
- package/dist/repo/apps/play-runner-workers/src/dedup-do.ts +18 -3
- package/dist/repo/apps/play-runner-workers/src/workflow-retry-state.ts +203 -0
- package/dist/repo/sdk/src/client.ts +7 -0
- package/dist/repo/sdk/src/play.ts +1 -1
- package/dist/repo/sdk/src/release.ts +2 -2
- package/dist/repo/sdk/src/types.ts +4 -0
- package/dist/repo/shared_libs/plays/dataset.ts +3 -1
- package/dist/repo/shared_libs/plays/static-pipeline.ts +261 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -16,6 +16,7 @@ type PlayArtifactKind = (typeof PLAY_ARTIFACT_KINDS)[keyof typeof PLAY_ARTIFACT_
|
|
|
16
16
|
interface PlayStaticPipelineSnapshot {
|
|
17
17
|
tableNamespace?: string;
|
|
18
18
|
inputFields?: string[];
|
|
19
|
+
rowKeyFields?: string[];
|
|
19
20
|
csvArg?: string;
|
|
20
21
|
hasInlineData?: boolean;
|
|
21
22
|
csvDescription?: string;
|
|
@@ -39,11 +40,30 @@ interface PlaySheetColumnContractSnapshot {
|
|
|
39
40
|
outputSqlName?: string;
|
|
40
41
|
stepId?: string;
|
|
41
42
|
toolId?: string;
|
|
43
|
+
isRowKey?: boolean;
|
|
42
44
|
}
|
|
43
45
|
interface PlaySheetContractSnapshot {
|
|
44
46
|
tableNamespace: string;
|
|
45
47
|
columns: PlaySheetColumnContractSnapshot[];
|
|
46
48
|
}
|
|
49
|
+
type PlayStaticColumnProducerKindSnapshot = 'tool' | 'waterfall' | 'stepProgram' | 'playCall' | 'transform';
|
|
50
|
+
interface PlayStaticColumnProducerSnapshot {
|
|
51
|
+
id: string;
|
|
52
|
+
kind: PlayStaticColumnProducerKindSnapshot;
|
|
53
|
+
field: string;
|
|
54
|
+
toolId?: string;
|
|
55
|
+
playId?: string;
|
|
56
|
+
conditional?: boolean;
|
|
57
|
+
sourceRange?: PlayStaticSourceRangeSnapshot;
|
|
58
|
+
steps?: PlayStaticColumnProducerSnapshot[];
|
|
59
|
+
substep: PlayStaticSubstepSnapshot;
|
|
60
|
+
}
|
|
61
|
+
interface PlayStaticDatasetColumnSnapshot {
|
|
62
|
+
id: string;
|
|
63
|
+
source: PlaySheetColumnSourceSnapshot;
|
|
64
|
+
sqlName?: string;
|
|
65
|
+
producers: PlayStaticColumnProducerSnapshot[];
|
|
66
|
+
}
|
|
47
67
|
interface PlayStaticSourceRangeSnapshot {
|
|
48
68
|
sourcePath?: string;
|
|
49
69
|
startLine: number;
|
|
@@ -68,8 +88,11 @@ type PlayStaticSubstepSnapshot = PlayStaticSubstepMetadataSnapshot & ({
|
|
|
68
88
|
name?: string;
|
|
69
89
|
tableNamespace?: string;
|
|
70
90
|
inputFields?: string[];
|
|
91
|
+
rowKeyFields?: string[];
|
|
71
92
|
outputFields?: string[];
|
|
93
|
+
columns?: PlayStaticDatasetColumnSnapshot[];
|
|
72
94
|
waterfallIds?: string[];
|
|
95
|
+
steps?: PlayStaticSubstepSnapshot[];
|
|
73
96
|
sheetContract?: PlaySheetContractSnapshot | null;
|
|
74
97
|
description?: string;
|
|
75
98
|
sourceRange?: PlayStaticSourceRangeSnapshot;
|
|
@@ -775,6 +798,9 @@ interface PlayListItem {
|
|
|
775
798
|
isDraftDirty?: boolean;
|
|
776
799
|
inputSchema?: Record<string, unknown> | null;
|
|
777
800
|
outputSchema?: Record<string, unknown> | null;
|
|
801
|
+
staticPipeline?: unknown;
|
|
802
|
+
currentRevision?: PlayRevisionSummary | null;
|
|
803
|
+
liveRevision?: PlayRevisionSummary | null;
|
|
778
804
|
aliases?: string[];
|
|
779
805
|
}
|
|
780
806
|
interface PlayDescription {
|
|
@@ -788,6 +814,7 @@ interface PlayDescription {
|
|
|
788
814
|
aliases: string[];
|
|
789
815
|
inputSchema?: Record<string, unknown> | null;
|
|
790
816
|
outputSchema?: Record<string, unknown> | null;
|
|
817
|
+
staticPipeline?: Record<string, unknown> | null;
|
|
791
818
|
csvInput?: Record<string, unknown> | null;
|
|
792
819
|
rowOutputSchema?: Record<string, unknown> | null;
|
|
793
820
|
runCommand: string;
|
|
@@ -1953,7 +1980,9 @@ type PlayDatasetTransformOptions = {
|
|
|
1953
1980
|
* Deepline keeps row progress, retries, memory use, and table output under
|
|
1954
1981
|
* runtime control. Use `count()` and `peek()` for bounded inspection. Use
|
|
1955
1982
|
* `materialize(limit)` or async iteration only when the dataset is intentionally
|
|
1956
|
-
* small and bounded.
|
|
1983
|
+
* small and bounded. `PlayDataset` intentionally does not expose `.rows`,
|
|
1984
|
+
* `.toArray()`, or other array aliases; those hide the runtime cost of loading
|
|
1985
|
+
* persisted rows into memory.
|
|
1957
1986
|
*/
|
|
1958
1987
|
interface PlayDataset<T> extends AsyncIterable<T> {
|
|
1959
1988
|
readonly [PLAY_DATASET_BRAND]: true;
|
|
@@ -2467,7 +2496,7 @@ interface DeeplinePlayRuntimeContext {
|
|
|
2467
2496
|
* @param options - Run options.
|
|
2468
2497
|
* @returns Program output.
|
|
2469
2498
|
*/
|
|
2470
|
-
runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput,
|
|
2499
|
+
runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, any, TOutput>, input: TInput, options?: {
|
|
2471
2500
|
description?: string;
|
|
2472
2501
|
}): Promise<TOutput>;
|
|
2473
2502
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -16,6 +16,7 @@ type PlayArtifactKind = (typeof PLAY_ARTIFACT_KINDS)[keyof typeof PLAY_ARTIFACT_
|
|
|
16
16
|
interface PlayStaticPipelineSnapshot {
|
|
17
17
|
tableNamespace?: string;
|
|
18
18
|
inputFields?: string[];
|
|
19
|
+
rowKeyFields?: string[];
|
|
19
20
|
csvArg?: string;
|
|
20
21
|
hasInlineData?: boolean;
|
|
21
22
|
csvDescription?: string;
|
|
@@ -39,11 +40,30 @@ interface PlaySheetColumnContractSnapshot {
|
|
|
39
40
|
outputSqlName?: string;
|
|
40
41
|
stepId?: string;
|
|
41
42
|
toolId?: string;
|
|
43
|
+
isRowKey?: boolean;
|
|
42
44
|
}
|
|
43
45
|
interface PlaySheetContractSnapshot {
|
|
44
46
|
tableNamespace: string;
|
|
45
47
|
columns: PlaySheetColumnContractSnapshot[];
|
|
46
48
|
}
|
|
49
|
+
type PlayStaticColumnProducerKindSnapshot = 'tool' | 'waterfall' | 'stepProgram' | 'playCall' | 'transform';
|
|
50
|
+
interface PlayStaticColumnProducerSnapshot {
|
|
51
|
+
id: string;
|
|
52
|
+
kind: PlayStaticColumnProducerKindSnapshot;
|
|
53
|
+
field: string;
|
|
54
|
+
toolId?: string;
|
|
55
|
+
playId?: string;
|
|
56
|
+
conditional?: boolean;
|
|
57
|
+
sourceRange?: PlayStaticSourceRangeSnapshot;
|
|
58
|
+
steps?: PlayStaticColumnProducerSnapshot[];
|
|
59
|
+
substep: PlayStaticSubstepSnapshot;
|
|
60
|
+
}
|
|
61
|
+
interface PlayStaticDatasetColumnSnapshot {
|
|
62
|
+
id: string;
|
|
63
|
+
source: PlaySheetColumnSourceSnapshot;
|
|
64
|
+
sqlName?: string;
|
|
65
|
+
producers: PlayStaticColumnProducerSnapshot[];
|
|
66
|
+
}
|
|
47
67
|
interface PlayStaticSourceRangeSnapshot {
|
|
48
68
|
sourcePath?: string;
|
|
49
69
|
startLine: number;
|
|
@@ -68,8 +88,11 @@ type PlayStaticSubstepSnapshot = PlayStaticSubstepMetadataSnapshot & ({
|
|
|
68
88
|
name?: string;
|
|
69
89
|
tableNamespace?: string;
|
|
70
90
|
inputFields?: string[];
|
|
91
|
+
rowKeyFields?: string[];
|
|
71
92
|
outputFields?: string[];
|
|
93
|
+
columns?: PlayStaticDatasetColumnSnapshot[];
|
|
72
94
|
waterfallIds?: string[];
|
|
95
|
+
steps?: PlayStaticSubstepSnapshot[];
|
|
73
96
|
sheetContract?: PlaySheetContractSnapshot | null;
|
|
74
97
|
description?: string;
|
|
75
98
|
sourceRange?: PlayStaticSourceRangeSnapshot;
|
|
@@ -775,6 +798,9 @@ interface PlayListItem {
|
|
|
775
798
|
isDraftDirty?: boolean;
|
|
776
799
|
inputSchema?: Record<string, unknown> | null;
|
|
777
800
|
outputSchema?: Record<string, unknown> | null;
|
|
801
|
+
staticPipeline?: unknown;
|
|
802
|
+
currentRevision?: PlayRevisionSummary | null;
|
|
803
|
+
liveRevision?: PlayRevisionSummary | null;
|
|
778
804
|
aliases?: string[];
|
|
779
805
|
}
|
|
780
806
|
interface PlayDescription {
|
|
@@ -788,6 +814,7 @@ interface PlayDescription {
|
|
|
788
814
|
aliases: string[];
|
|
789
815
|
inputSchema?: Record<string, unknown> | null;
|
|
790
816
|
outputSchema?: Record<string, unknown> | null;
|
|
817
|
+
staticPipeline?: Record<string, unknown> | null;
|
|
791
818
|
csvInput?: Record<string, unknown> | null;
|
|
792
819
|
rowOutputSchema?: Record<string, unknown> | null;
|
|
793
820
|
runCommand: string;
|
|
@@ -1953,7 +1980,9 @@ type PlayDatasetTransformOptions = {
|
|
|
1953
1980
|
* Deepline keeps row progress, retries, memory use, and table output under
|
|
1954
1981
|
* runtime control. Use `count()` and `peek()` for bounded inspection. Use
|
|
1955
1982
|
* `materialize(limit)` or async iteration only when the dataset is intentionally
|
|
1956
|
-
* small and bounded.
|
|
1983
|
+
* small and bounded. `PlayDataset` intentionally does not expose `.rows`,
|
|
1984
|
+
* `.toArray()`, or other array aliases; those hide the runtime cost of loading
|
|
1985
|
+
* persisted rows into memory.
|
|
1957
1986
|
*/
|
|
1958
1987
|
interface PlayDataset<T> extends AsyncIterable<T> {
|
|
1959
1988
|
readonly [PLAY_DATASET_BRAND]: true;
|
|
@@ -2467,7 +2496,7 @@ interface DeeplinePlayRuntimeContext {
|
|
|
2467
2496
|
* @param options - Run options.
|
|
2468
2497
|
* @returns Program output.
|
|
2469
2498
|
*/
|
|
2470
|
-
runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput,
|
|
2499
|
+
runSteps<TInput extends Record<string, unknown>, TOutput>(program: StepProgram<TInput, any, TOutput>, input: TInput, options?: {
|
|
2471
2500
|
description?: string;
|
|
2472
2501
|
}): Promise<TOutput>;
|
|
2473
2502
|
/**
|
package/dist/index.js
CHANGED
|
@@ -241,10 +241,10 @@ var import_node_path2 = require("path");
|
|
|
241
241
|
|
|
242
242
|
// src/release.ts
|
|
243
243
|
var SDK_RELEASE = {
|
|
244
|
-
version: "0.1.77",
|
|
244
|
+
version: "0.1.79",
|
|
245
245
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
246
246
|
supportPolicy: {
|
|
247
|
-
latest: "0.1.77",
|
|
247
|
+
latest: "0.1.79",
|
|
248
248
|
minimumSupported: "0.1.53",
|
|
249
249
|
deprecatedBelow: "0.1.53"
|
|
250
250
|
}
|
|
@@ -820,6 +820,7 @@ var DeeplineClient = class {
|
|
|
820
820
|
aliases,
|
|
821
821
|
inputSchema: options?.compact ? this.compactSchema(play.inputSchema) : play.inputSchema ?? null,
|
|
822
822
|
outputSchema: options?.compact ? this.compactSchema(play.outputSchema) : play.outputSchema ?? null,
|
|
823
|
+
staticPipeline: isRecord(play.staticPipeline) ? play.staticPipeline : isRecord(play.currentRevision?.staticPipeline) ? play.currentRevision.staticPipeline : isRecord(play.liveRevision?.staticPipeline) ? play.liveRevision.staticPipeline : null,
|
|
823
824
|
...csvInput ? { csvInput } : {},
|
|
824
825
|
...rowOutputSchema ? { rowOutputSchema } : {},
|
|
825
826
|
runCommand,
|
package/dist/index.mjs
CHANGED
|
@@ -179,10 +179,10 @@ import { join as join2 } from "path";
|
|
|
179
179
|
|
|
180
180
|
// src/release.ts
|
|
181
181
|
var SDK_RELEASE = {
|
|
182
|
-
version: "0.1.77",
|
|
182
|
+
version: "0.1.79",
|
|
183
183
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
184
184
|
supportPolicy: {
|
|
185
|
-
latest: "0.1.77",
|
|
185
|
+
latest: "0.1.79",
|
|
186
186
|
minimumSupported: "0.1.53",
|
|
187
187
|
deprecatedBelow: "0.1.53"
|
|
188
188
|
}
|
|
@@ -758,6 +758,7 @@ var DeeplineClient = class {
|
|
|
758
758
|
aliases,
|
|
759
759
|
inputSchema: options?.compact ? this.compactSchema(play.inputSchema) : play.inputSchema ?? null,
|
|
760
760
|
outputSchema: options?.compact ? this.compactSchema(play.outputSchema) : play.outputSchema ?? null,
|
|
761
|
+
staticPipeline: isRecord(play.staticPipeline) ? play.staticPipeline : isRecord(play.currentRevision?.staticPipeline) ? play.currentRevision.staticPipeline : isRecord(play.liveRevision?.staticPipeline) ? play.liveRevision.staticPipeline : null,
|
|
761
762
|
...csvInput ? { csvInput } : {},
|
|
762
763
|
...rowOutputSchema ? { rowOutputSchema } : {},
|
|
763
764
|
runCommand,
|
package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts
CHANGED
|
@@ -53,6 +53,14 @@ import {
|
|
|
53
53
|
decideWorkflowPlatformRetry,
|
|
54
54
|
PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
|
|
55
55
|
} from './workflow-retry';
|
|
56
|
+
import {
|
|
57
|
+
WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES,
|
|
58
|
+
WORKFLOW_RETRY_PARAMS_MAX_BYTES,
|
|
59
|
+
buildWorkflowRetryParams,
|
|
60
|
+
jsonByteLength,
|
|
61
|
+
workflowRetryParamsStorageKey,
|
|
62
|
+
type WorkflowRetryParamsRef,
|
|
63
|
+
} from './workflow-retry-state';
|
|
56
64
|
import { sanitizeLiveLogLines } from './runtime/live-progress';
|
|
57
65
|
|
|
58
66
|
export { DynamicWorkflowBinding };
|
|
@@ -563,7 +571,7 @@ const WORKFLOW_POOL_PROTOCOL_VERSION =
|
|
|
563
571
|
const WORKFLOW_POOL_DO_NAME = 'workflow-pool:v2';
|
|
564
572
|
const WORKFLOW_POOL_START_EVENT_TYPE = 'play_start';
|
|
565
573
|
const WORKFLOW_POOL_TTL_MS = 8 * 60 * 1000;
|
|
566
|
-
const WORKFLOW_POOL_TARGET_SIZE =
|
|
574
|
+
const WORKFLOW_POOL_TARGET_SIZE = 4;
|
|
567
575
|
const WORKFLOW_POOL_READY_TIMEOUT_MS = 1_500;
|
|
568
576
|
const WORKFLOW_POOL_READY_POLL_MS = 250;
|
|
569
577
|
const WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS = 2_500;
|
|
@@ -571,6 +579,8 @@ const WORKFLOW_POOL_REFILL_ON_MISS_MIN_AVAILABLE = 4;
|
|
|
571
579
|
const WORKFLOW_POOL_CONTROL_TIMEOUT_MS = 750;
|
|
572
580
|
const WORKFLOW_POOL_START_ACK_TIMEOUT_MS = 750;
|
|
573
581
|
const WORKFLOW_POOL_START_ACK_POLL_MS = 25;
|
|
582
|
+
const WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS = 3_000;
|
|
583
|
+
const WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS = 50;
|
|
574
584
|
const SUBMIT_INITIAL_STATE_MAX_WAIT_MS = 0;
|
|
575
585
|
const SUBMIT_INITIAL_STATE_POLL_MS = 50;
|
|
576
586
|
const WORKFLOW_RETRY_STATE_TTL_MS = 60 * 60 * 1000;
|
|
@@ -1117,6 +1127,115 @@ async function leaseWorkflowPoolId(
|
|
|
1117
1127
|
return typeof body.id === 'string' && body.id ? body.id : null;
|
|
1118
1128
|
}
|
|
1119
1129
|
|
|
1130
|
+
async function leaseWorkflowPoolIdWithMissRecovery(input: {
|
|
1131
|
+
env: CoordinatorEnv;
|
|
1132
|
+
runId: string;
|
|
1133
|
+
recordSubmitTiming: (timing: CoordinatorTiming) => void;
|
|
1134
|
+
graphHash?: string | null;
|
|
1135
|
+
}): Promise<{
|
|
1136
|
+
pooledInstanceId: string | null;
|
|
1137
|
+
missCounts: WorkflowPoolCounts | null;
|
|
1138
|
+
leaseError: string | null;
|
|
1139
|
+
}> {
|
|
1140
|
+
let leaseError: string | null = null;
|
|
1141
|
+
let pooledInstanceId = await leaseWorkflowPoolId(
|
|
1142
|
+
input.env,
|
|
1143
|
+
input.runId,
|
|
1144
|
+
).catch((error) => {
|
|
1145
|
+
leaseError = error instanceof Error ? error.message : String(error);
|
|
1146
|
+
return null;
|
|
1147
|
+
});
|
|
1148
|
+
let missCounts = pooledInstanceId
|
|
1149
|
+
? null
|
|
1150
|
+
: await workflowPoolCount(input.env).catch(() => null);
|
|
1151
|
+
if (
|
|
1152
|
+
pooledInstanceId ||
|
|
1153
|
+
leaseError ||
|
|
1154
|
+
!missCounts ||
|
|
1155
|
+
missCounts.available + missCounts.warming <= 0
|
|
1156
|
+
) {
|
|
1157
|
+
return { pooledInstanceId, missCounts, leaseError };
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
const recoveryStartedAt = Date.now();
|
|
1161
|
+
const refill = await refillWorkflowPool(input.env, {
|
|
1162
|
+
minAvailable: 1,
|
|
1163
|
+
waitReady: true,
|
|
1164
|
+
waitTimeoutMs: WORKFLOW_POOL_REFILL_ON_MISS_TIMEOUT_MS,
|
|
1165
|
+
}).catch((error) => {
|
|
1166
|
+
input.recordSubmitTiming({
|
|
1167
|
+
phase: 'coordinator.workflow_pool_refill_on_miss',
|
|
1168
|
+
ms: Date.now() - recoveryStartedAt,
|
|
1169
|
+
graphHash: input.graphHash ?? null,
|
|
1170
|
+
extra: {
|
|
1171
|
+
status: 'failed',
|
|
1172
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1173
|
+
available: missCounts?.available ?? null,
|
|
1174
|
+
warming: missCounts?.warming ?? null,
|
|
1175
|
+
},
|
|
1176
|
+
});
|
|
1177
|
+
return null;
|
|
1178
|
+
});
|
|
1179
|
+
if (refill) {
|
|
1180
|
+
input.recordSubmitTiming({
|
|
1181
|
+
phase: 'coordinator.workflow_pool_refill_on_miss',
|
|
1182
|
+
ms: Date.now() - recoveryStartedAt,
|
|
1183
|
+
graphHash: input.graphHash ?? null,
|
|
1184
|
+
extra: {
|
|
1185
|
+
status: 'ok',
|
|
1186
|
+
available: refill.available,
|
|
1187
|
+
warming: refill.warming,
|
|
1188
|
+
created: refill.created,
|
|
1189
|
+
promoted: refill.promoted,
|
|
1190
|
+
removed: refill.removed,
|
|
1191
|
+
waitedMs: refill.waitedMs,
|
|
1192
|
+
waitIterations: refill.waitIterations,
|
|
1193
|
+
},
|
|
1194
|
+
});
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
let retryCount = 0;
|
|
1198
|
+
const retryStartedAt = Date.now();
|
|
1199
|
+
while (
|
|
1200
|
+
Date.now() - retryStartedAt <
|
|
1201
|
+
WORKFLOW_POOL_MISS_CLAIM_RETRY_TIMEOUT_MS
|
|
1202
|
+
) {
|
|
1203
|
+
retryCount += 1;
|
|
1204
|
+
pooledInstanceId = await leaseWorkflowPoolId(input.env, input.runId).catch(
|
|
1205
|
+
(error) => {
|
|
1206
|
+
leaseError = error instanceof Error ? error.message : String(error);
|
|
1207
|
+
return null;
|
|
1208
|
+
},
|
|
1209
|
+
);
|
|
1210
|
+
if (pooledInstanceId || leaseError) {
|
|
1211
|
+
break;
|
|
1212
|
+
}
|
|
1213
|
+
missCounts = await workflowPoolCount(input.env).catch(() => missCounts);
|
|
1214
|
+
if (!missCounts || missCounts.available + missCounts.warming <= 0) {
|
|
1215
|
+
break;
|
|
1216
|
+
}
|
|
1217
|
+
await sleep(WORKFLOW_POOL_MISS_CLAIM_RETRY_POLL_MS);
|
|
1218
|
+
}
|
|
1219
|
+
input.recordSubmitTiming({
|
|
1220
|
+
phase: 'coordinator.workflow_pool_claim_retry',
|
|
1221
|
+
ms: Date.now() - retryStartedAt,
|
|
1222
|
+
graphHash: input.graphHash ?? null,
|
|
1223
|
+
extra: {
|
|
1224
|
+
pooled: Boolean(pooledInstanceId),
|
|
1225
|
+
retries: retryCount,
|
|
1226
|
+
...(leaseError ? { error: leaseError } : {}),
|
|
1227
|
+
...(missCounts
|
|
1228
|
+
? {
|
|
1229
|
+
availableAfterRetry: missCounts.available,
|
|
1230
|
+
warmingAfterRetry: missCounts.warming,
|
|
1231
|
+
}
|
|
1232
|
+
: {}),
|
|
1233
|
+
},
|
|
1234
|
+
});
|
|
1235
|
+
|
|
1236
|
+
return { pooledInstanceId, missCounts, leaseError };
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1120
1239
|
async function mapRunToWorkflowInstance(input: {
|
|
1121
1240
|
env: CoordinatorEnv;
|
|
1122
1241
|
runId: string;
|
|
@@ -1191,56 +1310,120 @@ async function persistWorkflowRetryState(input: {
|
|
|
1191
1310
|
runId: string;
|
|
1192
1311
|
params: PlayWorkflowParams;
|
|
1193
1312
|
}): Promise<void> {
|
|
1194
|
-
const retryParams
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
})) ?? null,
|
|
1313
|
+
const retryParams = buildWorkflowRetryParams(input.params);
|
|
1314
|
+
const paramsBytes = jsonByteLength(retryParams);
|
|
1315
|
+
if (paramsBytes > WORKFLOW_RETRY_PARAMS_MAX_BYTES) {
|
|
1316
|
+
throw new Error(
|
|
1317
|
+
`workflow retry params too large: ${paramsBytes} bytes exceeds ${WORKFLOW_RETRY_PARAMS_MAX_BYTES}. Pass large payloads as staged files or ctx.csv inputs instead of inline JSON.`,
|
|
1318
|
+
);
|
|
1319
|
+
}
|
|
1320
|
+
let body: {
|
|
1321
|
+
runId: string;
|
|
1322
|
+
params?: PlayWorkflowParams;
|
|
1323
|
+
paramsRef?: WorkflowRetryParamsRef;
|
|
1324
|
+
paramsBytes: number;
|
|
1325
|
+
ttlMs: number;
|
|
1208
1326
|
};
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1327
|
+
if (paramsBytes > WORKFLOW_RETRY_PARAMS_EXTERNALIZE_AFTER_BYTES) {
|
|
1328
|
+
const serialized = JSON.stringify(retryParams);
|
|
1329
|
+
const hash = stableHash(serialized);
|
|
1330
|
+
const storageKey = workflowRetryParamsStorageKey({
|
|
1212
1331
|
runId: input.runId,
|
|
1213
|
-
|
|
1332
|
+
hash,
|
|
1333
|
+
});
|
|
1334
|
+
await input.env.PLAYS_BUCKET.put(storageKey, serialized, {
|
|
1335
|
+
httpMetadata: { contentType: 'application/json' },
|
|
1336
|
+
});
|
|
1337
|
+
body = {
|
|
1338
|
+
runId: input.runId,
|
|
1339
|
+
paramsRef: {
|
|
1340
|
+
storageKind: 'r2',
|
|
1341
|
+
storageKey,
|
|
1342
|
+
bytes: paramsBytes,
|
|
1343
|
+
hash,
|
|
1344
|
+
expiresAt: Date.now() + WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1345
|
+
},
|
|
1346
|
+
paramsBytes,
|
|
1214
1347
|
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1215
|
-
}
|
|
1216
|
-
}
|
|
1217
|
-
|
|
1348
|
+
};
|
|
1349
|
+
} else {
|
|
1350
|
+
body = {
|
|
1218
1351
|
runId: input.runId,
|
|
1219
|
-
|
|
1220
|
-
|
|
1352
|
+
params: retryParams,
|
|
1353
|
+
paramsBytes,
|
|
1354
|
+
ttlMs: WORKFLOW_RETRY_STATE_TTL_MS,
|
|
1355
|
+
};
|
|
1356
|
+
}
|
|
1357
|
+
await callWorkflowPool<{ ok?: unknown }>(input.env, '/run-retry-state-put', {
|
|
1358
|
+
method: 'POST',
|
|
1359
|
+
body: JSON.stringify(body),
|
|
1221
1360
|
});
|
|
1222
1361
|
}
|
|
1223
1362
|
|
|
1224
|
-
function
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1363
|
+
async function hydrateWorkflowRetryParams(input: {
|
|
1364
|
+
env: CoordinatorEnv;
|
|
1365
|
+
params: unknown;
|
|
1366
|
+
paramsRef: unknown;
|
|
1367
|
+
}): Promise<PlayWorkflowParams | null> {
|
|
1368
|
+
if (isRecord(input.params)) {
|
|
1369
|
+
return input.params as PlayWorkflowParams;
|
|
1370
|
+
}
|
|
1371
|
+
if (!isRecord(input.paramsRef)) {
|
|
1372
|
+
return null;
|
|
1373
|
+
}
|
|
1374
|
+
const storageKind = input.paramsRef.storageKind;
|
|
1375
|
+
const storageKey = input.paramsRef.storageKey;
|
|
1376
|
+
const expectedBytes = input.paramsRef.bytes;
|
|
1377
|
+
const expectedHash = input.paramsRef.hash;
|
|
1378
|
+
if (
|
|
1379
|
+
storageKind !== 'r2' ||
|
|
1380
|
+
typeof storageKey !== 'string' ||
|
|
1381
|
+
!storageKey.startsWith('plays/workflow-retry-params/') ||
|
|
1382
|
+
typeof expectedBytes !== 'number' ||
|
|
1383
|
+
!Number.isFinite(expectedBytes) ||
|
|
1384
|
+
typeof expectedHash !== 'string' ||
|
|
1385
|
+
!expectedHash
|
|
1386
|
+
) {
|
|
1387
|
+
throw new Error('Invalid workflow retry params reference.');
|
|
1388
|
+
}
|
|
1389
|
+
const object = await input.env.PLAYS_BUCKET.get(storageKey);
|
|
1390
|
+
if (!object) {
|
|
1391
|
+
throw new Error(`Workflow retry params missing from R2: ${storageKey}`);
|
|
1392
|
+
}
|
|
1393
|
+
const text = await object.text();
|
|
1394
|
+
const actualBytes = new TextEncoder().encode(text).length;
|
|
1395
|
+
if (actualBytes !== expectedBytes) {
|
|
1396
|
+
throw new Error(
|
|
1397
|
+
`Workflow retry params byte length mismatch: expected ${expectedBytes}, got ${actualBytes}.`,
|
|
1398
|
+
);
|
|
1399
|
+
}
|
|
1400
|
+
const actualHash = stableHash(text);
|
|
1401
|
+
if (actualHash !== expectedHash) {
|
|
1402
|
+
throw new Error('Workflow retry params hash mismatch.');
|
|
1403
|
+
}
|
|
1404
|
+
const parsed = JSON.parse(text) as unknown;
|
|
1405
|
+
return isRecord(parsed) ? (parsed as PlayWorkflowParams) : null;
|
|
1230
1406
|
}
|
|
1231
1407
|
|
|
1232
|
-
function
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
const
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1408
|
+
function workflowRetryStatePersistenceErrorResponse(input: {
|
|
1409
|
+
runId: string;
|
|
1410
|
+
error: unknown;
|
|
1411
|
+
}): Response {
|
|
1412
|
+
const message =
|
|
1413
|
+
input.error instanceof Error ? input.error.message : String(input.error);
|
|
1414
|
+
return Response.json(
|
|
1415
|
+
{
|
|
1416
|
+
error: {
|
|
1417
|
+
code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
|
|
1418
|
+
message:
|
|
1419
|
+
'Failed to persist workflow retry state before dispatching the play run.',
|
|
1420
|
+
phase: 'coordinator_retry_state_persistence',
|
|
1421
|
+
runId: input.runId,
|
|
1422
|
+
cause: message,
|
|
1423
|
+
},
|
|
1424
|
+
},
|
|
1425
|
+
{ status: 503 },
|
|
1426
|
+
);
|
|
1244
1427
|
}
|
|
1245
1428
|
|
|
1246
1429
|
async function claimWorkflowPlatformRetry(input: {
|
|
@@ -1255,6 +1438,7 @@ async function claimWorkflowPlatformRetry(input: {
|
|
|
1255
1438
|
claimed?: unknown;
|
|
1256
1439
|
attempts?: unknown;
|
|
1257
1440
|
params?: unknown;
|
|
1441
|
+
paramsRef?: unknown;
|
|
1258
1442
|
}>(input.env, '/run-retry-claim', {
|
|
1259
1443
|
method: 'POST',
|
|
1260
1444
|
body: JSON.stringify({
|
|
@@ -1262,10 +1446,15 @@ async function claimWorkflowPlatformRetry(input: {
|
|
|
1262
1446
|
maxAttempts: PLATFORM_DEPLOY_WORKFLOW_RETRY_LIMIT,
|
|
1263
1447
|
}),
|
|
1264
1448
|
});
|
|
1449
|
+
const params = await hydrateWorkflowRetryParams({
|
|
1450
|
+
env: input.env,
|
|
1451
|
+
params: body.params,
|
|
1452
|
+
paramsRef: body.paramsRef,
|
|
1453
|
+
});
|
|
1265
1454
|
return {
|
|
1266
1455
|
claimed: body.claimed === true,
|
|
1267
1456
|
attempts: typeof body.attempts === 'number' ? body.attempts : 0,
|
|
1268
|
-
params
|
|
1457
|
+
params,
|
|
1269
1458
|
};
|
|
1270
1459
|
}
|
|
1271
1460
|
|
|
@@ -1719,17 +1908,13 @@ async function submitViaPooledWorkflow(input: {
|
|
|
1719
1908
|
return null;
|
|
1720
1909
|
}
|
|
1721
1910
|
const leaseStartedAt = Date.now();
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
});
|
|
1730
|
-
const missCounts = pooledInstanceId
|
|
1731
|
-
? null
|
|
1732
|
-
: await workflowPoolCount(input.env).catch(() => null);
|
|
1911
|
+
const { pooledInstanceId, missCounts, leaseError } =
|
|
1912
|
+
await leaseWorkflowPoolIdWithMissRecovery({
|
|
1913
|
+
env: input.env,
|
|
1914
|
+
runId: input.params.runId,
|
|
1915
|
+
recordSubmitTiming: input.recordSubmitTiming,
|
|
1916
|
+
graphHash: input.params.graphHash ?? null,
|
|
1917
|
+
});
|
|
1733
1918
|
input.recordSubmitTiming({
|
|
1734
1919
|
phase: 'coordinator.workflow_pool_lease',
|
|
1735
1920
|
ms: Date.now() - leaseStartedAt,
|
|
@@ -1746,30 +1931,6 @@ async function submitViaPooledWorkflow(input: {
|
|
|
1746
1931
|
},
|
|
1747
1932
|
});
|
|
1748
1933
|
|
|
1749
|
-
if (!pooledInstanceId) {
|
|
1750
|
-
// A pool miss must not block the user path. Refilling is handled by the
|
|
1751
|
-
// caller's waitUntil after submit, so fall through to cold create now.
|
|
1752
|
-
const counts =
|
|
1753
|
-
missCounts ?? (await workflowPoolCount(input.env).catch(() => null));
|
|
1754
|
-
input.recordSubmitTiming({
|
|
1755
|
-
phase: 'coordinator.workflow_pool_refill_on_miss',
|
|
1756
|
-
ms: 0,
|
|
1757
|
-
graphHash: input.params.graphHash ?? null,
|
|
1758
|
-
extra: {
|
|
1759
|
-
skipped: true,
|
|
1760
|
-
reason: 'pool_miss_does_not_block_submit',
|
|
1761
|
-
...(counts
|
|
1762
|
-
? {
|
|
1763
|
-
available: counts.available,
|
|
1764
|
-
warming: counts.warming,
|
|
1765
|
-
waitedMs: 0,
|
|
1766
|
-
waitIterations: 0,
|
|
1767
|
-
}
|
|
1768
|
-
: {}),
|
|
1769
|
-
},
|
|
1770
|
-
});
|
|
1771
|
-
}
|
|
1772
|
-
|
|
1773
1934
|
if (!pooledInstanceId) {
|
|
1774
1935
|
return null;
|
|
1775
1936
|
}
|
|
@@ -3606,11 +3767,40 @@ async function handleWorkflowRoute(input: {
|
|
|
3606
3767
|
params,
|
|
3607
3768
|
recordSubmitTiming,
|
|
3608
3769
|
});
|
|
3609
|
-
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
|
|
3770
|
+
try {
|
|
3771
|
+
const retryStateStartedAt = Date.now();
|
|
3772
|
+
await persistWorkflowRetryState({
|
|
3773
|
+
env,
|
|
3774
|
+
runId: submittedRunId,
|
|
3775
|
+
params: workflowParams,
|
|
3776
|
+
});
|
|
3777
|
+
recordSubmitTiming({
|
|
3778
|
+
phase: 'coordinator.retry_state_persistence',
|
|
3779
|
+
ms: Date.now() - retryStateStartedAt,
|
|
3780
|
+
graphHash: params.graphHash ?? null,
|
|
3781
|
+
});
|
|
3782
|
+
} catch (error) {
|
|
3783
|
+
const errorMessage =
|
|
3784
|
+
error instanceof Error ? error.message : String(error);
|
|
3785
|
+
console.error('[coordinator] workflow retry state persistence failed', {
|
|
3786
|
+
code: 'WORKFLOW_RETRY_STATE_PERSISTENCE_FAILED',
|
|
3787
|
+
runId: submittedRunId,
|
|
3788
|
+
error: errorMessage,
|
|
3789
|
+
});
|
|
3790
|
+
recordSubmitTiming({
|
|
3791
|
+
phase: 'coordinator.retry_state_persistence',
|
|
3792
|
+
ms: 0,
|
|
3793
|
+
graphHash: params.graphHash ?? null,
|
|
3794
|
+
extra: {
|
|
3795
|
+
status: 'failed',
|
|
3796
|
+
error: errorMessage,
|
|
3797
|
+
},
|
|
3798
|
+
});
|
|
3799
|
+
return workflowRetryStatePersistenceErrorResponse({
|
|
3800
|
+
runId: submittedRunId,
|
|
3801
|
+
error,
|
|
3802
|
+
});
|
|
3803
|
+
}
|
|
3614
3804
|
let instance: WorkflowInstance | null = null;
|
|
3615
3805
|
try {
|
|
3616
3806
|
const statusEventStartedAt = Date.now();
|