deepline 0.1.99 → 0.1.101
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +12 -2
- package/dist/cli/index.mjs +12 -2
- package/dist/index.js +3 -2
- package/dist/index.mjs +3 -2
- package/dist/repo/apps/play-runner-workers/src/entry.ts +399 -199
- package/dist/repo/apps/play-runner-workers/src/runtime/row-isolation.ts +53 -0
- package/dist/repo/apps/play-runner-workers/src/runtime/tool-http-errors.ts +20 -1
- package/dist/repo/sdk/src/release.ts +3 -2
- package/dist/repo/shared_libs/play-runtime/batch-runtime.ts +27 -14
- package/package.json +1 -1
package/dist/cli/index.js
CHANGED
|
@@ -232,10 +232,11 @@ var SDK_RELEASE = {
|
|
|
232
232
|
// 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
|
|
233
233
|
// 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
|
|
234
234
|
// 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
|
|
235
|
-
|
|
235
|
+
// 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
|
|
236
|
+
version: "0.1.101",
|
|
236
237
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
237
238
|
supportPolicy: {
|
|
238
|
-
latest: "0.1.
|
|
239
|
+
latest: "0.1.101",
|
|
239
240
|
minimumSupported: "0.1.53",
|
|
240
241
|
deprecatedBelow: "0.1.53"
|
|
241
242
|
}
|
|
@@ -11148,6 +11149,15 @@ function buildRunPackageTextLines(packaged) {
|
|
|
11148
11149
|
if (runError && (status === "failed" || status === "cancelled")) {
|
|
11149
11150
|
lines.push(` error: ${runError.slice(0, 200)}`);
|
|
11150
11151
|
}
|
|
11152
|
+
for (const step of readRecordArray(packaged.steps)) {
|
|
11153
|
+
const output2 = step.output && typeof step.output === "object" && !Array.isArray(step.output) ? step.output : null;
|
|
11154
|
+
if (!output2 || output2.recovered !== true) continue;
|
|
11155
|
+
const rowCount = typeof output2.rowCount === "number" ? formatInteger(output2.rowCount) : "persisted";
|
|
11156
|
+
const datasetPath = typeof output2.path === "string" ? output2.path : "dataset";
|
|
11157
|
+
lines.push(
|
|
11158
|
+
` recoverable: ${rowCount} rows persisted in ${datasetPath} \u2014 re-running reuses them; export with the command below`
|
|
11159
|
+
);
|
|
11160
|
+
}
|
|
11151
11161
|
if (playName) {
|
|
11152
11162
|
lines.push(` play: ${playName}`);
|
|
11153
11163
|
}
|
package/dist/cli/index.mjs
CHANGED
|
@@ -209,10 +209,11 @@ var SDK_RELEASE = {
|
|
|
209
209
|
// 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
|
|
210
210
|
// 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
|
|
211
211
|
// 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
|
|
212
|
-
|
|
212
|
+
// 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
|
|
213
|
+
version: "0.1.101",
|
|
213
214
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
214
215
|
supportPolicy: {
|
|
215
|
-
latest: "0.1.
|
|
216
|
+
latest: "0.1.101",
|
|
216
217
|
minimumSupported: "0.1.53",
|
|
217
218
|
deprecatedBelow: "0.1.53"
|
|
218
219
|
}
|
|
@@ -11164,6 +11165,15 @@ function buildRunPackageTextLines(packaged) {
|
|
|
11164
11165
|
if (runError && (status === "failed" || status === "cancelled")) {
|
|
11165
11166
|
lines.push(` error: ${runError.slice(0, 200)}`);
|
|
11166
11167
|
}
|
|
11168
|
+
for (const step of readRecordArray(packaged.steps)) {
|
|
11169
|
+
const output2 = step.output && typeof step.output === "object" && !Array.isArray(step.output) ? step.output : null;
|
|
11170
|
+
if (!output2 || output2.recovered !== true) continue;
|
|
11171
|
+
const rowCount = typeof output2.rowCount === "number" ? formatInteger(output2.rowCount) : "persisted";
|
|
11172
|
+
const datasetPath = typeof output2.path === "string" ? output2.path : "dataset";
|
|
11173
|
+
lines.push(
|
|
11174
|
+
` recoverable: ${rowCount} rows persisted in ${datasetPath} \u2014 re-running reuses them; export with the command below`
|
|
11175
|
+
);
|
|
11176
|
+
}
|
|
11167
11177
|
if (playName) {
|
|
11168
11178
|
lines.push(` play: ${playName}`);
|
|
11169
11179
|
}
|
package/dist/index.js
CHANGED
|
@@ -260,10 +260,11 @@ var SDK_RELEASE = {
|
|
|
260
260
|
// 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
|
|
261
261
|
// 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
|
|
262
262
|
// 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
|
|
263
|
-
|
|
263
|
+
// 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
|
|
264
|
+
version: "0.1.101",
|
|
264
265
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
265
266
|
supportPolicy: {
|
|
266
|
-
latest: "0.1.
|
|
267
|
+
latest: "0.1.101",
|
|
267
268
|
minimumSupported: "0.1.53",
|
|
268
269
|
deprecatedBelow: "0.1.53"
|
|
269
270
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -182,10 +182,11 @@ var SDK_RELEASE = {
|
|
|
182
182
|
// 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
|
|
183
183
|
// 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
|
|
184
184
|
// 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
|
|
185
|
-
|
|
185
|
+
// 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
|
|
186
|
+
version: "0.1.101",
|
|
186
187
|
apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
|
|
187
188
|
supportPolicy: {
|
|
188
|
-
latest: "0.1.
|
|
189
|
+
latest: "0.1.101",
|
|
189
190
|
minimumSupported: "0.1.53",
|
|
190
191
|
deprecatedBelow: "0.1.53"
|
|
191
192
|
}
|
package/dist/repo/apps/play-runner-workers/src/entry.ts
CHANGED
|
@@ -161,6 +161,11 @@ import {
|
|
|
161
161
|
isHardBillingToolHttpError,
|
|
162
162
|
normalizeToolHttpErrorMessage,
|
|
163
163
|
} from './runtime/tool-http-errors';
|
|
164
|
+
import {
|
|
165
|
+
WorkflowAbortError,
|
|
166
|
+
isAbortLikeError,
|
|
167
|
+
isRowIsolationExemptError,
|
|
168
|
+
} from './runtime/row-isolation';
|
|
164
169
|
import {
|
|
165
170
|
StepProgramDatasetBuilder,
|
|
166
171
|
type StepProgramDatasetColumnInput,
|
|
@@ -733,7 +738,12 @@ function publicCsvStorageRow<T extends Record<string, unknown>>(row: T): T {
|
|
|
733
738
|
storageRow[fieldName] =
|
|
734
739
|
'value' in descriptor ? descriptor.value : publicRow[fieldName];
|
|
735
740
|
}
|
|
736
|
-
for (const runtimeField of [
|
|
741
|
+
for (const runtimeField of [
|
|
742
|
+
'__deeplineRowKey',
|
|
743
|
+
'__deeplineCellMetaPatch',
|
|
744
|
+
'__deeplineRowStatus',
|
|
745
|
+
'__deeplineRowError',
|
|
746
|
+
]) {
|
|
737
747
|
if (Object.prototype.hasOwnProperty.call(row, runtimeField)) {
|
|
738
748
|
storageRow[runtimeField] = row[runtimeField];
|
|
739
749
|
}
|
|
@@ -1286,7 +1296,11 @@ async function callToolDirect(
|
|
|
1286
1296
|
const maxAttempts = 3;
|
|
1287
1297
|
let lastError: Error | null = null;
|
|
1288
1298
|
|
|
1289
|
-
for (
|
|
1299
|
+
for (
|
|
1300
|
+
let attempt = 1;
|
|
1301
|
+
attempt <= WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS;
|
|
1302
|
+
attempt += 1
|
|
1303
|
+
) {
|
|
1290
1304
|
const res = await fetchRuntimeApi(req.baseUrl, path, {
|
|
1291
1305
|
method: 'POST',
|
|
1292
1306
|
headers: {
|
|
@@ -1310,11 +1324,17 @@ async function callToolDirect(
|
|
|
1310
1324
|
}
|
|
1311
1325
|
|
|
1312
1326
|
const text = await res.text().catch(() => '');
|
|
1327
|
+
const isRateLimited = res.status === 429;
|
|
1328
|
+
// Rate-limit pushback gets the larger 429-specific retry budget; every
|
|
1329
|
+
// other failure keeps the generic 3-attempt budget.
|
|
1330
|
+
const attemptCap = isRateLimited
|
|
1331
|
+
? WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS
|
|
1332
|
+
: maxAttempts;
|
|
1313
1333
|
lastError = normalizeToolHttpErrorMessage({
|
|
1314
1334
|
toolId,
|
|
1315
1335
|
status: res.status,
|
|
1316
1336
|
attempt,
|
|
1317
|
-
maxAttempts,
|
|
1337
|
+
maxAttempts: attemptCap,
|
|
1318
1338
|
bodyText: text,
|
|
1319
1339
|
});
|
|
1320
1340
|
const retryAfterSeconds = Number(res.headers.get('retry-after'));
|
|
@@ -1322,21 +1342,28 @@ async function callToolDirect(
|
|
|
1322
1342
|
Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
|
|
1323
1343
|
? Math.ceil(retryAfterSeconds * 1000)
|
|
1324
1344
|
: 0;
|
|
1325
|
-
if (
|
|
1345
|
+
if (isRateLimited) {
|
|
1326
1346
|
// Feed the provider's backpressure into the shared pacer even on the
|
|
1327
1347
|
// final attempt so the (org, provider) bucket backs off across isolates.
|
|
1328
1348
|
onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
|
|
1329
1349
|
}
|
|
1330
1350
|
const retryable =
|
|
1331
|
-
(
|
|
1351
|
+
(isRateLimited && !isHardBillingToolHttpError(lastError)) ||
|
|
1332
1352
|
(res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
|
|
1333
|
-
if (!retryable || attempt >=
|
|
1353
|
+
if (!retryable || attempt >= attemptCap) {
|
|
1334
1354
|
throw lastError;
|
|
1335
1355
|
}
|
|
1336
1356
|
// Charge the retry budget per attempt, matching the cjs runner's
|
|
1337
1357
|
// chargeBudget('retry') on every 429 / retryable-5xx retry.
|
|
1338
1358
|
onRetryAttempt?.();
|
|
1339
|
-
|
|
1359
|
+
// 429 delays escalate per attempt (still honoring a larger retry-after)
|
|
1360
|
+
// so sustained throttling spaces calls out instead of hammering the
|
|
1361
|
+
// limiter with fixed 1s retries.
|
|
1362
|
+
const delayMs = isRateLimited
|
|
1363
|
+
? Math.min(5_000, Math.max(retryAfterMs, 1_000 * attempt))
|
|
1364
|
+
: retryAfterMs > 0
|
|
1365
|
+
? Math.min(5_000, retryAfterMs)
|
|
1366
|
+
: 1_000;
|
|
1340
1367
|
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
1341
1368
|
}
|
|
1342
1369
|
|
|
@@ -1498,12 +1525,27 @@ type WorkerToolBatchRequest = {
|
|
|
1498
1525
|
|
|
1499
1526
|
const WORKER_TOOL_BATCH_GRACE_MS = 250;
|
|
1500
1527
|
const MAP_EXECUTION_HEARTBEAT_INTERVAL_MS = 5_000;
|
|
1528
|
+
/**
|
|
1529
|
+
* Bounded number of per-row failure samples carried in chunk summaries and the
|
|
1530
|
+
* map's terminal partial-failure log. Every failed row is persisted with its
|
|
1531
|
+
* full error in the runtime sheet; the samples just keep run logs readable.
|
|
1532
|
+
*/
|
|
1533
|
+
const MAP_ROW_FAILURE_SAMPLE_LIMIT = 3;
|
|
1501
1534
|
// Fallback batch-chunk parallelism when a tool declares no provider rate hints.
|
|
1502
1535
|
// Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
|
|
1503
1536
|
// their previous batching behavior; declared providers tighten via the
|
|
1504
1537
|
// Governor's suggestedParallelism.
|
|
1505
1538
|
const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
|
|
1506
1539
|
const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
|
|
1540
|
+
/**
|
|
1541
|
+
* In-process retry budget for HTTP 429 tool responses. Rate-limit pushback is
|
|
1542
|
+
* throughput pacing (provider or Deepline limiter), not a tool defect, so it
|
|
1543
|
+
* gets more patience than the generic 3-attempt budget: with retry-after-aware
|
|
1544
|
+
* escalating delays (capped at 5s) this absorbs roughly 25s of sustained
|
|
1545
|
+
* throttling before the call fails. Every retry still charges the Governor's
|
|
1546
|
+
* retry budget, so a runaway storm stays bounded and loud.
|
|
1547
|
+
*/
|
|
1548
|
+
const WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS = 8;
|
|
1507
1549
|
|
|
1508
1550
|
function sleepWorkerMs(ms: number): Promise<void> {
|
|
1509
1551
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
@@ -1761,6 +1803,15 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1761
1803
|
>,
|
|
1762
1804
|
) => {
|
|
1763
1805
|
for (const entry of chunkResults) {
|
|
1806
|
+
if (entry.error !== undefined) {
|
|
1807
|
+
// One batch's provider error stays scoped to that batch's member
|
|
1808
|
+
// requests. Sibling batches in this chunk keep their results so a
|
|
1809
|
+
// single provider hiccup cannot cascade into a whole-map failure.
|
|
1810
|
+
for (const request of entry.request.memberRequests) {
|
|
1811
|
+
request.reject(entry.error);
|
|
1812
|
+
}
|
|
1813
|
+
continue;
|
|
1814
|
+
}
|
|
1764
1815
|
const batchResult = isToolExecuteResult(entry.result)
|
|
1765
1816
|
? entry.result.toolOutput.raw
|
|
1766
1817
|
: entry.result;
|
|
@@ -1841,6 +1892,10 @@ type WorkerMapChunkSummary<T extends Record<string, unknown>> = {
|
|
|
1841
1892
|
rowsDuplicateReused: number;
|
|
1842
1893
|
rowsInserted: number;
|
|
1843
1894
|
rowsSkipped: number;
|
|
1895
|
+
/** Rows whose execution failed and persisted as `_status='failed'`. */
|
|
1896
|
+
rowsFailed: number;
|
|
1897
|
+
/** Bounded sample of row failures for the partial-failure summary. */
|
|
1898
|
+
rowFailureSamples: Array<{ rowKey: string; error: string }>;
|
|
1844
1899
|
outputDatasetId: string;
|
|
1845
1900
|
hash: string;
|
|
1846
1901
|
preview: T[];
|
|
@@ -1926,6 +1981,31 @@ type WorkerMapOptions = {
|
|
|
1926
1981
|
row: Record<string, unknown>,
|
|
1927
1982
|
index: number,
|
|
1928
1983
|
) => string | number | readonly unknown[]);
|
|
1984
|
+
/**
|
|
1985
|
+
* Row failure policy. Default 'isolate': one row's tool/provider error is
|
|
1986
|
+
* recorded on that row (cell meta + `_status='failed'` + `_error`), sibling
|
|
1987
|
+
* rows continue, and the run completes with a partial-failure summary.
|
|
1988
|
+
* Failed rows re-execute on the next run; succeeded rows replay free.
|
|
1989
|
+
* 'fail' opts into fail-fast: the first row error aborts the map and fails
|
|
1990
|
+
* the run (rows persisted before the error stay recoverable).
|
|
1991
|
+
*/
|
|
1992
|
+
onRowError?: 'isolate' | 'fail';
|
|
1993
|
+
};
|
|
1994
|
+
|
|
1995
|
+
/**
|
|
1996
|
+
* Per-cell terminal state recorded by map row execution and merged into the
|
|
1997
|
+
* Runtime Sheet row's `_cell_meta`. 'failed' carries the cell's error message;
|
|
1998
|
+
* `shouldRecomputeCell` treats it as recompute on the next run.
|
|
1999
|
+
*/
|
|
2000
|
+
type WorkerCellMetaPatchEntry = {
|
|
2001
|
+
status: 'cached' | 'skipped' | 'completed' | 'failed';
|
|
2002
|
+
stage?: string | null;
|
|
2003
|
+
reused?: boolean;
|
|
2004
|
+
runId?: string;
|
|
2005
|
+
completedAt?: number;
|
|
2006
|
+
staleAt?: number | null;
|
|
2007
|
+
staleAfterSeconds?: number | null;
|
|
2008
|
+
error?: string;
|
|
1929
2009
|
};
|
|
1930
2010
|
|
|
1931
2011
|
function isWorkerStepProgram(value: unknown): value is WorkerStepProgram {
|
|
@@ -3025,46 +3105,6 @@ async function prepareMapRows(input: {
|
|
|
3025
3105
|
};
|
|
3026
3106
|
}
|
|
3027
3107
|
|
|
3028
|
-
/**
|
|
3029
|
-
* Builds the minimal HTTP-backed ctx surface needed to run tool-basic-shaped
|
|
3030
|
-
* plays. NOT a full implementation of shared_libs/play-runtime/context.ts.
|
|
3031
|
-
*
|
|
3032
|
-
* Supported:
|
|
3033
|
-
* - ctx.log(msg)
|
|
3034
|
-
* - ctx.csv(filename | inline rows) (calls runtime API for file resolve)
|
|
3035
|
-
* - ctx.dataset(name, rows).withColumn(name, resolver).run(opts)
|
|
3036
|
-
* - ctx.tools.execute({ id, tool, input, ... })
|
|
3037
|
-
* - ctx.runPlay(key, playRef, input, opts)
|
|
3038
|
-
*
|
|
3039
|
-
* Not supported (will throw):
|
|
3040
|
-
* - ctx.fetch, checkpoints, etc.
|
|
3041
|
-
*
|
|
3042
|
-
* Plays that need more should run on Daytona; the resolver is composable.
|
|
3043
|
-
*/
|
|
3044
|
-
/**
|
|
3045
|
-
* Thrown by `assertNotAborted` and surfaced through ctx.step / ctx.sleep / map
|
|
3046
|
-
* processing when the workflow has been terminated externally. Cooperatively
|
|
3047
|
-
* cancels in-flight user code: the play must check `ctx.signal.aborted` (or
|
|
3048
|
-
* await one of the abort-aware ctx methods) before doing more work.
|
|
3049
|
-
*/
|
|
3050
|
-
class WorkflowAbortError extends Error {
|
|
3051
|
-
override readonly name = 'WorkflowAbort';
|
|
3052
|
-
constructor(message = 'Play run cancelled.') {
|
|
3053
|
-
super(message);
|
|
3054
|
-
}
|
|
3055
|
-
}
|
|
3056
|
-
|
|
3057
|
-
function isAbortLikeError(error: unknown): boolean {
|
|
3058
|
-
if (!error) return false;
|
|
3059
|
-
if (error instanceof WorkflowAbortError) return true;
|
|
3060
|
-
if (error instanceof Error) {
|
|
3061
|
-
if (error.name === 'WorkflowAbort' || error.name === 'AbortError')
|
|
3062
|
-
return true;
|
|
3063
|
-
return /\b(cancell?ed|aborted|terminate[d]?)\b/i.test(error.message);
|
|
3064
|
-
}
|
|
3065
|
-
return false;
|
|
3066
|
-
}
|
|
3067
|
-
|
|
3068
3108
|
function assertNotAborted(signal: AbortSignal | undefined): void {
|
|
3069
3109
|
if (signal?.aborted) {
|
|
3070
3110
|
throw new WorkflowAbortError(
|
|
@@ -3075,6 +3115,19 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
|
|
|
3075
3115
|
}
|
|
3076
3116
|
}
|
|
3077
3117
|
|
|
3118
|
+
/** Bounded, single-line row failure message persisted to row/cell state. */
|
|
3119
|
+
function formatWorkerRowFailureMessage(error: unknown): string {
|
|
3120
|
+
const raw =
|
|
3121
|
+
error instanceof Error
|
|
3122
|
+
? error.message
|
|
3123
|
+
: typeof error === 'string'
|
|
3124
|
+
? error
|
|
3125
|
+
: JSON.stringify(error);
|
|
3126
|
+
const message = (raw ?? '').replace(/\s+/g, ' ').trim();
|
|
3127
|
+
if (!message) return 'Row execution failed.';
|
|
3128
|
+
return message.length > 1_000 ? `${message.slice(0, 1_000)}…` : message;
|
|
3129
|
+
}
|
|
3130
|
+
|
|
3078
3131
|
function childPipelineUsesCtxDataset(
|
|
3079
3132
|
pipeline: PlayStaticPipeline | null | undefined,
|
|
3080
3133
|
): boolean {
|
|
@@ -3315,6 +3368,22 @@ function createGovernorForRun(req: RunRequest): {
|
|
|
3315
3368
|
return { governor, resolvePacing };
|
|
3316
3369
|
}
|
|
3317
3370
|
|
|
3371
|
+
/**
|
|
3372
|
+
* Builds the minimal HTTP-backed ctx surface needed to run tool-basic-shaped
|
|
3373
|
+
* plays. NOT a full implementation of shared_libs/play-runtime/context.ts.
|
|
3374
|
+
*
|
|
3375
|
+
* Supported:
|
|
3376
|
+
* - ctx.log(msg)
|
|
3377
|
+
* - ctx.csv(filename | inline rows) (calls runtime API for file resolve)
|
|
3378
|
+
* - ctx.dataset(name, rows).withColumn(name, resolver).run(opts)
|
|
3379
|
+
* - ctx.tools.execute({ id, tool, input, ... })
|
|
3380
|
+
* - ctx.runPlay(key, playRef, input, opts)
|
|
3381
|
+
*
|
|
3382
|
+
* Not supported (will throw):
|
|
3383
|
+
* - ctx.fetch, checkpoints, etc.
|
|
3384
|
+
*
|
|
3385
|
+
* Plays that need more should run on Daytona; the resolver is composable.
|
|
3386
|
+
*/
|
|
3318
3387
|
function createMinimalWorkerCtx(
|
|
3319
3388
|
req: RunRequest,
|
|
3320
3389
|
emitEvent: (event: RunnerEvent) => void,
|
|
@@ -3739,6 +3808,7 @@ function createMinimalWorkerCtx(
|
|
|
3739
3808
|
prepared.skipped - missingPreparedRows.length,
|
|
3740
3809
|
);
|
|
3741
3810
|
let completedExecutedRows = 0;
|
|
3811
|
+
let failedExecutedRows = 0;
|
|
3742
3812
|
let startedExecutedRows = 0;
|
|
3743
3813
|
let activeExecutedRows = 0;
|
|
3744
3814
|
let lastChunkProgressAt = 0;
|
|
@@ -3809,20 +3879,15 @@ function createMinimalWorkerCtx(
|
|
|
3809
3879
|
const executedRows: Array<T & Record<string, unknown>> = new Array(
|
|
3810
3880
|
rowsToExecute.length,
|
|
3811
3881
|
);
|
|
3882
|
+
// Row failure isolation (default): a failed row keeps its
|
|
3883
|
+
// partially-enriched data + the row error so it persists as a
|
|
3884
|
+
// recoverable `_status='failed'` sheet row instead of aborting the map.
|
|
3885
|
+
const failFastRowErrors = opts?.onRowError === 'fail';
|
|
3886
|
+
const failedRowEntries: Array<
|
|
3887
|
+
{ row: T & Record<string, unknown>; error: string } | undefined
|
|
3888
|
+
> = new Array(rowsToExecute.length);
|
|
3812
3889
|
const executedCellMetaPatches: Array<
|
|
3813
|
-
|
|
|
3814
|
-
string,
|
|
3815
|
-
{
|
|
3816
|
-
status: 'cached' | 'skipped' | 'completed';
|
|
3817
|
-
stage?: string | null;
|
|
3818
|
-
reused?: boolean;
|
|
3819
|
-
runId?: string;
|
|
3820
|
-
completedAt?: number;
|
|
3821
|
-
staleAt?: number | null;
|
|
3822
|
-
staleAfterSeconds?: number | null;
|
|
3823
|
-
}
|
|
3824
|
-
>
|
|
3825
|
-
| undefined
|
|
3890
|
+
Record<string, WorkerCellMetaPatchEntry> | undefined
|
|
3826
3891
|
> = new Array(rowsToExecute.length);
|
|
3827
3892
|
const toolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3828
3893
|
req,
|
|
@@ -3869,18 +3934,8 @@ function createMinimalWorkerCtx(
|
|
|
3869
3934
|
const enriched: Record<string, unknown> =
|
|
3870
3935
|
cloneCsvAliasedRow(row);
|
|
3871
3936
|
const fieldOutputs: Record<string, unknown> = {};
|
|
3872
|
-
const cellMetaPatch: Record<
|
|
3873
|
-
|
|
3874
|
-
{
|
|
3875
|
-
status: 'cached' | 'skipped' | 'completed';
|
|
3876
|
-
stage?: string | null;
|
|
3877
|
-
reused?: boolean;
|
|
3878
|
-
runId?: string;
|
|
3879
|
-
completedAt?: number;
|
|
3880
|
-
staleAt?: number | null;
|
|
3881
|
-
staleAfterSeconds?: number | null;
|
|
3882
|
-
}
|
|
3883
|
-
> = {};
|
|
3937
|
+
const cellMetaPatch: Record<string, WorkerCellMetaPatchEntry> =
|
|
3938
|
+
{};
|
|
3884
3939
|
const waterfallOutputs: RecordedWaterfallOutput[] = [];
|
|
3885
3940
|
const stepProgramOutputs: RecordedStepProgramOutput[] = [];
|
|
3886
3941
|
const rowCtx = {
|
|
@@ -3914,116 +3969,168 @@ function createMinimalWorkerCtx(
|
|
|
3914
3969
|
workflowStep,
|
|
3915
3970
|
),
|
|
3916
3971
|
};
|
|
3917
|
-
|
|
3918
|
-
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
|
|
3923
|
-
|
|
3924
|
-
|
|
3925
|
-
|
|
3926
|
-
|
|
3927
|
-
|
|
3928
|
-
|
|
3929
|
-
|
|
3930
|
-
|
|
3931
|
-
|
|
3932
|
-
|
|
3933
|
-
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
3941
|
-
|
|
3942
|
-
|
|
3943
|
-
value: enriched[key],
|
|
3944
|
-
meta:
|
|
3945
|
-
rawCellMeta && typeof rawCellMeta === 'object'
|
|
3946
|
-
? (rawCellMeta as {
|
|
3947
|
-
status?: string;
|
|
3948
|
-
completedAt?: number;
|
|
3949
|
-
staleAt?: number | null;
|
|
3950
|
-
staleAfterSeconds?: number | null;
|
|
3951
|
-
})
|
|
3952
|
-
: null,
|
|
3953
|
-
});
|
|
3954
|
-
if (reuseDecision.action === 'reuse') {
|
|
3955
|
-
cellMetaPatch[key] = {
|
|
3956
|
-
status: 'cached',
|
|
3957
|
-
stage: key,
|
|
3958
|
-
reused: true,
|
|
3959
|
-
runId: req.runId,
|
|
3960
|
-
};
|
|
3961
|
-
continue;
|
|
3962
|
-
}
|
|
3963
|
-
const resolved = await executeWorkerStepResolver(
|
|
3964
|
-
value,
|
|
3965
|
-
enriched,
|
|
3966
|
-
rowCtx,
|
|
3967
|
-
absoluteIndex,
|
|
3968
|
-
previousCell,
|
|
3969
|
-
isWorkerStepProgram(value)
|
|
3970
|
-
? {
|
|
3971
|
-
parentField: key,
|
|
3972
|
-
path: [],
|
|
3973
|
-
outputs: stepProgramOutputs,
|
|
3974
|
-
}
|
|
3975
|
-
: undefined,
|
|
3976
|
-
);
|
|
3977
|
-
enriched[key] = resolved.value;
|
|
3978
|
-
fieldOutputs[key] = resolved.value;
|
|
3979
|
-
if (resolved.status === 'skipped') {
|
|
3980
|
-
cellMetaPatch[key] = {
|
|
3981
|
-
status: 'skipped',
|
|
3982
|
-
stage: key,
|
|
3983
|
-
runId: req.runId,
|
|
3984
|
-
};
|
|
3985
|
-
} else {
|
|
3986
|
-
const completedAt = nowMs();
|
|
3987
|
-
const stalenessMeta = resolveCompletedCellStalenessMeta({
|
|
3988
|
-
policy: authoredCellPolicies?.[key],
|
|
3989
|
-
value: resolved.value,
|
|
3990
|
-
completedAt,
|
|
3972
|
+
let activeField: string | null = null;
|
|
3973
|
+
try {
|
|
3974
|
+
for (const [key, value] of fieldEntries) {
|
|
3975
|
+
activeField = key;
|
|
3976
|
+
const rawCellMeta =
|
|
3977
|
+
enriched[DEEPLINE_CELL_META_FIELD] &&
|
|
3978
|
+
typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
|
|
3979
|
+
? (
|
|
3980
|
+
enriched[DEEPLINE_CELL_META_FIELD] as Record<
|
|
3981
|
+
string,
|
|
3982
|
+
unknown
|
|
3983
|
+
>
|
|
3984
|
+
)[key]
|
|
3985
|
+
: null;
|
|
3986
|
+
const reuseDecision = shouldRecomputeCell({
|
|
3987
|
+
hasValue: isCompletedWorkerFieldValue(enriched[key]),
|
|
3988
|
+
meta:
|
|
3989
|
+
rawCellMeta && typeof rawCellMeta === 'object'
|
|
3990
|
+
? (rawCellMeta as {
|
|
3991
|
+
status?: string;
|
|
3992
|
+
completedAt?: number;
|
|
3993
|
+
staleAt?: number | null;
|
|
3994
|
+
staleAfterSeconds?: number | null;
|
|
3995
|
+
})
|
|
3996
|
+
: null,
|
|
3997
|
+
policy: cellPolicies?.[key],
|
|
3991
3998
|
});
|
|
3992
|
-
|
|
3993
|
-
|
|
3994
|
-
|
|
3995
|
-
|
|
3996
|
-
|
|
3997
|
-
|
|
3998
|
-
|
|
3999
|
+
const previousCell = previousCellFromValue({
|
|
4000
|
+
hasValue: isCompletedWorkerFieldValue(enriched[key]),
|
|
4001
|
+
value: enriched[key],
|
|
4002
|
+
meta:
|
|
4003
|
+
rawCellMeta && typeof rawCellMeta === 'object'
|
|
4004
|
+
? (rawCellMeta as {
|
|
4005
|
+
status?: string;
|
|
4006
|
+
completedAt?: number;
|
|
4007
|
+
staleAt?: number | null;
|
|
4008
|
+
staleAfterSeconds?: number | null;
|
|
4009
|
+
})
|
|
4010
|
+
: null,
|
|
4011
|
+
});
|
|
4012
|
+
if (reuseDecision.action === 'reuse') {
|
|
4013
|
+
cellMetaPatch[key] = {
|
|
4014
|
+
status: 'cached',
|
|
4015
|
+
stage: key,
|
|
4016
|
+
reused: true,
|
|
4017
|
+
runId: req.runId,
|
|
4018
|
+
};
|
|
4019
|
+
continue;
|
|
4020
|
+
}
|
|
4021
|
+
const resolved = await executeWorkerStepResolver(
|
|
4022
|
+
value,
|
|
4023
|
+
enriched,
|
|
4024
|
+
rowCtx,
|
|
4025
|
+
absoluteIndex,
|
|
4026
|
+
previousCell,
|
|
4027
|
+
isWorkerStepProgram(value)
|
|
4028
|
+
? {
|
|
4029
|
+
parentField: key,
|
|
4030
|
+
path: [],
|
|
4031
|
+
outputs: stepProgramOutputs,
|
|
4032
|
+
}
|
|
4033
|
+
: undefined,
|
|
4034
|
+
);
|
|
4035
|
+
enriched[key] = resolved.value;
|
|
4036
|
+
fieldOutputs[key] = resolved.value;
|
|
4037
|
+
if (resolved.status === 'skipped') {
|
|
4038
|
+
cellMetaPatch[key] = {
|
|
4039
|
+
status: 'skipped',
|
|
4040
|
+
stage: key,
|
|
4041
|
+
runId: req.runId,
|
|
4042
|
+
};
|
|
4043
|
+
} else {
|
|
4044
|
+
const completedAt = nowMs();
|
|
4045
|
+
const stalenessMeta = resolveCompletedCellStalenessMeta({
|
|
4046
|
+
policy: authoredCellPolicies?.[key],
|
|
4047
|
+
value: resolved.value,
|
|
4048
|
+
completedAt,
|
|
4049
|
+
});
|
|
4050
|
+
cellMetaPatch[key] = {
|
|
4051
|
+
status: 'completed',
|
|
4052
|
+
stage: key,
|
|
4053
|
+
runId: req.runId,
|
|
4054
|
+
completedAt,
|
|
4055
|
+
...stalenessMeta,
|
|
4056
|
+
};
|
|
4057
|
+
}
|
|
4058
|
+
activeField = null;
|
|
3999
4059
|
}
|
|
4000
|
-
|
|
4001
|
-
|
|
4002
|
-
|
|
4003
|
-
|
|
4004
|
-
|
|
4005
|
-
|
|
4006
|
-
|
|
4007
|
-
|
|
4008
|
-
|
|
4060
|
+
for (const stepOutput of stepProgramOutputs) {
|
|
4061
|
+
enriched[stepOutput.columnName] = stepOutput.value;
|
|
4062
|
+
fieldOutputs[stepOutput.columnName] = stepOutput.value;
|
|
4063
|
+
generatedOutputFields.add(stepOutput.columnName);
|
|
4064
|
+
if (stepOutput.status === 'skipped') {
|
|
4065
|
+
cellMetaPatch[stepOutput.columnName] = {
|
|
4066
|
+
status: 'skipped',
|
|
4067
|
+
stage: stepOutput.stepId,
|
|
4068
|
+
runId: req.runId,
|
|
4069
|
+
};
|
|
4070
|
+
}
|
|
4071
|
+
}
|
|
4072
|
+
for (const waterfallOutput of waterfallOutputs) {
|
|
4073
|
+
const columnName =
|
|
4074
|
+
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
4075
|
+
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
4076
|
+
enriched[columnName] = waterfallOutput.value;
|
|
4077
|
+
generatedOutputFields.add(columnName);
|
|
4078
|
+
}
|
|
4079
|
+
executedCellMetaPatches[myIndex] =
|
|
4080
|
+
Object.keys(cellMetaPatch).length > 0
|
|
4081
|
+
? cellMetaPatch
|
|
4082
|
+
: undefined;
|
|
4083
|
+
executedRows[myIndex] = enriched as T &
|
|
4084
|
+
Record<string, unknown>;
|
|
4085
|
+
completedExecutedRows += 1;
|
|
4086
|
+
reportChunkProgress(false);
|
|
4087
|
+
} catch (rowError) {
|
|
4088
|
+
// Row failure isolation (the default): one row's
|
|
4089
|
+
// tool/provider error is recorded on that row and its
|
|
4090
|
+
// siblings continue. Abort/budget errors stay run-fatal,
|
|
4091
|
+
// and `onRowError: 'fail'` opts the map into fail-fast.
|
|
4092
|
+
if (
|
|
4093
|
+
failFastRowErrors ||
|
|
4094
|
+
isRowIsolationExemptError(rowError)
|
|
4095
|
+
) {
|
|
4096
|
+
throw rowError;
|
|
4097
|
+
}
|
|
4098
|
+
const message = formatWorkerRowFailureMessage(rowError);
|
|
4099
|
+
if (activeField) {
|
|
4100
|
+
cellMetaPatch[activeField] = {
|
|
4101
|
+
status: 'failed',
|
|
4102
|
+
stage: activeField,
|
|
4009
4103
|
runId: req.runId,
|
|
4104
|
+
error: message,
|
|
4010
4105
|
};
|
|
4011
4106
|
}
|
|
4107
|
+
executedCellMetaPatches[myIndex] =
|
|
4108
|
+
Object.keys(cellMetaPatch).length > 0
|
|
4109
|
+
? cellMetaPatch
|
|
4110
|
+
: undefined;
|
|
4111
|
+
// Keep the partially-enriched row so completed sibling
|
|
4112
|
+
// cells persist and replay free when the row re-executes.
|
|
4113
|
+
failedRowEntries[myIndex] = {
|
|
4114
|
+
row: enriched as T & Record<string, unknown>,
|
|
4115
|
+
error: message,
|
|
4116
|
+
};
|
|
4117
|
+
failedExecutedRows += 1;
|
|
4118
|
+
// Bounded per-chunk samples: every failure is persisted on
|
|
4119
|
+
// its row, but only the first few get a log line so a wide
|
|
4120
|
+
// outage cannot flood the Run Log Stream.
|
|
4121
|
+
if (failedExecutedRows <= MAP_ROW_FAILURE_SAMPLE_LIMIT) {
|
|
4122
|
+
emitEvent({
|
|
4123
|
+
type: 'log',
|
|
4124
|
+
level: 'warn',
|
|
4125
|
+
message:
|
|
4126
|
+
`Row ${absoluteIndex} of ctx.dataset("${name}") failed` +
|
|
4127
|
+
`${activeField ? ` at column "${activeField}"` : ''}: ${message} ` +
|
|
4128
|
+
'(row recorded as failed; sibling rows continue and the row re-executes on the next run)',
|
|
4129
|
+
ts: nowMs(),
|
|
4130
|
+
});
|
|
4131
|
+
}
|
|
4132
|
+
reportChunkProgress(false);
|
|
4012
4133
|
}
|
|
4013
|
-
for (const waterfallOutput of waterfallOutputs) {
|
|
4014
|
-
const columnName =
|
|
4015
|
-
`${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
|
|
4016
|
-
sqlishIdentifierPart(waterfallOutput.stepId);
|
|
4017
|
-
enriched[columnName] = waterfallOutput.value;
|
|
4018
|
-
generatedOutputFields.add(columnName);
|
|
4019
|
-
}
|
|
4020
|
-
executedCellMetaPatches[myIndex] =
|
|
4021
|
-
Object.keys(cellMetaPatch).length > 0
|
|
4022
|
-
? cellMetaPatch
|
|
4023
|
-
: undefined;
|
|
4024
|
-
executedRows[myIndex] = enriched as T & Record<string, unknown>;
|
|
4025
|
-
completedExecutedRows += 1;
|
|
4026
|
-
reportChunkProgress(false);
|
|
4027
4134
|
} finally {
|
|
4028
4135
|
if (rowMarkedActive) {
|
|
4029
4136
|
activeExecutedRows = Math.max(0, activeExecutedRows - 1);
|
|
@@ -4053,7 +4160,24 @@ function createMinimalWorkerCtx(
|
|
|
4053
4160
|
executedIndex: number;
|
|
4054
4161
|
} => entry !== null,
|
|
4055
4162
|
);
|
|
4056
|
-
|
|
4163
|
+
const failedRowsToPersist = failedRowEntries
|
|
4164
|
+
.map((failure, executedIndex) =>
|
|
4165
|
+
failure
|
|
4166
|
+
? {
|
|
4167
|
+
failure,
|
|
4168
|
+
executedIndex,
|
|
4169
|
+
}
|
|
4170
|
+
: null,
|
|
4171
|
+
)
|
|
4172
|
+
.filter(
|
|
4173
|
+
(
|
|
4174
|
+
entry,
|
|
4175
|
+
): entry is {
|
|
4176
|
+
failure: { row: T & Record<string, unknown>; error: string };
|
|
4177
|
+
executedIndex: number;
|
|
4178
|
+
} => entry !== null,
|
|
4179
|
+
);
|
|
4180
|
+
if (rowsToPersist.length === 0 && failedRowsToPersist.length === 0) {
|
|
4057
4181
|
return;
|
|
4058
4182
|
}
|
|
4059
4183
|
await persistCompletedMapRows({
|
|
@@ -4061,16 +4185,34 @@ function createMinimalWorkerCtx(
|
|
|
4061
4185
|
tableNamespace: name,
|
|
4062
4186
|
outputFields,
|
|
4063
4187
|
extraOutputFields: Array.from(generatedOutputFields),
|
|
4064
|
-
rows:
|
|
4065
|
-
...row,
|
|
4066
|
-
|
|
4067
|
-
|
|
4068
|
-
|
|
4069
|
-
|
|
4070
|
-
|
|
4071
|
-
|
|
4072
|
-
|
|
4073
|
-
|
|
4188
|
+
rows: [
|
|
4189
|
+
...rowsToPersist.map(({ row, executedIndex }) => ({
|
|
4190
|
+
...row,
|
|
4191
|
+
...(executedCellMetaPatches[executedIndex]
|
|
4192
|
+
? {
|
|
4193
|
+
__deeplineCellMetaPatch:
|
|
4194
|
+
executedCellMetaPatches[executedIndex],
|
|
4195
|
+
}
|
|
4196
|
+
: {}),
|
|
4197
|
+
__deeplineRowKey:
|
|
4198
|
+
uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
|
|
4199
|
+
})),
|
|
4200
|
+
// Failed rows persist as recoverable `_status='failed'` sheet
|
|
4201
|
+
// rows: partial data + per-cell failure meta + the row error.
|
|
4202
|
+
...failedRowsToPersist.map(({ failure, executedIndex }) => ({
|
|
4203
|
+
...failure.row,
|
|
4204
|
+
...(executedCellMetaPatches[executedIndex]
|
|
4205
|
+
? {
|
|
4206
|
+
__deeplineCellMetaPatch:
|
|
4207
|
+
executedCellMetaPatches[executedIndex],
|
|
4208
|
+
}
|
|
4209
|
+
: {}),
|
|
4210
|
+
__deeplineRowKey:
|
|
4211
|
+
uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
|
|
4212
|
+
__deeplineRowStatus: 'failed',
|
|
4213
|
+
__deeplineRowError: failure.error,
|
|
4214
|
+
})),
|
|
4215
|
+
],
|
|
4074
4216
|
});
|
|
4075
4217
|
};
|
|
4076
4218
|
const workersStartedAt = nowMs();
|
|
@@ -4167,9 +4309,11 @@ function createMinimalWorkerCtx(
|
|
|
4167
4309
|
executedIndex < executedRows.length;
|
|
4168
4310
|
executedIndex += 1
|
|
4169
4311
|
) {
|
|
4170
|
-
const executedRow = executedRows[executedIndex]
|
|
4312
|
+
const executedRow = executedRows[executedIndex];
|
|
4171
4313
|
const key = uniqueRowsToExecuteEntries[executedIndex]!.rowKey;
|
|
4172
|
-
|
|
4314
|
+
// Failed rows have no executed result; they stay out of the map output
|
|
4315
|
+
// dataset (their recoverable state lives in the runtime sheet).
|
|
4316
|
+
if (key && executedRow) resultByKey.set(key, executedRow);
|
|
4173
4317
|
}
|
|
4174
4318
|
const out = chunkRows
|
|
4175
4319
|
.map((_row, index) => {
|
|
@@ -4177,6 +4321,24 @@ function createMinimalWorkerCtx(
|
|
|
4177
4321
|
return resultByKey.get(key);
|
|
4178
4322
|
})
|
|
4179
4323
|
.filter((row): row is T & Record<string, unknown> => Boolean(row));
|
|
4324
|
+
const executedSuccessCount = Math.max(
|
|
4325
|
+
0,
|
|
4326
|
+
executedRows.length - failedExecutedRows,
|
|
4327
|
+
);
|
|
4328
|
+
const rowFailureSamples = failedRowEntries
|
|
4329
|
+
.map((failure, executedIndex) =>
|
|
4330
|
+
failure
|
|
4331
|
+
? {
|
|
4332
|
+
rowKey: uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
|
|
4333
|
+
error: failure.error,
|
|
4334
|
+
}
|
|
4335
|
+
: null,
|
|
4336
|
+
)
|
|
4337
|
+
.filter(
|
|
4338
|
+
(sample): sample is { rowKey: string; error: string } =>
|
|
4339
|
+
sample !== null,
|
|
4340
|
+
)
|
|
4341
|
+
.slice(0, MAP_ROW_FAILURE_SAMPLE_LIMIT);
|
|
4180
4342
|
const publicOut = out.map((row) => publicCsvOutputRow(row));
|
|
4181
4343
|
const hashStartedAt = nowMs();
|
|
4182
4344
|
const hash = await hashJson(publicOut);
|
|
@@ -4196,7 +4358,8 @@ function createMinimalWorkerCtx(
|
|
|
4196
4358
|
rowsRead: chunkRows.length,
|
|
4197
4359
|
rowsWritten: out.length,
|
|
4198
4360
|
rowsExecuted: executedRows.length,
|
|
4199
|
-
|
|
4361
|
+
rowsFailed: failedExecutedRows,
|
|
4362
|
+
rowsCached: Math.max(0, out.length - executedSuccessCount),
|
|
4200
4363
|
},
|
|
4201
4364
|
});
|
|
4202
4365
|
return {
|
|
@@ -4206,10 +4369,12 @@ function createMinimalWorkerCtx(
|
|
|
4206
4369
|
rowsRead: chunkRows.length,
|
|
4207
4370
|
rowsWritten: out.length,
|
|
4208
4371
|
rowsExecuted: executedRows.length,
|
|
4209
|
-
rowsCached: Math.max(0, out.length -
|
|
4372
|
+
rowsCached: Math.max(0, out.length - executedSuccessCount),
|
|
4210
4373
|
rowsDuplicateReused: duplicateInputReuseCount,
|
|
4211
4374
|
rowsInserted,
|
|
4212
4375
|
rowsSkipped,
|
|
4376
|
+
rowsFailed: failedExecutedRows,
|
|
4377
|
+
rowFailureSamples,
|
|
4213
4378
|
outputDatasetId: `map:${name}`,
|
|
4214
4379
|
hash,
|
|
4215
4380
|
preview: toWorkflowSerializableValue(publicOut.slice(0, 5)),
|
|
@@ -4228,6 +4393,8 @@ function createMinimalWorkerCtx(
|
|
|
4228
4393
|
let totalRowsDuplicateReused = 0;
|
|
4229
4394
|
let totalRowsInserted = 0;
|
|
4230
4395
|
let totalRowsSkipped = 0;
|
|
4396
|
+
let totalRowsFailed = 0;
|
|
4397
|
+
const totalRowFailureSamples: Array<{ rowKey: string; error: string }> = [];
|
|
4231
4398
|
|
|
4232
4399
|
const runChunkStep = async (
|
|
4233
4400
|
chunkRows: T[],
|
|
@@ -4255,22 +4422,35 @@ function createMinimalWorkerCtx(
|
|
|
4255
4422
|
};
|
|
4256
4423
|
|
|
4257
4424
|
const finalize = (totalRowsWritten: number) => {
|
|
4425
|
+
const failureSampleSummary =
|
|
4426
|
+
totalRowFailureSamples.length > 0
|
|
4427
|
+
? ` First error: ${totalRowFailureSamples[0]!.error}`
|
|
4428
|
+
: '';
|
|
4258
4429
|
const cacheSummary =
|
|
4259
|
-
|
|
4260
|
-
|
|
4261
|
-
|
|
4430
|
+
totalRowsFailed > 0
|
|
4431
|
+
? `Map completed with partial failures: ${totalRowsWritten} succeeded, ` +
|
|
4432
|
+
`${totalRowsFailed} failed (${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
|
|
4433
|
+
`inserted=${totalRowsInserted} skipped=${totalRowsSkipped}. ` +
|
|
4434
|
+
`Failed rows are persisted with their errors and re-execute on the next run.${failureSampleSummary}`
|
|
4435
|
+
: `Map completed: ${totalRowsWritten} results ` +
|
|
4436
|
+
`(${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
|
|
4437
|
+
`inserted=${totalRowsInserted} skipped=${totalRowsSkipped}`;
|
|
4262
4438
|
const completedAt = nowMs();
|
|
4263
4439
|
callbacks?.onMapCompleted?.(mapNodeId, completedAt);
|
|
4264
4440
|
void updateMapProgress({
|
|
4265
4441
|
completed: totalRowsWritten,
|
|
4266
|
-
total: totalRowsWritten,
|
|
4442
|
+
total: totalRowsWritten + totalRowsFailed,
|
|
4443
|
+
failed: totalRowsFailed,
|
|
4267
4444
|
completedAt,
|
|
4268
4445
|
updatedAt: completedAt,
|
|
4269
|
-
message:
|
|
4446
|
+
message:
|
|
4447
|
+
totalRowsFailed > 0
|
|
4448
|
+
? `${totalRowsWritten.toLocaleString()} succeeded, ${totalRowsFailed.toLocaleString()} failed`
|
|
4449
|
+
: formatMapProgressMessage(totalRowsWritten, totalRowsWritten),
|
|
4270
4450
|
});
|
|
4271
4451
|
emitEvent({
|
|
4272
4452
|
type: 'log',
|
|
4273
|
-
level: 'info',
|
|
4453
|
+
level: totalRowsFailed > 0 ? 'warn' : 'info',
|
|
4274
4454
|
message: cacheSummary,
|
|
4275
4455
|
ts: nowMs(),
|
|
4276
4456
|
});
|
|
@@ -4299,12 +4479,12 @@ function createMinimalWorkerCtx(
|
|
|
4299
4479
|
recordRunnerPerfTrace({ req, phase, ms, extra }),
|
|
4300
4480
|
nowMs,
|
|
4301
4481
|
workProgress: {
|
|
4302
|
-
total: totalRowsWritten,
|
|
4482
|
+
total: totalRowsWritten + totalRowsFailed,
|
|
4303
4483
|
executed: totalRowsExecuted,
|
|
4304
4484
|
reused: totalRowsCached,
|
|
4305
4485
|
skipped: totalRowsCached,
|
|
4306
4486
|
pending: 0,
|
|
4307
|
-
failed:
|
|
4487
|
+
failed: totalRowsFailed,
|
|
4308
4488
|
...(totalRowsDuplicateReused > 0
|
|
4309
4489
|
? { duplicates: { exact: totalRowsDuplicateReused } }
|
|
4310
4490
|
: {}),
|
|
@@ -4325,9 +4505,17 @@ function createMinimalWorkerCtx(
|
|
|
4325
4505
|
totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
|
|
4326
4506
|
totalRowsInserted += chunkResult.rowsInserted;
|
|
4327
4507
|
totalRowsSkipped += chunkResult.rowsSkipped;
|
|
4508
|
+
totalRowsFailed += chunkResult.rowsFailed ?? 0;
|
|
4509
|
+
for (const sample of chunkResult.rowFailureSamples ?? []) {
|
|
4510
|
+
if (totalRowFailureSamples.length >= MAP_ROW_FAILURE_SAMPLE_LIMIT) {
|
|
4511
|
+
break;
|
|
4512
|
+
}
|
|
4513
|
+
totalRowFailureSamples.push(sample);
|
|
4514
|
+
}
|
|
4328
4515
|
await updateMapProgress({
|
|
4329
4516
|
completed: totalRowsWritten,
|
|
4330
4517
|
total: rowCountHint ?? undefined,
|
|
4518
|
+
...(totalRowsFailed > 0 ? { failed: totalRowsFailed } : {}),
|
|
4331
4519
|
message: formatMapProgressMessage(
|
|
4332
4520
|
totalRowsWritten,
|
|
4333
4521
|
rowCountHint ?? undefined,
|
|
@@ -4356,6 +4544,18 @@ function createMinimalWorkerCtx(
|
|
|
4356
4544
|
chunkStart += chunkRows.length;
|
|
4357
4545
|
chunkIndex += 1;
|
|
4358
4546
|
}
|
|
4547
|
+
if (totalRowsFailed > 0 && totalRowsWritten === 0) {
|
|
4548
|
+
// Every row failed: this is a systemic failure (provider outage, broken
|
|
4549
|
+
// resolver, exhausted credits), not a partial one. Isolating it would
|
|
4550
|
+
// silently complete the run with an empty dataset. Fail loudly — the
|
|
4551
|
+
// failed rows are persisted with their errors and re-execute on re-run.
|
|
4552
|
+
const firstError = totalRowFailureSamples[0]?.error ?? 'unknown error';
|
|
4553
|
+
throw new Error(
|
|
4554
|
+
`ctx.dataset("${name}") failed for all ${totalRowsFailed} executed rows. ` +
|
|
4555
|
+
`First error: ${firstError} ` +
|
|
4556
|
+
`(rows are persisted with per-row errors; fix the cause and re-run to resume)`,
|
|
4557
|
+
);
|
|
4558
|
+
}
|
|
4359
4559
|
const dataset = finalize(totalRowsWritten);
|
|
4360
4560
|
recordRunnerPerfTrace({
|
|
4361
4561
|
req,
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isHardBillingToolHttpError,
|
|
3
|
+
isRateLimitToolHttpError,
|
|
4
|
+
} from './tool-http-errors';
|
|
5
|
+
|
|
6
|
+
/**
 * Error signalling that the workflow was terminated from the outside.
 *
 * It is raised by `assertNotAborted` and propagates through ctx.step,
 * ctx.sleep and map processing. Cancellation is cooperative: user code in a
 * play has to look at `ctx.signal.aborted` (or await one of the abort-aware
 * ctx methods) before it continues with more work.
 */
export class WorkflowAbortError extends Error {
  /** Stable discriminator so the error can be recognised by name alone. */
  override readonly name: 'WorkflowAbort';

  /**
   * @param message Human-readable reason; defaults to the generic
   *   cancellation text.
   */
  constructor(message = 'Play run cancelled.') {
    super(message);
    this.name = 'WorkflowAbort';
  }
}
|
|
18
|
+
|
|
19
|
+
/**
 * Decides whether a thrown value represents a cancellation/abort.
 *
 * A value counts as abort-like when it is a `WorkflowAbortError`, or an
 * `Error` whose `name` is `WorkflowAbort` / `AbortError`, or an `Error` whose
 * message contains one of the words "canceled"/"cancelled", "aborted",
 * "terminate" or "terminated" (case-insensitive, whole word).
 *
 * @param error Any thrown value.
 * @returns `true` for abort-like errors; `false` for everything else,
 *   including falsy values and non-`Error` objects.
 */
export function isAbortLikeError(error: unknown): boolean {
  if (error instanceof WorkflowAbortError) {
    return true;
  }
  // Falsy values and plain (non-Error) throwables are never abort signals.
  if (!(error instanceof Error)) {
    return false;
  }
  return (
    error.name === 'WorkflowAbort' ||
    error.name === 'AbortError' ||
    // Last resort: recognise cancellation wording in the message text.
    /\b(cancell?ed|aborted|terminate[d]?)\b/i.test(error.message)
  );
}
|
|
29
|
+
|
|
30
|
+
/**
 * Identifies errors that have to remain run-fatal even though map rows
 * isolate their failures by default.
 *
 * Four categories are exempt from per-row isolation:
 *
 * - Cancellation/abort: the run has to stop.
 * - Governor budget exhaustion: a run-level invariant. Isolating it per row
 *   would turn "this run exceeded its execution budget" into thousands of
 *   identical row failures without anyone noticing.
 * - Rate-limit pushback (a tool call still answered with HTTP 429 once the
 *   in-process retry budget was spent): run-level throughput pressure that
 *   hits every row alike and is not a defect of the row. Isolating it would
 *   silently drop healthy rows from the output dataset whenever a provider
 *   throttles; the durable chunk step's retries (and, if the storm persists,
 *   a loud run failure with recoverable persisted rows) are the correct
 *   response.
 * - Hard billing failures (billing cap / insufficient credits): these promise
 *   "run halted before marking remaining rows processed"; isolating them
 *   would complete the run while silently failing every remaining row.
 *
 * @param error Any thrown value.
 * @returns `true` when the error must abort the run rather than be recorded
 *   as a single failed row.
 */
export function isRowIsolationExemptError(error: unknown): boolean {
  return (
    isAbortLikeError(error) ||
    (error instanceof Error && error.name === 'GovernorBudgetError') ||
    isRateLimitToolHttpError(error) ||
    isHardBillingToolHttpError(error)
  );
}
|
|
@@ -1,10 +1,17 @@
|
|
|
1
1
|
/**
 * Error describing a failed tool-execute HTTP response. Alongside the
 * message it keeps the billing payload (if any) and the HTTP status so
 * callers can classify the failure without parsing the message text.
 */
export class ToolHttpError extends Error {
  /** Billing payload attached to the failure, or `null` when there is none. */
  readonly billing: Record<string, unknown> | null;
  /** HTTP status of the failed tool-execute response (for example 429 or 502). */
  readonly status: number;

  /**
   * @param message Normalized, human-readable failure message.
   * @param billing Billing payload to expose on the error, or `null`.
   * @param status HTTP status code of the failed response.
   */
  constructor(
    message: string,
    billing: Record<string, unknown> | null,
    status: number,
  ) {
    super(message);
    // Set the name first so the own-property order stays name, billing, status.
    this.name = 'ToolHttpError';
    this.billing = billing;
    this.status = status;
  }
}
|
|
10
17
|
|
|
@@ -200,6 +207,7 @@ export function normalizeToolHttpErrorMessage(input: {
|
|
|
200
207
|
},
|
|
201
208
|
)}`,
|
|
202
209
|
billing,
|
|
210
|
+
input.status,
|
|
203
211
|
);
|
|
204
212
|
}
|
|
205
213
|
const hardBillingPayload = isHardBillingFailurePayload(billing)
|
|
@@ -217,6 +225,7 @@ export function normalizeToolHttpErrorMessage(input: {
|
|
|
217
225
|
maxAttempts: input.maxAttempts,
|
|
218
226
|
}),
|
|
219
227
|
hardBillingPayload,
|
|
228
|
+
input.status,
|
|
220
229
|
);
|
|
221
230
|
}
|
|
222
231
|
return new ToolHttpError(
|
|
@@ -227,6 +236,7 @@ export function normalizeToolHttpErrorMessage(input: {
|
|
|
227
236
|
},
|
|
228
237
|
)}`,
|
|
229
238
|
billing,
|
|
239
|
+
input.status,
|
|
230
240
|
);
|
|
231
241
|
}
|
|
232
242
|
|
|
@@ -241,3 +251,12 @@ export function isHardBillingToolHttpError(error: unknown): boolean {
|
|
|
241
251
|
error instanceof ToolHttpError && isHardBillingFailurePayload(error.billing)
|
|
242
252
|
);
|
|
243
253
|
}
|
|
254
|
+
|
|
255
|
+
/**
 * Recognises a tool call whose final outcome was HTTP 429, i.e. provider or
 * Deepline-internal rate-limit pushback that outlasted the in-process retry
 * budget. Such a failure is run-level throughput pressure and never a defect
 * of an individual row.
 *
 * @param error Any thrown value.
 * @returns `true` only for a `ToolHttpError` whose `status` is exactly 429.
 */
export function isRateLimitToolHttpError(error: unknown): boolean {
  if (!(error instanceof ToolHttpError)) {
    return false;
  }
  return error.status === 429;
}
|
|
@@ -53,10 +53,11 @@ export const SDK_RELEASE = {
|
|
|
53
53
|
// 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
|
|
54
54
|
// 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
|
|
55
55
|
// 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
|
|
56
|
-
|
|
56
|
+
// 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
|
|
57
|
+
version: '0.1.101',
|
|
57
58
|
apiContract: '2026-06-dataset-column-cell-stale-hard-cutover',
|
|
58
59
|
supportPolicy: {
|
|
59
|
-
latest: '0.1.
|
|
60
|
+
latest: '0.1.101',
|
|
60
61
|
minimumSupported: '0.1.53',
|
|
61
62
|
deprecatedBelow: '0.1.53',
|
|
62
63
|
},
|
|
@@ -3,9 +3,16 @@ import type { AnyBatchOperationStrategy } from './batching-types';
|
|
|
3
3
|
/**
 * Outcome of a single request executed as part of a chunk.
 */
export interface ChunkExecutionResult<TRequest, TResult> {
  /** The request this outcome belongs to. */
  request: TRequest;
  /** The produced value, or `null` when no value is available. */
  result: TResult | null;
  /**
   * Set when executing this request rejected. Only this request is marked as
   * failed; the other requests in the same chunk keep their results. A single
   * provider hiccup has to remain a per-request failure instead of a
   * run-level abort that throws away billed work (rows that completed calls
   * already persisted stay recoverable).
   */
  error?: string;
}
|
|
7
14
|
|
|
8
|
-
function formatChunkExecutionError(error: unknown): string {
|
|
15
|
+
export function formatChunkExecutionError(error: unknown): string {
|
|
9
16
|
if (error instanceof Error) {
|
|
10
17
|
return error.message;
|
|
11
18
|
}
|
|
@@ -23,6 +30,13 @@ export async function executeChunkedRequests<TRequest, TResult>(input: {
|
|
|
23
30
|
requests: TRequest[];
|
|
24
31
|
batchSize: number;
|
|
25
32
|
execute: (request: TRequest) => Promise<TResult>;
|
|
33
|
+
/**
|
|
34
|
+
* Loud per-request failure hook. A rejected request is recorded as a
|
|
35
|
+
* `result: null` entry with `error` set so the row-level state can carry
|
|
36
|
+
* the provider error; it must never abort the chunk, the sibling requests,
|
|
37
|
+
* or the run. Callers use this to log and persist the failure.
|
|
38
|
+
*/
|
|
39
|
+
onRequestError?: (request: TRequest, error: unknown) => void;
|
|
26
40
|
onChunkComplete?: (
|
|
27
41
|
results: Array<ChunkExecutionResult<TRequest, TResult>>,
|
|
28
42
|
) => void | Promise<void>;
|
|
@@ -35,21 +49,20 @@ export async function executeChunkedRequests<TRequest, TResult>(input: {
|
|
|
35
49
|
chunk.map((request) => input.execute(request)),
|
|
36
50
|
);
|
|
37
51
|
|
|
38
|
-
const rejected = settled.find(
|
|
39
|
-
(outcome): outcome is PromiseRejectedResult =>
|
|
40
|
-
outcome.status === 'rejected',
|
|
41
|
-
);
|
|
42
|
-
if (rejected) {
|
|
43
|
-
throw new Error(
|
|
44
|
-
`Play batch request failed: ${formatChunkExecutionError(rejected.reason)}`,
|
|
45
|
-
{ cause: rejected.reason },
|
|
46
|
-
);
|
|
47
|
-
}
|
|
48
|
-
|
|
49
52
|
for (let index = 0; index < chunk.length; index += 1) {
|
|
50
|
-
const
|
|
53
|
+
const request = chunk[index]!;
|
|
54
|
+
const outcome = settled[index]!;
|
|
55
|
+
if (outcome.status === 'rejected') {
|
|
56
|
+
input.onRequestError?.(request, outcome.reason);
|
|
57
|
+
results.push({
|
|
58
|
+
request,
|
|
59
|
+
result: null,
|
|
60
|
+
error: formatChunkExecutionError(outcome.reason),
|
|
61
|
+
});
|
|
62
|
+
continue;
|
|
63
|
+
}
|
|
51
64
|
results.push({
|
|
52
|
-
request
|
|
65
|
+
request,
|
|
53
66
|
result: outcome.value,
|
|
54
67
|
});
|
|
55
68
|
}
|