npm - deepline - Versions diffs - 0.1.99 → 0.1.101 - Mend

deepline 0.1.99 → 0.1.101

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/cli/index.js +12 -2
package/dist/cli/index.mjs +12 -2
package/dist/index.js +3 -2
package/dist/index.mjs +3 -2
package/dist/repo/apps/play-runner-workers/src/entry.ts +399 -199
package/dist/repo/apps/play-runner-workers/src/runtime/row-isolation.ts +53 -0
package/dist/repo/apps/play-runner-workers/src/runtime/tool-http-errors.ts +20 -1
package/dist/repo/sdk/src/release.ts +3 -2
package/dist/repo/shared_libs/play-runtime/batch-runtime.ts +27 -14
package/package.json +1 -1

package/dist/cli/index.js CHANGED

@@ -232,10 +232,11 @@ var SDK_RELEASE = {
   // 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
   // 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
   // 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
-  version: "0.1.99",
+  // 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
+  version: "0.1.101",
   apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
   supportPolicy: {
-    latest: "0.1.99",
+    latest: "0.1.101",
     minimumSupported: "0.1.53",
     deprecatedBelow: "0.1.53"
   }
@@ -11148,6 +11149,15 @@ function buildRunPackageTextLines(packaged) {
   if (runError && (status === "failed" || status === "cancelled")) {
     lines.push(`  error: ${runError.slice(0, 200)}`);
   }
+  for (const step of readRecordArray(packaged.steps)) {
+    const output2 = step.output && typeof step.output === "object" && !Array.isArray(step.output) ? step.output : null;
+    if (!output2 || output2.recovered !== true) continue;
+    const rowCount = typeof output2.rowCount === "number" ? formatInteger(output2.rowCount) : "persisted";
+    const datasetPath = typeof output2.path === "string" ? output2.path : "dataset";
+    lines.push(
+      `  recoverable: ${rowCount} rows persisted in ${datasetPath} \u2014 re-running reuses them; export with the command below`
+    );
+  }
   if (playName) {
     lines.push(`  play: ${playName}`);
   }

package/dist/cli/index.mjs CHANGED

@@ -209,10 +209,11 @@ var SDK_RELEASE = {
   // 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
   // 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
   // 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
-  version: "0.1.99",
+  // 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
+  version: "0.1.101",
   apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
   supportPolicy: {
-    latest: "0.1.99",
+    latest: "0.1.101",
     minimumSupported: "0.1.53",
     deprecatedBelow: "0.1.53"
   }
@@ -11164,6 +11165,15 @@ function buildRunPackageTextLines(packaged) {
   if (runError && (status === "failed" || status === "cancelled")) {
     lines.push(`  error: ${runError.slice(0, 200)}`);
   }
+  for (const step of readRecordArray(packaged.steps)) {
+    const output2 = step.output && typeof step.output === "object" && !Array.isArray(step.output) ? step.output : null;
+    if (!output2 || output2.recovered !== true) continue;
+    const rowCount = typeof output2.rowCount === "number" ? formatInteger(output2.rowCount) : "persisted";
+    const datasetPath = typeof output2.path === "string" ? output2.path : "dataset";
+    lines.push(
+      `  recoverable: ${rowCount} rows persisted in ${datasetPath} \u2014 re-running reuses them; export with the command below`
+    );
+  }
   if (playName) {
     lines.push(`  play: ${playName}`);
   }

package/dist/index.js CHANGED

@@ -260,10 +260,11 @@ var SDK_RELEASE = {
   // 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
   // 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
   // 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
-  version: "0.1.99",
+  // 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
+  version: "0.1.101",
   apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
   supportPolicy: {
-    latest: "0.1.99",
+    latest: "0.1.101",
     minimumSupported: "0.1.53",
     deprecatedBelow: "0.1.53"
   }

package/dist/index.mjs CHANGED

@@ -182,10 +182,11 @@ var SDK_RELEASE = {
   // 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
   // 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
   // 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
-  version: "0.1.99",
+  // 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
+  version: "0.1.101",
   apiContract: "2026-06-dataset-column-cell-stale-hard-cutover",
   supportPolicy: {
-    latest: "0.1.99",
+    latest: "0.1.101",
     minimumSupported: "0.1.53",
     deprecatedBelow: "0.1.53"
   }

package/dist/repo/apps/play-runner-workers/src/entry.ts CHANGED

@@ -161,6 +161,11 @@ import {
   isHardBillingToolHttpError,
   normalizeToolHttpErrorMessage,
 } from './runtime/tool-http-errors';
+import {
+  WorkflowAbortError,
+  isAbortLikeError,
+  isRowIsolationExemptError,
+} from './runtime/row-isolation';
 import {
   StepProgramDatasetBuilder,
   type StepProgramDatasetColumnInput,
@@ -733,7 +738,12 @@ function publicCsvStorageRow<T extends Record<string, unknown>>(row: T): T {
     storageRow[fieldName] =
       'value' in descriptor ? descriptor.value : publicRow[fieldName];
   }
-  for (const runtimeField of ['__deeplineRowKey', '__deeplineCellMetaPatch']) {
+  for (const runtimeField of [
+    '__deeplineRowKey',
+    '__deeplineCellMetaPatch',
+    '__deeplineRowStatus',
+    '__deeplineRowError',
+  ]) {
     if (Object.prototype.hasOwnProperty.call(row, runtimeField)) {
       storageRow[runtimeField] = row[runtimeField];
     }
@@ -1286,7 +1296,11 @@ async function callToolDirect(
   const maxAttempts = 3;
   let lastError: Error | null = null;
-  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
+  for (
+    let attempt = 1;
+    attempt <= WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS;
+    attempt += 1
+  ) {
     const res = await fetchRuntimeApi(req.baseUrl, path, {
       method: 'POST',
       headers: {
@@ -1310,11 +1324,17 @@ async function callToolDirect(
     }
     const text = await res.text().catch(() => '');
+    const isRateLimited = res.status === 429;
+    // Rate-limit pushback gets the larger 429-specific retry budget; every
+    // other failure keeps the generic 3-attempt budget.
+    const attemptCap = isRateLimited
+      ? WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS
+      : maxAttempts;
     lastError = normalizeToolHttpErrorMessage({
       toolId,
       status: res.status,
       attempt,
-      maxAttempts,
+      maxAttempts: attemptCap,
       bodyText: text,
     });
     const retryAfterSeconds = Number(res.headers.get('retry-after'));
@@ -1322,21 +1342,28 @@ async function callToolDirect(
       Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
         ? Math.ceil(retryAfterSeconds * 1000)
         : 0;
-    if (res.status === 429) {
+    if (isRateLimited) {
       // Feed the provider's backpressure into the shared pacer even on the
       // final attempt so the (org, provider) bucket backs off across isolates.
       onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
     }
     const retryable =
-      (res.status === 429 && !isHardBillingToolHttpError(lastError)) ||
+      (isRateLimited && !isHardBillingToolHttpError(lastError)) ||
       (res.status >= 500 && WORKER_RETRY_SAFE_5XX_TOOLS.has(toolId));
-    if (!retryable || attempt >= maxAttempts) {
+    if (!retryable || attempt >= attemptCap) {
       throw lastError;
     }
     // Charge the retry budget per attempt, matching the cjs runner's
     // chargeBudget('retry') on every 429 / retryable-5xx retry.
     onRetryAttempt?.();
-    const delayMs = retryAfterMs > 0 ? Math.min(5_000, retryAfterMs) : 1_000;
+    // 429 delays escalate per attempt (still honoring a larger retry-after)
+    // so sustained throttling spaces calls out instead of hammering the
+    // limiter with fixed 1s retries.
+    const delayMs = isRateLimited
+      ? Math.min(5_000, Math.max(retryAfterMs, 1_000 * attempt))
+      : retryAfterMs > 0
+        ? Math.min(5_000, retryAfterMs)
+        : 1_000;
     await new Promise((resolve) => setTimeout(resolve, delayMs));
   }
@@ -1498,12 +1525,27 @@ type WorkerToolBatchRequest = {
 const WORKER_TOOL_BATCH_GRACE_MS = 250;
 const MAP_EXECUTION_HEARTBEAT_INTERVAL_MS = 5_000;
+/**
+ * Bounded number of per-row failure samples carried in chunk summaries and the
+ * map's terminal partial-failure log. Every failed row is persisted with its
+ * full error in the runtime sheet; the samples just keep run logs readable.
+ */
+const MAP_ROW_FAILURE_SAMPLE_LIMIT = 3;
 // Fallback batch-chunk parallelism when a tool declares no provider rate hints.
 // Matches the prior hardcoded `Math.min(4, ...)` so undeclared providers keep
 // their previous batching behavior; declared providers tighten via the
 // Governor's suggestedParallelism.
 const WORKER_TOOL_BATCH_DEFAULT_PARALLELISM = 4;
 const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
+/**
+ * In-process retry budget for HTTP 429 tool responses. Rate-limit pushback is
+ * throughput pacing (provider or Deepline limiter), not a tool defect, so it
+ * gets more patience than the generic 3-attempt budget: with retry-after-aware
+ * escalating delays (capped at 5s) this absorbs roughly 25s of sustained
+ * throttling before the call fails. Every retry still charges the Governor's
+ * retry budget, so a runaway storm stays bounded and loud.
+ */
+const WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS = 8;
 function sleepWorkerMs(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
@@ -1761,6 +1803,15 @@ async function executeBatchedWorkerToolGroup(input: {
       >,
     ) => {
       for (const entry of chunkResults) {
+        if (entry.error !== undefined) {
+          // One batch's provider error stays scoped to that batch's member
+          // requests. Sibling batches in this chunk keep their results so a
+          // single provider hiccup cannot cascade into a whole-map failure.
+          for (const request of entry.request.memberRequests) {
+            request.reject(entry.error);
+          }
+          continue;
+        }
         const batchResult = isToolExecuteResult(entry.result)
           ? entry.result.toolOutput.raw
           : entry.result;
@@ -1841,6 +1892,10 @@ type WorkerMapChunkSummary<T extends Record<string, unknown>> = {
   rowsDuplicateReused: number;
   rowsInserted: number;
   rowsSkipped: number;
+  /** Rows whose execution failed and persisted as `_status='failed'`. */
+  rowsFailed: number;
+  /** Bounded sample of row failures for the partial-failure summary. */
+  rowFailureSamples: Array<{ rowKey: string; error: string }>;
   outputDatasetId: string;
   hash: string;
   preview: T[];
@@ -1926,6 +1981,31 @@ type WorkerMapOptions = {
         row: Record<string, unknown>,
         index: number,
       ) => string | number | readonly unknown[]);
+  /**
+   * Row failure policy. Default 'isolate': one row's tool/provider error is
+   * recorded on that row (cell meta + `_status='failed'` + `_error`), sibling
+   * rows continue, and the run completes with a partial-failure summary.
+   * Failed rows re-execute on the next run; succeeded rows replay free.
+   * 'fail' opts into fail-fast: the first row error aborts the map and fails
+   * the run (rows persisted before the error stay recoverable).
+   */
+  onRowError?: 'isolate' | 'fail';
+};
+/**
+ * Per-cell terminal state recorded by map row execution and merged into the
+ * Runtime Sheet row's `_cell_meta`. 'failed' carries the cell's error message;
+ * `shouldRecomputeCell` treats it as recompute on the next run.
+ */
+type WorkerCellMetaPatchEntry = {
+  status: 'cached' | 'skipped' | 'completed' | 'failed';
+  stage?: string | null;
+  reused?: boolean;
+  runId?: string;
+  completedAt?: number;
+  staleAt?: number | null;
+  staleAfterSeconds?: number | null;
+  error?: string;
 };
 function isWorkerStepProgram(value: unknown): value is WorkerStepProgram {
@@ -3025,46 +3105,6 @@ async function prepareMapRows(input: {
   };
 }
-/**
- * Builds the minimal HTTP-backed ctx surface needed to run tool-basic-shaped
- * plays. NOT a full implementation of shared_libs/play-runtime/context.ts.
- *
- * Supported:
- *   - ctx.log(msg)
- *   - ctx.csv(filename | inline rows)  (calls runtime API for file resolve)
- *   - ctx.dataset(name, rows).withColumn(name, resolver).run(opts)
- *   - ctx.tools.execute({ id, tool, input, ... })
- *   - ctx.runPlay(key, playRef, input, opts)
- *
- * Not supported (will throw):
- *   - ctx.fetch, checkpoints, etc.
- *
- * Plays that need more should run on Daytona; the resolver is composable.
- */
-/**
- * Thrown by `assertNotAborted` and surfaced through ctx.step / ctx.sleep / map
- * processing when the workflow has been terminated externally. Cooperatively
- * cancels in-flight user code: the play must check `ctx.signal.aborted` (or
- * await one of the abort-aware ctx methods) before doing more work.
- */
-class WorkflowAbortError extends Error {
-  override readonly name = 'WorkflowAbort';
-  constructor(message = 'Play run cancelled.') {
-    super(message);
-  }
-}
-function isAbortLikeError(error: unknown): boolean {
-  if (!error) return false;
-  if (error instanceof WorkflowAbortError) return true;
-  if (error instanceof Error) {
-    if (error.name === 'WorkflowAbort' || error.name === 'AbortError')
-      return true;
-    return /\b(cancell?ed|aborted|terminate[d]?)\b/i.test(error.message);
-  }
-  return false;
-}
 function assertNotAborted(signal: AbortSignal | undefined): void {
   if (signal?.aborted) {
     throw new WorkflowAbortError(
@@ -3075,6 +3115,19 @@ function assertNotAborted(signal: AbortSignal | undefined): void {
   }
 }
+/** Bounded, single-line row failure message persisted to row/cell state. */
+function formatWorkerRowFailureMessage(error: unknown): string {
+  const raw =
+    error instanceof Error
+      ? error.message
+      : typeof error === 'string'
+        ? error
+        : JSON.stringify(error);
+  const message = (raw ?? '').replace(/\s+/g, ' ').trim();
+  if (!message) return 'Row execution failed.';
+  return message.length > 1_000 ? `${message.slice(0, 1_000)}…` : message;
+}
 function childPipelineUsesCtxDataset(
   pipeline: PlayStaticPipeline | null | undefined,
 ): boolean {
@@ -3315,6 +3368,22 @@ function createGovernorForRun(req: RunRequest): {
   return { governor, resolvePacing };
 }
+/**
+ * Builds the minimal HTTP-backed ctx surface needed to run tool-basic-shaped
+ * plays. NOT a full implementation of shared_libs/play-runtime/context.ts.
+ *
+ * Supported:
+ *   - ctx.log(msg)
+ *   - ctx.csv(filename | inline rows)  (calls runtime API for file resolve)
+ *   - ctx.dataset(name, rows).withColumn(name, resolver).run(opts)
+ *   - ctx.tools.execute({ id, tool, input, ... })
+ *   - ctx.runPlay(key, playRef, input, opts)
+ *
+ * Not supported (will throw):
+ *   - ctx.fetch, checkpoints, etc.
+ *
+ * Plays that need more should run on Daytona; the resolver is composable.
+ */
 function createMinimalWorkerCtx(
   req: RunRequest,
   emitEvent: (event: RunnerEvent) => void,
@@ -3739,6 +3808,7 @@ function createMinimalWorkerCtx(
         prepared.skipped - missingPreparedRows.length,
       );
       let completedExecutedRows = 0;
+      let failedExecutedRows = 0;
       let startedExecutedRows = 0;
       let activeExecutedRows = 0;
       let lastChunkProgressAt = 0;
@@ -3809,20 +3879,15 @@ function createMinimalWorkerCtx(
       const executedRows: Array<T & Record<string, unknown>> = new Array(
         rowsToExecute.length,
       );
+      // Row failure isolation (default): a failed row keeps its
+      // partially-enriched data + the row error so it persists as a
+      // recoverable `_status='failed'` sheet row instead of aborting the map.
+      const failFastRowErrors = opts?.onRowError === 'fail';
+      const failedRowEntries: Array<
+        { row: T & Record<string, unknown>; error: string } | undefined
+      > = new Array(rowsToExecute.length);
       const executedCellMetaPatches: Array<
-        | Record<
-            string,
-            {
-              status: 'cached' | 'skipped' | 'completed';
-              stage?: string | null;
-              reused?: boolean;
-              runId?: string;
-              completedAt?: number;
-              staleAt?: number | null;
-              staleAfterSeconds?: number | null;
-            }
-          >
-        | undefined
+        Record<string, WorkerCellMetaPatchEntry> | undefined
       > = new Array(rowsToExecute.length);
       const toolBatchScheduler = new WorkerToolBatchScheduler(
         req,
@@ -3869,18 +3934,8 @@ function createMinimalWorkerCtx(
                 const enriched: Record<string, unknown> =
                   cloneCsvAliasedRow(row);
                 const fieldOutputs: Record<string, unknown> = {};
-                const cellMetaPatch: Record<
-                  string,
-                  {
-                    status: 'cached' | 'skipped' | 'completed';
-                    stage?: string | null;
-                    reused?: boolean;
-                    runId?: string;
-                    completedAt?: number;
-                    staleAt?: number | null;
-                    staleAfterSeconds?: number | null;
-                  }
-                > = {};
+                const cellMetaPatch: Record<string, WorkerCellMetaPatchEntry> =
+                  {};
                 const waterfallOutputs: RecordedWaterfallOutput[] = [];
                 const stepProgramOutputs: RecordedStepProgramOutput[] = [];
                 const rowCtx = {
@@ -3914,116 +3969,168 @@ function createMinimalWorkerCtx(
                       workflowStep,
                     ),
                 };
-                for (const [key, value] of fieldEntries) {
-                  const rawCellMeta =
-                    enriched[DEEPLINE_CELL_META_FIELD] &&
-                    typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
-                      ? (
-                          enriched[DEEPLINE_CELL_META_FIELD] as Record<
-                            string,
-                            unknown
-                          >
-                        )[key]
-                      : null;
-                  const reuseDecision = shouldRecomputeCell({
-                    hasValue: isCompletedWorkerFieldValue(enriched[key]),
-                    meta:
-                      rawCellMeta && typeof rawCellMeta === 'object'
-                        ? (rawCellMeta as {
-                            status?: string;
-                            completedAt?: number;
-                            staleAt?: number | null;
-                            staleAfterSeconds?: number | null;
-                          })
-                        : null,
-                    policy: cellPolicies?.[key],
-                  });
-                  const previousCell = previousCellFromValue({
-                    hasValue: isCompletedWorkerFieldValue(enriched[key]),
-                    value: enriched[key],
-                    meta:
-                      rawCellMeta && typeof rawCellMeta === 'object'
-                        ? (rawCellMeta as {
-                            status?: string;
-                            completedAt?: number;
-                            staleAt?: number | null;
-                            staleAfterSeconds?: number | null;
-                          })
-                        : null,
-                  });
-                  if (reuseDecision.action === 'reuse') {
-                    cellMetaPatch[key] = {
-                      status: 'cached',
-                      stage: key,
-                      reused: true,
-                      runId: req.runId,
-                    };
-                    continue;
-                  }
-                  const resolved = await executeWorkerStepResolver(
-                    value,
-                    enriched,
-                    rowCtx,
-                    absoluteIndex,
-                    previousCell,
-                    isWorkerStepProgram(value)
-                      ? {
-                          parentField: key,
-                          path: [],
-                          outputs: stepProgramOutputs,
-                        }
-                      : undefined,
-                  );
-                  enriched[key] = resolved.value;
-                  fieldOutputs[key] = resolved.value;
-                  if (resolved.status === 'skipped') {
-                    cellMetaPatch[key] = {
-                      status: 'skipped',
-                      stage: key,
-                      runId: req.runId,
-                    };
-                  } else {
-                    const completedAt = nowMs();
-                    const stalenessMeta = resolveCompletedCellStalenessMeta({
-                      policy: authoredCellPolicies?.[key],
-                      value: resolved.value,
-                      completedAt,
+                let activeField: string | null = null;
+                try {
+                  for (const [key, value] of fieldEntries) {
+                    activeField = key;
+                    const rawCellMeta =
+                      enriched[DEEPLINE_CELL_META_FIELD] &&
+                      typeof enriched[DEEPLINE_CELL_META_FIELD] === 'object'
+                        ? (
+                            enriched[DEEPLINE_CELL_META_FIELD] as Record<
+                              string,
+                              unknown
+                            >
+                          )[key]
+                        : null;
+                    const reuseDecision = shouldRecomputeCell({
+                      hasValue: isCompletedWorkerFieldValue(enriched[key]),
+                      meta:
+                        rawCellMeta && typeof rawCellMeta === 'object'
+                          ? (rawCellMeta as {
+                              status?: string;
+                              completedAt?: number;
+                              staleAt?: number | null;
+                              staleAfterSeconds?: number | null;
+                            })
+                          : null,
+                      policy: cellPolicies?.[key],
                     });
-                    cellMetaPatch[key] = {
-                      status: 'completed',
-                      stage: key,
-                      runId: req.runId,
-                      completedAt,
-                      ...stalenessMeta,
-                    };
+                    const previousCell = previousCellFromValue({
+                      hasValue: isCompletedWorkerFieldValue(enriched[key]),
+                      value: enriched[key],
+                      meta:
+                        rawCellMeta && typeof rawCellMeta === 'object'
+                          ? (rawCellMeta as {
+                              status?: string;
+                              completedAt?: number;
+                              staleAt?: number | null;
+                              staleAfterSeconds?: number | null;
+                            })
+                          : null,
+                    });
+                    if (reuseDecision.action === 'reuse') {
+                      cellMetaPatch[key] = {
+                        status: 'cached',
+                        stage: key,
+                        reused: true,
+                        runId: req.runId,
+                      };
+                      continue;
+                    }
+                    const resolved = await executeWorkerStepResolver(
+                      value,
+                      enriched,
+                      rowCtx,
+                      absoluteIndex,
+                      previousCell,
+                      isWorkerStepProgram(value)
+                        ? {
+                            parentField: key,
+                            path: [],
+                            outputs: stepProgramOutputs,
+                          }
+                        : undefined,
+                    );
+                    enriched[key] = resolved.value;
+                    fieldOutputs[key] = resolved.value;
+                    if (resolved.status === 'skipped') {
+                      cellMetaPatch[key] = {
+                        status: 'skipped',
+                        stage: key,
+                        runId: req.runId,
+                      };
+                    } else {
+                      const completedAt = nowMs();
+                      const stalenessMeta = resolveCompletedCellStalenessMeta({
+                        policy: authoredCellPolicies?.[key],
+                        value: resolved.value,
+                        completedAt,
+                      });
+                      cellMetaPatch[key] = {
+                        status: 'completed',
+                        stage: key,
+                        runId: req.runId,
+                        completedAt,
+                        ...stalenessMeta,
+                      };
+                    }
+                    activeField = null;
                   }
-                }
-                for (const stepOutput of stepProgramOutputs) {
-                  enriched[stepOutput.columnName] = stepOutput.value;
-                  fieldOutputs[stepOutput.columnName] = stepOutput.value;
-                  generatedOutputFields.add(stepOutput.columnName);
-                  if (stepOutput.status === 'skipped') {
-                    cellMetaPatch[stepOutput.columnName] = {
-                      status: 'skipped',
-                      stage: stepOutput.stepId,
+                  for (const stepOutput of stepProgramOutputs) {
+                    enriched[stepOutput.columnName] = stepOutput.value;
+                    fieldOutputs[stepOutput.columnName] = stepOutput.value;
+                    generatedOutputFields.add(stepOutput.columnName);
+                    if (stepOutput.status === 'skipped') {
+                      cellMetaPatch[stepOutput.columnName] = {
+                        status: 'skipped',
+                        stage: stepOutput.stepId,
+                        runId: req.runId,
+                      };
+                    }
+                  }
+                  for (const waterfallOutput of waterfallOutputs) {
+                    const columnName =
+                      `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
+                      sqlishIdentifierPart(waterfallOutput.stepId);
+                    enriched[columnName] = waterfallOutput.value;
+                    generatedOutputFields.add(columnName);
+                  }
+                  executedCellMetaPatches[myIndex] =
+                    Object.keys(cellMetaPatch).length > 0
+                      ? cellMetaPatch
+                      : undefined;
+                  executedRows[myIndex] = enriched as T &
+                    Record<string, unknown>;
+                  completedExecutedRows += 1;
+                  reportChunkProgress(false);
+                } catch (rowError) {
+                  // Row failure isolation (the default): one row's
+                  // tool/provider error is recorded on that row and its
+                  // siblings continue. Abort/budget errors stay run-fatal,
+                  // and `onRowError: 'fail'` opts the map into fail-fast.
+                  if (
+                    failFastRowErrors ||
+                    isRowIsolationExemptError(rowError)
+                  ) {
+                    throw rowError;
+                  }
+                  const message = formatWorkerRowFailureMessage(rowError);
+                  if (activeField) {
+                    cellMetaPatch[activeField] = {
+                      status: 'failed',
+                      stage: activeField,
                       runId: req.runId,
+                      error: message,
                     };
                   }
+                  executedCellMetaPatches[myIndex] =
+                    Object.keys(cellMetaPatch).length > 0
+                      ? cellMetaPatch
+                      : undefined;
+                  // Keep the partially-enriched row so completed sibling
+                  // cells persist and replay free when the row re-executes.
+                  failedRowEntries[myIndex] = {
+                    row: enriched as T & Record<string, unknown>,
+                    error: message,
+                  };
+                  failedExecutedRows += 1;
+                  // Bounded per-chunk samples: every failure is persisted on
+                  // its row, but only the first few get a log line so a wide
+                  // outage cannot flood the Run Log Stream.
+                  if (failedExecutedRows <= MAP_ROW_FAILURE_SAMPLE_LIMIT) {
+                    emitEvent({
+                      type: 'log',
+                      level: 'warn',
+                      message:
+                        `Row ${absoluteIndex} of ctx.dataset("${name}") failed` +
+                        `${activeField ? ` at column "${activeField}"` : ''}: ${message} ` +
+                        '(row recorded as failed; sibling rows continue and the row re-executes on the next run)',
+                      ts: nowMs(),
+                    });
+                  }
+                  reportChunkProgress(false);
                 }
-                for (const waterfallOutput of waterfallOutputs) {
-                  const columnName =
-                    `${sqlishIdentifierPart(waterfallOutput.waterfallId)}__` +
-                    sqlishIdentifierPart(waterfallOutput.stepId);
-                  enriched[columnName] = waterfallOutput.value;
-                  generatedOutputFields.add(columnName);
-                }
-                executedCellMetaPatches[myIndex] =
-                  Object.keys(cellMetaPatch).length > 0
-                    ? cellMetaPatch
-                    : undefined;
-                executedRows[myIndex] = enriched as T & Record<string, unknown>;
-                completedExecutedRows += 1;
-                reportChunkProgress(false);
               } finally {
                 if (rowMarkedActive) {
                   activeExecutedRows = Math.max(0, activeExecutedRows - 1);
@@ -4053,7 +4160,24 @@ function createMinimalWorkerCtx(
               executedIndex: number;
             } => entry !== null,
           );
-        if (rowsToPersist.length === 0) {
+        const failedRowsToPersist = failedRowEntries
+          .map((failure, executedIndex) =>
+            failure
+              ? {
+                  failure,
+                  executedIndex,
+                }
+              : null,
+          )
+          .filter(
+            (
+              entry,
+            ): entry is {
+              failure: { row: T & Record<string, unknown>; error: string };
+              executedIndex: number;
+            } => entry !== null,
+          );
+        if (rowsToPersist.length === 0 && failedRowsToPersist.length === 0) {
           return;
         }
         await persistCompletedMapRows({
@@ -4061,16 +4185,34 @@ function createMinimalWorkerCtx(
           tableNamespace: name,
           outputFields,
           extraOutputFields: Array.from(generatedOutputFields),
-          rows: rowsToPersist.map(({ row, executedIndex }) => ({
-            ...row,
-            ...(executedCellMetaPatches[executedIndex]
-              ? {
-                  __deeplineCellMetaPatch:
-                    executedCellMetaPatches[executedIndex],
-                }
-              : {}),
-            __deeplineRowKey: uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
-          })),
+          rows: [
+            ...rowsToPersist.map(({ row, executedIndex }) => ({
+              ...row,
+              ...(executedCellMetaPatches[executedIndex]
+                ? {
+                    __deeplineCellMetaPatch:
+                      executedCellMetaPatches[executedIndex],
+                  }
+                : {}),
+              __deeplineRowKey:
+                uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
+            })),
+            // Failed rows persist as recoverable `_status='failed'` sheet
+            // rows: partial data + per-cell failure meta + the row error.
+            ...failedRowsToPersist.map(({ failure, executedIndex }) => ({
+              ...failure.row,
+              ...(executedCellMetaPatches[executedIndex]
+                ? {
+                    __deeplineCellMetaPatch:
+                      executedCellMetaPatches[executedIndex],
+                  }
+                : {}),
+              __deeplineRowKey:
+                uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
+              __deeplineRowStatus: 'failed',
+              __deeplineRowError: failure.error,
+            })),
+          ],
         });
       };
       const workersStartedAt = nowMs();
@@ -4167,9 +4309,11 @@ function createMinimalWorkerCtx(
         executedIndex < executedRows.length;
         executedIndex += 1
       ) {
-        const executedRow = executedRows[executedIndex]!;
+        const executedRow = executedRows[executedIndex];
         const key = uniqueRowsToExecuteEntries[executedIndex]!.rowKey;
-        if (key) resultByKey.set(key, executedRow);
+        // Failed rows have no executed result; they stay out of the map output
+        // dataset (their recoverable state lives in the runtime sheet).
+        if (key && executedRow) resultByKey.set(key, executedRow);
       }
       const out = chunkRows
         .map((_row, index) => {
@@ -4177,6 +4321,24 @@ function createMinimalWorkerCtx(
           return resultByKey.get(key);
         })
         .filter((row): row is T & Record<string, unknown> => Boolean(row));
+      const executedSuccessCount = Math.max(
+        0,
+        executedRows.length - failedExecutedRows,
+      );
+      const rowFailureSamples = failedRowEntries
+        .map((failure, executedIndex) =>
+          failure
+            ? {
+                rowKey: uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
+                error: failure.error,
+              }
+            : null,
+        )
+        .filter(
+          (sample): sample is { rowKey: string; error: string } =>
+            sample !== null,
+        )
+        .slice(0, MAP_ROW_FAILURE_SAMPLE_LIMIT);
       const publicOut = out.map((row) => publicCsvOutputRow(row));
       const hashStartedAt = nowMs();
       const hash = await hashJson(publicOut);
@@ -4196,7 +4358,8 @@ function createMinimalWorkerCtx(
           rowsRead: chunkRows.length,
           rowsWritten: out.length,
           rowsExecuted: executedRows.length,
-          rowsCached: Math.max(0, out.length - executedRows.length),
+          rowsFailed: failedExecutedRows,
+          rowsCached: Math.max(0, out.length - executedSuccessCount),
         },
       });
       return {
@@ -4206,10 +4369,12 @@ function createMinimalWorkerCtx(
         rowsRead: chunkRows.length,
         rowsWritten: out.length,
         rowsExecuted: executedRows.length,
-        rowsCached: Math.max(0, out.length - executedRows.length),
+        rowsCached: Math.max(0, out.length - executedSuccessCount),
         rowsDuplicateReused: duplicateInputReuseCount,
         rowsInserted,
         rowsSkipped,
+        rowsFailed: failedExecutedRows,
+        rowFailureSamples,
         outputDatasetId: `map:${name}`,
         hash,
         preview: toWorkflowSerializableValue(publicOut.slice(0, 5)),
@@ -4228,6 +4393,8 @@ function createMinimalWorkerCtx(
     let totalRowsDuplicateReused = 0;
     let totalRowsInserted = 0;
     let totalRowsSkipped = 0;
+    let totalRowsFailed = 0;
+    const totalRowFailureSamples: Array<{ rowKey: string; error: string }> = [];
     const runChunkStep = async (
       chunkRows: T[],
@@ -4255,22 +4422,35 @@ function createMinimalWorkerCtx(
     };
     const finalize = (totalRowsWritten: number) => {
+      const failureSampleSummary =
+        totalRowFailureSamples.length > 0
+          ? ` First error: ${totalRowFailureSamples[0]!.error}`
+          : '';
       const cacheSummary =
-        `Map completed: ${totalRowsWritten} results ` +
-        `(${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
-        `inserted=${totalRowsInserted} skipped=${totalRowsSkipped}`;
+        totalRowsFailed > 0
+          ? `Map completed with partial failures: ${totalRowsWritten} succeeded, ` +
+            `${totalRowsFailed} failed (${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
+            `inserted=${totalRowsInserted} skipped=${totalRowsSkipped}. ` +
+            `Failed rows are persisted with their errors and re-execute on the next run.${failureSampleSummary}`
+          : `Map completed: ${totalRowsWritten} results ` +
+            `(${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
+            `inserted=${totalRowsInserted} skipped=${totalRowsSkipped}`;
       const completedAt = nowMs();
       callbacks?.onMapCompleted?.(mapNodeId, completedAt);
       void updateMapProgress({
         completed: totalRowsWritten,
-        total: totalRowsWritten,
+        total: totalRowsWritten + totalRowsFailed,
+        failed: totalRowsFailed,
         completedAt,
         updatedAt: completedAt,
-        message: formatMapProgressMessage(totalRowsWritten, totalRowsWritten),
+        message:
+          totalRowsFailed > 0
+            ? `${totalRowsWritten.toLocaleString()} succeeded, ${totalRowsFailed.toLocaleString()} failed`
+            : formatMapProgressMessage(totalRowsWritten, totalRowsWritten),
       });
       emitEvent({
         type: 'log',
-        level: 'info',
+        level: totalRowsFailed > 0 ? 'warn' : 'info',
         message: cacheSummary,
         ts: nowMs(),
       });
@@ -4299,12 +4479,12 @@ function createMinimalWorkerCtx(
           recordRunnerPerfTrace({ req, phase, ms, extra }),
         nowMs,
         workProgress: {
-          total: totalRowsWritten,
+          total: totalRowsWritten + totalRowsFailed,
           executed: totalRowsExecuted,
           reused: totalRowsCached,
           skipped: totalRowsCached,
           pending: 0,
-          failed: 0,
+          failed: totalRowsFailed,
           ...(totalRowsDuplicateReused > 0
             ? { duplicates: { exact: totalRowsDuplicateReused } }
             : {}),
@@ -4325,9 +4505,17 @@ function createMinimalWorkerCtx(
       totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
       totalRowsInserted += chunkResult.rowsInserted;
       totalRowsSkipped += chunkResult.rowsSkipped;
+      totalRowsFailed += chunkResult.rowsFailed ?? 0;
+      for (const sample of chunkResult.rowFailureSamples ?? []) {
+        if (totalRowFailureSamples.length >= MAP_ROW_FAILURE_SAMPLE_LIMIT) {
+          break;
+        }
+        totalRowFailureSamples.push(sample);
+      }
       await updateMapProgress({
         completed: totalRowsWritten,
         total: rowCountHint ?? undefined,
+        ...(totalRowsFailed > 0 ? { failed: totalRowsFailed } : {}),
         message: formatMapProgressMessage(
           totalRowsWritten,
           rowCountHint ?? undefined,
@@ -4356,6 +4544,18 @@ function createMinimalWorkerCtx(
       chunkStart += chunkRows.length;
       chunkIndex += 1;
     }
+    if (totalRowsFailed > 0 && totalRowsWritten === 0) {
+      // Every row failed: this is a systemic failure (provider outage, broken
+      // resolver, exhausted credits), not a partial one. Isolating it would
+      // silently complete the run with an empty dataset. Fail loudly — the
+      // failed rows are persisted with their errors and re-execute on re-run.
+      const firstError = totalRowFailureSamples[0]?.error ?? 'unknown error';
+      throw new Error(
+        `ctx.dataset("${name}") failed for all ${totalRowsFailed} executed rows. ` +
+          `First error: ${firstError} ` +
+          `(rows are persisted with per-row errors; fix the cause and re-run to resume)`,
+      );
+    }
     const dataset = finalize(totalRowsWritten);
     recordRunnerPerfTrace({
       req,

package/dist/repo/apps/play-runner-workers/src/runtime/row-isolation.ts ADDED Viewed

@@ -0,0 +1,53 @@
+import {
+  isHardBillingToolHttpError,
+  isRateLimitToolHttpError,
+} from './tool-http-errors';
+/**
+ * Thrown by `assertNotAborted` and surfaced through ctx.step / ctx.sleep / map
+ * processing when the workflow has been terminated externally. Cooperatively
+ * cancels in-flight user code: the play must check `ctx.signal.aborted` (or
+ * await one of the abort-aware ctx methods) before doing more work.
+ *
+ * The instance `name` is fixed to `'WorkflowAbort'`, and the message defaults
+ * to `'Play run cancelled.'` when the caller does not supply one.
+ */
+export class WorkflowAbortError extends Error {
+  override readonly name = 'WorkflowAbort';
+  constructor(message = 'Play run cancelled.') {
+    super(message);
+  }
+}
+export function isAbortLikeError(error: unknown): boolean { // true when `error` looks like a cancellation/abort
+  if (!error) return false; // null, undefined and other falsy thrown values never count as aborts
+  if (error instanceof WorkflowAbortError) return true;
+  if (error instanceof Error) {
+    if (error.name === 'WorkflowAbort' || error.name === 'AbortError') // match by name as well as by class
+      return true;
+    return /\b(cancell?ed|aborted|terminate[d]?)\b/i.test(error.message); // fallback: case-insensitive whole-word match on the message text
+  }
+  return false; // truthy non-Error values (e.g. strings, plain objects) are not treated as aborts
+}
+/**
+ * Errors that must stay run-fatal even under the default map row failure
+ * isolation:
+ *
+ * - Cancellation/abort must stop the run.
+ * - Governor budget exhaustion is a run-level invariant — isolating it per
+ *   row would silently convert "this run exceeded its execution budget" into
+ *   thousands of identical row failures.
+ * - Rate-limit pushback (a tool call that still got HTTP 429 after the
+ *   in-process retry budget) is run-level throughput pressure that applies to
+ *   every row equally, not a row defect. Isolating it silently drops healthy
+ *   rows from the output dataset whenever a provider throttles — the durable
+ *   chunk step's retries (and, if the storm persists, a loud run failure with
+ *   recoverable persisted rows) are the correct response.
+ * - Hard billing failures (billing cap / insufficient credits) promise "run
+ *   halted before marking remaining rows processed"; isolating them would
+ *   complete the run while silently failing every remaining row.
+ *
+ * @param error - The caught value to classify; may be of any type.
+ * @returns `true` when `error` is abort-like (per `isAbortLikeError`), is an
+ *   `Error` whose `name` is `'GovernorBudgetError'`, or is a rate-limit or
+ *   hard-billing tool HTTP error; `false` otherwise.
+ */
+export function isRowIsolationExemptError(error: unknown): boolean {
+  if (isAbortLikeError(error)) return true;
+  if (error instanceof Error && error.name === 'GovernorBudgetError')
+    return true;
+  return isRateLimitToolHttpError(error) || isHardBillingToolHttpError(error);
+}

package/dist/repo/apps/play-runner-workers/src/runtime/tool-http-errors.ts CHANGED Viewed

@@ -1,10 +1,17 @@
 export class ToolHttpError extends Error {
   readonly billing: Record<string, unknown> | null;
+  /** HTTP status of the failed tool-execute response (e.g. 429, 502). */
+  readonly status: number;
-  constructor(message: string, billing: Record<string, unknown> | null) {
+  constructor(
+    message: string,
+    billing: Record<string, unknown> | null,
+    status: number,
+  ) {
     super(message);
     this.name = 'ToolHttpError';
     this.billing = billing;
+    this.status = status;
   }
 }
@@ -200,6 +207,7 @@ export function normalizeToolHttpErrorMessage(input: {
         },
       )}`,
       billing,
+      input.status,
     );
   }
   const hardBillingPayload = isHardBillingFailurePayload(billing)
@@ -217,6 +225,7 @@ export function normalizeToolHttpErrorMessage(input: {
         maxAttempts: input.maxAttempts,
       }),
       hardBillingPayload,
+      input.status,
     );
   }
   return new ToolHttpError(
@@ -227,6 +236,7 @@ export function normalizeToolHttpErrorMessage(input: {
       },
     )}`,
     billing,
+    input.status,
   );
 }
@@ -241,3 +251,12 @@ export function isHardBillingToolHttpError(error: unknown): boolean {
     error instanceof ToolHttpError && isHardBillingFailurePayload(error.billing)
   );
 }
+/**
+ * A tool call that ultimately failed with HTTP 429 — provider or
+ * Deepline-internal rate-limit pushback that survived the in-process retry
+ * budget. This is run-level throughput pressure, never a row-specific defect.
+ *
+ * @param error - The caught value to classify; may be of any type.
+ * @returns `true` only when `error` is a `ToolHttpError` instance whose
+ *   `status` is exactly 429; `false` for every other value.
+ */
+export function isRateLimitToolHttpError(error: unknown): boolean {
+  return error instanceof ToolHttpError && error.status === 429;
+}

package/dist/repo/sdk/src/release.ts CHANGED Viewed

@@ -53,10 +53,11 @@ export const SDK_RELEASE = {
   // 0.1.94 is claimed by PR #1527 — this watch-render fix ships as 0.1.95.
   // 0.1.98 ships the duplicate-browser-tab fix (default-browser detection).
   // 0.1.99 ships prebuilt job-change source-column preservation and validation fixes.
-  version: '0.1.99',
+  // 0.1.101 ships retryable play artifact publish failures and CI retry hardening.
+  version: '0.1.101',
   apiContract: '2026-06-dataset-column-cell-stale-hard-cutover',
   supportPolicy: {
-    latest: '0.1.99',
+    latest: '0.1.101',
     minimumSupported: '0.1.53',
     deprecatedBelow: '0.1.53',
   },

package/dist/repo/shared_libs/play-runtime/batch-runtime.ts CHANGED Viewed

@@ -3,9 +3,16 @@ import type { AnyBatchOperationStrategy } from './batching-types';
 export interface ChunkExecutionResult<TRequest, TResult> {
   request: TRequest; // the request this entry reports on
   result: TResult | null; // null when there is no result to report, e.g. for a rejected request
+  /**
+   * Present when this request's execution rejected. The request failed but
+   * its siblings in the chunk kept their results — one provider hiccup must
+   * stay a per-request failure, not a run-level abort that discards billed
+   * work (rows already persisted by completed calls stay recoverable).
+   *
+   * `executeChunkedRequests` fills this with the output of
+   * `formatChunkExecutionError` for the rejection reason and pairs it with
+   * `result: null`.
+   */
+  error?: string;
 }
-function formatChunkExecutionError(error: unknown): string {
+export function formatChunkExecutionError(error: unknown): string {
   if (error instanceof Error) {
     return error.message;
   }
@@ -23,6 +30,13 @@ export async function executeChunkedRequests<TRequest, TResult>(input: {
   requests: TRequest[];
   batchSize: number;
   execute: (request: TRequest) => Promise<TResult>;
+  /**
+   * Loud per-request failure hook. A rejected request is recorded as a
+   * `result: null` entry with `error` set so the row-level state can carry
+   * the provider error; it must never abort the chunk, the sibling requests,
+   * or the run. Callers use this to log and persist the failure.
+   */
+  onRequestError?: (request: TRequest, error: unknown) => void;
   onChunkComplete?: (
     results: Array<ChunkExecutionResult<TRequest, TResult>>,
   ) => void | Promise<void>;
@@ -35,21 +49,20 @@ export async function executeChunkedRequests<TRequest, TResult>(input: {
       chunk.map((request) => input.execute(request)),
     );
-    const rejected = settled.find(
-      (outcome): outcome is PromiseRejectedResult =>
-        outcome.status === 'rejected',
-    );
-    if (rejected) {
-      throw new Error(
-        `Play batch request failed: ${formatChunkExecutionError(rejected.reason)}`,
-        { cause: rejected.reason },
-      );
-    }
     for (let index = 0; index < chunk.length; index += 1) {
-      const outcome = settled[index] as PromiseFulfilledResult<TResult>;
+      const request = chunk[index]!;
+      const outcome = settled[index]!;
+      if (outcome.status === 'rejected') {
+        input.onRequestError?.(request, outcome.reason);
+        results.push({
+          request,
+          result: null,
+          error: formatChunkExecutionError(outcome.reason),
+        });
+        continue;
+      }
       results.push({
-        request: chunk[index]!,
+        request,
         result: outcome.value,
       });
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "deepline",
-  "version": "0.1.99",
+  "version": "0.1.101",
   "description": "Deepline SDK + CLI — B2B data enrichment powered by durable cloud execution",
   "license": "MIT",
   "repository": {