npm - deepline - Versions diffs - 0.1.150 → 0.1.152 - Mend

deepline 0.1.150 → 0.1.152

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/dist/bundling-sources/apps/play-runner-workers/src/entry.ts CHANGED

@@ -55,13 +55,17 @@ import {
 import {
   CTX_FETCH_EGRESS_PROVIDER,
   CTX_FETCH_EGRESS_TOOL_ID,
-  resolveBuiltinPacing,
 } from '../../../shared_libs/play-runtime/builtin-pacing';
 import {
   CoordinatorRateStateBackend,
   type CoordinatorRatePort,
 } from '../../../shared_libs/play-runtime/governor/coordinator-rate-state-backend';
 import type { PacingRule } from '../../../shared_libs/play-runtime/governor/rate-state-backend';
+import {
+  pacingPolicyForTool,
+  pacingPolicyFromUnknownQueueHints,
+  type ResolvedPacingPolicy,
+} from '../../../shared_libs/play-runtime/pacing';
 import {
   awaitChildTerminal,
   type ChildPlayTerminalWaitResult,
@@ -83,7 +87,8 @@ import {
   TOOL_EXECUTE_RATE_LIMIT_MAX_ATTEMPTS,
   TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS,
   TOOL_EXECUTE_TRANSPORT_RETRY_DELAY_MS,
-  decideToolExecuteHttpRetry,
+  classifyToolExecuteHttpFailure,
+  createToolExecuteHttpFailureAttemptTracker,
 } from '../../../shared_libs/play-runtime/tool-execute-retry-policy';
 import type { PlayCallGovernanceSnapshot } from '../../../shared_libs/play-runtime/scheduler-backend';
 import type { PreloadedRuntimeDbSession } from '../../../shared_libs/play-runtime/db-session';
@@ -161,6 +166,15 @@ import {
   publicCsvStorageRow,
   runtimeCsvStorageRow,
 } from './runtime/csv-rows';
+import {
+  completedMapRowOutcome,
+  failedMapRowOutcome,
+  mapRowOutcomeRuntimeRow,
+  mapRowOutcomeRuntimeFields,
+  resolveMapRowOutcomeKey,
+  stripMapRowOutcomeRuntimeFields,
+} from '../../../shared_libs/play-runtime/map-row-outcome';
+import { runtimeSheetSessionScope } from '../../../shared_libs/play-runtime/runtime-sheet-session';
 import { chooseWorkerMapRowsPerChunk } from './runtime/map-chunk-plan';
 import {
   applyCsvRenameProjection,
@@ -194,11 +208,7 @@ import type {
   LiveNodeProgressMap,
   LiveNodeProgressSnapshot,
 } from './runtime/live-progress';
-import {
-  extractErrorBilling,
-  isHardBillingToolHttpError,
-  normalizeToolHttpErrorMessage,
-} from './runtime/tool-http-errors';
+import { extractErrorBilling } from './runtime/tool-http-errors';
 import {
   WorkflowAbortError,
   isAbortLikeError,
@@ -326,6 +336,7 @@ type WorkerEnv = {
     runtimeApiCall(input: {
       executorToken: string;
       path: string;
+      method?: string;
       body: unknown;
       headers?: Record<string, string>;
       timeoutMs?: number;
@@ -581,6 +592,7 @@ async function callRuntimeApiRpcBinding(
   const result = await binding.runtimeApiCall({
     executorToken: authorization.replace(/^Bearer\s+/i, '').trim(),
     path: input.path,
+    method: init.method ?? 'POST',
     body: rawBody ? JSON.parse(rawBody) : {},
     headers,
     timeoutMs: input.timeoutMs,
@@ -1156,6 +1168,7 @@ async function executeTool(
   workflowStep?: WorkflowStep,
   onProviderBackpressure?: (retryAfterMs: number) => void,
   onRetryAttempt?: () => void,
+  transientHttpRetrySafe = false,
 ): Promise<ToolExecuteResult> {
   if (args.toolId === 'test_wait_for_event' && workflowStep) {
     const result = await waitForSyntheticIntegrationEvent(
@@ -1170,7 +1183,13 @@ async function executeTool(
   // service bindings, NOT through HTTP from this worker. Removing the
   // dispatcher-side coordinatorUrl plumbing intentionally turns the old
   // HTTP-based dedup helpers into dead code.
-  return callToolDirect(req, args, onProviderBackpressure, onRetryAttempt);
+  return callToolDirect(
+    req,
+    args,
+    onProviderBackpressure,
+    onRetryAttempt,
+    transientHttpRetrySafe,
+  );
 }
 async function executeToolWithLifecycle(
@@ -1180,6 +1199,7 @@ async function executeToolWithLifecycle(
   callbacks: WorkerCtxCallbacks | undefined,
   onProviderBackpressure?: (retryAfterMs: number) => void,
   onRetryAttempt?: () => void,
+  transientHttpRetrySafe = false,
 ): Promise<ToolExecuteResult> {
   callbacks?.onToolCalled?.(args.toolId, nowMs());
   try {
@@ -1189,6 +1209,7 @@ async function executeToolWithLifecycle(
       workflowStep,
       onProviderBackpressure,
       onRetryAttempt,
+      transientHttpRetrySafe,
     );
   } catch (error) {
     callbacks?.onToolFailed?.(args.toolId, nowMs());
@@ -1322,16 +1343,17 @@ async function callToolDirect(
   // 429 / transient-5xx retry). Without this the worker substrate would leave
   // policy.budgets.maxRetryCount effectively unenforced.
   onRetryAttempt?: () => void,
+  transientHttpRetrySafe = false,
 ): Promise<ToolExecuteResult> {
   const { id, toolId, input } = args;
   const path = `/api/v2/integrations/${encodeURIComponent(toolId)}/execute`;
   let lastError: Error | null = null;
+  const httpFailureAttempts = createToolExecuteHttpFailureAttemptTracker();
+  let requestAttempt = 0;
+  let transportAttempt = 0;
-  for (
-    let attempt = 1;
-    attempt <= TOOL_EXECUTE_RATE_LIMIT_MAX_ATTEMPTS;
-    attempt += 1
-  ) {
+  while (true) {
+    requestAttempt += 1;
     let res: Response;
     try {
       res = await fetchRuntimeApi(req.baseUrl, path, {
@@ -1339,7 +1361,7 @@ async function callToolDirect(
         headers: {
           'content-type': 'application/json',
           authorization: `Bearer ${req.executorToken}`,
-          'x-deepline-request-id': `${req.runId}:${toolId}:${id}:attempt:${attempt}`,
+          'x-deepline-request-id': `${req.runId}:${toolId}:${id}:attempt:${requestAttempt}`,
           [EXECUTE_RESPONSE_CONTRACT_HEADER]: V2_EXECUTE_RESPONSE_CONTRACT,
           [EXECUTE_TOOL_METADATA_HEADER]: 'true',
         },
@@ -1349,20 +1371,21 @@ async function callToolDirect(
         }),
       });
     } catch (error) {
+      transportAttempt += 1;
       const message = error instanceof Error ? error.message : String(error);
       lastError = new Error(
-        `Tool ${toolId} transport failed calling ${path} for run ${req.runId} on attempt ${attempt}/${TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS}: ${message}`,
+        `Tool ${toolId} transport failed calling ${path} for run ${req.runId} on attempt ${transportAttempt}/${TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS}: ${message}`,
       );
       if (
-        attempt >= TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS ||
+        transportAttempt >= TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS ||
         !isRetryableRuntimeApiError(error)
       ) {
         throw lastError;
       }
       onRetryAttempt?.();
-      const delayMs = TOOL_EXECUTE_TRANSPORT_RETRY_DELAY_MS * attempt;
+      const delayMs = TOOL_EXECUTE_TRANSPORT_RETRY_DELAY_MS * transportAttempt;
       console.warn(
-        `[deepline-run:${req.runId}] tool transport retry tool=${toolId} path=${path} attempt=${attempt}/${TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS} retryAfterMs=${delayMs} error=${redactSecretsFromLogString(message)}`,
+        `[deepline-run:${req.runId}] tool transport retry tool=${toolId} path=${path} attempt=${transportAttempt}/${TOOL_EXECUTE_TRANSPORT_MAX_ATTEMPTS} retryAfterMs=${delayMs} error=${redactSecretsFromLogString(message)}`,
       );
       await sleepWorkerMs(delayMs);
       continue;
@@ -1379,51 +1402,34 @@ async function callToolDirect(
     }
     const text = await res.text().catch(() => '');
-    const isRateLimited = res.status === 429;
-    const initialRetryDecision = decideToolExecuteHttpRetry({
+    const httpFailureAttempt = httpFailureAttempts.next({
       toolId,
       status: res.status,
+      transientHttpRetrySafe,
     });
-    lastError = normalizeToolHttpErrorMessage({
+    const failure = classifyToolExecuteHttpFailure({
       toolId,
       status: res.status,
-      attempt,
-      maxAttempts: initialRetryDecision.attemptCap,
+      attempt: httpFailureAttempt,
       bodyText: text,
+      retryAfterHeader: res.headers.get('retry-after'),
+      transientHttpRetrySafe,
     });
-    // Rate-limit pushback gets the larger 429-specific retry budget, unless the
-    // current response body is a hard Deepline billing denial.
-    const retryDecision = decideToolExecuteHttpRetry({
-      toolId,
-      status: res.status,
-      hardBillingFailure: isHardBillingToolHttpError(lastError),
-    });
-    const attemptCap = retryDecision.attemptCap;
-    const retryAfterSeconds = Number(res.headers.get('retry-after'));
-    const retryAfterMs =
-      Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0
-        ? Math.ceil(retryAfterSeconds * 1000)
-        : 0;
-    if (isRateLimited) {
+    lastError = failure.error;
+    if (failure.backpressureDelayMs !== null) {
       // Feed the provider's backpressure into the shared pacer even on the
       // final attempt so the (org, provider) bucket backs off across isolates.
-      onProviderBackpressure?.(retryAfterMs > 0 ? retryAfterMs : 1_000);
+      onProviderBackpressure?.(failure.backpressureDelayMs);
     }
-    if (!retryDecision.retryable || attempt >= attemptCap) {
+    if (!failure.shouldRetry) {
       throw lastError;
     }
     // Charge the retry budget per attempt, matching the cjs runner's
     // chargeBudget('retry') on every 429 / retryable-5xx retry.
-    onRetryAttempt?.();
-    // 429 delays escalate per attempt (still honoring a larger retry-after)
-    // so sustained throttling spaces calls out instead of hammering the
-    // limiter with fixed 1s retries.
-    const delayMs = isRateLimited
-      ? Math.min(5_000, Math.max(retryAfterMs, 1_000 * attempt))
-      : retryAfterMs > 0
-        ? Math.min(5_000, retryAfterMs)
-        : 1_000;
-    await new Promise((resolve) => setTimeout(resolve, delayMs));
+    if (failure.chargeRetryBudget) {
+      onRetryAttempt?.();
+    }
+    await sleepWorkerMs(failure.retryDelayMs);
   }
   throw lastError ?? new Error(`tool ${toolId} failed before execution.`);
@@ -1728,6 +1734,9 @@ class WorkerToolBatchScheduler {
       const groupStartedAt = nowMs();
       await Promise.all(
         requests.map(async (request) => {
+          const toolContract = await this.resolvePacing(toolId).catch(
+            () => null,
+          );
           // Each unbatched provider call takes its own tool slot: the Governor
           // charges tool budget, holds a global tool-concurrency slot, and
           // applies per-(org,provider) pacing before the call runs.
@@ -1743,6 +1752,7 @@ class WorkerToolBatchScheduler {
                 this.callbacks,
                 (retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
                 () => this.governor.chargeBudget('retry'),
+                toolContract?.retrySafeTransientHttp === true,
               ),
             );
           } catch (error) {
@@ -1775,6 +1785,7 @@ class WorkerToolBatchScheduler {
       abortSignal: this.abortSignal,
       reportBackpressure: (retryAfterMs) =>
         this.reportBackpressure(toolId, retryAfterMs),
+      resolveToolContract: this.resolvePacing,
       onRequestsSettled: this.onRequestsSettled,
       callbacks: this.callbacks,
     });
@@ -1810,6 +1821,7 @@ async function executeBatchedWorkerToolGroup(input: {
   suggestedParallelism: number;
   abortSignal?: AbortSignal;
   reportBackpressure: (retryAfterMs: number) => void;
+  resolveToolContract: WorkerPacingResolver;
   onRequestsSettled?: (count: number) => void;
   callbacks?: WorkerCtxCallbacks;
 }): Promise<void> {
@@ -1840,6 +1852,9 @@ async function executeBatchedWorkerToolGroup(input: {
       Math.min(input.suggestedParallelism, compiledBatches.length || 1),
     ),
     execute: async (batch) => {
+      const toolContract = await input
+        .resolveToolContract(batch.batchOperation)
+        .catch(() => null);
       // One provider call per batch → one tool slot (budget + global
       // concurrency + per-(org,provider) pacing) around the whole batch.
       const slot = await input.governor.acquireToolSlot(batch.batchOperation, {
@@ -1857,6 +1872,7 @@ async function executeBatchedWorkerToolGroup(input: {
           undefined,
           input.reportBackpressure,
           () => input.governor.chargeBudget('retry'),
+          toolContract?.retrySafeTransientHttp === true,
         );
       } catch (error) {
         input.callbacks?.onToolFailed?.(batch.batchOperation, nowMs());
@@ -2246,7 +2262,11 @@ async function executeWorkerWaterfall(
   opts?: WorkerWaterfallOptions,
   callbacks?: WorkerCtxCallbacks,
   workflowStep?: WorkflowStep,
+  resolveToolContract?: WorkerPacingResolver,
 ): Promise<unknown | null> {
+  const retrySafeTransientHttp = async (toolId: string): Promise<boolean> =>
+    (await resolveToolContract?.(toolId).catch(() => null))
+      ?.retrySafeTransientHttp === true;
   // Inline-spec form
   if (typeof toolNameOrSpec === 'object' && toolNameOrSpec) {
     const spec = toolNameOrSpec;
@@ -2257,13 +2277,18 @@ async function executeWorkerWaterfall(
         if (isWorkerInlineCodeStep(step)) {
           result = await step.run(input, {
             tools: {
-              execute: async (request: unknown) =>
-                await executeToolWithLifecycle(
+              execute: async (request: unknown) => {
+                const args = normalizeToolExecuteArgs(request);
+                return await executeToolWithLifecycle(
                   req,
-                  normalizeToolExecuteArgs(request),
+                  args,
                   workflowStep,
                   callbacks,
-                ),
+                  undefined,
+                  undefined,
+                  await retrySafeTransientHttp(args.toolId),
+                );
+              },
             },
           });
         } else {
@@ -2276,6 +2301,9 @@ async function executeWorkerWaterfall(
             },
             workflowStep,
             callbacks,
+            undefined,
+            undefined,
+            await retrySafeTransientHttp(step.toolId),
           );
         }
       } catch {
@@ -2367,6 +2395,9 @@ async function executeWorkerWaterfall(
         { id: toolName, toolId: toolName, input },
         workflowStep,
         callbacks,
+        undefined,
+        undefined,
+        await retrySafeTransientHttp(toolName),
       );
     } catch {
       return null;
@@ -2384,6 +2415,9 @@ async function executeWorkerWaterfall(
         },
         workflowStep,
         callbacks,
+        undefined,
+        undefined,
+        await retrySafeTransientHttp(toolName),
       );
       if (resultHasContent(result)) {
         recorder.push({
@@ -3196,22 +3230,18 @@ async function persistCompletedMapRows(input: {
       (field) => !input.outputFields.includes(field),
     ),
   ];
+  const sessionScope = runtimeSheetSessionScope(input.req);
+  const rows = input.rows.map((row) => publicCsvStorageRow(row));
   await harnessPersistCompletedSheetRows({
-    baseUrl: input.req.baseUrl,
-    executorToken: input.req.executorToken,
-    orgId: input.req.orgId,
-    preloadedDbSessions: input.req.preloadedDbSessions ?? null,
-    playName: input.req.playName,
+    ...sessionScope,
     tableNamespace: input.tableNamespace,
     sheetContract: augmentSheetContractWithDatasetFields({
       contract: requireSheetContract(input.req, input.tableNamespace),
-      rows: input.rows.map((row) => publicCsvStorageRow(row)),
+      rows,
       outputFields,
     }),
-    rows: input.rows.map((row) => publicCsvStorageRow(row)),
+    rows,
     outputFields,
-    runId: input.req.runId,
-    userEmail: input.req.userEmail,
   });
 }
@@ -3231,22 +3261,18 @@ async function prepareMapRows(input: {
   if (input.rows.length === 0) {
     return { inserted: 0, skipped: 0, pendingRows: [], completedRows: [] };
   }
+  const sessionScope = runtimeSheetSessionScope(input.req);
+  const rows = input.rows.map((row) => runtimeCsvStorageRow(row));
   const result = await harnessStartSheetDataset({
-    baseUrl: input.req.baseUrl,
-    executorToken: input.req.executorToken,
-    orgId: input.req.orgId,
-    preloadedDbSessions: input.req.preloadedDbSessions ?? null,
-    playName: input.req.playName,
+    ...sessionScope,
     tableNamespace: input.tableNamespace,
     sheetContract: augmentSheetContractWithDatasetFields({
       contract: requireSheetContract(input.req, input.tableNamespace),
-      rows: input.rows.map((row) => runtimeCsvStorageRow(row)),
+      rows,
       outputFields: input.outputFields,
     }),
-    rows: input.rows.map((row) => runtimeCsvStorageRow(row)),
-    runId: input.req.runId,
+    rows,
     inputOffset: input.inputOffset,
-    userEmail: input.req.userEmail,
     cellPolicies: input.cellPolicies,
   });
   for (const timing of result.timings ?? []) {
@@ -3418,18 +3444,25 @@ function createCoordinatorRatePort(req: RunRequest): CoordinatorRatePort {
  */
 type WorkerPacingResolver = (
   toolId: string,
-) => Promise<{ provider: string; rules: PacingRule[] } | null>;
+) => Promise<
+  (ResolvedPacingPolicy & { retrySafeTransientHttp: boolean }) | null
+>;
 function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
   const cache = new Map<
     string,
-    Promise<{ provider: string; rules: PacingRule[] } | null>
+    Promise<(ResolvedPacingPolicy & { retrySafeTransientHttp: boolean }) | null>
   >();
   return (toolId: string) => {
     const normalized = String(toolId || '').trim();
     if (!normalized) return Promise.resolve(null);
-    const builtin = resolveBuiltinPacing(normalized);
-    if (builtin) return Promise.resolve(builtin);
+    const builtin = pacingPolicyForTool(normalized, []);
+    if (builtin) {
+      return Promise.resolve({
+        ...builtin,
+        retrySafeTransientHttp: false,
+      });
+    }
     const cached = cache.get(normalized);
     if (cached) return cached;
     const promise = (async () => {
@@ -3445,37 +3478,26 @@ function createWorkerPacingResolver(req: RunRequest): WorkerPacingResolver {
       const body = (await res.json().catch(() => null)) as {
         provider?: unknown;
         queueHints?: unknown;
+        retry?: unknown;
       } | null;
       if (!body) return null;
-      const provider =
-        typeof body.provider === 'string' && body.provider.trim()
-          ? body.provider.trim()
-          : null;
-      if (!provider || !Array.isArray(body.queueHints)) return null;
-      const rules: PacingRule[] = body.queueHints.flatMap((hint) => {
-        if (!hint || typeof hint !== 'object') return [];
-        const record = hint as Record<string, unknown>;
-        if (
-          typeof record.ruleId !== 'string' ||
-          typeof record.requestsPerWindow !== 'number' ||
-          typeof record.windowMs !== 'number'
-        ) {
-          return [];
-        }
-        return [
-          {
-            ruleId: record.ruleId,
-            requestsPerWindow: record.requestsPerWindow,
-            windowMs: record.windowMs,
-            maxConcurrency:
-              typeof record.maxConcurrency === 'number'
-                ? record.maxConcurrency
-                : null,
-          } satisfies PacingRule,
-        ];
-      });
-      if (rules.length === 0) return null;
-      return { provider, rules };
+      const pacing = pacingPolicyFromUnknownQueueHints(body.queueHints);
+      const retry =
+        body.retry &&
+        typeof body.retry === 'object' &&
+        !Array.isArray(body.retry)
+          ? (body.retry as Record<string, unknown>)
+          : {};
+      return {
+        ...(pacing ?? {
+          provider:
+            typeof body.provider === 'string' && body.provider.trim()
+              ? body.provider.trim()
+              : '',
+          rules: [],
+        }),
+        retrySafeTransientHttp: retry.retrySafeTransientHttp === true,
+      };
     })();
     cache.set(normalized, promise);
     return promise;
@@ -3930,7 +3952,7 @@ function createMinimalWorkerCtx(
         cellPolicies,
         rows: chunkEntries.map(({ row, rowKey }) => ({
           ...row,
-          __deeplineRowKey: rowKey,
+          ...mapRowOutcomeRuntimeFields({ key: rowKey }),
         })),
         inputOffset: baseOffset + chunkStart,
       });
@@ -3969,9 +3991,8 @@ function createMinimalWorkerCtx(
       const preparedKeys = new Set<string>();
       for (const row of prepared.pendingRows) {
         const key =
-          typeof row.__deeplineRowKey === 'string'
-            ? row.__deeplineRowKey
-            : derivePlayRowIdentity(publicCsvInputRow(row), name);
+          resolveMapRowOutcomeKey(row) ??
+          derivePlayRowIdentity(publicCsvInputRow(row), name);
         if (key) {
           pendingKeys.add(key);
           pendingRowsByKey.set(key, row);
@@ -3980,9 +4001,8 @@ function createMinimalWorkerCtx(
       }
       for (const row of prepared.completedRows) {
         const key =
-          typeof row.__deeplineRowKey === 'string'
-            ? row.__deeplineRowKey
-            : derivePlayRowIdentity(publicCsvInputRow(row), name);
+          resolveMapRowOutcomeKey(row) ??
+          derivePlayRowIdentity(publicCsvInputRow(row), name);
         if (key) {
           completedKeys.add(key);
           preparedKeys.add(key);
@@ -4168,22 +4188,10 @@ function createMinimalWorkerCtx(
               executedIndex: number;
             } => entry !== null,
           );
-        // Under the default isolation, every failed row persists as a
-        // recoverable `_status='failed'` row (it re-executes free next run).
-        // Under `onRowError: 'fail'` the run dies, so a failed row's partial
-        // data is persisted ONLY as a last-resort recovery: when this chunk has
-        // no other recoverable rows (no successful executed rows and no
-        // already-completed rows). That keeps a partial fail-fast run's export
-        // to the rows that fully committed before the failure, while an
-        // all-rows-failed fail-fast run still exposes the persisted partial
-        // cells instead of advertising an empty, unrecoverable dataset.
-        const failedRowsToPersist =
-          failFastRowErrors &&
-          (rowsToPersist.length > 0 ||
-            persistedExecutedIndexes.size > 0 ||
-            prepared.completedRows.length > 0)
-            ? []
-            : allFailedRowsToPersist;
+        // Failed rows persist as recoverable `_status='failed'` rows in both
+        // default row isolation and fail-fast mode. A fail-fast run still dies,
+        // but export/retry keeps cells completed before the failing column.
+        const failedRowsToPersist = allFailedRowsToPersist;
         if (rowsToPersist.length === 0 && failedRowsToPersist.length === 0) {
           return;
         }
@@ -4193,32 +4201,27 @@ function createMinimalWorkerCtx(
           outputFields,
           extraOutputFields: Array.from(generatedOutputFields),
           rows: [
-            ...rowsToPersist.map(({ row, executedIndex }) => ({
-              ...row,
-              ...(executedCellMetaPatches[executedIndex]
-                ? {
-                    __deeplineCellMetaPatch:
-                      executedCellMetaPatches[executedIndex],
-                  }
-                : {}),
-              __deeplineRowKey:
-                uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
-            })),
+            ...rowsToPersist.map(({ row, executedIndex }) =>
+              mapRowOutcomeRuntimeRow(
+                completedMapRowOutcome({
+                  key: uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
+                  data: row,
+                  cellMetaPatch: executedCellMetaPatches[executedIndex],
+                }),
+              ),
+            ),
             // Failed rows persist as recoverable `_status='failed'` sheet
             // rows: partial data + per-cell failure meta + the row error.
-            ...failedRowsToPersist.map(({ failure, executedIndex }) => ({
-              ...failure.row,
-              ...(executedCellMetaPatches[executedIndex]
-                ? {
-                    __deeplineCellMetaPatch:
-                      executedCellMetaPatches[executedIndex],
-                  }
-                : {}),
-              __deeplineRowKey:
-                uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
-              __deeplineRowStatus: 'failed',
-              __deeplineRowError: failure.error,
-            })),
+            ...failedRowsToPersist.map(({ failure, executedIndex }) =>
+              mapRowOutcomeRuntimeRow(
+                failedMapRowOutcome({
+                  key: uniqueRowsToExecuteEntries[executedIndex]!.rowKey,
+                  data: failure.row,
+                  cellMetaPatch: executedCellMetaPatches[executedIndex],
+                  error: failure.error,
+                }),
+              ),
+            ),
           ],
         });
         for (const { executedIndex } of rowsToPersist) {
@@ -4324,6 +4327,7 @@ function createMinimalWorkerCtx(
                       waterfallOpts,
                       callbacks,
                       workflowStep,
+                      resolveToolPacing,
                     ),
                 };
                 let activeField: string | null = null;
@@ -4461,11 +4465,9 @@ function createMinimalWorkerCtx(
                     Object.keys(cellMetaPatch).length > 0
                       ? cellMetaPatch
                       : undefined;
-                  // Keep the partially-enriched row. Default isolation persists
-                  // it as `_status='failed'` so the row can re-execute free on
-                  // the next run. Fail-fast persists failed rows only after the
-                  // chunk settles and only when every row failed; otherwise only
-                  // fully committed successful rows are recoverable.
+                  // Keep the partially-enriched row. It persists as
+                  // `_status='failed'` so export/retry can recover cells that
+                  // completed before the row error.
                   failedRowEntries[myIndex] = {
                     row: enriched as T & Record<string, unknown>,
                     error: message,
@@ -4485,7 +4487,7 @@ function createMinimalWorkerCtx(
                         `Row ${absoluteIndex} of ctx.dataset("${name}") failed` +
                         `${activeField ? ` at column "${activeField}"` : ''}: ${message} ` +
                         (failFastRowErrors
-                          ? '(row recorded as failed; onRowError:"fail" persists it only if every row fails)'
+                          ? '(row recorded as failed; onRowError:"fail" fails the run after recoverable cells persist)'
                           : '(row recorded as failed; sibling rows continue and the row re-executes on the next run)'),
                       ts: nowMs(),
                     });
@@ -4596,12 +4598,12 @@ function createMinimalWorkerCtx(
       const resultByKey = new Map<string, T & Record<string, unknown>>();
       for (const completedRow of prepared.completedRows) {
         const key =
-          typeof completedRow.__deeplineRowKey === 'string'
-            ? completedRow.__deeplineRowKey
-            : derivePlayRowIdentity(publicCsvInputRow(completedRow), name);
+          resolveMapRowOutcomeKey(completedRow) ??
+          derivePlayRowIdentity(publicCsvInputRow(completedRow), name);
         if (key) {
-          const cleanedRow = publicCsvOutputRow(completedRow);
-          delete cleanedRow.__deeplineRowKey;
+          const cleanedRow = stripMapRowOutcomeRuntimeFields(
+            publicCsvOutputRow(completedRow),
+          );
           resultByKey.set(key, cleanedRow as T & Record<string, unknown>);
         }
       }
@@ -4880,12 +4882,11 @@ function createMinimalWorkerCtx(
     if (failFastRowErrors && totalRowsFailed > 0 && totalRowsWritten > 0) {
       // onRowError:'fail', PARTIAL failure (some rows committed): fail the run
       // without finalizing the dataset. The committed rows already persisted
-      // per chunk and are surfaced as a recovered dataset (the failed rows'
-      // partial data was intentionally NOT persisted here — only the rows that
-      // fully committed before the failure are recoverable). We reach this
-      // AFTER the failing chunk completed normally (no per-row throw inside
-      // the durable chunk step, so no chunk-step retry storm); later chunks
-      // were skipped by the fail-fast short-circuit in the chunk loop.
+      // per chunk and are surfaced as a recovered dataset alongside failed
+      // rows' partial cells. We reach this AFTER the failing chunk completed
+      // normally (no per-row throw inside the durable chunk step, so no
+      // chunk-step retry storm); later chunks were skipped by the fail-fast
+      // short-circuit in the chunk loop.
       const firstError = totalRowFailureSamples[0]?.error ?? 'unknown error';
       throw new Error(
         `ctx.dataset("${name}") failed for ${totalRowsFailed} executed row(s) under onRowError:'fail'. ` +
@@ -5247,6 +5248,7 @@ function createMinimalWorkerCtx(
         opts,
         callbacks,
         workflowStep,
+        resolveToolPacing,
       );
     },
     async sleep(ms: number): Promise<void> {