npm - deepline - Versions diffs - 0.1.109 → 0.1.111 - Mend

deepline 0.1.109 → 0.1.111

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/dist/cli/index.js +2634 -1532
package/dist/cli/index.mjs +2547 -1451
package/dist/index.d.mts +21 -14
package/dist/index.d.ts +21 -14
package/dist/index.js +97 -23
package/dist/index.mjs +97 -23
package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +192 -121
package/dist/repo/apps/play-runner-workers/src/entry.ts +254 -65
package/dist/repo/apps/play-runner-workers/src/runtime/receipts.ts +18 -27
package/dist/repo/apps/play-runner-workers/src/workflow-instance-create.ts +44 -0
package/dist/repo/apps/play-runner-workers/src/workflow-retry.ts +7 -11
package/dist/repo/sdk/src/client.ts +35 -12
package/dist/repo/sdk/src/errors.ts +2 -2
package/dist/repo/sdk/src/http.ts +87 -7
package/dist/repo/sdk/src/play.ts +1 -1
package/dist/repo/sdk/src/plays/bundle-play-file.ts +5 -1
package/dist/repo/sdk/src/release.ts +13 -10
package/dist/repo/sdk/src/tool-output.ts +2 -2
package/dist/repo/sdk/src/types.ts +9 -6
package/dist/repo/shared_libs/play-runtime/fullenrich-batching.ts +229 -0
package/dist/repo/shared_libs/play-runtime/governor/policy.ts +1 -1
package/dist/repo/shared_libs/play-runtime/play-runtime-batching-registry.ts +20 -0
package/dist/repo/shared_libs/play-runtime/run-failure.ts +20 -12
package/dist/repo/shared_libs/play-runtime/run-ledger.ts +147 -70
package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +6 -2
package/dist/repo/shared_libs/play-runtime/secret-redaction.ts +15 -0
package/dist/repo/shared_libs/play-runtime/work-receipts.ts +1 -0
package/dist/repo/shared_libs/plays/bundling/index.ts +193 -21
package/dist/repo/shared_libs/plays/static-pipeline.ts +1 -3
package/dist/repo/shared_libs/security/outbound-url-policy.ts +238 -0
package/dist/repo/shared_libs/security/safe-fetch.ts +118 -0
package/dist/viewer/viewer.css +617 -0
package/dist/viewer/viewer.js +1496 -0
package/package.json +5 -1

package/dist/repo/apps/play-runner-workers/src/entry.ts CHANGED

@@ -42,7 +42,7 @@ import {
   executeChunkedRequests,
   type ChunkExecutionResult,
 } from '../../../shared_libs/play-runtime/batch-runtime';
-import { getDefaultPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/default-batch-strategies';
+import { getPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/play-runtime-batching-registry';
 import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
 import {
   createPlayExecutionGovernor,
@@ -161,6 +161,12 @@ import {
   type SecretAwareRequestInit,
   type SecretHandle,
 } from '../../../shared_libs/play-runtime/secret-capability';
+import { safePublicFetch } from '../../../shared_libs/security/safe-fetch';
+import {
+  assertPublicHttpUrl,
+  isIpAddressLiteral,
+  UnsafeOutboundUrlError,
+} from '../../../shared_libs/security/outbound-url-policy';
 import type {
   LiveNodeProgressMap,
   LiveNodeProgressSnapshot,
@@ -395,6 +401,9 @@ function captureRuntimeApiBinding(env: WorkerEnv): void {
 }
 let cachedCoordinatorBinding: WorkerEnv['COORDINATOR'] | null = null;
+const TRACE_FLUSH_MS = 1_000;
+const pendingTraceForwardsByRun = new Map<string, Promise<void>>();
 function captureCoordinatorBinding(env: WorkerEnv): void {
   cachedCoordinatorBinding = env.COORDINATOR ?? null;
 }
@@ -679,32 +688,58 @@ function recordRunnerPerfTrace(input: {
   ms?: number;
   extra?: Record<string, unknown>;
 }): void {
+  // Benchmark note: these runner spans decompose the server watch's terminal
+  // wait. They are logged locally and forwarded to the coordinator so
+  // `/api/v2/plays/run --watch` benchmark exports can join them with
+  // `server.stream_scheduler_terminal_event` by runId.
   if (!input.req.runId || !input.phase) return;
+  const phase = input.phase.startsWith('runner.')
+    ? input.phase
+    : `runner.${input.phase}`;
   // Tool-level traces can fire once per row/provider step. Forwarding each one
   // through the coordinator binding can consume Cloudflare's subrequest budget
   // before large batched maps finish.
-  if (input.phase.startsWith('runner.tool.')) {
+  if (phase.startsWith('runner.tool.')) {
     return;
   }
   const payload = {
     ts: Date.now(),
     source: 'dynamic_worker' as const,
     runId: input.req.runId,
-    phase: `runner.${input.phase}`,
+    phase,
     ms: input.ms ?? 0,
     ...(input.extra ?? {}),
   };
   console.log(
     `[deepline-run:${input.req.runId}] [perf-trace] ${JSON.stringify(payload)}`,
   );
-  cachedCoordinatorBinding
-    ?.recordPerfTrace(input.req.runId, payload)
-    .catch((error: unknown) => {
-      const message = error instanceof Error ? error.message : String(error);
-      console.warn(
-        `[deepline-run:${input.req.runId}] failed to forward runner perf trace: ${message}`,
-      );
-    });
+  const binding = cachedCoordinatorBinding;
+  if (!binding) return;
+  const forward = binding
+    .recordPerfTrace(input.req.runId, payload)
+    .catch(() => undefined);
+  const previous = pendingTraceForwardsByRun.get(input.req.runId);
+  const pending = previous
+    ? previous.then(
+        () => forward,
+        () => forward,
+      )
+    : forward;
+  pendingTraceForwardsByRun.set(input.req.runId, pending);
+  void pending.finally(() => {
+    if (pendingTraceForwardsByRun.get(input.req.runId) === pending) {
+      pendingTraceForwardsByRun.delete(input.req.runId);
+    }
+  });
+}
+async function drainRunnerPerfTraces(req: RunRequest): Promise<void> {
+  const pending = pendingTraceForwardsByRun.get(req.runId);
+  if (!pending) return;
+  await Promise.race([
+    pending,
+    new Promise((resolve) => setTimeout(resolve, TRACE_FLUSH_MS)),
+  ]);
 }
 function makeRequestId(): string {
@@ -1031,10 +1066,18 @@ async function executeToolWithLifecycle(
   args: { id: string; toolId: string; input: Record<string, unknown> },
   workflowStep: WorkflowStep | undefined,
   callbacks: WorkerCtxCallbacks | undefined,
+  onProviderBackpressure?: (retryAfterMs: number) => void,
+  onRetryAttempt?: () => void,
 ): Promise<ToolExecuteResult> {
   callbacks?.onToolCalled?.(args.toolId, nowMs());
   try {
-    return await executeTool(req, args, workflowStep);
+    return await executeTool(
+      req,
+      args,
+      workflowStep,
+      onProviderBackpressure,
+      onRetryAttempt,
+    );
   } catch (error) {
     callbacks?.onToolFailed?.(args.toolId, nowMs());
     throw error;
@@ -1178,17 +1221,38 @@ async function callToolDirect(
     attempt <= WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS;
     attempt += 1
   ) {
-    const res = await fetchRuntimeApi(req.baseUrl, path, {
-      method: 'POST',
-      headers: {
-        'content-type': 'application/json',
-        authorization: `Bearer ${req.executorToken}`,
-        'x-deepline-request-id': `${req.runId}:${toolId}:${id}:attempt:${attempt}`,
-        [EXECUTE_RESPONSE_CONTRACT_HEADER]: V2_EXECUTE_RESPONSE_CONTRACT,
-        [EXECUTE_TOOL_METADATA_HEADER]: 'true',
-      },
-      body: JSON.stringify({ payload: input }),
-    });
+    let res: Response;
+    try {
+      res = await fetchRuntimeApi(req.baseUrl, path, {
+        method: 'POST',
+        headers: {
+          'content-type': 'application/json',
+          authorization: `Bearer ${req.executorToken}`,
+          'x-deepline-request-id': `${req.runId}:${toolId}:${id}:attempt:${attempt}`,
+          [EXECUTE_RESPONSE_CONTRACT_HEADER]: V2_EXECUTE_RESPONSE_CONTRACT,
+          [EXECUTE_TOOL_METADATA_HEADER]: 'true',
+        },
+        body: JSON.stringify({ payload: input }),
+      });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      lastError = new Error(
+        `Tool ${toolId} transport failed calling ${path} for run ${req.runId} on attempt ${attempt}/${WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS}: ${message}`,
+      );
+      if (
+        attempt >= WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS ||
+        !isRetryableRuntimeApiError(error)
+      ) {
+        throw lastError;
+      }
+      onRetryAttempt?.();
+      const delayMs = WORKER_TOOL_TRANSPORT_RETRY_DELAY_MS * attempt;
+      console.warn(
+        `[deepline-run:${req.runId}] tool transport retry tool=${toolId} path=${path} attempt=${attempt}/${WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS} retryAfterMs=${delayMs} error=${redactSecretsFromLogString(message)}`,
+      );
+      await sleepWorkerMs(delayMs);
+      continue;
+    }
     if (res.ok) {
       const body = (await res.json()) as Record<string, unknown>;
       const parsed = parseToolExecuteResponse(toolId, body);
@@ -1423,6 +1487,8 @@ const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
  * retry budget, so a runaway storm stays bounded and loud.
  */
 const WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS = 8;
+const WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS = 3;
+const WORKER_TOOL_TRANSPORT_RETRY_DELAY_MS = 1_000;
 function sleepWorkerMs(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
@@ -1442,6 +1508,7 @@ class WorkerToolBatchScheduler {
     private readonly resolvePacing: WorkerPacingResolver,
     private readonly abortSignal?: AbortSignal,
     private readonly onRequestsSettled?: (count: number) => void,
+    private readonly callbacks?: WorkerCtxCallbacks,
   ) {}
   /**
@@ -1508,7 +1575,7 @@ class WorkerToolBatchScheduler {
     return this.queue.some(
       (request) =>
         request.toolId !== 'test_wait_for_event' &&
-        getDefaultPlayRuntimeBatchStrategy(request.toolId) !== null,
+        getPlayRuntimeBatchStrategy(request.toolId) !== null,
     );
   }
@@ -1539,8 +1606,8 @@ class WorkerToolBatchScheduler {
     toolId: string,
     requests: WorkerToolBatchRequest[],
   ): Promise<void> {
-    const strategy = getDefaultPlayRuntimeBatchStrategy(toolId);
-    if (!strategy || toolId === 'test_wait_for_event') {
+    const strategy = getPlayRuntimeBatchStrategy(toolId);
+    if (!strategy || toolId === 'test_wait_for_event' || requests.length < 2) {
       const groupStartedAt = nowMs();
       await Promise.all(
         requests.map(async (request) => {
@@ -1552,10 +1619,11 @@ class WorkerToolBatchScheduler {
           });
           try {
             request.resolve(
-              await executeTool(
+              await executeToolWithLifecycle(
                 this.req,
                 { id: request.id, toolId, input: request.input },
                 request.workflowStep,
+                this.callbacks,
                 (retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
                 () => this.governor.chargeBudget('retry'),
               ),
@@ -1591,6 +1659,7 @@ class WorkerToolBatchScheduler {
       reportBackpressure: (retryAfterMs) =>
         this.reportBackpressure(toolId, retryAfterMs),
       onRequestsSettled: this.onRequestsSettled,
+      callbacks: this.callbacks,
     });
     recordRunnerPerfTrace({
       req: this.req,
@@ -1625,6 +1694,7 @@ async function executeBatchedWorkerToolGroup(input: {
   abortSignal?: AbortSignal;
   reportBackpressure: (retryAfterMs: number) => void;
   onRequestsSettled?: (count: number) => void;
+  callbacks?: WorkerCtxCallbacks;
 }): Promise<void> {
   const compiledBatches = compileRequestsWithStrategy({
     requests: input.requests,
@@ -1659,6 +1729,7 @@ async function executeBatchedWorkerToolGroup(input: {
         signal: input.abortSignal,
       });
       try {
+        input.callbacks?.onToolCalled?.(batch.batchOperation, nowMs());
         return await executeTool(
           input.req,
           {
@@ -1670,6 +1741,9 @@ async function executeBatchedWorkerToolGroup(input: {
           input.reportBackpressure,
           () => input.governor.chargeBudget('retry'),
         );
+      } catch (error) {
+        input.callbacks?.onToolFailed?.(batch.batchOperation, nowMs());
+        throw error;
       } finally {
         slot.release();
       }
@@ -2276,6 +2350,46 @@ function parseFetchJsonOrNull(bodyText: string): unknown | null {
   }
 }
+async function safeWorkerPublicFetch(
+  input: string | URL,
+  init: RequestInit,
+  options: {
+    allowedOrigins: Iterable<string>;
+    sensitiveHeaders: Iterable<string>;
+  },
+): Promise<Response> {
+  const allowedOrigins = new Set(options.allowedOrigins);
+  return safePublicFetch(input, init, {
+    sensitiveHeaders: options.sensitiveHeaders,
+    fetchImpl: async (nextInput, nextInit) => {
+      const url = assertPublicHttpUrl(nextInput);
+      if (
+        !isIpAddressLiteral(url.hostname) &&
+        !allowedOrigins.has(url.origin)
+      ) {
+        throw new UnsafeOutboundUrlError(
+          'workers_edge ctx.fetch requires a public IP literal target or Deepline runtime origin. Use a Deepline integration tool for other hostname URLs.',
+        );
+      }
+      return fetch(url, nextInit);
+    },
+  });
+}
+function normalizeAllowedWorkerFetchOrigin(rawUrl: string): string | null {
+  try {
+    return assertPublicHttpUrl(rawUrl).origin;
+  } catch {
+    return null;
+  }
+}
+function getAllowedWorkerFetchOrigins(req: RunRequest): string[] {
+  return [req.baseUrl, req.callbackUrl]
+    .map(normalizeAllowedWorkerFetchOrigin)
+    .filter((origin): origin is string => origin !== null);
+}
 // ---------------------------------------------------------------------------
 // Streaming CSV parser. Pipes a `ReadableStream<Uint8Array>` from R2 through
 // a TextDecoder + line buffer + RFC-4180-ish state machine, yielding chunks
@@ -3248,7 +3362,8 @@ function createMinimalWorkerCtx(
   const executeWithRuntimeReceipt = async <T>(
     key: string,
     execute: () => Promise<T> | T,
-    repairRunningReceiptForSameRun = false,
+    repairRunningReceiptForSameRun = true,
+    reclaimRunning = false,
   ): Promise<T> => {
     const serialized = await runWorkerRuntimeReceiptBoundary<unknown>({
       orgId: req.orgId,
@@ -3258,6 +3373,7 @@ function createMinimalWorkerCtx(
       receiptStore,
       execute: async () => serializeDurableStepValue(await execute()),
       repairRunningReceiptForSameRun,
+      reclaimRunning,
     });
     return deserializeDurableStepValue(serialized) as T;
   };
@@ -3279,7 +3395,7 @@ function createMinimalWorkerCtx(
         )(name, async () => serializeDurableStepValue(await execute()));
         return deserializeDurableStepValue(serialized) as T;
       },
-      true,
+      false,
     );
   };
   const nextCtxStepReceiptKey = (name: string): string => {
@@ -3300,6 +3416,14 @@ function createMinimalWorkerCtx(
     }
     return `:stale:${staleAfterSeconds}:${Math.floor(nowMs() / (staleAfterSeconds * 1000))}`;
   };
+  const rootToolBatchScheduler = new WorkerToolBatchScheduler(
+    req,
+    governor,
+    resolveToolPacing,
+    abortSignal,
+    undefined,
+    callbacks,
+  );
   // Local ancestry chain that always ENDS with the currently-executing play
   // (req.playName). The /api/v2/plays/run lineage validator requires the
   // submitted ancestry's tail to equal the executor token's play name (i.e.
@@ -3746,10 +3870,8 @@ function createMinimalWorkerCtx(
                 reportExecutionHeartbeat(false);
                 const entry = uniqueRowsToExecuteEntries[myIndex]!;
                 const pendingRow = pendingRowsByKey.get(entry.rowKey);
-                const row = runtimeCsvExecutionRow(
-                  entry.row,
-                  pendingRow,
-                ) as T & Record<string, unknown>;
+                const row = runtimeCsvExecutionRow(entry.row, pendingRow) as T &
+                  Record<string, unknown>;
                 const absoluteIndex = entry.absoluteIndex;
                 const enriched: Record<string, unknown> =
                   cloneCsvAliasedRow(row);
@@ -4753,7 +4875,13 @@ function createMinimalWorkerCtx(
             toolId: request.toolId,
             requestInput: request.input,
           })}${staleRuntimeSuffix(request.staleAfterSeconds)}`,
-          () => executeToolWithLifecycle(req, request, workflowStep, callbacks),
+          () =>
+            rootToolBatchScheduler.execute(
+              request.id,
+              request.toolId,
+              request.input,
+              workflowStep,
+            ),
         );
       },
     },
@@ -5214,7 +5342,10 @@ function createMinimalWorkerCtx(
         };
         const fetchInit = { ...init, headers };
         delete fetchInit.auth;
-        const response = await fetch(url, fetchInit);
+        const response = await safeWorkerPublicFetch(url, fetchInit, {
+          allowedOrigins: getAllowedWorkerFetchOrigins(req),
+          sensitiveHeaders: Object.keys(secretHeaderMarkers),
+        });
         assertNotAborted(abortSignal);
         const bodyText = await response.text();
         const redactedBodyText = secretRedactor.redactString(bodyText);
@@ -5500,12 +5631,10 @@ async function executeRunRequest(
   let runLogBuffer: string[] = [];
   let pendingRunLogLines: string[] = [];
   // Monotonic count of every line ever appended to this run's worker log
-  // channel. runLogBuffer/pendingRunLogLines are rotating tails of those
-  // lines (RUN_LOG_BUFFER_LIMIT is the coordinator transport cache only), so
-  // each log.appended batch can carry the absolute channelOffset of its first
-  // line: totalEmittedLogLines - pendingRunLogLines.length. Run Log Stream
-  // ingestion skips re-sent prefixes positionally (exactly-once, repeated
-  // identical lines preserved) instead of text-deduping.
+  // channel. runLogBuffer is only the rotating live/coordinator transport
+  // cache; pendingRunLogLines is the durable unsent suffix and must not rotate,
+  // otherwise a flush already in flight can let fresh lines fall out before
+  // Run Log Stream ingestion ever sees them.
   let totalEmittedLogLines = 0;
   let stepProgressByNodeId: LiveNodeProgressMap = {};
   let dirtyProgressNodeIds = new Set<string>();
@@ -5531,9 +5660,7 @@ async function executeRunRequest(
     if (!trimmed) return;
     totalEmittedLogLines += 1;
     runLogBuffer = [...runLogBuffer, trimmed].slice(-RUN_LOG_BUFFER_LIMIT);
-    pendingRunLogLines = [...pendingRunLogLines, trimmed].slice(
-      -RUN_LOG_BUFFER_LIMIT,
-    );
+    pendingRunLogLines = [...pendingRunLogLines, trimmed];
   };
   const updateStepProgress = (input: {
@@ -5722,9 +5849,7 @@ async function executeRunRequest(
         lines: pendingRunLogLines,
         // Positional cursor: pendingRunLogLines always holds the LAST
         // pending lines emitted on this channel, so the offset of its first
-        // line is total-emitted minus pending length. This also covers the
-        // terminal full-buffer re-send (pending = runLogBuffer), which
-        // ingestion then skips positionally instead of via text dedupe.
+        // line is total-emitted minus pending length.
         channelOffset: totalEmittedLogLines - pendingRunLogLines.length,
       });
       pendingRunLogLines = [];
@@ -5820,18 +5945,13 @@ async function executeRunRequest(
     terminalEvent: PlayRunLedgerEvent,
   ): Promise<void> => {
     if (!options?.persistResultDatasets) return;
+    await ledgerFlushInFlight;
     const now = nowMs();
-    // Terminal re-send of the full retained buffer. drainPendingLedgerEvents
-    // stamps it with channelOffset = totalEmitted - buffer length, so Run Log
-    // Stream ingestion drops the already-ingested prefix positionally.
-    pendingRunLogLines = runLogBuffer;
     dirtyProgressNodeIds = new Set([
       ...dirtyProgressNodeIds,
       ...Object.keys(stepProgressByNodeId),
     ]);
-    pendingLedgerEvents = [...pendingLedgerEvents, terminalEvent];
-    await ledgerFlushInFlight;
-    const events = drainPendingLedgerEvents(now);
+    const events = [...drainPendingLedgerEvents(now), terminalEvent];
     if (events.length === 0) return;
     try {
       await postRuntimeApi(req.baseUrl, req.executorToken, {
@@ -5871,7 +5991,17 @@ async function executeRunRequest(
     onToolFailed: (toolId, at) => stepLifecycle?.onToolFailed(toolId, at),
   };
+  let hasEmittedRunnerEvent = false;
   const wrappedEmit = (event: RunnerEvent) => {
+    if (!hasEmittedRunnerEvent) {
+      hasEmittedRunnerEvent = true;
+      recordRunnerPerfTrace({
+        req,
+        phase: 'first_event',
+        ms: nowMs() - startedAt,
+        extra: { eventType: event.type },
+      });
+    }
     if (event.type === 'log') {
       appendRunLogLine(event.message);
       flushLedgerEvents(false);
@@ -6042,11 +6172,18 @@ async function executeRunRequest(
       phase: 'runner.execute_total',
       ms: nowMs() - startedAt,
     });
+    // The server-side watch path reads coordinator-buffered perf traces from
+    // the same tail response that carries the terminal event. Runner traces are
+    // forwarded asynchronously during execution so normal play latency is not
+    // gated on observability writes; before returning terminal output, wait a
+    // bounded interval for those writes to land. This keeps benchmark exports
+    // able to decompose "terminal wait" into runner/dataset/ledger phases
+    // without turning trace delivery into a correctness dependency.
+    await drainRunnerPerfTraces(req);
     return {
       playName: req.playName,
       result: serializedResult,
       outputRows: inferOutputRows(serializedResult),
-      liveLogs: runLogBuffer,
       liveNodeProgress: stepProgressSnapshot(),
       durationMs: nowMs() - startedAt,
     };
@@ -6070,6 +6207,7 @@ async function executeRunRequest(
       appendRunLogLine(
         `${aborted ? '[cancelled]' : '[error]'} ${redactSecretsFromLogString(message)}`,
       );
+      const terminalUpdateStartedAt = nowMs();
       await flushTerminalLedgerEvents({
         type: aborted ? 'run.cancelled' : 'run.failed',
         runId: req.runId,
@@ -6094,25 +6232,55 @@ async function executeRunRequest(
               ],
             },
       });
+      recordRunnerPerfTrace({
+        req,
+        phase: aborted
+          ? 'runner.terminal_ledger_append_cancelled'
+          : 'runner.terminal_ledger_append_failed',
+        ms: nowMs() - terminalUpdateStartedAt,
+        extra: {
+          errorCode: failure.code,
+          errorPhase: failure.phase,
+        },
+      });
+      const billingStartedAt = nowMs();
       await finalizeWorkerComputeBilling({
         req,
         success: false,
         actionEstimate: 4,
-      }).catch((finalizeError) => {
-        console.error(
-          `[play-harness] non-fatal compute billing finalize failed runId=${req.runId}: ${
-            finalizeError instanceof Error
-              ? finalizeError.message
-              : String(finalizeError)
-          }`,
-        );
-      });
+      })
+        .catch((finalizeError) => {
+          console.error(
+            `[play-harness] non-fatal compute billing finalize failed runId=${req.runId}: ${
+              finalizeError instanceof Error
+                ? finalizeError.message
+                : String(finalizeError)
+            }`,
+          );
+        })
+        .finally(() => {
+          recordRunnerPerfTrace({
+            req,
+            phase: 'runner.compute_billing_finalize_failed',
+            ms: nowMs() - billingStartedAt,
+          });
+        });
     }
     await signalParentPlayTerminal({
       req,
       status: aborted ? 'cancelled' : 'failed',
       error: message,
     }).catch(() => null);
+    recordRunnerPerfTrace({
+      req,
+      phase: aborted ? 'runner.execute_cancelled' : 'runner.execute_failed',
+      ms: nowMs() - startedAt,
+      extra: {
+        errorCode: failure.code,
+        errorPhase: failure.phase,
+      },
+    });
+    await drainRunnerPerfTraces(req);
     throw error;
   } finally {
     clearTimeout(runtimeDeadlineTimer);
@@ -6541,14 +6709,29 @@ export class TenantWorkflow extends WorkflowEntrypoint<
     // Must run BEFORE any SDK call site that would reach into HARNESS,
     // i.e. before user play code is invoked. Idempotent within a run.
     captureHarnessBinding(this.env);
+    recordRunnerPerfTrace({
+      req,
+      phase: 'tenant_workflow_entry',
+      ms: 0,
+      extra: {
+        hasWorkflowStep: true,
+      },
+    });
     // Fire the one-time wiring probe (deduplicated across runs in the
     // same isolate). Awaited so the result is in the log before user code
     // begins. A missing or unhealthy HARNESS fails the run before user code
     // can accidentally take a slower fallback path.
+    const probeStartedAt = nowMs();
     await probeHarnessOnce(this.env, runPrefix);
+    recordRunnerPerfTrace({
+      req,
+      phase: 'tenant_workflow_probe_harness',
+      ms: nowMs() - probeStartedAt,
+    });
     const abortController = new AbortController();
     try {
-      return (await executeRunRequest(
+      const executeStartedAt = nowMs();
+      const output = (await executeRunRequest(
         req,
         this.env,
         (runnerEvent) => {
@@ -6579,6 +6762,12 @@ export class TenantWorkflow extends WorkflowEntrypoint<
           waitUntil: (promise) => this.ctx.waitUntil(promise),
         },
       )) as Record<string, unknown>;
+      recordRunnerPerfTrace({
+        req,
+        phase: 'tenant_workflow_execute_request',
+        ms: nowMs() - executeStartedAt,
+      });
+      return output;
     } catch (error) {
       // CF Workflows + the dynamic-workflows framework swallow the error
       // message and surface only "internal error; reference = <id>" via

package/dist/repo/apps/play-runner-workers/src/runtime/receipts.ts CHANGED

@@ -41,22 +41,13 @@ function errorMessage(error: unknown): string {
   return error instanceof Error ? error.message : String(error);
 }
-function runningReceiptError(
-  key: string,
-  receipt: WorkerRuntimeReceipt,
-): Error {
-  return new Error(
-    `Runtime receipt ${key} is already running for run ${receipt.runId ?? 'unknown'}.`,
-  );
-}
 async function executeAndPersistReceipt<T>(input: {
   key: string;
   playName: string;
   runId: string;
   execute: () => Promise<T> | T;
   receiptStore: WorkerRuntimeReceiptStore;
-  ownership: 'claimed' | 'workflow_replay';
+  ownership: 'claimed' | 'reconciled';
 }): Promise<T> {
   let output: T;
   try {
@@ -83,9 +74,13 @@ async function executeAndPersistReceipt<T>(input: {
     output,
   });
   if (!completed) {
-    throw new Error(
-      `Runtime receipt ${input.key} ${input.ownership} execution completed but completed receipt could not be persisted.`,
-    );
+    return output;
+  }
+  if (
+    (completed.status === 'completed' || completed.status === 'skipped') &&
+    completed.output !== undefined
+  ) {
+    return receiptOutput<T>(completed);
   }
   return output;
 }
@@ -94,6 +89,7 @@ export async function runWorkerRuntimeReceiptBoundary<T>(
   input: RuntimeReceiptContext & {
     execute: () => Promise<T> | T;
     repairRunningReceiptForSameRun?: boolean;
+    reclaimRunning?: boolean;
   },
 ): Promise<T> {
   const key = scopedReceiptKey(input);
@@ -102,25 +98,20 @@ export async function runWorkerRuntimeReceiptBoundary<T>(
     playName: input.playName,
     runId: input.runId,
     key,
+    ...(input.reclaimRunning === true ? { reclaimRunning: true } : {}),
   });
   if (claimed.disposition === 'reused') {
     return receiptOutput<T>(claimed.receipt);
   }
   if (claimed.disposition === 'running') {
-    if (
-      input.repairRunningReceiptForSameRun &&
-      claimed.receipt.runId === input.runId
-    ) {
-      return executeAndPersistReceipt({
-        key,
-        playName: input.playName,
-        runId: input.runId,
-        execute: input.execute,
-        receiptStore,
-        ownership: 'workflow_replay',
-      });
-    }
-    throw runningReceiptError(key, claimed.receipt);
+    return executeAndPersistReceipt({
+      key,
+      playName: input.playName,
+      runId: input.runId,
+      execute: input.execute,
+      receiptStore,
+      ownership: 'reconciled',
+    });
   }
   if (claimed.disposition === 'failed') {
     throw new Error(