deepline 0.1.25 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,12 +63,37 @@ import {
63
63
  derivePlayRowIdentityFromKey,
64
64
  } from '../../../shared_libs/plays/row-identity';
65
65
  import {
66
+ getTopLevelPipelineSubsteps,
66
67
  getCompiledPipelineSubsteps,
67
68
  flattenStaticPipeline,
68
69
  resolveSheetContractForTableNamespace,
69
70
  sqlSafePlayColumnName,
71
+ type PlayStaticSubstep,
70
72
  type PlayStaticPipeline,
73
+ type PlaySheetContract,
71
74
  } from '../../../shared_libs/plays/static-pipeline';
75
+ import {
76
+ PlayStepLifecycleTracker,
77
+ type PlayStepLifecycleEvent,
78
+ } from '../../../shared_libs/play-runtime/step-lifecycle-tracker';
79
+ import type {
80
+ PlayRunLedgerEvent,
81
+ PlayRunLedgerStepProgress,
82
+ PlayRunLedgerStepStatus,
83
+ } from '../../../shared_libs/play-runtime/run-ledger';
84
+ import {
85
+ createCsvDatasetHandle,
86
+ createInlineDatasetHandle,
87
+ createMaterializedDatasetHandle,
88
+ createPersistedDatasetHandle,
89
+ datasetRowCountHint,
90
+ isDatasetHandle,
91
+ iterDatasetChunks,
92
+ WORKER_DATASET_IN_MEMORY_ROWS,
93
+ WORKER_DATASET_PREVIEW_ROWS,
94
+ type WorkerDatasetHandle,
95
+ type WorkerDatasetInput,
96
+ } from './runtime/dataset-handles';
72
97
  // The harness stub forwards leaf calls (validation, runtime-api HTTP) into
73
98
  // the long-lived Play Harness Worker via env.HARNESS. We import the
74
99
  // `setHarnessBinding` setter eagerly so it's available the moment
@@ -80,9 +105,9 @@ import {
80
105
  // modules without going through this stub is how we'd accidentally
81
106
  // re-bundle harness internals into per-play. Keep that in mind.
82
107
  import {
83
- harnessFetchStagedFile,
84
108
  harnessPersistCompletedSheetRows,
85
- harnessPrewarmPostgresSessions,
109
+ harnessReadSheetDatasetRows,
110
+ harnessReadStagedFileChunk,
86
111
  harnessStartSheetDataset,
87
112
  setHarnessBinding,
88
113
  } from '../../../sdk/src/plays/harness-stub';
@@ -115,12 +140,14 @@ type RunRequest = {
115
140
  runtimeInput: Record<string, unknown>;
116
141
  /** Optional inline CSV rows (for plays where ctx.csv was passed inline data). */
117
142
  inlineCsv?: { name: string; rows: Record<string, unknown>[] } | null;
118
- /** R2 keys for input files keyed by logical filename (used by ctx.csv). */
119
- inputR2Keys?: Record<string, string> | null;
143
+ /** Staged input files keyed by logical filename (used by ctx.csv). */
144
+ inputFiles?: Record<string, WorkerFileRef> | null;
120
145
  /** Files packaged with the play artifact (relative-path imports). */
121
146
  packagedFiles?: Array<{
122
147
  playPath: string;
123
148
  storageKey: string;
149
+ contentType?: string | null;
150
+ bytes?: number | null;
124
151
  }> | null;
125
152
  /** Partition fan-out: only process rows[start..end) of a sliced dataset. */
126
153
  partitionRange?: { start: number; end: number } | null;
@@ -148,6 +175,14 @@ type RunRequest = {
148
175
  totalRows?: number;
149
176
  };
150
177
 
178
/**
 * Reference to a staged input file handed to the runner (the values of
 * `RunRequest.inputFiles`, which are keyed by logical filename for ctx.csv).
 */
type WorkerFileRef = {
  /** Logical path of the file. NOTE(review): presumably the path the play passes to ctx.csv — confirm. */
  logicalPath: string;
  /** File name of the staged file. NOTE(review): exact relationship to `logicalPath` is not visible here. */
  fileName: string;
  /** Storage key used to fetch the object from the bucket binding or the harness. */
  storageKey: string;
  /** Content type of the staged file, when known. */
  contentType?: string | null;
  /**
   * Byte length of the staged file. Optional in the type, but the CSV byte
   * reader rejects a reference whose byte length is missing or invalid.
   */
  bytes?: number | null;
};
185
+
151
186
  const EXECUTE_TOOL_METADATA_HEADER = 'x-deepline-include-tool-metadata';
152
187
 
153
188
  /** R2 binding injected by the Worker runtime (when present in deploy metadata). */
@@ -315,6 +350,7 @@ async function probeHarnessOnce(
315
350
  */
316
351
  const RUNTIME_API_TIMEOUT_MS = 30_000;
317
352
  const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
353
+ const RUNTIME_API_RETRY_DELAYS_MS = [250, 750, 1500] as const;
318
354
  let loggedMissingRuntimeApiBinding = false;
319
355
 
320
356
  async function fetchRuntimeApi(
@@ -383,132 +419,6 @@ const WORKER_PLAY_CALL_LIMITS = {
383
419
  maxConcurrentPlayCalls: 16,
384
420
  };
385
421
 
386
- /**
387
- * Produces a dataset-envelope-shaped object compatible with the legacy
388
- * SerializedPlayDataset shape (kind/datasetKind/count/columns/preview) so
389
- * tests + assertions that probe `result.rows.columns` etc. work without the
390
- * ctx changing semantics. Plays still iterate rows via array semantics.
391
- */
392
- function makeWorkerDataset<T extends Record<string, unknown>>(
393
- name: string,
394
- rows: T[],
395
- options?: {
396
- count?: number;
397
- datasetKind?: 'csv' | 'map';
398
- cacheSummary?: string | null;
399
- workProgress?: {
400
- total: number;
401
- executed: number;
402
- reused: number;
403
- skipped: number;
404
- pending: number;
405
- failed: number;
406
- degraded?: boolean;
407
- duplicates?: {
408
- exact?: number;
409
- semantic?: number;
410
- rejected?: number;
411
- };
412
- };
413
- },
414
- ): T[] & {
415
- count(): Promise<number>;
416
- peek(limit?: number): Promise<T[]>;
417
- materialize(limit?: number): Promise<T[]>;
418
- toJSON(): unknown;
419
- datasetId: string;
420
- tableNamespace: string;
421
- } {
422
- const datasetId = `map:${name}`;
423
- const count = Math.max(0, Math.floor(options?.count ?? rows.length));
424
- const datasetKind = options?.datasetKind ?? 'map';
425
- const cacheSummary = options?.cacheSummary ?? null;
426
- const workProgress = options?.workProgress;
427
- // Build the array result. JSON.stringify on arrays calls toJSON only if
428
- // present on the array itself — we attach below. The dataset metadata is
429
- // also exposed via own properties so plays can `enriched.count()` etc.
430
- const arr = rows as T[] & {
431
- count(): Promise<number>;
432
- peek(limit?: number): Promise<T[]>;
433
- materialize(limit?: number): Promise<T[]>;
434
- toJSON(): unknown;
435
- datasetId: string;
436
- tableNamespace: string;
437
- };
438
- const previewLimit = 5;
439
- const inferredColumns = (() => {
440
- const cols = new Set<string>();
441
- for (const r of rows) {
442
- for (const k of Object.keys(r)) cols.add(k);
443
- }
444
- return [...cols];
445
- })();
446
- Object.defineProperty(arr, 'count', {
447
- value: async () => count,
448
- enumerable: false,
449
- });
450
- Object.defineProperty(arr, 'peek', {
451
- value: async (limit = previewLimit) => rows.slice(0, Math.max(0, limit)),
452
- enumerable: false,
453
- });
454
- Object.defineProperty(arr, 'materialize', {
455
- value: async (limit?: number) =>
456
- limit === undefined ? [...rows] : rows.slice(0, Math.max(0, limit)),
457
- enumerable: false,
458
- });
459
- Object.defineProperty(arr, 'datasetId', {
460
- value: datasetId,
461
- enumerable: true,
462
- });
463
- Object.defineProperty(arr, 'tableNamespace', {
464
- value: name,
465
- enumerable: true,
466
- });
467
- Object.defineProperty(arr, '__deeplineDatasetCount', {
468
- value: count,
469
- enumerable: false,
470
- });
471
- Object.defineProperty(arr, '__deeplineDatasetKind', {
472
- value: datasetKind,
473
- enumerable: false,
474
- });
475
- Object.defineProperty(arr, '__deeplineCacheSummary', {
476
- value: cacheSummary,
477
- enumerable: false,
478
- });
479
- Object.defineProperty(arr, '__deeplineWorkProgress', {
480
- value: workProgress,
481
- enumerable: false,
482
- });
483
- // Plays often `return { rows: dataset, count: N }`. JSON.stringify on the
484
- // array would normally produce `[row, row, ...]` — we want the dataset
485
- // envelope shape instead so assertions seeing `result.rows.columns` pass.
486
- // toJSON on an array is honored by JSON.stringify per ES spec.
487
- // toJSON includes ALL rows so the workflow DO can persist the full
488
- // dataset to the sheet table. We clone via plain-object copy to avoid
489
- // re-entrant toJSON resolution (the dataset IS an array; passing it back
490
- // via `preview: arr` would recurse forever through this same toJSON).
491
- Object.defineProperty(arr, 'toJSON', {
492
- value: () => {
493
- const plainRows = rows.map((r) => ({ ...r }));
494
- return {
495
- kind: 'dataset' as const,
496
- datasetKind,
497
- datasetId,
498
- count,
499
- columns: inferredColumns,
500
- preview: plainRows,
501
- tableNamespace: name,
502
- ...(cacheSummary ? { cacheSummary } : {}),
503
- ...(workProgress ? { _metadata: { workProgress } } : {}),
504
- };
505
- },
506
- enumerable: false,
507
- });
508
- void previewLimit;
509
- return arr;
510
- }
511
-
512
422
  type RunnerEvent =
513
423
  | {
514
424
  type: 'log';
@@ -533,12 +443,53 @@ type WorkerCtxCallbacks = {
533
443
  nodeId: string;
534
444
  progress: LiveNodeProgressSnapshot;
535
445
  }) => void;
446
+ onMapStarted?: (nodeId: string, at?: number) => void;
447
+ onMapCompleted?: (nodeId: string, at?: number) => void;
448
+ onToolCalled?: (toolId: string, at?: number) => void;
449
+ onToolFailed?: (toolId: string, at?: number) => void;
536
450
  };
537
451
 
538
452
  function nowMs(): number {
539
453
  return Date.now();
540
454
  }
541
455
 
456
/**
 * Builds the node identifier for one static pipeline substep.
 *
 * The id is the substep type used as a prefix, followed by the fields that
 * distinguish it (for example `tool:<field>:<toolId>`). Substep types that
 * are not handled below fall back to the positional id `node:<index>`.
 *
 * @param substep - Substep to identify.
 * @param index - Position of the substep; used when a `csv`/`code` substep
 *   has a falsy `field`, and for unrecognized substep types.
 * @returns The node id string.
 */
function getStaticSubstepNodeId(
  substep: PlayStaticSubstep,
  index: number,
): string {
  if (substep.type === 'csv') {
    return `csv:${substep.field || index}`;
  }
  if (substep.type === 'map') {
    // Prefer the table namespace; fall back to the field name.
    return `map:${substep.tableNamespace ?? substep.field}`;
  }
  if (substep.type === 'tool') {
    return `tool:${substep.field}:${substep.toolId}`;
  }
  if (substep.type === 'waterfall') {
    return `waterfall:${substep.id ?? substep.field}`;
  }
  if (substep.type === 'play_call') {
    return `play_call:${substep.field}:${substep.playId}`;
  }
  if (substep.type === 'run_javascript') {
    return `run_javascript:${substep.alias}`;
  }
  if (substep.type === 'code') {
    return `code:${substep.field || index}`;
  }
  return `node:${index}`;
}
479
+
480
/**
 * Lists the top-level pipeline substeps of a contract snapshot as
 * `{ nodeId, type }` pairs, in the order the substeps are returned.
 *
 * @param contractSnapshot - Untyped snapshot; only its `staticPipeline`
 *   property is read, and only when the snapshot is a record.
 * @returns One entry per top-level substep, with the node id derived by
 *   `getStaticSubstepNodeId`.
 */
function buildOrderedNodeList(
  contractSnapshot: unknown,
): Array<{ nodeId: string; type: string }> {
  // A non-record snapshot is treated as having no pipeline.
  let staticPipeline: PlayStaticPipeline | null = null;
  if (isRecord(contractSnapshot)) {
    staticPipeline =
      (contractSnapshot.staticPipeline as
        | PlayStaticPipeline
        | null
        | undefined) ?? null;
  }

  const orderedNodes: Array<{ nodeId: string; type: string }> = [];
  getTopLevelPipelineSubsteps(staticPipeline).forEach((substep, position) => {
    orderedNodes.push({
      nodeId: getStaticSubstepNodeId(substep, position),
      type: substep.type,
    });
  });
  return orderedNodes;
}
492
+
542
493
  function recordRunnerPerfTrace(input: {
543
494
  req: RunRequest;
544
495
  phase: string;
@@ -557,7 +508,7 @@ function recordRunnerPerfTrace(input: {
557
508
  source: 'dynamic_worker' as const,
558
509
  runId: input.req.runId,
559
510
  phase: `runner.${input.phase}`,
560
- ...(input.ms !== undefined ? { ms: input.ms } : {}),
511
+ ms: input.ms ?? 0,
561
512
  ...(input.extra ?? {}),
562
513
  };
563
514
  console.log(
@@ -614,44 +565,83 @@ async function postRuntimeApi<T>(
614
565
  // Routes through the in-process RUNTIME_API binding when present; otherwise
615
566
  // falls back to a public fetch against `${baseUrl}${path}`. Either path
616
567
  // hits the same handler with the same auth — only the transport changes.
617
- const res = await fetchRuntimeApi(baseUrl, '/api/v2/plays/internal/runtime', {
618
- method: 'POST',
619
- headers: {
620
- 'content-type': 'application/json',
621
- authorization: `Bearer ${executorToken}`,
622
- 'x-deepline-request-id': makeRequestId(),
623
- },
624
- body: JSON.stringify(body),
625
- });
626
- if (!res.ok) {
568
+ const serializedBody = JSON.stringify(body);
569
+ let lastError: unknown = null;
570
+ for (
571
+ let attempt = 0;
572
+ attempt <= RUNTIME_API_RETRY_DELAYS_MS.length;
573
+ attempt += 1
574
+ ) {
575
+ let res: Response;
576
+ try {
577
+ res = await fetchRuntimeApi(baseUrl, '/api/v2/plays/internal/runtime', {
578
+ method: 'POST',
579
+ headers: {
580
+ 'content-type': 'application/json',
581
+ authorization: `Bearer ${executorToken}`,
582
+ 'x-deepline-request-id': makeRequestId(),
583
+ },
584
+ body: serializedBody,
585
+ });
586
+ } catch (error) {
587
+ lastError = error;
588
+ if (
589
+ attempt >= RUNTIME_API_RETRY_DELAYS_MS.length ||
590
+ !isRetryableRuntimeApiError(error)
591
+ ) {
592
+ throw error;
593
+ }
594
+ await sleepRuntimeApiRetry(attempt);
595
+ continue;
596
+ }
597
+
598
+ if (res.ok) {
599
+ return (await res.json()) as T;
600
+ }
601
+
627
602
  const text = await res.text().catch(() => '');
628
- throw new Error(
629
- `runtime API ${res.status}: ${redactSecretsFromLogString(text.slice(0, 500))}`,
630
- );
603
+ const redacted = redactSecretsFromLogString(text.slice(0, 500));
604
+ lastError = new Error(`runtime API ${res.status}: ${redacted}`);
605
+ if (
606
+ attempt >= RUNTIME_API_RETRY_DELAYS_MS.length ||
607
+ !isRetryableRuntimeApiResponse(res.status, text)
608
+ ) {
609
+ throw lastError;
610
+ }
611
+ await sleepRuntimeApiRetry(attempt);
631
612
  }
632
- return (await res.json()) as T;
613
+ throw lastError instanceof Error ? lastError : new Error(String(lastError));
633
614
  }
634
615
 
635
- async function postDeeplineApi(
636
- req: RunRequest,
637
- path: string,
638
- body: unknown,
639
- ): Promise<void> {
640
- const res = await fetch(`${req.baseUrl.replace(/\/$/, '')}${path}`, {
641
- method: 'POST',
642
- headers: {
643
- 'content-type': 'application/json',
644
- authorization: `Bearer ${req.executorToken}`,
645
- 'x-deepline-request-id': makeRequestId(),
646
- },
647
- body: JSON.stringify(body),
648
- });
649
- if (!res.ok) {
650
- const text = await res.text().catch(() => '');
651
- throw new Error(
652
- `Deepline API ${path} ${res.status}: ${redactSecretsFromLogString(text.slice(0, 500))}`,
653
- );
616
/**
 * Decides whether an error thrown by a runtime API request should be retried.
 *
 * The decision is a case-insensitive text match on the error message (or on
 * the stringified value when it is not an `Error`) against timeout and
 * connection-failure markers.
 *
 * @param error - Value caught from the failed request.
 * @returns `true` when the text matches one of the retryable markers.
 */
function isRetryableRuntimeApiError(error: unknown): boolean {
  let description: string;
  if (error instanceof Error) {
    description = error.message;
  } else {
    description = String(error);
  }
  const transientMarkers =
    /timed out|timeout|fetch failed|ECONNRESET|ECONNREFUSED|UND_ERR_CONNECT_TIMEOUT/i;
  return transientMarkers.test(description);
}
622
+
623
/**
 * Decides whether a non-OK runtime API response should be retried.
 *
 * Statuses 408, 429, 502, 503 and 504 are always retryable. A 500 is
 * retryable only when its body text matches a timeout or connection-failure
 * marker (case-insensitive). Every other status is not retryable.
 *
 * @param status - HTTP status code of the response.
 * @param body - Response body text.
 * @returns `true` when the request may be retried.
 */
function isRetryableRuntimeApiResponse(status: number, body: string): boolean {
  switch (status) {
    case 408:
    case 429:
    case 502:
    case 503:
    case 504:
      return true;
    case 500:
      return /timeout exceeded when trying to connect|timed out|fetch failed|ECONNRESET|UND_ERR_CONNECT_TIMEOUT/i.test(
        body,
      );
    default:
      return false;
  }
}
640
+
641
/**
 * Waits for the retry delay configured for the given attempt.
 *
 * @param attempt - Zero-based attempt index into `RUNTIME_API_RETRY_DELAYS_MS`;
 *   an index with no configured delay waits 0 ms.
 */
async function sleepRuntimeApiRetry(attempt: number): Promise<void> {
  const delayMs = RUNTIME_API_RETRY_DELAYS_MS[attempt] ?? 0;
  await new Promise((wake) => {
    setTimeout(wake, delayMs);
  });
}
656
646
 
657
647
  function describeRuntimeApiBody(body: unknown): string {
@@ -973,6 +963,21 @@ async function executeTool(
973
963
  return callToolDirect(req, args);
974
964
  }
975
965
 
966
/**
 * Runs `executeTool` and reports the call to the lifecycle callbacks.
 *
 * `onToolCalled` fires before the tool is executed. `onToolFailed` fires when
 * execution throws, and the original error is then rethrown unchanged. Both
 * callbacks receive the tool id and the current timestamp.
 *
 * @param req - Run request forwarded to `executeTool`.
 * @param args - Tool call arguments forwarded to `executeTool`.
 * @param workflowStep - Optional workflow step forwarded to `executeTool`.
 * @param callbacks - Optional lifecycle callbacks; missing handlers are skipped.
 * @returns The result of `executeTool`.
 * @throws Whatever `executeTool` throws.
 */
async function executeToolWithLifecycle(
  req: RunRequest,
  args: { id: string; toolId: string; input: Record<string, unknown> },
  workflowStep: WorkflowStep | undefined,
  callbacks: WorkerCtxCallbacks | undefined,
): Promise<ToolExecuteResult> {
  callbacks?.onToolCalled?.(args.toolId, nowMs());

  let outcome: ToolExecuteResult;
  try {
    outcome = await executeTool(req, args, workflowStep);
  } catch (failure) {
    callbacks?.onToolFailed?.(args.toolId, nowMs());
    throw failure;
  }
  return outcome;
}
980
+
976
981
  function isToolExecuteRecord(value: unknown): value is Record<string, unknown> {
977
982
  return typeof value === 'object' && value !== null && !Array.isArray(value);
978
983
  }
@@ -1032,14 +1037,19 @@ async function waitForSyntheticIntegrationEvent(
1032
1037
  ? Math.max(1, Math.round(input.timeout_ms))
1033
1038
  : 30_000;
1034
1039
  await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
1035
- action: 'update_run_status',
1040
+ action: 'append_run_events',
1036
1041
  playId: req.runId,
1037
- status: 'running',
1038
- runtimeBackend: 'cf_workflows_dynamic_worker',
1039
- waitKind: 'integration_event_batch',
1040
- waitUntil: nowMs() + timeoutMs,
1041
- activeBoundaryId: `integration_event:${eventKey}`,
1042
- lastCheckpointAt: nowMs(),
1042
+ events: [
1043
+ {
1044
+ type: 'log.appended',
1045
+ runId: req.runId,
1046
+ source: 'worker',
1047
+ occurredAt: nowMs(),
1048
+ lines: [
1049
+ `Waiting for integration_event:${eventKey} for up to ${timeoutMs}ms.`,
1050
+ ],
1051
+ } satisfies PlayRunLedgerEvent,
1052
+ ],
1043
1053
  });
1044
1054
  try {
1045
1055
  const event = (await (
@@ -1751,6 +1761,7 @@ type WorkerMapChunkSummary<T extends Record<string, unknown>> = {
1751
1761
  outputDatasetId: string;
1752
1762
  hash: string;
1753
1763
  preview: T[];
1764
+ cachedRows?: T[];
1754
1765
  };
1755
1766
 
1756
1767
  function toWorkflowSerializableValue<T>(value: T): T {
@@ -1932,6 +1943,8 @@ async function executeWorkerWaterfall(
1932
1943
  toolNameOrSpec: string | WorkerInlineWaterfallSpec,
1933
1944
  input: Record<string, unknown>,
1934
1945
  opts?: WorkerWaterfallOptions,
1946
+ callbacks?: WorkerCtxCallbacks,
1947
+ workflowStep?: WorkflowStep,
1935
1948
  ): Promise<unknown | null> {
1936
1949
  // Inline-spec form
1937
1950
  if (typeof toolNameOrSpec === 'object' && toolNameOrSpec) {
@@ -1948,20 +1961,32 @@ async function executeWorkerWaterfall(
1948
1961
  toolId?: unknown,
1949
1962
  toolInput?: unknown,
1950
1963
  ) =>
1951
- await executeTool(
1964
+ await executeToolWithLifecycle(
1952
1965
  req,
1953
1966
  normalizeToolExecuteArgs(requestOrKey, toolId, toolInput),
1967
+ workflowStep,
1968
+ callbacks,
1954
1969
  ),
1955
1970
  },
1956
1971
  tool: async (key, toolId, toolInput) =>
1957
- await executeTool(req, { id: key, toolId, input: toolInput }),
1972
+ await executeToolWithLifecycle(
1973
+ req,
1974
+ { id: key, toolId, input: toolInput },
1975
+ workflowStep,
1976
+ callbacks,
1977
+ ),
1958
1978
  });
1959
1979
  } else {
1960
- result = await executeTool(req, {
1961
- id: step.id,
1962
- toolId: step.toolId,
1963
- input: step.mapInput(input),
1964
- });
1980
+ result = await executeToolWithLifecycle(
1981
+ req,
1982
+ {
1983
+ id: step.id,
1984
+ toolId: step.toolId,
1985
+ input: step.mapInput(input),
1986
+ },
1987
+ workflowStep,
1988
+ callbacks,
1989
+ );
1965
1990
  }
1966
1991
  } catch {
1967
1992
  continue;
@@ -2047,7 +2072,12 @@ async function executeWorkerWaterfall(
2047
2072
  const providers = opts?.providers ?? [];
2048
2073
  if (providers.length === 0) {
2049
2074
  try {
2050
- return await executeTool(req, { id: toolName, toolId: toolName, input });
2075
+ return await executeToolWithLifecycle(
2076
+ req,
2077
+ { id: toolName, toolId: toolName, input },
2078
+ workflowStep,
2079
+ callbacks,
2080
+ );
2051
2081
  } catch {
2052
2082
  return null;
2053
2083
  }
@@ -2055,11 +2085,16 @@ async function executeWorkerWaterfall(
2055
2085
  let lastError: Error | null = null;
2056
2086
  for (const provider of providers) {
2057
2087
  try {
2058
- const result = await executeTool(req, {
2059
- id: `${toolName}:${provider}`,
2060
- toolId: toolName,
2061
- input: { ...input, provider },
2062
- });
2088
+ const result = await executeToolWithLifecycle(
2089
+ req,
2090
+ {
2091
+ id: `${toolName}:${provider}`,
2092
+ toolId: toolName,
2093
+ input: { ...input, provider },
2094
+ },
2095
+ workflowStep,
2096
+ callbacks,
2097
+ );
2063
2098
  if (resultHasContent(result)) {
2064
2099
  recorder.push({
2065
2100
  waterfallId: toolName,
@@ -2114,6 +2149,118 @@ function makeCsvParserState(): CsvParserState {
2114
2149
  return { field: '', row: [], inQuotes: false, pendingCr: false };
2115
2150
  }
2116
2151
 
2152
/**
 * Validates a byte count.
 *
 * @param value - Candidate byte count of unknown type.
 * @returns The value when it is a non-negative safe integer, otherwise `null`.
 */
function normalizeExpectedBytes(value: unknown): number | null {
  if (typeof value !== 'number') {
    return null;
  }
  if (!Number.isSafeInteger(value)) {
    return null;
  }
  return value < 0 ? null : value;
}
2157
+
2158
/**
 * Reports whether two byte counts disagree.
 *
 * A mismatch is only reported when both counts are numbers; if either side is
 * `null` or `undefined` there is nothing to compare and the result is `false`.
 *
 * @param expectedBytes - Byte count the caller expects.
 * @param actualBytes - Byte count that was observed.
 * @returns `true` when both are numbers and they differ.
 */
function hasByteLengthMismatch(
  expectedBytes: number | null | undefined,
  actualBytes: number | null | undefined,
): boolean {
  if (typeof expectedBytes !== 'number' || typeof actualBytes !== 'number') {
    return false;
  }
  return expectedBytes !== actualBytes;
}
2168
+
2169
/**
 * Adapts a `ReadableStream` of bytes into an async generator of non-empty
 * chunks.
 *
 * Empty or missing chunks are skipped. When the stream is drained to its end
 * the reader lock is simply released. When iteration stops early — the
 * consumer breaks out of the loop, or a read fails — the stream is cancelled
 * before the lock is released, so the underlying source is not left open with
 * an unread body.
 *
 * @param body - Byte stream to read; it is locked for the duration of the
 *   iteration.
 * @returns An async generator yielding each non-empty chunk in order.
 * @throws Whatever `reader.read()` rejects with.
 */
async function* iterReadableStreamChunks(
  body: ReadableStream<Uint8Array>,
): AsyncGenerator<Uint8Array, void, void> {
  const reader = body.getReader();
  let drained = false;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        drained = true;
        return;
      }
      if (value && value.byteLength > 0) yield value;
    }
  } finally {
    try {
      if (!drained) {
        // Best-effort: a cancel failure must not mask the error (or the early
        // exit) that brought us here.
        await reader.cancel().catch(() => undefined);
      }
    } finally {
      reader.releaseLock();
    }
  }
}
2183
+
2184
/**
 * Wraps an in-memory byte array as an async iterable.
 *
 * Each iteration yields the array once, or nothing when the array is empty.
 *
 * @param bytes - Bytes to expose; the same array instance is yielded.
 * @returns An async iterable that can be iterated more than once.
 */
function singleByteChunk(bytes: Uint8Array): AsyncIterable<Uint8Array> {
  const emitOnce = async function* (): AsyncGenerator<Uint8Array, void, void> {
    if (bytes.byteLength === 0) {
      return;
    }
    yield bytes;
  };
  return { [Symbol.asyncIterator]: emitOnce };
}
2191
+
2192
/**
 * Passes byte chunks through while checking the total against an expected
 * size.
 *
 * The expected size is `input.expectedBytes` when valid, otherwise
 * `input.reportedBytes`. Empty chunks are dropped and counted. After the
 * source ends:
 * - if no data arrived but a positive size was expected, a
 *   `csv.open_empty_body` trace is recorded, then either the fallback source
 *   is streamed instead or, without a fallback, an error is thrown;
 * - if data arrived and the byte total differs from the expected size, a
 *   `csv.read_mismatch` trace is recorded and an error is thrown. The chunks
 *   have already been yielded by then, so the consumer sees the error only
 *   after the data.
 *
 * @param input - Source chunks plus the identifiers used in traces and error
 *   messages, the size expectations, and an optional fallback source.
 * @returns An async generator yielding the non-empty chunks.
 * @throws Error on an unexpectedly empty body (no fallback) or a size mismatch.
 */
async function* guardExpectedByteChunks(input: {
  req: RunRequest;
  logicalPath: string;
  storageKey: string;
  source: string;
  chunks: AsyncIterable<Uint8Array>;
  expectedBytes?: number | null;
  reportedBytes?: number | null;
  fallback?: () => AsyncIterable<Uint8Array>;
}): AsyncGenerator<Uint8Array, void, void> {
  const reportedBytes = normalizeExpectedBytes(input.reportedBytes);
  const expectedBytes =
    normalizeExpectedBytes(input.expectedBytes) ?? reportedBytes;

  let deliveredBytes = 0;
  let receivedData = false;
  let emptyChunkCount = 0;

  for await (const piece of input.chunks) {
    if (piece && piece.byteLength !== 0) {
      receivedData = true;
      deliveredBytes += piece.byteLength;
      yield piece;
    } else {
      emptyChunkCount += 1;
    }
  }

  if (!receivedData) {
    const bodyShouldHaveData =
      typeof expectedBytes === 'number' && expectedBytes > 0;
    if (!bodyShouldHaveData) {
      // An empty body is acceptable when nothing (or zero bytes) was expected.
      return;
    }
    recordRunnerPerfTrace({
      req: input.req,
      phase: 'csv.open_empty_body',
      extra: {
        source: input.source,
        logicalPath: input.logicalPath,
        expectedBytes,
        reportedBytes,
        skippedEmptyChunks: emptyChunkCount,
        storageKey: input.storageKey,
      },
    });
    if (!input.fallback) {
      throw new Error(
        `ctx.csv("${input.logicalPath}"): ${input.source} returned an empty body for ` +
          `${expectedBytes} byte staged file ${input.storageKey}.`,
      );
    }
    yield* input.fallback();
    return;
  }

  if (expectedBytes === null || deliveredBytes === expectedBytes) {
    return;
  }
  recordRunnerPerfTrace({
    req: input.req,
    phase: 'csv.read_mismatch',
    extra: {
      source: input.source,
      logicalPath: input.logicalPath,
      expectedBytes,
      actualBytes: deliveredBytes,
      storageKey: input.storageKey,
    },
  });
  throw new Error(
    `ctx.csv("${input.logicalPath}"): ${input.source} streamed ${deliveredBytes} bytes ` +
      `for ${expectedBytes} byte staged file ${input.storageKey}.`,
  );
}
2263
+
2117
2264
  /**
2118
2265
  * Push one buffered text chunk through the CSV state machine. Accumulates
2119
2266
  * fully-terminated rows into `out`; partial trailing field/row stays in
@@ -2182,11 +2329,10 @@ function flushCsvParser(state: CsvParserState, out: string[][]): void {
2182
2329
  * to every subsequent row. Stops cleanly on stream end and flushes any
2183
2330
  * trailing row.
2184
2331
  */
2185
- async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2186
- body: ReadableStream<Uint8Array>,
2332
+ async function* streamCsvRowsFromByteChunks<T extends Record<string, unknown>>(
2333
+ byteChunks: AsyncIterable<Uint8Array>,
2187
2334
  chunkSize: number,
2188
2335
  ): AsyncGenerator<T[], void, void> {
2189
- const reader = body.getReader();
2190
2336
  const decoder = new TextDecoder('utf-8');
2191
2337
  const state = makeCsvParserState();
2192
2338
  const physicalRowBuffer: string[][] = [];
@@ -2195,7 +2341,13 @@ async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2195
2341
 
2196
2342
  const flushPhysicalRowsAsObjects = (terminal: boolean): T[][] => {
2197
2343
  const yielded: T[][] = [];
2198
- if (physicalRowBuffer.length === 0) return yielded;
2344
+ if (physicalRowBuffer.length === 0) {
2345
+ if (terminal && pendingChunk.length > 0) {
2346
+ yielded.push(pendingChunk);
2347
+ pendingChunk = [];
2348
+ }
2349
+ return yielded;
2350
+ }
2199
2351
  if (!headers) {
2200
2352
  headers = physicalRowBuffer.shift() ?? null;
2201
2353
  if (!headers) return yielded;
@@ -2219,224 +2371,240 @@ async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2219
2371
  return yielded;
2220
2372
  };
2221
2373
 
2222
- try {
2223
- while (true) {
2224
- const { done, value } = await reader.read();
2225
- if (done) {
2226
- flushCsvParser(state, physicalRowBuffer);
2227
- for (const chunk of flushPhysicalRowsAsObjects(true)) yield chunk;
2228
- return;
2229
- }
2230
- const text = decoder.decode(value, { stream: true });
2231
- pushCsvTextIntoParser(state, text, physicalRowBuffer);
2232
- for (const chunk of flushPhysicalRowsAsObjects(false)) yield chunk;
2233
- }
2234
- } finally {
2235
- reader.releaseLock();
2374
+ for await (const value of byteChunks) {
2375
+ if (value.byteLength === 0) continue;
2376
+ const text = decoder.decode(value, { stream: true });
2377
+ pushCsvTextIntoParser(state, text, physicalRowBuffer);
2378
+ for (const chunk of flushPhysicalRowsAsObjects(false)) yield chunk;
2379
+ }
2380
+ const trailingText = decoder.decode();
2381
+ if (trailingText) {
2382
+ pushCsvTextIntoParser(state, trailingText, physicalRowBuffer);
2236
2383
  }
2384
+ flushCsvParser(state, physicalRowBuffer);
2385
+ for (const chunk of flushPhysicalRowsAsObjects(true)) yield chunk;
2237
2386
  void TARGET_CSV_DECODE_CHUNK_BYTES; // referenced for future tuning
2238
2387
  }
2239
2388
 
2389
/**
 * Streams a staged file from the harness as byte chunks, one ranged read at a
 * time.
 *
 * Each iteration of the returned iterable starts at offset 0 and requests
 * `TARGET_CSV_DECODE_CHUNK_BYTES` per read until the harness reports `done`.
 * Every response is validated before its bytes are yielded: the object size
 * must be a valid byte count, must not change between reads, and must match
 * `expectedBytes` when one was given; the returned offset must equal the
 * requested one; the chunk must be a `Uint8Array` whose length equals
 * `bytesRead`; and an empty chunk is only allowed on the final read. A
 * `csv.open` trace is recorded after the first response passes validation.
 * Once the loop ends, the total bytes seen must equal the expected size (or
 * the object size when no expectation was given); otherwise a
 * `csv.read_mismatch` trace is recorded and an error is thrown.
 *
 * @param input - Run request (for traces), logical path (for messages), the
 *   storage key to read, and an optional expected byte length.
 * @returns An async iterable of the file's non-empty chunks in order.
 * @throws Error when any of the validations above fails.
 */
function readHarnessStagedFileChunks(input: {
  req: RunRequest;
  logicalPath: string;
  storageKey: string;
  expectedBytes?: number | null;
}): AsyncIterable<Uint8Array> {
  const expectedBytes = normalizeExpectedBytes(input.expectedBytes);
  // Every error raised here shares the same ctx.csv prefix.
  const failure = (detail: string): Error =>
    new Error(`ctx.csv("${input.logicalPath}"): ${detail}`);

  async function* streamRanges(): AsyncGenerator<Uint8Array, void, void> {
    let cursor = 0;
    let totalBytes = 0;
    let knownObjectSize: number | null = null;
    let openTraced = false;

    for (;;) {
      const result = await harnessReadStagedFileChunk({
        storageKey: input.storageKey,
        offset: cursor,
        length: TARGET_CSV_DECODE_CHUNK_BYTES,
      });

      const reportedObjectSize = normalizeExpectedBytes(result.objectSize);
      if (reportedObjectSize === null) {
        throw failure(
          `harness returned an invalid object size for ${input.storageKey}.`,
        );
      }
      if (knownObjectSize !== null && knownObjectSize !== reportedObjectSize) {
        throw failure(
          'staged file size changed while streaming ' +
            `${input.storageKey}; started at ${knownObjectSize} bytes, now ${reportedObjectSize}.`,
        );
      }
      knownObjectSize = reportedObjectSize;
      if (hasByteLengthMismatch(expectedBytes, reportedObjectSize)) {
        throw failure(
          'harness staged file size mismatch for ' +
            `storageKey=${input.storageKey}; expected ${expectedBytes} bytes, got ${reportedObjectSize}.`,
        );
      }
      if (result.offset !== cursor) {
        throw failure(
          `harness returned offset ${result.offset} while ` +
            `reading offset ${cursor} from ${input.storageKey}.`,
        );
      }

      const chunk = result.chunk;
      if (!(chunk instanceof Uint8Array)) {
        throw failure(
          `harness returned a non-byte chunk for ${input.storageKey}.`,
        );
      }
      if (chunk.byteLength !== result.bytesRead) {
        throw failure(
          'harness chunk metadata mismatch for ' +
            `${input.storageKey}; bytesRead=${result.bytesRead}, chunk=${chunk.byteLength}.`,
        );
      }
      if (chunk.byteLength === 0 && !result.done) {
        throw failure(
          `harness returned an empty non-terminal chunk for ${input.storageKey}.`,
        );
      }

      if (!openTraced) {
        openTraced = true;
        recordRunnerPerfTrace({
          req: input.req,
          phase: 'csv.open',
          extra: {
            source: 'harness_rpc_range',
            logicalPath: input.logicalPath,
            expectedBytes,
            actualBytes: reportedObjectSize,
            chunkBytes: TARGET_CSV_DECODE_CHUNK_BYTES,
            storageKey: input.storageKey,
          },
        });
      }

      cursor += chunk.byteLength;
      totalBytes += chunk.byteLength;
      // Capture the terminal flag before handing control to the consumer.
      const finished = result.done;
      if (chunk.byteLength > 0) yield chunk;
      if (finished) break;
    }

    const requiredBytes = expectedBytes ?? knownObjectSize;
    if (typeof requiredBytes !== 'number' || totalBytes === requiredBytes) {
      return;
    }
    recordRunnerPerfTrace({
      req: input.req,
      phase: 'csv.read_mismatch',
      extra: {
        source: 'harness_rpc_range',
        logicalPath: input.logicalPath,
        expectedBytes: requiredBytes,
        actualBytes: totalBytes,
        storageKey: input.storageKey,
      },
    });
    throw failure(
      `harness streamed ${totalBytes} bytes ` +
        `for ${requiredBytes} byte staged file ${input.storageKey}.`,
    );
  }

  return { [Symbol.asyncIterator]: streamRanges };
}
2500
+
2240
2501
  /**
2241
- * R2 reader that returns a body stream. Per-play Workers loaded via
2502
+ * Dataset source adapter that returns byte chunks. Per-play Workers loaded via
2242
2503
  * WorkerLoader cannot accept a raw R2Bucket binding (CF Workflows refuses to
2243
2504
  * serialize R2Bucket through its workflow-state path), so per-play Workers
2244
2505
  * stream staged files through the long-lived harness Worker service binding.
2245
2506
  * Returns null only if the asset is genuinely missing (404).
2246
2507
  */
2247
- async function openR2BodyStream(input: {
2508
+ async function openFileByteChunks(input: {
2248
2509
  req: RunRequest;
2249
2510
  env: WorkerEnv;
2250
2511
  logicalPath: string;
2251
- storageKey: string;
2252
- }): Promise<ReadableStream<Uint8Array> | null> {
2512
+ file: WorkerFileRef;
2513
+ }): Promise<AsyncIterable<Uint8Array> | null> {
2514
+ const storageKey = input.file.storageKey;
2515
+ const expectedBytes = normalizeExpectedBytes(input.file.bytes);
2516
+ if (expectedBytes === null) {
2517
+ throw new Error(
2518
+ `ctx.csv("${input.logicalPath}"): staged dataset handle is missing a byte length for ${storageKey}.`,
2519
+ );
2520
+ }
2253
2521
  if (input.env.PLAYS_BUCKET) {
2254
- const object = await input.env.PLAYS_BUCKET.get(input.storageKey);
2522
+ const object = await input.env.PLAYS_BUCKET.get(storageKey);
2255
2523
  if (object) {
2256
- return object.body;
2524
+ if (hasByteLengthMismatch(expectedBytes, object.size)) {
2525
+ recordRunnerPerfTrace({
2526
+ req: input.req,
2527
+ phase: 'csv.open_mismatch',
2528
+ extra: {
2529
+ source: 'direct_r2',
2530
+ logicalPath: input.logicalPath,
2531
+ expectedBytes,
2532
+ actualBytes: object.size,
2533
+ storageKey,
2534
+ },
2535
+ });
2536
+ await object.body.cancel().catch(() => undefined);
2537
+ } else {
2538
+ recordRunnerPerfTrace({
2539
+ req: input.req,
2540
+ phase: 'csv.open',
2541
+ extra: {
2542
+ source: 'direct_r2',
2543
+ logicalPath: input.logicalPath,
2544
+ expectedBytes,
2545
+ actualBytes: object.size,
2546
+ storageKey,
2547
+ },
2548
+ });
2549
+ return guardExpectedByteChunks({
2550
+ req: input.req,
2551
+ logicalPath: input.logicalPath,
2552
+ storageKey,
2553
+ source: 'direct_r2',
2554
+ chunks: iterReadableStreamChunks(object.body),
2555
+ expectedBytes,
2556
+ reportedBytes: object.size,
2557
+ fallback: () =>
2558
+ readHarnessStagedFileChunks({
2559
+ req: input.req,
2560
+ logicalPath: input.logicalPath,
2561
+ storageKey,
2562
+ expectedBytes,
2563
+ }),
2564
+ });
2565
+ }
2257
2566
  }
2258
2567
  }
2259
2568
  if (input.env.PLAY_ASSETS) {
2260
2569
  try {
2261
2570
  const text = await input.env.PLAY_ASSETS.readText(input.logicalPath);
2262
2571
  const bytes = new TextEncoder().encode(text);
2263
- return new ReadableStream<Uint8Array>({
2264
- start(controller) {
2265
- controller.enqueue(bytes);
2266
- controller.close();
2572
+ if (hasByteLengthMismatch(expectedBytes, bytes.byteLength)) {
2573
+ throw new Error(
2574
+ `ctx.csv("${input.logicalPath}"): packaged asset size mismatch for ` +
2575
+ `storageKey=${storageKey}; expected ${expectedBytes} bytes, got ${bytes.byteLength}.`,
2576
+ );
2577
+ }
2578
+ recordRunnerPerfTrace({
2579
+ req: input.req,
2580
+ phase: 'csv.open',
2581
+ extra: {
2582
+ source: 'play_assets',
2583
+ logicalPath: input.logicalPath,
2584
+ expectedBytes,
2585
+ actualBytes: bytes.byteLength,
2586
+ storageKey,
2267
2587
  },
2268
2588
  });
2589
+ return singleByteChunk(bytes);
2269
2590
  } catch (error) {
2270
2591
  if (!/missing from R2|not found|No such object/i.test(String(error))) {
2271
- throw error;
2272
- }
2273
- }
2274
- }
2275
-
2276
- // The harness fetch path returns a real Response body backed by R2.
2277
- // Errors are loud: we want CI / regression failures to surface the real
2278
- // cause (auth, missing object, network) rather than getting squashed into a
2279
- // generic "R2 asset is not reachable".
2280
- const response = await harnessFetchStagedFile({
2281
- executorToken: input.req.executorToken,
2282
- storageKey: input.storageKey,
2283
- });
2284
- if (response.status === 404) {
2285
- throw new Error(
2286
- `ctx.csv("${input.logicalPath}"): harness R2 fetch returned 404 for storageKey=${input.storageKey}. ` +
2287
- `The staged file is missing from R2; the upload either failed silently before the run started, ` +
2288
- `or the storageKey threaded through the workflow params no longer matches what the harness resolves.`,
2289
- );
2290
- }
2291
- if (!response.ok || !response.body) {
2292
- const body = await response.text().catch(() => '');
2293
- throw new Error(
2294
- `ctx.csv("${input.logicalPath}"): harness R2 fetch failed ${response.status}: ${body.slice(0, 400)}`,
2295
- );
2296
- }
2297
- return response.body;
2298
- }
2299
-
2300
- /**
2301
- * Streaming CSV dataset. Backed by R2 (or a signed URL when PLAYS_BUCKET
2302
- * isn't bound). Looks like a length-0 array to plays that pass it straight
2303
- * to `ctx.map`; ctx.map detects the streaming surface via `iterChunks` and
2304
- * uses it instead of `slice()`. Plays that try to access rows synchronously
2305
- * (`csv[0]`, `csv.length`) are intentionally given an empty array — they
2306
- * must use ctx.map (the supported surface), call `materialize()` (bounded),
2307
- * or iterate via `for await (const row of csv)`.
2308
- */
2309
- type StreamingCsvDataset<T extends Record<string, unknown>> = T[] & {
2310
- count(): Promise<number>;
2311
- peek(limit?: number): Promise<T[]>;
2312
- materialize(limit?: number): Promise<T[]>;
2313
- iterChunks(chunkSize: number): AsyncIterable<T[]>;
2314
- toJSON(): unknown;
2315
- datasetId: string;
2316
- tableNamespace: string;
2317
- __deeplineDatasetKind: 'csv';
2318
- /** Marker so `ctx.map` can detect this is streaming-only and switch path. */
2319
- __deeplineStreamingDataset: true;
2320
- };
2321
-
2322
- const MAX_MATERIALIZE_ROWS_DEFAULT = 50_000;
2323
-
2324
- function makeStreamingCsvDataset<T extends Record<string, unknown>>(input: {
2325
- name: string;
2326
- logicalPath: string;
2327
- renameOptions?: CsvRenameOptions;
2328
- open: () => Promise<ReadableStream<Uint8Array> | null>;
2329
- }): StreamingCsvDataset<T> {
2330
- const datasetId = `csv:${input.name}`;
2331
- const arr = [] as T[] as StreamingCsvDataset<T>;
2332
- let cachedCount: number | null = null;
2333
-
2334
- async function* doStream(chunkSize: number): AsyncGenerator<T[], void, void> {
2335
- const body = await input.open();
2336
- if (!body) {
2337
- throw new Error(
2338
- `ctx.csv("${input.logicalPath}"): R2 asset is not reachable (no PLAYS_BUCKET binding and signed URL unavailable).`,
2339
- );
2340
- }
2341
- for await (const chunk of streamCsvRowsFromBody<T>(
2342
- body,
2343
- Math.max(1, Math.floor(chunkSize)),
2344
- )) {
2345
- yield applyCsvRenameProjection(chunk, input.renameOptions) as T[];
2592
+ throw error;
2593
+ }
2346
2594
  }
2347
2595
  }
2348
2596
 
2349
- Object.defineProperty(arr, 'iterChunks', {
2350
- value: (chunkSize: number) => ({
2351
- [Symbol.asyncIterator]: () => doStream(chunkSize),
2352
- }),
2353
- enumerable: false,
2354
- });
2355
- Object.defineProperty(arr, Symbol.asyncIterator, {
2356
- value: async function* () {
2357
- for await (const chunk of doStream(1_000)) {
2358
- for (const row of chunk) yield row;
2359
- }
2360
- },
2361
- enumerable: false,
2362
- });
2363
- Object.defineProperty(arr, 'count', {
2364
- value: async () => {
2365
- if (cachedCount !== null) return cachedCount;
2366
- let total = 0;
2367
- for await (const chunk of doStream(5_000)) total += chunk.length;
2368
- cachedCount = total;
2369
- return total;
2370
- },
2371
- enumerable: false,
2372
- });
2373
- Object.defineProperty(arr, 'peek', {
2374
- value: async (limit = 10) => {
2375
- const out: T[] = [];
2376
- for await (const chunk of doStream(Math.max(1, limit))) {
2377
- for (const row of chunk) {
2378
- out.push(row);
2379
- if (out.length >= limit) return out;
2380
- }
2381
- }
2382
- return out;
2383
- },
2384
- enumerable: false,
2385
- });
2386
- Object.defineProperty(arr, 'materialize', {
2387
- value: async (limit?: number) => {
2388
- const cap = limit ?? MAX_MATERIALIZE_ROWS_DEFAULT;
2389
- const out: T[] = [];
2390
- for await (const chunk of doStream(5_000)) {
2391
- for (const row of chunk) {
2392
- if (out.length >= cap) {
2393
- return out;
2394
- }
2395
- out.push(row);
2396
- }
2397
- }
2398
- return out;
2399
- },
2400
- enumerable: false,
2401
- });
2402
- Object.defineProperty(arr, 'datasetId', {
2403
- value: datasetId,
2404
- enumerable: true,
2405
- });
2406
- Object.defineProperty(arr, 'tableNamespace', {
2407
- value: input.name,
2408
- enumerable: true,
2409
- });
2410
- Object.defineProperty(arr, '__deeplineStreamingDataset', {
2411
- value: true,
2412
- enumerable: false,
2597
+ // Dynamic Workers cannot receive a raw R2Bucket binding, and both previous
2598
+ // fallbacks were different data planes: service-binding fetch bodies could
2599
+ // arrive empty across WorkerLoader isolates, while app-signed URLs pointed at
2600
+ // the app namespace instead of the preview harness namespace. The harness owns
2601
+ // staged R2 now, so the only fallback is typed bounded range RPC.
2602
+ return readHarnessStagedFileChunks({
2603
+ req: input.req,
2604
+ logicalPath: input.logicalPath,
2605
+ storageKey,
2606
+ expectedBytes,
2413
2607
  });
2414
- Object.defineProperty(arr, '__deeplineDatasetKind', {
2415
- value: 'csv',
2416
- enumerable: false,
2417
- });
2418
- Object.defineProperty(arr, 'toJSON', {
2419
- value: () => ({
2420
- kind: 'dataset' as const,
2421
- datasetKind: 'csv',
2422
- datasetId,
2423
- count: cachedCount,
2424
- streaming: true,
2425
- tableNamespace: input.name,
2426
- }),
2427
- enumerable: false,
2428
- });
2429
- return arr;
2430
- }
2431
-
2432
- function isStreamingDataset<T extends Record<string, unknown>>(
2433
- value: unknown,
2434
- ): value is StreamingCsvDataset<T> {
2435
- return (
2436
- Array.isArray(value) &&
2437
- (value as { __deeplineStreamingDataset?: unknown })
2438
- .__deeplineStreamingDataset === true
2439
- );
2440
2608
  }
2441
2609
 
2442
2610
  /**
@@ -2492,6 +2660,60 @@ function requireSheetContract(
2492
2660
  return contract;
2493
2661
  }
2494
2662
 
2663
+ function isDatasetPayloadField(field: string): boolean {
2664
+ return (
2665
+ field.length > 0 &&
2666
+ !field.startsWith('__deepline') &&
2667
+ field !== '_key' &&
2668
+ field !== '_status' &&
2669
+ field !== '_run_id' &&
2670
+ field !== '_error' &&
2671
+ field !== '_stage' &&
2672
+ field !== '_provider' &&
2673
+ field !== '_input_index' &&
2674
+ field !== '_created_at' &&
2675
+ field !== '_updated_at' &&
2676
+ field !== '_cell_meta'
2677
+ );
2678
+ }
2679
+
2680
+ function augmentSheetContractWithDatasetFields(input: {
2681
+ contract: PlaySheetContract;
2682
+ rows: readonly Record<string, unknown>[];
2683
+ outputFields?: readonly string[];
2684
+ }): PlaySheetContract {
2685
+ const outputFields = new Set(input.outputFields ?? []);
2686
+ const existingFields = new Set(
2687
+ input.contract.columns.flatMap((column) =>
2688
+ typeof column.field === 'string' ? [column.field] : [],
2689
+ ),
2690
+ );
2691
+ const existingSqlNames = new Set(
2692
+ input.contract.columns.map((column) => column.sqlName),
2693
+ );
2694
+ const columns = [...input.contract.columns];
2695
+ for (const row of input.rows) {
2696
+ for (const field of Object.keys(row)) {
2697
+ if (!isDatasetPayloadField(field) || existingFields.has(field)) {
2698
+ continue;
2699
+ }
2700
+ const sqlName = sqlSafePlayColumnName(field);
2701
+ if (existingSqlNames.has(sqlName)) {
2702
+ continue;
2703
+ }
2704
+ existingFields.add(field);
2705
+ existingSqlNames.add(sqlName);
2706
+ columns.push({
2707
+ id: `runtime:${input.contract.tableNamespace}:${field}`,
2708
+ sqlName,
2709
+ source: outputFields.has(field) ? 'mapField' : 'input',
2710
+ field,
2711
+ });
2712
+ }
2713
+ }
2714
+ return { ...input.contract, columns };
2715
+ }
2716
+
2495
2717
  async function persistCompletedMapRows(input: {
2496
2718
  req: RunRequest;
2497
2719
  tableNamespace: string;
@@ -2500,19 +2722,24 @@ async function persistCompletedMapRows(input: {
2500
2722
  extraOutputFields?: string[];
2501
2723
  }): Promise<void> {
2502
2724
  if (input.rows.length === 0) return;
2725
+ const outputFields = [
2726
+ ...input.outputFields,
2727
+ ...(input.extraOutputFields ?? []).filter(
2728
+ (field) => !input.outputFields.includes(field),
2729
+ ),
2730
+ ];
2503
2731
  await harnessPersistCompletedSheetRows({
2504
2732
  baseUrl: input.req.baseUrl,
2505
2733
  executorToken: input.req.executorToken,
2506
2734
  playName: input.req.playName,
2507
2735
  tableNamespace: input.tableNamespace,
2508
- sheetContract: requireSheetContract(input.req, input.tableNamespace),
2736
+ sheetContract: augmentSheetContractWithDatasetFields({
2737
+ contract: requireSheetContract(input.req, input.tableNamespace),
2738
+ rows: input.rows,
2739
+ outputFields,
2740
+ }),
2509
2741
  rows: input.rows,
2510
- outputFields: [
2511
- ...input.outputFields,
2512
- ...(input.extraOutputFields ?? []).filter(
2513
- (field) => !input.outputFields.includes(field),
2514
- ),
2515
- ],
2742
+ outputFields,
2516
2743
  runId: input.req.runId,
2517
2744
  userEmail: input.req.userEmail,
2518
2745
  preloadedDbSessions: input.req.preloadedDbSessions ?? null,
@@ -2537,12 +2764,37 @@ async function prepareMapRows(input: {
2537
2764
  executorToken: input.req.executorToken,
2538
2765
  playName: input.req.playName,
2539
2766
  tableNamespace: input.tableNamespace,
2540
- sheetContract: requireSheetContract(input.req, input.tableNamespace),
2767
+ sheetContract: augmentSheetContractWithDatasetFields({
2768
+ contract: requireSheetContract(input.req, input.tableNamespace),
2769
+ rows: input.rows,
2770
+ }),
2541
2771
  rows: input.rows.map((row) => ({ ...row })),
2542
2772
  runId: input.req.runId,
2543
2773
  userEmail: input.req.userEmail,
2544
2774
  preloadedDbSessions: input.req.preloadedDbSessions ?? null,
2545
2775
  });
2776
+ for (const timing of result.timings ?? []) {
2777
+ const phase =
2778
+ typeof timing.phase === 'string' && timing.phase.trim()
2779
+ ? timing.phase.trim()
2780
+ : 'unknown';
2781
+ const ms =
2782
+ typeof timing.ms === 'number' && Number.isFinite(timing.ms)
2783
+ ? timing.ms
2784
+ : 0;
2785
+ const { phase: _phase, ms: _ms, ...extra } = timing;
2786
+ void _phase;
2787
+ void _ms;
2788
+ recordRunnerPerfTrace({
2789
+ req: input.req,
2790
+ phase: `sheet_start.${phase}`,
2791
+ ms,
2792
+ extra: {
2793
+ tableNamespace: input.tableNamespace,
2794
+ ...extra,
2795
+ },
2796
+ });
2797
+ }
2546
2798
  return {
2547
2799
  inserted: result.inserted,
2548
2800
  skipped: result.skipped,
@@ -2700,7 +2952,7 @@ function createMinimalWorkerCtx(
2700
2952
  const callDepth = rootGovernance?.callDepth ?? 0;
2701
2953
  const runMap = async <T extends Record<string, unknown>>(
2702
2954
  name: string,
2703
- rows: T[],
2955
+ rows: WorkerDatasetInput<T>,
2704
2956
  fieldsDef: Record<
2705
2957
  string,
2706
2958
  | unknown
@@ -2715,7 +2967,8 @@ function createMinimalWorkerCtx(
2715
2967
  ): Promise<unknown> => {
2716
2968
  const mapStartedAt = nowMs();
2717
2969
  const mapNodeId = `map:${name}`;
2718
- const sliced = rows;
2970
+ const inputRows = rows;
2971
+ const rowCountHint = datasetRowCountHint(inputRows);
2719
2972
  const baseOffset = 0;
2720
2973
  const fieldEntries = Object.entries(fieldsDef);
2721
2974
  const plan = req.executionPlan;
@@ -2723,12 +2976,8 @@ function createMinimalWorkerCtx(
2723
2976
  (candidate) =>
2724
2977
  candidate.mapName === name || candidate.tableNamespace === name,
2725
2978
  );
2726
- const streaming = isStreamingDataset<T>(sliced);
2727
- // For streaming inputs we don't know the row count upfront — pass
2728
- // `totalRows: 0` so chooseMapChunkSize falls back to the preferred /
2729
- // default chunk size rather than trying to budget against an unknown.
2730
2979
  const rowsPerChunk = chooseMapChunkSize({
2731
- totalRows: streaming ? 0 : sliced.length,
2980
+ totalRows: rowCountHint,
2732
2981
  mapCount: Math.max(1, plan?.maps.length ?? 1),
2733
2982
  stepsPerChunk: planMap?.stepsPerChunk ?? 1,
2734
2983
  preferredChunkSize: planMap?.defaultChunkSize,
@@ -2750,14 +2999,12 @@ function createMinimalWorkerCtx(
2750
2999
  typeof total === 'number' && Number.isFinite(total) && total > 0
2751
3000
  ? `${completed.toLocaleString()} / ${total.toLocaleString()} rows processed`
2752
3001
  : `${completed.toLocaleString()} rows processed`;
3002
+ callbacks?.onMapStarted?.(mapNodeId, mapStartedAt);
2753
3003
  updateMapProgress({
2754
3004
  completed: 0,
2755
- total: streaming ? undefined : sliced.length,
3005
+ total: rowCountHint ?? undefined,
2756
3006
  startedAt: mapStartedAt,
2757
- message: formatMapProgressMessage(
2758
- 0,
2759
- streaming ? undefined : sliced.length,
2760
- ),
3007
+ message: formatMapProgressMessage(0, rowCountHint ?? undefined),
2761
3008
  });
2762
3009
  const explicitRowKeysSeen =
2763
3010
  opts?.key === undefined ? null : new Map<string, number>();
@@ -2983,6 +3230,7 @@ function createMinimalWorkerCtx(
2983
3230
  input?: unknown,
2984
3231
  _opts?: { description?: string },
2985
3232
  ): Promise<unknown> => {
3233
+ void _opts;
2986
3234
  assertNotAborted(abortSignal);
2987
3235
  const request = normalizeToolExecuteArgs(
2988
3236
  requestOrKey,
@@ -3008,6 +3256,8 @@ function createMinimalWorkerCtx(
3008
3256
  toolNameOrSpec,
3009
3257
  waterfallInput,
3010
3258
  waterfallOpts,
3259
+ callbacks,
3260
+ workflowStep,
3011
3261
  ),
3012
3262
  };
3013
3263
  for (const [key, value] of fieldEntries) {
@@ -3219,10 +3469,16 @@ function createMinimalWorkerCtx(
3219
3469
  outputDatasetId: `map:${name}`,
3220
3470
  hash,
3221
3471
  preview: toWorkflowSerializableValue(out.slice(0, 5)),
3472
+ cachedRows:
3473
+ out.length <= WORKER_DATASET_IN_MEMORY_ROWS
3474
+ ? toWorkflowSerializableValue(out)
3475
+ : undefined,
3222
3476
  };
3223
3477
  };
3224
3478
 
3225
- const out: Array<T & Record<string, unknown>> = [];
3479
+ const previewRows: Array<T & Record<string, unknown>> = [];
3480
+ const cachedRows: Array<T & Record<string, unknown>> = [];
3481
+ let canCacheRows = true;
3226
3482
  let totalRowsExecuted = 0;
3227
3483
  let totalRowsCached = 0;
3228
3484
  let totalRowsDuplicateReused = 0;
@@ -3260,6 +3516,7 @@ function createMinimalWorkerCtx(
3260
3516
  `(${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
3261
3517
  `inserted=${totalRowsInserted} skipped=${totalRowsSkipped}`;
3262
3518
  const completedAt = nowMs();
3519
+ callbacks?.onMapCompleted?.(mapNodeId, completedAt);
3263
3520
  updateMapProgress({
3264
3521
  completed: totalRowsWritten,
3265
3522
  total: totalRowsWritten,
@@ -3273,9 +3530,29 @@ function createMinimalWorkerCtx(
3273
3530
  message: cacheSummary,
3274
3531
  ts: nowMs(),
3275
3532
  });
3276
- return makeWorkerDataset(name, out, {
3533
+ return createPersistedDatasetHandle({
3534
+ playName: req.playName,
3535
+ name,
3277
3536
  count: totalRowsWritten,
3278
- cacheSummary,
3537
+ previewRows,
3538
+ cachedRows: canCacheRows ? cachedRows : null,
3539
+ readRows: async ({ limit, offset }) => {
3540
+ const result = await harnessReadSheetDatasetRows({
3541
+ baseUrl: req.baseUrl,
3542
+ executorToken: req.executorToken,
3543
+ playName: req.playName,
3544
+ tableNamespace: name,
3545
+ runId: req.runId,
3546
+ limit,
3547
+ offset,
3548
+ userEmail: req.userEmail,
3549
+ preloadedDbSessions: req.preloadedDbSessions ?? null,
3550
+ });
3551
+ return result.rows as Array<T & Record<string, unknown>>;
3552
+ },
3553
+ trace: (phase, ms, extra) =>
3554
+ recordRunnerPerfTrace({ req, phase, ms, extra }),
3555
+ nowMs,
3279
3556
  workProgress: {
3280
3557
  total: totalRowsWritten,
3281
3558
  executed: totalRowsExecuted,
@@ -3290,110 +3567,61 @@ function createMinimalWorkerCtx(
3290
3567
  });
3291
3568
  };
3292
3569
 
3293
- if (streaming) {
3294
- let totalRowsWritten = 0;
3295
- let chunkIndex = 0;
3296
- let chunkStart = 0;
3297
- const streamingDataset = sliced as unknown as StreamingCsvDataset<T>;
3298
- for await (const chunkRows of streamingDataset.iterChunks(rowsPerChunk)) {
3299
- assertNotAborted(abortSignal);
3300
- if (chunkRows.length === 0) continue;
3301
- assertUniqueExplicitRowKeys(chunkRows, chunkStart);
3302
- const chunkResult = await runChunkStep(
3303
- chunkRows,
3304
- chunkStart,
3305
- chunkIndex,
3570
+ let totalRowsWritten = 0;
3571
+ let chunkIndex = 0;
3572
+ let chunkStart = 0;
3573
+ for await (const chunkRows of iterDatasetChunks(inputRows, rowsPerChunk)) {
3574
+ assertNotAborted(abortSignal);
3575
+ if (chunkRows.length === 0) continue;
3576
+ assertUniqueExplicitRowKeys(chunkRows, chunkStart);
3577
+ const chunkResult = await runChunkStep(chunkRows, chunkStart, chunkIndex);
3578
+ totalRowsWritten += chunkResult.rowsWritten;
3579
+ totalRowsExecuted += chunkResult.rowsExecuted;
3580
+ totalRowsCached += chunkResult.rowsCached;
3581
+ totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3582
+ totalRowsInserted += chunkResult.rowsInserted;
3583
+ totalRowsSkipped += chunkResult.rowsSkipped;
3584
+ updateMapProgress({
3585
+ completed: totalRowsWritten,
3586
+ total: rowCountHint ?? undefined,
3587
+ message: formatMapProgressMessage(
3588
+ totalRowsWritten,
3589
+ rowCountHint ?? undefined,
3590
+ ),
3591
+ });
3592
+ if (previewRows.length < WORKER_DATASET_PREVIEW_ROWS) {
3593
+ previewRows.push(
3594
+ ...chunkResult.preview.slice(
3595
+ 0,
3596
+ WORKER_DATASET_PREVIEW_ROWS - previewRows.length,
3597
+ ),
3306
3598
  );
3307
- totalRowsWritten += chunkResult.rowsWritten;
3308
- totalRowsExecuted += chunkResult.rowsExecuted;
3309
- totalRowsCached += chunkResult.rowsCached;
3310
- totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3311
- totalRowsInserted += chunkResult.rowsInserted;
3312
- totalRowsSkipped += chunkResult.rowsSkipped;
3313
- updateMapProgress({
3314
- completed: totalRowsWritten,
3315
- message: formatMapProgressMessage(totalRowsWritten),
3316
- });
3317
- if (out.length < 10) {
3318
- out.push(...chunkResult.preview.slice(0, 10 - out.length));
3319
- }
3320
- chunkStart += chunkRows.length;
3321
- chunkIndex += 1;
3322
3599
  }
3323
- const dataset = finalize(totalRowsWritten);
3324
- recordRunnerPerfTrace({
3325
- req,
3326
- phase: 'runner.map.total',
3327
- ms: nowMs() - mapStartedAt,
3328
- extra: {
3329
- mapName: name,
3330
- rowsWritten: totalRowsWritten,
3331
- streaming: true,
3332
- },
3333
- });
3334
- return dataset;
3335
- }
3336
-
3337
- if (workflowStep && sliced.length > rowsPerChunk) {
3338
- let totalRowsWritten = 0;
3339
- for (let start = 0; start < sliced.length; start += rowsPerChunk) {
3340
- assertNotAborted(abortSignal);
3341
- const end = Math.min(sliced.length, start + rowsPerChunk);
3342
- const chunkRows = sliced.slice(start, end);
3343
- const chunkIndex = Math.floor(start / rowsPerChunk);
3344
- assertUniqueExplicitRowKeys(chunkRows, start);
3345
- const chunkResult = await runChunkStep(chunkRows, start, chunkIndex);
3346
- totalRowsWritten += chunkResult.rowsWritten;
3347
- totalRowsExecuted += chunkResult.rowsExecuted;
3348
- totalRowsCached += chunkResult.rowsCached;
3349
- totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3350
- totalRowsInserted += chunkResult.rowsInserted;
3351
- totalRowsSkipped += chunkResult.rowsSkipped;
3352
- updateMapProgress({
3353
- completed: totalRowsWritten,
3354
- total: sliced.length,
3355
- message: formatMapProgressMessage(totalRowsWritten, sliced.length),
3356
- });
3357
- if (out.length < 10) {
3358
- out.push(...chunkResult.preview.slice(0, 10 - out.length));
3600
+ if (canCacheRows) {
3601
+ const nextRows = chunkResult.cachedRows ?? [];
3602
+ if (
3603
+ nextRows.length === chunkResult.rowsWritten &&
3604
+ cachedRows.length + nextRows.length <= WORKER_DATASET_IN_MEMORY_ROWS
3605
+ ) {
3606
+ cachedRows.push(...nextRows);
3607
+ } else {
3608
+ cachedRows.length = 0;
3609
+ canCacheRows = false;
3359
3610
  }
3360
3611
  }
3361
- const dataset = finalize(totalRowsWritten);
3362
- recordRunnerPerfTrace({
3363
- req,
3364
- phase: 'runner.map.total',
3365
- ms: nowMs() - mapStartedAt,
3366
- extra: {
3367
- mapName: name,
3368
- rowsWritten: totalRowsWritten,
3369
- streaming: false,
3370
- },
3371
- });
3372
- return dataset;
3612
+ chunkStart += chunkRows.length;
3613
+ chunkIndex += 1;
3373
3614
  }
3374
-
3375
- assertUniqueExplicitRowKeys(sliced, 0);
3376
- const chunkResult = await runChunkStep(sliced, 0, 0);
3377
- totalRowsExecuted = chunkResult.rowsExecuted;
3378
- totalRowsCached = chunkResult.rowsCached;
3379
- totalRowsDuplicateReused = chunkResult.rowsDuplicateReused;
3380
- totalRowsInserted = chunkResult.rowsInserted;
3381
- totalRowsSkipped = chunkResult.rowsSkipped;
3382
- out.push(...chunkResult.preview);
3383
- updateMapProgress({
3384
- completed: chunkResult.rowsWritten,
3385
- total: sliced.length,
3386
- message: formatMapProgressMessage(chunkResult.rowsWritten, sliced.length),
3387
- });
3388
- const dataset = finalize(chunkResult.rowsWritten);
3615
+ const dataset = finalize(totalRowsWritten);
3389
3616
  recordRunnerPerfTrace({
3390
3617
  req,
3391
3618
  phase: 'runner.map.total',
3392
3619
  ms: nowMs() - mapStartedAt,
3393
3620
  extra: {
3394
3621
  mapName: name,
3395
- rowsWritten: chunkResult.rowsWritten,
3396
- streaming: false,
3622
+ rowsWritten: totalRowsWritten,
3623
+ inputKind: rowCountHint === null ? 'streaming' : 'known_count',
3624
+ chunks: chunkIndex,
3397
3625
  },
3398
3626
  });
3399
3627
  return dataset;
@@ -3407,7 +3635,7 @@ function createMinimalWorkerCtx(
3407
3635
 
3408
3636
  constructor(
3409
3637
  private readonly name: string,
3410
- private readonly rows: T[],
3638
+ private readonly rows: WorkerDatasetInput<T>,
3411
3639
  ) {}
3412
3640
 
3413
3641
  step(name: string, resolver: WorkerStepProgramStep['resolver']): this {
@@ -3482,18 +3710,13 @@ function createMinimalWorkerCtx(
3482
3710
  async csv<T extends Record<string, unknown> = Record<string, unknown>>(
3483
3711
  arg: unknown,
3484
3712
  options?: CsvRenameOptions,
3485
- ): Promise<T[]> {
3713
+ ): Promise<WorkerDatasetHandle<T>> {
3486
3714
  const csvStartedAt = nowMs();
3487
3715
  if (Array.isArray(arg)) {
3488
- // Inline rows passed at call site — already in memory, keep the
3489
- // legacy array-backed dataset shape.
3490
- const dataset = makeWorkerDataset(
3491
- 'csv',
3492
- applyCsvRenameProjection(arg as T[], options),
3493
- {
3494
- datasetKind: 'csv',
3495
- },
3496
- ) as unknown as T[];
3716
+ const dataset = createInlineDatasetHandle(
3717
+ applyCsvRenameProjection(arg as T[], options) as T[],
3718
+ { name: 'csv', kind: 'csv' },
3719
+ );
3497
3720
  recordRunnerPerfTrace({
3498
3721
  req,
3499
3722
  phase: 'runner.csv',
@@ -3504,15 +3727,10 @@ function createMinimalWorkerCtx(
3504
3727
  }
3505
3728
  const filename = String(arg ?? '');
3506
3729
  if (req.inlineCsv && filename === req.inlineCsv.name) {
3507
- // Inline CSV pre-staged by the dispatcher (small files <1 MiB). Already
3508
- // in memory; no streaming needed.
3509
- const dataset = makeWorkerDataset(
3510
- 'csv',
3511
- applyCsvRenameProjection(req.inlineCsv.rows as T[], options),
3512
- {
3513
- datasetKind: 'csv',
3514
- },
3515
- ) as unknown as T[];
3730
+ const dataset = createInlineDatasetHandle(
3731
+ applyCsvRenameProjection(req.inlineCsv.rows as T[], options) as T[],
3732
+ { name: filename, kind: 'csv' },
3733
+ );
3516
3734
  recordRunnerPerfTrace({
3517
3735
  req,
3518
3736
  phase: 'runner.csv',
@@ -3521,52 +3739,73 @@ function createMinimalWorkerCtx(
3521
3739
  });
3522
3740
  return dataset;
3523
3741
  }
3524
- // Resolution order: explicit inputR2Keys (runtime input) → packaged
3742
+ // Resolution order: explicit inputFiles (runtime input) → packaged
3525
3743
  // files (relative-path imports bundled with the play artifact).
3526
- let r2Key = req.inputR2Keys?.[filename];
3527
- if (!r2Key && req.packagedFiles) {
3744
+ let file = req.inputFiles?.[filename] ?? null;
3745
+ if (!file && req.packagedFiles) {
3528
3746
  const matchByPath = req.packagedFiles.find(
3529
3747
  (f) =>
3530
3748
  f.playPath === filename ||
3531
3749
  f.playPath === filename.replace(/^\.\//, ''),
3532
3750
  );
3533
- if (matchByPath) r2Key = matchByPath.storageKey;
3751
+ if (matchByPath) {
3752
+ file = {
3753
+ logicalPath: matchByPath.playPath,
3754
+ fileName:
3755
+ matchByPath.playPath.split('/').pop() ?? matchByPath.playPath,
3756
+ storageKey: matchByPath.storageKey,
3757
+ contentType: matchByPath.contentType,
3758
+ bytes: matchByPath.bytes,
3759
+ };
3760
+ }
3534
3761
  }
3535
- if (!r2Key) {
3762
+ if (!file?.storageKey) {
3536
3763
  throw new Error(
3537
3764
  `ctx.csv("${filename}"): no inline rows or R2 asset binding registered. ` +
3538
- 'Pass inline rows, or upload to R2 and register packagedFiles/inputR2Keys in the run config.',
3765
+ 'Pass inline rows, or upload to R2 and register packagedFiles/inputFiles in the run config.',
3766
+ );
3767
+ }
3768
+ const selectedFile = file;
3769
+ const expectedBytes = normalizeExpectedBytes(selectedFile.bytes);
3770
+ if (expectedBytes === null) {
3771
+ throw new Error(
3772
+ `ctx.csv("${filename}"): staged dataset handle is missing a byte length for ` +
3773
+ `${selectedFile.storageKey}. Re-stage the file with bytes metadata.`,
3539
3774
  );
3540
3775
  }
3541
- // Streaming path: returns a length-0 dataset shell whose iterChunks()
3542
- // pulls 1 MiB-ish text chunks from R2 and yields parsed row chunks.
3543
- // ctx.map detects the streaming surface via __deeplineStreamingDataset
3544
- // and switches its chunked execution loop to consume iterChunks
3545
- // directly, so 2M-row CSVs never get fully materialized in memory.
3546
- const storageKey = r2Key;
3547
- const dataset = makeStreamingCsvDataset<T>({
3776
+ const dataset = createCsvDatasetHandle<T>({
3548
3777
  name: filename,
3549
3778
  logicalPath: filename,
3779
+ expectedBytes,
3550
3780
  renameOptions: options,
3781
+ nowMs,
3782
+ streamRows: streamCsvRowsFromByteChunks,
3783
+ trace: (phase, ms, extra) =>
3784
+ recordRunnerPerfTrace({ req, phase, ms, extra }),
3551
3785
  open: () =>
3552
- openR2BodyStream({
3786
+ openFileByteChunks({
3553
3787
  req,
3554
3788
  env,
3555
3789
  logicalPath: filename,
3556
- storageKey,
3790
+ file: selectedFile,
3557
3791
  }),
3558
- }) as unknown as T[];
3792
+ });
3559
3793
  recordRunnerPerfTrace({
3560
3794
  req,
3561
3795
  phase: 'runner.csv',
3562
3796
  ms: nowMs() - csvStartedAt,
3563
- extra: { mode: 'streaming_r2', filename },
3797
+ extra: {
3798
+ mode: 'streaming_file',
3799
+ filename,
3800
+ expectedBytes,
3801
+ storageKey: selectedFile.storageKey,
3802
+ },
3564
3803
  });
3565
3804
  return dataset;
3566
3805
  },
3567
3806
  map<T extends Record<string, unknown>>(
3568
3807
  name: string,
3569
- rows: T[],
3808
+ rows: WorkerDatasetInput<T>,
3570
3809
  fieldsDef?:
3571
3810
  | Record<
3572
3811
  string,
@@ -3600,7 +3839,12 @@ function createMinimalWorkerCtx(
3600
3839
  input: Record<string, unknown>,
3601
3840
  ): Promise<unknown> => {
3602
3841
  assertNotAborted(abortSignal);
3603
- return executeTool(req, { id: key, toolId, input }, workflowStep);
3842
+ return executeToolWithLifecycle(
3843
+ req,
3844
+ { id: key, toolId, input },
3845
+ workflowStep,
3846
+ callbacks,
3847
+ );
3604
3848
  },
3605
3849
  tools: {
3606
3850
  async execute(
@@ -3609,11 +3853,13 @@ function createMinimalWorkerCtx(
3609
3853
  input?: unknown,
3610
3854
  _opts?: { description?: string },
3611
3855
  ): Promise<unknown> {
3856
+ void _opts;
3612
3857
  assertNotAborted(abortSignal);
3613
- return executeTool(
3858
+ return executeToolWithLifecycle(
3614
3859
  req,
3615
3860
  normalizeToolExecuteArgs(requestOrKey, toolId, input),
3616
3861
  workflowStep,
3862
+ callbacks,
3617
3863
  );
3618
3864
  },
3619
3865
  },
@@ -3640,7 +3886,15 @@ function createMinimalWorkerCtx(
3640
3886
  input: Record<string, unknown>,
3641
3887
  opts?: WorkerWaterfallOptions,
3642
3888
  ): Promise<unknown | null> {
3643
- return executeWorkerWaterfall(req, [], toolNameOrSpec, input, opts);
3889
+ return executeWorkerWaterfall(
3890
+ req,
3891
+ [],
3892
+ toolNameOrSpec,
3893
+ input,
3894
+ opts,
3895
+ callbacks,
3896
+ workflowStep,
3897
+ );
3644
3898
  },
3645
3899
  async sleep(ms: number): Promise<void> {
3646
3900
  assertNotAborted(abortSignal);
@@ -3993,17 +4247,10 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
3993
4247
  });
3994
4248
  }
3995
4249
 
3996
- /** Cap on `liveLogs` retained both in-memory and persisted to Convex. */
3997
- const LIVE_LOG_BUFFER_LIMIT = 500;
3998
- /** Min wall-clock interval between live-log flushes during a run. */
3999
- const LIVE_LOG_FLUSH_INTERVAL_MS = 500;
4000
- /**
4001
- * Initial flush delay for live logs. Short plays should not pay an extra
4002
- * non-terminal Convex write just to show a transient "running" log state; the
4003
- * terminal status carries the full log buffer. Longer plays still flush early
4004
- * enough for the dashboard to feel alive.
4005
- */
4006
- const LIVE_LOG_FIRST_FLUSH_DELAY_MS = 30_000;
4250
+ /** Cap on run log lines retained in the terminal output compatibility shape. */
4251
+ const RUN_LOG_BUFFER_LIMIT = 500;
4252
+ /** Min wall-clock interval between live run-ledger flushes during a run. */
4253
+ const RUN_LEDGER_FLUSH_INTERVAL_MS = 500;
4007
4254
 
4008
4255
  async function executeRunRequest(
4009
4256
  req: RunRequest,
@@ -4033,99 +4280,240 @@ async function executeRunRequest(
4033
4280
  });
4034
4281
  const abortController = options?.abortController ?? new AbortController();
4035
4282
  const abortSignal = abortController.signal;
4036
- const postgresPrewarmStartedAt = nowMs();
4037
- await harnessPrewarmPostgresSessions({
4038
- executorToken: req.executorToken,
4039
- sessions: req.preloadedDbSessions ?? [],
4040
- });
4041
- recordRunnerPerfTrace({
4042
- req,
4043
- phase: 'runner.prewarm_postgres',
4044
- ms: nowMs() - postgresPrewarmStartedAt,
4045
- extra: {
4046
- sessions: req.preloadedDbSessions?.length ?? 0,
4283
+ let runLogBuffer: string[] = [];
4284
+ let pendingRunLogLines: string[] = [];
4285
+ let stepProgressByNodeId: LiveNodeProgressMap = {};
4286
+ let dirtyProgressNodeIds = new Set<string>();
4287
+ let pendingLedgerEvents: PlayRunLedgerEvent[] = [
4288
+ {
4289
+ type: 'run.started',
4290
+ runId: req.runId,
4291
+ playName: req.playName,
4292
+ source: 'worker',
4293
+ occurredAt: startedAt,
4294
+ runtimeBackend: 'cf_workflows_dynamic_worker',
4047
4295
  },
4048
- });
4049
- // Maintain a rolling buffer of log lines emitted during the run. This is
4050
- // what the play-page UI consumes via Convex polling + diffPlayRunStreamEvents
4051
- // → play.run.log SSE events. Without periodic flushing, the play page only
4052
- // sees the final terminal status with no intermediate logs/progress.
4053
- let liveLogs: string[] = [];
4054
- let liveLogsDirty = false;
4055
- let liveNodeProgress: LiveNodeProgressMap = {};
4056
- let lastLiveLogFlushAt =
4057
- nowMs() - LIVE_LOG_FLUSH_INTERVAL_MS + LIVE_LOG_FIRST_FLUSH_DELAY_MS;
4058
- let liveLogFlushInFlight: Promise<void> = Promise.resolve();
4059
- const appendLiveLog = (line: string) => {
4296
+ ];
4297
+ let lastLedgerFlushAt = 0;
4298
+ let ledgerFlushInFlight: Promise<void> = Promise.resolve();
4299
+
4300
+ const appendRunLogLine = (line: string) => {
4060
4301
  const trimmed = redactSecretsFromLogString(line.trim());
4061
4302
  if (!trimmed) return;
4062
- liveLogs = [...liveLogs, trimmed].slice(-LIVE_LOG_BUFFER_LIMIT);
4063
- liveLogsDirty = true;
4303
+ runLogBuffer = [...runLogBuffer, trimmed].slice(-RUN_LOG_BUFFER_LIMIT);
4304
+ pendingRunLogLines = [...pendingRunLogLines, trimmed].slice(
4305
+ -RUN_LOG_BUFFER_LIMIT,
4306
+ );
4064
4307
  };
4065
- const updateLiveNodeProgress = (input: {
4308
+
4309
+ const updateStepProgress = (input: {
4066
4310
  nodeId: string;
4067
4311
  progress: LiveNodeProgressSnapshot;
4068
4312
  }) => {
4069
4313
  const nodeId = input.nodeId.trim();
4070
4314
  if (!nodeId) return;
4071
- liveNodeProgress = {
4072
- ...liveNodeProgress,
4315
+ stepProgressByNodeId = {
4316
+ ...stepProgressByNodeId,
4073
4317
  [nodeId]: {
4074
- ...(liveNodeProgress[nodeId] ?? {}),
4318
+ ...(stepProgressByNodeId[nodeId] ?? {}),
4075
4319
  ...input.progress,
4076
4320
  },
4077
4321
  };
4322
+ dirtyProgressNodeIds.add(nodeId);
4323
+ };
4324
+
4325
+ const stepProgressSnapshot = () => ({ ...stepProgressByNodeId });
4326
+
4327
+ const appendStepLifecycleEvent = (event: PlayStepLifecycleEvent) => {
4328
+ updateStepProgress({
4329
+ nodeId: event.nodeId,
4330
+ progress: {
4331
+ ...(event.transition === 'started'
4332
+ ? { startedAt: event.at }
4333
+ : { completedAt: event.at }),
4334
+ updatedAt: event.at,
4335
+ },
4336
+ });
4337
+ pendingLedgerEvents = [
4338
+ ...pendingLedgerEvents,
4339
+ {
4340
+ type:
4341
+ event.transition === 'started'
4342
+ ? 'step.started'
4343
+ : event.transition === 'failed'
4344
+ ? 'step.failed'
4345
+ : 'step.completed',
4346
+ runId: req.runId,
4347
+ source: 'worker',
4348
+ occurredAt: event.at,
4349
+ stepId: event.nodeId,
4350
+ kind: event.type,
4351
+ },
4352
+ ];
4353
+ flushLedgerEvents(false);
4354
+ };
4355
+
4356
+ const drainPendingLedgerEvents = (
4357
+ occurredAt: number,
4358
+ ): PlayRunLedgerEvent[] => {
4359
+ const events = pendingLedgerEvents;
4360
+ pendingLedgerEvents = [];
4361
+
4362
+ if (pendingRunLogLines.length > 0) {
4363
+ events.push({
4364
+ type: 'log.appended',
4365
+ runId: req.runId,
4366
+ source: 'worker',
4367
+ occurredAt,
4368
+ lines: pendingRunLogLines,
4369
+ });
4370
+ pendingRunLogLines = [];
4371
+ }
4372
+
4373
+ if (dirtyProgressNodeIds.size > 0) {
4374
+ for (const nodeId of dirtyProgressNodeIds) {
4375
+ const progress = stepProgressByNodeId[nodeId];
4376
+ if (!progress) continue;
4377
+ const normalizedProgress: PlayRunLedgerStepProgress = {
4378
+ ...(typeof progress.completed === 'number'
4379
+ ? { completed: progress.completed }
4380
+ : {}),
4381
+ ...(typeof progress.total === 'number'
4382
+ ? { total: progress.total }
4383
+ : {}),
4384
+ ...(typeof progress.failed === 'number'
4385
+ ? { failed: progress.failed }
4386
+ : {}),
4387
+ ...(typeof progress.message === 'string' && progress.message
4388
+ ? { message: progress.message }
4389
+ : {}),
4390
+ ...(typeof progress.artifactTableNamespace === 'string' ||
4391
+ progress.artifactTableNamespace === null
4392
+ ? { artifactTableNamespace: progress.artifactTableNamespace }
4393
+ : {}),
4394
+ updatedAt:
4395
+ typeof progress.updatedAt === 'number'
4396
+ ? progress.updatedAt
4397
+ : occurredAt,
4398
+ };
4399
+ const status: PlayRunLedgerStepStatus =
4400
+ typeof progress.completedAt === 'number' ? 'completed' : 'running';
4401
+ events.push({
4402
+ type: 'step.progress',
4403
+ runId: req.runId,
4404
+ source: 'worker',
4405
+ occurredAt:
4406
+ typeof progress.updatedAt === 'number'
4407
+ ? progress.updatedAt
4408
+ : occurredAt,
4409
+ stepId: nodeId,
4410
+ status,
4411
+ progress: normalizedProgress,
4412
+ });
4413
+ }
4414
+ dirtyProgressNodeIds = new Set<string>();
4415
+ }
4416
+
4417
+ return events;
4078
4418
  };
4079
- const liveNodeProgressSnapshot = () => ({ ...liveNodeProgress });
4080
- const flushLiveLogs = (force: boolean): void => {
4419
+
4420
+ const flushLedgerEvents = (force: boolean): void => {
4081
4421
  if (!options?.persistResultDatasets) return;
4082
- if (!liveLogsDirty && !force) return;
4083
4422
  const now = nowMs();
4084
- if (!force && now - lastLiveLogFlushAt < LIVE_LOG_FLUSH_INTERVAL_MS) return;
4085
- lastLiveLogFlushAt = now;
4086
- liveLogsDirty = false;
4087
- const snapshot = [...liveLogs];
4088
- liveLogFlushInFlight = liveLogFlushInFlight
4423
+ if (!force && now - lastLedgerFlushAt < RUN_LEDGER_FLUSH_INTERVAL_MS) {
4424
+ return;
4425
+ }
4426
+ const events = drainPendingLedgerEvents(now);
4427
+ if (events.length === 0) return;
4428
+ lastLedgerFlushAt = now;
4429
+ ledgerFlushInFlight = ledgerFlushInFlight
4089
4430
  .catch(() => undefined)
4090
4431
  .then(async () => {
4091
4432
  try {
4092
4433
  await postRuntimeApi(req.baseUrl, req.executorToken, {
4093
- action: 'update_run_status',
4434
+ action: 'append_run_events',
4094
4435
  playId: req.runId,
4095
- status: 'running',
4096
- runtimeBackend: 'cf_workflows_dynamic_worker',
4097
- liveLogs: snapshot,
4098
- liveNodeProgress: liveNodeProgressSnapshot(),
4099
- lastCheckpointAt: now,
4436
+ events,
4100
4437
  });
4101
4438
  } catch {
4102
- // Best-effort; the terminal update still carries the final logs.
4439
+ pendingLedgerEvents = [...events, ...pendingLedgerEvents];
4440
+ throw new Error('runtime run-ledger append failed');
4103
4441
  }
4442
+ })
4443
+ .catch(() => undefined);
4444
+ };
4445
+
4446
+ const flushTerminalLedgerEvents = async (
4447
+ terminalEvent: PlayRunLedgerEvent,
4448
+ ): Promise<void> => {
4449
+ if (!options?.persistResultDatasets) return;
4450
+ await ledgerFlushInFlight.catch(() => undefined);
4451
+ const now = nowMs();
4452
+ pendingRunLogLines = runLogBuffer;
4453
+ dirtyProgressNodeIds = new Set([
4454
+ ...dirtyProgressNodeIds,
4455
+ ...Object.keys(stepProgressByNodeId),
4456
+ ]);
4457
+ pendingLedgerEvents = [...pendingLedgerEvents, terminalEvent];
4458
+ const events = drainPendingLedgerEvents(now);
4459
+ if (events.length === 0) return;
4460
+ try {
4461
+ await postRuntimeApi(req.baseUrl, req.executorToken, {
4462
+ action: 'append_run_events',
4463
+ playId: req.runId,
4464
+ events,
4104
4465
  });
4466
+ } catch (error) {
4467
+ pendingLedgerEvents = [...events, ...pendingLedgerEvents];
4468
+ throw error;
4469
+ }
4470
+ };
4471
+
4472
+ const orderedNodes = buildOrderedNodeList(req.contractSnapshot);
4473
+ const stepLifecycle =
4474
+ orderedNodes.length > 0
4475
+ ? new PlayStepLifecycleTracker(
4476
+ orderedNodes,
4477
+ () => stepProgressByNodeId,
4478
+ appendStepLifecycleEvent,
4479
+ nowMs,
4480
+ )
4481
+ : null;
4482
+ const workerCallbacks: WorkerCtxCallbacks = {
4483
+ onNodeProgress: (input) => {
4484
+ updateStepProgress(input);
4485
+ flushLedgerEvents(false);
4486
+ },
4487
+ onMapStarted: (nodeId, at) => stepLifecycle?.onMapStarted(nodeId, at),
4488
+ onMapCompleted: (nodeId, at) => stepLifecycle?.onMapCompleted(nodeId, at),
4489
+ onToolCalled: (toolId, at) => stepLifecycle?.onToolCalled(toolId, at),
4490
+ onToolFailed: (toolId, at) => stepLifecycle?.onToolFailed(toolId, at),
4105
4491
  };
4106
4492
 
4107
4493
  const wrappedEmit = (event: RunnerEvent) => {
4108
4494
  if (event.type === 'log') {
4109
- appendLiveLog(event.message);
4110
- flushLiveLogs(false);
4495
+ appendRunLogLine(event.message);
4496
+ flushLedgerEvents(false);
4111
4497
  } else if (event.type === 'error') {
4112
4498
  // Sanitize the inbound message before it enters the live-log buffer.
4113
4499
  // The downstream `emit` still receives the raw event so the console /
4114
4500
  // NDJSON stream can keep its full debugging fidelity.
4115
4501
  const sanitizedMessage = redactSecretsFromLogString(event.message);
4116
- appendLiveLog(`[error] ${sanitizedMessage}`);
4117
- flushLiveLogs(true);
4502
+ appendRunLogLine(`[error] ${sanitizedMessage}`);
4503
+ flushLedgerEvents(true);
4118
4504
  }
4119
4505
  emit(event);
4120
4506
  };
4121
4507
 
4508
+ stepLifecycle?.markPreMapStepsStarted(startedAt);
4509
+ flushLedgerEvents(false);
4122
4510
  const ctx = createMinimalWorkerCtx(
4123
4511
  req,
4124
4512
  wrappedEmit,
4125
4513
  env,
4126
4514
  workflowStep,
4127
4515
  abortSignal,
4128
- { onNodeProgress: updateLiveNodeProgress },
4516
+ workerCallbacks,
4129
4517
  );
4130
4518
  try {
4131
4519
  const playStartedAt = nowMs();
@@ -4140,6 +4528,7 @@ async function executeRunRequest(
4140
4528
  phase: 'runner.play_function',
4141
4529
  ms: nowMs() - playStartedAt,
4142
4530
  });
4531
+ stepLifecycle?.markAllTerminal(nowMs());
4143
4532
  const serializeStartedAt = nowMs();
4144
4533
  const serializedResult = serializePlayReturnValue(result);
4145
4534
  recordRunnerPerfTrace({
@@ -4148,53 +4537,74 @@ async function executeRunRequest(
4148
4537
  ms: nowMs() - serializeStartedAt,
4149
4538
  });
4150
4539
  if (options?.persistResultDatasets) {
4151
- const persistStartedAt = nowMs();
4152
- await liveLogFlushInFlight.catch(() => undefined);
4540
+ const ledgerFlushWaitStartedAt = nowMs();
4541
+ await ledgerFlushInFlight.catch(() => undefined);
4153
4542
  recordRunnerPerfTrace({
4154
4543
  req,
4155
- phase: 'runner.live_log_flush_wait',
4156
- ms: nowMs() - persistStartedAt,
4544
+ phase: 'runner.run_ledger_flush_wait',
4545
+ ms: nowMs() - ledgerFlushWaitStartedAt,
4157
4546
  });
4158
4547
  const resultDatasetStartedAt = nowMs();
4159
- await persistResultDatasets(req, serializedResult);
4548
+ await persistResultDatasets(req, result, serializedResult);
4160
4549
  recordRunnerPerfTrace({
4161
4550
  req,
4162
4551
  phase: 'runner.persist_result_datasets',
4163
4552
  ms: nowMs() - resultDatasetStartedAt,
4164
4553
  });
4165
4554
  const terminalResult = trimResultForStatus(serializedResult);
4166
- const terminalUpdateStartedAt = nowMs();
4167
- await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
4168
- action: 'update_run_status',
4169
- playId: req.runId,
4170
- status: 'completed',
4171
- error: null,
4172
- result: terminalResult,
4173
- runtimeBackend: 'cf_workflows_dynamic_worker',
4174
- waitKind: null,
4175
- waitUntil: null,
4176
- activeBoundaryId: null,
4177
- liveLogs,
4178
- liveNodeProgress: liveNodeProgressSnapshot(),
4179
- lastCheckpointAt: nowMs(),
4180
- });
4181
- recordRunnerPerfTrace({
4182
- req,
4183
- phase: 'runner.terminal_status_update',
4184
- ms: nowMs() - terminalUpdateStartedAt,
4555
+ const terminalOccurredAt = nowMs();
4556
+ const terminalLedgerPromise = (async () => {
4557
+ const terminalUpdateStartedAt = nowMs();
4558
+ await flushTerminalLedgerEvents({
4559
+ type: 'run.completed',
4560
+ runId: req.runId,
4561
+ source: 'worker',
4562
+ occurredAt: terminalOccurredAt,
4563
+ result: terminalResult,
4564
+ });
4565
+ recordRunnerPerfTrace({
4566
+ req,
4567
+ phase: 'runner.terminal_ledger_append',
4568
+ ms: nowMs() - terminalUpdateStartedAt,
4569
+ });
4570
+ })().catch((error) => {
4571
+ console.error(
4572
+ `[play-harness] non-fatal terminal ledger append failed runId=${req.runId}: ${
4573
+ error instanceof Error ? error.message : String(error)
4574
+ }`,
4575
+ );
4185
4576
  });
4186
4577
 
4578
+ await terminalLedgerPromise;
4579
+
4187
4580
  const billingStartedAt = nowMs();
4188
- await finalizeWorkerComputeBilling({
4581
+ const billingPromise = finalizeWorkerComputeBilling({
4189
4582
  req,
4190
4583
  success: true,
4191
4584
  actionEstimate: 4,
4585
+ }).then(() => {
4586
+ recordRunnerPerfTrace({
4587
+ req,
4588
+ phase: 'runner.compute_billing_finalize',
4589
+ ms: nowMs() - billingStartedAt,
4590
+ });
4192
4591
  });
4193
- recordRunnerPerfTrace({
4194
- req,
4195
- phase: 'runner.compute_billing_finalize',
4196
- ms: nowMs() - billingStartedAt,
4197
- });
4592
+ if (extractMaxCreditsPerRun(req.contractSnapshot) !== null) {
4593
+ await billingPromise;
4594
+ } else {
4595
+ const nonBlockingBillingPromise = billingPromise.catch((error) => {
4596
+ console.error(
4597
+ `[play-harness] non-fatal compute billing finalize failed runId=${req.runId}: ${
4598
+ error instanceof Error ? error.message : String(error)
4599
+ }`,
4600
+ );
4601
+ });
4602
+ if (options?.waitUntil) {
4603
+ options.waitUntil(nonBlockingBillingPromise);
4604
+ } else {
4605
+ await nonBlockingBillingPromise;
4606
+ }
4607
+ }
4198
4608
  }
4199
4609
  const parentSignalStartedAt = nowMs();
4200
4610
  await signalParentPlayTerminal({
@@ -4222,11 +4632,12 @@ async function executeRunRequest(
4222
4632
  playName: req.playName,
4223
4633
  result: serializedResult,
4224
4634
  outputRows: inferOutputRows(serializedResult),
4225
- liveLogs,
4226
- liveNodeProgress: liveNodeProgressSnapshot(),
4635
+ liveLogs: runLogBuffer,
4636
+ liveNodeProgress: stepProgressSnapshot(),
4227
4637
  durationMs: nowMs() - startedAt,
4228
4638
  };
4229
4639
  } catch (error) {
4640
+ stepLifecycle?.markStartedFailed(nowMs());
4230
4641
  const aborted = isAbortLikeError(error);
4231
4642
  if (aborted) {
4232
4643
  // Flip the controller so any concurrent user code observes the abort
@@ -4237,19 +4648,15 @@ async function executeRunRequest(
4237
4648
  }
4238
4649
  const message = error instanceof Error ? error.message : String(error);
4239
4650
  if (options?.persistResultDatasets) {
4240
- await liveLogFlushInFlight.catch(() => undefined);
4241
- await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
4242
- action: 'update_run_status',
4243
- playId: req.runId,
4244
- status: aborted ? 'cancelled' : 'failed',
4651
+ appendRunLogLine(
4652
+ `${aborted ? '[cancelled]' : '[error]'} ${redactSecretsFromLogString(message)}`,
4653
+ );
4654
+ await flushTerminalLedgerEvents({
4655
+ type: aborted ? 'run.cancelled' : 'run.failed',
4656
+ runId: req.runId,
4657
+ source: 'worker',
4658
+ occurredAt: nowMs(),
4245
4659
  error: message,
4246
- runtimeBackend: 'cf_workflows_dynamic_worker',
4247
- waitKind: null,
4248
- waitUntil: null,
4249
- activeBoundaryId: null,
4250
- liveLogs,
4251
- liveNodeProgress: liveNodeProgressSnapshot(),
4252
- lastCheckpointAt: nowMs(),
4253
4660
  });
4254
4661
  await finalizeWorkerComputeBilling({
4255
4662
  req,
@@ -4338,6 +4745,12 @@ function runRequestFromWorkflowParams(
4338
4745
  ): RunRequest {
4339
4746
  const inputFile = isRecord(params.inputFile) ? params.inputFile : null;
4340
4747
  const fileName = String(inputFile?.name ?? inputFile?.path ?? 'input.csv');
4748
+ const inputStorageKey =
4749
+ typeof inputFile?.r2Key === 'string'
4750
+ ? inputFile.r2Key
4751
+ : typeof inputFile?.storageKey === 'string'
4752
+ ? inputFile.storageKey
4753
+ : null;
4341
4754
  return {
4342
4755
  runId: String(params.runId ?? ''),
4343
4756
  callbackUrl: String(params.baseUrl ?? ''),
@@ -4350,14 +4763,30 @@ function runRequestFromWorkflowParams(
4350
4763
  ? (params.input as Record<string, unknown>)
4351
4764
  : {},
4352
4765
  inlineCsv: isInlineCsv(params.inlineCsv) ? params.inlineCsv : null,
4353
- inputR2Keys:
4354
- inputFile && typeof inputFile.r2Key === 'string'
4355
- ? { [fileName]: inputFile.r2Key }
4766
+ inputFiles:
4767
+ inputFile && inputStorageKey
4768
+ ? {
4769
+ [fileName]: {
4770
+ logicalPath: String(
4771
+ inputFile.logicalPath ?? inputFile.path ?? fileName,
4772
+ ),
4773
+ fileName,
4774
+ storageKey: inputStorageKey,
4775
+ contentType:
4776
+ typeof inputFile.contentType === 'string'
4777
+ ? inputFile.contentType
4778
+ : null,
4779
+ bytes: normalizeExpectedBytes(inputFile.bytes),
4780
+ },
4781
+ }
4356
4782
  : null,
4357
4783
  packagedFiles: Array.isArray(params.packagedFiles)
4358
4784
  ? params.packagedFiles.filter(isRecord).map((file) => ({
4359
4785
  playPath: String(file.playPath ?? ''),
4360
4786
  storageKey: String(file.storageKey ?? ''),
4787
+ contentType:
4788
+ typeof file.contentType === 'string' ? file.contentType : null,
4789
+ bytes: normalizeExpectedBytes(file.bytes),
4361
4790
  }))
4362
4791
  : null,
4363
4792
  partitionRange: null,
@@ -4425,11 +4854,39 @@ function isPlayCallGovernanceSnapshot(
4425
4854
  async function persistResultDatasets(
4426
4855
  req: RunRequest,
4427
4856
  result: unknown,
4857
+ serializedResult: unknown,
4428
4858
  ): Promise<void> {
4429
- const datasets = collectDatasetEnvelopes(result);
4859
+ const persistedNamespaces = new Set<string>();
4860
+ for (const dataset of collectDatasetHandles(result)) {
4861
+ if (dataset.datasetKind === 'map') continue;
4862
+ let inputOffset = 0;
4863
+ for await (const chunk of iterDatasetChunks(
4864
+ dataset.handle,
4865
+ RESULT_DATASET_PERSIST_CHUNK_ROWS,
4866
+ )) {
4867
+ if (chunk.length === 0) continue;
4868
+ await harnessStartSheetDataset({
4869
+ baseUrl: req.baseUrl,
4870
+ executorToken: req.executorToken,
4871
+ playName: req.playName,
4872
+ tableNamespace: dataset.tableNamespace,
4873
+ sheetContract: requireSheetContract(req, dataset.tableNamespace),
4874
+ rows: chunk.map((row) => ({ ...row })),
4875
+ runId: req.runId,
4876
+ inputOffset,
4877
+ userEmail: req.userEmail,
4878
+ preloadedDbSessions: req.preloadedDbSessions ?? null,
4879
+ });
4880
+ inputOffset += chunk.length;
4881
+ }
4882
+ persistedNamespaces.add(dataset.tableNamespace);
4883
+ }
4884
+
4885
+ const datasets = collectDatasetEnvelopes(serializedResult);
4430
4886
  for (const dataset of datasets) {
4431
4887
  if (dataset.datasetKind === 'map') continue;
4432
4888
  if (dataset.rows.length === 0) continue;
4889
+ if (persistedNamespaces.has(dataset.tableNamespace)) continue;
4433
4890
  await harnessStartSheetDataset({
4434
4891
  baseUrl: req.baseUrl,
4435
4892
  executorToken: req.executorToken,
@@ -4438,12 +4895,63 @@ async function persistResultDatasets(
4438
4895
  sheetContract: requireSheetContract(req, dataset.tableNamespace),
4439
4896
  rows: dataset.rows,
4440
4897
  runId: req.runId,
4898
+ inputOffset: 0,
4441
4899
  userEmail: req.userEmail,
4442
4900
  preloadedDbSessions: req.preloadedDbSessions ?? null,
4443
4901
  });
4444
4902
  }
4445
4903
  }
4446
4904
 
4905
+ const RESULT_DATASET_PERSIST_CHUNK_ROWS = 5_000;
4906
+
4907
+ function collectDatasetHandles(value: unknown): Array<{
4908
+ tableNamespace: string;
4909
+ datasetKind: 'csv' | 'map' | null;
4910
+ handle: WorkerDatasetHandle<Record<string, unknown>>;
4911
+ }> {
4912
+ const datasets: Array<{
4913
+ tableNamespace: string;
4914
+ datasetKind: 'csv' | 'map' | null;
4915
+ handle: WorkerDatasetHandle<Record<string, unknown>>;
4916
+ }> = [];
4917
+ const seen = new WeakSet<object>();
4918
+ const walk = (candidate: unknown, depth: number) => {
4919
+ if (depth > 12 || candidate == null) return;
4920
+ if (isDatasetHandle(candidate)) {
4921
+ const metadata = candidate.toJSON() as Record<string, unknown>;
4922
+ const tableNamespace =
4923
+ typeof metadata.tableNamespace === 'string'
4924
+ ? metadata.tableNamespace
4925
+ : null;
4926
+ const datasetKind =
4927
+ metadata.datasetKind === 'csv' || metadata.datasetKind === 'map'
4928
+ ? metadata.datasetKind
4929
+ : null;
4930
+ if (tableNamespace) {
4931
+ datasets.push({
4932
+ tableNamespace,
4933
+ datasetKind,
4934
+ handle: candidate as WorkerDatasetHandle<Record<string, unknown>>,
4935
+ });
4936
+ }
4937
+ return;
4938
+ }
4939
+ if (Array.isArray(candidate)) {
4940
+ for (const item of candidate) walk(item, depth + 1);
4941
+ return;
4942
+ }
4943
+ if (typeof candidate !== 'object') return;
4944
+ const object = candidate as Record<string, unknown>;
4945
+ if (seen.has(object)) return;
4946
+ seen.add(object);
4947
+ for (const child of Object.values(object)) {
4948
+ walk(child, depth + 1);
4949
+ }
4950
+ };
4951
+ walk(value, 0);
4952
+ return datasets;
4953
+ }
4954
+
4447
4955
  function serializePlayReturnValue(value: unknown): unknown {
4448
4956
  return serializeValue(value, 0);
4449
4957
  }
@@ -4498,64 +5006,10 @@ function trimResultShape(value: unknown): unknown {
4498
5006
 
4499
5007
  function serializeValue(value: unknown, depth: number): unknown {
4500
5008
  if (depth > 20 || value == null) return value;
5009
+ if (isDatasetHandle(value)) {
5010
+ return serializeValue(value.toJSON(), depth + 1);
5011
+ }
4501
5012
  if (Array.isArray(value)) {
4502
- const tableNamespace =
4503
- typeof (value as unknown as { tableNamespace?: unknown })
4504
- .tableNamespace === 'string'
4505
- ? (value as unknown as { tableNamespace: string }).tableNamespace
4506
- : null;
4507
- const datasetId =
4508
- typeof (value as unknown as { datasetId?: unknown }).datasetId ===
4509
- 'string'
4510
- ? (value as unknown as { datasetId: string }).datasetId
4511
- : null;
4512
- const datasetCount =
4513
- typeof (value as unknown as { __deeplineDatasetCount?: unknown })
4514
- .__deeplineDatasetCount === 'number'
4515
- ? (value as unknown as { __deeplineDatasetCount: number })
4516
- .__deeplineDatasetCount
4517
- : value.length;
4518
- const datasetKind =
4519
- (value as unknown as { __deeplineDatasetKind?: unknown })
4520
- .__deeplineDatasetKind === 'csv'
4521
- ? 'csv'
4522
- : 'map';
4523
- const cacheSummary =
4524
- typeof (value as unknown as { __deeplineCacheSummary?: unknown })
4525
- .__deeplineCacheSummary === 'string'
4526
- ? (value as unknown as { __deeplineCacheSummary: string })
4527
- .__deeplineCacheSummary
4528
- : null;
4529
- const workProgress = isRecord(
4530
- (value as unknown as { __deeplineWorkProgress?: unknown })
4531
- .__deeplineWorkProgress,
4532
- )
4533
- ? (
4534
- value as unknown as {
4535
- __deeplineWorkProgress: Record<string, unknown>;
4536
- }
4537
- ).__deeplineWorkProgress
4538
- : null;
4539
- const previewRows = value
4540
- .slice(0, 5)
4541
- .map((row) => serializeValue(row, depth + 1))
4542
- .filter(isRecord);
4543
- if (tableNamespace && datasetId) {
4544
- const columns = inferColumns(
4545
- value.map((row) => serializeValue(row, depth + 1)).filter(isRecord),
4546
- );
4547
- return {
4548
- kind: 'dataset' as const,
4549
- datasetKind,
4550
- datasetId,
4551
- count: datasetCount,
4552
- columns,
4553
- preview: previewRows,
4554
- tableNamespace,
4555
- ...(cacheSummary ? { cacheSummary } : {}),
4556
- ...(workProgress ? { _metadata: { workProgress } } : {}),
4557
- };
4558
- }
4559
5013
  return value.map((entry) => serializeValue(entry, depth + 1));
4560
5014
  }
4561
5015
  if (typeof value !== 'object') return value;
@@ -4566,16 +5020,6 @@ function serializeValue(value: unknown, depth: number): unknown {
4566
5020
  return out;
4567
5021
  }
4568
5022
 
4569
- function inferColumns(rows: ReadonlyArray<Record<string, unknown>>): string[] {
4570
- const columns = new Set<string>();
4571
- for (const row of rows) {
4572
- for (const key of Object.keys(row)) {
4573
- columns.add(key);
4574
- }
4575
- }
4576
- return [...columns];
4577
- }
4578
-
4579
5023
  function collectDatasetEnvelopes(value: unknown): Array<{
4580
5024
  tableNamespace: string;
4581
5025
  datasetKind: 'csv' | 'map' | null;
@@ -4714,10 +5158,17 @@ export class TenantWorkflow extends WorkflowEntrypoint<
4714
5158
  // user via tail/SSE. Retry with backoff before giving up; if we drop
4715
5159
  // it, the user is stuck staring at the opaque CF reference id.
4716
5160
  const errorPayload = JSON.stringify({
4717
- action: 'update_run_status',
5161
+ action: 'append_run_events',
4718
5162
  playId: req.runId,
4719
- status: 'failed',
4720
- error: `TenantWorkflow.run threw: ${detail.name ?? 'Error'}: ${detail.message}\n${detail.stack ?? ''}`,
5163
+ events: [
5164
+ {
5165
+ type: 'run.failed',
5166
+ runId: req.runId,
5167
+ source: 'worker',
5168
+ occurredAt: nowMs(),
5169
+ error: `TenantWorkflow.run threw: ${detail.name ?? 'Error'}: ${detail.message}\n${detail.stack ?? ''}`,
5170
+ } satisfies PlayRunLedgerEvent,
5171
+ ],
4721
5172
  });
4722
5173
  const backoffMs = [200, 500, 1500];
4723
5174
  let lastCallbackError: unknown = null;
@@ -4850,6 +5301,10 @@ function inferOutputRows(result: unknown): number {
4850
5301
  const datasets: number[] = [];
4851
5302
  const walk = (value: unknown, depth: number) => {
4852
5303
  if (depth > 6 || value == null) return;
5304
+ if (isDatasetHandle(value)) {
5305
+ datasets.push(value.toJSON().count);
5306
+ return;
5307
+ }
4853
5308
  if (Array.isArray(value)) {
4854
5309
  for (const item of value) walk(item, depth + 1);
4855
5310
  return;
@@ -4858,14 +5313,9 @@ function inferOutputRows(result: unknown): number {
4858
5313
  const record = value as Record<string, unknown>;
4859
5314
  if (
4860
5315
  typeof record.tableNamespace === 'string' &&
4861
- (typeof record.count === 'number' ||
4862
- typeof record.__deeplineDatasetCount === 'number')
5316
+ typeof record.count === 'number'
4863
5317
  ) {
4864
- datasets.push(
4865
- typeof record.count === 'number'
4866
- ? record.count
4867
- : Number(record.__deeplineDatasetCount),
4868
- );
5318
+ datasets.push(record.count);
4869
5319
  }
4870
5320
  for (const [key, child] of Object.entries(record)) {
4871
5321
  if (key === 'preview') continue;