deepline 0.1.24 → 0.1.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,12 +63,37 @@ import {
63
63
  derivePlayRowIdentityFromKey,
64
64
  } from '../../../shared_libs/plays/row-identity';
65
65
  import {
66
+ getTopLevelPipelineSubsteps,
66
67
  getCompiledPipelineSubsteps,
67
68
  flattenStaticPipeline,
68
69
  resolveSheetContractForTableNamespace,
69
70
  sqlSafePlayColumnName,
71
+ type PlayStaticSubstep,
70
72
  type PlayStaticPipeline,
73
+ type PlaySheetContract,
71
74
  } from '../../../shared_libs/plays/static-pipeline';
75
+ import {
76
+ PlayStepLifecycleTracker,
77
+ type PlayStepLifecycleEvent,
78
+ } from '../../../shared_libs/play-runtime/step-lifecycle-tracker';
79
+ import type {
80
+ PlayRunLedgerEvent,
81
+ PlayRunLedgerStepProgress,
82
+ PlayRunLedgerStepStatus,
83
+ } from '../../../shared_libs/play-runtime/run-ledger';
84
+ import {
85
+ createCsvDatasetHandle,
86
+ createInlineDatasetHandle,
87
+ createMaterializedDatasetHandle,
88
+ createPersistedDatasetHandle,
89
+ datasetRowCountHint,
90
+ isDatasetHandle,
91
+ iterDatasetChunks,
92
+ WORKER_DATASET_IN_MEMORY_ROWS,
93
+ WORKER_DATASET_PREVIEW_ROWS,
94
+ type WorkerDatasetHandle,
95
+ type WorkerDatasetInput,
96
+ } from './runtime/dataset-handles';
72
97
  // The harness stub forwards leaf calls (validation, runtime-api HTTP) into
73
98
  // the long-lived Play Harness Worker via env.HARNESS. We import the
74
99
  // `setHarnessBinding` setter eagerly so it's available the moment
@@ -80,9 +105,10 @@ import {
80
105
  // modules without going through this stub is how we'd accidentally
81
106
  // re-bundle harness internals into per-play. Keep that in mind.
82
107
  import {
83
- harnessFetchStagedFile,
84
108
  harnessPersistCompletedSheetRows,
85
109
  harnessPrewarmPostgresSessions,
110
+ harnessReadSheetDatasetRows,
111
+ harnessReadStagedFileChunk,
86
112
  harnessStartSheetDataset,
87
113
  setHarnessBinding,
88
114
  } from '../../../sdk/src/plays/harness-stub';
@@ -115,12 +141,14 @@ type RunRequest = {
115
141
  runtimeInput: Record<string, unknown>;
116
142
  /** Optional inline CSV rows (for plays where ctx.csv was passed inline data). */
117
143
  inlineCsv?: { name: string; rows: Record<string, unknown>[] } | null;
118
- /** R2 keys for input files keyed by logical filename (used by ctx.csv). */
119
- inputR2Keys?: Record<string, string> | null;
144
+ /** Staged input files keyed by logical filename (used by ctx.csv). */
145
+ inputFiles?: Record<string, WorkerFileRef> | null;
120
146
  /** Files packaged with the play artifact (relative-path imports). */
121
147
  packagedFiles?: Array<{
122
148
  playPath: string;
123
149
  storageKey: string;
150
+ contentType?: string | null;
151
+ bytes?: number | null;
124
152
  }> | null;
125
153
  /** Partition fan-out: only process rows[start..end) of a sliced dataset. */
126
154
  partitionRange?: { start: number; end: number } | null;
@@ -148,6 +176,14 @@ type RunRequest = {
148
176
  totalRows?: number;
149
177
  };
150
178
 
179
+ type WorkerFileRef = {
180
+ logicalPath: string;
181
+ fileName: string;
182
+ storageKey: string;
183
+ contentType?: string | null;
184
+ bytes?: number | null;
185
+ };
186
+
151
187
  const EXECUTE_TOOL_METADATA_HEADER = 'x-deepline-include-tool-metadata';
152
188
 
153
189
  /** R2 binding injected by the Worker runtime (when present in deploy metadata). */
@@ -315,6 +351,7 @@ async function probeHarnessOnce(
315
351
  */
316
352
  const RUNTIME_API_TIMEOUT_MS = 30_000;
317
353
  const RUNTIME_API_PLAY_RUN_TIMEOUT_MS = 75_000;
354
+ const RUNTIME_API_RETRY_DELAYS_MS = [250, 750, 1500] as const;
318
355
  let loggedMissingRuntimeApiBinding = false;
319
356
 
320
357
  async function fetchRuntimeApi(
@@ -383,132 +420,6 @@ const WORKER_PLAY_CALL_LIMITS = {
383
420
  maxConcurrentPlayCalls: 16,
384
421
  };
385
422
 
386
- /**
387
- * Produces a dataset-envelope-shaped object compatible with the legacy
388
- * SerializedPlayDataset shape (kind/datasetKind/count/columns/preview) so
389
- * tests + assertions that probe `result.rows.columns` etc. work without the
390
- * ctx changing semantics. Plays still iterate rows via array semantics.
391
- */
392
- function makeWorkerDataset<T extends Record<string, unknown>>(
393
- name: string,
394
- rows: T[],
395
- options?: {
396
- count?: number;
397
- datasetKind?: 'csv' | 'map';
398
- cacheSummary?: string | null;
399
- workProgress?: {
400
- total: number;
401
- executed: number;
402
- reused: number;
403
- skipped: number;
404
- pending: number;
405
- failed: number;
406
- degraded?: boolean;
407
- duplicates?: {
408
- exact?: number;
409
- semantic?: number;
410
- rejected?: number;
411
- };
412
- };
413
- },
414
- ): T[] & {
415
- count(): Promise<number>;
416
- peek(limit?: number): Promise<T[]>;
417
- materialize(limit?: number): Promise<T[]>;
418
- toJSON(): unknown;
419
- datasetId: string;
420
- tableNamespace: string;
421
- } {
422
- const datasetId = `map:${name}`;
423
- const count = Math.max(0, Math.floor(options?.count ?? rows.length));
424
- const datasetKind = options?.datasetKind ?? 'map';
425
- const cacheSummary = options?.cacheSummary ?? null;
426
- const workProgress = options?.workProgress;
427
- // Build the array result. JSON.stringify on arrays calls toJSON only if
428
- // present on the array itself — we attach below. The dataset metadata is
429
- // also exposed via own properties so plays can `enriched.count()` etc.
430
- const arr = rows as T[] & {
431
- count(): Promise<number>;
432
- peek(limit?: number): Promise<T[]>;
433
- materialize(limit?: number): Promise<T[]>;
434
- toJSON(): unknown;
435
- datasetId: string;
436
- tableNamespace: string;
437
- };
438
- const previewLimit = 5;
439
- const inferredColumns = (() => {
440
- const cols = new Set<string>();
441
- for (const r of rows) {
442
- for (const k of Object.keys(r)) cols.add(k);
443
- }
444
- return [...cols];
445
- })();
446
- Object.defineProperty(arr, 'count', {
447
- value: async () => count,
448
- enumerable: false,
449
- });
450
- Object.defineProperty(arr, 'peek', {
451
- value: async (limit = previewLimit) => rows.slice(0, Math.max(0, limit)),
452
- enumerable: false,
453
- });
454
- Object.defineProperty(arr, 'materialize', {
455
- value: async (limit?: number) =>
456
- limit === undefined ? [...rows] : rows.slice(0, Math.max(0, limit)),
457
- enumerable: false,
458
- });
459
- Object.defineProperty(arr, 'datasetId', {
460
- value: datasetId,
461
- enumerable: true,
462
- });
463
- Object.defineProperty(arr, 'tableNamespace', {
464
- value: name,
465
- enumerable: true,
466
- });
467
- Object.defineProperty(arr, '__deeplineDatasetCount', {
468
- value: count,
469
- enumerable: false,
470
- });
471
- Object.defineProperty(arr, '__deeplineDatasetKind', {
472
- value: datasetKind,
473
- enumerable: false,
474
- });
475
- Object.defineProperty(arr, '__deeplineCacheSummary', {
476
- value: cacheSummary,
477
- enumerable: false,
478
- });
479
- Object.defineProperty(arr, '__deeplineWorkProgress', {
480
- value: workProgress,
481
- enumerable: false,
482
- });
483
- // Plays often `return { rows: dataset, count: N }`. JSON.stringify on the
484
- // array would normally produce `[row, row, ...]` — we want the dataset
485
- // envelope shape instead so assertions seeing `result.rows.columns` pass.
486
- // toJSON on an array is honored by JSON.stringify per ES spec.
487
- // toJSON includes ALL rows so the workflow DO can persist the full
488
- // dataset to the sheet table. We clone via plain-object copy to avoid
489
- // re-entrant toJSON resolution (the dataset IS an array; passing it back
490
- // via `preview: arr` would recurse forever through this same toJSON).
491
- Object.defineProperty(arr, 'toJSON', {
492
- value: () => {
493
- const plainRows = rows.map((r) => ({ ...r }));
494
- return {
495
- kind: 'dataset' as const,
496
- datasetKind,
497
- datasetId,
498
- count,
499
- columns: inferredColumns,
500
- preview: plainRows,
501
- tableNamespace: name,
502
- ...(cacheSummary ? { cacheSummary } : {}),
503
- ...(workProgress ? { _metadata: { workProgress } } : {}),
504
- };
505
- },
506
- enumerable: false,
507
- });
508
- void previewLimit;
509
- return arr;
510
- }
511
-
512
423
  type RunnerEvent =
513
424
  | {
514
425
  type: 'log';
@@ -533,12 +444,53 @@ type WorkerCtxCallbacks = {
533
444
  nodeId: string;
534
445
  progress: LiveNodeProgressSnapshot;
535
446
  }) => void;
447
+ onMapStarted?: (nodeId: string, at?: number) => void;
448
+ onMapCompleted?: (nodeId: string, at?: number) => void;
449
+ onToolCalled?: (toolId: string, at?: number) => void;
450
+ onToolFailed?: (toolId: string, at?: number) => void;
536
451
  };
537
452
 
538
453
  function nowMs(): number {
539
454
  return Date.now();
540
455
  }
541
456
 
457
+ function getStaticSubstepNodeId(
458
+ substep: PlayStaticSubstep,
459
+ index: number,
460
+ ): string {
461
+ switch (substep.type) {
462
+ case 'csv':
463
+ return `csv:${substep.field || index}`;
464
+ case 'map':
465
+ return `map:${substep.tableNamespace ?? substep.field}`;
466
+ case 'tool':
467
+ return `tool:${substep.field}:${substep.toolId}`;
468
+ case 'waterfall':
469
+ return `waterfall:${substep.id ?? substep.field}`;
470
+ case 'play_call':
471
+ return `play_call:${substep.field}:${substep.playId}`;
472
+ case 'run_javascript':
473
+ return `run_javascript:${substep.alias}`;
474
+ case 'code':
475
+ return `code:${substep.field || index}`;
476
+ default:
477
+ return `node:${index}`;
478
+ }
479
+ }
480
+
481
+ function buildOrderedNodeList(
482
+ contractSnapshot: unknown,
483
+ ): Array<{ nodeId: string; type: string }> {
484
+ const snapshot = isRecord(contractSnapshot) ? contractSnapshot : null;
485
+ const substeps = getTopLevelPipelineSubsteps(
486
+ (snapshot?.staticPipeline as PlayStaticPipeline | null | undefined) ?? null,
487
+ );
488
+ return substeps.map((substep, index) => ({
489
+ nodeId: getStaticSubstepNodeId(substep, index),
490
+ type: substep.type,
491
+ }));
492
+ }
493
+
542
494
  function recordRunnerPerfTrace(input: {
543
495
  req: RunRequest;
544
496
  phase: string;
@@ -557,7 +509,7 @@ function recordRunnerPerfTrace(input: {
557
509
  source: 'dynamic_worker' as const,
558
510
  runId: input.req.runId,
559
511
  phase: `runner.${input.phase}`,
560
- ...(input.ms !== undefined ? { ms: input.ms } : {}),
512
+ ms: input.ms ?? 0,
561
513
  ...(input.extra ?? {}),
562
514
  };
563
515
  console.log(
@@ -614,44 +566,73 @@ async function postRuntimeApi<T>(
614
566
  // Routes through the in-process RUNTIME_API binding when present; otherwise
615
567
  // falls back to a public fetch against `${baseUrl}${path}`. Either path
616
568
  // hits the same handler with the same auth — only the transport changes.
617
- const res = await fetchRuntimeApi(baseUrl, '/api/v2/plays/internal/runtime', {
618
- method: 'POST',
619
- headers: {
620
- 'content-type': 'application/json',
621
- authorization: `Bearer ${executorToken}`,
622
- 'x-deepline-request-id': makeRequestId(),
623
- },
624
- body: JSON.stringify(body),
625
- });
626
- if (!res.ok) {
569
+ const serializedBody = JSON.stringify(body);
570
+ let lastError: unknown = null;
571
+ for (let attempt = 0; attempt <= RUNTIME_API_RETRY_DELAYS_MS.length; attempt += 1) {
572
+ let res: Response;
573
+ try {
574
+ res = await fetchRuntimeApi(baseUrl, '/api/v2/plays/internal/runtime', {
575
+ method: 'POST',
576
+ headers: {
577
+ 'content-type': 'application/json',
578
+ authorization: `Bearer ${executorToken}`,
579
+ 'x-deepline-request-id': makeRequestId(),
580
+ },
581
+ body: serializedBody,
582
+ });
583
+ } catch (error) {
584
+ lastError = error;
585
+ if (
586
+ attempt >= RUNTIME_API_RETRY_DELAYS_MS.length ||
587
+ !isRetryableRuntimeApiError(error)
588
+ ) {
589
+ throw error;
590
+ }
591
+ await sleepRuntimeApiRetry(attempt);
592
+ continue;
593
+ }
594
+
595
+ if (res.ok) {
596
+ return (await res.json()) as T;
597
+ }
598
+
627
599
  const text = await res.text().catch(() => '');
628
- throw new Error(
629
- `runtime API ${res.status}: ${redactSecretsFromLogString(text.slice(0, 500))}`,
630
- );
600
+ const redacted = redactSecretsFromLogString(text.slice(0, 500));
601
+ lastError = new Error(`runtime API ${res.status}: ${redacted}`);
602
+ if (
603
+ attempt >= RUNTIME_API_RETRY_DELAYS_MS.length ||
604
+ !isRetryableRuntimeApiResponse(res.status, text)
605
+ ) {
606
+ throw lastError;
607
+ }
608
+ await sleepRuntimeApiRetry(attempt);
631
609
  }
632
- return (await res.json()) as T;
610
+ throw lastError instanceof Error ? lastError : new Error(String(lastError));
633
611
  }
634
612
 
635
- async function postDeeplineApi(
636
- req: RunRequest,
637
- path: string,
638
- body: unknown,
639
- ): Promise<void> {
640
- const res = await fetch(`${req.baseUrl.replace(/\/$/, '')}${path}`, {
641
- method: 'POST',
642
- headers: {
643
- 'content-type': 'application/json',
644
- authorization: `Bearer ${req.executorToken}`,
645
- 'x-deepline-request-id': makeRequestId(),
646
- },
647
- body: JSON.stringify(body),
648
- });
649
- if (!res.ok) {
650
- const text = await res.text().catch(() => '');
651
- throw new Error(
652
- `Deepline API ${path} ${res.status}: ${redactSecretsFromLogString(text.slice(0, 500))}`,
653
- );
613
+ function isRetryableRuntimeApiError(error: unknown): boolean {
614
+ const message = error instanceof Error ? error.message : String(error);
615
+ return /timed out|timeout|fetch failed|ECONNRESET|ECONNREFUSED|UND_ERR_CONNECT_TIMEOUT/i.test(
616
+ message,
617
+ );
618
+ }
619
+
620
+ function isRetryableRuntimeApiResponse(status: number, body: string): boolean {
621
+ if (status === 408 || status === 429 || status === 502 || status === 503 || status === 504) {
622
+ return true;
654
623
  }
624
+ return (
625
+ status === 500 &&
626
+ /timeout exceeded when trying to connect|timed out|fetch failed|ECONNRESET|UND_ERR_CONNECT_TIMEOUT/i.test(
627
+ body,
628
+ )
629
+ );
630
+ }
631
+
632
+ async function sleepRuntimeApiRetry(attempt: number): Promise<void> {
633
+ await new Promise((resolve) =>
634
+ setTimeout(resolve, RUNTIME_API_RETRY_DELAYS_MS[attempt] ?? 0),
635
+ );
655
636
  }
656
637
 
657
638
  function describeRuntimeApiBody(body: unknown): string {
@@ -973,6 +954,21 @@ async function executeTool(
973
954
  return callToolDirect(req, args);
974
955
  }
975
956
 
957
+ async function executeToolWithLifecycle(
958
+ req: RunRequest,
959
+ args: { id: string; toolId: string; input: Record<string, unknown> },
960
+ workflowStep: WorkflowStep | undefined,
961
+ callbacks: WorkerCtxCallbacks | undefined,
962
+ ): Promise<ToolExecuteResult> {
963
+ callbacks?.onToolCalled?.(args.toolId, nowMs());
964
+ try {
965
+ return await executeTool(req, args, workflowStep);
966
+ } catch (error) {
967
+ callbacks?.onToolFailed?.(args.toolId, nowMs());
968
+ throw error;
969
+ }
970
+ }
971
+
976
972
  function isToolExecuteRecord(value: unknown): value is Record<string, unknown> {
977
973
  return typeof value === 'object' && value !== null && !Array.isArray(value);
978
974
  }
@@ -1032,14 +1028,19 @@ async function waitForSyntheticIntegrationEvent(
1032
1028
  ? Math.max(1, Math.round(input.timeout_ms))
1033
1029
  : 30_000;
1034
1030
  await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
1035
- action: 'update_run_status',
1031
+ action: 'append_run_events',
1036
1032
  playId: req.runId,
1037
- status: 'running',
1038
- runtimeBackend: 'cf_workflows_dynamic_worker',
1039
- waitKind: 'integration_event_batch',
1040
- waitUntil: nowMs() + timeoutMs,
1041
- activeBoundaryId: `integration_event:${eventKey}`,
1042
- lastCheckpointAt: nowMs(),
1033
+ events: [
1034
+ {
1035
+ type: 'log.appended',
1036
+ runId: req.runId,
1037
+ source: 'worker',
1038
+ occurredAt: nowMs(),
1039
+ lines: [
1040
+ `Waiting for integration_event:${eventKey} for up to ${timeoutMs}ms.`,
1041
+ ],
1042
+ } satisfies PlayRunLedgerEvent,
1043
+ ],
1043
1044
  });
1044
1045
  try {
1045
1046
  const event = (await (
@@ -1751,6 +1752,7 @@ type WorkerMapChunkSummary<T extends Record<string, unknown>> = {
1751
1752
  outputDatasetId: string;
1752
1753
  hash: string;
1753
1754
  preview: T[];
1755
+ cachedRows?: T[];
1754
1756
  };
1755
1757
 
1756
1758
  function toWorkflowSerializableValue<T>(value: T): T {
@@ -1932,6 +1934,8 @@ async function executeWorkerWaterfall(
1932
1934
  toolNameOrSpec: string | WorkerInlineWaterfallSpec,
1933
1935
  input: Record<string, unknown>,
1934
1936
  opts?: WorkerWaterfallOptions,
1937
+ callbacks?: WorkerCtxCallbacks,
1938
+ workflowStep?: WorkflowStep,
1935
1939
  ): Promise<unknown | null> {
1936
1940
  // Inline-spec form
1937
1941
  if (typeof toolNameOrSpec === 'object' && toolNameOrSpec) {
@@ -1948,20 +1952,32 @@ async function executeWorkerWaterfall(
1948
1952
  toolId?: unknown,
1949
1953
  toolInput?: unknown,
1950
1954
  ) =>
1951
- await executeTool(
1955
+ await executeToolWithLifecycle(
1952
1956
  req,
1953
1957
  normalizeToolExecuteArgs(requestOrKey, toolId, toolInput),
1958
+ workflowStep,
1959
+ callbacks,
1954
1960
  ),
1955
1961
  },
1956
1962
  tool: async (key, toolId, toolInput) =>
1957
- await executeTool(req, { id: key, toolId, input: toolInput }),
1963
+ await executeToolWithLifecycle(
1964
+ req,
1965
+ { id: key, toolId, input: toolInput },
1966
+ workflowStep,
1967
+ callbacks,
1968
+ ),
1958
1969
  });
1959
1970
  } else {
1960
- result = await executeTool(req, {
1961
- id: step.id,
1962
- toolId: step.toolId,
1963
- input: step.mapInput(input),
1964
- });
1971
+ result = await executeToolWithLifecycle(
1972
+ req,
1973
+ {
1974
+ id: step.id,
1975
+ toolId: step.toolId,
1976
+ input: step.mapInput(input),
1977
+ },
1978
+ workflowStep,
1979
+ callbacks,
1980
+ );
1965
1981
  }
1966
1982
  } catch {
1967
1983
  continue;
@@ -2047,7 +2063,12 @@ async function executeWorkerWaterfall(
2047
2063
  const providers = opts?.providers ?? [];
2048
2064
  if (providers.length === 0) {
2049
2065
  try {
2050
- return await executeTool(req, { id: toolName, toolId: toolName, input });
2066
+ return await executeToolWithLifecycle(
2067
+ req,
2068
+ { id: toolName, toolId: toolName, input },
2069
+ workflowStep,
2070
+ callbacks,
2071
+ );
2051
2072
  } catch {
2052
2073
  return null;
2053
2074
  }
@@ -2055,11 +2076,16 @@ async function executeWorkerWaterfall(
2055
2076
  let lastError: Error | null = null;
2056
2077
  for (const provider of providers) {
2057
2078
  try {
2058
- const result = await executeTool(req, {
2059
- id: `${toolName}:${provider}`,
2060
- toolId: toolName,
2061
- input: { ...input, provider },
2062
- });
2079
+ const result = await executeToolWithLifecycle(
2080
+ req,
2081
+ {
2082
+ id: `${toolName}:${provider}`,
2083
+ toolId: toolName,
2084
+ input: { ...input, provider },
2085
+ },
2086
+ workflowStep,
2087
+ callbacks,
2088
+ );
2063
2089
  if (resultHasContent(result)) {
2064
2090
  recorder.push({
2065
2091
  waterfallId: toolName,
@@ -2114,6 +2140,118 @@ function makeCsvParserState(): CsvParserState {
2114
2140
  return { field: '', row: [], inQuotes: false, pendingCr: false };
2115
2141
  }
2116
2142
 
2143
+ function normalizeExpectedBytes(value: unknown): number | null {
2144
+ return typeof value === 'number' && Number.isSafeInteger(value) && value >= 0
2145
+ ? value
2146
+ : null;
2147
+ }
2148
+
2149
+ function hasByteLengthMismatch(
2150
+ expectedBytes: number | null | undefined,
2151
+ actualBytes: number | null | undefined,
2152
+ ): boolean {
2153
+ return (
2154
+ typeof expectedBytes === 'number' &&
2155
+ typeof actualBytes === 'number' &&
2156
+ actualBytes !== expectedBytes
2157
+ );
2158
+ }
2159
+
2160
+ async function* iterReadableStreamChunks(
2161
+ body: ReadableStream<Uint8Array>,
2162
+ ): AsyncGenerator<Uint8Array, void, void> {
2163
+ const reader = body.getReader();
2164
+ try {
2165
+ while (true) {
2166
+ const { done, value } = await reader.read();
2167
+ if (done) return;
2168
+ if (value && value.byteLength > 0) yield value;
2169
+ }
2170
+ } finally {
2171
+ reader.releaseLock();
2172
+ }
2173
+ }
2174
+
2175
+ function singleByteChunk(bytes: Uint8Array): AsyncIterable<Uint8Array> {
2176
+ return {
2177
+ async *[Symbol.asyncIterator]() {
2178
+ if (bytes.byteLength > 0) yield bytes;
2179
+ },
2180
+ };
2181
+ }
2182
+
2183
+ async function* guardExpectedByteChunks(input: {
2184
+ req: RunRequest;
2185
+ logicalPath: string;
2186
+ storageKey: string;
2187
+ source: string;
2188
+ chunks: AsyncIterable<Uint8Array>;
2189
+ expectedBytes?: number | null;
2190
+ reportedBytes?: number | null;
2191
+ fallback?: () => AsyncIterable<Uint8Array>;
2192
+ }): AsyncGenerator<Uint8Array, void, void> {
2193
+ const expectedBytes =
2194
+ normalizeExpectedBytes(input.expectedBytes) ??
2195
+ normalizeExpectedBytes(input.reportedBytes);
2196
+ let bytesRead = 0;
2197
+ let sawChunk = false;
2198
+ let skippedEmptyChunks = 0;
2199
+
2200
+ for await (const value of input.chunks) {
2201
+ if (!value || value.byteLength === 0) {
2202
+ skippedEmptyChunks += 1;
2203
+ continue;
2204
+ }
2205
+ sawChunk = true;
2206
+ bytesRead += value.byteLength;
2207
+ yield value;
2208
+ }
2209
+
2210
+ if (!sawChunk) {
2211
+ if (typeof expectedBytes === 'number' && expectedBytes > 0) {
2212
+ recordRunnerPerfTrace({
2213
+ req: input.req,
2214
+ phase: 'csv.open_empty_body',
2215
+ extra: {
2216
+ source: input.source,
2217
+ logicalPath: input.logicalPath,
2218
+ expectedBytes,
2219
+ reportedBytes: normalizeExpectedBytes(input.reportedBytes),
2220
+ skippedEmptyChunks,
2221
+ storageKey: input.storageKey,
2222
+ },
2223
+ });
2224
+ if (input.fallback) {
2225
+ yield* input.fallback();
2226
+ return;
2227
+ }
2228
+ throw new Error(
2229
+ `ctx.csv("${input.logicalPath}"): ${input.source} returned an empty body for ` +
2230
+ `${expectedBytes} byte staged file ${input.storageKey}.`,
2231
+ );
2232
+ }
2233
+ return;
2234
+ }
2235
+
2236
+ if (expectedBytes !== null && bytesRead !== expectedBytes) {
2237
+ recordRunnerPerfTrace({
2238
+ req: input.req,
2239
+ phase: 'csv.read_mismatch',
2240
+ extra: {
2241
+ source: input.source,
2242
+ logicalPath: input.logicalPath,
2243
+ expectedBytes,
2244
+ actualBytes: bytesRead,
2245
+ storageKey: input.storageKey,
2246
+ },
2247
+ });
2248
+ throw new Error(
2249
+ `ctx.csv("${input.logicalPath}"): ${input.source} streamed ${bytesRead} bytes ` +
2250
+ `for ${expectedBytes} byte staged file ${input.storageKey}.`,
2251
+ );
2252
+ }
2253
+ }
2254
+
2117
2255
  /**
2118
2256
  * Push one buffered text chunk through the CSV state machine. Accumulates
2119
2257
  * fully-terminated rows into `out`; partial trailing field/row stays in
@@ -2182,11 +2320,10 @@ function flushCsvParser(state: CsvParserState, out: string[][]): void {
2182
2320
  * to every subsequent row. Stops cleanly on stream end and flushes any
2183
2321
  * trailing row.
2184
2322
  */
2185
- async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2186
- body: ReadableStream<Uint8Array>,
2323
+ async function* streamCsvRowsFromByteChunks<T extends Record<string, unknown>>(
2324
+ byteChunks: AsyncIterable<Uint8Array>,
2187
2325
  chunkSize: number,
2188
2326
  ): AsyncGenerator<T[], void, void> {
2189
- const reader = body.getReader();
2190
2327
  const decoder = new TextDecoder('utf-8');
2191
2328
  const state = makeCsvParserState();
2192
2329
  const physicalRowBuffer: string[][] = [];
@@ -2195,7 +2332,13 @@ async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2195
2332
 
2196
2333
  const flushPhysicalRowsAsObjects = (terminal: boolean): T[][] => {
2197
2334
  const yielded: T[][] = [];
2198
- if (physicalRowBuffer.length === 0) return yielded;
2335
+ if (physicalRowBuffer.length === 0) {
2336
+ if (terminal && pendingChunk.length > 0) {
2337
+ yielded.push(pendingChunk);
2338
+ pendingChunk = [];
2339
+ }
2340
+ return yielded;
2341
+ }
2199
2342
  if (!headers) {
2200
2343
  headers = physicalRowBuffer.shift() ?? null;
2201
2344
  if (!headers) return yielded;
@@ -2219,224 +2362,237 @@ async function* streamCsvRowsFromBody<T extends Record<string, unknown>>(
2219
2362
  return yielded;
2220
2363
  };
2221
2364
 
2222
- try {
2223
- while (true) {
2224
- const { done, value } = await reader.read();
2225
- if (done) {
2226
- flushCsvParser(state, physicalRowBuffer);
2227
- for (const chunk of flushPhysicalRowsAsObjects(true)) yield chunk;
2228
- return;
2229
- }
2230
- const text = decoder.decode(value, { stream: true });
2231
- pushCsvTextIntoParser(state, text, physicalRowBuffer);
2232
- for (const chunk of flushPhysicalRowsAsObjects(false)) yield chunk;
2233
- }
2234
- } finally {
2235
- reader.releaseLock();
2365
+ for await (const value of byteChunks) {
2366
+ if (value.byteLength === 0) continue;
2367
+ const text = decoder.decode(value, { stream: true });
2368
+ pushCsvTextIntoParser(state, text, physicalRowBuffer);
2369
+ for (const chunk of flushPhysicalRowsAsObjects(false)) yield chunk;
2370
+ }
2371
+ const trailingText = decoder.decode();
2372
+ if (trailingText) {
2373
+ pushCsvTextIntoParser(state, trailingText, physicalRowBuffer);
2236
2374
  }
2375
+ flushCsvParser(state, physicalRowBuffer);
2376
+ for (const chunk of flushPhysicalRowsAsObjects(true)) yield chunk;
2237
2377
  void TARGET_CSV_DECODE_CHUNK_BYTES; // referenced for future tuning
2238
2378
  }
2239
2379
 
2380
+ function readHarnessStagedFileChunks(input: {
2381
+ req: RunRequest;
2382
+ logicalPath: string;
2383
+ storageKey: string;
2384
+ expectedBytes?: number | null;
2385
+ }): AsyncIterable<Uint8Array> {
2386
+ const expectedBytes = normalizeExpectedBytes(input.expectedBytes);
2387
+ return {
2388
+ async *[Symbol.asyncIterator]() {
2389
+ let offset = 0;
2390
+ let observedBytes = 0;
2391
+ let objectSize: number | null = null;
2392
+ let loggedOpen = false;
2393
+ let done = false;
2394
+
2395
+ while (!done) {
2396
+ const result = await harnessReadStagedFileChunk({
2397
+ storageKey: input.storageKey,
2398
+ offset,
2399
+ length: TARGET_CSV_DECODE_CHUNK_BYTES,
2400
+ });
2401
+ const actualObjectSize = normalizeExpectedBytes(result.objectSize);
2402
+ if (actualObjectSize === null) {
2403
+ throw new Error(
2404
+ `ctx.csv("${input.logicalPath}"): harness returned an invalid object size for ${input.storageKey}.`,
2405
+ );
2406
+ }
2407
+ if (objectSize !== null && objectSize !== actualObjectSize) {
2408
+ throw new Error(
2409
+ `ctx.csv("${input.logicalPath}"): staged file size changed while streaming ` +
2410
+ `${input.storageKey}; started at ${objectSize} bytes, now ${actualObjectSize}.`,
2411
+ );
2412
+ }
2413
+ objectSize = actualObjectSize;
2414
+ if (hasByteLengthMismatch(expectedBytes, actualObjectSize)) {
2415
+ throw new Error(
2416
+ `ctx.csv("${input.logicalPath}"): harness staged file size mismatch for ` +
2417
+ `storageKey=${input.storageKey}; expected ${expectedBytes} bytes, got ${actualObjectSize}.`,
2418
+ );
2419
+ }
2420
+ if (result.offset !== offset) {
2421
+ throw new Error(
2422
+ `ctx.csv("${input.logicalPath}"): harness returned offset ${result.offset} while ` +
2423
+ `reading offset ${offset} from ${input.storageKey}.`,
2424
+ );
2425
+ }
2426
+
2427
+ const chunk = result.chunk;
2428
+ if (!(chunk instanceof Uint8Array)) {
2429
+ throw new Error(
2430
+ `ctx.csv("${input.logicalPath}"): harness returned a non-byte chunk for ${input.storageKey}.`,
2431
+ );
2432
+ }
2433
+ if (chunk.byteLength !== result.bytesRead) {
2434
+ throw new Error(
2435
+ `ctx.csv("${input.logicalPath}"): harness chunk metadata mismatch for ` +
2436
+ `${input.storageKey}; bytesRead=${result.bytesRead}, chunk=${chunk.byteLength}.`,
2437
+ );
2438
+ }
2439
+ if (chunk.byteLength === 0 && !result.done) {
2440
+ throw new Error(
2441
+ `ctx.csv("${input.logicalPath}"): harness returned an empty non-terminal chunk for ${input.storageKey}.`,
2442
+ );
2443
+ }
2444
+
2445
+ if (!loggedOpen) {
2446
+ loggedOpen = true;
2447
+ recordRunnerPerfTrace({
2448
+ req: input.req,
2449
+ phase: 'csv.open',
2450
+ extra: {
2451
+ source: 'harness_rpc_range',
2452
+ logicalPath: input.logicalPath,
2453
+ expectedBytes,
2454
+ actualBytes: actualObjectSize,
2455
+ chunkBytes: TARGET_CSV_DECODE_CHUNK_BYTES,
2456
+ storageKey: input.storageKey,
2457
+ },
2458
+ });
2459
+ }
2460
+
2461
+ offset += chunk.byteLength;
2462
+ observedBytes += chunk.byteLength;
2463
+ done = result.done;
2464
+ if (chunk.byteLength > 0) yield chunk;
2465
+ }
2466
+
2467
+ const requiredBytes = expectedBytes ?? objectSize;
2468
+ if (typeof requiredBytes === 'number' && observedBytes !== requiredBytes) {
2469
+ recordRunnerPerfTrace({
2470
+ req: input.req,
2471
+ phase: 'csv.read_mismatch',
2472
+ extra: {
2473
+ source: 'harness_rpc_range',
2474
+ logicalPath: input.logicalPath,
2475
+ expectedBytes: requiredBytes,
2476
+ actualBytes: observedBytes,
2477
+ storageKey: input.storageKey,
2478
+ },
2479
+ });
2480
+ throw new Error(
2481
+ `ctx.csv("${input.logicalPath}"): harness streamed ${observedBytes} bytes ` +
2482
+ `for ${requiredBytes} byte staged file ${input.storageKey}.`,
2483
+ );
2484
+ }
2485
+ },
2486
+ };
2487
+ }
2488
+
2240
2489
  /**
2241
- * R2 reader that returns a body stream. Per-play Workers loaded via
2490
+ * Dataset source adapter that returns byte chunks. Per-play Workers loaded via
2242
2491
  * WorkerLoader cannot accept a raw R2Bucket binding (CF Workflows refuses to
2243
2492
  * serialize R2Bucket through its workflow-state path), so per-play Workers
2244
2493
  * stream staged files through the long-lived harness Worker service binding.
2245
2494
  * Returns null only if the asset is genuinely missing (404).
2246
2495
  */
2247
- async function openR2BodyStream(input: {
2496
+ async function openFileByteChunks(input: {
2248
2497
  req: RunRequest;
2249
2498
  env: WorkerEnv;
2250
2499
  logicalPath: string;
2251
- storageKey: string;
2252
- }): Promise<ReadableStream<Uint8Array> | null> {
2253
- if (input.env.PLAYS_BUCKET) {
2254
- const object = await input.env.PLAYS_BUCKET.get(input.storageKey);
2255
- if (object) {
2256
- return object.body;
2257
- }
2258
- }
2259
- if (input.env.PLAY_ASSETS) {
2260
- try {
2261
- const text = await input.env.PLAY_ASSETS.readText(input.logicalPath);
2262
- const bytes = new TextEncoder().encode(text);
2263
- return new ReadableStream<Uint8Array>({
2264
- start(controller) {
2265
- controller.enqueue(bytes);
2266
- controller.close();
2267
- },
2268
- });
2269
- } catch (error) {
2270
- if (!/missing from R2|not found|No such object/i.test(String(error))) {
2271
- throw error;
2272
- }
2273
- }
2274
- }
2275
-
2276
- // The harness fetch path returns a real Response body backed by R2.
2277
- // Errors are loud: we want CI / regression failures to surface the real
2278
- // cause (auth, missing object, network) rather than getting squashed into a
2279
- // generic "R2 asset is not reachable".
2280
- const response = await harnessFetchStagedFile({
2281
- executorToken: input.req.executorToken,
2282
- storageKey: input.storageKey,
2283
- });
2284
- if (response.status === 404) {
2500
+ file: WorkerFileRef;
2501
+ }): Promise<AsyncIterable<Uint8Array> | null> {
2502
+ const storageKey = input.file.storageKey;
2503
+ const expectedBytes = normalizeExpectedBytes(input.file.bytes);
2504
+ if (expectedBytes === null) {
2285
2505
  throw new Error(
2286
- `ctx.csv("${input.logicalPath}"): harness R2 fetch returned 404 for storageKey=${input.storageKey}. ` +
2287
- `The staged file is missing from R2; the upload either failed silently before the run started, ` +
2288
- `or the storageKey threaded through the workflow params no longer matches what the harness resolves.`,
2506
+ `ctx.csv("${input.logicalPath}"): staged dataset handle is missing a byte length for ${storageKey}.`,
2289
2507
  );
2290
2508
  }
2291
- if (!response.ok || !response.body) {
2292
- const body = await response.text().catch(() => '');
2293
- throw new Error(
2294
- `ctx.csv("${input.logicalPath}"): harness R2 fetch failed ${response.status}: ${body.slice(0, 400)}`,
2295
- );
2296
- }
2297
- return response.body;
2298
- }
2299
-
2300
- /**
2301
- * Streaming CSV dataset. Backed by R2 (or a signed URL when PLAYS_BUCKET
2302
- * isn't bound). Looks like a length-0 array to plays that pass it straight
2303
- * to `ctx.map`; ctx.map detects the streaming surface via `iterChunks` and
2304
- * uses it instead of `slice()`. Plays that try to access rows synchronously
2305
- * (`csv[0]`, `csv.length`) are intentionally given an empty array — they
2306
- * must use ctx.map (the supported surface), call `materialize()` (bounded),
2307
- * or iterate via `for await (const row of csv)`.
2308
- */
2309
- type StreamingCsvDataset<T extends Record<string, unknown>> = T[] & {
2310
- count(): Promise<number>;
2311
- peek(limit?: number): Promise<T[]>;
2312
- materialize(limit?: number): Promise<T[]>;
2313
- iterChunks(chunkSize: number): AsyncIterable<T[]>;
2314
- toJSON(): unknown;
2315
- datasetId: string;
2316
- tableNamespace: string;
2317
- __deeplineDatasetKind: 'csv';
2318
- /** Marker so `ctx.map` can detect this is streaming-only and switch path. */
2319
- __deeplineStreamingDataset: true;
2320
- };
2321
-
2322
- const MAX_MATERIALIZE_ROWS_DEFAULT = 50_000;
2323
-
2324
- function makeStreamingCsvDataset<T extends Record<string, unknown>>(input: {
2325
- name: string;
2326
- logicalPath: string;
2327
- renameOptions?: CsvRenameOptions;
2328
- open: () => Promise<ReadableStream<Uint8Array> | null>;
2329
- }): StreamingCsvDataset<T> {
2330
- const datasetId = `csv:${input.name}`;
2331
- const arr = [] as T[] as StreamingCsvDataset<T>;
2332
- let cachedCount: number | null = null;
2333
-
2334
- async function* doStream(chunkSize: number): AsyncGenerator<T[], void, void> {
2335
- const body = await input.open();
2336
- if (!body) {
2337
- throw new Error(
2338
- `ctx.csv("${input.logicalPath}"): R2 asset is not reachable (no PLAYS_BUCKET binding and signed URL unavailable).`,
2339
- );
2340
- }
2341
- for await (const chunk of streamCsvRowsFromBody<T>(
2342
- body,
2343
- Math.max(1, Math.floor(chunkSize)),
2344
- )) {
2345
- yield applyCsvRenameProjection(chunk, input.renameOptions) as T[];
2509
+ if (input.env.PLAYS_BUCKET) {
2510
+ const object = await input.env.PLAYS_BUCKET.get(storageKey);
2511
+ if (object) {
2512
+ if (hasByteLengthMismatch(expectedBytes, object.size)) {
2513
+ recordRunnerPerfTrace({
2514
+ req: input.req,
2515
+ phase: 'csv.open_mismatch',
2516
+ extra: {
2517
+ source: 'direct_r2',
2518
+ logicalPath: input.logicalPath,
2519
+ expectedBytes,
2520
+ actualBytes: object.size,
2521
+ storageKey,
2522
+ },
2523
+ });
2524
+ await object.body.cancel().catch(() => undefined);
2525
+ } else {
2526
+ recordRunnerPerfTrace({
2527
+ req: input.req,
2528
+ phase: 'csv.open',
2529
+ extra: {
2530
+ source: 'direct_r2',
2531
+ logicalPath: input.logicalPath,
2532
+ expectedBytes,
2533
+ actualBytes: object.size,
2534
+ storageKey,
2535
+ },
2536
+ });
2537
+ return guardExpectedByteChunks({
2538
+ req: input.req,
2539
+ logicalPath: input.logicalPath,
2540
+ storageKey,
2541
+ source: 'direct_r2',
2542
+ chunks: iterReadableStreamChunks(object.body),
2543
+ expectedBytes,
2544
+ reportedBytes: object.size,
2545
+ fallback: () =>
2546
+ readHarnessStagedFileChunks({
2547
+ req: input.req,
2548
+ logicalPath: input.logicalPath,
2549
+ storageKey,
2550
+ expectedBytes,
2551
+ }),
2552
+ });
2553
+ }
2346
2554
  }
2347
2555
  }
2348
-
2349
- Object.defineProperty(arr, 'iterChunks', {
2350
- value: (chunkSize: number) => ({
2351
- [Symbol.asyncIterator]: () => doStream(chunkSize),
2352
- }),
2353
- enumerable: false,
2354
- });
2355
- Object.defineProperty(arr, Symbol.asyncIterator, {
2356
- value: async function* () {
2357
- for await (const chunk of doStream(1_000)) {
2358
- for (const row of chunk) yield row;
2359
- }
2360
- },
2361
- enumerable: false,
2362
- });
2363
- Object.defineProperty(arr, 'count', {
2364
- value: async () => {
2365
- if (cachedCount !== null) return cachedCount;
2366
- let total = 0;
2367
- for await (const chunk of doStream(5_000)) total += chunk.length;
2368
- cachedCount = total;
2369
- return total;
2370
- },
2371
- enumerable: false,
2372
- });
2373
- Object.defineProperty(arr, 'peek', {
2374
- value: async (limit = 10) => {
2375
- const out: T[] = [];
2376
- for await (const chunk of doStream(Math.max(1, limit))) {
2377
- for (const row of chunk) {
2378
- out.push(row);
2379
- if (out.length >= limit) return out;
2380
- }
2556
+ if (input.env.PLAY_ASSETS) {
2557
+ try {
2558
+ const text = await input.env.PLAY_ASSETS.readText(input.logicalPath);
2559
+ const bytes = new TextEncoder().encode(text);
2560
+ if (hasByteLengthMismatch(expectedBytes, bytes.byteLength)) {
2561
+ throw new Error(
2562
+ `ctx.csv("${input.logicalPath}"): packaged asset size mismatch for ` +
2563
+ `storageKey=${storageKey}; expected ${expectedBytes} bytes, got ${bytes.byteLength}.`,
2564
+ );
2381
2565
  }
2382
- return out;
2383
- },
2384
- enumerable: false,
2385
- });
2386
- Object.defineProperty(arr, 'materialize', {
2387
- value: async (limit?: number) => {
2388
- const cap = limit ?? MAX_MATERIALIZE_ROWS_DEFAULT;
2389
- const out: T[] = [];
2390
- for await (const chunk of doStream(5_000)) {
2391
- for (const row of chunk) {
2392
- if (out.length >= cap) {
2393
- return out;
2394
- }
2395
- out.push(row);
2396
- }
2566
+ recordRunnerPerfTrace({
2567
+ req: input.req,
2568
+ phase: 'csv.open',
2569
+ extra: {
2570
+ source: 'play_assets',
2571
+ logicalPath: input.logicalPath,
2572
+ expectedBytes,
2573
+ actualBytes: bytes.byteLength,
2574
+ storageKey,
2575
+ },
2576
+ });
2577
+ return singleByteChunk(bytes);
2578
+ } catch (error) {
2579
+ if (!/missing from R2|not found|No such object/i.test(String(error))) {
2580
+ throw error;
2397
2581
  }
2398
- return out;
2399
- },
2400
- enumerable: false,
2401
- });
2402
- Object.defineProperty(arr, 'datasetId', {
2403
- value: datasetId,
2404
- enumerable: true,
2405
- });
2406
- Object.defineProperty(arr, 'tableNamespace', {
2407
- value: input.name,
2408
- enumerable: true,
2409
- });
2410
- Object.defineProperty(arr, '__deeplineStreamingDataset', {
2411
- value: true,
2412
- enumerable: false,
2413
- });
2414
- Object.defineProperty(arr, '__deeplineDatasetKind', {
2415
- value: 'csv',
2416
- enumerable: false,
2417
- });
2418
- Object.defineProperty(arr, 'toJSON', {
2419
- value: () => ({
2420
- kind: 'dataset' as const,
2421
- datasetKind: 'csv',
2422
- datasetId,
2423
- count: cachedCount,
2424
- streaming: true,
2425
- tableNamespace: input.name,
2426
- }),
2427
- enumerable: false,
2428
- });
2429
- return arr;
2430
- }
2582
+ }
2583
+ }
2431
2584
 
2432
- function isStreamingDataset<T extends Record<string, unknown>>(
2433
- value: unknown,
2434
- ): value is StreamingCsvDataset<T> {
2435
- return (
2436
- Array.isArray(value) &&
2437
- (value as { __deeplineStreamingDataset?: unknown })
2438
- .__deeplineStreamingDataset === true
2439
- );
2585
+ // Dynamic Workers cannot receive a raw R2Bucket binding, and both previous
2586
+ // fallbacks were different data planes: service-binding fetch bodies could
2587
+ // arrive empty across WorkerLoader isolates, while app-signed URLs pointed at
2588
+ // the app namespace instead of the preview harness namespace. The harness owns
2589
+ // staged R2 now, so the only fallback is typed bounded range RPC.
2590
+ return readHarnessStagedFileChunks({
2591
+ req: input.req,
2592
+ logicalPath: input.logicalPath,
2593
+ storageKey,
2594
+ expectedBytes,
2595
+ });
2440
2596
  }
2441
2597
 
2442
2598
  /**
@@ -2492,6 +2648,60 @@ function requireSheetContract(
2492
2648
  return contract;
2493
2649
  }
2494
2650
 
2651
+ function isDatasetPayloadField(field: string): boolean {
2652
+ return (
2653
+ field.length > 0 &&
2654
+ !field.startsWith('__deepline') &&
2655
+ field !== '_key' &&
2656
+ field !== '_status' &&
2657
+ field !== '_run_id' &&
2658
+ field !== '_error' &&
2659
+ field !== '_stage' &&
2660
+ field !== '_provider' &&
2661
+ field !== '_input_index' &&
2662
+ field !== '_created_at' &&
2663
+ field !== '_updated_at' &&
2664
+ field !== '_cell_meta'
2665
+ );
2666
+ }
2667
+
2668
+ function augmentSheetContractWithDatasetFields(input: {
2669
+ contract: PlaySheetContract;
2670
+ rows: readonly Record<string, unknown>[];
2671
+ outputFields?: readonly string[];
2672
+ }): PlaySheetContract {
2673
+ const outputFields = new Set(input.outputFields ?? []);
2674
+ const existingFields = new Set(
2675
+ input.contract.columns.flatMap((column) =>
2676
+ typeof column.field === 'string' ? [column.field] : [],
2677
+ ),
2678
+ );
2679
+ const existingSqlNames = new Set(
2680
+ input.contract.columns.map((column) => column.sqlName),
2681
+ );
2682
+ const columns = [...input.contract.columns];
2683
+ for (const row of input.rows) {
2684
+ for (const field of Object.keys(row)) {
2685
+ if (!isDatasetPayloadField(field) || existingFields.has(field)) {
2686
+ continue;
2687
+ }
2688
+ const sqlName = sqlSafePlayColumnName(field);
2689
+ if (existingSqlNames.has(sqlName)) {
2690
+ continue;
2691
+ }
2692
+ existingFields.add(field);
2693
+ existingSqlNames.add(sqlName);
2694
+ columns.push({
2695
+ id: `runtime:${input.contract.tableNamespace}:${field}`,
2696
+ sqlName,
2697
+ source: outputFields.has(field) ? 'mapField' : 'input',
2698
+ field,
2699
+ });
2700
+ }
2701
+ }
2702
+ return { ...input.contract, columns };
2703
+ }
2704
+
2495
2705
  async function persistCompletedMapRows(input: {
2496
2706
  req: RunRequest;
2497
2707
  tableNamespace: string;
@@ -2500,19 +2710,24 @@ async function persistCompletedMapRows(input: {
2500
2710
  extraOutputFields?: string[];
2501
2711
  }): Promise<void> {
2502
2712
  if (input.rows.length === 0) return;
2713
+ const outputFields = [
2714
+ ...input.outputFields,
2715
+ ...(input.extraOutputFields ?? []).filter(
2716
+ (field) => !input.outputFields.includes(field),
2717
+ ),
2718
+ ];
2503
2719
  await harnessPersistCompletedSheetRows({
2504
2720
  baseUrl: input.req.baseUrl,
2505
2721
  executorToken: input.req.executorToken,
2506
2722
  playName: input.req.playName,
2507
2723
  tableNamespace: input.tableNamespace,
2508
- sheetContract: requireSheetContract(input.req, input.tableNamespace),
2724
+ sheetContract: augmentSheetContractWithDatasetFields({
2725
+ contract: requireSheetContract(input.req, input.tableNamespace),
2726
+ rows: input.rows,
2727
+ outputFields,
2728
+ }),
2509
2729
  rows: input.rows,
2510
- outputFields: [
2511
- ...input.outputFields,
2512
- ...(input.extraOutputFields ?? []).filter(
2513
- (field) => !input.outputFields.includes(field),
2514
- ),
2515
- ],
2730
+ outputFields,
2516
2731
  runId: input.req.runId,
2517
2732
  userEmail: input.req.userEmail,
2518
2733
  preloadedDbSessions: input.req.preloadedDbSessions ?? null,
@@ -2537,7 +2752,10 @@ async function prepareMapRows(input: {
2537
2752
  executorToken: input.req.executorToken,
2538
2753
  playName: input.req.playName,
2539
2754
  tableNamespace: input.tableNamespace,
2540
- sheetContract: requireSheetContract(input.req, input.tableNamespace),
2755
+ sheetContract: augmentSheetContractWithDatasetFields({
2756
+ contract: requireSheetContract(input.req, input.tableNamespace),
2757
+ rows: input.rows,
2758
+ }),
2541
2759
  rows: input.rows.map((row) => ({ ...row })),
2542
2760
  runId: input.req.runId,
2543
2761
  userEmail: input.req.userEmail,
@@ -2700,7 +2918,7 @@ function createMinimalWorkerCtx(
2700
2918
  const callDepth = rootGovernance?.callDepth ?? 0;
2701
2919
  const runMap = async <T extends Record<string, unknown>>(
2702
2920
  name: string,
2703
- rows: T[],
2921
+ rows: WorkerDatasetInput<T>,
2704
2922
  fieldsDef: Record<
2705
2923
  string,
2706
2924
  | unknown
@@ -2715,7 +2933,8 @@ function createMinimalWorkerCtx(
2715
2933
  ): Promise<unknown> => {
2716
2934
  const mapStartedAt = nowMs();
2717
2935
  const mapNodeId = `map:${name}`;
2718
- const sliced = rows;
2936
+ const inputRows = rows;
2937
+ const rowCountHint = datasetRowCountHint(inputRows);
2719
2938
  const baseOffset = 0;
2720
2939
  const fieldEntries = Object.entries(fieldsDef);
2721
2940
  const plan = req.executionPlan;
@@ -2723,12 +2942,8 @@ function createMinimalWorkerCtx(
2723
2942
  (candidate) =>
2724
2943
  candidate.mapName === name || candidate.tableNamespace === name,
2725
2944
  );
2726
- const streaming = isStreamingDataset<T>(sliced);
2727
- // For streaming inputs we don't know the row count upfront — pass
2728
- // `totalRows: 0` so chooseMapChunkSize falls back to the preferred /
2729
- // default chunk size rather than trying to budget against an unknown.
2730
2945
  const rowsPerChunk = chooseMapChunkSize({
2731
- totalRows: streaming ? 0 : sliced.length,
2946
+ totalRows: rowCountHint,
2732
2947
  mapCount: Math.max(1, plan?.maps.length ?? 1),
2733
2948
  stepsPerChunk: planMap?.stepsPerChunk ?? 1,
2734
2949
  preferredChunkSize: planMap?.defaultChunkSize,
@@ -2750,14 +2965,12 @@ function createMinimalWorkerCtx(
2750
2965
  typeof total === 'number' && Number.isFinite(total) && total > 0
2751
2966
  ? `${completed.toLocaleString()} / ${total.toLocaleString()} rows processed`
2752
2967
  : `${completed.toLocaleString()} rows processed`;
2968
+ callbacks?.onMapStarted?.(mapNodeId, mapStartedAt);
2753
2969
  updateMapProgress({
2754
2970
  completed: 0,
2755
- total: streaming ? undefined : sliced.length,
2971
+ total: rowCountHint ?? undefined,
2756
2972
  startedAt: mapStartedAt,
2757
- message: formatMapProgressMessage(
2758
- 0,
2759
- streaming ? undefined : sliced.length,
2760
- ),
2973
+ message: formatMapProgressMessage(0, rowCountHint ?? undefined),
2761
2974
  });
2762
2975
  const explicitRowKeysSeen =
2763
2976
  opts?.key === undefined ? null : new Map<string, number>();
@@ -2983,6 +3196,7 @@ function createMinimalWorkerCtx(
2983
3196
  input?: unknown,
2984
3197
  _opts?: { description?: string },
2985
3198
  ): Promise<unknown> => {
3199
+ void _opts;
2986
3200
  assertNotAborted(abortSignal);
2987
3201
  const request = normalizeToolExecuteArgs(
2988
3202
  requestOrKey,
@@ -3008,6 +3222,8 @@ function createMinimalWorkerCtx(
3008
3222
  toolNameOrSpec,
3009
3223
  waterfallInput,
3010
3224
  waterfallOpts,
3225
+ callbacks,
3226
+ workflowStep,
3011
3227
  ),
3012
3228
  };
3013
3229
  for (const [key, value] of fieldEntries) {
@@ -3219,10 +3435,16 @@ function createMinimalWorkerCtx(
3219
3435
  outputDatasetId: `map:${name}`,
3220
3436
  hash,
3221
3437
  preview: toWorkflowSerializableValue(out.slice(0, 5)),
3438
+ cachedRows:
3439
+ out.length <= WORKER_DATASET_IN_MEMORY_ROWS
3440
+ ? toWorkflowSerializableValue(out)
3441
+ : undefined,
3222
3442
  };
3223
3443
  };
3224
3444
 
3225
- const out: Array<T & Record<string, unknown>> = [];
3445
+ const previewRows: Array<T & Record<string, unknown>> = [];
3446
+ const cachedRows: Array<T & Record<string, unknown>> = [];
3447
+ let canCacheRows = true;
3226
3448
  let totalRowsExecuted = 0;
3227
3449
  let totalRowsCached = 0;
3228
3450
  let totalRowsDuplicateReused = 0;
@@ -3260,6 +3482,7 @@ function createMinimalWorkerCtx(
3260
3482
  `(${totalRowsExecuted} executed, ${totalRowsCached} already satisfied) ` +
3261
3483
  `inserted=${totalRowsInserted} skipped=${totalRowsSkipped}`;
3262
3484
  const completedAt = nowMs();
3485
+ callbacks?.onMapCompleted?.(mapNodeId, completedAt);
3263
3486
  updateMapProgress({
3264
3487
  completed: totalRowsWritten,
3265
3488
  total: totalRowsWritten,
@@ -3273,9 +3496,29 @@ function createMinimalWorkerCtx(
3273
3496
  message: cacheSummary,
3274
3497
  ts: nowMs(),
3275
3498
  });
3276
- return makeWorkerDataset(name, out, {
3499
+ return createPersistedDatasetHandle({
3500
+ playName: req.playName,
3501
+ name,
3277
3502
  count: totalRowsWritten,
3278
- cacheSummary,
3503
+ previewRows,
3504
+ cachedRows: canCacheRows ? cachedRows : null,
3505
+ readRows: async ({ limit, offset }) => {
3506
+ const result = await harnessReadSheetDatasetRows({
3507
+ baseUrl: req.baseUrl,
3508
+ executorToken: req.executorToken,
3509
+ playName: req.playName,
3510
+ tableNamespace: name,
3511
+ runId: req.runId,
3512
+ limit,
3513
+ offset,
3514
+ userEmail: req.userEmail,
3515
+ preloadedDbSessions: req.preloadedDbSessions ?? null,
3516
+ });
3517
+ return result.rows as Array<T & Record<string, unknown>>;
3518
+ },
3519
+ trace: (phase, ms, extra) =>
3520
+ recordRunnerPerfTrace({ req, phase, ms, extra }),
3521
+ nowMs,
3279
3522
  workProgress: {
3280
3523
  total: totalRowsWritten,
3281
3524
  executed: totalRowsExecuted,
@@ -3290,110 +3533,61 @@ function createMinimalWorkerCtx(
3290
3533
  });
3291
3534
  };
3292
3535
 
3293
- if (streaming) {
3294
- let totalRowsWritten = 0;
3295
- let chunkIndex = 0;
3296
- let chunkStart = 0;
3297
- const streamingDataset = sliced as unknown as StreamingCsvDataset<T>;
3298
- for await (const chunkRows of streamingDataset.iterChunks(rowsPerChunk)) {
3299
- assertNotAborted(abortSignal);
3300
- if (chunkRows.length === 0) continue;
3301
- assertUniqueExplicitRowKeys(chunkRows, chunkStart);
3302
- const chunkResult = await runChunkStep(
3303
- chunkRows,
3304
- chunkStart,
3305
- chunkIndex,
3536
+ let totalRowsWritten = 0;
3537
+ let chunkIndex = 0;
3538
+ let chunkStart = 0;
3539
+ for await (const chunkRows of iterDatasetChunks(inputRows, rowsPerChunk)) {
3540
+ assertNotAborted(abortSignal);
3541
+ if (chunkRows.length === 0) continue;
3542
+ assertUniqueExplicitRowKeys(chunkRows, chunkStart);
3543
+ const chunkResult = await runChunkStep(chunkRows, chunkStart, chunkIndex);
3544
+ totalRowsWritten += chunkResult.rowsWritten;
3545
+ totalRowsExecuted += chunkResult.rowsExecuted;
3546
+ totalRowsCached += chunkResult.rowsCached;
3547
+ totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3548
+ totalRowsInserted += chunkResult.rowsInserted;
3549
+ totalRowsSkipped += chunkResult.rowsSkipped;
3550
+ updateMapProgress({
3551
+ completed: totalRowsWritten,
3552
+ total: rowCountHint ?? undefined,
3553
+ message: formatMapProgressMessage(
3554
+ totalRowsWritten,
3555
+ rowCountHint ?? undefined,
3556
+ ),
3557
+ });
3558
+ if (previewRows.length < WORKER_DATASET_PREVIEW_ROWS) {
3559
+ previewRows.push(
3560
+ ...chunkResult.preview.slice(
3561
+ 0,
3562
+ WORKER_DATASET_PREVIEW_ROWS - previewRows.length,
3563
+ ),
3306
3564
  );
3307
- totalRowsWritten += chunkResult.rowsWritten;
3308
- totalRowsExecuted += chunkResult.rowsExecuted;
3309
- totalRowsCached += chunkResult.rowsCached;
3310
- totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3311
- totalRowsInserted += chunkResult.rowsInserted;
3312
- totalRowsSkipped += chunkResult.rowsSkipped;
3313
- updateMapProgress({
3314
- completed: totalRowsWritten,
3315
- message: formatMapProgressMessage(totalRowsWritten),
3316
- });
3317
- if (out.length < 10) {
3318
- out.push(...chunkResult.preview.slice(0, 10 - out.length));
3319
- }
3320
- chunkStart += chunkRows.length;
3321
- chunkIndex += 1;
3322
3565
  }
3323
- const dataset = finalize(totalRowsWritten);
3324
- recordRunnerPerfTrace({
3325
- req,
3326
- phase: 'runner.map.total',
3327
- ms: nowMs() - mapStartedAt,
3328
- extra: {
3329
- mapName: name,
3330
- rowsWritten: totalRowsWritten,
3331
- streaming: true,
3332
- },
3333
- });
3334
- return dataset;
3335
- }
3336
-
3337
- if (workflowStep && sliced.length > rowsPerChunk) {
3338
- let totalRowsWritten = 0;
3339
- for (let start = 0; start < sliced.length; start += rowsPerChunk) {
3340
- assertNotAborted(abortSignal);
3341
- const end = Math.min(sliced.length, start + rowsPerChunk);
3342
- const chunkRows = sliced.slice(start, end);
3343
- const chunkIndex = Math.floor(start / rowsPerChunk);
3344
- assertUniqueExplicitRowKeys(chunkRows, start);
3345
- const chunkResult = await runChunkStep(chunkRows, start, chunkIndex);
3346
- totalRowsWritten += chunkResult.rowsWritten;
3347
- totalRowsExecuted += chunkResult.rowsExecuted;
3348
- totalRowsCached += chunkResult.rowsCached;
3349
- totalRowsDuplicateReused += chunkResult.rowsDuplicateReused;
3350
- totalRowsInserted += chunkResult.rowsInserted;
3351
- totalRowsSkipped += chunkResult.rowsSkipped;
3352
- updateMapProgress({
3353
- completed: totalRowsWritten,
3354
- total: sliced.length,
3355
- message: formatMapProgressMessage(totalRowsWritten, sliced.length),
3356
- });
3357
- if (out.length < 10) {
3358
- out.push(...chunkResult.preview.slice(0, 10 - out.length));
3566
+ if (canCacheRows) {
3567
+ const nextRows = chunkResult.cachedRows ?? [];
3568
+ if (
3569
+ nextRows.length === chunkResult.rowsWritten &&
3570
+ cachedRows.length + nextRows.length <= WORKER_DATASET_IN_MEMORY_ROWS
3571
+ ) {
3572
+ cachedRows.push(...nextRows);
3573
+ } else {
3574
+ cachedRows.length = 0;
3575
+ canCacheRows = false;
3359
3576
  }
3360
3577
  }
3361
- const dataset = finalize(totalRowsWritten);
3362
- recordRunnerPerfTrace({
3363
- req,
3364
- phase: 'runner.map.total',
3365
- ms: nowMs() - mapStartedAt,
3366
- extra: {
3367
- mapName: name,
3368
- rowsWritten: totalRowsWritten,
3369
- streaming: false,
3370
- },
3371
- });
3372
- return dataset;
3578
+ chunkStart += chunkRows.length;
3579
+ chunkIndex += 1;
3373
3580
  }
3374
-
3375
- assertUniqueExplicitRowKeys(sliced, 0);
3376
- const chunkResult = await runChunkStep(sliced, 0, 0);
3377
- totalRowsExecuted = chunkResult.rowsExecuted;
3378
- totalRowsCached = chunkResult.rowsCached;
3379
- totalRowsDuplicateReused = chunkResult.rowsDuplicateReused;
3380
- totalRowsInserted = chunkResult.rowsInserted;
3381
- totalRowsSkipped = chunkResult.rowsSkipped;
3382
- out.push(...chunkResult.preview);
3383
- updateMapProgress({
3384
- completed: chunkResult.rowsWritten,
3385
- total: sliced.length,
3386
- message: formatMapProgressMessage(chunkResult.rowsWritten, sliced.length),
3387
- });
3388
- const dataset = finalize(chunkResult.rowsWritten);
3581
+ const dataset = finalize(totalRowsWritten);
3389
3582
  recordRunnerPerfTrace({
3390
3583
  req,
3391
3584
  phase: 'runner.map.total',
3392
3585
  ms: nowMs() - mapStartedAt,
3393
3586
  extra: {
3394
3587
  mapName: name,
3395
- rowsWritten: chunkResult.rowsWritten,
3396
- streaming: false,
3588
+ rowsWritten: totalRowsWritten,
3589
+ inputKind: rowCountHint === null ? 'streaming' : 'known_count',
3590
+ chunks: chunkIndex,
3397
3591
  },
3398
3592
  });
3399
3593
  return dataset;
@@ -3407,7 +3601,7 @@ function createMinimalWorkerCtx(
3407
3601
 
3408
3602
  constructor(
3409
3603
  private readonly name: string,
3410
- private readonly rows: T[],
3604
+ private readonly rows: WorkerDatasetInput<T>,
3411
3605
  ) {}
3412
3606
 
3413
3607
  step(name: string, resolver: WorkerStepProgramStep['resolver']): this {
@@ -3482,18 +3676,13 @@ function createMinimalWorkerCtx(
3482
3676
  async csv<T extends Record<string, unknown> = Record<string, unknown>>(
3483
3677
  arg: unknown,
3484
3678
  options?: CsvRenameOptions,
3485
- ): Promise<T[]> {
3679
+ ): Promise<WorkerDatasetHandle<T>> {
3486
3680
  const csvStartedAt = nowMs();
3487
3681
  if (Array.isArray(arg)) {
3488
- // Inline rows passed at call site — already in memory, keep the
3489
- // legacy array-backed dataset shape.
3490
- const dataset = makeWorkerDataset(
3491
- 'csv',
3492
- applyCsvRenameProjection(arg as T[], options),
3493
- {
3494
- datasetKind: 'csv',
3495
- },
3496
- ) as unknown as T[];
3682
+ const dataset = createInlineDatasetHandle(
3683
+ applyCsvRenameProjection(arg as T[], options) as T[],
3684
+ { name: 'csv', kind: 'csv' },
3685
+ );
3497
3686
  recordRunnerPerfTrace({
3498
3687
  req,
3499
3688
  phase: 'runner.csv',
@@ -3504,15 +3693,10 @@ function createMinimalWorkerCtx(
3504
3693
  }
3505
3694
  const filename = String(arg ?? '');
3506
3695
  if (req.inlineCsv && filename === req.inlineCsv.name) {
3507
- // Inline CSV pre-staged by the dispatcher (small files <1 MiB). Already
3508
- // in memory; no streaming needed.
3509
- const dataset = makeWorkerDataset(
3510
- 'csv',
3511
- applyCsvRenameProjection(req.inlineCsv.rows as T[], options),
3512
- {
3513
- datasetKind: 'csv',
3514
- },
3515
- ) as unknown as T[];
3696
+ const dataset = createInlineDatasetHandle(
3697
+ applyCsvRenameProjection(req.inlineCsv.rows as T[], options) as T[],
3698
+ { name: filename, kind: 'csv' },
3699
+ );
3516
3700
  recordRunnerPerfTrace({
3517
3701
  req,
3518
3702
  phase: 'runner.csv',
@@ -3521,52 +3705,72 @@ function createMinimalWorkerCtx(
3521
3705
  });
3522
3706
  return dataset;
3523
3707
  }
3524
- // Resolution order: explicit inputR2Keys (runtime input) → packaged
3708
+ // Resolution order: explicit inputFiles (runtime input) → packaged
3525
3709
  // files (relative-path imports bundled with the play artifact).
3526
- let r2Key = req.inputR2Keys?.[filename];
3527
- if (!r2Key && req.packagedFiles) {
3710
+ let file = req.inputFiles?.[filename] ?? null;
3711
+ if (!file && req.packagedFiles) {
3528
3712
  const matchByPath = req.packagedFiles.find(
3529
3713
  (f) =>
3530
3714
  f.playPath === filename ||
3531
3715
  f.playPath === filename.replace(/^\.\//, ''),
3532
3716
  );
3533
- if (matchByPath) r2Key = matchByPath.storageKey;
3717
+ if (matchByPath) {
3718
+ file = {
3719
+ logicalPath: matchByPath.playPath,
3720
+ fileName: matchByPath.playPath.split('/').pop() ?? matchByPath.playPath,
3721
+ storageKey: matchByPath.storageKey,
3722
+ contentType: matchByPath.contentType,
3723
+ bytes: matchByPath.bytes,
3724
+ };
3725
+ }
3534
3726
  }
3535
- if (!r2Key) {
3727
+ if (!file?.storageKey) {
3536
3728
  throw new Error(
3537
3729
  `ctx.csv("${filename}"): no inline rows or R2 asset binding registered. ` +
3538
- 'Pass inline rows, or upload to R2 and register packagedFiles/inputR2Keys in the run config.',
3730
+ 'Pass inline rows, or upload to R2 and register packagedFiles/inputFiles in the run config.',
3731
+ );
3732
+ }
3733
+ const selectedFile = file;
3734
+ const expectedBytes = normalizeExpectedBytes(selectedFile.bytes);
3735
+ if (expectedBytes === null) {
3736
+ throw new Error(
3737
+ `ctx.csv("${filename}"): staged dataset handle is missing a byte length for ` +
3738
+ `${selectedFile.storageKey}. Re-stage the file with bytes metadata.`,
3539
3739
  );
3540
3740
  }
3541
- // Streaming path: returns a length-0 dataset shell whose iterChunks()
3542
- // pulls 1 MiB-ish text chunks from R2 and yields parsed row chunks.
3543
- // ctx.map detects the streaming surface via __deeplineStreamingDataset
3544
- // and switches its chunked execution loop to consume iterChunks
3545
- // directly, so 2M-row CSVs never get fully materialized in memory.
3546
- const storageKey = r2Key;
3547
- const dataset = makeStreamingCsvDataset<T>({
3741
+ const dataset = createCsvDatasetHandle<T>({
3548
3742
  name: filename,
3549
3743
  logicalPath: filename,
3744
+ expectedBytes,
3550
3745
  renameOptions: options,
3746
+ nowMs,
3747
+ streamRows: streamCsvRowsFromByteChunks,
3748
+ trace: (phase, ms, extra) =>
3749
+ recordRunnerPerfTrace({ req, phase, ms, extra }),
3551
3750
  open: () =>
3552
- openR2BodyStream({
3751
+ openFileByteChunks({
3553
3752
  req,
3554
3753
  env,
3555
3754
  logicalPath: filename,
3556
- storageKey,
3755
+ file: selectedFile,
3557
3756
  }),
3558
- }) as unknown as T[];
3757
+ });
3559
3758
  recordRunnerPerfTrace({
3560
3759
  req,
3561
3760
  phase: 'runner.csv',
3562
3761
  ms: nowMs() - csvStartedAt,
3563
- extra: { mode: 'streaming_r2', filename },
3762
+ extra: {
3763
+ mode: 'streaming_file',
3764
+ filename,
3765
+ expectedBytes,
3766
+ storageKey: selectedFile.storageKey,
3767
+ },
3564
3768
  });
3565
3769
  return dataset;
3566
3770
  },
3567
3771
  map<T extends Record<string, unknown>>(
3568
3772
  name: string,
3569
- rows: T[],
3773
+ rows: WorkerDatasetInput<T>,
3570
3774
  fieldsDef?:
3571
3775
  | Record<
3572
3776
  string,
@@ -3600,7 +3804,12 @@ function createMinimalWorkerCtx(
3600
3804
  input: Record<string, unknown>,
3601
3805
  ): Promise<unknown> => {
3602
3806
  assertNotAborted(abortSignal);
3603
- return executeTool(req, { id: key, toolId, input }, workflowStep);
3807
+ return executeToolWithLifecycle(
3808
+ req,
3809
+ { id: key, toolId, input },
3810
+ workflowStep,
3811
+ callbacks,
3812
+ );
3604
3813
  },
3605
3814
  tools: {
3606
3815
  async execute(
@@ -3609,11 +3818,13 @@ function createMinimalWorkerCtx(
3609
3818
  input?: unknown,
3610
3819
  _opts?: { description?: string },
3611
3820
  ): Promise<unknown> {
3821
+ void _opts;
3612
3822
  assertNotAborted(abortSignal);
3613
- return executeTool(
3823
+ return executeToolWithLifecycle(
3614
3824
  req,
3615
3825
  normalizeToolExecuteArgs(requestOrKey, toolId, input),
3616
3826
  workflowStep,
3827
+ callbacks,
3617
3828
  );
3618
3829
  },
3619
3830
  },
@@ -3640,7 +3851,15 @@ function createMinimalWorkerCtx(
3640
3851
  input: Record<string, unknown>,
3641
3852
  opts?: WorkerWaterfallOptions,
3642
3853
  ): Promise<unknown | null> {
3643
- return executeWorkerWaterfall(req, [], toolNameOrSpec, input, opts);
3854
+ return executeWorkerWaterfall(
3855
+ req,
3856
+ [],
3857
+ toolNameOrSpec,
3858
+ input,
3859
+ opts,
3860
+ callbacks,
3861
+ workflowStep,
3862
+ );
3644
3863
  },
3645
3864
  async sleep(ms: number): Promise<void> {
3646
3865
  assertNotAborted(abortSignal);
@@ -3993,17 +4212,10 @@ async function handleRun(request: Request, env: WorkerEnv): Promise<Response> {
3993
4212
  });
3994
4213
  }
3995
4214
 
3996
- /** Cap on `liveLogs` retained both in-memory and persisted to Convex. */
3997
- const LIVE_LOG_BUFFER_LIMIT = 500;
3998
- /** Min wall-clock interval between live-log flushes during a run. */
3999
- const LIVE_LOG_FLUSH_INTERVAL_MS = 500;
4000
- /**
4001
- * Initial flush delay for live logs. Short plays should not pay an extra
4002
- * non-terminal Convex write just to show a transient "running" log state; the
4003
- * terminal status carries the full log buffer. Longer plays still flush early
4004
- * enough for the dashboard to feel alive.
4005
- */
4006
- const LIVE_LOG_FIRST_FLUSH_DELAY_MS = 30_000;
4215
+ /** Cap on run log lines retained in the terminal output compatibility shape. */
4216
+ const RUN_LOG_BUFFER_LIMIT = 500;
4217
+ /** Min wall-clock interval between live run-ledger flushes during a run. */
4218
+ const RUN_LEDGER_FLUSH_INTERVAL_MS = 500;
4007
4219
 
4008
4220
  async function executeRunRequest(
4009
4221
  req: RunRequest,
@@ -4046,86 +4258,235 @@ async function executeRunRequest(
4046
4258
  sessions: req.preloadedDbSessions?.length ?? 0,
4047
4259
  },
4048
4260
  });
4049
- // Maintain a rolling buffer of log lines emitted during the run. This is
4050
- // what the play-page UI consumes via Convex polling + diffPlayRunStreamEvents
4051
- // play.run.log SSE events. Without periodic flushing, the play page only
4052
- // sees the final terminal status with no intermediate logs/progress.
4053
- let liveLogs: string[] = [];
4054
- let liveLogsDirty = false;
4055
- let liveNodeProgress: LiveNodeProgressMap = {};
4056
- let lastLiveLogFlushAt =
4057
- nowMs() - LIVE_LOG_FLUSH_INTERVAL_MS + LIVE_LOG_FIRST_FLUSH_DELAY_MS;
4058
- let liveLogFlushInFlight: Promise<void> = Promise.resolve();
4059
- const appendLiveLog = (line: string) => {
4261
+ let runLogBuffer: string[] = [];
4262
+ let pendingRunLogLines: string[] = [];
4263
+ let stepProgressByNodeId: LiveNodeProgressMap = {};
4264
+ let dirtyProgressNodeIds = new Set<string>();
4265
+ let pendingLedgerEvents: PlayRunLedgerEvent[] = [
4266
+ {
4267
+ type: 'run.started',
4268
+ runId: req.runId,
4269
+ playName: req.playName,
4270
+ source: 'worker',
4271
+ occurredAt: startedAt,
4272
+ runtimeBackend: 'cf_workflows_dynamic_worker',
4273
+ },
4274
+ ];
4275
+ let lastLedgerFlushAt = 0;
4276
+ let ledgerFlushInFlight: Promise<void> = Promise.resolve();
4277
+
4278
+ const appendRunLogLine = (line: string) => {
4060
4279
  const trimmed = redactSecretsFromLogString(line.trim());
4061
4280
  if (!trimmed) return;
4062
- liveLogs = [...liveLogs, trimmed].slice(-LIVE_LOG_BUFFER_LIMIT);
4063
- liveLogsDirty = true;
4281
+ runLogBuffer = [...runLogBuffer, trimmed].slice(-RUN_LOG_BUFFER_LIMIT);
4282
+ pendingRunLogLines = [...pendingRunLogLines, trimmed].slice(
4283
+ -RUN_LOG_BUFFER_LIMIT,
4284
+ );
4064
4285
  };
4065
- const updateLiveNodeProgress = (input: {
4286
+
4287
+ const updateStepProgress = (input: {
4066
4288
  nodeId: string;
4067
4289
  progress: LiveNodeProgressSnapshot;
4068
4290
  }) => {
4069
4291
  const nodeId = input.nodeId.trim();
4070
4292
  if (!nodeId) return;
4071
- liveNodeProgress = {
4072
- ...liveNodeProgress,
4293
+ stepProgressByNodeId = {
4294
+ ...stepProgressByNodeId,
4073
4295
  [nodeId]: {
4074
- ...(liveNodeProgress[nodeId] ?? {}),
4296
+ ...(stepProgressByNodeId[nodeId] ?? {}),
4075
4297
  ...input.progress,
4076
4298
  },
4077
4299
  };
4300
+ dirtyProgressNodeIds.add(nodeId);
4301
+ };
4302
+
4303
+ const stepProgressSnapshot = () => ({ ...stepProgressByNodeId });
4304
+
4305
+ const appendStepLifecycleEvent = (event: PlayStepLifecycleEvent) => {
4306
+ updateStepProgress({
4307
+ nodeId: event.nodeId,
4308
+ progress: {
4309
+ ...(event.transition === 'started'
4310
+ ? { startedAt: event.at }
4311
+ : { completedAt: event.at }),
4312
+ updatedAt: event.at,
4313
+ },
4314
+ });
4315
+ pendingLedgerEvents = [
4316
+ ...pendingLedgerEvents,
4317
+ {
4318
+ type:
4319
+ event.transition === 'started'
4320
+ ? 'step.started'
4321
+ : event.transition === 'failed'
4322
+ ? 'step.failed'
4323
+ : 'step.completed',
4324
+ runId: req.runId,
4325
+ source: 'worker',
4326
+ occurredAt: event.at,
4327
+ stepId: event.nodeId,
4328
+ kind: event.type,
4329
+ },
4330
+ ];
4331
+ flushLedgerEvents(false);
4332
+ };
4333
+
4334
+ const drainPendingLedgerEvents = (
4335
+ occurredAt: number,
4336
+ ): PlayRunLedgerEvent[] => {
4337
+ const events = pendingLedgerEvents;
4338
+ pendingLedgerEvents = [];
4339
+
4340
+ if (pendingRunLogLines.length > 0) {
4341
+ events.push({
4342
+ type: 'log.appended',
4343
+ runId: req.runId,
4344
+ source: 'worker',
4345
+ occurredAt,
4346
+ lines: pendingRunLogLines,
4347
+ });
4348
+ pendingRunLogLines = [];
4349
+ }
4350
+
4351
+ if (dirtyProgressNodeIds.size > 0) {
4352
+ for (const nodeId of dirtyProgressNodeIds) {
4353
+ const progress = stepProgressByNodeId[nodeId];
4354
+ if (!progress) continue;
4355
+ const normalizedProgress: PlayRunLedgerStepProgress = {
4356
+ ...(typeof progress.completed === 'number'
4357
+ ? { completed: progress.completed }
4358
+ : {}),
4359
+ ...(typeof progress.total === 'number'
4360
+ ? { total: progress.total }
4361
+ : {}),
4362
+ ...(typeof progress.failed === 'number'
4363
+ ? { failed: progress.failed }
4364
+ : {}),
4365
+ ...(typeof progress.message === 'string' && progress.message
4366
+ ? { message: progress.message }
4367
+ : {}),
4368
+ ...(typeof progress.artifactTableNamespace === 'string' ||
4369
+ progress.artifactTableNamespace === null
4370
+ ? { artifactTableNamespace: progress.artifactTableNamespace }
4371
+ : {}),
4372
+ updatedAt:
4373
+ typeof progress.updatedAt === 'number'
4374
+ ? progress.updatedAt
4375
+ : occurredAt,
4376
+ };
4377
+ const status: PlayRunLedgerStepStatus =
4378
+ typeof progress.completedAt === 'number' ? 'completed' : 'running';
4379
+ events.push({
4380
+ type: 'step.progress',
4381
+ runId: req.runId,
4382
+ source: 'worker',
4383
+ occurredAt:
4384
+ typeof progress.updatedAt === 'number'
4385
+ ? progress.updatedAt
4386
+ : occurredAt,
4387
+ stepId: nodeId,
4388
+ status,
4389
+ progress: normalizedProgress,
4390
+ });
4391
+ }
4392
+ dirtyProgressNodeIds = new Set<string>();
4393
+ }
4394
+
4395
+ return events;
4078
4396
  };
4079
- const liveNodeProgressSnapshot = () => ({ ...liveNodeProgress });
4080
- const flushLiveLogs = (force: boolean): void => {
4397
+
4398
+ const flushLedgerEvents = (force: boolean): void => {
4081
4399
  if (!options?.persistResultDatasets) return;
4082
- if (!liveLogsDirty && !force) return;
4083
4400
  const now = nowMs();
4084
- if (!force && now - lastLiveLogFlushAt < LIVE_LOG_FLUSH_INTERVAL_MS) return;
4085
- lastLiveLogFlushAt = now;
4086
- liveLogsDirty = false;
4087
- const snapshot = [...liveLogs];
4088
- liveLogFlushInFlight = liveLogFlushInFlight
4401
+ if (!force && now - lastLedgerFlushAt < RUN_LEDGER_FLUSH_INTERVAL_MS) {
4402
+ return;
4403
+ }
4404
+ const events = drainPendingLedgerEvents(now);
4405
+ if (events.length === 0) return;
4406
+ lastLedgerFlushAt = now;
4407
+ ledgerFlushInFlight = ledgerFlushInFlight
4089
4408
  .catch(() => undefined)
4090
4409
  .then(async () => {
4091
4410
  try {
4092
4411
  await postRuntimeApi(req.baseUrl, req.executorToken, {
4093
- action: 'update_run_status',
4412
+ action: 'append_run_events',
4094
4413
  playId: req.runId,
4095
- status: 'running',
4096
- runtimeBackend: 'cf_workflows_dynamic_worker',
4097
- liveLogs: snapshot,
4098
- liveNodeProgress: liveNodeProgressSnapshot(),
4099
- lastCheckpointAt: now,
4414
+ events,
4100
4415
  });
4101
4416
  } catch {
4102
- // Best-effort; the terminal update still carries the final logs.
4417
+ pendingLedgerEvents = [...events, ...pendingLedgerEvents];
4418
+ throw new Error('runtime run-ledger append failed');
4103
4419
  }
4420
+ })
4421
+ .catch(() => undefined);
4422
+ };
4423
+
4424
+ const flushTerminalLedgerEvents = async (
4425
+ terminalEvent: PlayRunLedgerEvent,
4426
+ ): Promise<void> => {
4427
+ if (!options?.persistResultDatasets) return;
4428
+ await ledgerFlushInFlight.catch(() => undefined);
4429
+ const now = nowMs();
4430
+ pendingLedgerEvents = [...pendingLedgerEvents, terminalEvent];
4431
+ const events = drainPendingLedgerEvents(now);
4432
+ if (events.length === 0) return;
4433
+ try {
4434
+ await postRuntimeApi(req.baseUrl, req.executorToken, {
4435
+ action: 'append_run_events',
4436
+ playId: req.runId,
4437
+ events,
4104
4438
  });
4439
+ } catch (error) {
4440
+ pendingLedgerEvents = [...events, ...pendingLedgerEvents];
4441
+ throw error;
4442
+ }
4443
+ };
4444
+
4445
+ const orderedNodes = buildOrderedNodeList(req.contractSnapshot);
4446
+ const stepLifecycle =
4447
+ orderedNodes.length > 0
4448
+ ? new PlayStepLifecycleTracker(
4449
+ orderedNodes,
4450
+ () => stepProgressByNodeId,
4451
+ appendStepLifecycleEvent,
4452
+ nowMs,
4453
+ )
4454
+ : null;
4455
+ const workerCallbacks: WorkerCtxCallbacks = {
4456
+ onNodeProgress: (input) => {
4457
+ updateStepProgress(input);
4458
+ flushLedgerEvents(false);
4459
+ },
4460
+ onMapStarted: (nodeId, at) => stepLifecycle?.onMapStarted(nodeId, at),
4461
+ onMapCompleted: (nodeId, at) => stepLifecycle?.onMapCompleted(nodeId, at),
4462
+ onToolCalled: (toolId, at) => stepLifecycle?.onToolCalled(toolId, at),
4463
+ onToolFailed: (toolId, at) => stepLifecycle?.onToolFailed(toolId, at),
4105
4464
  };
4106
4465
 
4107
4466
  const wrappedEmit = (event: RunnerEvent) => {
4108
4467
  if (event.type === 'log') {
4109
- appendLiveLog(event.message);
4110
- flushLiveLogs(false);
4468
+ appendRunLogLine(event.message);
4469
+ flushLedgerEvents(false);
4111
4470
  } else if (event.type === 'error') {
4112
4471
  // Sanitize the inbound message before it enters the live-log buffer.
4113
4472
  // The downstream `emit` still receives the raw event so the console /
4114
4473
  // NDJSON stream can keep its full debugging fidelity.
4115
4474
  const sanitizedMessage = redactSecretsFromLogString(event.message);
4116
- appendLiveLog(`[error] ${sanitizedMessage}`);
4117
- flushLiveLogs(true);
4475
+ appendRunLogLine(`[error] ${sanitizedMessage}`);
4476
+ flushLedgerEvents(true);
4118
4477
  }
4119
4478
  emit(event);
4120
4479
  };
4121
4480
 
4481
+ stepLifecycle?.markPreMapStepsStarted(startedAt);
4482
+ flushLedgerEvents(false);
4122
4483
  const ctx = createMinimalWorkerCtx(
4123
4484
  req,
4124
4485
  wrappedEmit,
4125
4486
  env,
4126
4487
  workflowStep,
4127
4488
  abortSignal,
4128
- { onNodeProgress: updateLiveNodeProgress },
4489
+ workerCallbacks,
4129
4490
  );
4130
4491
  try {
4131
4492
  const playStartedAt = nowMs();
@@ -4140,6 +4501,7 @@ async function executeRunRequest(
4140
4501
  phase: 'runner.play_function',
4141
4502
  ms: nowMs() - playStartedAt,
4142
4503
  });
4504
+ stepLifecycle?.markAllTerminal(nowMs());
4143
4505
  const serializeStartedAt = nowMs();
4144
4506
  const serializedResult = serializePlayReturnValue(result);
4145
4507
  recordRunnerPerfTrace({
@@ -4149,41 +4511,21 @@ async function executeRunRequest(
4149
4511
  });
4150
4512
  if (options?.persistResultDatasets) {
4151
4513
  const persistStartedAt = nowMs();
4152
- await liveLogFlushInFlight.catch(() => undefined);
4514
+ await ledgerFlushInFlight.catch(() => undefined);
4153
4515
  recordRunnerPerfTrace({
4154
4516
  req,
4155
- phase: 'runner.live_log_flush_wait',
4517
+ phase: 'runner.run_ledger_flush_wait',
4156
4518
  ms: nowMs() - persistStartedAt,
4157
4519
  });
4158
4520
  const resultDatasetStartedAt = nowMs();
4159
- await persistResultDatasets(req, serializedResult);
4521
+ await persistResultDatasets(req, result, serializedResult);
4160
4522
  recordRunnerPerfTrace({
4161
4523
  req,
4162
4524
  phase: 'runner.persist_result_datasets',
4163
4525
  ms: nowMs() - resultDatasetStartedAt,
4164
4526
  });
4165
4527
  const terminalResult = trimResultForStatus(serializedResult);
4166
- const terminalUpdateStartedAt = nowMs();
4167
- await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
4168
- action: 'update_run_status',
4169
- playId: req.runId,
4170
- status: 'completed',
4171
- error: null,
4172
- result: terminalResult,
4173
- runtimeBackend: 'cf_workflows_dynamic_worker',
4174
- waitKind: null,
4175
- waitUntil: null,
4176
- activeBoundaryId: null,
4177
- liveLogs,
4178
- liveNodeProgress: liveNodeProgressSnapshot(),
4179
- lastCheckpointAt: nowMs(),
4180
- });
4181
- recordRunnerPerfTrace({
4182
- req,
4183
- phase: 'runner.terminal_status_update',
4184
- ms: nowMs() - terminalUpdateStartedAt,
4185
- });
4186
-
4528
+ const terminalOccurredAt = nowMs();
4187
4529
  const billingStartedAt = nowMs();
4188
4530
  await finalizeWorkerComputeBilling({
4189
4531
  req,
@@ -4195,6 +4537,20 @@ async function executeRunRequest(
4195
4537
  phase: 'runner.compute_billing_finalize',
4196
4538
  ms: nowMs() - billingStartedAt,
4197
4539
  });
4540
+
4541
+ const terminalUpdateStartedAt = nowMs();
4542
+ await flushTerminalLedgerEvents({
4543
+ type: 'run.completed',
4544
+ runId: req.runId,
4545
+ source: 'worker',
4546
+ occurredAt: terminalOccurredAt,
4547
+ result: terminalResult,
4548
+ });
4549
+ recordRunnerPerfTrace({
4550
+ req,
4551
+ phase: 'runner.terminal_ledger_append',
4552
+ ms: nowMs() - terminalUpdateStartedAt,
4553
+ });
4198
4554
  }
4199
4555
  const parentSignalStartedAt = nowMs();
4200
4556
  await signalParentPlayTerminal({
@@ -4222,11 +4578,12 @@ async function executeRunRequest(
4222
4578
  playName: req.playName,
4223
4579
  result: serializedResult,
4224
4580
  outputRows: inferOutputRows(serializedResult),
4225
- liveLogs,
4226
- liveNodeProgress: liveNodeProgressSnapshot(),
4581
+ liveLogs: runLogBuffer,
4582
+ liveNodeProgress: stepProgressSnapshot(),
4227
4583
  durationMs: nowMs() - startedAt,
4228
4584
  };
4229
4585
  } catch (error) {
4586
+ stepLifecycle?.markStartedFailed(nowMs());
4230
4587
  const aborted = isAbortLikeError(error);
4231
4588
  if (aborted) {
4232
4589
  // Flip the controller so any concurrent user code observes the abort
@@ -4237,19 +4594,15 @@ async function executeRunRequest(
4237
4594
  }
4238
4595
  const message = error instanceof Error ? error.message : String(error);
4239
4596
  if (options?.persistResultDatasets) {
4240
- await liveLogFlushInFlight.catch(() => undefined);
4241
- await postRuntimeApiBestEffort(req.baseUrl, req.executorToken, {
4242
- action: 'update_run_status',
4243
- playId: req.runId,
4244
- status: aborted ? 'cancelled' : 'failed',
4597
+ appendRunLogLine(
4598
+ `${aborted ? '[cancelled]' : '[error]'} ${redactSecretsFromLogString(message)}`,
4599
+ );
4600
+ await flushTerminalLedgerEvents({
4601
+ type: aborted ? 'run.cancelled' : 'run.failed',
4602
+ runId: req.runId,
4603
+ source: 'worker',
4604
+ occurredAt: nowMs(),
4245
4605
  error: message,
4246
- runtimeBackend: 'cf_workflows_dynamic_worker',
4247
- waitKind: null,
4248
- waitUntil: null,
4249
- activeBoundaryId: null,
4250
- liveLogs,
4251
- liveNodeProgress: liveNodeProgressSnapshot(),
4252
- lastCheckpointAt: nowMs(),
4253
4606
  });
4254
4607
  await finalizeWorkerComputeBilling({
4255
4608
  req,
@@ -4338,6 +4691,12 @@ function runRequestFromWorkflowParams(
4338
4691
  ): RunRequest {
4339
4692
  const inputFile = isRecord(params.inputFile) ? params.inputFile : null;
4340
4693
  const fileName = String(inputFile?.name ?? inputFile?.path ?? 'input.csv');
4694
+ const inputStorageKey =
4695
+ typeof inputFile?.r2Key === 'string'
4696
+ ? inputFile.r2Key
4697
+ : typeof inputFile?.storageKey === 'string'
4698
+ ? inputFile.storageKey
4699
+ : null;
4341
4700
  return {
4342
4701
  runId: String(params.runId ?? ''),
4343
4702
  callbackUrl: String(params.baseUrl ?? ''),
@@ -4350,14 +4709,28 @@ function runRequestFromWorkflowParams(
4350
4709
  ? (params.input as Record<string, unknown>)
4351
4710
  : {},
4352
4711
  inlineCsv: isInlineCsv(params.inlineCsv) ? params.inlineCsv : null,
4353
- inputR2Keys:
4354
- inputFile && typeof inputFile.r2Key === 'string'
4355
- ? { [fileName]: inputFile.r2Key }
4712
+ inputFiles:
4713
+ inputFile && inputStorageKey
4714
+ ? {
4715
+ [fileName]: {
4716
+ logicalPath: String(inputFile.logicalPath ?? inputFile.path ?? fileName),
4717
+ fileName,
4718
+ storageKey: inputStorageKey,
4719
+ contentType:
4720
+ typeof inputFile.contentType === 'string'
4721
+ ? inputFile.contentType
4722
+ : null,
4723
+ bytes: normalizeExpectedBytes(inputFile.bytes),
4724
+ },
4725
+ }
4356
4726
  : null,
4357
4727
  packagedFiles: Array.isArray(params.packagedFiles)
4358
4728
  ? params.packagedFiles.filter(isRecord).map((file) => ({
4359
4729
  playPath: String(file.playPath ?? ''),
4360
4730
  storageKey: String(file.storageKey ?? ''),
4731
+ contentType:
4732
+ typeof file.contentType === 'string' ? file.contentType : null,
4733
+ bytes: normalizeExpectedBytes(file.bytes),
4361
4734
  }))
4362
4735
  : null,
4363
4736
  partitionRange: null,
@@ -4425,11 +4798,39 @@ function isPlayCallGovernanceSnapshot(
4425
4798
  async function persistResultDatasets(
4426
4799
  req: RunRequest,
4427
4800
  result: unknown,
4801
+ serializedResult: unknown,
4428
4802
  ): Promise<void> {
4429
- const datasets = collectDatasetEnvelopes(result);
4803
+ const persistedNamespaces = new Set<string>();
4804
+ for (const dataset of collectDatasetHandles(result)) {
4805
+ if (dataset.datasetKind === 'map') continue;
4806
+ let inputOffset = 0;
4807
+ for await (const chunk of iterDatasetChunks(
4808
+ dataset.handle,
4809
+ RESULT_DATASET_PERSIST_CHUNK_ROWS,
4810
+ )) {
4811
+ if (chunk.length === 0) continue;
4812
+ await harnessStartSheetDataset({
4813
+ baseUrl: req.baseUrl,
4814
+ executorToken: req.executorToken,
4815
+ playName: req.playName,
4816
+ tableNamespace: dataset.tableNamespace,
4817
+ sheetContract: requireSheetContract(req, dataset.tableNamespace),
4818
+ rows: chunk.map((row) => ({ ...row })),
4819
+ runId: req.runId,
4820
+ inputOffset,
4821
+ userEmail: req.userEmail,
4822
+ preloadedDbSessions: req.preloadedDbSessions ?? null,
4823
+ });
4824
+ inputOffset += chunk.length;
4825
+ }
4826
+ persistedNamespaces.add(dataset.tableNamespace);
4827
+ }
4828
+
4829
+ const datasets = collectDatasetEnvelopes(serializedResult);
4430
4830
  for (const dataset of datasets) {
4431
4831
  if (dataset.datasetKind === 'map') continue;
4432
4832
  if (dataset.rows.length === 0) continue;
4833
+ if (persistedNamespaces.has(dataset.tableNamespace)) continue;
4433
4834
  await harnessStartSheetDataset({
4434
4835
  baseUrl: req.baseUrl,
4435
4836
  executorToken: req.executorToken,
@@ -4438,12 +4839,63 @@ async function persistResultDatasets(
4438
4839
  sheetContract: requireSheetContract(req, dataset.tableNamespace),
4439
4840
  rows: dataset.rows,
4440
4841
  runId: req.runId,
4842
+ inputOffset: 0,
4441
4843
  userEmail: req.userEmail,
4442
4844
  preloadedDbSessions: req.preloadedDbSessions ?? null,
4443
4845
  });
4444
4846
  }
4445
4847
  }
4446
4848
 
4849
+ const RESULT_DATASET_PERSIST_CHUNK_ROWS = 5_000;
4850
+
4851
+ function collectDatasetHandles(value: unknown): Array<{
4852
+ tableNamespace: string;
4853
+ datasetKind: 'csv' | 'map' | null;
4854
+ handle: WorkerDatasetHandle<Record<string, unknown>>;
4855
+ }> {
4856
+ const datasets: Array<{
4857
+ tableNamespace: string;
4858
+ datasetKind: 'csv' | 'map' | null;
4859
+ handle: WorkerDatasetHandle<Record<string, unknown>>;
4860
+ }> = [];
4861
+ const seen = new WeakSet<object>();
4862
+ const walk = (candidate: unknown, depth: number) => {
4863
+ if (depth > 12 || candidate == null) return;
4864
+ if (isDatasetHandle(candidate)) {
4865
+ const metadata = candidate.toJSON() as Record<string, unknown>;
4866
+ const tableNamespace =
4867
+ typeof metadata.tableNamespace === 'string'
4868
+ ? metadata.tableNamespace
4869
+ : null;
4870
+ const datasetKind =
4871
+ metadata.datasetKind === 'csv' || metadata.datasetKind === 'map'
4872
+ ? metadata.datasetKind
4873
+ : null;
4874
+ if (tableNamespace) {
4875
+ datasets.push({
4876
+ tableNamespace,
4877
+ datasetKind,
4878
+ handle: candidate as WorkerDatasetHandle<Record<string, unknown>>,
4879
+ });
4880
+ }
4881
+ return;
4882
+ }
4883
+ if (Array.isArray(candidate)) {
4884
+ for (const item of candidate) walk(item, depth + 1);
4885
+ return;
4886
+ }
4887
+ if (typeof candidate !== 'object') return;
4888
+ const object = candidate as Record<string, unknown>;
4889
+ if (seen.has(object)) return;
4890
+ seen.add(object);
4891
+ for (const child of Object.values(object)) {
4892
+ walk(child, depth + 1);
4893
+ }
4894
+ };
4895
+ walk(value, 0);
4896
+ return datasets;
4897
+ }
4898
+
4447
4899
  function serializePlayReturnValue(value: unknown): unknown {
4448
4900
  return serializeValue(value, 0);
4449
4901
  }
@@ -4498,64 +4950,10 @@ function trimResultShape(value: unknown): unknown {
4498
4950
 
4499
4951
  function serializeValue(value: unknown, depth: number): unknown {
4500
4952
  if (depth > 20 || value == null) return value;
4953
+ if (isDatasetHandle(value)) {
4954
+ return serializeValue(value.toJSON(), depth + 1);
4955
+ }
4501
4956
  if (Array.isArray(value)) {
4502
- const tableNamespace =
4503
- typeof (value as unknown as { tableNamespace?: unknown })
4504
- .tableNamespace === 'string'
4505
- ? (value as unknown as { tableNamespace: string }).tableNamespace
4506
- : null;
4507
- const datasetId =
4508
- typeof (value as unknown as { datasetId?: unknown }).datasetId ===
4509
- 'string'
4510
- ? (value as unknown as { datasetId: string }).datasetId
4511
- : null;
4512
- const datasetCount =
4513
- typeof (value as unknown as { __deeplineDatasetCount?: unknown })
4514
- .__deeplineDatasetCount === 'number'
4515
- ? (value as unknown as { __deeplineDatasetCount: number })
4516
- .__deeplineDatasetCount
4517
- : value.length;
4518
- const datasetKind =
4519
- (value as unknown as { __deeplineDatasetKind?: unknown })
4520
- .__deeplineDatasetKind === 'csv'
4521
- ? 'csv'
4522
- : 'map';
4523
- const cacheSummary =
4524
- typeof (value as unknown as { __deeplineCacheSummary?: unknown })
4525
- .__deeplineCacheSummary === 'string'
4526
- ? (value as unknown as { __deeplineCacheSummary: string })
4527
- .__deeplineCacheSummary
4528
- : null;
4529
- const workProgress = isRecord(
4530
- (value as unknown as { __deeplineWorkProgress?: unknown })
4531
- .__deeplineWorkProgress,
4532
- )
4533
- ? (
4534
- value as unknown as {
4535
- __deeplineWorkProgress: Record<string, unknown>;
4536
- }
4537
- ).__deeplineWorkProgress
4538
- : null;
4539
- const previewRows = value
4540
- .slice(0, 5)
4541
- .map((row) => serializeValue(row, depth + 1))
4542
- .filter(isRecord);
4543
- if (tableNamespace && datasetId) {
4544
- const columns = inferColumns(
4545
- value.map((row) => serializeValue(row, depth + 1)).filter(isRecord),
4546
- );
4547
- return {
4548
- kind: 'dataset' as const,
4549
- datasetKind,
4550
- datasetId,
4551
- count: datasetCount,
4552
- columns,
4553
- preview: previewRows,
4554
- tableNamespace,
4555
- ...(cacheSummary ? { cacheSummary } : {}),
4556
- ...(workProgress ? { _metadata: { workProgress } } : {}),
4557
- };
4558
- }
4559
4957
  return value.map((entry) => serializeValue(entry, depth + 1));
4560
4958
  }
4561
4959
  if (typeof value !== 'object') return value;
@@ -4566,16 +4964,6 @@ function serializeValue(value: unknown, depth: number): unknown {
4566
4964
  return out;
4567
4965
  }
4568
4966
 
4569
- function inferColumns(rows: ReadonlyArray<Record<string, unknown>>): string[] {
4570
- const columns = new Set<string>();
4571
- for (const row of rows) {
4572
- for (const key of Object.keys(row)) {
4573
- columns.add(key);
4574
- }
4575
- }
4576
- return [...columns];
4577
- }
4578
-
4579
4967
  function collectDatasetEnvelopes(value: unknown): Array<{
4580
4968
  tableNamespace: string;
4581
4969
  datasetKind: 'csv' | 'map' | null;
@@ -4714,10 +5102,17 @@ export class TenantWorkflow extends WorkflowEntrypoint<
4714
5102
  // user via tail/SSE. Retry with backoff before giving up; if we drop
4715
5103
  // it, the user is stuck staring at the opaque CF reference id.
4716
5104
  const errorPayload = JSON.stringify({
4717
- action: 'update_run_status',
5105
+ action: 'append_run_events',
4718
5106
  playId: req.runId,
4719
- status: 'failed',
4720
- error: `TenantWorkflow.run threw: ${detail.name ?? 'Error'}: ${detail.message}\n${detail.stack ?? ''}`,
5107
+ events: [
5108
+ {
5109
+ type: 'run.failed',
5110
+ runId: req.runId,
5111
+ source: 'worker',
5112
+ occurredAt: nowMs(),
5113
+ error: `TenantWorkflow.run threw: ${detail.name ?? 'Error'}: ${detail.message}\n${detail.stack ?? ''}`,
5114
+ } satisfies PlayRunLedgerEvent,
5115
+ ],
4721
5116
  });
4722
5117
  const backoffMs = [200, 500, 1500];
4723
5118
  let lastCallbackError: unknown = null;
@@ -4850,22 +5245,18 @@ function inferOutputRows(result: unknown): number {
4850
5245
  const datasets: number[] = [];
4851
5246
  const walk = (value: unknown, depth: number) => {
4852
5247
  if (depth > 6 || value == null) return;
5248
+ if (isDatasetHandle(value)) {
5249
+ datasets.push(value.toJSON().count);
5250
+ return;
5251
+ }
4853
5252
  if (Array.isArray(value)) {
4854
5253
  for (const item of value) walk(item, depth + 1);
4855
5254
  return;
4856
5255
  }
4857
5256
  if (typeof value !== 'object') return;
4858
5257
  const record = value as Record<string, unknown>;
4859
- if (
4860
- typeof record.tableNamespace === 'string' &&
4861
- (typeof record.count === 'number' ||
4862
- typeof record.__deeplineDatasetCount === 'number')
4863
- ) {
4864
- datasets.push(
4865
- typeof record.count === 'number'
4866
- ? record.count
4867
- : Number(record.__deeplineDatasetCount),
4868
- );
5258
+ if (typeof record.tableNamespace === 'string' && typeof record.count === 'number') {
5259
+ datasets.push(record.count);
4869
5260
  }
4870
5261
  for (const [key, child] of Object.entries(record)) {
4871
5262
  if (key === 'preview') continue;