@ekairos/dataset 1.22.78-beta.development.0 → 1.22.80-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,82 @@
1
- import type { AnyDatasetRuntime, DatasetBuilderState, InternalSource } from "./types.js";
1
+ import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalSource } from "./types.js";
2
+ import type { SandboxState } from "../file/file-dataset.types.js";
3
+ import type { FilePreviewContext } from "../file/filepreview.types.js";
4
+ import type { TransformSandboxState, TransformSourcePreviewContext } from "../transform/transform-dataset.types.js";
2
5
  export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
6
+ type PreparedFileDatasetContext = {
7
+ kind: "file";
8
+ datasetId: string;
9
+ sandboxId: string;
10
+ fileId: string;
11
+ sandboxState: SandboxState;
12
+ filePreview?: FilePreviewContext;
13
+ schema?: DatasetSchemaInput | null;
14
+ };
15
+ type PreparedTransformDatasetContext = {
16
+ kind: "transform";
17
+ datasetId: string;
18
+ sandboxId: string;
19
+ sourceDatasetIds: string[];
20
+ outputSchema: DatasetSchemaInput;
21
+ sandboxState: TransformSandboxState;
22
+ sourcePreviews?: Array<{
23
+ datasetId: string;
24
+ preview: TransformSourcePreviewContext;
25
+ }>;
26
+ };
27
+ type PreparedDatasetContext = PreparedFileDatasetContext | PreparedTransformDatasetContext;
28
+ type DatasetContextInitialization = PreparedDatasetContext & {
29
+ prompt: string;
30
+ instructions?: string;
31
+ };
32
+ export declare function initializeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
33
+ runtime: Runtime;
34
+ datasetId: string;
35
+ sandboxId: string;
36
+ title?: string;
37
+ instructions?: string;
38
+ sources: any[];
39
+ sourceKinds: string[];
40
+ schema?: DatasetSchemaInput;
41
+ }): Promise<{
42
+ datasetId: string;
43
+ sandboxId: string;
44
+ }>;
45
+ export declare function prepareDatasetSourcesStep<Runtime extends AnyDatasetRuntime>(params: {
46
+ kind: "file";
47
+ runtime: Runtime;
48
+ datasetId: string;
49
+ sandboxId: string;
50
+ source: Extract<InternalSource, {
51
+ kind: "file" | "text";
52
+ }>;
53
+ schema?: DatasetSchemaInput;
54
+ } | {
55
+ kind: "transform";
56
+ runtime: Runtime;
57
+ datasetId: string;
58
+ sandboxId: string;
59
+ sourceDatasetIds: string[];
60
+ outputSchema: DatasetSchemaInput;
61
+ }): Promise<PreparedDatasetContext>;
62
+ export declare function initializeDatasetContextStep(params: {
63
+ prepared: PreparedDatasetContext;
64
+ instructions?: string;
65
+ outputSchema?: DatasetSchemaInput;
66
+ }): Promise<DatasetContextInitialization>;
67
+ export declare function completeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
68
+ runtime: Runtime;
69
+ datasetId: string;
70
+ schema?: DatasetSchemaInput;
71
+ first: boolean;
72
+ }): Promise<{
73
+ datasetId: string;
74
+ dataset: any;
75
+ previewRows: any[];
76
+ firstRow: any;
77
+ }>;
3
78
  export declare function materializeSingleFileLikeSource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, source: Extract<InternalSource, {
4
79
  kind: "file" | "text";
5
80
  }>, targetDatasetId: string): Promise<string>;
6
81
  export declare function materializeDerivedDataset<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, targetDatasetId: string): Promise<string>;
82
+ export {};
@@ -1,12 +1,16 @@
1
1
  import { createFileParseContext } from "../file/file-dataset.agent.js";
2
+ import { readInstantFileStep } from "../file/steps.js";
3
+ import { generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "../file/file-dataset.steps.js";
2
4
  import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
3
- import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
5
+ import { ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "../transform/transform-dataset.steps.js";
6
+ import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
7
+ import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
4
8
  import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
5
9
  import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
6
10
  import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
7
11
  import { getDomainDescriptor } from "./sourceRows.js";
8
12
  import { materializeQuerySource } from "./materializeQuery.js";
9
- import { createDatasetSandboxStep } from "../sandbox/steps.js";
13
+ import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
10
14
  function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
11
15
  return `${targetDatasetId}__${sourceKind}_${index}`;
12
16
  }
@@ -41,6 +45,162 @@ function materializeRawTextRows(source) {
41
45
  }
42
46
  return [{ text }];
43
47
  }
48
+ function parseContentDispositionFileName(value) {
49
+ const text = String(value ?? "");
50
+ const utf8Match = /filename\*=UTF-8''([^;]+)/i.exec(text);
51
+ if (utf8Match?.[1]) {
52
+ try {
53
+ return decodeURIComponent(utf8Match[1]).trim();
54
+ }
55
+ catch {
56
+ return utf8Match[1].trim();
57
+ }
58
+ }
59
+ const quotedMatch = /filename="([^"]+)"/i.exec(text);
60
+ if (quotedMatch?.[1])
61
+ return quotedMatch[1].trim();
62
+ const plainMatch = /filename=([^;]+)/i.exec(text);
63
+ if (plainMatch?.[1])
64
+ return plainMatch[1].trim();
65
+ return "";
66
+ }
67
+ function isPdfContentDisposition(value) {
68
+ const text = String(value ?? "").toLowerCase();
69
+ return text.includes("application/pdf") || text.includes(".pdf");
70
+ }
71
+ function sanitizePdfFileName(value, fallback) {
72
+ const name = String(value ?? "").trim() || fallback;
73
+ const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
74
+ return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned || fallback}.pdf`;
75
+ }
76
+ function pdfTextRowsSchema() {
77
+ return {
78
+ title: "PdfTextPage",
79
+ description: "Extracted PDF page text",
80
+ schema: {
81
+ type: "object",
82
+ additionalProperties: false,
83
+ required: ["fileId", "fileName", "pageNumber", "text"],
84
+ properties: {
85
+ fileId: { type: "string" },
86
+ fileName: { type: "string" },
87
+ pageNumber: { type: "number" },
88
+ text: { type: "string" },
89
+ },
90
+ },
91
+ };
92
+ }
93
+ function parseJsonlDataRows(content) {
94
+ return String(content ?? "")
95
+ .split(/\r?\n/g)
96
+ .map((line) => line.trim())
97
+ .filter(Boolean)
98
+ .map((line) => JSON.parse(line))
99
+ .map((record) => record?.data)
100
+ .filter((row) => row && typeof row === "object" && !Array.isArray(row));
101
+ }
102
+ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
103
+ const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
104
+ if (!isPdfContentDisposition(file.contentDisposition))
105
+ return null;
106
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
107
+ const outputPath = getDatasetOutputPath(targetDatasetId);
108
+ const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
109
+ const sourcePath = `${getDatasetSourcesDir(targetDatasetId)}/${fileName}`;
110
+ const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
111
+ await runDatasetSandboxCommandStep({
112
+ runtime: state.runtime,
113
+ sandboxId,
114
+ cmd: "mkdir",
115
+ args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
116
+ });
117
+ await writeDatasetSandboxFilesStep({
118
+ runtime: state.runtime,
119
+ sandboxId,
120
+ files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
121
+ });
122
+ const install = await runDatasetSandboxCommandStep({
123
+ runtime: state.runtime,
124
+ sandboxId,
125
+ cmd: "python",
126
+ args: ["-m", "pip", "install", "pypdf", "--quiet"],
127
+ });
128
+ if (install.exitCode !== 0) {
129
+ throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
130
+ }
131
+ await writeDatasetSandboxTextFilesStep({
132
+ runtime: state.runtime,
133
+ sandboxId,
134
+ files: [
135
+ {
136
+ path: scriptPath,
137
+ content: [
138
+ "from pathlib import Path",
139
+ "import json",
140
+ "import sys",
141
+ "from pypdf import PdfReader",
142
+ "",
143
+ "source_path = Path(sys.argv[1])",
144
+ "output_path = Path(sys.argv[2])",
145
+ "file_id = sys.argv[3]",
146
+ "file_name = sys.argv[4]",
147
+ "reader = PdfReader(str(source_path))",
148
+ "rows = 0",
149
+ "with output_path.open('w', encoding='utf-8') as out:",
150
+ " for index, page in enumerate(reader.pages, start=1):",
151
+ " text = page.extract_text() or ''",
152
+ " text = text.replace('\\x00', '').strip()",
153
+ " if not text:",
154
+ " continue",
155
+ " data = {",
156
+ " 'fileId': file_id,",
157
+ " 'fileName': file_name,",
158
+ " 'pageNumber': index,",
159
+ " 'text': text,",
160
+ " }",
161
+ " out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
162
+ " rows += 1",
163
+ " if rows == 0:",
164
+ " data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
165
+ " out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
166
+ " rows = 1",
167
+ "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
168
+ "",
169
+ ].join("\n"),
170
+ },
171
+ ],
172
+ });
173
+ const extraction = await runDatasetSandboxCommandStep({
174
+ runtime: state.runtime,
175
+ sandboxId,
176
+ cmd: "python",
177
+ args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
178
+ });
179
+ if (extraction.exitCode !== 0) {
180
+ throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
181
+ }
182
+ const output = await readDatasetSandboxTextFileStep({
183
+ runtime: state.runtime,
184
+ sandboxId,
185
+ path: outputPath,
186
+ });
187
+ const rows = parseJsonlDataRows(output.content);
188
+ if (rows.length === 0) {
189
+ throw new Error("dataset_pdf_text_extraction_empty");
190
+ }
191
+ await materializeRowsToDataset(state.runtime, {
192
+ datasetId: targetDatasetId,
193
+ sandboxId,
194
+ title: state.title ?? fileName,
195
+ instructions: state.instructions,
196
+ sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
197
+ sourceKinds: ["file"],
198
+ rows,
199
+ schema: pdfTextRowsSchema(),
200
+ first: state.first,
201
+ });
202
+ return targetDatasetId;
203
+ }
44
204
  async function materializeRawTextSource(state, source, targetDatasetId) {
45
205
  const rows = materializeRawTextRows(source);
46
206
  await materializeRowsToDataset(state.runtime, {
@@ -63,24 +223,11 @@ async function materializeRawTextSource(state, source, targetDatasetId) {
63
223
  });
64
224
  return targetDatasetId;
65
225
  }
66
- async function resolveDatasetSandboxId(state, targetDatasetId) {
226
+ function resolveDatasetSandboxId(state, _targetDatasetId) {
67
227
  const sandboxId = String(state.sandboxId ?? "").trim();
68
228
  if (sandboxId)
69
229
  return sandboxId;
70
- const created = await createDatasetSandboxStep({
71
- runtime: state.runtime,
72
- provider: "vercel",
73
- sandboxRuntime: "python3.13",
74
- timeoutMs: 20 * 60 * 1000,
75
- resources: { vcpus: 2 },
76
- purpose: "dataset.materialize",
77
- params: { datasetId: targetDatasetId },
78
- vercel: {
79
- profile: "ephemeral",
80
- deleteOnStop: true,
81
- },
82
- });
83
- return created.sandboxId;
230
+ throw new Error("dataset_sandbox_required");
84
231
  }
85
232
  export async function resolveDatasetAgentDurable(requestedDurable) {
86
233
  if (!requestedDurable)
@@ -96,15 +243,150 @@ export async function resolveDatasetAgentDurable(requestedDurable) {
96
243
  }
97
244
  return true;
98
245
  }
246
+ export async function initializeDatasetStep(params) {
247
+ "use step";
248
+ await createOrUpdateDatasetMetadata(params.runtime, {
249
+ datasetId: params.datasetId,
250
+ sandboxId: params.sandboxId,
251
+ title: params.title ?? params.datasetId,
252
+ instructions: params.instructions,
253
+ sources: params.sources,
254
+ sourceKinds: params.sourceKinds,
255
+ schema: params.schema,
256
+ status: "building",
257
+ });
258
+ return {
259
+ datasetId: params.datasetId,
260
+ sandboxId: params.sandboxId,
261
+ };
262
+ }
263
+ export async function prepareDatasetSourcesStep(params) {
264
+ "use step";
265
+ if (params.kind === "file") {
266
+ const fileId = params.source.kind === "file"
267
+ ? params.source.fileId
268
+ : await uploadInlineTextSource(params.runtime, params.datasetId, params.source);
269
+ const initialized = await initializeFileParseSandboxStep({
270
+ runtime: params.runtime,
271
+ sandboxId: params.sandboxId,
272
+ datasetId: params.datasetId,
273
+ fileId,
274
+ state: { initialized: false, filePath: "" },
275
+ });
276
+ const filePreview = await generateFileParsePreviewStep({
277
+ runtime: params.runtime,
278
+ sandboxId: params.sandboxId,
279
+ sandboxFilePath: initialized.filePath,
280
+ datasetId: params.datasetId,
281
+ });
282
+ return {
283
+ kind: "file",
284
+ datasetId: params.datasetId,
285
+ sandboxId: params.sandboxId,
286
+ fileId,
287
+ sandboxState: initialized.state,
288
+ filePreview,
289
+ schema: params.schema ?? null,
290
+ };
291
+ }
292
+ const initialized = await ensureTransformSourcesInSandboxStep({
293
+ runtime: params.runtime,
294
+ sandboxId: params.sandboxId,
295
+ datasetId: params.datasetId,
296
+ sourceDatasetIds: params.sourceDatasetIds,
297
+ state: { initialized: false, sourcePaths: [] },
298
+ });
299
+ const sourcePreviews = await generateTransformSourcePreviewsStep({
300
+ runtime: params.runtime,
301
+ sandboxId: params.sandboxId,
302
+ datasetId: params.datasetId,
303
+ sourcePaths: initialized.sourcePaths,
304
+ });
305
+ return {
306
+ kind: "transform",
307
+ datasetId: params.datasetId,
308
+ sandboxId: params.sandboxId,
309
+ sourceDatasetIds: params.sourceDatasetIds,
310
+ outputSchema: params.outputSchema,
311
+ sandboxState: initialized.state,
312
+ sourcePreviews,
313
+ };
314
+ }
315
+ export async function initializeDatasetContextStep(params) {
316
+ "use step";
317
+ if (params.prepared.kind === "file") {
318
+ return {
319
+ ...params.prepared,
320
+ instructions: params.instructions ?? buildFileDefaultInstructions(params.outputSchema),
321
+ prompt: "generate a dataset for this file",
322
+ };
323
+ }
324
+ return {
325
+ ...params.prepared,
326
+ instructions: params.instructions,
327
+ prompt: params.prepared.sourceDatasetIds.length === 1
328
+ ? "Transform the source dataset into a new dataset matching the provided output schema"
329
+ : `Transform ${params.prepared.sourceDatasetIds.length} source datasets into a new dataset matching the provided output schema`,
330
+ };
331
+ }
332
+ export async function completeDatasetStep(params) {
333
+ "use step";
334
+ let datasetResult = await datasetGetByIdStep({
335
+ runtime: params.runtime,
336
+ datasetId: params.datasetId,
337
+ });
338
+ if (!datasetResult.ok)
339
+ throw new Error(datasetResult.error);
340
+ if (!params.schema && !datasetResult.data?.schema) {
341
+ await datasetInferAndUpdateSchemaStep({
342
+ runtime: params.runtime,
343
+ datasetId: params.datasetId,
344
+ title: `${params.datasetId}Row`,
345
+ description: "One dataset row",
346
+ });
347
+ datasetResult = await datasetGetByIdStep({
348
+ runtime: params.runtime,
349
+ datasetId: params.datasetId,
350
+ });
351
+ if (!datasetResult.ok)
352
+ throw new Error(datasetResult.error);
353
+ }
354
+ const previewResult = await datasetPreviewRowsStep({
355
+ runtime: params.runtime,
356
+ datasetId: params.datasetId,
357
+ limit: 20,
358
+ });
359
+ if (!params.first) {
360
+ return {
361
+ datasetId: params.datasetId,
362
+ dataset: datasetResult.data,
363
+ previewRows: previewResult.rows,
364
+ firstRow: undefined,
365
+ };
366
+ }
367
+ const firstResult = await datasetReadOneStep({
368
+ runtime: params.runtime,
369
+ datasetId: params.datasetId,
370
+ });
371
+ return {
372
+ datasetId: params.datasetId,
373
+ dataset: datasetResult.data,
374
+ previewRows: previewResult.rows,
375
+ firstRow: firstResult.row,
376
+ };
377
+ }
99
378
  export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
379
+ if (source.kind === "file" && !state.outputSchema) {
380
+ const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
381
+ if (materializedPdf)
382
+ return materializedPdf;
383
+ }
384
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
100
385
  if (!state.reactor) {
101
386
  throw new Error("dataset_reactor_required");
102
387
  }
103
- const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
104
- const fileId = source.kind === "file"
105
- ? source.fileId
106
- : await uploadInlineTextSource(state.runtime, targetDatasetId, source);
107
- await createOrUpdateDatasetMetadata(state.runtime, {
388
+ await initializeDatasetStep({
389
+ runtime: state.runtime,
108
390
  datasetId: targetDatasetId,
109
391
  sandboxId,
110
392
  title: state.title ?? targetDatasetId,
@@ -121,28 +403,45 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
121
403
  ],
122
404
  sourceKinds: [source.kind],
123
405
  schema: state.outputSchema,
124
- status: "building",
125
406
  });
126
- const parseContext = createFileParseContext(fileId, {
407
+ const prepared = await prepareDatasetSourcesStep({
408
+ kind: "file",
409
+ runtime: state.runtime,
127
410
  datasetId: targetDatasetId,
128
- instructions: state.instructions ?? buildFileDefaultInstructions(state.outputSchema),
129
- reactor: state.reactor,
130
411
  sandboxId,
412
+ source,
413
+ schema: state.outputSchema,
414
+ });
415
+ const context = await initializeDatasetContextStep({
416
+ prepared,
417
+ instructions: state.instructions,
418
+ outputSchema: state.outputSchema,
419
+ });
420
+ if (context.kind !== "file") {
421
+ throw new Error("dataset_context_kind_mismatch:file");
422
+ }
423
+ const parseContext = createFileParseContext(context.fileId, {
424
+ datasetId: context.datasetId,
425
+ instructions: context.instructions,
426
+ reactor: state.reactor,
427
+ sandboxId: context.sandboxId,
428
+ sandboxState: context.sandboxState,
429
+ filePreview: context.filePreview,
430
+ schema: context.schema,
131
431
  });
132
432
  await parseContext.parse(state.runtime, {
133
433
  durable: await resolveDatasetAgentDurable(state.durable),
434
+ prompt: context.prompt,
435
+ initialContent: {
436
+ datasetId: context.datasetId,
437
+ fileId: context.fileId,
438
+ instructions: context.instructions ?? "",
439
+ sandboxId: context.sandboxId,
440
+ sandboxState: context.sandboxState,
441
+ filePreview: context.filePreview,
442
+ schema: context.schema,
443
+ },
134
444
  });
135
- if (!state.outputSchema) {
136
- await datasetInferAndUpdateSchemaStep({
137
- runtime: state.runtime,
138
- datasetId: targetDatasetId,
139
- title: `${targetDatasetId}Row`,
140
- description: "One dataset row",
141
- });
142
- }
143
- if (state.first) {
144
- await datasetReadOneStep({ runtime: state.runtime, datasetId: targetDatasetId });
145
- }
146
445
  return targetDatasetId;
147
446
  }
148
447
  async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
@@ -181,7 +480,7 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
181
480
  if (!state.reactor) {
182
481
  throw new Error("dataset_reactor_required");
183
482
  }
184
- const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
483
+ const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
185
484
  const stateWithSandbox = { ...state, sandboxId };
186
485
  const normalizedSources = [];
187
486
  for (let index = 0; index < stateWithSandbox.sources.length; index++) {
@@ -197,7 +496,8 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
197
496
  properties: {},
198
497
  },
199
498
  };
200
- await createOrUpdateDatasetMetadata(stateWithSandbox.runtime, {
499
+ await initializeDatasetStep({
500
+ runtime: stateWithSandbox.runtime,
201
501
  datasetId: targetDatasetId,
202
502
  sandboxId,
203
503
  title: stateWithSandbox.title ?? targetDatasetId,
@@ -213,30 +513,46 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
213
513
  : source),
214
514
  sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
215
515
  schema: transformSchema,
216
- status: "building",
217
516
  });
218
- const transformContext = createTransformDatasetContext({
517
+ const prepared = await prepareDatasetSourcesStep({
518
+ kind: "transform",
519
+ runtime: stateWithSandbox.runtime,
520
+ datasetId: targetDatasetId,
521
+ sandboxId,
219
522
  sourceDatasetIds: normalizedSources,
220
523
  outputSchema: transformSchema,
524
+ });
525
+ const context = await initializeDatasetContextStep({
526
+ prepared,
221
527
  instructions: buildTransformInstructions(normalizedSources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
222
- datasetId: targetDatasetId,
528
+ outputSchema: transformSchema,
529
+ });
530
+ if (context.kind !== "transform") {
531
+ throw new Error("dataset_context_kind_mismatch:transform");
532
+ }
533
+ const transformContext = createTransformDatasetContext({
534
+ sourceDatasetIds: context.sourceDatasetIds,
535
+ outputSchema: context.outputSchema,
536
+ instructions: context.instructions,
537
+ datasetId: context.datasetId,
223
538
  reactor: stateWithSandbox.reactor,
224
- sandboxId,
539
+ sandboxId: context.sandboxId,
540
+ sandboxState: context.sandboxState,
541
+ sourcePreviews: context.sourcePreviews,
225
542
  });
226
543
  await transformContext.transform(stateWithSandbox.runtime, {
227
544
  durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
545
+ prompt: context.prompt,
546
+ initialContent: {
547
+ datasetId: context.datasetId,
548
+ sourceDatasetIds: context.sourceDatasetIds,
549
+ outputSchema: context.outputSchema,
550
+ instructions: context.instructions,
551
+ sandboxId: context.sandboxId,
552
+ sandboxState: context.sandboxState,
553
+ sourcePreviews: context.sourcePreviews,
554
+ },
228
555
  });
229
- if (!stateWithSandbox.outputSchema) {
230
- await datasetInferAndUpdateSchemaStep({
231
- runtime: stateWithSandbox.runtime,
232
- datasetId: targetDatasetId,
233
- title: `${targetDatasetId}Row`,
234
- description: "One dataset row",
235
- });
236
- }
237
- if (stateWithSandbox.first) {
238
- await datasetReadOneStep({ runtime: stateWithSandbox.runtime, datasetId: targetDatasetId });
239
- }
240
556
  return targetDatasetId;
241
557
  }
242
558
  registerDatasetAgentMaterializers({
@@ -15,3 +15,9 @@ export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDataset
15
15
  export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
16
16
  export declare function uploadInlineTextSource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, source: DatasetTextSourceInput): Promise<string>;
17
17
  export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
18
+ export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
19
+ datasetId: string;
20
+ dataset: any;
21
+ previewRows: any[];
22
+ firstRow?: any | null;
23
+ }): DatasetBuildResult;
@@ -128,3 +128,25 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
128
128
  firstRow: firstResult.row,
129
129
  };
130
130
  }
131
+ export function createDatasetBuildResult(runtime, params) {
132
+ const reader = {
133
+ async read(cursorOrParams, limit) {
134
+ const readParams = typeof cursorOrParams === "object" && cursorOrParams !== null
135
+ ? cursorOrParams
136
+ : { cursor: cursorOrParams, limit };
137
+ return await datasetReadRowsStep({
138
+ runtime,
139
+ datasetId: params.datasetId,
140
+ cursor: readParams.cursor,
141
+ limit: readParams.limit,
142
+ });
143
+ },
144
+ };
145
+ return {
146
+ datasetId: params.datasetId,
147
+ dataset: params.dataset,
148
+ previewRows: params.previewRows,
149
+ reader,
150
+ ...(params.firstRow !== undefined ? { firstRow: params.firstRow } : {}),
151
+ };
152
+ }