@ekairos/dataset 1.22.83-beta.development.0 → 1.22.85-beta.development.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/builder/agentMaterializers.d.ts +2 -2
  2. package/dist/builder/context.d.ts +7 -0
  3. package/dist/builder/context.js +192 -0
  4. package/dist/builder/instructions.d.ts +3 -3
  5. package/dist/builder/instructions.js +10 -10
  6. package/dist/builder/materialize.d.ts +10 -11
  7. package/dist/builder/materialize.js +116 -113
  8. package/dist/builder/materializeQuery.d.ts +3 -2
  9. package/dist/builder/materializeQuery.js +10 -19
  10. package/dist/builder/persistence.d.ts +4 -5
  11. package/dist/builder/persistence.js +20 -19
  12. package/dist/builder/types.d.ts +29 -24
  13. package/dist/completeDataset.steps.js +1 -1
  14. package/dist/dataset.d.ts +1 -1
  15. package/dist/dataset.js +42 -29
  16. package/dist/datasetFiles.d.ts +1 -1
  17. package/dist/datasetFiles.js +3 -3
  18. package/dist/file/file-dataset.agent.js +3 -4
  19. package/dist/file/prompts.js +12 -12
  20. package/dist/materializeDataset.tool.d.ts +34 -26
  21. package/dist/materializeDataset.tool.js +40 -29
  22. package/dist/schema.d.ts +12 -2
  23. package/dist/schema.js +6 -3
  24. package/dist/service.d.ts +1 -2
  25. package/dist/service.js +5 -2
  26. package/dist/transform/filepreview.d.ts +2 -2
  27. package/dist/transform/filepreview.js +3 -3
  28. package/dist/transform/prompts.js +25 -25
  29. package/dist/transform/transform-dataset.agent.d.ts +4 -4
  30. package/dist/transform/transform-dataset.agent.js +29 -30
  31. package/dist/transform/transform-dataset.steps.d.ts +7 -7
  32. package/dist/transform/transform-dataset.steps.js +20 -20
  33. package/dist/transform/transform-dataset.types.d.ts +13 -13
  34. package/dist/transform/transformDataset.js +4 -4
  35. package/package.json +4 -4
  36. /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
  37. /package/dist/builder/{sourceRows.js → rows.js} +0 -0
@@ -1,17 +1,16 @@
1
1
  import { createFileParseContext } from "../file/file-dataset.agent.js";
2
2
  import { readInstantFileStep } from "../file/steps.js";
3
3
  import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
4
- import { ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "../transform/transform-dataset.steps.js";
4
+ import { ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "../transform/transform-dataset.steps.js";
5
5
  import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
6
- import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
6
+ import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
7
7
  import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
8
- import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
9
- import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
10
- import { getDomainDescriptor } from "./sourceRows.js";
11
- import { materializeQuerySource } from "./materializeQuery.js";
8
+ import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
9
+ import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
10
+ import { materializeQueryResource } from "./materializeQuery.js";
12
11
  import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
13
- function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
14
- return `${targetDatasetId}__${sourceKind}_${index}`;
12
+ function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
13
+ return `${targetDatasetId}__${resourceKind}_${index}`;
15
14
  }
16
15
  function normalizeParsedTextRows(value) {
17
16
  if (Array.isArray(value)) {
@@ -21,10 +20,10 @@ function normalizeParsedTextRows(value) {
21
20
  return [value];
22
21
  return [{ value }];
23
22
  }
24
- function materializeRawTextRows(source) {
25
- const text = String(source.text ?? "");
26
- const mimeType = String(source.mimeType ?? "").toLowerCase();
27
- const name = String(source.name ?? "").toLowerCase();
23
+ function materializeRawTextRows(resource) {
24
+ const text = String(resource.text ?? "");
25
+ const mimeType = String(resource.mimeType ?? "").toLowerCase();
26
+ const name = String(resource.name ?? "").toLowerCase();
28
27
  const shouldParseJson = mimeType.includes("json") || name.endsWith(".json") || name.endsWith(".jsonl");
29
28
  if (shouldParseJson) {
30
29
  try {
@@ -67,10 +66,14 @@ function isPdfContentDisposition(value) {
67
66
  const text = String(value ?? "").toLowerCase();
68
67
  return text.includes("application/pdf") || text.includes(".pdf");
69
68
  }
70
- function sanitizePdfFileName(value, fallback) {
69
+ function sanitizeResourceFileName(value, fallback) {
71
70
  const name = String(value ?? "").trim() || fallback;
72
71
  const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
73
- return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned || fallback}.pdf`;
72
+ return cleaned || fallback;
73
+ }
74
+ function sanitizePdfFileName(value, fallback) {
75
+ const cleaned = sanitizeResourceFileName(value, fallback);
76
+ return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned}.pdf`;
74
77
  }
75
78
  function pdfTextRowsSchema() {
76
79
  return {
@@ -98,14 +101,14 @@ function parseJsonlDataRows(content) {
98
101
  .map((record) => record?.data)
99
102
  .filter((row) => row && typeof row === "object" && !Array.isArray(row));
100
103
  }
101
- async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
102
- const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
104
+ async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
105
+ const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
103
106
  if (!isPdfContentDisposition(file.contentDisposition))
104
107
  return null;
105
108
  const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
106
109
  const outputPath = getDatasetOutputPath(targetDatasetId);
107
- const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
108
- const sourcePath = `${getDatasetSourcesDir(targetDatasetId)}/${fileName}`;
110
+ const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
111
+ const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
109
112
  const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
110
113
  await runDatasetSandboxCommandStep({
111
114
  runtime: state.runtime,
@@ -116,7 +119,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
116
119
  await writeDatasetSandboxFilesStep({
117
120
  runtime: state.runtime,
118
121
  sandboxId,
119
- files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
122
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
120
123
  });
121
124
  const install = await runDatasetSandboxCommandStep({
122
125
  runtime: state.runtime,
@@ -139,11 +142,11 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
139
142
  "import sys",
140
143
  "from pypdf import PdfReader",
141
144
  "",
142
- "source_path = Path(sys.argv[1])",
145
+ "resource_path = Path(sys.argv[1])",
143
146
  "output_path = Path(sys.argv[2])",
144
147
  "file_id = sys.argv[3]",
145
148
  "file_name = sys.argv[4]",
146
- "reader = PdfReader(str(source_path))",
149
+ "reader = PdfReader(str(resource_path))",
147
150
  "rows = 0",
148
151
  "with output_path.open('w', encoding='utf-8') as out:",
149
152
  " for index, page in enumerate(reader.pages, start=1):",
@@ -173,7 +176,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
173
176
  runtime: state.runtime,
174
177
  sandboxId,
175
178
  cmd: "python",
176
- args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
179
+ args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
177
180
  });
178
181
  if (extraction.exitCode !== 0) {
179
182
  throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
@@ -192,36 +195,45 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
192
195
  sandboxId,
193
196
  title: state.title ?? fileName,
194
197
  instructions: state.instructions,
195
- sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
196
- sourceKinds: ["file"],
198
+ contextId: state.contextId ?? "",
197
199
  rows,
198
200
  schema: pdfTextRowsSchema(),
199
201
  first: state.first,
200
202
  });
201
203
  return targetDatasetId;
202
204
  }
203
- async function materializeRawTextSource(state, source, targetDatasetId) {
204
- const rows = materializeRawTextRows(source);
205
+ async function materializeRawTextResource(state, resource, targetDatasetId) {
206
+ const rows = materializeRawTextRows(resource);
205
207
  await materializeRowsToDataset(state.runtime, {
206
208
  datasetId: targetDatasetId,
207
209
  sandboxId: state.sandboxId,
208
- title: state.title ?? source.name ?? targetDatasetId,
210
+ title: state.title ?? resource.name ?? targetDatasetId,
209
211
  instructions: state.instructions,
210
- sources: [
211
- {
212
- kind: "text",
213
- mimeType: source.mimeType,
214
- name: source.name,
215
- description: source.description,
216
- },
217
- ],
218
- sourceKinds: ["text"],
212
+ contextId: state.contextId ?? "",
219
213
  rows,
220
214
  schema: state.outputSchema,
221
215
  first: state.first,
222
216
  });
223
217
  return targetDatasetId;
224
218
  }
219
+ async function writePreparedFileResourceToSandbox(params) {
220
+ const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
221
+ const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
222
+ const fileName = sanitizeResourceFileName(params.filename ?? contentDispositionName, `${params.fileId}.bin`);
223
+ const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
224
+ await runDatasetSandboxCommandStep({
225
+ runtime: params.runtime,
226
+ sandboxId: params.sandboxId,
227
+ cmd: "mkdir",
228
+ args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
229
+ });
230
+ await writeDatasetSandboxFilesStep({
231
+ runtime: params.runtime,
232
+ sandboxId: params.sandboxId,
233
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
234
+ });
235
+ return { fileName, resourcePath };
236
+ }
225
237
  function resolveDatasetSandboxId(state, _targetDatasetId) {
226
238
  const sandboxId = String(state.sandboxId ?? "").trim();
227
239
  if (sandboxId)
@@ -249,8 +261,7 @@ export async function initializeDatasetStep(params) {
249
261
  sandboxId: params.sandboxId,
250
262
  title: params.title ?? params.datasetId,
251
263
  instructions: params.instructions,
252
- sources: params.sources,
253
- sourceKinds: params.sourceKinds,
264
+ contextId: params.contextId,
254
265
  schema: params.schema,
255
266
  status: "building",
256
267
  });
@@ -259,12 +270,12 @@ export async function initializeDatasetStep(params) {
259
270
  sandboxId: params.sandboxId,
260
271
  };
261
272
  }
262
- export async function prepareDatasetSourcesStep(params) {
273
+ export async function prepareDatasetResourcesStep(params) {
263
274
  "use step";
264
275
  if (params.kind === "file") {
265
- const fileId = params.source.kind === "file"
266
- ? params.source.fileId
267
- : await uploadInlineTextSource(params.runtime, params.datasetId, params.source);
276
+ const fileId = params.resource.kind === "file"
277
+ ? params.resource.fileId
278
+ : await uploadInlineTextResource(params.runtime, params.datasetId, params.resource);
268
279
  return {
269
280
  kind: "file",
270
281
  datasetId: params.datasetId,
@@ -273,31 +284,31 @@ export async function prepareDatasetSourcesStep(params) {
273
284
  sandboxState: { initialized: false, filePath: "" },
274
285
  filePreview: undefined,
275
286
  schema: params.schema ?? null,
276
- filename: params.source.kind === "file" ? params.source.filename : params.source.name,
277
- mediaType: params.source.kind === "file" ? params.source.mediaType : params.source.mimeType,
287
+ filename: params.resource.kind === "file" ? params.resource.filename : params.resource.name,
288
+ mediaType: params.resource.kind === "file" ? params.resource.mediaType : params.resource.mimeType,
278
289
  };
279
290
  }
280
- const initialized = await ensureTransformSourcesInSandboxStep({
291
+ const initialized = await ensureTransformInputsInSandboxStep({
281
292
  runtime: params.runtime,
282
293
  sandboxId: params.sandboxId,
283
294
  datasetId: params.datasetId,
284
- sourceDatasetIds: params.sourceDatasetIds,
285
- state: { initialized: false, sourcePaths: [] },
295
+ inputDatasetIds: params.inputDatasetIds,
296
+ state: { initialized: false, inputPaths: [] },
286
297
  });
287
- const sourcePreviews = await generateTransformSourcePreviewsStep({
298
+ const inputPreviews = await generateTransformInputPreviewsStep({
288
299
  runtime: params.runtime,
289
300
  sandboxId: params.sandboxId,
290
301
  datasetId: params.datasetId,
291
- sourcePaths: initialized.sourcePaths,
302
+ inputPaths: initialized.inputPaths,
292
303
  });
293
304
  return {
294
305
  kind: "transform",
295
306
  datasetId: params.datasetId,
296
307
  sandboxId: params.sandboxId,
297
- sourceDatasetIds: params.sourceDatasetIds,
308
+ inputDatasetIds: params.inputDatasetIds,
298
309
  outputSchema: params.outputSchema,
299
310
  sandboxState: initialized.state,
300
- sourcePreviews,
311
+ inputPreviews,
301
312
  };
302
313
  }
303
314
  export async function initializeDatasetContextStep(params) {
@@ -312,9 +323,9 @@ export async function initializeDatasetContextStep(params) {
312
323
  return {
313
324
  ...params.prepared,
314
325
  instructions: params.instructions,
315
- prompt: params.prepared.sourceDatasetIds.length === 1
316
- ? "Transform the source dataset into a new dataset matching the provided output schema"
317
- : `Transform ${params.prepared.sourceDatasetIds.length} source datasets into a new dataset matching the provided output schema`,
326
+ prompt: params.prepared.inputDatasetIds.length === 1
327
+ ? "Transform the input dataset into a new dataset matching the provided output schema"
328
+ : `Transform ${params.prepared.inputDatasetIds.length} input datasets into a new dataset matching the provided output schema`,
318
329
  };
319
330
  }
320
331
  export async function completeDatasetStep(params) {
@@ -363,9 +374,9 @@ export async function completeDatasetStep(params) {
363
374
  firstRow: firstResult.row,
364
375
  };
365
376
  }
366
- export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
367
- if (source.kind === "file" && !state.outputSchema) {
368
- const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
377
+ export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
378
+ if (resource.kind === "file" && !state.outputSchema) {
379
+ const materializedPdf = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
369
380
  if (materializedPdf)
370
381
  return materializedPdf;
371
382
  }
@@ -379,35 +390,32 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
379
390
  sandboxId,
380
391
  title: state.title ?? targetDatasetId,
381
392
  instructions: state.instructions,
382
- sources: [
383
- source.kind === "file"
384
- ? {
385
- kind: "file",
386
- fileId: source.fileId,
387
- description: source.description,
388
- filename: source.filename,
389
- mediaType: source.mediaType,
390
- }
391
- : {
392
- kind: "text",
393
- mimeType: source.mimeType,
394
- name: source.name,
395
- description: source.description,
396
- },
397
- ],
398
- sourceKinds: [source.kind],
393
+ contextId: state.contextId ?? "",
399
394
  schema: state.outputSchema,
400
395
  });
401
- const prepared = await prepareDatasetSourcesStep({
396
+ const prepared = await prepareDatasetResourcesStep({
402
397
  kind: "file",
403
398
  runtime: state.runtime,
404
399
  datasetId: targetDatasetId,
405
400
  sandboxId,
406
- source,
401
+ resource,
407
402
  schema: state.outputSchema,
408
403
  });
404
+ if (prepared.kind !== "file") {
405
+ throw new Error("dataset_context_kind_mismatch:file");
406
+ }
407
+ const preparedFile = await writePreparedFileResourceToSandbox({
408
+ runtime: state.runtime,
409
+ sandboxId,
410
+ datasetId: targetDatasetId,
411
+ fileId: prepared.fileId,
412
+ filename: prepared.filename,
413
+ });
409
414
  const context = await initializeDatasetContextStep({
410
- prepared,
415
+ prepared: {
416
+ ...prepared,
417
+ filename: prepared.filename ?? preparedFile.fileName,
418
+ },
411
419
  instructions: state.instructions,
412
420
  outputSchema: state.outputSchema,
413
421
  });
@@ -442,36 +450,40 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
442
450
  });
443
451
  return targetDatasetId;
444
452
  }
445
- async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
446
- if (source.kind === "dataset") {
447
- return source.datasetId;
453
+ async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
454
+ if (resource.kind === "dataset") {
455
+ return resource.datasetId;
448
456
  }
449
- const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, source.kind, sourceIndex);
450
- if (source.kind === "query") {
451
- await materializeQuerySource(state.runtime, source, {
457
+ const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
458
+ if (resource.kind === "query") {
459
+ await materializeQueryResource(state.runtime, resource, {
452
460
  datasetId: intermediateDatasetId,
453
461
  sandboxId: state.sandboxId,
454
- title: source.title,
462
+ title: resource.title,
455
463
  first: false,
464
+ contextId: state.contextId ?? "",
456
465
  });
457
466
  return intermediateDatasetId;
458
467
  }
459
- if (source.kind === "text") {
460
- await materializeRawTextSource({
468
+ if (resource.kind === "text") {
469
+ await materializeRawTextResource({
461
470
  ...state,
462
471
  outputSchema: undefined,
463
472
  first: false,
464
- instructions: buildRawSourceInstructions(source.kind),
465
- title: source.name ?? state.title,
466
- }, source, intermediateDatasetId);
473
+ instructions: buildRawResourceInstructions(resource.kind),
474
+ title: resource.name ?? state.title,
475
+ }, resource, intermediateDatasetId);
467
476
  return intermediateDatasetId;
468
477
  }
469
- await materializeSingleFileLikeSource({
478
+ if (resource.kind === "context") {
479
+ throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
480
+ }
481
+ await materializeSingleFileLikeResource({
470
482
  ...state,
471
483
  outputSchema: undefined,
472
484
  first: false,
473
- instructions: buildRawSourceInstructions(source.kind),
474
- }, source, intermediateDatasetId);
485
+ instructions: buildRawResourceInstructions(resource.kind),
486
+ }, resource, intermediateDatasetId);
475
487
  return intermediateDatasetId;
476
488
  }
477
489
  export async function materializeDerivedDataset(state, targetDatasetId) {
@@ -480,9 +492,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
480
492
  }
481
493
  const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
482
494
  const stateWithSandbox = { ...state, sandboxId };
483
- const normalizedSources = [];
484
- for (let index = 0; index < stateWithSandbox.sources.length; index++) {
485
- normalizedSources.push(await normalizeSourceToDatasetId(stateWithSandbox, stateWithSandbox.sources[index], targetDatasetId, index));
495
+ const normalizedResources = [];
496
+ for (let index = 0; index < stateWithSandbox.resources.length; index++) {
497
+ normalizedResources.push(await normalizeResourceToDatasetId(stateWithSandbox, stateWithSandbox.resources[index], targetDatasetId, index));
486
498
  }
487
499
  const transformSchema = stateWithSandbox.outputSchema ??
488
500
  {
@@ -500,60 +512,51 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
500
512
  sandboxId,
501
513
  title: stateWithSandbox.title ?? targetDatasetId,
502
514
  instructions: stateWithSandbox.instructions,
503
- sources: stateWithSandbox.sources.map((source) => source.kind === "query"
504
- ? {
505
- kind: "query",
506
- query: source.query,
507
- title: source.title,
508
- explanation: source.explanation,
509
- ...getDomainDescriptor(source.domain),
510
- }
511
- : source),
512
- sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
515
+ contextId: stateWithSandbox.contextId ?? "",
513
516
  schema: transformSchema,
514
517
  });
515
- const prepared = await prepareDatasetSourcesStep({
518
+ const prepared = await prepareDatasetResourcesStep({
516
519
  kind: "transform",
517
520
  runtime: stateWithSandbox.runtime,
518
521
  datasetId: targetDatasetId,
519
522
  sandboxId,
520
- sourceDatasetIds: normalizedSources,
523
+ inputDatasetIds: normalizedResources,
521
524
  outputSchema: transformSchema,
522
525
  });
523
526
  const context = await initializeDatasetContextStep({
524
527
  prepared,
525
- instructions: buildTransformInstructions(normalizedSources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
528
+ instructions: buildTransformInstructions(normalizedResources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
526
529
  outputSchema: transformSchema,
527
530
  });
528
531
  if (context.kind !== "transform") {
529
532
  throw new Error("dataset_context_kind_mismatch:transform");
530
533
  }
531
534
  const transformContext = createTransformDatasetContext({
532
- sourceDatasetIds: context.sourceDatasetIds,
535
+ inputDatasetIds: context.inputDatasetIds,
533
536
  outputSchema: context.outputSchema,
534
537
  instructions: context.instructions,
535
538
  datasetId: context.datasetId,
536
539
  reactor: stateWithSandbox.reactor,
537
540
  sandboxId: context.sandboxId,
538
541
  sandboxState: context.sandboxState,
539
- sourcePreviews: context.sourcePreviews,
542
+ inputPreviews: context.inputPreviews,
540
543
  });
541
544
  await transformContext.transform(stateWithSandbox.runtime, {
542
545
  durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
543
546
  prompt: context.prompt,
544
547
  initialContent: {
545
548
  datasetId: context.datasetId,
546
- sourceDatasetIds: context.sourceDatasetIds,
549
+ inputDatasetIds: context.inputDatasetIds,
547
550
  outputSchema: context.outputSchema,
548
551
  instructions: context.instructions,
549
552
  sandboxId: context.sandboxId,
550
553
  sandboxState: context.sandboxState,
551
- sourcePreviews: context.sourcePreviews,
554
+ inputPreviews: context.inputPreviews,
552
555
  },
553
556
  });
554
557
  return targetDatasetId;
555
558
  }
556
559
  registerDatasetAgentMaterializers({
557
- materializeSingleFileLikeSource,
560
+ materializeSingleFileLikeResource,
558
561
  materializeDerivedDataset,
559
562
  });
@@ -1,5 +1,5 @@
1
- import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalSource } from "./types.js";
2
- export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], source: Extract<InternalSource, {
1
+ import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
2
+ export declare function materializeQueryResource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], resource: Extract<InternalDatasetResource, {
3
3
  kind: "query";
4
4
  }>, params: {
5
5
  datasetId: string;
@@ -8,4 +8,5 @@ export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime
8
8
  title?: string;
9
9
  instructions?: string;
10
10
  first?: boolean;
11
+ contextId: string;
11
12
  }): Promise<string>;
@@ -1,35 +1,26 @@
1
1
  import { materializeRowsToDataset } from "./persistence.js";
2
- import { getDomainDescriptor, normalizeQueryRows } from "./sourceRows.js";
3
- async function readQuerySourceRowsStep(params) {
2
+ import { getDomainDescriptor, normalizeQueryRows } from "./rows.js";
3
+ async function readQueryResourceRowsStep(params) {
4
4
  "use step";
5
5
  const db = await params.runtime.db();
6
6
  const result = await db.query(params.query);
7
7
  return { rows: normalizeQueryRows(result) };
8
8
  }
9
- export async function materializeQuerySource(runtime, source, params) {
10
- const { rows } = await readQuerySourceRowsStep({
9
+ export async function materializeQueryResource(runtime, resource, params) {
10
+ const { rows } = await readQueryResourceRowsStep({
11
11
  runtime,
12
- query: source.query,
12
+ query: resource.query,
13
13
  });
14
- const domainDescriptor = getDomainDescriptor(source.domain);
14
+ const domainDescriptor = getDomainDescriptor(resource.domain);
15
15
  return await materializeRowsToDataset(runtime, {
16
16
  datasetId: params.datasetId,
17
17
  sandboxId: params.sandboxId,
18
- title: params.title ?? source.title,
18
+ title: params.title ?? resource.title,
19
19
  instructions: params.instructions,
20
- sources: [
21
- {
22
- kind: "query",
23
- query: source.query,
24
- title: source.title,
25
- explanation: source.explanation,
26
- ...domainDescriptor,
27
- },
28
- ],
29
- sourceKinds: ["query"],
20
+ contextId: params.contextId,
30
21
  analysis: {
31
- query: source.query,
32
- explanation: source.explanation,
22
+ query: resource.query,
23
+ explanation: resource.explanation,
33
24
  ...domainDescriptor,
34
25
  },
35
26
  rows,
@@ -1,19 +1,18 @@
1
- import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextSourceInput, MaterializeRowsParams } from "./types.js";
2
- export declare function defaultTextSourceName(source: DatasetTextSourceInput): string;
1
+ import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextResourceInput, MaterializeRowsParams } from "./types.js";
2
+ export declare function defaultTextResourceName(resource: DatasetTextResourceInput): string;
3
3
  export declare function getDatasetDb<Runtime extends AnyDatasetRuntime>(runtime: Runtime): Promise<any>;
4
4
  export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
5
5
  datasetId: string;
6
6
  sandboxId?: string;
7
7
  title?: string;
8
8
  instructions?: string;
9
- sources: any[];
10
- sourceKinds: string[];
9
+ contextId: string;
11
10
  analysis?: any;
12
11
  schema?: any;
13
12
  status?: string;
14
13
  }): Promise<void>;
15
14
  export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
16
- export declare function uploadInlineTextSource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, source: DatasetTextSourceInput): Promise<string>;
15
+ export declare function uploadInlineTextResource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, resource: DatasetTextResourceInput): Promise<string>;
17
16
  export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
18
17
  export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
19
18
  datasetId: string;
@@ -2,18 +2,18 @@ import { DatasetService } from "../service.js";
2
2
  import { datasetDomain } from "../schema.js";
3
3
  import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
4
4
  import { inferDatasetSchema, validateRows } from "./schemaInference.js";
5
- import { rowsToJsonl } from "./sourceRows.js";
6
- export function defaultTextSourceName(source) {
7
- if (source.name?.trim())
8
- return source.name.trim();
9
- const mimeType = String(source.mimeType ?? "").toLowerCase();
5
+ import { rowsToJsonl } from "./rows.js";
6
+ export function defaultTextResourceName(resource) {
7
+ if (resource.name?.trim())
8
+ return resource.name.trim();
9
+ const mimeType = String(resource.mimeType ?? "").toLowerCase();
10
10
  if (mimeType.includes("csv"))
11
- return "source.csv";
11
+ return "resource.csv";
12
12
  if (mimeType.includes("json"))
13
- return "source.json";
13
+ return "resource.json";
14
14
  if (mimeType.includes("yaml") || mimeType.includes("yml"))
15
- return "source.yaml";
16
- return "source.txt";
15
+ return "resource.yaml";
16
+ return "resource.txt";
17
17
  }
18
18
  export async function getDatasetDb(runtime) {
19
19
  const scoped = await runtime.use(datasetDomain);
@@ -21,6 +21,9 @@ export async function getDatasetDb(runtime) {
21
21
  }
22
22
  export async function createOrUpdateDatasetMetadata(runtime, params) {
23
23
  "use step";
24
+ if (!params.contextId.trim()) {
25
+ throw new Error("dataset_context_required");
26
+ }
24
27
  const db = await getDatasetDb(runtime);
25
28
  const service = new DatasetService(db);
26
29
  const result = await service.createDataset({
@@ -28,8 +31,7 @@ export async function createOrUpdateDatasetMetadata(runtime, params) {
28
31
  sandboxId: params.sandboxId,
29
32
  title: params.title ?? params.datasetId,
30
33
  instructions: params.instructions ?? "",
31
- sources: params.sources,
32
- sourceKinds: params.sourceKinds,
34
+ contextId: params.contextId,
33
35
  analysis: params.analysis,
34
36
  schema: params.schema,
35
37
  status: params.status ?? "building",
@@ -52,8 +54,7 @@ export async function materializeRowsToDataset(runtime, params) {
52
54
  sandboxId: params.sandboxId,
53
55
  title: params.title,
54
56
  instructions: params.instructions,
55
- sources: params.sources,
56
- sourceKinds: params.sourceKinds,
57
+ contextId: params.contextId,
57
58
  analysis: params.analysis,
58
59
  schema: resolvedSchema,
59
60
  status: "building",
@@ -78,18 +79,18 @@ export async function materializeRowsToDataset(runtime, params) {
78
79
  }
79
80
  return params.datasetId;
80
81
  }
81
- export async function uploadInlineTextSource(runtime, datasetId, source) {
82
+ export async function uploadInlineTextResource(runtime, datasetId, resource) {
82
83
  "use step";
83
84
  const db = await getDatasetDb(runtime);
84
- const fileName = defaultTextSourceName(source);
85
- const storagePath = `/dataset/source/${datasetId}/${Date.now()}-${fileName}`;
86
- const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(source.text, "utf-8"), {
87
- contentType: source.mimeType ?? "text/plain",
85
+ const fileName = defaultTextResourceName(resource);
86
+ const storagePath = `/dataset/resource/${datasetId}/${Date.now()}-${fileName}`;
87
+ const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(resource.text, "utf-8"), {
88
+ contentType: resource.mimeType ?? "text/plain",
88
89
  contentDisposition: fileName,
89
90
  });
90
91
  const fileId = uploadResult?.data?.id;
91
92
  if (!fileId) {
92
- throw new Error("dataset_text_source_upload_failed");
93
+ throw new Error("dataset_text_resource_upload_failed");
93
94
  }
94
95
  return fileId;
95
96
  }