@ekairos/dataset 1.22.82-beta.development.0 → 1.22.84-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/dist/builder/agentMaterializers.d.ts +2 -2
  2. package/dist/builder/context.d.ts +7 -0
  3. package/dist/builder/context.js +192 -0
  4. package/dist/builder/instructions.d.ts +3 -3
  5. package/dist/builder/instructions.js +10 -10
  6. package/dist/builder/materialize.d.ts +12 -11
  7. package/dist/builder/materialize.js +122 -121
  8. package/dist/builder/materializeQuery.d.ts +3 -2
  9. package/dist/builder/materializeQuery.js +10 -19
  10. package/dist/builder/persistence.d.ts +4 -5
  11. package/dist/builder/persistence.js +20 -19
  12. package/dist/builder/types.d.ts +31 -24
  13. package/dist/completeDataset.steps.d.ts +9 -8
  14. package/dist/completeDataset.steps.js +18 -11
  15. package/dist/completeDataset.tool.d.ts +9 -8
  16. package/dist/completeDataset.tool.js +2 -1
  17. package/dist/contextWorkspace.d.ts +72 -0
  18. package/dist/contextWorkspace.js +218 -0
  19. package/dist/dataset.d.ts +1 -1
  20. package/dist/dataset.js +42 -29
  21. package/dist/datasetFiles.d.ts +1 -1
  22. package/dist/datasetFiles.js +3 -3
  23. package/dist/executeCommand.tool.d.ts +1 -43
  24. package/dist/executeCommand.tool.js +10 -3
  25. package/dist/file/file-dataset.agent.d.ts +2 -0
  26. package/dist/file/file-dataset.agent.js +51 -16
  27. package/dist/file/file-dataset.steps.d.ts +6 -0
  28. package/dist/file/file-dataset.steps.js +18 -21
  29. package/dist/file/file-dataset.types.d.ts +10 -0
  30. package/dist/file/prompts.js +16 -14
  31. package/dist/index.d.ts +1 -0
  32. package/dist/index.js +1 -0
  33. package/dist/materializeDataset.tool.d.ts +34 -26
  34. package/dist/materializeDataset.tool.js +40 -29
  35. package/dist/schema.d.ts +12 -2
  36. package/dist/schema.js +6 -3
  37. package/dist/service.d.ts +2 -2
  38. package/dist/service.js +6 -3
  39. package/dist/transform/filepreview.d.ts +2 -2
  40. package/dist/transform/filepreview.js +3 -3
  41. package/dist/transform/prompts.js +25 -25
  42. package/dist/transform/transform-dataset.agent.d.ts +4 -4
  43. package/dist/transform/transform-dataset.agent.js +29 -30
  44. package/dist/transform/transform-dataset.steps.d.ts +7 -7
  45. package/dist/transform/transform-dataset.steps.js +20 -20
  46. package/dist/transform/transform-dataset.types.d.ts +13 -13
  47. package/dist/transform/transformDataset.js +4 -4
  48. package/package.json +4 -4
  49. /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
  50. /package/dist/builder/{sourceRows.js → rows.js} +0 -0
@@ -1,18 +1,16 @@
1
1
  import { createFileParseContext } from "../file/file-dataset.agent.js";
2
2
  import { readInstantFileStep } from "../file/steps.js";
3
- import { generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "../file/file-dataset.steps.js";
4
3
  import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
5
- import { ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "../transform/transform-dataset.steps.js";
4
+ import { ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "../transform/transform-dataset.steps.js";
6
5
  import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
7
- import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
6
+ import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
8
7
  import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
9
- import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
10
- import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
11
- import { getDomainDescriptor } from "./sourceRows.js";
12
- import { materializeQuerySource } from "./materializeQuery.js";
8
+ import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
9
+ import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
10
+ import { materializeQueryResource } from "./materializeQuery.js";
13
11
  import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
14
- function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
15
- return `${targetDatasetId}__${sourceKind}_${index}`;
12
+ function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
13
+ return `${targetDatasetId}__${resourceKind}_${index}`;
16
14
  }
17
15
  function normalizeParsedTextRows(value) {
18
16
  if (Array.isArray(value)) {
@@ -22,10 +20,10 @@ function normalizeParsedTextRows(value) {
22
20
  return [value];
23
21
  return [{ value }];
24
22
  }
25
- function materializeRawTextRows(source) {
26
- const text = String(source.text ?? "");
27
- const mimeType = String(source.mimeType ?? "").toLowerCase();
28
- const name = String(source.name ?? "").toLowerCase();
23
+ function materializeRawTextRows(resource) {
24
+ const text = String(resource.text ?? "");
25
+ const mimeType = String(resource.mimeType ?? "").toLowerCase();
26
+ const name = String(resource.name ?? "").toLowerCase();
29
27
  const shouldParseJson = mimeType.includes("json") || name.endsWith(".json") || name.endsWith(".jsonl");
30
28
  if (shouldParseJson) {
31
29
  try {
@@ -68,10 +66,14 @@ function isPdfContentDisposition(value) {
68
66
  const text = String(value ?? "").toLowerCase();
69
67
  return text.includes("application/pdf") || text.includes(".pdf");
70
68
  }
71
- function sanitizePdfFileName(value, fallback) {
69
+ function sanitizeResourceFileName(value, fallback) {
72
70
  const name = String(value ?? "").trim() || fallback;
73
71
  const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
74
- return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned || fallback}.pdf`;
72
+ return cleaned || fallback;
73
+ }
74
+ function sanitizePdfFileName(value, fallback) {
75
+ const cleaned = sanitizeResourceFileName(value, fallback);
76
+ return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned}.pdf`;
75
77
  }
76
78
  function pdfTextRowsSchema() {
77
79
  return {
@@ -99,14 +101,14 @@ function parseJsonlDataRows(content) {
99
101
  .map((record) => record?.data)
100
102
  .filter((row) => row && typeof row === "object" && !Array.isArray(row));
101
103
  }
102
- async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
103
- const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
104
+ async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
105
+ const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
104
106
  if (!isPdfContentDisposition(file.contentDisposition))
105
107
  return null;
106
108
  const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
107
109
  const outputPath = getDatasetOutputPath(targetDatasetId);
108
- const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
109
- const sourcePath = `${getDatasetSourcesDir(targetDatasetId)}/${fileName}`;
110
+ const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
111
+ const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
110
112
  const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
111
113
  await runDatasetSandboxCommandStep({
112
114
  runtime: state.runtime,
@@ -117,7 +119,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
117
119
  await writeDatasetSandboxFilesStep({
118
120
  runtime: state.runtime,
119
121
  sandboxId,
120
- files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
122
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
121
123
  });
122
124
  const install = await runDatasetSandboxCommandStep({
123
125
  runtime: state.runtime,
@@ -140,11 +142,11 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
140
142
  "import sys",
141
143
  "from pypdf import PdfReader",
142
144
  "",
143
- "source_path = Path(sys.argv[1])",
145
+ "resource_path = Path(sys.argv[1])",
144
146
  "output_path = Path(sys.argv[2])",
145
147
  "file_id = sys.argv[3]",
146
148
  "file_name = sys.argv[4]",
147
- "reader = PdfReader(str(source_path))",
149
+ "reader = PdfReader(str(resource_path))",
148
150
  "rows = 0",
149
151
  "with output_path.open('w', encoding='utf-8') as out:",
150
152
  " for index, page in enumerate(reader.pages, start=1):",
@@ -174,7 +176,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
174
176
  runtime: state.runtime,
175
177
  sandboxId,
176
178
  cmd: "python",
177
- args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
179
+ args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
178
180
  });
179
181
  if (extraction.exitCode !== 0) {
180
182
  throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
@@ -193,36 +195,45 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
193
195
  sandboxId,
194
196
  title: state.title ?? fileName,
195
197
  instructions: state.instructions,
196
- sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
197
- sourceKinds: ["file"],
198
+ contextId: state.contextId ?? "",
198
199
  rows,
199
200
  schema: pdfTextRowsSchema(),
200
201
  first: state.first,
201
202
  });
202
203
  return targetDatasetId;
203
204
  }
204
- async function materializeRawTextSource(state, source, targetDatasetId) {
205
- const rows = materializeRawTextRows(source);
205
+ async function materializeRawTextResource(state, resource, targetDatasetId) {
206
+ const rows = materializeRawTextRows(resource);
206
207
  await materializeRowsToDataset(state.runtime, {
207
208
  datasetId: targetDatasetId,
208
209
  sandboxId: state.sandboxId,
209
- title: state.title ?? source.name ?? targetDatasetId,
210
+ title: state.title ?? resource.name ?? targetDatasetId,
210
211
  instructions: state.instructions,
211
- sources: [
212
- {
213
- kind: "text",
214
- mimeType: source.mimeType,
215
- name: source.name,
216
- description: source.description,
217
- },
218
- ],
219
- sourceKinds: ["text"],
212
+ contextId: state.contextId ?? "",
220
213
  rows,
221
214
  schema: state.outputSchema,
222
215
  first: state.first,
223
216
  });
224
217
  return targetDatasetId;
225
218
  }
219
+ async function writePreparedFileResourceToSandbox(params) {
220
+ const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
221
+ const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
222
+ const fileName = sanitizeResourceFileName(params.filename ?? contentDispositionName, `${params.fileId}.bin`);
223
+ const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
224
+ await runDatasetSandboxCommandStep({
225
+ runtime: params.runtime,
226
+ sandboxId: params.sandboxId,
227
+ cmd: "mkdir",
228
+ args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
229
+ });
230
+ await writeDatasetSandboxFilesStep({
231
+ runtime: params.runtime,
232
+ sandboxId: params.sandboxId,
233
+ files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
234
+ });
235
+ return { fileName, resourcePath };
236
+ }
226
237
  function resolveDatasetSandboxId(state, _targetDatasetId) {
227
238
  const sandboxId = String(state.sandboxId ?? "").trim();
228
239
  if (sandboxId)
@@ -250,8 +261,7 @@ export async function initializeDatasetStep(params) {
250
261
  sandboxId: params.sandboxId,
251
262
  title: params.title ?? params.datasetId,
252
263
  instructions: params.instructions,
253
- sources: params.sources,
254
- sourceKinds: params.sourceKinds,
264
+ contextId: params.contextId,
255
265
  schema: params.schema,
256
266
  status: "building",
257
267
  });
@@ -260,56 +270,45 @@ export async function initializeDatasetStep(params) {
260
270
  sandboxId: params.sandboxId,
261
271
  };
262
272
  }
263
- export async function prepareDatasetSourcesStep(params) {
273
+ export async function prepareDatasetResourcesStep(params) {
264
274
  "use step";
265
275
  if (params.kind === "file") {
266
- const fileId = params.source.kind === "file"
267
- ? params.source.fileId
268
- : await uploadInlineTextSource(params.runtime, params.datasetId, params.source);
269
- const initialized = await initializeFileParseSandboxStep({
270
- runtime: params.runtime,
271
- sandboxId: params.sandboxId,
272
- datasetId: params.datasetId,
273
- fileId,
274
- state: { initialized: false, filePath: "" },
275
- });
276
- const filePreview = await generateFileParsePreviewStep({
277
- runtime: params.runtime,
278
- sandboxId: params.sandboxId,
279
- sandboxFilePath: initialized.filePath,
280
- datasetId: params.datasetId,
281
- });
276
+ const fileId = params.resource.kind === "file"
277
+ ? params.resource.fileId
278
+ : await uploadInlineTextResource(params.runtime, params.datasetId, params.resource);
282
279
  return {
283
280
  kind: "file",
284
281
  datasetId: params.datasetId,
285
282
  sandboxId: params.sandboxId,
286
283
  fileId,
287
- sandboxState: initialized.state,
288
- filePreview,
284
+ sandboxState: { initialized: false, filePath: "" },
285
+ filePreview: undefined,
289
286
  schema: params.schema ?? null,
287
+ filename: params.resource.kind === "file" ? params.resource.filename : params.resource.name,
288
+ mediaType: params.resource.kind === "file" ? params.resource.mediaType : params.resource.mimeType,
290
289
  };
291
290
  }
292
- const initialized = await ensureTransformSourcesInSandboxStep({
291
+ const initialized = await ensureTransformInputsInSandboxStep({
293
292
  runtime: params.runtime,
294
293
  sandboxId: params.sandboxId,
295
294
  datasetId: params.datasetId,
296
- sourceDatasetIds: params.sourceDatasetIds,
297
- state: { initialized: false, sourcePaths: [] },
295
+ inputDatasetIds: params.inputDatasetIds,
296
+ state: { initialized: false, inputPaths: [] },
298
297
  });
299
- const sourcePreviews = await generateTransformSourcePreviewsStep({
298
+ const inputPreviews = await generateTransformInputPreviewsStep({
300
299
  runtime: params.runtime,
301
300
  sandboxId: params.sandboxId,
302
301
  datasetId: params.datasetId,
303
- sourcePaths: initialized.sourcePaths,
302
+ inputPaths: initialized.inputPaths,
304
303
  });
305
304
  return {
306
305
  kind: "transform",
307
306
  datasetId: params.datasetId,
308
307
  sandboxId: params.sandboxId,
309
- sourceDatasetIds: params.sourceDatasetIds,
308
+ inputDatasetIds: params.inputDatasetIds,
310
309
  outputSchema: params.outputSchema,
311
310
  sandboxState: initialized.state,
312
- sourcePreviews,
311
+ inputPreviews,
313
312
  };
314
313
  }
315
314
  export async function initializeDatasetContextStep(params) {
@@ -324,9 +323,9 @@ export async function initializeDatasetContextStep(params) {
324
323
  return {
325
324
  ...params.prepared,
326
325
  instructions: params.instructions,
327
- prompt: params.prepared.sourceDatasetIds.length === 1
328
- ? "Transform the source dataset into a new dataset matching the provided output schema"
329
- : `Transform ${params.prepared.sourceDatasetIds.length} source datasets into a new dataset matching the provided output schema`,
326
+ prompt: params.prepared.inputDatasetIds.length === 1
327
+ ? "Transform the input dataset into a new dataset matching the provided output schema"
328
+ : `Transform ${params.prepared.inputDatasetIds.length} input datasets into a new dataset matching the provided output schema`,
330
329
  };
331
330
  }
332
331
  export async function completeDatasetStep(params) {
@@ -375,9 +374,9 @@ export async function completeDatasetStep(params) {
375
374
  firstRow: firstResult.row,
376
375
  };
377
376
  }
378
- export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
379
- if (source.kind === "file" && !state.outputSchema) {
380
- const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
377
+ export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
378
+ if (resource.kind === "file" && !state.outputSchema) {
379
+ const materializedPdf = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
381
380
  if (materializedPdf)
382
381
  return materializedPdf;
383
382
  }
@@ -391,29 +390,32 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
391
390
  sandboxId,
392
391
  title: state.title ?? targetDatasetId,
393
392
  instructions: state.instructions,
394
- sources: [
395
- source.kind === "file"
396
- ? { kind: "file", fileId: source.fileId, description: source.description }
397
- : {
398
- kind: "text",
399
- mimeType: source.mimeType,
400
- name: source.name,
401
- description: source.description,
402
- },
403
- ],
404
- sourceKinds: [source.kind],
393
+ contextId: state.contextId ?? "",
405
394
  schema: state.outputSchema,
406
395
  });
407
- const prepared = await prepareDatasetSourcesStep({
396
+ const prepared = await prepareDatasetResourcesStep({
408
397
  kind: "file",
409
398
  runtime: state.runtime,
410
399
  datasetId: targetDatasetId,
411
400
  sandboxId,
412
- source,
401
+ resource,
413
402
  schema: state.outputSchema,
414
403
  });
404
+ if (prepared.kind !== "file") {
405
+ throw new Error("dataset_context_kind_mismatch:file");
406
+ }
407
+ const preparedFile = await writePreparedFileResourceToSandbox({
408
+ runtime: state.runtime,
409
+ sandboxId,
410
+ datasetId: targetDatasetId,
411
+ fileId: prepared.fileId,
412
+ filename: prepared.filename,
413
+ });
415
414
  const context = await initializeDatasetContextStep({
416
- prepared,
415
+ prepared: {
416
+ ...prepared,
417
+ filename: prepared.filename ?? preparedFile.fileName,
418
+ },
417
419
  instructions: state.instructions,
418
420
  outputSchema: state.outputSchema,
419
421
  });
@@ -428,6 +430,8 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
428
430
  sandboxState: context.sandboxState,
429
431
  filePreview: context.filePreview,
430
432
  schema: context.schema,
433
+ filename: context.filename,
434
+ mediaType: context.mediaType,
431
435
  });
432
436
  await parseContext.parse(state.runtime, {
433
437
  durable: await resolveDatasetAgentDurable(state.durable),
@@ -440,40 +444,46 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
440
444
  sandboxState: context.sandboxState,
441
445
  filePreview: context.filePreview,
442
446
  schema: context.schema,
447
+ filename: context.filename,
448
+ mediaType: context.mediaType,
443
449
  },
444
450
  });
445
451
  return targetDatasetId;
446
452
  }
447
- async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
448
- if (source.kind === "dataset") {
449
- return source.datasetId;
453
+ async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
454
+ if (resource.kind === "dataset") {
455
+ return resource.datasetId;
450
456
  }
451
- const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, source.kind, sourceIndex);
452
- if (source.kind === "query") {
453
- await materializeQuerySource(state.runtime, source, {
457
+ const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
458
+ if (resource.kind === "query") {
459
+ await materializeQueryResource(state.runtime, resource, {
454
460
  datasetId: intermediateDatasetId,
455
461
  sandboxId: state.sandboxId,
456
- title: source.title,
462
+ title: resource.title,
457
463
  first: false,
464
+ contextId: state.contextId ?? "",
458
465
  });
459
466
  return intermediateDatasetId;
460
467
  }
461
- if (source.kind === "text") {
462
- await materializeRawTextSource({
468
+ if (resource.kind === "text") {
469
+ await materializeRawTextResource({
463
470
  ...state,
464
471
  outputSchema: undefined,
465
472
  first: false,
466
- instructions: buildRawSourceInstructions(source.kind),
467
- title: source.name ?? state.title,
468
- }, source, intermediateDatasetId);
473
+ instructions: buildRawResourceInstructions(resource.kind),
474
+ title: resource.name ?? state.title,
475
+ }, resource, intermediateDatasetId);
469
476
  return intermediateDatasetId;
470
477
  }
471
- await materializeSingleFileLikeSource({
478
+ if (resource.kind === "context") {
479
+ throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
480
+ }
481
+ await materializeSingleFileLikeResource({
472
482
  ...state,
473
483
  outputSchema: undefined,
474
484
  first: false,
475
- instructions: buildRawSourceInstructions(source.kind),
476
- }, source, intermediateDatasetId);
485
+ instructions: buildRawResourceInstructions(resource.kind),
486
+ }, resource, intermediateDatasetId);
477
487
  return intermediateDatasetId;
478
488
  }
479
489
  export async function materializeDerivedDataset(state, targetDatasetId) {
@@ -482,9 +492,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
482
492
  }
483
493
  const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
484
494
  const stateWithSandbox = { ...state, sandboxId };
485
- const normalizedSources = [];
486
- for (let index = 0; index < stateWithSandbox.sources.length; index++) {
487
- normalizedSources.push(await normalizeSourceToDatasetId(stateWithSandbox, stateWithSandbox.sources[index], targetDatasetId, index));
495
+ const normalizedResources = [];
496
+ for (let index = 0; index < stateWithSandbox.resources.length; index++) {
497
+ normalizedResources.push(await normalizeResourceToDatasetId(stateWithSandbox, stateWithSandbox.resources[index], targetDatasetId, index));
488
498
  }
489
499
  const transformSchema = stateWithSandbox.outputSchema ??
490
500
  {
@@ -502,60 +512,51 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
502
512
  sandboxId,
503
513
  title: stateWithSandbox.title ?? targetDatasetId,
504
514
  instructions: stateWithSandbox.instructions,
505
- sources: stateWithSandbox.sources.map((source) => source.kind === "query"
506
- ? {
507
- kind: "query",
508
- query: source.query,
509
- title: source.title,
510
- explanation: source.explanation,
511
- ...getDomainDescriptor(source.domain),
512
- }
513
- : source),
514
- sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
515
+ contextId: stateWithSandbox.contextId ?? "",
515
516
  schema: transformSchema,
516
517
  });
517
- const prepared = await prepareDatasetSourcesStep({
518
+ const prepared = await prepareDatasetResourcesStep({
518
519
  kind: "transform",
519
520
  runtime: stateWithSandbox.runtime,
520
521
  datasetId: targetDatasetId,
521
522
  sandboxId,
522
- sourceDatasetIds: normalizedSources,
523
+ inputDatasetIds: normalizedResources,
523
524
  outputSchema: transformSchema,
524
525
  });
525
526
  const context = await initializeDatasetContextStep({
526
527
  prepared,
527
- instructions: buildTransformInstructions(normalizedSources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
528
+ instructions: buildTransformInstructions(normalizedResources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
528
529
  outputSchema: transformSchema,
529
530
  });
530
531
  if (context.kind !== "transform") {
531
532
  throw new Error("dataset_context_kind_mismatch:transform");
532
533
  }
533
534
  const transformContext = createTransformDatasetContext({
534
- sourceDatasetIds: context.sourceDatasetIds,
535
+ inputDatasetIds: context.inputDatasetIds,
535
536
  outputSchema: context.outputSchema,
536
537
  instructions: context.instructions,
537
538
  datasetId: context.datasetId,
538
539
  reactor: stateWithSandbox.reactor,
539
540
  sandboxId: context.sandboxId,
540
541
  sandboxState: context.sandboxState,
541
- sourcePreviews: context.sourcePreviews,
542
+ inputPreviews: context.inputPreviews,
542
543
  });
543
544
  await transformContext.transform(stateWithSandbox.runtime, {
544
545
  durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
545
546
  prompt: context.prompt,
546
547
  initialContent: {
547
548
  datasetId: context.datasetId,
548
- sourceDatasetIds: context.sourceDatasetIds,
549
+ inputDatasetIds: context.inputDatasetIds,
549
550
  outputSchema: context.outputSchema,
550
551
  instructions: context.instructions,
551
552
  sandboxId: context.sandboxId,
552
553
  sandboxState: context.sandboxState,
553
- sourcePreviews: context.sourcePreviews,
554
+ inputPreviews: context.inputPreviews,
554
555
  },
555
556
  });
556
557
  return targetDatasetId;
557
558
  }
558
559
  registerDatasetAgentMaterializers({
559
- materializeSingleFileLikeSource,
560
+ materializeSingleFileLikeResource,
560
561
  materializeDerivedDataset,
561
562
  });
@@ -1,5 +1,5 @@
1
- import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalSource } from "./types.js";
2
- export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], source: Extract<InternalSource, {
1
+ import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
2
+ export declare function materializeQueryResource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], resource: Extract<InternalDatasetResource, {
3
3
  kind: "query";
4
4
  }>, params: {
5
5
  datasetId: string;
@@ -8,4 +8,5 @@ export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime
8
8
  title?: string;
9
9
  instructions?: string;
10
10
  first?: boolean;
11
+ contextId: string;
11
12
  }): Promise<string>;
@@ -1,35 +1,26 @@
1
1
  import { materializeRowsToDataset } from "./persistence.js";
2
- import { getDomainDescriptor, normalizeQueryRows } from "./sourceRows.js";
3
- async function readQuerySourceRowsStep(params) {
2
+ import { getDomainDescriptor, normalizeQueryRows } from "./rows.js";
3
+ async function readQueryResourceRowsStep(params) {
4
4
  "use step";
5
5
  const db = await params.runtime.db();
6
6
  const result = await db.query(params.query);
7
7
  return { rows: normalizeQueryRows(result) };
8
8
  }
9
- export async function materializeQuerySource(runtime, source, params) {
10
- const { rows } = await readQuerySourceRowsStep({
9
+ export async function materializeQueryResource(runtime, resource, params) {
10
+ const { rows } = await readQueryResourceRowsStep({
11
11
  runtime,
12
- query: source.query,
12
+ query: resource.query,
13
13
  });
14
- const domainDescriptor = getDomainDescriptor(source.domain);
14
+ const domainDescriptor = getDomainDescriptor(resource.domain);
15
15
  return await materializeRowsToDataset(runtime, {
16
16
  datasetId: params.datasetId,
17
17
  sandboxId: params.sandboxId,
18
- title: params.title ?? source.title,
18
+ title: params.title ?? resource.title,
19
19
  instructions: params.instructions,
20
- sources: [
21
- {
22
- kind: "query",
23
- query: source.query,
24
- title: source.title,
25
- explanation: source.explanation,
26
- ...domainDescriptor,
27
- },
28
- ],
29
- sourceKinds: ["query"],
20
+ contextId: params.contextId,
30
21
  analysis: {
31
- query: source.query,
32
- explanation: source.explanation,
22
+ query: resource.query,
23
+ explanation: resource.explanation,
33
24
  ...domainDescriptor,
34
25
  },
35
26
  rows,
@@ -1,19 +1,18 @@
1
- import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextSourceInput, MaterializeRowsParams } from "./types.js";
2
- export declare function defaultTextSourceName(source: DatasetTextSourceInput): string;
1
+ import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextResourceInput, MaterializeRowsParams } from "./types.js";
2
+ export declare function defaultTextResourceName(resource: DatasetTextResourceInput): string;
3
3
  export declare function getDatasetDb<Runtime extends AnyDatasetRuntime>(runtime: Runtime): Promise<any>;
4
4
  export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
5
5
  datasetId: string;
6
6
  sandboxId?: string;
7
7
  title?: string;
8
8
  instructions?: string;
9
- sources: any[];
10
- sourceKinds: string[];
9
+ contextId: string;
11
10
  analysis?: any;
12
11
  schema?: any;
13
12
  status?: string;
14
13
  }): Promise<void>;
15
14
  export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
16
- export declare function uploadInlineTextSource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, source: DatasetTextSourceInput): Promise<string>;
15
+ export declare function uploadInlineTextResource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, resource: DatasetTextResourceInput): Promise<string>;
17
16
  export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
18
17
  export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
19
18
  datasetId: string;
@@ -2,18 +2,18 @@ import { DatasetService } from "../service.js";
2
2
  import { datasetDomain } from "../schema.js";
3
3
  import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
4
4
  import { inferDatasetSchema, validateRows } from "./schemaInference.js";
5
- import { rowsToJsonl } from "./sourceRows.js";
6
- export function defaultTextSourceName(source) {
7
- if (source.name?.trim())
8
- return source.name.trim();
9
- const mimeType = String(source.mimeType ?? "").toLowerCase();
5
+ import { rowsToJsonl } from "./rows.js";
6
+ export function defaultTextResourceName(resource) {
7
+ if (resource.name?.trim())
8
+ return resource.name.trim();
9
+ const mimeType = String(resource.mimeType ?? "").toLowerCase();
10
10
  if (mimeType.includes("csv"))
11
- return "source.csv";
11
+ return "resource.csv";
12
12
  if (mimeType.includes("json"))
13
- return "source.json";
13
+ return "resource.json";
14
14
  if (mimeType.includes("yaml") || mimeType.includes("yml"))
15
- return "source.yaml";
16
- return "source.txt";
15
+ return "resource.yaml";
16
+ return "resource.txt";
17
17
  }
18
18
  export async function getDatasetDb(runtime) {
19
19
  const scoped = await runtime.use(datasetDomain);
@@ -21,6 +21,9 @@ export async function getDatasetDb(runtime) {
21
21
  }
22
22
  export async function createOrUpdateDatasetMetadata(runtime, params) {
23
23
  "use step";
24
+ if (!params.contextId.trim()) {
25
+ throw new Error("dataset_context_required");
26
+ }
24
27
  const db = await getDatasetDb(runtime);
25
28
  const service = new DatasetService(db);
26
29
  const result = await service.createDataset({
@@ -28,8 +31,7 @@ export async function createOrUpdateDatasetMetadata(runtime, params) {
28
31
  sandboxId: params.sandboxId,
29
32
  title: params.title ?? params.datasetId,
30
33
  instructions: params.instructions ?? "",
31
- sources: params.sources,
32
- sourceKinds: params.sourceKinds,
34
+ contextId: params.contextId,
33
35
  analysis: params.analysis,
34
36
  schema: params.schema,
35
37
  status: params.status ?? "building",
@@ -52,8 +54,7 @@ export async function materializeRowsToDataset(runtime, params) {
52
54
  sandboxId: params.sandboxId,
53
55
  title: params.title,
54
56
  instructions: params.instructions,
55
- sources: params.sources,
56
- sourceKinds: params.sourceKinds,
57
+ contextId: params.contextId,
57
58
  analysis: params.analysis,
58
59
  schema: resolvedSchema,
59
60
  status: "building",
@@ -78,18 +79,18 @@ export async function materializeRowsToDataset(runtime, params) {
78
79
  }
79
80
  return params.datasetId;
80
81
  }
81
- export async function uploadInlineTextSource(runtime, datasetId, source) {
82
+ export async function uploadInlineTextResource(runtime, datasetId, resource) {
82
83
  "use step";
83
84
  const db = await getDatasetDb(runtime);
84
- const fileName = defaultTextSourceName(source);
85
- const storagePath = `/dataset/source/${datasetId}/${Date.now()}-${fileName}`;
86
- const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(source.text, "utf-8"), {
87
- contentType: source.mimeType ?? "text/plain",
85
+ const fileName = defaultTextResourceName(resource);
86
+ const storagePath = `/dataset/resource/${datasetId}/${Date.now()}-${fileName}`;
87
+ const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(resource.text, "utf-8"), {
88
+ contentType: resource.mimeType ?? "text/plain",
88
89
  contentDisposition: fileName,
89
90
  });
90
91
  const fileId = uploadResult?.data?.id;
91
92
  if (!fileId) {
92
- throw new Error("dataset_text_source_upload_failed");
93
+ throw new Error("dataset_text_resource_upload_failed");
93
94
  }
94
95
  return fileId;
95
96
  }