@ekairos/dataset 1.22.83-beta.development.0 → 1.22.84-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/agentMaterializers.d.ts +2 -2
- package/dist/builder/context.d.ts +7 -0
- package/dist/builder/context.js +192 -0
- package/dist/builder/instructions.d.ts +3 -3
- package/dist/builder/instructions.js +10 -10
- package/dist/builder/materialize.d.ts +10 -11
- package/dist/builder/materialize.js +116 -113
- package/dist/builder/materializeQuery.d.ts +3 -2
- package/dist/builder/materializeQuery.js +10 -19
- package/dist/builder/persistence.d.ts +4 -5
- package/dist/builder/persistence.js +20 -19
- package/dist/builder/types.d.ts +29 -24
- package/dist/completeDataset.steps.js +1 -1
- package/dist/dataset.d.ts +1 -1
- package/dist/dataset.js +42 -29
- package/dist/datasetFiles.d.ts +1 -1
- package/dist/datasetFiles.js +3 -3
- package/dist/file/file-dataset.agent.js +3 -4
- package/dist/file/prompts.js +12 -12
- package/dist/materializeDataset.tool.d.ts +34 -26
- package/dist/materializeDataset.tool.js +40 -29
- package/dist/schema.d.ts +12 -2
- package/dist/schema.js +6 -3
- package/dist/service.d.ts +1 -2
- package/dist/service.js +5 -2
- package/dist/transform/filepreview.d.ts +2 -2
- package/dist/transform/filepreview.js +3 -3
- package/dist/transform/prompts.js +25 -25
- package/dist/transform/transform-dataset.agent.d.ts +4 -4
- package/dist/transform/transform-dataset.agent.js +29 -30
- package/dist/transform/transform-dataset.steps.d.ts +7 -7
- package/dist/transform/transform-dataset.steps.js +20 -20
- package/dist/transform/transform-dataset.types.d.ts +13 -13
- package/dist/transform/transformDataset.js +4 -4
- package/package.json +4 -4
- /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
- /package/dist/builder/{sourceRows.js → rows.js} +0 -0
|
@@ -1,17 +1,16 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
2
|
import { readInstantFileStep } from "../file/steps.js";
|
|
3
3
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
4
|
-
import {
|
|
4
|
+
import { ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "../transform/transform-dataset.steps.js";
|
|
5
5
|
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
6
|
-
import { getDatasetOutputPath, getDatasetScriptsDir,
|
|
6
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
7
7
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
8
|
-
import { buildFileDefaultInstructions,
|
|
9
|
-
import { createOrUpdateDatasetMetadata, materializeRowsToDataset,
|
|
10
|
-
import {
|
|
11
|
-
import { materializeQuerySource } from "./materializeQuery.js";
|
|
8
|
+
import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
9
|
+
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
|
|
10
|
+
import { materializeQueryResource } from "./materializeQuery.js";
|
|
12
11
|
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
13
|
-
function makeIntermediateDatasetId(targetDatasetId,
|
|
14
|
-
return `${targetDatasetId}__${
|
|
12
|
+
function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
|
|
13
|
+
return `${targetDatasetId}__${resourceKind}_${index}`;
|
|
15
14
|
}
|
|
16
15
|
function normalizeParsedTextRows(value) {
|
|
17
16
|
if (Array.isArray(value)) {
|
|
@@ -21,10 +20,10 @@ function normalizeParsedTextRows(value) {
|
|
|
21
20
|
return [value];
|
|
22
21
|
return [{ value }];
|
|
23
22
|
}
|
|
24
|
-
function materializeRawTextRows(
|
|
25
|
-
const text = String(
|
|
26
|
-
const mimeType = String(
|
|
27
|
-
const name = String(
|
|
23
|
+
function materializeRawTextRows(resource) {
|
|
24
|
+
const text = String(resource.text ?? "");
|
|
25
|
+
const mimeType = String(resource.mimeType ?? "").toLowerCase();
|
|
26
|
+
const name = String(resource.name ?? "").toLowerCase();
|
|
28
27
|
const shouldParseJson = mimeType.includes("json") || name.endsWith(".json") || name.endsWith(".jsonl");
|
|
29
28
|
if (shouldParseJson) {
|
|
30
29
|
try {
|
|
@@ -67,10 +66,14 @@ function isPdfContentDisposition(value) {
|
|
|
67
66
|
const text = String(value ?? "").toLowerCase();
|
|
68
67
|
return text.includes("application/pdf") || text.includes(".pdf");
|
|
69
68
|
}
|
|
70
|
-
function
|
|
69
|
+
function sanitizeResourceFileName(value, fallback) {
|
|
71
70
|
const name = String(value ?? "").trim() || fallback;
|
|
72
71
|
const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
|
|
73
|
-
return cleaned
|
|
72
|
+
return cleaned || fallback;
|
|
73
|
+
}
|
|
74
|
+
function sanitizePdfFileName(value, fallback) {
|
|
75
|
+
const cleaned = sanitizeResourceFileName(value, fallback);
|
|
76
|
+
return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned}.pdf`;
|
|
74
77
|
}
|
|
75
78
|
function pdfTextRowsSchema() {
|
|
76
79
|
return {
|
|
@@ -98,14 +101,14 @@ function parseJsonlDataRows(content) {
|
|
|
98
101
|
.map((record) => record?.data)
|
|
99
102
|
.filter((row) => row && typeof row === "object" && !Array.isArray(row));
|
|
100
103
|
}
|
|
101
|
-
async function
|
|
102
|
-
const file = await readInstantFileStep({ runtime: state.runtime, fileId:
|
|
104
|
+
async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
|
|
105
|
+
const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
|
|
103
106
|
if (!isPdfContentDisposition(file.contentDisposition))
|
|
104
107
|
return null;
|
|
105
108
|
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
106
109
|
const outputPath = getDatasetOutputPath(targetDatasetId);
|
|
107
|
-
const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${
|
|
108
|
-
const
|
|
110
|
+
const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
|
|
111
|
+
const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
|
|
109
112
|
const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
|
|
110
113
|
await runDatasetSandboxCommandStep({
|
|
111
114
|
runtime: state.runtime,
|
|
@@ -116,7 +119,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
116
119
|
await writeDatasetSandboxFilesStep({
|
|
117
120
|
runtime: state.runtime,
|
|
118
121
|
sandboxId,
|
|
119
|
-
files: [{ path:
|
|
122
|
+
files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
|
|
120
123
|
});
|
|
121
124
|
const install = await runDatasetSandboxCommandStep({
|
|
122
125
|
runtime: state.runtime,
|
|
@@ -139,11 +142,11 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
139
142
|
"import sys",
|
|
140
143
|
"from pypdf import PdfReader",
|
|
141
144
|
"",
|
|
142
|
-
"
|
|
145
|
+
"resource_path = Path(sys.argv[1])",
|
|
143
146
|
"output_path = Path(sys.argv[2])",
|
|
144
147
|
"file_id = sys.argv[3]",
|
|
145
148
|
"file_name = sys.argv[4]",
|
|
146
|
-
"reader = PdfReader(str(
|
|
149
|
+
"reader = PdfReader(str(resource_path))",
|
|
147
150
|
"rows = 0",
|
|
148
151
|
"with output_path.open('w', encoding='utf-8') as out:",
|
|
149
152
|
" for index, page in enumerate(reader.pages, start=1):",
|
|
@@ -173,7 +176,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
173
176
|
runtime: state.runtime,
|
|
174
177
|
sandboxId,
|
|
175
178
|
cmd: "python",
|
|
176
|
-
args: [scriptPath,
|
|
179
|
+
args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
|
|
177
180
|
});
|
|
178
181
|
if (extraction.exitCode !== 0) {
|
|
179
182
|
throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
|
|
@@ -192,36 +195,45 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
192
195
|
sandboxId,
|
|
193
196
|
title: state.title ?? fileName,
|
|
194
197
|
instructions: state.instructions,
|
|
195
|
-
|
|
196
|
-
sourceKinds: ["file"],
|
|
198
|
+
contextId: state.contextId ?? "",
|
|
197
199
|
rows,
|
|
198
200
|
schema: pdfTextRowsSchema(),
|
|
199
201
|
first: state.first,
|
|
200
202
|
});
|
|
201
203
|
return targetDatasetId;
|
|
202
204
|
}
|
|
203
|
-
async function
|
|
204
|
-
const rows = materializeRawTextRows(
|
|
205
|
+
async function materializeRawTextResource(state, resource, targetDatasetId) {
|
|
206
|
+
const rows = materializeRawTextRows(resource);
|
|
205
207
|
await materializeRowsToDataset(state.runtime, {
|
|
206
208
|
datasetId: targetDatasetId,
|
|
207
209
|
sandboxId: state.sandboxId,
|
|
208
|
-
title: state.title ??
|
|
210
|
+
title: state.title ?? resource.name ?? targetDatasetId,
|
|
209
211
|
instructions: state.instructions,
|
|
210
|
-
|
|
211
|
-
{
|
|
212
|
-
kind: "text",
|
|
213
|
-
mimeType: source.mimeType,
|
|
214
|
-
name: source.name,
|
|
215
|
-
description: source.description,
|
|
216
|
-
},
|
|
217
|
-
],
|
|
218
|
-
sourceKinds: ["text"],
|
|
212
|
+
contextId: state.contextId ?? "",
|
|
219
213
|
rows,
|
|
220
214
|
schema: state.outputSchema,
|
|
221
215
|
first: state.first,
|
|
222
216
|
});
|
|
223
217
|
return targetDatasetId;
|
|
224
218
|
}
|
|
219
|
+
async function writePreparedFileResourceToSandbox(params) {
|
|
220
|
+
const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
|
|
221
|
+
const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
|
|
222
|
+
const fileName = sanitizeResourceFileName(params.filename ?? contentDispositionName, `${params.fileId}.bin`);
|
|
223
|
+
const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
|
|
224
|
+
await runDatasetSandboxCommandStep({
|
|
225
|
+
runtime: params.runtime,
|
|
226
|
+
sandboxId: params.sandboxId,
|
|
227
|
+
cmd: "mkdir",
|
|
228
|
+
args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
|
|
229
|
+
});
|
|
230
|
+
await writeDatasetSandboxFilesStep({
|
|
231
|
+
runtime: params.runtime,
|
|
232
|
+
sandboxId: params.sandboxId,
|
|
233
|
+
files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
|
|
234
|
+
});
|
|
235
|
+
return { fileName, resourcePath };
|
|
236
|
+
}
|
|
225
237
|
function resolveDatasetSandboxId(state, _targetDatasetId) {
|
|
226
238
|
const sandboxId = String(state.sandboxId ?? "").trim();
|
|
227
239
|
if (sandboxId)
|
|
@@ -249,8 +261,7 @@ export async function initializeDatasetStep(params) {
|
|
|
249
261
|
sandboxId: params.sandboxId,
|
|
250
262
|
title: params.title ?? params.datasetId,
|
|
251
263
|
instructions: params.instructions,
|
|
252
|
-
|
|
253
|
-
sourceKinds: params.sourceKinds,
|
|
264
|
+
contextId: params.contextId,
|
|
254
265
|
schema: params.schema,
|
|
255
266
|
status: "building",
|
|
256
267
|
});
|
|
@@ -259,12 +270,12 @@ export async function initializeDatasetStep(params) {
|
|
|
259
270
|
sandboxId: params.sandboxId,
|
|
260
271
|
};
|
|
261
272
|
}
|
|
262
|
-
export async function
|
|
273
|
+
export async function prepareDatasetResourcesStep(params) {
|
|
263
274
|
"use step";
|
|
264
275
|
if (params.kind === "file") {
|
|
265
|
-
const fileId = params.
|
|
266
|
-
? params.
|
|
267
|
-
: await
|
|
276
|
+
const fileId = params.resource.kind === "file"
|
|
277
|
+
? params.resource.fileId
|
|
278
|
+
: await uploadInlineTextResource(params.runtime, params.datasetId, params.resource);
|
|
268
279
|
return {
|
|
269
280
|
kind: "file",
|
|
270
281
|
datasetId: params.datasetId,
|
|
@@ -273,31 +284,31 @@ export async function prepareDatasetSourcesStep(params) {
|
|
|
273
284
|
sandboxState: { initialized: false, filePath: "" },
|
|
274
285
|
filePreview: undefined,
|
|
275
286
|
schema: params.schema ?? null,
|
|
276
|
-
filename: params.
|
|
277
|
-
mediaType: params.
|
|
287
|
+
filename: params.resource.kind === "file" ? params.resource.filename : params.resource.name,
|
|
288
|
+
mediaType: params.resource.kind === "file" ? params.resource.mediaType : params.resource.mimeType,
|
|
278
289
|
};
|
|
279
290
|
}
|
|
280
|
-
const initialized = await
|
|
291
|
+
const initialized = await ensureTransformInputsInSandboxStep({
|
|
281
292
|
runtime: params.runtime,
|
|
282
293
|
sandboxId: params.sandboxId,
|
|
283
294
|
datasetId: params.datasetId,
|
|
284
|
-
|
|
285
|
-
state: { initialized: false,
|
|
295
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
296
|
+
state: { initialized: false, inputPaths: [] },
|
|
286
297
|
});
|
|
287
|
-
const
|
|
298
|
+
const inputPreviews = await generateTransformInputPreviewsStep({
|
|
288
299
|
runtime: params.runtime,
|
|
289
300
|
sandboxId: params.sandboxId,
|
|
290
301
|
datasetId: params.datasetId,
|
|
291
|
-
|
|
302
|
+
inputPaths: initialized.inputPaths,
|
|
292
303
|
});
|
|
293
304
|
return {
|
|
294
305
|
kind: "transform",
|
|
295
306
|
datasetId: params.datasetId,
|
|
296
307
|
sandboxId: params.sandboxId,
|
|
297
|
-
|
|
308
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
298
309
|
outputSchema: params.outputSchema,
|
|
299
310
|
sandboxState: initialized.state,
|
|
300
|
-
|
|
311
|
+
inputPreviews,
|
|
301
312
|
};
|
|
302
313
|
}
|
|
303
314
|
export async function initializeDatasetContextStep(params) {
|
|
@@ -312,9 +323,9 @@ export async function initializeDatasetContextStep(params) {
|
|
|
312
323
|
return {
|
|
313
324
|
...params.prepared,
|
|
314
325
|
instructions: params.instructions,
|
|
315
|
-
prompt: params.prepared.
|
|
316
|
-
? "Transform the
|
|
317
|
-
: `Transform ${params.prepared.
|
|
326
|
+
prompt: params.prepared.inputDatasetIds.length === 1
|
|
327
|
+
? "Transform the input dataset into a new dataset matching the provided output schema"
|
|
328
|
+
: `Transform ${params.prepared.inputDatasetIds.length} input datasets into a new dataset matching the provided output schema`,
|
|
318
329
|
};
|
|
319
330
|
}
|
|
320
331
|
export async function completeDatasetStep(params) {
|
|
@@ -363,9 +374,9 @@ export async function completeDatasetStep(params) {
|
|
|
363
374
|
firstRow: firstResult.row,
|
|
364
375
|
};
|
|
365
376
|
}
|
|
366
|
-
export async function
|
|
367
|
-
if (
|
|
368
|
-
const materializedPdf = await
|
|
377
|
+
export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
|
|
378
|
+
if (resource.kind === "file" && !state.outputSchema) {
|
|
379
|
+
const materializedPdf = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
|
|
369
380
|
if (materializedPdf)
|
|
370
381
|
return materializedPdf;
|
|
371
382
|
}
|
|
@@ -379,35 +390,32 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
379
390
|
sandboxId,
|
|
380
391
|
title: state.title ?? targetDatasetId,
|
|
381
392
|
instructions: state.instructions,
|
|
382
|
-
|
|
383
|
-
source.kind === "file"
|
|
384
|
-
? {
|
|
385
|
-
kind: "file",
|
|
386
|
-
fileId: source.fileId,
|
|
387
|
-
description: source.description,
|
|
388
|
-
filename: source.filename,
|
|
389
|
-
mediaType: source.mediaType,
|
|
390
|
-
}
|
|
391
|
-
: {
|
|
392
|
-
kind: "text",
|
|
393
|
-
mimeType: source.mimeType,
|
|
394
|
-
name: source.name,
|
|
395
|
-
description: source.description,
|
|
396
|
-
},
|
|
397
|
-
],
|
|
398
|
-
sourceKinds: [source.kind],
|
|
393
|
+
contextId: state.contextId ?? "",
|
|
399
394
|
schema: state.outputSchema,
|
|
400
395
|
});
|
|
401
|
-
const prepared = await
|
|
396
|
+
const prepared = await prepareDatasetResourcesStep({
|
|
402
397
|
kind: "file",
|
|
403
398
|
runtime: state.runtime,
|
|
404
399
|
datasetId: targetDatasetId,
|
|
405
400
|
sandboxId,
|
|
406
|
-
|
|
401
|
+
resource,
|
|
407
402
|
schema: state.outputSchema,
|
|
408
403
|
});
|
|
404
|
+
if (prepared.kind !== "file") {
|
|
405
|
+
throw new Error("dataset_context_kind_mismatch:file");
|
|
406
|
+
}
|
|
407
|
+
const preparedFile = await writePreparedFileResourceToSandbox({
|
|
408
|
+
runtime: state.runtime,
|
|
409
|
+
sandboxId,
|
|
410
|
+
datasetId: targetDatasetId,
|
|
411
|
+
fileId: prepared.fileId,
|
|
412
|
+
filename: prepared.filename,
|
|
413
|
+
});
|
|
409
414
|
const context = await initializeDatasetContextStep({
|
|
410
|
-
prepared
|
|
415
|
+
prepared: {
|
|
416
|
+
...prepared,
|
|
417
|
+
filename: prepared.filename ?? preparedFile.fileName,
|
|
418
|
+
},
|
|
411
419
|
instructions: state.instructions,
|
|
412
420
|
outputSchema: state.outputSchema,
|
|
413
421
|
});
|
|
@@ -442,36 +450,40 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
442
450
|
});
|
|
443
451
|
return targetDatasetId;
|
|
444
452
|
}
|
|
445
|
-
async function
|
|
446
|
-
if (
|
|
447
|
-
return
|
|
453
|
+
async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
|
|
454
|
+
if (resource.kind === "dataset") {
|
|
455
|
+
return resource.datasetId;
|
|
448
456
|
}
|
|
449
|
-
const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId,
|
|
450
|
-
if (
|
|
451
|
-
await
|
|
457
|
+
const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
|
|
458
|
+
if (resource.kind === "query") {
|
|
459
|
+
await materializeQueryResource(state.runtime, resource, {
|
|
452
460
|
datasetId: intermediateDatasetId,
|
|
453
461
|
sandboxId: state.sandboxId,
|
|
454
|
-
title:
|
|
462
|
+
title: resource.title,
|
|
455
463
|
first: false,
|
|
464
|
+
contextId: state.contextId ?? "",
|
|
456
465
|
});
|
|
457
466
|
return intermediateDatasetId;
|
|
458
467
|
}
|
|
459
|
-
if (
|
|
460
|
-
await
|
|
468
|
+
if (resource.kind === "text") {
|
|
469
|
+
await materializeRawTextResource({
|
|
461
470
|
...state,
|
|
462
471
|
outputSchema: undefined,
|
|
463
472
|
first: false,
|
|
464
|
-
instructions:
|
|
465
|
-
title:
|
|
466
|
-
},
|
|
473
|
+
instructions: buildRawResourceInstructions(resource.kind),
|
|
474
|
+
title: resource.name ?? state.title,
|
|
475
|
+
}, resource, intermediateDatasetId);
|
|
467
476
|
return intermediateDatasetId;
|
|
468
477
|
}
|
|
469
|
-
|
|
478
|
+
if (resource.kind === "context") {
|
|
479
|
+
throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
|
|
480
|
+
}
|
|
481
|
+
await materializeSingleFileLikeResource({
|
|
470
482
|
...state,
|
|
471
483
|
outputSchema: undefined,
|
|
472
484
|
first: false,
|
|
473
|
-
instructions:
|
|
474
|
-
},
|
|
485
|
+
instructions: buildRawResourceInstructions(resource.kind),
|
|
486
|
+
}, resource, intermediateDatasetId);
|
|
475
487
|
return intermediateDatasetId;
|
|
476
488
|
}
|
|
477
489
|
export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
@@ -480,9 +492,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
480
492
|
}
|
|
481
493
|
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
482
494
|
const stateWithSandbox = { ...state, sandboxId };
|
|
483
|
-
const
|
|
484
|
-
for (let index = 0; index < stateWithSandbox.
|
|
485
|
-
|
|
495
|
+
const normalizedResources = [];
|
|
496
|
+
for (let index = 0; index < stateWithSandbox.resources.length; index++) {
|
|
497
|
+
normalizedResources.push(await normalizeResourceToDatasetId(stateWithSandbox, stateWithSandbox.resources[index], targetDatasetId, index));
|
|
486
498
|
}
|
|
487
499
|
const transformSchema = stateWithSandbox.outputSchema ??
|
|
488
500
|
{
|
|
@@ -500,60 +512,51 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
500
512
|
sandboxId,
|
|
501
513
|
title: stateWithSandbox.title ?? targetDatasetId,
|
|
502
514
|
instructions: stateWithSandbox.instructions,
|
|
503
|
-
|
|
504
|
-
? {
|
|
505
|
-
kind: "query",
|
|
506
|
-
query: source.query,
|
|
507
|
-
title: source.title,
|
|
508
|
-
explanation: source.explanation,
|
|
509
|
-
...getDomainDescriptor(source.domain),
|
|
510
|
-
}
|
|
511
|
-
: source),
|
|
512
|
-
sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
|
|
515
|
+
contextId: stateWithSandbox.contextId ?? "",
|
|
513
516
|
schema: transformSchema,
|
|
514
517
|
});
|
|
515
|
-
const prepared = await
|
|
518
|
+
const prepared = await prepareDatasetResourcesStep({
|
|
516
519
|
kind: "transform",
|
|
517
520
|
runtime: stateWithSandbox.runtime,
|
|
518
521
|
datasetId: targetDatasetId,
|
|
519
522
|
sandboxId,
|
|
520
|
-
|
|
523
|
+
inputDatasetIds: normalizedResources,
|
|
521
524
|
outputSchema: transformSchema,
|
|
522
525
|
});
|
|
523
526
|
const context = await initializeDatasetContextStep({
|
|
524
527
|
prepared,
|
|
525
|
-
instructions: buildTransformInstructions(
|
|
528
|
+
instructions: buildTransformInstructions(normalizedResources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
|
|
526
529
|
outputSchema: transformSchema,
|
|
527
530
|
});
|
|
528
531
|
if (context.kind !== "transform") {
|
|
529
532
|
throw new Error("dataset_context_kind_mismatch:transform");
|
|
530
533
|
}
|
|
531
534
|
const transformContext = createTransformDatasetContext({
|
|
532
|
-
|
|
535
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
533
536
|
outputSchema: context.outputSchema,
|
|
534
537
|
instructions: context.instructions,
|
|
535
538
|
datasetId: context.datasetId,
|
|
536
539
|
reactor: stateWithSandbox.reactor,
|
|
537
540
|
sandboxId: context.sandboxId,
|
|
538
541
|
sandboxState: context.sandboxState,
|
|
539
|
-
|
|
542
|
+
inputPreviews: context.inputPreviews,
|
|
540
543
|
});
|
|
541
544
|
await transformContext.transform(stateWithSandbox.runtime, {
|
|
542
545
|
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
543
546
|
prompt: context.prompt,
|
|
544
547
|
initialContent: {
|
|
545
548
|
datasetId: context.datasetId,
|
|
546
|
-
|
|
549
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
547
550
|
outputSchema: context.outputSchema,
|
|
548
551
|
instructions: context.instructions,
|
|
549
552
|
sandboxId: context.sandboxId,
|
|
550
553
|
sandboxState: context.sandboxState,
|
|
551
|
-
|
|
554
|
+
inputPreviews: context.inputPreviews,
|
|
552
555
|
},
|
|
553
556
|
});
|
|
554
557
|
return targetDatasetId;
|
|
555
558
|
}
|
|
556
559
|
registerDatasetAgentMaterializers({
|
|
557
|
-
|
|
560
|
+
materializeSingleFileLikeResource,
|
|
558
561
|
materializeDerivedDataset,
|
|
559
562
|
});
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput,
|
|
2
|
-
export declare function
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
|
|
2
|
+
export declare function materializeQueryResource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], resource: Extract<InternalDatasetResource, {
|
|
3
3
|
kind: "query";
|
|
4
4
|
}>, params: {
|
|
5
5
|
datasetId: string;
|
|
@@ -8,4 +8,5 @@ export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime
|
|
|
8
8
|
title?: string;
|
|
9
9
|
instructions?: string;
|
|
10
10
|
first?: boolean;
|
|
11
|
+
contextId: string;
|
|
11
12
|
}): Promise<string>;
|
|
@@ -1,35 +1,26 @@
|
|
|
1
1
|
import { materializeRowsToDataset } from "./persistence.js";
|
|
2
|
-
import { getDomainDescriptor, normalizeQueryRows } from "./
|
|
3
|
-
async function
|
|
2
|
+
import { getDomainDescriptor, normalizeQueryRows } from "./rows.js";
|
|
3
|
+
async function readQueryResourceRowsStep(params) {
|
|
4
4
|
"use step";
|
|
5
5
|
const db = await params.runtime.db();
|
|
6
6
|
const result = await db.query(params.query);
|
|
7
7
|
return { rows: normalizeQueryRows(result) };
|
|
8
8
|
}
|
|
9
|
-
export async function
|
|
10
|
-
const { rows } = await
|
|
9
|
+
export async function materializeQueryResource(runtime, resource, params) {
|
|
10
|
+
const { rows } = await readQueryResourceRowsStep({
|
|
11
11
|
runtime,
|
|
12
|
-
query:
|
|
12
|
+
query: resource.query,
|
|
13
13
|
});
|
|
14
|
-
const domainDescriptor = getDomainDescriptor(
|
|
14
|
+
const domainDescriptor = getDomainDescriptor(resource.domain);
|
|
15
15
|
return await materializeRowsToDataset(runtime, {
|
|
16
16
|
datasetId: params.datasetId,
|
|
17
17
|
sandboxId: params.sandboxId,
|
|
18
|
-
title: params.title ??
|
|
18
|
+
title: params.title ?? resource.title,
|
|
19
19
|
instructions: params.instructions,
|
|
20
|
-
|
|
21
|
-
{
|
|
22
|
-
kind: "query",
|
|
23
|
-
query: source.query,
|
|
24
|
-
title: source.title,
|
|
25
|
-
explanation: source.explanation,
|
|
26
|
-
...domainDescriptor,
|
|
27
|
-
},
|
|
28
|
-
],
|
|
29
|
-
sourceKinds: ["query"],
|
|
20
|
+
contextId: params.contextId,
|
|
30
21
|
analysis: {
|
|
31
|
-
query:
|
|
32
|
-
explanation:
|
|
22
|
+
query: resource.query,
|
|
23
|
+
explanation: resource.explanation,
|
|
33
24
|
...domainDescriptor,
|
|
34
25
|
},
|
|
35
26
|
rows,
|
|
@@ -1,19 +1,18 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuildResult,
|
|
2
|
-
export declare function
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextResourceInput, MaterializeRowsParams } from "./types.js";
|
|
2
|
+
export declare function defaultTextResourceName(resource: DatasetTextResourceInput): string;
|
|
3
3
|
export declare function getDatasetDb<Runtime extends AnyDatasetRuntime>(runtime: Runtime): Promise<any>;
|
|
4
4
|
export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
|
|
5
5
|
datasetId: string;
|
|
6
6
|
sandboxId?: string;
|
|
7
7
|
title?: string;
|
|
8
8
|
instructions?: string;
|
|
9
|
-
|
|
10
|
-
sourceKinds: string[];
|
|
9
|
+
contextId: string;
|
|
11
10
|
analysis?: any;
|
|
12
11
|
schema?: any;
|
|
13
12
|
status?: string;
|
|
14
13
|
}): Promise<void>;
|
|
15
14
|
export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
|
|
16
|
-
export declare function
|
|
15
|
+
export declare function uploadInlineTextResource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, resource: DatasetTextResourceInput): Promise<string>;
|
|
17
16
|
export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
|
|
18
17
|
export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
|
|
19
18
|
datasetId: string;
|
|
@@ -2,18 +2,18 @@ import { DatasetService } from "../service.js";
|
|
|
2
2
|
import { datasetDomain } from "../schema.js";
|
|
3
3
|
import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
|
|
4
4
|
import { inferDatasetSchema, validateRows } from "./schemaInference.js";
|
|
5
|
-
import { rowsToJsonl } from "./
|
|
6
|
-
export function
|
|
7
|
-
if (
|
|
8
|
-
return
|
|
9
|
-
const mimeType = String(
|
|
5
|
+
import { rowsToJsonl } from "./rows.js";
|
|
6
|
+
export function defaultTextResourceName(resource) {
|
|
7
|
+
if (resource.name?.trim())
|
|
8
|
+
return resource.name.trim();
|
|
9
|
+
const mimeType = String(resource.mimeType ?? "").toLowerCase();
|
|
10
10
|
if (mimeType.includes("csv"))
|
|
11
|
-
return "
|
|
11
|
+
return "resource.csv";
|
|
12
12
|
if (mimeType.includes("json"))
|
|
13
|
-
return "
|
|
13
|
+
return "resource.json";
|
|
14
14
|
if (mimeType.includes("yaml") || mimeType.includes("yml"))
|
|
15
|
-
return "
|
|
16
|
-
return "
|
|
15
|
+
return "resource.yaml";
|
|
16
|
+
return "resource.txt";
|
|
17
17
|
}
|
|
18
18
|
export async function getDatasetDb(runtime) {
|
|
19
19
|
const scoped = await runtime.use(datasetDomain);
|
|
@@ -21,6 +21,9 @@ export async function getDatasetDb(runtime) {
|
|
|
21
21
|
}
|
|
22
22
|
export async function createOrUpdateDatasetMetadata(runtime, params) {
|
|
23
23
|
"use step";
|
|
24
|
+
if (!params.contextId.trim()) {
|
|
25
|
+
throw new Error("dataset_context_required");
|
|
26
|
+
}
|
|
24
27
|
const db = await getDatasetDb(runtime);
|
|
25
28
|
const service = new DatasetService(db);
|
|
26
29
|
const result = await service.createDataset({
|
|
@@ -28,8 +31,7 @@ export async function createOrUpdateDatasetMetadata(runtime, params) {
|
|
|
28
31
|
sandboxId: params.sandboxId,
|
|
29
32
|
title: params.title ?? params.datasetId,
|
|
30
33
|
instructions: params.instructions ?? "",
|
|
31
|
-
|
|
32
|
-
sourceKinds: params.sourceKinds,
|
|
34
|
+
contextId: params.contextId,
|
|
33
35
|
analysis: params.analysis,
|
|
34
36
|
schema: params.schema,
|
|
35
37
|
status: params.status ?? "building",
|
|
@@ -52,8 +54,7 @@ export async function materializeRowsToDataset(runtime, params) {
|
|
|
52
54
|
sandboxId: params.sandboxId,
|
|
53
55
|
title: params.title,
|
|
54
56
|
instructions: params.instructions,
|
|
55
|
-
|
|
56
|
-
sourceKinds: params.sourceKinds,
|
|
57
|
+
contextId: params.contextId,
|
|
57
58
|
analysis: params.analysis,
|
|
58
59
|
schema: resolvedSchema,
|
|
59
60
|
status: "building",
|
|
@@ -78,18 +79,18 @@ export async function materializeRowsToDataset(runtime, params) {
|
|
|
78
79
|
}
|
|
79
80
|
return params.datasetId;
|
|
80
81
|
}
|
|
81
|
-
export async function
|
|
82
|
+
export async function uploadInlineTextResource(runtime, datasetId, resource) {
|
|
82
83
|
"use step";
|
|
83
84
|
const db = await getDatasetDb(runtime);
|
|
84
|
-
const fileName =
|
|
85
|
-
const storagePath = `/dataset/
|
|
86
|
-
const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(
|
|
87
|
-
contentType:
|
|
85
|
+
const fileName = defaultTextResourceName(resource);
|
|
86
|
+
const storagePath = `/dataset/resource/${datasetId}/${Date.now()}-${fileName}`;
|
|
87
|
+
const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(resource.text, "utf-8"), {
|
|
88
|
+
contentType: resource.mimeType ?? "text/plain",
|
|
88
89
|
contentDisposition: fileName,
|
|
89
90
|
});
|
|
90
91
|
const fileId = uploadResult?.data?.id;
|
|
91
92
|
if (!fileId) {
|
|
92
|
-
throw new Error("
|
|
93
|
+
throw new Error("dataset_text_resource_upload_failed");
|
|
93
94
|
}
|
|
94
95
|
return fileId;
|
|
95
96
|
}
|