@ekairos/dataset 1.22.82-beta.development.0 → 1.22.84-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/agentMaterializers.d.ts +2 -2
- package/dist/builder/context.d.ts +7 -0
- package/dist/builder/context.js +192 -0
- package/dist/builder/instructions.d.ts +3 -3
- package/dist/builder/instructions.js +10 -10
- package/dist/builder/materialize.d.ts +12 -11
- package/dist/builder/materialize.js +122 -121
- package/dist/builder/materializeQuery.d.ts +3 -2
- package/dist/builder/materializeQuery.js +10 -19
- package/dist/builder/persistence.d.ts +4 -5
- package/dist/builder/persistence.js +20 -19
- package/dist/builder/types.d.ts +31 -24
- package/dist/completeDataset.steps.d.ts +9 -8
- package/dist/completeDataset.steps.js +18 -11
- package/dist/completeDataset.tool.d.ts +9 -8
- package/dist/completeDataset.tool.js +2 -1
- package/dist/contextWorkspace.d.ts +72 -0
- package/dist/contextWorkspace.js +218 -0
- package/dist/dataset.d.ts +1 -1
- package/dist/dataset.js +42 -29
- package/dist/datasetFiles.d.ts +1 -1
- package/dist/datasetFiles.js +3 -3
- package/dist/executeCommand.tool.d.ts +1 -43
- package/dist/executeCommand.tool.js +10 -3
- package/dist/file/file-dataset.agent.d.ts +2 -0
- package/dist/file/file-dataset.agent.js +51 -16
- package/dist/file/file-dataset.steps.d.ts +6 -0
- package/dist/file/file-dataset.steps.js +18 -21
- package/dist/file/file-dataset.types.d.ts +10 -0
- package/dist/file/prompts.js +16 -14
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/materializeDataset.tool.d.ts +34 -26
- package/dist/materializeDataset.tool.js +40 -29
- package/dist/schema.d.ts +12 -2
- package/dist/schema.js +6 -3
- package/dist/service.d.ts +2 -2
- package/dist/service.js +6 -3
- package/dist/transform/filepreview.d.ts +2 -2
- package/dist/transform/filepreview.js +3 -3
- package/dist/transform/prompts.js +25 -25
- package/dist/transform/transform-dataset.agent.d.ts +4 -4
- package/dist/transform/transform-dataset.agent.js +29 -30
- package/dist/transform/transform-dataset.steps.d.ts +7 -7
- package/dist/transform/transform-dataset.steps.js +20 -20
- package/dist/transform/transform-dataset.types.d.ts +13 -13
- package/dist/transform/transformDataset.js +4 -4
- package/package.json +4 -4
- /package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -0
- /package/dist/builder/{sourceRows.js → rows.js} +0 -0
|
@@ -1,18 +1,16 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
2
|
import { readInstantFileStep } from "../file/steps.js";
|
|
3
|
-
import { generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "../file/file-dataset.steps.js";
|
|
4
3
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
5
|
-
import {
|
|
4
|
+
import { ensureTransformInputsInSandboxStep, generateTransformInputPreviewsStep, } from "../transform/transform-dataset.steps.js";
|
|
6
5
|
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
7
|
-
import { getDatasetOutputPath, getDatasetScriptsDir,
|
|
6
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
8
7
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
9
|
-
import { buildFileDefaultInstructions,
|
|
10
|
-
import { createOrUpdateDatasetMetadata, materializeRowsToDataset,
|
|
11
|
-
import {
|
|
12
|
-
import { materializeQuerySource } from "./materializeQuery.js";
|
|
8
|
+
import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
9
|
+
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
|
|
10
|
+
import { materializeQueryResource } from "./materializeQuery.js";
|
|
13
11
|
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
14
|
-
function makeIntermediateDatasetId(targetDatasetId,
|
|
15
|
-
return `${targetDatasetId}__${
|
|
12
|
+
function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
|
|
13
|
+
return `${targetDatasetId}__${resourceKind}_${index}`;
|
|
16
14
|
}
|
|
17
15
|
function normalizeParsedTextRows(value) {
|
|
18
16
|
if (Array.isArray(value)) {
|
|
@@ -22,10 +20,10 @@ function normalizeParsedTextRows(value) {
|
|
|
22
20
|
return [value];
|
|
23
21
|
return [{ value }];
|
|
24
22
|
}
|
|
25
|
-
function materializeRawTextRows(
|
|
26
|
-
const text = String(
|
|
27
|
-
const mimeType = String(
|
|
28
|
-
const name = String(
|
|
23
|
+
function materializeRawTextRows(resource) {
|
|
24
|
+
const text = String(resource.text ?? "");
|
|
25
|
+
const mimeType = String(resource.mimeType ?? "").toLowerCase();
|
|
26
|
+
const name = String(resource.name ?? "").toLowerCase();
|
|
29
27
|
const shouldParseJson = mimeType.includes("json") || name.endsWith(".json") || name.endsWith(".jsonl");
|
|
30
28
|
if (shouldParseJson) {
|
|
31
29
|
try {
|
|
@@ -68,10 +66,14 @@ function isPdfContentDisposition(value) {
|
|
|
68
66
|
const text = String(value ?? "").toLowerCase();
|
|
69
67
|
return text.includes("application/pdf") || text.includes(".pdf");
|
|
70
68
|
}
|
|
71
|
-
function
|
|
69
|
+
function sanitizeResourceFileName(value, fallback) {
|
|
72
70
|
const name = String(value ?? "").trim() || fallback;
|
|
73
71
|
const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_").slice(0, 120);
|
|
74
|
-
return cleaned
|
|
72
|
+
return cleaned || fallback;
|
|
73
|
+
}
|
|
74
|
+
function sanitizePdfFileName(value, fallback) {
|
|
75
|
+
const cleaned = sanitizeResourceFileName(value, fallback);
|
|
76
|
+
return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned}.pdf`;
|
|
75
77
|
}
|
|
76
78
|
function pdfTextRowsSchema() {
|
|
77
79
|
return {
|
|
@@ -99,14 +101,14 @@ function parseJsonlDataRows(content) {
|
|
|
99
101
|
.map((record) => record?.data)
|
|
100
102
|
.filter((row) => row && typeof row === "object" && !Array.isArray(row));
|
|
101
103
|
}
|
|
102
|
-
async function
|
|
103
|
-
const file = await readInstantFileStep({ runtime: state.runtime, fileId:
|
|
104
|
+
async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
|
|
105
|
+
const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
|
|
104
106
|
if (!isPdfContentDisposition(file.contentDisposition))
|
|
105
107
|
return null;
|
|
106
108
|
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
107
109
|
const outputPath = getDatasetOutputPath(targetDatasetId);
|
|
108
|
-
const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${
|
|
109
|
-
const
|
|
110
|
+
const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
|
|
111
|
+
const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
|
|
110
112
|
const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
|
|
111
113
|
await runDatasetSandboxCommandStep({
|
|
112
114
|
runtime: state.runtime,
|
|
@@ -117,7 +119,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
117
119
|
await writeDatasetSandboxFilesStep({
|
|
118
120
|
runtime: state.runtime,
|
|
119
121
|
sandboxId,
|
|
120
|
-
files: [{ path:
|
|
122
|
+
files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
|
|
121
123
|
});
|
|
122
124
|
const install = await runDatasetSandboxCommandStep({
|
|
123
125
|
runtime: state.runtime,
|
|
@@ -140,11 +142,11 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
140
142
|
"import sys",
|
|
141
143
|
"from pypdf import PdfReader",
|
|
142
144
|
"",
|
|
143
|
-
"
|
|
145
|
+
"resource_path = Path(sys.argv[1])",
|
|
144
146
|
"output_path = Path(sys.argv[2])",
|
|
145
147
|
"file_id = sys.argv[3]",
|
|
146
148
|
"file_name = sys.argv[4]",
|
|
147
|
-
"reader = PdfReader(str(
|
|
149
|
+
"reader = PdfReader(str(resource_path))",
|
|
148
150
|
"rows = 0",
|
|
149
151
|
"with output_path.open('w', encoding='utf-8') as out:",
|
|
150
152
|
" for index, page in enumerate(reader.pages, start=1):",
|
|
@@ -174,7 +176,7 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
174
176
|
runtime: state.runtime,
|
|
175
177
|
sandboxId,
|
|
176
178
|
cmd: "python",
|
|
177
|
-
args: [scriptPath,
|
|
179
|
+
args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
|
|
178
180
|
});
|
|
179
181
|
if (extraction.exitCode !== 0) {
|
|
180
182
|
throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
|
|
@@ -193,36 +195,45 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
193
195
|
sandboxId,
|
|
194
196
|
title: state.title ?? fileName,
|
|
195
197
|
instructions: state.instructions,
|
|
196
|
-
|
|
197
|
-
sourceKinds: ["file"],
|
|
198
|
+
contextId: state.contextId ?? "",
|
|
198
199
|
rows,
|
|
199
200
|
schema: pdfTextRowsSchema(),
|
|
200
201
|
first: state.first,
|
|
201
202
|
});
|
|
202
203
|
return targetDatasetId;
|
|
203
204
|
}
|
|
204
|
-
async function
|
|
205
|
-
const rows = materializeRawTextRows(
|
|
205
|
+
async function materializeRawTextResource(state, resource, targetDatasetId) {
|
|
206
|
+
const rows = materializeRawTextRows(resource);
|
|
206
207
|
await materializeRowsToDataset(state.runtime, {
|
|
207
208
|
datasetId: targetDatasetId,
|
|
208
209
|
sandboxId: state.sandboxId,
|
|
209
|
-
title: state.title ??
|
|
210
|
+
title: state.title ?? resource.name ?? targetDatasetId,
|
|
210
211
|
instructions: state.instructions,
|
|
211
|
-
|
|
212
|
-
{
|
|
213
|
-
kind: "text",
|
|
214
|
-
mimeType: source.mimeType,
|
|
215
|
-
name: source.name,
|
|
216
|
-
description: source.description,
|
|
217
|
-
},
|
|
218
|
-
],
|
|
219
|
-
sourceKinds: ["text"],
|
|
212
|
+
contextId: state.contextId ?? "",
|
|
220
213
|
rows,
|
|
221
214
|
schema: state.outputSchema,
|
|
222
215
|
first: state.first,
|
|
223
216
|
});
|
|
224
217
|
return targetDatasetId;
|
|
225
218
|
}
|
|
219
|
+
async function writePreparedFileResourceToSandbox(params) {
|
|
220
|
+
const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
|
|
221
|
+
const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
|
|
222
|
+
const fileName = sanitizeResourceFileName(params.filename ?? contentDispositionName, `${params.fileId}.bin`);
|
|
223
|
+
const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
|
|
224
|
+
await runDatasetSandboxCommandStep({
|
|
225
|
+
runtime: params.runtime,
|
|
226
|
+
sandboxId: params.sandboxId,
|
|
227
|
+
cmd: "mkdir",
|
|
228
|
+
args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
|
|
229
|
+
});
|
|
230
|
+
await writeDatasetSandboxFilesStep({
|
|
231
|
+
runtime: params.runtime,
|
|
232
|
+
sandboxId: params.sandboxId,
|
|
233
|
+
files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
|
|
234
|
+
});
|
|
235
|
+
return { fileName, resourcePath };
|
|
236
|
+
}
|
|
226
237
|
function resolveDatasetSandboxId(state, _targetDatasetId) {
|
|
227
238
|
const sandboxId = String(state.sandboxId ?? "").trim();
|
|
228
239
|
if (sandboxId)
|
|
@@ -250,8 +261,7 @@ export async function initializeDatasetStep(params) {
|
|
|
250
261
|
sandboxId: params.sandboxId,
|
|
251
262
|
title: params.title ?? params.datasetId,
|
|
252
263
|
instructions: params.instructions,
|
|
253
|
-
|
|
254
|
-
sourceKinds: params.sourceKinds,
|
|
264
|
+
contextId: params.contextId,
|
|
255
265
|
schema: params.schema,
|
|
256
266
|
status: "building",
|
|
257
267
|
});
|
|
@@ -260,56 +270,45 @@ export async function initializeDatasetStep(params) {
|
|
|
260
270
|
sandboxId: params.sandboxId,
|
|
261
271
|
};
|
|
262
272
|
}
|
|
263
|
-
export async function
|
|
273
|
+
export async function prepareDatasetResourcesStep(params) {
|
|
264
274
|
"use step";
|
|
265
275
|
if (params.kind === "file") {
|
|
266
|
-
const fileId = params.
|
|
267
|
-
? params.
|
|
268
|
-
: await
|
|
269
|
-
const initialized = await initializeFileParseSandboxStep({
|
|
270
|
-
runtime: params.runtime,
|
|
271
|
-
sandboxId: params.sandboxId,
|
|
272
|
-
datasetId: params.datasetId,
|
|
273
|
-
fileId,
|
|
274
|
-
state: { initialized: false, filePath: "" },
|
|
275
|
-
});
|
|
276
|
-
const filePreview = await generateFileParsePreviewStep({
|
|
277
|
-
runtime: params.runtime,
|
|
278
|
-
sandboxId: params.sandboxId,
|
|
279
|
-
sandboxFilePath: initialized.filePath,
|
|
280
|
-
datasetId: params.datasetId,
|
|
281
|
-
});
|
|
276
|
+
const fileId = params.resource.kind === "file"
|
|
277
|
+
? params.resource.fileId
|
|
278
|
+
: await uploadInlineTextResource(params.runtime, params.datasetId, params.resource);
|
|
282
279
|
return {
|
|
283
280
|
kind: "file",
|
|
284
281
|
datasetId: params.datasetId,
|
|
285
282
|
sandboxId: params.sandboxId,
|
|
286
283
|
fileId,
|
|
287
|
-
sandboxState: initialized
|
|
288
|
-
filePreview,
|
|
284
|
+
sandboxState: { initialized: false, filePath: "" },
|
|
285
|
+
filePreview: undefined,
|
|
289
286
|
schema: params.schema ?? null,
|
|
287
|
+
filename: params.resource.kind === "file" ? params.resource.filename : params.resource.name,
|
|
288
|
+
mediaType: params.resource.kind === "file" ? params.resource.mediaType : params.resource.mimeType,
|
|
290
289
|
};
|
|
291
290
|
}
|
|
292
|
-
const initialized = await
|
|
291
|
+
const initialized = await ensureTransformInputsInSandboxStep({
|
|
293
292
|
runtime: params.runtime,
|
|
294
293
|
sandboxId: params.sandboxId,
|
|
295
294
|
datasetId: params.datasetId,
|
|
296
|
-
|
|
297
|
-
state: { initialized: false,
|
|
295
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
296
|
+
state: { initialized: false, inputPaths: [] },
|
|
298
297
|
});
|
|
299
|
-
const
|
|
298
|
+
const inputPreviews = await generateTransformInputPreviewsStep({
|
|
300
299
|
runtime: params.runtime,
|
|
301
300
|
sandboxId: params.sandboxId,
|
|
302
301
|
datasetId: params.datasetId,
|
|
303
|
-
|
|
302
|
+
inputPaths: initialized.inputPaths,
|
|
304
303
|
});
|
|
305
304
|
return {
|
|
306
305
|
kind: "transform",
|
|
307
306
|
datasetId: params.datasetId,
|
|
308
307
|
sandboxId: params.sandboxId,
|
|
309
|
-
|
|
308
|
+
inputDatasetIds: params.inputDatasetIds,
|
|
310
309
|
outputSchema: params.outputSchema,
|
|
311
310
|
sandboxState: initialized.state,
|
|
312
|
-
|
|
311
|
+
inputPreviews,
|
|
313
312
|
};
|
|
314
313
|
}
|
|
315
314
|
export async function initializeDatasetContextStep(params) {
|
|
@@ -324,9 +323,9 @@ export async function initializeDatasetContextStep(params) {
|
|
|
324
323
|
return {
|
|
325
324
|
...params.prepared,
|
|
326
325
|
instructions: params.instructions,
|
|
327
|
-
prompt: params.prepared.
|
|
328
|
-
? "Transform the
|
|
329
|
-
: `Transform ${params.prepared.
|
|
326
|
+
prompt: params.prepared.inputDatasetIds.length === 1
|
|
327
|
+
? "Transform the input dataset into a new dataset matching the provided output schema"
|
|
328
|
+
: `Transform ${params.prepared.inputDatasetIds.length} input datasets into a new dataset matching the provided output schema`,
|
|
330
329
|
};
|
|
331
330
|
}
|
|
332
331
|
export async function completeDatasetStep(params) {
|
|
@@ -375,9 +374,9 @@ export async function completeDatasetStep(params) {
|
|
|
375
374
|
firstRow: firstResult.row,
|
|
376
375
|
};
|
|
377
376
|
}
|
|
378
|
-
export async function
|
|
379
|
-
if (
|
|
380
|
-
const materializedPdf = await
|
|
377
|
+
export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
|
|
378
|
+
if (resource.kind === "file" && !state.outputSchema) {
|
|
379
|
+
const materializedPdf = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
|
|
381
380
|
if (materializedPdf)
|
|
382
381
|
return materializedPdf;
|
|
383
382
|
}
|
|
@@ -391,29 +390,32 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
391
390
|
sandboxId,
|
|
392
391
|
title: state.title ?? targetDatasetId,
|
|
393
392
|
instructions: state.instructions,
|
|
394
|
-
|
|
395
|
-
source.kind === "file"
|
|
396
|
-
? { kind: "file", fileId: source.fileId, description: source.description }
|
|
397
|
-
: {
|
|
398
|
-
kind: "text",
|
|
399
|
-
mimeType: source.mimeType,
|
|
400
|
-
name: source.name,
|
|
401
|
-
description: source.description,
|
|
402
|
-
},
|
|
403
|
-
],
|
|
404
|
-
sourceKinds: [source.kind],
|
|
393
|
+
contextId: state.contextId ?? "",
|
|
405
394
|
schema: state.outputSchema,
|
|
406
395
|
});
|
|
407
|
-
const prepared = await
|
|
396
|
+
const prepared = await prepareDatasetResourcesStep({
|
|
408
397
|
kind: "file",
|
|
409
398
|
runtime: state.runtime,
|
|
410
399
|
datasetId: targetDatasetId,
|
|
411
400
|
sandboxId,
|
|
412
|
-
|
|
401
|
+
resource,
|
|
413
402
|
schema: state.outputSchema,
|
|
414
403
|
});
|
|
404
|
+
if (prepared.kind !== "file") {
|
|
405
|
+
throw new Error("dataset_context_kind_mismatch:file");
|
|
406
|
+
}
|
|
407
|
+
const preparedFile = await writePreparedFileResourceToSandbox({
|
|
408
|
+
runtime: state.runtime,
|
|
409
|
+
sandboxId,
|
|
410
|
+
datasetId: targetDatasetId,
|
|
411
|
+
fileId: prepared.fileId,
|
|
412
|
+
filename: prepared.filename,
|
|
413
|
+
});
|
|
415
414
|
const context = await initializeDatasetContextStep({
|
|
416
|
-
prepared
|
|
415
|
+
prepared: {
|
|
416
|
+
...prepared,
|
|
417
|
+
filename: prepared.filename ?? preparedFile.fileName,
|
|
418
|
+
},
|
|
417
419
|
instructions: state.instructions,
|
|
418
420
|
outputSchema: state.outputSchema,
|
|
419
421
|
});
|
|
@@ -428,6 +430,8 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
428
430
|
sandboxState: context.sandboxState,
|
|
429
431
|
filePreview: context.filePreview,
|
|
430
432
|
schema: context.schema,
|
|
433
|
+
filename: context.filename,
|
|
434
|
+
mediaType: context.mediaType,
|
|
431
435
|
});
|
|
432
436
|
await parseContext.parse(state.runtime, {
|
|
433
437
|
durable: await resolveDatasetAgentDurable(state.durable),
|
|
@@ -440,40 +444,46 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
440
444
|
sandboxState: context.sandboxState,
|
|
441
445
|
filePreview: context.filePreview,
|
|
442
446
|
schema: context.schema,
|
|
447
|
+
filename: context.filename,
|
|
448
|
+
mediaType: context.mediaType,
|
|
443
449
|
},
|
|
444
450
|
});
|
|
445
451
|
return targetDatasetId;
|
|
446
452
|
}
|
|
447
|
-
async function
|
|
448
|
-
if (
|
|
449
|
-
return
|
|
453
|
+
async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
|
|
454
|
+
if (resource.kind === "dataset") {
|
|
455
|
+
return resource.datasetId;
|
|
450
456
|
}
|
|
451
|
-
const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId,
|
|
452
|
-
if (
|
|
453
|
-
await
|
|
457
|
+
const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
|
|
458
|
+
if (resource.kind === "query") {
|
|
459
|
+
await materializeQueryResource(state.runtime, resource, {
|
|
454
460
|
datasetId: intermediateDatasetId,
|
|
455
461
|
sandboxId: state.sandboxId,
|
|
456
|
-
title:
|
|
462
|
+
title: resource.title,
|
|
457
463
|
first: false,
|
|
464
|
+
contextId: state.contextId ?? "",
|
|
458
465
|
});
|
|
459
466
|
return intermediateDatasetId;
|
|
460
467
|
}
|
|
461
|
-
if (
|
|
462
|
-
await
|
|
468
|
+
if (resource.kind === "text") {
|
|
469
|
+
await materializeRawTextResource({
|
|
463
470
|
...state,
|
|
464
471
|
outputSchema: undefined,
|
|
465
472
|
first: false,
|
|
466
|
-
instructions:
|
|
467
|
-
title:
|
|
468
|
-
},
|
|
473
|
+
instructions: buildRawResourceInstructions(resource.kind),
|
|
474
|
+
title: resource.name ?? state.title,
|
|
475
|
+
}, resource, intermediateDatasetId);
|
|
469
476
|
return intermediateDatasetId;
|
|
470
477
|
}
|
|
471
|
-
|
|
478
|
+
if (resource.kind === "context") {
|
|
479
|
+
throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
|
|
480
|
+
}
|
|
481
|
+
await materializeSingleFileLikeResource({
|
|
472
482
|
...state,
|
|
473
483
|
outputSchema: undefined,
|
|
474
484
|
first: false,
|
|
475
|
-
instructions:
|
|
476
|
-
},
|
|
485
|
+
instructions: buildRawResourceInstructions(resource.kind),
|
|
486
|
+
}, resource, intermediateDatasetId);
|
|
477
487
|
return intermediateDatasetId;
|
|
478
488
|
}
|
|
479
489
|
export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
@@ -482,9 +492,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
482
492
|
}
|
|
483
493
|
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
484
494
|
const stateWithSandbox = { ...state, sandboxId };
|
|
485
|
-
const
|
|
486
|
-
for (let index = 0; index < stateWithSandbox.
|
|
487
|
-
|
|
495
|
+
const normalizedResources = [];
|
|
496
|
+
for (let index = 0; index < stateWithSandbox.resources.length; index++) {
|
|
497
|
+
normalizedResources.push(await normalizeResourceToDatasetId(stateWithSandbox, stateWithSandbox.resources[index], targetDatasetId, index));
|
|
488
498
|
}
|
|
489
499
|
const transformSchema = stateWithSandbox.outputSchema ??
|
|
490
500
|
{
|
|
@@ -502,60 +512,51 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
502
512
|
sandboxId,
|
|
503
513
|
title: stateWithSandbox.title ?? targetDatasetId,
|
|
504
514
|
instructions: stateWithSandbox.instructions,
|
|
505
|
-
|
|
506
|
-
? {
|
|
507
|
-
kind: "query",
|
|
508
|
-
query: source.query,
|
|
509
|
-
title: source.title,
|
|
510
|
-
explanation: source.explanation,
|
|
511
|
-
...getDomainDescriptor(source.domain),
|
|
512
|
-
}
|
|
513
|
-
: source),
|
|
514
|
-
sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
|
|
515
|
+
contextId: stateWithSandbox.contextId ?? "",
|
|
515
516
|
schema: transformSchema,
|
|
516
517
|
});
|
|
517
|
-
const prepared = await
|
|
518
|
+
const prepared = await prepareDatasetResourcesStep({
|
|
518
519
|
kind: "transform",
|
|
519
520
|
runtime: stateWithSandbox.runtime,
|
|
520
521
|
datasetId: targetDatasetId,
|
|
521
522
|
sandboxId,
|
|
522
|
-
|
|
523
|
+
inputDatasetIds: normalizedResources,
|
|
523
524
|
outputSchema: transformSchema,
|
|
524
525
|
});
|
|
525
526
|
const context = await initializeDatasetContextStep({
|
|
526
527
|
prepared,
|
|
527
|
-
instructions: buildTransformInstructions(
|
|
528
|
+
instructions: buildTransformInstructions(normalizedResources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
|
|
528
529
|
outputSchema: transformSchema,
|
|
529
530
|
});
|
|
530
531
|
if (context.kind !== "transform") {
|
|
531
532
|
throw new Error("dataset_context_kind_mismatch:transform");
|
|
532
533
|
}
|
|
533
534
|
const transformContext = createTransformDatasetContext({
|
|
534
|
-
|
|
535
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
535
536
|
outputSchema: context.outputSchema,
|
|
536
537
|
instructions: context.instructions,
|
|
537
538
|
datasetId: context.datasetId,
|
|
538
539
|
reactor: stateWithSandbox.reactor,
|
|
539
540
|
sandboxId: context.sandboxId,
|
|
540
541
|
sandboxState: context.sandboxState,
|
|
541
|
-
|
|
542
|
+
inputPreviews: context.inputPreviews,
|
|
542
543
|
});
|
|
543
544
|
await transformContext.transform(stateWithSandbox.runtime, {
|
|
544
545
|
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
545
546
|
prompt: context.prompt,
|
|
546
547
|
initialContent: {
|
|
547
548
|
datasetId: context.datasetId,
|
|
548
|
-
|
|
549
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
549
550
|
outputSchema: context.outputSchema,
|
|
550
551
|
instructions: context.instructions,
|
|
551
552
|
sandboxId: context.sandboxId,
|
|
552
553
|
sandboxState: context.sandboxState,
|
|
553
|
-
|
|
554
|
+
inputPreviews: context.inputPreviews,
|
|
554
555
|
},
|
|
555
556
|
});
|
|
556
557
|
return targetDatasetId;
|
|
557
558
|
}
|
|
558
559
|
registerDatasetAgentMaterializers({
|
|
559
|
-
|
|
560
|
+
materializeSingleFileLikeResource,
|
|
560
561
|
materializeDerivedDataset,
|
|
561
562
|
});
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput,
|
|
2
|
-
export declare function
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalDatasetResource } from "./types.js";
|
|
2
|
+
export declare function materializeQueryResource<Runtime extends AnyDatasetRuntime>(runtime: DatasetBuilderState<Runtime>["runtime"], resource: Extract<InternalDatasetResource, {
|
|
3
3
|
kind: "query";
|
|
4
4
|
}>, params: {
|
|
5
5
|
datasetId: string;
|
|
@@ -8,4 +8,5 @@ export declare function materializeQuerySource<Runtime extends AnyDatasetRuntime
|
|
|
8
8
|
title?: string;
|
|
9
9
|
instructions?: string;
|
|
10
10
|
first?: boolean;
|
|
11
|
+
contextId: string;
|
|
11
12
|
}): Promise<string>;
|
|
@@ -1,35 +1,26 @@
|
|
|
1
1
|
import { materializeRowsToDataset } from "./persistence.js";
|
|
2
|
-
import { getDomainDescriptor, normalizeQueryRows } from "./
|
|
3
|
-
async function
|
|
2
|
+
import { getDomainDescriptor, normalizeQueryRows } from "./rows.js";
|
|
3
|
+
async function readQueryResourceRowsStep(params) {
|
|
4
4
|
"use step";
|
|
5
5
|
const db = await params.runtime.db();
|
|
6
6
|
const result = await db.query(params.query);
|
|
7
7
|
return { rows: normalizeQueryRows(result) };
|
|
8
8
|
}
|
|
9
|
-
export async function
|
|
10
|
-
const { rows } = await
|
|
9
|
+
export async function materializeQueryResource(runtime, resource, params) {
|
|
10
|
+
const { rows } = await readQueryResourceRowsStep({
|
|
11
11
|
runtime,
|
|
12
|
-
query:
|
|
12
|
+
query: resource.query,
|
|
13
13
|
});
|
|
14
|
-
const domainDescriptor = getDomainDescriptor(
|
|
14
|
+
const domainDescriptor = getDomainDescriptor(resource.domain);
|
|
15
15
|
return await materializeRowsToDataset(runtime, {
|
|
16
16
|
datasetId: params.datasetId,
|
|
17
17
|
sandboxId: params.sandboxId,
|
|
18
|
-
title: params.title ??
|
|
18
|
+
title: params.title ?? resource.title,
|
|
19
19
|
instructions: params.instructions,
|
|
20
|
-
|
|
21
|
-
{
|
|
22
|
-
kind: "query",
|
|
23
|
-
query: source.query,
|
|
24
|
-
title: source.title,
|
|
25
|
-
explanation: source.explanation,
|
|
26
|
-
...domainDescriptor,
|
|
27
|
-
},
|
|
28
|
-
],
|
|
29
|
-
sourceKinds: ["query"],
|
|
20
|
+
contextId: params.contextId,
|
|
30
21
|
analysis: {
|
|
31
|
-
query:
|
|
32
|
-
explanation:
|
|
22
|
+
query: resource.query,
|
|
23
|
+
explanation: resource.explanation,
|
|
33
24
|
...domainDescriptor,
|
|
34
25
|
},
|
|
35
26
|
rows,
|
|
@@ -1,19 +1,18 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuildResult,
|
|
2
|
-
export declare function
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuildResult, DatasetTextResourceInput, MaterializeRowsParams } from "./types.js";
|
|
2
|
+
export declare function defaultTextResourceName(resource: DatasetTextResourceInput): string;
|
|
3
3
|
export declare function getDatasetDb<Runtime extends AnyDatasetRuntime>(runtime: Runtime): Promise<any>;
|
|
4
4
|
export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
|
|
5
5
|
datasetId: string;
|
|
6
6
|
sandboxId?: string;
|
|
7
7
|
title?: string;
|
|
8
8
|
instructions?: string;
|
|
9
|
-
|
|
10
|
-
sourceKinds: string[];
|
|
9
|
+
contextId: string;
|
|
11
10
|
analysis?: any;
|
|
12
11
|
schema?: any;
|
|
13
12
|
status?: string;
|
|
14
13
|
}): Promise<void>;
|
|
15
14
|
export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
|
|
16
|
-
export declare function
|
|
15
|
+
export declare function uploadInlineTextResource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, resource: DatasetTextResourceInput): Promise<string>;
|
|
17
16
|
export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
|
|
18
17
|
export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
|
|
19
18
|
datasetId: string;
|
|
@@ -2,18 +2,18 @@ import { DatasetService } from "../service.js";
|
|
|
2
2
|
import { datasetDomain } from "../schema.js";
|
|
3
3
|
import { datasetGetByIdStep, datasetPreviewRowsStep, datasetReadOneStep, datasetReadRowsStep, } from "../dataset/steps.js";
|
|
4
4
|
import { inferDatasetSchema, validateRows } from "./schemaInference.js";
|
|
5
|
-
import { rowsToJsonl } from "./
|
|
6
|
-
export function
|
|
7
|
-
if (
|
|
8
|
-
return
|
|
9
|
-
const mimeType = String(
|
|
5
|
+
import { rowsToJsonl } from "./rows.js";
|
|
6
|
+
export function defaultTextResourceName(resource) {
|
|
7
|
+
if (resource.name?.trim())
|
|
8
|
+
return resource.name.trim();
|
|
9
|
+
const mimeType = String(resource.mimeType ?? "").toLowerCase();
|
|
10
10
|
if (mimeType.includes("csv"))
|
|
11
|
-
return "
|
|
11
|
+
return "resource.csv";
|
|
12
12
|
if (mimeType.includes("json"))
|
|
13
|
-
return "
|
|
13
|
+
return "resource.json";
|
|
14
14
|
if (mimeType.includes("yaml") || mimeType.includes("yml"))
|
|
15
|
-
return "
|
|
16
|
-
return "
|
|
15
|
+
return "resource.yaml";
|
|
16
|
+
return "resource.txt";
|
|
17
17
|
}
|
|
18
18
|
export async function getDatasetDb(runtime) {
|
|
19
19
|
const scoped = await runtime.use(datasetDomain);
|
|
@@ -21,6 +21,9 @@ export async function getDatasetDb(runtime) {
|
|
|
21
21
|
}
|
|
22
22
|
export async function createOrUpdateDatasetMetadata(runtime, params) {
|
|
23
23
|
"use step";
|
|
24
|
+
if (!params.contextId.trim()) {
|
|
25
|
+
throw new Error("dataset_context_required");
|
|
26
|
+
}
|
|
24
27
|
const db = await getDatasetDb(runtime);
|
|
25
28
|
const service = new DatasetService(db);
|
|
26
29
|
const result = await service.createDataset({
|
|
@@ -28,8 +31,7 @@ export async function createOrUpdateDatasetMetadata(runtime, params) {
|
|
|
28
31
|
sandboxId: params.sandboxId,
|
|
29
32
|
title: params.title ?? params.datasetId,
|
|
30
33
|
instructions: params.instructions ?? "",
|
|
31
|
-
|
|
32
|
-
sourceKinds: params.sourceKinds,
|
|
34
|
+
contextId: params.contextId,
|
|
33
35
|
analysis: params.analysis,
|
|
34
36
|
schema: params.schema,
|
|
35
37
|
status: params.status ?? "building",
|
|
@@ -52,8 +54,7 @@ export async function materializeRowsToDataset(runtime, params) {
|
|
|
52
54
|
sandboxId: params.sandboxId,
|
|
53
55
|
title: params.title,
|
|
54
56
|
instructions: params.instructions,
|
|
55
|
-
|
|
56
|
-
sourceKinds: params.sourceKinds,
|
|
57
|
+
contextId: params.contextId,
|
|
57
58
|
analysis: params.analysis,
|
|
58
59
|
schema: resolvedSchema,
|
|
59
60
|
status: "building",
|
|
@@ -78,18 +79,18 @@ export async function materializeRowsToDataset(runtime, params) {
|
|
|
78
79
|
}
|
|
79
80
|
return params.datasetId;
|
|
80
81
|
}
|
|
81
|
-
export async function
|
|
82
|
+
export async function uploadInlineTextResource(runtime, datasetId, resource) {
|
|
82
83
|
"use step";
|
|
83
84
|
const db = await getDatasetDb(runtime);
|
|
84
|
-
const fileName =
|
|
85
|
-
const storagePath = `/dataset/
|
|
86
|
-
const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(
|
|
87
|
-
contentType:
|
|
85
|
+
const fileName = defaultTextResourceName(resource);
|
|
86
|
+
const storagePath = `/dataset/resource/${datasetId}/${Date.now()}-${fileName}`;
|
|
87
|
+
const uploadResult = await db.storage.uploadFile(storagePath, Buffer.from(resource.text, "utf-8"), {
|
|
88
|
+
contentType: resource.mimeType ?? "text/plain",
|
|
88
89
|
contentDisposition: fileName,
|
|
89
90
|
});
|
|
90
91
|
const fileId = uploadResult?.data?.id;
|
|
91
92
|
if (!fileId) {
|
|
92
|
-
throw new Error("
|
|
93
|
+
throw new Error("dataset_text_resource_upload_failed");
|
|
93
94
|
}
|
|
94
95
|
return fileId;
|
|
95
96
|
}
|