@ekairos/dataset 1.22.79-beta.development.0 → 1.22.80-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.d.ts +77 -1
- package/dist/builder/materialize.js +212 -60
- package/dist/builder/persistence.d.ts +6 -0
- package/dist/builder/persistence.js +22 -0
- package/dist/completeDataset.steps.d.ts +87 -0
- package/dist/completeDataset.steps.js +449 -0
- package/dist/completeDataset.tool.d.ts +53 -2
- package/dist/completeDataset.tool.js +4 -262
- package/dist/dataset/steps.d.ts +1 -0
- package/dist/dataset/steps.js +12 -12
- package/dist/dataset.js +16 -4
- package/dist/datasetFiles.d.ts +5 -0
- package/dist/datasetFiles.js +21 -0
- package/dist/executeCommand.tool.js +2 -3
- package/dist/file/file-dataset.agent.d.ts +4 -1
- package/dist/file/file-dataset.agent.js +30 -18
- package/dist/file/file-dataset.steps.js +3 -3
- package/dist/file/file-dataset.types.d.ts +4 -0
- package/dist/file/prompts.js +108 -4
- package/dist/transform/filepreview.js +2 -3
- package/dist/transform/transform-dataset.agent.d.ts +6 -1
- package/dist/transform/transform-dataset.agent.js +30 -15
- package/dist/transform/transform-dataset.steps.js +3 -4
- package/dist/transform/transform-dataset.types.d.ts +6 -0
- package/package.json +4 -4
|
@@ -1,6 +1,82 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuilderState, InternalSource } from "./types.js";
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalSource } from "./types.js";
|
|
2
|
+
import type { SandboxState } from "../file/file-dataset.types.js";
|
|
3
|
+
import type { FilePreviewContext } from "../file/filepreview.types.js";
|
|
4
|
+
import type { TransformSandboxState, TransformSourcePreviewContext } from "../transform/transform-dataset.types.js";
|
|
2
5
|
export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
|
|
6
|
+
type PreparedFileDatasetContext = {
|
|
7
|
+
kind: "file";
|
|
8
|
+
datasetId: string;
|
|
9
|
+
sandboxId: string;
|
|
10
|
+
fileId: string;
|
|
11
|
+
sandboxState: SandboxState;
|
|
12
|
+
filePreview?: FilePreviewContext;
|
|
13
|
+
schema?: DatasetSchemaInput | null;
|
|
14
|
+
};
|
|
15
|
+
type PreparedTransformDatasetContext = {
|
|
16
|
+
kind: "transform";
|
|
17
|
+
datasetId: string;
|
|
18
|
+
sandboxId: string;
|
|
19
|
+
sourceDatasetIds: string[];
|
|
20
|
+
outputSchema: DatasetSchemaInput;
|
|
21
|
+
sandboxState: TransformSandboxState;
|
|
22
|
+
sourcePreviews?: Array<{
|
|
23
|
+
datasetId: string;
|
|
24
|
+
preview: TransformSourcePreviewContext;
|
|
25
|
+
}>;
|
|
26
|
+
};
|
|
27
|
+
type PreparedDatasetContext = PreparedFileDatasetContext | PreparedTransformDatasetContext;
|
|
28
|
+
type DatasetContextInitialization = PreparedDatasetContext & {
|
|
29
|
+
prompt: string;
|
|
30
|
+
instructions?: string;
|
|
31
|
+
};
|
|
32
|
+
export declare function initializeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
|
|
33
|
+
runtime: Runtime;
|
|
34
|
+
datasetId: string;
|
|
35
|
+
sandboxId: string;
|
|
36
|
+
title?: string;
|
|
37
|
+
instructions?: string;
|
|
38
|
+
sources: any[];
|
|
39
|
+
sourceKinds: string[];
|
|
40
|
+
schema?: DatasetSchemaInput;
|
|
41
|
+
}): Promise<{
|
|
42
|
+
datasetId: string;
|
|
43
|
+
sandboxId: string;
|
|
44
|
+
}>;
|
|
45
|
+
export declare function prepareDatasetSourcesStep<Runtime extends AnyDatasetRuntime>(params: {
|
|
46
|
+
kind: "file";
|
|
47
|
+
runtime: Runtime;
|
|
48
|
+
datasetId: string;
|
|
49
|
+
sandboxId: string;
|
|
50
|
+
source: Extract<InternalSource, {
|
|
51
|
+
kind: "file" | "text";
|
|
52
|
+
}>;
|
|
53
|
+
schema?: DatasetSchemaInput;
|
|
54
|
+
} | {
|
|
55
|
+
kind: "transform";
|
|
56
|
+
runtime: Runtime;
|
|
57
|
+
datasetId: string;
|
|
58
|
+
sandboxId: string;
|
|
59
|
+
sourceDatasetIds: string[];
|
|
60
|
+
outputSchema: DatasetSchemaInput;
|
|
61
|
+
}): Promise<PreparedDatasetContext>;
|
|
62
|
+
export declare function initializeDatasetContextStep(params: {
|
|
63
|
+
prepared: PreparedDatasetContext;
|
|
64
|
+
instructions?: string;
|
|
65
|
+
outputSchema?: DatasetSchemaInput;
|
|
66
|
+
}): Promise<DatasetContextInitialization>;
|
|
67
|
+
export declare function completeDatasetStep<Runtime extends AnyDatasetRuntime>(params: {
|
|
68
|
+
runtime: Runtime;
|
|
69
|
+
datasetId: string;
|
|
70
|
+
schema?: DatasetSchemaInput;
|
|
71
|
+
first: boolean;
|
|
72
|
+
}): Promise<{
|
|
73
|
+
datasetId: string;
|
|
74
|
+
dataset: any;
|
|
75
|
+
previewRows: any[];
|
|
76
|
+
firstRow: any;
|
|
77
|
+
}>;
|
|
3
78
|
export declare function materializeSingleFileLikeSource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, source: Extract<InternalSource, {
|
|
4
79
|
kind: "file" | "text";
|
|
5
80
|
}>, targetDatasetId: string): Promise<string>;
|
|
6
81
|
export declare function materializeDerivedDataset<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, targetDatasetId: string): Promise<string>;
|
|
82
|
+
export {};
|
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
2
|
import { readInstantFileStep } from "../file/steps.js";
|
|
3
|
+
import { generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "../file/file-dataset.steps.js";
|
|
3
4
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
5
|
+
import { ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "../transform/transform-dataset.steps.js";
|
|
6
|
+
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
7
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
6
8
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
7
9
|
import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
8
10
|
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
|
|
9
11
|
import { getDomainDescriptor } from "./sourceRows.js";
|
|
10
12
|
import { materializeQuerySource } from "./materializeQuery.js";
|
|
11
|
-
import {
|
|
13
|
+
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
12
14
|
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
|
|
13
15
|
return `${targetDatasetId}__${sourceKind}_${index}`;
|
|
14
16
|
}
|
|
@@ -101,17 +103,16 @@ async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
|
|
|
101
103
|
const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
|
|
102
104
|
if (!isPdfContentDisposition(file.contentDisposition))
|
|
103
105
|
return null;
|
|
104
|
-
const sandboxId =
|
|
105
|
-
const workstation = getDatasetWorkstation(targetDatasetId);
|
|
106
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
106
107
|
const outputPath = getDatasetOutputPath(targetDatasetId);
|
|
107
108
|
const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
|
|
108
|
-
const sourcePath = `${
|
|
109
|
-
const scriptPath = `${
|
|
109
|
+
const sourcePath = `${getDatasetSourcesDir(targetDatasetId)}/${fileName}`;
|
|
110
|
+
const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
|
|
110
111
|
await runDatasetSandboxCommandStep({
|
|
111
112
|
runtime: state.runtime,
|
|
112
113
|
sandboxId,
|
|
113
114
|
cmd: "mkdir",
|
|
114
|
-
args: ["-p",
|
|
115
|
+
args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
|
|
115
116
|
});
|
|
116
117
|
await writeDatasetSandboxFilesStep({
|
|
117
118
|
runtime: state.runtime,
|
|
@@ -222,24 +223,11 @@ async function materializeRawTextSource(state, source, targetDatasetId) {
|
|
|
222
223
|
});
|
|
223
224
|
return targetDatasetId;
|
|
224
225
|
}
|
|
225
|
-
|
|
226
|
+
function resolveDatasetSandboxId(state, _targetDatasetId) {
|
|
226
227
|
const sandboxId = String(state.sandboxId ?? "").trim();
|
|
227
228
|
if (sandboxId)
|
|
228
229
|
return sandboxId;
|
|
229
|
-
|
|
230
|
-
runtime: state.runtime,
|
|
231
|
-
provider: "vercel",
|
|
232
|
-
sandboxRuntime: "python3.13",
|
|
233
|
-
timeoutMs: 20 * 60 * 1000,
|
|
234
|
-
resources: { vcpus: 2 },
|
|
235
|
-
purpose: "dataset.materialize",
|
|
236
|
-
params: { datasetId: targetDatasetId },
|
|
237
|
-
vercel: {
|
|
238
|
-
profile: "ephemeral",
|
|
239
|
-
deleteOnStop: true,
|
|
240
|
-
},
|
|
241
|
-
});
|
|
242
|
-
return created.sandboxId;
|
|
230
|
+
throw new Error("dataset_sandbox_required");
|
|
243
231
|
}
|
|
244
232
|
export async function resolveDatasetAgentDurable(requestedDurable) {
|
|
245
233
|
if (!requestedDurable)
|
|
@@ -255,20 +243,150 @@ export async function resolveDatasetAgentDurable(requestedDurable) {
|
|
|
255
243
|
}
|
|
256
244
|
return true;
|
|
257
245
|
}
|
|
246
|
+
export async function initializeDatasetStep(params) {
|
|
247
|
+
"use step";
|
|
248
|
+
await createOrUpdateDatasetMetadata(params.runtime, {
|
|
249
|
+
datasetId: params.datasetId,
|
|
250
|
+
sandboxId: params.sandboxId,
|
|
251
|
+
title: params.title ?? params.datasetId,
|
|
252
|
+
instructions: params.instructions,
|
|
253
|
+
sources: params.sources,
|
|
254
|
+
sourceKinds: params.sourceKinds,
|
|
255
|
+
schema: params.schema,
|
|
256
|
+
status: "building",
|
|
257
|
+
});
|
|
258
|
+
return {
|
|
259
|
+
datasetId: params.datasetId,
|
|
260
|
+
sandboxId: params.sandboxId,
|
|
261
|
+
};
|
|
262
|
+
}
|
|
263
|
+
export async function prepareDatasetSourcesStep(params) {
|
|
264
|
+
"use step";
|
|
265
|
+
if (params.kind === "file") {
|
|
266
|
+
const fileId = params.source.kind === "file"
|
|
267
|
+
? params.source.fileId
|
|
268
|
+
: await uploadInlineTextSource(params.runtime, params.datasetId, params.source);
|
|
269
|
+
const initialized = await initializeFileParseSandboxStep({
|
|
270
|
+
runtime: params.runtime,
|
|
271
|
+
sandboxId: params.sandboxId,
|
|
272
|
+
datasetId: params.datasetId,
|
|
273
|
+
fileId,
|
|
274
|
+
state: { initialized: false, filePath: "" },
|
|
275
|
+
});
|
|
276
|
+
const filePreview = await generateFileParsePreviewStep({
|
|
277
|
+
runtime: params.runtime,
|
|
278
|
+
sandboxId: params.sandboxId,
|
|
279
|
+
sandboxFilePath: initialized.filePath,
|
|
280
|
+
datasetId: params.datasetId,
|
|
281
|
+
});
|
|
282
|
+
return {
|
|
283
|
+
kind: "file",
|
|
284
|
+
datasetId: params.datasetId,
|
|
285
|
+
sandboxId: params.sandboxId,
|
|
286
|
+
fileId,
|
|
287
|
+
sandboxState: initialized.state,
|
|
288
|
+
filePreview,
|
|
289
|
+
schema: params.schema ?? null,
|
|
290
|
+
};
|
|
291
|
+
}
|
|
292
|
+
const initialized = await ensureTransformSourcesInSandboxStep({
|
|
293
|
+
runtime: params.runtime,
|
|
294
|
+
sandboxId: params.sandboxId,
|
|
295
|
+
datasetId: params.datasetId,
|
|
296
|
+
sourceDatasetIds: params.sourceDatasetIds,
|
|
297
|
+
state: { initialized: false, sourcePaths: [] },
|
|
298
|
+
});
|
|
299
|
+
const sourcePreviews = await generateTransformSourcePreviewsStep({
|
|
300
|
+
runtime: params.runtime,
|
|
301
|
+
sandboxId: params.sandboxId,
|
|
302
|
+
datasetId: params.datasetId,
|
|
303
|
+
sourcePaths: initialized.sourcePaths,
|
|
304
|
+
});
|
|
305
|
+
return {
|
|
306
|
+
kind: "transform",
|
|
307
|
+
datasetId: params.datasetId,
|
|
308
|
+
sandboxId: params.sandboxId,
|
|
309
|
+
sourceDatasetIds: params.sourceDatasetIds,
|
|
310
|
+
outputSchema: params.outputSchema,
|
|
311
|
+
sandboxState: initialized.state,
|
|
312
|
+
sourcePreviews,
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
export async function initializeDatasetContextStep(params) {
|
|
316
|
+
"use step";
|
|
317
|
+
if (params.prepared.kind === "file") {
|
|
318
|
+
return {
|
|
319
|
+
...params.prepared,
|
|
320
|
+
instructions: params.instructions ?? buildFileDefaultInstructions(params.outputSchema),
|
|
321
|
+
prompt: "generate a dataset for this file",
|
|
322
|
+
};
|
|
323
|
+
}
|
|
324
|
+
return {
|
|
325
|
+
...params.prepared,
|
|
326
|
+
instructions: params.instructions,
|
|
327
|
+
prompt: params.prepared.sourceDatasetIds.length === 1
|
|
328
|
+
? "Transform the source dataset into a new dataset matching the provided output schema"
|
|
329
|
+
: `Transform ${params.prepared.sourceDatasetIds.length} source datasets into a new dataset matching the provided output schema`,
|
|
330
|
+
};
|
|
331
|
+
}
|
|
332
|
+
export async function completeDatasetStep(params) {
|
|
333
|
+
"use step";
|
|
334
|
+
let datasetResult = await datasetGetByIdStep({
|
|
335
|
+
runtime: params.runtime,
|
|
336
|
+
datasetId: params.datasetId,
|
|
337
|
+
});
|
|
338
|
+
if (!datasetResult.ok)
|
|
339
|
+
throw new Error(datasetResult.error);
|
|
340
|
+
if (!params.schema && !datasetResult.data?.schema) {
|
|
341
|
+
await datasetInferAndUpdateSchemaStep({
|
|
342
|
+
runtime: params.runtime,
|
|
343
|
+
datasetId: params.datasetId,
|
|
344
|
+
title: `${params.datasetId}Row`,
|
|
345
|
+
description: "One dataset row",
|
|
346
|
+
});
|
|
347
|
+
datasetResult = await datasetGetByIdStep({
|
|
348
|
+
runtime: params.runtime,
|
|
349
|
+
datasetId: params.datasetId,
|
|
350
|
+
});
|
|
351
|
+
if (!datasetResult.ok)
|
|
352
|
+
throw new Error(datasetResult.error);
|
|
353
|
+
}
|
|
354
|
+
const previewResult = await datasetPreviewRowsStep({
|
|
355
|
+
runtime: params.runtime,
|
|
356
|
+
datasetId: params.datasetId,
|
|
357
|
+
limit: 20,
|
|
358
|
+
});
|
|
359
|
+
if (!params.first) {
|
|
360
|
+
return {
|
|
361
|
+
datasetId: params.datasetId,
|
|
362
|
+
dataset: datasetResult.data,
|
|
363
|
+
previewRows: previewResult.rows,
|
|
364
|
+
firstRow: undefined,
|
|
365
|
+
};
|
|
366
|
+
}
|
|
367
|
+
const firstResult = await datasetReadOneStep({
|
|
368
|
+
runtime: params.runtime,
|
|
369
|
+
datasetId: params.datasetId,
|
|
370
|
+
});
|
|
371
|
+
return {
|
|
372
|
+
datasetId: params.datasetId,
|
|
373
|
+
dataset: datasetResult.data,
|
|
374
|
+
previewRows: previewResult.rows,
|
|
375
|
+
firstRow: firstResult.row,
|
|
376
|
+
};
|
|
377
|
+
}
|
|
258
378
|
export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
|
|
259
379
|
if (source.kind === "file" && !state.outputSchema) {
|
|
260
380
|
const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
|
|
261
381
|
if (materializedPdf)
|
|
262
382
|
return materializedPdf;
|
|
263
383
|
}
|
|
384
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
264
385
|
if (!state.reactor) {
|
|
265
386
|
throw new Error("dataset_reactor_required");
|
|
266
387
|
}
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
? source.fileId
|
|
270
|
-
: await uploadInlineTextSource(state.runtime, targetDatasetId, source);
|
|
271
|
-
await createOrUpdateDatasetMetadata(state.runtime, {
|
|
388
|
+
await initializeDatasetStep({
|
|
389
|
+
runtime: state.runtime,
|
|
272
390
|
datasetId: targetDatasetId,
|
|
273
391
|
sandboxId,
|
|
274
392
|
title: state.title ?? targetDatasetId,
|
|
@@ -285,28 +403,45 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
285
403
|
],
|
|
286
404
|
sourceKinds: [source.kind],
|
|
287
405
|
schema: state.outputSchema,
|
|
288
|
-
status: "building",
|
|
289
406
|
});
|
|
290
|
-
const
|
|
407
|
+
const prepared = await prepareDatasetSourcesStep({
|
|
408
|
+
kind: "file",
|
|
409
|
+
runtime: state.runtime,
|
|
291
410
|
datasetId: targetDatasetId,
|
|
292
|
-
instructions: state.instructions ?? buildFileDefaultInstructions(state.outputSchema),
|
|
293
|
-
reactor: state.reactor,
|
|
294
411
|
sandboxId,
|
|
412
|
+
source,
|
|
413
|
+
schema: state.outputSchema,
|
|
414
|
+
});
|
|
415
|
+
const context = await initializeDatasetContextStep({
|
|
416
|
+
prepared,
|
|
417
|
+
instructions: state.instructions,
|
|
418
|
+
outputSchema: state.outputSchema,
|
|
419
|
+
});
|
|
420
|
+
if (context.kind !== "file") {
|
|
421
|
+
throw new Error("dataset_context_kind_mismatch:file");
|
|
422
|
+
}
|
|
423
|
+
const parseContext = createFileParseContext(context.fileId, {
|
|
424
|
+
datasetId: context.datasetId,
|
|
425
|
+
instructions: context.instructions,
|
|
426
|
+
reactor: state.reactor,
|
|
427
|
+
sandboxId: context.sandboxId,
|
|
428
|
+
sandboxState: context.sandboxState,
|
|
429
|
+
filePreview: context.filePreview,
|
|
430
|
+
schema: context.schema,
|
|
295
431
|
});
|
|
296
432
|
await parseContext.parse(state.runtime, {
|
|
297
433
|
durable: await resolveDatasetAgentDurable(state.durable),
|
|
434
|
+
prompt: context.prompt,
|
|
435
|
+
initialContent: {
|
|
436
|
+
datasetId: context.datasetId,
|
|
437
|
+
fileId: context.fileId,
|
|
438
|
+
instructions: context.instructions ?? "",
|
|
439
|
+
sandboxId: context.sandboxId,
|
|
440
|
+
sandboxState: context.sandboxState,
|
|
441
|
+
filePreview: context.filePreview,
|
|
442
|
+
schema: context.schema,
|
|
443
|
+
},
|
|
298
444
|
});
|
|
299
|
-
if (!state.outputSchema) {
|
|
300
|
-
await datasetInferAndUpdateSchemaStep({
|
|
301
|
-
runtime: state.runtime,
|
|
302
|
-
datasetId: targetDatasetId,
|
|
303
|
-
title: `${targetDatasetId}Row`,
|
|
304
|
-
description: "One dataset row",
|
|
305
|
-
});
|
|
306
|
-
}
|
|
307
|
-
if (state.first) {
|
|
308
|
-
await datasetReadOneStep({ runtime: state.runtime, datasetId: targetDatasetId });
|
|
309
|
-
}
|
|
310
445
|
return targetDatasetId;
|
|
311
446
|
}
|
|
312
447
|
async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
|
|
@@ -345,7 +480,7 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
345
480
|
if (!state.reactor) {
|
|
346
481
|
throw new Error("dataset_reactor_required");
|
|
347
482
|
}
|
|
348
|
-
const sandboxId =
|
|
483
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
349
484
|
const stateWithSandbox = { ...state, sandboxId };
|
|
350
485
|
const normalizedSources = [];
|
|
351
486
|
for (let index = 0; index < stateWithSandbox.sources.length; index++) {
|
|
@@ -361,7 +496,8 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
361
496
|
properties: {},
|
|
362
497
|
},
|
|
363
498
|
};
|
|
364
|
-
await
|
|
499
|
+
await initializeDatasetStep({
|
|
500
|
+
runtime: stateWithSandbox.runtime,
|
|
365
501
|
datasetId: targetDatasetId,
|
|
366
502
|
sandboxId,
|
|
367
503
|
title: stateWithSandbox.title ?? targetDatasetId,
|
|
@@ -377,30 +513,46 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
377
513
|
: source),
|
|
378
514
|
sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
|
|
379
515
|
schema: transformSchema,
|
|
380
|
-
status: "building",
|
|
381
516
|
});
|
|
382
|
-
const
|
|
517
|
+
const prepared = await prepareDatasetSourcesStep({
|
|
518
|
+
kind: "transform",
|
|
519
|
+
runtime: stateWithSandbox.runtime,
|
|
520
|
+
datasetId: targetDatasetId,
|
|
521
|
+
sandboxId,
|
|
383
522
|
sourceDatasetIds: normalizedSources,
|
|
384
523
|
outputSchema: transformSchema,
|
|
524
|
+
});
|
|
525
|
+
const context = await initializeDatasetContextStep({
|
|
526
|
+
prepared,
|
|
385
527
|
instructions: buildTransformInstructions(normalizedSources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
|
|
386
|
-
|
|
528
|
+
outputSchema: transformSchema,
|
|
529
|
+
});
|
|
530
|
+
if (context.kind !== "transform") {
|
|
531
|
+
throw new Error("dataset_context_kind_mismatch:transform");
|
|
532
|
+
}
|
|
533
|
+
const transformContext = createTransformDatasetContext({
|
|
534
|
+
sourceDatasetIds: context.sourceDatasetIds,
|
|
535
|
+
outputSchema: context.outputSchema,
|
|
536
|
+
instructions: context.instructions,
|
|
537
|
+
datasetId: context.datasetId,
|
|
387
538
|
reactor: stateWithSandbox.reactor,
|
|
388
|
-
sandboxId,
|
|
539
|
+
sandboxId: context.sandboxId,
|
|
540
|
+
sandboxState: context.sandboxState,
|
|
541
|
+
sourcePreviews: context.sourcePreviews,
|
|
389
542
|
});
|
|
390
543
|
await transformContext.transform(stateWithSandbox.runtime, {
|
|
391
544
|
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
545
|
+
prompt: context.prompt,
|
|
546
|
+
initialContent: {
|
|
547
|
+
datasetId: context.datasetId,
|
|
548
|
+
sourceDatasetIds: context.sourceDatasetIds,
|
|
549
|
+
outputSchema: context.outputSchema,
|
|
550
|
+
instructions: context.instructions,
|
|
551
|
+
sandboxId: context.sandboxId,
|
|
552
|
+
sandboxState: context.sandboxState,
|
|
553
|
+
sourcePreviews: context.sourcePreviews,
|
|
554
|
+
},
|
|
392
555
|
});
|
|
393
|
-
if (!stateWithSandbox.outputSchema) {
|
|
394
|
-
await datasetInferAndUpdateSchemaStep({
|
|
395
|
-
runtime: stateWithSandbox.runtime,
|
|
396
|
-
datasetId: targetDatasetId,
|
|
397
|
-
title: `${targetDatasetId}Row`,
|
|
398
|
-
description: "One dataset row",
|
|
399
|
-
});
|
|
400
|
-
}
|
|
401
|
-
if (stateWithSandbox.first) {
|
|
402
|
-
await datasetReadOneStep({ runtime: stateWithSandbox.runtime, datasetId: targetDatasetId });
|
|
403
|
-
}
|
|
404
556
|
return targetDatasetId;
|
|
405
557
|
}
|
|
406
558
|
registerDatasetAgentMaterializers({
|
|
@@ -15,3 +15,9 @@ export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDataset
|
|
|
15
15
|
export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
|
|
16
16
|
export declare function uploadInlineTextSource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, source: DatasetTextSourceInput): Promise<string>;
|
|
17
17
|
export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
|
|
18
|
+
export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: {
|
|
19
|
+
datasetId: string;
|
|
20
|
+
dataset: any;
|
|
21
|
+
previewRows: any[];
|
|
22
|
+
firstRow?: any | null;
|
|
23
|
+
}): DatasetBuildResult;
|
|
@@ -128,3 +128,25 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
|
|
|
128
128
|
firstRow: firstResult.row,
|
|
129
129
|
};
|
|
130
130
|
}
|
|
131
|
+
export function createDatasetBuildResult(runtime, params) {
|
|
132
|
+
const reader = {
|
|
133
|
+
async read(cursorOrParams, limit) {
|
|
134
|
+
const readParams = typeof cursorOrParams === "object" && cursorOrParams !== null
|
|
135
|
+
? cursorOrParams
|
|
136
|
+
: { cursor: cursorOrParams, limit };
|
|
137
|
+
return await datasetReadRowsStep({
|
|
138
|
+
runtime,
|
|
139
|
+
datasetId: params.datasetId,
|
|
140
|
+
cursor: readParams.cursor,
|
|
141
|
+
limit: readParams.limit,
|
|
142
|
+
});
|
|
143
|
+
},
|
|
144
|
+
};
|
|
145
|
+
return {
|
|
146
|
+
datasetId: params.datasetId,
|
|
147
|
+
dataset: params.dataset,
|
|
148
|
+
previewRows: params.previewRows,
|
|
149
|
+
reader,
|
|
150
|
+
...(params.firstRow !== undefined ? { firstRow: params.firstRow } : {}),
|
|
151
|
+
};
|
|
152
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
export interface PersistDatasetStepParams {
|
|
2
|
+
datasetId: string;
|
|
3
|
+
sandboxId: string;
|
|
4
|
+
runtime: any;
|
|
5
|
+
summary?: string;
|
|
6
|
+
}
|
|
7
|
+
export declare function persistDatasetStep({ runtime, datasetId, sandboxId, summary }: PersistDatasetStepParams): Promise<{
|
|
8
|
+
success: boolean;
|
|
9
|
+
validation?: RowValidationEntry[];
|
|
10
|
+
validationTruncated?: number;
|
|
11
|
+
failureSummary?: ValidationFailureSummary;
|
|
12
|
+
repairInstructions?: string[];
|
|
13
|
+
validRowCount?: number;
|
|
14
|
+
rowRecordCount?: number;
|
|
15
|
+
error?: string;
|
|
16
|
+
status?: string;
|
|
17
|
+
message?: string;
|
|
18
|
+
} | {
|
|
19
|
+
success: boolean;
|
|
20
|
+
status: string;
|
|
21
|
+
validRows: number;
|
|
22
|
+
rowRecordCount: number;
|
|
23
|
+
validation: RowValidationEntry[] | undefined;
|
|
24
|
+
error: string;
|
|
25
|
+
message: string;
|
|
26
|
+
fileId?: undefined;
|
|
27
|
+
storagePath?: undefined;
|
|
28
|
+
} | {
|
|
29
|
+
success: boolean;
|
|
30
|
+
status: string;
|
|
31
|
+
validRows: number;
|
|
32
|
+
rowRecordCount: number;
|
|
33
|
+
fileId: string;
|
|
34
|
+
storagePath: string;
|
|
35
|
+
message: string;
|
|
36
|
+
validation?: undefined;
|
|
37
|
+
error?: undefined;
|
|
38
|
+
}>;
|
|
39
|
+
type RowValidationEntry = {
|
|
40
|
+
index: number;
|
|
41
|
+
valid: boolean;
|
|
42
|
+
errors?: string[];
|
|
43
|
+
errorDetails?: Array<{
|
|
44
|
+
path: string;
|
|
45
|
+
keyword: string;
|
|
46
|
+
message: string;
|
|
47
|
+
params?: Record<string, unknown>;
|
|
48
|
+
schemaPath?: string;
|
|
49
|
+
}>;
|
|
50
|
+
dataKeys?: string[];
|
|
51
|
+
};
|
|
52
|
+
type ValidationFailureSummary = {
|
|
53
|
+
rowRecordCount: number;
|
|
54
|
+
validRowCount: number;
|
|
55
|
+
invalidRowCount: number;
|
|
56
|
+
expectedTopLevelKeys: string[];
|
|
57
|
+
requiredTopLevelKeys: string[];
|
|
58
|
+
requiredPaths: string[];
|
|
59
|
+
enumConstraints: Array<{
|
|
60
|
+
path: string;
|
|
61
|
+
values: unknown[];
|
|
62
|
+
}>;
|
|
63
|
+
topErrors: Array<{
|
|
64
|
+
message: string;
|
|
65
|
+
count: number;
|
|
66
|
+
}>;
|
|
67
|
+
missingRequiredProperties: Array<{
|
|
68
|
+
property: string;
|
|
69
|
+
count: number;
|
|
70
|
+
}>;
|
|
71
|
+
additionalProperties: Array<{
|
|
72
|
+
property: string;
|
|
73
|
+
count: number;
|
|
74
|
+
}>;
|
|
75
|
+
enumFailures: Array<{
|
|
76
|
+
path: string;
|
|
77
|
+
allowedValues: unknown[];
|
|
78
|
+
count: number;
|
|
79
|
+
}>;
|
|
80
|
+
observedTopLevelKeys: string[];
|
|
81
|
+
sampleInvalidRows: Array<{
|
|
82
|
+
index: number;
|
|
83
|
+
dataKeys?: string[];
|
|
84
|
+
errors?: string[];
|
|
85
|
+
}>;
|
|
86
|
+
};
|
|
87
|
+
export {};
|