@ekairos/dataset 1.22.78-beta.development.0 → 1.22.80-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.d.ts +77 -1
- package/dist/builder/materialize.js +370 -54
- package/dist/builder/persistence.d.ts +6 -0
- package/dist/builder/persistence.js +22 -0
- package/dist/completeDataset.steps.d.ts +87 -0
- package/dist/completeDataset.steps.js +449 -0
- package/dist/completeDataset.tool.d.ts +53 -2
- package/dist/completeDataset.tool.js +4 -262
- package/dist/dataset/steps.d.ts +1 -0
- package/dist/dataset/steps.js +12 -12
- package/dist/dataset.js +16 -4
- package/dist/datasetFiles.d.ts +5 -0
- package/dist/datasetFiles.js +21 -0
- package/dist/executeCommand.tool.js +2 -3
- package/dist/file/file-dataset.agent.d.ts +4 -1
- package/dist/file/file-dataset.agent.js +30 -18
- package/dist/file/file-dataset.steps.js +3 -3
- package/dist/file/file-dataset.types.d.ts +4 -0
- package/dist/file/prompts.js +108 -4
- package/dist/transform/filepreview.js +2 -3
- package/dist/transform/transform-dataset.agent.d.ts +6 -1
- package/dist/transform/transform-dataset.agent.js +30 -15
- package/dist/transform/transform-dataset.steps.js +3 -4
- package/dist/transform/transform-dataset.types.d.ts +6 -0
- package/package.json +4 -4
|
@@ -1,6 +1,82 @@
|
|
|
1
|
-
import type { AnyDatasetRuntime, DatasetBuilderState, InternalSource } from "./types.js";
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, DatasetSchemaInput, InternalSource } from "./types.js";
|
|
2
|
+
import type { SandboxState } from "../file/file-dataset.types.js";
|
|
3
|
+
import type { FilePreviewContext } from "../file/filepreview.types.js";
|
|
4
|
+
import type { TransformSandboxState, TransformSourcePreviewContext } from "../transform/transform-dataset.types.js";
|
|
2
5
|
export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
|
|
6
|
+
/**
 * Prepared context for a dataset built from a single file source
 * (inline text sources are uploaded first and then referenced by `fileId`).
 */
type PreparedFileDatasetContext = {
    /** Discriminant of the prepared-context union. */
    kind: "file";
    /** Dataset being built. */
    datasetId: string;
    /** Sandbox the source file was staged in. */
    sandboxId: string;
    /** File backing the source. */
    fileId: string;
    /** Sandbox state reported by the file-parse initialization step. */
    sandboxState: SandboxState;
    /** Preview generated for the staged file, when available. */
    filePreview?: FilePreviewContext;
    /** Requested output schema; `null` when the caller supplied none. */
    schema?: DatasetSchemaInput | null;
};
|
|
15
|
+
/**
 * Prepared context for a dataset derived from one or more existing datasets.
 */
type PreparedTransformDatasetContext = {
    /** Discriminant of the prepared-context union. */
    kind: "transform";
    /** Dataset being built. */
    datasetId: string;
    /** Sandbox the source datasets were staged in. */
    sandboxId: string;
    /** Datasets used as transform inputs. */
    sourceDatasetIds: string[];
    /** Schema the transformed rows must match. */
    outputSchema: DatasetSchemaInput;
    /** Sandbox state reported by the transform source staging step. */
    sandboxState: TransformSandboxState;
    /** One preview per staged source dataset, when available. */
    sourcePreviews?: {
        datasetId: string;
        preview: TransformSourcePreviewContext;
    }[];
};
|
|
27
|
+
/** Any prepared source context; narrow on `kind` ("file" or "transform"). */
type PreparedDatasetContext =
    | PreparedFileDatasetContext
    | PreparedTransformDatasetContext;
|
|
28
|
+
/**
 * A prepared context plus the prompt (and optional instructions) handed to
 * the dataset agent when it starts.
 */
type DatasetContextInitialization = PreparedDatasetContext & {
    /** Instructions forwarded to the agent, if any. */
    instructions?: string;
    /** Initial prompt for the agent run. */
    prompt: string;
};
|
|
32
|
+
/**
 * Records the dataset metadata with status "building" and echoes back the
 * dataset and sandbox identifiers.
 *
 * @param params.title Falls back to `datasetId` when omitted.
 * @returns The `datasetId` and `sandboxId` that were passed in.
 */
export declare function initializeDatasetStep<Runtime extends AnyDatasetRuntime>(
    params: {
        runtime: Runtime;
        datasetId: string;
        sandboxId: string;
        title?: string;
        instructions?: string;
        sources: any[];
        sourceKinds: string[];
        schema?: DatasetSchemaInput;
    },
): Promise<{
    datasetId: string;
    sandboxId: string;
}>;
|
|
45
|
+
/**
 * Stages the sources of a dataset in its sandbox and generates previews.
 *
 * - `kind: "file"`: a file source (or an inline text source, which is
 *   uploaded first) is staged and previewed.
 * - `kind: "transform"`: the source datasets are staged and previewed.
 *
 * @returns The prepared context matching the requested `kind`.
 */
export declare function prepareDatasetSourcesStep<Runtime extends AnyDatasetRuntime>(
    params:
        | {
            kind: "file";
            runtime: Runtime;
            datasetId: string;
            sandboxId: string;
            source: Extract<InternalSource, { kind: "file" | "text" }>;
            schema?: DatasetSchemaInput;
        }
        | {
            kind: "transform";
            runtime: Runtime;
            datasetId: string;
            sandboxId: string;
            sourceDatasetIds: string[];
            outputSchema: DatasetSchemaInput;
        },
): Promise<PreparedDatasetContext>;
|
|
62
|
+
/**
 * Attaches the agent prompt and instructions to a prepared context.
 *
 * For file contexts, missing `instructions` are replaced by the default file
 * instructions built from `outputSchema`; transform contexts pass
 * `instructions` through unchanged.
 */
export declare function initializeDatasetContextStep(
    params: {
        prepared: PreparedDatasetContext;
        instructions?: string;
        outputSchema?: DatasetSchemaInput;
    },
): Promise<DatasetContextInitialization>;
|
|
67
|
+
/**
 * Loads the finished dataset, inferring and storing a row schema when neither
 * the caller nor the dataset record provides one, and returns it together
 * with up to 20 preview rows.
 *
 * @param params.first When true, the first row is also read and returned as
 *   `firstRow`; otherwise `firstRow` is `undefined`.
 */
export declare function completeDatasetStep<Runtime extends AnyDatasetRuntime>(
    params: {
        runtime: Runtime;
        datasetId: string;
        schema?: DatasetSchemaInput;
        first: boolean;
    },
): Promise<{
    datasetId: string;
    dataset: any;
    previewRows: any[];
    firstRow: any;
}>;
|
|
3
78
|
export declare function materializeSingleFileLikeSource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, source: Extract<InternalSource, {
|
|
4
79
|
kind: "file" | "text";
|
|
5
80
|
}>, targetDatasetId: string): Promise<string>;
|
|
6
81
|
export declare function materializeDerivedDataset<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, targetDatasetId: string): Promise<string>;
|
|
82
|
+
export {};
|
|
@@ -1,12 +1,16 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
|
+
import { readInstantFileStep } from "../file/steps.js";
|
|
3
|
+
import { generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "../file/file-dataset.steps.js";
|
|
2
4
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
3
|
-
import {
|
|
5
|
+
import { ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "../transform/transform-dataset.steps.js";
|
|
6
|
+
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
7
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
4
8
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
5
9
|
import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
6
10
|
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
|
|
7
11
|
import { getDomainDescriptor } from "./sourceRows.js";
|
|
8
12
|
import { materializeQuerySource } from "./materializeQuery.js";
|
|
9
|
-
import {
|
|
13
|
+
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
10
14
|
/**
 * Builds the id of an intermediate dataset derived from a target dataset:
 * `<targetDatasetId>__<sourceKind>_<index>`.
 */
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
    // String.prototype.concat stringifies each argument exactly like a template literal.
    return "".concat(targetDatasetId, "__", sourceKind, "_", index);
}
|
|
@@ -41,6 +45,162 @@ function materializeRawTextRows(source) {
|
|
|
41
45
|
}
|
|
42
46
|
return [{ text }];
|
|
43
47
|
}
|
|
48
|
+
/**
 * Extracts the file name from a Content-Disposition header value.
 *
 * Forms are tried in order of preference:
 *   1. extended `filename*=UTF-8'<lang>'<percent-encoded>` (language tag optional),
 *   2. quoted `filename="..."`,
 *   3. bare `filename=token`.
 * Whitespace around `=` is tolerated in every form.
 *
 * @param value Header value; `null`/`undefined` are treated as empty.
 * @returns The trimmed file name, or "" when none is present.
 */
function parseContentDispositionFileName(value) {
    const header = String(value ?? "");
    const extended = /filename\*\s*=\s*UTF-8'[^']*'([^;]+)/i.exec(header);
    if (extended?.[1]) {
        const encoded = extended[1].trim();
        try {
            return decodeURIComponent(encoded).trim();
        }
        catch {
            // Malformed percent-encoding: keep the raw value rather than fail.
            return encoded;
        }
    }
    const quoted = /filename\s*=\s*"([^"]+)"/i.exec(header);
    if (quoted?.[1]) {
        return quoted[1].trim();
    }
    const bare = /filename\s*=\s*([^;]+)/i.exec(header);
    if (bare?.[1]) {
        // Reached for unquoted, empty-quoted or unbalanced-quote values;
        // never hand stray quote characters back to the caller.
        return bare[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
|
|
67
|
+
/**
 * Reports whether a Content-Disposition header value describes a PDF.
 *
 * True when the value mentions the `application/pdf` media type, or when a
 * `.pdf` extension ends a name: it must be followed by the end of the value,
 * a quote, a semicolon or whitespace. A plain substring test would also accept
 * names such as `notes.pdf.txt`, which are not PDFs.
 *
 * @param value Header value; `null`/`undefined` are treated as empty.
 */
function isPdfContentDisposition(value) {
    const header = String(value ?? "").toLowerCase();
    if (header.includes("application/pdf")) {
        return true;
    }
    return /\.pdf(?=$|["';\s])/.test(header);
}
|
|
71
|
+
/**
 * Produces a file name that is safe to use as a single path segment and
 * always ends in `.pdf`.
 *
 * - Non-whitespace control characters (e.g. NUL) are removed.
 * - Path separators and the characters `: " * ? < > |` become `_`.
 * - Whitespace runs become `_`.
 * - The result is at most 120 characters including the extension; the stem
 *   is shortened, never the extension.
 *
 * @param value Preferred name; may be empty, `null` or `undefined`.
 * @param fallback Name used when `value` sanitizes to nothing.
 * @returns The sanitized name ("file.pdf" if both inputs are empty).
 */
function sanitizePdfFileName(value, fallback) {
    const maxLength = 120;
    const normalize = (raw) => String(raw ?? "")
        .replace(/[\u0000-\u0008\u000e-\u001f\u007f]+/g, "")
        .trim()
        .replace(/[\\/:"*?<>|]+/g, "_")
        .replace(/\s+/g, "_");
    const cleaned = normalize(value) || normalize(fallback) || "file";
    const hasPdfExtension = cleaned.toLowerCase().endsWith(".pdf");
    // Keep the extension exactly as written (e.g. ".PDF") when it is present.
    const extension = hasPdfExtension ? cleaned.slice(-4) : ".pdf";
    const stem = hasPdfExtension ? cleaned.slice(0, -4) : cleaned;
    return `${stem.slice(0, maxLength - extension.length)}${extension}`;
}
|
|
76
|
+
/**
 * Schema descriptor for rows produced by raw PDF text extraction:
 * one object per page with `fileId`, `fileName`, `pageNumber` and `text`.
 * A fresh object is returned on every call.
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    const schema = {
        type: "object",
        additionalProperties: false,
        required: ["fileId", "fileName", "pageNumber", "text"],
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema,
    };
}
|
|
93
|
+
function parseJsonlDataRows(content) {
|
|
94
|
+
return String(content ?? "")
|
|
95
|
+
.split(/\r?\n/g)
|
|
96
|
+
.map((line) => line.trim())
|
|
97
|
+
.filter(Boolean)
|
|
98
|
+
.map((line) => JSON.parse(line))
|
|
99
|
+
.map((record) => record?.data)
|
|
100
|
+
.filter((row) => row && typeof row === "object" && !Array.isArray(row));
|
|
101
|
+
}
|
|
102
|
+
/**
 * Source of the Python program executed in the sandbox. It is invoked as
 * `python <script> <source pdf> <output jsonl> <file id> <file name>` and
 * writes one `{"type": "row", "data": {...}}` line per page with text, or a
 * single empty row (pageNumber 0) when no page yields text.
 */
function buildPdfTextExtractionScript() {
    const scriptLines = [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "source_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(source_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ];
    return scriptLines.join("\n");
}

/**
 * Materializes a PDF file source directly as per-page text rows, without
 * going through the parsing agent.
 *
 * Steps: read the file, bail out if it is not a PDF, stage it in the
 * sandbox, install `pypdf`, run the extraction script, read the JSONL it
 * produced and store the rows with the PDF text schema.
 *
 * @returns `targetDatasetId` on success, or `null` when the file's
 *   Content-Disposition does not identify a PDF.
 * @throws Error `dataset_pdf_dependency_install_failed:*`,
 *   `dataset_pdf_text_extraction_failed:*` or
 *   `dataset_pdf_text_extraction_empty`; also whatever
 *   `resolveDatasetSandboxId` throws.
 */
async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
    const { runtime } = state;
    const { fileId } = source;

    const file = await readInstantFileStep({ runtime, fileId });
    if (!isPdfContentDisposition(file.contentDisposition)) {
        return null;
    }

    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const headerFileName = parseContentDispositionFileName(file.contentDisposition);
    const fileName = sanitizePdfFileName(headerFileName, `${fileId}.pdf`);
    const sourcePath = `${getDatasetSourcesDir(targetDatasetId)}/${fileName}`;
    const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
    const sandbox = { runtime, sandboxId };

    // Stage the PDF inside the dataset's standard directory layout.
    await runDatasetSandboxCommandStep({
        ...sandbox,
        cmd: "mkdir",
        args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
    });
    await writeDatasetSandboxFilesStep({
        ...sandbox,
        files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
    });

    const install = await runDatasetSandboxCommandStep({
        ...sandbox,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }

    await writeDatasetSandboxTextFilesStep({
        ...sandbox,
        files: [{ path: scriptPath, content: buildPdfTextExtractionScript() }],
    });
    const extraction = await runDatasetSandboxCommandStep({
        ...sandbox,
        cmd: "python",
        args: [scriptPath, sourcePath, outputPath, fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }

    const output = await readDatasetSandboxTextFileStep({ ...sandbox, path: outputPath });
    const rows = parseJsonlDataRows(output.content);
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }

    await materializeRowsToDataset(runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        sources: [{ kind: "file", fileId, description: source.description }],
        sourceKinds: ["file"],
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
|
|
44
204
|
async function materializeRawTextSource(state, source, targetDatasetId) {
|
|
45
205
|
const rows = materializeRawTextRows(source);
|
|
46
206
|
await materializeRowsToDataset(state.runtime, {
|
|
@@ -63,24 +223,11 @@ async function materializeRawTextSource(state, source, targetDatasetId) {
|
|
|
63
223
|
});
|
|
64
224
|
return targetDatasetId;
|
|
65
225
|
}
|
|
66
|
-
|
|
226
|
+
function resolveDatasetSandboxId(state, _targetDatasetId) {
|
|
67
227
|
const sandboxId = String(state.sandboxId ?? "").trim();
|
|
68
228
|
if (sandboxId)
|
|
69
229
|
return sandboxId;
|
|
70
|
-
|
|
71
|
-
runtime: state.runtime,
|
|
72
|
-
provider: "vercel",
|
|
73
|
-
sandboxRuntime: "python3.13",
|
|
74
|
-
timeoutMs: 20 * 60 * 1000,
|
|
75
|
-
resources: { vcpus: 2 },
|
|
76
|
-
purpose: "dataset.materialize",
|
|
77
|
-
params: { datasetId: targetDatasetId },
|
|
78
|
-
vercel: {
|
|
79
|
-
profile: "ephemeral",
|
|
80
|
-
deleteOnStop: true,
|
|
81
|
-
},
|
|
82
|
-
});
|
|
83
|
-
return created.sandboxId;
|
|
230
|
+
throw new Error("dataset_sandbox_required");
|
|
84
231
|
}
|
|
85
232
|
export async function resolveDatasetAgentDurable(requestedDurable) {
|
|
86
233
|
if (!requestedDurable)
|
|
@@ -96,15 +243,150 @@ export async function resolveDatasetAgentDurable(requestedDurable) {
|
|
|
96
243
|
}
|
|
97
244
|
return true;
|
|
98
245
|
}
|
|
246
|
+
export async function initializeDatasetStep(params) {
|
|
247
|
+
"use step";
|
|
248
|
+
await createOrUpdateDatasetMetadata(params.runtime, {
|
|
249
|
+
datasetId: params.datasetId,
|
|
250
|
+
sandboxId: params.sandboxId,
|
|
251
|
+
title: params.title ?? params.datasetId,
|
|
252
|
+
instructions: params.instructions,
|
|
253
|
+
sources: params.sources,
|
|
254
|
+
sourceKinds: params.sourceKinds,
|
|
255
|
+
schema: params.schema,
|
|
256
|
+
status: "building",
|
|
257
|
+
});
|
|
258
|
+
return {
|
|
259
|
+
datasetId: params.datasetId,
|
|
260
|
+
sandboxId: params.sandboxId,
|
|
261
|
+
};
|
|
262
|
+
}
|
|
263
|
+
/**
 * Stages a dataset's sources in the sandbox and generates their previews.
 *
 * `kind: "file"` handles a file source (an inline text source is uploaded
 * first to obtain a file id); any other kind is treated as a transform over
 * `sourceDatasetIds`.
 *
 * @returns The prepared context for the requested kind, including the
 *   sandbox state reported by the staging step and the generated preview(s).
 */
export async function prepareDatasetSourcesStep(params) {
    "use step";
    const { runtime, datasetId, sandboxId } = params;

    if (params.kind !== "file") {
        const { sourceDatasetIds, outputSchema } = params;
        const staged = await ensureTransformSourcesInSandboxStep({
            runtime,
            sandboxId,
            datasetId,
            sourceDatasetIds,
            state: { initialized: false, sourcePaths: [] },
        });
        const sourcePreviews = await generateTransformSourcePreviewsStep({
            runtime,
            sandboxId,
            datasetId,
            sourcePaths: staged.sourcePaths,
        });
        return {
            kind: "transform",
            datasetId,
            sandboxId,
            sourceDatasetIds,
            outputSchema,
            sandboxState: staged.state,
            sourcePreviews,
        };
    }

    const { source } = params;
    let fileId;
    if (source.kind === "file") {
        fileId = source.fileId;
    }
    else {
        // Inline text has no file yet: upload it and use the resulting id.
        fileId = await uploadInlineTextSource(runtime, datasetId, source);
    }
    const staged = await initializeFileParseSandboxStep({
        runtime,
        sandboxId,
        datasetId,
        fileId,
        state: { initialized: false, filePath: "" },
    });
    const filePreview = await generateFileParsePreviewStep({
        runtime,
        sandboxId,
        sandboxFilePath: staged.filePath,
        datasetId,
    });
    return {
        kind: "file",
        datasetId,
        sandboxId,
        fileId,
        sandboxState: staged.state,
        filePreview,
        schema: params.schema ?? null,
    };
}
|
|
315
|
+
export async function initializeDatasetContextStep(params) {
|
|
316
|
+
"use step";
|
|
317
|
+
if (params.prepared.kind === "file") {
|
|
318
|
+
return {
|
|
319
|
+
...params.prepared,
|
|
320
|
+
instructions: params.instructions ?? buildFileDefaultInstructions(params.outputSchema),
|
|
321
|
+
prompt: "generate a dataset for this file",
|
|
322
|
+
};
|
|
323
|
+
}
|
|
324
|
+
return {
|
|
325
|
+
...params.prepared,
|
|
326
|
+
instructions: params.instructions,
|
|
327
|
+
prompt: params.prepared.sourceDatasetIds.length === 1
|
|
328
|
+
? "Transform the source dataset into a new dataset matching the provided output schema"
|
|
329
|
+
: `Transform ${params.prepared.sourceDatasetIds.length} source datasets into a new dataset matching the provided output schema`,
|
|
330
|
+
};
|
|
331
|
+
}
|
|
332
|
+
export async function completeDatasetStep(params) {
|
|
333
|
+
"use step";
|
|
334
|
+
let datasetResult = await datasetGetByIdStep({
|
|
335
|
+
runtime: params.runtime,
|
|
336
|
+
datasetId: params.datasetId,
|
|
337
|
+
});
|
|
338
|
+
if (!datasetResult.ok)
|
|
339
|
+
throw new Error(datasetResult.error);
|
|
340
|
+
if (!params.schema && !datasetResult.data?.schema) {
|
|
341
|
+
await datasetInferAndUpdateSchemaStep({
|
|
342
|
+
runtime: params.runtime,
|
|
343
|
+
datasetId: params.datasetId,
|
|
344
|
+
title: `${params.datasetId}Row`,
|
|
345
|
+
description: "One dataset row",
|
|
346
|
+
});
|
|
347
|
+
datasetResult = await datasetGetByIdStep({
|
|
348
|
+
runtime: params.runtime,
|
|
349
|
+
datasetId: params.datasetId,
|
|
350
|
+
});
|
|
351
|
+
if (!datasetResult.ok)
|
|
352
|
+
throw new Error(datasetResult.error);
|
|
353
|
+
}
|
|
354
|
+
const previewResult = await datasetPreviewRowsStep({
|
|
355
|
+
runtime: params.runtime,
|
|
356
|
+
datasetId: params.datasetId,
|
|
357
|
+
limit: 20,
|
|
358
|
+
});
|
|
359
|
+
if (!params.first) {
|
|
360
|
+
return {
|
|
361
|
+
datasetId: params.datasetId,
|
|
362
|
+
dataset: datasetResult.data,
|
|
363
|
+
previewRows: previewResult.rows,
|
|
364
|
+
firstRow: undefined,
|
|
365
|
+
};
|
|
366
|
+
}
|
|
367
|
+
const firstResult = await datasetReadOneStep({
|
|
368
|
+
runtime: params.runtime,
|
|
369
|
+
datasetId: params.datasetId,
|
|
370
|
+
});
|
|
371
|
+
return {
|
|
372
|
+
datasetId: params.datasetId,
|
|
373
|
+
dataset: datasetResult.data,
|
|
374
|
+
previewRows: previewResult.rows,
|
|
375
|
+
firstRow: firstResult.row,
|
|
376
|
+
};
|
|
377
|
+
}
|
|
99
378
|
export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
|
|
379
|
+
if (source.kind === "file" && !state.outputSchema) {
|
|
380
|
+
const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
|
|
381
|
+
if (materializedPdf)
|
|
382
|
+
return materializedPdf;
|
|
383
|
+
}
|
|
384
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
100
385
|
if (!state.reactor) {
|
|
101
386
|
throw new Error("dataset_reactor_required");
|
|
102
387
|
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
? source.fileId
|
|
106
|
-
: await uploadInlineTextSource(state.runtime, targetDatasetId, source);
|
|
107
|
-
await createOrUpdateDatasetMetadata(state.runtime, {
|
|
388
|
+
await initializeDatasetStep({
|
|
389
|
+
runtime: state.runtime,
|
|
108
390
|
datasetId: targetDatasetId,
|
|
109
391
|
sandboxId,
|
|
110
392
|
title: state.title ?? targetDatasetId,
|
|
@@ -121,28 +403,45 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
121
403
|
],
|
|
122
404
|
sourceKinds: [source.kind],
|
|
123
405
|
schema: state.outputSchema,
|
|
124
|
-
status: "building",
|
|
125
406
|
});
|
|
126
|
-
const
|
|
407
|
+
const prepared = await prepareDatasetSourcesStep({
|
|
408
|
+
kind: "file",
|
|
409
|
+
runtime: state.runtime,
|
|
127
410
|
datasetId: targetDatasetId,
|
|
128
|
-
instructions: state.instructions ?? buildFileDefaultInstructions(state.outputSchema),
|
|
129
|
-
reactor: state.reactor,
|
|
130
411
|
sandboxId,
|
|
412
|
+
source,
|
|
413
|
+
schema: state.outputSchema,
|
|
414
|
+
});
|
|
415
|
+
const context = await initializeDatasetContextStep({
|
|
416
|
+
prepared,
|
|
417
|
+
instructions: state.instructions,
|
|
418
|
+
outputSchema: state.outputSchema,
|
|
419
|
+
});
|
|
420
|
+
if (context.kind !== "file") {
|
|
421
|
+
throw new Error("dataset_context_kind_mismatch:file");
|
|
422
|
+
}
|
|
423
|
+
const parseContext = createFileParseContext(context.fileId, {
|
|
424
|
+
datasetId: context.datasetId,
|
|
425
|
+
instructions: context.instructions,
|
|
426
|
+
reactor: state.reactor,
|
|
427
|
+
sandboxId: context.sandboxId,
|
|
428
|
+
sandboxState: context.sandboxState,
|
|
429
|
+
filePreview: context.filePreview,
|
|
430
|
+
schema: context.schema,
|
|
131
431
|
});
|
|
132
432
|
await parseContext.parse(state.runtime, {
|
|
133
433
|
durable: await resolveDatasetAgentDurable(state.durable),
|
|
434
|
+
prompt: context.prompt,
|
|
435
|
+
initialContent: {
|
|
436
|
+
datasetId: context.datasetId,
|
|
437
|
+
fileId: context.fileId,
|
|
438
|
+
instructions: context.instructions ?? "",
|
|
439
|
+
sandboxId: context.sandboxId,
|
|
440
|
+
sandboxState: context.sandboxState,
|
|
441
|
+
filePreview: context.filePreview,
|
|
442
|
+
schema: context.schema,
|
|
443
|
+
},
|
|
134
444
|
});
|
|
135
|
-
if (!state.outputSchema) {
|
|
136
|
-
await datasetInferAndUpdateSchemaStep({
|
|
137
|
-
runtime: state.runtime,
|
|
138
|
-
datasetId: targetDatasetId,
|
|
139
|
-
title: `${targetDatasetId}Row`,
|
|
140
|
-
description: "One dataset row",
|
|
141
|
-
});
|
|
142
|
-
}
|
|
143
|
-
if (state.first) {
|
|
144
|
-
await datasetReadOneStep({ runtime: state.runtime, datasetId: targetDatasetId });
|
|
145
|
-
}
|
|
146
445
|
return targetDatasetId;
|
|
147
446
|
}
|
|
148
447
|
async function normalizeSourceToDatasetId(state, source, targetDatasetId, sourceIndex) {
|
|
@@ -181,7 +480,7 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
181
480
|
if (!state.reactor) {
|
|
182
481
|
throw new Error("dataset_reactor_required");
|
|
183
482
|
}
|
|
184
|
-
const sandboxId =
|
|
483
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
185
484
|
const stateWithSandbox = { ...state, sandboxId };
|
|
186
485
|
const normalizedSources = [];
|
|
187
486
|
for (let index = 0; index < stateWithSandbox.sources.length; index++) {
|
|
@@ -197,7 +496,8 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
197
496
|
properties: {},
|
|
198
497
|
},
|
|
199
498
|
};
|
|
200
|
-
await
|
|
499
|
+
await initializeDatasetStep({
|
|
500
|
+
runtime: stateWithSandbox.runtime,
|
|
201
501
|
datasetId: targetDatasetId,
|
|
202
502
|
sandboxId,
|
|
203
503
|
title: stateWithSandbox.title ?? targetDatasetId,
|
|
@@ -213,30 +513,46 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
213
513
|
: source),
|
|
214
514
|
sourceKinds: stateWithSandbox.sources.map((source) => source.kind),
|
|
215
515
|
schema: transformSchema,
|
|
216
|
-
status: "building",
|
|
217
516
|
});
|
|
218
|
-
const
|
|
517
|
+
const prepared = await prepareDatasetSourcesStep({
|
|
518
|
+
kind: "transform",
|
|
519
|
+
runtime: stateWithSandbox.runtime,
|
|
520
|
+
datasetId: targetDatasetId,
|
|
521
|
+
sandboxId,
|
|
219
522
|
sourceDatasetIds: normalizedSources,
|
|
220
523
|
outputSchema: transformSchema,
|
|
524
|
+
});
|
|
525
|
+
const context = await initializeDatasetContextStep({
|
|
526
|
+
prepared,
|
|
221
527
|
instructions: buildTransformInstructions(normalizedSources.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
|
|
222
|
-
|
|
528
|
+
outputSchema: transformSchema,
|
|
529
|
+
});
|
|
530
|
+
if (context.kind !== "transform") {
|
|
531
|
+
throw new Error("dataset_context_kind_mismatch:transform");
|
|
532
|
+
}
|
|
533
|
+
const transformContext = createTransformDatasetContext({
|
|
534
|
+
sourceDatasetIds: context.sourceDatasetIds,
|
|
535
|
+
outputSchema: context.outputSchema,
|
|
536
|
+
instructions: context.instructions,
|
|
537
|
+
datasetId: context.datasetId,
|
|
223
538
|
reactor: stateWithSandbox.reactor,
|
|
224
|
-
sandboxId,
|
|
539
|
+
sandboxId: context.sandboxId,
|
|
540
|
+
sandboxState: context.sandboxState,
|
|
541
|
+
sourcePreviews: context.sourcePreviews,
|
|
225
542
|
});
|
|
226
543
|
await transformContext.transform(stateWithSandbox.runtime, {
|
|
227
544
|
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
545
|
+
prompt: context.prompt,
|
|
546
|
+
initialContent: {
|
|
547
|
+
datasetId: context.datasetId,
|
|
548
|
+
sourceDatasetIds: context.sourceDatasetIds,
|
|
549
|
+
outputSchema: context.outputSchema,
|
|
550
|
+
instructions: context.instructions,
|
|
551
|
+
sandboxId: context.sandboxId,
|
|
552
|
+
sandboxState: context.sandboxState,
|
|
553
|
+
sourcePreviews: context.sourcePreviews,
|
|
554
|
+
},
|
|
228
555
|
});
|
|
229
|
-
if (!stateWithSandbox.outputSchema) {
|
|
230
|
-
await datasetInferAndUpdateSchemaStep({
|
|
231
|
-
runtime: stateWithSandbox.runtime,
|
|
232
|
-
datasetId: targetDatasetId,
|
|
233
|
-
title: `${targetDatasetId}Row`,
|
|
234
|
-
description: "One dataset row",
|
|
235
|
-
});
|
|
236
|
-
}
|
|
237
|
-
if (stateWithSandbox.first) {
|
|
238
|
-
await datasetReadOneStep({ runtime: stateWithSandbox.runtime, datasetId: targetDatasetId });
|
|
239
|
-
}
|
|
240
556
|
return targetDatasetId;
|
|
241
557
|
}
|
|
242
558
|
registerDatasetAgentMaterializers({
|
|
@@ -15,3 +15,9 @@ export declare function createOrUpdateDatasetMetadata<Runtime extends AnyDataset
|
|
|
15
15
|
export declare function materializeRowsToDataset<Runtime extends AnyDatasetRuntime>(runtime: Runtime, params: MaterializeRowsParams): Promise<string>;
|
|
16
16
|
export declare function uploadInlineTextSource<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, source: DatasetTextSourceInput): Promise<string>;
|
|
17
17
|
export declare function finalizeBuildResult<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, withFirst: boolean): Promise<DatasetBuildResult>;
|
|
18
|
+
/**
 * Assembles a build result from already-loaded dataset data and attaches a
 * `reader` that pages through the dataset's rows on demand.
 *
 * @param params.firstRow Included in the result only when it is not
 *   `undefined` (an explicit `null` is kept).
 */
export declare function createDatasetBuildResult<Runtime extends AnyDatasetRuntime>(
    runtime: Runtime,
    params: {
        datasetId: string;
        dataset: any;
        previewRows: any[];
        firstRow?: any | null;
    },
): DatasetBuildResult;
|
|
@@ -128,3 +128,25 @@ export async function finalizeBuildResult(runtime, datasetId, withFirst) {
|
|
|
128
128
|
firstRow: firstResult.row,
|
|
129
129
|
};
|
|
130
130
|
}
|
|
131
|
+
export function createDatasetBuildResult(runtime, params) {
|
|
132
|
+
const reader = {
|
|
133
|
+
async read(cursorOrParams, limit) {
|
|
134
|
+
const readParams = typeof cursorOrParams === "object" && cursorOrParams !== null
|
|
135
|
+
? cursorOrParams
|
|
136
|
+
: { cursor: cursorOrParams, limit };
|
|
137
|
+
return await datasetReadRowsStep({
|
|
138
|
+
runtime,
|
|
139
|
+
datasetId: params.datasetId,
|
|
140
|
+
cursor: readParams.cursor,
|
|
141
|
+
limit: readParams.limit,
|
|
142
|
+
});
|
|
143
|
+
},
|
|
144
|
+
};
|
|
145
|
+
return {
|
|
146
|
+
datasetId: params.datasetId,
|
|
147
|
+
dataset: params.dataset,
|
|
148
|
+
previewRows: params.previewRows,
|
|
149
|
+
reader,
|
|
150
|
+
...(params.firstRow !== undefined ? { firstRow: params.firstRow } : {}),
|
|
151
|
+
};
|
|
152
|
+
}
|