@ekairos/dataset 1.22.40-beta.development.0 → 1.22.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents.d.ts +8 -0
- package/dist/agents.js +8 -0
- package/dist/builder/agentMaterializers.d.ts +9 -0
- package/dist/builder/agentMaterializers.js +10 -0
- package/dist/builder/context.d.ts +15 -0
- package/dist/builder/context.js +251 -0
- package/dist/builder/instructions.d.ts +4 -5
- package/dist/builder/instructions.js +15 -21
- package/dist/builder/materialize.d.ts +77 -10
- package/dist/builder/materialize.js +495 -152
- package/dist/builder/materializeQuery.d.ts +12 -0
- package/dist/builder/materializeQuery.js +31 -0
- package/dist/builder/persistence.d.ts +10 -6
- package/dist/builder/persistence.js +107 -62
- package/dist/builder/{sourceRows.d.ts → rows.d.ts} +0 -1
- package/dist/builder/{sourceRows.js → rows.js} +3 -9
- package/dist/builder/schemaInference.d.ts +1 -2
- package/dist/builder/schemaInference.js +4 -12
- package/dist/builder/types.d.ts +41 -26
- package/dist/builder/types.js +1 -3
- package/dist/clearDataset.tool.d.ts +2 -3
- package/dist/clearDataset.tool.js +13 -17
- package/dist/completeDataset.steps.d.ts +117 -0
- package/dist/completeDataset.steps.js +537 -0
- package/dist/completeDataset.tool.d.ts +132 -7
- package/dist/completeDataset.tool.js +46 -192
- package/dist/contextResources.d.ts +31 -0
- package/dist/contextResources.js +151 -0
- package/dist/contextWorkspace.d.ts +79 -0
- package/dist/contextWorkspace.js +234 -0
- package/dist/dataset/steps.d.ts +39 -15
- package/dist/dataset/steps.js +96 -39
- package/dist/dataset.d.ts +2 -3
- package/dist/dataset.js +73 -51
- package/dist/datasetFiles.d.ts +5 -1
- package/dist/datasetFiles.js +29 -27
- package/dist/defineNotation.tool.d.ts +49 -0
- package/dist/defineNotation.tool.js +154 -0
- package/dist/domain.d.ts +1 -2
- package/dist/domain.js +1 -6
- package/dist/executeCommand.tool.d.ts +2 -30
- package/dist/executeCommand.tool.js +165 -39
- package/dist/file/file-dataset.agent.d.ts +19 -56
- package/dist/file/file-dataset.agent.js +182 -136
- package/dist/file/file-dataset.steps.d.ts +27 -0
- package/dist/file/file-dataset.steps.js +47 -0
- package/dist/file/file-dataset.types.d.ts +64 -0
- package/dist/file/file-dataset.types.js +1 -0
- package/dist/file/filepreview.d.ts +5 -35
- package/dist/file/filepreview.js +60 -107
- package/dist/file/filepreview.types.d.ts +31 -0
- package/dist/file/filepreview.types.js +1 -0
- package/dist/file/generateSchema.tool.d.ts +2 -3
- package/dist/file/generateSchema.tool.js +11 -15
- package/dist/file/index.d.ts +1 -2
- package/dist/file/index.js +1 -18
- package/dist/file/prompts.d.ts +2 -3
- package/dist/file/prompts.js +152 -32
- package/dist/file/scripts.generated.d.ts +1 -0
- package/dist/file/scripts.generated.js +11 -0
- package/dist/file/steps.d.ts +1 -2
- package/dist/file/steps.js +9 -7
- package/dist/id.d.ts +1 -0
- package/dist/id.js +10 -0
- package/dist/index.d.ts +9 -7
- package/dist/index.js +9 -23
- package/dist/materializeDataset.tool.d.ts +35 -28
- package/dist/materializeDataset.tool.js +74 -68
- package/dist/notation.d.ts +205 -0
- package/dist/notation.js +424 -0
- package/dist/query/index.d.ts +1 -2
- package/dist/query/index.js +1 -18
- package/dist/query/queryDomain.d.ts +3 -4
- package/dist/query/queryDomain.js +3 -40
- package/dist/query/queryDomain.step.d.ts +1 -1
- package/dist/query/queryDomain.step.js +24 -13
- package/dist/sandbox/steps.d.ts +23 -15
- package/dist/sandbox/steps.js +73 -76
- package/dist/sandbox.steps.d.ts +1 -2
- package/dist/sandbox.steps.js +1 -18
- package/dist/schema.d.ts +14 -3
- package/dist/schema.js +27 -26
- package/dist/service.d.ts +12 -5
- package/dist/service.js +88 -15
- package/dist/skill.d.ts +0 -1
- package/dist/skill.js +12 -17
- package/dist/transform/filepreview.d.ts +2 -3
- package/dist/transform/filepreview.js +9 -26
- package/dist/transform/index.d.ts +2 -3
- package/dist/transform/index.js +2 -8
- package/dist/transform/prompts.d.ts +1 -34
- package/dist/transform/prompts.js +66 -46
- package/dist/transform/transform-dataset.agent.d.ts +21 -46
- package/dist/transform/transform-dataset.agent.js +152 -93
- package/dist/transform/transform-dataset.steps.d.ts +30 -0
- package/dist/transform/transform-dataset.steps.js +61 -0
- package/dist/transform/transform-dataset.types.d.ts +96 -0
- package/dist/transform/transform-dataset.types.js +1 -0
- package/dist/transform/transformDataset.d.ts +3 -3
- package/dist/transform/transformDataset.js +15 -18
- package/dist/writeDatasetRows.tool.d.ts +188 -0
- package/dist/writeDatasetRows.tool.js +258 -0
- package/package.json +33 -8
- package/dist/builder/instructions.d.ts.map +0 -1
- package/dist/builder/instructions.js.map +0 -1
- package/dist/builder/materialize.d.ts.map +0 -1
- package/dist/builder/materialize.js.map +0 -1
- package/dist/builder/persistence.d.ts.map +0 -1
- package/dist/builder/persistence.js.map +0 -1
- package/dist/builder/schemaInference.d.ts.map +0 -1
- package/dist/builder/schemaInference.js.map +0 -1
- package/dist/builder/sourceRows.d.ts.map +0 -1
- package/dist/builder/sourceRows.js.map +0 -1
- package/dist/builder/types.d.ts.map +0 -1
- package/dist/builder/types.js.map +0 -1
- package/dist/clearDataset.tool.d.ts.map +0 -1
- package/dist/clearDataset.tool.js.map +0 -1
- package/dist/completeDataset.tool.d.ts.map +0 -1
- package/dist/completeDataset.tool.js.map +0 -1
- package/dist/dataset/steps.d.ts.map +0 -1
- package/dist/dataset/steps.js.map +0 -1
- package/dist/dataset.d.ts.map +0 -1
- package/dist/dataset.js.map +0 -1
- package/dist/datasetFiles.d.ts.map +0 -1
- package/dist/datasetFiles.js.map +0 -1
- package/dist/domain.d.ts.map +0 -1
- package/dist/domain.js.map +0 -1
- package/dist/eventsReactRuntime.d.ts +0 -22
- package/dist/eventsReactRuntime.d.ts.map +0 -1
- package/dist/eventsReactRuntime.js +0 -29
- package/dist/eventsReactRuntime.js.map +0 -1
- package/dist/executeCommand.tool.d.ts.map +0 -1
- package/dist/executeCommand.tool.js.map +0 -1
- package/dist/file/file-dataset.agent.d.ts.map +0 -1
- package/dist/file/file-dataset.agent.js.map +0 -1
- package/dist/file/filepreview.d.ts.map +0 -1
- package/dist/file/filepreview.js.map +0 -1
- package/dist/file/generateSchema.tool.d.ts.map +0 -1
- package/dist/file/generateSchema.tool.js.map +0 -1
- package/dist/file/index.d.ts.map +0 -1
- package/dist/file/index.js.map +0 -1
- package/dist/file/prompts.d.ts.map +0 -1
- package/dist/file/prompts.js.map +0 -1
- package/dist/file/steps.d.ts.map +0 -1
- package/dist/file/steps.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/materializeDataset.tool.d.ts.map +0 -1
- package/dist/materializeDataset.tool.js.map +0 -1
- package/dist/query/index.d.ts.map +0 -1
- package/dist/query/index.js.map +0 -1
- package/dist/query/queryDomain.d.ts.map +0 -1
- package/dist/query/queryDomain.js.map +0 -1
- package/dist/query/queryDomain.step.d.ts.map +0 -1
- package/dist/query/queryDomain.step.js.map +0 -1
- package/dist/sandbox/steps.d.ts.map +0 -1
- package/dist/sandbox/steps.js.map +0 -1
- package/dist/sandbox.steps.d.ts.map +0 -1
- package/dist/sandbox.steps.js.map +0 -1
- package/dist/schema.d.ts.map +0 -1
- package/dist/schema.js.map +0 -1
- package/dist/service.d.ts.map +0 -1
- package/dist/service.js.map +0 -1
- package/dist/skill.d.ts.map +0 -1
- package/dist/skill.js.map +0 -1
- package/dist/transform/filepreview.d.ts.map +0 -1
- package/dist/transform/filepreview.js.map +0 -1
- package/dist/transform/index.d.ts.map +0 -1
- package/dist/transform/index.js.map +0 -1
- package/dist/transform/prompts.d.ts.map +0 -1
- package/dist/transform/prompts.js.map +0 -1
- package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
- package/dist/transform/transform-dataset.agent.js.map +0 -1
- package/dist/transform/transformDataset.d.ts.map +0 -1
- package/dist/transform/transformDataset.js.map +0 -1
|
@@ -1,146 +1,485 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
1
|
+
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
|
+
import { readInstantFileStep } from "../file/steps.js";
|
|
3
|
+
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
4
|
+
import { datasetGetByIdStep, datasetInferAndUpdateSchemaStep, datasetPreviewRowsStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
5
|
+
import { getDatasetOutputPath, getDatasetScriptsDir, getDatasetResourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
6
|
+
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
7
|
+
import { buildFileDefaultInstructions, buildRawResourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
8
|
+
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextResource, } from "./persistence.js";
|
|
9
|
+
import { materializeQueryResource } from "./materializeQuery.js";
|
|
10
|
+
import { readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
11
|
+
/**
 * Builds the id used for an intermediate dataset derived from one resource.
 *
 * @param {string} targetDatasetId - Id of the dataset being built.
 * @param {string} resourceKind - Kind of the resource (used verbatim in the id).
 * @param {number} index - Position of the resource among its siblings.
 * @returns {string} `<targetDatasetId>__<resourceKind>_<index>`.
 */
function makeIntermediateDatasetId(targetDatasetId, resourceKind, index) {
    const resourceSuffix = `${resourceKind}_${index}`;
    return `${targetDatasetId}__${resourceSuffix}`;
}
|
|
14
|
+
/**
 * Coerces an arbitrary parsed value into an array of row-like values.
 *
 * Arrays are mapped element by element; any other value becomes a
 * single-element array. Truthy values whose `typeof` is "object" are kept
 * as-is, everything else (primitives, null) is wrapped as `{ value }`.
 *
 * @param {unknown} value - Parsed input (for example the result of JSON.parse).
 * @returns {Array<object>} One entry per input element.
 */
function normalizeParsedTextRows(value) {
    const toRow = (candidate) => {
        const isObjectLike = Boolean(candidate) && typeof candidate === "object";
        return isObjectLike ? candidate : { value: candidate };
    };
    if (!Array.isArray(value)) {
        return [toRow(value)];
    }
    return value.map((entry) => toRow(entry));
}
|
|
22
|
+
/**
 * Turns an inline text resource into dataset rows.
 *
 * JSON content is parsed and normalized through `normalizeParsedTextRows`;
 * anything else (or JSON that fails to parse) becomes a single `{ text }` row.
 *
 * Line-delimited JSON is recognised by a `.jsonl` / `.ndjson` name or by a
 * MIME type mentioning `jsonl` / `ndjson` (e.g. `application/x-ndjson`,
 * `application/jsonl`, `application/x-jsonlines`). Previously only a `.jsonl`
 * name selected line-by-line parsing, so NDJSON identified any other way was
 * parsed as one document, failed, and silently degraded to a single text row.
 *
 * @param {{ text?: unknown, mimeType?: unknown, name?: unknown }} resource
 * @returns {Array<object>} Parsed rows, or `[{ text }]` as a fallback.
 */
function materializeRawTextRows(resource) {
    const text = String(resource.text ?? "");
    const mimeType = String(resource.mimeType ?? "").toLowerCase();
    const name = String(resource.name ?? "").toLowerCase();
    const isJsonLines = name.endsWith(".jsonl") ||
        name.endsWith(".ndjson") ||
        /jsonl|ndjson/.test(mimeType);
    const shouldParseJson = isJsonLines || mimeType.includes("json") || name.endsWith(".json");
    if (!shouldParseJson) {
        return [{ text }];
    }
    try {
        if (isJsonLines) {
            const parsedLines = text
                .split(/\r?\n/g)
                .map((line) => line.trim())
                .filter(Boolean)
                .map((line) => JSON.parse(line));
            return parsedLines.flatMap((row) => normalizeParsedTextRows(row));
        }
        return normalizeParsedTextRows(JSON.parse(text));
    }
    catch {
        // Deliberate best effort: unparseable JSON is kept as raw text rather than failing the build.
        return [{ text }];
    }
}
|
|
45
|
+
/**
 * Extracts the file name from a Content-Disposition header value.
 *
 * Precedence: the extended `filename*=UTF-8'<lang>'<pct-encoded>` form, then a
 * quoted `filename="..."`, then a bare `filename=...` token.
 *
 * Fixes over the previous version:
 * - the extended form may carry a language tag between the two apostrophes
 *   (`UTF-8'en'name`), which used to make the whole header yield "";
 * - optional whitespace around `=` is tolerated;
 * - stray double quotes around a bare value (e.g. `filename=""`) are stripped
 *   instead of being returned as part of the name.
 *
 * @param {unknown} value - Raw header value (nullish is treated as "").
 * @returns {string} The trimmed file name, or "" when none is present.
 */
function parseContentDispositionFileName(value) {
    const text = String(value ?? "");
    const extendedMatch = /filename\*\s*=\s*UTF-8'[^']*'([^;]+)/i.exec(text);
    if (extendedMatch?.[1]) {
        try {
            return decodeURIComponent(extendedMatch[1]).trim();
        }
        catch {
            // Malformed percent-encoding: fall back to the raw token rather than throwing.
            return extendedMatch[1].trim();
        }
    }
    const quotedMatch = /filename\s*=\s*"([^"]+)"/i.exec(text);
    if (quotedMatch?.[1]) {
        return quotedMatch[1].trim();
    }
    const plainMatch = /filename\s*=\s*([^;]+)/i.exec(text);
    if (plainMatch?.[1]) {
        return plainMatch[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
|
|
64
|
+
/**
 * Heuristically decides whether a Content-Disposition value describes a PDF.
 *
 * Returns true when the value mentions `application/pdf`, or when it contains
 * a `.pdf` extension that actually terminates a token (followed by end of
 * string, a quote, a semicolon or whitespace). The previous plain substring
 * test also matched names such as `report.pdf.zip`, which sent non-PDF files
 * down the PDF text-extraction path.
 *
 * @param {unknown} value - Raw header value (nullish is treated as "").
 * @returns {boolean}
 */
function isPdfContentDisposition(value) {
    const text = String(value ?? "").toLowerCase();
    if (text.includes("application/pdf")) {
        return true;
    }
    return /\.pdf(?=$|["';\s])/.test(text);
}
|
|
68
|
+
/**
 * Produces a file name that is safe to append to a sandbox directory path.
 *
 * Runs of `\ / : " * ? < > |` and runs of whitespace each collapse to a single
 * underscore, and the result is limited to 120 characters.
 *
 * Fixes over the previous version:
 * - names consisting only of dots (`.`, `..`) are rejected in favour of the
 *   fallback, because callers join the result onto a directory path;
 * - when truncation is needed, a short trailing extension is preserved so the
 *   file type is not lost.
 *
 * @param {unknown} value - Candidate name (nullish/blank selects the fallback).
 * @param {string} fallback - Name used when `value` yields nothing usable.
 * @returns {string} Sanitized name, or `fallback`.
 */
function sanitizeResourceFileName(value, fallback) {
    const MAX_LENGTH = 120;
    const name = String(value ?? "").trim() || fallback;
    const cleaned = name.replace(/[\\/:"*?<>|]+/g, "_").replace(/\s+/g, "_");
    let bounded = cleaned;
    if (cleaned.length > MAX_LENGTH) {
        // Keep a short alphanumeric extension (".csv", ".json", ...) when cutting the name down.
        const extension = /\.[A-Za-z0-9]{1,16}$/.exec(cleaned)?.[0] ?? "";
        bounded = cleaned.slice(0, MAX_LENGTH - extension.length) + extension;
    }
    if (!bounded || /^\.+$/.test(bounded)) {
        return fallback;
    }
    return bounded;
}
|
|
73
|
+
/**
 * Sanitizes a file name via `sanitizeResourceFileName` and guarantees that it
 * ends with a `.pdf` extension (checked case-insensitively).
 *
 * @param {unknown} value - Candidate file name.
 * @param {string} fallback - Name used when `value` yields nothing usable.
 * @returns {string} Sanitized name ending in `.pdf`.
 */
function sanitizePdfFileName(value, fallback) {
    const baseName = sanitizeResourceFileName(value, fallback);
    const alreadyPdf = baseName.toLowerCase().endsWith(".pdf");
    if (alreadyPdf) {
        return baseName;
    }
    return `${baseName}.pdf`;
}
|
|
77
|
+
/**
 * Returns the fixed schema descriptor for rows produced by PDF text extraction:
 * one object per page with `fileId`, `fileName`, `pageNumber` and `text`, all
 * required and with no additional properties allowed.
 *
 * @returns {{ title: string, description: string, schema: object }}
 */
function pdfTextRowsSchema() {
    const properties = {
        fileId: { type: "string" },
        fileName: { type: "string" },
        pageNumber: { type: "number" },
        text: { type: "string" },
    };
    const schema = {
        type: "object",
        additionalProperties: false,
        // Every declared property is required, in declaration order.
        required: Object.keys(properties),
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema,
    };
}
|
|
94
|
+
/**
 * Parses JSON Lines content and collects the `data` payload of each record.
 *
 * Blank lines are skipped. A record contributes a row only when its `data`
 * is a non-array object. A line that is not valid JSON makes `JSON.parse`
 * throw, which propagates to the caller.
 *
 * @param {unknown} content - JSONL text (nullish is treated as "").
 * @returns {Array<object>} The `data` objects, in file order.
 */
function parseJsonlDataRows(content) {
    const rows = [];
    for (const rawLine of String(content ?? "").split(/\r?\n/g)) {
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        const data = JSON.parse(line)?.data;
        const isPlainRow = Boolean(data) && typeof data === "object" && !Array.isArray(data);
        if (isPlainRow) {
            rows.push(data);
        }
    }
    return rows;
}
|
|
103
|
+
/**
 * Attempts a direct (agent-free) materialization of a PDF file resource.
 *
 * Reads the stored file, and when its Content-Disposition does not look like a
 * PDF returns `null` so the caller can fall back to another path. Otherwise it
 * copies the PDF into the dataset sandbox, installs `pypdf`, runs a generated
 * Python script that writes one JSONL row per page with non-empty text, and
 * persists those rows with the fixed PDF page schema.
 *
 * Fix: the embedded Python script's nested `with` / `for` / `if` bodies were
 * all indented by a single space, which is not valid Python; the script now
 * uses consistent 4-space nesting.
 *
 * @param {object} state - Build state; reads `runtime`, `sandboxId`, `title`,
 *   `instructions`, `contextId` and `first`.
 * @param {{ fileId: string }} resource - File resource to materialize.
 * @param {string} targetDatasetId - Dataset receiving the extracted rows.
 * @returns {Promise<string|null>} `targetDatasetId`, or `null` when not a PDF.
 * @throws {Error} `dataset_sandbox_required`, `dataset_pdf_dependency_install_failed:*`,
 *   `dataset_pdf_text_extraction_failed:*` or `dataset_pdf_text_extraction_empty`.
 */
async function tryMaterializeRawPdfFileResource(state, resource, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: resource.fileId });
    if (!isPdfContentDisposition(file.contentDisposition)) {
        return null;
    }
    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${resource.fileId}.pdf`);
    const resourcePath = `${getDatasetResourcesDir(targetDatasetId)}/${fileName}`;
    const scriptPath = `${getDatasetScriptsDir(targetDatasetId)}/extract_pdf_text.py`;
    // argv: [1] PDF path, [2] JSONL output path, [3] file id, [4] file name.
    // Pages without text are skipped; if no page has text a single placeholder
    // row (pageNumber 0, empty text) is written so the output is never empty.
    const extractionScript = [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "resource_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(resource_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
    await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", ...getDatasetStandardDirs(targetDatasetId)],
    });
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
    });
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [
            {
                path: scriptPath,
                content: extractionScript,
            },
        ],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: [scriptPath, resourcePath, outputPath, resource.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        contextId: state.contextId ?? "",
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
|
|
204
|
+
/**
 * Materializes an inline text resource straight into a dataset: the text is
 * converted to rows by `materializeRawTextRows` and handed to
 * `materializeRowsToDataset` together with metadata taken from `state`.
 *
 * @param {object} state - Build state; reads `runtime`, `sandboxId`, `title`,
 *   `instructions`, `contextId`, `outputSchema` and `first`.
 * @param {object} resource - Text resource; `name` is the title fallback.
 * @param {string} targetDatasetId - Dataset receiving the rows.
 * @returns {Promise<string>} `targetDatasetId`.
 */
async function materializeRawTextResource(state, resource, targetDatasetId) {
    const rows = materializeRawTextRows(resource);
    const { runtime, sandboxId, instructions, first } = state;
    const title = state.title ?? resource.name ?? targetDatasetId;
    const contextId = state.contextId ?? "";
    await materializeRowsToDataset(runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title,
        instructions,
        contextId,
        rows,
        schema: state.outputSchema,
        first,
    });
    return targetDatasetId;
}
|
|
218
|
+
/**
 * Copies a stored file into the dataset's resources directory inside the sandbox.
 *
 * The target name prefers `params.filename`, then the name found in the
 * file's Content-Disposition, then `<fileId>.bin`; it is always sanitized.
 *
 * Fix: an empty or whitespace-only `params.filename` no longer suppresses the
 * Content-Disposition name (the previous `??` only fell through on
 * null/undefined, so a blank name ended up as `<fileId>.bin`).
 *
 * @param {{ runtime: unknown, sandboxId: string, datasetId: string, fileId: string, filename?: string }} params
 * @returns {Promise<{ fileName: string, resourcePath: string }>} The sanitized
 *   name and the sandbox path the file was written to.
 */
async function writePreparedFileResourceToSandbox(params) {
    const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
    const contentDispositionName = parseContentDispositionFileName(file.contentDisposition);
    const requestedName = String(params.filename ?? "").trim();
    const fileName = sanitizeResourceFileName(requestedName || contentDispositionName, `${params.fileId}.bin`);
    const resourcePath = `${getDatasetResourcesDir(params.datasetId)}/${fileName}`;
    // Create the dataset directories before writing into them.
    await runDatasetSandboxCommandStep({
        runtime: params.runtime,
        sandboxId: params.sandboxId,
        cmd: "mkdir",
        args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
    });
    await writeDatasetSandboxFilesStep({
        runtime: params.runtime,
        sandboxId: params.sandboxId,
        files: [{ path: resourcePath, contentBase64: file.contentBase64 }],
    });
    return { fileName, resourcePath };
}
|
|
236
|
+
/**
 * Returns the trimmed sandbox id carried by `state`.
 *
 * @param {{ sandboxId?: unknown }} state - Build state.
 * @param {string} _targetDatasetId - Unused; kept for call-site symmetry.
 * @returns {string} Non-empty sandbox id.
 * @throws {Error} `dataset_sandbox_required` when the id is missing or blank.
 */
function resolveDatasetSandboxId(state, _targetDatasetId) {
    const candidate = String(state.sandboxId ?? "").trim();
    if (candidate.length === 0) {
        throw new Error("dataset_sandbox_required");
    }
    return candidate;
}
|
|
242
|
+
/**
 * Decides whether a dataset agent should actually run in durable mode.
 *
 * A falsy request always yields `false`. A truthy request yields `false` when
 * the dynamically imported `workflow` module reports a `workflowRunId`, and
 * `true` otherwise, including when the module cannot be imported or the
 * metadata lookup throws.
 *
 * @param {unknown} requestedDurable - Caller's durable preference.
 * @returns {Promise<boolean>}
 */
export async function resolveDatasetAgentDurable(requestedDurable) {
    if (!requestedDurable) {
        return false;
    }
    let activeWorkflowRunId;
    try {
        const { getWorkflowMetadata } = await import("workflow");
        activeWorkflowRunId = getWorkflowMetadata?.()?.workflowRunId;
    }
    catch {
        // Outside Workflow runtime there is no active metadata, so honor the caller.
    }
    return activeWorkflowRunId ? false : true;
}
|
|
256
|
+
/**
 * Step that records (creates or updates) the dataset's metadata with status
 * "building" and echoes back the dataset and sandbox ids.
 *
 * @param {object} params - Reads `runtime`, `datasetId`, `sandboxId`, `title`
 *   (defaults to `datasetId`), `instructions`, `contextId` and `schema`.
 * @returns {Promise<{ datasetId: string, sandboxId: string }>}
 */
export async function initializeDatasetStep(params) {
    "use step";
    const { runtime, datasetId, sandboxId } = params;
    const metadata = {
        datasetId,
        sandboxId,
        title: params.title ?? datasetId,
        instructions: params.instructions,
        contextId: params.contextId,
        schema: params.schema,
        status: "building",
    };
    await createOrUpdateDatasetMetadata(runtime, metadata);
    return { datasetId, sandboxId };
}
|
|
272
|
+
/**
 * Step that builds the initial "prepared" descriptor for a dataset build.
 *
 * For `kind === "file"` the resource is either an existing file (its `fileId`
 * is reused) or inline text, which is first uploaded through
 * `uploadInlineTextResource` to obtain a file id. Any other kind produces a
 * transform descriptor over `inputDatasetIds`.
 *
 * @param {object} params - Reads `kind`, `datasetId`, `sandboxId` and, by
 *   kind, `runtime`, `resource`, `schema`, `inputDatasetIds`, `outputSchema`.
 * @returns {Promise<object>} A descriptor with `kind` "file" or "transform".
 */
export async function prepareDatasetResourcesStep(params) {
    "use step";
    const { kind, datasetId, sandboxId } = params;
    if (kind !== "file") {
        return {
            kind: "transform",
            datasetId,
            sandboxId,
            inputDatasetIds: params.inputDatasetIds,
            outputSchema: params.outputSchema,
            sandboxState: { initialized: false, inputPaths: [] },
            inputPreviews: undefined,
        };
    }
    const { resource } = params;
    const isStoredFile = resource.kind === "file";
    let fileId;
    if (isStoredFile) {
        fileId = resource.fileId;
    }
    else {
        fileId = await uploadInlineTextResource(params.runtime, datasetId, resource);
    }
    return {
        kind: "file",
        datasetId,
        sandboxId,
        fileId,
        sandboxState: { initialized: false, filePath: "" },
        filePreview: undefined,
        schema: params.schema ?? null,
        filename: isStoredFile ? resource.filename : resource.name,
        mediaType: isStoredFile ? resource.mediaType : resource.mimeType,
    };
}
|
|
300
|
+
/**
 * Step that extends a prepared descriptor with the agent's instructions and prompt.
 *
 * File descriptors get the caller's instructions, or defaults built from
 * `outputSchema` when none are given, plus a fixed prompt. Other descriptors
 * keep the caller's instructions as-is and get a prompt that mentions the
 * number of input datasets when there is more than one.
 *
 * @param {{ prepared: object, instructions?: string, outputSchema?: object }} params
 * @returns {Promise<object>} `prepared` plus `instructions` and `prompt`.
 */
export async function initializeDatasetContextStep(params) {
    "use step";
    const { prepared, instructions } = params;
    if (prepared.kind === "file") {
        return {
            ...prepared,
            instructions: instructions ?? buildFileDefaultInstructions(params.outputSchema),
            prompt: "generate a dataset for this file",
        };
    }
    const inputCount = prepared.inputDatasetIds.length;
    let prompt;
    if (inputCount === 1) {
        prompt = "Transform the input dataset into a new dataset matching the provided output schema";
    }
    else {
        prompt = `Transform ${inputCount} input datasets into a new dataset matching the provided output schema`;
    }
    return { ...prepared, instructions, prompt };
}
|
|
317
|
+
/**
 * Step that gathers the final view of a built dataset.
 *
 * Loads the dataset record; when neither the caller nor the record provides a
 * schema, asks `datasetInferAndUpdateSchemaStep` to infer one and reloads the
 * record. Then fetches up to 20 preview rows and, when `params.first` is
 * truthy, reads a single row as well.
 *
 * @param {{ runtime: unknown, datasetId: string, schema?: object, first?: boolean }} params
 * @returns {Promise<{ datasetId: string, dataset: unknown, previewRows: unknown, firstRow: unknown }>}
 *   `firstRow` is `undefined` unless `params.first` is truthy.
 * @throws {Error} With the lookup's `error` value when a dataset load is not ok.
 */
export async function completeDatasetStep(params) {
    "use step";
    const { runtime, datasetId } = params;
    // Loads the dataset record and fails fast on a non-ok result.
    const loadDataset = async () => {
        const lookup = await datasetGetByIdStep({
            runtime,
            datasetId,
        });
        if (!lookup.ok) {
            throw new Error(lookup.error);
        }
        return lookup;
    };
    let datasetResult = await loadDataset();
    const schemaMissing = !params.schema && !datasetResult.data?.schema;
    if (schemaMissing) {
        await datasetInferAndUpdateSchemaStep({
            runtime,
            datasetId,
            title: `${datasetId}Row`,
            description: "One dataset row",
        });
        // Reload so the returned record reflects the schema update.
        datasetResult = await loadDataset();
    }
    const previewResult = await datasetPreviewRowsStep({
        runtime,
        datasetId,
        limit: 20,
    });
    let firstRow = undefined;
    if (params.first) {
        const firstResult = await datasetReadOneStep({
            runtime,
            datasetId,
        });
        firstRow = firstResult.row;
    }
    return {
        datasetId,
        dataset: datasetResult.data,
        previewRows: previewResult.rows,
        firstRow,
    };
}
|
|
47
|
-
async function
|
|
363
|
+
/**
 * Materializes one file-like resource into `targetDatasetId`.
 *
 * A file resource with no requested output schema is first offered to the
 * direct PDF path; when that returns a dataset id it is returned immediately.
 * Otherwise the dataset is initialized, the resource is prepared and copied
 * into the sandbox, and a file parse context is created and run.
 *
 * @param {object} state - Build state; reads `runtime`, `sandboxId`, `reactor`,
 *   `title`, `instructions`, `contextId`, `outputSchema` and `durable`.
 * @param {object} resource - The resource to materialize.
 * @param {string} targetDatasetId - Dataset to populate.
 * @returns {Promise<string>} The id of the materialized dataset.
 * @throws {Error} `dataset_sandbox_required`, `dataset_reactor_required` or
 *   `dataset_context_kind_mismatch:file`.
 */
export async function materializeSingleFileLikeResource(state, resource, targetDatasetId) {
    const canTryDirectPdf = resource.kind === "file" && !state.outputSchema;
    if (canTryDirectPdf) {
        const pdfDatasetId = await tryMaterializeRawPdfFileResource(state, resource, targetDatasetId);
        if (pdfDatasetId) {
            return pdfDatasetId;
        }
    }
    const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
    if (!state.reactor) {
        throw new Error("dataset_reactor_required");
    }
    const { runtime } = state;
    await initializeDatasetStep({
        runtime,
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? targetDatasetId,
        instructions: state.instructions,
        contextId: state.contextId ?? "",
        schema: state.outputSchema,
    });
    const prepared = await prepareDatasetResourcesStep({
        kind: "file",
        runtime,
        datasetId: targetDatasetId,
        sandboxId,
        resource,
        schema: state.outputSchema,
    });
    if (prepared.kind !== "file") {
        throw new Error("dataset_context_kind_mismatch:file");
    }
    const preparedFile = await writePreparedFileResourceToSandbox({
        runtime,
        sandboxId,
        datasetId: targetDatasetId,
        fileId: prepared.fileId,
        filename: prepared.filename,
    });
    // Fall back to the name actually written to the sandbox when none was prepared.
    const preparedWithName = {
        ...prepared,
        filename: prepared.filename ?? preparedFile.fileName,
    };
    const context = await initializeDatasetContextStep({
        prepared: preparedWithName,
        instructions: state.instructions,
        outputSchema: state.outputSchema,
    });
    if (context.kind !== "file") {
        throw new Error("dataset_context_kind_mismatch:file");
    }
    const { datasetId, fileId, instructions, sandboxState, filePreview, schema, filename, mediaType } = context;
    const parseContext = createFileParseContext(fileId, {
        datasetId,
        instructions,
        reactor: state.reactor,
        sandboxId: context.sandboxId,
        sandboxState,
        filePreview,
        schema,
        filename,
        mediaType,
    });
    const durable = await resolveDatasetAgentDurable(state.durable);
    await parseContext.parse(runtime, {
        durable,
        prompt: context.prompt,
        initialContent: {
            datasetId,
            fileId,
            instructions: instructions ?? "",
            sandboxId: context.sandboxId,
            sandboxState,
            filePreview,
            schema,
            filename,
            mediaType,
        },
    });
    return targetDatasetId;
}
|
|
110
|
-
async function
|
|
111
|
-
if (
|
|
112
|
-
return
|
|
439
|
+
/**
 * Resolves a resource to the id of a dataset that holds its rows.
 *
 * Dataset resources already have an id, which is returned unchanged. Query,
 * text and file-like resources are materialized into an intermediate dataset
 * whose id is derived from the target id, the resource kind and its index.
 * Context resources are rejected here.
 *
 * @param {object} state - Build state forwarded to the materializers.
 * @param {object} resource - Resource to normalize; dispatched on `kind`.
 * @param {string} targetDatasetId - Id of the dataset being built.
 * @param {number} resourceIndex - Position of the resource, used in the id.
 * @returns {Promise<string>} Existing or intermediate dataset id.
 * @throws {Error} `dataset_context_resource_must_be_resolved_before_materialization`
 *   for `kind === "context"`.
 */
async function normalizeResourceToDatasetId(state, resource, targetDatasetId, resourceIndex) {
    if (resource.kind === "dataset") {
        return resource.datasetId;
    }
    const intermediateDatasetId = makeIntermediateDatasetId(targetDatasetId, resource.kind, resourceIndex);
    switch (resource.kind) {
        case "query":
            await materializeQueryResource(state.runtime, resource, {
                datasetId: intermediateDatasetId,
                sandboxId: state.sandboxId,
                title: resource.title,
                first: false,
                contextId: state.contextId ?? "",
            });
            break;
        case "text":
            // Raw import: no output schema, no first-row read, resource-kind instructions.
            await materializeRawTextResource({
                ...state,
                outputSchema: undefined,
                first: false,
                instructions: buildRawResourceInstructions(resource.kind),
                title: resource.name ?? state.title,
            }, resource, intermediateDatasetId);
            break;
        case "context":
            throw new Error("dataset_context_resource_must_be_resolved_before_materialization");
        default:
            await materializeSingleFileLikeResource({
                ...state,
                outputSchema: undefined,
                first: false,
                instructions: buildRawResourceInstructions(resource.kind),
            }, resource, intermediateDatasetId);
            break;
    }
    return intermediateDatasetId;
}
|
|
132
|
-
async function materializeDerivedDataset(state, targetDatasetId) {
|
|
475
|
+
export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
133
476
|
if (!state.reactor) {
|
|
134
477
|
throw new Error("dataset_reactor_required");
|
|
135
478
|
}
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
}
|
|
139
|
-
const
|
|
140
|
-
for (let index = 0; index < state.sources.length; index++) {
|
|
141
|
-
normalizedSources.push(await normalizeSourceToDatasetId(state, state.sources[index], targetDatasetId, index));
|
|
142
|
-
}
|
|
143
|
-
const transformSchema = state.outputSchema ??
|
|
479
|
+
const sandboxId = resolveDatasetSandboxId(state, targetDatasetId);
|
|
480
|
+
const stateWithSandbox = { ...state, sandboxId };
|
|
481
|
+
const inputDatasetIds = (stateWithSandbox.contextResources ?? []).map((resource, index) => String(resource.datasetId ?? resource.key ?? `resource_${index + 1}`));
|
|
482
|
+
const transformSchema = stateWithSandbox.outputSchema ??
|
|
144
483
|
{
|
|
145
484
|
title: "DatasetRow",
|
|
146
485
|
description: "One dataset row",
|
|
@@ -150,56 +489,60 @@ async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
150
489
|
properties: {},
|
|
151
490
|
},
|
|
152
491
|
};
|
|
153
|
-
await (
|
|
492
|
+
await initializeDatasetStep({
|
|
493
|
+
runtime: stateWithSandbox.runtime,
|
|
154
494
|
datasetId: targetDatasetId,
|
|
155
|
-
sandboxId
|
|
156
|
-
title:
|
|
157
|
-
instructions:
|
|
158
|
-
|
|
159
|
-
? {
|
|
160
|
-
kind: "query",
|
|
161
|
-
query: source.query,
|
|
162
|
-
title: source.title,
|
|
163
|
-
explanation: source.explanation,
|
|
164
|
-
...(0, sourceRows_1.getDomainDescriptor)(source.domain),
|
|
165
|
-
}
|
|
166
|
-
: source),
|
|
167
|
-
sourceKinds: state.sources.map((source) => source.kind),
|
|
495
|
+
sandboxId,
|
|
496
|
+
title: stateWithSandbox.title ?? targetDatasetId,
|
|
497
|
+
instructions: stateWithSandbox.instructions,
|
|
498
|
+
contextId: stateWithSandbox.contextId ?? "",
|
|
168
499
|
schema: transformSchema,
|
|
169
|
-
status: "building",
|
|
170
500
|
});
|
|
171
|
-
const
|
|
172
|
-
|
|
173
|
-
outputSchema: transformSchema,
|
|
174
|
-
instructions: (0, instructions_1.buildTransformInstructions)(normalizedSources.length, state.instructions, state.outputSchema),
|
|
501
|
+
const prepared = {
|
|
502
|
+
kind: "transform",
|
|
175
503
|
datasetId: targetDatasetId,
|
|
176
|
-
|
|
177
|
-
|
|
504
|
+
sandboxId,
|
|
505
|
+
inputDatasetIds,
|
|
506
|
+
outputSchema: transformSchema,
|
|
507
|
+
sandboxState: { initialized: false, inputPaths: [] },
|
|
508
|
+
inputPreviews: undefined,
|
|
509
|
+
};
|
|
510
|
+
const context = await initializeDatasetContextStep({
|
|
511
|
+
prepared,
|
|
512
|
+
instructions: buildTransformInstructions(inputDatasetIds.length, stateWithSandbox.instructions, stateWithSandbox.outputSchema),
|
|
513
|
+
outputSchema: transformSchema,
|
|
178
514
|
});
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const service = new service_1.DatasetService(db);
|
|
182
|
-
if (!state.outputSchema) {
|
|
183
|
-
const readResult = await service.readRows({ datasetId: targetDatasetId, cursor: 0, limit: 1000 });
|
|
184
|
-
if (!readResult.ok) {
|
|
185
|
-
throw new Error(readResult.error);
|
|
186
|
-
}
|
|
187
|
-
const inferred = (0, schemaInference_1.inferDatasetSchema)(readResult.data.rows, `${targetDatasetId}Row`, "One dataset row");
|
|
188
|
-
const updateResult = await service.updateDatasetSchema({
|
|
189
|
-
datasetId: targetDatasetId,
|
|
190
|
-
schema: inferred,
|
|
191
|
-
status: "completed",
|
|
192
|
-
});
|
|
193
|
-
if (!updateResult.ok) {
|
|
194
|
-
throw new Error(updateResult.error);
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
if (state.first) {
|
|
198
|
-
const firstResult = await service.readOne(targetDatasetId);
|
|
199
|
-
if (!firstResult.ok) {
|
|
200
|
-
throw new Error(firstResult.error);
|
|
201
|
-
}
|
|
515
|
+
if (context.kind !== "transform") {
|
|
516
|
+
throw new Error("dataset_context_kind_mismatch:transform");
|
|
202
517
|
}
|
|
518
|
+
const transformContext = createTransformDatasetContext({
|
|
519
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
520
|
+
outputSchema: context.outputSchema,
|
|
521
|
+
instructions: context.instructions,
|
|
522
|
+
datasetId: context.datasetId,
|
|
523
|
+
reactor: stateWithSandbox.reactor,
|
|
524
|
+
sandboxId: context.sandboxId,
|
|
525
|
+
sandboxState: context.sandboxState,
|
|
526
|
+
inputPreviews: context.inputPreviews,
|
|
527
|
+
contextResources: stateWithSandbox.contextResources ?? [],
|
|
528
|
+
});
|
|
529
|
+
await transformContext.transform(stateWithSandbox.runtime, {
|
|
530
|
+
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
531
|
+
prompt: context.prompt,
|
|
532
|
+
initialContent: {
|
|
533
|
+
datasetId: context.datasetId,
|
|
534
|
+
inputDatasetIds: context.inputDatasetIds,
|
|
535
|
+
outputSchema: context.outputSchema,
|
|
536
|
+
instructions: context.instructions,
|
|
537
|
+
sandboxId: context.sandboxId,
|
|
538
|
+
sandboxState: context.sandboxState,
|
|
539
|
+
inputPreviews: context.inputPreviews,
|
|
540
|
+
contextResources: stateWithSandbox.contextResources ?? [],
|
|
541
|
+
},
|
|
542
|
+
});
|
|
203
543
|
return targetDatasetId;
|
|
204
544
|
}
|
|
205
|
-
|
|
545
|
+
registerDatasetAgentMaterializers({
|
|
546
|
+
materializeSingleFileLikeResource,
|
|
547
|
+
materializeDerivedDataset,
|
|
548
|
+
});
|