@ekairos/dataset 1.22.35-beta.development.0 → 1.22.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +347 -0
- package/dist/agents.d.ts +8 -0
- package/dist/agents.js +8 -0
- package/dist/builder/agentMaterializers.d.ts +9 -0
- package/dist/builder/agentMaterializers.js +10 -0
- package/dist/builder/context.d.ts +15 -0
- package/dist/builder/context.js +251 -0
- package/dist/builder/instructions.d.ts +5 -0
- package/dist/builder/instructions.js +40 -0
- package/dist/builder/materialize.d.ts +83 -0
- package/dist/builder/materialize.js +548 -0
- package/dist/builder/materializeQuery.d.ts +12 -0
- package/dist/builder/materializeQuery.js +31 -0
- package/dist/builder/persistence.d.ts +22 -0
- package/dist/builder/persistence.js +153 -0
- package/dist/builder/rows.d.ts +7 -0
- package/dist/builder/rows.js +56 -0
- package/dist/builder/schemaInference.d.ts +3 -0
- package/dist/builder/schemaInference.js +61 -0
- package/dist/builder/types.d.ts +140 -0
- package/dist/builder/types.js +1 -0
- package/dist/clearDataset.tool.d.ts +2 -3
- package/dist/clearDataset.tool.js +13 -17
- package/dist/completeDataset.steps.d.ts +117 -0
- package/dist/completeDataset.steps.js +487 -0
- package/dist/completeDataset.tool.d.ts +132 -7
- package/dist/completeDataset.tool.js +46 -192
- package/dist/contextResources.d.ts +31 -0
- package/dist/contextResources.js +151 -0
- package/dist/contextWorkspace.d.ts +79 -0
- package/dist/contextWorkspace.js +234 -0
- package/dist/dataset/steps.d.ts +39 -15
- package/dist/dataset/steps.js +96 -39
- package/dist/dataset.d.ts +3 -67
- package/dist/dataset.js +129 -521
- package/dist/datasetFiles.d.ts +5 -1
- package/dist/datasetFiles.js +29 -27
- package/dist/domain.d.ts +1 -2
- package/dist/domain.js +1 -6
- package/dist/executeCommand.tool.d.ts +2 -30
- package/dist/executeCommand.tool.js +165 -39
- package/dist/file/file-dataset.agent.d.ts +19 -56
- package/dist/file/file-dataset.agent.js +176 -134
- package/dist/file/file-dataset.steps.d.ts +27 -0
- package/dist/file/file-dataset.steps.js +47 -0
- package/dist/file/file-dataset.types.d.ts +64 -0
- package/dist/file/file-dataset.types.js +1 -0
- package/dist/file/filepreview.d.ts +5 -35
- package/dist/file/filepreview.js +60 -107
- package/dist/file/filepreview.types.d.ts +31 -0
- package/dist/file/filepreview.types.js +1 -0
- package/dist/file/generateSchema.tool.d.ts +2 -3
- package/dist/file/generateSchema.tool.js +11 -15
- package/dist/file/index.d.ts +1 -2
- package/dist/file/index.js +1 -18
- package/dist/file/prompts.d.ts +2 -3
- package/dist/file/prompts.js +134 -27
- package/dist/file/scripts.generated.d.ts +1 -0
- package/dist/file/scripts.generated.js +11 -0
- package/dist/file/steps.d.ts +1 -2
- package/dist/file/steps.js +9 -7
- package/dist/id.d.ts +1 -0
- package/dist/id.js +10 -0
- package/dist/index.d.ts +8 -7
- package/dist/index.js +8 -23
- package/dist/materializeDataset.tool.d.ts +52 -32
- package/dist/materializeDataset.tool.js +81 -65
- package/dist/query/index.d.ts +1 -2
- package/dist/query/index.js +1 -18
- package/dist/query/queryDomain.d.ts +3 -4
- package/dist/query/queryDomain.js +3 -40
- package/dist/query/queryDomain.step.d.ts +1 -1
- package/dist/query/queryDomain.step.js +13 -13
- package/dist/sandbox/steps.d.ts +23 -15
- package/dist/sandbox/steps.js +73 -76
- package/dist/sandbox.steps.d.ts +1 -2
- package/dist/sandbox.steps.js +1 -18
- package/dist/schema.d.ts +13 -13
- package/dist/schema.js +25 -37
- package/dist/service.d.ts +8 -5
- package/dist/service.js +70 -15
- package/dist/skill.d.ts +0 -1
- package/dist/skill.js +12 -17
- package/dist/transform/filepreview.d.ts +2 -3
- package/dist/transform/filepreview.js +9 -26
- package/dist/transform/index.d.ts +2 -3
- package/dist/transform/index.js +2 -8
- package/dist/transform/prompts.d.ts +1 -34
- package/dist/transform/prompts.js +58 -43
- package/dist/transform/transform-dataset.agent.d.ts +20 -45
- package/dist/transform/transform-dataset.agent.js +146 -91
- package/dist/transform/transform-dataset.steps.d.ts +30 -0
- package/dist/transform/transform-dataset.steps.js +61 -0
- package/dist/transform/transform-dataset.types.d.ts +95 -0
- package/dist/transform/transform-dataset.types.js +1 -0
- package/dist/transform/transformDataset.d.ts +3 -3
- package/dist/transform/transformDataset.js +15 -18
- package/dist/writeDatasetRows.tool.d.ts +188 -0
- package/dist/writeDatasetRows.tool.js +258 -0
- package/package.json +35 -10
- package/dist/clearDataset.tool.d.ts.map +0 -1
- package/dist/clearDataset.tool.js.map +0 -1
- package/dist/completeDataset.tool.d.ts.map +0 -1
- package/dist/completeDataset.tool.js.map +0 -1
- package/dist/dataset/steps.d.ts.map +0 -1
- package/dist/dataset/steps.js.map +0 -1
- package/dist/dataset.d.ts.map +0 -1
- package/dist/dataset.js.map +0 -1
- package/dist/datasetFiles.d.ts.map +0 -1
- package/dist/datasetFiles.js.map +0 -1
- package/dist/domain.d.ts.map +0 -1
- package/dist/domain.js.map +0 -1
- package/dist/eventsReactRuntime.d.ts +0 -22
- package/dist/eventsReactRuntime.d.ts.map +0 -1
- package/dist/eventsReactRuntime.js +0 -29
- package/dist/eventsReactRuntime.js.map +0 -1
- package/dist/executeCommand.tool.d.ts.map +0 -1
- package/dist/executeCommand.tool.js.map +0 -1
- package/dist/file/file-dataset.agent.d.ts.map +0 -1
- package/dist/file/file-dataset.agent.js.map +0 -1
- package/dist/file/filepreview.d.ts.map +0 -1
- package/dist/file/filepreview.js.map +0 -1
- package/dist/file/generateSchema.tool.d.ts.map +0 -1
- package/dist/file/generateSchema.tool.js.map +0 -1
- package/dist/file/index.d.ts.map +0 -1
- package/dist/file/index.js.map +0 -1
- package/dist/file/prompts.d.ts.map +0 -1
- package/dist/file/prompts.js.map +0 -1
- package/dist/file/steps.d.ts.map +0 -1
- package/dist/file/steps.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/materializeDataset.tool.d.ts.map +0 -1
- package/dist/materializeDataset.tool.js.map +0 -1
- package/dist/query/index.d.ts.map +0 -1
- package/dist/query/index.js.map +0 -1
- package/dist/query/queryDomain.d.ts.map +0 -1
- package/dist/query/queryDomain.js.map +0 -1
- package/dist/query/queryDomain.step.d.ts.map +0 -1
- package/dist/query/queryDomain.step.js.map +0 -1
- package/dist/sandbox/steps.d.ts.map +0 -1
- package/dist/sandbox/steps.js.map +0 -1
- package/dist/sandbox.steps.d.ts.map +0 -1
- package/dist/sandbox.steps.js.map +0 -1
- package/dist/schema.d.ts.map +0 -1
- package/dist/schema.js.map +0 -1
- package/dist/service.d.ts.map +0 -1
- package/dist/service.js.map +0 -1
- package/dist/skill.d.ts.map +0 -1
- package/dist/skill.js.map +0 -1
- package/dist/transform/filepreview.d.ts.map +0 -1
- package/dist/transform/filepreview.js.map +0 -1
- package/dist/transform/index.d.ts.map +0 -1
- package/dist/transform/index.js.map +0 -1
- package/dist/transform/prompts.d.ts.map +0 -1
- package/dist/transform/prompts.js.map +0 -1
- package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
- package/dist/transform/transform-dataset.agent.js.map +0 -1
- package/dist/transform/transformDataset.d.ts.map +0 -1
- package/dist/transform/transformDataset.js.map +0 -1
package/dist/file/filepreview.js
CHANGED
|
@@ -1,30 +1,38 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
exports.ensurePreviewScriptsAvailable = ensurePreviewScriptsAvailable;
|
|
4
|
-
exports.generateFilePreview = generateFilePreview;
|
|
5
|
-
const fs_1 = require("fs");
|
|
6
|
-
const path_1 = require("path");
|
|
7
|
-
const steps_1 = require("../sandbox/steps");
|
|
1
|
+
import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
|
|
2
|
+
import { PYTHON_SCRIPT_BASE64_BY_NAME } from "./scripts.generated.js";
|
|
8
3
|
const DEFAULT_HEAD_LINES = 50;
|
|
9
4
|
const DEFAULT_TAIL_LINES = 20;
|
|
10
5
|
const DEFAULT_MID_LINES = 20;
|
|
11
|
-
|
|
12
|
-
const
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
"
|
|
20
|
-
|
|
21
|
-
function
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
6
|
+
export function getEmbeddedFilePreviewScriptBase64(scriptName) {
|
|
7
|
+
const embedded = PYTHON_SCRIPT_BASE64_BY_NAME[scriptName];
|
|
8
|
+
if (!embedded) {
|
|
9
|
+
throw new Error(`dataset_preview_script_not_embedded:${scriptName}`);
|
|
10
|
+
}
|
|
11
|
+
return embedded;
|
|
12
|
+
}
|
|
13
|
+
function readFilePreviewScriptText(scriptName) {
|
|
14
|
+
return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
|
|
15
|
+
}
|
|
16
|
+
function sanitizePreviewText(value) {
|
|
17
|
+
return String(value ?? "")
|
|
18
|
+
.replace(/\u0000/g, "")
|
|
19
|
+
.replace(/[\u0001-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, "");
|
|
20
|
+
}
|
|
21
|
+
function getPreviewKind(extension) {
|
|
22
|
+
const normalized = extension.toLowerCase();
|
|
23
|
+
if (normalized === ".xlsx" || normalized === ".xls")
|
|
24
|
+
return "excel";
|
|
25
|
+
if (normalized === ".csv" ||
|
|
26
|
+
normalized === ".tsv" ||
|
|
27
|
+
normalized === ".txt" ||
|
|
28
|
+
normalized === ".log" ||
|
|
29
|
+
normalized === ".json" ||
|
|
30
|
+
normalized === ".jsonl" ||
|
|
31
|
+
normalized === ".md") {
|
|
32
|
+
return "text";
|
|
33
|
+
}
|
|
34
|
+
return null;
|
|
25
35
|
}
|
|
26
|
-
const preparedSandboxIds = new Set();
|
|
27
|
-
const sandboxSetupPromises = new Map();
|
|
28
36
|
function validateScriptResult(result, context) {
|
|
29
37
|
if (!result.stderr) {
|
|
30
38
|
return;
|
|
@@ -37,75 +45,24 @@ function validateScriptResult(result, context) {
|
|
|
37
45
|
throw new Error(`${context} failed: ${stderr.substring(0, 500)}`);
|
|
38
46
|
}
|
|
39
47
|
}
|
|
40
|
-
async function ensurePreviewScriptsAvailable(
|
|
41
|
-
|
|
42
|
-
return;
|
|
43
|
-
}
|
|
44
|
-
const inFlight = sandboxSetupPromises.get(sandboxId);
|
|
45
|
-
if (inFlight) {
|
|
46
|
-
await inFlight;
|
|
47
|
-
return;
|
|
48
|
-
}
|
|
49
|
-
const setupPromise = (async () => {
|
|
50
|
-
try {
|
|
51
|
-
await (0, steps_1.runDatasetSandboxCommandStep)({
|
|
52
|
-
env,
|
|
53
|
-
sandboxId,
|
|
54
|
-
cmd: "mkdir",
|
|
55
|
-
args: ["-p", SANDBOX_SCRIPT_DIRECTORY],
|
|
56
|
-
});
|
|
57
|
-
}
|
|
58
|
-
catch (error) {
|
|
59
|
-
console.warn("[Dataset Scripts] Failed to create sandbox scripts directory", error);
|
|
60
|
-
}
|
|
61
|
-
const filesToWrite = [];
|
|
62
|
-
for (const scriptName of PYTHON_SCRIPT_FILES) {
|
|
63
|
-
try {
|
|
64
|
-
const scriptPath = resolveScriptPath(scriptName);
|
|
65
|
-
const fileBuffer = (0, fs_1.readFileSync)(scriptPath);
|
|
66
|
-
filesToWrite.push({
|
|
67
|
-
path: `${SANDBOX_SCRIPT_DIRECTORY}/${scriptName}`,
|
|
68
|
-
contentBase64: Buffer.from(fileBuffer).toString("base64"),
|
|
69
|
-
});
|
|
70
|
-
}
|
|
71
|
-
catch (error) {
|
|
72
|
-
console.error(`[Dataset Scripts] Failed to read script ${scriptName}`, error);
|
|
73
|
-
throw error;
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
if (filesToWrite.length > 0) {
|
|
77
|
-
await (0, steps_1.writeDatasetSandboxFilesStep)({
|
|
78
|
-
env,
|
|
79
|
-
sandboxId,
|
|
80
|
-
files: filesToWrite,
|
|
81
|
-
});
|
|
82
|
-
}
|
|
83
|
-
})();
|
|
84
|
-
sandboxSetupPromises.set(sandboxId, setupPromise);
|
|
85
|
-
try {
|
|
86
|
-
await setupPromise;
|
|
87
|
-
preparedSandboxIds.add(sandboxId);
|
|
88
|
-
}
|
|
89
|
-
catch (error) {
|
|
90
|
-
sandboxSetupPromises.delete(sandboxId);
|
|
91
|
-
throw error;
|
|
92
|
-
}
|
|
48
|
+
export async function ensurePreviewScriptsAvailable(_runtime, _sandboxId) {
|
|
49
|
+
return;
|
|
93
50
|
}
|
|
94
|
-
async function generateFilePreview(
|
|
51
|
+
export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId, options = {}) {
|
|
95
52
|
const context = {
|
|
96
53
|
totalRows: 0,
|
|
97
54
|
};
|
|
98
55
|
try {
|
|
99
|
-
await
|
|
100
|
-
|
|
56
|
+
const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
|
|
57
|
+
validateScriptResult(metadataResult, `preview_metadata for ${datasetId}`);
|
|
101
58
|
context.metadata = metadataResult;
|
|
102
|
-
let
|
|
59
|
+
let previewKind = null;
|
|
103
60
|
if (metadataResult.stdout) {
|
|
104
61
|
try {
|
|
105
62
|
const metadataJson = JSON.parse(metadataResult.stdout);
|
|
106
63
|
context.totalRows = metadataJson.row_count_estimate || 0;
|
|
107
64
|
const extension = metadataJson.extension || "";
|
|
108
|
-
|
|
65
|
+
previewKind = getPreviewKind(extension);
|
|
109
66
|
}
|
|
110
67
|
catch {
|
|
111
68
|
console.warn(`[Dataset ${datasetId}] Failed to parse metadata JSON`);
|
|
@@ -118,28 +75,32 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
|
|
|
118
75
|
console.log(`[Dataset ${datasetId}] No rows detected, skipping preview`);
|
|
119
76
|
return context;
|
|
120
77
|
}
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
78
|
+
if (!previewKind) {
|
|
79
|
+
console.log(`[Dataset ${datasetId}] Binary or unsupported preview format, keeping metadata only`);
|
|
80
|
+
return context;
|
|
81
|
+
}
|
|
82
|
+
const headScript = previewKind === "excel" ? "preview_head_excel.py" : "preview_head_csv.py";
|
|
83
|
+
const tailScript = previewKind === "excel" ? "preview_tail_excel.py" : "preview_tail_csv.py";
|
|
84
|
+
const midScript = previewKind === "excel" ? "preview_mid_excel.py" : "preview_mid_csv.py";
|
|
124
85
|
if (totalRows <= headLines) {
|
|
125
86
|
console.log(`[Dataset ${datasetId}] File has ${totalRows} rows, reading all with head only`);
|
|
126
|
-
const headResult = await runScript(
|
|
87
|
+
const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
|
|
127
88
|
validateScriptResult(headResult, `preview_head for ${datasetId}`);
|
|
128
89
|
context.head = headResult;
|
|
129
90
|
return context;
|
|
130
91
|
}
|
|
131
92
|
if (headLines + tailLines >= totalRows) {
|
|
132
93
|
console.log(`[Dataset ${datasetId}] Head + tail would cover entire file (${totalRows} rows), reading all with head only`);
|
|
133
|
-
const headResult = await runScript(
|
|
94
|
+
const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
|
|
134
95
|
validateScriptResult(headResult, `preview_head for ${datasetId}`);
|
|
135
96
|
context.head = headResult;
|
|
136
97
|
return context;
|
|
137
98
|
}
|
|
138
99
|
console.log(`[Dataset ${datasetId}] Reading head (${headLines} rows) and tail (${tailLines} rows) from ${totalRows} total rows`);
|
|
139
|
-
const headResult = await runScript(
|
|
100
|
+
const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(headLines)], `Reads the first ${headLines} rows of the file`);
|
|
140
101
|
validateScriptResult(headResult, `preview_head for ${datasetId}`);
|
|
141
102
|
context.head = headResult;
|
|
142
|
-
const tailResult = await runScript(
|
|
103
|
+
const tailResult = await runScript(runtime, sandboxId, tailScript, [sandboxFilePath, String(tailLines)], `Reads the last ${tailLines} rows of the file`);
|
|
143
104
|
validateScriptResult(tailResult, `preview_tail for ${datasetId}`);
|
|
144
105
|
context.tail = tailResult;
|
|
145
106
|
const midLines = options.midLines || DEFAULT_MID_LINES;
|
|
@@ -148,40 +109,33 @@ async function generateFilePreview(env, sandboxId, sandboxFilePath, datasetId, o
|
|
|
148
109
|
const midStart = headLines;
|
|
149
110
|
const midEnd = totalRows - tailLines;
|
|
150
111
|
console.log(`[Dataset ${datasetId}] Large gap (${gapSize} rows), adding mid sample (${midLines} rows)`);
|
|
151
|
-
const midResult = await runScript(
|
|
112
|
+
const midResult = await runScript(runtime, sandboxId, midScript, [sandboxFilePath, String(midStart), String(midEnd), String(midLines)], `Samples ${midLines} rows from the middle section (rows ${midStart + 1} to ${midEnd})`);
|
|
152
113
|
validateScriptResult(midResult, `preview_mid for ${datasetId}`);
|
|
153
114
|
context.mid = midResult;
|
|
154
115
|
}
|
|
155
116
|
}
|
|
156
117
|
catch (error) {
|
|
157
118
|
console.error(`[Dataset ${datasetId}] Error generating file preview:`, error);
|
|
119
|
+
throw error;
|
|
158
120
|
}
|
|
159
121
|
return context;
|
|
160
122
|
}
|
|
161
|
-
async function runScript(
|
|
162
|
-
const
|
|
163
|
-
const command = `python
|
|
164
|
-
let scriptContent = "";
|
|
165
|
-
try {
|
|
166
|
-
const localScriptPath = resolveScriptPath(scriptName);
|
|
167
|
-
scriptContent = (0, fs_1.readFileSync)(localScriptPath, 'utf-8');
|
|
168
|
-
}
|
|
169
|
-
catch (error) {
|
|
170
|
-
console.warn(`Failed to read script ${scriptName}:`, error);
|
|
171
|
-
}
|
|
123
|
+
async function runScript(runtime, sandboxId, scriptName, args, description) {
|
|
124
|
+
const scriptContent = readFilePreviewScriptText(scriptName);
|
|
125
|
+
const command = `python -c <${scriptName}> ${args.join(" ")}`;
|
|
172
126
|
try {
|
|
173
|
-
const result = await
|
|
174
|
-
|
|
127
|
+
const result = await runDatasetSandboxCommandStep({
|
|
128
|
+
runtime,
|
|
175
129
|
sandboxId,
|
|
176
130
|
cmd: "python",
|
|
177
|
-
args: [
|
|
131
|
+
args: ["-c", scriptContent, ...args],
|
|
178
132
|
});
|
|
179
133
|
return {
|
|
180
134
|
description,
|
|
181
135
|
script: scriptContent,
|
|
182
136
|
command,
|
|
183
|
-
stdout: result.stdout
|
|
184
|
-
stderr: result.stderr
|
|
137
|
+
stdout: sanitizePreviewText(result.stdout),
|
|
138
|
+
stderr: sanitizePreviewText(result.stderr),
|
|
185
139
|
};
|
|
186
140
|
}
|
|
187
141
|
catch (error) {
|
|
@@ -190,8 +144,7 @@ async function runScript(env, sandboxId, scriptName, args, description) {
|
|
|
190
144
|
script: scriptContent,
|
|
191
145
|
command,
|
|
192
146
|
stdout: "",
|
|
193
|
-
stderr: error instanceof Error ? error.message : String(error),
|
|
147
|
+
stderr: sanitizePreviewText(error instanceof Error ? error.message : String(error)),
|
|
194
148
|
};
|
|
195
149
|
}
|
|
196
150
|
}
|
|
197
|
-
//# sourceMappingURL=filepreview.js.map
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
export type FilePreviewContext = {
|
|
2
|
+
totalRows: number;
|
|
3
|
+
metadata?: {
|
|
4
|
+
description: string;
|
|
5
|
+
script: string;
|
|
6
|
+
command: string;
|
|
7
|
+
stdout: string;
|
|
8
|
+
stderr: string;
|
|
9
|
+
};
|
|
10
|
+
head?: {
|
|
11
|
+
description: string;
|
|
12
|
+
script: string;
|
|
13
|
+
command: string;
|
|
14
|
+
stdout: string;
|
|
15
|
+
stderr: string;
|
|
16
|
+
};
|
|
17
|
+
tail?: {
|
|
18
|
+
description: string;
|
|
19
|
+
script: string;
|
|
20
|
+
command: string;
|
|
21
|
+
stdout: string;
|
|
22
|
+
stderr: string;
|
|
23
|
+
};
|
|
24
|
+
mid?: {
|
|
25
|
+
description: string;
|
|
26
|
+
script: string;
|
|
27
|
+
command: string;
|
|
28
|
+
stdout: string;
|
|
29
|
+
stderr: string;
|
|
30
|
+
};
|
|
31
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -2,9 +2,9 @@ interface GenerateSchemaToolParams {
|
|
|
2
2
|
datasetId: string;
|
|
3
3
|
isNested?: boolean;
|
|
4
4
|
fileId?: string;
|
|
5
|
-
|
|
5
|
+
runtime: any;
|
|
6
6
|
}
|
|
7
|
-
export declare function createGenerateSchemaTool({ datasetId, isNested, fileId,
|
|
7
|
+
export declare function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }: GenerateSchemaToolParams): import("ai").Tool<{
|
|
8
8
|
schemaTitle: string;
|
|
9
9
|
schemaDescription: string;
|
|
10
10
|
schemaJson: string;
|
|
@@ -25,4 +25,3 @@ export declare function createGenerateSchemaTool({ datasetId, isNested, fileId,
|
|
|
25
25
|
error?: undefined;
|
|
26
26
|
}>;
|
|
27
27
|
export {};
|
|
28
|
-
//# sourceMappingURL=generateSchema.tool.d.ts.map
|
|
@@ -1,20 +1,17 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
const steps_1 = require("../dataset/steps");
|
|
7
|
-
function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
|
|
8
|
-
return (0, ai_1.tool)({
|
|
1
|
+
import { tool } from "ai";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { datasetUpdateSchemaStep } from "../dataset/steps.js";
|
|
4
|
+
export function createGenerateSchemaTool({ datasetId, isNested, fileId, runtime }) {
|
|
5
|
+
return tool({
|
|
9
6
|
description: `Generate a formal JSON schema for a SINGLE RECORD (row) from the file. This schema describes the structure of ONE record, not the entire dataset or array of records. Requirements:
|
|
10
7
|
1. Schema describes ONE RECORD structure only (no array wrappers)
|
|
11
8
|
2. All property names MUST use lowercaseCamelCase convention (e.g., 'productName', 'unitPrice')
|
|
12
9
|
3. Each property MUST have a description field
|
|
13
10
|
4. The schema description must explain what one record represents and field mappings from original file`,
|
|
14
|
-
inputSchema:
|
|
15
|
-
schemaTitle:
|
|
16
|
-
schemaDescription:
|
|
17
|
-
schemaJson:
|
|
11
|
+
inputSchema: z.object({
|
|
12
|
+
schemaTitle: z.string().describe("Title for the RECORD schema in PascalCase (e.g., 'ProductRecord', 'TransactionRecord')"),
|
|
13
|
+
schemaDescription: z.string().describe("Comprehensive description that includes: 1) what ONE record represents, 2) its purpose, 3) complete field mapping from original file fields to schema fields with explanations (e.g., 'ARTÍCULO' -> 'articleCode': normalized to camelCase)"),
|
|
14
|
+
schemaJson: z.string().describe("Complete JSON schema as string describing ONE RECORD. Must be type 'object' with properties. All properties must be in lowercaseCamelCase and have descriptions. Do NOT use type 'array' at root level."),
|
|
18
15
|
}),
|
|
19
16
|
execute: async ({ schemaTitle, schemaDescription, schemaJson, }) => {
|
|
20
17
|
console.log(`[Dataset ${datasetId}] ========================================`);
|
|
@@ -74,8 +71,8 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
|
|
|
74
71
|
console.log(`[Dataset ${datasetId}] Description: ${schemaDescription}`);
|
|
75
72
|
console.log(`[Dataset ${datasetId}] Schema JSON:`);
|
|
76
73
|
console.log(JSON.stringify(parsedSchema, null, 2));
|
|
77
|
-
const updateResult = await
|
|
78
|
-
|
|
74
|
+
const updateResult = await datasetUpdateSchemaStep({
|
|
75
|
+
runtime,
|
|
79
76
|
datasetId,
|
|
80
77
|
schema: schemaData,
|
|
81
78
|
status: "schema_complete",
|
|
@@ -107,4 +104,3 @@ function createGenerateSchemaTool({ datasetId, isNested, fileId, env }) {
|
|
|
107
104
|
},
|
|
108
105
|
});
|
|
109
106
|
}
|
|
110
|
-
//# sourceMappingURL=generateSchema.tool.js.map
|
package/dist/file/index.d.ts
CHANGED
|
@@ -1,2 +1 @@
|
|
|
1
|
-
export * from "./file-dataset.agent";
|
|
2
|
-
//# sourceMappingURL=index.d.ts.map
|
|
1
|
+
export * from "./file-dataset.agent.js";
|
package/dist/file/index.js
CHANGED
|
@@ -1,18 +1 @@
|
|
|
1
|
-
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
-
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
-
};
|
|
16
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
__exportStar(require("./file-dataset.agent"), exports);
|
|
18
|
-
//# sourceMappingURL=index.js.map
|
|
1
|
+
export * from "./file-dataset.agent.js";
|
package/dist/file/prompts.d.ts
CHANGED
|
@@ -1,3 +1,2 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare function buildFileDatasetPrompt(context:
|
|
3
|
-
//# sourceMappingURL=prompts.d.ts.map
|
|
1
|
+
import type { FileParseContext } from "./file-dataset.types.js";
|
|
2
|
+
export declare function buildFileDatasetPrompt(context: FileParseContext): string;
|
package/dist/file/prompts.js
CHANGED
|
@@ -1,26 +1,23 @@
|
|
|
1
|
-
"use strict";
|
|
2
1
|
// Plain build API using template literals and XML
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
const xmlbuilder2_1 = require("xmlbuilder2");
|
|
6
|
-
const datasetFiles_1 = require("../datasetFiles");
|
|
2
|
+
import { create } from "xmlbuilder2";
|
|
3
|
+
import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
|
|
7
4
|
function buildRole() {
|
|
8
|
-
let xml =
|
|
5
|
+
let xml = create()
|
|
9
6
|
.ele("Role")
|
|
10
7
|
.txt("You are a dataset creator for a SINGLE file. Your goal is to convert the file content into a validated JSONL dataset where each line represents one record.")
|
|
11
8
|
.up();
|
|
12
9
|
return xml.end({ prettyPrint: true, headless: true });
|
|
13
10
|
}
|
|
14
11
|
function buildGoal() {
|
|
15
|
-
let xml =
|
|
12
|
+
let xml = create()
|
|
16
13
|
.ele("Goal")
|
|
17
|
-
.txt("Convert the
|
|
14
|
+
.txt("Convert the input file into a validated JSONL dataset (output.jsonl) where each line is a JSON object conforming to a generated schema. The schema describes ONE data record structure. Extract ONLY data records; exclude any header sections, metadata, or summary information from the file.")
|
|
18
15
|
.up();
|
|
19
16
|
return xml.end({ prettyPrint: true, headless: true });
|
|
20
17
|
}
|
|
21
|
-
function
|
|
22
|
-
let xml =
|
|
23
|
-
.ele("
|
|
18
|
+
function buildResourceInfo(context) {
|
|
19
|
+
let xml = create()
|
|
20
|
+
.ele("FileResource")
|
|
24
21
|
.ele("Type").txt("file").up()
|
|
25
22
|
.ele("FileId").txt(context.fileId).up()
|
|
26
23
|
.ele("DatasetId").txt(context.datasetId).up()
|
|
@@ -29,7 +26,7 @@ function buildSourceInfo(context) {
|
|
|
29
26
|
return xml;
|
|
30
27
|
}
|
|
31
28
|
function buildFilePreviewSection(preview) {
|
|
32
|
-
let xml =
|
|
29
|
+
let xml = create()
|
|
33
30
|
.ele("FilePreview")
|
|
34
31
|
.ele("TotalRows").txt(String(preview.totalRows)).up();
|
|
35
32
|
if (preview.metadata) {
|
|
@@ -91,8 +88,9 @@ function buildErrorsSection(errors) {
|
|
|
91
88
|
if (errors.length === 0) {
|
|
92
89
|
return null;
|
|
93
90
|
}
|
|
94
|
-
let xml =
|
|
95
|
-
.ele("PreviousErrors")
|
|
91
|
+
let xml = create()
|
|
92
|
+
.ele("PreviousErrors")
|
|
93
|
+
.ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch input column names into schema keys piecemeal.").up();
|
|
96
94
|
for (const error of errors) {
|
|
97
95
|
xml = xml.ele("Error").txt(error).up();
|
|
98
96
|
}
|
|
@@ -100,10 +98,10 @@ function buildErrorsSection(errors) {
|
|
|
100
98
|
return xml;
|
|
101
99
|
}
|
|
102
100
|
function buildContextSection(context) {
|
|
103
|
-
let xml =
|
|
101
|
+
let xml = create()
|
|
104
102
|
.ele("Context");
|
|
105
|
-
const
|
|
106
|
-
xml = xml.import(
|
|
103
|
+
const resourceXml = buildResourceInfo(context);
|
|
104
|
+
xml = xml.import(resourceXml.first());
|
|
107
105
|
if (context.filePreview) {
|
|
108
106
|
const previewXml = buildFilePreviewSection(context.filePreview);
|
|
109
107
|
xml = xml.import(previewXml.first());
|
|
@@ -117,27 +115,123 @@ function buildContextSection(context) {
|
|
|
117
115
|
xml = xml.up();
|
|
118
116
|
return xml.end({ prettyPrint: true, headless: true });
|
|
119
117
|
}
|
|
118
|
+
function asRecord(value) {
|
|
119
|
+
return value && typeof value === "object" && !Array.isArray(value)
|
|
120
|
+
? value
|
|
121
|
+
: null;
|
|
122
|
+
}
|
|
123
|
+
function getSchemaObject(context) {
|
|
124
|
+
return asRecord(context.schema?.schema);
|
|
125
|
+
}
|
|
126
|
+
function joinSchemaPath(basePath, key) {
|
|
127
|
+
return basePath === "$" ? `$.${key}` : `${basePath}.${key}`;
|
|
128
|
+
}
|
|
129
|
+
function collectSchemaContract(schema, path = "$", contract = {
|
|
130
|
+
requiredPaths: [],
|
|
131
|
+
propertyPaths: [],
|
|
132
|
+
enumConstraints: [],
|
|
133
|
+
closedObjectPaths: [],
|
|
134
|
+
}) {
|
|
135
|
+
const record = asRecord(schema);
|
|
136
|
+
if (!record) {
|
|
137
|
+
return contract;
|
|
138
|
+
}
|
|
139
|
+
if (Array.isArray(record.enum)) {
|
|
140
|
+
contract.enumConstraints.push({
|
|
141
|
+
path,
|
|
142
|
+
values: record.enum.map((value) => JSON.stringify(value)),
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
const properties = asRecord(record.properties);
|
|
146
|
+
if (properties) {
|
|
147
|
+
if (record.additionalProperties === false) {
|
|
148
|
+
contract.closedObjectPaths.push(path);
|
|
149
|
+
}
|
|
150
|
+
const required = Array.isArray(record.required)
|
|
151
|
+
? record.required.filter((value) => typeof value === "string")
|
|
152
|
+
: [];
|
|
153
|
+
for (const key of required) {
|
|
154
|
+
contract.requiredPaths.push(joinSchemaPath(path, key));
|
|
155
|
+
}
|
|
156
|
+
for (const [key, childSchema] of Object.entries(properties)) {
|
|
157
|
+
const childPath = joinSchemaPath(path, key);
|
|
158
|
+
contract.propertyPaths.push(childPath);
|
|
159
|
+
collectSchemaContract(childSchema, childPath, contract);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
if (record.items) {
|
|
163
|
+
collectSchemaContract(record.items, `${path}[]`, contract);
|
|
164
|
+
}
|
|
165
|
+
for (const keyword of ["oneOf", "anyOf", "allOf"]) {
|
|
166
|
+
if (Array.isArray(record[keyword])) {
|
|
167
|
+
for (const childSchema of record[keyword]) {
|
|
168
|
+
collectSchemaContract(childSchema, path, contract);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
return contract;
|
|
173
|
+
}
|
|
174
|
+
function appendLimitedList(xml, elementName, itemName, values, maxItems) {
|
|
175
|
+
let node = xml.ele(elementName);
|
|
176
|
+
for (const value of values.slice(0, maxItems)) {
|
|
177
|
+
node = node.ele(itemName).txt(value).up();
|
|
178
|
+
}
|
|
179
|
+
if (values.length > maxItems) {
|
|
180
|
+
node = node.ele("Truncated").txt(String(values.length - maxItems)).up();
|
|
181
|
+
}
|
|
182
|
+
return node.up();
|
|
183
|
+
}
|
|
120
184
|
function buildSchemaSection(context) {
|
|
121
|
-
|
|
185
|
+
const schema = getSchemaObject(context);
|
|
186
|
+
if (!context.schema || !schema) {
|
|
122
187
|
return "";
|
|
123
188
|
}
|
|
124
|
-
|
|
189
|
+
const contract = collectSchemaContract(schema);
|
|
190
|
+
let xml = create()
|
|
125
191
|
.com("Schema section: This defines the structure of ONE RECORD (row). Each line in the JSONL output must conform to this schema.")
|
|
126
192
|
.ele("Schema")
|
|
127
193
|
.ele("Title").txt(context.schema.title || "").up()
|
|
128
|
-
.ele("Description").txt(context.schema.description || "").up()
|
|
129
|
-
|
|
194
|
+
.ele("Description").txt(context.schema.description || "").up();
|
|
195
|
+
xml = xml
|
|
196
|
+
.ele("SchemaContract")
|
|
197
|
+
.ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
|
|
198
|
+
.ele("Rule").txt("Use only schema property keys in data objects. Input headers are input labels, not output keys.").up()
|
|
199
|
+
.ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
|
|
200
|
+
.ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize input labels to the closest valid enum literal; never emit a value outside the enum.").up();
|
|
201
|
+
xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
|
|
202
|
+
xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
|
|
203
|
+
let enumsXml = xml.ele("EnumConstraints");
|
|
204
|
+
for (const constraint of contract.enumConstraints.slice(0, 80)) {
|
|
205
|
+
let enumXml = enumsXml.ele("Enum", { path: constraint.path });
|
|
206
|
+
for (const value of constraint.values.slice(0, 80)) {
|
|
207
|
+
enumXml = enumXml.ele("Value").txt(value).up();
|
|
208
|
+
}
|
|
209
|
+
if (constraint.values.length > 80) {
|
|
210
|
+
enumXml = enumXml.ele("Truncated").txt(String(constraint.values.length - 80)).up();
|
|
211
|
+
}
|
|
212
|
+
enumsXml = enumXml.up();
|
|
213
|
+
}
|
|
214
|
+
if (contract.enumConstraints.length > 80) {
|
|
215
|
+
enumsXml = enumsXml.ele("Truncated").txt(String(contract.enumConstraints.length - 80)).up();
|
|
216
|
+
}
|
|
217
|
+
xml = enumsXml.up();
|
|
218
|
+
xml = appendLimitedList(xml, "ClosedObjectPaths", "Path", contract.closedObjectPaths, 80);
|
|
219
|
+
xml = xml
|
|
220
|
+
.up()
|
|
221
|
+
.ele("JsonSchema").txt(JSON.stringify(schema, null, 2)).up()
|
|
130
222
|
.up();
|
|
131
223
|
return xml.end({ prettyPrint: true, headless: true });
|
|
132
224
|
}
|
|
133
225
|
function buildInstructions(context) {
|
|
134
|
-
const datasetWorkstation =
|
|
135
|
-
|
|
226
|
+
const datasetWorkstation = context.sandboxConfig.scriptsDir
|
|
227
|
+
? context.sandboxConfig.scriptsDir.replace(/\/scripts$/, "")
|
|
228
|
+
: getDatasetWorkstation(context.datasetId);
|
|
229
|
+
const outputPath = context.sandboxConfig.outputPath ?? getDatasetOutputPath(context.datasetId);
|
|
136
230
|
const hasProvidedSchema = Boolean(context.schema?.schema);
|
|
137
231
|
const currentTask = hasProvidedSchema
|
|
138
232
|
? "Review FilePreview section, use the provided schema as the output contract, then parse the file and generate the dataset"
|
|
139
233
|
: "Review FilePreview section to understand file structure, then generate JSON Schema for a SINGLE RECORD, then parse the file and generate the dataset";
|
|
140
|
-
let xml =
|
|
234
|
+
let xml = create()
|
|
141
235
|
.ele("Instructions")
|
|
142
236
|
.ele("Workflow")
|
|
143
237
|
.ele("Step", { number: "1", name: "Inspect File" })
|
|
@@ -150,6 +244,11 @@ function buildInstructions(context) {
|
|
|
150
244
|
.ele("Action").txt("Use the provided schema as the output contract for every row in output.jsonl").up()
|
|
151
245
|
.ele("Requirements")
|
|
152
246
|
.ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
|
|
247
|
+
.ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
|
|
248
|
+
.ele("Requirement").txt("Build a schema-first mapping from input columns to schema fields before writing output.jsonl. Do not use raw input headers as JSON keys unless they are exactly schema keys").up()
|
|
249
|
+
.ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
|
|
250
|
+
.ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
|
|
251
|
+
.ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the input language").up()
|
|
153
252
|
.ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
|
|
154
253
|
.up()
|
|
155
254
|
.up();
|
|
@@ -173,6 +272,8 @@ function buildInstructions(context) {
|
|
|
173
272
|
.ele("Requirements")
|
|
174
273
|
.ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
|
|
175
274
|
.ele("Requirement").txt("Output JSONL format: each line is {\"type\": \"row\", \"data\": {...record...}}").up()
|
|
275
|
+
.ele("Requirement").txt("When a schema is provided, each data object must contain the exact required schema keys and must not use translated or synonymous keys").up()
|
|
276
|
+
.ele("Requirement").txt("When validation returns zero valid rows, treat the previous output as structurally wrong and rewrite output.jsonl from the SchemaContract, not by applying small patches").up()
|
|
176
277
|
.ele("Requirement").txt("Extract ONLY data records; skip any header lines, summary sections, or file metadata").up()
|
|
177
278
|
.ele("Requirement").txt(`Save output to: ${outputPath}`).up()
|
|
178
279
|
.ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
|
|
@@ -180,11 +281,13 @@ function buildInstructions(context) {
|
|
|
180
281
|
.up()
|
|
181
282
|
.ele("Step", { number: "4", name: "Complete and Validate" })
|
|
182
283
|
.ele("Action").txt("Call completeDataset to validate the dataset").up()
|
|
183
|
-
.ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns
|
|
284
|
+
.ele("Behavior").txt("Validates that output.jsonl exists and all records conform to the schema stored in database. Returns success:false with validation details if validation fails. If validation fails, inspect validation errors, rewrite output.jsonl, and call completeDataset again. Do not stop until completeDataset returns success:true.").up()
|
|
184
285
|
.up()
|
|
185
286
|
.up()
|
|
186
287
|
.ele("Rules")
|
|
187
288
|
.ele("Rule").txt("Schema defines ONE DATA RECORD structure (not array, not header)").up()
|
|
289
|
+
.ele("Rule").txt("Schema property names are authoritative. Never translate or rename keys such as itemName, quantity, or unit into the input language").up()
|
|
290
|
+
.ele("Rule").txt("Original/input language applies to extracted values only, not to JSON object keys").up()
|
|
188
291
|
.ele("Rule").txt("Datasets contain ONLY data records; exclude all header sections and file metadata").up()
|
|
189
292
|
.ele("Rule").txt("JSONL format: each line = separate JSON object representing one data record").up()
|
|
190
293
|
.ele("Rule").txt("FilePreview shows raw file content - use Script to understand data extraction").up()
|
|
@@ -197,7 +300,7 @@ function buildInstructions(context) {
|
|
|
197
300
|
.up();
|
|
198
301
|
return xml.end({ prettyPrint: true, headless: true });
|
|
199
302
|
}
|
|
200
|
-
function buildFileDatasetPrompt(context) {
|
|
303
|
+
export function buildFileDatasetPrompt(context) {
|
|
201
304
|
const sections = [];
|
|
202
305
|
sections.push(buildRole());
|
|
203
306
|
sections.push("");
|
|
@@ -205,7 +308,11 @@ function buildFileDatasetPrompt(context) {
|
|
|
205
308
|
sections.push("");
|
|
206
309
|
sections.push(buildContextSection(context));
|
|
207
310
|
sections.push("");
|
|
311
|
+
const schemaSection = buildSchemaSection(context);
|
|
312
|
+
if (schemaSection) {
|
|
313
|
+
sections.push(schemaSection);
|
|
314
|
+
sections.push("");
|
|
315
|
+
}
|
|
208
316
|
sections.push(buildInstructions(context));
|
|
209
317
|
return sections.join("\n");
|
|
210
318
|
}
|
|
211
|
-
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare const PYTHON_SCRIPT_BASE64_BY_NAME: Readonly<Record<string, string>>;
|