@ekairos/dataset 1.22.79-beta.development.0 → 1.22.81-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.d.ts +77 -1
- package/dist/builder/materialize.js +212 -60
- package/dist/builder/persistence.d.ts +6 -0
- package/dist/builder/persistence.js +22 -0
- package/dist/completeDataset.steps.d.ts +87 -0
- package/dist/completeDataset.steps.js +449 -0
- package/dist/completeDataset.tool.d.ts +53 -2
- package/dist/completeDataset.tool.js +4 -262
- package/dist/dataset/steps.d.ts +1 -0
- package/dist/dataset/steps.js +12 -12
- package/dist/dataset.js +16 -4
- package/dist/datasetFiles.d.ts +5 -0
- package/dist/datasetFiles.js +23 -14
- package/dist/executeCommand.tool.js +2 -3
- package/dist/file/file-dataset.agent.d.ts +4 -1
- package/dist/file/file-dataset.agent.js +30 -18
- package/dist/file/file-dataset.steps.js +5 -17
- package/dist/file/file-dataset.types.d.ts +4 -0
- package/dist/file/filepreview.d.ts +1 -2
- package/dist/file/filepreview.js +9 -118
- package/dist/file/prompts.js +108 -4
- package/dist/transform/filepreview.js +3 -16
- package/dist/transform/transform-dataset.agent.d.ts +6 -1
- package/dist/transform/transform-dataset.agent.js +30 -15
- package/dist/transform/transform-dataset.steps.js +3 -4
- package/dist/transform/transform-dataset.types.d.ts +6 -0
- package/package.json +4 -4
|
@@ -26,10 +26,14 @@ export type FileParseContextParams = {
|
|
|
26
26
|
datasetId?: string;
|
|
27
27
|
model?: string;
|
|
28
28
|
reactor?: ContextReactor<any, any>;
|
|
29
|
+
sandboxState?: SandboxState;
|
|
30
|
+
filePreview?: FilePreviewContext;
|
|
31
|
+
schema?: any | null;
|
|
29
32
|
};
|
|
30
33
|
export type FileParseRunOptions = {
|
|
31
34
|
prompt?: string;
|
|
32
35
|
durable?: boolean;
|
|
36
|
+
initialContent?: Record<string, any>;
|
|
33
37
|
};
|
|
34
38
|
export type FileParseContextBuilder<Env extends {
|
|
35
39
|
orgId: string;
|
|
@@ -5,7 +5,6 @@ interface PreviewOptions {
|
|
|
5
5
|
tailLines?: number;
|
|
6
6
|
midLines?: number;
|
|
7
7
|
}
|
|
8
|
-
export declare function resolveFilePreviewScriptPath(scriptName: string): string;
|
|
9
8
|
export declare function getEmbeddedFilePreviewScriptBase64(scriptName: string): string;
|
|
10
|
-
export declare function ensurePreviewScriptsAvailable(
|
|
9
|
+
export declare function ensurePreviewScriptsAvailable(_runtime: any, _sandboxId: string): Promise<void>;
|
|
11
10
|
export declare function generateFilePreview(runtime: any, sandboxId: string, sandboxFilePath: string, datasetId: string, options?: PreviewOptions): Promise<FilePreviewContext>;
|
package/dist/file/filepreview.js
CHANGED
|
@@ -1,40 +1,8 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { dirname, join } from "node:path";
|
|
3
|
-
import { fileURLToPath } from "node:url";
|
|
4
|
-
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
1
|
+
import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
|
|
5
2
|
import { PYTHON_SCRIPT_BASE64_BY_NAME } from "./scripts.generated.js";
|
|
6
3
|
const DEFAULT_HEAD_LINES = 50;
|
|
7
4
|
const DEFAULT_TAIL_LINES = 20;
|
|
8
5
|
const DEFAULT_MID_LINES = 20;
|
|
9
|
-
const SANDBOX_SCRIPT_DIRECTORY = "/tmp/ekairos/dataset/file/scripts";
|
|
10
|
-
const PYTHON_SCRIPT_FILES = [
|
|
11
|
-
"file_metadata.py",
|
|
12
|
-
"preview_head_csv.py",
|
|
13
|
-
"preview_head_excel.py",
|
|
14
|
-
"preview_mid_csv.py",
|
|
15
|
-
"preview_mid_excel.py",
|
|
16
|
-
"preview_tail_csv.py",
|
|
17
|
-
"preview_tail_excel.py",
|
|
18
|
-
];
|
|
19
|
-
export function resolveFilePreviewScriptPath(scriptName) {
|
|
20
|
-
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
21
|
-
const taskRoot = String(process.env.LAMBDA_TASK_ROOT ?? "").trim();
|
|
22
|
-
const candidates = [
|
|
23
|
-
join(currentDir, "scripts", scriptName),
|
|
24
|
-
join(process.cwd(), "node_modules", "@ekairos", "dataset", "dist", "file", "scripts", scriptName),
|
|
25
|
-
taskRoot
|
|
26
|
-
? join(taskRoot, "node_modules", "@ekairos", "dataset", "dist", "file", "scripts", scriptName)
|
|
27
|
-
: "",
|
|
28
|
-
join(process.cwd(), "packages", "dataset", "dist", "file", "scripts", scriptName),
|
|
29
|
-
join(process.cwd(), "packages", "dataset", "src", "file", "scripts", scriptName),
|
|
30
|
-
].filter(Boolean);
|
|
31
|
-
for (const candidate of candidates) {
|
|
32
|
-
if (existsSync(candidate)) {
|
|
33
|
-
return candidate;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
throw new Error(`dataset_preview_script_not_found:${scriptName}; searched=${candidates.join(",")}`);
|
|
37
|
-
}
|
|
38
6
|
export function getEmbeddedFilePreviewScriptBase64(scriptName) {
|
|
39
7
|
const embedded = PYTHON_SCRIPT_BASE64_BY_NAME[scriptName];
|
|
40
8
|
if (!embedded) {
|
|
@@ -42,31 +10,9 @@ export function getEmbeddedFilePreviewScriptBase64(scriptName) {
|
|
|
42
10
|
}
|
|
43
11
|
return embedded;
|
|
44
12
|
}
|
|
45
|
-
function readFilePreviewScriptBase64(scriptName) {
|
|
46
|
-
try {
|
|
47
|
-
const scriptPath = resolveFilePreviewScriptPath(scriptName);
|
|
48
|
-
return Buffer.from(readFileSync(scriptPath)).toString("base64");
|
|
49
|
-
}
|
|
50
|
-
catch (error) {
|
|
51
|
-
try {
|
|
52
|
-
return getEmbeddedFilePreviewScriptBase64(scriptName);
|
|
53
|
-
}
|
|
54
|
-
catch {
|
|
55
|
-
throw error;
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
13
|
function readFilePreviewScriptText(scriptName) {
|
|
60
|
-
|
|
61
|
-
const scriptPath = resolveFilePreviewScriptPath(scriptName);
|
|
62
|
-
return readFileSync(scriptPath, "utf-8");
|
|
63
|
-
}
|
|
64
|
-
catch {
|
|
65
|
-
return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
|
|
66
|
-
}
|
|
14
|
+
return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
|
|
67
15
|
}
|
|
68
|
-
const preparedSandboxIds = new Set();
|
|
69
|
-
const sandboxSetupPromises = new Map();
|
|
70
16
|
function sanitizePreviewText(value) {
|
|
71
17
|
return String(value ?? "")
|
|
72
18
|
.replace(/\u0000/g, "")
|
|
@@ -99,65 +45,16 @@ function validateScriptResult(result, context) {
|
|
|
99
45
|
throw new Error(`${context} failed: ${stderr.substring(0, 500)}`);
|
|
100
46
|
}
|
|
101
47
|
}
|
|
102
|
-
export async function ensurePreviewScriptsAvailable(
|
|
103
|
-
|
|
104
|
-
return;
|
|
105
|
-
}
|
|
106
|
-
const inFlight = sandboxSetupPromises.get(sandboxId);
|
|
107
|
-
if (inFlight) {
|
|
108
|
-
await inFlight;
|
|
109
|
-
return;
|
|
110
|
-
}
|
|
111
|
-
const setupPromise = (async () => {
|
|
112
|
-
try {
|
|
113
|
-
await runDatasetSandboxCommandStep({
|
|
114
|
-
runtime,
|
|
115
|
-
sandboxId,
|
|
116
|
-
cmd: "mkdir",
|
|
117
|
-
args: ["-p", SANDBOX_SCRIPT_DIRECTORY],
|
|
118
|
-
});
|
|
119
|
-
}
|
|
120
|
-
catch (error) {
|
|
121
|
-
console.warn("[Dataset Scripts] Failed to create sandbox scripts directory", error);
|
|
122
|
-
}
|
|
123
|
-
const filesToWrite = [];
|
|
124
|
-
for (const scriptName of PYTHON_SCRIPT_FILES) {
|
|
125
|
-
try {
|
|
126
|
-
filesToWrite.push({
|
|
127
|
-
path: `${SANDBOX_SCRIPT_DIRECTORY}/${scriptName}`,
|
|
128
|
-
contentBase64: readFilePreviewScriptBase64(scriptName),
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
catch (error) {
|
|
132
|
-
console.error(`[Dataset Scripts] Failed to read script ${scriptName}`, error);
|
|
133
|
-
throw error;
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
if (filesToWrite.length > 0) {
|
|
137
|
-
await writeDatasetSandboxFilesStep({
|
|
138
|
-
runtime,
|
|
139
|
-
sandboxId,
|
|
140
|
-
files: filesToWrite,
|
|
141
|
-
});
|
|
142
|
-
}
|
|
143
|
-
})();
|
|
144
|
-
sandboxSetupPromises.set(sandboxId, setupPromise);
|
|
145
|
-
try {
|
|
146
|
-
await setupPromise;
|
|
147
|
-
preparedSandboxIds.add(sandboxId);
|
|
148
|
-
}
|
|
149
|
-
catch (error) {
|
|
150
|
-
sandboxSetupPromises.delete(sandboxId);
|
|
151
|
-
throw error;
|
|
152
|
-
}
|
|
48
|
+
export async function ensurePreviewScriptsAvailable(_runtime, _sandboxId) {
|
|
49
|
+
return;
|
|
153
50
|
}
|
|
154
51
|
export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId, options = {}) {
|
|
155
52
|
const context = {
|
|
156
53
|
totalRows: 0,
|
|
157
54
|
};
|
|
158
55
|
try {
|
|
159
|
-
await ensurePreviewScriptsAvailable(runtime, sandboxId);
|
|
160
56
|
const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
|
|
57
|
+
validateScriptResult(metadataResult, `preview_metadata for ${datasetId}`);
|
|
161
58
|
context.metadata = metadataResult;
|
|
162
59
|
let previewKind = null;
|
|
163
60
|
if (metadataResult.stdout) {
|
|
@@ -219,25 +116,19 @@ export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, d
|
|
|
219
116
|
}
|
|
220
117
|
catch (error) {
|
|
221
118
|
console.error(`[Dataset ${datasetId}] Error generating file preview:`, error);
|
|
119
|
+
throw error;
|
|
222
120
|
}
|
|
223
121
|
return context;
|
|
224
122
|
}
|
|
225
123
|
async function runScript(runtime, sandboxId, scriptName, args, description) {
|
|
226
|
-
const
|
|
227
|
-
const command = `python
|
|
228
|
-
let scriptContent = "";
|
|
229
|
-
try {
|
|
230
|
-
scriptContent = readFilePreviewScriptText(scriptName);
|
|
231
|
-
}
|
|
232
|
-
catch (error) {
|
|
233
|
-
console.warn(`Failed to read script ${scriptName}:`, error);
|
|
234
|
-
}
|
|
124
|
+
const scriptContent = readFilePreviewScriptText(scriptName);
|
|
125
|
+
const command = `python -c <${scriptName}> ${args.join(" ")}`;
|
|
235
126
|
try {
|
|
236
127
|
const result = await runDatasetSandboxCommandStep({
|
|
237
128
|
runtime,
|
|
238
129
|
sandboxId,
|
|
239
130
|
cmd: "python",
|
|
240
|
-
args: [
|
|
131
|
+
args: ["-c", scriptContent, ...args],
|
|
241
132
|
});
|
|
242
133
|
return {
|
|
243
134
|
description,
|
package/dist/file/prompts.js
CHANGED
|
@@ -89,7 +89,8 @@ function buildErrorsSection(errors) {
|
|
|
89
89
|
return null;
|
|
90
90
|
}
|
|
91
91
|
let xml = create()
|
|
92
|
-
.ele("PreviousErrors")
|
|
92
|
+
.ele("PreviousErrors")
|
|
93
|
+
.ele("Instruction").txt("Treat these as repair feedback from the previous validation attempt. Rewrite output.jsonl from the schema contract; do not patch source column names into schema keys piecemeal.").up();
|
|
93
94
|
for (const error of errors) {
|
|
94
95
|
xml = xml.ele("Error").txt(error).up();
|
|
95
96
|
}
|
|
@@ -114,16 +115,110 @@ function buildContextSection(context) {
|
|
|
114
115
|
xml = xml.up();
|
|
115
116
|
return xml.end({ prettyPrint: true, headless: true });
|
|
116
117
|
}
|
|
118
|
+
function asRecord(value) {
|
|
119
|
+
return value && typeof value === "object" && !Array.isArray(value)
|
|
120
|
+
? value
|
|
121
|
+
: null;
|
|
122
|
+
}
|
|
123
|
+
function getSchemaObject(context) {
|
|
124
|
+
return asRecord(context.schema?.schema);
|
|
125
|
+
}
|
|
126
|
+
function joinSchemaPath(basePath, key) {
|
|
127
|
+
return basePath === "$" ? `$.${key}` : `${basePath}.${key}`;
|
|
128
|
+
}
|
|
129
|
+
function collectSchemaContract(schema, path = "$", contract = {
|
|
130
|
+
requiredPaths: [],
|
|
131
|
+
propertyPaths: [],
|
|
132
|
+
enumConstraints: [],
|
|
133
|
+
closedObjectPaths: [],
|
|
134
|
+
}) {
|
|
135
|
+
const record = asRecord(schema);
|
|
136
|
+
if (!record) {
|
|
137
|
+
return contract;
|
|
138
|
+
}
|
|
139
|
+
if (Array.isArray(record.enum)) {
|
|
140
|
+
contract.enumConstraints.push({
|
|
141
|
+
path,
|
|
142
|
+
values: record.enum.map((value) => JSON.stringify(value)),
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
const properties = asRecord(record.properties);
|
|
146
|
+
if (properties) {
|
|
147
|
+
if (record.additionalProperties === false) {
|
|
148
|
+
contract.closedObjectPaths.push(path);
|
|
149
|
+
}
|
|
150
|
+
const required = Array.isArray(record.required)
|
|
151
|
+
? record.required.filter((value) => typeof value === "string")
|
|
152
|
+
: [];
|
|
153
|
+
for (const key of required) {
|
|
154
|
+
contract.requiredPaths.push(joinSchemaPath(path, key));
|
|
155
|
+
}
|
|
156
|
+
for (const [key, childSchema] of Object.entries(properties)) {
|
|
157
|
+
const childPath = joinSchemaPath(path, key);
|
|
158
|
+
contract.propertyPaths.push(childPath);
|
|
159
|
+
collectSchemaContract(childSchema, childPath, contract);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
if (record.items) {
|
|
163
|
+
collectSchemaContract(record.items, `${path}[]`, contract);
|
|
164
|
+
}
|
|
165
|
+
for (const keyword of ["oneOf", "anyOf", "allOf"]) {
|
|
166
|
+
if (Array.isArray(record[keyword])) {
|
|
167
|
+
for (const childSchema of record[keyword]) {
|
|
168
|
+
collectSchemaContract(childSchema, path, contract);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
return contract;
|
|
173
|
+
}
|
|
174
|
+
function appendLimitedList(xml, elementName, itemName, values, maxItems) {
|
|
175
|
+
let node = xml.ele(elementName);
|
|
176
|
+
for (const value of values.slice(0, maxItems)) {
|
|
177
|
+
node = node.ele(itemName).txt(value).up();
|
|
178
|
+
}
|
|
179
|
+
if (values.length > maxItems) {
|
|
180
|
+
node = node.ele("Truncated").txt(String(values.length - maxItems)).up();
|
|
181
|
+
}
|
|
182
|
+
return node.up();
|
|
183
|
+
}
|
|
117
184
|
function buildSchemaSection(context) {
|
|
118
|
-
|
|
185
|
+
const schema = getSchemaObject(context);
|
|
186
|
+
if (!context.schema || !schema) {
|
|
119
187
|
return "";
|
|
120
188
|
}
|
|
189
|
+
const contract = collectSchemaContract(schema);
|
|
121
190
|
let xml = create()
|
|
122
191
|
.com("Schema section: This defines the structure of ONE RECORD (row). Each line in the JSONL output must conform to this schema.")
|
|
123
192
|
.ele("Schema")
|
|
124
193
|
.ele("Title").txt(context.schema.title || "").up()
|
|
125
|
-
.ele("Description").txt(context.schema.description || "").up()
|
|
126
|
-
|
|
194
|
+
.ele("Description").txt(context.schema.description || "").up();
|
|
195
|
+
xml = xml
|
|
196
|
+
.ele("SchemaContract")
|
|
197
|
+
.ele("Purpose").txt("Compact output contract derived from JSON Schema. Use this before writing output.jsonl.").up()
|
|
198
|
+
.ele("Rule").txt("Use only schema property keys in data objects. Source headers are input labels, not output keys.").up()
|
|
199
|
+
.ele("Rule").txt("Required paths are required everywhere, including nested objects and array items.").up()
|
|
200
|
+
.ele("Rule").txt("Enum fields must use exactly one of the listed literal values. Normalize source labels to the closest valid enum literal; never emit a value outside the enum.").up();
|
|
201
|
+
xml = appendLimitedList(xml, "RequiredPaths", "Path", contract.requiredPaths, 120);
|
|
202
|
+
xml = appendLimitedList(xml, "PropertyPaths", "Path", contract.propertyPaths, 160);
|
|
203
|
+
let enumsXml = xml.ele("EnumConstraints");
|
|
204
|
+
for (const constraint of contract.enumConstraints.slice(0, 80)) {
|
|
205
|
+
let enumXml = enumsXml.ele("Enum", { path: constraint.path });
|
|
206
|
+
for (const value of constraint.values.slice(0, 80)) {
|
|
207
|
+
enumXml = enumXml.ele("Value").txt(value).up();
|
|
208
|
+
}
|
|
209
|
+
if (constraint.values.length > 80) {
|
|
210
|
+
enumXml = enumXml.ele("Truncated").txt(String(constraint.values.length - 80)).up();
|
|
211
|
+
}
|
|
212
|
+
enumsXml = enumXml.up();
|
|
213
|
+
}
|
|
214
|
+
if (contract.enumConstraints.length > 80) {
|
|
215
|
+
enumsXml = enumsXml.ele("Truncated").txt(String(contract.enumConstraints.length - 80)).up();
|
|
216
|
+
}
|
|
217
|
+
xml = enumsXml.up();
|
|
218
|
+
xml = appendLimitedList(xml, "ClosedObjectPaths", "Path", contract.closedObjectPaths, 80);
|
|
219
|
+
xml = xml
|
|
220
|
+
.up()
|
|
221
|
+
.ele("JsonSchema").txt(JSON.stringify(schema, null, 2)).up()
|
|
127
222
|
.up();
|
|
128
223
|
return xml.end({ prettyPrint: true, headless: true });
|
|
129
224
|
}
|
|
@@ -148,6 +243,9 @@ function buildInstructions(context) {
|
|
|
148
243
|
.ele("Requirements")
|
|
149
244
|
.ele("Requirement").txt("Every output row must conform exactly to the provided schema").up()
|
|
150
245
|
.ele("Requirement").txt("Every data object MUST use the exact property names from the provided JSON Schema required/properties keys").up()
|
|
246
|
+
.ele("Requirement").txt("Build a schema-first mapping from source columns to schema fields before writing output.jsonl. Do not use raw source headers as JSON keys unless they are exactly schema keys").up()
|
|
247
|
+
.ele("Requirement").txt("For nested required fields, populate the required child keys inside each nested object or array item; top-level validity is not enough").up()
|
|
248
|
+
.ele("Requirement").txt("For enum fields, emit exactly one allowed enum literal from SchemaContract; normalize labels or abbreviations into allowed literals").up()
|
|
151
249
|
.ele("Requirement").txt("Do not translate, localize, rename, camelize differently, or infer alternative field names. Field names are a technical contract; only field values may preserve the source language").up()
|
|
152
250
|
.ele("Requirement").txt("Do not call generateSchema when a schema is already provided").up()
|
|
153
251
|
.up()
|
|
@@ -173,6 +271,7 @@ function buildInstructions(context) {
|
|
|
173
271
|
.ele("Requirement").txt("Parse ALL data rows/records from the file (exclude header sections and metadata)").up()
|
|
174
272
|
.ele("Requirement").txt("Output JSONL format: each line is {\"type\": \"row\", \"data\": {...record...}}").up()
|
|
175
273
|
.ele("Requirement").txt("When a schema is provided, each data object must contain the exact required schema keys and must not use translated or synonymous keys").up()
|
|
274
|
+
.ele("Requirement").txt("When validation returns zero valid rows, treat the previous output as structurally wrong and rewrite output.jsonl from the SchemaContract, not by applying small patches").up()
|
|
176
275
|
.ele("Requirement").txt("Extract ONLY data records; skip any header lines, summary sections, or file metadata").up()
|
|
177
276
|
.ele("Requirement").txt(`Save output to: ${outputPath}`).up()
|
|
178
277
|
.ele("Requirement").txt("Use descriptive scriptName in snake_case (e.g., 'parse_csv_to_jsonl')").up()
|
|
@@ -207,6 +306,11 @@ export function buildFileDatasetPrompt(context) {
|
|
|
207
306
|
sections.push("");
|
|
208
307
|
sections.push(buildContextSection(context));
|
|
209
308
|
sections.push("");
|
|
309
|
+
const schemaSection = buildSchemaSection(context);
|
|
310
|
+
if (schemaSection) {
|
|
311
|
+
sections.push(schemaSection);
|
|
312
|
+
sections.push("");
|
|
313
|
+
}
|
|
210
314
|
sections.push(buildInstructions(context));
|
|
211
315
|
return sections.join("\n");
|
|
212
316
|
}
|
|
@@ -1,31 +1,18 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
1
|
+
import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
|
|
3
2
|
const DEFAULT_HEAD_LINES = 50;
|
|
4
3
|
async function runPythonSnippet(runtime, sandboxId, datasetId, scriptName, code, args, description) {
|
|
5
|
-
const workstation = getDatasetWorkstation(datasetId);
|
|
6
|
-
const scriptPath = `${workstation}/${scriptName}.py`;
|
|
7
|
-
await writeDatasetSandboxFilesStep({
|
|
8
|
-
runtime,
|
|
9
|
-
sandboxId,
|
|
10
|
-
files: [
|
|
11
|
-
{
|
|
12
|
-
path: scriptPath,
|
|
13
|
-
contentBase64: Buffer.from(code, "utf-8").toString("base64"),
|
|
14
|
-
},
|
|
15
|
-
],
|
|
16
|
-
});
|
|
17
4
|
const result = await runDatasetSandboxCommandStep({
|
|
18
5
|
runtime,
|
|
19
6
|
sandboxId,
|
|
20
7
|
cmd: "python",
|
|
21
|
-
args: [
|
|
8
|
+
args: ["-c", code, ...args],
|
|
22
9
|
});
|
|
23
10
|
const stdout = result.stdout || "";
|
|
24
11
|
const stderr = result.stderr || "";
|
|
25
12
|
return {
|
|
26
13
|
description,
|
|
27
14
|
script: code,
|
|
28
|
-
command: `python
|
|
15
|
+
command: `python -c <${scriptName}.py> ${args.join(" ")}`,
|
|
29
16
|
stdout,
|
|
30
17
|
stderr,
|
|
31
18
|
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { type ContextReactor } from "@ekairos/events";
|
|
2
|
-
import type { TransformDatasetRunOptions } from "./transform-dataset.types.js";
|
|
2
|
+
import type { TransformDatasetRunOptions, TransformSandboxState, TransformSourcePreviewContext } from "./transform-dataset.types.js";
|
|
3
3
|
export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
|
|
4
4
|
export declare function createTransformDatasetContext<Env extends {
|
|
5
5
|
orgId: string;
|
|
@@ -11,6 +11,11 @@ export declare function createTransformDatasetContext<Env extends {
|
|
|
11
11
|
model?: string;
|
|
12
12
|
sandboxId?: string;
|
|
13
13
|
reactor?: ContextReactor<any, any>;
|
|
14
|
+
sandboxState?: TransformSandboxState;
|
|
15
|
+
sourcePreviews?: Array<{
|
|
16
|
+
datasetId: string;
|
|
17
|
+
preview: TransformSourcePreviewContext;
|
|
18
|
+
}>;
|
|
14
19
|
}): {
|
|
15
20
|
datasetId: string;
|
|
16
21
|
transform(runtime: {
|
|
@@ -2,6 +2,7 @@ import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/even
|
|
|
2
2
|
import { createClearDatasetTool } from "../clearDataset.tool.js";
|
|
3
3
|
import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
|
|
4
4
|
import { datasetUpdateSchemaStep } from "../dataset/steps.js";
|
|
5
|
+
import { getDatasetOutputPath } from "../datasetFiles.js";
|
|
5
6
|
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
6
7
|
import { buildTransformDatasetPromptStep, ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "./transform-dataset.steps.js";
|
|
7
8
|
import { createDatasetId } from "../id.js";
|
|
@@ -20,7 +21,8 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
20
21
|
let contextBuilder = createContext("dataset.transform")
|
|
21
22
|
.context(async (stored, _env, runtime) => {
|
|
22
23
|
const previous = stored?.content ?? {};
|
|
23
|
-
const sandboxState = previous?.sandboxState ??
|
|
24
|
+
const sandboxState = previous?.sandboxState ??
|
|
25
|
+
params.sandboxState ?? { initialized: false, sourcePaths: [] };
|
|
24
26
|
const datasetId = previous?.datasetId ?? fallbackDatasetId ?? "";
|
|
25
27
|
const sourceDatasetIds = Array.isArray(previous?.sourceDatasetIds)
|
|
26
28
|
? previous.sourceDatasetIds
|
|
@@ -42,19 +44,28 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
42
44
|
if (!sandboxId) {
|
|
43
45
|
throw new Error("dataset_sandbox_required");
|
|
44
46
|
}
|
|
45
|
-
const initialized =
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
47
|
+
const initialized = sandboxState.initialized && Array.isArray(sandboxState.sourcePaths)
|
|
48
|
+
? {
|
|
49
|
+
sourcePaths: sandboxState.sourcePaths,
|
|
50
|
+
outputPath: previous?.sandboxConfig?.outputPath ?? getDatasetOutputPath(datasetId),
|
|
51
|
+
state: sandboxState,
|
|
52
|
+
}
|
|
53
|
+
: await ensureTransformSourcesInSandboxStep({
|
|
54
|
+
runtime,
|
|
55
|
+
sandboxId,
|
|
56
|
+
datasetId,
|
|
57
|
+
sourceDatasetIds,
|
|
58
|
+
state: sandboxState,
|
|
59
|
+
});
|
|
60
|
+
let sourcePreviews = previous?.sourcePreviews ?? params.sourcePreviews ?? undefined;
|
|
61
|
+
if (!sourcePreviews) {
|
|
62
|
+
sourcePreviews = await generateTransformSourcePreviewsStep({
|
|
63
|
+
runtime,
|
|
64
|
+
sandboxId,
|
|
65
|
+
datasetId,
|
|
66
|
+
sourcePaths: initialized.sourcePaths,
|
|
67
|
+
});
|
|
68
|
+
}
|
|
58
69
|
await datasetUpdateSchemaStep({
|
|
59
70
|
runtime,
|
|
60
71
|
datasetId,
|
|
@@ -155,6 +166,8 @@ export function createTransformDatasetContext(params) {
|
|
|
155
166
|
model: params.model,
|
|
156
167
|
sandboxId: params.sandboxId,
|
|
157
168
|
reactor: params.reactor,
|
|
169
|
+
sandboxState: params.sandboxState,
|
|
170
|
+
sourcePreviews: params.sourcePreviews,
|
|
158
171
|
});
|
|
159
172
|
return {
|
|
160
173
|
datasetId,
|
|
@@ -189,12 +202,14 @@ export function createTransformDatasetContext(params) {
|
|
|
189
202
|
maxModelSteps: 5,
|
|
190
203
|
},
|
|
191
204
|
__initialContent: {
|
|
205
|
+
...(options.initialContent ?? {}),
|
|
192
206
|
datasetId,
|
|
193
207
|
sourceDatasetIds: params.sourceDatasetIds,
|
|
194
208
|
outputSchema: params.outputSchema,
|
|
195
209
|
instructions: params.instructions,
|
|
196
210
|
sandboxId: params.sandboxId ?? "",
|
|
197
|
-
sandboxState: { initialized: false, sourcePaths: [] },
|
|
211
|
+
sandboxState: params.sandboxState ?? { initialized: false, sourcePaths: [] },
|
|
212
|
+
sourcePreviews: params.sourcePreviews,
|
|
198
213
|
},
|
|
199
214
|
});
|
|
200
215
|
await awaitContextRun(shell.run);
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getDatasetOutputPath,
|
|
1
|
+
import { getDatasetOutputPath, getDatasetSourcesDir, getDatasetStandardDirs, } from "../datasetFiles.js";
|
|
2
2
|
import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
|
|
3
3
|
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
4
4
|
import { generateSourcePreview } from "./filepreview.js";
|
|
@@ -12,16 +12,15 @@ export async function ensureTransformSourcesInSandboxStep(params) {
|
|
|
12
12
|
state: params.state,
|
|
13
13
|
};
|
|
14
14
|
}
|
|
15
|
-
const workstation = getDatasetWorkstation(params.datasetId);
|
|
16
15
|
await runDatasetSandboxCommandStep({
|
|
17
16
|
runtime: params.runtime,
|
|
18
17
|
sandboxId: params.sandboxId,
|
|
19
18
|
cmd: "mkdir",
|
|
20
|
-
args: ["-p",
|
|
19
|
+
args: ["-p", ...getDatasetStandardDirs(params.datasetId)],
|
|
21
20
|
});
|
|
22
21
|
const sourcePaths = [];
|
|
23
22
|
for (const sourceDatasetId of params.sourceDatasetIds) {
|
|
24
|
-
const sourcePath = `${
|
|
23
|
+
const sourcePath = `${getDatasetSourcesDir(params.datasetId)}/source_${sourceDatasetId}.jsonl`;
|
|
25
24
|
const source = await datasetReadOutputJsonlStep({
|
|
26
25
|
runtime: params.runtime,
|
|
27
26
|
datasetId: sourceDatasetId,
|
|
@@ -35,10 +35,16 @@ export type TransformDatasetAgentParams = {
|
|
|
35
35
|
model?: string;
|
|
36
36
|
sandboxId?: string;
|
|
37
37
|
reactor?: ContextReactor<any, any>;
|
|
38
|
+
sandboxState?: TransformSandboxState;
|
|
39
|
+
sourcePreviews?: Array<{
|
|
40
|
+
datasetId: string;
|
|
41
|
+
preview: TransformSourcePreviewContext;
|
|
42
|
+
}>;
|
|
38
43
|
};
|
|
39
44
|
export type TransformDatasetRunOptions = {
|
|
40
45
|
prompt?: string;
|
|
41
46
|
durable?: boolean;
|
|
47
|
+
initialContent?: Record<string, any>;
|
|
42
48
|
};
|
|
43
49
|
export type TransformDatasetResult = {
|
|
44
50
|
id: string;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.
|
|
3
|
+
"version": "1.22.81-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.81-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.81-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.81-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|