@ekairos/dataset 1.22.55-beta.development.0 → 1.22.57-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/completeDataset.tool.d.ts +5 -0
- package/dist/completeDataset.tool.js +34 -7
- package/dist/dataset/steps.d.ts +1 -1
- package/dist/dataset/steps.js +1 -1
- package/dist/dataset.js +3 -3
- package/dist/executeCommand.tool.d.ts +18 -0
- package/dist/executeCommand.tool.js +49 -7
- package/dist/file/file-dataset.agent.d.ts +3 -56
- package/dist/file/file-dataset.agent.js +42 -86
- package/dist/file/file-dataset.steps.d.ts +21 -0
- package/dist/file/file-dataset.steps.js +62 -0
- package/dist/file/file-dataset.types.d.ts +50 -0
- package/dist/file/file-dataset.types.js +1 -0
- package/dist/file/filepreview.d.ts +2 -32
- package/dist/file/filepreview.types.d.ts +31 -0
- package/dist/file/filepreview.types.js +1 -0
- package/dist/file/prompts.d.ts +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.js +2 -0
- package/dist/sandbox/steps.d.ts +15 -0
- package/dist/sandbox/steps.js +32 -0
- package/dist/transform/index.d.ts +1 -1
- package/dist/transform/index.js +1 -1
- package/dist/transform/prompts.d.ts +1 -33
- package/dist/transform/transform-dataset.agent.d.ts +2 -44
- package/dist/transform/transform-dataset.agent.js +42 -48
- package/dist/transform/transform-dataset.steps.d.ts +30 -0
- package/dist/transform/transform-dataset.steps.js +62 -0
- package/dist/transform/transform-dataset.types.d.ts +86 -0
- package/dist/transform/transform-dataset.types.js +1 -0
- package/package.json +4 -4
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { tool } from "ai";
|
|
2
2
|
import { z } from "zod";
|
|
3
|
-
import { readDatasetSandboxFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps.js";
|
|
3
|
+
import { readDatasetSandboxFileStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep } from "./sandbox/steps.js";
|
|
4
4
|
import Ajv from "ajv";
|
|
5
5
|
import { getDatasetOutputPath, } from "./datasetFiles.js";
|
|
6
6
|
import { datasetGetByIdStep, datasetUpdateStatusStep, datasetUploadOutputFileStep } from "./dataset/steps.js";
|
|
@@ -113,9 +113,12 @@ export function createCompleteDatasetTool({ datasetId, sandboxId, runtime }) {
|
|
|
113
113
|
message: "Empty file content",
|
|
114
114
|
};
|
|
115
115
|
}
|
|
116
|
-
const fileBuffer = Buffer.from(fileRead.contentBase64, "base64");
|
|
117
116
|
console.log(`[Dataset ${datasetId}] Uploading file to InstantDB storage`);
|
|
118
|
-
const uploadResult = await datasetUploadOutputFileStep({
|
|
117
|
+
const uploadResult = await datasetUploadOutputFileStep({
|
|
118
|
+
runtime,
|
|
119
|
+
datasetId,
|
|
120
|
+
contentBase64: fileRead.contentBase64,
|
|
121
|
+
});
|
|
119
122
|
if (!uploadResult.ok) {
|
|
120
123
|
console.error(`[Dataset ${datasetId}] File upload failed: ${uploadResult.error}`);
|
|
121
124
|
return {
|
|
@@ -176,6 +179,31 @@ export function didCompleteDatasetSucceed(event) {
|
|
|
176
179
|
return false;
|
|
177
180
|
});
|
|
178
181
|
}
|
|
182
|
+
export function getDatasetFatalFailure(event) {
|
|
183
|
+
const parts = Array.isArray(event?.content?.parts) ? event.content.parts : [];
|
|
184
|
+
for (const part of parts) {
|
|
185
|
+
let actionName;
|
|
186
|
+
let output;
|
|
187
|
+
if (part?.type === "action") {
|
|
188
|
+
actionName = part.content?.actionName;
|
|
189
|
+
output = part.content?.output;
|
|
190
|
+
}
|
|
191
|
+
else if (typeof part?.type === "string" && part.type.startsWith("tool-")) {
|
|
192
|
+
actionName = part.type.slice("tool-".length);
|
|
193
|
+
output = part.output ?? part.result;
|
|
194
|
+
}
|
|
195
|
+
if (!output || output.fatal !== true) {
|
|
196
|
+
continue;
|
|
197
|
+
}
|
|
198
|
+
const message = typeof output.error === "string" && output.error.trim()
|
|
199
|
+
? output.error.trim()
|
|
200
|
+
: typeof output.message === "string" && output.message.trim()
|
|
201
|
+
? output.message.trim()
|
|
202
|
+
: "Dataset action failed fatally";
|
|
203
|
+
return actionName ? `${actionName}: ${message}` : message;
|
|
204
|
+
}
|
|
205
|
+
return null;
|
|
206
|
+
}
|
|
179
207
|
async function ensureFileExists(runtime, sandboxId, path) {
|
|
180
208
|
const result = await runDatasetSandboxCommandStep({
|
|
181
209
|
runtime,
|
|
@@ -192,8 +220,8 @@ async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, da
|
|
|
192
220
|
let validRowCount = 0;
|
|
193
221
|
let rowRecordCount = 0;
|
|
194
222
|
console.log(`[Dataset ${datasetId}] Reading and validating JSONL file from sandbox`);
|
|
195
|
-
const fileRead = await
|
|
196
|
-
if (!fileRead.
|
|
223
|
+
const fileRead = await readDatasetSandboxTextFileStep({ runtime, sandboxId, path: outputPath });
|
|
224
|
+
if (!fileRead.content) {
|
|
197
225
|
console.log(`[Dataset ${datasetId}] Empty output file`);
|
|
198
226
|
return {
|
|
199
227
|
success: false,
|
|
@@ -205,8 +233,7 @@ async function validateJsonlRows({ runtime, sandboxId, outputPath, validator, da
|
|
|
205
233
|
message: "output.jsonl is empty",
|
|
206
234
|
};
|
|
207
235
|
}
|
|
208
|
-
const
|
|
209
|
-
const lines = fileContent.split("\n");
|
|
236
|
+
const lines = fileRead.content.split("\n");
|
|
210
237
|
console.log(`[Dataset ${datasetId}] Validating ${lines.length} lines`);
|
|
211
238
|
for (let index = 0; index < lines.length; index++) {
|
|
212
239
|
const line = lines[index];
|
package/dist/dataset/steps.d.ts
CHANGED
|
@@ -18,7 +18,7 @@ export declare function datasetUpdateSchemaStep(params: {
|
|
|
18
18
|
export declare function datasetUploadOutputFileStep(params: {
|
|
19
19
|
runtime: any;
|
|
20
20
|
datasetId: string;
|
|
21
|
-
|
|
21
|
+
contentBase64: string;
|
|
22
22
|
}): Promise<import("../service.js").ServiceResult<{
|
|
23
23
|
fileId: string;
|
|
24
24
|
storagePath: string;
|
package/dist/dataset/steps.js
CHANGED
|
@@ -60,7 +60,7 @@ export async function datasetUploadOutputFileStep(params) {
|
|
|
60
60
|
const service = new DatasetService(db);
|
|
61
61
|
return await service.uploadDatasetOutputFile({
|
|
62
62
|
datasetId: params.datasetId,
|
|
63
|
-
fileBuffer: params.
|
|
63
|
+
fileBuffer: Buffer.from(params.contentBase64, "base64"),
|
|
64
64
|
});
|
|
65
65
|
}
|
|
66
66
|
export async function datasetUpdateStatusStep(params) {
|
package/dist/dataset.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { id as newId } from "@instantdb/admin";
|
|
2
2
|
import { buildObjectOutputInstructions } from "./builder/instructions.js";
|
|
3
|
-
import {
|
|
3
|
+
import { materializeDerivedDataset, materializeSingleFileLikeSource, } from "./builder/materialize.js";
|
|
4
4
|
import { materializeQuerySource } from "./builder/materializeQuery.js";
|
|
5
5
|
import { finalizeBuildResult } from "./builder/persistence.js";
|
|
6
6
|
export function dataset(runtime, options = {}) {
|
|
@@ -131,13 +131,13 @@ export function dataset(runtime, options = {}) {
|
|
|
131
131
|
if (!effectiveState.reactor) {
|
|
132
132
|
throw new Error("dataset_reactor_required");
|
|
133
133
|
}
|
|
134
|
-
await
|
|
134
|
+
await materializeSingleFileLikeSource(effectiveState, onlySource, targetDatasetId);
|
|
135
135
|
return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
136
136
|
}
|
|
137
137
|
if (!effectiveState.reactor) {
|
|
138
138
|
throw new Error("dataset_reactor_required");
|
|
139
139
|
}
|
|
140
|
-
await
|
|
140
|
+
await materializeDerivedDataset(effectiveState, targetDatasetId);
|
|
141
141
|
return finalizeOutputResult(await finalizeBuildResult(effectiveState.runtime, targetDatasetId, effectiveState.first), effectiveState.output);
|
|
142
142
|
},
|
|
143
143
|
};
|
|
@@ -7,6 +7,20 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
|
|
|
7
7
|
pythonCode: string;
|
|
8
8
|
scriptName: string;
|
|
9
9
|
}, {
|
|
10
|
+
success: boolean;
|
|
11
|
+
fatal: boolean;
|
|
12
|
+
status: string;
|
|
13
|
+
error: string;
|
|
14
|
+
stdout: string;
|
|
15
|
+
stderr: string;
|
|
16
|
+
exitCode: number;
|
|
17
|
+
scriptPath: string;
|
|
18
|
+
stdoutTruncated: boolean;
|
|
19
|
+
stderrTruncated: boolean;
|
|
20
|
+
stdoutOriginalLength: number;
|
|
21
|
+
stderrOriginalLength: number;
|
|
22
|
+
message?: undefined;
|
|
23
|
+
} | {
|
|
10
24
|
success: boolean;
|
|
11
25
|
exitCode: number;
|
|
12
26
|
stdout: string;
|
|
@@ -17,6 +31,8 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
|
|
|
17
31
|
stderrTruncated: boolean;
|
|
18
32
|
stdoutOriginalLength: number;
|
|
19
33
|
stderrOriginalLength: number;
|
|
34
|
+
fatal?: undefined;
|
|
35
|
+
status?: undefined;
|
|
20
36
|
message?: undefined;
|
|
21
37
|
} | {
|
|
22
38
|
success: boolean;
|
|
@@ -29,6 +45,8 @@ export declare function createExecuteCommandTool({ datasetId, sandboxId, runtime
|
|
|
29
45
|
stderrTruncated: boolean;
|
|
30
46
|
stdoutOriginalLength: number;
|
|
31
47
|
stderrOriginalLength: number;
|
|
48
|
+
fatal?: undefined;
|
|
49
|
+
status?: undefined;
|
|
32
50
|
error?: undefined;
|
|
33
51
|
}>;
|
|
34
52
|
export {};
|
|
@@ -1,39 +1,81 @@
|
|
|
1
1
|
import { tool } from "ai";
|
|
2
2
|
import { z } from "zod";
|
|
3
|
-
import { runDatasetSandboxCommandStep,
|
|
3
|
+
import { runDatasetSandboxCommandStep, writeDatasetSandboxTextFilesStep } from "./sandbox/steps.js";
|
|
4
4
|
import { getDatasetWorkstation } from "./datasetFiles.js";
|
|
5
5
|
// To keep responses predictable for big data scenarios, we cap stdout/stderr.
|
|
6
6
|
// The tool's return payload exposes stdout (capped) plus the on-disk script path.
|
|
7
7
|
const MAX_STDOUT_CHARS = 20000;
|
|
8
8
|
const MAX_STDERR_CHARS = 5000;
|
|
9
|
+
function normalizeScriptName(scriptName) {
|
|
10
|
+
const normalized = String(scriptName ?? "")
|
|
11
|
+
.trim()
|
|
12
|
+
.replace(/[^a-zA-Z0-9_.-]/g, "_")
|
|
13
|
+
.replace(/_+/g, "_")
|
|
14
|
+
.slice(0, 80);
|
|
15
|
+
return normalized || "script";
|
|
16
|
+
}
|
|
17
|
+
function stableScriptHash(value) {
|
|
18
|
+
let hash = 2166136261;
|
|
19
|
+
for (let index = 0; index < value.length; index++) {
|
|
20
|
+
hash ^= value.charCodeAt(index);
|
|
21
|
+
hash = Math.imul(hash, 16777619);
|
|
22
|
+
}
|
|
23
|
+
return (hash >>> 0).toString(36);
|
|
24
|
+
}
|
|
9
25
|
export function createExecuteCommandTool({ datasetId, sandboxId, runtime }) {
|
|
10
26
|
return tool({
|
|
11
27
|
description: "Execute Python scripts in the sandbox. Always saves script to a file before executing. The tool's output is EXACTLY the script's stdout and includes the script file path for traceability. CRITICAL: Print concise, human-readable summaries only; do NOT print raw large data. For big results, write artifacts to files in the workstation and print their file paths. Always include progress/result prints (e.g., 'Processing file X...', 'Found Y records', 'Generated output.csv').",
|
|
12
28
|
inputSchema: z.object({
|
|
13
29
|
pythonCode: z.string().describe("Python code to execute. Saved to a file before running. MANDATORY: Use print() to report progress and final results. Keep prints concise; avoid dumping rows/JSON. For large outputs, write to files in the workstation directory and print only file paths and brief summaries."),
|
|
14
|
-
scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A
|
|
30
|
+
scriptName: z.string().describe("Name for the script file in snake_case (e.g., 'inspect_file', 'parse_csv', 'generate_dataset'). A deterministic suffix will be appended automatically."),
|
|
15
31
|
}),
|
|
16
32
|
execute: async ({ pythonCode, scriptName }) => {
|
|
17
|
-
const uuid = `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
|
|
18
33
|
const workstation = getDatasetWorkstation(datasetId);
|
|
19
|
-
const
|
|
34
|
+
const normalizedScriptName = normalizeScriptName(scriptName);
|
|
35
|
+
const scriptHash = stableScriptHash(`${normalizedScriptName}\0${pythonCode}`);
|
|
36
|
+
const scriptFile = `${workstation}/${normalizedScriptName}-${scriptHash}.py`;
|
|
20
37
|
console.log(`[Dataset ${datasetId}] ========================================`);
|
|
21
38
|
console.log(`[Dataset ${datasetId}] Tool: executeCommand`);
|
|
22
|
-
console.log(`[Dataset ${datasetId}] Script: ${
|
|
39
|
+
console.log(`[Dataset ${datasetId}] Script: ${normalizedScriptName}`);
|
|
23
40
|
console.log(`[Dataset ${datasetId}] File: ${scriptFile}`);
|
|
24
41
|
console.log(`[Dataset ${datasetId}] Code length: ${pythonCode.length} chars`);
|
|
25
42
|
console.log(`[Dataset ${datasetId}] ========================================`);
|
|
26
43
|
try {
|
|
27
|
-
await
|
|
44
|
+
await writeDatasetSandboxTextFilesStep({
|
|
28
45
|
runtime,
|
|
29
46
|
sandboxId,
|
|
30
47
|
files: [
|
|
31
48
|
{
|
|
32
49
|
path: scriptFile,
|
|
33
|
-
|
|
50
|
+
content: pythonCode,
|
|
34
51
|
},
|
|
35
52
|
],
|
|
36
53
|
});
|
|
54
|
+
const written = await runDatasetSandboxCommandStep({
|
|
55
|
+
runtime,
|
|
56
|
+
sandboxId,
|
|
57
|
+
cmd: "test",
|
|
58
|
+
args: ["-f", scriptFile],
|
|
59
|
+
});
|
|
60
|
+
if (written.exitCode !== 0) {
|
|
61
|
+
const error = `Script write verification failed: ${scriptFile}`;
|
|
62
|
+
console.error(`[Dataset ${datasetId}] ${error}`);
|
|
63
|
+
console.error(`[Dataset ${datasetId}] ========================================`);
|
|
64
|
+
return {
|
|
65
|
+
success: false,
|
|
66
|
+
fatal: true,
|
|
67
|
+
status: "script_write_failed",
|
|
68
|
+
error,
|
|
69
|
+
stdout: written.stdout || "",
|
|
70
|
+
stderr: written.stderr || "",
|
|
71
|
+
exitCode: written.exitCode,
|
|
72
|
+
scriptPath: scriptFile,
|
|
73
|
+
stdoutTruncated: false,
|
|
74
|
+
stderrTruncated: false,
|
|
75
|
+
stdoutOriginalLength: 0,
|
|
76
|
+
stderrOriginalLength: 0,
|
|
77
|
+
};
|
|
78
|
+
}
|
|
37
79
|
console.log(`[Dataset ${datasetId}] Script written to: ${scriptFile}`);
|
|
38
80
|
console.log(`[Dataset ${datasetId}] Executing: python ${scriptFile}`);
|
|
39
81
|
const result = await runDatasetSandboxCommandStep({
|
|
@@ -1,59 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
export type FileParseContext
|
|
4
|
-
datasetId: string;
|
|
5
|
-
fileId: string;
|
|
6
|
-
instructions: string;
|
|
7
|
-
sandboxConfig: {
|
|
8
|
-
filePath: string;
|
|
9
|
-
};
|
|
10
|
-
analysis: any[];
|
|
11
|
-
schema: any | null;
|
|
12
|
-
plan: any | null;
|
|
13
|
-
executionResult: any | null;
|
|
14
|
-
errors: string[];
|
|
15
|
-
iterationCount: number;
|
|
16
|
-
filePreview?: FilePreviewContext;
|
|
17
|
-
};
|
|
18
|
-
export type FileParseContextParams = {
|
|
19
|
-
fileId?: string;
|
|
20
|
-
instructions?: string;
|
|
21
|
-
sandboxId?: string;
|
|
22
|
-
datasetId?: string;
|
|
23
|
-
model?: string;
|
|
24
|
-
reactor?: ContextReactor<any, any>;
|
|
25
|
-
};
|
|
26
|
-
export type FileParseRunOptions = {
|
|
27
|
-
prompt?: string;
|
|
28
|
-
durable?: boolean;
|
|
29
|
-
};
|
|
30
|
-
export type FileParseContextBuilder<Env extends {
|
|
31
|
-
orgId: string;
|
|
32
|
-
}> = {
|
|
33
|
-
datasetId: string;
|
|
34
|
-
context: ReturnType<ReturnType<typeof createContext<Env>>["context"]> extends any ? any : any;
|
|
35
|
-
};
|
|
36
|
-
export type DatasetResult = {
|
|
37
|
-
id: string;
|
|
38
|
-
status?: string;
|
|
39
|
-
title?: string;
|
|
40
|
-
schema?: any;
|
|
41
|
-
analysis?: any;
|
|
42
|
-
calculatedTotalRows?: number;
|
|
43
|
-
actualGeneratedRowCount?: number;
|
|
44
|
-
createdAt?: number;
|
|
45
|
-
updatedAt?: number;
|
|
46
|
-
};
|
|
47
|
-
/**
|
|
48
|
-
* Factory (DX-first):
|
|
49
|
-
*
|
|
50
|
-
* Usage:
|
|
51
|
-
* const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
|
|
52
|
-
*
|
|
53
|
-
* - Uses the caller runtime; no secondary runtime is created.
|
|
54
|
-
* - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
|
|
55
|
-
* - `parse()` is the entrypoint; it calls `context.react(...)` internally.
|
|
56
|
-
*/
|
|
1
|
+
import { type ContextReactor } from "@ekairos/events";
|
|
2
|
+
import type { FileParseRunOptions } from "./file-dataset.types.js";
|
|
3
|
+
export type { DatasetResult, FileParseContext, FileParseContextBuilder, FileParseContextParams, FileParseRunOptions, SandboxState, } from "./file-dataset.types.js";
|
|
57
4
|
export declare function createFileParseContext<Env extends {
|
|
58
5
|
orgId: string;
|
|
59
6
|
}>(fileId: string, opts?: {
|
|
@@ -1,15 +1,11 @@
|
|
|
1
|
-
import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
|
|
2
|
-
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
3
|
-
import { createGenerateSchemaTool } from "./generateSchema.tool.js";
|
|
4
|
-
import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
|
|
5
|
-
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
6
|
-
import { createClearDatasetTool } from "../clearDataset.tool.js";
|
|
7
|
-
import { buildFileDatasetPrompt } from "./prompts.js";
|
|
8
|
-
import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
|
|
1
|
+
import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
|
|
9
2
|
import { id } from "@instantdb/admin";
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
3
|
+
import { createClearDatasetTool } from "../clearDataset.tool.js";
|
|
4
|
+
import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
|
|
12
5
|
import { datasetGetByIdStep } from "../dataset/steps.js";
|
|
6
|
+
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
7
|
+
import { createGenerateSchemaTool } from "./generateSchema.tool.js";
|
|
8
|
+
import { buildFileDatasetPromptStep, generateFileParsePreviewStep, initializeFileParseSandboxStep, } from "./file-dataset.steps.js";
|
|
13
9
|
async function awaitContextRun(run) {
|
|
14
10
|
if (!run)
|
|
15
11
|
return;
|
|
@@ -19,63 +15,6 @@ async function awaitContextRun(run) {
|
|
|
19
15
|
}
|
|
20
16
|
await run;
|
|
21
17
|
}
|
|
22
|
-
async function initializeSandbox(runtime, sandboxId, datasetId, fileId, state) {
|
|
23
|
-
"use step";
|
|
24
|
-
if (state.initialized) {
|
|
25
|
-
return state.filePath;
|
|
26
|
-
}
|
|
27
|
-
console.log(`[FileParseContext ${datasetId}] Initializing sandbox...`);
|
|
28
|
-
await ensurePreviewScriptsAvailable(runtime, sandboxId);
|
|
29
|
-
console.log(`[FileParseContext ${datasetId}] Installing Python dependencies...`);
|
|
30
|
-
const pipInstall = await runDatasetSandboxCommandStep({
|
|
31
|
-
runtime,
|
|
32
|
-
sandboxId,
|
|
33
|
-
cmd: "python",
|
|
34
|
-
args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
|
|
35
|
-
});
|
|
36
|
-
const installStderr = pipInstall.stderr;
|
|
37
|
-
if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
|
|
38
|
-
throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
|
|
39
|
-
}
|
|
40
|
-
console.log(`[FileParseContext ${datasetId}] Fetching file from InstantDB...`);
|
|
41
|
-
const file = await readInstantFileStep({ runtime, fileId });
|
|
42
|
-
console.log(`[FileParseContext ${datasetId}] Creating dataset workstation...`);
|
|
43
|
-
const workstation = getDatasetWorkstation(datasetId);
|
|
44
|
-
await runDatasetSandboxCommandStep({
|
|
45
|
-
runtime,
|
|
46
|
-
sandboxId,
|
|
47
|
-
cmd: "mkdir",
|
|
48
|
-
args: ["-p", workstation],
|
|
49
|
-
});
|
|
50
|
-
const fileName = file.contentDisposition ?? "";
|
|
51
|
-
const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
|
|
52
|
-
const sandboxFilePath = `${workstation}/${fileId}${fileExtension}`;
|
|
53
|
-
await writeDatasetSandboxFilesStep({
|
|
54
|
-
runtime,
|
|
55
|
-
sandboxId,
|
|
56
|
-
files: [
|
|
57
|
-
{
|
|
58
|
-
path: sandboxFilePath,
|
|
59
|
-
contentBase64: file.contentBase64,
|
|
60
|
-
},
|
|
61
|
-
],
|
|
62
|
-
});
|
|
63
|
-
console.log(`[FileParseContext ${datasetId}] ✅ Workstation created: ${workstation}`);
|
|
64
|
-
console.log(`[FileParseContext ${datasetId}] ✅ File saved: ${sandboxFilePath}`);
|
|
65
|
-
state.filePath = sandboxFilePath;
|
|
66
|
-
state.initialized = true;
|
|
67
|
-
return sandboxFilePath;
|
|
68
|
-
}
|
|
69
|
-
/**
|
|
70
|
-
* FileParseContext
|
|
71
|
-
*
|
|
72
|
-
* Uso:
|
|
73
|
-
* - Crear una instancia con `fileId`, `instructions` y un `sandbox`
|
|
74
|
-
* - Llamar `getDataset()` para crear un dataset nuevo (crea un datasetId interno)
|
|
75
|
-
* - Llamar `followUp(datasetId, feedback)` para iterar el mismo dataset con feedback
|
|
76
|
-
*
|
|
77
|
-
* Internamente corre un Context (`createContext("file.parse")`) que itera hasta que se ejecuta el tool `completeDataset`.
|
|
78
|
-
*/
|
|
79
18
|
function createFileParseContextDefinition(params) {
|
|
80
19
|
const fallbackDatasetId = params.datasetId;
|
|
81
20
|
const model = params.model ?? "openai/gpt-5";
|
|
@@ -96,18 +35,31 @@ function createFileParseContextDefinition(params) {
|
|
|
96
35
|
if (!sandboxId) {
|
|
97
36
|
throw new Error("dataset_sandbox_required");
|
|
98
37
|
}
|
|
99
|
-
const
|
|
38
|
+
const initialized = await initializeFileParseSandboxStep({
|
|
39
|
+
runtime,
|
|
40
|
+
sandboxId,
|
|
41
|
+
datasetId,
|
|
42
|
+
fileId,
|
|
43
|
+
state: sandboxState,
|
|
44
|
+
});
|
|
45
|
+
const sandboxFilePath = initialized.filePath;
|
|
100
46
|
let filePreview = undefined;
|
|
101
47
|
try {
|
|
102
|
-
filePreview = await
|
|
48
|
+
filePreview = await generateFileParsePreviewStep({
|
|
49
|
+
runtime,
|
|
50
|
+
sandboxId,
|
|
51
|
+
sandboxFilePath,
|
|
52
|
+
datasetId,
|
|
53
|
+
});
|
|
103
54
|
}
|
|
104
55
|
catch {
|
|
105
|
-
// optional
|
|
56
|
+
// Preview is optional; parsing can still proceed from the file path.
|
|
106
57
|
}
|
|
107
58
|
let schema = null;
|
|
108
59
|
const datasetResult = await datasetGetByIdStep({ runtime, datasetId });
|
|
109
|
-
if (datasetResult.ok && datasetResult.data.schema)
|
|
60
|
+
if (datasetResult.ok && datasetResult.data.schema) {
|
|
110
61
|
schema = datasetResult.data.schema;
|
|
62
|
+
}
|
|
111
63
|
const ctx = {
|
|
112
64
|
datasetId,
|
|
113
65
|
fileId,
|
|
@@ -127,13 +79,13 @@ function createFileParseContextDefinition(params) {
|
|
|
127
79
|
fileId,
|
|
128
80
|
instructions,
|
|
129
81
|
sandboxId,
|
|
130
|
-
sandboxState,
|
|
82
|
+
sandboxState: initialized.state,
|
|
131
83
|
ctx,
|
|
132
84
|
};
|
|
133
85
|
})
|
|
134
86
|
.narrative(async (stored) => {
|
|
135
87
|
const ctx = stored?.content?.ctx;
|
|
136
|
-
const base =
|
|
88
|
+
const base = await buildFileDatasetPromptStep({ context: ctx });
|
|
137
89
|
const userInstructions = String(ctx?.instructions ?? "").trim();
|
|
138
90
|
if (!userInstructions)
|
|
139
91
|
return base;
|
|
@@ -184,6 +136,10 @@ function createFileParseContextDefinition(params) {
|
|
|
184
136
|
return actions;
|
|
185
137
|
})
|
|
186
138
|
.shouldContinue(({ reactionEvent }) => {
|
|
139
|
+
const fatalFailure = getDatasetFatalFailure(reactionEvent);
|
|
140
|
+
if (fatalFailure) {
|
|
141
|
+
throw new Error(fatalFailure);
|
|
142
|
+
}
|
|
187
143
|
return !didCompleteDatasetSucceed(reactionEvent);
|
|
188
144
|
});
|
|
189
145
|
if (params.reactor) {
|
|
@@ -195,16 +151,6 @@ function createFileParseContextDefinition(params) {
|
|
|
195
151
|
const context = contextBuilder.build();
|
|
196
152
|
return { datasetId: fallbackDatasetId ?? "", context };
|
|
197
153
|
}
|
|
198
|
-
/**
|
|
199
|
-
* Factory (DX-first):
|
|
200
|
-
*
|
|
201
|
-
* Usage:
|
|
202
|
-
* const { datasetId } = await createFileParseContext(fileId, { instructions }).parse(runtime)
|
|
203
|
-
*
|
|
204
|
-
* - Uses the caller runtime; no secondary runtime is created.
|
|
205
|
-
* - All I/O happens in `"use step"` functions via the provided Ekairos runtime.
|
|
206
|
-
* - `parse()` is the entrypoint; it calls `context.react(...)` internally.
|
|
207
|
-
*/
|
|
208
154
|
export function createFileParseContext(fileId, opts) {
|
|
209
155
|
const datasetId = opts?.datasetId ?? id();
|
|
210
156
|
const params = {
|
|
@@ -225,14 +171,25 @@ export function createFileParseContext(fileId, opts) {
|
|
|
225
171
|
channel: WEB_CHANNEL,
|
|
226
172
|
createdAt: new Date().toISOString(),
|
|
227
173
|
content: {
|
|
228
|
-
parts: [
|
|
174
|
+
parts: [
|
|
175
|
+
{
|
|
176
|
+
type: "text",
|
|
177
|
+
text: options.prompt ?? "generate a dataset for this file",
|
|
178
|
+
},
|
|
179
|
+
],
|
|
229
180
|
},
|
|
230
181
|
};
|
|
231
182
|
const shell = await context.react(triggerEvent, {
|
|
232
183
|
runtime: runtime,
|
|
233
184
|
context: { key: `dataset:${datasetId}` },
|
|
234
185
|
durable: options.durable ?? false,
|
|
235
|
-
options: {
|
|
186
|
+
options: {
|
|
187
|
+
silent: true,
|
|
188
|
+
preventClose: true,
|
|
189
|
+
sendFinish: false,
|
|
190
|
+
maxIterations: 20,
|
|
191
|
+
maxModelSteps: 5,
|
|
192
|
+
},
|
|
236
193
|
__initialContent: {
|
|
237
194
|
datasetId,
|
|
238
195
|
fileId,
|
|
@@ -244,7 +201,6 @@ export function createFileParseContext(fileId, opts) {
|
|
|
244
201
|
await awaitContextRun(shell.run);
|
|
245
202
|
return { datasetId };
|
|
246
203
|
},
|
|
247
|
-
// Optional: expose the built context for advanced callers (not required for parse DX)
|
|
248
204
|
context,
|
|
249
205
|
};
|
|
250
206
|
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { FileParseContext, SandboxState } from "./file-dataset.types.js";
|
|
2
|
+
import type { FilePreviewContext } from "./filepreview.types.js";
|
|
3
|
+
export declare function initializeFileParseSandboxStep(params: {
|
|
4
|
+
runtime: any;
|
|
5
|
+
sandboxId: string;
|
|
6
|
+
datasetId: string;
|
|
7
|
+
fileId: string;
|
|
8
|
+
state: SandboxState;
|
|
9
|
+
}): Promise<{
|
|
10
|
+
filePath: string;
|
|
11
|
+
state: SandboxState;
|
|
12
|
+
}>;
|
|
13
|
+
export declare function generateFileParsePreviewStep(params: {
|
|
14
|
+
runtime: any;
|
|
15
|
+
sandboxId: string;
|
|
16
|
+
sandboxFilePath: string;
|
|
17
|
+
datasetId: string;
|
|
18
|
+
}): Promise<FilePreviewContext>;
|
|
19
|
+
export declare function buildFileDatasetPromptStep(params: {
|
|
20
|
+
context: FileParseContext;
|
|
21
|
+
}): Promise<string>;
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { getDatasetWorkstation } from "../datasetFiles.js";
|
|
2
|
+
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
3
|
+
import { buildFileDatasetPrompt } from "./prompts.js";
|
|
4
|
+
import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
|
|
5
|
+
import { readInstantFileStep } from "./steps.js";
|
|
6
|
+
export async function initializeFileParseSandboxStep(params) {
|
|
7
|
+
"use step";
|
|
8
|
+
if (params.state.initialized) {
|
|
9
|
+
return { filePath: params.state.filePath, state: params.state };
|
|
10
|
+
}
|
|
11
|
+
console.log(`[FileParseContext ${params.datasetId}] Initializing sandbox...`);
|
|
12
|
+
await ensurePreviewScriptsAvailable(params.runtime, params.sandboxId);
|
|
13
|
+
console.log(`[FileParseContext ${params.datasetId}] Installing Python dependencies...`);
|
|
14
|
+
const pipInstall = await runDatasetSandboxCommandStep({
|
|
15
|
+
runtime: params.runtime,
|
|
16
|
+
sandboxId: params.sandboxId,
|
|
17
|
+
cmd: "python",
|
|
18
|
+
args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
|
|
19
|
+
});
|
|
20
|
+
const installStderr = pipInstall.stderr;
|
|
21
|
+
if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
|
|
22
|
+
throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
|
|
23
|
+
}
|
|
24
|
+
console.log(`[FileParseContext ${params.datasetId}] Fetching file from InstantDB...`);
|
|
25
|
+
const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
|
|
26
|
+
console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
|
|
27
|
+
const workstation = getDatasetWorkstation(params.datasetId);
|
|
28
|
+
await runDatasetSandboxCommandStep({
|
|
29
|
+
runtime: params.runtime,
|
|
30
|
+
sandboxId: params.sandboxId,
|
|
31
|
+
cmd: "mkdir",
|
|
32
|
+
args: ["-p", workstation],
|
|
33
|
+
});
|
|
34
|
+
const fileName = file.contentDisposition ?? "";
|
|
35
|
+
const fileExtension = fileName.includes(".") ? fileName.substring(fileName.lastIndexOf(".")) : "";
|
|
36
|
+
const sandboxFilePath = `${workstation}/${params.fileId}${fileExtension}`;
|
|
37
|
+
await writeDatasetSandboxFilesStep({
|
|
38
|
+
runtime: params.runtime,
|
|
39
|
+
sandboxId: params.sandboxId,
|
|
40
|
+
files: [
|
|
41
|
+
{
|
|
42
|
+
path: sandboxFilePath,
|
|
43
|
+
contentBase64: file.contentBase64,
|
|
44
|
+
},
|
|
45
|
+
],
|
|
46
|
+
});
|
|
47
|
+
console.log(`[FileParseContext ${params.datasetId}] Workstation created: ${workstation}`);
|
|
48
|
+
console.log(`[FileParseContext ${params.datasetId}] File saved: ${sandboxFilePath}`);
|
|
49
|
+
const state = {
|
|
50
|
+
initialized: true,
|
|
51
|
+
filePath: sandboxFilePath,
|
|
52
|
+
};
|
|
53
|
+
return { filePath: sandboxFilePath, state };
|
|
54
|
+
}
|
|
55
|
+
export async function generateFileParsePreviewStep(params) {
|
|
56
|
+
"use step";
|
|
57
|
+
return await generateFilePreview(params.runtime, params.sandboxId, params.sandboxFilePath, params.datasetId);
|
|
58
|
+
}
|
|
59
|
+
export async function buildFileDatasetPromptStep(params) {
|
|
60
|
+
"use step";
|
|
61
|
+
return buildFileDatasetPrompt(params.context);
|
|
62
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { ContextReactor } from "@ekairos/events";
|
|
2
|
+
import type { FilePreviewContext } from "./filepreview.types.js";
|
|
3
|
+
export type SandboxState = {
|
|
4
|
+
initialized: boolean;
|
|
5
|
+
filePath: string;
|
|
6
|
+
};
|
|
7
|
+
export type FileParseContext = {
|
|
8
|
+
datasetId: string;
|
|
9
|
+
fileId: string;
|
|
10
|
+
instructions: string;
|
|
11
|
+
sandboxConfig: {
|
|
12
|
+
filePath: string;
|
|
13
|
+
};
|
|
14
|
+
analysis: any[];
|
|
15
|
+
schema: any | null;
|
|
16
|
+
plan: any | null;
|
|
17
|
+
executionResult: any | null;
|
|
18
|
+
errors: string[];
|
|
19
|
+
iterationCount: number;
|
|
20
|
+
filePreview?: FilePreviewContext;
|
|
21
|
+
};
|
|
22
|
+
export type FileParseContextParams = {
|
|
23
|
+
fileId?: string;
|
|
24
|
+
instructions?: string;
|
|
25
|
+
sandboxId?: string;
|
|
26
|
+
datasetId?: string;
|
|
27
|
+
model?: string;
|
|
28
|
+
reactor?: ContextReactor<any, any>;
|
|
29
|
+
};
|
|
30
|
+
export type FileParseRunOptions = {
|
|
31
|
+
prompt?: string;
|
|
32
|
+
durable?: boolean;
|
|
33
|
+
};
|
|
34
|
+
export type FileParseContextBuilder<Env extends {
|
|
35
|
+
orgId: string;
|
|
36
|
+
}> = {
|
|
37
|
+
datasetId: string;
|
|
38
|
+
context: any;
|
|
39
|
+
};
|
|
40
|
+
export type DatasetResult = {
|
|
41
|
+
id: string;
|
|
42
|
+
status?: string;
|
|
43
|
+
title?: string;
|
|
44
|
+
schema?: any;
|
|
45
|
+
analysis?: any;
|
|
46
|
+
calculatedTotalRows?: number;
|
|
47
|
+
actualGeneratedRowCount?: number;
|
|
48
|
+
createdAt?: number;
|
|
49
|
+
updatedAt?: number;
|
|
50
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -1,34 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
metadata?: {
|
|
4
|
-
description: string;
|
|
5
|
-
script: string;
|
|
6
|
-
command: string;
|
|
7
|
-
stdout: string;
|
|
8
|
-
stderr: string;
|
|
9
|
-
};
|
|
10
|
-
head?: {
|
|
11
|
-
description: string;
|
|
12
|
-
script: string;
|
|
13
|
-
command: string;
|
|
14
|
-
stdout: string;
|
|
15
|
-
stderr: string;
|
|
16
|
-
};
|
|
17
|
-
tail?: {
|
|
18
|
-
description: string;
|
|
19
|
-
script: string;
|
|
20
|
-
command: string;
|
|
21
|
-
stdout: string;
|
|
22
|
-
stderr: string;
|
|
23
|
-
};
|
|
24
|
-
mid?: {
|
|
25
|
-
description: string;
|
|
26
|
-
script: string;
|
|
27
|
-
command: string;
|
|
28
|
-
stdout: string;
|
|
29
|
-
stderr: string;
|
|
30
|
-
};
|
|
31
|
-
};
|
|
1
|
+
import type { FilePreviewContext } from "./filepreview.types.js";
|
|
2
|
+
export type { FilePreviewContext } from "./filepreview.types.js";
|
|
32
3
|
interface PreviewOptions {
|
|
33
4
|
headLines?: number;
|
|
34
5
|
tailLines?: number;
|
|
@@ -36,4 +7,3 @@ interface PreviewOptions {
|
|
|
36
7
|
}
|
|
37
8
|
export declare function ensurePreviewScriptsAvailable(runtime: any, sandboxId: string): Promise<void>;
|
|
38
9
|
export declare function generateFilePreview(runtime: any, sandboxId: string, sandboxFilePath: string, datasetId: string, options?: PreviewOptions): Promise<FilePreviewContext>;
|
|
39
|
-
export {};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
export type FilePreviewContext = {
|
|
2
|
+
totalRows: number;
|
|
3
|
+
metadata?: {
|
|
4
|
+
description: string;
|
|
5
|
+
script: string;
|
|
6
|
+
command: string;
|
|
7
|
+
stdout: string;
|
|
8
|
+
stderr: string;
|
|
9
|
+
};
|
|
10
|
+
head?: {
|
|
11
|
+
description: string;
|
|
12
|
+
script: string;
|
|
13
|
+
command: string;
|
|
14
|
+
stdout: string;
|
|
15
|
+
stderr: string;
|
|
16
|
+
};
|
|
17
|
+
tail?: {
|
|
18
|
+
description: string;
|
|
19
|
+
script: string;
|
|
20
|
+
command: string;
|
|
21
|
+
stdout: string;
|
|
22
|
+
stderr: string;
|
|
23
|
+
};
|
|
24
|
+
mid?: {
|
|
25
|
+
description: string;
|
|
26
|
+
script: string;
|
|
27
|
+
command: string;
|
|
28
|
+
stdout: string;
|
|
29
|
+
stderr: string;
|
|
30
|
+
};
|
|
31
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/file/prompts.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { FileParseContext } from "./file-dataset.
|
|
1
|
+
import type { FileParseContext } from "./file-dataset.types.js";
|
|
2
2
|
export declare function buildFileDatasetPrompt(context: FileParseContext): string;
|
package/dist/index.d.ts
CHANGED
|
@@ -3,3 +3,5 @@ export * from "./domain.js";
|
|
|
3
3
|
export * from "./materializeDataset.tool.js";
|
|
4
4
|
export * from "./schema.js";
|
|
5
5
|
export * from "./service.js";
|
|
6
|
+
export { registerFileParseContext } from "./file/file-dataset.agent.js";
|
|
7
|
+
export { registerTransformDatasetContext } from "./transform/index.js";
|
package/dist/index.js
CHANGED
|
@@ -3,3 +3,5 @@ export * from "./domain.js";
|
|
|
3
3
|
export * from "./materializeDataset.tool.js";
|
|
4
4
|
export * from "./schema.js";
|
|
5
5
|
export * from "./service.js";
|
|
6
|
+
export { registerFileParseContext } from "./file/file-dataset.agent.js";
|
|
7
|
+
export { registerTransformDatasetContext } from "./transform/index.js";
|
package/dist/sandbox/steps.d.ts
CHANGED
|
@@ -27,6 +27,14 @@ export declare function writeDatasetSandboxFilesStep(params: {
|
|
|
27
27
|
contentBase64: string;
|
|
28
28
|
}>;
|
|
29
29
|
}): Promise<void>;
|
|
30
|
+
export declare function writeDatasetSandboxTextFilesStep(params: {
|
|
31
|
+
runtime: any;
|
|
32
|
+
sandboxId: DatasetSandboxId;
|
|
33
|
+
files: Array<{
|
|
34
|
+
path: string;
|
|
35
|
+
content: string;
|
|
36
|
+
}>;
|
|
37
|
+
}): Promise<void>;
|
|
30
38
|
export declare function readDatasetSandboxFileStep(params: {
|
|
31
39
|
runtime: any;
|
|
32
40
|
sandboxId: DatasetSandboxId;
|
|
@@ -34,6 +42,13 @@ export declare function readDatasetSandboxFileStep(params: {
|
|
|
34
42
|
}): Promise<{
|
|
35
43
|
contentBase64: string;
|
|
36
44
|
}>;
|
|
45
|
+
export declare function readDatasetSandboxTextFileStep(params: {
|
|
46
|
+
runtime: any;
|
|
47
|
+
sandboxId: DatasetSandboxId;
|
|
48
|
+
path: string;
|
|
49
|
+
}): Promise<{
|
|
50
|
+
content: string;
|
|
51
|
+
}>;
|
|
37
52
|
export declare function stopDatasetSandboxStep(params: {
|
|
38
53
|
runtime: any;
|
|
39
54
|
sandboxId: DatasetSandboxId;
|
package/dist/sandbox/steps.js
CHANGED
|
@@ -117,6 +117,25 @@ export async function writeDatasetSandboxFilesStep(params) {
|
|
|
117
117
|
if (!result.ok)
|
|
118
118
|
throw new Error(result.error);
|
|
119
119
|
}
|
|
120
|
+
export async function writeDatasetSandboxTextFilesStep(params) {
|
|
121
|
+
"use step";
|
|
122
|
+
if (isLocalDatasetSandboxMode()) {
|
|
123
|
+
for (const file of params.files) {
|
|
124
|
+
await fs.mkdir(path.dirname(file.path), { recursive: true });
|
|
125
|
+
await fs.writeFile(file.path, file.content, "utf-8");
|
|
126
|
+
}
|
|
127
|
+
return;
|
|
128
|
+
}
|
|
129
|
+
const db = await getRuntimeDb(params.runtime);
|
|
130
|
+
const service = new SandboxService(db);
|
|
131
|
+
const files = params.files.map((file) => ({
|
|
132
|
+
path: file.path,
|
|
133
|
+
contentBase64: Buffer.from(file.content, "utf-8").toString("base64"),
|
|
134
|
+
}));
|
|
135
|
+
const result = await service.writeFiles(params.sandboxId, files);
|
|
136
|
+
if (!result.ok)
|
|
137
|
+
throw new Error(result.error);
|
|
138
|
+
}
|
|
120
139
|
export async function readDatasetSandboxFileStep(params) {
|
|
121
140
|
"use step";
|
|
122
141
|
if (isLocalDatasetSandboxMode()) {
|
|
@@ -130,6 +149,19 @@ export async function readDatasetSandboxFileStep(params) {
|
|
|
130
149
|
throw new Error(result.error);
|
|
131
150
|
return result.data;
|
|
132
151
|
}
|
|
152
|
+
export async function readDatasetSandboxTextFileStep(params) {
|
|
153
|
+
"use step";
|
|
154
|
+
if (isLocalDatasetSandboxMode()) {
|
|
155
|
+
const content = await fs.readFile(params.path, "utf-8");
|
|
156
|
+
return { content };
|
|
157
|
+
}
|
|
158
|
+
const db = await getRuntimeDb(params.runtime);
|
|
159
|
+
const service = new SandboxService(db);
|
|
160
|
+
const result = await service.readFile(params.sandboxId, params.path);
|
|
161
|
+
if (!result.ok)
|
|
162
|
+
throw new Error(result.error);
|
|
163
|
+
return { content: Buffer.from(result.data.contentBase64, "base64").toString("utf-8") };
|
|
164
|
+
}
|
|
133
165
|
export async function stopDatasetSandboxStep(params) {
|
|
134
166
|
"use step";
|
|
135
167
|
if (isLocalDatasetSandboxMode()) {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export { createTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, } from "./transform-dataset.agent.js";
|
|
1
|
+
export { createTransformDatasetContext, registerTransformDatasetContext, type TransformDatasetAgentParams, type TransformDatasetContext, type TransformDatasetRunOptions, } from "./transform-dataset.agent.js";
|
|
2
2
|
export { transformDataset, type TransformDatasetInput, type TransformDatasetResult, } from "./transformDataset.js";
|
package/dist/transform/index.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export { createTransformDatasetContext, } from "./transform-dataset.agent.js";
|
|
1
|
+
export { createTransformDatasetContext, registerTransformDatasetContext, } from "./transform-dataset.agent.js";
|
|
2
2
|
export { transformDataset, } from "./transformDataset.js";
|
|
@@ -1,34 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
datasetId: string;
|
|
3
|
-
sourceDatasetIds: string[];
|
|
4
|
-
outputSchema: any;
|
|
5
|
-
sandboxConfig: {
|
|
6
|
-
sourcePaths: Array<{
|
|
7
|
-
datasetId: string;
|
|
8
|
-
path: string;
|
|
9
|
-
}>;
|
|
10
|
-
outputPath: string;
|
|
11
|
-
};
|
|
12
|
-
sourcePreviews?: Array<{
|
|
13
|
-
datasetId: string;
|
|
14
|
-
preview: {
|
|
15
|
-
totalRows: number;
|
|
16
|
-
metadata?: {
|
|
17
|
-
description: string;
|
|
18
|
-
script: string;
|
|
19
|
-
command: string;
|
|
20
|
-
stdout: string;
|
|
21
|
-
stderr: string;
|
|
22
|
-
};
|
|
23
|
-
head?: {
|
|
24
|
-
description: string;
|
|
25
|
-
script: string;
|
|
26
|
-
command: string;
|
|
27
|
-
stdout: string;
|
|
28
|
-
stderr: string;
|
|
29
|
-
};
|
|
30
|
-
};
|
|
31
|
-
}>;
|
|
32
|
-
errors: string[];
|
|
33
|
-
};
|
|
1
|
+
import type { TransformPromptContext } from "./transform-dataset.types.js";
|
|
34
2
|
export declare function buildTransformDatasetPrompt(context: TransformPromptContext): string;
|
|
@@ -1,48 +1,6 @@
|
|
|
1
1
|
import { type ContextReactor } from "@ekairos/events";
|
|
2
|
-
import {
|
|
3
|
-
export type TransformDatasetContext
|
|
4
|
-
datasetId: string;
|
|
5
|
-
sourceDatasetIds: string[];
|
|
6
|
-
outputSchema: any;
|
|
7
|
-
sandboxConfig: {
|
|
8
|
-
sourcePaths: Array<{
|
|
9
|
-
datasetId: string;
|
|
10
|
-
path: string;
|
|
11
|
-
}>;
|
|
12
|
-
outputPath: string;
|
|
13
|
-
};
|
|
14
|
-
sourcePreviews?: Array<{
|
|
15
|
-
datasetId: string;
|
|
16
|
-
preview: TransformSourcePreviewContext;
|
|
17
|
-
}>;
|
|
18
|
-
errors: string[];
|
|
19
|
-
iterationCount: number;
|
|
20
|
-
instructions?: string;
|
|
21
|
-
};
|
|
22
|
-
export type TransformDatasetAgentParams = {
|
|
23
|
-
sourceDatasetIds?: string[];
|
|
24
|
-
outputSchema?: any;
|
|
25
|
-
instructions?: string;
|
|
26
|
-
datasetId?: string;
|
|
27
|
-
model?: string;
|
|
28
|
-
sandboxId?: string;
|
|
29
|
-
reactor?: ContextReactor<any, any>;
|
|
30
|
-
};
|
|
31
|
-
export type TransformDatasetRunOptions = {
|
|
32
|
-
prompt?: string;
|
|
33
|
-
durable?: boolean;
|
|
34
|
-
};
|
|
35
|
-
export type TransformDatasetResult = {
|
|
36
|
-
id: string;
|
|
37
|
-
status?: string;
|
|
38
|
-
title?: string;
|
|
39
|
-
schema?: any;
|
|
40
|
-
analysis?: any;
|
|
41
|
-
calculatedTotalRows?: number;
|
|
42
|
-
actualGeneratedRowCount?: number;
|
|
43
|
-
createdAt?: number;
|
|
44
|
-
updatedAt?: number;
|
|
45
|
-
};
|
|
2
|
+
import type { TransformDatasetRunOptions } from "./transform-dataset.types.js";
|
|
3
|
+
export type { TransformDatasetAgentParams, TransformDatasetContext, TransformDatasetResult, TransformDatasetRunOptions, TransformPromptContext, TransformSandboxState, } from "./transform-dataset.types.js";
|
|
46
4
|
export declare function createTransformDatasetContext<Env extends {
|
|
47
5
|
orgId: string;
|
|
48
6
|
}>(params: {
|
|
@@ -1,13 +1,10 @@
|
|
|
1
|
-
import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL } from "@ekairos/events";
|
|
2
|
-
import { createCompleteDatasetTool, didCompleteDatasetSucceed } from "../completeDataset.tool.js";
|
|
3
|
-
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
4
|
-
import { createClearDatasetTool } from "../clearDataset.tool.js";
|
|
5
|
-
import { buildTransformDatasetPrompt } from "./prompts.js";
|
|
6
|
-
import { getDatasetWorkstation, getDatasetOutputPath } from "../datasetFiles.js";
|
|
1
|
+
import { createContext, INPUT_TEXT_ITEM_TYPE, WEB_CHANNEL, } from "@ekairos/events";
|
|
7
2
|
import { id } from "@instantdb/admin";
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
3
|
+
import { createClearDatasetTool } from "../clearDataset.tool.js";
|
|
4
|
+
import { createCompleteDatasetTool, didCompleteDatasetSucceed, getDatasetFatalFailure, } from "../completeDataset.tool.js";
|
|
5
|
+
import { datasetUpdateSchemaStep } from "../dataset/steps.js";
|
|
6
|
+
import { createExecuteCommandTool } from "../executeCommand.tool.js";
|
|
7
|
+
import { buildTransformDatasetPromptStep, ensureTransformSourcesInSandboxStep, generateTransformSourcePreviewsStep, } from "./transform-dataset.steps.js";
|
|
11
8
|
async function awaitContextRun(run) {
|
|
12
9
|
if (!run)
|
|
13
10
|
return;
|
|
@@ -17,28 +14,6 @@ async function awaitContextRun(run) {
|
|
|
17
14
|
}
|
|
18
15
|
await run;
|
|
19
16
|
}
|
|
20
|
-
async function ensureSourcesInSandbox(runtime, sandboxId, datasetId, sourceDatasetIds, state) {
|
|
21
|
-
"use step";
|
|
22
|
-
if (state.initialized) {
|
|
23
|
-
return { sourcePaths: state.sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
|
|
24
|
-
}
|
|
25
|
-
const workstation = getDatasetWorkstation(datasetId);
|
|
26
|
-
await runDatasetSandboxCommandStep({ runtime, sandboxId, cmd: "mkdir", args: ["-p", workstation] });
|
|
27
|
-
const sourcePaths = [];
|
|
28
|
-
for (const sourceDatasetId of sourceDatasetIds) {
|
|
29
|
-
const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
|
|
30
|
-
const source = await datasetReadOutputJsonlStep({ runtime, datasetId: sourceDatasetId });
|
|
31
|
-
await writeDatasetSandboxFilesStep({
|
|
32
|
-
runtime,
|
|
33
|
-
sandboxId,
|
|
34
|
-
files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
|
|
35
|
-
});
|
|
36
|
-
sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
|
|
37
|
-
}
|
|
38
|
-
state.sourcePaths = sourcePaths;
|
|
39
|
-
state.initialized = true;
|
|
40
|
-
return { sourcePaths, outputPath: getDatasetOutputPath(datasetId) };
|
|
41
|
-
}
|
|
42
17
|
function createTransformDatasetContextDefinition(params) {
|
|
43
18
|
const fallbackDatasetId = params.datasetId;
|
|
44
19
|
const model = params.model ?? "openai/gpt-5";
|
|
@@ -67,18 +42,19 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
67
42
|
if (!sandboxId) {
|
|
68
43
|
throw new Error("dataset_sandbox_required");
|
|
69
44
|
}
|
|
70
|
-
const
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
45
|
+
const initialized = await ensureTransformSourcesInSandboxStep({
|
|
46
|
+
runtime,
|
|
47
|
+
sandboxId,
|
|
48
|
+
datasetId,
|
|
49
|
+
sourceDatasetIds,
|
|
50
|
+
state: sandboxState,
|
|
51
|
+
});
|
|
52
|
+
const sourcePreviews = await generateTransformSourcePreviewsStep({
|
|
53
|
+
runtime,
|
|
54
|
+
sandboxId,
|
|
55
|
+
datasetId,
|
|
56
|
+
sourcePaths: initialized.sourcePaths,
|
|
57
|
+
});
|
|
82
58
|
await datasetUpdateSchemaStep({
|
|
83
59
|
runtime,
|
|
84
60
|
datasetId,
|
|
@@ -89,11 +65,16 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
89
65
|
datasetId,
|
|
90
66
|
sourceDatasetIds,
|
|
91
67
|
outputSchema,
|
|
92
|
-
sandboxConfig: {
|
|
68
|
+
sandboxConfig: {
|
|
69
|
+
sourcePaths: initialized.sourcePaths,
|
|
70
|
+
outputPath: initialized.outputPath,
|
|
71
|
+
},
|
|
93
72
|
sourcePreviews: sourcePreviews.length > 0 ? sourcePreviews : undefined,
|
|
94
73
|
errors: [],
|
|
95
74
|
};
|
|
96
|
-
const basePrompt =
|
|
75
|
+
const basePrompt = await buildTransformDatasetPromptStep({
|
|
76
|
+
context: promptContext,
|
|
77
|
+
});
|
|
97
78
|
const userInstructions = String(instructions ?? "").trim();
|
|
98
79
|
const system = userInstructions
|
|
99
80
|
? [
|
|
@@ -112,9 +93,12 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
112
93
|
outputSchema,
|
|
113
94
|
instructions,
|
|
114
95
|
sandboxId,
|
|
115
|
-
sandboxState,
|
|
96
|
+
sandboxState: initialized.state,
|
|
116
97
|
system,
|
|
117
|
-
sandboxConfig: {
|
|
98
|
+
sandboxConfig: {
|
|
99
|
+
sourcePaths: initialized.sourcePaths,
|
|
100
|
+
outputPath: initialized.outputPath,
|
|
101
|
+
},
|
|
118
102
|
};
|
|
119
103
|
})
|
|
120
104
|
.narrative(async (stored) => {
|
|
@@ -146,6 +130,10 @@ function createTransformDatasetContextDefinition(params) {
|
|
|
146
130
|
};
|
|
147
131
|
})
|
|
148
132
|
.shouldContinue(({ reactionEvent }) => {
|
|
133
|
+
const fatalFailure = getDatasetFatalFailure(reactionEvent);
|
|
134
|
+
if (fatalFailure) {
|
|
135
|
+
throw new Error(fatalFailure);
|
|
136
|
+
}
|
|
149
137
|
return !didCompleteDatasetSucceed(reactionEvent);
|
|
150
138
|
});
|
|
151
139
|
if (params.reactor) {
|
|
@@ -193,7 +181,13 @@ export function createTransformDatasetContext(params) {
|
|
|
193
181
|
runtime: runtime,
|
|
194
182
|
context: { key: `dataset:${datasetId}` },
|
|
195
183
|
durable: options.durable ?? false,
|
|
196
|
-
options: {
|
|
184
|
+
options: {
|
|
185
|
+
silent: true,
|
|
186
|
+
preventClose: true,
|
|
187
|
+
sendFinish: false,
|
|
188
|
+
maxIterations: 20,
|
|
189
|
+
maxModelSteps: 5,
|
|
190
|
+
},
|
|
197
191
|
__initialContent: {
|
|
198
192
|
datasetId,
|
|
199
193
|
sourceDatasetIds: params.sourceDatasetIds,
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { TransformPromptContext, TransformSandboxState, TransformSourcePreviewContext } from "./transform-dataset.types.js";
|
|
2
|
+
export declare function ensureTransformSourcesInSandboxStep(params: {
|
|
3
|
+
runtime: any;
|
|
4
|
+
sandboxId: string;
|
|
5
|
+
datasetId: string;
|
|
6
|
+
sourceDatasetIds: string[];
|
|
7
|
+
state: TransformSandboxState;
|
|
8
|
+
}): Promise<{
|
|
9
|
+
sourcePaths: Array<{
|
|
10
|
+
datasetId: string;
|
|
11
|
+
path: string;
|
|
12
|
+
}>;
|
|
13
|
+
outputPath: string;
|
|
14
|
+
state: TransformSandboxState;
|
|
15
|
+
}>;
|
|
16
|
+
export declare function generateTransformSourcePreviewsStep(params: {
|
|
17
|
+
runtime: any;
|
|
18
|
+
sandboxId: string;
|
|
19
|
+
datasetId: string;
|
|
20
|
+
sourcePaths: Array<{
|
|
21
|
+
datasetId: string;
|
|
22
|
+
path: string;
|
|
23
|
+
}>;
|
|
24
|
+
}): Promise<Array<{
|
|
25
|
+
datasetId: string;
|
|
26
|
+
preview: TransformSourcePreviewContext;
|
|
27
|
+
}>>;
|
|
28
|
+
export declare function buildTransformDatasetPromptStep(params: {
|
|
29
|
+
context: TransformPromptContext;
|
|
30
|
+
}): Promise<string>;
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
|
|
2
|
+
import { datasetReadOutputJsonlStep } from "../dataset/steps.js";
|
|
3
|
+
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
4
|
+
import { generateSourcePreview } from "./filepreview.js";
|
|
5
|
+
import { buildTransformDatasetPrompt } from "./prompts.js";
|
|
6
|
+
export async function ensureTransformSourcesInSandboxStep(params) {
|
|
7
|
+
"use step";
|
|
8
|
+
if (params.state.initialized) {
|
|
9
|
+
return {
|
|
10
|
+
sourcePaths: params.state.sourcePaths,
|
|
11
|
+
outputPath: getDatasetOutputPath(params.datasetId),
|
|
12
|
+
state: params.state,
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
const workstation = getDatasetWorkstation(params.datasetId);
|
|
16
|
+
await runDatasetSandboxCommandStep({
|
|
17
|
+
runtime: params.runtime,
|
|
18
|
+
sandboxId: params.sandboxId,
|
|
19
|
+
cmd: "mkdir",
|
|
20
|
+
args: ["-p", workstation],
|
|
21
|
+
});
|
|
22
|
+
const sourcePaths = [];
|
|
23
|
+
for (const sourceDatasetId of params.sourceDatasetIds) {
|
|
24
|
+
const sourcePath = `${workstation}/source_${sourceDatasetId}.jsonl`;
|
|
25
|
+
const source = await datasetReadOutputJsonlStep({
|
|
26
|
+
runtime: params.runtime,
|
|
27
|
+
datasetId: sourceDatasetId,
|
|
28
|
+
});
|
|
29
|
+
await writeDatasetSandboxFilesStep({
|
|
30
|
+
runtime: params.runtime,
|
|
31
|
+
sandboxId: params.sandboxId,
|
|
32
|
+
files: [{ path: sourcePath, contentBase64: source.contentBase64 }],
|
|
33
|
+
});
|
|
34
|
+
sourcePaths.push({ datasetId: sourceDatasetId, path: sourcePath });
|
|
35
|
+
}
|
|
36
|
+
return {
|
|
37
|
+
sourcePaths,
|
|
38
|
+
outputPath: getDatasetOutputPath(params.datasetId),
|
|
39
|
+
state: {
|
|
40
|
+
initialized: true,
|
|
41
|
+
sourcePaths,
|
|
42
|
+
},
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
export async function generateTransformSourcePreviewsStep(params) {
|
|
46
|
+
"use step";
|
|
47
|
+
const sourcePreviews = [];
|
|
48
|
+
for (const sourcePath of params.sourcePaths) {
|
|
49
|
+
try {
|
|
50
|
+
const preview = await generateSourcePreview(params.runtime, params.sandboxId, sourcePath.path, params.datasetId);
|
|
51
|
+
sourcePreviews.push({ datasetId: sourcePath.datasetId, preview });
|
|
52
|
+
}
|
|
53
|
+
catch {
|
|
54
|
+
// Source preview is optional; transformation can still read the JSONL files.
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
return sourcePreviews;
|
|
58
|
+
}
|
|
59
|
+
export async function buildTransformDatasetPromptStep(params) {
|
|
60
|
+
"use step";
|
|
61
|
+
return buildTransformDatasetPrompt(params.context);
|
|
62
|
+
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import type { ContextReactor } from "@ekairos/events";
|
|
2
|
+
import type { TransformSourcePreviewContext } from "./filepreview.js";
|
|
3
|
+
export type { TransformSourcePreviewContext } from "./filepreview.js";
|
|
4
|
+
export type TransformSandboxState = {
|
|
5
|
+
initialized: boolean;
|
|
6
|
+
sourcePaths: Array<{
|
|
7
|
+
datasetId: string;
|
|
8
|
+
path: string;
|
|
9
|
+
}>;
|
|
10
|
+
};
|
|
11
|
+
export type TransformDatasetContext = {
|
|
12
|
+
datasetId: string;
|
|
13
|
+
sourceDatasetIds: string[];
|
|
14
|
+
outputSchema: any;
|
|
15
|
+
sandboxConfig: {
|
|
16
|
+
sourcePaths: Array<{
|
|
17
|
+
datasetId: string;
|
|
18
|
+
path: string;
|
|
19
|
+
}>;
|
|
20
|
+
outputPath: string;
|
|
21
|
+
};
|
|
22
|
+
sourcePreviews?: Array<{
|
|
23
|
+
datasetId: string;
|
|
24
|
+
preview: TransformSourcePreviewContext;
|
|
25
|
+
}>;
|
|
26
|
+
errors: string[];
|
|
27
|
+
iterationCount: number;
|
|
28
|
+
instructions?: string;
|
|
29
|
+
};
|
|
30
|
+
export type TransformDatasetAgentParams = {
|
|
31
|
+
sourceDatasetIds?: string[];
|
|
32
|
+
outputSchema?: any;
|
|
33
|
+
instructions?: string;
|
|
34
|
+
datasetId?: string;
|
|
35
|
+
model?: string;
|
|
36
|
+
sandboxId?: string;
|
|
37
|
+
reactor?: ContextReactor<any, any>;
|
|
38
|
+
};
|
|
39
|
+
export type TransformDatasetRunOptions = {
|
|
40
|
+
prompt?: string;
|
|
41
|
+
durable?: boolean;
|
|
42
|
+
};
|
|
43
|
+
export type TransformDatasetResult = {
|
|
44
|
+
id: string;
|
|
45
|
+
status?: string;
|
|
46
|
+
title?: string;
|
|
47
|
+
schema?: any;
|
|
48
|
+
analysis?: any;
|
|
49
|
+
calculatedTotalRows?: number;
|
|
50
|
+
actualGeneratedRowCount?: number;
|
|
51
|
+
createdAt?: number;
|
|
52
|
+
updatedAt?: number;
|
|
53
|
+
};
|
|
54
|
+
export type TransformPromptContext = {
|
|
55
|
+
datasetId: string;
|
|
56
|
+
sourceDatasetIds: string[];
|
|
57
|
+
outputSchema: any;
|
|
58
|
+
sandboxConfig: {
|
|
59
|
+
sourcePaths: Array<{
|
|
60
|
+
datasetId: string;
|
|
61
|
+
path: string;
|
|
62
|
+
}>;
|
|
63
|
+
outputPath: string;
|
|
64
|
+
};
|
|
65
|
+
sourcePreviews?: Array<{
|
|
66
|
+
datasetId: string;
|
|
67
|
+
preview: {
|
|
68
|
+
totalRows: number;
|
|
69
|
+
metadata?: {
|
|
70
|
+
description: string;
|
|
71
|
+
script: string;
|
|
72
|
+
command: string;
|
|
73
|
+
stdout: string;
|
|
74
|
+
stderr: string;
|
|
75
|
+
};
|
|
76
|
+
head?: {
|
|
77
|
+
description: string;
|
|
78
|
+
script: string;
|
|
79
|
+
command: string;
|
|
80
|
+
stdout: string;
|
|
81
|
+
stderr: string;
|
|
82
|
+
};
|
|
83
|
+
};
|
|
84
|
+
}>;
|
|
85
|
+
errors: string[];
|
|
86
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.
|
|
3
|
+
"version": "1.22.57-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.57-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.57-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.57-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|