@ekairos/dataset 1.22.80-beta.development.0 → 1.22.81-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/datasetFiles.js
CHANGED
|
@@ -1,22 +1,10 @@
|
|
|
1
1
|
export const DATASET_OUTPUT_FILE_NAME = "output.jsonl";
|
|
2
|
-
const
|
|
3
|
-
const DEFAULT_DAYTONA_WORKDIR_BASE = "/home/daytona/.ekairos/datasets";
|
|
4
|
-
const DEFAULT_SPRITES_WORKDIR_BASE = "/workspace/.ekairos/datasets";
|
|
2
|
+
const DATASET_WORKDIR_BASE = "/tmp/ekairos/dataset";
|
|
5
3
|
function trimTrailingSlash(value) {
|
|
6
4
|
return value.endsWith("/") ? value.slice(0, -1) : value;
|
|
7
5
|
}
|
|
8
6
|
export function getDatasetWorkdirBase() {
|
|
9
|
-
|
|
10
|
-
if (explicit)
|
|
11
|
-
return trimTrailingSlash(explicit);
|
|
12
|
-
const provider = String(process.env.SANDBOX_PROVIDER ?? "").trim().toLowerCase();
|
|
13
|
-
if (provider === "daytona")
|
|
14
|
-
return DEFAULT_DAYTONA_WORKDIR_BASE;
|
|
15
|
-
if (provider === "vercel")
|
|
16
|
-
return DEFAULT_VERCEL_WORKDIR_BASE;
|
|
17
|
-
if (provider === "sprites")
|
|
18
|
-
return DEFAULT_SPRITES_WORKDIR_BASE;
|
|
19
|
-
return DEFAULT_VERCEL_WORKDIR_BASE;
|
|
7
|
+
return trimTrailingSlash(DATASET_WORKDIR_BASE);
|
|
20
8
|
}
|
|
21
9
|
export function getDatasetWorkstation(datasetId) {
|
|
22
10
|
return `${getDatasetWorkdirBase()}/${datasetId}`;
|
|
@@ -1,26 +1,14 @@
|
|
|
1
1
|
import { getDatasetSourcesDir, getDatasetStandardDirs, getDatasetWorkstation, } from "../datasetFiles.js";
|
|
2
2
|
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
3
3
|
import { buildFileDatasetPrompt } from "./prompts.js";
|
|
4
|
-
import { generateFilePreview, ensurePreviewScriptsAvailable } from "./filepreview.js";
|
|
4
|
+
import { generateFilePreview } from "./filepreview.js";
|
|
5
5
|
import { readInstantFileStep } from "./steps.js";
|
|
6
6
|
export async function initializeFileParseSandboxStep(params) {
|
|
7
7
|
"use step";
|
|
8
8
|
if (params.state.initialized) {
|
|
9
9
|
return { filePath: params.state.filePath, state: params.state };
|
|
10
10
|
}
|
|
11
|
-
console.log(`[FileParseContext ${params.datasetId}]
|
|
12
|
-
await ensurePreviewScriptsAvailable(params.runtime, params.sandboxId);
|
|
13
|
-
console.log(`[FileParseContext ${params.datasetId}] Installing Python dependencies...`);
|
|
14
|
-
const pipInstall = await runDatasetSandboxCommandStep({
|
|
15
|
-
runtime: params.runtime,
|
|
16
|
-
sandboxId: params.sandboxId,
|
|
17
|
-
cmd: "python",
|
|
18
|
-
args: ["-m", "pip", "install", "pandas", "openpyxl", "--quiet", "--upgrade"],
|
|
19
|
-
});
|
|
20
|
-
const installStderr = pipInstall.stderr;
|
|
21
|
-
if (installStderr && (installStderr.includes("ERROR") || installStderr.includes("FAILED"))) {
|
|
22
|
-
throw new Error(`pip install failed: ${installStderr.substring(0, 300)}`);
|
|
23
|
-
}
|
|
11
|
+
console.log(`[FileParseContext ${params.datasetId}] Preparing source file in sandbox...`);
|
|
24
12
|
console.log(`[FileParseContext ${params.datasetId}] Fetching file from InstantDB...`);
|
|
25
13
|
const file = await readInstantFileStep({ runtime: params.runtime, fileId: params.fileId });
|
|
26
14
|
console.log(`[FileParseContext ${params.datasetId}] Creating dataset workstation...`);
|
|
@@ -5,7 +5,6 @@ interface PreviewOptions {
|
|
|
5
5
|
tailLines?: number;
|
|
6
6
|
midLines?: number;
|
|
7
7
|
}
|
|
8
|
-
export declare function resolveFilePreviewScriptPath(scriptName: string): string;
|
|
9
8
|
export declare function getEmbeddedFilePreviewScriptBase64(scriptName: string): string;
|
|
10
|
-
export declare function ensurePreviewScriptsAvailable(runtime: any, sandboxId: string): Promise<void>;
|
|
9
|
+
export declare function ensurePreviewScriptsAvailable(_runtime: any, _sandboxId: string): Promise<void>;
|
|
11
10
|
export declare function generateFilePreview(runtime: any, sandboxId: string, sandboxFilePath: string, datasetId: string, options?: PreviewOptions): Promise<FilePreviewContext>;
|
package/dist/file/filepreview.js
CHANGED
|
@@ -1,40 +1,8 @@
|
|
|
1
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
2
|
-
import { dirname, join } from "node:path";
|
|
3
|
-
import { fileURLToPath } from "node:url";
|
|
4
|
-
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
1
|
+
import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
|
|
5
2
|
import { PYTHON_SCRIPT_BASE64_BY_NAME } from "./scripts.generated.js";
|
|
6
3
|
const DEFAULT_HEAD_LINES = 50;
|
|
7
4
|
const DEFAULT_TAIL_LINES = 20;
|
|
8
5
|
const DEFAULT_MID_LINES = 20;
|
|
9
|
-
const SANDBOX_SCRIPT_DIRECTORY = "/tmp/ekairos/dataset/file/scripts";
|
|
10
|
-
const PYTHON_SCRIPT_FILES = [
|
|
11
|
-
"file_metadata.py",
|
|
12
|
-
"preview_head_csv.py",
|
|
13
|
-
"preview_head_excel.py",
|
|
14
|
-
"preview_mid_csv.py",
|
|
15
|
-
"preview_mid_excel.py",
|
|
16
|
-
"preview_tail_csv.py",
|
|
17
|
-
"preview_tail_excel.py",
|
|
18
|
-
];
|
|
19
|
-
export function resolveFilePreviewScriptPath(scriptName) {
|
|
20
|
-
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
21
|
-
const taskRoot = String(process.env.LAMBDA_TASK_ROOT ?? "").trim();
|
|
22
|
-
const candidates = [
|
|
23
|
-
join(currentDir, "scripts", scriptName),
|
|
24
|
-
join(process.cwd(), "node_modules", "@ekairos", "dataset", "dist", "file", "scripts", scriptName),
|
|
25
|
-
taskRoot
|
|
26
|
-
? join(taskRoot, "node_modules", "@ekairos", "dataset", "dist", "file", "scripts", scriptName)
|
|
27
|
-
: "",
|
|
28
|
-
join(process.cwd(), "packages", "dataset", "dist", "file", "scripts", scriptName),
|
|
29
|
-
join(process.cwd(), "packages", "dataset", "src", "file", "scripts", scriptName),
|
|
30
|
-
].filter(Boolean);
|
|
31
|
-
for (const candidate of candidates) {
|
|
32
|
-
if (existsSync(candidate)) {
|
|
33
|
-
return candidate;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
throw new Error(`dataset_preview_script_not_found:${scriptName}; searched=${candidates.join(",")}`);
|
|
37
|
-
}
|
|
38
6
|
export function getEmbeddedFilePreviewScriptBase64(scriptName) {
|
|
39
7
|
const embedded = PYTHON_SCRIPT_BASE64_BY_NAME[scriptName];
|
|
40
8
|
if (!embedded) {
|
|
@@ -42,31 +10,9 @@ export function getEmbeddedFilePreviewScriptBase64(scriptName) {
|
|
|
42
10
|
}
|
|
43
11
|
return embedded;
|
|
44
12
|
}
|
|
45
|
-
function readFilePreviewScriptBase64(scriptName) {
|
|
46
|
-
try {
|
|
47
|
-
const scriptPath = resolveFilePreviewScriptPath(scriptName);
|
|
48
|
-
return Buffer.from(readFileSync(scriptPath)).toString("base64");
|
|
49
|
-
}
|
|
50
|
-
catch (error) {
|
|
51
|
-
try {
|
|
52
|
-
return getEmbeddedFilePreviewScriptBase64(scriptName);
|
|
53
|
-
}
|
|
54
|
-
catch {
|
|
55
|
-
throw error;
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
13
|
function readFilePreviewScriptText(scriptName) {
|
|
60
|
-
try {
|
61
|
-
const scriptPath = resolveFilePreviewScriptPath(scriptName);
|
|
62
|
-
return readFileSync(scriptPath, "utf-8");
|
|
63
|
-
}
|
|
64
|
-
catch {
|
|
65
|
-
return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
|
|
66
|
-
}
|
|
14
|
+
return Buffer.from(getEmbeddedFilePreviewScriptBase64(scriptName), "base64").toString("utf-8");
|
|
67
15
|
}
|
|
68
|
-
const preparedSandboxIds = new Set();
|
|
69
|
-
const sandboxSetupPromises = new Map();
|
|
70
16
|
function sanitizePreviewText(value) {
|
|
71
17
|
return String(value ?? "")
|
|
72
18
|
.replace(/\u0000/g, "")
|
|
@@ -99,65 +45,16 @@ function validateScriptResult(result, context) {
|
|
|
99
45
|
throw new Error(`${context} failed: ${stderr.substring(0, 500)}`);
|
|
100
46
|
}
|
|
101
47
|
}
|
|
102
|
-
export async function ensurePreviewScriptsAvailable(runtime, sandboxId) {
|
|
103
|
-
if (preparedSandboxIds.has(sandboxId)) {
|
104
|
-
return;
|
|
105
|
-
}
|
|
106
|
-
const inFlight = sandboxSetupPromises.get(sandboxId);
|
|
107
|
-
if (inFlight) {
|
|
108
|
-
await inFlight;
|
|
109
|
-
return;
|
|
110
|
-
}
|
|
111
|
-
const setupPromise = (async () => {
|
|
112
|
-
try {
|
|
113
|
-
await runDatasetSandboxCommandStep({
|
|
114
|
-
runtime,
|
|
115
|
-
sandboxId,
|
|
116
|
-
cmd: "mkdir",
|
|
117
|
-
args: ["-p", SANDBOX_SCRIPT_DIRECTORY],
|
|
118
|
-
});
|
|
119
|
-
}
|
|
120
|
-
catch (error) {
|
|
121
|
-
console.warn("[Dataset Scripts] Failed to create sandbox scripts directory", error);
|
|
122
|
-
}
|
|
123
|
-
const filesToWrite = [];
|
|
124
|
-
for (const scriptName of PYTHON_SCRIPT_FILES) {
|
|
125
|
-
try {
|
|
126
|
-
filesToWrite.push({
|
|
127
|
-
path: `${SANDBOX_SCRIPT_DIRECTORY}/${scriptName}`,
|
|
128
|
-
contentBase64: readFilePreviewScriptBase64(scriptName),
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
catch (error) {
|
|
132
|
-
console.error(`[Dataset Scripts] Failed to read script ${scriptName}`, error);
|
|
133
|
-
throw error;
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
if (filesToWrite.length > 0) {
|
|
137
|
-
await writeDatasetSandboxFilesStep({
|
|
138
|
-
runtime,
|
|
139
|
-
sandboxId,
|
|
140
|
-
files: filesToWrite,
|
|
141
|
-
});
|
|
142
|
-
}
|
|
143
|
-
})();
|
|
144
|
-
sandboxSetupPromises.set(sandboxId, setupPromise);
|
|
145
|
-
try {
|
|
146
|
-
await setupPromise;
|
|
147
|
-
preparedSandboxIds.add(sandboxId);
|
|
148
|
-
}
|
|
149
|
-
catch (error) {
|
|
150
|
-
sandboxSetupPromises.delete(sandboxId);
|
|
151
|
-
throw error;
|
|
152
|
-
}
|
|
48
|
+
export async function ensurePreviewScriptsAvailable(_runtime, _sandboxId) {
|
|
49
|
+
return;
|
|
153
50
|
}
|
|
154
51
|
export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, datasetId, options = {}) {
|
|
155
52
|
const context = {
|
|
156
53
|
totalRows: 0,
|
|
157
54
|
};
|
|
158
55
|
try {
|
|
159
|
-
await ensurePreviewScriptsAvailable(runtime, sandboxId);
|
|
160
56
|
const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
|
|
57
|
+
validateScriptResult(metadataResult, `preview_metadata for ${datasetId}`);
|
|
161
58
|
context.metadata = metadataResult;
|
|
162
59
|
let previewKind = null;
|
|
163
60
|
if (metadataResult.stdout) {
|
|
@@ -219,25 +116,19 @@ export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, d
|
|
|
219
116
|
}
|
|
220
117
|
catch (error) {
|
|
221
118
|
console.error(`[Dataset ${datasetId}] Error generating file preview:`, error);
|
|
119
|
+
throw error;
|
|
222
120
|
}
|
|
223
121
|
return context;
|
|
224
122
|
}
|
|
225
123
|
async function runScript(runtime, sandboxId, scriptName, args, description) {
|
|
226
|
-
const
|
|
227
|
-
const command = `python
|
|
228
|
-
let scriptContent = "";
|
|
229
|
-
try {
|
|
230
|
-
scriptContent = readFilePreviewScriptText(scriptName);
|
|
231
|
-
}
|
|
232
|
-
catch (error) {
|
|
233
|
-
console.warn(`Failed to read script ${scriptName}:`, error);
|
|
234
|
-
}
|
|
124
|
+
const scriptContent = readFilePreviewScriptText(scriptName);
|
|
125
|
+
const command = `python -c <${scriptName}> ${args.join(" ")}`;
|
|
235
126
|
try {
|
|
236
127
|
const result = await runDatasetSandboxCommandStep({
|
|
237
128
|
runtime,
|
|
238
129
|
sandboxId,
|
|
239
130
|
cmd: "python",
|
|
240
|
-
args: [
|
|
131
|
+
args: ["-c", scriptContent, ...args],
|
|
241
132
|
});
|
|
242
133
|
return {
|
|
243
134
|
description,
|
|
@@ -1,30 +1,18 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep } from "../sandbox/steps.js";
|
|
1
|
+
import { runDatasetSandboxCommandStep } from "../sandbox/steps.js";
|
|
3
2
|
const DEFAULT_HEAD_LINES = 50;
|
|
4
3
|
async function runPythonSnippet(runtime, sandboxId, datasetId, scriptName, code, args, description) {
|
|
5
|
-
const scriptPath = `${getDatasetScriptsDir(datasetId)}/${scriptName}.py`;
|
|
6
|
-
await writeDatasetSandboxFilesStep({
|
|
7
|
-
runtime,
|
|
8
|
-
sandboxId,
|
|
9
|
-
files: [
|
|
10
|
-
{
|
|
11
|
-
path: scriptPath,
|
|
12
|
-
contentBase64: Buffer.from(code, "utf-8").toString("base64"),
|
|
13
|
-
},
|
|
14
|
-
],
|
|
15
|
-
});
|
|
16
4
|
const result = await runDatasetSandboxCommandStep({
|
|
17
5
|
runtime,
|
|
18
6
|
sandboxId,
|
|
19
7
|
cmd: "python",
|
|
20
|
-
args: [
|
|
8
|
+
args: ["-c", code, ...args],
|
|
21
9
|
});
|
|
22
10
|
const stdout = result.stdout || "";
|
|
23
11
|
const stderr = result.stderr || "";
|
|
24
12
|
return {
|
|
25
13
|
description,
|
|
26
14
|
script: code,
|
|
27
|
-
command: `python
|
|
15
|
+
command: `python -c <${scriptName}.py> ${args.join(" ")}`,
|
|
28
16
|
stdout,
|
|
29
17
|
stderr,
|
|
30
18
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.80-beta.development.0",
|
|
3
|
+
"version": "1.22.81-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.81-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.81-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.81-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|