@ekairos/dataset 1.22.58-beta.development.0 → 1.22.60-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.js +64 -1
- package/dist/file/filepreview.js +32 -8
- package/package.json +4 -4
|
@@ -3,13 +3,66 @@ import { createTransformDatasetContext } from "../transform/transform-dataset.ag
|
|
|
3
3
|
import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
4
4
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
5
5
|
import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
6
|
-
import { createOrUpdateDatasetMetadata, uploadInlineTextSource, } from "./persistence.js";
|
|
6
|
+
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
|
|
7
7
|
import { getDomainDescriptor } from "./sourceRows.js";
|
|
8
8
|
import { materializeQuerySource } from "./materializeQuery.js";
|
|
9
9
|
import { createDatasetSandboxStep } from "../sandbox/steps.js";
|
|
10
10
|
/**
 * Derives the id used for an intermediate dataset from the target dataset id.
 *
 * @param {*} targetDatasetId - Id of the dataset being built; interpolated as-is.
 * @param {*} sourceKind - Kind label of the source; interpolated as-is.
 * @param {*} index - Value appended after the source kind.
 * @returns {string} A string shaped like `<targetDatasetId>__<sourceKind>_<index>`.
 */
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
    const prefix = `${targetDatasetId}__`;
    const suffix = `${sourceKind}_${index}`;
    return prefix + suffix;
}
|
|
13
|
+
/**
 * Turns a parsed JSON value into a list of row records.
 *
 * - An array yields one row per item.
 * - A plain (non-array) object yields a single row.
 * - Anything else (primitives, null) is wrapped as `{ value }`.
 *
 * Array items that are not plain objects, including nested arrays, are wrapped
 * as `{ value: item }` so that every returned row is a record.
 *
 * @param {*} value - Result of `JSON.parse`.
 * @returns {object[]} Row records.
 */
function normalizeParsedTextRows(value) {
    const isRecord = (item) => item !== null && typeof item === "object" && !Array.isArray(item);
    if (Array.isArray(value)) {
        return value.map((item) => (isRecord(item) ? item : { value: item }));
    }
    if (isRecord(value)) {
        return [value];
    }
    return [{ value }];
}
/**
 * Converts an inline text source into rows.
 *
 * JSON sources (detected from the MIME type or the file name) are parsed:
 * line-delimited JSON (`.jsonl` / `.ndjson` names, or a MIME type containing
 * `jsonl` / `ndjson`) is parsed line by line, any other JSON as one document.
 * Non-JSON sources, and JSON sources that fail to parse, fall back to a single
 * `{ text }` row carrying the original text.
 *
 * @param {{ text?: *, mimeType?: *, name?: * }} source - Inline text source.
 * @returns {object[]} Rows derived from the source text.
 */
function materializeRawTextRows(source) {
    const text = String(source.text ?? "");
    const mimeType = String(source.mimeType ?? "").toLowerCase();
    const name = String(source.name ?? "").toLowerCase();
    // Line-delimited JSON may be announced by the MIME type as well as by the file name.
    const isJsonLines = name.endsWith(".jsonl") ||
        name.endsWith(".ndjson") ||
        mimeType.includes("jsonl") ||
        mimeType.includes("ndjson");
    const shouldParseJson = isJsonLines || mimeType.includes("json") || name.endsWith(".json");
    if (!shouldParseJson) {
        return [{ text }];
    }
    // JSON.parse rejects a leading byte-order mark, so drop it before parsing.
    const jsonText = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
    try {
        if (isJsonLines) {
            return jsonText
                .split(/\r?\n/)
                .map((line) => line.trim())
                .filter(Boolean)
                .flatMap((line) => normalizeParsedTextRows(JSON.parse(line)));
        }
        return normalizeParsedTextRows(JSON.parse(jsonText));
    }
    catch {
        // Deliberate best effort: unparseable JSON is kept as one raw text row.
        return [{ text }];
    }
}
|
|
44
|
+
/**
 * Converts an inline text source into rows and passes them to
 * `materializeRowsToDataset` under the given dataset id.
 *
 * @param {object} state - Supplies `runtime`, `sandboxId`, `title`,
 *   `instructions`, `outputSchema` and `first` for the materialization request.
 * @param {object} source - Text source; `mimeType`, `name` and `description`
 *   are forwarded in the request's source descriptor.
 * @param {string} targetDatasetId - Dataset id to materialize into.
 * @returns {Promise<string>} The same `targetDatasetId`, once the call resolves.
 */
async function materializeRawTextSource(state, source, targetDatasetId) {
    const parsedRows = materializeRawTextRows(source);
    const { mimeType, name, description } = source;
    const sourceDescriptor = { kind: "text", mimeType, name, description };
    const request = {
        datasetId: targetDatasetId,
        sandboxId: state.sandboxId,
        // Title falls back from the state to the source name, then to the dataset id.
        title: state.title ?? name ?? targetDatasetId,
        instructions: state.instructions,
        sources: [sourceDescriptor],
        sourceKinds: ["text"],
        rows: parsedRows,
        schema: state.outputSchema,
        first: state.first,
    };
    await materializeRowsToDataset(state.runtime, request);
    return targetDatasetId;
}
|
|
13
66
|
async function resolveDatasetSandboxId(state, targetDatasetId) {
|
|
14
67
|
const sandboxId = String(state.sandboxId ?? "").trim();
|
|
15
68
|
if (sandboxId)
|
|
@@ -90,6 +143,16 @@ async function normalizeSourceToDatasetId(state, source, targetDatasetId, source
|
|
|
90
143
|
});
|
|
91
144
|
return intermediateDatasetId;
|
|
92
145
|
}
|
|
146
|
+
if (source.kind === "text") {
|
|
147
|
+
await materializeRawTextSource({
|
|
148
|
+
...state,
|
|
149
|
+
outputSchema: undefined,
|
|
150
|
+
first: false,
|
|
151
|
+
instructions: buildRawSourceInstructions(source.kind),
|
|
152
|
+
title: source.name ?? state.title,
|
|
153
|
+
}, source, intermediateDatasetId);
|
|
154
|
+
return intermediateDatasetId;
|
|
155
|
+
}
|
|
93
156
|
await materializeSingleFileLikeSource({
|
|
94
157
|
...state,
|
|
95
158
|
outputSchema: undefined,
|
package/dist/file/filepreview.js
CHANGED
|
@@ -29,6 +29,26 @@ function resolveScriptPath(scriptName) {
|
|
|
29
29
|
}
|
|
30
30
|
const preparedSandboxIds = new Set();
|
|
31
31
|
const sandboxSetupPromises = new Map();
|
|
32
|
+
/**
 * Coerces a value to a string and removes control characters from it.
 *
 * Removed: NUL, U+0001–U+0008, U+000B, U+000C, U+000E–U+001F and U+007F.
 * Tab, line feed and carriage return are kept. `null` and `undefined`
 * become the empty string.
 *
 * @param {*} value - Value to sanitize.
 * @returns {string} The sanitized text.
 */
function sanitizePreviewText(value) {
    const text = value == null ? "" : String(value);
    return text.replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, "");
}
|
|
37
|
+
/**
 * Maps a file extension to the preview strategy used for it.
 *
 * Matching ignores case and surrounding whitespace, and the leading dot is
 * optional (`"csv"` and `".CSV"` are treated alike). Non-string or empty
 * input yields `null` instead of throwing.
 *
 * @param {*} extension - File extension, e.g. `".xlsx"`.
 * @returns {"excel" | "text" | null} `"excel"` for `.xlsx`/`.xls`; `"text"` for
 *   `.csv`, `.tsv`, `.txt`, `.log`, `.json`, `.jsonl`, `.md`; otherwise `null`.
 */
function getPreviewKind(extension) {
    if (typeof extension !== "string") {
        return null;
    }
    const trimmed = extension.trim().toLowerCase();
    if (trimmed === "") {
        return null;
    }
    const normalized = trimmed.startsWith(".") ? trimmed : `.${trimmed}`;
    switch (normalized) {
        case ".xlsx":
        case ".xls":
            return "excel";
        case ".csv":
        case ".tsv":
        case ".txt":
        case ".log":
        case ".json":
        case ".jsonl":
        case ".md":
            return "text";
        default:
            return null;
    }
}
|
|
32
52
|
function validateScriptResult(result, context) {
|
|
33
53
|
if (!result.stderr) {
|
|
34
54
|
return;
|
|
@@ -103,13 +123,13 @@ export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, d
|
|
|
103
123
|
await ensurePreviewScriptsAvailable(runtime, sandboxId);
|
|
104
124
|
const metadataResult = await runScript(runtime, sandboxId, "file_metadata.py", [sandboxFilePath], "Extracts file metadata: name, extension, size, row count estimate, column count, and header preview");
|
|
105
125
|
context.metadata = metadataResult;
|
|
106
|
-
let
|
|
126
|
+
let previewKind = null;
|
|
107
127
|
if (metadataResult.stdout) {
|
|
108
128
|
try {
|
|
109
129
|
const metadataJson = JSON.parse(metadataResult.stdout);
|
|
110
130
|
context.totalRows = metadataJson.row_count_estimate || 0;
|
|
111
131
|
const extension = metadataJson.extension || "";
|
|
112
|
-
|
|
132
|
+
previewKind = getPreviewKind(extension);
|
|
113
133
|
}
|
|
114
134
|
catch {
|
|
115
135
|
console.warn(`[Dataset ${datasetId}] Failed to parse metadata JSON`);
|
|
@@ -122,9 +142,13 @@ export async function generateFilePreview(runtime, sandboxId, sandboxFilePath, d
|
|
|
122
142
|
console.log(`[Dataset ${datasetId}] No rows detected, skipping preview`);
|
|
123
143
|
return context;
|
|
124
144
|
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
145
|
+
if (!previewKind) {
|
|
146
|
+
console.log(`[Dataset ${datasetId}] Binary or unsupported preview format, keeping metadata only`);
|
|
147
|
+
return context;
|
|
148
|
+
}
|
|
149
|
+
const headScript = previewKind === "excel" ? "preview_head_excel.py" : "preview_head_csv.py";
|
|
150
|
+
const tailScript = previewKind === "excel" ? "preview_tail_excel.py" : "preview_tail_csv.py";
|
|
151
|
+
const midScript = previewKind === "excel" ? "preview_mid_excel.py" : "preview_mid_csv.py";
|
|
128
152
|
if (totalRows <= headLines) {
|
|
129
153
|
console.log(`[Dataset ${datasetId}] File has ${totalRows} rows, reading all with head only`);
|
|
130
154
|
const headResult = await runScript(runtime, sandboxId, headScript, [sandboxFilePath, String(totalRows)], `Reads the first ${totalRows} rows (entire file)`);
|
|
@@ -184,8 +208,8 @@ async function runScript(runtime, sandboxId, scriptName, args, description) {
|
|
|
184
208
|
description,
|
|
185
209
|
script: scriptContent,
|
|
186
210
|
command,
|
|
187
|
-
stdout: result.stdout
|
|
188
|
-
stderr: result.stderr
|
|
211
|
+
stdout: sanitizePreviewText(result.stdout),
|
|
212
|
+
stderr: sanitizePreviewText(result.stderr),
|
|
189
213
|
};
|
|
190
214
|
}
|
|
191
215
|
catch (error) {
|
|
@@ -194,7 +218,7 @@ async function runScript(runtime, sandboxId, scriptName, args, description) {
|
|
|
194
218
|
script: scriptContent,
|
|
195
219
|
command,
|
|
196
220
|
stdout: "",
|
|
197
|
-
stderr: error instanceof Error ? error.message : String(error),
|
|
221
|
+
stderr: sanitizePreviewText(error instanceof Error ? error.message : String(error)),
|
|
198
222
|
};
|
|
199
223
|
}
|
|
200
224
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.58-beta.development.0",
|
|
3
|
+
"version": "1.22.60-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.60-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.60-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.60-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|