@ekairos/dataset 1.22.78-beta.development.0 → 1.22.79-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/builder/materialize.js +165 -1
- package/package.json +4 -4
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
|
+
import { readInstantFileStep } from "../file/steps.js";
|
|
2
3
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
3
4
|
import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
5
|
+
import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
|
|
4
6
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
5
7
|
import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
6
8
|
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
|
|
7
9
|
import { getDomainDescriptor } from "./sourceRows.js";
|
|
8
10
|
import { materializeQuerySource } from "./materializeQuery.js";
|
|
9
|
-
import { createDatasetSandboxStep } from "../sandbox/steps.js";
|
|
11
|
+
import { createDatasetSandboxStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
10
12
|
/**
 * Builds the id used for an intermediate dataset derived from one source of a target dataset.
 *
 * @param {string} targetDatasetId - Id of the dataset being materialized.
 * @param {string} sourceKind - Kind label of the source the intermediate dataset comes from.
 * @param {number} index - Position of the source; appended as the last id segment.
 * @returns {string} `<targetDatasetId>__<sourceKind>_<index>`.
 */
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
    // String.prototype.concat stringifies each argument the same way a template literal does.
    return "".concat(targetDatasetId, "__", sourceKind, "_", index);
}
|
|
@@ -41,6 +43,163 @@ function materializeRawTextRows(source) {
|
|
|
41
43
|
}
|
|
42
44
|
return [{ text }];
|
|
43
45
|
}
|
|
46
|
+
/**
 * Extracts the file name from a Content-Disposition header value.
 *
 * Forms are tried in this order:
 *   1. extended `filename*=UTF-8'<language>'<percent-encoded>` (language may be empty),
 *   2. quoted `filename="..."`,
 *   3. unquoted `filename=...` (up to the next `;`).
 *
 * @param {unknown} value - Header value; `null`/`undefined` are treated as an empty string.
 * @returns {string} The trimmed file name, or `""` when none is present.
 */
function parseContentDispositionFileName(value) {
    const text = String(value ?? "");
    // The language tag between the two apostrophes is optional, so accept any (possibly empty) tag.
    const extendedMatch = /filename\*=UTF-8'[^']*'([^;]+)/i.exec(text);
    if (extendedMatch?.[1]) {
        try {
            return decodeURIComponent(extendedMatch[1]).trim();
        }
        catch {
            // Malformed percent-encoding: keep the raw value instead of failing the whole parse.
            return extendedMatch[1].trim();
        }
    }
    const quotedMatch = /filename="([^"]+)"/i.exec(text);
    if (quotedMatch?.[1]) {
        return quotedMatch[1].trim();
    }
    const plainMatch = /filename=([^;]+)/i.exec(text);
    if (plainMatch?.[1]) {
        // Reached for `filename=""` or unbalanced quotes too; never return the quote characters.
        return plainMatch[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
|
|
65
|
+
/**
 * Decides whether a Content-Disposition value describes a PDF file.
 *
 * The value counts as PDF when it mentions `application/pdf`, or when a `.pdf` extension
 * terminates a token (followed by end of string, a quote, or `;`). A bare substring test
 * would also accept names such as `a.pdf.exe`.
 *
 * @param {unknown} value - Header value; `null`/`undefined` are treated as an empty string.
 * @returns {boolean} `true` when the value looks like a PDF.
 */
function isPdfContentDisposition(value) {
    const text = String(value ?? "").toLowerCase();
    if (text.includes("application/pdf")) {
        return true;
    }
    // `.pdf` must be the final extension, optionally followed by whitespace before the delimiter.
    return /\.pdf\s*(?:$|["';])/.test(text);
}
|
|
69
|
+
/**
 * Turns an arbitrary name into a `.pdf` file name that is safe to use as a path segment.
 *
 * Path separators, reserved punctuation and control characters become `_`, whitespace runs
 * become `_`, and the result is capped at 120 characters including the extension. The stem
 * is truncated rather than the whole name, so the extension is never cut in half.
 *
 * @param {unknown} value - Preferred name; used when non-blank after trimming.
 * @param {unknown} fallback - Name used when `value` is blank.
 * @returns {string} Sanitized name ending in `.pdf` (an existing extension keeps its casing).
 */
function sanitizePdfFileName(value, fallback) {
    const maxLength = 120;
    const extension = ".pdf";
    const name = String(value ?? "").trim() || String(fallback ?? "");
    const cleaned = name
        // Whitespace control characters (tab, newline, ...) are left to the \s pass below.
        .replace(/[\\/:"*?<>|\u0000-\u0008\u000e-\u001f\u007f]+/g, "_")
        .replace(/\s+/g, "_");
    const hasExtension = cleaned.toLowerCase().endsWith(extension);
    const suffix = hasExtension ? cleaned.slice(-extension.length) : extension;
    const stem = hasExtension ? cleaned.slice(0, -extension.length) : cleaned;
    return `${stem.slice(0, maxLength - suffix.length)}${suffix}`;
}
|
|
74
|
+
/**
 * Creates the schema descriptor for rows of extracted PDF page text.
 *
 * Every call returns a fresh object. Each row is an object with exactly the required
 * fields `fileId`, `fileName`, `pageNumber` and `text`.
 *
 * @returns {{ title: string, description: string, schema: object }} Schema descriptor.
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    const rowSchema = {
        type: "object",
        additionalProperties: false,
        // All declared properties are mandatory, listed in declaration order.
        required: Object.keys(properties),
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema: rowSchema,
    };
}
|
|
91
|
+
/**
 * Parses JSON Lines text and collects the `data` payload of each record.
 *
 * Blank lines are ignored. A record contributes a row only when its `data` is a
 * non-null, non-array object. A line that is not valid JSON makes `JSON.parse` throw.
 *
 * @param {unknown} content - JSONL text; `null`/`undefined` are treated as empty.
 * @returns {object[]} The `data` objects, in input order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
function parseJsonlDataRows(content) {
    const rows = [];
    for (const rawLine of String(content ?? "").split(/\r?\n/g)) {
        const line = rawLine.trim();
        if (line === "") {
            continue;
        }
        const payload = JSON.parse(line)?.data;
        const isPlainRow = Boolean(payload) && typeof payload === "object" && !Array.isArray(payload);
        if (isPlainRow) {
            rows.push(payload);
        }
    }
    return rows;
}
|
|
100
|
+
/**
 * Returns the Python source of the sandbox script that extracts per-page text from a PDF.
 *
 * The script takes four positional arguments (source path, output path, file id, file name).
 * It writes one JSON line per page with non-empty text, shaped as
 * `{"type": "row", "data": {fileId, fileName, pageNumber, text}}`, with pages numbered from 1.
 * When no page yields text it writes a single placeholder row with `pageNumber` 0 and an
 * empty `text`.
 *
 * @returns {string} Python source code, newline-joined.
 */
function buildPdfTextExtractionScript() {
    // Indentation inside these literals is significant: it is Python block structure.
    return [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "source_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(source_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
}
/**
 * Materializes a file source as page-text rows when the file looks like a PDF.
 *
 * Steps: read the file, bail out with `null` unless its content disposition indicates a PDF,
 * copy the bytes into the dataset sandbox workstation, install `pypdf`, run the extraction
 * script, read the JSONL output back and store the rows under `targetDatasetId`.
 *
 * @param {object} state - Materialization state; reads `runtime`, `title`, `instructions`, `first`.
 * @param {object} source - File source; reads `fileId` and `description`.
 * @param {string} targetDatasetId - Dataset that receives the extracted rows.
 * @returns {Promise<string|null>} `targetDatasetId` on success, `null` when the file is not a PDF.
 * @throws {Error} `dataset_pdf_workstation_create_failed:*` when the workstation directory cannot be created.
 * @throws {Error} `dataset_pdf_dependency_install_failed:*` when `pip install pypdf` exits non-zero.
 * @throws {Error} `dataset_pdf_text_extraction_failed:*` when the extraction script exits non-zero.
 * @throws {Error} `dataset_pdf_text_extraction_empty` when the output contains no usable rows.
 */
async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
    if (!isPdfContentDisposition(file.contentDisposition)) {
        return null;
    }
    const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
    const workstation = getDatasetWorkstation(targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
    const sourcePath = `${workstation}/${fileName}`;
    const scriptPath = `${workstation}/extract_pdf_text.py`;
    const mkdir = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", workstation],
    });
    // Checked like the install and extraction steps, so a failure is reported at its cause.
    if (mkdir.exitCode !== 0) {
        throw new Error(`dataset_pdf_workstation_create_failed:${mkdir.stderr || mkdir.stdout}`);
    }
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
    });
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [
            {
                path: scriptPath,
                content: buildPdfTextExtractionScript(),
            },
        ],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        // Order must match the sys.argv[1..4] reads in the extraction script.
        args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
        sourceKinds: ["file"],
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
|
|
44
203
|
async function materializeRawTextSource(state, source, targetDatasetId) {
|
|
45
204
|
const rows = materializeRawTextRows(source);
|
|
46
205
|
await materializeRowsToDataset(state.runtime, {
|
|
@@ -97,6 +256,11 @@ export async function resolveDatasetAgentDurable(requestedDurable) {
|
|
|
97
256
|
return true;
|
|
98
257
|
}
|
|
99
258
|
export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
|
|
259
|
+
if (source.kind === "file" && !state.outputSchema) {
|
|
260
|
+
const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
|
|
261
|
+
if (materializedPdf)
|
|
262
|
+
return materializedPdf;
|
|
263
|
+
}
|
|
100
264
|
if (!state.reactor) {
|
|
101
265
|
throw new Error("dataset_reactor_required");
|
|
102
266
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.78-beta.development.0",
|
|
3
|
+
"version": "1.22.79-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.79-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.79-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.79-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|