@ekairos/dataset 1.22.77-beta.development.0 → 1.22.79-beta.development.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { AnyDatasetRuntime, DatasetBuilderState, InternalSource } from "./types.js";
|
|
2
|
+
/**
 * Resolves the effective `durable` flag for a dataset agent run.
 *
 * @param requestedDurable - Durability requested by the caller; may be omitted.
 * @returns A promise resolving to the durable flag that should actually be used.
 */
export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
|
|
2
3
|
/**
 * Materializes a single source of kind "file" or "text" for the given target dataset.
 *
 * @param state - Dataset builder state for the active runtime.
 * @param source - The internal source to materialize; restricted to "file" | "text" kinds.
 * @param targetDatasetId - Identifier of the dataset being produced.
 * @returns A promise resolving to a string (presumably the materialized dataset id; confirm against the implementation).
 */
export declare function materializeSingleFileLikeSource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, source: Extract<InternalSource, {
|
|
3
4
|
kind: "file" | "text";
|
|
4
5
|
}>, targetDatasetId: string): Promise<string>;
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
import { createFileParseContext } from "../file/file-dataset.agent.js";
|
|
2
|
+
import { readInstantFileStep } from "../file/steps.js";
|
|
2
3
|
import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
|
|
3
4
|
import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
|
|
5
|
+
import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
|
|
4
6
|
import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
|
|
5
7
|
import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
|
|
6
8
|
import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
|
|
7
9
|
import { getDomainDescriptor } from "./sourceRows.js";
|
|
8
10
|
import { materializeQuerySource } from "./materializeQuery.js";
|
|
9
|
-
import { createDatasetSandboxStep } from "../sandbox/steps.js";
|
|
11
|
+
import { createDatasetSandboxStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
|
|
10
12
|
/**
 * Builds the id of an intermediate dataset derived from a target dataset.
 *
 * The result has the shape `<targetDatasetId>__<sourceKind>_<index>`.
 *
 * @param targetDatasetId - Id of the dataset the intermediate belongs to.
 * @param sourceKind - Kind of the source being materialized.
 * @param index - Position of the source.
 * @returns The composed intermediate dataset id.
 */
function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
    // String.prototype.concat stringifies each argument the same way a template literal does.
    return "".concat(targetDatasetId, "__", sourceKind, "_", index);
}
|
|
@@ -41,6 +43,163 @@ function materializeRawTextRows(source) {
|
|
|
41
43
|
}
|
|
42
44
|
return [{ text }];
|
|
43
45
|
}
|
|
46
|
+
/**
 * Extracts the file name from a Content-Disposition style header value.
 *
 * Lookup order:
 *   1. extended form  `filename*=UTF-8''<percent-encoded>`
 *   2. quoted form    `filename="<name>"`
 *   3. bare form      `filename=<name>` (up to the next `;`)
 *
 * @param value - Header value; `null`/`undefined` are treated as an empty string.
 * @returns The trimmed file name, or `""` when no file name is present.
 */
function parseContentDispositionFileName(value) {
    const header = String(value ?? "");
    const extended = /filename\*=UTF-8''([^;]+)/i.exec(header);
    if (extended?.[1]) {
        let decoded;
        try {
            decoded = decodeURIComponent(extended[1]);
        }
        catch {
            // Malformed percent-encoding: keep the raw token rather than failing.
            decoded = extended[1];
        }
        return decoded.trim();
    }
    const quoted = /filename="([^"]+)"/i.exec(header);
    if (quoted?.[1])
        return quoted[1].trim();
    const bare = /filename=([^;]+)/i.exec(header);
    if (bare?.[1]) {
        // The bare form is also reached for `filename=""` and for an unterminated
        // `filename="name`; strip the stray double quotes so they never end up in
        // the returned name.
        return bare[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
|
|
65
|
+
/**
 * Decides whether a Content-Disposition style header value describes a PDF.
 *
 * The value is considered a PDF when it mentions `application/pdf` or when it
 * contains a `.pdf` extension that is not followed by further file-name
 * characters. The boundary check keeps names such as `scan.pdf.zip` or
 * `notes.pdfx` from being classified as PDFs.
 *
 * @param value - Header value; `null`/`undefined` are treated as an empty string.
 * @returns `true` when the value looks like it refers to a PDF file.
 */
function isPdfContentDisposition(value) {
    const header = String(value ?? "").toLowerCase();
    if (header.includes("application/pdf"))
        return true;
    // `.pdf` must terminate the name: reject when a word character, dot or dash follows.
    return /\.pdf(?![\w.-])/.test(header);
}
|
|
69
|
+
/**
 * Produces a file-system friendly PDF file name.
 *
 * Steps: fall back when the requested name is blank, replace runs of path and
 * reserved characters with `_`, replace runs of whitespace with `_`, replace
 * any remaining control characters with `_`, cap the stem at 120 characters
 * and make sure the result ends in `.pdf`.
 *
 * @param value - Requested file name; `null`/`undefined`/blank selects `fallback`.
 * @param fallback - Name used when `value` is blank.
 * @returns The sanitized name, always ending in `.pdf` (case-insensitive check).
 */
function sanitizePdfFileName(value, fallback) {
    const requested = String(value ?? "").trim() || fallback;
    const cleaned = requested
        .replace(/[\\/:"*?<>|]+/g, "_")
        .replace(/\s+/g, "_")
        // Percent-decoded header values can carry NUL or other control characters
        // that are not whitespace; they must not reach a file path.
        .replace(/[\u0000-\u001f\u007f]+/g, "_")
        .slice(0, 120);
    return cleaned.toLowerCase().endsWith(".pdf") ? cleaned : `${cleaned || fallback}.pdf`;
}
|
|
74
|
+
/**
 * Returns the schema descriptor for rows produced by PDF text extraction.
 *
 * Each row carries the originating file id and name, a page number and the
 * page text; all four fields are required and no extra properties are allowed.
 * A fresh object is built on every call.
 *
 * @returns An object with `title`, `description` and the JSON-schema `schema`.
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    const schema = {
        type: "object",
        additionalProperties: false,
        // Every declared property is required, in declaration order.
        required: Object.keys(properties),
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema,
    };
}
|
|
91
|
+
function parseJsonlDataRows(content) {
|
|
92
|
+
return String(content ?? "")
|
|
93
|
+
.split(/\r?\n/g)
|
|
94
|
+
.map((line) => line.trim())
|
|
95
|
+
.filter(Boolean)
|
|
96
|
+
.map((line) => JSON.parse(line))
|
|
97
|
+
.map((record) => record?.data)
|
|
98
|
+
.filter((row) => row && typeof row === "object" && !Array.isArray(row));
|
|
99
|
+
}
|
|
100
|
+
/**
 * Builds the Python source of the PDF text extraction script run in the sandbox.
 *
 * The script takes four arguments (source path, output path, file id, file name),
 * reads the PDF with pypdf and writes one JSON line per page that has non-empty
 * text. When no page yields text it writes a single placeholder row with
 * `pageNumber` 0 and empty `text`.
 */
function buildPdfTextExtractionScript() {
    return [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "source_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(source_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
}

/**
 * Materializes a raw PDF file source as page-text rows, without a reactor.
 *
 * Reads the file, and when its content disposition does not look like a PDF
 * returns `null` so the caller can use another path. Otherwise it copies the
 * PDF into the dataset sandbox, installs pypdf, runs the extraction script,
 * reads the produced JSONL output and stores the rows in the target dataset.
 *
 * @param state - Dataset builder state (`runtime`, `title`, `instructions`, `first` are read here).
 * @param source - File source; `fileId` and `description` are read here.
 * @param targetDatasetId - Dataset that receives the extracted rows.
 * @returns `targetDatasetId` on success, or `null` when the file is not a PDF.
 * @throws {Error} `dataset_pdf_workstation_prepare_failed:*` when the workstation directory cannot be created.
 * @throws {Error} `dataset_pdf_dependency_install_failed:*` when pypdf cannot be installed.
 * @throws {Error} `dataset_pdf_text_extraction_failed:*` when the script exits non-zero.
 * @throws {Error} `dataset_pdf_text_extraction_empty` when the output contains no rows.
 */
async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
    if (!isPdfContentDisposition(file.contentDisposition))
        return null;
    const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
    const workstation = getDatasetWorkstation(targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
    const sourcePath = `${workstation}/${fileName}`;
    const scriptPath = `${workstation}/extract_pdf_text.py`;
    const prepare = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", workstation],
    });
    // Checked like the other sandbox commands so a missing workstation is
    // reported here instead of as an obscure write failure later on.
    if (prepare.exitCode !== 0) {
        throw new Error(`dataset_pdf_workstation_prepare_failed:${prepare.stderr || prepare.stdout}`);
    }
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
    });
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: scriptPath, content: buildPdfTextExtractionScript() }],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    // The script always writes at least one row, so an empty result means the
    // output file was not what the script produced.
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
        sourceKinds: ["file"],
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
|
|
44
203
|
async function materializeRawTextSource(state, source, targetDatasetId) {
|
|
45
204
|
const rows = materializeRawTextRows(source);
|
|
46
205
|
await materializeRowsToDataset(state.runtime, {
|
|
@@ -82,7 +241,26 @@ async function resolveDatasetSandboxId(state, targetDatasetId) {
|
|
|
82
241
|
});
|
|
83
242
|
return created.sandboxId;
|
|
84
243
|
}
|
|
244
|
+
/**
 * Decides whether a dataset agent should actually run in durable mode.
 *
 * Durable mode is used only when the caller asked for it and no workflow run
 * is currently active. When the `workflow` module cannot be loaded or queried
 * (for example outside the Workflow runtime), the caller's request is honored.
 *
 * @param requestedDurable - Durability requested by the caller.
 * @returns `true` to run durable, `false` otherwise.
 */
export async function resolveDatasetAgentDurable(requestedDurable) {
    if (requestedDurable) {
        let activeRunId;
        try {
            const { getWorkflowMetadata } = await import("workflow");
            activeRunId = getWorkflowMetadata?.()?.workflowRunId;
        }
        catch {
            // Outside Workflow runtime there is no active metadata, so honor the caller.
            activeRunId = undefined;
        }
        // Already inside a workflow run: do not request durability again.
        return activeRunId ? false : true;
    }
    return false;
}
|
|
85
258
|
export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
|
|
259
|
+
if (source.kind === "file" && !state.outputSchema) {
|
|
260
|
+
const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
|
|
261
|
+
if (materializedPdf)
|
|
262
|
+
return materializedPdf;
|
|
263
|
+
}
|
|
86
264
|
if (!state.reactor) {
|
|
87
265
|
throw new Error("dataset_reactor_required");
|
|
88
266
|
}
|
|
@@ -115,7 +293,9 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
|
|
|
115
293
|
reactor: state.reactor,
|
|
116
294
|
sandboxId,
|
|
117
295
|
});
|
|
118
|
-
await parseContext.parse(state.runtime, {
|
|
296
|
+
await parseContext.parse(state.runtime, {
|
|
297
|
+
durable: await resolveDatasetAgentDurable(state.durable),
|
|
298
|
+
});
|
|
119
299
|
if (!state.outputSchema) {
|
|
120
300
|
await datasetInferAndUpdateSchemaStep({
|
|
121
301
|
runtime: state.runtime,
|
|
@@ -207,7 +387,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
|
|
|
207
387
|
reactor: stateWithSandbox.reactor,
|
|
208
388
|
sandboxId,
|
|
209
389
|
});
|
|
210
|
-
await transformContext.transform(stateWithSandbox.runtime, {
|
|
390
|
+
await transformContext.transform(stateWithSandbox.runtime, {
|
|
391
|
+
durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
|
|
392
|
+
});
|
|
211
393
|
if (!stateWithSandbox.outputSchema) {
|
|
212
394
|
await datasetInferAndUpdateSchemaStep({
|
|
213
395
|
runtime: stateWithSandbox.runtime,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ekairos/dataset",
|
|
3
|
-
"version": "1.22.77-beta.development.0",
|
|
3
|
+
"version": "1.22.79-beta.development.0",
|
|
4
4
|
"description": "Pulzar Dataset Tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -65,9 +65,9 @@
|
|
|
65
65
|
"test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
|
|
66
66
|
},
|
|
67
67
|
"dependencies": {
|
|
68
|
-
"@ekairos/domain": "^1.22.
|
|
69
|
-
"@ekairos/events": "^1.22.
|
|
70
|
-
"@ekairos/sandbox": "^1.22.
|
|
68
|
+
"@ekairos/domain": "^1.22.79-beta.development.0",
|
|
69
|
+
"@ekairos/events": "^1.22.79-beta.development.0",
|
|
70
|
+
"@ekairos/sandbox": "^1.22.79-beta.development.0",
|
|
71
71
|
"@instantdb/admin": "0.22.158",
|
|
72
72
|
"@instantdb/core": "0.22.142",
|
|
73
73
|
"ai": "^5.0.44",
|