@ekairos/dataset 1.22.78-beta.development.0 → 1.22.79-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,14 @@
1
1
  import { createFileParseContext } from "../file/file-dataset.agent.js";
2
+ import { readInstantFileStep } from "../file/steps.js";
2
3
  import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
3
4
  import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
5
+ import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
4
6
  import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
5
7
  import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
6
8
  import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
7
9
  import { getDomainDescriptor } from "./sourceRows.js";
8
10
  import { materializeQuerySource } from "./materializeQuery.js";
9
- import { createDatasetSandboxStep } from "../sandbox/steps.js";
11
+ import { createDatasetSandboxStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
10
12
  function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
11
13
  return `${targetDatasetId}__${sourceKind}_${index}`;
12
14
  }
@@ -41,6 +43,163 @@ function materializeRawTextRows(source) {
41
43
  }
42
44
  return [{ text }];
43
45
  }
46
/**
 * Extracts the file name from a Content-Disposition header value.
 *
 * Lookup order:
 *   1. the extended `filename*=UTF-8'<lang>'<percent-encoded>` parameter,
 *   2. a quoted `filename="..."` parameter,
 *   3. an unquoted `filename=...` parameter (up to the next `;`).
 *
 * @param {unknown} value - Raw header value; null/undefined are treated as "".
 * @returns {string} The trimmed file name, or "" when none is present.
 */
function parseContentDispositionFileName(value) {
    const text = String(value ?? "");
    // The extended form is charset'language'value; the language tag is optional,
    // so accept anything (including nothing) between the two apostrophes.
    const utf8Match = /filename\*=UTF-8'[^']*'([^;]+)/i.exec(text);
    if (utf8Match?.[1]) {
        try {
            return decodeURIComponent(utf8Match[1]).trim();
        }
        catch {
            // Malformed percent-encoding: fall back to the raw token rather than failing.
            return utf8Match[1].trim();
        }
    }
    const quotedMatch = /filename="([^"]+)"/i.exec(text);
    if (quotedMatch?.[1])
        return quotedMatch[1].trim();
    const plainMatch = /filename=([^;]+)/i.exec(text);
    if (plainMatch?.[1]) {
        // Reached for unquoted values, but also for `filename=""` or an unterminated
        // quote; strip stray quote characters so they never end up in the name.
        return plainMatch[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
65
/**
 * Heuristically decides whether a Content-Disposition value describes a PDF.
 *
 * Returns true when the value mentions `application/pdf`, or when it contains
 * a `.pdf` extension that actually terminates a name (followed by end of
 * string, a quote, a semicolon or whitespace). A bare substring test would
 * also accept names such as `archive.pdf.zip` or `notes.pdfx`.
 *
 * @param {unknown} value - Raw header value; null/undefined are treated as "".
 * @returns {boolean} True when the value looks like it refers to a PDF.
 */
function isPdfContentDisposition(value) {
    const text = String(value ?? "").toLowerCase();
    if (text.includes("application/pdf"))
        return true;
    return /\.pdf(?=$|["';\s])/.test(text);
}
69
/**
 * Turns a candidate file name into a name that ends in `.pdf`.
 *
 * Runs of `\ / : " * ? < > |` and runs of whitespace each collapse to a single
 * underscore, and the result is cut to 120 characters before the extension
 * check, so an appended `.pdf` may push the length past 120.
 *
 * @param {unknown} value - Candidate name; blank/null/undefined selects `fallback`.
 * @param {string} fallback - Name used when `value` is blank.
 * @returns {string} Sanitized name ending in `.pdf` (case-insensitive check).
 */
function sanitizePdfFileName(value, fallback) {
    let candidate = String(value ?? "").trim();
    if (!candidate) {
        candidate = fallback;
    }
    const withoutReserved = candidate.replace(/[\\/:"*?<>|]+/g, "_");
    const withoutWhitespace = withoutReserved.replace(/\s+/g, "_");
    const truncated = withoutWhitespace.slice(0, 120);
    if (truncated.toLowerCase().endsWith(".pdf")) {
        return truncated;
    }
    return `${truncated || fallback}.pdf`;
}
74
/**
 * Returns the schema descriptor for rows produced by PDF text extraction.
 *
 * Each row is an object with exactly four required fields: `fileId`,
 * `fileName`, `pageNumber` and `text`. A fresh object is built on every call.
 *
 * @returns {{ title: string, description: string, schema: object }}
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema: {
            type: "object",
            additionalProperties: false,
            // Every declared property is required; key order matches the declaration order.
            required: Object.keys(properties),
            properties,
        },
    };
}
91
/**
 * Parses JSON Lines text and collects the `data` payload of each record.
 *
 * Blank lines are skipped. A record contributes a row only when its `data`
 * property is a non-null, non-array object. A line that is not valid JSON
 * makes `JSON.parse` throw, and that error propagates to the caller.
 *
 * @param {unknown} content - JSONL text; null/undefined are treated as "".
 * @returns {object[]} The `data` objects, in input order.
 */
function parseJsonlDataRows(content) {
    const rows = [];
    for (const rawLine of String(content ?? "").split(/\r?\n/g)) {
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        const row = JSON.parse(line)?.data;
        if (row && typeof row === "object" && !Array.isArray(row)) {
            rows.push(row);
        }
    }
    return rows;
}
100
/**
 * Attempts to materialize a raw file source as a PDF-text dataset.
 *
 * Steps, in order:
 *   1. read the file; return null if its Content-Disposition does not look like a PDF;
 *   2. create the workstation directory in the dataset sandbox and upload the PDF;
 *   3. `pip install pypdf` inside the sandbox;
 *   4. write and run a Python script that emits one JSONL row per page with text
 *      (or a single empty placeholder row when no page yields text);
 *   5. read the JSONL output back and materialize the rows into `targetDatasetId`.
 *
 * @param {object} state - Materialization state; `runtime`, `title`, `instructions` and `first` are read here.
 * @param {object} source - File source; `fileId` and `description` are read here.
 * @param {string} targetDatasetId - Dataset to materialize into.
 * @returns {Promise<string|null>} `targetDatasetId` on success, or null when the file is not a PDF.
 * @throws {Error} `dataset_pdf_dependency_install_failed:*` when pip exits non-zero,
 *   `dataset_pdf_text_extraction_failed:*` when the script exits non-zero,
 *   `dataset_pdf_text_extraction_empty` when no rows could be parsed from the output.
 */
async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
    if (!isPdfContentDisposition(file.contentDisposition))
        return null;
    const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
    const workstation = getDatasetWorkstation(targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
    const sourcePath = `${workstation}/${fileName}`;
    const scriptPath = `${workstation}/extract_pdf_text.py`;
    // Python is indentation-sensitive: each nesting level below must be indented
    // deeper than its header line, otherwise the script fails with IndentationError
    // before extracting anything.
    const extractionScript = [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "source_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(source_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
    await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", workstation],
    });
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
    });
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: scriptPath, content: extractionScript }],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        // Fall back to the sanitized file name when the caller supplied no title.
        title: state.title ?? fileName,
        instructions: state.instructions,
        sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
        sourceKinds: ["file"],
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
44
203
  async function materializeRawTextSource(state, source, targetDatasetId) {
45
204
  const rows = materializeRawTextRows(source);
46
205
  await materializeRowsToDataset(state.runtime, {
@@ -97,6 +256,11 @@ export async function resolveDatasetAgentDurable(requestedDurable) {
97
256
  return true;
98
257
  }
99
258
  export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
259
+ if (source.kind === "file" && !state.outputSchema) {
260
+ const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
261
+ if (materializedPdf)
262
+ return materializedPdf;
263
+ }
100
264
  if (!state.reactor) {
101
265
  throw new Error("dataset_reactor_required");
102
266
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.78-beta.development.0",
3
+ "version": "1.22.79-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.78-beta.development.0",
69
- "@ekairos/events": "^1.22.78-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.78-beta.development.0",
68
+ "@ekairos/domain": "^1.22.79-beta.development.0",
69
+ "@ekairos/events": "^1.22.79-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.79-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",