@ekairos/dataset 1.22.77-beta.development.0 → 1.22.79-beta.development.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import type { AnyDatasetRuntime, DatasetBuilderState, InternalSource } from "./types.js";
2
+ export declare function resolveDatasetAgentDurable(requestedDurable?: boolean): Promise<boolean>;
2
3
  export declare function materializeSingleFileLikeSource<Runtime extends AnyDatasetRuntime>(state: DatasetBuilderState<Runtime>, source: Extract<InternalSource, {
3
4
  kind: "file" | "text";
4
5
  }>, targetDatasetId: string): Promise<string>;
@@ -1,12 +1,14 @@
1
1
  import { createFileParseContext } from "../file/file-dataset.agent.js";
2
+ import { readInstantFileStep } from "../file/steps.js";
2
3
  import { createTransformDatasetContext } from "../transform/transform-dataset.agent.js";
3
4
  import { datasetInferAndUpdateSchemaStep, datasetReadOneStep, } from "../dataset/steps.js";
5
+ import { getDatasetOutputPath, getDatasetWorkstation } from "../datasetFiles.js";
4
6
  import { registerDatasetAgentMaterializers } from "./agentMaterializers.js";
5
7
  import { buildFileDefaultInstructions, buildRawSourceInstructions, buildTransformInstructions, } from "./instructions.js";
6
8
  import { createOrUpdateDatasetMetadata, materializeRowsToDataset, uploadInlineTextSource, } from "./persistence.js";
7
9
  import { getDomainDescriptor } from "./sourceRows.js";
8
10
  import { materializeQuerySource } from "./materializeQuery.js";
9
- import { createDatasetSandboxStep } from "../sandbox/steps.js";
11
+ import { createDatasetSandboxStep, readDatasetSandboxTextFileStep, runDatasetSandboxCommandStep, writeDatasetSandboxFilesStep, writeDatasetSandboxTextFilesStep, } from "../sandbox/steps.js";
10
12
  function makeIntermediateDatasetId(targetDatasetId, sourceKind, index) {
11
13
  return `${targetDatasetId}__${sourceKind}_${index}`;
12
14
  }
@@ -41,6 +43,163 @@ function materializeRawTextRows(source) {
41
43
  }
42
44
  return [{ text }];
43
45
  }
46
/**
 * Extracts the file name from a Content-Disposition header value.
 *
 * Lookup order:
 *   1. Extended parameter `filename*=UTF-8'<language>'<percent-encoded>`; the
 *      language tag may be empty. The value is percent-decoded; if decoding
 *      fails the raw captured text is returned instead.
 *   2. Quoted parameter `filename="..."`.
 *   3. Bare parameter `filename=...` (up to the next `;`), with any stray
 *      surrounding double quotes removed.
 *
 * @param value Header value; null/undefined is treated as an empty string.
 * @returns The trimmed file name, or "" when no file name can be found.
 */
function parseContentDispositionFileName(value) {
    const header = String(value ?? "");
    // RFC 5987 ext-value is charset ' [language] ' value-chars, so the language
    // slot between the two apostrophes may be non-empty (e.g. UTF-8'en'name.pdf).
    const extendedMatch = /filename\*=UTF-8'[^']*'([^;]+)/i.exec(header);
    if (extendedMatch?.[1]) {
        try {
            return decodeURIComponent(extendedMatch[1]).trim();
        }
        catch {
            // Malformed percent-encoding: fall back to the raw captured text.
            return extendedMatch[1].trim();
        }
    }
    const quotedMatch = /filename="([^"]+)"/i.exec(header);
    if (quotedMatch?.[1]) {
        return quotedMatch[1].trim();
    }
    const plainMatch = /filename=([^;]+)/i.exec(header);
    if (plainMatch?.[1]) {
        // An empty (`filename=""`) or unterminated quoted value reaches this
        // branch; drop the quote characters so they are not taken as the name.
        return plainMatch[1].trim().replace(/^"+|"+$/g, "").trim();
    }
    return "";
}
65
/**
 * Reports whether a Content-Disposition header value describes a PDF file.
 *
 * The check is case-insensitive and succeeds when the value mentions
 * `application/pdf`, or when a `.pdf` extension terminates a token: it is
 * followed by the end of the value, a quote, a semicolon or whitespace.
 * Requiring the terminator keeps names such as `notes.pdf.zip` or
 * `my.pdfs.tar` from being classified as PDFs.
 *
 * @param value Header value; null/undefined is treated as an empty string.
 * @returns true when the value looks like it refers to a PDF.
 */
function isPdfContentDisposition(value) {
    const header = String(value ?? "").toLowerCase();
    if (header.includes("application/pdf")) {
        return true;
    }
    return /\.pdf(?=$|["';\s])/.test(header);
}
69
/**
 * Produces a file name that ends in `.pdf` from a possibly empty name.
 *
 * The trimmed `value` is used when non-empty, otherwise `fallback`. Runs of the
 * characters `\ / : " * ? < > |` and runs of whitespace are each replaced by a
 * single underscore, and the result is cut to 120 characters. If the cleaned
 * name does not already end in `.pdf` (case-insensitive), `.pdf` is appended.
 *
 * @param value Candidate file name; null/undefined is treated as empty.
 * @param fallback Name used when `value` is empty, and as the stem when the
 *   cleaned name is empty.
 * @returns The cleaned file name with a `.pdf` extension.
 */
function sanitizePdfFileName(value, fallback) {
    const trimmed = String(value ?? "").trim();
    const candidate = trimmed || fallback;
    const safe = candidate
        .replace(/[\\/:"*?<>|]+/g, "_")
        .replace(/\s+/g, "_")
        .slice(0, 120);
    if (safe.toLowerCase().endsWith(".pdf")) {
        return safe;
    }
    const stem = safe || fallback;
    return `${stem}.pdf`;
}
74
/**
 * Returns the schema descriptor for rows of extracted PDF page text.
 *
 * Each row is an object with exactly four required fields: `fileId`,
 * `fileName` and `text` (strings) and `pageNumber` (number). A new descriptor
 * object is built on every call.
 *
 * @returns An object with `title`, `description` and the JSON-schema-style `schema`.
 */
function pdfTextRowsSchema() {
    const stringField = () => ({ type: "string" });
    const properties = {
        fileId: stringField(),
        fileName: stringField(),
        pageNumber: { type: "number" },
        text: stringField(),
    };
    const schema = {
        type: "object",
        additionalProperties: false,
        // Every declared property is required; key order matches the declaration order above.
        required: Object.keys(properties),
        properties,
    };
    return {
        title: "PdfTextPage",
        description: "Extracted PDF page text",
        schema,
    };
}
91
/**
 * Parses JSON Lines text and collects the `data` payload of each record.
 *
 * Lines are split on LF or CRLF and trimmed; blank lines are skipped. Each
 * remaining line is parsed as JSON, and its `data` property is kept only when
 * it is a non-null, non-array object.
 *
 * @param content JSONL text; null/undefined is treated as empty.
 * @returns The collected `data` objects in input order.
 * @throws SyntaxError (from JSON.parse) when a non-blank line is not valid JSON.
 */
function parseJsonlDataRows(content) {
    const rows = [];
    for (const rawLine of String(content ?? "").split(/\r?\n/g)) {
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        const row = JSON.parse(line)?.data;
        if (row && typeof row === "object" && !Array.isArray(row)) {
            rows.push(row);
        }
    }
    return rows;
}
100
/**
 * Builds the Python source of the sandbox script that extracts PDF text.
 *
 * The script takes four arguments (source path, output path, file id, file
 * name) and writes one JSON line `{"type": "row", "data": {...}}` per page
 * with non-empty text. When no page yields text it writes a single placeholder
 * row with `pageNumber` 0 and empty `text`.
 *
 * Python block structure is indentation-sensitive, so the bodies of the
 * `with`, `for` and `if` statements are indented by four spaces per level.
 *
 * @returns The script text, lines joined with "\n".
 */
function buildPdfTextExtractionScript() {
    return [
        "from pathlib import Path",
        "import json",
        "import sys",
        "from pypdf import PdfReader",
        "",
        "source_path = Path(sys.argv[1])",
        "output_path = Path(sys.argv[2])",
        "file_id = sys.argv[3]",
        "file_name = sys.argv[4]",
        "reader = PdfReader(str(source_path))",
        "rows = 0",
        "with output_path.open('w', encoding='utf-8') as out:",
        "    for index, page in enumerate(reader.pages, start=1):",
        "        text = page.extract_text() or ''",
        "        text = text.replace('\\x00', '').strip()",
        "        if not text:",
        "            continue",
        "        data = {",
        "            'fileId': file_id,",
        "            'fileName': file_name,",
        "            'pageNumber': index,",
        "            'text': text,",
        "        }",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows += 1",
        "    if rows == 0:",
        "        data = {'fileId': file_id, 'fileName': file_name, 'pageNumber': 0, 'text': ''}",
        "        out.write(json.dumps({'type': 'row', 'data': data}, ensure_ascii=False) + '\\n')",
        "        rows = 1",
        "print(f'extracted_pdf_pages={len(reader.pages)} rows={rows} output={output_path}')",
        "",
    ].join("\n");
}
/**
 * Materializes a file source directly into page-text rows when the file is a PDF.
 *
 * Steps, in order: read the file, bail out with null unless its
 * Content-Disposition looks like a PDF, resolve a sandbox, copy the PDF into
 * the dataset workstation directory, install `pypdf`, write and run the
 * extraction script, read the JSONL output, and store the rows under
 * `targetDatasetId` with the fixed page-text schema.
 *
 * @param state Builder state; `runtime`, `title`, `instructions` and `first` are read here.
 * @param source File source; `fileId` and `description` are read here.
 * @param targetDatasetId Id of the dataset to materialize into.
 * @returns `targetDatasetId` on success, or null when the file is not recognised as a PDF.
 * @throws Error `dataset_pdf_dependency_install_failed:<output>` when the pip install exits non-zero.
 * @throws Error `dataset_pdf_text_extraction_failed:<output>` when the script exits non-zero.
 * @throws Error `dataset_pdf_text_extraction_empty` when the output holds no usable rows.
 */
async function tryMaterializeRawPdfFileSource(state, source, targetDatasetId) {
    const file = await readInstantFileStep({ runtime: state.runtime, fileId: source.fileId });
    if (!isPdfContentDisposition(file.contentDisposition))
        return null;
    const sandboxId = await resolveDatasetSandboxId(state, targetDatasetId);
    const workstation = getDatasetWorkstation(targetDatasetId);
    const outputPath = getDatasetOutputPath(targetDatasetId);
    const fileName = sanitizePdfFileName(parseContentDispositionFileName(file.contentDisposition), `${source.fileId}.pdf`);
    const sourcePath = `${workstation}/${fileName}`;
    const scriptPath = `${workstation}/extract_pdf_text.py`;
    // Ensure the workstation directory exists before any file is written into it.
    await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "mkdir",
        args: ["-p", workstation],
    });
    await writeDatasetSandboxFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [{ path: sourcePath, contentBase64: file.contentBase64 }],
    });
    // NOTE(review): pypdf is installed on every call; presumably cheap when already present — confirm.
    const install = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: ["-m", "pip", "install", "pypdf", "--quiet"],
    });
    if (install.exitCode !== 0) {
        throw new Error(`dataset_pdf_dependency_install_failed:${install.stderr || install.stdout}`);
    }
    await writeDatasetSandboxTextFilesStep({
        runtime: state.runtime,
        sandboxId,
        files: [
            {
                path: scriptPath,
                content: buildPdfTextExtractionScript(),
            },
        ],
    });
    const extraction = await runDatasetSandboxCommandStep({
        runtime: state.runtime,
        sandboxId,
        cmd: "python",
        args: [scriptPath, sourcePath, outputPath, source.fileId, fileName],
    });
    if (extraction.exitCode !== 0) {
        throw new Error(`dataset_pdf_text_extraction_failed:${extraction.stderr || extraction.stdout}`);
    }
    const output = await readDatasetSandboxTextFileStep({
        runtime: state.runtime,
        sandboxId,
        path: outputPath,
    });
    const rows = parseJsonlDataRows(output.content);
    // Defensive: the script always writes at least one row, so an empty result
    // means the output file did not contain what the script produced.
    if (rows.length === 0) {
        throw new Error("dataset_pdf_text_extraction_empty");
    }
    await materializeRowsToDataset(state.runtime, {
        datasetId: targetDatasetId,
        sandboxId,
        title: state.title ?? fileName,
        instructions: state.instructions,
        sources: [{ kind: "file", fileId: source.fileId, description: source.description }],
        sourceKinds: ["file"],
        rows,
        schema: pdfTextRowsSchema(),
        first: state.first,
    });
    return targetDatasetId;
}
44
203
  async function materializeRawTextSource(state, source, targetDatasetId) {
45
204
  const rows = materializeRawTextRows(source);
46
205
  await materializeRowsToDataset(state.runtime, {
@@ -82,7 +241,26 @@ async function resolveDatasetSandboxId(state, targetDatasetId) {
82
241
  });
83
242
  return created.sandboxId;
84
243
  }
244
/**
 * Decides the effective `durable` flag passed to dataset agent runs.
 *
 * A falsy request always resolves to false. A truthy request resolves to false
 * when the dynamically imported `workflow` module reports a `workflowRunId` in
 * its metadata, and to true otherwise, including when the import or the
 * metadata lookup throws.
 *
 * @param requestedDurable Durability requested by the caller.
 * @returns The durability value to use.
 */
export async function resolveDatasetAgentDurable(requestedDurable) {
    if (requestedDurable) {
        let hasActiveWorkflowRun = false;
        try {
            const { getWorkflowMetadata } = await import("workflow");
            hasActiveWorkflowRun = Boolean(getWorkflowMetadata?.()?.workflowRunId);
        }
        catch {
            // Outside Workflow runtime there is no active metadata, so honor the caller.
        }
        return !hasActiveWorkflowRun;
    }
    return false;
}
85
258
  export async function materializeSingleFileLikeSource(state, source, targetDatasetId) {
259
+ if (source.kind === "file" && !state.outputSchema) {
260
+ const materializedPdf = await tryMaterializeRawPdfFileSource(state, source, targetDatasetId);
261
+ if (materializedPdf)
262
+ return materializedPdf;
263
+ }
86
264
  if (!state.reactor) {
87
265
  throw new Error("dataset_reactor_required");
88
266
  }
@@ -115,7 +293,9 @@ export async function materializeSingleFileLikeSource(state, source, targetDatas
115
293
  reactor: state.reactor,
116
294
  sandboxId,
117
295
  });
118
- await parseContext.parse(state.runtime, { durable: state.durable });
296
+ await parseContext.parse(state.runtime, {
297
+ durable: await resolveDatasetAgentDurable(state.durable),
298
+ });
119
299
  if (!state.outputSchema) {
120
300
  await datasetInferAndUpdateSchemaStep({
121
301
  runtime: state.runtime,
@@ -207,7 +387,9 @@ export async function materializeDerivedDataset(state, targetDatasetId) {
207
387
  reactor: stateWithSandbox.reactor,
208
388
  sandboxId,
209
389
  });
210
- await transformContext.transform(stateWithSandbox.runtime, { durable: stateWithSandbox.durable });
390
+ await transformContext.transform(stateWithSandbox.runtime, {
391
+ durable: await resolveDatasetAgentDurable(stateWithSandbox.durable),
392
+ });
211
393
  if (!stateWithSandbox.outputSchema) {
212
394
  await datasetInferAndUpdateSchemaStep({
213
395
  runtime: stateWithSandbox.runtime,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ekairos/dataset",
3
- "version": "1.22.77-beta.development.0",
3
+ "version": "1.22.79-beta.development.0",
4
4
  "description": "Pulzar Dataset Tools",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -65,9 +65,9 @@
65
65
  "test:ai-sdk:instant": "vitest run -c vitest.codex.config.mts src/tests/materializeDataset.ai-sdk.instant.test.ts"
66
66
  },
67
67
  "dependencies": {
68
- "@ekairos/domain": "^1.22.77-beta.development.0",
69
- "@ekairos/events": "^1.22.77-beta.development.0",
70
- "@ekairos/sandbox": "^1.22.77-beta.development.0",
68
+ "@ekairos/domain": "^1.22.79-beta.development.0",
69
+ "@ekairos/events": "^1.22.79-beta.development.0",
70
+ "@ekairos/sandbox": "^1.22.79-beta.development.0",
71
71
  "@instantdb/admin": "0.22.158",
72
72
  "@instantdb/core": "0.22.142",
73
73
  "ai": "^5.0.44",