@agentionai/agents 0.12.0-beta → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/Agent.d.ts +9 -3
- package/dist/agents/Agent.js +4 -0
- package/dist/agents/AgentConfig.d.ts +12 -2
- package/dist/agents/model-types.d.ts +7 -1
- package/dist/agents/ollama/OllamaAgent.d.ts +69 -0
- package/dist/agents/ollama/OllamaAgent.js +304 -0
- package/dist/chunkers/index.d.ts +0 -1
- package/dist/chunkers/index.js +1 -3
- package/dist/history/transformers.d.ts +36 -0
- package/dist/history/transformers.js +78 -1
- package/dist/history/types.d.ts +8 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +4 -1
- package/dist/ingestion/IngestionPipeline.d.ts +1 -73
- package/dist/ingestion/IngestionPipeline.js +1 -110
- package/dist/ollama.d.ts +4 -0
- package/dist/ollama.js +24 -0
- package/dist/viz/types.d.ts +1 -1
- package/package.json +6 -42
- package/dist/chunkers/ElementChunker.d.ts +0 -100
- package/dist/chunkers/ElementChunker.js +0 -242
- package/dist/parsers/DocumentParser.d.ts +0 -36
- package/dist/parsers/DocumentParser.js +0 -35
- package/dist/parsers/LlamaIndexParser.d.ts +0 -58
- package/dist/parsers/LlamaIndexParser.js +0 -71
- package/dist/parsers/OllamaOCRParser.d.ts +0 -98
- package/dist/parsers/OllamaOCRParser.js +0 -203
- package/dist/parsers/UnstructuredAPIParser.d.ts +0 -57
- package/dist/parsers/UnstructuredAPIParser.js +0 -131
- package/dist/parsers/UnstructuredLocalParser.d.ts +0 -42
- package/dist/parsers/UnstructuredLocalParser.js +0 -118
- package/dist/parsers/index.d.ts +0 -3
- package/dist/parsers/index.js +0 -6
- package/dist/parsers/types.d.ts +0 -50
- package/dist/parsers/types.js +0 -3
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.OllamaOCRParser = void 0;
|
|
37
|
-
const fs = __importStar(require("fs"));
|
|
38
|
-
const path = __importStar(require("path"));
|
|
39
|
-
const DocumentParser_1 = require("./DocumentParser");
|
|
40
|
-
/**
 * Lower-case file extensions (including the leading dot) that are treated as
 * images and passed to OCR directly, without any conversion step.
 */
const IMAGE_EXTS = new Set([
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".bmp",
]);
|
|
42
|
-
/**
 * Document parser that performs OCR by sending images to a locally running
 * **Ollama** vision model (default model: `glm-ocr`) through `POST {baseUrl}/api/chat`.
 *
 * **Supported file types:**
 * - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
 * - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`),
 *   which is resolved from the consuming project's working directory
 *
 * **Ollama must be running** with the model pulled:
 * ```bash
 * ollama pull glm-ocr
 * ollama serve   # if not already running
 * ```
 *
 * @example
 * ```typescript
 * import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
 *
 * const parser = new OllamaOCRParser({
 *   model: "glm-ocr",
 *   pdfScale: 1.5,
 *   onProgress: (page, total) => console.log(`OCR page ${page}/${total}...`),
 * });
 *
 * // Parse an image
 * const doc = await parser.parse("/path/to/scan.png");
 *
 * // Parse a PDF (requires: npm install pdf-to-img)
 * const pdf = await parser.parse("/path/to/report.pdf");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/scan.png", parser);
 * ```
 */
class OllamaOCRParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional settings:
     *   - `model`: Ollama model name (default `"glm-ocr"`)
     *   - `baseUrl`: Ollama server URL (default `"http://localhost:11434"`); trailing slashes are removed
     *   - `prompt`: instruction sent with every image
     *   - `pdfScale`: render scale passed to `pdf-to-img` (default `2.0`)
     *   - `concurrency`: maximum number of PDF pages OCR'd at the same time (default `3`)
     *   - `onProgress`: called as `(completedPages, totalPages)` after each PDF page finishes
     */
    constructor(config = {}) {
        super();
        this.name = "ollama-ocr";
        this.model = config.model ?? "glm-ocr";
        // Strip every trailing slash so that `${baseUrl}/api/chat` never contains "//".
        this.baseUrl = (config.baseUrl ?? "http://localhost:11434").replace(/\/+$/, "");
        this.prompt =
            config.prompt ??
                "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text.";
        this.pdfScale = config.pdfScale ?? 2.0;
        this.concurrency = config.concurrency ?? 3;
        this.onProgress = config.onProgress;
    }
    /**
     * Parse a document file using Ollama OCR.
     *
     * @param filePath - Path to the image or PDF file
     * @param options - Optional hints (unused by this parser; provided for interface compatibility)
     * @returns The parsed document with one `NarrativeText` element per image / PDF page
     * @throws Error when the file extension is neither a supported image type nor `.pdf`
     */
    async parse(filePath, _options) {
        const ext = path.extname(filePath).toLowerCase();
        if (IMAGE_EXTS.has(ext)) {
            return this.parseImageFile(filePath);
        }
        if (ext === ".pdf") {
            return this.parsePdf(filePath);
        }
        throw new Error(`OllamaOCRParser: unsupported file type "${ext}". ` +
            `Supported: ${[...IMAGE_EXTS].join(", ")}, .pdf`);
    }
    // ---------------------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------------------
    /**
     * OCR a single image file and wrap the result as a one-page document.
     */
    async parseImageFile(filePath) {
        // Asynchronous read: this method is async, so the event loop should not be blocked on disk I/O.
        const base64 = (await fs.promises.readFile(filePath)).toString("base64");
        const text = await this.runOCR(base64);
        const element = {
            type: "NarrativeText",
            text,
            metadata: { source: filePath },
        };
        return { text, elements: [element], metadata: { filePath, pages: 1 } };
    }
    /**
     * Load the `pdf` render function from the optional peer dependency `pdf-to-img`.
     *
     * @throws Error with an install hint when the package cannot be found, or when
     *         it does not expose a `pdf` function; any other load error is rethrown as-is
     */
    async loadPdfRenderer() {
        let pdfFn;
        try {
            // Resolve the module from process.cwd() so that peer deps installed in
            // the consuming project (not this library's own node_modules) are found.
            const { createRequire } = require("module");
            const { pathToFileURL } = require("url");
            const requireFromCwd = createRequire(path.resolve(process.cwd(), "__placeholder__.js"));
            const resolved = requireFromCwd.resolve("pdf-to-img");
            // A native import() is used here because the specifier is a file: URL,
            // which is an ES-module specifier rather than a CommonJS require() id.
            ({ pdf: pdfFn } = await import(pathToFileURL(resolved).href));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            const code = err != null && typeof err === "object" ? err.code : undefined;
            if (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module") ||
                message.includes("MODULE_NOT_FOUND") ||
                message.includes("ERR_MODULE_NOT_FOUND")) {
                throw new Error("OllamaOCRParser: PDF parsing requires 'pdf-to-img'. " +
                    "Install it with: npm install pdf-to-img");
            }
            throw err;
        }
        if (typeof pdfFn !== "function") {
            throw new Error("OllamaOCRParser: the installed 'pdf-to-img' package does not export a 'pdf' function.");
        }
        return pdfFn;
    }
    /**
     * Render every page of a PDF to an image, OCR the pages with bounded
     * concurrency, and return them in page order.
     */
    async parsePdf(filePath) {
        const renderPdf = await this.loadPdfRenderer();
        const doc = await renderPdf(filePath, { scale: this.pdfScale });
        // Step 1: render all pages to buffers before any network request is made.
        const pageBuffers = [];
        for await (const buf of doc) {
            pageBuffers.push(buf);
        }
        // Size everything from the pages actually rendered.
        const total = pageBuffers.length;
        // Step 2: OCR pages in parallel using a worker pool with a concurrency limit.
        const elements = new Array(total);
        let completed = 0;
        let nextPage = 0;
        let aborted = false;
        const worker = async () => {
            // Each worker claims the next unprocessed page index until none are left
            // or another worker has failed.
            while (!aborted && nextPage < total) {
                const i = nextPage++;
                try {
                    const text = await this.runOCR(pageBuffers[i].toString("base64"));
                    elements[i] = {
                        type: "NarrativeText",
                        text,
                        metadata: { page_number: i + 1, source: filePath },
                    };
                    this.onProgress?.(++completed, total);
                }
                catch (err) {
                    // Stop the other workers from starting further requests; the
                    // overall parse is rejected with this error by Promise.all below.
                    aborted = true;
                    throw err;
                }
            }
        };
        // A non-positive or non-numeric concurrency would otherwise create no
        // workers at all and silently skip every page, so fall back to 1.
        const requested = Math.floor(Number(this.concurrency));
        const poolSize = Math.min(Math.max(1, requested || 1), total);
        await Promise.all(Array.from({ length: poolSize }, () => worker()));
        return {
            text: this.elementsToText(elements),
            elements,
            metadata: { filePath, pages: total },
        };
    }
    /**
     * Send a base64-encoded image to the Ollama chat API and return the extracted text.
     *
     * @param base64Image - Image bytes encoded as base64
     * @returns The trimmed `message.content` of the response, or `""` when it is absent
     * @throws Error when the HTTP response status is not OK (the response body is appended when available)
     */
    async runOCR(base64Image) {
        const url = `${this.baseUrl}/api/chat`;
        const response = await fetch(url, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                model: this.model,
                messages: [
                    {
                        role: "user",
                        content: this.prompt,
                        images: [base64Image],
                    },
                ],
                stream: false,
            }),
        });
        if (!response.ok) {
            // Reading the error body is best-effort; a failure here must not mask the HTTP error.
            const body = await response.text().catch(() => "");
            throw new Error(`OllamaOCRParser: Ollama API error ${response.status} ${response.statusText}` +
                (body ? `\n${body}` : ""));
        }
        const data = (await response.json());
        const content = data?.message?.content;
        return typeof content === "string" ? content.trim() : "";
    }
}
|
|
202
|
-
exports.OllamaOCRParser = OllamaOCRParser;
|
|
203
|
-
//# sourceMappingURL=OllamaOCRParser.js.map
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import { DocumentParser } from "./DocumentParser";
|
|
2
|
-
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
-
/**
 * Options accepted by the {@link UnstructuredAPIParser} constructor.
 * Both fields are optional.
 */
export interface UnstructuredAPIParserConfig {
    /**
     * Key used to authenticate against the hosted Unstructured service.
     * May be omitted when `serverUrl` targets a self-hosted instance that
     * does not require authentication.
     */
    apiKey?: string;
    /**
     * Root URL of the Unstructured API to call.
     * When an `apiKey` is given and this is left unset, the official hosted
     * endpoint is used. Point it at your own host when running the
     * open-source API server locally:
     * ```
     * docker run -p 8000:8000 downloads.unstructured.io/unstructured-io/unstructured-api:latest
     * serverUrl: "http://localhost:8000"
     * ```
     */
    serverUrl?: string;
}
|
|
24
|
-
/**
 * Parser that delegates document partitioning to the **Unstructured REST API**,
 * which may be the official hosted service or a self-hosted open-source server.
 *
 * The official `unstructured-client` npm package does the actual API calls.
 *
 * **Peer dependency:** `unstructured-client`
 *
 * @example
 * ```typescript
 * // Hosted service
 * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
 *
 * // Self-hosted (no auth required)
 * const parser = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
 *
 * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
export declare class UnstructuredAPIParser extends DocumentParser {
    /** Configuration captured at construction time. */
    private readonly config;
    /** Identifier of this parser implementation. */
    readonly name = "unstructured-api";
    /**
     * @param config - API key and/or server URL; may be omitted entirely
     */
    constructor(config?: UnstructuredAPIParserConfig);
    /**
     * Read a file from disk and partition it through the Unstructured API.
     *
     * @param filePath - Location of the document on disk
     * @param options - Strategy, languages, and any further partition parameters
     * @returns The parsed document
     */
    parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
    /** Converts raw API elements into parsed elements. */
    private mapRawElements;
}
|
|
57
|
-
//# sourceMappingURL=UnstructuredAPIParser.d.ts.map
|
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
// Interop helpers in the shape emitted by the TypeScript compiler for `import * as x` in CommonJS output.
// Each helper reuses an implementation already present on `this` when one exists.

// Exposes `source[key]` on `target` under `alias` (defaults to `key`).
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Fall back to a live getter unless the source already exposes a suitable accessor/read-only property.
        var needsGetter = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, alias, descriptor);
    }
    : function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        target[alias] = source[key];
    });

// Stores `value` as the enumerable `default` property of `target`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        Object.defineProperty(target, "default", { enumerable: true, value: value });
    }
    : function (target, value) {
        target["default"] = value;
    });

// Wraps a required module in a namespace object; modules flagged `__esModule` are returned untouched.
var __importStar = (this && this.__importStar) || (function () {
    // Replaces itself on first use with Object.getOwnPropertyNames, or a for-in fallback when that is missing.
    var listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (o) {
            var names = [];
            for (var name in o) {
                if (Object.prototype.hasOwnProperty.call(o, name)) names[names.length] = name;
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                // "default" is set separately below so that it always refers to the module itself.
                if (keys[i] !== "default") __createBinding(namespace, mod, keys[i]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.UnstructuredAPIParser = void 0;
|
|
37
|
-
const DocumentParser_1 = require("./DocumentParser");
|
|
38
|
-
/**
 * Lookup from each accepted `strategy` option to the strategy string sent to
 * the API. Every key currently maps to itself.
 */
const STRATEGY_MAP = Object.fromEntries(["auto", "fast", "hi_res", "ocr_only"].map((name) => [name, name]));
|
|
44
|
-
/**
 * Document parser backed by the **Unstructured REST API** — either the
 * official hosted service or a self-hosted open-source API server.
 *
 * Uses the official `unstructured-client` npm package under the hood; the
 * package is loaded lazily on the first `parse()` call.
 *
 * **Peer dependency:** `unstructured-client`
 *
 * @example
 * ```typescript
 * // Hosted service
 * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
 *
 * // Self-hosted (no auth required)
 * const parser = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
 *
 * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
class UnstructuredAPIParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional `apiKey` and/or `serverUrl`; defaults to an empty configuration
     */
    constructor(config = {}) {
        super();
        this.config = config;
        this.name = "unstructured-api";
    }
    /**
     * Parse a file via the Unstructured API.
     *
     * @param filePath - Path to the document to parse (read from disk)
     * @param options - Strategy, languages, and any other partition parameters
     * @returns The mapped elements together with their combined text
     * @throws Error with an install hint when `unstructured-client` cannot be found;
     *         any other failure while loading the package is rethrown unchanged
     */
    async parse(filePath, options) {
        // The package name is kept in a variable so that it is only resolved at run time.
        const pkg = "unstructured-client";
        let UnstructuredClient;
        try {
            ({ UnstructuredClient } = __importStar(require(pkg)));
        }
        catch (err) {
            // Only a missing package is reported as "not installed"; other load
            // errors (e.g. a failure inside the package itself) must stay visible.
            const message = err instanceof Error ? err.message : String(err);
            const code = err != null && typeof err === "object" ? err.code : undefined;
            if (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module") ||
                message.includes("MODULE_NOT_FOUND") ||
                message.includes("ERR_MODULE_NOT_FOUND")) {
                throw new Error("UnstructuredAPIParser requires 'unstructured-client'. " +
                    "Install it with: npm install unstructured-client");
            }
            throw err;
        }
        const fs = require("fs");
        const path = require("path");
        // Only forward the settings that were actually provided.
        const clientConfig = {};
        if (this.config.apiKey) {
            clientConfig["security"] = { apiKeyAuth: this.config.apiKey };
        }
        if (this.config.serverUrl) {
            clientConfig["serverURL"] = this.config.serverUrl;
        }
        const client = new UnstructuredClient(clientConfig);
        // Asynchronous read: this method is async, so the event loop should not be blocked on disk I/O.
        const fileContent = await fs.promises.readFile(filePath);
        const fileName = path.basename(filePath);
        const { strategy, languages, ...rest } = options ?? {};
        const res = await client.general.partition({
            partitionParameters: {
                files: { content: fileContent, fileName },
                // Unknown strategy values fall back to "auto".
                strategy: STRATEGY_MAP[strategy ?? "auto"] ?? "auto",
                ...(languages ? { languages } : {}),
                ...rest,
            },
        });
        const rawElements = res.elements ?? [];
        const elements = this.mapRawElements(rawElements);
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
    /**
     * Normalise raw API elements: a non-string `type` becomes `"unknown"`, a
     * non-string `text` becomes `""`, and `metadata` is kept only when it is a
     * non-array object.
     */
    mapRawElements(raw) {
        return raw.map((el) => {
            // A null/undefined entry is treated as an element with no usable fields.
            const e = el ?? {};
            return {
                type: typeof e["type"] === "string" ? e["type"] : "unknown",
                text: typeof e["text"] === "string" ? e["text"] : "",
                metadata: e["metadata"] != null &&
                    typeof e["metadata"] === "object" &&
                    !Array.isArray(e["metadata"])
                    ? e["metadata"]
                    : undefined,
            };
        });
    }
}
|
|
130
|
-
exports.UnstructuredAPIParser = UnstructuredAPIParser;
|
|
131
|
-
//# sourceMappingURL=UnstructuredAPIParser.js.map
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import { DocumentParser } from "./DocumentParser";
|
|
2
|
-
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
-
/**
 * Parser that runs the **local** open-source Python build of Unstructured
 * through the `@epilogo/unstructured-io-node` npm bridge.
 *
 * The bridge starts a Python virtual environment and invokes the Python
 * `unstructured` library directly. No API key is needed, but Python 3.8+
 * and system dependencies (poppler, tesseract, etc.) have to be installed.
 *
 * **Peer dependency:** `@epilogo/unstructured-io-node`
 *
 * @example
 * ```typescript
 * import { UnstructuredLocalParser } from "@agentionai/agents/parsers";
 *
 * const parser = new UnstructuredLocalParser();
 * const doc = await parser.parse("/path/to/report.pdf", {
 *   strategy: "hi_res",
 *   languages: ["eng"],
 * });
 * console.log(doc.elements?.length, "elements");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
export declare class UnstructuredLocalParser extends DocumentParser {
    /** Identifier of this parser implementation. */
    readonly name = "unstructured-local";
    /**
     * Partition a file with the local Python Unstructured library.
     *
     * `ensureEnvironmentSetup()` is called before partitioning; when the Python
     * venv does not exist yet it is downloaded, which is a slow one-time step.
     *
     * @param filePath - Location of the document on disk
     * @param options - Strategy, languages, and any further unstructured kwargs
     * @returns The parsed document
     */
    parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
    /** Converts raw bridge elements into parsed elements. */
    private mapRawElements;
}
|
|
42
|
-
//# sourceMappingURL=UnstructuredLocalParser.d.ts.map
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
// Interop helpers in the shape emitted by the TypeScript compiler for `import * as x` in CommonJS output.
// Each helper reuses an implementation already present on `this` when one exists.

// Exposes `source[key]` on `target` under `alias` (defaults to `key`).
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Fall back to a live getter unless the source already exposes a suitable accessor/read-only property.
        var needsGetter = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, alias, descriptor);
    }
    : function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        target[alias] = source[key];
    });

// Stores `value` as the enumerable `default` property of `target`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        Object.defineProperty(target, "default", { enumerable: true, value: value });
    }
    : function (target, value) {
        target["default"] = value;
    });

// Wraps a required module in a namespace object; modules flagged `__esModule` are returned untouched.
var __importStar = (this && this.__importStar) || (function () {
    // Replaces itself on first use with Object.getOwnPropertyNames, or a for-in fallback when that is missing.
    var listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (o) {
            var names = [];
            for (var name in o) {
                if (Object.prototype.hasOwnProperty.call(o, name)) names[names.length] = name;
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                // "default" is set separately below so that it always refers to the module itself.
                if (keys[i] !== "default") __createBinding(namespace, mod, keys[i]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.UnstructuredLocalParser = void 0;
|
|
37
|
-
const DocumentParser_1 = require("./DocumentParser");
|
|
38
|
-
/**
 * Document parser that uses the **local** (open-source Python) version of
 * Unstructured via the `@epilogo/unstructured-io-node` npm bridge.
 *
 * The bridge spawns a Python virtual environment and calls the Python
 * `unstructured` library directly — no API key required, but Python 3.8+
 * and system dependencies (poppler, tesseract, etc.) must be available.
 * The bridge package is loaded lazily on the first `parse()` call.
 *
 * **Peer dependency:** `@epilogo/unstructured-io-node`
 *
 * @example
 * ```typescript
 * import { UnstructuredLocalParser } from "@agentionai/agents/parsers";
 *
 * const parser = new UnstructuredLocalParser();
 * const doc = await parser.parse("/path/to/report.pdf", {
 *   strategy: "hi_res",
 *   languages: ["eng"],
 * });
 * console.log(doc.elements?.length, "elements");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
class UnstructuredLocalParser extends DocumentParser_1.DocumentParser {
    constructor() {
        super(...arguments);
        this.name = "unstructured-local";
    }
    /**
     * Parse a file using the local Python Unstructured library.
     *
     * `ensureEnvironmentSetup()` is invoked on every call before partitioning;
     * it downloads the Python venv if it does not already exist (one-time,
     * slow operation).
     *
     * @param filePath - Path to the document to parse
     * @param options - Strategy, languages, and any other unstructured kwargs
     * @returns The mapped elements together with their combined text
     * @throws Error with an install hint when `@epilogo/unstructured-io-node` cannot
     *         be found; any other failure while loading the package is rethrown unchanged
     */
    async parse(filePath, options) {
        // The package name is kept in a variable so that it is only resolved at run time.
        const pkg = "@epilogo/unstructured-io-node";
        let UnstructuredIO;
        try {
            ({ UnstructuredIO } = __importStar(require(pkg)));
        }
        catch (err) {
            // Only a missing package is reported as "not installed"; other load
            // errors (e.g. a failure inside the package itself) must stay visible.
            const message = err instanceof Error ? err.message : String(err);
            const code = err != null && typeof err === "object" ? err.code : undefined;
            if (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module") ||
                message.includes("MODULE_NOT_FOUND") ||
                message.includes("ERR_MODULE_NOT_FOUND")) {
                throw new Error("UnstructuredLocalParser requires '@epilogo/unstructured-io-node'. " +
                    "Install it with: npm install @epilogo/unstructured-io-node");
            }
            throw err;
        }
        await UnstructuredIO.ensureEnvironmentSetup();
        const { strategy, languages, ...rest } = options ?? {};
        const rawElements = await UnstructuredIO.partition({
            filename: filePath,
            strategy: strategy ?? "auto",
            ...(languages ? { languages } : {}),
            ...rest,
        });
        const elements = this.mapRawElements(rawElements);
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
    /**
     * Normalise raw bridge elements: a non-string `type` becomes `"unknown"`, a
     * non-string `text` becomes `""`, and `metadata` is kept only when it is a
     * non-array object.
     */
    mapRawElements(raw) {
        return raw.map((el) => {
            // A null/undefined entry is treated as an element with no usable fields.
            const e = el ?? {};
            return {
                type: typeof e["type"] === "string" ? e["type"] : "unknown",
                text: typeof e["text"] === "string" ? e["text"] : "",
                metadata: e["metadata"] != null &&
                    typeof e["metadata"] === "object" &&
                    !Array.isArray(e["metadata"])
                    ? e["metadata"]
                    : undefined,
            };
        });
    }
}
|
|
117
|
-
exports.UnstructuredLocalParser = UnstructuredLocalParser;
|
|
118
|
-
//# sourceMappingURL=UnstructuredLocalParser.js.map
|
package/dist/parsers/index.d.ts
DELETED
package/dist/parsers/index.js
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.DocumentParser = void 0;
|
|
4
|
-
var DocumentParser_1 = require("./DocumentParser");
|
|
5
|
-
Object.defineProperty(exports, "DocumentParser", { enumerable: true, get: function () { return DocumentParser_1.DocumentParser; } });
|
|
6
|
-
//# sourceMappingURL=index.js.map
|
package/dist/parsers/types.d.ts
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
/**
 * One structured piece of content pulled out of a document.
 * The shape follows the element format produced by Unstructured and
 * comparable parsers.
 */
export interface ParsedElement {
    /**
     * Kind of element, for example "Title", "NarrativeText", "Table",
     * "Image", "ListItem", "Header", "Footer" or "Document".
     */
    type: string;
    /** The element's textual content. */
    text: string;
    /**
     * Extra information supplied by the parser, such as page_number,
     * coordinates, languages, file_directory, filename or filetype.
     */
    metadata?: Record<string, unknown>;
}
|
|
19
|
-
/**
 * What a parser returns after processing a document file.
 */
export interface ParsedDocument {
    /** The complete plain-text content (elements joined by double newlines). */
    text: string;
    /**
     * The structured elements, for parsers that produce them.
     * Left out when a parser yields plain text only.
     */
    elements?: ParsedElement[];
    /** Metadata about the file as a whole, if the parser provides any. */
    metadata?: Record<string, unknown>;
}
|
|
33
|
-
/**
 * Options shared across all document parsers.
 * Keys other than `strategy` and `languages` are forwarded to the underlying parser.
 */
export interface ParseOptions {
    /**
     * Parsing strategy.
     * - `"auto"`: Let the parser decide (default)
     * - `"fast"`: Text extraction only, no OCR
     * - `"hi_res"`: High-resolution layout analysis with OCR
     * - `"ocr_only"`: Force OCR on every page
     */
    strategy?: "auto" | "fast" | "hi_res" | "ocr_only";
    /**
     * Languages to use for OCR, given as three-letter ISO 639-2 codes
     * (e.g. `["eng", "fra"]`) — not the two-letter ISO 639-1 form (`"en"`, `"fr"`).
     */
    languages?: string[];
    /** Pass-through options specific to the underlying parser */
    [key: string]: unknown;
}
|
|
50
|
-
//# sourceMappingURL=types.d.ts.map
|
package/dist/parsers/types.js
DELETED