@agentionai/agents 0.11.0 → 0.12.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ import { DocumentParser } from "./DocumentParser";
2
+ import { ParsedDocument, ParseOptions } from "./types";
3
+ /**
4
+ * Minimal structural contract for a LlamaIndex reader instance.
5
+ * Matches the `BaseReader` interface from `llamaindex` and `@llamaindex/readers`.
+ *
+ * Only `loadData` is required. It receives the file path (plus any extra
+ * reader-specific arguments) and resolves to an array of documents, each
+ * carrying its extracted `text` and, optionally, a `metadata` record.
6
+ */
7
+ export interface LlamaIndexReader {
8
+ loadData(filePath: string, ...args: unknown[]): Promise<Array<{
9
+ text: string;
10
+ metadata?: Record<string, unknown>;
11
+ }>>;
12
+ }
13
+ /**
14
+ * Document parser that delegates to any **LlamaIndex reader**.
15
+ *
16
+ * Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
17
+ * `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
18
+ * this class normalises the output into a {@link ParsedDocument}.
19
+ *
+ * Every document returned by the reader becomes one element of the parsed
+ * result, and the parser's {@link name} has the form `llamaindex:<label>`.
+ *
20
+ * **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
21
+ *
22
+ * @example
23
+ * ```typescript
24
+ * import { PDFReader } from "@llamaindex/readers/pdf";
25
+ * import { LlamaIndexParser } from "@agentionai/agents/parsers";
26
+ *
27
+ * const parser = new LlamaIndexParser(new PDFReader());
28
+ * const doc = await parser.parse("/path/to/report.pdf");
29
+ * await pipeline.ingestFile("/path/to/report.pdf", parser);
30
+ * ```
31
+ *
32
+ * @example Using LlamaParse (cloud OCR / layout AI)
33
+ * ```typescript
34
+ * import { LlamaParseReader } from "llamaindex";
35
+ *
36
+ * const parser = new LlamaIndexParser(
37
+ * new LlamaParseReader({ resultType: "markdown" })
38
+ * );
39
+ * ```
40
+ */
41
+ export declare class LlamaIndexParser extends DocumentParser {
42
+ private readonly reader;
43
+ readonly name: string;
44
+ /**
45
+ * @param reader - Any LlamaIndex reader instance
46
+ * @param readerName - Optional label used in {@link name}; defaults to the
47
+ * reader's constructor name
48
+ */
49
+ constructor(reader: LlamaIndexReader, readerName?: string);
50
+ /**
51
+ * Parse a file using the configured LlamaIndex reader.
52
+ *
53
+ * @param filePath - Path to the document file
54
+ * @param _options - Currently unused; kept for interface compatibility
+ * @returns The parsed document, with one `"Document"` element per item the
+ * reader returned (each tagged with its `doc_index` in the metadata)
+ * @throws Error if the reader's `loadData` call rejects; the message names
+ * this parser and the file path
55
+ */
56
+ parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
57
+ }
58
+ //# sourceMappingURL=LlamaIndexParser.d.ts.map
@@ -0,0 +1,71 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.LlamaIndexParser = void 0;
4
+ const DocumentParser_1 = require("./DocumentParser");
5
/**
 * Document parser that delegates to any **LlamaIndex reader**.
 *
 * Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
 * `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
 * this class normalises the output into a {@link ParsedDocument}.
 *
 * **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
 *
 * @example
 * ```typescript
 * import { PDFReader } from "@llamaindex/readers/pdf";
 * import { LlamaIndexParser } from "@agentionai/agents/parsers";
 *
 * const parser = new LlamaIndexParser(new PDFReader());
 * const doc = await parser.parse("/path/to/report.pdf");
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 *
 * @example Using LlamaParse (cloud OCR / layout AI)
 * ```typescript
 * import { LlamaParseReader } from "llamaindex";
 *
 * const parser = new LlamaIndexParser(
 *   new LlamaParseReader({ resultType: "markdown" })
 * );
 * ```
 */
class LlamaIndexParser extends DocumentParser_1.DocumentParser {
    /**
     * @param reader - Any LlamaIndex reader instance
     * @param readerName - Optional label used in {@link name}; defaults to the
     *   reader's constructor name, or `"reader"` when that name is empty
     */
    constructor(reader, readerName) {
        super();
        this.reader = reader;
        // `constructor.name` is "" for anonymous classes; `??` alone would let
        // that through and produce the label-less name "llamaindex:".
        const label = readerName ?? (reader.constructor?.name || "reader");
        this.name = `llamaindex:${label}`;
    }
    /**
     * Parse a file using the configured LlamaIndex reader.
     *
     * @param filePath - Path to the document file
     * @param _options - Currently unused; kept for interface compatibility
     * @returns A document with one `"Document"` element per item returned by
     *   the reader, each tagged with its `doc_index`
     * @throws Error if the reader rejects (the original error is kept on `cause`)
     * @throws TypeError if the reader resolves to something other than an array
     */
    async parse(filePath, _options) {
        let docs;
        try {
            docs = await this.reader.loadData(filePath);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            const wrapped = new Error(`LlamaIndexParser (${this.name}) failed to load "${filePath}": ${msg}`);
            // Keep the underlying error (and its stack) reachable for debugging,
            // mirroring the non-enumerable shape of the native `cause` property.
            Object.defineProperty(wrapped, "cause", {
                value: err,
                writable: true,
                configurable: true,
                enumerable: false,
            });
            throw wrapped;
        }
        if (!Array.isArray(docs)) {
            // Fail with a readable message instead of "docs.map is not a function".
            const got = docs === null ? "null" : typeof docs;
            throw new TypeError(`LlamaIndexParser (${this.name}) expected the reader to return an array of documents for "${filePath}", got ${got}`);
        }
        const elements = docs.map((doc, i) => ({
            type: "Document",
            text: doc.text ?? "",
            metadata: { ...doc.metadata, doc_index: i },
        }));
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
}
70
+ exports.LlamaIndexParser = LlamaIndexParser;
71
+ //# sourceMappingURL=LlamaIndexParser.js.map
@@ -0,0 +1,98 @@
1
+ import { DocumentParser } from "./DocumentParser";
2
+ import { ParsedDocument, ParseOptions } from "./types";
3
+ /**
4
+ * Configuration for {@link OllamaOCRParser}.
+ * Every field is optional; omitted fields use the defaults documented below.
5
+ */
6
+ export interface OllamaOCRParserConfig {
7
+ /**
8
+ * Ollama model to use for OCR.
9
+ * @default "glm-ocr"
10
+ */
11
+ model?: string;
12
+ /**
13
+ * Base URL of the local Ollama server.
+ * A trailing slash is stripped before the URL is used.
14
+ * @default "http://localhost:11434"
15
+ */
16
+ baseUrl?: string;
17
+ /**
18
+ * Prompt sent alongside each image.
19
+ * @default "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text."
20
+ */
21
+ prompt?: string;
22
+ /**
23
+ * Scale factor for rendering PDF pages to images.
24
+ * 1.0 = 72 DPI, 2.0 = 144 DPI. Lower is faster; higher improves OCR accuracy.
+ * Only applies to PDF input; image files are sent to the model as-is.
25
+ * @default 2.0
26
+ */
27
+ pdfScale?: number;
28
+ /**
29
+ * Number of pages to OCR in parallel.
30
+ * Higher values are faster but use more memory and GPU.
+ * Only applies to PDF input.
31
+ * @default 3
32
+ */
33
+ concurrency?: number;
34
+ /**
35
+ * Called after each PDF page is OCR'd.
+ * `completed` is the running count of finished pages (not the page number)
+ * and `total` is the number of pages in the PDF.
36
+ * With concurrency > 1 pages may complete out of order,
37
+ * but the final document is always in the correct page order.
+ * Not invoked when parsing a single image file.
38
+ */
39
+ onProgress?: (completed: number, total: number) => void;
40
+ }
41
+ /**
42
+ * Document parser that uses a locally-running **Ollama** vision model (e.g. `glm-ocr`)
43
+ * to perform OCR on image files and PDF documents.
44
+ *
45
+ * **Supported file types:**
46
+ * - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
47
+ * - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`)
48
+ *
49
+ * **Ollama must be running** with the model pulled:
50
+ * ```bash
51
+ * ollama pull glm-ocr
52
+ * ollama serve # if not already running
53
+ * ```
54
+ *
55
+ * @example
56
+ * ```typescript
57
+ * import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
58
+ *
59
+ * const parser = new OllamaOCRParser({
60
+ * model: "glm-ocr",
61
+ * pdfScale: 1.5,
62
+ * onProgress: (completed, total) => console.log(`OCR'd ${completed}/${total} pages...`),
63
+ * });
64
+ *
65
+ * // Parse an image
66
+ * const doc = await parser.parse("/path/to/scan.png");
67
+ *
68
+ * // Parse a PDF (requires: npm install pdf-to-img)
69
+ * const pdf = await parser.parse("/path/to/report.pdf");
70
+ *
71
+ * // Use with IngestionPipeline
72
+ * await pipeline.ingestFile("/path/to/scan.png", parser);
73
+ * ```
74
+ */
75
+ export declare class OllamaOCRParser extends DocumentParser {
76
+ readonly name = "ollama-ocr";
77
+ private readonly model;
78
+ private readonly baseUrl;
79
+ private readonly prompt;
80
+ private readonly pdfScale;
81
+ private readonly concurrency;
82
+ private readonly onProgress?;
83
+ constructor(config?: OllamaOCRParserConfig);
84
+ /**
85
+ * Parse a document file using Ollama OCR.
+ * The file type is chosen from the (case-insensitive) file extension.
86
+ *
87
+ * @param filePath - Path to the image or PDF file
88
+ * @param _options - Optional hints (unused by this parser; provided for interface compatibility)
+ * @returns The OCR result: one `NarrativeText` element for an image, or one
+ * per page (in page order) for a PDF
+ * @throws Error if the extension is neither a supported image type nor `.pdf`,
+ * if `pdf-to-img` is missing for PDF input, or if the Ollama API responds
+ * with a non-OK status
89
+ */
90
+ parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
91
+ private parseImageFile;
92
+ private parsePdf;
93
+ /**
94
+ * Send a base64-encoded image to the Ollama chat API and return the extracted text.
95
+ */
96
+ private runOCR;
97
+ }
98
+ //# sourceMappingURL=OllamaOCRParser.d.ts.map
@@ -0,0 +1,203 @@
1
+ "use strict";
2
// Module-interop helpers in the form emitted by the TypeScript compiler for
// `import * as x` under CommonJS. Each reuses an existing definition found on
// `this` before falling back to the local implementation.
//
// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (default k).
// When Object.create exists this is a property definition (a live getter unless
// m already supplies a usable descriptor); otherwise it is a plain copy.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// __setModuleDefault(o, v): stores v as the enumerable "default" property of o.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// __importStar(mod): returns mod unchanged when it is flagged __esModule;
// otherwise builds a namespace object that re-exports every own property of
// mod except "default", and sets "default" to mod itself.
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.OllamaOCRParser = void 0;
37
+ const fs = __importStar(require("fs"));
38
+ const path = __importStar(require("path"));
39
+ const DocumentParser_1 = require("./DocumentParser");
40
/** Supported image extensions for direct OCR (no conversion needed). */
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]);
/**
 * Document parser that uses a locally-running **Ollama** vision model (e.g. `glm-ocr`)
 * to perform OCR on image files and PDF documents.
 *
 * **Supported file types:**
 * - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
 * - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`)
 *
 * **Ollama must be running** with the model pulled:
 * ```bash
 * ollama pull glm-ocr
 * ollama serve   # if not already running
 * ```
 *
 * @example
 * ```typescript
 * import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
 *
 * const parser = new OllamaOCRParser({
 *   model: "glm-ocr",
 *   pdfScale: 1.5,
 *   onProgress: (completed, total) => console.log(`OCR'd ${completed}/${total} pages...`),
 * });
 *
 * // Parse an image
 * const doc = await parser.parse("/path/to/scan.png");
 *
 * // Parse a PDF (requires: npm install pdf-to-img)
 * const pdf = await parser.parse("/path/to/report.pdf");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/scan.png", parser);
 * ```
 */
class OllamaOCRParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional settings; every field falls back to a default.
     */
    constructor(config = {}) {
        super();
        this.name = "ollama-ocr";
        this.model = config.model ?? "glm-ocr";
        this.baseUrl = (config.baseUrl ?? "http://localhost:11434").replace(/\/$/, "");
        this.prompt =
            config.prompt ??
                "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text.";
        this.pdfScale = config.pdfScale ?? 2.0;
        // A pool with zero workers (concurrency of 0, a negative number, a
        // fraction below 1, or NaN) would finish instantly and return a
        // document in which no page was OCR'd, so always keep one worker.
        const requested = config.concurrency ?? 3;
        this.concurrency = requested >= 1 ? Math.floor(requested) : 1;
        this.onProgress = config.onProgress;
    }
    /**
     * Parse a document file using Ollama OCR.
     *
     * @param filePath - Path to the image or PDF file
     * @param _options - Optional hints (unused by this parser; provided for interface compatibility)
     * @returns The OCR result for the image, or for every page of the PDF
     * @throws Error if the file extension is not a supported image type or `.pdf`
     */
    async parse(filePath, _options) {
        const ext = path.extname(filePath).toLowerCase();
        if (IMAGE_EXTS.has(ext)) {
            return this.parseImageFile(filePath);
        }
        if (ext === ".pdf") {
            return this.parsePdf(filePath);
        }
        throw new Error(`OllamaOCRParser: unsupported file type "${ext}". ` +
            `Supported: ${[...IMAGE_EXTS].join(", ")}, .pdf`);
    }
    // ---------------------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------------------
    /**
     * OCR a single image file and wrap the text in a one-element document.
     */
    async parseImageFile(filePath) {
        // Read asynchronously so a large scan does not block the event loop.
        const bytes = await fs.promises.readFile(filePath);
        const text = await this.runOCR(bytes.toString("base64"));
        const element = {
            type: "NarrativeText",
            text,
            metadata: { source: filePath },
        };
        return { text, elements: [element], metadata: { filePath, pages: 1 } };
    }
    /**
     * Render every page of a PDF to an image with `pdf-to-img`, OCR the pages
     * with a bounded worker pool, and return them in page order.
     */
    async parsePdf(filePath) {
        // Dynamically load pdf-to-img — optional peer dep, no system deps required
        let pdfFn;
        try {
            // Resolve the module from process.cwd() so that peer deps installed in
            // the consuming project (not this library's own node_modules) are found.
            const { createRequire } = require("module");
            const { pathToFileURL } = require("url");
            const requireFromCwd = createRequire(path.resolve(process.cwd(), "__placeholder__.js"));
            const resolved = requireFromCwd.resolve("pdf-to-img");
            // Use a native dynamic import: CommonJS `require()` cannot load a
            // `file://` URL specifier, and the resolved package may be ESM.
            ({ pdf: pdfFn } = await import(pathToFileURL(resolved).href));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            const code = err != null && typeof err === "object" ? err.code : undefined;
            if (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module") ||
                message.includes("MODULE_NOT_FOUND") ||
                message.includes("ERR_MODULE_NOT_FOUND")) {
                throw new Error("OllamaOCRParser: PDF parsing requires 'pdf-to-img'. " +
                    "Install it with: npm install pdf-to-img");
            }
            throw err;
        }
        if (typeof pdfFn !== "function") {
            throw new Error("OllamaOCRParser: the installed 'pdf-to-img' package does not export a 'pdf' function");
        }
        const doc = await pdfFn(filePath, { scale: this.pdfScale });
        // Step 1: render all pages to buffers (no network involved)
        const pageBuffers = [];
        for await (const buf of doc) {
            pageBuffers.push(buf);
        }
        // Count what was actually rendered so `elements` can never contain holes.
        const total = pageBuffers.length;
        // Step 2: OCR pages in parallel using a worker-pool with concurrency limit
        const elements = new Array(total);
        let completed = 0;
        let nextPage = 0;
        let failed = false;
        const worker = async () => {
            // Stop claiming pages once any worker has failed: the whole parse is
            // going to reject, so further OCR requests would be wasted work.
            while (!failed && nextPage < total) {
                const i = nextPage++;
                try {
                    const text = await this.runOCR(pageBuffers[i].toString("base64"));
                    elements[i] = {
                        type: "NarrativeText",
                        text,
                        metadata: { page_number: i + 1, source: filePath },
                    };
                    this.onProgress?.(++completed, total);
                }
                catch (err) {
                    failed = true;
                    throw err;
                }
            }
        };
        await Promise.all(Array.from({ length: Math.min(this.concurrency, total) }, () => worker()));
        return {
            text: this.elementsToText(elements),
            elements,
            metadata: { filePath, pages: total },
        };
    }
    /**
     * Send a base64-encoded image to the Ollama chat API and return the extracted text.
     *
     * @param base64Image - Image bytes encoded as base64
     * @returns The trimmed message content, or `""` when the response has none
     * @throws Error if the API responds with a non-OK HTTP status
     */
    async runOCR(base64Image) {
        const url = `${this.baseUrl}/api/chat`;
        const response = await fetch(url, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                model: this.model,
                messages: [
                    {
                        role: "user",
                        content: this.prompt,
                        images: [base64Image],
                    },
                ],
                stream: false,
            }),
        });
        if (!response.ok) {
            // The error body is best-effort context; ignore failures reading it.
            const body = await response.text().catch(() => "");
            throw new Error(`OllamaOCRParser: Ollama API error ${response.status} ${response.statusText}` +
                (body ? `\n${body}` : ""));
        }
        const data = (await response.json());
        return data.message?.content?.trim() ?? "";
    }
}
202
+ exports.OllamaOCRParser = OllamaOCRParser;
203
+ //# sourceMappingURL=OllamaOCRParser.js.map
@@ -0,0 +1,57 @@
1
+ import { DocumentParser } from "./DocumentParser";
2
+ import { ParsedDocument, ParseOptions } from "./types";
3
+ /**
4
+ * Configuration for {@link UnstructuredAPIParser}.
5
+ */
6
+ export interface UnstructuredAPIParserConfig {
7
+ /**
8
+ * API key for the Unstructured hosted service.
9
+ * Not required when `serverUrl` points to a self-hosted instance
10
+ * that does not enforce authentication.
+ * When set, it is handed to the client as its API-key credential.
11
+ */
12
+ apiKey?: string;
13
+ /**
14
+ * Base URL of the Unstructured API.
15
+ * When omitted, no server URL is passed to `unstructured-client`, so the
+ * client library's own default applies (presumably the official hosted
+ * endpoint — confirm against the installed client version).
16
+ * Set to your own host when running the open-source API server locally,
+ * for example after starting it with:
17
+ * ```
18
+ * docker run -p 8000:8000 downloads.unstructured.io/unstructured-io/unstructured-api:latest
+ * ```
+ * and then configuring:
+ * ```
19
+ * serverUrl: "http://localhost:8000"
20
+ * ```
21
+ */
22
+ serverUrl?: string;
23
+ }
24
+ /**
25
+ * Document parser backed by the **Unstructured REST API** — either the
26
+ * official hosted service or a self-hosted open-source API server.
27
+ *
28
+ * Uses the official `unstructured-client` npm package under the hood.
29
+ *
30
+ * **Peer dependency:** `unstructured-client`
31
+ *
32
+ * @example
33
+ * ```typescript
34
+ * // Hosted service
35
+ * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
36
+ *
37
+ * // Self-hosted (no auth required)
38
+ * const selfHosted = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
39
+ *
40
+ * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
41
+ * await pipeline.ingestFile("/path/to/report.pdf", parser);
42
+ * ```
43
+ */
44
+ export declare class UnstructuredAPIParser extends DocumentParser {
45
+ private readonly config;
46
+ readonly name = "unstructured-api";
47
+ constructor(config?: UnstructuredAPIParserConfig);
48
+ /**
49
+ * Parse a file via the Unstructured API.
+ * The file is read from disk and uploaded under its base name.
50
+ *
51
+ * @param filePath - Path to the document to parse (read from disk)
52
+ * @param options - Strategy, languages, and any other partition parameters;
+ * a missing or unrecognised `strategy` is sent as `"auto"`
+ * @returns The parsed document built from the elements in the API response
+ * @throws Error if `unstructured-client` is not installed
53
+ */
54
+ parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
55
+ private mapRawElements;
56
+ }
57
+ //# sourceMappingURL=UnstructuredAPIParser.d.ts.map
@@ -0,0 +1,131 @@
1
+ "use strict";
2
// Module-interop helpers in the form emitted by the TypeScript compiler for
// `import * as x` under CommonJS. Each reuses an existing definition found on
// `this` before falling back to the local implementation.
//
// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (default k).
// When Object.create exists this is a property definition (a live getter unless
// m already supplies a usable descriptor); otherwise it is a plain copy.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// __setModuleDefault(o, v): stores v as the enumerable "default" property of o.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// __importStar(mod): returns mod unchanged when it is flagged __esModule;
// otherwise builds a namespace object that re-exports every own property of
// mod except "default", and sets "default" to mod itself.
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.UnstructuredAPIParser = void 0;
37
+ const DocumentParser_1 = require("./DocumentParser");
38
/**
 * Strategies accepted by the partition endpoint, keyed by the option value.
 * A Map (rather than a plain object) so that names inherited from
 * `Object.prototype` such as "constructor" can never be returned by a lookup.
 */
const STRATEGY_MAP = new Map([
    ["auto", "auto"],
    ["fast", "fast"],
    ["hi_res", "hi_res"],
    ["ocr_only", "ocr_only"],
]);
/**
 * Report whether an error thrown while loading a module means the module
 * could not be found, as opposed to the module failing while it loads.
 */
function isMissingModuleError(err) {
    const code = err != null && typeof err === "object" ? err.code : undefined;
    if (code === "MODULE_NOT_FOUND" || code === "ERR_MODULE_NOT_FOUND") {
        return true;
    }
    const message = err instanceof Error ? err.message : String(err);
    return message.includes("Cannot find module");
}
/**
 * Document parser backed by the **Unstructured REST API** — either the
 * official hosted service or a self-hosted open-source API server.
 *
 * Uses the official `unstructured-client` npm package under the hood.
 *
 * **Peer dependency:** `unstructured-client`
 *
 * @example
 * ```typescript
 * // Hosted service
 * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
 *
 * // Self-hosted (no auth required)
 * const selfHosted = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
 *
 * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
class UnstructuredAPIParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional API key and/or server URL for the client
     */
    constructor(config = {}) {
        super();
        this.config = config;
        this.name = "unstructured-api";
    }
    /**
     * Parse a file via the Unstructured API.
     *
     * @param filePath - Path to the document to parse (read from disk)
     * @param options - Strategy, languages, and any other partition parameters
     * @returns The parsed document built from the elements in the API response
     * @throws Error if `unstructured-client` is not installed; any other error
     *   raised while loading the client is rethrown unchanged
     */
    async parse(filePath, options) {
        // Held in a variable so the optional peer dependency is only resolved at call time.
        const pkg = "unstructured-client";
        let UnstructuredClient;
        try {
            ({ UnstructuredClient } = require(pkg));
        }
        catch (err) {
            // Only claim the package is missing when that is what happened;
            // a failure inside the package should surface as itself.
            if (isMissingModuleError(err)) {
                throw new Error("UnstructuredAPIParser requires 'unstructured-client'. " +
                    "Install it with: npm install unstructured-client");
            }
            throw err;
        }
        const fs = require("fs");
        const path = require("path");
        const clientConfig = {};
        if (this.config.apiKey) {
            clientConfig["security"] = { apiKeyAuth: this.config.apiKey };
        }
        if (this.config.serverUrl) {
            clientConfig["serverURL"] = this.config.serverUrl;
        }
        const client = new UnstructuredClient(clientConfig);
        // Read asynchronously so a large document does not block the event loop.
        const fileContent = await fs.promises.readFile(filePath);
        const fileName = path.basename(filePath);
        const { strategy, languages, ...rest } = options ?? {};
        const res = await client.general.partition({
            partitionParameters: {
                files: { content: fileContent, fileName },
                strategy: STRATEGY_MAP.get(strategy ?? "auto") ?? "auto",
                ...(languages ? { languages } : {}),
                ...rest,
            },
        });
        // NOTE(review): some client versions appear to resolve to the element
        // array itself rather than `{ elements }` — both shapes are accepted
        // here; confirm against the installed client version.
        const rawElements = Array.isArray(res)
            ? res
            : Array.isArray(res?.elements)
                ? res.elements
                : [];
        const elements = this.mapRawElements(rawElements);
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
    /**
     * Normalise raw API elements: non-string `type`/`text` become
     * `"unknown"`/`""`, and `metadata` is kept only when it is a plain
     * (non-array) object.
     */
    mapRawElements(raw) {
        return raw.map((el) => {
            // Tolerate null/undefined entries instead of throwing on property access.
            const e = el ?? {};
            return {
                type: typeof e["type"] === "string" ? e["type"] : "unknown",
                text: typeof e["text"] === "string" ? e["text"] : "",
                metadata: e["metadata"] != null &&
                    typeof e["metadata"] === "object" &&
                    !Array.isArray(e["metadata"])
                    ? e["metadata"]
                    : undefined,
            };
        });
    }
}
130
+ exports.UnstructuredAPIParser = UnstructuredAPIParser;
131
+ //# sourceMappingURL=UnstructuredAPIParser.js.map
@@ -0,0 +1,42 @@
1
+ import { DocumentParser } from "./DocumentParser";
2
+ import { ParsedDocument, ParseOptions } from "./types";
3
+ /**
4
+ * Document parser that uses the **local** (open-source Python) version of
5
+ * Unstructured via the `@epilogo/unstructured-io-node` npm bridge.
6
+ *
7
+ * The bridge spawns a Python virtual environment and calls the Python
8
+ * `unstructured` library directly — no API key required, but Python 3.8+
9
+ * and system dependencies (poppler, tesseract, etc.) must be available.
10
+ *
11
+ * **Peer dependency:** `@epilogo/unstructured-io-node`
12
+ *
+ * This parser takes no configuration: the declaration has no constructor
+ * parameters.
+ *
13
+ * @example
14
+ * ```typescript
15
+ * import { UnstructuredLocalParser } from "@agentionai/agents/parsers";
16
+ *
17
+ * const parser = new UnstructuredLocalParser();
18
+ * const doc = await parser.parse("/path/to/report.pdf", {
19
+ * strategy: "hi_res",
20
+ * languages: ["eng"],
21
+ * });
22
+ * console.log(doc.elements?.length, "elements");
23
+ *
24
+ * // Use with IngestionPipeline
25
+ * await pipeline.ingestFile("/path/to/report.pdf", parser);
26
+ * ```
27
+ */
28
+ export declare class UnstructuredLocalParser extends DocumentParser {
29
+ readonly name = "unstructured-local";
30
+ /**
31
+ * Parse a file using the local Python Unstructured library.
32
+ *
33
+ * On first call, `ensureEnvironmentSetup()` is invoked to download the
34
+ * Python venv if it does not already exist (one-time, slow operation).
+ * NOTE(review): the implementation is not part of this declaration file;
+ * the first-call behaviour above should be confirmed against it.
35
+ *
36
+ * @param filePath - Path to the document to parse
37
+ * @param options - Strategy, languages, and any other unstructured kwargs
+ * @returns A promise resolving to the parsed document
38
+ */
39
+ parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
40
+ private mapRawElements;
41
+ }
42
+ //# sourceMappingURL=UnstructuredLocalParser.d.ts.map