@agentionai/agents 0.10.2 → 0.12.0-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunkers/ElementChunker.d.ts +100 -0
- package/dist/chunkers/ElementChunker.js +242 -0
- package/dist/chunkers/index.d.ts +1 -0
- package/dist/chunkers/index.js +3 -1
- package/dist/ingestion/IngestionPipeline.d.ts +73 -1
- package/dist/ingestion/IngestionPipeline.js +110 -1
- package/dist/parsers/DocumentParser.d.ts +36 -0
- package/dist/parsers/DocumentParser.js +35 -0
- package/dist/parsers/LlamaIndexParser.d.ts +58 -0
- package/dist/parsers/LlamaIndexParser.js +71 -0
- package/dist/parsers/OllamaOCRParser.d.ts +98 -0
- package/dist/parsers/OllamaOCRParser.js +203 -0
- package/dist/parsers/UnstructuredAPIParser.d.ts +57 -0
- package/dist/parsers/UnstructuredAPIParser.js +131 -0
- package/dist/parsers/UnstructuredLocalParser.d.ts +42 -0
- package/dist/parsers/UnstructuredLocalParser.js +118 -0
- package/dist/parsers/index.d.ts +3 -0
- package/dist/parsers/index.js +6 -0
- package/dist/parsers/types.d.ts +50 -0
- package/dist/parsers/types.js +3 -0
- package/dist/vectorstore/LanceDBVectorStore.d.ts +1 -16
- package/dist/vectorstore/OpenSearchVectorStore.d.ts +259 -0
- package/dist/vectorstore/OpenSearchVectorStore.js +481 -0
- package/dist/vectorstore/VectorStore.d.ts +25 -0
- package/dist/vectorstore/index.d.ts +3 -2
- package/dist/vectorstore/index.js +3 -1
- package/package.json +50 -2
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { DocumentParser } from "./DocumentParser";
|
|
2
|
+
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
+
/**
 * A LlamaIndex reader instance.
 * Matches the `BaseReader` interface from `llamaindex` and `@llamaindex/readers`.
 *
 * The type is structural: any object exposing a compatible `loadData`
 * method is accepted, so no LlamaIndex package has to be imported here.
 */
export interface LlamaIndexReader {
    /**
     * Load a file and resolve to the documents read from it.
     *
     * @param filePath - Path of the file to load
     * @param args - Additional reader-specific arguments. `LlamaIndexParser`
     *   calls this method with `filePath` only.
     * @returns One entry per loaded document, carrying its text and, when the
     *   reader provides it, a metadata record
     */
    loadData(filePath: string, ...args: unknown[]): Promise<Array<{
        text: string;
        metadata?: Record<string, unknown>;
    }>>;
}
|
|
13
|
+
/**
 * Document parser that delegates to any **LlamaIndex reader**.
 *
 * Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
 * `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
 * this class normalises the output into a {@link ParsedDocument}: every
 * document returned by the reader becomes one element of type `"Document"`.
 *
 * **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
 *
 * @example
 * ```typescript
 * import { PDFReader } from "@llamaindex/readers/pdf";
 * import { LlamaIndexParser } from "@agentionai/agents/parsers";
 *
 * const parser = new LlamaIndexParser(new PDFReader());
 * const doc = await parser.parse("/path/to/report.pdf");
 *
 * // Or hand the parser to an existing IngestionPipeline instance
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 *
 * @example Using LlamaParse (cloud OCR / layout AI)
 * ```typescript
 * import { LlamaParseReader } from "llamaindex";
 *
 * const parser = new LlamaIndexParser(
 *   new LlamaParseReader({ resultType: "markdown" })
 * );
 * ```
 */
export declare class LlamaIndexParser extends DocumentParser {
    /** Reader that performs the actual file loading. */
    private readonly reader;
    /** Parser label, built as `llamaindex:<reader name>`. */
    readonly name: string;
    /**
     * @param reader - Any LlamaIndex reader instance
     * @param readerName - Optional label used in {@link name}; defaults to the
     *   reader's constructor name
     */
    constructor(reader: LlamaIndexReader, readerName?: string);
    /**
     * Parse a file using the configured LlamaIndex reader.
     *
     * @param filePath - Path to the document file
     * @param _options - Currently unused; kept for interface compatibility
     * @returns The parsed document. Each element's metadata is the reader's
     *   metadata for that document plus `doc_index`, its position in the
     *   reader's result.
     * @throws Error when the reader fails to load the file; the message
     *   includes the parser name and the file path
     */
    parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
}
|
|
58
|
+
//# sourceMappingURL=LlamaIndexParser.d.ts.map
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.LlamaIndexParser = void 0;
|
|
4
|
+
const DocumentParser_1 = require("./DocumentParser");
|
|
5
|
+
/**
 * Document parser that delegates to any **LlamaIndex reader**.
 *
 * Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
 * `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
 * this class normalises the output into a {@link ParsedDocument}.
 *
 * **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
 *
 * @example
 * ```typescript
 * import { PDFReader } from "@llamaindex/readers/pdf";
 * import { LlamaIndexParser } from "@agentionai/agents/parsers";
 *
 * const parser = new LlamaIndexParser(new PDFReader());
 * const doc = await parser.parse("/path/to/report.pdf");
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 *
 * @example Using LlamaParse (cloud OCR / layout AI)
 * ```typescript
 * import { LlamaParseReader } from "llamaindex";
 *
 * const parser = new LlamaIndexParser(
 *   new LlamaParseReader({ resultType: "markdown" })
 * );
 * ```
 */
class LlamaIndexParser extends DocumentParser_1.DocumentParser {
    /**
     * @param reader - Any LlamaIndex reader instance
     * @param readerName - Optional label used in {@link name}; defaults to the
     *   reader's constructor name
     */
    constructor(reader, readerName) {
        super();
        this.reader = reader;
        // `||` rather than `??`: an anonymous class reports an empty constructor
        // name, which would otherwise produce the meaningless label "llamaindex:".
        this.name = `llamaindex:${readerName || reader.constructor?.name || "reader"}`;
    }
    /**
     * Parse a file using the configured LlamaIndex reader.
     *
     * Every document returned by the reader becomes one element of type
     * `"Document"`; its metadata is the reader's metadata plus `doc_index`
     * (the position of the document in the reader's result).
     *
     * @param filePath - Path to the document file
     * @param _options - Currently unused; kept for interface compatibility
     * @returns The combined text and the per-document elements
     * @throws Error if the reader rejects/throws (the original error is kept
     *   on `cause`) or resolves to something that is not an array
     */
    async parse(filePath, _options) {
        let docs;
        try {
            docs = await this.reader.loadData(filePath);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            const wrapped = new Error(`LlamaIndexParser (${this.name}) failed to load "${filePath}": ${msg}`);
            // Keep the underlying error (and its stack) reachable for debugging.
            // Defined non-enumerable to mirror the native `cause` property.
            Object.defineProperty(wrapped, "cause", {
                value: err,
                writable: true,
                configurable: true,
                enumerable: false,
            });
            throw wrapped;
        }
        if (!Array.isArray(docs)) {
            // Fail with a message that names the parser and file instead of an
            // opaque "docs.map is not a function".
            throw new Error(`LlamaIndexParser (${this.name}) failed to load "${filePath}": ` +
                "reader did not return an array of documents");
        }
        const elements = docs.map((doc, i) => ({
            type: "Document",
            // Optional chaining tolerates null/undefined entries in the result.
            text: doc?.text ?? "",
            metadata: { ...doc?.metadata, doc_index: i },
        }));
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
}
|
|
70
|
+
exports.LlamaIndexParser = LlamaIndexParser;
|
|
71
|
+
//# sourceMappingURL=LlamaIndexParser.js.map
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { DocumentParser } from "./DocumentParser";
|
|
2
|
+
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
+
/**
 * Configuration for {@link OllamaOCRParser}.
 * Every field is optional; omitted fields fall back to the documented default.
 */
export interface OllamaOCRParserConfig {
    /**
     * Ollama model to use for OCR.
     * @default "glm-ocr"
     */
    model?: string;
    /**
     * Base URL of the local Ollama server.
     * A trailing slash is removed before `/api/chat` is appended.
     * @default "http://localhost:11434"
     */
    baseUrl?: string;
    /**
     * Prompt sent alongside each image.
     * @default "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text."
     */
    prompt?: string;
    /**
     * Scale factor for rendering PDF pages to images.
     * 1.0 = 72 DPI, 2.0 = 144 DPI. Lower is faster; higher improves OCR accuracy.
     * Only used for PDF input.
     * @default 2.0
     */
    pdfScale?: number;
    /**
     * Number of pages to OCR in parallel.
     * Higher values are faster but use more memory and GPU.
     * Should be a positive integer; never more workers than pages are started.
     * @default 3
     */
    concurrency?: number;
    /**
     * Called after each PDF page is OCR'd (not called for single image files).
     * With concurrency > 1 pages may complete out of order,
     * but the final document is always in the correct page order.
     *
     * `completed` is the running count of finished pages, not the number of
     * the page that just finished; `total` is the page count of the PDF.
     */
    onProgress?: (completed: number, total: number) => void;
}
|
|
41
|
+
/**
 * Document parser that uses a locally-running **Ollama** vision model (e.g. `glm-ocr`)
 * to perform OCR on image files and PDF documents.
 *
 * **Supported file types:**
 * - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
 * - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`)
 *
 * **Ollama must be running** with the model pulled:
 * ```bash
 * ollama pull glm-ocr
 * ollama serve   # if not already running
 * ```
 *
 * @example
 * ```typescript
 * import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
 *
 * const parser = new OllamaOCRParser({
 *   model: "glm-ocr",
 *   pdfScale: 1.5,
 *   // First argument is the number of pages finished so far, not a page number.
 *   onProgress: (completed, total) => console.log(`OCR ${completed}/${total} pages done...`),
 * });
 *
 * // Parse an image
 * const doc = await parser.parse("/path/to/scan.png");
 *
 * // Parse a PDF (requires: npm install pdf-to-img)
 * const pdf = await parser.parse("/path/to/report.pdf");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/scan.png", parser);
 * ```
 */
export declare class OllamaOCRParser extends DocumentParser {
    readonly name = "ollama-ocr";
    /** Ollama model name sent with every OCR request. */
    private readonly model;
    /** Ollama server base URL without a trailing slash. */
    private readonly baseUrl;
    /** Instruction text sent alongside each image. */
    private readonly prompt;
    /** Render scale passed to `pdf-to-img` for PDF pages. */
    private readonly pdfScale;
    /** Maximum number of pages OCR'd at the same time. */
    private readonly concurrency;
    /** Optional per-page progress callback (PDF input only). */
    private readonly onProgress?;
    /**
     * @param config - Optional settings; see {@link OllamaOCRParserConfig} for defaults
     */
    constructor(config?: OllamaOCRParserConfig);
    /**
     * Parse a document file using Ollama OCR.
     *
     * The file type is chosen from the (case-insensitive) file extension.
     *
     * @param filePath - Path to the image or PDF file
     * @param _options - Optional hints (unused by this parser; provided for interface compatibility)
     * @returns The OCR text plus one `NarrativeText` element per image or PDF page
     * @throws Error if the extension is neither a supported image type nor `.pdf`
     */
    parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
    /** OCR a single image file and wrap the result in a one-element document. */
    private parseImageFile;
    /** Render a PDF with `pdf-to-img` and OCR its pages, keeping page order. */
    private parsePdf;
    /**
     * Send a base64-encoded image to the Ollama chat API and return the extracted text.
     */
    private runOCR;
}
|
|
98
|
+
//# sourceMappingURL=OllamaOCRParser.d.ts.map
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// ---------------------------------------------------------------------------
// Module-interop helpers used by the `__importStar(require(...))` calls in this
// file. Each helper reuses an implementation already present on `this` when
// there is one, otherwise it defines its own.
// ---------------------------------------------------------------------------

// Expose `m[k]` on `o` under the name `k2` (defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Replace the descriptor with a live getter when there is no own
    // descriptor, when it is an accessor on a module not marked `__esModule`,
    // or when it is a data property that is writable or configurable.
    // Otherwise the existing descriptor is reused as-is.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    // Fallback without Object.create: plain value copy (not a live binding).
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Set `o.default` to `v` (as an enumerable property where defineProperty is used).
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Build a namespace-style object for a required module.
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: on first call it picks
    // Object.getOwnPropertyNames, or a for-in/hasOwnProperty fallback, and
    // stores the choice back into `ownKeys` for later calls.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        // Modules flagged `__esModule` are returned unchanged.
        if (mod && mod.__esModule) return mod;
        var result = {};
        // Bind every own property except "default", then set `default` to the
        // module object itself.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.OllamaOCRParser = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const DocumentParser_1 = require("./DocumentParser");
|
|
40
|
+
/** Supported image extensions for direct OCR (no conversion needed). */
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]);
/**
 * Document parser that uses a locally-running **Ollama** vision model (e.g. `glm-ocr`)
 * to perform OCR on image files and PDF documents.
 *
 * **Supported file types:**
 * - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
 * - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`)
 *
 * **Ollama must be running** with the model pulled:
 * ```bash
 * ollama pull glm-ocr
 * ollama serve   # if not already running
 * ```
 *
 * @example
 * ```typescript
 * import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
 *
 * const parser = new OllamaOCRParser({
 *   model: "glm-ocr",
 *   pdfScale: 1.5,
 *   // First argument is the number of pages finished so far, not a page number.
 *   onProgress: (completed, total) => console.log(`OCR ${completed}/${total} pages done...`),
 * });
 *
 * // Parse an image
 * const doc = await parser.parse("/path/to/scan.png");
 *
 * // Parse a PDF (requires: npm install pdf-to-img)
 * const pdf = await parser.parse("/path/to/report.pdf");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/scan.png", parser);
 * ```
 */
class OllamaOCRParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional settings; every omitted field uses its default
     */
    constructor(config = {}) {
        super();
        this.name = "ollama-ocr";
        this.model = config.model ?? "glm-ocr";
        // Strip all trailing slashes so `${baseUrl}/api/chat` never contains "//".
        this.baseUrl = (config.baseUrl ?? "http://localhost:11434").replace(/\/+$/, "");
        this.prompt =
            config.prompt ??
                "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text.";
        this.pdfScale = config.pdfScale ?? 2.0;
        // At least one worker is required: with 0, a negative number or NaN no
        // page would ever be OCR'd and parsePdf would return empty slots.
        const requested = Math.floor(config.concurrency ?? 3);
        this.concurrency = requested >= 1 ? requested : 1;
        this.onProgress = config.onProgress;
    }
    /**
     * Parse a document file using Ollama OCR.
     *
     * @param filePath - Path to the image or PDF file
     * @param _options - Optional hints (unused by this parser; provided for interface compatibility)
     * @returns The OCR text plus one `NarrativeText` element per image or PDF page
     * @throws Error if the file extension is not a supported image type or `.pdf`
     */
    async parse(filePath, _options) {
        const ext = path.extname(filePath).toLowerCase();
        if (IMAGE_EXTS.has(ext)) {
            return this.parseImageFile(filePath);
        }
        if (ext === ".pdf") {
            return this.parsePdf(filePath);
        }
        throw new Error(`OllamaOCRParser: unsupported file type "${ext}". ` +
            `Supported: ${[...IMAGE_EXTS].join(", ")}, .pdf`);
    }
    // ---------------------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------------------
    /**
     * OCR a single image file.
     *
     * @param filePath - Path to the image
     * @returns A document with exactly one element and `pages: 1` in its metadata
     */
    async parseImageFile(filePath) {
        // Asynchronous read: a synchronous read would block the event loop
        // while large scans are loaded.
        const base64 = (await fs.promises.readFile(filePath)).toString("base64");
        const text = await this.runOCR(base64);
        const element = {
            type: "NarrativeText",
            text,
            metadata: { source: filePath },
        };
        return { text, elements: [element], metadata: { filePath, pages: 1 } };
    }
    /**
     * Render every page of a PDF with `pdf-to-img` and OCR the pages with at
     * most `this.concurrency` requests in flight.
     *
     * @param filePath - Path to the PDF
     * @returns A document with one element per page, in page order
     * @throws Error if `pdf-to-img` is not installed or cannot be loaded, or
     *   if any page fails to OCR
     */
    async parsePdf(filePath) {
        // Dynamically load pdf-to-img — optional peer dep, no system deps required
        let pdfFn;
        try {
            // Resolve the module from process.cwd() so that peer deps installed in
            // the consuming project (not this library's own node_modules) are found.
            const { createRequire } = require("module");
            const { pathToFileURL } = require("url");
            const requireFromCwd = createRequire(path.resolve(process.cwd(), "__placeholder__.js"));
            const resolved = requireFromCwd.resolve("pdf-to-img");
            // This must stay a native `import()`. The previous down-levelled form
            // called `require("file:///…")`, which CommonJS cannot resolve, so an
            // installed pdf-to-img was reported as missing.
            // NOTE(review): if this file is regenerated by tsc with
            // module=commonjs the call is rewritten to require() again — confirm
            // the build keeps dynamic imports intact.
            ({ pdf: pdfFn } = await import(pathToFileURL(resolved).href));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            // Node reports missing modules through `err.code`; the message checks
            // are kept for errors that only carry the text.
            const code = err !== null && typeof err === "object" ? err.code : undefined;
            if (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module") ||
                message.includes("MODULE_NOT_FOUND") ||
                message.includes("ERR_MODULE_NOT_FOUND")) {
                throw new Error("OllamaOCRParser: PDF parsing requires 'pdf-to-img'. " +
                    "Install it with: npm install pdf-to-img");
            }
            throw err;
        }
        if (typeof pdfFn !== "function") {
            throw new Error("OllamaOCRParser: 'pdf-to-img' does not export a 'pdf' function");
        }
        const doc = await pdfFn(filePath, { scale: this.pdfScale });
        // Step 1: render all pages to buffers
        const pageBuffers = [];
        for await (const buf of doc) {
            pageBuffers.push(buf);
        }
        // Count the pages actually rendered rather than trusting `doc.length`,
        // so the result array and the progress total always match the work done.
        const total = pageBuffers.length;
        // Step 2: OCR pages in parallel using a worker-pool with concurrency limit
        const elements = new Array(total);
        let completed = 0;
        let next = 0; // index of the next page to hand out
        let failed = false; // set once any page fails so idle workers stop early
        const worker = async () => {
            while (!failed && next < total) {
                const i = next++;
                try {
                    const text = await this.runOCR(pageBuffers[i].toString("base64"));
                    // Writing by index keeps page order even when pages finish out of order.
                    elements[i] = {
                        type: "NarrativeText",
                        text,
                        metadata: { page_number: i + 1, source: filePath },
                    };
                    this.onProgress?.(++completed, total);
                }
                catch (err) {
                    // Do not keep sending OCR requests for a parse that will reject anyway.
                    failed = true;
                    throw err;
                }
            }
        };
        await Promise.all(Array.from({ length: Math.min(this.concurrency, total) }, () => worker()));
        return {
            text: this.elementsToText(elements),
            elements,
            metadata: { filePath, pages: total },
        };
    }
    /**
     * Send a base64-encoded image to the Ollama chat API and return the extracted text.
     *
     * @param base64Image - Image bytes encoded as base64
     * @returns The trimmed `message.content` of the response, or `""` when absent
     * @throws Error if the HTTP response status is not OK; the response body,
     *   when readable, is appended to the message
     */
    async runOCR(base64Image) {
        const url = `${this.baseUrl}/api/chat`;
        const response = await fetch(url, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                model: this.model,
                messages: [
                    {
                        role: "user",
                        content: this.prompt,
                        images: [base64Image],
                    },
                ],
                // Ask for one complete JSON response instead of a stream of chunks.
                stream: false,
            }),
        });
        if (!response.ok) {
            // Best effort: a body that cannot be read must not mask the status error.
            const body = await response.text().catch(() => "");
            throw new Error(`OllamaOCRParser: Ollama API error ${response.status} ${response.statusText}` +
                (body ? `\n${body}` : ""));
        }
        const data = (await response.json());
        return data.message?.content?.trim() ?? "";
    }
}
|
|
202
|
+
exports.OllamaOCRParser = OllamaOCRParser;
|
|
203
|
+
//# sourceMappingURL=OllamaOCRParser.js.map
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { DocumentParser } from "./DocumentParser";
|
|
2
|
+
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
+
/**
 * Configuration for {@link UnstructuredAPIParser}.
 */
export interface UnstructuredAPIParserConfig {
    /**
     * API key for the Unstructured hosted service.
     * Not required when `serverUrl` points to a self-hosted instance
     * that does not enforce authentication.
     * When omitted, no credentials are passed to the client.
     */
    apiKey?: string;
    /**
     * Base URL of the Unstructured API.
     * When omitted, no server URL is passed and `unstructured-client` uses its
     * own default endpoint.
     * Set to your own host when running the open-source API server locally.
     *
     * Start the server:
     * ```bash
     * docker run -p 8000:8000 downloads.unstructured.io/unstructured-io/unstructured-api:latest
     * ```
     * Then configure the parser:
     * ```typescript
     * new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
     * ```
     */
    serverUrl?: string;
}
|
|
24
|
+
/**
 * Document parser backed by the **Unstructured REST API** — either the
 * official hosted service or a self-hosted open-source API server.
 *
 * Uses the official `unstructured-client` npm package under the hood; the
 * package is loaded lazily on the first call to {@link parse}.
 *
 * **Peer dependency:** `unstructured-client`
 *
 * @example
 * ```typescript
 * // Hosted service
 * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
 *
 * // Self-hosted (no auth required)
 * const parser = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
 *
 * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
export declare class UnstructuredAPIParser extends DocumentParser {
    /** Settings supplied to the constructor (API key and/or server URL). */
    private readonly config;
    readonly name = "unstructured-api";
    /**
     * @param config - Optional API key and server URL; see
     *   {@link UnstructuredAPIParserConfig}
     */
    constructor(config?: UnstructuredAPIParserConfig);
    /**
     * Parse a file via the Unstructured API.
     *
     * `strategy` defaults to `"auto"` (unrecognised values also fall back to
     * `"auto"`), `languages` is sent only when provided, and every other
     * option is forwarded unchanged as a partition parameter.
     *
     * @param filePath - Path to the document to parse (read from disk)
     * @param options - Strategy, languages, and any other partition parameters
     * @returns The combined text and the elements returned by the API
     * @throws Error if `unstructured-client` is not installed
     */
    parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
    /** Convert raw API elements into `{ type, text, metadata }` elements. */
    private mapRawElements;
}
|
|
57
|
+
//# sourceMappingURL=UnstructuredAPIParser.d.ts.map
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// ---------------------------------------------------------------------------
// Module-interop helpers used by the `__importStar(require(...))` calls in this
// file. Each helper reuses an implementation already present on `this` when
// there is one, otherwise it defines its own.
// ---------------------------------------------------------------------------

// Expose `m[k]` on `o` under the name `k2` (defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Replace the descriptor with a live getter when there is no own
    // descriptor, when it is an accessor on a module not marked `__esModule`,
    // or when it is a data property that is writable or configurable.
    // Otherwise the existing descriptor is reused as-is.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    // Fallback without Object.create: plain value copy (not a live binding).
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Set `o.default` to `v` (as an enumerable property where defineProperty is used).
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Build a namespace-style object for a required module.
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: on first call it picks
    // Object.getOwnPropertyNames, or a for-in/hasOwnProperty fallback, and
    // stores the choice back into `ownKeys` for later calls.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        // Modules flagged `__esModule` are returned unchanged.
        if (mod && mod.__esModule) return mod;
        var result = {};
        // Bind every own property except "default", then set `default` to the
        // module object itself.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.UnstructuredAPIParser = void 0;
|
|
37
|
+
const DocumentParser_1 = require("./DocumentParser");
|
|
38
|
+
/** Partition strategies forwarded to the API; any other value falls back to "auto". */
const STRATEGY_MAP = {
    auto: "auto",
    fast: "fast",
    hi_res: "hi_res",
    ocr_only: "ocr_only",
};
/**
 * Document parser backed by the **Unstructured REST API** — either the
 * official hosted service or a self-hosted open-source API server.
 *
 * Uses the official `unstructured-client` npm package under the hood.
 *
 * **Peer dependency:** `unstructured-client`
 *
 * @example
 * ```typescript
 * // Hosted service
 * const parser = new UnstructuredAPIParser({ apiKey: process.env.UNSTRUCTURED_API_KEY });
 *
 * // Self-hosted (no auth required)
 * const parser = new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" });
 *
 * const doc = await parser.parse("/path/to/report.pdf", { strategy: "hi_res" });
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
class UnstructuredAPIParser extends DocumentParser_1.DocumentParser {
    /**
     * @param config - Optional `apiKey` and `serverUrl`; both may be omitted
     */
    constructor(config = {}) {
        super();
        this.config = config;
        this.name = "unstructured-api";
    }
    /**
     * Parse a file via the Unstructured API.
     *
     * @param filePath - Path to the document to parse (read from disk)
     * @param options - Strategy, languages, and any other partition parameters
     * @returns The combined text and the mapped elements
     * @throws Error if `unstructured-client` is not installed. Any other
     *   failure while loading the package, reading the file or calling the API
     *   is rethrown unchanged.
     */
    async parse(filePath, options) {
        // Held in a variable, as in the original, rather than written as a
        // literal inside require().
        const pkg = "unstructured-client";
        let UnstructuredClient;
        try {
            ({ UnstructuredClient } = require(pkg));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            const code = err !== null && typeof err === "object" ? err.code : undefined;
            // Only report "not installed" when this very package is missing. A
            // broken install or a failure inside the package should surface
            // as-is instead of being mislabelled.
            const isMissing = (code === "MODULE_NOT_FOUND" ||
                code === "ERR_MODULE_NOT_FOUND" ||
                message.includes("Cannot find module")) &&
                message.includes(`'${pkg}'`);
            if (isMissing) {
                throw new Error("UnstructuredAPIParser requires 'unstructured-client'. " +
                    "Install it with: npm install unstructured-client");
            }
            throw err;
        }
        if (typeof UnstructuredClient !== "function") {
            throw new Error("UnstructuredAPIParser: 'unstructured-client' does not export 'UnstructuredClient'");
        }
        const fs = require("fs");
        const path = require("path");
        const clientConfig = {};
        if (this.config.apiKey) {
            clientConfig["security"] = { apiKeyAuth: this.config.apiKey };
        }
        if (this.config.serverUrl) {
            clientConfig["serverURL"] = this.config.serverUrl;
        }
        const client = new UnstructuredClient(clientConfig);
        // Asynchronous read so large documents do not block the event loop.
        const fileContent = await fs.promises.readFile(filePath);
        const fileName = path.basename(filePath);
        const { strategy, languages, ...rest } = options ?? {};
        // Own-property check: a plain lookup would also match inherited keys
        // such as "constructor" and forward a function as the strategy.
        const requested = strategy ?? "auto";
        const resolvedStrategy = Object.prototype.hasOwnProperty.call(STRATEGY_MAP, requested)
            ? STRATEGY_MAP[requested]
            : "auto";
        const res = await client.general.partition({
            partitionParameters: {
                files: { content: fileContent, fileName },
                strategy: resolvedStrategy,
                ...(languages ? { languages } : {}),
                // Remaining options are forwarded verbatim and, being spread
                // last, can override the keys above.
                ...rest,
            },
        });
        // NOTE(review): some `unstructured-client` versions appear to resolve
        // to the element array itself rather than `{ elements }` — confirm
        // against the installed version. Both shapes are accepted here;
        // anything else yields no elements.
        const rawElements = Array.isArray(res)
            ? res
            : Array.isArray(res?.elements)
                ? res.elements
                : [];
        const elements = this.mapRawElements(rawElements);
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
    /**
     * Normalise raw API elements.
     *
     * @param raw - Elements as returned by the API
     * @returns Elements whose `type` falls back to `"unknown"` and `text` to
     *   `""` when not a string; `metadata` is kept only when it is a
     *   non-array object
     */
    mapRawElements(raw) {
        return raw.map((el) => {
            // Tolerate null/undefined entries instead of throwing on property access.
            const e = el ?? {};
            return {
                type: typeof e["type"] === "string" ? e["type"] : "unknown",
                text: typeof e["text"] === "string" ? e["text"] : "",
                metadata: e["metadata"] != null &&
                    typeof e["metadata"] === "object" &&
                    !Array.isArray(e["metadata"])
                    ? e["metadata"]
                    : undefined,
            };
        });
    }
}
|
|
130
|
+
exports.UnstructuredAPIParser = UnstructuredAPIParser;
|
|
131
|
+
//# sourceMappingURL=UnstructuredAPIParser.js.map
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { DocumentParser } from "./DocumentParser";
|
|
2
|
+
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
+
/**
 * Document parser that uses the **local** (open-source Python) version of
 * Unstructured via the `@epilogo/unstructured-io-node` npm bridge.
 *
 * The bridge spawns a Python virtual environment and calls the Python
 * `unstructured` library directly — no API key required, but Python 3.8+
 * and system dependencies (poppler, tesseract, etc.) must be available.
 *
 * **Peer dependency:** `@epilogo/unstructured-io-node`
 *
 * @example
 * ```typescript
 * import { UnstructuredLocalParser } from "@agentionai/agents/parsers";
 *
 * const parser = new UnstructuredLocalParser();
 * const doc = await parser.parse("/path/to/report.pdf", {
 *   strategy: "hi_res",
 *   languages: ["eng"],
 * });
 * console.log(doc.elements?.length, "elements");
 *
 * // Use with IngestionPipeline
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
export declare class UnstructuredLocalParser extends DocumentParser {
    readonly name = "unstructured-local";
    /**
     * Parse a file using the local Python Unstructured library.
     *
     * On first call, `ensureEnvironmentSetup()` is invoked to download the
     * Python venv if it does not already exist (one-time, slow operation).
     *
     * @param filePath - Path to the document to parse
     * @param options - Strategy, languages, and any other unstructured kwargs
     * @returns A promise resolving to the parsed document
     */
    parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
    /**
     * Internal helper.
     * NOTE(review): presumably converts the bridge's raw elements into parsed
     * elements, like the same-named helper on `UnstructuredAPIParser` —
     * confirm against the implementation.
     */
    private mapRawElements;
}
|
|
42
|
+
//# sourceMappingURL=UnstructuredLocalParser.d.ts.map
|