@agentionai/agents 0.12.0-beta → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/Agent.d.ts +9 -3
- package/dist/agents/Agent.js +4 -0
- package/dist/agents/AgentConfig.d.ts +12 -2
- package/dist/agents/model-types.d.ts +7 -1
- package/dist/agents/ollama/OllamaAgent.d.ts +69 -0
- package/dist/agents/ollama/OllamaAgent.js +304 -0
- package/dist/chunkers/index.d.ts +0 -1
- package/dist/chunkers/index.js +1 -3
- package/dist/history/transformers.d.ts +36 -0
- package/dist/history/transformers.js +78 -1
- package/dist/history/types.d.ts +8 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +4 -1
- package/dist/ingestion/IngestionPipeline.d.ts +1 -73
- package/dist/ingestion/IngestionPipeline.js +1 -110
- package/dist/ollama.d.ts +4 -0
- package/dist/ollama.js +24 -0
- package/dist/viz/types.d.ts +1 -1
- package/package.json +6 -42
- package/dist/chunkers/ElementChunker.d.ts +0 -100
- package/dist/chunkers/ElementChunker.js +0 -242
- package/dist/parsers/DocumentParser.d.ts +0 -36
- package/dist/parsers/DocumentParser.js +0 -35
- package/dist/parsers/LlamaIndexParser.d.ts +0 -58
- package/dist/parsers/LlamaIndexParser.js +0 -71
- package/dist/parsers/OllamaOCRParser.d.ts +0 -98
- package/dist/parsers/OllamaOCRParser.js +0 -203
- package/dist/parsers/UnstructuredAPIParser.d.ts +0 -57
- package/dist/parsers/UnstructuredAPIParser.js +0 -131
- package/dist/parsers/UnstructuredLocalParser.d.ts +0 -42
- package/dist/parsers/UnstructuredLocalParser.js +0 -118
- package/dist/parsers/index.d.ts +0 -3
- package/dist/parsers/index.js +0 -6
- package/dist/parsers/types.d.ts +0 -50
- package/dist/parsers/types.js +0 -3
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.ElementChunker = void 0;
|
|
4
|
-
const Chunker_1 = require("./Chunker");
|
|
5
|
-
/**
|
|
6
|
-
* Chunks a document by grouping its **structured elements** rather than
|
|
7
|
-
* splitting raw text. Designed for use with parsers that return element
|
|
8
|
-
* lists (e.g. {@link UnstructuredLocalParser}, {@link UnstructuredAPIParser}).
|
|
9
|
-
*
|
|
10
|
-
* **How it works:**
|
|
11
|
-
* 1. Adjacent elements are merged into a single chunk until the combined
|
|
12
|
-
* character count would exceed `chunkSize`.
|
|
13
|
-
* 2. A `breakOnTypes` element (default: `"Title"`) always starts a fresh
|
|
14
|
-
* chunk so that headings introduce their section's content.
|
|
15
|
-
* 3. A single element whose text exceeds `chunkSize` is split recursively
|
|
16
|
-
* using separator heuristics (paragraphs → sentences → words → characters).
|
|
17
|
-
* 4. Element types are stored in `chunk.metadata.element_types`; page number
|
|
18
|
-
* is stored in `chunk.metadata.page` when available.
|
|
19
|
-
*
|
|
20
|
-
* Use via {@link IngestionPipeline.ingestFile} — the pipeline automatically
|
|
21
|
-
* calls `chunkElements()` instead of `chunk()` when this chunker is used and
|
|
22
|
-
* the parser returns a structured element list.
|
|
23
|
-
*
|
|
24
|
-
* @example
|
|
25
|
-
* ```typescript
|
|
26
|
-
* import { ElementChunker } from '@agentionai/agents/chunkers';
|
|
27
|
-
* import { UnstructuredLocalParser } from '@agentionai/agents/parsers/unstructured-local';
|
|
28
|
-
*
|
|
29
|
-
* const pipeline = new IngestionPipeline(
|
|
30
|
-
* new ElementChunker({ chunkSize: 1000 }),
|
|
31
|
-
* embeddings,
|
|
32
|
-
* store,
|
|
33
|
-
* );
|
|
34
|
-
*
|
|
35
|
-
* await pipeline.ingestFile('/docs/report.pdf', new UnstructuredLocalParser(), {
|
|
36
|
-
* strategy: 'hi_res',
|
|
37
|
-
* });
|
|
38
|
-
* ```
|
|
39
|
-
*/
|
|
40
|
-
class ElementChunker extends Chunker_1.Chunker {
    /**
     * @param config - Chunker configuration. In addition to the options read
     *   through `this.config` (`chunkSize`, `chunkOverlap`, `chunkProcessor`),
     *   this class reads `excludeTypes` (element types to drop; default none)
     *   and `breakOnTypes` (element types that always start a new chunk;
     *   default `["Title"]`).
     */
    constructor(config) {
        super(config);
        this.name = "ElementChunker";
        this.excludeTypes = new Set(config.excludeTypes ?? []);
        this.breakOnTypes = new Set(config.breakOnTypes ?? ["Title"]);
    }
    // ─── element-aware primary path ──────────────────────────────────────────
    /**
     * Chunk a list of structured elements into chunk objects.
     *
     * Elements are accumulated into a working group until adding the next one
     * would push the summed (trimmed) text length past `chunkSize`, or until an
     * element whose type is in `breakOnTypes` is met. Elements whose own text is
     * longer than `chunkSize` are split via {@link splitLargeText} and each
     * piece becomes its own chunk.
     *
     * @param elements - Parsed elements (`type`, `text`, optional `metadata`);
     *   a missing list is treated as empty
     * @param options - Source tracking (`sourceId`, `sourcePath`) and custom
     *   `metadata` merged into every chunk
     * @returns The chunks, or the result of `applyProcessor` when a
     *   `chunkProcessor` is configured
     */
    async chunkElements(elements, options) {
        const { chunkSize } = this.config;
        const chunks = [];
        // Working group — elements accumulated into the next chunk
        let groupElements = [];
        let groupSize = 0;
        const flush = () => {
            if (groupElements.length === 0)
                return;
            const content = groupElements
                .map((el) => el.text)
                .filter(Boolean)
                .join("\n\n");
            if (content.trim()) {
                chunks.push(this.buildChunk(content, groupElements, chunks.length, options));
            }
            groupElements = [];
            groupSize = 0;
        };
        // `?? []` keeps a missing element list from throwing in the loop header.
        for (const el of elements ?? []) {
            if (this.excludeTypes.has(el.type))
                continue;
            const text = el.text?.trim() ?? "";
            if (!text)
                continue;
            // Break-on-type: flush current group before adding this element
            if (this.breakOnTypes.has(el.type) && groupElements.length > 0) {
                flush();
            }
            if (text.length > chunkSize) {
                // Flush current group first, then split the large element into sub-chunks
                flush();
                for (const subText of this.splitLargeText(text)) {
                    chunks.push(this.buildChunk(subText, [el], chunks.length, options));
                }
            }
            else if (groupSize + text.length > chunkSize && groupElements.length > 0) {
                // Adding this element would overflow — flush and start fresh
                flush();
                groupElements.push(el);
                groupSize = text.length;
            }
            else {
                groupElements.push(el);
                groupSize += text.length;
            }
        }
        flush();
        if (chunks.length === 0)
            return [];
        // `total` is only known once every chunk exists, so patch it in now.
        for (const chunk of chunks) {
            chunk.metadata.total = chunks.length;
        }
        // Inherited helper; presumably fills prev_id / next_id — confirm in Chunker.
        this.linkChunks(chunks);
        if (this.config.chunkProcessor) {
            return this.applyProcessor(chunks);
        }
        return chunks;
    }
    // ─── text fallback path (required by Chunker) ────────────────────────────
    /**
     * Plain-text splitting entry point; delegates to {@link splitLargeText}.
     *
     * @param text - Raw text to split
     * @returns Text pieces
     */
    splitText(text) {
        return this.splitLargeText(text);
    }
    // ─── helpers ─────────────────────────────────────────────────────────────
    /**
     * Build a chunk object from a group of elements.
     *
     * @param content - Final chunk text
     * @param sourceElements - Elements that contributed to `content`
     * @param index - Position of the chunk in the output list
     * @param options - Optional `sourceId`, `sourcePath` and extra `metadata`;
     *   `options.metadata` is spread last and so overrides the computed fields
     * @returns `{ id, content, metadata }`
     */
    buildChunk(content, sourceElements, index, options) {
        const elementTypes = [...new Set(sourceElements.map((el) => el.type))];
        // Use page_number from the first element that provides it
        const page = sourceElements
            .map((el) => el.metadata?.["page_number"])
            .find((p) => p != null);
        const metadata = {
            index,
            total: 0, // set after all chunks are built
            prev_id: null,
            next_id: null,
            start: 0,
            end: content.length,
            source_id: options?.sourceId,
            source_path: options?.sourcePath,
            char_count: content.length,
            hash: this.computeHash(content),
            section: this.detectSectionTitle(content),
            page,
            element_types: elementTypes,
            ...options?.metadata,
        };
        return {
            id: this.generateId(content, index, options?.sourceId),
            content,
            metadata,
        };
    }
    /**
     * Split a single large text using separator heuristics.
     *
     * Tries `"\n\n"`, `"\n"`, `". "` and `" "` in that order and uses the first
     * separator that yields more than one non-blank part; falls back to
     * {@link forceSplit} when none does.
     *
     * @param text - Text to split
     * @returns `[text]` when it already fits in `chunkSize`, otherwise the pieces
     */
    splitLargeText(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        if (text.length <= chunkSize)
            return [text];
        const separators = ["\n\n", "\n", ". ", " "];
        for (const sep of separators) {
            const parts = text.split(sep).filter((s) => s.trim());
            if (parts.length <= 1)
                continue;
            const merged = this.mergeToSize(parts, sep, chunkSize);
            // Apply overlap if configured
            if (chunkOverlap > 0 && merged.length > 1) {
                return this.applyCharOverlap(merged, chunkOverlap);
            }
            return merged;
        }
        return this.forceSplit(text, chunkSize, chunkOverlap);
    }
    /**
     * Greedily merge string parts into windows of at most `maxSize` characters.
     *
     * @param parts - Pieces produced by splitting on `sep`
     * @param sep - Separator re-inserted between merged parts
     * @param maxSize - Upper bound on each window's length
     * @returns Merged windows; a single part longer than `maxSize` is hard-split
     */
    mergeToSize(parts, sep, maxSize) {
        const result = [];
        let current = "";
        for (const part of parts) {
            const addition = current ? sep + part : part;
            if (current.length + addition.length <= maxSize) {
                current = current + addition;
            }
            else {
                if (current)
                    result.push(current);
                if (part.length > maxSize) {
                    result.push(...this.forceSplit(part, maxSize, 0));
                    current = "";
                }
                else {
                    current = part;
                }
            }
        }
        if (current)
            result.push(current);
        return result;
    }
    /**
     * Hard character-count split when no separator works.
     *
     * The window always advances by at least one character, so the loop
     * terminates even when `overlap >= size`. A negative or non-finite overlap
     * is treated as 0 so that no characters are skipped.
     *
     * @param text - Text to split
     * @param size - Maximum characters per piece
     * @param overlap - Characters shared between consecutive pieces
     * @returns Non-blank slices; the text itself (if non-blank) when `size`
     *   is not a positive number
     */
    forceSplit(text, size, overlap) {
        // A non-positive window can never make progress; return the text unsplit.
        if (!(size > 0)) {
            return text.trim() ? [text] : [];
        }
        const boundedOverlap = Number.isFinite(overlap) ? Math.max(0, overlap) : 0;
        // Clamp to >= 1: `size - overlap` would be <= 0 for overlap >= size and
        // the loop below would then never advance.
        const step = Math.max(1, size - boundedOverlap);
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            const end = Math.min(start + size, text.length);
            const slice = text.slice(start, end);
            if (slice.trim())
                chunks.push(slice);
            if (end >= text.length)
                break;
            start += step;
        }
        return chunks;
    }
    /**
     * Apply character-level overlap between already-split strings.
     *
     * Every piece after the first is prefixed with the last `overlap`
     * characters of the preceding original piece (or the whole piece when it is
     * shorter) followed by a single space.
     *
     * @param chunks - Pieces in order
     * @param overlap - Number of trailing characters to carry over
     * @returns Pieces with overlap prefixes; the input itself when it has at
     *   most one entry
     */
    applyCharOverlap(chunks, overlap) {
        if (chunks.length <= 1)
            return chunks;
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prev = chunks[i - 1];
            const tail = prev.length > overlap ? prev.slice(prev.length - overlap) : prev;
            result.push(tail + " " + chunks[i]);
        }
        return result;
    }
}
|
|
241
|
-
exports.ElementChunker = ElementChunker;
|
|
242
|
-
//# sourceMappingURL=ElementChunker.js.map
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import { ParsedDocument, ParsedElement, ParseOptions } from "./types";
|
|
2
|
-
/**
|
|
3
|
-
* Abstract base class for document parsers.
|
|
4
|
-
*
|
|
5
|
-
* Implementations wrap third-party libraries (Unstructured, LlamaIndex, etc.)
|
|
6
|
-
* and normalise their output into a {@link ParsedDocument} that can be fed
|
|
7
|
-
* directly into an {@link IngestionPipeline}.
|
|
8
|
-
*
|
|
9
|
-
* All peer dependencies are loaded lazily via dynamic import so that
|
|
10
|
-
* users only need to install what they actually use.
|
|
11
|
-
*
|
|
12
|
-
* @example
|
|
13
|
-
* ```typescript
|
|
14
|
-
* const parser = new UnstructuredLocalParser();
|
|
15
|
-
* const doc = await parser.parse("/path/to/report.pdf");
|
|
16
|
-
* console.log(doc.elements?.length, "elements parsed");
|
|
17
|
-
* await pipeline.ingestFile("/path/to/report.pdf", parser);
|
|
18
|
-
* ```
|
|
19
|
-
*/
|
|
20
|
-
export declare abstract class DocumentParser {
    /** Short identifier for the parser implementation, e.g. "unstructured-local". */
    abstract readonly name: string;
    /**
     * Read the document at `filePath` and resolve to its parsed content.
     *
     * @param filePath - Location of the file; absolute or relative
     * @param options - Parsing hints such as strategy or languages (optional)
     * @returns Promise of the parsed document
     */
    abstract parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
    /**
     * Flatten a list of elements into one plain-text string.
     * Empty texts are skipped; the remaining texts are separated by a blank line.
     *
     * @param elements - Elements whose text should be concatenated
     */
    protected elementsToText(elements: ParsedElement[]): string;
}
|
|
36
|
-
//# sourceMappingURL=DocumentParser.d.ts.map
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.DocumentParser = void 0;
|
|
4
|
-
/**
|
|
5
|
-
* Abstract base class for document parsers.
|
|
6
|
-
*
|
|
7
|
-
* Implementations wrap third-party libraries (Unstructured, LlamaIndex, etc.)
|
|
8
|
-
* and normalise their output into a {@link ParsedDocument} that can be fed
|
|
9
|
-
* directly into an {@link IngestionPipeline}.
|
|
10
|
-
*
|
|
11
|
-
* All peer dependencies are loaded lazily via dynamic import so that
|
|
12
|
-
* users only need to install what they actually use.
|
|
13
|
-
*
|
|
14
|
-
* @example
|
|
15
|
-
* ```typescript
|
|
16
|
-
* const parser = new UnstructuredLocalParser();
|
|
17
|
-
* const doc = await parser.parse("/path/to/report.pdf");
|
|
18
|
-
* console.log(doc.elements?.length, "elements parsed");
|
|
19
|
-
* await pipeline.ingestFile("/path/to/report.pdf", parser);
|
|
20
|
-
* ```
|
|
21
|
-
*/
|
|
22
|
-
class DocumentParser {
    /**
     * Concatenate the `text` of every element into one plain-text string.
     * Elements whose text is falsy (e.g. an empty string) are dropped; the
     * survivors are joined with a blank line (`"\n\n"`).
     *
     * @param elements - Array of objects carrying a `text` property
     * @returns The joined text, or `""` when nothing survives the filter
     */
    elementsToText(elements) {
        const kept = elements.reduce((pieces, element) => {
            const value = element.text;
            if (value) {
                pieces.push(value);
            }
            return pieces;
        }, []);
        return kept.join("\n\n");
    }
}
|
|
34
|
-
exports.DocumentParser = DocumentParser;
|
|
35
|
-
//# sourceMappingURL=DocumentParser.js.map
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import { DocumentParser } from "./DocumentParser";
|
|
2
|
-
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
-
/**
|
|
4
|
-
* A LlamaIndex reader instance.
|
|
5
|
-
* Matches the `BaseReader` interface from `llamaindex` and `@llamaindex/readers`.
|
|
6
|
-
*/
|
|
7
|
-
export interface LlamaIndexReader {
    /**
     * Load the file at `filePath` and resolve to the documents read from it.
     * Each document carries its `text` and, optionally, a `metadata` record.
     * Extra positional arguments are accepted and left untyped.
     */
    loadData(
        filePath: string,
        ...args: unknown[]
    ): Promise<{ text: string; metadata?: Record<string, unknown> }[]>;
}
|
|
13
|
-
/**
|
|
14
|
-
* Document parser that delegates to any **LlamaIndex reader**.
|
|
15
|
-
*
|
|
16
|
-
* Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
|
|
17
|
-
* `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
|
|
18
|
-
* this class normalises the output into a {@link ParsedDocument}.
|
|
19
|
-
*
|
|
20
|
-
* **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
|
|
21
|
-
*
|
|
22
|
-
* @example
|
|
23
|
-
* ```typescript
|
|
24
|
-
* import { PDFReader } from "@llamaindex/readers/pdf";
|
|
25
|
-
* import { LlamaIndexParser } from "@agentionai/agents/parsers";
|
|
26
|
-
*
|
|
27
|
-
* const parser = new LlamaIndexParser(new PDFReader());
|
|
28
|
-
* const doc = await parser.parse("/path/to/report.pdf");
|
|
29
|
-
* await pipeline.ingestFile("/path/to/report.pdf", parser);
|
|
30
|
-
* ```
|
|
31
|
-
*
|
|
32
|
-
* @example Using LlamaParse (cloud OCR / layout AI)
|
|
33
|
-
* ```typescript
|
|
34
|
-
* import { LlamaParseReader } from "llamaindex";
|
|
35
|
-
*
|
|
36
|
-
* const parser = new LlamaIndexParser(
|
|
37
|
-
* new LlamaParseReader({ resultType: "markdown" })
|
|
38
|
-
* );
|
|
39
|
-
* ```
|
|
40
|
-
*/
|
|
41
|
-
export declare class LlamaIndexParser extends DocumentParser {
    private readonly reader;
    /** Parser identifier; built from the reader label supplied at construction. */
    readonly name: string;
    /**
     * @param reader - The LlamaIndex reader that will load files
     * @param readerName - Label to use in {@link name}; when omitted the
     *   reader's constructor name is used
     */
    constructor(reader: LlamaIndexReader, readerName?: string);
    /**
     * Load a file through the configured reader and return it as a parsed document.
     *
     * @param filePath - Path to the document file
     * @param _options - Not used by this parser; present for interface compatibility
     */
    parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
}
|
|
58
|
-
//# sourceMappingURL=LlamaIndexParser.d.ts.map
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.LlamaIndexParser = void 0;
|
|
4
|
-
const DocumentParser_1 = require("./DocumentParser");
|
|
5
|
-
/**
|
|
6
|
-
* Document parser that delegates to any **LlamaIndex reader**.
|
|
7
|
-
*
|
|
8
|
-
* Pass any reader from `llamaindex` or `@llamaindex/readers` — e.g.
|
|
9
|
-
* `PDFReader`, `DocxReader`, `HTMLReader`, `LlamaParseReader`, etc. — and
|
|
10
|
-
* this class normalises the output into a {@link ParsedDocument}.
|
|
11
|
-
*
|
|
12
|
-
* **Peer dependency:** `llamaindex` and/or `@llamaindex/readers`
|
|
13
|
-
*
|
|
14
|
-
* @example
|
|
15
|
-
* ```typescript
|
|
16
|
-
* import { PDFReader } from "@llamaindex/readers/pdf";
|
|
17
|
-
* import { LlamaIndexParser } from "@agentionai/agents/parsers";
|
|
18
|
-
*
|
|
19
|
-
* const parser = new LlamaIndexParser(new PDFReader());
|
|
20
|
-
* const doc = await parser.parse("/path/to/report.pdf");
|
|
21
|
-
* await pipeline.ingestFile("/path/to/report.pdf", parser);
|
|
22
|
-
* ```
|
|
23
|
-
*
|
|
24
|
-
* @example Using LlamaParse (cloud OCR / layout AI)
|
|
25
|
-
* ```typescript
|
|
26
|
-
* import { LlamaParseReader } from "llamaindex";
|
|
27
|
-
*
|
|
28
|
-
* const parser = new LlamaIndexParser(
|
|
29
|
-
* new LlamaParseReader({ resultType: "markdown" })
|
|
30
|
-
* );
|
|
31
|
-
* ```
|
|
32
|
-
*/
|
|
33
|
-
class LlamaIndexParser extends DocumentParser_1.DocumentParser {
    /**
     * @param reader - Any LlamaIndex reader instance (its `loadData` is called by {@link parse})
     * @param readerName - Optional label used in {@link name}; defaults to the
     *   reader's constructor name, or `"reader"` when that is unavailable
     */
    constructor(reader, readerName) {
        super();
        this.reader = reader;
        this.name = `llamaindex:${readerName ?? reader.constructor?.name ?? "reader"}`;
    }
    /**
     * Parse a file using the configured LlamaIndex reader.
     *
     * Every document returned by the reader becomes one element of type
     * `"Document"` whose metadata is the document's metadata plus a
     * `doc_index` giving its position in the reader's output.
     *
     * @param filePath - Path to the document file
     * @param _options - Currently unused; kept for interface compatibility
     * @returns `{ text, elements }` where `text` is the elements joined by
     *   `elementsToText`
     * @throws Error when the reader fails; the message names the parser and
     *   file, and the original failure is available as `error.cause`
     */
    async parse(filePath, _options) {
        let docs;
        try {
            docs = await this.reader.loadData(filePath);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            const wrapped = new Error(`LlamaIndexParser (${this.name}) failed to load "${filePath}": ${msg}`);
            // Keep the underlying error (and its stack) reachable instead of
            // reducing it to a message string. Defined as a non-enumerable
            // property so it behaves like the standard `cause` field.
            Object.defineProperty(wrapped, "cause", {
                value: err,
                writable: true,
                configurable: true,
                enumerable: false,
            });
            throw wrapped;
        }
        const elements = docs.map((doc, i) => ({
            type: "Document",
            text: doc.text ?? "",
            metadata: { ...doc.metadata, doc_index: i },
        }));
        return {
            text: this.elementsToText(elements),
            elements,
        };
    }
}
|
|
70
|
-
exports.LlamaIndexParser = LlamaIndexParser;
|
|
71
|
-
//# sourceMappingURL=LlamaIndexParser.js.map
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import { DocumentParser } from "./DocumentParser";
|
|
2
|
-
import { ParsedDocument, ParseOptions } from "./types";
|
|
3
|
-
/**
|
|
4
|
-
* Configuration for {@link OllamaOCRParser}.
|
|
5
|
-
*/
|
|
6
|
-
export interface OllamaOCRParserConfig {
    /**
     * Name of the Ollama model that performs the OCR.
     * @default "glm-ocr"
     */
    model?: string;
    /**
     * Address of the local Ollama server.
     * @default "http://localhost:11434"
     */
    baseUrl?: string;
    /**
     * Instruction text sent together with every image.
     * @default "Extract and transcribe all text from this image. Preserve the original structure, headings, and formatting as much as possible. Output only the extracted text."
     */
    prompt?: string;
    /**
     * Render scale applied when turning PDF pages into images
     * (1.0 = 72 DPI, 2.0 = 144 DPI). Smaller values are faster; larger
     * values improve OCR accuracy.
     * @default 2.0
     */
    pdfScale?: number;
    /**
     * How many pages are OCR'd at the same time. Raising it speeds things up
     * at the cost of more memory and GPU use.
     * @default 3
     */
    concurrency?: number;
    /**
     * Progress callback fired after each PDF page has been OCR'd.
     * When `concurrency` is above 1, pages can finish out of order; the
     * resulting document is nevertheless assembled in page order.
     */
    onProgress?: (completed: number, total: number) => void;
}
|
|
41
|
-
/**
|
|
42
|
-
* Document parser that uses a locally-running **Ollama** vision model (e.g. `glm-ocr`)
|
|
43
|
-
* to perform OCR on image files and PDF documents.
|
|
44
|
-
*
|
|
45
|
-
* **Supported file types:**
|
|
46
|
-
* - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp` — no extra dependencies
|
|
47
|
-
* - PDF: requires the optional peer dependency `pdf-to-img` (`npm install pdf-to-img`)
|
|
48
|
-
*
|
|
49
|
-
* **Ollama must be running** with the model pulled:
|
|
50
|
-
* ```bash
|
|
51
|
-
* ollama pull glm-ocr
|
|
52
|
-
* ollama serve # if not already running
|
|
53
|
-
* ```
|
|
54
|
-
*
|
|
55
|
-
* @example
|
|
56
|
-
* ```typescript
|
|
57
|
-
* import { OllamaOCRParser } from "@agentionai/agents/parsers/ollama-ocr";
|
|
58
|
-
*
|
|
59
|
-
* const parser = new OllamaOCRParser({
|
|
60
|
-
* model: "glm-ocr",
|
|
61
|
-
* pdfScale: 1.5,
|
|
62
|
-
* onProgress: (page, total) => console.log(`OCR page ${page}/${total}...`),
|
|
63
|
-
* });
|
|
64
|
-
*
|
|
65
|
-
* // Parse an image
|
|
66
|
-
* const doc = await parser.parse("/path/to/scan.png");
|
|
67
|
-
*
|
|
68
|
-
* // Parse a PDF (requires: npm install pdf-to-img)
|
|
69
|
-
* const pdf = await parser.parse("/path/to/report.pdf");
|
|
70
|
-
*
|
|
71
|
-
* // Use with IngestionPipeline
|
|
72
|
-
* await pipeline.ingestFile("/path/to/scan.png", parser);
|
|
73
|
-
* ```
|
|
74
|
-
*/
|
|
75
|
-
export declare class OllamaOCRParser extends DocumentParser {
    /** Fixed parser identifier. */
    readonly name = "ollama-ocr";
    private readonly model;
    private readonly baseUrl;
    private readonly prompt;
    private readonly pdfScale;
    private readonly concurrency;
    private readonly onProgress?;
    /**
     * @param config - Optional settings; see {@link OllamaOCRParserConfig}
     */
    constructor(config?: OllamaOCRParserConfig);
    /**
     * Run Ollama OCR over a document file and return the parsed result.
     *
     * @param filePath - Path to the image or PDF file
     * @param _options - Hints that this parser ignores; accepted for interface compatibility
     */
    parse(filePath: string, _options?: ParseOptions): Promise<ParsedDocument>;
    private parseImageFile;
    private parsePdf;
    /**
     * Submits a base64-encoded image to the Ollama chat API and yields the text it extracted.
     */
    private runOCR;
}
|
|
98
|
-
//# sourceMappingURL=OllamaOCRParser.d.ts.map
|