@agentionai/agents 0.11.0 → 0.12.0-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunkers/ElementChunker.d.ts +100 -0
- package/dist/chunkers/ElementChunker.js +242 -0
- package/dist/chunkers/index.d.ts +1 -0
- package/dist/chunkers/index.js +3 -1
- package/dist/ingestion/IngestionPipeline.d.ts +73 -1
- package/dist/ingestion/IngestionPipeline.js +110 -1
- package/dist/parsers/DocumentParser.d.ts +36 -0
- package/dist/parsers/DocumentParser.js +35 -0
- package/dist/parsers/LlamaIndexParser.d.ts +58 -0
- package/dist/parsers/LlamaIndexParser.js +71 -0
- package/dist/parsers/OllamaOCRParser.d.ts +98 -0
- package/dist/parsers/OllamaOCRParser.js +203 -0
- package/dist/parsers/UnstructuredAPIParser.d.ts +57 -0
- package/dist/parsers/UnstructuredAPIParser.js +131 -0
- package/dist/parsers/UnstructuredLocalParser.d.ts +42 -0
- package/dist/parsers/UnstructuredLocalParser.js +118 -0
- package/dist/parsers/index.d.ts +3 -0
- package/dist/parsers/index.js +6 -0
- package/dist/parsers/types.d.ts +50 -0
- package/dist/parsers/types.js +3 -0
- package/package.json +46 -2
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { Chunker } from "./Chunker";
|
|
2
|
+
import { Chunk, ChunkerConfig, ChunkOptions } from "./types";
|
|
3
|
+
import { ParsedElement } from "../parsers/types";
|
|
4
|
+
/**
|
|
5
|
+
* Configuration for {@link ElementChunker}.
|
|
6
|
+
*/
|
|
7
|
+
export interface ElementChunkerConfig extends ChunkerConfig {
|
|
8
|
+
/**
|
|
9
|
+
* Element types to skip entirely.
|
|
10
|
+
* Useful for dropping decorative or non-content elements.
|
|
11
|
+
* @example ["Image", "PageBreak", "Header", "Footer"]
|
|
12
|
+
*/
|
|
13
|
+
excludeTypes?: string[];
|
|
14
|
+
/**
|
|
15
|
+
* Element types that always start a new chunk, even if there is room
|
|
16
|
+
* in the current one. Use this to keep headings at the top of their
|
|
17
|
+
* section's chunk.
|
|
18
|
+
* @default ["Title"]
|
|
19
|
+
*/
|
|
20
|
+
breakOnTypes?: string[];
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Chunks a document by grouping its **structured elements** rather than
|
|
24
|
+
* splitting raw text. Designed for use with parsers that return element
|
|
25
|
+
* lists (e.g. {@link UnstructuredLocalParser}, {@link UnstructuredAPIParser}).
|
|
26
|
+
*
|
|
27
|
+
* **How it works:**
|
|
28
|
+
* 1. Adjacent elements are merged into a single chunk until the combined
|
|
29
|
+
* character count would exceed `chunkSize`.
|
|
30
|
+
* 2. A `breakOnTypes` element (default: `"Title"`) always starts a fresh
|
|
31
|
+
* chunk so that headings introduce their section's content.
|
|
32
|
+
* 3. A single element whose text exceeds `chunkSize` is split recursively
|
|
33
|
+
* using separator heuristics (paragraphs → sentences → words → characters).
|
|
34
|
+
* 4. Element types are stored in `chunk.metadata.element_types`; page number
|
|
35
|
+
* is stored in `chunk.metadata.page` when available.
|
|
36
|
+
*
|
|
37
|
+
* Use via {@link IngestionPipeline.ingestFile} — the pipeline automatically
|
|
38
|
+
* calls `chunkElements()` instead of `chunk()` when this chunker is used and
|
|
39
|
+
* the parser returns a structured element list.
|
|
40
|
+
*
|
|
41
|
+
* @example
|
|
42
|
+
* ```typescript
|
|
43
|
+
* import { ElementChunker } from '@agentionai/agents/chunkers';
|
|
44
|
+
* import { UnstructuredLocalParser } from '@agentionai/agents/parsers/unstructured-local';
|
|
45
|
+
*
|
|
46
|
+
* const pipeline = new IngestionPipeline(
|
|
47
|
+
* new ElementChunker({ chunkSize: 1000 }),
|
|
48
|
+
* embeddings,
|
|
49
|
+
* store,
|
|
50
|
+
* );
|
|
51
|
+
*
|
|
52
|
+
* await pipeline.ingestFile('/docs/report.pdf', new UnstructuredLocalParser(), {
|
|
53
|
+
* strategy: 'hi_res',
|
|
54
|
+
* });
|
|
55
|
+
* ```
|
|
56
|
+
*/
|
|
57
|
+
export declare class ElementChunker extends Chunker {
|
|
58
|
+
readonly name = "ElementChunker";
|
|
59
|
+
private readonly excludeTypes;
|
|
60
|
+
private readonly breakOnTypes;
|
|
61
|
+
constructor(config: ElementChunkerConfig);
|
|
62
|
+
/**
|
|
63
|
+
* Chunk a list of structured elements into {@link Chunk} objects.
|
|
64
|
+
*
|
|
65
|
+
* This is the primary entry point when using this chunker with a parser.
|
|
66
|
+
* Called automatically by {@link IngestionPipeline.ingestFile} when
|
|
67
|
+
* the parsed document has an `elements` array.
|
|
68
|
+
*
|
|
69
|
+
* @param elements - Parsed elements from a {@link DocumentParser}
|
|
70
|
+
* @param options - Source tracking and custom metadata
|
|
71
|
+
*/
|
|
72
|
+
chunkElements(elements: ParsedElement[], options?: ChunkOptions): Promise<Chunk[]>;
|
|
73
|
+
/**
|
|
74
|
+
* Fallback text splitting used when {@link Chunker.chunk} is called directly
|
|
75
|
+
* (i.e. without a structured element list). Splits on double newlines first,
|
|
76
|
+
* then single newlines, sentences (". "), words, and finally a hard character split.
|
|
77
|
+
*/
|
|
78
|
+
protected splitText(text: string): string[];
|
|
79
|
+
/**
|
|
80
|
+
* Build a {@link Chunk} from a group of elements.
|
|
81
|
+
*/
|
|
82
|
+
private buildChunk;
|
|
83
|
+
/**
|
|
84
|
+
* Split a single large element text using separator heuristics.
|
|
85
|
+
*/
|
|
86
|
+
private splitLargeText;
|
|
87
|
+
/**
|
|
88
|
+
* Greedily merge string parts into windows of at most `maxSize` characters.
|
|
89
|
+
*/
|
|
90
|
+
private mergeToSize;
|
|
91
|
+
/**
|
|
92
|
+
* Hard character-count split when no separator works.
|
|
93
|
+
*/
|
|
94
|
+
private forceSplit;
|
|
95
|
+
/**
|
|
96
|
+
* Apply character-level overlap between already-split strings.
|
|
97
|
+
*/
|
|
98
|
+
private applyCharOverlap;
|
|
99
|
+
}
|
|
100
|
+
//# sourceMappingURL=ElementChunker.d.ts.map
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ElementChunker = void 0;
|
|
4
|
+
const Chunker_1 = require("./Chunker");
|
|
5
|
+
/**
 * Chunks a document by grouping its **structured elements** rather than
 * splitting raw text. Designed for use with parsers that return element
 * lists (e.g. {@link UnstructuredLocalParser}, {@link UnstructuredAPIParser}).
 *
 * **How it works:**
 * 1. Adjacent elements are merged into a single chunk until the summed
 *    (trimmed) element text length would exceed `chunkSize`.
 * 2. A `breakOnTypes` element (default: `"Title"`) always starts a fresh
 *    chunk so that headings introduce their section's content.
 * 3. A single element whose text exceeds `chunkSize` is split recursively
 *    using separator heuristics (paragraphs → lines → sentences → words →
 *    characters).
 * 4. Element types are stored in `chunk.metadata.element_types`; page number
 *    is stored in `chunk.metadata.page` when available.
 *
 * Use via {@link IngestionPipeline.ingestFile} — the pipeline automatically
 * calls `chunkElements()` instead of `chunk()` when this chunker is used and
 * the parser returns a structured element list.
 *
 * @example
 * ```typescript
 * import { ElementChunker } from '@agentionai/agents/chunkers';
 * import { UnstructuredLocalParser } from '@agentionai/agents/parsers/unstructured-local';
 *
 * const pipeline = new IngestionPipeline(
 *   new ElementChunker({ chunkSize: 1000 }),
 *   embeddings,
 *   store,
 * );
 *
 * await pipeline.ingestFile('/docs/report.pdf', new UnstructuredLocalParser(), {
 *   strategy: 'hi_res',
 * });
 * ```
 */
class ElementChunker extends Chunker_1.Chunker {
    /**
     * @param config - Base chunker configuration plus the optional
     *   `excludeTypes` (element types to drop; defaults to none) and
     *   `breakOnTypes` (element types that always open a new chunk;
     *   defaults to `["Title"]`).
     */
    constructor(config) {
        super(config);
        this.name = "ElementChunker";
        // Sets give O(1) membership checks inside the per-element loop.
        this.excludeTypes = new Set(config.excludeTypes ?? []);
        this.breakOnTypes = new Set(config.breakOnTypes ?? ["Title"]);
    }
    // ─── element-aware primary path ──────────────────────────────────────────
    /**
     * Chunk a list of structured elements into {@link Chunk} objects.
     *
     * This is the primary entry point when using this chunker with a parser.
     * Called automatically by {@link IngestionPipeline.ingestFile} when
     * the parsed document has an `elements` array.
     *
     * Elements whose type is excluded, or whose text is empty after trimming,
     * are skipped. The group size is the sum of trimmed element text lengths;
     * the `"\n\n"` joiners are not counted, so a grouped chunk's content can
     * be slightly longer than `chunkSize`.
     *
     * @param elements - Parsed elements from a {@link DocumentParser}
     * @param options - Source tracking and custom metadata
     * @returns The chunks in document order; an empty array when no element
     *   contributes text. When `config.chunkProcessor` is set, the result of
     *   `applyProcessor` is returned instead.
     */
    async chunkElements(elements, options) {
        const { chunkSize } = this.config;
        const chunks = [];
        // Working group — elements accumulated into the next chunk
        let groupElements = [];
        let groupSize = 0;
        // Emit the working group as one chunk (if it has content) and reset it.
        const flush = () => {
            if (groupElements.length === 0)
                return;
            const content = groupElements
                .map((el) => el.text)
                .filter(Boolean)
                .join("\n\n");
            if (content.trim()) {
                chunks.push(this.buildChunk(content, groupElements, chunks.length, options));
            }
            groupElements = [];
            groupSize = 0;
        };
        for (const el of elements) {
            if (this.excludeTypes.has(el.type))
                continue;
            const text = el.text?.trim() ?? "";
            if (!text)
                continue;
            // Break-on-type: flush current group before adding this element
            if (this.breakOnTypes.has(el.type) && groupElements.length > 0) {
                flush();
            }
            if (text.length > chunkSize) {
                // Flush current group first, then split the large element into sub-chunks.
                // Every sub-chunk is attributed to the same source element.
                flush();
                for (const subText of this.splitLargeText(text)) {
                    chunks.push(this.buildChunk(subText, [el], chunks.length, options));
                }
            }
            else if (groupSize + text.length > chunkSize && groupElements.length > 0) {
                // Adding this element would overflow — flush and start fresh
                flush();
                groupElements.push(el);
                groupSize = text.length;
            }
            else {
                groupElements.push(el);
                groupSize += text.length;
            }
        }
        flush();
        if (chunks.length === 0)
            return [];
        // `total` is only known once every chunk exists, so patch it in now.
        for (const chunk of chunks) {
            chunk.metadata.total = chunks.length;
        }
        this.linkChunks(chunks);
        if (this.config.chunkProcessor) {
            return this.applyProcessor(chunks);
        }
        return chunks;
    }
    // ─── text fallback path (required by Chunker) ────────────────────────────
    /**
     * Fallback text splitting used when {@link Chunker.chunk} is called directly
     * (i.e. without a structured element list). Delegates to
     * {@link splitLargeText}.
     *
     * @param text - Raw text to split
     * @returns The text pieces, or `[text]` when it already fits `chunkSize`
     */
    splitText(text) {
        return this.splitLargeText(text);
    }
    // ─── helpers ─────────────────────────────────────────────────────────────
    /**
     * Build a {@link Chunk} from a group of elements.
     *
     * @param content - Final chunk text
     * @param sourceElements - Elements the content was derived from
     * @param index - Position of the chunk in the output list
     * @param options - Source tracking and custom metadata; entries in
     *   `options.metadata` are spread last and therefore override the
     *   computed fields of the same name
     */
    buildChunk(content, sourceElements, index, options) {
        const elementTypes = [...new Set(sourceElements.map((el) => el.type))];
        // Use page_number from the first element that provides it
        const page = sourceElements
            .map((el) => el.metadata?.["page_number"])
            .find((p) => p != null);
        const metadata = {
            index,
            total: 0, // set after all chunks are built
            prev_id: null,
            next_id: null,
            start: 0,
            end: content.length,
            source_id: options?.sourceId,
            source_path: options?.sourcePath,
            char_count: content.length,
            hash: this.computeHash(content),
            section: this.detectSectionTitle(content),
            page,
            element_types: elementTypes,
            ...options?.metadata,
        };
        return {
            id: this.generateId(content, index, options?.sourceId),
            content,
            metadata,
        };
    }
    /**
     * Split a single large element text using separator heuristics.
     *
     * Separators are tried from coarsest to finest: blank line, newline,
     * `". "`, space. The first one that yields more than one non-blank part
     * is used and the parts are greedily merged up to `chunkSize`. A part that
     * is still too large is split again with the remaining finer separators,
     * and only hard-split by character count as a last resort.
     *
     * When `chunkOverlap > 0`, the tail of each previous piece is prepended to
     * the next one, so pieces may exceed `chunkSize` by up to
     * `chunkOverlap + 1` characters.
     *
     * @param text - Text to split
     * @returns `[text]` when it already fits, otherwise the split pieces
     */
    splitLargeText(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        if (text.length <= chunkSize)
            return [text];
        const separators = ["\n\n", "\n", ". ", " "];
        for (let i = 0; i < separators.length; i++) {
            const sep = separators[i];
            const parts = text.split(sep).filter((s) => s.trim());
            if (parts.length <= 1)
                continue;
            const merged = this.mergeToSize(parts, sep, chunkSize, separators.slice(i + 1));
            // Apply overlap if configured
            if (chunkOverlap > 0 && merged.length > 1) {
                return this.applyCharOverlap(merged, chunkOverlap);
            }
            return merged;
        }
        return this.forceSplit(text, chunkSize, chunkOverlap);
    }
    /**
     * Greedily merge string parts into windows of at most `maxSize` characters.
     *
     * @param parts - Non-blank parts produced by splitting on `sep`
     * @param sep - Separator re-inserted between parts of the same window
     * @param maxSize - Maximum window length in characters
     * @param finerSeparators - Separators to try, in order, on a part that is
     *   itself longer than `maxSize`; when empty such a part is hard-split
     */
    mergeToSize(parts, sep, maxSize, finerSeparators = []) {
        const result = [];
        let current = "";
        for (const part of parts) {
            const addition = current ? sep + part : part;
            if (current.length + addition.length <= maxSize) {
                current = current + addition;
                continue;
            }
            if (current)
                result.push(current);
            if (part.length > maxSize) {
                // Prefer finer natural boundaries over cutting mid-word.
                result.push(...this.splitOversizedPart(part, maxSize, finerSeparators));
                current = "";
            }
            else {
                current = part;
            }
        }
        if (current)
            result.push(current);
        return result;
    }
    /**
     * Split one part that exceeds `maxSize` using the first separator in
     * `separators` that yields more than one non-blank piece; falls back to a
     * hard character split (no overlap) when none does.
     */
    splitOversizedPart(part, maxSize, separators) {
        for (let i = 0; i < separators.length; i++) {
            const sep = separators[i];
            const pieces = part.split(sep).filter((s) => s.trim());
            if (pieces.length <= 1)
                continue;
            // The separator list shrinks on every level, so the recursion terminates.
            return this.mergeToSize(pieces, sep, maxSize, separators.slice(i + 1));
        }
        return this.forceSplit(part, maxSize, 0);
    }
    /**
     * Hard character-count split when no separator works.
     *
     * @param text - Text to cut into windows
     * @param size - Window length in characters
     * @param overlap - Characters shared between consecutive windows
     * @returns The non-blank windows in order
     */
    forceSplit(text, size, overlap) {
        const chunks = [];
        // With overlap >= size the stride would be zero or negative and the
        // loop below would never advance; always move forward by at least one.
        const step = Math.max(1, size - overlap);
        let start = 0;
        while (start < text.length) {
            const end = Math.min(start + size, text.length);
            const slice = text.slice(start, end);
            if (slice.trim())
                chunks.push(slice);
            if (end >= text.length)
                break;
            start += step;
        }
        return chunks;
    }
    /**
     * Apply character-level overlap between already-split strings: every
     * piece after the first is prefixed with the last `overlap` characters of
     * the previous original piece (or all of it, if shorter) and a space.
     */
    applyCharOverlap(chunks, overlap) {
        if (chunks.length <= 1)
            return chunks;
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prev = chunks[i - 1];
            const tail = prev.length > overlap ? prev.slice(prev.length - overlap) : prev;
            result.push(tail + " " + chunks[i]);
        }
        return result;
    }
}
|
|
241
|
+
exports.ElementChunker = ElementChunker;
|
|
242
|
+
//# sourceMappingURL=ElementChunker.js.map
|
package/dist/chunkers/index.d.ts
CHANGED
|
@@ -3,4 +3,5 @@ export { Chunker } from "./Chunker";
|
|
|
3
3
|
export { TextChunker } from "./TextChunker";
|
|
4
4
|
export { RecursiveChunker } from "./RecursiveChunker";
|
|
5
5
|
export { TokenChunker } from "./TokenChunker";
|
|
6
|
+
export { ElementChunker, type ElementChunkerConfig } from "./ElementChunker";
|
|
6
7
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/chunkers/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.TokenChunker = exports.RecursiveChunker = exports.TextChunker = exports.Chunker = void 0;
|
|
3
|
+
exports.ElementChunker = exports.TokenChunker = exports.RecursiveChunker = exports.TextChunker = exports.Chunker = void 0;
|
|
4
4
|
// Base class
|
|
5
5
|
var Chunker_1 = require("./Chunker");
|
|
6
6
|
Object.defineProperty(exports, "Chunker", { enumerable: true, get: function () { return Chunker_1.Chunker; } });
|
|
@@ -11,4 +11,6 @@ var RecursiveChunker_1 = require("./RecursiveChunker");
|
|
|
11
11
|
Object.defineProperty(exports, "RecursiveChunker", { enumerable: true, get: function () { return RecursiveChunker_1.RecursiveChunker; } });
|
|
12
12
|
var TokenChunker_1 = require("./TokenChunker");
|
|
13
13
|
Object.defineProperty(exports, "TokenChunker", { enumerable: true, get: function () { return TokenChunker_1.TokenChunker; } });
|
|
14
|
+
var ElementChunker_1 = require("./ElementChunker");
|
|
15
|
+
Object.defineProperty(exports, "ElementChunker", { enumerable: true, get: function () { return ElementChunker_1.ElementChunker; } });
|
|
14
16
|
//# sourceMappingURL=index.js.map
|
|
@@ -2,6 +2,8 @@ import { Chunk, ChunkOptions } from "../chunkers/types";
|
|
|
2
2
|
import { Chunker } from "../chunkers/Chunker";
|
|
3
3
|
import { Embeddings } from "../embeddings/Embeddings";
|
|
4
4
|
import { VectorStore } from "../vectorstore/VectorStore";
|
|
5
|
+
import { DocumentParser } from "../parsers/DocumentParser";
|
|
6
|
+
import { ParseOptions } from "../parsers/types";
|
|
5
7
|
import { IngestionOptions, IngestionResult, DocumentInput } from "./types";
|
|
6
8
|
/**
|
|
7
9
|
* Pipeline for ingesting documents into a vector store.
|
|
@@ -31,7 +33,15 @@ export declare class IngestionPipeline {
|
|
|
31
33
|
private chunker;
|
|
32
34
|
private embeddings;
|
|
33
35
|
private store;
|
|
34
|
-
|
|
36
|
+
private parser?;
|
|
37
|
+
/**
|
|
38
|
+
* @param chunker - Chunker to split parsed/raw text into chunks
|
|
39
|
+
* @param embeddings - Embeddings provider
|
|
40
|
+
* @param store - Vector store for persistence
|
|
41
|
+
* @param parser - Optional default parser used by {@link ingestFile} and
|
|
42
|
+
* {@link ingestFiles} when no parser is passed at call time
|
|
43
|
+
*/
|
|
44
|
+
constructor(chunker: Chunker, embeddings: Embeddings, store: VectorStore, parser?: DocumentParser);
|
|
35
45
|
/**
|
|
36
46
|
* Ingest a single document into the vector store.
|
|
37
47
|
*
|
|
@@ -48,6 +58,64 @@ export declare class IngestionPipeline {
|
|
|
48
58
|
* @returns Aggregated result of all ingestions
|
|
49
59
|
*/
|
|
50
60
|
ingestMany(documents: DocumentInput[], options?: IngestionOptions): Promise<IngestionResult>;
|
|
61
|
+
/**
|
|
62
|
+
* Parse a file and ingest it into the vector store.
|
|
63
|
+
*
|
|
64
|
+
* Combines parsing + chunking + embedding + storing in a single call.
|
|
65
|
+
* When the pipeline's chunker is an {@link ElementChunker} and the parser
|
|
66
|
+
* returns structured elements, chunking is done on element boundaries
|
|
67
|
+
* instead of raw text.
|
|
68
|
+
*
|
|
69
|
+
* The `parser` argument is optional when one was configured on the pipeline
|
|
70
|
+
* constructor; it is required otherwise.
|
|
71
|
+
*
|
|
72
|
+
* @example Using a pipeline-level parser:
|
|
73
|
+
* ```typescript
|
|
74
|
+
* const pipeline = new IngestionPipeline(
|
|
75
|
+
* new ElementChunker({ chunkSize: 1000 }),
|
|
76
|
+
* embeddings,
|
|
77
|
+
* store,
|
|
78
|
+
* new UnstructuredLocalParser(),
|
|
79
|
+
* );
|
|
80
|
+
* await pipeline.ingestFile("/docs/report.pdf", { strategy: "hi_res" });
|
|
81
|
+
* ```
|
|
82
|
+
*
|
|
83
|
+
* @example Passing a parser per call:
|
|
84
|
+
* ```typescript
|
|
85
|
+
* await pipeline.ingestFile("/docs/report.pdf", new UnstructuredLocalParser(), {
|
|
86
|
+
* strategy: "hi_res",
|
|
87
|
+
* sourceId: "report-2024",
|
|
88
|
+
* });
|
|
89
|
+
* ```
|
|
90
|
+
*/
|
|
91
|
+
ingestFile(filePath: string, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
|
|
92
|
+
ingestFile(filePath: string, parser: DocumentParser, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
|
|
93
|
+
/**
|
|
94
|
+
* Parse and ingest multiple files.
|
|
95
|
+
*
|
|
96
|
+
* Files are parsed sequentially; all chunks are batched together for
|
|
97
|
+
* embedding and storage. When the pipeline uses an {@link ElementChunker}
|
|
98
|
+
* and the parser returns structured elements, element-aware chunking is
|
|
99
|
+
* applied per file (preserving `element_types` and `page` metadata).
|
|
100
|
+
* The `parser` argument is optional when one was set on the pipeline
|
|
101
|
+
* constructor.
|
|
102
|
+
*
|
|
103
|
+
* @example Using a pipeline-level parser:
|
|
104
|
+
* ```typescript
|
|
105
|
+
* await pipeline.ingestFiles(["/a.pdf", "/b.docx"], { skipDuplicates: true });
|
|
106
|
+
* ```
|
|
107
|
+
*
|
|
108
|
+
* @example Passing a parser per call:
|
|
109
|
+
* ```typescript
|
|
110
|
+
* await pipeline.ingestFiles(
|
|
111
|
+
* ["/docs/a.pdf", "/docs/b.docx"],
|
|
112
|
+
* new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" }),
|
|
113
|
+
* { strategy: "auto", skipDuplicates: true }
|
|
114
|
+
* );
|
|
115
|
+
* ```
|
|
116
|
+
*/
|
|
117
|
+
ingestFiles(filePaths: string[], options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
|
|
118
|
+
ingestFiles(filePaths: string[], parser: DocumentParser, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
|
|
51
119
|
/**
|
|
52
120
|
* Ingest pre-chunked data into the vector store.
|
|
53
121
|
* Useful when chunking is done separately.
|
|
@@ -82,5 +150,9 @@ export declare class IngestionPipeline {
|
|
|
82
150
|
* Get the vector store used by this pipeline.
|
|
83
151
|
*/
|
|
84
152
|
getStore(): VectorStore;
|
|
153
|
+
/**
|
|
154
|
+
* Get the default parser configured on this pipeline, if any.
|
|
155
|
+
*/
|
|
156
|
+
getParser(): DocumentParser | undefined;
|
|
85
157
|
}
|
|
86
158
|
//# sourceMappingURL=IngestionPipeline.d.ts.map
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.IngestionPipeline = void 0;
|
|
4
|
+
const ElementChunker_1 = require("../chunkers/ElementChunker");
|
|
4
5
|
/**
|
|
5
6
|
* Pipeline for ingesting documents into a vector store.
|
|
6
7
|
* Orchestrates the flow: chunk → batch embed → store
|
|
@@ -26,10 +27,18 @@ exports.IngestionPipeline = void 0;
|
|
|
26
27
|
* ```
|
|
27
28
|
*/
|
|
28
29
|
class IngestionPipeline {
|
|
29
|
-
|
|
30
|
+
/**
|
|
31
|
+
* @param chunker - Chunker to split parsed/raw text into chunks
|
|
32
|
+
* @param embeddings - Embeddings provider
|
|
33
|
+
* @param store - Vector store for persistence
|
|
34
|
+
* @param parser - Optional default parser used by {@link ingestFile} and
|
|
35
|
+
* {@link ingestFiles} when no parser is passed at call time
|
|
36
|
+
*/
|
|
37
|
+
constructor(chunker, embeddings, store, parser) {
|
|
30
38
|
this.chunker = chunker;
|
|
31
39
|
this.embeddings = embeddings;
|
|
32
40
|
this.store = store;
|
|
41
|
+
this.parser = parser;
|
|
33
42
|
}
|
|
34
43
|
/**
|
|
35
44
|
* Ingest a single document into the vector store.
|
|
@@ -95,6 +104,100 @@ class IngestionPipeline {
|
|
|
95
104
|
// Process all chunks together
|
|
96
105
|
return this.processChunks(allChunks, options ?? {}, startTime);
|
|
97
106
|
}
|
|
107
|
+
async ingestFile(filePath, parserOrOptions, options) {
|
|
108
|
+
let parser;
|
|
109
|
+
let opts;
|
|
110
|
+
if (parserOrOptions != null && typeof parserOrOptions.parse === "function") {
|
|
111
|
+
parser = parserOrOptions;
|
|
112
|
+
opts = options;
|
|
113
|
+
}
|
|
114
|
+
else {
|
|
115
|
+
parser = this.parser;
|
|
116
|
+
opts = parserOrOptions;
|
|
117
|
+
}
|
|
118
|
+
if (!parser) {
|
|
119
|
+
throw new Error("No parser provided. Pass a DocumentParser to ingestFile() or set one in the IngestionPipeline constructor.");
|
|
120
|
+
}
|
|
121
|
+
const parseOptions = {
|
|
122
|
+
strategy: opts?.strategy,
|
|
123
|
+
languages: opts?.languages,
|
|
124
|
+
};
|
|
125
|
+
const parsed = await parser.parse(filePath, parseOptions);
|
|
126
|
+
const chunkOptions = {
|
|
127
|
+
sourceId: opts?.sourceId,
|
|
128
|
+
sourcePath: opts?.sourcePath ?? filePath,
|
|
129
|
+
metadata: opts?.metadata,
|
|
130
|
+
};
|
|
131
|
+
const ingestionOptions = {
|
|
132
|
+
batchSize: opts?.batchSize,
|
|
133
|
+
onProgress: opts?.onProgress,
|
|
134
|
+
onError: opts?.onError,
|
|
135
|
+
skipDuplicates: opts?.skipDuplicates,
|
|
136
|
+
};
|
|
137
|
+
// When the pipeline uses an ElementChunker and the parser returned
|
|
138
|
+
// structured elements, chunk on element boundaries instead of raw text.
|
|
139
|
+
if (this.chunker instanceof ElementChunker_1.ElementChunker && parsed.elements?.length) {
|
|
140
|
+
const startTime = Date.now();
|
|
141
|
+
const chunks = await this.chunker.chunkElements(parsed.elements, chunkOptions);
|
|
142
|
+
return this.processChunks(chunks, ingestionOptions, startTime);
|
|
143
|
+
}
|
|
144
|
+
return this.ingest(parsed.text, { ...chunkOptions, ...ingestionOptions });
|
|
145
|
+
}
|
|
146
|
+
async ingestFiles(filePaths, parserOrOptions, options) {
|
|
147
|
+
let parser;
|
|
148
|
+
let opts;
|
|
149
|
+
if (parserOrOptions != null && typeof parserOrOptions.parse === "function") {
|
|
150
|
+
parser = parserOrOptions;
|
|
151
|
+
opts = options;
|
|
152
|
+
}
|
|
153
|
+
else {
|
|
154
|
+
parser = this.parser;
|
|
155
|
+
opts = parserOrOptions;
|
|
156
|
+
}
|
|
157
|
+
if (!parser) {
|
|
158
|
+
throw new Error("No parser provided. Pass a DocumentParser to ingestFiles() or set one in the IngestionPipeline constructor.");
|
|
159
|
+
}
|
|
160
|
+
const parseOptions = {
|
|
161
|
+
strategy: opts?.strategy,
|
|
162
|
+
languages: opts?.languages,
|
|
163
|
+
};
|
|
164
|
+
const ingestionOptions = {
|
|
165
|
+
batchSize: opts?.batchSize,
|
|
166
|
+
onProgress: opts?.onProgress,
|
|
167
|
+
onError: opts?.onError,
|
|
168
|
+
skipDuplicates: opts?.skipDuplicates,
|
|
169
|
+
};
|
|
170
|
+
const startTime = Date.now();
|
|
171
|
+
const allChunks = [];
|
|
172
|
+
this.emitProgress(ingestionOptions.onProgress, {
|
|
173
|
+
phase: "chunking",
|
|
174
|
+
processed: 0,
|
|
175
|
+
total: filePaths.length,
|
|
176
|
+
});
|
|
177
|
+
for (let i = 0; i < filePaths.length; i++) {
|
|
178
|
+
const filePath = filePaths[i];
|
|
179
|
+
const parsed = await parser.parse(filePath, parseOptions);
|
|
180
|
+
const chunkOptions = {
|
|
181
|
+
sourceId: opts?.sourceId,
|
|
182
|
+
sourcePath: filePath,
|
|
183
|
+
metadata: opts?.metadata,
|
|
184
|
+
};
|
|
185
|
+
let fileChunks;
|
|
186
|
+
if (this.chunker instanceof ElementChunker_1.ElementChunker && parsed.elements?.length) {
|
|
187
|
+
fileChunks = await this.chunker.chunkElements(parsed.elements, chunkOptions);
|
|
188
|
+
}
|
|
189
|
+
else {
|
|
190
|
+
fileChunks = await this.chunker.chunk(parsed.text, chunkOptions);
|
|
191
|
+
}
|
|
192
|
+
allChunks.push(...fileChunks);
|
|
193
|
+
this.emitProgress(ingestionOptions.onProgress, {
|
|
194
|
+
phase: "chunking",
|
|
195
|
+
processed: i + 1,
|
|
196
|
+
total: filePaths.length,
|
|
197
|
+
});
|
|
198
|
+
}
|
|
199
|
+
return this.processChunks(allChunks, ingestionOptions, startTime);
|
|
200
|
+
}
|
|
98
201
|
/**
|
|
99
202
|
* Ingest pre-chunked data into the vector store.
|
|
100
203
|
* Useful when chunking is done separately.
|
|
@@ -261,6 +364,12 @@ class IngestionPipeline {
|
|
|
261
364
|
getStore() {
|
|
262
365
|
return this.store;
|
|
263
366
|
}
|
|
367
|
+
/**
|
|
368
|
+
* Get the default parser configured on this pipeline, if any.
|
|
369
|
+
*/
|
|
370
|
+
getParser() {
|
|
371
|
+
return this.parser;
|
|
372
|
+
}
|
|
264
373
|
}
|
|
265
374
|
exports.IngestionPipeline = IngestionPipeline;
|
|
266
375
|
//# sourceMappingURL=IngestionPipeline.js.map
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { ParsedDocument, ParsedElement, ParseOptions } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Abstract base class for document parsers.
|
|
4
|
+
*
|
|
5
|
+
* Implementations wrap third-party libraries (Unstructured, LlamaIndex, etc.)
|
|
6
|
+
* and normalise their output into a {@link ParsedDocument} that can be fed
|
|
7
|
+
* directly into an {@link IngestionPipeline}.
|
|
8
|
+
*
|
|
9
|
+
* All peer dependencies are loaded lazily via dynamic import so that
|
|
10
|
+
* users only need to install what they actually use.
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* const parser = new UnstructuredLocalParser();
|
|
15
|
+
* const doc = await parser.parse("/path/to/report.pdf");
|
|
16
|
+
* console.log(doc.elements?.length, "elements parsed");
|
|
17
|
+
* await pipeline.ingestFile("/path/to/report.pdf", parser);
|
|
18
|
+
* ```
|
|
19
|
+
*/
|
|
20
|
+
export declare abstract class DocumentParser {
|
|
21
|
+
/** Human-readable parser identifier (e.g. "unstructured-local") */
|
|
22
|
+
abstract readonly name: string;
|
|
23
|
+
/**
|
|
24
|
+
* Parse a document file and return its content.
|
|
25
|
+
*
|
|
26
|
+
* @param filePath - Absolute or relative path to the file
|
|
27
|
+
* @param options - Optional parsing hints (strategy, languages, etc.)
|
|
28
|
+
*/
|
|
29
|
+
abstract parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
|
|
30
|
+
/**
|
|
31
|
+
* Join elements into a single plain-text string.
|
|
32
|
+
* Filters out empty strings and separates elements with a blank line.
|
|
33
|
+
*/
|
|
34
|
+
protected elementsToText(elements: ParsedElement[]): string;
|
|
35
|
+
}
|
|
36
|
+
//# sourceMappingURL=DocumentParser.d.ts.map
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.DocumentParser = void 0;
|
|
4
|
+
/**
 * Base class for document parsers.
 *
 * Concrete parsers wrap third-party libraries (Unstructured, LlamaIndex, etc.)
 * and normalise their output into a {@link ParsedDocument} that can be fed
 * directly into an {@link IngestionPipeline}. This compiled base class only
 * carries the shared `elementsToText` helper.
 *
 * @example
 * ```typescript
 * const parser = new UnstructuredLocalParser();
 * const doc = await parser.parse("/path/to/report.pdf");
 * console.log(doc.elements?.length, "elements parsed");
 * await pipeline.ingestFile("/path/to/report.pdf", parser);
 * ```
 */
class DocumentParser {
    /**
     * Flatten a list of elements into one plain-text string.
     *
     * Elements with a falsy `text` (empty string, `undefined`, `null`) are
     * dropped; the remaining texts are joined with a blank line (`"\n\n"`).
     * Whitespace-only text is kept as-is.
     *
     * @param elements - Elements to flatten, in document order
     * @returns The joined text, or `""` when nothing remains
     */
    elementsToText(elements) {
        const texts = elements.flatMap((element) => {
            const body = element.text;
            return body ? [body] : [];
        });
        return texts.join("\n\n");
    }
}
|
|
34
|
+
exports.DocumentParser = DocumentParser;
|
|
35
|
+
//# sourceMappingURL=DocumentParser.js.map
|