@agentionai/agents 0.10.2 → 0.12.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ import { Chunker } from "./Chunker";
2
+ import { Chunk, ChunkerConfig, ChunkOptions } from "./types";
3
+ import { ParsedElement } from "../parsers/types";
4
+ /**
5
+ * Options accepted by the {@link ElementChunker} constructor, on top of the base {@link ChunkerConfig}.
6
+ */
7
+ export interface ElementChunkerConfig extends ChunkerConfig {
8
+ /**
9
+ * Element types to skip entirely.
10
+ * Useful for dropping decorative or non-content elements. When omitted, nothing is skipped.
11
+ * @example ["Image", "PageBreak", "Header", "Footer"]
12
+ */
13
+ excludeTypes?: string[];
14
+ /**
15
+ * Element types that always start a new chunk, even if there is room
16
+ * in the current one. Use this to keep headings at the top of their
17
+ * section's chunk. Pass `[]` to disable type-based breaks.
18
+ * @default ["Title"]
19
+ */
20
+ breakOnTypes?: string[];
21
+ }
22
+ /**
23
+ * Chunks a document by grouping its **structured elements** rather than
24
+ * splitting raw text. Designed for use with parsers that return element
25
+ * lists (e.g. {@link UnstructuredLocalParser}, {@link UnstructuredAPIParser}).
26
+ *
27
+ * **How it works:**
28
+ * 1. Adjacent elements are merged into a single chunk until the combined
29
+ * character count would exceed `chunkSize`.
30
+ * 2. A `breakOnTypes` element (default: `"Title"`) always starts a fresh
31
+ * chunk so that headings introduce their section's content.
32
+ * 3. A single element whose text exceeds `chunkSize` is split recursively
33
+ * using separator heuristics (paragraphs → sentences → words → characters).
34
+ * 4. Element types are stored in `chunk.metadata.element_types`; page number
35
+ * is stored in `chunk.metadata.page` when available.
36
+ *
37
+ * Use via {@link IngestionPipeline.ingestFile} — the pipeline automatically
38
+ * calls `chunkElements()` instead of `chunk()` when this chunker is used and
39
+ * the parser returns a structured element list.
40
+ *
41
+ * @example
42
+ * ```typescript
43
+ * import { ElementChunker } from '@agentionai/agents/chunkers';
44
+ * import { UnstructuredLocalParser } from '@agentionai/agents/parsers/unstructured-local';
45
+ *
46
+ * const pipeline = new IngestionPipeline(
47
+ * new ElementChunker({ chunkSize: 1000 }),
48
+ * embeddings,
49
+ * store,
50
+ * );
51
+ *
52
+ * await pipeline.ingestFile('/docs/report.pdf', new UnstructuredLocalParser(), {
53
+ * strategy: 'hi_res',
54
+ * });
55
+ * ```
56
+ */
57
+ export declare class ElementChunker extends Chunker {
58
+ readonly name = "ElementChunker";
59
+ private readonly excludeTypes;
60
+ private readonly breakOnTypes;
61
+ constructor(config: ElementChunkerConfig);
62
+ /**
63
+ * Chunk a list of structured elements into {@link Chunk} objects.
64
+ *
65
+ * This is the primary entry point when using this chunker with a parser.
66
+ * Called automatically by {@link IngestionPipeline.ingestFile} when
67
+ * the parsed document has a non-empty `elements` array. Elements with an excluded type or blank text are ignored.
68
+ *
69
+ * @param elements - Parsed elements from a {@link DocumentParser}
70
+ * @param options - Source tracking and custom metadata; resolves to `[]` when no element yields text
71
+ */
72
+ chunkElements(elements: ParsedElement[], options?: ChunkOptions): Promise<Chunk[]>;
73
+ /**
74
+ * Fallback text splitting used when {@link Chunker.chunk} is called directly
75
+ * (i.e. without a structured element list). Splits on blank lines first,
76
+ * then single newlines, then sentences, then words, and finally by character count.
77
+ */
78
+ protected splitText(text: string): string[];
79
+ /**
80
+ * Build a {@link Chunk} from a group of elements, recording their distinct types and the first available page number.
81
+ */
82
+ private buildChunk;
83
+ /**
84
+ * Split a single large element text using separator heuristics; text no longer than `chunkSize` is returned as one piece.
85
+ */
86
+ private splitLargeText;
87
+ /**
88
+ * Greedily merge string parts, rejoined with the separator, into windows of at most `maxSize` characters.
89
+ */
90
+ private mergeToSize;
91
+ /**
92
+ * Hard character-count split used when no separator works; whitespace-only slices are dropped.
93
+ */
94
+ private forceSplit;
95
+ /**
96
+ * Prefix each piece after the first with the trailing `overlap` characters of the preceding piece, joined by a space.
97
+ */
98
+ private applyCharOverlap;
99
+ }
100
+ //# sourceMappingURL=ElementChunker.d.ts.map
@@ -0,0 +1,242 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ElementChunker = void 0;
4
+ const Chunker_1 = require("./Chunker");
5
+ /**
6
+ * Chunks a document by grouping its **structured elements** rather than
7
+ * splitting raw text. Designed for use with parsers that return element
8
+ * lists (e.g. {@link UnstructuredLocalParser}, {@link UnstructuredAPIParser}).
9
+ *
10
+ * **How it works:**
11
+ * 1. Adjacent elements are merged into a single chunk until the combined
12
+ * character count would exceed `chunkSize`.
13
+ * 2. A `breakOnTypes` element (default: `"Title"`) always starts a fresh
14
+ * chunk so that headings introduce their section's content.
15
+ * 3. A single element whose text exceeds `chunkSize` is split recursively
16
+ * using separator heuristics (paragraphs → sentences → words → characters).
17
+ * 4. Element types are stored in `chunk.metadata.element_types`; page number
18
+ * is stored in `chunk.metadata.page` when available.
19
+ *
20
+ * Use via {@link IngestionPipeline.ingestFile} — the pipeline automatically
21
+ * calls `chunkElements()` instead of `chunk()` when this chunker is used and
22
+ * the parser returns a structured element list.
23
+ *
24
+ * @example
25
+ * ```typescript
26
+ * import { ElementChunker } from '@agentionai/agents/chunkers';
27
+ * import { UnstructuredLocalParser } from '@agentionai/agents/parsers/unstructured-local';
28
+ *
29
+ * const pipeline = new IngestionPipeline(
30
+ * new ElementChunker({ chunkSize: 1000 }),
31
+ * embeddings,
32
+ * store,
33
+ * );
34
+ *
35
+ * await pipeline.ingestFile('/docs/report.pdf', new UnstructuredLocalParser(), {
36
+ * strategy: 'hi_res',
37
+ * });
38
+ * ```
39
+ */
40
class ElementChunker extends Chunker_1.Chunker {
    /**
     * @param config - Base chunker options plus `excludeTypes` (default: none)
     *   and `breakOnTypes` (default: `["Title"]`).
     */
    constructor(config) {
        super(config);
        this.name = "ElementChunker";
        // Sets make the per-element membership checks cheap.
        this.excludeTypes = new Set(config.excludeTypes ?? []);
        this.breakOnTypes = new Set(config.breakOnTypes ?? ["Title"]);
    }
    // ─── element-aware primary path ──────────────────────────────────────────
    /**
     * Chunk a list of structured elements into {@link Chunk} objects.
     *
     * Elements whose type is excluded or whose text is blank are ignored.
     * Remaining elements are grouped in order; the trimmed texts of a group are
     * joined with a blank line, and the joiner length is counted so a grouped
     * chunk never exceeds `chunkSize`. An element longer than `chunkSize` is
     * emitted as its own sub-chunks via {@link splitLargeText}.
     *
     * @param elements - Parsed elements from a {@link DocumentParser}
     * @param options - Source tracking and custom metadata
     * @returns The chunks in document order, or `[]` when no element yields text
     */
    async chunkElements(elements, options) {
        const { chunkSize } = this.config;
        // Separator placed between element texts inside one chunk.
        const joiner = "\n\n";
        const chunks = [];
        // Working group — elements (and their trimmed texts) for the next chunk
        let groupElements = [];
        let groupTexts = [];
        let groupSize = 0;
        const flush = () => {
            if (groupElements.length > 0) {
                chunks.push(this.buildChunk(groupTexts.join(joiner), groupElements, chunks.length, options));
            }
            groupElements = [];
            groupTexts = [];
            groupSize = 0;
        };
        for (const el of elements ?? []) {
            if (this.excludeTypes.has(el.type))
                continue;
            const text = el.text?.trim() ?? "";
            if (!text)
                continue;
            // Break-on-type: close the current group so this element opens a new chunk
            if (this.breakOnTypes.has(el.type)) {
                flush();
            }
            if (text.length > chunkSize) {
                // Too large to share a chunk: close the group, then emit sub-chunks
                flush();
                for (const subText of this.splitLargeText(text)) {
                    chunks.push(this.buildChunk(subText, [el], chunks.length, options));
                }
                continue;
            }
            // Size of the joined content if this element were appended to the group
            const grownSize = groupSize + joiner.length + text.length;
            if (groupElements.length > 0 && grownSize > chunkSize) {
                // Adding this element would overflow — flush and start fresh
                flush();
            }
            groupSize = groupElements.length > 0 ? grownSize : text.length;
            groupElements.push(el);
            groupTexts.push(text);
        }
        flush();
        if (chunks.length === 0)
            return [];
        // The total is only known once every chunk exists
        for (const chunk of chunks) {
            chunk.metadata.total = chunks.length;
        }
        this.linkChunks(chunks);
        if (this.config.chunkProcessor) {
            return this.applyProcessor(chunks);
        }
        return chunks;
    }
    // ─── text fallback path (required by Chunker) ────────────────────────────
    /**
     * Fallback text splitting used when {@link Chunker.chunk} is called directly
     * (i.e. without a structured element list). Delegates to
     * {@link splitLargeText}.
     */
    splitText(text) {
        return this.splitLargeText(text);
    }
    // ─── helpers ─────────────────────────────────────────────────────────────
    /**
     * Build a {@link Chunk} from a group of elements.
     *
     * @param content - Final chunk text
     * @param sourceElements - Elements the text came from; their distinct types
     *   go to `element_types`, the first non-null `page_number` goes to `page`
     * @param index - Position of the chunk in the output list
     * @param options - Source tracking and custom metadata; `options.metadata`
     *   is spread last, so its keys override the computed fields
     */
    buildChunk(content, sourceElements, index, options) {
        const elementTypes = [...new Set(sourceElements.map((el) => el.type))];
        // Use page_number from the first element that provides it
        const page = sourceElements
            .map((el) => el.metadata?.["page_number"])
            .find((p) => p != null);
        const metadata = {
            index,
            total: 0, // set after all chunks are built
            prev_id: null,
            next_id: null,
            start: 0,
            end: content.length,
            source_id: options?.sourceId,
            source_path: options?.sourcePath,
            char_count: content.length,
            hash: this.computeHash(content),
            section: this.detectSectionTitle(content),
            page,
            element_types: elementTypes,
            ...options?.metadata,
        };
        return {
            id: this.generateId(content, index, options?.sourceId),
            content,
            metadata,
        };
    }
    /**
     * Split a single large text using separator heuristics.
     *
     * Separators are tried from coarsest to finest (blank line, newline,
     * sentence, word); the first one that yields more than one non-blank part
     * is used, and any part still larger than `chunkSize` is split again with
     * the finer separators. Text without a usable separator is cut by
     * character count.
     *
     * NOTE: when `chunkOverlap` is set, the overlap is prepended after merging,
     * so those pieces can be longer than `chunkSize`.
     *
     * @param text - Text to split
     * @returns `[text]` when it already fits, otherwise the pieces in order
     */
    splitLargeText(text) {
        const { chunkSize, chunkOverlap = 0 } = this.config;
        if (text.length <= chunkSize)
            return [text];
        const separators = ["\n\n", "\n", ". ", " "];
        for (let i = 0; i < separators.length; i++) {
            const sep = separators[i];
            const parts = text.split(sep).filter((s) => s.trim());
            if (parts.length <= 1)
                continue;
            const merged = this.mergeToSize(parts, sep, chunkSize, separators.slice(i + 1));
            // Apply overlap if configured
            if (chunkOverlap > 0 && merged.length > 1) {
                return this.applyCharOverlap(merged, chunkOverlap);
            }
            return merged;
        }
        return this.forceSplit(text, chunkSize, chunkOverlap);
    }
    /**
     * Greedily merge string parts into windows of at most `maxSize` characters.
     *
     * @param parts - Pieces obtained by splitting on `sep`
     * @param sep - Separator re-inserted between merged parts
     * @param maxSize - Maximum length of each window
     * @param finerSeparators - Separators to try, in order, on a part that is
     *   itself longer than `maxSize`; with none, such a part is cut by
     *   character count
     */
    mergeToSize(parts, sep, maxSize, finerSeparators = []) {
        const result = [];
        let current = "";
        for (const part of parts) {
            const addition = current ? sep + part : part;
            if (current.length + addition.length <= maxSize) {
                current = current + addition;
            }
            else {
                if (current)
                    result.push(current);
                if (part.length > maxSize) {
                    result.push(...this.splitOversizedPart(part, maxSize, finerSeparators));
                    current = "";
                }
                else {
                    current = part;
                }
            }
        }
        if (current)
            result.push(current);
        return result;
    }
    /**
     * Split one part that is longer than `maxSize`, preferring the given
     * separators (tried in order) over a hard character cut.
     */
    splitOversizedPart(part, maxSize, separators) {
        for (let i = 0; i < separators.length; i++) {
            const pieces = part.split(separators[i]).filter((s) => s.trim());
            if (pieces.length <= 1)
                continue;
            return this.mergeToSize(pieces, separators[i], maxSize, separators.slice(i + 1));
        }
        return this.forceSplit(part, maxSize, 0);
    }
    /**
     * Hard character-count split when no separator works.
     *
     * @param text - Text to cut
     * @param size - Maximum length of each slice
     * @param overlap - Characters shared between consecutive slices
     * @returns The non-blank slices in order
     */
    forceSplit(text, size, overlap) {
        // A non-positive size cannot advance through the text; return it whole.
        if (!(size > 0))
            return text.trim() ? [text] : [];
        const chunks = [];
        // Keep the overlap within [0, size - 1] so the step is always at least 1;
        // otherwise the loop below would never reach the end of the text.
        const safeOverlap = Math.min(Math.max(overlap, 0), size - 1);
        const step = size - safeOverlap;
        let start = 0;
        while (start < text.length) {
            const end = Math.min(start + size, text.length);
            const slice = text.slice(start, end);
            if (slice.trim())
                chunks.push(slice);
            if (end >= text.length)
                break;
            start += step;
        }
        return chunks;
    }
    /**
     * Apply character-level overlap between already-split strings: every piece
     * after the first is prefixed with the last `overlap` characters of the
     * piece before it (the whole piece when it is shorter), joined by a space.
     */
    applyCharOverlap(chunks, overlap) {
        if (chunks.length <= 1)
            return chunks;
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prev = chunks[i - 1];
            const tail = prev.length > overlap ? prev.slice(prev.length - overlap) : prev;
            result.push(tail + " " + chunks[i]);
        }
        return result;
    }
}
241
+ exports.ElementChunker = ElementChunker;
242
+ //# sourceMappingURL=ElementChunker.js.map
@@ -3,4 +3,5 @@ export { Chunker } from "./Chunker";
3
3
  export { TextChunker } from "./TextChunker";
4
4
  export { RecursiveChunker } from "./RecursiveChunker";
5
5
  export { TokenChunker } from "./TokenChunker";
6
+ export { ElementChunker, type ElementChunkerConfig } from "./ElementChunker";
6
7
  //# sourceMappingURL=index.d.ts.map
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.TokenChunker = exports.RecursiveChunker = exports.TextChunker = exports.Chunker = void 0;
3
+ exports.ElementChunker = exports.TokenChunker = exports.RecursiveChunker = exports.TextChunker = exports.Chunker = void 0;
4
4
  // Base class
5
5
  var Chunker_1 = require("./Chunker");
6
6
  Object.defineProperty(exports, "Chunker", { enumerable: true, get: function () { return Chunker_1.Chunker; } });
@@ -11,4 +11,6 @@ var RecursiveChunker_1 = require("./RecursiveChunker");
11
11
  Object.defineProperty(exports, "RecursiveChunker", { enumerable: true, get: function () { return RecursiveChunker_1.RecursiveChunker; } });
12
12
  var TokenChunker_1 = require("./TokenChunker");
13
13
  Object.defineProperty(exports, "TokenChunker", { enumerable: true, get: function () { return TokenChunker_1.TokenChunker; } });
14
+ var ElementChunker_1 = require("./ElementChunker");
15
+ Object.defineProperty(exports, "ElementChunker", { enumerable: true, get: function () { return ElementChunker_1.ElementChunker; } });
14
16
  //# sourceMappingURL=index.js.map
@@ -2,6 +2,8 @@ import { Chunk, ChunkOptions } from "../chunkers/types";
2
2
  import { Chunker } from "../chunkers/Chunker";
3
3
  import { Embeddings } from "../embeddings/Embeddings";
4
4
  import { VectorStore } from "../vectorstore/VectorStore";
5
+ import { DocumentParser } from "../parsers/DocumentParser";
6
+ import { ParseOptions } from "../parsers/types";
5
7
  import { IngestionOptions, IngestionResult, DocumentInput } from "./types";
6
8
  /**
7
9
  * Pipeline for ingesting documents into a vector store.
@@ -31,7 +33,15 @@ export declare class IngestionPipeline {
31
33
  private chunker;
32
34
  private embeddings;
33
35
  private store;
34
- constructor(chunker: Chunker, embeddings: Embeddings, store: VectorStore);
36
+ private parser?;
37
+ /**
38
+ * @param chunker - Chunker to split parsed/raw text into chunks; an {@link ElementChunker} enables element-aware chunking in {@link ingestFile} and {@link ingestFiles}
39
+ * @param embeddings - Embeddings provider
40
+ * @param store - Vector store for persistence
41
+ * @param parser - Optional default parser used by {@link ingestFile} and
42
+ * {@link ingestFiles} when no parser is passed at call time
43
+ */
44
+ constructor(chunker: Chunker, embeddings: Embeddings, store: VectorStore, parser?: DocumentParser);
35
45
  /**
36
46
  * Ingest a single document into the vector store.
37
47
  *
@@ -48,6 +58,64 @@ export declare class IngestionPipeline {
48
58
  * @returns Aggregated result of all ingestions
49
59
  */
50
60
  ingestMany(documents: DocumentInput[], options?: IngestionOptions): Promise<IngestionResult>;
61
+ /**
62
+ * Parse a file and ingest it into the vector store.
63
+ *
64
+ * Combines parsing + chunking + embedding + storing in a single call.
65
+ * When the pipeline's chunker is an {@link ElementChunker} and the parser
66
+ * returns structured elements, chunking is done on element boundaries
67
+ * instead of raw text; otherwise the parsed plain text is passed to {@link ingest}. `sourcePath` defaults to `filePath`.
68
+ *
69
+ * The `parser` argument is optional when one was configured on the pipeline
70
+ * constructor; it is required otherwise (an Error is thrown when neither is available).
71
+ *
72
+ * @example Using a pipeline-level parser:
73
+ * ```typescript
74
+ * const pipeline = new IngestionPipeline(
75
+ * new ElementChunker({ chunkSize: 1000 }),
76
+ * embeddings,
77
+ * store,
78
+ * new UnstructuredLocalParser(),
79
+ * );
80
+ * await pipeline.ingestFile("/docs/report.pdf", { strategy: "hi_res" });
81
+ * ```
82
+ *
83
+ * @example Passing a parser per call:
84
+ * ```typescript
85
+ * await pipeline.ingestFile("/docs/report.pdf", new UnstructuredLocalParser(), {
86
+ * strategy: "hi_res",
87
+ * sourceId: "report-2024",
88
+ * });
89
+ * ```
90
+ */
91
+ ingestFile(filePath: string, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
92
+ ingestFile(filePath: string, parser: DocumentParser, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
93
+ /**
94
+ * Parse and ingest multiple files.
95
+ *
96
+ * Files are parsed sequentially; all chunks are batched together for
97
+ * embedding and storage. When the pipeline uses an {@link ElementChunker}
98
+ * and the parser returns structured elements, element-aware chunking is
99
+ * applied per file (preserving `element_types` and `page` metadata).
100
+ * The `parser` argument is optional when one was set on the pipeline
101
+ * constructor. The same `sourceId` and `metadata` are applied to every file; `sourcePath` is always each file's own path.
102
+ *
103
+ * @example Using a pipeline-level parser:
104
+ * ```typescript
105
+ * await pipeline.ingestFiles(["/a.pdf", "/b.docx"], { skipDuplicates: true });
106
+ * ```
107
+ *
108
+ * @example Passing a parser per call:
109
+ * ```typescript
110
+ * await pipeline.ingestFiles(
111
+ * ["/docs/a.pdf", "/docs/b.docx"],
112
+ * new UnstructuredAPIParser({ serverUrl: "http://localhost:8000" }),
113
+ * { strategy: "auto", skipDuplicates: true }
114
+ * );
115
+ * ```
116
+ */
117
+ ingestFiles(filePaths: string[], options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
118
+ ingestFiles(filePaths: string[], parser: DocumentParser, options?: ParseOptions & ChunkOptions & IngestionOptions): Promise<IngestionResult>;
51
119
  /**
52
120
  * Ingest pre-chunked data into the vector store.
53
121
  * Useful when chunking is done separately.
@@ -82,5 +150,9 @@ export declare class IngestionPipeline {
82
150
  * Get the vector store used by this pipeline.
83
151
  */
84
152
  getStore(): VectorStore;
153
+ /**
154
+ * Return the default parser passed to the constructor, or `undefined` when none was given.
155
+ */
156
+ getParser(): DocumentParser | undefined;
85
157
  }
86
158
  //# sourceMappingURL=IngestionPipeline.d.ts.map
@@ -1,6 +1,7 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.IngestionPipeline = void 0;
4
+ const ElementChunker_1 = require("../chunkers/ElementChunker");
4
5
  /**
5
6
  * Pipeline for ingesting documents into a vector store.
6
7
  * Orchestrates the flow: chunk → batch embed → store
@@ -26,10 +27,18 @@ exports.IngestionPipeline = void 0;
26
27
  * ```
27
28
  */
28
29
  class IngestionPipeline {
29
- constructor(chunker, embeddings, store) {
30
+ /**
31
+ * @param chunker - Chunker to split parsed/raw text into chunks
32
+ * @param embeddings - Embeddings provider
33
+ * @param store - Vector store for persistence
34
+ * @param parser - Optional default parser used by {@link ingestFile} and
35
+ * {@link ingestFiles} when no parser is passed at call time
36
+ */
37
+ constructor(chunker, embeddings, store, parser) {
30
38
  this.chunker = chunker;
31
39
  this.embeddings = embeddings;
32
40
  this.store = store;
41
+ this.parser = parser; // may be undefined; ingestFile()/ingestFiles() then need a parser per call
33
42
  }
34
43
  /**
35
44
  * Ingest a single document into the vector store.
@@ -95,6 +104,100 @@ class IngestionPipeline {
95
104
  // Process all chunks together
96
105
  return this.processChunks(allChunks, options ?? {}, startTime);
97
106
  }
107
+ async ingestFile(filePath, parserOrOptions, options) {
108
+ let parser;
109
+ let opts;
110
+ if (parserOrOptions != null && typeof parserOrOptions.parse === "function") {
111
+ parser = parserOrOptions;
112
+ opts = options;
113
+ }
114
+ else {
115
+ parser = this.parser;
116
+ opts = parserOrOptions;
117
+ }
118
+ if (!parser) {
119
+ throw new Error("No parser provided. Pass a DocumentParser to ingestFile() or set one in the IngestionPipeline constructor.");
120
+ }
121
+ const parseOptions = {
122
+ strategy: opts?.strategy,
123
+ languages: opts?.languages,
124
+ };
125
+ const parsed = await parser.parse(filePath, parseOptions);
126
+ const chunkOptions = {
127
+ sourceId: opts?.sourceId,
128
+ sourcePath: opts?.sourcePath ?? filePath,
129
+ metadata: opts?.metadata,
130
+ };
131
+ const ingestionOptions = {
132
+ batchSize: opts?.batchSize,
133
+ onProgress: opts?.onProgress,
134
+ onError: opts?.onError,
135
+ skipDuplicates: opts?.skipDuplicates,
136
+ };
137
+ // When the pipeline uses an ElementChunker and the parser returned
138
+ // structured elements, chunk on element boundaries instead of raw text.
139
+ if (this.chunker instanceof ElementChunker_1.ElementChunker && parsed.elements?.length) {
140
+ const startTime = Date.now();
141
+ const chunks = await this.chunker.chunkElements(parsed.elements, chunkOptions);
142
+ return this.processChunks(chunks, ingestionOptions, startTime);
143
+ }
144
+ return this.ingest(parsed.text, { ...chunkOptions, ...ingestionOptions });
145
+ }
146
+ async ingestFiles(filePaths, parserOrOptions, options) {
147
+ let parser;
148
+ let opts;
149
+ if (parserOrOptions != null && typeof parserOrOptions.parse === "function") {
150
+ parser = parserOrOptions;
151
+ opts = options;
152
+ }
153
+ else {
154
+ parser = this.parser;
155
+ opts = parserOrOptions;
156
+ }
157
+ if (!parser) {
158
+ throw new Error("No parser provided. Pass a DocumentParser to ingestFiles() or set one in the IngestionPipeline constructor.");
159
+ }
160
+ const parseOptions = {
161
+ strategy: opts?.strategy,
162
+ languages: opts?.languages,
163
+ };
164
+ const ingestionOptions = {
165
+ batchSize: opts?.batchSize,
166
+ onProgress: opts?.onProgress,
167
+ onError: opts?.onError,
168
+ skipDuplicates: opts?.skipDuplicates,
169
+ };
170
+ const startTime = Date.now();
171
+ const allChunks = [];
172
+ this.emitProgress(ingestionOptions.onProgress, {
173
+ phase: "chunking",
174
+ processed: 0,
175
+ total: filePaths.length,
176
+ });
177
+ for (let i = 0; i < filePaths.length; i++) {
178
+ const filePath = filePaths[i];
179
+ const parsed = await parser.parse(filePath, parseOptions);
180
+ const chunkOptions = {
181
+ sourceId: opts?.sourceId,
182
+ sourcePath: filePath,
183
+ metadata: opts?.metadata,
184
+ };
185
+ let fileChunks;
186
+ if (this.chunker instanceof ElementChunker_1.ElementChunker && parsed.elements?.length) {
187
+ fileChunks = await this.chunker.chunkElements(parsed.elements, chunkOptions);
188
+ }
189
+ else {
190
+ fileChunks = await this.chunker.chunk(parsed.text, chunkOptions);
191
+ }
192
+ allChunks.push(...fileChunks);
193
+ this.emitProgress(ingestionOptions.onProgress, {
194
+ phase: "chunking",
195
+ processed: i + 1,
196
+ total: filePaths.length,
197
+ });
198
+ }
199
+ return this.processChunks(allChunks, ingestionOptions, startTime);
200
+ }
98
201
  /**
99
202
  * Ingest pre-chunked data into the vector store.
100
203
  * Useful when chunking is done separately.
@@ -261,6 +364,12 @@ class IngestionPipeline {
261
364
  getStore() {
262
365
  return this.store;
263
366
  }
367
+ /**
368
+ * Return the default parser passed to the constructor, or `undefined` when none was given.
369
+ */
370
+ getParser() {
371
+ return this.parser;
372
+ }
264
373
  }
265
374
  exports.IngestionPipeline = IngestionPipeline;
266
375
  //# sourceMappingURL=IngestionPipeline.js.map
@@ -0,0 +1,36 @@
1
+ import { ParsedDocument, ParsedElement, ParseOptions } from "./types";
2
+ /**
3
+ * Abstract base class for document parsers.
4
+ *
5
+ * Implementations wrap third-party libraries (Unstructured, LlamaIndex, etc.)
6
+ * and normalise their output into a {@link ParsedDocument} that can be fed
7
+ * directly into an {@link IngestionPipeline}.
8
+ *
9
+ * All peer dependencies are loaded lazily via dynamic import so that
10
+ * users only need to install what they actually use.
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * const parser = new UnstructuredLocalParser();
15
+ * const doc = await parser.parse("/path/to/report.pdf");
16
+ * console.log(doc.elements?.length, "elements parsed");
17
+ * await pipeline.ingestFile("/path/to/report.pdf", parser);
18
+ * ```
19
+ */
20
+ export declare abstract class DocumentParser {
21
+ /** Human-readable parser identifier (e.g. "unstructured-local") */
22
+ abstract readonly name: string;
23
+ /**
24
+ * Parse a document file and return its content.
25
+ *
26
+ * @param filePath - Absolute or relative path to the file
27
+ * @param options - Optional parsing hints (strategy, languages, etc.)
28
+ */
29
+ abstract parse(filePath: string, options?: ParseOptions): Promise<ParsedDocument>;
30
+ /**
31
+ * Join elements into a single plain-text string.
32
+ * Drops elements whose `text` is empty or missing and joins the remaining texts with a blank line (`"\n\n"`).
33
+ */
34
+ protected elementsToText(elements: ParsedElement[]): string;
35
+ }
36
+ //# sourceMappingURL=DocumentParser.d.ts.map
@@ -0,0 +1,35 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.DocumentParser = void 0;
4
+ /**
5
+ * Abstract base class for document parsers.
6
+ *
7
+ * Implementations wrap third-party libraries (Unstructured, LlamaIndex, etc.)
8
+ * and normalise their output into a {@link ParsedDocument} that can be fed
9
+ * directly into an {@link IngestionPipeline}.
10
+ *
11
+ * All peer dependencies are loaded lazily via dynamic import so that
12
+ * users only need to install what they actually use.
13
+ *
14
+ * @example
15
+ * ```typescript
16
+ * const parser = new UnstructuredLocalParser();
17
+ * const doc = await parser.parse("/path/to/report.pdf");
18
+ * console.log(doc.elements?.length, "elements parsed");
19
+ * await pipeline.ingestFile("/path/to/report.pdf", parser);
20
+ * ```
21
+ */
22
class DocumentParser {
    /**
     * Flatten parsed elements into one plain-text string.
     *
     * Elements whose `text` is falsy (empty or missing) are dropped; the
     * remaining texts are joined with a blank line.
     *
     * @param elements - Parsed elements, in document order
     * @returns The joined text, or `""` when nothing is kept
     */
    elementsToText(elements) {
        const keptTexts = [];
        elements.forEach((element) => {
            const value = element.text;
            if (value) {
                keptTexts.push(value);
            }
        });
        return keptTexts.join("\n\n");
    }
}
34
+ exports.DocumentParser = DocumentParser;
35
+ //# sourceMappingURL=DocumentParser.js.map