@lov3kaizen/agentsea-ingest 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +278 -0
- package/dist/index.d.ts +1558 -0
- package/dist/index.js +4007 -0
- package/dist/index.js.map +1 -0
- package/package.json +89 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,1558 @@
|
|
|
1
|
+
/** String tags naming a document format; the union includes an `'unknown'` member. */
type DocumentType =
    | 'pdf'
    | 'docx'
    | 'html'
    | 'markdown'
    | 'txt'
    | 'csv'
    | 'xlsx'
    | 'pptx'
    | 'email'
    | 'epub'
    | 'json'
    | 'unknown';
|
|
2
|
+
/** String tags naming the kind of a structural element; the union includes an `'unknown'` member. */
type ElementType =
    | 'text'
    | 'title'
    | 'heading'
    | 'paragraph'
    | 'list'
    | 'list_item'
    | 'table'
    | 'image'
    | 'code'
    | 'quote'
    | 'link'
    | 'footnote'
    | 'header'
    | 'footer'
    | 'page_break'
    | 'unknown';
|
|
3
|
+
/**
 * Descriptive fields attached to a document.
 * Every field is optional, so an empty object is a valid value.
 */
interface DocumentMetadata {
    filename?: string;
    mimeType?: string;
    title?: string;
    author?: string;
    createdAt?: Date;
    modifiedAt?: Date;
    pageCount?: number;
    wordCount?: number;
    characterCount?: number;
    language?: string;
    /** NOTE(review): unit is not stated in this declaration (presumably bytes) — confirm. */
    fileSize?: number;
    sourceUrl?: string;
    /** Free-form, caller-defined values. */
    custom?: Record<string, unknown>;
}
|
|
18
|
+
/**
 * Positional and structural information about one chunk.
 * Only `index` is required.
 */
interface ChunkMetadata {
    index: number;
    pageNumber?: number;
    sectionPath?: Array<string>;
    startOffset?: number;
    endOffset?: number;
    elementType?: ElementType;
    /** Id of a parent; what kind of object it identifies is not stated here. */
    parentId?: string;
    childIds?: Array<string>;
    /** Free-form, caller-defined values. */
    custom?: Record<string, unknown>;
}
|
|
29
|
+
/**
 * A piece of document text together with its id, owning document id,
 * token count, metadata and an optional embedding vector.
 */
interface Chunk {
    id: string;
    documentId: string;
    text: string;
    tokenCount: number;
    metadata: ChunkMetadata;
    embedding?: Array<number>;
}
|
|
37
|
+
/**
 * A typed piece of document content.
 * The shape is recursive: an element may carry nested `children`.
 */
interface Element {
    type: ElementType;
    text: string;
    pageNumber?: number;
    bbox?: BoundingBox;
    metadata?: Record<string, unknown>;
    children?: Array<Element>;
}
|
|
45
|
+
/**
 * Rectangle given as a position (`x`, `y`) plus a size (`width`, `height`).
 * NOTE(review): units and coordinate origin are not stated in this declaration.
 */
interface BoundingBox {
    x: number;
    y: number;
    width: number;
    height: number;
}
|
|
51
|
+
/**
 * A table expressed as header cells plus rows of string cells.
 * `raw` can hold an untyped source representation.
 */
interface TableData {
    id: string;
    pageNumber?: number;
    headers: Array<string>;
    rows: Array<Array<string>>;
    raw?: unknown;
    bbox?: BoundingBox;
    caption?: string;
}
|
|
60
|
+
/**
 * An image record: id, dimensions and format are required;
 * location, payload (`url` / `base64`) and text fields are optional.
 */
interface ImageData {
    id: string;
    pageNumber?: number;
    width: number;
    height: number;
    format: string;
    url?: string;
    base64?: string;
    ocrText?: string;
    caption?: string;
    altText?: string;
    bbox?: BoundingBox;
}
|
|
73
|
+
/** Optional enrichment outputs for a piece of text; every field may be absent. */
interface EnrichmentData {
    entities?: Array<Entity>;
    keywords?: Array<string>;
    summary?: string;
    sentiment?: SentimentResult;
    topics?: Array<string>;
    /** Free-form, caller-defined values. */
    custom?: Record<string, unknown>;
}
|
|
81
|
+
/**
 * A typed value with an occurrence count and, optionally,
 * the `start`/`end` positions where it was found.
 */
interface Entity {
    type: string;
    value: string;
    count: number;
    positions?: {
        start: number;
        end: number;
    }[];
}
|
|
90
|
+
/**
 * Sentiment label with a numeric score and optional confidence.
 * NOTE(review): numeric ranges are not stated in this declaration.
 */
interface SentimentResult {
    label:
        | 'positive'
        | 'negative'
        | 'neutral'
        | 'mixed';
    score: number;
    confidence?: number;
}
|
|
95
|
+
/**
 * A processed document: text, metadata, elements, chunks, tables and images,
 * plus a processing timestamp. Enrichment and errors are optional.
 */
interface ProcessedDocument {
    id: string;
    type: DocumentType;
    text: string;
    metadata: DocumentMetadata;
    elements: Array<Element>;
    chunks: Array<Chunk>;
    tables: Array<TableData>;
    images: Array<ImageData>;
    enrichment?: EnrichmentData;
    processedAt: Date;
    errors?: Array<ProcessingError>;
}
|
|
108
|
+
/** An error record: the stage name, a message, and optional untyped details. */
interface ProcessingError {
    stage: string;
    message: string;
    details?: unknown;
}
|
|
113
|
+
/**
 * Describes where a document comes from: a `path`, a `buffer` or a `url`,
 * with optional filename and MIME type.
 * NOTE(review): every field is optional, so the type does not enforce that a source is given.
 */
interface DocumentInput {
    path?: string;
    buffer?: Buffer;
    url?: string;
    filename?: string;
    mimeType?: string;
}
|
|
120
|
+
|
|
121
|
+
/** Boolean flags, one per feature a parser can report; all six are required. */
interface ParserCapabilities {
    text: boolean;
    structure: boolean;
    tables: boolean;
    images: boolean;
    metadata: boolean;
    streaming: boolean;
}
|
|
129
|
+
/**
 * Options shared by all parsers; every field is optional.
 * The format-specific option interfaces extend this one.
 */
interface ParserOptions {
    extractImages?: boolean;
    extractTables?: boolean;
    extractMetadata?: boolean;
    ocrEngine?: OCREngineInterface;
    password?: string;
    maxPages?: number;
    /** NOTE(review): whether bounds are inclusive or 0/1-based is not stated here. */
    pageRange?: {
        start: number;
        end: number;
    };
    /** Free-form, caller-defined values. */
    custom?: Record<string, unknown>;
}
|
|
142
|
+
/** Output of a parse: type, text, elements, tables, images, metadata and optional warnings. */
interface ParseResult {
    type: DocumentType;
    text: string;
    elements: Array<Element>;
    tables: Array<TableData>;
    images: Array<ImageData>;
    metadata: DocumentMetadata;
    warnings?: Array<string>;
}
|
|
151
|
+
/**
 * Contract for a document parser.
 * `parseStream` is optional; the other members are required.
 */
interface Parser {
    readonly name: string;
    readonly supportedMimeTypes: Array<string>;
    readonly supportedExtensions: Array<string>;
    readonly capabilities: ParserCapabilities;
    /** Reports whether this parser accepts the given MIME type and optional extension. */
    canParse(mimeType: string, extension?: string): boolean;
    /** Parses a whole buffer and resolves with the result. */
    parse(buffer: Buffer, options?: ParserOptions): Promise<ParseResult>;
    /** Optional streaming variant that yields elements asynchronously. */
    parseStream?(stream: NodeJS.ReadableStream, options?: ParserOptions): AsyncIterableIterator<Element>;
}
|
|
160
|
+
/**
 * Minimal OCR contract: image buffer in, recognized string out.
 * This is the type accepted by `ParserOptions.ocrEngine`.
 */
interface OCREngineInterface {
    recognize(image: Buffer): Promise<string>;
}
|
|
163
|
+
/** `ParserOptions` plus four optional PDF-specific switches. */
interface PDFParserOptions extends ParserOptions {
    useNativeText?: boolean;
    preserveFormatting?: boolean;
    extractForms?: boolean;
    extractAnnotations?: boolean;
}
|
|
169
|
+
/** `ParserOptions` plus three optional DOCX-specific switches. */
interface DOCXParserOptions extends ParserOptions {
    includeStyles?: boolean;
    preserveNumbering?: boolean;
    extractComments?: boolean;
}
|
|
174
|
+
/** `ParserOptions` plus optional HTML-specific selector, link and base-URL settings. */
interface HTMLParserOptions extends ParserOptions {
    contentSelector?: string;
    excludeSelectors?: Array<string>;
    extractLinks?: boolean;
    baseUrl?: string;
}
|
|
180
|
+
/** `ParserOptions` plus three optional Markdown-specific switches. */
interface MarkdownParserOptions extends ParserOptions {
    gfm?: boolean;
    extractFrontmatter?: boolean;
    preserveCodeBlocks?: boolean;
}
|
|
185
|
+
/** `ParserOptions` plus optional CSV delimiter, header, quote and encoding settings. */
interface CSVParserOptions extends ParserOptions {
    delimiter?: string;
    hasHeader?: boolean;
    quote?: string;
    encoding?: BufferEncoding;
}
|
|
191
|
+
/** `ParserOptions` plus optional Excel sheet-selection, formula and style settings. */
interface ExcelParserOptions extends ParserOptions {
    /** Either an array of strings or an array of numbers; the type does not allow mixing the two. */
    sheets?: Array<string> | Array<number>;
    includeFormulas?: boolean;
    includeStyles?: boolean;
}
|
|
196
|
+
/** Configuration for a parser registry; every field is optional. */
interface ParserRegistryConfig {
    defaultOptions?: ParserOptions;
    customParsers?: Array<Parser>;
    /** Maps a string key (by its name, a MIME type) to the parser to use. */
    mimeTypeOverrides?: Record<string, Parser>;
    registerBuiltIns?: boolean;
}
|
|
202
|
+
|
|
203
|
+
/** String tags naming a chunking strategy; the union includes a `'custom'` member. */
type ChunkingStrategy =
    | 'fixed'
    | 'semantic'
    | 'recursive'
    | 'hierarchical'
    | 'sentence'
    | 'paragraph'
    | 'sliding_window'
    | 'custom';
|
|
204
|
+
/** Function that maps a string to a number (by its name, a token count). */
type TokenCounter = (text: string) => number;
|
|
205
|
+
/**
 * Options shared by all chunkers; every field is optional.
 * The strategy-specific option interfaces extend this one.
 * NOTE(review): precedence between `overlap` and `overlapPercent` is not stated here.
 */
interface ChunkingOptions {
    maxTokens?: number;
    maxCharacters?: number;
    overlap?: number;
    overlapPercent?: number;
    preserveElements?: boolean;
    includeMetadata?: boolean;
    tokenCounter?: TokenCounter;
}
|
|
214
|
+
/** `ChunkingOptions` plus two optional split-boundary switches. */
interface FixedChunkingOptions extends ChunkingOptions {
    splitOnWords?: boolean;
    splitOnSentences?: boolean;
}
|
|
218
|
+
/** `ChunkingOptions` plus an optional similarity threshold, embedding callback and minimum size. */
interface SemanticChunkingOptions extends ChunkingOptions {
    similarityThreshold?: number;
    /** Asynchronous callback turning text into a numeric vector. */
    embedFunction?: (text: string) => Promise<Array<number>>;
    minChunkSize?: number;
}
|
|
223
|
+
/** `ChunkingOptions` plus optional separator list, separator retention flag and minimum size. */
interface RecursiveChunkingOptions extends ChunkingOptions {
    separators?: Array<string>;
    keepSeparator?: boolean;
    minChunkSize?: number;
}
|
|
228
|
+
/** `ChunkingOptions` plus optional heading levels, parent-context flag and depth limit. */
interface HierarchicalChunkingOptions extends ChunkingOptions {
    headingLevels?: Array<number>;
    includeParentContext?: boolean;
    maxDepth?: number;
}
|
|
233
|
+
/** `ChunkingOptions` plus optional sentence-count bounds and delimiter list. */
interface SentenceChunkingOptions extends ChunkingOptions {
    minSentences?: number;
    maxSentences?: number;
    delimiters?: Array<string>;
}
|
|
238
|
+
/** `ChunkingOptions` plus optional paragraph-count bounds and a separator regular expression. */
interface ParagraphChunkingOptions extends ChunkingOptions {
    minParagraphs?: number;
    maxParagraphs?: number;
    separatorPattern?: RegExp;
}
|
|
243
|
+
/**
 * `ChunkingOptions` plus a window and step size.
 * Unlike the sibling option interfaces, both added fields are required.
 */
interface SlidingWindowChunkingOptions extends ChunkingOptions {
    windowSize: number;
    stepSize: number;
}
|
|
247
|
+
/**
 * Contract for a chunker.
 * `chunk` and `chunkElements` may answer synchronously or with a promise,
 * so callers have to handle both; `estimateChunks` is always synchronous.
 */
interface Chunker {
    readonly name: string;
    readonly strategy: ChunkingStrategy;
    chunk(text: string, options?: ChunkingOptions): Array<Chunk> | Promise<Array<Chunk>>;
    chunkElements(elements: Array<Element>, options?: ChunkingOptions): Array<Chunk> | Promise<Array<Chunk>>;
    estimateChunks(text: string, options?: ChunkingOptions): number;
}
|
|
254
|
+
/**
 * Chunks together with summary numbers about the run.
 * NOTE(review): the unit of `processingTime` is not stated here.
 */
interface ChunkResult {
    chunks: Array<Chunk>;
    totalChunks: number;
    averageTokens: number;
    overlapRatio: number;
    processingTime: number;
}
|
|
261
|
+
/** Overlap expressed as a size in either tokens or characters, with an optional context flag. */
interface OverlapConfig {
    size: number;
    unit:
        | 'tokens'
        | 'characters';
    includeContext?: boolean;
}
|
|
266
|
+
/** Contract for locating boundaries in text and classifying the boundary at a position. */
interface BoundaryDetector {
    /** Returns numeric positions for the given text. */
    detectBoundaries(text: string): Array<number>;
    /** Returns the boundary type for `position` within `text`. */
    getBoundaryType(text: string, position: number): BoundaryType;
}
|
|
270
|
+
/** String tags naming a boundary kind; the union includes a `'none'` member. */
type BoundaryType =
    | 'sentence'
    | 'paragraph'
    | 'section'
    | 'page'
    | 'element'
    | 'word'
    | 'none';
|
|
271
|
+
/** Synchronous contract for adding chunks and looking them up by query, metadata or id. */
interface ChunkIndex {
    add(chunk: Chunk): void;
    search(query: string, limit?: number): Array<Chunk>;
    /** The filter may contain any subset of the `ChunkMetadata` fields. */
    findByMetadata(filter: Partial<ChunkMetadata>): Array<Chunk>;
    /** Yields `undefined` when no chunk is returned for `id`. */
    get(id: string): Chunk | undefined;
    getAll(): Array<Chunk>;
}
|
|
278
|
+
/** Configuration for a chunker registry; every field is optional. */
interface ChunkerRegistryConfig {
    defaultOptions?: ChunkingOptions;
    customChunkers?: Chunker[];
    /**
     * Per-strategy chunker overrides.
     *
     * Declared as a partial record so a caller can override one strategy
     * without supplying a chunker for every member of `ChunkingStrategy`.
     * A plain `Record<ChunkingStrategy, Chunker>` makes all keys mandatory.
     */
    strategyOverrides?: Partial<Record<ChunkingStrategy, Chunker>>;
    registerBuiltIns?: boolean;
}
|
|
284
|
+
|
|
285
|
+
/** Options for table extraction; every field is optional. */
interface TableExtractionOptions {
    detectMergedCells?: boolean;
    preserveFormatting?: boolean;
    detectHeaders?: boolean;
    minRows?: number;
    minColumns?: number;
    outputFormat?: TableOutputFormat;
}
|
|
293
|
+
/** String tags naming a table output format. */
type TableOutputFormat =
    | 'array'
    | 'csv'
    | 'json'
    | 'markdown'
    | 'html';
|
|
294
|
+
/** `TableData` plus a required confidence and optional table type and structure. */
interface ExtractedTable extends TableData {
    confidence: number;
    tableType?:
        | 'data'
        | 'layout'
        | 'form';
    structure?: TableStructure;
}
|
|
299
|
+
/** Header row/column counts, with optional merged-cell regions and per-column types. */
interface TableStructure {
    headerRows: number;
    headerColumns: number;
    mergedCells?: Array<MergedCellRegion>;
    columnTypes?: Array<ColumnType>;
}
|
|
305
|
+
/**
 * A cell range given by start and end row/column numbers.
 * NOTE(review): index base and end inclusivity are not stated here.
 */
interface MergedCellRegion {
    startRow: number;
    startCol: number;
    endRow: number;
    endCol: number;
}
|
|
311
|
+
/** Type tag for one column, identified by `index`, with an optional format string. */
interface ColumnType {
    index: number;
    type:
        | 'text'
        | 'number'
        | 'date'
        | 'currency'
        | 'percentage'
        | 'mixed';
    format?: string;
}
|
|
316
|
+
/** Options for image extraction; every field is optional. */
interface ImageExtractionOptions {
    minWidth?: number;
    minHeight?: number;
    outputFormat?: ImageOutputFormat;
    /** NOTE(review): the scale of `quality` is not stated here. */
    quality?: number;
    embeddedOnly?: boolean;
    includeData?: boolean;
    runOcr?: boolean;
    generateCaptions?: boolean;
}
|
|
326
|
+
/** String tags naming an image output format; the union includes an `'original'` member. */
type ImageOutputFormat =
    | 'png'
    | 'jpeg'
    | 'webp'
    | 'original';
|
|
327
|
+
/** `ImageData` plus a required confidence and optional image type, color analysis and labels. */
interface ExtractedImage extends ImageData {
    confidence: number;
    imageType?: ImageType;
    colorAnalysis?: ColorAnalysis;
    labels?: Array<string>;
}
|
|
333
|
+
/** String tags naming an image category; the union includes an `'unknown'` member. */
type ImageType =
    | 'photo'
    | 'diagram'
    | 'chart'
    | 'screenshot'
    | 'logo'
    | 'icon'
    | 'illustration'
    | 'unknown';
|
|
334
|
+
/**
 * Color summary: dominant colors as strings, a grayscale flag and a brightness number.
 * NOTE(review): the color string format and brightness scale are not stated here.
 */
interface ColorAnalysis {
    dominantColors: Array<string>;
    isGrayscale: boolean;
    brightness: number;
}
|
|
339
|
+
/** Four optional boolean switches for metadata extraction. */
interface MetadataExtractionOptions {
    standard?: boolean;
    custom?: boolean;
    statistics?: boolean;
    parseDates?: boolean;
}
|
|
345
|
+
/**
 * `DocumentMetadata` plus further optional descriptive fields.
 * `customProperties` values are limited to string, number, boolean or Date.
 */
interface ExtendedMetadata extends DocumentMetadata {
    keywords?: Array<string>;
    subject?: string;
    category?: string;
    version?: string;
    status?: string;
    contributors?: Array<string>;
    publisher?: string;
    copyright?: string;
    customProperties?: Record<string, string | number | boolean | Date>;
}
|
|
356
|
+
/**
 * Numeric counts describing a document; all ten fields are required.
 * NOTE(review): the unit of `readingTime` is not stated here.
 */
interface DocumentStatistics {
    pageCount: number;
    wordCount: number;
    characterCount: number;
    characterCountNoSpaces: number;
    paragraphCount: number;
    sentenceCount: number;
    lineCount: number;
    tableCount: number;
    imageCount: number;
    readingTime: number;
}
|
|
368
|
+
/** Options for link extraction; every field is optional. */
interface LinkExtractionOptions {
    internal?: boolean;
    external?: boolean;
    anchors?: boolean;
    resolveRelative?: boolean;
    baseUrl?: string;
}
|
|
375
|
+
/** A link: URL and type tag are required; text, title and position are optional. */
interface ExtractedLink {
    url: string;
    text?: string;
    title?: string;
    type:
        | 'internal'
        | 'external'
        | 'anchor'
        | 'mailto'
        | 'tel';
    position?: LinkPosition;
}
|
|
382
|
+
/** Optional location fields for a link: page, element index and character offsets. */
interface LinkPosition {
    pageNumber?: number;
    elementIndex?: number;
    startOffset?: number;
    endOffset?: number;
}
|
|
388
|
+
/** A form field: name and type tag are required; value, label, options and box are optional. */
interface FormField {
    name: string;
    type:
        | 'text'
        | 'checkbox'
        | 'radio'
        | 'select'
        | 'textarea'
        | 'date'
        | 'number';
    value?: string | boolean | number;
    label?: string;
    required?: boolean;
    options?: Array<string>;
    bbox?: BoundingBox;
}
|
|
397
|
+
/**
 * An annotation: id and type tag are required; the rest is optional.
 * NOTE(review): the difference between `text` and `content` is not stated here.
 */
interface Annotation {
    id: string;
    type:
        | 'highlight'
        | 'underline'
        | 'strikeout'
        | 'comment'
        | 'sticky_note';
    text?: string;
    content?: string;
    author?: string;
    createdAt?: Date;
    pageNumber?: number;
    bbox?: BoundingBox;
}
|
|
407
|
+
/**
 * Generic extractor contract.
 *
 * @typeParam T - Item type produced by `extract`.
 * @typeParam O - Options type accepted by `extract`; defaults to `unknown`.
 */
interface Extractor<T, O = unknown> {
    readonly name: string;
    /** Resolves with an array of extracted items. */
    extract(buffer: Buffer, options?: O): Promise<Array<T>>;
    /** Reports whether the given MIME type is supported. */
    isSupported(mimeType: string): boolean;
}
|
|
412
|
+
/** Extractor of `ExtractedTable` items that can also render a table as a string in a given format. */
interface TableExtractor extends Extractor<ExtractedTable, TableExtractionOptions> {
    convertTo(table: ExtractedTable, format: TableOutputFormat): string;
}
|
|
415
|
+
/** Extractor of `ExtractedImage` items that can also resolve a buffer for an image. */
interface ImageExtractor extends Extractor<ExtractedImage, ImageExtractionOptions> {
    getImageBuffer(image: ExtractedImage): Promise<Buffer>;
}
|
|
418
|
+
/** Extractor of `ExtendedMetadata` items that can also resolve document statistics for a buffer. */
interface MetadataExtractor extends Extractor<ExtendedMetadata, MetadataExtractionOptions> {
    getStatistics(buffer: Buffer): Promise<DocumentStatistics>;
}
|
|
421
|
+
|
|
422
|
+
/** String tags naming an OCR engine; the union includes a `'custom'` member. */
type OCREngineType =
    | 'tesseract'
    | 'google_vision'
    | 'aws_textract'
    | 'azure_vision'
    | 'custom';
|
|
423
|
+
/** OCR configuration: the engine tag is required; everything else is optional. */
interface OCRConfig {
    engine: OCREngineType;
    languages?: Array<string>;
    pageSegMode?: PageSegmentationMode;
    /** Untyped engine-specific settings. */
    engineConfig?: Record<string, unknown>;
    preprocessing?: OCRPreprocessingOptions;
    /** NOTE(review): the scale of the threshold is not stated here. */
    confidenceThreshold?: number;
}
|
|
431
|
+
/** String tags naming a page segmentation mode. */
type PageSegmentationMode =
    | 'auto'
    | 'single_block'
    | 'single_column'
    | 'single_line'
    | 'single_word'
    | 'single_char'
    | 'sparse_text';
|
|
432
|
+
/** Optional image preprocessing switches and numeric settings for OCR. */
interface OCRPreprocessingOptions {
    deskew?: boolean;
    denoise?: boolean;
    binarize?: boolean;
    binarizeThreshold?: number;
    scale?: number;
    enhanceContrast?: boolean;
    removeBorders?: boolean;
}
|
|
441
|
+
/**
 * Result of an OCR run: text, confidence, timing and engine tag are required;
 * language and detailed blocks are optional.
 */
interface OCRResult {
    text: string;
    confidence: number;
    language?: string;
    blocks?: Array<OCRBlock>;
    processingTime: number;
    engine: OCREngineType;
}
|
|
449
|
+
/** A recognized block with its type, text, confidence and bounding box, plus optional child elements. */
interface OCRBlock {
    type: OCRBlockType;
    text: string;
    confidence: number;
    bbox: BoundingBox;
    children?: Array<OCRElement>;
}
|
|
456
|
+
/** String tags naming an OCR block kind; the union includes an `'unknown'` member. */
type OCRBlockType =
    | 'paragraph'
    | 'line'
    | 'word'
    | 'table'
    | 'figure'
    | 'unknown';
|
|
457
|
+
/** A line, word or character inside an OCR block, with optional font information. */
interface OCRElement {
    type:
        | 'line'
        | 'word'
        | 'character';
    text: string;
    confidence: number;
    bbox: BoundingBox;
    font?: OCRFontInfo;
}
|
|
464
|
+
/** Optional font attributes: name, size and four style flags. */
interface OCRFontInfo {
    name?: string;
    size?: number;
    bold?: boolean;
    italic?: boolean;
    underline?: boolean;
    monospace?: boolean;
}
|
|
472
|
+
/**
 * Full OCR engine contract with explicit `initialize` / `terminate` methods.
 * `recognizeUrl`, `recognizeBatch` and `detectTextRegions` are optional.
 * Note that `recognize` here resolves with an `OCRResult`, whereas
 * `OCREngineInterface.recognize` resolves with a plain string.
 */
interface OCREngine {
    readonly name: string;
    readonly type: OCREngineType;
    readonly supportedLanguages: Array<string>;
    initialize(): Promise<void>;
    isInitialized(): boolean;
    recognize(image: Buffer, options?: OCROptions): Promise<OCRResult>;
    recognizeUrl?(url: string, options?: OCROptions): Promise<OCRResult>;
    recognizeBatch?(images: Array<Buffer>, options?: OCROptions): Promise<Array<OCRResult>>;
    detectTextRegions?(image: Buffer): Promise<Array<BoundingBox>>;
    terminate(): Promise<void>;
}
|
|
484
|
+
/** Per-call OCR options; every field is optional. `roi` is a bounding box. */
interface OCROptions {
    languages?: Array<string>;
    pageSegMode?: PageSegmentationMode;
    includeDetails?: boolean;
    preprocessing?: OCRPreprocessingOptions;
    roi?: BoundingBox;
}
|
|
491
|
+
/**
 * Tesseract-specific settings; every field is optional.
 * `oem` is restricted to the literals 0-3, while `psm` accepts any number.
 */
interface TesseractConfig {
    dataPath?: string;
    workerCount?: number;
    cacheWorkers?: boolean;
    oem?: 0 | 1 | 2 | 3;
    psm?: number;
    whitelist?: string;
    blacklist?: string;
}
|
|
500
|
+
/**
 * Google Vision settings; every top-level field is optional.
 * `credentials` accepts either a string or an untyped object.
 */
interface GoogleVisionConfig {
    credentials?: string | Record<string, unknown>;
    features?: Array<GoogleVisionFeature>;
    imageContext?: {
        languageHints?: Array<string>;
        cropHintsParams?: {
            aspectRatios: Array<number>;
        };
    };
}
|
|
510
|
+
/** String tags naming a Google Vision feature. */
type GoogleVisionFeature =
    | 'TEXT_DETECTION'
    | 'DOCUMENT_TEXT_DETECTION'
    | 'LABEL_DETECTION'
    | 'LOGO_DETECTION'
    | 'FACE_DETECTION';
|
|
511
|
+
/**
 * AWS Textract settings; every top-level field is optional.
 * When `credentials` is given, both keys are required.
 */
interface AWSTextractConfig {
    region?: string;
    credentials?: {
        accessKeyId: string;
        secretAccessKey: string;
    };
    featureTypes?: Array<'TABLES' | 'FORMS' | 'QUERIES' | 'SIGNATURES' | 'LAYOUT'>;
}
|
|
519
|
+
/** Azure Vision settings: endpoint and API key are required; version fields are optional. */
interface AzureVisionConfig {
    endpoint: string;
    apiKey: string;
    apiVersion?: string;
    modelVersion?:
        | 'latest'
        | '2022-04-30'
        | '2023-02-28-preview';
}
|
|
525
|
+
/**
 * Configuration for an OCR engine factory: a required default engine tag
 * and optional per-engine settings keyed by engine name.
 * NOTE(review): `engines` has no slot for the `'custom'` engine tag.
 */
interface OCREngineFactoryConfig {
    defaultEngine: OCREngineType;
    engines?: {
        tesseract?: TesseractConfig;
        google_vision?: GoogleVisionConfig;
        aws_textract?: AWSTextractConfig;
        azure_vision?: AzureVisionConfig;
    };
}
|
|
534
|
+
/** Quality summary for OCR output: two numbers, a three-level noise tag and optional suggestions. */
interface OCRQualityMetrics {
    averageConfidence: number;
    lowConfidenceWords: number;
    noiseLevel:
        | 'low'
        | 'medium'
        | 'high';
    suggestions?: Array<string>;
}
|
|
540
|
+
|
|
541
|
+
/** String tags naming a text-cleaning operation; the union includes a `'custom'` member. */
type CleaningOperation =
    | 'normalize_whitespace'
    | 'remove_extra_whitespace'
    | 'normalize_unicode'
    | 'remove_control_chars'
    | 'fix_encoding'
    | 'remove_html_tags'
    | 'decode_html_entities'
    | 'remove_urls'
    | 'remove_emails'
    | 'remove_phone_numbers'
    | 'remove_special_chars'
    | 'lowercase'
    | 'uppercase'
    | 'trim'
    | 'remove_punctuation'
    | 'remove_numbers'
    | 'remove_stopwords'
    | 'stem'
    | 'lemmatize'
    | 'fix_hyphenation'
    | 'merge_lines'
    | 'remove_headers_footers'
    | 'remove_page_numbers'
    | 'deduplicate_lines'
    | 'custom';
|
|
542
|
+
/** Cleaning configuration: the operation list is required; the rest is optional. */
interface CleaningConfig {
    operations: Array<CleaningOperation>;
    /** Handlers keyed by an arbitrary string name. */
    customOperations?: Record<string, CleaningHandler>;
    preservePatterns?: Array<RegExp>;
    language?: string;
    stopwords?: Array<string>;
}
|
|
549
|
+
/** Synchronous function that takes text plus optional cleaning options and returns a string. */
type CleaningHandler = (
    text: string,
    options?: CleaningOptions,
) => string;
|
|
550
|
+
/**
 * Options passed to cleaning operations; every field is optional.
 * NOTE(review): how `replacePatterns` and `replacements` interact is not stated here.
 */
interface CleaningOptions {
    preserveNewlines?: boolean;
    preserveCase?: boolean;
    minWordLength?: number;
    maxNewlines?: number;
    removePatterns?: Array<RegExp>;
    replacePatterns?: {
        pattern: RegExp;
        replacement: string;
    }[];
    encoding?: BufferEncoding;
    replacements?: Record<string, string>;
}
|
|
563
|
+
/**
 * Outcome of a cleaning run: the text, two lengths, the operations applied,
 * per-operation change records and a timing number. All fields are required.
 */
interface CleaningResult {
    text: string;
    originalLength: number;
    cleanedLength: number;
    operationsApplied: Array<CleaningOperation>;
    changes: Array<CleaningChange>;
    processingTime: number;
}
|
|
571
|
+
/** Change record for one operation: a count and optional sample strings. */
interface CleaningChange {
    operation: CleaningOperation;
    count: number;
    samples?: Array<string>;
}
|
|
576
|
+
/** Contract for a named text normalizer with a normalize method and a pre-check. */
interface TextNormalizer {
    readonly name: string;
    /** Returns a string for the given text and optional options. */
    normalize(text: string, options?: NormalizationOptions): string;
    /** Returns a boolean for the given text. */
    needsNormalization(text: string): boolean;
}
|
|
581
|
+
/**
 * Normalization switches; every field is optional.
 * `unicodeForm` is limited to the four Unicode normalization form names.
 */
interface NormalizationOptions {
    unicodeForm?:
        | 'NFC'
        | 'NFD'
        | 'NFKC'
        | 'NFKD';
    normalizeQuotes?: boolean;
    normalizeDashes?: boolean;
    normalizeEllipsis?: boolean;
    removeAccents?: boolean;
    toAscii?: boolean;
}
|
|
589
|
+
/** Deduplication settings; every field is optional. */
interface DeduplicationOptions {
    threshold?: number;
    scope?:
        | 'exact'
        | 'fuzzy'
        | 'semantic';
    hashAlgorithm?:
        | 'md5'
        | 'sha256'
        | 'simhash'
        | 'minhash';
    ngramSize?: number;
    keep?:
        | 'first'
        | 'last';
    compareFields?: Array<string>;
}
|
|
597
|
+
/**
 * Outcome of deduplicating items of type `T`.
 *
 * @typeParam T - Item type being deduplicated.
 */
interface DeduplicationResult<T> {
    unique: Array<T>;
    duplicates: Array<DuplicateGroup<T>>;
    duplicateCount: number;
    processingTime: number;
}
|
|
603
|
+
/**
 * One group of duplicates: the item that was kept, the items removed,
 * and optional similarity numbers.
 *
 * @typeParam T - Item type being deduplicated.
 */
interface DuplicateGroup<T> {
    kept: T;
    removed: Array<T>;
    similarities?: Array<number>;
}
|
|
608
|
+
/** Contract for a named, synchronous content filter. */
interface ContentFilter {
    readonly name: string;
    /** Returns a boolean for the given content. */
    shouldFilter(content: string): boolean;
    /** Returns a string for the given content. */
    filter(content: string): string;
    /** Returns a reason string, or `null`. */
    getFilterReason(content: string): string | null;
}
|
|
614
|
+
/** Content filter settings; every field is optional. */
interface ContentFilterOptions {
    filterProfanity?: boolean;
    filterPII?: boolean;
    customPatterns?: Array<RegExp>;
    replacement?: string;
    mask?: boolean;
    piiTypes?: Array<PIIType>;
}
|
|
622
|
+
/** String tags naming a category of personally identifiable information. */
type PIIType =
    | 'email'
    | 'phone'
    | 'ssn'
    | 'credit_card'
    | 'address'
    | 'name'
    | 'date_of_birth'
    | 'ip_address'
    | 'passport'
    | 'drivers_license';
|
|
623
|
+
/** Result of PII detection: the instances, a total and a four-level risk tag. */
interface PIIDetectionResult {
    instances: Array<PIIInstance>;
    totalFound: number;
    riskLevel:
        | 'none'
        | 'low'
        | 'medium'
        | 'high';
}
|
|
628
|
+
/**
 * One PII occurrence: its category, the value, a `start`/`end` position and a confidence.
 * Unlike `Entity.positions`, `position` is a single required object.
 */
interface PIIInstance {
    type: PIIType;
    value: string;
    position: {
        start: number;
        end: number;
    };
    confidence: number;
}
|
|
637
|
+
/** Header/footer detection settings: `method` is required; the tuning fields are optional. */
interface HeaderFooterOptions {
    method:
        | 'pattern'
        | 'position'
        | 'similarity';
    patterns?: Array<RegExp>;
    positionThreshold?: number;
    similarityThreshold?: number;
    minOccurrences?: number;
}
|
|
644
|
+
/** A detected header or footer: its content, the page numbers and a confidence. */
interface DetectedHeaderFooter {
    type:
        | 'header'
        | 'footer';
    content: string;
    pages: Array<number>;
    confidence: number;
}
|
|
650
|
+
/** Contract for a named, synchronous text cleaner. */
interface TextCleaner {
    readonly name: string;
    /** Takes text and a config, and returns a `CleaningResult`. */
    clean(text: string, config: CleaningConfig): CleaningResult;
    /** Takes text, one operation and optional options, and returns a string. */
    applyOperation(text: string, operation: CleaningOperation, options?: CleaningOptions): string;
    getAvailableOperations(): Array<CleaningOperation>;
}
|
|
656
|
+
|
|
657
|
+
/** String tags naming an enrichment kind; the union includes a `'custom'` member. */
type EnrichmentType =
    | 'entities'
    | 'keywords'
    | 'summary'
    | 'sentiment'
    | 'topics'
    | 'language'
    | 'classification'
    | 'embeddings'
    | 'relations'
    | 'custom';
|
|
658
|
+
/**
 * Enrichment configuration: the list of enrichments is required;
 * per-enrichment settings and custom handlers are optional.
 * NOTE(review): there is no settings slot for the `'language'` or `'relations'`
 * enrichment tags, even though a `RelationConfig` type is declared.
 */
interface EnrichmentConfig {
    enrichments: Array<EnrichmentType>;
    entities?: EntityExtractionConfig;
    keywords?: KeywordExtractionConfig;
    summary?: SummarizationConfig;
    sentiment?: SentimentConfig;
    topics?: TopicConfig;
    classification?: ClassificationConfig;
    embeddings?: EmbeddingConfig;
    /** Handlers keyed by an arbitrary string name. */
    customHandlers?: Record<string, EnrichmentHandler>;
}
|
|
669
|
+
/** Asynchronous function that takes text plus untyped options and resolves with an untyped value. */
type EnrichmentHandler = (
    text: string,
    options?: unknown,
) => Promise<unknown>;
|
|
670
|
+
/** Entity extraction settings; every field is optional. */
interface EntityExtractionConfig {
    types?: Array<EntityType>;
    useLLM?: boolean;
    llmModel?: string;
    mergeOverlapping?: boolean;
    minConfidence?: number;
    maxEntities?: number;
    /** Regular expressions keyed by an arbitrary string name. */
    customPatterns?: Record<string, RegExp>;
}
|
|
679
|
+
/** Upper-case string tags naming an entity category; the union includes a `'CUSTOM'` member. */
type EntityType =
    | 'PERSON'
    | 'ORGANIZATION'
    | 'LOCATION'
    | 'DATE'
    | 'TIME'
    | 'MONEY'
    | 'PERCENT'
    | 'EMAIL'
    | 'PHONE'
    | 'URL'
    | 'PRODUCT'
    | 'EVENT'
    | 'WORK_OF_ART'
    | 'LAW'
    | 'LANGUAGE'
    | 'QUANTITY'
    | 'ORDINAL'
    | 'CARDINAL'
    | 'CUSTOM';
|
|
680
|
+
/**
 * `Entity` plus a typed category, a confidence and optional descriptive fields.
 * The inherited `type` stays a free string; `entityType` is the constrained tag.
 */
interface ExtendedEntity extends Entity {
    entityType: EntityType;
    confidence: number;
    normalizedValue?: string;
    wikiLink?: string;
    description?: string;
    relatedEntities?: Array<string>;
}
|
|
688
|
+
/** Keyword extraction settings; every field is optional. */
interface KeywordExtractionConfig {
    method?: KeywordExtractionMethod;
    maxKeywords?: number;
    minScore?: number;
    includePhrases?: boolean;
    maxPhraseLength?: number;
    stopwords?: Array<string>;
    language?: string;
}
|
|
697
|
+
/** String tags naming a keyword extraction method. */
type KeywordExtractionMethod =
    | 'tfidf'
    | 'textrank'
    | 'rake'
    | 'yake'
    | 'keybert'
    | 'llm'
    | 'frequency';
|
|
698
|
+
/** A keyword with its score, count and phrase flag, plus optional `start`/`end` positions. */
interface ExtractedKeyword {
    keyword: string;
    score: number;
    count: number;
    isPhrase: boolean;
    positions?: {
        start: number;
        end: number;
    }[];
}
|
|
708
|
+
/**
 * Summarization settings; every field is optional.
 * `lengthUnit` names the unit that `maxLength` is counted in.
 */
interface SummarizationConfig {
    type?: SummaryType;
    maxLength?: number;
    lengthUnit?:
        | 'words'
        | 'sentences'
        | 'characters';
    model?: string;
    focusAspects?: Array<string>;
    bulletPoints?: boolean;
    targetAudience?: string;
}
|
|
717
|
+
/** String tags naming a summary style. */
type SummaryType =
    | 'extractive'
    | 'abstractive'
    | 'hybrid'
    | 'key_points';
|
|
718
|
+
/** A summary: text, style and compression ratio are required; key sentences and topics are optional. */
interface SummaryResult {
    text: string;
    type: SummaryType;
    compressionRatio: number;
    keySentences?: Array<string>;
    topics?: Array<string>;
}
|
|
725
|
+
/** Sentiment analysis settings; every field is optional. */
interface SentimentConfig {
    granularity?:
        | 'document'
        | 'paragraph'
        | 'sentence';
    aspects?: Array<string>;
    includeEmotions?: boolean;
    model?: string;
}
|
|
731
|
+
/** `SentimentResult` plus optional per-aspect results, emotions and text segments. */
interface ExtendedSentimentResult extends SentimentResult {
    /** Sentiment results keyed by an arbitrary aspect string. */
    aspects?: Record<string, SentimentResult>;
    emotions?: Array<EmotionResult>;
    segments?: Array<SentimentSegment>;
}
|
|
736
|
+
/** An emotion tag paired with a numeric score. */
interface EmotionResult {
    emotion: EmotionType;
    score: number;
}
|
|
740
|
+
/** String tags naming an emotion; the union includes a `'neutral'` member. */
type EmotionType =
    | 'joy'
    | 'sadness'
    | 'anger'
    | 'fear'
    | 'surprise'
    | 'disgust'
    | 'trust'
    | 'anticipation'
    | 'neutral';
|
|
741
|
+
/** A text segment with its sentiment and `start`/`end` numbers. */
interface SentimentSegment {
    text: string;
    sentiment: SentimentResult;
    start: number;
    end: number;
}
|
|
747
|
+
/** Topic detection settings; every field is optional. */
interface TopicConfig {
    method?: TopicMethod;
    numTopics?: number;
    wordsPerTopic?: number;
    predefinedTopics?: Array<string>;
    model?: string;
}
|
|
754
|
+
/** String tags naming a topic detection method. */
type TopicMethod =
    | 'lda'
    | 'nmf'
    | 'bertopic'
    | 'llm'
    | 'zero_shot';
|
|
755
|
+
/** A topic label with a score, optional top words and an optional predefined flag. */
interface TopicResult {
    label: string;
    score: number;
    topWords?: Array<string>;
    isPredefined?: boolean;
}
|
|
761
|
+
/** Classification settings: the candidate labels are required; the rest is optional. */
interface ClassificationConfig {
    labels: Array<string>;
    multiLabel?: boolean;
    minConfidence?: number;
    model?: string;
    customPrompt?: string;
}
|
|
768
|
+
/** Classification outcome: scored labels, the primary label and a confidence. */
interface ClassificationResult {
    labels: {
        label: string;
        score: number;
    }[];
    primaryLabel: string;
    confidence: number;
}
|
|
776
|
+
/** Embedding settings: the model name is required; the rest is optional. */
interface EmbeddingConfig {
    model: string;
    dimensions?: number;
    normalize?: boolean;
    batchSize?: number;
}
|
|
782
|
+
/** An embedding vector with the model name and a token count. */
interface EmbeddingResult {
    embedding: Array<number>;
    model: string;
    tokenCount: number;
}
|
|
787
|
+
/** Relation extraction settings; every field is optional. */
interface RelationConfig {
    relationTypes?: Array<string>;
    useLLM?: boolean;
    model?: string;
}
|
|
792
|
+
/** A relation between a subject entity and an object entity, with a confidence and optional source text. */
interface ExtractedRelation {
    subject: ExtendedEntity;
    relation: string;
    object: ExtendedEntity;
    confidence: number;
    sourceText?: string;
}
|
|
799
|
+
/** Contract for a named enricher that declares which enrichment types it supports. */
interface Enricher {
    readonly name: string;
    readonly supportedTypes: Array<EnrichmentType>;
    /** Resolves with enrichment data for the given text and config. */
    enrich(text: string, config: EnrichmentConfig): Promise<EnrichmentData>;
    /** Returns a boolean for the given enrichment type. */
    supports(type: EnrichmentType): boolean;
}
|
|
805
|
+
/**
 * `EnrichmentData` plus optional detailed results for each enrichment kind.
 * `processingTime` is the only required field added here.
 */
interface EnrichmentResult extends EnrichmentData {
    extendedEntities?: Array<ExtendedEntity>;
    extendedKeywords?: Array<ExtractedKeyword>;
    summaryResult?: SummaryResult;
    extendedSentiment?: ExtendedSentimentResult;
    topicResults?: Array<TopicResult>;
    classificationResults?: ClassificationResult;
    embeddingResult?: EmbeddingResult;
    relations?: Array<ExtractedRelation>;
    processingTime: number;
}
|
|
816
|
+
|
|
817
|
+
/** String tags naming a storage adapter; the union includes a `'custom'` member. */
type StorageAdapterType =
    | 'memory'
    | 'file'
    | 'sqlite'
    | 'postgres'
    | 'mongodb'
    | 'redis'
    | 'pinecone'
    | 'weaviate'
    | 'qdrant'
    | 'chroma'
    | 'custom';
|
|
818
|
+
/** Storage configuration: the adapter tag is required; connection and naming fields are optional. */
interface StorageConfig {
    adapter: StorageAdapterType;
    connectionString?: string;
    database?: string;
    documentsCollection?: string;
    chunksCollection?: string;
    /** Untyped adapter-specific settings. */
    options?: Record<string, unknown>;
}
|
|
826
|
+
/**
 * Asynchronous persistence contract for `ProcessedDocument` values.
 *
 * Covers lifecycle (`initialize` / `close`), single and batch store, get and
 * delete, partial `update`, paginated `list`, and query-based `search` and
 * `count`. `get` and `getBatch` are typed to yield `null` entries, so callers
 * must handle missing documents. `store` and `storeBatch` resolve to strings
 * (presumably the stored ids — confirm against an implementation).
 */
interface DocumentStorage {
|
|
827
|
+
readonly name: string;
|
|
828
|
+
readonly type: StorageAdapterType;
|
|
829
|
+
initialize(): Promise<void>;
|
|
830
|
+
store(document: ProcessedDocument): Promise<string>;
|
|
831
|
+
storeBatch(documents: ProcessedDocument[]): Promise<string[]>;
|
|
832
|
+
get(id: string): Promise<ProcessedDocument | null>;
|
|
833
|
+
getBatch(ids: string[]): Promise<(ProcessedDocument | null)[]>;
|
|
834
|
+
update(id: string, updates: Partial<ProcessedDocument>): Promise<void>;
|
|
835
|
+
delete(id: string): Promise<void>;
|
|
836
|
+
deleteBatch(ids: string[]): Promise<void>;
|
|
837
|
+
list(options?: ListOptions): Promise<ListResult<ProcessedDocument>>;
|
|
838
|
+
search(query: DocumentQuery): Promise<ProcessedDocument[]>;
|
|
839
|
+
count(query?: DocumentQuery): Promise<number>;
|
|
840
|
+
exists(id: string): Promise<boolean>;
|
|
841
|
+
close(): Promise<void>;
|
|
842
|
+
}
|
|
843
|
+
/**
 * Asynchronous persistence contract for `Chunk` values.
 *
 * Besides id-based store, get, update and delete, chunks can be fetched or
 * deleted by their owning `documentId`. `count` accepts an optional
 * `documentId`. `get` and `getBatch` are typed to yield `null` entries.
 */
interface ChunkStorage {
|
|
844
|
+
readonly name: string;
|
|
845
|
+
readonly type: StorageAdapterType;
|
|
846
|
+
initialize(): Promise<void>;
|
|
847
|
+
store(chunk: Chunk): Promise<string>;
|
|
848
|
+
storeBatch(chunks: Chunk[]): Promise<string[]>;
|
|
849
|
+
get(id: string): Promise<Chunk | null>;
|
|
850
|
+
getBatch(ids: string[]): Promise<(Chunk | null)[]>;
|
|
851
|
+
getByDocumentId(documentId: string): Promise<Chunk[]>;
|
|
852
|
+
update(id: string, updates: Partial<Chunk>): Promise<void>;
|
|
853
|
+
delete(id: string): Promise<void>;
|
|
854
|
+
deleteByDocumentId(documentId: string): Promise<void>;
|
|
855
|
+
list(options?: ListOptions): Promise<ListResult<Chunk>>;
|
|
856
|
+
count(documentId?: string): Promise<number>;
|
|
857
|
+
close(): Promise<void>;
|
|
858
|
+
}
|
|
859
|
+
/**
 * Asynchronous contract for a vector index over `Chunk` values.
 *
 * Exposes a fixed `dimensions` value, chunk storage, vector `search`,
 * embedding updates, deletion by chunk id or by `documentId`, and index
 * statistics. `searchByText` is an optional member, so callers must check
 * that it exists before invoking it.
 */
interface VectorStorage {
|
|
860
|
+
readonly name: string;
|
|
861
|
+
readonly type: StorageAdapterType;
|
|
862
|
+
readonly dimensions: number;
|
|
863
|
+
initialize(): Promise<void>;
|
|
864
|
+
store(chunk: Chunk): Promise<string>;
|
|
865
|
+
storeBatch(chunks: Chunk[]): Promise<string[]>;
|
|
866
|
+
search(vector: number[], options?: VectorSearchOptions): Promise<VectorSearchResult[]>;
|
|
867
|
+
searchByText?(text: string, options?: VectorSearchOptions): Promise<VectorSearchResult[]>;
|
|
868
|
+
updateEmbedding(id: string, embedding: number[]): Promise<void>;
|
|
869
|
+
delete(id: string): Promise<void>;
|
|
870
|
+
deleteByDocumentId(documentId: string): Promise<void>;
|
|
871
|
+
getStats(): Promise<VectorIndexStats>;
|
|
872
|
+
close(): Promise<void>;
|
|
873
|
+
}
|
|
874
|
+
/**
 * Optional paging, sorting and filtering parameters for `list` calls.
 * Both `offset` and `cursor` are declared; how an adapter combines them is
 * not visible here.
 */
interface ListOptions {
|
|
875
|
+
limit?: number;
|
|
876
|
+
offset?: number;
|
|
877
|
+
cursor?: string;
|
|
878
|
+
sortBy?: string;
|
|
879
|
+
sortOrder?: 'asc' | 'desc';
|
|
880
|
+
filter?: Record<string, unknown>;
|
|
881
|
+
}
|
|
882
|
+
/**
 * One page of `T` items, with a `total` count, a `hasMore` flag, and an
 * optional `nextCursor`.
 */
interface ListResult<T> {
|
|
883
|
+
items: T[];
|
|
884
|
+
total: number;
|
|
885
|
+
hasMore: boolean;
|
|
886
|
+
nextCursor?: string;
|
|
887
|
+
}
|
|
888
|
+
/**
 * Optional criteria for searching or counting documents: free `text`, one or
 * more `type` strings, a `metadata` record, a `dateRange` on one of three
 * named date fields, and `limit` / `offset` paging.
 */
interface DocumentQuery {
|
|
889
|
+
text?: string;
|
|
890
|
+
type?: string | string[];
|
|
891
|
+
metadata?: Record<string, unknown>;
|
|
892
|
+
dateRange?: {
|
|
893
|
+
field: 'createdAt' | 'modifiedAt' | 'processedAt';
|
|
894
|
+
start?: Date;
|
|
895
|
+
end?: Date;
|
|
896
|
+
};
|
|
897
|
+
limit?: number;
|
|
898
|
+
offset?: number;
|
|
899
|
+
}
|
|
900
|
+
/**
 * Optional tuning for a vector search: result count (`topK`), score floor
 * (`minScore`), a `filter` record, flags for including embeddings and
 * metadata, and a `namespace`.
 */
interface VectorSearchOptions {
|
|
901
|
+
topK?: number;
|
|
902
|
+
minScore?: number;
|
|
903
|
+
filter?: Record<string, unknown>;
|
|
904
|
+
includeEmbeddings?: boolean;
|
|
905
|
+
includeMetadata?: boolean;
|
|
906
|
+
namespace?: string;
|
|
907
|
+
}
|
|
908
|
+
/** One vector-search hit: its `id`, a `score`, the matched `chunk`, and an optional `distance`. */
interface VectorSearchResult {
|
|
909
|
+
id: string;
|
|
910
|
+
score: number;
|
|
911
|
+
chunk: Chunk;
|
|
912
|
+
distance?: number;
|
|
913
|
+
}
|
|
914
|
+
/**
 * Statistics reported by `VectorStorage.getStats`: vector count and
 * dimensionality, plus optional index type, memory usage and namespaces.
 * NOTE(review): the unit of `memoryUsage` is not visible here — confirm.
 */
interface VectorIndexStats {
|
|
915
|
+
totalVectors: number;
|
|
916
|
+
dimensions: number;
|
|
917
|
+
indexType?: string;
|
|
918
|
+
memoryUsage?: number;
|
|
919
|
+
namespaces?: string[];
|
|
920
|
+
}
|
|
921
|
+
/**
 * Options for the `memory` adapter: optional `maxItems` and `ttl`.
 * NOTE(review): the unit of `ttl` is not visible here — confirm.
 */
interface MemoryStorageOptions {
|
|
922
|
+
maxItems?: number;
|
|
923
|
+
ttl?: number;
|
|
924
|
+
}
|
|
925
|
+
/**
 * Options for the `file` adapter: a required `baseDir`, plus optional
 * serialization `format`, a `createDirs` flag and a `compression` mode.
 */
interface FileStorageOptions {
|
|
926
|
+
baseDir: string;
|
|
927
|
+
format?: 'json' | 'msgpack';
|
|
928
|
+
createDirs?: boolean;
|
|
929
|
+
compression?: 'gzip' | 'none';
|
|
930
|
+
}
|
|
931
|
+
/**
 * Options for the `sqlite` adapter: a required `path`, plus optional
 * `walMode` and `busyTimeout`.
 * NOTE(review): the unit of `busyTimeout` is not visible here — confirm.
 */
interface SQLiteStorageOptions {
|
|
932
|
+
path: string;
|
|
933
|
+
walMode?: boolean;
|
|
934
|
+
busyTimeout?: number;
|
|
935
|
+
}
|
|
936
|
+
/**
 * Options for the `postgres` adapter: a required `connectionString`, plus
 * optional `schema`, `poolSize`, `ssl` and `enablePgvector`.
 */
interface PostgresStorageOptions {
|
|
937
|
+
connectionString: string;
|
|
938
|
+
schema?: string;
|
|
939
|
+
poolSize?: number;
|
|
940
|
+
ssl?: boolean;
|
|
941
|
+
enablePgvector?: boolean;
|
|
942
|
+
}
|
|
943
|
+
/**
 * Options for the `mongodb` adapter: required `connectionString` and
 * `database`, plus optional `writeConcern` and `readPreference`.
 */
interface MongoDBStorageOptions {
|
|
944
|
+
connectionString: string;
|
|
945
|
+
database: string;
|
|
946
|
+
writeConcern?: 'majority' | number;
|
|
947
|
+
readPreference?: 'primary' | 'secondary' | 'nearest';
|
|
948
|
+
}
|
|
949
|
+
/**
 * Options for the `redis` adapter: a required `url`, plus optional key
 * `prefix`, `ttl` and `cluster` flag.
 * NOTE(review): the unit of `ttl` is not visible here — confirm.
 */
interface RedisStorageOptions {
|
|
950
|
+
url: string;
|
|
951
|
+
prefix?: string;
|
|
952
|
+
ttl?: number;
|
|
953
|
+
cluster?: boolean;
|
|
954
|
+
}
|
|
955
|
+
/**
 * Options for the `pinecone` adapter: required `apiKey`, `environment` and
 * `indexName`, plus optional `namespace` and similarity `metric`.
 */
interface PineconeStorageOptions {
|
|
956
|
+
apiKey: string;
|
|
957
|
+
environment: string;
|
|
958
|
+
indexName: string;
|
|
959
|
+
namespace?: string;
|
|
960
|
+
metric?: 'cosine' | 'euclidean' | 'dotproduct';
|
|
961
|
+
}
|
|
962
|
+
/**
 * Options for the `weaviate` adapter: required `url` and `className`, plus
 * optional `apiKey` and a free-form `schema` record.
 */
interface WeaviateStorageOptions {
|
|
963
|
+
url: string;
|
|
964
|
+
apiKey?: string;
|
|
965
|
+
className: string;
|
|
966
|
+
schema?: Record<string, unknown>;
|
|
967
|
+
}
|
|
968
|
+
/**
 * Options for the `qdrant` adapter: required `url`, `collectionName` and
 * `dimensions`, plus optional `apiKey` and `distance` function name.
 */
interface QdrantStorageOptions {
|
|
969
|
+
url: string;
|
|
970
|
+
apiKey?: string;
|
|
971
|
+
collectionName: string;
|
|
972
|
+
dimensions: number;
|
|
973
|
+
distance?: 'Cosine' | 'Euclid' | 'Dot';
|
|
974
|
+
}
|
|
975
|
+
/**
 * Options for the `chroma` adapter: required `path` and `collectionName`,
 * plus an optional `embeddingFunction` that maps a batch of texts to one
 * numeric vector per text.
 */
interface ChromaStorageOptions {
|
|
976
|
+
path: string;
|
|
977
|
+
collectionName: string;
|
|
978
|
+
embeddingFunction?: (texts: string[]) => Promise<number[][]>;
|
|
979
|
+
}
|
|
980
|
+
/**
 * Configuration for a storage factory: the `defaultAdapter` kind and an
 * optional per-adapter options map. The map has one optional entry for each
 * built-in adapter and none for `'custom'`.
 */
interface StorageFactoryConfig {
|
|
981
|
+
defaultAdapter: StorageAdapterType;
|
|
982
|
+
adapters?: {
|
|
983
|
+
memory?: MemoryStorageOptions;
|
|
984
|
+
file?: FileStorageOptions;
|
|
985
|
+
sqlite?: SQLiteStorageOptions;
|
|
986
|
+
postgres?: PostgresStorageOptions;
|
|
987
|
+
mongodb?: MongoDBStorageOptions;
|
|
988
|
+
redis?: RedisStorageOptions;
|
|
989
|
+
pinecone?: PineconeStorageOptions;
|
|
990
|
+
weaviate?: WeaviateStorageOptions;
|
|
991
|
+
qdrant?: QdrantStorageOptions;
|
|
992
|
+
chroma?: ChromaStorageOptions;
|
|
993
|
+
};
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
/** String-literal union naming the pipeline stages; `'custom'` stands for user-supplied stages. */
type PipelineStage = 'load' | 'parse' | 'extract' | 'clean' | 'chunk' | 'enrich' | 'embed' | 'store' | 'custom';
|
|
997
|
+
/**
 * Top-level pipeline configuration. Every field is optional: a display
 * `name`, the list of `stages`, one config object per stage concern (parser,
 * extraction, cleaning, chunking, enrichment, embedding, storage, OCR),
 * error handling, lifecycle callbacks, and custom stage handlers keyed by
 * name.
 */
interface PipelineConfig {
|
|
998
|
+
name?: string;
|
|
999
|
+
stages?: PipelineStage[];
|
|
1000
|
+
parser?: ParserOptions;
|
|
1001
|
+
extraction?: ExtractionConfig;
|
|
1002
|
+
cleaning?: CleaningConfig;
|
|
1003
|
+
chunking?: ChunkingConfig;
|
|
1004
|
+
enrichment?: EnrichmentConfig;
|
|
1005
|
+
embedding?: EmbeddingPipelineConfig;
|
|
1006
|
+
storage?: StorageConfig;
|
|
1007
|
+
ocr?: OCRConfig;
|
|
1008
|
+
errorHandling?: ErrorHandlingConfig;
|
|
1009
|
+
callbacks?: PipelineCallbacks;
|
|
1010
|
+
customStages?: Record<string, CustomStageHandler>;
|
|
1011
|
+
}
|
|
1012
|
+
/** Optional extraction settings, grouped by target: `tables`, `images` and `metadata`. */
interface ExtractionConfig {
|
|
1013
|
+
tables?: TableExtractionOptions;
|
|
1014
|
+
images?: ImageExtractionOptions;
|
|
1015
|
+
metadata?: MetadataExtractionOptions;
|
|
1016
|
+
}
|
|
1017
|
+
/** `ChunkingOptions` plus a mandatory `strategy` selecting which chunker to use. */
interface ChunkingConfig extends ChunkingOptions {
|
|
1018
|
+
strategy: ChunkingStrategy;
|
|
1019
|
+
}
|
|
1020
|
+
/**
 * Embedding settings for a pipeline: a required `model`, plus optional
 * `provider`, `batchSize`, and the `embedChunks` / `embedSummary` flags.
 */
interface EmbeddingPipelineConfig {
|
|
1021
|
+
model: string;
|
|
1022
|
+
provider?: string;
|
|
1023
|
+
batchSize?: number;
|
|
1024
|
+
embedChunks?: boolean;
|
|
1025
|
+
embedSummary?: boolean;
|
|
1026
|
+
}
|
|
1027
|
+
/**
 * Optional error-handling policy: the `continueOnError` and `skipFailing`
 * flags, retry count and delay, and an `onError` callback that receives a
 * `PipelineError`.
 * NOTE(review): the unit of `retryDelay` is not visible here — confirm.
 */
interface ErrorHandlingConfig {
|
|
1028
|
+
continueOnError?: boolean;
|
|
1029
|
+
maxRetries?: number;
|
|
1030
|
+
retryDelay?: number;
|
|
1031
|
+
onError?: (error: PipelineError) => void;
|
|
1032
|
+
skipFailing?: boolean;
|
|
1033
|
+
}
|
|
1034
|
+
/**
 * Optional lifecycle hooks: stage start, completion and error (each given
 * the stage and document id), document start and completion, and progress
 * updates. All hooks return `void`.
 */
interface PipelineCallbacks {
|
|
1035
|
+
onStageStart?: (stage: PipelineStage, documentId: string) => void;
|
|
1036
|
+
onStageComplete?: (stage: PipelineStage, documentId: string, result: unknown) => void;
|
|
1037
|
+
onStageError?: (stage: PipelineStage, documentId: string, error: Error) => void;
|
|
1038
|
+
onDocumentStart?: (documentId: string, input: DocumentInput) => void;
|
|
1039
|
+
onDocumentComplete?: (document: ProcessedDocument) => void;
|
|
1040
|
+
onProgress?: (progress: PipelineProgress) => void;
|
|
1041
|
+
}
|
|
1042
|
+
/** Signature of a user-defined stage: takes a document and the pipeline context, resolves to a document. */
type CustomStageHandler = (document: ProcessedDocument, context: PipelineContext) => Promise<ProcessedDocument>;
|
|
1043
|
+
/**
 * Context passed to custom stage handlers: the pipeline `config`, the
 * `currentStage`, a map of per-stage results, a string-keyed `sharedData`
 * map, and optional `abortSignal` and `logger`.
 */
interface PipelineContext {
|
|
1044
|
+
config: PipelineConfig;
|
|
1045
|
+
currentStage: PipelineStage;
|
|
1046
|
+
stageResults: Map<PipelineStage, unknown>;
|
|
1047
|
+
sharedData: Map<string, unknown>;
|
|
1048
|
+
abortSignal?: AbortSignal;
|
|
1049
|
+
logger?: PipelineLogger;
|
|
1050
|
+
}
|
|
1051
|
+
/** Four-level logger contract; each method takes a message and an optional metadata record. */
interface PipelineLogger {
|
|
1052
|
+
debug(message: string, meta?: Record<string, unknown>): void;
|
|
1053
|
+
info(message: string, meta?: Record<string, unknown>): void;
|
|
1054
|
+
warn(message: string, meta?: Record<string, unknown>): void;
|
|
1055
|
+
error(message: string, meta?: Record<string, unknown>): void;
|
|
1056
|
+
}
|
|
1057
|
+
/**
 * Progress snapshot delivered to `onProgress`: position within the batch,
 * the current stage, per-stage and overall progress, elapsed time, and an
 * optional remaining-time estimate.
 * NOTE(review): the scale of the progress values (0–1 or 0–100) and the time
 * units are not visible here — confirm.
 */
interface PipelineProgress {
|
|
1058
|
+
documentIndex: number;
|
|
1059
|
+
totalDocuments: number;
|
|
1060
|
+
currentStage: PipelineStage;
|
|
1061
|
+
stageProgress: number;
|
|
1062
|
+
overallProgress: number;
|
|
1063
|
+
elapsedTime: number;
|
|
1064
|
+
estimatedRemaining?: number;
|
|
1065
|
+
}
|
|
1066
|
+
/**
 * Plain-object description of a pipeline failure: the `stage`, a `message`,
 * a `recoverable` flag, and optional `documentId`, `cause` and `retryCount`.
 * It is an interface and does not extend `Error`.
 */
interface PipelineError {
|
|
1067
|
+
stage: PipelineStage;
|
|
1068
|
+
documentId?: string;
|
|
1069
|
+
message: string;
|
|
1070
|
+
cause?: Error;
|
|
1071
|
+
recoverable: boolean;
|
|
1072
|
+
retryCount?: number;
|
|
1073
|
+
}
|
|
1074
|
+
/**
 * Summary of a batch run: the processed documents, success / failed /
 * skipped counts, total chunk count, collected errors, total processing
 * time, and per-stage timings.
 * NOTE(review): time units are not visible here — confirm.
 */
interface PipelineResult {
|
|
1075
|
+
documents: ProcessedDocument[];
|
|
1076
|
+
successCount: number;
|
|
1077
|
+
failedCount: number;
|
|
1078
|
+
skippedCount: number;
|
|
1079
|
+
totalChunks: number;
|
|
1080
|
+
errors: PipelineError[];
|
|
1081
|
+
processingTime: number;
|
|
1082
|
+
stageTimings: Map<PipelineStage, number>;
|
|
1083
|
+
}
|
|
1084
|
+
/**
 * Pipeline contract. The `$1` suffix distinguishes it from the `Pipeline`
 * class declared later in this file.
 *
 * Provides single (`process`), batch (`processBatch`) and streaming
 * (`processStream`) entry points, custom stage management, configuration
 * validation, and `abort`. `addStage` takes an optional `after` stage.
 */
interface Pipeline$1 {
|
|
1085
|
+
readonly name: string;
|
|
1086
|
+
readonly config: PipelineConfig;
|
|
1087
|
+
process(input: DocumentInput): Promise<ProcessedDocument>;
|
|
1088
|
+
processBatch(inputs: DocumentInput[]): Promise<PipelineResult>;
|
|
1089
|
+
processStream(inputs: AsyncIterable<DocumentInput>): AsyncIterable<ProcessedDocument>;
|
|
1090
|
+
addStage(name: string, handler: CustomStageHandler, after?: PipelineStage): void;
|
|
1091
|
+
removeStage(stage: string): void;
|
|
1092
|
+
getStageHandler(stage: string): CustomStageHandler | undefined;
|
|
1093
|
+
validate(): PipelineValidationResult;
|
|
1094
|
+
abort(): void;
|
|
1095
|
+
}
|
|
1096
|
+
/** Outcome of `validate()`: a `valid` flag plus lists of error and warning messages. */
interface PipelineValidationResult {
|
|
1097
|
+
valid: boolean;
|
|
1098
|
+
errors: string[];
|
|
1099
|
+
warnings: string[];
|
|
1100
|
+
}
|
|
1101
|
+
/**
 * `PipelineConfig` extended with ingester-level options: `concurrency`,
 * `defaultType`, `fileSizeLimit`, an allow-list of MIME types, and
 * watch-mode settings. All additions are optional.
 * NOTE(review): the unit of `fileSizeLimit` is not visible here — confirm.
 */
interface IngesterConfig extends PipelineConfig {
|
|
1102
|
+
concurrency?: number;
|
|
1103
|
+
defaultType?: string;
|
|
1104
|
+
fileSizeLimit?: number;
|
|
1105
|
+
supportedMimeTypes?: string[];
|
|
1106
|
+
watchMode?: WatchModeConfig;
|
|
1107
|
+
}
|
|
1108
|
+
/**
 * Watch-mode settings: a required `enabled` flag and `paths` list, plus
 * optional `include` / `exclude` string lists, `debounceDelay`, and
 * `processExisting`.
 * NOTE(review): the pattern syntax and the delay unit are not visible here —
 * confirm.
 */
interface WatchModeConfig {
|
|
1109
|
+
enabled: boolean;
|
|
1110
|
+
paths: string[];
|
|
1111
|
+
include?: string[];
|
|
1112
|
+
exclude?: string[];
|
|
1113
|
+
debounceDelay?: number;
|
|
1114
|
+
processExisting?: boolean;
|
|
1115
|
+
}
|
|
1116
|
+
/**
 * Ingester contract. It extends `Pipeline$1` with source-specific entry
 * points (file path, URL, in-memory buffer, directory), watch-mode control,
 * and a status snapshot. The `$1` suffix distinguishes it from the `Ingester`
 * class declared later in this file.
 */
interface Ingester$1 extends Pipeline$1 {
|
|
1117
|
+
ingestFile(path: string): Promise<ProcessedDocument>;
|
|
1118
|
+
ingestUrl(url: string): Promise<ProcessedDocument>;
|
|
1119
|
+
ingestBuffer(buffer: Buffer, filename?: string): Promise<ProcessedDocument>;
|
|
1120
|
+
ingestDirectory(path: string, options?: DirectoryIngestOptions): Promise<PipelineResult>;
|
|
1121
|
+
startWatching(): void;
|
|
1122
|
+
stopWatching(): void;
|
|
1123
|
+
getStatus(): IngesterStatus;
|
|
1124
|
+
}
|
|
1125
|
+
/**
 * Optional controls for `ingestDirectory`: `recursive` traversal,
 * `include` / `exclude` string lists, a `maxFiles` cap, and a `sortBy` key.
 */
interface DirectoryIngestOptions {
|
|
1126
|
+
recursive?: boolean;
|
|
1127
|
+
include?: string[];
|
|
1128
|
+
exclude?: string[];
|
|
1129
|
+
maxFiles?: number;
|
|
1130
|
+
sortBy?: 'name' | 'date' | 'size';
|
|
1131
|
+
}
|
|
1132
|
+
/**
 * Status snapshot returned by `getStatus()`: processing and watching flags,
 * processed and pending document counts, an optional current document,
 * error count, and uptime.
 * NOTE(review): the unit of `uptime` is not visible here — confirm.
 */
interface IngesterStatus {
|
|
1133
|
+
isProcessing: boolean;
|
|
1134
|
+
isWatching: boolean;
|
|
1135
|
+
documentsProcessed: number;
|
|
1136
|
+
documentsPending: number;
|
|
1137
|
+
currentDocument?: string;
|
|
1138
|
+
errorsCount: number;
|
|
1139
|
+
uptime: number;
|
|
1140
|
+
}
|
|
1141
|
+
/**
 * Optional batch-processing controls: `batchSize`, `concurrency`, a progress
 * callback receiving `(processed, total)`, and a `continueOnError` flag.
 */
interface BatchOptions {
|
|
1142
|
+
batchSize?: number;
|
|
1143
|
+
concurrency?: number;
|
|
1144
|
+
onProgress?: (processed: number, total: number) => void;
|
|
1145
|
+
continueOnError?: boolean;
|
|
1146
|
+
}
|
|
1147
|
+
/**
 * Optional streaming controls: `highWaterMark`, `bufferSize` and `timeout`.
 * NOTE(review): the units are not visible here — confirm.
 */
interface StreamOptions {
|
|
1148
|
+
highWaterMark?: number;
|
|
1149
|
+
bufferSize?: number;
|
|
1150
|
+
timeout?: number;
|
|
1151
|
+
}
|
|
1152
|
+
/**
 * Fluent builder contract. Every `with*` / `add*` method returns `this` so
 * calls can be chained, and `build()` yields a `Pipeline$1`. The `$1` suffix
 * distinguishes it from the `PipelineBuilder` class declared later in this
 * file.
 */
interface PipelineBuilder$1 {
|
|
1153
|
+
withParser(options: ParserOptions): this;
|
|
1154
|
+
withExtraction(options: ExtractionConfig): this;
|
|
1155
|
+
withCleaning(config: CleaningConfig): this;
|
|
1156
|
+
withChunking(config: ChunkingConfig): this;
|
|
1157
|
+
withEnrichment(config: EnrichmentConfig): this;
|
|
1158
|
+
withEmbedding(config: EmbeddingPipelineConfig): this;
|
|
1159
|
+
withStorage(config: StorageConfig): this;
|
|
1160
|
+
withOCR(config: OCRConfig): this;
|
|
1161
|
+
addCustomStage(name: string, handler: CustomStageHandler, after?: PipelineStage): this;
|
|
1162
|
+
withErrorHandling(config: ErrorHandlingConfig): this;
|
|
1163
|
+
withCallbacks(callbacks: PipelineCallbacks): this;
|
|
1164
|
+
build(): Pipeline$1;
|
|
1165
|
+
}
|
|
1166
|
+
/**
 * Discriminated union of pipeline events, tagged by the `type` literal.
 *
 * Every variant carries a `documentId` except `'document:completed'`, which
 * carries the whole `document` instead. Narrow on `type` before reading the
 * variant-specific fields.
 */
type DocumentEvent = {
|
|
1167
|
+
type: 'document:loaded';
|
|
1168
|
+
documentId: string;
|
|
1169
|
+
metadata: DocumentMetadata;
|
|
1170
|
+
} | {
|
|
1171
|
+
type: 'document:parsed';
|
|
1172
|
+
documentId: string;
|
|
1173
|
+
elementCount: number;
|
|
1174
|
+
} | {
|
|
1175
|
+
type: 'document:extracted';
|
|
1176
|
+
documentId: string;
|
|
1177
|
+
tables: number;
|
|
1178
|
+
images: number;
|
|
1179
|
+
} | {
|
|
1180
|
+
type: 'document:cleaned';
|
|
1181
|
+
documentId: string;
|
|
1182
|
+
originalLength: number;
|
|
1183
|
+
cleanedLength: number;
|
|
1184
|
+
} | {
|
|
1185
|
+
type: 'document:chunked';
|
|
1186
|
+
documentId: string;
|
|
1187
|
+
chunkCount: number;
|
|
1188
|
+
} | {
|
|
1189
|
+
type: 'document:enriched';
|
|
1190
|
+
documentId: string;
|
|
1191
|
+
enrichments: string[];
|
|
1192
|
+
} | {
|
|
1193
|
+
type: 'document:embedded';
|
|
1194
|
+
documentId: string;
|
|
1195
|
+
embeddingCount: number;
|
|
1196
|
+
} | {
|
|
1197
|
+
type: 'document:stored';
|
|
1198
|
+
documentId: string;
|
|
1199
|
+
storageId: string;
|
|
1200
|
+
} | {
|
|
1201
|
+
type: 'document:completed';
|
|
1202
|
+
document: ProcessedDocument;
|
|
1203
|
+
} | {
|
|
1204
|
+
type: 'document:error';
|
|
1205
|
+
documentId: string;
|
|
1206
|
+
error: PipelineError;
|
|
1207
|
+
};
|
|
1208
|
+
/**
 * Minimal event-emitter contract keyed by `DocumentEvent['type']`.
 *
 * Handlers are typed to receive the full `DocumentEvent` union rather than
 * the variant for the subscribed event name, so they must narrow on
 * `event.type` themselves.
 */
interface PipelineEventEmitter {
|
|
1209
|
+
on(event: DocumentEvent['type'], handler: (event: DocumentEvent) => void): void;
|
|
1210
|
+
off(event: DocumentEvent['type'], handler: (event: DocumentEvent) => void): void;
|
|
1211
|
+
once(event: DocumentEvent['type'], handler: (event: DocumentEvent) => void): void;
|
|
1212
|
+
emit(event: DocumentEvent): void;
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
/** Callback that receives one event of type `T` and returns nothing. */
type EventHandler<T> = (event: T) => void;
|
|
1216
|
+
/**
 * Concrete `PipelineEventEmitter`. Beyond the interface it adds
 * `removeAllListeners`, which takes an optional event type, and
 * `listenerCount`. The `listeners` field is private and its type is omitted
 * from the declaration output.
 */
declare class IngestEventEmitter implements PipelineEventEmitter {
|
|
1217
|
+
private listeners;
|
|
1218
|
+
on(eventType: DocumentEvent['type'], handler: EventHandler<DocumentEvent>): void;
|
|
1219
|
+
off(eventType: DocumentEvent['type'], handler: EventHandler<DocumentEvent>): void;
|
|
1220
|
+
once(eventType: DocumentEvent['type'], handler: EventHandler<DocumentEvent>): void;
|
|
1221
|
+
emit(event: DocumentEvent): void;
|
|
1222
|
+
removeAllListeners(eventType?: DocumentEvent['type']): void;
|
|
1223
|
+
listenerCount(eventType: DocumentEvent['type']): number;
|
|
1224
|
+
}
|
|
1225
|
+
/** Factory for an event emitter; the declared return type is the `PipelineEventEmitter` interface. */
declare function createEventEmitter(): PipelineEventEmitter;
|
|
1226
|
+
|
|
1227
|
+
/**
 * Registry of `Parser` instances.
 *
 * Parsers are registered and removed by name. `findParser` and `isSupported`
 * look up by optional MIME type and/or extension. `parse` takes a buffer
 * with the same optional hints and resolves to a `ParseResult`. The static
 * helpers convert between MIME types, extensions and filenames;
 * `detectMimeType` and `detectExtension` may return `undefined`.
 */
declare class ParserRegistry {
|
|
1228
|
+
private parsers;
|
|
1229
|
+
private mimeTypeOverrides;
|
|
1230
|
+
private defaultOptions;
|
|
1231
|
+
constructor(config?: ParserRegistryConfig);
|
|
1232
|
+
register(parser: Parser): void;
|
|
1233
|
+
unregister(name: string): void;
|
|
1234
|
+
get(name: string): Parser | undefined;
|
|
1235
|
+
findParser(mimeType?: string, extension?: string): Parser | undefined;
|
|
1236
|
+
parse(buffer: Buffer, mimeType?: string, extension?: string, options?: ParserOptions): Promise<ParseResult>;
|
|
1237
|
+
isSupported(mimeType?: string, extension?: string): boolean;
|
|
1238
|
+
getAll(): Parser[];
|
|
1239
|
+
getSupportedMimeTypes(): string[];
|
|
1240
|
+
getSupportedExtensions(): string[];
|
|
1241
|
+
static detectMimeType(extension: string): string | undefined;
|
|
1242
|
+
static detectExtension(mimeType: string): string | undefined;
|
|
1243
|
+
static getExtension(filename: string): string;
|
|
1244
|
+
}
|
|
1245
|
+
/** Factory for a `ParserRegistry`, with an optional `ParserRegistryConfig`. */
declare function createParserRegistry(config?: ParserRegistryConfig): ParserRegistry;
|
|
1246
|
+
|
|
1247
|
+
/**
 * Registry of `Chunker` instances keyed by `ChunkingStrategy`.
 *
 * `chunk` and `chunkElements` are typed to return either `Chunk[]` or
 * `Promise<Chunk[]>`, so callers should `await` the result to handle both.
 * Default chunking options can be read and replaced through the getter and
 * setter pair.
 */
declare class ChunkerRegistry {
|
|
1248
|
+
private chunkers;
|
|
1249
|
+
private defaultOptions;
|
|
1250
|
+
constructor(config?: ChunkerRegistryConfig);
|
|
1251
|
+
register(chunker: Chunker): void;
|
|
1252
|
+
unregister(strategy: ChunkingStrategy): void;
|
|
1253
|
+
get(strategy: ChunkingStrategy): Chunker | undefined;
|
|
1254
|
+
chunk(text: string, strategy: ChunkingStrategy, options?: ChunkingOptions): Chunk[] | Promise<Chunk[]>;
|
|
1255
|
+
chunkElements(elements: Element[], strategy: ChunkingStrategy, options?: ChunkingOptions): Chunk[] | Promise<Chunk[]>;
|
|
1256
|
+
isSupported(strategy: ChunkingStrategy): boolean;
|
|
1257
|
+
getAll(): Chunker[];
|
|
1258
|
+
getSupportedStrategies(): ChunkingStrategy[];
|
|
1259
|
+
setDefaultOptions(options: ChunkingOptions): void;
|
|
1260
|
+
getDefaultOptions(): ChunkingOptions;
|
|
1261
|
+
}
|
|
1262
|
+
/** Factory for a `ChunkerRegistry`, with an optional `ChunkerRegistryConfig`. */
declare function createChunkerRegistry(config?: ChunkerRegistryConfig): ChunkerRegistry;
|
|
1263
|
+
|
|
1264
|
+
/**
 * Concrete implementation of the `Pipeline$1` contract.
 *
 * Beyond the interface it exposes accessors for its event emitter, parser
 * registry and chunker registry. Private members appear without types, as is
 * usual in declaration output. The `execute*` names mirror the
 * `PipelineStage` literals; their behaviour is not visible here.
 */
declare class Pipeline implements Pipeline$1 {
|
|
1265
|
+
readonly name: string;
|
|
1266
|
+
readonly config: PipelineConfig;
|
|
1267
|
+
private parserRegistry;
|
|
1268
|
+
private chunkerRegistry;
|
|
1269
|
+
private eventEmitter;
|
|
1270
|
+
private customStages;
|
|
1271
|
+
private stageOrder;
|
|
1272
|
+
private aborted;
|
|
1273
|
+
constructor(config?: PipelineConfig);
|
|
1274
|
+
process(input: DocumentInput): Promise<ProcessedDocument>;
|
|
1275
|
+
processBatch(inputs: DocumentInput[]): Promise<PipelineResult>;
|
|
1276
|
+
processStream(inputs: AsyncIterable<DocumentInput>): AsyncIterable<ProcessedDocument>;
|
|
1277
|
+
addStage(name: string, handler: CustomStageHandler, after?: PipelineStage): void;
|
|
1278
|
+
removeStage(stage: string): void;
|
|
1279
|
+
getStageHandler(stage: string): CustomStageHandler | undefined;
|
|
1280
|
+
validate(): PipelineValidationResult;
|
|
1281
|
+
abort(): void;
|
|
1282
|
+
getEventEmitter(): IngestEventEmitter;
|
|
1283
|
+
getParserRegistry(): ParserRegistry;
|
|
1284
|
+
getChunkerRegistry(): ChunkerRegistry;
|
|
1285
|
+
private executeStage;
|
|
1286
|
+
private executeLoad;
|
|
1287
|
+
private executeParse;
|
|
1288
|
+
private executeExtract;
|
|
1289
|
+
private executeClean;
|
|
1290
|
+
private applyCleaningOperation;
|
|
1291
|
+
private executeChunk;
|
|
1292
|
+
private simpleChunk;
|
|
1293
|
+
private estimateTokens;
|
|
1294
|
+
private executeEnrich;
|
|
1295
|
+
private executeEmbed;
|
|
1296
|
+
private executeStore;
|
|
1297
|
+
private createPipelineError;
|
|
1298
|
+
}
|
|
1299
|
+
/** Factory for a `Pipeline`, with an optional `PipelineConfig`. */
declare function createPipeline(config?: PipelineConfig): Pipeline;
|
|
1300
|
+
|
|
1301
|
+
/**
 * Concrete fluent builder implementing `PipelineBuilder$1`.
 *
 * It adds `withName`, `withStages` and `clone()` to the interface. All
 * configuration methods return `this` for chaining. `build()` is typed to
 * return the `Pipeline$1` interface, not the `Pipeline` class.
 */
declare class PipelineBuilder implements PipelineBuilder$1 {
|
|
1302
|
+
private config;
|
|
1303
|
+
private customStages;
|
|
1304
|
+
withName(name: string): this;
|
|
1305
|
+
withStages(stages: PipelineStage[]): this;
|
|
1306
|
+
withParser(options: ParserOptions): this;
|
|
1307
|
+
withExtraction(options: ExtractionConfig): this;
|
|
1308
|
+
withCleaning(config: CleaningConfig): this;
|
|
1309
|
+
withChunking(config: ChunkingConfig): this;
|
|
1310
|
+
withEnrichment(config: EnrichmentConfig): this;
|
|
1311
|
+
withEmbedding(config: EmbeddingPipelineConfig): this;
|
|
1312
|
+
withStorage(config: StorageConfig): this;
|
|
1313
|
+
withOCR(config: OCRConfig): this;
|
|
1314
|
+
addCustomStage(name: string, handler: CustomStageHandler, after?: PipelineStage): this;
|
|
1315
|
+
withErrorHandling(config: ErrorHandlingConfig): this;
|
|
1316
|
+
withCallbacks(callbacks: PipelineCallbacks): this;
|
|
1317
|
+
build(): Pipeline$1;
|
|
1318
|
+
clone(): PipelineBuilder;
|
|
1319
|
+
}
|
|
1320
|
+
/** Factory for a `PipelineBuilder`; takes no arguments. */
declare function createPipelineBuilder(): PipelineBuilder;
|
|
1321
|
+
/**
 * Named builder presets. Each method takes no arguments and returns a
 * `PipelineBuilder`. What each preset configures is not visible in this
 * declaration — see the implementation.
 */
declare const pipelines: {
|
|
1322
|
+
simple(): PipelineBuilder;
|
|
1323
|
+
full(): PipelineBuilder;
|
|
1324
|
+
rag(): PipelineBuilder;
|
|
1325
|
+
analysis(): PipelineBuilder;
|
|
1326
|
+
ocr(): PipelineBuilder;
|
|
1327
|
+
};
|
|
1328
|
+
|
|
1329
|
+
/**
 * Concrete ingester: a `Pipeline` subclass implementing `Ingester$1`.
 *
 * It adds file, URL, buffer and directory entry points, watch-mode control
 * and `getStatus()`, and redeclares `process` with the same signature as the
 * base class. Private members appear without types, as is usual in
 * declaration output.
 */
declare class Ingester extends Pipeline implements Ingester$1 {
|
|
1330
|
+
private ingesterConfig;
|
|
1331
|
+
private status;
|
|
1332
|
+
private watcher;
|
|
1333
|
+
private processedCount;
|
|
1334
|
+
private pendingQueue;
|
|
1335
|
+
private startTime;
|
|
1336
|
+
constructor(config?: IngesterConfig);
|
|
1337
|
+
ingestFile(path: string): Promise<ProcessedDocument>;
|
|
1338
|
+
ingestUrl(url: string): Promise<ProcessedDocument>;
|
|
1339
|
+
ingestBuffer(buffer: Buffer, filename?: string): Promise<ProcessedDocument>;
|
|
1340
|
+
ingestDirectory(dirPath: string, options?: DirectoryIngestOptions): Promise<PipelineResult>;
|
|
1341
|
+
startWatching(): void;
|
|
1342
|
+
stopWatching(): void;
|
|
1343
|
+
getStatus(): IngesterStatus;
|
|
1344
|
+
process(input: DocumentInput): Promise<ProcessedDocument>;
|
|
1345
|
+
private listFiles;
|
|
1346
|
+
private matchPattern;
|
|
1347
|
+
private validateFileSize;
|
|
1348
|
+
private validateBufferSize;
|
|
1349
|
+
private isSupportedMimeType;
|
|
1350
|
+
private setupWatcher;
|
|
1351
|
+
}
|
|
1352
|
+
/** Factory for an `Ingester`, with an optional `IngesterConfig`. */
declare function createIngester(config?: IngesterConfig): Ingester;
|
|
1353
|
+
|
|
1354
|
+
/**
 * Abstract base for parsers.
 *
 * Subclasses must supply `name`, `supportedMimeTypes`,
 * `supportedExtensions`, `capabilities` and `parse`. `canParse` is concrete
 * and `parseStream` is optional. The protected helpers build result,
 * element, table and image objects, estimate word and character counts,
 * extract text from elements, and merge metadata.
 */
declare abstract class BaseParser implements Parser {
|
|
1355
|
+
abstract readonly name: string;
|
|
1356
|
+
abstract readonly supportedMimeTypes: string[];
|
|
1357
|
+
abstract readonly supportedExtensions: string[];
|
|
1358
|
+
abstract readonly capabilities: ParserCapabilities;
|
|
1359
|
+
canParse(mimeType: string, extension?: string): boolean;
|
|
1360
|
+
abstract parse(buffer: Buffer, options?: ParserOptions): Promise<ParseResult>;
|
|
1361
|
+
parseStream?(stream: NodeJS.ReadableStream, options?: ParserOptions): AsyncIterableIterator<Element>;
|
|
1362
|
+
protected createEmptyResult(type: DocumentType): ParseResult;
|
|
1363
|
+
protected createElement(type: Element['type'], text: string, pageNumber?: number, metadata?: Record<string, unknown>): Element;
|
|
1364
|
+
protected createTable(headers: string[], rows: string[][], pageNumber?: number, caption?: string): TableData;
|
|
1365
|
+
protected createImage(width: number, height: number, format: string, options?: Partial<ImageData>): ImageData;
|
|
1366
|
+
protected estimateWordCount(text: string): number;
|
|
1367
|
+
protected estimateCharacterCount(text: string): number;
|
|
1368
|
+
protected extractTextFromElements(elements: Element[]): string;
|
|
1369
|
+
protected mergeMetadata(extracted: Partial<DocumentMetadata>, text: string): DocumentMetadata;
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1372
|
+
/** `BaseParser` subclass named `"pdf-parser"`; `parse` accepts `PDFParserOptions`. */
declare class PDFParser extends BaseParser {
|
|
1373
|
+
readonly name = "pdf-parser";
|
|
1374
|
+
readonly supportedMimeTypes: string[];
|
|
1375
|
+
readonly supportedExtensions: string[];
|
|
1376
|
+
readonly capabilities: ParserCapabilities;
|
|
1377
|
+
parse(buffer: Buffer, options?: PDFParserOptions): Promise<ParseResult>;
|
|
1378
|
+
private createElements;
|
|
1379
|
+
private detectElementType;
|
|
1380
|
+
private parsePDFDate;
|
|
1381
|
+
private renderPage;
|
|
1382
|
+
}
|
|
1383
|
+
/** Factory for a `PDFParser`. */
declare function createPDFParser(): PDFParser;
|
|
1384
|
+
|
|
1385
|
+
/** `BaseParser` subclass named `"docx-parser"`; `parse` accepts `DOCXParserOptions`. */
declare class DOCXParser extends BaseParser {
|
|
1386
|
+
readonly name = "docx-parser";
|
|
1387
|
+
readonly supportedMimeTypes: string[];
|
|
1388
|
+
readonly supportedExtensions: string[];
|
|
1389
|
+
readonly capabilities: ParserCapabilities;
|
|
1390
|
+
parse(buffer: Buffer, options?: DOCXParserOptions): Promise<ParseResult>;
|
|
1391
|
+
private parseHtmlStructure;
|
|
1392
|
+
}
|
|
1393
|
+
/** Factory for a `DOCXParser`. */
declare function createDOCXParser(): DOCXParser;
|
|
1394
|
+
|
|
1395
|
+
/** `BaseParser` subclass named `"html-parser"`; `parse` accepts `HTMLParserOptions`. */
declare class HTMLParser extends BaseParser {
|
|
1396
|
+
readonly name = "html-parser";
|
|
1397
|
+
readonly supportedMimeTypes: string[];
|
|
1398
|
+
readonly supportedExtensions: string[];
|
|
1399
|
+
readonly capabilities: ParserCapabilities;
|
|
1400
|
+
parse(buffer: Buffer, options?: HTMLParserOptions): Promise<ParseResult>;
|
|
1401
|
+
private extractMetadata;
|
|
1402
|
+
private extractElements;
|
|
1403
|
+
private extractTables;
|
|
1404
|
+
private extractImages;
|
|
1405
|
+
private extractText;
|
|
1406
|
+
}
|
|
1407
|
+
/** Factory for an `HTMLParser`. */
declare function createHTMLParser(): HTMLParser;
|
|
1408
|
+
|
|
1409
|
+
/** `BaseParser` subclass named `"markdown-parser"`; `parse` accepts `MarkdownParserOptions`. */
declare class MarkdownParser extends BaseParser {
|
|
1410
|
+
readonly name = "markdown-parser";
|
|
1411
|
+
readonly supportedMimeTypes: string[];
|
|
1412
|
+
readonly supportedExtensions: string[];
|
|
1413
|
+
readonly capabilities: ParserCapabilities;
|
|
1414
|
+
parse(buffer: Buffer, options?: MarkdownParserOptions): Promise<ParseResult>;
|
|
1415
|
+
private extractFrontmatter;
|
|
1416
|
+
private extractElements;
|
|
1417
|
+
private extractTables;
|
|
1418
|
+
private extractImages;
|
|
1419
|
+
private getImageFormat;
|
|
1420
|
+
private extractText;
|
|
1421
|
+
}
|
|
1422
|
+
/** Factory for a `MarkdownParser`. */
declare function createMarkdownParser(): MarkdownParser;
|
|
1423
|
+
|
|
1424
|
+
/**
 * `BaseParser` subclass named `"text-parser"`. It declares `parseStream` as
 * a required method; the underscore-prefixed `_options` parameter suggests
 * the options go unused there — confirm against the implementation.
 */
declare class TextParser extends BaseParser {
|
|
1425
|
+
readonly name = "text-parser";
|
|
1426
|
+
readonly supportedMimeTypes: string[];
|
|
1427
|
+
readonly supportedExtensions: string[];
|
|
1428
|
+
readonly capabilities: ParserCapabilities;
|
|
1429
|
+
parse(buffer: Buffer, options?: ParserOptions): Promise<ParseResult>;
|
|
1430
|
+
parseStream(stream: NodeJS.ReadableStream, _options?: ParserOptions): AsyncIterableIterator<Element>;
|
|
1431
|
+
private extractElements;
|
|
1432
|
+
private detectElementType;
|
|
1433
|
+
}
|
|
1434
|
+
/** Factory for a `TextParser`. */
declare function createTextParser(): TextParser;
|
|
1435
|
+
|
|
1436
|
+
/**
 * `BaseParser` subclass named `"csv-parser"`. Both `parse` and `parseStream`
 * accept `CSVParserOptions`, and `parseStream` is declared as a required
 * method.
 */
declare class CSVParser extends BaseParser {
|
|
1437
|
+
readonly name = "csv-parser";
|
|
1438
|
+
readonly supportedMimeTypes: string[];
|
|
1439
|
+
readonly supportedExtensions: string[];
|
|
1440
|
+
readonly capabilities: ParserCapabilities;
|
|
1441
|
+
parse(buffer: Buffer, options?: CSVParserOptions): Promise<ParseResult>;
|
|
1442
|
+
parseStream(stream: NodeJS.ReadableStream, options?: CSVParserOptions): AsyncIterableIterator<Element>;
|
|
1443
|
+
private detectDelimiter;
|
|
1444
|
+
private parseCSV;
|
|
1445
|
+
private parseRow;
|
|
1446
|
+
private createTextRepresentation;
|
|
1447
|
+
private createElements;
|
|
1448
|
+
}
|
|
1449
|
+
/** Factory for a `CSVParser`. */
declare function createCSVParser(): CSVParser;
|
|
1450
|
+
|
|
1451
|
+
/** `BaseParser` subclass named `"excel-parser"`; `parse` accepts `ExcelParserOptions`. */
declare class ExcelParser extends BaseParser {
|
|
1452
|
+
readonly name = "excel-parser";
|
|
1453
|
+
readonly supportedMimeTypes: string[];
|
|
1454
|
+
readonly supportedExtensions: string[];
|
|
1455
|
+
readonly capabilities: ParserCapabilities;
|
|
1456
|
+
parse(buffer: Buffer, options?: ExcelParserOptions): Promise<ParseResult>;
|
|
1457
|
+
}
|
|
1458
|
+
/** Factory for an `ExcelParser`. */
declare function createExcelParser(): ExcelParser;
|
|
1459
|
+
|
|
1460
|
+
/**
 * `BaseParser` subclass named `"json-parser"`. The private `isJSONLines` and
 * `parseJSONLines` members suggest JSON Lines input is also handled —
 * confirm against the implementation.
 */
declare class JSONParser extends BaseParser {
|
|
1461
|
+
readonly name = "json-parser";
|
|
1462
|
+
readonly supportedMimeTypes: string[];
|
|
1463
|
+
readonly supportedExtensions: string[];
|
|
1464
|
+
readonly capabilities: ParserCapabilities;
|
|
1465
|
+
parse(buffer: Buffer, options?: ParserOptions): Promise<ParseResult>;
|
|
1466
|
+
private isJSONLines;
|
|
1467
|
+
private parseJSONLines;
|
|
1468
|
+
private extractElements;
|
|
1469
|
+
private createTextRepresentation;
|
|
1470
|
+
private extractTables;
|
|
1471
|
+
private extractTablesFromArray;
|
|
1472
|
+
}
|
|
1473
|
+
/** Factory for a `JSONParser`. */
declare function createJSONParser(): JSONParser;
|
|
1474
|
+
|
|
1475
|
+
/** Returns an array of `Parser` instances (the built-in set, per the name). */
declare function getBuiltInParsers(): Parser[];
|
|
1476
|
+
/**
 * Takes a `register` callback that accepts one `Parser`; presumably it is
 * invoked once per built-in parser — confirm against the implementation.
 */
declare function registerBuiltInParsers(register: (parser: Parser) => void): void;
|
|
1477
|
+
|
|
1478
|
+
/**
 * Abstract base for chunkers.
 *
 * Subclasses must supply `name`, `strategy` and `chunk`. The constructor
 * accepts an optional `TokenCounter`, which is held in the protected
 * `tokenCounter` field. `chunk` and `chunkElements` may return either
 * `Chunk[]` or `Promise<Chunk[]>`. The protected helpers create chunks,
 * split text at sentence and paragraph boundaries, merge pieces up to a
 * token budget, and apply overlap.
 */
declare abstract class BaseChunker implements Chunker {
|
|
1479
|
+
abstract readonly name: string;
|
|
1480
|
+
abstract readonly strategy: ChunkingStrategy;
|
|
1481
|
+
protected tokenCounter: TokenCounter;
|
|
1482
|
+
constructor(tokenCounter?: TokenCounter);
|
|
1483
|
+
abstract chunk(text: string, options?: ChunkingOptions): Chunk[] | Promise<Chunk[]>;
|
|
1484
|
+
chunkElements(elements: Element[], options?: ChunkingOptions): Chunk[] | Promise<Chunk[]>;
|
|
1485
|
+
estimateChunks(text: string, options?: ChunkingOptions): number;
|
|
1486
|
+
protected createChunk(text: string, documentId: string, index: number, metadata?: Partial<ChunkMetadata>): Chunk;
|
|
1487
|
+
protected extractTextFromElements(elements: Element[]): string;
|
|
1488
|
+
protected findElementTypeForChunk(chunk: Chunk, elements: Element[]): Element['type'] | undefined;
|
|
1489
|
+
protected splitAtSentenceBoundaries(text: string): string[];
|
|
1490
|
+
protected splitAtParagraphBoundaries(text: string): string[];
|
|
1491
|
+
protected mergeToTargetSize(texts: string[], maxTokens: number, separator?: string): string[];
|
|
1492
|
+
protected applyOverlap(chunks: string[], overlapTokens: number): string[];
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
/** `BaseChunker` subclass for the `"fixed"` strategy; `chunk` is synchronous and returns `Chunk[]`. */
declare class FixedChunker extends BaseChunker {
|
|
1496
|
+
readonly name = "fixed-chunker";
|
|
1497
|
+
readonly strategy: "fixed";
|
|
1498
|
+
chunk(text: string, options?: FixedChunkingOptions): Chunk[];
|
|
1499
|
+
}
|
|
1500
|
+
/** Factory for a `FixedChunker`. */
declare function createFixedChunker(): FixedChunker;
|
|
1501
|
+
|
|
1502
|
+
/** `BaseChunker` subclass for the `"recursive"` strategy; `chunk` is synchronous and returns `Chunk[]`. */
declare class RecursiveChunker extends BaseChunker {
|
|
1503
|
+
readonly name = "recursive-chunker";
|
|
1504
|
+
readonly strategy: "recursive";
|
|
1505
|
+
chunk(text: string, options?: RecursiveChunkingOptions): Chunk[];
|
|
1506
|
+
private recursiveSplit;
|
|
1507
|
+
private splitBySeparator;
|
|
1508
|
+
private splitByCharacters;
|
|
1509
|
+
private mergeSplits;
|
|
1510
|
+
}
|
|
1511
|
+
/** Factory for a `RecursiveChunker`. */
declare function createRecursiveChunker(): RecursiveChunker;
|
|
1512
|
+
|
|
1513
|
+
/** `BaseChunker` subclass for the `"sentence"` strategy; `chunk` is synchronous and returns `Chunk[]`. */
declare class SentenceChunker extends BaseChunker {
|
|
1514
|
+
readonly name = "sentence-chunker";
|
|
1515
|
+
readonly strategy: "sentence";
|
|
1516
|
+
chunk(text: string, options?: SentenceChunkingOptions): Chunk[];
|
|
1517
|
+
private splitIntoSentences;
|
|
1518
|
+
private groupSentences;
|
|
1519
|
+
private applyOverlapSentences;
|
|
1520
|
+
}
|
|
1521
|
+
/** Factory for a `SentenceChunker`. */
declare function createSentenceChunker(): SentenceChunker;
|
|
1522
|
+
|
|
1523
|
+
/** `BaseChunker` subclass for the `"paragraph"` strategy; `chunk` is synchronous and returns `Chunk[]`. */
declare class ParagraphChunker extends BaseChunker {
|
|
1524
|
+
readonly name = "paragraph-chunker";
|
|
1525
|
+
readonly strategy: "paragraph";
|
|
1526
|
+
chunk(text: string, options?: ParagraphChunkingOptions): Chunk[];
|
|
1527
|
+
private groupParagraphs;
|
|
1528
|
+
private applyOverlapParagraphs;
|
|
1529
|
+
}
|
|
1530
|
+
/** Factory that takes no arguments and returns a `ParagraphChunker`. */
declare function createParagraphChunker(): ParagraphChunker;
|
|
1531
|
+
|
|
1532
|
+
/**
 * Chunker for the `"semantic"` strategy; extends `BaseChunker`.
 * Unlike the Fixed/Recursive/Sentence/Paragraph/Hierarchical chunkers declared
 * alongside it, `chunk` here returns a Promise, so callers must await it.
 */
declare class SemanticChunker extends BaseChunker {
|
|
1533
|
+
/** Identifier of this chunker; typed as the literal `"semantic-chunker"`. */
readonly name = "semantic-chunker";
|
|
1534
|
+
/** Strategy tag; typed as the literal `"semantic"`. */
readonly strategy: "semantic";
|
|
1535
|
+
/**
 * Chunks `text` asynchronously.
 *
 * @param text - Text to chunk.
 * @param options - Optional semantic-chunking settings.
 * @returns A promise resolving to the resulting chunks.
 */
chunk(text: string, options?: SemanticChunkingOptions): Promise<Chunk[]>;
|
|
1536
|
+
// Internal helper; presumably breaks the input into sentences.
private splitIntoSentences;
|
|
1537
|
+
// Internal helper; presumably locates split points from similarity scores (TODO confirm).
private findSemanticBreakpoints;
|
|
1538
|
+
// Internal helper; presumably cosine similarity between two vectors.
private cosineSimilarity;
|
|
1539
|
+
}
|
|
1540
|
+
/** Factory that takes no arguments and returns a `SemanticChunker`. */
declare function createSemanticChunker(): SemanticChunker;
|
|
1541
|
+
|
|
1542
|
+
/**
 * Chunker for the `"hierarchical"` strategy; extends `BaseChunker`.
 * Exposes two entry points: one for raw text and one for already-parsed
 * `Element` lists. Private members are listed by name only.
 */
declare class HierarchicalChunker extends BaseChunker {
|
|
1543
|
+
/** Identifier of this chunker; typed as the literal `"hierarchical-chunker"`. */
readonly name = "hierarchical-chunker";
|
|
1544
|
+
/** Strategy tag; typed as the literal `"hierarchical"`. */
readonly strategy: "hierarchical";
|
|
1545
|
+
/**
 * Chunks raw `text` synchronously.
 *
 * @param text - Text to chunk.
 * @param options - Optional hierarchical-chunking settings.
 * @returns The resulting chunks.
 */
chunk(text: string, options?: HierarchicalChunkingOptions): Chunk[];
|
|
1546
|
+
/**
 * Chunks a list of parsed elements synchronously.
 *
 * @param elements - Elements to chunk.
 * @param options - Optional hierarchical-chunking settings.
 * @returns The resulting chunks.
 */
chunkElements(elements: Element[], options?: HierarchicalChunkingOptions): Chunk[];
|
|
1547
|
+
// Internal helper; presumably derives a section hierarchy from raw text.
private parseHierarchy;
|
|
1548
|
+
// Internal helper; presumably derives a section hierarchy from elements.
private buildHierarchyFromElements;
|
|
1549
|
+
// Internal helper; presumably converts the hierarchy into a flat chunk list.
private flattenToChunks;
|
|
1550
|
+
// Internal helper; presumably subdivides sections that exceed the size limit.
private splitLargeSection;
|
|
1551
|
+
}
|
|
1552
|
+
/** Factory that takes no arguments and returns a `HierarchicalChunker`. */
declare function createHierarchicalChunker(): HierarchicalChunker;
|
|
1553
|
+
|
|
1554
|
+
/**
 * Returns an array of `Chunker` objects.
 * NOTE(review): presumably one instance per built-in strategy; which ones are
 * included is decided by the implementation.
 */
declare function getBuiltInChunkers(): Chunker[];
|
|
1555
|
+
/**
 * Returns a `Chunker` for the given strategy.
 *
 * @param strategy - The chunking strategy to select.
 * @returns A chunker typed as the general `Chunker` interface. Note that
 *   `SemanticChunker.chunk` is declared as returning a Promise, so callers
 *   should not assume a synchronous result for every strategy.
 */
declare function createChunker(strategy: ChunkingStrategy): Chunker;
|
|
1556
|
+
/**
 * Hands chunkers to a caller-supplied registration callback.
 *
 * @param register - Callback that receives a `Chunker`; presumably invoked
 *   once per built-in chunker (call count and order not visible here).
 */
declare function registerBuiltInChunkers(register: (chunker: Chunker) => void): void;
|
|
1557
|
+
|
|
1558
|
+
export { type AWSTextractConfig, type Annotation, type AzureVisionConfig, BaseChunker, BaseParser, type BatchOptions, type BoundaryDetector, type BoundaryType, type BoundingBox, CSVParser, type CSVParserOptions, type ChromaStorageOptions, type Chunk, type ChunkIndex, type ChunkMetadata, type ChunkResult, type ChunkStorage, type Chunker, ChunkerRegistry, type ChunkerRegistryConfig, type ChunkingConfig, type ChunkingOptions, type ChunkingStrategy, type ClassificationConfig, type ClassificationResult, type CleaningChange, type CleaningConfig, type CleaningHandler, type CleaningOperation, type CleaningOptions, type CleaningResult, type ColorAnalysis, type ColumnType, type ContentFilter, type ContentFilterOptions, type CustomStageHandler, DOCXParser, type DOCXParserOptions, type DeduplicationOptions, type DeduplicationResult, type DetectedHeaderFooter, type DirectoryIngestOptions, type DocumentEvent, type DocumentInput, type DocumentMetadata, type DocumentQuery, type DocumentStatistics, type DocumentStorage, type DocumentType, type DuplicateGroup, type Element, type ElementType, type EmbeddingConfig, type EmbeddingPipelineConfig, type EmbeddingResult, type EmotionResult, type EmotionType, type Enricher, type EnrichmentConfig, type EnrichmentData, type EnrichmentHandler, type EnrichmentResult, type EnrichmentType, type Entity, type EntityExtractionConfig, type EntityType, type ErrorHandlingConfig, ExcelParser, type ExcelParserOptions, type ExtendedEntity, type ExtendedMetadata, type ExtendedSentimentResult, type ExtractedImage, type ExtractedKeyword, type ExtractedLink, type ExtractedRelation, type ExtractedTable, type ExtractionConfig, type Extractor, type FileStorageOptions, FixedChunker, type FixedChunkingOptions, type FormField, type GoogleVisionConfig, type GoogleVisionFeature, HTMLParser, type HTMLParserOptions, type HeaderFooterOptions, HierarchicalChunker, type HierarchicalChunkingOptions, type ImageData, type ImageExtractionOptions, type ImageExtractor, type 
ImageOutputFormat, type ImageType, IngestEventEmitter, Ingester, type IngesterConfig, type IngesterStatus, JSONParser, type KeywordExtractionConfig, type KeywordExtractionMethod, type LinkExtractionOptions, type LinkPosition, type ListOptions, type ListResult, MarkdownParser, type MarkdownParserOptions, type MemoryStorageOptions, type MergedCellRegion, type MetadataExtractionOptions, type MetadataExtractor, type MongoDBStorageOptions, type NormalizationOptions, type OCRBlock, type OCRBlockType, type OCRConfig, type OCRElement, type OCREngine, type OCREngineFactoryConfig, type OCREngineInterface, type OCREngineType, type OCRFontInfo, type OCROptions, type OCRPreprocessingOptions, type OCRQualityMetrics, type OCRResult, type OverlapConfig, PDFParser, type PDFParserOptions, type PIIDetectionResult, type PIIInstance, type PIIType, type PageSegmentationMode, ParagraphChunker, type ParagraphChunkingOptions, type ParseResult, type Parser, type ParserCapabilities, type ParserOptions, ParserRegistry, type ParserRegistryConfig, type PineconeStorageOptions, Pipeline, PipelineBuilder, type PipelineCallbacks, type PipelineConfig, type PipelineContext, type PipelineError, type PipelineEventEmitter, type PipelineLogger, type PipelineProgress, type PipelineResult, type PipelineStage, type PipelineValidationResult, type PostgresStorageOptions, type ProcessedDocument, type ProcessingError, type QdrantStorageOptions, RecursiveChunker, type RecursiveChunkingOptions, type RedisStorageOptions, type RelationConfig, type SQLiteStorageOptions, SemanticChunker, type SemanticChunkingOptions, SentenceChunker, type SentenceChunkingOptions, type SentimentConfig, type SentimentResult, type SentimentSegment, type SlidingWindowChunkingOptions, type StorageAdapterType, type StorageConfig, type StorageFactoryConfig, type StreamOptions, type SummarizationConfig, type SummaryResult, type SummaryType, type TableData, type TableExtractionOptions, type TableExtractor, type TableOutputFormat, type 
TableStructure, type TesseractConfig, type TextCleaner, type TextNormalizer, TextParser, type TokenCounter, type TopicConfig, type TopicMethod, type TopicResult, type VectorIndexStats, type VectorSearchOptions, type VectorSearchResult, type VectorStorage, type WatchModeConfig, type WeaviateStorageOptions, createCSVParser, createChunker, createChunkerRegistry, createDOCXParser, createEventEmitter, createExcelParser, createFixedChunker, createHTMLParser, createHierarchicalChunker, createIngester, createJSONParser, createMarkdownParser, createPDFParser, createParagraphChunker, createParserRegistry, createPipeline, createPipelineBuilder, createRecursiveChunker, createSemanticChunker, createSentenceChunker, createTextParser, getBuiltInChunkers, getBuiltInParsers, pipelines, registerBuiltInChunkers, registerBuiltInParsers };
|