@kreuzberg/node 4.0.0-rc.8 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +342 -530
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +337 -62
- package/dist/index.d.ts +337 -62
- package/dist/index.js +285 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +277 -56
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +469 -54
- package/dist/types.d.ts +469 -54
- package/dist/types.js.map +1 -1
- package/index.d.ts +662 -1
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +17 -19
package/dist/types.d.mts
CHANGED
|
@@ -4,108 +4,316 @@
|
|
|
4
4
|
* These types mirror the strongly-typed Rust metadata structures,
|
|
5
5
|
* providing type safety for TypeScript users.
|
|
6
6
|
*/
|
|
7
|
+
/**
|
|
8
|
+
* Tesseract OCR engine configuration options.
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```typescript
|
|
12
|
+
* const config: TesseractConfig = {
|
|
13
|
+
* psm: 6,
|
|
14
|
+
* enableTableDetection: true,
|
|
15
|
+
* tesseditCharWhitelist: '0123456789'
|
|
16
|
+
* };
|
|
17
|
+
* ```
|
|
18
|
+
*/
|
|
7
19
|
interface TesseractConfig {
|
|
20
|
+
/**
|
|
21
|
+
* Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.
|
|
22
|
+
* Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).
|
|
23
|
+
* Default: 3 (auto layout analysis).
|
|
24
|
+
*/
|
|
8
25
|
psm?: number;
|
|
26
|
+
/**
|
|
27
|
+
* Enable table detection during OCR processing.
|
|
28
|
+
* When true, Tesseract attempts to preserve table structure in the output.
|
|
29
|
+
* Default: false.
|
|
30
|
+
*/
|
|
9
31
|
enableTableDetection?: boolean;
|
|
32
|
+
/**
|
|
33
|
+
* Whitelist of characters Tesseract should recognize.
|
|
34
|
+
* Only these characters will be returned by the OCR engine.
|
|
35
|
+
* Use empty string to allow all characters. Useful for constraining output to digits,
|
|
36
|
+
* specific alphabets, or other character sets.
|
|
37
|
+
* Default: undefined (recognize all characters).
|
|
38
|
+
*/
|
|
10
39
|
tesseditCharWhitelist?: string;
|
|
11
40
|
}
|
|
41
|
+
/**
|
|
42
|
+
* OCR (Optical Character Recognition) configuration.
|
|
43
|
+
*
|
|
44
|
+
* Controls which OCR engine to use and how it processes images.
|
|
45
|
+
*/
|
|
12
46
|
interface OcrConfig {
|
|
47
|
+
/** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */
|
|
13
48
|
backend: string;
|
|
49
|
+
/** ISO 639-1/3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */
|
|
14
50
|
language?: string;
|
|
51
|
+
/** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
|
|
15
52
|
tesseractConfig?: TesseractConfig;
|
|
16
53
|
}
|
|
54
|
+
/**
|
|
55
|
+
* Document chunking configuration for splitting large documents.
|
|
56
|
+
*
|
|
57
|
+
* Breaks large documents into smaller, manageable chunks while preserving context.
|
|
58
|
+
* Useful for RAG (Retrieval Augmented Generation) and vector database indexing.
|
|
59
|
+
*/
|
|
17
60
|
interface ChunkingConfig {
|
|
61
|
+
/** Maximum characters per chunk. Default: 4096. */
|
|
18
62
|
maxChars?: number;
|
|
63
|
+
/** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. */
|
|
19
64
|
maxOverlap?: number;
|
|
65
|
+
/**
|
|
66
|
+
* Alternative to maxChars: chunk size using different unit.
|
|
67
|
+
* Mutually exclusive with maxChars.
|
|
68
|
+
*/
|
|
20
69
|
chunkSize?: number;
|
|
70
|
+
/**
|
|
71
|
+
* Alternative to maxOverlap: overlap amount using different unit.
|
|
72
|
+
* Mutually exclusive with maxOverlap.
|
|
73
|
+
*/
|
|
21
74
|
chunkOverlap?: number;
|
|
75
|
+
/**
|
|
76
|
+
* Named preset configuration (e.g., 'default', 'aggressive', 'minimal').
|
|
77
|
+
* Uses preset values if neither maxChars nor chunkSize is specified.
|
|
78
|
+
*/
|
|
22
79
|
preset?: string;
|
|
80
|
+
/** Embedding configuration for generating vector embeddings for each chunk. */
|
|
23
81
|
embedding?: Record<string, unknown>;
|
|
82
|
+
/** Enable or disable chunking. Default: true when chunking config is provided. */
|
|
24
83
|
enabled?: boolean;
|
|
25
84
|
}
|
|
85
|
+
/**
|
|
86
|
+
* Language detection configuration.
|
|
87
|
+
*
|
|
88
|
+
* Automatically detects the language(s) of extracted content.
|
|
89
|
+
*/
|
|
26
90
|
interface LanguageDetectionConfig {
|
|
91
|
+
/** Enable automatic language detection. Default: true. */
|
|
27
92
|
enabled?: boolean;
|
|
93
|
+
/** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */
|
|
28
94
|
minConfidence?: number;
|
|
95
|
+
/** Detect multiple languages in the same document. Default: false. */
|
|
29
96
|
detectMultiple?: boolean;
|
|
30
97
|
}
|
|
98
|
+
/**
|
|
99
|
+
* Token reduction configuration for optimizing token usage.
|
|
100
|
+
*
|
|
101
|
+
* Reduces the number of tokens in extracted content while preserving meaning.
|
|
102
|
+
* Useful for reducing costs in LLM pipelines.
|
|
103
|
+
*/
|
|
31
104
|
interface TokenReductionConfig {
|
|
105
|
+
/** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */
|
|
32
106
|
mode?: string;
|
|
107
|
+
/** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
|
|
33
108
|
preserveImportantWords?: boolean;
|
|
34
109
|
}
|
|
110
|
+
/**
|
|
111
|
+
* Hierarchy extraction configuration.
|
|
112
|
+
*
|
|
113
|
+
* Controls document hierarchy detection based on font size clustering.
|
|
114
|
+
*/
|
|
115
|
+
interface HierarchyConfig {
|
|
116
|
+
/** Enable hierarchy extraction. Default: true. */
|
|
117
|
+
enabled?: boolean;
|
|
118
|
+
/** Number of font size clusters (2-10). Default: 6. */
|
|
119
|
+
kClusters?: number;
|
|
120
|
+
/** Include bounding box information. Default: true. */
|
|
121
|
+
includeBbox?: boolean;
|
|
122
|
+
/** OCR coverage threshold (0.0-1.0). Default: null. */
|
|
123
|
+
ocrCoverageThreshold?: number | null;
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* PDF-specific extraction configuration.
|
|
127
|
+
*
|
|
128
|
+
* Controls how PDF documents are processed.
|
|
129
|
+
*/
|
|
35
130
|
interface PdfConfig {
|
|
131
|
+
/** Extract images from PDF pages. Default: true. */
|
|
36
132
|
extractImages?: boolean;
|
|
133
|
+
/** List of passwords to try for password-protected PDFs. */
|
|
37
134
|
passwords?: string[];
|
|
135
|
+
/** Extract document metadata (title, author, creation date, etc.). Default: true. */
|
|
38
136
|
extractMetadata?: boolean;
|
|
137
|
+
/** Hierarchy extraction configuration. */
|
|
138
|
+
hierarchy?: HierarchyConfig;
|
|
39
139
|
}
|
|
140
|
+
/**
|
|
141
|
+
* Image extraction and processing configuration.
|
|
142
|
+
*
|
|
143
|
+
* Controls how images are extracted and optimized from documents.
|
|
144
|
+
*/
|
|
40
145
|
interface ImageExtractionConfig {
|
|
146
|
+
/** Enable image extraction from documents. Default: true. */
|
|
41
147
|
extractImages?: boolean;
|
|
148
|
+
/** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */
|
|
42
149
|
targetDpi?: number;
|
|
150
|
+
/** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. */
|
|
43
151
|
maxImageDimension?: number;
|
|
152
|
+
/** Automatically adjust DPI based on image content and quality. Default: true. */
|
|
44
153
|
autoAdjustDpi?: boolean;
|
|
154
|
+
/** Minimum DPI to maintain for image quality. Default: 72. */
|
|
45
155
|
minDpi?: number;
|
|
156
|
+
/** Maximum DPI to avoid excessive file sizes. Default: 300. */
|
|
46
157
|
maxDpi?: number;
|
|
47
158
|
}
|
|
159
|
+
/**
|
|
160
|
+
* Post-processor configuration for modifying extracted content.
|
|
161
|
+
*
|
|
162
|
+
* Post-processors allow customization and cleanup of extraction results
|
|
163
|
+
* without failing the extraction if they encounter errors.
|
|
164
|
+
*/
|
|
48
165
|
interface PostProcessorConfig {
|
|
166
|
+
/** Enable or disable post-processing entirely. Default: true. */
|
|
49
167
|
enabled?: boolean;
|
|
168
|
+
/** List of processor names to enable (allowlist). When set, only these are used. */
|
|
50
169
|
enabledProcessors?: string[];
|
|
170
|
+
/** List of processor names to disable (denylist). These are skipped. */
|
|
51
171
|
disabledProcessors?: string[];
|
|
52
172
|
}
|
|
173
|
+
/**
|
|
174
|
+
* HTML preprocessing options.
|
|
175
|
+
*
|
|
176
|
+
* Cleans HTML content before conversion to Markdown.
|
|
177
|
+
*/
|
|
53
178
|
interface HtmlPreprocessingOptions {
|
|
179
|
+
/** Enable HTML preprocessing. Default: true. */
|
|
54
180
|
enabled?: boolean;
|
|
181
|
+
/** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */
|
|
55
182
|
preset?: "minimal" | "standard" | "aggressive";
|
|
183
|
+
/** Remove navigation menus and headers. Default: true. */
|
|
56
184
|
removeNavigation?: boolean;
|
|
185
|
+
/** Remove form elements. Default: true. */
|
|
57
186
|
removeForms?: boolean;
|
|
58
187
|
}
|
|
188
|
+
/**
|
|
189
|
+
* HTML to Markdown conversion configuration options.
|
|
190
|
+
*
|
|
191
|
+
* Controls how HTML content is converted to Markdown format, including formatting,
|
|
192
|
+
* escaping, and special handling for various HTML elements.
|
|
193
|
+
*/
|
|
59
194
|
interface HtmlConversionOptions {
|
|
195
|
+
/** Heading style conversion: "atx" (# style), "underlined" (underline style), or "atx_closed" (# style closed). Default: "atx". */
|
|
60
196
|
headingStyle?: "atx" | "underlined" | "atx_closed";
|
|
197
|
+
/** List indentation type: "spaces" or "tabs". Default: "spaces". */
|
|
61
198
|
listIndentType?: "spaces" | "tabs";
|
|
199
|
+
/** Number of spaces/tabs per list indent level. Default: 4. */
|
|
62
200
|
listIndentWidth?: number;
|
|
201
|
+
/** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */
|
|
63
202
|
bullets?: string;
|
|
203
|
+
/** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */
|
|
64
204
|
strongEmSymbol?: string;
|
|
205
|
+
/** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */
|
|
65
206
|
escapeAsterisks?: boolean;
|
|
207
|
+
/** Escape underscores (_) in text to prevent accidental formatting. Default: false. */
|
|
66
208
|
escapeUnderscores?: boolean;
|
|
209
|
+
/** Escape miscellaneous special characters. Default: false. */
|
|
67
210
|
escapeMisc?: boolean;
|
|
211
|
+
/** Escape ASCII control characters. Default: false. */
|
|
68
212
|
escapeAscii?: boolean;
|
|
213
|
+
/** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: undefined. */
|
|
69
214
|
codeLanguage?: string;
|
|
215
|
+
/** Use Markdown autolink syntax (<url>) for links whose text matches their URL, instead of [text](url). Default: true. */
|
|
70
216
|
autolinks?: boolean;
|
|
217
|
+
/** Use the HTML title element as default for links when no text is available. Default: false. */
|
|
71
218
|
defaultTitle?: boolean;
|
|
219
|
+
/** Insert <br> tags in Markdown tables. Default: false. */
|
|
72
220
|
brInTables?: boolean;
|
|
221
|
+
/** Use HOCR spatial table format for better table structure preservation. Default: false. */
|
|
73
222
|
hocrSpatialTables?: boolean;
|
|
223
|
+
/** Highlight style for marked/highlighted text: "double_equal" (==text==), "html" (<mark>), "bold" (**text**), or "none". Default: "none". */
|
|
74
224
|
highlightStyle?: "double_equal" | "html" | "bold" | "none";
|
|
225
|
+
/** Extract metadata from HTML (title, meta tags, etc.). Default: false. */
|
|
75
226
|
extractMetadata?: boolean;
|
|
227
|
+
/** Whitespace handling: "normalized" (collapse whitespace) or "strict" (preserve all whitespace). Default: "normalized". */
|
|
76
228
|
whitespaceMode?: "normalized" | "strict";
|
|
229
|
+
/** Remove newlines from output (convert to single line). Default: false. */
|
|
77
230
|
stripNewlines?: boolean;
|
|
231
|
+
/** Enable line wrapping at specified width. Default: true. */
|
|
78
232
|
wrap?: boolean;
|
|
233
|
+
/** Maximum line width when wrapping is enabled. Default: 80. */
|
|
79
234
|
wrapWidth?: number;
|
|
235
|
+
/** Convert as inline Markdown instead of block elements. Default: false. */
|
|
80
236
|
convertAsInline?: boolean;
|
|
237
|
+
/** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */
|
|
81
238
|
subSymbol?: string;
|
|
239
|
+
/** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */
|
|
82
240
|
supSymbol?: string;
|
|
241
|
+
/** Newline style in output: "spaces" (two spaces + newline) or "backslash" (backslash + newline). Default: "spaces". */
|
|
83
242
|
newlineStyle?: "spaces" | "backslash";
|
|
243
|
+
/** Code block style: "indented" (4-space indent), "backticks" (```), or "tildes" (~~~). Default: "backticks". */
|
|
84
244
|
codeBlockStyle?: "indented" | "backticks" | "tildes";
|
|
245
|
+
/** List of HTML tag names to keep as inline images (don't convert). Default: []. */
|
|
85
246
|
keepInlineImagesIn?: string[];
|
|
247
|
+
/** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */
|
|
86
248
|
encoding?: string;
|
|
249
|
+
/** Enable debug mode for detailed conversion logging. Default: false. */
|
|
87
250
|
debug?: boolean;
|
|
251
|
+
/** List of HTML tag names to remove entirely from output. Default: []. */
|
|
88
252
|
stripTags?: string[];
|
|
253
|
+
/** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */
|
|
89
254
|
preserveTags?: string[];
|
|
255
|
+
/** HTML preprocessing options for cleaning HTML before conversion. */
|
|
90
256
|
preprocessing?: HtmlPreprocessingOptions;
|
|
91
257
|
}
|
|
258
|
+
/** Keyword extraction algorithm type. */
|
|
92
259
|
type KeywordAlgorithm = "yake" | "rake";
|
|
260
|
+
/**
|
|
261
|
+
* YAKE (Yet Another Keyword Extractor) algorithm configuration.
|
|
262
|
+
*
|
|
263
|
+
* YAKE is an unsupervised keyword extraction method that doesn't require training data.
|
|
264
|
+
*/
|
|
93
265
|
interface YakeParams {
|
|
266
|
+
/** Window size for co-occurrence analysis (number of words to consider). Default: 3. */
|
|
94
267
|
windowSize?: number;
|
|
95
268
|
}
|
|
269
|
+
/**
|
|
270
|
+
* RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.
|
|
271
|
+
*
|
|
272
|
+
* RAKE extracts keywords based on word co-occurrence and statistical measures.
|
|
273
|
+
*/
|
|
96
274
|
interface RakeParams {
|
|
275
|
+
/** Minimum word length to consider as keyword. Default: 3. */
|
|
97
276
|
minWordLength?: number;
|
|
277
|
+
/** Maximum number of words per keyword phrase. Default: 3. */
|
|
98
278
|
maxWordsPerPhrase?: number;
|
|
99
279
|
}
|
|
280
|
+
/**
|
|
281
|
+
* Keyword extraction configuration.
|
|
282
|
+
*
|
|
283
|
+
* Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.
|
|
284
|
+
*/
|
|
100
285
|
interface KeywordConfig {
|
|
286
|
+
/** Extraction algorithm: "yake" or "rake". Default: "yake". */
|
|
101
287
|
algorithm?: KeywordAlgorithm;
|
|
288
|
+
/** Maximum number of keywords to extract. Default: 10. */
|
|
102
289
|
maxKeywords?: number;
|
|
290
|
+
/** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */
|
|
103
291
|
minScore?: number;
|
|
292
|
+
/** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */
|
|
104
293
|
ngramRange?: [number, number];
|
|
294
|
+
/** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */
|
|
105
295
|
language?: string;
|
|
296
|
+
/** YAKE algorithm-specific parameters. Only used when algorithm is "yake". */
|
|
106
297
|
yakeParams?: YakeParams;
|
|
298
|
+
/** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
|
|
107
299
|
rakeParams?: RakeParams;
|
|
108
300
|
}
|
|
301
|
+
/**
|
|
302
|
+
* Extracted keyword with relevance metadata.
|
|
303
|
+
*
|
|
304
|
+
* Represents a single keyword extracted from text along with its relevance score,
|
|
305
|
+
* the algorithm that extracted it, and optional position information.
|
|
306
|
+
*/
|
|
307
|
+
interface ExtractedKeyword {
|
|
308
|
+
/** The keyword text */
|
|
309
|
+
text: string;
|
|
310
|
+
/** Relevance score (higher is better, algorithm-specific range) */
|
|
311
|
+
score: number;
|
|
312
|
+
/** Algorithm that extracted this keyword */
|
|
313
|
+
algorithm: KeywordAlgorithm;
|
|
314
|
+
/** Optional positions where keyword appears in text (character offsets) */
|
|
315
|
+
positions?: number[];
|
|
316
|
+
}
|
|
109
317
|
/**
|
|
110
318
|
* Page tracking and extraction configuration.
|
|
111
319
|
*
|
|
@@ -113,7 +321,7 @@ interface KeywordConfig {
|
|
|
113
321
|
* Page range information in chunk metadata (first_page/last_page) is automatically
|
|
114
322
|
* enabled when page boundaries are available and chunking is configured.
|
|
115
323
|
*/
|
|
116
|
-
interface PageConfig {
|
|
324
|
+
interface PageExtractionConfig {
|
|
117
325
|
/** Extract pages as separate array (ExtractionResult.pages) */
|
|
118
326
|
extractPages?: boolean;
|
|
119
327
|
/** Insert page markers in main content string */
|
|
@@ -121,25 +329,53 @@ interface PageConfig {
|
|
|
121
329
|
/** Page marker format (use {page_num} placeholder) */
|
|
122
330
|
markerFormat?: string;
|
|
123
331
|
}
|
|
332
|
+
/**
|
|
333
|
+
* Main extraction configuration interface.
|
|
334
|
+
*
|
|
335
|
+
* Combines all sub-configurations for document extraction, OCR, chunking, post-processing, etc.
|
|
336
|
+
* All fields are optional and use sensible defaults.
|
|
337
|
+
*/
|
|
124
338
|
interface ExtractionConfig {
|
|
339
|
+
/** Enable caching of extraction results for identical inputs. Default: true. */
|
|
125
340
|
useCache?: boolean;
|
|
341
|
+
/** Enable quality processing filters to improve extraction reliability. Default: false. */
|
|
126
342
|
enableQualityProcessing?: boolean;
|
|
343
|
+
/** OCR configuration for text extraction from images. Only used when document contains images or forceOcr is true. */
|
|
127
344
|
ocr?: OcrConfig;
|
|
345
|
+
/** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
|
|
128
346
|
forceOcr?: boolean;
|
|
347
|
+
/** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
|
|
129
348
|
chunking?: ChunkingConfig;
|
|
349
|
+
/** Image extraction and optimization configuration. */
|
|
130
350
|
images?: ImageExtractionConfig;
|
|
351
|
+
/** PDF-specific extraction options (passwords, metadata, etc.). */
|
|
131
352
|
pdfOptions?: PdfConfig;
|
|
353
|
+
/** Token reduction configuration for optimizing token usage in LLM pipelines. */
|
|
132
354
|
tokenReduction?: TokenReductionConfig;
|
|
355
|
+
/** Language detection configuration for automatic language identification. */
|
|
133
356
|
languageDetection?: LanguageDetectionConfig;
|
|
357
|
+
/** Post-processor configuration for customizing extraction results. */
|
|
134
358
|
postprocessor?: PostProcessorConfig;
|
|
359
|
+
/** HTML to Markdown conversion options for HTML content. */
|
|
135
360
|
htmlOptions?: HtmlConversionOptions;
|
|
361
|
+
/** Keyword extraction configuration for extracting important phrases. */
|
|
136
362
|
keywords?: KeywordConfig;
|
|
137
|
-
|
|
363
|
+
/** Page tracking and extraction configuration for multi-page documents. */
|
|
364
|
+
pages?: PageExtractionConfig;
|
|
365
|
+
/** Maximum number of concurrent extractions in batch operations. Default: 4. */
|
|
138
366
|
maxConcurrentExtractions?: number;
|
|
139
367
|
}
|
|
368
|
+
/**
|
|
369
|
+
* Extracted table data from document.
|
|
370
|
+
*
|
|
371
|
+
* Contains both cell data and Markdown representation for easy display and processing.
|
|
372
|
+
*/
|
|
140
373
|
interface Table {
|
|
374
|
+
/** 2D array of cell contents (rows × columns) */
|
|
141
375
|
cells: string[][];
|
|
376
|
+
/** Markdown representation of the table for display or parsing */
|
|
142
377
|
markdown: string;
|
|
378
|
+
/** Page number where this table was found (1-indexed) */
|
|
143
379
|
pageNumber: number;
|
|
144
380
|
}
|
|
145
381
|
interface ExcelMetadata {
|
|
@@ -180,28 +416,50 @@ interface TextMetadata {
|
|
|
180
416
|
links?: [string, string][] | null;
|
|
181
417
|
codeBlocks?: [string, string][] | null;
|
|
182
418
|
}
|
|
419
|
+
interface HeaderMetadata {
|
|
420
|
+
level: number;
|
|
421
|
+
text: string;
|
|
422
|
+
id?: string | null;
|
|
423
|
+
depth: number;
|
|
424
|
+
htmlOffset: number;
|
|
425
|
+
}
|
|
426
|
+
interface LinkMetadata {
|
|
427
|
+
href: string;
|
|
428
|
+
text: string;
|
|
429
|
+
title?: string | null;
|
|
430
|
+
linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
|
|
431
|
+
rel: string[];
|
|
432
|
+
attributes: Record<string, string>;
|
|
433
|
+
}
|
|
434
|
+
interface HtmlImageMetadata {
|
|
435
|
+
src: string;
|
|
436
|
+
alt?: string | null;
|
|
437
|
+
title?: string | null;
|
|
438
|
+
dimensions?: [number, number] | null;
|
|
439
|
+
imageType: "data_uri" | "inline_svg" | "external" | "relative";
|
|
440
|
+
attributes: Record<string, string>;
|
|
441
|
+
}
|
|
442
|
+
interface StructuredData {
|
|
443
|
+
dataType: "json_ld" | "microdata" | "rdfa";
|
|
444
|
+
rawJson: string;
|
|
445
|
+
schemaType?: string | null;
|
|
446
|
+
}
|
|
183
447
|
interface HtmlMetadata {
|
|
184
448
|
title?: string | null;
|
|
185
449
|
description?: string | null;
|
|
186
|
-
keywords
|
|
450
|
+
keywords: string[];
|
|
187
451
|
author?: string | null;
|
|
188
|
-
|
|
452
|
+
canonicalUrl?: string | null;
|
|
189
453
|
baseHref?: string | null;
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
twitterImage?: string | null;
|
|
200
|
-
twitterSite?: string | null;
|
|
201
|
-
twitterCreator?: string | null;
|
|
202
|
-
linkAuthor?: string | null;
|
|
203
|
-
linkLicense?: string | null;
|
|
204
|
-
linkAlternate?: string | null;
|
|
454
|
+
language?: string | null;
|
|
455
|
+
textDirection?: "ltr" | "rtl" | "auto" | null;
|
|
456
|
+
openGraph: Record<string, string>;
|
|
457
|
+
twitterCard: Record<string, string>;
|
|
458
|
+
metaTags: Record<string, string>;
|
|
459
|
+
htmlHeaders: HeaderMetadata[];
|
|
460
|
+
htmlLinks: LinkMetadata[];
|
|
461
|
+
htmlImages: HtmlImageMetadata[];
|
|
462
|
+
structuredData: StructuredData[];
|
|
205
463
|
}
|
|
206
464
|
interface PdfMetadata {
|
|
207
465
|
title?: string | null;
|
|
@@ -329,38 +587,62 @@ interface ChunkMetadata {
|
|
|
329
587
|
/** Last page number this chunk spans (1-indexed, only when page tracking enabled) */
|
|
330
588
|
lastPage?: number | null;
|
|
331
589
|
}
|
|
590
|
+
/**
|
|
591
|
+
* Text chunk with optional embedding.
|
|
592
|
+
*
|
|
593
|
+
* Represents a segment of a document created by the chunking algorithm, useful for RAG and vector databases.
|
|
594
|
+
*/
|
|
332
595
|
interface Chunk {
|
|
596
|
+
/** Text content of this chunk */
|
|
333
597
|
content: string;
|
|
598
|
+
/** Vector embedding for this chunk (if embedding model was used) */
|
|
334
599
|
embedding?: number[] | null;
|
|
600
|
+
/** Metadata about chunk position and properties in the document */
|
|
335
601
|
metadata: ChunkMetadata;
|
|
336
602
|
}
|
|
603
|
+
/**
|
|
604
|
+
* Extracted image from document with optional OCR result.
|
|
605
|
+
*
|
|
606
|
+
* Contains image data and metadata about position, dimensions, and properties.
|
|
607
|
+
*/
|
|
337
608
|
interface ExtractedImage {
|
|
609
|
+
/** Raw image bytes as Uint8Array */
|
|
338
610
|
data: Uint8Array;
|
|
611
|
+
/** Image format (e.g., 'png', 'jpeg', 'tiff') */
|
|
339
612
|
format: string;
|
|
613
|
+
/** Sequential index of this image in the document (0-indexed) */
|
|
340
614
|
imageIndex: number;
|
|
615
|
+
/** Page number where this image was found (1-indexed), null if unknown */
|
|
341
616
|
pageNumber?: number | null;
|
|
617
|
+
/** Image width in pixels, null if unknown */
|
|
342
618
|
width?: number | null;
|
|
619
|
+
/** Image height in pixels, null if unknown */
|
|
343
620
|
height?: number | null;
|
|
621
|
+
/** Color space (e.g., 'RGB', 'CMYK', 'Grayscale'), null if unknown */
|
|
344
622
|
colorspace?: string | null;
|
|
623
|
+
/** Bits per color component (e.g., 8 for 8-bit), null if unknown */
|
|
345
624
|
bitsPerComponent?: number | null;
|
|
625
|
+
/** Whether this is a mask image (used internally by PDF) */
|
|
346
626
|
isMask: boolean;
|
|
627
|
+
/** Image description or caption if available */
|
|
347
628
|
description?: string | null;
|
|
629
|
+
/** OCR extraction result if OCR was run on this image, null otherwise */
|
|
348
630
|
ocrResult?: ExtractionResult | null;
|
|
349
631
|
}
|
|
350
632
|
/**
|
|
351
633
|
* Content for a single page/slide/sheet.
|
|
352
634
|
*
|
|
353
635
|
* When page extraction is enabled, documents are split into per-page content
|
|
354
|
-
* with associated tables and images mapped to each page.
|
|
636
|
+
* with associated tables and images mapped to each page. This allows for page-specific processing.
|
|
355
637
|
*/
|
|
356
638
|
interface PageContent {
|
|
357
|
-
/** Page number (1-indexed) */
|
|
639
|
+
/** Page number (1-indexed, i.e. the first page is 1) */
|
|
358
640
|
pageNumber: number;
|
|
359
|
-
/** Text content
|
|
641
|
+
/** Text content extracted from this page */
|
|
360
642
|
content: string;
|
|
361
|
-
/** Tables found
|
|
643
|
+
/** Tables found and extracted from this page */
|
|
362
644
|
tables: Table[];
|
|
363
|
-
/** Images found
|
|
645
|
+
/** Images found and extracted from this page */
|
|
364
646
|
images: ExtractedImage[];
|
|
365
647
|
}
|
|
366
648
|
/**
|
|
@@ -413,23 +695,17 @@ interface Metadata {
|
|
|
413
695
|
headers?: string[] | null;
|
|
414
696
|
links?: [string, string][] | null;
|
|
415
697
|
code_blocks?: [string, string][] | null;
|
|
416
|
-
|
|
698
|
+
canonical_url?: string | null;
|
|
417
699
|
base_href?: string | null;
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
twitter_image?: string | null;
|
|
428
|
-
twitter_site?: string | null;
|
|
429
|
-
twitter_creator?: string | null;
|
|
430
|
-
link_author?: string | null;
|
|
431
|
-
link_license?: string | null;
|
|
432
|
-
link_alternate?: string | null;
|
|
700
|
+
open_graph?: Record<string, string>;
|
|
701
|
+
twitter_card?: Record<string, string>;
|
|
702
|
+
meta_tags?: Record<string, string>;
|
|
703
|
+
html_language?: string | null;
|
|
704
|
+
text_direction?: "ltr" | "rtl" | "auto" | null;
|
|
705
|
+
html_headers?: HeaderMetadata[];
|
|
706
|
+
html_links?: LinkMetadata[];
|
|
707
|
+
html_images?: HtmlImageMetadata[];
|
|
708
|
+
structured_data?: StructuredData[];
|
|
433
709
|
psm?: number;
|
|
434
710
|
output_format?: string;
|
|
435
711
|
table_count?: number;
|
|
@@ -439,92 +715,141 @@ interface Metadata {
|
|
|
439
715
|
json_schema?: Record<string, unknown> | null;
|
|
440
716
|
page_structure?: PageStructure | null;
|
|
441
717
|
error?: ErrorMetadata | null;
|
|
442
|
-
|
|
718
|
+
/**
|
|
719
|
+
* Additional fields may be added at runtime by postprocessors.
|
|
720
|
+
* Use bracket notation to safely access unexpected properties.
|
|
721
|
+
*/
|
|
722
|
+
[key: string]: unknown;
|
|
443
723
|
}
|
|
724
|
+
/**
|
|
725
|
+
* Complete extraction result from document processing.
|
|
726
|
+
*
|
|
727
|
+
* Contains all extracted content, metadata, and optional processed data like chunks and images.
|
|
728
|
+
* This is the primary return value from extraction functions.
|
|
729
|
+
*/
|
|
444
730
|
interface ExtractionResult {
|
|
731
|
+
/** Extracted text content from the document (main content) */
|
|
445
732
|
content: string;
|
|
733
|
+
/** MIME type of the input document (e.g., 'application/pdf', 'text/html') */
|
|
446
734
|
mimeType: string;
|
|
735
|
+
/** Document metadata including title, author, creation date, language, and format-specific fields */
|
|
447
736
|
metadata: Metadata;
|
|
737
|
+
/** Tables extracted from the document (2D cell arrays with Markdown representation) */
|
|
448
738
|
tables: Table[];
|
|
739
|
+
/** Detected languages in the document (ISO 639-1 codes, e.g., ['en', 'de']), null if detection disabled */
|
|
449
740
|
detectedLanguages: string[] | null;
|
|
741
|
+
/** Document chunks for RAG/vector databases (if chunking was enabled), null otherwise */
|
|
450
742
|
chunks: Chunk[] | null;
|
|
743
|
+
/** Images extracted from document with metadata (if image extraction was enabled), null otherwise */
|
|
451
744
|
images: ExtractedImage[] | null;
|
|
745
|
+
/** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
|
|
452
746
|
pages?: PageContent[] | null;
|
|
747
|
+
/** Extracted keywords when keyword extraction is enabled, null otherwise */
|
|
748
|
+
keywords?: ExtractedKeyword[] | null;
|
|
453
749
|
}
|
|
750
|
+
/** Post-processor execution stage in the extraction pipeline. */
|
|
454
751
|
type ProcessingStage = "early" | "middle" | "late";
|
|
752
|
+
/**
|
|
753
|
+
* Protocol for custom post-processors that modify extraction results.
|
|
754
|
+
*
|
|
755
|
+
* Post-processors enrich or transform extraction results without failing the extraction.
|
|
756
|
+
* If a post-processor throws an error, it's logged but extraction continues.
|
|
757
|
+
* Only works with async extraction functions (`extractFile`, `extractBytes`, etc.).
|
|
758
|
+
*/
|
|
455
759
|
interface PostProcessorProtocol {
|
|
456
760
|
/**
|
|
457
761
|
* Return the unique name of this postprocessor.
|
|
762
|
+
*
|
|
763
|
+
* @returns Unique processor name (case-sensitive, alphanumeric + underscores recommended)
|
|
458
764
|
*/
|
|
459
765
|
name(): string;
|
|
460
766
|
/**
|
|
461
767
|
* Process and enrich an extraction result.
|
|
462
768
|
*
|
|
769
|
+
* Modify the result to add new metadata, transform content, or perform other enrichment.
|
|
770
|
+
* If this throws an error, it's logged but extraction continues.
|
|
771
|
+
*
|
|
463
772
|
* @param result - ExtractionResult with extracted content, metadata, and tables
|
|
464
|
-
* @returns Modified result with enriched
|
|
773
|
+
* @returns Modified result with enriched data. Can be async or sync.
|
|
465
774
|
*/
|
|
466
775
|
process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
|
|
467
776
|
/**
|
|
468
777
|
* Return the processing stage for this processor.
|
|
469
778
|
*
|
|
779
|
+
* Determines when this processor runs relative to others:
|
|
780
|
+
* - "early": Runs first, before other processors (good for cleanup/normalization)
|
|
781
|
+
* - "middle": Runs with other middle-stage processors (default)
|
|
782
|
+
* - "late": Runs last, after others (good for final enrichment)
|
|
783
|
+
*
|
|
470
784
|
* @returns One of "early", "middle", or "late" (default: "middle")
|
|
471
785
|
*/
|
|
472
786
|
processingStage?(): ProcessingStage;
|
|
473
787
|
/**
|
|
474
|
-
* Initialize the processor (e.g., load ML models).
|
|
788
|
+
* Initialize the processor (e.g., load ML models, setup resources).
|
|
475
789
|
*
|
|
476
|
-
* Called once when the processor is registered.
|
|
790
|
+
* Called once when the processor is first registered. Use for expensive operations.
|
|
477
791
|
*/
|
|
478
792
|
initialize?(): void | Promise<void>;
|
|
479
793
|
/**
|
|
480
794
|
* Shutdown the processor and release resources.
|
|
481
795
|
*
|
|
482
|
-
* Called when the processor is unregistered.
|
|
796
|
+
* Called when the processor is unregistered. Use for cleanup (closing connections, freeing memory).
|
|
483
797
|
*/
|
|
484
798
|
shutdown?(): void | Promise<void>;
|
|
485
799
|
}
|
|
800
|
+
/**
|
|
801
|
+
* Protocol for custom validators that check extraction results.
|
|
802
|
+
*
|
|
803
|
+
* Validators perform quality checks and fail the extraction if validation fails.
|
|
804
|
+
* Unlike post-processors, validator errors cause the entire extraction to fail.
|
|
805
|
+
* Useful for enforcing quality standards on extracted content.
|
|
806
|
+
*/
|
|
486
807
|
interface ValidatorProtocol {
|
|
487
808
|
/**
|
|
488
809
|
* Return the unique name of this validator.
|
|
810
|
+
*
|
|
811
|
+
* @returns Unique validator name (case-sensitive, alphanumeric + underscores recommended)
|
|
489
812
|
*/
|
|
490
813
|
name(): string;
|
|
491
814
|
/**
|
|
492
815
|
* Validate an extraction result.
|
|
493
816
|
*
|
|
494
|
-
* Throw an error if validation fails. The error message
|
|
495
|
-
* If validation passes, return without throwing.
|
|
817
|
+
* Throw an error if validation fails. The error message will be used as the extraction error.
|
|
818
|
+
* If validation passes, return without throwing (return value is ignored).
|
|
496
819
|
*
|
|
497
820
|
* @param result - ExtractionResult to validate
|
|
498
|
-
* @throws Error
|
|
821
|
+
* @throws {Error} If validation fails (extraction will fail with this error)
|
|
499
822
|
*/
|
|
500
823
|
validate(result: ExtractionResult): void | Promise<void>;
|
|
501
824
|
/**
|
|
502
825
|
* Return the validation priority.
|
|
503
826
|
*
|
|
504
|
-
* Higher priority validators run first. Useful for running cheap validations
|
|
827
|
+
* Higher priority validators run first. Useful for running cheap validations (e.g., length checks)
|
|
828
|
+
* before expensive ones (e.g., AI-based quality checks) to fail fast.
|
|
505
829
|
*
|
|
506
|
-
* @returns Priority value (higher = runs earlier, default: 50)
|
|
830
|
+
* @returns Priority value (higher = runs earlier, default: 50). Range: 0-1000.
|
|
507
831
|
*/
|
|
508
832
|
priority?(): number;
|
|
509
833
|
/**
|
|
510
834
|
* Check if this validator should run for a given result.
|
|
511
835
|
*
|
|
512
836
|
* Allows conditional validation based on MIME type, metadata, or content.
|
|
837
|
+
* This is evaluated before validation, so expensive checks can be skipped for irrelevant documents.
|
|
513
838
|
*
|
|
514
839
|
* @param result - ExtractionResult to check
|
|
515
840
|
* @returns true if validator should run, false to skip (default: true)
|
|
516
841
|
*/
|
|
517
842
|
shouldValidate?(result: ExtractionResult): boolean;
|
|
518
843
|
/**
|
|
519
|
-
* Initialize the validator.
|
|
844
|
+
* Initialize the validator (e.g., load ML models, setup resources).
|
|
520
845
|
*
|
|
521
|
-
* Called once when the validator is registered.
|
|
846
|
+
* Called once when the validator is first registered. Use for expensive operations.
|
|
522
847
|
*/
|
|
523
848
|
initialize?(): void | Promise<void>;
|
|
524
849
|
/**
|
|
525
850
|
* Shutdown the validator and release resources.
|
|
526
851
|
*
|
|
527
|
-
* Called when the validator is unregistered.
|
|
852
|
+
* Called when the validator is unregistered. Use for cleanup (closing connections, freeing memory).
|
|
528
853
|
*/
|
|
529
854
|
shutdown?(): void | Promise<void>;
|
|
530
855
|
}
|
|
@@ -662,5 +987,95 @@ interface OcrBackendProtocol {
|
|
|
662
987
|
*/
|
|
663
988
|
shutdown?(): void | Promise<void>;
|
|
664
989
|
}
|
|
990
|
+
/**
|
|
991
|
+
* Result of error message classification into error codes.
|
|
992
|
+
*
|
|
993
|
+
* Provides classification details including the error code, name,
|
|
994
|
+
* description, and confidence score for the classification.
|
|
995
|
+
*
|
|
996
|
+
* @example
|
|
997
|
+
* ```typescript
|
|
998
|
+
* import { classifyError, ErrorCode } from '@kreuzberg/node';
|
|
999
|
+
*
|
|
1000
|
+
* const result = classifyError("File not found in read operation");
|
|
1001
|
+
* if (result.code === ErrorCode.IoError) {
|
|
1002
|
+
* console.error(`I/O Error: ${result.description}`);
|
|
1003
|
+
* console.log(`Confidence: ${result.confidence}`);
|
|
1004
|
+
* }
|
|
1005
|
+
* ```
|
|
1006
|
+
*/
|
|
1007
|
+
interface ErrorClassification {
|
|
1008
|
+
/**
|
|
1009
|
+
* The numeric error code (0-7) representing the error type.
|
|
1010
|
+
*/
|
|
1011
|
+
code: number;
|
|
1012
|
+
/**
|
|
1013
|
+
* The human-readable name of the error code (e.g., "validation", "ocr").
|
|
1014
|
+
*/
|
|
1015
|
+
name: string;
|
|
1016
|
+
/**
|
|
1017
|
+
* A brief description of the error type.
|
|
1018
|
+
*/
|
|
1019
|
+
description: string;
|
|
1020
|
+
/**
|
|
1021
|
+
* Confidence score (0.0-1.0) indicating how certain the classification is.
|
|
1022
|
+
* Higher values indicate higher confidence in the classification.
|
|
1023
|
+
*/
|
|
1024
|
+
confidence: number;
|
|
1025
|
+
}
|
|
1026
|
+
/**
|
|
1027
|
+
* Opaque handle to a worker pool for concurrent extraction operations.
|
|
1028
|
+
*
|
|
1029
|
+
* Worker pools enable parallel processing of CPU-bound document extraction
|
|
1030
|
+
* tasks by distributing work across multiple threads. This is especially
|
|
1031
|
+
* useful for batch processing large numbers of documents.
|
|
1032
|
+
*
|
|
1033
|
+
* @example
|
|
1034
|
+
* ```typescript
|
|
1035
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1036
|
+
*
|
|
1037
|
+
* const pool = createWorkerPool(4); // 4 concurrent workers
|
|
1038
|
+
* try {
|
|
1039
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1040
|
+
* console.log(result.content);
|
|
1041
|
+
* } finally {
|
|
1042
|
+
* await closeWorkerPool(pool);
|
|
1043
|
+
* }
|
|
1044
|
+
* ```
|
|
1045
|
+
*/
|
|
1046
|
+
interface WorkerPool {
|
|
1047
|
+
/** Internal pool identifier (opaque) */
|
|
1048
|
+
readonly poolId: number;
|
|
1049
|
+
}
|
|
1050
|
+
/**
|
|
1051
|
+
* Worker pool statistics.
|
|
1052
|
+
*
|
|
1053
|
+
* Provides information about the current state of a worker pool including
|
|
1054
|
+
* pool size, number of active workers, and queued tasks.
|
|
1055
|
+
*
|
|
1056
|
+
* @example
|
|
1057
|
+
* ```typescript
|
|
1058
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1059
|
+
*
|
|
1060
|
+
* const pool = createWorkerPool(4);
|
|
1061
|
+
* const stats = getWorkerPoolStats(pool);
|
|
1062
|
+
* console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
|
|
1063
|
+
* console.log(`Queued: ${stats.queuedTasks}`);
|
|
1064
|
+
* ```
|
|
1065
|
+
*/
|
|
1066
|
+
interface WorkerPoolStats {
|
|
1067
|
+
/**
|
|
1068
|
+
* Maximum number of concurrent workers in the pool.
|
|
1069
|
+
*/
|
|
1070
|
+
size: number;
|
|
1071
|
+
/**
|
|
1072
|
+
* Number of currently active (executing) workers.
|
|
1073
|
+
*/
|
|
1074
|
+
activeWorkers: number;
|
|
1075
|
+
/**
|
|
1076
|
+
* Number of tasks waiting in the queue.
|
|
1077
|
+
*/
|
|
1078
|
+
queuedTasks: number;
|
|
1079
|
+
}
|
|
665
1080
|
|
|
666
|
-
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary,
|
|
1081
|
+
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
|