@kreuzberg/node 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,195 +1,153 @@
1
+ import { ErrorClassification, ExtractionConfig, ExtractionResult, WorkerPool, WorkerPoolStats, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
2
+ export { Chunk, ChunkingConfig, ExtractedImage, HtmlConversionOptions, HtmlPreprocessingOptions, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig } from './types.js';
1
3
  import { PanicContext } from './errors.js';
2
4
  export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
3
- import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
4
- export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
5
5
  export { GutenOcrBackend } from './ocr/guten-ocr.js';
6
6
 
7
7
  /**
8
- * Kreuzberg - Multi-language document intelligence framework.
9
- *
10
- * This is a TypeScript SDK around a high-performance Rust core.
11
- * All extraction logic, chunking, quality processing, and language detection
12
- * are implemented in Rust for maximum performance.
13
- *
14
- * ## API Usage Recommendations
15
- *
16
- * **For processing multiple documents**, prefer batch APIs:
17
- * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
18
- * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
19
- *
20
- * **Batch APIs provide**:
21
- * - Better performance (parallel processing in Rust)
22
- * - More reliable memory management
23
- * - Recommended for all multi-document workflows
8
+ * Get the error code for the last FFI error.
24
9
  *
25
- * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
26
- * - One-off document processing
27
- * - Interactive applications processing documents on-demand
28
- * - Avoid calling these in tight loops - use batch APIs instead
10
+ * Returns the FFI error code as an integer. This is useful for programmatic error handling
11
+ * and distinguishing between different types of failures in native code.
29
12
  *
30
- * ## Supported Formats
13
+ * Error codes:
14
+ * - 0: Success (no error)
15
+ * - 1: GenericError
16
+ * - 2: Panic
17
+ * - 3: InvalidArgument
18
+ * - 4: IoError
19
+ * - 5: ParsingError
20
+ * - 6: OcrError
21
+ * - 7: MissingDependency
31
22
  *
32
- * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
33
- * - **Text**: Markdown, Plain Text, XML
34
- * - **Web**: HTML (converted to Markdown)
35
- * - **Data**: JSON, YAML, TOML
36
- * - **Email**: EML, MSG
37
- * - **Images**: PNG, JPEG, TIFF (with OCR support)
23
+ * @returns The integer error code
38
24
  *
39
25
  * @example
40
26
  * ```typescript
41
- * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
42
- *
43
- * // Single file extraction
44
- * const result = await extractFile('document.pdf');
45
- * console.log(result.content);
27
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
46
28
  *
47
- * // Multiple files (recommended approach)
48
- * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
49
- * const results = await batchExtractFiles(files);
50
- * results.forEach(r => console.log(r.content));
29
+ * try {
30
+ * const result = await extractFile('document.pdf');
31
+ * } catch (error) {
32
+ * const code = getLastErrorCode();
33
+ * if (code === ErrorCode.Panic) {
34
+ * console.error('Native code panic detected');
35
+ * }
36
+ * }
51
37
  * ```
52
38
  */
53
-
54
- /**
55
- * @internal Allows tests to provide a mocked native binding.
56
- */
57
- declare function __setBindingForTests(mock: unknown): void;
58
- /**
59
- * @internal Resets the cached native binding for tests.
60
- */
61
- declare function __resetBindingForTests(): void;
39
+ declare function getLastErrorCode(): number;
62
40
  /**
63
- * Extract content from a single file (synchronous).
41
+ * Get panic context information if the last error was a panic.
64
42
  *
65
- * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
66
- * provides better performance and memory management.
43
+ * Returns detailed information about a panic in native code, or null if the last error was not a panic.
44
+ * This provides debugging information when native code panics.
67
45
  *
68
- * @param filePath - Path to the file to extract (string). Can be absolute or relative.
69
- * @param mimeType - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content.
70
- * @param config - Extraction configuration object. If null, uses default extraction settings.
71
- * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
72
- * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
73
- * @throws {ParsingError} When document format is invalid or corrupted
74
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
75
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
76
- * @throws {KreuzbergError} For other extraction-related failures
46
+ * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
77
47
  *
78
48
  * @example
79
49
  * ```typescript
80
- * import { extractFileSync } from '@kreuzberg/node';
81
- *
82
- * // Basic usage
83
- * const result = extractFileSync('document.pdf');
84
- * console.log(result.content);
50
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
85
51
  *
86
- * // With OCR configuration
87
- * const config = {
88
- * ocr: {
89
- * backend: 'tesseract',
90
- * language: 'eng',
91
- * tesseractConfig: {
92
- * psm: 6,
93
- * enableTableDetection: true,
94
- * },
95
- * },
96
- * };
97
- * const result2 = extractFileSync('scanned.pdf', null, config);
52
+ * try {
53
+ * const result = await extractFile('document.pdf');
54
+ * } catch (error) {
55
+ * const context = getLastPanicContext();
56
+ * if (context) {
57
+ * console.error(`Panic at ${context.file}:${context.line}`);
58
+ * console.error(`In function: ${context.function}`);
59
+ * console.error(`Message: ${context.message}`);
60
+ * }
61
+ * }
98
62
  * ```
99
63
  */
100
- declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
64
+ declare function getLastPanicContext(): PanicContext | null;
101
65
  /**
102
- * Extract content from a single file (asynchronous).
66
+ * Returns the human-readable name for an error code.
103
67
  *
104
- * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
105
- * provides better performance and memory management.
68
+ * Maps numeric error codes to their string names, providing a consistent way
69
+ * to get error code names across all platforms.
106
70
  *
107
- * @param filePath - Path to the file to extract (string). Can be absolute or relative.
108
- * @param mimeType - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content.
109
- * @param config - Extraction configuration object. If null, uses default extraction settings.
110
- * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
111
- * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
112
- * @throws {ParsingError} When document format is invalid or corrupted
113
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
114
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
115
- * @throws {KreuzbergError} For other extraction-related failures
71
+ * @param code - The numeric error code (0-7)
72
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
116
73
  *
117
74
  * @example
118
75
  * ```typescript
119
- * import { extractFile } from '@kreuzberg/node';
120
- *
121
- * // Basic usage
122
- * const result = await extractFile('document.pdf');
123
- * console.log(result.content);
76
+ * import { getErrorCodeName } from '@kreuzberg/node';
124
77
  *
125
- * // With chunking enabled
126
- * const config = {
127
- * chunking: {
128
- * maxChars: 1000,
129
- * maxOverlap: 200,
130
- * },
131
- * };
132
- * const result2 = await extractFile('long_document.pdf', null, config);
133
- * console.log(result2.chunks); // Array of text chunks
78
+ * const validationName = getErrorCodeName(0); // returns "validation"
79
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
80
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
134
81
  * ```
135
82
  */
136
- declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
83
+ declare function getErrorCodeName(code: number): string;
137
84
  /**
138
- * Extract content from raw bytes (synchronous).
85
+ * Returns the description for an error code.
139
86
  *
140
- * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
141
- * which provides better performance and memory management.
87
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
142
88
  *
143
- * @param data - File content as Uint8Array (Buffer will be converted)
144
- * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
145
- * @param config - Extraction configuration object. If null, uses default extraction settings.
146
- * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
147
- * @throws {TypeError} When data is not a valid Uint8Array
148
- * @throws {Error} When file cannot be read or parsed
149
- * @throws {ParsingError} When document format is invalid or corrupted
150
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
151
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
152
- * @throws {KreuzbergError} For other extraction-related failures
89
+ * @param code - The numeric error code (0-7)
90
+ * @returns A brief description of the error type
153
91
  *
154
92
  * @example
155
93
  * ```typescript
156
- * import { extractBytesSync } from '@kreuzberg/node';
157
- * import { readFileSync } from 'fs';
94
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
158
95
  *
159
- * const data = readFileSync('document.pdf');
160
- * const result = extractBytesSync(data, 'application/pdf');
161
- * console.log(result.content);
96
+ * const validationDesc = getErrorCodeDescription(0); // returns "Input validation error"
97
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
98
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
162
99
  * ```
163
100
  */
164
- declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
101
+ declare function getErrorCodeDescription(code: number): string;
165
102
  /**
166
- * Extract content from raw bytes (asynchronous).
103
+ * Classifies an error message string into an error code category.
167
104
  *
168
- * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
169
- * which provides better performance and memory management.
105
+ * This function analyzes the error message content and returns the most likely
106
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
107
+ * errors for handling purposes.
170
108
  *
171
- * @param data - File content as Uint8Array (Buffer will be converted)
172
- * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
173
- * @param config - Extraction configuration object. If null, uses default extraction settings.
174
- * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
175
- * @throws {TypeError} When data is not a valid Uint8Array
176
- * @throws {Error} When file cannot be read or parsed
177
- * @throws {ParsingError} When document format is invalid or corrupted
178
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
179
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
180
- * @throws {KreuzbergError} For other extraction-related failures
109
+ * The classification is based on keyword matching:
110
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
111
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
112
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
113
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
114
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
115
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
116
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
117
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
118
+ *
119
+ * @param errorMessage - The error message string to classify
120
+ * @returns An object with the classification details
181
121
  *
182
122
  * @example
183
123
  * ```typescript
184
- * import { extractBytes } from '@kreuzberg/node';
185
- * import { readFile } from 'fs/promises';
124
+ * import { classifyError } from '@kreuzberg/node';
186
125
  *
187
- * const data = await readFile('document.pdf');
188
- * const result = await extractBytes(data, 'application/pdf');
189
- * console.log(result.content);
126
+ * const result = classifyError("PDF file is corrupted");
127
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
128
+ *
129
+ * const result2 = classifyError("Tesseract not found");
130
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
190
131
  * ```
191
132
  */
192
- declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
133
+ declare function classifyError(errorMessage: string): ErrorClassification;
134
+
135
+ /**
136
+ * Batch extraction APIs for processing multiple documents.
137
+ *
138
+ * This module provides synchronous and asynchronous functions for extracting content
139
+ * from multiple files or byte arrays in parallel. Batch operations offer better
140
+ * performance and memory management compared to calling single extraction functions
141
+ * in a loop.
142
+ *
143
+ * **Benefits of Batch Processing**:
144
+ * - Parallel processing in Rust for maximum performance
145
+ * - Optimized memory usage across all extractions
146
+ * - More reliable for large-scale document processing
147
+ *
148
+ * @internal This module is part of Layer 2 (extraction APIs).
149
+ */
150
+
193
151
  /**
194
152
  * Extract content from multiple files in parallel (synchronous).
195
153
  *
@@ -222,7 +180,7 @@ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string,
222
180
  * });
223
181
  * ```
224
182
  */
225
- declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
183
+ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig | null): ExtractionResult[];
226
184
  /**
227
185
  * Extract content from multiple files in parallel (asynchronous).
228
186
  *
@@ -258,7 +216,7 @@ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfi
258
216
  * .reduce((a, b) => a + b, 0);
259
217
  * ```
260
218
  */
261
- declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
219
+ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
262
220
  /**
263
221
  * Extract content from multiple byte arrays in parallel (synchronous).
264
222
  *
@@ -296,7 +254,7 @@ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1
296
254
  * });
297
255
  * ```
298
256
  */
299
- declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
257
+ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): ExtractionResult[];
300
258
  /**
301
259
  * Extract content from multiple byte arrays in parallel (asynchronous).
302
260
  *
@@ -338,55 +296,355 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
338
296
  * .reduce((a, b) => a + b, 0);
339
297
  * ```
340
298
  */
341
- declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
299
+ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
300
+
342
301
  /**
343
- * Register a custom postprocessor.
302
+ * Single-document extraction APIs.
344
303
  *
345
- * **IMPORTANT**: Custom processors only work with **async extraction functions**:
346
- * - `extractFile()`, `extractBytes()`, `batchExtractFiles()`, `batchExtractBytes()`
347
- * - `extractFileSync()`, `extractBytesSync()`, etc. (will skip custom processors)
304
+ * This module provides synchronous and asynchronous functions for extracting content
305
+ * from a single file or byte array. These are convenience wrappers around the native
306
+ * binding that handle config normalization and result conversion.
348
307
  *
349
- * This limitation exists because sync extraction blocks the Node.js event loop,
350
- * preventing JavaScript callbacks from executing. For v4.0, use async extraction
351
- * when you need custom processors.
308
+ * **Usage Note**: For processing multiple files, prefer batch extraction functions
309
+ * (`batchExtractFiles`, `batchExtractFilesSync`) which provide better performance
310
+ * and memory management.
352
311
  *
353
- * @param processor - PostProcessorProtocol implementation with name(), process(), and optional processingStage()
354
- * @throws {Error} If processor is missing required methods (name or process)
355
- * @throws {Error} If processor name is empty string
356
- * @throws {Error} If a processor with the same name is already registered
312
+ * @internal This module is part of Layer 2 (extraction APIs).
313
+ */
314
+
315
+ /**
316
+ * Extract content from a single file (synchronous).
357
317
  *
358
- * @example
359
- * ```typescript
360
- * import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
318
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
319
+ * provides better performance and memory management.
361
320
  *
362
- * class MyProcessor implements PostProcessorProtocol {
363
- * name(): string {
364
- * return 'my_processor';
365
- * }
321
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
322
+ * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
323
+ * If a string, treated as MIME type. If an object, treated as ExtractionConfig.
324
+ * If null, MIME type is auto-detected from file extension or content.
325
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
326
+ * Only used if second parameter is a MIME type string.
327
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
328
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
329
+ * @throws {ParsingError} When document format is invalid or corrupted
330
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
331
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
332
+ * @throws {KreuzbergError} For other extraction-related failures
366
333
  *
367
- * process(result: ExtractionResult): ExtractionResult {
368
- * result.metadata.customField = 'custom_value';
369
- * return result;
370
- * }
334
+ * @example
335
+ * ```typescript
336
+ * import { extractFileSync } from '@kreuzberg/node';
371
337
  *
372
- * processingStage(): 'early' | 'middle' | 'late' {
373
- * return 'middle';
374
- * }
375
- * }
338
+ * // Basic usage
339
+ * const result = extractFileSync('document.pdf');
340
+ * console.log(result.content);
376
341
  *
377
- * registerPostProcessor(new MyProcessor());
342
+ * // With explicit MIME type
343
+ * const result2 = extractFileSync('document.pdf', 'application/pdf');
378
344
  *
379
- * // Use async extraction (required for custom processors)
380
- * const result = await extractFile('document.pdf');
381
- * console.log(result.metadata.customField); // 'custom_value'
345
+ * // With configuration
346
+ * const result3 = extractFileSync('document.pdf', {
347
+ * chunking: {
348
+ * maxChars: 1000,
349
+ * maxOverlap: 200,
350
+ * },
351
+ * });
382
352
  * ```
383
353
  */
384
- declare function registerPostProcessor(processor: PostProcessorProtocol): void;
354
+ declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): ExtractionResult;
385
355
  /**
386
- * Unregister a postprocessor by name.
356
+ * Extract content from a single file (asynchronous).
387
357
  *
388
- * Removes a previously registered postprocessor from the registry.
389
- * If the processor doesn't exist, this is a no-op (does not throw).
358
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
359
+ * provides better performance and memory management.
360
+ *
361
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
362
+ * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
363
+ * If a string, treated as MIME type. If an object, treated as ExtractionConfig.
364
+ * If null, MIME type is auto-detected from file extension or content.
365
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
366
+ * Only used if second parameter is a MIME type string.
367
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
368
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
369
+ * @throws {ParsingError} When document format is invalid or corrupted
370
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
371
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
372
+ * @throws {KreuzbergError} For other extraction-related failures
373
+ *
374
+ * @example
375
+ * ```typescript
376
+ * import { extractFile } from '@kreuzberg/node';
377
+ *
378
+ * // Basic usage
379
+ * const result = await extractFile('document.pdf');
380
+ * console.log(result.content);
381
+ *
382
+ * // With chunking enabled
383
+ * const config = {
384
+ * chunking: {
385
+ * maxChars: 1000,
386
+ * maxOverlap: 200,
387
+ * },
388
+ * };
389
+ * const result2 = await extractFile('long_document.pdf', null, config);
390
+ * console.log(result2.chunks); // Array of text chunks
391
+ * ```
392
+ */
393
+ declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
394
+ /**
395
+ * Extract content from raw bytes (synchronous).
396
+ *
397
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
398
+ * which provides better performance and memory management.
399
+ *
400
+ * @param data - File content as Uint8Array (Buffer will be converted)
401
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
402
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
403
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
404
+ * @throws {TypeError} When data is not a valid Uint8Array
405
+ * @throws {Error} When file cannot be read or parsed
406
+ * @throws {ParsingError} When document format is invalid or corrupted
407
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
408
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
409
+ * @throws {KreuzbergError} For other extraction-related failures
410
+ *
411
+ * @example
412
+ * ```typescript
413
+ * import { extractBytesSync } from '@kreuzberg/node';
414
+ * import { readFileSync } from 'fs';
415
+ *
416
+ * const data = readFileSync('document.pdf');
417
+ * const result = extractBytesSync(data, 'application/pdf');
418
+ * console.log(result.content);
419
+ * ```
420
+ */
421
+ declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
422
+ /**
423
+ * Extract content from raw bytes (asynchronous).
424
+ *
425
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
426
+ * which provides better performance and memory management.
427
+ *
428
+ * @param data - File content as Uint8Array (Buffer will be converted)
429
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
430
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
431
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
432
+ * @throws {TypeError} When data is not a valid Uint8Array
433
+ * @throws {Error} When file cannot be read or parsed
434
+ * @throws {ParsingError} When document format is invalid or corrupted
435
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
436
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
437
+ * @throws {KreuzbergError} For other extraction-related failures
438
+ *
439
+ * @example
440
+ * ```typescript
441
+ * import { extractBytes } from '@kreuzberg/node';
442
+ * import { readFile } from 'fs/promises';
443
+ *
444
+ * const data = await readFile('document.pdf');
445
+ * const result = await extractBytes(data, 'application/pdf');
446
+ * console.log(result.content);
447
+ * ```
448
+ */
449
+ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
450
+
451
+ /**
452
+ * Worker pool management for concurrent document extraction.
453
+ *
454
+ * This module provides utilities for creating and managing worker pools that enable
455
+ * concurrent extraction of documents using Node.js worker threads. Worker pools allow
456
+ * multiple extraction operations to run in parallel with configurable pool sizes.
457
+ *
458
+ * **Usage Pattern**:
459
+ * 1. Create a pool with `createWorkerPool(size)`
460
+ * 2. Submit tasks with `extractFileInWorker()` or `batchExtractFilesInWorker()`
461
+ * 3. Close the pool with `closeWorkerPool()` when done
462
+ *
463
+ * @internal This module is part of Layer 2 (extraction APIs).
464
+ */
465
+
466
+ /**
467
+ * Create a new worker pool for concurrent extraction operations.
468
+ *
469
+ * Creates a pool of worker threads that can process extraction tasks concurrently.
470
+ * The pool manages a queue of pending tasks and distributes them across available workers.
471
+ *
472
+ * @param size - Optional number of workers in the pool. If not specified, defaults to the number of CPU cores.
473
+ * @returns WorkerPool instance that can be used with extraction functions
474
+ *
475
+ * @example
476
+ * ```typescript
477
+ * import { createWorkerPool } from '@kreuzberg/node';
478
+ *
479
+ * // Create pool with default size (number of CPU cores)
480
+ * const pool = createWorkerPool();
481
+ *
482
+ * // Create pool with 4 workers
483
+ * const pool4 = createWorkerPool(4);
484
+ * ```
485
+ */
486
+ declare function createWorkerPool(size?: number): WorkerPool;
487
+ /**
488
+ * Get statistics about a worker pool.
489
+ *
490
+ * Returns information about the pool's current state, including the number of active workers,
491
+ * queued tasks, and total processed tasks.
492
+ *
493
+ * @param pool - The worker pool instance
494
+ * @returns WorkerPoolStats with pool information
495
+ *
496
+ * @example
497
+ * ```typescript
498
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
499
+ *
500
+ * const pool = createWorkerPool(4);
501
+ * const stats = getWorkerPoolStats(pool);
502
+ *
503
+ * console.log(`Pool size: ${stats.size}`);
504
+ * console.log(`Active workers: ${stats.activeWorkers}`);
505
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
506
+ * ```
507
+ */
508
+ declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
509
+ /**
510
+ * Extract content from a single file using a worker pool (asynchronous).
511
+ *
512
+ * Submits an extraction task to the worker pool. The task is executed by one of the
513
+ * available workers in the background, allowing other tasks to be processed concurrently.
514
+ *
515
+ * @param pool - The worker pool instance
516
+ * @param filePath - Path to the file to extract
517
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration.
518
+ * If a string, treated as MIME type. If an object, treated as ExtractionConfig.
519
+ * If null, MIME type is auto-detected from file extension or content.
520
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
521
+ * Only used if second parameter is a MIME type string.
522
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
523
+ *
524
+ * @throws {Error} If the file cannot be read or extraction fails
525
+ *
526
+ * @example
527
+ * ```typescript
528
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
529
+ *
530
+ * const pool = createWorkerPool(4);
531
+ *
532
+ * try {
533
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
534
+ * const results = await Promise.all(
535
+ * files.map(f => extractFileInWorker(pool, f))
536
+ * );
537
+ *
538
+ * results.forEach((r, i) => {
539
+ * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
540
+ * });
541
+ * } finally {
542
+ * await closeWorkerPool(pool);
543
+ * }
544
+ * ```
545
+ */
546
+ declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
547
+ /**
548
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
549
+ *
550
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
551
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
552
+ *
553
+ * @param pool - The worker pool instance
554
+ * @param paths - Array of file paths to extract
555
+ * @param config - Extraction configuration object (applies to all files). If null, uses default extraction settings.
556
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
557
+ *
558
+ * @throws {Error} If any file cannot be read or extraction fails
559
+ *
560
+ * @example
561
+ * ```typescript
562
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
563
+ *
564
+ * const pool = createWorkerPool(4);
565
+ *
566
+ * try {
567
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
568
+ * const results = await batchExtractFilesInWorker(pool, files, {
569
+ * ocr: { backend: 'tesseract', language: 'eng' }
570
+ * });
571
+ *
572
+ * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
573
+ * console.log(`Total: $${total}`);
574
+ * } finally {
575
+ * await closeWorkerPool(pool);
576
+ * }
577
+ * ```
578
+ */
579
+ declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
580
+ /**
581
+ * Close a worker pool and shut down all worker threads.
582
+ *
583
+ * Should be called when the pool is no longer needed to clean up resources
584
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
585
+ *
586
+ * @param pool - The worker pool instance to close
587
+ * @returns Promise that resolves when the pool is fully closed
588
+ *
589
+ * @throws {Error} If pool shutdown fails
590
+ *
591
+ * @example
592
+ * ```typescript
593
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
594
+ *
595
+ * const pool = createWorkerPool(4);
596
+ *
597
+ * try {
598
+ * const result = await extractFileInWorker(pool, 'document.pdf');
599
+ * console.log(result.content);
600
+ * } finally {
601
+ * // Clean up the pool
602
+ * await closeWorkerPool(pool);
603
+ * }
604
+ * ```
605
+ */
606
+ declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
607
+
608
+ /**
609
+ * Register a custom post-processor.
610
+ *
611
+ * Post-processors allow you to hook into the extraction pipeline and transform
612
+ * the extraction results. They run after the core extraction is complete.
613
+ *
614
+ * Post-processors are async and can modify extraction results before they are
615
+ * returned to the caller.
616
+ *
617
+ * @param processor - Post-processor implementing PostProcessorProtocol
618
+ *
619
+ * @example
620
+ * ```typescript
621
+ * import { registerPostProcessor, extractFile } from '@kreuzberg/node';
622
+ *
623
+ * class CustomProcessor {
624
+ * name() {
625
+ * return 'custom_processor';
626
+ * }
627
+ * processingStage() {
628
+ * return 'post';
629
+ * }
630
+ * async process(result) {
631
+ * // Add custom metadata
632
+ * result.metadata.customField = 'custom_value';
633
+ * return result;
634
+ * }
635
+ * }
636
+ * registerPostProcessor(new CustomProcessor());
637
+ * // Use async extraction (required for custom processors)
638
+ * const result = await extractFile('document.pdf');
639
+ * console.log(result.metadata.customField); // 'custom_value'
640
+ * ```
641
+ */
642
+ declare function registerPostProcessor(processor: PostProcessorProtocol): void;
643
+ /**
644
+ * Unregister a post-processor by name.
645
+ *
646
+ * Removes a previously registered post-processor from the registry.
647
+ * If the processor doesn't exist, this is a no-op (does not throw).
390
648
  *
391
649
  * @param name - Name of the processor to unregister (case-sensitive)
392
650
  *
@@ -428,6 +686,7 @@ declare function clearPostProcessors(): void;
428
686
  * ```
429
687
  */
430
688
  declare function listPostProcessors(): string[];
689
+
431
690
  /**
432
691
  * Register a custom validator.
433
692
  *
@@ -435,27 +694,26 @@ declare function listPostProcessors(): string[];
435
694
  * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
436
695
  * the extraction fails immediately.
437
696
  *
438
- * @param validator - ValidatorProtocol implementation with name(), validate(), and optional priority()/shouldValidate()
439
- * @throws {Error} If validator is missing required methods (name or validate)
440
- * @throws {Error} If validator name is empty string
441
- * @throws {Error} If a validator with the same name is already registered
697
+ * Validators are async and run after post-processors in the extraction pipeline.
698
+ *
699
+ * @param validator - Validator implementing ValidatorProtocol
442
700
  *
443
701
  * @example
444
702
  * ```typescript
445
- * import { registerValidator } from '@kreuzberg/node';
703
+ * import { registerValidator, extractFile } from '@kreuzberg/node';
446
704
  *
447
- * class MinLengthValidator implements ValidatorProtocol {
448
- * name(): string {
705
+ * class MinLengthValidator {
706
+ * name() {
449
707
  * return 'min_length_validator';
450
708
  * }
451
709
  *
452
- * priority(): number {
453
- * return 100; // Run early
710
+ * priority() {
711
+ * return 100;
454
712
  * }
455
713
  *
456
- * validate(result: ExtractionResult): void {
457
- * if (result.content.length < 100) {
458
- * throw new Error('Content too short: minimum 100 characters required');
714
+ * async validate(result) {
715
+ * if (result.content.length < 10) {
716
+ * throw new Error('Content too short');
459
717
  * }
460
718
  * }
461
719
  * }
@@ -510,20 +768,93 @@ declare function clearValidators(): void;
510
768
  * ```
511
769
  */
512
770
  declare function listValidators(): string[];
513
- declare function registerOcrBackend(backend: OcrBackendProtocol): void;
771
+
514
772
  /**
515
- * List all registered OCR backends.
773
+ * Register a custom OCR backend.
516
774
  *
517
- * Returns an array of names of all currently registered OCR backends,
518
- * including built-in backends like "tesseract".
775
+ * This function registers a JavaScript OCR backend that will be used by Kreuzberg's
776
+ * extraction pipeline when OCR is enabled. The backend must implement the
777
+ * {@link OcrBackendProtocol} interface.
519
778
  *
520
- * @returns Array of OCR backend names (empty array if none registered)
779
+ * ## Usage
780
+ *
781
+ * 1. Create a class implementing {@link OcrBackendProtocol}
782
+ * 2. Call `initialize()` on your backend instance (if needed)
783
+ * 3. Register the backend with `registerOcrBackend()`
784
+ * 4. Use the backend name in extraction config
785
+ *
786
+ * ## Thread Safety
787
+ *
788
+ * The registered backend must be thread-safe as it may be called concurrently
789
+ * from multiple Rust async tasks. Ensure your implementation handles concurrent
790
+ * calls properly.
791
+ *
792
+ * @param backend - OcrBackendProtocol implementation with name(), supportedLanguages(), and processImage()
793
+ * @throws {Error} If backend is missing required methods (name, supportedLanguages, or processImage)
794
+ * @throws {Error} If backend name is empty string or contains invalid characters
795
+ * @throws {Error} If a backend with the same name is already registered
796
+ * @throws {Error} If registration fails due to FFI issues
521
797
  *
522
798
  * @example
523
799
  * ```typescript
524
- * import { listOcrBackends } from '@kreuzberg/node';
800
+ * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
801
+ * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
525
802
  *
526
- * const backends = listOcrBackends();
803
+ * // Create and initialize backend
804
+ * const backend = new GutenOcrBackend();
805
+ * await backend.initialize();
806
+ *
807
+ * // Register with Kreuzberg
808
+ * registerOcrBackend(backend);
809
+ *
810
+ * // Use in extraction
811
+ * const result = await extractFile('scanned.pdf', null, {
812
+ * ocr: { backend: 'guten-ocr', language: 'en' }
813
+ * });
814
+ * console.log(result.content);
815
+ * ```
816
+ *
817
+ * @example
818
+ * ```typescript
819
+ * import { registerOcrBackend } from '@kreuzberg/node';
820
+ *
821
+ * class MyOcrBackend {
822
+ * name() {
823
+ * return 'my-ocr';
824
+ * }
825
+ *
826
+ * supportedLanguages(): string[] {
827
+ * return ['en', 'de', 'fr'];
828
+ * }
829
+ *
830
+ * async processImage(imageBytes: Uint8Array, language: string) {
831
+ * const text = await myCustomOcrEngine(imageBytes, language);
832
+ * return {
833
+ * content: text,
834
+ * mime_type: 'text/plain',
835
+ * metadata: { confidence: 0.95, language },
836
+ * tables: []
837
+ * };
838
+ * }
839
+ * }
840
+ *
841
+ * registerOcrBackend(new MyOcrBackend());
842
+ * ```
843
+ */
844
+ declare function registerOcrBackend(backend: OcrBackendProtocol): void;
845
+ /**
846
+ * List all registered OCR backends.
847
+ *
848
+ * Returns an array of names of all currently registered OCR backends,
849
+ * including built-in backends like "tesseract".
850
+ *
851
+ * @returns Array of OCR backend names (empty array if none registered)
852
+ *
853
+ * @example
854
+ * ```typescript
855
+ * import { listOcrBackends } from '@kreuzberg/node';
856
+ *
857
+ * const backends = listOcrBackends();
527
858
  * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
528
859
  * ```
529
860
  */
@@ -560,6 +891,7 @@ declare function unregisterOcrBackend(name: string): void;
560
891
  * ```
561
892
  */
562
893
  declare function clearOcrBackends(): void;
894
+
563
895
  /**
564
896
  * List all registered document extractors.
565
897
  *
@@ -573,7 +905,7 @@ declare function clearOcrBackends(): void;
573
905
  * import { listDocumentExtractors } from '@kreuzberg/node';
574
906
  *
575
907
  * const extractors = listDocumentExtractors();
576
- * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
908
+ * console.log(extractors); // ['pdf', 'docx', 'xlsx', 'custom-extractor', ...]
577
909
  * ```
578
910
  */
579
911
  declare function listDocumentExtractors(): string[];
@@ -609,87 +941,26 @@ declare function unregisterDocumentExtractor(name: string): void;
609
941
  * ```
610
942
  */
611
943
  declare function clearDocumentExtractors(): void;
944
+
612
945
  /**
613
- * ExtractionConfig namespace with static methods for loading configuration from files.
946
+ * Load extraction configuration from a file.
614
947
  *
615
- * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
616
- * or to discover configuration files in the current directory tree.
948
+ * @param filePath - Path to the configuration file
949
+ * @returns ExtractionConfig object loaded from the file
617
950
  *
618
- * For creating configurations programmatically, use plain TypeScript objects instead:
619
- *
620
- * @example
621
- * ```typescript
622
- * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
951
+ * @deprecated Use ExtractionConfig.fromFile() instead
952
+ */
953
+ declare function loadConfigFile(filePath: string): ExtractionConfig;
954
+ /**
955
+ * Load extraction configuration from a specified path.
623
956
  *
624
- * // Load configuration from file
625
- * const config1 = ExtractionConfig.fromFile('config.toml');
957
+ * @param path - Path to the configuration file or directory
958
+ * @returns ExtractionConfig object or null
626
959
  *
627
- * // Or create with plain object
628
- * const config2 = {
629
- * chunking: { maxChars: 2048 },
630
- * ocr: { backend: 'tesseract', language: 'eng' }
631
- * };
632
- *
633
- * // Use with extraction
634
- * const result = await extractFile('document.pdf', null, config2);
635
- * ```
960
+ * @deprecated Use ExtractionConfig.fromFile() or ExtractionConfig.discover() instead
636
961
  */
637
- declare const ExtractionConfig: {
638
- /**
639
- * Load extraction configuration from a file.
640
- *
641
- * Automatically detects the file format based on extension:
642
- * - `.toml` - TOML format
643
- * - `.yaml` - YAML format
644
- * - `.json` - JSON format
645
- *
646
- * @param filePath - Path to the configuration file (absolute or relative)
647
- * @returns ExtractionConfig object loaded from the file
648
- *
649
- * @throws {Error} If file does not exist or is not accessible
650
- * @throws {Error} If file content is not valid TOML/YAML/JSON
651
- * @throws {Error} If configuration structure is invalid
652
- * @throws {Error} If file extension is not supported
653
- *
654
- * @example
655
- * ```typescript
656
- * import { ExtractionConfig } from '@kreuzberg/node';
657
- *
658
- * // Load from TOML file
659
- * const config1 = ExtractionConfig.fromFile('kreuzberg.toml');
660
- *
661
- * // Load from YAML file
662
- * const config2 = ExtractionConfig.fromFile('./config.yaml');
663
- *
664
- * // Load from JSON file
665
- * const config3 = ExtractionConfig.fromFile('./config.json');
666
- * ```
667
- */
668
- fromFile(filePath: string): ExtractionConfig$1;
669
- /**
670
- * Discover and load configuration from current or parent directories.
671
- *
672
- * Searches for a `kreuzberg.toml` file starting from the current working directory
673
- * and traversing up the directory tree. Returns the first configuration file found.
674
- *
675
- * @returns ExtractionConfig object if found, or null if no configuration file exists
676
- *
677
- * @example
678
- * ```typescript
679
- * import { ExtractionConfig } from '@kreuzberg/node';
680
- *
681
- * // Try to find config in current or parent directories
682
- * const config = ExtractionConfig.discover();
683
- * if (config) {
684
- * console.log('Found configuration');
685
- * // Use config for extraction
686
- * } else {
687
- * console.log('No configuration file found, using defaults');
688
- * }
689
- * ```
690
- */
691
- discover(): ExtractionConfig$1 | null;
692
- };
962
+ declare function loadConfigFromPath(path: string): ExtractionConfig | null;
963
+
693
964
  /**
694
965
  * Detect MIME type from raw bytes.
695
966
  *
@@ -800,6 +1071,7 @@ declare function validateMimeType(mimeType: string): string;
800
1071
  * ```
801
1072
  */
802
1073
  declare function getExtensionsForMime(mimeType: string): string[];
1074
+
803
1075
  /**
804
1076
  * Embedding preset configuration.
805
1077
  *
@@ -820,28 +1092,29 @@ interface EmbeddingPreset {
820
1092
  description: string;
821
1093
  }
822
1094
  /**
823
- * List all available embedding preset names.
1095
+ * List the names of all available embedding presets.
824
1096
  *
825
- * Returns an array of preset names that can be used with `getEmbeddingPreset`.
1097
+ * Returns an array of names of all available embedding model presets.
826
1098
  *
827
- * @returns Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
1099
+ * @returns Array of preset names (e.g., ["fast", "balanced", "quality", "multilingual"])
828
1100
  *
829
1101
  * @example
830
1102
  * ```typescript
831
1103
  * import { listEmbeddingPresets } from '@kreuzberg/node';
832
1104
  *
833
1105
  * const presets = listEmbeddingPresets();
834
- * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
1106
+ * console.log('Available presets:', presets);
835
1107
  * ```
836
1108
  */
837
1109
  declare function listEmbeddingPresets(): string[];
838
1110
  /**
839
- * Get a specific embedding preset by name.
1111
+ * Get embedding preset configuration by name.
840
1112
  *
841
- * Returns a preset configuration object, or null if the preset name is not found.
1113
+ * Retrieves the configuration for a specific embedding model preset.
1114
+ * Returns null if the preset doesn't exist.
842
1115
  *
843
- * @param name - The preset name (case-sensitive)
844
- * @returns An `EmbeddingPreset` object or `null` if not found
1116
+ * @param name - Name of the preset (e.g., "balanced", "fast", "quality")
1117
+ * @returns EmbeddingPreset configuration if found, null otherwise
845
1118
  *
846
1119
  * @example
847
1120
  * ```typescript
@@ -855,278 +1128,78 @@ declare function listEmbeddingPresets(): string[];
855
1128
  * ```
856
1129
  */
857
1130
  declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
1131
+
858
1132
  /**
859
- * Get the error code for the last FFI error.
860
- *
861
- * Returns the FFI error code as an integer. This is useful for programmatic error handling
862
- * and distinguishing between different types of failures in native code.
863
- *
864
- * Error codes:
865
- * - 0: Success (no error)
866
- * - 1: GenericError
867
- * - 2: Panic
868
- * - 3: InvalidArgument
869
- * - 4: IoError
870
- * - 5: ParsingError
871
- * - 6: OcrError
872
- * - 7: MissingDependency
873
- *
874
- * @returns The integer error code
875
- *
876
- * @example
877
- * ```typescript
878
- * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
879
- *
880
- * try {
881
- * const result = await extractFile('document.pdf');
882
- * } catch (error) {
883
- * const code = getLastErrorCode();
884
- * if (code === ErrorCode.Panic) {
885
- * console.error('Native code panic detected');
886
- * }
887
- * }
888
- * ```
889
- */
890
- declare function getLastErrorCode(): number;
891
- /**
892
- * Get panic context information if the last error was a panic.
893
- *
894
- * Returns detailed information about a panic in native code, or null if the last error was not a panic.
895
- * This provides debugging information when native code panics.
896
- *
897
- * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
898
- *
899
- * @example
900
- * ```typescript
901
- * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
902
- *
903
- * try {
904
- * const result = await extractFile('document.pdf');
905
- * } catch (error) {
906
- * const context = getLastPanicContext();
907
- * if (context) {
908
- * console.error(`Panic at ${context.file}:${context.line}`);
909
- * console.error(`In function: ${context.function}`);
910
- * console.error(`Message: ${context.message}`);
911
- * }
912
- * }
913
- * ```
914
- */
915
- declare function getLastPanicContext(): PanicContext | null;
916
- /**
917
- * Returns the human-readable name for an error code.
918
- *
919
- * Maps numeric error codes to their string names, providing a consistent way
920
- * to get error code names across all platforms.
921
- *
922
- * @param code - The numeric error code (0-7)
923
- * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
924
- *
925
- * @example
926
- * ```typescript
927
- * import { getErrorCodeName } from '@kreuzberg/node';
928
- *
929
- * const name = getErrorCodeName(0); // returns "validation"
930
- * const name = getErrorCodeName(2); // returns "ocr"
931
- * const name = getErrorCodeName(99); // returns "unknown"
932
- * ```
933
- */
934
- declare function getErrorCodeName(code: number): string;
935
- /**
936
- * Returns the description for an error code.
937
- *
938
- * Retrieves user-friendly descriptions of error types from the FFI layer.
939
- *
940
- * @param code - The numeric error code (0-7)
941
- * @returns A brief description of the error type
942
- *
943
- * @example
944
- * ```typescript
945
- * import { getErrorCodeDescription } from '@kreuzberg/node';
946
- *
947
- * const desc = getErrorCodeDescription(0); // returns "Input validation error"
948
- * const desc = getErrorCodeDescription(4); // returns "File system I/O error"
949
- * const desc = getErrorCodeDescription(99); // returns "Unknown error code"
950
- * ```
951
- */
952
- declare function getErrorCodeDescription(code: number): string;
953
- /**
954
- * Classifies an error message string into an error code category.
955
- *
956
- * This function analyzes the error message content and returns the most likely
957
- * error code (0-7) based on keyword patterns. Used to programmatically classify
958
- * errors for handling purposes.
959
- *
960
- * The classification is based on keyword matching:
961
- * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
962
- * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
963
- * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
964
- * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
965
- * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
966
- * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
967
- * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
968
- * - **Internal (7)**: Keywords like "internal", "bug", "panic"
969
- *
970
- * @param errorMessage - The error message string to classify
971
- * @returns An object with the classification details
972
- *
973
- * @example
974
- * ```typescript
975
- * import { classifyError } from '@kreuzberg/node';
976
- *
977
- * const result = classifyError("PDF file is corrupted");
978
- * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
979
- *
980
- * const result = classifyError("Tesseract not found");
981
- * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
982
- * ```
983
- */
984
- declare function classifyError(errorMessage: string): ErrorClassification;
985
- /**
986
- * Create a worker pool for concurrent file extraction.
987
- *
988
- * The worker pool manages a set of background worker threads that can process
989
- * extraction requests concurrently, improving throughput when handling multiple files.
990
- *
991
- * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
992
- * @returns A WorkerPool instance to use with extraction functions
993
- *
994
- * @throws {Error} If size is invalid or pool creation fails
995
- *
996
- * @example
997
- * ```typescript
998
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
999
- *
1000
- * // Create pool with 4 workers
1001
- * const pool = createWorkerPool(4);
1002
- *
1003
- * try {
1004
- * const result = await extractFileInWorker(pool, 'document.pdf');
1005
- * console.log(result.content);
1006
- * } finally {
1007
- * // Always close the pool when done
1008
- * await closeWorkerPool(pool);
1009
- * }
1010
- * ```
1133
+ * @internal Allows tests to provide a mocked native binding.
1011
1134
  */
1012
- declare function createWorkerPool(size?: number): WorkerPool;
1135
+ declare function __setBindingForTests(mock: unknown): void;
1013
1136
  /**
1014
- * Get statistics about a worker pool.
1015
- *
1016
- * Returns information about the pool's current state, including the number of active workers,
1017
- * queued tasks, and total processed tasks.
1018
- *
1019
- * @param pool - The worker pool instance
1020
- * @returns WorkerPoolStats with pool information
1021
- *
1022
- * @example
1023
- * ```typescript
1024
- * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1025
- *
1026
- * const pool = createWorkerPool(4);
1027
- * const stats = getWorkerPoolStats(pool);
1028
- *
1029
- * console.log(`Pool size: ${stats.size}`);
1030
- * console.log(`Active workers: ${stats.activeWorkers}`);
1031
- * console.log(`Queued tasks: ${stats.queuedTasks}`);
1032
- * ```
1137
+ * @internal Resets the cached native binding for tests.
1033
1138
  */
1034
- declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
1139
+ declare function __resetBindingForTests(): void;
1140
+
1035
1141
  /**
1036
- * Extract content from a single file using a worker pool (asynchronous).
1037
- *
1038
- * Submits an extraction task to the worker pool. The task is executed by one of the
1039
- * available workers in the background, allowing other tasks to be processed concurrently.
1142
+ * Kreuzberg - Multi-language document intelligence framework.
1040
1143
  *
1041
- * @param pool - The worker pool instance
1042
- * @param filePath - Path to the file to extract
1043
- * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
1044
- * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
1045
- * @returns Promise<ExtractionResult> containing extracted content and metadata
1144
+ * This is a TypeScript SDK around a high-performance Rust core.
1145
+ * All extraction logic, chunking, quality processing, and language detection
1146
+ * are implemented in Rust for maximum performance.
1046
1147
  *
1047
- * @throws {Error} If the file cannot be read or extraction fails
1148
+ * ## Module Organization
1048
1149
  *
1049
- * @example
1050
- * ```typescript
1051
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1150
+ * The SDK is organized into logical domains:
1151
+ * - **Extraction**: Single and batch document extraction with worker pool support
1152
+ * - **Types**: Core type definitions and interfaces
1153
+ * - **Errors**: Error classes and diagnostic utilities
1154
+ * - **Plugins**: Custom post-processors, validators, and OCR backends
1155
+ * - **Registry**: Plugin and document extractor management
1156
+ * - **Config**: Configuration loading and management
1157
+ * - **MIME**: MIME type detection and validation
1158
+ * - **Embeddings**: Embedding model presets
1052
1159
  *
1053
- * const pool = createWorkerPool(4);
1160
+ * ## API Usage Recommendations
1054
1161
  *
1055
- * try {
1056
- * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
1057
- * const results = await Promise.all(
1058
- * files.map(f => extractFileInWorker(pool, f))
1059
- * );
1162
+ * **For processing multiple documents**, prefer batch APIs:
1163
+ * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
1164
+ * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
1165
+ * - Use worker pool APIs for high-concurrency scenarios
1060
1166
  *
1061
- * results.forEach((r, i) => {
1062
- * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
1063
- * });
1064
- * } finally {
1065
- * await closeWorkerPool(pool);
1066
- * }
1067
- * ```
1068
- */
1069
- declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
1070
- /**
1071
- * Extract content from multiple files in parallel using a worker pool (asynchronous).
1167
+ * **Batch APIs provide**:
1168
+ * - Better performance (parallel processing in Rust)
1169
+ * - More reliable memory management
1170
+ * - Recommended for all multi-document workflows
1072
1171
  *
1073
- * Submits multiple extraction tasks to the worker pool for concurrent processing.
1074
- * This is more efficient than using `extractFileInWorker` multiple times sequentially.
1172
+ * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
1173
+ * - One-off document processing
1174
+ * - Interactive applications processing documents on-demand
1175
+ * - Note: avoid calling these in tight loops - use batch APIs instead
1075
1176
  *
1076
- * @param pool - The worker pool instance
1077
- * @param paths - Array of file paths to extract
1078
- * @param config - Extraction configuration object (applies to all files)
1079
- * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
1177
+ * ## Supported Formats
1080
1178
  *
1081
- * @throws {Error} If any file cannot be read or extraction fails
1179
+ * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
1180
+ * - **Text**: Markdown, Plain Text, XML
1181
+ * - **Web**: HTML (converted to Markdown)
1182
+ * - **Data**: JSON, YAML, TOML
1183
+ * - **Email**: EML, MSG
1184
+ * - **Images**: PNG, JPEG, TIFF (with OCR support)
1082
1185
  *
1083
1186
  * @example
1084
1187
  * ```typescript
1085
- * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
1086
- *
1087
- * const pool = createWorkerPool(4);
1188
+ * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
1088
1189
  *
1089
- * try {
1090
- * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
1091
- * const results = await batchExtractFilesInWorker(pool, files, {
1092
- * ocr: { backend: 'tesseract', language: 'eng' }
1093
- * });
1190
+ * // Single file extraction
1191
+ * const result = await extractFile('document.pdf');
1192
+ * console.log(result.content);
1094
1193
  *
1095
- * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
1096
- * console.log(`Total: $${total}`);
1097
- * } finally {
1098
- * await closeWorkerPool(pool);
1099
- * }
1194
+ * // Multiple files (recommended approach)
1195
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
1196
+ * const results = await batchExtractFiles(files);
1197
+ * results.forEach(r => console.log(r.content));
1100
1198
  * ```
1101
- */
1102
- declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
1103
- /**
1104
- * Close a worker pool and shut down all worker threads.
1105
- *
1106
- * Should be called when the pool is no longer needed to clean up resources
1107
- * and gracefully shut down worker threads. Any pending tasks will be cancelled.
1108
- *
1109
- * @param pool - The worker pool instance to close
1110
- * @returns Promise that resolves when the pool is fully closed
1111
- *
1112
- * @throws {Error} If pool shutdown fails
1113
- *
1114
- * @example
1115
- * ```typescript
1116
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1117
- *
1118
- * const pool = createWorkerPool(4);
1119
1199
  *
1120
- * try {
1121
- * const result = await extractFileInWorker(pool, 'document.pdf');
1122
- * console.log(result.content);
1123
- * } finally {
1124
- * // Clean up the pool
1125
- * await closeWorkerPool(pool);
1126
- * }
1127
- * ```
1200
+ * @module @kreuzberg/node
1128
1201
  */
1129
- declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
1130
- declare const __version__ = "4.0.8";
1131
1202
 
1132
- export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1203
+ declare const __version__ = "4.1.1";
1204
+
1205
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };