@kreuzberg/node 4.0.0-rc.8 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { PanicContext } from './errors.js';
2
2
  export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
3
- import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
4
- export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
4
+ export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
5
5
  export { GutenOcrBackend } from './ocr/guten-ocr.js';
6
6
 
7
7
  /**
@@ -65,10 +65,15 @@ declare function __resetBindingForTests(): void;
65
65
  * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
66
66
  * provides better performance and memory management.
67
67
  *
68
- * @param filePath - Path to the file (string)
69
- * @param mimeType - Optional MIME type hint (auto-detected if null)
70
- * @param config - Extraction configuration (uses defaults if null)
71
- * @returns ExtractionResult with content, metadata, and tables
68
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
69
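+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection, or an extraction configuration object passed in its place. If null, MIME type is auto-detected from file extension or content.
70
+ * @param maybeConfig - Extraction configuration object (used when the second argument is a MIME type). If null, uses default extraction settings.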
71
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
72
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
73
+ * @throws {ParsingError} When document format is invalid or corrupted
74
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
75
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
76
+ * @throws {KreuzbergError} For other extraction-related failures
72
77
  *
73
78
  * @example
74
79
  * ```typescript
@@ -92,17 +97,22 @@ declare function __resetBindingForTests(): void;
92
97
  * const result2 = extractFileSync('scanned.pdf', null, config);
93
98
  * ```
94
99
  */
95
- declare function extractFileSync(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): ExtractionResult;
100
+ declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
96
101
  /**
97
102
  * Extract content from a single file (asynchronous).
98
103
  *
99
104
  * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
100
105
  * provides better performance and memory management.
101
106
  *
102
- * @param filePath - Path to the file (string)
103
- * @param mimeType - Optional MIME type hint (auto-detected if null)
104
- * @param config - Extraction configuration (uses defaults if null)
105
- * @returns Promise<ExtractionResult> with content, metadata, and tables
107
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
108
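+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection, or an extraction configuration object passed in its place. If null, MIME type is auto-detected from file extension or content.
109
+ * @param maybeConfig - Extraction configuration object (used when the second argument is a MIME type). If null, uses default extraction settings.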
110
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
111
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
112
+ * @throws {ParsingError} When document format is invalid or corrupted
113
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
114
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
115
+ * @throws {KreuzbergError} For other extraction-related failures
106
116
  *
107
117
  * @example
108
118
  * ```typescript
@@ -123,17 +133,23 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
123
133
  * console.log(result2.chunks); // Array of text chunks
124
134
  * ```
125
135
  */
126
- declare function extractFile(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
136
+ declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
127
137
  /**
128
138
  * Extract content from raw bytes (synchronous).
129
139
  *
130
140
  * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
131
141
  * which provides better performance and memory management.
132
142
  *
133
- * @param data - File content as Uint8Array
134
- * @param mimeType - MIME type of the data (required for format detection)
135
- * @param config - Extraction configuration (uses defaults if null)
136
- * @returns ExtractionResult with content, metadata, and tables
143
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The declared type also accepts a string, which the parameter name indicates is a file path.
144
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
145
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
146
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
147
+ * @throws {TypeError} When data is not a valid Uint8Array
148
+ * @throws {Error} When file cannot be read or parsed
149
+ * @throws {ParsingError} When document format is invalid or corrupted
150
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
151
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
152
+ * @throws {KreuzbergError} For other extraction-related failures
137
153
  *
138
154
  * @example
139
155
  * ```typescript
@@ -145,17 +161,23 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
145
161
  * console.log(result.content);
146
162
  * ```
147
163
  */
148
- declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
164
+ declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
149
165
  /**
150
166
  * Extract content from raw bytes (asynchronous).
151
167
  *
152
168
  * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
153
169
  * which provides better performance and memory management.
154
170
  *
155
- * @param data - File content as Uint8Array
156
- * @param mimeType - MIME type of the data (required for format detection)
157
- * @param config - Extraction configuration (uses defaults if null)
158
- * @returns Promise<ExtractionResult> with content, metadata, and tables
171
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The declared type also accepts a string, which the parameter name indicates is a file path.
172
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
173
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
174
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
175
+ * @throws {TypeError} When data is not a valid Uint8Array
176
+ * @throws {Error} When file cannot be read or parsed
177
+ * @throws {ParsingError} When document format is invalid or corrupted
178
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
179
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
180
+ * @throws {KreuzbergError} For other extraction-related failures
159
181
  *
160
182
  * @example
161
183
  * ```typescript
@@ -167,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
167
189
  * console.log(result.content);
168
190
  * ```
169
191
  */
170
- declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
192
+ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
171
193
  /**
172
194
  * Extract content from multiple files in parallel (synchronous).
173
195
  *
@@ -179,9 +201,14 @@ declare function extractBytes(data: Uint8Array, mimeType: string, config?: Extra
179
201
  * - Optimized memory usage across all extractions
180
202
  * - More reliable for batch document processing
181
203
  *
182
- * @param paths - List of file paths to extract
183
- * @param config - Extraction configuration (uses defaults if null)
204
+ * @param paths - List of file paths to extract (absolute or relative paths)
205
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
184
206
  * @returns Array of ExtractionResults (one per file, in same order as input)
207
+ * @throws {Error} If any file cannot be read or parsed
208
+ * @throws {ParsingError} When any document format is invalid or corrupted
209
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
210
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
211
+ * @throws {KreuzbergError} For other extraction-related failures
185
212
  *
186
213
  * @example
187
214
  * ```typescript
@@ -207,9 +234,14 @@ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfi
207
234
  * - Optimized memory usage across all extractions
208
235
  * - More reliable for batch document processing
209
236
  *
210
- * @param paths - List of file paths to extract
211
- * @param config - Extraction configuration (uses defaults if null)
237
+ * @param paths - List of file paths to extract (absolute or relative paths)
238
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
212
239
  * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input)
240
+ * @throws {Error} If any file cannot be read or parsed
241
+ * @throws {ParsingError} When any document format is invalid or corrupted
242
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
243
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
244
+ * @throws {KreuzbergError} For other extraction-related failures
213
245
  *
214
246
  * @example
215
247
  * ```typescript
@@ -238,10 +270,16 @@ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1
238
270
  * - Optimized memory usage across all extractions
239
271
  * - More reliable for batch document processing
240
272
  *
241
- * @param dataList - List of file contents as Uint8Arrays
242
- * @param mimeTypes - List of MIME types (one per data item, required for format detection)
243
- * @param config - Extraction configuration (uses defaults if null)
273
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
274
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
275
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
244
276
  * @returns Array of ExtractionResults (one per data item, in same order as input)
277
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
278
+ * @throws {Error} If any data cannot be read or parsed
279
+ * @throws {ParsingError} When any document format is invalid or corrupted
280
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
281
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
282
+ * @throws {KreuzbergError} For other extraction-related failures
245
283
  *
246
284
  * @example
247
285
  * ```typescript
@@ -270,10 +308,16 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
270
308
  * - Optimized memory usage across all extractions
271
309
  * - More reliable for batch document processing
272
310
  *
273
- * @param dataList - List of file contents as Uint8Arrays
274
- * @param mimeTypes - List of MIME types (one per data item, required for format detection)
275
- * @param config - Extraction configuration (uses defaults if null)
311
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
312
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
313
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
276
314
  * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input)
315
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
316
+ * @throws {Error} If any data cannot be read or parsed
317
+ * @throws {ParsingError} When any document format is invalid or corrupted
318
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
319
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
320
+ * @throws {KreuzbergError} For other extraction-related failures
277
321
  *
278
322
  * @example
279
323
  * ```typescript
@@ -306,7 +350,10 @@ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[],
306
350
  * preventing JavaScript callbacks from executing. For v4.0, use async extraction
307
351
  * when you need custom processors.
308
352
  *
309
- * @param processor - PostProcessorProtocol implementation
353
+ * @param processor - PostProcessorProtocol implementation with name(), process(), and optional processingStage()
354
+ * @throws {Error} If processor is missing required methods (name or process)
355
+ * @throws {Error} If processor name is empty string
356
+ * @throws {Error} If a processor with the same name is already registered
310
357
  *
311
358
  * @example
312
359
  * ```typescript
@@ -339,8 +386,9 @@ declare function registerPostProcessor(processor: PostProcessorProtocol): void;
339
386
  * Unregister a postprocessor by name.
340
387
  *
341
388
  * Removes a previously registered postprocessor from the registry.
389
+ * If the processor doesn't exist, this is a no-op (does not throw).
342
390
  *
343
- * @param name - Name of the processor to unregister
391
+ * @param name - Name of the processor to unregister (case-sensitive)
344
392
  *
345
393
  * @example
346
394
  * ```typescript
@@ -353,7 +401,8 @@ declare function unregisterPostProcessor(name: string): void;
353
401
  /**
354
402
  * Clear all registered postprocessors.
355
403
  *
356
- * Removes all postprocessors from the registry.
404
+ * Removes all postprocessors from the registry. Useful for test cleanup or resetting state.
405
+ * If no postprocessors are registered, this is a no-op.
357
406
  *
358
407
  * @example
359
408
  * ```typescript
@@ -366,9 +415,9 @@ declare function clearPostProcessors(): void;
366
415
  /**
367
416
  * List all registered post-processors.
368
417
  *
369
- * Returns the names of all currently registered post-processors.
418
+ * Returns the names of all currently registered post-processors (both built-in and custom).
370
419
  *
371
- * @returns Array of post-processor names
420
+ * @returns Array of post-processor names (empty array if none registered)
372
421
  *
373
422
  * @example
374
423
  * ```typescript
@@ -386,7 +435,10 @@ declare function listPostProcessors(): string[];
386
435
  * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
387
436
  * the extraction fails immediately.
388
437
  *
389
- * @param validator - ValidatorProtocol implementation
438
+ * @param validator - ValidatorProtocol implementation with name(), validate(), and optional priority()/shouldValidate()
439
+ * @throws {Error} If validator is missing required methods (name or validate)
440
+ * @throws {Error} If validator name is empty string
441
+ * @throws {Error} If a validator with the same name is already registered
390
442
  *
391
443
  * @example
392
444
  * ```typescript
@@ -416,8 +468,9 @@ declare function registerValidator(validator: ValidatorProtocol): void;
416
468
  * Unregister a validator by name.
417
469
  *
418
470
  * Removes a previously registered validator from the global registry.
471
+ * If the validator doesn't exist, this is a no-op (does not throw).
419
472
  *
420
- * @param name - Validator name to unregister
473
+ * @param name - Validator name to unregister (case-sensitive)
421
474
  *
422
475
  * @example
423
476
  * ```typescript
@@ -444,9 +497,9 @@ declare function clearValidators(): void;
444
497
  /**
445
498
  * List all registered validators.
446
499
  *
447
- * Returns the names of all currently registered validators.
500
+ * Returns the names of all currently registered validators (both built-in and custom).
448
501
  *
449
- * @returns Array of validator names
502
+ * @returns Array of validator names (empty array if none registered)
450
503
  *
451
504
  * @example
452
505
  * ```typescript
@@ -464,7 +517,7 @@ declare function registerOcrBackend(backend: OcrBackendProtocol): void;
464
517
  * Returns an array of names of all currently registered OCR backends,
465
518
  * including built-in backends like "tesseract".
466
519
  *
467
- * @returns Array of OCR backend names
520
+ * @returns Array of OCR backend names (empty array if none registered)
468
521
  *
469
522
  * @example
470
523
  * ```typescript
@@ -497,7 +550,7 @@ declare function unregisterOcrBackend(name: string): void;
497
550
  *
498
551
  * Removes all OCR backends from the registry, including built-in backends.
499
552
  * Use with caution as this will make OCR functionality unavailable until
500
- * backends are re-registered.
553
+ * backends are re-registered. If no backends are registered, this is a no-op.
501
554
  *
502
555
  * @example
503
556
  * ```typescript
@@ -513,7 +566,7 @@ declare function clearOcrBackends(): void;
513
566
  * Returns an array of names of all currently registered document extractors,
514
567
  * including built-in extractors for PDF, Office documents, images, etc.
515
568
  *
516
- * @returns Array of document extractor names
569
+ * @returns Array of document extractor names (empty array if none registered)
517
570
  *
518
571
  * @example
519
572
  * ```typescript
@@ -559,18 +612,26 @@ declare function clearDocumentExtractors(): void;
559
612
  /**
560
613
  * ExtractionConfig namespace with static methods for loading configuration from files.
561
614
  *
562
- * Provides a factory method to load extraction configuration from TOML, YAML, or JSON files.
563
- * The file format is automatically detected based on the file extension.
615
+ * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
616
+ * or to discover configuration files in the current directory tree.
617
+ *
618
+ * For creating configurations programmatically, use plain TypeScript objects instead:
564
619
  *
565
620
  * @example
566
621
  * ```typescript
567
622
  * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
568
623
  *
569
624
  * // Load configuration from file
570
- * const config = ExtractionConfig.fromFile('config.toml');
625
+ * const config1 = ExtractionConfig.fromFile('config.toml');
626
+ *
627
+ * // Or create with plain object
628
+ * const config2 = {
629
+ * chunking: { maxChars: 2048 },
630
+ * ocr: { backend: 'tesseract', language: 'eng' }
631
+ * };
571
632
  *
572
633
  * // Use with extraction
573
- * const result = await extractFile('document.pdf', null, config);
634
+ * const result = await extractFile('document.pdf', null, config2);
574
635
  * ```
575
636
  */
576
637
  declare const ExtractionConfig: {
@@ -658,30 +719,30 @@ declare function detectMimeType(bytes: Buffer): string;
658
719
  /**
659
720
  * Detect MIME type from a file path.
660
721
  *
661
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
662
- * if extension-based detection fails.
722
+ * Determines the MIME type based on the file extension in the provided path.
723
+ * By default, checks whether the file exists; this can be disabled with the `checkExists` parameter.
663
724
  *
664
- * @param path - Path to the file (string)
665
- * @param checkExists - Whether to verify file existence (default: true)
666
- * @returns The detected MIME type string
725
+ * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
726
+ * @param checkExists - Whether to verify the file exists (default: true)
727
+ * @returns The detected MIME type as a string (e.g., 'application/pdf')
667
728
  *
668
- * @throws {Error} If file doesn't exist (when checkExists is true)
669
- * @throws {Error} If MIME type cannot be determined from path/extension
670
- * @throws {Error} If extension is unknown
729
+ * @throws {Error} If MIME type cannot be determined from the file extension,
730
+ * or if checkExists is true and the file does not exist
671
731
  *
672
732
  * @example
673
733
  * ```typescript
674
734
  * import { detectMimeTypeFromPath } from '@kreuzberg/node';
675
735
  *
676
- * // Detect from existing file
677
- * const mimeType = detectMimeTypeFromPath('document.pdf');
736
+ * // Detect MIME type from existing file
737
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
678
738
  * console.log(mimeType); // 'application/pdf'
679
739
  *
680
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
740
+ * // Detect without checking file existence
741
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
681
742
  * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
682
743
  * ```
683
744
  */
684
- declare function detectMimeTypeFromPath(path: string, checkExists?: boolean): string;
745
+ declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
685
746
  /**
686
747
  * Validate that a MIME type is supported by Kreuzberg.
687
748
  *
@@ -852,6 +913,220 @@ declare function getLastErrorCode(): number;
852
913
  * ```
853
914
  */
854
915
  declare function getLastPanicContext(): PanicContext | null;
855
- declare const __version__ = "4.0.0-rc.8";
916
+ /**
917
+ * Returns the human-readable name for an error code.
918
+ *
919
+ * Maps numeric error codes to their string names, providing a consistent way
920
+ * to get error code names across all platforms.
921
+ *
922
+ * @param code - The numeric error code (0-7)
923
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
924
+ *
925
+ * @example
926
+ * ```typescript
927
+ * import { getErrorCodeName } from '@kreuzberg/node';
928
+ *
929
+ * const validationName = getErrorCodeName(0); // returns "validation"
930
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
931
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
932
+ * ```
933
+ */
934
+ declare function getErrorCodeName(code: number): string;
935
+ /**
936
+ * Returns the description for an error code.
937
+ *
938
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
939
+ *
940
+ * @param code - The numeric error code (0-7)
941
+ * @returns A brief description of the error type
942
+ *
943
+ * @example
944
+ * ```typescript
945
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
946
+ *
947
+ * const validationDesc = getErrorCodeDescription(0); // returns "Input validation error"
948
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
949
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
950
+ * ```
951
+ */
952
+ declare function getErrorCodeDescription(code: number): string;
953
+ /**
954
+ * Classifies an error message string into an error code category.
955
+ *
956
+ * This function analyzes the error message content and returns the most likely
957
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
958
+ * errors for handling purposes.
959
+ *
960
+ * The classification is based on keyword matching:
961
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
962
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
963
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
964
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
965
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
966
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
967
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
968
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
969
+ *
970
+ * @param errorMessage - The error message string to classify
971
+ * @returns An object with the classification details
972
+ *
973
+ * @example
974
+ * ```typescript
975
+ * import { classifyError } from '@kreuzberg/node';
976
+ *
977
+ * const result = classifyError("PDF file is corrupted");
978
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
979
+ *
980
+ * const result2 = classifyError("Tesseract not found");
981
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
982
+ * ```
983
+ */
984
+ declare function classifyError(errorMessage: string): ErrorClassification;
985
+ /**
986
+ * Create a worker pool for concurrent file extraction.
987
+ *
988
+ * The worker pool manages a set of background worker threads that can process
989
+ * extraction requests concurrently, improving throughput when handling multiple files.
990
+ *
991
+ * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
992
+ * @returns A WorkerPool instance to use with extraction functions
993
+ *
994
+ * @throws {Error} If size is invalid or pool creation fails
995
+ *
996
+ * @example
997
+ * ```typescript
998
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
999
+ *
1000
+ * // Create pool with 4 workers
1001
+ * const pool = createWorkerPool(4);
1002
+ *
1003
+ * try {
1004
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1005
+ * console.log(result.content);
1006
+ * } finally {
1007
+ * // Always close the pool when done
1008
+ * await closeWorkerPool(pool);
1009
+ * }
1010
+ * ```
1011
+ */
1012
+ declare function createWorkerPool(size?: number): WorkerPool;
1013
+ /**
1014
+ * Get statistics about a worker pool.
1015
+ *
1016
+ * Returns information about the pool's current state, including the number of active workers,
1017
+ * queued tasks, and total processed tasks.
1018
+ *
1019
+ * @param pool - The worker pool instance
1020
+ * @returns WorkerPoolStats with pool information
1021
+ *
1022
+ * @example
1023
+ * ```typescript
1024
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1025
+ *
1026
+ * const pool = createWorkerPool(4);
1027
+ * const stats = getWorkerPoolStats(pool);
1028
+ *
1029
+ * console.log(`Pool size: ${stats.size}`);
1030
+ * console.log(`Active workers: ${stats.activeWorkers}`);
1031
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
1032
+ * ```
1033
+ */
1034
+ declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
1035
+ /**
1036
+ * Extract content from a single file using a worker pool (asynchronous).
1037
+ *
1038
+ * Submits an extraction task to the worker pool. The task is executed by one of the
1039
+ * available workers in the background, allowing other tasks to be processed concurrently.
1040
+ *
1041
+ * @param pool - The worker pool instance
1042
+ * @param filePath - Path to the file to extract
1043
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
1044
+ * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
1045
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
1046
+ *
1047
+ * @throws {Error} If the file cannot be read or extraction fails
1048
+ *
1049
+ * @example
1050
+ * ```typescript
1051
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1052
+ *
1053
+ * const pool = createWorkerPool(4);
1054
+ *
1055
+ * try {
1056
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
1057
+ * const results = await Promise.all(
1058
+ * files.map(f => extractFileInWorker(pool, f))
1059
+ * );
1060
+ *
1061
+ * results.forEach((r, i) => {
1062
+ * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
1063
+ * });
1064
+ * } finally {
1065
+ * await closeWorkerPool(pool);
1066
+ * }
1067
+ * ```
1068
+ */
1069
+ declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
1070
+ /**
1071
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
1072
+ *
1073
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
1074
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
1075
+ *
1076
+ * @param pool - The worker pool instance
1077
+ * @param paths - Array of file paths to extract
1078
+ * @param config - Extraction configuration object (applies to all files)
1079
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
1080
+ *
1081
+ * @throws {Error} If any file cannot be read or extraction fails
1082
+ *
1083
+ * @example
1084
+ * ```typescript
1085
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
1086
+ *
1087
+ * const pool = createWorkerPool(4);
1088
+ *
1089
+ * try {
1090
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
1091
+ * const results = await batchExtractFilesInWorker(pool, files, {
1092
+ * ocr: { backend: 'tesseract', language: 'eng' }
1093
+ * });
1094
+ *
1095
+ * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
1096
+ * console.log(`Total: $${total}`);
1097
+ * } finally {
1098
+ * await closeWorkerPool(pool);
1099
+ * }
1100
+ * ```
1101
+ */
1102
+ declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
1103
+ /**
1104
+ * Close a worker pool and shut down all worker threads.
1105
+ *
1106
+ * Should be called when the pool is no longer needed to clean up resources
1107
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
1108
+ *
1109
+ * @param pool - The worker pool instance to close
1110
+ * @returns Promise that resolves when the pool is fully closed
1111
+ *
1112
+ * @throws {Error} If pool shutdown fails
1113
+ *
1114
+ * @example
1115
+ * ```typescript
1116
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1117
+ *
1118
+ * const pool = createWorkerPool(4);
1119
+ *
1120
+ * try {
1121
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1122
+ * console.log(result.content);
1123
+ * } finally {
1124
+ * // Clean up the pool
1125
+ * await closeWorkerPool(pool);
1126
+ * }
1127
+ * ```
1128
+ */
1129
+ declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
1130
+ declare const __version__ = "4.0.0";
856
1131
 
857
- export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1132
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };