@kreuzberg/node 4.0.0-rc.6 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1132 @@
1
+ import { PanicContext } from './errors.js';
2
+ export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
4
+ export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
5
+ export { GutenOcrBackend } from './ocr/guten-ocr.js';
6
+
7
+ /**
8
+ * Kreuzberg - Multi-language document intelligence framework.
9
+ *
10
+ * This is a TypeScript SDK around a high-performance Rust core.
11
+ * All extraction logic, chunking, quality processing, and language detection
12
+ * are implemented in Rust for maximum performance.
13
+ *
14
+ * ## API Usage Recommendations
15
+ *
16
+ * **For processing multiple documents**, prefer batch APIs:
17
+ * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
18
+ * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
19
+ *
20
+ * **Batch APIs provide**:
21
+ * - Better performance (parallel processing in Rust)
22
+ * - More reliable memory management
23
+ * - Recommended for all multi-document workflows
24
+ *
25
+ * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
26
+ * - One-off document processing
27
+ * - Interactive applications processing documents on-demand
28
+ * - Not suitable for tight loops over many documents - use the batch APIs instead
29
+ *
30
+ * ## Supported Formats
31
+ *
32
+ * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
33
+ * - **Text**: Markdown, Plain Text, XML
34
+ * - **Web**: HTML (converted to Markdown)
35
+ * - **Data**: JSON, YAML, TOML
36
+ * - **Email**: EML, MSG
37
+ * - **Images**: PNG, JPEG, TIFF (with OCR support)
38
+ *
39
+ * @example
40
+ * ```typescript
41
+ * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
42
+ *
43
+ * // Single file extraction
44
+ * const result = await extractFile('document.pdf');
45
+ * console.log(result.content);
46
+ *
47
+ * // Multiple files (recommended approach)
48
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
49
+ * const results = await batchExtractFiles(files);
50
+ * results.forEach(r => console.log(r.content));
51
+ * ```
52
+ */
53
+
54
+ /**
55
+ * @internal Allows tests to provide a mocked native binding.
56
+ */
57
+ declare function __setBindingForTests(mock: unknown): void;
58
+ /**
59
+ * @internal Resets the cached native binding for tests.
60
+ */
61
+ declare function __resetBindingForTests(): void;
62
+ /**
63
+ * Extract content from a single file (synchronous).
64
+ *
65
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
66
+ * provides better performance and memory management.
67
+ *
68
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
69
+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content. An extraction configuration object may be passed here instead of a MIME type.
69
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
71
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
72
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
73
+ * @throws {ParsingError} When document format is invalid or corrupted
74
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
75
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
76
+ * @throws {KreuzbergError} For other extraction-related failures
77
+ *
78
+ * @example
79
+ * ```typescript
80
+ * import { extractFileSync } from '@kreuzberg/node';
81
+ *
82
+ * // Basic usage
83
+ * const result = extractFileSync('document.pdf');
84
+ * console.log(result.content);
85
+ *
86
+ * // With OCR configuration
87
+ * const config = {
88
+ * ocr: {
89
+ * backend: 'tesseract',
90
+ * language: 'eng',
91
+ * tesseractConfig: {
92
+ * psm: 6,
93
+ * enableTableDetection: true,
94
+ * },
95
+ * },
96
+ * };
97
+ * const result2 = extractFileSync('scanned.pdf', null, config);
98
+ * ```
99
+ */
100
+ declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
101
+ /**
102
+ * Extract content from a single file (asynchronous).
103
+ *
104
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
105
+ * provides better performance and memory management.
106
+ *
107
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
108
+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content. An extraction configuration object may be passed here instead of a MIME type.
108
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
110
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
111
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
112
+ * @throws {ParsingError} When document format is invalid or corrupted
113
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
114
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
115
+ * @throws {KreuzbergError} For other extraction-related failures
116
+ *
117
+ * @example
118
+ * ```typescript
119
+ * import { extractFile } from '@kreuzberg/node';
120
+ *
121
+ * // Basic usage
122
+ * const result = await extractFile('document.pdf');
123
+ * console.log(result.content);
124
+ *
125
+ * // With chunking enabled
126
+ * const config = {
127
+ * chunking: {
128
+ * maxChars: 1000,
129
+ * maxOverlap: 200,
130
+ * },
131
+ * };
132
+ * const result2 = await extractFile('long_document.pdf', null, config);
133
+ * console.log(result2.chunks); // Array of text chunks
134
+ * ```
135
+ */
136
+ declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
137
+ /**
138
+ * Extract content from raw bytes (synchronous).
139
+ *
140
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
141
+ * which provides better performance and memory management.
142
+ *
143
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted); the declared type also accepts a string
144
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
145
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
146
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
147
+ * @throws {TypeError} When data is not a valid Uint8Array
148
+ * @throws {Error} When file cannot be read or parsed
149
+ * @throws {ParsingError} When document format is invalid or corrupted
150
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
151
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
152
+ * @throws {KreuzbergError} For other extraction-related failures
153
+ *
154
+ * @example
155
+ * ```typescript
156
+ * import { extractBytesSync } from '@kreuzberg/node';
157
+ * import { readFileSync } from 'fs';
158
+ *
159
+ * const data = readFileSync('document.pdf');
160
+ * const result = extractBytesSync(data, 'application/pdf');
161
+ * console.log(result.content);
162
+ * ```
163
+ */
164
+ declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
165
+ /**
166
+ * Extract content from raw bytes (asynchronous).
167
+ *
168
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
169
+ * which provides better performance and memory management.
170
+ *
171
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted); the declared type also accepts a string
172
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
173
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
174
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
175
+ * @throws {TypeError} When data is not a valid Uint8Array
176
+ * @throws {Error} When file cannot be read or parsed
177
+ * @throws {ParsingError} When document format is invalid or corrupted
178
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
179
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
180
+ * @throws {KreuzbergError} For other extraction-related failures
181
+ *
182
+ * @example
183
+ * ```typescript
184
+ * import { extractBytes } from '@kreuzberg/node';
185
+ * import { readFile } from 'fs/promises';
186
+ *
187
+ * const data = await readFile('document.pdf');
188
+ * const result = await extractBytes(data, 'application/pdf');
189
+ * console.log(result.content);
190
+ * ```
191
+ */
192
+ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
193
+ /**
194
+ * Extract content from multiple files in parallel (synchronous).
195
+ *
196
+ * **Recommended for**: Processing multiple documents efficiently with better
197
+ * performance and memory management compared to individual `extractFileSync()` calls.
198
+ *
199
+ * **Benefits**:
200
+ * - Parallel processing in Rust for maximum performance
201
+ * - Optimized memory usage across all extractions
202
+ * - More reliable for batch document processing
203
+ *
204
+ * @param paths - List of file paths to extract (absolute or relative paths)
205
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
206
+ * @returns Array of ExtractionResults (one per file, in same order as input)
207
+ * @throws {Error} If any file cannot be read or parsed
208
+ * @throws {ParsingError} When any document format is invalid or corrupted
209
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
210
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
211
+ * @throws {KreuzbergError} For other extraction-related failures
212
+ *
213
+ * @example
214
+ * ```typescript
215
+ * import { batchExtractFilesSync } from '@kreuzberg/node';
216
+ *
217
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
218
+ * const results = batchExtractFilesSync(files);
219
+ *
220
+ * results.forEach((result, i) => {
221
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
222
+ * });
223
+ * ```
224
+ */
225
+ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
226
+ /**
227
+ * Extract content from multiple files in parallel (asynchronous).
228
+ *
229
+ * **Recommended for**: Processing multiple documents efficiently with better
230
+ * performance and memory management compared to individual `extractFile()` calls.
231
+ *
232
+ * **Benefits**:
233
+ * - Parallel processing in Rust for maximum performance
234
+ * - Optimized memory usage across all extractions
235
+ * - More reliable for batch document processing
236
+ *
237
+ * @param paths - List of file paths to extract (absolute or relative paths)
238
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
239
+ * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input)
240
+ * @throws {Error} If any file cannot be read or parsed
241
+ * @throws {ParsingError} When any document format is invalid or corrupted
242
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
243
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
244
+ * @throws {KreuzbergError} For other extraction-related failures
245
+ *
246
+ * @example
247
+ * ```typescript
248
+ * import { batchExtractFiles } from '@kreuzberg/node';
249
+ *
250
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
251
+ * const results = await batchExtractFiles(files, {
252
+ * ocr: { backend: 'tesseract', language: 'eng' }
253
+ * });
254
+ *
255
+ * // Process all results
256
+ * const totalAmount = results
257
+ * .map(r => extractAmount(r.content))
258
+ * .reduce((a, b) => a + b, 0);
259
+ * ```
260
+ */
261
+ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
262
+ /**
263
+ * Extract content from multiple byte arrays in parallel (synchronous).
264
+ *
265
+ * **Recommended for**: Processing multiple documents from memory efficiently with better
266
+ * performance and memory management compared to individual `extractBytesSync()` calls.
267
+ *
268
+ * **Benefits**:
269
+ * - Parallel processing in Rust for maximum performance
270
+ * - Optimized memory usage across all extractions
271
+ * - More reliable for batch document processing
272
+ *
273
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
274
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
275
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
276
+ * @returns Array of ExtractionResults (one per data item, in same order as input)
277
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
278
+ * @throws {Error} If any data cannot be read or parsed
279
+ * @throws {ParsingError} When any document format is invalid or corrupted
280
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
281
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
282
+ * @throws {KreuzbergError} For other extraction-related failures
283
+ *
284
+ * @example
285
+ * ```typescript
286
+ * import { batchExtractBytesSync } from '@kreuzberg/node';
287
+ * import { readFileSync } from 'fs';
288
+ *
289
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
290
+ * const dataList = files.map(f => readFileSync(f));
291
+ * const mimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
292
+ *
293
+ * const results = batchExtractBytesSync(dataList, mimeTypes);
294
+ * results.forEach((result, i) => {
295
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
296
+ * });
297
+ * ```
298
+ */
299
+ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
300
+ /**
301
+ * Extract content from multiple byte arrays in parallel (asynchronous).
302
+ *
303
+ * **Recommended for**: Processing multiple documents from memory efficiently with better
304
+ * performance and memory management compared to individual `extractBytes()` calls.
305
+ *
306
+ * **Benefits**:
307
+ * - Parallel processing in Rust for maximum performance
308
+ * - Optimized memory usage across all extractions
309
+ * - More reliable for batch document processing
310
+ *
311
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
312
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
313
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
314
+ * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input)
315
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
316
+ * @throws {Error} If any data cannot be read or parsed
317
+ * @throws {ParsingError} When any document format is invalid or corrupted
318
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
319
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
320
+ * @throws {KreuzbergError} For other extraction-related failures
321
+ *
322
+ * @example
323
+ * ```typescript
324
+ * import { batchExtractBytes } from '@kreuzberg/node';
325
+ * import { readFile } from 'fs/promises';
326
+ *
327
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
328
+ * const dataList = await Promise.all(files.map(f => readFile(f)));
329
+ * const mimeTypes = files.map(() => 'application/pdf');
330
+ *
331
+ * const results = await batchExtractBytes(dataList, mimeTypes, {
332
+ * ocr: { backend: 'tesseract', language: 'eng' }
333
+ * });
334
+ *
335
+ * // Process all results
336
+ * const totalAmount = results
337
+ * .map(r => extractAmount(r.content))
338
+ * .reduce((a, b) => a + b, 0);
339
+ * ```
340
+ */
341
+ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
342
+ /**
343
+ * Register a custom postprocessor.
344
+ *
345
+ * **IMPORTANT**: Custom processors only work with **async extraction functions**:
346
+ * - ✅ `extractFile()`, `extractBytes()`, `batchExtractFiles()`, `batchExtractBytes()`
347
+ * - ❌ `extractFileSync()`, `extractBytesSync()`, etc. (will skip custom processors)
348
+ *
349
+ * This limitation exists because sync extraction blocks the Node.js event loop,
350
+ * preventing JavaScript callbacks from executing. For v4.0, use async extraction
351
+ * when you need custom processors.
352
+ *
353
+ * @param processor - PostProcessorProtocol implementation with name(), process(), and optional processingStage()
354
+ * @throws {Error} If processor is missing required methods (name or process)
355
+ * @throws {Error} If processor name is empty string
356
+ * @throws {Error} If a processor with the same name is already registered
357
+ *
358
+ * @example
359
+ * ```typescript
360
+ * import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
361
+ *
362
+ * class MyProcessor implements PostProcessorProtocol {
363
+ * name(): string {
364
+ * return 'my_processor';
365
+ * }
366
+ *
367
+ * process(result: ExtractionResult): ExtractionResult {
368
+ * result.metadata.customField = 'custom_value';
369
+ * return result;
370
+ * }
371
+ *
372
+ * processingStage(): 'early' | 'middle' | 'late' {
373
+ * return 'middle';
374
+ * }
375
+ * }
376
+ *
377
+ * registerPostProcessor(new MyProcessor());
378
+ *
379
+ * // Use async extraction (required for custom processors)
380
+ * const result = await extractFile('document.pdf');
381
+ * console.log(result.metadata.customField); // 'custom_value'
382
+ * ```
383
+ */
384
+ declare function registerPostProcessor(processor: PostProcessorProtocol): void;
385
+ /**
386
+ * Unregister a postprocessor by name.
387
+ *
388
+ * Removes a previously registered postprocessor from the registry.
389
+ * If the processor doesn't exist, this is a no-op (does not throw).
390
+ *
391
+ * @param name - Name of the processor to unregister (case-sensitive)
392
+ *
393
+ * @example
394
+ * ```typescript
395
+ * import { unregisterPostProcessor } from '@kreuzberg/node';
396
+ *
397
+ * unregisterPostProcessor('my_processor');
398
+ * ```
399
+ */
400
+ declare function unregisterPostProcessor(name: string): void;
401
+ /**
402
+ * Clear all registered postprocessors.
403
+ *
404
+ * Removes all postprocessors from the registry. Useful for test cleanup or resetting state.
405
+ * If no postprocessors are registered, this is a no-op.
406
+ *
407
+ * @example
408
+ * ```typescript
409
+ * import { clearPostProcessors } from '@kreuzberg/node';
410
+ *
411
+ * clearPostProcessors();
412
+ * ```
413
+ */
414
+ declare function clearPostProcessors(): void;
415
+ /**
416
+ * List all registered post-processors.
417
+ *
418
+ * Returns the names of all currently registered post-processors (both built-in and custom).
419
+ *
420
+ * @returns Array of post-processor names (empty array if none registered)
421
+ *
422
+ * @example
423
+ * ```typescript
424
+ * import { listPostProcessors } from '@kreuzberg/node';
425
+ *
426
+ * const names = listPostProcessors();
427
+ * console.log('Registered post-processors:', names);
428
+ * ```
429
+ */
430
+ declare function listPostProcessors(): string[];
431
+ /**
432
+ * Register a custom validator.
433
+ *
434
+ * Validators check extraction results for quality, completeness, or correctness.
435
+ * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
436
+ * the extraction fails immediately.
437
+ *
438
+ * @param validator - ValidatorProtocol implementation with name(), validate(), and optional priority()/shouldValidate()
439
+ * @throws {Error} If validator is missing required methods (name or validate)
440
+ * @throws {Error} If validator name is empty string
441
+ * @throws {Error} If a validator with the same name is already registered
442
+ *
443
+ * @example
444
+ * ```typescript
445
+ * import { registerValidator } from '@kreuzberg/node';
446
+ *
447
+ * class MinLengthValidator implements ValidatorProtocol {
448
+ * name(): string {
449
+ * return 'min_length_validator';
450
+ * }
451
+ *
452
+ * priority(): number {
453
+ * return 100; // Run early
454
+ * }
455
+ *
456
+ * validate(result: ExtractionResult): void {
457
+ * if (result.content.length < 100) {
458
+ * throw new Error('Content too short: minimum 100 characters required');
459
+ * }
460
+ * }
461
+ * }
462
+ *
463
+ * registerValidator(new MinLengthValidator());
464
+ * ```
465
+ */
466
+ declare function registerValidator(validator: ValidatorProtocol): void;
467
+ /**
468
+ * Unregister a validator by name.
469
+ *
470
+ * Removes a previously registered validator from the global registry.
471
+ * If the validator doesn't exist, this is a no-op (does not throw).
472
+ *
473
+ * @param name - Validator name to unregister (case-sensitive)
474
+ *
475
+ * @example
476
+ * ```typescript
477
+ * import { unregisterValidator } from '@kreuzberg/node';
478
+ *
479
+ * unregisterValidator('min_length_validator');
480
+ * ```
481
+ */
482
+ declare function unregisterValidator(name: string): void;
483
+ /**
484
+ * Clear all registered validators.
485
+ *
486
+ * Removes all validators from the global registry. Useful for test cleanup
487
+ * or resetting state.
488
+ *
489
+ * @example
490
+ * ```typescript
491
+ * import { clearValidators } from '@kreuzberg/node';
492
+ *
493
+ * clearValidators();
494
+ * ```
495
+ */
496
+ declare function clearValidators(): void;
497
+ /**
498
+ * List all registered validators.
499
+ *
500
+ * Returns the names of all currently registered validators (both built-in and custom).
501
+ *
502
+ * @returns Array of validator names (empty array if none registered)
503
+ *
504
+ * @example
505
+ * ```typescript
506
+ * import { listValidators } from '@kreuzberg/node';
507
+ *
508
+ * const names = listValidators();
509
+ * console.log('Registered validators:', names);
510
+ * ```
511
+ */
512
+ declare function listValidators(): string[];
513
+ declare function registerOcrBackend(backend: OcrBackendProtocol): void;
514
+ /**
515
+ * List all registered OCR backends.
516
+ *
517
+ * Returns an array of names of all currently registered OCR backends,
518
+ * including built-in backends like "tesseract".
519
+ *
520
+ * @returns Array of OCR backend names (empty array if none registered)
521
+ *
522
+ * @example
523
+ * ```typescript
524
+ * import { listOcrBackends } from '@kreuzberg/node';
525
+ *
526
+ * const backends = listOcrBackends();
527
+ * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
528
+ * ```
529
+ */
530
+ declare function listOcrBackends(): string[];
531
+ /**
532
+ * Unregister an OCR backend by name.
533
+ *
534
+ * Removes the specified OCR backend from the registry. If the backend doesn't exist,
535
+ * this operation is a no-op (does not throw an error).
536
+ *
537
+ * @param name - Name of the OCR backend to unregister
538
+ *
539
+ * @example
540
+ * ```typescript
541
+ * import { unregisterOcrBackend } from '@kreuzberg/node';
542
+ *
543
+ * // Unregister a custom backend
544
+ * unregisterOcrBackend('my-custom-ocr');
545
+ * ```
546
+ */
547
+ declare function unregisterOcrBackend(name: string): void;
548
+ /**
549
+ * Clear all registered OCR backends.
550
+ *
551
+ * Removes all OCR backends from the registry, including built-in backends.
552
+ * Use with caution as this will make OCR functionality unavailable until
553
+ * backends are re-registered. If no backends are registered, this is a no-op.
554
+ *
555
+ * @example
556
+ * ```typescript
557
+ * import { clearOcrBackends } from '@kreuzberg/node';
558
+ *
559
+ * clearOcrBackends();
560
+ * ```
561
+ */
562
+ declare function clearOcrBackends(): void;
563
+ /**
564
+ * List all registered document extractors.
565
+ *
566
+ * Returns an array of names of all currently registered document extractors,
567
+ * including built-in extractors for PDF, Office documents, images, etc.
568
+ *
569
+ * @returns Array of document extractor names (empty array if none registered)
570
+ *
571
+ * @example
572
+ * ```typescript
573
+ * import { listDocumentExtractors } from '@kreuzberg/node';
574
+ *
575
+ * const extractors = listDocumentExtractors();
576
+ * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
577
+ * ```
578
+ */
579
+ declare function listDocumentExtractors(): string[];
580
+ /**
581
+ * Unregister a document extractor by name.
582
+ *
583
+ * Removes the specified document extractor from the registry. If the extractor
584
+ * doesn't exist, this operation is a no-op (does not throw an error).
585
+ *
586
+ * @param name - Name of the document extractor to unregister
587
+ *
588
+ * @example
589
+ * ```typescript
590
+ * import { unregisterDocumentExtractor } from '@kreuzberg/node';
591
+ *
592
+ * // Unregister a custom extractor
593
+ * unregisterDocumentExtractor('MyCustomExtractor');
594
+ * ```
595
+ */
596
+ declare function unregisterDocumentExtractor(name: string): void;
597
+ /**
598
+ * Clear all registered document extractors.
599
+ *
600
+ * Removes all document extractors from the registry, including built-in extractors.
601
+ * Use with caution as this will make document extraction unavailable until
602
+ * extractors are re-registered.
603
+ *
604
+ * @example
605
+ * ```typescript
606
+ * import { clearDocumentExtractors } from '@kreuzberg/node';
607
+ *
608
+ * clearDocumentExtractors();
609
+ * ```
610
+ */
611
+ declare function clearDocumentExtractors(): void;
612
+ /**
613
+ * ExtractionConfig namespace with static methods for loading configuration from files.
614
+ *
615
+ * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
616
+ * or to discover configuration files in the current directory tree.
617
+ *
618
+ * For creating configurations programmatically, use plain TypeScript objects instead:
619
+ *
620
+ * @example
621
+ * ```typescript
622
+ * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
623
+ *
624
+ * // Load configuration from file
625
+ * const config1 = ExtractionConfig.fromFile('config.toml');
626
+ *
627
+ * // Or create with plain object
628
+ * const config2 = {
629
+ * chunking: { maxChars: 2048 },
630
+ * ocr: { backend: 'tesseract', language: 'eng' }
631
+ * };
632
+ *
633
+ * // Use with extraction
634
+ * const result = await extractFile('document.pdf', null, config2);
635
+ * ```
636
+ */
637
+ declare const ExtractionConfig: {
638
+ /**
639
+ * Load extraction configuration from a file.
640
+ *
641
+ * Automatically detects the file format based on extension:
642
+ * - `.toml` - TOML format
643
+ * - `.yaml` - YAML format
644
+ * - `.json` - JSON format
645
+ *
646
+ * @param filePath - Path to the configuration file (absolute or relative)
647
+ * @returns ExtractionConfig object loaded from the file
648
+ *
649
+ * @throws {Error} If file does not exist or is not accessible
650
+ * @throws {Error} If file content is not valid TOML/YAML/JSON
651
+ * @throws {Error} If configuration structure is invalid
652
+ * @throws {Error} If file extension is not supported
653
+ *
654
+ * @example
655
+ * ```typescript
656
+ * import { ExtractionConfig } from '@kreuzberg/node';
657
+ *
658
+ * // Load from TOML file
659
+ * const config1 = ExtractionConfig.fromFile('kreuzberg.toml');
660
+ *
661
+ * // Load from YAML file
662
+ * const config2 = ExtractionConfig.fromFile('./config.yaml');
663
+ *
664
+ * // Load from JSON file
665
+ * const config3 = ExtractionConfig.fromFile('./config.json');
666
+ * ```
667
+ */
668
+ fromFile(filePath: string): ExtractionConfig$1;
669
+ /**
670
+ * Discover and load configuration from current or parent directories.
671
+ *
672
+ * Searches for a `kreuzberg.toml` file starting from the current working directory
673
+ * and traversing up the directory tree. Returns the first configuration file found.
674
+ *
675
+ * @returns ExtractionConfig object if found, or null if no configuration file exists
676
+ *
677
+ * @example
678
+ * ```typescript
679
+ * import { ExtractionConfig } from '@kreuzberg/node';
680
+ *
681
+ * // Try to find config in current or parent directories
682
+ * const config = ExtractionConfig.discover();
683
+ * if (config) {
684
+ * console.log('Found configuration');
685
+ * // Use config for extraction
686
+ * } else {
687
+ * console.log('No configuration file found, using defaults');
688
+ * }
689
+ * ```
690
+ */
691
+ discover(): ExtractionConfig$1 | null;
692
+ };
693
+ /**
694
+ * Detect MIME type from raw bytes.
695
+ *
696
+ * Uses content inspection (magic bytes) to determine MIME type.
697
+ * This is more accurate than extension-based detection but requires
698
+ * reading the file content.
699
+ *
700
+ * @param bytes - Raw file content as Buffer
701
+ * @returns The detected MIME type string
702
+ *
703
+ * @throws {Error} If MIME type cannot be determined from content
704
+ *
705
+ * @example
706
+ * ```typescript
707
+ * import { detectMimeType } from '@kreuzberg/node';
708
+ * import * as fs from 'fs';
709
+ *
710
+ * // Read file content
711
+ * const content = fs.readFileSync('document.pdf');
712
+ *
713
+ * // Detect MIME type from bytes
714
+ * const mimeType = detectMimeType(content);
715
+ * console.log(mimeType); // 'application/pdf'
716
+ * ```
717
+ */
718
+ declare function detectMimeType(bytes: Buffer): string;
719
+ /**
720
+ * Detect MIME type from a file path.
721
+ *
722
+ * Determines the MIME type based on the file extension in the provided path.
723
+ * By default, checks if the file exists; can be disabled with checkExists parameter.
724
+ *
725
+ * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
726
+ * @param checkExists - Whether to verify the file exists (default: true)
727
+ * @returns The detected MIME type as a string (e.g., 'application/pdf')
728
+ *
729
+ * @throws {Error} If MIME type cannot be determined from the file extension,
730
+ * or if checkExists is true and the file does not exist
731
+ *
732
+ * @example
733
+ * ```typescript
734
+ * import { detectMimeTypeFromPath } from '@kreuzberg/node';
735
+ *
736
+ * // Detect MIME type from existing file
737
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
738
+ * console.log(mimeType); // 'application/pdf'
739
+ *
740
+ * // Detect without checking file existence
741
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
742
+ * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
743
+ * ```
744
+ */
745
+ declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
746
+ /**
747
+ * Validate that a MIME type is supported by Kreuzberg.
748
+ *
749
+ * Checks if a MIME type is in the list of supported formats. Note that any
750
+ * `image/*` MIME type is automatically considered valid.
751
+ *
752
+ * @param mimeType - The MIME type to validate (string)
753
+ * @returns The validated MIME type (may be normalized)
754
+ *
755
+ * @throws {Error} If the MIME type is not supported
756
+ *
757
+ * @example
758
+ * ```typescript
759
+ * import { validateMimeType } from '@kreuzberg/node';
760
+ *
761
+ * // Validate supported type
762
+ * const validated = validateMimeType('application/pdf');
763
+ * console.log(validated); // 'application/pdf'
764
+ *
765
+ * // Validate custom image type
766
+ * const validated2 = validateMimeType('image/custom-format');
767
+ * console.log(validated2); // 'image/custom-format' (any image/* is valid)
768
+ *
769
+ * // Validate unsupported type (throws error)
770
+ * try {
771
+ * validateMimeType('video/mp4');
772
+ * } catch (err) {
773
+ * console.error(err); // Error: Unsupported format: video/mp4
774
+ * }
775
+ * ```
776
+ */
777
+ declare function validateMimeType(mimeType: string): string;
778
+ /**
779
+ * Get file extensions for a given MIME type.
780
+ *
781
+ * Returns an array of file extensions commonly associated with the specified
782
+ * MIME type. For example, 'application/pdf' returns ['pdf'].
783
+ *
784
+ * @param mimeType - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
785
+ * @returns Array of file extensions (without leading dots)
786
+ *
787
+ * @throws {Error} If the MIME type is not recognized or supported
788
+ *
789
+ * @example
790
+ * ```typescript
791
+ * import { getExtensionsForMime } from '@kreuzberg/node';
792
+ *
793
+ * // Get extensions for PDF
794
+ * const pdfExts = getExtensionsForMime('application/pdf');
795
+ * console.log(pdfExts); // ['pdf']
796
+ *
797
+ * // Get extensions for JPEG
798
+ * const jpegExts = getExtensionsForMime('image/jpeg');
799
+ * console.log(jpegExts); // ['jpg', 'jpeg']
800
+ * ```
801
+ */
802
+ declare function getExtensionsForMime(mimeType: string): string[];
803
+ /**
804
+ * Embedding preset configuration.
805
+ *
806
+ * Contains all settings for a specific embedding model preset.
807
+ */
808
+ interface EmbeddingPreset {
809
+ /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
810
+ name: string;
811
+ /** Recommended chunk size in characters */
812
+ chunkSize: number;
813
+ /** Recommended overlap in characters */
814
+ overlap: number;
815
+ /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
816
+ modelName: string;
817
+ /** Embedding vector dimensions */
818
+ dimensions: number;
819
+ /** Human-readable description of the preset */
820
+ description: string;
821
+ }
822
+ /**
823
+ * List all available embedding preset names.
824
+ *
825
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
826
+ *
827
+ * @returns Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
828
+ *
829
+ * @example
830
+ * ```typescript
831
+ * import { listEmbeddingPresets } from '@kreuzberg/node';
832
+ *
833
+ * const presets = listEmbeddingPresets();
834
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
835
+ * ```
836
+ */
837
+ declare function listEmbeddingPresets(): string[];
838
+ /**
839
+ * Get a specific embedding preset by name.
840
+ *
841
+ * Returns a preset configuration object, or null if the preset name is not found.
842
+ *
843
+ * @param name - The preset name (case-sensitive)
844
+ * @returns An `EmbeddingPreset` object or `null` if not found
845
+ *
846
+ * @example
847
+ * ```typescript
848
+ * import { getEmbeddingPreset } from '@kreuzberg/node';
849
+ *
850
+ * const preset = getEmbeddingPreset('balanced');
851
+ * if (preset) {
852
+ * console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
853
+ * // Model: BGEBaseENV15, Dims: 768
854
+ * }
855
+ * ```
856
+ */
857
+ declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
858
+ /**
859
+ * Get the error code for the last FFI error.
860
+ *
861
+ * Returns the FFI error code as an integer. This is useful for programmatic error handling
862
+ * and distinguishing between different types of failures in native code.
863
+ *
864
+ * Error codes:
865
+ * - 0: Success (no error)
866
+ * - 1: GenericError
867
+ * - 2: Panic
868
+ * - 3: InvalidArgument
869
+ * - 4: IoError
870
+ * - 5: ParsingError
871
+ * - 6: OcrError
872
+ * - 7: MissingDependency
873
+ *
874
+ * @returns The integer error code
875
+ *
876
+ * @example
877
+ * ```typescript
878
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
879
+ *
880
+ * try {
881
+ * const result = await extractFile('document.pdf');
882
+ * } catch (error) {
883
+ * const code = getLastErrorCode();
884
+ * if (code === ErrorCode.Panic) {
885
+ * console.error('Native code panic detected');
886
+ * }
887
+ * }
888
+ * ```
889
+ */
890
+ declare function getLastErrorCode(): number;
891
+ /**
892
+ * Get panic context information if the last error was a panic.
893
+ *
894
+ * Returns detailed information about a panic in native code, or null if the last error was not a panic.
895
+ * This provides debugging information when native code panics.
896
+ *
897
+ * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
898
+ *
899
+ * @example
900
+ * ```typescript
901
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
902
+ *
903
+ * try {
904
+ * const result = await extractFile('document.pdf');
905
+ * } catch (error) {
906
+ * const context = getLastPanicContext();
907
+ * if (context) {
908
+ * console.error(`Panic at ${context.file}:${context.line}`);
909
+ * console.error(`In function: ${context.function}`);
910
+ * console.error(`Message: ${context.message}`);
911
+ * }
912
+ * }
913
+ * ```
914
+ */
915
+ declare function getLastPanicContext(): PanicContext | null;
916
+ /**
917
+ * Returns the human-readable name for an error code.
918
+ *
919
+ * Maps numeric error codes to their string names, providing a consistent way
920
+ * to get error code names across all platforms.
921
+ *
922
+ * @param code - The numeric error code (0-7)
923
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
924
+ *
925
+ * @example
926
+ * ```typescript
927
+ * import { getErrorCodeName } from '@kreuzberg/node';
928
+ *
929
+ * const validationName = getErrorCodeName(0); // returns "validation"
930
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
931
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
932
+ * ```
933
+ */
934
+ declare function getErrorCodeName(code: number): string;
935
+ /**
936
+ * Returns the description for an error code.
937
+ *
938
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
939
+ *
940
+ * @param code - The numeric error code (0-7)
941
+ * @returns A brief description of the error type
942
+ *
943
+ * @example
944
+ * ```typescript
945
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
946
+ *
947
+ * const validationDesc = getErrorCodeDescription(0); // returns "Input validation error"
948
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
949
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
950
+ * ```
951
+ */
952
+ declare function getErrorCodeDescription(code: number): string;
953
+ /**
954
+ * Classifies an error message string into an error code category.
955
+ *
956
+ * This function analyzes the error message content and returns the most likely
957
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
958
+ * errors for handling purposes.
959
+ *
960
+ * The classification is based on keyword matching:
961
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
962
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
963
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
964
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
965
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
966
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
967
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
968
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
969
+ *
970
+ * @param errorMessage - The error message string to classify
971
+ * @returns An object with the classification details
972
+ *
973
+ * @example
974
+ * ```typescript
975
+ * import { classifyError } from '@kreuzberg/node';
976
+ *
977
+ * const parsingResult = classifyError("PDF file is corrupted");
978
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
979
+ *
980
+ * const dependencyResult = classifyError("Tesseract not found");
981
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
982
+ * ```
983
+ */
984
+ declare function classifyError(errorMessage: string): ErrorClassification;
985
+ /**
986
+ * Create a worker pool for concurrent file extraction.
987
+ *
988
+ * The worker pool manages a set of background worker threads that can process
989
+ * extraction requests concurrently, improving throughput when handling multiple files.
990
+ *
991
+ * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
992
+ * @returns A WorkerPool instance to use with extraction functions
993
+ *
994
+ * @throws {Error} If size is invalid or pool creation fails
995
+ *
996
+ * @example
997
+ * ```typescript
998
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
999
+ *
1000
+ * // Create pool with 4 workers
1001
+ * const pool = createWorkerPool(4);
1002
+ *
1003
+ * try {
1004
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1005
+ * console.log(result.content);
1006
+ * } finally {
1007
+ * // Always close the pool when done
1008
+ * await closeWorkerPool(pool);
1009
+ * }
1010
+ * ```
1011
+ */
1012
+ declare function createWorkerPool(size?: number): WorkerPool;
1013
+ /**
1014
+ * Get statistics about a worker pool.
1015
+ *
1016
+ * Returns information about the pool's current state, including the number of active workers,
1017
+ * queued tasks, and total processed tasks.
1018
+ *
1019
+ * @param pool - The worker pool instance
1020
+ * @returns WorkerPoolStats with pool information
1021
+ *
1022
+ * @example
1023
+ * ```typescript
1024
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1025
+ *
1026
+ * const pool = createWorkerPool(4);
1027
+ * const stats = getWorkerPoolStats(pool);
1028
+ *
1029
+ * console.log(`Pool size: ${stats.size}`);
1030
+ * console.log(`Active workers: ${stats.activeWorkers}`);
1031
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
1032
+ * ```
1033
+ */
1034
+ declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
1035
+ /**
1036
+ * Extract content from a single file using a worker pool (asynchronous).
1037
+ *
1038
+ * Submits an extraction task to the worker pool. The task is executed by one of the
1039
+ * available workers in the background, allowing other tasks to be processed concurrently.
1040
+ *
1041
+ * @param pool - The worker pool instance
1042
+ * @param filePath - Path to the file to extract
1043
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
1044
+ * @param maybeConfig - Optional extraction configuration (used when `mimeTypeOrConfig` is a MIME type)
1045
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
1046
+ *
1047
+ * @throws {Error} If the file cannot be read or extraction fails
1048
+ *
1049
+ * @example
1050
+ * ```typescript
1051
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1052
+ *
1053
+ * const pool = createWorkerPool(4);
1054
+ *
1055
+ * try {
1056
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
1057
+ * const results = await Promise.all(
1058
+ * files.map(f => extractFileInWorker(pool, f))
1059
+ * );
1060
+ *
1061
+ * results.forEach((r, i) => {
1062
+ * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
1063
+ * });
1064
+ * } finally {
1065
+ * await closeWorkerPool(pool);
1066
+ * }
1067
+ * ```
1068
+ */
1069
+ declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
1070
+ /**
1071
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
1072
+ *
1073
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
1074
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
1075
+ *
1076
+ * @param pool - The worker pool instance
1077
+ * @param paths - Array of file paths to extract
1078
+ * @param config - Optional extraction configuration object (applies to all files); may be omitted or `null`
1079
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
1080
+ *
1081
+ * @throws {Error} If any file cannot be read or extraction fails
1082
+ *
1083
+ * @example
1084
+ * ```typescript
1085
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
1086
+ *
1087
+ * const pool = createWorkerPool(4);
1088
+ *
1089
+ * try {
1090
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
1091
+ * const results = await batchExtractFilesInWorker(pool, files, {
1092
+ * ocr: { backend: 'tesseract', language: 'eng' }
1093
+ * });
1094
+ *
1095
+ * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0); // extractAmount: caller-supplied helper, not exported by this package
1096
+ * console.log(`Total: $${total}`);
1097
+ * } finally {
1098
+ * await closeWorkerPool(pool);
1099
+ * }
1100
+ * ```
1101
+ */
1102
+ declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
1103
+ /**
1104
+ * Close a worker pool and shut down all worker threads.
1105
+ *
1106
+ * Should be called when the pool is no longer needed to clean up resources
1107
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
1108
+ *
1109
+ * @param pool - The worker pool instance to close
1110
+ * @returns Promise that resolves when the pool is fully closed
1111
+ *
1112
+ * @throws {Error} If pool shutdown fails
1113
+ *
1114
+ * @example
1115
+ * ```typescript
1116
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1117
+ *
1118
+ * const pool = createWorkerPool(4);
1119
+ *
1120
+ * try {
1121
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1122
+ * console.log(result.content);
1123
+ * } finally {
1124
+ * // Clean up the pool
1125
+ * await closeWorkerPool(pool);
1126
+ * }
1127
+ * ```
1128
+ */
1129
+ declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
1130
+ declare const __version__ = "4.0.0";
1131
+
1132
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };