@kreuzberg/node 4.0.0-rc.6 → 4.0.0-rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,857 @@
1
+ import { PanicContext } from './errors.js';
2
+ export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
4
+ export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
5
+ export { GutenOcrBackend } from './ocr/guten-ocr.js';
6
+
7
+ /**
8
+ * Kreuzberg - Multi-language document intelligence framework.
9
+ *
10
+ * This is a TypeScript SDK around a high-performance Rust core.
11
+ * All extraction logic, chunking, quality processing, and language detection
12
+ * are implemented in Rust for maximum performance.
13
+ *
14
+ * ## API Usage Recommendations
15
+ *
16
+ * **For processing multiple documents**, prefer batch APIs:
17
+ * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
18
+ * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
19
+ *
20
+ * **Batch APIs provide**:
21
+ * - Better performance (parallel processing in Rust)
22
+ * - More reliable memory management
23
+ * - Recommended for all multi-document workflows
24
+ *
25
+ * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
26
+ * - One-off document processing
27
+ * - Interactive applications processing documents on-demand
28
+ * - Note: avoid calling these in tight loops - use batch APIs instead
29
+ *
30
+ * ## Supported Formats
31
+ *
32
+ * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
33
+ * - **Text**: Markdown, Plain Text, XML
34
+ * - **Web**: HTML (converted to Markdown)
35
+ * - **Data**: JSON, YAML, TOML
36
+ * - **Email**: EML, MSG
37
+ * - **Images**: PNG, JPEG, TIFF (with OCR support)
38
+ *
39
+ * @example
40
+ * ```typescript
41
+ * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
42
+ *
43
+ * // Single file extraction
44
+ * const result = await extractFile('document.pdf');
45
+ * console.log(result.content);
46
+ *
47
+ * // Multiple files (recommended approach)
48
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
49
+ * const results = await batchExtractFiles(files);
50
+ * results.forEach(r => console.log(r.content));
51
+ * ```
52
+ */
53
+
54
+ /**
55
+ * @internal Allows tests to provide a mocked native binding.
56
+ */
57
+ declare function __setBindingForTests(mock: unknown): void;
58
+ /**
59
+ * @internal Resets the cached native binding for tests.
60
+ */
61
+ declare function __resetBindingForTests(): void;
62
+ /**
63
+ * Extract content from a single file (synchronous).
64
+ *
65
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
66
+ * provides better performance and memory management.
67
+ *
68
+ * @param filePath - Path to the file (string)
69
+ * @param mimeType - Optional MIME type hint (auto-detected if null)
70
+ * @param config - Extraction configuration (uses defaults if null)
71
+ * @returns ExtractionResult with content, metadata, and tables
72
+ *
73
+ * @example
74
+ * ```typescript
75
+ * import { extractFileSync } from '@kreuzberg/node';
76
+ *
77
+ * // Basic usage
78
+ * const result = extractFileSync('document.pdf');
79
+ * console.log(result.content);
80
+ *
81
+ * // With OCR configuration
82
+ * const config = {
83
+ * ocr: {
84
+ * backend: 'tesseract',
85
+ * language: 'eng',
86
+ * tesseractConfig: {
87
+ * psm: 6,
88
+ * enableTableDetection: true,
89
+ * },
90
+ * },
91
+ * };
92
+ * const result2 = extractFileSync('scanned.pdf', null, config);
93
+ * ```
94
+ */
95
+ declare function extractFileSync(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): ExtractionResult;
96
+ /**
97
+ * Extract content from a single file (asynchronous).
98
+ *
99
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
100
+ * provides better performance and memory management.
101
+ *
102
+ * @param filePath - Path to the file (string)
103
+ * @param mimeType - Optional MIME type hint (auto-detected if null)
104
+ * @param config - Extraction configuration (uses defaults if null)
105
+ * @returns Promise<ExtractionResult> with content, metadata, and tables
106
+ *
107
+ * @example
108
+ * ```typescript
109
+ * import { extractFile } from '@kreuzberg/node';
110
+ *
111
+ * // Basic usage
112
+ * const result = await extractFile('document.pdf');
113
+ * console.log(result.content);
114
+ *
115
+ * // With chunking enabled
116
+ * const config = {
117
+ * chunking: {
118
+ * maxChars: 1000,
119
+ * maxOverlap: 200,
120
+ * },
121
+ * };
122
+ * const result2 = await extractFile('long_document.pdf', null, config);
123
+ * console.log(result2.chunks); // Array of text chunks
124
+ * ```
125
+ */
126
+ declare function extractFile(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
127
+ /**
128
+ * Extract content from raw bytes (synchronous).
129
+ *
130
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
131
+ * which provides better performance and memory management.
132
+ *
133
+ * @param data - File content as Uint8Array
134
+ * @param mimeType - MIME type of the data (required for format detection)
135
+ * @param config - Extraction configuration (uses defaults if null)
136
+ * @returns ExtractionResult with content, metadata, and tables
137
+ *
138
+ * @example
139
+ * ```typescript
140
+ * import { extractBytesSync } from '@kreuzberg/node';
141
+ * import { readFileSync } from 'fs';
142
+ *
143
+ * const data = readFileSync('document.pdf');
144
+ * const result = extractBytesSync(data, 'application/pdf');
145
+ * console.log(result.content);
146
+ * ```
147
+ */
148
+ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
149
+ /**
150
+ * Extract content from raw bytes (asynchronous).
151
+ *
152
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
153
+ * which provides better performance and memory management.
154
+ *
155
+ * @param data - File content as Uint8Array
156
+ * @param mimeType - MIME type of the data (required for format detection)
157
+ * @param config - Extraction configuration (uses defaults if null)
158
+ * @returns Promise<ExtractionResult> with content, metadata, and tables
159
+ *
160
+ * @example
161
+ * ```typescript
162
+ * import { extractBytes } from '@kreuzberg/node';
163
+ * import { readFile } from 'fs/promises';
164
+ *
165
+ * const data = await readFile('document.pdf');
166
+ * const result = await extractBytes(data, 'application/pdf');
167
+ * console.log(result.content);
168
+ * ```
169
+ */
170
+ declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
171
+ /**
172
+ * Extract content from multiple files in parallel (synchronous).
173
+ *
174
+ * **Recommended for**: Processing multiple documents efficiently with better
175
+ * performance and memory management compared to individual `extractFileSync()` calls.
176
+ *
177
+ * **Benefits**:
178
+ * - Parallel processing in Rust for maximum performance
179
+ * - Optimized memory usage across all extractions
180
+ * - More reliable for batch document processing
181
+ *
182
+ * @param paths - List of file paths to extract
183
+ * @param config - Extraction configuration (uses defaults if null)
184
+ * @returns Array of ExtractionResults (one per file, in same order as input)
185
+ *
186
+ * @example
187
+ * ```typescript
188
+ * import { batchExtractFilesSync } from '@kreuzberg/node';
189
+ *
190
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
191
+ * const results = batchExtractFilesSync(files);
192
+ *
193
+ * results.forEach((result, i) => {
194
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
195
+ * });
196
+ * ```
197
+ */
198
+ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
199
+ /**
200
+ * Extract content from multiple files in parallel (asynchronous).
201
+ *
202
+ * **Recommended for**: Processing multiple documents efficiently with better
203
+ * performance and memory management compared to individual `extractFile()` calls.
204
+ *
205
+ * **Benefits**:
206
+ * - Parallel processing in Rust for maximum performance
207
+ * - Optimized memory usage across all extractions
208
+ * - More reliable for batch document processing
209
+ *
210
+ * @param paths - List of file paths to extract
211
+ * @param config - Extraction configuration (uses defaults if null)
212
+ * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input)
213
+ *
214
+ * @example
215
+ * ```typescript
216
+ * import { batchExtractFiles } from '@kreuzberg/node';
217
+ *
218
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
219
+ * const results = await batchExtractFiles(files, {
220
+ * ocr: { backend: 'tesseract', language: 'eng' }
221
+ * });
222
+ *
223
+ * // Process all results
224
+ * const totalAmount = results
225
+ * .map(r => extractAmount(r.content))
226
+ * .reduce((a, b) => a + b, 0);
227
+ * ```
228
+ */
229
+ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
230
+ /**
231
+ * Extract content from multiple byte arrays in parallel (synchronous).
232
+ *
233
+ * **Recommended for**: Processing multiple documents from memory efficiently with better
234
+ * performance and memory management compared to individual `extractBytesSync()` calls.
235
+ *
236
+ * **Benefits**:
237
+ * - Parallel processing in Rust for maximum performance
238
+ * - Optimized memory usage across all extractions
239
+ * - More reliable for batch document processing
240
+ *
241
+ * @param dataList - List of file contents as Uint8Arrays
242
+ * @param mimeTypes - List of MIME types (one per data item, required for format detection)
243
+ * @param config - Extraction configuration (uses defaults if null)
244
+ * @returns Array of ExtractionResults (one per data item, in same order as input)
245
+ *
246
+ * @example
247
+ * ```typescript
248
+ * import { batchExtractBytesSync } from '@kreuzberg/node';
249
+ * import { readFileSync } from 'fs';
250
+ *
251
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
252
+ * const dataList = files.map(f => readFileSync(f));
253
+ * const mimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
254
+ *
255
+ * const results = batchExtractBytesSync(dataList, mimeTypes);
256
+ * results.forEach((result, i) => {
257
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
258
+ * });
259
+ * ```
260
+ */
261
+ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
262
+ /**
263
+ * Extract content from multiple byte arrays in parallel (asynchronous).
264
+ *
265
+ * **Recommended for**: Processing multiple documents from memory efficiently with better
266
+ * performance and memory management compared to individual `extractBytes()` calls.
267
+ *
268
+ * **Benefits**:
269
+ * - Parallel processing in Rust for maximum performance
270
+ * - Optimized memory usage across all extractions
271
+ * - More reliable for batch document processing
272
+ *
273
+ * @param dataList - List of file contents as Uint8Arrays
274
+ * @param mimeTypes - List of MIME types (one per data item, required for format detection)
275
+ * @param config - Extraction configuration (uses defaults if null)
276
+ * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input)
277
+ *
278
+ * @example
279
+ * ```typescript
280
+ * import { batchExtractBytes } from '@kreuzberg/node';
281
+ * import { readFile } from 'fs/promises';
282
+ *
283
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
284
+ * const dataList = await Promise.all(files.map(f => readFile(f)));
285
+ * const mimeTypes = files.map(() => 'application/pdf');
286
+ *
287
+ * const results = await batchExtractBytes(dataList, mimeTypes, {
288
+ * ocr: { backend: 'tesseract', language: 'eng' }
289
+ * });
290
+ *
291
+ * // Process all results
292
+ * const totalAmount = results
293
+ * .map(r => extractAmount(r.content))
294
+ * .reduce((a, b) => a + b, 0);
295
+ * ```
296
+ */
297
+ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
298
+ /**
299
+ * Register a custom postprocessor.
300
+ *
301
+ * **IMPORTANT**: Custom processors only work with **async extraction functions**:
302
+ * - ✅ `extractFile()`, `extractBytes()`, `batchExtractFiles()`, `batchExtractBytes()`
303
+ * - ❌ `extractFileSync()`, `extractBytesSync()`, etc. (will skip custom processors)
304
+ *
305
+ * This limitation exists because sync extraction blocks the Node.js event loop,
306
+ * preventing JavaScript callbacks from executing. For v4.0, use async extraction
307
+ * when you need custom processors.
308
+ *
309
+ * @param processor - PostProcessorProtocol implementation
310
+ *
311
+ * @example
312
+ * ```typescript
313
+ * import { registerPostProcessor, extractFile, ExtractionResult, PostProcessorProtocol } from '@kreuzberg/node';
314
+ *
315
+ * class MyProcessor implements PostProcessorProtocol {
316
+ * name(): string {
317
+ * return 'my_processor';
318
+ * }
319
+ *
320
+ * process(result: ExtractionResult): ExtractionResult {
321
+ * result.metadata.customField = 'custom_value';
322
+ * return result;
323
+ * }
324
+ *
325
+ * processingStage(): 'early' | 'middle' | 'late' {
326
+ * return 'middle';
327
+ * }
328
+ * }
329
+ *
330
+ * registerPostProcessor(new MyProcessor());
331
+ *
332
+ * // Use async extraction (required for custom processors)
333
+ * const result = await extractFile('document.pdf');
334
+ * console.log(result.metadata.customField); // 'custom_value'
335
+ * ```
336
+ */
337
+ declare function registerPostProcessor(processor: PostProcessorProtocol): void;
338
+ /**
339
+ * Unregister a postprocessor by name.
340
+ *
341
+ * Removes a previously registered postprocessor from the registry.
342
+ *
343
+ * @param name - Name of the processor to unregister
344
+ *
345
+ * @example
346
+ * ```typescript
347
+ * import { unregisterPostProcessor } from '@kreuzberg/node';
348
+ *
349
+ * unregisterPostProcessor('my_processor');
350
+ * ```
351
+ */
352
+ declare function unregisterPostProcessor(name: string): void;
353
+ /**
354
+ * Clear all registered postprocessors.
355
+ *
356
+ * Removes all postprocessors from the registry.
357
+ *
358
+ * @example
359
+ * ```typescript
360
+ * import { clearPostProcessors } from '@kreuzberg/node';
361
+ *
362
+ * clearPostProcessors();
363
+ * ```
364
+ */
365
+ declare function clearPostProcessors(): void;
366
+ /**
367
+ * List all registered post-processors.
368
+ *
369
+ * Returns the names of all currently registered post-processors.
370
+ *
371
+ * @returns Array of post-processor names
372
+ *
373
+ * @example
374
+ * ```typescript
375
+ * import { listPostProcessors } from '@kreuzberg/node';
376
+ *
377
+ * const names = listPostProcessors();
378
+ * console.log('Registered post-processors:', names);
379
+ * ```
380
+ */
381
+ declare function listPostProcessors(): string[];
382
+ /**
383
+ * Register a custom validator.
384
+ *
385
+ * Validators check extraction results for quality, completeness, or correctness.
386
+ * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
387
+ * the extraction fails immediately.
388
+ *
389
+ * @param validator - ValidatorProtocol implementation
390
+ *
391
+ * @example
392
+ * ```typescript
393
+ * import { registerValidator, ExtractionResult, ValidatorProtocol } from '@kreuzberg/node';
394
+ *
395
+ * class MinLengthValidator implements ValidatorProtocol {
396
+ * name(): string {
397
+ * return 'min_length_validator';
398
+ * }
399
+ *
400
+ * priority(): number {
401
+ * return 100; // Run early
402
+ * }
403
+ *
404
+ * validate(result: ExtractionResult): void {
405
+ * if (result.content.length < 100) {
406
+ * throw new Error('Content too short: minimum 100 characters required');
407
+ * }
408
+ * }
409
+ * }
410
+ *
411
+ * registerValidator(new MinLengthValidator());
412
+ * ```
413
+ */
414
+ declare function registerValidator(validator: ValidatorProtocol): void;
415
+ /**
416
+ * Unregister a validator by name.
417
+ *
418
+ * Removes a previously registered validator from the global registry.
419
+ *
420
+ * @param name - Validator name to unregister
421
+ *
422
+ * @example
423
+ * ```typescript
424
+ * import { unregisterValidator } from '@kreuzberg/node';
425
+ *
426
+ * unregisterValidator('min_length_validator');
427
+ * ```
428
+ */
429
+ declare function unregisterValidator(name: string): void;
430
+ /**
431
+ * Clear all registered validators.
432
+ *
433
+ * Removes all validators from the global registry. Useful for test cleanup
434
+ * or resetting state.
435
+ *
436
+ * @example
437
+ * ```typescript
438
+ * import { clearValidators } from '@kreuzberg/node';
439
+ *
440
+ * clearValidators();
441
+ * ```
442
+ */
443
+ declare function clearValidators(): void;
444
+ /**
445
+ * List all registered validators.
446
+ *
447
+ * Returns the names of all currently registered validators.
448
+ *
449
+ * @returns Array of validator names
450
+ *
451
+ * @example
452
+ * ```typescript
453
+ * import { listValidators } from '@kreuzberg/node';
454
+ *
455
+ * const names = listValidators();
456
+ * console.log('Registered validators:', names);
457
+ * ```
458
+ */
459
+ declare function listValidators(): string[];
460
+ /** Register a custom OCR backend. NOTE(review): presumably the backend is then reported by `listOcrBackends()` and removable via `unregisterOcrBackend()` - confirm against the implementation. @param backend - OcrBackendProtocol implementation to register */ declare function registerOcrBackend(backend: OcrBackendProtocol): void;
461
+ /**
462
+ * List all registered OCR backends.
463
+ *
464
+ * Returns an array of names of all currently registered OCR backends,
465
+ * including built-in backends like "tesseract".
466
+ *
467
+ * @returns Array of OCR backend names
468
+ *
469
+ * @example
470
+ * ```typescript
471
+ * import { listOcrBackends } from '@kreuzberg/node';
472
+ *
473
+ * const backends = listOcrBackends();
474
+ * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
475
+ * ```
476
+ */
477
+ declare function listOcrBackends(): string[];
478
+ /**
479
+ * Unregister an OCR backend by name.
480
+ *
481
+ * Removes the specified OCR backend from the registry. If the backend doesn't exist,
482
+ * this operation is a no-op (does not throw an error).
483
+ *
484
+ * @param name - Name of the OCR backend to unregister
485
+ *
486
+ * @example
487
+ * ```typescript
488
+ * import { unregisterOcrBackend } from '@kreuzberg/node';
489
+ *
490
+ * // Unregister a custom backend
491
+ * unregisterOcrBackend('my-custom-ocr');
492
+ * ```
493
+ */
494
+ declare function unregisterOcrBackend(name: string): void;
495
+ /**
496
+ * Clear all registered OCR backends.
497
+ *
498
+ * Removes all OCR backends from the registry, including built-in backends.
499
+ * Use with caution as this will make OCR functionality unavailable until
500
+ * backends are re-registered.
501
+ *
502
+ * @example
503
+ * ```typescript
504
+ * import { clearOcrBackends } from '@kreuzberg/node';
505
+ *
506
+ * clearOcrBackends();
507
+ * ```
508
+ */
509
+ declare function clearOcrBackends(): void;
510
+ /**
511
+ * List all registered document extractors.
512
+ *
513
+ * Returns an array of names of all currently registered document extractors,
514
+ * including built-in extractors for PDF, Office documents, images, etc.
515
+ *
516
+ * @returns Array of document extractor names
517
+ *
518
+ * @example
519
+ * ```typescript
520
+ * import { listDocumentExtractors } from '@kreuzberg/node';
521
+ *
522
+ * const extractors = listDocumentExtractors();
523
+ * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
524
+ * ```
525
+ */
526
+ declare function listDocumentExtractors(): string[];
527
+ /**
528
+ * Unregister a document extractor by name.
529
+ *
530
+ * Removes the specified document extractor from the registry. If the extractor
531
+ * doesn't exist, this operation is a no-op (does not throw an error).
532
+ *
533
+ * @param name - Name of the document extractor to unregister
534
+ *
535
+ * @example
536
+ * ```typescript
537
+ * import { unregisterDocumentExtractor } from '@kreuzberg/node';
538
+ *
539
+ * // Unregister a custom extractor
540
+ * unregisterDocumentExtractor('MyCustomExtractor');
541
+ * ```
542
+ */
543
+ declare function unregisterDocumentExtractor(name: string): void;
544
+ /**
545
+ * Clear all registered document extractors.
546
+ *
547
+ * Removes all document extractors from the registry, including built-in extractors.
548
+ * Use with caution as this will make document extraction unavailable until
549
+ * extractors are re-registered.
550
+ *
551
+ * @example
552
+ * ```typescript
553
+ * import { clearDocumentExtractors } from '@kreuzberg/node';
554
+ *
555
+ * clearDocumentExtractors();
556
+ * ```
557
+ */
558
+ declare function clearDocumentExtractors(): void;
559
+ /**
560
+ * ExtractionConfig namespace with static methods for loading configuration from files.
561
+ *
562
+ * Provides `fromFile()` to load extraction configuration from TOML, YAML, or JSON files, and `discover()` to locate a `kreuzberg.toml` in the current or parent directories.
563
+ * The file format is automatically detected based on the file extension.
564
+ *
565
+ * @example
566
+ * ```typescript
567
+ * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
568
+ *
569
+ * // Load configuration from file
570
+ * const config = ExtractionConfig.fromFile('config.toml');
571
+ *
572
+ * // Use with extraction
573
+ * const result = await extractFile('document.pdf', null, config);
574
+ * ```
575
+ */
576
+ declare const ExtractionConfig: {
577
+ /**
578
+ * Load extraction configuration from a file.
579
+ *
580
+ * Automatically detects the file format based on extension:
581
+ * - `.toml` - TOML format
582
+ * - `.yaml` - YAML format
583
+ * - `.json` - JSON format
584
+ *
585
+ * @param filePath - Path to the configuration file (absolute or relative)
586
+ * @returns ExtractionConfig object loaded from the file
587
+ *
588
+ * @throws {Error} If file does not exist or is not accessible
589
+ * @throws {Error} If file content is not valid TOML/YAML/JSON
590
+ * @throws {Error} If configuration structure is invalid
591
+ * @throws {Error} If file extension is not supported
592
+ *
593
+ * @example
594
+ * ```typescript
595
+ * import { ExtractionConfig } from '@kreuzberg/node';
596
+ *
597
+ * // Load from TOML file
598
+ * const config1 = ExtractionConfig.fromFile('kreuzberg.toml');
599
+ *
600
+ * // Load from YAML file
601
+ * const config2 = ExtractionConfig.fromFile('./config.yaml');
602
+ *
603
+ * // Load from JSON file
604
+ * const config3 = ExtractionConfig.fromFile('./config.json');
605
+ * ```
606
+ */
607
+ fromFile(filePath: string): ExtractionConfig$1;
608
+ /**
609
+ * Discover and load configuration from current or parent directories.
610
+ *
611
+ * Searches for a `kreuzberg.toml` file starting from the current working directory
612
+ * and traversing up the directory tree. Returns the first configuration file found.
613
+ *
614
+ * @returns ExtractionConfig object if found, or null if no configuration file exists
615
+ *
616
+ * @example
617
+ * ```typescript
618
+ * import { ExtractionConfig } from '@kreuzberg/node';
619
+ *
620
+ * // Try to find config in current or parent directories
621
+ * const config = ExtractionConfig.discover();
622
+ * if (config) {
623
+ * console.log('Found configuration');
624
+ * // Use config for extraction
625
+ * } else {
626
+ * console.log('No configuration file found, using defaults');
627
+ * }
628
+ * ```
629
+ */
630
+ discover(): ExtractionConfig$1 | null;
631
+ };
632
+ /**
633
+ * Detect MIME type from raw bytes.
634
+ *
635
+ * Uses content inspection (magic bytes) to determine MIME type.
636
+ * This is more accurate than extension-based detection but requires
637
+ * reading the file content.
638
+ *
639
+ * @param bytes - Raw file content as Buffer
640
+ * @returns The detected MIME type string
641
+ *
642
+ * @throws {Error} If MIME type cannot be determined from content
643
+ *
644
+ * @example
645
+ * ```typescript
646
+ * import { detectMimeType } from '@kreuzberg/node';
647
+ * import * as fs from 'fs';
648
+ *
649
+ * // Read file content
650
+ * const content = fs.readFileSync('document.pdf');
651
+ *
652
+ * // Detect MIME type from bytes
653
+ * const mimeType = detectMimeType(content);
654
+ * console.log(mimeType); // 'application/pdf'
655
+ * ```
656
+ */
657
+ declare function detectMimeType(bytes: Buffer): string;
658
+ /**
659
+ * Detect MIME type from a file path.
660
+ *
661
+ * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
662
+ * if extension-based detection fails.
663
+ *
664
+ * @param path - Path to the file (string)
665
+ * @param checkExists - Whether to verify file existence (default: true)
666
+ * @returns The detected MIME type string
667
+ *
668
+ * @throws {Error} If file doesn't exist (when checkExists is true)
669
+ * @throws {Error} If MIME type cannot be determined from path/extension
670
+ * @throws {Error} If extension is unknown
671
+ *
672
+ * @example
673
+ * ```typescript
674
+ * import { detectMimeTypeFromPath } from '@kreuzberg/node';
675
+ *
676
+ * // Detect from existing file
677
+ * const mimeType = detectMimeTypeFromPath('document.pdf');
678
+ * console.log(mimeType); // 'application/pdf'
679
+ *
680
+ * const mimeType2 = detectMimeTypeFromPath('document.docx');
681
+ * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
682
+ * ```
683
+ */
684
+ declare function detectMimeTypeFromPath(path: string, checkExists?: boolean): string;
685
+ /**
686
+ * Validate that a MIME type is supported by Kreuzberg.
687
+ *
688
+ * Checks if a MIME type is in the list of supported formats. Note that any
689
+ * `image/*` MIME type is automatically considered valid.
690
+ *
691
+ * @param mimeType - The MIME type to validate (string)
692
+ * @returns The validated MIME type (may be normalized)
693
+ *
694
+ * @throws {Error} If the MIME type is not supported
695
+ *
696
+ * @example
697
+ * ```typescript
698
+ * import { validateMimeType } from '@kreuzberg/node';
699
+ *
700
+ * // Validate supported type
701
+ * const validated = validateMimeType('application/pdf');
702
+ * console.log(validated); // 'application/pdf'
703
+ *
704
+ * // Validate custom image type
705
+ * const validated2 = validateMimeType('image/custom-format');
706
+ * console.log(validated2); // 'image/custom-format' (any image/* is valid)
707
+ *
708
+ * // Validate unsupported type (throws error)
709
+ * try {
710
+ * validateMimeType('video/mp4');
711
+ * } catch (err) {
712
+ * console.error(err); // Error: Unsupported format: video/mp4
713
+ * }
714
+ * ```
715
+ */
716
+ declare function validateMimeType(mimeType: string): string;
717
+ /**
718
+ * Get file extensions for a given MIME type.
719
+ *
720
+ * Returns an array of file extensions commonly associated with the specified
721
+ * MIME type. For example, 'application/pdf' returns ['pdf'].
722
+ *
723
+ * @param mimeType - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
724
+ * @returns Array of file extensions (without leading dots)
725
+ *
726
+ * @throws {Error} If the MIME type is not recognized or supported
727
+ *
728
+ * @example
729
+ * ```typescript
730
+ * import { getExtensionsForMime } from '@kreuzberg/node';
731
+ *
732
+ * // Get extensions for PDF
733
+ * const pdfExts = getExtensionsForMime('application/pdf');
734
+ * console.log(pdfExts); // ['pdf']
735
+ *
736
+ * // Get extensions for JPEG
737
+ * const jpegExts = getExtensionsForMime('image/jpeg');
738
+ * console.log(jpegExts); // ['jpg', 'jpeg']
739
+ * ```
740
+ */
741
+ declare function getExtensionsForMime(mimeType: string): string[];
742
+ /**
743
+ * Embedding preset configuration.
744
+ *
745
+ * Contains all settings for a specific embedding model preset.
746
+ */
747
+ interface EmbeddingPreset {
748
+ /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
749
+ name: string;
750
+ /** Recommended chunk size in characters */
751
+ chunkSize: number;
752
+ /** Recommended overlap in characters */
753
+ overlap: number;
754
+ /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
755
+ modelName: string;
756
+ /** Embedding vector dimensions */
757
+ dimensions: number;
758
+ /** Human-readable description of the preset */
759
+ description: string;
760
+ }
761
+ /**
762
+ * List all available embedding preset names.
763
+ *
764
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
765
+ *
766
+ * @returns Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
767
+ *
768
+ * @example
769
+ * ```typescript
770
+ * import { listEmbeddingPresets } from '@kreuzberg/node';
771
+ *
772
+ * const presets = listEmbeddingPresets();
773
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
774
+ * ```
775
+ */
776
+ declare function listEmbeddingPresets(): string[];
777
+ /**
778
+ * Get a specific embedding preset by name.
779
+ *
780
+ * Returns a preset configuration object, or null if the preset name is not found.
781
+ *
782
+ * @param name - The preset name (case-sensitive)
783
+ * @returns An `EmbeddingPreset` object or `null` if not found
784
+ *
785
+ * @example
786
+ * ```typescript
787
+ * import { getEmbeddingPreset } from '@kreuzberg/node';
788
+ *
789
+ * const preset = getEmbeddingPreset('balanced');
790
+ * if (preset) {
791
+ * console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
792
+ * // Model: BGEBaseENV15, Dims: 768
793
+ * }
794
+ * ```
795
+ */
796
+ declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
797
+ /**
798
+ * Get the error code for the last FFI error.
799
+ *
800
+ * Returns the FFI error code as an integer. This is useful for programmatic error handling
801
+ * and distinguishing between different types of failures in native code.
802
+ *
803
+ * Error codes:
804
+ * - 0: Success (no error)
805
+ * - 1: GenericError
806
+ * - 2: Panic
807
+ * - 3: InvalidArgument
808
+ * - 4: IoError
809
+ * - 5: ParsingError
810
+ * - 6: OcrError
811
+ * - 7: MissingDependency
812
+ *
813
+ * @returns The integer error code
814
+ *
815
+ * @example
816
+ * ```typescript
817
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
818
+ *
819
+ * try {
820
+ * const result = await extractFile('document.pdf');
821
+ * } catch (error) {
822
+ * const code = getLastErrorCode();
823
+ * if (code === ErrorCode.Panic) {
824
+ * console.error('Native code panic detected');
825
+ * }
826
+ * }
827
+ * ```
828
+ */
829
+ declare function getLastErrorCode(): number;
830
+ /**
831
+ * Get panic context information if the last error was a panic.
832
+ *
833
+ * Returns detailed information about a panic in native code, or null if the last error was not a panic.
834
+ * This provides debugging information when native code panics.
835
+ *
836
+ * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
837
+ *
838
+ * @example
839
+ * ```typescript
840
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
841
+ *
842
+ * try {
843
+ * const result = await extractFile('document.pdf');
844
+ * } catch (error) {
845
+ * const context = getLastPanicContext();
846
+ * if (context) {
847
+ * console.error(`Panic at ${context.file}:${context.line}`);
848
+ * console.error(`In function: ${context.function}`);
849
+ * console.error(`Message: ${context.message}`);
850
+ * }
851
+ * }
852
+ * ```
853
+ */
854
+ declare function getLastPanicContext(): PanicContext | null;
855
+ /** Version string of the published @kreuzberg/node package that this declaration file belongs to. */ declare const __version__ = "4.0.0-rc.8";
856
+
857
+ export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };