@kreuzberg/node 4.0.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts ADDED
@@ -0,0 +1,1109 @@
1
+ /* auto-generated by NAPI-RS */
2
+ /* eslint-disable */
3
+ /**
4
+ * Batch extract from multiple byte arrays (asynchronous).
5
+ *
6
+ * Asynchronously processes multiple in-memory buffers in parallel. Non-blocking
7
+ * alternative to `batchExtractBytesSync`.
8
+ *
9
+ * # Parameters
10
+ *
11
+ * * `data_list` - Array of buffers to extract
12
+ * * `mime_types` - Array of MIME types (must match data_list length)
13
+ * * `config` - Optional extraction configuration
14
+ *
15
+ * # Returns
16
+ *
17
+ * Promise resolving to array of `ExtractionResult`.
18
+ *
19
+ * # Example
20
+ *
21
+ * ```typescript
22
+ * import { batchExtractBytes } from '@kreuzberg/node';
23
+ *
24
+ * const responses = await Promise.all([
25
+ * fetch('https://example.com/doc1.pdf'),
26
+ * fetch('https://example.com/doc2.pdf')
27
+ * ]);
28
+ * const buffers = await Promise.all(
29
+ * responses.map(r => r.arrayBuffer().then(b => Buffer.from(b)))
30
+ * );
31
+ * const results = await batchExtractBytes(
32
+ * buffers,
33
+ * ['application/pdf', 'application/pdf'],
34
+ * null
35
+ * );
36
+ * ```
37
+ */
38
+ export declare function batchExtractBytes(dataList: Buffer[], mimeTypes: string[], config?: JsExtractionConfig | null): Promise<JsExtractionResult[]>;
39
+
40
+ /**
41
+ * Batch extract from multiple byte arrays (synchronous).
42
+ *
43
+ * Synchronously processes multiple in-memory buffers in parallel. Requires
44
+ * corresponding MIME types for each buffer.
45
+ *
46
+ * # Parameters
47
+ *
48
+ * * `data_list` - Array of buffers to extract
49
+ * * `mime_types` - Array of MIME types (must match data_list length)
50
+ * * `config` - Optional extraction configuration
51
+ *
52
+ * # Returns
53
+ *
54
+ * Array of `ExtractionResult` in the same order as inputs.
55
+ *
56
+ * # Errors
57
+ *
58
+ * Throws if data_list and mime_types lengths don't match.
59
+ *
60
+ * # Example
61
+ *
62
+ * ```typescript
63
+ * import { batchExtractBytesSync } from '@kreuzberg/node';
64
+ *
65
+ * const buffers = [buffer1, buffer2, buffer3];
66
+ * const mimeTypes = ['application/pdf', 'image/png', 'text/plain'];
67
+ * const results = batchExtractBytesSync(buffers, mimeTypes, null);
68
+ * ```
69
+ */
70
+ export declare function batchExtractBytesSync(dataList: Buffer[], mimeTypes: string[], config?: JsExtractionConfig | null): JsExtractionResult[];
71
+
72
+ /**
73
+ * Batch extract from multiple files (asynchronous).
74
+ *
75
+ * Asynchronously processes multiple files in parallel. Non-blocking alternative
76
+ * to `batchExtractFilesSync` with same performance benefits.
77
+ *
78
+ * # Parameters
79
+ *
80
+ * * `paths` - Array of file paths to extract
81
+ * * `config` - Optional extraction configuration (applied to all files)
82
+ *
83
+ * # Returns
84
+ *
85
+ * Promise resolving to array of `ExtractionResult`.
86
+ *
87
+ * # Example
88
+ *
89
+ * ```typescript
90
+ * import { batchExtractFiles } from '@kreuzberg/node';
91
+ *
92
+ * const files = ['report1.pdf', 'report2.pdf', 'report3.pdf'];
93
+ * const results = await batchExtractFiles(files, null);
94
+ * console.log(`Processed ${results.length} files`);
95
+ * ```
96
+ */
97
+ export declare function batchExtractFiles(paths: string[], config?: JsExtractionConfig | null): Promise<JsExtractionResult[]>;
98
+
99
+ /**
100
+ * Batch extract from multiple files (synchronous).
101
+ *
102
+ * Synchronously processes multiple files in parallel using Rayon. Significantly
103
+ * faster than sequential processing for large batches.
104
+ *
105
+ * # Parameters
106
+ *
107
+ * * `paths` - Array of file paths to extract
108
+ * * `config` - Optional extraction configuration (applied to all files)
109
+ *
110
+ * # Returns
111
+ *
112
+ * Array of `ExtractionResult` in the same order as input paths.
113
+ *
114
+ * # Example
115
+ *
116
+ * ```typescript
117
+ * import { batchExtractFilesSync } from '@kreuzberg/node';
118
+ *
119
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
120
+ * const results = batchExtractFilesSync(files, null);
121
+ * results.forEach((result, i) => {
122
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
123
+ * });
124
+ * ```
125
+ */
126
+ export declare function batchExtractFilesSync(paths: string[], config?: JsExtractionConfig | null): JsExtractionResult[];
127
+
128
+ /**
129
+ * Clear all registered document extractors.
130
+ *
131
+ * Removes all document extractors from the registry, including built-in extractors.
132
+ * Use with caution as this will make document extraction unavailable until
133
+ * extractors are re-registered.
134
+ *
135
+ * # Example
136
+ *
137
+ * ```typescript
138
+ * import { clearDocumentExtractors } from '@kreuzberg/node';
139
+ *
140
+ * clearDocumentExtractors();
141
+ * ```
142
+ */
143
+ export declare function clearDocumentExtractors(): void;
144
+
145
+ /**
146
+ * Clear all registered OCR backends.
147
+ *
148
+ * Removes all OCR backends from the registry, including built-in backends.
149
+ * Use with caution as this will make OCR functionality unavailable until
150
+ * backends are re-registered.
151
+ *
152
+ * # Example
153
+ *
154
+ * ```typescript
155
+ * import { clearOcrBackends } from '@kreuzberg/node';
156
+ *
157
+ * clearOcrBackends();
158
+ * ```
159
+ */
160
+ export declare function clearOcrBackends(): void;
161
+
162
+ /** Clears every registered post-processor. */
163
+ export declare function clearPostProcessors(): void;
164
+
165
+ /** Clears every registered validator. */
166
+ export declare function clearValidators(): void;
167
+
168
+ /**
169
+ * Detect MIME type from raw bytes.
170
+ *
171
+ * Uses content inspection (magic bytes) to determine MIME type.
172
+ * This is more accurate than extension-based detection but requires
173
+ * reading the file content.
174
+ *
175
+ * # Parameters
176
+ *
177
+ * * `bytes` - Raw file content as Buffer
178
+ *
179
+ * # Returns
180
+ *
181
+ * The detected MIME type string.
182
+ *
183
+ * # Errors
184
+ *
185
+ * Throws an error if MIME type cannot be determined from content.
186
+ *
187
+ * # Example
188
+ *
189
+ * ```typescript
190
+ * import { detectMimeType } from '@kreuzberg/node';
191
+ * import * as fs from 'fs';
192
+ *
193
+ * // Read file content
194
+ * const content = fs.readFileSync('document.pdf');
195
+ *
196
+ * // Detect MIME type from bytes
197
+ * const mimeType = detectMimeType(content);
198
+ * console.log(mimeType); // 'application/pdf'
199
+ * ```
200
+ */
201
+ export declare function detectMimeType(bytes: Buffer): string;
202
+
203
+ /**
204
+ * Detect MIME type from a file path.
205
+ *
206
+ * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
207
+ * if extension-based detection fails.
208
+ *
209
+ * # Parameters
210
+ *
211
+ * * `path` - Path to the file (string)
212
+ * * `check_exists` - Whether to verify file existence (default: true)
213
+ *
214
+ * # Returns
215
+ *
216
+ * The detected MIME type string.
217
+ *
218
+ * # Errors
219
+ *
220
+ * Throws an error if:
221
+ * - File doesn't exist (when check_exists is true)
222
+ * - MIME type cannot be determined from path/extension
223
+ * - Extension is unknown
224
+ *
225
+ * # Example
226
+ *
227
+ * ```typescript
228
+ * import { detectMimeTypeFromPath } from '@kreuzberg/node';
229
+ *
230
+ * // Detect from existing file
231
+ * const mimeType = detectMimeTypeFromPath('document.pdf');
232
+ * console.log(mimeType); // 'application/pdf'
233
+ *
234
+ * const mimeType2 = detectMimeTypeFromPath('document.docx');
235
+ * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
236
+ * ```
237
+ */
238
+ export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | null): string;
239
+
240
+ /**
241
+ * Discover and load extraction configuration from current or parent directories.
242
+ *
243
+ * Searches for a `kreuzberg.toml` file starting from the current working directory
244
+ * and traversing up the directory tree. Returns the first configuration file found.
245
+ *
246
+ * # Returns
247
+ *
248
+ * `JsExtractionConfig` object if a configuration file is found, or `null` if no
249
+ * configuration file exists in the current or parent directories.
250
+ *
251
+ * # Example
252
+ *
253
+ * ```typescript
254
+ * import { discoverExtractionConfig } from '@kreuzberg/node';
255
+ *
256
+ * // Try to find config in current or parent directories
257
+ * const config = discoverExtractionConfig();
258
+ * if (config) {
259
+ * console.log('Found configuration');
260
+ * // Use config for extraction
261
+ * } else {
262
+ * console.log('No configuration file found, using defaults');
263
+ * }
264
+ * ```
265
+ */
266
+ export declare function discoverExtractionConfig(): JsExtractionConfig | null;
267
+
268
+ /**
269
+ * Embedding preset configuration for TypeScript bindings.
270
+ *
271
+ * Contains all settings for a specific embedding model preset.
272
+ */
273
+ export interface EmbeddingPreset {
274
+ /** Preset name, e.g. "fast", "balanced", "quality" or "multilingual". */
275
+ name: string;
276
+ /** Chunk size recommended for this preset, in characters. */
277
+ chunkSize: number;
278
+ /** Chunk overlap recommended for this preset, in characters. */
279
+ overlap: number;
280
+ /** Identifier of the model, e.g. "AllMiniLML6V2Q" or "BGEBaseENV15". */
281
+ modelName: string;
282
+ /** Number of dimensions in the embedding vector. */
283
+ dimensions: number;
284
+ /** Description of the preset intended for human readers. */
285
+ description: string;
286
+ }
287
+
288
+ /**
289
+ * Extract content from bytes (asynchronous).
290
+ *
291
+ * Asynchronously extracts content from a byte buffer. Non-blocking alternative
292
+ * to `extractBytesSync` for processing in-memory data.
293
+ *
294
+ * # Parameters
295
+ *
296
+ * * `data` - Buffer containing the document bytes
297
+ * * `mime_type` - MIME type of the data
298
+ * * `config` - Optional extraction configuration
299
+ *
300
+ * # Returns
301
+ *
302
+ * Promise resolving to `ExtractionResult`.
303
+ *
304
+ * # Example
305
+ *
306
+ * ```typescript
307
+ * import { extractBytes } from '@kreuzberg/node';
308
+ *
309
+ * const response = await fetch('https://example.com/document.pdf');
310
+ * const buffer = Buffer.from(await response.arrayBuffer());
311
+ * const result = await extractBytes(buffer, 'application/pdf', null);
312
+ * ```
313
+ */
314
+ export declare function extractBytes(data: Buffer, mimeType: string, config?: JsExtractionConfig | null): Promise<JsExtractionResult>;
315
+
316
+ /**
317
+ * Extract content from bytes (synchronous).
318
+ *
319
+ * Synchronously extracts content from a byte buffer without requiring a file path.
320
+ * Useful for processing in-memory data, network streams, or database BLOBs.
321
+ *
322
+ * # Parameters
323
+ *
324
+ * * `data` - Buffer containing the document bytes
325
+ * * `mime_type` - MIME type of the data (e.g., "application/pdf", "image/png")
326
+ * * `config` - Optional extraction configuration
327
+ *
328
+ * # Returns
329
+ *
330
+ * `ExtractionResult` with extracted content and metadata.
331
+ *
332
+ * # Errors
333
+ *
334
+ * Throws an error if data is malformed or MIME type is unsupported.
335
+ *
336
+ * # Example
337
+ *
338
+ * ```typescript
339
+ * import { extractBytesSync } from '@kreuzberg/node';
340
+ * import fs from 'fs';
341
+ *
342
+ * const buffer = fs.readFileSync('document.pdf');
343
+ * const result = extractBytesSync(buffer, 'application/pdf', null);
344
+ * console.log(result.content);
345
+ * ```
346
+ */
347
+ export declare function extractBytesSync(data: Buffer, mimeType: string, config?: JsExtractionConfig | null): JsExtractionResult;
348
+
349
+ /**
350
+ * Extract content from a file (asynchronous).
351
+ *
352
+ * Asynchronously extracts text, tables, images, and metadata from a document file.
353
+ * Non-blocking alternative to `extractFileSync` for use in async/await contexts.
354
+ *
355
+ * # Parameters
356
+ *
357
+ * * `file_path` - Path to the file to extract (absolute or relative)
358
+ * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
359
+ * * `config` - Optional extraction configuration (OCR, chunking, etc.)
360
+ *
361
+ * # Returns
362
+ *
363
+ * Promise resolving to `ExtractionResult` with extracted content and metadata.
364
+ *
365
+ * # Errors
366
+ *
367
+ * Rejects if file processing fails (see `extractFileSync` for error conditions).
368
+ *
369
+ * # Example
370
+ *
371
+ * ```typescript
372
+ * import { extractFile } from '@kreuzberg/node';
373
+ *
374
+ * // Async/await usage
375
+ * const result = await extractFile('document.pdf', null, null);
376
+ * console.log(result.content);
377
+ *
378
+ * // Promise usage
379
+ * extractFile('report.docx', null, null)
380
+ * .then(result => console.log(result.content))
381
+ * .catch(err => console.error(err));
382
+ * ```
383
+ */
384
+ export declare function extractFile(filePath: string, mimeType?: string | null, config?: JsExtractionConfig | null): Promise<JsExtractionResult>;
385
+
386
+ /**
387
+ * Extract content from a file (synchronous).
388
+ *
389
+ * Synchronously extracts text, tables, images, and metadata from a document file.
390
+ * Supports 118+ file formats including PDFs, Office documents, images, and more.
391
+ *
392
+ * # Parameters
393
+ *
394
+ * * `file_path` - Path to the file to extract (absolute or relative)
395
+ * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
396
+ * * `config` - Optional extraction configuration (OCR, chunking, etc.)
397
+ *
398
+ * # Returns
399
+ *
400
+ * `ExtractionResult` containing:
401
+ * - `content`: Extracted text content
402
+ * - `mimeType`: Detected MIME type
403
+ * - `metadata`: File metadata (author, title, etc.)
404
+ * - `tables`: Extracted tables (if any)
405
+ * - `images`: Extracted images (if configured)
406
+ * - `chunks`: Text chunks (if chunking enabled)
407
+ * - `detectedLanguages`: Detected languages (if enabled)
408
+ *
409
+ * # Errors
410
+ *
411
+ * Throws an error if:
412
+ * - File does not exist or is not accessible
413
+ * - File format is unsupported
414
+ * - File is corrupted or malformed
415
+ * - OCR processing fails (if enabled)
416
+ *
417
+ * # Example
418
+ *
419
+ * ```typescript
420
+ * import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
421
+ *
422
+ * // Basic extraction
423
+ * const result = extractFileSync('document.pdf', null, null);
424
+ * console.log(result.content);
425
+ *
426
+ * // With MIME type hint
427
+ * const result2 = extractFileSync('file.bin', 'application/pdf', null);
428
+ *
429
+ * // With OCR enabled
430
+ * const config: ExtractionConfig = {
431
+ * ocr: {
432
+ * backend: 'tesseract',
433
+ * language: 'eng',
434
+ * }
435
+ * };
436
+ * const result3 = extractFileSync('scanned.pdf', null, config);
437
+ * ```
438
+ */
439
+ export declare function extractFileSync(filePath: string, mimeType?: string | null, config?: JsExtractionConfig | null): JsExtractionResult;
440
+
441
+ /**
442
+ * Get a specific embedding preset by name.
443
+ *
444
+ * Returns a preset configuration object, or null if the preset name is not found.
445
+ *
446
+ * # Arguments
447
+ *
448
+ * * `name` - The preset name (case-sensitive)
449
+ *
450
+ * # Returns
451
+ *
452
+ * An `EmbeddingPreset` object with the following properties:
453
+ * - `name`: string - Preset name
454
+ * - `chunkSize`: number - Recommended chunk size in characters
455
+ * - `overlap`: number - Recommended overlap in characters
456
+ * - `modelName`: string - Model identifier
457
+ * - `dimensions`: number - Embedding vector dimensions
458
+ * - `description`: string - Human-readable description
459
+ *
460
+ * Returns `null` if preset name is not found.
461
+ *
462
+ * # Example
463
+ *
464
+ * ```typescript
465
+ * import { getEmbeddingPreset } from '@kreuzberg/node';
466
+ *
467
+ * const preset = getEmbeddingPreset('balanced');
468
+ * if (preset) {
469
+ * console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
470
+ * // Model: BGEBaseENV15, Dims: 768
471
+ * }
472
+ * ```
473
+ */
474
+ export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
475
+
476
+ /**
477
+ * Get file extensions for a given MIME type.
478
+ *
479
+ * Returns an array of file extensions commonly associated with the specified
480
+ * MIME type. For example, 'application/pdf' returns ['pdf'].
481
+ *
482
+ * # Parameters
483
+ *
484
+ * * `mime_type` - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
485
+ *
486
+ * # Returns
487
+ *
488
+ * Array of file extensions (without leading dots).
489
+ *
490
+ * # Errors
491
+ *
492
+ * Throws an error if the MIME type is not recognized or supported.
493
+ *
494
+ * # Example
495
+ *
496
+ * ```typescript
497
+ * import { getExtensionsForMime } from '@kreuzberg/node';
498
+ *
499
+ * // Get extensions for PDF
500
+ * const pdfExts = getExtensionsForMime('application/pdf');
501
+ * console.log(pdfExts); // ['pdf']
502
+ *
503
+ * // Get extensions for JPEG
504
+ * const jpegExts = getExtensionsForMime('image/jpeg');
505
+ * console.log(jpegExts); // ['jpg', 'jpeg']
506
+ * ```
507
+ */
508
+ export declare function getExtensionsForMime(mimeType: string): string[];
509
+
510
+ /**
511
+ * Get the error code for the last FFI error.
512
+ *
513
+ * Returns the FFI error code as an integer. Error codes are:
514
+ * - 0: Success (no error)
515
+ * - 1: GenericError
516
+ * - 2: Panic
517
+ * - 3: InvalidArgument
518
+ * - 4: IoError
519
+ * - 5: ParsingError
520
+ * - 6: OcrError
521
+ * - 7: MissingDependency
522
+ *
523
+ * This is useful for programmatic error handling and distinguishing
524
+ * between different types of failures in native code.
525
+ *
526
+ * # Returns
527
+ *
528
+ * The integer error code.
529
+ *
530
+ * # Example
531
+ *
532
+ * ```typescript
533
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
534
+ *
535
+ * try {
536
+ * const result = await extractFile('document.pdf');
537
+ * } catch (error) {
538
+ * const code = getLastErrorCode();
539
+ * if (code === ErrorCode.Panic) {
540
+ * console.error('Native code panic detected');
541
+ * }
542
+ * }
543
+ * ```
544
+ */
545
+ export declare function getLastErrorCode(): number;
546
+
547
+ /**
548
+ * Get panic context information if the last error was a panic.
549
+ *
550
+ * Returns detailed information about a panic in native code, or null
551
+ * if the last error was not a panic.
552
+ *
553
+ * # Returns
554
+ *
555
+ * A `PanicContext` object with:
556
+ * - `file`: string - Source file where panic occurred
557
+ * - `line`: number - Line number
558
+ * - `function`: string - Function name
559
+ * - `message`: string - Panic message
560
+ * - `timestamp_secs`: number - Unix timestamp (seconds since epoch)
561
+ *
562
+ * Returns `null` if no panic context is available.
563
+ *
564
+ * # Example
565
+ *
566
+ * ```typescript
567
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
568
+ *
569
+ * try {
570
+ * const result = await extractFile('document.pdf');
571
+ * } catch (error) {
572
+ * const context = getLastPanicContext();
573
+ * if (context) {
574
+ * console.error(`Panic at ${context.file}:${context.line}`);
575
+ * console.error(`In function: ${context.function}`);
576
+ * console.error(`Message: ${context.message}`);
577
+ * }
578
+ * }
579
+ * ```
580
+ */
581
+ export declare function getLastPanicContext(): any | null;
582
+
583
+ export interface JsChunk {
584
+ content: string;
585
+ embedding?: number[] | undefined;
586
+ metadata: JsChunkMetadata;
587
+ }
588
+
589
+ export interface JsChunkingConfig {
590
+ maxChars?: number;
591
+ maxOverlap?: number;
592
+ /** Embedding configuration used when generating embeddings (optional). */
593
+ embedding?: JsEmbeddingConfig;
594
+ /** Name of a preset supplying chunking parameters (optional). */
595
+ preset?: string;
596
+ }
597
+
598
+ export interface JsChunkMetadata {
599
+ charStart: number;
600
+ charEnd: number;
601
+ tokenCount?: number;
602
+ chunkIndex: number;
603
+ totalChunks: number;
604
+ }
605
+
606
+ /** Configuration for embedding generation in the Node.js bindings. */
607
+ export interface JsEmbeddingConfig {
608
+ /** Which embedding model to use. */
609
+ model?: JsEmbeddingModelType;
610
+ /** Apply L2 normalization to the embeddings when true. */
611
+ normalize?: boolean;
612
+ /** Batch size used while generating embeddings. */
613
+ batchSize?: number;
614
+ /** Show progress while models are being downloaded when true. */
615
+ showDownloadProgress?: boolean;
616
+ /** Custom directory in which models are cached. */
617
+ cacheDir?: string;
618
+ }
619
+
620
+ /**
621
+ * Embedding model type configuration for Node.js bindings.
622
+ *
623
+ * This struct represents different embedding model sources:
624
+ * - `preset`: Use a named preset (e.g., "balanced", "fast", "quality", "multilingual")
625
+ * - `fastembed`: Use a FastEmbed model with custom dimensions
626
+ * - `custom`: Use a custom ONNX model
627
+ */
628
+ export interface JsEmbeddingModelType {
629
+ /** Model kind: "preset", "fastembed", or "custom". */
630
+ modelType: string;
631
+ /** Preset name when the kind is preset; model ID for fastembed/custom. */
632
+ value: string;
633
+ /** Dimension count (fastembed/custom only). */
634
+ dimensions?: number;
635
+ }
636
+
637
+ export interface JsExtractedImage {
638
+ data: Buffer;
639
+ format: string;
640
+ imageIndex: number;
641
+ pageNumber?: number;
642
+ width?: number;
643
+ height?: number;
644
+ colorspace?: string;
645
+ bitsPerComponent?: number;
646
+ isMask: boolean;
647
+ description?: string;
648
+ ocrResult?: JsExtractionResult | undefined;
649
+ }
650
+
651
+ export interface JsExtractionConfig {
652
+ useCache?: boolean;
653
+ enableQualityProcessing?: boolean;
654
+ ocr?: JsOcrConfig;
655
+ forceOcr?: boolean;
656
+ chunking?: JsChunkingConfig;
657
+ images?: JsImageExtractionConfig;
658
+ pdfOptions?: JsPdfConfig;
659
+ tokenReduction?: JsTokenReductionConfig;
660
+ languageDetection?: JsLanguageDetectionConfig;
661
+ postprocessor?: JsPostProcessorConfig;
662
+ keywords?: JsKeywordConfig;
663
+ htmlOptions?: JsHtmlOptions;
664
+ maxConcurrentExtractions?: number;
665
+ }
666
+
667
+ export interface JsExtractionResult {
668
+ content: string; // extracted text content
669
+ mimeType: string; // detected MIME type
670
+ metadata: Metadata; // file metadata (author, title, etc.)
671
+ tables: JsTable[]; // extracted tables (if any)
672
+ detectedLanguages?: string[]; // detected languages (if enabled)
673
+ chunks?: JsChunk[]; // text chunks (if chunking enabled)
674
+ images?: JsExtractedImage[]; // extracted images (if configured)
675
+ }
676
+
677
+ export interface JsHtmlOptions {
678
+ headingStyle?: string;
679
+ listIndentType?: string;
680
+ listIndentWidth?: number;
681
+ bullets?: string;
682
+ strongEmSymbol?: string;
683
+ escapeAsterisks?: boolean;
684
+ escapeUnderscores?: boolean;
685
+ escapeMisc?: boolean;
686
+ escapeAscii?: boolean;
687
+ codeLanguage?: string;
688
+ autolinks?: boolean;
689
+ defaultTitle?: boolean;
690
+ brInTables?: boolean;
691
+ hocrSpatialTables?: boolean;
692
+ highlightStyle?: string;
693
+ extractMetadata?: boolean;
694
+ whitespaceMode?: string;
695
+ stripNewlines?: boolean;
696
+ wrap?: boolean;
697
+ wrapWidth?: number;
698
+ convertAsInline?: boolean;
699
+ subSymbol?: string;
700
+ supSymbol?: string;
701
+ newlineStyle?: string;
702
+ codeBlockStyle?: string;
703
+ keepInlineImagesIn?: string[];
704
+ encoding?: string;
705
+ debug?: boolean;
706
+ stripTags?: string[];
707
+ preserveTags?: string[];
708
+ preprocessing?: JsHtmlPreprocessingOptions;
709
+ }
710
+
711
+ export interface JsHtmlPreprocessingOptions {
712
+ enabled?: boolean;
713
+ preset?: string;
714
+ removeNavigation?: boolean;
715
+ removeForms?: boolean;
716
+ }
717
+
718
+ export interface JsImageExtractionConfig {
719
+ extractImages?: boolean;
720
+ targetDpi?: number;
721
+ maxImageDimension?: number;
722
+ autoAdjustDpi?: boolean;
723
+ minDpi?: number;
724
+ maxDpi?: number;
725
+ }
726
+
727
+ export interface JsKeywordConfig {
728
+ algorithm?: string;
729
+ maxKeywords?: number;
730
+ minScore?: number;
731
+ ngramRange?: [number, number] | undefined;
732
+ language?: string;
733
+ yakeParams?: JsYakeParams;
734
+ rakeParams?: JsRakeParams;
735
+ }
736
+
737
+ export interface JsLanguageDetectionConfig {
738
+ enabled?: boolean;
739
+ minConfidence?: number;
740
+ detectMultiple?: boolean;
741
+ }
742
+
743
+ export interface JsOcrConfig {
744
+ backend: string;
745
+ language?: string;
746
+ tesseractConfig?: JsTesseractConfig;
747
+ }
748
+
749
+ export interface JsPdfConfig {
750
+ extractImages?: boolean;
751
+ passwords?: string[];
752
+ extractMetadata?: boolean;
753
+ }
754
+
755
+ export interface JsPostProcessorConfig {
756
+ enabled?: boolean;
757
+ enabledProcessors?: string[];
758
+ disabledProcessors?: string[];
759
+ }
760
+
761
+ export interface JsRakeParams {
762
+ minWordLength?: number;
763
+ maxWordsPerPhrase?: number;
764
+ }
765
+
766
+ export interface JsTable {
767
+ cells: string[][];
768
+ markdown: string;
769
+ pageNumber: number;
770
+ }
771
+
772
+ export interface JsTesseractConfig {
773
+ psm?: number;
774
+ enableTableDetection?: boolean;
775
+ tesseditCharWhitelist?: string;
776
+ }
777
+
778
+ export interface JsTokenReductionConfig {
779
+ mode?: string;
780
+ preserveImportantWords?: boolean;
781
+ }
782
+
783
+ export interface JsYakeParams {
784
+ windowSize?: number;
785
+ }
786
+
787
+ /**
788
+ * List all registered document extractors.
789
+ *
790
+ * Returns an array of names of all currently registered document extractors,
791
+ * including built-in extractors for PDF, Office documents, images, etc.
792
+ *
793
+ * # Returns
794
+ *
795
+ * Array of document extractor names.
796
+ *
797
+ * # Example
798
+ *
799
+ * ```typescript
800
+ * import { listDocumentExtractors } from '@kreuzberg/node';
801
+ *
802
+ * const extractors = listDocumentExtractors();
803
+ * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
804
+ * ```
805
+ */
806
+ export declare function listDocumentExtractors(): string[];
807
+
808
+ /**
809
+ * List all available embedding preset names.
810
+ *
811
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
812
+ *
813
+ * # Returns
814
+ *
815
+ * Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
816
+ *
817
+ * # Example
818
+ *
819
+ * ```typescript
820
+ * import { listEmbeddingPresets } from '@kreuzberg/node';
821
+ *
822
+ * const presets = listEmbeddingPresets();
823
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
824
+ * ```
825
+ */
826
+ export declare function listEmbeddingPresets(): string[];
827
+
828
+ /**
829
+ * List all registered OCR backends.
830
+ *
831
+ * Returns an array of names of all currently registered OCR backends,
832
+ * including built-in backends like "tesseract".
833
+ *
834
+ * # Returns
835
+ *
836
+ * Array of OCR backend names.
837
+ *
838
+ * # Example
839
+ *
840
+ * ```typescript
841
+ * import { listOcrBackends } from '@kreuzberg/node';
842
+ *
843
+ * const backends = listOcrBackends();
844
+ * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
845
+ * ```
846
+ */
847
+ export declare function listOcrBackends(): string[];
848
+
849
+ /** Lists every post-processor currently registered. */
850
+ export declare function listPostProcessors(): string[];
851
+
852
+ /** Lists every validator currently registered. */
853
+ export declare function listValidators(): string[];
854
+
855
+ /**
856
+ * Load extraction configuration from a file.
857
+ *
858
+ * Automatically detects the file format based on extension:
859
+ * - `.toml` - TOML format
860
+ * - `.yaml` - YAML format
861
+ * - `.json` - JSON format
862
+ *
863
+ * # Parameters
864
+ *
865
+ * * `file_path` - Path to the configuration file (absolute or relative)
866
+ *
867
+ * # Returns
868
+ *
869
+ * `JsExtractionConfig` object with loaded configuration.
870
+ *
871
+ * # Errors
872
+ *
873
+ * Throws an error if:
874
+ * - File does not exist or is not accessible
875
+ * - File content is not valid TOML/YAML/JSON
876
+ * - Configuration structure is invalid
877
+ *
878
+ * # Example
879
+ *
880
+ * ```typescript
881
+ * import { extractFile, loadExtractionConfigFromFile } from '@kreuzberg/node';
882
+ *
883
+ * // Load from TOML file
884
+ * const config = loadExtractionConfigFromFile('kreuzberg.toml');
885
+ *
886
+ * // Load from YAML file
887
+ * const config2 = loadExtractionConfigFromFile('./config.yaml');
888
+ *
889
+ * // Use with extraction
890
+ * const result = await extractFile('document.pdf', null, config);
891
+ * ```
892
+ */
893
+ export declare function loadExtractionConfigFromFile(filePath: string): JsExtractionConfig;
894
+
895
+ /**
896
+ * Register a custom OCR backend
897
+ *
898
+ * Registers a JavaScript OCR backend that can process images and extract text.
899
+ *
900
+ * # Arguments
901
+ *
902
+ * * `backend` - JavaScript object with the following interface:
903
+ * - `name(): string` - Unique backend name
904
+ * - `supportedLanguages(): string[]` - Array of supported ISO 639-2/3 language codes
905
+ * - `processImage(imageBytes: string, language: string): Promise<result>` - Process image and return extraction result
906
+ *
907
+ * # Implementation Notes
908
+ *
909
+ * Due to NAPI ThreadsafeFunction limitations, the processImage function receives:
910
+ * - `imageBytes` as a Base64 string (first argument)
911
+ * - `language` as string (second argument)
912
+ *
913
+ * And must return a Promise resolving to a JSON-serializable object with:
914
+ * ```typescript
915
+ * {
916
+ * content: string,
917
+ * mime_type: string, // default: "text/plain"
918
+ * metadata: object, // default: {}
919
+ * tables: array // default: []
920
+ * }
921
+ * ```
922
+ *
923
+ * # Example
924
+ *
925
+ * ```typescript
926
+ * import { registerOcrBackend } from '@kreuzberg/node';
927
+ *
928
+ * registerOcrBackend({
929
+ * name: () => "my-ocr",
930
+ * supportedLanguages: () => ["eng", "deu", "fra"],
931
+ * processImage: async (imageBytes, language) => {
932
+ * const buffer = Buffer.from(imageBytes, "base64");
933
+ * const text = await myOcrLibrary.process(buffer, language);
934
+ * return {
935
+ * content: text,
936
+ * mime_type: "text/plain",
937
+ * metadata: { confidence: 0.95 },
938
+ * tables: []
939
+ * };
940
+ * }
941
+ * });
942
+ * ```
943
+ */
944
+ export declare function registerOcrBackend(backend: object): void;
945
+
946
+ /**
947
+ * Register a custom postprocessor
948
+ *
949
+ * Registers a JavaScript PostProcessor that will be called after extraction.
950
+ *
951
+ * # Arguments
952
+ *
953
+ * * `processor` - JavaScript object with the following interface:
954
+ * - `name(): string` - Unique processor name
955
+ * - `process(...args): string` - Process function that receives JSON string as args\[0\]
956
+ * - `processingStage(): "early" | "middle" | "late"` - Optional processing stage
957
+ *
958
+ * # Implementation Notes
959
+ *
960
+ * Due to NAPI ThreadsafeFunction limitations, the process function receives the extraction
961
+ * result as a JSON string in args\[0\] and must return a JSON string. Use the TypeScript
962
+ * wrapper functions for a cleaner API.
963
+ *
964
+ * # Example
965
+ *
966
+ * ```typescript
967
+ * import { registerPostProcessor } from '@kreuzberg/node';
968
+ *
969
+ * registerPostProcessor({
970
+ * name: () => "word-counter",
971
+ * processingStage: () => "middle",
972
+ * process: (...args) => {
973
+ * const result = JSON.parse(args[0]);
974
+ * const wordCount = result.content.split(/\s+/).length;
975
+ * result.metadata.word_count = wordCount;
976
+ * return JSON.stringify(result);
977
+ * }
978
+ * });
979
+ * ```
980
+ */
981
+ export declare function registerPostProcessor(processor: object): void
982
+
983
+ /**
984
+ * Register a custom validator
985
+ *
986
+ * Registers a JavaScript Validator that will be called after extraction.
987
+ *
988
+ * # Arguments
989
+ *
990
+ * * `validator` - JavaScript object with the following interface:
991
+ * - `name(): string` - Unique validator name
992
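+ * - `validate(...args): Promise<string>` - Validate function that receives the extraction result as a JSON string in args\[0\]; resolves to an empty string on success and rejects on validation failure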
993
+ * - `priority(): number` - Optional priority (defaults to 50, higher runs first)
994
+ *
995
+ * # Implementation Notes
996
+ *
997
+ * Due to NAPI ThreadsafeFunction limitations, the validate function receives the extraction
998
+ * result as a JSON string in args\[0\]. On success, return an empty string. On validation
999
+ * failure, throw an error (the Promise should reject). Use the TypeScript wrapper functions
1000
+ * for a cleaner API.
1001
+ *
1002
+ * # Example
1003
+ *
1004
+ * ```typescript
1005
+ * import { registerValidator } from '@kreuzberg/node';
1006
+ *
1007
+ * registerValidator({
1008
+ * name: () => "min-length",
1009
+ * priority: () => 100,
1010
+ * validate: async (...args) => {
1011
+ * const result = JSON.parse(args[0]);
1012
+ * if (result.content.length < 100) {
1013
+ * throw new Error("ValidationError: Content too short");
1014
+ * }
1015
+ * return ""; // Success - return empty string
1016
+ * }
1017
+ * });
1018
+ * ```
1019
+ */
1020
+ export declare function registerValidator(validator: object): void
1021
+
1022
+ /**
1023
+ * Unregister a document extractor by name.
1024
+ *
1025
+ * Removes the specified document extractor from the registry. If the extractor
1026
+ * doesn't exist, this operation is a no-op (does not throw an error).
1027
+ *
1028
+ * # Parameters
1029
+ *
1030
+ * * `name` - Name of the document extractor to unregister
1031
+ *
1032
+ * # Example
1033
+ *
1034
+ * ```typescript
1035
+ * import { unregisterDocumentExtractor } from '@kreuzberg/node';
1036
+ *
1037
+ * // Unregister a custom extractor
1038
+ * unregisterDocumentExtractor('MyCustomExtractor');
1039
+ * ```
1040
+ */
1041
+ export declare function unregisterDocumentExtractor(name: string): void
1042
+
1043
+ /**
1044
+ * Unregister an OCR backend by name.
1045
+ *
1046
+ * Removes the specified OCR backend from the registry. If the backend doesn't exist,
1047
+ * this operation is a no-op (does not throw an error).
1048
+ *
1049
+ * # Parameters
1050
+ *
1051
+ * * `name` - Name of the OCR backend to unregister
1052
+ *
1053
+ * # Example
1054
+ *
1055
+ * ```typescript
1056
+ * import { unregisterOcrBackend } from '@kreuzberg/node';
1057
+ *
1058
+ * // Unregister a custom backend
1059
+ * unregisterOcrBackend('my-custom-ocr');
1060
+ * ```
1061
+ */
1062
+ export declare function unregisterOcrBackend(name: string): void
1063
+
1064
+ /** Unregister a postprocessor by name */
1065
+ export declare function unregisterPostProcessor(name: string): void
1066
+
1067
+ /** Unregister a validator by name */
1068
+ export declare function unregisterValidator(name: string): void
1069
+
1070
+ /**
1071
+ * Validate that a MIME type is supported by Kreuzberg.
1072
+ *
1073
+ * Checks if a MIME type is in the list of supported formats. Note that any
1074
+ * `image/*` MIME type is automatically considered valid.
1075
+ *
1076
+ * # Parameters
1077
+ *
1078
+ * * `mime_type` - The MIME type to validate (string)
1079
+ *
1080
+ * # Returns
1081
+ *
1082
+ * The validated MIME type (may be normalized).
1083
+ *
1084
+ * # Errors
1085
+ *
1086
+ * Throws an error if the MIME type is not supported.
1087
+ *
1088
+ * # Example
1089
+ *
1090
+ * ```typescript
1091
+ * import { validateMimeType } from '@kreuzberg/node';
1092
+ *
1093
+ * // Validate supported type
1094
+ * const validated = validateMimeType('application/pdf');
1095
+ * console.log(validated); // 'application/pdf'
1096
+ *
1097
+ * // Validate custom image type
1098
+ * const validated2 = validateMimeType('image/custom-format');
1099
+ * console.log(validated2); // 'image/custom-format' (any image/* is valid)
1100
+ *
1101
+ * // Validate unsupported type (throws error)
1102
+ * try {
1103
+ * validateMimeType('video/mp4');
1104
+ * } catch (err) {
1105
+ * console.error(err); // Error: Unsupported format: video/mp4
1106
+ * }
1107
+ * ```
1108
+ */
1109
+ export declare function validateMimeType(mimeType: string): string