@kreuzberg/node 4.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts ADDED
@@ -0,0 +1,1118 @@
1
+ /* auto-generated by NAPI-RS */
2
+ /* eslint-disable */
3
+ /**
4
+ * Batch extract from multiple byte arrays (asynchronous).
5
+ *
6
+ * Asynchronously processes multiple in-memory buffers in parallel. Non-blocking
7
+ * alternative to `batchExtractBytesSync`.
8
+ *
9
+ * # Parameters
10
+ *
11
+ * * `data_list` - Array of buffers to extract
12
+ * * `mime_types` - Array of MIME types (must match data_list length)
13
+ * * `config` - Optional extraction configuration
14
+ *
15
+ * # Returns
16
+ *
17
+ * Promise resolving to array of `ExtractionResult`.
18
+ *
19
+ * # Example
20
+ *
21
+ * ```typescript
22
+ * import { batchExtractBytes } from '@kreuzberg/node';
23
+ *
24
+ * const responses = await Promise.all([
25
+ * fetch('https://example.com/doc1.pdf'),
26
+ * fetch('https://example.com/doc2.pdf')
27
+ * ]);
28
+ * const buffers = await Promise.all(
29
+ * responses.map(r => r.arrayBuffer().then(b => Buffer.from(b)))
30
+ * );
31
+ * const results = await batchExtractBytes(
32
+ * buffers,
33
+ * ['application/pdf', 'application/pdf'],
34
+ * null
35
+ * );
36
+ * ```
37
+ */
38
+ export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
39
+
40
+ /**
41
+ * Batch extract from multiple byte arrays (synchronous).
42
+ *
43
+ * Synchronously processes multiple in-memory buffers in parallel. Requires
44
+ * corresponding MIME types for each buffer.
45
+ *
46
+ * # Parameters
47
+ *
48
+ * * `data_list` - Array of buffers to extract
49
+ * * `mime_types` - Array of MIME types (must match data_list length)
50
+ * * `config` - Optional extraction configuration
51
+ *
52
+ * # Returns
53
+ *
54
+ * Array of `ExtractionResult` in the same order as inputs.
55
+ *
56
+ * # Errors
57
+ *
58
+ * Throws if data_list and mime_types lengths don't match.
59
+ *
60
+ * # Example
61
+ *
62
+ * ```typescript
63
+ * import { batchExtractBytesSync } from '@kreuzberg/node';
64
+ *
65
+ * const buffers = [buffer1, buffer2, buffer3];
66
+ * const mimeTypes = ['application/pdf', 'image/png', 'text/plain'];
67
+ * const results = batchExtractBytesSync(buffers, mimeTypes, null);
68
+ * ```
69
+ */
70
+ export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
71
+
72
+ /**
73
+ * Batch extract from multiple files (asynchronous).
74
+ *
75
+ * Asynchronously processes multiple files in parallel. Non-blocking alternative
76
+ * to `batchExtractFilesSync` with same performance benefits.
77
+ *
78
+ * # Parameters
79
+ *
80
+ * * `paths` - Array of file paths to extract
81
+ * * `config` - Optional extraction configuration (applied to all files)
82
+ *
83
+ * # Returns
84
+ *
85
+ * Promise resolving to array of `ExtractionResult`.
86
+ *
87
+ * # Example
88
+ *
89
+ * ```typescript
90
+ * import { batchExtractFiles } from '@kreuzberg/node';
91
+ *
92
+ * const files = ['report1.pdf', 'report2.pdf', 'report3.pdf'];
93
+ * const results = await batchExtractFiles(files, null);
94
+ * console.log(`Processed ${results.length} files`);
95
+ * ```
96
+ */
97
+ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
98
+
99
+ /**
100
+ * Batch extract from multiple files (synchronous).
101
+ *
102
+ * Synchronously processes multiple files in parallel using Rayon. Significantly
103
+ * faster than sequential processing for large batches.
104
+ *
105
+ * # Parameters
106
+ *
107
+ * * `paths` - Array of file paths to extract
108
+ * * `config` - Optional extraction configuration (applied to all files)
109
+ *
110
+ * # Returns
111
+ *
112
+ * Array of `ExtractionResult` in the same order as input paths.
113
+ *
114
+ * # Example
115
+ *
116
+ * ```typescript
117
+ * import { batchExtractFilesSync } from '@kreuzberg/node';
118
+ *
119
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
120
+ * const results = batchExtractFilesSync(files, null);
121
+ * results.forEach((result, i) => {
122
+ * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
123
+ * });
124
+ * ```
125
+ */
126
+ export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
127
+
128
+ /**
129
+ * Clear all registered document extractors.
130
+ *
131
+ * Removes all document extractors from the registry, including built-in extractors.
132
+ * Use with caution as this will make document extraction unavailable until
133
+ * extractors are re-registered.
134
+ *
135
+ * # Example
136
+ *
137
+ * ```typescript
138
+ * import { clearDocumentExtractors } from '@kreuzberg/node';
139
+ *
140
+ * clearDocumentExtractors();
141
+ * ```
142
+ */
143
+ export declare function clearDocumentExtractors(): void
144
+
145
+ /**
146
+ * Clear all registered OCR backends.
147
+ *
148
+ * Removes all OCR backends from the registry, including built-in backends.
149
+ * Use with caution as this will make OCR functionality unavailable until
150
+ * backends are re-registered.
151
+ *
152
+ * # Example
153
+ *
154
+ * ```typescript
155
+ * import { clearOcrBackends } from '@kreuzberg/node';
156
+ *
157
+ * clearOcrBackends();
158
+ * ```
159
+ */
160
+ export declare function clearOcrBackends(): void
161
+
162
+ /** Clear all registered postprocessors */
163
+ export declare function clearPostProcessors(): void
164
+
165
+ /** Clear all registered validators */
166
+ export declare function clearValidators(): void
167
+
168
+ /**
169
+ * Detect MIME type from raw bytes.
170
+ *
171
+ * Uses content inspection (magic bytes) to determine MIME type.
172
+ * This is more accurate than extension-based detection but requires
173
+ * reading the file content.
174
+ *
175
+ * # Parameters
176
+ *
177
+ * * `bytes` - Raw file content as Buffer
178
+ *
179
+ * # Returns
180
+ *
181
+ * The detected MIME type string.
182
+ *
183
+ * # Errors
184
+ *
185
+ * Throws an error if MIME type cannot be determined from content.
186
+ *
187
+ * # Example
188
+ *
189
+ * ```typescript
190
+ * import { detectMimeTypeFromBytes } from '@kreuzberg/node';
191
+ * import * as fs from 'fs';
192
+ *
193
+ * // Read file content
194
+ * const content = fs.readFileSync('document.pdf');
195
+ *
196
+ * // Detect MIME type from bytes
197
+ * const mimeType = detectMimeTypeFromBytes(content);
198
+ * console.log(mimeType); // 'application/pdf'
199
+ * ```
200
+ */
201
+ export declare function detectMimeTypeFromBytes(bytes: Buffer): string
202
+
203
+ /**
204
+ * Detect MIME type from a file path.
205
+ *
206
+ * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
207
+ * if extension-based detection fails.
208
+ *
209
+ * # Parameters
210
+ *
211
+ * * `path` - Path to the file (string)
212
+ * * `check_exists` - Whether to verify file existence (default: true)
213
+ *
214
+ * # Returns
215
+ *
216
+ * The detected MIME type string.
217
+ *
218
+ * # Errors
219
+ *
220
+ * Throws an error if:
221
+ * - File doesn't exist (when check_exists is true)
222
+ * - MIME type cannot be determined from path/extension
223
+ * - Extension is unknown
224
+ *
225
+ * # Example
226
+ *
227
+ * ```typescript
228
+ * import { detectMimeTypeFromPath } from '@kreuzberg/node';
229
+ *
230
+ * // Detect from existing file
231
+ * const mimeType = detectMimeTypeFromPath('document.pdf');
232
+ * console.log(mimeType); // 'application/pdf'
233
+ *
234
+ * const mimeType2 = detectMimeTypeFromPath('document.docx');
235
+ * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
236
+ * ```
237
+ */
238
+ export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
239
+
240
+ /**
241
+ * Discover and load extraction configuration from current or parent directories.
242
+ *
243
+ * Searches for a `kreuzberg.toml` file starting from the current working directory
244
+ * and traversing up the directory tree. Returns the first configuration file found.
245
+ *
246
+ * # Returns
247
+ *
248
+ * `JsExtractionConfig` object if a configuration file is found, or `null` if no
249
+ * configuration file exists in the current or parent directories.
250
+ *
251
+ * # Example
252
+ *
253
+ * ```typescript
254
+ * import { discoverExtractionConfig } from '@kreuzberg/node';
255
+ *
256
+ * // Try to find config in current or parent directories
257
+ * const config = discoverExtractionConfig();
258
+ * if (config) {
259
+ * console.log('Found configuration');
260
+ * // Use config for extraction
261
+ * } else {
262
+ * console.log('No configuration file found, using defaults');
263
+ * }
264
+ * ```
265
+ */
266
+ export declare function discoverExtractionConfig(): JsExtractionConfig | null
267
+
268
+ /**
269
+ * Embedding preset configuration for TypeScript bindings.
270
+ *
271
+ * Contains all settings for a specific embedding model preset.
272
+ */
273
+ export interface EmbeddingPreset {
274
+ /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
275
+ name: string
276
+ /** Recommended chunk size in characters */
277
+ chunkSize: number
278
+ /** Recommended overlap in characters */
279
+ overlap: number
280
+ /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
281
+ modelName: string
282
+ /** Embedding vector dimensions */
283
+ dimensions: number
284
+ /** Human-readable description of the preset */
285
+ description: string
286
+ }
287
+
288
+ /**
289
+ * Extract content from bytes (asynchronous).
290
+ *
291
+ * Asynchronously extracts content from a byte buffer. Non-blocking alternative
292
+ * to `extractBytesSync` for processing in-memory data.
293
+ *
294
+ * # Parameters
295
+ *
296
+ * * `data` - Buffer containing the document bytes
297
+ * * `mime_type` - MIME type of the data
298
+ * * `config` - Optional extraction configuration
299
+ *
300
+ * # Returns
301
+ *
302
+ * Promise resolving to `ExtractionResult`.
303
+ *
304
+ * # Example
305
+ *
306
+ * ```typescript
307
+ * import { extractBytes } from '@kreuzberg/node';
308
+ *
309
+ * const response = await fetch('https://example.com/document.pdf');
310
+ * const buffer = Buffer.from(await response.arrayBuffer());
311
+ * const result = await extractBytes(buffer, 'application/pdf', null);
312
+ * ```
313
+ */
314
+ export declare function extractBytes(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
315
+
316
+ /**
317
+ * Extract content from bytes (synchronous).
318
+ *
319
+ * Synchronously extracts content from a byte buffer without requiring a file path.
320
+ * Useful for processing in-memory data, network streams, or database BLOBs.
321
+ *
322
+ * # Parameters
323
+ *
324
+ * * `data` - Buffer containing the document bytes
325
+ * * `mime_type` - MIME type of the data (e.g., "application/pdf", "image/png")
326
+ * * `config` - Optional extraction configuration
327
+ *
328
+ * # Returns
329
+ *
330
+ * `ExtractionResult` with extracted content and metadata.
331
+ *
332
+ * # Errors
333
+ *
334
+ * Throws an error if data is malformed or MIME type is unsupported.
335
+ *
336
+ * # Example
337
+ *
338
+ * ```typescript
339
+ * import { extractBytesSync } from '@kreuzberg/node';
340
+ * import fs from 'fs';
341
+ *
342
+ * const buffer = fs.readFileSync('document.pdf');
343
+ * const result = extractBytesSync(buffer, 'application/pdf', null);
344
+ * console.log(result.content);
345
+ * ```
346
+ */
347
+ export declare function extractBytesSync(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): JsExtractionResult
348
+
349
+ /**
350
+ * Extract content from a file (asynchronous).
351
+ *
352
+ * Asynchronously extracts text, tables, images, and metadata from a document file.
353
+ * Non-blocking alternative to `extractFileSync` for use in async/await contexts.
354
+ *
355
+ * # Parameters
356
+ *
357
+ * * `file_path` - Path to the file to extract (absolute or relative)
358
+ * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
359
+ * * `config` - Optional extraction configuration (OCR, chunking, etc.)
360
+ *
361
+ * # Returns
362
+ *
363
+ * Promise resolving to `ExtractionResult` with extracted content and metadata.
364
+ *
365
+ * # Errors
366
+ *
367
+ * Rejects if file processing fails (see `extractFileSync` for error conditions).
368
+ *
369
+ * # Example
370
+ *
371
+ * ```typescript
372
+ * import { extractFile } from '@kreuzberg/node';
373
+ *
374
+ * // Async/await usage
375
+ * const result = await extractFile('document.pdf', null, null);
376
+ * console.log(result.content);
377
+ *
378
+ * // Promise usage
379
+ * extractFile('report.docx', null, null)
380
+ * .then(result => console.log(result.content))
381
+ * .catch(err => console.error(err));
382
+ * ```
383
+ */
384
+ export declare function extractFile(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
385
+
386
+ /**
387
+ * Extract content from a file (synchronous).
388
+ *
389
+ * Synchronously extracts text, tables, images, and metadata from a document file.
390
+ * Supports 118+ file formats including PDFs, Office documents, images, and more.
391
+ *
392
+ * # Parameters
393
+ *
394
+ * * `file_path` - Path to the file to extract (absolute or relative)
395
+ * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
396
+ * * `config` - Optional extraction configuration (OCR, chunking, etc.)
397
+ *
398
+ * # Returns
399
+ *
400
+ * `ExtractionResult` containing:
401
+ * - `content`: Extracted text content
402
+ * - `mimeType`: Detected MIME type
403
+ * - `metadata`: File metadata (author, title, etc.)
404
+ * - `tables`: Extracted tables (if any)
405
+ * - `images`: Extracted images (if configured)
406
+ * - `chunks`: Text chunks (if chunking enabled)
407
+ * - `detectedLanguages`: Detected languages (if enabled)
408
+ *
409
+ * # Errors
410
+ *
411
+ * Throws an error if:
412
+ * - File does not exist or is not accessible
413
+ * - File format is unsupported
414
+ * - File is corrupted or malformed
415
+ * - OCR processing fails (if enabled)
416
+ *
417
+ * # Example
418
+ *
419
+ * ```typescript
420
+ * import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
421
+ *
422
+ * // Basic extraction
423
+ * const result = extractFileSync('document.pdf', null, null);
424
+ * console.log(result.content);
425
+ *
426
+ * // With MIME type hint
427
+ * const result2 = extractFileSync('file.bin', 'application/pdf', null);
428
+ *
429
+ * // With OCR enabled
430
+ * const config: ExtractionConfig = {
431
+ * ocr: {
432
+ * backend: 'tesseract',
433
+ * language: 'eng',
434
+ * }
435
+ * };
436
+ * const result3 = extractFileSync('scanned.pdf', null, config);
437
+ * ```
438
+ */
439
+ export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
440
+
441
+ /**
442
+ * Get a specific embedding preset by name.
443
+ *
444
+ * Returns a preset configuration object, or null if the preset name is not found.
445
+ *
446
+ * # Arguments
447
+ *
448
+ * * `name` - The preset name (case-sensitive)
449
+ *
450
+ * # Returns
451
+ *
452
+ * An `EmbeddingPreset` object with the following properties:
453
+ * - `name`: string - Preset name
454
+ * - `chunkSize`: number - Recommended chunk size in characters
455
+ * - `overlap`: number - Recommended overlap in characters
456
+ * - `modelName`: string - Model identifier
457
+ * - `dimensions`: number - Embedding vector dimensions
458
+ * - `description`: string - Human-readable description
459
+ *
460
+ * Returns `null` if preset name is not found.
461
+ *
462
+ * # Example
463
+ *
464
+ * ```typescript
465
+ * import { getEmbeddingPreset } from '@kreuzberg/node';
466
+ *
467
+ * const preset = getEmbeddingPreset('balanced');
468
+ * if (preset) {
469
+ * console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
470
+ * // Model: BGEBaseENV15, Dims: 768
471
+ * }
472
+ * ```
473
+ */
474
+ export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
475
+
476
+ /**
477
+ * Get file extensions for a given MIME type.
478
+ *
479
+ * Returns an array of file extensions commonly associated with the specified
480
+ * MIME type. For example, 'application/pdf' returns ['pdf'].
481
+ *
482
+ * # Parameters
483
+ *
484
+ * * `mime_type` - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
485
+ *
486
+ * # Returns
487
+ *
488
+ * Array of file extensions (without leading dots).
489
+ *
490
+ * # Errors
491
+ *
492
+ * Throws an error if the MIME type is not recognized or supported.
493
+ *
494
+ * # Example
495
+ *
496
+ * ```typescript
497
+ * import { getExtensionsForMime } from '@kreuzberg/node';
498
+ *
499
+ * // Get extensions for PDF
500
+ * const pdfExts = getExtensionsForMime('application/pdf');
501
+ * console.log(pdfExts); // ['pdf']
502
+ *
503
+ * // Get extensions for JPEG
504
+ * const jpegExts = getExtensionsForMime('image/jpeg');
505
+ * console.log(jpegExts); // ['jpg', 'jpeg']
506
+ * ```
507
+ */
508
+ export declare function getExtensionsForMime(mimeType: string): Array<string>
509
+
510
+ /**
511
+ * Get the error code for the last FFI error.
512
+ *
513
+ * Returns the FFI error code as an integer. Error codes are:
514
+ * - 0: Success (no error)
515
+ * - 1: GenericError
516
+ * - 2: Panic
517
+ * - 3: InvalidArgument
518
+ * - 4: IoError
519
+ * - 5: ParsingError
520
+ * - 6: OcrError
521
+ * - 7: MissingDependency
522
+ *
523
+ * This is useful for programmatic error handling and distinguishing
524
+ * between different types of failures in native code.
525
+ *
526
+ * # Returns
527
+ *
528
+ * The integer error code.
529
+ *
530
+ * # Example
531
+ *
532
+ * ```typescript
533
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
534
+ *
535
+ * try {
536
+ * const result = await extractFile('document.pdf');
537
+ * } catch (error) {
538
+ * const code = getLastErrorCode();
539
+ * if (code === ErrorCode.Panic) {
540
+ * console.error('Native code panic detected');
541
+ * }
542
+ * }
543
+ * ```
544
+ */
545
+ export declare function getLastErrorCode(): number
546
+
547
+ /**
548
+ * Get panic context information if the last error was a panic.
549
+ *
550
+ * Returns detailed information about a panic in native code, or null
551
+ * if the last error was not a panic.
552
+ *
553
+ * # Returns
554
+ *
555
+ * A `PanicContext` object with:
556
+ * - `file`: string - Source file where panic occurred
557
+ * - `line`: number - Line number
558
+ * - `function`: string - Function name
559
+ * - `message`: string - Panic message
560
+ * - `timestamp_secs`: number - Unix timestamp (seconds since epoch)
561
+ *
562
+ * Returns `null` if no panic context is available.
563
+ *
564
+ * # Example
565
+ *
566
+ * ```typescript
567
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
568
+ *
569
+ * try {
570
+ * const result = await extractFile('document.pdf');
571
+ * } catch (error) {
572
+ * const context = getLastPanicContext();
573
+ * if (context) {
574
+ * console.error(`Panic at ${context.file}:${context.line}`);
575
+ * console.error(`In function: ${context.function}`);
576
+ * console.error(`Message: ${context.message}`);
577
+ * }
578
+ * }
579
+ * ```
580
+ */
581
+ export declare function getLastPanicContext(): any | null
582
+
583
+ export interface JsChunk {
584
+ content: string
585
+ embedding?: number[] | undefined
586
+ metadata: JsChunkMetadata
587
+ }
588
+
589
+ export interface JsChunkingConfig {
590
+ maxChars?: number
591
+ maxOverlap?: number
592
+ /** Optional embedding configuration for generating embeddings */
593
+ embedding?: JsEmbeddingConfig
594
+ /** Optional preset name for chunking parameters */
595
+ preset?: string
596
+ }
597
+
598
+ export interface JsChunkMetadata {
599
+ byteStart: number
600
+ byteEnd: number
601
+ tokenCount?: number
602
+ chunkIndex: number
603
+ totalChunks: number
604
+ firstPage?: number
605
+ lastPage?: number
606
+ }
607
+
608
+ /** Embedding generation configuration for Node.js bindings. */
609
+ export interface JsEmbeddingConfig {
610
+ /** Embedding model configuration */
611
+ model?: JsEmbeddingModelType
612
+ /** Whether to normalize embeddings (L2 normalization) */
613
+ normalize?: boolean
614
+ /** Batch size for embedding generation */
615
+ batchSize?: number
616
+ /** Whether to show download progress for models */
617
+ showDownloadProgress?: boolean
618
+ /** Custom cache directory for model storage */
619
+ cacheDir?: string
620
+ }
621
+
622
+ /**
623
+ * Embedding model type configuration for Node.js bindings.
624
+ *
625
+ * This struct represents different embedding model sources:
626
+ * - `preset`: Use a named preset (e.g., "balanced", "fast", "quality", "multilingual")
627
+ * - `fastembed`: Use a FastEmbed model with custom dimensions
628
+ * - `custom`: Use a custom ONNX model
629
+ */
630
+ export interface JsEmbeddingModelType {
631
+ /** Type of model: "preset", "fastembed", or "custom" */
632
+ modelType: string
633
+ /** For preset: preset name; for fastembed/custom: model ID */
634
+ value: string
635
+ /** Number of dimensions (only for fastembed/custom) */
636
+ dimensions?: number
637
+ }
638
+
639
+ export interface JsExtractedImage {
640
+ data: Buffer
641
+ format: string
642
+ imageIndex: number
643
+ pageNumber?: number
644
+ width?: number
645
+ height?: number
646
+ colorspace?: string
647
+ bitsPerComponent?: number
648
+ isMask: boolean
649
+ description?: string
650
+ ocrResult?: JsExtractionResult | undefined
651
+ }
652
+
653
+ export interface JsExtractionConfig {
654
+ useCache?: boolean
655
+ enableQualityProcessing?: boolean
656
+ ocr?: JsOcrConfig
657
+ forceOcr?: boolean
658
+ chunking?: JsChunkingConfig
659
+ images?: JsImageExtractionConfig
660
+ pdfOptions?: JsPdfConfig
661
+ tokenReduction?: JsTokenReductionConfig
662
+ languageDetection?: JsLanguageDetectionConfig
663
+ postprocessor?: JsPostProcessorConfig
664
+ keywords?: JsKeywordConfig
665
+ htmlOptions?: JsHtmlOptions
666
+ maxConcurrentExtractions?: number
667
+ pages?: JsPageConfig
668
+ }
669
+
670
+ export interface JsExtractionResult {
671
+ content: string
672
+ mimeType: string
673
+ metadata: Metadata
674
+ tables: Array<JsTable>
675
+ detectedLanguages?: Array<string>
676
+ chunks?: Array<JsChunk>
677
+ images?: Array<JsExtractedImage>
678
+ }
679
+
680
+ export interface JsHtmlOptions {
681
+ headingStyle?: string
682
+ listIndentType?: string
683
+ listIndentWidth?: number
684
+ bullets?: string
685
+ strongEmSymbol?: string
686
+ escapeAsterisks?: boolean
687
+ escapeUnderscores?: boolean
688
+ escapeMisc?: boolean
689
+ escapeAscii?: boolean
690
+ codeLanguage?: string
691
+ autolinks?: boolean
692
+ defaultTitle?: boolean
693
+ brInTables?: boolean
694
+ hocrSpatialTables?: boolean
695
+ highlightStyle?: string
696
+ extractMetadata?: boolean
697
+ whitespaceMode?: string
698
+ stripNewlines?: boolean
699
+ wrap?: boolean
700
+ wrapWidth?: number
701
+ convertAsInline?: boolean
702
+ subSymbol?: string
703
+ supSymbol?: string
704
+ newlineStyle?: string
705
+ codeBlockStyle?: string
706
+ keepInlineImagesIn?: Array<string>
707
+ encoding?: string
708
+ debug?: boolean
709
+ stripTags?: Array<string>
710
+ preserveTags?: Array<string>
711
+ preprocessing?: JsHtmlPreprocessingOptions
712
+ }
713
+
714
+ export interface JsHtmlPreprocessingOptions {
715
+ enabled?: boolean
716
+ preset?: string
717
+ removeNavigation?: boolean
718
+ removeForms?: boolean
719
+ }
720
+
721
+ export interface JsImageExtractionConfig {
722
+ extractImages?: boolean
723
+ targetDpi?: number
724
+ maxImageDimension?: number
725
+ autoAdjustDpi?: boolean
726
+ minDpi?: number
727
+ maxDpi?: number
728
+ }
729
+
730
+ export interface JsKeywordConfig {
731
+ algorithm?: string
732
+ maxKeywords?: number
733
+ minScore?: number
734
+ ngramRange?: [number, number] | undefined
735
+ language?: string
736
+ yakeParams?: JsYakeParams
737
+ rakeParams?: JsRakeParams
738
+ }
739
+
740
+ export interface JsLanguageDetectionConfig {
741
+ enabled?: boolean
742
+ minConfidence?: number
743
+ detectMultiple?: boolean
744
+ }
745
+
746
+ export interface JsOcrConfig {
747
+ backend: string
748
+ language?: string
749
+ tesseractConfig?: JsTesseractConfig
750
+ }
751
+
752
+ export interface JsPageConfig {
753
+ extractPages?: boolean
754
+ insertPageMarkers?: boolean
755
+ markerFormat?: string
756
+ }
757
+
758
+ export interface JsPdfConfig {
759
+ extractImages?: boolean
760
+ passwords?: Array<string>
761
+ extractMetadata?: boolean
762
+ }
763
+
764
+ export interface JsPostProcessorConfig {
765
+ enabled?: boolean
766
+ enabledProcessors?: Array<string>
767
+ disabledProcessors?: Array<string>
768
+ }
769
+
770
+ export interface JsRakeParams {
771
+ minWordLength?: number
772
+ maxWordsPerPhrase?: number
773
+ }
774
+
775
+ export interface JsTable {
776
+ cells: Array<Array<string>>
777
+ markdown: string
778
+ pageNumber: number
779
+ }
780
+
781
+ export interface JsTesseractConfig {
782
+ psm?: number
783
+ enableTableDetection?: boolean
784
+ tesseditCharWhitelist?: string
785
+ }
786
+
787
+ export interface JsTokenReductionConfig {
788
+ mode?: string
789
+ preserveImportantWords?: boolean
790
+ }
791
+
792
+ export interface JsYakeParams {
793
+ windowSize?: number
794
+ }
795
+
796
+ /**
797
+ * List all registered document extractors.
798
+ *
799
+ * Returns an array of names of all currently registered document extractors,
800
+ * including built-in extractors for PDF, Office documents, images, etc.
801
+ *
802
+ * # Returns
803
+ *
804
+ * Array of document extractor names.
805
+ *
806
+ * # Example
807
+ *
808
+ * ```typescript
809
+ * import { listDocumentExtractors } from '@kreuzberg/node';
810
+ *
811
+ * const extractors = listDocumentExtractors();
812
+ * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
813
+ * ```
814
+ */
815
+ export declare function listDocumentExtractors(): Array<string>
816
+
817
+ /**
818
+ * List all available embedding preset names.
819
+ *
820
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
821
+ *
822
+ * # Returns
823
+ *
824
+ * Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
825
+ *
826
+ * # Example
827
+ *
828
+ * ```typescript
829
+ * import { listEmbeddingPresets } from '@kreuzberg/node';
830
+ *
831
+ * const presets = listEmbeddingPresets();
832
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
833
+ * ```
834
+ */
835
+ export declare function listEmbeddingPresets(): Array<string>
836
+
837
+ /**
838
+ * List all registered OCR backends.
839
+ *
840
+ * Returns an array of names of all currently registered OCR backends,
841
+ * including built-in backends like "tesseract".
842
+ *
843
+ * # Returns
844
+ *
845
+ * Array of OCR backend names.
846
+ *
847
+ * # Example
848
+ *
849
+ * ```typescript
850
+ * import { listOcrBackends } from '@kreuzberg/node';
851
+ *
852
+ * const backends = listOcrBackends();
853
+ * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
854
+ * ```
855
+ */
856
+ export declare function listOcrBackends(): Array<string>
857
+
858
+ /** List all registered postprocessors */
859
+ export declare function listPostProcessors(): Array<string>
860
+
861
+ /** List all registered validators */
862
+ export declare function listValidators(): Array<string>
863
+
864
+ /**
865
+ * Load extraction configuration from a file.
866
+ *
867
+ * Automatically detects the file format based on extension:
868
+ * - `.toml` - TOML format
869
+ * - `.yaml` - YAML format
870
+ * - `.json` - JSON format
871
+ *
872
+ * # Parameters
873
+ *
874
+ * * `file_path` - Path to the configuration file (absolute or relative)
875
+ *
876
+ * # Returns
877
+ *
878
+ * `JsExtractionConfig` object with loaded configuration.
879
+ *
880
+ * # Errors
881
+ *
882
+ * Throws an error if:
883
+ * - File does not exist or is not accessible
884
+ * - File content is not valid TOML/YAML/JSON
885
+ * - Configuration structure is invalid
886
+ *
887
+ * # Example
888
+ *
889
+ * ```typescript
890
+ * import { extractFile, loadExtractionConfigFromFile } from '@kreuzberg/node';
891
+ *
892
+ * // Load from TOML file
893
+ * const config = loadExtractionConfigFromFile('kreuzberg.toml');
894
+ *
895
+ * // Load from YAML file
896
+ * const config2 = loadExtractionConfigFromFile('./config.yaml');
897
+ *
898
+ * // Use with extraction
899
+ * const result = await extractFile('document.pdf', null, config);
900
+ * ```
901
+ */
902
+ export declare function loadExtractionConfigFromFile(filePath: string): JsExtractionConfig
903
+
904
+ /**
905
+ * Register a custom OCR backend
906
+ *
907
+ * Registers a JavaScript OCR backend that can process images and extract text.
908
+ *
909
+ * # Arguments
910
+ *
911
+ * * `backend` - JavaScript object with the following interface:
912
+ * - `name(): string` - Unique backend name
913
+ * - `supportedLanguages(): string[]` - Array of supported ISO 639-2/3 language codes
914
+ * - `processImage(imageBytes: string, language: string): Promise<result>` - Process image and return extraction result
915
+ *
916
+ * # Implementation Notes
917
+ *
918
+ * Due to NAPI ThreadsafeFunction limitations, the processImage function receives:
919
+ * - `imageBytes` as a Base64 string (first argument)
920
+ * - `language` as string (second argument)
921
+ *
922
+ * And must return a Promise resolving to a JSON-serializable object with:
923
+ * ```typescript
924
+ * {
925
+ * content: string,
926
+ * mime_type: string, // default: "text/plain"
927
+ * metadata: object, // default: {}
928
+ * tables: array // default: []
929
+ * }
930
+ * ```
931
+ *
932
+ * # Example
933
+ *
934
+ * ```typescript
935
+ * import { registerOcrBackend } from '@kreuzberg/node';
936
+ *
937
+ * registerOcrBackend({
938
+ * name: () => "my-ocr",
939
+ * supportedLanguages: () => ["eng", "deu", "fra"],
940
+ * processImage: async (imageBytes, language) => {
941
+ * const buffer = Buffer.from(imageBytes, "base64");
942
+ * const text = await myOcrLibrary.process(buffer, language);
943
+ * return {
944
+ * content: text,
945
+ * mime_type: "text/plain",
946
+ * metadata: { confidence: 0.95 },
947
+ * tables: []
948
+ * };
949
+ * }
950
+ * });
951
+ * ```
952
+ */
953
+ export declare function registerOcrBackend(backend: object): void
954
+
955
+ /**
956
+ * Register a custom postprocessor
957
+ *
958
+ * Registers a JavaScript PostProcessor that will be called after extraction.
959
+ *
960
+ * # Arguments
961
+ *
962
+ * * `processor` - JavaScript object with the following interface:
963
+ * - `name(): string` - Unique processor name
964
+ * - `process(...args): string` - Process function that receives JSON string as args\[0\]
965
+ * - `processingStage(): "early" | "middle" | "late"` - Optional processing stage
966
+ *
967
+ * # Implementation Notes
968
+ *
969
+ * Due to NAPI ThreadsafeFunction limitations, the process function receives the extraction
970
+ * result as a JSON string in args\[0\] and must return a JSON string. Use the TypeScript
971
+ * wrapper functions for a cleaner API.
972
+ *
973
+ * # Example
974
+ *
975
+ * ```typescript
976
+ * import { registerPostProcessor } from '@kreuzberg/node';
977
+ *
978
+ * registerPostProcessor({
979
+ * name: () => "word-counter",
980
+ * processingStage: () => "middle",
981
+ * process: (...args) => {
982
+ * const result = JSON.parse(args[0]);
983
+ * const wordCount = result.content.split(/\s+/).length;
984
+ * result.metadata.word_count = wordCount;
985
+ * return JSON.stringify(result);
986
+ * }
987
+ * });
988
+ * ```
989
+ */
990
+ export declare function registerPostProcessor(processor: object): void
991
+
992
+ /**
993
+ * Register a custom validator
994
+ *
995
+ * Registers a JavaScript Validator that will be called after extraction.
996
+ *
997
+ * # Arguments
998
+ *
999
+ * * `validator` - JavaScript object with the following interface:
1000
+ * - `name(): string` - Unique validator name
1001
+ * - `validate(...args): Promise<string>` - Validate function that receives JSON string as args\[0\]
1002
+ * - `priority(): number` - Optional priority (defaults to 50, higher runs first)
1003
+ *
1004
+ * # Implementation Notes
1005
+ *
1006
+ * Due to NAPI ThreadsafeFunction limitations, the validate function receives the extraction
1007
+ * result as a JSON string in args\[0\]. On success, return an empty string. On validation
1008
+ * failure, throw an error (the Promise should reject). Use the TypeScript wrapper functions
1009
+ * for a cleaner API.
1010
+ *
1011
+ * # Example
1012
+ *
1013
+ * ```typescript
1014
+ * import { registerValidator } from '@kreuzberg/node';
1015
+ *
1016
+ * registerValidator({
1017
+ * name: () => "min-length",
1018
+ * priority: () => 100,
1019
+ * validate: async (...args) => {
1020
+ * const result = JSON.parse(args[0]);
1021
+ * if (result.content.length < 100) {
1022
+ * throw new Error("ValidationError: Content too short");
1023
+ * }
1024
+ * return ""; // Success - return empty string
1025
+ * }
1026
+ * });
1027
+ * ```
1028
+ */
1029
+ export declare function registerValidator(validator: object): void
1030
+
1031
+ /**
1032
+ * Unregister a document extractor by name.
1033
+ *
1034
+ * Removes the specified document extractor from the registry. If the extractor
1035
+ * doesn't exist, this operation is a no-op (does not throw an error).
1036
+ *
1037
+ * # Parameters
1038
+ *
1039
+ * * `name` - Name of the document extractor to unregister
1040
+ *
1041
+ * # Example
1042
+ *
1043
+ * ```typescript
1044
+ * import { unregisterDocumentExtractor } from '@kreuzberg/node';
1045
+ *
1046
+ * // Unregister a custom extractor
1047
+ * unregisterDocumentExtractor('MyCustomExtractor');
1048
+ * ```
1049
+ */
1050
+ export declare function unregisterDocumentExtractor(name: string): void
1051
+
1052
+ /**
1053
+ * Unregister an OCR backend by name.
1054
+ *
1055
+ * Removes the specified OCR backend from the registry. If the backend doesn't exist,
1056
+ * this operation is a no-op (does not throw an error).
1057
+ *
1058
+ * # Parameters
1059
+ *
1060
+ * * `name` - Name of the OCR backend to unregister
1061
+ *
1062
+ * # Example
1063
+ *
1064
+ * ```typescript
1065
+ * import { unregisterOcrBackend } from '@kreuzberg/node';
1066
+ *
1067
+ * // Unregister a custom backend
1068
+ * unregisterOcrBackend('my-custom-ocr');
1069
+ * ```
1070
+ */
1071
+ export declare function unregisterOcrBackend(name: string): void
1072
+
1073
+ /** Unregister a postprocessor by name */
1074
+ export declare function unregisterPostProcessor(name: string): void
1075
+
1076
+ /** Unregister a validator by name */
1077
+ export declare function unregisterValidator(name: string): void
1078
+
1079
+ /**
1080
+ * Validate that a MIME type is supported by Kreuzberg.
1081
+ *
1082
+ * Checks if a MIME type is in the list of supported formats. Note that any
1083
+ * `image/*` MIME type is automatically considered valid.
1084
+ *
1085
+ * # Parameters
1086
+ *
1087
+ * * `mime_type` - The MIME type to validate (string)
1088
+ *
1089
+ * # Returns
1090
+ *
1091
+ * The validated MIME type (may be normalized).
1092
+ *
1093
+ * # Errors
1094
+ *
1095
+ * Throws an error if the MIME type is not supported.
1096
+ *
1097
+ * # Example
1098
+ *
1099
+ * ```typescript
1100
+ * import { validateMimeType } from '@kreuzberg/node';
1101
+ *
1102
+ * // Validate supported type
1103
+ * const validated = validateMimeType('application/pdf');
1104
+ * console.log(validated); // 'application/pdf'
1105
+ *
1106
+ * // Validate custom image type
1107
+ * const validated2 = validateMimeType('image/custom-format');
1108
+ * console.log(validated2); // 'image/custom-format' (any image/* is valid)
1109
+ *
1110
+ * // Validate unsupported type (throws error)
1111
+ * try {
1112
+ * validateMimeType('video/mp4');
1113
+ * } catch (err) {
1114
+ * console.error(err); // Error: Unsupported format: video/mp4
1115
+ * }
1116
+ * ```
1117
+ */
1118
+ export declare function validateMimeType(mimeType: string): string