@kreuzberg/node 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -135,33 +135,6 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
135
135
  */
136
136
  export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
137
137
 
138
- /**
139
- * Batch extract from multiple files (synchronous).
140
- *
141
- * Synchronously processes multiple files in parallel using Rayon. Significantly
142
- * faster than sequential processing for large batches.
143
- *
144
- * # Parameters
145
- *
146
- * * `paths` - Array of file paths to extract
147
- * * `config` - Optional extraction configuration (applied to all files)
148
- *
149
- * # Returns
150
- *
151
- * Array of `ExtractionResult` in the same order as input paths.
152
- *
153
- * # Example
154
- *
155
- * ```typescript
156
- * import { batchExtractFilesSync } from '@kreuzberg/node';
157
- *
158
- * const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
159
- * const results = batchExtractFilesSync(files, null);
160
- * results.forEach((result, i) => {
161
- * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
162
- * });
163
- * ```
164
- */
165
138
  export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
166
139
 
167
140
  export declare function classifyError(errorMessage: string): ErrorClassification
@@ -183,21 +156,7 @@ export declare function classifyError(errorMessage: string): ErrorClassification
183
156
  */
184
157
  export declare function clearDocumentExtractors(): void
185
158
 
186
- /**
187
- * Clear all registered OCR backends.
188
- *
189
- * Removes all OCR backends from the registry, including built-in backends.
190
- * Use with caution as this will make OCR functionality unavailable until
191
- * backends are re-registered.
192
- *
193
- * # Example
194
- *
195
- * ```typescript
196
- * import { clearOcrBackends } from 'kreuzberg';
197
- *
198
- * clearOcrBackends();
199
- * ```
200
- */
159
+ /** Clear all registered OCR backends */
201
160
  export declare function clearOcrBackends(): void
202
161
 
203
162
  /** Clear all registered postprocessors */
@@ -329,15 +288,14 @@ export declare function createWorkerPool(size?: number | undefined | null): JsWo
329
288
  * # Example
330
289
  *
331
290
  * ```typescript
332
- * import { detectMimeType } from 'kreuzberg';
291
+ * import { detectMimeTypeFromBytes } from 'kreuzberg';
333
292
  * import * as fs from 'fs';
334
293
  *
335
294
  * // Read file content
336
295
  * const content = fs.readFileSync('document.pdf');
337
296
  *
338
297
  * // Detect MIME type from bytes
339
- * const mimeType = detectMimeType(content);
340
- * console.log(mimeType); // 'application/pdf'
298
+ * const mimeType = detectMimeTypeFromBytes(content);
341
299
  * ```
342
300
  */
343
301
  export declare function detectMimeTypeFromBytes(bytes: Buffer): string
@@ -345,68 +303,83 @@ export declare function detectMimeTypeFromBytes(bytes: Buffer): string
345
303
  /**
346
304
  * Detect MIME type from a file path.
347
305
  *
348
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
349
- * if extension-based detection fails.
306
+ * Determines the MIME type based on the file extension in the provided path.
307
+ * By default, checks if the file exists; can be disabled with check_exists parameter.
350
308
  *
351
309
  * # Parameters
352
310
  *
353
- * * `path` - Path to the file (string)
354
- * * `check_exists` - Whether to verify file existence (default: true)
311
+ * * `path` - The file path to detect MIME type from (e.g., 'document.pdf')
312
+ * * `check_exists` - Whether to verify the file exists (default: true)
355
313
  *
356
314
  * # Returns
357
315
  *
358
- * The detected MIME type string.
316
+ * The detected MIME type as a string (e.g., 'application/pdf').
359
317
  *
360
318
  * # Errors
361
319
  *
362
- * Throws an error if:
363
- * - File doesn't exist (when check_exists is true)
364
- * - MIME type cannot be determined from path/extension
365
- * - Extension is unknown
320
+ * Throws an error if MIME type cannot be determined from the file extension,
321
+ * or if check_exists is true and the file does not exist.
366
322
  *
367
323
  * # Example
368
324
  *
369
325
  * ```typescript
370
326
  * import { detectMimeTypeFromPath } from 'kreuzberg';
371
327
  *
372
- * // Detect from existing file
373
- * const mimeType = detectMimeTypeFromPath('document.pdf');
374
- * console.log(mimeType); // 'application/pdf'
328
+ * // Detect MIME type from existing file
329
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
375
330
  *
376
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
377
- * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
331
+ * // Detect without checking file existence
332
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
378
333
  * ```
379
334
  */
380
335
  export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
381
336
 
382
337
  /**
383
- * Discover and load extraction configuration from current or parent directories.
338
+ * Discover extraction configuration file in current directory or parent directories.
384
339
  *
385
- * Searches for a `kreuzberg.toml` file starting from the current working directory
386
- * and traversing up the directory tree. Returns the first configuration file found.
340
+ * Searches for configuration files in the following order:
341
+ * 1. `kreuzberg.toml`
342
+ * 2. `kreuzberg.yaml` / `kreuzberg.yml`
343
+ * 3. `kreuzberg.json`
344
+ * 4. Searches parent directories up to the filesystem root
345
+ *
346
+ * Returns the first configuration file found or throws an error if none found.
387
347
  *
388
348
  * # Returns
389
349
  *
390
- * `JsExtractionConfig` object if a configuration file is found, or `null` if no
391
- * configuration file exists in the current or parent directories.
350
+ * `JsExtractionConfig` object with discovered configuration.
351
+ *
352
+ * # Errors
353
+ *
354
+ * Throws an error if no configuration file is found.
392
355
  *
393
356
  * # Example
394
357
  *
395
358
  * ```typescript
396
- * import { ExtractionConfig } from 'kreuzberg';
359
+ * import { discoverExtractionConfig } from 'kreuzberg';
397
360
  *
398
- * // Try to find config in current or parent directories
399
- * const config = ExtractionConfig.discover();
400
- * if (config) {
401
- * console.log('Found configuration');
402
- * // Use config for extraction
403
- * } else {
404
- * console.log('No configuration file found, using defaults');
405
- * }
361
+ * // Automatically finds kreuzberg.toml or kreuzberg.yaml in current or parent directories
362
+ * const config = discoverExtractionConfig();
363
+ * const result = await extractFile('document.pdf', null, config);
406
364
  * ```
407
365
  */
408
366
  export declare function discoverExtractionConfig(): JsExtractionConfig | null
409
367
 
368
+ export interface EmbeddingPreset {
369
+ /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
370
+ name: string
371
+ /** Recommended chunk size in characters */
372
+ chunkSize: number
373
+ /** Recommended overlap in characters */
374
+ overlap: number
375
+ /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
376
+ modelName: string
377
+ /** Embedding vector dimensions */
378
+ dimensions: number
379
+ /** Human-readable description of the preset */
380
+ description: string
381
+ }
382
+
410
383
  /**
411
384
  * Embedding preset configuration for TypeScript bindings.
412
385
  *
@@ -604,60 +577,42 @@ export declare function extractFile(filePath: string, mimeType?: string | undefi
604
577
  */
605
578
  export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, password?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
606
579
 
580
+ export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
581
+
607
582
  /**
608
- * Extract content from a file (synchronous).
583
+ * Get a specific embedding preset by name.
609
584
  *
610
- * Synchronously extracts text, tables, images, and metadata from a document file.
611
- * Supports 118+ file formats including PDFs, Office documents, images, and more.
585
+ * Returns a preset configuration object, or null if the preset name is not found.
612
586
  *
613
- * # Parameters
587
+ * # Arguments
614
588
  *
615
- * * `file_path` - Path to the file to extract (absolute or relative)
616
- * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
617
- * * `config` - Optional extraction configuration (OCR, chunking, etc.)
589
+ * * `name` - The preset name (case-sensitive)
618
590
  *
619
591
  * # Returns
620
592
  *
621
- * `ExtractionResult` containing:
622
- * - `content`: Extracted text content
623
- * - `mimeType`: Detected MIME type
624
- * - `metadata`: File metadata (author, title, etc.)
625
- * - `tables`: Extracted tables (if any)
626
- * - `images`: Extracted images (if configured)
627
- * - `chunks`: Text chunks (if chunking enabled)
628
- * - `detectedLanguages`: Detected languages (if enabled)
629
- *
630
- * # Errors
593
+ * An `EmbeddingPreset` object with the following properties:
594
+ * - `name`: string - Preset name
595
+ * - `chunkSize`: number - Recommended chunk size in characters
596
+ * - `overlap`: number - Recommended overlap in characters
597
+ * - `modelName`: string - Model identifier
598
+ * - `dimensions`: number - Embedding vector dimensions
599
+ * - `description`: string - Human-readable description
631
600
  *
632
- * Throws an error if:
633
- * - File does not exist or is not accessible
634
- * - File format is unsupported
635
- * - File is corrupted or malformed
636
- * - OCR processing fails (if enabled)
601
+ * Returns `null` if preset name is not found.
637
602
  *
638
603
  * # Example
639
604
  *
640
605
  * ```typescript
641
- * import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
642
- *
643
- * // Basic extraction
644
- * const result = extractFileSync('document.pdf', null, null);
645
- * console.log(result.content);
646
- *
647
- * // With MIME type hint
648
- * const result2 = extractFileSync('file.bin', 'application/pdf', null);
606
+ * import { getEmbeddingPreset } from 'kreuzberg';
649
607
  *
650
- * // With OCR enabled
651
- * const config: ExtractionConfig = {
652
- * ocr: {
653
- * backend: 'tesseract',
654
- * language: 'eng',
655
- * }
656
- * };
657
- * const result3 = extractFileSync('scanned.pdf', null, config);
608
+ * const preset = getEmbeddingPreset('balanced');
609
+ * if (preset) {
610
+ * console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
611
+ * // Model: BGEBaseENV15, Dims: 768
612
+ * }
658
613
  * ```
659
614
  */
660
- export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
615
+ export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
661
616
 
662
617
  /**
663
618
  * Get a specific embedding preset by name.
@@ -1195,25 +1150,6 @@ export interface JsYakeParams {
1195
1150
  windowSize?: number
1196
1151
  }
1197
1152
 
1198
- /**
1199
- * List all registered document extractors.
1200
- *
1201
- * Returns an array of names of all currently registered document extractors,
1202
- * including built-in extractors for PDF, Office documents, images, etc.
1203
- *
1204
- * # Returns
1205
- *
1206
- * Array of document extractor names.
1207
- *
1208
- * # Example
1209
- *
1210
- * ```typescript
1211
- * import { listDocumentExtractors } from 'kreuzberg';
1212
- *
1213
- * const extractors = listDocumentExtractors();
1214
- * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
1215
- * ```
1216
- */
1217
1153
  export declare function listDocumentExtractors(): Array<string>
1218
1154
 
1219
1155
  /**
@@ -1237,24 +1173,26 @@ export declare function listDocumentExtractors(): Array<string>
1237
1173
  export declare function listEmbeddingPresets(): Array<string>
1238
1174
 
1239
1175
  /**
1240
- * List all registered OCR backends.
1176
+ * List all available embedding preset names.
1241
1177
  *
1242
- * Returns an array of names of all currently registered OCR backends,
1243
- * including built-in backends like "tesseract".
1178
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
1244
1179
  *
1245
1180
  * # Returns
1246
1181
  *
1247
- * Array of OCR backend names.
1182
+ * Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
1248
1183
  *
1249
1184
  * # Example
1250
1185
  *
1251
1186
  * ```typescript
1252
- * import { listOcrBackends } from 'kreuzberg';
1187
+ * import { listEmbeddingPresets } from 'kreuzberg';
1253
1188
  *
1254
- * const backends = listOcrBackends();
1255
- * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
1189
+ * const presets = listEmbeddingPresets();
1190
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
1256
1191
  * ```
1257
1192
  */
1193
+ export declare function listEmbeddingPresets(): Array<string>
1194
+
1195
+ /** List all registered OCR backends */
1258
1196
  export declare function listOcrBackends(): Array<string>
1259
1197
 
1260
1198
  /** List all registered post-processors */
@@ -1451,25 +1389,7 @@ export declare function registerValidator(validator: object): void
1451
1389
  */
1452
1390
  export declare function unregisterDocumentExtractor(name: string): void
1453
1391
 
1454
- /**
1455
- * Unregister an OCR backend by name.
1456
- *
1457
- * Removes the specified OCR backend from the registry. If the backend doesn't exist,
1458
- * this operation is a no-op (does not throw an error).
1459
- *
1460
- * # Parameters
1461
- *
1462
- * * `name` - Name of the OCR backend to unregister
1463
- *
1464
- * # Example
1465
- *
1466
- * ```typescript
1467
- * import { unregisterOcrBackend } from 'kreuzberg';
1468
- *
1469
- * // Unregister a custom backend
1470
- * unregisterOcrBackend('my-custom-ocr');
1471
- * ```
1472
- */
1392
+ /** Unregister an OCR backend by name */
1473
1393
  export declare function unregisterOcrBackend(name: string): void
1474
1394
 
1475
1395
  /** Unregister a postprocessor by name */
@@ -1623,27 +1543,6 @@ export declare function validateLanguageCode(code: string): boolean
1623
1543
  * # Errors
1624
1544
  *
1625
1545
  * Throws an error if the MIME type is not supported.
1626
- *
1627
- * # Example
1628
- *
1629
- * ```typescript
1630
- * import { validateMimeType } from 'kreuzberg';
1631
- *
1632
- * // Validate supported type
1633
- * const validated = validateMimeType('application/pdf');
1634
- * console.log(validated); // 'application/pdf'
1635
- *
1636
- * // Validate custom image type
1637
- * const validated2 = validateMimeType('image/custom-format');
1638
- * console.log(validated2); // 'image/custom-format' (any image/* is valid)
1639
- *
1640
- * // Validate unsupported type (throws error)
1641
- * try {
1642
- * validateMimeType('video/mp4');
1643
- * } catch (err) {
1644
- * console.error(err); // Error: Unsupported format: video/mp4
1645
- * }
1646
- * ```
1647
1546
  */
1648
1547
  export declare function validateMimeType(mimeType: string): string
1649
1548