@kreuzberg/node 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +13 -5
- package/dist/cli.mjs.map +1 -1
- package/dist/errors.js +26 -24
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs +25 -24
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +608 -535
- package/dist/index.d.ts +608 -535
- package/dist/index.js +682 -338
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +662 -334
- package/dist/index.mjs.map +1 -1
- package/dist/ocr/guten-ocr.js +4 -2
- package/dist/ocr/guten-ocr.js.map +1 -1
- package/dist/ocr/guten-ocr.mjs +3 -2
- package/dist/ocr/guten-ocr.mjs.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/index.d.ts +77 -178
- package/index.js +54 -52
- package/package.json +7 -7
package/index.d.ts
CHANGED
|
@@ -135,33 +135,6 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
|
|
|
135
135
|
*/
|
|
136
136
|
export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
|
|
137
137
|
|
|
138
|
-
/**
|
|
139
|
-
* Batch extract from multiple files (synchronous).
|
|
140
|
-
*
|
|
141
|
-
* Synchronously processes multiple files in parallel using Rayon. Significantly
|
|
142
|
-
* faster than sequential processing for large batches.
|
|
143
|
-
*
|
|
144
|
-
* # Parameters
|
|
145
|
-
*
|
|
146
|
-
* * `paths` - Array of file paths to extract
|
|
147
|
-
* * `config` - Optional extraction configuration (applied to all files)
|
|
148
|
-
*
|
|
149
|
-
* # Returns
|
|
150
|
-
*
|
|
151
|
-
* Array of `ExtractionResult` in the same order as input paths.
|
|
152
|
-
*
|
|
153
|
-
* # Example
|
|
154
|
-
*
|
|
155
|
-
* ```typescript
|
|
156
|
-
* import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
157
|
-
*
|
|
158
|
-
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
|
|
159
|
-
* const results = batchExtractFilesSync(files, null);
|
|
160
|
-
* results.forEach((result, i) => {
|
|
161
|
-
* console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
|
|
162
|
-
* });
|
|
163
|
-
* ```
|
|
164
|
-
*/
|
|
165
138
|
export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
|
|
166
139
|
|
|
167
140
|
export declare function classifyError(errorMessage: string): ErrorClassification
|
|
@@ -183,21 +156,7 @@ export declare function classifyError(errorMessage: string): ErrorClassification
|
|
|
183
156
|
*/
|
|
184
157
|
export declare function clearDocumentExtractors(): void
|
|
185
158
|
|
|
186
|
-
/**
|
|
187
|
-
* Clear all registered OCR backends.
|
|
188
|
-
*
|
|
189
|
-
* Removes all OCR backends from the registry, including built-in backends.
|
|
190
|
-
* Use with caution as this will make OCR functionality unavailable until
|
|
191
|
-
* backends are re-registered.
|
|
192
|
-
*
|
|
193
|
-
* # Example
|
|
194
|
-
*
|
|
195
|
-
* ```typescript
|
|
196
|
-
* import { clearOcrBackends } from 'kreuzberg';
|
|
197
|
-
*
|
|
198
|
-
* clearOcrBackends();
|
|
199
|
-
* ```
|
|
200
|
-
*/
|
|
159
|
+
/** Clear all registered OCR backends */
|
|
201
160
|
export declare function clearOcrBackends(): void
|
|
202
161
|
|
|
203
162
|
/** Clear all registered postprocessors */
|
|
@@ -329,15 +288,14 @@ export declare function createWorkerPool(size?: number | undefined | null): JsWo
|
|
|
329
288
|
* # Example
|
|
330
289
|
*
|
|
331
290
|
* ```typescript
|
|
332
|
-
* import {
|
|
291
|
+
* import { detectMimeTypeFromBytes } from 'kreuzberg';
|
|
333
292
|
* import * as fs from 'fs';
|
|
334
293
|
*
|
|
335
294
|
* // Read file content
|
|
336
295
|
* const content = fs.readFileSync('document.pdf');
|
|
337
296
|
*
|
|
338
297
|
* // Detect MIME type from bytes
|
|
339
|
-
* const mimeType =
|
|
340
|
-
* console.log(mimeType); // 'application/pdf'
|
|
298
|
+
* const mimeType = detectMimeTypeFromBytes(content);
|
|
341
299
|
* ```
|
|
342
300
|
*/
|
|
343
301
|
export declare function detectMimeTypeFromBytes(bytes: Buffer): string
|
|
@@ -345,68 +303,83 @@ export declare function detectMimeTypeFromBytes(bytes: Buffer): string
|
|
|
345
303
|
/**
|
|
346
304
|
* Detect MIME type from a file path.
|
|
347
305
|
*
|
|
348
|
-
*
|
|
349
|
-
* if
|
|
306
|
+
* Determines the MIME type based on the file extension in the provided path.
|
|
307
|
+
* By default, checks if the file exists; can be disabled with check_exists parameter.
|
|
350
308
|
*
|
|
351
309
|
* # Parameters
|
|
352
310
|
*
|
|
353
|
-
* * `path` -
|
|
354
|
-
* * `check_exists` - Whether to verify file
|
|
311
|
+
* * `path` - The file path to detect MIME type from (e.g., 'document.pdf')
|
|
312
|
+
* * `check_exists` - Whether to verify the file exists (default: true)
|
|
355
313
|
*
|
|
356
314
|
* # Returns
|
|
357
315
|
*
|
|
358
|
-
* The detected MIME type string.
|
|
316
|
+
* The detected MIME type as a string (e.g., 'application/pdf').
|
|
359
317
|
*
|
|
360
318
|
* # Errors
|
|
361
319
|
*
|
|
362
|
-
* Throws an error if
|
|
363
|
-
*
|
|
364
|
-
* - MIME type cannot be determined from path/extension
|
|
365
|
-
* - Extension is unknown
|
|
320
|
+
* Throws an error if MIME type cannot be determined from the file extension,
|
|
321
|
+
* or if check_exists is true and the file does not exist.
|
|
366
322
|
*
|
|
367
323
|
* # Example
|
|
368
324
|
*
|
|
369
325
|
* ```typescript
|
|
370
326
|
* import { detectMimeTypeFromPath } from 'kreuzberg';
|
|
371
327
|
*
|
|
372
|
-
* // Detect from existing file
|
|
373
|
-
* const mimeType = detectMimeTypeFromPath('document.pdf');
|
|
374
|
-
* console.log(mimeType); // 'application/pdf'
|
|
328
|
+
* // Detect MIME type from existing file
|
|
329
|
+
* const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
|
|
375
330
|
*
|
|
376
|
-
*
|
|
377
|
-
*
|
|
331
|
+
* // Detect without checking file existence
|
|
332
|
+
* const mimeType2 = detectMimeTypeFromPath('document.docx', false);
|
|
378
333
|
* ```
|
|
379
334
|
*/
|
|
380
335
|
export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
|
|
381
336
|
|
|
382
337
|
/**
|
|
383
|
-
* Discover
|
|
338
|
+
* Discover extraction configuration file in current directory or parent directories.
|
|
384
339
|
*
|
|
385
|
-
* Searches for
|
|
386
|
-
*
|
|
340
|
+
* Searches for configuration files in the following order:
|
|
341
|
+
* 1. `kreuzberg.toml`
|
|
342
|
+
* 2. `kreuzberg.yaml` / `kreuzberg.yml`
|
|
343
|
+
* 3. `kreuzberg.json`
|
|
344
|
+
* 4. Searches parent directories up to the filesystem root
|
|
345
|
+
*
|
|
346
|
+
* Returns the first configuration file found or throws an error if none found.
|
|
387
347
|
*
|
|
388
348
|
* # Returns
|
|
389
349
|
*
|
|
390
|
-
* `JsExtractionConfig` object
|
|
391
|
-
*
|
|
350
|
+
* `JsExtractionConfig` object with discovered configuration.
|
|
351
|
+
*
|
|
352
|
+
* # Errors
|
|
353
|
+
*
|
|
354
|
+
* Throws an error if no configuration file is found.
|
|
392
355
|
*
|
|
393
356
|
* # Example
|
|
394
357
|
*
|
|
395
358
|
* ```typescript
|
|
396
|
-
* import {
|
|
359
|
+
* import { discoverExtractionConfig } from 'kreuzberg';
|
|
397
360
|
*
|
|
398
|
-
* //
|
|
399
|
-
* const config =
|
|
400
|
-
*
|
|
401
|
-
* console.log('Found configuration');
|
|
402
|
-
* // Use config for extraction
|
|
403
|
-
* } else {
|
|
404
|
-
* console.log('No configuration file found, using defaults');
|
|
405
|
-
* }
|
|
361
|
+
* // Automatically finds kreuzberg.toml or kreuzberg.yaml in current or parent directories
|
|
362
|
+
* const config = discoverExtractionConfig();
|
|
363
|
+
* const result = await extractFile('document.pdf', null, config);
|
|
406
364
|
* ```
|
|
407
365
|
*/
|
|
408
366
|
export declare function discoverExtractionConfig(): JsExtractionConfig | null
|
|
409
367
|
|
|
368
|
+
export interface EmbeddingPreset {
|
|
369
|
+
/** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
|
|
370
|
+
name: string
|
|
371
|
+
/** Recommended chunk size in characters */
|
|
372
|
+
chunkSize: number
|
|
373
|
+
/** Recommended overlap in characters */
|
|
374
|
+
overlap: number
|
|
375
|
+
/** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
|
|
376
|
+
modelName: string
|
|
377
|
+
/** Embedding vector dimensions */
|
|
378
|
+
dimensions: number
|
|
379
|
+
/** Human-readable description of the preset */
|
|
380
|
+
description: string
|
|
381
|
+
}
|
|
382
|
+
|
|
410
383
|
/**
|
|
411
384
|
* Embedding preset configuration for TypeScript bindings.
|
|
412
385
|
*
|
|
@@ -604,60 +577,42 @@ export declare function extractFile(filePath: string, mimeType?: string | undefi
|
|
|
604
577
|
*/
|
|
605
578
|
export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, password?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
|
|
606
579
|
|
|
580
|
+
export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
|
|
581
|
+
|
|
607
582
|
/**
|
|
608
|
-
*
|
|
583
|
+
* Get a specific embedding preset by name.
|
|
609
584
|
*
|
|
610
|
-
*
|
|
611
|
-
* Supports 118+ file formats including PDFs, Office documents, images, and more.
|
|
585
|
+
* Returns a preset configuration object, or null if the preset name is not found.
|
|
612
586
|
*
|
|
613
|
-
* #
|
|
587
|
+
* # Arguments
|
|
614
588
|
*
|
|
615
|
-
* * `
|
|
616
|
-
* * `mime_type` - Optional MIME type hint (auto-detected if omitted)
|
|
617
|
-
* * `config` - Optional extraction configuration (OCR, chunking, etc.)
|
|
589
|
+
* * `name` - The preset name (case-sensitive)
|
|
618
590
|
*
|
|
619
591
|
* # Returns
|
|
620
592
|
*
|
|
621
|
-
* `
|
|
622
|
-
* - `
|
|
623
|
-
* - `
|
|
624
|
-
* - `
|
|
625
|
-
* - `
|
|
626
|
-
* - `
|
|
627
|
-
* - `
|
|
628
|
-
* - `detectedLanguages`: Detected languages (if enabled)
|
|
629
|
-
*
|
|
630
|
-
* # Errors
|
|
593
|
+
* An `EmbeddingPreset` object with the following properties:
|
|
594
|
+
* - `name`: string - Preset name
|
|
595
|
+
* - `chunkSize`: number - Recommended chunk size in characters
|
|
596
|
+
* - `overlap`: number - Recommended overlap in characters
|
|
597
|
+
* - `modelName`: string - Model identifier
|
|
598
|
+
* - `dimensions`: number - Embedding vector dimensions
|
|
599
|
+
* - `description`: string - Human-readable description
|
|
631
600
|
*
|
|
632
|
-
*
|
|
633
|
-
* - File does not exist or is not accessible
|
|
634
|
-
* - File format is unsupported
|
|
635
|
-
* - File is corrupted or malformed
|
|
636
|
-
* - OCR processing fails (if enabled)
|
|
601
|
+
* Returns `null` if preset name is not found.
|
|
637
602
|
*
|
|
638
603
|
* # Example
|
|
639
604
|
*
|
|
640
605
|
* ```typescript
|
|
641
|
-
* import {
|
|
642
|
-
*
|
|
643
|
-
* // Basic extraction
|
|
644
|
-
* const result = extractFileSync('document.pdf', null, null);
|
|
645
|
-
* console.log(result.content);
|
|
646
|
-
*
|
|
647
|
-
* // With MIME type hint
|
|
648
|
-
* const result2 = extractFileSync('file.bin', 'application/pdf', null);
|
|
606
|
+
* import { getEmbeddingPreset } from 'kreuzberg';
|
|
649
607
|
*
|
|
650
|
-
*
|
|
651
|
-
*
|
|
652
|
-
*
|
|
653
|
-
*
|
|
654
|
-
*
|
|
655
|
-
* }
|
|
656
|
-
* };
|
|
657
|
-
* const result3 = extractFileSync('scanned.pdf', null, config);
|
|
608
|
+
* const preset = getEmbeddingPreset('balanced');
|
|
609
|
+
* if (preset) {
|
|
610
|
+
* console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
|
|
611
|
+
* // Model: BGEBaseENV15, Dims: 768
|
|
612
|
+
* }
|
|
658
613
|
* ```
|
|
659
614
|
*/
|
|
660
|
-
export declare function
|
|
615
|
+
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
|
|
661
616
|
|
|
662
617
|
/**
|
|
663
618
|
* Get a specific embedding preset by name.
|
|
@@ -1195,25 +1150,6 @@ export interface JsYakeParams {
|
|
|
1195
1150
|
windowSize?: number
|
|
1196
1151
|
}
|
|
1197
1152
|
|
|
1198
|
-
/**
|
|
1199
|
-
* List all registered document extractors.
|
|
1200
|
-
*
|
|
1201
|
-
* Returns an array of names of all currently registered document extractors,
|
|
1202
|
-
* including built-in extractors for PDF, Office documents, images, etc.
|
|
1203
|
-
*
|
|
1204
|
-
* # Returns
|
|
1205
|
-
*
|
|
1206
|
-
* Array of document extractor names.
|
|
1207
|
-
*
|
|
1208
|
-
* # Example
|
|
1209
|
-
*
|
|
1210
|
-
* ```typescript
|
|
1211
|
-
* import { listDocumentExtractors } from 'kreuzberg';
|
|
1212
|
-
*
|
|
1213
|
-
* const extractors = listDocumentExtractors();
|
|
1214
|
-
* console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
|
|
1215
|
-
* ```
|
|
1216
|
-
*/
|
|
1217
1153
|
export declare function listDocumentExtractors(): Array<string>
|
|
1218
1154
|
|
|
1219
1155
|
/**
|
|
@@ -1237,24 +1173,26 @@ export declare function listDocumentExtractors(): Array<string>
|
|
|
1237
1173
|
export declare function listEmbeddingPresets(): Array<string>
|
|
1238
1174
|
|
|
1239
1175
|
/**
|
|
1240
|
-
* List all
|
|
1176
|
+
* List all available embedding preset names.
|
|
1241
1177
|
*
|
|
1242
|
-
* Returns an array of names
|
|
1243
|
-
* including built-in backends like "tesseract".
|
|
1178
|
+
* Returns an array of preset names that can be used with `getEmbeddingPreset`.
|
|
1244
1179
|
*
|
|
1245
1180
|
* # Returns
|
|
1246
1181
|
*
|
|
1247
|
-
* Array of
|
|
1182
|
+
* Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
|
|
1248
1183
|
*
|
|
1249
1184
|
* # Example
|
|
1250
1185
|
*
|
|
1251
1186
|
* ```typescript
|
|
1252
|
-
* import {
|
|
1187
|
+
* import { listEmbeddingPresets } from 'kreuzberg';
|
|
1253
1188
|
*
|
|
1254
|
-
* const
|
|
1255
|
-
* console.log(
|
|
1189
|
+
* const presets = listEmbeddingPresets();
|
|
1190
|
+
* console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
|
|
1256
1191
|
* ```
|
|
1257
1192
|
*/
|
|
1193
|
+
export declare function listEmbeddingPresets(): Array<string>
|
|
1194
|
+
|
|
1195
|
+
/** List all registered OCR backends */
|
|
1258
1196
|
export declare function listOcrBackends(): Array<string>
|
|
1259
1197
|
|
|
1260
1198
|
/** List all registered post-processors */
|
|
@@ -1451,25 +1389,7 @@ export declare function registerValidator(validator: object): void
|
|
|
1451
1389
|
*/
|
|
1452
1390
|
export declare function unregisterDocumentExtractor(name: string): void
|
|
1453
1391
|
|
|
1454
|
-
/**
|
|
1455
|
-
* Unregister an OCR backend by name.
|
|
1456
|
-
*
|
|
1457
|
-
* Removes the specified OCR backend from the registry. If the backend doesn't exist,
|
|
1458
|
-
* this operation is a no-op (does not throw an error).
|
|
1459
|
-
*
|
|
1460
|
-
* # Parameters
|
|
1461
|
-
*
|
|
1462
|
-
* * `name` - Name of the OCR backend to unregister
|
|
1463
|
-
*
|
|
1464
|
-
* # Example
|
|
1465
|
-
*
|
|
1466
|
-
* ```typescript
|
|
1467
|
-
* import { unregisterOcrBackend } from 'kreuzberg';
|
|
1468
|
-
*
|
|
1469
|
-
* // Unregister a custom backend
|
|
1470
|
-
* unregisterOcrBackend('my-custom-ocr');
|
|
1471
|
-
* ```
|
|
1472
|
-
*/
|
|
1392
|
+
/** Unregister an OCR backend by name */
|
|
1473
1393
|
export declare function unregisterOcrBackend(name: string): void
|
|
1474
1394
|
|
|
1475
1395
|
/** Unregister a postprocessor by name */
|
|
@@ -1623,27 +1543,6 @@ export declare function validateLanguageCode(code: string): boolean
|
|
|
1623
1543
|
* # Errors
|
|
1624
1544
|
*
|
|
1625
1545
|
* Throws an error if the MIME type is not supported.
|
|
1626
|
-
*
|
|
1627
|
-
* # Example
|
|
1628
|
-
*
|
|
1629
|
-
* ```typescript
|
|
1630
|
-
* import { validateMimeType } from 'kreuzberg';
|
|
1631
|
-
*
|
|
1632
|
-
* // Validate supported type
|
|
1633
|
-
* const validated = validateMimeType('application/pdf');
|
|
1634
|
-
* console.log(validated); // 'application/pdf'
|
|
1635
|
-
*
|
|
1636
|
-
* // Validate custom image type
|
|
1637
|
-
* const validated2 = validateMimeType('image/custom-format');
|
|
1638
|
-
* console.log(validated2); // 'image/custom-format' (any image/* is valid)
|
|
1639
|
-
*
|
|
1640
|
-
* // Validate unsupported type (throws error)
|
|
1641
|
-
* try {
|
|
1642
|
-
* validateMimeType('video/mp4');
|
|
1643
|
-
* } catch (err) {
|
|
1644
|
-
* console.error(err); // Error: Unsupported format: video/mp4
|
|
1645
|
-
* }
|
|
1646
|
-
* ```
|
|
1647
1546
|
*/
|
|
1648
1547
|
export declare function validateMimeType(mimeType: string): string
|
|
1649
1548
|
|