@kreuzberg/node 4.0.0-rc.14 → 4.0.0-rc.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -125,6 +125,8 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
125
125
  */
126
126
  export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
127
127
 
128
+ export declare function classifyError(errorMessage: string): ErrorClassification
129
+
128
130
  /**
129
131
  * Clear all registered document extractors.
130
132
  *
@@ -165,6 +167,56 @@ export declare function clearPostProcessors(): void
165
167
  /** Clear all registered validators */
166
168
  export declare function clearValidators(): void
167
169
 
170
+ /**
171
+ * Get a specific field from a config (represented as a JSON string) by name via FFI.
172
+ *
173
+ * Retrieves a configuration field by path, supporting nested access with
174
+ * dot notation (e.g., "ocr.backend"). Returns the field value as a JSON string.
175
+ *
176
+ * # Arguments
177
+ *
178
+ * * `jsonStr` - A JSON string representation of the configuration
179
+ * * `fieldName` - The field path to retrieve (e.g., "useCache", "ocr.backend")
180
+ *
181
+ * # Returns
182
+ *
183
+ * The field value as a JSON string, or null if not found
184
+ */
185
+ export declare function configGetFieldInternal(jsonStr: string, fieldName: string): string | null
186
+
187
+ /**
188
+ * Merge two configs (override takes precedence over base) via FFI.
189
+ *
190
+ * Performs a shallow merge where fields from the override config take
191
+ * precedence over fields in the base config.
192
+ *
193
+ * # Arguments
194
+ *
195
+ * * `baseJson` - A JSON string representation of the base ExtractionConfig
196
+ * * `overrideJson` - A JSON string representation of the override ExtractionConfig
197
+ *
198
+ * # Returns
199
+ *
200
+ * The merged configuration as a JSON string; throws an error if merging fails
201
+ */
202
+ export declare function configMergeInternal(baseJson: string, overrideJson: string): string
203
+
204
+ /**
205
+ * Validate and normalize an ExtractionConfig JSON string via FFI.
206
+ *
207
+ * This validates the JSON and returns a normalized version, using the shared
208
+ * FFI layer to ensure consistent validation across all language bindings.
209
+ *
210
+ * # Arguments
211
+ *
212
+ * * `jsonStr` - A JSON string containing the configuration
213
+ *
214
+ * # Returns
215
+ *
216
+ * The normalized JSON string representation of the config; throws an error if validation fails
217
+ */
218
+ export declare function configValidateAndNormalize(jsonStr: string): string
219
+
168
220
  /**
169
221
  * Detect MIME type from raw bytes.
170
222
  *
@@ -285,6 +337,53 @@ export interface EmbeddingPreset {
285
337
  description: string
286
338
  }
287
339
 
340
+ /**
341
+ * Classifies an error message string into an error code category.
342
+ *
343
+ * This function analyzes the error message content and returns the most likely
344
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
345
+ * errors for handling purposes.
346
+ *
347
+ * # Arguments
348
+ *
349
+ * * `errorMessage` - The error message string to classify
350
+ *
351
+ * # Returns
352
+ *
353
+ * An object with:
354
+ * - `code`: The numeric error code (0-7)
355
+ * - `name`: The error code name string
356
+ * - `description`: Brief description of the error type
357
+ * - `confidence`: Confidence score (0.0-1.0) of the classification
358
+ *
359
+ * # Classification Rules
360
+ *
361
+ * - **Validation (0)**: Keywords: invalid, validation, invalid_argument, schema, required, unexpected field
362
+ * - **Parsing (1)**: Keywords: parsing, parse_error, corrupted, malformed, invalid format, decode, encoding
363
+ * - **Ocr (2)**: Keywords: ocr, optical, character, recognition, tesseract, language, model
364
+ * - **MissingDependency (3)**: Keywords: not found, not installed, missing, dependency, require, unavailable
365
+ * - **Io (4)**: Keywords: io, file, disk, read, write, permission, access, path
366
+ * - **Plugin (5)**: Keywords: plugin, register, extension, handler, processor
367
+ * - **UnsupportedFormat (6)**: Keywords: unsupported, format, mime, type, codec
368
+ * - **Internal (7)**: Keywords: internal, bug, panic, unexpected, invariant
369
+ *
370
+ * # Examples
371
+ *
372
+ * ```typescript
373
+ * const parsingResult = classifyError("PDF file is corrupted");
374
+ * // Returns: { code: 1, name: "parsing", description: "...", confidence: 0.95 }
375
+ *
376
+ * const dependencyResult = classifyError("Tesseract not found");
377
+ * // Returns: { code: 3, name: "missing_dependency", description: "...", confidence: 0.9 }
378
+ * ```
379
+ */
380
+ export interface ErrorClassification {
381
+ code: number
382
+ name: string
383
+ description: string
384
+ confidence: number
385
+ }
386
+
288
387
  /**
289
388
  * Extract content from bytes (asynchronous).
290
389
  *
@@ -473,6 +572,52 @@ export declare function extractFileSync(filePath: string, mimeType?: string | un
473
572
  */
474
573
  export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
475
574
 
575
+ /**
576
+ * Returns the description for an error code.
577
+ *
578
+ * Maps to FFI function kreuzberg_error_code_description().
579
+ *
580
+ * # Arguments
581
+ *
582
+ * * `code` - Numeric error code (0-7)
583
+ *
584
+ * # Returns
585
+ *
586
+ * A string containing a brief description of the error
587
+ *
588
+ * # Examples
589
+ *
590
+ * ```typescript
591
+ * const validationDesc = getErrorCodeDescription(0); // returns "Input validation error"
592
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
593
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
594
+ * ```
595
+ */
596
+ export declare function getErrorCodeDescription(code: number): string
597
+
598
+ /**
599
+ * Returns the human-readable name for an error code.
600
+ *
601
+ * Maps to FFI function kreuzberg_error_code_name().
602
+ *
603
+ * # Arguments
604
+ *
605
+ * * `code` - Numeric error code (0-7)
606
+ *
607
+ * # Returns
608
+ *
609
+ * A string containing the error code name (e.g., "validation", "ocr", "unknown")
610
+ *
611
+ * # Examples
612
+ *
613
+ * ```typescript
614
+ * const validationName = getErrorCodeName(0); // returns "validation"
615
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
616
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
617
+ * ```
618
+ */
619
+ export declare function getErrorCodeName(code: number): string
620
+
476
621
  /**
477
622
  * Get file extensions for a given MIME type.
478
623
  *
@@ -580,6 +725,86 @@ export declare function getLastErrorCode(): number
580
725
  */
581
726
  export declare function getLastPanicContext(): any | null
582
727
 
728
+ /**
729
+ * Get valid binarization methods.
730
+ *
731
+ * Returns a list of all valid binarization method values.
732
+ *
733
+ * # Returns
734
+ *
735
+ * Array of valid binarization methods: ["otsu", "adaptive", "sauvola"]
736
+ *
737
+ * # Example
738
+ *
739
+ * ```typescript
740
+ * import { getValidBinarizationMethods } from '@kreuzberg/node';
741
+ *
742
+ * const methods = getValidBinarizationMethods();
743
+ * console.log(methods); // ['otsu', 'adaptive', 'sauvola']
744
+ * ```
745
+ */
746
+ export declare function getValidBinarizationMethods(): Array<string>
747
+
748
+ /**
749
+ * Get valid language codes.
750
+ *
751
+ * Returns a list of all valid language codes in ISO 639-1 and 639-3 formats.
752
+ *
753
+ * # Returns
754
+ *
755
+ * Array of valid language codes (both 2-letter and 3-letter codes)
756
+ *
757
+ * # Example
758
+ *
759
+ * ```typescript
760
+ * import { getValidLanguageCodes } from '@kreuzberg/node';
761
+ *
762
+ * const codes = getValidLanguageCodes();
763
+ * console.log(codes); // ['en', 'de', 'fr', ..., 'eng', 'deu', 'fra', ...]
764
+ * ```
765
+ */
766
+ export declare function getValidLanguageCodes(): Array<string>
767
+
768
+ /**
769
+ * Get valid OCR backends.
770
+ *
771
+ * Returns a list of all valid OCR backend values.
772
+ *
773
+ * # Returns
774
+ *
775
+ * Array of valid OCR backends: ["tesseract", "easyocr", "paddleocr"]
776
+ *
777
+ * # Example
778
+ *
779
+ * ```typescript
780
+ * import { getValidOcrBackends } from '@kreuzberg/node';
781
+ *
782
+ * const backends = getValidOcrBackends();
783
+ * console.log(backends); // ['tesseract', 'easyocr', 'paddleocr']
784
+ * ```
785
+ */
786
+ export declare function getValidOcrBackends(): Array<string>
787
+
788
+ /**
789
+ * Get valid token reduction levels.
790
+ *
791
+ * Returns a list of all valid token reduction level values.
792
+ *
793
+ * # Returns
794
+ *
795
+ * Array of valid levels: ["off", "light", "moderate", "aggressive", "maximum"]
796
+ *
797
+ * # Example
798
+ *
799
+ * ```typescript
800
+ * import { getValidTokenReductionLevels } from '@kreuzberg/node';
801
+ *
802
+ * const levels = getValidTokenReductionLevels();
803
+ * console.log(levels); // ['off', 'light', 'moderate', 'aggressive', 'maximum']
804
+ * ```
805
+ */
806
+ export declare function getValidTokenReductionLevels(): Array<string>
807
+
583
808
  export interface JsChunk {
584
809
  content: string
585
810
  embedding?: number[] | undefined
@@ -1076,6 +1301,134 @@ export declare function unregisterPostProcessor(name: string): void
1076
1301
  /** Unregister a validator by name */
1077
1302
  export declare function unregisterValidator(name: string): void
1078
1303
 
1304
+ /**
1305
+ * Validates a binarization method string.
1306
+ *
1307
+ * Valid methods: "otsu", "adaptive", "sauvola"
1308
+ *
1309
+ * # Arguments
1310
+ *
1311
+ * * `method` - The binarization method to validate
1312
+ *
1313
+ * # Returns
1314
+ *
1315
+ * `true` if valid, `false` if invalid.
1316
+ *
1317
+ * # Example
1318
+ *
1319
+ * ```typescript
1320
+ * import { validateBinarizationMethod } from '@kreuzberg/node';
1321
+ *
1322
+ * if (validateBinarizationMethod('otsu')) {
1323
+ * console.log('Valid method');
1324
+ * } else {
1325
+ * console.log('Invalid method');
1326
+ * }
1327
+ * ```
1328
+ */
1329
+ export declare function validateBinarizationMethod(method: string): boolean
1330
+
1331
+ /**
1332
+ * Validates chunking parameters.
1333
+ *
1334
+ * Checks that `maxChars > 0` and `maxOverlap < maxChars`.
1335
+ *
1336
+ * # Arguments
1337
+ *
1338
+ * * `maxChars` - Maximum characters per chunk
1339
+ * * `maxOverlap` - Maximum overlap between chunks
1340
+ *
1341
+ * # Returns
1342
+ *
1343
+ * `true` if valid, `false` if invalid.
1344
+ *
1345
+ * # Example
1346
+ *
1347
+ * ```typescript
1348
+ * import { validateChunkingParams } from '@kreuzberg/node';
1349
+ *
1350
+ * if (validateChunkingParams(1000, 200)) {
1351
+ * console.log('Valid chunking parameters');
1352
+ * }
1353
+ * ```
1354
+ */
1355
+ export declare function validateChunkingParams(maxChars: number, maxOverlap: number): boolean
1356
+
1357
+ /**
1358
+ * Validates a confidence threshold value.
1359
+ *
1360
+ * Valid range: 0.0 to 1.0 (inclusive)
1361
+ *
1362
+ * # Arguments
1363
+ *
1364
+ * * `confidence` - The confidence threshold to validate
1365
+ *
1366
+ * # Returns
1367
+ *
1368
+ * `true` if valid, `false` if invalid.
1369
+ *
1370
+ * # Example
1371
+ *
1372
+ * ```typescript
1373
+ * import { validateConfidence } from '@kreuzberg/node';
1374
+ *
1375
+ * if (validateConfidence(0.75)) {
1376
+ * console.log('Valid confidence threshold');
1377
+ * }
1378
+ * ```
1379
+ */
1380
+ export declare function validateConfidence(confidence: number): boolean
1381
+
1382
+ /**
1383
+ * Validates a DPI (dots per inch) value.
1384
+ *
1385
+ * Valid range: 1-2400
1386
+ *
1387
+ * # Arguments
1388
+ *
1389
+ * * `dpi` - The DPI value to validate
1390
+ *
1391
+ * # Returns
1392
+ *
1393
+ * `true` if valid, `false` if invalid.
1394
+ *
1395
+ * # Example
1396
+ *
1397
+ * ```typescript
1398
+ * import { validateDpi } from '@kreuzberg/node';
1399
+ *
1400
+ * if (validateDpi(300)) {
1401
+ * console.log('Valid DPI');
1402
+ * }
1403
+ * ```
1404
+ */
1405
+ export declare function validateDpi(dpi: number): boolean
1406
+
1407
+ /**
1408
+ * Validates a language code (ISO 639-1 or 639-3 format).
1409
+ *
1410
+ * Accepts both 2-letter codes (e.g., "en", "de") and 3-letter codes (e.g., "eng", "deu").
1411
+ *
1412
+ * # Arguments
1413
+ *
1414
+ * * `code` - The language code to validate
1415
+ *
1416
+ * # Returns
1417
+ *
1418
+ * `true` if valid, `false` if invalid.
1419
+ *
1420
+ * # Example
1421
+ *
1422
+ * ```typescript
1423
+ * import { validateLanguageCode } from '@kreuzberg/node';
1424
+ *
1425
+ * if (validateLanguageCode('en')) {
1426
+ * console.log('Valid language code');
1427
+ * }
1428
+ * ```
1429
+ */
1430
+ export declare function validateLanguageCode(code: string): boolean
1431
+
1079
1432
  /**
1080
1433
  * Validate that a MIME type is supported by Kreuzberg.
1081
1434
  *
@@ -1116,3 +1469,128 @@ export declare function unregisterValidator(name: string): void
1116
1469
  * ```
1117
1470
  */
1118
1471
  export declare function validateMimeType(mimeType: string): string
1472
+
1473
+ /**
1474
+ * Validates an OCR backend string.
1475
+ *
1476
+ * Valid backends: "tesseract", "easyocr", "paddleocr"
1477
+ *
1478
+ * # Arguments
1479
+ *
1480
+ * * `backend` - The OCR backend to validate
1481
+ *
1482
+ * # Returns
1483
+ *
1484
+ * `true` if valid, `false` if invalid.
1485
+ *
1486
+ * # Example
1487
+ *
1488
+ * ```typescript
1489
+ * import { validateOcrBackend } from '@kreuzberg/node';
1490
+ *
1491
+ * if (validateOcrBackend('tesseract')) {
1492
+ * console.log('Valid backend');
1493
+ * }
1494
+ * ```
1495
+ */
1496
+ export declare function validateOcrBackend(backend: string): boolean
1497
+
1498
+ /**
1499
+ * Validates a Tesseract output format string.
1500
+ *
1501
+ * Valid formats: "text", "markdown"
1502
+ *
1503
+ * # Arguments
1504
+ *
1505
+ * * `format` - The output format to validate
1506
+ *
1507
+ * # Returns
1508
+ *
1509
+ * `true` if valid, `false` if invalid.
1510
+ *
1511
+ * # Example
1512
+ *
1513
+ * ```typescript
1514
+ * import { validateOutputFormat } from '@kreuzberg/node';
1515
+ *
1516
+ * if (validateOutputFormat('markdown')) {
1517
+ * console.log('Valid output format');
1518
+ * }
1519
+ * ```
1520
+ */
1521
+ export declare function validateOutputFormat(format: string): boolean
1522
+
1523
+ /**
1524
+ * Validates a Tesseract OCR Engine Mode (OEM) value.
1525
+ *
1526
+ * Valid range: 0-3
1527
+ *
1528
+ * # Arguments
1529
+ *
1530
+ * * `oem` - The OEM value to validate
1531
+ *
1532
+ * # Returns
1533
+ *
1534
+ * `true` if valid (0-3), `false` otherwise.
1535
+ *
1536
+ * # Example
1537
+ *
1538
+ * ```typescript
1539
+ * import { validateTesseractOem } from '@kreuzberg/node';
1540
+ *
1541
+ * if (validateTesseractOem(1)) {
1542
+ * console.log('Valid OEM');
1543
+ * }
1544
+ * ```
1545
+ */
1546
+ export declare function validateTesseractOem(oem: number): boolean
1547
+
1548
+ /**
1549
+ * Validates a Tesseract Page Segmentation Mode (PSM) value.
1550
+ *
1551
+ * Valid range: 0-13
1552
+ *
1553
+ * # Arguments
1554
+ *
1555
+ * * `psm` - The PSM value to validate
1556
+ *
1557
+ * # Returns
1558
+ *
1559
+ * `true` if valid (0-13), `false` otherwise.
1560
+ *
1561
+ * # Example
1562
+ *
1563
+ * ```typescript
1564
+ * import { validateTesseractPsm } from '@kreuzberg/node';
1565
+ *
1566
+ * if (validateTesseractPsm(3)) {
1567
+ * console.log('Valid PSM');
1568
+ * }
1569
+ * ```
1570
+ */
1571
+ export declare function validateTesseractPsm(psm: number): boolean
1572
+
1573
+ /**
1574
+ * Validates a token reduction level string.
1575
+ *
1576
+ * Valid levels: "off", "light", "moderate", "aggressive", "maximum"
1577
+ *
1578
+ * # Arguments
1579
+ *
1580
+ * * `level` - The token reduction level to validate
1581
+ *
1582
+ * # Returns
1583
+ *
1584
+ * `true` if valid, `false` if invalid.
1585
+ *
1586
+ * # Example
1587
+ *
1588
+ * ```typescript
1589
+ * import { validateTokenReductionLevel } from '@kreuzberg/node';
1590
+ *
1591
+ * if (validateTokenReductionLevel('moderate')) {
1592
+ * console.log('Valid token reduction level');
1593
+ * }
1594
+ * ```
1595
+ */
1596
+ export declare function validateTokenReductionLevel(level: string): boolean