@kreuzberg/node 4.0.0-rc.14 → 4.0.0-rc.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { PanicContext } from './errors.mjs';
2
2
  export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
3
- import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.mjs';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification } from './types.mjs';
4
4
  export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
5
5
  export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
6
6
 
@@ -903,6 +903,75 @@ declare function getLastErrorCode(): number;
903
903
  * ```
904
904
  */
905
905
  declare function getLastPanicContext(): PanicContext | null;
906
- declare const __version__ = "4.0.0-rc.14";
906
+ /**
907
+ * Returns the human-readable name for an error code.
908
+ *
909
+ * Maps numeric error codes to their string names, providing a consistent way
910
+ * to get error code names across all platforms.
911
+ *
912
+ * @param code - The numeric error code (0-7)
913
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
914
+ *
915
+ * @example
916
+ * ```typescript
917
+ * import { getErrorCodeName } from '@kreuzberg/node';
918
+ *
919
+ * const name = getErrorCodeName(0); // returns "validation"
920
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
921
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
922
+ * ```
923
+ */
924
+ declare function getErrorCodeName(code: number): string;
925
+ /**
926
+ * Returns the description for an error code.
927
+ *
928
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
929
+ *
930
+ * @param code - The numeric error code (0-7)
931
+ * @returns A brief description of the error type
932
+ *
933
+ * @example
934
+ * ```typescript
935
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
936
+ *
937
+ * const desc = getErrorCodeDescription(0); // returns "Input validation error"
938
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
939
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
940
+ * ```
941
+ */
942
+ declare function getErrorCodeDescription(code: number): string;
943
+ /**
944
+ * Classifies an error message string into an error code category.
945
+ *
946
+ * This function analyzes the error message content and returns the most likely
947
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
948
+ * errors for handling purposes.
949
+ *
950
+ * The classification is based on keyword matching:
951
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
952
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
953
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
954
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
955
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
956
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
957
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
958
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
959
+ *
960
+ * @param errorMessage - The error message string to classify
961
+ * @returns An object with the classification details: the numeric error `code`, its `name`, and a `confidence` value
962
+ *
963
+ * @example
964
+ * ```typescript
965
+ * import { classifyError } from '@kreuzberg/node';
966
+ *
967
+ * const result = classifyError("PDF file is corrupted");
968
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
969
+ *
970
+ * const missing = classifyError("Tesseract not found");
971
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
972
+ * ```
973
+ */
974
+ declare function classifyError(errorMessage: string): ErrorClassification;
975
+ declare const __version__ = "4.0.0-rc.16";
907
976
 
908
- export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
977
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { PanicContext } from './errors.js';
2
2
  export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
3
- import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification } from './types.js';
4
4
  export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
5
5
  export { GutenOcrBackend } from './ocr/guten-ocr.js';
6
6
 
@@ -903,6 +903,75 @@ declare function getLastErrorCode(): number;
903
903
  * ```
904
904
  */
905
905
  declare function getLastPanicContext(): PanicContext | null;
906
- declare const __version__ = "4.0.0-rc.14";
906
+ /**
907
+ * Returns the human-readable name for an error code.
908
+ *
909
+ * Maps numeric error codes to their string names, providing a consistent way
910
+ * to get error code names across all platforms.
911
+ *
912
+ * @param code - The numeric error code (0-7)
913
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
914
+ *
915
+ * @example
916
+ * ```typescript
917
+ * import { getErrorCodeName } from '@kreuzberg/node';
918
+ *
919
+ * const name = getErrorCodeName(0); // returns "validation"
920
+ * const ocrName = getErrorCodeName(2); // returns "ocr"
921
+ * const unknownName = getErrorCodeName(99); // returns "unknown"
922
+ * ```
923
+ */
924
+ declare function getErrorCodeName(code: number): string;
925
+ /**
926
+ * Returns the description for an error code.
927
+ *
928
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
929
+ *
930
+ * @param code - The numeric error code (0-7)
931
+ * @returns A brief description of the error type
932
+ *
933
+ * @example
934
+ * ```typescript
935
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
936
+ *
937
+ * const desc = getErrorCodeDescription(0); // returns "Input validation error"
938
+ * const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
939
+ * const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
940
+ * ```
941
+ */
942
+ declare function getErrorCodeDescription(code: number): string;
943
+ /**
944
+ * Classifies an error message string into an error code category.
945
+ *
946
+ * This function analyzes the error message content and returns the most likely
947
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
948
+ * errors for handling purposes.
949
+ *
950
+ * The classification is based on keyword matching:
951
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
952
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
953
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
954
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
955
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
956
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
957
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
958
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
959
+ *
960
+ * @param errorMessage - The error message string to classify
961
+ * @returns An object with the classification details: the numeric error `code`, its `name`, and a `confidence` value
962
+ *
963
+ * @example
964
+ * ```typescript
965
+ * import { classifyError } from '@kreuzberg/node';
966
+ *
967
+ * const result = classifyError("PDF file is corrupted");
968
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
969
+ *
970
+ * const missing = classifyError("Tesseract not found");
971
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
972
+ * ```
973
+ */
974
+ declare function classifyError(errorMessage: string): ErrorClassification;
975
+ declare const __version__ = "4.0.0-rc.16";
907
976
 
908
- export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
977
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.js CHANGED
@@ -37,6 +37,7 @@ __export(index_exports, {
37
37
  batchExtractBytesSync: () => batchExtractBytesSync,
38
38
  batchExtractFiles: () => batchExtractFiles,
39
39
  batchExtractFilesSync: () => batchExtractFilesSync,
40
+ classifyError: () => classifyError,
40
41
  clearDocumentExtractors: () => clearDocumentExtractors,
41
42
  clearOcrBackends: () => clearOcrBackends,
42
43
  clearPostProcessors: () => clearPostProcessors,
@@ -48,6 +49,8 @@ __export(index_exports, {
48
49
  extractFile: () => extractFile,
49
50
  extractFileSync: () => extractFileSync,
50
51
  getEmbeddingPreset: () => getEmbeddingPreset,
52
+ getErrorCodeDescription: () => getErrorCodeDescription,
53
+ getErrorCodeName: () => getErrorCodeName,
51
54
  getExtensionsForMime: () => getExtensionsForMime,
52
55
  getLastErrorCode: () => getLastErrorCode,
53
56
  getLastPanicContext: () => getLastPanicContext,
@@ -865,7 +868,20 @@ function getLastPanicContext() {
865
868
  const result = binding2.getLastPanicContext();
866
869
  return result;
867
870
  }
868
- const __version__ = "4.0.0-rc.14";
871
+ function getErrorCodeName(code) {
872
+ const binding2 = getBinding();
873
+ return binding2.getErrorCodeName(code);
874
+ }
875
+ function getErrorCodeDescription(code) {
876
+ const binding2 = getBinding();
877
+ return binding2.getErrorCodeDescription(code);
878
+ }
879
+ function classifyError(errorMessage) {
880
+ const binding2 = getBinding();
881
+ const result = binding2.classifyError(errorMessage);
882
+ return result;
883
+ }
884
+ const __version__ = "4.0.0-rc.16";
869
885
  // Annotate the CommonJS export names for ESM import in node:
870
886
  0 && (module.exports = {
871
887
  CacheError,
@@ -886,6 +902,7 @@ const __version__ = "4.0.0-rc.14";
886
902
  batchExtractBytesSync,
887
903
  batchExtractFiles,
888
904
  batchExtractFilesSync,
905
+ classifyError,
889
906
  clearDocumentExtractors,
890
907
  clearOcrBackends,
891
908
  clearPostProcessors,
@@ -897,6 +914,8 @@ const __version__ = "4.0.0-rc.14";
897
914
  extractFile,
898
915
  extractFileSync,
899
916
  getEmbeddingPreset,
917
+ getErrorCodeDescription,
918
+ getErrorCodeName,
900
919
  getExtensionsForMime,
901
920
  getLastErrorCode,
902
921
  getLastPanicContext,