@kreuzberg/node 4.0.0-rc.15 → 4.0.0-rc.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +180 -23
- package/dist/index.d.ts +180 -23
- package/dist/index.js +181 -29
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +178 -29
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +37 -1
- package/dist/types.d.ts +37 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +478 -0
- package/index.js +72 -52
- package/package.json +5 -5
package/dist/index.d.mts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { PanicContext } from './errors.mjs';
|
|
2
2
|
export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
|
|
3
|
-
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.mjs';
|
|
4
|
-
export { ArchiveMetadata, Chunk, ChunkMetadata,
|
|
3
|
+
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.mjs';
|
|
4
|
+
export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
|
|
5
5
|
export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
|
|
6
6
|
|
|
7
7
|
/**
|
|
@@ -97,7 +97,7 @@ declare function __resetBindingForTests(): void;
|
|
|
97
97
|
* const result2 = extractFileSync('scanned.pdf', null, config);
|
|
98
98
|
* ```
|
|
99
99
|
*/
|
|
100
|
-
declare function extractFileSync(filePath: string,
|
|
100
|
+
declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
|
|
101
101
|
/**
|
|
102
102
|
* Extract content from a single file (asynchronous).
|
|
103
103
|
*
|
|
@@ -133,7 +133,7 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
|
|
|
133
133
|
* console.log(result2.chunks); // Array of text chunks
|
|
134
134
|
* ```
|
|
135
135
|
*/
|
|
136
|
-
declare function extractFile(filePath: string,
|
|
136
|
+
declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
137
137
|
/**
|
|
138
138
|
* Extract content from raw bytes (synchronous).
|
|
139
139
|
*
|
|
@@ -161,7 +161,7 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
|
|
|
161
161
|
* console.log(result.content);
|
|
162
162
|
* ```
|
|
163
163
|
*/
|
|
164
|
-
declare function extractBytesSync(
|
|
164
|
+
declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
|
|
165
165
|
/**
|
|
166
166
|
* Extract content from raw bytes (asynchronous).
|
|
167
167
|
*
|
|
@@ -189,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
|
|
|
189
189
|
* console.log(result.content);
|
|
190
190
|
* ```
|
|
191
191
|
*/
|
|
192
|
-
declare function extractBytes(
|
|
192
|
+
declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
193
193
|
/**
|
|
194
194
|
* Extract content from multiple files in parallel (synchronous).
|
|
195
195
|
*
|
|
@@ -610,23 +610,109 @@ declare function unregisterDocumentExtractor(name: string): void;
|
|
|
610
610
|
*/
|
|
611
611
|
declare function clearDocumentExtractors(): void;
|
|
612
612
|
/**
|
|
613
|
-
*
|
|
613
|
+
* Builder class for creating ExtractionConfig objects with a fluent API.
|
|
614
614
|
*
|
|
615
|
-
* Provides a
|
|
616
|
-
*
|
|
615
|
+
* Provides a convenient way to build extraction configurations using method chaining.
|
|
616
|
+
*
|
|
617
|
+
* @example
|
|
618
|
+
* ```typescript
|
|
619
|
+
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
620
|
+
*
|
|
621
|
+
* // Create with builder pattern
|
|
622
|
+
* const config = ExtractionConfig.default()
|
|
623
|
+
* .withChunking({ maxChars: 2048 })
|
|
624
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
625
|
+
* .build();
|
|
626
|
+
*
|
|
627
|
+
* const result = await extractFile('document.pdf', null, config);
|
|
628
|
+
* ```
|
|
629
|
+
*/
|
|
630
|
+
declare class ExtractionConfigBuilder {
|
|
631
|
+
private config;
|
|
632
|
+
/**
|
|
633
|
+
* Create a new builder with default configuration.
|
|
634
|
+
*/
|
|
635
|
+
static default(): ExtractionConfigBuilder;
|
|
636
|
+
/**
|
|
637
|
+
* Set OCR configuration.
|
|
638
|
+
*/
|
|
639
|
+
withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
|
|
640
|
+
/**
|
|
641
|
+
* Set chunking configuration.
|
|
642
|
+
*/
|
|
643
|
+
withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
|
|
644
|
+
/**
|
|
645
|
+
* Set image extraction configuration.
|
|
646
|
+
*/
|
|
647
|
+
withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
|
|
648
|
+
/**
|
|
649
|
+
* Set PDF configuration.
|
|
650
|
+
*/
|
|
651
|
+
withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
|
|
652
|
+
/**
|
|
653
|
+
* Set keyword extraction configuration.
|
|
654
|
+
*/
|
|
655
|
+
withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
|
|
656
|
+
/**
|
|
657
|
+
* Set language detection configuration.
|
|
658
|
+
*/
|
|
659
|
+
withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
|
|
660
|
+
/**
|
|
661
|
+
* Set whether to enable metadata extraction.
|
|
662
|
+
*/
|
|
663
|
+
withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
|
|
664
|
+
/**
|
|
665
|
+
* Set whether to enable quality mode.
|
|
666
|
+
*/
|
|
667
|
+
withQualityMode(enabled: boolean): ExtractionConfigBuilder;
|
|
668
|
+
/**
|
|
669
|
+
* Build and return the final ExtractionConfig object.
|
|
670
|
+
*/
|
|
671
|
+
build(): ExtractionConfig$1;
|
|
672
|
+
}
|
|
673
|
+
/**
|
|
674
|
+
* ExtractionConfig namespace with static methods for loading configuration from files
|
|
675
|
+
* and creating new configurations with the builder pattern.
|
|
676
|
+
*
|
|
677
|
+
* Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
|
|
678
|
+
* or to create configurations using a fluent builder API.
|
|
617
679
|
*
|
|
618
680
|
* @example
|
|
619
681
|
* ```typescript
|
|
620
682
|
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
621
683
|
*
|
|
622
684
|
* // Load configuration from file
|
|
623
|
-
* const
|
|
685
|
+
* const config1 = ExtractionConfig.fromFile('config.toml');
|
|
686
|
+
*
|
|
687
|
+
* // Create with builder pattern
|
|
688
|
+
* const config2 = ExtractionConfig.default()
|
|
689
|
+
* .withChunking({ maxChars: 2048 })
|
|
690
|
+
* .build();
|
|
624
691
|
*
|
|
625
692
|
* // Use with extraction
|
|
626
|
-
* const result = await extractFile('document.pdf', null,
|
|
693
|
+
* const result = await extractFile('document.pdf', null, config2);
|
|
627
694
|
* ```
|
|
628
695
|
*/
|
|
629
696
|
declare const ExtractionConfig: {
|
|
697
|
+
/**
|
|
698
|
+
* Create a default extraction configuration using the builder pattern.
|
|
699
|
+
*
|
|
700
|
+
* Returns a builder object that allows you to configure extraction settings
|
|
701
|
+
* using method chaining.
|
|
702
|
+
*
|
|
703
|
+
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
704
|
+
*
|
|
705
|
+
* @example
|
|
706
|
+
* ```typescript
|
|
707
|
+
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
708
|
+
*
|
|
709
|
+
* const config = ExtractionConfig.default()
|
|
710
|
+
* .withChunking({ maxChars: 2048 })
|
|
711
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
712
|
+
* .build();
|
|
713
|
+
* ```
|
|
714
|
+
*/
|
|
715
|
+
default(): ExtractionConfigBuilder;
|
|
630
716
|
/**
|
|
631
717
|
* Load extraction configuration from a file.
|
|
632
718
|
*
|
|
@@ -711,28 +797,30 @@ declare function detectMimeType(bytes: Buffer): string;
|
|
|
711
797
|
/**
|
|
712
798
|
* Detect MIME type from a file path.
|
|
713
799
|
*
|
|
714
|
-
*
|
|
715
|
-
* if
|
|
800
|
+
* Determines the MIME type based on the file extension in the provided path.
|
|
801
|
+
* By default, checks if the file exists; can be disabled with checkExists parameter.
|
|
716
802
|
*
|
|
717
|
-
* @param
|
|
718
|
-
* @
|
|
803
|
+
* @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
|
|
804
|
+
* @param checkExists - Whether to verify the file exists (default: true)
|
|
805
|
+
* @returns The detected MIME type as a string (e.g., 'application/pdf')
|
|
719
806
|
*
|
|
720
|
-
* @throws {Error} If MIME type cannot be determined from
|
|
721
|
-
*
|
|
807
|
+
* @throws {Error} If MIME type cannot be determined from the file extension,
|
|
808
|
+
* or if checkExists is true and the file does not exist
|
|
722
809
|
*
|
|
723
810
|
* @example
|
|
724
811
|
* ```typescript
|
|
725
812
|
* import { detectMimeTypeFromPath } from '@kreuzberg/node';
|
|
726
813
|
*
|
|
727
|
-
* // Detect from existing file
|
|
728
|
-
* const mimeType = detectMimeTypeFromPath('document.pdf');
|
|
814
|
+
* // Detect MIME type from existing file
|
|
815
|
+
* const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
|
|
729
816
|
* console.log(mimeType); // 'application/pdf'
|
|
730
817
|
*
|
|
731
|
-
*
|
|
818
|
+
* // Detect without checking file existence
|
|
819
|
+
* const mimeType2 = detectMimeTypeFromPath('document.docx', false);
|
|
732
820
|
* console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
733
821
|
* ```
|
|
734
822
|
*/
|
|
735
|
-
declare function detectMimeTypeFromPath(
|
|
823
|
+
declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
|
|
736
824
|
/**
|
|
737
825
|
* Validate that a MIME type is supported by Kreuzberg.
|
|
738
826
|
*
|
|
@@ -903,6 +991,75 @@ declare function getLastErrorCode(): number;
|
|
|
903
991
|
* ```
|
|
904
992
|
*/
|
|
905
993
|
declare function getLastPanicContext(): PanicContext | null;
|
|
906
|
-
|
|
994
|
+
/**
|
|
995
|
+
* Returns the human-readable name for an error code.
|
|
996
|
+
*
|
|
997
|
+
* Maps numeric error codes to their string names, providing a consistent way
|
|
998
|
+
* to get error code names across all platforms.
|
|
999
|
+
*
|
|
1000
|
+
* @param code - The numeric error code (0-7)
|
|
1001
|
+
* @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
|
|
1002
|
+
*
|
|
1003
|
+
* @example
|
|
1004
|
+
* ```typescript
|
|
1005
|
+
* import { getErrorCodeName } from '@kreuzberg/node';
|
|
1006
|
+
*
|
|
1007
|
+
* const name = getErrorCodeName(0); // returns "validation"
|
|
1008
|
+
* const ocrName = getErrorCodeName(2); // returns "ocr"
|
|
1009
|
+
* const unknownName = getErrorCodeName(99); // returns "unknown"
|
|
1010
|
+
* ```
|
|
1011
|
+
*/
|
|
1012
|
+
declare function getErrorCodeName(code: number): string;
|
|
1013
|
+
/**
|
|
1014
|
+
* Returns the description for an error code.
|
|
1015
|
+
*
|
|
1016
|
+
* Retrieves user-friendly descriptions of error types from the FFI layer.
|
|
1017
|
+
*
|
|
1018
|
+
* @param code - The numeric error code (0-7)
|
|
1019
|
+
* @returns A brief description of the error type
|
|
1020
|
+
*
|
|
1021
|
+
* @example
|
|
1022
|
+
* ```typescript
|
|
1023
|
+
* import { getErrorCodeDescription } from '@kreuzberg/node';
|
|
1024
|
+
*
|
|
1025
|
+
* const desc = getErrorCodeDescription(0); // returns "Input validation error"
|
|
1026
|
+
* const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
|
|
1027
|
+
* const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
|
|
1028
|
+
* ```
|
|
1029
|
+
*/
|
|
1030
|
+
declare function getErrorCodeDescription(code: number): string;
|
|
1031
|
+
/**
|
|
1032
|
+
* Classifies an error message string into an error code category.
|
|
1033
|
+
*
|
|
1034
|
+
* This function analyzes the error message content and returns the most likely
|
|
1035
|
+
* error code (0-7) based on keyword patterns. Used to programmatically classify
|
|
1036
|
+
* errors for handling purposes.
|
|
1037
|
+
*
|
|
1038
|
+
* The classification is based on keyword matching:
|
|
1039
|
+
* - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
|
|
1040
|
+
* - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
|
|
1041
|
+
* - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
|
|
1042
|
+
* - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
|
|
1043
|
+
* - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
|
|
1044
|
+
* - **Plugin (5)**: Keywords like "plugin", "register", "extension"
|
|
1045
|
+
* - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
|
|
1046
|
+
* - **Internal (7)**: Keywords like "internal", "bug", "panic"
|
|
1047
|
+
*
|
|
1048
|
+
* @param errorMessage - The error message string to classify
|
|
1049
|
+
* @returns An object with the classification details
|
|
1050
|
+
*
|
|
1051
|
+
* @example
|
|
1052
|
+
* ```typescript
|
|
1053
|
+
* import { classifyError } from '@kreuzberg/node';
|
|
1054
|
+
*
|
|
1055
|
+
* const result = classifyError("PDF file is corrupted");
|
|
1056
|
+
* // Returns: { code: 1, name: "parsing", confidence: 0.95 }
|
|
1057
|
+
*
|
|
1058
|
+
* const missingDep = classifyError("Tesseract not found");
|
|
1059
|
+
* // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
|
|
1060
|
+
* ```
|
|
1061
|
+
*/
|
|
1062
|
+
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
1063
|
+
declare const __version__ = "4.0.0-rc.17";
|
|
907
1064
|
|
|
908
|
-
export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1065
|
+
export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { PanicContext } from './errors.js';
|
|
2
2
|
export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
|
|
3
|
-
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
|
|
4
|
-
export { ArchiveMetadata, Chunk, ChunkMetadata,
|
|
3
|
+
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.js';
|
|
4
|
+
export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
|
|
5
5
|
export { GutenOcrBackend } from './ocr/guten-ocr.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
@@ -97,7 +97,7 @@ declare function __resetBindingForTests(): void;
|
|
|
97
97
|
* const result2 = extractFileSync('scanned.pdf', null, config);
|
|
98
98
|
* ```
|
|
99
99
|
*/
|
|
100
|
-
declare function extractFileSync(filePath: string,
|
|
100
|
+
declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
|
|
101
101
|
/**
|
|
102
102
|
* Extract content from a single file (asynchronous).
|
|
103
103
|
*
|
|
@@ -133,7 +133,7 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
|
|
|
133
133
|
* console.log(result2.chunks); // Array of text chunks
|
|
134
134
|
* ```
|
|
135
135
|
*/
|
|
136
|
-
declare function extractFile(filePath: string,
|
|
136
|
+
declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
137
137
|
/**
|
|
138
138
|
* Extract content from raw bytes (synchronous).
|
|
139
139
|
*
|
|
@@ -161,7 +161,7 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
|
|
|
161
161
|
* console.log(result.content);
|
|
162
162
|
* ```
|
|
163
163
|
*/
|
|
164
|
-
declare function extractBytesSync(
|
|
164
|
+
declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
|
|
165
165
|
/**
|
|
166
166
|
* Extract content from raw bytes (asynchronous).
|
|
167
167
|
*
|
|
@@ -189,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
|
|
|
189
189
|
* console.log(result.content);
|
|
190
190
|
* ```
|
|
191
191
|
*/
|
|
192
|
-
declare function extractBytes(
|
|
192
|
+
declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
193
193
|
/**
|
|
194
194
|
* Extract content from multiple files in parallel (synchronous).
|
|
195
195
|
*
|
|
@@ -610,23 +610,109 @@ declare function unregisterDocumentExtractor(name: string): void;
|
|
|
610
610
|
*/
|
|
611
611
|
declare function clearDocumentExtractors(): void;
|
|
612
612
|
/**
|
|
613
|
-
*
|
|
613
|
+
* Builder class for creating ExtractionConfig objects with a fluent API.
|
|
614
614
|
*
|
|
615
|
-
* Provides a
|
|
616
|
-
*
|
|
615
|
+
* Provides a convenient way to build extraction configurations using method chaining.
|
|
616
|
+
*
|
|
617
|
+
* @example
|
|
618
|
+
* ```typescript
|
|
619
|
+
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
620
|
+
*
|
|
621
|
+
* // Create with builder pattern
|
|
622
|
+
* const config = ExtractionConfig.default()
|
|
623
|
+
* .withChunking({ maxChars: 2048 })
|
|
624
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
625
|
+
* .build();
|
|
626
|
+
*
|
|
627
|
+
* const result = await extractFile('document.pdf', null, config);
|
|
628
|
+
* ```
|
|
629
|
+
*/
|
|
630
|
+
declare class ExtractionConfigBuilder {
|
|
631
|
+
private config;
|
|
632
|
+
/**
|
|
633
|
+
* Create a new builder with default configuration.
|
|
634
|
+
*/
|
|
635
|
+
static default(): ExtractionConfigBuilder;
|
|
636
|
+
/**
|
|
637
|
+
* Set OCR configuration.
|
|
638
|
+
*/
|
|
639
|
+
withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
|
|
640
|
+
/**
|
|
641
|
+
* Set chunking configuration.
|
|
642
|
+
*/
|
|
643
|
+
withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
|
|
644
|
+
/**
|
|
645
|
+
* Set image extraction configuration.
|
|
646
|
+
*/
|
|
647
|
+
withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
|
|
648
|
+
/**
|
|
649
|
+
* Set PDF configuration.
|
|
650
|
+
*/
|
|
651
|
+
withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
|
|
652
|
+
/**
|
|
653
|
+
* Set keyword extraction configuration.
|
|
654
|
+
*/
|
|
655
|
+
withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
|
|
656
|
+
/**
|
|
657
|
+
* Set language detection configuration.
|
|
658
|
+
*/
|
|
659
|
+
withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
|
|
660
|
+
/**
|
|
661
|
+
* Set whether to enable metadata extraction.
|
|
662
|
+
*/
|
|
663
|
+
withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
|
|
664
|
+
/**
|
|
665
|
+
* Set whether to enable quality mode.
|
|
666
|
+
*/
|
|
667
|
+
withQualityMode(enabled: boolean): ExtractionConfigBuilder;
|
|
668
|
+
/**
|
|
669
|
+
* Build and return the final ExtractionConfig object.
|
|
670
|
+
*/
|
|
671
|
+
build(): ExtractionConfig$1;
|
|
672
|
+
}
|
|
673
|
+
/**
|
|
674
|
+
* ExtractionConfig namespace with static methods for loading configuration from files
|
|
675
|
+
* and creating new configurations with the builder pattern.
|
|
676
|
+
*
|
|
677
|
+
* Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
|
|
678
|
+
* or to create configurations using a fluent builder API.
|
|
617
679
|
*
|
|
618
680
|
* @example
|
|
619
681
|
* ```typescript
|
|
620
682
|
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
621
683
|
*
|
|
622
684
|
* // Load configuration from file
|
|
623
|
-
* const
|
|
685
|
+
* const config1 = ExtractionConfig.fromFile('config.toml');
|
|
686
|
+
*
|
|
687
|
+
* // Create with builder pattern
|
|
688
|
+
* const config2 = ExtractionConfig.default()
|
|
689
|
+
* .withChunking({ maxChars: 2048 })
|
|
690
|
+
* .build();
|
|
624
691
|
*
|
|
625
692
|
* // Use with extraction
|
|
626
|
-
* const result = await extractFile('document.pdf', null,
|
|
693
|
+
* const result = await extractFile('document.pdf', null, config2);
|
|
627
694
|
* ```
|
|
628
695
|
*/
|
|
629
696
|
declare const ExtractionConfig: {
|
|
697
|
+
/**
|
|
698
|
+
* Create a default extraction configuration using the builder pattern.
|
|
699
|
+
*
|
|
700
|
+
* Returns a builder object that allows you to configure extraction settings
|
|
701
|
+
* using method chaining.
|
|
702
|
+
*
|
|
703
|
+
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
704
|
+
*
|
|
705
|
+
* @example
|
|
706
|
+
* ```typescript
|
|
707
|
+
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
708
|
+
*
|
|
709
|
+
* const config = ExtractionConfig.default()
|
|
710
|
+
* .withChunking({ maxChars: 2048 })
|
|
711
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
712
|
+
* .build();
|
|
713
|
+
* ```
|
|
714
|
+
*/
|
|
715
|
+
default(): ExtractionConfigBuilder;
|
|
630
716
|
/**
|
|
631
717
|
* Load extraction configuration from a file.
|
|
632
718
|
*
|
|
@@ -711,28 +797,30 @@ declare function detectMimeType(bytes: Buffer): string;
|
|
|
711
797
|
/**
|
|
712
798
|
* Detect MIME type from a file path.
|
|
713
799
|
*
|
|
714
|
-
*
|
|
715
|
-
* if
|
|
800
|
+
* Determines the MIME type based on the file extension in the provided path.
|
|
801
|
+
* By default, checks if the file exists; can be disabled with checkExists parameter.
|
|
716
802
|
*
|
|
717
|
-
* @param
|
|
718
|
-
* @
|
|
803
|
+
* @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
|
|
804
|
+
* @param checkExists - Whether to verify the file exists (default: true)
|
|
805
|
+
* @returns The detected MIME type as a string (e.g., 'application/pdf')
|
|
719
806
|
*
|
|
720
|
-
* @throws {Error} If MIME type cannot be determined from
|
|
721
|
-
*
|
|
807
|
+
* @throws {Error} If MIME type cannot be determined from the file extension,
|
|
808
|
+
* or if checkExists is true and the file does not exist
|
|
722
809
|
*
|
|
723
810
|
* @example
|
|
724
811
|
* ```typescript
|
|
725
812
|
* import { detectMimeTypeFromPath } from '@kreuzberg/node';
|
|
726
813
|
*
|
|
727
|
-
* // Detect from existing file
|
|
728
|
-
* const mimeType = detectMimeTypeFromPath('document.pdf');
|
|
814
|
+
* // Detect MIME type from existing file
|
|
815
|
+
* const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
|
|
729
816
|
* console.log(mimeType); // 'application/pdf'
|
|
730
817
|
*
|
|
731
|
-
*
|
|
818
|
+
* // Detect without checking file existence
|
|
819
|
+
* const mimeType2 = detectMimeTypeFromPath('document.docx', false);
|
|
732
820
|
* console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
733
821
|
* ```
|
|
734
822
|
*/
|
|
735
|
-
declare function detectMimeTypeFromPath(
|
|
823
|
+
declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
|
|
736
824
|
/**
|
|
737
825
|
* Validate that a MIME type is supported by Kreuzberg.
|
|
738
826
|
*
|
|
@@ -903,6 +991,75 @@ declare function getLastErrorCode(): number;
|
|
|
903
991
|
* ```
|
|
904
992
|
*/
|
|
905
993
|
declare function getLastPanicContext(): PanicContext | null;
|
|
906
|
-
|
|
994
|
+
/**
|
|
995
|
+
* Returns the human-readable name for an error code.
|
|
996
|
+
*
|
|
997
|
+
* Maps numeric error codes to their string names, providing a consistent way
|
|
998
|
+
* to get error code names across all platforms.
|
|
999
|
+
*
|
|
1000
|
+
* @param code - The numeric error code (0-7)
|
|
1001
|
+
* @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
|
|
1002
|
+
*
|
|
1003
|
+
* @example
|
|
1004
|
+
* ```typescript
|
|
1005
|
+
* import { getErrorCodeName } from '@kreuzberg/node';
|
|
1006
|
+
*
|
|
1007
|
+
* const name = getErrorCodeName(0); // returns "validation"
|
|
1008
|
+
* const ocrName = getErrorCodeName(2); // returns "ocr"
|
|
1009
|
+
* const unknownName = getErrorCodeName(99); // returns "unknown"
|
|
1010
|
+
* ```
|
|
1011
|
+
*/
|
|
1012
|
+
declare function getErrorCodeName(code: number): string;
|
|
1013
|
+
/**
|
|
1014
|
+
* Returns the description for an error code.
|
|
1015
|
+
*
|
|
1016
|
+
* Retrieves user-friendly descriptions of error types from the FFI layer.
|
|
1017
|
+
*
|
|
1018
|
+
* @param code - The numeric error code (0-7)
|
|
1019
|
+
* @returns A brief description of the error type
|
|
1020
|
+
*
|
|
1021
|
+
* @example
|
|
1022
|
+
* ```typescript
|
|
1023
|
+
* import { getErrorCodeDescription } from '@kreuzberg/node';
|
|
1024
|
+
*
|
|
1025
|
+
* const desc = getErrorCodeDescription(0); // returns "Input validation error"
|
|
1026
|
+
* const ioDesc = getErrorCodeDescription(4); // returns "File system I/O error"
|
|
1027
|
+
* const unknownDesc = getErrorCodeDescription(99); // returns "Unknown error code"
|
|
1028
|
+
* ```
|
|
1029
|
+
*/
|
|
1030
|
+
declare function getErrorCodeDescription(code: number): string;
|
|
1031
|
+
/**
|
|
1032
|
+
* Classifies an error message string into an error code category.
|
|
1033
|
+
*
|
|
1034
|
+
* This function analyzes the error message content and returns the most likely
|
|
1035
|
+
* error code (0-7) based on keyword patterns. Used to programmatically classify
|
|
1036
|
+
* errors for handling purposes.
|
|
1037
|
+
*
|
|
1038
|
+
* The classification is based on keyword matching:
|
|
1039
|
+
* - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
|
|
1040
|
+
* - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
|
|
1041
|
+
* - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
|
|
1042
|
+
* - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
|
|
1043
|
+
* - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
|
|
1044
|
+
* - **Plugin (5)**: Keywords like "plugin", "register", "extension"
|
|
1045
|
+
* - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
|
|
1046
|
+
* - **Internal (7)**: Keywords like "internal", "bug", "panic"
|
|
1047
|
+
*
|
|
1048
|
+
* @param errorMessage - The error message string to classify
|
|
1049
|
+
* @returns An object with the classification details
|
|
1050
|
+
*
|
|
1051
|
+
* @example
|
|
1052
|
+
* ```typescript
|
|
1053
|
+
* import { classifyError } from '@kreuzberg/node';
|
|
1054
|
+
*
|
|
1055
|
+
* const result = classifyError("PDF file is corrupted");
|
|
1056
|
+
* // Returns: { code: 1, name: "parsing", confidence: 0.95 }
|
|
1057
|
+
*
|
|
1058
|
+
* const missingDep = classifyError("Tesseract not found");
|
|
1059
|
+
* // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
|
|
1060
|
+
* ```
|
|
1061
|
+
*/
|
|
1062
|
+
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
1063
|
+
declare const __version__ = "4.0.0-rc.17";
|
|
907
1064
|
|
|
908
|
-
export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
|
1065
|
+
export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|