@kreuzberg/node 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +13 -5
- package/dist/cli.mjs.map +1 -1
- package/dist/errors.js +26 -24
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs +25 -24
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +608 -535
- package/dist/index.d.ts +608 -535
- package/dist/index.js +682 -338
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +662 -334
- package/dist/index.mjs.map +1 -1
- package/dist/ocr/guten-ocr.js +4 -2
- package/dist/ocr/guten-ocr.js.map +1 -1
- package/dist/ocr/guten-ocr.mjs +3 -2
- package/dist/ocr/guten-ocr.mjs.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/index.d.ts +77 -178
- package/index.js +54 -52
- package/package.json +7 -7
package/dist/index.d.mts
CHANGED
|
@@ -1,195 +1,153 @@
|
|
|
1
|
+
import { ErrorClassification, ExtractionConfig, ExtractionResult, WorkerPool, WorkerPoolStats, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.mjs';
|
|
2
|
+
export { Chunk, ChunkingConfig, ExtractedImage, HtmlConversionOptions, HtmlPreprocessingOptions, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig } from './types.mjs';
|
|
1
3
|
import { PanicContext } from './errors.mjs';
|
|
2
4
|
export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
|
|
3
|
-
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.mjs';
|
|
4
|
-
export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
|
|
5
5
|
export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
* This is a TypeScript SDK around a high-performance Rust core.
|
|
11
|
-
* All extraction logic, chunking, quality processing, and language detection
|
|
12
|
-
* are implemented in Rust for maximum performance.
|
|
13
|
-
*
|
|
14
|
-
* ## API Usage Recommendations
|
|
15
|
-
*
|
|
16
|
-
* **For processing multiple documents**, prefer batch APIs:
|
|
17
|
-
* - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
|
|
18
|
-
* - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
|
|
19
|
-
*
|
|
20
|
-
* **Batch APIs provide**:
|
|
21
|
-
* - Better performance (parallel processing in Rust)
|
|
22
|
-
* - More reliable memory management
|
|
23
|
-
* - Recommended for all multi-document workflows
|
|
8
|
+
* Get the error code for the last FFI error.
|
|
24
9
|
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
* - Interactive applications processing documents on-demand
|
|
28
|
-
* - Avoid calling these in tight loops - use batch APIs instead
|
|
10
|
+
* Returns the FFI error code as an integer. This is useful for programmatic error handling
|
|
11
|
+
* and distinguishing between different types of failures in native code.
|
|
29
12
|
*
|
|
30
|
-
*
|
|
13
|
+
* Error codes:
|
|
14
|
+
* - 0: Success (no error)
|
|
15
|
+
* - 1: GenericError
|
|
16
|
+
* - 2: Panic
|
|
17
|
+
* - 3: InvalidArgument
|
|
18
|
+
* - 4: IoError
|
|
19
|
+
* - 5: ParsingError
|
|
20
|
+
* - 6: OcrError
|
|
21
|
+
* - 7: MissingDependency
|
|
31
22
|
*
|
|
32
|
-
*
|
|
33
|
-
* - **Text**: Markdown, Plain Text, XML
|
|
34
|
-
* - **Web**: HTML (converted to Markdown)
|
|
35
|
-
* - **Data**: JSON, YAML, TOML
|
|
36
|
-
* - **Email**: EML, MSG
|
|
37
|
-
* - **Images**: PNG, JPEG, TIFF (with OCR support)
|
|
23
|
+
* @returns The integer error code
|
|
38
24
|
*
|
|
39
25
|
* @example
|
|
40
26
|
* ```typescript
|
|
41
|
-
* import { extractFile,
|
|
42
|
-
*
|
|
43
|
-
* // Single file extraction
|
|
44
|
-
* const result = await extractFile('document.pdf');
|
|
45
|
-
* console.log(result.content);
|
|
27
|
+
* import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
|
|
46
28
|
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
*
|
|
29
|
+
* try {
|
|
30
|
+
* const result = await extractFile('document.pdf');
|
|
31
|
+
* } catch (error) {
|
|
32
|
+
* const code = getLastErrorCode();
|
|
33
|
+
* if (code === ErrorCode.Panic) {
|
|
34
|
+
* console.error('Native code panic detected');
|
|
35
|
+
* }
|
|
36
|
+
* }
|
|
51
37
|
* ```
|
|
52
38
|
*/
|
|
53
|
-
|
|
54
|
-
/**
|
|
55
|
-
* @internal Allows tests to provide a mocked native binding.
|
|
56
|
-
*/
|
|
57
|
-
declare function __setBindingForTests(mock: unknown): void;
|
|
58
|
-
/**
|
|
59
|
-
* @internal Resets the cached native binding for tests.
|
|
60
|
-
*/
|
|
61
|
-
declare function __resetBindingForTests(): void;
|
|
39
|
+
declare function getLastErrorCode(): number;
|
|
62
40
|
/**
|
|
63
|
-
*
|
|
41
|
+
* Get panic context information if the last error was a panic.
|
|
64
42
|
*
|
|
65
|
-
*
|
|
66
|
-
* provides
|
|
43
|
+
* Returns detailed information about a panic in native code, or null if the last error was not a panic.
|
|
44
|
+
* This provides debugging information when native code panics.
|
|
67
45
|
*
|
|
68
|
-
* @
|
|
69
|
-
* @param mimeType - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content.
|
|
70
|
-
* @param config - Extraction configuration object. If null, uses default extraction settings.
|
|
71
|
-
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
|
|
72
|
-
* @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
|
|
73
|
-
* @throws {ParsingError} When document format is invalid or corrupted
|
|
74
|
-
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
75
|
-
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
76
|
-
* @throws {KreuzbergError} For other extraction-related failures
|
|
46
|
+
* @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
|
|
77
47
|
*
|
|
78
48
|
* @example
|
|
79
49
|
* ```typescript
|
|
80
|
-
* import {
|
|
81
|
-
*
|
|
82
|
-
* // Basic usage
|
|
83
|
-
* const result = extractFileSync('document.pdf');
|
|
84
|
-
* console.log(result.content);
|
|
50
|
+
* import { extractFile, getLastPanicContext } from '@kreuzberg/node';
|
|
85
51
|
*
|
|
86
|
-
*
|
|
87
|
-
*
|
|
88
|
-
*
|
|
89
|
-
*
|
|
90
|
-
*
|
|
91
|
-
*
|
|
92
|
-
*
|
|
93
|
-
*
|
|
94
|
-
*
|
|
95
|
-
*
|
|
96
|
-
* };
|
|
97
|
-
* const result2 = extractFileSync('scanned.pdf', null, config);
|
|
52
|
+
* try {
|
|
53
|
+
* const result = await extractFile('document.pdf');
|
|
54
|
+
* } catch (error) {
|
|
55
|
+
* const context = getLastPanicContext();
|
|
56
|
+
* if (context) {
|
|
57
|
+
* console.error(`Panic at ${context.file}:${context.line}`);
|
|
58
|
+
* console.error(`In function: ${context.function}`);
|
|
59
|
+
* console.error(`Message: ${context.message}`);
|
|
60
|
+
* }
|
|
61
|
+
* }
|
|
98
62
|
* ```
|
|
99
63
|
*/
|
|
100
|
-
declare function
|
|
64
|
+
declare function getLastPanicContext(): PanicContext | null;
|
|
101
65
|
/**
|
|
102
|
-
*
|
|
66
|
+
* Returns the human-readable name for an error code.
|
|
103
67
|
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
68
|
+
* Maps numeric error codes to their string names, providing a consistent way
|
|
69
|
+
* to get error code names across all platforms.
|
|
106
70
|
*
|
|
107
|
-
* @param
|
|
108
|
-
* @
|
|
109
|
-
* @param config - Extraction configuration object. If null, uses default extraction settings.
|
|
110
|
-
* @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
|
|
111
|
-
* @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
|
|
112
|
-
* @throws {ParsingError} When document format is invalid or corrupted
|
|
113
|
-
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
114
|
-
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
115
|
-
* @throws {KreuzbergError} For other extraction-related failures
|
|
71
|
+
* @param code - The numeric error code (0-7)
|
|
72
|
+
* @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
|
|
116
73
|
*
|
|
117
74
|
* @example
|
|
118
75
|
* ```typescript
|
|
119
|
-
* import {
|
|
120
|
-
*
|
|
121
|
-
* // Basic usage
|
|
122
|
-
* const result = await extractFile('document.pdf');
|
|
123
|
-
* console.log(result.content);
|
|
76
|
+
* import { getErrorCodeName } from '@kreuzberg/node';
|
|
124
77
|
*
|
|
125
|
-
* //
|
|
126
|
-
* const
|
|
127
|
-
*
|
|
128
|
-
* maxChars: 1000,
|
|
129
|
-
* maxOverlap: 200,
|
|
130
|
-
* },
|
|
131
|
-
* };
|
|
132
|
-
* const result2 = await extractFile('long_document.pdf', null, config);
|
|
133
|
-
* console.log(result2.chunks); // Array of text chunks
|
|
78
|
+
* const name = getErrorCodeName(0); // returns "validation"
|
|
79
|
+
* const name = getErrorCodeName(2); // returns "ocr"
|
|
80
|
+
* const name = getErrorCodeName(99); // returns "unknown"
|
|
134
81
|
* ```
|
|
135
82
|
*/
|
|
136
|
-
declare function
|
|
83
|
+
declare function getErrorCodeName(code: number): string;
|
|
137
84
|
/**
|
|
138
|
-
*
|
|
85
|
+
* Returns the description for an error code.
|
|
139
86
|
*
|
|
140
|
-
*
|
|
141
|
-
* which provides better performance and memory management.
|
|
87
|
+
* Retrieves user-friendly descriptions of error types from the FFI layer.
|
|
142
88
|
*
|
|
143
|
-
* @param
|
|
144
|
-
* @
|
|
145
|
-
* @param config - Extraction configuration object. If null, uses default extraction settings.
|
|
146
|
-
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
|
|
147
|
-
* @throws {TypeError} When data is not a valid Uint8Array
|
|
148
|
-
* @throws {Error} When file cannot be read or parsed
|
|
149
|
-
* @throws {ParsingError} When document format is invalid or corrupted
|
|
150
|
-
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
151
|
-
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
152
|
-
* @throws {KreuzbergError} For other extraction-related failures
|
|
89
|
+
* @param code - The numeric error code (0-7)
|
|
90
|
+
* @returns A brief description of the error type
|
|
153
91
|
*
|
|
154
92
|
* @example
|
|
155
93
|
* ```typescript
|
|
156
|
-
* import {
|
|
157
|
-
* import { readFileSync } from 'fs';
|
|
94
|
+
* import { getErrorCodeDescription } from '@kreuzberg/node';
|
|
158
95
|
*
|
|
159
|
-
* const
|
|
160
|
-
* const
|
|
161
|
-
*
|
|
96
|
+
* const desc = getErrorCodeDescription(0); // returns "Input validation error"
|
|
97
|
+
* const desc = getErrorCodeDescription(4); // returns "File system I/O error"
|
|
98
|
+
* const desc = getErrorCodeDescription(99); // returns "Unknown error code"
|
|
162
99
|
* ```
|
|
163
100
|
*/
|
|
164
|
-
declare function
|
|
101
|
+
declare function getErrorCodeDescription(code: number): string;
|
|
165
102
|
/**
|
|
166
|
-
*
|
|
103
|
+
* Classifies an error message string into an error code category.
|
|
167
104
|
*
|
|
168
|
-
*
|
|
169
|
-
*
|
|
105
|
+
* This function analyzes the error message content and returns the most likely
|
|
106
|
+
* error code (0-7) based on keyword patterns. Used to programmatically classify
|
|
107
|
+
* errors for handling purposes.
|
|
170
108
|
*
|
|
171
|
-
*
|
|
172
|
-
*
|
|
173
|
-
*
|
|
174
|
-
*
|
|
175
|
-
*
|
|
176
|
-
*
|
|
177
|
-
*
|
|
178
|
-
*
|
|
179
|
-
*
|
|
180
|
-
*
|
|
109
|
+
* The classification is based on keyword matching:
|
|
110
|
+
* - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
|
|
111
|
+
* - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
|
|
112
|
+
* - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
|
|
113
|
+
* - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
|
|
114
|
+
* - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
|
|
115
|
+
* - **Plugin (5)**: Keywords like "plugin", "register", "extension"
|
|
116
|
+
* - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
|
|
117
|
+
* - **Internal (7)**: Keywords like "internal", "bug", "panic"
|
|
118
|
+
*
|
|
119
|
+
* @param errorMessage - The error message string to classify
|
|
120
|
+
* @returns An object with the classification details
|
|
181
121
|
*
|
|
182
122
|
* @example
|
|
183
123
|
* ```typescript
|
|
184
|
-
* import {
|
|
185
|
-
* import { readFile } from 'fs/promises';
|
|
124
|
+
* import { classifyError } from '@kreuzberg/node';
|
|
186
125
|
*
|
|
187
|
-
* const
|
|
188
|
-
*
|
|
189
|
-
*
|
|
126
|
+
* const result = classifyError("PDF file is corrupted");
|
|
127
|
+
* // Returns: { code: 1, name: "parsing", confidence: 0.95 }
|
|
128
|
+
*
|
|
129
|
+
* const result = classifyError("Tesseract not found");
|
|
130
|
+
* // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
|
|
190
131
|
* ```
|
|
191
132
|
*/
|
|
192
|
-
declare function
|
|
133
|
+
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Batch extraction APIs for processing multiple documents.
|
|
137
|
+
*
|
|
138
|
+
* This module provides synchronous and asynchronous functions for extracting content
|
|
139
|
+
* from multiple files or byte arrays in parallel. Batch operations offer better
|
|
140
|
+
* performance and memory management compared to calling single extraction functions
|
|
141
|
+
* in a loop.
|
|
142
|
+
*
|
|
143
|
+
* **Benefits of Batch Processing**:
|
|
144
|
+
* - Parallel processing in Rust for maximum performance
|
|
145
|
+
* - Optimized memory usage across all extractions
|
|
146
|
+
* - More reliable for large-scale document processing
|
|
147
|
+
*
|
|
148
|
+
* @internal This module is part of Layer 2 (extraction APIs).
|
|
149
|
+
*/
|
|
150
|
+
|
|
193
151
|
/**
|
|
194
152
|
* Extract content from multiple files in parallel (synchronous).
|
|
195
153
|
*
|
|
@@ -222,7 +180,7 @@ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string,
|
|
|
222
180
|
* });
|
|
223
181
|
* ```
|
|
224
182
|
*/
|
|
225
|
-
declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
|
|
183
|
+
declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig | null): ExtractionResult[];
|
|
226
184
|
/**
|
|
227
185
|
* Extract content from multiple files in parallel (asynchronous).
|
|
228
186
|
*
|
|
@@ -258,7 +216,7 @@ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfi
|
|
|
258
216
|
* .reduce((a, b) => a + b, 0);
|
|
259
217
|
* ```
|
|
260
218
|
*/
|
|
261
|
-
declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
|
|
219
|
+
declare function batchExtractFiles(paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
262
220
|
/**
|
|
263
221
|
* Extract content from multiple byte arrays in parallel (synchronous).
|
|
264
222
|
*
|
|
@@ -296,7 +254,7 @@ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1
|
|
|
296
254
|
* });
|
|
297
255
|
* ```
|
|
298
256
|
*/
|
|
299
|
-
declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
|
|
257
|
+
declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): ExtractionResult[];
|
|
300
258
|
/**
|
|
301
259
|
* Extract content from multiple byte arrays in parallel (asynchronous).
|
|
302
260
|
*
|
|
@@ -338,55 +296,355 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
|
|
|
338
296
|
* .reduce((a, b) => a + b, 0);
|
|
339
297
|
* ```
|
|
340
298
|
*/
|
|
341
|
-
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
|
|
299
|
+
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
300
|
+
|
|
342
301
|
/**
|
|
343
|
-
*
|
|
302
|
+
* Single-document extraction APIs.
|
|
344
303
|
*
|
|
345
|
-
*
|
|
346
|
-
*
|
|
347
|
-
*
|
|
304
|
+
* This module provides synchronous and asynchronous functions for extracting content
|
|
305
|
+
* from a single file or byte array. These are convenience wrappers around the native
|
|
306
|
+
* binding that handle config normalization and result conversion.
|
|
348
307
|
*
|
|
349
|
-
*
|
|
350
|
-
*
|
|
351
|
-
*
|
|
308
|
+
* **Usage Note**: For processing multiple files, prefer batch extraction functions
|
|
309
|
+
* (`batchExtractFiles`, `batchExtractFilesSync`) which provide better performance
|
|
310
|
+
* and memory management.
|
|
352
311
|
*
|
|
353
|
-
* @
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
312
|
+
* @internal This module is part of Layer 2 (extraction APIs).
|
|
313
|
+
*/
|
|
314
|
+
|
|
315
|
+
/**
|
|
316
|
+
* Extract content from a single file (synchronous).
|
|
357
317
|
*
|
|
358
|
-
*
|
|
359
|
-
*
|
|
360
|
-
* import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
|
|
318
|
+
* **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
|
|
319
|
+
* provides better performance and memory management.
|
|
361
320
|
*
|
|
362
|
-
*
|
|
363
|
-
*
|
|
364
|
-
*
|
|
365
|
-
*
|
|
321
|
+
* @param filePath - Path to the file to extract (string). Can be absolute or relative.
|
|
322
|
+
* @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
|
|
323
|
+
* If a string, treated as MIME type. If an object, treated as ExtractionConfig.
|
|
324
|
+
* If null, MIME type is auto-detected from file extension or content.
|
|
325
|
+
* @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
|
|
326
|
+
* Only used if second parameter is a MIME type string.
|
|
327
|
+
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
|
|
328
|
+
* @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
|
|
329
|
+
* @throws {ParsingError} When document format is invalid or corrupted
|
|
330
|
+
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
331
|
+
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
332
|
+
* @throws {KreuzbergError} For other extraction-related failures
|
|
366
333
|
*
|
|
367
|
-
*
|
|
368
|
-
*
|
|
369
|
-
*
|
|
370
|
-
* }
|
|
334
|
+
* @example
|
|
335
|
+
* ```typescript
|
|
336
|
+
* import { extractFileSync } from '@kreuzberg/node';
|
|
371
337
|
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
374
|
-
*
|
|
375
|
-
* }
|
|
338
|
+
* // Basic usage
|
|
339
|
+
* const result = extractFileSync('document.pdf');
|
|
340
|
+
* console.log(result.content);
|
|
376
341
|
*
|
|
377
|
-
*
|
|
342
|
+
* // With explicit MIME type
|
|
343
|
+
* const result2 = extractFileSync('document.pdf', 'application/pdf');
|
|
378
344
|
*
|
|
379
|
-
* //
|
|
380
|
-
* const
|
|
381
|
-
*
|
|
345
|
+
* // With configuration
|
|
346
|
+
* const result3 = extractFileSync('document.pdf', {
|
|
347
|
+
* chunking: {
|
|
348
|
+
* maxChars: 1000,
|
|
349
|
+
* maxOverlap: 200,
|
|
350
|
+
* },
|
|
351
|
+
* });
|
|
382
352
|
* ```
|
|
383
353
|
*/
|
|
384
|
-
declare function
|
|
354
|
+
declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): ExtractionResult;
|
|
385
355
|
/**
|
|
386
|
-
*
|
|
356
|
+
* Extract content from a single file (asynchronous).
|
|
387
357
|
*
|
|
388
|
-
*
|
|
389
|
-
*
|
|
358
|
+
* **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
|
|
359
|
+
* provides better performance and memory management.
|
|
360
|
+
*
|
|
361
|
+
* @param filePath - Path to the file to extract (string). Can be absolute or relative.
|
|
362
|
+
* @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
|
|
363
|
+
* If a string, treated as MIME type. If an object, treated as ExtractionConfig.
|
|
364
|
+
* If null, MIME type is auto-detected from file extension or content.
|
|
365
|
+
* @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
|
|
366
|
+
* Only used if second parameter is a MIME type string.
|
|
367
|
+
* @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
|
|
368
|
+
* @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
|
|
369
|
+
* @throws {ParsingError} When document format is invalid or corrupted
|
|
370
|
+
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
371
|
+
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
372
|
+
* @throws {KreuzbergError} For other extraction-related failures
|
|
373
|
+
*
|
|
374
|
+
* @example
|
|
375
|
+
* ```typescript
|
|
376
|
+
* import { extractFile } from '@kreuzberg/node';
|
|
377
|
+
*
|
|
378
|
+
* // Basic usage
|
|
379
|
+
* const result = await extractFile('document.pdf');
|
|
380
|
+
* console.log(result.content);
|
|
381
|
+
*
|
|
382
|
+
* // With chunking enabled
|
|
383
|
+
* const config = {
|
|
384
|
+
* chunking: {
|
|
385
|
+
* maxChars: 1000,
|
|
386
|
+
* maxOverlap: 200,
|
|
387
|
+
* },
|
|
388
|
+
* };
|
|
389
|
+
* const result2 = await extractFile('long_document.pdf', null, config);
|
|
390
|
+
* console.log(result2.chunks); // Array of text chunks
|
|
391
|
+
* ```
|
|
392
|
+
*/
|
|
393
|
+
declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
394
|
+
/**
|
|
395
|
+
* Extract content from raw bytes (synchronous).
|
|
396
|
+
*
|
|
397
|
+
* **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
|
|
398
|
+
* which provides better performance and memory management.
|
|
399
|
+
*
|
|
400
|
+
* @param data - File content as Uint8Array (Buffer will be converted)
|
|
401
|
+
* @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
|
|
402
|
+
* @param config - Extraction configuration object. If null, uses default extraction settings.
|
|
403
|
+
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
|
|
404
|
+
* @throws {TypeError} When data is not a valid Uint8Array
|
|
405
|
+
* @throws {Error} When file cannot be read or parsed
|
|
406
|
+
* @throws {ParsingError} When document format is invalid or corrupted
|
|
407
|
+
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
408
|
+
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
409
|
+
* @throws {KreuzbergError} For other extraction-related failures
|
|
410
|
+
*
|
|
411
|
+
* @example
|
|
412
|
+
* ```typescript
|
|
413
|
+
* import { extractBytesSync } from '@kreuzberg/node';
|
|
414
|
+
* import { readFileSync } from 'fs';
|
|
415
|
+
*
|
|
416
|
+
* const data = readFileSync('document.pdf');
|
|
417
|
+
* const result = extractBytesSync(data, 'application/pdf');
|
|
418
|
+
* console.log(result.content);
|
|
419
|
+
* ```
|
|
420
|
+
*/
|
|
421
|
+
declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
|
|
422
|
+
/**
|
|
423
|
+
* Extract content from raw bytes (asynchronous).
|
|
424
|
+
*
|
|
425
|
+
* **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
|
|
426
|
+
* which provides better performance and memory management.
|
|
427
|
+
*
|
|
428
|
+
* @param data - File content as Uint8Array (Buffer will be converted)
|
|
429
|
+
* @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
|
|
430
|
+
* @param config - Extraction configuration object. If null, uses default extraction settings.
|
|
431
|
+
* @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
|
|
432
|
+
* @throws {TypeError} When data is not a valid Uint8Array
|
|
433
|
+
* @throws {Error} When file cannot be read or parsed
|
|
434
|
+
* @throws {ParsingError} When document format is invalid or corrupted
|
|
435
|
+
* @throws {OcrError} When OCR processing fails (if OCR is enabled)
|
|
436
|
+
* @throws {ValidationError} When extraction result fails validation (if validators registered)
|
|
437
|
+
* @throws {KreuzbergError} For other extraction-related failures
|
|
438
|
+
*
|
|
439
|
+
* @example
|
|
440
|
+
* ```typescript
|
|
441
|
+
* import { extractBytes } from '@kreuzberg/node';
|
|
442
|
+
* import { readFile } from 'fs/promises';
|
|
443
|
+
*
|
|
444
|
+
* const data = await readFile('document.pdf');
|
|
445
|
+
* const result = await extractBytes(data, 'application/pdf');
|
|
446
|
+
* console.log(result.content);
|
|
447
|
+
* ```
|
|
448
|
+
*/
|
|
449
|
+
declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
450
|
+
|
|
451
|
+
/**
|
|
452
|
+
* Worker pool management for concurrent document extraction.
|
|
453
|
+
*
|
|
454
|
+
* This module provides utilities for creating and managing worker pools that enable
|
|
455
|
+
* concurrent extraction of documents using Node.js worker threads. Worker pools allow
|
|
456
|
+
* multiple extraction operations to run in parallel with configurable pool sizes.
|
|
457
|
+
*
|
|
458
|
+
* **Usage Pattern**:
|
|
459
|
+
* 1. Create a pool with `createWorkerPool(size)`
|
|
460
|
+
* 2. Submit tasks with `extractFileInWorker()` or `batchExtractFilesInWorker()`
|
|
461
|
+
* 3. Close the pool with `closeWorkerPool()` when done
|
|
462
|
+
*
|
|
463
|
+
* @internal This module is part of Layer 2 (extraction APIs).
|
|
464
|
+
*/
|
|
465
|
+
|
|
466
|
+
/**
|
|
467
|
+
* Create a new worker pool for concurrent extraction operations.
|
|
468
|
+
*
|
|
469
|
+
* Creates a pool of worker threads that can process extraction tasks concurrently.
|
|
470
|
+
* The pool manages a queue of pending tasks and distributes them across available workers.
|
|
471
|
+
*
|
|
472
|
+
* @param size - Optional number of workers in the pool. If not specified, defaults to the number of CPU cores.
|
|
473
|
+
* @returns WorkerPool instance that can be used with extraction functions
|
|
474
|
+
*
|
|
475
|
+
* @example
|
|
476
|
+
* ```typescript
|
|
477
|
+
* import { createWorkerPool } from '@kreuzberg/node';
|
|
478
|
+
*
|
|
479
|
+
* // Create pool with default size (number of CPU cores)
|
|
480
|
+
* const pool = createWorkerPool();
|
|
481
|
+
*
|
|
482
|
+
* // Create pool with 4 workers
|
|
483
|
+
* const pool4 = createWorkerPool(4);
|
|
484
|
+
* ```
|
|
485
|
+
*/
|
|
486
|
+
declare function createWorkerPool(size?: number): WorkerPool;
|
|
487
|
+
/**
|
|
488
|
+
* Get statistics about a worker pool.
|
|
489
|
+
*
|
|
490
|
+
* Returns information about the pool's current state, including the number of active workers,
|
|
491
|
+
* queued tasks, and total processed tasks.
|
|
492
|
+
*
|
|
493
|
+
* @param pool - The worker pool instance
|
|
494
|
+
* @returns WorkerPoolStats with pool information
|
|
495
|
+
*
|
|
496
|
+
* @example
|
|
497
|
+
* ```typescript
|
|
498
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
499
|
+
*
|
|
500
|
+
* const pool = createWorkerPool(4);
|
|
501
|
+
* const stats = getWorkerPoolStats(pool);
|
|
502
|
+
*
|
|
503
|
+
* console.log(`Pool size: ${stats.size}`);
|
|
504
|
+
* console.log(`Active workers: ${stats.activeWorkers}`);
|
|
505
|
+
* console.log(`Queued tasks: ${stats.queuedTasks}`);
|
|
506
|
+
* ```
|
|
507
|
+
*/
|
|
508
|
+
declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
|
|
509
|
+
/**
|
|
510
|
+
* Extract content from a single file using a worker pool (asynchronous).
|
|
511
|
+
*
|
|
512
|
+
* Submits an extraction task to the worker pool. The task is executed by one of the
|
|
513
|
+
* available workers in the background, allowing other tasks to be processed concurrently.
|
|
514
|
+
*
|
|
515
|
+
* @param pool - The worker pool instance
|
|
516
|
+
* @param filePath - Path to the file to extract
|
|
517
|
+
* @param mimeTypeOrConfig - Optional MIME type or extraction configuration.
|
|
518
|
+
* If a string, treated as MIME type. If an object, treated as ExtractionConfig.
|
|
519
|
+
* If null, MIME type is auto-detected from file extension or content.
|
|
520
|
+
* @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
|
|
521
|
+
* Only used if second parameter is a MIME type string.
|
|
522
|
+
* @returns Promise<ExtractionResult> containing extracted content and metadata
|
|
523
|
+
*
|
|
524
|
+
* @throws {Error} If the file cannot be read or extraction fails
|
|
525
|
+
*
|
|
526
|
+
* @example
|
|
527
|
+
* ```typescript
|
|
528
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
529
|
+
*
|
|
530
|
+
* const pool = createWorkerPool(4);
|
|
531
|
+
*
|
|
532
|
+
* try {
|
|
533
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
534
|
+
* const results = await Promise.all(
|
|
535
|
+
* files.map(f => extractFileInWorker(pool, f))
|
|
536
|
+
* );
|
|
537
|
+
*
|
|
538
|
+
* results.forEach((r, i) => {
|
|
539
|
+
* console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
|
|
540
|
+
* });
|
|
541
|
+
* } finally {
|
|
542
|
+
* await closeWorkerPool(pool);
|
|
543
|
+
* }
|
|
544
|
+
* ```
|
|
545
|
+
*/
|
|
546
|
+
declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
547
|
+
/**
|
|
548
|
+
* Extract content from multiple files in parallel using a worker pool (asynchronous).
|
|
549
|
+
*
|
|
550
|
+
* Submits multiple extraction tasks to the worker pool for concurrent processing.
|
|
551
|
+
* This is more efficient than using `extractFileInWorker` multiple times sequentially.
|
|
552
|
+
*
|
|
553
|
+
* @param pool - The worker pool instance
|
|
554
|
+
* @param paths - Array of file paths to extract
|
|
555
|
+
* @param config - Extraction configuration object (applies to all files). If null, uses default extraction settings.
|
|
556
|
+
* @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
|
|
557
|
+
*
|
|
558
|
+
* @throws {Error} If any file cannot be read or extraction fails
|
|
559
|
+
*
|
|
560
|
+
* @example
|
|
561
|
+
* ```typescript
|
|
562
|
+
* import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
563
|
+
*
|
|
564
|
+
* const pool = createWorkerPool(4);
|
|
565
|
+
*
|
|
566
|
+
* try {
|
|
567
|
+
* const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
|
|
568
|
+
* const results = await batchExtractFilesInWorker(pool, files, {
|
|
569
|
+
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
570
|
+
* });
|
|
571
|
+
*
|
|
572
|
+
* const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
|
|
573
|
+
* console.log(`Total: $${total}`);
|
|
574
|
+
* } finally {
|
|
575
|
+
* await closeWorkerPool(pool);
|
|
576
|
+
* }
|
|
577
|
+
* ```
|
|
578
|
+
*/
|
|
579
|
+
declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
580
|
+
/**
|
|
581
|
+
* Close a worker pool and shut down all worker threads.
|
|
582
|
+
*
|
|
583
|
+
* Should be called when the pool is no longer needed to clean up resources
|
|
584
|
+
* and gracefully shut down worker threads. Any pending tasks will be cancelled.
|
|
585
|
+
*
|
|
586
|
+
* @param pool - The worker pool instance to close
|
|
587
|
+
* @returns Promise that resolves when the pool is fully closed
|
|
588
|
+
*
|
|
589
|
+
* @throws {Error} If pool shutdown fails
|
|
590
|
+
*
|
|
591
|
+
* @example
|
|
592
|
+
* ```typescript
|
|
593
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
594
|
+
*
|
|
595
|
+
* const pool = createWorkerPool(4);
|
|
596
|
+
*
|
|
597
|
+
* try {
|
|
598
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
599
|
+
* console.log(result.content);
|
|
600
|
+
* } finally {
|
|
601
|
+
* // Clean up the pool
|
|
602
|
+
* await closeWorkerPool(pool);
|
|
603
|
+
* }
|
|
604
|
+
* ```
|
|
605
|
+
*/
|
|
606
|
+
declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
|
|
607
|
+
|
|
608
|
+
/**
|
|
609
|
+
* Register a custom post-processor.
|
|
610
|
+
*
|
|
611
|
+
* Post-processors allow you to hook into the extraction pipeline and transform
|
|
612
|
+
* the extraction results. They run after the core extraction is complete.
|
|
613
|
+
*
|
|
614
|
+
* Post-processors are async and can modify extraction results before they are
|
|
615
|
+
* returned to the caller.
|
|
616
|
+
*
|
|
617
|
+
* @param processor - Post-processor implementing PostProcessorProtocol
|
|
618
|
+
*
|
|
619
|
+
* @example
|
|
620
|
+
* ```typescript
|
|
621
|
+
* import { registerPostProcessor, extractFile } from '@kreuzberg/node';
|
|
622
|
+
*
|
|
623
|
+
* class CustomProcessor {
|
|
624
|
+
* name() {
|
|
625
|
+
* return 'custom_processor';
|
|
626
|
+
* }
|
|
627
|
+
* processingStage() {
|
|
628
|
+
* return 'post';
|
|
629
|
+
* }
|
|
630
|
+
* async process(result) {
|
|
631
|
+
* // Add custom metadata
|
|
632
|
+
* result.metadata.customField = 'custom_value';
|
|
633
|
+
* return result;
|
|
634
|
+
* }
|
|
635
|
+
* }
|
|
636
|
+
*
|
|
637
|
+
* // Use async extraction (required for custom processors)
|
|
638
|
+
* const result = await extractFile('document.pdf');
|
|
639
|
+
* console.log(result.metadata.customField); // 'custom_value'
|
|
640
|
+
* ```
|
|
641
|
+
*/
|
|
642
|
+
declare function registerPostProcessor(processor: PostProcessorProtocol): void;
|
|
643
|
+
/**
|
|
644
|
+
* Unregister a postprocessor by name.
|
|
645
|
+
*
|
|
646
|
+
* Removes a previously registered postprocessor from the registry.
|
|
647
|
+
* If the processor doesn't exist, this is a no-op (does not throw).
|
|
390
648
|
*
|
|
391
649
|
* @param name - Name of the processor to unregister (case-sensitive)
|
|
392
650
|
*
|
|
@@ -428,6 +686,7 @@ declare function clearPostProcessors(): void;
|
|
|
428
686
|
* ```
|
|
429
687
|
*/
|
|
430
688
|
declare function listPostProcessors(): string[];
|
|
689
|
+
|
|
431
690
|
/**
|
|
432
691
|
* Register a custom validator.
|
|
433
692
|
*
|
|
@@ -435,27 +694,26 @@ declare function listPostProcessors(): string[];
|
|
|
435
694
|
* Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
|
|
436
695
|
* the extraction fails immediately.
|
|
437
696
|
*
|
|
438
|
-
*
|
|
439
|
-
*
|
|
440
|
-
* @
|
|
441
|
-
* @throws {Error} If a validator with the same name is already registered
|
|
697
|
+
* Validators are async and run after post-processors in the extraction pipeline.
|
|
698
|
+
*
|
|
699
|
+
* @param validator - Validator implementing ValidatorProtocol
|
|
442
700
|
*
|
|
443
701
|
* @example
|
|
444
702
|
* ```typescript
|
|
445
|
-
* import { registerValidator } from '@kreuzberg/node';
|
|
703
|
+
* import { registerValidator, extractFile } from '@kreuzberg/node';
|
|
446
704
|
*
|
|
447
|
-
* class MinLengthValidator
|
|
448
|
-
* name()
|
|
705
|
+
* class MinLengthValidator {
|
|
706
|
+
* name() {
|
|
449
707
|
* return 'min_length_validator';
|
|
450
708
|
* }
|
|
451
709
|
*
|
|
452
|
-
* priority()
|
|
453
|
-
* return 100;
|
|
710
|
+
* priority() {
|
|
711
|
+
* return 100;
|
|
454
712
|
* }
|
|
455
713
|
*
|
|
456
|
-
* validate(result
|
|
457
|
-
* if (result.content.length <
|
|
458
|
-
* throw new Error('Content too short
|
|
714
|
+
* async validate(result) {
|
|
715
|
+
* if (result.content.length < 10) {
|
|
716
|
+
* throw new Error('Content too short');
|
|
459
717
|
* }
|
|
460
718
|
* }
|
|
461
719
|
* }
|
|
@@ -510,20 +768,93 @@ declare function clearValidators(): void;
|
|
|
510
768
|
* ```
|
|
511
769
|
*/
|
|
512
770
|
declare function listValidators(): string[];
|
|
513
|
-
|
|
771
|
+
|
|
514
772
|
/**
|
|
515
|
-
*
|
|
773
|
+
* Register a custom OCR backend.
|
|
516
774
|
*
|
|
517
|
-
*
|
|
518
|
-
*
|
|
775
|
+
* This function registers a JavaScript OCR backend that will be used by Kreuzberg's
|
|
776
|
+
* extraction pipeline when OCR is enabled. The backend must implement the
|
|
777
|
+
* {@link OcrBackendProtocol} interface.
|
|
519
778
|
*
|
|
520
|
-
*
|
|
779
|
+
* ## Usage
|
|
780
|
+
*
|
|
781
|
+
* 1. Create a class implementing {@link OcrBackendProtocol}
|
|
782
|
+
* 2. Call `initialize()` on your backend instance (if needed)
|
|
783
|
+
* 3. Register the backend with `registerOcrBackend()`
|
|
784
|
+
* 4. Use the backend name in extraction config
|
|
785
|
+
*
|
|
786
|
+
* ## Thread Safety
|
|
787
|
+
*
|
|
788
|
+
* The registered backend must be thread-safe as it may be called concurrently
|
|
789
|
+
* from multiple Rust async tasks. Ensure your implementation handles concurrent
|
|
790
|
+
* calls properly.
|
|
791
|
+
*
|
|
792
|
+
* @param backend - OcrBackendProtocol implementation with name(), supportedLanguages(), and processImage()
|
|
793
|
+
* @throws {Error} If backend is missing required methods (name, supportedLanguages, or processImage)
|
|
794
|
+
* @throws {Error} If backend name is empty string or contains invalid characters
|
|
795
|
+
* @throws {Error} If a backend with the same name is already registered
|
|
796
|
+
* @throws {Error} If registration fails due to FFI issues
|
|
521
797
|
*
|
|
522
798
|
* @example
|
|
523
799
|
* ```typescript
|
|
524
|
-
* import {
|
|
800
|
+
* import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
|
|
801
|
+
* import { registerOcrBackend, extractFile } from '@kreuzberg/node';
|
|
525
802
|
*
|
|
526
|
-
*
|
|
803
|
+
* // Create and initialize backend
|
|
804
|
+
* const backend = new GutenOcrBackend();
|
|
805
|
+
* await backend.initialize();
|
|
806
|
+
*
|
|
807
|
+
* // Register with Kreuzberg
|
|
808
|
+
* registerOcrBackend(backend);
|
|
809
|
+
*
|
|
810
|
+
* // Use in extraction
|
|
811
|
+
* const result = await extractFile('scanned.pdf', null, {
|
|
812
|
+
* ocr: { backend: 'guten-ocr', language: 'en' }
|
|
813
|
+
* });
|
|
814
|
+
* console.log(result.content);
|
|
815
|
+
* ```
|
|
816
|
+
*
|
|
817
|
+
* @example
|
|
818
|
+
* ```typescript
|
|
819
|
+
* import { registerOcrBackend } from '@kreuzberg/node';
|
|
820
|
+
*
|
|
821
|
+
* class MyOcrBackend {
|
|
822
|
+
* name() {
|
|
823
|
+
* return 'my-ocr';
|
|
824
|
+
* }
|
|
825
|
+
*
|
|
826
|
+
* supportedLanguages(): string[] {
|
|
827
|
+
* return ['en', 'de', 'fr'];
|
|
828
|
+
* }
|
|
829
|
+
*
|
|
830
|
+
* async processImage(imageBytes: Uint8Array, language: string) {
|
|
831
|
+
* const text = await myCustomOcrEngine(imageBytes, language);
|
|
832
|
+
* return {
|
|
833
|
+
* content: text,
|
|
834
|
+
* mime_type: 'text/plain',
|
|
835
|
+
* metadata: { confidence: 0.95, language },
|
|
836
|
+
* tables: []
|
|
837
|
+
* };
|
|
838
|
+
* }
|
|
839
|
+
* }
|
|
840
|
+
*
|
|
841
|
+
* registerOcrBackend(new MyOcrBackend());
|
|
842
|
+
* ```
|
|
843
|
+
*/
|
|
844
|
+
declare function registerOcrBackend(backend: OcrBackendProtocol): void;
|
|
845
|
+
/**
|
|
846
|
+
* List all registered OCR backends.
|
|
847
|
+
*
|
|
848
|
+
* Returns an array of names of all currently registered OCR backends,
|
|
849
|
+
* including built-in backends like "tesseract".
|
|
850
|
+
*
|
|
851
|
+
* @returns Array of OCR backend names (empty array if none registered)
|
|
852
|
+
*
|
|
853
|
+
* @example
|
|
854
|
+
* ```typescript
|
|
855
|
+
* import { listOcrBackends } from '@kreuzberg/node';
|
|
856
|
+
*
|
|
857
|
+
* const backends = listOcrBackends();
|
|
527
858
|
* console.log(backends); // ['tesseract', 'my-custom-backend', ...]
|
|
528
859
|
* ```
|
|
529
860
|
*/
|
|
@@ -560,6 +891,7 @@ declare function unregisterOcrBackend(name: string): void;
|
|
|
560
891
|
* ```
|
|
561
892
|
*/
|
|
562
893
|
declare function clearOcrBackends(): void;
|
|
894
|
+
|
|
563
895
|
/**
|
|
564
896
|
* List all registered document extractors.
|
|
565
897
|
*
|
|
@@ -573,7 +905,7 @@ declare function clearOcrBackends(): void;
|
|
|
573
905
|
* import { listDocumentExtractors } from '@kreuzberg/node';
|
|
574
906
|
*
|
|
575
907
|
* const extractors = listDocumentExtractors();
|
|
576
|
-
* console.log(extractors); // ['
|
|
908
|
+
* console.log(extractors); // ['pdf', 'docx', 'xlsx', 'custom-extractor', ...]
|
|
577
909
|
* ```
|
|
578
910
|
*/
|
|
579
911
|
declare function listDocumentExtractors(): string[];
|
|
@@ -609,87 +941,26 @@ declare function unregisterDocumentExtractor(name: string): void;
|
|
|
609
941
|
* ```
|
|
610
942
|
*/
|
|
611
943
|
declare function clearDocumentExtractors(): void;
|
|
944
|
+
|
|
612
945
|
/**
|
|
613
|
-
*
|
|
946
|
+
* Load extraction configuration from a file.
|
|
614
947
|
*
|
|
615
|
-
*
|
|
616
|
-
*
|
|
948
|
+
* @param filePath - Path to the configuration file
|
|
949
|
+
* @returns ExtractionConfig object loaded from the file
|
|
617
950
|
*
|
|
618
|
-
*
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
*
|
|
951
|
+
* @deprecated Use ExtractionConfig.fromFile() instead
|
|
952
|
+
*/
|
|
953
|
+
declare function loadConfigFile(filePath: string): ExtractionConfig;
|
|
954
|
+
/**
|
|
955
|
+
* Load extraction configuration from a specified path.
|
|
623
956
|
*
|
|
624
|
-
*
|
|
625
|
-
*
|
|
957
|
+
* @param path - Path to the configuration file or directory
|
|
958
|
+
* @returns ExtractionConfig object or null
|
|
626
959
|
*
|
|
627
|
-
*
|
|
628
|
-
* const config2 = {
|
|
629
|
-
* chunking: { maxChars: 2048 },
|
|
630
|
-
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
631
|
-
* };
|
|
632
|
-
*
|
|
633
|
-
* // Use with extraction
|
|
634
|
-
* const result = await extractFile('document.pdf', null, config2);
|
|
635
|
-
* ```
|
|
960
|
+
* @deprecated Use ExtractionConfig.fromFile() or ExtractionConfig.discover() instead
|
|
636
961
|
*/
|
|
637
|
-
declare
|
|
638
|
-
|
|
639
|
-
* Load extraction configuration from a file.
|
|
640
|
-
*
|
|
641
|
-
* Automatically detects the file format based on extension:
|
|
642
|
-
* - `.toml` - TOML format
|
|
643
|
-
* - `.yaml` - YAML format
|
|
644
|
-
* - `.json` - JSON format
|
|
645
|
-
*
|
|
646
|
-
* @param filePath - Path to the configuration file (absolute or relative)
|
|
647
|
-
* @returns ExtractionConfig object loaded from the file
|
|
648
|
-
*
|
|
649
|
-
* @throws {Error} If file does not exist or is not accessible
|
|
650
|
-
* @throws {Error} If file content is not valid TOML/YAML/JSON
|
|
651
|
-
* @throws {Error} If configuration structure is invalid
|
|
652
|
-
* @throws {Error} If file extension is not supported
|
|
653
|
-
*
|
|
654
|
-
* @example
|
|
655
|
-
* ```typescript
|
|
656
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
657
|
-
*
|
|
658
|
-
* // Load from TOML file
|
|
659
|
-
* const config1 = ExtractionConfig.fromFile('kreuzberg.toml');
|
|
660
|
-
*
|
|
661
|
-
* // Load from YAML file
|
|
662
|
-
* const config2 = ExtractionConfig.fromFile('./config.yaml');
|
|
663
|
-
*
|
|
664
|
-
* // Load from JSON file
|
|
665
|
-
* const config3 = ExtractionConfig.fromFile('./config.json');
|
|
666
|
-
* ```
|
|
667
|
-
*/
|
|
668
|
-
fromFile(filePath: string): ExtractionConfig$1;
|
|
669
|
-
/**
|
|
670
|
-
* Discover and load configuration from current or parent directories.
|
|
671
|
-
*
|
|
672
|
-
* Searches for a `kreuzberg.toml` file starting from the current working directory
|
|
673
|
-
* and traversing up the directory tree. Returns the first configuration file found.
|
|
674
|
-
*
|
|
675
|
-
* @returns ExtractionConfig object if found, or null if no configuration file exists
|
|
676
|
-
*
|
|
677
|
-
* @example
|
|
678
|
-
* ```typescript
|
|
679
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
680
|
-
*
|
|
681
|
-
* // Try to find config in current or parent directories
|
|
682
|
-
* const config = ExtractionConfig.discover();
|
|
683
|
-
* if (config) {
|
|
684
|
-
* console.log('Found configuration');
|
|
685
|
-
* // Use config for extraction
|
|
686
|
-
* } else {
|
|
687
|
-
* console.log('No configuration file found, using defaults');
|
|
688
|
-
* }
|
|
689
|
-
* ```
|
|
690
|
-
*/
|
|
691
|
-
discover(): ExtractionConfig$1 | null;
|
|
692
|
-
};
|
|
962
|
+
declare function loadConfigFromPath(path: string): ExtractionConfig | null;
|
|
963
|
+
|
|
693
964
|
/**
|
|
694
965
|
* Detect MIME type from raw bytes.
|
|
695
966
|
*
|
|
@@ -800,6 +1071,7 @@ declare function validateMimeType(mimeType: string): string;
|
|
|
800
1071
|
* ```
|
|
801
1072
|
*/
|
|
802
1073
|
declare function getExtensionsForMime(mimeType: string): string[];
|
|
1074
|
+
|
|
803
1075
|
/**
|
|
804
1076
|
* Embedding preset configuration.
|
|
805
1077
|
*
|
|
@@ -820,28 +1092,29 @@ interface EmbeddingPreset {
|
|
|
820
1092
|
description: string;
|
|
821
1093
|
}
|
|
822
1094
|
/**
|
|
823
|
-
*
|
|
1095
|
+
* Get all available embedding presets.
|
|
824
1096
|
*
|
|
825
|
-
* Returns an array of
|
|
1097
|
+
* Returns an array of names of all available embedding model presets.
|
|
826
1098
|
*
|
|
827
|
-
* @returns Array of
|
|
1099
|
+
* @returns Array of preset names (e.g., ["fast", "balanced", "quality", "multilingual"])
|
|
828
1100
|
*
|
|
829
1101
|
* @example
|
|
830
1102
|
* ```typescript
|
|
831
1103
|
* import { listEmbeddingPresets } from '@kreuzberg/node';
|
|
832
1104
|
*
|
|
833
1105
|
* const presets = listEmbeddingPresets();
|
|
834
|
-
* console.log(
|
|
1106
|
+
* console.log('Available presets:', presets);
|
|
835
1107
|
* ```
|
|
836
1108
|
*/
|
|
837
1109
|
declare function listEmbeddingPresets(): string[];
|
|
838
1110
|
/**
|
|
839
|
-
* Get
|
|
1111
|
+
* Get embedding preset configuration by name.
|
|
840
1112
|
*
|
|
841
|
-
*
|
|
1113
|
+
* Retrieves the configuration for a specific embedding model preset.
|
|
1114
|
+
* Returns null if the preset doesn't exist.
|
|
842
1115
|
*
|
|
843
|
-
* @param name -
|
|
844
|
-
* @returns
|
|
1116
|
+
* @param name - Name of the preset (e.g., "balanced", "fast", "quality")
|
|
1117
|
+
* @returns EmbeddingPreset configuration if found, null otherwise
|
|
845
1118
|
*
|
|
846
1119
|
* @example
|
|
847
1120
|
* ```typescript
|
|
@@ -855,278 +1128,78 @@ declare function listEmbeddingPresets(): string[];
|
|
|
855
1128
|
* ```
|
|
856
1129
|
*/
|
|
857
1130
|
declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
|
|
1131
|
+
|
|
858
1132
|
/**
|
|
859
|
-
*
|
|
860
|
-
*
|
|
861
|
-
* Returns the FFI error code as an integer. This is useful for programmatic error handling
|
|
862
|
-
* and distinguishing between different types of failures in native code.
|
|
863
|
-
*
|
|
864
|
-
* Error codes:
|
|
865
|
-
* - 0: Success (no error)
|
|
866
|
-
* - 1: GenericError
|
|
867
|
-
* - 2: Panic
|
|
868
|
-
* - 3: InvalidArgument
|
|
869
|
-
* - 4: IoError
|
|
870
|
-
* - 5: ParsingError
|
|
871
|
-
* - 6: OcrError
|
|
872
|
-
* - 7: MissingDependency
|
|
873
|
-
*
|
|
874
|
-
* @returns The integer error code
|
|
875
|
-
*
|
|
876
|
-
* @example
|
|
877
|
-
* ```typescript
|
|
878
|
-
* import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
|
|
879
|
-
*
|
|
880
|
-
* try {
|
|
881
|
-
* const result = await extractFile('document.pdf');
|
|
882
|
-
* } catch (error) {
|
|
883
|
-
* const code = getLastErrorCode();
|
|
884
|
-
* if (code === ErrorCode.Panic) {
|
|
885
|
-
* console.error('Native code panic detected');
|
|
886
|
-
* }
|
|
887
|
-
* }
|
|
888
|
-
* ```
|
|
889
|
-
*/
|
|
890
|
-
declare function getLastErrorCode(): number;
|
|
891
|
-
/**
|
|
892
|
-
* Get panic context information if the last error was a panic.
|
|
893
|
-
*
|
|
894
|
-
* Returns detailed information about a panic in native code, or null if the last error was not a panic.
|
|
895
|
-
* This provides debugging information when native code panics.
|
|
896
|
-
*
|
|
897
|
-
* @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
|
|
898
|
-
*
|
|
899
|
-
* @example
|
|
900
|
-
* ```typescript
|
|
901
|
-
* import { extractFile, getLastPanicContext } from '@kreuzberg/node';
|
|
902
|
-
*
|
|
903
|
-
* try {
|
|
904
|
-
* const result = await extractFile('document.pdf');
|
|
905
|
-
* } catch (error) {
|
|
906
|
-
* const context = getLastPanicContext();
|
|
907
|
-
* if (context) {
|
|
908
|
-
* console.error(`Panic at ${context.file}:${context.line}`);
|
|
909
|
-
* console.error(`In function: ${context.function}`);
|
|
910
|
-
* console.error(`Message: ${context.message}`);
|
|
911
|
-
* }
|
|
912
|
-
* }
|
|
913
|
-
* ```
|
|
914
|
-
*/
|
|
915
|
-
declare function getLastPanicContext(): PanicContext | null;
|
|
916
|
-
/**
|
|
917
|
-
* Returns the human-readable name for an error code.
|
|
918
|
-
*
|
|
919
|
-
* Maps numeric error codes to their string names, providing a consistent way
|
|
920
|
-
* to get error code names across all platforms.
|
|
921
|
-
*
|
|
922
|
-
* @param code - The numeric error code (0-7)
|
|
923
|
-
* @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
|
|
924
|
-
*
|
|
925
|
-
* @example
|
|
926
|
-
* ```typescript
|
|
927
|
-
* import { getErrorCodeName } from '@kreuzberg/node';
|
|
928
|
-
*
|
|
929
|
-
* const name = getErrorCodeName(0); // returns "validation"
|
|
930
|
-
* const name = getErrorCodeName(2); // returns "ocr"
|
|
931
|
-
* const name = getErrorCodeName(99); // returns "unknown"
|
|
932
|
-
* ```
|
|
933
|
-
*/
|
|
934
|
-
declare function getErrorCodeName(code: number): string;
|
|
935
|
-
/**
|
|
936
|
-
* Returns the description for an error code.
|
|
937
|
-
*
|
|
938
|
-
* Retrieves user-friendly descriptions of error types from the FFI layer.
|
|
939
|
-
*
|
|
940
|
-
* @param code - The numeric error code (0-7)
|
|
941
|
-
* @returns A brief description of the error type
|
|
942
|
-
*
|
|
943
|
-
* @example
|
|
944
|
-
* ```typescript
|
|
945
|
-
* import { getErrorCodeDescription } from '@kreuzberg/node';
|
|
946
|
-
*
|
|
947
|
-
* const desc = getErrorCodeDescription(0); // returns "Input validation error"
|
|
948
|
-
* const desc = getErrorCodeDescription(4); // returns "File system I/O error"
|
|
949
|
-
* const desc = getErrorCodeDescription(99); // returns "Unknown error code"
|
|
950
|
-
* ```
|
|
951
|
-
*/
|
|
952
|
-
declare function getErrorCodeDescription(code: number): string;
|
|
953
|
-
/**
|
|
954
|
-
* Classifies an error message string into an error code category.
|
|
955
|
-
*
|
|
956
|
-
* This function analyzes the error message content and returns the most likely
|
|
957
|
-
* error code (0-7) based on keyword patterns. Used to programmatically classify
|
|
958
|
-
* errors for handling purposes.
|
|
959
|
-
*
|
|
960
|
-
* The classification is based on keyword matching:
|
|
961
|
-
* - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
|
|
962
|
-
* - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
|
|
963
|
-
* - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
|
|
964
|
-
* - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
|
|
965
|
-
* - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
|
|
966
|
-
* - **Plugin (5)**: Keywords like "plugin", "register", "extension"
|
|
967
|
-
* - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
|
|
968
|
-
* - **Internal (7)**: Keywords like "internal", "bug", "panic"
|
|
969
|
-
*
|
|
970
|
-
* @param errorMessage - The error message string to classify
|
|
971
|
-
* @returns An object with the classification details
|
|
972
|
-
*
|
|
973
|
-
* @example
|
|
974
|
-
* ```typescript
|
|
975
|
-
* import { classifyError } from '@kreuzberg/node';
|
|
976
|
-
*
|
|
977
|
-
* const result = classifyError("PDF file is corrupted");
|
|
978
|
-
* // Returns: { code: 1, name: "parsing", confidence: 0.95 }
|
|
979
|
-
*
|
|
980
|
-
* const result = classifyError("Tesseract not found");
|
|
981
|
-
* // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
|
|
982
|
-
* ```
|
|
983
|
-
*/
|
|
984
|
-
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
985
|
-
/**
|
|
986
|
-
* Create a worker pool for concurrent file extraction.
|
|
987
|
-
*
|
|
988
|
-
* The worker pool manages a set of background worker threads that can process
|
|
989
|
-
* extraction requests concurrently, improving throughput when handling multiple files.
|
|
990
|
-
*
|
|
991
|
-
* @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
|
|
992
|
-
* @returns A WorkerPool instance to use with extraction functions
|
|
993
|
-
*
|
|
994
|
-
* @throws {Error} If size is invalid or pool creation fails
|
|
995
|
-
*
|
|
996
|
-
* @example
|
|
997
|
-
* ```typescript
|
|
998
|
-
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
999
|
-
*
|
|
1000
|
-
* // Create pool with 4 workers
|
|
1001
|
-
* const pool = createWorkerPool(4);
|
|
1002
|
-
*
|
|
1003
|
-
* try {
|
|
1004
|
-
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1005
|
-
* console.log(result.content);
|
|
1006
|
-
* } finally {
|
|
1007
|
-
* // Always close the pool when done
|
|
1008
|
-
* await closeWorkerPool(pool);
|
|
1009
|
-
* }
|
|
1010
|
-
* ```
|
|
1133
|
+
* @internal Allows tests to provide a mocked native binding.
|
|
1011
1134
|
*/
|
|
1012
|
-
declare function
|
|
1135
|
+
declare function __setBindingForTests(mock: unknown): void;
|
|
1013
1136
|
/**
|
|
1014
|
-
*
|
|
1015
|
-
*
|
|
1016
|
-
* Returns information about the pool's current state, including the number of active workers,
|
|
1017
|
-
* queued tasks, and total processed tasks.
|
|
1018
|
-
*
|
|
1019
|
-
* @param pool - The worker pool instance
|
|
1020
|
-
* @returns WorkerPoolStats with pool information
|
|
1021
|
-
*
|
|
1022
|
-
* @example
|
|
1023
|
-
* ```typescript
|
|
1024
|
-
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1025
|
-
*
|
|
1026
|
-
* const pool = createWorkerPool(4);
|
|
1027
|
-
* const stats = getWorkerPoolStats(pool);
|
|
1028
|
-
*
|
|
1029
|
-
* console.log(`Pool size: ${stats.size}`);
|
|
1030
|
-
* console.log(`Active workers: ${stats.activeWorkers}`);
|
|
1031
|
-
* console.log(`Queued tasks: ${stats.queuedTasks}`);
|
|
1032
|
-
* ```
|
|
1137
|
+
* @internal Resets the cached native binding for tests.
|
|
1033
1138
|
*/
|
|
1034
|
-
declare function
|
|
1139
|
+
declare function __resetBindingForTests(): void;
|
|
1140
|
+
|
|
1035
1141
|
/**
|
|
1036
|
-
*
|
|
1037
|
-
*
|
|
1038
|
-
* Submits an extraction task to the worker pool. The task is executed by one of the
|
|
1039
|
-
* available workers in the background, allowing other tasks to be processed concurrently.
|
|
1142
|
+
* Kreuzberg - Multi-language document intelligence framework.
|
|
1040
1143
|
*
|
|
1041
|
-
*
|
|
1042
|
-
*
|
|
1043
|
-
*
|
|
1044
|
-
* @param maybeConfig - Optional extraction configuration (if second param is MIME type)
|
|
1045
|
-
* @returns Promise<ExtractionResult> containing extracted content and metadata
|
|
1144
|
+
* This is a TypeScript SDK around a high-performance Rust core.
|
|
1145
|
+
* All extraction logic, chunking, quality processing, and language detection
|
|
1146
|
+
* are implemented in Rust for maximum performance.
|
|
1046
1147
|
*
|
|
1047
|
-
*
|
|
1148
|
+
* ## Module Organization
|
|
1048
1149
|
*
|
|
1049
|
-
*
|
|
1050
|
-
*
|
|
1051
|
-
*
|
|
1150
|
+
* The SDK is organized into logical domains:
|
|
1151
|
+
* - **Extraction**: Single and batch document extraction with worker pool support
|
|
1152
|
+
* - **Types**: Core type definitions and interfaces
|
|
1153
|
+
* - **Errors**: Error classes and diagnostic utilities
|
|
1154
|
+
* - **Plugins**: Custom post-processors, validators, and OCR backends
|
|
1155
|
+
* - **Registry**: Plugin and document extractor management
|
|
1156
|
+
* - **Config**: Configuration loading and management
|
|
1157
|
+
* - **MIME**: MIME type detection and validation
|
|
1158
|
+
* - **Embeddings**: Embedding model presets
|
|
1052
1159
|
*
|
|
1053
|
-
*
|
|
1160
|
+
* ## API Usage Recommendations
|
|
1054
1161
|
*
|
|
1055
|
-
*
|
|
1056
|
-
*
|
|
1057
|
-
*
|
|
1058
|
-
*
|
|
1059
|
-
* );
|
|
1162
|
+
* **For processing multiple documents**, prefer batch APIs:
|
|
1163
|
+
* - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
|
|
1164
|
+
* - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
|
|
1165
|
+
* - Use worker pool APIs for high-concurrency scenarios
|
|
1060
1166
|
*
|
|
1061
|
-
*
|
|
1062
|
-
*
|
|
1063
|
-
*
|
|
1064
|
-
*
|
|
1065
|
-
* await closeWorkerPool(pool);
|
|
1066
|
-
* }
|
|
1067
|
-
* ```
|
|
1068
|
-
*/
|
|
1069
|
-
declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
1070
|
-
/**
|
|
1071
|
-
* Extract content from multiple files in parallel using a worker pool (asynchronous).
|
|
1167
|
+
* **Batch APIs provide**:
|
|
1168
|
+
* - Better performance (parallel processing in Rust)
|
|
1169
|
+
* - More reliable memory management
|
|
1170
|
+
* - Recommended for all multi-document workflows
|
|
1072
1171
|
*
|
|
1073
|
-
*
|
|
1074
|
-
*
|
|
1172
|
+
* **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
|
|
1173
|
+
* - One-off document processing
|
|
1174
|
+
* - Interactive applications processing documents on-demand
|
|
1175
|
+
* - Avoid calling these in tight loops - use batch APIs instead
|
|
1075
1176
|
*
|
|
1076
|
-
*
|
|
1077
|
-
* @param paths - Array of file paths to extract
|
|
1078
|
-
* @param config - Extraction configuration object (applies to all files)
|
|
1079
|
-
* @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
|
|
1177
|
+
* ## Supported Formats
|
|
1080
1178
|
*
|
|
1081
|
-
*
|
|
1179
|
+
* - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
|
|
1180
|
+
* - **Text**: Markdown, Plain Text, XML
|
|
1181
|
+
* - **Web**: HTML (converted to Markdown)
|
|
1182
|
+
* - **Data**: JSON, YAML, TOML
|
|
1183
|
+
* - **Email**: EML, MSG
|
|
1184
|
+
* - **Images**: PNG, JPEG, TIFF (with OCR support)
|
|
1082
1185
|
*
|
|
1083
1186
|
* @example
|
|
1084
1187
|
* ```typescript
|
|
1085
|
-
* import {
|
|
1086
|
-
*
|
|
1087
|
-
* const pool = createWorkerPool(4);
|
|
1188
|
+
* import { extractFile, batchExtractFiles } from '@kreuzberg/node';
|
|
1088
1189
|
*
|
|
1089
|
-
*
|
|
1090
|
-
*
|
|
1091
|
-
*
|
|
1092
|
-
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
1093
|
-
* });
|
|
1190
|
+
* // Single file extraction
|
|
1191
|
+
* const result = await extractFile('document.pdf');
|
|
1192
|
+
* console.log(result.content);
|
|
1094
1193
|
*
|
|
1095
|
-
*
|
|
1096
|
-
*
|
|
1097
|
-
*
|
|
1098
|
-
*
|
|
1099
|
-
* }
|
|
1194
|
+
* // Multiple files (recommended approach)
|
|
1195
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
1196
|
+
* const results = await batchExtractFiles(files);
|
|
1197
|
+
* results.forEach(r => console.log(r.content));
|
|
1100
1198
|
* ```
|
|
1101
|
-
*/
|
|
1102
|
-
declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
|
|
1103
|
-
/**
|
|
1104
|
-
* Close a worker pool and shut down all worker threads.
|
|
1105
|
-
*
|
|
1106
|
-
* Should be called when the pool is no longer needed to clean up resources
|
|
1107
|
-
* and gracefully shut down worker threads. Any pending tasks will be cancelled.
|
|
1108
|
-
*
|
|
1109
|
-
* @param pool - The worker pool instance to close
|
|
1110
|
-
* @returns Promise that resolves when the pool is fully closed
|
|
1111
|
-
*
|
|
1112
|
-
* @throws {Error} If pool shutdown fails
|
|
1113
|
-
*
|
|
1114
|
-
* @example
|
|
1115
|
-
* ```typescript
|
|
1116
|
-
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1117
|
-
*
|
|
1118
|
-
* const pool = createWorkerPool(4);
|
|
1119
1199
|
*
|
|
1120
|
-
*
|
|
1121
|
-
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1122
|
-
* console.log(result.content);
|
|
1123
|
-
* } finally {
|
|
1124
|
-
* // Clean up the pool
|
|
1125
|
-
* await closeWorkerPool(pool);
|
|
1126
|
-
* }
|
|
1127
|
-
* ```
|
|
1200
|
+
* @module @kreuzberg/node
|
|
1128
1201
|
*/
|
|
1129
|
-
declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
|
|
1130
|
-
declare const __version__ = "4.0.7";
|
|
1131
1202
|
|
|
1132
|
-
|
|
1203
|
+
declare const __version__ = "4.1.0";
|
|
1204
|
+
|
|
1205
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|