@kreuzberg/node 4.0.0-rc.8 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +321 -514
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +337 -62
- package/dist/index.d.ts +337 -62
- package/dist/index.js +285 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +277 -56
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +469 -54
- package/dist/types.d.ts +469 -54
- package/dist/types.js.map +1 -1
- package/index.d.ts +662 -1
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +17 -19
package/index.d.ts
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
/* auto-generated by NAPI-RS */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
+
/** Opaque handle to a worker pool */
|
|
4
|
+
export declare class JsWorkerPool {
|
|
5
|
+
|
|
6
|
+
}
|
|
7
|
+
|
|
3
8
|
/**
|
|
4
9
|
* Batch extract from multiple byte arrays (asynchronous).
|
|
5
10
|
*
|
|
@@ -96,6 +101,40 @@ export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes
|
|
|
96
101
|
*/
|
|
97
102
|
export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
|
|
98
103
|
|
|
104
|
+
/**
|
|
105
|
+
* Extract multiple files using worker threads from the pool.
|
|
106
|
+
*
|
|
107
|
+
* Submits multiple file extraction tasks to the worker pool for concurrent
|
|
108
|
+
* processing. Files are processed in parallel up to the pool size limit.
|
|
109
|
+
*
|
|
110
|
+
* # Parameters
|
|
111
|
+
*
|
|
112
|
+
* * `pool` - Worker pool handle
|
|
113
|
+
* * `file_paths` - Array of file paths to extract
|
|
114
|
+
* * `config` - Optional extraction configuration applied to all files
|
|
115
|
+
*
|
|
116
|
+
* # Returns
|
|
117
|
+
*
|
|
118
|
+
* Promise resolving to an array of extraction results in the same order as the input paths.
|
|
119
|
+
*
|
|
120
|
+
* # Example
|
|
121
|
+
*
|
|
122
|
+
* ```typescript
|
|
123
|
+
* import { createWorkerPool, batchExtractFilesInWorker } from '@kreuzberg/node';
|
|
124
|
+
*
|
|
125
|
+
* const pool = createWorkerPool(4);
|
|
126
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
127
|
+
* const results = await batchExtractFilesInWorker(pool, files, {
|
|
128
|
+
* useCache: true
|
|
129
|
+
* });
|
|
130
|
+
*
|
|
131
|
+
* results.forEach((result, i) => {
|
|
132
|
+
* console.log(`File ${i + 1}: ${result.content.length} chars`);
|
|
133
|
+
* });
|
|
134
|
+
* ```
|
|
135
|
+
*/
|
|
136
|
+
export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
|
|
137
|
+
|
|
99
138
|
/**
|
|
100
139
|
* Batch extract from multiple files (synchronous).
|
|
101
140
|
*
|
|
@@ -125,6 +164,8 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
|
|
|
125
164
|
*/
|
|
126
165
|
export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
|
|
127
166
|
|
|
167
|
+
export declare function classifyError(errorMessage: string): ErrorClassification
|
|
168
|
+
|
|
128
169
|
/**
|
|
129
170
|
* Clear all registered document extractors.
|
|
130
171
|
*
|
|
@@ -165,6 +206,107 @@ export declare function clearPostProcessors(): void
|
|
|
165
206
|
/** Clear all registered validators */
|
|
166
207
|
export declare function clearValidators(): void
|
|
167
208
|
|
|
209
|
+
/**
|
|
210
|
+
* Close and shutdown a worker pool gracefully.
|
|
211
|
+
*
|
|
212
|
+
* Waits for all in-flight extraction tasks to complete before shutting down
|
|
213
|
+
* the pool. After calling this function, the pool handle becomes invalid.
|
|
214
|
+
*
|
|
215
|
+
* # Parameters
|
|
216
|
+
*
|
|
217
|
+
* * `pool` - Worker pool handle
|
|
218
|
+
*
|
|
219
|
+
* # Returns
|
|
220
|
+
*
|
|
221
|
+
* Promise that resolves when all workers have completed and the pool is closed.
|
|
222
|
+
*
|
|
223
|
+
* # Example
|
|
224
|
+
*
|
|
225
|
+
* ```typescript
|
|
226
|
+
* import { createWorkerPool, closeWorkerPool } from '@kreuzberg/node';
|
|
227
|
+
*
|
|
228
|
+
* const pool = createWorkerPool(4);
|
|
229
|
+
* // ... use pool for extractions ...
|
|
230
|
+
* await closeWorkerPool(pool); // Wait for completion and cleanup
|
|
231
|
+
* ```
|
|
232
|
+
*/
|
|
233
|
+
export declare function closeWorkerPool(pool: JsWorkerPool): Promise<void>
|
|
234
|
+
|
|
235
|
+
/**
|
|
236
|
+
* Get a specific field from config (represented as JSON string) by name via FFI.
|
|
237
|
+
*
|
|
238
|
+
* Retrieves a configuration field by path, supporting nested access with
|
|
239
|
+
* dot notation (e.g., "ocr.backend"). Returns the field value as a JSON string.
|
|
240
|
+
*
|
|
241
|
+
* # Arguments
|
|
242
|
+
*
|
|
243
|
+
* * `json_str` - A JSON string representation of the configuration
|
|
244
|
+
* * `field_name` - The field path to retrieve (e.g., "useCache", "ocr.backend")
|
|
245
|
+
*
|
|
246
|
+
* # Returns
|
|
247
|
+
*
|
|
248
|
+
* The field value as a JSON string, or null if not found
|
|
249
|
+
*/
|
|
250
|
+
export declare function configGetFieldInternal(jsonStr: string, fieldName: string): string | null
|
|
251
|
+
|
|
252
|
+
/**
|
|
253
|
+
* Merge two configs (override takes precedence over base) via FFI.
|
|
254
|
+
*
|
|
255
|
+
* Performs a shallow merge where fields from the override config take
|
|
256
|
+
* precedence over fields in the base config.
|
|
257
|
+
*
|
|
258
|
+
* # Arguments
|
|
259
|
+
*
|
|
260
|
+
* * `base_json` - A JSON string representation of the base ExtractionConfig
|
|
261
|
+
* * `override_json` - A JSON string representation of the override ExtractionConfig
|
|
262
|
+
*
|
|
263
|
+
* # Returns
|
|
264
|
+
*
|
|
265
|
+
* The merged configuration as a JSON string, or error
|
|
266
|
+
*/
|
|
267
|
+
export declare function configMergeInternal(baseJson: string, overrideJson: string): string
|
|
268
|
+
|
|
269
|
+
/**
|
|
270
|
+
* Validate and normalize an ExtractionConfig JSON string via FFI.
|
|
271
|
+
*
|
|
272
|
+
* This validates the JSON and returns a normalized version, using the shared
|
|
273
|
+
* FFI layer to ensure consistent validation across all language bindings.
|
|
274
|
+
*
|
|
275
|
+
* # Arguments
|
|
276
|
+
*
|
|
277
|
+
* * `json_str` - A JSON string containing the configuration
|
|
278
|
+
*
|
|
279
|
+
* # Returns
|
|
280
|
+
*
|
|
281
|
+
* The normalized JSON string representation of the config, or error
|
|
282
|
+
*/
|
|
283
|
+
export declare function configValidateAndNormalize(jsonStr: string): string
|
|
284
|
+
|
|
285
|
+
/**
|
|
286
|
+
* Create a new worker pool for concurrent extraction operations.
|
|
287
|
+
*
|
|
288
|
+
* Creates a pool of worker threads for CPU-bound document extraction.
|
|
289
|
+
* Tasks submitted to the pool will be executed concurrently up to the pool size.
|
|
290
|
+
*
|
|
291
|
+
* # Parameters
|
|
292
|
+
*
|
|
293
|
+
* * `size` - Number of concurrent workers (defaults to CPU count)
|
|
294
|
+
*
|
|
295
|
+
* # Returns
|
|
296
|
+
*
|
|
297
|
+
* Worker pool handle that can be used with extraction functions.
|
|
298
|
+
*
|
|
299
|
+
* # Example
|
|
300
|
+
*
|
|
301
|
+
* ```typescript
|
|
302
|
+
* import { createWorkerPool } from '@kreuzberg/node';
|
|
303
|
+
*
|
|
304
|
+
* const pool = createWorkerPool(4); // 4 concurrent workers
|
|
305
|
+
* // The returned handle is opaque: pass it to extractFileInWorker, getWorkerPoolStats, or closeWorkerPool.
|
|
306
|
+
* ```
|
|
307
|
+
*/
|
|
308
|
+
export declare function createWorkerPool(size?: number | undefined | null): JsWorkerPool
|
|
309
|
+
|
|
168
310
|
/**
|
|
169
311
|
* Detect MIME type from raw bytes.
|
|
170
312
|
*
|
|
@@ -198,7 +340,7 @@ export declare function clearValidators(): void
|
|
|
198
340
|
* console.log(mimeType); // 'application/pdf'
|
|
199
341
|
* ```
|
|
200
342
|
*/
|
|
201
|
-
export declare function
|
|
343
|
+
export declare function detectMimeTypeFromBytes(bytes: Buffer): string
|
|
202
344
|
|
|
203
345
|
/**
|
|
204
346
|
* Detect MIME type from a file path.
|
|
@@ -285,6 +427,53 @@ export interface EmbeddingPreset {
|
|
|
285
427
|
description: string
|
|
286
428
|
}
|
|
287
429
|
|
|
430
|
+
/**
|
|
431
|
+
* Classifies an error message string into an error code category.
|
|
432
|
+
*
|
|
433
|
+
* This function analyzes the error message content and returns the most likely
|
|
434
|
+
* error code (0-7) based on keyword patterns. Used to programmatically classify
|
|
435
|
+
* errors for handling purposes.
|
|
436
|
+
*
|
|
437
|
+
* # Arguments
|
|
438
|
+
*
|
|
439
|
+
* * `error_message` - The error message string to classify
|
|
440
|
+
*
|
|
441
|
+
* # Returns
|
|
442
|
+
*
|
|
443
|
+
* An object with:
|
|
444
|
+
* - `code`: The numeric error code (0-7)
|
|
445
|
+
* - `name`: The error code name string
|
|
446
|
+
* - `description`: Brief description of the error type
|
|
447
|
+
* - `confidence`: Confidence score (0.0-1.0) of the classification
|
|
448
|
+
*
|
|
449
|
+
* # Classification Rules
|
|
450
|
+
*
|
|
451
|
+
* - **Validation (0)**: Keywords: invalid, validation, invalid_argument, schema, required, unexpected field
|
|
452
|
+
* - **Parsing (1)**: Keywords: parsing, parse_error, corrupted, malformed, invalid format, decode, encoding
|
|
453
|
+
* - **Ocr (2)**: Keywords: ocr, optical, character, recognition, tesseract, language, model
|
|
454
|
+
* - **MissingDependency (3)**: Keywords: not found, not installed, missing, dependency, require, unavailable
|
|
455
|
+
* - **Io (4)**: Keywords: io, file, disk, read, write, permission, access, path
|
|
456
|
+
* - **Plugin (5)**: Keywords: plugin, register, extension, handler, processor
|
|
457
|
+
* - **UnsupportedFormat (6)**: Keywords: unsupported, format, mime, type, codec
|
|
458
|
+
* - **Internal (7)**: Keywords: internal, bug, panic, unexpected, invariant
|
|
459
|
+
*
|
|
460
|
+
* # Examples
|
|
461
|
+
*
|
|
462
|
+
* ```typescript
|
|
463
|
+
* const result = classifyError("PDF file is corrupted");
|
|
464
|
+
* // Returns: { code: 1, name: "parsing", description: "...", confidence: 0.95 }
|
|
465
|
+
*
|
|
466
|
+
* const result = classifyError("Tesseract not found");
|
|
467
|
+
* // Returns: { code: 3, name: "missing_dependency", description: "...", confidence: 0.9 }
|
|
468
|
+
* ```
|
|
469
|
+
*/
|
|
470
|
+
export interface ErrorClassification {
|
|
471
|
+
code: number
|
|
472
|
+
name: string
|
|
473
|
+
description: string
|
|
474
|
+
confidence: number
|
|
475
|
+
}
|
|
476
|
+
|
|
288
477
|
/**
|
|
289
478
|
* Extract content from bytes (asynchronous).
|
|
290
479
|
*
|
|
@@ -383,6 +572,38 @@ export declare function extractBytesSync(data: Buffer, mimeType: string, config?
|
|
|
383
572
|
*/
|
|
384
573
|
export declare function extractFile(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
|
|
385
574
|
|
|
575
|
+
/**
|
|
576
|
+
* Extract a file using a worker thread from the pool.
|
|
577
|
+
*
|
|
578
|
+
* Submits a file extraction task to the worker pool. The task will execute
|
|
579
|
+
* when a worker thread becomes available. This is useful for CPU-bound
|
|
580
|
+
* extraction operations that need to be run concurrently.
|
|
581
|
+
*
|
|
582
|
+
* # Parameters
|
|
583
|
+
*
|
|
584
|
+
* * `pool` - Worker pool handle
|
|
585
|
+
* * `file_path` - Path to the file to extract
|
|
586
|
+
* * `password` - Optional password for encrypted files
|
|
587
|
+
* * `config` - Optional extraction configuration
|
|
588
|
+
*
|
|
589
|
+
* # Returns
|
|
590
|
+
*
|
|
591
|
+
* Promise resolving to extraction result.
|
|
592
|
+
*
|
|
593
|
+
* # Example
|
|
594
|
+
*
|
|
595
|
+
* ```typescript
|
|
596
|
+
* import { createWorkerPool, extractFileInWorker } from '@kreuzberg/node';
|
|
597
|
+
*
|
|
598
|
+
* const pool = createWorkerPool(4);
|
|
599
|
+
* const result = await extractFileInWorker(pool, 'document.pdf', null, {
|
|
600
|
+
* useCache: true
|
|
601
|
+
* });
|
|
602
|
+
* console.log(result.content);
|
|
603
|
+
* ```
|
|
604
|
+
*/
|
|
605
|
+
export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, password?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
|
|
606
|
+
|
|
386
607
|
/**
|
|
387
608
|
* Extract content from a file (synchronous).
|
|
388
609
|
*
|
|
@@ -473,6 +694,52 @@ export declare function extractFileSync(filePath: string, mimeType?: string | un
|
|
|
473
694
|
*/
|
|
474
695
|
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
|
|
475
696
|
|
|
697
|
+
/**
|
|
698
|
+
* Returns the description for an error code.
|
|
699
|
+
*
|
|
700
|
+
* Maps to FFI function kreuzberg_error_code_description().
|
|
701
|
+
*
|
|
702
|
+
* # Arguments
|
|
703
|
+
*
|
|
704
|
+
* * `code` - Numeric error code (0-7)
|
|
705
|
+
*
|
|
706
|
+
* # Returns
|
|
707
|
+
*
|
|
708
|
+
* A string containing a brief description of the error
|
|
709
|
+
*
|
|
710
|
+
* # Examples
|
|
711
|
+
*
|
|
712
|
+
* ```typescript
|
|
713
|
+
* const desc = getErrorCodeDescription(0); // returns "Input validation error"
|
|
714
|
+
* const desc = getErrorCodeDescription(4); // returns "File system I/O error"
|
|
715
|
+
* const desc = getErrorCodeDescription(99); // returns "Unknown error code"
|
|
716
|
+
* ```
|
|
717
|
+
*/
|
|
718
|
+
export declare function getErrorCodeDescription(code: number): string
|
|
719
|
+
|
|
720
|
+
/**
|
|
721
|
+
* Returns the human-readable name for an error code.
|
|
722
|
+
*
|
|
723
|
+
* Maps to FFI function kreuzberg_error_code_name().
|
|
724
|
+
*
|
|
725
|
+
* # Arguments
|
|
726
|
+
*
|
|
727
|
+
* * `code` - Numeric error code (0-7)
|
|
728
|
+
*
|
|
729
|
+
* # Returns
|
|
730
|
+
*
|
|
731
|
+
* A string containing the error code name (e.g., "validation", "ocr", "unknown")
|
|
732
|
+
*
|
|
733
|
+
* # Examples
|
|
734
|
+
*
|
|
735
|
+
* ```typescript
|
|
736
|
+
* const name = getErrorCodeName(0); // returns "validation"
|
|
737
|
+
* const name = getErrorCodeName(2); // returns "ocr"
|
|
738
|
+
* const name = getErrorCodeName(99); // returns "unknown"
|
|
739
|
+
* ```
|
|
740
|
+
*/
|
|
741
|
+
export declare function getErrorCodeName(code: number): string
|
|
742
|
+
|
|
476
743
|
/**
|
|
477
744
|
* Get file extensions for a given MIME type.
|
|
478
745
|
*
|
|
@@ -580,6 +847,112 @@ export declare function getLastErrorCode(): number
|
|
|
580
847
|
*/
|
|
581
848
|
export declare function getLastPanicContext(): any | null
|
|
582
849
|
|
|
850
|
+
/**
|
|
851
|
+
* Get valid binarization methods.
|
|
852
|
+
*
|
|
853
|
+
* Returns a list of all valid binarization method values.
|
|
854
|
+
*
|
|
855
|
+
* # Returns
|
|
856
|
+
*
|
|
857
|
+
* Array of valid binarization methods: ["otsu", "adaptive", "sauvola"]
|
|
858
|
+
*
|
|
859
|
+
* # Example
|
|
860
|
+
*
|
|
861
|
+
* ```typescript
|
|
862
|
+
* import { getValidBinarizationMethods } from '@kreuzberg/node';
|
|
863
|
+
*
|
|
864
|
+
* const methods = getValidBinarizationMethods();
|
|
865
|
+
* console.log(methods); // ['otsu', 'adaptive', 'sauvola']
|
|
866
|
+
* ```
|
|
867
|
+
*/
|
|
868
|
+
export declare function getValidBinarizationMethods(): Array<string>
|
|
869
|
+
|
|
870
|
+
/**
|
|
871
|
+
* Get valid language codes.
|
|
872
|
+
*
|
|
873
|
+
* Returns a list of all valid language codes in ISO 639-1 and 639-3 formats.
|
|
874
|
+
*
|
|
875
|
+
* # Returns
|
|
876
|
+
*
|
|
877
|
+
* Array of valid language codes (both 2-letter and 3-letter codes)
|
|
878
|
+
*
|
|
879
|
+
* # Example
|
|
880
|
+
*
|
|
881
|
+
* ```typescript
|
|
882
|
+
* import { getValidLanguageCodes } from '@kreuzberg/node';
|
|
883
|
+
*
|
|
884
|
+
* const codes = getValidLanguageCodes();
|
|
885
|
+
* console.log(codes); // ['en', 'de', 'fr', ..., 'eng', 'deu', 'fra', ...]
|
|
886
|
+
* ```
|
|
887
|
+
*/
|
|
888
|
+
export declare function getValidLanguageCodes(): Array<string>
|
|
889
|
+
|
|
890
|
+
/**
|
|
891
|
+
* Get valid OCR backends.
|
|
892
|
+
*
|
|
893
|
+
* Returns a list of all valid OCR backend values.
|
|
894
|
+
*
|
|
895
|
+
* # Returns
|
|
896
|
+
*
|
|
897
|
+
* Array of valid OCR backends: ["tesseract", "easyocr", "paddleocr"]
|
|
898
|
+
*
|
|
899
|
+
* # Example
|
|
900
|
+
*
|
|
901
|
+
* ```typescript
|
|
902
|
+
* import { getValidOcrBackends } from '@kreuzberg/node';
|
|
903
|
+
*
|
|
904
|
+
* const backends = getValidOcrBackends();
|
|
905
|
+
* console.log(backends); // ['tesseract', 'easyocr', 'paddleocr']
|
|
906
|
+
* ```
|
|
907
|
+
*/
|
|
908
|
+
export declare function getValidOcrBackends(): Array<string>
|
|
909
|
+
|
|
910
|
+
/**
|
|
911
|
+
* Get valid token reduction levels.
|
|
912
|
+
*
|
|
913
|
+
* Returns a list of all valid token reduction level values.
|
|
914
|
+
*
|
|
915
|
+
* # Returns
|
|
916
|
+
*
|
|
917
|
+
* Array of valid levels: ["off", "light", "moderate", "aggressive", "maximum"]
|
|
918
|
+
*
|
|
919
|
+
* # Example
|
|
920
|
+
*
|
|
921
|
+
* ```typescript
|
|
922
|
+
* import { getValidTokenReductionLevels } from '@kreuzberg/node';
|
|
923
|
+
*
|
|
924
|
+
* const levels = getValidTokenReductionLevels();
|
|
925
|
+
* console.log(levels); // ['off', 'light', 'moderate', 'aggressive', 'maximum']
|
|
926
|
+
* ```
|
|
927
|
+
*/
|
|
928
|
+
export declare function getValidTokenReductionLevels(): Array<string>
|
|
929
|
+
|
|
930
|
+
/**
|
|
931
|
+
* Get worker pool statistics.
|
|
932
|
+
*
|
|
933
|
+
* Returns current statistics about the worker pool including size,
|
|
934
|
+
* active workers, and queued tasks.
|
|
935
|
+
*
|
|
936
|
+
* # Parameters
|
|
937
|
+
*
|
|
938
|
+
* * `pool` - Worker pool handle
|
|
939
|
+
*
|
|
940
|
+
* # Returns
|
|
941
|
+
*
|
|
942
|
+
* Pool statistics object with size, activeWorkers, and queuedTasks fields.
|
|
943
|
+
*
|
|
944
|
+
* # Example
|
|
945
|
+
*
|
|
946
|
+
* ```typescript
|
|
947
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
948
|
+
*
|
|
949
|
+
* const pool = createWorkerPool(4);
|
|
950
|
+
* const stats = getWorkerPoolStats(pool);
|
|
951
|
+
* console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
|
|
952
|
+
* ```
|
|
953
|
+
*/
|
|
954
|
+
export declare function getWorkerPoolStats(pool: JsWorkerPool): WorkerPoolStats
|
|
955
|
+
|
|
583
956
|
export interface JsChunk {
|
|
584
957
|
content: string
|
|
585
958
|
embedding?: number[] | undefined
|
|
@@ -675,6 +1048,21 @@ export interface JsExtractionResult {
|
|
|
675
1048
|
detectedLanguages?: Array<string>
|
|
676
1049
|
chunks?: Array<JsChunk>
|
|
677
1050
|
images?: Array<JsExtractedImage>
|
|
1051
|
+
pages?: Array<JsPageContent>
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
export interface JsHierarchicalBlock {
|
|
1055
|
+
text: string
|
|
1056
|
+
fontSize: number
|
|
1057
|
+
level: string
|
|
1058
|
+
bbox?: [number, number, number, number] | undefined
|
|
1059
|
+
}
|
|
1060
|
+
|
|
1061
|
+
export interface JsHierarchyConfig {
|
|
1062
|
+
enabled?: boolean
|
|
1063
|
+
kClusters?: number
|
|
1064
|
+
includeBbox?: boolean
|
|
1065
|
+
ocrCoverageThreshold?: number
|
|
678
1066
|
}
|
|
679
1067
|
|
|
680
1068
|
export interface JsHtmlOptions {
|
|
@@ -755,10 +1143,24 @@ export interface JsPageConfig {
|
|
|
755
1143
|
markerFormat?: string
|
|
756
1144
|
}
|
|
757
1145
|
|
|
1146
|
+
export interface JsPageContent {
|
|
1147
|
+
pageNumber: number
|
|
1148
|
+
content: string
|
|
1149
|
+
tables: Array<JsTable>
|
|
1150
|
+
images: Array<JsExtractedImage>
|
|
1151
|
+
hierarchy?: JsPageHierarchy
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
export interface JsPageHierarchy {
|
|
1155
|
+
blockCount: number
|
|
1156
|
+
blocks: Array<JsHierarchicalBlock>
|
|
1157
|
+
}
|
|
1158
|
+
|
|
758
1159
|
export interface JsPdfConfig {
|
|
759
1160
|
extractImages?: boolean
|
|
760
1161
|
passwords?: Array<string>
|
|
761
1162
|
extractMetadata?: boolean
|
|
1163
|
+
hierarchy?: JsHierarchyConfig
|
|
762
1164
|
}
|
|
763
1165
|
|
|
764
1166
|
export interface JsPostProcessorConfig {
|
|
@@ -1076,6 +1478,134 @@ export declare function unregisterPostProcessor(name: string): void
|
|
|
1076
1478
|
/** Unregister a validator by name */
|
|
1077
1479
|
export declare function unregisterValidator(name: string): void
|
|
1078
1480
|
|
|
1481
|
+
/**
|
|
1482
|
+
* Validates a binarization method string.
|
|
1483
|
+
*
|
|
1484
|
+
* Valid methods: "otsu", "adaptive", "sauvola"
|
|
1485
|
+
*
|
|
1486
|
+
* # Arguments
|
|
1487
|
+
*
|
|
1488
|
+
* * `method` - The binarization method to validate
|
|
1489
|
+
*
|
|
1490
|
+
* # Returns
|
|
1491
|
+
*
|
|
1492
|
+
* `true` if valid, `false` if invalid.
|
|
1493
|
+
*
|
|
1494
|
+
* # Example
|
|
1495
|
+
*
|
|
1496
|
+
* ```typescript
|
|
1497
|
+
* import { validateBinarizationMethod } from '@kreuzberg/node';
|
|
1498
|
+
*
|
|
1499
|
+
* if (validateBinarizationMethod('otsu')) {
|
|
1500
|
+
* console.log('Valid method');
|
|
1501
|
+
* } else {
|
|
1502
|
+
* console.log('Invalid method');
|
|
1503
|
+
* }
|
|
1504
|
+
* ```
|
|
1505
|
+
*/
|
|
1506
|
+
export declare function validateBinarizationMethod(method: string): boolean
|
|
1507
|
+
|
|
1508
|
+
/**
|
|
1509
|
+
* Validates chunking parameters.
|
|
1510
|
+
*
|
|
1511
|
+
* Checks that `maxChars > 0` and `maxOverlap < maxChars`.
|
|
1512
|
+
*
|
|
1513
|
+
* # Arguments
|
|
1514
|
+
*
|
|
1515
|
+
* * `max_chars` - Maximum characters per chunk
|
|
1516
|
+
* * `max_overlap` - Maximum overlap between chunks
|
|
1517
|
+
*
|
|
1518
|
+
* # Returns
|
|
1519
|
+
*
|
|
1520
|
+
* `true` if valid, `false` if invalid.
|
|
1521
|
+
*
|
|
1522
|
+
* # Example
|
|
1523
|
+
*
|
|
1524
|
+
* ```typescript
|
|
1525
|
+
* import { validateChunkingParams } from '@kreuzberg/node';
|
|
1526
|
+
*
|
|
1527
|
+
* if (validateChunkingParams(1000, 200)) {
|
|
1528
|
+
* console.log('Valid chunking parameters');
|
|
1529
|
+
* }
|
|
1530
|
+
* ```
|
|
1531
|
+
*/
|
|
1532
|
+
export declare function validateChunkingParams(maxChars: number, maxOverlap: number): boolean
|
|
1533
|
+
|
|
1534
|
+
/**
|
|
1535
|
+
* Validates a confidence threshold value.
|
|
1536
|
+
*
|
|
1537
|
+
* Valid range: 0.0 to 1.0 (inclusive)
|
|
1538
|
+
*
|
|
1539
|
+
* # Arguments
|
|
1540
|
+
*
|
|
1541
|
+
* * `confidence` - The confidence threshold to validate
|
|
1542
|
+
*
|
|
1543
|
+
* # Returns
|
|
1544
|
+
*
|
|
1545
|
+
* `true` if valid, `false` if invalid.
|
|
1546
|
+
*
|
|
1547
|
+
* # Example
|
|
1548
|
+
*
|
|
1549
|
+
* ```typescript
|
|
1550
|
+
* import { validateConfidence } from '@kreuzberg/node';
|
|
1551
|
+
*
|
|
1552
|
+
* if (validateConfidence(0.75)) {
|
|
1553
|
+
* console.log('Valid confidence threshold');
|
|
1554
|
+
* }
|
|
1555
|
+
* ```
|
|
1556
|
+
*/
|
|
1557
|
+
export declare function validateConfidence(confidence: number): boolean
|
|
1558
|
+
|
|
1559
|
+
/**
|
|
1560
|
+
* Validates a DPI (dots per inch) value.
|
|
1561
|
+
*
|
|
1562
|
+
* Valid range: 1-2400
|
|
1563
|
+
*
|
|
1564
|
+
* # Arguments
|
|
1565
|
+
*
|
|
1566
|
+
* * `dpi` - The DPI value to validate
|
|
1567
|
+
*
|
|
1568
|
+
* # Returns
|
|
1569
|
+
*
|
|
1570
|
+
* `true` if valid, `false` if invalid.
|
|
1571
|
+
*
|
|
1572
|
+
* # Example
|
|
1573
|
+
*
|
|
1574
|
+
* ```typescript
|
|
1575
|
+
* import { validateDpi } from '@kreuzberg/node';
|
|
1576
|
+
*
|
|
1577
|
+
* if (validateDpi(300)) {
|
|
1578
|
+
* console.log('Valid DPI');
|
|
1579
|
+
* }
|
|
1580
|
+
* ```
|
|
1581
|
+
*/
|
|
1582
|
+
export declare function validateDpi(dpi: number): boolean
|
|
1583
|
+
|
|
1584
|
+
/**
|
|
1585
|
+
* Validates a language code (ISO 639-1 or 639-3 format).
|
|
1586
|
+
*
|
|
1587
|
+
* Accepts both 2-letter codes (e.g., "en", "de") and 3-letter codes (e.g., "eng", "deu").
|
|
1588
|
+
*
|
|
1589
|
+
* # Arguments
|
|
1590
|
+
*
|
|
1591
|
+
* * `code` - The language code to validate
|
|
1592
|
+
*
|
|
1593
|
+
* # Returns
|
|
1594
|
+
*
|
|
1595
|
+
* `true` if valid, `false` if invalid.
|
|
1596
|
+
*
|
|
1597
|
+
* # Example
|
|
1598
|
+
*
|
|
1599
|
+
* ```typescript
|
|
1600
|
+
* import { validateLanguageCode } from '@kreuzberg/node';
|
|
1601
|
+
*
|
|
1602
|
+
* if (validateLanguageCode('en')) {
|
|
1603
|
+
* console.log('Valid language code');
|
|
1604
|
+
* }
|
|
1605
|
+
* ```
|
|
1606
|
+
*/
|
|
1607
|
+
export declare function validateLanguageCode(code: string): boolean
|
|
1608
|
+
|
|
1079
1609
|
/**
|
|
1080
1610
|
* Validate that a MIME type is supported by Kreuzberg.
|
|
1081
1611
|
*
|
|
@@ -1116,3 +1646,134 @@ export declare function unregisterValidator(name: string): void
|
|
|
1116
1646
|
* ```
|
|
1117
1647
|
*/
|
|
1118
1648
|
export declare function validateMimeType(mimeType: string): string
|
|
1649
|
+
|
|
1650
|
+
/**
|
|
1651
|
+
* Validates an OCR backend string.
|
|
1652
|
+
*
|
|
1653
|
+
* Valid backends: "tesseract", "easyocr", "paddleocr"
|
|
1654
|
+
*
|
|
1655
|
+
* # Arguments
|
|
1656
|
+
*
|
|
1657
|
+
* * `backend` - The OCR backend to validate
|
|
1658
|
+
*
|
|
1659
|
+
* # Returns
|
|
1660
|
+
*
|
|
1661
|
+
* `true` if valid, `false` if invalid.
|
|
1662
|
+
*
|
|
1663
|
+
* # Example
|
|
1664
|
+
*
|
|
1665
|
+
* ```typescript
|
|
1666
|
+
* import { validateOcrBackend } from '@kreuzberg/node';
|
|
1667
|
+
*
|
|
1668
|
+
* if (validateOcrBackend('tesseract')) {
|
|
1669
|
+
* console.log('Valid backend');
|
|
1670
|
+
* }
|
|
1671
|
+
* ```
|
|
1672
|
+
*/
|
|
1673
|
+
export declare function validateOcrBackend(backend: string): boolean
|
|
1674
|
+
|
|
1675
|
+
/**
|
|
1676
|
+
* Validates a Tesseract output format string.
|
|
1677
|
+
*
|
|
1678
|
+
* Valid formats: "text", "markdown"
|
|
1679
|
+
*
|
|
1680
|
+
* # Arguments
|
|
1681
|
+
*
|
|
1682
|
+
* * `format` - The output format to validate
|
|
1683
|
+
*
|
|
1684
|
+
* # Returns
|
|
1685
|
+
*
|
|
1686
|
+
* `true` if valid, `false` if invalid.
|
|
1687
|
+
*
|
|
1688
|
+
* # Example
|
|
1689
|
+
*
|
|
1690
|
+
* ```typescript
|
|
1691
|
+
* import { validateOutputFormat } from '@kreuzberg/node';
|
|
1692
|
+
*
|
|
1693
|
+
* if (validateOutputFormat('markdown')) {
|
|
1694
|
+
* console.log('Valid output format');
|
|
1695
|
+
* }
|
|
1696
|
+
* ```
|
|
1697
|
+
*/
|
|
1698
|
+
export declare function validateOutputFormat(format: string): boolean
|
|
1699
|
+
|
|
1700
|
+
/**
|
|
1701
|
+
* Validates a Tesseract OCR Engine Mode (OEM) value.
|
|
1702
|
+
*
|
|
1703
|
+
* Valid range: 0-3
|
|
1704
|
+
*
|
|
1705
|
+
* # Arguments
|
|
1706
|
+
*
|
|
1707
|
+
* * `oem` - The OEM value to validate
|
|
1708
|
+
*
|
|
1709
|
+
* # Returns
|
|
1710
|
+
*
|
|
1711
|
+
* `true` if valid (0-3), `false` otherwise.
|
|
1712
|
+
*
|
|
1713
|
+
* # Example
|
|
1714
|
+
*
|
|
1715
|
+
* ```typescript
|
|
1716
|
+
* import { validateTesseractOem } from '@kreuzberg/node';
|
|
1717
|
+
*
|
|
1718
|
+
* if (validateTesseractOem(1)) {
|
|
1719
|
+
* console.log('Valid OEM');
|
|
1720
|
+
* }
|
|
1721
|
+
* ```
|
|
1722
|
+
*/
|
|
1723
|
+
export declare function validateTesseractOem(oem: number): boolean
|
|
1724
|
+
|
|
1725
|
+
/**
|
|
1726
|
+
* Validates a Tesseract Page Segmentation Mode (PSM) value.
|
|
1727
|
+
*
|
|
1728
|
+
* Valid range: 0-13
|
|
1729
|
+
*
|
|
1730
|
+
* # Arguments
|
|
1731
|
+
*
|
|
1732
|
+
* * `psm` - The PSM value to validate
|
|
1733
|
+
*
|
|
1734
|
+
* # Returns
|
|
1735
|
+
*
|
|
1736
|
+
* `true` if valid (0-13), `false` otherwise.
|
|
1737
|
+
*
|
|
1738
|
+
* # Example
|
|
1739
|
+
*
|
|
1740
|
+
* ```typescript
|
|
1741
|
+
* import { validateTesseractPsm } from '@kreuzberg/node';
|
|
1742
|
+
*
|
|
1743
|
+
* if (validateTesseractPsm(3)) {
|
|
1744
|
+
* console.log('Valid PSM');
|
|
1745
|
+
* }
|
|
1746
|
+
* ```
|
|
1747
|
+
*/
|
|
1748
|
+
export declare function validateTesseractPsm(psm: number): boolean
|
|
1749
|
+
|
|
1750
|
+
/**
|
|
1751
|
+
* Validates a token reduction level string.
|
|
1752
|
+
*
|
|
1753
|
+
* Valid levels: "off", "light", "moderate", "aggressive", "maximum"
|
|
1754
|
+
*
|
|
1755
|
+
* # Arguments
|
|
1756
|
+
*
|
|
1757
|
+
* * `level` - The token reduction level to validate
|
|
1758
|
+
*
|
|
1759
|
+
* # Returns
|
|
1760
|
+
*
|
|
1761
|
+
* `true` if valid, `false` if invalid.
|
|
1762
|
+
*
|
|
1763
|
+
* # Example
|
|
1764
|
+
*
|
|
1765
|
+
* ```typescript
|
|
1766
|
+
* import { validateTokenReductionLevel } from '@kreuzberg/node';
|
|
1767
|
+
*
|
|
1768
|
+
* if (validateTokenReductionLevel('moderate')) {
|
|
1769
|
+
* console.log('Valid token reduction level');
|
|
1770
|
+
* }
|
|
1771
|
+
* ```
|
|
1772
|
+
*/
|
|
1773
|
+
export declare function validateTokenReductionLevel(level: string): boolean
|
|
1774
|
+
|
|
1775
|
+
export interface WorkerPoolStats {
|
|
1776
|
+
size: number
|
|
1777
|
+
activeWorkers: number
|
|
1778
|
+
queuedTasks: number
|
|
1779
|
+
}
|