@kreuzberg/node 4.0.0-rc.22 → 4.0.0-rc.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -534
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +158 -91
- package/dist/index.d.ts +158 -91
- package/dist/index.js +77 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +72 -103
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +92 -3
- package/dist/types.d.ts +92 -3
- package/dist/types.js.map +1 -1
- package/index.d.ts +183 -0
- package/index.js +64 -54
- package/package.json +5 -6
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { PanicContext } from './errors.js';
|
|
2
2
|
export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
|
|
3
|
-
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol,
|
|
4
|
-
export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HeaderMetadata, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, LinkMetadata, Metadata, OcrMetadata, PageBoundary,
|
|
3
|
+
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
|
|
4
|
+
export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
|
|
5
5
|
export { GutenOcrBackend } from './ocr/guten-ocr.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
@@ -610,72 +610,12 @@ declare function unregisterDocumentExtractor(name: string): void;
|
|
|
610
610
|
*/
|
|
611
611
|
declare function clearDocumentExtractors(): void;
|
|
612
612
|
/**
|
|
613
|
-
*
|
|
614
|
-
*
|
|
615
|
-
* Provides a convenient way to build extraction configurations using method chaining.
|
|
616
|
-
*
|
|
617
|
-
* @example
|
|
618
|
-
* ```typescript
|
|
619
|
-
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
620
|
-
*
|
|
621
|
-
* // Create with builder pattern
|
|
622
|
-
* const config = ExtractionConfig.default()
|
|
623
|
-
* .withChunking({ maxChars: 2048 })
|
|
624
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
625
|
-
* .build();
|
|
626
|
-
*
|
|
627
|
-
* const result = await extractFile('document.pdf', null, config);
|
|
628
|
-
* ```
|
|
629
|
-
*/
|
|
630
|
-
declare class ExtractionConfigBuilder {
|
|
631
|
-
private config;
|
|
632
|
-
/**
|
|
633
|
-
* Create a new builder with default configuration.
|
|
634
|
-
*/
|
|
635
|
-
static default(): ExtractionConfigBuilder;
|
|
636
|
-
/**
|
|
637
|
-
* Set OCR configuration.
|
|
638
|
-
*/
|
|
639
|
-
withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
|
|
640
|
-
/**
|
|
641
|
-
* Set chunking configuration.
|
|
642
|
-
*/
|
|
643
|
-
withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
|
|
644
|
-
/**
|
|
645
|
-
* Set image extraction configuration.
|
|
646
|
-
*/
|
|
647
|
-
withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
|
|
648
|
-
/**
|
|
649
|
-
* Set PDF configuration.
|
|
650
|
-
*/
|
|
651
|
-
withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
|
|
652
|
-
/**
|
|
653
|
-
* Set keyword extraction configuration.
|
|
654
|
-
*/
|
|
655
|
-
withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
|
|
656
|
-
/**
|
|
657
|
-
* Set language detection configuration.
|
|
658
|
-
*/
|
|
659
|
-
withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
|
|
660
|
-
/**
|
|
661
|
-
* Set whether to enable metadata extraction.
|
|
662
|
-
*/
|
|
663
|
-
withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
|
|
664
|
-
/**
|
|
665
|
-
* Set whether to enable quality mode.
|
|
666
|
-
*/
|
|
667
|
-
withQualityMode(enabled: boolean): ExtractionConfigBuilder;
|
|
668
|
-
/**
|
|
669
|
-
* Build and return the final ExtractionConfig object.
|
|
670
|
-
*/
|
|
671
|
-
build(): ExtractionConfig$1;
|
|
672
|
-
}
|
|
673
|
-
/**
|
|
674
|
-
* ExtractionConfig namespace with static methods for loading configuration from files
|
|
675
|
-
* and creating new configurations with the builder pattern.
|
|
613
|
+
* ExtractionConfig namespace with static methods for loading configuration from files.
|
|
676
614
|
*
|
|
677
615
|
* Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
|
|
678
|
-
* or to
|
|
616
|
+
* or to discover configuration files in the current directory tree.
|
|
617
|
+
*
|
|
618
|
+
* For creating configurations programmatically, use plain TypeScript objects instead:
|
|
679
619
|
*
|
|
680
620
|
* @example
|
|
681
621
|
* ```typescript
|
|
@@ -684,35 +624,17 @@ declare class ExtractionConfigBuilder {
|
|
|
684
624
|
* // Load configuration from file
|
|
685
625
|
* const config1 = ExtractionConfig.fromFile('config.toml');
|
|
686
626
|
*
|
|
687
|
-
* //
|
|
688
|
-
* const config2 =
|
|
689
|
-
*
|
|
690
|
-
*
|
|
627
|
+
* // Or create with plain object
|
|
628
|
+
* const config2 = {
|
|
629
|
+
* chunking: { maxChars: 2048 },
|
|
630
|
+
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
631
|
+
* };
|
|
691
632
|
*
|
|
692
633
|
* // Use with extraction
|
|
693
634
|
* const result = await extractFile('document.pdf', null, config2);
|
|
694
635
|
* ```
|
|
695
636
|
*/
|
|
696
637
|
declare const ExtractionConfig: {
|
|
697
|
-
/**
|
|
698
|
-
* Create a default extraction configuration using the builder pattern.
|
|
699
|
-
*
|
|
700
|
-
* Returns a builder object that allows you to configure extraction settings
|
|
701
|
-
* using method chaining.
|
|
702
|
-
*
|
|
703
|
-
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
704
|
-
*
|
|
705
|
-
* @example
|
|
706
|
-
* ```typescript
|
|
707
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
708
|
-
*
|
|
709
|
-
* const config = ExtractionConfig.default()
|
|
710
|
-
* .withChunking({ maxChars: 2048 })
|
|
711
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
712
|
-
* .build();
|
|
713
|
-
* ```
|
|
714
|
-
*/
|
|
715
|
-
default(): ExtractionConfigBuilder;
|
|
716
638
|
/**
|
|
717
639
|
* Load extraction configuration from a file.
|
|
718
640
|
*
|
|
@@ -1060,6 +982,151 @@ declare function getErrorCodeDescription(code: number): string;
|
|
|
1060
982
|
* ```
|
|
1061
983
|
*/
|
|
1062
984
|
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
1063
|
-
|
|
985
|
+
/**
|
|
986
|
+
* Create a worker pool for concurrent file extraction.
|
|
987
|
+
*
|
|
988
|
+
* The worker pool manages a set of background worker threads that can process
|
|
989
|
+
* extraction requests concurrently, improving throughput when handling multiple files.
|
|
990
|
+
*
|
|
991
|
+
* @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
|
|
992
|
+
* @returns A WorkerPool instance to use with extraction functions
|
|
993
|
+
*
|
|
994
|
+
* @throws {Error} If size is invalid or pool creation fails
|
|
995
|
+
*
|
|
996
|
+
* @example
|
|
997
|
+
* ```typescript
|
|
998
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
999
|
+
*
|
|
1000
|
+
* // Create pool with 4 workers
|
|
1001
|
+
* const pool = createWorkerPool(4);
|
|
1002
|
+
*
|
|
1003
|
+
* try {
|
|
1004
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1005
|
+
* console.log(result.content);
|
|
1006
|
+
* } finally {
|
|
1007
|
+
* // Always close the pool when done
|
|
1008
|
+
* await closeWorkerPool(pool);
|
|
1009
|
+
* }
|
|
1010
|
+
* ```
|
|
1011
|
+
*/
|
|
1012
|
+
declare function createWorkerPool(size?: number): WorkerPool;
|
|
1013
|
+
/**
|
|
1014
|
+
* Get statistics about a worker pool.
|
|
1015
|
+
*
|
|
1016
|
+
* Returns information about the pool's current state, including the number of active workers,
|
|
1017
|
+
* queued tasks, and total processed tasks.
|
|
1018
|
+
*
|
|
1019
|
+
* @param pool - The worker pool instance
|
|
1020
|
+
* @returns WorkerPoolStats with pool information
|
|
1021
|
+
*
|
|
1022
|
+
* @example
|
|
1023
|
+
* ```typescript
|
|
1024
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1025
|
+
*
|
|
1026
|
+
* const pool = createWorkerPool(4);
|
|
1027
|
+
* const stats = getWorkerPoolStats(pool);
|
|
1028
|
+
*
|
|
1029
|
+
* console.log(`Pool size: ${stats.size}`);
|
|
1030
|
+
* console.log(`Active workers: ${stats.activeWorkers}`);
|
|
1031
|
+
* console.log(`Queued tasks: ${stats.queuedTasks}`);
|
|
1032
|
+
* ```
|
|
1033
|
+
*/
|
|
1034
|
+
declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
|
|
1035
|
+
/**
|
|
1036
|
+
* Extract content from a single file using a worker pool (asynchronous).
|
|
1037
|
+
*
|
|
1038
|
+
* Submits an extraction task to the worker pool. The task is executed by one of the
|
|
1039
|
+
* available workers in the background, allowing other tasks to be processed concurrently.
|
|
1040
|
+
*
|
|
1041
|
+
* @param pool - The worker pool instance
|
|
1042
|
+
* @param filePath - Path to the file to extract
|
|
1043
|
+
* @param mimeTypeOrConfig - Optional MIME type or extraction configuration
|
|
1044
|
+
* @param maybeConfig - Optional extraction configuration (if second param is MIME type)
|
|
1045
|
+
* @returns Promise<ExtractionResult> containing extracted content and metadata
|
|
1046
|
+
*
|
|
1047
|
+
* @throws {Error} If the file cannot be read or extraction fails
|
|
1048
|
+
*
|
|
1049
|
+
* @example
|
|
1050
|
+
* ```typescript
|
|
1051
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1052
|
+
*
|
|
1053
|
+
* const pool = createWorkerPool(4);
|
|
1054
|
+
*
|
|
1055
|
+
* try {
|
|
1056
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
1057
|
+
* const results = await Promise.all(
|
|
1058
|
+
* files.map(f => extractFileInWorker(pool, f))
|
|
1059
|
+
* );
|
|
1060
|
+
*
|
|
1061
|
+
* results.forEach((r, i) => {
|
|
1062
|
+
* console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
|
|
1063
|
+
* });
|
|
1064
|
+
* } finally {
|
|
1065
|
+
* await closeWorkerPool(pool);
|
|
1066
|
+
* }
|
|
1067
|
+
* ```
|
|
1068
|
+
*/
|
|
1069
|
+
declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
1070
|
+
/**
|
|
1071
|
+
* Extract content from multiple files in parallel using a worker pool (asynchronous).
|
|
1072
|
+
*
|
|
1073
|
+
* Submits multiple extraction tasks to the worker pool for concurrent processing.
|
|
1074
|
+
* This is more efficient than using `extractFileInWorker` multiple times sequentially.
|
|
1075
|
+
*
|
|
1076
|
+
* @param pool - The worker pool instance
|
|
1077
|
+
* @param paths - Array of file paths to extract
|
|
1078
|
+
* @param config - Extraction configuration object (applies to all files)
|
|
1079
|
+
* @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
|
|
1080
|
+
*
|
|
1081
|
+
* @throws {Error} If any file cannot be read or extraction fails
|
|
1082
|
+
*
|
|
1083
|
+
* @example
|
|
1084
|
+
* ```typescript
|
|
1085
|
+
* import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1086
|
+
*
|
|
1087
|
+
* const pool = createWorkerPool(4);
|
|
1088
|
+
*
|
|
1089
|
+
* try {
|
|
1090
|
+
* const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
|
|
1091
|
+
* const results = await batchExtractFilesInWorker(pool, files, {
|
|
1092
|
+
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
1093
|
+
* });
|
|
1094
|
+
*
|
|
1095
|
+
* const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
|
|
1096
|
+
* console.log(`Total: $${total}`);
|
|
1097
|
+
* } finally {
|
|
1098
|
+
* await closeWorkerPool(pool);
|
|
1099
|
+
* }
|
|
1100
|
+
* ```
|
|
1101
|
+
*/
|
|
1102
|
+
declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
|
|
1103
|
+
/**
|
|
1104
|
+
* Close a worker pool and shut down all worker threads.
|
|
1105
|
+
*
|
|
1106
|
+
* Should be called when the pool is no longer needed to clean up resources
|
|
1107
|
+
* and gracefully shut down worker threads. Any pending tasks will be cancelled.
|
|
1108
|
+
*
|
|
1109
|
+
* @param pool - The worker pool instance to close
|
|
1110
|
+
* @returns Promise that resolves when the pool is fully closed
|
|
1111
|
+
*
|
|
1112
|
+
* @throws {Error} If pool shutdown fails
|
|
1113
|
+
*
|
|
1114
|
+
* @example
|
|
1115
|
+
* ```typescript
|
|
1116
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1117
|
+
*
|
|
1118
|
+
* const pool = createWorkerPool(4);
|
|
1119
|
+
*
|
|
1120
|
+
* try {
|
|
1121
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1122
|
+
* console.log(result.content);
|
|
1123
|
+
* } finally {
|
|
1124
|
+
* // Clean up the pool
|
|
1125
|
+
* await closeWorkerPool(pool);
|
|
1126
|
+
* }
|
|
1127
|
+
* ```
|
|
1128
|
+
*/
|
|
1129
|
+
declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
|
|
1130
|
+
declare const __version__ = "4.0.0-rc.25";
|
|
1064
1131
|
|
|
1065
|
-
export {
|
|
1132
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.js
CHANGED
|
@@ -36,17 +36,21 @@ __export(index_exports, {
|
|
|
36
36
|
batchExtractBytes: () => batchExtractBytes,
|
|
37
37
|
batchExtractBytesSync: () => batchExtractBytesSync,
|
|
38
38
|
batchExtractFiles: () => batchExtractFiles,
|
|
39
|
+
batchExtractFilesInWorker: () => batchExtractFilesInWorker,
|
|
39
40
|
batchExtractFilesSync: () => batchExtractFilesSync,
|
|
40
41
|
classifyError: () => classifyError,
|
|
41
42
|
clearDocumentExtractors: () => clearDocumentExtractors,
|
|
42
43
|
clearOcrBackends: () => clearOcrBackends,
|
|
43
44
|
clearPostProcessors: () => clearPostProcessors,
|
|
44
45
|
clearValidators: () => clearValidators,
|
|
46
|
+
closeWorkerPool: () => closeWorkerPool,
|
|
47
|
+
createWorkerPool: () => createWorkerPool,
|
|
45
48
|
detectMimeType: () => detectMimeType,
|
|
46
49
|
detectMimeTypeFromPath: () => detectMimeTypeFromPath,
|
|
47
50
|
extractBytes: () => extractBytes,
|
|
48
51
|
extractBytesSync: () => extractBytesSync,
|
|
49
52
|
extractFile: () => extractFile,
|
|
53
|
+
extractFileInWorker: () => extractFileInWorker,
|
|
50
54
|
extractFileSync: () => extractFileSync,
|
|
51
55
|
getEmbeddingPreset: () => getEmbeddingPreset,
|
|
52
56
|
getErrorCodeDescription: () => getErrorCodeDescription,
|
|
@@ -54,6 +58,7 @@ __export(index_exports, {
|
|
|
54
58
|
getExtensionsForMime: () => getExtensionsForMime,
|
|
55
59
|
getLastErrorCode: () => getLastErrorCode,
|
|
56
60
|
getLastPanicContext: () => getLastPanicContext,
|
|
61
|
+
getWorkerPoolStats: () => getWorkerPoolStats,
|
|
57
62
|
listDocumentExtractors: () => listDocumentExtractors,
|
|
58
63
|
listEmbeddingPresets: () => listEmbeddingPresets,
|
|
59
64
|
listOcrBackends: () => listOcrBackends,
|
|
@@ -133,7 +138,16 @@ function __resetBindingForTests() {
|
|
|
133
138
|
bindingInitialized = false;
|
|
134
139
|
}
|
|
135
140
|
function loadNativeBinding() {
|
|
136
|
-
|
|
141
|
+
let localRequire;
|
|
142
|
+
if (typeof require !== "undefined") {
|
|
143
|
+
localRequire = require;
|
|
144
|
+
} else {
|
|
145
|
+
try {
|
|
146
|
+
localRequire = (0, import_node_module.createRequire)(import_meta.url);
|
|
147
|
+
} catch {
|
|
148
|
+
localRequire = void 0;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
137
151
|
if (!localRequire) {
|
|
138
152
|
throw new Error("Unable to resolve native binding loader (require not available).");
|
|
139
153
|
}
|
|
@@ -317,9 +331,9 @@ function convertResult(rawResult) {
|
|
|
317
331
|
metadata: {},
|
|
318
332
|
tables: [],
|
|
319
333
|
detectedLanguages: null,
|
|
320
|
-
chunks:
|
|
321
|
-
images:
|
|
322
|
-
pages:
|
|
334
|
+
chunks: null,
|
|
335
|
+
images: null,
|
|
336
|
+
pages: null
|
|
323
337
|
};
|
|
324
338
|
}
|
|
325
339
|
const result = rawResult;
|
|
@@ -335,9 +349,9 @@ function convertResult(rawResult) {
|
|
|
335
349
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
336
350
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
337
351
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
338
|
-
chunks:
|
|
339
|
-
images:
|
|
340
|
-
pages:
|
|
352
|
+
chunks: null,
|
|
353
|
+
images: null,
|
|
354
|
+
pages: null
|
|
341
355
|
};
|
|
342
356
|
const chunksData = result["chunks"];
|
|
343
357
|
if (Array.isArray(chunksData)) {
|
|
@@ -515,9 +529,9 @@ function normalizePageConfig(pages) {
|
|
|
515
529
|
return void 0;
|
|
516
530
|
}
|
|
517
531
|
const normalized = {};
|
|
518
|
-
setIfDefined(normalized, "
|
|
519
|
-
setIfDefined(normalized, "
|
|
520
|
-
setIfDefined(normalized, "
|
|
532
|
+
setIfDefined(normalized, "extractPages", pages.extractPages);
|
|
533
|
+
setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
|
|
534
|
+
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
521
535
|
return normalized;
|
|
522
536
|
}
|
|
523
537
|
function normalizeExtractionConfig(config) {
|
|
@@ -813,99 +827,7 @@ function clearDocumentExtractors() {
|
|
|
813
827
|
const binding2 = getBinding();
|
|
814
828
|
binding2.clearDocumentExtractors();
|
|
815
829
|
}
|
|
816
|
-
class ExtractionConfigBuilder {
|
|
817
|
-
config = {};
|
|
818
|
-
/**
|
|
819
|
-
* Create a new builder with default configuration.
|
|
820
|
-
*/
|
|
821
|
-
static default() {
|
|
822
|
-
return new ExtractionConfigBuilder();
|
|
823
|
-
}
|
|
824
|
-
/**
|
|
825
|
-
* Set OCR configuration.
|
|
826
|
-
*/
|
|
827
|
-
withOcr(ocr) {
|
|
828
|
-
this.config["ocr"] = ocr;
|
|
829
|
-
return this;
|
|
830
|
-
}
|
|
831
|
-
/**
|
|
832
|
-
* Set chunking configuration.
|
|
833
|
-
*/
|
|
834
|
-
withChunking(chunking) {
|
|
835
|
-
this.config["chunking"] = chunking;
|
|
836
|
-
return this;
|
|
837
|
-
}
|
|
838
|
-
/**
|
|
839
|
-
* Set image extraction configuration.
|
|
840
|
-
*/
|
|
841
|
-
withImageExtraction(images) {
|
|
842
|
-
this.config["imageExtraction"] = images;
|
|
843
|
-
return this;
|
|
844
|
-
}
|
|
845
|
-
/**
|
|
846
|
-
* Set PDF configuration.
|
|
847
|
-
*/
|
|
848
|
-
withPdf(pdf) {
|
|
849
|
-
this.config["pdf"] = pdf;
|
|
850
|
-
return this;
|
|
851
|
-
}
|
|
852
|
-
/**
|
|
853
|
-
* Set keyword extraction configuration.
|
|
854
|
-
*/
|
|
855
|
-
withKeywords(keywords) {
|
|
856
|
-
this.config["keywords"] = keywords;
|
|
857
|
-
return this;
|
|
858
|
-
}
|
|
859
|
-
/**
|
|
860
|
-
* Set language detection configuration.
|
|
861
|
-
*/
|
|
862
|
-
withLanguageDetection(languageDetection) {
|
|
863
|
-
this.config["languageDetection"] = languageDetection;
|
|
864
|
-
return this;
|
|
865
|
-
}
|
|
866
|
-
/**
|
|
867
|
-
* Set whether to enable metadata extraction.
|
|
868
|
-
*/
|
|
869
|
-
withMetadataExtraction(enabled) {
|
|
870
|
-
this.config["metadataExtraction"] = enabled;
|
|
871
|
-
return this;
|
|
872
|
-
}
|
|
873
|
-
/**
|
|
874
|
-
* Set whether to enable quality mode.
|
|
875
|
-
*/
|
|
876
|
-
withQualityMode(enabled) {
|
|
877
|
-
this.config["qualityMode"] = enabled;
|
|
878
|
-
return this;
|
|
879
|
-
}
|
|
880
|
-
/**
|
|
881
|
-
* Build and return the final ExtractionConfig object.
|
|
882
|
-
*/
|
|
883
|
-
build() {
|
|
884
|
-
return this.config;
|
|
885
|
-
}
|
|
886
|
-
}
|
|
887
830
|
const ExtractionConfig = {
|
|
888
|
-
/**
|
|
889
|
-
* Create a default extraction configuration using the builder pattern.
|
|
890
|
-
*
|
|
891
|
-
* Returns a builder object that allows you to configure extraction settings
|
|
892
|
-
* using method chaining.
|
|
893
|
-
*
|
|
894
|
-
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
895
|
-
*
|
|
896
|
-
* @example
|
|
897
|
-
* ```typescript
|
|
898
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
899
|
-
*
|
|
900
|
-
* const config = ExtractionConfig.default()
|
|
901
|
-
* .withChunking({ maxChars: 2048 })
|
|
902
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
903
|
-
* .build();
|
|
904
|
-
* ```
|
|
905
|
-
*/
|
|
906
|
-
default() {
|
|
907
|
-
return ExtractionConfigBuilder.default();
|
|
908
|
-
},
|
|
909
831
|
/**
|
|
910
832
|
* Load extraction configuration from a file.
|
|
911
833
|
*
|
|
@@ -1014,7 +936,54 @@ function classifyError(errorMessage) {
|
|
|
1014
936
|
const result = binding2.classifyError(errorMessage);
|
|
1015
937
|
return result;
|
|
1016
938
|
}
|
|
1017
|
-
|
|
939
|
+
function createWorkerPool(size) {
|
|
940
|
+
const binding2 = getBinding();
|
|
941
|
+
const rawPool = binding2.createWorkerPool(size);
|
|
942
|
+
return rawPool;
|
|
943
|
+
}
|
|
944
|
+
function getWorkerPoolStats(pool) {
|
|
945
|
+
const binding2 = getBinding();
|
|
946
|
+
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
947
|
+
return rawStats;
|
|
948
|
+
}
|
|
949
|
+
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
950
|
+
let mimeType = null;
|
|
951
|
+
let config = null;
|
|
952
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
953
|
+
mimeType = mimeTypeOrConfig;
|
|
954
|
+
config = maybeConfig ?? null;
|
|
955
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
956
|
+
config = mimeTypeOrConfig;
|
|
957
|
+
mimeType = null;
|
|
958
|
+
} else {
|
|
959
|
+
config = maybeConfig ?? null;
|
|
960
|
+
mimeType = null;
|
|
961
|
+
}
|
|
962
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
963
|
+
const binding2 = getBinding();
|
|
964
|
+
const rawResult = await binding2.extractFileInWorker(
|
|
965
|
+
pool,
|
|
966
|
+
filePath,
|
|
967
|
+
mimeType,
|
|
968
|
+
normalizedConfig
|
|
969
|
+
);
|
|
970
|
+
return convertResult(rawResult);
|
|
971
|
+
}
|
|
972
|
+
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
973
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
974
|
+
const binding2 = getBinding();
|
|
975
|
+
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
976
|
+
pool,
|
|
977
|
+
paths,
|
|
978
|
+
normalizedConfig
|
|
979
|
+
);
|
|
980
|
+
return rawResults.map(convertResult);
|
|
981
|
+
}
|
|
982
|
+
async function closeWorkerPool(pool) {
|
|
983
|
+
const binding2 = getBinding();
|
|
984
|
+
await binding2.closeWorkerPool(pool);
|
|
985
|
+
}
|
|
986
|
+
const __version__ = "4.0.0-rc.25";
|
|
1018
987
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1019
988
|
0 && (module.exports = {
|
|
1020
989
|
CacheError,
|
|
@@ -1034,17 +1003,21 @@ const __version__ = "4.0.0-rc.22";
|
|
|
1034
1003
|
batchExtractBytes,
|
|
1035
1004
|
batchExtractBytesSync,
|
|
1036
1005
|
batchExtractFiles,
|
|
1006
|
+
batchExtractFilesInWorker,
|
|
1037
1007
|
batchExtractFilesSync,
|
|
1038
1008
|
classifyError,
|
|
1039
1009
|
clearDocumentExtractors,
|
|
1040
1010
|
clearOcrBackends,
|
|
1041
1011
|
clearPostProcessors,
|
|
1042
1012
|
clearValidators,
|
|
1013
|
+
closeWorkerPool,
|
|
1014
|
+
createWorkerPool,
|
|
1043
1015
|
detectMimeType,
|
|
1044
1016
|
detectMimeTypeFromPath,
|
|
1045
1017
|
extractBytes,
|
|
1046
1018
|
extractBytesSync,
|
|
1047
1019
|
extractFile,
|
|
1020
|
+
extractFileInWorker,
|
|
1048
1021
|
extractFileSync,
|
|
1049
1022
|
getEmbeddingPreset,
|
|
1050
1023
|
getErrorCodeDescription,
|
|
@@ -1052,6 +1025,7 @@ const __version__ = "4.0.0-rc.22";
|
|
|
1052
1025
|
getExtensionsForMime,
|
|
1053
1026
|
getLastErrorCode,
|
|
1054
1027
|
getLastPanicContext,
|
|
1028
|
+
getWorkerPoolStats,
|
|
1055
1029
|
listDocumentExtractors,
|
|
1056
1030
|
listEmbeddingPresets,
|
|
1057
1031
|
listOcrBackends,
|