@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -534
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +158 -91
- package/dist/index.d.ts +158 -91
- package/dist/index.js +77 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +72 -103
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +141 -36
- package/dist/types.d.ts +141 -36
- package/dist/types.js.map +1 -1
- package/index.d.ts +183 -0
- package/index.js +64 -54
- package/metadata.d.ts +53 -33
- package/package.json +5 -6
package/dist/index.mjs
CHANGED
|
@@ -71,7 +71,16 @@ function __resetBindingForTests() {
|
|
|
71
71
|
bindingInitialized = false;
|
|
72
72
|
}
|
|
73
73
|
function loadNativeBinding() {
|
|
74
|
-
|
|
74
|
+
let localRequire;
|
|
75
|
+
if (typeof require !== "undefined") {
|
|
76
|
+
localRequire = require;
|
|
77
|
+
} else {
|
|
78
|
+
try {
|
|
79
|
+
localRequire = createRequire(import.meta.url);
|
|
80
|
+
} catch {
|
|
81
|
+
localRequire = void 0;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
75
84
|
if (!localRequire) {
|
|
76
85
|
throw new Error("Unable to resolve native binding loader (require not available).");
|
|
77
86
|
}
|
|
@@ -255,9 +264,9 @@ function convertResult(rawResult) {
|
|
|
255
264
|
metadata: {},
|
|
256
265
|
tables: [],
|
|
257
266
|
detectedLanguages: null,
|
|
258
|
-
chunks:
|
|
259
|
-
images:
|
|
260
|
-
pages:
|
|
267
|
+
chunks: null,
|
|
268
|
+
images: null,
|
|
269
|
+
pages: null
|
|
261
270
|
};
|
|
262
271
|
}
|
|
263
272
|
const result = rawResult;
|
|
@@ -273,9 +282,9 @@ function convertResult(rawResult) {
|
|
|
273
282
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
274
283
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
275
284
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
276
|
-
chunks:
|
|
277
|
-
images:
|
|
278
|
-
pages:
|
|
285
|
+
chunks: null,
|
|
286
|
+
images: null,
|
|
287
|
+
pages: null
|
|
279
288
|
};
|
|
280
289
|
const chunksData = result["chunks"];
|
|
281
290
|
if (Array.isArray(chunksData)) {
|
|
@@ -453,9 +462,9 @@ function normalizePageConfig(pages) {
|
|
|
453
462
|
return void 0;
|
|
454
463
|
}
|
|
455
464
|
const normalized = {};
|
|
456
|
-
setIfDefined(normalized, "
|
|
457
|
-
setIfDefined(normalized, "
|
|
458
|
-
setIfDefined(normalized, "
|
|
465
|
+
setIfDefined(normalized, "extractPages", pages.extractPages);
|
|
466
|
+
setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
|
|
467
|
+
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
459
468
|
return normalized;
|
|
460
469
|
}
|
|
461
470
|
function normalizeExtractionConfig(config) {
|
|
@@ -751,99 +760,7 @@ function clearDocumentExtractors() {
|
|
|
751
760
|
const binding2 = getBinding();
|
|
752
761
|
binding2.clearDocumentExtractors();
|
|
753
762
|
}
|
|
754
|
-
class ExtractionConfigBuilder {
|
|
755
|
-
config = {};
|
|
756
|
-
/**
|
|
757
|
-
* Create a new builder with default configuration.
|
|
758
|
-
*/
|
|
759
|
-
static default() {
|
|
760
|
-
return new ExtractionConfigBuilder();
|
|
761
|
-
}
|
|
762
|
-
/**
|
|
763
|
-
* Set OCR configuration.
|
|
764
|
-
*/
|
|
765
|
-
withOcr(ocr) {
|
|
766
|
-
this.config["ocr"] = ocr;
|
|
767
|
-
return this;
|
|
768
|
-
}
|
|
769
|
-
/**
|
|
770
|
-
* Set chunking configuration.
|
|
771
|
-
*/
|
|
772
|
-
withChunking(chunking) {
|
|
773
|
-
this.config["chunking"] = chunking;
|
|
774
|
-
return this;
|
|
775
|
-
}
|
|
776
|
-
/**
|
|
777
|
-
* Set image extraction configuration.
|
|
778
|
-
*/
|
|
779
|
-
withImageExtraction(images) {
|
|
780
|
-
this.config["imageExtraction"] = images;
|
|
781
|
-
return this;
|
|
782
|
-
}
|
|
783
|
-
/**
|
|
784
|
-
* Set PDF configuration.
|
|
785
|
-
*/
|
|
786
|
-
withPdf(pdf) {
|
|
787
|
-
this.config["pdf"] = pdf;
|
|
788
|
-
return this;
|
|
789
|
-
}
|
|
790
|
-
/**
|
|
791
|
-
* Set keyword extraction configuration.
|
|
792
|
-
*/
|
|
793
|
-
withKeywords(keywords) {
|
|
794
|
-
this.config["keywords"] = keywords;
|
|
795
|
-
return this;
|
|
796
|
-
}
|
|
797
|
-
/**
|
|
798
|
-
* Set language detection configuration.
|
|
799
|
-
*/
|
|
800
|
-
withLanguageDetection(languageDetection) {
|
|
801
|
-
this.config["languageDetection"] = languageDetection;
|
|
802
|
-
return this;
|
|
803
|
-
}
|
|
804
|
-
/**
|
|
805
|
-
* Set whether to enable metadata extraction.
|
|
806
|
-
*/
|
|
807
|
-
withMetadataExtraction(enabled) {
|
|
808
|
-
this.config["metadataExtraction"] = enabled;
|
|
809
|
-
return this;
|
|
810
|
-
}
|
|
811
|
-
/**
|
|
812
|
-
* Set whether to enable quality mode.
|
|
813
|
-
*/
|
|
814
|
-
withQualityMode(enabled) {
|
|
815
|
-
this.config["qualityMode"] = enabled;
|
|
816
|
-
return this;
|
|
817
|
-
}
|
|
818
|
-
/**
|
|
819
|
-
* Build and return the final ExtractionConfig object.
|
|
820
|
-
*/
|
|
821
|
-
build() {
|
|
822
|
-
return this.config;
|
|
823
|
-
}
|
|
824
|
-
}
|
|
825
763
|
const ExtractionConfig = {
|
|
826
|
-
/**
|
|
827
|
-
* Create a default extraction configuration using the builder pattern.
|
|
828
|
-
*
|
|
829
|
-
* Returns a builder object that allows you to configure extraction settings
|
|
830
|
-
* using method chaining.
|
|
831
|
-
*
|
|
832
|
-
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
833
|
-
*
|
|
834
|
-
* @example
|
|
835
|
-
* ```typescript
|
|
836
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
837
|
-
*
|
|
838
|
-
* const config = ExtractionConfig.default()
|
|
839
|
-
* .withChunking({ maxChars: 2048 })
|
|
840
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
841
|
-
* .build();
|
|
842
|
-
* ```
|
|
843
|
-
*/
|
|
844
|
-
default() {
|
|
845
|
-
return ExtractionConfigBuilder.default();
|
|
846
|
-
},
|
|
847
764
|
/**
|
|
848
765
|
* Load extraction configuration from a file.
|
|
849
766
|
*
|
|
@@ -952,7 +869,54 @@ function classifyError(errorMessage) {
|
|
|
952
869
|
const result = binding2.classifyError(errorMessage);
|
|
953
870
|
return result;
|
|
954
871
|
}
|
|
955
|
-
|
|
872
|
+
function createWorkerPool(size) {
|
|
873
|
+
const binding2 = getBinding();
|
|
874
|
+
const rawPool = binding2.createWorkerPool(size);
|
|
875
|
+
return rawPool;
|
|
876
|
+
}
|
|
877
|
+
function getWorkerPoolStats(pool) {
|
|
878
|
+
const binding2 = getBinding();
|
|
879
|
+
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
880
|
+
return rawStats;
|
|
881
|
+
}
|
|
882
|
+
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
883
|
+
let mimeType = null;
|
|
884
|
+
let config = null;
|
|
885
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
886
|
+
mimeType = mimeTypeOrConfig;
|
|
887
|
+
config = maybeConfig ?? null;
|
|
888
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
889
|
+
config = mimeTypeOrConfig;
|
|
890
|
+
mimeType = null;
|
|
891
|
+
} else {
|
|
892
|
+
config = maybeConfig ?? null;
|
|
893
|
+
mimeType = null;
|
|
894
|
+
}
|
|
895
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
896
|
+
const binding2 = getBinding();
|
|
897
|
+
const rawResult = await binding2.extractFileInWorker(
|
|
898
|
+
pool,
|
|
899
|
+
filePath,
|
|
900
|
+
mimeType,
|
|
901
|
+
normalizedConfig
|
|
902
|
+
);
|
|
903
|
+
return convertResult(rawResult);
|
|
904
|
+
}
|
|
905
|
+
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
906
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
907
|
+
const binding2 = getBinding();
|
|
908
|
+
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
909
|
+
pool,
|
|
910
|
+
paths,
|
|
911
|
+
normalizedConfig
|
|
912
|
+
);
|
|
913
|
+
return rawResults.map(convertResult);
|
|
914
|
+
}
|
|
915
|
+
async function closeWorkerPool(pool) {
|
|
916
|
+
const binding2 = getBinding();
|
|
917
|
+
await binding2.closeWorkerPool(pool);
|
|
918
|
+
}
|
|
919
|
+
const __version__ = "4.0.0-rc.24";
|
|
956
920
|
export {
|
|
957
921
|
CacheError,
|
|
958
922
|
ErrorCode,
|
|
@@ -971,17 +935,21 @@ export {
|
|
|
971
935
|
batchExtractBytes,
|
|
972
936
|
batchExtractBytesSync,
|
|
973
937
|
batchExtractFiles,
|
|
938
|
+
batchExtractFilesInWorker,
|
|
974
939
|
batchExtractFilesSync,
|
|
975
940
|
classifyError,
|
|
976
941
|
clearDocumentExtractors,
|
|
977
942
|
clearOcrBackends,
|
|
978
943
|
clearPostProcessors,
|
|
979
944
|
clearValidators,
|
|
945
|
+
closeWorkerPool,
|
|
946
|
+
createWorkerPool,
|
|
980
947
|
detectMimeType,
|
|
981
948
|
detectMimeTypeFromPath,
|
|
982
949
|
extractBytes,
|
|
983
950
|
extractBytesSync,
|
|
984
951
|
extractFile,
|
|
952
|
+
extractFileInWorker,
|
|
985
953
|
extractFileSync,
|
|
986
954
|
getEmbeddingPreset,
|
|
987
955
|
getErrorCodeDescription,
|
|
@@ -989,6 +957,7 @@ export {
|
|
|
989
957
|
getExtensionsForMime,
|
|
990
958
|
getLastErrorCode,
|
|
991
959
|
getLastPanicContext,
|
|
960
|
+
getWorkerPoolStats,
|
|
992
961
|
listDocumentExtractors,
|
|
993
962
|
listEmbeddingPresets,
|
|
994
963
|
listOcrBackends,
|