@kreuzberg/node 4.0.0-rc.15 → 4.0.0-rc.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +180 -23
- package/dist/index.d.ts +180 -23
- package/dist/index.js +181 -29
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +178 -29
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +37 -1
- package/dist/types.d.ts +37 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +478 -0
- package/index.js +72 -52
- package/package.json +5 -5
package/dist/index.js
CHANGED
|
@@ -37,6 +37,7 @@ __export(index_exports, {
|
|
|
37
37
|
batchExtractBytesSync: () => batchExtractBytesSync,
|
|
38
38
|
batchExtractFiles: () => batchExtractFiles,
|
|
39
39
|
batchExtractFilesSync: () => batchExtractFilesSync,
|
|
40
|
+
classifyError: () => classifyError,
|
|
40
41
|
clearDocumentExtractors: () => clearDocumentExtractors,
|
|
41
42
|
clearOcrBackends: () => clearOcrBackends,
|
|
42
43
|
clearPostProcessors: () => clearPostProcessors,
|
|
@@ -48,6 +49,8 @@ __export(index_exports, {
|
|
|
48
49
|
extractFile: () => extractFile,
|
|
49
50
|
extractFileSync: () => extractFileSync,
|
|
50
51
|
getEmbeddingPreset: () => getEmbeddingPreset,
|
|
52
|
+
getErrorCodeDescription: () => getErrorCodeDescription,
|
|
53
|
+
getErrorCodeName: () => getErrorCodeName,
|
|
51
54
|
getExtensionsForMime: () => getExtensionsForMime,
|
|
52
55
|
getLastErrorCode: () => getLastErrorCode,
|
|
53
56
|
getLastPanicContext: () => getLastPanicContext,
|
|
@@ -67,6 +70,7 @@ __export(index_exports, {
|
|
|
67
70
|
});
|
|
68
71
|
module.exports = __toCommonJS(index_exports);
|
|
69
72
|
var import_node_module = require("node:module");
|
|
73
|
+
var import_node_fs = require("node:fs");
|
|
70
74
|
var import_errors = require("./errors.js");
|
|
71
75
|
var import_guten_ocr = require("./ocr/guten-ocr.js");
|
|
72
76
|
__reExport(index_exports, require("./types.js"), module.exports);
|
|
@@ -313,15 +317,15 @@ function convertResult(rawResult) {
|
|
|
313
317
|
metadata: {},
|
|
314
318
|
tables: [],
|
|
315
319
|
detectedLanguages: null,
|
|
316
|
-
chunks:
|
|
317
|
-
images:
|
|
318
|
-
pages:
|
|
320
|
+
chunks: void 0,
|
|
321
|
+
images: void 0,
|
|
322
|
+
pages: void 0
|
|
319
323
|
};
|
|
320
324
|
}
|
|
321
325
|
const result = rawResult;
|
|
322
326
|
const metadata = result["metadata"];
|
|
323
327
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
324
|
-
|
|
328
|
+
const returnObj = {
|
|
325
329
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
326
330
|
content: result["content"] ?? "",
|
|
327
331
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
@@ -331,19 +335,23 @@ function convertResult(rawResult) {
|
|
|
331
335
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
332
336
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
333
337
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
334
|
-
chunks:
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
})(),
|
|
338
|
-
images: (() => {
|
|
339
|
-
const imagesData = result["images"];
|
|
340
|
-
return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
|
|
341
|
-
})(),
|
|
342
|
-
pages: (() => {
|
|
343
|
-
const pagesData = result["pages"];
|
|
344
|
-
return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
|
|
345
|
-
})()
|
|
338
|
+
chunks: void 0,
|
|
339
|
+
images: void 0,
|
|
340
|
+
pages: void 0
|
|
346
341
|
};
|
|
342
|
+
const chunksData = result["chunks"];
|
|
343
|
+
if (Array.isArray(chunksData)) {
|
|
344
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
345
|
+
}
|
|
346
|
+
const imagesData = result["images"];
|
|
347
|
+
if (Array.isArray(imagesData)) {
|
|
348
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
349
|
+
}
|
|
350
|
+
const pagesData = result["pages"];
|
|
351
|
+
if (Array.isArray(pagesData)) {
|
|
352
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
353
|
+
}
|
|
354
|
+
return returnObj;
|
|
347
355
|
}
|
|
348
356
|
function setIfDefined(target, key, value) {
|
|
349
357
|
if (value !== void 0) {
|
|
@@ -543,23 +551,59 @@ function normalizeExtractionConfig(config) {
|
|
|
543
551
|
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
544
552
|
return normalized;
|
|
545
553
|
}
|
|
546
|
-
function extractFileSync(filePath,
|
|
554
|
+
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
555
|
+
let mimeType = null;
|
|
556
|
+
let config = null;
|
|
557
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
558
|
+
mimeType = mimeTypeOrConfig;
|
|
559
|
+
config = maybeConfig ?? null;
|
|
560
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
561
|
+
config = mimeTypeOrConfig;
|
|
562
|
+
mimeType = null;
|
|
563
|
+
} else {
|
|
564
|
+
config = maybeConfig ?? null;
|
|
565
|
+
mimeType = null;
|
|
566
|
+
}
|
|
547
567
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
548
568
|
const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
|
|
549
569
|
return convertResult(rawResult);
|
|
550
570
|
}
|
|
551
|
-
async function extractFile(filePath,
|
|
571
|
+
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
572
|
+
let mimeType = null;
|
|
573
|
+
let config = null;
|
|
574
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
575
|
+
mimeType = mimeTypeOrConfig;
|
|
576
|
+
config = maybeConfig ?? null;
|
|
577
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
578
|
+
config = mimeTypeOrConfig;
|
|
579
|
+
mimeType = null;
|
|
580
|
+
} else {
|
|
581
|
+
config = maybeConfig ?? null;
|
|
582
|
+
mimeType = null;
|
|
583
|
+
}
|
|
552
584
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
553
585
|
const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
|
|
554
586
|
return convertResult(rawResult);
|
|
555
587
|
}
|
|
556
|
-
function extractBytesSync(
|
|
588
|
+
function extractBytesSync(dataOrPath, mimeType, config = null) {
|
|
589
|
+
let data;
|
|
590
|
+
if (typeof dataOrPath === "string") {
|
|
591
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
592
|
+
} else {
|
|
593
|
+
data = dataOrPath;
|
|
594
|
+
}
|
|
557
595
|
const validated = assertUint8Array(data, "data");
|
|
558
596
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
559
597
|
const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
|
|
560
598
|
return convertResult(rawResult);
|
|
561
599
|
}
|
|
562
|
-
async function extractBytes(
|
|
600
|
+
async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
601
|
+
let data;
|
|
602
|
+
if (typeof dataOrPath === "string") {
|
|
603
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
604
|
+
} else {
|
|
605
|
+
data = dataOrPath;
|
|
606
|
+
}
|
|
563
607
|
const validated = assertUint8Array(data, "data");
|
|
564
608
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
565
609
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
@@ -599,8 +643,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
599
643
|
function registerPostProcessor(processor) {
|
|
600
644
|
const binding2 = getBinding();
|
|
601
645
|
const wrappedProcessor = {
|
|
602
|
-
name: processor.name.
|
|
603
|
-
processingStage: processor.processingStage
|
|
646
|
+
name: typeof processor.name === "function" ? processor.name() : processor.name,
|
|
647
|
+
processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
|
|
604
648
|
async process(...args) {
|
|
605
649
|
const wrappedValue = args[0];
|
|
606
650
|
const jsonString = wrappedValue[0];
|
|
@@ -653,8 +697,8 @@ function listPostProcessors() {
|
|
|
653
697
|
function registerValidator(validator) {
|
|
654
698
|
const binding2 = getBinding();
|
|
655
699
|
const wrappedValidator = {
|
|
656
|
-
name: validator.name.
|
|
657
|
-
priority: validator.priority
|
|
700
|
+
name: typeof validator.name === "function" ? validator.name() : validator.name,
|
|
701
|
+
priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
|
|
658
702
|
async validate(...args) {
|
|
659
703
|
const jsonString = args[0];
|
|
660
704
|
if (!jsonString || jsonString === "undefined") {
|
|
@@ -703,8 +747,8 @@ function describePayload(value) {
|
|
|
703
747
|
function registerOcrBackend(backend) {
|
|
704
748
|
const binding2 = getBinding();
|
|
705
749
|
const wrappedBackend = {
|
|
706
|
-
name: backend.name.
|
|
707
|
-
supportedLanguages: backend.supportedLanguages.
|
|
750
|
+
name: typeof backend.name === "function" ? backend.name() : backend.name,
|
|
751
|
+
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
708
752
|
async processImage(...processArgs) {
|
|
709
753
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
710
754
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
@@ -769,7 +813,99 @@ function clearDocumentExtractors() {
|
|
|
769
813
|
const binding2 = getBinding();
|
|
770
814
|
binding2.clearDocumentExtractors();
|
|
771
815
|
}
|
|
816
|
+
class ExtractionConfigBuilder {
|
|
817
|
+
config = {};
|
|
818
|
+
/**
|
|
819
|
+
* Create a new builder with default configuration.
|
|
820
|
+
*/
|
|
821
|
+
static default() {
|
|
822
|
+
return new ExtractionConfigBuilder();
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Set OCR configuration.
|
|
826
|
+
*/
|
|
827
|
+
withOcr(ocr) {
|
|
828
|
+
this.config["ocr"] = ocr;
|
|
829
|
+
return this;
|
|
830
|
+
}
|
|
831
|
+
/**
|
|
832
|
+
* Set chunking configuration.
|
|
833
|
+
*/
|
|
834
|
+
withChunking(chunking) {
|
|
835
|
+
this.config["chunking"] = chunking;
|
|
836
|
+
return this;
|
|
837
|
+
}
|
|
838
|
+
/**
|
|
839
|
+
* Set image extraction configuration.
|
|
840
|
+
*/
|
|
841
|
+
withImageExtraction(images) {
|
|
842
|
+
this.config["imageExtraction"] = images;
|
|
843
|
+
return this;
|
|
844
|
+
}
|
|
845
|
+
/**
|
|
846
|
+
* Set PDF configuration.
|
|
847
|
+
*/
|
|
848
|
+
withPdf(pdf) {
|
|
849
|
+
this.config["pdf"] = pdf;
|
|
850
|
+
return this;
|
|
851
|
+
}
|
|
852
|
+
/**
|
|
853
|
+
* Set keyword extraction configuration.
|
|
854
|
+
*/
|
|
855
|
+
withKeywords(keywords) {
|
|
856
|
+
this.config["keywords"] = keywords;
|
|
857
|
+
return this;
|
|
858
|
+
}
|
|
859
|
+
/**
|
|
860
|
+
* Set language detection configuration.
|
|
861
|
+
*/
|
|
862
|
+
withLanguageDetection(languageDetection) {
|
|
863
|
+
this.config["languageDetection"] = languageDetection;
|
|
864
|
+
return this;
|
|
865
|
+
}
|
|
866
|
+
/**
|
|
867
|
+
* Set whether to enable metadata extraction.
|
|
868
|
+
*/
|
|
869
|
+
withMetadataExtraction(enabled) {
|
|
870
|
+
this.config["metadataExtraction"] = enabled;
|
|
871
|
+
return this;
|
|
872
|
+
}
|
|
873
|
+
/**
|
|
874
|
+
* Set whether to enable quality mode.
|
|
875
|
+
*/
|
|
876
|
+
withQualityMode(enabled) {
|
|
877
|
+
this.config["qualityMode"] = enabled;
|
|
878
|
+
return this;
|
|
879
|
+
}
|
|
880
|
+
/**
|
|
881
|
+
* Build and return the final ExtractionConfig object.
|
|
882
|
+
*/
|
|
883
|
+
build() {
|
|
884
|
+
return this.config;
|
|
885
|
+
}
|
|
886
|
+
}
|
|
772
887
|
const ExtractionConfig = {
|
|
888
|
+
/**
|
|
889
|
+
* Create a default extraction configuration using the builder pattern.
|
|
890
|
+
*
|
|
891
|
+
* Returns a builder object that allows you to configure extraction settings
|
|
892
|
+
* using method chaining.
|
|
893
|
+
*
|
|
894
|
+
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
895
|
+
*
|
|
896
|
+
* @example
|
|
897
|
+
* ```typescript
|
|
898
|
+
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
899
|
+
*
|
|
900
|
+
* const config = ExtractionConfig.default()
|
|
901
|
+
* .withChunking({ maxChars: 2048 })
|
|
902
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
903
|
+
* .build();
|
|
904
|
+
* ```
|
|
905
|
+
*/
|
|
906
|
+
default() {
|
|
907
|
+
return ExtractionConfigBuilder.default();
|
|
908
|
+
},
|
|
773
909
|
/**
|
|
774
910
|
* Load extraction configuration from a file.
|
|
775
911
|
*
|
|
@@ -835,9 +971,9 @@ function detectMimeType(bytes) {
|
|
|
835
971
|
const binding2 = getBinding();
|
|
836
972
|
return binding2.detectMimeTypeFromBytes(bytes);
|
|
837
973
|
}
|
|
838
|
-
function detectMimeTypeFromPath(
|
|
974
|
+
function detectMimeTypeFromPath(filePath, checkExists) {
|
|
839
975
|
const binding2 = getBinding();
|
|
840
|
-
return binding2.detectMimeTypeFromPath(
|
|
976
|
+
return binding2.detectMimeTypeFromPath(filePath, checkExists);
|
|
841
977
|
}
|
|
842
978
|
function validateMimeType(mimeType) {
|
|
843
979
|
const binding2 = getBinding();
|
|
@@ -865,7 +1001,20 @@ function getLastPanicContext() {
|
|
|
865
1001
|
const result = binding2.getLastPanicContext();
|
|
866
1002
|
return result;
|
|
867
1003
|
}
|
|
868
|
-
|
|
1004
|
+
function getErrorCodeName(code) {
|
|
1005
|
+
const binding2 = getBinding();
|
|
1006
|
+
return binding2.getErrorCodeName(code);
|
|
1007
|
+
}
|
|
1008
|
+
function getErrorCodeDescription(code) {
|
|
1009
|
+
const binding2 = getBinding();
|
|
1010
|
+
return binding2.getErrorCodeDescription(code);
|
|
1011
|
+
}
|
|
1012
|
+
function classifyError(errorMessage) {
|
|
1013
|
+
const binding2 = getBinding();
|
|
1014
|
+
const result = binding2.classifyError(errorMessage);
|
|
1015
|
+
return result;
|
|
1016
|
+
}
|
|
1017
|
+
const __version__ = "4.0.0-rc.17";
|
|
869
1018
|
// Annotate the CommonJS export names for ESM import in node:
|
|
870
1019
|
0 && (module.exports = {
|
|
871
1020
|
CacheError,
|
|
@@ -886,6 +1035,7 @@ const __version__ = "4.0.0-rc.15";
|
|
|
886
1035
|
batchExtractBytesSync,
|
|
887
1036
|
batchExtractFiles,
|
|
888
1037
|
batchExtractFilesSync,
|
|
1038
|
+
classifyError,
|
|
889
1039
|
clearDocumentExtractors,
|
|
890
1040
|
clearOcrBackends,
|
|
891
1041
|
clearPostProcessors,
|
|
@@ -897,6 +1047,8 @@ const __version__ = "4.0.0-rc.15";
|
|
|
897
1047
|
extractFile,
|
|
898
1048
|
extractFileSync,
|
|
899
1049
|
getEmbeddingPreset,
|
|
1050
|
+
getErrorCodeDescription,
|
|
1051
|
+
getErrorCodeName,
|
|
900
1052
|
getExtensionsForMime,
|
|
901
1053
|
getLastErrorCode,
|
|
902
1054
|
getLastPanicContext,
|