@kreuzberg/node 4.2.15 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -11
- package/dist/errors.d.mts +2 -3
- package/dist/errors.d.ts +2 -3
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +5 -14
- package/dist/index.d.ts +5 -14
- package/dist/index.js +19 -215
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +19 -204
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +137 -11
- package/dist/types.d.ts +137 -11
- package/dist/types.js.map +1 -1
- package/index.d.ts +27 -0
- package/index.js +52 -52
- package/package.json +11 -9
- package/dist/ocr/guten-ocr.d.mts +0 -193
- package/dist/ocr/guten-ocr.d.ts +0 -193
- package/dist/ocr/guten-ocr.js +0 -234
- package/dist/ocr/guten-ocr.js.map +0 -1
- package/dist/ocr/guten-ocr.mjs +0 -199
- package/dist/ocr/guten-ocr.mjs.map +0 -1
package/dist/index.mjs
CHANGED
|
@@ -281,6 +281,8 @@ function normalizeOcrConfig(ocr) {
|
|
|
281
281
|
if (tesseract) {
|
|
282
282
|
setIfDefined(normalized, "tesseractConfig", tesseract);
|
|
283
283
|
}
|
|
284
|
+
setIfDefined(normalized, "paddleOcrConfig", ocr.paddleOcrConfig);
|
|
285
|
+
setIfDefined(normalized, "elementConfig", ocr.elementConfig);
|
|
284
286
|
return normalized;
|
|
285
287
|
}
|
|
286
288
|
function normalizeChunkingConfig(chunking) {
|
|
@@ -429,6 +431,7 @@ function normalizeExtractionConfig(config) {
|
|
|
429
431
|
setIfDefined(normalized, "useCache", config.useCache);
|
|
430
432
|
setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
|
|
431
433
|
setIfDefined(normalized, "forceOcr", config.forceOcr);
|
|
434
|
+
setIfDefined(normalized, "includeDocumentStructure", config.includeDocumentStructure);
|
|
432
435
|
setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
|
|
433
436
|
const ocr = normalizeOcrConfig(config.ocr);
|
|
434
437
|
setIfDefined(normalized, "ocr", ocr);
|
|
@@ -610,7 +613,9 @@ function convertPageContent(rawPage) {
|
|
|
610
613
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
611
614
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
612
615
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
613
|
-
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
|
|
616
|
+
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
617
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
618
|
+
isBlank: page["isBlank"] ?? null
|
|
614
619
|
};
|
|
615
620
|
}
|
|
616
621
|
function convertResult(rawResult) {
|
|
@@ -624,7 +629,8 @@ function convertResult(rawResult) {
|
|
|
624
629
|
chunks: null,
|
|
625
630
|
images: null,
|
|
626
631
|
elements: null,
|
|
627
|
-
pages: null
|
|
632
|
+
pages: null,
|
|
633
|
+
document: null
|
|
628
634
|
};
|
|
629
635
|
}
|
|
630
636
|
const result = rawResult;
|
|
@@ -643,7 +649,9 @@ function convertResult(rawResult) {
|
|
|
643
649
|
chunks: null,
|
|
644
650
|
images: null,
|
|
645
651
|
elements: null,
|
|
646
|
-
pages: null
|
|
652
|
+
pages: null,
|
|
653
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
654
|
+
document: result["document"] ?? null
|
|
647
655
|
};
|
|
648
656
|
const chunksData = result["chunks"];
|
|
649
657
|
if (Array.isArray(chunksData)) {
|
|
@@ -661,6 +669,10 @@ function convertResult(rawResult) {
|
|
|
661
669
|
if (Array.isArray(pagesData)) {
|
|
662
670
|
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
663
671
|
}
|
|
672
|
+
const ocrElementsData = result["ocrElements"];
|
|
673
|
+
if (Array.isArray(ocrElementsData)) {
|
|
674
|
+
returnObj.ocrElements = ocrElementsData;
|
|
675
|
+
}
|
|
664
676
|
return returnObj;
|
|
665
677
|
}
|
|
666
678
|
|
|
@@ -750,7 +762,7 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
|
750
762
|
data = dataOrPath;
|
|
751
763
|
}
|
|
752
764
|
const validated = assertUint8Array(data, "data");
|
|
753
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
765
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
754
766
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
755
767
|
}
|
|
756
768
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
@@ -903,202 +915,6 @@ function listValidators() {
|
|
|
903
915
|
return binding2.listValidators();
|
|
904
916
|
}
|
|
905
917
|
|
|
906
|
-
// typescript/ocr/guten-ocr.ts
|
|
907
|
-
var GutenOcrBackend = class {
|
|
908
|
-
ocr = null;
|
|
909
|
-
ocrModule = null;
|
|
910
|
-
options;
|
|
911
|
-
/**
|
|
912
|
-
* Create a new Guten OCR backend.
|
|
913
|
-
*
|
|
914
|
-
* @param options - Optional configuration for Guten OCR
|
|
915
|
-
* @param options.models - Custom model paths (default: uses bundled models)
|
|
916
|
-
* @param options.isDebug - Enable debug mode (default: false)
|
|
917
|
-
* @param options.debugOutputDir - Directory for debug output (default: undefined)
|
|
918
|
-
* @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
|
|
919
|
-
*
|
|
920
|
-
* @example
|
|
921
|
-
* ```typescript
|
|
922
|
-
* // Default configuration
|
|
923
|
-
* const backend = new GutenOcrBackend();
|
|
924
|
-
*
|
|
925
|
-
* // With debug enabled
|
|
926
|
-
* const debugBackend = new GutenOcrBackend({
|
|
927
|
-
* isDebug: true,
|
|
928
|
-
* debugOutputDir: './ocr_debug'
|
|
929
|
-
* });
|
|
930
|
-
* ```
|
|
931
|
-
*/
|
|
932
|
-
constructor(options) {
|
|
933
|
-
if (options !== void 0) {
|
|
934
|
-
this.options = options;
|
|
935
|
-
}
|
|
936
|
-
}
|
|
937
|
-
/**
|
|
938
|
-
* Get the backend name.
|
|
939
|
-
*
|
|
940
|
-
* @returns Backend name ("guten-ocr")
|
|
941
|
-
*/
|
|
942
|
-
name() {
|
|
943
|
-
return "guten-ocr";
|
|
944
|
-
}
|
|
945
|
-
/**
|
|
946
|
-
* Get list of supported language codes.
|
|
947
|
-
*
|
|
948
|
-
* Guten OCR supports multiple languages depending on the model configuration.
|
|
949
|
-
* The default models support English and Chinese.
|
|
950
|
-
*
|
|
951
|
-
* @returns Array of ISO 639-1/2 language codes
|
|
952
|
-
*/
|
|
953
|
-
supportedLanguages() {
|
|
954
|
-
return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
|
|
955
|
-
}
|
|
956
|
-
/**
|
|
957
|
-
* Initialize the OCR backend.
|
|
958
|
-
*
|
|
959
|
-
* This method loads the Guten OCR module and creates an OCR instance.
|
|
960
|
-
* Call this before using processImage().
|
|
961
|
-
*
|
|
962
|
-
* @throws {Error} If @gutenye/ocr-node is not installed
|
|
963
|
-
* @throws {Error} If OCR initialization fails
|
|
964
|
-
*
|
|
965
|
-
* @example
|
|
966
|
-
* ```typescript
|
|
967
|
-
* const backend = new GutenOcrBackend();
|
|
968
|
-
* await backend.initialize();
|
|
969
|
-
* ```
|
|
970
|
-
*/
|
|
971
|
-
async initialize() {
|
|
972
|
-
if (this.ocr !== null) {
|
|
973
|
-
return;
|
|
974
|
-
}
|
|
975
|
-
try {
|
|
976
|
-
this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
|
|
977
|
-
} catch (e) {
|
|
978
|
-
const error = e;
|
|
979
|
-
throw new Error(
|
|
980
|
-
`Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
|
|
981
|
-
);
|
|
982
|
-
}
|
|
983
|
-
try {
|
|
984
|
-
this.ocr = await this.ocrModule?.create(this.options) ?? null;
|
|
985
|
-
} catch (e) {
|
|
986
|
-
const error = e;
|
|
987
|
-
throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
|
|
988
|
-
}
|
|
989
|
-
}
|
|
990
|
-
/**
|
|
991
|
-
* Shutdown the backend and release resources.
|
|
992
|
-
*
|
|
993
|
-
* This method cleans up all resources associated with the backend,
|
|
994
|
-
* including the GutenOCR instance and module references.
|
|
995
|
-
*
|
|
996
|
-
* @example
|
|
997
|
-
* ```typescript
|
|
998
|
-
* const backend = new GutenOcrBackend();
|
|
999
|
-
* await backend.initialize();
|
|
1000
|
-
* // ... use backend ...
|
|
1001
|
-
* await backend.shutdown();
|
|
1002
|
-
* ```
|
|
1003
|
-
*/
|
|
1004
|
-
async shutdown() {
|
|
1005
|
-
if (this.ocr !== null) {
|
|
1006
|
-
this.ocr = null;
|
|
1007
|
-
}
|
|
1008
|
-
if (this.ocrModule !== null) {
|
|
1009
|
-
this.ocrModule = null;
|
|
1010
|
-
}
|
|
1011
|
-
}
|
|
1012
|
-
/**
|
|
1013
|
-
* Process image bytes and extract text using Guten OCR.
|
|
1014
|
-
*
|
|
1015
|
-
* This method:
|
|
1016
|
-
* 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
|
|
1017
|
-
* 2. Runs OCR detection to find text regions
|
|
1018
|
-
* 3. Runs OCR recognition on each text region
|
|
1019
|
-
* 4. Returns extracted text with metadata
|
|
1020
|
-
*
|
|
1021
|
-
* @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
|
|
1022
|
-
* @param language - Language code (must be in supportedLanguages())
|
|
1023
|
-
* @returns Promise resolving to OCR result with content and metadata
|
|
1024
|
-
*
|
|
1025
|
-
* @throws {Error} If backend is not initialized
|
|
1026
|
-
* @throws {Error} If OCR processing fails
|
|
1027
|
-
*
|
|
1028
|
-
* @example
|
|
1029
|
-
* ```typescript
|
|
1030
|
-
* import { readFile } from 'fs/promises';
|
|
1031
|
-
*
|
|
1032
|
-
* const backend = new GutenOcrBackend();
|
|
1033
|
-
* await backend.initialize();
|
|
1034
|
-
*
|
|
1035
|
-
* const imageBytes = await readFile('scanned.png');
|
|
1036
|
-
* const result = await backend.processImage(imageBytes, 'en');
|
|
1037
|
-
* console.log(result.content);
|
|
1038
|
-
* console.log(result.metadata.confidence);
|
|
1039
|
-
* ```
|
|
1040
|
-
*/
|
|
1041
|
-
async processImage(imageBytes, language) {
|
|
1042
|
-
if (this.ocr === null) {
|
|
1043
|
-
await this.initialize();
|
|
1044
|
-
}
|
|
1045
|
-
if (this.ocr === null) {
|
|
1046
|
-
throw new Error("Guten OCR backend failed to initialize");
|
|
1047
|
-
}
|
|
1048
|
-
try {
|
|
1049
|
-
const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
|
|
1050
|
-
const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
|
|
1051
|
-
if (debugEnv === "1") {
|
|
1052
|
-
const header = Array.from(buffer.subarray(0, 8));
|
|
1053
|
-
console.log("[Guten OCR] Debug input header:", header);
|
|
1054
|
-
console.log(
|
|
1055
|
-
"[Guten OCR] Buffer?",
|
|
1056
|
-
Buffer.isBuffer(buffer),
|
|
1057
|
-
"constructor",
|
|
1058
|
-
imageBytes?.constructor?.name,
|
|
1059
|
-
"length",
|
|
1060
|
-
buffer.length,
|
|
1061
|
-
"type",
|
|
1062
|
-
typeof imageBytes
|
|
1063
|
-
);
|
|
1064
|
-
}
|
|
1065
|
-
let width = 0;
|
|
1066
|
-
let height = 0;
|
|
1067
|
-
try {
|
|
1068
|
-
const sharpModule = await import("sharp");
|
|
1069
|
-
const sharp = sharpModule.default || sharpModule;
|
|
1070
|
-
const image = sharp(buffer);
|
|
1071
|
-
const metadata = await image.metadata();
|
|
1072
|
-
const metadataRecord = metadata;
|
|
1073
|
-
width = metadataRecord["width"] ?? 0;
|
|
1074
|
-
height = metadataRecord["height"] ?? 0;
|
|
1075
|
-
} catch (metadataError) {
|
|
1076
|
-
const error = metadataError;
|
|
1077
|
-
console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
|
|
1078
|
-
}
|
|
1079
|
-
const result = await this.ocr.detect(buffer);
|
|
1080
|
-
const textLines = result.map((line) => line.text);
|
|
1081
|
-
const content = textLines.join("\n");
|
|
1082
|
-
const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
|
|
1083
|
-
return {
|
|
1084
|
-
content,
|
|
1085
|
-
mime_type: "text/plain",
|
|
1086
|
-
metadata: {
|
|
1087
|
-
width,
|
|
1088
|
-
height,
|
|
1089
|
-
confidence: avgConfidence,
|
|
1090
|
-
text_regions: result.length,
|
|
1091
|
-
language
|
|
1092
|
-
},
|
|
1093
|
-
tables: []
|
|
1094
|
-
};
|
|
1095
|
-
} catch (e) {
|
|
1096
|
-
const error = e;
|
|
1097
|
-
throw new Error(`Guten OCR processing failed: ${error.message}`);
|
|
1098
|
-
}
|
|
1099
|
-
}
|
|
1100
|
-
};
|
|
1101
|
-
|
|
1102
918
|
// typescript/plugins/ocr-backends.ts
|
|
1103
919
|
function isOcrProcessTuple(value) {
|
|
1104
920
|
return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
|
|
@@ -1119,7 +935,7 @@ function registerOcrBackend(backend) {
|
|
|
1119
935
|
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
1120
936
|
async processImage(...processArgs) {
|
|
1121
937
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
1122
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
938
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
1123
939
|
console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
|
|
1124
940
|
console.log("[registerOcrBackend] Raw args", {
|
|
1125
941
|
imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
|
|
@@ -1139,7 +955,7 @@ function registerOcrBackend(backend) {
|
|
|
1139
955
|
if (typeof language !== "string") {
|
|
1140
956
|
throw new Error("OCR backend did not receive a language parameter");
|
|
1141
957
|
}
|
|
1142
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
958
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
1143
959
|
const length = typeof rawBytes === "string" ? rawBytes.length : rawBytes.length;
|
|
1144
960
|
console.log(
|
|
1145
961
|
"[registerOcrBackend] Received payload",
|
|
@@ -1288,12 +1104,11 @@ function getEmbeddingPreset(name) {
|
|
|
1288
1104
|
}
|
|
1289
1105
|
|
|
1290
1106
|
// typescript/index.ts
|
|
1291
|
-
var __version__ = "4.2.15";
|
|
1107
|
+
var __version__ = "4.3.0";
|
|
1292
1108
|
export {
|
|
1293
1109
|
CacheError,
|
|
1294
1110
|
ErrorCode,
|
|
1295
1111
|
ExtractionConfig,
|
|
1296
|
-
GutenOcrBackend,
|
|
1297
1112
|
ImageProcessingError,
|
|
1298
1113
|
KreuzbergError,
|
|
1299
1114
|
MissingDependencyError,
|