@kreuzberg/node 4.2.15 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -11
- package/dist/errors.d.mts +2 -3
- package/dist/errors.d.ts +2 -3
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +5 -14
- package/dist/index.d.ts +5 -14
- package/dist/index.js +19 -215
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +19 -204
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +137 -11
- package/dist/types.d.ts +137 -11
- package/dist/types.js.map +1 -1
- package/index.d.ts +27 -0
- package/index.js +52 -52
- package/package.json +11 -9
- package/dist/ocr/guten-ocr.d.mts +0 -193
- package/dist/ocr/guten-ocr.d.ts +0 -193
- package/dist/ocr/guten-ocr.js +0 -234
- package/dist/ocr/guten-ocr.js.map +0 -1
- package/dist/ocr/guten-ocr.mjs +0 -199
- package/dist/ocr/guten-ocr.mjs.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __create = Object.create;
|
|
3
2
|
var __defProp = Object.defineProperty;
|
|
4
3
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
4
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
7
5
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
6
|
var __export = (target, all) => {
|
|
9
7
|
for (var name in all)
|
|
@@ -17,14 +15,6 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
17
15
|
}
|
|
18
16
|
return to;
|
|
19
17
|
};
|
|
20
|
-
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
-
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
-
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
-
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
-
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
-
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
-
mod
|
|
27
|
-
));
|
|
28
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
19
|
|
|
30
20
|
// typescript/index.ts
|
|
@@ -33,7 +23,6 @@ __export(index_exports, {
|
|
|
33
23
|
CacheError: () => CacheError,
|
|
34
24
|
ErrorCode: () => ErrorCode,
|
|
35
25
|
ExtractionConfig: () => ExtractionConfig,
|
|
36
|
-
GutenOcrBackend: () => GutenOcrBackend,
|
|
37
26
|
ImageProcessingError: () => ImageProcessingError,
|
|
38
27
|
KreuzbergError: () => KreuzbergError,
|
|
39
28
|
MissingDependencyError: () => MissingDependencyError,
|
|
@@ -365,6 +354,8 @@ function normalizeOcrConfig(ocr) {
|
|
|
365
354
|
if (tesseract) {
|
|
366
355
|
setIfDefined(normalized, "tesseractConfig", tesseract);
|
|
367
356
|
}
|
|
357
|
+
setIfDefined(normalized, "paddleOcrConfig", ocr.paddleOcrConfig);
|
|
358
|
+
setIfDefined(normalized, "elementConfig", ocr.elementConfig);
|
|
368
359
|
return normalized;
|
|
369
360
|
}
|
|
370
361
|
function normalizeChunkingConfig(chunking) {
|
|
@@ -513,6 +504,7 @@ function normalizeExtractionConfig(config) {
|
|
|
513
504
|
setIfDefined(normalized, "useCache", config.useCache);
|
|
514
505
|
setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
|
|
515
506
|
setIfDefined(normalized, "forceOcr", config.forceOcr);
|
|
507
|
+
setIfDefined(normalized, "includeDocumentStructure", config.includeDocumentStructure);
|
|
516
508
|
setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
|
|
517
509
|
const ocr = normalizeOcrConfig(config.ocr);
|
|
518
510
|
setIfDefined(normalized, "ocr", ocr);
|
|
@@ -694,7 +686,9 @@ function convertPageContent(rawPage) {
|
|
|
694
686
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
695
687
|
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
696
688
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
697
|
-
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
|
|
689
|
+
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
|
|
690
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
691
|
+
isBlank: page["isBlank"] ?? null
|
|
698
692
|
};
|
|
699
693
|
}
|
|
700
694
|
function convertResult(rawResult) {
|
|
@@ -708,7 +702,8 @@ function convertResult(rawResult) {
|
|
|
708
702
|
chunks: null,
|
|
709
703
|
images: null,
|
|
710
704
|
elements: null,
|
|
711
|
-
pages: null
|
|
705
|
+
pages: null,
|
|
706
|
+
document: null
|
|
712
707
|
};
|
|
713
708
|
}
|
|
714
709
|
const result = rawResult;
|
|
@@ -727,7 +722,9 @@ function convertResult(rawResult) {
|
|
|
727
722
|
chunks: null,
|
|
728
723
|
images: null,
|
|
729
724
|
elements: null,
|
|
730
|
-
pages: null
|
|
725
|
+
pages: null,
|
|
726
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
727
|
+
document: result["document"] ?? null
|
|
731
728
|
};
|
|
732
729
|
const chunksData = result["chunks"];
|
|
733
730
|
if (Array.isArray(chunksData)) {
|
|
@@ -745,6 +742,10 @@ function convertResult(rawResult) {
|
|
|
745
742
|
if (Array.isArray(pagesData)) {
|
|
746
743
|
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
747
744
|
}
|
|
745
|
+
const ocrElementsData = result["ocrElements"];
|
|
746
|
+
if (Array.isArray(ocrElementsData)) {
|
|
747
|
+
returnObj.ocrElements = ocrElementsData;
|
|
748
|
+
}
|
|
748
749
|
return returnObj;
|
|
749
750
|
}
|
|
750
751
|
|
|
@@ -834,7 +835,7 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
|
834
835
|
data = dataOrPath;
|
|
835
836
|
}
|
|
836
837
|
const validated = assertUint8Array(data, "data");
|
|
837
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
838
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
838
839
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
839
840
|
}
|
|
840
841
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
@@ -987,202 +988,6 @@ function listValidators() {
|
|
|
987
988
|
return binding2.listValidators();
|
|
988
989
|
}
|
|
989
990
|
|
|
990
|
-
// typescript/ocr/guten-ocr.ts
|
|
991
|
-
var GutenOcrBackend = class {
|
|
992
|
-
ocr = null;
|
|
993
|
-
ocrModule = null;
|
|
994
|
-
options;
|
|
995
|
-
/**
|
|
996
|
-
* Create a new Guten OCR backend.
|
|
997
|
-
*
|
|
998
|
-
* @param options - Optional configuration for Guten OCR
|
|
999
|
-
* @param options.models - Custom model paths (default: uses bundled models)
|
|
1000
|
-
* @param options.isDebug - Enable debug mode (default: false)
|
|
1001
|
-
* @param options.debugOutputDir - Directory for debug output (default: undefined)
|
|
1002
|
-
* @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
|
|
1003
|
-
*
|
|
1004
|
-
* @example
|
|
1005
|
-
* ```typescript
|
|
1006
|
-
* // Default configuration
|
|
1007
|
-
* const backend = new GutenOcrBackend();
|
|
1008
|
-
*
|
|
1009
|
-
* // With debug enabled
|
|
1010
|
-
* const debugBackend = new GutenOcrBackend({
|
|
1011
|
-
* isDebug: true,
|
|
1012
|
-
* debugOutputDir: './ocr_debug'
|
|
1013
|
-
* });
|
|
1014
|
-
* ```
|
|
1015
|
-
*/
|
|
1016
|
-
constructor(options) {
|
|
1017
|
-
if (options !== void 0) {
|
|
1018
|
-
this.options = options;
|
|
1019
|
-
}
|
|
1020
|
-
}
|
|
1021
|
-
/**
|
|
1022
|
-
* Get the backend name.
|
|
1023
|
-
*
|
|
1024
|
-
* @returns Backend name ("guten-ocr")
|
|
1025
|
-
*/
|
|
1026
|
-
name() {
|
|
1027
|
-
return "guten-ocr";
|
|
1028
|
-
}
|
|
1029
|
-
/**
|
|
1030
|
-
* Get list of supported language codes.
|
|
1031
|
-
*
|
|
1032
|
-
* Guten OCR supports multiple languages depending on the model configuration.
|
|
1033
|
-
* The default models support English and Chinese.
|
|
1034
|
-
*
|
|
1035
|
-
* @returns Array of ISO 639-1/2 language codes
|
|
1036
|
-
*/
|
|
1037
|
-
supportedLanguages() {
|
|
1038
|
-
return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
|
|
1039
|
-
}
|
|
1040
|
-
/**
|
|
1041
|
-
* Initialize the OCR backend.
|
|
1042
|
-
*
|
|
1043
|
-
* This method loads the Guten OCR module and creates an OCR instance.
|
|
1044
|
-
* Call this before using processImage().
|
|
1045
|
-
*
|
|
1046
|
-
* @throws {Error} If @gutenye/ocr-node is not installed
|
|
1047
|
-
* @throws {Error} If OCR initialization fails
|
|
1048
|
-
*
|
|
1049
|
-
* @example
|
|
1050
|
-
* ```typescript
|
|
1051
|
-
* const backend = new GutenOcrBackend();
|
|
1052
|
-
* await backend.initialize();
|
|
1053
|
-
* ```
|
|
1054
|
-
*/
|
|
1055
|
-
async initialize() {
|
|
1056
|
-
if (this.ocr !== null) {
|
|
1057
|
-
return;
|
|
1058
|
-
}
|
|
1059
|
-
try {
|
|
1060
|
-
this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
|
|
1061
|
-
} catch (e) {
|
|
1062
|
-
const error = e;
|
|
1063
|
-
throw new Error(
|
|
1064
|
-
`Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
|
|
1065
|
-
);
|
|
1066
|
-
}
|
|
1067
|
-
try {
|
|
1068
|
-
this.ocr = await this.ocrModule?.create(this.options) ?? null;
|
|
1069
|
-
} catch (e) {
|
|
1070
|
-
const error = e;
|
|
1071
|
-
throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
|
|
1072
|
-
}
|
|
1073
|
-
}
|
|
1074
|
-
/**
|
|
1075
|
-
* Shutdown the backend and release resources.
|
|
1076
|
-
*
|
|
1077
|
-
* This method cleans up all resources associated with the backend,
|
|
1078
|
-
* including the GutenOCR instance and module references.
|
|
1079
|
-
*
|
|
1080
|
-
* @example
|
|
1081
|
-
* ```typescript
|
|
1082
|
-
* const backend = new GutenOcrBackend();
|
|
1083
|
-
* await backend.initialize();
|
|
1084
|
-
* // ... use backend ...
|
|
1085
|
-
* await backend.shutdown();
|
|
1086
|
-
* ```
|
|
1087
|
-
*/
|
|
1088
|
-
async shutdown() {
|
|
1089
|
-
if (this.ocr !== null) {
|
|
1090
|
-
this.ocr = null;
|
|
1091
|
-
}
|
|
1092
|
-
if (this.ocrModule !== null) {
|
|
1093
|
-
this.ocrModule = null;
|
|
1094
|
-
}
|
|
1095
|
-
}
|
|
1096
|
-
/**
|
|
1097
|
-
* Process image bytes and extract text using Guten OCR.
|
|
1098
|
-
*
|
|
1099
|
-
* This method:
|
|
1100
|
-
* 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
|
|
1101
|
-
* 2. Runs OCR detection to find text regions
|
|
1102
|
-
* 3. Runs OCR recognition on each text region
|
|
1103
|
-
* 4. Returns extracted text with metadata
|
|
1104
|
-
*
|
|
1105
|
-
* @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
|
|
1106
|
-
* @param language - Language code (must be in supportedLanguages())
|
|
1107
|
-
* @returns Promise resolving to OCR result with content and metadata
|
|
1108
|
-
*
|
|
1109
|
-
* @throws {Error} If backend is not initialized
|
|
1110
|
-
* @throws {Error} If OCR processing fails
|
|
1111
|
-
*
|
|
1112
|
-
* @example
|
|
1113
|
-
* ```typescript
|
|
1114
|
-
* import { readFile } from 'fs/promises';
|
|
1115
|
-
*
|
|
1116
|
-
* const backend = new GutenOcrBackend();
|
|
1117
|
-
* await backend.initialize();
|
|
1118
|
-
*
|
|
1119
|
-
* const imageBytes = await readFile('scanned.png');
|
|
1120
|
-
* const result = await backend.processImage(imageBytes, 'en');
|
|
1121
|
-
* console.log(result.content);
|
|
1122
|
-
* console.log(result.metadata.confidence);
|
|
1123
|
-
* ```
|
|
1124
|
-
*/
|
|
1125
|
-
async processImage(imageBytes, language) {
|
|
1126
|
-
if (this.ocr === null) {
|
|
1127
|
-
await this.initialize();
|
|
1128
|
-
}
|
|
1129
|
-
if (this.ocr === null) {
|
|
1130
|
-
throw new Error("Guten OCR backend failed to initialize");
|
|
1131
|
-
}
|
|
1132
|
-
try {
|
|
1133
|
-
const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
|
|
1134
|
-
const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
|
|
1135
|
-
if (debugEnv === "1") {
|
|
1136
|
-
const header = Array.from(buffer.subarray(0, 8));
|
|
1137
|
-
console.log("[Guten OCR] Debug input header:", header);
|
|
1138
|
-
console.log(
|
|
1139
|
-
"[Guten OCR] Buffer?",
|
|
1140
|
-
Buffer.isBuffer(buffer),
|
|
1141
|
-
"constructor",
|
|
1142
|
-
imageBytes?.constructor?.name,
|
|
1143
|
-
"length",
|
|
1144
|
-
buffer.length,
|
|
1145
|
-
"type",
|
|
1146
|
-
typeof imageBytes
|
|
1147
|
-
);
|
|
1148
|
-
}
|
|
1149
|
-
let width = 0;
|
|
1150
|
-
let height = 0;
|
|
1151
|
-
try {
|
|
1152
|
-
const sharpModule = await import("sharp");
|
|
1153
|
-
const sharp = sharpModule.default || sharpModule;
|
|
1154
|
-
const image = sharp(buffer);
|
|
1155
|
-
const metadata = await image.metadata();
|
|
1156
|
-
const metadataRecord = metadata;
|
|
1157
|
-
width = metadataRecord["width"] ?? 0;
|
|
1158
|
-
height = metadataRecord["height"] ?? 0;
|
|
1159
|
-
} catch (metadataError) {
|
|
1160
|
-
const error = metadataError;
|
|
1161
|
-
console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
|
|
1162
|
-
}
|
|
1163
|
-
const result = await this.ocr.detect(buffer);
|
|
1164
|
-
const textLines = result.map((line) => line.text);
|
|
1165
|
-
const content = textLines.join("\n");
|
|
1166
|
-
const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
|
|
1167
|
-
return {
|
|
1168
|
-
content,
|
|
1169
|
-
mime_type: "text/plain",
|
|
1170
|
-
metadata: {
|
|
1171
|
-
width,
|
|
1172
|
-
height,
|
|
1173
|
-
confidence: avgConfidence,
|
|
1174
|
-
text_regions: result.length,
|
|
1175
|
-
language
|
|
1176
|
-
},
|
|
1177
|
-
tables: []
|
|
1178
|
-
};
|
|
1179
|
-
} catch (e) {
|
|
1180
|
-
const error = e;
|
|
1181
|
-
throw new Error(`Guten OCR processing failed: ${error.message}`);
|
|
1182
|
-
}
|
|
1183
|
-
}
|
|
1184
|
-
};
|
|
1185
|
-
|
|
1186
991
|
// typescript/plugins/ocr-backends.ts
|
|
1187
992
|
function isOcrProcessTuple(value) {
|
|
1188
993
|
return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
|
|
@@ -1203,7 +1008,7 @@ function registerOcrBackend(backend) {
|
|
|
1203
1008
|
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
1204
1009
|
async processImage(...processArgs) {
|
|
1205
1010
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
1206
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
1011
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
1207
1012
|
console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
|
|
1208
1013
|
console.log("[registerOcrBackend] Raw args", {
|
|
1209
1014
|
imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
|
|
@@ -1223,7 +1028,7 @@ function registerOcrBackend(backend) {
|
|
|
1223
1028
|
if (typeof language !== "string") {
|
|
1224
1029
|
throw new Error("OCR backend did not receive a language parameter");
|
|
1225
1030
|
}
|
|
1226
|
-
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
1031
|
+
if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
|
|
1227
1032
|
const length = typeof rawBytes === "string" ? rawBytes.length : rawBytes.length;
|
|
1228
1033
|
console.log(
|
|
1229
1034
|
"[registerOcrBackend] Received payload",
|
|
@@ -1372,13 +1177,12 @@ function getEmbeddingPreset(name) {
|
|
|
1372
1177
|
}
|
|
1373
1178
|
|
|
1374
1179
|
// typescript/index.ts
|
|
1375
|
-
var __version__ = "4.2.15";
|
|
1180
|
+
var __version__ = "4.3.1";
|
|
1376
1181
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1377
1182
|
0 && (module.exports = {
|
|
1378
1183
|
CacheError,
|
|
1379
1184
|
ErrorCode,
|
|
1380
1185
|
ExtractionConfig,
|
|
1381
|
-
GutenOcrBackend,
|
|
1382
1186
|
ImageProcessingError,
|
|
1383
1187
|
KreuzbergError,
|
|
1384
1188
|
MissingDependencyError,
|