@kreuzberg/node 4.2.15 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,9 +1,7 @@
1
1
  "use strict";
2
- var __create = Object.create;
3
2
  var __defProp = Object.defineProperty;
4
3
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
4
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
5
  var __hasOwnProp = Object.prototype.hasOwnProperty;
8
6
  var __export = (target, all) => {
9
7
  for (var name in all)
@@ -17,14 +15,6 @@ var __copyProps = (to, from, except, desc) => {
17
15
  }
18
16
  return to;
19
17
  };
20
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
- // If the importer is in node compatibility mode or this is not an ESM
22
- // file that has been converted to a CommonJS file using a Babel-
23
- // compatible transform (i.e. "__esModule" has not been set), then set
24
- // "default" to the CommonJS "module.exports" for node compatibility.
25
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
- mod
27
- ));
28
18
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
19
 
30
20
  // typescript/index.ts
@@ -33,7 +23,6 @@ __export(index_exports, {
33
23
  CacheError: () => CacheError,
34
24
  ErrorCode: () => ErrorCode,
35
25
  ExtractionConfig: () => ExtractionConfig,
36
- GutenOcrBackend: () => GutenOcrBackend,
37
26
  ImageProcessingError: () => ImageProcessingError,
38
27
  KreuzbergError: () => KreuzbergError,
39
28
  MissingDependencyError: () => MissingDependencyError,
@@ -365,6 +354,8 @@ function normalizeOcrConfig(ocr) {
365
354
  if (tesseract) {
366
355
  setIfDefined(normalized, "tesseractConfig", tesseract);
367
356
  }
357
+ setIfDefined(normalized, "paddleOcrConfig", ocr.paddleOcrConfig);
358
+ setIfDefined(normalized, "elementConfig", ocr.elementConfig);
368
359
  return normalized;
369
360
  }
370
361
  function normalizeChunkingConfig(chunking) {
@@ -513,6 +504,7 @@ function normalizeExtractionConfig(config) {
513
504
  setIfDefined(normalized, "useCache", config.useCache);
514
505
  setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
515
506
  setIfDefined(normalized, "forceOcr", config.forceOcr);
507
+ setIfDefined(normalized, "includeDocumentStructure", config.includeDocumentStructure);
516
508
  setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
517
509
  const ocr = normalizeOcrConfig(config.ocr);
518
510
  setIfDefined(normalized, "ocr", ocr);
@@ -694,7 +686,9 @@ function convertPageContent(rawPage) {
694
686
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
695
687
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
696
688
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
697
- images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
689
+ images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
690
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
691
+ isBlank: page["isBlank"] ?? null
698
692
  };
699
693
  }
700
694
  function convertResult(rawResult) {
@@ -708,7 +702,8 @@ function convertResult(rawResult) {
708
702
  chunks: null,
709
703
  images: null,
710
704
  elements: null,
711
- pages: null
705
+ pages: null,
706
+ document: null
712
707
  };
713
708
  }
714
709
  const result = rawResult;
@@ -727,7 +722,9 @@ function convertResult(rawResult) {
727
722
  chunks: null,
728
723
  images: null,
729
724
  elements: null,
730
- pages: null
725
+ pages: null,
726
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
727
+ document: result["document"] ?? null
731
728
  };
732
729
  const chunksData = result["chunks"];
733
730
  if (Array.isArray(chunksData)) {
@@ -745,6 +742,10 @@ function convertResult(rawResult) {
745
742
  if (Array.isArray(pagesData)) {
746
743
  returnObj.pages = pagesData.map((page) => convertPageContent(page));
747
744
  }
745
+ const ocrElementsData = result["ocrElements"];
746
+ if (Array.isArray(ocrElementsData)) {
747
+ returnObj.ocrElements = ocrElementsData;
748
+ }
748
749
  return returnObj;
749
750
  }
750
751
 
@@ -834,7 +835,7 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
834
835
  data = dataOrPath;
835
836
  }
836
837
  const validated = assertUint8Array(data, "data");
837
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
838
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
838
839
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
839
840
  }
840
841
  const normalizedConfig = normalizeExtractionConfig(config);
@@ -987,202 +988,6 @@ function listValidators() {
987
988
  return binding2.listValidators();
988
989
  }
989
990
 
990
- // typescript/ocr/guten-ocr.ts
991
- var GutenOcrBackend = class {
992
- ocr = null;
993
- ocrModule = null;
994
- options;
995
- /**
996
- * Create a new Guten OCR backend.
997
- *
998
- * @param options - Optional configuration for Guten OCR
999
- * @param options.models - Custom model paths (default: uses bundled models)
1000
- * @param options.isDebug - Enable debug mode (default: false)
1001
- * @param options.debugOutputDir - Directory for debug output (default: undefined)
1002
- * @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
1003
- *
1004
- * @example
1005
- * ```typescript
1006
- * // Default configuration
1007
- * const backend = new GutenOcrBackend();
1008
- *
1009
- * // With debug enabled
1010
- * const debugBackend = new GutenOcrBackend({
1011
- * isDebug: true,
1012
- * debugOutputDir: './ocr_debug'
1013
- * });
1014
- * ```
1015
- */
1016
- constructor(options) {
1017
- if (options !== void 0) {
1018
- this.options = options;
1019
- }
1020
- }
1021
- /**
1022
- * Get the backend name.
1023
- *
1024
- * @returns Backend name ("guten-ocr")
1025
- */
1026
- name() {
1027
- return "guten-ocr";
1028
- }
1029
- /**
1030
- * Get list of supported language codes.
1031
- *
1032
- * Guten OCR supports multiple languages depending on the model configuration.
1033
- * The default models support English and Chinese.
1034
- *
1035
- * @returns Array of ISO 639-1/2 language codes
1036
- */
1037
- supportedLanguages() {
1038
- return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
1039
- }
1040
- /**
1041
- * Initialize the OCR backend.
1042
- *
1043
- * This method loads the Guten OCR module and creates an OCR instance.
1044
- * Call this before using processImage().
1045
- *
1046
- * @throws {Error} If @gutenye/ocr-node is not installed
1047
- * @throws {Error} If OCR initialization fails
1048
- *
1049
- * @example
1050
- * ```typescript
1051
- * const backend = new GutenOcrBackend();
1052
- * await backend.initialize();
1053
- * ```
1054
- */
1055
- async initialize() {
1056
- if (this.ocr !== null) {
1057
- return;
1058
- }
1059
- try {
1060
- this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
1061
- } catch (e) {
1062
- const error = e;
1063
- throw new Error(
1064
- `Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
1065
- );
1066
- }
1067
- try {
1068
- this.ocr = await this.ocrModule?.create(this.options) ?? null;
1069
- } catch (e) {
1070
- const error = e;
1071
- throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
1072
- }
1073
- }
1074
- /**
1075
- * Shutdown the backend and release resources.
1076
- *
1077
- * This method cleans up all resources associated with the backend,
1078
- * including the GutenOCR instance and module references.
1079
- *
1080
- * @example
1081
- * ```typescript
1082
- * const backend = new GutenOcrBackend();
1083
- * await backend.initialize();
1084
- * // ... use backend ...
1085
- * await backend.shutdown();
1086
- * ```
1087
- */
1088
- async shutdown() {
1089
- if (this.ocr !== null) {
1090
- this.ocr = null;
1091
- }
1092
- if (this.ocrModule !== null) {
1093
- this.ocrModule = null;
1094
- }
1095
- }
1096
- /**
1097
- * Process image bytes and extract text using Guten OCR.
1098
- *
1099
- * This method:
1100
- * 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
1101
- * 2. Runs OCR detection to find text regions
1102
- * 3. Runs OCR recognition on each text region
1103
- * 4. Returns extracted text with metadata
1104
- *
1105
- * @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
1106
- * @param language - Language code (must be in supportedLanguages())
1107
- * @returns Promise resolving to OCR result with content and metadata
1108
- *
1109
- * @throws {Error} If backend is not initialized
1110
- * @throws {Error} If OCR processing fails
1111
- *
1112
- * @example
1113
- * ```typescript
1114
- * import { readFile } from 'fs/promises';
1115
- *
1116
- * const backend = new GutenOcrBackend();
1117
- * await backend.initialize();
1118
- *
1119
- * const imageBytes = await readFile('scanned.png');
1120
- * const result = await backend.processImage(imageBytes, 'en');
1121
- * console.log(result.content);
1122
- * console.log(result.metadata.confidence);
1123
- * ```
1124
- */
1125
- async processImage(imageBytes, language) {
1126
- if (this.ocr === null) {
1127
- await this.initialize();
1128
- }
1129
- if (this.ocr === null) {
1130
- throw new Error("Guten OCR backend failed to initialize");
1131
- }
1132
- try {
1133
- const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
1134
- const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
1135
- if (debugEnv === "1") {
1136
- const header = Array.from(buffer.subarray(0, 8));
1137
- console.log("[Guten OCR] Debug input header:", header);
1138
- console.log(
1139
- "[Guten OCR] Buffer?",
1140
- Buffer.isBuffer(buffer),
1141
- "constructor",
1142
- imageBytes?.constructor?.name,
1143
- "length",
1144
- buffer.length,
1145
- "type",
1146
- typeof imageBytes
1147
- );
1148
- }
1149
- let width = 0;
1150
- let height = 0;
1151
- try {
1152
- const sharpModule = await import("sharp");
1153
- const sharp = sharpModule.default || sharpModule;
1154
- const image = sharp(buffer);
1155
- const metadata = await image.metadata();
1156
- const metadataRecord = metadata;
1157
- width = metadataRecord["width"] ?? 0;
1158
- height = metadataRecord["height"] ?? 0;
1159
- } catch (metadataError) {
1160
- const error = metadataError;
1161
- console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
1162
- }
1163
- const result = await this.ocr.detect(buffer);
1164
- const textLines = result.map((line) => line.text);
1165
- const content = textLines.join("\n");
1166
- const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
1167
- return {
1168
- content,
1169
- mime_type: "text/plain",
1170
- metadata: {
1171
- width,
1172
- height,
1173
- confidence: avgConfidence,
1174
- text_regions: result.length,
1175
- language
1176
- },
1177
- tables: []
1178
- };
1179
- } catch (e) {
1180
- const error = e;
1181
- throw new Error(`Guten OCR processing failed: ${error.message}`);
1182
- }
1183
- }
1184
- };
1185
-
1186
991
  // typescript/plugins/ocr-backends.ts
1187
992
  function isOcrProcessTuple(value) {
1188
993
  return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
@@ -1203,7 +1008,7 @@ function registerOcrBackend(backend) {
1203
1008
  supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
1204
1009
  async processImage(...processArgs) {
1205
1010
  const [imagePayload, maybeLanguage] = processArgs;
1206
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
1011
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
1207
1012
  console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
1208
1013
  console.log("[registerOcrBackend] Raw args", {
1209
1014
  imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
@@ -1223,7 +1028,7 @@ function registerOcrBackend(backend) {
1223
1028
  if (typeof language !== "string") {
1224
1029
  throw new Error("OCR backend did not receive a language parameter");
1225
1030
  }
1226
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
1031
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
1227
1032
  const length = typeof rawBytes === "string" ? rawBytes.length : rawBytes.length;
1228
1033
  console.log(
1229
1034
  "[registerOcrBackend] Received payload",
@@ -1372,13 +1177,12 @@ function getEmbeddingPreset(name) {
1372
1177
  }
1373
1178
 
1374
1179
  // typescript/index.ts
1375
- var __version__ = "4.2.15";
1180
+ var __version__ = "4.3.0";
1376
1181
  // Annotate the CommonJS export names for ESM import in node:
1377
1182
  0 && (module.exports = {
1378
1183
  CacheError,
1379
1184
  ErrorCode,
1380
1185
  ExtractionConfig,
1381
- GutenOcrBackend,
1382
1186
  ImageProcessingError,
1383
1187
  KreuzbergError,
1384
1188
  MissingDependencyError,