@kreuzberg/node 4.2.14 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -281,6 +281,8 @@ function normalizeOcrConfig(ocr) {
281
281
  if (tesseract) {
282
282
  setIfDefined(normalized, "tesseractConfig", tesseract);
283
283
  }
284
+ setIfDefined(normalized, "paddleOcrConfig", ocr.paddleOcrConfig);
285
+ setIfDefined(normalized, "elementConfig", ocr.elementConfig);
284
286
  return normalized;
285
287
  }
286
288
  function normalizeChunkingConfig(chunking) {
@@ -429,6 +431,7 @@ function normalizeExtractionConfig(config) {
429
431
  setIfDefined(normalized, "useCache", config.useCache);
430
432
  setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
431
433
  setIfDefined(normalized, "forceOcr", config.forceOcr);
434
+ setIfDefined(normalized, "includeDocumentStructure", config.includeDocumentStructure);
432
435
  setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
433
436
  const ocr = normalizeOcrConfig(config.ocr);
434
437
  setIfDefined(normalized, "ocr", ocr);
@@ -610,7 +613,9 @@ function convertPageContent(rawPage) {
610
613
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
611
614
  tables: Array.isArray(page["tables"]) ? page["tables"] : [],
612
615
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
613
- images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
616
+ images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
617
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
618
+ isBlank: page["isBlank"] ?? null
614
619
  };
615
620
  }
616
621
  function convertResult(rawResult) {
@@ -624,7 +629,8 @@ function convertResult(rawResult) {
624
629
  chunks: null,
625
630
  images: null,
626
631
  elements: null,
627
- pages: null
632
+ pages: null,
633
+ document: null
628
634
  };
629
635
  }
630
636
  const result = rawResult;
@@ -643,7 +649,9 @@ function convertResult(rawResult) {
643
649
  chunks: null,
644
650
  images: null,
645
651
  elements: null,
646
- pages: null
652
+ pages: null,
653
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
654
+ document: result["document"] ?? null
647
655
  };
648
656
  const chunksData = result["chunks"];
649
657
  if (Array.isArray(chunksData)) {
@@ -661,6 +669,10 @@ function convertResult(rawResult) {
661
669
  if (Array.isArray(pagesData)) {
662
670
  returnObj.pages = pagesData.map((page) => convertPageContent(page));
663
671
  }
672
+ const ocrElementsData = result["ocrElements"];
673
+ if (Array.isArray(ocrElementsData)) {
674
+ returnObj.ocrElements = ocrElementsData;
675
+ }
664
676
  return returnObj;
665
677
  }
666
678
 
@@ -750,7 +762,7 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
750
762
  data = dataOrPath;
751
763
  }
752
764
  const validated = assertUint8Array(data, "data");
753
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
765
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
754
766
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
755
767
  }
756
768
  const normalizedConfig = normalizeExtractionConfig(config);
@@ -903,202 +915,6 @@ function listValidators() {
903
915
  return binding2.listValidators();
904
916
  }
905
917
 
906
- // typescript/ocr/guten-ocr.ts
907
- var GutenOcrBackend = class {
908
- ocr = null;
909
- ocrModule = null;
910
- options;
911
- /**
912
- * Create a new Guten OCR backend.
913
- *
914
- * @param options - Optional configuration for Guten OCR
915
- * @param options.models - Custom model paths (default: uses bundled models)
916
- * @param options.isDebug - Enable debug mode (default: false)
917
- * @param options.debugOutputDir - Directory for debug output (default: undefined)
918
- * @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
919
- *
920
- * @example
921
- * ```typescript
922
- * // Default configuration
923
- * const backend = new GutenOcrBackend();
924
- *
925
- * // With debug enabled
926
- * const debugBackend = new GutenOcrBackend({
927
- * isDebug: true,
928
- * debugOutputDir: './ocr_debug'
929
- * });
930
- * ```
931
- */
932
- constructor(options) {
933
- if (options !== void 0) {
934
- this.options = options;
935
- }
936
- }
937
- /**
938
- * Get the backend name.
939
- *
940
- * @returns Backend name ("guten-ocr")
941
- */
942
- name() {
943
- return "guten-ocr";
944
- }
945
- /**
946
- * Get list of supported language codes.
947
- *
948
- * Guten OCR supports multiple languages depending on the model configuration.
949
- * The default models support English and Chinese.
950
- *
951
- * @returns Array of ISO 639-1/2 language codes
952
- */
953
- supportedLanguages() {
954
- return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
955
- }
956
- /**
957
- * Initialize the OCR backend.
958
- *
959
- * This method loads the Guten OCR module and creates an OCR instance.
960
- * Call this before using processImage().
961
- *
962
- * @throws {Error} If @gutenye/ocr-node is not installed
963
- * @throws {Error} If OCR initialization fails
964
- *
965
- * @example
966
- * ```typescript
967
- * const backend = new GutenOcrBackend();
968
- * await backend.initialize();
969
- * ```
970
- */
971
- async initialize() {
972
- if (this.ocr !== null) {
973
- return;
974
- }
975
- try {
976
- this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
977
- } catch (e) {
978
- const error = e;
979
- throw new Error(
980
- `Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
981
- );
982
- }
983
- try {
984
- this.ocr = await this.ocrModule?.create(this.options) ?? null;
985
- } catch (e) {
986
- const error = e;
987
- throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
988
- }
989
- }
990
- /**
991
- * Shutdown the backend and release resources.
992
- *
993
- * This method cleans up all resources associated with the backend,
994
- * including the GutenOCR instance and module references.
995
- *
996
- * @example
997
- * ```typescript
998
- * const backend = new GutenOcrBackend();
999
- * await backend.initialize();
1000
- * // ... use backend ...
1001
- * await backend.shutdown();
1002
- * ```
1003
- */
1004
- async shutdown() {
1005
- if (this.ocr !== null) {
1006
- this.ocr = null;
1007
- }
1008
- if (this.ocrModule !== null) {
1009
- this.ocrModule = null;
1010
- }
1011
- }
1012
- /**
1013
- * Process image bytes and extract text using Guten OCR.
1014
- *
1015
- * This method:
1016
- * 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
1017
- * 2. Runs OCR detection to find text regions
1018
- * 3. Runs OCR recognition on each text region
1019
- * 4. Returns extracted text with metadata
1020
- *
1021
- * @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
1022
- * @param language - Language code (must be in supportedLanguages())
1023
- * @returns Promise resolving to OCR result with content and metadata
1024
- *
1025
- * @throws {Error} If backend is not initialized
1026
- * @throws {Error} If OCR processing fails
1027
- *
1028
- * @example
1029
- * ```typescript
1030
- * import { readFile } from 'fs/promises';
1031
- *
1032
- * const backend = new GutenOcrBackend();
1033
- * await backend.initialize();
1034
- *
1035
- * const imageBytes = await readFile('scanned.png');
1036
- * const result = await backend.processImage(imageBytes, 'en');
1037
- * console.log(result.content);
1038
- * console.log(result.metadata.confidence);
1039
- * ```
1040
- */
1041
- async processImage(imageBytes, language) {
1042
- if (this.ocr === null) {
1043
- await this.initialize();
1044
- }
1045
- if (this.ocr === null) {
1046
- throw new Error("Guten OCR backend failed to initialize");
1047
- }
1048
- try {
1049
- const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
1050
- const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
1051
- if (debugEnv === "1") {
1052
- const header = Array.from(buffer.subarray(0, 8));
1053
- console.log("[Guten OCR] Debug input header:", header);
1054
- console.log(
1055
- "[Guten OCR] Buffer?",
1056
- Buffer.isBuffer(buffer),
1057
- "constructor",
1058
- imageBytes?.constructor?.name,
1059
- "length",
1060
- buffer.length,
1061
- "type",
1062
- typeof imageBytes
1063
- );
1064
- }
1065
- let width = 0;
1066
- let height = 0;
1067
- try {
1068
- const sharpModule = await import("sharp");
1069
- const sharp = sharpModule.default || sharpModule;
1070
- const image = sharp(buffer);
1071
- const metadata = await image.metadata();
1072
- const metadataRecord = metadata;
1073
- width = metadataRecord["width"] ?? 0;
1074
- height = metadataRecord["height"] ?? 0;
1075
- } catch (metadataError) {
1076
- const error = metadataError;
1077
- console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
1078
- }
1079
- const result = await this.ocr.detect(buffer);
1080
- const textLines = result.map((line) => line.text);
1081
- const content = textLines.join("\n");
1082
- const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
1083
- return {
1084
- content,
1085
- mime_type: "text/plain",
1086
- metadata: {
1087
- width,
1088
- height,
1089
- confidence: avgConfidence,
1090
- text_regions: result.length,
1091
- language
1092
- },
1093
- tables: []
1094
- };
1095
- } catch (e) {
1096
- const error = e;
1097
- throw new Error(`Guten OCR processing failed: ${error.message}`);
1098
- }
1099
- }
1100
- };
1101
-
1102
918
  // typescript/plugins/ocr-backends.ts
1103
919
  function isOcrProcessTuple(value) {
1104
920
  return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
@@ -1119,7 +935,7 @@ function registerOcrBackend(backend) {
1119
935
  supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
1120
936
  async processImage(...processArgs) {
1121
937
  const [imagePayload, maybeLanguage] = processArgs;
1122
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
938
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
1123
939
  console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
1124
940
  console.log("[registerOcrBackend] Raw args", {
1125
941
  imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
@@ -1139,7 +955,7 @@ function registerOcrBackend(backend) {
1139
955
  if (typeof language !== "string") {
1140
956
  throw new Error("OCR backend did not receive a language parameter");
1141
957
  }
1142
- if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
958
+ if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
1143
959
  const length = typeof rawBytes === "string" ? rawBytes.length : rawBytes.length;
1144
960
  console.log(
1145
961
  "[registerOcrBackend] Received payload",
@@ -1288,12 +1104,11 @@ function getEmbeddingPreset(name) {
1288
1104
  }
1289
1105
 
1290
1106
  // typescript/index.ts
1291
- var __version__ = "4.2.14";
1107
+ var __version__ = "4.3.0";
1292
1108
  export {
1293
1109
  CacheError,
1294
1110
  ErrorCode,
1295
1111
  ExtractionConfig,
1296
- GutenOcrBackend,
1297
1112
  ImageProcessingError,
1298
1113
  KreuzbergError,
1299
1114
  MissingDependencyError,