npm - @kreuzberg/node - Versions diffs - 4.2.15 → 4.3.0 - Mend

@kreuzberg/node 4.2.15 → 4.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (23) hide show

package/README.md +9 -11
package/dist/errors.d.mts +2 -3
package/dist/errors.d.ts +2 -3
package/dist/errors.js.map +1 -1
package/dist/errors.mjs.map +1 -1
package/dist/index.d.mts +5 -14
package/dist/index.d.ts +5 -14
package/dist/index.js +19 -215
package/dist/index.js.map +1 -1
package/dist/index.mjs +19 -204
package/dist/index.mjs.map +1 -1
package/dist/types.d.mts +137 -11
package/dist/types.d.ts +137 -11
package/dist/types.js.map +1 -1
package/index.d.ts +27 -0
package/index.js +52 -52
package/package.json +11 -9
package/dist/ocr/guten-ocr.d.mts +0 -193
package/dist/ocr/guten-ocr.d.ts +0 -193
package/dist/ocr/guten-ocr.js +0 -234
package/dist/ocr/guten-ocr.js.map +0 -1
package/dist/ocr/guten-ocr.mjs +0 -199
package/dist/ocr/guten-ocr.mjs.map +0 -1

package/dist/index.js CHANGED Viewed

@@ -1,9 +1,7 @@
 "use strict";
-var __create = Object.create;
 var __defProp = Object.defineProperty;
 var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
 var __getOwnPropNames = Object.getOwnPropertyNames;
-var __getProtoOf = Object.getPrototypeOf;
 var __hasOwnProp = Object.prototype.hasOwnProperty;
 var __export = (target, all) => {
   for (var name in all)
@@ -17,14 +15,6 @@ var __copyProps = (to, from, except, desc) => {
   }
   return to;
 };
-var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
-  // If the importer is in node compatibility mode or this is not an ESM
-  // file that has been converted to a CommonJS file using a Babel-
-  // compatible transform (i.e. "__esModule" has not been set), then set
-  // "default" to the CommonJS "module.exports" for node compatibility.
-  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
-  mod
-));
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 // typescript/index.ts
@@ -33,7 +23,6 @@ __export(index_exports, {
   CacheError: () => CacheError,
   ErrorCode: () => ErrorCode,
   ExtractionConfig: () => ExtractionConfig,
-  GutenOcrBackend: () => GutenOcrBackend,
   ImageProcessingError: () => ImageProcessingError,
   KreuzbergError: () => KreuzbergError,
   MissingDependencyError: () => MissingDependencyError,
@@ -365,6 +354,8 @@ function normalizeOcrConfig(ocr) {
   if (tesseract) {
     setIfDefined(normalized, "tesseractConfig", tesseract);
   }
+  setIfDefined(normalized, "paddleOcrConfig", ocr.paddleOcrConfig);
+  setIfDefined(normalized, "elementConfig", ocr.elementConfig);
   return normalized;
 }
 function normalizeChunkingConfig(chunking) {
@@ -513,6 +504,7 @@ function normalizeExtractionConfig(config) {
   setIfDefined(normalized, "useCache", config.useCache);
   setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
   setIfDefined(normalized, "forceOcr", config.forceOcr);
+  setIfDefined(normalized, "includeDocumentStructure", config.includeDocumentStructure);
   setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
   const ocr = normalizeOcrConfig(config.ocr);
   setIfDefined(normalized, "ocr", ocr);
@@ -694,7 +686,9 @@ function convertPageContent(rawPage) {
     // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
     tables: Array.isArray(page["tables"]) ? page["tables"] : [],
     // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
-    images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
+    images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : [],
+    // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
+    isBlank: page["isBlank"] ?? null
   };
 }
 function convertResult(rawResult) {
@@ -708,7 +702,8 @@ function convertResult(rawResult) {
       chunks: null,
       images: null,
       elements: null,
-      pages: null
+      pages: null,
+      document: null
     };
   }
   const result = rawResult;
@@ -727,7 +722,9 @@ function convertResult(rawResult) {
     chunks: null,
     images: null,
     elements: null,
-    pages: null
+    pages: null,
+    // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
+    document: result["document"] ?? null
   };
   const chunksData = result["chunks"];
   if (Array.isArray(chunksData)) {
@@ -745,6 +742,10 @@ function convertResult(rawResult) {
   if (Array.isArray(pagesData)) {
     returnObj.pages = pagesData.map((page) => convertPageContent(page));
   }
+  const ocrElementsData = result["ocrElements"];
+  if (Array.isArray(ocrElementsData)) {
+    returnObj.ocrElements = ocrElementsData;
+  }
   return returnObj;
 }
@@ -834,7 +835,7 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
     data = dataOrPath;
   }
   const validated = assertUint8Array(data, "data");
-  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
+  if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
     console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
   }
   const normalizedConfig = normalizeExtractionConfig(config);
@@ -987,202 +988,6 @@ function listValidators() {
   return binding2.listValidators();
 }
-// typescript/ocr/guten-ocr.ts
-var GutenOcrBackend = class {
-  ocr = null;
-  ocrModule = null;
-  options;
-  /**
-   * Create a new Guten OCR backend.
-   *
-   * @param options - Optional configuration for Guten OCR
-   * @param options.models - Custom model paths (default: uses bundled models)
-   * @param options.isDebug - Enable debug mode (default: false)
-   * @param options.debugOutputDir - Directory for debug output (default: undefined)
-   * @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
-   *
-   * @example
-   * ```typescript
-   * // Default configuration
-   * const backend = new GutenOcrBackend();
-   *
-   * // With debug enabled
-   * const debugBackend = new GutenOcrBackend({
-   *   isDebug: true,
-   *   debugOutputDir: './ocr_debug'
-   * });
-   * ```
-   */
-  constructor(options) {
-    if (options !== void 0) {
-      this.options = options;
-    }
-  }
-  /**
-   * Get the backend name.
-   *
-   * @returns Backend name ("guten-ocr")
-   */
-  name() {
-    return "guten-ocr";
-  }
-  /**
-   * Get list of supported language codes.
-   *
-   * Guten OCR supports multiple languages depending on the model configuration.
-   * The default models support English and Chinese.
-   *
-   * @returns Array of ISO 639-1/2 language codes
-   */
-  supportedLanguages() {
-    return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
-  }
-  /**
-   * Initialize the OCR backend.
-   *
-   * This method loads the Guten OCR module and creates an OCR instance.
-   * Call this before using processImage().
-   *
-   * @throws {Error} If @gutenye/ocr-node is not installed
-   * @throws {Error} If OCR initialization fails
-   *
-   * @example
-   * ```typescript
-   * const backend = new GutenOcrBackend();
-   * await backend.initialize();
-   * ```
-   */
-  async initialize() {
-    if (this.ocr !== null) {
-      return;
-    }
-    try {
-      this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
-    } catch (e) {
-      const error = e;
-      throw new Error(
-        `Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
-      );
-    }
-    try {
-      this.ocr = await this.ocrModule?.create(this.options) ?? null;
-    } catch (e) {
-      const error = e;
-      throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
-    }
-  }
-  /**
-   * Shutdown the backend and release resources.
-   *
-   * This method cleans up all resources associated with the backend,
-   * including the GutenOCR instance and module references.
-   *
-   * @example
-   * ```typescript
-   * const backend = new GutenOcrBackend();
-   * await backend.initialize();
-   * // ... use backend ...
-   * await backend.shutdown();
-   * ```
-   */
-  async shutdown() {
-    if (this.ocr !== null) {
-      this.ocr = null;
-    }
-    if (this.ocrModule !== null) {
-      this.ocrModule = null;
-    }
-  }
-  /**
-   * Process image bytes and extract text using Guten OCR.
-   *
-   * This method:
-   * 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
-   * 2. Runs OCR detection to find text regions
-   * 3. Runs OCR recognition on each text region
-   * 4. Returns extracted text with metadata
-   *
-   * @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
-   * @param language - Language code (must be in supportedLanguages())
-   * @returns Promise resolving to OCR result with content and metadata
-   *
-   * @throws {Error} If backend is not initialized
-   * @throws {Error} If OCR processing fails
-   *
-   * @example
-   * ```typescript
-   * import { readFile } from 'fs/promises';
-   *
-   * const backend = new GutenOcrBackend();
-   * await backend.initialize();
-   *
-   * const imageBytes = await readFile('scanned.png');
-   * const result = await backend.processImage(imageBytes, 'en');
-   * console.log(result.content);
-   * console.log(result.metadata.confidence);
-   * ```
-   */
-  async processImage(imageBytes, language) {
-    if (this.ocr === null) {
-      await this.initialize();
-    }
-    if (this.ocr === null) {
-      throw new Error("Guten OCR backend failed to initialize");
-    }
-    try {
-      const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
-      const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
-      if (debugEnv === "1") {
-        const header = Array.from(buffer.subarray(0, 8));
-        console.log("[Guten OCR] Debug input header:", header);
-        console.log(
-          "[Guten OCR] Buffer?",
-          Buffer.isBuffer(buffer),
-          "constructor",
-          imageBytes?.constructor?.name,
-          "length",
-          buffer.length,
-          "type",
-          typeof imageBytes
-        );
-      }
-      let width = 0;
-      let height = 0;
-      try {
-        const sharpModule = await import("sharp");
-        const sharp = sharpModule.default || sharpModule;
-        const image = sharp(buffer);
-        const metadata = await image.metadata();
-        const metadataRecord = metadata;
-        width = metadataRecord["width"] ?? 0;
-        height = metadataRecord["height"] ?? 0;
-      } catch (metadataError) {
-        const error = metadataError;
-        console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
-      }
-      const result = await this.ocr.detect(buffer);
-      const textLines = result.map((line) => line.text);
-      const content = textLines.join("\n");
-      const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
-      return {
-        content,
-        mime_type: "text/plain",
-        metadata: {
-          width,
-          height,
-          confidence: avgConfidence,
-          text_regions: result.length,
-          language
-        },
-        tables: []
-      };
-    } catch (e) {
-      const error = e;
-      throw new Error(`Guten OCR processing failed: ${error.message}`);
-    }
-  }
-};
 // typescript/plugins/ocr-backends.ts
 function isOcrProcessTuple(value) {
   return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
@@ -1203,7 +1008,7 @@ function registerOcrBackend(backend) {
     supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
     async processImage(...processArgs) {
       const [imagePayload, maybeLanguage] = processArgs;
-      if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
+      if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
         console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
         console.log("[registerOcrBackend] Raw args", {
           imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
@@ -1223,7 +1028,7 @@ function registerOcrBackend(backend) {
       if (typeof language !== "string") {
         throw new Error("OCR backend did not receive a language parameter");
       }
-      if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
+      if (process.env["KREUZBERG_DEBUG_OCR"] === "1") {
         const length = typeof rawBytes === "string" ? rawBytes.length : rawBytes.length;
         console.log(
           "[registerOcrBackend] Received payload",
@@ -1372,13 +1177,12 @@ function getEmbeddingPreset(name) {
 }
 // typescript/index.ts
-var __version__ = "4.2.15";
+var __version__ = "4.3.0";
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   CacheError,
   ErrorCode,
   ExtractionConfig,
-  GutenOcrBackend,
   ImageProcessingError,
   KreuzbergError,
   MissingDependencyError,