npm - @kreuzberg/wasm - Versions diffs - 4.4.0 → 4.4.2 - Mend

@kreuzberg/wasm 4.4.0 → 4.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/README.md +1 -1
package/dist/extraction/batch.d.ts +7 -4
package/dist/extraction/batch.d.ts.map +1 -1
package/dist/extraction/bytes.d.ts +7 -4
package/dist/extraction/bytes.d.ts.map +1 -1
package/dist/index.js +192 -7
package/dist/index.js.map +1 -1
package/dist/initialization/wasm-loader.d.ts.map +1 -1
package/dist/ocr/enabler.d.ts.map +1 -1
package/dist/ocr/ocr-worker.d.ts +23 -0
package/dist/ocr/ocr-worker.d.ts.map +1 -0
package/dist/ocr/ocr-worker.js +78 -0
package/dist/ocr/ocr-worker.js.map +1 -0
package/dist/ocr/worker-bridge.d.ts +29 -0
package/dist/ocr/worker-bridge.d.ts.map +1 -0
package/dist/pkg/README.md +1 -1
package/dist/pkg/kreuzberg_wasm.js +12 -12
package/dist/pkg/kreuzberg_wasm_bg.js +12 -12
package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +5 -5
package/package.json +3 -3

package/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.2" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

package/dist/extraction/batch.d.ts CHANGED Viewed

@@ -33,8 +33,11 @@ export declare function batchExtractBytes(files: Array<{
 /**
  * Batch extract content from multiple byte arrays synchronously
  *
- * Synchronous version of batchExtractBytes. Extracts content from multiple documents
- * in a single batch operation without async operations.
+ * Synchronous version of {@link batchExtractBytes}. Extracts content from multiple
+ * documents in a single batch operation without async/await.
+ *
+ * **Note:** This function blocks the current thread until all extractions complete.
+ * For large batches, prefer the async {@link batchExtractBytes} function.
  *
  * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
  * @param config - Optional extraction configuration applied to all files
@@ -44,8 +47,8 @@ export declare function batchExtractBytes(files: Array<{
  * @example
  * ```typescript
  * const files = [
- *   { data: pdfBytes, mimeType: 'application/pdf' },
- *   { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
+ *   { data: txtBytes, mimeType: 'text/plain' },
+ *   { data: htmlBytes, mimeType: 'text/html' }
  * ];
  * const results = batchExtractBytesSync(files);
  * results.forEach((result) => console.log(result.content));

package/dist/extraction/batch.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED~~;;;;;;;;;;;;;;;;;;;;GAoBG~~;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}
1	+ {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}

package/dist/extraction/bytes.d.ts CHANGED Viewed

@@ -49,11 +49,14 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
 /**
  * Extract content from bytes synchronously
  *
- * Synchronous version of extractBytes. Performs extraction without async operations.
- * Note: Some extraction features may still be async internally, but the wrapper is synchronous.
+ * Synchronous version of {@link extractBytes}. Extracts text, metadata, tables,
+ * and other content from document bytes without async/await.
+ *
+ * **Note:** This function blocks the current thread until extraction completes.
+ * For large documents, prefer the async {@link extractBytes} function.
  *
  * @param data - The document bytes to extract from
- * @param mimeType - MIME type of the document
+ * @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
  * @param config - Optional extraction configuration
  * @returns The extraction result
  * @throws {Error} If WASM module is not initialized or extraction fails
@@ -61,7 +64,7 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
  * @example
  * ```typescript
  * const bytes = new Uint8Array(buffer);
- * const result = extractBytesSync(bytes, 'application/pdf');
+ * const result = extractBytesSync(bytes, 'text/plain');
  * console.log(result.content);
  * ```
  */

package/dist/extraction/bytes.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED~~;;;;;;;;;;;;;;;;;;GAkBG~~;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}
1	+ {"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}

package/dist/index.js CHANGED Viewed

@@ -606,9 +606,11 @@ async function initWasm(options) {
         }
       }
       if (loadedModule && typeof loadedModule.initialize_pdfium_render === "function") {
-        initializePdfiumAsync(loadedModule).catch((error) => {
+        try {
+          await initializePdfiumAsync(loadedModule);
+        } catch (error) {
           console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
-        });
+        }
       }
       setInitialized(true);
       setInitializationError(null);
@@ -1298,6 +1300,161 @@ var TesseractWasmBackend = class {
   }
 };
+// typescript/ocr/worker-bridge.ts
+var workerHandle = null; // { postMessage, terminate } wrapper around the live worker; null when none exists
+var pendingRequests = /* @__PURE__ */ new Map(); // request id -> { resolve, reject } of in-flight OCR calls
+var nextRequestId = 0; // incremented for every request posted to the worker
+var workerReady = false; // set to true once the worker posts a "ready" message
+var readyResolve = null; // resolves the readiness promise created in createOcrWorker
+var readyReject = null; // rejects that readiness promise ("init-error" message or worker error)
+var useFallback = false; // true when no worker could be started; OCR then goes through fallbackFn
+var fallbackFn = null; // direct OCR function handed to createOcrWorker (its directFallback argument)
+function handleWorkerMessage(msg) {
+  switch (msg["type"]) {
+    case "ready":
+      workerReady = true;
+      readyResolve?.();
+      readyResolve = null;
+      readyReject = null;
+      break;
+    case "init-error":
+      readyReject?.(new Error(msg["error"]));
+      readyResolve = null;
+      readyReject = null;
+      break;
+    case "result": {
+      const id = msg["id"];
+      const pending = pendingRequests.get(id);
+      if (pending) {
+        pendingRequests.delete(id);
+        pending.resolve(msg["text"]);
+      }
+      break;
+    }
+    case "error": {
+      const id = msg["id"];
+      const pending = pendingRequests.get(id);
+      if (pending) {
+        pendingRequests.delete(id);
+        pending.reject(new Error(msg["error"]));
+      }
+      break;
+    }
+  }
+}
+async function createOcrWorker(wasmGluePath, wasmBinary, directFallback) {
+  fallbackFn = directFallback;
+  if (workerHandle) return;
+  const readyPromise = new Promise((resolve, reject) => {
+    readyResolve = resolve;
+    readyReject = reject;
+  });
+  try {
+    if (isNode()) {
+      await createNodeWorker(wasmGluePath, wasmBinary);
+    } else if (typeof Worker !== "undefined") {
+      await createBrowserWorker(wasmGluePath, wasmBinary);
+    } else {
+      useFallback = true;
+      return;
+    }
+    await readyPromise;
+  } catch {
+    workerHandle = null;
+    useFallback = true;
+  }
+}
+async function createNodeWorker(wasmGluePath, wasmBinary) {
+  const { Worker: Worker2 } = await import(
+    /* @vite-ignore */
+    "worker_threads"
+  );
+  const nodePath = await import(
+    /* @vite-ignore */
+    "path"
+  );
+  const nodeUrl = await import(
+    /* @vite-ignore */
+    "url"
+  );
+  const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
+  const workerPath = nodePath.join(__dirname, "ocr-worker.js");
+  const worker = new Worker2(workerPath, {
+    workerData: { wasmGluePath, wasmBinary }
+  });
+  worker.on("message", (msg) => handleWorkerMessage(msg));
+  worker.on("error", (err) => {
+    for (const pending of pendingRequests.values()) {
+      pending.reject(err);
+    }
+    pendingRequests.clear();
+    readyReject?.(err);
+  });
+  workerHandle = {
+    postMessage: (data) => worker.postMessage(data),
+    terminate: () => worker.terminate()
+  };
+}
+async function createBrowserWorker(wasmGluePath, wasmBinary) {
+  const workerUrl = new URL("./ocr-worker.js", import.meta.url);
+  const worker = new Worker(workerUrl, { type: "module" });
+  worker.onmessage = (e) => handleWorkerMessage(e.data);
+  worker.onerror = (e) => {
+    const err = new Error(e.message);
+    for (const pending of pendingRequests.values()) {
+      pending.reject(err);
+    }
+    pendingRequests.clear();
+    readyReject?.(err);
+  };
+  workerHandle = {
+    postMessage: (data) => worker.postMessage(data),
+    terminate: () => worker.terminate()
+  };
+  worker.postMessage({
+    type: "init",
+    wasmGluePath,
+    wasmBinary
+  });
+}
+function runOcrInWorker(imageData, tessdata, language) {
+  if (useFallback || !workerHandle || !workerReady) {
+    if (fallbackFn) {
+      try {
+        const text = fallbackFn(imageData, tessdata, language);
+        return Promise.resolve(text);
+      } catch (e) {
+        return Promise.reject(e instanceof Error ? e : new Error(String(e)));
+      }
+    }
+    return Promise.reject(new Error("OCR worker not initialized and no fallback available"));
+  }
+  const id = nextRequestId++;
+  return new Promise((resolve, reject) => {
+    pendingRequests.set(id, { resolve, reject });
+    workerHandle.postMessage({
+      type: "ocr",
+      id,
+      imageData,
+      tessdata,
+      language
+    });
+  });
+}
+async function terminateOcrWorker() {
+  if (workerHandle) {
+    await workerHandle.terminate();
+    workerHandle = null;
+  }
+  workerReady = false;
+  useFallback = false;
+  fallbackFn = null;
+  for (const pending of pendingRequests.values()) {
+    pending.reject(new Error("OCR worker terminated"));
+  }
+  pendingRequests.clear();
+}
 // typescript/ocr/enabler.ts
 var TESSDATA_CDN_BASE = "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main";
 var NativeWasmOcrBackend = class {
@@ -1360,19 +1517,47 @@ var NativeWasmOcrBackend = class {
         "Native WASM OCR is not available. Build with the 'ocr-wasm' feature to enable kreuzberg-tesseract."
       );
     }
+    let wasmGluePath;
+    let wasmBinary;
+    if (isNode()) {
+      const nodePath = await import(
+        /* @vite-ignore */
+        "path"
+      );
+      const nodeUrl = await import(
+        /* @vite-ignore */
+        "url"
+      );
+      const nodeFs = await import(
+        /* @vite-ignore */
+        "fs/promises"
+      );
+      const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
+      wasmGluePath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm.js");
+      try {
+        const wasmPath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm_bg.wasm");
+        const buf = await nodeFs.readFile(wasmPath);
+        wasmBinary = new Uint8Array(buf);
+      } catch {
+      }
+    } else {
+      wasmGluePath = new URL("./pkg/kreuzberg_wasm.js", import.meta.url).href;
+    }
+    const directFallback = (imageData, tessdata, language) => {
+      if (!wasm2.ocrRecognize) throw new Error("ocrRecognize not available");
+      return wasm2.ocrRecognize(imageData, tessdata, language);
+    };
+    await createOcrWorker(wasmGluePath, wasmBinary, directFallback);
   }
   async shutdown() {
     this.tessdataCache.clear();
     this.progressCallback = null;
+    await terminateOcrWorker();
   }
   setProgressCallback(callback) {
     this.progressCallback = callback;
   }
   async processImage(imageBytes, language) {
-    const wasm2 = getWasmModule();
-    if (!wasm2?.ocrRecognize) {
-      throw new Error("Native WASM OCR function not available");
-    }
     const normalizedLang = language.toLowerCase();
     this.reportProgress(10);
     const tessdata = await this.getTessdata(normalizedLang);
@@ -1388,7 +1573,7 @@ var NativeWasmOcrBackend = class {
       imageData = imageBytes;
     }
     this.reportProgress(50);
-    const text = wasm2.ocrRecognize(imageData, tessdata, normalizedLang);
+    const text = await runOcrInWorker(imageData, tessdata, normalizedLang);
     this.reportProgress(90);
     return {
       content: text,