@kreuzberg/wasm 4.4.0 → 4.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,8 +33,11 @@ export declare function batchExtractBytes(files: Array<{
33
33
  /**
34
34
  * Batch extract content from multiple byte arrays synchronously
35
35
  *
36
- * Synchronous version of batchExtractBytes. Extracts content from multiple documents
37
- * in a single batch operation without async operations.
36
+ * Synchronous version of {@link batchExtractBytes}. Extracts content from multiple
37
+ * documents in a single batch operation without async/await.
38
+ *
39
+ * **Note:** This function blocks the current thread until all extractions complete.
40
+ * For large batches, prefer the async {@link batchExtractBytes} function.
38
41
  *
39
42
  * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
40
43
  * @param config - Optional extraction configuration applied to all files
@@ -44,8 +47,8 @@ export declare function batchExtractBytes(files: Array<{
44
47
  * @example
45
48
  * ```typescript
46
49
  * const files = [
47
- * { data: pdfBytes, mimeType: 'application/pdf' },
48
- * { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
50
+ * { data: txtBytes, mimeType: 'text/plain' },
51
+ * { data: htmlBytes, mimeType: 'text/html' }
49
52
  * ];
50
53
  * const results = batchExtractBytesSync(files);
51
54
  * results.forEach((result) => console.log(result.content));
@@ -1 +1 @@
1
- {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}
1
+ {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}
@@ -49,11 +49,14 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
49
49
  /**
50
50
  * Extract content from bytes synchronously
51
51
  *
52
- * Synchronous version of extractBytes. Performs extraction without async operations.
53
- * Note: Some extraction features may still be async internally, but the wrapper is synchronous.
52
+ * Synchronous version of {@link extractBytes}. Extracts text, metadata, tables,
53
+ * and other content from document bytes without async/await.
54
+ *
55
+ * **Note:** This function blocks the current thread until extraction completes.
56
+ * For large documents, prefer the async {@link extractBytes} function.
54
57
  *
55
58
  * @param data - The document bytes to extract from
56
- * @param mimeType - MIME type of the document
59
+ * @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
57
60
  * @param config - Optional extraction configuration
58
61
  * @returns The extraction result
59
62
  * @throws {Error} If WASM module is not initialized or extraction fails
@@ -61,7 +64,7 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
61
64
  * @example
62
65
  * ```typescript
63
66
  * const bytes = new Uint8Array(buffer);
64
- * const result = extractBytesSync(bytes, 'application/pdf');
67
+ * const result = extractBytesSync(bytes, 'text/plain');
65
68
  * console.log(result.content);
66
69
  * ```
67
70
  */
@@ -1 +1 @@
1
- {"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}
1
+ {"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}
package/dist/index.js CHANGED
@@ -606,9 +606,11 @@ async function initWasm(options) {
606
606
  }
607
607
  }
608
608
  if (loadedModule && typeof loadedModule.initialize_pdfium_render === "function") {
609
- initializePdfiumAsync(loadedModule).catch((error) => {
609
+ try {
610
+ await initializePdfiumAsync(loadedModule);
611
+ } catch (error) {
610
612
  console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
611
- });
613
+ }
612
614
  }
613
615
  setInitialized(true);
614
616
  setInitializationError(null);
@@ -1298,6 +1300,161 @@ var TesseractWasmBackend = class {
1298
1300
  }
1299
1301
  };
1300
1302
 
1303
+ // typescript/ocr/worker-bridge.ts
1304
+ var workerHandle = null;
1305
+ var pendingRequests = /* @__PURE__ */ new Map();
1306
+ var nextRequestId = 0;
1307
+ var workerReady = false;
1308
+ var readyResolve = null;
1309
+ var readyReject = null;
1310
+ var useFallback = false;
1311
+ var fallbackFn = null;
1312
+ function handleWorkerMessage(msg) {
1313
+ switch (msg["type"]) {
1314
+ case "ready":
1315
+ workerReady = true;
1316
+ readyResolve?.();
1317
+ readyResolve = null;
1318
+ readyReject = null;
1319
+ break;
1320
+ case "init-error":
1321
+ readyReject?.(new Error(msg["error"]));
1322
+ readyResolve = null;
1323
+ readyReject = null;
1324
+ break;
1325
+ case "result": {
1326
+ const id = msg["id"];
1327
+ const pending = pendingRequests.get(id);
1328
+ if (pending) {
1329
+ pendingRequests.delete(id);
1330
+ pending.resolve(msg["text"]);
1331
+ }
1332
+ break;
1333
+ }
1334
+ case "error": {
1335
+ const id = msg["id"];
1336
+ const pending = pendingRequests.get(id);
1337
+ if (pending) {
1338
+ pendingRequests.delete(id);
1339
+ pending.reject(new Error(msg["error"]));
1340
+ }
1341
+ break;
1342
+ }
1343
+ }
1344
+ }
1345
+ async function createOcrWorker(wasmGluePath, wasmBinary, directFallback) {
1346
+ fallbackFn = directFallback;
1347
+ if (workerHandle) return;
1348
+ const readyPromise = new Promise((resolve, reject) => {
1349
+ readyResolve = resolve;
1350
+ readyReject = reject;
1351
+ });
1352
+ try {
1353
+ if (isNode()) {
1354
+ await createNodeWorker(wasmGluePath, wasmBinary);
1355
+ } else if (typeof Worker !== "undefined") {
1356
+ await createBrowserWorker(wasmGluePath, wasmBinary);
1357
+ } else {
1358
+ useFallback = true;
1359
+ return;
1360
+ }
1361
+ await readyPromise;
1362
+ } catch {
1363
+ workerHandle = null;
1364
+ useFallback = true;
1365
+ }
1366
+ }
1367
+ async function createNodeWorker(wasmGluePath, wasmBinary) {
1368
+ const { Worker: Worker2 } = await import(
1369
+ /* @vite-ignore */
1370
+ "worker_threads"
1371
+ );
1372
+ const nodePath = await import(
1373
+ /* @vite-ignore */
1374
+ "path"
1375
+ );
1376
+ const nodeUrl = await import(
1377
+ /* @vite-ignore */
1378
+ "url"
1379
+ );
1380
+ const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
1381
+ const workerPath = nodePath.join(__dirname, "ocr-worker.js");
1382
+ const worker = new Worker2(workerPath, {
1383
+ workerData: { wasmGluePath, wasmBinary }
1384
+ });
1385
+ worker.on("message", (msg) => handleWorkerMessage(msg));
1386
+ worker.on("error", (err) => {
1387
+ for (const pending of pendingRequests.values()) {
1388
+ pending.reject(err);
1389
+ }
1390
+ pendingRequests.clear();
1391
+ readyReject?.(err);
1392
+ });
1393
+ workerHandle = {
1394
+ postMessage: (data) => worker.postMessage(data),
1395
+ terminate: () => worker.terminate()
1396
+ };
1397
+ }
1398
+ async function createBrowserWorker(wasmGluePath, wasmBinary) {
1399
+ const workerUrl = new URL("./ocr-worker.js", import.meta.url);
1400
+ const worker = new Worker(workerUrl, { type: "module" });
1401
+ worker.onmessage = (e) => handleWorkerMessage(e.data);
1402
+ worker.onerror = (e) => {
1403
+ const err = new Error(e.message);
1404
+ for (const pending of pendingRequests.values()) {
1405
+ pending.reject(err);
1406
+ }
1407
+ pendingRequests.clear();
1408
+ readyReject?.(err);
1409
+ };
1410
+ workerHandle = {
1411
+ postMessage: (data) => worker.postMessage(data),
1412
+ terminate: () => worker.terminate()
1413
+ };
1414
+ worker.postMessage({
1415
+ type: "init",
1416
+ wasmGluePath,
1417
+ wasmBinary
1418
+ });
1419
+ }
1420
+ function runOcrInWorker(imageData, tessdata, language) {
1421
+ if (useFallback || !workerHandle || !workerReady) {
1422
+ if (fallbackFn) {
1423
+ try {
1424
+ const text = fallbackFn(imageData, tessdata, language);
1425
+ return Promise.resolve(text);
1426
+ } catch (e) {
1427
+ return Promise.reject(e instanceof Error ? e : new Error(String(e)));
1428
+ }
1429
+ }
1430
+ return Promise.reject(new Error("OCR worker not initialized and no fallback available"));
1431
+ }
1432
+ const id = nextRequestId++;
1433
+ return new Promise((resolve, reject) => {
1434
+ pendingRequests.set(id, { resolve, reject });
1435
+ workerHandle.postMessage({
1436
+ type: "ocr",
1437
+ id,
1438
+ imageData,
1439
+ tessdata,
1440
+ language
1441
+ });
1442
+ });
1443
+ }
1444
+ async function terminateOcrWorker() {
1445
+ if (workerHandle) {
1446
+ await workerHandle.terminate();
1447
+ workerHandle = null;
1448
+ }
1449
+ workerReady = false;
1450
+ useFallback = false;
1451
+ fallbackFn = null;
1452
+ for (const pending of pendingRequests.values()) {
1453
+ pending.reject(new Error("OCR worker terminated"));
1454
+ }
1455
+ pendingRequests.clear();
1456
+ }
1457
+
1301
1458
  // typescript/ocr/enabler.ts
1302
1459
  var TESSDATA_CDN_BASE = "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main";
1303
1460
  var NativeWasmOcrBackend = class {
@@ -1360,19 +1517,47 @@ var NativeWasmOcrBackend = class {
1360
1517
  "Native WASM OCR is not available. Build with the 'ocr-wasm' feature to enable kreuzberg-tesseract."
1361
1518
  );
1362
1519
  }
1520
+ let wasmGluePath;
1521
+ let wasmBinary;
1522
+ if (isNode()) {
1523
+ const nodePath = await import(
1524
+ /* @vite-ignore */
1525
+ "path"
1526
+ );
1527
+ const nodeUrl = await import(
1528
+ /* @vite-ignore */
1529
+ "url"
1530
+ );
1531
+ const nodeFs = await import(
1532
+ /* @vite-ignore */
1533
+ "fs/promises"
1534
+ );
1535
+ const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
1536
+ wasmGluePath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm.js");
1537
+ try {
1538
+ const wasmPath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm_bg.wasm");
1539
+ const buf = await nodeFs.readFile(wasmPath);
1540
+ wasmBinary = new Uint8Array(buf);
1541
+ } catch {
1542
+ }
1543
+ } else {
1544
+ wasmGluePath = new URL("./pkg/kreuzberg_wasm.js", import.meta.url).href;
1545
+ }
1546
+ const directFallback = (imageData, tessdata, language) => {
1547
+ if (!wasm2.ocrRecognize) throw new Error("ocrRecognize not available");
1548
+ return wasm2.ocrRecognize(imageData, tessdata, language);
1549
+ };
1550
+ await createOcrWorker(wasmGluePath, wasmBinary, directFallback);
1363
1551
  }
1364
1552
  async shutdown() {
1365
1553
  this.tessdataCache.clear();
1366
1554
  this.progressCallback = null;
1555
+ await terminateOcrWorker();
1367
1556
  }
1368
1557
  setProgressCallback(callback) {
1369
1558
  this.progressCallback = callback;
1370
1559
  }
1371
1560
  async processImage(imageBytes, language) {
1372
- const wasm2 = getWasmModule();
1373
- if (!wasm2?.ocrRecognize) {
1374
- throw new Error("Native WASM OCR function not available");
1375
- }
1376
1561
  const normalizedLang = language.toLowerCase();
1377
1562
  this.reportProgress(10);
1378
1563
  const tessdata = await this.getTessdata(normalizedLang);
@@ -1388,7 +1573,7 @@ var NativeWasmOcrBackend = class {
1388
1573
  imageData = imageBytes;
1389
1574
  }
1390
1575
  this.reportProgress(50);
1391
- const text = wasm2.ocrRecognize(imageData, tessdata, normalizedLang);
1576
+ const text = await runOcrInWorker(imageData, tessdata, normalizedLang);
1392
1577
  this.reportProgress(90);
1393
1578
  return {
1394
1579
  content: text,