@kreuzberg/wasm 4.4.1 → 4.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/extraction/batch.d.ts +7 -4
- package/dist/extraction/batch.d.ts.map +1 -1
- package/dist/extraction/bytes.d.ts +7 -4
- package/dist/extraction/bytes.d.ts.map +1 -1
- package/dist/index.js +192 -7
- package/dist/index.js.map +1 -1
- package/dist/initialization/wasm-loader.d.ts.map +1 -1
- package/dist/ocr/enabler.d.ts.map +1 -1
- package/dist/ocr/ocr-worker.d.ts +23 -0
- package/dist/ocr/ocr-worker.d.ts.map +1 -0
- package/dist/ocr/ocr-worker.js +78 -0
- package/dist/ocr/ocr-worker.js.map +1 -0
- package/dist/ocr/worker-bridge.d.ts +29 -0
- package/dist/ocr/worker-bridge.d.ts.map +1 -0
- package/dist/pkg/README.md +1 -1
- package/dist/pkg/kreuzberg_wasm.js +12 -12
- package/dist/pkg/kreuzberg_wasm_bg.js +12 -12
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +5 -5
- package/package.json +120 -120
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,8 +33,11 @@ export declare function batchExtractBytes(files: Array<{
|
|
|
33
33
|
/**
|
|
34
34
|
* Batch extract content from multiple byte arrays synchronously
|
|
35
35
|
*
|
|
36
|
-
* Synchronous version of batchExtractBytes. Extracts content from multiple
|
|
37
|
-
* in a single batch operation without async
|
|
36
|
+
* Synchronous version of {@link batchExtractBytes}. Extracts content from multiple
|
|
37
|
+
* documents in a single batch operation without async/await.
|
|
38
|
+
*
|
|
39
|
+
* **Note:** This function blocks the current thread until all extractions complete.
|
|
40
|
+
* For large batches, prefer the async {@link batchExtractBytes} function.
|
|
38
41
|
*
|
|
39
42
|
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
40
43
|
* @param config - Optional extraction configuration applied to all files
|
|
@@ -44,8 +47,8 @@ export declare function batchExtractBytes(files: Array<{
|
|
|
44
47
|
* @example
|
|
45
48
|
* ```typescript
|
|
46
49
|
* const files = [
|
|
47
|
-
* { data:
|
|
48
|
-
* { data:
|
|
50
|
+
* { data: txtBytes, mimeType: 'text/plain' },
|
|
51
|
+
* { data: htmlBytes, mimeType: 'text/html' }
|
|
49
52
|
* ];
|
|
50
53
|
* const results = batchExtractBytesSync(files);
|
|
51
54
|
* results.forEach((result) => console.log(result.content));
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED
|
|
1
|
+
{"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}
|
|
@@ -49,11 +49,14 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
|
|
|
49
49
|
/**
|
|
50
50
|
* Extract content from bytes synchronously
|
|
51
51
|
*
|
|
52
|
-
* Synchronous version of extractBytes.
|
|
53
|
-
*
|
|
52
|
+
* Synchronous version of {@link extractBytes}. Extracts text, metadata, tables,
|
|
53
|
+
* and other content from document bytes without async/await.
|
|
54
|
+
*
|
|
55
|
+
* **Note:** This function blocks the current thread until extraction completes.
|
|
56
|
+
* For large documents, prefer the async {@link extractBytes} function.
|
|
54
57
|
*
|
|
55
58
|
* @param data - The document bytes to extract from
|
|
56
|
-
* @param mimeType - MIME type of the document
|
|
59
|
+
* @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
|
|
57
60
|
* @param config - Optional extraction configuration
|
|
58
61
|
* @returns The extraction result
|
|
59
62
|
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
@@ -61,7 +64,7 @@ export declare function extractBytes(data: Uint8Array, mimeType: string, config?
|
|
|
61
64
|
* @example
|
|
62
65
|
* ```typescript
|
|
63
66
|
* const bytes = new Uint8Array(buffer);
|
|
64
|
-
* const result = extractBytesSync(bytes, '
|
|
67
|
+
* const result = extractBytesSync(bytes, 'text/plain');
|
|
65
68
|
* console.log(result.content);
|
|
66
69
|
* ```
|
|
67
70
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED
|
|
1
|
+
{"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}
|
package/dist/index.js
CHANGED
|
@@ -606,9 +606,11 @@ async function initWasm(options) {
|
|
|
606
606
|
}
|
|
607
607
|
}
|
|
608
608
|
if (loadedModule && typeof loadedModule.initialize_pdfium_render === "function") {
|
|
609
|
-
|
|
609
|
+
try {
|
|
610
|
+
await initializePdfiumAsync(loadedModule);
|
|
611
|
+
} catch (error) {
|
|
610
612
|
console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
|
|
611
|
-
}
|
|
613
|
+
}
|
|
612
614
|
}
|
|
613
615
|
setInitialized(true);
|
|
614
616
|
setInitializationError(null);
|
|
@@ -1298,6 +1300,161 @@ var TesseractWasmBackend = class {
|
|
|
1298
1300
|
}
|
|
1299
1301
|
};
|
|
1300
1302
|
|
|
1303
|
+
// typescript/ocr/worker-bridge.ts
|
|
1304
|
+
var workerHandle = null;
|
|
1305
|
+
var pendingRequests = /* @__PURE__ */ new Map();
|
|
1306
|
+
var nextRequestId = 0;
|
|
1307
|
+
var workerReady = false;
|
|
1308
|
+
var readyResolve = null;
|
|
1309
|
+
var readyReject = null;
|
|
1310
|
+
var useFallback = false;
|
|
1311
|
+
var fallbackFn = null;
|
|
1312
|
+
function handleWorkerMessage(msg) {
|
|
1313
|
+
switch (msg["type"]) {
|
|
1314
|
+
case "ready":
|
|
1315
|
+
workerReady = true;
|
|
1316
|
+
readyResolve?.();
|
|
1317
|
+
readyResolve = null;
|
|
1318
|
+
readyReject = null;
|
|
1319
|
+
break;
|
|
1320
|
+
case "init-error":
|
|
1321
|
+
readyReject?.(new Error(msg["error"]));
|
|
1322
|
+
readyResolve = null;
|
|
1323
|
+
readyReject = null;
|
|
1324
|
+
break;
|
|
1325
|
+
case "result": {
|
|
1326
|
+
const id = msg["id"];
|
|
1327
|
+
const pending = pendingRequests.get(id);
|
|
1328
|
+
if (pending) {
|
|
1329
|
+
pendingRequests.delete(id);
|
|
1330
|
+
pending.resolve(msg["text"]);
|
|
1331
|
+
}
|
|
1332
|
+
break;
|
|
1333
|
+
}
|
|
1334
|
+
case "error": {
|
|
1335
|
+
const id = msg["id"];
|
|
1336
|
+
const pending = pendingRequests.get(id);
|
|
1337
|
+
if (pending) {
|
|
1338
|
+
pendingRequests.delete(id);
|
|
1339
|
+
pending.reject(new Error(msg["error"]));
|
|
1340
|
+
}
|
|
1341
|
+
break;
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
}
|
|
1345
|
+
async function createOcrWorker(wasmGluePath, wasmBinary, directFallback) {
|
|
1346
|
+
fallbackFn = directFallback;
|
|
1347
|
+
if (workerHandle) return;
|
|
1348
|
+
const readyPromise = new Promise((resolve, reject) => {
|
|
1349
|
+
readyResolve = resolve;
|
|
1350
|
+
readyReject = reject;
|
|
1351
|
+
});
|
|
1352
|
+
try {
|
|
1353
|
+
if (isNode()) {
|
|
1354
|
+
await createNodeWorker(wasmGluePath, wasmBinary);
|
|
1355
|
+
} else if (typeof Worker !== "undefined") {
|
|
1356
|
+
await createBrowserWorker(wasmGluePath, wasmBinary);
|
|
1357
|
+
} else {
|
|
1358
|
+
useFallback = true;
|
|
1359
|
+
return;
|
|
1360
|
+
}
|
|
1361
|
+
await readyPromise;
|
|
1362
|
+
} catch {
|
|
1363
|
+
workerHandle = null;
|
|
1364
|
+
useFallback = true;
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
async function createNodeWorker(wasmGluePath, wasmBinary) {
|
|
1368
|
+
const { Worker: Worker2 } = await import(
|
|
1369
|
+
/* @vite-ignore */
|
|
1370
|
+
"worker_threads"
|
|
1371
|
+
);
|
|
1372
|
+
const nodePath = await import(
|
|
1373
|
+
/* @vite-ignore */
|
|
1374
|
+
"path"
|
|
1375
|
+
);
|
|
1376
|
+
const nodeUrl = await import(
|
|
1377
|
+
/* @vite-ignore */
|
|
1378
|
+
"url"
|
|
1379
|
+
);
|
|
1380
|
+
const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
|
|
1381
|
+
const workerPath = nodePath.join(__dirname, "ocr-worker.js");
|
|
1382
|
+
const worker = new Worker2(workerPath, {
|
|
1383
|
+
workerData: { wasmGluePath, wasmBinary }
|
|
1384
|
+
});
|
|
1385
|
+
worker.on("message", (msg) => handleWorkerMessage(msg));
|
|
1386
|
+
worker.on("error", (err) => {
|
|
1387
|
+
for (const pending of pendingRequests.values()) {
|
|
1388
|
+
pending.reject(err);
|
|
1389
|
+
}
|
|
1390
|
+
pendingRequests.clear();
|
|
1391
|
+
readyReject?.(err);
|
|
1392
|
+
});
|
|
1393
|
+
workerHandle = {
|
|
1394
|
+
postMessage: (data) => worker.postMessage(data),
|
|
1395
|
+
terminate: () => worker.terminate()
|
|
1396
|
+
};
|
|
1397
|
+
}
|
|
1398
|
+
async function createBrowserWorker(wasmGluePath, wasmBinary) {
|
|
1399
|
+
const workerUrl = new URL("./ocr-worker.js", import.meta.url);
|
|
1400
|
+
const worker = new Worker(workerUrl, { type: "module" });
|
|
1401
|
+
worker.onmessage = (e) => handleWorkerMessage(e.data);
|
|
1402
|
+
worker.onerror = (e) => {
|
|
1403
|
+
const err = new Error(e.message);
|
|
1404
|
+
for (const pending of pendingRequests.values()) {
|
|
1405
|
+
pending.reject(err);
|
|
1406
|
+
}
|
|
1407
|
+
pendingRequests.clear();
|
|
1408
|
+
readyReject?.(err);
|
|
1409
|
+
};
|
|
1410
|
+
workerHandle = {
|
|
1411
|
+
postMessage: (data) => worker.postMessage(data),
|
|
1412
|
+
terminate: () => worker.terminate()
|
|
1413
|
+
};
|
|
1414
|
+
worker.postMessage({
|
|
1415
|
+
type: "init",
|
|
1416
|
+
wasmGluePath,
|
|
1417
|
+
wasmBinary
|
|
1418
|
+
});
|
|
1419
|
+
}
|
|
1420
|
+
function runOcrInWorker(imageData, tessdata, language) {
|
|
1421
|
+
if (useFallback || !workerHandle || !workerReady) {
|
|
1422
|
+
if (fallbackFn) {
|
|
1423
|
+
try {
|
|
1424
|
+
const text = fallbackFn(imageData, tessdata, language);
|
|
1425
|
+
return Promise.resolve(text);
|
|
1426
|
+
} catch (e) {
|
|
1427
|
+
return Promise.reject(e instanceof Error ? e : new Error(String(e)));
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
return Promise.reject(new Error("OCR worker not initialized and no fallback available"));
|
|
1431
|
+
}
|
|
1432
|
+
const id = nextRequestId++;
|
|
1433
|
+
return new Promise((resolve, reject) => {
|
|
1434
|
+
pendingRequests.set(id, { resolve, reject });
|
|
1435
|
+
workerHandle.postMessage({
|
|
1436
|
+
type: "ocr",
|
|
1437
|
+
id,
|
|
1438
|
+
imageData,
|
|
1439
|
+
tessdata,
|
|
1440
|
+
language
|
|
1441
|
+
});
|
|
1442
|
+
});
|
|
1443
|
+
}
|
|
1444
|
+
async function terminateOcrWorker() {
|
|
1445
|
+
if (workerHandle) {
|
|
1446
|
+
await workerHandle.terminate();
|
|
1447
|
+
workerHandle = null;
|
|
1448
|
+
}
|
|
1449
|
+
workerReady = false;
|
|
1450
|
+
useFallback = false;
|
|
1451
|
+
fallbackFn = null;
|
|
1452
|
+
for (const pending of pendingRequests.values()) {
|
|
1453
|
+
pending.reject(new Error("OCR worker terminated"));
|
|
1454
|
+
}
|
|
1455
|
+
pendingRequests.clear();
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1301
1458
|
// typescript/ocr/enabler.ts
|
|
1302
1459
|
var TESSDATA_CDN_BASE = "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main";
|
|
1303
1460
|
var NativeWasmOcrBackend = class {
|
|
@@ -1360,19 +1517,47 @@ var NativeWasmOcrBackend = class {
|
|
|
1360
1517
|
"Native WASM OCR is not available. Build with the 'ocr-wasm' feature to enable kreuzberg-tesseract."
|
|
1361
1518
|
);
|
|
1362
1519
|
}
|
|
1520
|
+
let wasmGluePath;
|
|
1521
|
+
let wasmBinary;
|
|
1522
|
+
if (isNode()) {
|
|
1523
|
+
const nodePath = await import(
|
|
1524
|
+
/* @vite-ignore */
|
|
1525
|
+
"path"
|
|
1526
|
+
);
|
|
1527
|
+
const nodeUrl = await import(
|
|
1528
|
+
/* @vite-ignore */
|
|
1529
|
+
"url"
|
|
1530
|
+
);
|
|
1531
|
+
const nodeFs = await import(
|
|
1532
|
+
/* @vite-ignore */
|
|
1533
|
+
"fs/promises"
|
|
1534
|
+
);
|
|
1535
|
+
const __dirname = nodePath.dirname(nodeUrl.fileURLToPath(import.meta.url));
|
|
1536
|
+
wasmGluePath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm.js");
|
|
1537
|
+
try {
|
|
1538
|
+
const wasmPath = nodePath.join(__dirname, "..", "pkg", "kreuzberg_wasm_bg.wasm");
|
|
1539
|
+
const buf = await nodeFs.readFile(wasmPath);
|
|
1540
|
+
wasmBinary = new Uint8Array(buf);
|
|
1541
|
+
} catch {
|
|
1542
|
+
}
|
|
1543
|
+
} else {
|
|
1544
|
+
wasmGluePath = new URL("./pkg/kreuzberg_wasm.js", import.meta.url).href;
|
|
1545
|
+
}
|
|
1546
|
+
const directFallback = (imageData, tessdata, language) => {
|
|
1547
|
+
if (!wasm2.ocrRecognize) throw new Error("ocrRecognize not available");
|
|
1548
|
+
return wasm2.ocrRecognize(imageData, tessdata, language);
|
|
1549
|
+
};
|
|
1550
|
+
await createOcrWorker(wasmGluePath, wasmBinary, directFallback);
|
|
1363
1551
|
}
|
|
1364
1552
|
async shutdown() {
|
|
1365
1553
|
this.tessdataCache.clear();
|
|
1366
1554
|
this.progressCallback = null;
|
|
1555
|
+
await terminateOcrWorker();
|
|
1367
1556
|
}
|
|
1368
1557
|
setProgressCallback(callback) {
|
|
1369
1558
|
this.progressCallback = callback;
|
|
1370
1559
|
}
|
|
1371
1560
|
async processImage(imageBytes, language) {
|
|
1372
|
-
const wasm2 = getWasmModule();
|
|
1373
|
-
if (!wasm2?.ocrRecognize) {
|
|
1374
|
-
throw new Error("Native WASM OCR function not available");
|
|
1375
|
-
}
|
|
1376
1561
|
const normalizedLang = language.toLowerCase();
|
|
1377
1562
|
this.reportProgress(10);
|
|
1378
1563
|
const tessdata = await this.getTessdata(normalizedLang);
|
|
@@ -1388,7 +1573,7 @@ var NativeWasmOcrBackend = class {
|
|
|
1388
1573
|
imageData = imageBytes;
|
|
1389
1574
|
}
|
|
1390
1575
|
this.reportProgress(50);
|
|
1391
|
-
const text =
|
|
1576
|
+
const text = await runOcrInWorker(imageData, tessdata, normalizedLang);
|
|
1392
1577
|
this.reportProgress(90);
|
|
1393
1578
|
return {
|
|
1394
1579
|
content: text,
|