pdf-oxide-wasm 0.3.51 → 0.3.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundler/pdf_oxide.d.ts +25 -9
- package/bundler/pdf_oxide_bg.js +42 -30
- package/bundler/pdf_oxide_bg.wasm +0 -0
- package/nodejs/pdf_oxide.d.ts +25 -9
- package/nodejs/pdf_oxide.js +42 -30
- package/nodejs/pdf_oxide_bg.wasm +0 -0
- package/package.json +1 -1
- package/web/pdf_oxide.d.ts +25 -9
- package/web/pdf_oxide.js +42 -30
- package/web/pdf_oxide_bg.wasm +0 -0
package/bundler/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -1017,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
1017
1031
|
*/
|
|
1018
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
1019
1033
|
/**
|
|
1020
|
-
* Extract text using OCR
|
|
1021
|
-
*
|
|
1022
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1023
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1024
1036
|
*/
|
|
1025
|
-
extractTextOcr(_page_index: number, _engine
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1026
1038
|
/**
|
|
1027
1039
|
* Extract word-level data from a page.
|
|
1028
1040
|
*
|
|
@@ -1445,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1445
1457
|
extractTextLines(): any;
|
|
1446
1458
|
/**
|
|
1447
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1448
1464
|
*/
|
|
1449
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1450
1466
|
/**
|
package/bundler/pdf_oxide_bg.js
CHANGED
|
@@ -2544,7 +2544,9 @@ export class WasmHeader {
|
|
|
2544
2544
|
if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
|
|
2545
2545
|
|
|
2546
2546
|
/**
|
|
2547
|
-
* OCR configuration for WebAssembly.
|
|
2547
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
2548
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
2549
|
+
* matures, #524.)
|
|
2548
2550
|
*/
|
|
2549
2551
|
export class WasmOcrConfig {
|
|
2550
2552
|
__destroy_into_raw() {
|
|
@@ -2570,7 +2572,17 @@ export class WasmOcrConfig {
|
|
|
2570
2572
|
if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
|
|
2571
2573
|
|
|
2572
2574
|
/**
|
|
2573
|
-
* OCR engine for WebAssembly.
|
|
2575
|
+
* OCR engine for WebAssembly (#524).
|
|
2576
|
+
*
|
|
2577
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
2578
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
2579
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
2580
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
2581
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
2582
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
2583
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
2584
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
2585
|
+
* explaining this).
|
|
2574
2586
|
*/
|
|
2575
2587
|
export class WasmOcrEngine {
|
|
2576
2588
|
__destroy_into_raw() {
|
|
@@ -2584,20 +2596,22 @@ export class WasmOcrEngine {
|
|
|
2584
2596
|
wasm.__wbg_wasmocrengine_free(ptr, 0);
|
|
2585
2597
|
}
|
|
2586
2598
|
/**
|
|
2587
|
-
*
|
|
2588
|
-
*
|
|
2589
|
-
*
|
|
2590
|
-
* @param {
|
|
2599
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
2600
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
2601
|
+
* `pdf-oxide-wasm` ships without it.
|
|
2602
|
+
* @param {Uint8Array} _det_model
|
|
2603
|
+
* @param {Uint8Array} _rec_model
|
|
2604
|
+
* @param {string} _dict
|
|
2591
2605
|
* @param {WasmOcrConfig | null} [_config]
|
|
2592
2606
|
*/
|
|
2593
|
-
constructor(
|
|
2607
|
+
constructor(_det_model, _rec_model, _dict, _config) {
|
|
2594
2608
|
try {
|
|
2595
2609
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
2596
|
-
const ptr0 =
|
|
2610
|
+
const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
|
|
2597
2611
|
const len0 = WASM_VECTOR_LEN;
|
|
2598
|
-
const ptr1 =
|
|
2612
|
+
const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
|
|
2599
2613
|
const len1 = WASM_VECTOR_LEN;
|
|
2600
|
-
const ptr2 = passStringToWasm0(
|
|
2614
|
+
const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
2601
2615
|
const len2 = WASM_VECTOR_LEN;
|
|
2602
2616
|
let ptr3 = 0;
|
|
2603
2617
|
if (!isLikeNone(_config)) {
|
|
@@ -3902,41 +3916,35 @@ export class WasmPdfDocument {
|
|
|
3902
3916
|
}
|
|
3903
3917
|
}
|
|
3904
3918
|
/**
|
|
3905
|
-
* Extract text using OCR
|
|
3906
|
-
*
|
|
3907
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
3908
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
3919
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
3920
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
3909
3921
|
* @param {number} _page_index
|
|
3910
|
-
* @param {WasmOcrEngine
|
|
3922
|
+
* @param {WasmOcrEngine} _engine
|
|
3911
3923
|
* @returns {string}
|
|
3912
3924
|
*/
|
|
3913
3925
|
extractTextOcr(_page_index, _engine) {
|
|
3914
|
-
let
|
|
3915
|
-
let
|
|
3926
|
+
let deferred2_0;
|
|
3927
|
+
let deferred2_1;
|
|
3916
3928
|
try {
|
|
3917
3929
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3918
|
-
|
|
3919
|
-
|
|
3920
|
-
_assertClass(_engine, WasmOcrEngine);
|
|
3921
|
-
ptr0 = _engine.__destroy_into_raw();
|
|
3922
|
-
}
|
|
3923
|
-
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
|
|
3930
|
+
_assertClass(_engine, WasmOcrEngine);
|
|
3931
|
+
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
|
|
3924
3932
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3925
3933
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3926
3934
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3927
3935
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3928
|
-
var
|
|
3929
|
-
var
|
|
3936
|
+
var ptr1 = r0;
|
|
3937
|
+
var len1 = r1;
|
|
3930
3938
|
if (r3) {
|
|
3931
|
-
|
|
3939
|
+
ptr1 = 0; len1 = 0;
|
|
3932
3940
|
throw takeObject(r2);
|
|
3933
3941
|
}
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
return getStringFromWasm0(
|
|
3942
|
+
deferred2_0 = ptr1;
|
|
3943
|
+
deferred2_1 = len1;
|
|
3944
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3937
3945
|
} finally {
|
|
3938
3946
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3939
|
-
wasm.__wbindgen_export4(
|
|
3947
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3940
3948
|
}
|
|
3941
3949
|
}
|
|
3942
3950
|
/**
|
|
@@ -5695,6 +5703,10 @@ export class WasmPdfPageRegion {
|
|
|
5695
5703
|
}
|
|
5696
5704
|
/**
|
|
5697
5705
|
* Extract text using OCR from this region.
|
|
5706
|
+
*
|
|
5707
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
5708
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
5709
|
+
* (#524 follow-up).
|
|
5698
5710
|
* @param {WasmOcrEngine | null} [_engine]
|
|
5699
5711
|
* @returns {string}
|
|
5700
5712
|
*/
|
package/bundler/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|
package/nodejs/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -1017,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
1017
1031
|
*/
|
|
1018
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
1019
1033
|
/**
|
|
1020
|
-
* Extract text using OCR
|
|
1021
|
-
*
|
|
1022
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1023
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1024
1036
|
*/
|
|
1025
|
-
extractTextOcr(_page_index: number, _engine
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1026
1038
|
/**
|
|
1027
1039
|
* Extract word-level data from a page.
|
|
1028
1040
|
*
|
|
@@ -1445,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1445
1457
|
extractTextLines(): any;
|
|
1446
1458
|
/**
|
|
1447
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1448
1464
|
*/
|
|
1449
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1450
1466
|
/**
|
package/nodejs/pdf_oxide.js
CHANGED
|
@@ -2559,7 +2559,9 @@ if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.
|
|
|
2559
2559
|
exports.WasmHeader = WasmHeader;
|
|
2560
2560
|
|
|
2561
2561
|
/**
|
|
2562
|
-
* OCR configuration for WebAssembly.
|
|
2562
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
2563
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
2564
|
+
* matures, #524.)
|
|
2563
2565
|
*/
|
|
2564
2566
|
class WasmOcrConfig {
|
|
2565
2567
|
__destroy_into_raw() {
|
|
@@ -2586,7 +2588,17 @@ if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prot
|
|
|
2586
2588
|
exports.WasmOcrConfig = WasmOcrConfig;
|
|
2587
2589
|
|
|
2588
2590
|
/**
|
|
2589
|
-
* OCR engine for WebAssembly.
|
|
2591
|
+
* OCR engine for WebAssembly (#524).
|
|
2592
|
+
*
|
|
2593
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
2594
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
2595
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
2596
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
2597
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
2598
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
2599
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
2600
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
2601
|
+
* explaining this).
|
|
2590
2602
|
*/
|
|
2591
2603
|
class WasmOcrEngine {
|
|
2592
2604
|
__destroy_into_raw() {
|
|
@@ -2600,20 +2612,22 @@ class WasmOcrEngine {
|
|
|
2600
2612
|
wasm.__wbg_wasmocrengine_free(ptr, 0);
|
|
2601
2613
|
}
|
|
2602
2614
|
/**
|
|
2603
|
-
*
|
|
2604
|
-
*
|
|
2605
|
-
*
|
|
2606
|
-
* @param {
|
|
2615
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
2616
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
2617
|
+
* `pdf-oxide-wasm` ships without it.
|
|
2618
|
+
* @param {Uint8Array} _det_model
|
|
2619
|
+
* @param {Uint8Array} _rec_model
|
|
2620
|
+
* @param {string} _dict
|
|
2607
2621
|
* @param {WasmOcrConfig | null} [_config]
|
|
2608
2622
|
*/
|
|
2609
|
-
constructor(
|
|
2623
|
+
constructor(_det_model, _rec_model, _dict, _config) {
|
|
2610
2624
|
try {
|
|
2611
2625
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
2612
|
-
const ptr0 =
|
|
2626
|
+
const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
|
|
2613
2627
|
const len0 = WASM_VECTOR_LEN;
|
|
2614
|
-
const ptr1 =
|
|
2628
|
+
const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
|
|
2615
2629
|
const len1 = WASM_VECTOR_LEN;
|
|
2616
|
-
const ptr2 = passStringToWasm0(
|
|
2630
|
+
const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
2617
2631
|
const len2 = WASM_VECTOR_LEN;
|
|
2618
2632
|
let ptr3 = 0;
|
|
2619
2633
|
if (!isLikeNone(_config)) {
|
|
@@ -3921,41 +3935,35 @@ class WasmPdfDocument {
|
|
|
3921
3935
|
}
|
|
3922
3936
|
}
|
|
3923
3937
|
/**
|
|
3924
|
-
* Extract text using OCR
|
|
3925
|
-
*
|
|
3926
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
3927
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
3938
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
3939
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
3928
3940
|
* @param {number} _page_index
|
|
3929
|
-
* @param {WasmOcrEngine
|
|
3941
|
+
* @param {WasmOcrEngine} _engine
|
|
3930
3942
|
* @returns {string}
|
|
3931
3943
|
*/
|
|
3932
3944
|
extractTextOcr(_page_index, _engine) {
|
|
3933
|
-
let
|
|
3934
|
-
let
|
|
3945
|
+
let deferred2_0;
|
|
3946
|
+
let deferred2_1;
|
|
3935
3947
|
try {
|
|
3936
3948
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
_assertClass(_engine, WasmOcrEngine);
|
|
3940
|
-
ptr0 = _engine.__destroy_into_raw();
|
|
3941
|
-
}
|
|
3942
|
-
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
|
|
3949
|
+
_assertClass(_engine, WasmOcrEngine);
|
|
3950
|
+
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
|
|
3943
3951
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3944
3952
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3945
3953
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3946
3954
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3947
|
-
var
|
|
3948
|
-
var
|
|
3955
|
+
var ptr1 = r0;
|
|
3956
|
+
var len1 = r1;
|
|
3949
3957
|
if (r3) {
|
|
3950
|
-
|
|
3958
|
+
ptr1 = 0; len1 = 0;
|
|
3951
3959
|
throw takeObject(r2);
|
|
3952
3960
|
}
|
|
3953
|
-
|
|
3954
|
-
|
|
3955
|
-
return getStringFromWasm0(
|
|
3961
|
+
deferred2_0 = ptr1;
|
|
3962
|
+
deferred2_1 = len1;
|
|
3963
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3956
3964
|
} finally {
|
|
3957
3965
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3958
|
-
wasm.__wbindgen_export4(
|
|
3966
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3959
3967
|
}
|
|
3960
3968
|
}
|
|
3961
3969
|
/**
|
|
@@ -5715,6 +5723,10 @@ class WasmPdfPageRegion {
|
|
|
5715
5723
|
}
|
|
5716
5724
|
/**
|
|
5717
5725
|
* Extract text using OCR from this region.
|
|
5726
|
+
*
|
|
5727
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
5728
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
5729
|
+
* (#524 follow-up).
|
|
5718
5730
|
* @param {WasmOcrEngine | null} [_engine]
|
|
5719
5731
|
* @returns {string}
|
|
5720
5732
|
*/
|
package/nodejs/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-oxide-wasm",
|
|
3
|
-
"version": "0.3.51",
|
|
3
|
+
"version": "0.3.52",
|
|
4
4
|
"description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
|
|
5
5
|
"license": "MIT OR Apache-2.0",
|
|
6
6
|
"repository": {
|
package/web/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -1017,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
1017
1031
|
*/
|
|
1018
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
1019
1033
|
/**
|
|
1020
|
-
* Extract text using OCR
|
|
1021
|
-
*
|
|
1022
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1023
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1024
1036
|
*/
|
|
1025
|
-
extractTextOcr(_page_index: number, _engine
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1026
1038
|
/**
|
|
1027
1039
|
* Extract word-level data from a page.
|
|
1028
1040
|
*
|
|
@@ -1445,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1445
1457
|
extractTextLines(): any;
|
|
1446
1458
|
/**
|
|
1447
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1448
1464
|
*/
|
|
1449
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1450
1466
|
/**
|
package/web/pdf_oxide.js
CHANGED
|
@@ -2546,7 +2546,9 @@ export class WasmHeader {
|
|
|
2546
2546
|
if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
|
|
2547
2547
|
|
|
2548
2548
|
/**
|
|
2549
|
-
* OCR configuration for WebAssembly.
|
|
2549
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
2550
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
2551
|
+
* matures, #524.)
|
|
2550
2552
|
*/
|
|
2551
2553
|
export class WasmOcrConfig {
|
|
2552
2554
|
__destroy_into_raw() {
|
|
@@ -2572,7 +2574,17 @@ export class WasmOcrConfig {
|
|
|
2572
2574
|
if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
|
|
2573
2575
|
|
|
2574
2576
|
/**
|
|
2575
|
-
* OCR engine for WebAssembly.
|
|
2577
|
+
* OCR engine for WebAssembly (#524).
|
|
2578
|
+
*
|
|
2579
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
2580
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
2581
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
2582
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
2583
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
2584
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
2585
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
2586
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
2587
|
+
* explaining this).
|
|
2576
2588
|
*/
|
|
2577
2589
|
export class WasmOcrEngine {
|
|
2578
2590
|
__destroy_into_raw() {
|
|
@@ -2586,20 +2598,22 @@ export class WasmOcrEngine {
|
|
|
2586
2598
|
wasm.__wbg_wasmocrengine_free(ptr, 0);
|
|
2587
2599
|
}
|
|
2588
2600
|
/**
|
|
2589
|
-
*
|
|
2590
|
-
*
|
|
2591
|
-
*
|
|
2592
|
-
* @param {
|
|
2601
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
2602
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
2603
|
+
* `pdf-oxide-wasm` ships without it.
|
|
2604
|
+
* @param {Uint8Array} _det_model
|
|
2605
|
+
* @param {Uint8Array} _rec_model
|
|
2606
|
+
* @param {string} _dict
|
|
2593
2607
|
* @param {WasmOcrConfig | null} [_config]
|
|
2594
2608
|
*/
|
|
2595
|
-
constructor(
|
|
2609
|
+
constructor(_det_model, _rec_model, _dict, _config) {
|
|
2596
2610
|
try {
|
|
2597
2611
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
2598
|
-
const ptr0 =
|
|
2612
|
+
const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
|
|
2599
2613
|
const len0 = WASM_VECTOR_LEN;
|
|
2600
|
-
const ptr1 =
|
|
2614
|
+
const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
|
|
2601
2615
|
const len1 = WASM_VECTOR_LEN;
|
|
2602
|
-
const ptr2 = passStringToWasm0(
|
|
2616
|
+
const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
2603
2617
|
const len2 = WASM_VECTOR_LEN;
|
|
2604
2618
|
let ptr3 = 0;
|
|
2605
2619
|
if (!isLikeNone(_config)) {
|
|
@@ -3904,41 +3918,35 @@ export class WasmPdfDocument {
|
|
|
3904
3918
|
}
|
|
3905
3919
|
}
|
|
3906
3920
|
/**
|
|
3907
|
-
* Extract text using OCR
|
|
3908
|
-
*
|
|
3909
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
3910
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
3921
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
3922
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
3911
3923
|
* @param {number} _page_index
|
|
3912
|
-
* @param {WasmOcrEngine
|
|
3924
|
+
* @param {WasmOcrEngine} _engine
|
|
3913
3925
|
* @returns {string}
|
|
3914
3926
|
*/
|
|
3915
3927
|
extractTextOcr(_page_index, _engine) {
|
|
3916
|
-
let
|
|
3917
|
-
let
|
|
3928
|
+
let deferred2_0;
|
|
3929
|
+
let deferred2_1;
|
|
3918
3930
|
try {
|
|
3919
3931
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
_assertClass(_engine, WasmOcrEngine);
|
|
3923
|
-
ptr0 = _engine.__destroy_into_raw();
|
|
3924
|
-
}
|
|
3925
|
-
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
|
|
3932
|
+
_assertClass(_engine, WasmOcrEngine);
|
|
3933
|
+
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
|
|
3926
3934
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3927
3935
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3928
3936
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3929
3937
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3930
|
-
var
|
|
3931
|
-
var
|
|
3938
|
+
var ptr1 = r0;
|
|
3939
|
+
var len1 = r1;
|
|
3932
3940
|
if (r3) {
|
|
3933
|
-
|
|
3941
|
+
ptr1 = 0; len1 = 0;
|
|
3934
3942
|
throw takeObject(r2);
|
|
3935
3943
|
}
|
|
3936
|
-
|
|
3937
|
-
|
|
3938
|
-
return getStringFromWasm0(
|
|
3944
|
+
deferred2_0 = ptr1;
|
|
3945
|
+
deferred2_1 = len1;
|
|
3946
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3939
3947
|
} finally {
|
|
3940
3948
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3941
|
-
wasm.__wbindgen_export4(
|
|
3949
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3942
3950
|
}
|
|
3943
3951
|
}
|
|
3944
3952
|
/**
|
|
@@ -5697,6 +5705,10 @@ export class WasmPdfPageRegion {
|
|
|
5697
5705
|
}
|
|
5698
5706
|
/**
|
|
5699
5707
|
* Extract text using OCR from this region.
|
|
5708
|
+
*
|
|
5709
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
5710
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
5711
|
+
* (#524 follow-up).
|
|
5700
5712
|
* @param {WasmOcrEngine | null} [_engine]
|
|
5701
5713
|
* @returns {string}
|
|
5702
5714
|
*/
|
package/web/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|