pdf-oxide-wasm 0.3.50 → 0.3.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundler/pdf_oxide.d.ts +66 -9
- package/bundler/pdf_oxide.js +1 -1
- package/bundler/pdf_oxide_bg.js +205 -30
- package/bundler/pdf_oxide_bg.wasm +0 -0
- package/bundler/pdf_oxide_bg.wasm.d.ts +6 -0
- package/nodejs/pdf_oxide.d.ts +66 -9
- package/nodejs/pdf_oxide.js +207 -30
- package/nodejs/pdf_oxide_bg.wasm +0 -0
- package/nodejs/pdf_oxide_bg.wasm.d.ts +6 -0
- package/package.json +1 -1
- package/web/pdf_oxide.d.ts +72 -9
- package/web/pdf_oxide.js +205 -30
- package/web/pdf_oxide_bg.wasm +0 -0
- package/web/pdf_oxide_bg.wasm.d.ts +6 -0
package/nodejs/pdf_oxide.js
CHANGED
|
@@ -2559,7 +2559,9 @@ if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.
|
|
|
2559
2559
|
exports.WasmHeader = WasmHeader;
|
|
2560
2560
|
|
|
2561
2561
|
/**
|
|
2562
|
-
* OCR configuration for WebAssembly.
|
|
2562
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
2563
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
2564
|
+
* matures, #524.)
|
|
2563
2565
|
*/
|
|
2564
2566
|
class WasmOcrConfig {
|
|
2565
2567
|
__destroy_into_raw() {
|
|
@@ -2586,7 +2588,17 @@ if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prot
|
|
|
2586
2588
|
exports.WasmOcrConfig = WasmOcrConfig;
|
|
2587
2589
|
|
|
2588
2590
|
/**
|
|
2589
|
-
* OCR engine for WebAssembly.
|
|
2591
|
+
* OCR engine for WebAssembly (#524).
|
|
2592
|
+
*
|
|
2593
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
2594
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
2595
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
2596
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
2597
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
2598
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
2599
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
2600
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
2601
|
+
* explaining this).
|
|
2590
2602
|
*/
|
|
2591
2603
|
class WasmOcrEngine {
|
|
2592
2604
|
__destroy_into_raw() {
|
|
@@ -2600,20 +2612,22 @@ class WasmOcrEngine {
|
|
|
2600
2612
|
wasm.__wbg_wasmocrengine_free(ptr, 0);
|
|
2601
2613
|
}
|
|
2602
2614
|
/**
|
|
2603
|
-
*
|
|
2604
|
-
*
|
|
2605
|
-
*
|
|
2606
|
-
* @param {
|
|
2615
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
2616
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
2617
|
+
* `pdf-oxide-wasm` ships without it.
|
|
2618
|
+
* @param {Uint8Array} _det_model
|
|
2619
|
+
* @param {Uint8Array} _rec_model
|
|
2620
|
+
* @param {string} _dict
|
|
2607
2621
|
* @param {WasmOcrConfig | null} [_config]
|
|
2608
2622
|
*/
|
|
2609
|
-
constructor(
|
|
2623
|
+
constructor(_det_model, _rec_model, _dict, _config) {
|
|
2610
2624
|
try {
|
|
2611
2625
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
2612
|
-
const ptr0 =
|
|
2626
|
+
const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
|
|
2613
2627
|
const len0 = WASM_VECTOR_LEN;
|
|
2614
|
-
const ptr1 =
|
|
2628
|
+
const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
|
|
2615
2629
|
const len1 = WASM_VECTOR_LEN;
|
|
2616
|
-
const ptr2 = passStringToWasm0(
|
|
2630
|
+
const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
2617
2631
|
const len2 = WASM_VECTOR_LEN;
|
|
2618
2632
|
let ptr3 = 0;
|
|
2619
2633
|
if (!isLikeNone(_config)) {
|
|
@@ -3144,6 +3158,64 @@ class WasmPdfDocument {
|
|
|
3144
3158
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3145
3159
|
}
|
|
3146
3160
|
}
|
|
3161
|
+
/**
|
|
3162
|
+
* Cheap per-page text-vs-OCR classification → JSON
|
|
3163
|
+
* `DocumentClassification`.
|
|
3164
|
+
* @returns {string}
|
|
3165
|
+
*/
|
|
3166
|
+
classifyDocument() {
|
|
3167
|
+
let deferred2_0;
|
|
3168
|
+
let deferred2_1;
|
|
3169
|
+
try {
|
|
3170
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3171
|
+
wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
|
|
3172
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3173
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3174
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3175
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3176
|
+
var ptr1 = r0;
|
|
3177
|
+
var len1 = r1;
|
|
3178
|
+
if (r3) {
|
|
3179
|
+
ptr1 = 0; len1 = 0;
|
|
3180
|
+
throw takeObject(r2);
|
|
3181
|
+
}
|
|
3182
|
+
deferred2_0 = ptr1;
|
|
3183
|
+
deferred2_1 = len1;
|
|
3184
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3185
|
+
} finally {
|
|
3186
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3187
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3188
|
+
}
|
|
3189
|
+
}
|
|
3190
|
+
/**
|
|
3191
|
+
* Cheap per-page classification → JSON `PageClassification`.
|
|
3192
|
+
* @param {number} page_index
|
|
3193
|
+
* @returns {string}
|
|
3194
|
+
*/
|
|
3195
|
+
classifyPage(page_index) {
|
|
3196
|
+
let deferred2_0;
|
|
3197
|
+
let deferred2_1;
|
|
3198
|
+
try {
|
|
3199
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3200
|
+
wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
|
|
3201
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3202
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3203
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3204
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3205
|
+
var ptr1 = r0;
|
|
3206
|
+
var len1 = r1;
|
|
3207
|
+
if (r3) {
|
|
3208
|
+
ptr1 = 0; len1 = 0;
|
|
3209
|
+
throw takeObject(r2);
|
|
3210
|
+
}
|
|
3211
|
+
deferred2_0 = ptr1;
|
|
3212
|
+
deferred2_1 = len1;
|
|
3213
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3214
|
+
} finally {
|
|
3215
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3216
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3217
|
+
}
|
|
3218
|
+
}
|
|
3147
3219
|
/**
|
|
3148
3220
|
* Clear all pending erase operations for a page.
|
|
3149
3221
|
* @param {number} page_index
|
|
@@ -3572,6 +3644,40 @@ class WasmPdfDocument {
|
|
|
3572
3644
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3573
3645
|
}
|
|
3574
3646
|
}
|
|
3647
|
+
/**
|
|
3648
|
+
* Rich per-page extraction → JSON `PageExtraction` (per-region
|
|
3649
|
+
* bbox + typed reason). `optionsJson` is `{}`-tolerant
|
|
3650
|
+
* `AutoExtractOptions`; undefined/empty → defaults.
|
|
3651
|
+
* @param {number} page_index
|
|
3652
|
+
* @param {string | null} [options_json]
|
|
3653
|
+
* @returns {string}
|
|
3654
|
+
*/
|
|
3655
|
+
extractPageAuto(page_index, options_json) {
|
|
3656
|
+
let deferred3_0;
|
|
3657
|
+
let deferred3_1;
|
|
3658
|
+
try {
|
|
3659
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3660
|
+
var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
3661
|
+
var len0 = WASM_VECTOR_LEN;
|
|
3662
|
+
wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
|
|
3663
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3664
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3665
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3666
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3667
|
+
var ptr2 = r0;
|
|
3668
|
+
var len2 = r1;
|
|
3669
|
+
if (r3) {
|
|
3670
|
+
ptr2 = 0; len2 = 0;
|
|
3671
|
+
throw takeObject(r2);
|
|
3672
|
+
}
|
|
3673
|
+
deferred3_0 = ptr2;
|
|
3674
|
+
deferred3_1 = len2;
|
|
3675
|
+
return getStringFromWasm0(ptr2, len2);
|
|
3676
|
+
} finally {
|
|
3677
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3678
|
+
wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
|
|
3679
|
+
}
|
|
3680
|
+
}
|
|
3575
3681
|
/**
|
|
3576
3682
|
* Extract complete page text data in a single call.
|
|
3577
3683
|
*
|
|
@@ -3773,6 +3879,36 @@ class WasmPdfDocument {
|
|
|
3773
3879
|
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3774
3880
|
}
|
|
3775
3881
|
}
|
|
3882
|
+
/**
|
|
3883
|
+
* One-shot auto text extraction — graceful native fallback (never
|
|
3884
|
+
* the opaque OCR error #513).
|
|
3885
|
+
* @param {number} page_index
|
|
3886
|
+
* @returns {string}
|
|
3887
|
+
*/
|
|
3888
|
+
extractTextAuto(page_index) {
|
|
3889
|
+
let deferred2_0;
|
|
3890
|
+
let deferred2_1;
|
|
3891
|
+
try {
|
|
3892
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3893
|
+
wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
|
|
3894
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3895
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3896
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3897
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3898
|
+
var ptr1 = r0;
|
|
3899
|
+
var len1 = r1;
|
|
3900
|
+
if (r3) {
|
|
3901
|
+
ptr1 = 0; len1 = 0;
|
|
3902
|
+
throw takeObject(r2);
|
|
3903
|
+
}
|
|
3904
|
+
deferred2_0 = ptr1;
|
|
3905
|
+
deferred2_1 = len1;
|
|
3906
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3907
|
+
} finally {
|
|
3908
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3909
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3910
|
+
}
|
|
3911
|
+
}
|
|
3776
3912
|
/**
|
|
3777
3913
|
* Extract text lines from a page.
|
|
3778
3914
|
*
|
|
@@ -3799,41 +3935,35 @@ class WasmPdfDocument {
|
|
|
3799
3935
|
}
|
|
3800
3936
|
}
|
|
3801
3937
|
/**
|
|
3802
|
-
* Extract text using OCR
|
|
3803
|
-
*
|
|
3804
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
3805
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
3938
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
3939
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
3806
3940
|
* @param {number} _page_index
|
|
3807
|
-
* @param {WasmOcrEngine | null} [_engine]
|
|
3941
|
+
* @param {WasmOcrEngine} _engine
|
|
3808
3942
|
* @returns {string}
|
|
3809
3943
|
*/
|
|
3810
3944
|
extractTextOcr(_page_index, _engine) {
|
|
3811
|
-
let
|
|
3812
|
-
let
|
|
3945
|
+
let deferred2_0;
|
|
3946
|
+
let deferred2_1;
|
|
3813
3947
|
try {
|
|
3814
3948
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3815
|
-
|
|
3816
|
-
|
|
3817
|
-
_assertClass(_engine, WasmOcrEngine);
|
|
3818
|
-
ptr0 = _engine.__destroy_into_raw();
|
|
3819
|
-
}
|
|
3820
|
-
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
|
|
3949
|
+
_assertClass(_engine, WasmOcrEngine);
|
|
3950
|
+
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
|
|
3821
3951
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3822
3952
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3823
3953
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3824
3954
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3825
|
-
var
|
|
3826
|
-
var
|
|
3955
|
+
var ptr1 = r0;
|
|
3956
|
+
var len1 = r1;
|
|
3827
3957
|
if (r3) {
|
|
3828
|
-
|
|
3958
|
+
ptr1 = 0; len1 = 0;
|
|
3829
3959
|
throw takeObject(r2);
|
|
3830
3960
|
}
|
|
3831
|
-
|
|
3832
|
-
|
|
3833
|
-
return getStringFromWasm0(
|
|
3961
|
+
deferred2_0 = ptr1;
|
|
3962
|
+
deferred2_1 = len1;
|
|
3963
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3834
3964
|
} finally {
|
|
3835
3965
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3836
|
-
wasm.__wbindgen_export4(
|
|
3966
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3837
3967
|
}
|
|
3838
3968
|
}
|
|
3839
3969
|
/**
|
|
@@ -5593,6 +5723,10 @@ class WasmPdfPageRegion {
|
|
|
5593
5723
|
}
|
|
5594
5724
|
/**
|
|
5595
5725
|
* Extract text using OCR from this region.
|
|
5726
|
+
*
|
|
5727
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
5728
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
5729
|
+
* (#524 follow-up).
|
|
5596
5730
|
* @param {WasmOcrEngine | null} [_engine]
|
|
5597
5731
|
* @returns {string}
|
|
5598
5732
|
*/
|
|
@@ -6183,6 +6317,37 @@ function hasDocumentTimestamp(pdf_data) {
|
|
|
6183
6317
|
}
|
|
6184
6318
|
exports.hasDocumentTimestamp = hasDocumentTimestamp;
|
|
6185
6319
|
|
|
6320
|
+
/**
|
|
6321
|
+
* #519: Air-gapped OCR model manifest — JSON (detector + every
|
|
6322
|
+
* supported language's cache filenames and source URLs).
|
|
6323
|
+
*
|
|
6324
|
+
* WASM provisioning is **host-side**: browser/WASM has no filesystem
|
|
6325
|
+
* or network-to-disk, so a download-to-cache prefetch cannot run
|
|
6326
|
+
* here. This manifest is informational — it lets the JS host learn
|
|
6327
|
+
* which model files/URLs to fetch and bundle (or ship out of band)
|
|
6328
|
+
* before driving OCR. There is intentionally no `prefetchModels` in
|
|
6329
|
+
* the WASM surface (see `prefetchAvailable`, which always returns
|
|
6330
|
+
* `false`).
|
|
6331
|
+
* @returns {string}
|
|
6332
|
+
*/
|
|
6333
|
+
function modelManifest() {
|
|
6334
|
+
let deferred1_0;
|
|
6335
|
+
let deferred1_1;
|
|
6336
|
+
try {
|
|
6337
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
6338
|
+
wasm.modelManifest(retptr);
|
|
6339
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
6340
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
6341
|
+
deferred1_0 = r0;
|
|
6342
|
+
deferred1_1 = r1;
|
|
6343
|
+
return getStringFromWasm0(r0, r1);
|
|
6344
|
+
} finally {
|
|
6345
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
6346
|
+
wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
|
|
6347
|
+
}
|
|
6348
|
+
}
|
|
6349
|
+
exports.modelManifest = modelManifest;
|
|
6350
|
+
|
|
6186
6351
|
/**
|
|
6187
6352
|
* Plan a bookmark split without producing PDFs. Returns a JSON array
|
|
6188
6353
|
* of segment objects (`index, startPage…` shape from
|
|
@@ -6215,6 +6380,18 @@ function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level, inclu
|
|
|
6215
6380
|
}
|
|
6216
6381
|
exports.planSplitByBookmarks = planSplitByBookmarks;
|
|
6217
6382
|
|
|
6383
|
+
/**
|
|
6384
|
+
* #519: Whether this build can download OCR models to a local cache.
|
|
6385
|
+
* Always `false` in WASM — provisioning is host-side (see
|
|
6386
|
+
* `modelManifest`).
|
|
6387
|
+
* @returns {boolean}
|
|
6388
|
+
*/
|
|
6389
|
+
function prefetchAvailable() {
|
|
6390
|
+
const ret = wasm.prefetchAvailable();
|
|
6391
|
+
return ret !== 0;
|
|
6392
|
+
}
|
|
6393
|
+
exports.prefetchAvailable = prefetchAvailable;
|
|
6394
|
+
|
|
6218
6395
|
/**
|
|
6219
6396
|
* Install the process-wide runtime crypto policy from its grammar
|
|
6220
6397
|
* string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
|
package/nodejs/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|
|
package/nodejs/pdf_oxide_bg.wasm.d.ts
CHANGED
|
@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
|
|
|
23
23
|
export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
|
|
24
24
|
export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
25
25
|
export const hasDocumentTimestamp: (a: number, b: number) => number;
|
|
26
|
+
export const modelManifest: (a: number) => void;
|
|
26
27
|
export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
28
|
+
export const prefetchAvailable: () => number;
|
|
27
29
|
export const setCryptoPolicy: (a: number, b: number, c: number) => void;
|
|
28
30
|
export const setLogLevel: (a: number, b: number, c: number) => void;
|
|
29
31
|
export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
|
|
|
158
160
|
export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
|
|
159
161
|
export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
|
|
160
162
|
export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
|
|
163
|
+
export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
|
|
164
|
+
export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
|
|
161
165
|
export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
|
|
162
166
|
export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
|
|
163
167
|
export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
|
|
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
|
|
|
177
181
|
export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
|
|
178
182
|
export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
179
183
|
export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
184
|
+
export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
180
185
|
export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
181
186
|
export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
|
|
182
187
|
export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
|
|
|
184
189
|
export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
|
|
185
190
|
export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
186
191
|
export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
|
|
192
|
+
export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
|
|
187
193
|
export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
188
194
|
export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
|
|
189
195
|
export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-oxide-wasm",
|
|
3
|
-
"version": "0.3.50",
|
|
3
|
+
"version": "0.3.52",
|
|
4
4
|
"description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
|
|
5
5
|
"license": "MIT OR Apache-2.0",
|
|
6
6
|
"repository": {
|
package/web/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
|
|
|
815
829
|
* @returns true if authentication succeeded
|
|
816
830
|
*/
|
|
817
831
|
authenticate(password: string): boolean;
|
|
832
|
+
/**
|
|
833
|
+
* Cheap per-page text-vs-OCR classification → JSON
|
|
834
|
+
* `DocumentClassification`.
|
|
835
|
+
*/
|
|
836
|
+
classifyDocument(): string;
|
|
837
|
+
/**
|
|
838
|
+
* Cheap per-page classification → JSON `PageClassification`.
|
|
839
|
+
*/
|
|
840
|
+
classifyPage(page_index: number): string;
|
|
818
841
|
/**
|
|
819
842
|
* Clear all pending erase operations for a page.
|
|
820
843
|
*/
|
|
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
|
|
|
934
957
|
* @returns Array of path objects
|
|
935
958
|
*/
|
|
936
959
|
extractLines(page_index: number, region?: Float32Array | null): any;
|
|
960
|
+
/**
|
|
961
|
+
* Rich per-page extraction → JSON `PageExtraction` (per-region
|
|
962
|
+
* bbox + typed reason). `optionsJson` is `{}`-tolerant
|
|
963
|
+
* `AutoExtractOptions`; undefined/empty → defaults.
|
|
964
|
+
*/
|
|
965
|
+
extractPageAuto(page_index: number, options_json?: string | null): string;
|
|
937
966
|
/**
|
|
938
967
|
* Extract complete page text data in a single call.
|
|
939
968
|
*
|
|
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
|
|
|
990
1019
|
* @param region - Optional [x, y, width, height] to filter by
|
|
991
1020
|
*/
|
|
992
1021
|
extractText(page_index: number, region: any): string;
|
|
1022
|
+
/**
|
|
1023
|
+
* One-shot auto text extraction — graceful native fallback (never
|
|
1024
|
+
* the opaque OCR error #513).
|
|
1025
|
+
*/
|
|
1026
|
+
extractTextAuto(page_index: number): string;
|
|
993
1027
|
/**
|
|
994
1028
|
* Extract text lines from a page.
|
|
995
1029
|
*
|
|
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
997
1031
|
*/
|
|
998
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
999
1033
|
/**
|
|
1000
|
-
* Extract text using OCR
|
|
1001
|
-
*
|
|
1002
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1003
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1004
1036
|
*/
|
|
1005
|
-
extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1006
1038
|
/**
|
|
1007
1039
|
* Extract word-level data from a page.
|
|
1008
1040
|
*
|
|
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1425
1457
|
extractTextLines(): any;
|
|
1426
1458
|
/**
|
|
1427
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1428
1464
|
*/
|
|
1429
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1430
1466
|
/**
|
|
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
|
|
|
1600
1636
|
*/
|
|
1601
1637
|
export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
1602
1638
|
|
|
1639
|
+
/**
|
|
1640
|
+
* #519: Air-gapped OCR model manifest — JSON (detector + every
|
|
1641
|
+
* supported language's cache filenames and source URLs).
|
|
1642
|
+
*
|
|
1643
|
+
* WASM provisioning is **host-side**: browser/WASM has no filesystem
|
|
1644
|
+
* or network-to-disk, so a download-to-cache prefetch cannot run
|
|
1645
|
+
* here. This manifest is informational — it lets the JS host learn
|
|
1646
|
+
* which model files/URLs to fetch and bundle (or ship out of band)
|
|
1647
|
+
* before driving OCR. There is intentionally no `prefetchModels` in
|
|
1648
|
+
* the WASM surface (see `prefetchAvailable`, which always returns
|
|
1649
|
+
* `false`).
|
|
1650
|
+
*/
|
|
1651
|
+
export function modelManifest(): string;
|
|
1652
|
+
|
|
1603
1653
|
/**
|
|
1604
1654
|
* Plan a bookmark split without producing PDFs. Returns a JSON array
|
|
1605
1655
|
* of segment objects (`index, startPage…` shape from
|
|
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
|
1607
1657
|
*/
|
|
1608
1658
|
export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
|
|
1609
1659
|
|
|
1660
|
+
/**
|
|
1661
|
+
* #519: Whether this build can download OCR models to a local cache.
|
|
1662
|
+
* Always `false` in WASM — provisioning is host-side (see
|
|
1663
|
+
* `modelManifest`).
|
|
1664
|
+
*/
|
|
1665
|
+
export function prefetchAvailable(): boolean;
|
|
1666
|
+
|
|
1610
1667
|
/**
|
|
1611
1668
|
* Install the process-wide runtime crypto policy from its grammar
|
|
1612
1669
|
* string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
|
|
@@ -1687,7 +1744,9 @@ export interface InitOutput {
|
|
|
1687
1744
|
readonly generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
|
|
1688
1745
|
readonly generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1689
1746
|
readonly hasDocumentTimestamp: (a: number, b: number) => number;
|
|
1747
|
+
readonly modelManifest: (a: number) => void;
|
|
1690
1748
|
readonly planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
1749
|
+
readonly prefetchAvailable: () => number;
|
|
1691
1750
|
readonly setCryptoPolicy: (a: number, b: number, c: number) => void;
|
|
1692
1751
|
readonly setLogLevel: (a: number, b: number, c: number) => void;
|
|
1693
1752
|
readonly signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
@@ -1822,6 +1881,8 @@ export interface InitOutput {
|
|
|
1822
1881
|
readonly wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
|
|
1823
1882
|
readonly wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
|
|
1824
1883
|
readonly wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
|
|
1884
|
+
readonly wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
|
|
1885
|
+
readonly wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
|
|
1825
1886
|
readonly wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
|
|
1826
1887
|
readonly wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
|
|
1827
1888
|
readonly wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
|
|
@@ -1841,6 +1902,7 @@ export interface InitOutput {
|
|
|
1841
1902
|
readonly wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
|
|
1842
1903
|
readonly wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1843
1904
|
readonly wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1905
|
+
readonly wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1844
1906
|
readonly wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1845
1907
|
readonly wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
|
|
1846
1908
|
readonly wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
@@ -1848,6 +1910,7 @@ export interface InitOutput {
|
|
|
1848
1910
|
readonly wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
|
|
1849
1911
|
readonly wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1850
1912
|
readonly wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
|
|
1913
|
+
readonly wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
|
|
1851
1914
|
readonly wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
1852
1915
|
readonly wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
|
|
1853
1916
|
readonly wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;
|