pdf-oxide-wasm 0.3.50 → 0.3.52

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those versions as they appear in their respective public registries.
package/web/pdf_oxide.js CHANGED
@@ -2546,7 +2546,9 @@ export class WasmHeader {
2546
2546
  if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
2547
2547
 
2548
2548
  /**
2549
- * OCR configuration for WebAssembly.
2549
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
2550
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
2551
+ * matures, #524.)
2550
2552
  */
2551
2553
  export class WasmOcrConfig {
2552
2554
  __destroy_into_raw() {
@@ -2572,7 +2574,17 @@ export class WasmOcrConfig {
2572
2574
  if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
2573
2575
 
2574
2576
  /**
2575
- * OCR engine for WebAssembly.
2577
+ * OCR engine for WebAssembly (#524).
2578
+ *
2579
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
2580
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
2581
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
2582
+ * files and the char dictionary (see `modelManifest()` for the URLs)
2583
+ * — typically `fetch()` + the Cache API / IndexedDB for the
2584
+ * tens-of-MB models — then hands the bytes to the constructor. This
2585
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
2586
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
2587
+ * explaining this).
2576
2588
  */
2577
2589
  export class WasmOcrEngine {
2578
2590
  __destroy_into_raw() {
@@ -2586,20 +2598,22 @@ export class WasmOcrEngine {
2586
2598
  wasm.__wbg_wasmocrengine_free(ptr, 0);
2587
2599
  }
2588
2600
  /**
2589
- * Create a new OCR engine.
2590
- * @param {string} _det_model_path
2591
- * @param {string} _rec_model_path
2592
- * @param {string} _dict_path
2601
+ * Not available in this build. OCR needs the `wasm-ocr` build of
2602
+ * `pdf-oxide` (the pure-Rust tract backend); the default
2603
+ * `pdf-oxide-wasm` ships without it.
2604
+ * @param {Uint8Array} _det_model
2605
+ * @param {Uint8Array} _rec_model
2606
+ * @param {string} _dict
2593
2607
  * @param {WasmOcrConfig | null} [_config]
2594
2608
  */
2595
- constructor(_det_model_path, _rec_model_path, _dict_path, _config) {
2609
+ constructor(_det_model, _rec_model, _dict, _config) {
2596
2610
  try {
2597
2611
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
2598
- const ptr0 = passStringToWasm0(_det_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2612
+ const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
2599
2613
  const len0 = WASM_VECTOR_LEN;
2600
- const ptr1 = passStringToWasm0(_rec_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2614
+ const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
2601
2615
  const len1 = WASM_VECTOR_LEN;
2602
- const ptr2 = passStringToWasm0(_dict_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2616
+ const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2603
2617
  const len2 = WASM_VECTOR_LEN;
2604
2618
  let ptr3 = 0;
2605
2619
  if (!isLikeNone(_config)) {
@@ -3127,6 +3141,64 @@ export class WasmPdfDocument {
3127
3141
  wasm.__wbindgen_add_to_stack_pointer(16);
3128
3142
  }
3129
3143
  }
3144
+ /**
3145
+ * Cheap per-page text-vs-OCR classification → JSON
3146
+ * `DocumentClassification`.
3147
+ * @returns {string}
3148
+ */
3149
+ classifyDocument() {
3150
+ let deferred2_0;
3151
+ let deferred2_1;
3152
+ try {
3153
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3154
+ wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
3155
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3156
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3157
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3158
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3159
+ var ptr1 = r0;
3160
+ var len1 = r1;
3161
+ if (r3) {
3162
+ ptr1 = 0; len1 = 0;
3163
+ throw takeObject(r2);
3164
+ }
3165
+ deferred2_0 = ptr1;
3166
+ deferred2_1 = len1;
3167
+ return getStringFromWasm0(ptr1, len1);
3168
+ } finally {
3169
+ wasm.__wbindgen_add_to_stack_pointer(16);
3170
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3171
+ }
3172
+ }
3173
+ /**
3174
+ * Cheap per-page classification → JSON `PageClassification`.
3175
+ * @param {number} page_index
3176
+ * @returns {string}
3177
+ */
3178
+ classifyPage(page_index) {
3179
+ let deferred2_0;
3180
+ let deferred2_1;
3181
+ try {
3182
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3183
+ wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
3184
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3185
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3186
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3187
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3188
+ var ptr1 = r0;
3189
+ var len1 = r1;
3190
+ if (r3) {
3191
+ ptr1 = 0; len1 = 0;
3192
+ throw takeObject(r2);
3193
+ }
3194
+ deferred2_0 = ptr1;
3195
+ deferred2_1 = len1;
3196
+ return getStringFromWasm0(ptr1, len1);
3197
+ } finally {
3198
+ wasm.__wbindgen_add_to_stack_pointer(16);
3199
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3200
+ }
3201
+ }
3130
3202
  /**
3131
3203
  * Clear all pending erase operations for a page.
3132
3204
  * @param {number} page_index
@@ -3555,6 +3627,40 @@ export class WasmPdfDocument {
3555
3627
  wasm.__wbindgen_add_to_stack_pointer(16);
3556
3628
  }
3557
3629
  }
3630
+ /**
3631
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
3632
+ * bbox + typed reason). `optionsJson` is `{}`-tolerant
3633
+ * `AutoExtractOptions`; undefined/empty → defaults.
3634
+ * @param {number} page_index
3635
+ * @param {string | null} [options_json]
3636
+ * @returns {string}
3637
+ */
3638
+ extractPageAuto(page_index, options_json) {
3639
+ let deferred3_0;
3640
+ let deferred3_1;
3641
+ try {
3642
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3643
+ var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
3644
+ var len0 = WASM_VECTOR_LEN;
3645
+ wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
3646
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3647
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3648
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3649
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3650
+ var ptr2 = r0;
3651
+ var len2 = r1;
3652
+ if (r3) {
3653
+ ptr2 = 0; len2 = 0;
3654
+ throw takeObject(r2);
3655
+ }
3656
+ deferred3_0 = ptr2;
3657
+ deferred3_1 = len2;
3658
+ return getStringFromWasm0(ptr2, len2);
3659
+ } finally {
3660
+ wasm.__wbindgen_add_to_stack_pointer(16);
3661
+ wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3662
+ }
3663
+ }
3558
3664
  /**
3559
3665
  * Extract complete page text data in a single call.
3560
3666
  *
@@ -3756,6 +3862,36 @@ export class WasmPdfDocument {
3756
3862
  wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3757
3863
  }
3758
3864
  }
3865
+ /**
3866
+ * One-shot auto text extraction — graceful native fallback (never
3867
+ * the opaque OCR error #513).
3868
+ * @param {number} page_index
3869
+ * @returns {string}
3870
+ */
3871
+ extractTextAuto(page_index) {
3872
+ let deferred2_0;
3873
+ let deferred2_1;
3874
+ try {
3875
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3876
+ wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
3877
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3878
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3879
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3880
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3881
+ var ptr1 = r0;
3882
+ var len1 = r1;
3883
+ if (r3) {
3884
+ ptr1 = 0; len1 = 0;
3885
+ throw takeObject(r2);
3886
+ }
3887
+ deferred2_0 = ptr1;
3888
+ deferred2_1 = len1;
3889
+ return getStringFromWasm0(ptr1, len1);
3890
+ } finally {
3891
+ wasm.__wbindgen_add_to_stack_pointer(16);
3892
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3893
+ }
3894
+ }
3759
3895
  /**
3760
3896
  * Extract text lines from a page.
3761
3897
  *
@@ -3782,41 +3918,35 @@ export class WasmPdfDocument {
3782
3918
  }
3783
3919
  }
3784
3920
  /**
3785
- * Extract text using OCR (optical character recognition).
3786
- *
3787
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
3788
- * ONNX Runtime support for the web backend in the current implementation.
3921
+ * Extract text using OCR. Not available in this build — OCR needs
3922
+ * the `wasm-ocr` build of `pdf-oxide`.
3789
3923
  * @param {number} _page_index
3790
- * @param {WasmOcrEngine | null} [_engine]
3924
+ * @param {WasmOcrEngine} _engine
3791
3925
  * @returns {string}
3792
3926
  */
3793
3927
  extractTextOcr(_page_index, _engine) {
3794
- let deferred3_0;
3795
- let deferred3_1;
3928
+ let deferred2_0;
3929
+ let deferred2_1;
3796
3930
  try {
3797
3931
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3798
- let ptr0 = 0;
3799
- if (!isLikeNone(_engine)) {
3800
- _assertClass(_engine, WasmOcrEngine);
3801
- ptr0 = _engine.__destroy_into_raw();
3802
- }
3803
- wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
3932
+ _assertClass(_engine, WasmOcrEngine);
3933
+ wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
3804
3934
  var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3805
3935
  var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3806
3936
  var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3807
3937
  var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3808
- var ptr2 = r0;
3809
- var len2 = r1;
3938
+ var ptr1 = r0;
3939
+ var len1 = r1;
3810
3940
  if (r3) {
3811
- ptr2 = 0; len2 = 0;
3941
+ ptr1 = 0; len1 = 0;
3812
3942
  throw takeObject(r2);
3813
3943
  }
3814
- deferred3_0 = ptr2;
3815
- deferred3_1 = len2;
3816
- return getStringFromWasm0(ptr2, len2);
3944
+ deferred2_0 = ptr1;
3945
+ deferred2_1 = len1;
3946
+ return getStringFromWasm0(ptr1, len1);
3817
3947
  } finally {
3818
3948
  wasm.__wbindgen_add_to_stack_pointer(16);
3819
- wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3949
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3820
3950
  }
3821
3951
  }
3822
3952
  /**
@@ -5575,6 +5705,10 @@ export class WasmPdfPageRegion {
5575
5705
  }
5576
5706
  /**
5577
5707
  * Extract text using OCR from this region.
5708
+ *
5709
+ * Region-scoped OCR is not wired yet; use the page-level
5710
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
5711
+ * (#524 follow-up).
5578
5712
  * @param {WasmOcrEngine | null} [_engine]
5579
5713
  * @returns {string}
5580
5714
  */
@@ -6155,6 +6289,36 @@ export function hasDocumentTimestamp(pdf_data) {
6155
6289
  return ret !== 0;
6156
6290
  }
6157
6291
 
6292
+ /**
6293
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
6294
+ * supported language's cache filenames and source URLs).
6295
+ *
6296
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
6297
+ * or network-to-disk, so a download-to-cache prefetch cannot run
6298
+ * here. This manifest is informational — it lets the JS host learn
6299
+ * which model files/URLs to fetch and bundle (or ship out of band)
6300
+ * before driving OCR. There is intentionally no `prefetchModels` in
6301
+ * the WASM surface (see `prefetchAvailable`, which always returns
6302
+ * `false`).
6303
+ * @returns {string}
6304
+ */
6305
+ export function modelManifest() {
6306
+ let deferred1_0;
6307
+ let deferred1_1;
6308
+ try {
6309
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
6310
+ wasm.modelManifest(retptr);
6311
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
6312
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
6313
+ deferred1_0 = r0;
6314
+ deferred1_1 = r1;
6315
+ return getStringFromWasm0(r0, r1);
6316
+ } finally {
6317
+ wasm.__wbindgen_add_to_stack_pointer(16);
6318
+ wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
6319
+ }
6320
+ }
6321
+
6158
6322
  /**
6159
6323
  * Plan a bookmark split without producing PDFs. Returns a JSON array
6160
6324
  * of segment objects (`index, startPage…` shape from
@@ -6186,6 +6350,17 @@ export function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level
6186
6350
  }
6187
6351
  }
6188
6352
 
6353
+ /**
6354
+ * #519: Whether this build can download OCR models to a local cache.
6355
+ * Always `false` in WASM — provisioning is host-side (see
6356
+ * `modelManifest`).
6357
+ * @returns {boolean}
6358
+ */
6359
+ export function prefetchAvailable() {
6360
+ const ret = wasm.prefetchAvailable();
6361
+ return ret !== 0;
6362
+ }
6363
+
6189
6364
  /**
6190
6365
  * Install the process-wide runtime crypto policy from its grammar
6191
6366
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
Binary file
@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
23
23
  export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
24
24
  export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
25
25
  export const hasDocumentTimestamp: (a: number, b: number) => number;
26
+ export const modelManifest: (a: number) => void;
26
27
  export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
28
+ export const prefetchAvailable: () => number;
27
29
  export const setCryptoPolicy: (a: number, b: number, c: number) => void;
28
30
  export const setLogLevel: (a: number, b: number, c: number) => void;
29
31
  export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
158
160
  export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
159
161
  export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
160
162
  export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
163
+ export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
164
+ export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
161
165
  export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
162
166
  export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
163
167
  export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
177
181
  export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
178
182
  export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
179
183
  export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
184
+ export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
180
185
  export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
181
186
  export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
182
187
  export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
184
189
  export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
185
190
  export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
186
191
  export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
192
+ export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
187
193
  export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
188
194
  export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
189
195
  export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;