pdf-oxide-wasm 0.3.50 → 0.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2559,7 +2559,9 @@ if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.
2559
2559
  exports.WasmHeader = WasmHeader;
2560
2560
 
2561
2561
  /**
2562
- * OCR configuration for WebAssembly.
2562
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
2563
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
2564
+ * matures, #524.)
2563
2565
  */
2564
2566
  class WasmOcrConfig {
2565
2567
  __destroy_into_raw() {
@@ -2586,7 +2588,17 @@ if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prot
2586
2588
  exports.WasmOcrConfig = WasmOcrConfig;
2587
2589
 
2588
2590
  /**
2589
- * OCR engine for WebAssembly.
2591
+ * OCR engine for WebAssembly (#524).
2592
+ *
2593
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
2594
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
2595
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
2596
+ * files and the char dictionary (see `modelManifest()` for the URLs)
2597
+ * — typically `fetch()` + the Cache API / IndexedDB for the
2598
+ * tens-of-MB models — then hands the bytes to the constructor. This
2599
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
2600
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
2601
+ * explaining this).
2590
2602
  */
2591
2603
  class WasmOcrEngine {
2592
2604
  __destroy_into_raw() {
@@ -2600,20 +2612,22 @@ class WasmOcrEngine {
2600
2612
  wasm.__wbg_wasmocrengine_free(ptr, 0);
2601
2613
  }
2602
2614
  /**
2603
- * Create a new OCR engine.
2604
- * @param {string} _det_model_path
2605
- * @param {string} _rec_model_path
2606
- * @param {string} _dict_path
2615
+ * Not available in this build. OCR needs the `wasm-ocr` build of
2616
+ * `pdf-oxide` (the pure-Rust tract backend); the default
2617
+ * `pdf-oxide-wasm` ships without it.
2618
+ * @param {Uint8Array} _det_model
2619
+ * @param {Uint8Array} _rec_model
2620
+ * @param {string} _dict
2607
2621
  * @param {WasmOcrConfig | null} [_config]
2608
2622
  */
2609
- constructor(_det_model_path, _rec_model_path, _dict_path, _config) {
2623
+ constructor(_det_model, _rec_model, _dict, _config) {
2610
2624
  try {
2611
2625
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
2612
- const ptr0 = passStringToWasm0(_det_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2626
+ const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
2613
2627
  const len0 = WASM_VECTOR_LEN;
2614
- const ptr1 = passStringToWasm0(_rec_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2628
+ const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
2615
2629
  const len1 = WASM_VECTOR_LEN;
2616
- const ptr2 = passStringToWasm0(_dict_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2630
+ const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2617
2631
  const len2 = WASM_VECTOR_LEN;
2618
2632
  let ptr3 = 0;
2619
2633
  if (!isLikeNone(_config)) {
@@ -3144,6 +3158,64 @@ class WasmPdfDocument {
3144
3158
  wasm.__wbindgen_add_to_stack_pointer(16);
3145
3159
  }
3146
3160
  }
3161
+ /**
3162
+ * Cheap per-page text-vs-OCR classification → JSON
3163
+ * `DocumentClassification`.
3164
+ * @returns {string}
3165
+ */
3166
+ classifyDocument() {
3167
+ let deferred2_0;
3168
+ let deferred2_1;
3169
+ try {
3170
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3171
+ wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
3172
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3173
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3174
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3175
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3176
+ var ptr1 = r0;
3177
+ var len1 = r1;
3178
+ if (r3) {
3179
+ ptr1 = 0; len1 = 0;
3180
+ throw takeObject(r2);
3181
+ }
3182
+ deferred2_0 = ptr1;
3183
+ deferred2_1 = len1;
3184
+ return getStringFromWasm0(ptr1, len1);
3185
+ } finally {
3186
+ wasm.__wbindgen_add_to_stack_pointer(16);
3187
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3188
+ }
3189
+ }
3190
+ /**
3191
+ * Cheap per-page classification → JSON `PageClassification`.
3192
+ * @param {number} page_index
3193
+ * @returns {string}
3194
+ */
3195
+ classifyPage(page_index) {
3196
+ let deferred2_0;
3197
+ let deferred2_1;
3198
+ try {
3199
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3200
+ wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
3201
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3202
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3203
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3204
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3205
+ var ptr1 = r0;
3206
+ var len1 = r1;
3207
+ if (r3) {
3208
+ ptr1 = 0; len1 = 0;
3209
+ throw takeObject(r2);
3210
+ }
3211
+ deferred2_0 = ptr1;
3212
+ deferred2_1 = len1;
3213
+ return getStringFromWasm0(ptr1, len1);
3214
+ } finally {
3215
+ wasm.__wbindgen_add_to_stack_pointer(16);
3216
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3217
+ }
3218
+ }
3147
3219
  /**
3148
3220
  * Clear all pending erase operations for a page.
3149
3221
  * @param {number} page_index
@@ -3572,6 +3644,40 @@ class WasmPdfDocument {
3572
3644
  wasm.__wbindgen_add_to_stack_pointer(16);
3573
3645
  }
3574
3646
  }
3647
+ /**
3648
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
3649
+ * bbox + typed reason). `optionsJson` is `{}`-tolerant
3650
+ * `AutoExtractOptions`; undefined/empty → defaults.
3651
+ * @param {number} page_index
3652
+ * @param {string | null} [options_json]
3653
+ * @returns {string}
3654
+ */
3655
+ extractPageAuto(page_index, options_json) {
3656
+ let deferred3_0;
3657
+ let deferred3_1;
3658
+ try {
3659
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3660
+ var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
3661
+ var len0 = WASM_VECTOR_LEN;
3662
+ wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
3663
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3664
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3665
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3666
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3667
+ var ptr2 = r0;
3668
+ var len2 = r1;
3669
+ if (r3) {
3670
+ ptr2 = 0; len2 = 0;
3671
+ throw takeObject(r2);
3672
+ }
3673
+ deferred3_0 = ptr2;
3674
+ deferred3_1 = len2;
3675
+ return getStringFromWasm0(ptr2, len2);
3676
+ } finally {
3677
+ wasm.__wbindgen_add_to_stack_pointer(16);
3678
+ wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3679
+ }
3680
+ }
3575
3681
  /**
3576
3682
  * Extract complete page text data in a single call.
3577
3683
  *
@@ -3773,6 +3879,36 @@ class WasmPdfDocument {
3773
3879
  wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3774
3880
  }
3775
3881
  }
3882
+ /**
3883
+ * One-shot auto text extraction — graceful native fallback (never
3884
+ * the opaque OCR error #513).
3885
+ * @param {number} page_index
3886
+ * @returns {string}
3887
+ */
3888
+ extractTextAuto(page_index) {
3889
+ let deferred2_0;
3890
+ let deferred2_1;
3891
+ try {
3892
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3893
+ wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
3894
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3895
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3896
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3897
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3898
+ var ptr1 = r0;
3899
+ var len1 = r1;
3900
+ if (r3) {
3901
+ ptr1 = 0; len1 = 0;
3902
+ throw takeObject(r2);
3903
+ }
3904
+ deferred2_0 = ptr1;
3905
+ deferred2_1 = len1;
3906
+ return getStringFromWasm0(ptr1, len1);
3907
+ } finally {
3908
+ wasm.__wbindgen_add_to_stack_pointer(16);
3909
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3910
+ }
3911
+ }
3776
3912
  /**
3777
3913
  * Extract text lines from a page.
3778
3914
  *
@@ -3799,41 +3935,35 @@ class WasmPdfDocument {
3799
3935
  }
3800
3936
  }
3801
3937
  /**
3802
- * Extract text using OCR (optical character recognition).
3803
- *
3804
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
3805
- * ONNX Runtime support for the web backend in the current implementation.
3938
+ * Extract text using OCR. Not available in this build — OCR needs
3939
+ * the `wasm-ocr` build of `pdf-oxide`.
3806
3940
  * @param {number} _page_index
3807
- * @param {WasmOcrEngine | null} [_engine]
3941
+ * @param {WasmOcrEngine} _engine
3808
3942
  * @returns {string}
3809
3943
  */
3810
3944
  extractTextOcr(_page_index, _engine) {
3811
- let deferred3_0;
3812
- let deferred3_1;
3945
+ let deferred2_0;
3946
+ let deferred2_1;
3813
3947
  try {
3814
3948
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3815
- let ptr0 = 0;
3816
- if (!isLikeNone(_engine)) {
3817
- _assertClass(_engine, WasmOcrEngine);
3818
- ptr0 = _engine.__destroy_into_raw();
3819
- }
3820
- wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
3949
+ _assertClass(_engine, WasmOcrEngine);
3950
+ wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
3821
3951
  var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3822
3952
  var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3823
3953
  var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3824
3954
  var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3825
- var ptr2 = r0;
3826
- var len2 = r1;
3955
+ var ptr1 = r0;
3956
+ var len1 = r1;
3827
3957
  if (r3) {
3828
- ptr2 = 0; len2 = 0;
3958
+ ptr1 = 0; len1 = 0;
3829
3959
  throw takeObject(r2);
3830
3960
  }
3831
- deferred3_0 = ptr2;
3832
- deferred3_1 = len2;
3833
- return getStringFromWasm0(ptr2, len2);
3961
+ deferred2_0 = ptr1;
3962
+ deferred2_1 = len1;
3963
+ return getStringFromWasm0(ptr1, len1);
3834
3964
  } finally {
3835
3965
  wasm.__wbindgen_add_to_stack_pointer(16);
3836
- wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3966
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3837
3967
  }
3838
3968
  }
3839
3969
  /**
@@ -5593,6 +5723,10 @@ class WasmPdfPageRegion {
5593
5723
  }
5594
5724
  /**
5595
5725
  * Extract text using OCR from this region.
5726
+ *
5727
+ * Region-scoped OCR is not wired yet; use the page-level
5728
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
5729
+ * (#524 follow-up).
5596
5730
  * @param {WasmOcrEngine | null} [_engine]
5597
5731
  * @returns {string}
5598
5732
  */
@@ -6183,6 +6317,37 @@ function hasDocumentTimestamp(pdf_data) {
6183
6317
  }
6184
6318
  exports.hasDocumentTimestamp = hasDocumentTimestamp;
6185
6319
 
6320
+ /**
6321
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
6322
+ * supported language's cache filenames and source URLs).
6323
+ *
6324
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
6325
+ * or network-to-disk, so a download-to-cache prefetch cannot run
6326
+ * here. This manifest is informational — it lets the JS host learn
6327
+ * which model files/URLs to fetch and bundle (or ship out of band)
6328
+ * before driving OCR. There is intentionally no `prefetchModels` in
6329
+ * the WASM surface (see `prefetchAvailable`, which always returns
6330
+ * `false`).
6331
+ * @returns {string}
6332
+ */
6333
+ function modelManifest() {
6334
+ let deferred1_0;
6335
+ let deferred1_1;
6336
+ try {
6337
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
6338
+ wasm.modelManifest(retptr);
6339
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
6340
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
6341
+ deferred1_0 = r0;
6342
+ deferred1_1 = r1;
6343
+ return getStringFromWasm0(r0, r1);
6344
+ } finally {
6345
+ wasm.__wbindgen_add_to_stack_pointer(16);
6346
+ wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
6347
+ }
6348
+ }
6349
+ exports.modelManifest = modelManifest;
6350
+
6186
6351
  /**
6187
6352
  * Plan a bookmark split without producing PDFs. Returns a JSON array
6188
6353
  * of segment objects (`index, startPage…` shape from
@@ -6215,6 +6380,18 @@ function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level, inclu
6215
6380
  }
6216
6381
  exports.planSplitByBookmarks = planSplitByBookmarks;
6217
6382
 
6383
+ /**
6384
+ * #519: Whether this build can download OCR models to a local cache.
6385
+ * Always `false` in WASM — provisioning is host-side (see
6386
+ * `modelManifest`).
6387
+ * @returns {boolean}
6388
+ */
6389
+ function prefetchAvailable() {
6390
+ const ret = wasm.prefetchAvailable();
6391
+ return ret !== 0;
6392
+ }
6393
+ exports.prefetchAvailable = prefetchAvailable;
6394
+
6218
6395
  /**
6219
6396
  * Install the process-wide runtime crypto policy from its grammar
6220
6397
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
Binary file
@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
23
23
  export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
24
24
  export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
25
25
  export const hasDocumentTimestamp: (a: number, b: number) => number;
26
+ export const modelManifest: (a: number) => void;
26
27
  export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
28
+ export const prefetchAvailable: () => number;
27
29
  export const setCryptoPolicy: (a: number, b: number, c: number) => void;
28
30
  export const setLogLevel: (a: number, b: number, c: number) => void;
29
31
  export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
158
160
  export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
159
161
  export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
160
162
  export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
163
+ export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
164
+ export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
161
165
  export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
162
166
  export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
163
167
  export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
177
181
  export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
178
182
  export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
179
183
  export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
184
+ export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
180
185
  export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
181
186
  export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
182
187
  export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
184
189
  export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
185
190
  export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
186
191
  export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
192
+ export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
187
193
  export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
188
194
  export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
189
195
  export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-oxide-wasm",
3
- "version": "0.3.50",
3
+ "version": "0.3.52",
4
4
  "description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
5
5
  "license": "MIT OR Apache-2.0",
6
6
  "repository": {
@@ -644,7 +644,9 @@ export class WasmHeader {
644
644
  }
645
645
 
646
646
  /**
647
- * OCR configuration for WebAssembly.
647
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
648
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
649
+ * matures, #524.)
648
650
  */
649
651
  export class WasmOcrConfig {
650
652
  free(): void;
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
656
658
  }
657
659
 
658
660
  /**
659
- * OCR engine for WebAssembly.
661
+ * OCR engine for WebAssembly (#524).
662
+ *
663
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
664
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
665
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
666
+ * files and the char dictionary (see `modelManifest()` for the URLs)
667
+ * — typically `fetch()` + the Cache API / IndexedDB for the
668
+ * tens-of-MB models — then hands the bytes to the constructor. This
669
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
670
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
671
+ * explaining this).
660
672
  */
661
673
  export class WasmOcrEngine {
662
674
  free(): void;
663
675
  [Symbol.dispose](): void;
664
676
  /**
665
- * Create a new OCR engine.
677
+ * Not available in this build. OCR needs the `wasm-ocr` build of
678
+ * `pdf-oxide` (the pure-Rust tract backend); the default
679
+ * `pdf-oxide-wasm` ships without it.
666
680
  */
667
- constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
681
+ constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
668
682
  }
669
683
 
670
684
  /**
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
815
829
  * @returns true if authentication succeeded
816
830
  */
817
831
  authenticate(password: string): boolean;
832
+ /**
833
+ * Cheap per-page text-vs-OCR classification → JSON
834
+ * `DocumentClassification`.
835
+ */
836
+ classifyDocument(): string;
837
+ /**
838
+ * Cheap per-page classification → JSON `PageClassification`.
839
+ */
840
+ classifyPage(page_index: number): string;
818
841
  /**
819
842
  * Clear all pending erase operations for a page.
820
843
  */
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
934
957
  * @returns Array of path objects
935
958
  */
936
959
  extractLines(page_index: number, region?: Float32Array | null): any;
960
+ /**
961
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
962
+ * bbox + typed reason). `optionsJson` is `{}`-tolerant
963
+ * `AutoExtractOptions`; undefined/empty → defaults.
964
+ */
965
+ extractPageAuto(page_index: number, options_json?: string | null): string;
937
966
  /**
938
967
  * Extract complete page text data in a single call.
939
968
  *
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
990
1019
  * @param region - Optional [x, y, width, height] to filter by
991
1020
  */
992
1021
  extractText(page_index: number, region: any): string;
1022
+ /**
1023
+ * One-shot auto text extraction — graceful native fallback (never
1024
+ * the opaque OCR error #513).
1025
+ */
1026
+ extractTextAuto(page_index: number): string;
993
1027
  /**
994
1028
  * Extract text lines from a page.
995
1029
  *
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
997
1031
  */
998
1032
  extractTextLines(page_index: number, region?: Float32Array | null): any;
999
1033
  /**
1000
- * Extract text using OCR (optical character recognition).
1001
- *
1002
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
1003
- * ONNX Runtime support for the web backend in the current implementation.
1034
+ * Extract text using OCR. Not available in this build — OCR needs
1035
+ * the `wasm-ocr` build of `pdf-oxide`.
1004
1036
  */
1005
- extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
1037
+ extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
1006
1038
  /**
1007
1039
  * Extract word-level data from a page.
1008
1040
  *
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
1425
1457
  extractTextLines(): any;
1426
1458
  /**
1427
1459
  * Extract text using OCR from this region.
1460
+ *
1461
+ * Region-scoped OCR is not wired yet; use the page-level
1462
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
1463
+ * (#524 follow-up).
1428
1464
  */
1429
1465
  extractTextOcr(_engine?: WasmOcrEngine | null): string;
1430
1466
  /**
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
1600
1636
  */
1601
1637
  export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1602
1638
 
1639
+ /**
1640
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
1641
+ * supported language's cache filenames and source URLs).
1642
+ *
1643
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
1644
+ * or network-to-disk, so a download-to-cache prefetch cannot run
1645
+ * here. This manifest is informational — it lets the JS host learn
1646
+ * which model files/URLs to fetch and bundle (or ship out of band)
1647
+ * before driving OCR. There is intentionally no `prefetchModels` in
1648
+ * the WASM surface (see `prefetchAvailable`, which always returns
1649
+ * `false`).
1650
+ */
1651
+ export function modelManifest(): string;
1652
+
1603
1653
  /**
1604
1654
  * Plan a bookmark split without producing PDFs. Returns a JSON array
1605
1655
  * of segment objects (`index, startPage…` shape from
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1607
1657
  */
1608
1658
  export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
1609
1659
 
1660
+ /**
1661
+ * #519: Whether this build can download OCR models to a local cache.
1662
+ * Always `false` in WASM — provisioning is host-side (see
1663
+ * `modelManifest`).
1664
+ */
1665
+ export function prefetchAvailable(): boolean;
1666
+
1610
1667
  /**
1611
1668
  * Install the process-wide runtime crypto policy from its grammar
1612
1669
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
@@ -1687,7 +1744,9 @@ export interface InitOutput {
1687
1744
  readonly generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
1688
1745
  readonly generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
1689
1746
  readonly hasDocumentTimestamp: (a: number, b: number) => number;
1747
+ readonly modelManifest: (a: number) => void;
1690
1748
  readonly planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
1749
+ readonly prefetchAvailable: () => number;
1691
1750
  readonly setCryptoPolicy: (a: number, b: number, c: number) => void;
1692
1751
  readonly setLogLevel: (a: number, b: number, c: number) => void;
1693
1752
  readonly signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
@@ -1822,6 +1881,8 @@ export interface InitOutput {
1822
1881
  readonly wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
1823
1882
  readonly wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
1824
1883
  readonly wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
1884
+ readonly wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
1885
+ readonly wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
1825
1886
  readonly wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
1826
1887
  readonly wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
1827
1888
  readonly wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
@@ -1841,6 +1902,7 @@ export interface InitOutput {
1841
1902
  readonly wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
1842
1903
  readonly wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
1843
1904
  readonly wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
1905
+ readonly wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
1844
1906
  readonly wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
1845
1907
  readonly wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
1846
1908
  readonly wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -1848,6 +1910,7 @@ export interface InitOutput {
1848
1910
  readonly wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
1849
1911
  readonly wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
1850
1912
  readonly wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
1913
+ readonly wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
1851
1914
  readonly wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
1852
1915
  readonly wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
1853
1916
  readonly wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;