pdf-oxide-wasm 0.3.50 → 0.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -644,7 +644,9 @@ export class WasmHeader {
644
644
  }
645
645
 
646
646
  /**
647
- * OCR configuration for WebAssembly.
647
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
648
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
649
+ * matures, #524.)
648
650
  */
649
651
  export class WasmOcrConfig {
650
652
  free(): void;
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
656
658
  }
657
659
 
658
660
  /**
659
- * OCR engine for WebAssembly.
661
+ * OCR engine for WebAssembly (#524).
662
+ *
663
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
664
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
665
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
666
+ * files and the char dictionary (see `modelManifest()` for the URLs)
667
+ * — typically `fetch()` + the Cache API / IndexedDB for the
668
+ * tens-of-MB models — then hands the bytes to the constructor. This
669
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
670
+ * `pdf-oxide-wasm` has no OCR (the constructor throws an error
671
+ * explaining this).
660
672
  */
661
673
  export class WasmOcrEngine {
662
674
  free(): void;
663
675
  [Symbol.dispose](): void;
664
676
  /**
665
- * Create a new OCR engine.
677
+ * Not available in this build. OCR needs the `wasm-ocr` build of
678
+ * `pdf-oxide` (the pure-Rust tract backend); the default
679
+ * `pdf-oxide-wasm` ships without it.
666
680
  */
667
- constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
681
+ constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
668
682
  }
669
683
 
670
684
  /**
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
815
829
  * @returns true if authentication succeeded
816
830
  */
817
831
  authenticate(password: string): boolean;
832
+ /**
833
+ * Cheap per-page text-vs-OCR classification → JSON
834
+ * `DocumentClassification`.
835
+ */
836
+ classifyDocument(): string;
837
+ /**
838
+ * Cheap per-page classification → JSON `PageClassification`.
839
+ */
840
+ classifyPage(page_index: number): string;
818
841
  /**
819
842
  * Clear all pending erase operations for a page.
820
843
  */
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
934
957
  * @returns Array of path objects
935
958
  */
936
959
  extractLines(page_index: number, region?: Float32Array | null): any;
960
+ /**
961
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
962
+ * bbox + typed reason). `options_json` is `{}`-tolerant
963
+ * `AutoExtractOptions`; undefined/empty → defaults.
964
+ */
965
+ extractPageAuto(page_index: number, options_json?: string | null): string;
937
966
  /**
938
967
  * Extract complete page text data in a single call.
939
968
  *
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
990
1019
  * @param region - Optional [x, y, width, height] to filter by
991
1020
  */
992
1021
  extractText(page_index: number, region: any): string;
1022
+ /**
1023
+ * One-shot auto text extraction — graceful native fallback (never
1024
+ * the opaque OCR error #513).
1025
+ */
1026
+ extractTextAuto(page_index: number): string;
993
1027
  /**
994
1028
  * Extract text lines from a page.
995
1029
  *
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
997
1031
  */
998
1032
  extractTextLines(page_index: number, region?: Float32Array | null): any;
999
1033
  /**
1000
- * Extract text using OCR (optical character recognition).
1001
- *
1002
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
1003
- * ONNX Runtime support for the web backend in the current implementation.
1034
+ * Extract text using OCR. Not available in this build — OCR needs
1035
+ * the `wasm-ocr` build of `pdf-oxide`.
1004
1036
  */
1005
- extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
1037
+ extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
1006
1038
  /**
1007
1039
  * Extract word-level data from a page.
1008
1040
  *
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
1425
1457
  extractTextLines(): any;
1426
1458
  /**
1427
1459
  * Extract text using OCR from this region.
1460
+ *
1461
+ * Region-scoped OCR is not wired yet; use the page-level
1462
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
1463
+ * (#524 follow-up).
1428
1464
  */
1429
1465
  extractTextOcr(_engine?: WasmOcrEngine | null): string;
1430
1466
  /**
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
1600
1636
  */
1601
1637
  export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1602
1638
 
1639
+ /**
1640
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
1641
+ * supported language's cache filenames and source URLs).
1642
+ *
1643
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
1644
+ * or network-to-disk, so a download-to-cache prefetch cannot run
1645
+ * here. This manifest is informational — it lets the JS host learn
1646
+ * which model files/URLs to fetch and bundle (or ship out of band)
1647
+ * before driving OCR. There is intentionally no `prefetchModels` in
1648
+ * the WASM surface (see `prefetchAvailable`, which always returns
1649
+ * `false`).
1650
+ */
1651
+ export function modelManifest(): string;
1652
+
1603
1653
  /**
1604
1654
  * Plan a bookmark split without producing PDFs. Returns a JSON array
1605
1655
  * of segment objects (`index, startPage…` shape from
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1607
1657
  */
1608
1658
  export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
1609
1659
 
1660
+ /**
1661
+ * #519: Whether this build can download OCR models to a local cache.
1662
+ * Always `false` in WASM — provisioning is host-side (see
1663
+ * `modelManifest`).
1664
+ */
1665
+ export function prefetchAvailable(): boolean;
1666
+
1610
1667
  /**
1611
1668
  * Install the process-wide runtime crypto policy from its grammar
1612
1669
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./pdf_oxide_bg.js";
5
5
  __wbg_set_wasm(wasm);
6
6
 
7
7
  export {
8
- Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, planSplitByBookmarks, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
8
+ Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, modelManifest, planSplitByBookmarks, prefetchAvailable, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
9
9
  } from "./pdf_oxide_bg.js";
@@ -2544,7 +2544,9 @@ export class WasmHeader {
2544
2544
  if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
2545
2545
 
2546
2546
  /**
2547
- * OCR configuration for WebAssembly.
2547
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
2548
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
2549
+ * matures, #524.)
2548
2550
  */
2549
2551
  export class WasmOcrConfig {
2550
2552
  __destroy_into_raw() {
@@ -2570,7 +2572,17 @@ export class WasmOcrConfig {
2570
2572
  if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
2571
2573
 
2572
2574
  /**
2573
- * OCR engine for WebAssembly.
2575
+ * OCR engine for WebAssembly (#524).
2576
+ *
2577
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
2578
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
2579
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
2580
+ * files and the char dictionary (see `modelManifest()` for the URLs)
2581
+ * — typically `fetch()` + the Cache API / IndexedDB for the
2582
+ * tens-of-MB models — then hands the bytes to the constructor. This
2583
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
2584
+ * `pdf-oxide-wasm` has no OCR (the constructor throws an error
2585
+ * explaining this).
2574
2586
  */
2575
2587
  export class WasmOcrEngine {
2576
2588
  __destroy_into_raw() {
@@ -2584,20 +2596,22 @@ export class WasmOcrEngine {
2584
2596
  wasm.__wbg_wasmocrengine_free(ptr, 0);
2585
2597
  }
2586
2598
  /**
2587
- * Create a new OCR engine.
2588
- * @param {string} _det_model_path
2589
- * @param {string} _rec_model_path
2590
- * @param {string} _dict_path
2599
+ * Not available in this build. OCR needs the `wasm-ocr` build of
2600
+ * `pdf-oxide` (the pure-Rust tract backend); the default
2601
+ * `pdf-oxide-wasm` ships without it.
2602
+ * @param {Uint8Array} _det_model
2603
+ * @param {Uint8Array} _rec_model
2604
+ * @param {string} _dict
2591
2605
  * @param {WasmOcrConfig | null} [_config]
2592
2606
  */
2593
- constructor(_det_model_path, _rec_model_path, _dict_path, _config) {
2607
+ constructor(_det_model, _rec_model, _dict, _config) {
2594
2608
  try {
2595
2609
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
2596
- const ptr0 = passStringToWasm0(_det_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2610
+ const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
2597
2611
  const len0 = WASM_VECTOR_LEN;
2598
- const ptr1 = passStringToWasm0(_rec_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2612
+ const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
2599
2613
  const len1 = WASM_VECTOR_LEN;
2600
- const ptr2 = passStringToWasm0(_dict_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2614
+ const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
2601
2615
  const len2 = WASM_VECTOR_LEN;
2602
2616
  let ptr3 = 0;
2603
2617
  if (!isLikeNone(_config)) {
@@ -3125,6 +3139,64 @@ export class WasmPdfDocument {
3125
3139
  wasm.__wbindgen_add_to_stack_pointer(16);
3126
3140
  }
3127
3141
  }
3142
+ /**
3143
+ * Cheap per-page text-vs-OCR classification → JSON
3144
+ * `DocumentClassification`.
3145
+ * @returns {string}
3146
+ */
3147
+ classifyDocument() {
3148
+ let deferred2_0;
3149
+ let deferred2_1;
3150
+ try {
3151
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3152
+ wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
3153
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3154
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3155
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3156
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3157
+ var ptr1 = r0;
3158
+ var len1 = r1;
3159
+ if (r3) {
3160
+ ptr1 = 0; len1 = 0;
3161
+ throw takeObject(r2);
3162
+ }
3163
+ deferred2_0 = ptr1;
3164
+ deferred2_1 = len1;
3165
+ return getStringFromWasm0(ptr1, len1);
3166
+ } finally {
3167
+ wasm.__wbindgen_add_to_stack_pointer(16);
3168
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3169
+ }
3170
+ }
3171
+ /**
3172
+ * Cheap per-page classification → JSON `PageClassification`.
3173
+ * @param {number} page_index
3174
+ * @returns {string}
3175
+ */
3176
+ classifyPage(page_index) {
3177
+ let deferred2_0;
3178
+ let deferred2_1;
3179
+ try {
3180
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3181
+ wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
3182
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3183
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3184
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3185
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3186
+ var ptr1 = r0;
3187
+ var len1 = r1;
3188
+ if (r3) {
3189
+ ptr1 = 0; len1 = 0;
3190
+ throw takeObject(r2);
3191
+ }
3192
+ deferred2_0 = ptr1;
3193
+ deferred2_1 = len1;
3194
+ return getStringFromWasm0(ptr1, len1);
3195
+ } finally {
3196
+ wasm.__wbindgen_add_to_stack_pointer(16);
3197
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3198
+ }
3199
+ }
3128
3200
  /**
3129
3201
  * Clear all pending erase operations for a page.
3130
3202
  * @param {number} page_index
@@ -3553,6 +3625,40 @@ export class WasmPdfDocument {
3553
3625
  wasm.__wbindgen_add_to_stack_pointer(16);
3554
3626
  }
3555
3627
  }
3628
+ /**
3629
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
3630
+ * bbox + typed reason). `options_json` is `{}`-tolerant
3631
+ * `AutoExtractOptions`; undefined/empty → defaults.
3632
+ * @param {number} page_index
3633
+ * @param {string | null} [options_json]
3634
+ * @returns {string}
3635
+ */
3636
+ extractPageAuto(page_index, options_json) {
3637
+ let deferred3_0;
3638
+ let deferred3_1;
3639
+ try {
3640
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3641
+ var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
3642
+ var len0 = WASM_VECTOR_LEN;
3643
+ wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
3644
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3645
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3646
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3647
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3648
+ var ptr2 = r0;
3649
+ var len2 = r1;
3650
+ if (r3) {
3651
+ ptr2 = 0; len2 = 0;
3652
+ throw takeObject(r2);
3653
+ }
3654
+ deferred3_0 = ptr2;
3655
+ deferred3_1 = len2;
3656
+ return getStringFromWasm0(ptr2, len2);
3657
+ } finally {
3658
+ wasm.__wbindgen_add_to_stack_pointer(16);
3659
+ wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3660
+ }
3661
+ }
3556
3662
  /**
3557
3663
  * Extract complete page text data in a single call.
3558
3664
  *
@@ -3754,6 +3860,36 @@ export class WasmPdfDocument {
3754
3860
  wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3755
3861
  }
3756
3862
  }
3863
+ /**
3864
+ * One-shot auto text extraction — graceful native fallback (never
3865
+ * the opaque OCR error #513).
3866
+ * @param {number} page_index
3867
+ * @returns {string}
3868
+ */
3869
+ extractTextAuto(page_index) {
3870
+ let deferred2_0;
3871
+ let deferred2_1;
3872
+ try {
3873
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3874
+ wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
3875
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3876
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3877
+ var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3878
+ var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3879
+ var ptr1 = r0;
3880
+ var len1 = r1;
3881
+ if (r3) {
3882
+ ptr1 = 0; len1 = 0;
3883
+ throw takeObject(r2);
3884
+ }
3885
+ deferred2_0 = ptr1;
3886
+ deferred2_1 = len1;
3887
+ return getStringFromWasm0(ptr1, len1);
3888
+ } finally {
3889
+ wasm.__wbindgen_add_to_stack_pointer(16);
3890
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3891
+ }
3892
+ }
3757
3893
  /**
3758
3894
  * Extract text lines from a page.
3759
3895
  *
@@ -3780,41 +3916,35 @@ export class WasmPdfDocument {
3780
3916
  }
3781
3917
  }
3782
3918
  /**
3783
- * Extract text using OCR (optical character recognition).
3784
- *
3785
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
3786
- * ONNX Runtime support for the web backend in the current implementation.
3919
+ * Extract text using OCR. Not available in this build — OCR needs
3920
+ * the `wasm-ocr` build of `pdf-oxide`.
3787
3921
  * @param {number} _page_index
3788
- * @param {WasmOcrEngine | null} [_engine]
3922
+ * @param {WasmOcrEngine} _engine
3789
3923
  * @returns {string}
3790
3924
  */
3791
3925
  extractTextOcr(_page_index, _engine) {
3792
- let deferred3_0;
3793
- let deferred3_1;
3926
+ let deferred2_0;
3927
+ let deferred2_1;
3794
3928
  try {
3795
3929
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
3796
- let ptr0 = 0;
3797
- if (!isLikeNone(_engine)) {
3798
- _assertClass(_engine, WasmOcrEngine);
3799
- ptr0 = _engine.__destroy_into_raw();
3800
- }
3801
- wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
3930
+ _assertClass(_engine, WasmOcrEngine);
3931
+ wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
3802
3932
  var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
3803
3933
  var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
3804
3934
  var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
3805
3935
  var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
3806
- var ptr2 = r0;
3807
- var len2 = r1;
3936
+ var ptr1 = r0;
3937
+ var len1 = r1;
3808
3938
  if (r3) {
3809
- ptr2 = 0; len2 = 0;
3939
+ ptr1 = 0; len1 = 0;
3810
3940
  throw takeObject(r2);
3811
3941
  }
3812
- deferred3_0 = ptr2;
3813
- deferred3_1 = len2;
3814
- return getStringFromWasm0(ptr2, len2);
3942
+ deferred2_0 = ptr1;
3943
+ deferred2_1 = len1;
3944
+ return getStringFromWasm0(ptr1, len1);
3815
3945
  } finally {
3816
3946
  wasm.__wbindgen_add_to_stack_pointer(16);
3817
- wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
3947
+ wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
3818
3948
  }
3819
3949
  }
3820
3950
  /**
@@ -5573,6 +5703,10 @@ export class WasmPdfPageRegion {
5573
5703
  }
5574
5704
  /**
5575
5705
  * Extract text using OCR from this region.
5706
+ *
5707
+ * Region-scoped OCR is not wired yet; use the page-level
5708
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
5709
+ * (#524 follow-up).
5576
5710
  * @param {WasmOcrEngine | null} [_engine]
5577
5711
  * @returns {string}
5578
5712
  */
@@ -6153,6 +6287,36 @@ export function hasDocumentTimestamp(pdf_data) {
6153
6287
  return ret !== 0;
6154
6288
  }
6155
6289
 
6290
+ /**
6291
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
6292
+ * supported language's cache filenames and source URLs).
6293
+ *
6294
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
6295
+ * or network-to-disk, so a download-to-cache prefetch cannot run
6296
+ * here. This manifest is informational — it lets the JS host learn
6297
+ * which model files/URLs to fetch and bundle (or ship out of band)
6298
+ * before driving OCR. There is intentionally no `prefetchModels` in
6299
+ * the WASM surface (see `prefetchAvailable`, which always returns
6300
+ * `false`).
6301
+ * @returns {string}
6302
+ */
6303
+ export function modelManifest() {
6304
+ let deferred1_0;
6305
+ let deferred1_1;
6306
+ try {
6307
+ const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
6308
+ wasm.modelManifest(retptr);
6309
+ var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
6310
+ var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
6311
+ deferred1_0 = r0;
6312
+ deferred1_1 = r1;
6313
+ return getStringFromWasm0(r0, r1);
6314
+ } finally {
6315
+ wasm.__wbindgen_add_to_stack_pointer(16);
6316
+ wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
6317
+ }
6318
+ }
6319
+
6156
6320
  /**
6157
6321
  * Plan a bookmark split without producing PDFs. Returns a JSON array
6158
6322
  * of segment objects (`index, startPage…` shape from
@@ -6184,6 +6348,17 @@ export function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level
6184
6348
  }
6185
6349
  }
6186
6350
 
6351
+ /**
6352
+ * #519: Whether this build can download OCR models to a local cache.
6353
+ * Always `false` in WASM — provisioning is host-side (see
6354
+ * `modelManifest`).
6355
+ * @returns {boolean}
6356
+ */
6357
+ export function prefetchAvailable() {
6358
+ const ret = wasm.prefetchAvailable();
6359
+ return ret !== 0;
6360
+ }
6361
+
6187
6362
  /**
6188
6363
  * Install the process-wide runtime crypto policy from its grammar
6189
6364
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
Binary file
@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
23
23
  export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
24
24
  export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
25
25
  export const hasDocumentTimestamp: (a: number, b: number) => number;
26
+ export const modelManifest: (a: number) => void;
26
27
  export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
28
+ export const prefetchAvailable: () => number;
27
29
  export const setCryptoPolicy: (a: number, b: number, c: number) => void;
28
30
  export const setLogLevel: (a: number, b: number, c: number) => void;
29
31
  export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
158
160
  export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
159
161
  export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
160
162
  export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
163
+ export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
164
+ export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
161
165
  export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
162
166
  export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
163
167
  export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
177
181
  export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
178
182
  export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
179
183
  export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
184
+ export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
180
185
  export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
181
186
  export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
182
187
  export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
184
189
  export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
185
190
  export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
186
191
  export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
192
+ export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
187
193
  export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
188
194
  export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
189
195
  export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -644,7 +644,9 @@ export class WasmHeader {
644
644
  }
645
645
 
646
646
  /**
647
- * OCR configuration for WebAssembly.
647
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
648
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
649
+ * matures, #524.)
648
650
  */
649
651
  export class WasmOcrConfig {
650
652
  free(): void;
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
656
658
  }
657
659
 
658
660
  /**
659
- * OCR engine for WebAssembly.
661
+ * OCR engine for WebAssembly (#524).
662
+ *
663
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
664
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
665
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
666
+ * files and the char dictionary (see `modelManifest()` for the URLs)
667
+ * — typically `fetch()` + the Cache API / IndexedDB for the
668
+ * tens-of-MB models — then hands the bytes to the constructor. This
669
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
670
+ * `pdf-oxide-wasm` has no OCR (the constructor throws an error
671
+ * explaining this).
660
672
  */
661
673
  export class WasmOcrEngine {
662
674
  free(): void;
663
675
  [Symbol.dispose](): void;
664
676
  /**
665
- * Create a new OCR engine.
677
+ * Not available in this build. OCR needs the `wasm-ocr` build of
678
+ * `pdf-oxide` (the pure-Rust tract backend); the default
679
+ * `pdf-oxide-wasm` ships without it.
666
680
  */
667
- constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
681
+ constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
668
682
  }
669
683
 
670
684
  /**
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
815
829
  * @returns true if authentication succeeded
816
830
  */
817
831
  authenticate(password: string): boolean;
832
+ /**
833
+ * Cheap per-page text-vs-OCR classification → JSON
834
+ * `DocumentClassification`.
835
+ */
836
+ classifyDocument(): string;
837
+ /**
838
+ * Cheap per-page classification → JSON `PageClassification`.
839
+ */
840
+ classifyPage(page_index: number): string;
818
841
  /**
819
842
  * Clear all pending erase operations for a page.
820
843
  */
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
934
957
  * @returns Array of path objects
935
958
  */
936
959
  extractLines(page_index: number, region?: Float32Array | null): any;
960
+ /**
961
+ * Rich per-page extraction → JSON `PageExtraction` (per-region
962
+ * bbox + typed reason). `options_json` is `{}`-tolerant
963
+ * `AutoExtractOptions`; undefined/empty → defaults.
964
+ */
965
+ extractPageAuto(page_index: number, options_json?: string | null): string;
937
966
  /**
938
967
  * Extract complete page text data in a single call.
939
968
  *
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
990
1019
  * @param region - Optional [x, y, width, height] to filter by
991
1020
  */
992
1021
  extractText(page_index: number, region: any): string;
1022
+ /**
1023
+ * One-shot auto text extraction — graceful native fallback (never
1024
+ * the opaque OCR error #513).
1025
+ */
1026
+ extractTextAuto(page_index: number): string;
993
1027
  /**
994
1028
  * Extract text lines from a page.
995
1029
  *
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
997
1031
  */
998
1032
  extractTextLines(page_index: number, region?: Float32Array | null): any;
999
1033
  /**
1000
- * Extract text using OCR (optical character recognition).
1001
- *
1002
- * NOTE: OCR is not yet supported in the WebAssembly build due to missing
1003
- * ONNX Runtime support for the web backend in the current implementation.
1034
+ * Extract text using OCR. Not available in this build — OCR needs
1035
+ * the `wasm-ocr` build of `pdf-oxide`.
1004
1036
  */
1005
- extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
1037
+ extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
1006
1038
  /**
1007
1039
  * Extract word-level data from a page.
1008
1040
  *
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
1425
1457
  extractTextLines(): any;
1426
1458
  /**
1427
1459
  * Extract text using OCR from this region.
1460
+ *
1461
+ * Region-scoped OCR is not wired yet; use the page-level
1462
+ * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
1463
+ * (#524 follow-up).
1428
1464
  */
1429
1465
  extractTextOcr(_engine?: WasmOcrEngine | null): string;
1430
1466
  /**
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
1600
1636
  */
1601
1637
  export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1602
1638
 
1639
+ /**
1640
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
1641
+ * supported language's cache filenames and source URLs).
1642
+ *
1643
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
1644
+ * or network-to-disk, so a download-to-cache prefetch cannot run
1645
+ * here. This manifest is informational — it lets the JS host learn
1646
+ * which model files/URLs to fetch and bundle (or ship out of band)
1647
+ * before driving OCR. There is intentionally no `prefetchModels` in
1648
+ * the WASM surface (see `prefetchAvailable`, which always returns
1649
+ * `false`).
1650
+ */
1651
+ export function modelManifest(): string;
1652
+
1603
1653
  /**
1604
1654
  * Plan a bookmark split without producing PDFs. Returns a JSON array
1605
1655
  * of segment objects (`index, startPage…` shape from
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
1607
1657
  */
1608
1658
  export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
1609
1659
 
1660
+ /**
1661
+ * #519: Whether this build can download OCR models to a local cache.
1662
+ * Always `false` in WASM — provisioning is host-side (see
1663
+ * `modelManifest`).
1664
+ */
1665
+ export function prefetchAvailable(): boolean;
1666
+
1610
1667
  /**
1611
1668
  * Install the process-wide runtime crypto policy from its grammar
1612
1669
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed: