npm - pdf-oxide-wasm - Versions diffs - 0.3.50 → 0.3.52 - Mend

pdf-oxide-wasm 0.3.50 → 0.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/bundler/pdf_oxide.d.ts +66 -9
package/bundler/pdf_oxide.js +1 -1
package/bundler/pdf_oxide_bg.js +205 -30
package/bundler/pdf_oxide_bg.wasm +0 -0
package/bundler/pdf_oxide_bg.wasm.d.ts +6 -0
package/nodejs/pdf_oxide.d.ts +66 -9
package/nodejs/pdf_oxide.js +207 -30
package/nodejs/pdf_oxide_bg.wasm +0 -0
package/nodejs/pdf_oxide_bg.wasm.d.ts +6 -0
package/package.json +1 -1
package/web/pdf_oxide.d.ts +72 -9
package/web/pdf_oxide.js +205 -30
package/web/pdf_oxide_bg.wasm +0 -0
package/web/pdf_oxide_bg.wasm.d.ts +6 -0

package/bundler/pdf_oxide.d.ts CHANGED

@@ -644,7 +644,9 @@ export class WasmHeader {
 }
 /**
- * OCR configuration for WebAssembly.
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
+ * matures, #524.)
  */
 export class WasmOcrConfig {
     free(): void;
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
 }
 /**
- * OCR engine for WebAssembly.
+ * OCR engine for WebAssembly (#524).
+ *
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
+ * files and the char dictionary (see `modelManifest()` for the URLs)
+ * — typically `fetch()` + the Cache API / IndexedDB for the
+ * tens-of-MB models — then hands the bytes to the constructor. This
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
+ * explaining this).
  */
 export class WasmOcrEngine {
     free(): void;
     [Symbol.dispose](): void;
     /**
-     * Create a new OCR engine.
+     * Not available in this build. OCR needs the `wasm-ocr` build of
+     * `pdf-oxide` (the pure-Rust tract backend); the default
+     * `pdf-oxide-wasm` ships without it.
      */
-    constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
+    constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
 }
 /**
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
      * @returns true if authentication succeeded
      */
     authenticate(password: string): boolean;
+    /**
+     * Cheap per-page text-vs-OCR classification → JSON
+     * `DocumentClassification`.
+     */
+    classifyDocument(): string;
+    /**
+     * Cheap per-page classification → JSON `PageClassification`.
+     */
+    classifyPage(page_index: number): string;
     /**
      * Clear all pending erase operations for a page.
      */
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
      * @returns Array of path objects
      */
     extractLines(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Rich per-page extraction → JSON `PageExtraction` (per-region
+     * bbox + typed reason). `optionsJson` is `{}`-tolerant
+     * `AutoExtractOptions`; undefined/empty → defaults.
+     */
+    extractPageAuto(page_index: number, options_json?: string | null): string;
     /**
      * Extract complete page text data in a single call.
      *
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
      * @param region - Optional [x, y, width, height] to filter by
      */
     extractText(page_index: number, region: any): string;
+    /**
+     * One-shot auto text extraction — graceful native fallback (never
+     * the opaque OCR error #513).
+     */
+    extractTextAuto(page_index: number): string;
     /**
      * Extract text lines from a page.
      *
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
      */
     extractTextLines(page_index: number, region?: Float32Array | null): any;
     /**
-     * Extract text using OCR (optical character recognition).
-     *
-     * NOTE: OCR is not yet supported in the WebAssembly build due to missing
-     * ONNX Runtime support for the web backend in the current implementation.
+     * Extract text using OCR. Not available in this build — OCR needs
+     * the `wasm-ocr` build of `pdf-oxide`.
      */
-    extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
+    extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
     /**
      * Extract word-level data from a page.
      *
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
     extractTextLines(): any;
     /**
      * Extract text using OCR from this region.
+     *
+     * Region-scoped OCR is not wired yet; use the page-level
+     * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
+     * (#524 follow-up).
      */
     extractTextOcr(_engine?: WasmOcrEngine | null): string;
     /**
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
  */
 export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
+/**
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
+ * supported language's cache filenames and source URLs).
+ *
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
+ * or network-to-disk, so a download-to-cache prefetch cannot run
+ * here. This manifest is informational — it lets the JS host learn
+ * which model files/URLs to fetch and bundle (or ship out of band)
+ * before driving OCR. There is intentionally no `prefetchModels` in
+ * the WASM surface (see `prefetchAvailable`, which always returns
+ * `false`).
+ */
+export function modelManifest(): string;
 /**
  * Plan a bookmark split without producing PDFs. Returns a JSON array
  * of segment objects (`index, startPage…` shape from
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
  */
 export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
+/**
+ * #519: Whether this build can download OCR models to a local cache.
+ * Always `false` in WASM — provisioning is host-side (see
+ * `modelManifest`).
+ */
+export function prefetchAvailable(): boolean;
 /**
  * Install the process-wide runtime crypto policy from its grammar
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:

package/bundler/pdf_oxide.js CHANGED

@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./pdf_oxide_bg.js";
 __wbg_set_wasm(wasm);
 export {
-    Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, planSplitByBookmarks, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
+    Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, modelManifest, planSplitByBookmarks, prefetchAvailable, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
 } from "./pdf_oxide_bg.js";

package/bundler/pdf_oxide_bg.js CHANGED

@@ -2544,7 +2544,9 @@ export class WasmHeader {
 if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
 /**
- * OCR configuration for WebAssembly.
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
+ * matures, #524.)
  */
 export class WasmOcrConfig {
     __destroy_into_raw() {
@@ -2570,7 +2572,17 @@ export class WasmOcrConfig {
 if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
 /**
- * OCR engine for WebAssembly.
+ * OCR engine for WebAssembly (#524).
+ *
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
+ * files and the char dictionary (see `modelManifest()` for the URLs)
+ * — typically `fetch()` + the Cache API / IndexedDB for the
+ * tens-of-MB models — then hands the bytes to the constructor. This
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
+ * explaining this).
  */
 export class WasmOcrEngine {
     __destroy_into_raw() {
@@ -2584,20 +2596,22 @@ export class WasmOcrEngine {
         wasm.__wbg_wasmocrengine_free(ptr, 0);
     }
     /**
-     * Create a new OCR engine.
-     * @param {string} _det_model_path
-     * @param {string} _rec_model_path
-     * @param {string} _dict_path
+     * Not available in this build. OCR needs the `wasm-ocr` build of
+     * `pdf-oxide` (the pure-Rust tract backend); the default
+     * `pdf-oxide-wasm` ships without it.
+     * @param {Uint8Array} _det_model
+     * @param {Uint8Array} _rec_model
+     * @param {string} _dict
      * @param {WasmOcrConfig | null} [_config]
      */
-    constructor(_det_model_path, _rec_model_path, _dict_path, _config) {
+    constructor(_det_model, _rec_model, _dict, _config) {
         try {
             const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
-            const ptr0 = passStringToWasm0(_det_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
+            const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
             const len0 = WASM_VECTOR_LEN;
-            const ptr1 = passStringToWasm0(_rec_model_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
+            const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
             const len1 = WASM_VECTOR_LEN;
-            const ptr2 = passStringToWasm0(_dict_path, wasm.__wbindgen_export, wasm.__wbindgen_export2);
+            const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
             const len2 = WASM_VECTOR_LEN;
             let ptr3 = 0;
             if (!isLikeNone(_config)) {
@@ -3125,6 +3139,64 @@ export class WasmPdfDocument {
             wasm.__wbindgen_add_to_stack_pointer(16);
         }
     }
+    /**
+     * Cheap per-page text-vs-OCR classification → JSON
+     * `DocumentClassification`.
+     * @returns {string}
+     */
+    classifyDocument() {
+        let deferred2_0;
+        let deferred2_1;
+        try {
+            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
+            wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
+            var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
+            var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
+            var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
+            var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
+            var ptr1 = r0;
+            var len1 = r1;
+            if (r3) {
+                ptr1 = 0; len1 = 0;
+                throw takeObject(r2);
+            }
+            deferred2_0 = ptr1;
+            deferred2_1 = len1;
+            return getStringFromWasm0(ptr1, len1);
+        } finally {
+            wasm.__wbindgen_add_to_stack_pointer(16);
+            wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
+        }
+    }
+    /**
+     * Cheap per-page classification → JSON `PageClassification`.
+     * @param {number} page_index
+     * @returns {string}
+     */
+    classifyPage(page_index) {
+        let deferred2_0;
+        let deferred2_1;
+        try {
+            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
+            wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
+            var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
+            var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
+            var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
+            var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
+            var ptr1 = r0;
+            var len1 = r1;
+            if (r3) {
+                ptr1 = 0; len1 = 0;
+                throw takeObject(r2);
+            }
+            deferred2_0 = ptr1;
+            deferred2_1 = len1;
+            return getStringFromWasm0(ptr1, len1);
+        } finally {
+            wasm.__wbindgen_add_to_stack_pointer(16);
+            wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
+        }
+    }
     /**
      * Clear all pending erase operations for a page.
      * @param {number} page_index
@@ -3553,6 +3625,40 @@ export class WasmPdfDocument {
             wasm.__wbindgen_add_to_stack_pointer(16);
         }
     }
+    /**
+     * Rich per-page extraction → JSON `PageExtraction` (per-region
+     * bbox + typed reason). `optionsJson` is `{}`-tolerant
+     * `AutoExtractOptions`; undefined/empty → defaults.
+     * @param {number} page_index
+     * @param {string | null} [options_json]
+     * @returns {string}
+     */
+    extractPageAuto(page_index, options_json) {
+        let deferred3_0;
+        let deferred3_1;
+        try {
+            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
+            var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
+            var len0 = WASM_VECTOR_LEN;
+            wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
+            var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
+            var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
+            var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
+            var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
+            var ptr2 = r0;
+            var len2 = r1;
+            if (r3) {
+                ptr2 = 0; len2 = 0;
+                throw takeObject(r2);
+            }
+            deferred3_0 = ptr2;
+            deferred3_1 = len2;
+            return getStringFromWasm0(ptr2, len2);
+        } finally {
+            wasm.__wbindgen_add_to_stack_pointer(16);
+            wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
+        }
+    }
     /**
      * Extract complete page text data in a single call.
      *
@@ -3754,6 +3860,36 @@ export class WasmPdfDocument {
             wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
         }
     }
+    /**
+     * One-shot auto text extraction — graceful native fallback (never
+     * the opaque OCR error #513).
+     * @param {number} page_index
+     * @returns {string}
+     */
+    extractTextAuto(page_index) {
+        let deferred2_0;
+        let deferred2_1;
+        try {
+            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
+            wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
+            var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
+            var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
+            var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
+            var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
+            var ptr1 = r0;
+            var len1 = r1;
+            if (r3) {
+                ptr1 = 0; len1 = 0;
+                throw takeObject(r2);
+            }
+            deferred2_0 = ptr1;
+            deferred2_1 = len1;
+            return getStringFromWasm0(ptr1, len1);
+        } finally {
+            wasm.__wbindgen_add_to_stack_pointer(16);
+            wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
+        }
+    }
     /**
      * Extract text lines from a page.
      *
@@ -3780,41 +3916,35 @@ export class WasmPdfDocument {
         }
     }
     /**
-     * Extract text using OCR (optical character recognition).
-     *
-     * NOTE: OCR is not yet supported in the WebAssembly build due to missing
-     * ONNX Runtime support for the web backend in the current implementation.
+     * Extract text using OCR. Not available in this build — OCR needs
+     * the `wasm-ocr` build of `pdf-oxide`.
      * @param {number} _page_index
-     * @param {WasmOcrEngine | null} [_engine]
+     * @param {WasmOcrEngine} _engine
      * @returns {string}
      */
     extractTextOcr(_page_index, _engine) {
-        let deferred3_0;
-        let deferred3_1;
+        let deferred2_0;
+        let deferred2_1;
         try {
             const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
-            let ptr0 = 0;
-            if (!isLikeNone(_engine)) {
-                _assertClass(_engine, WasmOcrEngine);
-                ptr0 = _engine.__destroy_into_raw();
-            }
-            wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
+            _assertClass(_engine, WasmOcrEngine);
+            wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
             var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
             var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
             var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
             var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
-            var ptr2 = r0;
-            var len2 = r1;
+            var ptr1 = r0;
+            var len1 = r1;
             if (r3) {
-                ptr2 = 0; len2 = 0;
+                ptr1 = 0; len1 = 0;
                 throw takeObject(r2);
             }
-            deferred3_0 = ptr2;
-            deferred3_1 = len2;
-            return getStringFromWasm0(ptr2, len2);
+            deferred2_0 = ptr1;
+            deferred2_1 = len1;
+            return getStringFromWasm0(ptr1, len1);
         } finally {
             wasm.__wbindgen_add_to_stack_pointer(16);
-            wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
+            wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
         }
     }
     /**
@@ -5573,6 +5703,10 @@ export class WasmPdfPageRegion {
     }
     /**
      * Extract text using OCR from this region.
+     *
+     * Region-scoped OCR is not wired yet; use the page-level
+     * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
+     * (#524 follow-up).
      * @param {WasmOcrEngine | null} [_engine]
      * @returns {string}
      */
@@ -6153,6 +6287,36 @@ export function hasDocumentTimestamp(pdf_data) {
     return ret !== 0;
 }
+/**
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
+ * supported language's cache filenames and source URLs).
+ *
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
+ * or network-to-disk, so a download-to-cache prefetch cannot run
+ * here. This manifest is informational — it lets the JS host learn
+ * which model files/URLs to fetch and bundle (or ship out of band)
+ * before driving OCR. There is intentionally no `prefetchModels` in
+ * the WASM surface (see `prefetchAvailable`, which always returns
+ * `false`).
+ * @returns {string}
+ */
+export function modelManifest() {
+    let deferred1_0;
+    let deferred1_1;
+    try {
+        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
+        wasm.modelManifest(retptr);
+        var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
+        var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
+        deferred1_0 = r0;
+        deferred1_1 = r1;
+        return getStringFromWasm0(r0, r1);
+    } finally {
+        wasm.__wbindgen_add_to_stack_pointer(16);
+        wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
+    }
+}
 /**
  * Plan a bookmark split without producing PDFs. Returns a JSON array
  * of segment objects (`index, startPage…` shape from
@@ -6184,6 +6348,17 @@ export function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level
     }
 }
+/**
+ * #519: Whether this build can download OCR models to a local cache.
+ * Always `false` in WASM — provisioning is host-side (see
+ * `modelManifest`).
+ * @returns {boolean}
+ */
+export function prefetchAvailable() {
+    const ret = wasm.prefetchAvailable();
+    return ret !== 0;
+}
 /**
  * Install the process-wide runtime crypto policy from its grammar
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:

package/bundler/pdf_oxide_bg.wasm CHANGED

Binary file

package/bundler/pdf_oxide_bg.wasm.d.ts CHANGED

@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
 export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
 export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
 export const hasDocumentTimestamp: (a: number, b: number) => number;
+export const modelManifest: (a: number) => void;
 export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
+export const prefetchAvailable: () => number;
 export const setCryptoPolicy: (a: number, b: number, c: number) => void;
 export const setLogLevel: (a: number, b: number, c: number) => void;
 export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
 export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
+export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
+export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
 export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
 export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
 export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
+export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
 export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
 export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
 export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
 export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
 export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
 export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
+export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
 export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
 export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
 export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;

package/nodejs/pdf_oxide.d.ts CHANGED

@@ -644,7 +644,9 @@ export class WasmHeader {
 }
 /**
- * OCR configuration for WebAssembly.
+ * OCR configuration for WebAssembly. (Currently a marker — the engine
+ * uses tuned defaults; knobs are exposed as the WASM OCR surface
+ * matures, #524.)
  */
 export class WasmOcrConfig {
     free(): void;
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
 }
 /**
- * OCR engine for WebAssembly.
+ * OCR engine for WebAssembly (#524).
+ *
+ * OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
+ * native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
+ * the browser/Deno/edge host fetches the detector + recognizer ONNX
+ * files and the char dictionary (see `modelManifest()` for the URLs)
+ * — typically `fetch()` + the Cache API / IndexedDB for the
+ * tens-of-MB models — then hands the bytes to the constructor. This
+ * only works in the `wasm-ocr` build of `pdf-oxide`; the default
+ * `pdf-oxide-wasm` has no OCR (the constructor returns an error
+ * explaining this).
  */
 export class WasmOcrEngine {
     free(): void;
     [Symbol.dispose](): void;
     /**
-     * Create a new OCR engine.
+     * Not available in this build. OCR needs the `wasm-ocr` build of
+     * `pdf-oxide` (the pure-Rust tract backend); the default
+     * `pdf-oxide-wasm` ships without it.
      */
-    constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
+    constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
 }
 /**
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
      * @returns true if authentication succeeded
      */
     authenticate(password: string): boolean;
+    /**
+     * Cheap per-page text-vs-OCR classification → JSON
+     * `DocumentClassification`.
+     */
+    classifyDocument(): string;
+    /**
+     * Cheap per-page classification → JSON `PageClassification`.
+     */
+    classifyPage(page_index: number): string;
     /**
      * Clear all pending erase operations for a page.
      */
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
      * @returns Array of path objects
      */
     extractLines(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Rich per-page extraction → JSON `PageExtraction` (per-region
+     * bbox + typed reason). `optionsJson` is `{}`-tolerant
+     * `AutoExtractOptions`; undefined/empty → defaults.
+     */
+    extractPageAuto(page_index: number, options_json?: string | null): string;
     /**
      * Extract complete page text data in a single call.
      *
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
      * @param region - Optional [x, y, width, height] to filter by
      */
     extractText(page_index: number, region: any): string;
+    /**
+     * One-shot auto text extraction — graceful native fallback (never
+     * the opaque OCR error #513).
+     */
+    extractTextAuto(page_index: number): string;
     /**
      * Extract text lines from a page.
      *
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
      */
     extractTextLines(page_index: number, region?: Float32Array | null): any;
     /**
-     * Extract text using OCR (optical character recognition).
-     *
-     * NOTE: OCR is not yet supported in the WebAssembly build due to missing
-     * ONNX Runtime support for the web backend in the current implementation.
+     * Extract text using OCR. Not available in this build — OCR needs
+     * the `wasm-ocr` build of `pdf-oxide`.
      */
-    extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
+    extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
     /**
      * Extract word-level data from a page.
      *
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
     extractTextLines(): any;
     /**
      * Extract text using OCR from this region.
+     *
+     * Region-scoped OCR is not wired yet; use the page-level
+     * `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
+     * (#524 follow-up).
      */
     extractTextOcr(_engine?: WasmOcrEngine | null): string;
     /**
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
  */
 export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
+/**
+ * #519: Air-gapped OCR model manifest — JSON (detector + every
+ * supported language's cache filenames and source URLs).
+ *
+ * WASM provisioning is **host-side**: browser/WASM has no filesystem
+ * or network-to-disk, so a download-to-cache prefetch cannot run
+ * here. This manifest is informational — it lets the JS host learn
+ * which model files/URLs to fetch and bundle (or ship out of band)
+ * before driving OCR. There is intentionally no `prefetchModels` in
+ * the WASM surface (see `prefetchAvailable`, which always returns
+ * `false`).
+ */
+export function modelManifest(): string;
 /**
  * Plan a bookmark split without producing PDFs. Returns a JSON array
  * of segment objects (`index, startPage…` shape from
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
  */
 export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
+/**
+ * #519: Whether this build can download OCR models to a local cache.
+ * Always `false` in WASM — provisioning is host-side (see
+ * `modelManifest`).
+ */
+export function prefetchAvailable(): boolean;
 /**
  * Install the process-wide runtime crypto policy from its grammar
  * string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed: