pdf-oxide-wasm 0.3.50 → 0.3.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundler/pdf_oxide.d.ts +66 -9
- package/bundler/pdf_oxide.js +1 -1
- package/bundler/pdf_oxide_bg.js +205 -30
- package/bundler/pdf_oxide_bg.wasm +0 -0
- package/bundler/pdf_oxide_bg.wasm.d.ts +6 -0
- package/nodejs/pdf_oxide.d.ts +66 -9
- package/nodejs/pdf_oxide.js +207 -30
- package/nodejs/pdf_oxide_bg.wasm +0 -0
- package/nodejs/pdf_oxide_bg.wasm.d.ts +6 -0
- package/package.json +1 -1
- package/web/pdf_oxide.d.ts +72 -9
- package/web/pdf_oxide.js +205 -30
- package/web/pdf_oxide_bg.wasm +0 -0
- package/web/pdf_oxide_bg.wasm.d.ts +6 -0
package/bundler/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
|
|
|
815
829
|
* @returns true if authentication succeeded
|
|
816
830
|
*/
|
|
817
831
|
authenticate(password: string): boolean;
|
|
832
|
+
/**
|
|
833
|
+
* Cheap per-page text-vs-OCR classification → JSON
|
|
834
|
+
* `DocumentClassification`.
|
|
835
|
+
*/
|
|
836
|
+
classifyDocument(): string;
|
|
837
|
+
/**
|
|
838
|
+
* Cheap per-page classification → JSON `PageClassification`.
|
|
839
|
+
*/
|
|
840
|
+
classifyPage(page_index: number): string;
|
|
818
841
|
/**
|
|
819
842
|
* Clear all pending erase operations for a page.
|
|
820
843
|
*/
|
|
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
|
|
|
934
957
|
* @returns Array of path objects
|
|
935
958
|
*/
|
|
936
959
|
extractLines(page_index: number, region?: Float32Array | null): any;
|
|
960
|
+
/**
|
|
961
|
+
* Rich per-page extraction → JSON `PageExtraction` (per-region
|
|
962
|
+
* bbox + typed reason). `optionsJson` is `{}`-tolerant
|
|
963
|
+
* `AutoExtractOptions`; undefined/empty → defaults.
|
|
964
|
+
*/
|
|
965
|
+
extractPageAuto(page_index: number, options_json?: string | null): string;
|
|
937
966
|
/**
|
|
938
967
|
* Extract complete page text data in a single call.
|
|
939
968
|
*
|
|
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
|
|
|
990
1019
|
* @param region - Optional [x, y, width, height] to filter by
|
|
991
1020
|
*/
|
|
992
1021
|
extractText(page_index: number, region: any): string;
|
|
1022
|
+
/**
|
|
1023
|
+
* One-shot auto text extraction — graceful native fallback (never
|
|
1024
|
+
* the opaque OCR error #513).
|
|
1025
|
+
*/
|
|
1026
|
+
extractTextAuto(page_index: number): string;
|
|
993
1027
|
/**
|
|
994
1028
|
* Extract text lines from a page.
|
|
995
1029
|
*
|
|
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
997
1031
|
*/
|
|
998
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
999
1033
|
/**
|
|
1000
|
-
* Extract text using OCR
|
|
1001
|
-
*
|
|
1002
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1003
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1004
1036
|
*/
|
|
1005
|
-
extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1006
1038
|
/**
|
|
1007
1039
|
* Extract word-level data from a page.
|
|
1008
1040
|
*
|
|
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1425
1457
|
extractTextLines(): any;
|
|
1426
1458
|
/**
|
|
1427
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1428
1464
|
*/
|
|
1429
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1430
1466
|
/**
|
|
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
|
|
|
1600
1636
|
*/
|
|
1601
1637
|
export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
1602
1638
|
|
|
1639
|
+
/**
|
|
1640
|
+
* #519: Air-gapped OCR model manifest — JSON (detector + every
|
|
1641
|
+
* supported language's cache filenames and source URLs).
|
|
1642
|
+
*
|
|
1643
|
+
* WASM provisioning is **host-side**: browser/WASM has no filesystem
|
|
1644
|
+
* or network-to-disk, so a download-to-cache prefetch cannot run
|
|
1645
|
+
* here. This manifest is informational — it lets the JS host learn
|
|
1646
|
+
* which model files/URLs to fetch and bundle (or ship out of band)
|
|
1647
|
+
* before driving OCR. There is intentionally no `prefetchModels` in
|
|
1648
|
+
* the WASM surface (see `prefetchAvailable`, which always returns
|
|
1649
|
+
* `false`).
|
|
1650
|
+
*/
|
|
1651
|
+
export function modelManifest(): string;
|
|
1652
|
+
|
|
1603
1653
|
/**
|
|
1604
1654
|
* Plan a bookmark split without producing PDFs. Returns a JSON array
|
|
1605
1655
|
* of segment objects (`index, startPage…` shape from
|
|
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
|
1607
1657
|
*/
|
|
1608
1658
|
export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
|
|
1609
1659
|
|
|
1660
|
+
/**
|
|
1661
|
+
* #519: Whether this build can download OCR models to a local cache.
|
|
1662
|
+
* Always `false` in WASM — provisioning is host-side (see
|
|
1663
|
+
* `modelManifest`).
|
|
1664
|
+
*/
|
|
1665
|
+
export function prefetchAvailable(): boolean;
|
|
1666
|
+
|
|
1610
1667
|
/**
|
|
1611
1668
|
* Install the process-wide runtime crypto policy from its grammar
|
|
1612
1669
|
* string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
|
package/bundler/pdf_oxide.js
CHANGED
|
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./pdf_oxide_bg.js";
|
|
|
5
5
|
__wbg_set_wasm(wasm);
|
|
6
6
|
|
|
7
7
|
export {
|
|
8
|
-
Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, planSplitByBookmarks, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
|
|
8
|
+
Align, ArtifactStyle, Dss, PadesLevel, RevocationMaterial, StreamingTable, WasmArtifact, WasmCertificate, WasmDocumentBuilder, WasmEmbeddedFont, WasmFluentPageBuilder, WasmFooter, WasmHeader, WasmOcrConfig, WasmOcrEngine, WasmPageTemplate, WasmPdf, WasmPdfDocument, WasmPdfPageRegion, WasmSignature, WasmTimestamp, cryptoCbom, cryptoInventory, cryptoPolicy, disableLogging, generateBarcodeSvg, generateQrSvg, hasDocumentTimestamp, modelManifest, planSplitByBookmarks, prefetchAvailable, setCryptoPolicy, setLogLevel, signPdfBytes, signPdfBytesPades, splitByBookmarks
|
|
9
9
|
} from "./pdf_oxide_bg.js";
|
package/bundler/pdf_oxide_bg.js
CHANGED
|
@@ -2544,7 +2544,9 @@ export class WasmHeader {
|
|
|
2544
2544
|
if (Symbol.dispose) WasmHeader.prototype[Symbol.dispose] = WasmHeader.prototype.free;
|
|
2545
2545
|
|
|
2546
2546
|
/**
|
|
2547
|
-
* OCR configuration for WebAssembly.
|
|
2547
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
2548
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
2549
|
+
* matures, #524.)
|
|
2548
2550
|
*/
|
|
2549
2551
|
export class WasmOcrConfig {
|
|
2550
2552
|
__destroy_into_raw() {
|
|
@@ -2570,7 +2572,17 @@ export class WasmOcrConfig {
|
|
|
2570
2572
|
if (Symbol.dispose) WasmOcrConfig.prototype[Symbol.dispose] = WasmOcrConfig.prototype.free;
|
|
2571
2573
|
|
|
2572
2574
|
/**
|
|
2573
|
-
* OCR engine for WebAssembly.
|
|
2575
|
+
* OCR engine for WebAssembly (#524).
|
|
2576
|
+
*
|
|
2577
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
2578
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
2579
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
2580
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
2581
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
2582
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
2583
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
2584
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
2585
|
+
* explaining this).
|
|
2574
2586
|
*/
|
|
2575
2587
|
export class WasmOcrEngine {
|
|
2576
2588
|
__destroy_into_raw() {
|
|
@@ -2584,20 +2596,22 @@ export class WasmOcrEngine {
|
|
|
2584
2596
|
wasm.__wbg_wasmocrengine_free(ptr, 0);
|
|
2585
2597
|
}
|
|
2586
2598
|
/**
|
|
2587
|
-
*
|
|
2588
|
-
*
|
|
2589
|
-
*
|
|
2590
|
-
* @param {
|
|
2599
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
2600
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
2601
|
+
* `pdf-oxide-wasm` ships without it.
|
|
2602
|
+
* @param {Uint8Array} _det_model
|
|
2603
|
+
* @param {Uint8Array} _rec_model
|
|
2604
|
+
* @param {string} _dict
|
|
2591
2605
|
* @param {WasmOcrConfig | null} [_config]
|
|
2592
2606
|
*/
|
|
2593
|
-
constructor(
|
|
2607
|
+
constructor(_det_model, _rec_model, _dict, _config) {
|
|
2594
2608
|
try {
|
|
2595
2609
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
2596
|
-
const ptr0 =
|
|
2610
|
+
const ptr0 = passArray8ToWasm0(_det_model, wasm.__wbindgen_export);
|
|
2597
2611
|
const len0 = WASM_VECTOR_LEN;
|
|
2598
|
-
const ptr1 =
|
|
2612
|
+
const ptr1 = passArray8ToWasm0(_rec_model, wasm.__wbindgen_export);
|
|
2599
2613
|
const len1 = WASM_VECTOR_LEN;
|
|
2600
|
-
const ptr2 = passStringToWasm0(
|
|
2614
|
+
const ptr2 = passStringToWasm0(_dict, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
2601
2615
|
const len2 = WASM_VECTOR_LEN;
|
|
2602
2616
|
let ptr3 = 0;
|
|
2603
2617
|
if (!isLikeNone(_config)) {
|
|
@@ -3125,6 +3139,64 @@ export class WasmPdfDocument {
|
|
|
3125
3139
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3126
3140
|
}
|
|
3127
3141
|
}
|
|
3142
|
+
/**
|
|
3143
|
+
* Cheap per-page text-vs-OCR classification → JSON
|
|
3144
|
+
* `DocumentClassification`.
|
|
3145
|
+
* @returns {string}
|
|
3146
|
+
*/
|
|
3147
|
+
classifyDocument() {
|
|
3148
|
+
let deferred2_0;
|
|
3149
|
+
let deferred2_1;
|
|
3150
|
+
try {
|
|
3151
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3152
|
+
wasm.wasmpdfdocument_classifyDocument(retptr, this.__wbg_ptr);
|
|
3153
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3154
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3155
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3156
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3157
|
+
var ptr1 = r0;
|
|
3158
|
+
var len1 = r1;
|
|
3159
|
+
if (r3) {
|
|
3160
|
+
ptr1 = 0; len1 = 0;
|
|
3161
|
+
throw takeObject(r2);
|
|
3162
|
+
}
|
|
3163
|
+
deferred2_0 = ptr1;
|
|
3164
|
+
deferred2_1 = len1;
|
|
3165
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3166
|
+
} finally {
|
|
3167
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3168
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3169
|
+
}
|
|
3170
|
+
}
|
|
3171
|
+
/**
|
|
3172
|
+
* Cheap per-page classification → JSON `PageClassification`.
|
|
3173
|
+
* @param {number} page_index
|
|
3174
|
+
* @returns {string}
|
|
3175
|
+
*/
|
|
3176
|
+
classifyPage(page_index) {
|
|
3177
|
+
let deferred2_0;
|
|
3178
|
+
let deferred2_1;
|
|
3179
|
+
try {
|
|
3180
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3181
|
+
wasm.wasmpdfdocument_classifyPage(retptr, this.__wbg_ptr, page_index);
|
|
3182
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3183
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3184
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3185
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3186
|
+
var ptr1 = r0;
|
|
3187
|
+
var len1 = r1;
|
|
3188
|
+
if (r3) {
|
|
3189
|
+
ptr1 = 0; len1 = 0;
|
|
3190
|
+
throw takeObject(r2);
|
|
3191
|
+
}
|
|
3192
|
+
deferred2_0 = ptr1;
|
|
3193
|
+
deferred2_1 = len1;
|
|
3194
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3195
|
+
} finally {
|
|
3196
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3197
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3198
|
+
}
|
|
3199
|
+
}
|
|
3128
3200
|
/**
|
|
3129
3201
|
* Clear all pending erase operations for a page.
|
|
3130
3202
|
* @param {number} page_index
|
|
@@ -3553,6 +3625,40 @@ export class WasmPdfDocument {
|
|
|
3553
3625
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3554
3626
|
}
|
|
3555
3627
|
}
|
|
3628
|
+
/**
|
|
3629
|
+
* Rich per-page extraction → JSON `PageExtraction` (per-region
|
|
3630
|
+
* bbox + typed reason). `optionsJson` is `{}`-tolerant
|
|
3631
|
+
* `AutoExtractOptions`; undefined/empty → defaults.
|
|
3632
|
+
* @param {number} page_index
|
|
3633
|
+
* @param {string | null} [options_json]
|
|
3634
|
+
* @returns {string}
|
|
3635
|
+
*/
|
|
3636
|
+
extractPageAuto(page_index, options_json) {
|
|
3637
|
+
let deferred3_0;
|
|
3638
|
+
let deferred3_1;
|
|
3639
|
+
try {
|
|
3640
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3641
|
+
var ptr0 = isLikeNone(options_json) ? 0 : passStringToWasm0(options_json, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
3642
|
+
var len0 = WASM_VECTOR_LEN;
|
|
3643
|
+
wasm.wasmpdfdocument_extractPageAuto(retptr, this.__wbg_ptr, page_index, ptr0, len0);
|
|
3644
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3645
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3646
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3647
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3648
|
+
var ptr2 = r0;
|
|
3649
|
+
var len2 = r1;
|
|
3650
|
+
if (r3) {
|
|
3651
|
+
ptr2 = 0; len2 = 0;
|
|
3652
|
+
throw takeObject(r2);
|
|
3653
|
+
}
|
|
3654
|
+
deferred3_0 = ptr2;
|
|
3655
|
+
deferred3_1 = len2;
|
|
3656
|
+
return getStringFromWasm0(ptr2, len2);
|
|
3657
|
+
} finally {
|
|
3658
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3659
|
+
wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
|
|
3660
|
+
}
|
|
3661
|
+
}
|
|
3556
3662
|
/**
|
|
3557
3663
|
* Extract complete page text data in a single call.
|
|
3558
3664
|
*
|
|
@@ -3754,6 +3860,36 @@ export class WasmPdfDocument {
|
|
|
3754
3860
|
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3755
3861
|
}
|
|
3756
3862
|
}
|
|
3863
|
+
/**
|
|
3864
|
+
* One-shot auto text extraction — graceful native fallback (never
|
|
3865
|
+
* the opaque OCR error #513).
|
|
3866
|
+
* @param {number} page_index
|
|
3867
|
+
* @returns {string}
|
|
3868
|
+
*/
|
|
3869
|
+
extractTextAuto(page_index) {
|
|
3870
|
+
let deferred2_0;
|
|
3871
|
+
let deferred2_1;
|
|
3872
|
+
try {
|
|
3873
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3874
|
+
wasm.wasmpdfdocument_extractTextAuto(retptr, this.__wbg_ptr, page_index);
|
|
3875
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3876
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3877
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3878
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3879
|
+
var ptr1 = r0;
|
|
3880
|
+
var len1 = r1;
|
|
3881
|
+
if (r3) {
|
|
3882
|
+
ptr1 = 0; len1 = 0;
|
|
3883
|
+
throw takeObject(r2);
|
|
3884
|
+
}
|
|
3885
|
+
deferred2_0 = ptr1;
|
|
3886
|
+
deferred2_1 = len1;
|
|
3887
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3888
|
+
} finally {
|
|
3889
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3890
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3891
|
+
}
|
|
3892
|
+
}
|
|
3757
3893
|
/**
|
|
3758
3894
|
* Extract text lines from a page.
|
|
3759
3895
|
*
|
|
@@ -3780,41 +3916,35 @@ export class WasmPdfDocument {
|
|
|
3780
3916
|
}
|
|
3781
3917
|
}
|
|
3782
3918
|
/**
|
|
3783
|
-
* Extract text using OCR
|
|
3784
|
-
*
|
|
3785
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
3786
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
3919
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
3920
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
3787
3921
|
* @param {number} _page_index
|
|
3788
|
-
* @param {WasmOcrEngine | null} [_engine]
|
|
3922
|
+
* @param {WasmOcrEngine} _engine
|
|
3789
3923
|
* @returns {string}
|
|
3790
3924
|
*/
|
|
3791
3925
|
extractTextOcr(_page_index, _engine) {
|
|
3792
|
-
let deferred3_0;
|
|
3793
|
-
let deferred3_1;
|
|
3926
|
+
let deferred2_0;
|
|
3927
|
+
let deferred2_1;
|
|
3794
3928
|
try {
|
|
3795
3929
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
3796
|
-
|
|
3797
|
-
|
|
3798
|
-
_assertClass(_engine, WasmOcrEngine);
|
|
3799
|
-
ptr0 = _engine.__destroy_into_raw();
|
|
3800
|
-
}
|
|
3801
|
-
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, ptr0);
|
|
3930
|
+
_assertClass(_engine, WasmOcrEngine);
|
|
3931
|
+
wasm.wasmpdfdocument_extractTextOcr(retptr, this.__wbg_ptr, _page_index, _engine.__wbg_ptr);
|
|
3802
3932
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
3803
3933
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
3804
3934
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
3805
3935
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
3806
|
-
var ptr2 = r0;
|
|
3807
|
-
var len2 = r1;
|
|
3936
|
+
var ptr1 = r0;
|
|
3937
|
+
var len1 = r1;
|
|
3808
3938
|
if (r3) {
|
|
3809
|
-
|
|
3939
|
+
ptr1 = 0; len1 = 0;
|
|
3810
3940
|
throw takeObject(r2);
|
|
3811
3941
|
}
|
|
3812
|
-
|
|
3813
|
-
|
|
3814
|
-
return getStringFromWasm0(ptr2, len2);
|
|
3942
|
+
deferred2_0 = ptr1;
|
|
3943
|
+
deferred2_1 = len1;
|
|
3944
|
+
return getStringFromWasm0(ptr1, len1);
|
|
3815
3945
|
} finally {
|
|
3816
3946
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
3817
|
-
wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
|
|
3947
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
3818
3948
|
}
|
|
3819
3949
|
}
|
|
3820
3950
|
/**
|
|
@@ -5573,6 +5703,10 @@ export class WasmPdfPageRegion {
|
|
|
5573
5703
|
}
|
|
5574
5704
|
/**
|
|
5575
5705
|
* Extract text using OCR from this region.
|
|
5706
|
+
*
|
|
5707
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
5708
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
5709
|
+
* (#524 follow-up).
|
|
5576
5710
|
* @param {WasmOcrEngine | null} [_engine]
|
|
5577
5711
|
* @returns {string}
|
|
5578
5712
|
*/
|
|
@@ -6153,6 +6287,36 @@ export function hasDocumentTimestamp(pdf_data) {
|
|
|
6153
6287
|
return ret !== 0;
|
|
6154
6288
|
}
|
|
6155
6289
|
|
|
6290
|
+
/**
|
|
6291
|
+
* #519: Air-gapped OCR model manifest — JSON (detector + every
|
|
6292
|
+
* supported language's cache filenames and source URLs).
|
|
6293
|
+
*
|
|
6294
|
+
* WASM provisioning is **host-side**: browser/WASM has no filesystem
|
|
6295
|
+
* or network-to-disk, so a download-to-cache prefetch cannot run
|
|
6296
|
+
* here. This manifest is informational — it lets the JS host learn
|
|
6297
|
+
* which model files/URLs to fetch and bundle (or ship out of band)
|
|
6298
|
+
* before driving OCR. There is intentionally no `prefetchModels` in
|
|
6299
|
+
* the WASM surface (see `prefetchAvailable`, which always returns
|
|
6300
|
+
* `false`).
|
|
6301
|
+
* @returns {string}
|
|
6302
|
+
*/
|
|
6303
|
+
export function modelManifest() {
|
|
6304
|
+
let deferred1_0;
|
|
6305
|
+
let deferred1_1;
|
|
6306
|
+
try {
|
|
6307
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
6308
|
+
wasm.modelManifest(retptr);
|
|
6309
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
6310
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
6311
|
+
deferred1_0 = r0;
|
|
6312
|
+
deferred1_1 = r1;
|
|
6313
|
+
return getStringFromWasm0(r0, r1);
|
|
6314
|
+
} finally {
|
|
6315
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
6316
|
+
wasm.__wbindgen_export4(deferred1_0, deferred1_1, 1);
|
|
6317
|
+
}
|
|
6318
|
+
}
|
|
6319
|
+
|
|
6156
6320
|
/**
|
|
6157
6321
|
* Plan a bookmark split without producing PDFs. Returns a JSON array
|
|
6158
6322
|
* of segment objects (`index, startPage…` shape from
|
|
@@ -6184,6 +6348,17 @@ export function planSplitByBookmarks(src_bytes, title_prefix, ignore_case, level
|
|
|
6184
6348
|
}
|
|
6185
6349
|
}
|
|
6186
6350
|
|
|
6351
|
+
/**
|
|
6352
|
+
* #519: Whether this build can download OCR models to a local cache.
|
|
6353
|
+
* Always `false` in WASM — provisioning is host-side (see
|
|
6354
|
+
* `modelManifest`).
|
|
6355
|
+
* @returns {boolean}
|
|
6356
|
+
*/
|
|
6357
|
+
export function prefetchAvailable() {
|
|
6358
|
+
const ret = wasm.prefetchAvailable();
|
|
6359
|
+
return ret !== 0;
|
|
6360
|
+
}
|
|
6361
|
+
|
|
6187
6362
|
/**
|
|
6188
6363
|
* Install the process-wide runtime crypto policy from its grammar
|
|
6189
6364
|
* string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
|
|
package/bundler/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|
package/bundler/pdf_oxide_bg.wasm.d.ts
CHANGED
|
@@ -23,7 +23,9 @@ export const cryptoPolicy: (a: number) => void;
|
|
|
23
23
|
export const generateBarcodeSvg: (a: number, b: number, c: number, d: number) => void;
|
|
24
24
|
export const generateQrSvg: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
25
25
|
export const hasDocumentTimestamp: (a: number, b: number) => number;
|
|
26
|
+
export const modelManifest: (a: number) => void;
|
|
26
27
|
export const planSplitByBookmarks: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
28
|
+
export const prefetchAvailable: () => number;
|
|
27
29
|
export const setCryptoPolicy: (a: number, b: number, c: number) => void;
|
|
28
30
|
export const setLogLevel: (a: number, b: number, c: number) => void;
|
|
29
31
|
export const signPdfBytes: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => void;
|
|
@@ -158,6 +160,8 @@ export const wasmpdfdocument_applyAllRedactions: (a: number, b: number) => void;
|
|
|
158
160
|
export const wasmpdfdocument_applyPageRedactions: (a: number, b: number, c: number) => void;
|
|
159
161
|
export const wasmpdfdocument_applyRedactionsDestructive: (a: number, b: number, c: number) => void;
|
|
160
162
|
export const wasmpdfdocument_authenticate: (a: number, b: number, c: number, d: number) => void;
|
|
163
|
+
export const wasmpdfdocument_classifyDocument: (a: number, b: number) => void;
|
|
164
|
+
export const wasmpdfdocument_classifyPage: (a: number, b: number, c: number) => void;
|
|
161
165
|
export const wasmpdfdocument_clearEraseRegions: (a: number, b: number, c: number) => void;
|
|
162
166
|
export const wasmpdfdocument_convertToPdfA: (a: number, b: number, c: number, d: number) => void;
|
|
163
167
|
export const wasmpdfdocument_cropMargins: (a: number, b: number, c: number, d: number, e: number, f: number) => void;
|
|
@@ -177,6 +181,7 @@ export const wasmpdfdocument_extractChars: (a: number, b: number, c: number, d:
|
|
|
177
181
|
export const wasmpdfdocument_extractImageBytes: (a: number, b: number, c: number) => void;
|
|
178
182
|
export const wasmpdfdocument_extractImages: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
179
183
|
export const wasmpdfdocument_extractLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
184
|
+
export const wasmpdfdocument_extractPageAuto: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
180
185
|
export const wasmpdfdocument_extractPageText: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
181
186
|
export const wasmpdfdocument_extractPages: (a: number, b: number, c: number, d: number) => void;
|
|
182
187
|
export const wasmpdfdocument_extractPaths: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
@@ -184,6 +189,7 @@ export const wasmpdfdocument_extractRects: (a: number, b: number, c: number, d:
|
|
|
184
189
|
export const wasmpdfdocument_extractSpans: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
|
|
185
190
|
export const wasmpdfdocument_extractTables: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
186
191
|
export const wasmpdfdocument_extractText: (a: number, b: number, c: number, d: number) => void;
|
|
192
|
+
export const wasmpdfdocument_extractTextAuto: (a: number, b: number, c: number) => void;
|
|
187
193
|
export const wasmpdfdocument_extractTextLines: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
188
194
|
export const wasmpdfdocument_extractTextOcr: (a: number, b: number, c: number, d: number) => void;
|
|
189
195
|
export const wasmpdfdocument_extractWords: (a: number, b: number, c: number, d: number, e: number) => void;
|
package/nodejs/pdf_oxide.d.ts
CHANGED
|
@@ -644,7 +644,9 @@ export class WasmHeader {
|
|
|
644
644
|
}
|
|
645
645
|
|
|
646
646
|
/**
|
|
647
|
-
* OCR configuration for WebAssembly.
|
|
647
|
+
* OCR configuration for WebAssembly. (Currently a marker — the engine
|
|
648
|
+
* uses tuned defaults; knobs are exposed as the WASM OCR surface
|
|
649
|
+
* matures, #524.)
|
|
648
650
|
*/
|
|
649
651
|
export class WasmOcrConfig {
|
|
650
652
|
free(): void;
|
|
@@ -656,15 +658,27 @@ export class WasmOcrConfig {
|
|
|
656
658
|
}
|
|
657
659
|
|
|
658
660
|
/**
|
|
659
|
-
* OCR engine for WebAssembly.
|
|
661
|
+
* OCR engine for WebAssembly (#524).
|
|
662
|
+
*
|
|
663
|
+
* OCR runs entirely in-WASM via the pure-Rust `tract` backend — no
|
|
664
|
+
* native ONNX Runtime, no JS bridge. Model **delivery is host-side**:
|
|
665
|
+
* the browser/Deno/edge host fetches the detector + recognizer ONNX
|
|
666
|
+
* files and the char dictionary (see `modelManifest()` for the URLs)
|
|
667
|
+
* — typically `fetch()` + the Cache API / IndexedDB for the
|
|
668
|
+
* tens-of-MB models — then hands the bytes to the constructor. This
|
|
669
|
+
* only works in the `wasm-ocr` build of `pdf-oxide`; the default
|
|
670
|
+
* `pdf-oxide-wasm` has no OCR (the constructor returns an error
|
|
671
|
+
* explaining this).
|
|
660
672
|
*/
|
|
661
673
|
export class WasmOcrEngine {
|
|
662
674
|
free(): void;
|
|
663
675
|
[Symbol.dispose](): void;
|
|
664
676
|
/**
|
|
665
|
-
*
|
|
677
|
+
* Not available in this build. OCR needs the `wasm-ocr` build of
|
|
678
|
+
* `pdf-oxide` (the pure-Rust tract backend); the default
|
|
679
|
+
* `pdf-oxide-wasm` ships without it.
|
|
666
680
|
*/
|
|
667
|
-
constructor(
|
|
681
|
+
constructor(_det_model: Uint8Array, _rec_model: Uint8Array, _dict: string, _config?: WasmOcrConfig | null);
|
|
668
682
|
}
|
|
669
683
|
|
|
670
684
|
/**
|
|
@@ -815,6 +829,15 @@ export class WasmPdfDocument {
|
|
|
815
829
|
* @returns true if authentication succeeded
|
|
816
830
|
*/
|
|
817
831
|
authenticate(password: string): boolean;
|
|
832
|
+
/**
|
|
833
|
+
* Cheap per-page text-vs-OCR classification → JSON
|
|
834
|
+
* `DocumentClassification`.
|
|
835
|
+
*/
|
|
836
|
+
classifyDocument(): string;
|
|
837
|
+
/**
|
|
838
|
+
* Cheap per-page classification → JSON `PageClassification`.
|
|
839
|
+
*/
|
|
840
|
+
classifyPage(page_index: number): string;
|
|
818
841
|
/**
|
|
819
842
|
* Clear all pending erase operations for a page.
|
|
820
843
|
*/
|
|
@@ -934,6 +957,12 @@ export class WasmPdfDocument {
|
|
|
934
957
|
* @returns Array of path objects
|
|
935
958
|
*/
|
|
936
959
|
extractLines(page_index: number, region?: Float32Array | null): any;
|
|
960
|
+
/**
|
|
961
|
+
* Rich per-page extraction → JSON `PageExtraction` (per-region
|
|
962
|
+
* bbox + typed reason). `optionsJson` is `{}`-tolerant
|
|
963
|
+
* `AutoExtractOptions`; undefined/empty → defaults.
|
|
964
|
+
*/
|
|
965
|
+
extractPageAuto(page_index: number, options_json?: string | null): string;
|
|
937
966
|
/**
|
|
938
967
|
* Extract complete page text data in a single call.
|
|
939
968
|
*
|
|
@@ -990,6 +1019,11 @@ export class WasmPdfDocument {
|
|
|
990
1019
|
* @param region - Optional [x, y, width, height] to filter by
|
|
991
1020
|
*/
|
|
992
1021
|
extractText(page_index: number, region: any): string;
|
|
1022
|
+
/**
|
|
1023
|
+
* One-shot auto text extraction — graceful native fallback (never
|
|
1024
|
+
* the opaque OCR error #513).
|
|
1025
|
+
*/
|
|
1026
|
+
extractTextAuto(page_index: number): string;
|
|
993
1027
|
/**
|
|
994
1028
|
* Extract text lines from a page.
|
|
995
1029
|
*
|
|
@@ -997,12 +1031,10 @@ export class WasmPdfDocument {
|
|
|
997
1031
|
*/
|
|
998
1032
|
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
999
1033
|
/**
|
|
1000
|
-
* Extract text using OCR
|
|
1001
|
-
*
|
|
1002
|
-
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
1003
|
-
* ONNX Runtime support for the web backend in the current implementation.
|
|
1034
|
+
* Extract text using OCR. Not available in this build — OCR needs
|
|
1035
|
+
* the `wasm-ocr` build of `pdf-oxide`.
|
|
1004
1036
|
*/
|
|
1005
|
-
extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
|
|
1037
|
+
extractTextOcr(_page_index: number, _engine: WasmOcrEngine): string;
|
|
1006
1038
|
/**
|
|
1007
1039
|
* Extract word-level data from a page.
|
|
1008
1040
|
*
|
|
@@ -1425,6 +1457,10 @@ export class WasmPdfPageRegion {
|
|
|
1425
1457
|
extractTextLines(): any;
|
|
1426
1458
|
/**
|
|
1427
1459
|
* Extract text using OCR from this region.
|
|
1460
|
+
*
|
|
1461
|
+
* Region-scoped OCR is not wired yet; use the page-level
|
|
1462
|
+
* `WasmPdfDocument.extractTextOcr(pageIndex, engine)` for now
|
|
1463
|
+
* (#524 follow-up).
|
|
1428
1464
|
*/
|
|
1429
1465
|
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
1430
1466
|
/**
|
|
@@ -1600,6 +1636,20 @@ export function generateQrSvg(data: string, error_correction: number, size: numb
|
|
|
1600
1636
|
*/
|
|
1601
1637
|
export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
1602
1638
|
|
|
1639
|
+
/**
|
|
1640
|
+
* #519: Air-gapped OCR model manifest — JSON (detector + every
|
|
1641
|
+
* supported language's cache filenames and source URLs).
|
|
1642
|
+
*
|
|
1643
|
+
* WASM provisioning is **host-side**: browser/WASM has no filesystem
|
|
1644
|
+
* or network-to-disk, so a download-to-cache prefetch cannot run
|
|
1645
|
+
* here. This manifest is informational — it lets the JS host learn
|
|
1646
|
+
* which model files/URLs to fetch and bundle (or ship out of band)
|
|
1647
|
+
* before driving OCR. There is intentionally no `prefetchModels` in
|
|
1648
|
+
* the WASM surface (see `prefetchAvailable`, which always returns
|
|
1649
|
+
* `false`).
|
|
1650
|
+
*/
|
|
1651
|
+
export function modelManifest(): string;
|
|
1652
|
+
|
|
1603
1653
|
/**
|
|
1604
1654
|
* Plan a bookmark split without producing PDFs. Returns a JSON array
|
|
1605
1655
|
* of segment objects (`index, startPage…` shape from
|
|
@@ -1607,6 +1657,13 @@ export function hasDocumentTimestamp(pdf_data: Uint8Array): boolean;
|
|
|
1607
1657
|
*/
|
|
1608
1658
|
export function planSplitByBookmarks(src_bytes: Uint8Array, title_prefix: string | null | undefined, ignore_case: boolean, level: number, include_front_matter: boolean): any;
|
|
1609
1659
|
|
|
1660
|
+
/**
|
|
1661
|
+
* #519: Whether this build can download OCR models to a local cache.
|
|
1662
|
+
* Always `false` in WASM — provisioning is host-side (see
|
|
1663
|
+
* `modelManifest`).
|
|
1664
|
+
*/
|
|
1665
|
+
export function prefetchAvailable(): boolean;
|
|
1666
|
+
|
|
1610
1667
|
/**
|
|
1611
1668
|
* Install the process-wide runtime crypto policy from its grammar
|
|
1612
1669
|
* string (`"compat"|"strict"|"fips-strict"[;…]`). Fail-closed:
|