pdf-oxide-wasm 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pdf_oxide.d.ts +245 -12
- package/pdf_oxide.js +1096 -45
- package/pdf_oxide_bg.wasm +0 -0
- package/pdf_oxide_bg.wasm.d.ts +61 -6
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-oxide-wasm",
|
|
3
|
-
"version": "0.3.13",
|
|
3
|
+
"version": "0.3.15",
|
|
4
4
|
"description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
|
|
5
5
|
"license": "MIT OR Apache-2.0",
|
|
6
6
|
"repository": {
|
package/pdf_oxide.d.ts
CHANGED
|
@@ -1,6 +1,86 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
3
|
|
|
4
|
+
/**
|
|
5
|
+
* Style configuration for header/footer text.
|
|
6
|
+
*/
|
|
7
|
+
export class ArtifactStyle {
|
|
8
|
+
free(): void;
|
|
9
|
+
[Symbol.dispose](): void;
|
|
10
|
+
bold(): ArtifactStyle;
|
|
11
|
+
color(r: number, g: number, b: number): ArtifactStyle;
|
|
12
|
+
font(name: string, size: number): ArtifactStyle;
|
|
13
|
+
constructor();
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* A header or footer artifact definition.
|
|
18
|
+
*/
|
|
19
|
+
export class WasmArtifact {
|
|
20
|
+
free(): void;
|
|
21
|
+
[Symbol.dispose](): void;
|
|
22
|
+
static center(text: string): WasmArtifact;
|
|
23
|
+
static left(text: string): WasmArtifact;
|
|
24
|
+
constructor();
|
|
25
|
+
static right(text: string): WasmArtifact;
|
|
26
|
+
withOffset(offset: number): WasmArtifact;
|
|
27
|
+
withStyle(style: ArtifactStyle): WasmArtifact;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* A footer definition.
|
|
32
|
+
*/
|
|
33
|
+
export class WasmFooter {
|
|
34
|
+
free(): void;
|
|
35
|
+
[Symbol.dispose](): void;
|
|
36
|
+
static center(text: string): WasmFooter;
|
|
37
|
+
static left(text: string): WasmFooter;
|
|
38
|
+
constructor();
|
|
39
|
+
static right(text: string): WasmFooter;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* A header definition.
|
|
44
|
+
*/
|
|
45
|
+
export class WasmHeader {
|
|
46
|
+
free(): void;
|
|
47
|
+
[Symbol.dispose](): void;
|
|
48
|
+
static center(text: string): WasmHeader;
|
|
49
|
+
static left(text: string): WasmHeader;
|
|
50
|
+
constructor();
|
|
51
|
+
static right(text: string): WasmHeader;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* OCR configuration for WebAssembly.
|
|
56
|
+
*/
|
|
57
|
+
export class WasmOcrConfig {
|
|
58
|
+
free(): void;
|
|
59
|
+
[Symbol.dispose](): void;
|
|
60
|
+
constructor();
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* OCR engine for WebAssembly.
|
|
65
|
+
*/
|
|
66
|
+
export class WasmOcrEngine {
|
|
67
|
+
free(): void;
|
|
68
|
+
[Symbol.dispose](): void;
|
|
69
|
+
constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* A complete page template with header and footer.
|
|
74
|
+
*/
|
|
75
|
+
export class WasmPageTemplate {
|
|
76
|
+
free(): void;
|
|
77
|
+
[Symbol.dispose](): void;
|
|
78
|
+
footer(footer: WasmArtifact): WasmPageTemplate;
|
|
79
|
+
header(header: WasmArtifact): WasmPageTemplate;
|
|
80
|
+
constructor();
|
|
81
|
+
skipFirstPage(): WasmPageTemplate;
|
|
82
|
+
}
|
|
83
|
+
|
|
4
84
|
/**
|
|
5
85
|
* Create new PDF documents from Markdown, HTML, or plain text.
|
|
6
86
|
*
|
|
@@ -101,6 +181,28 @@ export class WasmPdfDocument {
|
|
|
101
181
|
* @param data - File contents as a Uint8Array
|
|
102
182
|
*/
|
|
103
183
|
embedFile(name: string, data: Uint8Array): void;
|
|
184
|
+
/**
|
|
185
|
+
* Erase both header and footer content.
|
|
186
|
+
*
|
|
187
|
+
* @param page_index - Zero-based page number
|
|
188
|
+
*/
|
|
189
|
+
eraseArtifacts(page_index: number): void;
|
|
190
|
+
/**
|
|
191
|
+
* Erase existing footer content.
|
|
192
|
+
*
|
|
193
|
+
* Identifies existing text in the footer area (bottom 15%) and marks it for erasure.
|
|
194
|
+
*
|
|
195
|
+
* @param page_index - Zero-based page number
|
|
196
|
+
*/
|
|
197
|
+
eraseFooter(page_index: number): void;
|
|
198
|
+
/**
|
|
199
|
+
* Erase existing header content.
|
|
200
|
+
*
|
|
201
|
+
* Identifies existing text in the header area (top 15%) and marks it for erasure.
|
|
202
|
+
*
|
|
203
|
+
* @param page_index - Zero-based page number
|
|
204
|
+
*/
|
|
205
|
+
eraseHeader(page_index: number): void;
|
|
104
206
|
/**
|
|
105
207
|
* Erase (whiteout) a rectangular region on a page.
|
|
106
208
|
*/
|
|
@@ -128,8 +230,11 @@ export class WasmPdfDocument {
|
|
|
128
230
|
*
|
|
129
231
|
* Returns an array of objects with: char, bbox {x, y, width, height},
|
|
130
232
|
* font_name, font_size, font_weight, is_italic, color {r, g, b}, etc.
|
|
233
|
+
*
|
|
234
|
+
* @param page_index - Zero-based page number
|
|
235
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
131
236
|
*/
|
|
132
|
-
extractChars(page_index: number): any;
|
|
237
|
+
extractChars(page_index: number, region?: Float32Array | null): any;
|
|
133
238
|
/**
|
|
134
239
|
* Extract image bytes from a page as PNG data.
|
|
135
240
|
*
|
|
@@ -141,28 +246,80 @@ export class WasmPdfDocument {
|
|
|
141
246
|
*
|
|
142
247
|
* Returns an array of objects with: width, height, color_space,
|
|
143
248
|
* bits_per_component, bbox (if available). Does NOT return raw image bytes.
|
|
249
|
+
*
|
|
250
|
+
* @param page_index - Zero-based page number
|
|
251
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
252
|
+
*/
|
|
253
|
+
extractImages(page_index: number, region?: Float32Array | null): any;
|
|
254
|
+
/**
|
|
255
|
+
* Extract only straight lines from a page (v0.3.14).
|
|
256
|
+
*
|
|
257
|
+
* Identifies paths that form a single straight line segment.
|
|
258
|
+
*
|
|
259
|
+
* @param page_index - Zero-based page number
|
|
260
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
261
|
+
* @returns Array of path objects
|
|
144
262
|
*/
|
|
145
|
-
extractImages(page_index: number): any;
|
|
263
|
+
extractLines(page_index: number, region?: Float32Array | null): any;
|
|
146
264
|
/**
|
|
147
265
|
* Extract vector paths (lines, curves, shapes) from a page.
|
|
148
266
|
*
|
|
149
267
|
* @param page_index - Zero-based page number
|
|
268
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
150
269
|
* @returns Array of path objects with bbox, stroke_color, fill_color, etc.
|
|
151
270
|
*/
|
|
152
|
-
extractPaths(page_index: number): any;
|
|
271
|
+
extractPaths(page_index: number, region?: Float32Array | null): any;
|
|
272
|
+
/**
|
|
273
|
+
* Extract only rectangles from a page (v0.3.14).
|
|
274
|
+
*
|
|
275
|
+
* Identifies paths that form axis-aligned rectangles.
|
|
276
|
+
*
|
|
277
|
+
* @param page_index - Zero-based page number
|
|
278
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
279
|
+
* @returns Array of path objects
|
|
280
|
+
*/
|
|
281
|
+
extractRects(page_index: number, region?: Float32Array | null): any;
|
|
153
282
|
/**
|
|
154
283
|
* Extract span-level data from a page.
|
|
155
284
|
*
|
|
156
285
|
* Returns an array of objects with: text, bbox, font_name, font_size,
|
|
157
286
|
* font_weight, is_italic, color, etc.
|
|
158
287
|
*/
|
|
159
|
-
extractSpans(page_index: number): any;
|
|
288
|
+
extractSpans(page_index: number, region?: Float32Array | null): any;
|
|
289
|
+
/**
|
|
290
|
+
* Extract tables from a page (v0.3.14).
|
|
291
|
+
*
|
|
292
|
+
* @param page_index - Zero-based page number
|
|
293
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
294
|
+
*/
|
|
295
|
+
extractTables(page_index: number, region?: Float32Array | null): any;
|
|
160
296
|
/**
|
|
161
297
|
* Extract plain text from a single page.
|
|
162
298
|
*
|
|
163
299
|
* @param page_index - Zero-based page number
|
|
300
|
+
* @param region - Optional [x, y, width, height] to filter by
|
|
301
|
+
*/
|
|
302
|
+
extractText(page_index: number, region?: Float32Array | null): string;
|
|
303
|
+
/**
|
|
304
|
+
* Extract text lines from a page.
|
|
305
|
+
*
|
|
306
|
+
* Returns an array of objects with: text, bbox, words (array of Word objects).
|
|
307
|
+
*/
|
|
308
|
+
extractTextLines(page_index: number, region?: Float32Array | null): any;
|
|
309
|
+
/**
|
|
310
|
+
* Extract text using OCR (optical character recognition).
|
|
311
|
+
*
|
|
312
|
+
* NOTE: OCR is not yet supported in the WebAssembly build due to missing
|
|
313
|
+
* ONNX Runtime support for the web backend in the current implementation.
|
|
164
314
|
*/
|
|
165
|
-
extractText(page_index: number): string;
|
|
315
|
+
extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
|
|
316
|
+
/**
|
|
317
|
+
* Extract word-level data from a page.
|
|
318
|
+
*
|
|
319
|
+
* Returns an array of objects with: text, bbox, font_name, font_size,
|
|
320
|
+
* font_weight, is_italic, is_bold.
|
|
321
|
+
*/
|
|
322
|
+
extractWords(page_index: number, region?: Float32Array | null): any;
|
|
166
323
|
/**
|
|
167
324
|
* Flatten all annotations in the document into page content.
|
|
168
325
|
*/
|
|
@@ -271,6 +428,33 @@ export class WasmPdfDocument {
|
|
|
271
428
|
* Get the rotation of a page in degrees (0, 90, 180, 270).
|
|
272
429
|
*/
|
|
273
430
|
pageRotation(page_index: number): number;
|
|
431
|
+
/**
|
|
432
|
+
* Identify and remove both headers and footers.
|
|
433
|
+
*
|
|
434
|
+
* Prioritizes ISO 32000 spec-compliant /Artifact tags, with a heuristic
|
|
435
|
+
* fallback for untagged PDFs.
|
|
436
|
+
*
|
|
437
|
+
* @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
|
|
438
|
+
*/
|
|
439
|
+
removeArtifacts(threshold: number): number;
|
|
440
|
+
/**
|
|
441
|
+
* Identify and remove footers.
|
|
442
|
+
*
|
|
443
|
+
* Uses spec-compliant /Artifact tags when available (100% accuracy), or
|
|
444
|
+
* falls back to heuristic analysis of the bottom 15% of pages.
|
|
445
|
+
*
|
|
446
|
+
* @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
|
|
447
|
+
*/
|
|
448
|
+
removeFooters(threshold: number): number;
|
|
449
|
+
/**
|
|
450
|
+
* Identify and remove headers.
|
|
451
|
+
*
|
|
452
|
+
* Uses spec-compliant /Artifact tags when available (100% accuracy), or
|
|
453
|
+
* falls back to heuristic analysis of the top 15% of pages.
|
|
454
|
+
*
|
|
455
|
+
* @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
|
|
456
|
+
*/
|
|
457
|
+
removeHeaders(threshold: number): number;
|
|
274
458
|
/**
|
|
275
459
|
* Reposition an image on a page.
|
|
276
460
|
*/
|
|
@@ -289,13 +473,6 @@ export class WasmPdfDocument {
|
|
|
289
473
|
rotatePage(page_index: number, degrees: number): void;
|
|
290
474
|
/**
|
|
291
475
|
* Save with encryption and return the resulting PDF as bytes.
|
|
292
|
-
*
|
|
293
|
-
* @param user_password - Password required to open the document
|
|
294
|
-
* @param owner_password - Password for full access (defaults to user_password)
|
|
295
|
-
* @param allow_print - Allow printing (default: true)
|
|
296
|
-
* @param allow_copy - Allow copying text (default: true)
|
|
297
|
-
* @param allow_modify - Allow modifying (default: true)
|
|
298
|
-
* @param allow_annotate - Allow annotations (default: true)
|
|
299
476
|
*/
|
|
300
477
|
saveEncryptedToBytes(user_password: string, owner_password?: string | null, allow_print?: boolean | null, allow_copy?: boolean | null, allow_modify?: boolean | null, allow_annotate?: boolean | null): Uint8Array;
|
|
301
478
|
/**
|
|
@@ -395,6 +572,13 @@ export class WasmPdfDocument {
|
|
|
395
572
|
* Get the PDF version as [major, minor].
|
|
396
573
|
*/
|
|
397
574
|
version(): Uint8Array;
|
|
575
|
+
/**
|
|
576
|
+
* Focus extraction on a specific rectangular region of a page (v0.3.14).
|
|
577
|
+
*
|
|
578
|
+
* @param page_index - Zero-based page number
|
|
579
|
+
* @param region - [x, y, width, height] in points
|
|
580
|
+
*/
|
|
581
|
+
within(page_index: number, region: Float32Array): WasmPdfPageRegion;
|
|
398
582
|
/**
|
|
399
583
|
* Get XMP metadata from the document.
|
|
400
584
|
*
|
|
@@ -402,3 +586,52 @@ export class WasmPdfDocument {
|
|
|
402
586
|
*/
|
|
403
587
|
xmpMetadata(): any;
|
|
404
588
|
}
|
|
589
|
+
|
|
590
|
+
/**
|
|
591
|
+
* A focused view of a PDF page region for scoped extraction (v0.3.14).
|
|
592
|
+
*/
|
|
593
|
+
export class WasmPdfPageRegion {
|
|
594
|
+
private constructor();
|
|
595
|
+
free(): void;
|
|
596
|
+
[Symbol.dispose](): void;
|
|
597
|
+
/**
|
|
598
|
+
* Extract character-level data from this region.
|
|
599
|
+
*/
|
|
600
|
+
extractChars(): any;
|
|
601
|
+
/**
|
|
602
|
+
* Extract images from this region.
|
|
603
|
+
*/
|
|
604
|
+
extractImages(): any;
|
|
605
|
+
/**
|
|
606
|
+
* Extract straight lines from this region.
|
|
607
|
+
*/
|
|
608
|
+
extractLines(): any;
|
|
609
|
+
/**
|
|
610
|
+
* Extract vector paths from this region.
|
|
611
|
+
*/
|
|
612
|
+
extractPaths(): any;
|
|
613
|
+
/**
|
|
614
|
+
* Extract rectangles from this region.
|
|
615
|
+
*/
|
|
616
|
+
extractRects(): any;
|
|
617
|
+
/**
|
|
618
|
+
* Extract tables from this region.
|
|
619
|
+
*/
|
|
620
|
+
extractTables(): any;
|
|
621
|
+
/**
|
|
622
|
+
* Extract text from this region.
|
|
623
|
+
*/
|
|
624
|
+
extractText(): string;
|
|
625
|
+
/**
|
|
626
|
+
* Extract text lines from this region.
|
|
627
|
+
*/
|
|
628
|
+
extractTextLines(): any;
|
|
629
|
+
/**
|
|
630
|
+
* Extract text using OCR from this region.
|
|
631
|
+
*/
|
|
632
|
+
extractTextOcr(_engine?: WasmOcrEngine | null): string;
|
|
633
|
+
/**
|
|
634
|
+
* Extract words from this region.
|
|
635
|
+
*/
|
|
636
|
+
extractWords(): any;
|
|
637
|
+
}
|