pdf-oxide-wasm 0.3.13 → 0.3.15

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-oxide-wasm",
3
- "version": "0.3.13",
3
+ "version": "0.3.15",
4
4
  "description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
5
5
  "license": "MIT OR Apache-2.0",
6
6
  "repository": {
package/pdf_oxide.d.ts CHANGED
@@ -1,6 +1,86 @@
1
1
  /* tslint:disable */
2
2
  /* eslint-disable */
3
3
 
4
+ /**
5
+ * Style configuration for header/footer text.
6
+ */
7
+ export class ArtifactStyle {
8
+ free(): void;
9
+ [Symbol.dispose](): void;
10
+ bold(): ArtifactStyle;
11
+ color(r: number, g: number, b: number): ArtifactStyle;
12
+ font(name: string, size: number): ArtifactStyle;
13
+ constructor();
14
+ }
15
+
16
+ /**
17
+ * A header or footer artifact definition.
18
+ */
19
+ export class WasmArtifact {
20
+ free(): void;
21
+ [Symbol.dispose](): void;
22
+ static center(text: string): WasmArtifact;
23
+ static left(text: string): WasmArtifact;
24
+ constructor();
25
+ static right(text: string): WasmArtifact;
26
+ withOffset(offset: number): WasmArtifact;
27
+ withStyle(style: ArtifactStyle): WasmArtifact;
28
+ }
29
+
30
+ /**
31
+ * A footer definition.
32
+ */
33
+ export class WasmFooter {
34
+ free(): void;
35
+ [Symbol.dispose](): void;
36
+ static center(text: string): WasmFooter;
37
+ static left(text: string): WasmFooter;
38
+ constructor();
39
+ static right(text: string): WasmFooter;
40
+ }
41
+
42
+ /**
43
+ * A header definition.
44
+ */
45
+ export class WasmHeader {
46
+ free(): void;
47
+ [Symbol.dispose](): void;
48
+ static center(text: string): WasmHeader;
49
+ static left(text: string): WasmHeader;
50
+ constructor();
51
+ static right(text: string): WasmHeader;
52
+ }
53
+
54
+ /**
55
+ * OCR configuration for WebAssembly.
56
+ */
57
+ export class WasmOcrConfig {
58
+ free(): void;
59
+ [Symbol.dispose](): void;
60
+ constructor();
61
+ }
62
+
63
+ /**
64
+ * OCR engine for WebAssembly.
65
+ */
66
+ export class WasmOcrEngine {
67
+ free(): void;
68
+ [Symbol.dispose](): void;
69
+ constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
70
+ }
71
+
72
+ /**
73
+ * A complete page template with header and footer.
74
+ */
75
+ export class WasmPageTemplate {
76
+ free(): void;
77
+ [Symbol.dispose](): void;
78
+ footer(footer: WasmArtifact): WasmPageTemplate;
79
+ header(header: WasmArtifact): WasmPageTemplate;
80
+ constructor();
81
+ skipFirstPage(): WasmPageTemplate;
82
+ }
83
+
4
84
  /**
5
85
  * Create new PDF documents from Markdown, HTML, or plain text.
6
86
  *
@@ -101,6 +181,28 @@ export class WasmPdfDocument {
101
181
  * @param data - File contents as a Uint8Array
102
182
  */
103
183
  embedFile(name: string, data: Uint8Array): void;
184
+ /**
185
+ * Erase both header and footer content.
186
+ *
187
+ * @param page_index - Zero-based page number
188
+ */
189
+ eraseArtifacts(page_index: number): void;
190
+ /**
191
+ * Erase existing footer content.
192
+ *
193
+ * Identifies existing text in the footer area (bottom 15%) and marks it for erasure.
194
+ *
195
+ * @param page_index - Zero-based page number
196
+ */
197
+ eraseFooter(page_index: number): void;
198
+ /**
199
+ * Erase existing header content.
200
+ *
201
+ * Identifies existing text in the header area (top 15%) and marks it for erasure.
202
+ *
203
+ * @param page_index - Zero-based page number
204
+ */
205
+ eraseHeader(page_index: number): void;
104
206
  /**
105
207
  * Erase (whiteout) a rectangular region on a page.
106
208
  */
@@ -128,8 +230,11 @@ export class WasmPdfDocument {
128
230
  *
129
231
  * Returns an array of objects with: char, bbox {x, y, width, height},
130
232
  * font_name, font_size, font_weight, is_italic, color {r, g, b}, etc.
233
+ *
234
+ * @param page_index - Zero-based page number
235
+ * @param region - Optional [x, y, width, height] to filter by
131
236
  */
132
- extractChars(page_index: number): any;
237
+ extractChars(page_index: number, region?: Float32Array | null): any;
133
238
  /**
134
239
  * Extract image bytes from a page as PNG data.
135
240
  *
@@ -141,28 +246,80 @@ export class WasmPdfDocument {
141
246
  *
142
247
  * Returns an array of objects with: width, height, color_space,
143
248
  * bits_per_component, bbox (if available). Does NOT return raw image bytes.
249
+ *
250
+ * @param page_index - Zero-based page number
251
+ * @param region - Optional [x, y, width, height] to filter by
252
+ */
253
+ extractImages(page_index: number, region?: Float32Array | null): any;
254
+ /**
255
+ * Extract only straight lines from a page (v0.3.14).
256
+ *
257
+ * Identifies paths that form a single straight line segment.
258
+ *
259
+ * @param page_index - Zero-based page number
260
+ * @param region - Optional [x, y, width, height] to filter by
261
+ * @returns Array of path objects
144
262
  */
145
- extractImages(page_index: number): any;
263
+ extractLines(page_index: number, region?: Float32Array | null): any;
146
264
  /**
147
265
  * Extract vector paths (lines, curves, shapes) from a page.
148
266
  *
149
267
  * @param page_index - Zero-based page number
268
+ * @param region - Optional [x, y, width, height] to filter by
150
269
  * @returns Array of path objects with bbox, stroke_color, fill_color, etc.
151
270
  */
152
- extractPaths(page_index: number): any;
271
+ extractPaths(page_index: number, region?: Float32Array | null): any;
272
+ /**
273
+ * Extract only rectangles from a page (v0.3.14).
274
+ *
275
+ * Identifies paths that form axis-aligned rectangles.
276
+ *
277
+ * @param page_index - Zero-based page number
278
+ * @param region - Optional [x, y, width, height] to filter by
279
+ * @returns Array of path objects
280
+ */
281
+ extractRects(page_index: number, region?: Float32Array | null): any;
153
282
  /**
154
283
  * Extract span-level data from a page.
155
284
  *
156
285
  * Returns an array of objects with: text, bbox, font_name, font_size,
157
286
  * font_weight, is_italic, color, etc.
158
287
  */
159
- extractSpans(page_index: number): any;
288
+ extractSpans(page_index: number, region?: Float32Array | null): any;
289
+ /**
290
+ * Extract tables from a page (v0.3.14).
291
+ *
292
+ * @param page_index - Zero-based page number
293
+ * @param region - Optional [x, y, width, height] to filter by
294
+ */
295
+ extractTables(page_index: number, region?: Float32Array | null): any;
160
296
  /**
161
297
  * Extract plain text from a single page.
162
298
  *
163
299
  * @param page_index - Zero-based page number
300
+ * @param region - Optional [x, y, width, height] to filter by
301
+ */
302
+ extractText(page_index: number, region?: Float32Array | null): string;
303
+ /**
304
+ * Extract text lines from a page.
305
+ *
306
+ * Returns an array of objects with: text, bbox, words (array of Word objects).
307
+ */
308
+ extractTextLines(page_index: number, region?: Float32Array | null): any;
309
+ /**
310
+ * Extract text using OCR (optical character recognition).
311
+ *
312
+ * NOTE: OCR is not yet supported in the WebAssembly build due to missing
313
+ * ONNX Runtime support for the web backend in the current implementation.
164
314
  */
165
- extractText(page_index: number): string;
315
+ extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
316
+ /**
317
+ * Extract word-level data from a page.
318
+ *
319
+ * Returns an array of objects with: text, bbox, font_name, font_size,
320
+ * font_weight, is_italic, is_bold.
321
+ */
322
+ extractWords(page_index: number, region?: Float32Array | null): any;
166
323
  /**
167
324
  * Flatten all annotations in the document into page content.
168
325
  */
@@ -271,6 +428,33 @@ export class WasmPdfDocument {
271
428
  * Get the rotation of a page in degrees (0, 90, 180, 270).
272
429
  */
273
430
  pageRotation(page_index: number): number;
431
+ /**
432
+ * Identify and remove both headers and footers.
433
+ *
434
+ * Prioritizes ISO 32000 spec-compliant /Artifact tags, with a heuristic
435
+ * fallback for untagged PDFs.
436
+ *
437
+ * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
438
+ */
439
+ removeArtifacts(threshold: number): number;
440
+ /**
441
+ * Identify and remove footers.
442
+ *
443
+ * Uses spec-compliant /Artifact tags when available (100% accuracy), or
444
+ * falls back to heuristic analysis of the bottom 15% of pages.
445
+ *
446
+ * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
447
+ */
448
+ removeFooters(threshold: number): number;
449
+ /**
450
+ * Identify and remove headers.
451
+ *
452
+ * Uses spec-compliant /Artifact tags when available (100% accuracy), or
453
+ * falls back to heuristic analysis of the top 15% of pages.
454
+ *
455
+ * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
456
+ */
457
+ removeHeaders(threshold: number): number;
274
458
  /**
275
459
  * Reposition an image on a page.
276
460
  */
@@ -289,13 +473,6 @@ export class WasmPdfDocument {
289
473
  rotatePage(page_index: number, degrees: number): void;
290
474
  /**
291
475
  * Save with encryption and return the resulting PDF as bytes.
292
- *
293
- * @param user_password - Password required to open the document
294
- * @param owner_password - Password for full access (defaults to user_password)
295
- * @param allow_print - Allow printing (default: true)
296
- * @param allow_copy - Allow copying text (default: true)
297
- * @param allow_modify - Allow modifying (default: true)
298
- * @param allow_annotate - Allow annotations (default: true)
299
476
  */
300
477
  saveEncryptedToBytes(user_password: string, owner_password?: string | null, allow_print?: boolean | null, allow_copy?: boolean | null, allow_modify?: boolean | null, allow_annotate?: boolean | null): Uint8Array;
301
478
  /**
@@ -395,6 +572,13 @@ export class WasmPdfDocument {
395
572
  * Get the PDF version as [major, minor].
396
573
  */
397
574
  version(): Uint8Array;
575
+ /**
576
+ * Focus extraction on a specific rectangular region of a page (v0.3.14).
577
+ *
578
+ * @param page_index - Zero-based page number
579
+ * @param region - [x, y, width, height] in points
580
+ */
581
+ within(page_index: number, region: Float32Array): WasmPdfPageRegion;
398
582
  /**
399
583
  * Get XMP metadata from the document.
400
584
  *
@@ -402,3 +586,52 @@ export class WasmPdfDocument {
402
586
  */
403
587
  xmpMetadata(): any;
404
588
  }
589
+
590
+ /**
591
+ * A focused view of a PDF page region for scoped extraction (v0.3.14).
592
+ */
593
+ export class WasmPdfPageRegion {
594
+ private constructor();
595
+ free(): void;
596
+ [Symbol.dispose](): void;
597
+ /**
598
+ * Extract character-level data from this region.
599
+ */
600
+ extractChars(): any;
601
+ /**
602
+ * Extract images from this region.
603
+ */
604
+ extractImages(): any;
605
+ /**
606
+ * Extract straight lines from this region.
607
+ */
608
+ extractLines(): any;
609
+ /**
610
+ * Extract vector paths from this region.
611
+ */
612
+ extractPaths(): any;
613
+ /**
614
+ * Extract rectangles from this region.
615
+ */
616
+ extractRects(): any;
617
+ /**
618
+ * Extract tables from this region.
619
+ */
620
+ extractTables(): any;
621
+ /**
622
+ * Extract text from this region.
623
+ */
624
+ extractText(): string;
625
+ /**
626
+ * Extract text lines from this region.
627
+ */
628
+ extractTextLines(): any;
629
+ /**
630
+ * Extract text using OCR from this region.
631
+ */
632
+ extractTextOcr(_engine?: WasmOcrEngine | null): string;
633
+ /**
634
+ * Extract words from this region.
635
+ */
636
+ extractWords(): any;
637
+ }