pdf-oxide-wasm 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # pdf-oxide-wasm
2
+
3
+ High-performance PDF text extraction and manipulation via WebAssembly. Built on the [PDF Oxide](https://github.com/yfedoseev/pdf_oxide) Rust core.
4
+
5
+ ## Quick Start
6
+
7
+ ```javascript
8
+ const { WasmPdfDocument } = require("pdf-oxide-wasm");
9
+ const fs = require("fs");
10
+
11
+ const bytes = new Uint8Array(fs.readFileSync("document.pdf"));
12
+ const doc = new WasmPdfDocument(bytes);
13
+
14
+ console.log(`Pages: ${doc.pageCount()}`);
15
+ console.log(doc.extractText(0));
16
+
17
+ doc.free();
18
+ ```
19
+
20
+ ### ESM
21
+
22
+ ```javascript
23
+ import { WasmPdfDocument } from "pdf-oxide-wasm";
24
+ import fs from "node:fs";
25
+ const bytes = new Uint8Array(await fs.promises.readFile("document.pdf"));
26
+ const doc = new WasmPdfDocument(bytes);
27
+ const text = doc.extractText(0);
28
+ doc.free();
29
+ ```
30
+
31
+ ## Features
32
+
33
+ - Text extraction (plain text, Markdown, HTML)
34
+ - Character-level and span-level extraction with positions
35
+ - PDF creation from Markdown, HTML, text, and images
36
+ - Form field extraction and filling
37
+ - PDF editing (metadata, rotation, cropping, annotations)
38
+ - Encryption (AES-256)
39
+ - Search with regex support
40
+
41
+ ## Documentation
42
+
43
+ Full API reference and examples: [Getting Started (WASM)](https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md)
44
+
45
+ ## License
46
+
47
+ MIT OR Apache-2.0
package/package.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "pdf-oxide-wasm",
3
+ "version": "0.3.10",
4
+ "description": "High-performance PDF text extraction and manipulation via WebAssembly",
5
+ "license": "MIT OR Apache-2.0",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/yfedoseev/pdf_oxide"
9
+ },
10
+ "homepage": "https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md",
11
+ "files": [
12
+ "pdf_oxide_bg.wasm",
13
+ "pdf_oxide.js",
14
+ "pdf_oxide.d.ts",
15
+ "pdf_oxide_bg.wasm.d.ts",
16
+ "README.md"
17
+ ],
18
+ "main": "pdf_oxide.js",
19
+ "types": "pdf_oxide.d.ts",
20
+ "keywords": [
21
+ "pdf",
22
+ "wasm",
23
+ "webassembly",
24
+ "text-extraction",
25
+ "pdf-parser"
26
+ ]
27
+ }
package/pdf_oxide.d.ts ADDED
@@ -0,0 +1,404 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
+ /**
5
+ * Create new PDF documents from Markdown, HTML, plain text, or image bytes.
6
+ *
+ * The constructor is private: instances come from the static `from*` factories below.
+ *
7
+ * ```javascript
8
+ * const pdf = WasmPdf.fromMarkdown("# Hello\n\nWorld");
9
+ * const bytes = pdf.toBytes(); // Uint8Array
10
+ * console.log(`PDF size: ${pdf.size} bytes`);
11
+ * ```
12
+ */
13
+ export class WasmPdf {
14
+ private constructor(); // not directly constructible; use the static from* factories
15
+ free(): void; // NOTE(review): presumably releases the WASM-side memory held by this object - confirm
16
+ [Symbol.dispose](): void; // well-known symbol used by `using` declarations; presumably equivalent to free() - confirm
17
+ /**
18
+ * Create a PDF from HTML content.
19
+ *
20
+ * @param content - HTML string
21
+ * @param title - Optional document title
22
+ * @param author - Optional document author
+ * @returns The created PDF as a WasmPdf
23
+ */
24
+ static fromHtml(content: string, title?: string | null, author?: string | null): WasmPdf;
25
+ /**
26
+ * Create a PDF from image bytes (PNG, JPEG, etc.).
27
+ *
28
+ * @param data - Image file contents as a Uint8Array
+ * @returns The created PDF as a WasmPdf
29
+ */
30
+ static fromImageBytes(data: Uint8Array): WasmPdf;
31
+ /**
32
+ * Create a PDF from Markdown content.
33
+ *
34
+ * @param content - Markdown string
35
+ * @param title - Optional document title
36
+ * @param author - Optional document author
+ * @returns The created PDF as a WasmPdf
37
+ */
38
+ static fromMarkdown(content: string, title?: string | null, author?: string | null): WasmPdf;
39
+ /**
40
+ * Create a PDF from multiple image byte arrays.
41
+ *
42
+ * Each image becomes a separate page. Pass an array of Uint8Arrays.
43
+ *
+ * NOTE: the parameter is declared as `any`, so the documented Uint8Array[] shape
+ * is not checked by the compiler.
+ *
44
+ * @param images_array - Array of Uint8Arrays, each containing image file bytes (PNG/JPEG)
+ * @returns The created PDF as a WasmPdf
45
+ */
46
+ static fromMultipleImageBytes(images_array: any): WasmPdf;
47
+ /**
48
+ * Create a PDF from plain text.
49
+ *
50
+ * @param content - Plain text string
51
+ * @param title - Optional document title
52
+ * @param author - Optional document author
+ * @returns The created PDF as a WasmPdf
53
+ */
54
+ static fromText(content: string, title?: string | null, author?: string | null): WasmPdf;
55
+ /**
56
+ * Get the PDF as a Uint8Array.
+ *
+ * @returns The PDF file contents
57
+ */
58
+ toBytes(): Uint8Array;
59
+ /**
60
+ * Get the size of the PDF in bytes (a read-only property, not a method).
61
+ */
62
+ readonly size: number;
63
+ }
64
+
65
+ /**
66
+ * A PDF document loaded from bytes for use in WebAssembly.
67
+ *
68
+ * Create an instance by passing PDF file bytes to the constructor.
69
+ * Call `.free()` when done to release memory.
+ *
+ * Where a method documents its `page_index` parameter, it is a zero-based page number.
+ * Many methods return `any`; the documented result shapes are not enforced by the compiler.
70
+ */
71
+ export class WasmPdfDocument {
72
+ free(): void; // releases the memory held by this document, per the class description
73
+ [Symbol.dispose](): void; // well-known symbol used by `using` declarations; presumably equivalent to free() - confirm
74
+ /**
75
+ * Apply all redactions in the document.
76
+ */
77
+ applyAllRedactions(): void;
78
+ /**
79
+ * Apply redactions on a page (removes redacted content permanently).
80
+ */
81
+ applyPageRedactions(page_index: number): void;
82
+ /**
83
+ * Authenticate with a password to decrypt an encrypted PDF.
84
+ *
85
+ * @param password - The password string
86
+ * @returns true if authentication succeeded
87
+ */
88
+ authenticate(password: string): boolean;
89
+ /**
90
+ * Clear all pending erase operations for a page.
91
+ */
92
+ clearEraseRegions(page_index: number): void;
93
+ /**
94
+ * Crop margins from all pages.
+ *
+ * NOTE(review): the unit of the four margin values is not stated here - confirm.
95
+ */
96
+ cropMargins(left: number, right: number, top: number, bottom: number): void;
97
+ /**
98
+ * Embed a file into the PDF document.
99
+ *
100
+ * @param name - Display name for the embedded file
101
+ * @param data - File contents as a Uint8Array
102
+ */
103
+ embedFile(name: string, data: Uint8Array): void;
104
+ /**
105
+ * Erase (whiteout) a rectangular region on a page.
+ *
+ * The rectangle is given by its lower-left (llx, lly) and upper-right (urx, ury)
+ * corners, going by the parameter names.
106
+ */
107
+ eraseRegion(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
108
+ /**
109
+ * Erase multiple rectangular regions on a page.
110
+ *
111
+ * @param page_index - Zero-based page number
112
+ * @param rects - Flat array of coordinates [llx1,lly1,urx1,ury1, llx2,lly2,urx2,ury2, ...]
113
+ */
114
+ eraseRegions(page_index: number, rects: Float32Array): void;
115
+ /**
116
+ * Export form field data as FDF or XFDF bytes.
117
+ *
118
+ * @param format - "fdf" or "xfdf" (default: "fdf")
119
+ * @returns Uint8Array containing the exported form data
120
+ */
121
+ exportFormData(format?: string | null): Uint8Array;
122
+ /**
123
+ * Extract plain text from all pages, separated by form feed characters.
124
+ */
125
+ extractAllText(): string;
126
+ /**
127
+ * Extract character-level data from a page.
128
+ *
129
+ * @returns Array of objects with: char, bbox {x, y, width, height},
130
+ * font_name, font_size, font_weight, is_italic, color {r, g, b}, etc.
131
+ */
132
+ extractChars(page_index: number): any;
133
+ /**
134
+ * Extract image bytes from a page as PNG data.
135
+ *
136
+ * @returns Array of objects with: width, height, data (Uint8Array of PNG bytes), format ("png").
137
+ */
138
+ extractImageBytes(page_index: number): any;
139
+ /**
140
+ * Extract image metadata from a page.
141
+ *
142
+ * @returns Array of objects with: width, height, color_space,
143
+ * bits_per_component, bbox (if available). Does NOT return raw image bytes;
+ * `extractImageBytes` is the method documented as returning PNG data.
144
+ */
145
+ extractImages(page_index: number): any;
146
+ /**
147
+ * Extract vector paths (lines, curves, shapes) from a page.
148
+ *
149
+ * @param page_index - Zero-based page number
150
+ * @returns Array of path objects with bbox, stroke_color, fill_color, etc.
151
+ */
152
+ extractPaths(page_index: number): any;
153
+ /**
154
+ * Extract span-level data from a page.
155
+ *
156
+ * @returns Array of objects with: text, bbox, font_name, font_size,
157
+ * font_weight, is_italic, color, etc.
158
+ */
159
+ extractSpans(page_index: number): any;
160
+ /**
161
+ * Extract plain text from a single page.
162
+ *
163
+ * @param page_index - Zero-based page number
+ * @returns The page's text as a string
164
+ */
165
+ extractText(page_index: number): string;
166
+ /**
167
+ * Flatten all annotations in the document into page content.
168
+ */
169
+ flattenAllAnnotations(): void;
170
+ /**
171
+ * Flatten all form fields into page content.
172
+ *
173
+ * After flattening, form field values become static text and are no longer editable.
174
+ */
175
+ flattenForms(): void;
176
+ /**
177
+ * Flatten form fields on a specific page.
178
+ *
179
+ * @param page_index - Zero-based page number
180
+ */
181
+ flattenFormsOnPage(page_index: number): void;
182
+ /**
183
+ * Flatten annotations on a page into the page content.
184
+ */
185
+ flattenPageAnnotations(page_index: number): void;
186
+ /**
187
+ * Get annotations from a page.
188
+ *
189
+ * @param page_index - Zero-based page number
190
+ * @returns Array of annotation objects with fields like subtype, rect, contents, etc.
191
+ */
192
+ getAnnotations(page_index: number): any;
193
+ /**
194
+ * Get the value of a specific form field by name.
195
+ *
196
+ * @param name - Full qualified field name (e.g., "name" or "topmostSubform[0].Page1[0].f1_01[0]")
197
+ * @returns The field value: string for text, boolean for checkbox, null if not found
198
+ */
199
+ getFormFieldValue(name: string): any;
200
+ /**
201
+ * Get all form fields from the document.
202
+ *
203
+ * Returns an array of form field objects, each with:
204
+ * - name: Full qualified field name
205
+ * - field_type: "text", "button", "choice", "signature", or "unknown"
206
+ * - value: string, boolean, array of strings, or null
207
+ * - tooltip: string or null
208
+ * - bounds: [x1, y1, x2, y2] or null
209
+ * - flags: number or null
210
+ * - max_length: number or null
211
+ * - is_readonly: boolean
212
+ * - is_required: boolean
213
+ */
214
+ getFormFields(): any;
215
+ /**
216
+ * Get the document outline (bookmarks / table of contents).
217
+ *
218
+ * @returns Array of outline items or null if no outline exists.
219
+ * Each item has: { title, page (number|null), dest_name (string, optional), children (array) }
220
+ */
221
+ getOutline(): any;
222
+ /**
223
+ * Check if the document has a structure tree (Tagged PDF).
224
+ */
225
+ hasStructureTree(): boolean;
226
+ /**
227
+ * Check if the document contains XFA form data.
228
+ *
229
+ * @returns true if the document has XFA form data
230
+ */
231
+ hasXfa(): boolean;
232
+ /**
233
+ * Merge another PDF (provided as bytes) into this document.
234
+ *
235
+ * @param data - The PDF file contents to merge as a Uint8Array
236
+ * @returns Number of pages merged
237
+ */
238
+ mergeFrom(data: Uint8Array): number;
239
+ /**
240
+ * Load a PDF document from raw bytes.
241
+ *
242
+ * @param data - The PDF file contents as a Uint8Array
243
+ * @throws Error if the PDF is invalid or cannot be parsed
244
+ */
245
+ constructor(data: Uint8Array);
246
+ /**
247
+ * Get the number of pages in the document.
248
+ */
249
+ pageCount(): number;
250
+ /**
251
+ * Get the CropBox of a page as [llx, lly, urx, ury], or null if not set.
+ *
+ * Declared as `any` because the result may be null.
252
+ */
253
+ pageCropBox(page_index: number): any;
254
+ /**
255
+ * Get information about images on a page.
256
+ *
257
+ * @returns Array of {name, bounds: [x, y, width, height], matrix: [a, b, c, d, e, f]}.
+ * NOTE(review): `name` is presumably the identifier expected by repositionImage,
+ * resizeImage and setImageBounds - confirm.
258
+ */
259
+ pageImages(page_index: number): any;
260
+ /**
261
+ * Get page label ranges from the document.
262
+ *
263
+ * @returns Array of {start_page, style, prefix, start_value} objects, or empty array
264
+ */
265
+ pageLabels(): any;
266
+ /**
267
+ * Get the MediaBox of a page as [llx, lly, urx, ury].
268
+ */
269
+ pageMediaBox(page_index: number): Float32Array;
270
+ /**
271
+ * Get the rotation of a page in degrees (0, 90, 180, 270).
272
+ */
273
+ pageRotation(page_index: number): number;
274
+ /**
275
+ * Reposition an image on a page.
276
+ */
277
+ repositionImage(page_index: number, name: string, x: number, y: number): void;
278
+ /**
279
+ * Resize an image on a page.
280
+ */
281
+ resizeImage(page_index: number, name: string, width: number, height: number): void;
282
+ /**
283
+ * Rotate all pages by the given degrees.
284
+ */
285
+ rotateAllPages(degrees: number): void;
286
+ /**
287
+ * Rotate a page by the given degrees (adds to current rotation).
+ *
+ * Contrast with `setPageRotation`, which is documented as setting the rotation.
288
+ */
289
+ rotatePage(page_index: number, degrees: number): void;
290
+ /**
291
+ * Save with encryption and return the resulting PDF as bytes.
292
+ *
293
+ * @param user_password - Password required to open the document
294
+ * @param owner_password - Password for full access (defaults to user_password)
295
+ * @param allow_print - Allow printing (default: true)
296
+ * @param allow_copy - Allow copying text (default: true)
297
+ * @param allow_modify - Allow modifying (default: true)
298
+ * @param allow_annotate - Allow annotations (default: true)
+ * @returns Uint8Array containing the encrypted PDF
299
+ */
300
+ saveEncryptedToBytes(user_password: string, owner_password?: string | null, allow_print?: boolean | null, allow_copy?: boolean | null, allow_modify?: boolean | null, allow_annotate?: boolean | null): Uint8Array;
301
+ /**
302
+ * Save all edits and return the resulting PDF as bytes.
303
+ *
304
+ * @returns Uint8Array containing the modified PDF
305
+ */
306
+ saveToBytes(): Uint8Array;
307
+ /**
308
+ * Search for text across all pages.
309
+ *
310
+ * @param pattern - Regex pattern or literal text to search for
311
+ * @param case_insensitive - Case insensitive search (default: false)
312
+ * @param literal - Treat pattern as literal text, not regex (default: false)
313
+ * @param whole_word - Match whole words only (default: false)
314
+ * @param max_results - Maximum results to return, 0 = unlimited (default: 0)
315
+ *
316
+ * @returns Array of {page, text, bbox, start_index, end_index, span_boxes}.
317
+ */
318
+ search(pattern: string, case_insensitive?: boolean | null, literal?: boolean | null, whole_word?: boolean | null, max_results?: number | null): any;
319
+ /**
320
+ * Search for text on a specific page.
+ *
+ * The parameters after `page_index` have the same names and types as those of `search`;
+ * presumably they share its semantics, defaults and result shape - TODO confirm.
321
+ */
322
+ searchPage(page_index: number, pattern: string, case_insensitive?: boolean | null, literal?: boolean | null, whole_word?: boolean | null, max_results?: number | null): any;
323
+ /**
324
+ * Set the document author.
325
+ */
326
+ setAuthor(author: string): void;
327
+ /**
328
+ * Set the value of a form field.
329
+ *
330
+ * @param name - Full qualified field name
331
+ * @param value - New value: string for text fields, boolean for checkboxes
332
+ */
333
+ setFormFieldValue(name: string, value: any): void;
334
+ /**
335
+ * Set the complete bounds of an image on a page.
336
+ */
337
+ setImageBounds(page_index: number, name: string, x: number, y: number, width: number, height: number): void;
338
+ /**
339
+ * Set the document keywords.
340
+ */
341
+ setKeywords(keywords: string): void;
342
+ /**
343
+ * Set the CropBox of a page.
344
+ */
345
+ setPageCropBox(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
346
+ /**
347
+ * Set the MediaBox of a page.
348
+ */
349
+ setPageMediaBox(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
350
+ /**
351
+ * Set the rotation of a page (0, 90, 180, or 270 degrees).
352
+ */
353
+ setPageRotation(page_index: number, degrees: number): void;
354
+ /**
355
+ * Set the document subject.
356
+ */
357
+ setSubject(subject: string): void;
358
+ /**
359
+ * Set the document title.
360
+ */
361
+ setTitle(title: string): void;
362
+ /**
363
+ * Convert a single page to HTML.
364
+ *
365
+ * @param page_index - Zero-based page number
366
+ * @param preserve_layout - Use CSS positioning to preserve layout (default: false)
367
+ * @param detect_headings - Whether to detect headings (default: true)
+ * @param include_form_fields - Optional flag; presumably controls whether form field
+ * content is included in the output. Default not stated here - TODO confirm.
368
+ */
369
+ toHtml(page_index: number, preserve_layout?: boolean | null, detect_headings?: boolean | null, include_form_fields?: boolean | null): string;
370
+ /**
371
+ * Convert all pages to HTML.
+ *
+ * The optional flags have the same names as those of `toHtml`; presumably they
+ * carry the same meaning and defaults - TODO confirm.
372
+ */
373
+ toHtmlAll(preserve_layout?: boolean | null, detect_headings?: boolean | null, include_form_fields?: boolean | null): string;
374
+ /**
375
+ * Convert a single page to Markdown.
376
+ *
377
+ * @param page_index - Zero-based page number
378
+ * @param detect_headings - Whether to detect headings (default: true)
379
+ * @param include_images - Whether to include images (default: true)
+ * @param include_form_fields - Optional flag; presumably controls whether form field
+ * content is included in the output. Default not stated here - TODO confirm.
380
+ */
381
+ toMarkdown(page_index: number, detect_headings?: boolean | null, include_images?: boolean | null, include_form_fields?: boolean | null): string;
382
+ /**
383
+ * Convert all pages to Markdown.
+ *
+ * The optional flags have the same names as those of `toMarkdown`; presumably they
+ * carry the same meaning and defaults - TODO confirm.
384
+ */
385
+ toMarkdownAll(detect_headings?: boolean | null, include_images?: boolean | null, include_form_fields?: boolean | null): string;
386
+ /**
387
+ * Convert a single page to plain text (with layout preservation options).
+ *
+ * NOTE(review): this signature takes only `page_index`; no option parameters are
+ * exposed here, so the wording about options may be stale - confirm.
388
+ */
389
+ toPlainText(page_index: number): string;
390
+ /**
391
+ * Convert all pages to plain text.
392
+ */
393
+ toPlainTextAll(): string;
394
+ /**
395
+ * Get the PDF version as [major, minor], returned in a Uint8Array.
396
+ */
397
+ version(): Uint8Array;
398
+ /**
399
+ * Get XMP metadata from the document.
400
+ *
401
+ * @returns Object with XMP fields (dc_title, dc_creator, etc.) or null if no XMP
402
+ */
403
+ xmpMetadata(): any;
404
+ }