pdf-oxide-wasm 0.3.10 → 0.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +204 -15
- package/package.json +30 -3
- package/pdf_oxide_bg.wasm +0 -0
package/README.md
CHANGED
|
@@ -1,9 +1,40 @@
|
|
|
1
1
|
# pdf-oxide-wasm
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Fast, zero-dependency PDF toolkit for Node.js, browsers, and serverless edge runtimes.
|
|
4
|
+
Extract text, convert to markdown/HTML, search, fill forms, create and edit PDFs — all from WebAssembly.
|
|
5
|
+
|
|
6
|
+
Built on the [pdf-oxide](https://github.com/yfedoseev/pdf_oxide) Rust core. No native binaries, no system dependencies.
|
|
7
|
+
|
|
8
|
+
[pdf-oxide-wasm on npm](https://www.npmjs.com/package/pdf-oxide-wasm)
|
|
9
|
+
[MIT license](https://github.com/yfedoseev/pdf_oxide/blob/main/LICENSE-MIT)
|
|
10
|
+
|
|
11
|
+
## Why pdf-oxide-wasm
|
|
12
|
+
|
|
13
|
+
| Feature | pdf-oxide-wasm | pdf-parse | pdf-lib | pdfjs-dist |
|
|
14
|
+
|---|---|---|---|---|
|
|
15
|
+
| Text extraction | Yes | Yes | No | Yes |
|
|
16
|
+
| Markdown / HTML output | Yes | No | No | No |
|
|
17
|
+
| PDF creation | Yes | No | Yes | No |
|
|
18
|
+
| Form field read/write | Yes | No | Partial | No |
|
|
19
|
+
| Full-text search (regex) | Yes | No | No | No |
|
|
20
|
+
| Image extraction | Yes | No | No | No |
|
|
21
|
+
| Merge, encrypt, edit | Yes | No | Yes | No |
|
|
22
|
+
| Serverless / edge runtimes | Yes | No | No | No |
|
|
23
|
+
| Zero native dependencies | Yes | Yes | Yes | No |
|
|
24
|
+
| WebAssembly-based | Yes | No | No | No |
|
|
25
|
+
| TypeScript types included | Yes | No | Yes | Yes |
|
|
26
|
+
| License | MIT / Apache-2.0 | MIT | MIT | Apache-2.0 |
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
npm install pdf-oxide-wasm
|
|
32
|
+
```
|
|
4
33
|
|
|
5
34
|
## Quick Start
|
|
6
35
|
|
|
36
|
+
### Extract text (Node.js — CommonJS)
|
|
37
|
+
|
|
7
38
|
```javascript
|
|
8
39
|
const { WasmPdfDocument } = require("pdf-oxide-wasm");
|
|
9
40
|
const fs = require("fs");
|
|
@@ -12,35 +43,193 @@ const bytes = new Uint8Array(fs.readFileSync("document.pdf"));
|
|
|
12
43
|
const doc = new WasmPdfDocument(bytes);
|
|
13
44
|
|
|
14
45
|
console.log(`Pages: ${doc.pageCount()}`);
|
|
15
|
-
console.log(doc.extractText(0));
|
|
46
|
+
console.log(doc.extractText(0)); // plain text from page 0
|
|
47
|
+
console.log(doc.toMarkdown(0)); // markdown from page 0
|
|
48
|
+
console.log(doc.toHtml(0)); // HTML from page 0
|
|
16
49
|
|
|
17
50
|
doc.free();
|
|
18
51
|
```
|
|
19
52
|
|
|
20
|
-
### ESM
|
|
53
|
+
### Extract text (ESM / TypeScript)
|
|
21
54
|
|
|
22
|
-
```
|
|
55
|
+
```typescript
|
|
23
56
|
import { WasmPdfDocument } from "pdf-oxide-wasm";
|
|
57
|
+
import { readFile } from "fs/promises";
|
|
24
58
|
|
|
25
|
-
const bytes = new Uint8Array(await
|
|
59
|
+
const bytes = new Uint8Array(await readFile("document.pdf"));
|
|
26
60
|
const doc = new WasmPdfDocument(bytes);
|
|
27
|
-
|
|
61
|
+
|
|
62
|
+
const text = doc.extractAllText();
|
|
63
|
+
const markdown = doc.toMarkdownAll();
|
|
64
|
+
|
|
28
65
|
doc.free();
|
|
29
66
|
```
|
|
30
67
|
|
|
68
|
+
### Create a PDF from Markdown
|
|
69
|
+
|
|
70
|
+
```javascript
|
|
71
|
+
import { WasmPdf } from "pdf-oxide-wasm";
|
|
72
|
+
|
|
73
|
+
const pdf = WasmPdf.fromMarkdown("# Invoice\n\nTotal: $42.00", "Invoice", "Acme Corp");
|
|
74
|
+
const bytes = pdf.toBytes(); // Uint8Array — write to file or send as response
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Search inside a PDF
|
|
78
|
+
|
|
79
|
+
```javascript
|
|
80
|
+
const results = doc.search("quarterly revenue", true); // case-insensitive
|
|
81
|
+
// Returns: [{ page, text, bbox, start_index, end_index, span_boxes }]
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Read and fill form fields
|
|
85
|
+
|
|
86
|
+
```javascript
|
|
87
|
+
const fields = doc.getFormFields();
|
|
88
|
+
// [{ name, field_type, value, tooltip, bounds, is_readonly, is_required }]
|
|
89
|
+
|
|
90
|
+
doc.setFormFieldValue("name", "Jane Doe");
|
|
91
|
+
doc.setFormFieldValue("agree_terms", true);
|
|
92
|
+
|
|
93
|
+
const filledPdf = doc.saveToBytes(); // Uint8Array
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Encrypt a PDF (AES-256)
|
|
97
|
+
|
|
98
|
+
```javascript
|
|
99
|
+
const encrypted = doc.saveEncryptedToBytes(
|
|
100
|
+
"user-password",
|
|
101
|
+
"owner-password",
|
|
102
|
+
true, // allow print
|
|
103
|
+
false, // deny copy
|
|
104
|
+
);
|
|
105
|
+
```
|
|
106
|
+
|
|
31
107
|
## Features
|
|
32
108
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
-
|
|
38
|
-
|
|
39
|
-
|
|
109
|
+
**Text Extraction** — plain text, Markdown, and HTML output formats. Character-level and span-level extraction with bounding boxes, font names, sizes, weights, colors, and italic flags.
|
|
110
|
+
|
|
111
|
+
**Format Conversion** — convert any page or all pages to Markdown (with heading detection, images, form fields), HTML (with optional CSS layout preservation), or structured plain text.
|
|
112
|
+
|
|
113
|
+
**Full-Text Search** — regex and literal search across all pages or a single page. Case-insensitive, whole-word, and max-results options. Returns match positions with bounding boxes.
|
|
114
|
+
|
|
115
|
+
**Image Extraction** — extract image metadata (dimensions, color space, bits per component, bounding boxes) and image data encoded as PNG bytes.
|
|
116
|
+
|
|
117
|
+
**Form Fields** — read all AcroForm fields (text, button, choice, signature). Get/set individual field values. Export form data as FDF or XFDF. Flatten forms into static content. XFA detection.
|
|
118
|
+
|
|
119
|
+
**PDF Creation** — generate PDFs from Markdown, HTML, plain text, or images (PNG/JPEG). Multi-image support (one page per image). Set title and author metadata.
|
|
120
|
+
|
|
121
|
+
**PDF Editing** — set document metadata (title, author, subject, keywords). Rotate pages, set MediaBox/CropBox, crop margins. Erase (whiteout) regions. Reposition, resize, and set bounds on images. Flatten or apply redactions. Merge PDFs. Embed files.
|
|
122
|
+
|
|
123
|
+
**Encryption** — AES-256 encryption with granular permissions (print, copy, modify, annotate).
|
|
124
|
+
|
|
125
|
+
**Document Structure** — bookmarks/outline (table of contents), annotations (links, comments, form widgets), page labels, XMP metadata, vector paths.
|
|
126
|
+
|
|
127
|
+
## API Reference
|
|
128
|
+
|
|
129
|
+
### `WasmPdfDocument` — read, extract, search, and edit existing PDFs
|
|
130
|
+
|
|
131
|
+
| Method | Description |
|
|
132
|
+
|---|---|
|
|
133
|
+
| `new WasmPdfDocument(data)` | Load PDF from `Uint8Array` |
|
|
134
|
+
| `pageCount()` | Number of pages |
|
|
135
|
+
| `version()` | PDF version as `[major, minor]` |
|
|
136
|
+
| `authenticate(password)` | Decrypt an encrypted PDF |
|
|
137
|
+
| `hasStructureTree()` | Check for Tagged PDF structure |
|
|
138
|
+
| **Text Extraction** | |
|
|
139
|
+
| `extractText(page)` | Plain text from one page |
|
|
140
|
+
| `extractAllText()` | Plain text from all pages |
|
|
141
|
+
| `extractChars(page)` | Character-level data with positions |
|
|
142
|
+
| `extractSpans(page)` | Span-level data with positions |
|
|
143
|
+
| **Format Conversion** | |
|
|
144
|
+
| `toMarkdown(page, headings?, images?, forms?)` | Markdown from one page |
|
|
145
|
+
| `toMarkdownAll(headings?, images?, forms?)` | Markdown from all pages |
|
|
146
|
+
| `toHtml(page, layout?, headings?, forms?)` | HTML from one page |
|
|
147
|
+
| `toHtmlAll(layout?, headings?, forms?)` | HTML from all pages |
|
|
148
|
+
| `toPlainText(page)` | Plain text with layout |
|
|
149
|
+
| `toPlainTextAll()` | Plain text from all pages |
|
|
150
|
+
| **Search** | |
|
|
151
|
+
| `search(pattern, caseInsensitive?, literal?, wholeWord?, max?)` | Search all pages |
|
|
152
|
+
| `searchPage(page, pattern, ...)` | Search one page |
|
|
153
|
+
| **Images** | |
|
|
154
|
+
| `extractImages(page)` | Image metadata (dimensions, color space, bbox) |
|
|
155
|
+
| `extractImageBytes(page)` | Image data as PNG `Uint8Array` |
|
|
156
|
+
| `pageImages(page)` | Image placement info (bounds, matrix) |
|
|
157
|
+
| **Forms** | |
|
|
158
|
+
| `getFormFields()` | All form fields with types and values |
|
|
159
|
+
| `getFormFieldValue(name)` | Get a single field value |
|
|
160
|
+
| `setFormFieldValue(name, value)` | Set a field value |
|
|
161
|
+
| `exportFormData(format?)` | Export as FDF or XFDF |
|
|
162
|
+
| `hasXfa()` | Check for XFA form data |
|
|
163
|
+
| `flattenForms()` | Flatten all form fields |
|
|
164
|
+
| `flattenFormsOnPage(page)` | Flatten fields on one page |
|
|
165
|
+
| **Document Structure** | |
|
|
166
|
+
| `getOutline()` | Bookmarks / table of contents |
|
|
167
|
+
| `getAnnotations(page)` | Page annotations |
|
|
168
|
+
| `extractPaths(page)` | Vector paths (lines, curves) |
|
|
169
|
+
| `pageLabels()` | Page label ranges |
|
|
170
|
+
| `xmpMetadata()` | XMP metadata |
|
|
171
|
+
| **Editing** | |
|
|
172
|
+
| `setTitle(title)` | Set document title |
|
|
173
|
+
| `setAuthor(author)` | Set document author |
|
|
174
|
+
| `setSubject(subject)` | Set document subject |
|
|
175
|
+
| `setKeywords(keywords)` | Set document keywords |
|
|
176
|
+
| `setPageRotation(page, degrees)` | Set page rotation |
|
|
177
|
+
| `rotatePage(page, degrees)` | Rotate page by degrees |
|
|
178
|
+
| `rotateAllPages(degrees)` | Rotate all pages |
|
|
179
|
+
| `pageMediaBox(page)` | Get MediaBox |
|
|
180
|
+
| `setPageMediaBox(page, llx, lly, urx, ury)` | Set MediaBox |
|
|
181
|
+
| `pageCropBox(page)` | Get CropBox |
|
|
182
|
+
| `setPageCropBox(page, llx, lly, urx, ury)` | Set CropBox |
|
|
183
|
+
| `cropMargins(left, right, top, bottom)` | Crop all page margins |
|
|
184
|
+
| `eraseRegion(page, llx, lly, urx, ury)` | Whiteout a region |
|
|
185
|
+
| `eraseRegions(page, rects)` | Whiteout multiple regions |
|
|
186
|
+
| `repositionImage(page, name, x, y)` | Move an image |
|
|
187
|
+
| `resizeImage(page, name, w, h)` | Resize an image |
|
|
188
|
+
| `setImageBounds(page, name, x, y, w, h)` | Set image bounds |
|
|
189
|
+
| `flattenPageAnnotations(page)` | Flatten page annotations |
|
|
190
|
+
| `flattenAllAnnotations()` | Flatten all annotations |
|
|
191
|
+
| `applyPageRedactions(page)` | Apply redactions on page |
|
|
192
|
+
| `applyAllRedactions()` | Apply all redactions |
|
|
193
|
+
| `mergeFrom(data)` | Merge another PDF |
|
|
194
|
+
| `embedFile(name, data)` | Embed a file |
|
|
195
|
+
| **Save** | |
|
|
196
|
+
| `saveToBytes()` | Save edits → `Uint8Array` |
|
|
197
|
+
| `saveEncryptedToBytes(userPwd, ownerPwd?, ...)` | Save with AES-256 encryption |
|
|
198
|
+
| `free()` | Release WASM memory |
|
|
199
|
+
|
|
200
|
+
### `WasmPdf` — create new PDFs
|
|
201
|
+
|
|
202
|
+
| Method | Description |
|
|
203
|
+
|---|---|
|
|
204
|
+
| `fromMarkdown(content, title?, author?)` | Create PDF from Markdown |
|
|
205
|
+
| `fromHtml(content, title?, author?)` | Create PDF from HTML |
|
|
206
|
+
| `fromText(content, title?, author?)` | Create PDF from plain text |
|
|
207
|
+
| `fromImageBytes(data)` | Create PDF from image (PNG/JPEG) |
|
|
208
|
+
| `fromMultipleImageBytes(images)` | Create multi-page PDF from images |
|
|
209
|
+
| `toBytes()` | Get PDF as `Uint8Array` |
|
|
210
|
+
| `size` | PDF size in bytes |
|
|
211
|
+
|
|
212
|
+
## Platform Compatibility
|
|
213
|
+
|
|
214
|
+
Works without modification in:
|
|
215
|
+
|
|
216
|
+
- **Node.js** 18+ (CommonJS and ESM)
|
|
217
|
+
- **Browsers** — Chrome, Firefox, Safari, Edge
|
|
218
|
+
- **Cloudflare Workers** — runs in V8 isolates with WASM support
|
|
219
|
+
- **Deno** — native WASM support
|
|
220
|
+
- **Bun** — native WASM support
|
|
221
|
+
|
|
222
|
+
No native binaries, no `node-gyp`, no `postinstall` scripts. Install and use immediately.
|
|
223
|
+
|
|
224
|
+
## Performance
|
|
225
|
+
|
|
226
|
+
pdf-oxide-wasm is built on a Rust PDF parser compiled to WebAssembly. The Rust core ([pdf_oxide](https://crates.io/crates/pdf_oxide)) achieves 0.8ms mean extraction time across 3,830 test PDFs with a 100% success rate — the fastest PDF text extraction library available in Rust. The WASM compilation preserves near-native performance without garbage collection overhead or child process spawning.
|
|
227
|
+
|
|
228
|
+
## Full Documentation
|
|
40
229
|
|
|
41
|
-
|
|
230
|
+
Complete guide with examples: [Getting Started with WASM](https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md)
|
|
42
231
|
|
|
43
|
-
|
|
232
|
+
Rust library documentation: [docs.rs/pdf_oxide](https://docs.rs/pdf_oxide)
|
|
44
233
|
|
|
45
234
|
## License
|
|
46
235
|
|
package/package.json
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-oxide-wasm",
|
|
3
|
-
"version": "0.3.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.3.12",
|
|
4
|
+
"description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
|
|
5
5
|
"license": "MIT OR Apache-2.0",
|
|
6
6
|
"repository": {
|
|
7
7
|
"type": "git",
|
|
8
8
|
"url": "https://github.com/yfedoseev/pdf_oxide"
|
|
9
9
|
},
|
|
10
10
|
"homepage": "https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md",
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/yfedoseev/pdf_oxide/issues"
|
|
13
|
+
},
|
|
14
|
+
"engines": {
|
|
15
|
+
"node": ">=18"
|
|
16
|
+
},
|
|
11
17
|
"files": [
|
|
12
18
|
"pdf_oxide_bg.wasm",
|
|
13
19
|
"pdf_oxide.js",
|
|
@@ -17,11 +23,32 @@
|
|
|
17
23
|
],
|
|
18
24
|
"main": "pdf_oxide.js",
|
|
19
25
|
"types": "pdf_oxide.d.ts",
|
|
26
|
+
"sideEffects": false,
|
|
20
27
|
"keywords": [
|
|
21
28
|
"pdf",
|
|
22
29
|
"wasm",
|
|
23
30
|
"webassembly",
|
|
31
|
+
"pdf-parser",
|
|
32
|
+
"pdf-extract",
|
|
24
33
|
"text-extraction",
|
|
25
|
-
"pdf-
|
|
34
|
+
"pdf-to-text",
|
|
35
|
+
"pdf-to-markdown",
|
|
36
|
+
"pdf-reader",
|
|
37
|
+
"pdf-search",
|
|
38
|
+
"pdf-form",
|
|
39
|
+
"pdf-creation",
|
|
40
|
+
"markdown-to-pdf",
|
|
41
|
+
"html-to-pdf",
|
|
42
|
+
"rust",
|
|
43
|
+
"rust-wasm",
|
|
44
|
+
"serverless",
|
|
45
|
+
"cloudflare-workers",
|
|
46
|
+
"node",
|
|
47
|
+
"browser",
|
|
48
|
+
"zero-dependency",
|
|
49
|
+
"pdf-metadata",
|
|
50
|
+
"pdf-merge",
|
|
51
|
+
"pdf-encrypt",
|
|
52
|
+
"typescript"
|
|
26
53
|
]
|
|
27
54
|
}
|
package/pdf_oxide_bg.wasm
CHANGED
|
Binary file
|