@bentopdf/pymupdf-wasm 0.11.12 → 0.11.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -237
- package/build_scripts/Dockerfile +28 -0
- package/build_scripts/README.md +31 -0
- package/build_scripts/scripts/build.js +48 -0
- package/build_scripts/scripts/buildDeps.js +31 -0
- package/build_scripts/scripts/download.js +72 -0
- package/dist/index.js +129 -21
- package/package.json +9 -8
package/README.md
CHANGED
|
@@ -2,6 +2,42 @@
|
|
|
2
2
|
|
|
3
3
|
PyMuPDF compiled to WebAssembly for full PDF manipulation in the browser.
|
|
4
4
|
|
|
5
|
+
## Notice
|
|
6
|
+
|
|
7
|
+
This package is a modified version of PyMuPDF, originally developed by Artifex Software, Inc.
|
|
8
|
+
|
|
9
|
+
It has been adapted for WebAssembly (WASM) and dynamic loading.
|
|
10
|
+
|
|
11
|
+
## Attribution
|
|
12
|
+
|
|
13
|
+
PyMuPDF is copyright © Artifex Software, Inc.
|
|
14
|
+
This package is distributed under the GNU Affero General Public License v3.0.
|
|
15
|
+
|
|
16
|
+
This project is not affiliated with or endorsed by Artifex Software, Inc.
|
|
17
|
+
|
|
18
|
+
## Source Code Availability
|
|
19
|
+
|
|
20
|
+
This program is licensed under the GNU Affero General Public License v3.0.
|
|
21
|
+
If you interact with this program over a network, you are entitled to
|
|
22
|
+
receive the complete corresponding source code.
|
|
23
|
+
|
|
24
|
+
The source code for this package is available at:
|
|
25
|
+
https://github.com/alam00000/bentopdf-pymupdf-wasm
|
|
26
|
+
|
|
27
|
+
This package includes the complete **Corresponding Source** (build scripts and configuration) in the `build_scripts/` directory.
|
|
28
|
+
|
|
29
|
+
### Build Instructions
|
|
30
|
+
|
|
31
|
+
To rebuild the WASM binary from source:
|
|
32
|
+
1. Download the source code from the repository or this package.
|
|
33
|
+
2. Navigate to the `build_scripts/` directory.
|
|
34
|
+
3. Follow the instructions in `build_scripts/README.md` (uses Docker).
|
|
35
|
+
|
|
36
|
+
## License
|
|
37
|
+
|
|
38
|
+
This project is licensed under the [AGPL-3.0-only](LICENSE) license.
|
|
39
|
+
See the Credits & Copyrights section below for details on included components.
|
|
40
|
+
|
|
5
41
|
## Features
|
|
6
42
|
|
|
7
43
|
- **Open** PDF, XPS, EPUB, and images
|
|
@@ -25,246 +61,19 @@ npm install @bentopdf/pymupdf-wasm
|
|
|
25
61
|
```javascript
|
|
26
62
|
import { PyMuPDF } from '@bentopdf/pymupdf-wasm';
|
|
27
63
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
// Preload (optional, speeds up first operation)
|
|
32
|
-
await pymupdf.load();
|
|
33
|
-
|
|
34
|
-
// Open a PDF
|
|
35
|
-
const doc = await pymupdf.open(pdfFile);
|
|
36
|
-
|
|
37
|
-
// Extract text
|
|
38
|
-
const text = doc.getPage(0).getText();
|
|
39
|
-
|
|
40
|
-
// Save with modifications
|
|
41
|
-
const blob = doc.saveAsBlob();
|
|
42
|
-
doc.close();
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
## API Reference
|
|
46
|
-
|
|
47
|
-
### PyMuPDF Class
|
|
48
|
-
|
|
49
|
-
```javascript
|
|
50
|
-
const pymupdf = new PyMuPDF(assetPath);
|
|
51
|
-
|
|
52
|
-
// Document operations
|
|
53
|
-
const doc = await pymupdf.open(file);
|
|
54
|
-
const doc = await pymupdf.openUrl('https://example.com/doc.pdf');
|
|
55
|
-
const doc = await pymupdf.create(); // Empty PDF
|
|
56
|
-
|
|
57
|
-
// Utilities
|
|
58
|
-
const merged = await pymupdf.merge([pdf1, pdf2, pdf3]);
|
|
59
|
-
const [part1, part2] = await pymupdf.split(pdf, [
|
|
60
|
-
{ start: 0, end: 4 },
|
|
61
|
-
{ start: 5, end: 9 }
|
|
62
|
-
]);
|
|
63
|
-
const text = await pymupdf.extractText(pdf);
|
|
64
|
-
const image = await pymupdf.renderPage(pdf, 0, 150); // 150 DPI
|
|
65
|
-
|
|
66
|
-
// PDF to DOCX
|
|
67
|
-
const docx = await pymupdf.pdfToDocx(pdf);
|
|
68
|
-
|
|
69
|
-
// File to PDF conversion
|
|
70
|
-
// Supports: XPS, EPUB, MOBI, FB2, CBZ, SVG, images (JPEG, PNG, BMP, GIF, TIFF, WEBP)
|
|
71
|
-
const pdfFromXps = await pymupdf.xpsToPdf(xpsFile);
|
|
72
|
-
const pdfFromEpub = await pymupdf.epubToPdf(epubFile);
|
|
73
|
-
const pdfFromImage = await pymupdf.imageToPdf(imageFile);
|
|
74
|
-
const pdfFromSvg = await pymupdf.svgToPdf(svgFile);
|
|
75
|
-
const pdfFromImages = await pymupdf.imagesToPdf([img1, img2, img3]);
|
|
76
|
-
const pdfFromAny = await pymupdf.convertToPdf(file, { filetype: 'svg' });
|
|
77
|
-
|
|
78
|
-
// PDF to other formats
|
|
79
|
-
const images = await pymupdf.pdfToImages(pdf, { format: 'png', dpi: 300 });
|
|
80
|
-
const svgs = await pymupdf.pdfToSvg(pdf);
|
|
81
|
-
const text = await pymupdf.pdfToText(pdf);
|
|
82
|
-
const html = await pymupdf.pdfToHtml(pdf);
|
|
83
|
-
const json = await pymupdf.pdfToJson(pdf);
|
|
84
|
-
const xml = await pymupdf.pdfToXml(pdf);
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
### Document Operations
|
|
88
|
-
|
|
89
|
-
```javascript
|
|
90
|
-
// Properties
|
|
91
|
-
doc.pageCount;
|
|
92
|
-
doc.metadata; // { title, author, ... }
|
|
93
|
-
doc.isEncrypted;
|
|
94
|
-
|
|
95
|
-
// Page access
|
|
96
|
-
const page = doc.getPage(0);
|
|
97
|
-
for (const page of doc.pages()) { ... }
|
|
98
|
-
|
|
99
|
-
// Modify
|
|
100
|
-
doc.deletePage(5);
|
|
101
|
-
doc.insertBlankPage(0);
|
|
102
|
-
doc.movePage(3, 0);
|
|
103
|
-
|
|
104
|
-
// Merge another PDF
|
|
105
|
-
const other = await pymupdf.open(otherPdf);
|
|
106
|
-
doc.insertPdf(other);
|
|
107
|
-
|
|
108
|
-
// Save
|
|
109
|
-
const pdf = doc.save();
|
|
110
|
-
const blob = doc.saveAsBlob();
|
|
111
|
-
doc.close();
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
### Page Operations
|
|
115
|
-
|
|
116
|
-
```javascript
|
|
117
|
-
const page = doc.getPage(0);
|
|
118
|
-
|
|
119
|
-
// Properties
|
|
120
|
-
page.width;
|
|
121
|
-
page.height;
|
|
122
|
-
page.rotation;
|
|
123
|
-
page.setRotation(90);
|
|
124
|
-
|
|
125
|
-
// Text
|
|
126
|
-
const text = page.getText();
|
|
127
|
-
const rects = page.searchFor("keyword");
|
|
128
|
-
page.insertText({ x: 100, y: 100 }, "Hello", { fontsize: 14 });
|
|
129
|
-
|
|
130
|
-
// Images
|
|
131
|
-
const images = page.getImages();
|
|
132
|
-
const img = page.extractImage(images[0].xref);
|
|
133
|
-
page.insertImage(rect, imageData);
|
|
134
|
-
|
|
135
|
-
// Annotations
|
|
136
|
-
page.addHighlight(rect, { r: 1, g: 1, b: 0 });
|
|
137
|
-
page.addTextAnnotation({ x: 100, y: 100 }, "Note");
|
|
138
|
-
const annots = page.getAnnotations();
|
|
139
|
-
|
|
140
|
-
// Render
|
|
141
|
-
const png = await page.toImage({ dpi: 300 });
|
|
142
|
-
const svg = page.toSvg();
|
|
143
|
-
|
|
144
|
-
// Redaction
|
|
145
|
-
page.addRedaction(rect);
|
|
146
|
-
page.applyRedactions();
|
|
147
|
-
```
|
|
148
|
-
|
|
149
|
-
### Security
|
|
150
|
-
|
|
151
|
-
```javascript
|
|
152
|
-
// Encrypt
|
|
153
|
-
const pdf = doc.save({
|
|
154
|
-
encryption: {
|
|
155
|
-
ownerPassword: 'secret',
|
|
156
|
-
userPassword: 'user123',
|
|
157
|
-
permissions: {
|
|
158
|
-
print: true,
|
|
159
|
-
copy: false
|
|
160
|
-
}
|
|
161
|
-
}
|
|
64
|
+
const pymupdf = new PyMuPDF({
|
|
65
|
+
assetPath: '/assets/pymupdf/',
|
|
66
|
+
ghostscriptUrl: 'https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm@0.1.0/' // Optional: for RGB conversion
|
|
162
67
|
});
|
|
163
68
|
|
|
164
|
-
|
|
165
|
-
if (doc.needsPass) {
|
|
166
|
-
doc.authenticate('password');
|
|
167
|
-
}
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
### Forms
|
|
171
|
-
|
|
172
|
-
```javascript
|
|
173
|
-
if (doc.isFormPdf) {
|
|
174
|
-
const fields = doc.getFormFields();
|
|
175
|
-
doc.setFormField('name', 'John Doe');
|
|
176
|
-
doc.setFormField('agree', true);
|
|
177
|
-
}
|
|
178
|
-
```
|
|
179
|
-
|
|
180
|
-
## Asset Files
|
|
181
|
-
|
|
182
|
-
Copy the following files to your assets directory:
|
|
183
|
-
|
|
184
|
-
```
|
|
185
|
-
assets/pymupdf/
|
|
186
|
-
├── pyodide.js
|
|
187
|
-
├── pyodide.asm.js
|
|
188
|
-
├── pyodide.asm.wasm
|
|
189
|
-
├── pyodide-lock.json
|
|
190
|
-
├── python_stdlib.zip
|
|
191
|
-
├── pymupdf-*.whl
|
|
192
|
-
├── fonttools-*.whl
|
|
193
|
-
├── lxml-*.whl
|
|
194
|
-
├── numpy-*.whl
|
|
195
|
-
├── opencv_python-*.whl
|
|
196
|
-
├── pdf2docx-*.whl
|
|
197
|
-
├── python_docx-*.whl
|
|
198
|
-
└── typing_extensions-*.whl
|
|
199
|
-
```
|
|
200
|
-
|
|
201
|
-
## About
|
|
202
|
-
|
|
203
|
-
This package was ported to work with [BentoPDF](https://bentopdf.com), an open-source PDF toolkit. Maintenance and updates will be focused on features required by BentoPDF.
|
|
204
|
-
|
|
205
|
-
- Website: [bentopdf.com](https://bentopdf.com)
|
|
206
|
-
- GitHub: [https://github.com/alam00000/bentopdf](https://github.com/alam00000/bentopdf)
|
|
207
|
-
|
|
208
|
-
## License
|
|
209
|
-
|
|
210
|
-
This project is licensed under the **GNU Affero General Public License v3.0 (AGPL-3.0)**.
|
|
211
|
-
|
|
212
|
-
### Copyright Notices
|
|
213
|
-
|
|
214
|
-
#### BentoPDF
|
|
215
|
-
|
|
216
|
-
```
|
|
217
|
-
Copyright (C) 2025 BentoPDF Contributors
|
|
218
|
-
|
|
219
|
-
This program is free software: you can redistribute it and/or modify
|
|
220
|
-
it under the terms of the GNU Affero General Public License as published
|
|
221
|
-
by the Free Software Foundation, either version 3 of the License, or
|
|
222
|
-
(at your option) any later version.
|
|
223
|
-
|
|
224
|
-
This program is distributed in the hope that it will be useful,
|
|
225
|
-
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
226
|
-
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
227
|
-
GNU Affero General Public License for more details.
|
|
228
|
-
|
|
229
|
-
You should have received a copy of the GNU Affero General Public License
|
|
230
|
-
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
231
|
-
```
|
|
232
|
-
|
|
233
|
-
#### PyMuPDF
|
|
234
|
-
|
|
235
|
-
```
|
|
236
|
-
Copyright (C) 2004-2025 Artifex Software, Inc.
|
|
237
|
-
|
|
238
|
-
PyMuPDF is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
|
|
239
|
-
PyMuPDF is a Python binding for MuPDF
|
|
240
|
-
|
|
241
|
-
Source code: https://github.com/pymupdf/PyMuPDF
|
|
242
|
-
License: https://github.com/pymupdf/PyMuPDF/blob/master/COPYING
|
|
243
|
-
```
|
|
244
|
-
|
|
245
|
-
#### Ghostscript
|
|
246
|
-
|
|
247
|
-
```
|
|
248
|
-
Copyright (C) 2001-2025 Artifex Software, Inc.
|
|
249
|
-
|
|
250
|
-
Ghostscript is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
|
|
251
|
-
Ghostscript is an interpreter for the PostScript language and PDF files.
|
|
252
|
-
|
|
253
|
-
Source code: https://github.com/ArtifexSoftware/ghostpdl
|
|
254
|
-
License: https://www.ghostscript.com/licensing/
|
|
69
|
+
await pymupdf.load();
|
|
255
70
|
```
|
|
256
71
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
This package combines code from multiple AGPL-3.0 licensed projects. As a combined work,
|
|
260
|
-
it is distributed under the terms of the GNU Affero General Public License v3.0. The source
|
|
261
|
-
code for all components is available in accordance with Section 13 of the AGPL-3.0.
|
|
262
|
-
|
|
263
|
-
For the complete license text, see the [LICENSE](./LICENSE) file.
|
|
72
|
+
## Credits & Copyrights
|
|
264
73
|
|
|
265
|
-
|
|
74
|
+
- **PyMuPDF**: © Artifex Software, Inc. (AGPL-3.0)
|
|
75
|
+
- **Ghostscript**: © Artifex Software, Inc. (AGPL-3.0)
|
|
76
|
+
- **Pyodide**: © Mozilla Foundation / Michael Droettboom (MPL-2.0)
|
|
77
|
+
- **pdf2docx**: © Artifex Software, Inc. (AGPL-3.0)
|
|
266
78
|
|
|
267
|
-
|
|
268
|
-
- [Ghostscript](https://www.ghostscript.com/) - PostScript and PDF interpreter (© Artifex Software, Inc.)
|
|
269
|
-
- [Pyodide](https://pyodide.org/) - Python in the browser
|
|
270
|
-
- [pdf2docx](https://github.com/dothinking/pdf2docx) - PDF to DOCX conversion (© Artifex Software, Inc.)
|
|
79
|
+
This package combines these components and is distributed under AGPL-3.0.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
FROM node:20.11-bookworm-slim AS node-image
|
|
2
|
+
FROM python:3.13.2-slim-bookworm
|
|
3
|
+
|
|
4
|
+
RUN apt-get update \
|
|
5
|
+
&& apt-get install -y --no-install-recommends \
|
|
6
|
+
bzip2 ccache f2c g++ gfortran git make \
|
|
7
|
+
patch pkg-config swig unzip wget xz-utils \
|
|
8
|
+
autoconf autotools-dev automake texinfo dejagnu \
|
|
9
|
+
build-essential libltdl-dev \
|
|
10
|
+
gnupg2 libdbus-glib-1-2 sudo sqlite3 \
|
|
11
|
+
ninja-build jq cmake bison \
|
|
12
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
13
|
+
|
|
14
|
+
COPY --from=node-image /usr/local/bin/node /usr/local/bin/
|
|
15
|
+
COPY --from=node-image /usr/local/lib/node_modules /usr/local/lib/node_modules
|
|
16
|
+
RUN ln -s ../lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
|
|
17
|
+
&& ln -s ../lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx
|
|
18
|
+
|
|
19
|
+
RUN wget https://github.com/emscripten-core/emsdk/archive/refs/tags/4.0.9.tar.gz \
|
|
20
|
+
&& tar -xzf 4.0.9.tar.gz \
|
|
21
|
+
&& cd emsdk-4.0.9 \
|
|
22
|
+
&& ./emsdk install 4.0.9 \
|
|
23
|
+
&& ./emsdk activate 4.0.9 \
|
|
24
|
+
&& cd .. \
|
|
25
|
+
&& mv emsdk-4.0.9 /opt/emsdk \
|
|
26
|
+
&& rm 4.0.9.tar.gz
|
|
27
|
+
|
|
28
|
+
RUN python -m pip install --upgrade pip wheel pyodide-build==0.30.5
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Build Instructions for PyMuPDF WASM
|
|
2
|
+
|
|
3
|
+
These files constitute the "Corresponding Source" build scripts required by the AGPL license.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- Docker
|
|
8
|
+
- Git
|
|
9
|
+
|
|
10
|
+
## Build Steps
|
|
11
|
+
|
|
12
|
+
1. **Build with Docker**:
|
|
13
|
+
The build process is containerized. Use the provided `Dockerfile` to build the WASM artifacts.
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
docker build -t pymupdf-wasm-build .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
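This builds an image containing the build toolchain (Emscripten 4.0.9, Node.js and pyodide-build). PyMuPDF itself is compiled to WebAssembly by the helper scripts in `scripts/` (`download.js`, then `buildDeps.js`), run inside that container.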
|
|
20
|
+
|
|
21
|
+
2. **Extract Artifacts**:
|
|
22
|
+
After building, you can extract the `pymupdf.wasm` and generated JS files from the container.
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
docker run --rm -v $(pwd)/dist:/output pymupdf-wasm-build cp -r /app/dist/* /output/
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## File Descriptions
|
|
29
|
+
|
|
30
|
+
- `Dockerfile`: Defines the build environment and steps for compiling PyMuPDF to WASM.
|
|
31
|
+
- `scripts/`: Helper scripts used during the build process.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import * as path from "path";
|
|
2
|
+
import * as fs from "fs/promises";
|
|
3
|
+
import esbuild from "esbuild";
|
|
4
|
+
import { run } from "runish";
|
|
5
|
+
|
|
6
|
+
const OUT_DIR = path.resolve("./out");
|
|
7
|
+
const RELEASE_DIR = path.join(OUT_DIR, "release");
|
|
8
|
+
const TSC = path.resolve("node_modules/typescript/bin/tsc");
|
|
9
|
+
const { RELEASE } = process.env;
|
|
10
|
+
const TARGET_DIR = RELEASE ? RELEASE_DIR : OUT_DIR;
|
|
11
|
+
|
|
12
|
+
/**
 * Builds the TypeScript wrapper.
 *
 * Steps, in order:
 *   1. Ensure the output directory exists; when RELEASE is set, wipe and
 *      recreate the release directory.
 *   2. Run tsc to emit declaration files only into `<OUT_DIR>/types`.
 *   3. Bundle `src/index.ts` as ESM (es2020) into TARGET_DIR, minified only
 *      when RELEASE is set.
 *   4. Bundle `tests/index.ts` as ESM into `<OUT_DIR>/tests`.
 *
 * @returns {Promise<void>} resolves once every step has finished.
 */
async function main() {
  const ensureDir = { recursive: true };

  await fs.mkdir(OUT_DIR, ensureDir);
  if (RELEASE) {
    // Start every release build from an empty directory.
    await fs.rm(RELEASE_DIR, { force: true, recursive: true });
    await fs.mkdir(RELEASE_DIR, ensureDir);
  }

  const typesDir = path.join(OUT_DIR, "types");
  const tscArgs = ["--declaration", "--emitDeclarationOnly", "--outDir", typesDir];
  await run(TSC, tscArgs);

  // Options common to the library bundle and the test bundle.
  const bundleDefaults = { bundle: true, write: true, format: "esm" };

  const libraryBundle = {
    ...bundleDefaults,
    entryPoints: ["src/index.ts"],
    outdir: TARGET_DIR,
    target: "es2020",
    minify: Boolean(RELEASE),
  };
  await esbuild.build(libraryBundle);

  const testBundle = {
    ...bundleDefaults,
    entryPoints: ["tests/index.ts"],
    outdir: path.join(OUT_DIR, "tests"),
  };
  await esbuild.build(testBundle);
}
|
|
44
|
+
|
|
45
|
+
main().catch((err) => {
|
|
46
|
+
console.error(err);
|
|
47
|
+
process.exit(1);
|
|
48
|
+
});
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import * as path from "path";
|
|
2
|
+
import * as fs from "fs/promises";
|
|
3
|
+
import { run } from "runish";
|
|
4
|
+
|
|
5
|
+
const OUT_DIR = path.resolve("./out");
|
|
6
|
+
const LIB_DIR = path.resolve("./lib");
|
|
7
|
+
|
|
8
|
+
/**
 * Builds the PyMuPDF wheel with `pyodide build` and copies it to OUT_DIR.
 *
 * The build runs inside `<LIB_DIR>/PyMuPDF`. The wheel is then copied from
 * that checkout's `dist` directory into OUT_DIR under the same file name.
 *
 * @returns {Promise<void>} resolves once the wheel has been copied.
 */
async function main() {
  await fs.mkdir(OUT_DIR, { recursive: true });

  const envDefaults = {
    SKIP_EMSCRIPTEN_VERSION_CHECK: "1",
    HAVE_LIBCRYPTO: "no",
    OS: "pyodide",
    PYMUPDF_SETUP_FLAVOUR: "pb",
    PYMUPDF_SETUP_MUPDF_BUILD_TESSERACT: "0",
    PYMUPDF_SETUP_MUPDF_TESSERACT: "0",
  };
  // process.env is spread last, so a variable already set in the calling
  // environment takes precedence over the default above.
  const buildEnv = { ...envDefaults, ...process.env };
  const pymupdfDir = path.join(LIB_DIR, "PyMuPDF");

  await run("pyodide", ["build", "--exports", "whole_archive"], {
    cwd: pymupdfDir,
    env: buildEnv,
  });

  const whl = "pymupdf-1.26.1-cp313-none-pyodide_2025_0_wasm32.whl";
  const builtWheel = path.join(LIB_DIR, "PyMuPDF/dist", whl);
  const publishedWheel = path.join(OUT_DIR, whl);
  await fs.cp(builtWheel, publishedWheel);
}
|
|
27
|
+
|
|
28
|
+
main().catch((err) => {
|
|
29
|
+
console.error(err);
|
|
30
|
+
process.exit(1);
|
|
31
|
+
});
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import * as path from "path";
|
|
2
|
+
import * as fs from "fs/promises";
|
|
3
|
+
import { run } from "runish";
|
|
4
|
+
import { existsSync } from "fs";
|
|
5
|
+
|
|
6
|
+
const LIB_DIR = path.resolve("./lib");
|
|
7
|
+
const OUT_DIR = path.resolve("./out");
|
|
8
|
+
|
|
9
|
+
/**
 * Fetches everything the build consumes: the pinned PyMuPDF checkout into
 * LIB_DIR, and the Pyodide runtime files and Python wheels into OUT_DIR.
 *
 * @returns {Promise<void>} resolves once every clone and download has finished.
 */
async function main() {
  await fs.mkdir(LIB_DIR, { recursive: true });

  // Each entry: [directory name, repository URL, pinned commit, optional hook
  // awaited only after a fresh clone].
  const sources = [
    [
      "PyMuPDF",
      "https://github.com/pymupdf/PyMuPDF",
      "4a53405a51d29f2f620c0c7659b7c4d404a9f9c0",
    ],
  ];
  for (const source of sources) {
    const [dirName, repoUrl, commit, afterClone] = source;
    // gitClone works relative to the current directory, so reset it each time.
    process.chdir(LIB_DIR);
    const freshlyCloned = await gitClone(dirName, repoUrl, commit);
    if (freshlyCloned && afterClone) await afterClone();
  }

  await fs.mkdir(OUT_DIR, { recursive: true });

  const assetUrls = [
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.js",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/python_stdlib.zip",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.asm.wasm",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide-lock.json",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.asm.js",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/lxml-5.4.0-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/typing_extensions-4.12.2-py3-none-any.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/opencv_python-4.11.0.86-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/fonttools-4.56.0-py3-none-any.whl",
    "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl",
    "https://files.pythonhosted.org/packages/b5/f9/6d567df395c0409baf2b4dd9cd30d1e977c70672fe7ec2a684af1e6aa41c/pdf2docx-0.5.8-py3-none-any.whl",
  ];
  for (const assetUrl of assetUrls) {
    const fileName = assetUrl.split("/").at(-1);
    // pyodide.js is fetched from the .mjs URL but still saved as pyodide.js,
    // because the file name is taken before the URL is rewritten.
    const fetchUrl =
      fileName === "pyodide.js"
        ? assetUrl.replace(/.js$/, () => ".mjs")
        : assetUrl;
    await download(fileName, fetchUrl);
  }
}
|
|
47
|
+
|
|
48
|
+
/**
 * Checks out one pinned commit of `repo` into `<LIB_DIR>/<name>` using a
 * depth-1 fetch.
 *
 * The existence check is relative to the current working directory, so the
 * caller is expected to have changed into LIB_DIR first. On success the
 * working directory is left inside the new checkout.
 *
 * If any git step fails, the partially created directory is removed before
 * the error is rethrown. Otherwise the existence check would make every later
 * run skip the clone and build from an empty repository.
 *
 * @param {string} name - directory name to create under LIB_DIR.
 * @param {string} repo - repository URL to fetch from.
 * @param {string} hash - commit to fetch and check out.
 * @returns {Promise<boolean>} true when a fresh checkout was created, false
 *   when `name` already existed and nothing was done.
 * @throws rethrows the error of the failing git step after cleaning up.
 */
async function gitClone(name, repo, hash) {
  if (existsSync(name)) return false;

  console.log(`git cloning ${name} - ${repo} - ${hash}`);
  const repoDir = path.join(LIB_DIR, name);
  try {
    await run("git", ["init", name]);
    process.chdir(repoDir);
    await run("git", ["fetch", "--depth", "1", repo, hash]);
    await run("git", ["checkout", "FETCH_HEAD"]);
  } catch (err) {
    // Leave the directory being deleted before removing it.
    process.chdir(LIB_DIR);
    await fs.rm(repoDir, { force: true, recursive: true });
    throw err;
  }
  return true;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Downloads `url` into `<OUT_DIR>/<name>` unless that file already exists.
 *
 * `fetch` resolves normally on HTTP error statuses, so the response is
 * checked explicitly. Without the check, a 404/500 error body would be saved
 * and then treated as a valid cached asset on every later run. The payload is
 * written to a `.part` file and renamed into place for the same reason: an
 * interrupted write must never look like a finished download.
 *
 * @param {string} name - file name to store the asset under in OUT_DIR.
 * @param {string} url - URL to fetch the asset from.
 * @returns {Promise<void>} resolves once the file is in place (or was
 *   already present).
 * @throws {Error} when the server answers with a non-2xx status; network and
 *   file-system errors propagate unchanged.
 */
async function download(name, url) {
  const filePath = path.join(OUT_DIR, name);

  if (existsSync(filePath)) return;

  console.log(`downloading ${name} - ${url}`);
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `failed to download ${name} - ${url}: HTTP ${response.status} ${response.statusText}`,
    );
  }
  const buf = await response.arrayBuffer();

  const partPath = `${filePath}.part`;
  await fs.writeFile(partPath, Buffer.from(buf));
  await fs.rename(partPath, filePath);
}
|
|
68
|
+
|
|
69
|
+
main().catch((err) => {
|
|
70
|
+
console.error(err);
|
|
71
|
+
process.exit(1);
|
|
72
|
+
});
|
package/dist/index.js
CHANGED
|
@@ -1128,17 +1128,21 @@ elif t != "null":
|
|
|
1128
1128
|
};
|
|
1129
1129
|
|
|
1130
1130
|
// src/pymupdf.ts
|
|
1131
|
-
|
|
1132
|
-
|
|
1131
|
+
async function convertPdfToRgb(pdfData, gsBaseUrl) {
|
|
1132
|
+
if (!gsBaseUrl) {
|
|
1133
|
+
throw new Error("Ghostscript URL not configured. Cannot perform RGB conversion.");
|
|
1134
|
+
}
|
|
1133
1135
|
console.log("[convertPdfToRgb] Starting Ghostscript RGB conversion...");
|
|
1134
1136
|
console.log("[convertPdfToRgb] Input size:", pdfData.length);
|
|
1137
|
+
console.log("[convertPdfToRgb] GS base URL:", gsBaseUrl);
|
|
1138
|
+
const normalizedGsUrl = gsBaseUrl.endsWith("/") ? gsBaseUrl : `${gsBaseUrl}/`;
|
|
1139
|
+
const libraryUrl = `${normalizedGsUrl}dist/index.js`;
|
|
1140
|
+
const { loadGhostscriptWASM } = await import(
|
|
1141
|
+
/* @vite-ignore */
|
|
1142
|
+
libraryUrl
|
|
1143
|
+
);
|
|
1135
1144
|
const gs = await loadGhostscriptWASM({
|
|
1136
|
-
|
|
1137
|
-
if (path.endsWith(".wasm")) {
|
|
1138
|
-
return "/ghostscript-wasm/gs.wasm";
|
|
1139
|
-
}
|
|
1140
|
-
return path;
|
|
1141
|
-
},
|
|
1145
|
+
baseUrl: `${normalizedGsUrl}assets/`,
|
|
1142
1146
|
print: (text) => console.log("[GS RGB]", text),
|
|
1143
1147
|
printErr: (text) => console.error("[GS RGB Error]", text)
|
|
1144
1148
|
});
|
|
@@ -1257,8 +1261,10 @@ var PyMuPDF = class {
|
|
|
1257
1261
|
this.docCounter = 0;
|
|
1258
1262
|
if (typeof options === "string") {
|
|
1259
1263
|
this.assetPath = options;
|
|
1264
|
+
this.ghostscriptUrl = "";
|
|
1260
1265
|
} else {
|
|
1261
1266
|
this.assetPath = options?.assetPath ?? "./";
|
|
1267
|
+
this.ghostscriptUrl = options?.ghostscriptUrl ?? "";
|
|
1262
1268
|
}
|
|
1263
1269
|
if (!this.assetPath.endsWith("/")) {
|
|
1264
1270
|
this.assetPath += "/";
|
|
@@ -1429,7 +1435,7 @@ def deskew_image(img_array, angle):
|
|
|
1429
1435
|
let pdfData = new Uint8Array(buf);
|
|
1430
1436
|
console.log("[pdfToDocx] Converting PDF to RGB colorspace with Ghostscript...");
|
|
1431
1437
|
try {
|
|
1432
|
-
const rgbData = await convertPdfToRgb(pdfData);
|
|
1438
|
+
const rgbData = await convertPdfToRgb(pdfData, this.ghostscriptUrl);
|
|
1433
1439
|
pdfData = rgbData;
|
|
1434
1440
|
console.log("[pdfToDocx] RGB conversion complete");
|
|
1435
1441
|
} catch (e) {
|
|
@@ -2312,19 +2318,121 @@ doc.scrub(
|
|
|
2312
2318
|
reset_responses=${scrubResetResponses ? "True" : "False"},
|
|
2313
2319
|
)
|
|
2314
2320
|
|
|
2315
|
-
# 2. Image compression
|
|
2321
|
+
# 2. Image compression (safe per-xref approach to avoid MuPDF buffer overflow
|
|
2322
|
+
# with shared image xrefs across many pages \u2014 bypasses doc.rewrite_images())
|
|
2316
2323
|
if ${compressImages ? "True" : "False"}:
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2324
|
+
import math as _math
|
|
2325
|
+
import sys as _sys
|
|
2326
|
+
|
|
2327
|
+
_dpi_target = ${dpiTarget}
|
|
2328
|
+
_dpi_threshold = ${dpiThreshold}
|
|
2329
|
+
_quality = ${imageQuality}
|
|
2330
|
+
_set_to_gray = ${convertToGray ? "True" : "False"}
|
|
2331
|
+
_process_lossy = ${processLossy ? "True" : "False"}
|
|
2332
|
+
_process_lossless = ${processLossless ? "True" : "False"}
|
|
2333
|
+
_process_bitonal = ${processBitonal ? "True" : "False"}
|
|
2334
|
+
_process_color = ${processColor ? "True" : "False"}
|
|
2335
|
+
_process_gray = ${processGray ? "True" : "False"}
|
|
2336
|
+
|
|
2337
|
+
# Phase 1: Collect unique image xrefs and smask info
|
|
2338
|
+
_xref_info = {}
|
|
2339
|
+
for _page in doc:
|
|
2340
|
+
for _img in _page.get_images(full=True):
|
|
2341
|
+
_xref, _smask = _img[0], _img[1]
|
|
2342
|
+
if _xref > 0:
|
|
2343
|
+
_xref_info.setdefault(_xref, {"smask": _smask, "min_dpi": float("inf")})
|
|
2344
|
+
|
|
2345
|
+
# Phase 2: Calculate effective DPI for each xref across all page usages
|
|
2346
|
+
for _page in doc:
|
|
2347
|
+
for _info in _page.get_image_info(hashes=False, xrefs=True):
|
|
2348
|
+
_xref = _info.get("xref", 0)
|
|
2349
|
+
if _xref not in _xref_info:
|
|
2350
|
+
continue
|
|
2351
|
+
_bbox = _info.get("bbox")
|
|
2352
|
+
_w = _info.get("width", 0)
|
|
2353
|
+
_h = _info.get("height", 0)
|
|
2354
|
+
if _bbox and _w > 0 and _h > 0:
|
|
2355
|
+
_disp_w = abs(_bbox[2] - _bbox[0])
|
|
2356
|
+
_disp_h = abs(_bbox[3] - _bbox[1])
|
|
2357
|
+
if _disp_w > 0 and _disp_h > 0:
|
|
2358
|
+
_dpi = min(_w / _disp_w * 72, _h / _disp_h * 72)
|
|
2359
|
+
if _dpi < _xref_info[_xref]["min_dpi"]:
|
|
2360
|
+
_xref_info[_xref]["min_dpi"] = _dpi
|
|
2361
|
+
|
|
2362
|
+
_effective_threshold = max(_dpi_threshold or 0, (_dpi_target or 0) + 10) if _dpi_target else None
|
|
2363
|
+
|
|
2364
|
+
# Phase 3: Rewrite each image xref individually
|
|
2365
|
+
for _xref, _meta in _xref_info.items():
|
|
2366
|
+
_min_dpi = _meta["min_dpi"]
|
|
2367
|
+
_smask_xref = _meta["smask"]
|
|
2368
|
+
|
|
2369
|
+
_needs_downscale = bool(
|
|
2370
|
+
_dpi_target and _effective_threshold
|
|
2371
|
+
and _min_dpi != float("inf")
|
|
2372
|
+
and _min_dpi > _effective_threshold
|
|
2373
|
+
)
|
|
2374
|
+
if not _needs_downscale and _quality is None and not _set_to_gray:
|
|
2375
|
+
continue
|
|
2376
|
+
|
|
2377
|
+
try:
|
|
2378
|
+
# Check image type filters (match rewrite_images behavior)
|
|
2379
|
+
_xref_obj = doc.xref_object(_xref)
|
|
2380
|
+
_is_lossy = "/DCTDecode" in _xref_obj or "/JPXDecode" in _xref_obj
|
|
2381
|
+
_is_lossless = not _is_lossy
|
|
2382
|
+
if _is_lossy and not _process_lossy:
|
|
2383
|
+
continue
|
|
2384
|
+
if _is_lossless and not _process_lossless:
|
|
2385
|
+
continue
|
|
2386
|
+
|
|
2387
|
+
_pix = pymupdf.Pixmap(doc, _xref)
|
|
2388
|
+
|
|
2389
|
+
# Check colorspace filters
|
|
2390
|
+
_n = _pix.colorspace.n if _pix.colorspace else 0
|
|
2391
|
+
_is_bitonal = (_pix.colorspace and _n == 1 and doc.xref_get_key(_xref, "BitsPerComponent")[1] == "1")
|
|
2392
|
+
_is_gray = (_n == 1 and not _is_bitonal)
|
|
2393
|
+
_is_color = (_n >= 3)
|
|
2394
|
+
if _is_bitonal and not _process_bitonal:
|
|
2395
|
+
_pix = None
|
|
2396
|
+
continue
|
|
2397
|
+
if _is_gray and not _process_gray:
|
|
2398
|
+
_pix = None
|
|
2399
|
+
continue
|
|
2400
|
+
if _is_color and not _process_color:
|
|
2401
|
+
_pix = None
|
|
2402
|
+
continue
|
|
2403
|
+
|
|
2404
|
+
if _set_to_gray and _pix.colorspace and _pix.colorspace.n > 1:
|
|
2405
|
+
_pix = pymupdf.Pixmap(pymupdf.csGRAY, _pix)
|
|
2406
|
+
elif _pix.alpha:
|
|
2407
|
+
_pix = pymupdf.Pixmap(_pix.colorspace or pymupdf.csRGB, _pix)
|
|
2408
|
+
|
|
2409
|
+
if _needs_downscale:
|
|
2410
|
+
_ratio = _min_dpi / _dpi_target
|
|
2411
|
+
_shrink_n = max(0, min(7, int(_math.log2(_ratio))))
|
|
2412
|
+
if _shrink_n > 0:
|
|
2413
|
+
_pix.shrink(_shrink_n)
|
|
2414
|
+
|
|
2415
|
+
_q = _quality if _quality is not None else 85
|
|
2416
|
+
_jpeg_bytes = _pix.tobytes("jpeg", jpg_quality=_q)
|
|
2417
|
+
|
|
2418
|
+
_cs_name = (
|
|
2419
|
+
"/DeviceGray"
|
|
2420
|
+
if _pix.colorspace and _pix.colorspace.n == 1
|
|
2421
|
+
else "/DeviceRGB"
|
|
2422
|
+
)
|
|
2423
|
+
_smask_entry = f"/SMask {_smask_xref} 0 R " if _smask_xref else ""
|
|
2424
|
+
_new_obj = (
|
|
2425
|
+
f"<</Type /XObject /Subtype /Image /BitsPerComponent 8"
|
|
2426
|
+
f" /ColorSpace {_cs_name} /Filter /DCTDecode"
|
|
2427
|
+
f" /Height {_pix.height} /Width {_pix.width}"
|
|
2428
|
+
f" {_smask_entry}>>"
|
|
2429
|
+
)
|
|
2430
|
+
doc.update_object(_xref, _new_obj)
|
|
2431
|
+
doc.update_stream(_xref, _jpeg_bytes, compress=0)
|
|
2432
|
+
_pix = None
|
|
2433
|
+
|
|
2434
|
+
except Exception as _e:
|
|
2435
|
+
_sys.stderr.write(f"[pymupdf-wasm] safe_rewrite_images xref {_xref}: {_e}\\n")
|
|
2328
2436
|
|
|
2329
2437
|
# 3. Font subsetting
|
|
2330
2438
|
if ${subsetFonts ? "True" : "False"}:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@bentopdf/pymupdf-wasm",
|
|
3
|
-
"version": "0.11.12",
|
|
3
|
+
"version": "0.11.15",
|
|
4
4
|
"description": "PyMuPDF compiled to WebAssembly - Full PDF manipulation in the browser",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -19,16 +19,14 @@
|
|
|
19
19
|
"files": [
|
|
20
20
|
"dist",
|
|
21
21
|
"types",
|
|
22
|
-
"assets"
|
|
22
|
+
"assets",
|
|
23
|
+
"build_scripts"
|
|
23
24
|
],
|
|
24
25
|
"scripts": {
|
|
25
|
-
"build": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --external:./assets/*
|
|
26
|
-
"dev": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --watch
|
|
26
|
+
"build": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --external:./assets/*",
|
|
27
|
+
"dev": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --watch",
|
|
27
28
|
"typecheck": "tsc --noEmit"
|
|
28
29
|
},
|
|
29
|
-
"peerDependencies": {
|
|
30
|
-
"@bentopdf/gs-wasm": "*"
|
|
31
|
-
},
|
|
32
30
|
"repository": {
|
|
33
31
|
"type": "git",
|
|
34
32
|
"url": "git+https://github.com/alam00000/bentopdf-pymupdf-wasm.git"
|
|
@@ -46,7 +44,10 @@
|
|
|
46
44
|
"extract"
|
|
47
45
|
],
|
|
48
46
|
"author": "BentoPDF",
|
|
49
|
-
"
|
|
47
|
+
"contributors": [
|
|
48
|
+
"Artifex Software, Inc."
|
|
49
|
+
],
|
|
50
|
+
"license": "AGPL-3.0-only",
|
|
50
51
|
"bugs": "https://github.com/alam00000/bentopdf-pymupdf-wasm/issues",
|
|
51
52
|
"devDependencies": {
|
|
52
53
|
"esbuild": "^0.21.2",
|