@bentopdf/pymupdf-wasm 0.11.12 → 0.11.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,42 @@
2
2
 
3
3
  PyMuPDF compiled to WebAssembly for full PDF manipulation in the browser.
4
4
 
5
+ ## Notice
6
+
7
+ This package is a modified version of PyMuPDF, originally developed by Artifex Software, Inc.
8
+
9
+ It has been adapted for WebAssembly (WASM) and dynamic loading.
10
+
11
+ ## Attribution
12
+
13
+ PyMuPDF is copyright © Artifex Software, Inc.
14
+ This package is distributed under the GNU Affero General Public License v3.0.
15
+
16
+ This project is not affiliated with or endorsed by Artifex Software, Inc.
17
+
18
+ ## Source Code Availability
19
+
20
+ This program is licensed under the GNU Affero General Public License v3.0.
21
+ If you interact with this program over a network, you are entitled to
22
+ receive the complete corresponding source code.
23
+
24
+ The source code for this package is available at:
25
+ https://github.com/alam00000/bentopdf-pymupdf-wasm
26
+
27
+ This package includes the complete **Corresponding Source** (build scripts and configuration) in the `build_scripts/` directory.
28
+
29
+ ### Build Instructions
30
+
31
+ To rebuild the WASM binary from source:
32
+ 1. Download the source code from the repository or this package.
33
+ 2. Navigate to the `build_scripts/` directory.
34
+ 3. Follow the instructions in `build_scripts/README.md` (uses Docker).
35
+
36
+ ## License
37
+
38
+ This project is licensed under the [AGPL-3.0-only](LICENSE) license.
39
+ See the Credits & Copyrights section below for details on included components.
40
+
5
41
  ## Features
6
42
 
7
43
  - **Open** PDF, XPS, EPUB, and images
@@ -25,246 +61,19 @@ npm install @bentopdf/pymupdf-wasm
25
61
  ```javascript
26
62
  import { PyMuPDF } from '@bentopdf/pymupdf-wasm';
27
63
 
28
- // Initialize with path to assets
29
- const pymupdf = new PyMuPDF('/assets/pymupdf/');
30
-
31
- // Preload (optional, speeds up first operation)
32
- await pymupdf.load();
33
-
34
- // Open a PDF
35
- const doc = await pymupdf.open(pdfFile);
36
-
37
- // Extract text
38
- const text = doc.getPage(0).getText();
39
-
40
- // Save with modifications
41
- const blob = doc.saveAsBlob();
42
- doc.close();
43
- ```
44
-
45
- ## API Reference
46
-
47
- ### PyMuPDF Class
48
-
49
- ```javascript
50
- const pymupdf = new PyMuPDF(assetPath);
51
-
52
- // Document operations
53
- const doc = await pymupdf.open(file);
54
- const doc = await pymupdf.openUrl('https://example.com/doc.pdf');
55
- const doc = await pymupdf.create(); // Empty PDF
56
-
57
- // Utilities
58
- const merged = await pymupdf.merge([pdf1, pdf2, pdf3]);
59
- const [part1, part2] = await pymupdf.split(pdf, [
60
- { start: 0, end: 4 },
61
- { start: 5, end: 9 }
62
- ]);
63
- const text = await pymupdf.extractText(pdf);
64
- const image = await pymupdf.renderPage(pdf, 0, 150); // 150 DPI
65
-
66
- // PDF to DOCX
67
- const docx = await pymupdf.pdfToDocx(pdf);
68
-
69
- // File to PDF conversion
70
- // Supports: XPS, EPUB, MOBI, FB2, CBZ, SVG, images (JPEG, PNG, BMP, GIF, TIFF, WEBP)
71
- const pdfFromXps = await pymupdf.xpsToPdf(xpsFile);
72
- const pdfFromEpub = await pymupdf.epubToPdf(epubFile);
73
- const pdfFromImage = await pymupdf.imageToPdf(imageFile);
74
- const pdfFromSvg = await pymupdf.svgToPdf(svgFile);
75
- const pdfFromImages = await pymupdf.imagesToPdf([img1, img2, img3]);
76
- const pdfFromAny = await pymupdf.convertToPdf(file, { filetype: 'svg' });
77
-
78
- // PDF to other formats
79
- const images = await pymupdf.pdfToImages(pdf, { format: 'png', dpi: 300 });
80
- const svgs = await pymupdf.pdfToSvg(pdf);
81
- const text = await pymupdf.pdfToText(pdf);
82
- const html = await pymupdf.pdfToHtml(pdf);
83
- const json = await pymupdf.pdfToJson(pdf);
84
- const xml = await pymupdf.pdfToXml(pdf);
85
- ```
86
-
87
- ### Document Operations
88
-
89
- ```javascript
90
- // Properties
91
- doc.pageCount;
92
- doc.metadata; // { title, author, ... }
93
- doc.isEncrypted;
94
-
95
- // Page access
96
- const page = doc.getPage(0);
97
- for (const page of doc.pages()) { ... }
98
-
99
- // Modify
100
- doc.deletePage(5);
101
- doc.insertBlankPage(0);
102
- doc.movePage(3, 0);
103
-
104
- // Merge another PDF
105
- const other = await pymupdf.open(otherPdf);
106
- doc.insertPdf(other);
107
-
108
- // Save
109
- const pdf = doc.save();
110
- const blob = doc.saveAsBlob();
111
- doc.close();
112
- ```
113
-
114
- ### Page Operations
115
-
116
- ```javascript
117
- const page = doc.getPage(0);
118
-
119
- // Properties
120
- page.width;
121
- page.height;
122
- page.rotation;
123
- page.setRotation(90);
124
-
125
- // Text
126
- const text = page.getText();
127
- const rects = page.searchFor("keyword");
128
- page.insertText({ x: 100, y: 100 }, "Hello", { fontsize: 14 });
129
-
130
- // Images
131
- const images = page.getImages();
132
- const img = page.extractImage(images[0].xref);
133
- page.insertImage(rect, imageData);
134
-
135
- // Annotations
136
- page.addHighlight(rect, { r: 1, g: 1, b: 0 });
137
- page.addTextAnnotation({ x: 100, y: 100 }, "Note");
138
- const annots = page.getAnnotations();
139
-
140
- // Render
141
- const png = await page.toImage({ dpi: 300 });
142
- const svg = page.toSvg();
143
-
144
- // Redaction
145
- page.addRedaction(rect);
146
- page.applyRedactions();
147
- ```
148
-
149
- ### Security
150
-
151
- ```javascript
152
- // Encrypt
153
- const pdf = doc.save({
154
- encryption: {
155
- ownerPassword: 'secret',
156
- userPassword: 'user123',
157
- permissions: {
158
- print: true,
159
- copy: false
160
- }
161
- }
64
+ const pymupdf = new PyMuPDF({
65
+ assetPath: '/assets/pymupdf/',
66
+ ghostscriptUrl: 'https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm@0.1.0/' // Optional: for RGB conversion
162
67
  });
163
68
 
164
- // Decrypt
165
- if (doc.needsPass) {
166
- doc.authenticate('password');
167
- }
168
- ```
169
-
170
- ### Forms
171
-
172
- ```javascript
173
- if (doc.isFormPdf) {
174
- const fields = doc.getFormFields();
175
- doc.setFormField('name', 'John Doe');
176
- doc.setFormField('agree', true);
177
- }
178
- ```
179
-
180
- ## Asset Files
181
-
182
- Copy the following files to your assets directory:
183
-
184
- ```
185
- assets/pymupdf/
186
- ├── pyodide.js
187
- ├── pyodide.asm.js
188
- ├── pyodide.asm.wasm
189
- ├── pyodide-lock.json
190
- ├── python_stdlib.zip
191
- ├── pymupdf-*.whl
192
- ├── fonttools-*.whl
193
- ├── lxml-*.whl
194
- ├── numpy-*.whl
195
- ├── opencv_python-*.whl
196
- ├── pdf2docx-*.whl
197
- ├── python_docx-*.whl
198
- └── typing_extensions-*.whl
199
- ```
200
-
201
- ## About
202
-
203
- This package was ported to work with [BentoPDF](https://bentopdf.com), an open-source PDF toolkit. Maintenance and updates will be focused on features required by BentoPDF.
204
-
205
- - Website: [bentopdf.com](https://bentopdf.com)
206
- - GitHub: [https://github.com/alam00000/bentopdf](https://github.com/alam00000/bentopdf)
207
-
208
- ## License
209
-
210
- This project is licensed under the **GNU Affero General Public License v3.0 (AGPL-3.0)**.
211
-
212
- ### Copyright Notices
213
-
214
- #### BentoPDF
215
-
216
- ```
217
- Copyright (C) 2025 BentoPDF Contributors
218
-
219
- This program is free software: you can redistribute it and/or modify
220
- it under the terms of the GNU Affero General Public License as published
221
- by the Free Software Foundation, either version 3 of the License, or
222
- (at your option) any later version.
223
-
224
- This program is distributed in the hope that it will be useful,
225
- but WITHOUT ANY WARRANTY; without even the implied warranty of
226
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
227
- GNU Affero General Public License for more details.
228
-
229
- You should have received a copy of the GNU Affero General Public License
230
- along with this program. If not, see <https://www.gnu.org/licenses/>.
231
- ```
232
-
233
- #### PyMuPDF
234
-
235
- ```
236
- Copyright (C) 2004-2025 Artifex Software, Inc.
237
-
238
- PyMuPDF is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
239
- PyMuPDF is a Python binding for MuPDF
240
-
241
- Source code: https://github.com/pymupdf/PyMuPDF
242
- License: https://github.com/pymupdf/PyMuPDF/blob/master/COPYING
243
- ```
244
-
245
- #### Ghostscript
246
-
247
- ```
248
- Copyright (C) 2001-2025 Artifex Software, Inc.
249
-
250
- Ghostscript is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
251
- Ghostscript is an interpreter for the PostScript language and PDF files.
252
-
253
- Source code: https://github.com/ArtifexSoftware/ghostpdl
254
- License: https://www.ghostscript.com/licensing/
69
+ await pymupdf.load();
255
70
  ```
256
71
 
257
- ### Combined Work Notice
258
-
259
- This package combines code from multiple AGPL-3.0 licensed projects. As a combined work,
260
- it is distributed under the terms of the GNU Affero General Public License v3.0. The source
261
- code for all components is available in accordance with Section 13 of the AGPL-3.0.
262
-
263
- For the complete license text, see the [LICENSE](./LICENSE) file.
72
+ ## Credits & Copyrights
264
73
 
265
- ## Credits
74
+ - **PyMuPDF**: © Artifex Software, Inc. (AGPL-3.0)
75
+ - **Ghostscript**: © Artifex Software, Inc. (AGPL-3.0)
76
+ - **Pyodide**: © Mozilla Foundation / Michael Droettboom (MPL-2.0)
77
+ - **pdf2docx**: © Artifex Software, Inc. (AGPL-3.0)
266
78
 
267
- - [PyMuPDF](https://github.com/pymupdf/PyMuPDF) - Python bindings for MuPDF Artifex Software, Inc.)
268
- - [Ghostscript](https://www.ghostscript.com/) - PostScript and PDF interpreter (© Artifex Software, Inc.)
269
- - [Pyodide](https://pyodide.org/) - Python in the browser
270
- - [pdf2docx](https://github.com/dothinking/pdf2docx) - PDF to DOCX conversion (© Artifex Software, Inc.)
79
+ This package combines these components and is distributed under AGPL-3.0.
@@ -0,0 +1,28 @@
1
+ FROM node:20.11-bookworm-slim AS node-image
2
+ FROM python:3.13.2-slim-bookworm
3
+
4
+ RUN apt-get update \
5
+ && apt-get install -y --no-install-recommends \
6
+ bzip2 ccache f2c g++ gfortran git make \
7
+ patch pkg-config swig unzip wget xz-utils \
8
+ autoconf autotools-dev automake texinfo dejagnu \
9
+ build-essential libltdl-dev \
10
+ gnupg2 libdbus-glib-1-2 sudo sqlite3 \
11
+ ninja-build jq cmake bison \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY --from=node-image /usr/local/bin/node /usr/local/bin/
15
+ COPY --from=node-image /usr/local/lib/node_modules /usr/local/lib/node_modules
16
+ RUN ln -s ../lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
17
+ && ln -s ../lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx
18
+
19
+ RUN wget https://github.com/emscripten-core/emsdk/archive/refs/tags/4.0.9.tar.gz \
20
+ && tar -xzf 4.0.9.tar.gz \
21
+ && cd emsdk-4.0.9 \
22
+ && ./emsdk install 4.0.9 \
23
+ && ./emsdk activate 4.0.9 \
24
+ && cd .. \
25
+ && mv emsdk-4.0.9 /opt/emsdk \
26
+ && rm 4.0.9.tar.gz
27
+
28
+ RUN python -m pip install --upgrade pip wheel pyodide-build==0.30.5
@@ -0,0 +1,31 @@
1
+ # Build Instructions for PyMuPDF WASM
2
+
3
+ These files constitute the "Corresponding Source" build scripts required by the AGPL license.
4
+
5
+ ## Prerequisites
6
+
7
+ - Docker
8
+ - Git
9
+
10
+ ## Build Steps
11
+
12
+ 1. **Build with Docker**:
13
+ The build process is containerized. Use the provided `Dockerfile` to build the WASM artifacts.
14
+
15
+ ```bash
16
+ docker build -t pymupdf-wasm-build .
17
+ ```
18
+
19
+ This builds the container image with the Emscripten (emsdk 4.0.9) and `pyodide-build` toolchain used to compile PyMuPDF to WebAssembly.
20
+
21
+ 2. **Extract Artifacts**:
22
+ After building, you can extract the `pymupdf.wasm` and generated JS files from the container.
23
+
24
+ ```bash
25
+ docker run --rm -v $(pwd)/dist:/output pymupdf-wasm-build cp -r /app/dist/* /output/
26
+ ```
27
+
28
+ ## File Descriptions
29
+
30
+ - `Dockerfile`: Defines the build environment and steps for compiling PyMuPDF to WASM.
31
+ - `scripts/`: Helper scripts used during the build process.
@@ -0,0 +1,48 @@
1
+ import * as path from "path";
2
+ import * as fs from "fs/promises";
3
+ import esbuild from "esbuild";
4
+ import { run } from "runish";
5
+
6
+ const OUT_DIR = path.resolve("./out");
7
+ const RELEASE_DIR = path.join(OUT_DIR, "release");
8
+ const TSC = path.resolve("node_modules/typescript/bin/tsc");
9
+ const { RELEASE } = process.env;
10
+ const TARGET_DIR = RELEASE ? RELEASE_DIR : OUT_DIR;
11
+
12
/**
 * Build entry point: emits TypeScript declarations, then bundles the library
 * and the test suite with esbuild.
 *
 * When the RELEASE environment variable is set, the release directory is
 * wiped and recreated first and the library bundle is minified; the bundle
 * is written to TARGET_DIR, which is derived from RELEASE at module level.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const isRelease = Boolean(RELEASE);

  await fs.mkdir(OUT_DIR, { recursive: true });
  if (isRelease) {
    // Start every release build from an empty directory.
    await fs.rm(RELEASE_DIR, { force: true, recursive: true });
    await fs.mkdir(RELEASE_DIR, { recursive: true });
  }

  // tsc only emits declarations here; the JavaScript comes from esbuild below.
  const typesDir = path.join(OUT_DIR, "types");
  await run(TSC, ["--declaration", "--emitDeclarationOnly", "--outDir", typesDir]);

  // Settings common to the library bundle and the test bundle.
  const sharedOptions = { bundle: true, write: true, format: "esm" };

  await esbuild.build({
    ...sharedOptions,
    entryPoints: ["src/index.ts"],
    outdir: TARGET_DIR,
    target: "es2020",
    minify: isRelease,
  });

  await esbuild.build({
    ...sharedOptions,
    entryPoints: ["tests/index.ts"],
    outdir: path.join(OUT_DIR, "tests"),
  });
}
44
+
45
+ main().catch((err) => {
46
+ console.error(err);
47
+ process.exit(1);
48
+ });
@@ -0,0 +1,31 @@
1
+ import * as path from "path";
2
+ import * as fs from "fs/promises";
3
+ import { run } from "runish";
4
+
5
+ const OUT_DIR = path.resolve("./out");
6
+ const LIB_DIR = path.resolve("./lib");
7
+
8
/**
 * Build the PyMuPDF wheel with `pyodide build` and copy it into the output
 * directory.
 *
 * The build runs inside the PyMuPDF checkout under LIB_DIR. The wheel file
 * name is fixed, so it must match what the pinned PyMuPDF/Pyodide versions
 * produce.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(OUT_DIR, { recursive: true });

  // Defaults for the build environment. process.env is spread after them,
  // so a variable that is already set in the caller's environment wins.
  const buildDefaults = {
    SKIP_EMSCRIPTEN_VERSION_CHECK: "1",
    HAVE_LIBCRYPTO: "no",
    OS: "pyodide",
    PYMUPDF_SETUP_FLAVOUR: "pb",
    PYMUPDF_SETUP_MUPDF_BUILD_TESSERACT: "0",
    PYMUPDF_SETUP_MUPDF_TESSERACT: "0",
  };
  const pymupdfDir = path.join(LIB_DIR, "PyMuPDF");

  await run("pyodide", ["build", "--exports", "whole_archive"], {
    cwd: pymupdfDir,
    env: { ...buildDefaults, ...process.env },
  });

  const wheelName = "pymupdf-1.26.1-cp313-none-pyodide_2025_0_wasm32.whl";
  const builtWheel = path.join(LIB_DIR, "PyMuPDF/dist", wheelName);
  await fs.cp(builtWheel, path.join(OUT_DIR, wheelName));
}
27
+
28
+ main().catch((err) => {
29
+ console.error(err);
30
+ process.exit(1);
31
+ });
@@ -0,0 +1,72 @@
1
+ import * as path from "path";
2
+ import * as fs from "fs/promises";
3
+ import { run } from "runish";
4
+ import { existsSync } from "fs";
5
+
6
+ const LIB_DIR = path.resolve("./lib");
7
+ const OUT_DIR = path.resolve("./out");
8
+
9
/**
 * Fetch everything the build needs: clone the pinned source repositories
 * into LIB_DIR, then download the prebuilt runtime assets and wheels into
 * OUT_DIR.
 *
 * Clones and downloads run one after another. Both helpers skip targets
 * that already exist on disk.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(LIB_DIR, { recursive: true });

  // Repositories pinned to an exact commit. An optional `callback` runs
  // only right after a fresh clone.
  const pinnedRepos = [
    {
      name: "PyMuPDF",
      repo: "https://github.com/pymupdf/PyMuPDF",
      hash: "4a53405a51d29f2f620c0c7659b7c4d404a9f9c0",
    },
  ];
  for (const { name, repo, hash, callback } of pinnedRepos) {
    // gitClone resolves `name` relative to the working directory and changes
    // it while cloning, so reset it before every repository.
    process.chdir(LIB_DIR);
    const freshlyCloned = await gitClone(name, repo, hash);
    if (!freshlyCloned || !callback) continue;
    await callback();
  }

  await fs.mkdir(OUT_DIR, { recursive: true });

  const assetUrls = [
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.js",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/python_stdlib.zip",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.asm.wasm",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide-lock.json",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/pyodide.asm.js",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/lxml-5.4.0-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/typing_extensions-4.12.2-py3-none-any.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/opencv_python-4.11.0.86-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "https://cdn.jsdelivr.net/pyodide/v0.28.0a3/full/fonttools-4.56.0-py3-none-any.whl",
    "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl",
    "https://files.pythonhosted.org/packages/b5/f9/6d567df395c0409baf2b4dd9cd30d1e977c70672fe7ec2a684af1e6aa41c/pdf2docx-0.5.8-py3-none-any.whl",
  ];
  for (const listedUrl of assetUrls) {
    const segments = listedUrl.split("/");
    const fileName = segments[segments.length - 1];
    // pyodide.js is fetched from the .mjs address but still stored under
    // its .js file name.
    const fetchUrl =
      fileName === "pyodide.js"
        ? listedUrl.replace(/.js$/, () => ".mjs")
        : listedUrl;
    await download(fileName, fetchUrl);
  }
}
47
+
48
/**
 * Check out a single commit of `repo` into a directory called `name`.
 *
 * `name` is resolved against the current working directory for both the
 * existence check and `git init`. The function then changes the working
 * directory to `LIB_DIR/name` and leaves it there.
 *
 * @param {string} name - Directory to create for the checkout.
 * @param {string} repo - Address of the remote repository.
 * @param {string} hash - Commit to fetch (history depth 1) and check out.
 * @returns {Promise<true|undefined>} `true` after a fresh clone; `undefined`
 *   when the directory already exists and nothing was done.
 */
async function gitClone(name, repo, hash) {
  const alreadyPresent = existsSync(name);
  if (!alreadyPresent) {
    console.log(`git cloning ${name} - ${repo} - ${hash}`);

    await run("git", ["init", name]);

    // The remaining git commands must run inside the new repository.
    const checkoutDir = path.join(LIB_DIR, name);
    process.chdir(checkoutDir);

    const fetchArgs = ["fetch", "--depth", "1", repo, hash];
    await run("git", fetchArgs);
    await run("git", ["checkout", "FETCH_HEAD"]);
    return true;
  }
}
58
+
59
/**
 * Download `url` into the output directory as `name`, unless a file with
 * that name is already there.
 *
 * @param {string} name - File name to create inside OUT_DIR.
 * @param {string} url - Address to fetch the file contents from.
 * @returns {Promise<void>}
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function download(name, url) {
  const filePath = path.join(OUT_DIR, name);

  // Files already on disk are treated as a cache and never re-fetched.
  if (existsSync(filePath)) return;

  console.log(`downloading ${name} - ${url}`);
  const response = await fetch(url);
  if (!response.ok) {
    // Without this check an HTTP error page would be saved as the asset and,
    // because of the existence check above, never replaced on later runs.
    throw new Error(
      `failed to download ${name} - ${url}: ${response.status} ${response.statusText}`,
    );
  }
  const buf = await response.arrayBuffer();
  await fs.writeFile(filePath, Buffer.from(buf));
}
68
+
69
+ main().catch((err) => {
70
+ console.error(err);
71
+ process.exit(1);
72
+ });
package/dist/index.js CHANGED
@@ -1128,17 +1128,21 @@ elif t != "null":
1128
1128
  };
1129
1129
 
1130
1130
  // src/pymupdf.ts
1131
- import loadGhostscriptWASM from "@bentopdf/gs-wasm";
1132
- async function convertPdfToRgb(pdfData) {
1131
+ async function convertPdfToRgb(pdfData, gsBaseUrl) {
1132
+ if (!gsBaseUrl) {
1133
+ throw new Error("Ghostscript URL not configured. Cannot perform RGB conversion.");
1134
+ }
1133
1135
  console.log("[convertPdfToRgb] Starting Ghostscript RGB conversion...");
1134
1136
  console.log("[convertPdfToRgb] Input size:", pdfData.length);
1137
+ console.log("[convertPdfToRgb] GS base URL:", gsBaseUrl);
1138
+ const normalizedGsUrl = gsBaseUrl.endsWith("/") ? gsBaseUrl : `${gsBaseUrl}/`;
1139
+ const libraryUrl = `${normalizedGsUrl}dist/index.js`;
1140
+ const { loadGhostscriptWASM } = await import(
1141
+ /* @vite-ignore */
1142
+ libraryUrl
1143
+ );
1135
1144
  const gs = await loadGhostscriptWASM({
1136
- locateFile: (path) => {
1137
- if (path.endsWith(".wasm")) {
1138
- return "/ghostscript-wasm/gs.wasm";
1139
- }
1140
- return path;
1141
- },
1145
+ baseUrl: `${normalizedGsUrl}assets/`,
1142
1146
  print: (text) => console.log("[GS RGB]", text),
1143
1147
  printErr: (text) => console.error("[GS RGB Error]", text)
1144
1148
  });
@@ -1257,8 +1261,10 @@ var PyMuPDF = class {
1257
1261
  this.docCounter = 0;
1258
1262
  if (typeof options === "string") {
1259
1263
  this.assetPath = options;
1264
+ this.ghostscriptUrl = "";
1260
1265
  } else {
1261
1266
  this.assetPath = options?.assetPath ?? "./";
1267
+ this.ghostscriptUrl = options?.ghostscriptUrl ?? "";
1262
1268
  }
1263
1269
  if (!this.assetPath.endsWith("/")) {
1264
1270
  this.assetPath += "/";
@@ -1429,7 +1435,7 @@ def deskew_image(img_array, angle):
1429
1435
  let pdfData = new Uint8Array(buf);
1430
1436
  console.log("[pdfToDocx] Converting PDF to RGB colorspace with Ghostscript...");
1431
1437
  try {
1432
- const rgbData = await convertPdfToRgb(pdfData);
1438
+ const rgbData = await convertPdfToRgb(pdfData, this.ghostscriptUrl);
1433
1439
  pdfData = rgbData;
1434
1440
  console.log("[pdfToDocx] RGB conversion complete");
1435
1441
  } catch (e) {
@@ -2312,19 +2318,121 @@ doc.scrub(
2312
2318
  reset_responses=${scrubResetResponses ? "True" : "False"},
2313
2319
  )
2314
2320
 
2315
- # 2. Image compression
2321
+ # 2. Image compression (safe per-xref approach to avoid MuPDF buffer overflow
2322
+ # with shared image xrefs across many pages \u2014 bypasses doc.rewrite_images())
2316
2323
  if ${compressImages ? "True" : "False"}:
2317
- doc.rewrite_images(
2318
- dpi_threshold=${dpiThreshold},
2319
- dpi_target=${dpiTarget},
2320
- quality=${imageQuality},
2321
- lossy=${processLossy ? "True" : "False"},
2322
- lossless=${processLossless ? "True" : "False"},
2323
- bitonal=${processBitonal ? "True" : "False"},
2324
- color=${processColor ? "True" : "False"},
2325
- gray=${processGray ? "True" : "False"},
2326
- set_to_gray=${convertToGray ? "True" : "False"},
2327
- )
2324
+ import math as _math
2325
+ import sys as _sys
2326
+
2327
+ _dpi_target = ${dpiTarget}
2328
+ _dpi_threshold = ${dpiThreshold}
2329
+ _quality = ${imageQuality}
2330
+ _set_to_gray = ${convertToGray ? "True" : "False"}
2331
+ _process_lossy = ${processLossy ? "True" : "False"}
2332
+ _process_lossless = ${processLossless ? "True" : "False"}
2333
+ _process_bitonal = ${processBitonal ? "True" : "False"}
2334
+ _process_color = ${processColor ? "True" : "False"}
2335
+ _process_gray = ${processGray ? "True" : "False"}
2336
+
2337
+ # Phase 1: Collect unique image xrefs and smask info
2338
+ _xref_info = {}
2339
+ for _page in doc:
2340
+ for _img in _page.get_images(full=True):
2341
+ _xref, _smask = _img[0], _img[1]
2342
+ if _xref > 0:
2343
+ _xref_info.setdefault(_xref, {"smask": _smask, "min_dpi": float("inf")})
2344
+
2345
+ # Phase 2: Calculate effective DPI for each xref across all page usages
2346
+ for _page in doc:
2347
+ for _info in _page.get_image_info(hashes=False, xrefs=True):
2348
+ _xref = _info.get("xref", 0)
2349
+ if _xref not in _xref_info:
2350
+ continue
2351
+ _bbox = _info.get("bbox")
2352
+ _w = _info.get("width", 0)
2353
+ _h = _info.get("height", 0)
2354
+ if _bbox and _w > 0 and _h > 0:
2355
+ _disp_w = abs(_bbox[2] - _bbox[0])
2356
+ _disp_h = abs(_bbox[3] - _bbox[1])
2357
+ if _disp_w > 0 and _disp_h > 0:
2358
+ _dpi = min(_w / _disp_w * 72, _h / _disp_h * 72)
2359
+ if _dpi < _xref_info[_xref]["min_dpi"]:
2360
+ _xref_info[_xref]["min_dpi"] = _dpi
2361
+
2362
+ _effective_threshold = max(_dpi_threshold or 0, (_dpi_target or 0) + 10) if _dpi_target else None
2363
+
2364
+ # Phase 3: Rewrite each image xref individually
2365
+ for _xref, _meta in _xref_info.items():
2366
+ _min_dpi = _meta["min_dpi"]
2367
+ _smask_xref = _meta["smask"]
2368
+
2369
+ _needs_downscale = bool(
2370
+ _dpi_target and _effective_threshold
2371
+ and _min_dpi != float("inf")
2372
+ and _min_dpi > _effective_threshold
2373
+ )
2374
+ if not _needs_downscale and _quality is None and not _set_to_gray:
2375
+ continue
2376
+
2377
+ try:
2378
+ # Check image type filters (match rewrite_images behavior)
2379
+ _xref_obj = doc.xref_object(_xref)
2380
+ _is_lossy = "/DCTDecode" in _xref_obj or "/JPXDecode" in _xref_obj
2381
+ _is_lossless = not _is_lossy
2382
+ if _is_lossy and not _process_lossy:
2383
+ continue
2384
+ if _is_lossless and not _process_lossless:
2385
+ continue
2386
+
2387
+ _pix = pymupdf.Pixmap(doc, _xref)
2388
+
2389
+ # Check colorspace filters
2390
+ _n = _pix.colorspace.n if _pix.colorspace else 0
2391
+ _is_bitonal = (_pix.colorspace and _n == 1 and doc.xref_get_key(_xref, "BitsPerComponent")[1] == "1")
2392
+ _is_gray = (_n == 1 and not _is_bitonal)
2393
+ _is_color = (_n >= 3)
2394
+ if _is_bitonal and not _process_bitonal:
2395
+ _pix = None
2396
+ continue
2397
+ if _is_gray and not _process_gray:
2398
+ _pix = None
2399
+ continue
2400
+ if _is_color and not _process_color:
2401
+ _pix = None
2402
+ continue
2403
+
2404
+ if _set_to_gray and _pix.colorspace and _pix.colorspace.n > 1:
2405
+ _pix = pymupdf.Pixmap(pymupdf.csGRAY, _pix)
2406
+ elif _pix.alpha:
2407
+ _pix = pymupdf.Pixmap(_pix.colorspace or pymupdf.csRGB, _pix)
2408
+
2409
+ if _needs_downscale:
2410
+ _ratio = _min_dpi / _dpi_target
2411
+ _shrink_n = max(0, min(7, int(_math.log2(_ratio))))
2412
+ if _shrink_n > 0:
2413
+ _pix.shrink(_shrink_n)
2414
+
2415
+ _q = _quality if _quality is not None else 85
2416
+ _jpeg_bytes = _pix.tobytes("jpeg", jpg_quality=_q)
2417
+
2418
+ _cs_name = (
2419
+ "/DeviceGray"
2420
+ if _pix.colorspace and _pix.colorspace.n == 1
2421
+ else "/DeviceRGB"
2422
+ )
2423
+ _smask_entry = f"/SMask {_smask_xref} 0 R " if _smask_xref else ""
2424
+ _new_obj = (
2425
+ f"<</Type /XObject /Subtype /Image /BitsPerComponent 8"
2426
+ f" /ColorSpace {_cs_name} /Filter /DCTDecode"
2427
+ f" /Height {_pix.height} /Width {_pix.width}"
2428
+ f" {_smask_entry}>>"
2429
+ )
2430
+ doc.update_object(_xref, _new_obj)
2431
+ doc.update_stream(_xref, _jpeg_bytes, compress=0)
2432
+ _pix = None
2433
+
2434
+ except Exception as _e:
2435
+ _sys.stderr.write(f"[pymupdf-wasm] safe_rewrite_images xref {_xref}: {_e}\\n")
2328
2436
 
2329
2437
  # 3. Font subsetting
2330
2438
  if ${subsetFonts ? "True" : "False"}:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bentopdf/pymupdf-wasm",
3
- "version": "0.11.12",
3
+ "version": "0.11.15",
4
4
  "description": "PyMuPDF compiled to WebAssembly - Full PDF manipulation in the browser",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -19,16 +19,14 @@
19
19
  "files": [
20
20
  "dist",
21
21
  "types",
22
- "assets"
22
+ "assets",
23
+ "build_scripts"
23
24
  ],
24
25
  "scripts": {
25
- "build": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --external:./assets/* --external:@bentopdf/gs-wasm",
26
- "dev": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --watch --external:@bentopdf/gs-wasm",
26
+ "build": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --external:./assets/*",
27
+ "dev": "esbuild src/index.ts --bundle --format=esm --outfile=dist/index.js --watch",
27
28
  "typecheck": "tsc --noEmit"
28
29
  },
29
- "peerDependencies": {
30
- "@bentopdf/gs-wasm": "*"
31
- },
32
30
  "repository": {
33
31
  "type": "git",
34
32
  "url": "git+https://github.com/alam00000/bentopdf-pymupdf-wasm.git"
@@ -46,7 +44,10 @@
46
44
  "extract"
47
45
  ],
48
46
  "author": "BentoPDF",
49
- "license": "AGPL-3.0",
47
+ "contributors": [
48
+ "Artifex Software, Inc."
49
+ ],
50
+ "license": "AGPL-3.0-only",
50
51
  "bugs": "https://github.com/alam00000/bentopdf-pymupdf-wasm/issues",
51
52
  "devDependencies": {
52
53
  "esbuild": "^0.21.2",