@bentopdf/pymupdf-wasm 0.11.14 → 0.11.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +114 -12
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2318,19 +2318,121 @@ doc.scrub(
|
|
|
2318
2318
|
reset_responses=${scrubResetResponses ? "True" : "False"},
|
|
2319
2319
|
)
|
|
2320
2320
|
|
|
2321
|
-
# 2. Image compression
|
|
2321
|
+
# 2. Image compression (safe per-xref approach to avoid MuPDF buffer overflow
|
|
2322
|
+
# with shared image xrefs across many pages \u2014 bypasses doc.rewrite_images())
|
|
2322
2323
|
if ${compressImages ? "True" : "False"}:
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2324
|
+
import math as _math
|
|
2325
|
+
import sys as _sys
|
|
2326
|
+
|
|
2327
|
+
_dpi_target = ${dpiTarget}
|
|
2328
|
+
_dpi_threshold = ${dpiThreshold}
|
|
2329
|
+
_quality = ${imageQuality}
|
|
2330
|
+
_set_to_gray = ${convertToGray ? "True" : "False"}
|
|
2331
|
+
_process_lossy = ${processLossy ? "True" : "False"}
|
|
2332
|
+
_process_lossless = ${processLossless ? "True" : "False"}
|
|
2333
|
+
_process_bitonal = ${processBitonal ? "True" : "False"}
|
|
2334
|
+
_process_color = ${processColor ? "True" : "False"}
|
|
2335
|
+
_process_gray = ${processGray ? "True" : "False"}
|
|
2336
|
+
|
|
2337
|
+
# Phase 1: Collect unique image xrefs and smask info
|
|
2338
|
+
_xref_info = {}
|
|
2339
|
+
for _page in doc:
|
|
2340
|
+
for _img in _page.get_images(full=True):
|
|
2341
|
+
_xref, _smask = _img[0], _img[1]
|
|
2342
|
+
if _xref > 0:
|
|
2343
|
+
_xref_info.setdefault(_xref, {"smask": _smask, "min_dpi": float("inf")})
|
|
2344
|
+
|
|
2345
|
+
# Phase 2: Calculate effective DPI for each xref across all page usages
|
|
2346
|
+
for _page in doc:
|
|
2347
|
+
for _info in _page.get_image_info(hashes=False, xrefs=True):
|
|
2348
|
+
_xref = _info.get("xref", 0)
|
|
2349
|
+
if _xref not in _xref_info:
|
|
2350
|
+
continue
|
|
2351
|
+
_bbox = _info.get("bbox")
|
|
2352
|
+
_w = _info.get("width", 0)
|
|
2353
|
+
_h = _info.get("height", 0)
|
|
2354
|
+
if _bbox and _w > 0 and _h > 0:
|
|
2355
|
+
_disp_w = abs(_bbox[2] - _bbox[0])
|
|
2356
|
+
_disp_h = abs(_bbox[3] - _bbox[1])
|
|
2357
|
+
if _disp_w > 0 and _disp_h > 0:
|
|
2358
|
+
_dpi = min(_w / _disp_w * 72, _h / _disp_h * 72)
|
|
2359
|
+
if _dpi < _xref_info[_xref]["min_dpi"]:
|
|
2360
|
+
_xref_info[_xref]["min_dpi"] = _dpi
|
|
2361
|
+
|
|
2362
|
+
_effective_threshold = max(_dpi_threshold or 0, (_dpi_target or 0) + 10) if _dpi_target else None
|
|
2363
|
+
|
|
2364
|
+
# Phase 3: Rewrite each image xref individually
|
|
2365
|
+
for _xref, _meta in _xref_info.items():
|
|
2366
|
+
_min_dpi = _meta["min_dpi"]
|
|
2367
|
+
_smask_xref = _meta["smask"]
|
|
2368
|
+
|
|
2369
|
+
_needs_downscale = bool(
|
|
2370
|
+
_dpi_target and _effective_threshold
|
|
2371
|
+
and _min_dpi != float("inf")
|
|
2372
|
+
and _min_dpi > _effective_threshold
|
|
2373
|
+
)
|
|
2374
|
+
if not _needs_downscale and _quality is None and not _set_to_gray:
|
|
2375
|
+
continue
|
|
2376
|
+
|
|
2377
|
+
try:
|
|
2378
|
+
# Check image type filters (match rewrite_images behavior)
|
|
2379
|
+
_xref_obj = doc.xref_object(_xref)
|
|
2380
|
+
_is_lossy = "/DCTDecode" in _xref_obj or "/JPXDecode" in _xref_obj
|
|
2381
|
+
_is_lossless = not _is_lossy
|
|
2382
|
+
if _is_lossy and not _process_lossy:
|
|
2383
|
+
continue
|
|
2384
|
+
if _is_lossless and not _process_lossless:
|
|
2385
|
+
continue
|
|
2386
|
+
|
|
2387
|
+
_pix = pymupdf.Pixmap(doc, _xref)
|
|
2388
|
+
|
|
2389
|
+
# Check colorspace filters
|
|
2390
|
+
_n = _pix.colorspace.n if _pix.colorspace else 0
|
|
2391
|
+
_is_bitonal = (_pix.colorspace and _n == 1 and doc.xref_get_key(_xref, "BitsPerComponent")[1] == "1")
|
|
2392
|
+
_is_gray = (_n == 1 and not _is_bitonal)
|
|
2393
|
+
_is_color = (_n >= 3)
|
|
2394
|
+
if _is_bitonal and not _process_bitonal:
|
|
2395
|
+
_pix = None
|
|
2396
|
+
continue
|
|
2397
|
+
if _is_gray and not _process_gray:
|
|
2398
|
+
_pix = None
|
|
2399
|
+
continue
|
|
2400
|
+
if _is_color and not _process_color:
|
|
2401
|
+
_pix = None
|
|
2402
|
+
continue
|
|
2403
|
+
|
|
2404
|
+
if _set_to_gray and _pix.colorspace and _pix.colorspace.n > 1:
|
|
2405
|
+
_pix = pymupdf.Pixmap(pymupdf.csGRAY, _pix)
|
|
2406
|
+
elif _pix.alpha:
|
|
2407
|
+
_pix = pymupdf.Pixmap(_pix.colorspace or pymupdf.csRGB, _pix)
|
|
2408
|
+
|
|
2409
|
+
if _needs_downscale:
|
|
2410
|
+
_ratio = _min_dpi / _dpi_target
|
|
2411
|
+
_shrink_n = max(0, min(7, int(_math.log2(_ratio))))
|
|
2412
|
+
if _shrink_n > 0:
|
|
2413
|
+
_pix.shrink(_shrink_n)
|
|
2414
|
+
|
|
2415
|
+
_q = _quality if _quality is not None else 85
|
|
2416
|
+
_jpeg_bytes = _pix.tobytes("jpeg", jpg_quality=_q)
|
|
2417
|
+
|
|
2418
|
+
_cs_name = (
|
|
2419
|
+
"/DeviceGray"
|
|
2420
|
+
if _pix.colorspace and _pix.colorspace.n == 1
|
|
2421
|
+
else "/DeviceRGB"
|
|
2422
|
+
)
|
|
2423
|
+
_smask_entry = f"/SMask {_smask_xref} 0 R " if _smask_xref else ""
|
|
2424
|
+
_new_obj = (
|
|
2425
|
+
f"<</Type /XObject /Subtype /Image /BitsPerComponent 8"
|
|
2426
|
+
f" /ColorSpace {_cs_name} /Filter /DCTDecode"
|
|
2427
|
+
f" /Height {_pix.height} /Width {_pix.width}"
|
|
2428
|
+
f" {_smask_entry}>>"
|
|
2429
|
+
)
|
|
2430
|
+
doc.update_object(_xref, _new_obj)
|
|
2431
|
+
doc.update_stream(_xref, _jpeg_bytes, compress=0)
|
|
2432
|
+
_pix = None
|
|
2433
|
+
|
|
2434
|
+
except Exception as _e:
|
|
2435
|
+
_sys.stderr.write(f"[pymupdf-wasm] safe_rewrite_images xref {_xref}: {_e}\\n")
|
|
2334
2436
|
|
|
2335
2437
|
# 3. Font subsetting
|
|
2336
2438
|
if ${subsetFonts ? "True" : "False"}:
|