docx-shrinker 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docx_shrinker-0.1.0/.github/workflows/python-publish.yml +55 -0
- docx_shrinker-0.1.0/.gitignore +47 -0
- docx_shrinker-0.1.0/.python-version +1 -0
- docx_shrinker-0.1.0/PKG-INFO +132 -0
- docx_shrinker-0.1.0/README.md +121 -0
- docx_shrinker-0.1.0/pyproject.toml +22 -0
- docx_shrinker-0.1.0/src/docx_shrinker/__init__.py +8 -0
- docx_shrinker-0.1.0/src/docx_shrinker/cli.py +119 -0
- docx_shrinker-0.1.0/src/docx_shrinker/core.py +830 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# This workflow will upload a Python Package to PyPI when a release is created
|
|
2
|
+
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
|
3
|
+
|
|
4
|
+
name: Upload Python Package
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
release:
|
|
8
|
+
types: [published]
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
release-build:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.x"
|
|
23
|
+
|
|
24
|
+
- name: Build release distributions
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install build
|
|
27
|
+
python -m build
|
|
28
|
+
|
|
29
|
+
- name: Upload distributions
|
|
30
|
+
uses: actions/upload-artifact@v4
|
|
31
|
+
with:
|
|
32
|
+
name: release-dists
|
|
33
|
+
path: dist/
|
|
34
|
+
|
|
35
|
+
pypi-publish:
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
needs:
|
|
38
|
+
- release-build
|
|
39
|
+
permissions:
|
|
40
|
+
id-token: write
|
|
41
|
+
|
|
42
|
+
environment:
|
|
43
|
+
name: pypi
|
|
44
|
+
|
|
45
|
+
steps:
|
|
46
|
+
- name: Retrieve release distributions
|
|
47
|
+
uses: actions/download-artifact@v4
|
|
48
|
+
with:
|
|
49
|
+
name: release-dists
|
|
50
|
+
path: dist/
|
|
51
|
+
|
|
52
|
+
- name: Publish release distributions to PyPI
|
|
53
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
54
|
+
with:
|
|
55
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
|
|
28
|
+
# IDE
|
|
29
|
+
.idea/
|
|
30
|
+
.vscode/
|
|
31
|
+
*.swp
|
|
32
|
+
*.swo
|
|
33
|
+
|
|
34
|
+
# Testing
|
|
35
|
+
.tox/
|
|
36
|
+
.coverage
|
|
37
|
+
.coverage.*
|
|
38
|
+
htmlcov/
|
|
39
|
+
.pytest_cache/
|
|
40
|
+
|
|
41
|
+
# uv
|
|
42
|
+
.uv/
|
|
43
|
+
uv.lock
|
|
44
|
+
|
|
45
|
+
# OS
|
|
46
|
+
.DS_Store
|
|
47
|
+
Thumbs.db
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docx-shrinker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Shrink and sanitize Word (.docx) documents by converting Visio embeddings, compressing images, and stripping metadata
|
|
5
|
+
Author: cognitohazard
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: pymupdf
|
|
9
|
+
Requires-Dist: pywin32
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# docx-shrinker
|
|
13
|
+
|
|
14
|
+
Shrink and sanitize Word (.docx) documents. Converts embedded Visio diagrams to raster images, compresses oversized media, deduplicates files, and strips metadata, comments, tracked changes, macros, and other cruft.
|
|
15
|
+
|
|
16
|
+
## What it does
|
|
17
|
+
|
|
18
|
+
1. **Convert Visio embeddings** — `.vsdx` → PDF (via Visio COM) → JPG/PNG (via PyMuPDF). Falls back to keeping the EMF preview when Visio is unavailable.
|
|
19
|
+
2. **Convert OLE objects** — Replaces legacy VML `<w:object>` blocks with modern DrawingML `<w:drawing>` inline pictures.
|
|
20
|
+
3. **Compress images** — Resizes raster images exceeding a pixel width threshold and re-compresses JPGs.
|
|
21
|
+
4. **Deduplicate media** — Identifies identical files by hash and rewrites relationships to point to a single copy.
|
|
22
|
+
5. **Strip personal info** — Removes author, last modified by, company, manager, keywords, and other document properties.
|
|
23
|
+
6. **Remove comments and tracked changes** — Deletes comment files and accepts all revisions inline.
|
|
24
|
+
7. **Strip bookmarks** — Removes auto-generated bookmarks (`_GoBack` and bookmarks with an empty name).
|
|
25
|
+
8. **Remove garbage parts** — Thumbnail, VBA macros, printer settings, ActiveX controls, custom XML data.
|
|
26
|
+
9. **Clean up** — Updates `[Content_Types].xml` and `.rels` files to reflect removed parts.
|
|
27
|
+
10. **Validate output** — Checks ZIP integrity and presence of `[Content_Types].xml` before finalizing.
|
|
28
|
+
|
|
29
|
+
## Requirements
|
|
30
|
+
|
|
31
|
+
- **Python 3.10+**
|
|
32
|
+
- **PyMuPDF** (`pymupdf`) — image compression and PDF-to-image rendering
|
|
33
|
+
- **pywin32** — Visio COM automation (Windows only; Visio conversion is skipped if unavailable)
|
|
34
|
+
- **Microsoft Visio** (optional) — required only for converting embedded `.vsdx` to high-quality images
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
pip install docx-shrinker
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
uv tool install docx-shrinker
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
### Command line
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
docx-shrinker report.docx
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
This produces `report (shrunk).docx` in the same directory.
|
|
57
|
+
|
|
58
|
+
Specify an output path:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
docx-shrinker report.docx output.docx
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Options
|
|
65
|
+
|
|
66
|
+
| Flag | Default | Description |
|
|
67
|
+
|------|---------|-------------|
|
|
68
|
+
| `--format {jpg,png}` | `jpg` | Image format for converted Visio figures |
|
|
69
|
+
| `--dpi N` | `300` | Rasterization DPI for Visio conversion |
|
|
70
|
+
| `--quality N` | `95` | JPG quality (1–100). Ignored for PNG. |
|
|
71
|
+
| `--max-width N` | `2000` | Max pixel width for raster images. `0` to disable. |
|
|
72
|
+
| `-i, --interactive` | off | After conversion, show top 5 largest images and offer to re-convert at different quality |
|
|
73
|
+
| `--version` | | Show version and exit |
|
|
74
|
+
|
|
75
|
+
### Examples
|
|
76
|
+
|
|
77
|
+
Convert Visio figures to PNG at 150 DPI:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
docx-shrinker report.docx --format png --dpi 150
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Aggressive compression (lower quality, smaller max width):
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
docx-shrinker report.docx --quality 80 --max-width 1200
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Interactive mode to fine-tune large images:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
docx-shrinker report.docx -i
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Python API
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from docx_shrinker import shrink_docx
|
|
99
|
+
|
|
100
|
+
result = shrink_docx("input.docx", "output.docx", fmt="jpg", dpi=300, quality=95)
|
|
101
|
+
|
|
102
|
+
print(f"{result['original_size_mb']} MB -> {result['new_size_mb']} MB")
|
|
103
|
+
print(f"Reduction: {result['reduction_percent']}%")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The `result` dict contains:
|
|
107
|
+
|
|
108
|
+
| Key | Type | Description |
|
|
109
|
+
|-----|------|-------------|
|
|
110
|
+
| `original_size_mb` | `float` | Original file size |
|
|
111
|
+
| `new_size_mb` | `float` | Output file size |
|
|
112
|
+
| `reduction_mb` | `float` | Size saved |
|
|
113
|
+
| `reduction_percent` | `float` | Percentage reduction |
|
|
114
|
+
| `output_path` | `str` | Path to the output file |
|
|
115
|
+
| `visio_converted` | `list` | `(name, size_kb)` tuples for each converted Visio diagram |
|
|
116
|
+
| `visio_removed` | `int` | Number of `.vsdx` embeddings removed |
|
|
117
|
+
| `images_compressed` | `list` | `(filename, old_kb, new_kb)` tuples |
|
|
118
|
+
| `duplicates_removed` | `int` | Number of duplicate media files removed |
|
|
119
|
+
| `comments_removed` | `int` | Number of comment files removed |
|
|
120
|
+
| `bookmarks_removed` | `int` | Number of bookmarks removed |
|
|
121
|
+
| `garbage_removed` | `list` | Names of removed garbage parts |
|
|
122
|
+
| `warnings` | `list` | Warning messages (e.g., Visio unavailable) |
|
|
123
|
+
|
|
124
|
+
## How it works
|
|
125
|
+
|
|
126
|
+
A `.docx` file is a ZIP archive containing XML and media files. docx-shrinker extracts the archive into a temp directory, applies all transformations in-place, then repacks it into a new ZIP. The original file is never modified.
|
|
127
|
+
|
|
128
|
+
Visio diagrams embedded as OLE objects include both the full `.vsdx` source and a low-resolution EMF preview image. docx-shrinker replaces these with a high-quality raster render and strips the heavy `.vsdx` originals — often the single biggest source of bloat.
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
MIT
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# docx-shrinker
|
|
2
|
+
|
|
3
|
+
Shrink and sanitize Word (.docx) documents. Converts embedded Visio diagrams to raster images, compresses oversized media, deduplicates files, and strips metadata, comments, tracked changes, macros, and other cruft.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
1. **Convert Visio embeddings** — `.vsdx` → PDF (via Visio COM) → JPG/PNG (via PyMuPDF). Falls back to keeping the EMF preview when Visio is unavailable.
|
|
8
|
+
2. **Convert OLE objects** — Replaces legacy VML `<w:object>` blocks with modern DrawingML `<w:drawing>` inline pictures.
|
|
9
|
+
3. **Compress images** — Resizes raster images exceeding a pixel width threshold and re-compresses JPGs.
|
|
10
|
+
4. **Deduplicate media** — Identifies identical files by hash and rewrites relationships to point to a single copy.
|
|
11
|
+
5. **Strip personal info** — Removes author, last modified by, company, manager, keywords, and other document properties.
|
|
12
|
+
6. **Remove comments and tracked changes** — Deletes comment files and accepts all revisions inline.
|
|
13
|
+
7. **Strip bookmarks** — Removes auto-generated bookmarks (`_GoBack` and bookmarks with an empty name).
|
|
14
|
+
8. **Remove garbage parts** — Thumbnail, VBA macros, printer settings, ActiveX controls, custom XML data.
|
|
15
|
+
9. **Clean up** — Updates `[Content_Types].xml` and `.rels` files to reflect removed parts.
|
|
16
|
+
10. **Validate output** — Checks ZIP integrity and presence of `[Content_Types].xml` before finalizing.
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- **Python 3.10+**
|
|
21
|
+
- **PyMuPDF** (`pymupdf`) — image compression and PDF-to-image rendering
|
|
22
|
+
- **pywin32** — Visio COM automation (Windows only; Visio conversion is skipped if unavailable)
|
|
23
|
+
- **Microsoft Visio** (optional) — required only for converting embedded `.vsdx` to high-quality images
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
pip install docx-shrinker
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
uv tool install docx-shrinker
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
### Command line
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
docx-shrinker report.docx
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
This produces `report (shrunk).docx` in the same directory.
|
|
46
|
+
|
|
47
|
+
Specify an output path:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
docx-shrinker report.docx output.docx
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Options
|
|
54
|
+
|
|
55
|
+
| Flag | Default | Description |
|
|
56
|
+
|------|---------|-------------|
|
|
57
|
+
| `--format {jpg,png}` | `jpg` | Image format for converted Visio figures |
|
|
58
|
+
| `--dpi N` | `300` | Rasterization DPI for Visio conversion |
|
|
59
|
+
| `--quality N` | `95` | JPG quality (1–100). Ignored for PNG. |
|
|
60
|
+
| `--max-width N` | `2000` | Max pixel width for raster images. `0` to disable. |
|
|
61
|
+
| `-i, --interactive` | off | After conversion, show top 5 largest images and offer to re-convert at different quality |
|
|
62
|
+
| `--version` | | Show version and exit |
|
|
63
|
+
|
|
64
|
+
### Examples
|
|
65
|
+
|
|
66
|
+
Convert Visio figures to PNG at 150 DPI:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
docx-shrinker report.docx --format png --dpi 150
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Aggressive compression (lower quality, smaller max width):
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
docx-shrinker report.docx --quality 80 --max-width 1200
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Interactive mode to fine-tune large images:
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
docx-shrinker report.docx -i
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Python API
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from docx_shrinker import shrink_docx
|
|
88
|
+
|
|
89
|
+
result = shrink_docx("input.docx", "output.docx", fmt="jpg", dpi=300, quality=95)
|
|
90
|
+
|
|
91
|
+
print(f"{result['original_size_mb']} MB -> {result['new_size_mb']} MB")
|
|
92
|
+
print(f"Reduction: {result['reduction_percent']}%")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The `result` dict contains:
|
|
96
|
+
|
|
97
|
+
| Key | Type | Description |
|
|
98
|
+
|-----|------|-------------|
|
|
99
|
+
| `original_size_mb` | `float` | Original file size |
|
|
100
|
+
| `new_size_mb` | `float` | Output file size |
|
|
101
|
+
| `reduction_mb` | `float` | Size saved |
|
|
102
|
+
| `reduction_percent` | `float` | Percentage reduction |
|
|
103
|
+
| `output_path` | `str` | Path to the output file |
|
|
104
|
+
| `visio_converted` | `list` | `(name, size_kb)` tuples for each converted Visio diagram |
|
|
105
|
+
| `visio_removed` | `int` | Number of `.vsdx` embeddings removed |
|
|
106
|
+
| `images_compressed` | `list` | `(filename, old_kb, new_kb)` tuples |
|
|
107
|
+
| `duplicates_removed` | `int` | Number of duplicate media files removed |
|
|
108
|
+
| `comments_removed` | `int` | Number of comment files removed |
|
|
109
|
+
| `bookmarks_removed` | `int` | Number of bookmarks removed |
|
|
110
|
+
| `garbage_removed` | `list` | Names of removed garbage parts |
|
|
111
|
+
| `warnings` | `list` | Warning messages (e.g., Visio unavailable) |
|
|
112
|
+
|
|
113
|
+
## How it works
|
|
114
|
+
|
|
115
|
+
A `.docx` file is a ZIP archive containing XML and media files. docx-shrinker extracts the archive into a temp directory, applies all transformations in-place, then repacks it into a new ZIP. The original file is never modified.
|
|
116
|
+
|
|
117
|
+
Visio diagrams embedded as OLE objects include both the full `.vsdx` source and a low-resolution EMF preview image. docx-shrinker replaces these with a high-quality raster render and strips the heavy `.vsdx` originals — often the single biggest source of bloat.
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "docx-shrinker"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Shrink and sanitize Word (.docx) documents by converting Visio embeddings, compressing images, and stripping metadata"
|
|
5
|
+
authors = [{name = "cognitohazard"}]
|
|
6
|
+
license = {text = "MIT"}
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
requires-python = ">=3.10"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"pywin32",
|
|
11
|
+
"pymupdf",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.scripts]
|
|
15
|
+
docx-shrinker = "docx_shrinker.cli:main"
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["hatchling"]
|
|
19
|
+
build-backend = "hatchling.build"
|
|
20
|
+
|
|
21
|
+
[tool.hatch.build.targets.wheel]
|
|
22
|
+
packages = ["src/docx_shrinker"]
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Command-line interface for docx-shrinker."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
7
|
+
|
|
8
|
+
from .core import shrink_docx
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
__version__ = version("docx-shrinker")
|
|
12
|
+
except PackageNotFoundError:
|
|
13
|
+
from . import __version__
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _print_result(result):
    """Write a human-readable summary of a shrink run to stdout.

    ``result`` is the mapping returned by ``shrink_docx``. Optional sections
    (Visio, compression, deduplication, comments, bookmarks, garbage) are only
    reported when their entry is non-empty / non-zero; the size summary is
    always printed last.
    """
    # One line per converted Visio diagram.
    report = [
        f' Visio converted: {diagram} -> {kb} KB'
        for diagram, kb in result['visio_converted']
    ]

    visio_removed = result['visio_removed']
    if visio_removed:
        report.append(f' Visio removed: {visio_removed} embedding(s)')

    # One line per recompressed image.
    report.extend(
        f' Compressed: {image}: {before} KB -> {after} KB'
        for image, before, after in result['images_compressed']
    )

    duplicates = result['duplicates_removed']
    if duplicates:
        report.append(f' Deduplicated: {duplicates} file(s)')

    # These two steps are reported unconditionally.
    report.append(' Personal info: stripped')
    report.append(' Tracked changes: accepted/stripped')

    comments = result['comments_removed']
    if comments:
        report.append(f' Comments: {comments} file(s) removed')

    bookmarks = result['bookmarks_removed']
    if bookmarks:
        report.append(f' Bookmarks: {bookmarks} removed')

    garbage = result['garbage_removed']
    if garbage:
        report.append(' Garbage removed: ' + ', '.join(garbage))

    # Blank separator, then the before/after size line.
    report.append('')
    report.append(
        ' {} MB -> {} MB (-{} MB, {}%)'.format(
            result['original_size_mb'],
            result['new_size_mb'],
            result['reduction_mb'],
            result['reduction_percent'],
        )
    )

    print('\n'.join(report))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _print_warnings(warnings):
    """Emit every collected warning message on stderr, one per line."""
    for message in warnings:
        print(' WARNING: {}'.format(message), file=sys.stderr)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def main(argv=None) -> int:
    """Main entry point for the CLI.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so the console-script entry point keeps
            working unchanged.

    Returns:
        0 when the document was shrunk and saved, 1 when shrinking raised an
        exception (the message is printed to stderr). Invalid command-line
        values terminate through ``parser.error`` (``SystemExit``), like any
        other argparse usage error.
    """
    parser = argparse.ArgumentParser(
        prog='docx-shrinker',
        description='Shrink and sanitize a Word document.',
        epilog='''steps performed:
1. Convert embedded Visio .vsdx -> PDF (via Visio COM) -> JPG/PNG (via PyMuPDF)
Falls back to keeping EMF previews when Visio is unavailable.
2. Convert OLE/VML objects to DrawingML inline pictures
3. Compress/resize oversized raster images (--max-width)
4. Deduplicate identical media files
5. Strip personal info (author, company, manager, etc.)
6. Remove comments, tracked changes, and revision history
7. Strip internal bookmarks (_GoBack, etc.)
8. Remove thumbnail, VBA macros, printer settings, ActiveX, custom XML
9. Clean up relationships and content types
10. Validate output ZIP integrity''',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input', help='Input .docx file')
    parser.add_argument('output', nargs='?', default=None,
                        help='Output .docx file (default: "input (shrunk).docx")')
    parser.add_argument('--format', choices=['jpg', 'png'], default='jpg',
                        help='Image format for converted Visio figures (default: jpg)')
    parser.add_argument('--dpi', type=int, default=300,
                        help='Rasterization DPI (default: 300)')
    parser.add_argument('--quality', type=int, default=95,
                        help='JPG quality 1-100 (default: 95). PNG is always lossless.')
    parser.add_argument('--max-width', type=int, default=2000,
                        help='Max pixel width for non-Visio images (default: 2000). '
                             '0 to disable resizing.')
    parser.add_argument('-i', '--interactive', action='store_true',
                        help='After conversion, show top 5 largest images and '
                             'offer to re-convert at different quality.')
    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')

    args = parser.parse_args(argv)

    # Reject out-of-range numbers up front instead of letting them fail deep
    # inside the image pipeline with an opaque error.
    if not 1 <= args.quality <= 100:
        parser.error('--quality must be between 1 and 100')
    if args.dpi <= 0:
        parser.error('--dpi must be a positive integer')
    if args.max_width < 0:
        parser.error('--max-width must be 0 (disabled) or a positive integer')

    src = args.input
    if args.output:
        dst = args.output
    else:
        # Default output sits next to the input: "name (shrunk).ext".
        base, ext = os.path.splitext(src)
        dst = f'{base} (shrunk){ext}'

    # Top-level boundary: any failure becomes a message plus exit code 1.
    try:
        print(f'Shrinking: {src}')
        result = shrink_docx(src, dst, fmt=args.format, dpi=args.dpi,
                             quality=args.quality, max_width=args.max_width,
                             interactive=args.interactive)
        _print_warnings(result['warnings'])
        _print_result(result)
        print(f'Saved: {dst}')
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
if __name__ == '__main__':
|
|
119
|
+
sys.exit(main())
|
|
@@ -0,0 +1,830 @@
|
|
|
1
|
+
"""Core functionality for shrinking and sanitizing Word (.docx) documents."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import zipfile
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
|
|
10
|
+
# Regex fragments naming the package parts that cleanup removes; they are
# matched against content-type and relationship entries.
_CLEANUP_PATTERNS = [
    r'vbaProject\.bin',
    r'comments\.xml',
    r'commentsExtended\.xml',
    r'commentsIds\.xml',
    r'thumbnail\.\w+',
    r'vbaData\.xml',
    r'printerSettings/',
    r'activeX/',
    r'customXml/',
    r'custom\.xml',
    r'embeddings/[^"]*\.vsdx',
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def extract_vml_dimensions(obj_xml):
    """Extract width/height in EMU from a VML <w:object> block.

    Scans every ``style="..."`` attribute and uses the first one that carries
    both a ``width`` and a ``height`` declaration (skipping unrelated styles
    such as 'miter' on stroke elements). Lengths may be given in pt, in, cm,
    mm or px; whitespace around the colon and the unit is tolerated, and a
    property is only matched as a whole name (``line-width:`` is not taken
    for ``width:``).

    Returns:
        ``(width_emu, height_emu)`` as ints. When no style declares both
        dimensions, the fallback ``(3048000, 2286000)`` is returned.
    """
    # EMU per unit: 914400 EMU = 1 inch = 72 pt = 2.54 cm = 96 CSS px.
    emu_per_unit = {
        'pt': 12700,
        'in': 914400,
        'cm': 360000,
        'mm': 36000,
        'px': 9525,
    }
    length = r'\s*:\s*(\d+(?:\.\d*)?|\.\d+)\s*(pt|in|cm|mm|px)'
    # The lookbehind stops "width" from matching inside e.g. "line-width".
    width_re = re.compile(r'(?<![\w-])width' + length, re.IGNORECASE)
    height_re = re.compile(r'(?<![\w-])height' + length, re.IGNORECASE)

    width_emu = 3048000   # fallback: 240 pt (about 3.33 in)
    height_emu = 2286000  # fallback: 180 pt (2.5 in)

    for style in re.findall(r'style="([^"]*)"', obj_xml):
        w_m = width_re.search(style)
        h_m = height_re.search(style)
        if w_m and h_m:
            # int() truncates, matching how pt/in values were always converted.
            width_emu = int(float(w_m.group(1)) * emu_per_unit[w_m.group(2).lower()])
            height_emu = int(float(h_m.group(1)) * emu_per_unit[h_m.group(2).lower()])
            break

    return width_emu, height_emu
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def object_to_drawing(obj_xml, doc_pr_id):
    """Turn a VML <w:object> block into an inline DrawingML <w:drawing> block.

    The picture relationship is taken from the ``r:id`` of the block's
    ``<v:imagedata>`` element and the extent from ``extract_vml_dimensions``.
    If no such ``rId<n>`` reference is found, ``obj_xml`` is returned
    unchanged. ``doc_pr_id`` must be unique across the document; it is used
    for both the docPr and cNvPr ids and for the generated picture name.
    """
    found = re.search(r'<v:imagedata\s[^>]*r:id="(rId\d+)"', obj_xml)
    if found is None:
        return obj_xml

    rel_id = found.group(1)
    width, height = extract_vml_dimensions(obj_xml)
    pic_name = f'Picture {doc_pr_id}'

    pieces = [
        '<w:drawing>',
        '<wp:inline distT="0" distB="0" distL="0" distR="0">',
        f'<wp:extent cx="{width}" cy="{height}"/>',
        f'<wp:docPr id="{doc_pr_id}" name="{pic_name}"/>',
        '<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">',
        '<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">',
        '<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">',
        f'<pic:nvPicPr><pic:cNvPr id="{doc_pr_id}" name="{pic_name}"/><pic:cNvPicPr/></pic:nvPicPr>',
        f'<pic:blipFill><a:blip r:embed="{rel_id}"/>',
        '<a:stretch><a:fillRect/></a:stretch></pic:blipFill>',
        '<pic:spPr>',
        f'<a:xfrm><a:off x="0" y="0"/><a:ext cx="{width}" cy="{height}"/></a:xfrm>',
        '<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>',
        '</pic:spPr>',
        '</pic:pic></a:graphicData></a:graphic>',
        '</wp:inline></w:drawing>',
    ]
    return ''.join(pieces)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def next_doc_pr_id(doc_xml):
    """Return one more than the largest docPr/cNvPr ``id`` found in ``doc_xml``.

    Yields 1 when the document contains no such ids.
    """
    highest = 0
    for hit in re.finditer(r'(?:docPr|cNvPr)\b[^>]*\bid="(\d+)"', doc_xml):
        candidate = int(hit.group(1))
        if candidate > highest:
            highest = candidate
    return highest + 1
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def ensure_namespaces(doc_xml):
    """Ensure the root <w:document> element declares the wp: and r: namespaces
    needed by generated DrawingML blocks.

    A declaration counts as present only when the exact attribute name is
    followed by ``=``; a longer prefix such as ``xmlns:wp14`` or ``xmlns:r14``
    does not satisfy the check for ``xmlns:wp`` / ``xmlns:r``.

    Returns:
        ``doc_xml`` with any missing declarations appended to the root start
        tag, or ``doc_xml`` unchanged when no <w:document> start tag is found.
    """
    required = {
        'xmlns:wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'xmlns:r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    }
    root = re.search(r'(<w:document\b[^>]*)(>)', doc_xml)
    if not root:
        return doc_xml

    open_tag = root.group(1)
    for attr, uri in required.items():
        # Whole-name match: the lookbehind and the trailing "=" keep
        # "xmlns:wp" from being satisfied by "xmlns:wp14", "xmlns:wps", ...
        declared = re.search(rf'(?<![\w:.-]){re.escape(attr)}\s*=', open_tag)
        if not declared:
            open_tag += f' {attr}="{uri}"'

    return doc_xml[:root.start()] + open_tag + root.group(2) + doc_xml[root.end():]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def compress_media_images(media_dir, max_width=2000, quality=85, skip_files=None):
    """Re-encode oversized raster images in word/media/ to reduce file size.

    Images wider than ``max_width`` are downscaled (aspect ratio kept) and
    JPGs are re-compressed at ``quality``. A file is only replaced when the
    new encoding is smaller: JPGs must shrink by more than 5 %, resized PNGs
    by any amount. PNGs that were not resized are left alone because
    rewriting a lossless file does not help.

    Args:
        media_dir: Directory holding the media files; a missing directory
            yields an empty result.
        max_width: Maximum pixel width. ``0`` (or a negative value) disables
            resizing; JPGs are still re-compressed.
        quality: JPG quality passed to PyMuPDF when saving.
        skip_files: Set of filenames to skip (e.g. freshly converted Visio
            images).

    Returns:
        List of ``(filename, old_kb, new_kb)`` for images that were shrunk.
    """
    results = []
    if not os.path.isdir(media_dir):
        return results

    # Imported only once there is work to do, so a missing media directory
    # does not require PyMuPDF.
    import fitz

    skip = skip_files or set()

    for fname in os.listdir(media_dir):
        if fname in skip:
            continue
        ext = os.path.splitext(fname)[1].lower()
        if ext not in ('.png', '.jpg', '.jpeg'):
            continue
        path = os.path.join(media_dir, fname)
        old_size = os.path.getsize(path)

        try:
            pix = fitz.Pixmap(path)
        except Exception:
            # Best effort: an image PyMuPDF cannot load is left untouched.
            continue

        # NOTE(review): meant to drop alpha for JPG output — confirm that
        # Pixmap(colorspace, source) really removes the alpha channel.
        if pix.alpha:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        resized = False
        # max_width <= 0 means "do not resize"; without this guard a value of
        # 0 would produce a zero scale factor.
        if max_width > 0 and pix.width > max_width:
            scale = max_width / pix.width
            new_w = max_width
            new_h = int(pix.height * scale)
            # Resize via PDF re-render
            tmp_pdf = fitz.open()
            try:
                page = tmp_pdf.new_page(width=pix.width, height=pix.height)
                page.insert_image(page.rect, pixmap=pix)
                mat = fitz.Matrix(new_w / pix.width, new_h / pix.height)
                pix = tmp_pdf[0].get_pixmap(matrix=mat, alpha=False)
            finally:
                tmp_pdf.close()
            resized = True

        is_jpeg = ext in ('.jpg', '.jpeg')
        if not is_jpeg and not resized:
            # PNG is lossless: rewriting it without resizing won't help.
            continue

        # Save to a temp file first and only overwrite if actually smaller.
        tmp_path = path + '.recompress' + ext
        try:
            if is_jpeg:
                pix.save(tmp_path, jpg_quality=quality)
            else:
                pix.save(tmp_path)
            new_size = os.path.getsize(tmp_path)
            if is_jpeg:
                saved_pct = (old_size - new_size) / old_size * 100 if old_size else 0
                worthwhile = new_size < old_size and saved_pct > 5
            else:
                worthwhile = new_size < old_size
            if worthwhile:
                os.replace(tmp_path, path)
                results.append((fname, old_size // 1024, new_size // 1024))
        finally:
            # Still present when the rewrite was rejected or saving failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return results
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def dedup_media(media_dir, unpack_dir):
    """Deduplicate identical files in word/media/ by content hash.

    Files are grouped by SHA-256 digest. Within a group the alphabetically
    first filename is kept and the others are deleted. Every ``.rels`` file
    under ``unpack_dir`` is then rewritten so that references to a deleted
    file point at the kept one. Only a complete final path segment of a
    quoted attribute value is rewritten, so ``b.png`` never touches a
    reference to ``xb.png``.

    Args:
        media_dir: Directory containing the media files.
        unpack_dir: Root of the unpacked package; walked for ``.rels`` files.

    Returns:
        Number of duplicates removed (0 if ``media_dir`` does not exist).
    """
    if not os.path.isdir(media_dir):
        return 0

    # Hash all media files; sorted() makes the choice of canonical file
    # deterministic instead of depending on directory listing order.
    digest_to_names = {}
    for fname in sorted(os.listdir(media_dir)):
        path = os.path.join(media_dir, fname)
        if not os.path.isfile(path):
            continue
        digest = hashlib.sha256()
        with open(path, 'rb') as fh:
            while chunk := fh.read(65536):
                digest.update(chunk)
        digest_to_names.setdefault(digest.hexdigest(), []).append(fname)

    # Delete duplicates and remember where their references must point.
    renames = {}  # {duplicate_fname: canonical_fname}
    for names in digest_to_names.values():
        canonical, *duplicates = names
        for dup in duplicates:
            renames[dup] = canonical
            os.remove(os.path.join(media_dir, dup))

    if not renames:
        return 0

    # Match a duplicate's name only as the last path segment of a quoted
    # value: preceded by "/" or the opening quote, followed by the closing
    # quote. Longest names first so alternation cannot stop on a shorter one.
    alternatives = '|'.join(
        re.escape(name) for name in sorted(renames, key=len, reverse=True)
    )
    reference = re.compile(r'(?<=[/"\'])(?:' + alternatives + r')(?=["\'])')

    # Update all .rels files to point to canonical
    for rels_dir, _, files in os.walk(unpack_dir):
        for fname in files:
            if not fname.endswith('.rels'):
                continue
            rels_path = os.path.join(rels_dir, fname)
            with open(rels_path, 'r', encoding='utf-8') as f:
                content = f.read()
            updated = reference.sub(lambda m: renames[m.group(0)], content)
            if updated != content:
                with open(rels_path, 'w', encoding='utf-8') as f:
                    f.write(updated)

    return len(renames)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def strip_bookmarks(doc):
    """Remove auto-generated bookmarks from a document XML string.

    Targets ``<w:bookmarkStart>`` elements named ``_GoBack`` or with an empty
    name, together with the ``<w:bookmarkEnd>`` elements carrying the same
    ``w:id``. Start tags without a numeric ``w:id`` are left in place.

    Returns (modified_doc, count) where count is the number of start tags
    removed.
    """
    start_patterns = (
        r'<w:bookmarkStart\b[^>]*w:name="_GoBack"[^>]*/>',
        r'<w:bookmarkStart\b[^>]*w:name=""[^>]*/>',
    )

    # Collect every doomed start tag (and its id) before mutating anything.
    doomed = []
    for start_pattern in start_patterns:
        for tag in re.findall(start_pattern, doc):
            found_id = re.search(r'w:id="(\d+)"', tag)
            if found_id is not None:
                doomed.append((tag, found_id.group(1)))

    for tag, bookmark_id in doomed:
        # Drop one occurrence of the start tag, then every matching end tag.
        doc = doc.replace(tag, '', 1)
        end_pattern = rf'<w:bookmarkEnd\b[^>]*w:id="{bookmark_id}"[^>]*/>'
        doc = re.sub(end_pattern, '', doc)

    return doc, len(doomed)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _render_pdf_to_image(pdf_path, img_path, fmt='jpg', dpi=300, quality=95,
                         max_width=0):
    """Render the first page of a PDF to an image file via PyMuPDF.

    The page is inset by 1pt on every side to crop the Visio page frame
    border, then rasterized at ``dpi``. When ``max_width`` is positive the
    scale is reduced so the output is no wider than ``max_width`` pixels.

    Args:
        pdf_path: PDF to read.
        img_path: Image file to write.
        fmt: ``'jpg'`` saves with ``jpg_quality=quality``; any other value
            saves without a quality argument.
        dpi: Rasterization resolution (PDF units are 1/72 inch).
        quality: JPG quality; only used when ``fmt == 'jpg'``.
        max_width: Pixel width cap; ``0`` disables the cap.

    Returns:
        True on success. Errors from PyMuPDF propagate to the caller.
    """
    import fitz

    pdf_doc = fitz.open(pdf_path)
    try:
        page = pdf_doc[0]
        scale = dpi / 72

        # Crop border: inset by ~1pt (the Visio page frame)
        inset = 1.0  # points
        rect = page.rect
        clip = fitz.Rect(rect.x0 + inset, rect.y0 + inset,
                         rect.x1 - inset, rect.y1 - inset)

        # Cap scale so output width doesn't exceed max_width
        if max_width > 0:
            content_width_pt = clip.x1 - clip.x0
            full_width_px = content_width_pt * scale
            if full_width_px > max_width:
                scale = max_width / content_width_pt

        mat = fitz.Matrix(scale, scale)
        pix = page.get_pixmap(matrix=mat, alpha=False, clip=clip)
    finally:
        # Close the document even when opening the page or rendering fails.
        pdf_doc.close()

    if fmt == 'jpg':
        pix.save(img_path, jpg_quality=quality)
    else:
        pix.save(img_path)
    return True
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# Flags passed to Documents.OpenEx: read-only (0x2) and kept out of the
# recent-documents list (0x8).
_VISIO_OPEN_FLAGS = 0x2 | 0x8  # visOpenRO | visOpenDontList
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _export_vsdx_to_pdf(visio, vsdx_path, pdf_path):
    """Export a single .vsdx to PDF using an already-running Visio instance.

    Args:
        visio: Visio application COM object (as returned by ``_get_visio``).
        vsdx_path: Path of the drawing to open; made absolute before use.
        pdf_path: Path of the PDF to write; made absolute before use.

    Any error raised by the COM calls propagates to the caller.
    """
    doc = visio.Documents.OpenEx(os.path.abspath(vsdx_path), _VISIO_OPEN_FLAGS)
    try:
        doc.ExportAsFixedFormat(1, os.path.abspath(pdf_path), 1, 0)  # PDF, Print, All
    finally:
        # Close even when the export fails, so a bad drawing does not stay
        # open inside the Visio instance shared by the rest of the batch.
        doc.Close()
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _get_visio(warnings):
    """Start Visio through COM and hand back the application object.

    Returns None, after appending an explanatory message to ``warnings``,
    when pywin32 cannot be imported or the ``Visio.Application`` COM object
    cannot be created.
    """
    try:
        import win32com.client
    except ImportError:
        warnings.append('pywin32 not installed — skipping Visio conversion')
        return None

    try:
        app = win32com.client.Dispatch('Visio.Application')
    except Exception:
        warnings.append('Visio not available — keeping EMF previews')
        return None
    else:
        app.Visible = False
        app.AlertResponse = 7  # suppress dialogs (answer "No")
        return app
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def convert_vsdx_via_visio(vsdx_paths, out_dir, warnings, fmt='jpg', dpi=300,
                           quality=95, max_width=2000, keep_pdfs=False):
    """Convert .vsdx files via Visio COM (vsdx->PDF) then PyMuPDF (PDF->image).

    Args:
        vsdx_paths: Paths of the drawings to convert.
        out_dir: Directory receiving ``<base>.pdf`` and ``<base>.<fmt>``.
        warnings: List that receives a message for every failure.
        fmt, dpi, quality, max_width: Forwarded to ``_render_pdf_to_image``.
        keep_pdfs: When False the intermediate PDF is deleted after rendering.

    Returns dict: {vsdx_path: image_path} for successful conversions.
    """
    results = {}

    # Nothing to convert: do not start Visio at all. This also avoids adding
    # a "pywin32 not installed" / "Visio not available" warning to documents
    # that contain no Visio drawings.
    if not vsdx_paths:
        return results

    visio = _get_visio(warnings)
    if visio is None:
        return results

    try:
        for vsdx_path in vsdx_paths:
            base = os.path.splitext(os.path.basename(vsdx_path))[0]
            pdf_path = os.path.join(out_dir, f'{base}.pdf')
            img_path = os.path.join(out_dir, f'{base}.{fmt}')
            try:
                _export_vsdx_to_pdf(visio, vsdx_path, pdf_path)

                if os.path.exists(pdf_path):
                    _render_pdf_to_image(pdf_path, img_path, fmt=fmt,
                                         dpi=dpi, quality=quality,
                                         max_width=max_width)

                    if not keep_pdfs:
                        try:
                            os.remove(pdf_path)
                        except OSError:
                            pass  # best effort: out_dir is a scratch directory

                if os.path.exists(img_path):
                    results[vsdx_path] = img_path
            except Exception as e:
                # One bad drawing must not abort the batch: record it and
                # remove any partial output for this file.
                warnings.append(f'Visio failed on {base}.vsdx: {e}')
                for p in [pdf_path, img_path]:
                    if os.path.exists(p):
                        try:
                            os.remove(p)
                        except OSError:
                            pass
    finally:
        # Always shut the hidden Visio instance down.
        try:
            visio.Quit()
        except Exception:
            pass

    return results
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _strip_xml_tags(path, tags):
    """Remove the given XML elements (with their content) from a file.

    Args:
        path: XML file to rewrite in place. A missing file is ignored.
        tags: Iterable of qualified element names, e.g. ``'dc:creator'``.

    Both ``<tag ...>...</tag>`` and self-closing ``<tag .../>`` forms are
    removed. Element names are matched exactly, so ``Company`` does not
    touch ``<CompanyInfo>``. The file is only rewritten when something was
    actually removed.
    """
    if not os.path.exists(path):
        return
    with open(path, 'r', encoding='utf-8') as f:
        xml = f.read()

    cleaned = xml
    for tag in tags:
        name = re.escape(tag)
        # Self-closing form first, so it can never be mistaken for an opening
        # tag and swallow everything up to a later closing tag.
        cleaned = re.sub(rf'<{name}(?=[\s/>])[^>]*?/>', '', cleaned)
        cleaned = re.sub(rf'<{name}(?=[\s>])[^>]*>.*?</{name}\s*>', '', cleaned,
                         flags=re.DOTALL)

    if cleaned != xml:
        with open(path, 'w', encoding='utf-8') as f:
            f.write(cleaned)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def sanitize_core_props(unpack_dir):
    """Remove personal-information elements from ``docProps/core.xml``.

    ``unpack_dir`` is the root of the unpacked document.
    """
    core_xml = os.path.join(unpack_dir, 'docProps', 'core.xml')
    personal_tags = [
        'dc:creator', 'cp:lastModifiedBy', 'cp:lastPrinted',
        'cp:revision', 'dc:subject', 'cp:keywords',
        'cp:category', 'cp:contentStatus',
    ]
    _strip_xml_tags(core_xml, personal_tags)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def sanitize_app_props(unpack_dir):
    """Remove sensitive elements from ``docProps/app.xml``.

    ``unpack_dir`` is the root of the unpacked document.
    """
    app_xml = os.path.join(unpack_dir, 'docProps', 'app.xml')
    sensitive_tags = ['Company', 'Manager', 'HyperlinkBase']
    _strip_xml_tags(app_xml, sensitive_tags)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def remove_comment_files(unpack_dir):
    """Delete the comment parts under ``word/`` and report how many existed.

    The parts considered are comments.xml, commentsExtended.xml and
    commentsIds.xml. Returns the number of files removed.
    """
    word_dir = os.path.join(unpack_dir, 'word')
    removed = 0
    for part in ('comments.xml', 'commentsExtended.xml', 'commentsIds.xml'):
        target = os.path.join(word_dir, part)
        if not os.path.exists(target):
            continue
        os.remove(target)
        removed += 1
    return removed
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def strip_comment_refs(doc):
    """Return ``doc`` with all self-closing comment anchors removed.

    Handles ``w:commentRangeStart``, ``w:commentRangeEnd`` and
    ``w:commentReference`` elements, in that order.
    """
    for element in ('commentRangeStart', 'commentRangeEnd', 'commentReference'):
        doc = re.sub(rf'<w:{element}\b[^>]*/>', '', doc)
    return doc
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def strip_revisions(doc, warnings):
    """Accept all tracked changes by stripping revision markup from document XML string.

    Args:
        doc: Document XML as a string.
        warnings: List that receives a message when paired ``w:del``/``w:ins``
            tags do not balance.

    Returns:
        The XML string with:
          * paired ``<w:del>...</w:del>`` blocks removed with their content,
          * paired ``<w:ins>...</w:ins>`` wrappers removed, content kept,
          * self-closing ``<w:del .../>`` / ``<w:ins .../>`` markers removed,
          * ``*PrChange`` blocks and ``w:rsid*`` attributes removed.

    Paired del/ins removal is skipped (with a warning) when the opening and
    closing tag counts differ, to avoid content loss.
    """
    balanced = True
    for tag in ['w:del', 'w:ins']:
        # Count only real opening tags. Self-closing markers have no closing
        # tag and must not be reported as a mismatch.
        opens = len(re.findall(rf'<{tag}\b[^>]*(?<!/)>', doc))
        closes = len(re.findall(rf'</{tag}>', doc))
        if opens != closes:
            warnings.append(f'Mismatched {tag} tags ({opens} open, {closes} close) — '
                            f'skipping {tag} removal to avoid content loss')
            balanced = False

    # Self-closing markers carry no content, so dropping them is always safe.
    # This must happen before the paired regexes below, which would otherwise
    # treat such a marker as an opening tag and match through to the next
    # closing tag.
    doc = re.sub(r'<w:(?:del|ins)\b[^>]*/>', '', doc)

    if balanced:
        doc = re.sub(r'<w:del\b[^>]*>.*?</w:del>', '', doc, flags=re.DOTALL)
        doc = re.sub(r'<w:ins\b[^>]*>(.*?)</w:ins>', r'\1', doc, flags=re.DOTALL)

    # Always safe to strip property-change blocks and rsid attributes
    for tag in ['rPrChange', 'pPrChange', 'sectPrChange', 'tblPrChange',
                'tblGridChange', 'tcPrChange', 'trPrChange']:
        # Same ordering concern as above: self-closing form first.
        doc = re.sub(rf'<w:{tag}\b[^>]*/>', '', doc)
        doc = re.sub(rf'<w:{tag}\b[^>]*>.*?</w:{tag}>', '', doc, flags=re.DOTALL)
    doc = re.sub(r'\s+w:rsid\w*="[^"]*"', '', doc)
    return doc
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def remove_garbage_parts(unpack_dir):
    """Delete thumbnail, VBA, printer-settings, ActiveX and custom-XML parts.

    Returns the labels of the parts that existed and were removed, in the
    order they are checked. Directory labels end with ``/``.
    """
    # (path components under unpack_dir, deleter, label reported to caller)
    targets = [
        (('docProps', f'thumbnail.{ext}'), os.remove, f'thumbnail.{ext}')
        for ext in ('jpeg', 'jpg', 'png', 'emf', 'wmf')
    ]
    targets += [
        (('word', 'vbaProject.bin'), os.remove, 'vbaProject.bin'),
        (('word', 'vbaData.xml'), os.remove, 'vbaData.xml'),
        (('word', 'printerSettings'), shutil.rmtree, 'printerSettings/'),
        (('word', 'activeX'), shutil.rmtree, 'activeX/'),
        (('customXml',), shutil.rmtree, 'customXml/'),
        (('docProps', 'custom.xml'), os.remove, 'custom.xml'),
    ]

    removed = []
    for parts, delete, label in targets:
        target = os.path.join(unpack_dir, *parts)
        if os.path.exists(target):
            delete(target)
            removed.append(label)
    return removed
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def clean_content_types(unpack_dir):
    """Drop ``[Content_Types].xml`` entries that point at deleted parts.

    Removes every ``<Override>`` matching one of ``_CLEANUP_PATTERNS`` and
    any ``bin`` ``<Default>`` that mentions vbaProject. Does nothing when the
    file is absent.
    """
    types_file = os.path.join(unpack_dir, '[Content_Types].xml')
    if os.path.exists(types_file):
        with open(types_file, 'r', encoding='utf-8') as src:
            content = src.read()

        for pattern in _CLEANUP_PATTERNS:
            override_re = rf'<Override[^>]*{pattern}[^>]*/>'
            content = re.sub(override_re, '', content)
        content = re.sub(r'<Default[^>]*Extension="bin"[^>]*vbaProject[^>]*/>', '', content)

        with open(types_file, 'w', encoding='utf-8') as dst:
            dst.write(content)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def clean_relationships(unpack_dir):
    """Drop ``<Relationship>`` entries whose Target points at a deleted part.

    Every ``.rels`` file under ``unpack_dir`` is scanned against
    ``_CLEANUP_PATTERNS``; a file is rewritten only if at least one entry
    was removed from it.
    """
    for folder, _subdirs, names in os.walk(unpack_dir):
        for rels_name in (n for n in names if n.endswith('.rels')):
            rels_path = os.path.join(folder, rels_name)
            with open(rels_path, 'r', encoding='utf-8') as src:
                content = src.read()

            hits = 0
            for pattern in _CLEANUP_PATTERNS:
                # Shared prefix up to the end of the element's attributes;
                # the two suffixes cover the self-closing and the paired form.
                head = rf'<Relationship\b[^>]*Target="[^"]*{pattern}[^"]*"[^>]*'
                for tail in ('/>', '>.*?</Relationship>'):
                    content, count = re.subn(head + tail, '', content, flags=re.DOTALL)
                    hits += count

            if hits:
                with open(rels_path, 'w', encoding='utf-8') as dst:
                    dst.write(content)
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _interactive_reconvert(media_dir, emf_to_vsdx, conversions,
                           tmp_dir, fmt, dpi, warnings, max_width=0):
    """Offer to re-render the largest converted images at another quality.

    Lists up to five of the largest converted images found in ``media_dir``,
    reads a selection (and optional ``q=NN`` quality, default 85) from
    standard input, and re-renders the chosen images from their PDFs in
    ``tmp_dir``. PDFs that are missing are re-exported from the original
    .vsdx through a single Visio session. Problems are appended to
    ``warnings``; nothing is returned.
    """

    def _pdf_for(vsdx):
        # Base name of the drawing and the PDF path derived from it.
        stem = os.path.splitext(os.path.basename(vsdx))[0]
        return stem, os.path.join(tmp_dir, f'{stem}.pdf')

    # (image filename, size in bytes, vsdx path) for every converted image
    candidates = []
    for emf_name, vsdx_path in emf_to_vsdx.items():
        if vsdx_path not in conversions:
            continue
        img_file = f'{os.path.splitext(emf_name)[0]}.{fmt}'
        img_path = os.path.join(media_dir, img_file)
        if os.path.exists(img_path):
            candidates.append((img_file, os.path.getsize(img_path), vsdx_path))
    if not candidates:
        return

    top5 = sorted(candidates, key=lambda entry: entry[1], reverse=True)[:5]

    print(f'\n Top {len(top5)} largest converted images:')
    for rank, (fname, size, _) in enumerate(top5, 1):
        print(f' {rank}. {fname} ({size // 1024} KB)')

    print('\n Enter numbers to re-convert (e.g. "1 3 5"), new quality (e.g. "1,3 q=80"),')
    print(' or press Enter to skip: ', end='', flush=True)

    try:
        reply = input().strip()
    except (EOFError, KeyboardInterrupt):
        return
    if not reply:
        return

    # Optional quality override; the default is lower than the initial pass.
    new_quality = 85
    q_match = re.search(r'q=(\d+)', reply)
    if q_match:
        new_quality = max(1, min(100, int(q_match.group(1))))
    reply = re.sub(r'q=\d+', '', reply).strip()

    # 1-based numbers typed by the user -> 0-based indices into top5
    selected = {
        int(token) - 1
        for token in re.split(r'[,\s]+', reply)
        if token.isdigit() and 1 <= int(token) <= len(top5)
    }
    if not selected:
        return
    order = sorted(selected)

    # Drawings whose PDF is gone but whose .vsdx is still on disk
    needs_reexport = []
    for idx in order:
        vsdx_path = top5[idx][2]
        _, pdf_path = _pdf_for(vsdx_path)
        if not os.path.exists(pdf_path) and os.path.exists(vsdx_path):
            needs_reexport.append((vsdx_path, pdf_path))

    # Re-export missing PDFs via Visio (one session for all)
    if needs_reexport:
        visio = _get_visio(warnings)
        if visio is not None:
            try:
                for vsdx_path, pdf_path in needs_reexport:
                    try:
                        _export_vsdx_to_pdf(visio, vsdx_path, pdf_path)
                    except Exception as e:
                        stem, _ = _pdf_for(vsdx_path)
                        warnings.append(f'Could not re-export {stem}: {e}')
            finally:
                try:
                    visio.Quit()
                except Exception:
                    pass

    print(f' Re-converting {len(selected)} image(s) at quality={new_quality}...')
    for idx in order:
        fname, _, vsdx_path = top5[idx]
        img_path = os.path.join(media_dir, fname)
        stem, pdf_path = _pdf_for(vsdx_path)

        if not os.path.exists(pdf_path):
            warnings.append(f'PDF not available for {stem}, skipping')
            continue

        try:
            old_size = os.path.getsize(img_path)
            _render_pdf_to_image(pdf_path, img_path, fmt=fmt, dpi=dpi,
                                 quality=new_quality, max_width=max_width)
            new_size = os.path.getsize(img_path)
            print(f' {fname}: {old_size // 1024} KB -> {new_size // 1024} KB')
        except Exception as e:
            warnings.append(f'Re-conversion failed for {fname}: {e}')
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def shrink_docx(src_path, dst_path, fmt='jpg', dpi=300, quality=95, max_width=2000,
                interactive=False):
    """Shrink and sanitize a Word document.

    Args:
        src_path: Path of the .docx to read.
        dst_path: Path of the .docx to write. May equal ``src_path``; the
            output is written to ``dst_path + '.tmp'`` and moved into place
            only after it has been validated.
        fmt: Image format for converted Visio previews (``'jpg'`` or ``'png'``
            get a content-type entry).
        dpi, quality, max_width: Rendering/compression settings.
        interactive: When True, intermediate PDFs are kept and the user is
            offered a re-conversion of the largest images.

    Returns a dict with:
        original_size_mb, new_size_mb, reduction_mb, reduction_percent,
        output_path, visio_converted (list of (name, size_kb)),
        visio_removed (int),
        images_compressed (list of (name, old_kb, new_kb)),
        duplicates_removed (int), comments_removed (int),
        bookmarks_removed (int), garbage_removed (list),
        warnings (list of str)

    Raises:
        RuntimeError: if the repacked archive fails validation.
    """
    result = {
        'original_size_mb': 0,
        'new_size_mb': 0,
        'reduction_mb': 0,
        'reduction_percent': 0,
        'output_path': dst_path,
        'visio_converted': [],
        'visio_removed': 0,
        'images_compressed': [],
        'duplicates_removed': 0,
        'comments_removed': 0,
        'bookmarks_removed': 0,
        'garbage_removed': [],
        'warnings': [],
    }
    warnings = result['warnings']

    # Measure the input before doing any work: when src_path == dst_path the
    # source is overwritten by the final os.replace(), and measuring it
    # afterwards would always report a reduction of zero.
    orig_size = os.path.getsize(src_path)

    with tempfile.TemporaryDirectory() as work:
        unpack_dir = os.path.join(work, 'unpacked')
        tmp_dir = os.path.join(work, 'tmp')
        os.makedirs(tmp_dir)

        # Unpack
        with zipfile.ZipFile(src_path) as zf:
            zf.extractall(unpack_dir)

        media_dir = os.path.join(unpack_dir, 'word', 'media')
        embed_dir = os.path.join(unpack_dir, 'word', 'embeddings')

        # --- 1. Convert .vsdx images via Visio, then remove .vsdx embeddings ---
        rels_path = os.path.join(unpack_dir, 'word', '_rels', 'document.xml.rels')
        if os.path.exists(rels_path):
            with open(rels_path, 'r', encoding='utf-8') as f:
                rels_xml = f.read()
        else:
            rels_xml = ''

        doc_path = os.path.join(unpack_dir, 'word', 'document.xml')
        with open(doc_path, 'r', encoding='utf-8') as f:
            doc = f.read()

        # Build rId -> Target mapping from rels
        rid_to_target = {}
        for m in re.finditer(r'<Relationship\b[^>]*\bId="(rId\d+)"[^>]*\bTarget="([^"]*)"', rels_xml):
            rid_to_target[m.group(1)] = m.group(2)

        # Find each <w:object> and extract OLE rId (-> .vsdx) and image rId (-> .emf)
        emf_to_vsdx = {}  # {emf_filename: vsdx_full_path}
        for obj_m in re.finditer(r'<w:object\b[^>]*>.*?</w:object>', doc, flags=re.DOTALL):
            obj_xml = obj_m.group(0)
            ole_match = re.search(r'<o:OLEObject\b[^>]*r:id="(rId\d+)"', obj_xml)
            img_match = re.search(r'<v:imagedata\s[^>]*r:id="(rId\d+)"', obj_xml)
            if not ole_match or not img_match:
                continue
            ole_target = rid_to_target.get(ole_match.group(1), '')
            img_target = rid_to_target.get(img_match.group(1), '')
            if ole_target.endswith('.vsdx') and img_target:
                emf_name = os.path.basename(img_target)
                vsdx_path = os.path.join(unpack_dir, 'word', ole_target.replace('/', os.sep))
                emf_to_vsdx[emf_name] = vsdx_path

        # Convert via Visio COM (batch — opens Visio once for all files)
        vsdx_paths = [p for p in emf_to_vsdx.values() if os.path.exists(p)]
        conversions = convert_vsdx_via_visio(vsdx_paths, tmp_dir, warnings,
                                             fmt=fmt, dpi=dpi, quality=quality,
                                             max_width=max_width,
                                             keep_pdfs=interactive)

        # Place converted images and update refs
        converted = []  # list of emf_basename_no_ext
        for emf_name, vsdx_path in emf_to_vsdx.items():
            img_path = conversions.get(vsdx_path)
            if img_path is None:
                continue

            emf_base = os.path.splitext(emf_name)[0]
            dest = os.path.join(media_dir, f'{emf_base}.{fmt}')
            shutil.copy2(img_path, dest)

            # Remove the old EMF
            emf_path = os.path.join(media_dir, emf_name)
            if os.path.exists(emf_path):
                os.remove(emf_path)

            converted.append(emf_base)
            size_kb = os.path.getsize(dest) // 1024
            result['visio_converted'].append((emf_base, size_kb))

        if emf_to_vsdx and not converted:
            warnings.append(f'Kept {len(emf_to_vsdx)} EMF preview(s) (Visio unavailable)')

        # Delete all .vsdx files (whether conversion succeeded or not)
        vsdx_removed = 0
        if os.path.isdir(embed_dir):
            remaining = []
            for f in os.listdir(embed_dir):
                if f.endswith('.vsdx'):
                    os.remove(os.path.join(embed_dir, f))
                    vsdx_removed += 1
                else:
                    remaining.append(f)
            if not remaining:
                os.rmdir(embed_dir)
        result['visio_removed'] = vsdx_removed

        # --- 2. Convert OLE objects to DrawingML ---
        # One-element list so the nested callback can advance the counter.
        _id_counter = [next_doc_pr_id(doc)]

        def _replace_object(m):
            r = object_to_drawing(m.group(0), _id_counter[0])
            _id_counter[0] += 1
            return r

        doc = re.sub(
            r'<w:object\b[^>]*>.*?</w:object>',
            _replace_object,
            doc, flags=re.DOTALL
        )
        doc = ensure_namespaces(doc)

        # --- 2b. Strip comments, revisions, bookmarks from document XML (in-memory) ---
        doc = strip_comment_refs(doc)
        doc = strip_revisions(doc, warnings)
        doc, bm_removed = strip_bookmarks(doc)
        result['bookmarks_removed'] = bm_removed

        with open(doc_path, 'w', encoding='utf-8') as f:
            f.write(doc)

        # --- 3. Update relationships ---
        if rels_xml:
            rels = rels_xml

            # Remove Visio embedding relationships
            rels = re.sub(r'<Relationship[^>]*Target="embeddings/[^"]*\.vsdx"[^/]*/>', '', rels)

            # Update converted image refs: .emf -> new format
            for emf_base in converted:
                rels = rels.replace(f'media/{emf_base}.emf"', f'media/{emf_base}.{fmt}"')

            with open(rels_path, 'w', encoding='utf-8') as f:
                f.write(rels)

        # --- 4. Update Content_Types ---
        ct_path = os.path.join(unpack_dir, '[Content_Types].xml')
        with open(ct_path, 'r', encoding='utf-8') as f:
            ct = f.read()

        ct = re.sub(r'<Override[^>]*\.vsdx[^>]*/>', '', ct)
        # Remove stale Default entries for formats no longer present
        has_emf = os.path.isdir(media_dir) and any(f.endswith('.emf') for f in os.listdir(media_dir))
        if not has_emf:
            ct = re.sub(r'<Default[^>]*Extension="emf"[^>]*/>', '', ct)
        has_vsdx = os.path.isdir(embed_dir) and any(f.endswith('.vsdx') for f in os.listdir(embed_dir))
        if not has_vsdx:
            ct = re.sub(r'<Default[^>]*Extension="vsdx"[^>]*/>', '', ct)
        _CONTENT_TYPE_MAP = {'jpg': 'image/jpeg', 'png': 'image/png'}
        if converted and fmt in _CONTENT_TYPE_MAP:
            # Register the new image extension if the package lacks it.
            ext_attr = f'Extension="{fmt}"'
            if ext_attr not in ct:
                mime = _CONTENT_TYPE_MAP[fmt]
                ct = ct.replace('</Types>',
                                f'<Default Extension="{fmt}" ContentType="{mime}"/></Types>')

        with open(ct_path, 'w', encoding='utf-8') as f:
            f.write(ct)

        # --- 5. Compress/resize oversized raster images ---
        # Freshly converted Visio images are already at the target quality.
        skip_files = {f'{b}.{fmt}' for b in converted}
        compressed = compress_media_images(media_dir, max_width=max_width,
                                           quality=quality, skip_files=skip_files)
        result['images_compressed'] = compressed

        # --- 6. Deduplicate identical media files ---
        result['duplicates_removed'] = dedup_media(media_dir, unpack_dir)

        # --- 7. Remove personal info and sensitive data ---
        sanitize_core_props(unpack_dir)
        sanitize_app_props(unpack_dir)

        # --- 8. Remove comment parts ---
        result['comments_removed'] = remove_comment_files(unpack_dir)

        # --- 9. Remove garbage parts ---
        result['garbage_removed'] = remove_garbage_parts(unpack_dir)

        # --- 10. Clean up references to deleted parts ---
        clean_content_types(unpack_dir)
        clean_relationships(unpack_dir)

        # --- 11. Interactive: show top 5 largest images, offer re-conversion ---
        if interactive and emf_to_vsdx:
            _interactive_reconvert(media_dir, emf_to_vsdx, conversions,
                                   tmp_dir, fmt, dpi, warnings, max_width=max_width)

        # --- 12. Repack (write to temp file first for atomicity) ---
        tmp_output = dst_path + '.tmp'
        try:
            with zipfile.ZipFile(tmp_output, 'w', zipfile.ZIP_DEFLATED) as zout:
                for root, _dirs, files in os.walk(unpack_dir):
                    for f in sorted(files):
                        full = os.path.join(root, f)
                        arc = os.path.relpath(full, unpack_dir)
                        zout.write(full, arc)

            # Validate output before finalizing
            with zipfile.ZipFile(tmp_output, 'r') as zcheck:
                bad = zcheck.testzip()
                if bad:
                    raise RuntimeError(f'Output ZIP is corrupt: {bad}')
                if '[Content_Types].xml' not in zcheck.namelist():
                    raise RuntimeError('Output ZIP missing [Content_Types].xml')

            os.replace(tmp_output, dst_path)
        except Exception:
            # Never leave a half-written archive next to the destination.
            if os.path.exists(tmp_output):
                os.remove(tmp_output)
            raise

    final_size = os.path.getsize(dst_path)
    reduction = orig_size - final_size
    result['original_size_mb'] = round(orig_size / 1024 / 1024, 2)
    result['new_size_mb'] = round(final_size / 1024 / 1024, 2)
    result['reduction_mb'] = round(reduction / 1024 / 1024, 2)
    result['reduction_percent'] = round(reduction / orig_size * 100, 1) if orig_size else 0

    return result
|