diffpdf 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffpdf/__init__.py +15 -0
- diffpdf/cli.py +71 -0
- diffpdf/comparators.py +22 -0
- diffpdf/hash_check.py +24 -0
- diffpdf/page_check.py +24 -0
- diffpdf/text_check.py +45 -0
- diffpdf/visual_check.py +63 -0
- diffpdf-0.1.2.dist-info/METADATA +82 -0
- diffpdf-0.1.2.dist-info/RECORD +12 -0
- diffpdf-0.1.2.dist-info/WHEEL +4 -0
- diffpdf-0.1.2.dist-info/entry_points.txt +2 -0
- diffpdf-0.1.2.dist-info/licenses/LICENSE +21 -0
diffpdf/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from importlib.metadata import version
|
|
2
|
+
|
|
3
|
+
from .cli import cli
|
|
4
|
+
|
|
5
|
+
__version__ = version("diffpdf")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main(args=None):  # pragma: no cover
    """Entry point that dispatches to the ``cli`` click command.

    Args:
        args: Optional argument list. When omitted, ``cli`` is invoked with
            no arguments; otherwise ``args`` is forwarded to ``cli`` together
            with ``standalone_mode=False``.
    """
    if args is not None:
        # Explicit argument list: forward it with standalone mode disabled.
        cli(args, standalone_mode=False)
        return

    cli()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
__all__ = ["main", "__version__"]
|
diffpdf/cli.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
import colorlog
|
|
7
|
+
|
|
8
|
+
from .comparators import compare_pdfs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def setup_logging(debug, save_log):  # pragma: no cover
    """Configure the root logger and return it.

    Args:
        debug: When truthy the root logger level is DEBUG, otherwise INFO.
        save_log: When truthy an additional, uncoloured file handler that
            writes to ``log.txt`` is attached.

    Returns:
        The root logger, with a coloured stream handler attached.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        colorlog.ColoredFormatter(
            "%(log_color)s%(asctime)s %(levelname)-8s%(reset)s %(message)s",
            datefmt=timestamp_format,
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )
    )

    root = logging.getLogger()
    root.setLevel(logging.DEBUG if debug else logging.INFO)
    root.addHandler(stream_handler)

    if not save_log:
        return root

    # Same layout as the console output, minus the colour escape fields.
    plain_formatter = logging.Formatter(
        "%(asctime)s %(levelname)-8s %(message)s",
        datefmt=timestamp_format,
    )
    log_file_handler = logging.FileHandler("log.txt")
    log_file_handler.setFormatter(plain_formatter)
    root.addHandler(log_file_handler)

    return root
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@click.command()
@click.argument(
    "reference",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.argument(
    "actual",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option(
    "--threshold",
    type=float,
    default=0.1,
    help="Pixelmatch threshold (0.0-1.0)",
)
@click.option("--dpi", type=int, default=96, help="Render resolution")
@click.option(
    "--output-dir",
    type=click.Path(file_okay=False, path_type=Path),
    default="./",
    help="Diff image output directory",
)
@click.option("--debug", is_flag=True, help="Verbose logging")
@click.option("--save-log", is_flag=True, help="Write log output to log.txt")
@click.version_option(package_name="diffpdf")
def cli(reference, actual, threshold, dpi, output_dir, debug, save_log):
    """Compare two PDF files for structural, textual, and visual differences."""
    log = setup_logging(debug, save_log)

    try:
        # compare_pdfs reports its verdict through sys.exit(); SystemExit is
        # not an Exception subclass, so it passes through the handler below.
        compare_pdfs(
            ref=reference,
            actual=actual,
            threshold=threshold,
            dpi=dpi,
            output_dir=output_dir,
            logger=log,
        )
    except Exception as exc:  # pragma: no cover
        # Anything unexpected is a processing error: log it and exit with 2.
        log.critical(f"Error: {exc}")
        sys.exit(2)
|
diffpdf/comparators.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from .hash_check import check_hash
|
|
5
|
+
from .page_check import check_page_counts
|
|
6
|
+
from .text_check import check_text_content
|
|
7
|
+
from .visual_check import check_visual_content
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compare_pdfs(
    ref: Path, actual: Path, threshold: float, dpi: int, output_dir: Path, logger
) -> None:
    """Run the comparison stages in order and exit 0 when all of them return.

    The hash, page-count and text checks are called with ``(ref, actual,
    logger)``; the visual check additionally receives ``threshold``, ``dpi``
    and ``output_dir``. If every stage returns, "PDFs are equivalent" is
    logged and the process exits with status 0.

    Args:
        ref: Path of the reference PDF.
        actual: Path of the PDF under test.
        threshold: Forwarded to the visual check.
        dpi: Forwarded to the visual check.
        output_dir: Forwarded to the visual check.
        logger: Logger used for progress and result messages.
    """
    # The first three stages share one call signature; order matters.
    for stage in (check_hash, check_page_counts, check_text_content):
        stage(ref, actual, logger)

    check_visual_content(ref, actual, threshold, dpi, output_dir, logger)

    logger.info("PDFs are equivalent")
    sys.exit(0)
|
diffpdf/hash_check.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def compute_file_hash(filepath: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``filepath``.

    The file is read in 8192-byte chunks so its whole content never has to
    be held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def check_hash(ref: Path, actual: Path, logger) -> None:
    """Stage 1: compare the SHA-256 hashes of both files.

    When the hashes are equal the process exits with status 0 right away.
    Otherwise the function logs that the hashes differ and returns.

    Args:
        ref: Path of the reference PDF.
        actual: Path of the PDF under test.
        logger: Logger used for progress messages.
    """
    logger.info("[1/4] Checking file hashes...")

    if compute_file_hash(ref) != compute_file_hash(actual):
        logger.info("Hashes differ, continuing checks")
        return

    logger.info("Files are identical (hash match)")
    sys.exit(0)
|
diffpdf/page_check.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import fitz
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_page_count(pdf_path: Path) -> int:
    """Return the number of pages in the PDF at ``pdf_path``.

    The document is opened as a context manager so it is closed even if
    reading the page count raises.
    """
    with fitz.open(pdf_path) as doc:
        return len(doc)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def check_page_counts(ref: Path, actual: Path, logger) -> None:
    """Stage 2: verify that both PDFs have the same number of pages.

    A mismatch is logged as an error and the process exits with status 1;
    on a match the function logs the shared page count and returns.

    Args:
        ref: Path of the reference PDF.
        actual: Path of the PDF under test.
        logger: Logger used for progress and error messages.
    """
    logger.info("[2/4] Checking page counts...")

    expected = get_page_count(ref)
    found = get_page_count(actual)

    if expected == found:
        logger.info(f"Page counts match ({expected} pages)")
        return

    logger.error(f"Page count mismatch: expected {expected}, got {found}")
    sys.exit(1)
|
diffpdf/text_check.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import difflib
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import fitz
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def extract_text(pdf_path: Path) -> str:
    """Return the text of every page of the PDF, concatenated and stripped.

    Page texts are joined in page order without a separator, and leading and
    trailing whitespace is removed from the combined result.

    The document is opened as a context manager so it is closed even if
    extraction fails part-way, and the pages are combined with ``str.join``
    instead of repeated ``+=`` concatenation.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc).strip()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_diff(ref_text: str, actual_text: str) -> str:
    """Return a unified diff of the two texts, one diff line per output line.

    Args:
        ref_text: Text extracted from the reference PDF.
        actual_text: Text extracted from the PDF under test.

    Returns:
        The unified diff as a newline-separated string (no trailing newline),
        or an empty string when ``difflib`` reports no differing lines.
    """
    # Split WITHOUT line endings and request lineterm="" so that no diff line
    # (header, hunk marker or content) carries its own terminator; the join
    # below then supplies exactly one "\n" between lines. Mixing
    # keepends=True with lineterm="" and "".join() would glue the
    # "---"/"+++"/"@@" headers onto the first content line.
    diff_lines = difflib.unified_diff(
        ref_text.splitlines(),
        actual_text.splitlines(),
        fromfile="reference.pdf",
        tofile="actual.pdf",
        lineterm="",
    )

    return "\n".join(diff_lines)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def check_text_content(ref: Path, actual: Path, logger) -> None:
    """Stage 3: compare the text extracted from both PDFs.

    When the texts differ, an error and every line of the generated diff are
    logged and the process exits with status 1. Otherwise a success message
    is logged and the function returns.

    Args:
        ref: Path of the reference PDF.
        actual: Path of the PDF under test.
        logger: Logger used for progress and error messages.
    """
    logger.info("[3/4] Checking text content...")

    expected_text = extract_text(ref)
    found_text = extract_text(actual)

    if expected_text == found_text:
        logger.info("Text content matches")
        return

    report = generate_diff(expected_text, found_text)
    logger.error("Text content mismatch")
    # One log record per diff line keeps the log prefix on every line.
    for diff_line in report.splitlines():
        logger.error(diff_line)
    sys.exit(1)
|
diffpdf/visual_check.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import fitz
|
|
5
|
+
from PIL import Image
|
|
6
|
+
from pixelmatch.contrib.PIL import pixelmatch
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def render_page_to_image(pdf_path: Path, page_num: int, dpi: int) -> Image.Image:
    """Render one page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Index of the page, as used to index the opened document.
        dpi: Resolution passed to the page's ``get_pixmap``.

    Returns:
        A PIL image in mode "RGB" built from the rendered pixmap's samples.
    """
    # Context manager: the document is closed even if the page lookup or the
    # rendering raises. The image is built from the pixmap inside the block,
    # as in the original, before the document is closed.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_num].get_pixmap(dpi=dpi)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def compare_images(
    ref_img: Image.Image, actual_img: Image.Image, threshold: float, output_path: Path
) -> bool:
    """Compare two rendered pages and report whether they match.

    Args:
        ref_img: Rendered page of the reference PDF.
        actual_img: Rendered page of the PDF under test.
        threshold: Passed to pixelmatch as ``threshold``.
        output_path: Where the diff image is saved when pixels mismatch.

    Returns:
        True when pixelmatch reports zero mismatching pixels, False
        otherwise. Images of different dimensions are reported as a mismatch
        without writing a diff image, since no pixel-wise diff can be drawn.
    """
    # A pixel-by-pixel comparison is only meaningful for equally sized
    # images; a changed page size is itself a visual difference, so report
    # it as one instead of handing mismatched buffers to pixelmatch.
    if ref_img.size != actual_img.size:
        return False

    diff_img = Image.new("RGB", ref_img.size)
    mismatch_count = pixelmatch(ref_img, actual_img, diff_img, threshold=threshold)

    if mismatch_count > 0:
        diff_img.save(output_path)
        return False

    return True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check_visual_content(
    ref: Path, actual: Path, threshold: float, dpi: int, output_dir: Path, logger
) -> None:
    """Stage 4: render every page of both PDFs and compare them visually.

    Pages are rendered one at a time and compared with ``compare_images``.
    The 1-based numbers of all failing pages are collected; if there are any,
    they are logged as an error and the process exits with status 1.

    Args:
        ref: Path of the reference PDF; its page count drives the loop.
        actual: Path of the PDF under test.
        threshold: Forwarded to ``compare_images``.
        dpi: Resolution used to render both PDFs.
        output_dir: Directory for diff images; created if it does not exist.
        logger: Logger used for progress and error messages.
    """
    logger.info("[4/4] Checking visual content...")

    output_dir.mkdir(parents=True, exist_ok=True)

    # Context manager: the document is closed even if len() raises.
    with fitz.open(ref) as ref_doc:
        page_count = len(ref_doc)

    # The file-name prefix is identical for every page, so build it once
    # instead of on every loop iteration.
    name_prefix = f"{ref.stem}_vs_{actual.stem}"

    failing_pages = []

    for page_num in range(page_count):
        page_label = page_num + 1  # user-facing page numbers are 1-based

        ref_img = render_page_to_image(ref, page_num, dpi)
        actual_img = render_page_to_image(actual, page_num, dpi)

        output_path = output_dir / f"{name_prefix}_page{page_label}_diff.png"

        if not compare_images(ref_img, actual_img, threshold, output_path):
            failing_pages.append(page_label)

    if failing_pages:
        logger.error(f"Visual mismatch on pages: {', '.join(map(str, failing_pages))}")
        sys.exit(1)

    logger.info("Visual content matches")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: diffpdf
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A tool for comparing PDF files
|
|
5
|
+
Project-URL: Homepage, https://github.com/JustusRijke/DiffPDF
|
|
6
|
+
Project-URL: Issues, https://github.com/JustusRijke/DiffPDF/issues
|
|
7
|
+
Author-email: Justus Rijke <justusrijke@gmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
12
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: click
|
|
16
|
+
Requires-Dist: colorlog
|
|
17
|
+
Requires-Dist: pillow>=10.0.0
|
|
18
|
+
Requires-Dist: pixelmatch>=0.3.0
|
|
19
|
+
Requires-Dist: pymupdf>=1.23.0
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
22
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# DiffPDF
|
|
26
|
+
|
|
27
|
+
[![CI](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml/badge.svg)](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml)
|
|
28
|
+
|
|
29
|
+
CLI tool for detecting structural, textual, and visual differences between PDF files, for use in automatic regression tests.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install diffpdf
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
diffpdf <baseline.pdf> <actual.pdf> [OPTIONS]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## How It Works
|
|
44
|
+
|
|
45
|
+
DiffPDF uses a fail-fast sequential pipeline to compare PDFs:
|
|
46
|
+
|
|
47
|
+
1. **Hash Check** - SHA-256 comparison. If identical, exit immediately with pass.
|
|
48
|
+
2. **Page Count** - Verify both PDFs have the same number of pages.
|
|
49
|
+
3. **Text Content** - Extract and compare text from all pages.
|
|
50
|
+
4. **Visual Check** - Render pages to images and compare using pixelmatch.
|
|
51
|
+
|
|
52
|
+
If the hashes differ, the remaining stages run in order, and each one only runs if the stages before it found no differences.
|
|
53
|
+
|
|
54
|
+
**⚠️ Performance Warning:** The Python port of pixelmatch is extremely slow.
|
|
55
|
+
|
|
56
|
+
## Options
|
|
57
|
+
|
|
58
|
+
| Option | Default | Description |
|
|
59
|
+
|--------|---------|-------------|
|
|
60
|
+
| `--threshold` | 0.1 | Pixelmatch threshold (0.0-1.0) |
|
|
61
|
+
| `--dpi` | 96 | Render resolution |
|
|
62
|
+
| `--output-dir` | ./ | Directory for diff images |
|
|
63
|
+
| `--debug` | - | Verbose logging |
|
|
64
|
+
| `--save-log` | - | Write log to log.txt |
|
|
65
|
+
|
|
66
|
+
## Exit Codes
|
|
67
|
+
|
|
68
|
+
- `0` — Pass (PDFs are equivalent)
|
|
69
|
+
- `1` — Fail (differences detected)
|
|
70
|
+
- `2` — Error (invalid input or processing error)
|
|
71
|
+
|
|
72
|
+
## Development
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install -e .[dev]
|
|
76
|
+
pytest tests/ -v
|
|
77
|
+
ruff check .
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Acknowledgements
|
|
81
|
+
|
|
82
|
+
Built with [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF parsing and [pixelmatch-py](https://github.com/whtsky/pixelmatch-py) (Python port of [pixelmatch](https://github.com/mapbox/pixelmatch)) for visual comparison.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
diffpdf/__init__.py,sha256=kggdyYVRCNA8X7vPwLIDQ6uQN2FoNQY2zqUxnDh_dTg,260
|
|
2
|
+
diffpdf/cli.py,sha256=p1eW8b3ioI9JxZqLnU1oQeDlubeoUbWTd7nR4raQKJY,2204
|
|
3
|
+
diffpdf/comparators.py,sha256=zH-TxYvddyvyjRWRtLjIcDhVxe9PZPTNJyAzPCWu9B8,573
|
|
4
|
+
diffpdf/hash_check.py,sha256=SkNTJddjaXCUWFs3wSlDjVIzBKTS2Buvd-vPBZLAvFI,635
|
|
5
|
+
diffpdf/page_check.py,sha256=alcl8KsPFDgVtKpbEJCcwGZqUlQSjzQcv2-7erxq6HM,573
|
|
6
|
+
diffpdf/text_check.py,sha256=WxEH75a-ieFtelL1ieFVmsriz6eElLkZPPlnnW1lGUU,1056
|
|
7
|
+
diffpdf/visual_check.py,sha256=I8-id14MmyRNmcJC3v9Z7R6-JLEnxlCJ4aLYWWWKNZM,1740
|
|
8
|
+
diffpdf-0.1.2.dist-info/METADATA,sha256=TVxYtQHYNj9wXZEduIPY-tjHxmUZ7j-MahJsNESHIic,2489
|
|
9
|
+
diffpdf-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
diffpdf-0.1.2.dist-info/entry_points.txt,sha256=MYzgwCJSVq_fvPI-R5ApSCFDD4W6nqgwQ8kwXcyndcw,41
|
|
11
|
+
diffpdf-0.1.2.dist-info/licenses/LICENSE,sha256=dMsm7NlDabaS_o0khNX5QyX6pc3fzI6SK56YfZxdzuE,1069
|
|
12
|
+
diffpdf-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Justus Rijke
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|