diffpdf 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffpdf/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from importlib.metadata import version
2
+
3
+ from .cli import cli
4
+
5
+ # Package version, looked up from the installed "diffpdf" distribution's metadata.
+ __version__ = version("diffpdf")
6
+
7
+
8
def main(args=None):  # pragma: no cover
    """Entry point that dispatches to the click command ``cli``.

    Args:
        args: Optional argument list. When omitted, ``cli`` is invoked with
            no arguments. When given, it is forwarded to ``cli`` together
            with ``standalone_mode=False``.
    """
    if args is not None:
        cli(args, standalone_mode=False)
        return

    cli()
13
+
14
+
15
+ # Names exported by ``from diffpdf import *``.
+ __all__ = ["main", "__version__"]
diffpdf/cli.py ADDED
@@ -0,0 +1,71 @@
1
+ import logging
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import colorlog
7
+
8
+ from .comparators import compare_pdfs
9
+
10
+
11
def setup_logging(debug, save_log):  # pragma: no cover
    """Configure the root logger for a CLI run and return it.

    Args:
        debug: When true the root logger level is DEBUG, otherwise INFO.
        save_log: When true, records are additionally appended to ``log.txt``
            (relative to the current working directory) without colour codes.

    Returns:
        The root logger, with a coloured console handler attached.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    # The root logger outlives a single call. Remove the handlers a previous
    # call installed so that running the CLI more than once in the same
    # process does not emit every message multiple times. Handlers installed
    # by anyone else are left untouched.
    for stale in [h for h in logger.handlers if getattr(h, "_diffpdf_handler", False)]:
        logger.removeHandler(stale)
        stale.close()

    date_format = "%Y-%m-%d %H:%M:%S"

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        colorlog.ColoredFormatter(
            "%(log_color)s%(asctime)s %(levelname)-8s%(reset)s %(message)s",
            datefmt=date_format,
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )
    )
    console_handler._diffpdf_handler = True  # marker used by the cleanup above
    logger.addHandler(console_handler)

    if save_log:
        # Explicit UTF-8: logged diff lines contain text extracted from PDFs,
        # which the platform's default encoding may be unable to represent.
        file_handler = logging.FileHandler("log.txt", encoding="utf-8")
        file_handler.setFormatter(
            logging.Formatter(
                "%(asctime)s %(levelname)-8s %(message)s",
                datefmt=date_format,
            )
        )
        file_handler._diffpdf_handler = True
        logger.addHandler(file_handler)

    return logger
43
+
44
+
45
@click.command()
@click.argument(
    "reference", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
@click.argument("actual", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--threshold",
    # Enforce the range the help text documents instead of passing arbitrary
    # floats through to the image comparison.
    type=click.FloatRange(0.0, 1.0),
    default=0.1,
    help="Pixelmatch threshold (0.0-1.0)",
)
@click.option(
    "--dpi",
    # A render resolution must be a positive integer; reject 0 and negatives
    # at the command line rather than failing later during rendering.
    type=click.IntRange(min=1),
    default=96,
    help="Render resolution",
)
@click.option(
    "--output-dir",
    type=click.Path(file_okay=False, path_type=Path),
    default="./",
    help="Diff image output directory",
)
@click.option("--debug", is_flag=True, help="Verbose logging")
@click.option("--save-log", is_flag=True, help="Write log output to log.txt")
@click.version_option(package_name="diffpdf")
def cli(reference, actual, threshold, dpi, output_dir, debug, save_log):
    """Compare two PDF files for structural, textual, and visual differences."""
    # NOTE: the docstring above doubles as the ``--help`` text shown by click.
    logger = setup_logging(debug, save_log)

    try:
        compare_pdfs(reference, actual, threshold, dpi, output_dir, logger)
    # ``sys.exit`` raises SystemExit, which is not an ``Exception`` subclass,
    # so exits requested inside ``compare_pdfs`` pass through this handler.
    # Anything caught here is an unexpected failure and maps to exit status 2.
    except Exception as e:  # pragma: no cover
        logger.critical(f"Error: {e}")
        sys.exit(2)
diffpdf/comparators.py ADDED
@@ -0,0 +1,22 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ from .hash_check import check_hash
5
+ from .page_check import check_page_counts
6
+ from .text_check import check_text_content
7
+ from .visual_check import check_visual_content
8
+
9
+
10
def compare_pdfs(
    ref: Path,
    actual: Path,
    threshold: float,
    dpi: int,
    output_dir: Path,
    logger,
) -> None:
    """Run the comparison stages in order, then exit with status 0.

    The hash, page-count and text stages take ``(ref, actual, logger)``;
    the visual stage additionally receives ``threshold``, ``dpi`` and
    ``output_dir``. If every stage returns, success is logged and the
    process exits via ``sys.exit(0)``.
    """
    for stage in (check_hash, check_page_counts, check_text_content):
        stage(ref, actual, logger)

    check_visual_content(ref, actual, threshold, dpi, output_dir, logger)

    logger.info("PDFs are equivalent")
    sys.exit(0)
diffpdf/hash_check.py ADDED
@@ -0,0 +1,24 @@
1
+ import hashlib
2
+ import sys
3
+ from pathlib import Path
4
+
5
+
6
def compute_file_hash(filepath: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *filepath*.

    The file is opened in binary mode and consumed in 8192-byte reads, so
    its contents are never held in memory all at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # An empty read signals end of file and ends the loop.
        while piece := stream.read(8192):
            digest.update(piece)
    return digest.hexdigest()
12
+
13
+
14
def check_hash(ref: Path, actual: Path, logger) -> None:
    """Stage 1 of 4: compare the two files by their hash.

    When the hashes are equal the process exits with status 0. Otherwise the
    difference is logged and the function returns so the caller can continue.
    """
    logger.info("[1/4] Checking file hashes...")

    digests_match = compute_file_hash(ref) == compute_file_hash(actual)
    if not digests_match:
        logger.info("Hashes differ, continuing checks")
        return

    logger.info("Files are identical (hash match)")
    sys.exit(0)
diffpdf/page_check.py ADDED
@@ -0,0 +1,24 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import fitz
5
+
6
+
7
def get_page_count(pdf_path: Path) -> int:
    """Return the number of pages in the PDF at *pdf_path*."""
    # The context manager closes the document on every exit path, including
    # when an exception is raised while it is open.
    with fitz.open(pdf_path) as doc:
        return len(doc)
12
+
13
+
14
def check_page_counts(ref: Path, actual: Path, logger) -> None:
    """Stage 2 of 4: require both PDFs to have the same number of pages.

    Logs an error and exits with status 1 when the counts differ; otherwise
    logs the shared count and returns.
    """
    logger.info("[2/4] Checking page counts...")

    expected = get_page_count(ref)
    found = get_page_count(actual)

    if expected == found:
        logger.info(f"Page counts match ({expected} pages)")
        return

    logger.error(f"Page count mismatch: expected {expected}, got {found}")
    sys.exit(1)
diffpdf/text_check.py ADDED
@@ -0,0 +1,45 @@
1
+ import difflib
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import fitz
6
+
7
+
8
def extract_text(pdf_path: Path) -> str:
    """Return the text of every page of the PDF, concatenated in page order.

    Leading and trailing whitespace of the combined text is stripped.
    """
    # The context manager closes the document even if extraction fails on
    # some page; joining a list avoids repeated string concatenation.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "".join(page_texts).strip()
15
+
16
+
17
def generate_diff(ref_text: str, actual_text: str) -> str:
    """Return a unified diff of the two texts with one diff entry per line.

    Args:
        ref_text: Text of the reference document.
        actual_text: Text of the document under test.

    Returns:
        The diff as a single string whose entries (file headers, hunk
        headers, context/removed/added lines) are separated by ``"\\n"``,
        with no trailing newline. Empty when the texts are identical.
    """
    # Keep the line terminators while comparing so that lines differing only
    # in their terminator are still reported as changed.
    ref_lines = ref_text.splitlines(keepends=True)
    actual_lines = actual_text.splitlines(keepends=True)

    diff = difflib.unified_diff(
        ref_lines,
        actual_lines,
        fromfile="reference.pdf",
        tofile="actual.pdf",
        lineterm="",
    )

    # With lineterm="" the header entries carry no terminator, and the last
    # line of each input usually has none either, so plain concatenation
    # would fuse neighbouring entries onto one line. Strip whatever
    # terminator an entry has and join with exactly one newline instead.
    # The set below lists the boundaries recognised by str.splitlines().
    line_endings = "\n\r\x0b\x0c\x1c\x1d\x1e\x85\u2028\u2029"
    return "\n".join(entry.rstrip(line_endings) for entry in diff)
30
+
31
+
32
def check_text_content(ref: Path, actual: Path, logger) -> None:
    """Stage 3 of 4: require both PDFs to contain the same extracted text.

    On a mismatch the diff is logged line by line at error level and the
    process exits with status 1; otherwise the match is logged and the
    function returns.
    """
    logger.info("[3/4] Checking text content...")

    expected_text = extract_text(ref)
    found_text = extract_text(actual)

    if expected_text == found_text:
        logger.info("Text content matches")
        return

    report = generate_diff(expected_text, found_text)
    logger.error("Text content mismatch")
    for entry in report.splitlines():
        logger.error(entry)
    sys.exit(1)
@@ -0,0 +1,63 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import fitz
5
+ from PIL import Image
6
+ from pixelmatch.contrib.PIL import pixelmatch
7
+
8
+
9
def render_page_to_image(pdf_path: Path, page_num: int, dpi: int) -> Image.Image:
    """Render one page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Index of the page, as used to subscript the document.
        dpi: Resolution passed to the pixmap renderer.

    Returns:
        A PIL image built from the rendered pixmap's samples.
    """
    # The context manager closes the document on every exit path, including
    # a failed page lookup or a failed render.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_num].get_pixmap(dpi=dpi)
        # The image is built while the document is still open, matching the
        # original order of operations; the size is passed as a tuple.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
16
+
17
+
18
def compare_images(
    ref_img: Image.Image, actual_img: Image.Image, threshold: float, output_path: Path
) -> bool:
    """Compare two rendered pages pixel by pixel.

    Args:
        ref_img: Rendered reference page.
        actual_img: Rendered page under test.
        threshold: Threshold forwarded to pixelmatch.
        output_path: Where the diff image is written when pixels differ.

    Returns:
        True when no mismatching pixels are found. False when the images
        differ in size or in at least one pixel; a diff image is saved to
        *output_path* only in the latter case.
    """
    # Pages of different dimensions cannot be compared pixel by pixel, and a
    # size difference is itself a visual difference, so report a mismatch
    # rather than letting the comparison fail or run on misaligned data.
    # NOTE(review): no diff image is produced for this case.
    if ref_img.size != actual_img.size:
        return False

    diff_img = Image.new("RGB", ref_img.size)
    mismatch_count = pixelmatch(ref_img, actual_img, diff_img, threshold=threshold)

    if mismatch_count > 0:
        diff_img.save(output_path)
        return False

    return True
29
+
30
+
31
def check_visual_content(
    ref: Path, actual: Path, threshold: float, dpi: int, output_dir: Path, logger
) -> None:
    """Stage 4 of 4: render every page of both PDFs and compare the images.

    Args:
        ref: Reference PDF; its page count determines how many pages are
            compared.
        actual: PDF under test.
        threshold: Threshold forwarded to the image comparison.
        dpi: Resolution used to render each page.
        output_dir: Directory for diff images; created if missing.
        logger: Logger used for progress and results.

    Logs the 1-based numbers of all failing pages and exits with status 1
    when any page differs; otherwise logs success and returns.
    """
    logger.info("[4/4] Checking visual content...")

    output_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the document even if reading it raises.
    with fitz.open(ref) as ref_doc:
        page_count = len(ref_doc)

    # The file-name prefix is the same for every page, so build it once.
    name_prefix = f"{ref.stem}_vs_{actual.stem}"

    failing_pages = []

    for page_num in range(page_count):
        ref_img = render_page_to_image(ref, page_num, dpi)
        actual_img = render_page_to_image(actual, page_num, dpi)

        # Page numbers in file names and log output are 1-based.
        output_path = output_dir / f"{name_prefix}_page{page_num + 1}_diff.png"

        if not compare_images(ref_img, actual_img, threshold, output_path):
            failing_pages.append(page_num + 1)

    if failing_pages:
        logger.error(f"Visual mismatch on pages: {', '.join(map(str, failing_pages))}")
        sys.exit(1)

    logger.info("Visual content matches")
@@ -0,0 +1,82 @@
1
+ Metadata-Version: 2.4
2
+ Name: diffpdf
3
+ Version: 0.1.2
4
+ Summary: A tool for comparing PDF files
5
+ Project-URL: Homepage, https://github.com/JustusRijke/DiffPDF
6
+ Project-URL: Issues, https://github.com/JustusRijke/DiffPDF/issues
7
+ Author-email: Justus Rijke <justusrijke@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Operating System :: Microsoft :: Windows
12
+ Classifier: Operating System :: POSIX :: Linux
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: click
16
+ Requires-Dist: colorlog
17
+ Requires-Dist: pillow>=10.0.0
18
+ Requires-Dist: pixelmatch>=0.3.0
19
+ Requires-Dist: pymupdf>=1.23.0
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest; extra == 'dev'
22
+ Requires-Dist: ruff; extra == 'dev'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # DiffPDF
26
+
27
+ [![CI](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml/badge.svg)](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml)
28
+
29
+ CLI tool for detecting structural, textual, and visual differences between PDF files, for use in automated regression tests.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install diffpdf
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ ```bash
40
+ diffpdf <reference.pdf> <actual.pdf> [OPTIONS]
41
+ ```
42
+
43
+ ## How It Works
44
+
45
+ DiffPDF uses a fail-fast sequential pipeline to compare PDFs:
46
+
47
+ 1. **Hash Check** - SHA-256 comparison. If identical, exit immediately with pass.
48
+ 2. **Page Count** - Verify both PDFs have the same number of pages.
49
+ 3. **Text Content** - Extract and compare text from all pages.
50
+ 4. **Visual Check** - Render pages to images and compare using pixelmatch.
51
+
52
+ Each stage only runs if all previous stages pass.
53
+
54
+ **⚠️ Performance Warning:** The Python port of pixelmatch is extremely slow.
55
+
56
+ ## Options
57
+
58
+ | Option | Default | Description |
59
+ |--------|---------|-------------|
60
+ | `--threshold` | 0.1 | Pixelmatch threshold (0.0-1.0) |
61
+ | `--dpi` | 96 | Render resolution |
62
+ | `--output-dir` | ./ | Directory for diff images |
63
+ | `--debug` | - | Verbose logging |
64
+ | `--save-log` | - | Write log to log.txt |
65
+
66
+ ## Exit Codes
67
+
68
+ - `0` — Pass (PDFs are equivalent)
69
+ - `1` — Fail (differences detected)
70
+ - `2` — Error (invalid input or processing error)
71
+
72
+ ## Development
73
+
74
+ ```bash
75
+ pip install -e .[dev]
76
+ pytest tests/ -v
77
+ ruff check .
78
+ ```
79
+
80
+ ## Acknowledgements
81
+
82
+ Built with [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF parsing and [pixelmatch-py](https://github.com/whtsky/pixelmatch-py) (Python port of [pixelmatch](https://github.com/mapbox/pixelmatch)) for visual comparison.
@@ -0,0 +1,12 @@
1
+ diffpdf/__init__.py,sha256=kggdyYVRCNA8X7vPwLIDQ6uQN2FoNQY2zqUxnDh_dTg,260
2
+ diffpdf/cli.py,sha256=p1eW8b3ioI9JxZqLnU1oQeDlubeoUbWTd7nR4raQKJY,2204
3
+ diffpdf/comparators.py,sha256=zH-TxYvddyvyjRWRtLjIcDhVxe9PZPTNJyAzPCWu9B8,573
4
+ diffpdf/hash_check.py,sha256=SkNTJddjaXCUWFs3wSlDjVIzBKTS2Buvd-vPBZLAvFI,635
5
+ diffpdf/page_check.py,sha256=alcl8KsPFDgVtKpbEJCcwGZqUlQSjzQcv2-7erxq6HM,573
6
+ diffpdf/text_check.py,sha256=WxEH75a-ieFtelL1ieFVmsriz6eElLkZPPlnnW1lGUU,1056
7
+ diffpdf/visual_check.py,sha256=I8-id14MmyRNmcJC3v9Z7R6-JLEnxlCJ4aLYWWWKNZM,1740
8
+ diffpdf-0.1.2.dist-info/METADATA,sha256=TVxYtQHYNj9wXZEduIPY-tjHxmUZ7j-MahJsNESHIic,2489
9
+ diffpdf-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ diffpdf-0.1.2.dist-info/entry_points.txt,sha256=MYzgwCJSVq_fvPI-R5ApSCFDD4W6nqgwQ8kwXcyndcw,41
11
+ diffpdf-0.1.2.dist-info/licenses/LICENSE,sha256=dMsm7NlDabaS_o0khNX5QyX6pc3fzI6SK56YfZxdzuE,1069
12
+ diffpdf-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ diffpdf = diffpdf:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Justus Rijke
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.