diffpdf 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. diffpdf-0.1.2/.github/dependabot.yml +11 -0
  2. diffpdf-0.1.2/.github/workflows/ci.yml +41 -0
  3. diffpdf-0.1.2/.github/workflows/pypi-publish.yml +32 -0
  4. diffpdf-0.1.2/.gitignore +210 -0
  5. diffpdf-0.1.2/.vscode/settings.json +7 -0
  6. diffpdf-0.1.2/LICENSE +21 -0
  7. diffpdf-0.1.2/MANIFEST.in +2 -0
  8. diffpdf-0.1.2/PKG-INFO +82 -0
  9. diffpdf-0.1.2/README.md +58 -0
  10. diffpdf-0.1.2/hooks/pre-commit +59 -0
  11. diffpdf-0.1.2/pyproject.toml +45 -0
  12. diffpdf-0.1.2/ruff.toml +2 -0
  13. diffpdf-0.1.2/src/diffpdf/__init__.py +15 -0
  14. diffpdf-0.1.2/src/diffpdf/cli.py +71 -0
  15. diffpdf-0.1.2/src/diffpdf/comparators.py +22 -0
  16. diffpdf-0.1.2/src/diffpdf/hash_check.py +24 -0
  17. diffpdf-0.1.2/src/diffpdf/page_check.py +24 -0
  18. diffpdf-0.1.2/src/diffpdf/text_check.py +45 -0
  19. diffpdf-0.1.2/src/diffpdf/visual_check.py +63 -0
  20. diffpdf-0.1.2/tests/assets/fail/1-letter-diff-A.pdf +0 -0
  21. diffpdf-0.1.2/tests/assets/fail/1-letter-diff-B.pdf +0 -0
  22. diffpdf-0.1.2/tests/assets/fail/major-color-diff-A.pdf +0 -0
  23. diffpdf-0.1.2/tests/assets/fail/major-color-diff-B.pdf +0 -0
  24. diffpdf-0.1.2/tests/assets/fail/page-count-diff-A.pdf +0 -0
  25. diffpdf-0.1.2/tests/assets/fail/page-count-diff-B.pdf +0 -0
  26. diffpdf-0.1.2/tests/assets/pass/hash-diff-A.pdf +0 -0
  27. diffpdf-0.1.2/tests/assets/pass/hash-diff-B.pdf +0 -0
  28. diffpdf-0.1.2/tests/assets/pass/identical-A.pdf +0 -0
  29. diffpdf-0.1.2/tests/assets/pass/identical-B.pdf +0 -0
  30. diffpdf-0.1.2/tests/assets/pass/minor-color-diff-A.pdf +0 -0
  31. diffpdf-0.1.2/tests/assets/pass/minor-color-diff-B.pdf +0 -0
  32. diffpdf-0.1.2/tests/assets/pass/multiplatform-diff-A.pdf +0 -0
  33. diffpdf-0.1.2/tests/assets/pass/multiplatform-diff-B.pdf +0 -0
  34. diffpdf-0.1.2/tests/test_cli.py +35 -0
@@ -0,0 +1,11 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "pip"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "weekly"
7
+
8
+ - package-ecosystem: "github-actions"
9
+ directory: "/"
10
+ schedule:
11
+ interval: "weekly"
@@ -0,0 +1,41 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ${{ matrix.os }}
10
+ strategy:
11
+ matrix:
12
+ os: [ubuntu-latest, windows-latest]
13
+
14
+ steps:
15
+ - uses: actions/checkout@v6
16
+ with:
17
+ fetch-depth: 0
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v6
21
+ with:
22
+ python-version: "3.10"
23
+
24
+ - name: Install package with dev dependencies
25
+ run: pip install -e .[dev]
26
+
27
+ - name: Run ruff
28
+ run: ruff check .
29
+
30
+ - name: Run pytest
31
+ run: pytest tests/ -v
32
+
33
+ - name: Verify version detection
34
+ run: |
35
+ VERSION=$(diffpdf --version | sed -n 's/.*version \([0-9]\+\.[0-9]\+\.[0-9]\+\).*/\1/p')
36
+ if [ "$VERSION" = "0.0.0" ]; then
37
+ echo "Error: Version is 0.0.0, setuptools-scm failed to detect version"
38
+ exit 1
39
+ fi
40
+ echo "Version detected correctly: $VERSION"
41
+ shell: bash
@@ -0,0 +1,32 @@
1
+ name: Publish Python Package
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ id-token: write
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: '3.10'
20
+
21
+ - name: Install dependencies
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install build
25
+
26
+ - name: Build package
27
+ run: python -m build
28
+
29
+ - name: Publish package to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
31
+ with:
32
+ verbose: true
@@ -0,0 +1,210 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # CLI output
65
+ log.txt
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+ #poetry.toml
113
+
114
+ # pdm
115
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
116
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
117
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
118
+ #pdm.lock
119
+ #pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # pixi
124
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
125
+ #pixi.lock
126
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
127
+ # in the .venv directory. It is recommended not to include this directory in version control.
128
+ .pixi
129
+
130
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
131
+ __pypackages__/
132
+
133
+ # Celery stuff
134
+ celerybeat-schedule
135
+ celerybeat.pid
136
+
137
+ # SageMath parsed files
138
+ *.sage.py
139
+
140
+ # Environments
141
+ .env
142
+ .envrc
143
+ .venv
144
+ env/
145
+ venv/
146
+ ENV/
147
+ env.bak/
148
+ venv.bak/
149
+
150
+ # Spyder project settings
151
+ .spyderproject
152
+ .spyproject
153
+
154
+ # Rope project settings
155
+ .ropeproject
156
+
157
+ # mkdocs documentation
158
+ /site
159
+
160
+ # mypy
161
+ .mypy_cache/
162
+ .dmypy.json
163
+ dmypy.json
164
+
165
+ # Pyre type checker
166
+ .pyre/
167
+
168
+ # pytype static type analyzer
169
+ .pytype/
170
+
171
+ # Cython debug symbols
172
+ cython_debug/
173
+
174
+ # PyCharm
175
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
176
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
177
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
178
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
179
+ #.idea/
180
+
181
+ # Abstra
182
+ # Abstra is an AI-powered process automation framework.
183
+ # Ignore directories containing user credentials, local state, and settings.
184
+ # Learn more at https://abstra.io/docs
185
+ .abstra/
186
+
187
+ # Visual Studio Code
188
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
189
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
190
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
191
+ # you could uncomment the following to ignore the entire vscode folder
192
+ # .vscode/
193
+
194
+ # Ruff stuff:
195
+ .ruff_cache/
196
+
197
+ # PyPI configuration file
198
+ .pypirc
199
+
200
+ # Cursor
201
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
202
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
203
+ # refer to https://docs.cursor.com/context/ignore-files
204
+ .cursorignore
205
+ .cursorindexingignore
206
+
207
+ # Marimo
208
+ marimo/_static/
209
+ marimo/_lsp/
210
+ __marimo__/
@@ -0,0 +1,7 @@
1
+ {
2
+ "python.testing.pytestArgs": [
3
+ "tests"
4
+ ],
5
+ "python.testing.unittestEnabled": false,
6
+ "python.testing.pytestEnabled": true
7
+ }
diffpdf-0.1.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Justus Rijke
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ include LICENSE
diffpdf-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,82 @@
1
+ Metadata-Version: 2.4
2
+ Name: diffpdf
3
+ Version: 0.1.2
4
+ Summary: A tool for comparing PDF files
5
+ Project-URL: Homepage, https://github.com/JustusRijke/DiffPDF
6
+ Project-URL: Issues, https://github.com/JustusRijke/DiffPDF/issues
7
+ Author-email: Justus Rijke <justusrijke@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Operating System :: Microsoft :: Windows
12
+ Classifier: Operating System :: POSIX :: Linux
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: click
16
+ Requires-Dist: colorlog
17
+ Requires-Dist: pillow>=10.0.0
18
+ Requires-Dist: pixelmatch>=0.3.0
19
+ Requires-Dist: pymupdf>=1.23.0
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest; extra == 'dev'
22
+ Requires-Dist: ruff; extra == 'dev'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # DiffPDF
26
+
27
+ [![CI](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml/badge.svg)](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml)
28
+
29
+ CLI tool for detecting structural, textual, and visual differences between PDF files, for use in automatic regression tests.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install diffpdf
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ ```bash
40
+ diffpdf <baseline.pdf> <actual.pdf> [OPTIONS]
41
+ ```
42
+
43
+ ## How It Works
44
+
45
+ DiffPDF uses a fail-fast sequential pipeline to compare PDFs:
46
+
47
+ 1. **Hash Check** - SHA-256 comparison. If identical, exit immediately with pass.
48
+ 2. **Page Count** - Verify both PDFs have the same number of pages.
49
+ 3. **Text Content** - Extract and compare text from all pages.
50
+ 4. **Visual Check** - Render pages to images and compare using pixelmatch.
51
+
52
+ Each stage only runs if all previous stages pass.
53
+
54
+ **⚠️ Performance Warning:** The Python port of pixelmatch is extremely slow.
55
+
56
+ ## Options
57
+
58
+ | Option | Default | Description |
59
+ |--------|---------|-------------|
60
+ | `--threshold` | 0.1 | Pixelmatch threshold (0.0-1.0) |
61
+ | `--dpi` | 96 | Render resolution |
62
+ | `--output-dir` | ./ | Directory for diff images |
63
+ | `--debug` | - | Verbose logging |
64
+ | `--save-log` | - | Write log to log.txt |
65
+
66
+ ## Exit Codes
67
+
68
+ - `0` — Pass (PDFs are equivalent)
69
+ - `1` — Fail (differences detected)
70
+ - `2` — Error (invalid input or processing error)
71
+
72
+ ## Development
73
+
74
+ ```bash
75
+ pip install -e .[dev]
76
+ pytest tests/ -v
77
+ ruff check .
78
+ ```
79
+
80
+ ## Acknowledgements
81
+
82
+ Built with [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF parsing and [pixelmatch-py](https://github.com/whtsky/pixelmatch-py) (Python port of [pixelmatch](https://github.com/mapbox/pixelmatch)) for visual comparison.
@@ -0,0 +1,58 @@
1
+ # DiffPDF
2
+
3
+ [![CI](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml/badge.svg)](https://github.com/JustusRijke/DiffPDF/actions/workflows/ci.yml)
4
+
5
+ CLI tool for detecting structural, textual, and visual differences between PDF files, for use in automatic regression tests.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install diffpdf
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```bash
16
+ diffpdf <baseline.pdf> <actual.pdf> [OPTIONS]
17
+ ```
18
+
19
+ ## How It Works
20
+
21
+ DiffPDF uses a fail-fast sequential pipeline to compare PDFs:
22
+
23
+ 1. **Hash Check** - SHA-256 comparison. If identical, exit immediately with pass.
24
+ 2. **Page Count** - Verify both PDFs have the same number of pages.
25
+ 3. **Text Content** - Extract and compare text from all pages.
26
+ 4. **Visual Check** - Render pages to images and compare using pixelmatch.
27
+
28
+ Each stage only runs if all previous stages pass.
29
+
30
+ **⚠️ Performance Warning:** The Python port of pixelmatch is extremely slow.
31
+
32
+ ## Options
33
+
34
+ | Option | Default | Description |
35
+ |--------|---------|-------------|
36
+ | `--threshold` | 0.1 | Pixelmatch threshold (0.0-1.0) |
37
+ | `--dpi` | 96 | Render resolution |
38
+ | `--output-dir` | ./ | Directory for diff images |
39
+ | `--debug` | - | Verbose logging |
40
+ | `--save-log` | - | Write log to log.txt |
41
+
42
+ ## Exit Codes
43
+
44
+ - `0` — Pass (PDFs are equivalent)
45
+ - `1` — Fail (differences detected)
46
+ - `2` — Error (invalid input or processing error)
47
+
48
+ ## Development
49
+
50
+ ```bash
51
+ pip install -e .[dev]
52
+ pytest tests/ -v
53
+ ruff check .
54
+ ```
55
+
56
+ ## Acknowledgements
57
+
58
+ Built with [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF parsing and [pixelmatch-py](https://github.com/whtsky/pixelmatch-py) (Python port of [pixelmatch](https://github.com/mapbox/pixelmatch)) for visual comparison.
@@ -0,0 +1,59 @@
1
+ #!/bin/sh
2
+
3
+ if git rev-parse --verify HEAD >/dev/null 2>&1
4
+ then
5
+ against=HEAD
6
+ else
7
+ # Initial commit: diff against an empty tree object
8
+ against=$(git hash-object -t tree /dev/null)
9
+ fi
10
+
11
+ # If you want to allow non-ASCII filenames set this variable to true.
12
+ allownonascii=$(git config --type=bool hooks.allownonascii)
13
+
14
+ # Redirect output to stderr.
15
+ exec 1>&2
16
+
17
+ # Cross platform projects tend to avoid non-ASCII filenames; prevent
18
+ # them from being added to the repository. We exploit the fact that the
19
+ # printable range starts at the space character and ends with tilde.
20
+ if [ "$allownonascii" != "true" ] &&
21
+ # Note that the use of brackets around a tr range is ok here, (it's
22
+ # even required, for portability to Solaris 10's /usr/bin/tr), since
23
+ # the square bracket bytes happen to fall in the designated range.
24
+ test $(git diff-index --cached --name-only --diff-filter=A -z $against |
25
+ LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
26
+ then
27
+ cat <<\EOF
28
+ Error: Attempt to add a non-ASCII file name.
29
+
30
+ This can cause problems if you want to work with people on other platforms.
31
+
32
+ To be portable it is advisable to rename the file.
33
+
34
+ If you know what you are doing you can disable this check using:
35
+
36
+ git config hooks.allownonascii true
37
+ EOF
38
+ exit 1
39
+ fi
40
+
41
+ # Ruff checks
42
+ ruff check
43
+ CHECK_EXIT=$?
44
+
45
+ ruff format --check
46
+ FORMAT_EXIT=$?
47
+
48
+ if [ $CHECK_EXIT -ne 0 ] || [ $FORMAT_EXIT -ne 0 ]; then
49
+ if [ $CHECK_EXIT -ne 0 ]; then
50
+ echo "Ruff found linting errors. Run: ruff check --fix"
51
+ fi
52
+ if [ $FORMAT_EXIT -ne 0 ]; then
53
+ echo "Ruff found formatting issues. Run: ruff format"
54
+ fi
55
+ exit 1
56
+ fi
57
+
58
+ # If there are whitespace errors, print the offending file names and fail.
59
+ exec git diff-index --check --cached $against --
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = [
3
+ "hatchling",
4
+ "hatch-vcs",
5
+ ]
6
+ build-backend = "hatchling.build"
7
+
8
+ [project]
9
+ name = "diffpdf"
10
+ dynamic = ["version"]
11
+ description = "A tool for comparing PDF files"
12
+ readme = "README.md"
13
+ license = "MIT"
14
+ license-files = ["LICEN[CS]E*"]
15
+ authors = [{name = "Justus Rijke", email="justusrijke@gmail.com"}]
16
+ requires-python = ">=3.10"
17
+ classifiers = [
18
+ "Programming Language :: Python :: 3",
19
+ "Development Status :: 4 - Beta",
20
+ "Operating System :: Microsoft :: Windows",
21
+ "Operating System :: POSIX :: Linux",
22
+ ]
23
+ dependencies = [
24
+ "click",
25
+ "colorlog",
26
+ "pymupdf>=1.23.0",
27
+ "pixelmatch>=0.3.0",
28
+ "Pillow>=10.0.0",
29
+ ]
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/JustusRijke/DiffPDF"
33
+ Issues = "https://github.com/JustusRijke/DiffPDF/issues"
34
+
35
+ [project.optional-dependencies]
36
+ dev = ["pytest", "ruff"]
37
+
38
+ [project.scripts]
39
+ diffpdf = "diffpdf:main"
40
+
41
+ [tool.hatch.version]
42
+ source = "vcs"
43
+
44
+ [tool.hatch.version.raw-options]
45
+ local_scheme = "no-local-version"
@@ -0,0 +1,2 @@
1
+ target-version = "py312"
2
+ lint.select = ["I"]
@@ -0,0 +1,15 @@
1
+ from importlib.metadata import version
2
+
3
+ from .cli import cli
4
+
5
+ __version__ = version("diffpdf")
6
+
7
+
8
def main(args=None):  # pragma: no cover
    """Entry point that dispatches to the ``cli`` command.

    Args:
        args: Optional argument list. When omitted, ``cli`` is invoked with
            no arguments; otherwise ``args`` is forwarded to it together with
            ``standalone_mode=False``.
    """
    if args is not None:
        cli(args, standalone_mode=False)
        return

    cli()
13
+
14
+
15
+ __all__ = ["main", "__version__"]
@@ -0,0 +1,71 @@
1
+ import logging
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import colorlog
7
+
8
+ from .comparators import compare_pdfs
9
+
10
+
11
def setup_logging(debug, save_log):  # pragma: no cover
    """Configure the root logger for coloured console output and optional file output.

    Args:
        debug: When truthy the logger level is DEBUG, otherwise INFO.
        save_log: When truthy, records are additionally written to ``log.txt``
            in the current working directory.

    Returns:
        The configured root logger.
    """
    level = logging.DEBUG if debug else logging.INFO

    logger = logging.getLogger()
    logger.setLevel(level)

    # Remove handlers installed by a previous call. Without this, calling the
    # function more than once in the same process stacks handlers on the root
    # logger, so every record is emitted several times. Only handlers tagged
    # by this function are touched; foreign handlers are left alone.
    for stale in [h for h in logger.handlers if getattr(h, "_diffpdf_handler", False)]:
        logger.removeHandler(stale)
        stale.close()

    formatter = colorlog.ColoredFormatter(
        "%(log_color)s%(asctime)s %(levelname)-8s%(reset)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        log_colors={
            "DEBUG": "cyan",
            "INFO": "green",
            "WARNING": "yellow",
            "ERROR": "red",
            "CRITICAL": "red,bg_white",
        },
    )

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler._diffpdf_handler = True
    logger.addHandler(console_handler)

    if save_log:
        file_formatter = logging.Formatter(
            "%(asctime)s %(levelname)-8s %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        # Explicit UTF-8: logged text may contain characters that the
        # platform default encoding cannot represent.
        file_handler = logging.FileHandler("log.txt", encoding="utf-8")
        file_handler.setFormatter(file_formatter)
        file_handler._diffpdf_handler = True
        logger.addHandler(file_handler)

    return logger
43
+
44
+
45
@click.command()
@click.argument(
    "reference", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
@click.argument("actual", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--threshold",
    # Enforce the range that the help text advertises.
    type=click.FloatRange(0.0, 1.0),
    default=0.1,
    help="Pixelmatch threshold (0.0-1.0)",
)
@click.option(
    "--dpi",
    # A render resolution has to be a positive integer.
    type=click.IntRange(min=1),
    default=96,
    help="Render resolution",
)
@click.option(
    "--output-dir",
    type=click.Path(file_okay=False, path_type=Path),
    default="./",
    help="Diff image output directory",
)
@click.option("--debug", is_flag=True, help="Verbose logging")
@click.option("--save-log", is_flag=True, help="Write log output to log.txt")
@click.version_option(package_name="diffpdf")
def cli(reference, actual, threshold, dpi, output_dir, debug, save_log):
    """Compare two PDF files for structural, textual, and visual differences."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short; implementation notes live in comments instead.
    logger = setup_logging(debug, save_log)

    try:
        # compare_pdfs reports its verdict through sys.exit(); SystemExit is
        # not an Exception subclass, so it passes through the handler below.
        compare_pdfs(reference, actual, threshold, dpi, output_dir, logger)
    except Exception as e:  # pragma: no cover
        # Top-level boundary: any unexpected failure is logged and mapped to
        # exit status 2.
        logger.critical(f"Error: {e}")
        sys.exit(2)
@@ -0,0 +1,22 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ from .hash_check import check_hash
5
+ from .page_check import check_page_counts
6
+ from .text_check import check_text_content
7
+ from .visual_check import check_visual_content
8
+
9
+
10
def compare_pdfs(
    ref: Path, actual: Path, threshold: float, dpi: int, output_dir: Path, logger
) -> None:
    """Run the four comparison stages in order, then exit with status 0.

    The hash, page-count and text stages share the same call signature and
    are run one after another; the visual stage additionally receives the
    threshold, DPI and output directory. If every stage returns, success is
    logged and the process exits with status 0.
    """
    for stage in (check_hash, check_page_counts, check_text_content):
        stage(ref, actual, logger)

    check_visual_content(ref, actual, threshold, dpi, output_dir, logger)

    logger.info("PDFs are equivalent")
    sys.exit(0)
@@ -0,0 +1,24 @@
1
+ import hashlib
2
+ import sys
3
+ from pathlib import Path
4
+
5
+
6
def compute_file_hash(filepath: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *filepath*.

    The file is read in 8192-byte blocks so that it never has to be held in
    memory in full.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
12
+
13
+
14
def check_hash(ref: Path, actual: Path, logger) -> None:
    """Stage 1 of 4: compare the SHA-256 digests of both files.

    If the digests match, this is logged and the process exits with status 0.
    Otherwise the function logs that the hashes differ and returns.
    """
    logger.info("[1/4] Checking file hashes...")

    digests_equal = compute_file_hash(ref) == compute_file_hash(actual)
    if not digests_equal:
        logger.info("Hashes differ, continuing checks")
        return

    logger.info("Files are identical (hash match)")
    sys.exit(0)
@@ -0,0 +1,24 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import fitz
5
+
6
+
7
def get_page_count(pdf_path: Path) -> int:
    """Return the number of pages in the PDF at *pdf_path*.

    The document is opened through a context manager so that it is closed
    even if reading the page count raises.
    """
    with fitz.open(pdf_path) as doc:
        return len(doc)
12
+
13
+
14
def check_page_counts(ref: Path, actual: Path, logger) -> None:
    """Stage 2 of 4: verify that both PDFs have the same number of pages.

    On a mismatch an error is logged and the process exits with status 1;
    otherwise the matching count is logged and the function returns.
    """
    logger.info("[2/4] Checking page counts...")

    ref_count = get_page_count(ref)
    actual_count = get_page_count(actual)

    if ref_count == actual_count:
        logger.info(f"Page counts match ({ref_count} pages)")
        return

    logger.error(f"Page count mismatch: expected {ref_count}, got {actual_count}")
    sys.exit(1)
@@ -0,0 +1,45 @@
1
+ import difflib
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import fitz
6
+
7
+
8
def extract_text(pdf_path: Path) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Per-page text is joined in page order, and leading/trailing whitespace of
    the combined result is stripped. The document is opened through a context
    manager so that it is closed even if text extraction raises.
    """
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        text = "".join(page.get_text() for page in doc)
    return text.strip()
15
+
16
+
17
def generate_diff(ref_text: str, actual_text: str) -> str:
    """Return a unified diff of two texts with one diff entry per line.

    Args:
        ref_text: Text of the reference document.
        actual_text: Text of the document under test.

    Returns:
        The unified diff as a single string whose entries are separated by
        ``"\\n"`` (no trailing newline), or an empty string when the inputs
        produce no diff.
    """
    # Line terminators are kept for the comparison itself, so that two lines
    # differing only in their terminator are still reported.
    ref_lines = ref_text.splitlines(keepends=True)
    actual_lines = actual_text.splitlines(keepends=True)

    diff = difflib.unified_diff(
        ref_lines,
        actual_lines,
        fromfile="reference.pdf",
        tofile="actual.pdf",
        lineterm="",
    )

    # With lineterm="" the header entries carry no terminator, and a final
    # input line without a newline has none either. Concatenating the entries
    # directly would fuse them ("--- reference.pdf+++ actual.pdf@@ ...").
    # Strip whatever terminator an entry has and join with exactly one "\n".
    return "\n".join(entry.rstrip("\r\n") for entry in diff)
30
+
31
+
32
def check_text_content(ref: Path, actual: Path, logger) -> None:
    """Stage 3 of 4: compare the extracted text of both PDFs.

    If the texts differ, the mismatch and each line of the generated diff are
    logged as errors and the process exits with status 1. If they are equal,
    a confirmation is logged and the function returns.
    """
    logger.info("[3/4] Checking text content...")

    ref_text = extract_text(ref)
    actual_text = extract_text(actual)

    if ref_text == actual_text:
        logger.info("Text content matches")
        return

    diff_lines = generate_diff(ref_text, actual_text).splitlines()
    logger.error("Text content mismatch")
    for diff_line in diff_lines:
        logger.error(diff_line)
    sys.exit(1)
@@ -0,0 +1,63 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import fitz
5
+ from PIL import Image
6
+ from pixelmatch.contrib.PIL import pixelmatch
7
+
8
+
9
def render_page_to_image(pdf_path: Path, page_num: int, dpi: int) -> Image.Image:
    """Render one page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Index of the page to render, used to index the document.
        dpi: Resolution passed to the page's ``get_pixmap``.

    Returns:
        A PIL image built from the pixmap's raw samples.
    """
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_num].get_pixmap(dpi=dpi)
        # The image is built while the document is still open, as before.
        # NOTE(review): "RGB" assumes the pixmap has three channels and no
        # alpha — confirm against get_pixmap's defaults.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
16
+
17
+
18
def compare_images(
    ref_img: Image.Image, actual_img: Image.Image, threshold: float, output_path: Path
) -> bool:
    """Compare two rendered pages and report whether they match.

    Args:
        ref_img: Rendered reference page.
        actual_img: Rendered page under test.
        threshold: Threshold forwarded to ``pixelmatch``.
        output_path: Where the diff image is written when pixels mismatch.

    Returns:
        ``True`` when no mismatching pixels are found, ``False`` otherwise.
        Images of different dimensions are reported as a mismatch; no diff
        image is written in that case.
    """
    # A pixel-wise comparison needs equally sized images.
    # NOTE(review): pixelmatch is expected to raise on differing sizes, which
    # would be reported as a processing error rather than a difference —
    # confirm against pixelmatch-py.
    if ref_img.size != actual_img.size:
        return False

    diff_img = Image.new("RGB", ref_img.size)
    mismatch_count = pixelmatch(ref_img, actual_img, diff_img, threshold=threshold)

    if mismatch_count > 0:
        diff_img.save(output_path)
        return False

    return True
29
+
30
+
31
def check_visual_content(
    ref: Path, actual: Path, threshold: float, dpi: int, output_dir: Path, logger
) -> None:
    """Stage 4 of 4: render every page of both PDFs and compare them visually.

    The output directory is created if needed. Each page index of the
    reference document is rendered from both files and compared. If any page
    fails, the 1-based page numbers are logged as an error and the process
    exits with status 1; otherwise a confirmation is logged.
    """
    logger.info("[4/4] Checking visual content...")

    output_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the document even if len() raises.
    with fitz.open(ref) as ref_doc:
        page_count = len(ref_doc)

    # The file-name prefix does not depend on the page; build it once.
    name_prefix = f"{ref.stem}_vs_{actual.stem}"

    failing_pages = []

    for page_num in range(page_count):
        ref_img = render_page_to_image(ref, page_num, dpi)
        actual_img = render_page_to_image(actual, page_num, dpi)

        # Page numbers in file names and log output are 1-based.
        output_path = output_dir / f"{name_prefix}_page{page_num + 1}_diff.png"

        if not compare_images(ref_img, actual_img, threshold, output_path):
            failing_pages.append(page_num + 1)

    if failing_pages:
        logger.error(f"Visual mismatch on pages: {', '.join(map(str, failing_pages))}")
        sys.exit(1)

    logger.info("Visual content matches")
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ from click.testing import CliRunner
5
+
6
+ from diffpdf.cli import cli
7
+
8
+
9
@pytest.mark.parametrize(
    "ref_pdf_rel,actual_pdf_rel,expected_exit_code",
    [
        # Pass cases (exit code 0)
        ("pass/identical-A.pdf", "pass/identical-B.pdf", 0),
        ("pass/hash-diff-A.pdf", "pass/hash-diff-B.pdf", 0),
        ("pass/minor-color-diff-A.pdf", "pass/minor-color-diff-B.pdf", 0),
        ("pass/multiplatform-diff-A.pdf", "pass/multiplatform-diff-B.pdf", 0),
        # Fail cases (exit code 1)
        ("fail/1-letter-diff-A.pdf", "fail/1-letter-diff-B.pdf", 1),
        ("fail/major-color-diff-A.pdf", "fail/major-color-diff-B.pdf", 1),
        ("fail/page-count-diff-A.pdf", "fail/page-count-diff-B.pdf", 1),
        # Critical error cases (exit code 2)
        ("nonexistent.pdf", "another.pdf", 2),
    ],
)
def test_cli(ref_pdf_rel, actual_pdf_rel, expected_exit_code, tmp_path):
    """Parametric integration test: CLI should exit with correct code for various PDF pairs."""
    runner = CliRunner()
    test_assets_dir = Path(__file__).parent / "assets"

    ref_pdf = str(test_assets_dir / ref_pdf_rel)
    actual_pdf = str(test_assets_dir / actual_pdf_rel)

    # Send diff images to pytest's per-test temporary directory; the default
    # output directory is the current working directory, which failing visual
    # cases would otherwise litter with PNG files.
    result = runner.invoke(cli, [ref_pdf, actual_pdf, "--output-dir", str(tmp_path)])

    assert result.exit_code == expected_exit_code