pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +51 -13
- pdflinkcheck/{analyze.py → analyze_pymupdf.py} +54 -224
- pdflinkcheck/analyze_pypdf.py +184 -0
- pdflinkcheck/analyze_pypdf_v2.py +218 -0
- pdflinkcheck/cli.py +238 -39
- pdflinkcheck/data/LICENSE +5 -24
- pdflinkcheck/data/README.md +278 -0
- pdflinkcheck/data/pyproject.toml +98 -0
- pdflinkcheck/datacopy.py +60 -0
- pdflinkcheck/dev.py +109 -0
- pdflinkcheck/gui.py +371 -74
- pdflinkcheck/io.py +118 -11
- pdflinkcheck/report.py +280 -0
- pdflinkcheck/stdlib_server.py +176 -0
- pdflinkcheck/validate.py +380 -0
- pdflinkcheck/version_info.py +83 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/METADATA +127 -71
- pdflinkcheck-1.1.72.dist-info/RECORD +21 -0
- pdflinkcheck-1.1.72.dist-info/WHEEL +4 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/entry_points.txt +1 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/licenses/LICENSE +5 -24
- pdflinkcheck/remnants.py +0 -142
- pdflinkcheck-1.1.47.dist-info/RECORD +0 -13
- pdflinkcheck-1.1.47.dist-info/WHEEL +0 -5
- pdflinkcheck-1.1.47.dist-info/top_level.txt +0 -1
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pdflinkcheck
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.72
|
|
4
4
|
Summary: A purpose-built PDF link analysis and reporting tool with GUI and CLI.
|
|
5
|
+
Author: George Clayton Bennett
|
|
5
6
|
Author-email: George Clayton Bennett <george.bennett@memphistn.gov>
|
|
6
|
-
|
|
7
|
-
Project-URL: Repository, https://github.com/city-of-memphis-wastewater/pdflinkcheck
|
|
7
|
+
License-File: LICENSE
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -27,26 +27,28 @@ Classifier: Environment :: MacOS X
|
|
|
27
27
|
Classifier: Environment :: Win32 (MS Windows)
|
|
28
28
|
Classifier: Typing :: Typed
|
|
29
29
|
Classifier: Development Status :: 4 - Beta
|
|
30
|
-
Requires-Python: >=3.10
|
|
31
|
-
Description-Content-Type: text/markdown
|
|
32
|
-
License-File: LICENSE
|
|
33
30
|
Requires-Dist: pyhabitat>=1.0.53
|
|
34
|
-
Requires-Dist:
|
|
31
|
+
Requires-Dist: pypdf>=6.4.2
|
|
35
32
|
Requires-Dist: rich>=14.2.0
|
|
36
33
|
Requires-Dist: typer>=0.20.0
|
|
37
|
-
|
|
38
|
-
Requires-Dist:
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
34
|
+
Requires-Dist: pymupdf>=1.26.7 ; extra == 'full'
|
|
35
|
+
Requires-Dist: sv-ttk>=2.6.1 ; extra == 'gui'
|
|
36
|
+
Maintainer: George Clayton Bennett
|
|
37
|
+
Maintainer-email: George Clayton Bennett <george.bennett@memphistn.gov>
|
|
38
|
+
Requires-Python: >=3.10
|
|
39
|
+
Project-URL: Homepage, https://github.com/city-of-memphis-wastewater/pdflinkcheck
|
|
40
|
+
Project-URL: Repository, https://github.com/city-of-memphis-wastewater/pdflinkcheck
|
|
41
|
+
Provides-Extra: full
|
|
42
|
+
Provides-Extra: gui
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
42
44
|
|
|
43
45
|
# pdflinkcheck
|
|
44
46
|
|
|
45
|
-
A purpose-built tool for comprehensive analysis of hyperlinks and
|
|
47
|
+
A purpose-built tool for comprehensive analysis of hyperlinks and GoTo links within PDF documents. Users may leverage either the PyMuPDF or the pypdf library. Use the CLI or the GUI.
|
|
46
48
|
|
|
47
49
|
-----
|
|
48
50
|
|
|
49
|
-

|
|
50
52
|
|
|
51
53
|
-----
|
|
52
54
|
|
|
@@ -54,14 +56,14 @@ A purpose-built tool for comprehensive analysis of hyperlinks and link remnants
|
|
|
54
56
|
|
|
55
57
|
The recommended way to use `pdflinkcheck` is to either install the CLI with `pipx` or to download the appropriate latest binary for your system from [Releases](https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/releases/).
|
|
56
58
|
|
|
57
|
-
### 🚀
|
|
59
|
+
### 🚀 Release Artifact Files (EXE, PYZ, ELF)
|
|
58
60
|
|
|
59
61
|
For the most user-typical experience, download the single-file binary matching your OS.
|
|
60
62
|
|
|
61
63
|
| **File Type** | **Primary Use Case** | **Recommended Launch Method** |
|
|
62
64
|
| :--- | :--- | :--- |
|
|
63
|
-
| **Executable (.exe, .elf
|
|
64
|
-
| **PYZ (Python Zip App)** | **CLI
|
|
65
|
+
| **Executable (.exe, .elf)** | **GUI** | Double-click the file. |
|
|
66
|
+
| **PYZ (Python Zip App)** | **CLI** or **GUI** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz --help` |
|
|
65
67
|
|
|
66
68
|
### Installation via pipx
|
|
67
69
|
|
|
@@ -69,7 +71,11 @@ For an isolated environment where you can access `pdflinkcheck` from any termina
|
|
|
69
71
|
|
|
70
72
|
```bash
|
|
71
73
|
# Ensure you have pipx installed first (if not, run: pip install pipx)
|
|
74
|
+
pipx install pdflinkcheck[full]
|
|
75
|
+
|
|
76
|
+
# On Termux
|
|
72
77
|
pipx install pdflinkcheck
|
|
78
|
+
|
|
73
79
|
```
|
|
74
80
|
|
|
75
81
|
-----
|
|
@@ -99,49 +105,53 @@ We are actively working on the following enhancements:
|
|
|
99
105
|
|
|
100
106
|
## 🚀 CLI Usage
|
|
101
107
|
|
|
102
|
-
The core functionality is accessed via the `analyze` command.
|
|
108
|
+
The core functionality is accessed via the `analyze` command.
|
|
109
|
+
|
|
110
|
+
`DEV_TYPER_HELP_TREE=1 pdflinkcheck help-tree`:
|
|
111
|
+

|
|
112
|
+
|
|
113
|
+
`pdflinkcheck --help`:
|
|
114
|
+

|
|
115
|
+
|
|
103
116
|
|
|
104
117
|
### Available Commands
|
|
105
118
|
|
|
106
119
|
|**Command**|**Description**|
|
|
107
120
|
|---|---|
|
|
108
|
-
|`pdflinkcheck analyze`|Analyzes a PDF file for links
|
|
121
|
+
|`pdflinkcheck analyze`|Analyzes a PDF file for links |
|
|
109
122
|
|`pdflinkcheck gui`|Explicitly launch the Graphical User Interface.|
|
|
110
|
-
|`pdflinkcheck
|
|
123
|
+
|`pdflinkcheck docs`|Access documentation, including the README and AGPLv3+ license.|
|
|
111
124
|
|
|
112
125
|
### `analyze` Command Options
|
|
113
126
|
|
|
114
127
|
|**Option**|**Description**|**Default**|
|
|
115
128
|
|---|---|---|
|
|
116
129
|
|`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
|
|
117
|
-
|`--
|
|
118
|
-
|`--
|
|
119
|
-
|`--
|
|
120
|
-
|`--help`|Show command help and exit.|N/A|
|
|
130
|
+
|`--pdf-library / -p`|Select engine: `pymupdf` or `pypdf`.|`pypdf`|
|
|
131
|
+
|`--export-format / -e`|Export to `JSON`, `TXT`, or `None` to suppress file output.|`JSON`|
|
|
132
|
+
|`--max-links / -m`|Maximum links to display per section. Use `0` for all.|`0`|
|
|
121
133
|
|
|
122
134
|
### `gui` Command Options
|
|
123
135
|
|
|
124
136
|
| **Option** | **Description** | **Default** |
|
|
125
137
|
| ---------------------- | ------------------------------------------------------------------------------------------------------------- | -------------- |
|
|
126
138
|
| `--auto-close INTEGER` | **(For testing/automation only).** Delay in milliseconds after which the GUI window will automatically close. | `0` (Disabled) |
|
|
127
|
-
#### Example Runs
|
|
128
|
-
|
|
129
139
|
|
|
140
|
+
#### Example Runs
|
|
130
141
|
|
|
131
142
|
```bash
|
|
132
|
-
# Analyze a document, show all links
|
|
133
|
-
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --export-format JSON
|
|
134
|
-
|
|
135
|
-
# Analyze a document but skip the time-consuming remnant check
|
|
136
|
-
pdflinkcheck analyze "another_doc.pdf" --no-check-remnants
|
|
143
|
+
# Analyze a document, show all links, and save the report as JSON and TXT
|
|
144
|
+
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --export-format JSON,TXT
|
|
137
145
|
|
|
138
146
|
# Analyze a document but keep the print block short, showing only the first 10 links for each type
|
|
139
147
|
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
|
|
140
148
|
|
|
141
149
|
# Show the GUI for only a moment, like in a build check
|
|
142
150
|
pdflinkcheck gui --auto-close 3000
|
|
143
|
-
```
|
|
144
151
|
|
|
152
|
+
# Show both the LICENSE and README.md docs
|
|
153
|
+
pdflinkcheck docs --license --readme
|
|
154
|
+
```
|
|
145
155
|
|
|
146
156
|
-----
|
|
147
157
|
|
|
@@ -151,14 +161,23 @@ For developers importing `pdflinkcheck` into other Python projects, the core ana
|
|
|
151
161
|
|
|
152
162
|
|**Function**|**Description**|
|
|
153
163
|
|---|---|
|
|
154
|
-
|`
|
|
155
|
-
|`
|
|
156
|
-
|`
|
|
164
|
+
|`run_report()`|**(Primary function)** Performs the full analysis, prints to console, and handles file export.|
|
|
165
|
+
|`extract_links_pymupdf()`|Function to retrieve all explicit links (URIs, GoTo, etc.) from a PDF path.|
|
|
166
|
+
|`extract_toc_pymupdf()`|Function to extract the PDF's internal Table of Contents (bookmarks/outline).|
|
|
167
|
+
|`extract_links_pypdf()`|Function to retrieve all explicit links (URIs, GoTo, etc.) from a PDF path, using the pypdf library.|
|
|
168
|
+
|`extract_toc_pypdf()`|Function to extract the PDF's internal Table of Contents (bookmarks/outline), using the pypdf library.|
|
|
157
169
|
|
|
158
|
-
|
|
170
|
+
Example:
|
|
159
171
|
|
|
160
|
-
```
|
|
161
|
-
from pdflinkcheck.
|
|
172
|
+
```python
|
|
173
|
+
from pdflinkcheck.report import run_report
|
|
174
|
+
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
|
|
175
|
+
from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
|
|
176
|
+
|
|
177
|
+
file = "document1.pdf"
|
|
178
|
+
report_data = run_report(file)
|
|
179
|
+
links_pymupdf = extract_links_pymupdf(file)
|
|
180
|
+
links_pypdf = extract_links_pypdf(file)
|
|
162
181
|
```
|
|
163
182
|
|
|
164
183
|
-----
|
|
@@ -167,24 +186,10 @@ from pdflinkcheck.analyze import run_analysis, extract_links, extract_toc
|
|
|
167
186
|
|
|
168
187
|
* **Active Link Extraction:** Identifies and categorizes all programmed links (External URIs, Internal GoTo/Destinations, Remote Jumps).
|
|
169
188
|
* **Anchor Text Retrieval:** Extracts the visible text corresponding to each link's bounding box.
|
|
170
|
-
* **Remnant Detection:** Scans the document's text layer for unlinked URIs and email addresses that should potentially be converted into active links.
|
|
171
189
|
* **Structural TOC:** Extracts the PDF's internal Table of Contents (bookmarks/outline).
|
|
172
190
|
|
|
173
191
|
-----
|
|
174
192
|
|
|
175
|
-
## 📜 License Implications (AGPLv3+)
|
|
176
|
-
|
|
177
|
-
**pdflinkcheck is licensed under the GNU Affero General Public License version 3 or later (AGPLv3+).**
|
|
178
|
-
|
|
179
|
-
This license has significant implications for **distribution and network use**, particularly for organizations:
|
|
180
|
-
|
|
181
|
-
* **Source Code Provision:** If you distribute this tool (modified or unmodified) to anyone, you **must** provide the full source code under the same license.
|
|
182
|
-
* **Network Interaction (Affero Clause):** If you modify this tool and make the modified version available to users over a computer network (e.g., as a web service or backend), you **must** also offer the source code to those network users.
|
|
183
|
-
|
|
184
|
-
> **Before deploying or modifying this tool for organizational use, especially for internal web services or distribution, please ensure compliance with the AGPLv3+ terms.**
|
|
185
|
-
|
|
186
|
-
-----
|
|
187
|
-
|
|
188
193
|
## 🥚 Optional REPL‑Friendly GUI Access (Easter Egg)
|
|
189
194
|
|
|
190
195
|
For users who prefer exploring tools interactively—especially those coming from MATLAB or other REPL‑first environments—`pdflinkcheck` includes an optional Easter egg that exposes the GUI launcher directly in the library namespace.
|
|
@@ -215,39 +220,37 @@ pdflinkcheck.start_gui()
|
|
|
215
220
|
|
|
216
221
|
If the `PDFLINKCHECK_GUI_EASTEREGG` environment variable is not set—or if GUI support is unavailable—`pdflinkcheck` behaves as a normal library with no GUI functions exposed.
|
|
217
222
|
|
|
218
|
-
|
|
223
|
+
### Another Easter Egg
|
|
219
224
|
|
|
220
|
-
|
|
225
|
+
```bash
|
|
226
|
+
DEV_TYPER_HELP_TREE=1 pdflinkcheck help-tree
|
|
227
|
+
```
|
|
221
228
|
|
|
222
|
-
|
|
229
|
+
This `help-tree` feature has not yet been submitted for inclusion into Typer.
|
|
223
230
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
231
|
+
-----
|
|
232
|
+
|
|
233
|
+
## ⚠️ Compatibility Notes
|
|
227
234
|
|
|
228
235
|
#### Termux Compatibility as a Key Goal
|
|
229
|
-
A key goal of City-of-Memphis-Wastewater is to release all software as Termux-compatible.
|
|
230
|
-
We tried alternative PDF libaries like `pdfminer`, `pdfplumber`, and `borb`, but none of these offered the level of detail concerning GoTo links.
|
|
231
|
-
Due to Termux compatibility goals, we do not generally make Tkinter-based interfaces, so that was a fun, minimalist opportunity on this project.
|
|
236
|
+
A key goal of City-of-Memphis-Wastewater is to release all software as Termux-compatible.
|
|
232
237
|
|
|
233
238
|
Termux compatibility is important in the modern age as Android devices are common among technicians, field engineers, and maintenance staff.
|
|
234
239
|
Android is the most common operating system in the Global South.
|
|
235
240
|
We aim to produce stable software that can do the most possible good.
|
|
236
241
|
|
|
237
|
-
|
|
238
|
-
|
|
242
|
+
While using `PyMuPDF` in Python dependency resolution on Termux simply isn't possible, we are proud to have achieved a work-around by implementing a parallel solution in `pypdf`!
|
|
243
|
+
Now, there is PDF Engine selection in both the CLI and the GUI.
|
|
244
|
+
`pypdf` is the default in pdflinkcheck.report.run_report(); PyMuPDF can be explicitly requested in the CLI and is the default in the Tkinter GUI.
|
|
239
245
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
-
|
|
243
|
-
- Alpine-compatible Docker image.
|
|
244
|
-
- Web-stack GUI rather than Tkinter, to be compatible with Termux.
|
|
245
|
-
- A different license from the AGPL3, if we choose at that time.
|
|
246
|
+
Now that `pdflinkcheck` can run on Termux, we may find a work-around and be able to drop the PyMuPDF dependency.
|
|
247
|
+
- Build `pypdf`-only artifacts, to reduce size.
|
|
248
|
+
- Build a web-stack GUI as an alternative to the Tkinter GUI, to be compatible with Termux.
|
|
246
249
|
|
|
247
|
-
|
|
250
|
+
Because it works, we plan to keep the `PyMuPDF` portion of the codebase.
|
|
248
251
|
|
|
249
252
|
### Document Compatibility:
|
|
250
|
-
|
|
253
|
+
Not all PDF files can be processed successfully. This tool is designed primarily for digitally generated (vector-based) PDFs.
|
|
251
254
|
|
|
252
255
|
Processing may fail or yield incomplete results for:
|
|
253
256
|
* **Scanned PDFs** (images of text) that lack an accessible text layer.
|
|
@@ -256,11 +259,64 @@ Processing may fail or yield incomplete results for:
|
|
|
256
259
|
|
|
257
260
|
-----
|
|
258
261
|
|
|
262
|
+
## PDF Library Selection
|
|
263
|
+
At long last, `PyMuPDF` is an optional dependency. The default is `pypdf`. All testing has shown identical results, though `analyze_pymupdf.py` is faster, more direct, and more robust than `analyze_pypdf.py`, which requires a lot of intentional parsing.
|
|
264
|
+
|
|
265
|
+
Binaries and artifacts are expected to contain PyMuPDF, unless they are built on Android. The GUI and CLI interfaces both allow selection of the library; if PyMuPDF is selected but is not available, the user will be warned.
|
|
266
|
+
|
|
267
|
+
To install the complete version use one of these options:
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
pip install "pdflinkcheck[full]"
|
|
271
|
+
pipx install "pdflinkcheck[full]"
|
|
272
|
+
uv tool install "pdflinkcheck[full]"
|
|
273
|
+
uv add "pdflinkcheck[full]"
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
-----
|
|
277
|
+
|
|
259
278
|
## Run from Source (Developers)
|
|
260
279
|
|
|
261
280
|
```bash
|
|
262
281
|
git clone http://github.com/city-of-memphis-wastewater/pdflinkcheck.git
|
|
263
282
|
cd pdflinkcheck
|
|
283
|
+
|
|
284
|
+
# To include the PyMuPDF dependency in the installation:
|
|
285
|
+
uv sync --extra full
|
|
286
|
+
|
|
287
|
+
# On Termux, to not include PyMuPDF:
|
|
264
288
|
uv sync
|
|
289
|
+
|
|
290
|
+
# To include developer dependencies:
|
|
291
|
+
uv sync --all-extras --group dev
|
|
292
|
+
|
|
293
|
+
# Run the CLI
|
|
265
294
|
uv run python src/pdflinkcheck/cli.py --help
|
|
295
|
+
|
|
296
|
+
# Run a basic webapp and Termux-facing browser-based interface
|
|
297
|
+
uv run python -m pdflinkcheck.stdlib_server
|
|
266
298
|
```
|
|
299
|
+
|
|
300
|
+
-----
|
|
301
|
+
|
|
302
|
+
## 📜 License Implications (AGPLv3+)
|
|
303
|
+
|
|
304
|
+
**`pdflinkcheck` is licensed under the `GNU Affero General Public License` version 3 or later (`AGPLv3+`).**
|
|
305
|
+
|
|
306
|
+
The `AGPL3+` is required for portions of this codebase because `pdflinkcheck` uses `PyMuPDF`, which is licensed under the `AGPL3`.
|
|
307
|
+
|
|
308
|
+
To stay in compliance, the AGPL3 license text is readily available in the CLI and the GUI, and it is included in the build artifacts.
|
|
309
|
+
The `AGPL3` appears as the primary license file in the source code. While this implies that the entire project is AGPL3-licensed, this is not true - portions of the codebase are MIT-licensed.
|
|
310
|
+
|
|
311
|
+
This license has significant implications for **distribution and network use**, particularly for organizations:
|
|
312
|
+
|
|
313
|
+
* **Source Code Provision:** If you distribute this tool (modified or unmodified) to anyone, you **must** provide the full source code under the same license.
|
|
314
|
+
* **Network Interaction (Affero Clause):** If you modify this tool and make the modified version available to users over a computer network (e.g., as a web service or backend), you **must** also offer the source code to those network users.
|
|
315
|
+
|
|
316
|
+
> **Before deploying or modifying this tool for organizational use, especially for internal web services or distribution, please ensure compliance with the AGPLv3+ terms.**
|
|
317
|
+
|
|
318
|
+
Links:
|
|
319
|
+
- Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
|
|
320
|
+
- Official AGPLv3 Text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
|
|
321
|
+
|
|
322
|
+
Copyright © 2025 George Clayton Bennett
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
pdflinkcheck/__init__.py,sha256=KyoFlScM3kPrp1HjcxHDFEf4YflsoYclVF99-rerl3E,2510
|
|
2
|
+
pdflinkcheck/analyze_pymupdf.py,sha256=Be17KJQnTX9OoAluoE2GzPXC3mDCo7VGCNuwc9ilosc,12452
|
|
3
|
+
pdflinkcheck/analyze_pypdf.py,sha256=gHF9o6EY4sie727vS6YjTCQSzw_XWZape4xEk-l4lRI,6397
|
|
4
|
+
pdflinkcheck/analyze_pypdf_v2.py,sha256=dAvq2OoiN1MjptWSgOrAlArg0A98Hvpr105BKXJBrjE,7563
|
|
5
|
+
pdflinkcheck/cli.py,sha256=8PTkbK4msbhYB2NUCkUv8DWU7lO2qYg8qQKT_cB2U6w,12634
|
|
6
|
+
pdflinkcheck/data/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
|
7
|
+
pdflinkcheck/data/README.md,sha256=9tM77vu5jTpFQplL2A-ysyVyOQg8QZISsmtcmEfQXZM,11650
|
|
8
|
+
pdflinkcheck/data/pyproject.toml,sha256=nsh5tK1V_MD7iPTXjxcPjWPi5xXEbmUW7iWn-MfxxJo,2955
|
|
9
|
+
pdflinkcheck/datacopy.py,sha256=pZysPvfsvRe3qvA-du8XJvwZFxEOB_1ygEvhEj_Zj2Y,2503
|
|
10
|
+
pdflinkcheck/dev.py,sha256=e-0353spmVPPQGB2aJ_QbEDtJQGQFBSLrrfSccJGwII,4783
|
|
11
|
+
pdflinkcheck/gui.py,sha256=TYjP0vCDtuyRYMi6-c2JdCgif4FWNKyrwdye13FTv_8,24434
|
|
12
|
+
pdflinkcheck/io.py,sha256=ZdvKUumFIR8Ql89WToaVDqnosAo43H6sCRnbqwspE80,7943
|
|
13
|
+
pdflinkcheck/report.py,sha256=MmUs2Cftm6sbT__uCzgU-v6lsSQ1IjzsvoM385Xxl8g,11777
|
|
14
|
+
pdflinkcheck/stdlib_server.py,sha256=NKDPi-cfrBnYtG7mIxSI1eR1XSt8bxyan9YpdDAwhEU,6138
|
|
15
|
+
pdflinkcheck/validate.py,sha256=AtROBUZ6EmXxsx0xmqcSTYSlaippnkymp8s5eN4qN3o,14391
|
|
16
|
+
pdflinkcheck/version_info.py,sha256=dRVbs9U97YKisB1cLqVC2IoNrHCYw3z9TG8aldqTVOk,3211
|
|
17
|
+
pdflinkcheck-1.1.72.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
|
18
|
+
pdflinkcheck-1.1.72.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
|
|
19
|
+
pdflinkcheck-1.1.72.dist-info/entry_points.txt,sha256=OJs4WkAziNGSoZ2KP0FgYOj2JdL6EW8UphJebWJnz3c,55
|
|
20
|
+
pdflinkcheck-1.1.72.dist-info/METADATA,sha256=HORgjln1UF9Zdx3BwHfrKBR1OZSV7WwfDi2s6z8JNnM,13568
|
|
21
|
+
pdflinkcheck-1.1.72.dist-info/RECORD,,
|
|
@@ -1,26 +1,7 @@
|
|
|
1
|
-
pdflinkcheck - A PDF Link Checker
|
|
2
|
-
Copyright (C) 2025 George Clayton Bennett
|
|
3
|
-
|
|
4
|
-
This program is free software: You can redistribute it and/or modify
|
|
5
|
-
it under the terms of the GNU Affero General Public License as
|
|
6
|
-
published by the Free Software Foundation, either version 3 of the
|
|
7
|
-
License, or (at your option) any later version.
|
|
8
|
-
|
|
9
|
-
The AGPL3+ is required because it uses PyMuPDF, which is licensed under the AGPL3.
|
|
10
|
-
|
|
11
|
-
Dependencies:
|
|
12
|
-
- Python (PSFL) | https://github.com/python/cpython |
|
|
13
|
-
- PyMuPDF (AGPL3) | https://github.com/pymupdf/PyMuPDF |
|
|
14
|
-
- pyhabitat (MIT) | https://github.com/City-of-Memphis-Wastewater/pdflinkcheck |
|
|
15
|
-
- rich (MIT) | https://github.com/Textualize/rich |
|
|
16
|
-
- typer (MIT) | https://github.com/fastapi/typer |
|
|
17
|
-
|
|
18
|
-
----------------------------------------------------------------------
|
|
19
|
-
|
|
20
1
|
GNU AFFERO GENERAL PUBLIC LICENSE
|
|
21
2
|
Version 3, 19 November 2007
|
|
22
3
|
|
|
23
|
-
Copyright (C) 2007 Free Software Foundation, Inc. <
|
|
4
|
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
|
24
5
|
Everyone is permitted to copy and distribute verbatim copies
|
|
25
6
|
of this license document, but changing it is not allowed.
|
|
26
7
|
|
|
@@ -652,8 +633,8 @@ the "copyright" line and a pointer to where the full notice is found.
|
|
|
652
633
|
Copyright (C) <year> <name of author>
|
|
653
634
|
|
|
654
635
|
This program is free software: you can redistribute it and/or modify
|
|
655
|
-
it under the terms of the GNU Affero General Public License as published
|
|
656
|
-
the Free Software Foundation, either version 3 of the License, or
|
|
636
|
+
it under the terms of the GNU Affero General Public License as published
|
|
637
|
+
by the Free Software Foundation, either version 3 of the License, or
|
|
657
638
|
(at your option) any later version.
|
|
658
639
|
|
|
659
640
|
This program is distributed in the hope that it will be useful,
|
|
@@ -662,7 +643,7 @@ the "copyright" line and a pointer to where the full notice is found.
|
|
|
662
643
|
GNU Affero General Public License for more details.
|
|
663
644
|
|
|
664
645
|
You should have received a copy of the GNU Affero General Public License
|
|
665
|
-
along with this program. If not, see <
|
|
646
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
666
647
|
|
|
667
648
|
Also add information on how to contact you by electronic and paper mail.
|
|
668
649
|
|
|
@@ -677,4 +658,4 @@ specific requirements.
|
|
|
677
658
|
You should also get your employer (if you work as a programmer) or school,
|
|
678
659
|
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
|
679
660
|
For more information on this, and how to apply and follow the GNU AGPL, see
|
|
680
|
-
<
|
|
661
|
+
<https://www.gnu.org/licenses/>.
|
pdflinkcheck/remnants.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import fitz
|
|
3
|
-
|
|
4
|
-
# Regular expression pattern for common URLs (http, https, www, mhtml)
|
|
5
|
-
URI_PATTERN = re.compile(
|
|
6
|
-
r'(?:https?|mhtml|file|ftp):\/\/\S+|\bwww\.\S+\b',
|
|
7
|
-
re.IGNORECASE
|
|
8
|
-
)
|
|
9
|
-
|
|
10
|
-
# Regular expression pattern for email addresses
|
|
11
|
-
EMAIL_PATTERN = re.compile(
|
|
12
|
-
r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
|
|
13
|
-
re.IGNORECASE
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
def clean_ex_rect(ex_rect_tuple):
|
|
17
|
-
# If the input is a string, attempt to parse it
|
|
18
|
-
if isinstance(ex_rect_tuple, str):
|
|
19
|
-
try:
|
|
20
|
-
# Use re.split to handle commas and spaces robustly.
|
|
21
|
-
# Filter out empty strings that result from multiple delimiters (e.g., "1, 2,,3")
|
|
22
|
-
parts = [c.strip() for c in re.split(r'[,\s]+', ex_rect_tuple.strip()) if c.strip()]
|
|
23
|
-
coords = [float(c) for c in parts]
|
|
24
|
-
|
|
25
|
-
if len(coords) != 4:
|
|
26
|
-
# print(f"Warning: Rect string parsed to {len(coords)} coords, expected 4: {ex_rect_tuple}")
|
|
27
|
-
return None
|
|
28
|
-
return coords
|
|
29
|
-
except ValueError:
|
|
30
|
-
# print(f"Warning: Could not parse rect string: {ex_rect_tuple}")
|
|
31
|
-
return None # Use None to signal failure
|
|
32
|
-
|
|
33
|
-
# If it's already a numeric sequence, check its length and type
|
|
34
|
-
elif isinstance(ex_rect_tuple, (list, tuple)):
|
|
35
|
-
if len(ex_rect_tuple) == 4 and all(isinstance(c, (int, float)) for c in ex_rect_tuple):
|
|
36
|
-
return ex_rect_tuple
|
|
37
|
-
# else: print(f"Warning: Numeric rect has incorrect length/type: {ex_rect_tuple}")
|
|
38
|
-
return None
|
|
39
|
-
|
|
40
|
-
# Handle the 'N/A: Missing Rect' case where link['rect'] might be None or a weird object
|
|
41
|
-
else:
|
|
42
|
-
# print(f"Warning: Unexpected rect type/format: {ex_rect_tuple}")
|
|
43
|
-
return None
|
|
44
|
-
|
|
45
|
-
def find_link_remnants(pdf_path, existing_links):
|
|
46
|
-
"""
|
|
47
|
-
Scans the PDF for text that looks like a URI or email but is not a registered link annotation.
|
|
48
|
-
"""
|
|
49
|
-
doc = fitz.open(pdf_path)
|
|
50
|
-
remnants_data = []
|
|
51
|
-
|
|
52
|
-
# 1. Create a set of all bounding boxes (Rects) of EXISTING links for exclusion
|
|
53
|
-
existing_rects = set()
|
|
54
|
-
for link in existing_links:
|
|
55
|
-
rect_obj = link.get("from")
|
|
56
|
-
|
|
57
|
-
if rect_obj:
|
|
58
|
-
# NOTE: A fitz.Rect object is returned here. We can use its properties directly.
|
|
59
|
-
|
|
60
|
-
# ⚠️ We still need to use your cleaning function if it handles rotation/quantization,
|
|
61
|
-
# but we must pass it the coordinates in the expected format (e.g., as a list or tuple).
|
|
62
|
-
|
|
63
|
-
# Convert the Rect object to a standard coordinate tuple (x0, y0, x1, y1)
|
|
64
|
-
raw_coords = (rect_obj.x0, rect_obj.y0, rect_obj.x1, rect_obj.y1)
|
|
65
|
-
|
|
66
|
-
# Assuming clean_ex_rect takes a list/tuple of 4 coordinates and cleans them
|
|
67
|
-
cleaned_coords = clean_ex_rect(raw_coords)
|
|
68
|
-
print(f"cleaned_coords = {cleaned_coords}")
|
|
69
|
-
|
|
70
|
-
# print(f"cleaned_coords = {cleaned_coords}") # Keep this for debugging
|
|
71
|
-
|
|
72
|
-
if cleaned_coords:
|
|
73
|
-
# Store the tuple of clean NUMBERS
|
|
74
|
-
# Note: A list is not hashable, so converting to tuple is correct.
|
|
75
|
-
existing_rects.add(tuple(cleaned_coords))
|
|
76
|
-
|
|
77
|
-
for page_num in range(doc.page_count):
|
|
78
|
-
page = doc.load_page(page_num)
|
|
79
|
-
|
|
80
|
-
# Extract text blocks with coordinates (MODE_TEXT is faster than 'text')
|
|
81
|
-
text_blocks = page.get_text("blocks")
|
|
82
|
-
|
|
83
|
-
for block in text_blocks:
|
|
84
|
-
x0, y0, x1, y1, text, block_no, block_type = block
|
|
85
|
-
|
|
86
|
-
# Look for URI remnants
|
|
87
|
-
for match in URI_PATTERN.finditer(text):
|
|
88
|
-
remnant_text = match.group(0)
|
|
89
|
-
|
|
90
|
-
# Use fitz to get the bounding box of the matched remnant text on the page
|
|
91
|
-
text_instances = page.search_for(remnant_text)
|
|
92
|
-
|
|
93
|
-
if text_instances:
|
|
94
|
-
remnant_rect = tuple(text_instances[0])
|
|
95
|
-
|
|
96
|
-
# Check if this remnant's bounding box overlaps with any existing link's bounding box
|
|
97
|
-
is_active_link = False
|
|
98
|
-
for ex_rect_tuple in existing_rects:
|
|
99
|
-
# ⚠️ CLEANUP: ex_rect_tuple is now GUARANTEED to be a tuple of 4 numbers
|
|
100
|
-
# We removed the unnecessary clean_ex_rect(ex_rect_tuple) call.
|
|
101
|
-
|
|
102
|
-
# Convert tuple back to fitz.Rect for overlap check
|
|
103
|
-
ex_rect = fitz.Rect(ex_rect_tuple)
|
|
104
|
-
if ex_rect.intersects(text_instances[0]):
|
|
105
|
-
is_active_link = True
|
|
106
|
-
break
|
|
107
|
-
|
|
108
|
-
if not is_active_link:
|
|
109
|
-
remnants_data.append({
|
|
110
|
-
'page': page_num + 1,
|
|
111
|
-
'type': 'URI Remnant',
|
|
112
|
-
'text': remnant_text,
|
|
113
|
-
'rect': remnant_rect
|
|
114
|
-
})
|
|
115
|
-
|
|
116
|
-
# Look for Email remnants
|
|
117
|
-
for match in EMAIL_PATTERN.finditer(text):
|
|
118
|
-
remnant_text = match.group(0)
|
|
119
|
-
|
|
120
|
-
text_instances = page.search_for(remnant_text)
|
|
121
|
-
|
|
122
|
-
if text_instances:
|
|
123
|
-
remnant_rect = tuple(text_instances[0])
|
|
124
|
-
|
|
125
|
-
is_active_link = False
|
|
126
|
-
for ex_rect_tuple in existing_rects:
|
|
127
|
-
# ⚠️ CLEANUP: ex_rect_tuple is now GUARANTEED to be a tuple of 4 numbers
|
|
128
|
-
ex_rect = fitz.Rect(ex_rect_tuple)
|
|
129
|
-
if ex_rect.intersects(text_instances[0]):
|
|
130
|
-
is_active_link = True
|
|
131
|
-
break
|
|
132
|
-
|
|
133
|
-
if not is_active_link:
|
|
134
|
-
remnants_data.append({
|
|
135
|
-
'page': page_num + 1,
|
|
136
|
-
'type': 'Email Remnant',
|
|
137
|
-
'text': remnant_text,
|
|
138
|
-
'rect': remnant_rect
|
|
139
|
-
})
|
|
140
|
-
|
|
141
|
-
doc.close()
|
|
142
|
-
return remnants_data
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
pdflinkcheck/__init__.py,sha256=lasv-V5JVAKsozpDdPVXxnUIF6jlAVFHHjyfXzVJXrA,1066
|
|
2
|
-
pdflinkcheck/analyze.py,sha256=dZd0wfF9qgY7kZMCY7F-xR_TNz6YcizDg1qTBfEN1I4,20863
|
|
3
|
-
pdflinkcheck/cli.py,sha256=RSlBN82LyvnICS-bgxoN7rpXsb0K8yUcky0q0Mffo8Q,5077
|
|
4
|
-
pdflinkcheck/gui.py,sha256=y8M5JHFmLTbP_2tZj-HXAEJpGBEtG7jv77tQf013e58,11768
|
|
5
|
-
pdflinkcheck/io.py,sha256=nBDIwkDfCmWXMklEebpRGrKaOr6aV1dvIm2Qq4NK0sg,3616
|
|
6
|
-
pdflinkcheck/remnants.py,sha256=xgunD4hDDT0SqD9SywvPc5DLSLNLA6O0BL0KOuLQwV8,6151
|
|
7
|
-
pdflinkcheck/data/LICENSE,sha256=40gU2B05E2rcXPc9e6HBukSE4aRgyXjzjJkBTUpkRSQ,35336
|
|
8
|
-
pdflinkcheck-1.1.47.dist-info/licenses/LICENSE,sha256=40gU2B05E2rcXPc9e6HBukSE4aRgyXjzjJkBTUpkRSQ,35336
|
|
9
|
-
pdflinkcheck-1.1.47.dist-info/METADATA,sha256=fcnkek2iCQ8OPlx7mXvXujZ6-B0JtM5wL4HhPfkjUik,11612
|
|
10
|
-
pdflinkcheck-1.1.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
pdflinkcheck-1.1.47.dist-info/entry_points.txt,sha256=cZaB_inIfr2X9lxMo1RhZr4602F3nTjTm3cXquzfw3Q,54
|
|
12
|
-
pdflinkcheck-1.1.47.dist-info/top_level.txt,sha256=WdBg8l6l3TF1HQDpR_PwSmBCSu5atKWFnPfNbRNwrME,13
|
|
13
|
-
pdflinkcheck-1.1.47.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
pdflinkcheck
|