pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
|
-
# src/pdflinkcheck/
|
|
3
|
+
# src/pdflinkcheck/analysis_pypdf.py
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
import sys
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
import logging
|
|
@@ -8,6 +9,7 @@ from typing import Dict, Any, Optional, List
|
|
|
8
9
|
|
|
9
10
|
from pypdf import PdfReader
|
|
10
11
|
from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
|
|
12
|
+
from pdflinkcheck.helpers import PageRef
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
|
|
@@ -16,7 +18,28 @@ from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_c
|
|
|
16
18
|
Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
|
|
17
19
|
"""
|
|
18
20
|
|
|
19
|
-
def
|
|
21
|
+
def analyze_pdf(pdf_path: str):
|
|
22
|
+
data = {}
|
|
23
|
+
data["links"] = []
|
|
24
|
+
data["toc"] = []
|
|
25
|
+
data["file_ov"] = {}
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
reader = PdfReader(pdf_path)
|
|
29
|
+
except Exception as e:
|
|
30
|
+
print(f"pypdf.PdfReader() failed: {e}")
|
|
31
|
+
return data
|
|
32
|
+
|
|
33
|
+
extracted_links = _extract_links_pypdf(reader)
|
|
34
|
+
structural_toc = _extract_toc_pypdf(reader)
|
|
35
|
+
page_count = len(reader.pages)
|
|
36
|
+
data["links"] = extracted_links
|
|
37
|
+
data["toc"] = structural_toc
|
|
38
|
+
data["file_ov"]["total_pages"] = page_count
|
|
39
|
+
return data
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _get_anchor_text_pypdf(page, rect) -> str:
|
|
20
43
|
"""
|
|
21
44
|
Extracts text within the link's bounding box using a visitor function.
|
|
22
45
|
Reliable for finding text associated with a link without PyMuPDF.
|
|
@@ -33,7 +56,7 @@ def get_anchor_text_pypdf(page, rect) -> str:
|
|
|
33
56
|
|
|
34
57
|
parts: List[str] = []
|
|
35
58
|
|
|
36
|
-
def
|
|
59
|
+
def _visitor_body(text, cm, tm, font_dict, font_size):
|
|
37
60
|
# tm[4], tm[5] are the current text insertion point coordinates (x, y)
|
|
38
61
|
x, y = tm[4], tm[5]
|
|
39
62
|
|
|
@@ -44,17 +67,18 @@ def get_anchor_text_pypdf(page, rect) -> str:
|
|
|
44
67
|
if text.strip():
|
|
45
68
|
parts.append(text)
|
|
46
69
|
|
|
47
|
-
page.extract_text(visitor_text=
|
|
70
|
+
page.extract_text(visitor_text=_visitor_body)
|
|
48
71
|
|
|
49
72
|
raw_extracted = "".join(parts)
|
|
50
73
|
cleaned = " ".join(raw_extracted.split()).strip()
|
|
51
74
|
|
|
52
75
|
return cleaned if cleaned else "Graphic/Empty Link"
|
|
53
76
|
|
|
54
|
-
def
|
|
77
|
+
def _resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> Optional[int]:
|
|
55
78
|
try:
|
|
56
79
|
if isinstance(dest, Destination):
|
|
57
|
-
|
|
80
|
+
# .page_number in pypdf is already 0-indexed
|
|
81
|
+
return dest.page_number
|
|
58
82
|
|
|
59
83
|
if isinstance(dest, IndirectObject):
|
|
60
84
|
return obj_id_to_page.get(dest.idnum)
|
|
@@ -67,42 +91,25 @@ def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) ->
|
|
|
67
91
|
except Exception:
|
|
68
92
|
return None
|
|
69
93
|
|
|
70
|
-
def resolve_pypdf_destination_(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
|
|
71
|
-
"""
|
|
72
|
-
Resolves a Destination object or IndirectObject to a 1-based page number string.
|
|
73
|
-
"""
|
|
74
|
-
try:
|
|
75
|
-
if isinstance(dest, Destination):
|
|
76
|
-
return str(dest.page_number + 1)
|
|
77
|
-
|
|
78
|
-
if isinstance(dest, IndirectObject):
|
|
79
|
-
return str(obj_id_to_page.get(dest.idnum, "Unknown"))
|
|
80
|
-
|
|
81
|
-
if isinstance(dest, ArrayObject) and len(dest) > 0:
|
|
82
|
-
if isinstance(dest[0], IndirectObject):
|
|
83
|
-
return str(obj_id_to_page.get(dest[0].idnum, "Unknown"))
|
|
84
|
-
|
|
85
|
-
return "Unknown"
|
|
86
|
-
except Exception:
|
|
87
|
-
return "Error Resolving"
|
|
88
94
|
|
|
89
|
-
def
|
|
95
|
+
def _extract_links_pypdf(reader: PdfReader) -> List[Dict[str, Any]]:
|
|
90
96
|
"""
|
|
91
97
|
Termux-compatible link extraction using pure-Python pypdf.
|
|
92
98
|
Matches the reporting schema of the PyMuPDF version.
|
|
93
99
|
"""
|
|
94
|
-
reader = PdfReader(pdf_path)
|
|
95
100
|
|
|
96
101
|
# Pre-map Object IDs to Page Numbers for fast internal link resolution
|
|
97
102
|
obj_id_to_page = {
|
|
98
|
-
page.indirect_reference.idnum: i
|
|
103
|
+
page.indirect_reference.idnum: i
|
|
99
104
|
for i, page in enumerate(reader.pages)
|
|
100
105
|
}
|
|
101
106
|
|
|
102
107
|
all_links = []
|
|
103
108
|
|
|
104
109
|
for i, page in enumerate(reader.pages):
|
|
105
|
-
page_num = i
|
|
110
|
+
#page_num = i
|
|
111
|
+
# Use PageRef to stay consistent
|
|
112
|
+
page_source = PageRef.from_index(i)
|
|
106
113
|
if "/Annots" not in page:
|
|
107
114
|
continue
|
|
108
115
|
|
|
@@ -112,10 +119,10 @@ def extract_links_pypdf(pdf_path):
|
|
|
112
119
|
continue
|
|
113
120
|
|
|
114
121
|
rect = obj.get("/Rect")
|
|
115
|
-
anchor_text =
|
|
122
|
+
anchor_text = _get_anchor_text_pypdf(page, rect)
|
|
116
123
|
|
|
117
124
|
link_dict = {
|
|
118
|
-
'page':
|
|
125
|
+
'page': page_source.machine,
|
|
119
126
|
'rect': list(rect) if rect else None,
|
|
120
127
|
'link_text': anchor_text,
|
|
121
128
|
'type': 'Other Action',
|
|
@@ -134,13 +141,16 @@ def extract_links_pypdf(pdf_path):
|
|
|
134
141
|
# Handle GoTo (Internal)
|
|
135
142
|
elif "/Dest" in obj or ("/A" in obj and "/D" in obj["/A"]):
|
|
136
143
|
dest = obj.get("/Dest") or obj["/A"].get("/D")
|
|
137
|
-
target_page =
|
|
144
|
+
target_page = _resolve_pypdf_destination(reader, dest, obj_id_to_page)
|
|
138
145
|
# print(f"DEBUG: resolved target_page = {target_page} (type: {type(target_page)})")
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
146
|
+
if target_page is not None:
|
|
147
|
+
dest_page = PageRef.from_index(target_page)
|
|
148
|
+
link_dict.update({
|
|
149
|
+
'type': 'Internal (GoTo/Dest)',
|
|
150
|
+
'destination_page': dest_page.machine,
|
|
151
|
+
#'target': f"Page {target_page}"
|
|
152
|
+
'target': dest_page.machine
|
|
153
|
+
})
|
|
144
154
|
|
|
145
155
|
# Handle Remote GoTo (GoToR)
|
|
146
156
|
elif "/A" in obj and obj["/A"].get("/S") == "/GoToR":
|
|
@@ -156,9 +166,8 @@ def extract_links_pypdf(pdf_path):
|
|
|
156
166
|
return all_links
|
|
157
167
|
|
|
158
168
|
|
|
159
|
-
def
|
|
169
|
+
def _extract_toc_pypdf(reader: PdfReader) -> List[Dict[str, Any]]:
|
|
160
170
|
try:
|
|
161
|
-
reader = PdfReader(pdf_path)
|
|
162
171
|
# Note: outline is a property, not a method.
|
|
163
172
|
toc_tree = reader.outline
|
|
164
173
|
toc_data = []
|
|
@@ -169,7 +178,10 @@ def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
|
|
|
169
178
|
# Using the reader directly is the only way to avoid
|
|
170
179
|
# the 'Destination' object has no attribute error
|
|
171
180
|
try:
|
|
172
|
-
|
|
181
|
+
page_num_raw = reader.get_destination_page_number(item)
|
|
182
|
+
# page_num_raw is 0-indexed. Use PageRef to store it.
|
|
183
|
+
ref = PageRef.from_index(page_num_raw)
|
|
184
|
+
page_num = ref.machine
|
|
173
185
|
except:
|
|
174
186
|
page_num = "N/A"
|
|
175
187
|
|
pdflinkcheck/cli.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
3
|
# src/pdflinkcheck/cli.py
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
import typer
|
|
5
6
|
from typing import Literal
|
|
6
7
|
from typer.models import OptionInfo
|
|
7
8
|
from rich.console import Console
|
|
8
9
|
from pathlib import Path
|
|
9
|
-
from pdflinkcheck.report import run_report_and_call_exports # Assuming core logic moves here
|
|
10
|
+
from pdflinkcheck.report import run_report_and_call_exports # Assuming core logic moves here
|
|
10
11
|
from typing import Dict, Optional, Union, List
|
|
11
12
|
import pyhabitat
|
|
12
13
|
import sys
|
|
@@ -20,18 +21,48 @@ from pdflinkcheck.io import get_first_pdf_in_cwd
|
|
|
20
21
|
|
|
21
22
|
console = Console() # to be above the tkinter check, in case of console.print
|
|
22
23
|
|
|
24
|
+
# Force Rich to always enable colors, even when running from a .pyz bundle
|
|
25
|
+
os.environ["FORCE_COLOR"] = "1"
|
|
26
|
+
# Optional but helpful for full terminal feature detection
|
|
27
|
+
os.environ["TERM"] = "xterm-256color"
|
|
28
|
+
|
|
23
29
|
app = typer.Typer(
|
|
24
30
|
name="pdflinkcheck",
|
|
25
31
|
help=f"A command-line tool for comprehensive PDF link analysis and reporting. (v{get_version_from_pyproject()})",
|
|
26
32
|
add_completion=False,
|
|
27
33
|
invoke_without_command = True,
|
|
28
34
|
no_args_is_help = False,
|
|
35
|
+
context_settings={"ignore_unknown_options": True,
|
|
36
|
+
"allow_extra_args": True,
|
|
37
|
+
"help_option_names": ["-h", "--help"]},
|
|
29
38
|
)
|
|
30
39
|
|
|
40
|
+
|
|
41
|
+
def debug_callback(value: bool):
|
|
42
|
+
#def debug_callback(ctx: typer.Context, value: bool):
|
|
43
|
+
if value:
|
|
44
|
+
# This runs IMMEDIATELY when --debug is parsed, even before --help
|
|
45
|
+
# 1. Access the list of all command-line arguments
|
|
46
|
+
full_command_list = sys.argv
|
|
47
|
+
# 2. Join the list into a single string to recreate the command
|
|
48
|
+
command_string = " ".join(full_command_list)
|
|
49
|
+
# 3. Print the command
|
|
50
|
+
typer.echo(f"command:\n{command_string}\n")
|
|
51
|
+
return value
|
|
52
|
+
|
|
53
|
+
if "--show-command" in sys.argv or "--debug" in sys.argv:
|
|
54
|
+
debug_callback(True)
|
|
55
|
+
|
|
31
56
|
@app.callback()
|
|
32
57
|
def main(ctx: typer.Context,
|
|
33
58
|
version: Optional[bool] = typer.Option(
|
|
34
59
|
None, "--version", is_flag=True, help="Show the version."
|
|
60
|
+
),
|
|
61
|
+
debug: bool = typer.Option(
|
|
62
|
+
False, "--debug", is_flag=True, help="Enable verbose debug logging and echo the full command string."
|
|
63
|
+
),
|
|
64
|
+
show_command: bool = typer.Option(
|
|
65
|
+
False, "--show-command", is_flag=True, help="Echo the full command string to the console before execution."
|
|
35
66
|
)
|
|
36
67
|
):
|
|
37
68
|
"""
|
|
@@ -44,13 +75,6 @@ def main(ctx: typer.Context,
|
|
|
44
75
|
if ctx.invoked_subcommand is None:
|
|
45
76
|
gui_command()
|
|
46
77
|
raise typer.Exit(code=0)
|
|
47
|
-
|
|
48
|
-
# 1. Access the list of all command-line arguments
|
|
49
|
-
full_command_list = sys.argv
|
|
50
|
-
# 2. Join the list into a single string to recreate the command
|
|
51
|
-
command_string = " ".join(full_command_list)
|
|
52
|
-
# 3. Print the command
|
|
53
|
-
typer.echo(f"command:\n{command_string}\n")
|
|
54
78
|
|
|
55
79
|
|
|
56
80
|
# help-tree() command: fragile, experimental, defaults to not being included.
|
|
@@ -89,7 +113,6 @@ def docs_command(
|
|
|
89
113
|
try:
|
|
90
114
|
license_path = files("pdflinkcheck.data") / "LICENSE"
|
|
91
115
|
license_text = license_path.read_text(encoding="utf-8")
|
|
92
|
-
|
|
93
116
|
console.print(f"\n[bold green]=== GNU AFFERO GENERAL PUBLIC LICENSE V3+ ===[/bold green]")
|
|
94
117
|
console.print(license_text, highlight=False)
|
|
95
118
|
|
|
@@ -127,24 +150,6 @@ def tools_command(
|
|
|
127
150
|
if clear_cache:
|
|
128
151
|
clear_all_caches()
|
|
129
152
|
|
|
130
|
-
"""
|
|
131
|
-
def validate_pdf_commands(
|
|
132
|
-
pdf_path: Optional[Path] = typer.Argument(
|
|
133
|
-
None,
|
|
134
|
-
exists=True,
|
|
135
|
-
file_okay=True,
|
|
136
|
-
dir_okay=False,
|
|
137
|
-
readable=True,
|
|
138
|
-
resolve_path=True,
|
|
139
|
-
help="Path to the PDF file to validate. If omitted, searches current directory."
|
|
140
|
-
),
|
|
141
|
-
pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
|
|
142
|
-
"pypdf",
|
|
143
|
-
"--library", "-l",
|
|
144
|
-
envvar="PDF_ENGINE",
|
|
145
|
-
help="PDF parsing engine: pypdf (pure Python) or pymupdf (faster, if available)"
|
|
146
|
-
),
|
|
147
|
-
"""
|
|
148
153
|
@app.command(name="analyze") # Added a command name 'analyze' for clarity
|
|
149
154
|
def analyze_pdf( # Renamed function for clarity
|
|
150
155
|
pdf_path: Optional[Path] = typer.Argument(
|
|
@@ -158,22 +163,21 @@ def analyze_pdf( # Renamed function for clarity
|
|
|
158
163
|
),
|
|
159
164
|
export_format: Optional[Literal["JSON", "TXT", "JSON,TXT", "NONE"]] = typer.Option(
|
|
160
165
|
"JSON,TXT",
|
|
161
|
-
"--
|
|
166
|
+
"--format","-f",
|
|
162
167
|
case_sensitive=False,
|
|
163
168
|
help="Export format. Use 'None' to suppress file export.",
|
|
164
169
|
),
|
|
165
|
-
max_links: int = typer.Option(
|
|
166
|
-
0,
|
|
167
|
-
"--max-links", "-m",
|
|
168
|
-
min=0,
|
|
169
|
-
help="Report brevity control. Use 0 to show all."
|
|
170
|
-
),
|
|
171
170
|
|
|
172
|
-
pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
|
|
171
|
+
pdf_library: Literal["auto","pdfium","pypdf", "pymupdf"] = typer.Option(
|
|
173
172
|
assess_default_pdf_library(),
|
|
174
|
-
"--
|
|
173
|
+
"--engine","-e",
|
|
175
174
|
envvar="PDF_ENGINE",
|
|
176
|
-
help="PDF parsing library. pypdf (pure Python)
|
|
175
|
+
help="PDF parsing library. pypdf (pure Python), pymupdf (fast, AGPL3+ licensed), pdfium (fast, BSD-3 licensed).",
|
|
176
|
+
),
|
|
177
|
+
print_bool: bool = typer.Option(
|
|
178
|
+
True,
|
|
179
|
+
"--print/--quiet",
|
|
180
|
+
help="Print or do not print the analysis and validation report to console."
|
|
177
181
|
)
|
|
178
182
|
):
|
|
179
183
|
"""
|
|
@@ -223,13 +227,14 @@ def analyze_pdf( # Renamed function for clarity
|
|
|
223
227
|
|
|
224
228
|
if not valid and "NONE" not in requested_formats:
|
|
225
229
|
typer.echo(f"Warning: No valid formats found in '{export_format}'. Supported: JSON, TXT.")
|
|
230
|
+
|
|
226
231
|
|
|
227
232
|
# The meat and potatoes
|
|
228
233
|
report_results = run_report_and_call_exports(
|
|
229
234
|
pdf_path=str(pdf_path),
|
|
230
|
-
max_links=max_links,
|
|
231
235
|
export_format = export_formats,
|
|
232
236
|
pdf_library = pdf_library,
|
|
237
|
+
print_bool = print_bool,
|
|
233
238
|
)
|
|
234
239
|
|
|
235
240
|
if not report_results or not report_results.get("data"):
|
|
@@ -238,14 +243,14 @@ def analyze_pdf( # Renamed function for clarity
|
|
|
238
243
|
|
|
239
244
|
validation_results = report_results["data"]["validation"]
|
|
240
245
|
# Optional: fail on broken links
|
|
241
|
-
|
|
246
|
+
broken_page_count = validation_results["summary-stats"]["broken-page"] + validation_results["summary-stats"]["broken-file"]
|
|
242
247
|
|
|
243
|
-
if
|
|
244
|
-
console.print(f"\n[bold yellow]Warning:[/bold yellow] {
|
|
245
|
-
else:
|
|
246
|
-
|
|
248
|
+
if broken_page_count > 0:
|
|
249
|
+
console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_page_count} broken link(s) found.")
|
|
250
|
+
#else:
|
|
251
|
+
# console.print(f"\n[bold green]Success:[/bold green] No broken links or TOC issues!\n")
|
|
247
252
|
|
|
248
|
-
raise typer.Exit(code=0 if
|
|
253
|
+
raise typer.Exit(code=0 if broken_page_count == 0 else 1)
|
|
249
254
|
|
|
250
255
|
@app.command(name="serve")
|
|
251
256
|
def serve(
|
|
@@ -265,7 +270,7 @@ def serve(
|
|
|
265
270
|
console.print(" → [yellow]Reload mode enabled[/yellow]")
|
|
266
271
|
|
|
267
272
|
# Import here to avoid slow imports on other commands
|
|
268
|
-
from pdflinkcheck.
|
|
273
|
+
from pdflinkcheck.stdlib_server_alt import ThreadedTCPServer, PDFLinkCheckHandler
|
|
269
274
|
import socketserver
|
|
270
275
|
|
|
271
276
|
try:
|
|
@@ -294,8 +299,6 @@ def gui_command(
|
|
|
294
299
|
"""
|
|
295
300
|
Launch tkinter-based GUI.
|
|
296
301
|
"""
|
|
297
|
-
|
|
298
|
-
# --- START FIX ---
|
|
299
302
|
assured_auto_close_value = 0
|
|
300
303
|
|
|
301
304
|
if isinstance(auto_close, OptionInfo):
|
|
@@ -307,11 +310,12 @@ def gui_command(
|
|
|
307
310
|
# Case 2: Called explicitly by Typer (pdflinkcheck gui -c 3000)
|
|
308
311
|
# Typer has successfully converted the command line argument, and auto_close is an int.
|
|
309
312
|
assured_auto_close_value = int(auto_close)
|
|
310
|
-
# --- END FIX ---
|
|
311
313
|
|
|
312
314
|
if not pyhabitat.tkinter_is_available():
|
|
313
315
|
_gui_failure_msg()
|
|
314
316
|
return
|
|
317
|
+
#from pdflinkcheck.gui import start_gui
|
|
318
|
+
#from pdflinkcheck.gui_alt import start_gui
|
|
315
319
|
from pdflinkcheck.gui import start_gui
|
|
316
320
|
start_gui(time_auto_close = assured_auto_close_value)
|
|
317
321
|
|
pdflinkcheck/data/LICENSE
CHANGED
|
@@ -1,24 +1,27 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
Some distributed binaries of this project include the PyMuPDF library, which is licensed under **AGPL‑3.0‑or‑later**.
|
|
5
|
-
Any binary that incorporates PyMuPDF is therefore distributed under **AGPL‑3.0‑or‑later**.
|
|
1
|
+
Some distributed binaries of this project include the PyMuPDF library, which is licensed under **AGPL-3.0-or-later**.
|
|
2
|
+
Any binary that incorporates PyMuPDF is therefore distributed under **AGPL-3.0-or-later**.
|
|
6
3
|
Other binaries use only the `pypdf` library and do not include PyMuPDF; these binaries are distributed under the **MIT License**.
|
|
7
4
|
|
|
8
|
-
For
|
|
5
|
+
For AGPL-licensed binaries, the complete corresponding source code must be made available to anyone who possesses a copy, upon request.
|
|
9
6
|
This obligation applies only to recipients of those binaries, and hosting the source code in GitHub Releases satisfies this requirement.
|
|
10
7
|
|
|
11
|
-
A binary becomes
|
|
12
|
-
The **source code of pdflinkcheck itself** remains licensed under the **MIT License**; only the distributed binary becomes
|
|
8
|
+
A binary becomes AGPL-licensed only when built with the optional `"full"` dependency group (as defined in `pyproject.toml` under `[project.optional-dependencies]`) or when PyMuPDF is otherwise included in the build environment.
|
|
9
|
+
The **source code of pdflinkcheck itself** remains licensed under the **MIT License**; only the distributed binary becomes AGPL-licensed when PyMuPDF is included.
|
|
13
10
|
|
|
14
|
-
Source code for each released version is available in the `
|
|
11
|
+
Source code for each released version is available in the `pdflinkcheck-VERSION.tar.gz` files on the project's GitHub Releases page.
|
|
15
12
|
|
|
16
|
-
|
|
13
|
+
Full-text copies of **LICENSE-MIT** and **LICENSE-AGPL3** are included in the root of the repository.
|
|
17
14
|
|
|
18
15
|
**Links:**
|
|
19
|
-
- Project source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck
|
|
20
|
-
- PyMuPDF source code: https://github.com/pymupdf/PyMuPDF
|
|
21
|
-
-
|
|
22
|
-
-
|
|
23
|
-
-
|
|
16
|
+
- Project source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck
|
|
17
|
+
- PyMuPDF source code: https://github.com/pymupdf/PyMuPDF
|
|
18
|
+
- pypdfium2 source code: https://github.com/pypdfium2-team/pypdfium2
|
|
19
|
+
- PDFium source code: https://pdfium.googlesource.com/pdfium/
|
|
20
|
+
- pypdf source code: https://github.com/py-pdf/pypdf
|
|
21
|
+
- AGPLv3 text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
|
|
22
|
+
- MIT License text: https://opensource.org/license/mit
|
|
23
|
+
- BSD-3 License text: https://opensource.org/license/bsd-3-clause
|
|
24
|
+
- Apache-v2 License text: https://opensource.org/license/apache-2-0
|
|
25
|
+
|
|
24
26
|
|
|
27
|
+
Copyright 2025 George Clayton Bennett
|
pdflinkcheck/data/README.md
CHANGED
|
@@ -4,7 +4,7 @@ A purpose-built tool for comprehensive analysis of hyperlinks and GoTo links wit
|
|
|
4
4
|
|
|
5
5
|
-----
|
|
6
6
|
|
|
7
|
-

|
|
8
8
|
|
|
9
9
|
-----
|
|
10
10
|
|
|
@@ -19,7 +19,7 @@ For the most user-typical experience, download the single-file binary matching y
|
|
|
19
19
|
| **File Type** | **Primary Use Case** | **Recommended Launch Method** |
|
|
20
20
|
| :--- | :--- | :--- |
|
|
21
21
|
| **Executable (.exe, .elf)** | **GUI** | Double-click the file. |
|
|
22
|
-
| **PYZ (Python Zip App)** | **CLI** or **GUI** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz --help` |
|
|
22
|
+
| **PYZ (Python Zip App)** | **CLI** or **GUI** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz --help` |
|
|
23
23
|
|
|
24
24
|
### Installation via pipx
|
|
25
25
|
|
|
@@ -53,7 +53,7 @@ Ways to launch the GUI interface:
|
|
|
53
53
|
The core functionality is accessed via the `analyze` command.
|
|
54
54
|
|
|
55
55
|
`pdflinkcheck --help`:
|
|
56
|
-

|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
See the Help Tree by unlocking the help-tree CLI command, using the DEV_TYPER_HELP_TREE env var.
|
|
@@ -63,7 +63,7 @@ DEV_TYPER_HELP_TREE=1 pdflinkcheck help-tree` # bash
|
|
|
63
63
|
$env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree # PowerShell
|
|
64
64
|
```
|
|
65
65
|
|
|
66
|
-

|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
|
|
@@ -84,7 +84,6 @@ $env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree` # PowerShell
|
|
|
84
84
|
|`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
|
|
85
85
|
|`--engine / -e`|Select engine: `auto`, `pdfium`, `pymupdf`, or `pypdf`.|`pypdf`|
|
|
86
86
|
|`--format / -f`|Export to `JSON`, `TXT`, `JSON,TXT`, or `None` to suppress file output.|`JSON,TXT`|
|
|
87
|
-
|`--max-links / -m`|Maximum links to display per section. Use `0` for all.|`0`|
|
|
88
87
|
|
|
89
88
|
### `gui` Command Options
|
|
90
89
|
|
|
@@ -98,9 +97,6 @@ $env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree` # PowerShell
|
|
|
98
97
|
# Analyze a document, show all links, and save the report as JSON and TXT
|
|
99
98
|
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --format JSON,TXT
|
|
100
99
|
|
|
101
|
-
# Analyze a document but keep the print block short, showing only the first 10 links for each type
|
|
102
|
-
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
|
|
103
|
-
|
|
104
100
|
# Show the GUI for only a moment, like in a build check
|
|
105
101
|
pdflinkcheck gui --auto-close 3000
|
|
106
102
|
|
|
@@ -112,22 +108,23 @@ pdflinkcheck docs --license --readme
|
|
|
112
108
|
|
|
113
109
|
## 📦 Library Access (Advanced)
|
|
114
110
|
|
|
115
|
-
For developers importing `pdflinkcheck` into other Python projects, the core analysis functions are exposed directly in the root namespace
|
|
111
|
+
For developers importing `pdflinkcheck` into other Python projects, the core analysis functions are exposed directly in the root namespace. The various `analyze_pdf_*` functions each use a different library to extract the target PDF's internal TOC, external links, and metadata.
|
|
116
112
|
|
|
117
|
-
|**Function**|**Description**|
|
|
118
|
-
|
|
119
|
-
|`run_report()
|
|
120
|
-
|`
|
|
121
|
-
|`
|
|
122
|
-
|`
|
|
123
|
-
|`extract_toc_pymupdf()`|Function to extract the PDF's internal Table of Contents (bookmarks/outline), using the pypdf library.|
|
|
113
|
+
|**Function**|**Library**|**Description**|
|
|
114
|
+
|---|---|---|
|
|
115
|
+
|`run_report()`|pdflinkcheck | **(Primary function)** Performs the full analysis, prints to console, and handles file export.|
|
|
116
|
+
|`analyze_pdf_pdfium()`| pypdfium2 | Fast, ~10 mb, Permissively licensed |
|
|
117
|
+
|`analyze_pdf_pymupdf()`| PyMuPDF | Fast, ~30 mb, AGPL3+ licensed |
|
|
118
|
+
|`analyze_pdf_pypdf()`| pypdf library | Slow, ~2 mb, Permissively licensed |
|
|
124
119
|
|
|
125
120
|
Example:
|
|
126
121
|
|
|
127
122
|
```python
|
|
128
|
-
from pdflinkcheck
|
|
129
|
-
|
|
130
|
-
|
|
123
|
+
from pdflinkcheck import ( run_report,
|
|
124
|
+
analyze_pdf_pymupdf,
|
|
125
|
+
analyze_pdf_pypdf,
|
|
126
|
+
analyze_pdf_pdfium,
|
|
127
|
+
)
|
|
131
128
|
|
|
132
129
|
file = "document1.pdf"
|
|
133
130
|
report_data = run_report(file)
|
|
@@ -194,24 +191,24 @@ Termux compatibility is important in the modern age, because Android devices are
|
|
|
194
191
|
Android is the most common operating system in the Global South.
|
|
195
192
|
We aim to produce stable software that can do the most possible good.
|
|
196
193
|
|
|
197
|
-
Now `pdflinkcheck` can run on Termux by using the `pypdf` engine.
|
|
194
|
+
Now `pdflinkcheck` can run on Termux by using the `pypdf` engine and the `pdfium` engine.
|
|
198
195
|
Benefits:
|
|
199
196
|
- `pypdf`-only artifacts, to reduce size to about 6% compared to artifacts that include `PyMuPDF`.
|
|
200
197
|
- Web-stack GUI as an alternative to the Tkinter GUI, which can be run locally on Termux or as a web app.
|
|
201
198
|
|
|
202
199
|
|
|
203
200
|
### PDF Library Selection
|
|
204
|
-
At long last, `PyMuPDF` is an optional dependency. All testing comparing `
|
|
201
|
+
At long last, `PyMuPDF` is an optional dependency. All testing comparing `pypdf` and `PyMuPDF` has shown identical validation performance. However, `PyMuPDF` is much faster. The benefit of `pypdf` is the small size of packages and cross-platform compatibility. We have recently added a PDFium option, which circumvents the AGPL3+.
|
|
205
202
|
|
|
206
203
|
Expect that all binaries and artifacts contain PyMuPDF, unless they are built on Android. The GUI and CLI interfaces both allow selection of the library; if PyMuPDF is selected but is not available, the user will be warned.
|
|
207
204
|
|
|
208
205
|
To install the complete version use one of these options:
|
|
209
206
|
|
|
210
207
|
```bash
|
|
211
|
-
pip install "pdflinkcheck[
|
|
212
|
-
pipx install "pdflinkcheck[
|
|
213
|
-
uv tool install "pdflinkcheck[
|
|
214
|
-
uv add "pdflinkcheck[
|
|
208
|
+
pip install "pdflinkcheck[mupdf]"
|
|
209
|
+
pipx install "pdflinkcheck[pdfium]"
|
|
210
|
+
uv tool install "pdflinkcheck[pdfium]"
|
|
211
|
+
uv add "pdflinkcheck[pdfium]"
|
|
215
212
|
```
|
|
216
213
|
|
|
217
214
|
---
|
|
@@ -271,6 +268,7 @@ The source code of pdflinkcheck itself remains licensed under the **MIT License*
|
|
|
271
268
|
Links:
|
|
272
269
|
- Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
|
|
273
270
|
- PyMuPDF source code: https://github.com/pymupdf/PyMuPDF/
|
|
271
|
+
- pypdfium2 source code: https://github.com/pypdfium2-team/pypdfium2
|
|
274
272
|
- pypdf source code: https://github.com/py-pdf/pypdf/
|
|
275
273
|
- AGPLv3 text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
|
|
276
274
|
- MIT License text: https://opensource.org/license/mit
|
pdflinkcheck/data/pyproject.toml
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "pdflinkcheck"
|
|
3
|
-
version = "1.
|
|
3
|
+
version = "1.2.29"
|
|
4
4
|
description = "A purpose-built PDF link analysis and reporting tool with GUI and CLI."
|
|
5
5
|
readme = "README.md"
|
|
6
|
-
requires-python = ">=3.
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
7
|
dependencies = [
|
|
8
|
-
"pyhabitat>=1.
|
|
8
|
+
"pyhabitat>=1.1.5",
|
|
9
9
|
"pypdf>=6.4.2",
|
|
10
10
|
"rich>=14.2.0",
|
|
11
11
|
"typer>=0.20.0",
|
|
@@ -17,13 +17,12 @@ license-files = ["LICENSE", "LICENSE-MIT", "LICENSE-AGPL3"]
|
|
|
17
17
|
classifiers=[
|
|
18
18
|
"Programming Language :: Python :: 3",
|
|
19
19
|
"Programming Language :: Python :: 3 :: Only",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
20
21
|
"Programming Language :: Python :: 3.10",
|
|
21
22
|
"Programming Language :: Python :: 3.11",
|
|
22
23
|
"Programming Language :: Python :: 3.12",
|
|
23
24
|
"Programming Language :: Python :: 3.13",
|
|
24
25
|
"Programming Language :: Python :: 3.14",
|
|
25
|
-
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
|
26
|
-
"License :: OSI Approved :: MIT License",
|
|
27
26
|
"Operating System :: OS Independent",
|
|
28
27
|
"Intended Audience :: End Users/Desktop",
|
|
29
28
|
"Intended Audience :: Developers", # library and documentation
|
|
@@ -54,42 +53,34 @@ Repository = "https://github.com/city-of-memphis-wastewater/pdflinkcheck"
|
|
|
54
53
|
[project.scripts]
|
|
55
54
|
pdflinkcheck = "pdflinkcheck.cli:app"
|
|
56
55
|
|
|
56
|
+
|
|
57
57
|
[project.optional-dependencies]
|
|
58
|
-
# This allows users to do: pip install pdflinkcheck[full]
|
|
59
58
|
# If you choose to include PyMuPDF, you must comply with the AGPL3
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
#
|
|
65
|
-
# "
|
|
66
|
-
# "kivymd",
|
|
59
|
+
mupdf = ["pymupdf>=1.24.0,<2.0.0"] # fails on termux
|
|
60
|
+
pdfium = ["pypdfium2>=5.2.0,<6.0.0"]
|
|
61
|
+
full = ["pymupdf>=1.24.0,<2.0.0", "pypdfium2>=5.2.0,<6.0.0"]
|
|
62
|
+
|
|
63
|
+
#rust = [
|
|
64
|
+
# "pdflinkcheck-rust>=0.1.13"
|
|
67
65
|
#]
|
|
68
66
|
|
|
69
67
|
[dependency-groups]
|
|
70
68
|
dev = [
|
|
71
69
|
"build>=1.3.0",
|
|
72
|
-
"pyinstaller>=6.17.0",
|
|
70
|
+
"pyinstaller>=6.17.0 ; platform_system == 'Linux' and platform_machine != 'aarch64'", # to avoid on termux
|
|
73
71
|
"shiv>=1.0.8",
|
|
74
72
|
"ruff>=0.7.0 ; platform_system == 'Linux' and platform_machine != 'aarch64'", # to avoid on termux
|
|
75
73
|
"pytest>=8.0.0",
|
|
76
74
|
"pytest-cov>=4.1.0",
|
|
77
75
|
]
|
|
78
76
|
|
|
77
|
+
#[build-system]
|
|
78
|
+
#requires = ["uv_build"]
|
|
79
|
+
#build-backend = "uv_build"
|
|
79
80
|
[build-system]
|
|
80
|
-
requires = ["
|
|
81
|
-
build-backend = "
|
|
81
|
+
requires = ["setuptools>=64", "wheel"]
|
|
82
|
+
build-backend = "setuptools.build_meta"
|
|
82
83
|
|
|
83
84
|
[tool.uv.sources]
|
|
84
85
|
pdflinkcheck = { path = "src/pdflinkcheck" }
|
|
85
|
-
kivymd = { git = "https://github.com/kivymd/KivyMD.git" }
|
|
86
|
-
|
|
87
|
-
# https://docs.astral.sh/uv/concepts/preview/#available-preview-features
|
|
88
|
-
#[tool.uv]
|
|
89
|
-
#preview = true
|
|
90
|
-
|
|
91
|
-
# uv handles the data path automatically
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
86
|
|