pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +51 -13
- pdflinkcheck/{analyze.py → analyze_pymupdf.py} +54 -224
- pdflinkcheck/analyze_pypdf.py +184 -0
- pdflinkcheck/analyze_pypdf_v2.py +218 -0
- pdflinkcheck/cli.py +238 -39
- pdflinkcheck/data/LICENSE +5 -24
- pdflinkcheck/data/README.md +278 -0
- pdflinkcheck/data/pyproject.toml +98 -0
- pdflinkcheck/datacopy.py +60 -0
- pdflinkcheck/dev.py +109 -0
- pdflinkcheck/gui.py +371 -74
- pdflinkcheck/io.py +118 -11
- pdflinkcheck/report.py +282 -0
- pdflinkcheck/stdlib_server.py +176 -0
- pdflinkcheck/validate.py +382 -0
- pdflinkcheck/version_info.py +83 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/METADATA +127 -71
- pdflinkcheck-1.1.73.dist-info/RECORD +21 -0
- pdflinkcheck-1.1.73.dist-info/WHEEL +4 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/entry_points.txt +1 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/licenses/LICENSE +5 -24
- pdflinkcheck/remnants.py +0 -142
- pdflinkcheck-1.1.47.dist-info/RECORD +0 -13
- pdflinkcheck-1.1.47.dist-info/WHEEL +0 -5
- pdflinkcheck-1.1.47.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# src/pdflinkcheck/analyze_pypdf_v2.py
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Dict, Any, List
|
|
6
|
+
|
|
7
|
+
from pypdf import PdfReader
|
|
8
|
+
from pypdf.generic import Destination, NameObject, IndirectObject
|
|
9
|
+
|
|
10
|
+
from pdflinkcheck.report import run_report
|
|
11
|
+
#from pdflinkcheck.validate import run_validation
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
Inspect target PDF for both URI links and GoTo links, using only pypdf (no PyMuPDF/Fitz).
|
|
15
|
+
Fully fixed and improved version as of December 2025 (compatible with pypdf >= 4.0).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def get_anchor_text_pypdf(page, rect, h_tol: float = 10, v_tol: float = 3) -> str:
    """
    Return the visible text drawn inside (or just around) a link rectangle.

    The page's text is extracted with a visitor callback; every text chunk
    whose approximate centre falls inside the (tolerance-padded) rectangle
    is collected, in drawing order.

    Args:
        page: Object exposing ``extract_text(visitor_text=...)``.
        rect: Link rectangle as four numbers ``[x0, y0, x1, y1]``. Either
            corner order is accepted.
        h_tol: Horizontal padding, in points, added on both sides of the rect.
        v_tol: Vertical padding, in points. Kept smaller than ``h_tol`` by
            default so text on the lines above/below is not picked up.

    Returns:
        The whitespace-normalised text, ``"Graphic/Empty Link"`` when no text
        chunk matched, or ``"N/A: Missing Rect"`` when ``rect`` is empty or
        is not four numbers.
    """
    if not rect:
        return "N/A: Missing Rect"

    # A malformed /Rect (too few entries, non-numeric entries) is reported
    # with the same sentinel as a missing one instead of raising.
    try:
        x0, y0, x1, y1 = (float(value) for value in list(rect)[:4])
    except (TypeError, ValueError):
        return "N/A: Missing Rect"

    # Some PDF generators write Rect as [x_max, y_max, x_min, y_min].
    x_min, x_max = sorted((x0, x1))
    y_min, y_max = sorted((y0, y1))

    # Padded bounds are loop-invariant, so compute them once.
    left, right = x_min - h_tol, x_max + h_tol
    bottom, top = y_min - v_tol, y_max + v_tol

    parts: List[str] = []

    def visitor_body(text: str, cm, tm, font_dict, font_size):
        if not text.strip():
            return

        # Fall back to a nominal size when the font size is missing/zero.
        actual_font_size = font_size if font_size else 10

        # tm[4], tm[5] are taken as the text insertion point; shift up and to
        # the right by a fraction of the font size to approximate the centre
        # of the first glyph.
        # NOTE(review): cm is not applied, so positions may be off on pages
        # whose content uses a non-identity transformation matrix - confirm.
        char_center_x = tm[4] + (actual_font_size / 4)
        char_center_y = tm[5] + (actual_font_size / 3)

        if left <= char_center_x <= right and bottom <= char_center_y <= top:
            parts.append(text)

    # The return value is ignored; only the visitor's side effect is used.
    page.extract_text(visitor_text=visitor_body)

    cleaned = " ".join("".join(parts).split())
    return cleaned if cleaned else "Graphic/Empty Link"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def resolve_pypdf_destination(reader: "PdfReader", dest) -> str:
    """
    Resolve a link destination (/Dest or /A /D) to a 1-based page number string.

    Handles the three shapes a destination can take:

    * a named destination (name or string), looked up in
      ``reader.named_destinations``;
    * an explicit destination array whose first entry references the page,
      matched against ``reader.pages``;
    * an already-built destination object, passed to
      ``reader.get_destination_page_number``.

    Args:
        reader: The open reader for the document containing the link.
        dest: The raw destination value taken from the annotation or action.

    Returns:
        The page number as a string, ``"N/A"`` when ``dest`` is None, or
        ``"Unknown/Error"`` when the destination cannot be resolved. Never
        raises: any failure is reported as ``"Unknown/Error"``.
    """
    try:
        if dest is None:
            return "N/A"

        # Follow an indirect reference; direct objects are left as they are.
        if hasattr(dest, "get_object"):
            dest = dest.get_object()

        # Named destination: swap the name for the destination it stands for.
        if isinstance(dest, (str, bytes)):
            name = dest.decode("utf-8", errors="replace") if isinstance(dest, bytes) else str(dest)
            dest = reader.named_destinations.get(name)
            if dest is None:
                return "Unknown/Error"

        # Explicit destination array: the first entry identifies the page.
        if isinstance(dest, list):
            if not dest:
                return "Unknown/Error"
            page_ref = dest[0]
            for index, page in enumerate(reader.pages):
                if page.indirect_reference == page_ref:
                    return str(index + 1)
            return "Unknown/Error"

        # Anything else is assumed to be a destination object pypdf can map.
        return str(reader.get_destination_page_number(dest) + 1)

    except Exception:
        # Deliberate best-effort: a single bad link must not abort the report.
        return "Unknown/Error"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def extract_links_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
    """
    Extract every link annotation (URI, internal GoTo, remote GoToR) using pure pypdf.

    Args:
        pdf_path: Path of the PDF to read; passed straight to ``PdfReader``.

    Returns:
        One dict per ``/Link`` annotation, in page order. Every dict has
        ``page`` (1-based), ``rect`` (list or None), ``link_text``, ``type``
        and ``target``. Depending on the type it also carries ``url``
        (external), ``remote_file`` (remote) or ``destination_page``
        (internal). A link matching none of the three keeps the defaults
        ``"Other Action"`` / ``"Unknown"``.

    Raises:
        Whatever ``PdfReader`` raises when the file cannot be opened or parsed.
    """
    reader = PdfReader(pdf_path)

    all_links: List[Dict[str, Any]] = []

    def _deref(obj):
        # A plain .get() lookup may hand back an indirect reference
        # (depends on the pypdf version); follow it when present.
        return obj.get_object() if hasattr(obj, "get_object") else obj

    for page_num, page in enumerate(reader.pages, start=1):
        if "/Annots" not in page:
            continue

        for annot_ref in page["/Annots"]:
            try:
                annot = annot_ref.get_object()
                if annot.get("/Subtype") != "/Link":
                    continue
                rect = _deref(annot.get("/Rect"))
                action = _deref(annot.get("/A"))
                direct_dest = annot.get("/Dest")
            except Exception:
                continue  # Corrupted annotation – skip

            # Only dictionary actions and sequence rects are usable below.
            if not isinstance(action, dict):
                action = None
            if not isinstance(rect, (list, tuple)):
                rect = None

            link_dict: Dict[str, Any] = {
                "page": page_num,
                "rect": list(rect) if rect else None,
                "link_text": get_anchor_text_pypdf(page, rect),
                "type": "Other Action",
                "target": "Unknown",
            }

            uri = _deref(action.get("/URI")) if action else None

            # External URI link
            if uri:
                link_dict.update({
                    "type": "External (URI)",
                    "url": str(uri),
                    "target": str(uri),
                })

            # Remote GoToR (links to another PDF file). This must be tested
            # before the internal case: a GoToR action also carries /D, so
            # checking /D first would report it as an internal link.
            elif action and action.get("/S") == "/GoToR":
                file_spec = _deref(action.get("/F"))
                # A file specification may be a dictionary; prefer its
                # file-name entries over the dictionary's repr.
                if isinstance(file_spec, dict):
                    file_spec = file_spec.get("/UF") or file_spec.get("/F")
                remote_file = str(file_spec) if file_spec else "Unknown File"
                remote_dest = action.get("/D")
                remote_target = f"File: {remote_file}"
                if remote_dest:
                    remote_target += f" → Dest: {remote_dest}"
                link_dict.update({
                    "type": "Remote (GoToR)",
                    "remote_file": remote_file,
                    "target": remote_target,
                })

            # Internal GoTo – can be /Dest directly or inside /A /D
            elif direct_dest or (action and action.get("/D")):
                dest = direct_dest or action["/D"]
                target_page = resolve_pypdf_destination(reader, dest)
                link_dict.update({
                    "type": "Internal (GoTo/Dest)",
                    "destination_page": target_page,
                    "target": f"Page {target_page}",
                })

            all_links.append(link_dict)

    return all_links
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def extract_toc_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
|
|
171
|
+
"""
|
|
172
|
+
Extract the PDF outline (bookmarks / table of contents) using pypdf.
|
|
173
|
+
Correctly handles nested structure and uses the official page resolution method.
|
|
174
|
+
"""
|
|
175
|
+
try:
|
|
176
|
+
reader = PdfReader(pdf_path)
|
|
177
|
+
outline = reader.outline
|
|
178
|
+
if not outline:
|
|
179
|
+
return []
|
|
180
|
+
|
|
181
|
+
toc_data: List[Dict[str, Any]] = []
|
|
182
|
+
|
|
183
|
+
def flatten_outline(items: List, level: int = 1):
|
|
184
|
+
for item in items:
|
|
185
|
+
if isinstance(item, Destination):
|
|
186
|
+
try:
|
|
187
|
+
page_num = reader.get_destination_page_number(item) + 1
|
|
188
|
+
except Exception:
|
|
189
|
+
page_num = "N/A"
|
|
190
|
+
|
|
191
|
+
toc_data.append({
|
|
192
|
+
"level": level,
|
|
193
|
+
"title": item.title or "(Untitled)",
|
|
194
|
+
"target_page": page_num,
|
|
195
|
+
})
|
|
196
|
+
elif isinstance(item, list):
|
|
197
|
+
# Recurse into child entries
|
|
198
|
+
flatten_outline(item, level + 1)
|
|
199
|
+
|
|
200
|
+
flatten_outline(outline)
|
|
201
|
+
return toc_data
|
|
202
|
+
|
|
203
|
+
except Exception as e:
|
|
204
|
+
print(f"TOC extraction error: {e}", file=sys.stderr)
|
|
205
|
+
return []
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def call_stable():
    """
    Entry point for command-line execution or integration with reporting module.

    Runs the report with pypdf selected as the PDF backend. No validation
    step is run here.
    """
    # The keyword is spelled ``pdf_library`` to match the other run_report()
    # call sites in the package's CLI.
    # NOTE(review): run_report's signature is not visible from this file -
    # confirm the parameter name.
    run_report(pdf_library="pypdf")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
if __name__ == "__main__":
|
|
217
|
+
call_stable()
|
|
218
|
+
# pypdf version updates
|
pdflinkcheck/cli.py
CHANGED
|
@@ -1,25 +1,31 @@
|
|
|
1
|
-
# src/
|
|
1
|
+
# src/pdflinkcheck/cli.py
|
|
2
2
|
import typer
|
|
3
|
+
from typing import Literal
|
|
3
4
|
from typer.models import OptionInfo
|
|
4
5
|
from rich.console import Console
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from pdflinkcheck.
|
|
7
|
-
from typing import Dict
|
|
7
|
+
from pdflinkcheck.report import run_report # Assuming core logic moves here
|
|
8
|
+
from typing import Dict, Optional, Union, List
|
|
8
9
|
import pyhabitat
|
|
9
10
|
import sys
|
|
11
|
+
import os
|
|
10
12
|
from importlib.resources import files
|
|
11
13
|
|
|
14
|
+
from pdflinkcheck.version_info import get_version_from_pyproject
|
|
15
|
+
from pdflinkcheck.validate import run_validation
|
|
16
|
+
|
|
12
17
|
|
|
13
18
|
console = Console() # to be above the tkinter check, in case of console.print
|
|
14
19
|
|
|
15
20
|
app = typer.Typer(
|
|
16
21
|
name="pdflinkcheck",
|
|
17
|
-
help="A command-line tool for comprehensive PDF link analysis and reporting.",
|
|
22
|
+
help=f"A command-line tool for comprehensive PDF link analysis and reporting. (v{get_version_from_pyproject()})",
|
|
18
23
|
add_completion=False,
|
|
19
24
|
invoke_without_command = True,
|
|
20
25
|
no_args_is_help = False,
|
|
21
26
|
)
|
|
22
27
|
|
|
28
|
+
|
|
23
29
|
@app.callback()
|
|
24
30
|
def main(ctx: typer.Context):
|
|
25
31
|
"""
|
|
@@ -36,27 +42,62 @@ def main(ctx: typer.Context):
|
|
|
36
42
|
command_string = " ".join(full_command_list)
|
|
37
43
|
# 3. Print the command
|
|
38
44
|
typer.echo(f"command:\n{command_string}\n")
|
|
39
|
-
|
|
40
45
|
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
|
|
47
|
+
# help-tree() command: fragile, experimental, defaults to not being included.
|
|
48
|
+
if os.environ.get('DEV_TYPER_HELP_TREE',0) in ('true','1'):
|
|
49
|
+
from pdflinkcheck.dev import add_typer_help_tree
|
|
50
|
+
add_typer_help_tree(
|
|
51
|
+
app = app,
|
|
52
|
+
console = console)
|
|
53
|
+
|
|
54
|
+
@app.command(name="docs", help="Show the docs for this software.")
def docs_command(
    license: Optional[bool] = typer.Option(
        None, "--license", "-l", help="Show the full AGPLv3 license text."
    ),
    readme: Optional[bool] = typer.Option(
        None, "--readme", "-r", help="Show the full README.md content."
    ),
):
    """
    Print the embedded LICENSE and/or README.md shipped in pdflinkcheck.data.

    With neither flag set, a hint is printed and the command returns.
    Otherwise each requested document is printed (license first) and the
    command exits with code 0, or with code 1 if a document is missing.
    """
    if not (license or readme):
        console.print("[yellow]Please use either the --license or --readme flag.[/yellow]")
        return

    def _print_embedded(filename: str, banner: str, not_found_msg: str) -> None:
        # Read one packaged data file and print it under its banner.
        try:
            body = (files("pdflinkcheck.data") / filename).read_text(encoding="utf-8")
            console.print(banner)
            console.print(body, highlight=False)
        except FileNotFoundError:
            console.print(not_found_msg)
            raise typer.Exit(code=1)

    if license:
        _print_embedded(
            "LICENSE",
            "\n[bold green]=== GNU AFFERO GENERAL PUBLIC LICENSE V3+ ===[/bold green]",
            "[bold red]Error:[/bold red] The embedded license file could not be found.",
        )

    if readme:
        _print_embedded(
            "README.md",
            "\n[bold green]=== pdflinkcheck README ===[/bold green]",
            "[bold red]Error:[/bold red] The embedded README.md file could not be found.",
        )

    # At least one document was printed.
    raise typer.Exit(code=0)
|
|
61
102
|
|
|
62
103
|
@app.command(name="analyze") # Added a command name 'analyze' for clarity
|
|
@@ -69,37 +110,194 @@ def analyze_pdf( # Renamed function for clarity
|
|
|
69
110
|
readable=True,
|
|
70
111
|
resolve_path=True,
|
|
71
112
|
help="The path to the PDF file to analyze."
|
|
72
|
-
),
|
|
73
|
-
export_format:
|
|
113
|
+
),
|
|
114
|
+
export_format: Optional[Literal["JSON", "TXT", "JSON,TXT", "NONE"]] = typer.Option(
|
|
115
|
+
"JSON",
|
|
116
|
+
"--export-format","-e",
|
|
117
|
+
case_sensitive=False,
|
|
118
|
+
help="Export format. Use 'None' to suppress file export.",
|
|
74
119
|
),
|
|
75
120
|
max_links: int = typer.Option(
|
|
76
121
|
0,
|
|
77
|
-
"--max-links",
|
|
122
|
+
"--max-links", "-m",
|
|
78
123
|
min=0,
|
|
79
|
-
help="
|
|
124
|
+
help="Report brevity control. Use 0 to show all."
|
|
80
125
|
),
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
"
|
|
84
|
-
|
|
126
|
+
|
|
127
|
+
pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
|
|
128
|
+
"pypdf",#"pymupdf",
|
|
129
|
+
"--pdf-library","-p",
|
|
130
|
+
envvar="PDF_ENGINE",
|
|
131
|
+
help="Select PDF parsing library, pymupdf or pypdf.",
|
|
85
132
|
)
|
|
86
133
|
):
|
|
87
134
|
"""
|
|
88
|
-
Analyzes the specified PDF file for all internal, external, and unlinked
|
|
135
|
+
Analyzes the specified PDF file for all internal, external, and unlinked references.
|
|
136
|
+
|
|
137
|
+
Checks:
|
|
138
|
+
• Internal GoTo links point to valid pages
|
|
139
|
+
• Remote GoToR links point to existing files
|
|
140
|
+
• TOC bookmarks target valid pages
|
|
141
|
+
"""
|
|
142
|
+
|
|
143
|
+
"""
|
|
144
|
+
Fun Typer fact:
|
|
145
|
+
Overriding Order
|
|
146
|
+
Environment variables sit in the middle of the "priority" hierarchy:
|
|
147
|
+
|
|
148
|
+
CLI Flag: (Highest priority) analyze -p pypdf will always win.
|
|
149
|
+
|
|
150
|
+
Env Var: If no flag is present, it checks PDF_ENGINE.
|
|
151
|
+
|
|
152
|
+
Code Default: (Lowest priority) It falls back to "pypdf" as defined in your typer.Option.
|
|
89
153
|
"""
|
|
90
|
-
|
|
91
|
-
|
|
154
|
+
|
|
155
|
+
VALID_FORMATS = ("JSON") # extend later
|
|
156
|
+
requested_formats = [fmt.strip().upper() for fmt in export_format.split(",")]
|
|
157
|
+
if "NONE" in requested_formats or not export_format.strip() or export_format == "0":
|
|
158
|
+
export_formats = ""
|
|
159
|
+
else:
|
|
160
|
+
# Filter for valid ones: ("JSON", "TXT")
|
|
161
|
+
# This allows "JSON,TXT" to become "JSONTXT" which your run_report logic can handle
|
|
162
|
+
valid = [f for f in requested_formats if f in ("JSON", "TXT")]
|
|
163
|
+
export_formats = "".join(valid)
|
|
164
|
+
|
|
165
|
+
if not valid and "NONE" not in requested_formats:
|
|
166
|
+
typer.echo(f"Warning: No valid formats found in '{export_format}'. Supported: JSON, TXT.")
|
|
167
|
+
|
|
168
|
+
run_report(
|
|
92
169
|
pdf_path=str(pdf_path),
|
|
93
|
-
check_remnants=check_remnants,
|
|
94
170
|
max_links=max_links,
|
|
95
|
-
export_format =
|
|
171
|
+
export_format = export_formats,
|
|
172
|
+
pdf_library = pdf_library,
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
@app.command(name="validate")
def validate_pdf(
    pdf_path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        resolve_path=True,
        help="Path to the PDF file to validate. If omitted, searches current directory."
    ),
    export: bool = typer.Option(
        True,
        # Paired flag so the default-on export can actually be switched off.
        "--export/--no-export",
        help = "JSON export for validation check."
    ),
    pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
        "pypdf",
        "--library", "-l",
        envvar="PDF_ENGINE",
        help="PDF parsing engine: pypdf (pure Python) or pymupdf (faster, if available)"
    ),
    fail_on_broken: bool = typer.Option(
        False,
        "--fail",
        help="Exit with code 1 if any broken links are found (useful for CI)"
    )
):
    """
    Validate internal, remote, and TOC links in a PDF.

    1. Call the run_report() function, like calling the 'analyze' CLI command.
    2. Inspects the results from run_report():
        - Are referenced files available?
        - Are the page numbers referenced by GoTo links within the length of the document?
    """
    from pdflinkcheck.io import get_first_pdf_in_cwd

    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
        if pdf_path is None:
            console.print("[red]Error: No PDF file provided and none found in current directory.[/red]")
            raise typer.Exit(code=1)
        # The helper's return type is not visible here; normalise so that
        # .name works whether it returned a str or a Path.
        pdf_path = Path(pdf_path)
        console.print(f"[dim]No file specified — using: {pdf_path.name}[/dim]")

    pdf_path_str = str(pdf_path)

    console.print(f"[bold]Validating links in:[/bold] {pdf_path.name}")
    console.print(f"[bold]Using engine:[/bold] {pdf_library}\n")

    # Step 1: Run analysis (quietly, no file export)
    report = run_report(
        pdf_path=pdf_path_str,
        max_links=0,
        export_format="",
        pdf_library=pdf_library,
        print_bool=False
    )

    if not report or not report.get("data"):
        console.print("[yellow]No links or TOC found — nothing to validate.[/yellow]")
        raise typer.Exit(code=0)

    # Step 2: Run validation
    validation_results = run_validation(
        report_results=report,
        pdf_path=pdf_path_str,
        pdf_library=pdf_library,
        export_json=export,
        print_bool=True
    )

    stats = validation_results["summary-stats"]
    broken_count = stats["broken-page"] + stats["broken-file"]

    if broken_count == 0:
        console.print("\n[bold green]Success:[/bold green] No broken links or TOC issues!")
        raise typer.Exit(code=0)

    if fail_on_broken:
        console.print(f"\n[bold red]Validation failed:[/bold red] {broken_count} broken link(s) found.")
        raise typer.Exit(code=1)

    # Without --fail, broken links are reported as a warning only; the exit
    # code stays 0 so the flag is what decides whether CI fails.
    console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_count} broken link(s) found.")
    raise typer.Exit(code=0)
|
|
258
|
+
|
|
259
|
+
@app.command(name="serve")
def serve(
    host: str = typer.Option("0.0.0.0", "--host", "-h", help="Host to bind (use 0.0.0.0 for network access)"),
    port: int = typer.Option(8000, "--port", "-p", help="Port to listen on"),
    reload: bool = typer.Option(False, "--reload", help="Auto-reload on code changes (dev only)"),
):
    """
    Start the built-in web server for uploading and analyzing PDFs in the browser.

    Pure stdlib — no extra dependencies. Works great on Termux!
    """
    import errno

    # A wildcard bind address is not a URL a browser can open, so advertise
    # the loopback address instead.
    # NOTE(review): the default host binds every interface, which exposes the
    # upload endpoint to the local network - consider defaulting to 127.0.0.1.
    display_host = "127.0.0.1" if host in ("0.0.0.0", "::", "") else host

    console.print("[bold green]Starting pdflinkcheck web server[/bold green]")
    console.print(f" → Open your browser at: [bold blue]http://{display_host}:{port}[/bold blue]")
    console.print(" → Upload a PDF to analyze links and TOC")
    if reload:
        # The flag is accepted but nothing in this command acts on it.
        console.print(" → [yellow]--reload is not implemented for the built-in server and is ignored[/yellow]")

    # Import here to avoid slow imports on other commands
    from pdflinkcheck.stdlib_server import ThreadedTCPServer, PDFLinkCheckHandler

    try:
        with ThreadedTCPServer((host, port), PDFLinkCheckHandler) as httpd:
            console.print("[green]Server running — press Ctrl+C to stop[/green]\n")
            httpd.serve_forever()
    except OSError as e:
        # Compare the error code rather than the message text, which varies
        # with platform and locale.
        if e.errno == errno.EADDRINUSE:
            console.print(f"[red]Error: Port {port} is already in use.[/red]")
            console.print("Try a different port with --port 8080")
        else:
            console.print(f"[red]Server error: {e}[/red]")
        raise typer.Exit(code=1)
    except KeyboardInterrupt:
        console.print("\n[bold yellow]Server stopped.[/bold yellow]")
        raise typer.Exit(code=0)
|
|
294
|
+
|
|
295
|
+
|
|
98
296
|
@app.command(name="gui")
|
|
99
297
|
def gui_command(
|
|
100
298
|
auto_close: int = typer.Option(0,
|
|
101
299
|
"--auto-close", "-c",
|
|
102
|
-
help = "Delay in milliseconds after which the GUI window will close (for automated testing). Use 0
|
|
300
|
+
help = "Delay in milliseconds after which the GUI window will close (for automated testing). Use 0 to disable auto-closing.",
|
|
103
301
|
min=0)
|
|
104
302
|
)->None:
|
|
105
303
|
"""
|
|
@@ -126,15 +324,16 @@ def gui_command(
|
|
|
126
324
|
from pdflinkcheck.gui import start_gui
|
|
127
325
|
start_gui(time_auto_close = assured_auto_close_value)
|
|
128
326
|
|
|
129
|
-
|
|
130
327
|
# --- Helper, consistent gui failure message. ---
|
|
131
328
|
def _gui_failure_msg():
    """Print the standard troubleshooting hints shown when the GUI cannot start."""
    hints = (
        "[bold red]GUI failed to launch[/bold red]",
        "Ensure pdflinkcheck dependecies are installed and the venv is activated (the dependecies are managed by uv).",
        "The dependecies for pdflinkcheck are managed by uv.",
        "Ensure Tkinter is available, especially if using WSLg.",
        "On Termux/Android, GUI is not supported. Use 'pdflinkcheck analyze <file.pdf>' instead.",
    )
    for hint in hints:
        console.print(hint)
    # Finish with the live Tkinter availability check as a diagnostic.
    console.print(f"pyhabitat.tkinter_is_available() = {pyhabitat.tkinter_is_available()}")
|
|
138
336
|
|
|
139
337
|
if __name__ == "__main__":
|
|
140
|
-
app()
|
|
338
|
+
app()
|
|
339
|
+
|
pdflinkcheck/data/LICENSE
CHANGED
|
@@ -1,26 +1,7 @@
|
|
|
1
|
-
pdflinkcheck - A PDF Link Checker
|
|
2
|
-
Copyright (C) 2025 George Clayton Bennett
|
|
3
|
-
|
|
4
|
-
This program is free software: You can redistribute it and/or modify
|
|
5
|
-
it under the terms of the GNU Affero General Public License as
|
|
6
|
-
published by the Free Software Foundation, either version 3 of the
|
|
7
|
-
License, or (at your option) any later version.
|
|
8
|
-
|
|
9
|
-
The AGPL3+ is required because it uses PyMuPDF, which is licensed under the AGPL3.
|
|
10
|
-
|
|
11
|
-
Dependencies:
|
|
12
|
-
- Python (PSFL) | https://github.com/python/cpython |
|
|
13
|
-
- PyMuPDF (AGPL3) | https://github.com/pymupdf/PyMuPDF |
|
|
14
|
-
- pyhabitat (MIT) | https://github.com/City-of-Memphis-Wastewater/pdflinkcheck |
|
|
15
|
-
- rich (MIT) | https://github.com/Textualize/rich |
|
|
16
|
-
- typer (MIT) | https://github.com/fastapi/typer |
|
|
17
|
-
|
|
18
|
-
----------------------------------------------------------------------
|
|
19
|
-
|
|
20
1
|
GNU AFFERO GENERAL PUBLIC LICENSE
|
|
21
2
|
Version 3, 19 November 2007
|
|
22
3
|
|
|
23
|
-
Copyright (C) 2007 Free Software Foundation, Inc. <
|
|
4
|
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
|
24
5
|
Everyone is permitted to copy and distribute verbatim copies
|
|
25
6
|
of this license document, but changing it is not allowed.
|
|
26
7
|
|
|
@@ -652,8 +633,8 @@ the "copyright" line and a pointer to where the full notice is found.
|
|
|
652
633
|
Copyright (C) <year> <name of author>
|
|
653
634
|
|
|
654
635
|
This program is free software: you can redistribute it and/or modify
|
|
655
|
-
it under the terms of the GNU Affero General Public License as published
|
|
656
|
-
the Free Software Foundation, either version 3 of the License, or
|
|
636
|
+
it under the terms of the GNU Affero General Public License as published
|
|
637
|
+
by the Free Software Foundation, either version 3 of the License, or
|
|
657
638
|
(at your option) any later version.
|
|
658
639
|
|
|
659
640
|
This program is distributed in the hope that it will be useful,
|
|
@@ -662,7 +643,7 @@ the "copyright" line and a pointer to where the full notice is found.
|
|
|
662
643
|
GNU Affero General Public License for more details.
|
|
663
644
|
|
|
664
645
|
You should have received a copy of the GNU Affero General Public License
|
|
665
|
-
along with this program. If not, see <
|
|
646
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
666
647
|
|
|
667
648
|
Also add information on how to contact you by electronic and paper mail.
|
|
668
649
|
|
|
@@ -677,4 +658,4 @@ specific requirements.
|
|
|
677
658
|
You should also get your employer (if you work as a programmer) or school,
|
|
678
659
|
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
|
679
660
|
For more information on this, and how to apply and follow the GNU AGPL, see
|
|
680
|
-
<
|
|
661
|
+
<https://www.gnu.org/licenses/>.
|