pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/tk_utils.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# src/pdflinkcheck/tk_utils.py
|
|
2
|
+
import tkinter as tk
|
|
3
|
+
import subprocess
|
|
4
|
+
import re
|
|
5
|
+
import platform
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_primary_monitor_geometry():
|
|
10
|
+
"""
|
|
11
|
+
Queries xrandr to find the actual primary monitor's dimensions and offsets.
|
|
12
|
+
Returns (width, height, x_offset, y_offset) or None.
|
|
13
|
+
|
|
14
|
+
Not used.
|
|
15
|
+
"""
|
|
16
|
+
try:
|
|
17
|
+
# Query xrandr for the primary monitor
|
|
18
|
+
result = subprocess.run(['xrandr', '--query'], capture_output=True, text=True, check=True)
|
|
19
|
+
# Look for a line like: "DP-0 connected primary 1920x1080+1200+0"
|
|
20
|
+
match = re.search(r'(\d+)x(\d+)\+(\d+)\+(\d+)', re.search(r'^.*primary.*$', result.stdout, re.M).group())
|
|
21
|
+
if match:
|
|
22
|
+
return map(int, match.groups())
|
|
23
|
+
except Exception:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
def center_window_on_primary_stable(window: tk.Toplevel | tk.Tk, width: int, height: int):
    """
    Center *window* at ``width`` x ``height`` on the primary monitor.

    Prefers XRandR geometry (reliable under WSL2); otherwise falls back to
    the Tk-reported screen size, anchoring a 1920x1080 box on the mouse
    pointer when the reported width looks like a multi-monitor span.

    :param window: Toplevel window to position.
    :type window: tk.Toplevel | tk.Tk
    :param width: Desired window width in pixels.
    :type width: int
    :param height: Desired window height in pixels.
    :type height: int

    Not used.
    """
    window.update_idletasks()

    # 1. Try to assess via X11/XRandR (Best for WSL2)
    geom = get_primary_monitor_geometry()

    if geom:
        pw, ph, px, py = geom
        print(f"[DEBUG] XRandR Primary: {pw}x{ph} at +{px}+{py}")
    else:
        # 2. Fallback: Tk's notion of the screen, which may span monitors.
        px = py = 0
        pw = window.winfo_screenwidth()
        ph = window.winfo_screenheight()

        # A very wide "screen" is almost certainly a span; anchor a
        # 1920x1080 'virtual primary' around the pointer, where the
        # user's attention is.
        if pw > 2500:
            px = window.winfo_pointerx() - 960
            py = window.winfo_pointery() - 540
            pw, ph = 1920, 1080

    # 3. Center within the chosen monitor, clamped to non-negative coords
    # so the window never lands off-screen.
    x = max(0, px + pw // 2 - width // 2)
    y = max(0, py + ph // 2 - height // 2)

    print(f"[DEBUG] Final Positioning: x={x}, y={y}")
    window.geometry(f"{width}x{height}+{int(x)}+{int(y)}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_monitor_geometries():
|
|
74
|
+
"""
|
|
75
|
+
Queries xrandr to find all connected monitor dimensions and offsets.
|
|
76
|
+
Returns a list of dicts: [{'w', 'h', 'x', 'y', 'is_primary'}]
|
|
77
|
+
Essential for WSL2/WSLg multi-monitor accuracy.
|
|
78
|
+
|
|
79
|
+
Active.
|
|
80
|
+
"""
|
|
81
|
+
monitors = []
|
|
82
|
+
os_name = platform.system()
|
|
83
|
+
|
|
84
|
+
# --- LINUX / WSL2 Logic ---
|
|
85
|
+
if os_name == "Linux":
|
|
86
|
+
try:
|
|
87
|
+
# Run xrandr
|
|
88
|
+
xrandr_result = subprocess.run(['xrandr', '--query'], capture_output=True, text=True, check=True)
|
|
89
|
+
#print(f"xrandr_result = {xrandr_result}")
|
|
90
|
+
# Regex to find: "1920x1080+1920+0" or "1920x1080+0+0"
|
|
91
|
+
# We look for lines that contain 'connected' and a geometry string
|
|
92
|
+
lines = xrandr_result.stdout.splitlines()
|
|
93
|
+
for line in lines:
|
|
94
|
+
if " connected " in line:
|
|
95
|
+
is_primary = "primary" in line
|
|
96
|
+
match = re.search(r'(\d+)x(\d+)\+(\d+)\+(\d+)', line)
|
|
97
|
+
if match:
|
|
98
|
+
w, h, x, y = map(int, match.groups())
|
|
99
|
+
monitors.append({
|
|
100
|
+
'w': w, 'h': h, 'x': x, 'y': y,
|
|
101
|
+
'is_primary': is_primary
|
|
102
|
+
})
|
|
103
|
+
except Exception as e:
|
|
104
|
+
print(f"[DEBUG] xrandr query failed: {e}")
|
|
105
|
+
|
|
106
|
+
# --- WINDOWS Native Logic ---
|
|
107
|
+
if os_name == "Windows":
|
|
108
|
+
# On native Windows, we can use ctypes to call GetSystemMetrics
|
|
109
|
+
# or rely on the fact that the Primary monitor is almost always at 0,0
|
|
110
|
+
# and its size is reported by winfo_screenwidth if we don't have multiple monitors
|
|
111
|
+
# (For true multi-monitor on native Windows, win32api is usually needed)
|
|
112
|
+
pass
|
|
113
|
+
|
|
114
|
+
return monitors
|
|
115
|
+
|
|
116
|
+
def center_window_on_primary_goose(window: tk.Toplevel | tk.Tk, width: int, height: int):
    """
    Center *window* by identifying the physical monitor bounds and offsets.

    Uses :func:`get_monitor_geometries` (xrandr-backed) when available and
    falls back to Tk's screen metrics otherwise, with a 1920x1080 heuristic
    for spans that look like multiple monitors fused together.

    :param window: Toplevel window to position.
    :type window: tk.Toplevel | tk.Tk
    :param width: Desired window width in pixels.
    :type width: int
    :param height: Desired window height in pixels.
    :type height: int

    Active.
    """
    window.update_idletasks()

    monitors = get_monitor_geometries()

    if monitors:
        # Prefer the monitor flagged 'primary'; otherwise take the first
        # one reported (usually the one at +0+0).
        target_monitor = next((m for m in monitors if m['is_primary']), monitors[0])
        print(f"[DEBUG] Assessed Monitor: {target_monitor['w']}x{target_monitor['h']} at +{target_monitor['x']}+{target_monitor['y']}")
    else:
        print("[DEBUG] No monitors found via xrandr. Falling back to screenwidth.")
        # Total fallback: use winfo_screenwidth but assume 1080p dimensions
        # when the reported size is clearly a massive span (avoids the L-gap).
        sw, sh = window.winfo_screenwidth(), window.winfo_screenheight()
        target_monitor = {
            'w': 1920 if sw > 2500 else sw,
            'h': 1080 if sh > 2000 else sh,
            'x': 0, 'y': 0
        }

    # Center relative to the identified monitor's geometry.
    x = target_monitor['x'] + target_monitor['w'] // 2 - width // 2
    y = target_monitor['y'] + target_monitor['h'] // 2 - height // 2

    print(f"[DEBUG] Final Positioning: x={x}, y={y}")
    window.geometry(f"{width}x{height}+{int(x)}+{int(y)}")
|
|
162
|
+
|
|
163
|
+
def center_window_on_primary(window: tk.Toplevel | tk.Tk, width: int, height: int):
    """
    Center *window* at ``width`` x ``height`` on the primary monitor.

    Uses xrandr-derived monitor geometry when available; on Windows/Mac
    (no xrandr) falls back to ``wm_maxsize``, which closely tracks the
    primary display there, with a 1920x1080 safe-zone heuristic for spans.

    :param window: Toplevel window to position.
    :type window: tk.Toplevel | tk.Tk
    :param width: Desired window width in pixels.
    :type width: int
    :param height: Desired window height in pixels.
    :type height: int
    """
    window.update_idletasks()
    monitors = get_monitor_geometries()

    target = next((m for m in monitors if m['is_primary']), monitors[0]) if monitors else None

    if target:
        # Use the precisely assessed hardware monitor.
        x = target['x'] + target['w'] // 2 - width // 2
        y = target['y'] + target['h'] // 2 - height // 2
    else:
        # Fallback for Windows/Mac where xrandr doesn't exist.
        pw, ph = window.wm_maxsize()

        # If maxsize also reports a huge span (rare on native), use the
        # 1920/1080 safe-zone heuristic instead.
        if pw > 2500:
            pw, ph = 1920, 1080

        x = pw // 2 - width // 2
        y = ph // 2 - height // 2

    window.geometry(f"{width}x{height}+{int(x)}+{int(y)}")
|
pdflinkcheck/validate.py
CHANGED
|
@@ -1,29 +1,34 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
3
|
# src/pdflinkcheck/validate.py
|
|
4
|
-
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
import sys
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Dict, Any
|
|
8
8
|
|
|
9
9
|
from pdflinkcheck.io import get_friendly_path
|
|
10
|
-
from pdflinkcheck.
|
|
10
|
+
from pdflinkcheck.helpers import PageRef # Importing the established helper
|
|
11
11
|
|
|
12
12
|
SEP_COUNT=28
|
|
13
13
|
|
|
14
|
+
START_INDEX = 0
|
|
15
|
+
# Internal 0-based start
|
|
16
|
+
# Define the offset.
|
|
17
|
+
# The PDF engines are 0-based.
|
|
18
|
+
# We will add +1 only for the HUMAN REASON strings.
|
|
19
|
+
|
|
20
|
+
|
|
14
21
|
def run_validation(
|
|
15
22
|
report_results: Dict[str, Any],
|
|
16
23
|
pdf_path: str,
|
|
17
|
-
pdf_library: str = "pypdf",
|
|
18
24
|
check_external: bool = False
|
|
19
25
|
) -> Dict[str, Any]:
|
|
20
26
|
"""
|
|
21
|
-
Validates links during
|
|
27
|
+
Validates links during run_report_*() using a partial completion of the data dict.
|
|
22
28
|
|
|
23
29
|
Args:
|
|
24
30
|
report_results: The dict returned by run_report_and_call_exports()
|
|
25
31
|
pdf_path: Path to the original PDF (needed for relative file checks and page count)
|
|
26
|
-
pdf_library: Engine used ("pypdf" or "pymupdf")
|
|
27
32
|
check_external: Whether to validate HTTP URLs (requires network + requests)
|
|
28
33
|
|
|
29
34
|
Returns:
|
|
@@ -34,30 +39,17 @@ def run_validation(
|
|
|
34
39
|
|
|
35
40
|
all_links = data.get("external_links", []) + data.get("internal_links", [])
|
|
36
41
|
toc = data.get("toc", [])
|
|
42
|
+
total_pages = metadata.get("file_overview", {}).get("total_pages",None)
|
|
37
43
|
|
|
38
44
|
if not all_links and not toc:
|
|
39
45
|
print("No links or TOC to validate.")
|
|
40
46
|
return {"summary-stats": {"valid": 0, "broken": 0}, "issues": []}
|
|
41
47
|
|
|
42
|
-
# Get total page count (critical for internal validation)
|
|
43
|
-
try:
|
|
44
|
-
if pymupdf_is_available() and pdf_library == "pymupdf":
|
|
45
|
-
import fitz
|
|
46
|
-
doc = fitz.open(pdf_path)
|
|
47
|
-
total_pages = doc.page_count
|
|
48
|
-
doc.close()
|
|
49
|
-
else:
|
|
50
|
-
from pypdf import PdfReader
|
|
51
|
-
reader = PdfReader(pdf_path)
|
|
52
|
-
total_pages = len(reader.pages)
|
|
53
|
-
except Exception as e:
|
|
54
|
-
print(f"Could not determine page count: {e}")
|
|
55
|
-
total_pages = None
|
|
56
48
|
|
|
57
49
|
pdf_dir = Path(pdf_path).parent
|
|
58
50
|
|
|
59
51
|
issues = []
|
|
60
|
-
valid_count = 0
|
|
52
|
+
valid_count = 0 # add more granulaity for types of valid links
|
|
61
53
|
file_found_count = 0
|
|
62
54
|
broken_file_count = 0
|
|
63
55
|
broken_page_count = 0
|
|
@@ -68,37 +60,56 @@ def run_validation(
|
|
|
68
60
|
|
|
69
61
|
# Validate active links
|
|
70
62
|
#print("DEBUG validate: entering loop with", len(all_links), "links")
|
|
71
|
-
for
|
|
63
|
+
for link in all_links:
|
|
72
64
|
link_type = link.get("type")
|
|
73
65
|
status = "valid"
|
|
74
66
|
reason = None
|
|
67
|
+
|
|
75
68
|
if link_type in ("Internal (GoTo/Dest)", "Internal (Resolved Action)"):
|
|
76
69
|
dest_page_raw = link.get("destination_page")
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
else:
|
|
70
|
+
|
|
71
|
+
if dest_page_raw is not None:
|
|
72
|
+
|
|
81
73
|
try:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
74
|
+
# Use PageRef to handle translation
|
|
75
|
+
target_page_ref = PageRef.from_index(int(dest_page_raw))
|
|
76
|
+
#target_page = int(dest_page_raw)
|
|
77
|
+
|
|
78
|
+
# 1. Immediate Failure: Below 0
|
|
79
|
+
if target_page_ref.machine < START_INDEX:
|
|
85
80
|
status = "broken-page"
|
|
86
|
-
|
|
87
|
-
|
|
81
|
+
# We use target_page + 1 to show the user what they "saw"
|
|
82
|
+
reason = f"Target page {target_page_ref.human} is invalid (negative index)."
|
|
83
|
+
|
|
84
|
+
# 2. Case: We don't know the max page count
|
|
85
|
+
elif total_pages is None:
|
|
86
|
+
# If it's 0 or higher, we assume it might be okay but can't be sure
|
|
88
87
|
status = "unknown-reasonableness"
|
|
89
|
-
reason = "
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
elif target_page < 1:
|
|
94
|
-
status = "broken-page"
|
|
95
|
-
reason = f"TOC targets page negative {target_page}."
|
|
96
|
-
elif not (1 <= target_page <= total_pages):
|
|
88
|
+
reason = f"Page {target_page_ref.human} seems reasonable, but total page count is unavailable."
|
|
89
|
+
|
|
90
|
+
# 3. Case: Out of Upper Bounds
|
|
91
|
+
elif target_page_ref.machine >= total_pages:
|
|
97
92
|
status = "broken-page"
|
|
98
|
-
|
|
93
|
+
# User sees 1-based, e.g., "Page 101 out of range (1-100)"
|
|
94
|
+
reason = f"Page {target_page_ref.human} out of range (1–{total_pages})"
|
|
95
|
+
|
|
96
|
+
# 4. Case: Perfect Match
|
|
97
|
+
else:
|
|
98
|
+
status = "valid"
|
|
99
|
+
reason = f"Page {target_page_ref.human} within range (1–{total_pages})"
|
|
100
|
+
|
|
101
|
+
except (ValueError, TypeError):
|
|
102
|
+
status = "broken-page"
|
|
103
|
+
reason = f"Invalid page value: {dest_page_raw}"
|
|
104
|
+
|
|
99
105
|
except (ValueError, TypeError):
|
|
100
106
|
status = "broken-page"
|
|
101
107
|
reason = f"Invalid page value: {dest_page_raw}"
|
|
108
|
+
|
|
109
|
+
elif dest_page_raw is None:
|
|
110
|
+
status = "no-destinstion-page"
|
|
111
|
+
reason = "No destination page resolved"
|
|
112
|
+
|
|
102
113
|
elif link_type == "Remote (GoToR)":
|
|
103
114
|
remote_file = link.get("remote_file")
|
|
104
115
|
if not remote_file:
|
|
@@ -149,40 +160,62 @@ def run_validation(
|
|
|
149
160
|
elif status == "no-destinstion-page":
|
|
150
161
|
no_destination_page_count += 1
|
|
151
162
|
issues.append(link_with_val)
|
|
163
|
+
|
|
152
164
|
# Validate TOC entries
|
|
153
165
|
for entry in toc:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
166
|
+
try:
|
|
167
|
+
# Coerce to int; we expect 0-based index from the engine
|
|
168
|
+
# In the context of the ing Map, -1 acts as a "Sentinel Value." It represents a state that is strictly outside the "Machine" range
|
|
169
|
+
target_page_raw = int(entry.get("target_page", -1))
|
|
170
|
+
target_page_ref = PageRef.from_index(int(target_page_raw))
|
|
171
|
+
|
|
172
|
+
status = "valid"
|
|
173
|
+
reason = ""
|
|
174
|
+
|
|
175
|
+
# 1. Check for negative indices (anything below our START_INDEX)
|
|
176
|
+
if target_page_ref.machine < START_INDEX:
|
|
177
|
+
status = "broken-page"
|
|
178
|
+
broken_page_count += 1
|
|
179
|
+
# User sees Page 0 or lower as the problem
|
|
180
|
+
reason = f"TOC targets invalid page number: {target_page_ref.human}"
|
|
181
|
+
|
|
182
|
+
# 2. Case: total_pages is unknown
|
|
183
|
+
elif total_pages is None:
|
|
158
184
|
status = "unknown-reasonableness"
|
|
159
185
|
unknown_reasonableness_count += 1
|
|
160
|
-
|
|
186
|
+
reason = f"Page {target_page_ref.human} unknown (could not verify total pages)"
|
|
187
|
+
|
|
188
|
+
# 3. Case: Out of range (Upper Bound)
|
|
189
|
+
# Index 100 in a 100-page doc (total_pages=100) is out of bounds
|
|
190
|
+
elif target_page_ref.machine >= total_pages:
|
|
161
191
|
status = "broken-page"
|
|
162
|
-
|
|
163
|
-
reason = f"TOC targets
|
|
164
|
-
|
|
192
|
+
broken_page_count += 1
|
|
193
|
+
reason = f"TOC targets page {target_page_ref.human} (out of 1–{total_pages})"
|
|
194
|
+
|
|
195
|
+
# 4. Valid Case
|
|
196
|
+
else:
|
|
197
|
+
status = "valid"
|
|
165
198
|
valid_count += 1
|
|
199
|
+
# We skip issues.append for valid TOC entries to keep the issues list clean
|
|
166
200
|
continue
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
reason = f"TOC targets page {target_page} (out of 1–{total_pages})"
|
|
170
|
-
broken_count += 1
|
|
171
|
-
else:
|
|
201
|
+
|
|
202
|
+
except (ValueError, TypeError):
|
|
172
203
|
status = "broken-page"
|
|
173
|
-
|
|
174
|
-
|
|
204
|
+
broken_page_count += 1
|
|
205
|
+
reason = f"Invalid page reference: {entry.get('target_page')}"
|
|
175
206
|
|
|
207
|
+
# Only reaches here if status is not "valid" (because of 'continue' above)
|
|
176
208
|
issues.append({
|
|
177
209
|
"type": "TOC Entry",
|
|
178
|
-
"title": entry
|
|
179
|
-
"level": entry
|
|
180
|
-
"target_page":
|
|
210
|
+
"title": entry.get("title", "Untitled"),
|
|
211
|
+
"level": entry.get("level", 0),
|
|
212
|
+
"target_page": target_page_ref.machine, # Stored as 0-indexed for data consistency
|
|
181
213
|
"validation": {"status": status, "reason": reason}
|
|
182
214
|
})
|
|
183
|
-
|
|
215
|
+
|
|
216
|
+
total_checked = metadata.get("link_counts",{}).get("total_links_count",0) + metadata.get("link_counts",{}).get("toc_entry_count",0)
|
|
184
217
|
summary_stats = {
|
|
185
|
-
"total_checked":
|
|
218
|
+
"total_checked": total_checked,
|
|
186
219
|
"valid": valid_count,
|
|
187
220
|
"file-found": file_found_count,
|
|
188
221
|
"broken-page": broken_page_count,
|
|
@@ -190,8 +223,7 @@ def run_validation(
|
|
|
190
223
|
"no_destination_page_count": no_destination_page_count,
|
|
191
224
|
"unknown-web": unknown_web_count,
|
|
192
225
|
"unknown-reasonableness": unknown_reasonableness_count,
|
|
193
|
-
"unknown-link": unknown_link_count
|
|
194
|
-
#"unknown": len(all_links) + len(toc) - valid_count - broken_count # nah this is not granuar enough
|
|
226
|
+
"unknown-link": unknown_link_count
|
|
195
227
|
}
|
|
196
228
|
|
|
197
229
|
|
|
@@ -211,7 +243,9 @@ def run_validation(
|
|
|
211
243
|
log(f"PDF Path = {get_friendly_path(pdf_path)}")
|
|
212
244
|
log(f"Total items checked: {summary_stats['total_checked']}")
|
|
213
245
|
log(f"✅ Valid: {summary_stats['valid']}")
|
|
214
|
-
log(f"
|
|
246
|
+
#log(f"✅ Valid: {summary_stats['valid']}")
|
|
247
|
+
#log(f"✅ Valid: {summary_stats['valid']}")
|
|
248
|
+
log(f"🌐 Web Addresses (Ping Each: OFF): {summary_stats['unknown-web']}")
|
|
215
249
|
log(f"⚠️ Unknown Page Reasonableness (Due to Missing Total Page Count): {summary_stats['unknown-reasonableness']}")
|
|
216
250
|
log(f"⚠️ Unsupported PDF Links: {summary_stats['unknown-link']}")
|
|
217
251
|
log(f"❌ Broken Page Reference (Page number beyond scope of availability): {summary_stats['broken-page']}")
|
|
@@ -230,6 +264,14 @@ def run_validation(
|
|
|
230
264
|
log("{:<5} | {:<12} | {:<30} | {}".format(i, link_type, text, reason))
|
|
231
265
|
if len(issues) > 25:
|
|
232
266
|
log(f"... and {len(issues) - 25} more issues")
|
|
267
|
+
|
|
268
|
+
elif summary_stats.get('total_checked', 0) == 0:
|
|
269
|
+
# Check if this was a total crash or just an empty PDF
|
|
270
|
+
if summary_stats.get('is_error_fallback'):
|
|
271
|
+
log("\nStatus: Validation could not be performed due to a processing error.")
|
|
272
|
+
else:
|
|
273
|
+
log("\nStatus: No links or TOC entries were found to validate.")
|
|
274
|
+
|
|
233
275
|
else:
|
|
234
276
|
log("Success: No broken links or TOC issues!")
|
|
235
277
|
|
|
@@ -249,111 +291,3 @@ def run_validation(
|
|
|
249
291
|
}
|
|
250
292
|
|
|
251
293
|
return validation_results
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
def run_validation_more_readable_slop(pdf_path: str = None, pdf_library: str = "pypdf", check_external_links:bool = False) -> Dict[str, Any]:
|
|
255
|
-
"""
|
|
256
|
-
Experimental. Ignore for now.
|
|
257
|
-
|
|
258
|
-
Extends the report logic by programmatically testing every extracted link.
|
|
259
|
-
Validates Internal Jumps (page bounds), External URIs (HTTP status),
|
|
260
|
-
and Launch actions (file existence).
|
|
261
|
-
"""
|
|
262
|
-
if check_external_links:
|
|
263
|
-
import requests
|
|
264
|
-
|
|
265
|
-
# 1. Setup Library Engine (Reuse logic)
|
|
266
|
-
pdf_library = pdf_library.lower()
|
|
267
|
-
if pdf_library == "pypdf":
|
|
268
|
-
from pdflinkcheck.analyze_pypdf import extract_links_pypdf as extract_links
|
|
269
|
-
else:
|
|
270
|
-
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf as extract_links
|
|
271
|
-
|
|
272
|
-
if pdf_path is None:
|
|
273
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
274
|
-
|
|
275
|
-
if not pdf_path:
|
|
276
|
-
print("Error: No PDF found for validation.")
|
|
277
|
-
return {}
|
|
278
|
-
|
|
279
|
-
print(f"\nValidating links in {Path(pdf_path).name}...")
|
|
280
|
-
|
|
281
|
-
# 2. Extract links and initialize validation counters
|
|
282
|
-
links = extract_links(pdf_path)
|
|
283
|
-
total_links = len(links)
|
|
284
|
-
results = {"valid": [], "broken": [], "error": []}
|
|
285
|
-
|
|
286
|
-
# 3. Validation Loop
|
|
287
|
-
for i, link in enumerate(links, 1):
|
|
288
|
-
# Progress indicator for long manuals
|
|
289
|
-
sys.stdout.write(f"\rChecking link {i}/{total_links}...")
|
|
290
|
-
sys.stdout.flush()
|
|
291
|
-
|
|
292
|
-
link_type = link.get('type')
|
|
293
|
-
status = {"is_valid": False, "reason": "Unknown Type"}
|
|
294
|
-
|
|
295
|
-
# --- A. Validate Internal Jumps ---
|
|
296
|
-
if "Internal" in link_type:
|
|
297
|
-
target_page = link.get('destination_page')
|
|
298
|
-
if isinstance(target_page, int) and target_page > 0:
|
|
299
|
-
# In a real run, you'd compare against reader.pages_count
|
|
300
|
-
status = {"is_valid": True, "reason": "Resolves"}
|
|
301
|
-
else:
|
|
302
|
-
status = {"is_valid": False, "reason": f"Invalid Page: {target_page}"}
|
|
303
|
-
|
|
304
|
-
# --- B. Validate Web URIs ---
|
|
305
|
-
elif link_type == 'External (URI)':
|
|
306
|
-
|
|
307
|
-
url = link.get('url')
|
|
308
|
-
if url and url.startswith("http") and check_external_links:
|
|
309
|
-
try:
|
|
310
|
-
# Use a short timeout and HEAD request to be polite/fast
|
|
311
|
-
resp = requests.head(url, timeout=5, allow_redirects=True)
|
|
312
|
-
if resp.status_code < 400:
|
|
313
|
-
status = {"is_valid": True, "reason": f"HTTP {resp.status_code}"}
|
|
314
|
-
else:
|
|
315
|
-
status = {"is_valid": False, "reason": f"HTTP {resp.status_code}"}
|
|
316
|
-
except Exception as e:
|
|
317
|
-
status = {"is_valid": False, "reason": "Connection Failed"}
|
|
318
|
-
else:
|
|
319
|
-
status = {"is_valid": False, "reason": "Malformed URL"}
|
|
320
|
-
|
|
321
|
-
# --- C. Validate Local File/Launch Links ---
|
|
322
|
-
elif link_type == 'Launch' or 'remote_file' in link:
|
|
323
|
-
file_path = link.get('remote_file') or link.get('url')
|
|
324
|
-
if file_path:
|
|
325
|
-
# Clean URI formatting
|
|
326
|
-
clean_path = file_path.replace("file://", "").replace("%20", " ")
|
|
327
|
-
# Check relative to the PDF's location
|
|
328
|
-
abs_path = Path(pdf_path).parent / clean_path
|
|
329
|
-
if abs_path.exists():
|
|
330
|
-
status = {"is_valid": True, "reason": "File Exists"}
|
|
331
|
-
else:
|
|
332
|
-
status = {"is_valid": False, "reason": "File Missing"}
|
|
333
|
-
|
|
334
|
-
# Append result
|
|
335
|
-
link['validation'] = status
|
|
336
|
-
if status['is_valid']:
|
|
337
|
-
results['valid'].append(link)
|
|
338
|
-
else:
|
|
339
|
-
results['broken'].append(link)
|
|
340
|
-
|
|
341
|
-
print("\n" + "=" * SEP_COUNT)
|
|
342
|
-
print(f"--- Validation Summary Stats for {Path(pdf_path).name} ---")
|
|
343
|
-
print(f"Total Checked: {total_links}")
|
|
344
|
-
print(f"✅ Valid: {len(results['valid'])}")
|
|
345
|
-
print(f"❌ Broken: {len(results['broken'])}")
|
|
346
|
-
print("=" * SEP_COUNT)
|
|
347
|
-
|
|
348
|
-
# 4. Print Detail Report for Broken Links
|
|
349
|
-
if results['broken']:
|
|
350
|
-
print("\n## ❌ Broken Links Found:")
|
|
351
|
-
print("{:<5} | {:<5} | {:<30} | {}".format("Idx", "Page", "Reason", "Target"))
|
|
352
|
-
print("-" * SEP_COUNT)
|
|
353
|
-
for i, link in enumerate(results['broken'], 1):
|
|
354
|
-
target = link.get('url') or link.get('destination_page') or link.get('remote_file')
|
|
355
|
-
print("{:<5} | {:<5} | {:<30} | {}".format(
|
|
356
|
-
i, link['page'], link['validation']['reason'], str(target)[:30]
|
|
357
|
-
))
|
|
358
|
-
|
|
359
|
-
return results
|
pdflinkcheck/version_info.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
3
|
# src/pdflinkcheck/version_info.py
|
|
4
|
-
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
import re
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
import sys
|
|
@@ -83,4 +83,4 @@ def get_version_from_pyproject() -> str:
|
|
|
83
83
|
match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', poetry_section.group(1))
|
|
84
84
|
if match: return match.group(1)
|
|
85
85
|
|
|
86
|
-
return "0.0.0"
|
|
86
|
+
return "0.0.0"
|