pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -21
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
- pdflinkcheck/cli.py +111 -116
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +20 -654
- pdflinkcheck/data/README.md +65 -67
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +25 -37
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +18 -1
- pdflinkcheck/dev.py +12 -25
- pdflinkcheck/environment.py +76 -0
- pdflinkcheck/gui.py +366 -457
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +27 -23
- pdflinkcheck/report.py +692 -121
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +14 -20
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +49 -0
- pdflinkcheck/validate.py +129 -218
- pdflinkcheck/version_info.py +6 -3
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
- pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -218
- pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
- pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
- /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/validate.py
CHANGED
|
@@ -1,31 +1,35 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/validate.py
|
|
2
|
-
|
|
4
|
+
from __future__ import annotations
|
|
3
5
|
import sys
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from typing import Dict, Any
|
|
6
8
|
|
|
7
|
-
from pdflinkcheck.
|
|
8
|
-
from pdflinkcheck.
|
|
9
|
+
from pdflinkcheck.io import get_friendly_path
|
|
10
|
+
from pdflinkcheck.helpers import PageRef # Importing the established helper
|
|
9
11
|
|
|
10
12
|
SEP_COUNT=28
|
|
11
13
|
|
|
14
|
+
START_INDEX = 0
|
|
15
|
+
# Internal 0-based start
|
|
16
|
+
# Define the offset.
|
|
17
|
+
# The PDF engines are 0-based.
|
|
18
|
+
# We will add +1 only for the HUMAN REASON strings.
|
|
19
|
+
|
|
20
|
+
|
|
12
21
|
def run_validation(
|
|
13
22
|
report_results: Dict[str, Any],
|
|
14
23
|
pdf_path: str,
|
|
15
|
-
|
|
16
|
-
check_external: bool = False,
|
|
17
|
-
export_json: bool = True,
|
|
18
|
-
print_bool: bool = True
|
|
24
|
+
check_external: bool = False
|
|
19
25
|
) -> Dict[str, Any]:
|
|
20
26
|
"""
|
|
21
|
-
Validates links using the
|
|
27
|
+
Validates links during run_report_*() using a partial completion of the data dict.
|
|
22
28
|
|
|
23
29
|
Args:
|
|
24
|
-
report_results: The dict returned by
|
|
30
|
+
report_results: The dict returned by run_report_and_call_exports()
|
|
25
31
|
pdf_path: Path to the original PDF (needed for relative file checks and page count)
|
|
26
|
-
pdf_library: Engine used ("pypdf" or "pymupdf")
|
|
27
32
|
check_external: Whether to validate HTTP URLs (requires network + requests)
|
|
28
|
-
print_bool: Whether to print results to console
|
|
29
33
|
|
|
30
34
|
Returns:
|
|
31
35
|
Validation summary stats with valid/broken counts and detailed issues
|
|
@@ -35,62 +39,77 @@ def run_validation(
|
|
|
35
39
|
|
|
36
40
|
all_links = data.get("external_links", []) + data.get("internal_links", [])
|
|
37
41
|
toc = data.get("toc", [])
|
|
42
|
+
total_pages = metadata.get("file_overview", {}).get("total_pages",None)
|
|
38
43
|
|
|
39
44
|
if not all_links and not toc:
|
|
40
|
-
|
|
41
|
-
print("No links or TOC to validate.")
|
|
45
|
+
print("No links or TOC to validate.")
|
|
42
46
|
return {"summary-stats": {"valid": 0, "broken": 0}, "issues": []}
|
|
43
47
|
|
|
44
|
-
# Get total page count (critical for internal validation)
|
|
45
|
-
try:
|
|
46
|
-
if pdf_library == "pymupdf":
|
|
47
|
-
import fitz
|
|
48
|
-
doc = fitz.open(pdf_path)
|
|
49
|
-
total_pages = doc.page_count
|
|
50
|
-
doc.close()
|
|
51
|
-
else:
|
|
52
|
-
from pypdf import PdfReader
|
|
53
|
-
reader = PdfReader(pdf_path)
|
|
54
|
-
total_pages = len(reader.pages)
|
|
55
|
-
except Exception as e:
|
|
56
|
-
if print_bool:
|
|
57
|
-
print(f"Could not determine page count: {e}")
|
|
58
|
-
total_pages = None
|
|
59
48
|
|
|
60
49
|
pdf_dir = Path(pdf_path).parent
|
|
61
50
|
|
|
62
51
|
issues = []
|
|
63
|
-
valid_count = 0
|
|
52
|
+
valid_count = 0 # add more granulaity for types of valid links
|
|
53
|
+
file_found_count = 0
|
|
64
54
|
broken_file_count = 0
|
|
65
55
|
broken_page_count = 0
|
|
66
|
-
|
|
56
|
+
no_destination_page_count = 0
|
|
67
57
|
unknown_web_count = 0
|
|
68
58
|
unknown_reasonableness_count = 0
|
|
69
59
|
unknown_link_count = 0
|
|
70
60
|
|
|
71
61
|
# Validate active links
|
|
72
|
-
|
|
62
|
+
#print("DEBUG validate: entering loop with", len(all_links), "links")
|
|
63
|
+
for link in all_links:
|
|
73
64
|
link_type = link.get("type")
|
|
74
65
|
status = "valid"
|
|
75
66
|
reason = None
|
|
67
|
+
|
|
76
68
|
if link_type in ("Internal (GoTo/Dest)", "Internal (Resolved Action)"):
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
69
|
+
dest_page_raw = link.get("destination_page")
|
|
70
|
+
|
|
71
|
+
if dest_page_raw is not None:
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
# Use PageRef to handle translation
|
|
75
|
+
target_page_ref = PageRef.from_index(int(dest_page_raw))
|
|
76
|
+
#target_page = int(dest_page_raw)
|
|
77
|
+
|
|
78
|
+
# 1. Immediate Failure: Below 0
|
|
79
|
+
if target_page_ref.machine < START_INDEX:
|
|
80
|
+
status = "broken-page"
|
|
81
|
+
# We use target_page + 1 to show the user what they "saw"
|
|
82
|
+
reason = f"Target page {target_page_ref.human} is invalid (negative index)."
|
|
83
|
+
|
|
84
|
+
# 2. Case: We don't know the max page count
|
|
85
|
+
elif total_pages is None:
|
|
86
|
+
# If it's 0 or higher, we assume it might be okay but can't be sure
|
|
87
|
+
status = "unknown-reasonableness"
|
|
88
|
+
reason = f"Page {target_page_ref.human} seems reasonable, but total page count is unavailable."
|
|
89
|
+
|
|
90
|
+
# 3. Case: Out of Upper Bounds
|
|
91
|
+
elif target_page_ref.machine >= total_pages:
|
|
92
|
+
status = "broken-page"
|
|
93
|
+
# User sees 1-based, e.g., "Page 101 out of range (1-100)"
|
|
94
|
+
reason = f"Page {target_page_ref.human} out of range (1–{total_pages})"
|
|
95
|
+
|
|
96
|
+
# 4. Case: Perfect Match
|
|
97
|
+
else:
|
|
98
|
+
status = "valid"
|
|
99
|
+
reason = f"Page {target_page_ref.human} within range (1–{total_pages})"
|
|
100
|
+
|
|
101
|
+
except (ValueError, TypeError):
|
|
102
|
+
status = "broken-page"
|
|
103
|
+
reason = f"Invalid page value: {dest_page_raw}"
|
|
104
|
+
|
|
105
|
+
except (ValueError, TypeError):
|
|
106
|
+
status = "broken-page"
|
|
107
|
+
reason = f"Invalid page value: {dest_page_raw}"
|
|
108
|
+
|
|
109
|
+
elif dest_page_raw is None:
|
|
110
|
+
status = "no-destinstion-page"
|
|
111
|
+
reason = "No destination page resolved"
|
|
112
|
+
|
|
94
113
|
elif link_type == "Remote (GoToR)":
|
|
95
114
|
remote_file = link.get("remote_file")
|
|
96
115
|
if not remote_file:
|
|
@@ -132,55 +151,79 @@ def run_validation(
|
|
|
132
151
|
unknown_reasonableness_count += 1
|
|
133
152
|
elif status == "unknown-link":
|
|
134
153
|
unknown_link_count += 1
|
|
135
|
-
elif status == "broken-
|
|
154
|
+
elif status == "broken-page":
|
|
136
155
|
broken_page_count += 1
|
|
137
156
|
issues.append(link_with_val)
|
|
138
157
|
elif status == "broken-file":
|
|
139
|
-
|
|
158
|
+
broken_file_count += 1
|
|
159
|
+
issues.append(link_with_val)
|
|
160
|
+
elif status == "no-destinstion-page":
|
|
161
|
+
no_destination_page_count += 1
|
|
140
162
|
issues.append(link_with_val)
|
|
141
163
|
|
|
142
164
|
# Validate TOC entries
|
|
143
165
|
for entry in toc:
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
166
|
+
try:
|
|
167
|
+
# Coerce to int; we expect 0-based index from the engine
|
|
168
|
+
# In the context of the ing Map, -1 acts as a "Sentinel Value." It represents a state that is strictly outside the "Machine" range
|
|
169
|
+
target_page_raw = int(entry.get("target_page", -1))
|
|
170
|
+
target_page_ref = PageRef.from_index(int(target_page_raw))
|
|
171
|
+
|
|
172
|
+
status = "valid"
|
|
173
|
+
reason = ""
|
|
174
|
+
|
|
175
|
+
# 1. Check for negative indices (anything below our START_INDEX)
|
|
176
|
+
if target_page_ref.machine < START_INDEX:
|
|
177
|
+
status = "broken-page"
|
|
178
|
+
broken_page_count += 1
|
|
179
|
+
# User sees Page 0 or lower as the problem
|
|
180
|
+
reason = f"TOC targets invalid page number: {target_page_ref.human}"
|
|
181
|
+
|
|
182
|
+
# 2. Case: total_pages is unknown
|
|
183
|
+
elif total_pages is None:
|
|
148
184
|
status = "unknown-reasonableness"
|
|
149
185
|
unknown_reasonableness_count += 1
|
|
150
|
-
|
|
186
|
+
reason = f"Page {target_page_ref.human} unknown (could not verify total pages)"
|
|
187
|
+
|
|
188
|
+
# 3. Case: Out of range (Upper Bound)
|
|
189
|
+
# Index 100 in a 100-page doc (total_pages=100) is out of bounds
|
|
190
|
+
elif target_page_ref.machine >= total_pages:
|
|
151
191
|
status = "broken-page"
|
|
152
|
-
|
|
153
|
-
reason = f"TOC targets
|
|
154
|
-
|
|
192
|
+
broken_page_count += 1
|
|
193
|
+
reason = f"TOC targets page {target_page_ref.human} (out of 1–{total_pages})"
|
|
194
|
+
|
|
195
|
+
# 4. Valid Case
|
|
196
|
+
else:
|
|
197
|
+
status = "valid"
|
|
155
198
|
valid_count += 1
|
|
199
|
+
# We skip issues.append for valid TOC entries to keep the issues list clean
|
|
156
200
|
continue
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
reason = f"TOC targets page {page} (out of 1–{total_pages})"
|
|
160
|
-
broken_count += 1
|
|
161
|
-
else:
|
|
201
|
+
|
|
202
|
+
except (ValueError, TypeError):
|
|
162
203
|
status = "broken-page"
|
|
163
|
-
|
|
164
|
-
|
|
204
|
+
broken_page_count += 1
|
|
205
|
+
reason = f"Invalid page reference: {entry.get('target_page')}"
|
|
165
206
|
|
|
207
|
+
# Only reaches here if status is not "valid" (because of 'continue' above)
|
|
166
208
|
issues.append({
|
|
167
209
|
"type": "TOC Entry",
|
|
168
|
-
"title": entry
|
|
169
|
-
"level": entry
|
|
170
|
-
"target_page":
|
|
210
|
+
"title": entry.get("title", "Untitled"),
|
|
211
|
+
"level": entry.get("level", 0),
|
|
212
|
+
"target_page": target_page_ref.machine, # Stored as 0-indexed for data consistency
|
|
171
213
|
"validation": {"status": status, "reason": reason}
|
|
172
214
|
})
|
|
173
|
-
|
|
215
|
+
|
|
216
|
+
total_checked = metadata.get("link_counts",{}).get("total_links_count",0) + metadata.get("link_counts",{}).get("toc_entry_count",0)
|
|
174
217
|
summary_stats = {
|
|
175
|
-
"total_checked":
|
|
218
|
+
"total_checked": total_checked,
|
|
176
219
|
"valid": valid_count,
|
|
177
220
|
"file-found": file_found_count,
|
|
178
221
|
"broken-page": broken_page_count,
|
|
179
222
|
"broken-file": broken_file_count,
|
|
223
|
+
"no_destination_page_count": no_destination_page_count,
|
|
180
224
|
"unknown-web": unknown_web_count,
|
|
181
225
|
"unknown-reasonableness": unknown_reasonableness_count,
|
|
182
|
-
"unknown-link": unknown_link_count
|
|
183
|
-
#"unknown": len(all_links) + len(toc) - valid_count - broken_count # nah this is not granuar enough
|
|
226
|
+
"unknown-link": unknown_link_count
|
|
184
227
|
}
|
|
185
228
|
|
|
186
229
|
|
|
@@ -200,11 +243,13 @@ def run_validation(
|
|
|
200
243
|
log(f"PDF Path = {get_friendly_path(pdf_path)}")
|
|
201
244
|
log(f"Total items checked: {summary_stats['total_checked']}")
|
|
202
245
|
log(f"✅ Valid: {summary_stats['valid']}")
|
|
203
|
-
log(f"
|
|
246
|
+
#log(f"✅ Valid: {summary_stats['valid']}")
|
|
247
|
+
#log(f"✅ Valid: {summary_stats['valid']}")
|
|
248
|
+
log(f"🌐 Web Addresses (Ping Each: OFF): {summary_stats['unknown-web']}")
|
|
204
249
|
log(f"⚠️ Unknown Page Reasonableness (Due to Missing Total Page Count): {summary_stats['unknown-reasonableness']}")
|
|
205
250
|
log(f"⚠️ Unsupported PDF Links: {summary_stats['unknown-link']}")
|
|
206
|
-
log(f"❌ Broken Page Reference: {summary_stats['broken-page']}")
|
|
207
|
-
log(f"❌ Broken File Reference: {summary_stats['broken-file']}")
|
|
251
|
+
log(f"❌ Broken Page Reference (Page number beyond scope of availability): {summary_stats['broken-page']}")
|
|
252
|
+
log(f"❌ Broken File Reference (File not available): {summary_stats['broken-file']}")
|
|
208
253
|
log("=" * SEP_COUNT)
|
|
209
254
|
|
|
210
255
|
if issues:
|
|
@@ -219,8 +264,16 @@ def run_validation(
|
|
|
219
264
|
log("{:<5} | {:<12} | {:<30} | {}".format(i, link_type, text, reason))
|
|
220
265
|
if len(issues) > 25:
|
|
221
266
|
log(f"... and {len(issues) - 25} more issues")
|
|
267
|
+
|
|
268
|
+
elif summary_stats.get('total_checked', 0) == 0:
|
|
269
|
+
# Check if this was a total crash or just an empty PDF
|
|
270
|
+
if summary_stats.get('is_error_fallback'):
|
|
271
|
+
log("\nStatus: Validation could not be performed due to a processing error.")
|
|
272
|
+
else:
|
|
273
|
+
log("\nStatus: No links or TOC entries were found to validate.")
|
|
274
|
+
|
|
222
275
|
else:
|
|
223
|
-
log("No
|
|
276
|
+
log("Success: No broken links or TOC issues!")
|
|
224
277
|
|
|
225
278
|
# Final aggregation of the buffer into one string
|
|
226
279
|
validation_buffer_str = "\n".join(validation_buffer)
|
|
@@ -228,8 +281,6 @@ def run_validation(
|
|
|
228
281
|
return validation_buffer_str
|
|
229
282
|
|
|
230
283
|
summary_txt = generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path)
|
|
231
|
-
if print_bool:
|
|
232
|
-
print(summary_txt)
|
|
233
284
|
|
|
234
285
|
validation_results = {
|
|
235
286
|
"pdf_path" : pdf_path,
|
|
@@ -239,144 +290,4 @@ def run_validation(
|
|
|
239
290
|
"total_pages": total_pages
|
|
240
291
|
}
|
|
241
292
|
|
|
242
|
-
# Have export run interally so that the logic need not happen in an interface
|
|
243
|
-
|
|
244
|
-
export_validation_json(validation_results, pdf_path, pdf_library)
|
|
245
293
|
return validation_results
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def run_validation_more_readable_slop(pdf_path: str = None, pdf_library: str = "pypdf", check_external_links:bool = False) -> Dict[str, Any]:
|
|
249
|
-
"""
|
|
250
|
-
Experimental. Ignore for now.
|
|
251
|
-
|
|
252
|
-
Extends the report logic by programmatically testing every extracted link.
|
|
253
|
-
Validates Internal Jumps (page bounds), External URIs (HTTP status),
|
|
254
|
-
and Launch actions (file existence).
|
|
255
|
-
"""
|
|
256
|
-
if check_external_links:
|
|
257
|
-
import requests
|
|
258
|
-
|
|
259
|
-
# 1. Setup Library Engine (Reuse your logic)
|
|
260
|
-
pdf_library = pdf_library.lower()
|
|
261
|
-
if pdf_library == "pypdf":
|
|
262
|
-
from pdflinkcheck.analyze_pypdf import extract_links_pypdf as extract_links
|
|
263
|
-
else:
|
|
264
|
-
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf as extract_links
|
|
265
|
-
|
|
266
|
-
if pdf_path is None:
|
|
267
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
268
|
-
|
|
269
|
-
if not pdf_path:
|
|
270
|
-
print("Error: No PDF found for validation.")
|
|
271
|
-
return {}
|
|
272
|
-
|
|
273
|
-
print(f"\nValidating links in {Path(pdf_path).name}...")
|
|
274
|
-
|
|
275
|
-
# 2. Extract links and initialize validation counters
|
|
276
|
-
links = extract_links(pdf_path)
|
|
277
|
-
total_links = len(links)
|
|
278
|
-
results = {"valid": [], "broken": [], "error": []}
|
|
279
|
-
|
|
280
|
-
# 3. Validation Loop
|
|
281
|
-
for i, link in enumerate(links, 1):
|
|
282
|
-
# Progress indicator for long manuals
|
|
283
|
-
sys.stdout.write(f"\rChecking link {i}/{total_links}...")
|
|
284
|
-
sys.stdout.flush()
|
|
285
|
-
|
|
286
|
-
link_type = link.get('type')
|
|
287
|
-
status = {"is_valid": False, "reason": "Unknown Type"}
|
|
288
|
-
|
|
289
|
-
# --- A. Validate Internal Jumps ---
|
|
290
|
-
if "Internal" in link_type:
|
|
291
|
-
target_page = link.get('destination_page')
|
|
292
|
-
if isinstance(target_page, int) and target_page > 0:
|
|
293
|
-
# In a real run, you'd compare against reader.pages_count
|
|
294
|
-
status = {"is_valid": True, "reason": "Resolves"}
|
|
295
|
-
else:
|
|
296
|
-
status = {"is_valid": False, "reason": f"Invalid Page: {target_page}"}
|
|
297
|
-
|
|
298
|
-
# --- B. Validate Web URIs ---
|
|
299
|
-
elif link_type == 'External (URI)':
|
|
300
|
-
|
|
301
|
-
url = link.get('url')
|
|
302
|
-
if url and url.startswith("http") and check_external_links:
|
|
303
|
-
try:
|
|
304
|
-
# Use a short timeout and HEAD request to be polite/fast
|
|
305
|
-
resp = requests.head(url, timeout=5, allow_redirects=True)
|
|
306
|
-
if resp.status_code < 400:
|
|
307
|
-
status = {"is_valid": True, "reason": f"HTTP {resp.status_code}"}
|
|
308
|
-
else:
|
|
309
|
-
status = {"is_valid": False, "reason": f"HTTP {resp.status_code}"}
|
|
310
|
-
except Exception as e:
|
|
311
|
-
status = {"is_valid": False, "reason": "Connection Failed"}
|
|
312
|
-
else:
|
|
313
|
-
status = {"is_valid": False, "reason": "Malformed URL"}
|
|
314
|
-
|
|
315
|
-
# --- C. Validate Local File/Launch Links ---
|
|
316
|
-
elif link_type == 'Launch' or 'remote_file' in link:
|
|
317
|
-
file_path = link.get('remote_file') or link.get('url')
|
|
318
|
-
if file_path:
|
|
319
|
-
# Clean URI formatting
|
|
320
|
-
clean_path = file_path.replace("file://", "").replace("%20", " ")
|
|
321
|
-
# Check relative to the PDF's location
|
|
322
|
-
abs_path = Path(pdf_path).parent / clean_path
|
|
323
|
-
if abs_path.exists():
|
|
324
|
-
status = {"is_valid": True, "reason": "File Exists"}
|
|
325
|
-
else:
|
|
326
|
-
status = {"is_valid": False, "reason": "File Missing"}
|
|
327
|
-
|
|
328
|
-
# Append result
|
|
329
|
-
link['validation'] = status
|
|
330
|
-
if status['is_valid']:
|
|
331
|
-
results['valid'].append(link)
|
|
332
|
-
else:
|
|
333
|
-
results['broken'].append(link)
|
|
334
|
-
|
|
335
|
-
print("\n" + "=" * SEP_COUNT)
|
|
336
|
-
print(f"--- Validation Summary Stats for {Path(pdf_path).name} ---")
|
|
337
|
-
print(f"Total Checked: {total_links}")
|
|
338
|
-
print(f"✅ Valid: {len(results['valid'])}")
|
|
339
|
-
print(f"❌ Broken: {len(results['broken'])}")
|
|
340
|
-
print("=" * SEP_COUNT)
|
|
341
|
-
|
|
342
|
-
# 4. Print Detail Report for Broken Links
|
|
343
|
-
if results['broken']:
|
|
344
|
-
print("\n## ❌ Broken Links Found:")
|
|
345
|
-
print("{:<5} | {:<5} | {:<30} | {}".format("Idx", "Page", "Reason", "Target"))
|
|
346
|
-
print("-" * SEP_COUNT)
|
|
347
|
-
for i, link in enumerate(results['broken'], 1):
|
|
348
|
-
target = link.get('url') or link.get('destination_page') or link.get('remote_file')
|
|
349
|
-
print("{:<5} | {:<5} | {:<30} | {}".format(
|
|
350
|
-
i, link['page'], link['validation']['reason'], str(target)[:30]
|
|
351
|
-
))
|
|
352
|
-
|
|
353
|
-
return results
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
if __name__ == "__main__":
|
|
357
|
-
|
|
358
|
-
from pdflinkcheck.io import get_first_pdf_in_cwd
|
|
359
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
360
|
-
# Run analysis first
|
|
361
|
-
report = run_report(
|
|
362
|
-
pdf_path=pdf_path,
|
|
363
|
-
max_links=0,
|
|
364
|
-
export_format="",
|
|
365
|
-
pdf_library="pypdf",
|
|
366
|
-
print_bool=False # We handle printing in validation
|
|
367
|
-
)
|
|
368
|
-
|
|
369
|
-
if not report or not report.get("data"):
|
|
370
|
-
print("No data extracted — nothing to validate.")
|
|
371
|
-
sys.exit(1)
|
|
372
|
-
|
|
373
|
-
# Then validate
|
|
374
|
-
validation_results = run_validation(
|
|
375
|
-
report_results=report,
|
|
376
|
-
pdf_path=pdf_path,
|
|
377
|
-
pdf_library="pypdf",
|
|
378
|
-
export_json=True,
|
|
379
|
-
print_bool=True
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
export_validation_results()
|
pdflinkcheck/version_info.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/version_info.py
|
|
4
|
+
from __future__ import annotations
|
|
2
5
|
import re
|
|
3
6
|
from pathlib import Path
|
|
4
7
|
import sys
|
|
@@ -11,7 +14,7 @@ This portion of the codebase is MIT licensed. It does not rely on any AGPL-licen
|
|
|
11
14
|
|
|
12
15
|
MIT License
|
|
13
16
|
|
|
14
|
-
Copyright
|
|
17
|
+
Copyright © 2025 George Clayton Bennett <george.bennett@memphistn.gov>
|
|
15
18
|
|
|
16
19
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
17
20
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -52,7 +55,7 @@ def find_pyproject(start: Path) -> Path | None:
|
|
|
52
55
|
if candidate.exists():
|
|
53
56
|
return candidate
|
|
54
57
|
|
|
55
|
-
# 3. Handle Installed / Wheel / Shiv state (using
|
|
58
|
+
# 3. Handle Installed / Wheel / Shiv state (using force-include path)
|
|
56
59
|
internal_path = Path(__file__).parent / "data" / "pyproject.toml"
|
|
57
60
|
if internal_path.exists():
|
|
58
61
|
return internal_path
|
|
@@ -80,4 +83,4 @@ def get_version_from_pyproject() -> str:
|
|
|
80
83
|
match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', poetry_section.group(1))
|
|
81
84
|
if match: return match.group(1)
|
|
82
85
|
|
|
83
|
-
return "0.0.0"
|
|
86
|
+
return "0.0.0"
|