pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. pdflinkcheck/__init__.py +88 -21
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
  6. pdflinkcheck/cli.py +111 -116
  7. pdflinkcheck/data/I Have Questions.md +51 -0
  8. pdflinkcheck/data/LICENSE +20 -654
  9. pdflinkcheck/data/README.md +65 -67
  10. pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
  11. pdflinkcheck/data/icons/Logo-150x150.png +0 -0
  12. pdflinkcheck/data/icons/Logo-300x300.png +0 -0
  13. pdflinkcheck/data/icons/Logo-71x71.png +0 -0
  14. pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
  15. pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
  16. pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
  17. pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
  18. pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
  19. pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
  20. pdflinkcheck/data/pyproject.toml +25 -37
  21. pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
  22. pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
  23. pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
  24. pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
  25. pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
  26. pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
  27. pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
  28. pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
  29. pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
  30. pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
  31. pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
  32. pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
  33. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
  34. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
  35. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
  36. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
  37. pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
  38. pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
  39. pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
  40. pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
  41. pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
  42. pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
  43. pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
  44. pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
  45. pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
  46. pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
  47. pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
  48. pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
  49. pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
  50. pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
  51. pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
  52. pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
  53. pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
  54. pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
  55. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
  56. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
  57. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
  58. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
  59. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
  60. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
  61. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
  62. pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
  63. pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
  64. pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
  65. pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
  66. pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
  67. pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
  68. pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
  69. pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
  70. pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
  71. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
  72. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
  73. pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
  74. pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
  75. pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
  76. pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
  77. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
  78. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
  79. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
  80. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
  81. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
  82. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
  83. pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
  84. pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
  85. pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
  86. pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
  87. pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
  88. pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
  89. pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
  90. pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
  91. pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
  92. pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
  93. pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
  94. pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
  95. pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
  96. pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
  97. pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
  98. pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
  99. pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
  100. pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
  101. pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
  102. pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
  103. pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
  104. pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
  105. pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
  106. pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
  107. pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
  108. pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
  109. pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
  110. pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
  111. pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
  112. pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
  113. pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
  114. pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
  115. pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
  116. pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
  117. pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
  118. pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
  119. pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
  120. pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
  121. pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
  122. pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
  123. pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
  124. pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
  125. pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
  126. pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
  127. pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
  128. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
  129. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
  130. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
  131. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
  132. pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
  133. pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
  134. pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
  135. pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
  136. pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
  137. pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
  138. pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
  139. pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
  140. pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
  141. pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
  142. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
  143. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
  144. pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
  145. pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
  146. pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
  147. pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
  148. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
  149. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
  150. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
  151. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
  152. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
  153. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
  154. pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
  155. pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
  156. pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
  157. pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
  158. pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
  159. pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
  160. pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
  161. pdflinkcheck/datacopy.py +18 -1
  162. pdflinkcheck/dev.py +12 -25
  163. pdflinkcheck/environment.py +76 -0
  164. pdflinkcheck/gui.py +366 -457
  165. pdflinkcheck/helpers.py +88 -0
  166. pdflinkcheck/io.py +27 -23
  167. pdflinkcheck/report.py +692 -121
  168. pdflinkcheck/security.py +189 -0
  169. pdflinkcheck/splash.py +38 -0
  170. pdflinkcheck/stdlib_server.py +14 -20
  171. pdflinkcheck/stdlib_server_alt.py +571 -0
  172. pdflinkcheck/tk_utils.py +188 -0
  173. pdflinkcheck/update_msix_version.py +49 -0
  174. pdflinkcheck/validate.py +129 -218
  175. pdflinkcheck/version_info.py +6 -3
  176. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
  177. pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
  178. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  179. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  180. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  181. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
  182. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  183. pdflinkcheck/analyze_pypdf_v2.py +0 -218
  184. pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
  185. pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
  186. /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/validate.py CHANGED
@@ -1,31 +1,35 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
1
3
  # src/pdflinkcheck/validate.py
2
-
4
+ from __future__ import annotations
3
5
  import sys
4
6
  from pathlib import Path
5
7
  from typing import Dict, Any
6
8
 
7
- from pdflinkcheck.report import run_report
8
- from pdflinkcheck.io import get_friendly_path, export_validation_json
9
+ from pdflinkcheck.io import get_friendly_path
10
+ from pdflinkcheck.helpers import PageRef # Importing the established helper
9
11
 
10
12
  SEP_COUNT=28
11
13
 
14
+ START_INDEX = 0
15
+ # Internal 0-based start
16
+ # Define the offset.
17
+ # The PDF engines are 0-based.
18
+ # We will add +1 only for the HUMAN REASON strings.
19
+
20
+
12
21
  def run_validation(
13
22
  report_results: Dict[str, Any],
14
23
  pdf_path: str,
15
- pdf_library: str = "pypdf",
16
- check_external: bool = False,
17
- export_json: bool = True,
18
- print_bool: bool = True
24
+ check_external: bool = False
19
25
  ) -> Dict[str, Any]:
20
26
  """
21
- Validates links using the output from run_report().
27
+ Validates links during run_report_*() using a partial completion of the data dict.
22
28
 
23
29
  Args:
24
- report_results: The dict returned by run_report()
30
+ report_results: The dict returned by run_report_and_call_exports()
25
31
  pdf_path: Path to the original PDF (needed for relative file checks and page count)
26
- pdf_library: Engine used ("pypdf" or "pymupdf")
27
32
  check_external: Whether to validate HTTP URLs (requires network + requests)
28
- print_bool: Whether to print results to console
29
33
 
30
34
  Returns:
31
35
  Validation summary stats with valid/broken counts and detailed issues
@@ -35,62 +39,77 @@ def run_validation(
35
39
 
36
40
  all_links = data.get("external_links", []) + data.get("internal_links", [])
37
41
  toc = data.get("toc", [])
42
+ total_pages = metadata.get("file_overview", {}).get("total_pages",None)
38
43
 
39
44
  if not all_links and not toc:
40
- if print_bool:
41
- print("No links or TOC to validate.")
45
+ print("No links or TOC to validate.")
42
46
  return {"summary-stats": {"valid": 0, "broken": 0}, "issues": []}
43
47
 
44
- # Get total page count (critical for internal validation)
45
- try:
46
- if pdf_library == "pymupdf":
47
- import fitz
48
- doc = fitz.open(pdf_path)
49
- total_pages = doc.page_count
50
- doc.close()
51
- else:
52
- from pypdf import PdfReader
53
- reader = PdfReader(pdf_path)
54
- total_pages = len(reader.pages)
55
- except Exception as e:
56
- if print_bool:
57
- print(f"Could not determine page count: {e}")
58
- total_pages = None
59
48
 
60
49
  pdf_dir = Path(pdf_path).parent
61
50
 
62
51
  issues = []
63
- valid_count = 0
52
+ valid_count = 0 # add more granulaity for types of valid links
53
+ file_found_count = 0
64
54
  broken_file_count = 0
65
55
  broken_page_count = 0
66
- file_found_count = 0
56
+ no_destination_page_count = 0
67
57
  unknown_web_count = 0
68
58
  unknown_reasonableness_count = 0
69
59
  unknown_link_count = 0
70
60
 
71
61
  # Validate active links
72
- for i, link in enumerate(all_links):
62
+ #print("DEBUG validate: entering loop with", len(all_links), "links")
63
+ for link in all_links:
73
64
  link_type = link.get("type")
74
65
  status = "valid"
75
66
  reason = None
67
+
76
68
  if link_type in ("Internal (GoTo/Dest)", "Internal (Resolved Action)"):
77
- target_page = int(link.get("destination_page"))
78
- if not isinstance(target_page, int):
79
- status = "broken-page"
80
- reason = f"Target page not a number: {target_page}"
81
- elif (1 <= target_page) and total_pages is None:
82
- status = "unknown-reasonableness"
83
- reason = "Total page count unavailable, but the page number is reasonable"
84
- elif (1 <= target_page <= total_pages):
85
- status = "valid"
86
- reason = f"Page {target_page} within range (1–{total_pages})"
87
- elif target_page < 1:
88
- status = "broken-page"
89
- reason = f"TOC targets page negative {target_page}."
90
- elif not (1 <= target_page <= total_pages):
91
- status = "broken-page"
92
- reason = f"Page {target_page} out of range (1–{total_pages})"
93
-
69
+ dest_page_raw = link.get("destination_page")
70
+
71
+ if dest_page_raw is not None:
72
+
73
+ try:
74
+ # Use PageRef to handle translation
75
+ target_page_ref = PageRef.from_index(int(dest_page_raw))
76
+ #target_page = int(dest_page_raw)
77
+
78
+ # 1. Immediate Failure: Below 0
79
+ if target_page_ref.machine < START_INDEX:
80
+ status = "broken-page"
81
+ # We use target_page + 1 to show the user what they "saw"
82
+ reason = f"Target page {target_page_ref.human} is invalid (negative index)."
83
+
84
+ # 2. Case: We don't know the max page count
85
+ elif total_pages is None:
86
+ # If it's 0 or higher, we assume it might be okay but can't be sure
87
+ status = "unknown-reasonableness"
88
+ reason = f"Page {target_page_ref.human} seems reasonable, but total page count is unavailable."
89
+
90
+ # 3. Case: Out of Upper Bounds
91
+ elif target_page_ref.machine >= total_pages:
92
+ status = "broken-page"
93
+ # User sees 1-based, e.g., "Page 101 out of range (1-100)"
94
+ reason = f"Page {target_page_ref.human} out of range (1–{total_pages})"
95
+
96
+ # 4. Case: Perfect Match
97
+ else:
98
+ status = "valid"
99
+ reason = f"Page {target_page_ref.human} within range (1–{total_pages})"
100
+
101
+ except (ValueError, TypeError):
102
+ status = "broken-page"
103
+ reason = f"Invalid page value: {dest_page_raw}"
104
+
105
+ except (ValueError, TypeError):
106
+ status = "broken-page"
107
+ reason = f"Invalid page value: {dest_page_raw}"
108
+
109
+ elif dest_page_raw is None:
110
+ status = "no-destinstion-page"
111
+ reason = "No destination page resolved"
112
+
94
113
  elif link_type == "Remote (GoToR)":
95
114
  remote_file = link.get("remote_file")
96
115
  if not remote_file:
@@ -132,55 +151,79 @@ def run_validation(
132
151
  unknown_reasonableness_count += 1
133
152
  elif status == "unknown-link":
134
153
  unknown_link_count += 1
135
- elif status == "broken-file":
154
+ elif status == "broken-page":
136
155
  broken_page_count += 1
137
156
  issues.append(link_with_val)
138
157
  elif status == "broken-file":
139
- broken_page_count += 1
158
+ broken_file_count += 1
159
+ issues.append(link_with_val)
160
+ elif status == "no-destinstion-page":
161
+ no_destination_page_count += 1
140
162
  issues.append(link_with_val)
141
163
 
142
164
  # Validate TOC entries
143
165
  for entry in toc:
144
- target_page = int(entry.get("target_page"))
145
- if isinstance(target_page, int):
146
- if (1 <= target_page) and total_pages is None:
147
- reason = "Page count unknown"
166
+ try:
167
+ # Coerce to int; we expect 0-based index from the engine
168
+ # In the context of the ing Map, -1 acts as a "Sentinel Value." It represents a state that is strictly outside the "Machine" range
169
+ target_page_raw = int(entry.get("target_page", -1))
170
+ target_page_ref = PageRef.from_index(int(target_page_raw))
171
+
172
+ status = "valid"
173
+ reason = ""
174
+
175
+ # 1. Check for negative indices (anything below our START_INDEX)
176
+ if target_page_ref.machine < START_INDEX:
177
+ status = "broken-page"
178
+ broken_page_count += 1
179
+ # User sees Page 0 or lower as the problem
180
+ reason = f"TOC targets invalid page number: {target_page_ref.human}"
181
+
182
+ # 2. Case: total_pages is unknown
183
+ elif total_pages is None:
148
184
  status = "unknown-reasonableness"
149
185
  unknown_reasonableness_count += 1
150
- elif target_page < 1:
186
+ reason = f"Page {target_page_ref.human} unknown (could not verify total pages)"
187
+
188
+ # 3. Case: Out of range (Upper Bound)
189
+ # Index 100 in a 100-page doc (total_pages=100) is out of bounds
190
+ elif target_page_ref.machine >= total_pages:
151
191
  status = "broken-page"
152
- broken_count += 1
153
- reason = f"TOC targets negative page: {target_page}."
154
- elif 1 <= target_page <= total_pages:
192
+ broken_page_count += 1
193
+ reason = f"TOC targets page {target_page_ref.human} (out of 1–{total_pages})"
194
+
195
+ # 4. Valid Case
196
+ else:
197
+ status = "valid"
155
198
  valid_count += 1
199
+ # We skip issues.append for valid TOC entries to keep the issues list clean
156
200
  continue
157
- else:
158
- status = "broken-page"
159
- reason = f"TOC targets page {page} (out of 1–{total_pages})"
160
- broken_count += 1
161
- else:
201
+
202
+ except (ValueError, TypeError):
162
203
  status = "broken-page"
163
- reason = f"Invalid page: {target_page}"
164
- broken_count += 1
204
+ broken_page_count += 1
205
+ reason = f"Invalid page reference: {entry.get('target_page')}"
165
206
 
207
+ # Only reaches here if status is not "valid" (because of 'continue' above)
166
208
  issues.append({
167
209
  "type": "TOC Entry",
168
- "title": entry["title"],
169
- "level": entry["level"],
170
- "target_page": target_page,
210
+ "title": entry.get("title", "Untitled"),
211
+ "level": entry.get("level", 0),
212
+ "target_page": target_page_ref.machine, # Stored as 0-indexed for data consistency
171
213
  "validation": {"status": status, "reason": reason}
172
214
  })
173
-
215
+
216
+ total_checked = metadata.get("link_counts",{}).get("total_links_count",0) + metadata.get("link_counts",{}).get("toc_entry_count",0)
174
217
  summary_stats = {
175
- "total_checked": len(all_links) + len(toc),
218
+ "total_checked": total_checked,
176
219
  "valid": valid_count,
177
220
  "file-found": file_found_count,
178
221
  "broken-page": broken_page_count,
179
222
  "broken-file": broken_file_count,
223
+ "no_destination_page_count": no_destination_page_count,
180
224
  "unknown-web": unknown_web_count,
181
225
  "unknown-reasonableness": unknown_reasonableness_count,
182
- "unknown-link": unknown_link_count,
183
- #"unknown": len(all_links) + len(toc) - valid_count - broken_count # nah this is not granuar enough
226
+ "unknown-link": unknown_link_count
184
227
  }
185
228
 
186
229
 
@@ -200,11 +243,13 @@ def run_validation(
200
243
  log(f"PDF Path = {get_friendly_path(pdf_path)}")
201
244
  log(f"Total items checked: {summary_stats['total_checked']}")
202
245
  log(f"✅ Valid: {summary_stats['valid']}")
203
- log(f"🌐 Web Addresses (Not Checked): {summary_stats['unknown-web']}")
246
+ #log(f" Valid: {summary_stats['valid']}")
247
+ #log(f"✅ Valid: {summary_stats['valid']}")
248
+ log(f"🌐 Web Addresses (Ping Each: OFF): {summary_stats['unknown-web']}")
204
249
  log(f"⚠️ Unknown Page Reasonableness (Due to Missing Total Page Count): {summary_stats['unknown-reasonableness']}")
205
250
  log(f"⚠️ Unsupported PDF Links: {summary_stats['unknown-link']}")
206
- log(f"❌ Broken Page Reference: {summary_stats['broken-page']}")
207
- log(f"❌ Broken File Reference: {summary_stats['broken-file']}")
251
+ log(f"❌ Broken Page Reference (Page number beyond scope of availability): {summary_stats['broken-page']}")
252
+ log(f"❌ Broken File Reference (File not available): {summary_stats['broken-file']}")
208
253
  log("=" * SEP_COUNT)
209
254
 
210
255
  if issues:
@@ -219,8 +264,16 @@ def run_validation(
219
264
  log("{:<5} | {:<12} | {:<30} | {}".format(i, link_type, text, reason))
220
265
  if len(issues) > 25:
221
266
  log(f"... and {len(issues) - 25} more issues")
267
+
268
+ elif summary_stats.get('total_checked', 0) == 0:
269
+ # Check if this was a total crash or just an empty PDF
270
+ if summary_stats.get('is_error_fallback'):
271
+ log("\nStatus: Validation could not be performed due to a processing error.")
272
+ else:
273
+ log("\nStatus: No links or TOC entries were found to validate.")
274
+
222
275
  else:
223
- log("No issues found — all links and TOC entries are valid!")
276
+ log("Success: No broken links or TOC issues!")
224
277
 
225
278
  # Final aggregation of the buffer into one string
226
279
  validation_buffer_str = "\n".join(validation_buffer)
@@ -228,8 +281,6 @@ def run_validation(
228
281
  return validation_buffer_str
229
282
 
230
283
  summary_txt = generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path)
231
- if print_bool:
232
- print(summary_txt)
233
284
 
234
285
  validation_results = {
235
286
  "pdf_path" : pdf_path,
@@ -239,144 +290,4 @@ def run_validation(
239
290
  "total_pages": total_pages
240
291
  }
241
292
 
242
- # Have export run interally so that the logic need not happen in an interface
243
-
244
- export_validation_json(validation_results, pdf_path, pdf_library)
245
293
  return validation_results
246
-
247
-
248
- def run_validation_more_readable_slop(pdf_path: str = None, pdf_library: str = "pypdf", check_external_links:bool = False) -> Dict[str, Any]:
249
- """
250
- Experimental. Ignore for now.
251
-
252
- Extends the report logic by programmatically testing every extracted link.
253
- Validates Internal Jumps (page bounds), External URIs (HTTP status),
254
- and Launch actions (file existence).
255
- """
256
- if check_external_links:
257
- import requests
258
-
259
- # 1. Setup Library Engine (Reuse your logic)
260
- pdf_library = pdf_library.lower()
261
- if pdf_library == "pypdf":
262
- from pdflinkcheck.analyze_pypdf import extract_links_pypdf as extract_links
263
- else:
264
- from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf as extract_links
265
-
266
- if pdf_path is None:
267
- pdf_path = get_first_pdf_in_cwd()
268
-
269
- if not pdf_path:
270
- print("Error: No PDF found for validation.")
271
- return {}
272
-
273
- print(f"\nValidating links in {Path(pdf_path).name}...")
274
-
275
- # 2. Extract links and initialize validation counters
276
- links = extract_links(pdf_path)
277
- total_links = len(links)
278
- results = {"valid": [], "broken": [], "error": []}
279
-
280
- # 3. Validation Loop
281
- for i, link in enumerate(links, 1):
282
- # Progress indicator for long manuals
283
- sys.stdout.write(f"\rChecking link {i}/{total_links}...")
284
- sys.stdout.flush()
285
-
286
- link_type = link.get('type')
287
- status = {"is_valid": False, "reason": "Unknown Type"}
288
-
289
- # --- A. Validate Internal Jumps ---
290
- if "Internal" in link_type:
291
- target_page = link.get('destination_page')
292
- if isinstance(target_page, int) and target_page > 0:
293
- # In a real run, you'd compare against reader.pages_count
294
- status = {"is_valid": True, "reason": "Resolves"}
295
- else:
296
- status = {"is_valid": False, "reason": f"Invalid Page: {target_page}"}
297
-
298
- # --- B. Validate Web URIs ---
299
- elif link_type == 'External (URI)':
300
-
301
- url = link.get('url')
302
- if url and url.startswith("http") and check_external_links:
303
- try:
304
- # Use a short timeout and HEAD request to be polite/fast
305
- resp = requests.head(url, timeout=5, allow_redirects=True)
306
- if resp.status_code < 400:
307
- status = {"is_valid": True, "reason": f"HTTP {resp.status_code}"}
308
- else:
309
- status = {"is_valid": False, "reason": f"HTTP {resp.status_code}"}
310
- except Exception as e:
311
- status = {"is_valid": False, "reason": "Connection Failed"}
312
- else:
313
- status = {"is_valid": False, "reason": "Malformed URL"}
314
-
315
- # --- C. Validate Local File/Launch Links ---
316
- elif link_type == 'Launch' or 'remote_file' in link:
317
- file_path = link.get('remote_file') or link.get('url')
318
- if file_path:
319
- # Clean URI formatting
320
- clean_path = file_path.replace("file://", "").replace("%20", " ")
321
- # Check relative to the PDF's location
322
- abs_path = Path(pdf_path).parent / clean_path
323
- if abs_path.exists():
324
- status = {"is_valid": True, "reason": "File Exists"}
325
- else:
326
- status = {"is_valid": False, "reason": "File Missing"}
327
-
328
- # Append result
329
- link['validation'] = status
330
- if status['is_valid']:
331
- results['valid'].append(link)
332
- else:
333
- results['broken'].append(link)
334
-
335
- print("\n" + "=" * SEP_COUNT)
336
- print(f"--- Validation Summary Stats for {Path(pdf_path).name} ---")
337
- print(f"Total Checked: {total_links}")
338
- print(f"✅ Valid: {len(results['valid'])}")
339
- print(f"❌ Broken: {len(results['broken'])}")
340
- print("=" * SEP_COUNT)
341
-
342
- # 4. Print Detail Report for Broken Links
343
- if results['broken']:
344
- print("\n## ❌ Broken Links Found:")
345
- print("{:<5} | {:<5} | {:<30} | {}".format("Idx", "Page", "Reason", "Target"))
346
- print("-" * SEP_COUNT)
347
- for i, link in enumerate(results['broken'], 1):
348
- target = link.get('url') or link.get('destination_page') or link.get('remote_file')
349
- print("{:<5} | {:<5} | {:<30} | {}".format(
350
- i, link['page'], link['validation']['reason'], str(target)[:30]
351
- ))
352
-
353
- return results
354
-
355
-
356
- if __name__ == "__main__":
357
-
358
- from pdflinkcheck.io import get_first_pdf_in_cwd
359
- pdf_path = get_first_pdf_in_cwd()
360
- # Run analysis first
361
- report = run_report(
362
- pdf_path=pdf_path,
363
- max_links=0,
364
- export_format="",
365
- pdf_library="pypdf",
366
- print_bool=False # We handle printing in validation
367
- )
368
-
369
- if not report or not report.get("data"):
370
- print("No data extracted — nothing to validate.")
371
- sys.exit(1)
372
-
373
- # Then validate
374
- validation_results = run_validation(
375
- report_results=report,
376
- pdf_path=pdf_path,
377
- pdf_library="pypdf",
378
- export_json=True,
379
- print_bool=True
380
- )
381
-
382
- export_validation_results()
pdflinkcheck/version_info.py CHANGED
@@ -1,4 +1,7 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
1
3
  # src/pdflinkcheck/version_info.py
4
+ from __future__ import annotations
2
5
  import re
3
6
  from pathlib import Path
4
7
  import sys
@@ -11,7 +14,7 @@ This portion of the codebase is MIT licensed. It does not rely on any AGPL-licen
11
14
 
12
15
  MIT License
13
16
 
14
- Copyright (c) 2025 George Clayton Bennett <george.bennett@memphistn.gov>
17
+ Copyright © 2025 George Clayton Bennett <george.bennett@memphistn.gov>
15
18
 
16
19
  Permission is hereby granted, free of charge, to any person obtaining a copy
17
20
  of this software and associated documentation files (the "Software"), to deal
@@ -52,7 +55,7 @@ def find_pyproject(start: Path) -> Path | None:
52
55
  if candidate.exists():
53
56
  return candidate
54
57
 
55
- # 3. Handle Installed / Wheel / Shiv state (using your force-include path)
58
+ # 3. Handle Installed / Wheel / Shiv state (using force-include path)
56
59
  internal_path = Path(__file__).parent / "data" / "pyproject.toml"
57
60
  if internal_path.exists():
58
61
  return internal_path
@@ -80,4 +83,4 @@ def get_version_from_pyproject() -> str:
80
83
  match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', poetry_section.group(1))
81
84
  if match: return match.group(1)
82
85
 
83
- return "0.0.0"
86
+ return "0.0.0"