pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. pdflinkcheck/__init__.py +88 -21
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
  6. pdflinkcheck/cli.py +111 -116
  7. pdflinkcheck/data/I Have Questions.md +51 -0
  8. pdflinkcheck/data/LICENSE +20 -654
  9. pdflinkcheck/data/README.md +65 -67
  10. pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
  11. pdflinkcheck/data/icons/Logo-150x150.png +0 -0
  12. pdflinkcheck/data/icons/Logo-300x300.png +0 -0
  13. pdflinkcheck/data/icons/Logo-71x71.png +0 -0
  14. pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
  15. pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
  16. pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
  17. pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
  18. pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
  19. pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
  20. pdflinkcheck/data/pyproject.toml +25 -37
  21. pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
  22. pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
  23. pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
  24. pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
  25. pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
  26. pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
  27. pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
  28. pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
  29. pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
  30. pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
  31. pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
  32. pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
  33. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
  34. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
  35. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
  36. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
  37. pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
  38. pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
  39. pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
  40. pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
  41. pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
  42. pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
  43. pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
  44. pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
  45. pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
  46. pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
  47. pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
  48. pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
  49. pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
  50. pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
  51. pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
  52. pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
  53. pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
  54. pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
  55. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
  56. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
  57. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
  58. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
  59. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
  60. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
  61. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
  62. pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
  63. pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
  64. pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
  65. pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
  66. pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
  67. pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
  68. pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
  69. pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
  70. pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
  71. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
  72. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
  73. pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
  74. pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
  75. pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
  76. pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
  77. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
  78. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
  79. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
  80. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
  81. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
  82. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
  83. pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
  84. pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
  85. pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
  86. pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
  87. pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
  88. pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
  89. pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
  90. pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
  91. pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
  92. pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
  93. pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
  94. pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
  95. pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
  96. pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
  97. pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
  98. pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
  99. pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
  100. pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
  101. pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
  102. pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
  103. pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
  104. pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
  105. pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
  106. pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
  107. pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
  108. pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
  109. pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
  110. pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
  111. pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
  112. pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
  113. pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
  114. pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
  115. pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
  116. pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
  117. pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
  118. pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
  119. pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
  120. pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
  121. pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
  122. pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
  123. pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
  124. pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
  125. pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
  126. pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
  127. pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
  128. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
  129. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
  130. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
  131. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
  132. pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
  133. pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
  134. pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
  135. pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
  136. pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
  137. pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
  138. pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
  139. pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
  140. pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
  141. pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
  142. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
  143. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
  144. pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
  145. pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
  146. pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
  147. pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
  148. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
  149. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
  150. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
  151. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
  152. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
  153. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
  154. pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
  155. pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
  156. pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
  157. pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
  158. pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
  159. pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
  160. pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
  161. pdflinkcheck/datacopy.py +18 -1
  162. pdflinkcheck/dev.py +12 -25
  163. pdflinkcheck/environment.py +76 -0
  164. pdflinkcheck/gui.py +366 -457
  165. pdflinkcheck/helpers.py +88 -0
  166. pdflinkcheck/io.py +27 -23
  167. pdflinkcheck/report.py +692 -121
  168. pdflinkcheck/security.py +189 -0
  169. pdflinkcheck/splash.py +38 -0
  170. pdflinkcheck/stdlib_server.py +14 -20
  171. pdflinkcheck/stdlib_server_alt.py +571 -0
  172. pdflinkcheck/tk_utils.py +188 -0
  173. pdflinkcheck/update_msix_version.py +49 -0
  174. pdflinkcheck/validate.py +129 -218
  175. pdflinkcheck/version_info.py +6 -3
  176. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
  177. pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
  178. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  179. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  180. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  181. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
  182. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  183. pdflinkcheck/analyze_pypdf_v2.py +0 -218
  184. pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
  185. pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
  186. /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/report.py CHANGED
@@ -1,16 +1,195 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
1
3
  # pdflinkcheck/report.py
2
-
4
+ from __future__ import annotations
3
5
  import sys
4
6
  from pathlib import Path
5
7
  from typing import Optional, Dict, Any
6
8
  import pyhabitat
9
+ import copy
7
10
 
8
11
  from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
12
+ from pdflinkcheck.environment import pymupdf_is_available, pdfium_is_available
13
+ from pdflinkcheck.validate import run_validation
14
+ from pdflinkcheck.security import compute_risk
15
+ from pdflinkcheck.helpers import debug_head, PageRef
9
16
 
10
17
 
11
18
  SEP_COUNT=28
19
+ # Define a safe "empty" validation state
20
+ EMPTY_VALIDATION = {
21
+ "summary-stats": {
22
+ "total_checked": 0,
23
+ "valid": 0,
24
+ "file-found": 0,
25
+ "broken-page": 0,
26
+ "broken-file": 0,
27
+ "no_destination_page_count": 0,
28
+ "unknown-web": 0,
29
+ "unknown-reasonableness": 0,
30
+ "unknown-link": 0
31
+ },
32
+ "issues": [],
33
+ "summary-txt": "Analysis failed: No validation performed.",
34
+ "total_pages": 0
35
+ }
36
+
37
+
38
+ def run_report_and_call_exports(
39
+ pdf_path: str = None,
40
+ export_format: str = "JSON",
41
+ pdf_library: str = "auto",
42
+ print_bool:bool=True,
43
+ ) -> Dict[str, Any]:
44
+ """
45
+ Public entry point. Orchestrates extraction, validation, and file exports.
46
+ """
47
+ # The meat and potatoes
48
+ report_results = run_report_extraction_and_assessment_and_recording(
49
+ pdf_path=str(pdf_path),
50
+ pdf_library = pdf_library,
51
+ print_bool=print_bool,
52
+ )
53
+ # 2. Initialize file path tracking
54
+ output_path_json = None
55
+ output_path_txt = None
56
+
57
+ if export_format:
58
+ report_data_dict = report_results["data"]
59
+ report_buffer_str = report_results["text"]
60
+ if "JSON" in export_format.upper():
61
+ output_path_json = export_report_json(report_data_dict, pdf_path, pdf_library)
62
+ if "TXT" in export_format.upper():
63
+ output_path_txt = export_report_txt(report_buffer_str, pdf_path, pdf_library)
64
+
65
+ # 4. Inject the file info into the results dictionary
66
+ report_results["files"] = {
67
+ "export_path_json": output_path_json,
68
+ "export_path_txt": output_path_txt
69
+ }
70
+ return report_results
71
+
72
+ def _get_engine_data(pdf_path: str, pdf_library: str) -> tuple[Dict, str]:
73
+ """Handles the dirty work of switching engines and importing them."""
74
+ # Resolve 'auto' mode
75
+ if pdf_library == "auto":
76
+ if pdfium_is_available(): pdf_library = "pdfium"
77
+ elif pymupdf_is_available(): pdf_library = "pymupdf"
78
+ else: pdf_library = "pypdf"
79
+
80
+ # Map engine names to their respective modules
81
+ engines = {
82
+ "pdfium": "pdflinkcheck.analysis_pdfium",
83
+ "pypdf": "pdflinkcheck.analysis_pypdf", # Assuming this exists
84
+ "pymupdf": "pdflinkcheck.analysis_pymupdf"
85
+ }
86
+
87
+ if pdf_library not in engines:
88
+ raise ValueError(f"Unsupported library: {pdf_library}")
89
+
90
+ # Dynamic import to keep __init__ lean
91
+ import importlib
92
+ module = importlib.import_module(engines[pdf_library])
93
+ data = module.analyze_pdf(pdf_path) or {"links": [], "toc": [], "file_ov": {}}
94
+
95
+ return data, pdf_library
96
+
97
+ # ----- Refactored version, failing ----
98
+ def run_report_extraction_and_assessment_and_recording_(
99
+ pdf_path: str = None,
100
+ pdf_library: str = "auto",
101
+ print_bool: bool = True
102
+ ) -> Dict[str, Any]:
103
+ """
104
+ Orchestrates extraction, categorization, and validation.
105
+ FULLY RECONCILED with legacy logic to ensure no features are lost.
106
+ """
107
+ if pdf_path is None:
108
+ return _return_empty_report(["pdf_path is None"], pdf_library)
109
+
110
+ try:
111
+ # 1. Extraction
112
+ raw_data, resolved_library = _get_engine_data(pdf_path, pdf_library)
113
+
114
+ extracted_links = raw_data.get("links", [])
115
+ structural_toc = raw_data.get("toc", [])
116
+ file_ov = raw_data.get("file_ov", {})
117
+ total_pages = file_ov.get("total_pages", 0)
118
+ pdf_name = Path(pdf_path).name
119
+
120
+ # 2. Categorization (Restored exactly from original logic)
121
+ external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
122
+ goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
123
+ resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
124
+ other_links = [link for link in extracted_links if link['type'] not in
125
+ ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
126
+
127
+ all_internal = goto_links + resolved_action_links
128
+
129
+ # 3. Generate the Text Report (Using get_friendly_path as required)
130
+ # We pass the separate lists to maintain Section 2, 3, and 4 formatting
131
+ report_text_base = _generate_text_report(
132
+ pdf_path=pdf_path,
133
+ library=resolved_library,
134
+ ext_links=external_uri_links,
135
+ goto_links=goto_links,
136
+ resolve_links=resolved_action_links,
137
+ other_links=other_links,
138
+ toc=structural_toc
139
+ )
140
+
141
+ # 4. Initial Result Assembly
142
+ report_results = {
143
+ "data": {
144
+ "external_links": external_uri_links,
145
+ "internal_links": goto_links + resolved_action_links,
146
+ "toc": structural_toc,
147
+ "validation": EMPTY_VALIDATION.copy()
148
+ },
149
+ "text": report_text_base,
150
+ "metadata": _build_metadata(
151
+ pdf_name=pdf_name,
152
+ total_pages=total_pages,
153
+ library_used=resolved_library,
154
+ toc_entry_count=len(structural_toc),
155
+ internal_goto_links_count=len(goto_links),
156
+ interal_resolve_action_links_count=len(resolved_action_links),
157
+ external_uri_links_count=len(external_uri_links),
158
+ other_links_count=len(other_links)
159
+ )
160
+ }
161
+
162
+ # 5. Validation & Risk Analysis
163
+ validation_results = run_validation(report_results=report_results, pdf_path=pdf_path)
164
+ report_results["data"]["validation"].update(validation_results)
165
+ report_results["data"]["risk"] = compute_risk(report_results)
166
+
167
+ # --- Inside run_report_extraction_and_assessment_and_recording ---
168
+ # 6. Finalizing Text Buffer
169
+ val_summary = validation_results.get("summary-txt", "")
170
+ raw_text = report_text_base + f"\n{val_summary}\n--- Analysis Complete ---"
171
+ cleaned_text = sanitize_glyphs_for_compatibility(raw_text)
172
+ # Apply sanitization before returning
173
+ report_results["text"] = cleaned_text
174
+ #report_results["text"] = raw_text
12
175
 
13
- def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
176
+ if print_bool:
177
+ # Matches your original logic: print the overview/validation summary to console
178
+ print(val_summary)
179
+
180
+ return report_results
181
+
182
+ except Exception as e:
183
+ error_logger.error(f"Critical failure: {e}", exc_info=True)
184
+ return _return_empty_report([f"FATAL: {str(e)}"], pdf_library)
185
+
186
+ # ----- Revert to stable version ----
187
+ def run_report_extraction_and_assessment_and_recording(
188
+ pdf_path: str = None,
189
+ pdf_library: str = "auto",
190
+ print_bool:bool=True,
191
+ concise_print: bool=False,
192
+ ) -> Dict[str, Any]:
14
193
  """
15
194
  Core high-level PDF link analysis logic.
16
195
 
@@ -18,10 +197,8 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
18
197
  using pdflinkcheck analysis, and
19
198
  prints a comprehensive, user-friendly report to the console.
20
199
 
21
- Args:
200
+ Args:
22
201
  pdf_path: The file system path (str) to the target PDF document.
23
- max_links: Maximum number of links to display in each console
24
- section. If <= 0, all links will be displayed.
25
202
 
26
203
  Returns:
27
204
  A dictionary containing the structured results of the analysis:
@@ -33,213 +210,558 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
33
210
  """
34
211
 
35
212
  report_buffer = []
213
+ report_buffer_overview = []
36
214
 
37
215
  # Helper to handle conditional printing and mandatory buffering
38
- def log(msg: str):
39
- if print_bool: # this should not be here
40
- print(msg) # this should not be here. esure elsewhere then remove
216
+ def log(msg: str, overview: bool = False):
41
217
  report_buffer.append(msg)
218
+ if overview:
219
+ report_buffer_overview.append(msg)
220
+
221
+
42
222
 
43
- # Expected: "pypdf" or "PyMuPDF"
44
- allowed_libraries = ("pypdf","pymupdf")
223
+ # Expected: "pypdf" or "PyMuPDF" pr "rust"
224
+ allowed_libraries = ("pypdf", "pymupdf", "pdfium", "auto")
45
225
  pdf_library = pdf_library.lower()
46
- if pdf_library in allowed_libraries and pdf_library == "pypdf":
47
- from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
226
+
227
+ log("\n--- Starting Analysis ... ---\n")
228
+ if pdf_path is None:
229
+ log("pdf_path is None", overview=True)
230
+ log("Tip: Drop a PDF in the current folder or pass in a path arg.")
231
+ _return_empty_report(report_buffer)
232
+ else:
233
+ pdf_name = Path(pdf_path).name
234
+
235
+ # AUTO MODE
236
+ if pdf_library == "auto":
237
+ if pdfium_is_available():
238
+ pdf_library = "pdfium"
239
+ elif pymupdf_is_available():
240
+ pdf_library = "pymupdf"
241
+ else:
242
+ pdf_library = "pypdf"
243
+
244
+
245
+
246
+ # PDFium ENGINE
247
+ if pdf_library in allowed_libraries and pdf_library == "pdfium":
248
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pdfium
249
+ data = analyze_pdf_pdfium(pdf_path) or {"links": [], "toc": [], "file_ov": []}
250
+ extracted_links = data.get("links", [])
251
+ structural_toc = data.get("toc", [])
252
+ file_ov = data.get("file_ov", [])
253
+
254
+ # pypdf ENGINE
255
+ elif pdf_library in allowed_libraries and pdf_library == "pypdf":
256
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pypdf
257
+ #extracted_links = extract_links(pdf_path)
258
+ #structural_toc = extract_toc(pdf_path)
259
+ data = analyze_pdf_pypdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
260
+ extracted_links = data.get("links", [])
261
+ structural_toc = data.get("toc", [])
262
+ file_ov = data.get("file_ov", [])
263
+
264
+ # PyMuPDF Engine
48
265
  elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
49
- try:
50
- import fitz
51
- except ImportError:
266
+ if not pymupdf_is_available():
52
267
  print("PyMuPDF was explicitly requested as the PDF Engine")
53
- print("Use pypdf instead, or install PyMuPDF. ")
268
+ print("Switch the PDF library to 'pypdf' instead, or install PyMuPDF. ")
54
269
  print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
55
270
  if pyhabitat.on_termux():
56
271
  print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
57
272
  print("PyMuPDF is not expected to work on Termux. Use pypdf.")
58
273
  print("\n")
59
- return
60
- from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
274
+ #return
275
+ raise ImportError("The 'fitz' module (PyMuPDF) is required but not installed.")
276
+
277
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pymupdf
278
+ data = analyze_pdf_pymupdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
279
+ extracted_links = data.get("links", [])
280
+ structural_toc = data.get("toc", [])
281
+ file_ov = data.get("file_ov", [])
61
282
 
62
- log("\n--- Starting Analysis ... ---\n")
63
- if pdf_path is None:
64
- pdf_path = get_first_pdf_in_cwd()
65
- if pdf_path is None:
66
- log("pdf_path is None")
67
- log("Tip: Drop a PDF in the current folder or pass in a path arg.")
68
- return
283
+ total_pages = file_ov.get("total_pages",0)
284
+
285
+
286
+
69
287
  try:
70
- log(f"Target file: {get_friendly_path(pdf_path)}")
71
- log(f"PDF Engine: {pdf_library}")
288
+ log(f"Target file: {get_friendly_path(pdf_path)}", overview=True)
289
+ log(f"PDF Engine: {pdf_library}", overview=True)
72
290
 
73
- # 1. Extract all active links and TOC
74
- extracted_links = extract_links(pdf_path)
75
- structural_toc = extract_toc(pdf_path)
76
- #structural_toc = extract_toc_pypdf(pdf_path)
77
291
  toc_entry_count = len(structural_toc)
292
+ str_structural_toc = get_structural_toc(structural_toc)
78
293
 
294
+ # check the structure, that it matches
295
+ if False:
296
+ print(f"pdf_library={pdf_library}")
297
+ debug_head("TOC", structural_toc, n=3)
298
+ debug_head("Links", list(extracted_links), n=3)
299
+
300
+ # THIS HITS
79
301
 
80
302
  if not extracted_links and not structural_toc:
81
- log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
82
- log("(This is common for scanned/image-only PDFs.)")
83
- return {}
303
+ log(f"\nNo hyperlinks or structural TOC found in {pdf_name}.", overview=True)
304
+ log("(This is common for scanned/image-only PDFs.)", overview=True)
305
+
306
+ empty_result = {
307
+ "data": {
308
+ "external_links": [],
309
+ "internal_links": [],
310
+ "toc": []
311
+ },
312
+ "text": "\n".join(report_buffer),
313
+ "metadata": {
314
+ "file_overview": {
315
+ "pdf_name": pdf_name,
316
+ "total_pages": total_pages,
317
+ },
318
+ "library_used": pdf_library,
319
+ "link_counts": {
320
+ "toc_entry_count": 0,
321
+ "internal_goto_links_count": 0,
322
+ "interal_resolve_action_links_count": 0,
323
+ "total_internal_links_count": 0,
324
+ "external_uri_links_count": 0,
325
+ "other_links_count": 0,
326
+ "total_links_count": 0
327
+ }
328
+ }
329
+ }
330
+ return empty_result
84
331
 
85
332
  # 3. Separate the lists based on the 'type' key
86
- uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
333
+ external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
87
334
  goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
88
335
  resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
89
336
  other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
90
337
 
91
- total_internal_links = len(goto_links) + len(resolved_action_links)
92
- limit = max_links if max_links > 0 else None
93
- uri_and_other = uri_links + other_links
94
-
338
+ interal_resolve_action_links_count = len(resolved_action_links)
339
+ internal_goto_links_count = len(goto_links)
340
+ total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
341
+
342
+ external_uri_links_count = len(external_uri_links)
343
+ other_links_count = len(other_links)
344
+
345
+ total_links_count = len(extracted_links)
346
+
95
347
  # --- ANALYSIS SUMMARY (Using your print logic) ---
96
- log("\n" + "=" * SEP_COUNT)
97
- log(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
98
- log(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
99
- log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
100
- log("=" * SEP_COUNT)
348
+ log("\n" + "=" * SEP_COUNT, overview = True)
349
+ log(f"--- Link Analysis Results for {pdf_name} ---", overview = True)
350
+ log(f"Total active links: {total_links_count} (External: {external_uri_links_count}, Internal Jumps: {total_internal_links_count}, Other: {other_links_count})",overview = True)
351
+ log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}",overview = True)
352
+ log("=" * SEP_COUNT,overview = True)
101
353
 
102
354
  # --- Section 1: TOC ---
103
- str_structural_toc = print_structural_toc(structural_toc)
104
355
  log(str_structural_toc)
105
356
 
106
357
  # --- Section 2: ACTIVE INTERNAL JUMPS ---
107
358
  log("\n" + "=" * SEP_COUNT)
108
- log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
359
+ log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links_count} found")
109
360
  log("=" * SEP_COUNT)
110
361
  log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
111
362
  log("-" * SEP_COUNT)
112
363
 
113
364
  all_internal = goto_links + resolved_action_links
114
- if total_internal_links > 0:
115
- for i, link in enumerate(all_internal[:limit], 1):
365
+ #If links were found: all_internal is a list with dictionaries. It evaluates to True.
366
+ # If NO links were found: all_internal is an empty list []. It evaluates to False.
367
+ if all_internal:
368
+ for i, link in enumerate(all_internal, 1):
116
369
  link_text = link.get('link_text', 'N/A')
117
- log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
118
370
 
119
- if limit is not None and len(all_internal) > limit:
120
- log(f"... and {len(all_internal) - limit} more links (use --max-links 0 to show all).")
371
+ # Convert source and destination indices to human strings
372
+ src_page = PageRef.from_index(link['page']).human
373
+ dest_page = PageRef.from_index(link['destination_page']).human
374
+
375
+ log("{:<5} | {:<5} | {:<40} | {}".format(
376
+ i,
377
+ src_page,
378
+ link_text[:40],
379
+ dest_page
380
+ ))
381
+
382
+
121
383
  else:
122
384
  log(" No internal GoTo or Resolved Action links found.")
123
385
  log("-" * SEP_COUNT)
124
386
 
125
387
  # --- Section 3: ACTIVE URI LINKS ---
126
388
  log("\n" + "=" * SEP_COUNT)
127
- log(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
389
+ log(f"## Active URI Links (External) - {len(external_uri_links)} found")
128
390
  log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
129
391
  log("=" * SEP_COUNT)
130
392
 
131
- if uri_and_other:
132
- for i, link in enumerate(uri_and_other[:limit], 1):
393
+ if external_uri_links:
394
+ for i, link in enumerate(external_uri_links, 1):
133
395
  target = link.get('url') or link.get('remote_file') or link.get('target')
134
396
  link_text = link.get('link_text', 'N/A')
135
397
  log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
136
- if limit is not None and len(uri_and_other) > limit:
137
- log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
138
398
 
139
399
  else:
140
- log(" No external or 'Other' links found.")
400
+ log(" No external links found.")
141
401
  log("-" * SEP_COUNT)
142
402
 
143
- log("\n--- Analysis Complete ---\n")
403
+ # --- Section 4: OTHER LINKS ---
404
+ log("\n" + "=" * SEP_COUNT)
405
+ log(f"## Other Links - {len(other_links)} found")
406
+ log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
407
+ log("=" * SEP_COUNT)
408
+
409
+ if other_links:
410
+ for i, link in enumerate(other_links, 1):
411
+ target = link.get('url') or link.get('remote_file') or link.get('target')
412
+ link_text = link.get('link_text', 'N/A')
413
+ log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
144
414
 
145
- # Final aggregation of the buffer into one string
146
- report_buffer_str = "\n".join(report_buffer)
415
+ else:
416
+ log(" No 'Other' links found.")
417
+ log("-" * SEP_COUNT)
147
418
 
148
419
  # Return the collected data for potential future JSON/other output
149
- final_report_data_dict = {
150
- "external_links": uri_links,
420
+ report_data_dict = {
421
+ "external_links": external_uri_links,
151
422
  "internal_links": all_internal,
152
- "toc": structural_toc
423
+ "toc": structural_toc,
424
+ "validation": EMPTY_VALIDATION.copy()
153
425
  }
154
426
 
427
+ intermediate_report_results = {
428
+ "data": report_data_dict, # The structured JSON-ready dict
429
+ "text": "",
430
+ "metadata": { # Helpful for the GUI/Logs
431
+ "file_overview": {
432
+ "pdf_name": pdf_name,
433
+ "total_pages": total_pages,
434
+ },
435
+ "library_used": pdf_library,
436
+ "link_counts": {
437
+ "toc_entry_count": toc_entry_count,
438
+ "internal_goto_links_count": internal_goto_links_count,
439
+ "interal_resolve_action_links_count": interal_resolve_action_links_count,
440
+ "total_internal_links_count": total_internal_links_count,
441
+ "external_uri_links_count": external_uri_links_count,
442
+ "other_links_count": other_links_count,
443
+ "total_links_count": total_links_count
444
+ }
445
+ }
446
+ }
447
+
448
+ log("\n--- Analysis Complete ---")
449
+
450
+ validation_results = run_validation(report_results=intermediate_report_results,
451
+ pdf_path=pdf_path)
452
+ log(validation_results.get("summary-txt",""), overview = True)
453
+
454
+ # CRITICAL: Re-assign to report_results so it's available for the final return
455
+ report_results = copy.deepcopy(intermediate_report_results)
456
+
457
+ # --- Offline Risk Analysis (Security Layer) ---
458
+ risk_results = compute_risk(report_results)
459
+ report_results["data"]["risk"] = risk_results
460
+
461
+ # Final aggregation of the buffer into one string, after the last call to log()
462
+ report_buffer_str = "\n".join(report_buffer)
463
+ report_buffer_overview_str = "\n".join(report_buffer_overview)
464
+
465
+ report_results["data"]["validation"].update(validation_results)
466
+ #report_results["text"].update(report_buffer_str) # The human-readable string
467
+ report_results["text"] = report_buffer_str
468
+
155
469
  # 5. Export Report
156
470
  #if export_format:
157
471
  # # Assuming export_to will hold the output format string (e.g., "JSON")
158
- # export_report_data(final_report_data_dict, Path(pdf_path).name, export_format, pdf_library)
472
+ # export_report_data(report_data_dict, pdf_name, export_format, pdf_library)
159
473
 
160
- if export_format:
161
- fmt_upper = export_format.upper()
162
-
163
- if "JSON" in fmt_upper:
164
- export_report_json(final_report_data_dict, pdf_path, pdf_library)
474
+ if print_bool:
475
+ if concise_print:
476
+ print(report_buffer_overview_str)
477
+ else:
478
+ print(report_buffer_str)
165
479
 
166
- if "TXT" in fmt_upper:
167
- export_report_txt(report_buffer_str, pdf_path, pdf_library)
168
-
169
- report_results = {
170
- "data": final_report_data_dict, # The structured JSON-ready dict
171
- "text": report_buffer_str, # The human-readable string
172
- "metadata": { # Helpful for the GUI/Logs
173
- "pdf_name": Path(pdf_path).name,
174
- "library_used": pdf_library,
175
- "total_links": len(extracted_links)
176
- }
177
- }
178
- # Return a clean results object
179
480
  return report_results
481
+
180
482
  except Exception as e:
181
483
  # Specific handling for common read failures
182
- if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
484
+ if True:#"invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
183
485
  log(f"\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
184
486
  log("No hyperlinks or TOC can exist in this file.")
185
487
  log("Result: No links found.")
186
488
  return {
187
- "data": {"external_links": [], "internal_links": [], "toc": []},
489
+ "data": {"external_links": [], "internal_links": [], "toc": [], "validation": EMPTY_VALIDATION.copy()},
188
490
  "text": "\n".join(report_buffer + [
189
491
  "\nWarning: PDF appears to be image-only or malformed.",
190
492
  "No hyperlinks or structural TOC found."
191
493
  ]),
192
494
  "metadata": {
193
- "pdf_name": Path(pdf_path).name,
495
+ "file_overview": {
496
+ "pdf_name": pdf_name,
497
+ "total_pages": total_pages,
498
+ },
194
499
  "library_used": pdf_library,
195
- "total_links": 0
500
+ "link_counts": {
501
+ "toc_entry_count": 0,
502
+ "internal_goto_links_count": 0,
503
+ "interal_resolve_action_links_count": 0,
504
+ "total_internal_links_count": 0,
505
+ "external_uri_links_count": 0,
506
+ "other_links_count": 0,
507
+ "total_links_count": 0
508
+ }
196
509
  }
197
510
  }
198
511
 
512
+ #except Exception as e:
513
+ # # Log the critical failure
514
+ # error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
515
+ # log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
516
+ # raise # Allow the exception to propagate or handle gracefully
199
517
  except Exception as e:
200
- # Log the critical failure
201
518
  error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
202
- log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
203
- raise # Allow the exception to propagate or handle gracefully
519
+ log(f"FATAL: Analysis failed: {str(e)}. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
520
+
521
+ # Always return a safe empty result on error
522
+ return {
523
+ "data": {
524
+ "external_links": [],
525
+ "internal_links": [],
526
+ "toc": [],
527
+ "validation": EMPTY_VALIDATION.copy()
528
+ },
529
+ "text": "\n".join(report_buffer + [
530
+ "\n--- Analysis failed ---",
531
+ f"Error: {str(e)}",
532
+ "No links or TOC extracted."
533
+ ]),
534
+ "metadata": {
535
+ "file_overview": {
536
+ "pdf_name": pdf_name,
537
+ "total_pages": total_pages,
538
+ },
539
+ "library_used": pdf_library,
540
+ "link_counts": {
541
+ "toc_entry_count": 0,
542
+ "internal_goto_links_count": 0,
543
+ "interal_resolve_action_links_count": 0,
544
+ "total_internal_links_count": 0,
545
+ "external_uri_links_count": 0,
546
+ "other_links_count": 0,
547
+ "total_links_count": 0
548
+ }
549
+ }
550
+ }
551
+
552
def _return_empty_report(report_buffer: "list[str] | str", pdf_library: str) -> dict:
    """
    Build a placeholder report containing no links, no TOC and zeroed counts.

    Args:
        report_buffer: Log output collected so far. A sequence of lines is
            joined with newlines; an already-joined string is used verbatim.
        pdf_library: Name of the PDF engine, stored under
            ``metadata["library_used"]``.

    Returns:
        A dict with ``"data"``, ``"text"`` and ``"metadata"`` keys whose link
        lists are empty and whose counters are all zero.
    """
    # "\n".join() on a plain string would insert a newline between every
    # character, so an already-assembled string is passed through untouched.
    if isinstance(report_buffer, str):
        text = report_buffer
    else:
        text = "\n".join(report_buffer)

    return {
        "data": {
            "external_links": [],
            "internal_links": [],
            "toc": [],
            "validation": EMPTY_VALIDATION.copy()
        },
        "text": text,
        "metadata": {
            "file_overview": {
                "pdf_name": "null",
                "total_pages": 0,
            },
            "library_used": pdf_library,
            "link_counts": {
                "toc_entry_count": 0,
                "internal_goto_links_count": 0,
                # Key spelling ("interal") is kept as-is: it is the runtime
                # key this module emits everywhere else.
                "interal_resolve_action_links_count": 0,
                "total_internal_links_count": 0,
                "external_uri_links_count": 0,
                "other_links_count": 0,
                "total_links_count": 0
            }
        }
    }
581
+
582
def _generate_text_report(
    pdf_path: str,
    library: str,
    ext_links: list,
    goto_links: list,
    resolve_links: list,
    other_links: list,
    toc: list
) -> str:
    """
    Assemble the human-readable report text (console / TXT export).

    The report has five parts, in order: a summary header, the structural
    TOC (as formatted by ``get_structural_toc``), the internal-jump table,
    the external-URI table and the "other links" table.

    Args:
        pdf_path: Path of the analysed PDF; shown via ``get_friendly_path``.
        library: Name of the PDF engine, shown in the preamble.
        ext_links: Link dicts listed in the external-URI table.
        goto_links: Link dicts for GoTo jumps.
        resolve_links: Link dicts for resolved-action jumps; listed after
            ``goto_links`` in the same table.
        other_links: Link dicts listed in the "other links" table.
        toc: Structural TOC entries.

    Returns:
        The whole report as a single newline-joined string.
    """
    row_template = "{:<5} | {:<5} | {:<40} | {}"
    heavy_rule = "=" * SEP_COUNT
    light_rule = "-" * SEP_COUNT
    shown_path = get_friendly_path(pdf_path)

    internal_count = len(goto_links) + len(resolve_links)
    overall_count = len(ext_links) + internal_count + len(other_links)

    def target_rows(entries, placeholder):
        """Return table rows for target-style links, or the placeholder line."""
        if not entries:
            return [placeholder]
        rows = []
        for position, entry in enumerate(entries, 1):
            destination = entry.get('url') or entry.get('remote_file') or entry.get('target', 'N/A')
            rows.append(row_template.format(
                position, entry.get('page', 0), entry.get('link_text', 'N/A')[:40], destination
            ))
        return rows

    report = [
        # Preamble
        "\n--- Starting Analysis ... ---\n",
        f"Target file: {shown_path}",
        f"PDF Engine: {library}",
        # 1. Summary header
        "\n" + heavy_rule,
        f"--- Link Analysis Results for {shown_path} ---",
        f"Total active links: {overall_count} (External: {len(ext_links)}, Internal Jumps: {internal_count}, Other: {len(other_links)})",
        f"Total **structural TOC entries (bookmarks)** found: {len(toc)}",
        heavy_rule,
        # 2. Table of contents
        get_structural_toc(toc),
        # 3. Internal jumps
        "\n" + heavy_rule,
        f"## Active Internal Jumps (GoTo & Resolved Actions) - {internal_count} found",
        heavy_rule,
        row_template.format("Idx", "Page", "Anchor Text", "Jumps To Page"),
        light_rule,
    ]

    jumps = goto_links + resolve_links
    if jumps:
        for position, entry in enumerate(jumps, 1):
            # Source and destination are stored as indices; PageRef yields
            # the human-facing page label.
            origin = PageRef.from_index(entry.get('page', 0)).human
            landing = PageRef.from_index(entry.get('destination_page', 0)).human
            report.append(row_template.format(
                position, origin, entry.get('link_text', 'N/A')[:40], landing
            ))
    else:
        report.append(" No internal GoTo or Resolved Action links found.")
    report.append(light_rule)

    # 4. External URI links
    report += [
        "\n" + heavy_rule,
        f"## Active URI Links (External) - {len(ext_links)} found",
        row_template.format("Idx", "Page", "Anchor Text", "Target URI/Action"),
        heavy_rule,
    ]
    report += target_rows(ext_links, " No external links found.")
    report.append(light_rule)

    # 5. Other links
    report += [
        "\n" + heavy_rule,
        f"## Other Links - {len(other_links)} found",
        row_template.format("Idx", "Page", "Anchor Text", "Target Action"),
        heavy_rule,
    ]
    report += target_rows(other_links, " No 'Other' links found.")
    report.append(light_rule)

    return "\n".join(report)
662
+
663
def _generate_text_report__(pdf_path, library, ext_links, int_links, other_links, toc) -> str:
    """
    Compact variant of the text report builder.

    Emits the preamble, a summary header, the structural TOC, the internal
    jump table and the external URI table. ``other_links`` contributes only
    to the "Total active links" figure; its entries are not listed.

    Returns:
        The report as a single newline-joined string.
    """
    row_template = "{:<5} | {:<5} | {:<40} | {}"
    heavy_rule = "=" * SEP_COUNT
    shown_path = get_friendly_path(pdf_path)

    report = [
        "\n--- Starting Analysis ... ---\n",
        f"Target file: {shown_path}",
        f"PDF Engine: {library}",
        # 1. Summary header
        "\n" + heavy_rule,
        f"--- Link Analysis Results for {shown_path} ---",
        f"Total active links: {len(ext_links) + len(int_links) + len(other_links)}",
        f"Total bookmarks: {len(toc)}",
        heavy_rule,
        # 2. Table of contents
        get_structural_toc(toc),
        # 3. Internal jumps (GoTo & resolved)
        "\n" + heavy_rule,
        f"## Active Internal Jumps - {len(int_links)} found",
        heavy_rule,
        row_template.format("Idx", "Page", "Anchor Text", "Jumps To"),
    ]

    for position, entry in enumerate(int_links, 1):
        origin = PageRef.from_index(entry.get('page', 0)).human
        landing = PageRef.from_index(entry.get('destination_page', 0)).human
        report.append(row_template.format(position, origin, entry.get('link_text', 'N/A')[:40], landing))

    # 4. External URI links
    report += [
        "\n" + heavy_rule,
        f"## External URI Links - {len(ext_links)} found",
        heavy_rule,
    ]
    for position, entry in enumerate(ext_links, 1):
        destination = entry.get('url') or entry.get('target', 'N/A')
        report.append(row_template.format(
            position, entry.get('page', 0), entry.get('link_text', 'N/A')[:40], destination
        ))

    return "\n".join(report)
699
+
700
+ def _build_metadata(
701
+ pdf_name: str,
702
+ total_pages: int,
703
+ library_used: str,
704
+ toc_entry_count: int,
705
+ internal_goto_links_count: int,
706
+ interal_resolve_action_links_count: int,
707
+ external_uri_links_count: int,
708
+ other_links_count: int
709
+ ) -> Dict[str, Any]:
710
+ """
711
+ Standardizes the metadata dictionary using the EXACT legacy variable names.
712
+ """
713
+ total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
714
+ total_links_count = total_internal_links_count + external_uri_links_count + other_links_count
715
+
716
+ return {
717
+ "file_overview": {
718
+ "pdf_name": pdf_name,
719
+ "total_pages": total_pages,
720
+ },
721
+ "library_used": library_used,
722
+ "link_counts": {
723
+ "toc_entry_count": toc_entry_count,
724
+ "internal_goto_links_count": internal_goto_links_count,
725
+ "interal_resolve_action_links_count": interal_resolve_action_links_count,
726
+ "total_internal_links_count": total_internal_links_count,
727
+ "external_uri_links_count": external_uri_links_count,
728
+ "other_links_count": other_links_count,
729
+ "total_links_count": total_links_count
730
+ }
731
+ }
732
+
733
+ def _build_metadata_(
734
+ pdf_name: str,
735
+ total_pages: int,
736
+ library_used: str,
737
+ toc_count: int,
738
+ goto_count: int,
739
+ resolve_count: int,
740
+ ext_count: int,
741
+ other_count: int
742
+ ) -> Dict[str, Any]:
743
+ """Standardizes the metadata dictionary for all report types."""
744
+ return {
745
+ "file_overview": {
746
+ "pdf_name": pdf_name,
747
+ "total_pages": total_pages,
748
+ },
749
+ "library_used": library_used,
750
+ "link_counts": {
751
+ "toc_entry_count": toc_count,
752
+ "internal_links_count": goto_count,
753
+ "external_uri_links_count": ext_count,
754
+ "other_links_count": other_count,
755
+ "total_links_count": goto_count + ext_count + other_count
756
+ }
757
+ }
758
+
759
+ def get_structural_toc(structural_toc: list) -> str:
237
760
  """
238
761
  Formats the structural TOC data into a hierarchical string and optionally prints it.
239
762
 
240
763
  Args:
241
764
  structural_toc: A list of TOC dictionaries.
242
- print_bool: Whether to print the output to the console.
243
765
 
244
766
  Returns:
245
767
  A formatted string of the structural TOC.
@@ -253,8 +775,6 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
253
775
  msg = "No structural TOC (bookmarks/outline) found."
254
776
  lines.append(msg)
255
777
  output = "\n".join(lines)
256
- if print_bool:
257
- print(output)
258
778
  return output
259
779
 
260
780
  # Determine max page width for consistent alignment
@@ -267,16 +787,67 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
267
787
  indent = " " * 4 * (item['level'] - 1)
268
788
  # Handle cases where page might be N/A or None
269
789
  target_page = item.get('target_page', "N/A")
270
- page_str = str(target_page).rjust(page_width)
271
790
 
791
+ # Determine the human-facing string
792
+ if isinstance(target_page, int):
793
+ # Convert 0-index back to human (1-index) for the report
794
+ display_val = PageRef.from_index(target_page).human
795
+ else:
796
+ display_val = str(target_page)
797
+
798
+ page_str = str(display_val).rjust(page_width)
799
+
272
800
  lines.append(f"{indent}{item['title']} . . . page {page_str}")
273
801
 
274
802
  lines.append("-" * SEP_COUNT)
275
803
 
276
804
  # Final aggregation
277
805
  str_structural_toc = "\n".join(lines)
278
-
279
- if print_bool:
280
- print(str_structural_toc)
281
806
 
282
807
  return str_structural_toc
808
+
809
+ import unicodedata
810
+
811
def sanitize_glyphs_for_compatibility(text: str) -> str:
    """
    Replace status emojis with ASCII tags and reduce the text to pure ASCII.

    Intended to prevent rendering bugs in gedit/WSL2. Known glyphs are
    swapped for bracketed tags first; the remaining text is NFKD-normalised
    and every character that still has no ASCII form is dropped.

    Args:
        text: Report text that may contain emojis or other non-ASCII glyphs.

    Returns:
        An ASCII-only string.
    """
    # Keys are the bare code points. Each glyph may or may not be followed
    # by the emoji variation selector (U+FE0F), so both forms are replaced.
    # Previously only the selector-suffixed warning/info glyphs matched: a
    # bare U+26A0 was silently deleted and a bare U+2139 turned into "i".
    variation_selector = '\ufe0f'
    glyph_mapping = {
        '\u2705': '[PASS]',      # check mark button
        '\U0001F310': '[WEB]',   # globe with meridians
        '\u26a0': '[WARN]',      # warning sign
        '\u274c': '[FAIL]',      # cross mark
        '\u2139': '[INFO]',      # information source
    }
    for glyph, replacement in glyph_mapping.items():
        # Longest form first so no stray selector is left behind.
        text = text.replace(glyph + variation_selector, replacement)
        text = text.replace(glyph, replacement)

    # Standard library only - no unidecode dependency
    normalized = unicodedata.normalize('NFKD', text)
    ascii_text = normalized.encode('ascii', 'ignore').decode('ascii')
    # NOTE(review): as visible here this replaces one space with one space
    # (a no-op); the upstream source may have collapsed double spaces -
    # confirm before changing.
    return ascii_text.replace(' ', ' ')
826
+
827
+
828
+
829
if __name__ == "__main__":
    # Manual smoke test: analyse the first PDF found in the current working
    # directory and report whether any data came back.
    from pdflinkcheck.io import get_first_pdf_in_cwd

    pdf_path = get_first_pdf_in_cwd()

    # Prefer PyMuPDF when it is installed; otherwise fall back to pypdf.
    pdf_library = "pymupdf" if pymupdf_is_available() else "pypdf"

    report = run_report_and_call_exports(
        pdf_path=pdf_path,
        export_format="",
        pdf_library=pdf_library,
        print_bool=True,
    )

    if report and report.get("data"):
        print("Success!")
        print(f"list(report['data']) = {list(report['data'])}")
    else:
        print("No data extracted — nothing to validate.")
        sys.exit(1)
853
+