pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/report.py CHANGED
@@ -1,37 +1,195 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
3
  # pdflinkcheck/report.py
4
-
4
+ from __future__ import annotations
5
5
  import sys
6
6
  from pathlib import Path
7
7
  from typing import Optional, Dict, Any
8
8
  import pyhabitat
9
+ import copy
9
10
 
10
11
  from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
11
- from pdflinkcheck.environment import pymupdf_is_available
12
+ from pdflinkcheck.environment import pymupdf_is_available, pdfium_is_available
12
13
  from pdflinkcheck.validate import run_validation
14
+ from pdflinkcheck.security import compute_risk
15
+ from pdflinkcheck.helpers import debug_head, PageRef
16
+
13
17
 
14
18
  SEP_COUNT=28
15
-
16
- def run_report_and_call_exports(pdf_path: str = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
17
- # The meat and potatoes
18
- report_results = run_report_and_validtion(
19
+ # Define a safe "empty" validation state
20
+ EMPTY_VALIDATION = {
21
+ "summary-stats": {
22
+ "total_checked": 0,
23
+ "valid": 0,
24
+ "file-found": 0,
25
+ "broken-page": 0,
26
+ "broken-file": 0,
27
+ "no_destination_page_count": 0,
28
+ "unknown-web": 0,
29
+ "unknown-reasonableness": 0,
30
+ "unknown-link": 0
31
+ },
32
+ "issues": [],
33
+ "summary-txt": "Analysis failed: No validation performed.",
34
+ "total_pages": 0
35
+ }
36
+
37
+
38
+ def run_report_and_call_exports(
39
+ pdf_path: str = None,
40
+ export_format: str = "JSON",
41
+ pdf_library: str = "auto",
42
+ print_bool:bool=True,
43
+ ) -> Dict[str, Any]:
44
+ """
45
+ Public entry point. Orchestrates extraction, validation, and file exports.
46
+ """
47
+ # The meat and potatoes
48
+ report_results = run_report_extraction_and_assessment_and_recording(
19
49
  pdf_path=str(pdf_path),
20
- max_links=max_links,
21
50
  pdf_library = pdf_library,
51
+ print_bool=print_bool,
22
52
  )
53
+ # 2. Initialize file path tracking
54
+ output_path_json = None
55
+ output_path_txt = None
56
+
23
57
  if export_format:
24
58
  report_data_dict = report_results["data"]
25
59
  report_buffer_str = report_results["text"]
26
60
  if "JSON" in export_format.upper():
27
- export_report_json(report_data_dict, pdf_path, pdf_library)
28
-
61
+ output_path_json = export_report_json(report_data_dict, pdf_path, pdf_library)
29
62
  if "TXT" in export_format.upper():
30
- export_report_txt(report_buffer_str, pdf_path, pdf_library)
63
+ output_path_txt = export_report_txt(report_buffer_str, pdf_path, pdf_library)
64
+
65
+ # 4. Inject the file info into the results dictionary
66
+ report_results["files"] = {
67
+ "export_path_json": output_path_json,
68
+ "export_path_txt": output_path_txt
69
+ }
31
70
  return report_results
71
+
72
+ def _get_engine_data(pdf_path: str, pdf_library: str) -> tuple[Dict, str]:
73
+ """Handles the dirty work of switching engines and importing them."""
74
+ # Resolve 'auto' mode
75
+ if pdf_library == "auto":
76
+ if pdfium_is_available(): pdf_library = "pdfium"
77
+ elif pymupdf_is_available(): pdf_library = "pymupdf"
78
+ else: pdf_library = "pypdf"
79
+
80
+ # Map engine names to their respective modules
81
+ engines = {
82
+ "pdfium": "pdflinkcheck.analysis_pdfium",
83
+ "pypdf": "pdflinkcheck.analysis_pypdf", # Assuming this exists
84
+ "pymupdf": "pdflinkcheck.analysis_pymupdf"
85
+ }
86
+
87
+ if pdf_library not in engines:
88
+ raise ValueError(f"Unsupported library: {pdf_library}")
89
+
90
+ # Dynamic import to keep __init__ lean
91
+ import importlib
92
+ module = importlib.import_module(engines[pdf_library])
93
+ data = module.analyze_pdf(pdf_path) or {"links": [], "toc": [], "file_ov": {}}
32
94
 
95
+ return data, pdf_library
96
+
97
+ # ----- Refactored version, failing ----
98
+ def run_report_extraction_and_assessment_and_recording_(
99
+ pdf_path: str = None,
100
+ pdf_library: str = "auto",
101
+ print_bool: bool = True
102
+ ) -> Dict[str, Any]:
103
+ """
104
+ Orchestrates extraction, categorization, and validation.
105
+ FULLY RECONCILED with legacy logic to ensure no features are lost.
106
+ """
107
+ if pdf_path is None:
108
+ return _return_empty_report(["pdf_path is None"], pdf_library)
109
+
110
+ try:
111
+ # 1. Extraction
112
+ raw_data, resolved_library = _get_engine_data(pdf_path, pdf_library)
113
+
114
+ extracted_links = raw_data.get("links", [])
115
+ structural_toc = raw_data.get("toc", [])
116
+ file_ov = raw_data.get("file_ov", {})
117
+ total_pages = file_ov.get("total_pages", 0)
118
+ pdf_name = Path(pdf_path).name
119
+
120
+ # 2. Categorization (Restored exactly from original logic)
121
+ external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
122
+ goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
123
+ resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
124
+ other_links = [link for link in extracted_links if link['type'] not in
125
+ ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
126
+
127
+ all_internal = goto_links + resolved_action_links
33
128
 
34
- def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
129
+ # 3. Generate the Text Report (Using get_friendly_path as required)
130
+ # We pass the separate lists to maintain Section 2, 3, and 4 formatting
131
+ report_text_base = _generate_text_report(
132
+ pdf_path=pdf_path,
133
+ library=resolved_library,
134
+ ext_links=external_uri_links,
135
+ goto_links=goto_links,
136
+ resolve_links=resolved_action_links,
137
+ other_links=other_links,
138
+ toc=structural_toc
139
+ )
140
+
141
+ # 4. Initial Result Assembly
142
+ report_results = {
143
+ "data": {
144
+ "external_links": external_uri_links,
145
+ "internal_links": goto_links + resolved_action_links,
146
+ "toc": structural_toc,
147
+ "validation": EMPTY_VALIDATION.copy()
148
+ },
149
+ "text": report_text_base,
150
+ "metadata": _build_metadata(
151
+ pdf_name=pdf_name,
152
+ total_pages=total_pages,
153
+ library_used=resolved_library,
154
+ toc_entry_count=len(structural_toc),
155
+ internal_goto_links_count=len(goto_links),
156
+ interal_resolve_action_links_count=len(resolved_action_links),
157
+ external_uri_links_count=len(external_uri_links),
158
+ other_links_count=len(other_links)
159
+ )
160
+ }
161
+
162
+ # 5. Validation & Risk Analysis
163
+ validation_results = run_validation(report_results=report_results, pdf_path=pdf_path)
164
+ report_results["data"]["validation"].update(validation_results)
165
+ report_results["data"]["risk"] = compute_risk(report_results)
166
+
167
+ # --- Inside run_report_extraction_and_assessment_and_recording ---
168
+ # 6. Finalizing Text Buffer
169
+ val_summary = validation_results.get("summary-txt", "")
170
+ raw_text = report_text_base + f"\n{val_summary}\n--- Analysis Complete ---"
171
+ cleaned_text = sanitize_glyphs_for_compatibility(raw_text)
172
+ # Apply sanitization before returning
173
+ report_results["text"] = cleaned_text
174
+ #report_results["text"] = raw_text
175
+
176
+ if print_bool:
177
+ # Matches your original logic: print the overview/validation summary to console
178
+ print(val_summary)
179
+
180
+ return report_results
181
+
182
+ except Exception as e:
183
+ error_logger.error(f"Critical failure: {e}", exc_info=True)
184
+ return _return_empty_report([f"FATAL: {str(e)}"], pdf_library)
185
+
186
+ # ----- Revert to stable version ----
187
+ def run_report_extraction_and_assessment_and_recording(
188
+ pdf_path: str = None,
189
+ pdf_library: str = "auto",
190
+ print_bool:bool=True,
191
+ concise_print: bool=False,
192
+ ) -> Dict[str, Any]:
35
193
  """
36
194
  Core high-level PDF link analysis logic.
37
195
 
@@ -39,10 +197,8 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
39
197
  using pdflinkcheck analysis, and
40
198
  prints a comprehensive, user-friendly report to the console.
41
199
 
42
- Args:
200
+ Args:
43
201
  pdf_path: The file system path (str) to the target PDF document.
44
- max_links: Maximum number of links to display in each console
45
- section. If <= 0, all links will be displayed.
46
202
 
47
203
  Returns:
48
204
  A dictionary containing the structured results of the analysis:
@@ -54,18 +210,58 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
54
210
  """
55
211
 
56
212
  report_buffer = []
213
+ report_buffer_overview = []
57
214
 
58
215
  # Helper to handle conditional printing and mandatory buffering
59
- def log(msg: str):
60
- if print_bool:
61
- print(msg)
216
+ def log(msg: str, overview: bool = False):
62
217
  report_buffer.append(msg)
218
+ if overview:
219
+ report_buffer_overview.append(msg)
220
+
63
221
 
64
- # Expected: "pypdf" or "PyMuPDF"
65
- allowed_libraries = ("pypdf","pymupdf")
222
+
223
+ # Expected: "pypdf" or "PyMuPDF" pr "rust"
224
+ allowed_libraries = ("pypdf", "pymupdf", "pdfium", "auto")
66
225
  pdf_library = pdf_library.lower()
67
- if pdf_library in allowed_libraries and pdf_library == "pypdf":
68
- from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
226
+
227
+ log("\n--- Starting Analysis ... ---\n")
228
+ if pdf_path is None:
229
+ log("pdf_path is None", overview=True)
230
+ log("Tip: Drop a PDF in the current folder or pass in a path arg.")
231
+ _return_empty_report(report_buffer)
232
+ else:
233
+ pdf_name = Path(pdf_path).name
234
+
235
+ # AUTO MODE
236
+ if pdf_library == "auto":
237
+ if pdfium_is_available():
238
+ pdf_library = "pdfium"
239
+ elif pymupdf_is_available():
240
+ pdf_library = "pymupdf"
241
+ else:
242
+ pdf_library = "pypdf"
243
+
244
+
245
+
246
+ # PDFium ENGINE
247
+ if pdf_library in allowed_libraries and pdf_library == "pdfium":
248
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pdfium
249
+ data = analyze_pdf_pdfium(pdf_path) or {"links": [], "toc": [], "file_ov": []}
250
+ extracted_links = data.get("links", [])
251
+ structural_toc = data.get("toc", [])
252
+ file_ov = data.get("file_ov", [])
253
+
254
+ # pypdf ENGINE
255
+ elif pdf_library in allowed_libraries and pdf_library == "pypdf":
256
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pypdf
257
+ #extracted_links = extract_links(pdf_path)
258
+ #structural_toc = extract_toc(pdf_path)
259
+ data = analyze_pdf_pypdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
260
+ extracted_links = data.get("links", [])
261
+ structural_toc = data.get("toc", [])
262
+ file_ov = data.get("file_ov", [])
263
+
264
+ # PyMuPDF Engine
69
265
  elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
70
266
  if not pymupdf_is_available():
71
267
  print("PyMuPDF was explicitly requested as the PDF Engine")
@@ -76,43 +272,36 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
76
272
  print("PyMuPDF is not expected to work on Termux. Use pypdf.")
77
273
  print("\n")
78
274
  #return
79
- raise ImportError(f"The 'fitz' module is required for this functionality. Original error: {e}") from e
80
- from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
275
+ raise ImportError("The 'fitz' module (PyMuPDF) is required but not installed.")
276
+
277
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pymupdf
278
+ data = analyze_pdf_pymupdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
279
+ extracted_links = data.get("links", [])
280
+ structural_toc = data.get("toc", [])
281
+ file_ov = data.get("file_ov", [])
282
+
283
+ total_pages = file_ov.get("total_pages",0)
81
284
 
82
- log("\n--- Starting Analysis ... ---\n")
83
- if pdf_path is None:
84
- log("pdf_path is None")
85
- log("Tip: Drop a PDF in the current folder or pass in a path arg.")
86
- empty_report = {
87
- "data": {
88
- "external_links": [],
89
- "internal_links": [],
90
- "toc": []
91
- },
92
- "text": "\n".join(report_buffer),
93
- "metadata": {
94
- "pdf_name": Path(pdf_path).name,
95
- "library_used": pdf_library,
96
- "total_links": 0
97
- }
98
- }
99
285
 
100
- return empty_report
101
286
 
102
287
  try:
103
- log(f"Target file: {get_friendly_path(pdf_path)}")
104
- log(f"PDF Engine: {pdf_library}")
288
+ log(f"Target file: {get_friendly_path(pdf_path)}", overview=True)
289
+ log(f"PDF Engine: {pdf_library}", overview=True)
105
290
 
106
- # 1. Extract all active links and TOC
107
- extracted_links = extract_links(pdf_path)
108
- structural_toc = extract_toc(pdf_path)
109
- #structural_toc = extract_toc_pypdf(pdf_path)
110
291
  toc_entry_count = len(structural_toc)
292
+ str_structural_toc = get_structural_toc(structural_toc)
293
+
294
+ # check the structure, that it matches
295
+ if False:
296
+ print(f"pdf_library={pdf_library}")
297
+ debug_head("TOC", structural_toc, n=3)
298
+ debug_head("Links", list(extracted_links), n=3)
111
299
 
300
+ # THIS HITS
112
301
 
113
302
  if not extracted_links and not structural_toc:
114
- log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
115
- log("(This is common for scanned/image-only PDFs.)")
303
+ log(f"\nNo hyperlinks or structural TOC found in {pdf_name}.", overview=True)
304
+ log("(This is common for scanned/image-only PDFs.)", overview=True)
116
305
 
117
306
  empty_result = {
118
307
  "data": {
@@ -122,101 +311,156 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
122
311
  },
123
312
  "text": "\n".join(report_buffer),
124
313
  "metadata": {
125
- "pdf_name": Path(pdf_path).name,
314
+ "file_overview": {
315
+ "pdf_name": pdf_name,
316
+ "total_pages": total_pages,
317
+ },
126
318
  "library_used": pdf_library,
127
- "total_links": 0
319
+ "link_counts": {
320
+ "toc_entry_count": 0,
321
+ "internal_goto_links_count": 0,
322
+ "interal_resolve_action_links_count": 0,
323
+ "total_internal_links_count": 0,
324
+ "external_uri_links_count": 0,
325
+ "other_links_count": 0,
326
+ "total_links_count": 0
327
+ }
128
328
  }
129
329
  }
130
330
  return empty_result
131
331
 
132
332
  # 3. Separate the lists based on the 'type' key
133
- uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
333
+ external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
134
334
  goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
135
335
  resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
136
336
  other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
137
337
 
138
- total_internal_links = len(goto_links) + len(resolved_action_links)
139
- limit = max_links if max_links > 0 else None
140
- uri_and_other = uri_links + other_links
338
+ interal_resolve_action_links_count = len(resolved_action_links)
339
+ internal_goto_links_count = len(goto_links)
340
+ total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
341
+
342
+ external_uri_links_count = len(external_uri_links)
343
+ other_links_count = len(other_links)
344
+
345
+ total_links_count = len(extracted_links)
141
346
 
142
- str_structural_toc = get_structural_toc(structural_toc)
143
-
144
347
  # --- ANALYSIS SUMMARY (Using your print logic) ---
145
- log("\n" + "=" * SEP_COUNT)
146
- log(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
147
- log(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
148
- log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
149
- log("=" * SEP_COUNT)
348
+ log("\n" + "=" * SEP_COUNT, overview = True)
349
+ log(f"--- Link Analysis Results for {pdf_name} ---", overview = True)
350
+ log(f"Total active links: {total_links_count} (External: {external_uri_links_count}, Internal Jumps: {total_internal_links_count}, Other: {other_links_count})",overview = True)
351
+ log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}",overview = True)
352
+ log("=" * SEP_COUNT,overview = True)
150
353
 
151
354
  # --- Section 1: TOC ---
152
355
  log(str_structural_toc)
153
356
 
154
357
  # --- Section 2: ACTIVE INTERNAL JUMPS ---
155
358
  log("\n" + "=" * SEP_COUNT)
156
- log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
359
+ log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links_count} found")
157
360
  log("=" * SEP_COUNT)
158
361
  log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
159
362
  log("-" * SEP_COUNT)
160
363
 
161
364
  all_internal = goto_links + resolved_action_links
162
- if total_internal_links > 0:
163
- for i, link in enumerate(all_internal[:limit], 1):
365
+ #If links were found: all_internal is a list with dictionaries. It evaluates to True.
366
+ # If NO links were found: all_internal is an empty list []. It evaluates to False.
367
+ if all_internal:
368
+ for i, link in enumerate(all_internal, 1):
164
369
  link_text = link.get('link_text', 'N/A')
165
- log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
166
370
 
167
- if limit is not None and len(all_internal) > limit:
168
- log(f"... and {len(all_internal) - limit} more links (use --max-links 0 to show all).")
371
+ # Convert source and destination indices to human strings
372
+ src_page = PageRef.from_index(link['page']).human
373
+ dest_page = PageRef.from_index(link['destination_page']).human
374
+
375
+ log("{:<5} | {:<5} | {:<40} | {}".format(
376
+ i,
377
+ src_page,
378
+ link_text[:40],
379
+ dest_page
380
+ ))
381
+
382
+
169
383
  else:
170
384
  log(" No internal GoTo or Resolved Action links found.")
171
385
  log("-" * SEP_COUNT)
172
386
 
173
387
  # --- Section 3: ACTIVE URI LINKS ---
174
388
  log("\n" + "=" * SEP_COUNT)
175
- log(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
389
+ log(f"## Active URI Links (External) - {len(external_uri_links)} found")
176
390
  log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
177
391
  log("=" * SEP_COUNT)
178
392
 
179
- if uri_and_other:
180
- for i, link in enumerate(uri_and_other[:limit], 1):
393
+ if external_uri_links:
394
+ for i, link in enumerate(external_uri_links, 1):
181
395
  target = link.get('url') or link.get('remote_file') or link.get('target')
182
396
  link_text = link.get('link_text', 'N/A')
183
397
  log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
184
- if limit is not None and len(uri_and_other) > limit:
185
- log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
186
398
 
187
399
  else:
188
- log(" No external or 'Other' links found.")
400
+ log(" No external links found.")
189
401
  log("-" * SEP_COUNT)
190
402
 
403
+ # --- Section 4: OTHER LINKS ---
404
+ log("\n" + "=" * SEP_COUNT)
405
+ log(f"## Other Links - {len(other_links)} found")
406
+ log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
407
+ log("=" * SEP_COUNT)
408
+
409
+ if other_links:
410
+ for i, link in enumerate(other_links, 1):
411
+ target = link.get('url') or link.get('remote_file') or link.get('target')
412
+ link_text = link.get('link_text', 'N/A')
413
+ log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
414
+
415
+ else:
416
+ log(" No 'Other' links found.")
417
+ log("-" * SEP_COUNT)
191
418
 
192
419
  # Return the collected data for potential future JSON/other output
193
420
  report_data_dict = {
194
- "external_links": uri_links,
421
+ "external_links": external_uri_links,
195
422
  "internal_links": all_internal,
196
423
  "toc": structural_toc,
197
- "validation": {}
424
+ "validation": EMPTY_VALIDATION.copy()
198
425
  }
199
426
 
200
427
  intermediate_report_results = {
201
428
  "data": report_data_dict, # The structured JSON-ready dict
202
429
  "text": "",
203
430
  "metadata": { # Helpful for the GUI/Logs
204
- "pdf_name": Path(pdf_path).name,
431
+ "file_overview": {
432
+ "pdf_name": pdf_name,
433
+ "total_pages": total_pages,
434
+ },
205
435
  "library_used": pdf_library,
206
- "total_links": len(extracted_links)
436
+ "link_counts": {
437
+ "toc_entry_count": toc_entry_count,
438
+ "internal_goto_links_count": internal_goto_links_count,
439
+ "interal_resolve_action_links_count": interal_resolve_action_links_count,
440
+ "total_internal_links_count": total_internal_links_count,
441
+ "external_uri_links_count": external_uri_links_count,
442
+ "other_links_count": other_links_count,
443
+ "total_links_count": total_links_count
444
+ }
207
445
  }
208
446
  }
209
447
 
210
448
  log("\n--- Analysis Complete ---")
211
449
 
212
450
  validation_results = run_validation(report_results=intermediate_report_results,
213
- pdf_path=pdf_path,
214
- pdf_library=pdf_library)
215
- log(validation_results.get("summary-txt",""))
216
- report_results = intermediate_report_results
451
+ pdf_path=pdf_path)
452
+ log(validation_results.get("summary-txt",""), overview = True)
453
+
454
+ # CRITICAL: Re-assign to report_results so it's available for the final return
455
+ report_results = copy.deepcopy(intermediate_report_results)
217
456
 
457
+ # --- Offline Risk Analysis (Security Layer) ---
458
+ risk_results = compute_risk(report_results)
459
+ report_results["data"]["risk"] = risk_results
460
+
218
461
  # Final aggregation of the buffer into one string, after the last call to log()
219
462
  report_buffer_str = "\n".join(report_buffer)
463
+ report_buffer_overview_str = "\n".join(report_buffer_overview)
220
464
 
221
465
  report_results["data"]["validation"].update(validation_results)
222
466
  #report_results["text"].update(report_buffer_str) # The human-readable string
@@ -225,30 +469,43 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
225
469
  # 5. Export Report
226
470
  #if export_format:
227
471
  # # Assuming export_to will hold the output format string (e.g., "JSON")
228
- # export_report_data(report_data_dict, Path(pdf_path).name, export_format, pdf_library)
472
+ # export_report_data(report_data_dict, pdf_name, export_format, pdf_library)
229
473
 
230
474
  if print_bool:
231
- print(report_buffer_str)
475
+ if concise_print:
476
+ print(report_buffer_overview_str)
477
+ else:
478
+ print(report_buffer_str)
232
479
 
233
- # Return a clean results object
234
480
  return report_results
235
-
481
+
236
482
  except Exception as e:
237
483
  # Specific handling for common read failures
238
- if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
484
+ if True:#"invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
239
485
  log(f"\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
240
486
  log("No hyperlinks or TOC can exist in this file.")
241
487
  log("Result: No links found.")
242
488
  return {
243
- "data": {"external_links": [], "internal_links": [], "toc": []},
489
+ "data": {"external_links": [], "internal_links": [], "toc": [], "validation": EMPTY_VALIDATION.copy()},
244
490
  "text": "\n".join(report_buffer + [
245
491
  "\nWarning: PDF appears to be image-only or malformed.",
246
492
  "No hyperlinks or structural TOC found."
247
493
  ]),
248
494
  "metadata": {
249
- "pdf_name": Path(pdf_path).name,
495
+ "file_overview": {
496
+ "pdf_name": pdf_name,
497
+ "total_pages": total_pages,
498
+ },
250
499
  "library_used": pdf_library,
251
- "total_links": 0
500
+ "link_counts": {
501
+ "toc_entry_count": 0,
502
+ "internal_goto_links_count": 0,
503
+ "interal_resolve_action_links_count": 0,
504
+ "total_internal_links_count": 0,
505
+ "external_uri_links_count": 0,
506
+ "other_links_count": 0,
507
+ "total_links_count": 0
508
+ }
252
509
  }
253
510
  }
254
511
 
@@ -267,7 +524,7 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
267
524
  "external_links": [],
268
525
  "internal_links": [],
269
526
  "toc": [],
270
- "validation": {}
527
+ "validation": EMPTY_VALIDATION.copy()
271
528
  },
272
529
  "text": "\n".join(report_buffer + [
273
530
  "\n--- Analysis failed ---",
@@ -275,11 +532,229 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
275
532
  "No links or TOC extracted."
276
533
  ]),
277
534
  "metadata": {
278
- "pdf_name": Path(pdf_path).name,
535
+ "file_overview": {
536
+ "pdf_name": pdf_name,
537
+ "total_pages": total_pages,
538
+ },
279
539
  "library_used": pdf_library,
280
- "total_links": 0
540
+ "link_counts": {
541
+ "toc_entry_count": 0,
542
+ "internal_goto_links_count": 0,
543
+ "interal_resolve_action_links_count": 0,
544
+ "total_internal_links_count": 0,
545
+ "external_uri_links_count": 0,
546
+ "other_links_count": 0,
547
+ "total_links_count": 0
548
+ }
281
549
  }
282
550
  }
551
+
552
+ def _return_empty_report(report_buffer: str, pdf_library: str)-> dict:
553
+
554
+ empty_report = {
555
+ "data": {
556
+ "external_links": [],
557
+ "internal_links": [],
558
+ "toc": [],
559
+ "validation": EMPTY_VALIDATION.copy()
560
+ },
561
+ "text": "\n".join(report_buffer),
562
+ "metadata": {
563
+ "file_overview": {
564
+ "pdf_name": "null",
565
+ "total_pages": 0,
566
+ },
567
+ "library_used": pdf_library,
568
+ "link_counts": {
569
+ "toc_entry_count": 0,
570
+ "internal_goto_links_count": 0,
571
+ "interal_resolve_action_links_count": 0,
572
+ "total_internal_links_count": 0,
573
+ "external_uri_links_count": 0,
574
+ "other_links_count": 0,
575
+ "total_links_count": 0
576
+ }
577
+ }
578
+ }
579
+
580
+ return empty_report
581
+
582
+ def _generate_text_report(
583
+ pdf_path: str,
584
+ library: str,
585
+ ext_links: list,
586
+ goto_links: list,
587
+ resolve_links: list,
588
+ other_links: list,
589
+ toc: list
590
+ ) -> str:
591
+ """Pure helper to build the human-readable string for console/TXT export."""
592
+ lines = []
593
+ lines.append("\n--- Starting Analysis ... ---\n")
594
+ lines.append(f"Target file: {get_friendly_path(pdf_path)}")
595
+ lines.append(f"PDF Engine: {library}")
596
+
597
+ total_int = len(goto_links) + len(resolve_links)
598
+ total_links = len(ext_links) + total_int + len(other_links)
599
+
600
+ # 1. Summary Header
601
+ lines.append("\n" + "=" * SEP_COUNT)
602
+ lines.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
603
+ lines.append(f"Total active links: {total_links} (External: {len(ext_links)}, Internal Jumps: {total_int}, Other: {len(other_links)})")
604
+ lines.append(f"Total **structural TOC entries (bookmarks)** found: {len(toc)}")
605
+ lines.append("=" * SEP_COUNT)
606
+
607
+ # 2. Table of Contents
608
+ lines.append(get_structural_toc(toc))
609
+
610
+ # 3. Internal Jumps
611
+ lines.append("\n" + "=" * SEP_COUNT)
612
+ lines.append(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_int} found")
613
+ lines.append("=" * SEP_COUNT)
614
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
615
+ lines.append("-" * SEP_COUNT)
616
+
617
+ all_internal = goto_links + resolve_links
618
+ if all_internal:
619
+ for i, link in enumerate(all_internal, 1):
620
+ src = PageRef.from_index(link.get('page', 0)).human
621
+ dest = PageRef.from_index(link.get('destination_page', 0)).human
622
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format(
623
+ i, src, link.get('link_text', 'N/A')[:40], dest
624
+ ))
625
+ else:
626
+ lines.append(" No internal GoTo or Resolved Action links found.")
627
+ lines.append("-" * SEP_COUNT)
628
+
629
+ # 4. External URI Links
630
+ lines.append("\n" + "=" * SEP_COUNT)
631
+ lines.append(f"## Active URI Links (External) - {len(ext_links)} found")
632
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
633
+ lines.append("=" * SEP_COUNT)
634
+
635
+ if ext_links:
636
+ for i, link in enumerate(ext_links, 1):
637
+ target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
638
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format(
639
+ i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
640
+ ))
641
+ else:
642
+ lines.append(" No external links found.")
643
+ lines.append("-" * SEP_COUNT)
644
+
645
+ # 5. Other Links
646
+ lines.append("\n" + "=" * SEP_COUNT)
647
+ lines.append(f"## Other Links - {len(other_links)} found")
648
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
649
+ lines.append("=" * SEP_COUNT)
650
+
651
+ if other_links:
652
+ for i, link in enumerate(other_links, 1):
653
+ target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
654
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format(
655
+ i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
656
+ ))
657
+ else:
658
+ lines.append(" No 'Other' links found.")
659
+ lines.append("-" * SEP_COUNT)
660
+
661
+ return "\n".join(lines)
662
+
663
+ def _generate_text_report__(pdf_path, library, ext_links, int_links, other_links, toc) -> str:
664
+ lines = []
665
+ lines.append("\n--- Starting Analysis ... ---\n")
666
+ lines.append(f"Target file: {get_friendly_path(pdf_path)}")
667
+ lines.append(f"PDF Engine: {library}")
668
+
669
+ # 1. Summary Header
670
+ lines.append("\n" + "=" * SEP_COUNT)
671
+ lines.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
672
+ lines.append(f"Total active links: {len(ext_links) + len(int_links) + len(other_links)}")
673
+ lines.append(f"Total bookmarks: {len(toc)}")
674
+ lines.append("=" * SEP_COUNT)
675
+
676
+ # 2. Table of Contents
677
+ lines.append(get_structural_toc(toc))
678
+
679
+ # 3. Internal Jumps (GoTo & Resolved)
680
+ lines.append("\n" + "=" * SEP_COUNT)
681
+ lines.append(f"## Active Internal Jumps - {len(int_links)} found")
682
+ lines.append("=" * SEP_COUNT)
683
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To"))
684
+
685
+ for i, link in enumerate(int_links, 1):
686
+ src = PageRef.from_index(link.get('page', 0)).human
687
+ dest = PageRef.from_index(link.get('destination_page', 0)).human
688
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format(i, src, link.get('link_text', 'N/A')[:40], dest))
689
+
690
+ # 4. External URI Links
691
+ lines.append("\n" + "=" * SEP_COUNT)
692
+ lines.append(f"## External URI Links - {len(ext_links)} found")
693
+ lines.append("=" * SEP_COUNT)
694
+ for i, link in enumerate(ext_links, 1):
695
+ target = link.get('url') or link.get('target', 'N/A')
696
+ lines.append("{:<5} | {:<5} | {:<40} | {}".format(i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target))
697
+
698
+ return "\n".join(lines)
699
+
700
+ def _build_metadata(
701
+ pdf_name: str,
702
+ total_pages: int,
703
+ library_used: str,
704
+ toc_entry_count: int,
705
+ internal_goto_links_count: int,
706
+ interal_resolve_action_links_count: int,
707
+ external_uri_links_count: int,
708
+ other_links_count: int
709
+ ) -> Dict[str, Any]:
710
+ """
711
+ Standardizes the metadata dictionary using the EXACT legacy variable names.
712
+ """
713
+ total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
714
+ total_links_count = total_internal_links_count + external_uri_links_count + other_links_count
715
+
716
+ return {
717
+ "file_overview": {
718
+ "pdf_name": pdf_name,
719
+ "total_pages": total_pages,
720
+ },
721
+ "library_used": library_used,
722
+ "link_counts": {
723
+ "toc_entry_count": toc_entry_count,
724
+ "internal_goto_links_count": internal_goto_links_count,
725
+ "interal_resolve_action_links_count": interal_resolve_action_links_count,
726
+ "total_internal_links_count": total_internal_links_count,
727
+ "external_uri_links_count": external_uri_links_count,
728
+ "other_links_count": other_links_count,
729
+ "total_links_count": total_links_count
730
+ }
731
+ }
732
+
733
+ def _build_metadata_(
734
+ pdf_name: str,
735
+ total_pages: int,
736
+ library_used: str,
737
+ toc_count: int,
738
+ goto_count: int,
739
+ resolve_count: int,
740
+ ext_count: int,
741
+ other_count: int
742
+ ) -> Dict[str, Any]:
743
+ """Standardizes the metadata dictionary for all report types."""
744
+ return {
745
+ "file_overview": {
746
+ "pdf_name": pdf_name,
747
+ "total_pages": total_pages,
748
+ },
749
+ "library_used": library_used,
750
+ "link_counts": {
751
+ "toc_entry_count": toc_count,
752
+ "internal_links_count": goto_count,
753
+ "external_uri_links_count": ext_count,
754
+ "other_links_count": other_count,
755
+ "total_links_count": goto_count + ext_count + other_count
756
+ }
757
+ }
283
758
 
284
759
  def get_structural_toc(structural_toc: list) -> str:
285
760
  """
@@ -287,7 +762,6 @@ def get_structural_toc(structural_toc: list) -> str:
287
762
 
288
763
  Args:
289
764
  structural_toc: A list of TOC dictionaries.
290
- print_bool: Whether to print the output to the console.
291
765
 
292
766
  Returns:
293
767
  A formatted string of the structural TOC.
@@ -313,8 +787,16 @@ def get_structural_toc(structural_toc: list) -> str:
313
787
  indent = " " * 4 * (item['level'] - 1)
314
788
  # Handle cases where page might be N/A or None
315
789
  target_page = item.get('target_page', "N/A")
316
- page_str = str(target_page).rjust(page_width)
317
790
 
791
+ # Determine the human-facing string
792
+ if isinstance(target_page, int):
793
+ # Convert 0-index back to human (1-index) for the report
794
+ display_val = PageRef.from_index(target_page).human
795
+ else:
796
+ display_val = str(target_page)
797
+
798
+ page_str = str(display_val).rjust(page_width)
799
+
318
800
  lines.append(f"{indent}{item['title']} . . . page {page_str}")
319
801
 
320
802
  lines.append("-" * SEP_COUNT)
@@ -324,6 +806,26 @@ def get_structural_toc(structural_toc: list) -> str:
324
806
 
325
807
  return str_structural_toc
326
808
 
809
import unicodedata

def sanitize_glyphs_for_compatibility(text: str) -> str:
    """Replace emoji status glyphs with ASCII tags and strip non-ASCII.

    Prevents rendering bugs in viewers with poor emoji support
    (e.g. gedit under WSL2).

    Args:
        text: Arbitrary report text that may contain emoji glyphs.

    Returns:
        An ASCII-only string: known glyphs become bracketed tags
        ([PASS], [WARN], ...), accented letters are NFKD-decomposed and
        stripped of their combining marks, and any remaining non-ASCII
        code points are dropped.
    """
    # Emoji+variation-selector (U+FE0F) forms come before their bare base
    # characters so both spellings map to the same tag. Without the bare
    # entries, '⚠' would be silently dropped by the ASCII filter and 'ℹ'
    # would be NFKD-decomposed to the letter 'i'.
    glyph_mapping = {
        '✅': '[PASS]',
        '🌐': '[WEB]',
        '⚠️': '[WARN]',   # U+26A0 U+FE0F
        '⚠': '[WARN]',    # bare U+26A0
        '❌': '[FAIL]',
        'ℹ️': '[INFO]',   # U+2139 U+FE0F
        'ℹ': '[INFO]',    # bare U+2139
    }
    for glyph, replacement in glyph_mapping.items():
        text = text.replace(glyph, replacement)

    # Standard library only - no unidecode dependency. NFKD splits accented
    # letters into base + combining marks; encoding with errors='ignore'
    # then drops the marks and anything else outside ASCII, so the result
    # is pure ASCII and can be decoded as such.
    normalized = unicodedata.normalize('NFKD', text)
    return normalized.encode('ascii', 'ignore').decode('ascii')
826
+
827
+
828
+
327
829
  if __name__ == "__main__":
328
830
 
329
831
  from pdflinkcheck.io import get_first_pdf_in_cwd
@@ -334,9 +836,8 @@ if __name__ == "__main__":
334
836
  pdf_library = "pymupdf"
335
837
  else:
336
838
  pdf_library = "pypdf"
337
- report = run_report(
839
+ report = run_report_and_call_exports(
338
840
  pdf_path=pdf_path,
339
- max_links=0,
340
841
  export_format="",
341
842
  pdf_library=pdf_library,
342
843
  print_bool=True # We handle printing in validation