pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdflinkcheck/__init__.py CHANGED
@@ -0,0 +1,31 @@
1
+ # Library functions
2
+ from pdflinkcheck.analyze import run_analysis, extract_links, extract_toc
3
+
4
# For the kids. This is what I wanted when learning Python in a mysterious new REPL.
# Is this Pythonic? No. Oh well. PEP 8, PEP 20.
import os

# Opt-in switch: the GUI entry point is only exposed at package level when the
# PDFLINKCHECK_GUI_EASTEREGG environment variable holds a truthy word.
flag = os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
# NOTE: the misspelling ("pdflibkcheck") is kept on purpose so that any code
# already reading this module-level name keeps working.
pdflibkcheck_gui_lib_func_load = str(flag).strip().lower() in ('true', '1', 'yes', 'on')

# Tracks whether start_gui was *actually* imported, which is not the same as
# the flag being set (tkinter may be unavailable, or an import may fail).
_start_gui_loaded = False
if pdflibkcheck_gui_lib_func_load:
    try:
        import pyhabitat  # pyhabitat is a dependency of this package already
        if pyhabitat.tkinter_is_available():
            from pdflinkcheck.gui import start_gui
            _start_gui_loaded = True
    except ImportError:
        # Deliberate best effort: the GUI is optional, so a missing GUI
        # dependency must not break importing the library functions.
        pass

# Breadcrumbs, for stumbling upon.
__pdflinkcheck_gui_easteregg_enabled__ = pdflibkcheck_gui_lib_func_load

# Define __all__ such that the library functions are self documenting.
# Every entry must be a string naming an attribute that really exists:
# a None entry, or a name that was never imported, makes
# `from pdflinkcheck import *` raise.
__all__ = [
    "run_analysis",
    "extract_links",
    "extract_toc",
]
if _start_gui_loaded:
    __all__.append("start_gui")
pdflinkcheck/analyze.py CHANGED
@@ -1,23 +1,33 @@
1
1
  import sys
2
2
  from pathlib import Path
3
3
  import logging
4
- from typing import Dict, Any
4
+ from typing import Dict, Any, Optional
5
+ # ... other imports ...
5
6
  # Configure logging to suppress low-level PyMuPDF (fitz) messages
6
7
  logging.getLogger("fitz").setLevel(logging.ERROR)
7
8
  import fitz # PyMuPDF
8
9
 
9
10
  from pdflinkcheck.remnants import find_link_remnants
11
+ from pdflinkcheck.io import error_logger, export_report_data, LOG_FILE_PATH
10
12
 
11
13
  """
12
14
  Inspect target PDF for both URI links and for GoTo links.
13
15
  """
14
16
 
15
-
16
17
  # Helper function: Prioritize 'from'
17
18
  def get_link_rect(link_dict):
18
19
  """
19
- Retrieves the bounding box for the link using the reliable 'from' key.
20
- Returns the rect coordinates (tuple of 4 floats) or None.
20
+ Retrieves the bounding box for the link using the reliable 'from' key
21
+ provided by PyMuPDF's link dictionary.
22
+
23
+ Args:
24
+ link_dict: A dictionary representing a single link/annotation
25
+ returned by `page.get_links()`.
26
+
27
+ Returns:
28
+ A tuple of four floats (x0, y0, x1, y1) representing the
29
+ rectangular coordinates of the link on the page, or None if the
30
+ bounding box data is missing.
21
31
  """
22
32
  # 1. Use the 'from' key, which returns a fitz.Rect object or None
23
33
  rect_obj = link_dict.get('from')
@@ -30,25 +40,19 @@ def get_link_rect(link_dict):
30
40
  # 3. Fallback to None if 'from' is missing
31
41
  return None
32
42
 
33
- def get_pdf_file():
34
-
35
- example_path = f"/mnt/c/Users/george.bennett/Downloads/TE Maxson WWTF O&M Manual DRAFT - Sections 1-6 - April 2025 (3).pdf"
36
- example_path = "TE Maxson WWTF O&M Manual.pdf"
37
- print(f"example path = {example_path}")
38
- pdf_file = input(f"Paste path to PDF file (or press Enter to accept example): ")
39
- if not pdf_file:
40
- pdf_file = example_path
41
- if not Path(pdf_file).exists:
42
- print("File not found!")
43
- sys.exit(1)
44
-
45
- return pdf_file
46
-
47
-
48
43
  def get_anchor_text(page, link_rect):
49
44
  """
50
- Extracts text content using the link's bounding box.
51
- Returns the cleaned text or a placeholder if no text is found.
45
+ Extracts text content using the link's bounding box coordinates.
46
+ The bounding box is slightly expanded to ensure full characters are captured.
47
+
48
+ Args:
49
+ page: The fitz.Page object where the link is located.
50
+ link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
51
+ link's bounding box.
52
+
53
+ Returns:
54
+ The cleaned, extracted text string, or a placeholder message
55
+ if no text is found or if an error occurs.
52
56
  """
53
57
  if not link_rect:
54
58
  return "N/A: Missing Rect"
@@ -90,7 +94,15 @@ def get_anchor_text(page, link_rect):
90
94
 
91
95
  def analyze_toc_fitz(doc):
92
96
  """
93
- Extracts the structured Table of Contents (bookmarks/outline) from the PDF.
97
+ Extracts the structural Table of Contents (PDF Bookmarks/Outline)
98
+ from the PDF document using PyMuPDF's built-in functionality.
99
+
100
+ Args:
101
+ doc: The open fitz.Document object.
102
+
103
+ Returns:
104
+ A list of dictionaries, where each dictionary represents a TOC entry
105
+ with 'level', 'title', and 'target_page' (1-indexed).
94
106
  """
95
107
  toc = doc.get_toc()
96
108
  toc_data = []
@@ -107,12 +119,68 @@ def analyze_toc_fitz(doc):
107
119
 
108
120
 
109
121
  # 2. Updated Main Inspection Function to Include Text Extraction
110
- def inspect_pdf_hyperlinks_fitz(pdf_path):
111
- links_data = []
122
+ #def inspect_pdf_hyperlinks_fitz(pdf_path):
123
def extract_toc(pdf_path):
    """
    Opens a PDF and extracts the structural table of contents (TOC/bookmarks).

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries representing the structural TOC/bookmarks,
        as produced by `analyze_toc_fitz`. An empty list is returned when the
        document cannot be opened or the TOC cannot be read; the error is
        reported on stderr instead of being raised.
    """
    # Start from an empty result so the failure path has something to return
    # (previously the name was unbound if opening the document failed).
    structural_toc = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        structural_toc = analyze_toc_fitz(doc)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Release the file handle whether or not extraction succeeded.
        if doc is not None:
            doc.close()
    return structural_toc
139
+
140
+
141
def serialize_fitz_object(obj):
    """
    Converts a fitz geometry object (Point, Rect/IRect, Matrix) into a
    JSON-serializable value.

    Meant to avoid export failures such as
    'Object of type Point is not JSON serializable'.

    Detection is by attribute (duck typing), so no fitz import is needed:
      * x, y (and no x0)      -> (x, y)
      * x0, y0, x1, y1        -> (x0, y0, x1, y1)
      * a, b, c, d, e, f      -> (a, b, c, d, e, f)

    Lists, tuples and dicts are converted element by element so geometry
    objects nested inside them are cleaned as well.

    Args:
        obj: Any value taken from a link dictionary.

    Returns:
        None for None; a tuple of coordinates for recognised geometry
        objects; a converted copy for list/tuple/dict; the value itself for
        str/int/float/bool; otherwise `str(obj)` as a last resort.
    """
    if obj is None:
        return None

    # Containers: recurse so nested geometry objects do not slip through.
    if isinstance(obj, dict):
        return {key: serialize_fitz_object(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [serialize_fitz_object(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(serialize_fitz_object(item) for item in obj)

    # Already-primitive scalars pass through untouched.
    if isinstance(obj, (str, int, float, bool)):
        return obj

    # 1. Point-like: has x and y, but is not a rectangle.
    if hasattr(obj, 'x') and hasattr(obj, 'y') and not hasattr(obj, 'x0'):
        return (obj.x, obj.y)

    # 2. Rect-like: require every attribute that is read below, so a partial
    #    look-alike falls through to str() instead of raising AttributeError.
    if all(hasattr(obj, name) for name in ('x0', 'y0', 'x1', 'y1')):
        return (obj.x0, obj.y0, obj.x1, obj.y1)

    # 3. Matrix-like: same rule, all six coefficients must be present.
    if all(hasattr(obj, name) for name in ('a', 'b', 'c', 'd', 'e', 'f')):
        return (obj.a, obj.b, obj.c, obj.d, obj.e, obj.f)

    # 4. Fallback: anything else is stringified so the report can still be written.
    return str(obj)
167
+
168
+ def extract_links(pdf_path):
169
+ """
170
+ Opens a PDF, iterates through all pages and extracts all link annotations.
171
+ It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
172
+
173
+ Args:
174
+ pdf_path: The file system path (str) to the target PDF document.
175
+
176
+ Returns:
177
+ A list of dictionaries, where each dictionary is a comprehensive
178
+ representation of an active hyperlink found in the PDF.
179
+
180
+ """
181
+ links_data = []
182
+ try:
183
+ doc = fitz.open(pdf_path)
116
184
 
117
185
  for page_num in range(doc.page_count):
118
186
  page = doc.load_page(page_num)
@@ -143,13 +211,22 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
143
211
 
144
212
 
145
213
  link_dict = {
146
- 'page': int(page_num) + 1,
214
+ 'page': int(page_num) + 1, # accurate for link location, add 1
147
215
  'rect': link_rect,
148
216
  'link_text': anchor_text,
149
217
  'xref':xref
150
218
  }
151
219
 
152
-
220
+ # A. Clean Geom. Objects: Use the helper function on 'to' / 'destination'
221
+ # Use the clean serialize_fitz_object() helper function on all keys that might contain objects
222
+ destination_view = serialize_fitz_object(link.get('to'))
223
+
224
+ # B. Correct Internal Link Page Numbering (The -1 correction hack)
225
+ # This will be skipped by URI, which is not expected to have a page key
226
+ target_page_num_reported = "N/A"
227
+ if link.get('page') is not None:
228
+ target_page_num_reported = int(link.get('page')) # accurate for link target, don't add 1 (weird)
229
+
153
230
  if link['kind'] == fitz.LINK_URI:
154
231
  target = link.get('uri', 'URI (Unknown Target)')
155
232
  link_dict.update({
@@ -159,12 +236,11 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
159
236
  })
160
237
 
161
238
  elif link['kind'] == fitz.LINK_GOTO:
162
- target_page_num = link.get('page') + 1 # fitz pages are 0-indexed
163
- target = f"Page {target_page_num}"
239
+ target = f"Page {target_page_num_reported}"
164
240
  link_dict.update({
165
241
  'type': 'Internal (GoTo/Dest)',
166
- 'destination_page': int(link.get('page')) + 1,
167
- 'destination_view': link.get('to'),
242
+ 'destination_page': target_page_num_reported,
243
+ 'destination_view': destination_view,
168
244
  'target': target
169
245
  })
170
246
 
@@ -172,15 +248,17 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
172
248
  link_dict.update({
173
249
  'type': 'Remote (GoToR)',
174
250
  'remote_file': link.get('file'),
175
- 'destination': link.get('to')
251
+ 'destination': destination_view
176
252
  })
177
253
 
178
254
  elif link.get('page') is not None and link['kind'] != fitz.LINK_GOTO:
255
+ target = f"Page {target_page_num_reported}"
179
256
  link_dict.update({
180
257
  'type': 'Internal (Resolved Action)',
181
- 'destination_page': int(link.get('page')) + 1,
182
- 'destination_view': link.get('to'),
183
- 'source_kind': link.get('kind')
258
+ 'destination_page': target_page_num_reported,
259
+ 'destination_view': destination_view,
260
+ 'source_kind': link.get('kind'),
261
+ 'target': target
184
262
  })
185
263
 
186
264
  else:
@@ -190,20 +268,32 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
190
268
  'action_kind': link.get('kind'),
191
269
  'target': target
192
270
  })
271
+
272
+ ## --- General Serialization Cleaner ---
273
+ #for key, value in link_dict.items():
274
+ # if hasattr(value, 'rect') and hasattr(value, 'point'):
275
+ # # This handles Rect and Point objects that may slip through
276
+ # link_dict[key] = str(value)
277
+ ## --- End Cleaner ---
193
278
 
194
279
  links_data.append(link_dict)
195
280
 
196
281
  doc.close()
197
282
  except Exception as e:
198
283
  print(f"An error occurred: {e}", file=sys.stderr)
199
- return links_data, structural_toc
284
+ return links_data
200
285
 
201
286
  def print_structural_toc(structural_toc):
202
287
  """
203
- Prints the structural TOC data in a clean, hierarchical, and readable format.
288
+ Prints the structural TOC data (bookmarks/outline) in a clean,
289
+ hierarchical, and readable console format.
290
+
291
+ Args:
292
+ structural_toc: A list of TOC dictionaries returned by `analyze_toc_fitz`.
204
293
  """
205
- print("\n## 📚 Structural Table of Contents (PDF Bookmarks/Outline)")
206
- print("-" * 50)
294
+ print("\n" + "=" * 70)
295
+ print("## Structural Table of Contents (PDF Bookmarks/Outline)")
296
+ print("=" * 70)
207
297
  if not structural_toc:
208
298
  print("No structural TOC (bookmarks/outline) found.")
209
299
  return
@@ -220,108 +310,196 @@ def print_structural_toc(structural_toc):
220
310
  page_str = str(item['target_page']).rjust(page_width)
221
311
  print(f"{indent}{item['title']} . . . page {page_str}")
222
312
 
223
- print("-" * 50)
313
+ print("-" * 70)
224
314
 
225
- def run_analysis(pdf_path: str, check_remnants: bool, max_links: int) -> Dict[str, Any]:
226
- """
227
- Core PDF analysis logic using PyMuPDF. Extracts links, remnants, and TOC.
228
- The printing is done inside this function.
229
- max_links: If <= 0, all links will be displayed.
315
+
316
def get_first_pdf_in_cwd() -> Optional[str]:
    """
    Scans the current working directory (CWD) for a file ending with a
    '.pdf' extension (case-insensitive) and returns the first one in
    alphabetical order.

    This is intended as a convenience function for running the tool
    without explicitly specifying a path. Only files directly inside the
    CWD are considered; sub-directories are not searched.

    Returns:
        The absolute path (as a string) to the first PDF file found,
        or None if no PDF files are present in the CWD or the directory
        could not be read (the error is reported on stderr).
    """
    try:
        # The suffix is compared in lower case rather than globbing, because
        # glob patterns are case-sensitive on Unix and would miss '*.PDF'.
        # Directory iteration order is arbitrary, so candidates are sorted by
        # name to make "first" deterministic across runs and platforms.
        candidates = sorted(
            (
                entry for entry in Path.cwd().iterdir()
                if entry.is_file() and entry.suffix.lower() == '.pdf'
            ),
            key=lambda entry: (entry.name.lower(), entry.name),
        )
        if not candidates:
            return None
        return str(candidates[0].resolve())
    except (OSError, RuntimeError) as e:
        # OSError: permission problems / vanished directory.
        # RuntimeError: symlink loop reported by resolve() on older Pythons.
        print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
        return None
358
+
359
def run_analysis(pdf_path: Optional[str] = None, check_remnants: bool = True, max_links: int = 0, export_format: Optional[str] = "JSON") -> Dict[str, Any]:
    """
    Core high-level PDF link analysis logic.

    This function orchestrates the extraction of active links and TOC
    using PyMuPDF, finds link remnants (plain text URLs/emails), prints a
    comprehensive, user-friendly report to the console and optionally
    exports the collected data.

    Args:
        pdf_path: The file system path (str) to the target PDF document.
            When None, the first PDF found in the current working
            directory is used.
        check_remnants: Boolean flag to enable/disable scanning for plain text
            links that are not active hyperlinks.
        max_links: Maximum number of links/remnants to display in each console
            section. If <= 0 (or None), all links will be displayed.
        export_format: Format name handed to `export_report_data`
            (e.g. "JSON"). A falsy value skips the export step.

    Returns:
        A dictionary containing the structured results of the analysis:
        'external_links', 'internal_links', 'remnants', and 'toc'.
        An empty dictionary is returned when no PDF could be located or
        when the document contains nothing to report.

    Raises:
        Exception: Any failure during analysis or export is logged through
            `error_logger` and then re-raised unchanged.
    """
    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
    if pdf_path is None:
        print("pdf_path is None")
        print("Tip: Drop a PDF in the current folder or pass in a path arg.")
        # Same empty-result shape as the "nothing found" case below, so the
        # declared Dict return type always holds.
        return {}

    try:
        pdf_name = Path(pdf_path).name
        print(f"Running PyMuPDF analysis on {pdf_name}...")

        # 1. Extract all active links and TOC
        extracted_links = extract_links(pdf_path)
        structural_toc = extract_toc(pdf_path)
        toc_entry_count = len(structural_toc)

        # 2. Find link remnants (active links are passed in so they are excluded)
        remnants = find_link_remnants(pdf_path, extracted_links) if check_remnants else []

        if not extracted_links and not remnants and not structural_toc:
            print(f"\nNo hyperlinks, remnants, or structural TOC found in {pdf_name}.")
            return {}

        # 3. Separate the lists based on the 'type' key
        internal_types = ('Internal (GoTo/Dest)', 'Internal (Resolved Action)')
        uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
        goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
        resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
        other_links = [
            link for link in extracted_links
            if link['type'] != 'External (URI)' and link['type'] not in internal_types
        ]

        uri_and_other = uri_links + other_links
        all_internal = goto_links + resolved_action_links
        total_internal_links = len(all_internal)

        # None means "no cap"; slicing with [:None] yields the whole list.
        limit = max_links if max_links and max_links > 0 else None

        heavy_rule = "=" * 70
        light_rule = "-" * 70
        banner_rule = "✪" * 70
        row = "{:<5} | {:<5} | {:<40} | {}"

        # --- ANALYSIS SUMMARY ---
        print("\n" + banner_rule)
        print(f"--- Link Analysis Results for {pdf_name} ---")
        print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
        print(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
        print(f"Total **potential missing links** found: {len(remnants)}")
        # Closing rule mirrors the opening banner (an empty string repeated
        # 70 times printed nothing).
        print(banner_rule)

        # --- Section 1: ACTIVE URI LINKS ---
        print("\n" + heavy_rule)
        print(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
        print(row.format("Idx", "Page", "Anchor Text", "Target URI/Action"))
        print(heavy_rule)

        if uri_and_other:
            for i, link in enumerate(uri_and_other[:limit], 1):
                target = link.get('url') or link.get('remote_file') or link.get('target')
                link_text = link.get('link_text', 'N/A')
                print(row.format(i, link['page'], link_text[:40], target))
            if limit is not None and len(uri_and_other) > limit:
                print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
        else:
            print(" No external or 'Other' links found.")

        # --- Section 2: ACTIVE INTERNAL JUMPS ---
        print("\n" + heavy_rule)
        print(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
        print(heavy_rule)
        print(row.format("Idx", "Page", "Anchor Text", "Jumps To Page"))
        print(light_rule)

        if all_internal:
            for i, link in enumerate(all_internal[:limit], 1):
                link_text = link.get('link_text', 'N/A')
                print(row.format(i, link['page'], link_text[:40], link['destination_page']))
            if limit is not None and len(all_internal) > limit:
                print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
        else:
            print(" No internal GoTo or Resolved Action links found.")

        # --- Section 3: REMNANTS ---
        print("\n" + heavy_rule)
        print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
        print(heavy_rule)

        if remnants:
            print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
            print(light_rule)
            for i, remnant in enumerate(remnants[:limit], 1):
                print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
            # Use the same cap as the slice above, so a negative max_links
            # (meaning "show all") never reports a phantom overflow.
            if limit is not None and len(remnants) > limit:
                print(f"... and {len(remnants) - limit} more remnants (use --max-links to see all).")
        else:
            print(" No URI or Email remnants found that are not already active links.")

        # --- Section 4: TOC ---
        print_structural_toc(structural_toc)

        # Collected data, returned to the caller and optionally exported.
        final_report_data = {
            "external_links": uri_links,
            "internal_links": all_internal,
            "remnants": remnants,
            "toc": structural_toc
        }

        # 5. Export Report
        if export_format:
            export_report_data(final_report_data, pdf_name, export_format)

        return final_report_data
    except Exception as e:
        # Log the critical failure, tell the user where to look, then let the
        # exception propagate so callers can react to it.
        error_logger.error(f"Critical failure during run_analysis for {pdf_path}: {e}", exc_info=True)
        print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
        raise
494
+
495
+
323
496
 
324
497
def call_stable():
    """
    Minimal entry point: prints a start message, calls `run_analysis()` with
    all of its default arguments, then prints a completion message.

    The value returned by `run_analysis()` is discarded.
    """
    print("Begin analysis...")
    run_analysis()
    print("Analysis complete.")