pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdflinkcheck/__init__.py CHANGED
@@ -1,31 +1,69 @@
1
+ # src/pdflinkcheck/__init__.py
2
+ """
3
+ # License information
4
+ pdflinkcheck - A PDF Link Checker
5
+
6
+ Copyright (C) 2025 George Clayton Bennett
7
+
8
+ Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
9
+
10
+ This program is free software: You can redistribute it and/or modify
11
+ it under the terms of the GNU Affero General Public License as
12
+ published by the Free Software Foundation, either version 3 of the
13
+ License, or (at your option) any later version.
14
+
15
+ The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
16
+ """
17
+ import os as _os
18
+
1
19
  # Library functions
2
- from pdflinkcheck.analyze import run_analysis, extract_links, extract_toc
20
+ from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
21
+ from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
22
+ #from pdflinkcheck import analyze_pypdf
23
+ from pdflinkcheck.report import run_report
24
+ from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previous versions
25
+ #from pdflinkcheck import dev
3
26
 
4
27
  # For the kids. This is what I wanted when learning Python in a mysterious new REPL.
5
28
  # Is this Pythonic? No. Oh well. PEP 8, PEP 20.
6
- import os
7
- flag = os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
8
- pdflibkcheck_gui_lib_func_load = str(flag).strip().lower() in ('true', '1', 'yes', 'on')
9
-
10
- if pdflibkcheck_gui_lib_func_load:
29
+ # Why is this not Pythonic? Devs expect no side effects when importing library functions.
30
+ # What is a side effect?
31
+ _gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
32
+ _load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
33
+ if _load_gui_func:
11
34
  try:
12
- import pyhabitat # pyhabitat is a dependency of this package already
13
- if pyhabitat.tkinter_is_available():
35
+ import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
36
+ if _pyhabitat.tkinter_is_available():
14
37
  from pdflinkcheck.gui import start_gui
15
38
  except ImportError:
16
39
  # Optional: log or ignore silently
17
- pass
40
+ print("start_gui() not imported")
18
41
 
19
42
  # Breadcrumbs, for stumbling upon.
20
- if pdflibkcheck_gui_lib_func_load:
43
+ if _load_gui_func:
21
44
  __pdflinkcheck_gui_easteregg_enabled__ = True
22
45
  else:
23
46
  __pdflinkcheck_gui_easteregg_enabled__ = False
24
47
 
25
48
  # Define __all__ such that the library functions are self documenting.
26
49
  __all__ = [
50
+ "run_report",
27
51
  "run_analysis",
28
- "extract_links",
29
- "extract_toc",
30
- "start_gui" if pdflibkcheck_gui_lib_func_load else None,
52
+ "extract_links_pymupdf",
53
+ "extract_toc_pymupdf",
54
+ "extract_links_pypdf",
55
+ "extract_toc_pypdf",
56
+ #"start_gui" if _load_gui_func else None,
57
+ #"dev",
31
58
  ]
59
+ if _load_gui_func:
60
+ __all__.append("start_gui")
61
+
62
+ # 4. THE CLEANUP (This removes items from dir())
63
+ del _os
64
+ del _gui_easteregg_env_flag
65
+ del _load_gui_func
66
+
67
+ # Keep 'io' from appearing in dir(): it is likely pulled into this namespace because a package imported here imports it:
68
+ #if "io" in locals():
69
+ # del io
@@ -1,14 +1,17 @@
1
1
  import sys
2
2
  from pathlib import Path
3
3
  import logging
4
- from typing import Dict, Any, Optional
5
- # ... other imports ...
6
- # Configure logging to suppress low-level pdfminer messages
4
+ from typing import Dict, Any, Optional, List
5
+
7
6
  logging.getLogger("fitz").setLevel(logging.ERROR)
8
- import fitz # PyMuPDF
9
7
 
10
- from pdflinkcheck.remnants import find_link_remnants
11
- from pdflinkcheck.io import error_logger, export_report_data, LOG_FILE_PATH
8
+ try:
9
+ import fitz # PyMuPDF
10
+ except ImportError:
11
+ fitz = None
12
+
13
+ from pdflinkcheck.report import run_report
14
+ #from pdflinkcheck.validate import run_validation
12
15
 
13
16
  """
14
17
  Inspect target PDF for both URI links and for GoTo links.
@@ -41,6 +44,44 @@ def get_link_rect(link_dict):
41
44
  return None
42
45
 
43
46
  def get_anchor_text(page, link_rect):
47
+ if not link_rect:
48
+ return "N/A: Missing Rect"
49
+
50
+ try:
51
+ # 1. Convert to fitz.Rect and normalize
52
+ rect = fitz.Rect(link_rect)
53
+ if rect.is_empty:
54
+ return "N/A: Rect Error"
55
+
56
+ # 2. Use asymmetric expansion (similar to the pypdf logic)
57
+ # 10 points horizontal to catch wide characters/kerning
58
+ # 3 points vertical to stay within the line
59
+ search_rect = fitz.Rect(
60
+ rect.x0 - 10,
61
+ rect.y0 - 3,
62
+ rect.x1 + 10,
63
+ rect.y1 + 3
64
+ )
65
+
66
+ # 3. Extract all words on the page
67
+ # Each word is: (x0, y0, x1, y1, "text", block_no, line_no, word_no)
68
+ words = page.get_text("words")
69
+
70
+ anchor_parts = []
71
+ for w in words:
72
+ word_rect = fitz.Rect(w[:4])
73
+ # Check if the word intersects our expanded link rectangle
74
+ if word_rect.intersects(search_rect):
75
+ anchor_parts.append(w[4])
76
+
77
+ cleaned_text = " ".join(anchor_parts).strip()
78
+
79
+ return cleaned_text if cleaned_text else "N/A: No Visible Text"
80
+
81
+ except Exception:
82
+ return "N/A: Rect Error"
83
+
84
+ def get_anchor_text_stable(page, link_rect):
44
85
  """
45
86
  Extracts text content using the link's bounding box coordinates.
46
87
  The bounding box is slightly expanded to ensure full characters are captured.
@@ -91,7 +132,6 @@ def get_anchor_text(page, link_rect):
91
132
  # Fallback for unexpected errors in rect conversion or retrieval
92
133
  return "N/A: Rect Error"
93
134
 
94
-
95
135
  def analyze_toc_fitz(doc):
96
136
  """
97
137
  Extracts the structural Table of Contents (PDF Bookmarks/Outline)
@@ -120,7 +160,7 @@ def analyze_toc_fitz(doc):
120
160
 
121
161
  # 2. Updated Main Inspection Function to Include Text Extraction
122
162
  #def inspect_pdf_hyperlinks_fitz(pdf_path):
123
- def extract_toc(pdf_path):
163
+ def extract_toc_pymupdf(pdf_path):
124
164
  """
125
165
  Opens a PDF, iterates through all pages and extracts the structural table of contents (TOC/bookmarks).
126
166
 
@@ -165,7 +205,8 @@ def serialize_fitz_object(obj):
165
205
  # Otherwise, return the object as is (it's already primitive)
166
206
  return obj
167
207
 
168
- def extract_links(pdf_path):
208
+
209
+ def extract_links_pymupdf(pdf_path):
169
210
  """
170
211
  Opens a PDF, iterates through all pages and extracts all link annotations.
171
212
  It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
@@ -225,7 +266,7 @@ def extract_links(pdf_path):
225
266
  # This will be skipped by URI, which is not expected to have a page key
226
267
  target_page_num_reported = "N/A"
227
268
  if link.get('page') is not None:
228
- target_page_num_reported = int(link.get('page')) # accurate for link target, don't add 1 (weird)
269
+ target_page_num_reported = int(link.get('page'))+1 # link target page is 0-based; add 1 to report a 1-based page number
229
270
 
230
271
  if link['kind'] == fitz.LINK_URI:
231
272
  target = link.get('uri', 'URI (Unknown Target)')
@@ -283,226 +324,15 @@ def extract_links(pdf_path):
283
324
  print(f"An error occurred: {e}", file=sys.stderr)
284
325
  return links_data
285
326
 
286
- def print_structural_toc(structural_toc):
287
- """
288
- Prints the structural TOC data (bookmarks/outline) in a clean,
289
- hierarchical, and readable console format.
290
-
291
- Args:
292
- structural_toc: A list of TOC dictionaries returned by `analyze_toc_fitz`.
293
- """
294
- print("\n" + "=" * 70)
295
- print("## Structural Table of Contents (PDF Bookmarks/Outline)")
296
- print("=" * 70)
297
- if not structural_toc:
298
- print("No structural TOC (bookmarks/outline) found.")
299
- return
300
-
301
- # Determine max page width for consistent alignment (optional but nice)
302
- max_page = max(item['target_page'] for item in structural_toc) if structural_toc else 1
303
- page_width = len(str(max_page))
304
-
305
- # Iterate and format
306
- for item in structural_toc:
307
- # Use level for indentation (e.g., Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces)
308
- indent = " " * 4 * (item['level'] - 1)
309
- # Format the title and target page number
310
- page_str = str(item['target_page']).rjust(page_width)
311
- print(f"{indent}{item['title']} . . . page {page_str}")
312
-
313
- print("-" * 70)
314
-
315
-
316
- def get_first_pdf_in_cwd() -> Optional[str]:
317
- """
318
- Scans the current working directory (CWD) for the first file ending
319
- with a '.pdf' extension (case-insensitive).
320
-
321
- This is intended as a convenience function for running the tool
322
- without explicitly specifying a path.
323
-
324
- Returns:
325
- The absolute path (as a string) to the first PDF file found,
326
- or None if no PDF files are present in the CWD.
327
- """
328
- # 1. Get the current working directory (CWD)
329
- cwd = Path.cwd()
330
-
331
- # 2. Use Path.glob to find files matching the pattern.
332
- # We use '**/*.pdf' to also search nested directories if desired,
333
- # but typically for a single PDF in CWD, '*.pdf' is enough.
334
- # Let's stick to files directly in the CWD for simplicity.
335
-
336
- # We use list comprehension with next() for efficiency, or a simple loop.
337
- # Using Path.glob('*.pdf') to search the CWD for files ending in .pdf
338
- # We make it case-insensitive by checking both '*.pdf' and '*.PDF'
339
-
340
- # Note: On Unix systems, glob is case-sensitive by default.
341
- # The most cross-platform safe way is to iterate and check the suffix.
342
-
343
- try:
344
- # Check for files in the current directory only
345
- # Iterating over the generator stops as soon as the first match is found.
346
- first_pdf_path = next(
347
- p.resolve() for p in cwd.iterdir()
348
- if p.is_file() and p.suffix.lower() == '.pdf'
349
- )
350
- return str(first_pdf_path)
351
- except StopIteration:
352
- # If the generator runs out of items, no PDF was found
353
- return None
354
- except Exception as e:
355
- # Handle potential permissions errors or other issues
356
- print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
357
- return None
358
-
359
- def run_analysis(pdf_path: str = None, check_remnants: bool = True, max_links: int = 0, export_format: Optional[str] = "JSON") -> Dict[str, Any]:
360
- """
361
- Core high-level PDF link analysis logic.
362
-
363
- This function orchestrates the extraction of active links and TOC
364
- using PyMuPDF, finds link remnants (plain text URLs/emails), and
365
- prints a comprehensive, user-friendly report to the console.
366
-
367
- Args:
368
- pdf_path: The file system path (str) to the target PDF document.
369
- check_remnants: Boolean flag to enable/disable scanning for plain text
370
- links that are not active hyperlinks.
371
- max_links: Maximum number of links/remnants to display in each console
372
- section. If <= 0, all links will be displayed.
373
-
374
- Returns:
375
- A dictionary containing the structured results of the analysis:
376
- 'external_links', 'internal_links', 'remnants', and 'toc'.
377
- """
378
-
379
- if pdf_path is None:
380
- pdf_path = get_first_pdf_in_cwd()
381
- if pdf_path is None:
382
- print("pdf_path is None")
383
- print("Tip: Drop a PDF in the current folder or pass in a path arg.")
384
- return
385
- try:
386
- print(f"Running PyMuPDF analysis on {Path(pdf_path).name}...")
387
-
388
- # 1. Extract all active links and TOC
389
- extracted_links = extract_links(pdf_path)
390
- structural_toc = extract_toc(pdf_path)
391
- toc_entry_count = len(structural_toc)
392
-
393
- # 2. Find link remnants
394
- remnants = []
395
- if check_remnants:
396
- remnants = find_link_remnants(pdf_path, extracted_links) # Pass active links to exclude them
397
-
398
- if not extracted_links and not remnants and not structural_toc:
399
- print(f"\nNo hyperlinks, remnants, or structural TOC found in {Path(pdf_path).name}.")
400
- return {}
401
-
402
- # 3. Separate the lists based on the 'type' key
403
- uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
404
- goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
405
- resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
406
- other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
407
-
408
- total_internal_links = len(goto_links) + len(resolved_action_links)
409
-
410
- # --- ANALYSIS SUMMARY (Using your print logic) ---
411
- print("\n" + "✪" * 70)
412
- print(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
413
- print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
414
- print(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
415
- print(f"Total **potential missing links** found: {len(remnants)}")
416
- print("✪" * 70)
417
-
418
- limit = max_links if max_links > 0 else None
419
-
420
- uri_and_other = uri_links + other_links
421
-
422
- # --- Section 1: ACTIVE URI LINKS ---
423
- print("\n" + "=" * 70)
424
- print(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
425
- print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
426
- print("=" * 70)
427
-
428
- if uri_and_other:
429
- for i, link in enumerate(uri_and_other[:limit], 1):
430
- target = link.get('url') or link.get('remote_file') or link.get('target')
431
- link_text = link.get('link_text', 'N/A')
432
- print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
433
- if limit is not None and len(uri_and_other) > limit:
434
- print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
435
-
436
- else:
437
- print(" No external or 'Other' links found.")
438
-
439
- # --- Section 2: ACTIVE INTERNAL JUMPS ---
440
- print("\n" + "=" * 70)
441
- print(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
442
- print("=" * 70)
443
- print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
444
- print("-" * 70)
445
-
446
- all_internal = goto_links + resolved_action_links
447
- if total_internal_links > 0:
448
- for i, link in enumerate(all_internal[:limit], 1):
449
- link_text = link.get('link_text', 'N/A')
450
- print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
451
-
452
- if limit is not None and len(all_internal) > limit:
453
- print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
454
- else:
455
- print(" No internal GoTo or Resolved Action links found.")
456
-
457
- # --- Section 3: REMNANTS ---
458
- print("\n" + "=" * 70)
459
- print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
460
- print("=" * 70)
461
-
462
- if remnants:
463
- print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
464
- print("-" * 70)
465
- for i, remnant in enumerate(remnants[:limit], 1):
466
- print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
467
- if max_links!=0 and len(remnants) > max_links:
468
- print(f"... and {len(remnants) - max_links} more remnants (use --max-links to see all).")
469
- else:
470
- print(" No URI or Email remnants found that are not already active links.")
471
-
472
- # --- Section 4: TOC ---
473
- print_structural_toc(structural_toc)
474
-
475
- # Return the collected data for potential future JSON/other output
476
- final_report_data = {
477
- "external_links": uri_links,
478
- "internal_links": all_internal,
479
- "remnants": remnants,
480
- "toc": structural_toc
481
- }
482
-
483
- # 5. Export Report
484
- if export_format:
485
- # Assuming export_to will hold the output format string (e.g., "JSON")
486
- export_report_data(final_report_data, Path(pdf_path).name, export_format)
487
-
488
- return final_report_data
489
- except Exception as e:
490
- # Log the critical failure
491
- error_logger.error(f"Critical failure during run_analysis for {pdf_path}: {e}", exc_info=True)
492
- print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
493
- raise # Allow the exception to propagate or handle gracefully
494
-
495
-
496
327
 
497
328
  def call_stable():
498
329
  """
499
330
  Placeholder function for command-line execution (e.g., in __main__).
500
331
  Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
501
- passing them as arguments to run_analysis.
332
+ passing them as arguments to run_report.
502
333
  """
503
- print("Begin analysis...")
504
- run_analysis()
505
- print("Analysis complete.")
334
+ run_report(pdf_library = "pymupdf")
335
+ #run_validation(pdf_library = "pymupdf")
506
336
 
507
337
  if __name__ == "__main__":
508
338
  call_stable()
@@ -0,0 +1,184 @@
1
+ # src/pdflinkcheck/analyze_pypdf.py
2
+ import sys
3
+ from pathlib import Path
4
+ import logging
5
+ from typing import Dict, Any, Optional, List
6
+
7
+ from pypdf import PdfReader
8
+ from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
9
+
10
+
11
+ from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
12
+ from pdflinkcheck.report import run_report
13
+ #from pdflinkcheck.validate import run_validation
14
+
15
+ """
16
+ Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
17
+ """
18
+
19
+ def get_anchor_text_pypdf(page, rect) -> str:
20
+ """
21
+ Extracts text within the link's bounding box using a visitor function.
22
+ Reliable for finding text associated with a link without PyMuPDF.
23
+ """
24
+ if not rect:
25
+ return "N/A: Missing Rect"
26
+
27
+ # Standardize rect orientation (pypdf Rects are [x0, y0, x1, y1])
28
+ # Note: PDF coordinates use bottom-left as (0,0)
29
+ x_min = min(rect[0], rect[2])
30
+ y_min = min(rect[1], rect[3])
31
+ x_max = max(rect[0], rect[2])
32
+ y_max = max(rect[1], rect[3])
33
+
34
+ parts: List[str] = []
35
+
36
+ def visitor_body(text, cm, tm, font_dict, font_size):
37
+ # tm[4], tm[5] are the current text insertion point coordinates (x, y)
38
+ x, y = tm[4], tm[5]
39
+
40
+ # Using a threshold to account for font metrics/descenders
41
+ # Generous tolerance (±10 pt) to catch descenders, ascenders, kerning, and minor misalignments
42
+ tolerance = 10
43
+ if (x_min - tolerance) <= x <= (x_max + tolerance) and (y_min - tolerance) <= y <= (y_max + tolerance):
44
+ if text.strip():
45
+ parts.append(text)
46
+
47
+ page.extract_text(visitor_text=visitor_body)
48
+
49
+ raw_extracted = "".join(parts)
50
+ cleaned = " ".join(raw_extracted.split()).strip()
51
+
52
+ return cleaned if cleaned else "Graphic/Empty Link"
53
+
54
+ def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
55
+ """
56
+ Resolves a Destination object or IndirectObject to a 1-based page number string.
57
+ """
58
+ try:
59
+ if isinstance(dest, Destination):
60
+ return str(dest.page_number + 1)
61
+
62
+ if isinstance(dest, IndirectObject):
63
+ return str(obj_id_to_page.get(dest.idnum, "Unknown"))
64
+
65
+ if isinstance(dest, ArrayObject) and len(dest) > 0:
66
+ if isinstance(dest[0], IndirectObject):
67
+ return str(obj_id_to_page.get(dest[0].idnum, "Unknown"))
68
+
69
+ return "Unknown"
70
+ except Exception:
71
+ return "Error Resolving"
72
+
73
+ def extract_links_pypdf(pdf_path):
74
+ """
75
+ Termux-compatible link extraction using pure-Python pypdf.
76
+ Matches the reporting schema of the PyMuPDF version.
77
+ """
78
+ reader = PdfReader(pdf_path)
79
+
80
+ # Pre-map Object IDs to Page Numbers for fast internal link resolution
81
+ obj_id_to_page = {
82
+ page.indirect_reference.idnum: i + 1
83
+ for i, page in enumerate(reader.pages)
84
+ }
85
+
86
+ all_links = []
87
+
88
+ for i, page in enumerate(reader.pages):
89
+ page_num = i + 1
90
+ if "/Annots" not in page:
91
+ continue
92
+
93
+ for annot in page["/Annots"]:
94
+ obj = annot.get_object()
95
+ if obj.get("/Subtype") != "/Link":
96
+ continue
97
+
98
+ rect = obj.get("/Rect")
99
+ anchor_text = get_anchor_text_pypdf(page, rect)
100
+
101
+ link_dict = {
102
+ 'page': page_num,
103
+ 'rect': list(rect) if rect else None,
104
+ 'link_text': anchor_text,
105
+ 'type': 'Other Action',
106
+ 'target': 'Unknown'
107
+ }
108
+
109
+ # Handle URI (External)
110
+ if "/A" in obj and "/URI" in obj["/A"]:
111
+ uri = obj["/A"]["/URI"]
112
+ link_dict.update({
113
+ 'type': 'External (URI)',
114
+ 'url': uri,
115
+ 'target': uri
116
+ })
117
+
118
+ # Handle GoTo (Internal)
119
+ elif "/Dest" in obj or ("/A" in obj and "/D" in obj["/A"]):
120
+ dest = obj.get("/Dest") or obj["/A"].get("/D")
121
+ target_page = resolve_pypdf_destination(reader, dest, obj_id_to_page)
122
+ link_dict.update({
123
+ 'type': 'Internal (GoTo/Dest)',
124
+ 'destination_page': target_page,
125
+ 'target': f"Page {target_page}"
126
+ })
127
+
128
+ # Handle Remote GoTo (GoToR)
129
+ elif "/A" in obj and obj["/A"].get("/S") == "/GoToR":
130
+ remote_file = obj["/A"].get("/F")
131
+ link_dict.update({
132
+ 'type': 'Remote (GoToR)',
133
+ 'remote_file': str(remote_file),
134
+ 'target': f"File: {remote_file}"
135
+ })
136
+
137
+ all_links.append(link_dict)
138
+
139
+ return all_links
140
+
141
+
142
+ def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
143
+ try:
144
+ reader = PdfReader(pdf_path)
145
+ # Note: outline is a property, not a method.
146
+ toc_tree = reader.outline
147
+ toc_data = []
148
+
149
+ def flatten_outline(outline_items, level=1):
150
+ for item in outline_items:
151
+ if isinstance(item, Destination):
152
+ # Using the reader directly is the only way to avoid
153
+ # the 'Destination' object has no attribute error
154
+ try:
155
+ page_num = reader.get_destination_page_number(item) + 1
156
+ except:
157
+ page_num = "N/A"
158
+
159
+ toc_data.append({
160
+ "level": level,
161
+ "title": item.title,
162
+ "target_page": page_num
163
+ })
164
+ elif isinstance(item, list):
165
+ # pypdf nests children in a list immediately following the parent
166
+ flatten_outline(item, level + 1)
167
+
168
+ flatten_outline(toc_tree)
169
+ return toc_data
170
+ except Exception as e:
171
+ print(f"TOC error: {e}", file=sys.stderr)
172
+ return []
173
+
174
+ def call_stable():
175
+ """
176
+ Placeholder function for command-line execution (e.g., in __main__).
177
+ Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
178
+ passing them as arguments to run_report.
179
+ """
180
+ run_report(pdf_library = "pypdf")
181
+ #run_validation(pdf_library = "pypdf")
182
+
183
+ if __name__ == "__main__":
184
+ call_stable()