pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +51 -13
- pdflinkcheck/{analyze.py → analyze_pymupdf.py} +54 -224
- pdflinkcheck/analyze_pypdf.py +184 -0
- pdflinkcheck/analyze_pypdf_v2.py +218 -0
- pdflinkcheck/cli.py +238 -39
- pdflinkcheck/data/LICENSE +5 -24
- pdflinkcheck/data/README.md +278 -0
- pdflinkcheck/data/pyproject.toml +98 -0
- pdflinkcheck/datacopy.py +60 -0
- pdflinkcheck/dev.py +109 -0
- pdflinkcheck/gui.py +371 -74
- pdflinkcheck/io.py +118 -11
- pdflinkcheck/report.py +280 -0
- pdflinkcheck/stdlib_server.py +176 -0
- pdflinkcheck/validate.py +380 -0
- pdflinkcheck/version_info.py +83 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/METADATA +127 -71
- pdflinkcheck-1.1.72.dist-info/RECORD +21 -0
- pdflinkcheck-1.1.72.dist-info/WHEEL +4 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/entry_points.txt +1 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.72.dist-info}/licenses/LICENSE +5 -24
- pdflinkcheck/remnants.py +0 -142
- pdflinkcheck-1.1.47.dist-info/RECORD +0 -13
- pdflinkcheck-1.1.47.dist-info/WHEEL +0 -5
- pdflinkcheck-1.1.47.dist-info/top_level.txt +0 -1
pdflinkcheck/__init__.py
CHANGED
|
@@ -1,31 +1,69 @@
|
|
|
1
|
+
# src/pdflinkcheck/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
# License information
|
|
4
|
+
pdflinkcheck - A PDF Link Checker
|
|
5
|
+
|
|
6
|
+
Copyright (C) 2025 George Clayton Bennett
|
|
7
|
+
|
|
8
|
+
Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
|
|
9
|
+
|
|
10
|
+
This program is free software: You can redistribute it and/or modify
|
|
11
|
+
it under the terms of the GNU Affero General Public License as
|
|
12
|
+
published by the Free Software Foundation, either version 3 of the
|
|
13
|
+
License, or (at your option) any later version.
|
|
14
|
+
|
|
15
|
+
The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
|
|
16
|
+
"""
|
|
17
|
+
import os as _os
|
|
18
|
+
|
|
1
19
|
# Library functions
|
|
2
|
-
from pdflinkcheck.
|
|
20
|
+
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
|
|
21
|
+
from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
|
|
22
|
+
#from pdflinkcheck import analyze_pypdf
|
|
23
|
+
from pdflinkcheck.report import run_report
|
|
24
|
+
from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previous versions
|
|
25
|
+
#from pdflinkcheck import dev
|
|
3
26
|
|
|
4
27
|
# For the kids. This is what I wanted when learning Python in a mysterious new REPL.
|
|
5
28
|
# Is this Pythonic? No. Oh well. PEP 8, PEP 20.
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
if
|
|
29
|
+
# Why is this not Pythonic? Devs expect no side effects when importing library functions.
|
|
30
|
+
# What is a side effect?
|
|
31
|
+
_gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
|
|
32
|
+
_load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
|
|
33
|
+
if _load_gui_func:
|
|
11
34
|
try:
|
|
12
|
-
import pyhabitat # pyhabitat is a dependency of this package already
|
|
13
|
-
if
|
|
35
|
+
import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
|
|
36
|
+
if _pyhabitat.tkinter_is_available():
|
|
14
37
|
from pdflinkcheck.gui import start_gui
|
|
15
38
|
except ImportError:
|
|
16
39
|
# Optional: log or ignore silently
|
|
17
|
-
|
|
40
|
+
print("start_gui() not imported")
|
|
18
41
|
|
|
19
42
|
# Breadcrumbs, for stumbling upon.
|
|
20
|
-
if
|
|
43
|
+
if _load_gui_func:
|
|
21
44
|
__pdflinkcheck_gui_easteregg_enabled__ = True
|
|
22
45
|
else:
|
|
23
46
|
__pdflinkcheck_gui_easteregg_enabled__ = False
|
|
24
47
|
|
|
25
48
|
# Define __all__ such that the library functions are self documenting.
|
|
26
49
|
__all__ = [
|
|
50
|
+
"run_report",
|
|
27
51
|
"run_analysis",
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
52
|
+
"extract_links_pymupdf",
|
|
53
|
+
"extract_toc_pymupdf",
|
|
54
|
+
"extract_links_pypdf",
|
|
55
|
+
"extract_toc_pypdf",
|
|
56
|
+
#"start_gui" if _load_gui_func else None,
|
|
57
|
+
#"dev",
|
|
31
58
|
]
|
|
59
|
+
if _load_gui_func:
|
|
60
|
+
__all__.append("start_gui")
|
|
61
|
+
|
|
62
|
+
# 4. THE CLEANUP (This removes items from dir())
|
|
63
|
+
del _os
|
|
64
|
+
del _gui_easteregg_env_flag
|
|
65
|
+
del _load_gui_func
|
|
66
|
+
|
|
67
|
+
# Keep 'io' from appearing in the package namespace; it is likely present because it is imported by another package that is imported here:
|
|
68
|
+
#if "io" in locals():
|
|
69
|
+
# del io
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Dict, Any, Optional
|
|
5
|
-
|
|
6
|
-
# Configure logging to suppress low-level pdfminer messages
|
|
4
|
+
from typing import Dict, Any, Optional, List
|
|
5
|
+
|
|
7
6
|
logging.getLogger("fitz").setLevel(logging.ERROR)
|
|
8
|
-
import fitz # PyMuPDF
|
|
9
7
|
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
try:
|
|
9
|
+
import fitz # PyMuPDF
|
|
10
|
+
except ImportError:
|
|
11
|
+
fitz = None
|
|
12
|
+
|
|
13
|
+
from pdflinkcheck.report import run_report
|
|
14
|
+
#from pdflinkcheck.validate import run_validation
|
|
12
15
|
|
|
13
16
|
"""
|
|
14
17
|
Inspect target PDF for both URI links and for GoTo links.
|
|
@@ -41,6 +44,44 @@ def get_link_rect(link_dict):
|
|
|
41
44
|
return None
|
|
42
45
|
|
|
43
46
|
def get_anchor_text(page, link_rect):
|
|
47
|
+
if not link_rect:
|
|
48
|
+
return "N/A: Missing Rect"
|
|
49
|
+
|
|
50
|
+
try:
|
|
51
|
+
# 1. Convert to fitz.Rect and normalize
|
|
52
|
+
rect = fitz.Rect(link_rect)
|
|
53
|
+
if rect.is_empty:
|
|
54
|
+
return "N/A: Rect Error"
|
|
55
|
+
|
|
56
|
+
# 2. Use asymmetric expansion (similar to the pypdf logic)
|
|
57
|
+
# 10 points horizontal to catch wide characters/kerning
|
|
58
|
+
# 3 points vertical to stay within the line
|
|
59
|
+
search_rect = fitz.Rect(
|
|
60
|
+
rect.x0 - 10,
|
|
61
|
+
rect.y0 - 3,
|
|
62
|
+
rect.x1 + 10,
|
|
63
|
+
rect.y1 + 3
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# 3. Extract all words on the page
|
|
67
|
+
# Each word is: (x0, y0, x1, y1, "text", block_no, line_no, word_no)
|
|
68
|
+
words = page.get_text("words")
|
|
69
|
+
|
|
70
|
+
anchor_parts = []
|
|
71
|
+
for w in words:
|
|
72
|
+
word_rect = fitz.Rect(w[:4])
|
|
73
|
+
# Check if the word intersects our expanded link rectangle
|
|
74
|
+
if word_rect.intersects(search_rect):
|
|
75
|
+
anchor_parts.append(w[4])
|
|
76
|
+
|
|
77
|
+
cleaned_text = " ".join(anchor_parts).strip()
|
|
78
|
+
|
|
79
|
+
return cleaned_text if cleaned_text else "N/A: No Visible Text"
|
|
80
|
+
|
|
81
|
+
except Exception:
|
|
82
|
+
return "N/A: Rect Error"
|
|
83
|
+
|
|
84
|
+
def get_anchor_text_stable(page, link_rect):
|
|
44
85
|
"""
|
|
45
86
|
Extracts text content using the link's bounding box coordinates.
|
|
46
87
|
The bounding box is slightly expanded to ensure full characters are captured.
|
|
@@ -91,7 +132,6 @@ def get_anchor_text(page, link_rect):
|
|
|
91
132
|
# Fallback for unexpected errors in rect conversion or retrieval
|
|
92
133
|
return "N/A: Rect Error"
|
|
93
134
|
|
|
94
|
-
|
|
95
135
|
def analyze_toc_fitz(doc):
|
|
96
136
|
"""
|
|
97
137
|
Extracts the structural Table of Contents (PDF Bookmarks/Outline)
|
|
@@ -120,7 +160,7 @@ def analyze_toc_fitz(doc):
|
|
|
120
160
|
|
|
121
161
|
# 2. Updated Main Inspection Function to Include Text Extraction
|
|
122
162
|
#def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
123
|
-
def
|
|
163
|
+
def extract_toc_pymupdf(pdf_path):
|
|
124
164
|
"""
|
|
125
165
|
Opens a PDF, iterates through all pages and extracts the structural table of contents (TOC/bookmarks).
|
|
126
166
|
|
|
@@ -165,7 +205,8 @@ def serialize_fitz_object(obj):
|
|
|
165
205
|
# Otherwise, return the object as is (it's already primitive)
|
|
166
206
|
return obj
|
|
167
207
|
|
|
168
|
-
|
|
208
|
+
|
|
209
|
+
def extract_links_pymupdf(pdf_path):
|
|
169
210
|
"""
|
|
170
211
|
Opens a PDF, iterates through all pages and extracts all link annotations.
|
|
171
212
|
It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
|
|
@@ -225,7 +266,7 @@ def extract_links(pdf_path):
|
|
|
225
266
|
# This will be skipped by URI, which is not expected to have a page key
|
|
226
267
|
target_page_num_reported = "N/A"
|
|
227
268
|
if link.get('page') is not None:
|
|
228
|
-
target_page_num_reported = int(link.get('page')) # accurate for link target, don't add 1 (weird)
|
|
269
|
+
target_page_num_reported = int(link.get('page'))+1 # link 'page' is treated as 0-based here; add 1 to report a 1-based page number
|
|
229
270
|
|
|
230
271
|
if link['kind'] == fitz.LINK_URI:
|
|
231
272
|
target = link.get('uri', 'URI (Unknown Target)')
|
|
@@ -283,226 +324,15 @@ def extract_links(pdf_path):
|
|
|
283
324
|
print(f"An error occurred: {e}", file=sys.stderr)
|
|
284
325
|
return links_data
|
|
285
326
|
|
|
286
|
-
def print_structural_toc(structural_toc):
|
|
287
|
-
"""
|
|
288
|
-
Prints the structural TOC data (bookmarks/outline) in a clean,
|
|
289
|
-
hierarchical, and readable console format.
|
|
290
|
-
|
|
291
|
-
Args:
|
|
292
|
-
structural_toc: A list of TOC dictionaries returned by `analyze_toc_fitz`.
|
|
293
|
-
"""
|
|
294
|
-
print("\n" + "=" * 70)
|
|
295
|
-
print("## Structural Table of Contents (PDF Bookmarks/Outline)")
|
|
296
|
-
print("=" * 70)
|
|
297
|
-
if not structural_toc:
|
|
298
|
-
print("No structural TOC (bookmarks/outline) found.")
|
|
299
|
-
return
|
|
300
|
-
|
|
301
|
-
# Determine max page width for consistent alignment (optional but nice)
|
|
302
|
-
max_page = max(item['target_page'] for item in structural_toc) if structural_toc else 1
|
|
303
|
-
page_width = len(str(max_page))
|
|
304
|
-
|
|
305
|
-
# Iterate and format
|
|
306
|
-
for item in structural_toc:
|
|
307
|
-
# Use level for indentation (e.g., Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces)
|
|
308
|
-
indent = " " * 4 * (item['level'] - 1)
|
|
309
|
-
# Format the title and target page number
|
|
310
|
-
page_str = str(item['target_page']).rjust(page_width)
|
|
311
|
-
print(f"{indent}{item['title']} . . . page {page_str}")
|
|
312
|
-
|
|
313
|
-
print("-" * 70)
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
def get_first_pdf_in_cwd() -> Optional[str]:
|
|
317
|
-
"""
|
|
318
|
-
Scans the current working directory (CWD) for the first file ending
|
|
319
|
-
with a '.pdf' extension (case-insensitive).
|
|
320
|
-
|
|
321
|
-
This is intended as a convenience function for running the tool
|
|
322
|
-
without explicitly specifying a path.
|
|
323
|
-
|
|
324
|
-
Returns:
|
|
325
|
-
The absolute path (as a string) to the first PDF file found,
|
|
326
|
-
or None if no PDF files are present in the CWD.
|
|
327
|
-
"""
|
|
328
|
-
# 1. Get the current working directory (CWD)
|
|
329
|
-
cwd = Path.cwd()
|
|
330
|
-
|
|
331
|
-
# 2. Use Path.glob to find files matching the pattern.
|
|
332
|
-
# We use '**/*.pdf' to also search nested directories if desired,
|
|
333
|
-
# but typically for a single PDF in CWD, '*.pdf' is enough.
|
|
334
|
-
# Let's stick to files directly in the CWD for simplicity.
|
|
335
|
-
|
|
336
|
-
# We use list comprehension with next() for efficiency, or a simple loop.
|
|
337
|
-
# Using Path.glob('*.pdf') to search the CWD for files ending in .pdf
|
|
338
|
-
# We make it case-insensitive by checking both '*.pdf' and '*.PDF'
|
|
339
|
-
|
|
340
|
-
# Note: On Unix systems, glob is case-sensitive by default.
|
|
341
|
-
# The most cross-platform safe way is to iterate and check the suffix.
|
|
342
|
-
|
|
343
|
-
try:
|
|
344
|
-
# Check for files in the current directory only
|
|
345
|
-
# Iterating over the generator stops as soon as the first match is found.
|
|
346
|
-
first_pdf_path = next(
|
|
347
|
-
p.resolve() for p in cwd.iterdir()
|
|
348
|
-
if p.is_file() and p.suffix.lower() == '.pdf'
|
|
349
|
-
)
|
|
350
|
-
return str(first_pdf_path)
|
|
351
|
-
except StopIteration:
|
|
352
|
-
# If the generator runs out of items, no PDF was found
|
|
353
|
-
return None
|
|
354
|
-
except Exception as e:
|
|
355
|
-
# Handle potential permissions errors or other issues
|
|
356
|
-
print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
|
|
357
|
-
return None
|
|
358
|
-
|
|
359
|
-
def run_analysis(pdf_path: str = None, check_remnants: bool = True, max_links: int = 0, export_format: Optional[str] = "JSON") -> Dict[str, Any]:
|
|
360
|
-
"""
|
|
361
|
-
Core high-level PDF link analysis logic.
|
|
362
|
-
|
|
363
|
-
This function orchestrates the extraction of active links and TOC
|
|
364
|
-
using PyMuPDF, finds link remnants (plain text URLs/emails), and
|
|
365
|
-
prints a comprehensive, user-friendly report to the console.
|
|
366
|
-
|
|
367
|
-
Args:
|
|
368
|
-
pdf_path: The file system path (str) to the target PDF document.
|
|
369
|
-
check_remnants: Boolean flag to enable/disable scanning for plain text
|
|
370
|
-
links that are not active hyperlinks.
|
|
371
|
-
max_links: Maximum number of links/remnants to display in each console
|
|
372
|
-
section. If <= 0, all links will be displayed.
|
|
373
|
-
|
|
374
|
-
Returns:
|
|
375
|
-
A dictionary containing the structured results of the analysis:
|
|
376
|
-
'external_links', 'internal_links', 'remnants', and 'toc'.
|
|
377
|
-
"""
|
|
378
|
-
|
|
379
|
-
if pdf_path is None:
|
|
380
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
381
|
-
if pdf_path is None:
|
|
382
|
-
print("pdf_path is None")
|
|
383
|
-
print("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
384
|
-
return
|
|
385
|
-
try:
|
|
386
|
-
print(f"Running PyMuPDF analysis on {Path(pdf_path).name}...")
|
|
387
|
-
|
|
388
|
-
# 1. Extract all active links and TOC
|
|
389
|
-
extracted_links = extract_links(pdf_path)
|
|
390
|
-
structural_toc = extract_toc(pdf_path)
|
|
391
|
-
toc_entry_count = len(structural_toc)
|
|
392
|
-
|
|
393
|
-
# 2. Find link remnants
|
|
394
|
-
remnants = []
|
|
395
|
-
if check_remnants:
|
|
396
|
-
remnants = find_link_remnants(pdf_path, extracted_links) # Pass active links to exclude them
|
|
397
|
-
|
|
398
|
-
if not extracted_links and not remnants and not structural_toc:
|
|
399
|
-
print(f"\nNo hyperlinks, remnants, or structural TOC found in {Path(pdf_path).name}.")
|
|
400
|
-
return {}
|
|
401
|
-
|
|
402
|
-
# 3. Separate the lists based on the 'type' key
|
|
403
|
-
uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
404
|
-
goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
|
|
405
|
-
resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
|
|
406
|
-
other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
|
|
407
|
-
|
|
408
|
-
total_internal_links = len(goto_links) + len(resolved_action_links)
|
|
409
|
-
|
|
410
|
-
# --- ANALYSIS SUMMARY (Using your print logic) ---
|
|
411
|
-
print("\n" + "✪" * 70)
|
|
412
|
-
print(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
|
|
413
|
-
print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
|
|
414
|
-
print(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
|
|
415
|
-
print(f"Total **potential missing links** found: {len(remnants)}")
|
|
416
|
-
print("✪" * 70)
|
|
417
|
-
|
|
418
|
-
limit = max_links if max_links > 0 else None
|
|
419
|
-
|
|
420
|
-
uri_and_other = uri_links + other_links
|
|
421
|
-
|
|
422
|
-
# --- Section 1: ACTIVE URI LINKS ---
|
|
423
|
-
print("\n" + "=" * 70)
|
|
424
|
-
print(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
|
|
425
|
-
print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
426
|
-
print("=" * 70)
|
|
427
|
-
|
|
428
|
-
if uri_and_other:
|
|
429
|
-
for i, link in enumerate(uri_and_other[:limit], 1):
|
|
430
|
-
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
431
|
-
link_text = link.get('link_text', 'N/A')
|
|
432
|
-
print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
|
|
433
|
-
if limit is not None and len(uri_and_other) > limit:
|
|
434
|
-
print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
|
|
435
|
-
|
|
436
|
-
else:
|
|
437
|
-
print(" No external or 'Other' links found.")
|
|
438
|
-
|
|
439
|
-
# --- Section 2: ACTIVE INTERNAL JUMPS ---
|
|
440
|
-
print("\n" + "=" * 70)
|
|
441
|
-
print(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
|
|
442
|
-
print("=" * 70)
|
|
443
|
-
print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
444
|
-
print("-" * 70)
|
|
445
|
-
|
|
446
|
-
all_internal = goto_links + resolved_action_links
|
|
447
|
-
if total_internal_links > 0:
|
|
448
|
-
for i, link in enumerate(all_internal[:limit], 1):
|
|
449
|
-
link_text = link.get('link_text', 'N/A')
|
|
450
|
-
print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
|
|
451
|
-
|
|
452
|
-
if limit is not None and len(all_internal) > limit:
|
|
453
|
-
print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
|
|
454
|
-
else:
|
|
455
|
-
print(" No internal GoTo or Resolved Action links found.")
|
|
456
|
-
|
|
457
|
-
# --- Section 3: REMNANTS ---
|
|
458
|
-
print("\n" + "=" * 70)
|
|
459
|
-
print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
|
|
460
|
-
print("=" * 70)
|
|
461
|
-
|
|
462
|
-
if remnants:
|
|
463
|
-
print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
|
|
464
|
-
print("-" * 70)
|
|
465
|
-
for i, remnant in enumerate(remnants[:limit], 1):
|
|
466
|
-
print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
|
|
467
|
-
if max_links!=0 and len(remnants) > max_links:
|
|
468
|
-
print(f"... and {len(remnants) - max_links} more remnants (use --max-links to see all).")
|
|
469
|
-
else:
|
|
470
|
-
print(" No URI or Email remnants found that are not already active links.")
|
|
471
|
-
|
|
472
|
-
# --- Section 4: TOC ---
|
|
473
|
-
print_structural_toc(structural_toc)
|
|
474
|
-
|
|
475
|
-
# Return the collected data for potential future JSON/other output
|
|
476
|
-
final_report_data = {
|
|
477
|
-
"external_links": uri_links,
|
|
478
|
-
"internal_links": all_internal,
|
|
479
|
-
"remnants": remnants,
|
|
480
|
-
"toc": structural_toc
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
# 5. Export Report
|
|
484
|
-
if export_format:
|
|
485
|
-
# Assuming export_to will hold the output format string (e.g., "JSON")
|
|
486
|
-
export_report_data(final_report_data, Path(pdf_path).name, export_format)
|
|
487
|
-
|
|
488
|
-
return final_report_data
|
|
489
|
-
except Exception as e:
|
|
490
|
-
# Log the critical failure
|
|
491
|
-
error_logger.error(f"Critical failure during run_analysis for {pdf_path}: {e}", exc_info=True)
|
|
492
|
-
print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
493
|
-
raise # Allow the exception to propagate or handle gracefully
|
|
494
|
-
|
|
495
|
-
|
|
496
327
|
|
|
497
328
|
def call_stable():
|
|
498
329
|
"""
|
|
499
330
|
Placeholder function for command-line execution (e.g., in __main__).
|
|
500
331
|
Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
|
|
501
|
-
passing them as arguments to
|
|
332
|
+
passing them as arguments to run_report.
|
|
502
333
|
"""
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
print("Analysis complete.")
|
|
334
|
+
run_report(pdf_library = "pymupdf")
|
|
335
|
+
#run_validation(pdf_library = "pymupdf")
|
|
506
336
|
|
|
507
337
|
if __name__ == "__main__":
|
|
508
338
|
call_stable()
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# src/pdflinkcheck/analyze_pypdf.py
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Dict, Any, Optional, List
|
|
6
|
+
|
|
7
|
+
from pypdf import PdfReader
|
|
8
|
+
from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
|
|
12
|
+
from pdflinkcheck.report import run_report
|
|
13
|
+
#from pdflinkcheck.validate import run_validation
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def get_anchor_text_pypdf(page, rect) -> str:
|
|
20
|
+
"""
|
|
21
|
+
Extracts text within the link's bounding box using a visitor function.
|
|
22
|
+
Reliable for finding text associated with a link without PyMuPDF.
|
|
23
|
+
"""
|
|
24
|
+
if not rect:
|
|
25
|
+
return "N/A: Missing Rect"
|
|
26
|
+
|
|
27
|
+
# Standardize rect orientation (pypdf Rects are [x0, y0, x1, y1])
|
|
28
|
+
# Note: PDF coordinates use bottom-left as (0,0)
|
|
29
|
+
x_min = min(rect[0], rect[2])
|
|
30
|
+
y_min = min(rect[1], rect[3])
|
|
31
|
+
x_max = max(rect[0], rect[2])
|
|
32
|
+
y_max = max(rect[1], rect[3])
|
|
33
|
+
|
|
34
|
+
parts: List[str] = []
|
|
35
|
+
|
|
36
|
+
def visitor_body(text, cm, tm, font_dict, font_size):
|
|
37
|
+
# tm[4], tm[5] are the current text insertion point coordinates (x, y)
|
|
38
|
+
x, y = tm[4], tm[5]
|
|
39
|
+
|
|
40
|
+
# Using a threshold to account for font metrics/descenders
|
|
41
|
+
# Generous tolerance (±10 pt) to catch descenders, ascenders, kerning, and minor misalignments
|
|
42
|
+
tolerance = 10
|
|
43
|
+
if (x_min - tolerance) <= x <= (x_max + tolerance) and (y_min - tolerance) <= y <= (y_max + tolerance):
|
|
44
|
+
if text.strip():
|
|
45
|
+
parts.append(text)
|
|
46
|
+
|
|
47
|
+
page.extract_text(visitor_text=visitor_body)
|
|
48
|
+
|
|
49
|
+
raw_extracted = "".join(parts)
|
|
50
|
+
cleaned = " ".join(raw_extracted.split()).strip()
|
|
51
|
+
|
|
52
|
+
return cleaned if cleaned else "Graphic/Empty Link"
|
|
53
|
+
|
|
54
|
+
def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
|
|
55
|
+
"""
|
|
56
|
+
Resolves a Destination object or IndirectObject to a 1-based page number string.
|
|
57
|
+
"""
|
|
58
|
+
try:
|
|
59
|
+
if isinstance(dest, Destination):
|
|
60
|
+
return str(dest.page_number + 1)
|
|
61
|
+
|
|
62
|
+
if isinstance(dest, IndirectObject):
|
|
63
|
+
return str(obj_id_to_page.get(dest.idnum, "Unknown"))
|
|
64
|
+
|
|
65
|
+
if isinstance(dest, ArrayObject) and len(dest) > 0:
|
|
66
|
+
if isinstance(dest[0], IndirectObject):
|
|
67
|
+
return str(obj_id_to_page.get(dest[0].idnum, "Unknown"))
|
|
68
|
+
|
|
69
|
+
return "Unknown"
|
|
70
|
+
except Exception:
|
|
71
|
+
return "Error Resolving"
|
|
72
|
+
|
|
73
|
+
def extract_links_pypdf(pdf_path):
|
|
74
|
+
"""
|
|
75
|
+
Termux-compatible link extraction using pure-Python pypdf.
|
|
76
|
+
Matches the reporting schema of the PyMuPDF version.
|
|
77
|
+
"""
|
|
78
|
+
reader = PdfReader(pdf_path)
|
|
79
|
+
|
|
80
|
+
# Pre-map Object IDs to Page Numbers for fast internal link resolution
|
|
81
|
+
obj_id_to_page = {
|
|
82
|
+
page.indirect_reference.idnum: i + 1
|
|
83
|
+
for i, page in enumerate(reader.pages)
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
all_links = []
|
|
87
|
+
|
|
88
|
+
for i, page in enumerate(reader.pages):
|
|
89
|
+
page_num = i + 1
|
|
90
|
+
if "/Annots" not in page:
|
|
91
|
+
continue
|
|
92
|
+
|
|
93
|
+
for annot in page["/Annots"]:
|
|
94
|
+
obj = annot.get_object()
|
|
95
|
+
if obj.get("/Subtype") != "/Link":
|
|
96
|
+
continue
|
|
97
|
+
|
|
98
|
+
rect = obj.get("/Rect")
|
|
99
|
+
anchor_text = get_anchor_text_pypdf(page, rect)
|
|
100
|
+
|
|
101
|
+
link_dict = {
|
|
102
|
+
'page': page_num,
|
|
103
|
+
'rect': list(rect) if rect else None,
|
|
104
|
+
'link_text': anchor_text,
|
|
105
|
+
'type': 'Other Action',
|
|
106
|
+
'target': 'Unknown'
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
# Handle URI (External)
|
|
110
|
+
if "/A" in obj and "/URI" in obj["/A"]:
|
|
111
|
+
uri = obj["/A"]["/URI"]
|
|
112
|
+
link_dict.update({
|
|
113
|
+
'type': 'External (URI)',
|
|
114
|
+
'url': uri,
|
|
115
|
+
'target': uri
|
|
116
|
+
})
|
|
117
|
+
|
|
118
|
+
# Handle GoTo (Internal)
|
|
119
|
+
elif "/Dest" in obj or ("/A" in obj and "/D" in obj["/A"]):
|
|
120
|
+
dest = obj.get("/Dest") or obj["/A"].get("/D")
|
|
121
|
+
target_page = resolve_pypdf_destination(reader, dest, obj_id_to_page)
|
|
122
|
+
link_dict.update({
|
|
123
|
+
'type': 'Internal (GoTo/Dest)',
|
|
124
|
+
'destination_page': target_page,
|
|
125
|
+
'target': f"Page {target_page}"
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
# Handle Remote GoTo (GoToR)
|
|
129
|
+
elif "/A" in obj and obj["/A"].get("/S") == "/GoToR":
|
|
130
|
+
remote_file = obj["/A"].get("/F")
|
|
131
|
+
link_dict.update({
|
|
132
|
+
'type': 'Remote (GoToR)',
|
|
133
|
+
'remote_file': str(remote_file),
|
|
134
|
+
'target': f"File: {remote_file}"
|
|
135
|
+
})
|
|
136
|
+
|
|
137
|
+
all_links.append(link_dict)
|
|
138
|
+
|
|
139
|
+
return all_links
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
|
|
143
|
+
try:
|
|
144
|
+
reader = PdfReader(pdf_path)
|
|
145
|
+
# Note: outline is a property, not a method.
|
|
146
|
+
toc_tree = reader.outline
|
|
147
|
+
toc_data = []
|
|
148
|
+
|
|
149
|
+
def flatten_outline(outline_items, level=1):
|
|
150
|
+
for item in outline_items:
|
|
151
|
+
if isinstance(item, Destination):
|
|
152
|
+
# Using the reader directly is the only way to avoid
|
|
153
|
+
# the 'Destination' object has no attribute error
|
|
154
|
+
try:
|
|
155
|
+
page_num = reader.get_destination_page_number(item) + 1
|
|
156
|
+
except:
|
|
157
|
+
page_num = "N/A"
|
|
158
|
+
|
|
159
|
+
toc_data.append({
|
|
160
|
+
"level": level,
|
|
161
|
+
"title": item.title,
|
|
162
|
+
"target_page": page_num
|
|
163
|
+
})
|
|
164
|
+
elif isinstance(item, list):
|
|
165
|
+
# pypdf nests children in a list immediately following the parent
|
|
166
|
+
flatten_outline(item, level + 1)
|
|
167
|
+
|
|
168
|
+
flatten_outline(toc_tree)
|
|
169
|
+
return toc_data
|
|
170
|
+
except Exception as e:
|
|
171
|
+
print(f"TOC error: {e}", file=sys.stderr)
|
|
172
|
+
return []
|
|
173
|
+
|
|
174
|
+
def call_stable():
|
|
175
|
+
"""
|
|
176
|
+
Placeholder function for command-line execution (e.g., in __main__).
|
|
177
|
+
Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
|
|
178
|
+
passing them as arguments to run_report.
|
|
179
|
+
"""
|
|
180
|
+
run_report(pdf_library = "pypdf")
|
|
181
|
+
#run_validation(pdf_library = "pypdf")
|
|
182
|
+
|
|
183
|
+
if __name__ == "__main__":
|
|
184
|
+
call_stable()
|