pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +69 -0
- pdflinkcheck/analyze_pymupdf.py +338 -0
- pdflinkcheck/analyze_pypdf.py +184 -0
- pdflinkcheck/analyze_pypdf_v2.py +218 -0
- pdflinkcheck/cli.py +303 -27
- pdflinkcheck/data/LICENSE +661 -0
- pdflinkcheck/data/README.md +278 -0
- pdflinkcheck/data/pyproject.toml +98 -0
- pdflinkcheck/datacopy.py +60 -0
- pdflinkcheck/dev.py +109 -0
- pdflinkcheck/gui.py +477 -52
- pdflinkcheck/io.py +213 -0
- pdflinkcheck/report.py +280 -0
- pdflinkcheck/stdlib_server.py +176 -0
- pdflinkcheck/validate.py +380 -0
- pdflinkcheck/version_info.py +83 -0
- pdflinkcheck-1.1.72.dist-info/METADATA +322 -0
- pdflinkcheck-1.1.72.dist-info/RECORD +21 -0
- pdflinkcheck-1.1.72.dist-info/WHEEL +4 -0
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.72.dist-info}/entry_points.txt +1 -1
- pdflinkcheck-1.1.72.dist-info/licenses/LICENSE +661 -0
- pdflinkcheck/analyze.py +0 -330
- pdflinkcheck/remnants.py +0 -142
- pdflinkcheck-1.1.7.dist-info/METADATA +0 -109
- pdflinkcheck-1.1.7.dist-info/RECORD +0 -10
- pdflinkcheck-1.1.7.dist-info/WHEEL +0 -5
- pdflinkcheck-1.1.7.dist-info/top_level.txt +0 -1
pdflinkcheck/__init__.py
CHANGED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# src/pdflinkcheck/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
# License information
|
|
4
|
+
pdflinkcheck - A PDF Link Checker
|
|
5
|
+
|
|
6
|
+
Copyright (C) 2025 George Clayton Bennett
|
|
7
|
+
|
|
8
|
+
Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
|
|
9
|
+
|
|
10
|
+
This program is free software: You can redistribute it and/or modify
|
|
11
|
+
it under the terms of the GNU Affero General Public License as
|
|
12
|
+
published by the Free Software Foundation, either version 3 of the
|
|
13
|
+
License, or (at your option) any later version.
|
|
14
|
+
|
|
15
|
+
The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
|
|
16
|
+
"""
|
|
17
|
+
import os as _os
|
|
18
|
+
|
|
19
|
+
# Library functions
|
|
20
|
+
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
|
|
21
|
+
from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
|
|
22
|
+
#from pdflinkcheck import analyze_pypdf
|
|
23
|
+
from pdflinkcheck.report import run_report
|
|
24
|
+
from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previous versions
|
|
25
|
+
#from pdflinkcheck import dev
|
|
26
|
+
|
|
27
|
+
# For the kids. This is what I wanted when learning Python in a mysterious new REPL.
|
|
28
|
+
# Is this Pythonic? No. Oh well. PEP 8, PEP 20.
|
|
29
|
+
# Why is this not Pythonic? Devs expect no side effects when importing library functions.
|
|
30
|
+
# What is a side effect?
|
|
31
|
+
# Opt-in GUI "easter egg": start_gui is only imported into the package
# namespace when PDFLINKCHECK_GUI_EASTEREGG is set to a truthy value.
_gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
_load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')

# Tracks whether start_gui was really bound. The flag alone is not enough:
# tkinter may be unavailable, or the import itself may fail.
_gui_func_loaded = False
if _load_gui_func:
    try:
        import pyhabitat as _pyhabitat  # pyhabitat is a dependency of this package already
        if _pyhabitat.tkinter_is_available():
            from pdflinkcheck.gui import start_gui
            _gui_func_loaded = True
    except ImportError:
        # Best effort: report and continue without the GUI entry point.
        print("start_gui() not imported")

# Breadcrumbs, for stumbling upon.
# Reflects whether the environment flag requested the GUI, not whether the
# import succeeded.
__pdflinkcheck_gui_easteregg_enabled__ = _load_gui_func

# Define __all__ such that the library functions are self documenting.
__all__ = [
    "run_report",
    "run_analysis",
    "extract_links_pymupdf",
    "extract_toc_pymupdf",
    "extract_links_pypdf",
    "extract_toc_pypdf",
]
# Only advertise start_gui when the name actually exists; listing a missing
# name in __all__ makes `from pdflinkcheck import *` raise AttributeError.
if _gui_func_loaded:
    __all__.append("start_gui")

# 4. THE CLEANUP (This removes items from dir())
del _os
del _gui_easteregg_env_flag
del _load_gui_func
del _gui_func_loaded
|
|
66
|
+
|
|
67
|
+
# Force avoid 'io' appearing, it's likely being imported, when it is imported by another package which is imported here:
|
|
68
|
+
#if "io" in locals():
|
|
69
|
+
# del io
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, Any, Optional, List
|
|
5
|
+
|
|
6
|
+
logging.getLogger("fitz").setLevel(logging.ERROR)
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import fitz # PyMuPDF
|
|
10
|
+
except ImportError:
|
|
11
|
+
fitz = None
|
|
12
|
+
|
|
13
|
+
from pdflinkcheck.report import run_report
|
|
14
|
+
#from pdflinkcheck.validate import run_validation
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
Inspect target PDF for both URI links and for GoTo links.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
# Helper function: Prioritize 'from'
|
|
21
|
+
def get_link_rect(link_dict):
    """
    Return a link's bounding box as a plain coordinate tuple.

    Args:
        link_dict: A single link dictionary, as returned by
            `page.get_links()`.

    Returns:
        A tuple (x0, y0, x1, y1) read from the object stored under the
        'from' key, or None when that entry is missing or falsy.
    """
    bbox = link_dict.get('from')

    # Guard clause: nothing usable under 'from'.
    if not bbox:
        return None

    return (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
|
|
45
|
+
|
|
46
|
+
def get_anchor_text(page, link_rect):
    """
    Collect the words on `page` that overlap a link's (padded) rectangle.

    Args:
        page: The page object the link lives on; must provide
            `get_text("words")`.
        link_rect: Coordinates (x0, y0, x1, y1) of the link's bounding box.

    Returns:
        The overlapping words joined by single spaces, or one of the
        placeholder strings "N/A: Missing Rect", "N/A: Rect Error" or
        "N/A: No Visible Text".
    """
    if not link_rect:
        return "N/A: Missing Rect"

    try:
        link_box = fitz.Rect(link_rect)
        if link_box.is_empty:
            return "N/A: Rect Error"

        # Asymmetric padding: 10 points horizontally, 3 points vertically,
        # so neighbouring lines are less likely to be picked up.
        hit_box = fitz.Rect(
            link_box.x0 - 10,
            link_box.y0 - 3,
            link_box.x1 + 10,
            link_box.y1 + 3,
        )

        # Each entry is (x0, y0, x1, y1, "text", block_no, line_no, word_no).
        overlapping = [
            entry[4]
            for entry in page.get_text("words")
            if fitz.Rect(entry[:4]).intersects(hit_box)
        ]

        joined = " ".join(overlapping).strip()
        if joined:
            return joined
        return "N/A: No Visible Text"

    except Exception:
        return "N/A: Rect Error"
|
|
83
|
+
|
|
84
|
+
def get_anchor_text_stable(page, link_rect):
    """
    Read the text inside a link's bounding box, padded by one unit per side.

    Args:
        page: The fitz.Page object where the link is located.
        link_rect: A tuple (x0, y0, x1, y1) describing the link's
            bounding box.

    Returns:
        The whitespace-normalised text found in the padded box, or a
        placeholder string when the rect is missing, degenerate, contains
        no text, or an error occurs.
    """
    if not link_rect:
        return "N/A: Missing Rect"

    try:
        box = fitz.Rect(link_rect)

        # Reject rectangles without a positive area before doing any lookup.
        degenerate = box.is_empty or box.width <= 0 or box.height <= 0
        if degenerate:
            return "N/A: Rect Error (Zero/Negative Dimension)"

        # Grow the box by one unit on every side so characters on the edge
        # are included.
        padded = fitz.Rect(box.x0 - 1, box.y0 - 1, box.x1 + 1, box.y1 + 1)

        raw_text = page.get_textbox(padded)

        # split()/join collapses every whitespace run into a single space.
        collapsed = " ".join(raw_text.split())
        return collapsed or "N/A: No Visible Text"

    except Exception:
        # Any failure while building the rect or reading the text.
        return "N/A: Rect Error"
|
|
134
|
+
|
|
135
|
+
def analyze_toc_fitz(doc):
    """
    Convert the document's table of contents into a list of dictionaries.

    Args:
        doc: An open document object exposing `get_toc()`, which yields
            (level, title, page number) triples.

    Returns:
        One dictionary per TOC entry with the keys 'level', 'title' and
        'target_page'; the page number is passed through unchanged.
    """
    return [
        {'level': depth, 'title': heading, 'target_page': page_no}
        for depth, heading, page_no in doc.get_toc()
    ]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# 2. Updated Main Inspection Function to Include Text Extraction
|
|
162
|
+
#def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
163
|
+
def extract_toc_pymupdf(pdf_path):
    """
    Open a PDF and extract its structural table of contents (TOC/bookmarks).

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries representing the structural TOC/bookmarks.
        An empty list is returned when the document cannot be opened or
        read; the error is reported on stderr.
    """
    # Bound up front so the error path returns a list instead of raising
    # UnboundLocalError at the return statement.
    structural_toc = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        structural_toc = analyze_toc_fitz(doc)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Release the document handle whether or not extraction succeeded.
        if doc is not None:
            doc.close()
    return structural_toc
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def serialize_fitz_object(obj):
    """
    Turn a geometry-like object into a JSON-friendly value.

    Objects are recognised by the attributes they expose (duck typing):
    x/y without x0 -> 2-tuple, x0/y0 -> 4-tuple, a/b/c -> 6-tuple.
    Anything else that is not a str, int, float, bool, list, tuple or dict
    is converted with str(); None and those primitive types are returned
    unchanged.
    """
    if obj is None:
        return None

    has_corner = hasattr(obj, 'x0')

    # Point-like: x and y, but no rectangle corner.
    if not has_corner and hasattr(obj, 'x') and hasattr(obj, 'y'):
        return (obj.x, obj.y)

    # Rect-like: corner coordinates.
    if has_corner and hasattr(obj, 'y0'):
        return (obj.x0, obj.y0, obj.x1, obj.y1)

    # Matrix-like: six coefficients a..f.
    if all(hasattr(obj, coeff) for coeff in ('a', 'b', 'c')):
        return (obj.a, obj.b, obj.c, obj.d, obj.e, obj.f)

    # Already serialisable: hand back untouched.
    if isinstance(obj, (str, int, float, bool, list, tuple, dict)):
        return obj

    # Last resort for unknown objects.
    return str(obj)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def extract_links_pymupdf(pdf_path):
    """
    Open a PDF, walk every page and collect all link annotations.

    Each link is categorised as External (URI), Internal (GoTo/Dest),
    Remote (GoToR), Internal (Resolved Action) or Other Action, and its
    anchor text is extracted from the page.

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries, one per link. Every dictionary carries
        'page' (1-based page of the link), 'rect', 'link_text', 'xref' and
        'type'; the remaining keys depend on the link type. If an error
        occurs, it is reported on stderr and the links collected so far
        (possibly none) are returned.
    """
    links_data = []
    doc = None
    try:
        doc = fitz.open(pdf_path)

        for page_num in range(doc.page_count):
            # Load the page once and reuse it for every link on it.
            page = doc.load_page(page_num)

            for link in page.get_links():
                # Example key sets seen on link dictionaries:
                #   ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
                #   ['kind', 'xref', 'from', 'uri', 'id']
                #   ['kind', 'xref', 'from', 'page', 'view', 'id']
                link_rect = get_link_rect(link)
                xref = link.get("xref")
                kind = link.get('kind')

                # 1. Extract the anchor text
                anchor_text = get_anchor_text(page, link_rect)

                link_dict = {
                    'page': page_num + 1,  # page_num is 0-based; report 1-based
                    'rect': link_rect,
                    'link_text': anchor_text,
                    'xref': xref
                }

                # 2. 'to' may hold a geometry object; make it serialisable.
                destination_view = serialize_fitz_object(link.get('to'))

                # 3. Target page, shifted by one so it is reported on the
                #    same 1-based scale as 'page' above. URI links are not
                #    expected to carry a 'page' key and keep "N/A".
                target_page_num_reported = "N/A"
                if link.get('page') is not None:
                    target_page_num_reported = int(link.get('page')) + 1

                if kind == fitz.LINK_URI:
                    target = link.get('uri', 'URI (Unknown Target)')
                    link_dict.update({
                        'type': 'External (URI)',
                        'url': link.get('uri'),
                        'target': target
                    })

                elif kind == fitz.LINK_GOTO:
                    target = f"Page {target_page_num_reported}"
                    link_dict.update({
                        'type': 'Internal (GoTo/Dest)',
                        'destination_page': target_page_num_reported,
                        'destination_view': destination_view,
                        'target': target
                    })

                elif kind == fitz.LINK_GOTOR:
                    # NOTE(review): unlike the other branches, no 'target'
                    # key is set here; confirm consumers tolerate that.
                    link_dict.update({
                        'type': 'Remote (GoToR)',
                        'remote_file': link.get('file'),
                        'destination': destination_view
                    })

                elif link.get('page') is not None:
                    # Any other kind that still resolves to a page.
                    target = f"Page {target_page_num_reported}"
                    link_dict.update({
                        'type': 'Internal (Resolved Action)',
                        'destination_page': target_page_num_reported,
                        'destination_view': destination_view,
                        'source_kind': kind,
                        'target': target
                    })

                else:
                    # NOTE(review): these keys match the output schema
                    # rather than the example link keys listed above, so
                    # this may always yield None; verify.
                    target = link.get('url') or link.get('remote_file') or link.get('target')
                    link_dict.update({
                        'type': 'Other Action',
                        'action_kind': kind,
                        'target': target
                    })

                links_data.append(link_dict)

    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Close the document on the error path too.
        if doc is not None:
            doc.close()
    return links_data
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def call_stable():
    """
    Run the report with the PyMuPDF backend.

    Thin entry point: forwards to run_report with pdf_library set to
    "pymupdf" and no other arguments.
    """
    run_report(pdf_library="pymupdf")
|
|
335
|
+
#run_validation(pdf_library = "pymupdf")
|
|
336
|
+
|
|
337
|
+
if __name__ == "__main__":
|
|
338
|
+
call_stable()
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# src/pdflinkcheck/analyze_pypdf.py
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Dict, Any, Optional, List
|
|
6
|
+
|
|
7
|
+
from pypdf import PdfReader
|
|
8
|
+
from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
|
|
12
|
+
from pdflinkcheck.report import run_report
|
|
13
|
+
#from pdflinkcheck.validate import run_validation
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def get_anchor_text_pypdf(page, rect) -> str:
    """
    Gather the text fragments whose insertion point lies near a link's box.

    A visitor is passed to `page.extract_text`; every non-blank fragment
    whose text-matrix position (tm[4], tm[5]) falls inside the rectangle,
    widened by 10 points on each side, is kept.

    Args:
        page: Page object exposing `extract_text(visitor_text=...)`.
        rect: Four coordinates [x0, y0, x1, y1]; corner order is not
            assumed.

    Returns:
        The kept fragments concatenated with whitespace collapsed,
        "N/A: Missing Rect" when no rect is given, or
        "Graphic/Empty Link" when nothing matched.
    """
    if not rect:
        return "N/A: Missing Rect"

    # Generous margin to catch descenders, ascenders, kerning and minor
    # misalignments.
    margin = 10
    lo_x = min(rect[0], rect[2]) - margin
    hi_x = max(rect[0], rect[2]) + margin
    lo_y = min(rect[1], rect[3]) - margin
    hi_y = max(rect[1], rect[3]) + margin

    collected = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # tm[4], tm[5] hold the current text insertion point (x, y).
        pos_x, pos_y = tm[4], tm[5]
        inside = lo_x <= pos_x <= hi_x and lo_y <= pos_y <= hi_y
        if inside and text.strip():
            collected.append(text)

    page.extract_text(visitor_text=visitor_body)

    collapsed = " ".join("".join(collected).split())
    if collapsed:
        return collapsed
    return "Graphic/Empty Link"
|
|
53
|
+
|
|
54
|
+
def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
    """
    Resolve a link destination to a 1-based page number string.

    Args:
        reader: The PdfReader the destination belongs to; used to look up
            the page index of Destination objects.
        dest: A Destination, an IndirectObject referring to a page, or an
            ArrayObject whose first element is such an IndirectObject.
        obj_id_to_page: Mapping from page object id (idnum) to 1-based
            page number.

    Returns:
        The page number as a string, "Unknown" when the destination cannot
        be mapped to a page, or "Error Resolving" if an exception occurs.
    """
    try:
        if isinstance(dest, Destination):
            # Ask the reader for the page index, as extract_toc_pypdf does,
            # instead of reading a `page_number` attribute off the
            # Destination itself.
            page_index = reader.get_destination_page_number(dest)
            if page_index is None:
                return "Unknown"
            return str(page_index + 1)

        if isinstance(dest, IndirectObject):
            return str(obj_id_to_page.get(dest.idnum, "Unknown"))

        if isinstance(dest, ArrayObject) and len(dest) > 0:
            # Explicit destination array: the first element is the page.
            if isinstance(dest[0], IndirectObject):
                return str(obj_id_to_page.get(dest[0].idnum, "Unknown"))

        return "Unknown"
    except Exception:
        return "Error Resolving"
|
|
72
|
+
|
|
73
|
+
def extract_links_pypdf(pdf_path):
    """
    Extract link annotations from a PDF using pure-Python pypdf.

    Termux-compatible counterpart of the PyMuPDF extractor; the returned
    dictionaries use the same reporting keys ('page', 'rect', 'link_text',
    'type', 'target', plus type-specific keys).

    Args:
        pdf_path: Path to the PDF document, passed to PdfReader.

    Returns:
        A list with one dictionary per /Link annotation. Links that match
        no known action keep the defaults 'Other Action' / 'Unknown'.
    """
    reader = PdfReader(pdf_path)

    # Pre-map Object IDs to Page Numbers for fast internal link resolution
    obj_id_to_page = {
        page.indirect_reference.idnum: number
        for number, page in enumerate(reader.pages, start=1)
    }

    all_links = []

    for page_num, page in enumerate(reader.pages, start=1):
        if "/Annots" not in page:
            continue

        for annot in page["/Annots"]:
            obj = annot.get_object()
            if obj.get("/Subtype") != "/Link":
                continue

            rect = obj.get("/Rect")
            anchor_text = get_anchor_text_pypdf(page, rect)

            link_dict = {
                'page': page_num,
                'rect': list(rect) if rect else None,
                'link_text': anchor_text,
                'type': 'Other Action',
                'target': 'Unknown'
            }

            # Look the action dictionary up once; None when the annotation
            # has no /A entry.
            action = obj["/A"] if "/A" in obj else None

            # Handle URI (External)
            if action is not None and "/URI" in action:
                uri = action["/URI"]
                link_dict.update({
                    'type': 'External (URI)',
                    'url': uri,
                    'target': uri
                })

            # Handle Remote GoTo (GoToR). This must be tested before the
            # generic /D check below: a GoToR action also carries /D, so
            # it would otherwise be reported as an internal link.
            elif action is not None and action.get("/S") == "/GoToR":
                remote_file = action.get("/F")
                link_dict.update({
                    'type': 'Remote (GoToR)',
                    'remote_file': str(remote_file),
                    'target': f"File: {remote_file}"
                })

            # Handle GoTo (Internal)
            elif "/Dest" in obj or (action is not None and "/D" in action):
                dest = obj.get("/Dest")
                # Fall back to the action's /D only when an action exists,
                # so a falsy /Dest without /A cannot raise KeyError.
                if not dest and action is not None:
                    dest = action.get("/D")
                target_page = resolve_pypdf_destination(reader, dest, obj_id_to_page)
                link_dict.update({
                    'type': 'Internal (GoTo/Dest)',
                    'destination_page': target_page,
                    'target': f"Page {target_page}"
                })

            all_links.append(link_dict)

    return all_links
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
    """
    Extract the document outline (bookmarks) as a flat list using pypdf.

    Args:
        pdf_path: Path to the PDF document, passed to PdfReader.

    Returns:
        One dictionary per outline entry with the keys "level" (1 for
        top-level entries, increasing with nesting), "title" and
        "target_page" (1-based page number, or "N/A" when it cannot be
        resolved). An empty list is returned if reading fails; the error
        is reported on stderr.
    """
    try:
        reader = PdfReader(pdf_path)
        # Note: outline is a property, not a method.
        toc_tree = reader.outline
        toc_data = []

        def flatten_outline(outline_items, level=1):
            for item in outline_items:
                if isinstance(item, Destination):
                    # Using the reader directly is the only way to avoid
                    # the 'Destination' object has no attribute error
                    try:
                        page_num = reader.get_destination_page_number(item) + 1
                    except Exception:
                        # Narrowed from a bare except so that
                        # KeyboardInterrupt/SystemExit still propagate.
                        page_num = "N/A"

                    toc_data.append({
                        "level": level,
                        "title": item.title,
                        "target_page": page_num
                    })
                elif isinstance(item, list):
                    # pypdf nests children in a list immediately following the parent
                    flatten_outline(item, level + 1)

        flatten_outline(toc_tree)
        return toc_data
    except Exception as e:
        print(f"TOC error: {e}", file=sys.stderr)
        return []
|
|
173
|
+
|
|
174
|
+
def call_stable():
    """
    Run the report with the pypdf backend.

    Thin entry point: forwards to run_report with pdf_library set to
    "pypdf" and no other arguments.
    """
    run_report(pdf_library="pypdf")
|
|
181
|
+
#run_validation(pdf_library = "pypdf")
|
|
182
|
+
|
|
183
|
+
if __name__ == "__main__":
|
|
184
|
+
call_stable()
|