pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdflinkcheck/__init__.py CHANGED
@@ -0,0 +1,69 @@
1
+ # src/pdflinkcheck/__init__.py
2
+ """
3
+ # License information
4
+ pdflinkcheck - A PDF Link Checker
5
+
6
+ Copyright (C) 2025 George Clayton Bennett
7
+
8
+ Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
9
+
10
+ This program is free software: You can redistribute it and/or modify
11
+ it under the terms of the GNU Affero General Public License as
12
+ published by the Free Software Foundation, either version 3 of the
13
+ License, or (at your option) any later version.
14
+
15
+ The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
16
+ """
17
+ import os as _os
18
+
19
+ # Library functions
20
+ from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
21
+ from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
22
+ #from pdflinkcheck import analyze_pypdf
23
+ from pdflinkcheck.report import run_report
24
+ from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previous versions
25
+ #from pdflinkcheck import dev
26
+
27
+ # For the kids. This is what I wanted when learning Python in a mysterious new REPL.
28
+ # Is this Pythonic? No. Oh well. PEP 8, PEP 20.
29
+ # Why is this not Pythonic? Devs expect no side effects when importing library functions.
30
+ # What is a side effect?
31
# Opt-in GUI "easter egg": start_gui() is only exposed at package level when
# the PDFLINKCHECK_GUI_EASTEREGG environment variable holds a truthy value.
_gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
_load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
if _load_gui_func:
    try:
        import pyhabitat as _pyhabitat  # pyhabitat is a dependency of this package already
        if _pyhabitat.tkinter_is_available():
            from pdflinkcheck.gui import start_gui
    except ImportError:
        # The GUI is optional: report the miss and keep the import working.
        print("start_gui() not imported")

# Breadcrumbs, for stumbling upon: records whether the flag was enabled
# (not whether start_gui was successfully imported).
__pdflinkcheck_gui_easteregg_enabled__ = _load_gui_func

# Define __all__ such that the library functions are self documenting.
__all__ = [
    "run_report",
    "run_analysis",
    "extract_links_pymupdf",
    "extract_toc_pymupdf",
    "extract_links_pypdf",
    "extract_toc_pypdf",
]
# Only advertise start_gui when the name was actually bound above. With the
# flag set but tkinter unavailable (or the import failing), listing it would
# make `from pdflinkcheck import *` raise AttributeError.
if "start_gui" in globals():
    __all__.append("start_gui")

# Cleanup: remove the private helpers so they do not show up in dir().
del _os
del _gui_easteregg_env_flag
del _load_gui_func
66
+
67
+ # Keep 'io' out of dir(): it likely shows up here because a module imported above imports it.
68
+ #if "io" in locals():
69
+ # del io
@@ -0,0 +1,338 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+ from typing import Dict, Any, Optional, List
5
+
6
+ logging.getLogger("fitz").setLevel(logging.ERROR)
7
+
8
+ try:
9
+ import fitz # PyMuPDF
10
+ except ImportError:
11
+ fitz = None
12
+
13
+ from pdflinkcheck.report import run_report
14
+ #from pdflinkcheck.validate import run_validation
15
+
16
+ """
17
+ Inspect target PDF for both URI links and for GoTo links.
18
+ """
19
+
20
+ # Helper function: Prioritize 'from'
21
def get_link_rect(link_dict):
    """
    Return a link's bounding box as a plain coordinate tuple.

    Args:
        link_dict: A single link dictionary (in this module, an item of
            `page.get_links()`). Its 'from' entry is expected to expose
            x0, y0, x1 and y1 attributes.

    Returns:
        The tuple (x0, y0, x1, y1) read from the 'from' entry, or None when
        that entry is absent or falsy.
    """
    source_rect = link_dict.get('from')

    # Guard clause: nothing usable under 'from' means no bounding box.
    if not source_rect:
        return None

    left, top = source_rect.x0, source_rect.y0
    right, bottom = source_rect.x1, source_rect.y1
    return (left, top, right, bottom)
45
+
46
def get_anchor_text(page, link_rect):
    """
    Collect the words on `page` that touch a padded version of `link_rect`.

    Args:
        page: Page object providing `get_text("words")`; each returned entry
            is indexed as (x0, y0, x1, y1, text, ...).
        link_rect: Coordinates (x0, y0, x1, y1) of the link, or a falsy value.

    Returns:
        The matching words joined by single spaces, or one of the placeholder
        strings "N/A: Missing Rect", "N/A: Rect Error" or
        "N/A: No Visible Text".
    """
    if not link_rect:
        return "N/A: Missing Rect"

    try:
        base = fitz.Rect(link_rect)
        if base.is_empty:
            return "N/A: Rect Error"

        # Asymmetric padding: wider horizontally (kerning / wide glyphs),
        # tighter vertically so neighbouring lines are not pulled in.
        pad_x, pad_y = 10, 3
        hit_zone = fitz.Rect(
            base.x0 - pad_x,
            base.y0 - pad_y,
            base.x1 + pad_x,
            base.y1 + pad_y,
        )

        # Keep the text (index 4) of every word whose box meets the hit zone.
        matched = [
            entry[4]
            for entry in page.get_text("words")
            if fitz.Rect(entry[:4]).intersects(hit_zone)
        ]

        joined = " ".join(matched).strip()
        if not joined:
            return "N/A: No Visible Text"
        return joined

    except Exception:
        # Best effort: any failure while building rects or reading text
        # is reported as a rect error placeholder.
        return "N/A: Rect Error"
83
+
84
def get_anchor_text_stable(page, link_rect):
    """
    Read the text inside a link's bounding box, grown by one unit per side.

    Args:
        page: Page object providing `get_textbox(rect)`.
        link_rect: Coordinates (x0, y0, x1, y1) of the link, or a falsy value.

    Returns:
        The extracted text with all whitespace runs collapsed to single
        spaces, or a placeholder string: "N/A: Missing Rect",
        "N/A: Rect Error (Zero/Negative Dimension)", "N/A: No Visible Text"
        or "N/A: Rect Error".
    """
    if not link_rect:
        return "N/A: Missing Rect"

    try:
        box = fitz.Rect(link_rect)

        # Reject degenerate boxes explicitly rather than trusting normalization.
        degenerate = box.is_empty or box.width <= 0 or box.height <= 0
        if degenerate:
            return "N/A: Rect Error (Zero/Negative Dimension)"

        # Grow the box by one unit on every side so edge glyphs are included.
        margin = 1
        grown = fitz.Rect(
            box.x0 - margin,
            box.y0 - margin,
            box.x1 + margin,
            box.y1 + margin,
        )

        # split() with no argument drops every kind of whitespace run.
        tokens = page.get_textbox(grown).split()
        return " ".join(tokens) if tokens else "N/A: No Visible Text"

    except Exception:
        # Any unexpected failure in conversion or retrieval maps to the
        # generic placeholder.
        return "N/A: Rect Error"
134
+
135
def analyze_toc_fitz(doc):
    """
    Convert the document outline returned by `doc.get_toc()` into dictionaries.

    Args:
        doc: An open document object exposing `get_toc()`, whose entries
            unpack into (level, title, page number).

    Returns:
        A list with one dictionary per outline entry, holding 'level',
        'title' and 'target_page'. The page number is passed through
        unchanged (the outline already reports it 1-based).
    """
    return [
        {
            'level': depth,
            'title': heading,
            'target_page': page_no,
        }
        for depth, heading, page_no in doc.get_toc()
    ]
159
+
160
+
161
+ # 2. Updated Main Inspection Function to Include Text Extraction
162
+ #def inspect_pdf_hyperlinks_fitz(pdf_path):
163
def extract_toc_pymupdf(pdf_path):
    """
    Open a PDF and extract its structural table of contents (bookmarks).

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries representing the structural TOC/bookmarks,
        as produced by `analyze_toc_fitz`. If the document cannot be opened
        or parsed, the error is printed to stderr and an empty list is
        returned.
    """
    # Bound up front so the error path returns [] instead of raising
    # UnboundLocalError at the return statement.
    structural_toc = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        structural_toc = analyze_toc_fitz(doc)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Release the document handle on success and failure alike.
        if doc is not None:
            doc.close()
    return structural_toc
179
+
180
+
181
def serialize_fitz_object(obj):
    """
    Turn a geometry-like object into a JSON-friendly value.

    Detection is by attribute (duck typing), so nothing is imported here:
      * x / y without x0          -> (x, y)                 (point-like)
      * x0 / y0                   -> (x0, y0, x1, y1)       (rect-like)
      * a / b / c                 -> (a, b, c, d, e, f)     (matrix-like)
      * str, int, float, bool, list, tuple, dict are returned unchanged
      * anything else             -> str(obj)

    This exists to avoid errors such as "Object of type Point is not JSON
    serializable" when report data is exported.
    """
    if obj is None:
        return None

    if hasattr(obj, 'x0'):
        # Rect-like objects take priority over the point check.
        if hasattr(obj, 'y0'):
            return (obj.x0, obj.y0, obj.x1, obj.y1)
    elif hasattr(obj, 'x') and hasattr(obj, 'y'):
        return (obj.x, obj.y)

    if all(hasattr(obj, field) for field in ('a', 'b', 'c')):
        return (obj.a, obj.b, obj.c, obj.d, obj.e, obj.f)

    already_plain = isinstance(obj, (str, int, float, bool, list, tuple, dict))
    if already_plain:
        return obj

    # Last resort for anything unrecognised: its string representation.
    return str(obj)
207
+
208
+
209
def extract_links_pymupdf(pdf_path):
    """
    Open a PDF and collect every link annotation on every page.

    Each link is classified from its 'kind' entry as External (URI),
    Internal (GoTo/Dest), Remote (GoToR), Internal (Resolved Action) or
    Other Action, and its anchor text is looked up with `get_anchor_text`.

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries, one per link. Every entry carries 'page'
        (1-based page the link sits on), 'rect', 'link_text', 'xref', 'type'
        and 'target'; the remaining keys depend on the link type. If an
        error occurs, it is printed to stderr and the links gathered up to
        that point are returned.
    """
    links_data = []
    doc = None
    try:
        doc = fitz.open(pdf_path)

        for page_num in range(doc.page_count):
            # Load the page once; it serves every link found on it.
            page = doc.load_page(page_num)

            for link in page.get_links():
                # Observed key sets on link dictionaries:
                #   ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
                #   ['kind', 'xref', 'from', 'uri', 'id']
                #   ['kind', 'xref', 'from', 'page', 'view', 'id']
                link_rect = get_link_rect(link)
                kind = link.get('kind')

                link_dict = {
                    'page': page_num + 1,  # page_num is 0-based; report 1-based
                    'rect': link_rect,
                    'link_text': get_anchor_text(page, link_rect),
                    'xref': link.get("xref"),
                }

                # 'to' may hold a geometry object; make it serializable.
                destination_view = serialize_fitz_object(link.get('to'))

                # URI links are not expected to carry a 'page' entry.
                # The destination page is treated as 0-based, like page_num,
                # so 1 is added for reporting.
                raw_target_page = link.get('page')
                target_page_num_reported = "N/A"
                if raw_target_page is not None:
                    target_page_num_reported = int(raw_target_page) + 1

                if kind == fitz.LINK_URI:
                    link_dict.update({
                        'type': 'External (URI)',
                        'url': link.get('uri'),
                        'target': link.get('uri', 'URI (Unknown Target)'),
                    })

                elif kind == fitz.LINK_GOTO:
                    link_dict.update({
                        'type': 'Internal (GoTo/Dest)',
                        'destination_page': target_page_num_reported,
                        'destination_view': destination_view,
                        'target': f"Page {target_page_num_reported}",
                    })

                elif kind == fitz.LINK_GOTOR:
                    remote_file = link.get('file')
                    link_dict.update({
                        'type': 'Remote (GoToR)',
                        'remote_file': remote_file,
                        'destination': destination_view,
                        # Same 'target' format as the pypdf analyzer, so every
                        # entry exposes a 'target' key.
                        'target': f"File: {remote_file}",
                    })

                elif raw_target_page is not None:
                    # Some other kind that nevertheless resolved to a page.
                    link_dict.update({
                        'type': 'Internal (Resolved Action)',
                        'destination_page': target_page_num_reported,
                        'destination_view': destination_view,
                        'source_kind': kind,
                        'target': f"Page {target_page_num_reported}",
                    })

                else:
                    # NOTE(review): 'url', 'remote_file' and 'target' do not
                    # appear in the observed key sets above, so this may
                    # always yield None - confirm the intended source keys.
                    link_dict.update({
                        'type': 'Other Action',
                        'action_kind': kind,
                        'target': (
                            link.get('url')
                            or link.get('remote_file')
                            or link.get('target')
                        ),
                    })

                links_data.append(link_dict)

    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Close the document on the error path too, not only on success.
        if doc is not None:
            doc.close()
    return links_data
326
+
327
+
328
def call_stable():
    """
    Command-line entry point for this analyzer module.

    Delegates to `run_report`, selecting the "pymupdf" PDF library; no other
    arguments are supplied from here.
    """
    run_report(pdf_library="pymupdf")


if __name__ == "__main__":
    call_stable()
@@ -0,0 +1,184 @@
1
+ # src/pdflinkcheck/analyze_pypdf.py
2
+ import sys
3
+ from pathlib import Path
4
+ import logging
5
+ from typing import Dict, Any, Optional, List
6
+
7
+ from pypdf import PdfReader
8
+ from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
9
+
10
+
11
+ from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
12
+ from pdflinkcheck.report import run_report
13
+ #from pdflinkcheck.validate import run_validation
14
+
15
+ """
16
+ Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
17
+ """
18
+
19
def get_anchor_text_pypdf(page, rect) -> str:
    """
    Gather the text fragments whose insertion point falls near a link's box.

    The page's `extract_text` is driven with a visitor callback; a fragment
    is kept when the (x, y) taken from its text matrix (tm[4], tm[5]) lies
    inside `rect` widened by 10 units on every side.

    Args:
        page: Page object providing `extract_text(visitor_text=...)`.
        rect: Indexable [x0, y0, x1, y1] in either corner order, or falsy.

    Returns:
        The kept fragments concatenated with whitespace collapsed,
        "Graphic/Empty Link" when nothing was kept, or "N/A: Missing Rect"
        when `rect` is falsy.
    """
    if not rect:
        return "N/A: Missing Rect"

    # Normalise the corner order so lo_* <= hi_* regardless of input.
    lo_x, hi_x = min(rect[0], rect[2]), max(rect[0], rect[2])
    lo_y, hi_y = min(rect[1], rect[3]), max(rect[1], rect[3])

    # Generous slack to catch descenders, ascenders, kerning and small
    # misalignments between the annotation box and the glyph origins.
    slack = 10
    collected = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        pos_x, pos_y = tm[4], tm[5]
        if not ((lo_x - slack) <= pos_x <= (hi_x + slack)):
            return
        if not ((lo_y - slack) <= pos_y <= (hi_y + slack)):
            return
        # Whitespace-only fragments carry no anchor text.
        if text.strip():
            collected.append(text)

    page.extract_text(visitor_text=visitor_body)

    collapsed = " ".join("".join(collected).split()).strip()
    return collapsed or "Graphic/Empty Link"
53
+
54
def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
    """
    Resolve a link destination to a 1-based page number string.

    Args:
        reader: The PdfReader the destination belongs to; used to resolve
            Destination objects.
        dest: A Destination, an IndirectObject referencing a page, or an
            ArrayObject whose first element is such a reference.
        obj_id_to_page: Mapping of page object id (idnum) to 1-based page
            number.

    Returns:
        The page number as a string, "Unknown" when the destination cannot
        be mapped to a page, or "Error Resolving" if resolution raised.
    """
    try:
        if isinstance(dest, Destination):
            # Ask the reader for the page index, the same way
            # extract_toc_pypdf does, rather than reading an attribute off
            # the Destination object itself.
            page_index = reader.get_destination_page_number(dest)
            if page_index is None or page_index < 0:
                return "Unknown"
            return str(page_index + 1)

        # An explicit destination array starts with the page reference.
        page_ref = dest
        if isinstance(dest, ArrayObject) and len(dest) > 0:
            page_ref = dest[0]

        if isinstance(page_ref, IndirectObject):
            return str(obj_id_to_page.get(page_ref.idnum, "Unknown"))

        return "Unknown"
    except Exception:
        # Best effort: a malformed destination must not abort the whole scan.
        return "Error Resolving"
72
+
73
def extract_links_pypdf(pdf_path):
    """
    Extract link annotations from a PDF using pure-Python pypdf.

    Intended to work where PyMuPDF is unavailable (e.g. Termux) and to
    follow the reporting schema of the PyMuPDF analyzer.

    Args:
        pdf_path: Path of the PDF; passed to `PdfReader`.

    Returns:
        A list of dictionaries, one per "/Link" annotation, each with
        'page' (1-based), 'rect', 'link_text', 'type' and 'target', plus
        type-specific keys ('url', 'destination_page' or 'remote_file').
        Links that match no known action keep the defaults
        'Other Action' / 'Unknown'.
    """
    reader = PdfReader(pdf_path)

    # Pre-map page object ids to 1-based page numbers so internal
    # destinations can be resolved with a dictionary lookup.
    obj_id_to_page = {
        page.indirect_reference.idnum: index + 1
        for index, page in enumerate(reader.pages)
    }

    all_links = []

    for page_num, page in enumerate(reader.pages, start=1):
        if "/Annots" not in page:
            continue

        for annot in page["/Annots"]:
            obj = annot.get_object()
            if obj.get("/Subtype") != "/Link":
                continue

            rect = obj.get("/Rect")
            link_dict = {
                'page': page_num,
                'rect': list(rect) if rect else None,
                'link_text': get_anchor_text_pypdf(page, rect),
                'type': 'Other Action',
                'target': 'Unknown'
            }

            # Look the action dictionary up once; None when the link has none.
            action = obj["/A"] if "/A" in obj else None

            if action is not None and "/URI" in action:
                # External link.
                uri = action["/URI"]
                link_dict.update({
                    'type': 'External (URI)',
                    'url': uri,
                    'target': uri
                })

            elif action is not None and action.get("/S") == "/GoToR":
                # Remote GoTo. Checked before the internal case because a
                # GoToR action also carries "/D" and would otherwise be
                # reported as an internal link.
                remote_file = action.get("/F")
                link_dict.update({
                    'type': 'Remote (GoToR)',
                    'remote_file': str(remote_file),
                    'target': f"File: {remote_file}"
                })

            elif "/Dest" in obj or (action is not None and "/D" in action):
                # Internal link: a direct /Dest or a GoTo action's /D.
                dest = obj.get("/Dest")
                if not dest and action is not None:
                    dest = action.get("/D")
                target_page = resolve_pypdf_destination(reader, dest, obj_id_to_page)
                link_dict.update({
                    'type': 'Internal (GoTo/Dest)',
                    'destination_page': target_page,
                    'target': f"Page {target_page}"
                })

            all_links.append(link_dict)

    return all_links
140
+
141
+
142
def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
    """
    Extract the document outline (bookmarks) as a flat list using pypdf.

    Args:
        pdf_path: Path of the PDF; passed to `PdfReader`.

    Returns:
        A list of dictionaries with 'level' (1 for top-level entries,
        increasing with nesting), 'title' and 'target_page' (1-based page
        number, or "N/A" when it cannot be resolved). On any error the
        message is printed to stderr and an empty list is returned.
    """
    try:
        reader = PdfReader(pdf_path)
        # Note: outline is a property, not a method.
        toc_tree = reader.outline
        toc_data = []

        def flatten_outline(outline_items, level=1):
            for item in outline_items:
                if isinstance(item, Destination):
                    # Resolve the page through the reader; reading it off
                    # the Destination object raises an attribute error.
                    try:
                        page_num = reader.get_destination_page_number(item) + 1
                    except Exception:
                        # Unresolvable target: keep the entry, flag the page.
                        page_num = "N/A"

                    toc_data.append({
                        "level": level,
                        "title": item.title,
                        "target_page": page_num
                    })
                elif isinstance(item, list):
                    # pypdf nests children in a list immediately following the parent
                    flatten_outline(item, level + 1)

        flatten_outline(toc_tree)
        return toc_data
    except Exception as e:
        print(f"TOC error: {e}", file=sys.stderr)
        return []
173
+
174
def call_stable():
    """
    Command-line entry point for this analyzer module.

    Delegates to `run_report`, selecting the "pypdf" PDF library; no other
    arguments are supplied from here.
    """
    run_report(pdf_library="pypdf")


if __name__ == "__main__":
    call_stable()