pdflinkcheck 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,330 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+ from typing import Dict, Any
5
+ # Raise the "fitz" logger threshold to ERROR to quiet low-level PyMuPDF messages
6
+ logging.getLogger("fitz").setLevel(logging.ERROR)
7
+ import fitz # PyMuPDF
8
+
9
+ from pdflinkcheck.remnants import find_link_remnants
10
+
11
+ """
12
+ Inspect target PDF for both URI links and for GoTo links.
13
+ """
14
+
15
+
16
+ # Helper function: Prioritize 'from'
17
def get_link_rect(link_dict):
    """
    Return the bounding box of a link as a plain coordinate tuple.

    Args:
        link_dict: A link dictionary; its 'from' entry is expected to be an
            object exposing ``x0``, ``y0``, ``x1`` and ``y1`` attributes.

    Returns:
        ``(x0, y0, x1, y1)``, or ``None`` when the 'from' key is absent or None.
    """
    rect_obj = link_dict.get('from')

    # Compare against None explicitly: a rectangle object may define its own
    # truthiness (an all-zero rect can be falsy), and such a rect is
    # "degenerate", not "missing" - downstream code reports those differently.
    if rect_obj is None:
        return None

    return (rect_obj.x0, rect_obj.y0, rect_obj.x1, rect_obj.y1)
32
+
33
def get_pdf_file():
    """
    Interactively ask the user for the path of the PDF to analyze.

    Pressing Enter without typing anything selects the example file name.
    The process exits with status 1 when the chosen path is not an existing
    file.

    Returns:
        The selected path as a string.
    """
    example_path = "TE Maxson WWTF O&M Manual.pdf"
    print(f"example path = {example_path}")
    pdf_file = input("Paste path to PDF file (or press Enter to accept example): ")
    if not pdf_file:
        pdf_file = example_path
    # The check has to be *called*: a bare `Path(...).exists` is a bound
    # method object, which is always truthy, so the error branch could never
    # run. `is_file()` additionally rejects directories.
    if not Path(pdf_file).is_file():
        print("File not found!")
        sys.exit(1)

    return pdf_file
46
+
47
+
48
def get_anchor_text(page, link_rect):
    """
    Return the visible text that lies under a link's bounding box.

    The box is grown by one unit on every side before the text is read so
    that characters touching the border are still captured. A descriptive
    "N/A: ..." placeholder is returned whenever no usable text is available.
    """
    if not link_rect:
        return "N/A: Missing Rect"

    try:
        box = fitz.Rect(link_rect)

        # Reject boxes without a positive area before querying the page.
        degenerate = box.is_empty or box.width <= 0 or box.height <= 0
        if degenerate:
            return "N/A: Rect Error (Zero/Negative Dimension)"

        padded_box = fitz.Rect(box.x0 - 1, box.y0 - 1, box.x1 + 1, box.y1 + 1)

        # Splitting on whitespace and re-joining collapses runs of blanks,
        # tabs and newlines into single spaces.
        words = page.get_textbox(padded_box).split()
        return " ".join(words) if words else "N/A: No Visible Text"

    except Exception:
        # Any failure while building the rect or reading the page text.
        return "N/A: Rect Error"
89
+
90
+
91
def analyze_toc_fitz(doc):
    """
    Convert the document outline (bookmarks) into a list of dictionaries.

    Each entry returned by ``doc.get_toc()`` is unpacked as
    ``(level, title, page)`` and becomes a dict with the keys 'level',
    'title' and 'target_page'. The page number is passed through unchanged.
    """
    return [
        {'level': depth, 'title': heading, 'target_page': page_no}
        for depth, heading, page_no in doc.get_toc()
    ]
107
+
108
+
109
+ # 2. Updated Main Inspection Function to Include Text Extraction
110
def _describe_link_target(link):
    """Return the type-specific report fields for one PyMuPDF link dictionary."""
    kind = link.get('kind')
    dest_page = link.get('page')
    # Destination pages are 0-based in the link dict; the report is 1-based.
    # A missing page stays None instead of raising and aborting the scan.
    dest_page_1based = int(dest_page) + 1 if dest_page is not None else None

    if kind == fitz.LINK_URI:
        return {
            'type': 'External (URI)',
            'url': link.get('uri'),
            'target': link.get('uri', 'URI (Unknown Target)'),
        }

    if kind == fitz.LINK_GOTO:
        if dest_page_1based is None:
            target = "Page (Unknown)"
        else:
            target = f"Page {dest_page_1based}"
        return {
            'type': 'Internal (GoTo/Dest)',
            'destination_page': dest_page_1based,
            'destination_view': link.get('to'),
            'target': target,
        }

    if kind == fitz.LINK_GOTOR:
        return {
            'type': 'Remote (GoToR)',
            'remote_file': link.get('file'),
            'destination': link.get('to'),
        }

    if dest_page is not None:
        # Any other link kind that still carries a resolved destination page.
        return {
            'type': 'Internal (Resolved Action)',
            'destination_page': dest_page_1based,
            'destination_view': link.get('to'),
            'source_kind': kind,
        }

    return {
        'type': 'Other Action',
        'action_kind': kind,
        'target': link.get('url') or link.get('remote_file') or link.get('target'),
    }


def inspect_pdf_hyperlinks_fitz(pdf_path):
    """
    Collect every link annotation and the structural TOC of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(links_data, structural_toc)`` tuple. ``links_data`` holds one
        dict per link with the common keys 'page' (1-based source page),
        'rect', 'link_text' and 'xref', plus type-specific keys (always
        including 'type'). ``structural_toc`` is the output of
        ``analyze_toc_fitz``. When an error occurs it is reported on stderr
        and whatever was collected up to that point is returned; both values
        are always defined.
    """
    links_data = []
    # Initialised up front so the return statement is valid even when the
    # document cannot be opened.
    structural_toc = []
    try:
        # The context manager closes the document on errors as well.
        with fitz.open(pdf_path) as doc:
            structural_toc = analyze_toc_fitz(doc)

            for page_num in range(doc.page_count):
                # Load each page once and reuse it for all of its links.
                page = doc.load_page(page_num)

                for link in page.get_links():
                    link_rect = get_link_rect(link)
                    link_dict = {
                        'page': int(page_num) + 1,
                        'rect': link_rect,
                        'link_text': get_anchor_text(page, link_rect),
                        'xref': link.get("xref"),
                    }
                    link_dict.update(_describe_link_target(link))
                    links_data.append(link_dict)
    except Exception as e:
        # Best-effort scan: report the problem and return partial results.
        print(f"An error occurred: {e}", file=sys.stderr)
    return links_data, structural_toc
200
+
201
def print_structural_toc(structural_toc):
    """
    Print the structural TOC as an indented outline.

    Every entry is indented by four spaces per level below the first, and
    page numbers are right-aligned to the width of the largest page number.
    A short notice is printed instead when the TOC is empty.
    """
    rule = "-" * 50
    print("\n## 📚 Structural Table of Contents (PDF Bookmarks/Outline)")
    print(rule)

    if not structural_toc:
        print("No structural TOC (bookmarks/outline) found.")
        return

    # Column width is driven by the widest (largest) target page number.
    pages = [entry['target_page'] for entry in structural_toc]
    width = len(str(max(pages)))

    for entry, page in zip(structural_toc, pages):
        pad = "    " * (entry['level'] - 1)
        print(f"{pad}{entry['title']} . . . page {str(page).rjust(width)}")

    print(rule)
224
+
225
def run_analysis(pdf_path: str, check_remnants: bool, max_links: int) -> Dict[str, Any]:
    """
    Analyze a PDF with PyMuPDF and print a report of links, remnants and TOC.

    Args:
        pdf_path: Path of the PDF file to analyze.
        check_remnants: When true, also look for unlinked URL/e-mail text via
            ``find_link_remnants``.
        max_links: Maximum number of rows printed in each report section.
            A value <= 0 prints every row, in all sections including remnants.

    Returns:
        An empty dict when no links, remnants or TOC entries were found;
        otherwise a dict with the keys 'external_links', 'internal_links',
        'remnants' and 'toc'.
    """
    pdf_name = Path(pdf_path).name
    print(f"Running PyMuPDF analysis on {pdf_name}...")

    # 1. Extract all active links and the TOC
    extracted_links, structural_toc = inspect_pdf_hyperlinks_fitz(pdf_path)
    toc_entry_count = len(structural_toc)

    # 2. Find link remnants (active links are passed so they can be excluded)
    remnants = []
    if check_remnants:
        remnants = find_link_remnants(pdf_path, extracted_links)

    if not extracted_links and not remnants and not structural_toc:
        print(f"\nNo hyperlinks, remnants, or structural TOC found in {pdf_name}.")
        return {}

    # 3. Split the links by their 'type' key
    known_types = ('External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)')
    uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
    goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
    resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
    other_links = [link for link in extracted_links if link['type'] not in known_types]

    all_internal = goto_links + resolved_action_links
    total_internal_links = len(all_internal)
    uri_and_other = uri_links + other_links

    # --- ANALYSIS SUMMARY ---
    print(f"\n--- Link Analysis Results for {pdf_name} ---")
    print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
    print(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
    print(f"Total **potential missing links** found: {len(remnants)}")
    print("-" * 50)

    # None as a slice bound means "no limit"; every section below uses it so
    # that max_links <= 0 consistently shows all rows.
    limit = max_links if max_links > 0 else None

    # --- Section 1: ACTIVE URI LINKS ---
    print(f"\n## 🔗 Active URI Links (External & Other) - {len(uri_and_other)} found")
    print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
    print("-" * 75)

    if uri_and_other:
        for i, link in enumerate(uri_and_other[:limit], 1):
            target = link.get('url') or link.get('remote_file') or link.get('target')
            link_text = link.get('link_text', 'N/A')
            print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
        if limit is not None and len(uri_and_other) > limit:
            print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
    else:
        print("  No external or 'Other' links found.")

    # --- Section 2: ACTIVE INTERNAL JUMPS ---
    print(f"\n## 🖱️ Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
    print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
    print("-" * 75)

    if all_internal:
        for i, link in enumerate(all_internal[:limit], 1):
            link_text = link.get('link_text', 'N/A')
            print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
        if limit is not None and len(all_internal) > limit:
            print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
    else:
        print("  No internal GoTo or Resolved Action links found.")

    # --- Section 3: REMNANTS ---
    print("\n" + "=" * 70)
    print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
    print("=" * 70)

    if remnants:
        print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
        print("-" * 75)
        # Slice with `limit`, not raw max_links: `remnants[:0]` would hide
        # every remnant exactly when the caller asked to show all of them.
        for i, remnant in enumerate(remnants[:limit], 1):
            print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
        if limit is not None and len(remnants) > limit:
            print(f"... and {len(remnants) - limit} more remnants (use --max-links to see all).")
    else:
        print("  No URI or Email remnants found that are not already active links.")

    # --- Section 4: TOC ---
    print_structural_toc(structural_toc)

    # Return the collected data for potential future JSON/other output
    return {
        "external_links": uri_links,
        "internal_links": all_internal,
        "remnants": remnants,
        "toc": structural_toc
    }
323
+
324
def call_stable():
    """
    Interactive entry point used when the module is run as a script.

    Prompts for a PDF path via ``get_pdf_file`` and runs the full analysis
    with remnant checking enabled and at most 50 rows per report section.
    """
    print("Begin analysis...")
    # run_analysis has three required parameters; calling it without
    # arguments raised TypeError before any work was done.
    pdf_file = get_pdf_file()
    run_analysis(pdf_path=pdf_file, check_remnants=True, max_links=50)
    print("Analysis complete.")
328
+
329
+ if __name__ == "__main__":
330
+ call_stable()
pdflinkcheck/cli.py ADDED
@@ -0,0 +1,63 @@
1
+ # src/pdflinkcheck/cli.py
2
+ import typer
3
+ from rich.console import Console
4
+ from pathlib import Path
5
+ from pdflinkcheck.analyze import run_analysis # Assuming core logic moves here
6
+ from typing import Dict
7
# Shared rich console instance for formatted terminal output.
console = Console()

# Root Typer application object; the commands below register themselves on
# it through the @app.command decorator. Shell completion is switched off.
app = typer.Typer(
    add_completion=False,
    help="A command-line tool for comprehensive PDF link analysis and reporting.",
    name="pdflinkcheck",
)
14
+
15
@app.command(name="analyze")
def analyze_pdf(
    pdf_path: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        resolve_path=True,
        help="The path to the PDF file to analyze.",
    ),
    check_remnants: bool = typer.Option(
        True,
        "--check-remnants/--no-check-remnants",
        help="Toggle checking for unlinked URLs/Emails in the text layer.",
    ),
    max_links: int = typer.Option(
        50,
        "--max-links",
        min=0,
        help="Maximum number of links/remnants to display in the report. Use 0 to show all.",
    ),
):
    """
    Analyzes the specified PDF file for all internal, external, and unlinked URI/Email references.
    """
    # The docstring above doubles as the command's help text, so it is kept
    # verbatim. The command only converts the validated Path to a string and
    # delegates analysis and printing to run_analysis.
    path_text = str(pdf_path)
    run_analysis(pdf_path=path_text, check_remnants=check_remnants, max_links=max_links)
47
+
48
@app.command(name="gui")
def gui():
    """
    Launch tkinter-based GUI.
    """
    try:
        # Import inside the try block: importing the GUI module imports
        # tkinter, so a missing tkinter fails right here and must reach the
        # friendly message below instead of surfacing as a raw traceback.
        from pdflinkcheck.gui import start_gui
        start_gui()
    except Exception as e:
        typer.echo("GUI failed to launch")
        typer.echo("Ensure tkinter is available, especially if using WSLg.")
        typer.echo(f"Error: {e}")
        # Signal the failure to the calling shell instead of exiting with 0.
        raise typer.Exit(code=1)
60
+
61
+ # Placeholder for running the app
62
+ if __name__ == "__main__":
63
+ app()
pdflinkcheck/gui.py ADDED
@@ -0,0 +1,165 @@
1
+ # src/pdflinkcheck/gui.py
2
+ import tkinter as tk
3
+ from tkinter import filedialog, ttk
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ # Import the core analysis function
8
+ from pdflinkcheck.analyze import run_analysis
9
+
10
class RedirectText:
    """A minimal file-like object that forwards written text to a Tkinter Text widget."""

    def __init__(self, text_widget):
        # Widget that receives everything passed to write().
        self.text_widget = text_widget

    def write(self, string):
        """
        Append ``string`` to the widget and keep the newest text visible.

        Returns:
            The number of characters written, as text streams are expected
            to do, so callers that check the result of ``write`` keep working.
        """
        self.text_widget.insert(tk.END, string)
        self.text_widget.see(tk.END)  # Scroll to the end
        self.text_widget.update_idletasks()  # Refresh GUI
        return len(string)

    def flush(self):
        """Required for file-like objects, but does nothing here."""
        pass
24
+
25
class PDFLinkCheckerApp(tk.Tk):
    """Main window of the GUI: collects the analysis options and shows the report."""

    def __init__(self):
        super().__init__()
        self.title("PDF Link Checker")
        self.geometry("800x600")

        # Style for the application
        style = ttk.Style(self)
        style.theme_use('clam')

        # Tk variables backing the input widgets.
        self.pdf_path = tk.StringVar(value="")
        self.check_remnants_var = tk.BooleanVar(value=True)
        self.max_links_var = tk.StringVar(value="50")
        self.show_all_links_var = tk.BooleanVar(value=False)

        self._create_widgets()

    def _create_widgets(self):
        """Build the control frame (top) and the report output frame (bottom)."""
        # --- Control Frame (Top) ---
        control_frame = ttk.Frame(self, padding="10")
        control_frame.pack(fill='x')

        # Row 0: file selection
        ttk.Label(control_frame, text="PDF Path:").grid(row=0, column=0, padx=5, pady=5, sticky='w')
        ttk.Entry(control_frame, textvariable=self.pdf_path, width=60).grid(row=0, column=1, padx=5, pady=5, sticky='ew')
        ttk.Button(control_frame, text="Browse...", command=self._select_pdf).grid(row=0, column=2, padx=5, pady=5)

        # Row 1: remnant toggle and max-links entry
        ttk.Checkbutton(
            control_frame,
            text="Check for Remnants (URLs/Emails)",
            variable=self.check_remnants_var
        ).grid(row=1, column=0, padx=5, pady=5, sticky='w')

        ttk.Label(control_frame, text="Max Links to Display:").grid(row=1, column=1, padx=5, pady=5, sticky='e')
        self.max_links_entry = ttk.Entry(control_frame, textvariable=self.max_links_var, width=10)
        self.max_links_entry.grid(row=1, column=2, padx=5, pady=5, sticky='w')

        # Row 2: "show all" toggle, which also disables the max-links entry
        ttk.Checkbutton(
            control_frame,
            text="Show All Links (Override Max)",
            variable=self.show_all_links_var,
            command=self._toggle_max_links_entry
        ).grid(row=2, column=0, padx=5, pady=5, sticky='w')

        # Row 3: run button. It gets its own row; sharing row 2 / column 0
        # with the checkbox above made the two widgets overlap.
        ttk.Button(control_frame, text="▶ Run Analysis", command=self._run_analysis_gui, style='Accent.TButton').grid(row=3, column=0, columnspan=3, pady=10)

        control_frame.grid_columnconfigure(1, weight=1)

        # --- Output Frame (Bottom) ---
        output_frame = ttk.Frame(self, padding="10")
        output_frame.pack(fill='both', expand=True)

        ttk.Label(output_frame, text="Analysis Report Output:").pack(fill='x')

        # Read-only text area for the report; editing is enabled only while a
        # run is writing to it.
        self.output_text = tk.Text(output_frame, wrap=tk.WORD, state=tk.DISABLED, bg='#333333', fg='white', font=('Monospace', 10))

        # Pack the scrollbar before the expanding text widget so it keeps a
        # full-height strip on the right-hand side.
        scrollbar = ttk.Scrollbar(output_frame, command=self.output_text.yview)
        self.output_text['yscrollcommand'] = scrollbar.set
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.output_text.pack(side=tk.LEFT, fill='both', expand=True, padx=5, pady=5)

    def _select_pdf(self):
        """Open a file dialog and store the chosen path, if any."""
        file_path = filedialog.askopenfilename(
            defaultextension=".pdf",
            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
        )
        if file_path:
            self.pdf_path.set(file_path)

    def _toggle_max_links_entry(self):
        """Disables/enables the max_links entry based on show_all_links_var."""
        if self.show_all_links_var.get():
            self.max_links_entry.config(state=tk.DISABLED)
        else:
            self.max_links_entry.config(state=tk.NORMAL)

    def _run_analysis_gui(self):
        """Validate the inputs, run the analysis and show its printed report."""
        pdf_path_str = self.pdf_path.get().strip()
        # An empty string must be rejected explicitly: Path("") refers to the
        # current directory, which exists. is_file() also rejects folders.
        if not pdf_path_str or not Path(pdf_path_str).is_file():
            self._display_error("Error: PDF file not found or path is invalid.")
            return

        if self.show_all_links_var.get():
            # run_analysis treats a max_links value <= 0 as "show all"
            max_links_to_pass = 0
        else:
            try:
                max_links_to_pass = int(self.max_links_var.get())
            except ValueError:
                self._display_error("Error: Max Links must be an integer.")
                return
            if max_links_to_pass <= 0:
                self._display_error("Error: Max Links must be a positive number (or use 'Show All').")
                return

        # 1. Clear previous output and enable editing
        self.output_text.config(state=tk.NORMAL)
        self.output_text.delete('1.0', tk.END)

        # 2. Redirect standard output to the Text widget
        original_stdout = sys.stdout
        sys.stdout = RedirectText(self.output_text)

        try:
            # 3. Call the core logic function
            self.output_text.insert(tk.END, "--- Starting Analysis ---\n")
            run_analysis(
                pdf_path=pdf_path_str,
                check_remnants=self.check_remnants_var.get(),
                max_links=max_links_to_pass
            )
            self.output_text.insert(tk.END, "\n--- Analysis Complete ---\n")

        except Exception as e:
            self._display_error(f"An unexpected error occurred during analysis: {e}")

        finally:
            # 4. Restore standard output and disable editing
            sys.stdout = original_stdout
            self.output_text.config(state=tk.DISABLED)

    def _display_error(self, message):
        """Replace the output area content with a red error message."""
        self.output_text.config(state=tk.NORMAL)
        self.output_text.delete('1.0', tk.END)
        self.output_text.insert(tk.END, f"[ERROR] {message}\n", 'error')
        self.output_text.tag_config('error', foreground='red')
        self.output_text.config(state=tk.DISABLED)
157
+
158
+
159
def start_gui():
    """Create the main application window and run its event loop until it is closed."""
    window = PDFLinkCheckerApp()
    window.mainloop()
163
+
164
+ if __name__ == "__main__":
165
+ start_gui()
@@ -0,0 +1,142 @@
1
+ import re
2
+ import fitz
3
+
4
# Matches URL-like text: either a scheme-prefixed address
# (http, https, mhtml, file or ftp followed by "://") or a bare "www." host.
URI_PATTERN = re.compile(
    r'(?:https?|mhtml|file|ftp):\/\/\S+'
    r'|\bwww\.\S+\b',
    flags=re.IGNORECASE,
)

# Matches e-mail addresses of the form local-part@domain.tld.
EMAIL_PATTERN = re.compile(
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
    flags=re.IGNORECASE,
)
15
+
16
def clean_ex_rect(ex_rect_tuple):
    """
    Validate a rectangle given as a 4-number sequence or as a string.

    A list/tuple of exactly four int/float values is returned unchanged.
    A string is split on commas and whitespace and returned as a list of
    four floats. Every other input - wrong length, non-numeric content or an
    unsupported type such as None - yields None.
    """
    if isinstance(ex_rect_tuple, (list, tuple)):
        is_numeric_quad = len(ex_rect_tuple) == 4 and all(
            isinstance(value, (int, float)) for value in ex_rect_tuple
        )
        return ex_rect_tuple if is_numeric_quad else None

    if not isinstance(ex_rect_tuple, str):
        return None

    # Empty tokens appear when the string starts with a delimiter.
    tokens = [tok for tok in re.split(r'[,\s]+', ex_rect_tuple.strip()) if tok.strip()]
    try:
        numbers = [float(tok) for tok in tokens]
    except ValueError:
        return None  # at least one token is not a number

    return numbers if len(numbers) == 4 else None
44
+
45
def _existing_rects_by_page(existing_links):
    """Group the bounding boxes of already-active links by 1-based source page."""
    rects_by_page = {}
    for link in existing_links or []:
        raw_rect = link.get("rect")
        page_key = link.get("page")
        if raw_rect is None:
            # Fallback for raw PyMuPDF link dicts, which carry a Rect under
            # 'from'. Their 'page' entry is the link *destination*, so the
            # source page is unknown and the rect is filed under None.
            from_rect = link.get("from")
            if from_rect is None:
                continue
            raw_rect = (from_rect.x0, from_rect.y0, from_rect.x1, from_rect.y1)
            page_key = None

        cleaned_coords = clean_ex_rect(raw_rect)
        if cleaned_coords:
            rects_by_page.setdefault(page_key, []).append(fitz.Rect(cleaned_coords))
    return rects_by_page


def find_link_remnants(pdf_path, existing_links):
    """
    Scans the PDF for text that looks like a URI or email but is not a registered link annotation.

    Args:
        pdf_path: Path of the PDF file to scan.
        existing_links: Active link dictionaries. Dicts with a 'rect'
            coordinate tuple and a 1-based 'page' (the shape produced by the
            link analyzer) are excluded on their own page only; dicts that
            only have a 'from' rectangle are excluded on every page.

    Returns:
        A list of dicts with the keys 'page' (1-based), 'type'
        ('URI Remnant' or 'Email Remnant'), 'text' and 'rect'.
    """
    rects_by_page = _existing_rects_by_page(existing_links)
    unscoped_rects = rects_by_page.get(None, [])
    # Order matters: within each text block URI matches are reported before
    # e-mail matches.
    patterns = (("URI Remnant", URI_PATTERN), ("Email Remnant", EMAIL_PATTERN))
    remnants_data = []

    # The context manager closes the document even when scanning fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            link_rects = rects_by_page.get(page_num + 1, []) + unscoped_rects

            for block in page.get_text("blocks"):
                x0, y0, x1, y1, text = block[:5]
                block_rect = fitz.Rect(x0, y0, x1, y1)

                for remnant_type, pattern in patterns:
                    for match in pattern.finditer(text):
                        remnant_text = match.group(0)

                        # Prefer hits inside the block the text came from, so
                        # a second occurrence elsewhere on the page is not
                        # mistaken for this one; fall back to the whole page.
                        hits = (page.search_for(remnant_text, clip=block_rect)
                                or page.search_for(remnant_text))
                        if not hits:
                            continue

                        # A remnant overlapping an active link's box is
                        # already clickable and is not reported.
                        is_active_link = any(
                            ex_rect.intersects(hit)
                            for ex_rect in link_rects
                            for hit in hits
                        )
                        if not is_active_link:
                            remnants_data.append({
                                'page': page_num + 1,
                                'type': remnant_type,
                                'text': remnant_text,
                                'rect': tuple(hits[0])
                            })

    return remnants_data
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdflinkcheck
3
+ Version: 1.1.7
4
+ Summary: A purpose-built PDF link analysis reporting tool.
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: pymupdf>=1.26.6
8
+ Requires-Dist: rich>=14.2.0
9
+ Requires-Dist: typer>=0.20.0
10
+ Requires-Dist: pyhabitat>=1.0.52
11
+ Provides-Extra: dev
12
+ Requires-Dist: ruff>=0.1.13; extra == "dev"
13
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
14
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
15
+
16
+ # pdflinkcheck
17
+ A purpose-built tool for comprehensive analysis of hyperlinks and link remnants within PDF documents, primarily using the PyMuPDF library.
18
+ Use the CLI or the GUI.
19
+
20
+ ---
21
+
22
+ ### Graphical User Interface (GUI)
23
+
24
+ The tool can be run using a simple cross-platform graphical interface (Tkinter):
25
+
26
+ ![Screenshot of the pdflinkcheck GUI](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_gui.png)
27
+
28
+ To launch the GUI, use the command: `pdflinkcheck-gui`
29
+
30
+ ---
31
+
32
+ ### ✨ Features
33
+
34
+ * **Active Link Extraction:** Identifies and categorizes all programmed links (External URIs, Internal GoTo/Destinations, Remote Jumps).
35
+ * **Anchor Text Retrieval:** Extracts the visible text corresponding to each link's bounding box.
36
+ * **Remnant Detection:** Scans the document's text layer for unlinked URIs and email addresses that should potentially be converted into active links.
37
+ * **Structural TOC:** Extracts the PDF's internal Table of Contents (bookmarks/outline).
38
+
39
+ ---
40
+
41
+ ### 📥 Installation (Recommended via `pipx`)
42
+
43
+ The recommended way to install `pdflinkcheck` is using `pipx`, which installs Python applications in isolated environments, preventing dependency conflicts.
44
+
45
+ ```bash
46
+ # Ensure you have pipx installed first (if not, run: pip install pipx)
47
+ pipx install pdflinkcheck
48
+ ```
49
+
50
+
51
+ **Note for Developers:** If you prefer a traditional virtual environment or are developing locally, use `pip`:
52
+ ```bash
53
+ # From the root of the project
54
+ pip install .
55
+ ```
56
+
57
+ ---
58
+
59
+ ### 🚀 Usage
60
+
61
+ The main command is `pdflinkcheck analyze`.
62
+
63
+
64
+ ```bash
65
+ # Basic usage: Analyze a PDF and check for remnants (default behavior)
66
+ pdflinkcheck analyze "path/to/my/document.pdf"
67
+ ```
68
+
69
+ #### Command Options
70
+
71
+ |**Option**|**Description**|**Default**|
72
+ |---|---|---|
73
+ |`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
74
+ |`--check-remnants / --no-check-remnants`|Toggle scanning the text layer for unlinked URLs/Emails.|`--check-remnants`|
75
+ |`--max-links INTEGER`|Maximum number of links/remnants to display in the detailed report sections. Use `0` to show all.|`50`|
76
+ |`--help`|Show command help and exit.|N/A|
77
+
78
+ #### Example Run
79
+
80
+ ```bash
81
+ pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
82
+ ```
83
+
84
+ ### Run from source
85
+ ```
86
+ git clone https://github.com/city-of-memphis-wastewater/pdflinkcheck.git
87
+ cd pdflinkcheck
88
+ uv sync
89
+ python src/pdflinkcheck/analyze.py
90
+ ```
91
+
92
+ ---
93
+
94
+ ### ⚠️ Platform Compatibility Note
95
+
96
+ This tool relies on the `PyMuPDF` library, which requires specific native dependencies (like MuPDF) that may not be available on all platforms.
97
+
98
+ **Known Incompatibility:** This tool is **not officially supported** and may fail to run on environments like **Termux (Android)** due to underlying C/C++ library compilation issues with PyMuPDF. It is recommended for use on standard Linux, macOS, or Windows operating systems.
99
+
100
+ ---
101
+
102
+ ### Document Compatibility
103
+
104
+ While `pdflinkcheck` uses the robust PyMuPDF library, not all PDF files can be processed successfully. This tool is designed primarily for digitally generated (vector-based) PDFs.
105
+
106
+ Processing may fail or yield incomplete results for:
107
+ * **Scanned PDFs** (images of text) that lack an accessible text layer.
108
+ * **Encrypted or Password-Protected** documents.
109
+ * **Malformed or non-standard** PDF files.
@@ -0,0 +1,10 @@
1
+ pdflinkcheck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ pdflinkcheck/analyze.py,sha256=wtj1fNvMl5553FYdHmd3K82ve2lHaDW68qBVITig2cQ,12982
3
+ pdflinkcheck/cli.py,sha256=vo_2BF7A4jaR_Qvd4AZ8RIqlwV10Die2RbWc9Er6wQo,1872
4
+ pdflinkcheck/gui.py,sha256=8uzaKqE0aVLzAGIwD52rbEJKfEHdi4R6S8fO8bPs8rI,6432
5
+ pdflinkcheck/remnants.py,sha256=xgunD4hDDT0SqD9SywvPc5DLSLNLA6O0BL0KOuLQwV8,6151
6
+ pdflinkcheck-1.1.7.dist-info/METADATA,sha256=SjgFk5-n8SlurKY0pjwZtTWtXt42vgz0KuYTFY729a4,3725
7
+ pdflinkcheck-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ pdflinkcheck-1.1.7.dist-info/entry_points.txt,sha256=Ql8fOpnnAGZ23DWcq0J97bPBafrP0rl8x9aVpSLh5Cs,100
9
+ pdflinkcheck-1.1.7.dist-info/top_level.txt,sha256=WdBg8l6l3TF1HQDpR_PwSmBCSu5atKWFnPfNbRNwrME,13
10
+ pdflinkcheck-1.1.7.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ pdflinkcheck = pdflinkcheck.cli:app
3
+ pdflinkcheck-gui = pdflinkcheck.gui:start_gui
@@ -0,0 +1 @@
1
+ pdflinkcheck