pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +31 -0
- pdflinkcheck/analyze.py +306 -128
- pdflinkcheck/cli.py +97 -20
- pdflinkcheck/data/LICENSE +680 -0
- pdflinkcheck/gui.py +157 -29
- pdflinkcheck/io.py +106 -0
- pdflinkcheck-1.1.47.dist-info/METADATA +266 -0
- pdflinkcheck-1.1.47.dist-info/RECORD +13 -0
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.1.47.dist-info/licenses/LICENSE +680 -0
- pdflinkcheck-1.1.7.dist-info/METADATA +0 -109
- pdflinkcheck-1.1.7.dist-info/RECORD +0 -10
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/WHEEL +0 -0
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/top_level.txt +0 -0
pdflinkcheck/gui.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
# src/pdflinkcheck/gui.py
|
|
2
2
|
import tkinter as tk
|
|
3
|
-
from tkinter import filedialog, ttk
|
|
3
|
+
from tkinter import filedialog, ttk, messagebox # Added messagebox
|
|
4
4
|
import sys
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Optional # Added Optional
|
|
7
|
+
from importlib.resources import files
|
|
6
8
|
|
|
7
9
|
# Import the core analysis function
|
|
8
10
|
from pdflinkcheck.analyze import run_analysis
|
|
@@ -16,60 +18,140 @@ class RedirectText:
|
|
|
16
18
|
"""Insert the incoming string into the Text widget."""
|
|
17
19
|
self.text_widget.insert(tk.END, string)
|
|
18
20
|
self.text_widget.see(tk.END) # Scroll to the end
|
|
19
|
-
self.text_widget.update_idletasks() # Refresh GUI
|
|
21
|
+
## self.text_widget.update_idletasks() # Refresh GUI << Suppress: The mainloop will handle updates efficiently without forcing them.
|
|
20
22
|
|
|
21
|
-
def flush(self):
|
|
23
|
+
def flush(self, *args):
    """No-op flush so this object can stand in for a file-like stream.

    Any positional arguments are accepted and discarded; nothing is
    buffered here, so there is nothing to flush.
    """
    return None
|
|
24
26
|
|
|
25
27
|
class PDFLinkCheckerApp(tk.Tk):
|
|
26
28
|
def __init__(self):
    """Build the main window: title/geometry, theme, state variables, widgets.

    The order matters: the Tk variables must exist before
    ``_create_widgets`` binds widgets to them, and the two ``_toggle_*``
    calls run last because they configure widgets that
    ``_create_widgets`` creates.
    """
    super().__init__()
    self.title("PDF Link Check")
    self.geometry("800x600")

    # Style for the application
    style = ttk.Style(self)
    style.theme_use('clam')

    # --- 1. Initialize Variables ---
    self.pdf_path = tk.StringVar(value="")
    self.check_remnants_var = tk.BooleanVar(value=True)
    # Kept as a string; _run_analysis_gui parses and validates it as an int.
    self.max_links_var = tk.StringVar(value="50")
    self.show_all_links_var = tk.BooleanVar(value=True)
    self.export_report_format_var = tk.StringVar(value="JSON")
    self.do_export_report_var = tk.BooleanVar(value=True)

    # Formats offered in the export combobox. Only one assignment is kept:
    # the earlier ["JSON", "MD", "TXT"] value was overwritten on the very
    # next line and never took effect.
    self.supported_export_formats = ["JSON"]

    # --- 2. Create Widgets ---
    self._create_widgets()

    # --- 3. Set Initial Dependent Widget States ---
    self._toggle_max_links_entry()
    self._toggle_export_report()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _show_license(self):
    """
    Show the packaged LICENSE text in a modal, read-only, scrollable window.

    The file is read through importlib.resources so it is found even when
    the package is not laid out as plain files on disk. If it cannot be
    read, an error dialog is shown and no license window is opened.
    """
    try:
        # Traversable.read_text() rather than a filesystem path, so this
        # also works for zipped installs.
        license_text = files("pdflinkcheck.data").joinpath("LICENSE").read_text(encoding="utf-8")
    except FileNotFoundError:
        messagebox.showerror(
            "License Error",
            "LICENSE file not found within the installation package (pdflinkcheck.data/LICENSE). Check build process."
        )
        return
    except Exception as e:
        messagebox.showerror("Read Error", f"Failed to read embedded LICENSE file: {e}")
        return

    # --- Separate top-level window for the license text ---
    dialog = tk.Toplevel(self)
    dialog.title("Software License")
    dialog.geometry("600x400")

    # Fill the viewer first, then lock it so the text cannot be edited.
    viewer = tk.Text(dialog, wrap=tk.WORD, font=('Monospace', 10), padx=10, pady=10)
    viewer.insert(tk.END, license_text)
    viewer.config(state=tk.DISABLED)

    # Two-way link between the viewer and its vertical scrollbar.
    y_scroll = ttk.Scrollbar(dialog, command=viewer.yview)
    viewer.configure(yscrollcommand=y_scroll.set)

    # Scrollbar is packed first so it keeps the right edge.
    y_scroll.pack(side=tk.RIGHT, fill=tk.Y)
    viewer.pack(fill='both', expand=True)

    # Modal behaviour: tie to the main window, grab input, block until closed.
    dialog.transient(self)
    dialog.grab_set()
    self.wait_window(dialog)
|
|
41
99
|
|
|
42
100
|
def _create_widgets(self):
|
|
43
101
|
# --- Control Frame (Top) ---
|
|
44
102
|
control_frame = ttk.Frame(self, padding="10")
|
|
45
103
|
control_frame.pack(fill='x')
|
|
46
104
|
|
|
47
|
-
# File Selection
|
|
105
|
+
# Row 0: File Selection
|
|
48
106
|
ttk.Label(control_frame, text="PDF Path:").grid(row=0, column=0, padx=5, pady=5, sticky='w')
|
|
49
107
|
ttk.Entry(control_frame, textvariable=self.pdf_path, width=60).grid(row=0, column=1, padx=5, pady=5, sticky='ew')
|
|
50
108
|
ttk.Button(control_frame, text="Browse...", command=self._select_pdf).grid(row=0, column=2, padx=5, pady=5)
|
|
51
109
|
|
|
52
|
-
#
|
|
110
|
+
# Row 1: Remnants and Max Links Label/Entry
|
|
53
111
|
ttk.Checkbutton(
|
|
54
112
|
control_frame,
|
|
55
113
|
text="Check for Remnants (URLs/Emails)",
|
|
56
114
|
variable=self.check_remnants_var
|
|
57
115
|
).grid(row=1, column=0, padx=5, pady=5, sticky='w')
|
|
58
116
|
|
|
117
|
+
ttk.Label(control_frame, text="Max Links to Display:").grid(row=1, column=1, padx=5, pady=5, sticky='e')
|
|
118
|
+
self.max_links_entry = ttk.Entry(control_frame, textvariable=self.max_links_var, width=10)
|
|
119
|
+
self.max_links_entry.grid(row=1, column=2, padx=5, pady=5, sticky='w')
|
|
120
|
+
|
|
121
|
+
export_group_frame = ttk.Frame(control_frame)
|
|
122
|
+
export_group_frame.grid(row=2, column=0, padx=5, pady=5, sticky='w') # Placed in the original Checkbutton's column
|
|
123
|
+
|
|
124
|
+
ttk.Checkbutton(
|
|
125
|
+
export_group_frame,
|
|
126
|
+
text="Export Report",
|
|
127
|
+
variable=self.do_export_report_var,
|
|
128
|
+
command=self._toggle_export_report
|
|
129
|
+
).pack(side=tk.LEFT, padx=(0, 5)) # Pack Checkbutton to the left with small internal padding
|
|
130
|
+
self.export_report_format = ttk.Combobox(
|
|
131
|
+
export_group_frame,
|
|
132
|
+
textvariable=self.export_report_format_var,
|
|
133
|
+
values=self.supported_export_formats,
|
|
134
|
+
state='readonly', # Prevents user from typing invalid values
|
|
135
|
+
width=5
|
|
136
|
+
)
|
|
137
|
+
self.export_report_format.set(self.supported_export_formats[0]) # Set default text
|
|
138
|
+
self.export_report_format.pack(side=tk.LEFT)
|
|
139
|
+
# Pack Entry tightly next to it
|
|
140
|
+
|
|
59
141
|
ttk.Checkbutton(
|
|
60
142
|
control_frame,
|
|
61
143
|
text="Show All Links (Override Max)",
|
|
62
144
|
variable=self.show_all_links_var,
|
|
63
|
-
# Optional: Disable max_links entry when this is checked
|
|
64
145
|
command=self._toggle_max_links_entry
|
|
65
|
-
).grid(row=2, column=
|
|
146
|
+
).grid(row=2, column=2, padx=5, pady=5, sticky='w')
|
|
66
147
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
148
|
+
# Row 3: Run Button and License Button
|
|
149
|
+
run_btn = ttk.Button(control_frame, text="▶ Run Analysis", command=self._run_analysis_gui, style='Accent.TButton')
|
|
150
|
+
run_btn.grid(row=3, column=0, columnspan=2, pady=10, sticky='ew', padx=(0, 5))
|
|
70
151
|
|
|
71
|
-
|
|
72
|
-
|
|
152
|
+
license_btn = ttk.Button(control_frame, text="Show License", command=self._show_license)
|
|
153
|
+
license_btn.grid(row=3, column=2, columnspan=1, pady=10, sticky='ew', padx=(5, 0)) # Sticky 'ew' makes it fill
|
|
154
|
+
|
|
73
155
|
|
|
74
156
|
control_frame.grid_columnconfigure(1, weight=1)
|
|
75
157
|
|
|
@@ -80,16 +162,26 @@ class PDFLinkCheckerApp(tk.Tk):
|
|
|
80
162
|
ttk.Label(output_frame, text="Analysis Report Output:").pack(fill='x')
|
|
81
163
|
|
|
82
164
|
# Scrollable Text Widget for output
|
|
83
|
-
|
|
84
|
-
|
|
165
|
+
# Use an internal frame for text and scrollbar to ensure correct packing
|
|
166
|
+
text_scroll_frame = ttk.Frame(output_frame)
|
|
167
|
+
text_scroll_frame.pack(fill='both', expand=True, padx=5, pady=5)
|
|
85
168
|
|
|
86
|
-
#
|
|
87
|
-
|
|
88
|
-
|
|
169
|
+
self.output_text = tk.Text(text_scroll_frame, wrap=tk.WORD, state=tk.DISABLED, bg='#333333', fg='white', font=('Monospace', 10))
|
|
170
|
+
self.output_text.pack(side=tk.LEFT, fill='both', expand=True) # Text fills and expands
|
|
171
|
+
|
|
172
|
+
# Scrollbar (Scrollbar must be packed AFTER the text widget)
|
|
173
|
+
scrollbar = ttk.Scrollbar(text_scroll_frame, command=self.output_text.yview)
|
|
89
174
|
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
|
|
175
|
+
self.output_text['yscrollcommand'] = scrollbar.set # Link text widget back to scrollbar
|
|
90
176
|
|
|
91
177
|
def _select_pdf(self):
|
|
178
|
+
if self.pdf_path.get():
|
|
179
|
+
initialdir = str(Path(self.pdf_path.get()).parent)
|
|
180
|
+
else:
|
|
181
|
+
initialdir = str(Path.cwd())
|
|
182
|
+
|
|
92
183
|
file_path = filedialog.askopenfilename(
|
|
184
|
+
initialdir=initialdir,
|
|
93
185
|
defaultextension=".pdf",
|
|
94
186
|
filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
|
|
95
187
|
)
|
|
@@ -103,6 +195,13 @@ class PDFLinkCheckerApp(tk.Tk):
|
|
|
103
195
|
else:
|
|
104
196
|
self.max_links_entry.config(state=tk.NORMAL)
|
|
105
197
|
|
|
198
|
+
def _toggle_export_report(self):
|
|
199
|
+
"""Enables/disables the report file export."""
|
|
200
|
+
if self.do_export_report_var.get():
|
|
201
|
+
self.export_report_format.config(state=tk.NORMAL)
|
|
202
|
+
else:
|
|
203
|
+
self.export_report_format.config(state=tk.DISABLED)
|
|
204
|
+
|
|
106
205
|
def _run_analysis_gui(self):
|
|
107
206
|
pdf_path_str = self.pdf_path.get()
|
|
108
207
|
if not Path(pdf_path_str).exists():
|
|
@@ -110,18 +209,21 @@ class PDFLinkCheckerApp(tk.Tk):
|
|
|
110
209
|
return
|
|
111
210
|
|
|
112
211
|
if self.show_all_links_var.get():
|
|
113
|
-
# Pass 0 to the backend, which analyze.py interprets as "Show All"
|
|
114
212
|
max_links_to_pass = 0
|
|
115
213
|
else:
|
|
116
214
|
try:
|
|
117
215
|
max_links_to_pass = int(self.max_links_var.get())
|
|
118
|
-
if max_links_to_pass
|
|
216
|
+
if max_links_to_pass < 1:
|
|
119
217
|
self._display_error("Error: Max Links must be a positive number (or use 'Show All').")
|
|
120
218
|
return
|
|
121
219
|
except ValueError:
|
|
122
220
|
self._display_error("Error: Max Links must be an integer.")
|
|
123
221
|
return
|
|
124
222
|
|
|
223
|
+
export_format = None
|
|
224
|
+
if self.do_export_report_var.get():
|
|
225
|
+
export_format = self.export_report_format_var.get().lower()
|
|
226
|
+
|
|
125
227
|
# 1. Clear previous output and enable editing
|
|
126
228
|
self.output_text.config(state=tk.NORMAL)
|
|
127
229
|
self.output_text.delete('1.0', tk.END)
|
|
@@ -136,11 +238,13 @@ class PDFLinkCheckerApp(tk.Tk):
|
|
|
136
238
|
run_analysis(
|
|
137
239
|
pdf_path=pdf_path_str,
|
|
138
240
|
check_remnants=self.check_remnants_var.get(),
|
|
139
|
-
max_links=max_links_to_pass
|
|
241
|
+
max_links=max_links_to_pass,
|
|
242
|
+
export_format=export_format
|
|
140
243
|
)
|
|
141
244
|
self.output_text.insert(tk.END, "\n--- Analysis Complete ---\n")
|
|
142
245
|
|
|
143
246
|
except Exception as e:
|
|
247
|
+
self.output_text.insert(tk.END, "\n")
|
|
144
248
|
self._display_error(f"An unexpected error occurred during analysis: {e}")
|
|
145
249
|
|
|
146
250
|
finally:
|
|
@@ -149,17 +253,41 @@ class PDFLinkCheckerApp(tk.Tk):
|
|
|
149
253
|
self.output_text.config(state=tk.DISABLED)
|
|
150
254
|
|
|
151
255
|
def _display_error(self, message):
|
|
152
|
-
|
|
153
|
-
self.output_text.
|
|
256
|
+
# Ensure output is in normal state to write
|
|
257
|
+
original_state = self.output_text.cget('state')
|
|
258
|
+
if original_state == tk.DISABLED:
|
|
259
|
+
self.output_text.config(state=tk.NORMAL)
|
|
260
|
+
|
|
261
|
+
#self.output_text.delete('1.0', tk.END)
|
|
154
262
|
self.output_text.insert(tk.END, f"[ERROR] {message}\n", 'error')
|
|
155
263
|
self.output_text.tag_config('error', foreground='red')
|
|
264
|
+
|
|
265
|
+
# Restore state
|
|
156
266
|
self.output_text.config(state=tk.DISABLED)
|
|
157
267
|
|
|
158
268
|
|
|
159
|
-
def
|
|
160
|
-
"""
|
|
161
|
-
|
|
162
|
-
|
|
269
|
+
def auto_close_window(root, delay_ms:int = 0):
    """
    Arrange for ``root`` to be destroyed after ``delay_ms`` milliseconds.

    Nothing is scheduled (and nothing is printed) unless ``delay_ms`` is
    greater than zero.
    """
    if not (delay_ms > 0):
        return

    print(f"Window is set to automatically close in {delay_ms/1000} seconds.")
    root.after(delay_ms, root.destroy)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def start_gui(time_auto_close:int=0):
    """
    Launch the GUI and block until its window is closed.

    ``time_auto_close`` is passed to ``auto_close_window`` as the delay in
    milliseconds; the default of 0 leaves the window open until the user
    closes it.
    """
    print("pdflinkcheck: start_gui ...")

    window = PDFLinkCheckerApp()
    # The auto-close timer must be registered before the event loop starts.
    auto_close_window(window, time_auto_close)
    window.mainloop()

    print("pdflinkcheck: gui closed.")
|
|
163
291
|
|
|
164
292
|
if __name__ == "__main__":
|
|
165
293
|
start_gui()
|
pdflinkcheck/io.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# src/pdflinkcheck/io.py
|
|
2
|
+
import logging
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Any, Union, List
|
|
6
|
+
|
|
7
|
+
# --- Configuration ---
|
|
8
|
+
|
|
9
|
+
# Define the base directory for pdflinkcheck data (~/.pdflinkcheck)
|
|
10
|
+
try:
    # Preferred location: a hidden directory under the user's home.
    PDFLINKCHECK_HOME = Path.home() / ".pdflinkcheck"
    # Ensure the directory exists
    PDFLINKCHECK_HOME.mkdir(parents=True, exist_ok=True)
except Exception:
    # Best-effort fallback when the home directory cannot be resolved or is
    # not writable (e.g., some CI runners). tempfile.gettempdir() is used
    # instead of a hard-coded "/tmp" so the fallback also works on
    # platforms without that path.
    import tempfile

    PDFLINKCHECK_HOME = Path(tempfile.gettempdir()) / ".pdflinkcheck_temp"
    PDFLINKCHECK_HOME.mkdir(parents=True, exist_ok=True)

# Define the log file path
LOG_FILE_PATH = PDFLINKCHECK_HOME / "pdflinkcheck_errors.log"
|
|
22
|
+
|
|
23
|
+
# --- Logging Setup ---
|
|
24
|
+
|
|
25
|
+
# Set up a basic logger for error tracking
|
|
26
|
+
def setup_error_logger():
    """
    Configures a basic logger that writes errors and warnings to a file
    in the PDFLINKCHECK_HOME directory.

    The function is safe to call repeatedly: if the logger already has a
    FileHandler attached, it is returned unchanged and no new log file
    handle is opened.

    Returns:
        The 'pdflinkcheck_logger' logging.Logger instance.
    """
    logger = logging.getLogger('pdflinkcheck_logger')
    logger.setLevel(logging.WARNING)  # Log WARNING and above

    # Prevent propagation to the root logger (which might print to console)
    logger.propagate = False

    # Check BEFORE constructing a handler. FileHandler opens the log file
    # in its constructor, so building one and then discarding it would
    # leak an open file handle on every repeat call.
    if any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
        return logger

    # Explicit UTF-8 so non-ASCII text in messages (e.g. file names) is
    # written the same way regardless of the platform default encoding.
    file_handler = logging.FileHandler(LOG_FILE_PATH, mode='a', encoding='utf-8')
    file_handler.setLevel(logging.WARNING)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(file_handler)

    return logger
|
|
51
|
+
|
|
52
|
+
# Initialize the logger instance
|
|
53
|
+
error_logger = setup_error_logger()
|
|
54
|
+
|
|
55
|
+
# --- Export Functionality ---
|
|
56
|
+
|
|
57
|
+
def export_report_data(
    report_data: Dict[str, Any],
    pdf_filename: str,
    export_format: str = "JSON"
) -> Path:
    """
    Exports the structured analysis report data to a file in the
    PDFLINKCHECK_HOME directory.

    Args:
        report_data: The dictionary containing the results from run_analysis.
        pdf_filename: The base filename of the PDF being analyzed (used for the output file name).
        export_format: The desired output format; compared case-insensitively,
            only 'JSON' is currently supported.

    Returns:
        The path object pointing to the successfully created report file.

    Raises:
        ValueError: If the export_format is not supported.
        RuntimeError: If the report cannot be serialized or written. The
            underlying exception is attached as ``__cause__``.
    """
    if export_format.upper() != "JSON":
        error_logger.error(f"Unsupported export format requested: {export_format}")
        raise ValueError("Only 'JSON' format is currently supported for report export.")

    # The output name is derived from the PDF's stem only (no timestamp),
    # so exporting again for the same PDF overwrites the previous report.
    base_name = Path(pdf_filename).stem
    output_path = PDFLINKCHECK_HOME / f"{base_name}_report.json"

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            # Use indent for readability
            json.dump(report_data, f, indent=4)
    except Exception as e:
        error_logger.error(f"Failed to export report to JSON: {e}", exc_info=True)
        # Re-raise after logging for the caller to handle; "from e" keeps
        # the original exception as the explicit cause in the traceback.
        raise RuntimeError(f"Report export failed due to an I/O error: {e}") from e
    else:
        # Reported only after the file has been written and closed.
        print(f"\nReport successfully exported to: {output_path}")
        return output_path
|
|
98
|
+
|
|
99
|
+
# Example of how an external module can log an error:
|
|
100
|
+
# from pdflinkcheck.io import error_logger
|
|
101
|
+
# try:
|
|
102
|
+
# ...
|
|
103
|
+
# except Exception as e:
|
|
104
|
+
# error_logger.exception("An exception occurred during link extraction.")
|
|
105
|
+
|
|
106
|
+
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdflinkcheck
|
|
3
|
+
Version: 1.1.47
|
|
4
|
+
Summary: A purpose-built PDF link analysis and reporting tool with GUI and CLI.
|
|
5
|
+
Author-email: George Clayton Bennett <george.bennett@memphistn.gov>
|
|
6
|
+
Project-URL: Homepage, https://github.com/city-of-memphis-wastewater/pdflinkcheck
|
|
7
|
+
Project-URL: Repository, https://github.com/city-of-memphis-wastewater/pdflinkcheck
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
18
|
+
Classifier: Intended Audience :: Developers
|
|
19
|
+
Classifier: Intended Audience :: Science/Research
|
|
20
|
+
Classifier: Intended Audience :: Other Audience
|
|
21
|
+
Classifier: Topic :: File Formats
|
|
22
|
+
Classifier: Topic :: Office/Business
|
|
23
|
+
Classifier: Topic :: Text Processing :: General
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Environment :: Console
|
|
26
|
+
Classifier: Environment :: MacOS X
|
|
27
|
+
Classifier: Environment :: Win32 (MS Windows)
|
|
28
|
+
Classifier: Typing :: Typed
|
|
29
|
+
Classifier: Development Status :: 4 - Beta
|
|
30
|
+
Requires-Python: >=3.10
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: pyhabitat>=1.0.53
|
|
34
|
+
Requires-Dist: pymupdf>=1.26.6
|
|
35
|
+
Requires-Dist: rich>=14.2.0
|
|
36
|
+
Requires-Dist: typer>=0.20.0
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: ruff>=0.1.13; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# pdflinkcheck
|
|
44
|
+
|
|
45
|
+
A purpose-built tool for comprehensive analysis of hyperlinks and link remnants within PDF documents, primarily using the PyMuPDF library. Use the CLI or the GUI.
|
|
46
|
+
|
|
47
|
+
-----
|
|
48
|
+
|
|
49
|
+

|
|
50
|
+
|
|
51
|
+
-----
|
|
52
|
+
|
|
53
|
+
## 📥 Access and Installation
|
|
54
|
+
|
|
55
|
+
The recommended way to use `pdflinkcheck` is to either install the CLI with `pipx` or to download the appropriate latest binary for your system from [Releases](https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/releases/).
|
|
56
|
+
|
|
57
|
+
### 🚀 Recommended Access (Binary Files)
|
|
58
|
+
|
|
59
|
+
For the most user-typical experience, download the single-file binary matching your OS.
|
|
60
|
+
|
|
61
|
+
| **File Type** | **Primary Use Case** | **Recommended Launch Method** |
|
|
62
|
+
| :--- | :--- | :--- |
|
|
63
|
+
| **Executable (.exe, .elf, .pyz)** | **GUI (Double-Click)** | Double-click the file (use the accompanying `.bat` file on Windows). |
|
|
64
|
+
| **PYZ (Python Zip App)** | **CLI (Terminal)** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz analyze ...` |
|
|
65
|
+
|
|
66
|
+
### Installation via pipx
|
|
67
|
+
|
|
68
|
+
For an isolated environment where you can access `pdflinkcheck` from any terminal:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Ensure you have pipx installed first (if not, run: pip install pipx)
|
|
72
|
+
pipx install pdflinkcheck
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
-----
|
|
76
|
+
|
|
77
|
+
## 💻 Graphical User Interface (GUI)
|
|
78
|
+
|
|
79
|
+
The tool can be run as a simple cross-platform graphical interface (Tkinter).
|
|
80
|
+
|
|
81
|
+
### Launching the GUI
|
|
82
|
+
|
|
83
|
+
There are three ways to launch the GUI interface:
|
|
84
|
+
|
|
85
|
+
1. **Implicit Launch:** Run the main command with no arguments, subcommands, or flags (`pdflinkcheck`).
|
|
86
|
+
2. **Explicit Command:** Use the dedicated GUI subcommand (`pdflinkcheck gui`).
|
|
87
|
+
3. **Binary Double-Click:**
|
|
88
|
+
* **Windows:** Double-click the `pdflinkcheck-VERSION-gui.bat` file.
|
|
89
|
+
* **macOS/Linux:** Double-click the downloaded `.pyz` or `.elf` file.
|
|
90
|
+
|
|
91
|
+
### Planned GUI Updates
|
|
92
|
+
|
|
93
|
+
We are actively working on the following enhancements:
|
|
94
|
+
|
|
95
|
+
* **Report Export:** Functionality to export the full analysis report to a plain text file.
|
|
96
|
+
* **License Visibility:** A dedicated "License Info" button within the GUI to display the terms of the AGPLv3+ license.
|
|
97
|
+
|
|
98
|
+
-----
|
|
99
|
+
|
|
100
|
+
## 🚀 CLI Usage
|
|
101
|
+
|
|
102
|
+
The core functionality is accessed via the `analyze` command. All commands include the built-in `--help` flag for quick reference.
|
|
103
|
+
|
|
104
|
+
### Available Commands
|
|
105
|
+
|
|
106
|
+
|**Command**|**Description**|
|
|
107
|
+
|---|---|
|
|
108
|
+
|`pdflinkcheck analyze`|Analyzes a PDF file for links and remnants.|
|
|
109
|
+
|`pdflinkcheck gui`|Explicitly launch the Graphical User Interface.|
|
|
110
|
+
|`pdflinkcheck license`|**Displays the full AGPLv3+ license text in the terminal.**|
|
|
111
|
+
|
|
112
|
+
### `analyze` Command Options
|
|
113
|
+
|
|
114
|
+
|**Option**|**Description**|**Default**|
|
|
115
|
+
|---|---|---|
|
|
116
|
+
|`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
|
|
117
|
+
|`--check-remnants / --no-check-remnants`|Toggle scanning the text layer for unlinked URLs/Emails.|`--check-remnants`|
|
|
118
|
+
|`--max-links INTEGER`|Maximum number of links/remnants to display in the detailed report sections. Use `0` to show all.|`0` (Show All)|
|
|
119
|
+
|`--export-format FORMAT`|Format for the exported report. If specified, the report is saved to a file named after the PDF. Currently supported: `JSON`.|`JSON`|
|
|
120
|
+
|`--help`|Show command help and exit.|N/A|
|
|
121
|
+
|
|
122
|
+
### `gui` Command Options
|
|
123
|
+
|
|
124
|
+
| **Option** | **Description** | **Default** |
|
|
125
|
+
| ---------------------- | ------------------------------------------------------------------------------------------------------------- | -------------- |
|
|
126
|
+
| `--auto-close INTEGER` | **(For testing/automation only).** Delay in milliseconds after which the GUI window will automatically close. | `0` (Disabled) |
|
|
127
|
+
#### Example Runs
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Analyze a document, show all links/remnants, and save the report as JSON
|
|
133
|
+
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --export-format JSON
|
|
134
|
+
|
|
135
|
+
# Analyze a document but skip the time-consuming remnant check
|
|
136
|
+
pdflinkcheck analyze "another_doc.pdf" --no-check-remnants
|
|
137
|
+
|
|
138
|
+
# Analyze a document but keep the print block short, showing only the first 10 links for each type
|
|
139
|
+
pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
|
|
140
|
+
|
|
141
|
+
# Show the GUI for only a moment, like in a build check
|
|
142
|
+
pdflinkcheck gui --auto-close 3000
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
-----
|
|
147
|
+
|
|
148
|
+
## 📦 Library Access (Advanced)
|
|
149
|
+
|
|
150
|
+
For developers importing `pdflinkcheck` into other Python projects, the core analysis functions are exposed directly in the root namespace:
|
|
151
|
+
|
|
152
|
+
|**Function**|**Description**|
|
|
153
|
+
|---|---|
|
|
154
|
+
|`run_analysis()`|**(Primary function)** Performs the full analysis, prints to console, and handles file export.|
|
|
155
|
+
|`extract_links()`|Low-level function to retrieve all explicit links (URIs, GoTo, etc.) from a PDF path.|
|
|
156
|
+
|`extract_toc()`|Low-level function to extract the PDF's internal Table of Contents (bookmarks/outline).|
|
|
157
|
+
|
|
158
|
+
Python
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
from pdflinkcheck.analyze import run_analysis, extract_links, extract_toc
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
-----
|
|
165
|
+
|
|
166
|
+
## ✨ Features
|
|
167
|
+
|
|
168
|
+
* **Active Link Extraction:** Identifies and categorizes all programmed links (External URIs, Internal GoTo/Destinations, Remote Jumps).
|
|
169
|
+
* **Anchor Text Retrieval:** Extracts the visible text corresponding to each link's bounding box.
|
|
170
|
+
* **Remnant Detection:** Scans the document's text layer for unlinked URIs and email addresses that should potentially be converted into active links.
|
|
171
|
+
* **Structural TOC:** Extracts the PDF's internal Table of Contents (bookmarks/outline).
|
|
172
|
+
|
|
173
|
+
-----
|
|
174
|
+
|
|
175
|
+
## 📜 License Implications (AGPLv3+)
|
|
176
|
+
|
|
177
|
+
**pdflinkcheck is licensed under the GNU Affero General Public License version 3 or later (AGPLv3+).**
|
|
178
|
+
|
|
179
|
+
This license has significant implications for **distribution and network use**, particularly for organizations:
|
|
180
|
+
|
|
181
|
+
* **Source Code Provision:** If you distribute this tool (modified or unmodified) to anyone, you **must** provide the full source code under the same license.
|
|
182
|
+
* **Network Interaction (Affero Clause):** If you modify this tool and make the modified version available to users over a computer network (e.g., as a web service or backend), you **must** also offer the source code to those network users.
|
|
183
|
+
|
|
184
|
+
> **Before deploying or modifying this tool for organizational use, especially for internal web services or distribution, please ensure compliance with the AGPLv3+ terms.**
|
|
185
|
+
|
|
186
|
+
-----
|
|
187
|
+
|
|
188
|
+
## 🥚 Optional REPL‑Friendly GUI Access (Easter Egg)
|
|
189
|
+
|
|
190
|
+
For users who prefer exploring tools interactively—especially those coming from MATLAB or other REPL‑first environments—`pdflinkcheck` includes an optional Easter egg that exposes the GUI launcher directly in the library namespace.
|
|
191
|
+
|
|
192
|
+
This feature is **disabled by default** and has **no effect on normal imports**.
|
|
193
|
+
|
|
194
|
+
### Enabling the Easter Egg
|
|
195
|
+
|
|
196
|
+
Set the environment variable before importing the library:
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
import os
|
|
200
|
+
os.environ["PDFLINKCHECK_GUI_EASTEREGG"] = "true"
|
|
201
|
+
|
|
202
|
+
import pdflinkcheck
|
|
203
|
+
pdflinkcheck.start_gui()
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Accepted values include: `true`, `1`, `yes`, `on` (case‑insensitive).
|
|
207
|
+
|
|
208
|
+
### Purpose
|
|
209
|
+
|
|
210
|
+
This opt‑in behavior is designed to make the library feel welcoming to beginners who are experimenting in a Python REPL for the first time. When enabled, the `start_gui()` function becomes available at the top level:
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
pdflinkcheck.start_gui()
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
If the `PDFLINKCHECK_GUI_EASTEREGG` environment variable is not set—or if GUI support is unavailable—`pdflinkcheck` behaves as a normal library with no GUI functions exposed.
|
|
217
|
+
|
|
218
|
+
-----
|
|
219
|
+
|
|
220
|
+
## ⚠️ Compatibility Notes
|
|
221
|
+
|
|
222
|
+
### Platform Compatibility:
|
|
223
|
+
|
|
224
|
+
This tool relies on the `PyMuPDF` library.
|
|
225
|
+
All testing has failed to run in a **Termux (Android)** environment due to underlying C/C++ library compilation issues with PyMuPDF.
|
|
226
|
+
It is recommended for use on standard Linux, macOS, or Windows operating systems.
|
|
227
|
+
|
|
228
|
+
#### Termux Compatibility as a Key Goal
|
|
229
|
+
A key goal of City-of-Memphis-Wastewater is to release all software as Termux-compatible. Unfortunately, that simply isn't possible with PyMuPDF as a dependency.
|
|
230
|
+
We tried alternative PDF libraries like `pdfminer`, `pdfplumber`, and `borb`, but none of these offered the level of detail concerning GoTo links.
|
|
231
|
+
Due to Termux compatibility goals, we do not generally make Tkinter-based interfaces, so that was a fun, minimalist opportunity on this project.
|
|
232
|
+
|
|
233
|
+
Termux compatibility is important in the modern age as Android devices are common among technicians, field engineers, and maintenance staff.
|
|
234
|
+
Android is the most common operating system in the Global South.
|
|
235
|
+
We aim to produce stable software that can do the most possible good.
|
|
236
|
+
|
|
237
|
+
We love web-stack GUIs served locally as a final product.
|
|
238
|
+
All that packaged up into a Termux-compatible ELF or PYZ - What could be better!
|
|
239
|
+
|
|
240
|
+
In the future we may find a work-around and be able to drop the PyMuPDF dependency.
|
|
241
|
+
This would have lots of implications:
|
|
242
|
+
- Reduced artifact size.
|
|
243
|
+
- Alpine-compatible Docker image.
|
|
244
|
+
- Web-stack GUI rather than Tkinter, to be compatible with Termux.
|
|
245
|
+
- A different license from the AGPLv3, if we choose at that time.
|
|
246
|
+
|
|
247
|
+
In the meantime, the standalone binaries and pipx installation provide excellent cross-platform support on Windows, macOS, and standard Linux desktops/laptops.
|
|
248
|
+
|
|
249
|
+
### Document Compatibility:
|
|
250
|
+
While `pdflinkcheck` uses the robust PyMuPDF library, not all PDF files can be processed successfully. This tool is designed primarily for digitally generated (vector-based) PDFs.
|
|
251
|
+
|
|
252
|
+
Processing may fail or yield incomplete results for:
|
|
253
|
+
* **Scanned PDFs** (images of text) that lack an accessible text layer.
|
|
254
|
+
* **Encrypted or Password-Protected** documents.
|
|
255
|
+
* **Malformed or non-standard** PDF files.
|
|
256
|
+
|
|
257
|
+
-----
|
|
258
|
+
|
|
259
|
+
## Run from Source (Developers)
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
git clone http://github.com/city-of-memphis-wastewater/pdflinkcheck.git
|
|
263
|
+
cd pdflinkcheck
|
|
264
|
+
uv sync
|
|
265
|
+
uv run python src/pdflinkcheck/cli.py --help
|
|
266
|
+
```
|