pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +31 -0
- pdflinkcheck/analyze.py +306 -128
- pdflinkcheck/cli.py +97 -20
- pdflinkcheck/data/LICENSE +680 -0
- pdflinkcheck/gui.py +157 -29
- pdflinkcheck/io.py +106 -0
- pdflinkcheck-1.1.47.dist-info/METADATA +266 -0
- pdflinkcheck-1.1.47.dist-info/RECORD +13 -0
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.1.47.dist-info/licenses/LICENSE +680 -0
- pdflinkcheck-1.1.7.dist-info/METADATA +0 -109
- pdflinkcheck-1.1.7.dist-info/RECORD +0 -10
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/WHEEL +0 -0
- {pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.47.dist-info}/top_level.txt +0 -0
pdflinkcheck/__init__.py
CHANGED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Public library API for pdflinkcheck.
#
# The three analysis functions are always exported. The GUI entry point
# `start_gui` is an opt-in extra: it is only imported when the environment
# variable PDFLINKCHECK_GUI_EASTEREGG is set to a truthy value AND tkinter
# is reported as available by pyhabitat.
import os

# Library functions
from pdflinkcheck.analyze import run_analysis, extract_links, extract_toc

# For the kids. This is what I wanted when learning Python in a mysterious new REPL.
# Is this Pythonic? No. Oh well. PEP 8, PEP 20.
flag = os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
pdflibkcheck_gui_lib_func_load = str(flag).strip().lower() in ('true', '1', 'yes', 'on')

if pdflibkcheck_gui_lib_func_load:
    try:
        import pyhabitat  # pyhabitat is a dependency of this package already
        if pyhabitat.tkinter_is_available():
            from pdflinkcheck.gui import start_gui
    except ImportError:
        # Best effort: the GUI is optional, so a missing import simply
        # leaves `start_gui` undefined.
        pass

# Breadcrumbs, for stumbling upon.
__pdflinkcheck_gui_easteregg_enabled__ = pdflibkcheck_gui_lib_func_load

# Define __all__ such that the library functions are self documenting.
# Every entry must be a string naming an attribute that really exists;
# a None entry (or a name that failed to import) makes
# `from pdflinkcheck import *` raise.
__all__ = [
    "run_analysis",
    "extract_links",
    "extract_toc",
]
if "start_gui" in globals():
    __all__.append("start_gui")
|
pdflinkcheck/analyze.py
CHANGED
|
@@ -1,23 +1,33 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Dict, Any
|
|
4
|
+
from typing import Dict, Any, Optional
|
|
5
|
+
# ... other imports ...
|
|
5
6
|
# Configure logging to suppress low-level PyMuPDF (fitz) messages
|
|
6
7
|
logging.getLogger("fitz").setLevel(logging.ERROR)
|
|
7
8
|
import fitz # PyMuPDF
|
|
8
9
|
|
|
9
10
|
from pdflinkcheck.remnants import find_link_remnants
|
|
11
|
+
from pdflinkcheck.io import error_logger, export_report_data, LOG_FILE_PATH
|
|
10
12
|
|
|
11
13
|
"""
|
|
12
14
|
Inspect target PDF for both URI links and for GoTo links.
|
|
13
15
|
"""
|
|
14
16
|
|
|
15
|
-
|
|
16
17
|
# Helper function: Prioritize 'from'
|
|
17
18
|
def get_link_rect(link_dict):
|
|
18
19
|
"""
|
|
19
|
-
Retrieves the bounding box for the link using the reliable 'from' key
|
|
20
|
-
|
|
20
|
+
Retrieves the bounding box for the link using the reliable 'from' key
|
|
21
|
+
provided by PyMuPDF's link dictionary.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
link_dict: A dictionary representing a single link/annotation
|
|
25
|
+
returned by `page.get_links()`.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
A tuple of four floats (x0, y0, x1, y1) representing the
|
|
29
|
+
rectangular coordinates of the link on the page, or None if the
|
|
30
|
+
bounding box data is missing.
|
|
21
31
|
"""
|
|
22
32
|
# 1. Use the 'from' key, which returns a fitz.Rect object or None
|
|
23
33
|
rect_obj = link_dict.get('from')
|
|
@@ -30,25 +40,19 @@ def get_link_rect(link_dict):
|
|
|
30
40
|
# 3. Fallback to None if 'from' is missing
|
|
31
41
|
return None
|
|
32
42
|
|
|
33
|
-
def get_pdf_file():
|
|
34
|
-
|
|
35
|
-
example_path = f"/mnt/c/Users/george.bennett/Downloads/TE Maxson WWTF O&M Manual DRAFT - Sections 1-6 - April 2025 (3).pdf"
|
|
36
|
-
example_path = "TE Maxson WWTF O&M Manual.pdf"
|
|
37
|
-
print(f"example path = {example_path}")
|
|
38
|
-
pdf_file = input(f"Paste path to PDF file (or press Enter to accept example): ")
|
|
39
|
-
if not pdf_file:
|
|
40
|
-
pdf_file = example_path
|
|
41
|
-
if not Path(pdf_file).exists:
|
|
42
|
-
print("File not found!")
|
|
43
|
-
sys.exit(1)
|
|
44
|
-
|
|
45
|
-
return pdf_file
|
|
46
|
-
|
|
47
|
-
|
|
48
43
|
def get_anchor_text(page, link_rect):
|
|
49
44
|
"""
|
|
50
|
-
Extracts text content using the link's bounding box.
|
|
51
|
-
|
|
45
|
+
Extracts text content using the link's bounding box coordinates.
|
|
46
|
+
The bounding box is slightly expanded to ensure full characters are captured.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
page: The fitz.Page object where the link is located.
|
|
50
|
+
link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
|
|
51
|
+
link's bounding box.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
The cleaned, extracted text string, or a placeholder message
|
|
55
|
+
if no text is found or if an error occurs.
|
|
52
56
|
"""
|
|
53
57
|
if not link_rect:
|
|
54
58
|
return "N/A: Missing Rect"
|
|
@@ -90,7 +94,15 @@ def get_anchor_text(page, link_rect):
|
|
|
90
94
|
|
|
91
95
|
def analyze_toc_fitz(doc):
|
|
92
96
|
"""
|
|
93
|
-
Extracts the
|
|
97
|
+
Extracts the structural Table of Contents (PDF Bookmarks/Outline)
|
|
98
|
+
from the PDF document using PyMuPDF's built-in functionality.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
doc: The open fitz.Document object.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
A list of dictionaries, where each dictionary represents a TOC entry
|
|
105
|
+
with 'level', 'title', and 'target_page' (1-indexed).
|
|
94
106
|
"""
|
|
95
107
|
toc = doc.get_toc()
|
|
96
108
|
toc_data = []
|
|
@@ -107,12 +119,68 @@ def analyze_toc_fitz(doc):
|
|
|
107
119
|
|
|
108
120
|
|
|
109
121
|
# 2. Updated Main Inspection Function to Include Text Extraction
|
|
110
|
-
def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
111
|
-
|
|
122
|
+
#def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
123
|
+
def extract_toc(pdf_path):
    """
    Opens a PDF and extracts the structural table of contents (TOC/bookmarks).

    Args:
        pdf_path: The file system path (str) to the target PDF document.

    Returns:
        A list of dictionaries representing the structural TOC/bookmarks,
        as produced by `analyze_toc_fitz`. If the document cannot be opened
        or parsed, the error is printed to stderr and an empty list is
        returned.
    """
    # Bind the result up front so the error path still has something to
    # return instead of raising UnboundLocalError.
    structural_toc = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        structural_toc = analyze_toc_fitz(doc)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
    finally:
        # Release the file handle whether or not extraction succeeded.
        if doc is not None:
            doc.close()
    return structural_toc
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def serialize_fitz_object(obj):
    """
    Converts a fitz geometry object (Point, Rect, Matrix) to a serializable type.

    Meant to avoid export failures such as
    "Object of type Point is not JSON serializable".

    The object is recognised by the attributes it exposes rather than by
    its class:

    * x, y (and no x0)     -> (x, y)
    * x0, y0, x1, y1       -> (x0, y0, x1, y1)
    * a, b, c, d, e, f     -> (a, b, c, d, e, f)

    Lists, tuples and dicts are converted element by element, so geometry
    objects nested inside a container are cleaned too.

    Args:
        obj: Any value taken from a link dictionary.

    Returns:
        None for None; primitives unchanged; containers of the same kind
        with converted contents; a tuple of coordinates for recognised
        geometry objects; otherwise `str(obj)`.
    """
    if obj is None:
        return None

    # Containers: recurse so nested geometry objects do not slip through.
    if isinstance(obj, dict):
        return {key: serialize_fitz_object(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [serialize_fitz_object(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(serialize_fitz_object(item) for item in obj)

    # Primitives are already serializable.
    if isinstance(obj, (str, int, float, bool)):
        return obj

    def has_all(*names):
        # Require every attribute that will be read, so a partial match
        # falls through to the string fallback instead of raising.
        return all(hasattr(obj, name) for name in names)

    # 1. Point-like (has x, y but is not a rectangle)
    if has_all('x', 'y') and not hasattr(obj, 'x0'):
        return (obj.x, obj.y)

    # 2. Rect-like / IRect-like
    if has_all('x0', 'y0', 'x1', 'y1'):
        return (obj.x0, obj.y0, obj.x1, obj.y1)

    # 3. Matrix-like
    if has_all('a', 'b', 'c', 'd', 'e', 'f'):
        return (obj.a, obj.b, obj.c, obj.d, obj.e, obj.f)

    # 4. Fallback: anything else is represented by its string form.
    return str(obj)
|
|
167
|
+
|
|
168
|
+
def extract_links(pdf_path):
|
|
169
|
+
"""
|
|
170
|
+
Opens a PDF, iterates through all pages and extracts all link annotations.
|
|
171
|
+
It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
pdf_path: The file system path (str) to the target PDF document.
|
|
175
|
+
|
|
176
|
+
Returns:
|
|
177
|
+
A list of dictionaries, where each dictionary is a comprehensive
|
|
178
|
+
representation of an active hyperlink found in the PDF.
|
|
179
|
+
|
|
180
|
+
"""
|
|
181
|
+
links_data = []
|
|
182
|
+
try:
|
|
183
|
+
doc = fitz.open(pdf_path)
|
|
116
184
|
|
|
117
185
|
for page_num in range(doc.page_count):
|
|
118
186
|
page = doc.load_page(page_num)
|
|
@@ -143,13 +211,22 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
|
143
211
|
|
|
144
212
|
|
|
145
213
|
link_dict = {
|
|
146
|
-
'page': int(page_num) + 1,
|
|
214
|
+
'page': int(page_num) + 1, # accurate for link location, add 1
|
|
147
215
|
'rect': link_rect,
|
|
148
216
|
'link_text': anchor_text,
|
|
149
217
|
'xref':xref
|
|
150
218
|
}
|
|
151
219
|
|
|
152
|
-
|
|
220
|
+
# A. Clean Geom. Objects: Use the helper function on 'to' / 'destination'
|
|
221
|
+
# Use the clean serialize_fitz_object() helper function on all keys that might contain objects
|
|
222
|
+
destination_view = serialize_fitz_object(link.get('to'))
|
|
223
|
+
|
|
224
|
+
# B. Correct Internal Link Page Numbering (The -1 correction hack)
|
|
225
|
+
# This will be skipped by URI, which is not expected to have a page key
|
|
226
|
+
target_page_num_reported = "N/A"
|
|
227
|
+
if link.get('page') is not None:
|
|
228
|
+
target_page_num_reported = int(link.get('page')) # accurate for link target, don't add 1 (weird)
|
|
229
|
+
|
|
153
230
|
if link['kind'] == fitz.LINK_URI:
|
|
154
231
|
target = link.get('uri', 'URI (Unknown Target)')
|
|
155
232
|
link_dict.update({
|
|
@@ -159,12 +236,11 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
|
159
236
|
})
|
|
160
237
|
|
|
161
238
|
elif link['kind'] == fitz.LINK_GOTO:
|
|
162
|
-
|
|
163
|
-
target = f"Page {target_page_num}"
|
|
239
|
+
target = f"Page {target_page_num_reported}"
|
|
164
240
|
link_dict.update({
|
|
165
241
|
'type': 'Internal (GoTo/Dest)',
|
|
166
|
-
'destination_page':
|
|
167
|
-
'destination_view':
|
|
242
|
+
'destination_page': target_page_num_reported,
|
|
243
|
+
'destination_view': destination_view,
|
|
168
244
|
'target': target
|
|
169
245
|
})
|
|
170
246
|
|
|
@@ -172,15 +248,17 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
|
172
248
|
link_dict.update({
|
|
173
249
|
'type': 'Remote (GoToR)',
|
|
174
250
|
'remote_file': link.get('file'),
|
|
175
|
-
'destination':
|
|
251
|
+
'destination': destination_view
|
|
176
252
|
})
|
|
177
253
|
|
|
178
254
|
elif link.get('page') is not None and link['kind'] != fitz.LINK_GOTO:
|
|
255
|
+
target = f"Page {target_page_num_reported}"
|
|
179
256
|
link_dict.update({
|
|
180
257
|
'type': 'Internal (Resolved Action)',
|
|
181
|
-
'destination_page':
|
|
182
|
-
'destination_view':
|
|
183
|
-
'source_kind': link.get('kind')
|
|
258
|
+
'destination_page': target_page_num_reported,
|
|
259
|
+
'destination_view': destination_view,
|
|
260
|
+
'source_kind': link.get('kind'),
|
|
261
|
+
'target': target
|
|
184
262
|
})
|
|
185
263
|
|
|
186
264
|
else:
|
|
@@ -190,20 +268,32 @@ def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
|
190
268
|
'action_kind': link.get('kind'),
|
|
191
269
|
'target': target
|
|
192
270
|
})
|
|
271
|
+
|
|
272
|
+
## --- General Serialization Cleaner ---
|
|
273
|
+
#for key, value in link_dict.items():
|
|
274
|
+
# if hasattr(value, 'rect') and hasattr(value, 'point'):
|
|
275
|
+
# # This handles Rect and Point objects that may slip through
|
|
276
|
+
# link_dict[key] = str(value)
|
|
277
|
+
## --- End Cleaner ---
|
|
193
278
|
|
|
194
279
|
links_data.append(link_dict)
|
|
195
280
|
|
|
196
281
|
doc.close()
|
|
197
282
|
except Exception as e:
|
|
198
283
|
print(f"An error occurred: {e}", file=sys.stderr)
|
|
199
|
-
return links_data
|
|
284
|
+
return links_data
|
|
200
285
|
|
|
201
286
|
def print_structural_toc(structural_toc):
|
|
202
287
|
"""
|
|
203
|
-
Prints the structural TOC data in a clean,
|
|
288
|
+
Prints the structural TOC data (bookmarks/outline) in a clean,
|
|
289
|
+
hierarchical, and readable console format.
|
|
290
|
+
|
|
291
|
+
Args:
|
|
292
|
+
structural_toc: A list of TOC dictionaries returned by `analyze_toc_fitz`.
|
|
204
293
|
"""
|
|
205
|
-
print("\n
|
|
206
|
-
print("
|
|
294
|
+
print("\n" + "=" * 70)
|
|
295
|
+
print("## Structural Table of Contents (PDF Bookmarks/Outline)")
|
|
296
|
+
print("=" * 70)
|
|
207
297
|
if not structural_toc:
|
|
208
298
|
print("No structural TOC (bookmarks/outline) found.")
|
|
209
299
|
return
|
|
@@ -220,108 +310,196 @@ def print_structural_toc(structural_toc):
|
|
|
220
310
|
page_str = str(item['target_page']).rjust(page_width)
|
|
221
311
|
print(f"{indent}{item['title']} . . . page {page_str}")
|
|
222
312
|
|
|
223
|
-
print("-" *
|
|
313
|
+
print("-" * 70)
|
|
224
314
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
Core PDF analysis logic using PyMuPDF. Extracts links, remnants, and TOC.
|
|
228
|
-
The printing is done inside this function.
|
|
229
|
-
max_links: If <= 0, all links will be displayed.
|
|
315
|
+
|
|
316
|
+
def get_first_pdf_in_cwd() -> Optional[str]:
    """
    Scans the current working directory (CWD) for a file ending with a
    '.pdf' extension (case-insensitive) and returns the first one by name.

    This is intended as a convenience function for running the tool
    without explicitly specifying a path. Only files directly in the CWD
    are considered; sub-directories are not searched.

    Returns:
        The absolute path (as a string) to the alphabetically first PDF
        file found, or None if no PDF files are present in the CWD or the
        directory could not be read (the error is printed to stderr).
    """
    try:
        # Path.cwd() is inside the try because it can fail too, e.g. when
        # the working directory has been removed.
        candidates = (
            p for p in Path.cwd().iterdir()
            # Compare the lowered suffix: globbing is case-sensitive on
            # Unix, so '*.pdf' alone would miss 'REPORT.PDF'.
            if p.is_file() and p.suffix.lower() == '.pdf'
        )
        # iterdir() yields entries in arbitrary order, so pick by name to
        # make "first" repeatable from run to run.
        first_pdf_path = min(
            candidates,
            key=lambda p: (p.name.casefold(), p.name),
            default=None,
        )
        if first_pdf_path is None:
            return None
        return str(first_pdf_path.resolve())
    except Exception as e:
        # Handle potential permissions errors or other issues
        print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
        return None
|
|
358
|
+
|
|
359
|
+
def run_analysis(pdf_path: Optional[str] = None, check_remnants: bool = True, max_links: int = 0, export_format: Optional[str] = "JSON") -> Dict[str, Any]:
    """
    Core high-level PDF link analysis logic.

    This function orchestrates the extraction of active links and TOC
    using PyMuPDF, finds link remnants (plain text URLs/emails), prints a
    comprehensive, user-friendly report to the console, and optionally
    exports the collected data.

    Args:
        pdf_path: The file system path (str) to the target PDF document.
            If None, the first PDF found in the current working directory
            (via `get_first_pdf_in_cwd`) is used.
        check_remnants: Boolean flag to enable/disable scanning for plain text
            links that are not active hyperlinks.
        max_links: Maximum number of links/remnants to display in each console
            section. If <= 0, all links will be displayed.
        export_format: Format string handed to `export_report_data`
            (e.g. "JSON"). A falsy value skips the export step.

    Returns:
        A dictionary containing the structured results of the analysis:
        'external_links', 'internal_links', 'remnants', and 'toc'.
        An empty dictionary is returned when no PDF could be located or
        when the document has no links, remnants, or TOC entries.

    Raises:
        Exception: Any error raised during analysis or export is logged
            through `error_logger` and then re-raised.
    """
    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
    if pdf_path is None:
        print("pdf_path is None")
        print("Tip: Drop a PDF in the current folder or pass in a path arg.")
        # Return an empty dict (not None) to honour the declared return
        # type, matching the "nothing found" path below.
        return {}

    try:
        print(f"Running PyMuPDF analysis on {Path(pdf_path).name}...")

        # 1. Extract all active links and TOC
        extracted_links = extract_links(pdf_path)
        structural_toc = extract_toc(pdf_path)
        toc_entry_count = len(structural_toc)

        # 2. Find link remnants
        remnants = []
        if check_remnants:
            remnants = find_link_remnants(pdf_path, extracted_links)  # Pass active links to exclude them

        if not extracted_links and not remnants and not structural_toc:
            print(f"\nNo hyperlinks, remnants, or structural TOC found in {Path(pdf_path).name}.")
            return {}

        # 3. Separate the lists based on the 'type' key
        uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
        goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
        resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
        other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]

        total_internal_links = len(goto_links) + len(resolved_action_links)

        # --- ANALYSIS SUMMARY ---
        print("\n" + "✪" * 70)
        print(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
        print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
        print(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
        print(f"Total **potential missing links** found: {len(remnants)}")
        print("✪" * 70)

        # None means "no cap": slicing with [:None] keeps every item.
        limit = max_links if max_links > 0 else None

        uri_and_other = uri_links + other_links

        # --- Section 1: ACTIVE URI LINKS ---
        print("\n" + "=" * 70)
        print(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
        print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
        print("=" * 70)

        if uri_and_other:
            for i, link in enumerate(uri_and_other[:limit], 1):
                target = link.get('url') or link.get('remote_file') or link.get('target')
                link_text = link.get('link_text', 'N/A')
                print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
            if limit is not None and len(uri_and_other) > limit:
                print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
        else:
            print(" No external or 'Other' links found.")

        # --- Section 2: ACTIVE INTERNAL JUMPS ---
        print("\n" + "=" * 70)
        print(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
        print("=" * 70)
        print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
        print("-" * 70)

        all_internal = goto_links + resolved_action_links
        if total_internal_links > 0:
            for i, link in enumerate(all_internal[:limit], 1):
                link_text = link.get('link_text', 'N/A')
                print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))

            if limit is not None and len(all_internal) > limit:
                print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
        else:
            print(" No internal GoTo or Resolved Action links found.")

        # --- Section 3: REMNANTS ---
        print("\n" + "=" * 70)
        print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
        print("=" * 70)

        if remnants:
            print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
            print("-" * 70)
            for i, remnant in enumerate(remnants[:limit], 1):
                print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
            # Test `limit`, as the link sections do: a negative max_links
            # means "show all" and must not report hidden remnants.
            if limit is not None and len(remnants) > limit:
                print(f"... and {len(remnants) - limit} more remnants (use --max-links to see all).")
        else:
            print(" No URI or Email remnants found that are not already active links.")

        # --- Section 4: TOC ---
        print_structural_toc(structural_toc)

        # Collected data, returned to the caller and optionally exported.
        # NOTE(review): 'Other' links are printed in Section 1 but are not
        # part of the returned data - confirm that this is intended.
        final_report_data = {
            "external_links": uri_links,
            "internal_links": all_internal,
            "remnants": remnants,
            "toc": structural_toc
        }

        # 5. Export Report
        if export_format:
            export_report_data(final_report_data, Path(pdf_path).name, export_format)

        return final_report_data
    except Exception as e:
        # Log the critical failure, tell the user where to look, then let
        # the exception propagate to the caller.
        error_logger.error(f"Critical failure during run_analysis for {pdf_path}: {e}", exc_info=True)
        print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
        raise
|
|
494
|
+
|
|
495
|
+
|
|
323
496
|
|
|
324
497
|
def call_stable():
    """
    Run `run_analysis` with its default arguments, announcing the start
    and the end of the run on stdout.

    Intended as a simple entry point for command-line execution
    (e.g. from `__main__`).
    """
    print("Begin analysis...")
    run_analysis()
    print("Analysis complete.")
|