pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/__init__.py
CHANGED
|
@@ -1,27 +1,82 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/__init__.py
|
|
2
4
|
"""
|
|
3
|
-
# License information
|
|
4
5
|
pdflinkcheck - A PDF Link Checker
|
|
5
6
|
|
|
6
|
-
Copyright (C) 2025 George Clayton Bennett
|
|
7
|
-
|
|
8
7
|
Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
|
|
9
8
|
|
|
10
|
-
This program is free software: You can redistribute it and/or modify
|
|
11
|
-
it under the terms of the GNU Affero General Public License as
|
|
12
|
-
published by the Free Software Foundation, either version 3 of the
|
|
13
|
-
License, or (at your option) any later version.
|
|
14
|
-
|
|
15
|
-
The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
|
|
16
9
|
"""
|
|
10
|
+
from __future__ import annotations
|
|
17
11
|
import os as _os
|
|
18
12
|
|
|
19
13
|
# Library functions
|
|
20
|
-
from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
|
|
21
|
-
from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
|
|
22
|
-
from pdflinkcheck.report import run_report_and_call_exports as run_report
|
|
23
14
|
#from pdflinkcheck import dev
|
|
24
15
|
|
|
16
|
+
# Lazy-loaded orchestrator
|
|
17
|
+
def run_report(pdf_path: str, export_format: str = "JSON", pdf_library: str = "auto", print_bool: bool = True):
    """
    Produce the full link-check report for one PDF file.

    The reporting module is imported inside the function (lazily) rather
    than when the package itself is imported.

    Args:
        pdf_path: Path to the PDF file.
        export_format: "JSON", "TXT", or both (e.g., "JSON,TXT").
        pdf_library: "auto", "pdfium", "pymupdf", or "pypdf".
        print_bool: If True, prints the overview to stdout.

    Returns:
        Whatever ``pdflinkcheck.report.run_report_and_call_exports`` returns
        for these arguments.
    """
    from pdflinkcheck.report import run_report_and_call_exports

    # Forward every option by keyword so the call does not depend on the
    # positional order of the underlying function's parameters.
    report_options = {
        "pdf_path": pdf_path,
        "export_format": export_format,
        "pdf_library": pdf_library,
        "print_bool": print_bool,
    }
    return run_report_and_call_exports(**report_options)
|
|
29
|
+
|
|
30
|
+
# --- pypdf ---
|
|
31
|
+
def analyze_pdf_pypdf(path):
    """Analyze a PDF using the lightweight pypdf engine and return a normalized dictionary.

    The engine module is imported lazily, on first call.

    Args:
        path: Path to the PDF file; passed unchanged to
            ``pdflinkcheck.analysis_pypdf.analyze_pdf``.

    Returns:
        The result of ``pdflinkcheck.analysis_pypdf.analyze_pdf(path)``.

    Raises:
        ImportError: If ``pdflinkcheck.analysis_pypdf`` cannot be imported.
            The underlying import failure is attached as ``__cause__``.

    See pdflinkcheck.analysis_pypdf for full details.
    """
    try:
        from pdflinkcheck.analysis_pypdf import analyze_pdf as _analyze
    except ImportError as err:
        # Chain explicitly so the real reason for the failed import stays
        # visible in the traceback.
        raise ImportError(
            "pypdf engine is not installed. "
            "Install pypdf to enable pypdf support."
        ) from err
    return _analyze(path)
|
|
44
|
+
|
|
45
|
+
# --- PyMuPDF ---
|
|
46
|
+
def analyze_pdf_pymupdf(path):
    """Analyze a PDF using the AGPL3-licensed PyMuPDF engine and return a normalized dictionary.

    The engine module is imported lazily, on first call.

    Args:
        path: Path to the PDF file; passed unchanged to
            ``pdflinkcheck.analysis_pymupdf.analyze_pdf``.

    Returns:
        The result of ``pdflinkcheck.analysis_pymupdf.analyze_pdf(path)``.

    Raises:
        ImportError: If ``pdflinkcheck.analysis_pymupdf`` cannot be imported.
            The underlying import failure is attached as ``__cause__``.

    See pdflinkcheck.analysis_pymupdf for full details.
    """
    try:
        from pdflinkcheck.analysis_pymupdf import analyze_pdf as _analyze
    except ImportError as err:
        # Chain explicitly so the real reason for the failed import stays
        # visible in the traceback.
        raise ImportError(
            "PyMuPDF engine is not installed. "
            "Install with the [mupdf] extra to enable PyMuPDF support."
        ) from err
    return _analyze(path)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# --- PDFium ---
|
|
62
|
+
|
|
63
|
+
def analyze_pdf_pdfium(path):
    """Analyze a PDF using the PDFium engine and return a normalized dictionary.

    The engine module is imported lazily, on first call.

    Args:
        path: Path to the PDF file; passed unchanged to
            ``pdflinkcheck.analysis_pdfium.analyze_pdf``.

    Returns:
        The result of ``pdflinkcheck.analysis_pdfium.analyze_pdf(path)``.

    Raises:
        ImportError: If ``pdflinkcheck.analysis_pdfium`` cannot be imported.
            The underlying import failure is attached as ``__cause__``.

    See pdflinkcheck.analysis_pdfium for full details.
    """
    try:
        from pdflinkcheck.analysis_pdfium import analyze_pdf as _analyze
    except ImportError as err:
        # Chain explicitly so the real reason for the failed import stays
        # visible in the traceback.
        raise ImportError(
            "PDFium engine is not installed. "
            "Install with the [pdfium] extra to enable pdfium support."
        ) from err
    return _analyze(path)
|
|
76
|
+
|
|
77
|
+
# -----------------------------
|
|
78
|
+
# GUI easter egg
|
|
79
|
+
# -----------------------------
|
|
25
80
|
# For the kids. This is what I wanted when learning Python in a mysterious new REPL.
|
|
26
81
|
# Is this Pythonic? No. Oh well. PEP 8, PEP 20.
|
|
27
82
|
# Why is this not Pythonic? Devs expect no side effects when importing library functions.
|
|
@@ -30,32 +85,47 @@ _gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
|
|
|
30
85
|
_load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
|
|
31
86
|
if _load_gui_func:
|
|
32
87
|
try:
|
|
88
|
+
print("Easter egg, attemping.")
|
|
33
89
|
import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
|
|
90
|
+
print(f"pyhabitat.tkinter_is_available() = {_pyhabitat.tkinter_is_available()}")
|
|
34
91
|
if _pyhabitat.tkinter_is_available():
|
|
35
92
|
from pdflinkcheck.gui import start_gui
|
|
93
|
+
print("Success: pdflinkcheck.start_gui() function loaded as top-level pmlibrary function.")
|
|
36
94
|
except ImportError:
|
|
37
95
|
# Optional: log or ignore silently
|
|
38
96
|
print("start_gui() not imported")
|
|
39
97
|
|
|
98
|
+
|
|
99
|
+
|
|
40
100
|
# Breadcrumbs, for stumbling upon.
|
|
41
101
|
if _load_gui_func:
|
|
42
102
|
__pdflinkcheck_gui_easteregg_enabled__ = True
|
|
43
103
|
else:
|
|
44
104
|
__pdflinkcheck_gui_easteregg_enabled__ = False
|
|
45
105
|
|
|
106
|
+
|
|
107
|
+
# -----------------------------
|
|
108
|
+
# Public API
|
|
109
|
+
# -----------------------------
|
|
46
110
|
# Define __all__ such that the library functions are self documenting.
|
|
47
111
|
__all__ = [
|
|
48
112
|
"run_report",
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
"extract_toc_pypdf",
|
|
53
|
-
#"start_gui" if _load_gui_func else None,
|
|
54
|
-
"dev",
|
|
113
|
+
"analyze_pdf_pymupdf",
|
|
114
|
+
"analyze_pdf_pypdf",
|
|
115
|
+
"analyze_pdf_pdfium",
|
|
55
116
|
]
|
|
117
|
+
|
|
118
|
+
# Handle the Easter Egg export
|
|
56
119
|
# Only advertise start_gui when the guarded import earlier in this module
# actually bound the name. The env flag alone is not enough: if tkinter is
# unavailable or the import failed, exporting "start_gui" would make
# `from pdflinkcheck import *` raise AttributeError.
if _load_gui_func and "start_gui" in globals():
    __all__.append("start_gui")
|
|
58
121
|
|
|
122
|
+
# Handle dev module if you want it public
|
|
123
|
+
try:
|
|
124
|
+
from pdflinkcheck import dev
|
|
125
|
+
__all__.append("dev")
|
|
126
|
+
except ImportError:
|
|
127
|
+
pass
|
|
128
|
+
|
|
59
129
|
# 4. THE CLEANUP (This removes items from dir())
|
|
60
130
|
del _os
|
|
61
131
|
del _gui_easteregg_env_flag
|
pdflinkcheck/__main__.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# src/pdflinkcheck/analysis_pdfium.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import ctypes
|
|
4
|
+
from typing import List, Dict, Any
|
|
5
|
+
from pdflinkcheck.helpers import PageRef
|
|
6
|
+
|
|
7
|
+
from pdflinkcheck.environment import pdfium_is_available
|
|
8
|
+
from pdflinkcheck.helpers import PageRef
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
if pdfium_is_available():
|
|
12
|
+
import pypdfium2 as pdfium
|
|
13
|
+
import pypdfium2.raw as pdfium_c
|
|
14
|
+
|
|
15
|
+
else:
|
|
16
|
+
pdfium = None
|
|
17
|
+
pdfium_c = None
|
|
18
|
+
except ImportError:
|
|
19
|
+
pdfium = None
|
|
20
|
+
pdfium_c = None
|
|
21
|
+
|
|
22
|
+
def _pdfium_toc(doc) -> List[Dict[str, Any]]:
    """Collect de-duplicated outline (bookmark) entries from *doc*."""
    toc_list = []
    seen_toc = set()
    for item in doc.get_toc():
        title = item.get_title() if hasattr(item, "get_title") else ""
        dest = item.get_dest()
        page_idx = PageRef.from_index(dest.get_index()).machine if dest else 0
        # Skip entries that carry neither a title nor a non-zero target.
        if not (title or page_idx > 0):
            continue
        key = (title, page_idx)
        if key in seen_toc:
            continue
        seen_toc.add(key)
        toc_list.append({"level": item.level + 1, "title": title, "target_page": page_idx})
    return toc_list


def _pdfium_web_links(text_page, source_page) -> List[Dict[str, Any]]:
    """Return the web links PDFium reports for *text_page*."""
    # NOTE(review): per the PDFium headers, FPDFLink_LoadWebLinks reports URLs
    # found in the page text, not URI link annotations -- confirm that this is
    # the intended source of "External (URI)" entries.
    found = []
    pagelink_raw = pdfium_c.FPDFLink_LoadWebLinks(text_page.raw)
    if not pagelink_raw:
        return found
    try:
        count = pdfium_c.FPDFLink_CountWebLinks(pagelink_raw)
        for i in range(count):
            # First call with no buffer asks for the required length, which
            # is counted in UTF-16 code units and includes the terminator.
            buflen = pdfium_c.FPDFLink_GetURL(pagelink_raw, i, None, 0)
            url = ""
            if buflen > 0:
                buffer = (ctypes.c_uint16 * buflen)()
                pdfium_c.FPDFLink_GetURL(pagelink_raw, i, buffer, buflen)
                # Drop the trailing NUL code unit before decoding.
                url = ctypes.string_at(buffer, (buflen - 1) * 2).decode('utf-16le')

            left, top, right, bottom = (ctypes.c_double() for _ in range(4))
            pdfium_c.FPDFLink_GetRect(
                pagelink_raw, i, 0,
                ctypes.byref(left), ctypes.byref(top), ctypes.byref(right), ctypes.byref(bottom),
            )

            found.append({
                'page': source_page,
                # Stored as [left, bottom, right, top].
                'rect': [left.value, bottom.value, right.value, top.value],
                'link_text': text_page.get_text_bounded(
                    left=left.value, top=top.value, right=right.value, bottom=bottom.value
                ).strip() or url,
                'type': 'External (URI)',
                'url': url,
                'target': url,
                'source_kind': 'pypdfium2_weblink'
            })
    finally:
        # Release the web-link handle even if extraction fails part-way.
        pdfium_c.FPDFLink_CloseWebLinks(pagelink_raw)
    return found


def _pdfium_goto_links(doc, page, text_page, source_page) -> List[Dict[str, Any]]:
    """Return link annotations on *page* that carry a direct destination."""
    found = []
    pos = 0
    while True:
        annot_raw = pdfium_c.FPDFPage_GetAnnot(page.raw, pos)
        if not annot_raw:
            # A null handle ends the enumeration.
            break
        try:
            if pdfium_c.FPDFAnnot_GetSubtype(annot_raw) == pdfium_c.FPDF_ANNOT_LINK:
                fs_rect = pdfium_c.FS_RECTF()
                pdfium_c.FPDFAnnot_GetRect(annot_raw, fs_rect)

                link_annot = pdfium_c.FPDFAnnot_GetLink(annot_raw)
                dest = pdfium_c.FPDFLink_GetDest(doc.raw, link_annot)

                if dest:
                    dest_idx = pdfium_c.FPDFDest_GetDestPageIndex(doc.raw, dest)
                    dest_ref = PageRef.from_index(dest_idx)
                    found.append({
                        'page': source_page,
                        'rect': [fs_rect.left, fs_rect.bottom, fs_rect.right, fs_rect.top],
                        'link_text': text_page.get_text_bounded(
                            left=fs_rect.left, top=fs_rect.top, right=fs_rect.right, bottom=fs_rect.bottom
                        ).strip(),
                        'type': 'Internal (GoTo/Dest)',
                        'destination_page': dest_ref.machine,
                        'target': dest_ref.machine,
                        'source_kind': 'pypdfium2_annot'
                    })
        finally:
            # Every handle returned by FPDFPage_GetAnnot must be released
            # with FPDFPage_CloseAnnot, otherwise it leaks.
            pdfium_c.FPDFPage_CloseAnnot(annot_raw)
        pos += 1
    return found


def analyze_pdf(path: str) -> Dict[str, Any]:
    """Extract links, outline entries and a page count from a PDF via pypdfium2.

    Args:
        path: Path to the PDF file, passed to ``pdfium.PdfDocument``.

    Returns:
        A dict with three keys: ``"links"`` (list of link dicts, web links
        first and then annotation links for each page in page order),
        ``"toc"`` (list of outline entries) and ``"file_ov"`` (dict holding
        ``"total_pages"``).

    Raises:
        ImportError: If pypdfium2 is not available.
    """
    from contextlib import closing

    # Guard the entry point: the module-level import may have been skipped.
    if not pdfium_is_available() or pdfium is None:
        raise ImportError(
            "pypdfium2 is not installed. "
            "\nInstall it with: \n\tpip install pdflinkcheck[pdfium] \n\t OR \n\t uv sync --extra pdfium"
        )

    doc = pdfium.PdfDocument(path)
    try:
        file_ov = {"total_pages": len(doc)}

        # 1. TOC extraction
        toc_list = _pdfium_toc(doc)

        # 2. Link enumeration, page by page
        links = []
        for page_index in range(len(doc)):
            # The context managers exit in reverse order, so the text page is
            # closed before the page it belongs to, even when an error occurs.
            with closing(doc.get_page(page_index)) as page, closing(page.get_textpage()) as text_page:
                source_page = PageRef.from_index(page_index).machine
                links.extend(_pdfium_web_links(text_page, source_page))
                links.extend(_pdfium_goto_links(doc, page, text_page, source_page))
    finally:
        doc.close()

    return {"links": links, "toc": toc_list, "file_ov": file_ov}
|
|
125
|
+
|
|
126
|
+
if __name__ == "__main__":
    import json
    import sys

    # Manual smoke test: analyze the PDF named on the command line, falling
    # back to the original hard-coded sample file when no argument is given.
    filename = sys.argv[1] if len(sys.argv) > 1 else "temOM.pdf"
    results = analyze_pdf(filename)
    print(json.dumps(results, indent=2))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
|
-
# pdflinkcheck/
|
|
4
|
-
|
|
3
|
+
# pdflinkcheck/analysis_pymupdf.py
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
import sys
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
import logging
|
|
@@ -10,6 +10,8 @@ from typing import Dict, Any, Optional, List
|
|
|
10
10
|
logging.getLogger("fitz").setLevel(logging.ERROR)
|
|
11
11
|
|
|
12
12
|
from pdflinkcheck.environment import pymupdf_is_available
|
|
13
|
+
from pdflinkcheck.helpers import PageRef
|
|
14
|
+
|
|
13
15
|
try:
|
|
14
16
|
if pymupdf_is_available():
|
|
15
17
|
import fitz # PyMuPDF
|
|
@@ -22,8 +24,31 @@ except ImportError:
|
|
|
22
24
|
Inspect target PDF for both URI links and for GoTo links.
|
|
23
25
|
"""
|
|
24
26
|
|
|
27
|
+
def analyze_pdf(pdf_path: str) -> Dict[str, Any]:
    """Open a PDF with PyMuPDF and gather its links, TOC and overview data.

    Args:
        pdf_path: Path to the PDF file, passed to ``fitz.open``.

    Returns:
        A dict with the keys ``"links"``, ``"toc"`` and ``"file_ov"``. When
        the document cannot be opened, the error is printed to stderr and
        the dict is returned with empty values. Otherwise ``"file_ov"``
        holds ``"total_pages"`` and ``"pdf_name"``.
    """
    data = {"links": [], "toc": [], "file_ov": {}}

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Best-effort: report the failure and hand back the empty skeleton.
        # stderr matches the other error messages in this module.
        print(f"fitz.open() failed: {e}", file=sys.stderr)
        return data

    try:
        # Read everything that needs an open document BEFORE extracting
        # links: extract_links_pymupdf() closes the document when it is done,
        # and a closed document can no longer report its page count or TOC.
        page_count = doc.page_count
        structural_toc = extract_toc_pymupdf(doc)
        extracted_links = extract_links_pymupdf(doc)
    finally:
        # Release the document if no helper has already done so.
        if not doc.is_closed:
            doc.close()

    data["links"] = extracted_links
    data["toc"] = structural_toc
    data["file_ov"]["total_pages"] = page_count
    data["file_ov"]["pdf_name"] = Path(pdf_path).name
    return data
|
|
48
|
+
|
|
49
|
+
|
|
25
50
|
# Helper function: Prioritize 'from'
|
|
26
|
-
def
|
|
51
|
+
def _get_link_rect(link_dict):
|
|
27
52
|
"""
|
|
28
53
|
Retrieves the bounding box for the link using the reliable 'from' key
|
|
29
54
|
provided by PyMuPDF's link dictionary.
|
|
@@ -49,6 +74,19 @@ def get_link_rect(link_dict):
|
|
|
49
74
|
return None
|
|
50
75
|
|
|
51
76
|
def get_anchor_text(page, link_rect):
|
|
77
|
+
"""
|
|
78
|
+
Extracts text content using the link's bounding box coordinates.
|
|
79
|
+
The bounding box is slightly expanded to ensure full characters are captured.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
page: The fitz.Page object where the link is located.
|
|
83
|
+
link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
|
|
84
|
+
link's bounding box.
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
The cleaned, extracted text string, or a placeholder message
|
|
88
|
+
if no text is found or if an error occurs.
|
|
89
|
+
"""
|
|
52
90
|
if not link_rect:
|
|
53
91
|
return "N/A: Missing Rect"
|
|
54
92
|
|
|
@@ -86,57 +124,6 @@ def get_anchor_text(page, link_rect):
|
|
|
86
124
|
except Exception:
|
|
87
125
|
return "N/A: Rect Error"
|
|
88
126
|
|
|
89
|
-
def get_anchor_text_stable(page, link_rect):
|
|
90
|
-
"""
|
|
91
|
-
Extracts text content using the link's bounding box coordinates.
|
|
92
|
-
The bounding box is slightly expanded to ensure full characters are captured.
|
|
93
|
-
|
|
94
|
-
Args:
|
|
95
|
-
page: The fitz.Page object where the link is located.
|
|
96
|
-
link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
|
|
97
|
-
link's bounding box.
|
|
98
|
-
|
|
99
|
-
Returns:
|
|
100
|
-
The cleaned, extracted text string, or a placeholder message
|
|
101
|
-
if no text is found or if an error occurs.
|
|
102
|
-
"""
|
|
103
|
-
if not link_rect:
|
|
104
|
-
return "N/A: Missing Rect"
|
|
105
|
-
|
|
106
|
-
try:
|
|
107
|
-
# 1. Convert the coordinate tuple back to a fitz.Rect object
|
|
108
|
-
rect = fitz.Rect(link_rect)
|
|
109
|
-
|
|
110
|
-
# --- CRITICAL STEP: Check for invalid/empty rect AFTER conversion ---
|
|
111
|
-
# If the rect is invalid (e.g., width or height is <= 0), skip it
|
|
112
|
-
# Note: fitz.Rect will often auto-normalize, but this explicit check is safer.
|
|
113
|
-
if rect.is_empty or rect.width <= 0 or rect.height <= 0:
|
|
114
|
-
return "N/A: Rect Error (Zero/Negative Dimension)"
|
|
115
|
-
|
|
116
|
-
# 2. Expand the rect slightly to capture full characters (1 unit in each direction)
|
|
117
|
-
# This method avoids the proprietary/unstable 'from_expanded' or 'from_rect' methods.
|
|
118
|
-
expanded_rect = fitz.Rect(
|
|
119
|
-
rect.x0 - 1,
|
|
120
|
-
rect.y0 - 1,
|
|
121
|
-
rect.x1 + 1,
|
|
122
|
-
rect.y1 + 1
|
|
123
|
-
)
|
|
124
|
-
|
|
125
|
-
# 3. Get the text within the expanded bounding box
|
|
126
|
-
anchor_text = page.get_textbox(expanded_rect)
|
|
127
|
-
|
|
128
|
-
# 4. Clean up whitespace and non-printing characters
|
|
129
|
-
cleaned_text = " ".join(anchor_text.split())
|
|
130
|
-
|
|
131
|
-
if cleaned_text:
|
|
132
|
-
return cleaned_text
|
|
133
|
-
else:
|
|
134
|
-
return "N/A: No Visible Text"
|
|
135
|
-
|
|
136
|
-
except Exception:
|
|
137
|
-
# Fallback for unexpected errors in rect conversion or retrieval
|
|
138
|
-
return "N/A: Rect Error"
|
|
139
|
-
|
|
140
127
|
def analyze_toc_fitz(doc):
|
|
141
128
|
"""
|
|
142
129
|
Extracts the structural Table of Contents (PDF Bookmarks/Outline)
|
|
@@ -149,23 +136,28 @@ def analyze_toc_fitz(doc):
|
|
|
149
136
|
A list of dictionaries, where each dictionary represents a TOC entry
|
|
150
137
|
with 'level', 'title', and 'target_page' (1-indexed).
|
|
151
138
|
"""
|
|
139
|
+
|
|
152
140
|
toc = doc.get_toc()
|
|
153
141
|
toc_data = []
|
|
154
142
|
|
|
155
143
|
for level, title, page_num in toc:
|
|
156
144
|
# fitz pages are 1-indexed for TOC!
|
|
145
|
+
# We know fitz gives us a human number.
|
|
146
|
+
# We convert it to a physical index for our internal storage.
|
|
147
|
+
# page_num is 1 (Human). We normalize to 0 (Physical).
|
|
148
|
+
ref = PageRef.from_human(page_num)
|
|
157
149
|
toc_data.append({
|
|
158
150
|
'level': level,
|
|
159
151
|
'title': title,
|
|
160
|
-
'target_page':
|
|
152
|
+
#'target_page': ref.index
|
|
153
|
+
'target_page': ref.machine
|
|
161
154
|
})
|
|
162
155
|
|
|
163
156
|
return toc_data
|
|
164
157
|
|
|
165
|
-
|
|
166
158
|
# 2. Updated Main Inspection Function to Include Text Extraction
|
|
167
159
|
#def inspect_pdf_hyperlinks_fitz(pdf_path):
|
|
168
|
-
def extract_toc_pymupdf(
|
|
160
|
+
def extract_toc_pymupdf(doc):
|
|
169
161
|
"""
|
|
170
162
|
Opens a PDF, iterates through all pages and extracts the structural table of contents (TOC/bookmarks).
|
|
171
163
|
|
|
@@ -176,7 +168,7 @@ def extract_toc_pymupdf(pdf_path):
|
|
|
176
168
|
A list of dictionaries representing the structural TOC/bookmarks.
|
|
177
169
|
"""
|
|
178
170
|
try:
|
|
179
|
-
|
|
171
|
+
|
|
180
172
|
structural_toc = analyze_toc_fitz(doc)
|
|
181
173
|
except Exception as e:
|
|
182
174
|
print(f"An error occurred: {e}", file=sys.stderr)
|
|
@@ -211,125 +203,91 @@ def serialize_fitz_object(obj):
|
|
|
211
203
|
return obj
|
|
212
204
|
|
|
213
205
|
|
|
214
|
-
def extract_links_pymupdf(
|
|
215
|
-
"""
|
|
216
|
-
Opens a PDF, iterates through all pages and extracts all link annotations.
|
|
217
|
-
It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
|
|
218
|
-
|
|
219
|
-
Args:
|
|
220
|
-
pdf_path: The file system path (str) to the target PDF document.
|
|
221
|
-
|
|
222
|
-
Returns:
|
|
223
|
-
A list of dictionaries, where each dictionary is a comprehensive
|
|
224
|
-
representation of an active hyperlink found in the PDF.
|
|
225
|
-
|
|
226
|
-
"""
|
|
206
|
+
def extract_links_pymupdf(doc):
|
|
227
207
|
links_data = []
|
|
228
|
-
try:
|
|
229
|
-
|
|
208
|
+
try:
|
|
209
|
+
# This represents the maximum valid 0-index in the doc
|
|
210
|
+
last_page_ref = PageRef.from_pymupdf_total_page_count(doc.page_count)
|
|
211
|
+
|
|
212
|
+
#print(last_page_ref) # Output: "358" (Because of __str__)
|
|
213
|
+
#print(int(last_page_ref)) # Output: 357 (Because of __int__)
|
|
230
214
|
|
|
231
215
|
for page_num in range(doc.page_count):
|
|
232
216
|
page = doc.load_page(page_num)
|
|
233
|
-
|
|
234
|
-
for link in page.get_links():
|
|
235
|
-
|
|
236
|
-
page_obj = doc.load_page(page_num)
|
|
237
|
-
link_rect = get_link_rect(link)
|
|
238
|
-
|
|
239
|
-
rect_obj = link.get("from")
|
|
240
|
-
xref = link.get("xref")
|
|
241
|
-
#print(f"rect_obj = {rect_obj}")
|
|
242
|
-
#print(f"xref = {xref}")
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
# --- Examples of various keys associated with various link instances ---
|
|
246
|
-
#print(f"keys: list(link) = {list(link)}")
|
|
247
|
-
# keys: list(link) = ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
|
|
248
|
-
# keys: list(link) = ['kind', 'xref', 'from', 'uri', 'id']
|
|
249
|
-
# keys: list(link) = ['kind', 'xref', 'from', 'page', 'view', 'id']
|
|
217
|
+
source_ref = PageRef.from_index(page_num)
|
|
250
218
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
# 2. Extract the target and kind
|
|
255
|
-
target = ""
|
|
256
|
-
kind = link.get('kind')
|
|
257
|
-
|
|
219
|
+
for link in page.get_links():
|
|
220
|
+
link_rect = _get_link_rect(link)
|
|
221
|
+
anchor_text = get_anchor_text(page, link_rect)
|
|
258
222
|
|
|
259
223
|
link_dict = {
|
|
260
|
-
'page':
|
|
224
|
+
'page': source_ref.machine,
|
|
261
225
|
'rect': link_rect,
|
|
262
226
|
'link_text': anchor_text,
|
|
263
|
-
'xref':xref
|
|
227
|
+
'xref': link.get("xref")
|
|
264
228
|
}
|
|
265
229
|
|
|
266
|
-
|
|
267
|
-
# Use the clean serialize_fitz_object() helper function on all keys that might contain objects
|
|
230
|
+
kind = link.get('kind')
|
|
268
231
|
destination_view = serialize_fitz_object(link.get('to'))
|
|
232
|
+
p_index = link.get('page') # expected to be human-facing, per PyMuPDF's known quirks
|
|
233
|
+
|
|
234
|
+
# --- CASE 1: INTERNAL JUMPS (GoTo) ---
|
|
235
|
+
if p_index is not None:
|
|
269
236
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
237
|
+
# Ensure we are working with an integer
|
|
238
|
+
raw_pymupdf_idx = int(p_index)
|
|
239
|
+
corrected_machine_idx = PageRef.corrected_down(raw_pymupdf_idx).index
|
|
240
|
+
|
|
241
|
+
# Logic: Normalize to 0-index and store as int
|
|
242
|
+
idx = min(corrected_machine_idx, int(last_page_ref))
|
|
243
|
+
#print(f"DEBUG: Link Text: {anchor_text} | Raw p_index: {p_index}")
|
|
244
|
+
#print(f"[DEBUG] idx: {idx}")
|
|
245
|
+
dest_ref = PageRef.from_index(idx) # does not impact the value
|
|
275
246
|
|
|
276
|
-
if link['kind'] == fitz.LINK_URI:
|
|
277
|
-
target = link.get('uri', 'URI (Unknown Target)')
|
|
278
247
|
link_dict.update({
|
|
279
|
-
'
|
|
280
|
-
'
|
|
281
|
-
'target':
|
|
248
|
+
'destination_page': dest_ref.machine,
|
|
249
|
+
'destination_view': destination_view,
|
|
250
|
+
'target': dest_ref.machine, # INT (MACHINE INDEX)
|
|
282
251
|
})
|
|
252
|
+
|
|
253
|
+
if kind == fitz.LINK_GOTO:
|
|
254
|
+
link_dict['type'] = 'Internal (GoTo/Dest)'
|
|
255
|
+
else:
|
|
256
|
+
link_dict['type'] = 'Internal (Resolved Action)'
|
|
257
|
+
link_dict['source_kind'] = kind
|
|
283
258
|
|
|
284
|
-
|
|
285
|
-
|
|
259
|
+
# --- CASE 2: EXTERNAL URIs ---
|
|
260
|
+
elif kind == fitz.LINK_URI:
|
|
261
|
+
uri = link.get('uri', 'URI (Unknown Target)')
|
|
286
262
|
link_dict.update({
|
|
287
|
-
'type': '
|
|
288
|
-
'
|
|
289
|
-
'
|
|
290
|
-
'target': target
|
|
263
|
+
'type': 'External (URI)',
|
|
264
|
+
'url': uri,
|
|
265
|
+
'target': uri # STRING (URL)
|
|
291
266
|
})
|
|
292
267
|
|
|
293
|
-
|
|
268
|
+
# --- CASE 3: REMOTE PDF REFERENCES ---
|
|
269
|
+
elif kind == fitz.LINK_GOTOR:
|
|
270
|
+
remote_file = link.get('file', 'Remote File')
|
|
294
271
|
link_dict.update({
|
|
295
272
|
'type': 'Remote (GoToR)',
|
|
296
273
|
'remote_file': link.get('file'),
|
|
297
|
-
'
|
|
274
|
+
'target': remote_file # STRING (File Path)
|
|
298
275
|
})
|
|
299
276
|
|
|
300
|
-
|
|
301
|
-
target = f"Page {target_page_num_reported}"
|
|
302
|
-
link_dict.update({
|
|
303
|
-
'type': 'Internal (Resolved Action)',
|
|
304
|
-
'destination_page': target_page_num_reported,
|
|
305
|
-
'destination_view': destination_view,
|
|
306
|
-
'source_kind': link.get('kind'),
|
|
307
|
-
'target': target
|
|
308
|
-
})
|
|
309
|
-
|
|
277
|
+
# --- CASE 4: OTHERS ---
|
|
310
278
|
else:
|
|
311
|
-
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
312
279
|
link_dict.update({
|
|
313
280
|
'type': 'Other Action',
|
|
314
|
-
'action_kind':
|
|
315
|
-
'target':
|
|
281
|
+
'action_kind': kind,
|
|
282
|
+
'target': 'Unknown' # STRING
|
|
316
283
|
})
|
|
317
284
|
|
|
318
|
-
## --- General Serialization Cleaner ---
|
|
319
|
-
#for key, value in link_dict.items():
|
|
320
|
-
# if hasattr(value, 'rect') and hasattr(value, 'point'):
|
|
321
|
-
# # This handles Rect and Point objects that may slip through
|
|
322
|
-
# link_dict[key] = str(value)
|
|
323
|
-
## --- End Cleaner ---
|
|
324
|
-
|
|
325
285
|
links_data.append(link_dict)
|
|
326
|
-
|
|
327
286
|
doc.close()
|
|
328
287
|
except Exception as e:
|
|
329
288
|
print(f"An error occurred: {e}", file=sys.stderr)
|
|
330
289
|
return links_data
|
|
331
290
|
|
|
332
|
-
|
|
333
291
|
def call_stable():
|
|
334
292
|
"""
|
|
335
293
|
Placeholder function for command-line execution (e.g., in __main__).
|