pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/__init__.py CHANGED
@@ -1,27 +1,82 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
1
3
  # src/pdflinkcheck/__init__.py
2
4
  """
3
- # License information
4
5
  pdflinkcheck - A PDF Link Checker
5
6
 
6
- Copyright (C) 2025 George Clayton Bennett
7
-
8
7
  Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
9
8
 
10
- This program is free software: You can redistribute it and/or modify
11
- it under the terms of the GNU Affero General Public License as
12
- published by the Free Software Foundation, either version 3 of the
13
- License, or (at your option) any later version.
14
-
15
- The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
16
9
  """
10
+ from __future__ import annotations
17
11
  import os as _os
18
12
 
19
13
  # Library functions
20
- from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
21
- from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
22
- from pdflinkcheck.report import run_report_and_call_exports as run_report
23
14
  #from pdflinkcheck import dev
24
15
 
16
+ # Lazy-loaded orchestrator
17
+ def run_report(pdf_path: str, export_format: str = "JSON", pdf_library: str = "auto", print_bool: bool = True):
18
+ """
19
+ Run a full link check report on a PDF file.
20
+
21
+ Args:
22
+ pdf_path: Path to the PDF file.
23
+ export_format: "JSON", "TXT", or both (e.g., "JSON,TXT").
24
+ pdf_library: "auto", "pdfium", "pymupdf", or "pypdf".
25
+ print_bool: If True, prints the overview to stdout.
26
+ """
27
+ from pdflinkcheck.report import run_report_and_call_exports as _run
28
+ return _run(pdf_path=pdf_path, export_format=export_format, pdf_library=pdf_library, print_bool=print_bool)
29
+
30
+ # --- pypdf ---
31
+ def analyze_pdf_pypdf(path):
32
+ try:
33
+ from pdflinkcheck.analysis_pypdf import analyze_pdf as _analyze
34
+ except ImportError:
35
+ raise ImportError(
36
+ "pypdf engine is not installed. "
37
+ "Install pypdf to enable pypdf support."
38
+ )
39
+ return _analyze(path)
40
+ analyze_pdf_pypdf.__doc__ = (
41
+ "Analyze a PDF using the lightweight pypdf engine and return a normalized dictionary.\n\n"
42
+ "See pdflinkcheck.analyze_pypdf for full details."
43
+ )
44
+
45
+ # --- PyMuPDF ---
46
+ def analyze_pdf_pymupdf(path):
47
+ try:
48
+ from pdflinkcheck.analysis_pymupdf import analyze_pdf as _analyze
49
+ except ImportError:
50
+ raise ImportError(
51
+ "PyMuPDF engine is not installed. "
52
+ "Install with the [mupdf] extra to enable PyMuPDF support."
53
+ )
54
+ return _analyze(path)
55
+ analyze_pdf_pymupdf.__doc__ = (
56
+ "Analyze a PDF using the AGPL3-licensed PyMuPDF engine and return a normalized dictionary.\n\n"
57
+ "See pdflinkcheck.analyze_pymupdf for full details."
58
+ )
59
+
60
+
61
+ # --- PDFium ---
62
+
63
+ def analyze_pdf_pdfium(path):
64
+ try:
65
+ from pdflinkcheck.analysis_pdfium import analyze_pdf as _analyze
66
+ except ImportError:
67
+ raise ImportError(
68
+ "PDFium engine is not installed. "
69
+ "Install with the [pdfium] extra to enable pdfium support."
70
+ )
71
+ return _analyze(path)
72
+ analyze_pdf_pdfium.__doc__ = (
73
+ "Analyze a PDF using the PDFium engine and return a normalized dictionary.\n\n"
74
+ "See pdflinkcheck.analyze_pdfium for full details."
75
+ )
76
+
77
+ # -----------------------------
78
+ # GUI easter egg
79
+ # -----------------------------
25
80
  # For the kids. This is what I wanted when learning Python in a mysterious new REPL.
26
81
  # Is this Pythonic? No. Oh well. PEP 8, PEP 20.
27
82
  # Why is this not Pythonic? Devs expect no side effects when importing library functions.
@@ -30,32 +85,47 @@ _gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
30
85
  _load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
31
86
  if _load_gui_func:
32
87
  try:
88
+ print("Easter egg, attemping.")
33
89
  import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
90
+ print(f"pyhabitat.tkinter_is_available() = {_pyhabitat.tkinter_is_available()}")
34
91
  if _pyhabitat.tkinter_is_available():
35
92
  from pdflinkcheck.gui import start_gui
93
+ print("Success: pdflinkcheck.start_gui() function loaded as top-level pmlibrary function.")
36
94
  except ImportError:
37
95
  # Optional: log or ignore silently
38
96
  print("start_gui() not imported")
39
97
 
98
+
99
+
40
100
  # Breadcrumbs, for stumbling upon.
41
101
  if _load_gui_func:
42
102
  __pdflinkcheck_gui_easteregg_enabled__ = True
43
103
  else:
44
104
  __pdflinkcheck_gui_easteregg_enabled__ = False
45
105
 
106
+
107
+ # -----------------------------
108
+ # Public API
109
+ # -----------------------------
46
110
  # Define __all__ such that the library functions are self documenting.
47
111
  __all__ = [
48
112
  "run_report",
49
- "extract_links_pymupdf",
50
- "extract_toc_pymupdf",
51
- "extract_links_pypdf",
52
- "extract_toc_pypdf",
53
- #"start_gui" if _load_gui_func else None,
54
- "dev",
113
+ "analyze_pdf_pymupdf",
114
+ "analyze_pdf_pypdf",
115
+ "analyze_pdf_pdfium",
55
116
  ]
117
+
118
+ # Handle the Easter Egg export
56
119
  if _load_gui_func:
57
120
  __all__.append("start_gui")
58
121
 
122
+ # Handle dev module if you want it public
123
+ try:
124
+ from pdflinkcheck import dev
125
+ __all__.append("dev")
126
+ except ImportError:
127
+ pass
128
+
59
129
  # 4. THE CLEANUP (This removes items from dir())
60
130
  del _os
61
131
  del _gui_easteregg_env_flag
@@ -0,0 +1,6 @@
1
+ # src/pdflinkcheck/__main__.py
2
+ from __future__ import annotations
3
+ from pdflinkcheck.cli import app
4
+
5
+ if __name__ == "__main__":
6
+ app()
@@ -0,0 +1,131 @@
1
+ # src/pdflinkcheck/analysis_pdfium.py
2
+ from __future__ import annotations
3
+ import ctypes
4
+ from typing import List, Dict, Any
5
+ from pdflinkcheck.helpers import PageRef
6
+
7
+ from pdflinkcheck.environment import pdfium_is_available
8
+ from pdflinkcheck.helpers import PageRef
9
+
10
+ try:
11
+ if pdfium_is_available():
12
+ import pypdfium2 as pdfium
13
+ import pypdfium2.raw as pdfium_c
14
+
15
+ else:
16
+ pdfium = None
17
+ pdfium_c = None
18
+ except ImportError:
19
+ pdfium = None
20
+ pdfium_c = None
21
+
22
+ def analyze_pdf(path: str) -> Dict[str, Any]:
23
+ # Guard the entry point: fail early with install instructions when pypdfium2 is unavailable
24
+ if not pdfium_is_available() or pdfium is None:
25
+ raise ImportError(
26
+ "pypdfium2 is not installed. "
27
+ "\nInstall it with: \n\tpip install pdflinkcheck[pdfium] \n\t OR \n\t uv sync --extra pdfium"
28
+ )
29
+ doc = pdfium.PdfDocument(path)
30
+
31
+ total_pages = len(doc) # or doc.page_count
32
+
33
+ links = []
34
+ toc_list = []
35
+ file_ov = {}
36
+ seen_toc = set()
37
+
38
+ file_ov["total_pages"] = total_pages
39
+
40
+ # 1. TOC Extraction (Matches PyMuPDF logic)
41
+ for item in doc.get_toc():
42
+ title = item.get_title() if hasattr(item, "get_title") else ""
43
+ dest = item.get_dest()
44
+ page_idx = PageRef.from_index(dest.get_index()).machine if dest else 0
45
+ if title or page_idx > 0:
46
+ key = (title, page_idx)
47
+ if key not in seen_toc:
48
+ toc_list.append({"level": item.level + 1, "title": title, "target_page": page_idx})
49
+ seen_toc.add(key)
50
+
51
+ # 2. Link Enumeration
52
+ for page_index in range(len(doc)):
53
+ page = doc.get_page(page_index)
54
+ text_page = page.get_textpage()
55
+ source_ref = PageRef.from_index(page_index)
56
+
57
+ # --- A. EXTERNAL WEB LINKS ---
58
+ pagelink_raw = pdfium_c.FPDFLink_LoadWebLinks(text_page.raw)
59
+ if pagelink_raw:
60
+ count = pdfium_c.FPDFLink_CountWebLinks(pagelink_raw)
61
+ for i in range(count):
62
+ buflen = pdfium_c.FPDFLink_GetURL(pagelink_raw, i, None, 0)
63
+ url = ""
64
+ if buflen > 0:
65
+ buffer = (pdfium_c.c_uint16 * buflen)()
66
+ pdfium_c.FPDFLink_GetURL(pagelink_raw, i, buffer, buflen)
67
+ url = ctypes.string_at(buffer, (buflen-1)*2).decode('utf-16le')
68
+
69
+ l, t, r, b = (ctypes.c_double() for _ in range(4))
70
+ pdfium_c.FPDFLink_GetRect(pagelink_raw, i, 0, ctypes.byref(l), ctypes.byref(t), ctypes.byref(r), ctypes.byref(b))
71
+
72
+ rect = [l.value, b.value, r.value, t.value]
73
+ links.append({
74
+ 'page': source_ref.machine,
75
+ 'rect': rect,
76
+ 'link_text': text_page.get_text_bounded(left=l.value, top=t.value, right=r.value, bottom=b.value).strip() or url,
77
+ 'type': 'External (URI)',
78
+ 'url': url,
79
+ 'target': url,
80
+ 'source_kind': 'pypdfium2_weblink'
81
+ })
82
+ pdfium_c.FPDFLink_CloseWebLinks(pagelink_raw)
83
+
84
+ # --- B. INTERNAL GOTO LINKS (Standard Annotations) ---
85
+ # We iterate through standard link annotations for GoTo actions
86
+ pos = 0
87
+ while True:
88
+ annot_raw = pdfium_c.FPDFPage_GetAnnot(page.raw, pos)
89
+ if not annot_raw:
90
+ break
91
+
92
+ subtype = pdfium_c.FPDFAnnot_GetSubtype(annot_raw)
93
+ if subtype == pdfium_c.FPDF_ANNOT_LINK:
94
+ # Get Rect
95
+ fs_rect = pdfium_c.FS_RECTF()
96
+ pdfium_c.FPDFAnnot_GetRect(annot_raw, fs_rect)
97
+
98
+ # Try to get Destination
99
+ link_annot = pdfium_c.FPDFAnnot_GetLink(annot_raw)
100
+ dest = pdfium_c.FPDFLink_GetDest(doc.raw, link_annot)
101
+
102
+ if dest:
103
+ dest_idx = pdfium_c.FPDFDest_GetDestPageIndex(doc.raw, dest)
104
+ dest_ref = PageRef.from_index(dest_idx)
105
+
106
+ links.append({
107
+ 'page': source_ref.machine,
108
+ 'rect': [fs_rect.left, fs_rect.bottom, fs_rect.right, fs_rect.top],
109
+ 'link_text': text_page.get_text_bounded(left=fs_rect.left, top=fs_rect.top, right=fs_rect.right, bottom=fs_rect.bottom).strip(),
110
+ 'type': 'Internal (GoTo/Dest)',
111
+ 'destination_page': dest_ref.machine,
112
+ 'target': dest_ref.machine,
113
+ 'source_kind': 'pypdfium2_annot'
114
+ })
115
+
116
+ # NOTE(review): PDFium's API docs say a handle returned by FPDFPage_GetAnnot should be released with
117
+ # FPDFPage_CloseAnnot; this loop never closes annot_raw and only advances pos. Confirm whether that leaks handles.
118
+ pos += 1
119
+
120
+ page.close()
121
+ text_page.close()
122
+
123
+ doc.close()
124
+ return {"links": links, "toc": toc_list, "file_ov": file_ov}
125
+
126
+ if __name__ == "__main__":
127
+ import json
128
+ import sys
129
+ filename = "temOM.pdf"
130
+ results = analyze_pdf(filename)
131
+ print(json.dumps(results, indent=2))
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
- # pdflinkcheck/analyze_pymupdf.py
4
-
3
+ # pdflinkcheck/analysis_pymupdf.py
4
+ from __future__ import annotations
5
5
  import sys
6
6
  from pathlib import Path
7
7
  import logging
@@ -10,6 +10,8 @@ from typing import Dict, Any, Optional, List
10
10
  logging.getLogger("fitz").setLevel(logging.ERROR)
11
11
 
12
12
  from pdflinkcheck.environment import pymupdf_is_available
13
+ from pdflinkcheck.helpers import PageRef
14
+
13
15
  try:
14
16
  if pymupdf_is_available():
15
17
  import fitz # PyMuPDF
@@ -22,8 +24,31 @@ except ImportError:
22
24
  Inspect target PDF for both URI links and for GoTo links.
23
25
  """
24
26
 
27
+ def analyze_pdf(pdf_path: str):
28
+ data = {}
29
+ data["links"] = []
30
+ data["toc"] = []
31
+ data["file_ov"] = {}
32
+
33
+ try:
34
+ doc = fitz.open(pdf_path)
35
+ except Exception as e:
36
+ print(f"fitz.open() failed: {e}")
37
+ return data
38
+
39
+ extracted_links = extract_links_pymupdf(doc)
40
+ structural_toc = extract_toc_pymupdf(doc)
41
+ page_count = doc.page_count
42
+
43
+ data["links"] = extracted_links
44
+ data["toc"] = structural_toc
45
+ data["file_ov"]["total_pages"] = page_count
46
+ data["file_ov"]["pdf_name"] = Path(pdf_path).name
47
+ return data
48
+
49
+
25
50
  # Helper function: Prioritize 'from'
26
- def get_link_rect(link_dict):
51
+ def _get_link_rect(link_dict):
27
52
  """
28
53
  Retrieves the bounding box for the link using the reliable 'from' key
29
54
  provided by PyMuPDF's link dictionary.
@@ -49,6 +74,19 @@ def get_link_rect(link_dict):
49
74
  return None
50
75
 
51
76
  def get_anchor_text(page, link_rect):
77
+ """
78
+ Extracts text content using the link's bounding box coordinates.
79
+ The bounding box is slightly expanded to ensure full characters are captured.
80
+
81
+ Args:
82
+ page: The fitz.Page object where the link is located.
83
+ link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
84
+ link's bounding box.
85
+
86
+ Returns:
87
+ The cleaned, extracted text string, or a placeholder message
88
+ if no text is found or if an error occurs.
89
+ """
52
90
  if not link_rect:
53
91
  return "N/A: Missing Rect"
54
92
 
@@ -86,57 +124,6 @@ def get_anchor_text(page, link_rect):
86
124
  except Exception:
87
125
  return "N/A: Rect Error"
88
126
 
89
- def get_anchor_text_stable(page, link_rect):
90
- """
91
- Extracts text content using the link's bounding box coordinates.
92
- The bounding box is slightly expanded to ensure full characters are captured.
93
-
94
- Args:
95
- page: The fitz.Page object where the link is located.
96
- link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
97
- link's bounding box.
98
-
99
- Returns:
100
- The cleaned, extracted text string, or a placeholder message
101
- if no text is found or if an error occurs.
102
- """
103
- if not link_rect:
104
- return "N/A: Missing Rect"
105
-
106
- try:
107
- # 1. Convert the coordinate tuple back to a fitz.Rect object
108
- rect = fitz.Rect(link_rect)
109
-
110
- # --- CRITICAL STEP: Check for invalid/empty rect AFTER conversion ---
111
- # If the rect is invalid (e.g., width or height is <= 0), skip it
112
- # Note: fitz.Rect will often auto-normalize, but this explicit check is safer.
113
- if rect.is_empty or rect.width <= 0 or rect.height <= 0:
114
- return "N/A: Rect Error (Zero/Negative Dimension)"
115
-
116
- # 2. Expand the rect slightly to capture full characters (1 unit in each direction)
117
- # This method avoids the proprietary/unstable 'from_expanded' or 'from_rect' methods.
118
- expanded_rect = fitz.Rect(
119
- rect.x0 - 1,
120
- rect.y0 - 1,
121
- rect.x1 + 1,
122
- rect.y1 + 1
123
- )
124
-
125
- # 3. Get the text within the expanded bounding box
126
- anchor_text = page.get_textbox(expanded_rect)
127
-
128
- # 4. Clean up whitespace and non-printing characters
129
- cleaned_text = " ".join(anchor_text.split())
130
-
131
- if cleaned_text:
132
- return cleaned_text
133
- else:
134
- return "N/A: No Visible Text"
135
-
136
- except Exception:
137
- # Fallback for unexpected errors in rect conversion or retrieval
138
- return "N/A: Rect Error"
139
-
140
127
  def analyze_toc_fitz(doc):
141
128
  """
142
129
  Extracts the structural Table of Contents (PDF Bookmarks/Outline)
@@ -149,23 +136,28 @@ def analyze_toc_fitz(doc):
149
136
  A list of dictionaries, where each dictionary represents a TOC entry
150
137
  with 'level', 'title', and 'target_page' (1-indexed).
151
138
  """
139
+
152
140
  toc = doc.get_toc()
153
141
  toc_data = []
154
142
 
155
143
  for level, title, page_num in toc:
156
144
  # fitz pages are 1-indexed for TOC!
145
+ # We know fitz gives us a human number.
146
+ # We convert it to a physical index for our internal storage.
147
+ # page_num is 1 (Human). We normalize to 0 (Physical).
148
+ ref = PageRef.from_human(page_num)
157
149
  toc_data.append({
158
150
  'level': level,
159
151
  'title': title,
160
- 'target_page': page_num
152
+ #'target_page': ref.index
153
+ 'target_page': ref.machine
161
154
  })
162
155
 
163
156
  return toc_data
164
157
 
165
-
166
158
  # 2. Updated Main Inspection Function to Include Text Extraction
167
159
  #def inspect_pdf_hyperlinks_fitz(pdf_path):
168
- def extract_toc_pymupdf(pdf_path):
160
+ def extract_toc_pymupdf(doc):
169
161
  """
170
162
  Extracts the structural table of contents (TOC/bookmarks) from an already-open PDF document.
171
163
 
@@ -176,7 +168,7 @@ def extract_toc_pymupdf(pdf_path):
176
168
  A list of dictionaries representing the structural TOC/bookmarks.
177
169
  """
178
170
  try:
179
- doc = fitz.open(pdf_path)
171
+
180
172
  structural_toc = analyze_toc_fitz(doc)
181
173
  except Exception as e:
182
174
  print(f"An error occurred: {e}", file=sys.stderr)
@@ -211,125 +203,91 @@ def serialize_fitz_object(obj):
211
203
  return obj
212
204
 
213
205
 
214
- def extract_links_pymupdf(pdf_path):
215
- """
216
- Opens a PDF, iterates through all pages and extracts all link annotations.
217
- It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
218
-
219
- Args:
220
- pdf_path: The file system path (str) to the target PDF document.
221
-
222
- Returns:
223
- A list of dictionaries, where each dictionary is a comprehensive
224
- representation of an active hyperlink found in the PDF.
225
-
226
- """
206
+ def extract_links_pymupdf(doc):
227
207
  links_data = []
228
- try:
229
- doc = fitz.open(pdf_path)
208
+ try:
209
+ # This represents the maximum valid 0-index in the doc
210
+ last_page_ref = PageRef.from_pymupdf_total_page_count(doc.page_count)
211
+
212
+ #print(last_page_ref) # Output: "358" (Because of __str__)
213
+ #print(int(last_page_ref)) # Output: 357 (Because of __int__)
230
214
 
231
215
  for page_num in range(doc.page_count):
232
216
  page = doc.load_page(page_num)
233
-
234
- for link in page.get_links():
235
-
236
- page_obj = doc.load_page(page_num)
237
- link_rect = get_link_rect(link)
238
-
239
- rect_obj = link.get("from")
240
- xref = link.get("xref")
241
- #print(f"rect_obj = {rect_obj}")
242
- #print(f"xref = {xref}")
243
-
244
-
245
- # --- Examples of various keys associated with various link instances ---
246
- #print(f"keys: list(link) = {list(link)}")
247
- # keys: list(link) = ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
248
- # keys: list(link) = ['kind', 'xref', 'from', 'uri', 'id']
249
- # keys: list(link) = ['kind', 'xref', 'from', 'page', 'view', 'id']
217
+ source_ref = PageRef.from_index(page_num)
250
218
 
251
- # 1. Extract the anchor text
252
- anchor_text = get_anchor_text(page_obj, link_rect)
253
-
254
- # 2. Extract the target and kind
255
- target = ""
256
- kind = link.get('kind')
257
-
219
+ for link in page.get_links():
220
+ link_rect = _get_link_rect(link)
221
+ anchor_text = get_anchor_text(page, link_rect)
258
222
 
259
223
  link_dict = {
260
- 'page': int(page_num) + 1, # accurate for link location, add 1
224
+ 'page': source_ref.machine,
261
225
  'rect': link_rect,
262
226
  'link_text': anchor_text,
263
- 'xref':xref
227
+ 'xref': link.get("xref")
264
228
  }
265
229
 
266
- # A. Clean Geom. Objects: Use the helper function on 'to' / 'destination'
267
- # Use the clean serialize_fitz_object() helper function on all keys that might contain objects
230
+ kind = link.get('kind')
268
231
  destination_view = serialize_fitz_object(link.get('to'))
232
+ p_index = link.get('page') # expected to be human facing, per PyMuPDF's known quirks
233
+
234
+ # --- CASE 1: INTERNAL JUMPS (GoTo) ---
235
+ if p_index is not None:
269
236
 
270
- # B. Correct Internal Link Page Numbering (The -1 correction hack)
271
- # This will be skipped by URI, which is not expected to have a page key
272
- target_page_num_reported = "N/A"
273
- if link.get('page') is not None:
274
- target_page_num_reported = int(link.get('page'))+1 # accurate for link target, don't add 1 (weird)
237
+ # Ensure we are working with an integer
238
+ raw_pymupdf_idx = int(p_index)
239
+ corrected_machine_idx = PageRef.corrected_down(raw_pymupdf_idx).index
240
+
241
+ # Logic: Normalize to 0-index and store as int
242
+ idx = min(corrected_machine_idx, int(last_page_ref))
243
+ #print(f"DEBUG: Link Text: {anchor_text} | Raw p_index: {p_index}")
244
+ #print(f"[DEBUG] idx: {idx}")
245
+ dest_ref = PageRef.from_index(idx) # does not impact the value
275
246
 
276
- if link['kind'] == fitz.LINK_URI:
277
- target = link.get('uri', 'URI (Unknown Target)')
278
247
  link_dict.update({
279
- 'type': 'External (URI)',
280
- 'url': link.get('uri'),
281
- 'target': target
248
+ 'destination_page': dest_ref.machine,
249
+ 'destination_view': destination_view,
250
+ 'target': dest_ref.machine, # INT (MACHINE INDEX)
282
251
  })
252
+
253
+ if kind == fitz.LINK_GOTO:
254
+ link_dict['type'] = 'Internal (GoTo/Dest)'
255
+ else:
256
+ link_dict['type'] = 'Internal (Resolved Action)'
257
+ link_dict['source_kind'] = kind
283
258
 
284
- elif link['kind'] == fitz.LINK_GOTO:
285
- target = f"Page {target_page_num_reported}"
259
+ # --- CASE 2: EXTERNAL URIs ---
260
+ elif kind == fitz.LINK_URI:
261
+ uri = link.get('uri', 'URI (Unknown Target)')
286
262
  link_dict.update({
287
- 'type': 'Internal (GoTo/Dest)',
288
- 'destination_page': target_page_num_reported,
289
- 'destination_view': destination_view,
290
- 'target': target
263
+ 'type': 'External (URI)',
264
+ 'url': uri,
265
+ 'target': uri # STRING (URL)
291
266
  })
292
267
 
293
- elif link['kind'] == fitz.LINK_GOTOR:
268
+ # --- CASE 3: REMOTE PDF REFERENCES ---
269
+ elif kind == fitz.LINK_GOTOR:
270
+ remote_file = link.get('file', 'Remote File')
294
271
  link_dict.update({
295
272
  'type': 'Remote (GoToR)',
296
273
  'remote_file': link.get('file'),
297
- 'destination': destination_view
274
+ 'target': remote_file # STRING (File Path)
298
275
  })
299
276
 
300
- elif link.get('page') is not None and link['kind'] != fitz.LINK_GOTO:
301
- target = f"Page {target_page_num_reported}"
302
- link_dict.update({
303
- 'type': 'Internal (Resolved Action)',
304
- 'destination_page': target_page_num_reported,
305
- 'destination_view': destination_view,
306
- 'source_kind': link.get('kind'),
307
- 'target': target
308
- })
309
-
277
+ # --- CASE 4: OTHERS ---
310
278
  else:
311
- target = link.get('url') or link.get('remote_file') or link.get('target')
312
279
  link_dict.update({
313
280
  'type': 'Other Action',
314
- 'action_kind': link.get('kind'),
315
- 'target': target
281
+ 'action_kind': kind,
282
+ 'target': 'Unknown' # STRING
316
283
  })
317
284
 
318
- ## --- General Serialization Cleaner ---
319
- #for key, value in link_dict.items():
320
- # if hasattr(value, 'rect') and hasattr(value, 'point'):
321
- # # This handles Rect and Point objects that may slip through
322
- # link_dict[key] = str(value)
323
- ## --- End Cleaner ---
324
-
325
285
  links_data.append(link_dict)
326
-
327
286
  doc.close()
328
287
  except Exception as e:
329
288
  print(f"An error occurred: {e}", file=sys.stderr)
330
289
  return links_data
331
290
 
332
-
333
291
  def call_stable():
334
292
  """
335
293
  Placeholder function for command-line execution (e.g., in __main__).