pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. pdflinkcheck/__init__.py +88 -21
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
  6. pdflinkcheck/cli.py +111 -116
  7. pdflinkcheck/data/I Have Questions.md +51 -0
  8. pdflinkcheck/data/LICENSE +20 -654
  9. pdflinkcheck/data/README.md +65 -67
  10. pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
  11. pdflinkcheck/data/icons/Logo-150x150.png +0 -0
  12. pdflinkcheck/data/icons/Logo-300x300.png +0 -0
  13. pdflinkcheck/data/icons/Logo-71x71.png +0 -0
  14. pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
  15. pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
  16. pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
  17. pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
  18. pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
  19. pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
  20. pdflinkcheck/data/pyproject.toml +25 -37
  21. pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
  22. pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
  23. pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
  24. pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
  25. pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
  26. pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
  27. pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
  28. pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
  29. pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
  30. pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
  31. pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
  32. pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
  33. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
  34. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
  35. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
  36. pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
  37. pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
  38. pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
  39. pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
  40. pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
  41. pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
  42. pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
  43. pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
  44. pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
  45. pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
  46. pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
  47. pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
  48. pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
  49. pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
  50. pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
  51. pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
  52. pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
  53. pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
  54. pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
  55. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
  56. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
  57. pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
  58. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
  59. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
  60. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
  61. pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
  62. pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
  63. pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
  64. pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
  65. pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
  66. pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
  67. pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
  68. pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
  69. pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
  70. pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
  71. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
  72. pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
  73. pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
  74. pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
  75. pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
  76. pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
  77. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
  78. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
  79. pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
  80. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
  81. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
  82. pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
  83. pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
  84. pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
  85. pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
  86. pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
  87. pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
  88. pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
  89. pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
  90. pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
  91. pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
  92. pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
  93. pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
  94. pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
  95. pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
  96. pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
  97. pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
  98. pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
  99. pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
  100. pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
  101. pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
  102. pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
  103. pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
  104. pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
  105. pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
  106. pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
  107. pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
  108. pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
  109. pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
  110. pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
  111. pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
  112. pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
  113. pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
  114. pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
  115. pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
  116. pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
  117. pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
  118. pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
  119. pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
  120. pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
  121. pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
  122. pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
  123. pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
  124. pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
  125. pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
  126. pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
  127. pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
  128. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
  129. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
  130. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
  131. pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
  132. pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
  133. pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
  134. pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
  135. pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
  136. pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
  137. pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
  138. pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
  139. pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
  140. pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
  141. pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
  142. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
  143. pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
  144. pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
  145. pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
  146. pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
  147. pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
  148. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
  149. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
  150. pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
  151. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
  152. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
  153. pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
  154. pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
  155. pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
  156. pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
  157. pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
  158. pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
  159. pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
  160. pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
  161. pdflinkcheck/datacopy.py +18 -1
  162. pdflinkcheck/dev.py +12 -25
  163. pdflinkcheck/environment.py +76 -0
  164. pdflinkcheck/gui.py +366 -457
  165. pdflinkcheck/helpers.py +88 -0
  166. pdflinkcheck/io.py +27 -23
  167. pdflinkcheck/report.py +692 -121
  168. pdflinkcheck/security.py +189 -0
  169. pdflinkcheck/splash.py +38 -0
  170. pdflinkcheck/stdlib_server.py +14 -20
  171. pdflinkcheck/stdlib_server_alt.py +571 -0
  172. pdflinkcheck/tk_utils.py +188 -0
  173. pdflinkcheck/update_msix_version.py +49 -0
  174. pdflinkcheck/validate.py +129 -218
  175. pdflinkcheck/version_info.py +6 -3
  176. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
  177. pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
  178. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  179. {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  180. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  181. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
  182. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  183. pdflinkcheck/analyze_pypdf_v2.py +0 -218
  184. pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
  185. pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
  186. /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/__init__.py CHANGED
@@ -1,29 +1,82 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
1
3
  # src/pdflinkcheck/__init__.py
2
4
  """
3
- # License information
4
5
  pdflinkcheck - A PDF Link Checker
5
6
 
6
- Copyright (C) 2025 George Clayton Bennett
7
-
8
7
  Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
9
8
 
10
- This program is free software: You can redistribute it and/or modify
11
- it under the terms of the GNU Affero General Public License as
12
- published by the Free Software Foundation, either version 3 of the
13
- License, or (at your option) any later version.
14
-
15
- The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
16
9
  """
10
+ from __future__ import annotations
17
11
  import os as _os
18
12
 
19
13
  # Library functions
20
- from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
21
- from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
22
- #from pdflinkcheck import analyze_pypdf
23
- from pdflinkcheck.report import run_report
24
- from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previos versions
25
14
  #from pdflinkcheck import dev
26
15
 
16
+ # Lazy-loaded orchestrator
17
def run_report(pdf_path: str, export_format: str = "JSON", pdf_library: str = "auto", print_bool: bool = True):
    """
    Build the link-check report for one PDF and pass it on to the exporters.

    The report module is imported when this function is called, not when the
    package itself is imported.

    Args:
        pdf_path: Path to the PDF file.
        export_format: "JSON", "TXT", or both (e.g., "JSON,TXT").
        pdf_library: "auto", "pdfium", "pymupdf", or "pypdf".
        print_bool: If True, prints the overview to stdout.

    Returns:
        Whatever ``pdflinkcheck.report.run_report_and_call_exports`` returns.
    """
    from pdflinkcheck.report import run_report_and_call_exports as _orchestrate

    # Forward every option by keyword so the orchestrator's parameter order is irrelevant.
    options = {
        "pdf_path": pdf_path,
        "export_format": export_format,
        "pdf_library": pdf_library,
        "print_bool": print_bool,
    }
    return _orchestrate(**options)
29
+
30
+ # --- pypdf ---
31
def analyze_pdf_pypdf(path):
    # Import on demand so that importing the package does not require this engine.
    try:
        from pdflinkcheck.analysis_pypdf import analyze_pdf as _analyze
    except ImportError as err:
        # Chain the original error so the real reason for the failed import
        # stays visible in the traceback.
        raise ImportError(
            "pypdf engine is not installed. "
            "Install pypdf to enable pypdf support."
        ) from err
    return _analyze(path)


# The docstring is attached by explicit assignment rather than written inline.
analyze_pdf_pypdf.__doc__ = (
    "Analyze a PDF using the lightweight pypdf engine and return a normalized dictionary.\n\n"
    "Args:\n"
    "    path: Path of the PDF file, passed unchanged to the engine.\n\n"
    "Returns:\n"
    "    The dictionary produced by pdflinkcheck.analysis_pypdf.analyze_pdf.\n\n"
    "Raises:\n"
    "    ImportError: If pdflinkcheck.analysis_pypdf cannot be imported.\n\n"
    "See pdflinkcheck.analysis_pypdf for full details."
)
44
+
45
+ # --- PyMuPDF ---
46
def analyze_pdf_pymupdf(path):
    # Import on demand so that importing the package does not require this engine.
    try:
        from pdflinkcheck.analysis_pymupdf import analyze_pdf as _analyze
    except ImportError as err:
        # Chain the original error so the real reason for the failed import
        # stays visible in the traceback.
        raise ImportError(
            "PyMuPDF engine is not installed. "
            "Install with the [mupdf] extra to enable PyMuPDF support."
        ) from err
    return _analyze(path)


# The docstring is attached by explicit assignment rather than written inline.
analyze_pdf_pymupdf.__doc__ = (
    "Analyze a PDF using the AGPL3-licensed PyMuPDF engine and return a normalized dictionary.\n\n"
    "Args:\n"
    "    path: Path of the PDF file, passed unchanged to the engine.\n\n"
    "Returns:\n"
    "    The dictionary produced by pdflinkcheck.analysis_pymupdf.analyze_pdf.\n\n"
    "Raises:\n"
    "    ImportError: If pdflinkcheck.analysis_pymupdf cannot be imported.\n\n"
    "See pdflinkcheck.analysis_pymupdf for full details."
)
59
+
60
+
61
+ # --- PDFium ---
62
+
63
def analyze_pdf_pdfium(path):
    # Import on demand so that importing the package does not require this engine.
    try:
        from pdflinkcheck.analysis_pdfium import analyze_pdf as _analyze
    except ImportError as err:
        # Chain the original error so the real reason for the failed import
        # stays visible in the traceback.
        raise ImportError(
            "PDFium engine is not installed. "
            "Install with the [pdfium] extra to enable pdfium support."
        ) from err
    return _analyze(path)


# The docstring is attached by explicit assignment rather than written inline.
analyze_pdf_pdfium.__doc__ = (
    "Analyze a PDF using the PDFium engine and return a normalized dictionary.\n\n"
    "Args:\n"
    "    path: Path of the PDF file, passed unchanged to the engine.\n\n"
    "Returns:\n"
    "    The dictionary produced by pdflinkcheck.analysis_pdfium.analyze_pdf.\n\n"
    "Raises:\n"
    "    ImportError: If pdflinkcheck.analysis_pdfium cannot be imported.\n\n"
    "See pdflinkcheck.analysis_pdfium for full details."
)
76
+
77
+ # -----------------------------
78
+ # GUI easter egg
79
+ # -----------------------------
27
80
  # For the kids. This is what I wanted when learning Python in a mysterious new REPL.
28
81
  # Is this Pythonic? No. Oh well. PEP 8, PEP 20.
29
82
  # Why is this not Pythonic? Devs expect no side effects when importing library functions.
@@ -32,33 +85,47 @@ _gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
32
85
  _load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
33
86
  if _load_gui_func:
34
87
  try:
88
+ print("Easter egg, attemping.")
35
89
  import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
90
+ print(f"pyhabitat.tkinter_is_available() = {_pyhabitat.tkinter_is_available()}")
36
91
  if _pyhabitat.tkinter_is_available():
37
92
  from pdflinkcheck.gui import start_gui
93
+ print("Success: pdflinkcheck.start_gui() function loaded as top-level pmlibrary function.")
38
94
  except ImportError:
39
95
  # Optional: log or ignore silently
40
96
  print("start_gui() not imported")
41
97
 
98
+
99
+
42
100
  # Breadcrumbs, for stumbling upon.
43
101
  if _load_gui_func:
44
102
  __pdflinkcheck_gui_easteregg_enabled__ = True
45
103
  else:
46
104
  __pdflinkcheck_gui_easteregg_enabled__ = False
47
105
 
106
+
107
+ # -----------------------------
108
+ # Public API
109
+ # -----------------------------
48
110
  # Define __all__ such that the library functions are self documenting.
49
111
  __all__ = [
50
112
  "run_report",
51
- "run_analysis",
52
- "extract_links_pymupdf",
53
- "extract_toc_pymupdf",
54
- "extract_links_pypdf",
55
- "extract_toc_pypdf",
56
- #"start_gui" if _load_gui_func else None,
57
- #"dev",
113
+ "analyze_pdf_pymupdf",
114
+ "analyze_pdf_pypdf",
115
+ "analyze_pdf_pdfium",
58
116
  ]
117
+
118
+ # Handle the Easter Egg export
59
119
# start_gui is only bound when the guarded import earlier in this module
# succeeded (flag set, tkinter available, no ImportError). Listing a name that
# does not exist in __all__ would make `from pdflinkcheck import *` fail with
# AttributeError, so check that the name is really present before exporting it.
if _load_gui_func and "start_gui" in globals():
    __all__.append("start_gui")
61
121
 
122
+ # Handle dev module if you want it public
123
+ try:
124
+ from pdflinkcheck import dev
125
+ __all__.append("dev")
126
+ except ImportError:
127
+ pass
128
+
62
129
  # 4. THE CLEANUP (This removes items from dir())
63
130
  del _os
64
131
  del _gui_easteregg_env_flag
@@ -0,0 +1,6 @@
1
+ # src/pdflinkcheck/__main__.py
2
+ from __future__ import annotations
3
+ from pdflinkcheck.cli import app
4
+
5
+ if __name__ == "__main__":
6
+ app()
@@ -0,0 +1,131 @@
1
+ # src/pdflinkcheck/analysis_pdfium.py
2
+ from __future__ import annotations
3
+ import ctypes
4
+ from typing import List, Dict, Any
5
+ from pdflinkcheck.helpers import PageRef
6
+
7
+ from pdflinkcheck.environment import pdfium_is_available
8
+ from pdflinkcheck.helpers import PageRef
9
+
10
+ try:
11
+ if pdfium_is_available():
12
+ import pypdfium2 as pdfium
13
+ import pypdfium2.raw as pdfium_c
14
+
15
+ else:
16
+ pdfium = None
17
+ pdfium_c = None
18
+ except ImportError:
19
+ pdfium = None
20
+ pdfium_c = None
21
+
22
def _extract_toc(doc) -> List[Dict[str, Any]]:
    """Return de-duplicated bookmark entries from ``doc.get_toc()``."""
    entries: List[Dict[str, Any]] = []
    seen = set()
    for item in doc.get_toc():
        title = item.get_title() if hasattr(item, "get_title") else ""
        dest = item.get_dest()
        # Bookmarks without a destination are recorded with page value 0.
        page_idx = PageRef.from_index(dest.get_index()).machine if dest else 0
        if not (title or page_idx > 0):
            continue
        key = (title, page_idx)
        if key in seen:
            continue
        seen.add(key)
        # presumably item.level is 0-based and reports use 1-based levels — TODO confirm
        entries.append({"level": item.level + 1, "title": title, "target_page": page_idx})
    return entries


def _extract_web_links(text_page, source_ref) -> List[Dict[str, Any]]:
    """Return the web links PDFium reports for ``text_page``."""
    found: List[Dict[str, Any]] = []
    handle = pdfium_c.FPDFLink_LoadWebLinks(text_page.raw)
    if not handle:
        return found
    try:
        for i in range(pdfium_c.FPDFLink_CountWebLinks(handle)):
            # A call with no buffer returns the required length; the decode
            # below drops the final 16-bit unit, which is the terminator.
            buflen = pdfium_c.FPDFLink_GetURL(handle, i, None, 0)
            url = ""
            if buflen > 0:
                buffer = (ctypes.c_uint16 * buflen)()
                pdfium_c.FPDFLink_GetURL(handle, i, buffer, buflen)
                url = ctypes.string_at(buffer, (buflen - 1) * 2).decode('utf-16le')

            left, top, right, bottom = (ctypes.c_double() for _ in range(4))
            # Only the first rectangle (index 0) of each web link is read.
            pdfium_c.FPDFLink_GetRect(
                handle, i, 0,
                ctypes.byref(left), ctypes.byref(top), ctypes.byref(right), ctypes.byref(bottom),
            )

            found.append({
                'page': source_ref.machine,
                'rect': [left.value, bottom.value, right.value, top.value],
                'link_text': text_page.get_text_bounded(
                    left=left.value, top=top.value, right=right.value, bottom=bottom.value
                ).strip() or url,
                'type': 'External (URI)',
                'url': url,
                'target': url,
                'source_kind': 'pypdfium2_weblink'
            })
    finally:
        # Release the web-link handle even if reading one of the links failed.
        pdfium_c.FPDFLink_CloseWebLinks(handle)
    return found


def _extract_goto_links(doc, page, text_page, source_ref) -> List[Dict[str, Any]]:
    """Return link annotations on ``page`` that carry a direct destination."""
    found: List[Dict[str, Any]] = []
    # Iterate by count: a single annotation that fails to load (null handle)
    # must not end the enumeration of the annotations after it.
    for pos in range(pdfium_c.FPDFPage_GetAnnotCount(page.raw)):
        annot_raw = pdfium_c.FPDFPage_GetAnnot(page.raw, pos)
        if not annot_raw:
            continue
        try:
            if pdfium_c.FPDFAnnot_GetSubtype(annot_raw) != pdfium_c.FPDF_ANNOT_LINK:
                continue

            fs_rect = pdfium_c.FS_RECTF()
            pdfium_c.FPDFAnnot_GetRect(annot_raw, fs_rect)

            link_annot = pdfium_c.FPDFAnnot_GetLink(annot_raw)
            dest = pdfium_c.FPDFLink_GetDest(doc.raw, link_annot)
            # NOTE(review): only FPDFLink_GetDest is consulted; links whose
            # target is stored on an action are not reported here — confirm
            # whether that is intended.
            if not dest:
                continue

            dest_idx = pdfium_c.FPDFDest_GetDestPageIndex(doc.raw, dest)
            dest_ref = PageRef.from_index(dest_idx)

            found.append({
                'page': source_ref.machine,
                'rect': [fs_rect.left, fs_rect.bottom, fs_rect.right, fs_rect.top],
                'link_text': text_page.get_text_bounded(
                    left=fs_rect.left, top=fs_rect.top, right=fs_rect.right, bottom=fs_rect.bottom
                ).strip(),
                'type': 'Internal (GoTo/Dest)',
                'destination_page': dest_ref.machine,
                'target': dest_ref.machine,
                'source_kind': 'pypdfium2_annot'
            })
        finally:
            # Every handle returned by FPDFPage_GetAnnot has to be released.
            pdfium_c.FPDFPage_CloseAnnot(annot_raw)
    return found


def analyze_pdf(path: str) -> Dict[str, Any]:
    """
    Extract bookmarks and links from a PDF using pypdfium2.

    Args:
        path: File system path of the PDF, passed to ``pdfium.PdfDocument``.

    Returns:
        A dictionary with three keys:
        ``"links"`` - web links found in the page text plus link annotations
        that have a destination, in page order;
        ``"toc"`` - bookmark entries with ``level``, ``title`` and
        ``target_page``;
        ``"file_ov"`` - ``{"total_pages": <number of pages>}``.

    Raises:
        ImportError: If pypdfium2 is not available.
    """
    # Guard the entry point: the module can be imported without pypdfium2.
    if not pdfium_is_available() or pdfium is None:
        raise ImportError(
            "pypdfium2 is not installed. "
            "\nInstall it with: \n\tpip install pdflinkcheck[pdfium] \n\t OR \n\t uv sync --extra pdfium"
        )

    doc = pdfium.PdfDocument(path)
    try:
        total_pages = len(doc)
        file_ov = {"total_pages": total_pages}

        toc_list = _extract_toc(doc)

        links: List[Dict[str, Any]] = []
        for page_index in range(total_pages):
            page = doc.get_page(page_index)
            try:
                text_page = page.get_textpage()
                try:
                    source_ref = PageRef.from_index(page_index)
                    links.extend(_extract_web_links(text_page, source_ref))
                    links.extend(_extract_goto_links(doc, page, text_page, source_ref))
                finally:
                    # Close the text page before the page it was created from.
                    text_page.close()
            finally:
                page.close()

        return {"links": links, "toc": toc_list, "file_ov": file_ov}
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()
125
+
126
if __name__ == "__main__":
    # Manual smoke test: print the analysis of one PDF as indented JSON.
    import json
    import sys

    # Use the path given on the command line; without one, fall back to the
    # sample file name this script used before.
    filename = sys.argv[1] if len(sys.argv) > 1 else "temOM.pdf"
    results = analyze_pdf(filename)
    print(json.dumps(results, indent=2))
@@ -1,3 +1,7 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # pdflinkcheck/analysis_pymupdf.py
4
+ from __future__ import annotations
1
5
  import sys
2
6
  from pathlib import Path
3
7
  import logging
@@ -5,20 +9,46 @@ from typing import Dict, Any, Optional, List
5
9
 
6
10
  logging.getLogger("fitz").setLevel(logging.ERROR)
7
11
 
12
+ from pdflinkcheck.environment import pymupdf_is_available
13
+ from pdflinkcheck.helpers import PageRef
14
+
8
15
  try:
9
- import fitz # PyMuPDF
16
+ if pymupdf_is_available():
17
+ import fitz # PyMuPDF
18
+ else:
19
+ fitz = None
10
20
  except ImportError:
11
21
  fitz = None
12
22
 
13
- from pdflinkcheck.report import run_report
14
- #from pdflinkcheck.validate import run_validation
15
-
16
23
  """
17
24
  Inspect target PDF for both URI links and for GoTo links.
18
25
  """
19
26
 
27
def analyze_pdf(pdf_path: str):
    """
    Open a PDF with PyMuPDF and collect its links, bookmarks and file overview.

    Args:
        pdf_path: File system path of the PDF to open.

    Returns:
        A dictionary with the keys ``"links"``, ``"toc"`` and ``"file_ov"``.
        ``"file_ov"`` holds ``total_pages`` and ``pdf_name``. When PyMuPDF is
        unavailable or the file cannot be opened, a message is printed and the
        dictionary is returned with empty values.
    """
    data = {"links": [], "toc": [], "file_ov": {}}

    # The module-level import leaves fitz as None when PyMuPDF is unavailable.
    # Report that plainly rather than as an AttributeError from fitz.open().
    if fitz is None:
        print("fitz.open() failed: PyMuPDF is not installed")
        return data

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"fitz.open() failed: {e}")
        return data

    try:
        data["links"] = extract_links_pymupdf(doc)
        data["toc"] = extract_toc_pymupdf(doc)
        data["file_ov"]["total_pages"] = doc.page_count
        data["file_ov"]["pdf_name"] = Path(pdf_path).name
    finally:
        # Release the document in every case; skip if a helper already closed it.
        if not doc.is_closed:
            doc.close()
    return data
48
+
49
+
20
50
  # Helper function: Prioritize 'from'
21
- def get_link_rect(link_dict):
51
+ def _get_link_rect(link_dict):
22
52
  """
23
53
  Retrieves the bounding box for the link using the reliable 'from' key
24
54
  provided by PyMuPDF's link dictionary.
@@ -44,6 +74,19 @@ def get_link_rect(link_dict):
44
74
  return None
45
75
 
46
76
  def get_anchor_text(page, link_rect):
77
+ """
78
+ Extracts text content using the link's bounding box coordinates.
79
+ The bounding box is slightly expanded to ensure full characters are captured.
80
+
81
+ Args:
82
+ page: The fitz.Page object where the link is located.
83
+ link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
84
+ link's bounding box.
85
+
86
+ Returns:
87
+ The cleaned, extracted text string, or a placeholder message
88
+ if no text is found or if an error occurs.
89
+ """
47
90
  if not link_rect:
48
91
  return "N/A: Missing Rect"
49
92
 
@@ -81,57 +124,6 @@ def get_anchor_text(page, link_rect):
81
124
  except Exception:
82
125
  return "N/A: Rect Error"
83
126
 
84
- def get_anchor_text_stable(page, link_rect):
85
- """
86
- Extracts text content using the link's bounding box coordinates.
87
- The bounding box is slightly expanded to ensure full characters are captured.
88
-
89
- Args:
90
- page: The fitz.Page object where the link is located.
91
- link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
92
- link's bounding box.
93
-
94
- Returns:
95
- The cleaned, extracted text string, or a placeholder message
96
- if no text is found or if an error occurs.
97
- """
98
- if not link_rect:
99
- return "N/A: Missing Rect"
100
-
101
- try:
102
- # 1. Convert the coordinate tuple back to a fitz.Rect object
103
- rect = fitz.Rect(link_rect)
104
-
105
- # --- CRITICAL STEP: Check for invalid/empty rect AFTER conversion ---
106
- # If the rect is invalid (e.g., width or height is <= 0), skip it
107
- # Note: fitz.Rect will often auto-normalize, but this explicit check is safer.
108
- if rect.is_empty or rect.width <= 0 or rect.height <= 0:
109
- return "N/A: Rect Error (Zero/Negative Dimension)"
110
-
111
- # 2. Expand the rect slightly to capture full characters (1 unit in each direction)
112
- # This method avoids the proprietary/unstable 'from_expanded' or 'from_rect' methods.
113
- expanded_rect = fitz.Rect(
114
- rect.x0 - 1,
115
- rect.y0 - 1,
116
- rect.x1 + 1,
117
- rect.y1 + 1
118
- )
119
-
120
- # 3. Get the text within the expanded bounding box
121
- anchor_text = page.get_textbox(expanded_rect)
122
-
123
- # 4. Clean up whitespace and non-printing characters
124
- cleaned_text = " ".join(anchor_text.split())
125
-
126
- if cleaned_text:
127
- return cleaned_text
128
- else:
129
- return "N/A: No Visible Text"
130
-
131
- except Exception:
132
- # Fallback for unexpected errors in rect conversion or retrieval
133
- return "N/A: Rect Error"
134
-
135
127
  def analyze_toc_fitz(doc):
136
128
  """
137
129
  Extracts the structural Table of Contents (PDF Bookmarks/Outline)
@@ -144,23 +136,28 @@ def analyze_toc_fitz(doc):
144
136
  A list of dictionaries, where each dictionary represents a TOC entry
145
137
  with 'level', 'title', and 'target_page' (1-indexed).
146
138
  """
139
+
147
140
  toc = doc.get_toc()
148
141
  toc_data = []
149
142
 
150
143
  for level, title, page_num in toc:
151
144
  # fitz pages are 1-indexed for TOC!
145
+ # We know fitz gives us a human number.
146
+ # We convert it to a physical index for our internal storage.
147
+ # page_num is 1 (Human). We normalize to 0 (Physical).
148
+ ref = PageRef.from_human(page_num)
152
149
  toc_data.append({
153
150
  'level': level,
154
151
  'title': title,
155
- 'target_page': page_num
152
+ #'target_page': ref.index
153
+ 'target_page': ref.machine
156
154
  })
157
155
 
158
156
  return toc_data
159
157
 
160
-
161
158
  # 2. Updated Main Inspection Function to Include Text Extraction
162
159
  #def inspect_pdf_hyperlinks_fitz(pdf_path):
163
- def extract_toc_pymupdf(pdf_path):
160
+ def extract_toc_pymupdf(doc):
164
161
  """
165
162
  Opens a PDF, iterates through all pages and extracts the structural table of contents (TOC/bookmarks).
166
163
 
@@ -171,7 +168,7 @@ def extract_toc_pymupdf(pdf_path):
171
168
  A list of dictionaries representing the structural TOC/bookmarks.
172
169
  """
173
170
  try:
174
- doc = fitz.open(pdf_path)
171
+
175
172
  structural_toc = analyze_toc_fitz(doc)
176
173
  except Exception as e:
177
174
  print(f"An error occurred: {e}", file=sys.stderr)
@@ -206,133 +203,100 @@ def serialize_fitz_object(obj):
206
203
  return obj
207
204
 
208
205
 
209
- def extract_links_pymupdf(pdf_path):
210
- """
211
- Opens a PDF, iterates through all pages and extracts all link annotations.
212
- It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
213
-
214
- Args:
215
- pdf_path: The file system path (str) to the target PDF document.
216
-
217
- Returns:
218
- A list of dictionaries, where each dictionary is a comprehensive
219
- representation of an active hyperlink found in the PDF.
220
-
221
- """
206
+ def extract_links_pymupdf(doc):
222
207
  links_data = []
223
- try:
224
- doc = fitz.open(pdf_path)
208
+ try:
209
+ # This represents the maximum valid 0-index in the doc
210
+ last_page_ref = PageRef.from_pymupdf_total_page_count(doc.page_count)
211
+
212
+ #print(last_page_ref) # Output: "358" (Because of __str__)
213
+ #print(int(last_page_ref)) # Output: 357 (Because of __int__)
225
214
 
226
215
  for page_num in range(doc.page_count):
227
216
  page = doc.load_page(page_num)
228
-
229
- for link in page.get_links():
217
+ source_ref = PageRef.from_index(page_num)
230
218
 
231
- page_obj = doc.load_page(page_num)
232
- link_rect = get_link_rect(link)
233
-
234
- rect_obj = link.get("from")
235
- xref = link.get("xref")
236
- #print(f"rect_obj = {rect_obj}")
237
- #print(f"xref = {xref}")
238
-
239
-
240
- # --- Examples of various keys associated with various link instances ---
241
- #print(f"keys: list(link) = {list(link)}")
242
- # keys: list(link) = ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
243
- # keys: list(link) = ['kind', 'xref', 'from', 'uri', 'id']
244
- # keys: list(link) = ['kind', 'xref', 'from', 'page', 'view', 'id']
245
-
246
- # 1. Extract the anchor text
247
- anchor_text = get_anchor_text(page_obj, link_rect)
248
-
249
- # 2. Extract the target and kind
250
- target = ""
251
- kind = link.get('kind')
252
-
219
+ for link in page.get_links():
220
+ link_rect = _get_link_rect(link)
221
+ anchor_text = get_anchor_text(page, link_rect)
253
222
 
254
223
  link_dict = {
255
- 'page': int(page_num) + 1, # accurate for link location, add 1
224
+ 'page': source_ref.machine,
256
225
  'rect': link_rect,
257
226
  'link_text': anchor_text,
258
- 'xref':xref
227
+ 'xref': link.get("xref")
259
228
  }
260
229
 
261
- # A. Clean Geom. Objects: Use the helper function on 'to' / 'destination'
262
- # Use the clean serialize_fitz_object() helper function on all keys that might contain objects
230
+ kind = link.get('kind')
263
231
  destination_view = serialize_fitz_object(link.get('to'))
232
+ p_index = link.get('page') # expected to be human-facing, per PyMuPDF's known quirks
233
+
234
+ # --- CASE 1: INTERNAL JUMPS (GoTo) ---
235
+ if p_index is not None:
264
236
 
265
- # B. Correct Internal Link Page Numbering (The -1 correction hack)
266
- # This will be skipped by URI, which is not expected to have a page key
267
- target_page_num_reported = "N/A"
268
- if link.get('page') is not None:
269
- target_page_num_reported = int(link.get('page'))+1 # accurate for link target, don't add 1 (weird)
237
+ # Ensure we are working with an integer
238
+ raw_pymupdf_idx = int(p_index)
239
+ corrected_machine_idx = PageRef.corrected_down(raw_pymupdf_idx).index
240
+
241
+ # Logic: Normalize to 0-index and store as int
242
+ idx = min(corrected_machine_idx, int(last_page_ref))
243
+ #print(f"DEBUG: Link Text: {anchor_text} | Raw p_index: {p_index}")
244
+ #print(f"[DEBUG] idx: {idx}")
245
+ dest_ref = PageRef.from_index(idx) # does not impact the value
270
246
 
271
- if link['kind'] == fitz.LINK_URI:
272
- target = link.get('uri', 'URI (Unknown Target)')
273
247
  link_dict.update({
274
- 'type': 'External (URI)',
275
- 'url': link.get('uri'),
276
- 'target': target
248
+ 'destination_page': dest_ref.machine,
249
+ 'destination_view': destination_view,
250
+ 'target': dest_ref.machine, # INT (MACHINE INDEX)
277
251
  })
252
+
253
+ if kind == fitz.LINK_GOTO:
254
+ link_dict['type'] = 'Internal (GoTo/Dest)'
255
+ else:
256
+ link_dict['type'] = 'Internal (Resolved Action)'
257
+ link_dict['source_kind'] = kind
278
258
 
279
- elif link['kind'] == fitz.LINK_GOTO:
280
- target = f"Page {target_page_num_reported}"
259
+ # --- CASE 2: EXTERNAL URIs ---
260
+ elif kind == fitz.LINK_URI:
261
+ uri = link.get('uri', 'URI (Unknown Target)')
281
262
  link_dict.update({
282
- 'type': 'Internal (GoTo/Dest)',
283
- 'destination_page': target_page_num_reported,
284
- 'destination_view': destination_view,
285
- 'target': target
263
+ 'type': 'External (URI)',
264
+ 'url': uri,
265
+ 'target': uri # STRING (URL)
286
266
  })
287
267
 
288
- elif link['kind'] == fitz.LINK_GOTOR:
268
+ # --- CASE 3: REMOTE PDF REFERENCES ---
269
+ elif kind == fitz.LINK_GOTOR:
270
+ remote_file = link.get('file', 'Remote File')
289
271
  link_dict.update({
290
272
  'type': 'Remote (GoToR)',
291
273
  'remote_file': link.get('file'),
292
- 'destination': destination_view
274
+ 'target': remote_file # STRING (File Path)
293
275
  })
294
276
 
295
- elif link.get('page') is not None and link['kind'] != fitz.LINK_GOTO:
296
- target = f"Page {target_page_num_reported}"
297
- link_dict.update({
298
- 'type': 'Internal (Resolved Action)',
299
- 'destination_page': target_page_num_reported,
300
- 'destination_view': destination_view,
301
- 'source_kind': link.get('kind'),
302
- 'target': target
303
- })
304
-
277
+ # --- CASE 4: OTHERS ---
305
278
  else:
306
- target = link.get('url') or link.get('remote_file') or link.get('target')
307
279
  link_dict.update({
308
280
  'type': 'Other Action',
309
- 'action_kind': link.get('kind'),
310
- 'target': target
281
+ 'action_kind': kind,
282
+ 'target': 'Unknown' # STRING
311
283
  })
312
284
 
313
- ## --- General Serialization Cleaner ---
314
- #for key, value in link_dict.items():
315
- # if hasattr(value, 'rect') and hasattr(value, 'point'):
316
- # # This handles Rect and Point objects that may slip through
317
- # link_dict[key] = str(value)
318
- ## --- End Cleaner ---
319
-
320
285
  links_data.append(link_dict)
321
-
322
286
  doc.close()
323
287
  except Exception as e:
324
288
  print(f"An error occurred: {e}", file=sys.stderr)
325
289
  return links_data
326
290
 
327
-
328
291
  def call_stable():
329
292
  """
330
293
  Placeholder function for command-line execution (e.g., in __main__).
331
294
  Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
332
295
  passing them as arguments to run_report.
333
296
  """
334
- run_report(pdf_library = "pymupdf")
335
- #run_validation(pdf_library = "pymupdf")
297
+ from pdflinkcheck.report import run_report_and_call_exports
298
+
299
+ run_report_and_call_exports(pdf_library = "pymupdf")
336
300
 
337
301
  if __name__ == "__main__":
338
302
  call_stable()