pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
- # src/pdflinkcheck/analyze_pypdf.py
3
+ # src/pdflinkcheck/analysis_pypdf.py
4
+ from __future__ import annotations
4
5
  import sys
5
6
  from pathlib import Path
6
7
  import logging
@@ -8,6 +9,7 @@ from typing import Dict, Any, Optional, List
8
9
 
9
10
  from pypdf import PdfReader
10
11
  from pypdf.generic import Destination, NameObject, ArrayObject, IndirectObject
12
+ from pdflinkcheck.helpers import PageRef
11
13
 
12
14
 
13
15
  from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_cwd, LOG_FILE_PATH
@@ -16,7 +18,28 @@ from pdflinkcheck.io import error_logger, export_report_data, get_first_pdf_in_c
16
18
  Inspect target PDF for both URI links and for GoTo links, using only pypdf, not Fitz
17
19
  """
18
20
 
19
- def get_anchor_text_pypdf(page, rect) -> str:
21
+ def analyze_pdf(pdf_path: str):
22
+ data = {}
23
+ data["links"] = []
24
+ data["toc"] = []
25
+ data["file_ov"] = {}
26
+
27
+ try:
28
+ reader = PdfReader(pdf_path)
29
+ except Exception as e:
30
+ print(f"pypdf.PdfReader() failed: {e}")
31
+ return data
32
+
33
+ extracted_links = _extract_links_pypdf(reader)
34
+ structural_toc = _extract_toc_pypdf(reader)
35
+ page_count = len(reader.pages)
36
+ data["links"] = extracted_links
37
+ data["toc"] = structural_toc
38
+ data["file_ov"]["total_pages"] = page_count
39
+ return data
40
+
41
+
42
+ def _get_anchor_text_pypdf(page, rect) -> str:
20
43
  """
21
44
  Extracts text within the link's bounding box using a visitor function.
22
45
  Reliable for finding text associated with a link without PyMuPDF.
@@ -33,7 +56,7 @@ def get_anchor_text_pypdf(page, rect) -> str:
33
56
 
34
57
  parts: List[str] = []
35
58
 
36
- def visitor_body(text, cm, tm, font_dict, font_size):
59
+ def _visitor_body(text, cm, tm, font_dict, font_size):
37
60
  # tm[4], tm[5] are the current text insertion point coordinates (x, y)
38
61
  x, y = tm[4], tm[5]
39
62
 
@@ -44,17 +67,18 @@ def get_anchor_text_pypdf(page, rect) -> str:
44
67
  if text.strip():
45
68
  parts.append(text)
46
69
 
47
- page.extract_text(visitor_text=visitor_body)
70
+ page.extract_text(visitor_text=_visitor_body)
48
71
 
49
72
  raw_extracted = "".join(parts)
50
73
  cleaned = " ".join(raw_extracted.split()).strip()
51
74
 
52
75
  return cleaned if cleaned else "Graphic/Empty Link"
53
76
 
54
- def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> Optional[int]:
77
+ def _resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) -> Optional[int]:
55
78
  try:
56
79
  if isinstance(dest, Destination):
57
- return dest.page_number + 1 # Return int directly
80
+ # .page_number in pypdf is already 0-indexed
81
+ return dest.page_number
58
82
 
59
83
  if isinstance(dest, IndirectObject):
60
84
  return obj_id_to_page.get(dest.idnum)
@@ -67,42 +91,25 @@ def resolve_pypdf_destination(reader: PdfReader, dest, obj_id_to_page: dict) ->
67
91
  except Exception:
68
92
  return None
69
93
 
70
- def resolve_pypdf_destination_(reader: PdfReader, dest, obj_id_to_page: dict) -> str:
71
- """
72
- Resolves a Destination object or IndirectObject to a 1-based page number string.
73
- """
74
- try:
75
- if isinstance(dest, Destination):
76
- return str(dest.page_number + 1)
77
-
78
- if isinstance(dest, IndirectObject):
79
- return str(obj_id_to_page.get(dest.idnum, "Unknown"))
80
-
81
- if isinstance(dest, ArrayObject) and len(dest) > 0:
82
- if isinstance(dest[0], IndirectObject):
83
- return str(obj_id_to_page.get(dest[0].idnum, "Unknown"))
84
-
85
- return "Unknown"
86
- except Exception:
87
- return "Error Resolving"
88
94
 
89
- def extract_links_pypdf(pdf_path):
95
+ def _extract_links_pypdf(reader: PdfReader) -> List[Dict[str, Any]]:
90
96
  """
91
97
  Termux-compatible link extraction using pure-Python pypdf.
92
98
  Matches the reporting schema of the PyMuPDF version.
93
99
  """
94
- reader = PdfReader(pdf_path)
95
100
 
96
101
  # Pre-map Object IDs to Page Numbers for fast internal link resolution
97
102
  obj_id_to_page = {
98
- page.indirect_reference.idnum: i + 1
103
+ page.indirect_reference.idnum: i
99
104
  for i, page in enumerate(reader.pages)
100
105
  }
101
106
 
102
107
  all_links = []
103
108
 
104
109
  for i, page in enumerate(reader.pages):
105
- page_num = i + 1
110
+ #page_num = i
111
+ # Use PageRef to stay consistent
112
+ page_source = PageRef.from_index(i)
106
113
  if "/Annots" not in page:
107
114
  continue
108
115
 
@@ -112,10 +119,10 @@ def extract_links_pypdf(pdf_path):
112
119
  continue
113
120
 
114
121
  rect = obj.get("/Rect")
115
- anchor_text = get_anchor_text_pypdf(page, rect)
122
+ anchor_text = _get_anchor_text_pypdf(page, rect)
116
123
 
117
124
  link_dict = {
118
- 'page': page_num,
125
+ 'page': page_source.machine,
119
126
  'rect': list(rect) if rect else None,
120
127
  'link_text': anchor_text,
121
128
  'type': 'Other Action',
@@ -134,13 +141,16 @@ def extract_links_pypdf(pdf_path):
134
141
  # Handle GoTo (Internal)
135
142
  elif "/Dest" in obj or ("/A" in obj and "/D" in obj["/A"]):
136
143
  dest = obj.get("/Dest") or obj["/A"].get("/D")
137
- target_page = resolve_pypdf_destination(reader, dest, obj_id_to_page)
144
+ target_page = _resolve_pypdf_destination(reader, dest, obj_id_to_page)
138
145
  # print(f"DEBUG: resolved target_page = {target_page} (type: {type(target_page)})")
139
- link_dict.update({
140
- 'type': 'Internal (GoTo/Dest)',
141
- 'destination_page': target_page,
142
- 'target': f"Page {target_page}"
143
- })
146
+ if target_page is not None:
147
+ dest_page = PageRef.from_index(target_page)
148
+ link_dict.update({
149
+ 'type': 'Internal (GoTo/Dest)',
150
+ 'destination_page': dest_page.machine,
151
+ #'target': f"Page {target_page}"
152
+ 'target': dest_page.machine
153
+ })
144
154
 
145
155
  # Handle Remote GoTo (GoToR)
146
156
  elif "/A" in obj and obj["/A"].get("/S") == "/GoToR":
@@ -156,9 +166,8 @@ def extract_links_pypdf(pdf_path):
156
166
  return all_links
157
167
 
158
168
 
159
- def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
169
+ def _extract_toc_pypdf(reader: PdfReader) -> List[Dict[str, Any]]:
160
170
  try:
161
- reader = PdfReader(pdf_path)
162
171
  # Note: outline is a property, not a method.
163
172
  toc_tree = reader.outline
164
173
  toc_data = []
@@ -169,7 +178,10 @@ def extract_toc_pypdf(pdf_path: str) -> List[Dict[str, Any]]:
169
178
  # Using the reader directly is the only way to avoid
170
179
  # the 'Destination' object has no attribute error
171
180
  try:
172
- page_num = reader.get_destination_page_number(item) + 1
181
+ page_num_raw = reader.get_destination_page_number(item)
182
+ # page_num_raw is 0-indexed. Use PageRef to store it.
183
+ ref = PageRef.from_index(page_num_raw)
184
+ page_num = ref.machine
173
185
  except:
174
186
  page_num = "N/A"
175
187
 
pdflinkcheck/cli.py CHANGED
@@ -1,12 +1,13 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
3
  # src/pdflinkcheck/cli.py
4
+ from __future__ import annotations
4
5
  import typer
5
6
  from typing import Literal
6
7
  from typer.models import OptionInfo
7
8
  from rich.console import Console
8
9
  from pathlib import Path
9
- from pdflinkcheck.report import run_report_and_call_exports # Assuming core logic moves here
10
+ from pdflinkcheck.report import run_report_and_call_exports # Assuming core logic moves here
10
11
  from typing import Dict, Optional, Union, List
11
12
  import pyhabitat
12
13
  import sys
@@ -20,18 +21,48 @@ from pdflinkcheck.io import get_first_pdf_in_cwd
20
21
 
21
22
  console = Console() # to be above the tkinter check, in case of console.print
22
23
 
24
+ # Force Rich to always enable colors, even when running from a .pyz bundle
25
+ os.environ["FORCE_COLOR"] = "1"
26
+ # Optional but helpful for full terminal feature detection
27
+ os.environ["TERM"] = "xterm-256color"
28
+
23
29
  app = typer.Typer(
24
30
  name="pdflinkcheck",
25
31
  help=f"A command-line tool for comprehensive PDF link analysis and reporting. (v{get_version_from_pyproject()})",
26
32
  add_completion=False,
27
33
  invoke_without_command = True,
28
34
  no_args_is_help = False,
35
+ context_settings={"ignore_unknown_options": True,
36
+ "allow_extra_args": True,
37
+ "help_option_names": ["-h", "--help"]},
29
38
  )
30
39
 
40
+
41
+ def debug_callback(value: bool):
42
+ #def debug_callback(ctx: typer.Context, value: bool):
43
+ if value:
44
+ # This runs IMMEDIATELY when --debug is parsed, even before --help
45
+ # 1. Access the list of all command-line arguments
46
+ full_command_list = sys.argv
47
+ # 2. Join the list into a single string to recreate the command
48
+ command_string = " ".join(full_command_list)
49
+ # 3. Print the command
50
+ typer.echo(f"command:\n{command_string}\n")
51
+ return value
52
+
53
+ if "--show-command" in sys.argv or "--debug" in sys.argv:
54
+ debug_callback(True)
55
+
31
56
  @app.callback()
32
57
  def main(ctx: typer.Context,
33
58
  version: Optional[bool] = typer.Option(
34
59
  None, "--version", is_flag=True, help="Show the version."
60
+ ),
61
+ debug: bool = typer.Option(
62
+ False, "--debug", is_flag=True, help="Enable verbose debug logging and echo the full command string."
63
+ ),
64
+ show_command: bool = typer.Option(
65
+ False, "--show-command", is_flag=True, help="Echo the full command string to the console before execution."
35
66
  )
36
67
  ):
37
68
  """
@@ -44,13 +75,6 @@ def main(ctx: typer.Context,
44
75
  if ctx.invoked_subcommand is None:
45
76
  gui_command()
46
77
  raise typer.Exit(code=0)
47
-
48
- # 1. Access the list of all command-line arguments
49
- full_command_list = sys.argv
50
- # 2. Join the list into a single string to recreate the command
51
- command_string = " ".join(full_command_list)
52
- # 3. Print the command
53
- typer.echo(f"command:\n{command_string}\n")
54
78
 
55
79
 
56
80
  # help-tree() command: fragile, experimental, defaults to not being included.
@@ -89,7 +113,6 @@ def docs_command(
89
113
  try:
90
114
  license_path = files("pdflinkcheck.data") / "LICENSE"
91
115
  license_text = license_path.read_text(encoding="utf-8")
92
-
93
116
  console.print(f"\n[bold green]=== GNU AFFERO GENERAL PUBLIC LICENSE V3+ ===[/bold green]")
94
117
  console.print(license_text, highlight=False)
95
118
 
@@ -127,24 +150,6 @@ def tools_command(
127
150
  if clear_cache:
128
151
  clear_all_caches()
129
152
 
130
- """
131
- def validate_pdf_commands(
132
- pdf_path: Optional[Path] = typer.Argument(
133
- None,
134
- exists=True,
135
- file_okay=True,
136
- dir_okay=False,
137
- readable=True,
138
- resolve_path=True,
139
- help="Path to the PDF file to validate. If omitted, searches current directory."
140
- ),
141
- pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
142
- "pypdf",
143
- "--library", "-l",
144
- envvar="PDF_ENGINE",
145
- help="PDF parsing engine: pypdf (pure Python) or pymupdf (faster, if available)"
146
- ),
147
- """
148
153
  @app.command(name="analyze") # Added a command name 'analyze' for clarity
149
154
  def analyze_pdf( # Renamed function for clarity
150
155
  pdf_path: Optional[Path] = typer.Argument(
@@ -158,22 +163,21 @@ def analyze_pdf( # Renamed function for clarity
158
163
  ),
159
164
  export_format: Optional[Literal["JSON", "TXT", "JSON,TXT", "NONE"]] = typer.Option(
160
165
  "JSON,TXT",
161
- "--export-format","-e",
166
+ "--format","-f",
162
167
  case_sensitive=False,
163
168
  help="Export format. Use 'None' to suppress file export.",
164
169
  ),
165
- max_links: int = typer.Option(
166
- 0,
167
- "--max-links", "-m",
168
- min=0,
169
- help="Report brevity control. Use 0 to show all."
170
- ),
171
170
 
172
- pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
171
+ pdf_library: Literal["auto","pdfium","pypdf", "pymupdf"] = typer.Option(
173
172
  assess_default_pdf_library(),
174
- "--pdf-library","-p",
173
+ "--engine","-e",
175
174
  envvar="PDF_ENGINE",
176
- help="PDF parsing library. pypdf (pure Python) or pymupdf (faster, if available).",
175
+ help="PDF parsing library. pypdf (pure Python), pymupdf (fast, AGPL3+ licensed), pdfium (fast, BSD-3 licensed).",
176
+ ),
177
+ print_bool: bool = typer.Option(
178
+ True,
179
+ "--print/--quiet",
180
+ help="Print or do not print the analysis and validation report to console."
177
181
  )
178
182
  ):
179
183
  """
@@ -223,13 +227,14 @@ def analyze_pdf( # Renamed function for clarity
223
227
 
224
228
  if not valid and "NONE" not in requested_formats:
225
229
  typer.echo(f"Warning: No valid formats found in '{export_format}'. Supported: JSON, TXT.")
230
+
226
231
 
227
232
  # The meat and potatoes
228
233
  report_results = run_report_and_call_exports(
229
234
  pdf_path=str(pdf_path),
230
- max_links=max_links,
231
235
  export_format = export_formats,
232
236
  pdf_library = pdf_library,
237
+ print_bool = print_bool,
233
238
  )
234
239
 
235
240
  if not report_results or not report_results.get("data"):
@@ -238,14 +243,14 @@ def analyze_pdf( # Renamed function for clarity
238
243
 
239
244
  validation_results = report_results["data"]["validation"]
240
245
  # Optional: fail on broken links
241
- broken_count = validation_results["summary-stats"]["broken-page"] + validation_results["summary-stats"]["broken-file"]
246
+ broken_page_count = validation_results["summary-stats"]["broken-page"] + validation_results["summary-stats"]["broken-file"]
242
247
 
243
- if broken_count > 0:
244
- console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_count} broken link(s) found.")
245
- else:
246
- console.print(f"\n[bold green]Success:[/bold green] No broken links or TOC issues!\n")
248
+ if broken_page_count > 0:
249
+ console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_page_count} broken link(s) found.")
250
+ #else:
251
+ # console.print(f"\n[bold green]Success:[/bold green] No broken links or TOC issues!\n")
247
252
 
248
- raise typer.Exit(code=0 if broken_count == 0 else 1)
253
+ raise typer.Exit(code=0 if broken_page_count == 0 else 1)
249
254
 
250
255
  @app.command(name="serve")
251
256
  def serve(
@@ -265,7 +270,7 @@ def serve(
265
270
  console.print(" → [yellow]Reload mode enabled[/yellow]")
266
271
 
267
272
  # Import here to avoid slow imports on other commands
268
- from pdflinkcheck.stdlib_server import ThreadedTCPServer, PDFLinkCheckHandler
273
+ from pdflinkcheck.stdlib_server_alt import ThreadedTCPServer, PDFLinkCheckHandler
269
274
  import socketserver
270
275
 
271
276
  try:
@@ -294,8 +299,6 @@ def gui_command(
294
299
  """
295
300
  Launch tkinter-based GUI.
296
301
  """
297
-
298
- # --- START FIX ---
299
302
  assured_auto_close_value = 0
300
303
 
301
304
  if isinstance(auto_close, OptionInfo):
@@ -307,11 +310,12 @@ def gui_command(
307
310
  # Case 2: Called explicitly by Typer (pdflinkcheck gui -c 3000)
308
311
  # Typer has successfully converted the command line argument, and auto_close is an int.
309
312
  assured_auto_close_value = int(auto_close)
310
- # --- END FIX ---
311
313
 
312
314
  if not pyhabitat.tkinter_is_available():
313
315
  _gui_failure_msg()
314
316
  return
317
+ #from pdflinkcheck.gui import start_gui
318
+ #from pdflinkcheck.gui_alt import start_gui
315
319
  from pdflinkcheck.gui import start_gui
316
320
  start_gui(time_auto_close = assured_auto_close_value)
317
321
 
pdflinkcheck/data/LICENSE CHANGED
@@ -1,24 +1,27 @@
1
- **Copyright © 2025 George Clayton Bennett**
2
- <https://github.com/City-of-Memphis-Wastewater/pdflinkcheck>
3
-
4
- Some distributed binaries of this project include the PyMuPDF library, which is licensed under **AGPL‑3.0‑or‑later**.
5
- Any binary that incorporates PyMuPDF is therefore distributed under **AGPL‑3.0‑or‑later**.
1
+ Some distributed binaries of this project include the PyMuPDF library, which is licensed under **AGPL-3.0-or-later**.
2
+ Any binary that incorporates PyMuPDF is therefore distributed under **AGPL-3.0-or-later**.
6
3
  Other binaries use only the `pypdf` library and do not include PyMuPDF; these binaries are distributed under the **MIT License**.
7
4
 
8
- For AGPL‑licensed binaries, the complete corresponding source code must be made available to anyone who possesses a copy, upon request.
5
+ For AGPL-licensed binaries, the complete corresponding source code must be made available to anyone who possesses a copy, upon request.
9
6
  This obligation applies only to recipients of those binaries, and hosting the source code in GitHub Releases satisfies this requirement.
10
7
 
11
- A binary becomes AGPL‑licensed only when built with the optional `"full"` dependency group (as defined in `pyproject.toml` under `[project.optional-dependencies]`) or when PyMuPDF is otherwise included in the build environment.
12
- The **source code of pdflinkcheck itself** remains licensed under the **MIT License**; only the distributed binary becomes AGPL‑licensed when PyMuPDF is included.
8
+ A binary becomes AGPL-licensed only when built with the optional `"full"` dependency group (as defined in `pyproject.toml` under `[project.optional-dependencies]`) or when PyMuPDF is otherwise included in the build environment.
9
+ The **source code of pdflinkcheck itself** remains licensed under the **MIT License**; only the distributed binary becomes AGPL-licensed when PyMuPDF is included.
13
10
 
14
- Source code for each released version is available in the `pdflinkcheck‑VERSION.tar.gz` files on the project’s GitHub Releases page.
11
+ Source code for each released version is available in the `pdflinkcheck-VERSION.tar.gz` files on the project's GitHub Releases page.
15
12
 
16
- Full‑text copies of **LICENSE‑MIT** and **LICENSE‑AGPL3** are included in the root of the repository.
13
+ Full-text copies of **LICENSE-MIT** and **LICENSE-AGPL3** are included in the root of the repository.
17
14
 
18
15
  **Links:**
19
- - Project source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck
20
- - PyMuPDF source code: https://github.com/pymupdf/PyMuPDF
21
- - pypdf source code: https://github.com/py-pdf/pypdf
22
- - AGPLv3 text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
23
- - MIT License text: https://opensource.org/license/mit
16
+ - Project source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck
17
+ - PyMuPDF source code: https://github.com/pymupdf/PyMuPDF
18
+ - pypdfium2 source code: https://github.com/pypdfium2-team/pypdfium2
19
+ - PDFium source code: https://pdfium.googlesource.com/pdfium/
20
+ - pypdf source code: https://github.com/py-pdf/pypdf
21
+ - AGPLv3 text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
22
+ - MIT License text: https://opensource.org/license/mit
23
+ - BSD-3 License text: https://opensource.org/license/bsd-3-clause
24
+ - Apache-v2 License text: https://opensource.org/license/apache-2-0
25
+
24
26
 
27
+ Copyright 2025 George Clayton Bennett
@@ -4,7 +4,7 @@ A purpose-built tool for comprehensive analysis of hyperlinks and GoTo links wit
4
4
 
5
5
  -----
6
6
 
7
- ![Screenshot of the pdflinkcheck GUI](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_gui_v1.1.92.png)
7
+ ![Screenshot of the pdflinkcheck GUI](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_gui_v1.1.97.png)
8
8
 
9
9
  -----
10
10
 
@@ -19,7 +19,7 @@ For the most user-typical experience, download the single-file binary matching y
19
19
  | **File Type** | **Primary Use Case** | **Recommended Launch Method** |
20
20
  | :--- | :--- | :--- |
21
21
  | **Executable (.exe, .elf)** | **GUI** | Double-click the file. |
22
- | **PYZ (Python Zip App)** | **CLI** or **GUI** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz --help` |
22
+ | **PYZ (Python Zip App)** | **CLI** or **GUI** | Run using your system's `python` command: `python pdflinkcheck-VERSION.pyz --help` |
23
23
 
24
24
  ### Installation via pipx
25
25
 
@@ -53,7 +53,7 @@ Ways to launch the GUI interface:
53
53
  The core functionality is accessed via the `analyze` command.
54
54
 
55
55
  `pdflinkcheck --help`:
56
- ![Screenshot of the pdflinkcheck CLI Tree Help](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_cli_v1.1.92.png)
56
+ ![Screenshot of the pdflinkcheck CLI Tree Help](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_cli_v1.1.97.png)
57
57
 
58
58
 
59
59
  See the Help Tree by unlocking the help-tree CLI command, using the DEV_TYPER_HELP_TREE env var.
@@ -63,7 +63,7 @@ DEV_TYPER_HELP_TREE=1 pdflinkcheck help-tree` # bash
63
63
  $env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree # PowerShell
64
64
  ```
65
65
 
66
- ![Screenshot of the pdflinkcheck CLI Tree Help](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_cli_v1.1.92_tree_help.png)
66
+ ![Screenshot of the pdflinkcheck CLI Tree Help](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_cli_v1.1.97_tree_help.png)
67
67
 
68
68
 
69
69
 
@@ -84,7 +84,6 @@ $env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree` # PowerShell
84
84
  |`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
85
85
  |`--pdf-library / -p`|Select engine: `pymupdf` or `pypdf`.|`pypdf`|
86
86
  |`--export-format / -e`|Export to `JSON`, `TXT`, or `None` to suppress file output.|`JSON`|
87
- |`--max-links / -m`|Maximum links to display per section. Use `0` for all.|`0`|
88
87
 
89
88
  ### `gui` Command Options
90
89
 
@@ -98,9 +97,6 @@ $env:DEV_TYPER_HELP_TREE = "1"; pdflinkcheck help-tree` # PowerShell
98
97
  # Analyze a document, show all links, and save the report as JSON and TXT
99
98
  pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --export-format JSON,TXT
100
99
 
101
- # Analyze a document but keep the print block short, showing only the first 10 links for each type
102
- pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
103
-
104
100
  # Show the GUI for only a moment, like in a build check
105
101
  pdflinkcheck gui --auto-close 3000
106
102
 
@@ -112,22 +108,23 @@ pdflinkcheck docs --license --readme
112
108
 
113
109
  ## 📦 Library Access (Advanced)
114
110
 
115
- For developers importing `pdflinkcheck` into other Python projects, the core analysis functions are exposed directly in the root namespace:
111
+ For developers importing `pdflinkcheck` into other Python projects, the core analysis functions are exposed directly in the root namespace. The various `analyze_pdf_*` functions each use a different library to extract the target PDF's internal TOC, external links, and metadata.
116
112
 
117
- |**Function**|**Description**|
118
- |---|---|
119
- |`run_report()`|**(Primary function)** Performs the full analysis, prints to console, and handles file export.|
120
- |`extract_links_pynupdf()`|Function to retrieve all explicit links (URIs, GoTo, etc.) from a PDF path.|
121
- |`extract_toc_pymupdf()`|Function to extract the PDF's internal Table of Contents (bookmarks/outline).|
122
- |`extract_links_pynupdf()`|Function to retrieve all explicit links (URIs, GoTo, etc.) from a PDF path, using the pypdf library.|
123
- |`extract_toc_pymupdf()`|Function to extract the PDF's internal Table of Contents (bookmarks/outline), using the pypdf library.|
113
+ |**Function**|**Library**|**Description**|
114
+ |---|---|---|
115
+ |`run_report()`|pdflinkcheck | **(Primary function)** Performs the full analysis, prints to console, and handles file export.|
116
+ |`analyze_pdf_pdfium()`| pypdfium2 | Fast, ~10 mb, Permissively licensed |
117
+ |`analyze_pdf_pymupdf()`| PyMuPDF | Fast, ~30 mb, AGPL3+ licensed |
118
+ |`analyze_pdf_pypdf()`| pypdf library | Slow, ~2 mb, Permissively licensed |
124
119
 
125
120
  Example:
126
121
 
127
122
  ```python
128
- from pdflinkcheck.report import run_report
129
- from pdflinkcheck.analysis_pymupdf import extract_links_pymupdf, extract_toc_pymupdf 130 from pdflinkcheck.analysis_pymupdf import extract_links_pynupdf, extract_toc_pymupdf
130
- from pdflinkcheck.analysis_pypdf import extract_links_pypdf, extract_toc_pypdf
123
+ from pdflinkcheck import ( run_report,
124
+ analyze_pdf_pymupdf,
125
+ analyze_pdf_pypdf,
126
+ analyze_pdf_pdfium,
127
+ )
131
128
 
132
129
  file = "document1.pdf"
133
130
  report_data = run_report(file)
@@ -194,24 +191,24 @@ Termux compatibility is important in the modern age, because Android devices are
194
191
  Android is the most common operating system in the Global South.
195
192
  We aim to produce stable software that can do the most possible good.
196
193
 
197
- Now `pdflinkcheck` can run on Termux by using the `pypdf` engine.
194
+ Now `pdflinkcheck` can run on Termux by using the `pypdf` engine and the `pdfium` engine.
198
195
  Benefits:
199
196
  - `pypdf`-only artifacts, to reduce size to about 6% compared to artifacts that include `PyMuPDF`.
200
197
  - Web-stack GUI as an alternative to the Tkinter GUI, which can be run locally on Termux or as a web app.
201
198
 
202
199
 
203
200
  ### PDF Library Selection
204
- At long last, `PyMuPDF` is an optional dependency. All testing comparing `pyp df` and `PyMuPDF` has shown identical validation performance. However `PyMuPDF` is much faster. The benfit of `pypdf` is small size of packages and cross-platform compatibility.
201
+ At long last, `PyMuPDF` is an optional dependency. All testing comparing `pypdf` and `PyMuPDF` has shown identical validation performance. However, `PyMuPDF` is much faster. The benefit of `pypdf` is its small package size and cross-platform compatibility. We have recently added a PDFium option, which circumvents the AGPL3+.
205
202
 
206
203
  Expect that all binaries and artifacts contain PyMuPDF, unless they are built on Android. The GUI and CLI interfaces both allow selection of the library; if PyMuPDF is selected but is not available, the user will be warned.
207
204
 
208
205
  To install the complete version use one of these options:
209
206
 
210
207
  ```bash
211
- pip install "pdflinkcheck[full]"
212
- pipx install "pdflinkcheck[full]"
213
- uv tool install "pdflinkcheck[full]"
214
- uv add "pdflinkcheck[full]"
208
+ pip install "pdflinkcheck[mupdf]"
209
+ pipx install "pdflinkcheck[pdfium]"
210
+ uv tool install "pdflinkcheck[pdfium]"
211
+ uv add "pdflinkcheck[pdfium]"
215
212
  ```
216
213
 
217
214
  ---
@@ -271,6 +268,7 @@ The source code of pdflinkcheck itself remains licensed under the **MIT License*
271
268
  Links:
272
269
  - Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
273
270
  - PyMuPDF source code: https://github.com/pymupdf/PyMuPDF/
271
+ - pypdfium2 source code: https://github.com/pypdfium2-team/pypdfium2
274
272
  - pypdf source code: https://github.com/py-pdf/pypdf/
275
273
  - AGPLv3 text (FSF): https://www.gnu.org/licenses/agpl-3.0.html
276
274
  - MIT License text: https://opensource.org/license/mit
@@ -1,11 +1,11 @@
1
1
  [project]
2
2
  name = "pdflinkcheck"
3
- version = "1.1.94"
3
+ version = "1.2.29"
4
4
  description = "A purpose-built PDF link analysis and reporting tool with GUI and CLI."
5
5
  readme = "README.md"
6
- requires-python = ">=3.10"
6
+ requires-python = ">=3.9"
7
7
  dependencies = [
8
- "pyhabitat>=1.0.53",
8
+ "pyhabitat>=1.1.5",
9
9
  "pypdf>=6.4.2",
10
10
  "rich>=14.2.0",
11
11
  "typer>=0.20.0",
@@ -17,13 +17,12 @@ license-files = ["LICENSE", "LICENSE-MIT", "LICENSE-AGPL3"]
17
17
  classifiers=[
18
18
  "Programming Language :: Python :: 3",
19
19
  "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.9",
20
21
  "Programming Language :: Python :: 3.10",
21
22
  "Programming Language :: Python :: 3.11",
22
23
  "Programming Language :: Python :: 3.12",
23
24
  "Programming Language :: Python :: 3.13",
24
25
  "Programming Language :: Python :: 3.14",
25
- "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
26
- "License :: OSI Approved :: MIT License",
27
26
  "Operating System :: OS Independent",
28
27
  "Intended Audience :: End Users/Desktop",
29
28
  "Intended Audience :: Developers", # library and documentation
@@ -54,42 +53,34 @@ Repository = "https://github.com/city-of-memphis-wastewater/pdflinkcheck"
54
53
  [project.scripts]
55
54
  pdflinkcheck = "pdflinkcheck.cli:app"
56
55
 
56
+
57
57
  [project.optional-dependencies]
58
- # This allows users to do: pip install pdflinkcheck[full]
59
58
  # If you choose to include PyMuPDF, you must comply with the AGPL3
60
- full = [
61
- #"pymupdf>=1.26.7 ; platform_system == 'Linux' and platform_machine != 'aarch64'" # to avoid on termux
62
- "pymupdf>=1.26.7" # let them try
63
- ]
64
- #mobilegui = [
65
- # "kivy>=2.3.1",
66
- # "kivymd",
59
+ mupdf = ["pymupdf>=1.24.0,<2.0.0"] # fails on termux
60
+ pdfium = ["pypdfium2>=5.2.0,<6.0.0"]
61
+ full = ["pymupdf>=1.24.0,<2.0.0", "pypdfium2>=5.2.0,<6.0.0"]
62
+
63
+ #rust = [
64
+ # "pdflinkcheck-rust>=0.1.13"
67
65
  #]
68
66
 
69
67
  [dependency-groups]
70
68
  dev = [
71
69
  "build>=1.3.0",
72
- "pyinstaller>=6.17.0",
70
+ "pyinstaller>=6.17.0 ; platform_system == 'Linux' and platform_machine != 'aarch64'", # to avoid on termux
73
71
  "shiv>=1.0.8",
74
72
  "ruff>=0.7.0 ; platform_system == 'Linux' and platform_machine != 'aarch64'", # to avoid on termux
75
73
  "pytest>=8.0.0",
76
74
  "pytest-cov>=4.1.0",
77
75
  ]
78
76
 
77
+ #[build-system]
78
+ #requires = ["uv_build"]
79
+ #build-backend = "uv_build"
79
80
  [build-system]
80
- requires = ["uv_build"]
81
- build-backend = "uv_build"
81
+ requires = ["setuptools>=64", "wheel"]
82
+ build-backend = "setuptools.build_meta"
82
83
 
83
84
  [tool.uv.sources]
84
85
  pdflinkcheck = { path = "src/pdflinkcheck" }
85
- kivymd = { git = "https://github.com/kivymd/KivyMD.git" }
86
-
87
- # https://docs.astral.sh/uv/concepts/preview/#available-preview-features
88
- #[tool.uv]
89
- #preview = true
90
-
91
- # uv handles the data path automatically
92
-
93
-
94
-
95
86