pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,218 @@
1
+ # src/pdflinkcheck/analyze_pypdf.py
2
+ import sys
3
+ from pathlib import Path
4
+ import logging
5
+ from typing import Dict, Any, List
6
+
7
+ from pypdf import PdfReader
8
+ from pypdf.generic import Destination, NameObject, IndirectObject
9
+
10
+ from pdflinkcheck.report import run_report
11
+ #from pdflinkcheck.validate import run_validation
12
+
13
+ """
14
+ Inspect target PDF for both URI links and GoTo links, using only pypdf (no PyMuPDF/Fitz).
15
+ Fully fixed and improved version as of December 2025 (compatible with pypdf >= 4.0).
16
+ """
17
+
18
def get_anchor_text_pypdf(page, rect) -> str:
    """
    Collect the visible text drawn inside (or just around) a link rectangle.

    The page's text is streamed through a pypdf ``visitor_text`` callback and
    every fragment whose approximate glyph centre lands in the tolerance-padded
    rectangle is kept, in drawing order.

    Args:
        page: Object exposing ``extract_text(visitor_text=...)``.
        rect: Four-number sequence ``[x0, y0, x1, y1]``; the corners may be
            given in either order.

    Returns:
        The matched text with whitespace collapsed, ``"N/A: Missing Rect"``
        when ``rect`` is falsy, or ``"Graphic/Empty Link"`` when nothing matched.
    """
    if not rect:
        return "N/A: Missing Rect"

    # Some PDF generators emit the corners reversed; order each axis low -> high.
    left, right = sorted((rect[0], rect[2]))
    bottom, top = sorted((rect[1], rect[3]))

    # Asymmetric padding: wide horizontally (kerning/spacing drift), tight
    # vertically so neighbouring lines are not picked up.
    horizontal_pad = 10
    vertical_pad = 3
    x_low, x_high = left - horizontal_pad, right + horizontal_pad
    y_low, y_high = bottom - vertical_pad, top + vertical_pad

    fragments: List[str] = []

    def collect(text: str, cm, tm, font_dict, font_size):
        # tm[4], tm[5] is the text insertion point; nudge it up and to the
        # right by a fraction of the font size to approximate the glyph centre.
        size = font_size or 10
        probe_x = tm[4] + (size / 4)
        probe_y = tm[5] + (size / 3)

        if not (x_low <= probe_x <= x_high):
            return
        if not (y_low <= probe_y <= y_high):
            return
        if text.strip():
            fragments.append(text)

    # The visitor is invoked in drawing order, so fragments stay in sequence.
    page.extract_text(visitor_text=collect)

    collapsed = " ".join("".join(fragments).split())
    return collapsed or "Graphic/Empty Link"
67
+
68
+
69
def _cached_on_reader(reader, attr: str, build):
    """Return ``build()``, memoised as attribute ``attr`` on ``reader`` when possible."""
    value = getattr(reader, attr, None)
    if value is None:
        value = build()
        try:
            setattr(reader, attr, value)
        except AttributeError:
            # Reader does not accept new attributes; recompute on the next call.
            pass
    return value


def resolve_pypdf_destination(reader: PdfReader, dest) -> str:
    """
    Resolve a link destination (/Dest or /A /D) to a 1-based page number string.

    Handled forms, in order:
      * ``None``                      -> ``"N/A"``
      * pypdf ``Destination``         -> ``reader.get_destination_page_number``
      * name / string                 -> looked up in ``reader.named_destinations``
      * explicit array ``[page_ref, /Fit..., ...]`` -> matched against the
        indirect references of ``reader.pages``
      * anything else                 -> passed to
        ``reader.get_destination_page_number`` as before

    Args:
        reader: The open pypdf reader the destination belongs to.
        dest: Raw destination value taken from a link annotation.

    Returns:
        The page number as a string, ``"N/A"`` when ``dest`` is ``None``, or
        ``"Unknown/Error"`` when the destination cannot be resolved.
    """
    try:
        if dest is None:
            return "N/A"

        # Indirect references must be dereferenced before inspection.
        if isinstance(dest, (IndirectObject, NameObject)):
            dest = dest.get_object()

        if isinstance(dest, Destination):
            return str(reader.get_destination_page_number(dest) + 1)

        # Named destination: the annotation stores only a key; the actual
        # Destination lives in the document's name table.
        if isinstance(dest, str):
            # NOTE(review): named_destinations may be rebuilt on every access,
            # so it is cached per reader to keep per-link cost low.
            named = _cached_on_reader(
                reader,
                "_pdflinkcheck_named_dests",
                lambda: dict(reader.named_destinations),
            )
            key = str(dest)
            target = named.get(key)
            if target is None and key.startswith("/"):
                target = named.get(key[1:])
            if target is not None:
                return str(reader.get_destination_page_number(target) + 1)

        # Explicit destination array: first element references the target page.
        if isinstance(dest, (list, tuple)) and len(dest) > 0:
            page_ref = dest[0]
            if isinstance(page_ref, IndirectObject):
                index_by_ref = _cached_on_reader(
                    reader,
                    "_pdflinkcheck_page_index",
                    lambda: {
                        (p.indirect_reference.idnum, p.indirect_reference.generation): i
                        for i, p in enumerate(reader.pages)
                        if p.indirect_reference is not None
                    },
                )
                page_index = index_by_ref.get((page_ref.idnum, page_ref.generation))
                if page_index is not None:
                    return str(page_index + 1)

        # Fallback: original behaviour for any remaining form.
        page_num = reader.get_destination_page_number(dest)
        return str(page_num + 1)

    except Exception:
        # Best-effort resolution: a malformed destination must not abort the scan.
        return "Unknown/Error"
92
+
93
+
94
def extract_links_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
    """
    Extract all link annotations (URI, internal GoTo, remote GoToR) using pure pypdf.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per ``/Link`` annotation with the keys ``page`` (1-based),
        ``rect``, ``link_text``, ``type`` and ``target``; plus ``url`` for URI
        links, ``destination_page`` for internal links and ``remote_file`` for
        GoToR links. Annotations matching none of these keep
        ``type="Other Action"`` and ``target="Unknown"``.
    """
    reader = PdfReader(pdf_path)

    all_links: List[Dict[str, Any]] = []

    for page_num, page in enumerate(reader.pages, start=1):
        if "/Annots" not in page:
            continue

        for annot_ref in page["/Annots"]:
            try:
                annot = annot_ref.get_object()
            except Exception:
                continue  # Corrupted annotation – skip

            if annot.get("/Subtype") != "/Link":
                continue

            rect = annot.get("/Rect")
            anchor_text = get_anchor_text_pypdf(page, rect)

            link_dict: Dict[str, Any] = {
                "page": page_num,
                "rect": list(rect) if rect else None,
                "link_text": anchor_text,
                "type": "Other Action",
                "target": "Unknown",
            }

            action = annot.get("/A")
            # dict.get-style access can hand back an unresolved reference;
            # dereference it so the key lookups below see the action dict.
            if action is not None and hasattr(action, "get_object"):
                action = action.get_object()

            # External URI link
            if action and action.get("/URI"):
                uri = action["/URI"]
                link_dict.update({
                    "type": "External (URI)",
                    "url": str(uri),
                    "target": str(uri),
                })

            # Remote GoToR (links to another PDF file). This must be tested
            # before the internal branch: a GoToR action also carries /D, and
            # that destination refers to the *other* file, not this one.
            elif action and action.get("/S") == "/GoToR":
                file_spec = action.get("/F")
                remote_file = str(file_spec) if file_spec else "Unknown File"
                remote_dest = action.get("/D")
                remote_target = f"File: {remote_file}"
                if remote_dest:
                    remote_target += f" → Dest: {remote_dest}"
                link_dict.update({
                    "type": "Remote (GoToR)",
                    "remote_file": remote_file,
                    "target": remote_target,
                })

            # Internal GoTo – can be /Dest directly or inside /A /D
            elif annot.get("/Dest") or (action and action.get("/D")):
                dest = annot.get("/Dest") or (action and action["/D"])
                target_page = resolve_pypdf_destination(reader, dest)
                link_dict.update({
                    "type": "Internal (GoTo/Dest)",
                    "destination_page": target_page,
                    "target": f"Page {target_page}",
                })

            all_links.append(link_dict)

    return all_links
168
+
169
+
170
+ def extract_toc_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
171
+ """
172
+ Extract the PDF outline (bookmarks / table of contents) using pypdf.
173
+ Correctly handles nested structure and uses the official page resolution method.
174
+ """
175
+ try:
176
+ reader = PdfReader(pdf_path)
177
+ outline = reader.outline
178
+ if not outline:
179
+ return []
180
+
181
+ toc_data: List[Dict[str, Any]] = []
182
+
183
+ def flatten_outline(items: List, level: int = 1):
184
+ for item in items:
185
+ if isinstance(item, Destination):
186
+ try:
187
+ page_num = reader.get_destination_page_number(item) + 1
188
+ except Exception:
189
+ page_num = "N/A"
190
+
191
+ toc_data.append({
192
+ "level": level,
193
+ "title": item.title or "(Untitled)",
194
+ "target_page": page_num,
195
+ })
196
+ elif isinstance(item, list):
197
+ # Recurse into child entries
198
+ flatten_outline(item, level + 1)
199
+
200
+ flatten_outline(outline)
201
+ return toc_data
202
+
203
+ except Exception as e:
204
+ print(f"TOC extraction error: {e}", file=sys.stderr)
205
+ return []
206
+
207
+
208
def call_stable():
    """
    Entry point for command-line execution or integration with reporting module.

    Runs the report with pypdf selected as the PDF backend.
    """
    # NOTE(review): the CLI module calls run_report(..., pdf_library=...); the
    # keyword here was `library_pdf`, which does not match — confirm against
    # run_report's signature.
    run_report(pdf_library="pypdf")
    # A validation step (run_validation) can be added here if needed.


if __name__ == "__main__":
    call_stable()
218
+ # pypdf version updates
pdflinkcheck/cli.py CHANGED
@@ -1,17 +1,105 @@
1
1
  # src/pdflinkcheck/cli.py
2
2
  import typer
3
+ from typing import Literal
4
+ from typer.models import OptionInfo
3
5
  from rich.console import Console
4
6
  from pathlib import Path
5
- from pdflinkcheck.analyze import run_analysis # Assuming core logic moves here
6
- from typing import Dict
7
- # Initialize the rich console for output
8
- console = Console()
7
+ from pdflinkcheck.report import run_report # Assuming core logic moves here
8
+ from typing import Dict, Optional, Union, List
9
+ import pyhabitat
10
+ import sys
11
+ import os
12
+ from importlib.resources import files
13
+
14
+ from pdflinkcheck.version_info import get_version_from_pyproject
15
+ from pdflinkcheck.validate import run_validation
16
+
17
+
18
+ console = Console() # to be above the tkinter check, in case of console.print
19
+
9
20
  app = typer.Typer(
10
21
  name="pdflinkcheck",
11
- help="A command-line tool for comprehensive PDF link analysis and reporting.",
12
- add_completion=False
22
+ help=f"A command-line tool for comprehensive PDF link analysis and reporting. (v{get_version_from_pyproject()})",
23
+ add_completion=False,
24
+ invoke_without_command = True,
25
+ no_args_is_help = False,
13
26
  )
14
27
 
28
+
29
@app.callback()
def main(ctx: typer.Context):
    """
    If no subcommand is provided, launch the GUI.
    """
    if ctx.invoked_subcommand is not None:
        # A subcommand follows: echo the full invocation, rebuilt from argv.
        typer.echo("command:\n" + " ".join(sys.argv) + "\n")
        return

    # Bare `pdflinkcheck`: open the GUI, then exit cleanly.
    gui_command()
    raise typer.Exit(code=0)
45
+
46
+
47
+ # help-tree() command: fragile, experimental, defaults to not being included.
48
+ if os.environ.get('DEV_TYPER_HELP_TREE',0) in ('true','1'):
49
+ from pdflinkcheck.dev import add_typer_help_tree
50
+ add_typer_help_tree(
51
+ app = app,
52
+ console = console)
53
+
54
@app.command(name="docs", help="Show the docs for this software.")
def docs_command(
    license: Optional[bool] = typer.Option(
        None, "--license", "-l", help="Show the full AGPLv3 license text."
    ),
    readme: Optional[bool] = typer.Option(
        None, "--readme", "-r", help="Show the full README.md content."
    ),
):
    """
    Print the embedded LICENSE and/or README.md from the ``pdflinkcheck.data`` package.

    With neither flag set, a hint is printed and the command returns normally.
    Otherwise each requested document is printed (license first) and the
    command exits with code 0, or with code 1 as soon as a file is missing.
    """
    if not (license or readme):
        console.print("[yellow]Please use either the --license or --readme flag.[/yellow]")
        return

    # (requested?, file inside pdflinkcheck.data, banner, message when missing)
    documents = (
        (
            license,
            "LICENSE",
            "\n[bold green]=== GNU AFFERO GENERAL PUBLIC LICENSE V3+ ===[/bold green]",
            "[bold red]Error:[/bold red] The embedded license file could not be found.",
        ),
        (
            readme,
            "README.md",
            "\n[bold green]=== pdflinkcheck README ===[/bold green]",
            "[bold red]Error:[/bold red] The embedded README.md file could not be found.",
        ),
    )

    for requested, filename, banner, missing_message in documents:
        if not requested:
            continue
        try:
            resource = files("pdflinkcheck.data") / filename
            body = resource.read_text(encoding="utf-8")

            console.print(banner)
            # highlight=False stops rich from colourising the document text.
            console.print(body, highlight=False)
        except FileNotFoundError:
            console.print(missing_message)
            raise typer.Exit(code=1)

    # Exit successfully if any flag was processed
    raise typer.Exit(code=0)
102
+
15
103
  @app.command(name="analyze") # Added a command name 'analyze' for clarity
16
104
  def analyze_pdf( # Renamed function for clarity
17
105
  pdf_path: Path = typer.Argument(
@@ -22,42 +110,230 @@ def analyze_pdf( # Renamed function for clarity
22
110
  readable=True,
23
111
  resolve_path=True,
24
112
  help="The path to the PDF file to analyze."
25
- ),
26
- check_remnants: bool = typer.Option(
27
- True,
28
- "--check-remnants/--no-check-remnants",
29
- help="Toggle checking for unlinked URLs/Emails in the text layer."
113
+ ),
114
+ export_format: Optional[Literal["JSON", "TXT", "JSON,TXT", "NONE"]] = typer.Option(
115
+ "JSON",
116
+ "--export-format","-e",
117
+ case_sensitive=False,
118
+ help="Export format. Use 'None' to suppress file export.",
30
119
  ),
31
120
  max_links: int = typer.Option(
32
- 50,
33
- "--max-links",
121
+ 0,
122
+ "--max-links", "-m",
34
123
  min=0,
35
- help="Maximum number of links/remnants to display in the report. Use 0 to show all."
124
+ help="Report brevity control. Use 0 to show all."
125
+ ),
126
+
127
+ pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
128
+ "pypdf",#"pymupdf",
129
+ "--pdf-library","-p",
130
+ envvar="PDF_ENGINE",
131
+ help="Select PDF parsing library, pymupdf or pypdf.",
36
132
  )
37
133
  ):
38
134
  """
39
- Analyzes the specified PDF file for all internal, external, and unlinked URI/Email references.
135
+ Analyzes the specified PDF file for all internal, external, and unlinked references.
136
+
137
+ Checks:
138
+ • Internal GoTo links point to valid pages
139
+ • Remote GoToR links point to existing files
140
+ • TOC bookmarks target valid pages
141
+ """
142
+
143
+ """
144
+ Fun Typer fact:
145
+ Overriding Order
146
+ Environment variables sit in the middle of the "priority" hierarchy:
147
+
148
+ CLI Flag: (Highest priority) analyze -p pypdf will always win.
149
+
150
+ Env Var: If no flag is present, it checks PDF_ENGINE.
151
+
152
+ Code Default: (Lowest priority) It falls back to "pypdf" as defined in your typer.Option.
40
153
  """
41
- # The actual heavy lifting (analysis and printing) is now in run_analysis
42
- run_analysis(
154
+
155
+ VALID_FORMATS = ("JSON") # extend later
156
+ requested_formats = [fmt.strip().upper() for fmt in export_format.split(",")]
157
+ if "NONE" in requested_formats or not export_format.strip() or export_format == "0":
158
+ export_formats = ""
159
+ else:
160
+ # Filter for valid ones: ("JSON", "TXT")
161
+ # This allows "JSON,TXT" to become "JSONTXT" which your run_report logic can handle
162
+ valid = [f for f in requested_formats if f in ("JSON", "TXT")]
163
+ export_formats = "".join(valid)
164
+
165
+ if not valid and "NONE" not in requested_formats:
166
+ typer.echo(f"Warning: No valid formats found in '{export_format}'. Supported: JSON, TXT.")
167
+
168
+ run_report(
43
169
  pdf_path=str(pdf_path),
44
- check_remnants=check_remnants,
45
- max_links=max_links
170
+ max_links=max_links,
171
+ export_format = export_formats,
172
+ pdf_library = pdf_library,
173
+ )
174
+
175
@app.command(name="validate")
def validate_pdf(
    pdf_path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        resolve_path=True,
        help="Path to the PDF file to validate. If omitted, searches current directory."
    ),
    export: bool = typer.Option(
        True,
        # Paired on/off names: with a True default and only "--export"
        # declared there was no way to turn the export off.
        "--export/--no-export",
        help="JSON export for validation check."
    ),
    pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
        "pypdf",
        # "--pdf-library" mirrors the spelling used by the 'analyze' command;
        # "--library" / "-l" are kept so existing invocations still work.
        "--library", "--pdf-library", "-l",
        envvar="PDF_ENGINE",
        help="PDF parsing engine: pypdf (pure Python) or pymupdf (faster, if available)"
    ),
    fail_on_broken: bool = typer.Option(
        False,
        "--fail",
        help="Exit with code 1 if any broken links are found (useful for CI)"
    )
):
    """
    Validate internal, remote, and TOC links in a PDF.

    1. Call the run_report() function, like calling the 'analyze' CLI command.
    2. Inspects the results from run_report():
        - Are referenced files available?
        - Are the page numbers referenced by GoTo links within the length of the document?

    Exit status: 0 when no broken links are found, or when broken links are
    found without --fail; 1 when broken links are found with --fail, or when
    no PDF could be located.
    """
    from pdflinkcheck.io import get_first_pdf_in_cwd

    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
        if pdf_path is None:
            console.print("[red]Error: No PDF file provided and none found in current directory.[/red]")
            raise typer.Exit(code=1)
        # NOTE(review): get_first_pdf_in_cwd's return type is not visible here;
        # normalise to Path so `.name` below works for a str result as well.
        pdf_path = Path(pdf_path)
        console.print(f"[dim]No file specified — using: {pdf_path.name}[/dim]")

    pdf_path_str = str(pdf_path)

    console.print(f"[bold]Validating links in:[/bold] {pdf_path.name}")
    console.print(f"[bold]Using engine:[/bold] {pdf_library}\n")

    # Step 1: Run analysis (quietly): no console report, no file export.
    report = run_report(
        pdf_path=pdf_path_str,
        max_links=0,
        export_format="",
        pdf_library=pdf_library,
        print_bool=False
    )

    if not report or not report.get("data"):
        console.print("[yellow]No links or TOC found — nothing to validate.[/yellow]")
        raise typer.Exit(code=0)

    # Step 2: Run validation
    validation_results = run_validation(
        report_results=report,
        pdf_path=pdf_path_str,
        pdf_library=pdf_library,
        export_json=export,
        print_bool=True
    )

    stats = validation_results["summary-stats"]
    broken_count = stats["broken-page"] + stats["broken-file"]

    if broken_count == 0:
        console.print("\n[bold green]Success:[/bold green] No broken links or TOC issues!")
        raise typer.Exit(code=0)

    if fail_on_broken:
        console.print(f"\n[bold red]Validation failed:[/bold red] {broken_count} broken link(s) found.")
        raise typer.Exit(code=1)

    # Broken links without --fail are reported as a warning only. Previously
    # the exit code was 1 here as well, which made --fail a no-op.
    console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_count} broken link(s) found.")
    raise typer.Exit(code=0)
258
+
259
@app.command(name="serve")
def serve(
    host: str = typer.Option("0.0.0.0", "--host", "-h", help="Host to bind (use 0.0.0.0 for network access)"),
    port: int = typer.Option(8000, "--port", "-p", help="Port to listen on"),
    reload: bool = typer.Option(False, "--reload", help="Auto-reload on code changes (dev only)"),
):
    """
    Start the built-in web server for uploading and analyzing PDFs in the browser.

    Pure stdlib — no extra dependencies. Works great on Termux!
    """
    # Imported here to avoid slow imports on other commands.
    import errno
    from pdflinkcheck.stdlib_server import ThreadedTCPServer, PDFLinkCheckHandler

    # A wildcard bind address is not a browsable URL on every platform;
    # advertise localhost in that case while still binding to `host`.
    browse_host = "localhost" if host in ("0.0.0.0", "::", "") else host

    console.print(f"[bold green]Starting pdflinkcheck web server[/bold green]")
    console.print(f" → Open your browser at: [bold blue]http://{browse_host}:{port}[/bold blue]")
    console.print(f" → Upload a PDF to analyze links and TOC")
    if reload:
        # NOTE(review): `reload` is only echoed; nothing in this function
        # passes it to the server — confirm whether reloading is implemented.
        console.print(" → [yellow]Reload mode enabled[/yellow]")

    try:
        with ThreadedTCPServer((host, port), PDFLinkCheckHandler) as httpd:
            console.print(f"[green]Server running — press Ctrl+C to stop[/green]\n")
            httpd.serve_forever()
    except OSError as e:
        # Check errno first: the message text varies by platform and locale.
        # The substring test is kept as a fallback.
        if e.errno == errno.EADDRINUSE or "Address already in use" in str(e):
            console.print(f"[red]Error: Port {port} is already in use.[/red]")
            console.print("Try a different port with --port 8080")
        else:
            console.print(f"[red]Server error: {e}[/red]")
        raise typer.Exit(code=1)
    except KeyboardInterrupt:
        console.print("\n[bold yellow]Server stopped.[/bold yellow]")
        raise typer.Exit(code=0)
294
+
295
+
48
296
@app.command(name="gui")
def gui_command(
    auto_close: int = typer.Option(0,
        "--auto-close", "-c",
        help = "Delay in milliseconds after which the GUI window will close (for automated testing). Use 0 to disable auto-closing.",
        min=0)
    )->None:
    """
    Launch tkinter-based GUI.
    """
    # Called directly from main() (no CLI parsing), the parameter still holds
    # Typer's OptionInfo placeholder rather than an int; treat that as the
    # declared default of 0. Called through Typer, it is the parsed integer.
    close_after_ms = 0 if isinstance(auto_close, OptionInfo) else int(auto_close)

    if not pyhabitat.tkinter_is_available():
        _gui_failure_msg()
        return

    # Deferred import: only load the GUI module once tkinter is known to exist.
    from pdflinkcheck.gui import start_gui
    start_gui(time_auto_close=close_after_ms)
326
+
327
+ # --- Helper, consistent gui failure message. ---
328
def _gui_failure_msg():
    """Print the standard troubleshooting hints shown when the GUI cannot start."""
    hints = (
        "[bold red]GUI failed to launch[/bold red]",
        "Ensure pdflinkcheck dependecies are installed and the venv is activated (the dependecies are managed by uv).",
        "The dependecies for pdflinkcheck are managed by uv.",
        "Ensure Tkinter is available, especially if using WSLg.",
        "On Termux/Android, GUI is not supported. Use 'pdflinkcheck analyze <file.pdf>' instead.",
    )
    for hint in hints:
        console.print(hint)
    # Queried last, after the static hints, so the diagnostic value is printed at the end.
    console.print(f"pyhabitat.tkinter_is_available() = {pyhabitat.tkinter_is_available()}")
60
336
 
61
- # Placeholder for running the app
62
337
  if __name__ == "__main__":
63
338
  app()
339
+