PyPI - pdflinkcheck - Versions diffs - 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl - Mend

pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

pdflinkcheck/__init__.py +69 -0
pdflinkcheck/analyze_pymupdf.py +338 -0
pdflinkcheck/analyze_pypdf.py +184 -0
pdflinkcheck/analyze_pypdf_v2.py +218 -0
pdflinkcheck/cli.py +303 -27
pdflinkcheck/data/LICENSE +661 -0
pdflinkcheck/data/README.md +278 -0
pdflinkcheck/data/pyproject.toml +98 -0
pdflinkcheck/datacopy.py +60 -0
pdflinkcheck/dev.py +109 -0
pdflinkcheck/gui.py +477 -52
pdflinkcheck/io.py +213 -0
pdflinkcheck/report.py +280 -0
pdflinkcheck/stdlib_server.py +176 -0
pdflinkcheck/validate.py +380 -0
pdflinkcheck/version_info.py +83 -0
pdflinkcheck-1.1.72.dist-info/METADATA +322 -0
pdflinkcheck-1.1.72.dist-info/RECORD +21 -0
pdflinkcheck-1.1.72.dist-info/WHEEL +4 -0
{pdflinkcheck-1.1.7.dist-info → pdflinkcheck-1.1.72.dist-info}/entry_points.txt +1 -1
pdflinkcheck-1.1.72.dist-info/licenses/LICENSE +661 -0
pdflinkcheck/analyze.py +0 -330
pdflinkcheck/remnants.py +0 -142
pdflinkcheck-1.1.7.dist-info/METADATA +0 -109
pdflinkcheck-1.1.7.dist-info/RECORD +0 -10
pdflinkcheck-1.1.7.dist-info/WHEEL +0 -5
pdflinkcheck-1.1.7.dist-info/top_level.txt +0 -1

pdflinkcheck/analyze_pypdf_v2.py ADDED Viewed

@@ -0,0 +1,218 @@
+# src/pdflinkcheck/analyze_pypdf_v2.py
+import sys
+from pathlib import Path
+import logging
+from typing import Dict, Any, List
+from pypdf import PdfReader
+from pypdf.generic import Destination, NameObject, IndirectObject
+from pdflinkcheck.report import run_report
+#from pdflinkcheck.validate import run_validation
+"""
+Inspect target PDF for both URI links and GoTo links, using only pypdf (no PyMuPDF/Fitz).
+Fully fixed and improved version as of December 2025 (compatible with pypdf >= 4.0).
+"""
+def get_anchor_text_pypdf(page, rect) -> str:
+    """
+    Return the visible text that lies inside (or just around) a link rectangle.
+    The page text is extracted through a visitor callback; every non-blank fragment
+    whose approximate centre falls inside the tolerance-padded rectangle is kept,
+    in the order the visitor receives it.
+    Args:
+        page: Object exposing ``extract_text(visitor_text=...)``.
+        rect: Four numbers ``[x0, y0, x1, y1]``; the corners may be in either order.
+    Returns:
+        The whitespace-normalised anchor text, ``"Graphic/Empty Link"`` when no text
+        matched, ``"N/A: Missing Rect"`` when ``rect`` is empty/None, or
+        ``"N/A: Malformed Rect"`` when ``rect`` does not hold four numbers.
+    """
+    if not rect:
+        return "N/A: Missing Rect"
+    # A damaged annotation may carry too few or non-numeric entries. Report that
+    # instead of raising, so one bad link cannot abort extraction for the caller.
+    try:
+        x_a, y_a, x_b, y_b = (float(value) for value in list(rect)[:4])
+    except (TypeError, ValueError):
+        return "N/A: Malformed Rect"
+    # PDF coordinates: bottom-left origin. Some generators write the corners as
+    # [x_max, y_max, x_min, y_min], so normalise each axis to (min, max).
+    x_min, x_max = min(x_a, x_b), max(x_a, x_b)
+    y_min, y_max = min(y_a, y_b), max(y_a, y_b)
+    # Asymmetric tolerance: tight vertically (3pt) to avoid catching the lines
+    # above/below, wider horizontally (10pt) to absorb kerning/spacing offsets.
+    v_tol = 3
+    h_tol = 10
+    x_low, x_high = x_min - h_tol, x_max + h_tol
+    y_low, y_high = y_min - v_tol, y_max + v_tol
+    parts: List[str] = []
+    def visitor_body(text: str, cm, tm, font_dict, font_size):
+        # Guard against a missing/zero font size.
+        actual_font_size = font_size if font_size else 10
+        # tm[4], tm[5] is the text insertion point; nudge it up and to the right
+        # by a fraction of the font size to approximate the glyph centre.
+        char_center_x = tm[4] + (actual_font_size / 4)
+        char_center_y = tm[5] + (actual_font_size / 3)
+        if x_low <= char_center_x <= x_high and y_low <= char_center_y <= y_high:
+            if text.strip():
+                parts.append(text)
+    # The visitor is invoked during extraction; the returned page text is not needed.
+    page.extract_text(visitor_text=visitor_body)
+    cleaned = " ".join("".join(parts).split())
+    return cleaned if cleaned else "Graphic/Empty Link"
+def resolve_pypdf_destination(reader: PdfReader, dest) -> str:
+    """
+    Resolve a link destination (/Dest or /A /D) to a 1-based page number string.
+    Args:
+        reader: The open PdfReader the destination belongs to.
+        dest: The raw destination value taken from the annotation or action. It may be
+            None, an indirect reference, a name/string, an array, or a Destination.
+    Returns:
+        The page number as a string, ``"N/A"`` when ``dest`` is None, or
+        ``"Unknown/Error"`` when the destination cannot be resolved.
+    """
+    if dest is None:
+        return "N/A"
+    try:
+        # Follow an indirect reference to the object that holds the destination.
+        if isinstance(dest, IndirectObject):
+            dest = dest.get_object()
+        # Named destination (name or text string): look it up in the document's
+        # named-destination table, which maps names to Destination objects.
+        # NOTE(review): assumes the table keys match str(dest) - confirm against pypdf.
+        if isinstance(dest, str):
+            named = reader.named_destinations.get(str(dest))
+            if named is not None:
+                dest = named
+        # Explicit destination array: its first entry refers to the target page.
+        # Match that reference against each page's own indirect reference.
+        if isinstance(dest, list) and dest:
+            page_ref = dest[0]
+            if not isinstance(page_ref, IndirectObject):
+                page_ref = getattr(page_ref, "indirect_reference", None)
+            if page_ref is not None:
+                for index, page in enumerate(reader.pages):
+                    if page.indirect_reference == page_ref:
+                        return str(index + 1)
+        # Destination objects, and anything not matched above, go through the
+        # reader's own helper exactly as before.
+        page_num = reader.get_destination_page_number(dest)
+        return str(page_num + 1)
+    except Exception:
+        # Best-effort by design: an unresolvable destination must not abort the scan.
+        return "Unknown/Error"
+def extract_links_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
+    """
+    Extract all link annotations (URI, internal GoTo, remote GoToR) using pure pypdf.
+    Args:
+        pdf_path: Path to the PDF file to read.
+    Returns:
+        One dict per /Link annotation with the keys ``page`` (1-based), ``rect``,
+        ``link_text``, ``type`` and ``target``, plus ``url``, ``destination_page`` or
+        ``remote_file`` depending on the link type. Links matching no known kind keep
+        the defaults ``"Other Action"`` / ``"Unknown"``.
+    """
+    reader = PdfReader(pdf_path)
+    all_links: List[Dict[str, Any]] = []
+    for page_num, page in enumerate(reader.pages, start=1):
+        if "/Annots" not in page:
+            continue
+        for annot_ref in page["/Annots"]:
+            try:
+                annot = annot_ref.get_object()
+            except Exception:
+                continue  # Corrupted annotation - skip
+            if annot.get("/Subtype") != "/Link":
+                continue
+            rect = annot.get("/Rect")
+            link_dict: Dict[str, Any] = {
+                "page": page_num,
+                "rect": list(rect) if rect else None,
+                "link_text": get_anchor_text_pypdf(page, rect),
+                "type": "Other Action",
+                "target": "Unknown",
+            }
+            action = annot.get("/A")
+            # External URI link
+            if action and action.get("/URI"):
+                uri = action["/URI"]
+                link_dict.update({
+                    "type": "External (URI)",
+                    "url": str(uri),
+                    "target": str(uri),
+                })
+            # Remote GoToR (links to another PDF file). This must be tested before
+            # the internal branch: a GoToR action also carries /D, so checking /D
+            # first would classify every remote link as an internal one.
+            elif action and action.get("/S") == "/GoToR":
+                file_spec = action.get("/F")
+                remote_file = str(file_spec) if file_spec else "Unknown File"
+                remote_dest = action.get("/D")
+                remote_target = f"File: {remote_file}"
+                if remote_dest:
+                    remote_target += f" → Dest: {remote_dest}"
+                link_dict.update({
+                    "type": "Remote (GoToR)",
+                    "remote_file": remote_file,
+                    "target": remote_target,
+                })
+            # Internal GoTo - can be /Dest directly or inside /A /D
+            elif annot.get("/Dest") or (action and action.get("/D")):
+                dest = annot.get("/Dest") or (action and action["/D"])
+                target_page = resolve_pypdf_destination(reader, dest)
+                link_dict.update({
+                    "type": "Internal (GoTo/Dest)",
+                    "destination_page": target_page,
+                    "target": f"Page {target_page}",
+                })
+            all_links.append(link_dict)
+    return all_links
+def extract_toc_pypdf(pdf_path: Path | str) -> List[Dict[str, Any]]:
+    """
+    Read the PDF outline (bookmarks) and return it as a flat list.
+    Each entry is a dict with ``level`` (1 for top-level bookmarks, +1 per nesting
+    step), ``title`` (``"(Untitled)"`` when empty) and ``target_page`` (1-based page
+    number, or ``"N/A"`` when it cannot be resolved). Any failure is reported on
+    stderr and yields an empty list.
+    """
+    try:
+        reader = PdfReader(pdf_path)
+        bookmarks = reader.outline
+        if not bookmarks:
+            return []
+        def iter_entries(nodes: List, depth: int = 1):
+            # A nested list holds the children of the preceding bookmark.
+            for node in nodes:
+                if isinstance(node, Destination):
+                    try:
+                        number = reader.get_destination_page_number(node) + 1
+                    except Exception:
+                        number = "N/A"
+                    yield {
+                        "level": depth,
+                        "title": node.title or "(Untitled)",
+                        "target_page": number,
+                    }
+                elif isinstance(node, list):
+                    yield from iter_entries(node, depth + 1)
+        # Materialise inside the try so errors raised mid-walk are handled below.
+        return list(iter_entries(bookmarks))
+    except Exception as e:
+        print(f"TOC extraction error: {e}", file=sys.stderr)
+        return []
+def call_stable():
+    """
+    Entry point for command-line execution or integration with reporting module.
+    Runs the report with the pypdf engine selected.
+    """
+    # The keyword is spelled ``pdf_library`` at every run_report() call site in cli.py;
+    # the previous ``library_pdf`` spelling did not match them.
+    # NOTE(review): no pdf_path is passed here - confirm run_report() has a usable default.
+    run_report(pdf_library="pypdf")
+    # run_validation(pdf_library="pypdf")  # Uncomment if validation step is needed
+if __name__ == "__main__":
+    call_stable()
+    # pypdf version updates

pdflinkcheck/cli.py CHANGED Viewed

@@ -1,17 +1,105 @@
 # src/pdflinkcheck/cli.py
 import typer
+from typing import Literal
+from typer.models import OptionInfo
 from rich.console import Console
 from pathlib import Path
-from pdflinkcheck.analyze import run_analysis # Assuming core logic moves here
-from typing import Dict
-# Initialize the rich console for output
-console = Console()
+from pdflinkcheck.report import run_report # Assuming core logic moves here
+from typing import Dict, Optional, Union, List
+import pyhabitat
+import sys
+import os
+from importlib.resources import files
+from pdflinkcheck.version_info import get_version_from_pyproject
+from pdflinkcheck.validate import run_validation
+console = Console() # to be above the tkinter check, in case of console.print
 app = typer.Typer(
     name="pdflinkcheck",
-    help="A command-line tool for comprehensive PDF link analysis and reporting.",
-    add_completion=False
+    help=f"A command-line tool for comprehensive PDF link analysis and reporting. (v{get_version_from_pyproject()})",
+    add_completion=False,
+    invoke_without_command = True,
+    no_args_is_help = False,
 )
+@app.callback()
+def main(ctx: typer.Context):
+    """
+    If no subcommand is provided, launch the GUI.
+    """
+    # Bare invocation: hand over to the GUI command, then exit cleanly.
+    if ctx.invoked_subcommand is None:
+        gui_command()
+        raise typer.Exit(code=0)
+    # A subcommand was given: echo the raw argv, joined with spaces, before it runs.
+    typer.echo("command:\n" + " ".join(sys.argv) + "\n")
+# Optional developer aid: the experimental help-tree command is registered only when
+# the DEV_TYPER_HELP_TREE environment variable is exactly 'true' or '1'.
+if os.environ.get('DEV_TYPER_HELP_TREE') in {'true', '1'}:
+    # Imported lazily so normal runs never load the dev module.
+    from pdflinkcheck.dev import add_typer_help_tree
+    add_typer_help_tree(app=app, console=console)
+@app.command(name="docs", help="Show the docs for this software.")
+def docs_command(
+    license: Optional[bool] = typer.Option(
+        None, "--license", "-l", help="Show the full AGPLv3 license text."
+    ),
+    readme: Optional[bool] = typer.Option(
+        None, "--readme", "-r", help="Show the full README.md content."
+    ),
+):
+    """
+    Handles the pdflinkcheck docs command, either with flags or by showing help.
+    """
+    # Neither flag given: print a hint and return (no help text is printed here).
+    if not (license or readme):
+        console.print("[yellow]Please use either the --license or --readme flag.[/yellow]")
+        return
+    # (flag value, bundled file name, banner, message when the file is missing)
+    documents = (
+        (
+            license,
+            "LICENSE",
+            "\n[bold green]=== GNU AFFERO GENERAL PUBLIC LICENSE V3+ ===[/bold green]",
+            "[bold red]Error:[/bold red] The embedded license file could not be found.",
+        ),
+        (
+            readme,
+            "README.md",
+            "\n[bold green]=== pdflinkcheck README ===[/bold green]",
+            "[bold red]Error:[/bold red] The embedded README.md file could not be found.",
+        ),
+    )
+    for wanted, filename, banner, missing_message in documents:
+        if not wanted:
+            continue
+        try:
+            # The documents ship inside the package under pdflinkcheck.data.
+            text = (files("pdflinkcheck.data") / filename).read_text(encoding="utf-8")
+            console.print(banner)
+            console.print(text, highlight=False)
+        except FileNotFoundError:
+            console.print(missing_message)
+            raise typer.Exit(code=1)
+    # Exit successfully once every requested document has been printed.
+    raise typer.Exit(code=0)
 @app.command(name="analyze") # Added a command name 'analyze' for clarity
 def analyze_pdf( # Renamed function for clarity
     pdf_path: Path = typer.Argument(
@@ -22,42 +110,230 @@ def analyze_pdf( # Renamed function for clarity
         readable=True,
         resolve_path=True,
         help="The path to the PDF file to analyze."
-    ),
-    check_remnants: bool = typer.Option(
-        True,
-        "--check-remnants/--no-check-remnants",
-        help="Toggle checking for unlinked URLs/Emails in the text layer."
+    ),
+    export_format: Optional[Literal["JSON", "TXT", "JSON,TXT", "NONE"]] = typer.Option(
+        "JSON",
+        "--export-format","-e",
+        case_sensitive=False,
+        help="Export format. Use 'None' to suppress file export.",
     ),
     max_links: int = typer.Option(
-        50,
-        "--max-links",
+        0,
+        "--max-links", "-m",
         min=0,
-        help="Maximum number of links/remnants to display in the report. Use 0 to show all."
+        help="Report brevity control. Use 0 to show all."
+    ),
+    pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
+        "pypdf",#"pymupdf",
+        "--pdf-library","-p",
+        envvar="PDF_ENGINE",
+        help="Select PDF parsing library, pymupdf or pypdf.",
     )
 ):
     """
-    Analyzes the specified PDF file for all internal, external, and unlinked URI/Email references.
+    Analyzes the specified PDF file for all internal, external, and unlinked references.
+    Checks:
+    • Internal GoTo links point to valid pages
+    • Remote GoToR links point to existing files
+    • TOC bookmarks target valid pages
+    """
+    """
+    Fun Typer fact:
+    Overriding Order
+    Environment variables sit in the middle of the "priority" hierarchy:
+    CLI Flag: (Highest priority) analyze -p pypdf will always win.
+    Env Var: If no flag is present, it checks PDF_ENGINE.
+    Code Default: (Lowest priority) It falls back to "pypdf" as defined in your typer.Option.
     """
-    # The actual heavy lifting (analysis and printing) is now in run_analysis
-    run_analysis(
+    VALID_FORMATS = ("JSON") # extend later
+    requested_formats = [fmt.strip().upper() for fmt in export_format.split(",")]
+    if "NONE" in requested_formats or not export_format.strip() or export_format == "0":
+        export_formats = ""
+    else:
+        # Filter for valid ones: ("JSON", "TXT")
+        # This allows "JSON,TXT" to become "JSONTXT" which your run_report logic can handle
+        valid = [f for f in requested_formats if f in ("JSON", "TXT")]
+        export_formats = "".join(valid)
+        if not valid and "NONE" not in requested_formats:
+            typer.echo(f"Warning: No valid formats found in '{export_format}'. Supported: JSON, TXT.")
+    run_report(
         pdf_path=str(pdf_path),
-        check_remnants=check_remnants,
-        max_links=max_links
+        max_links=max_links,
+        export_format = export_formats,
+        pdf_library = pdf_library,
+    )
+@app.command(name="validate")
+def validate_pdf(
+    pdf_path: Optional[Path] = typer.Argument(
+        None,
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        readable=True,
+        resolve_path=True,
+        help="Path to the PDF file to validate. If omitted, searches current directory."
+    ),
+    export: bool = typer.Option(
+        True,
+        # Paired flag so the default-on export can actually be switched off.
+        "--export/--no-export",
+        help = "JSON export for validation check."
+    ),
+    pdf_library: Literal["pypdf", "pymupdf"] = typer.Option(
+        "pypdf",
+        "--library", "-l",
+        envvar="PDF_ENGINE",
+        help="PDF parsing engine: pypdf (pure Python) or pymupdf (faster, if available)"
+    ),
+    fail_on_broken: bool = typer.Option(
+        False,
+        "--fail",
+        help="Exit with code 1 if any broken links are found (useful for CI)"
+    )
+):
+    """
+    Validate internal, remote, and TOC links in a PDF.
+    1. Call the run_report() function, like calling the 'analyze' CLI command.
+    2. Inspects the results from 'run_report():
+        - Are referenced files available?
+        - Are the page numbers referenced by GoTo links within the length of the document?
+    """
+    # NOTE: the docstring above is left verbatim because Typer shows it as the
+    # command's help text (no help= is passed to the decorator).
+    # Exit codes: 1 when no PDF can be found, or when --fail is set and broken
+    # links exist; 0 in every other case, including "nothing to validate".
+    from pdflinkcheck.io import get_first_pdf_in_cwd
+    if pdf_path is None:
+        pdf_path = get_first_pdf_in_cwd()
+        if pdf_path is None:
+            console.print("[red]Error: No PDF file provided and none found in current directory.[/red]")
+            raise typer.Exit(code=1)
+        console.print(f"[dim]No file specified — using: {pdf_path.name}[/dim]")
+    pdf_path_str = str(pdf_path)
+    console.print(f"[bold]Validating links in:[/bold] {pdf_path.name}")
+    console.print(f"[bold]Using engine:[/bold] {pdf_library}\n")
+    # Step 1: Run analysis (quietly: no printing, no file export)
+    report = run_report(
+        pdf_path=pdf_path_str,
+        max_links=0,
+        export_format="",
+        pdf_library=pdf_library,
+        print_bool=False
+    )
+    if not report or not report.get("data"):
+        console.print("[yellow]No links or TOC found — nothing to validate.[/yellow]")
+        raise typer.Exit(code=0)
+    # Step 2: Run validation
+    validation_results = run_validation(
+        report_results=report,
+        pdf_path=pdf_path_str,
+        pdf_library=pdf_library,
+        export_json=export,
+        print_bool=True
+    )
+    stats = validation_results["summary-stats"]
+    broken_count = stats["broken-page"] + stats["broken-file"]
+    if broken_count == 0:
+        console.print(f"\n[bold green]Success:[/bold green] No broken links or TOC issues!")
+        raise typer.Exit(code=0)
+    if fail_on_broken:
+        console.print(f"\n[bold red]Validation failed:[/bold red] {broken_count} broken link(s) found.")
+        raise typer.Exit(code=1)
+    # Broken links without --fail are only a warning. The previous final
+    # "exit 1 whenever anything is broken" made the --fail flag meaningless.
+    console.print(f"\n[bold yellow]Warning:[/bold yellow] {broken_count} broken link(s) found.")
+    raise typer.Exit(code=0)
+@app.command(name="serve")
+def serve(
+    host: str = typer.Option("0.0.0.0", "--host", "-h", help="Host to bind (use 0.0.0.0 for network access)"),
+    port: int = typer.Option(8000, "--port", "-p", help="Port to listen on"),
+    reload: bool = typer.Option(False, "--reload", help="Auto-reload on code changes (dev only)"),
+):
+    """
+    Start the built-in web server for uploading and analyzing PDFs in the browser.
+    Pure stdlib — no extra dependencies. Works great on Termux!
+    """
+    # NOTE: the docstring above is left verbatim because Typer shows it as the
+    # command's help text (no help= is passed to the decorator).
+    import errno
+    # A wildcard bind address is not something to browse to, so the printed URL
+    # points at localhost in that case; the server still binds to `host`.
+    browse_host = "localhost" if host in ("0.0.0.0", "::", "") else host
+    console.print(f"[bold green]Starting pdflinkcheck web server[/bold green]")
+    console.print(f"   → Open your browser at: [bold blue]http://{browse_host}:{port}[/bold blue]")
+    console.print(f"   → Upload a PDF to analyze links and TOC")
+    if reload:
+        # Within this function the flag is only echoed; it is not passed on to the server.
+        console.print("   → [yellow]Reload mode enabled[/yellow]")
+    # Import here to avoid slow imports on other commands
+    from pdflinkcheck.stdlib_server import ThreadedTCPServer, PDFLinkCheckHandler
+    try:
+        with ThreadedTCPServer((host, port), PDFLinkCheckHandler) as httpd:
+            console.print(f"[green]Server running — press Ctrl+C to stop[/green]\n")
+            httpd.serve_forever()
+    except OSError as e:
+        # Check the error code first: the message text differs between platforms.
+        # The original substring match is kept as a fallback.
+        if e.errno == errno.EADDRINUSE or "Address already in use" in str(e):
+            console.print(f"[red]Error: Port {port} is already in use.[/red]")
+            console.print("Try a different port with --port 8080")
+        else:
+            console.print(f"[red]Server error: {e}[/red]")
+        raise typer.Exit(code=1)
+    except KeyboardInterrupt:
+        console.print("\n[bold yellow]Server stopped.[/bold yellow]")
+        raise typer.Exit(code=0)
 @app.command(name="gui")
-def gui():
+def gui_command(
+    auto_close: int = typer.Option(0,
+                                   "--auto-close", "-c",
+                                   help = "Delay in milliseconds after which the GUI window will close (for automated testing). Use 0 to disable auto-closing.",
+                                   min=0)
+    )->None:
     """
     Launch tkinter-based GUI.
     """
+    # This function is reached two ways: through Typer (which parses --auto-close
+    # into an int) and by a direct Python call with no arguments from main(), in
+    # which case `auto_close` is still the typer.Option(...) default object.
+    # --- START FIX ---
+    assured_auto_close_value = 0
+    if isinstance(auto_close, OptionInfo):
+        # Case 1: Called implicitly from main() (pdflinkcheck with no args)
+        # We received the metadata object, so use the function's default value (0).
+        # Nothing to do here since assured_auto_close_value is already 0.
+        pass
+    else:
+        # Case 2: Called explicitly by Typer (pdflinkcheck gui -c 3000)
+        # Typer has successfully converted the command line argument, and auto_close is an int.
+        assured_auto_close_value = int(auto_close)
+    # --- END FIX ---
+    # Check for tkinter before importing the GUI module; if it is unavailable,
+    # print the shared failure hints and return.
+    if not pyhabitat.tkinter_is_available():
+        _gui_failure_msg()
+        return
     from pdflinkcheck.gui import start_gui
-    try:
-        start_gui()
-    except Exception as e:
-        typer.echo("GUI failed to launch")
-        typer.echo("Ensure tkinter is available, especially if using WSLg.")
-        typer.echo(f"Error: {e}")
+    start_gui(time_auto_close = assured_auto_close_value)
+# --- Helper, consistent gui failure message. ---
+def _gui_failure_msg():
+    """Print the standard hints shown when the tkinter GUI cannot be started."""
+    fixed_hints = (
+        "[bold red]GUI failed to launch[/bold red]",
+        "Ensure pdflinkcheck dependecies are installed and the venv is activated (the dependecies are managed by uv).",
+        "The dependecies for pdflinkcheck are managed by uv.",
+        "Ensure Tkinter is available, especially if using WSLg.",
+        "On Termux/Android, GUI is not supported. Use 'pdflinkcheck analyze <file.pdf>' instead.",
+    )
+    for hint in fixed_hints:
+        console.print(hint)
+    # Queried last, after the hints are printed, as a diagnostic for bug reports.
+    console.print(f"pyhabitat.tkinter_is_available() = {pyhabitat.tkinter_is_available()}")
-    # Placeholder for running the app
 if __name__ == "__main__":
     app()

pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl

pdflinkcheck 1.1.7__py3-none-any.whl → 1.1.72__py3-none-any.whl