PyPI - pdflinkcheck - Versions diffs - 1.1.7__py3-none-any.whl - Mend

pdflinkcheck 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

pdflinkcheck/__init__.py +0 -0
pdflinkcheck/analyze.py +330 -0
pdflinkcheck/cli.py +63 -0
pdflinkcheck/gui.py +165 -0
pdflinkcheck/remnants.py +142 -0
pdflinkcheck-1.1.7.dist-info/METADATA +109 -0
pdflinkcheck-1.1.7.dist-info/RECORD +10 -0
pdflinkcheck-1.1.7.dist-info/WHEEL +5 -0
pdflinkcheck-1.1.7.dist-info/entry_points.txt +3 -0
pdflinkcheck-1.1.7.dist-info/top_level.txt +1 -0

pdflinkcheck/__init__.py ADDED Viewed

File without changes

pdflinkcheck/analyze.py ADDED Viewed

@@ -0,0 +1,330 @@
+import sys
+from pathlib import Path
+import logging
+from typing import Dict, Any
+# Configure logging to suppress low-level PyMuPDF ("fitz") messages
+logging.getLogger("fitz").setLevel(logging.ERROR)
+import fitz # PyMuPDF
+from pdflinkcheck.remnants import find_link_remnants
+"""
+Inspect target PDF for both URI links and for GoTo links.
+"""
+# Helper function: Prioritize 'from'
+def get_link_rect(link_dict):
+    """
+    Return the link's bounding box as an (x0, y0, x1, y1) tuple.
+    The rectangle is read from the 'from' entry of ``link_dict``. When that
+    entry is missing or falsy, None is returned instead.
+    """
+    source_rect = link_dict.get('from')
+    if not source_rect:
+        return None
+    # Flatten the rect object into plain coordinates via its corner attributes.
+    corners = (source_rect.x0, source_rect.y0, source_rect.x1, source_rect.y1)
+    return corners
+def get_pdf_file():
+    """
+    Prompt on stdin for the path of the PDF to analyze.
+    Pressing Enter without typing anything selects the built-in example
+    file name. If the chosen path does not exist, "File not found!" is
+    printed and the process exits with status 1.
+    Returns the chosen path as a string.
+    """
+    example_path = "TE Maxson WWTF O&M Manual.pdf"
+    print(f"example path = {example_path}")
+    pdf_file = input("Paste path to PDF file (or press Enter to accept example): ")
+    if not pdf_file:
+        pdf_file = example_path
+    # exists is a method: without the call, the bound method object is always
+    # truthy and the missing-file branch could never run.
+    if not Path(pdf_file).exists():
+        print("File not found!")
+        sys.exit(1)
+    return pdf_file
+def get_anchor_text(page, link_rect):
+    """
+    Return the visible text lying under a link's bounding box.
+    ``link_rect`` is a coordinate tuple (or None). The box is grown by one
+    unit on every side before the page text is read, and runs of whitespace
+    in the result are collapsed to single spaces. When no text can be
+    produced, one of the "N/A: ..." placeholder strings is returned.
+    """
+    if not link_rect:
+        return "N/A: Missing Rect"
+    try:
+        box = fitz.Rect(link_rect)
+        # Reject degenerate boxes explicitly instead of relying on fitz to
+        # normalise them.
+        if box.is_empty or box.width <= 0 or box.height <= 0:
+            return "N/A: Rect Error (Zero/Negative Dimension)"
+        # One unit of padding so characters touching the edge are captured.
+        padded_box = fitz.Rect(box.x0 - 1, box.y0 - 1, box.x1 + 1, box.y1 + 1)
+        raw_text = page.get_textbox(padded_box)
+        collapsed = " ".join(raw_text.split())
+        return collapsed or "N/A: No Visible Text"
+    except Exception:
+        # Any failure while converting the rect or reading the text.
+        return "N/A: Rect Error"
+def analyze_toc_fitz(doc):
+    """
+    Return the document outline (bookmarks) as a list of dictionaries.
+    Each (level, title, page) entry yielded by ``doc.get_toc()`` becomes a
+    dict with the keys 'level', 'title' and 'target_page'. Page numbers are
+    passed through unchanged (the TOC is 1-indexed, unlike page indices).
+    """
+    return [
+        {'level': depth, 'title': heading, 'target_page': page_number}
+        for depth, heading, page_number in doc.get_toc()
+    ]
+# 2. Updated Main Inspection Function to Include Text Extraction
+def _describe_link(page, page_num, link):
+    """
+    Build the report dictionary for one raw link entry found on ``page``.
+    ``page_num`` is the 0-based index of ``page``; the returned 'page' value
+    is 1-based. Every result carries 'page', 'rect', 'link_text', 'xref' and
+    'type'; the remaining keys depend on the kind of link.
+    """
+    link_rect = get_link_rect(link)
+    kind = link.get('kind')
+    link_dict = {
+        'page': int(page_num) + 1,
+        'rect': link_rect,
+        'link_text': get_anchor_text(page, link_rect),
+        'xref': link.get('xref'),
+    }
+    if kind == fitz.LINK_URI:
+        link_dict.update({
+            'type': 'External (URI)',
+            'url': link.get('uri'),
+            'target': link.get('uri', 'URI (Unknown Target)'),
+        })
+    elif kind == fitz.LINK_GOTO:
+        # Destination pages are 0-based; the report uses 1-based numbers.
+        destination_page = int(link.get('page')) + 1
+        link_dict.update({
+            'type': 'Internal (GoTo/Dest)',
+            'destination_page': destination_page,
+            'destination_view': link.get('to'),
+            'target': f"Page {destination_page}",
+        })
+    elif kind == fitz.LINK_GOTOR:
+        link_dict.update({
+            'type': 'Remote (GoToR)',
+            'remote_file': link.get('file'),
+            'destination': link.get('to'),
+        })
+    elif link.get('page') is not None:
+        # Some other action kind that still resolves to a page in this file.
+        link_dict.update({
+            'type': 'Internal (Resolved Action)',
+            'destination_page': int(link.get('page')) + 1,
+            'destination_view': link.get('to'),
+            'source_kind': kind,
+        })
+    else:
+        # Raw link entries expose their target under 'uri' / 'file' (the keys
+        # read by the branches above); the previous lookups are kept as
+        # fallbacks so nothing that used to resolve is lost.
+        target = (
+            link.get('uri') or link.get('file')
+            or link.get('url') or link.get('remote_file') or link.get('target')
+        )
+        link_dict.update({
+            'type': 'Other Action',
+            'action_kind': kind,
+            'target': target,
+        })
+    return link_dict
+def inspect_pdf_hyperlinks_fitz(pdf_path):
+    """
+    Collect every link annotation and the bookmark outline of a PDF.
+    Returns a ``(links_data, structural_toc)`` tuple: a list with one
+    dictionary per link and the list produced by ``analyze_toc_fitz``.
+    Errors are reported on stderr rather than raised; whatever was collected
+    before the failure is returned, so both lists are empty when the file
+    cannot be opened at all.
+    """
+    links_data = []
+    # Bound before the try block so the return below cannot hit an unbound
+    # name when fitz.open() itself fails.
+    structural_toc = []
+    try:
+        # The context manager closes the document even if a page raises.
+        with fitz.open(pdf_path) as doc:
+            structural_toc = analyze_toc_fitz(doc)
+            for page_num in range(doc.page_count):
+                # Load each page once and reuse it for all of its links.
+                page = doc.load_page(page_num)
+                for link in page.get_links():
+                    links_data.append(_describe_link(page, page_num, link))
+    except Exception as e:
+        print(f"An error occurred: {e}", file=sys.stderr)
+    return links_data, structural_toc
+def print_structural_toc(structural_toc):
+    """
+    Print the bookmark outline as an indented listing on stdout.
+    ``structural_toc`` is a list of dicts with 'level', 'title' and
+    'target_page'. Each nesting level adds four spaces of indentation, and
+    page numbers are right-aligned to the width of the largest one.
+    """
+    divider = "-" * 50
+    print("\n## 📚 Structural Table of Contents (PDF Bookmarks/Outline)")
+    print(divider)
+    if not structural_toc:
+        print("No structural TOC (bookmarks/outline) found.")
+        return
+    # Width of the widest page number, so the page column lines up.
+    widest_page = max(entry['target_page'] for entry in structural_toc)
+    page_width = len(str(widest_page))
+    for entry in structural_toc:
+        # Level 1 gets no indent, level 2 four spaces, level 3 eight, ...
+        indent = " " * (4 * (entry['level'] - 1))
+        page_str = str(entry['target_page']).rjust(page_width)
+        print(f"{indent}{entry['title']} . . . page {page_str}")
+    print(divider)
+def run_analysis(pdf_path: str, check_remnants: bool, max_links: int) -> Dict[str, Any]:
+    """
+    Analyze one PDF with PyMuPDF and print a link report to stdout.
+    pdf_path: path of the PDF file to inspect.
+    check_remnants: when true, also scan the text for unlinked URLs/emails.
+    max_links: maximum rows printed per report section; if <= 0, every row
+        is printed. The cap applies to the remnants section as well.
+    Returns a dict with the keys 'external_links', 'internal_links',
+    'remnants' and 'toc', or an empty dict when the file yields no links,
+    no remnants and no TOC.
+    """
+    pdf_name = Path(pdf_path).name
+    print(f"Running PyMuPDF analysis on {pdf_name}...")
+    # 1. Extract all active links and the TOC
+    extracted_links, structural_toc = inspect_pdf_hyperlinks_fitz(pdf_path)
+    # 2. Find link remnants; active links are passed so they can be excluded
+    remnants = find_link_remnants(pdf_path, extracted_links) if check_remnants else []
+    if not extracted_links and not remnants and not structural_toc:
+        print(f"\nNo hyperlinks, remnants, or structural TOC found in {pdf_name}.")
+        return {}
+    # 3. Split the links by their 'type' key
+    grouped_types = ('External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)')
+    uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
+    goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
+    resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
+    other_links = [link for link in extracted_links if link['type'] not in grouped_types]
+    all_internal = goto_links + resolved_action_links
+    uri_and_other = uri_links + other_links
+    total_internal_links = len(all_internal)
+    # None as a slice bound means "no cap"; every section below uses it so
+    # that max_links <= 0 consistently shows everything.
+    limit = max_links if max_links > 0 else None
+    # --- ANALYSIS SUMMARY ---
+    print(f"\n--- Link Analysis Results for {pdf_name} ---")
+    print(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
+    print(f"Total **structural TOC entries (bookmarks)** found: {len(structural_toc)}")
+    print(f"Total **potential missing links** found: {len(remnants)}")
+    print("-" * 50)
+    # --- Section 1: ACTIVE URI LINKS ---
+    print(f"\n## 🔗 Active URI Links (External & Other) - {len(uri_and_other)} found")
+    print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
+    print("-" * 75)
+    if uri_and_other:
+        for i, link in enumerate(uri_and_other[:limit], 1):
+            target = link.get('url') or link.get('remote_file') or link.get('target')
+            link_text = link.get('link_text', 'N/A')
+            print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
+        if limit is not None and len(uri_and_other) > limit:
+            print(f"... and {len(uri_and_other) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
+    else:
+        print("  No external or 'Other' links found.")
+    # --- Section 2: ACTIVE INTERNAL JUMPS ---
+    print(f"\n## 🖱️ Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
+    print("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
+    print("-" * 75)
+    if all_internal:
+        for i, link in enumerate(all_internal[:limit], 1):
+            link_text = link.get('link_text', 'N/A')
+            print("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
+        if limit is not None and len(all_internal) > limit:
+            print(f"... and {len(all_internal) - limit} more links (use --max-links to see all or --max-links 0 to show all).")
+    else:
+        print("  No internal GoTo or Resolved Action links found.")
+    # --- Section 3: REMNANTS ---
+    print("\n" + "=" * 70)
+    print(f"## ⚠️ Link Remnants (Potential Missing Links to Fix) - {len(remnants)} found")
+    print("=" * 70)
+    if remnants:
+        print("{:<5} | {:<5} | {:<15} | {}".format("Idx", "Page", "Remnant Type", "Text Found (Needs Hyperlink)"))
+        print("-" * 75)
+        # Slicing with max_links directly would print nothing for 0.
+        for i, remnant in enumerate(remnants[:limit], 1):
+            print("{:<5} | {:<5} | {:<15} | {}".format(i, remnant['page'], remnant['type'], remnant['text']))
+        if limit is not None and len(remnants) > limit:
+            print(f"... and {len(remnants) - limit} more remnants (use --max-links to see all).")
+    else:
+        print("  No URI or Email remnants found that are not already active links.")
+    # --- Section 4: TOC ---
+    print_structural_toc(structural_toc)
+    # Return the collected data for potential future JSON/other output
+    return {
+        "external_links": uri_links,
+        "internal_links": all_internal,
+        "remnants": remnants,
+        "toc": structural_toc
+    }
+def call_stable():
+    """
+    Run an interactive analysis when this module is executed as a script.
+    The PDF path is requested through ``get_pdf_file()``. The remaining
+    arguments use the same values as the CLI defaults (remnant check on,
+    at most 50 rows per section), because ``run_analysis`` has no defaults
+    of its own and cannot be called without arguments.
+    """
+    print("Begin analysis...")
+    run_analysis(pdf_path=get_pdf_file(), check_remnants=True, max_links=50)
+    print("Analysis complete.")
+if __name__ == "__main__":
+    call_stable()

pdflinkcheck/cli.py ADDED Viewed

@@ -0,0 +1,63 @@
+# src/pdflinkcheck/cli.py
+import typer
+from rich.console import Console
+from pathlib import Path
+from pdflinkcheck.analyze import run_analysis # Assuming core logic moves here
+from typing import Dict
+# Rich console object for terminal output
+console = Console()
+# Typer application object; shell-completion installation options are disabled
+app = typer.Typer(
+    add_completion=False,
+    help="A command-line tool for comprehensive PDF link analysis and reporting.",
+    name="pdflinkcheck",
+)
+@app.command(name="analyze")  # Registered as the 'analyze' sub-command
+def analyze_pdf(
+    pdf_path: Path = typer.Argument(
+        ...,
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        readable=True,
+        resolve_path=True,
+        help="The path to the PDF file to analyze."
+    ),
+    check_remnants: bool = typer.Option(
+        True,
+        "--check-remnants/--no-check-remnants",
+        help="Toggle checking for unlinked URLs/Emails in the text layer."
+    ),
+    max_links: int = typer.Option(
+        50,
+        "--max-links",
+        min=0,
+        help="Maximum number of links/remnants to display in the report. Use 0 to show all."
+    )
+):
+    """
+    Analyzes the specified PDF file for all internal, external, and unlinked URI/Email references.
+    """
+    # Analysis and printing both happen inside run_analysis; this command
+    # only converts the validated Path to a string and forwards the options.
+    path_text = str(pdf_path)
+    run_analysis(path_text, check_remnants, max_links)
+@app.command(name="gui")
+def gui():
+    """
+    Launch tkinter-based GUI.
+    """
+    try:
+        # Imported inside the try block: importing the GUI module is what
+        # fails when tkinter is not installed, and that is exactly the case
+        # the hint below is meant to explain.
+        from pdflinkcheck.gui import start_gui
+        start_gui()
+    except Exception as e:
+        typer.echo("GUI failed to launch", err=True)
+        typer.echo("Ensure tkinter is available, especially if using WSLg.", err=True)
+        typer.echo(f"Error: {e}", err=True)
+        # Report the failure to the calling shell instead of exiting with 0.
+        raise typer.Exit(code=1)
+# Run the Typer application when this module is executed directly
+if __name__ == "__main__":
+    app()

pdflinkcheck/gui.py ADDED Viewed

@@ -0,0 +1,165 @@
+# src/pdflinkcheck/gui.py
+import tkinter as tk
+from tkinter import filedialog, ttk
+import sys
+from pathlib import Path
+# Import the core analysis function
+from pdflinkcheck.analyze import run_analysis
+class RedirectText:
+    """File-like adapter that appends everything written to it to a Tkinter Text widget."""
+    def __init__(self, text_widget):
+        # Widget that receives the text; it must be in an editable state
+        # whenever write() is called.
+        self.text_widget = text_widget
+    def write(self, string):
+        """Append ``string`` to the widget, scroll to the end and refresh the display."""
+        target = self.text_widget
+        target.insert(tk.END, string)
+        target.see(tk.END)
+        target.update_idletasks()
+    def flush(self):
+        """No-op; present only because file-like objects are expected to provide it."""
+        return None
+class PDFLinkCheckerApp(tk.Tk):
+    """
+    Main window of the PDF Link Checker GUI.
+    The top frame holds the file chooser and the report options; the bottom
+    frame holds a read-only text area that receives everything
+    ``run_analysis`` prints.
+    """
+    def __init__(self):
+        super().__init__()
+        self.title("PDF Link Checker")
+        self.geometry("800x600")
+        # Style for the application
+        style = ttk.Style(self)
+        style.theme_use('clam')
+        # Tk variables bound to the input widgets
+        self.pdf_path = tk.StringVar(value="")
+        self.check_remnants_var = tk.BooleanVar(value=True)
+        self.max_links_var = tk.StringVar(value="50")
+        self.show_all_links_var = tk.BooleanVar(value=False)
+        self._create_widgets()
+    def _create_widgets(self):
+        """Build the control frame (top) and the output frame (bottom)."""
+        # --- Control Frame (Top) ---
+        control_frame = ttk.Frame(self, padding="10")
+        control_frame.pack(fill='x')
+        # File Selection
+        ttk.Label(control_frame, text="PDF Path:").grid(row=0, column=0, padx=5, pady=5, sticky='w')
+        ttk.Entry(control_frame, textvariable=self.pdf_path, width=60).grid(row=0, column=1, padx=5, pady=5, sticky='ew')
+        ttk.Button(control_frame, text="Browse...", command=self._select_pdf).grid(row=0, column=2, padx=5, pady=5)
+        # Options
+        ttk.Checkbutton(
+            control_frame,
+            text="Check for Remnants (URLs/Emails)",
+            variable=self.check_remnants_var
+        ).grid(row=1, column=0, padx=5, pady=5, sticky='w')
+        ttk.Checkbutton(
+            control_frame,
+            text="Show All Links (Override Max)",
+            variable=self.show_all_links_var,
+            # Disables the max-links entry while this box is checked
+            command=self._toggle_max_links_entry
+        ).grid(row=2, column=0, padx=5, pady=5, sticky='w')
+        ttk.Label(control_frame, text="Max Links to Display:").grid(row=1, column=1, padx=5, pady=5, sticky='e')
+        self.max_links_entry = ttk.Entry(control_frame, textvariable=self.max_links_var, width=10)
+        self.max_links_entry.grid(row=1, column=2, padx=5, pady=5, sticky='w')
+        # Run Button: on its own row so it does not share the grid cell of
+        # the "Show All Links" checkbutton in row 2.
+        ttk.Button(control_frame, text="▶ Run Analysis", command=self._run_analysis_gui, style='Accent.TButton').grid(row=3, column=0, columnspan=3, pady=10)
+        control_frame.grid_columnconfigure(1, weight=1)
+        # --- Output Frame (Bottom) ---
+        output_frame = ttk.Frame(self, padding="10")
+        output_frame.pack(fill='both', expand=True)
+        ttk.Label(output_frame, text="Analysis Report Output:").pack(fill='x')
+        # Scrollable Text Widget for output
+        self.output_text = tk.Text(output_frame, wrap=tk.WORD, state=tk.DISABLED, bg='#333333', fg='white', font=('Monospace', 10))
+        scrollbar = ttk.Scrollbar(output_frame, command=self.output_text.yview)
+        self.output_text['yscrollcommand'] = scrollbar.set
+        # The scrollbar is packed first so it claims the full-height strip on
+        # the right; packed after the expanding text widget it would only get
+        # whatever space is left over.
+        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
+        self.output_text.pack(fill='both', expand=True, padx=5, pady=5)
+    def _select_pdf(self):
+        """Open a file dialog and store the chosen path in ``pdf_path``."""
+        file_path = filedialog.askopenfilename(
+            defaultextension=".pdf",
+            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
+        )
+        if file_path:
+            self.pdf_path.set(file_path)
+    def _toggle_max_links_entry(self):
+        """Disables/enables the max_links entry based on show_all_links_var."""
+        new_state = tk.DISABLED if self.show_all_links_var.get() else tk.NORMAL
+        self.max_links_entry.config(state=new_state)
+    def _run_analysis_gui(self):
+        """Validate the inputs, run the analysis and show its printed report."""
+        pdf_path_str = self.pdf_path.get()
+        # An empty string must be rejected explicitly: Path("") refers to the
+        # current directory, which exists. is_file() also rejects directories.
+        if not pdf_path_str or not Path(pdf_path_str).is_file():
+            self._display_error("Error: PDF file not found or path is invalid.")
+            return
+        if self.show_all_links_var.get():
+            # Pass 0 to the backend, which analyze.py interprets as "Show All"
+            max_links_to_pass = 0
+        else:
+            try:
+                max_links_to_pass = int(self.max_links_var.get())
+            except ValueError:
+                self._display_error("Error: Max Links must be an integer.")
+                return
+            if max_links_to_pass <= 0:
+                self._display_error("Error: Max Links must be a positive number (or use 'Show All').")
+                return
+        # 1. Clear previous output and enable editing
+        self.output_text.config(state=tk.NORMAL)
+        self.output_text.delete('1.0', tk.END)
+        # 2. Redirect standard output to the Text widget
+        original_stdout = sys.stdout
+        sys.stdout = RedirectText(self.output_text)
+        try:
+            # 3. Call the core logic function
+            self.output_text.insert(tk.END, "--- Starting Analysis ---\n")
+            run_analysis(
+                pdf_path=pdf_path_str,
+                check_remnants=self.check_remnants_var.get(),
+                max_links=max_links_to_pass
+            )
+            self.output_text.insert(tk.END, "\n--- Analysis Complete ---\n")
+        except Exception as e:
+            self._display_error(f"An unexpected error occurred during analysis: {e}")
+        finally:
+            # 4. Restore standard output and disable editing
+            sys.stdout = original_stdout
+            self.output_text.config(state=tk.DISABLED)
+    def _display_error(self, message):
+        """Replace the output area's content with ``message`` rendered in red."""
+        self.output_text.config(state=tk.NORMAL)
+        self.output_text.delete('1.0', tk.END)
+        self.output_text.insert(tk.END, f"[ERROR] {message}\n", 'error')
+        self.output_text.tag_config('error', foreground='red')
+        self.output_text.config(state=tk.DISABLED)
+def start_gui():
+    """Create the main window and run the Tk event loop until the window is closed."""
+    PDFLinkCheckerApp().mainloop()
+if __name__ == "__main__":
+    start_gui()

pdflinkcheck/remnants.py ADDED Viewed

@@ -0,0 +1,142 @@
+import re
+import fitz
+# Matches scheme-prefixed URLs (http, https, mhtml, file, ftp) as well as
+# bare "www." addresses; matching ignores case.
+URI_PATTERN = re.compile(r'(?:https?|mhtml|file|ftp):\/\/\S+|\bwww\.\S+\b', flags=re.IGNORECASE)
+# Matches e-mail addresses of the form local-part@domain.tld; matching
+# ignores case.
+EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', flags=re.IGNORECASE)
+def clean_ex_rect(ex_rect_tuple):
+    """
+    Validate a rectangle given as a string or as a 4-item sequence.
+    A string is split on commas and/or whitespace and each part is converted
+    to float; the result is a list of four floats. A list or tuple is
+    returned unchanged when it holds exactly four ints/floats. Anything else
+    (wrong length, unparsable text, other types such as None) yields None.
+    """
+    if isinstance(ex_rect_tuple, str):
+        # Consecutive delimiters ("1, 2,,3") leave empty pieces; drop them.
+        pieces = [piece.strip() for piece in re.split(r'[,\s]+', ex_rect_tuple.strip())]
+        try:
+            numbers = [float(piece) for piece in pieces if piece]
+        except ValueError:
+            return None
+        return numbers if len(numbers) == 4 else None
+    if not isinstance(ex_rect_tuple, (list, tuple)):
+        return None
+    if len(ex_rect_tuple) != 4:
+        return None
+    if any(not isinstance(value, (int, float)) for value in ex_rect_tuple):
+        return None
+    return ex_rect_tuple
+def _index_link_rects(existing_links):
+    """
+    Group the bounding boxes of already-active links by 1-based page number.
+    Report dictionaries (carrying 'rect' and 'page') are filed under their
+    page. Dictionaries that only carry a 'from' rectangle are filed under
+    None, which the caller treats as "applies to every page", because their
+    source page is not known from the dictionary alone.
+    """
+    rects_by_page = {}
+    for link in existing_links:
+        if link.get('rect') is not None:
+            raw_rect, page_key = link['rect'], link.get('page')
+        else:
+            raw_rect, page_key = link.get('from'), None
+        if raw_rect is None:
+            continue
+        if not isinstance(raw_rect, (str, list, tuple)):
+            # e.g. a fitz.Rect: unpack it into plain numbers first
+            try:
+                raw_rect = tuple(raw_rect)
+            except TypeError:
+                continue
+        coords = clean_ex_rect(raw_rect)
+        if coords:
+            rects_by_page.setdefault(page_key, []).append(fitz.Rect(coords))
+    return rects_by_page
+def find_link_remnants(pdf_path, existing_links):
+    """
+    Scans the PDF for text that looks like a URI or email but is not a registered link annotation.
+    existing_links: the link dictionaries already extracted from the same
+        file. Their bounding box is read from 'rect' (falling back to 'from')
+        and is compared only against text on the link's own page.
+    Returns a list of dicts with 'page' (1-based), 'type' ('URI Remnant' or
+    'Email Remnant'), 'text' and 'rect'.
+    """
+    rects_by_page = _index_link_rects(existing_links)
+    patterns = (('URI Remnant', URI_PATTERN), ('Email Remnant', EMAIL_PATTERN))
+    remnants_data = []
+    # The context manager closes the document even if a page raises.
+    with fitz.open(pdf_path) as doc:
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            active_rects = rects_by_page.get(page_num + 1, []) + rects_by_page.get(None, [])
+            for block in page.get_text("blocks"):
+                # Each block is (x0, y0, x1, y1, text, block_no, block_type)
+                text = block[4]
+                for remnant_type, pattern in patterns:
+                    for match in pattern.finditer(text):
+                        remnant_text = match.group(0)
+                        # Locate the matched text on the page to get its box.
+                        # NOTE(review): only the first occurrence on the page
+                        # is considered, as before.
+                        text_instances = page.search_for(remnant_text)
+                        if not text_instances:
+                            continue
+                        found_rect = text_instances[0]
+                        # Text overlapping an active link is not a remnant.
+                        if any(rect.intersects(found_rect) for rect in active_rects):
+                            continue
+                        remnants_data.append({
+                            'page': page_num + 1,
+                            'type': remnant_type,
+                            'text': remnant_text,
+                            'rect': tuple(found_rect)
+                        })
+    return remnants_data

pdflinkcheck-1.1.7.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,109 @@
+Metadata-Version: 2.4
+Name: pdflinkcheck
+Version: 1.1.7
+Summary: A purpose-built PDF link analysis reporting tool.
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+Requires-Dist: pymupdf>=1.26.6
+Requires-Dist: rich>=14.2.0
+Requires-Dist: typer>=0.20.0
+Requires-Dist: pyhabitat>=1.0.52
+Provides-Extra: dev
+Requires-Dist: ruff>=0.1.13; extra == "dev"
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+# pdflinkcheck
+A purpose-built tool for comprehensive analysis of hyperlinks and link remnants within PDF documents, primarily using the PyMuPDF library.
+Use the CLI or the GUI.
+---
+### Graphical User Interface (GUI)
+The tool can be run using a simple cross-platform graphical interface (Tkinter):
+![Screenshot of the pdflinkcheck GUI](https://raw.githubusercontent.com/City-of-Memphis-Wastewater/pdflinkcheck/main/assets/pdflinkcheck_gui.png)
+To launch the GUI, use the command: `pdflinkcheck-gui`
+---
+### ✨ Features
+* **Active Link Extraction:** Identifies and categorizes all programmed links (External URIs, Internal GoTo/Destinations, Remote Jumps).
+* **Anchor Text Retrieval:** Extracts the visible text corresponding to each link's bounding box.
+* **Remnant Detection:** Scans the document's text layer for unlinked URIs and email addresses that should potentially be converted into active links.
+* **Structural TOC:** Extracts the PDF's internal Table of Contents (bookmarks/outline).
+---
+### 📥 Installation (Recommended via `pipx`)
+The recommended way to install `pdflinkcheck` is using `pipx`, which installs Python applications in isolated environments, preventing dependency conflicts.
+```bash
+# Ensure you have pipx installed first (if not, run: pip install pipx)
+pipx install pdflinkcheck
+```
+**Note for Developers:** If you prefer a traditional virtual environment or are developing locally, use `pip`:
+```bash
+# From the root of the project
+pip install .
+```
+---
+### 🚀 Usage
+The main command is `pdflinkcheck analyze`.
+```bash
+# Basic usage: Analyze a PDF and check for remnants (default behavior)
+pdflinkcheck analyze "path/to/my/document.pdf"
+```
+#### Command Options
+|**Option**|**Description**|**Default**|
+|---|---|---|
+|`<PDF_PATH>`|**Required.** The path to the PDF file to analyze.|N/A|
+|`--check-remnants / --no-check-remnants`|Toggle scanning the text layer for unlinked URLs/Emails.|`--check-remnants`|
+|`--max-links INTEGER`|Maximum number of links/remnants to display in the detailed report sections. Use `0` to show all.|`50`|
+|`--help`|Show command help and exit.|N/A|
+#### Example Run
+```bash
+pdflinkcheck analyze "TE Maxson WWTF O&M Manual.pdf" --max-links 10
+```
+### Run from source
+```
+git clone http://github.com/city-of-memphis-wastewater/pdflinkcheck.git
+cd pdflinkcheck
+uv sync
+python src/pdflinkcheck/analyze.py
+```
+---
+### ⚠️ Platform Compatibility Note
+This tool relies on the `PyMuPDF` library, which requires specific native dependencies (like MuPDF) that may not be available on all platforms.
+**Known Incompatibility:** This tool is **not officially supported** and may fail to run on environments like **Termux (Android)** due to underlying C/C++ library compilation issues with PyMuPDF. It is recommended for use on standard Linux, macOS, or Windows operating systems.
+---
+### Document Compatibility
+While `pdflinkcheck` uses the robust PyMuPDF library, not all PDF files can be processed successfully. This tool is designed primarily for digitally generated (vector-based) PDFs.
+Processing may fail or yield incomplete results for:
+* **Scanned PDFs** (images of text) that lack an accessible text layer.
+* **Encrypted or Password-Protected** documents.
+* **Malformed or non-standard** PDF files.

pdflinkcheck-1.1.7.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+pdflinkcheck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pdflinkcheck/analyze.py,sha256=wtj1fNvMl5553FYdHmd3K82ve2lHaDW68qBVITig2cQ,12982
+pdflinkcheck/cli.py,sha256=vo_2BF7A4jaR_Qvd4AZ8RIqlwV10Die2RbWc9Er6wQo,1872
+pdflinkcheck/gui.py,sha256=8uzaKqE0aVLzAGIwD52rbEJKfEHdi4R6S8fO8bPs8rI,6432
+pdflinkcheck/remnants.py,sha256=xgunD4hDDT0SqD9SywvPc5DLSLNLA6O0BL0KOuLQwV8,6151
+pdflinkcheck-1.1.7.dist-info/METADATA,sha256=SjgFk5-n8SlurKY0pjwZtTWtXt42vgz0KuYTFY729a4,3725
+pdflinkcheck-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pdflinkcheck-1.1.7.dist-info/entry_points.txt,sha256=Ql8fOpnnAGZ23DWcq0J97bPBafrP0rl8x9aVpSLh5Cs,100
+pdflinkcheck-1.1.7.dist-info/top_level.txt,sha256=WdBg8l6l3TF1HQDpR_PwSmBCSu5atKWFnPfNbRNwrME,13
+pdflinkcheck-1.1.7.dist-info/RECORD,,

pdflinkcheck-1.1.7.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

pdflinkcheck-1.1.7.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+pdflinkcheck = pdflinkcheck.cli:app
+pdflinkcheck-gui = pdflinkcheck.gui:start_gui

pdflinkcheck-1.1.7.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ pdflinkcheck