PyPI - pdflinkcheck - Versions diffs - 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl - Mend

pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

pdflinkcheck/__init__.py CHANGED Viewed

@@ -1,29 +1,82 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
 # src/pdflinkcheck/__init__.py
 """
-# License information
 pdflinkcheck - A PDF Link Checker
-Copyright (C) 2025 George Clayton Bennett
 Source code: https://github.com/City-of-Memphis-Wastewater/pdflinkcheck/
-This program is free software: You can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-The AGPL3+ is required because pdflinkcheck uses PyMuPDF, which is licensed under the AGPL3.
 """
+from __future__ import annotations
 import os as _os
 # Library functions
-from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf, extract_toc_pymupdf
-from pdflinkcheck.analyze_pypdf import extract_links_pypdf, extract_toc_pypdf
-#from pdflinkcheck import analyze_pypdf
-from pdflinkcheck.report import run_report
-from pdflinkcheck.report import run_report as run_analysis # for backwards compatibility with previos versions
 #from pdflinkcheck import dev
+# Lazy-loaded orchestrator
+def run_report(pdf_path: str, export_format: str = "JSON", pdf_library: str = "auto", print_bool: bool = True):
+    """
+    Run a full link check report on a PDF file.
+    Args:
+        pdf_path: Path to the PDF file.
+        export_format: "JSON", "TXT", or both (e.g., "JSON,TXT").
+        pdf_library: "auto", "pdfium", "pymupdf", or "pypdf".
+        print_bool: If True, prints the overview to stdout.
+    """
+    from pdflinkcheck.report import run_report_and_call_exports as _run
+    return _run(pdf_path=pdf_path, export_format=export_format, pdf_library=pdf_library, print_bool=print_bool)
+# --- pypdf ---
+def analyze_pdf_pypdf(path):
+    try:
+        from pdflinkcheck.analysis_pypdf import analyze_pdf as _analyze
+    except ImportError:
+        raise ImportError(
+            "pypdf engine is not installed. "
+            "Install pypdf to enable pypdf support."
+        )
+    return _analyze(path)
+analyze_pdf_pypdf.__doc__ = (
+    "Analyze a PDF using the lightweight pypdf engine and return a normalized dictionary.\n\n"
+    "See pdflinkcheck.analyze_pypdf for full details."
+)
+# --- PyMuPDF ---
+def analyze_pdf_pymupdf(path):
+    try:
+        from pdflinkcheck.analysis_pymupdf import analyze_pdf as _analyze
+    except ImportError:
+        raise ImportError(
+            "PyMuPDF engine is not installed. "
+            "Install with the [mupdf] extra to enable PyMuPDF support."
+        )
+    return _analyze(path)
+analyze_pdf_pymupdf.__doc__ = (
+    "Analyze a PDF using the AGPL3-licensed PyMuPDF engine and return a normalized dictionary.\n\n"
+    "See pdflinkcheck.analyze_pymupdf for full details."
+)
+# --- PDFium ---
+def analyze_pdf_pdfium(path):
+    try:
+        from pdflinkcheck.analysis_pdfium import analyze_pdf as _analyze
+    except ImportError:
+        raise ImportError(
+            "PDFium engine is not installed. "
+            "Install with the [pdfium] extra to enable pdfium support."
+        )
+    return _analyze(path)
+analyze_pdf_pdfium.__doc__ = (
+    "Analyze a PDF using the PDFium engine and return a normalized dictionary.\n\n"
+    "See pdflinkcheck.analyze_pdfium for full details."
+)
+# -----------------------------
+# GUI easter egg
+# -----------------------------
 # For the kids. This is what I wanted when learning Python in a mysterious new REPL.
 # Is this Pythonic? No. Oh well. PEP 8, PEP 20.
 # Why is this not Pythonic? Devs expect no side effects when importing library functions.
@@ -32,33 +85,47 @@ _gui_easteregg_env_flag = _os.environ.get('PDFLINKCHECK_GUI_EASTEREGG', '')
 _load_gui_func = str(_gui_easteregg_env_flag).strip().lower() in ('true', '1', 'yes', 'on')
 if _load_gui_func:
     try:
+        print("Easter egg, attemping.")
         import pyhabitat as _pyhabitat # pyhabitat is a dependency of this package already
+        print(f"pyhabitat.tkinter_is_available() = {_pyhabitat.tkinter_is_available()}")
         if _pyhabitat.tkinter_is_available():
             from pdflinkcheck.gui import start_gui
+            print("Success: pdflinkcheck.start_gui() function loaded as top-level pmlibrary function.")
     except ImportError:
         # Optional: log or ignore silently
         print("start_gui() not imported")
 # Breadcrumbs, for stumbling upon.
 if _load_gui_func:
     __pdflinkcheck_gui_easteregg_enabled__ = True
 else:
     __pdflinkcheck_gui_easteregg_enabled__ = False
+# -----------------------------
+# Public API
+# -----------------------------
 # Define __all__ such that the library functions are self documenting.
 __all__ = [
     "run_report",
-    "run_analysis",
-    "extract_links_pymupdf",
-    "extract_toc_pymupdf",
-    "extract_links_pypdf",
-    "extract_toc_pypdf",
-    #"start_gui" if _load_gui_func else None,
-    #"dev",
+    "analyze_pdf_pymupdf",
+    "analyze_pdf_pypdf",
+    "analyze_pdf_pdfium",
 ]
+# Handle the Easter Egg export
 if _load_gui_func:
     __all__.append("start_gui")
+# Handle dev module if you want it public
+try:
+    from pdflinkcheck import dev
+    __all__.append("dev")
+except ImportError:
+    pass
 # 4. THE CLEANUP (This removes items from dir())
 del _os
 del _gui_easteregg_env_flag

pdflinkcheck/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# src/pdflinkcheck/__main__.py
+from __future__ import annotations
+from pdflinkcheck.cli import app
+if __name__ == "__main__":
+    app()

pdflinkcheck/analysis_pdfium.py ADDED Viewed

@@ -0,0 +1,131 @@
+# src/pdflinkcheck/analysis_pdfium.py
+from __future__ import annotations
+import ctypes
+from typing import List, Dict, Any
+from pdflinkcheck.helpers import PageRef
+from pdflinkcheck.environment import pdfium_is_available
+from pdflinkcheck.helpers import PageRef
+try:
+    if pdfium_is_available():
+        import pypdfium2 as pdfium
+        import pypdfium2.raw as pdfium_c
+    else:
+        pdfium = None
+        pdfium_c = None
+except ImportError:
+    pdfium = None
+    pdfium_c = None
+def analyze_pdf(path: str) -> Dict[str, Any]:
+    # 1. Guard the entry point
+    if not pdfium_is_available() or pdfium is None:
+        raise ImportError(
+            "pypdfium2 is not installed. "
+            "\nInstall it with: \n\tpip install pdflinkcheck[pdfium] \n\t OR \n\t uv sync --extra pdfium"
+        )
+    doc = pdfium.PdfDocument(path)
+    total_pages = len(doc) # or doc.page_count
+    links = []
+    toc_list = []
+    file_ov = {}
+    seen_toc = set()
+    file_ov["total_pages"] = total_pages
+    # 1. TOC Extraction (Matches PyMuPDF logic)
+    for item in doc.get_toc():
+        title = item.get_title() if hasattr(item, "get_title") else ""
+        dest = item.get_dest()
+        page_idx = PageRef.from_index(dest.get_index()).machine if dest else 0
+        if title or page_idx > 0:
+            key = (title, page_idx)
+            if key not in seen_toc:
+                toc_list.append({"level": item.level + 1, "title": title, "target_page": page_idx})
+                seen_toc.add(key)
+    # 2. Link Enumeration
+    for page_index in range(len(doc)):
+        page = doc.get_page(page_index)
+        text_page = page.get_textpage()
+        source_ref = PageRef.from_index(page_index)
+        # --- A. EXTERNAL WEB LINKS ---
+        pagelink_raw = pdfium_c.FPDFLink_LoadWebLinks(text_page.raw)
+        if pagelink_raw:
+            count = pdfium_c.FPDFLink_CountWebLinks(pagelink_raw)
+            for i in range(count):
+                buflen = pdfium_c.FPDFLink_GetURL(pagelink_raw, i, None, 0)
+                url = ""
+                if buflen > 0:
+                    buffer = (pdfium_c.c_uint16 * buflen)()
+                    pdfium_c.FPDFLink_GetURL(pagelink_raw, i, buffer, buflen)
+                    url = ctypes.string_at(buffer, (buflen-1)*2).decode('utf-16le')
+                l, t, r, b = (ctypes.c_double() for _ in range(4))
+                pdfium_c.FPDFLink_GetRect(pagelink_raw, i, 0, ctypes.byref(l), ctypes.byref(t), ctypes.byref(r), ctypes.byref(b))
+                rect = [l.value, b.value, r.value, t.value]
+                links.append({
+                    'page': source_ref.machine,
+                    'rect': rect,
+                    'link_text': text_page.get_text_bounded(left=l.value, top=t.value, right=r.value, bottom=b.value).strip() or url,
+                    'type': 'External (URI)',
+                    'url': url,
+                    'target': url,
+                    'source_kind': 'pypdfium2_weblink'
+                })
+            pdfium_c.FPDFLink_CloseWebLinks(pagelink_raw)
+        # --- B. INTERNAL GOTO LINKS (Standard Annotations) ---
+        # We iterate through standard link annotations for GoTo actions
+        pos = 0
+        while True:
+            annot_raw = pdfium_c.FPDFPage_GetAnnot(page.raw, pos)
+            if not annot_raw:
+                break
+            subtype = pdfium_c.FPDFAnnot_GetSubtype(annot_raw)
+            if subtype == pdfium_c.FPDF_ANNOT_LINK:
+                # Get Rect
+                fs_rect = pdfium_c.FS_RECTF()
+                pdfium_c.FPDFAnnot_GetRect(annot_raw, fs_rect)
+                # Try to get Destination
+                link_annot = pdfium_c.FPDFAnnot_GetLink(annot_raw)
+                dest = pdfium_c.FPDFLink_GetDest(doc.raw, link_annot)
+                if dest:
+                    dest_idx = pdfium_c.FPDFDest_GetDestPageIndex(doc.raw, dest)
+                    dest_ref = PageRef.from_index(dest_idx)
+                    links.append({
+                        'page': source_ref.machine,
+                        'rect': [fs_rect.left, fs_rect.bottom, fs_rect.right, fs_rect.top],
+                        'link_text': text_page.get_text_bounded(left=fs_rect.left, top=fs_rect.top, right=fs_rect.right, bottom=fs_rect.bottom).strip(),
+                        'type': 'Internal (GoTo/Dest)',
+                        'destination_page': dest_ref.machine,
+                        'target': dest_ref.machine,
+                        'source_kind': 'pypdfium2_annot'
+                    })
+            # Note: We don't close annot here if we are just enumerating by index
+            # in some builds, but standard practice is to increment pos
+            pos += 1
+        page.close()
+        text_page.close()
+    doc.close()
+    return {"links": links, "toc": toc_list, "file_ov": file_ov}
+if __name__ == "__main__":
+    import json
+    import sys
+    filename = "temOM.pdf"
+    results = analyze_pdf(filename)
+    print(json.dumps(results, indent=2))

pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} RENAMED Viewed

@@ -1,3 +1,7 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# pdflinkcheck/analysis_pymupdf.py
+from __future__ import annotations
 import sys
 from pathlib import Path
 import logging
@@ -5,20 +9,46 @@ from typing import Dict, Any, Optional, List
 logging.getLogger("fitz").setLevel(logging.ERROR)
+from pdflinkcheck.environment import pymupdf_is_available
+from pdflinkcheck.helpers import PageRef
 try:
-    import fitz  # PyMuPDF
+    if pymupdf_is_available():
+        import fitz  # PyMuPDF
+    else:
+        fitz = None
 except ImportError:
     fitz = None
-from pdflinkcheck.report import run_report
-#from pdflinkcheck.validate import run_validation
 """
 Inspect target PDF for both URI links and for GoTo links.
 """
+def analyze_pdf(pdf_path: str):
+    data = {}
+    data["links"] = []
+    data["toc"] = []
+    data["file_ov"] = {}
+    try:
+        doc = fitz.open(pdf_path)
+    except Exception as e:
+        print(f"fitz.open() failed: {e}")
+        return data
+    extracted_links = extract_links_pymupdf(doc)
+    structural_toc = extract_toc_pymupdf(doc)
+    page_count = doc.page_count
+    data["links"] = extracted_links
+    data["toc"] = structural_toc
+    data["file_ov"]["total_pages"] = page_count
+    data["file_ov"]["pdf_name"] = Path(pdf_path).name
+    return data
 # Helper function: Prioritize 'from'
-def get_link_rect(link_dict):
+def _get_link_rect(link_dict):
     """
     Retrieves the bounding box for the link using the reliable 'from' key
     provided by PyMuPDF's link dictionary.
@@ -44,6 +74,19 @@ def get_link_rect(link_dict):
     return None
 def get_anchor_text(page, link_rect):
+    """
+    Extracts text content using the link's bounding box coordinates.
+    The bounding box is slightly expanded to ensure full characters are captured.
+    Args:
+        page: The fitz.Page object where the link is located.
+        link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
+                   link's bounding box.
+    Returns:
+        The cleaned, extracted text string, or a placeholder message
+        if no text is found or if an error occurs.
+    """
     if not link_rect:
         return "N/A: Missing Rect"
@@ -81,57 +124,6 @@ def get_anchor_text(page, link_rect):
     except Exception:
         return "N/A: Rect Error"
-def get_anchor_text_stable(page, link_rect):
-    """
-    Extracts text content using the link's bounding box coordinates.
-    The bounding box is slightly expanded to ensure full characters are captured.
-    Args:
-        page: The fitz.Page object where the link is located.
-        link_rect: A tuple of four floats (x0, y0, x1, y1) representing the
-                   link's bounding box.
-    Returns:
-        The cleaned, extracted text string, or a placeholder message
-        if no text is found or if an error occurs.
-    """
-    if not link_rect:
-        return "N/A: Missing Rect"
-    try:
-        # 1. Convert the coordinate tuple back to a fitz.Rect object
-        rect = fitz.Rect(link_rect)
-        # --- CRITICAL STEP: Check for invalid/empty rect AFTER conversion ---
-        # If the rect is invalid (e.g., width or height is <= 0), skip it
-        # Note: fitz.Rect will often auto-normalize, but this explicit check is safer.
-        if rect.is_empty or rect.width <= 0 or rect.height <= 0:
-            return "N/A: Rect Error (Zero/Negative Dimension)"
-        # 2. Expand the rect slightly to capture full characters (1 unit in each direction)
-        #    This method avoids the proprietary/unstable 'from_expanded' or 'from_rect' methods.
-        expanded_rect = fitz.Rect(
-            rect.x0 - 1,
-            rect.y0 - 1,
-            rect.x1 + 1,
-            rect.y1 + 1
-        )
-        # 3. Get the text within the expanded bounding box
-        anchor_text = page.get_textbox(expanded_rect)
-        # 4. Clean up whitespace and non-printing characters
-        cleaned_text = " ".join(anchor_text.split())
-        if cleaned_text:
-            return cleaned_text
-        else:
-            return "N/A: No Visible Text"
-    except Exception:
-        # Fallback for unexpected errors in rect conversion or retrieval
-        return "N/A: Rect Error"
 def analyze_toc_fitz(doc):
     """
     Extracts the structural Table of Contents (PDF Bookmarks/Outline)
@@ -144,23 +136,28 @@ def analyze_toc_fitz(doc):
         A list of dictionaries, where each dictionary represents a TOC entry
         with 'level', 'title', and 'target_page' (1-indexed).
     """
     toc = doc.get_toc()
     toc_data = []
     for level, title, page_num in toc:
         # fitz pages are 1-indexed for TOC!
+        # We know fitz gives us a human number.
+        # We convert it to a physical index for our internal storage.
+        # page_num is 1 (Human). We normalize to 0 (Physical).
+        ref = PageRef.from_human(page_num)
         toc_data.append({
             'level': level,
             'title': title,
-            'target_page': page_num
+            #'target_page': ref.index
+            'target_page': ref.machine
         })
     return toc_data
 # 2. Updated Main Inspection Function to Include Text Extraction
 #def inspect_pdf_hyperlinks_fitz(pdf_path):
-def extract_toc_pymupdf(pdf_path):
+def extract_toc_pymupdf(doc):
     """
     Opens a PDF, iterates through all pages and extracts the structural table of contents (TOC/bookmarks).
@@ -171,7 +168,7 @@ def extract_toc_pymupdf(pdf_path):
         A list of dictionaries representing the structural TOC/bookmarks.
     """
     try:
-        doc = fitz.open(pdf_path)
         structural_toc = analyze_toc_fitz(doc)
     except Exception as e:
         print(f"An error occurred: {e}", file=sys.stderr)
@@ -206,133 +203,100 @@ def serialize_fitz_object(obj):
     return obj
-def extract_links_pymupdf(pdf_path):
-    """
-    Opens a PDF, iterates through all pages and extracts all link annotations.
-    It categorizes the links into External, Internal, or Other actions, and extracts the anchor text.
-    Args:
-        pdf_path: The file system path (str) to the target PDF document.
-    Returns:
-        A list of dictionaries, where each dictionary is a comprehensive
-           representation of an active hyperlink found in the PDF.
-    """
+def extract_links_pymupdf(doc):
     links_data = []
-    try:
-        doc = fitz.open(pdf_path)
+    try:
+        # This represents the maximum valid 0-index in the doc
+        last_page_ref = PageRef.from_pymupdf_total_page_count(doc.page_count)
+        #print(last_page_ref)       # Output: "358" (Because of __str__)
+        #print(int(last_page_ref))  # Output: 357   (Because of __int__)
         for page_num in range(doc.page_count):
             page = doc.load_page(page_num)
-            for link in page.get_links():
+            source_ref = PageRef.from_index(page_num)
-                page_obj = doc.load_page(page_num)
-                link_rect = get_link_rect(link)
-                rect_obj = link.get("from")
-                xref = link.get("xref")
-                #print(f"rect_obj = {rect_obj}")
-                #print(f"xref = {xref}")
-                # --- Examples of various keys associated with various link instances ---
-                #print(f"keys: list(link) = {list(link)}")
-                # keys: list(link) = ['kind', 'xref', 'from', 'page', 'viewrect', 'id']
-                # keys: list(link) = ['kind', 'xref', 'from', 'uri', 'id']
-                # keys: list(link) = ['kind', 'xref', 'from', 'page', 'view', 'id']
-                # 1. Extract the anchor text
-                anchor_text = get_anchor_text(page_obj, link_rect)
-                # 2. Extract the target and kind
-                target = ""
-                kind = link.get('kind')
+            for link in page.get_links():
+                link_rect = _get_link_rect(link)
+                anchor_text = get_anchor_text(page, link_rect)
                 link_dict = {
-                    'page': int(page_num) + 1, # accurate for link location, add 1
+                    'page': source_ref.machine,
                     'rect': link_rect,
                     'link_text': anchor_text,
-                    'xref':xref
+                    'xref': link.get("xref")
                 }
-                # A. Clean Geom. Objects: Use the helper function on 'to' / 'destination'
-                # Use the clean serialize_fitz_object() helper function on all keys that might contain objects
+                kind = link.get('kind')
                 destination_view = serialize_fitz_object(link.get('to'))
+                p_index = link.get('page') # excpeted to be human facing, per PyMuPDF's known quirks
+                # --- CASE 1: INTERNAL JUMPS (GoTo) ---
+                if p_index is not None:
-                # B. Correct Internal Link Page Numbering (The -1 correction hack)
-                # This will be skipped by URI, which is not expected to have a page key
-                target_page_num_reported = "N/A"
-                if link.get('page') is not None:
-                    target_page_num_reported = int(link.get('page'))+1 # accurate for link target, don't add 1 (weird)
+                    # Ensure we are working with an integer
+                    raw_pymupdf_idx = int(p_index)
+                    corrected_machine_idx = PageRef.corrected_down(raw_pymupdf_idx).index
+                    # Logic: Normalize to 0-index and store as int
+                    idx = min(corrected_machine_idx, int(last_page_ref))
+                    #print(f"DEBUG: Link Text: {anchor_text} | Raw p_index: {p_index}")
+                    #print(f"[DEBUG] idx: {idx}")
+                    dest_ref = PageRef.from_index(idx) # does not impact the value
-                if link['kind'] == fitz.LINK_URI:
-                    target =  link.get('uri', 'URI (Unknown Target)')
                     link_dict.update({
-                        'type': 'External (URI)',
-                        'url': link.get('uri'),
-                        'target': target
+                        'destination_page': dest_ref.machine,
+                        'destination_view': destination_view,
+                        'target': dest_ref.machine,          # INT (MACHINE INDEX)
                     })
+                    if kind == fitz.LINK_GOTO:
+                        link_dict['type'] = 'Internal (GoTo/Dest)'
+                    else:
+                        link_dict['type'] = 'Internal (Resolved Action)'
+                        link_dict['source_kind'] = kind
-                elif link['kind'] == fitz.LINK_GOTO:
-                    target = f"Page {target_page_num_reported}"
+                # --- CASE 2: EXTERNAL URIs ---
+                elif kind == fitz.LINK_URI:
+                    uri = link.get('uri', 'URI (Unknown Target)')
                     link_dict.update({
-                        'type': 'Internal (GoTo/Dest)',
-                        'destination_page': target_page_num_reported,
-                        'destination_view': destination_view,
-                        'target': target
+                        'type': 'External (URI)',
+                        'url': uri,
+                        'target': uri # STRING (URL)
                     })
-                elif link['kind'] == fitz.LINK_GOTOR:
+                # --- CASE 3: REMOTE PDF REFERENCES ---
+                elif kind == fitz.LINK_GOTOR:
+                    remote_file = link.get('file', 'Remote File')
                     link_dict.update({
                         'type': 'Remote (GoToR)',
                         'remote_file': link.get('file'),
-                        'destination': destination_view
+                        'target': remote_file  # STRING (File Path)
                     })
-                elif link.get('page') is not None and link['kind'] != fitz.LINK_GOTO:
-                    target = f"Page {target_page_num_reported}"
-                    link_dict.update({
-                        'type': 'Internal (Resolved Action)',
-                        'destination_page': target_page_num_reported,
-                        'destination_view': destination_view,
-                        'source_kind': link.get('kind'),
-                        'target': target
-                    })
+                # --- CASE 4: OTHERS ---
                 else:
-                    target = link.get('url') or link.get('remote_file') or link.get('target')
                     link_dict.update({
                         'type': 'Other Action',
-                        'action_kind': link.get('kind'),
-                        'target': target
+                        'action_kind': kind,
+                        'target': 'Unknown'  # STRING
                     })
-                ## --- General Serialization Cleaner ---
-                #for key, value in link_dict.items():
-                #    if hasattr(value, 'rect') and hasattr(value, 'point'):
-                #        # This handles Rect and Point objects that may slip through
-                #        link_dict[key] = str(value)
-                ## --- End Cleaner ---
                 links_data.append(link_dict)
         doc.close()
     except Exception as e:
         print(f"An error occurred: {e}", file=sys.stderr)
     return links_data
 def call_stable():
     """
     Placeholder function for command-line execution (e.g., in __main__).
     Note: This requires defining PROJECT_NAME, CLI_MAIN_FILE, etc., or
     passing them as arguments to run_report.
     """
-    run_report(pdf_library = "pymupdf")
-    #run_validation(pdf_library = "pymupdf")
+    from pdflinkcheck.report import run_report_and_call_exports
+    run_report_and_call_exports(pdf_library = "pymupdf")
 if __name__ == "__main__":
     call_stable()

pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl

pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl