pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.73__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,382 @@
+ # src/pdflinkcheck/validate.py
+
+ import sys
+ from pathlib import Path
+ from typing import Dict, Any
+
+ from pdflinkcheck.report import run_report
+ from pdflinkcheck.io import get_friendly_path, export_validation_json, get_first_pdf_in_cwd
+
+ SEP_COUNT = 28
+
+ def run_validation(
+     report_results: Dict[str, Any],
+     pdf_path: str,
+     pdf_library: str = "pypdf",
+     check_external: bool = False,
+     export_json: bool = True,
+     print_bool: bool = True
+ ) -> Dict[str, Any]:
+     """
+     Validates links using the output from run_report().
+
+     Args:
+         report_results: The dict returned by run_report()
+         pdf_path: Path to the original PDF (needed for relative file checks and page count)
+         pdf_library: Engine used ("pypdf" or "pymupdf")
+         check_external: Whether to validate HTTP URLs (requires network + requests)
+         export_json: Whether to export the validation results to JSON
+         print_bool: Whether to print results to console
+
+     Returns:
+         Validation summary stats with valid/broken counts and detailed issues
+     """
+     data = report_results.get("data", {})
+     metadata = report_results.get("metadata", {})
+
+     all_links = data.get("external_links", []) + data.get("internal_links", [])
+     toc = data.get("toc", [])
+
+     if not all_links and not toc:
+         if print_bool:
+             print("No links or TOC to validate.")
+         return {"summary-stats": {"valid": 0, "broken": 0}, "issues": []}
+
+     # Get total page count (critical for internal validation)
+     try:
+         if pdf_library == "pymupdf":
+             import fitz
+             doc = fitz.open(pdf_path)
+             total_pages = doc.page_count
+             doc.close()
+         else:
+             from pypdf import PdfReader
+             reader = PdfReader(pdf_path)
+             total_pages = len(reader.pages)
+     except Exception as e:
+         if print_bool:
+             print(f"Could not determine page count: {e}")
+         total_pages = None
+
+     pdf_dir = Path(pdf_path).parent
+
+     issues = []
+     valid_count = 0
+     broken_file_count = 0
+     broken_page_count = 0
+     file_found_count = 0
+     unknown_web_count = 0
+     unknown_reasonableness_count = 0
+     unknown_link_count = 0
+
+     # Validate active links
+     for i, link in enumerate(all_links):
+         link_type = link.get("type")
+         status = "valid"
+         reason = None
+         if link_type in ("Internal (GoTo/Dest)", "Internal (Resolved Action)"):
+             try:
+                 target_page = int(link.get("destination_page"))
+             except (TypeError, ValueError):
+                 target_page = None
+             if not isinstance(target_page, int):
+                 status = "broken-page"
+                 reason = f"Target page not a number: {link.get('destination_page')}"
+             elif (1 <= target_page) and total_pages is None:
+                 status = "unknown-reasonableness"
+                 reason = "Total page count unavailable, but the page number is reasonable"
+             elif 1 <= target_page <= total_pages:
+                 status = "valid"
+                 reason = f"Page {target_page} within range (1–{total_pages})"
+             elif target_page < 1:
+                 status = "broken-page"
+                 reason = f"Link targets a non-positive page: {target_page}."
+             else:
+                 status = "broken-page"
+                 reason = f"Page {target_page} out of range (1–{total_pages})"
+
+         elif link_type == "Remote (GoToR)":
+             remote_file = link.get("remote_file")
+             if not remote_file:
+                 status = "broken-file"
+                 reason = "Missing remote file name"
+             else:
+                 target_path = (pdf_dir / remote_file).resolve()
+                 if target_path.exists() and target_path.is_file():
+                     status = "file-found"
+                     reason = f"Found: {target_path.name}"
+                 else:
+                     status = "broken-file"
+                     reason = f"File not found: {remote_file}"
+
+         elif link_type == "External (URI)":
+             url = link.get("url")
+             if url and url.startswith(("http://", "https://")) and check_external:
+                 # Optional: add requests-based check later
+                 status = "unknown-web"
+                 reason = "External URL validation not enabled"
+             else:
+                 status = "unknown-web"
+                 reason = "External link (no network check)"
+
+         else:
+             status = "unknown-link"
+             reason = "Other/unsupported link type"
+
+         link_with_val = link.copy()
+         link_with_val["validation"] = {"status": status, "reason": reason}
+
+         if status == "valid":
+             valid_count += 1
+         elif status == "file-found":
+             file_found_count += 1
+         elif status == "unknown-web":
+             unknown_web_count += 1
+         elif status == "unknown-reasonableness":
+             unknown_reasonableness_count += 1
+         elif status == "unknown-link":
+             unknown_link_count += 1
+         elif status == "broken-page":
+             broken_page_count += 1
+             issues.append(link_with_val)
+         elif status == "broken-file":
+             broken_file_count += 1
+             issues.append(link_with_val)
+
+     # Validate TOC entries
+     for entry in toc:
+         try:
+             target_page = int(entry.get("target_page"))
+         except (TypeError, ValueError):
+             target_page = None
+         if isinstance(target_page, int):
+             if (1 <= target_page) and total_pages is None:
+                 reason = "Page count unknown"
+                 status = "unknown-reasonableness"
+                 unknown_reasonableness_count += 1
+             elif target_page < 1:
+                 status = "broken-page"
+                 broken_page_count += 1
+                 reason = f"TOC targets a non-positive page: {target_page}."
+             elif 1 <= target_page <= total_pages:
+                 valid_count += 1
+                 continue
+             else:
+                 status = "broken-page"
+                 reason = f"TOC targets page {target_page} (out of 1–{total_pages})"
+                 broken_page_count += 1
+         else:
+             status = "broken-page"
+             reason = f"Invalid page: {entry.get('target_page')}"
+             broken_page_count += 1
+
+         issues.append({
+             "type": "TOC Entry",
+             "title": entry["title"],
+             "level": entry["level"],
+             "target_page": target_page,
+             "validation": {"status": status, "reason": reason}
+         })
+
+     summary_stats = {
+         "total_checked": len(all_links) + len(toc),
+         "valid": valid_count,
+         "file-found": file_found_count,
+         "broken-page": broken_page_count,
+         "broken-file": broken_file_count,
+         "unknown-web": unknown_web_count,
+         "unknown-reasonableness": unknown_reasonableness_count,
+         "unknown-link": unknown_link_count,
+         # A single catch-all "unknown" bucket was dropped: it is not granular enough
+     }
+
+
+     def generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path):
+         """
+         Prepare the validation overview for modular reuse
+         """
+         validation_buffer = []
+
+         # Helper that buffers messages; printing happens later, conditionally
+         def log(msg: str):
+             validation_buffer.append(msg)
+
+         log("\n" + "=" * SEP_COUNT)
+         log("## Validation Results")
+         log("=" * SEP_COUNT)
+         log(f"PDF Path = {get_friendly_path(pdf_path)}")
+         log(f"Total items checked: {summary_stats['total_checked']}")
+         log(f"✅ Valid: {summary_stats['valid']}")
+         log(f"🌐 Web Addresses (Not Checked): {summary_stats['unknown-web']}")
+         log(f"⚠️ Unknown Page Reasonableness (Due to Missing Total Page Count): {summary_stats['unknown-reasonableness']}")
+         log(f"⚠️ Unsupported PDF Links: {summary_stats['unknown-link']}")
+         log(f"❌ Broken Page Reference: {summary_stats['broken-page']}")
+         log(f"❌ Broken File Reference: {summary_stats['broken-file']}")
+         log("=" * SEP_COUNT)
+
+         if issues:
+             log("\n## Issues Found")
+             log("{:<5} | {:<12} | {:<30} | {}".format("Idx", "Type", "Text", "Problem"))
+             log("-" * SEP_COUNT)
+             for i, issue in enumerate(issues[:25], 1):
+                 link_type = issue.get("type", "Link")
+                 text = issue.get("link_text", "") or issue.get("title", "") or "N/A"
+                 text = text[:30]
+                 reason = issue["validation"]["reason"]
+                 log("{:<5} | {:<12} | {:<30} | {}".format(i, link_type, text, reason))
+             if len(issues) > 25:
+                 log(f"... and {len(issues) - 25} more issues")
+         else:
+             log("No issues found — all links and TOC entries are valid!")
+
+         # Final aggregation of the buffer into one string
+         validation_buffer_str = "\n".join(validation_buffer)
+
+         return validation_buffer_str
+
+     summary_txt = generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path)
+     if print_bool:
+         print(summary_txt)
+
+     validation_results = {
+         "pdf_path": pdf_path,
+         "summary-stats": summary_stats,
+         "issues": issues,
+         "summary-txt": summary_txt,
+         "total_pages": total_pages
+     }
+
+     # Have export run internally so that the logic need not happen in an interface
+     if export_json:
+         export_validation_json(validation_results, pdf_path, pdf_library)
+     return validation_results
+
+
+ def run_validation_more_readable_slop(pdf_path: str = None, pdf_library: str = "pypdf", check_external_links: bool = False) -> Dict[str, Any]:
+     """
+     Experimental. Ignore for now.
+
+     Extends the report logic by programmatically testing every extracted link.
+     Validates Internal Jumps (page bounds), External URIs (HTTP status),
+     and Launch actions (file existence).
+     """
+     if check_external_links:
+         import requests
+
+     # 1. Setup Library Engine (Reuse your logic)
+     pdf_library = pdf_library.lower()
+     if pdf_library == "pypdf":
+         from pdflinkcheck.analyze_pypdf import extract_links_pypdf as extract_links
+     else:
+         from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf as extract_links
+
+     if pdf_path is None:
+         pdf_path = get_first_pdf_in_cwd()
+
+     if not pdf_path:
+         print("Error: No PDF found for validation.")
+         return {}
+
+     print(f"\nValidating links in {Path(pdf_path).name}...")
+
+     # 2. Extract links and initialize validation counters
+     links = extract_links(pdf_path)
+     total_links = len(links)
+     results = {"valid": [], "broken": [], "error": []}
+
+     # 3. Validation Loop
+     for i, link in enumerate(links, 1):
+         # Progress indicator for long manuals
+         sys.stdout.write(f"\rChecking link {i}/{total_links}...")
+         sys.stdout.flush()
+
+         link_type = link.get('type')
+         status = {"is_valid": False, "reason": "Unknown Type"}
+
+         # --- A. Validate Internal Jumps ---
+         if "Internal" in link_type:
+             target_page = link.get('destination_page')
+             if isinstance(target_page, int) and target_page > 0:
+                 # In a real run, you'd compare against reader.pages_count
+                 status = {"is_valid": True, "reason": "Resolves"}
+             else:
+                 status = {"is_valid": False, "reason": f"Invalid Page: {target_page}"}
+
+         # --- B. Validate Web URIs ---
+         elif link_type == 'External (URI)':
+
+             url = link.get('url')
+             if url and url.startswith("http") and check_external_links:
+                 try:
+                     # Use a short timeout and HEAD request to be polite/fast
+                     resp = requests.head(url, timeout=5, allow_redirects=True)
+                     if resp.status_code < 400:
+                         status = {"is_valid": True, "reason": f"HTTP {resp.status_code}"}
+                     else:
+                         status = {"is_valid": False, "reason": f"HTTP {resp.status_code}"}
+                 except Exception as e:
+                     status = {"is_valid": False, "reason": "Connection Failed"}
+             else:
+                 status = {"is_valid": False, "reason": "Malformed URL"}
+
+         # --- C. Validate Local File/Launch Links ---
+         elif link_type == 'Launch' or 'remote_file' in link:
+             file_path = link.get('remote_file') or link.get('url')
+             if file_path:
+                 # Clean URI formatting
+                 clean_path = file_path.replace("file://", "").replace("%20", " ")
+                 # Check relative to the PDF's location
+                 abs_path = Path(pdf_path).parent / clean_path
+                 if abs_path.exists():
+                     status = {"is_valid": True, "reason": "File Exists"}
+                 else:
+                     status = {"is_valid": False, "reason": "File Missing"}
+
+         # Append result
+         link['validation'] = status
+         if status['is_valid']:
+             results['valid'].append(link)
+         else:
+             results['broken'].append(link)
+
+     print("\n" + "=" * SEP_COUNT)
+     print(f"--- Validation Summary Stats for {Path(pdf_path).name} ---")
+     print(f"Total Checked: {total_links}")
+     print(f"✅ Valid: {len(results['valid'])}")
+     print(f"❌ Broken: {len(results['broken'])}")
+     print("=" * SEP_COUNT)
+
+     # 4. Print Detail Report for Broken Links
+     if results['broken']:
+         print("\n## ❌ Broken Links Found:")
+         print("{:<5} | {:<5} | {:<30} | {}".format("Idx", "Page", "Reason", "Target"))
+         print("-" * SEP_COUNT)
+         for i, link in enumerate(results['broken'], 1):
+             target = link.get('url') or link.get('destination_page') or link.get('remote_file')
+             print("{:<5} | {:<5} | {:<30} | {}".format(
+                 i, link['page'], link['validation']['reason'], str(target)[:30]
+             ))
+
+     return results
+
+
+ if __name__ == "__main__":
+
+     from pdflinkcheck.io import get_first_pdf_in_cwd
+     pdf_path = get_first_pdf_in_cwd()
+     # Run analysis first
+     report = run_report(
+         pdf_path=pdf_path,
+         max_links=0,
+         export_format="",
+         pdf_library="pypdf",
+         print_bool=False  # We handle printing in validation
+     )
+
+     if not report or not report.get("data"):
+         print("No data extracted — nothing to validate.")
+         sys.exit(1)
+
+     # Then validate
+     validation_results = run_validation(
+         report_results=report,
+         pdf_path=pdf_path,
+         pdf_library="pypdf",
+         export_json=True,
+         print_bool=True
+     )
+     # JSON export already happens inside run_validation, so no extra export call is needed here.
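
For orientation, here is a minimal usage sketch of the validate.py module added above. It relies only on what this diff shows (run_report in pdflinkcheck.report, run_validation in pdflinkcheck.validate, and the keyword arguments used in the __main__ block); the "example.pdf" path is a hypothetical placeholder, not a file shipped with the package.

    from pdflinkcheck.report import run_report
    from pdflinkcheck.validate import run_validation

    # "example.pdf" is a placeholder; substitute any local PDF path.
    report = run_report(
        pdf_path="example.pdf",
        max_links=0,
        export_format="",
        pdf_library="pypdf",
        print_bool=False,
    )
    results = run_validation(
        report_results=report,
        pdf_path="example.pdf",
        pdf_library="pypdf",
        export_json=False,  # skip the JSON export while experimenting
        print_bool=True,
    )
    # summary-stats holds the counters built above: valid, file-found,
    # broken-page, broken-file, unknown-web, unknown-reasonableness, unknown-link
    print(results["summary-stats"])
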
@@ -0,0 +1,83 @@
+ # src/pdflinkcheck/version_info.py
+ import re
+ from pathlib import Path
+ import sys
+
+ """
+
+ This portion of the codebase is MIT licensed. It does not rely on any AGPL-licensed code.
+
+ ---
+
+ MIT License
+
+ Copyright (c) 2025 George Clayton Bennett <george.bennett@memphistn.gov>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ """
+
+ # --- TOML Parsing Helper ---
+ def find_pyproject(start: Path) -> Path | None:
+     # 1. Handle PyInstaller / Frozen state
+     if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
+         # In PyInstaller, force-include maps to: sys._MEIPASS / package_name / data / file
+         candidate = Path(sys._MEIPASS) / "pdflinkcheck" / "data" / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+         # Fallback for simple --add-data "pyproject.toml:."
+         candidate = Path(sys._MEIPASS) / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+
+     # 2. Handle Development state (walking up the tree)
+     for p in start.resolve().parents:
+         candidate = p / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+
+     # 3. Handle Installed / Wheel / Shiv state (using the force-include data path)
+     internal_path = Path(__file__).parent / "data" / "pyproject.toml"
+     if internal_path.exists():
+         return internal_path
+
+     return None
+
+
+ def get_version_from_pyproject() -> str:
+     pyproject = find_pyproject(Path(__file__))
+     if not pyproject or not pyproject.exists():
+         print("ERROR: pyproject.toml missing.", file=sys.stderr)
+         return "0.0.0"
+
+     text = pyproject.read_text(encoding="utf-8")
+
+     # Match PEP 621 style: [project]
+     project_section = re.search(r"\[project\](.*?)(?:\n\[|$)", text, re.DOTALL | re.IGNORECASE)
+     if project_section:
+         match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', project_section.group(1))
+         if match:
+             return match.group(1)
+
+     # Match Poetry style: [tool.poetry]
+     poetry_section = re.search(r"\[tool\.poetry\](.*?)(?:\n\[|$)", text, re.DOTALL | re.IGNORECASE)
+     if poetry_section:
+         match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', poetry_section.group(1))
+         if match:
+             return match.group(1)
+
+     return "0.0.0"