pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/security.py
ADDED
@@ -0,0 +1,189 @@
+"""
+pdflinkcheck.security
+
+Offline, deterministic link‑risk scoring for PDF hyperlinks.
+
+This module intentionally avoids any heuristics that depend on PDF text
+extraction quality (e.g., anchor text analysis), because real‑world PDFs
+often contain inconsistent OCR output, concatenated strings, or placeholder
+text. Only URL‑structure‑based signals are used.
+
+Stable, low‑maintenance, and fully offline.
+"""
+
+from __future__ import annotations
+from dataclasses import dataclass, asdict
+from urllib.parse import urlparse, parse_qs
+import ipaddress
+from typing import List, Dict, Optional
+
+
+# ---------------------------------------------------------------------------
+# Static rule tables (embedded; no external files)
+# ---------------------------------------------------------------------------
+
+# Top level domain (tld)
+SUSPICIOUS_TLDS = {
+    "xyz", "top", "click", "link", "rest", "gq", "ml", "cf", "tk"
+}
+
+# Tracking parameters
+"""
+These parameters collectively allow detailed attribution of website traffic and conversions:
+- **utm_** parameters are universal for tracking campaigns across all traffic sources.
+- **fbclid** and **gclid** are platform-specific identifiers for Facebook and Google Ads.
+- **mc_eid** is specific to email marketing, like Mailchimp campaigns.
+"""
+TRACKING_PARAMS = {
+    "utm_source", "utm_medium", "utm_campaign",
+    "fbclid", "gclid", "mc_eid"
+}
+
+# Minimal homoglyph table (expandable)
+"""
+"а" → Latin "a" (Cyrillic small letter a, U+0430 vs Latin a U+0061)
+"е" → Latin "e" (Cyrillic small letter ie, U+0435 vs Latin e U+0065)
+"і" → Latin "i" (Cyrillic small letter i, U+0456 vs Latin i U+0069)
+"ο" → Latin "o" (Greek small omicron, U+03BF vs Latin o U+006F)
+"р" → Latin "p" (Cyrillic small er, U+0440 vs Latin p U+0070)
+"ѕ" → Latin "s" (Cyrillic small letter dze, U+0455 vs Latin s U+0073)
+"у" → Latin "y" (Cyrillic small letter u, U+0443 vs Latin y U+0079)
+
+These characters have distinct Unicode code points from their Latin lookalikes
+but are visually nearly identical, making them classic homoglyphs.
+The purpose of such mappings is often to detect or simulate homoglyph attacks,
+such as phishing domains, email spoofing, or source code obfuscation,
+where attackers substitute visually similar characters from alternate scripts to deceive users or systems.
+"""
+HOMOGLYPHS = {
+    "а": "a",  # Cyrillic
+    "е": "e",
+    "і": "i",
+    "ο": "o",
+    "р": "p",
+    "ѕ": "s",
+    "у": "y",
+}
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+@dataclass
+class RiskReason:
+    rule_id: str
+    description: str
+    weight: int
+
+
+@dataclass
+class LinkRiskResult:
+    url: str
+    score: int
+    level: str
+    reasons: List[RiskReason]
+
+    def to_dict(self) -> Dict[str, object]:
+        d = asdict(self)
+        d["reasons"] = [asdict(r) for r in self.reasons]
+        return d
+
+
+# ---------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------
+
+def _is_ip(host: str) -> bool:
+    try:
+        ipaddress.ip_address(host)
+        return True
+    except Exception:
+        return False
+
+
+def _contains_homoglyphs(s: str) -> bool:
+    return any(ch in HOMOGLYPHS for ch in s)
+
+
+# ---------------------------------------------------------------------------
+# Core scoring function (URL‑structure‑based only)
+# ---------------------------------------------------------------------------
+
+def score_link(url: str) -> LinkRiskResult:
+    reasons: List[RiskReason] = []
+    score = 0
+
+    parsed = urlparse(url)
+    host = parsed.hostname or ""
+    query = parsed.query or ""
+
+    # IP‑based URL
+    if _is_ip(host):
+        reasons.append(RiskReason("ip_host", "URL uses a raw IP address.", 3))
+        score += 3
+
+    # Suspicious TLD
+    if "." in host:
+        tld = host.rsplit(".", 1)[-1].lower()
+        if tld in SUSPICIOUS_TLDS:
+            reasons.append(RiskReason("suspicious_tld", f"TLD '.{tld}' is commonly abused.", 2))
+            score += 2
+
+    # Non‑standard port
+    if parsed.port not in (None, 80, 443):
+        reasons.append(RiskReason("nonstandard_port", f"Non‑standard port {parsed.port}.", 2))
+        score += 2
+
+    # Long URL
+    if len(url) > 200:
+        reasons.append(RiskReason("long_url", "URL is unusually long.", 1))
+        score += 1
+
+    # Tracking parameters
+    params = parse_qs(query)
+    tracking_hits = sum(1 for p in params if p.lower() in TRACKING_PARAMS)
+    if tracking_hits:
+        reasons.append(RiskReason("tracking_params", f"{tracking_hits} tracking parameters found.", 1))
+        score += 1
+
+    # Homoglyph detection
+    if _contains_homoglyphs(host + parsed.path):
+        reasons.append(RiskReason("homoglyph_suspected", "URL contains homoglyph characters.", 3))
+        score += 3
+    # Risk level mapping
+    if score == 0:
+        level = "none"
+    elif score <= 2:
+        level = "low"
+    elif score <= 6:
+        level = "medium"
+    else:
+        level = "high"
+
+
+    return LinkRiskResult(url, score, level, reasons)
+
+
+# ---------------------------------------------------------------------------
+# Report‑level risk computation (mirrors validate.py)
+# ---------------------------------------------------------------------------
+
+def compute_risk(report: Dict[str, object]) -> Dict[str, object]:
+    external_links = report.get("data", {}).get("external_links", [])
+    results = []
+
+    for link in external_links:
+        url = link.get("url") or link.get("remote_file") or link.get("target")
+        if url:
+            results.append(score_link(url).to_dict())
+
+    return {
+        "risk_summary": {
+            "total_external": len(external_links),
+            "scored": len(results),
+            "high_risk": sum(1 for r in results if r["level"] == "high"),
+            "medium_risk": sum(1 for r in results if r["level"] == "medium"),
+            "low_risk": sum(1 for r in results if r["level"] == "low"),
+        },
+        "risk_details": results
+    }
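As a quick orientation to the new module, here is a minimal usage sketch (not part of the wheel); the sample URLs and the report dict fed to compute_risk are illustrative assumptions based only on the code above:

# Illustrative only: exercise score_link and compute_risk as defined above.
from pdflinkcheck.security import score_link, compute_risk

# Trips three rules: raw IP host (+3), non-standard port (+2), tracking param (+1).
result = score_link("http://203.0.113.7:8080/login?gclid=abc123")
print(result.score, result.level)          # 6 medium
for reason in result.reasons:
    print(reason.rule_id, reason.weight)   # ip_host 3, nonstandard_port 2, tracking_params 1

# Report-level aggregation over a dict shaped like the analyzer output
# (the "data" / "external_links" layout is assumed from compute_risk above).
report = {"data": {"external_links": [{"url": "https://example.tk/a?utm_source=news"}]}}
print(compute_risk(report)["risk_summary"])
# {'total_external': 1, 'scored': 1, 'high_risk': 0, 'medium_risk': 1, 'low_risk': 0}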
pdflinkcheck/splash.py
ADDED
@@ -0,0 +1,38 @@
+# src/pdflinkcheck/splash.py
+import tkinter as tk
+from tkinter import ttk
+from pdflinkcheck.tk_utils import center_window_on_primary
+
+class SplashFrame:
+
+    def __init__(self, parent):
+        self.top = tk.Toplevel(parent)
+        self.top.withdraw()
+        self.top.overrideredirect(True)
+        self.top.configure(bg="#2b2b2b")
+
+        # 1. Define dimensions
+        width, height = 300, 80
+        # Use generalized centering
+        #center_window_on_primary(self.top, width, height)
+
+
+        # UI Components
+        tk.Label(self.top, text="PDF LINK CHECK", fg="white", bg="#2b2b2b",
+                 font=("Arial", 12, "bold")).pack(pady=(15, 5))
+
+        self.progress = ttk.Progressbar(self.top, mode='indeterminate', length=250)
+        self.progress.pack(pady=10, padx=20)
+        self.progress.start(15)
+
+        # Force the OS to acknowledge the window's existence
+        self.top.update_idletasks()
+
+        # Center and then reveal
+        center_window_on_primary(self.top, width, height)
+        self.top.deiconify()
+
+    def teardown(self):
+        """Cleanly shutdown the splash window."""
+        self.progress.stop()
+        self.top.destroy()
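A short sketch of how SplashFrame might be driven during application startup, assuming a Tk root; the 2-second delay is a placeholder for real initialization work and is not taken from the package:

# Illustrative driver for SplashFrame (not part of the wheel).
import tkinter as tk
from pdflinkcheck.splash import SplashFrame

root = tk.Tk()
root.withdraw()                    # hide the main window while "loading"

splash = SplashFrame(root)         # borderless splash, centered, bar animating

def finish_startup():
    splash.teardown()              # stop the progress bar, destroy the Toplevel
    root.deiconify()               # reveal the real UI

root.after(2000, finish_startup)   # stand-in for actual initialization
root.mainloop()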
pdflinkcheck/stdlib_server.py
CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: MIT
 # src/pdflinkcheck/stdlib_server.py
+from __future__ import annotations
 import http.server
 import socketserver
 import json
@@ -27,18 +28,11 @@ HTML_FORM = """
   <label>Engine:</label>
   <select name="pdf_library">
     <option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
-    <option value="pymupdf">
+    <option value="pymupdf">PyMuPDF (fast, AGPL3)</option>
+    <option value="pdfium">PDFium (fast, permissive)</option>
   </select>
 </p>
-<p>
-  <label>Max links to show (0 = all):</label>
-  <input type="number" name="max_links" value="0" min="0">
-</p>
 <p><button type="submit">Analyze PDF</button></p>
-<!--p>
-  <button type="submit" name="action" value="analyze">Analyze PDF</button>
-  <button type="submit" name="action" value="validate">Validate PDF</button>
-</p-->
 </form>
 <hr>
 <p>Returns JSON.</p>
@@ -96,7 +90,6 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
         # Extract parts
         file_item = None
         pdf_library = "pypdf"
-        max_links = 0
 
         for part in msg.get_payload():
             disposition = part.get("Content-Disposition", "")
@@ -115,16 +108,10 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
 
             elif name == "pdf_library":
                 pdf_library = part.get_payload(decode=True).decode().lower()
-                if pdf_library not in {"pypdf", "pymupdf"}:
+                if pdf_library not in {"pypdf", "pymupdf", "pdfium"}:
                     self._send_json_error("Invalid pdf_library", 400)
                     return
 
-            elif name == "max_links":
-                try:
-                    max_links = int(part.get_payload(decode=True).decode())
-                except ValueError:
-                    max_links = 0
-
         if not file_item:
             self._send_json_error("No PDF file uploaded", 400)
             return
@@ -138,18 +125,17 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
 
         result = run_report_and_call_exports(
             pdf_path=tmp_path,
-            max_links=max_links if max_links > 0 else 0,
             export_format="",
             pdf_library=pdf_library,
             print_bool=False
         )
-
-
+
+        total_links_count = result.get("metadata",{}).get("link_counts",{}).get("total_links_count", 0)
 
         response = {
             "filename": file_filename,
             "pdf_library_used": pdf_library,
-            "
+            "total_links_count": total_links_count,
             "data": result["data"],
             "text_report": result["text"]
         }
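For completeness, a hedged client sketch against the updated handler. The upload field name ("file"), the URL path, and the port are assumptions, since they are not visible in these hunks; the "pdf_library" field and the "total_links_count" response key come from the diff above:

# Hypothetical stdlib-only client for the server (field names, path, and port
# are assumptions -- the full form markup and server setup aren't in this diff).
import json
import urllib.request
import uuid

def analyze_pdf(path: str, pdf_library: str = "pdfium",
                url: str = "http://127.0.0.1:8000/") -> dict:
    boundary = uuid.uuid4().hex
    with open(path, "rb") as fh:
        pdf_bytes = fh.read()

    # Build a multipart/form-data body by hand (no third-party deps).
    parts = [
        f'--{boundary}\r\nContent-Disposition: form-data; '
        f'name="pdf_library"\r\n\r\n{pdf_library}\r\n'.encode(),
        f'--{boundary}\r\nContent-Disposition: form-data; '
        f'name="file"; filename="{path}"\r\n'
        f'Content-Type: application/pdf\r\n\r\n'.encode() + pdf_bytes + b"\r\n",
        f"--{boundary}--\r\n".encode(),
    ]
    body = b"".join(parts)

    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

# result = analyze_pdf("sample.pdf", pdf_library="pypdf")
# print(result["total_links_count"])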