pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
@@ -0,0 +1,189 @@
1
+ """
2
+ pdflinkcheck.security
3
+
4
+ Offline, deterministic link‑risk scoring for PDF hyperlinks.
5
+
6
+ This module intentionally avoids any heuristics that depend on PDF text
7
+ extraction quality (e.g., anchor text analysis), because real‑world PDFs
8
+ often contain inconsistent OCR output, concatenated strings, or placeholder
9
+ text. Only URL‑structure‑based signals are used.
10
+
11
+ Stable, low‑maintenance, and fully offline.
12
+ """
13
+
14
+ from __future__ import annotations
15
+ from dataclasses import dataclass, asdict
16
+ from urllib.parse import urlparse, parse_qs
17
+ import ipaddress
18
+ from typing import List, Dict, Optional
19
+
20
+
21
# ---------------------------------------------------------------------------
# Static rule tables (embedded; no external files)
# ---------------------------------------------------------------------------
# NOTE: the explanatory text below used to live in bare triple-quoted string
# literals between the assignments.  Those are evaluated-and-discarded
# expression statements, not comments; they are now real comments.

# Top-level domains (TLDs) that are commonly abused in phishing/spam.
SUSPICIOUS_TLDS = {
    "xyz", "top", "click", "link", "rest", "gq", "ml", "cf", "tk"
}

# Tracking parameters.
# These parameters collectively allow detailed attribution of website traffic
# and conversions:
#   - utm_* parameters are universal for tracking campaigns across all
#     traffic sources.
#   - fbclid and gclid are platform-specific identifiers for Facebook and
#     Google Ads.
#   - mc_eid is specific to email marketing, like Mailchimp campaigns.
TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign",
    "fbclid", "gclid", "mc_eid"
}

# Minimal homoglyph table (expandable).
# Each key has a distinct Unicode code point from its Latin lookalike but is
# visually nearly identical — a classic homoglyph.  The purpose of such a
# mapping is to detect homoglyph attacks (phishing domains, email spoofing,
# source-code obfuscation) where attackers substitute visually similar
# characters from alternate scripts to deceive users or systems.
HOMOGLYPHS = {
    "а": "a",  # Cyrillic small letter a  (U+0430) vs Latin a (U+0061)
    "е": "e",  # Cyrillic small letter ie (U+0435) vs Latin e (U+0065)
    "і": "i",  # Cyrillic small letter i  (U+0456) vs Latin i (U+0069)
    "ο": "o",  # Greek small letter omicron (U+03BF) vs Latin o (U+006F)
    "р": "p",  # Cyrillic small letter er (U+0440) vs Latin p (U+0070)
    "ѕ": "s",  # Cyrillic small letter dze (U+0455) vs Latin s (U+0073)
    "у": "y",  # Cyrillic small letter u  (U+0443) vs Latin y (U+0079)
}
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Data structures
70
+ # ---------------------------------------------------------------------------
71
+
72
@dataclass
class RiskReason:
    """One scoring rule that fired for a URL, with its point weight."""
    rule_id: str      # stable machine-readable rule identifier, e.g. "ip_host"
    description: str  # human-readable explanation of why the rule fired
    weight: int       # points this rule contributed to the total score
77
+
78
+
79
@dataclass
class LinkRiskResult:
    """Aggregate risk assessment for a single URL."""
    url: str                   # the URL that was scored
    score: int                 # sum of the weights of all triggered rules
    level: str                 # "none" | "low" | "medium" | "high"
    reasons: List[RiskReason]  # the individual rules that fired

    def to_dict(self) -> Dict[str, object]:
        """Return a JSON-serializable dict representation.

        ``dataclasses.asdict`` already converts nested dataclasses
        recursively, so ``reasons`` needs no extra per-item conversion
        (the previous implementation redundantly re-converted each
        reason a second time).
        """
        return asdict(self)
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Helper functions
94
+ # ---------------------------------------------------------------------------
95
+
96
+ def _is_ip(host: str) -> bool:
97
+ try:
98
+ ipaddress.ip_address(host)
99
+ return True
100
+ except Exception:
101
+ return False
102
+
103
+
104
def _contains_homoglyphs(s: str) -> bool:
    """Return True if *s* contains any character from the HOMOGLYPHS table."""
    for ch in s:
        if ch in HOMOGLYPHS:
            return True
    return False
106
+
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Core scoring function (URL‑structure‑based only)
110
+ # ---------------------------------------------------------------------------
111
+
112
def score_link(url: str) -> LinkRiskResult:
    """Score a single URL using URL-structure-based signals only.

    Signals and weights (see the rule tables at module top):
      - raw IP host (+3), suspicious TLD (+2), non-standard port (+2),
        URL length > 200 (+1), tracking query params (+1),
        homoglyph characters in host/path (+3).

    Returns a LinkRiskResult whose ``level`` is mapped from the total
    score: 0 -> "none", 1-2 -> "low", 3-6 -> "medium", >6 -> "high".
    """
    reasons: List[RiskReason] = []
    score = 0

    parsed = urlparse(url)
    host = parsed.hostname or ""
    query = parsed.query or ""

    # IP-based URL
    if _is_ip(host):
        reasons.append(RiskReason("ip_host", "URL uses a raw IP address.", 3))
        score += 3

    # Suspicious TLD
    if "." in host:
        tld = host.rsplit(".", 1)[-1].lower()
        if tld in SUSPICIOUS_TLDS:
            reasons.append(RiskReason("suspicious_tld", f"TLD '.{tld}' is commonly abused.", 2))
            score += 2

    # Non-standard port.  ``parsed.port`` raises ValueError for a malformed
    # port (e.g. "http://host:abc/" or an out-of-range number); previously
    # that crashed the whole scorer.  Treat a malformed port as "no port".
    try:
        port = parsed.port
    except ValueError:
        port = None
    if port not in (None, 80, 443):
        reasons.append(RiskReason("nonstandard_port", f"Non‑standard port {port}.", 2))
        score += 2

    # Long URL
    if len(url) > 200:
        reasons.append(RiskReason("long_url", "URL is unusually long.", 1))
        score += 1

    # Tracking parameters
    params = parse_qs(query)
    tracking_hits = sum(1 for p in params if p.lower() in TRACKING_PARAMS)
    if tracking_hits:
        reasons.append(RiskReason("tracking_params", f"{tracking_hits} tracking parameters found.", 1))
        score += 1

    # Homoglyph detection (host and path only; query values are too noisy)
    if _contains_homoglyphs(host + parsed.path):
        reasons.append(RiskReason("homoglyph_suspected", "URL contains homoglyph characters.", 3))
        score += 3

    # Risk level mapping
    if score == 0:
        level = "none"
    elif score <= 2:
        level = "low"
    elif score <= 6:
        level = "medium"
    else:
        level = "high"

    return LinkRiskResult(url, score, level, reasons)
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Report‑level risk computation (mirrors validate.py)
169
+ # ---------------------------------------------------------------------------
170
+
171
def compute_risk(report: Dict[str, object]) -> Dict[str, object]:
    """Score every external link in *report* and summarize the results.

    Expects the report layout produced elsewhere in the package:
    ``report["data"]["external_links"]`` is a list of dicts whose URL may
    live under "url", "remote_file", or "target".  Links with none of
    those keys set are skipped.
    """
    external_links = report.get("data", {}).get("external_links", [])

    results = []
    for link in external_links:
        target = link.get("url") or link.get("remote_file") or link.get("target")
        if not target:
            continue
        results.append(score_link(target).to_dict())

    # Tally per-level counts in a single pass over the scored results.
    level_counts = {"high": 0, "medium": 0, "low": 0}
    for entry in results:
        lvl = entry["level"]
        if lvl in level_counts:
            level_counts[lvl] += 1

    summary = {
        "total_external": len(external_links),
        "scored": len(results),
        "high_risk": level_counts["high"],
        "medium_risk": level_counts["medium"],
        "low_risk": level_counts["low"],
    }
    return {"risk_summary": summary, "risk_details": results}
pdflinkcheck/splash.py ADDED
@@ -0,0 +1,38 @@
1
+ # src/pdflinkcheck/splash.py
2
+ import tkinter as tk
3
+ from tkinter import ttk
4
+ from pdflinkcheck.tk_utils import center_window_on_primary
5
+
6
class SplashFrame:
    """Borderless splash window shown while the main GUI initializes.

    The Toplevel is created hidden (withdrawn), populated, centered on
    the primary display, and only then revealed — this avoids a visible
    jump from the window manager's default position to the centered one.
    """

    def __init__(self, parent):
        self.top = tk.Toplevel(parent)
        self.top.withdraw()               # keep hidden until positioned
        self.top.overrideredirect(True)   # no title bar / borders
        self.top.configure(bg="#2b2b2b")

        # Fixed splash dimensions (pixels).
        width, height = 300, 80

        # UI components
        tk.Label(self.top, text="PDF LINK CHECK", fg="white", bg="#2b2b2b",
                 font=("Arial", 12, "bold")).pack(pady=(15, 5))

        self.progress = ttk.Progressbar(self.top, mode='indeterminate', length=250)
        self.progress.pack(pady=10, padx=20)
        self.progress.start(15)           # animation step interval (ms)

        # Force the OS to acknowledge the window's existence before
        # geometry is applied.
        self.top.update_idletasks()

        # Center on the primary monitor, then reveal.
        center_window_on_primary(self.top, width, height)
        self.top.deiconify()

    def teardown(self):
        """Cleanly shut down the splash window."""
        self.progress.stop()
        self.top.destroy()
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
3
  # src/pdflinkcheck/stdlib_server.py
4
+ from __future__ import annotations
4
5
  import http.server
5
6
  import socketserver
6
7
  import json
@@ -27,18 +28,11 @@ HTML_FORM = """
27
28
  <label>Engine:</label>
28
29
  <select name="pdf_library">
29
30
  <option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
30
- <option value="pymupdf">pymupdf (faster, if installed)</option>
31
+ <option value="pymupdf">PyMyPPD (fast, AGPL3)</option>
32
+ <option value="pdfium">PDFium (fast, permissive)</option>
31
33
  </select>
32
34
  </p>
33
- <p>
34
- <label>Max links to show (0 = all):</label>
35
- <input type="number" name="max_links" value="0" min="0">
36
- </p>
37
35
  <p><button type="submit">Analyze PDF</button></p>
38
- <!--p>
39
- <button type="submit" name="action" value="analyze">Analyze PDF</button>
40
- <button type="submit" name="action" value="validate">Validate PDF</button>
41
- </p-->
42
36
  </form>
43
37
  <hr>
44
38
  <p>Returns JSON.</p>
@@ -96,7 +90,6 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
96
90
  # Extract parts
97
91
  file_item = None
98
92
  pdf_library = "pypdf"
99
- max_links = 0
100
93
 
101
94
  for part in msg.get_payload():
102
95
  disposition = part.get("Content-Disposition", "")
@@ -115,16 +108,10 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
115
108
 
116
109
  elif name == "pdf_library":
117
110
  pdf_library = part.get_payload(decode=True).decode().lower()
118
- if pdf_library not in {"pypdf", "pymupdf"}:
111
+ if pdf_library not in {"pypdf", "pymupdf", "pdfium"}:
119
112
  self._send_json_error("Invalid pdf_library", 400)
120
113
  return
121
114
 
122
- elif name == "max_links":
123
- try:
124
- max_links = int(part.get_payload(decode=True).decode())
125
- except ValueError:
126
- max_links = 0
127
-
128
115
  if not file_item:
129
116
  self._send_json_error("No PDF file uploaded", 400)
130
117
  return
@@ -138,18 +125,17 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
138
125
 
139
126
  result = run_report_and_call_exports(
140
127
  pdf_path=tmp_path,
141
- max_links=max_links if max_links > 0 else 0,
142
128
  export_format="",
143
129
  pdf_library=pdf_library,
144
130
  print_bool=False
145
131
  )
146
- metadata = result.get("metadata", {"total_links": 0, "pdf_name": file_filename})
147
- total_links = metadata.get("total_links", 0)
132
+
133
+ total_links_count = result.get("metadata",{}).get("link_counts",{}).get("total_links_count", 0)
148
134
 
149
135
  response = {
150
136
  "filename": file_filename,
151
137
  "pdf_library_used": pdf_library,
152
- "total_links": total_links,
138
+ "total_links_count": total_links_count,
153
139
  "data": result["data"],
154
140
  "text_report": result["text"]
155
141
  }