pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/security.py
ADDED
@@ -0,0 +1,189 @@
+"""
+pdflinkcheck.security
+
+Offline, deterministic link‑risk scoring for PDF hyperlinks.
+
+This module intentionally avoids any heuristics that depend on PDF text
+extraction quality (e.g., anchor text analysis), because real‑world PDFs
+often contain inconsistent OCR output, concatenated strings, or placeholder
+text. Only URL‑structure‑based signals are used.
+
+Stable, low‑maintenance, and fully offline.
+"""
+
+from __future__ import annotations
+from dataclasses import dataclass, asdict
+from urllib.parse import urlparse, parse_qs
+import ipaddress
+from typing import List, Dict, Optional
+
+
+# ---------------------------------------------------------------------------
+# Static rule tables (embedded; no external files)
+# ---------------------------------------------------------------------------
+
+# Top level domain (tld)
+SUSPICIOUS_TLDS = {
+    "xyz", "top", "click", "link", "rest", "gq", "ml", "cf", "tk"
+}
+
+# Tracking parameters
+"""
+These parameters collectively allow detailed attribution of website traffic and conversions:
+- **utm_** parameters are universal for tracking campaigns across all traffic sources.
+- **fbclid** and **gclid** are platform-specific identifiers for Facebook and Google Ads.
+- **mc_eid** is specific to email marketing, like Mailchimp campaigns.
+"""
+TRACKING_PARAMS = {
+    "utm_source", "utm_medium", "utm_campaign",
+    "fbclid", "gclid", "mc_eid"
+}
+
+# Minimal homoglyph table (expandable)
+"""
+"а" → Latin "a" (Cyrillic small letter a, U+0430 vs Latin a U+0061)
+"е" → Latin "e" (Cyrillic small letter ie, U+0435 vs Latin e U+0065)
+"і" → Latin "i" (Cyrillic small letter i, U+0456 vs Latin i U+0069)
+"ο" → Latin "o" (Greek small omicron, U+03BF vs Latin o U+006F)
+"р" → Latin "p" (Cyrillic small er, U+0440 vs Latin p U+0070)
+"ѕ" → Latin "s" (Cyrillic small letter dze, U+0455 vs Latin s U+0073)
+"у" → Latin "y" (Cyrillic small letter u, U+0443 vs Latin y U+0079)
+
+These characters have distinct Unicode code points from their Latin lookalikes
+but are visually nearly identical, making them classic homoglyphs.
+The purpose of such mappings is often to detect or simulate homoglyph attacks,
+such as phishing domains, email spoofing, or source code obfuscation,
+where attackers substitute visually similar characters from alternate scripts to deceive users or systems.
+"""
+HOMOGLYPHS = {
+    "а": "a",  # Cyrillic
+    "е": "e",
+    "і": "i",
+    "ο": "o",
+    "р": "p",
+    "ѕ": "s",
+    "у": "y",
+}
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+@dataclass
+class RiskReason:
+    rule_id: str
+    description: str
+    weight: int
+
+
+@dataclass
+class LinkRiskResult:
+    url: str
+    score: int
+    level: str
+    reasons: List[RiskReason]
+
+    def to_dict(self) -> Dict[str, object]:
+        d = asdict(self)
+        d["reasons"] = [asdict(r) for r in self.reasons]
+        return d
+
+
+# ---------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------
+
+def _is_ip(host: str) -> bool:
+    try:
+        ipaddress.ip_address(host)
+        return True
+    except Exception:
+        return False
+
+
+def _contains_homoglyphs(s: str) -> bool:
+    return any(ch in HOMOGLYPHS for ch in s)
+
+
+# ---------------------------------------------------------------------------
+# Core scoring function (URL‑structure‑based only)
+# ---------------------------------------------------------------------------
+
+def score_link(url: str) -> LinkRiskResult:
+    reasons: List[RiskReason] = []
+    score = 0
+
+    parsed = urlparse(url)
+    host = parsed.hostname or ""
+    query = parsed.query or ""
+
+    # IP‑based URL
+    if _is_ip(host):
+        reasons.append(RiskReason("ip_host", "URL uses a raw IP address.", 3))
+        score += 3
+
+    # Suspicious TLD
+    if "." in host:
+        tld = host.rsplit(".", 1)[-1].lower()
+        if tld in SUSPICIOUS_TLDS:
+            reasons.append(RiskReason("suspicious_tld", f"TLD '.{tld}' is commonly abused.", 2))
+            score += 2
+
+    # Non‑standard port
+    if parsed.port not in (None, 80, 443):
+        reasons.append(RiskReason("nonstandard_port", f"Non‑standard port {parsed.port}.", 2))
+        score += 2
+
+    # Long URL
+    if len(url) > 200:
+        reasons.append(RiskReason("long_url", "URL is unusually long.", 1))
+        score += 1
+
+    # Tracking parameters
+    params = parse_qs(query)
+    tracking_hits = sum(1 for p in params if p.lower() in TRACKING_PARAMS)
+    if tracking_hits:
+        reasons.append(RiskReason("tracking_params", f"{tracking_hits} tracking parameters found.", 1))
+        score += 1
+
+    # Homoglyph detection
+    if _contains_homoglyphs(host + parsed.path):
+        reasons.append(RiskReason("homoglyph_suspected", "URL contains homoglyph characters.", 3))
+        score += 3
+    # Risk level mapping
+    if score == 0:
+        level = "none"
+    elif score <= 2:
+        level = "low"
+    elif score <= 6:
+        level = "medium"
+    else:
+        level = "high"
+
+
+    return LinkRiskResult(url, score, level, reasons)
+
+
+# ---------------------------------------------------------------------------
+# Report‑level risk computation (mirrors validate.py)
+# ---------------------------------------------------------------------------
+
+def compute_risk(report: Dict[str, object]) -> Dict[str, object]:
+    external_links = report.get("data", {}).get("external_links", [])
+    results = []
+
+    for link in external_links:
+        url = link.get("url") or link.get("remote_file") or link.get("target")
+        if url:
+            results.append(score_link(url).to_dict())
+
+    return {
+        "risk_summary": {
+            "total_external": len(external_links),
+            "scored": len(results),
+            "high_risk": sum(1 for r in results if r["level"] == "high"),
+            "medium_risk": sum(1 for r in results if r["level"] == "medium"),
+            "low_risk": sum(1 for r in results if r["level"] == "low"),
+        },
+        "risk_details": results
+    }
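As a quick orientation to the new module, here is a minimal usage sketch (not part of the wheel); the sample URLs and the report dict fed to compute_risk are illustrative assumptions based only on the code above:

# Illustrative only: exercise score_link and compute_risk as defined above.
from pdflinkcheck.security import score_link, compute_risk

# Trips three rules: raw IP host (+3), non-standard port (+2), tracking param (+1).
result = score_link("http://203.0.113.7:8080/login?gclid=abc123")
print(result.score, result.level)          # 6 medium
for reason in result.reasons:
    print(reason.rule_id, reason.weight)   # ip_host 3, nonstandard_port 2, tracking_params 1

# Report-level aggregation over a dict shaped like the analyzer output
# (the "data" / "external_links" layout is assumed from compute_risk above).
report = {"data": {"external_links": [{"url": "https://example.tk/a?utm_source=news"}]}}
print(compute_risk(report)["risk_summary"])
# {'total_external': 1, 'scored': 1, 'high_risk': 0, 'medium_risk': 1, 'low_risk': 0}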
pdflinkcheck/splash.py
ADDED
@@ -0,0 +1,38 @@
+# src/pdflinkcheck/splash.py
+import tkinter as tk
+from tkinter import ttk
+from pdflinkcheck.tk_utils import center_window_on_primary
+
+class SplashFrame:
+
+    def __init__(self, parent):
+        self.top = tk.Toplevel(parent)
+        self.top.withdraw()
+        self.top.overrideredirect(True)
+        self.top.configure(bg="#2b2b2b")
+
+        # 1. Define dimensions
+        width, height = 300, 80
+        # Use generalized centering
+        #center_window_on_primary(self.top, width, height)
+
+
+        # UI Components
+        tk.Label(self.top, text="PDF LINK CHECK", fg="white", bg="#2b2b2b",
+                 font=("Arial", 12, "bold")).pack(pady=(15, 5))
+
+        self.progress = ttk.Progressbar(self.top, mode='indeterminate', length=250)
+        self.progress.pack(pady=10, padx=20)
+        self.progress.start(15)
+
+        # Force the OS to acknowledge the window's existence
+        self.top.update_idletasks()
+
+        # Center and then reveal
+        center_window_on_primary(self.top, width, height)
+        self.top.deiconify()
+
+    def teardown(self):
+        """Cleanly shutdown the splash window."""
+        self.progress.stop()
+        self.top.destroy()
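A short sketch of how SplashFrame might be driven during application startup, assuming a Tk root; the 2-second delay is a placeholder for real initialization work and is not taken from the package:

# Illustrative driver for SplashFrame (not part of the wheel).
import tkinter as tk
from pdflinkcheck.splash import SplashFrame

root = tk.Tk()
root.withdraw()                    # hide the main window while "loading"

splash = SplashFrame(root)         # borderless splash, centered, bar animating

def finish_startup():
    splash.teardown()              # stop the progress bar, destroy the Toplevel
    root.deiconify()               # reveal the real UI

root.after(2000, finish_startup)   # stand-in for actual initialization
root.mainloop()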
pdflinkcheck/stdlib_server.py
CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: MIT
 # src/pdflinkcheck/stdlib_server.py
+from __future__ import annotations
 import http.server
 import socketserver
 import json
@@ -27,18 +28,11 @@ HTML_FORM = """
   <label>Engine:</label>
   <select name="pdf_library">
     <option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
-    <option value="pymupdf">
+    <option value="pymupdf">PyMuPDF (fast, AGPL3)</option>
+    <option value="pdfium">PDFium (fast, permissive)</option>
   </select>
 </p>
-<p>
-  <label>Max links to show (0 = all):</label>
-  <input type="number" name="max_links" value="0" min="0">
-</p>
 <p><button type="submit">Analyze PDF</button></p>
-<!--p>
-  <button type="submit" name="action" value="analyze">Analyze PDF</button>
-  <button type="submit" name="action" value="validate">Validate PDF</button>
-</p-->
 </form>
 <hr>
 <p>Returns JSON.</p>
@@ -96,7 +90,6 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
         # Extract parts
         file_item = None
         pdf_library = "pypdf"
-        max_links = 0
 
         for part in msg.get_payload():
             disposition = part.get("Content-Disposition", "")
@@ -115,16 +108,10 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
 
             elif name == "pdf_library":
                 pdf_library = part.get_payload(decode=True).decode().lower()
-                if pdf_library not in {"pypdf", "pymupdf"}:
+                if pdf_library not in {"pypdf", "pymupdf", "pdfium"}:
                     self._send_json_error("Invalid pdf_library", 400)
                     return
 
-            elif name == "max_links":
-                try:
-                    max_links = int(part.get_payload(decode=True).decode())
-                except ValueError:
-                    max_links = 0
-
         if not file_item:
             self._send_json_error("No PDF file uploaded", 400)
             return
@@ -138,18 +125,17 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
 
         result = run_report_and_call_exports(
             pdf_path=tmp_path,
-            max_links=max_links if max_links > 0 else 0,
             export_format="",
             pdf_library=pdf_library,
             print_bool=False
         )
-
-
+
+        total_links_count = result.get("metadata",{}).get("link_counts",{}).get("total_links_count", 0)
 
         response = {
             "filename": file_filename,
             "pdf_library_used": pdf_library,
-            "
+            "total_links_count": total_links_count,
             "data": result["data"],
             "text_report": result["text"]
         }
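For completeness, a hedged client sketch against the updated handler. The upload field name ("file"), the URL path, and the port are assumptions, since they are not visible in these hunks; the "pdf_library" field and the "total_links_count" response key come from the diff above:

# Hypothetical stdlib-only client for the server (field names, path, and port
# are assumptions -- the full form markup and server setup aren't in this diff).
import json
import urllib.request
import uuid

def analyze_pdf(path: str, pdf_library: str = "pdfium",
                url: str = "http://127.0.0.1:8000/") -> dict:
    boundary = uuid.uuid4().hex
    with open(path, "rb") as fh:
        pdf_bytes = fh.read()

    # Build a multipart/form-data body by hand (no third-party deps).
    parts = [
        f'--{boundary}\r\nContent-Disposition: form-data; '
        f'name="pdf_library"\r\n\r\n{pdf_library}\r\n'.encode(),
        f'--{boundary}\r\nContent-Disposition: form-data; '
        f'name="file"; filename="{path}"\r\n'
        f'Content-Type: application/pdf\r\n\r\n'.encode() + pdf_bytes + b"\r\n",
        f"--{boundary}--\r\n".encode(),
    ]
    body = b"".join(parts)

    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

# result = analyze_pdf("sample.pdf", pdf_library="pypdf")
# print(result["total_links_count"])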