pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -21
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
- pdflinkcheck/cli.py +111 -116
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +20 -654
- pdflinkcheck/data/README.md +65 -67
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +25 -37
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +18 -1
- pdflinkcheck/dev.py +12 -25
- pdflinkcheck/environment.py +76 -0
- pdflinkcheck/gui.py +366 -457
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +27 -23
- pdflinkcheck/report.py +692 -121
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +14 -20
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +49 -0
- pdflinkcheck/validate.py +129 -218
- pdflinkcheck/version_info.py +6 -3
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
- pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -218
- pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
- pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
- /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/security.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pdflinkcheck.security
|
|
3
|
+
|
|
4
|
+
Offline, deterministic link‑risk scoring for PDF hyperlinks.
|
|
5
|
+
|
|
6
|
+
This module intentionally avoids any heuristics that depend on PDF text
|
|
7
|
+
extraction quality (e.g., anchor text analysis), because real‑world PDFs
|
|
8
|
+
often contain inconsistent OCR output, concatenated strings, or placeholder
|
|
9
|
+
text. Only URL‑structure‑based signals are used.
|
|
10
|
+
|
|
11
|
+
Stable, low‑maintenance, and fully offline.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
from dataclasses import dataclass, asdict
|
|
16
|
+
from urllib.parse import urlparse, parse_qs
|
|
17
|
+
import ipaddress
|
|
18
|
+
from typing import List, Dict, Optional
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Static rule tables (embedded; no external files)
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
# Top-level domains (TLDs) that score_link() reports as "commonly abused".
# Compared against the lower-cased last label of the hostname.
SUSPICIOUS_TLDS = {
    "xyz", "top", "click", "link", "rest", "gq", "ml", "cf", "tk"
}

# Tracking parameters.
#
# These query parameters collectively allow detailed attribution of website
# traffic and conversions:
#   - utm_*  : universal campaign tracking across all traffic sources.
#   - fbclid : platform-specific click identifier for Facebook.
#   - gclid  : platform-specific click identifier for Google Ads.
#   - mc_eid : specific to email marketing, like Mailchimp campaigns.
TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign",
    "fbclid", "gclid", "mc_eid"
}

# Minimal homoglyph table (expandable).
#
# Each key is a non-Latin character that is visually near-identical to the
# Latin letter it maps to but has a distinct Unicode code point. Attackers
# substitute such characters to build look-alike (phishing) domains, spoofed
# e-mail addresses, or obfuscated source code.
#
# The keys are written as \u escapes on purpose: the literal glyphs cannot be
# told apart from their Latin twins in an editor or a diff, which makes the
# table impossible to review by eye.
HOMOGLYPHS = {
    "\u0430": "a",  # Cyrillic small letter a    (vs Latin a, U+0061)
    "\u0435": "e",  # Cyrillic small letter ie   (vs Latin e, U+0065)
    "\u0456": "i",  # Cyrillic small letter i    (vs Latin i, U+0069)
    "\u03bf": "o",  # Greek small letter omicron (vs Latin o, U+006F)
    "\u0440": "p",  # Cyrillic small letter er   (vs Latin p, U+0070)
    "\u0455": "s",  # Cyrillic small letter dze  (vs Latin s, U+0073)
    "\u0443": "y",  # Cyrillic small letter u    (vs Latin y, U+0079)
}
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Data structures
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
@dataclass
class RiskReason:
    """A single scoring rule that fired for a URL."""

    rule_id: str      # machine-readable rule identifier, e.g. "ip_host"
    description: str  # human-readable explanation of why the rule fired
    weight: int       # points this rule adds to the URL's total score


@dataclass
class LinkRiskResult:
    """Outcome of scoring one URL."""

    url: str                   # the URL that was scored
    score: int                 # total risk score
    level: str                 # "none", "low", "medium" or "high"
    reasons: List[RiskReason]  # every rule that fired, in evaluation order

    def to_dict(self) -> Dict[str, object]:
        """Return a plain-dict (JSON-friendly) copy of this result.

        ``dataclasses.asdict`` already recurses into dataclass instances held
        inside lists, so ``reasons`` comes back as a list of dicts without any
        extra conversion step.
        """
        return asdict(self)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# Helper functions
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
def _is_ip(host: str) -> bool:
|
|
97
|
+
try:
|
|
98
|
+
ipaddress.ip_address(host)
|
|
99
|
+
return True
|
|
100
|
+
except Exception:
|
|
101
|
+
return False
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _contains_homoglyphs(s: str) -> bool:
|
|
105
|
+
return any(ch in HOMOGLYPHS for ch in s)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
# Core scoring function (URL‑structure‑based only)
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
def score_link(url: str) -> LinkRiskResult:
    """Score one URL using offline, URL-structure-only rules.

    Rules and weights:
      - ``malformed_url`` (2): the URL or its port cannot be parsed.
      - ``ip_host`` (3): the host is a raw IP address.
      - ``suspicious_tld`` (2): the last host label is in SUSPICIOUS_TLDS.
      - ``nonstandard_port`` (2): an explicit port other than 80 or 443.
      - ``long_url`` (1): the URL is longer than 200 characters.
      - ``tracking_params`` (1): at least one query key is in TRACKING_PARAMS.
      - ``homoglyph_suspected`` (3): host or path contains a HOMOGLYPHS key.

    The score is the sum of the weights of the rules that fired. It maps to a
    level as: 0 -> "none", 1-2 -> "low", 3-6 -> "medium", 7+ -> "high".

    Args:
        url: The link target to evaluate.

    Returns:
        A LinkRiskResult carrying the URL, score, level and fired rules.
        Malformed URLs are reported via ``malformed_url`` instead of raising.
    """
    reasons: List[RiskReason] = []

    # urlparse() raises ValueError on some malformed inputs (for example an
    # unterminated IPv6 literal), and the .port property raises ValueError for
    # a non-numeric or out-of-range port. A broken URL must not abort scoring
    # of the whole report, so both cases are recorded as a finding instead.
    malformed = False
    host = path = query = ""
    port = None
    try:
        parsed = urlparse(url)
    except ValueError:
        malformed = True
    else:
        host = parsed.hostname or ""
        path = parsed.path
        query = parsed.query or ""
        try:
            port = parsed.port
        except ValueError:
            malformed = True

    if malformed:
        reasons.append(RiskReason("malformed_url", "URL is malformed and could not be fully parsed.", 2))

    # IP-based URL
    if _is_ip(host):
        reasons.append(RiskReason("ip_host", "URL uses a raw IP address.", 3))

    # Suspicious TLD. A fully-qualified host may end with a root dot
    # ("example.xyz."); strip it so the real TLD is inspected.
    bare_host = host.rstrip(".")
    if "." in bare_host:
        tld = bare_host.rsplit(".", 1)[-1].lower()
        if tld in SUSPICIOUS_TLDS:
            reasons.append(RiskReason("suspicious_tld", f"TLD '.{tld}' is commonly abused.", 2))

    # Non-standard port (None means no explicit port was given)
    if port not in (None, 80, 443):
        reasons.append(RiskReason("nonstandard_port", f"Non‑standard port {port}.", 2))

    # Long URL
    if len(url) > 200:
        reasons.append(RiskReason("long_url", "URL is unusually long.", 1))

    # Tracking parameters: counts distinct matching keys, scored once
    tracking_hits = sum(1 for p in parse_qs(query) if p.lower() in TRACKING_PARAMS)
    if tracking_hits:
        reasons.append(RiskReason("tracking_params", f"{tracking_hits} tracking parameters found.", 1))

    # Homoglyph detection (host and path only)
    if _contains_homoglyphs(host + path):
        reasons.append(RiskReason("homoglyph_suspected", "URL contains homoglyph characters.", 3))

    # Derive the score from the recorded weights so the two can never drift.
    score = sum(reason.weight for reason in reasons)

    # Risk level mapping
    if score == 0:
        level = "none"
    elif score <= 2:
        level = "low"
    elif score <= 6:
        level = "medium"
    else:
        level = "high"

    return LinkRiskResult(url, score, level, reasons)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Report‑level risk computation (mirrors validate.py)
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
def compute_risk(report: Dict[str, object]) -> Dict[str, object]:
    """Score every external link in *report* and summarize the risk levels.

    Args:
        report: Report dict; links are read from
            ``report["data"]["external_links"]``. A missing or ``None``
            ``data`` / ``external_links`` entry is treated as "no links".

    Returns:
        A dict with two keys:
          - ``risk_summary``: counts of all external links, of links that had
            a usable URL and were scored, and of high/medium/low results.
          - ``risk_details``: one ``LinkRiskResult.to_dict()`` per scored link.
    """
    # ``dict.get(key, default)`` only applies the default when the key is
    # absent; an explicit None value would otherwise crash the lookups below.
    data = report.get("data")
    if not isinstance(data, dict):
        data = {}
    external_links = data.get("external_links") or []

    results = []
    for link in external_links:
        if not isinstance(link, dict):
            continue  # malformed entry: counted in the total, but not scored
        url = link.get("url") or link.get("remote_file") or link.get("target")
        if url and isinstance(url, str):
            results.append(score_link(url).to_dict())

    # Single pass over the results instead of one scan per level.
    level_counts = {"high": 0, "medium": 0, "low": 0}
    for entry in results:
        level = entry["level"]
        if level in level_counts:
            level_counts[level] += 1

    return {
        "risk_summary": {
            "total_external": len(external_links),
            "scored": len(results),
            "high_risk": level_counts["high"],
            "medium_risk": level_counts["medium"],
            "low_risk": level_counts["low"],
        },
        "risk_details": results
    }
|
pdflinkcheck/splash.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# src/pdflinkcheck/splash.py
|
|
2
|
+
import tkinter as tk
|
|
3
|
+
from tkinter import ttk
|
|
4
|
+
from pdflinkcheck.tk_utils import center_window_on_primary
|
|
5
|
+
|
|
6
|
+
class SplashFrame:
    """Small borderless splash window with an indeterminate progress bar.

    The window is created hidden, populated, positioned, and only then shown.
    Call ``teardown()`` to remove it.
    """

    def __init__(self, parent):
        """Build and show the splash as a Toplevel child of *parent*."""
        self.top = tk.Toplevel(parent)
        self.top.withdraw()              # stay hidden until positioned
        self.top.overrideredirect(True)  # no window-manager decorations
        self.top.configure(bg="#2b2b2b")

        # Splash dimensions passed to the centering helper
        width, height = 300, 80

        # UI components
        tk.Label(self.top, text="PDF LINK CHECK", fg="white", bg="#2b2b2b",
                 font=("Arial", 12, "bold")).pack(pady=(15, 5))

        self.progress = ttk.Progressbar(self.top, mode='indeterminate', length=250)
        self.progress.pack(pady=10, padx=20)
        self.progress.start(15)

        # Force the OS to acknowledge the window's existence before it is
        # positioned.
        self.top.update_idletasks()

        # Center and then reveal.
        # NOTE(review): center_window_on_primary comes from tk_utils;
        # presumably it centers on the primary monitor, per its name.
        center_window_on_primary(self.top, width, height)
        self.top.deiconify()

    def teardown(self):
        """Cleanly shut down the splash window.

        Safe to call more than once: if the window (or the whole Tk
        application) is already gone there is nothing left to clean up.
        """
        try:
            if self.top.winfo_exists():
                self.progress.stop()
                self.top.destroy()
        except tk.TclError:
            # The Tk application itself was already destroyed.
            pass
|
pdflinkcheck/stdlib_server.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/stdlib_server.py
|
|
4
|
+
from __future__ import annotations
|
|
2
5
|
import http.server
|
|
3
6
|
import socketserver
|
|
4
7
|
import json
|
|
@@ -8,7 +11,7 @@ import os
|
|
|
8
11
|
from pathlib import Path
|
|
9
12
|
import email # This replaces cgi for multipart parsing
|
|
10
13
|
|
|
11
|
-
from pdflinkcheck.report import
|
|
14
|
+
from pdflinkcheck.report import run_report_and_call_exports
|
|
12
15
|
|
|
13
16
|
PORT = 8000
|
|
14
17
|
|
|
@@ -17,25 +20,22 @@ HTML_FORM = """
|
|
|
17
20
|
<html>
|
|
18
21
|
<head><title>pdflinkcheck Stdlib Server</title></head>
|
|
19
22
|
<body style="font-family: sans-serif; max-width: 800px; margin: 40px auto;">
|
|
20
|
-
<h1>pdflinkcheck API (
|
|
21
|
-
<p>Upload a PDF for link/TOC analysis
|
|
23
|
+
<h1>pdflinkcheck API (pure stdlib)</h1>
|
|
24
|
+
<p>Upload a PDF for link/TOC analysis.</p>
|
|
22
25
|
<form action="/" method="post" enctype="multipart/form-data">
|
|
23
26
|
<p><input type="file" name="file" accept=".pdf" required></p>
|
|
24
27
|
<p>
|
|
25
28
|
<label>Engine:</label>
|
|
26
29
|
<select name="pdf_library">
|
|
27
30
|
<option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
|
|
28
|
-
<option value="pymupdf">
|
|
31
|
+
<option value="pymupdf">PyMuPDF (fast, AGPL3)</option>
|
|
32
|
+
<option value="pdfium">PDFium (fast, permissive)</option>
|
|
29
33
|
</select>
|
|
30
34
|
</p>
|
|
31
|
-
<p>
|
|
32
|
-
<label>Max links to show (0 = all):</label>
|
|
33
|
-
<input type="number" name="max_links" value="0" min="0">
|
|
34
|
-
</p>
|
|
35
35
|
<p><button type="submit">Analyze PDF</button></p>
|
|
36
36
|
</form>
|
|
37
37
|
<hr>
|
|
38
|
-
<p>Returns JSON
|
|
38
|
+
<p>Returns JSON.</p>
|
|
39
39
|
</body>
|
|
40
40
|
</html>
|
|
41
41
|
"""
|
|
@@ -90,7 +90,6 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
90
90
|
# Extract parts
|
|
91
91
|
file_item = None
|
|
92
92
|
pdf_library = "pypdf"
|
|
93
|
-
max_links = 0
|
|
94
93
|
|
|
95
94
|
for part in msg.get_payload():
|
|
96
95
|
disposition = part.get("Content-Disposition", "")
|
|
@@ -109,16 +108,10 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
109
108
|
|
|
110
109
|
elif name == "pdf_library":
|
|
111
110
|
pdf_library = part.get_payload(decode=True).decode().lower()
|
|
112
|
-
if pdf_library not in {"pypdf", "pymupdf"}:
|
|
111
|
+
if pdf_library not in {"pypdf", "pymupdf", "pdfium"}:
|
|
113
112
|
self._send_json_error("Invalid pdf_library", 400)
|
|
114
113
|
return
|
|
115
114
|
|
|
116
|
-
elif name == "max_links":
|
|
117
|
-
try:
|
|
118
|
-
max_links = int(part.get_payload(decode=True).decode())
|
|
119
|
-
except ValueError:
|
|
120
|
-
max_links = 0
|
|
121
|
-
|
|
122
115
|
if not file_item:
|
|
123
116
|
self._send_json_error("No PDF file uploaded", 400)
|
|
124
117
|
return
|
|
@@ -130,18 +123,19 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
130
123
|
tmp_file.write(file_item)
|
|
131
124
|
tmp_path = tmp_file.name
|
|
132
125
|
|
|
133
|
-
result =
|
|
126
|
+
result = run_report_and_call_exports(
|
|
134
127
|
pdf_path=tmp_path,
|
|
135
|
-
max_links=max_links if max_links > 0 else 0,
|
|
136
128
|
export_format="",
|
|
137
129
|
pdf_library=pdf_library,
|
|
138
130
|
print_bool=False
|
|
139
131
|
)
|
|
132
|
+
|
|
133
|
+
total_links_count = result.get("metadata",{}).get("link_counts",{}).get("total_links_count", 0)
|
|
140
134
|
|
|
141
135
|
response = {
|
|
142
136
|
"filename": file_filename,
|
|
143
137
|
"pdf_library_used": pdf_library,
|
|
144
|
-
"
|
|
138
|
+
"total_links_count": total_links_count,
|
|
145
139
|
"data": result["data"],
|
|
146
140
|
"text_report": result["text"]
|
|
147
141
|
}
|