justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/sanitize.py
ADDED
|
@@ -0,0 +1,763 @@
|
|
|
1
|
+
"""HTML sanitization policy API.
|
|
2
|
+
|
|
3
|
+
This module defines the public API for JustHTML sanitization.
|
|
4
|
+
|
|
5
|
+
The sanitizer operates on the parsed JustHTML DOM and is intentionally
|
|
6
|
+
policy-driven.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Callable, Collection, Mapping
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
from urllib.parse import quote, urlsplit
|
|
15
|
+
|
|
16
|
+
UrlFilter = Callable[[str, str, str], str | None]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True, slots=True)
|
|
20
|
+
class UrlRule:
|
|
21
|
+
"""Rule for a single URL-valued attribute (e.g. a[href], img[src]).
|
|
22
|
+
|
|
23
|
+
This is intentionally rendering-oriented.
|
|
24
|
+
|
|
25
|
+
- Returning/keeping a URL can still cause network requests when the output
|
|
26
|
+
is rendered (notably for <img src>). Applications like email viewers often
|
|
27
|
+
want to block remote loads by default.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
# Allow relative URLs (including /path, ./path, ../path, ?query).
|
|
31
|
+
allow_relative: bool = True
|
|
32
|
+
|
|
33
|
+
# Allow same-document fragments (#foo). Typically safe.
|
|
34
|
+
allow_fragment: bool = True
|
|
35
|
+
|
|
36
|
+
# If set, protocol-relative URLs (//example.com) are resolved to this scheme
|
|
37
|
+
# (e.g. "https") before checking allowed_schemes.
|
|
38
|
+
# If None, protocol-relative URLs are disallowed.
|
|
39
|
+
resolve_protocol_relative: str | None = "https"
|
|
40
|
+
|
|
41
|
+
# Allow absolute URLs with these schemes (lowercase), e.g. {"https"}.
|
|
42
|
+
# If empty, all absolute URLs with a scheme are disallowed.
|
|
43
|
+
allowed_schemes: Collection[str] = field(default_factory=set)
|
|
44
|
+
|
|
45
|
+
# If provided, absolute URLs are allowed only if the parsed host is in this
|
|
46
|
+
# allowlist.
|
|
47
|
+
allowed_hosts: Collection[str] | None = None
|
|
48
|
+
|
|
49
|
+
# Optional proxy rewrite for allowed absolute/protocol-relative URLs.
|
|
50
|
+
# Example: proxy_url="/proxy" -> https://google.com becomes
|
|
51
|
+
# /proxy?url=https%3A%2F%2Fgoogle.com
|
|
52
|
+
proxy_url: str | None = None
|
|
53
|
+
|
|
54
|
+
# Query parameter name used when proxy_url is set.
|
|
55
|
+
proxy_param: str = "url"
|
|
56
|
+
|
|
57
|
+
def __post_init__(self) -> None:
|
|
58
|
+
# Accept lists/tuples from user code, normalize for internal use.
|
|
59
|
+
if not isinstance(self.allowed_schemes, set):
|
|
60
|
+
object.__setattr__(self, "allowed_schemes", set(self.allowed_schemes))
|
|
61
|
+
if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
|
|
62
|
+
object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
|
|
63
|
+
|
|
64
|
+
if self.proxy_url is not None:
|
|
65
|
+
proxy_url = str(self.proxy_url)
|
|
66
|
+
object.__setattr__(self, "proxy_url", proxy_url if proxy_url else None)
|
|
67
|
+
object.__setattr__(self, "proxy_param", str(self.proxy_param))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _proxy_url_value(*, proxy_url: str, proxy_param: str, value: str) -> str:
|
|
71
|
+
sep = "&" if "?" in proxy_url else "?"
|
|
72
|
+
return f"{proxy_url}{sep}{proxy_param}={quote(value, safe='')}"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass(frozen=True, slots=True)
|
|
76
|
+
class SanitizationPolicy:
|
|
77
|
+
"""An allow-list driven policy for sanitizing a parsed DOM.
|
|
78
|
+
|
|
79
|
+
This API is intentionally small. The implementation will interpret these
|
|
80
|
+
fields strictly.
|
|
81
|
+
|
|
82
|
+
- Tags not in `allowed_tags` are disallowed.
|
|
83
|
+
- Attributes not in `allowed_attributes[tag]` (or `allowed_attributes["*"]`)
|
|
84
|
+
are disallowed.
|
|
85
|
+
- URL scheme checks apply to attributes listed in `url_attributes`.
|
|
86
|
+
|
|
87
|
+
All tag and attribute names are expected to be ASCII-lowercase.
|
|
88
|
+
"""
|
|
89
|
+
|
|
90
|
+
allowed_tags: Collection[str]
|
|
91
|
+
allowed_attributes: Mapping[str, Collection[str]]
|
|
92
|
+
|
|
93
|
+
# URL handling:
|
|
94
|
+
# - `url_rules` is the data-driven allowlist for URL-valued attributes.
|
|
95
|
+
# - `url_filter` is an optional hook that can drop or rewrite URLs.
|
|
96
|
+
#
|
|
97
|
+
# `url_filter(tag, attr, value)` should return:
|
|
98
|
+
# - a replacement string to keep (possibly rewritten), or
|
|
99
|
+
# - None to drop the attribute.
|
|
100
|
+
url_rules: Mapping[tuple[str, str], UrlRule]
|
|
101
|
+
url_filter: UrlFilter | None = None
|
|
102
|
+
|
|
103
|
+
drop_comments: bool = True
|
|
104
|
+
drop_doctype: bool = True
|
|
105
|
+
drop_foreign_namespaces: bool = True
|
|
106
|
+
|
|
107
|
+
# If True, disallowed elements are removed but their children may be kept
|
|
108
|
+
# (except for tags in `drop_content_tags`).
|
|
109
|
+
strip_disallowed_tags: bool = True
|
|
110
|
+
|
|
111
|
+
# Dangerous containers whose text payload should not be preserved.
|
|
112
|
+
drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})
|
|
113
|
+
|
|
114
|
+
# Inline style allowlist.
|
|
115
|
+
# Only applies when the `style` attribute is allowed for a tag.
|
|
116
|
+
# If empty, inline styles are effectively disabled (style attributes are dropped).
|
|
117
|
+
allowed_css_properties: Collection[str] = field(default_factory=set)
|
|
118
|
+
|
|
119
|
+
# Link hardening.
|
|
120
|
+
# If non-empty, ensure these tokens are present in <a rel="...">.
|
|
121
|
+
# (The sanitizer will merge tokens; it will not remove existing ones.)
|
|
122
|
+
force_link_rel: Collection[str] = field(default_factory=set)
|
|
123
|
+
|
|
124
|
+
def __post_init__(self) -> None:
|
|
125
|
+
# Normalize to sets so the sanitizer can do fast membership checks.
|
|
126
|
+
if not isinstance(self.allowed_tags, set):
|
|
127
|
+
object.__setattr__(self, "allowed_tags", set(self.allowed_tags))
|
|
128
|
+
|
|
129
|
+
if not isinstance(self.allowed_attributes, dict) or any(
|
|
130
|
+
not isinstance(v, set) for v in self.allowed_attributes.values()
|
|
131
|
+
):
|
|
132
|
+
normalized_attrs: dict[str, set[str]] = {}
|
|
133
|
+
for tag, attrs in self.allowed_attributes.items():
|
|
134
|
+
normalized_attrs[str(tag)] = attrs if isinstance(attrs, set) else set(attrs)
|
|
135
|
+
object.__setattr__(self, "allowed_attributes", normalized_attrs)
|
|
136
|
+
|
|
137
|
+
if not isinstance(self.drop_content_tags, set):
|
|
138
|
+
object.__setattr__(self, "drop_content_tags", set(self.drop_content_tags))
|
|
139
|
+
|
|
140
|
+
if not isinstance(self.allowed_css_properties, set):
|
|
141
|
+
object.__setattr__(self, "allowed_css_properties", set(self.allowed_css_properties))
|
|
142
|
+
|
|
143
|
+
if not isinstance(self.force_link_rel, set):
|
|
144
|
+
object.__setattr__(self, "force_link_rel", set(self.force_link_rel))
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Conservative default policy for fragments/snippets.
DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=[
        # Paragraphs and line breaks
        "p",
        "br",
        # Structure
        "div",
        "span",
        "blockquote",
        "pre",
        "code",
        # Headings
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        # Lists
        "ul",
        "ol",
        "li",
        # Tables
        "table",
        "thead",
        "tbody",
        "tfoot",
        "tr",
        "th",
        "td",
        # Text formatting
        "b",
        "strong",
        "i",
        "em",
        "u",
        "s",
        "sub",
        "sup",
        "small",
        "mark",
        # Thematic break
        "hr",
        # Links and images
        "a",
        "img",
    ],
    allowed_attributes={
        "*": ["class", "id", "title", "lang", "dir"],
        "a": ["href", "title"],
        "img": ["src", "alt", "title", "width", "height", "loading", "decoding"],
        "th": ["colspan", "rowspan"],
        "td": ["colspan", "rowspan"],
    },
    # Default URL stance (see UrlRule: allow_relative/allow_fragment default
    # to True, and protocol-relative URLs resolve to https):
    # - Links may point to http/https/mailto/tel, protocol-relative,
    #   relative, and fragment URLs.
    # - Images may point to http/https, protocol-relative, and relative URLs.
    url_rules={
        ("a", "href"): UrlRule(
            allowed_schemes=["http", "https", "mailto", "tel"],
            resolve_protocol_relative="https",
        ),
        ("img", "src"): UrlRule(
            allowed_schemes=["http", "https"],
            resolve_protocol_relative="https",
        ),
    },
    # Inline styles are disabled by default (style attributes are dropped).
    allowed_css_properties=set(),
)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Document-shaped variant of DEFAULT_POLICY: identical in every field except
# that the structural wrappers of a full document are also allowed.
DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=sorted({"html", "head", "body", "title"} | set(DEFAULT_POLICY.allowed_tags)),
    allowed_attributes=DEFAULT_POLICY.allowed_attributes,
    url_rules=DEFAULT_POLICY.url_rules,
    url_filter=DEFAULT_POLICY.url_filter,
    drop_comments=DEFAULT_POLICY.drop_comments,
    drop_doctype=DEFAULT_POLICY.drop_doctype,
    drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
    strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
    drop_content_tags=DEFAULT_POLICY.drop_content_tags,
    allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
    force_link_rel=DEFAULT_POLICY.force_link_rel,
)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _is_valid_css_property_name(name: str) -> bool:
|
|
235
|
+
# Conservative: allow only ASCII letters/digits/hyphen.
|
|
236
|
+
# This keeps parsing deterministic and avoids surprises with escapes.
|
|
237
|
+
if not name:
|
|
238
|
+
return False
|
|
239
|
+
for ch in name:
|
|
240
|
+
if "a" <= ch <= "z" or "0" <= ch <= "9" or ch == "-":
|
|
241
|
+
continue
|
|
242
|
+
return False
|
|
243
|
+
return True
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _css_value_may_load_external_resource(value: str) -> bool:
|
|
247
|
+
# Extremely conservative check: drop any declaration value that contains a
|
|
248
|
+
# CSS function call that can load external resources.
|
|
249
|
+
#
|
|
250
|
+
# We intentionally do not try to parse full CSS (escapes, comments, strings,
|
|
251
|
+
# etc.). Instead, we reject values that contain backslashes (common escape
|
|
252
|
+
# obfuscation) or that *look* like they contain url(…) / image-set(…). This
|
|
253
|
+
# ensures style attributes can't be used to trigger network requests even
|
|
254
|
+
# when users allow potentially dangerous properties.
|
|
255
|
+
if "\\" in value:
|
|
256
|
+
return True
|
|
257
|
+
|
|
258
|
+
# Scan while ignoring ASCII whitespace/control chars and CSS comments.
|
|
259
|
+
# Keep a small rolling buffer to avoid extra allocations.
|
|
260
|
+
buf: list[str] = []
|
|
261
|
+
max_len = len("alphaimageloader")
|
|
262
|
+
|
|
263
|
+
i = 0
|
|
264
|
+
n = len(value)
|
|
265
|
+
while i < n:
|
|
266
|
+
ch = value[i]
|
|
267
|
+
|
|
268
|
+
# Treat CSS comments as ignorable, so obfuscation like u/**/rl( is caught.
|
|
269
|
+
if ch == "/" and i + 1 < n and value[i + 1] == "*":
|
|
270
|
+
i += 2
|
|
271
|
+
while i + 1 < n:
|
|
272
|
+
if value[i] == "*" and value[i + 1] == "/":
|
|
273
|
+
i += 2
|
|
274
|
+
break
|
|
275
|
+
i += 1
|
|
276
|
+
else:
|
|
277
|
+
# Unterminated comments are invalid CSS; be conservative.
|
|
278
|
+
return True
|
|
279
|
+
continue
|
|
280
|
+
|
|
281
|
+
o = ord(ch)
|
|
282
|
+
if o <= 0x20 or o == 0x7F:
|
|
283
|
+
i += 1
|
|
284
|
+
continue
|
|
285
|
+
|
|
286
|
+
if "A" <= ch <= "Z":
|
|
287
|
+
lower_ch = chr(o + 0x20)
|
|
288
|
+
else:
|
|
289
|
+
lower_ch = ch
|
|
290
|
+
|
|
291
|
+
buf.append(lower_ch)
|
|
292
|
+
if len(buf) > max_len:
|
|
293
|
+
buf.pop(0)
|
|
294
|
+
|
|
295
|
+
# Check for url( and image-set( anywhere in the normalized stream.
|
|
296
|
+
if len(buf) >= 4 and buf[-4:] == ["u", "r", "l", "("]:
|
|
297
|
+
return True
|
|
298
|
+
if len(buf) >= 10 and buf[-10:] == [
|
|
299
|
+
"i",
|
|
300
|
+
"m",
|
|
301
|
+
"a",
|
|
302
|
+
"g",
|
|
303
|
+
"e",
|
|
304
|
+
"-",
|
|
305
|
+
"s",
|
|
306
|
+
"e",
|
|
307
|
+
"t",
|
|
308
|
+
"(",
|
|
309
|
+
]:
|
|
310
|
+
return True
|
|
311
|
+
|
|
312
|
+
# IE-only but still worth blocking defensively.
|
|
313
|
+
if len(buf) >= 11 and buf[-11:] == [
|
|
314
|
+
"e",
|
|
315
|
+
"x",
|
|
316
|
+
"p",
|
|
317
|
+
"r",
|
|
318
|
+
"e",
|
|
319
|
+
"s",
|
|
320
|
+
"s",
|
|
321
|
+
"i",
|
|
322
|
+
"o",
|
|
323
|
+
"n",
|
|
324
|
+
"(",
|
|
325
|
+
]:
|
|
326
|
+
return True
|
|
327
|
+
|
|
328
|
+
# Legacy IE CSS filters that can fetch remote resources.
|
|
329
|
+
if len(buf) >= 7 and buf[-7:] == ["p", "r", "o", "g", "i", "d", ":"]:
|
|
330
|
+
return True
|
|
331
|
+
if len(buf) >= 16 and buf[-16:] == [
|
|
332
|
+
"a",
|
|
333
|
+
"l",
|
|
334
|
+
"p",
|
|
335
|
+
"h",
|
|
336
|
+
"a",
|
|
337
|
+
"i",
|
|
338
|
+
"m",
|
|
339
|
+
"a",
|
|
340
|
+
"g",
|
|
341
|
+
"e",
|
|
342
|
+
"l",
|
|
343
|
+
"o",
|
|
344
|
+
"a",
|
|
345
|
+
"d",
|
|
346
|
+
"e",
|
|
347
|
+
"r",
|
|
348
|
+
]:
|
|
349
|
+
return True
|
|
350
|
+
|
|
351
|
+
# Legacy bindings/behaviors that can pull remote content.
|
|
352
|
+
if len(buf) >= 9 and buf[-9:] == ["b", "e", "h", "a", "v", "i", "o", "r", ":"]:
|
|
353
|
+
return True
|
|
354
|
+
if len(buf) >= 12 and buf[-12:] == [
|
|
355
|
+
"-",
|
|
356
|
+
"m",
|
|
357
|
+
"o",
|
|
358
|
+
"z",
|
|
359
|
+
"-",
|
|
360
|
+
"b",
|
|
361
|
+
"i",
|
|
362
|
+
"n",
|
|
363
|
+
"d",
|
|
364
|
+
"i",
|
|
365
|
+
"n",
|
|
366
|
+
"g",
|
|
367
|
+
]:
|
|
368
|
+
return True
|
|
369
|
+
|
|
370
|
+
i += 1
|
|
371
|
+
|
|
372
|
+
return False
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _sanitize_inline_style(*, policy: SanitizationPolicy, value: str) -> str | None:
    """Filter a style="..." value down to allowed, non-resource-loading declarations.

    Returns the surviving declarations re-joined as "prop: value; ...",
    or None when nothing survives (callers then drop the attribute).
    """
    allowed = policy.allowed_css_properties
    if not allowed:
        # Empty allowlist means inline styles are disabled entirely.
        return None

    text = str(value)
    if not text:
        return None

    kept: list[str] = []
    for raw_decl in text.split(";"):
        decl = raw_decl.strip()
        if not decl:
            continue

        prop_part, sep, value_part = decl.partition(":")
        if not sep:
            # No colon: not a declaration.
            continue

        prop = prop_part.strip().lower()
        if not _is_valid_css_property_name(prop):
            continue
        if prop not in allowed:
            continue

        body = value_part.strip()
        if not body:
            continue

        if _css_value_may_load_external_resource(body):
            continue

        kept.append(f"{prop}: {body}")

    return "; ".join(kept) if kept else None
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _normalize_url_for_checking(value: str) -> str:
|
|
414
|
+
# Strip whitespace/control chars commonly used for scheme obfuscation.
|
|
415
|
+
# Note: do not strip backslashes; they are not whitespace/control chars,
|
|
416
|
+
# and removing them can turn invalid schemes into valid ones.
|
|
417
|
+
out: list[str] = []
|
|
418
|
+
for ch in value:
|
|
419
|
+
o = ord(ch)
|
|
420
|
+
if o <= 0x20 or o == 0x7F:
|
|
421
|
+
continue
|
|
422
|
+
out.append(ch)
|
|
423
|
+
return "".join(out)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _is_valid_scheme(scheme: str) -> bool:
|
|
427
|
+
first = scheme[0]
|
|
428
|
+
if not ("a" <= first <= "z" or "A" <= first <= "Z"):
|
|
429
|
+
return False
|
|
430
|
+
for ch in scheme[1:]:
|
|
431
|
+
if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9" or ch in "+-.":
|
|
432
|
+
continue
|
|
433
|
+
return False
|
|
434
|
+
return True
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _has_scheme(value: str) -> bool:
    """Return True if `value` begins with a syntactically valid URL scheme.

    The colon must come before any "/", "?", or "#", otherwise the colon
    belongs to the path/query/fragment rather than a scheme.
    """
    colon = value.find(":")
    if colon <= 0:
        return False

    # Earliest path/query/fragment separator bounds the scheme candidate.
    boundary = min(
        (p for p in (value.find(sep) for sep in "/?#") if p != -1),
        default=len(value),
    )
    if colon >= boundary:
        return False

    return _is_valid_scheme(value[:colon])
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _has_invalid_scheme_like_prefix(value: str) -> bool:
    """Return True if `value` has a colon-delimited prefix that is NOT a valid scheme.

    Such values (e.g. "java\\script:...") look relative to a strict parser but
    can be normalized into an absolute URL by lenient user agents, so callers
    treat them as suspicious rather than relative.
    """
    colon = value.find(":")
    if colon <= 0:
        return False

    # Only a colon before any "/", "?", "#" can delimit a scheme.
    boundary = min(
        (p for p in (value.find(sep) for sep in "/?#") if p != -1),
        default=len(value),
    )
    if colon >= boundary:
        return False

    return not _is_valid_scheme(value[:colon])
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def _sanitize_url_value(
    *,
    policy: SanitizationPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
) -> str | None:
    """Validate (and possibly rewrite) one URL-valued attribute per `rule`.

    Returns the value to keep — possibly rewritten by `policy.url_filter`,
    resolved from protocol-relative form, or routed through `rule.proxy_url`
    — or None to drop the attribute.

    The branch order below is deliberate and security-sensitive:
    filter hook -> strip/normalize -> fragment -> proxy obfuscation check ->
    protocol-relative -> absolute (scheme) -> relative fallback.
    """
    v = value
    # The user hook runs first and may drop (None) or rewrite the raw value;
    # everything it returns is still subject to the checks below.
    if policy.url_filter is not None:
        rewritten = policy.url_filter(tag, attr, v)
        if rewritten is None:
            return None
        v = rewritten

    stripped = str(v).strip()
    # Remove embedded whitespace/control chars before any scheme inspection
    # (browsers ignore them, so "ja\nvascript:" must not slip through).
    normalized = _normalize_url_for_checking(stripped)
    if not normalized:
        # If normalization removes everything, the value was empty/whitespace/
        # control-only. Drop it rather than keeping weird control characters.
        return None

    # Same-document fragment: keep the lightly-stripped original, not the
    # normalized form.
    if normalized.startswith("#"):
        return stripped if rule.allow_fragment else None

    # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
    # Some user agents normalize backslashes and other characters during navigation.
    if rule.proxy_url and _has_invalid_scheme_like_prefix(normalized):
        return None

    # Protocol-relative (//host/...): resolve to an absolute URL first, then
    # apply the same scheme/host checks as for absolute URLs.
    if normalized.startswith("//"):
        if not rule.resolve_protocol_relative:
            return None

        resolved_scheme = rule.resolve_protocol_relative.lower()
        resolved_url = f"{resolved_scheme}:{normalized}"

        parsed = urlsplit(resolved_url)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None

        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None

        # Return the resolved URL (proxied when configured).
        return (
            _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=resolved_url)
            if rule.proxy_url
            else resolved_url
        )

    # Absolute URL with an explicit, syntactically valid scheme.
    if _has_scheme(normalized):
        parsed = urlsplit(normalized)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None
        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None
        # Keep the stripped original (not the normalized form) so legitimate
        # URLs survive byte-for-byte where possible.
        return (
            _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=stripped)
            if rule.proxy_url
            else stripped
        )

    # Everything else is treated as a relative URL (never proxied).
    return stripped if rule.allow_relative else None
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _sanitize_attrs(
    *,
    policy: SanitizationPolicy,
    tag: str,
    attrs: dict[str, str | None] | None,
) -> dict[str, str | None]:
    """Return a filtered copy of `attrs` for an element named `tag`.

    Applies the policy's attribute allowlist, URL rules, inline-style
    filtering, and (for <a>) forced rel-token merging.
    """
    # Per-tag allowlist is the union of the "*" entry and the tag's entry.
    allowed = set(policy.allowed_attributes.get("*", ())) | set(
        policy.allowed_attributes.get(tag, ())
    )

    result: dict[str, str | None] = {}
    for raw_name, raw_value in (attrs or {}).items():
        if not raw_name:
            continue

        name = str(raw_name).strip().lower()
        if not name:
            continue

        # Namespace-ish attributes (xlink:href, xml:base, ...) are never kept.
        if ":" in name:
            continue

        # Event handlers are always dropped.
        if name.startswith("on"):
            continue

        # srcdoc embeds an entire unsanitized document; never keep it.
        if name == "srcdoc":
            continue

        # Let rel through on <a> even when not allowlisted, so forced
        # tokens can be merged with the author's tokens below.
        keep_for_rel = tag == "a" and name == "rel" and bool(policy.force_link_rel)
        if name not in allowed and not keep_for_rel:
            continue

        if raw_value is None:
            # Boolean/valueless attribute.
            result[name] = None
            continue

        text = str(raw_value)
        url_rule = policy.url_rules.get((tag, name))
        if url_rule is not None:
            cleaned = _sanitize_url_value(
                policy=policy, rule=url_rule, tag=tag, attr=name, value=text
            )
            if cleaned is not None:
                result[name] = cleaned
        elif name == "style":
            cleaned_style = _sanitize_inline_style(policy=policy, value=text)
            if cleaned_style is not None:
                result[name] = cleaned_style
        else:
            result[name] = text

    # Link hardening (merge tokens; never remove existing ones).
    forced = [token.strip().lower() for token in policy.force_link_rel if str(token).strip()]
    if tag == "a" and forced:
        merged: list[str] = []
        current = result.get("rel")
        if isinstance(current, str) and current:
            for piece in current.split():
                token = piece.strip().lower()
                if token and token not in merged:
                    merged.append(token)
        # Sorted for deterministic output regardless of set iteration order.
        for token in sorted(forced):
            if token not in merged:
                merged.append(token)
        result["rel"] = " ".join(merged)

    return result
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
    """Append a sanitized copy of `original`'s subtree under `parent_out`.

    Iterative (explicit stack) so deeply nested input cannot overflow the
    recursion limit. Children are pushed in reverse so they are popped —
    and appended — in document order.

    NOTE(review): relies on the project node API (`clone_node`, `append_child`,
    `children`, `namespace`, `template_content`); `clone_node(deep=False)` is
    assumed to copy attrs without children — confirm in node.py.
    """
    stack: list[tuple[Any, Any]] = [(original, parent_out)]
    while stack:
        current, out_parent = stack.pop()
        name: str = current.name

        # Text nodes are always kept verbatim.
        if name == "#text":
            out_parent.append_child(current.clone_node(deep=False))
            continue

        if name == "#comment":
            if policy.drop_comments:
                continue
            out_parent.append_child(current.clone_node(deep=False))
            continue

        if name == "!doctype":
            if policy.drop_doctype:
                continue
            out_parent.append_child(current.clone_node(deep=False))
            continue

        # Document containers (#document, #document-fragment): keep the
        # container node, recurse into its children.
        if name.startswith("#"):
            clone = current.clone_node(deep=False)
            clone.children.clear()
            out_parent.append_child(clone)
            children = current.children or []
            stack.extend((child, clone) for child in reversed(children))
            continue

        # Element.
        tag = str(name).lower()
        if policy.drop_foreign_namespaces:
            ns = current.namespace
            # Foreign (e.g. SVG/MathML) subtrees are dropped wholesale.
            if ns not in (None, "html"):
                continue

        # Dangerous containers (script/style by default): drop the element
        # AND its payload, even when stripping would otherwise keep children.
        if tag in policy.drop_content_tags:
            continue

        if tag not in policy.allowed_tags:
            if policy.strip_disallowed_tags:
                # Unwrap: hoist the children up to the current output parent.
                children = current.children or []
                stack.extend((child, out_parent) for child in reversed(children))

                # A disallowed <template>'s content lives in a separate
                # subtree; hoist that too.
                if tag == "template" and current.namespace in (None, "html") and current.template_content:
                    tc_children = current.template_content.children or []
                    stack.extend((child, out_parent) for child in reversed(tc_children))
            continue

        clone = current.clone_node(deep=False)
        # Ensure children list is empty before we append sanitized descendants.
        clone.children.clear()
        # Filter attributes.
        clone.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=current.attrs)

        out_parent.append_child(clone)

        # Template content is a separate subtree (not part of .children).
        if tag == "template" and current.namespace in (None, "html"):
            if current.template_content and clone.template_content:
                clone.template_content.children.clear()
                tc_children = current.template_content.children or []
                stack.extend((child, clone.template_content) for child in reversed(tc_children))

        children = current.children or []
        stack.extend((child, clone) for child in reversed(children))
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
    """Return a sanitized clone of `node`; the input tree is never mutated.

    If `policy` is not provided, JustHTML uses a conservative default policy.
    For full documents (`#document` roots) it preserves `<html>`, `<head>`, and
    `<body>` wrappers; for fragments it prefers snippet-shaped output.

    When the root itself is dropped by the policy, the result is an (often
    empty) `#document-fragment` rather than None, so callers always get a
    serializable node back.
    """

    if policy is None:
        # Document roots get the wrapper-preserving policy; everything else
        # is treated as a snippet.
        policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY

    # Root handling: each root shape below mirrors a branch of
    # _append_sanitized_subtree, but returns a standalone node.
    root_name: str = node.name

    if root_name == "#text":
        return node.clone_node(deep=False)

    if root_name == "#comment":
        out_root = node.clone_node(deep=False)
        if policy.drop_comments:
            # Dropped root: degrade to an empty fragment instead of None.
            out_root.name = "#document-fragment"
        return out_root

    if root_name == "!doctype":
        out_root = node.clone_node(deep=False)
        if policy.drop_doctype:
            out_root.name = "#document-fragment"
        return out_root

    # Containers (#document, #document-fragment): sanitize children in place
    # under a shallow clone of the container.
    if root_name.startswith("#"):
        out_root = node.clone_node(deep=False)
        out_root.children.clear()
        for child in node.children or []:
            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
        return out_root

    # Element root: keep element if allowed, otherwise unwrap into a fragment.
    tag = str(root_name).lower()
    # Foreign-namespace root: drop the whole subtree, return empty fragment.
    if policy.drop_foreign_namespaces and node.namespace not in (None, "html"):
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        return out_root

    # Dangerous container (script/style), or disallowed without stripping:
    # nothing survives, not even children.
    if tag in policy.drop_content_tags or (tag not in policy.allowed_tags and not policy.strip_disallowed_tags):
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        return out_root

    # Disallowed but stripping: unwrap — sanitize the children into a fragment.
    if tag not in policy.allowed_tags and policy.strip_disallowed_tags:
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        for child in node.children or []:
            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)

        # A disallowed <template> root: hoist its separate content subtree too.
        if tag == "template" and node.namespace in (None, "html") and node.template_content:
            for child in node.template_content.children or []:
                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
        return out_root

    # Allowed element root: keep it, with filtered attributes and sanitized
    # descendants.
    out_root = node.clone_node(deep=False)
    out_root.children.clear()
    out_root.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=node.attrs)
    for child in node.children or []:
        _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)

    # Template content is a separate subtree (not part of .children).
    if tag == "template" and node.namespace in (None, "html"):
        if node.template_content and out_root.template_content:
            out_root.template_content.children.clear()
            for child in node.template_content.children or []:
                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root.template_content)

    return out_root
|