justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/sanitize.py
CHANGED
|
@@ -10,12 +10,38 @@ from __future__ import annotations
|
|
|
10
10
|
|
|
11
11
|
from collections.abc import Callable, Collection, Mapping
|
|
12
12
|
from dataclasses import dataclass, field
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any, Literal, cast
|
|
14
14
|
from urllib.parse import quote, urlsplit
|
|
15
15
|
|
|
16
|
+
from .tokens import ParseError
|
|
17
|
+
|
|
16
18
|
UrlFilter = Callable[[str, str, str], str | None]
|
|
17
19
|
|
|
18
20
|
|
|
21
|
+
class UnsafeHtmlError(ValueError):
|
|
22
|
+
"""Raised when unsafe HTML is encountered and unsafe_handling='raise'."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
UnsafeHandling = Literal["strip", "raise", "collect"]
|
|
26
|
+
|
|
27
|
+
DisallowedTagHandling = Literal["unwrap", "escape", "drop"]
|
|
28
|
+
|
|
29
|
+
UrlHandling = Literal["allow", "strip", "proxy"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True, slots=True)
|
|
33
|
+
class UrlProxy:
|
|
34
|
+
url: str
|
|
35
|
+
param: str = "url"
|
|
36
|
+
|
|
37
|
+
def __post_init__(self) -> None:
|
|
38
|
+
proxy_url = str(self.url)
|
|
39
|
+
if not proxy_url:
|
|
40
|
+
raise ValueError("UrlProxy.url must be a non-empty string")
|
|
41
|
+
object.__setattr__(self, "url", proxy_url)
|
|
42
|
+
object.__setattr__(self, "param", str(self.param))
|
|
43
|
+
|
|
44
|
+
|
|
19
45
|
@dataclass(frozen=True, slots=True)
|
|
20
46
|
class UrlRule:
|
|
21
47
|
"""Rule for a single URL-valued attribute (e.g. a[href], img[src]).
|
|
@@ -27,9 +53,6 @@ class UrlRule:
|
|
|
27
53
|
want to block remote loads by default.
|
|
28
54
|
"""
|
|
29
55
|
|
|
30
|
-
# Allow relative URLs (including /path, ./path, ../path, ?query).
|
|
31
|
-
allow_relative: bool = True
|
|
32
|
-
|
|
33
56
|
# Allow same-document fragments (#foo). Typically safe.
|
|
34
57
|
allow_fragment: bool = True
|
|
35
58
|
|
|
@@ -46,13 +69,17 @@ class UrlRule:
|
|
|
46
69
|
# allowlist.
|
|
47
70
|
allowed_hosts: Collection[str] | None = None
|
|
48
71
|
|
|
49
|
-
# Optional
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
|
|
72
|
+
# Optional per-rule handling override.
|
|
73
|
+
# If None, the URL is kept ("allow") after it passes validation.
|
|
74
|
+
handling: UrlHandling | None = None
|
|
75
|
+
|
|
76
|
+
# Optional per-rule override of UrlPolicy.default_allow_relative.
|
|
77
|
+
# If None, UrlPolicy.default_allow_relative is used.
|
|
78
|
+
allow_relative: bool | None = None
|
|
53
79
|
|
|
54
|
-
#
|
|
55
|
-
|
|
80
|
+
# Optional proxy override for absolute/protocol-relative URLs.
|
|
81
|
+
# Used when the effective URL handling is "proxy".
|
|
82
|
+
proxy: UrlProxy | None = None
|
|
56
83
|
|
|
57
84
|
def __post_init__(self) -> None:
|
|
58
85
|
# Accept lists/tuples from user code, normalize for internal use.
|
|
@@ -61,15 +88,158 @@ class UrlRule:
|
|
|
61
88
|
if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
|
|
62
89
|
object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
|
|
63
90
|
|
|
64
|
-
if self.
|
|
65
|
-
|
|
66
|
-
object.__setattr__(self, "proxy_url", proxy_url if proxy_url else None)
|
|
67
|
-
object.__setattr__(self, "proxy_param", str(self.proxy_param))
|
|
91
|
+
if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
|
|
92
|
+
raise TypeError("UrlRule.proxy must be a UrlProxy or None")
|
|
68
93
|
|
|
94
|
+
if self.handling is not None:
|
|
95
|
+
mode = str(self.handling)
|
|
96
|
+
if mode not in {"allow", "strip", "proxy"}:
|
|
97
|
+
raise ValueError("Invalid UrlRule.handling. Expected one of: 'allow', 'strip', 'proxy'")
|
|
98
|
+
object.__setattr__(self, "handling", mode)
|
|
69
99
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
100
|
+
if self.allow_relative is not None:
|
|
101
|
+
object.__setattr__(self, "allow_relative", bool(self.allow_relative))
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True, slots=True)
|
|
105
|
+
class UrlPolicy:
|
|
106
|
+
# Default handling for URL-like attributes after they pass UrlRule checks.
|
|
107
|
+
# - "allow": keep the URL as-is
|
|
108
|
+
# - "strip": drop the attribute
|
|
109
|
+
# - "proxy": rewrite the URL through a proxy (UrlPolicy.proxy or UrlRule.proxy)
|
|
110
|
+
default_handling: UrlHandling = "strip"
|
|
111
|
+
|
|
112
|
+
# Default allowance for relative URLs (including /path, ./path, ../path, ?query)
|
|
113
|
+
# for URL-like attributes that have a matching UrlRule.
|
|
114
|
+
default_allow_relative: bool = True
|
|
115
|
+
|
|
116
|
+
# Rule configuration for URL-valued attributes.
|
|
117
|
+
allow_rules: Mapping[tuple[str, str], UrlRule] = field(default_factory=dict)
|
|
118
|
+
|
|
119
|
+
# Optional hook that can drop or rewrite URLs.
|
|
120
|
+
# url_filter(tag, attr, value) should return:
|
|
121
|
+
# - a replacement string to keep (possibly rewritten), or
|
|
122
|
+
# - None to drop the attribute.
|
|
123
|
+
url_filter: UrlFilter | None = None
|
|
124
|
+
|
|
125
|
+
# Default proxy config used when a rule is handled with "proxy" and
|
|
126
|
+
# the rule does not specify its own UrlRule.proxy override.
|
|
127
|
+
proxy: UrlProxy | None = None
|
|
128
|
+
|
|
129
|
+
def __post_init__(self) -> None:
|
|
130
|
+
mode = str(self.default_handling)
|
|
131
|
+
if mode not in {"allow", "strip", "proxy"}:
|
|
132
|
+
raise ValueError("Invalid default_handling. Expected one of: 'allow', 'strip', 'proxy'")
|
|
133
|
+
object.__setattr__(self, "default_handling", mode)
|
|
134
|
+
|
|
135
|
+
object.__setattr__(self, "default_allow_relative", bool(self.default_allow_relative))
|
|
136
|
+
|
|
137
|
+
if not isinstance(self.allow_rules, dict):
|
|
138
|
+
object.__setattr__(self, "allow_rules", dict(self.allow_rules))
|
|
139
|
+
|
|
140
|
+
if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
|
|
141
|
+
raise TypeError("UrlPolicy.proxy must be a UrlProxy or None")
|
|
142
|
+
|
|
143
|
+
# Validate proxy configuration for any rules that are in proxy mode.
|
|
144
|
+
for rule in self.allow_rules.values():
|
|
145
|
+
if not isinstance(rule, UrlRule):
|
|
146
|
+
raise TypeError("UrlPolicy.allow_rules values must be UrlRule")
|
|
147
|
+
if rule.handling == "proxy" and self.proxy is None and rule.proxy is None:
|
|
148
|
+
raise ValueError("UrlRule.handling='proxy' requires a UrlPolicy.proxy or a per-rule UrlRule.proxy")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _proxy_url_value(*, proxy: UrlProxy, value: str) -> str:
|
|
152
|
+
sep = "&" if "?" in proxy.url else "?"
|
|
153
|
+
return f"{proxy.url}{sep}{proxy.param}={quote(value, safe='')}"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass(slots=True)
|
|
157
|
+
class UnsafeHandler:
|
|
158
|
+
"""Centralized handler for security findings.
|
|
159
|
+
|
|
160
|
+
This is intentionally a small stateful object so multiple sanitization-
|
|
161
|
+
related passes/transforms can share the same unsafe-handling behavior and
|
|
162
|
+
(in collect mode) append into the same error list.
|
|
163
|
+
"""
|
|
164
|
+
|
|
165
|
+
unsafe_handling: UnsafeHandling
|
|
166
|
+
|
|
167
|
+
# Optional external sink (e.g. a JustHTML document's .errors list).
|
|
168
|
+
# When set and unsafe_handling == "collect", security findings are written
|
|
169
|
+
# into that list so multiple components can share a single sink.
|
|
170
|
+
sink: list[ParseError] | None = None
|
|
171
|
+
|
|
172
|
+
_errors: list[ParseError] | None = None
|
|
173
|
+
|
|
174
|
+
def reset(self) -> None:
|
|
175
|
+
if self.unsafe_handling != "collect":
|
|
176
|
+
self._errors = None
|
|
177
|
+
return
|
|
178
|
+
|
|
179
|
+
if self.sink is None:
|
|
180
|
+
self._errors = []
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
# Remove previously collected security findings from the shared sink to
|
|
184
|
+
# avoid accumulating duplicates across multiple runs.
|
|
185
|
+
errors = self.sink
|
|
186
|
+
write_i = 0
|
|
187
|
+
for e in errors:
|
|
188
|
+
if e.category == "security":
|
|
189
|
+
continue
|
|
190
|
+
errors[write_i] = e
|
|
191
|
+
write_i += 1
|
|
192
|
+
del errors[write_i:]
|
|
193
|
+
|
|
194
|
+
def collected(self) -> list[ParseError]:
|
|
195
|
+
src = self.sink if self.sink is not None else self._errors
|
|
196
|
+
if not src:
|
|
197
|
+
return []
|
|
198
|
+
|
|
199
|
+
if self.sink is not None:
|
|
200
|
+
out = [e for e in src if e.category == "security"]
|
|
201
|
+
else:
|
|
202
|
+
out = list(src)
|
|
203
|
+
out.sort(
|
|
204
|
+
key=lambda e: (
|
|
205
|
+
e.line if e.line is not None else 1_000_000_000,
|
|
206
|
+
e.column if e.column is not None else 1_000_000_000,
|
|
207
|
+
)
|
|
208
|
+
)
|
|
209
|
+
return out
|
|
210
|
+
|
|
211
|
+
def handle(self, msg: str, *, node: Any | None = None) -> None:
|
|
212
|
+
mode = self.unsafe_handling
|
|
213
|
+
if mode == "strip":
|
|
214
|
+
return
|
|
215
|
+
if mode == "raise":
|
|
216
|
+
raise UnsafeHtmlError(msg)
|
|
217
|
+
if mode == "collect":
|
|
218
|
+
dest = self.sink
|
|
219
|
+
if dest is None:
|
|
220
|
+
if self._errors is None:
|
|
221
|
+
self._errors = []
|
|
222
|
+
dest = self._errors
|
|
223
|
+
|
|
224
|
+
line: int | None = None
|
|
225
|
+
column: int | None = None
|
|
226
|
+
if node is not None:
|
|
227
|
+
# Best-effort: use node origin metadata when enabled.
|
|
228
|
+
# This stays allocation-light and avoids any input re-parsing.
|
|
229
|
+
line = node.origin_line
|
|
230
|
+
column = node.origin_col
|
|
231
|
+
|
|
232
|
+
dest.append(
|
|
233
|
+
ParseError(
|
|
234
|
+
"unsafe-html",
|
|
235
|
+
line=line,
|
|
236
|
+
column=column,
|
|
237
|
+
category="security",
|
|
238
|
+
message=msg,
|
|
239
|
+
)
|
|
240
|
+
)
|
|
241
|
+
return
|
|
242
|
+
raise AssertionError(f"Unhandled unsafe_handling: {mode!r}")
|
|
73
243
|
|
|
74
244
|
|
|
75
245
|
@dataclass(frozen=True, slots=True)
|
|
@@ -90,24 +260,13 @@ class SanitizationPolicy:
|
|
|
90
260
|
allowed_tags: Collection[str]
|
|
91
261
|
allowed_attributes: Mapping[str, Collection[str]]
|
|
92
262
|
|
|
93
|
-
# URL handling
|
|
94
|
-
|
|
95
|
-
# - `url_filter` is an optional hook that can drop or rewrite URLs.
|
|
96
|
-
#
|
|
97
|
-
# `url_filter(tag, attr, value)` should return:
|
|
98
|
-
# - a replacement string to keep (possibly rewritten), or
|
|
99
|
-
# - None to drop the attribute.
|
|
100
|
-
url_rules: Mapping[tuple[str, str], UrlRule]
|
|
101
|
-
url_filter: UrlFilter | None = None
|
|
263
|
+
# URL handling.
|
|
264
|
+
url_policy: UrlPolicy = field(default_factory=UrlPolicy)
|
|
102
265
|
|
|
103
266
|
drop_comments: bool = True
|
|
104
267
|
drop_doctype: bool = True
|
|
105
268
|
drop_foreign_namespaces: bool = True
|
|
106
269
|
|
|
107
|
-
# If True, disallowed elements are removed but their children may be kept
|
|
108
|
-
# (except for tags in `drop_content_tags`).
|
|
109
|
-
strip_disallowed_tags: bool = True
|
|
110
|
-
|
|
111
270
|
# Dangerous containers whose text payload should not be preserved.
|
|
112
271
|
drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})
|
|
113
272
|
|
|
@@ -121,6 +280,52 @@ class SanitizationPolicy:
|
|
|
121
280
|
# (The sanitizer will merge tokens; it will not remove existing ones.)
|
|
122
281
|
force_link_rel: Collection[str] = field(default_factory=set)
|
|
123
282
|
|
|
283
|
+
# Determines how unsafe input is handled.
|
|
284
|
+
#
|
|
285
|
+
# - "strip": Default. Remove/drop unsafe constructs and keep going.
|
|
286
|
+
# - "raise": Raise UnsafeHtmlError on the first unsafe construct.
|
|
287
|
+
#
|
|
288
|
+
# This is intentionally a string mode (instead of a boolean) so we can add
|
|
289
|
+
# more behaviors over time without changing the API shape.
|
|
290
|
+
unsafe_handling: UnsafeHandling = "strip"
|
|
291
|
+
|
|
292
|
+
# Determines how disallowed tags are handled.
|
|
293
|
+
#
|
|
294
|
+
# - "unwrap": Default. Drop the tag but keep/sanitize its children.
|
|
295
|
+
# - "escape": Emit original tag tokens as text, keep/sanitize children.
|
|
296
|
+
# - "drop": Drop the entire disallowed subtree.
|
|
297
|
+
disallowed_tag_handling: DisallowedTagHandling = "unwrap"
|
|
298
|
+
|
|
299
|
+
_unsafe_handler: UnsafeHandler = field(
|
|
300
|
+
default_factory=lambda: UnsafeHandler("strip"),
|
|
301
|
+
init=False,
|
|
302
|
+
repr=False,
|
|
303
|
+
compare=False,
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
# Internal caches to avoid per-node allocations in hot paths.
|
|
307
|
+
_allowed_attrs_global: frozenset[str] = field(
|
|
308
|
+
default_factory=frozenset,
|
|
309
|
+
init=False,
|
|
310
|
+
repr=False,
|
|
311
|
+
compare=False,
|
|
312
|
+
)
|
|
313
|
+
_allowed_attrs_by_tag: dict[str, frozenset[str]] = field(
|
|
314
|
+
default_factory=dict,
|
|
315
|
+
init=False,
|
|
316
|
+
repr=False,
|
|
317
|
+
compare=False,
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
# Cache for the compiled `Sanitize(policy=...)` transform pipeline.
|
|
321
|
+
# This lets safe serialization reuse the same compiled transforms.
|
|
322
|
+
_compiled_sanitize_transforms: list[Any] | None = field(
|
|
323
|
+
default=None,
|
|
324
|
+
init=False,
|
|
325
|
+
repr=False,
|
|
326
|
+
compare=False,
|
|
327
|
+
)
|
|
328
|
+
|
|
124
329
|
def __post_init__(self) -> None:
|
|
125
330
|
# Normalize to sets so the sanitizer can do fast membership checks.
|
|
126
331
|
if not isinstance(self.allowed_tags, set):
|
|
@@ -143,6 +348,57 @@ class SanitizationPolicy:
|
|
|
143
348
|
if not isinstance(self.force_link_rel, set):
|
|
144
349
|
object.__setattr__(self, "force_link_rel", set(self.force_link_rel))
|
|
145
350
|
|
|
351
|
+
unsafe_handling = str(self.unsafe_handling)
|
|
352
|
+
if unsafe_handling not in {"strip", "raise", "collect"}:
|
|
353
|
+
raise ValueError("Invalid unsafe_handling. Expected one of: 'strip', 'raise', 'collect'")
|
|
354
|
+
object.__setattr__(self, "unsafe_handling", unsafe_handling)
|
|
355
|
+
|
|
356
|
+
disallowed_tag_handling = str(self.disallowed_tag_handling)
|
|
357
|
+
if disallowed_tag_handling not in {"unwrap", "escape", "drop"}:
|
|
358
|
+
raise ValueError("Invalid disallowed_tag_handling. Expected one of: 'unwrap', 'escape', 'drop'")
|
|
359
|
+
object.__setattr__(self, "disallowed_tag_handling", disallowed_tag_handling)
|
|
360
|
+
|
|
361
|
+
# Centralize unsafe-handling logic so multiple passes can share it.
|
|
362
|
+
handler = UnsafeHandler(cast("UnsafeHandling", unsafe_handling))
|
|
363
|
+
handler.reset()
|
|
364
|
+
object.__setattr__(self, "_unsafe_handler", handler)
|
|
365
|
+
|
|
366
|
+
# Normalize rel tokens once so downstream sanitization can stay allocation-light.
|
|
367
|
+
# (Downstream code expects lowercase tokens and ignores empty/whitespace.)
|
|
368
|
+
if self.force_link_rel:
|
|
369
|
+
normalized_force_link_rel = {t.strip().lower() for t in self.force_link_rel if str(t).strip()}
|
|
370
|
+
object.__setattr__(self, "force_link_rel", normalized_force_link_rel)
|
|
371
|
+
|
|
372
|
+
style_allowed = any("style" in attrs for attrs in self.allowed_attributes.values())
|
|
373
|
+
if style_allowed and not self.allowed_css_properties:
|
|
374
|
+
raise ValueError(
|
|
375
|
+
"SanitizationPolicy allows the 'style' attribute but allowed_css_properties is empty. "
|
|
376
|
+
"Either remove 'style' from allowed_attributes or set allowed_css_properties (for example CSS_PRESET_TEXT)."
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
allowed_attributes = self.allowed_attributes
|
|
380
|
+
allowed_global = frozenset(allowed_attributes.get("*", ()))
|
|
381
|
+
by_tag: dict[str, frozenset[str]] = {}
|
|
382
|
+
for tag, attrs in allowed_attributes.items():
|
|
383
|
+
if tag == "*":
|
|
384
|
+
continue
|
|
385
|
+
by_tag[tag] = frozenset(allowed_global.union(attrs))
|
|
386
|
+
object.__setattr__(self, "_allowed_attrs_global", allowed_global)
|
|
387
|
+
object.__setattr__(self, "_allowed_attrs_by_tag", by_tag)
|
|
388
|
+
|
|
389
|
+
def reset_collected_security_errors(self) -> None:
|
|
390
|
+
self._unsafe_handler.reset()
|
|
391
|
+
|
|
392
|
+
def collected_security_errors(self) -> list[ParseError]:
|
|
393
|
+
return self._unsafe_handler.collected()
|
|
394
|
+
|
|
395
|
+
def handle_unsafe(self, msg: str, *, node: Any | None = None) -> None:
|
|
396
|
+
self._unsafe_handler.handle(msg, node=node)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
_URL_NORMALIZE_STRIP_TABLE = {i: None for i in range(0x21)}
|
|
400
|
+
_URL_NORMALIZE_STRIP_TABLE[0x7F] = None
|
|
401
|
+
|
|
146
402
|
|
|
147
403
|
DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
|
|
148
404
|
allowed_tags=[
|
|
@@ -199,32 +455,53 @@ DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
|
|
|
199
455
|
"th": ["colspan", "rowspan"],
|
|
200
456
|
"td": ["colspan", "rowspan"],
|
|
201
457
|
},
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
458
|
+
url_policy=UrlPolicy(
|
|
459
|
+
default_handling="allow",
|
|
460
|
+
allow_rules={
|
|
461
|
+
("a", "href"): UrlRule(
|
|
462
|
+
allowed_schemes=["http", "https", "mailto", "tel"],
|
|
463
|
+
resolve_protocol_relative="https",
|
|
464
|
+
),
|
|
465
|
+
("img", "src"): UrlRule(
|
|
466
|
+
allowed_schemes=[],
|
|
467
|
+
resolve_protocol_relative=None,
|
|
468
|
+
),
|
|
469
|
+
},
|
|
470
|
+
),
|
|
215
471
|
allowed_css_properties=set(),
|
|
216
472
|
)
|
|
217
473
|
|
|
218
474
|
|
|
475
|
+
# A conservative preset for allowing a small amount of inline styling.
|
|
476
|
+
# This is intentionally focused on text-level styling and avoids layout/
|
|
477
|
+
# positioning properties that are commonly abused for UI redress.
|
|
478
|
+
CSS_PRESET_TEXT: frozenset[str] = frozenset(
|
|
479
|
+
{
|
|
480
|
+
"background-color",
|
|
481
|
+
"color",
|
|
482
|
+
"font-size",
|
|
483
|
+
"font-style",
|
|
484
|
+
"font-weight",
|
|
485
|
+
"letter-spacing",
|
|
486
|
+
"line-height",
|
|
487
|
+
"text-align",
|
|
488
|
+
"text-decoration",
|
|
489
|
+
"text-transform",
|
|
490
|
+
"white-space",
|
|
491
|
+
"word-break",
|
|
492
|
+
"word-spacing",
|
|
493
|
+
"word-wrap",
|
|
494
|
+
}
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
|
|
219
498
|
DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
|
|
220
499
|
allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
|
|
221
500
|
allowed_attributes=DEFAULT_POLICY.allowed_attributes,
|
|
222
|
-
|
|
223
|
-
url_filter=DEFAULT_POLICY.url_filter,
|
|
501
|
+
url_policy=DEFAULT_POLICY.url_policy,
|
|
224
502
|
drop_comments=DEFAULT_POLICY.drop_comments,
|
|
225
503
|
drop_doctype=DEFAULT_POLICY.drop_doctype,
|
|
226
504
|
drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
|
|
227
|
-
strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
|
|
228
505
|
drop_content_tags=DEFAULT_POLICY.drop_content_tags,
|
|
229
506
|
allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
|
|
230
507
|
force_link_rel=DEFAULT_POLICY.force_link_rel,
|
|
@@ -372,8 +649,8 @@ def _css_value_may_load_external_resource(value: str) -> bool:
|
|
|
372
649
|
return False
|
|
373
650
|
|
|
374
651
|
|
|
375
|
-
def _sanitize_inline_style(*,
|
|
376
|
-
allowed =
|
|
652
|
+
def _sanitize_inline_style(*, allowed_css_properties: Collection[str], value: str) -> str | None:
|
|
653
|
+
allowed = allowed_css_properties
|
|
377
654
|
if not allowed:
|
|
378
655
|
return None
|
|
379
656
|
|
|
@@ -414,13 +691,7 @@ def _normalize_url_for_checking(value: str) -> str:
|
|
|
414
691
|
# Strip whitespace/control chars commonly used for scheme obfuscation.
|
|
415
692
|
# Note: do not strip backslashes; they are not whitespace/control chars,
|
|
416
693
|
# and removing them can turn invalid schemes into valid ones.
|
|
417
|
-
|
|
418
|
-
for ch in value:
|
|
419
|
-
o = ord(ch)
|
|
420
|
-
if o <= 0x20 or o == 0x7F:
|
|
421
|
-
continue
|
|
422
|
-
out.append(ch)
|
|
423
|
-
return "".join(out)
|
|
694
|
+
return value.translate(_URL_NORMALIZE_STRIP_TABLE)
|
|
424
695
|
|
|
425
696
|
|
|
426
697
|
def _is_valid_scheme(scheme: str) -> bool:
|
|
@@ -467,15 +738,46 @@ def _has_invalid_scheme_like_prefix(value: str) -> bool:
|
|
|
467
738
|
|
|
468
739
|
def _sanitize_url_value(
|
|
469
740
|
*,
|
|
470
|
-
|
|
741
|
+
url_policy: UrlPolicy,
|
|
742
|
+
rule: UrlRule,
|
|
743
|
+
tag: str,
|
|
744
|
+
attr: str,
|
|
745
|
+
value: str,
|
|
746
|
+
) -> str | None:
|
|
747
|
+
return _sanitize_url_value_inner(
|
|
748
|
+
url_policy=url_policy, rule=rule, tag=tag, attr=attr, value=value, apply_filter=True
|
|
749
|
+
)
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def _effective_proxy(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlProxy | None:
|
|
753
|
+
return rule.proxy if rule.proxy is not None else url_policy.proxy
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def _effective_url_handling(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlHandling:
|
|
757
|
+
# URL-like attributes are allowlisted via UrlPolicy.allow_rules. When they are
|
|
758
|
+
# allowlisted and the URL passes validation, the default action is to keep the URL.
|
|
759
|
+
return rule.handling if rule.handling is not None else "allow"
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def _effective_allow_relative(*, url_policy: UrlPolicy, rule: UrlRule) -> bool:
|
|
763
|
+
return rule.allow_relative if rule.allow_relative is not None else url_policy.default_allow_relative
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def _sanitize_url_value_inner(
|
|
767
|
+
*,
|
|
768
|
+
url_policy: UrlPolicy,
|
|
471
769
|
rule: UrlRule,
|
|
472
770
|
tag: str,
|
|
473
771
|
attr: str,
|
|
474
772
|
value: str,
|
|
773
|
+
apply_filter: bool,
|
|
475
774
|
) -> str | None:
|
|
476
775
|
v = value
|
|
477
|
-
|
|
478
|
-
|
|
776
|
+
mode = _effective_url_handling(url_policy=url_policy, rule=rule)
|
|
777
|
+
allow_relative = _effective_allow_relative(url_policy=url_policy, rule=rule)
|
|
778
|
+
|
|
779
|
+
if apply_filter and url_policy.url_filter is not None:
|
|
780
|
+
rewritten = url_policy.url_filter(tag, attr, v)
|
|
479
781
|
if rewritten is None:
|
|
480
782
|
return None
|
|
481
783
|
v = rewritten
|
|
@@ -488,11 +790,18 @@ def _sanitize_url_value(
|
|
|
488
790
|
return None
|
|
489
791
|
|
|
490
792
|
if normalized.startswith("#"):
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
793
|
+
if not rule.allow_fragment:
|
|
794
|
+
return None
|
|
795
|
+
if mode == "strip":
|
|
796
|
+
return None
|
|
797
|
+
if mode == "proxy":
|
|
798
|
+
proxy = _effective_proxy(url_policy=url_policy, rule=rule)
|
|
799
|
+
return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
|
|
800
|
+
return stripped
|
|
801
|
+
|
|
802
|
+
if mode == "proxy" and _has_invalid_scheme_like_prefix(normalized):
|
|
803
|
+
# If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
|
|
804
|
+
# Some user agents normalize backslashes and other characters during navigation.
|
|
496
805
|
return None
|
|
497
806
|
|
|
498
807
|
if normalized.startswith("//"):
|
|
@@ -513,12 +822,12 @@ def _sanitize_url_value(
|
|
|
513
822
|
if not host or host not in rule.allowed_hosts:
|
|
514
823
|
return None
|
|
515
824
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
else resolved_url
|
|
521
|
-
|
|
825
|
+
if mode == "strip":
|
|
826
|
+
return None
|
|
827
|
+
if mode == "proxy":
|
|
828
|
+
proxy = _effective_proxy(url_policy=url_policy, rule=rule)
|
|
829
|
+
return None if proxy is None else _proxy_url_value(proxy=proxy, value=resolved_url)
|
|
830
|
+
return resolved_url
|
|
522
831
|
|
|
523
832
|
if _has_scheme(normalized):
|
|
524
833
|
parsed = urlsplit(normalized)
|
|
@@ -529,235 +838,155 @@ def _sanitize_url_value(
|
|
|
529
838
|
host = (parsed.hostname or "").lower()
|
|
530
839
|
if not host or host not in rule.allowed_hosts:
|
|
531
840
|
return None
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
841
|
+
if mode == "strip":
|
|
842
|
+
return None
|
|
843
|
+
if mode == "proxy":
|
|
844
|
+
proxy = _effective_proxy(url_policy=url_policy, rule=rule)
|
|
845
|
+
return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
|
|
846
|
+
return stripped
|
|
537
847
|
|
|
538
|
-
|
|
848
|
+
if not allow_relative:
|
|
849
|
+
return None
|
|
539
850
|
|
|
851
|
+
if mode == "strip":
|
|
852
|
+
return None
|
|
853
|
+
if mode == "proxy":
|
|
854
|
+
proxy = _effective_proxy(url_policy=url_policy, rule=rule)
|
|
855
|
+
return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
|
|
856
|
+
return stripped
|
|
540
857
|
|
|
541
|
-
|
|
858
|
+
|
|
859
|
+
def _sanitize_srcset_value(
|
|
542
860
|
*,
|
|
543
|
-
|
|
861
|
+
url_policy: UrlPolicy,
|
|
862
|
+
rule: UrlRule,
|
|
544
863
|
tag: str,
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
for raw_name, raw_value in attrs.items():
|
|
556
|
-
if not raw_name:
|
|
557
|
-
continue
|
|
558
|
-
|
|
559
|
-
name = str(raw_name).strip().lower()
|
|
560
|
-
if not name:
|
|
561
|
-
continue
|
|
562
|
-
|
|
563
|
-
# Disallow namespace-ish attributes by default.
|
|
564
|
-
if ":" in name:
|
|
565
|
-
continue
|
|
566
|
-
|
|
567
|
-
# Always drop event handlers.
|
|
568
|
-
if name.startswith("on"):
|
|
569
|
-
continue
|
|
570
|
-
|
|
571
|
-
# Dangerous attribute contexts.
|
|
572
|
-
if name == "srcdoc":
|
|
573
|
-
continue
|
|
574
|
-
|
|
575
|
-
if name not in allowed and not (tag == "a" and name == "rel" and policy.force_link_rel):
|
|
576
|
-
continue
|
|
577
|
-
|
|
578
|
-
if raw_value is None:
|
|
579
|
-
out[name] = None
|
|
580
|
-
continue
|
|
581
|
-
|
|
582
|
-
value = str(raw_value)
|
|
583
|
-
rule = policy.url_rules.get((tag, name))
|
|
584
|
-
if rule is not None:
|
|
585
|
-
sanitized = _sanitize_url_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)
|
|
586
|
-
if sanitized is None:
|
|
587
|
-
continue
|
|
588
|
-
out[name] = sanitized
|
|
589
|
-
elif name == "style":
|
|
590
|
-
sanitized_style = _sanitize_inline_style(policy=policy, value=value)
|
|
591
|
-
if sanitized_style is None:
|
|
592
|
-
continue
|
|
593
|
-
out[name] = sanitized_style
|
|
594
|
-
else:
|
|
595
|
-
out[name] = value
|
|
596
|
-
|
|
597
|
-
# Link hardening (merge tokens; do not remove existing ones).
|
|
598
|
-
forced_tokens = [t.strip().lower() for t in policy.force_link_rel if str(t).strip()]
|
|
599
|
-
if tag == "a" and forced_tokens:
|
|
600
|
-
existing_raw = out.get("rel")
|
|
601
|
-
existing: list[str] = []
|
|
602
|
-
if isinstance(existing_raw, str) and existing_raw:
|
|
603
|
-
for tok in existing_raw.split():
|
|
604
|
-
t = tok.strip().lower()
|
|
605
|
-
if t and t not in existing:
|
|
606
|
-
existing.append(t)
|
|
607
|
-
for tok in sorted(forced_tokens):
|
|
608
|
-
if tok not in existing:
|
|
609
|
-
existing.append(tok)
|
|
610
|
-
out["rel"] = " ".join(existing)
|
|
611
|
-
|
|
612
|
-
return out
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
|
|
616
|
-
stack: list[tuple[Any, Any]] = [(original, parent_out)]
|
|
617
|
-
while stack:
|
|
618
|
-
current, out_parent = stack.pop()
|
|
619
|
-
name: str = current.name
|
|
620
|
-
|
|
621
|
-
if name == "#text":
|
|
622
|
-
out_parent.append_child(current.clone_node(deep=False))
|
|
623
|
-
continue
|
|
624
|
-
|
|
625
|
-
if name == "#comment":
|
|
626
|
-
if policy.drop_comments:
|
|
627
|
-
continue
|
|
628
|
-
out_parent.append_child(current.clone_node(deep=False))
|
|
629
|
-
continue
|
|
630
|
-
|
|
631
|
-
if name == "!doctype":
|
|
632
|
-
if policy.drop_doctype:
|
|
633
|
-
continue
|
|
634
|
-
out_parent.append_child(current.clone_node(deep=False))
|
|
635
|
-
continue
|
|
636
|
-
|
|
637
|
-
# Document containers.
|
|
638
|
-
if name.startswith("#"):
|
|
639
|
-
clone = current.clone_node(deep=False)
|
|
640
|
-
clone.children.clear()
|
|
641
|
-
out_parent.append_child(clone)
|
|
642
|
-
children = current.children or []
|
|
643
|
-
stack.extend((child, clone) for child in reversed(children))
|
|
644
|
-
continue
|
|
645
|
-
|
|
646
|
-
# Element.
|
|
647
|
-
tag = str(name).lower()
|
|
648
|
-
if policy.drop_foreign_namespaces:
|
|
649
|
-
ns = current.namespace
|
|
650
|
-
if ns not in (None, "html"):
|
|
651
|
-
continue
|
|
652
|
-
|
|
653
|
-
if tag in policy.drop_content_tags:
|
|
654
|
-
continue
|
|
864
|
+
attr: str,
|
|
865
|
+
value: str,
|
|
866
|
+
) -> str | None:
|
|
867
|
+
# Apply the URL filter once to the whole attribute value.
|
|
868
|
+
v = value
|
|
869
|
+
if url_policy.url_filter is not None:
|
|
870
|
+
rewritten = url_policy.url_filter(tag, attr, v)
|
|
871
|
+
if rewritten is None:
|
|
872
|
+
return None
|
|
873
|
+
v = rewritten
|
|
655
874
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
stack.extend((child, out_parent) for child in reversed(children))
|
|
875
|
+
stripped = str(v).strip()
|
|
876
|
+
if not stripped:
|
|
877
|
+
return None
|
|
660
878
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
879
|
+
out_candidates: list[str] = []
|
|
880
|
+
for raw_candidate in stripped.split(","):
|
|
881
|
+
c = raw_candidate.strip()
|
|
882
|
+
if not c:
|
|
664
883
|
continue
|
|
665
884
|
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
885
|
+
parts = c.split(None, 1)
|
|
886
|
+
url_token = parts[0]
|
|
887
|
+
desc = parts[1].strip() if len(parts) == 2 else ""
|
|
888
|
+
|
|
889
|
+
sanitized_url = _sanitize_url_value_inner(
|
|
890
|
+
url_policy=url_policy,
|
|
891
|
+
rule=rule,
|
|
892
|
+
tag=tag,
|
|
893
|
+
attr=attr,
|
|
894
|
+
value=url_token,
|
|
895
|
+
apply_filter=False,
|
|
896
|
+
)
|
|
897
|
+
if sanitized_url is None:
|
|
898
|
+
return None
|
|
680
899
|
|
|
681
|
-
|
|
682
|
-
|
|
900
|
+
out_candidates.append(f"{sanitized_url} {desc}".strip())
|
|
901
|
+
|
|
902
|
+
return None if not out_candidates else ", ".join(out_candidates)
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
_URL_LIKE_ATTRS: frozenset[str] = frozenset(
|
|
906
|
+
{
|
|
907
|
+
# Common URL-valued attributes.
|
|
908
|
+
"href",
|
|
909
|
+
"src",
|
|
910
|
+
"srcset",
|
|
911
|
+
"poster",
|
|
912
|
+
"action",
|
|
913
|
+
"formaction",
|
|
914
|
+
"data",
|
|
915
|
+
"cite",
|
|
916
|
+
"background",
|
|
917
|
+
# Can trigger requests/pings.
|
|
918
|
+
"ping",
|
|
919
|
+
}
|
|
920
|
+
)
|
|
683
921
|
|
|
684
922
|
|
|
685
|
-
def
|
|
923
|
+
def _sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
    """Return a sanitized deep clone of `node`.

    The node is cloned with ``node.clone_node(deep=True)`` and the compiled
    ``Sanitize(policy=policy)`` transforms are applied to the clone.

    Args:
        node: Root of the tree to sanitize.
        policy: Policy to enforce. When omitted, ``DEFAULT_DOCUMENT_POLICY`` is
            used for ``#document`` roots and ``DEFAULT_POLICY`` for any other
            root.

    Returns:
        For ``#document`` / ``#document-fragment`` roots, the transformed clone.
        For any other root, the clone is transformed inside a temporary
        ``#document-fragment``; if exactly one child is left afterwards that
        child is detached and returned, otherwise the fragment itself is
        returned.

    Side effects:
        * With ``policy.disallowed_tag_handling == "escape"`` and a non-empty
          ``node._source_html``, missing ``_source_html`` values are filled in
          on descendants of the *original* tree before it is cloned.
        * The compiled transforms are stored on the policy as
          ``_compiled_sanitize_transforms`` the first time they are needed.
    """

    if policy is None:
        if node.name == "#document":
            policy = DEFAULT_DOCUMENT_POLICY
        else:
            policy = DEFAULT_POLICY

    # Escape-mode tag reconstruction may need the original source HTML, and a
    # child element has historically been allowed to inherit it from an
    # ancestor container. Propagate it up front so that behavior is kept even
    # though the sanitizer itself only ever sees a clone.
    if policy.disallowed_tag_handling == "escape":
        inherited_html = getattr(node, "_source_html", None)
        if inherited_html:
            _inherit_source_html(node, inherited_html)

    # Safe-output sanitization is deliberately just the `Sanitize(policy=...)`
    # transform pipeline run on a clone, so there is one canonical algorithm.
    from .transforms import Sanitize, apply_compiled_transforms, compile_transforms  # noqa: PLC0415

    pipeline = policy._compiled_sanitize_transforms
    if pipeline is None:
        pipeline = compile_transforms((Sanitize(policy=policy),))
        # Written via object.__setattr__ -- presumably because the policy is a
        # frozen dataclass; confirm against the SanitizationPolicy definition.
        object.__setattr__(policy, "_compiled_sanitize_transforms", pipeline)

    # Transforms walk the children of the root they are given, so a container
    # root can be processed directly.
    if node.name in {"#document", "#document-fragment"}:
        sanitized_root = node.clone_node(deep=True)
        apply_compiled_transforms(sanitized_root, pipeline, errors=None)
        return sanitized_root

    # A non-container root is wrapped in a fragment so that the sanitizer can
    # act on the root node itself.
    from .node import SimpleDomNode  # noqa: PLC0415

    holder = SimpleDomNode("#document-fragment")
    holder.append_child(node.clone_node(deep=True))
    apply_compiled_transforms(holder, pipeline, errors=None)

    survivors = holder.children or []
    if len(survivors) != 1:
        # Zero or several nodes remain: hand back the fragment that holds them.
        return holder

    (sole,) = survivors
    sole.parent = None
    holder.children = []
    return sole


def _inherit_source_html(root: Any, fallback_html: Any) -> None:
    """Fill in missing `_source_html` on the descendants of `root`, in place.

    Each non-text child (and the `template_content` of a `TemplateNode`) whose
    `_source_html` is ``None`` or absent receives its parent's `_source_html`,
    or `fallback_html` when the parent has none.
    """
    from .node import TemplateNode  # noqa: PLC0415

    pending: list[Any] = [root]
    while pending:
        owner = pending.pop()
        html_for_children = getattr(owner, "_source_html", None) or fallback_html

        for kid in getattr(owner, "children", None) or []:
            # Text nodes are skipped entirely: they do not have _source_html.
            if getattr(kid, "name", "") != "#text":
                if getattr(kid, "_source_html", None) is None:
                    kid._source_html = html_for_children
                pending.append(kid)

        # Exact type match on purpose (mirrors the original check).
        if type(owner) is not TemplateNode:
            continue
        content = owner.template_content
        if content is None:
            continue
        if getattr(content, "_source_html", None) is None:
            content._source_html = html_for_children
        pending.append(content)
|