justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/sanitize.py CHANGED
@@ -10,12 +10,38 @@ from __future__ import annotations
10
10
 
11
11
  from collections.abc import Callable, Collection, Mapping
12
12
  from dataclasses import dataclass, field
13
- from typing import Any
13
+ from typing import Any, Literal, cast
14
14
  from urllib.parse import quote, urlsplit
15
15
 
16
+ from .tokens import ParseError
17
+
16
18
  UrlFilter = Callable[[str, str, str], str | None]
17
19
 
18
20
 
21
class UnsafeHtmlError(ValueError):
    """Raised when unsafe HTML is encountered and unsafe_handling='raise'.

    Subclasses ``ValueError``, so callers that already catch ``ValueError``
    also catch this exception.
    """
23
+
24
+
25
# How a security finding is handled:
# - "strip": remove/drop the unsafe construct and keep going
# - "raise": raise UnsafeHtmlError
# - "collect": record the finding as a ParseError
UnsafeHandling = Literal["strip", "raise", "collect"]

# What happens to a tag that is not allowed:
# - "unwrap": drop the tag but keep/sanitize its children
# - "escape": emit the original tag tokens as text, keep/sanitize children
# - "drop": drop the entire disallowed subtree
DisallowedTagHandling = Literal["unwrap", "escape", "drop"]

# What happens to a URL that passed validation:
# - "allow": keep the URL as-is
# - "strip": drop the attribute
# - "proxy": rewrite the URL through a proxy
UrlHandling = Literal["allow", "strip", "proxy"]
30
+
31
+
32
+ @dataclass(frozen=True, slots=True)
33
+ class UrlProxy:
34
+ url: str
35
+ param: str = "url"
36
+
37
+ def __post_init__(self) -> None:
38
+ proxy_url = str(self.url)
39
+ if not proxy_url:
40
+ raise ValueError("UrlProxy.url must be a non-empty string")
41
+ object.__setattr__(self, "url", proxy_url)
42
+ object.__setattr__(self, "param", str(self.param))
43
+
44
+
19
45
  @dataclass(frozen=True, slots=True)
20
46
  class UrlRule:
21
47
  """Rule for a single URL-valued attribute (e.g. a[href], img[src]).
@@ -27,9 +53,6 @@ class UrlRule:
27
53
  want to block remote loads by default.
28
54
  """
29
55
 
30
- # Allow relative URLs (including /path, ./path, ../path, ?query).
31
- allow_relative: bool = True
32
-
33
56
  # Allow same-document fragments (#foo). Typically safe.
34
57
  allow_fragment: bool = True
35
58
 
@@ -46,13 +69,17 @@ class UrlRule:
46
69
  # allowlist.
47
70
  allowed_hosts: Collection[str] | None = None
48
71
 
49
- # Optional proxy rewrite for allowed absolute/protocol-relative URLs.
50
- # Example: proxy_url="/proxy" -> https://google.com becomes
51
- # /proxy?url=https%3A%2F%2Fgoogle.com
52
- proxy_url: str | None = None
72
+ # Optional per-rule handling override.
73
+ # If None, the URL is kept ("allow") after it passes validation.
74
+ handling: UrlHandling | None = None
75
+
76
+ # Optional per-rule override of UrlPolicy.default_allow_relative.
77
+ # If None, UrlPolicy.default_allow_relative is used.
78
+ allow_relative: bool | None = None
53
79
 
54
- # Query parameter name used when proxy_url is set.
55
- proxy_param: str = "url"
80
+ # Optional proxy override for absolute/protocol-relative URLs.
81
+ # Used when the effective URL handling is "proxy".
82
+ proxy: UrlProxy | None = None
56
83
 
57
84
  def __post_init__(self) -> None:
58
85
  # Accept lists/tuples from user code, normalize for internal use.
@@ -61,15 +88,158 @@ class UrlRule:
61
88
  if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
62
89
  object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
63
90
 
64
- if self.proxy_url is not None:
65
- proxy_url = str(self.proxy_url)
66
- object.__setattr__(self, "proxy_url", proxy_url if proxy_url else None)
67
- object.__setattr__(self, "proxy_param", str(self.proxy_param))
91
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
92
+ raise TypeError("UrlRule.proxy must be a UrlProxy or None")
68
93
 
94
+ if self.handling is not None:
95
+ mode = str(self.handling)
96
+ if mode not in {"allow", "strip", "proxy"}:
97
+ raise ValueError("Invalid UrlRule.handling. Expected one of: 'allow', 'strip', 'proxy'")
98
+ object.__setattr__(self, "handling", mode)
69
99
 
70
- def _proxy_url_value(*, proxy_url: str, proxy_param: str, value: str) -> str:
71
- sep = "&" if "?" in proxy_url else "?"
72
- return f"{proxy_url}{sep}{proxy_param}={quote(value, safe='')}"
100
+ if self.allow_relative is not None:
101
+ object.__setattr__(self, "allow_relative", bool(self.allow_relative))
102
+
103
+
104
+ @dataclass(frozen=True, slots=True)
105
+ class UrlPolicy:
106
+ # Default handling for URL-like attributes after they pass UrlRule checks.
107
+ # - "allow": keep the URL as-is
108
+ # - "strip": drop the attribute
109
+ # - "proxy": rewrite the URL through a proxy (UrlPolicy.proxy or UrlRule.proxy)
110
+ default_handling: UrlHandling = "strip"
111
+
112
+ # Default allowance for relative URLs (including /path, ./path, ../path, ?query)
113
+ # for URL-like attributes that have a matching UrlRule.
114
+ default_allow_relative: bool = True
115
+
116
+ # Rule configuration for URL-valued attributes.
117
+ allow_rules: Mapping[tuple[str, str], UrlRule] = field(default_factory=dict)
118
+
119
+ # Optional hook that can drop or rewrite URLs.
120
+ # url_filter(tag, attr, value) should return:
121
+ # - a replacement string to keep (possibly rewritten), or
122
+ # - None to drop the attribute.
123
+ url_filter: UrlFilter | None = None
124
+
125
+ # Default proxy config used when a rule is handled with "proxy" and
126
+ # the rule does not specify its own UrlRule.proxy override.
127
+ proxy: UrlProxy | None = None
128
+
129
+ def __post_init__(self) -> None:
130
+ mode = str(self.default_handling)
131
+ if mode not in {"allow", "strip", "proxy"}:
132
+ raise ValueError("Invalid default_handling. Expected one of: 'allow', 'strip', 'proxy'")
133
+ object.__setattr__(self, "default_handling", mode)
134
+
135
+ object.__setattr__(self, "default_allow_relative", bool(self.default_allow_relative))
136
+
137
+ if not isinstance(self.allow_rules, dict):
138
+ object.__setattr__(self, "allow_rules", dict(self.allow_rules))
139
+
140
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
141
+ raise TypeError("UrlPolicy.proxy must be a UrlProxy or None")
142
+
143
+ # Validate proxy configuration for any rules that are in proxy mode.
144
+ for rule in self.allow_rules.values():
145
+ if not isinstance(rule, UrlRule):
146
+ raise TypeError("UrlPolicy.allow_rules values must be UrlRule")
147
+ if rule.handling == "proxy" and self.proxy is None and rule.proxy is None:
148
+ raise ValueError("UrlRule.handling='proxy' requires a UrlPolicy.proxy or a per-rule UrlRule.proxy")
149
+
150
+
151
def _proxy_url_value(*, proxy: UrlProxy, value: str) -> str:
    """Build the proxied form of `value`.

    The target URL is appended to ``proxy.url`` as a query parameter named
    ``proxy.param``. Both the parameter name and the value are fully
    percent-encoded.

    Args:
        proxy: Proxy configuration; only ``url`` and ``param`` are read.
        value: The URL to route through the proxy.

    Returns:
        The proxy URL with the encoded target attached.
    """
    base = proxy.url
    if base.endswith(("?", "&")):
        # The base already ends with a query delimiter; adding another one
        # would yield "?&" or "&&".
        sep = ""
    elif "?" in base:
        sep = "&"
    else:
        sep = "?"
    # Encode the parameter name too, so characters such as "&" or "=" in it
    # cannot change the query structure.
    return f"{base}{sep}{quote(proxy.param, safe='')}={quote(value, safe='')}"
154
+
155
+
156
+ @dataclass(slots=True)
157
+ class UnsafeHandler:
158
+ """Centralized handler for security findings.
159
+
160
+ This is intentionally a small stateful object so multiple sanitization-
161
+ related passes/transforms can share the same unsafe-handling behavior and
162
+ (in collect mode) append into the same error list.
163
+ """
164
+
165
+ unsafe_handling: UnsafeHandling
166
+
167
+ # Optional external sink (e.g. a JustHTML document's .errors list).
168
+ # When set and unsafe_handling == "collect", security findings are written
169
+ # into that list so multiple components can share a single sink.
170
+ sink: list[ParseError] | None = None
171
+
172
+ _errors: list[ParseError] | None = None
173
+
174
+ def reset(self) -> None:
175
+ if self.unsafe_handling != "collect":
176
+ self._errors = None
177
+ return
178
+
179
+ if self.sink is None:
180
+ self._errors = []
181
+ return
182
+
183
+ # Remove previously collected security findings from the shared sink to
184
+ # avoid accumulating duplicates across multiple runs.
185
+ errors = self.sink
186
+ write_i = 0
187
+ for e in errors:
188
+ if e.category == "security":
189
+ continue
190
+ errors[write_i] = e
191
+ write_i += 1
192
+ del errors[write_i:]
193
+
194
+ def collected(self) -> list[ParseError]:
195
+ src = self.sink if self.sink is not None else self._errors
196
+ if not src:
197
+ return []
198
+
199
+ if self.sink is not None:
200
+ out = [e for e in src if e.category == "security"]
201
+ else:
202
+ out = list(src)
203
+ out.sort(
204
+ key=lambda e: (
205
+ e.line if e.line is not None else 1_000_000_000,
206
+ e.column if e.column is not None else 1_000_000_000,
207
+ )
208
+ )
209
+ return out
210
+
211
+ def handle(self, msg: str, *, node: Any | None = None) -> None:
212
+ mode = self.unsafe_handling
213
+ if mode == "strip":
214
+ return
215
+ if mode == "raise":
216
+ raise UnsafeHtmlError(msg)
217
+ if mode == "collect":
218
+ dest = self.sink
219
+ if dest is None:
220
+ if self._errors is None:
221
+ self._errors = []
222
+ dest = self._errors
223
+
224
+ line: int | None = None
225
+ column: int | None = None
226
+ if node is not None:
227
+ # Best-effort: use node origin metadata when enabled.
228
+ # This stays allocation-light and avoids any input re-parsing.
229
+ line = node.origin_line
230
+ column = node.origin_col
231
+
232
+ dest.append(
233
+ ParseError(
234
+ "unsafe-html",
235
+ line=line,
236
+ column=column,
237
+ category="security",
238
+ message=msg,
239
+ )
240
+ )
241
+ return
242
+ raise AssertionError(f"Unhandled unsafe_handling: {mode!r}")
73
243
 
74
244
 
75
245
  @dataclass(frozen=True, slots=True)
@@ -90,24 +260,13 @@ class SanitizationPolicy:
90
260
  allowed_tags: Collection[str]
91
261
  allowed_attributes: Mapping[str, Collection[str]]
92
262
 
93
- # URL handling:
94
- # - `url_rules` is the data-driven allowlist for URL-valued attributes.
95
- # - `url_filter` is an optional hook that can drop or rewrite URLs.
96
- #
97
- # `url_filter(tag, attr, value)` should return:
98
- # - a replacement string to keep (possibly rewritten), or
99
- # - None to drop the attribute.
100
- url_rules: Mapping[tuple[str, str], UrlRule]
101
- url_filter: UrlFilter | None = None
263
+ # URL handling.
264
+ url_policy: UrlPolicy = field(default_factory=UrlPolicy)
102
265
 
103
266
  drop_comments: bool = True
104
267
  drop_doctype: bool = True
105
268
  drop_foreign_namespaces: bool = True
106
269
 
107
- # If True, disallowed elements are removed but their children may be kept
108
- # (except for tags in `drop_content_tags`).
109
- strip_disallowed_tags: bool = True
110
-
111
270
  # Dangerous containers whose text payload should not be preserved.
112
271
  drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})
113
272
 
@@ -121,6 +280,52 @@ class SanitizationPolicy:
121
280
  # (The sanitizer will merge tokens; it will not remove existing ones.)
122
281
  force_link_rel: Collection[str] = field(default_factory=set)
123
282
 
283
+ # Determines how unsafe input is handled.
284
+ #
285
+ # - "strip": Default. Remove/drop unsafe constructs and keep going.
286
+ # - "raise": Raise UnsafeHtmlError on the first unsafe construct.
287
+ #
288
+ # This is intentionally a string mode (instead of a boolean) so we can add
289
+ # more behaviors over time without changing the API shape.
290
+ unsafe_handling: UnsafeHandling = "strip"
291
+
292
+ # Determines how disallowed tags are handled.
293
+ #
294
+ # - "unwrap": Default. Drop the tag but keep/sanitize its children.
295
+ # - "escape": Emit original tag tokens as text, keep/sanitize children.
296
+ # - "drop": Drop the entire disallowed subtree.
297
+ disallowed_tag_handling: DisallowedTagHandling = "unwrap"
298
+
299
+ _unsafe_handler: UnsafeHandler = field(
300
+ default_factory=lambda: UnsafeHandler("strip"),
301
+ init=False,
302
+ repr=False,
303
+ compare=False,
304
+ )
305
+
306
+ # Internal caches to avoid per-node allocations in hot paths.
307
+ _allowed_attrs_global: frozenset[str] = field(
308
+ default_factory=frozenset,
309
+ init=False,
310
+ repr=False,
311
+ compare=False,
312
+ )
313
+ _allowed_attrs_by_tag: dict[str, frozenset[str]] = field(
314
+ default_factory=dict,
315
+ init=False,
316
+ repr=False,
317
+ compare=False,
318
+ )
319
+
320
+ # Cache for the compiled `Sanitize(policy=...)` transform pipeline.
321
+ # This lets safe serialization reuse the same compiled transforms.
322
+ _compiled_sanitize_transforms: list[Any] | None = field(
323
+ default=None,
324
+ init=False,
325
+ repr=False,
326
+ compare=False,
327
+ )
328
+
124
329
  def __post_init__(self) -> None:
125
330
  # Normalize to sets so the sanitizer can do fast membership checks.
126
331
  if not isinstance(self.allowed_tags, set):
@@ -143,6 +348,57 @@ class SanitizationPolicy:
143
348
  if not isinstance(self.force_link_rel, set):
144
349
  object.__setattr__(self, "force_link_rel", set(self.force_link_rel))
145
350
 
351
+ unsafe_handling = str(self.unsafe_handling)
352
+ if unsafe_handling not in {"strip", "raise", "collect"}:
353
+ raise ValueError("Invalid unsafe_handling. Expected one of: 'strip', 'raise', 'collect'")
354
+ object.__setattr__(self, "unsafe_handling", unsafe_handling)
355
+
356
+ disallowed_tag_handling = str(self.disallowed_tag_handling)
357
+ if disallowed_tag_handling not in {"unwrap", "escape", "drop"}:
358
+ raise ValueError("Invalid disallowed_tag_handling. Expected one of: 'unwrap', 'escape', 'drop'")
359
+ object.__setattr__(self, "disallowed_tag_handling", disallowed_tag_handling)
360
+
361
+ # Centralize unsafe-handling logic so multiple passes can share it.
362
+ handler = UnsafeHandler(cast("UnsafeHandling", unsafe_handling))
363
+ handler.reset()
364
+ object.__setattr__(self, "_unsafe_handler", handler)
365
+
366
+ # Normalize rel tokens once so downstream sanitization can stay allocation-light.
367
+ # (Downstream code expects lowercase tokens and ignores empty/whitespace.)
368
+ if self.force_link_rel:
369
+ normalized_force_link_rel = {t.strip().lower() for t in self.force_link_rel if str(t).strip()}
370
+ object.__setattr__(self, "force_link_rel", normalized_force_link_rel)
371
+
372
+ style_allowed = any("style" in attrs for attrs in self.allowed_attributes.values())
373
+ if style_allowed and not self.allowed_css_properties:
374
+ raise ValueError(
375
+ "SanitizationPolicy allows the 'style' attribute but allowed_css_properties is empty. "
376
+ "Either remove 'style' from allowed_attributes or set allowed_css_properties (for example CSS_PRESET_TEXT)."
377
+ )
378
+
379
+ allowed_attributes = self.allowed_attributes
380
+ allowed_global = frozenset(allowed_attributes.get("*", ()))
381
+ by_tag: dict[str, frozenset[str]] = {}
382
+ for tag, attrs in allowed_attributes.items():
383
+ if tag == "*":
384
+ continue
385
+ by_tag[tag] = frozenset(allowed_global.union(attrs))
386
+ object.__setattr__(self, "_allowed_attrs_global", allowed_global)
387
+ object.__setattr__(self, "_allowed_attrs_by_tag", by_tag)
388
+
389
+ def reset_collected_security_errors(self) -> None:
390
+ self._unsafe_handler.reset()
391
+
392
+ def collected_security_errors(self) -> list[ParseError]:
393
+ return self._unsafe_handler.collected()
394
+
395
+ def handle_unsafe(self, msg: str, *, node: Any | None = None) -> None:
396
+ self._unsafe_handler.handle(msg, node=node)
397
+
398
+
399
# Table for str.translate() that deletes code points U+0000 through U+0020
# and U+007F (each is mapped to None).
_URL_NORMALIZE_STRIP_TABLE = dict.fromkeys((*range(0x21), 0x7F))
401
+
146
402
 
147
403
  DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
148
404
  allowed_tags=[
@@ -199,32 +455,53 @@ DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
199
455
  "th": ["colspan", "rowspan"],
200
456
  "td": ["colspan", "rowspan"],
201
457
  },
202
- # Default URL stance:
203
- # - Links may point to http/https/mailto/tel and relative URLs.
204
- # - Images may point to relative URLs only.
205
- url_rules={
206
- ("a", "href"): UrlRule(
207
- allowed_schemes=["http", "https", "mailto", "tel"],
208
- resolve_protocol_relative="https",
209
- ),
210
- ("img", "src"): UrlRule(
211
- allowed_schemes=["http", "https"],
212
- resolve_protocol_relative="https",
213
- ),
214
- },
458
+ url_policy=UrlPolicy(
459
+ default_handling="allow",
460
+ allow_rules={
461
+ ("a", "href"): UrlRule(
462
+ allowed_schemes=["http", "https", "mailto", "tel"],
463
+ resolve_protocol_relative="https",
464
+ ),
465
+ ("img", "src"): UrlRule(
466
+ allowed_schemes=[],
467
+ resolve_protocol_relative=None,
468
+ ),
469
+ },
470
+ ),
215
471
  allowed_css_properties=set(),
216
472
  )
217
473
 
218
474
 
475
+ # A conservative preset for allowing a small amount of inline styling.
476
+ # This is intentionally focused on text-level styling and avoids layout/
477
+ # positioning properties that are commonly abused for UI redress.
478
CSS_PRESET_TEXT: frozenset[str] = frozenset(
    (
        # Colors.
        "color",
        "background-color",
        # Font.
        "font-size",
        "font-style",
        "font-weight",
        # Spacing and alignment.
        "letter-spacing",
        "word-spacing",
        "line-height",
        "text-align",
        # Decoration and case.
        "text-decoration",
        "text-transform",
        # Wrapping and whitespace.
        "white-space",
        "word-break",
        "word-wrap",
    )
)
496
+
497
+
219
498
  DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
220
499
  allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
221
500
  allowed_attributes=DEFAULT_POLICY.allowed_attributes,
222
- url_rules=DEFAULT_POLICY.url_rules,
223
- url_filter=DEFAULT_POLICY.url_filter,
501
+ url_policy=DEFAULT_POLICY.url_policy,
224
502
  drop_comments=DEFAULT_POLICY.drop_comments,
225
503
  drop_doctype=DEFAULT_POLICY.drop_doctype,
226
504
  drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
227
- strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
228
505
  drop_content_tags=DEFAULT_POLICY.drop_content_tags,
229
506
  allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
230
507
  force_link_rel=DEFAULT_POLICY.force_link_rel,
@@ -372,8 +649,8 @@ def _css_value_may_load_external_resource(value: str) -> bool:
372
649
  return False
373
650
 
374
651
 
375
- def _sanitize_inline_style(*, policy: SanitizationPolicy, value: str) -> str | None:
376
- allowed = policy.allowed_css_properties
652
+ def _sanitize_inline_style(*, allowed_css_properties: Collection[str], value: str) -> str | None:
653
+ allowed = allowed_css_properties
377
654
  if not allowed:
378
655
  return None
379
656
 
@@ -414,13 +691,7 @@ def _normalize_url_for_checking(value: str) -> str:
414
691
  # Strip whitespace/control chars commonly used for scheme obfuscation.
415
692
  # Note: do not strip backslashes; they are not whitespace/control chars,
416
693
  # and removing them can turn invalid schemes into valid ones.
417
- out: list[str] = []
418
- for ch in value:
419
- o = ord(ch)
420
- if o <= 0x20 or o == 0x7F:
421
- continue
422
- out.append(ch)
423
- return "".join(out)
694
+ return value.translate(_URL_NORMALIZE_STRIP_TABLE)
424
695
 
425
696
 
426
697
  def _is_valid_scheme(scheme: str) -> bool:
@@ -467,15 +738,46 @@ def _has_invalid_scheme_like_prefix(value: str) -> bool:
467
738
 
468
739
  def _sanitize_url_value(
469
740
  *,
470
- policy: SanitizationPolicy,
741
+ url_policy: UrlPolicy,
742
+ rule: UrlRule,
743
+ tag: str,
744
+ attr: str,
745
+ value: str,
746
+ ) -> str | None:
747
+ return _sanitize_url_value_inner(
748
+ url_policy=url_policy, rule=rule, tag=tag, attr=attr, value=value, apply_filter=True
749
+ )
750
+
751
+
752
def _effective_proxy(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlProxy | None:
    """Return the rule's own proxy if it has one, else the policy-wide proxy."""
    override = rule.proxy
    if override is None:
        return url_policy.proxy
    return override
754
+
755
+
756
def _effective_url_handling(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlHandling:
    """Return the handling mode that applies to `rule`.

    `url_policy` is accepted but not consulted: a rule without an explicit
    `handling` resolves to "allow".
    """
    # URL-like attributes are allowlisted via UrlPolicy.allow_rules. Once an
    # attribute is allowlisted and its URL passes validation, the default
    # action is to keep the URL.
    explicit = rule.handling
    if explicit is not None:
        return explicit
    return "allow"
760
+
761
+
762
def _effective_allow_relative(*, url_policy: UrlPolicy, rule: UrlRule) -> bool:
    """Return the rule's `allow_relative` override, else the policy default."""
    override = rule.allow_relative
    if override is None:
        return url_policy.default_allow_relative
    return override
764
+
765
+
766
+ def _sanitize_url_value_inner(
767
+ *,
768
+ url_policy: UrlPolicy,
471
769
  rule: UrlRule,
472
770
  tag: str,
473
771
  attr: str,
474
772
  value: str,
773
+ apply_filter: bool,
475
774
  ) -> str | None:
476
775
  v = value
477
- if policy.url_filter is not None:
478
- rewritten = policy.url_filter(tag, attr, v)
776
+ mode = _effective_url_handling(url_policy=url_policy, rule=rule)
777
+ allow_relative = _effective_allow_relative(url_policy=url_policy, rule=rule)
778
+
779
+ if apply_filter and url_policy.url_filter is not None:
780
+ rewritten = url_policy.url_filter(tag, attr, v)
479
781
  if rewritten is None:
480
782
  return None
481
783
  v = rewritten
@@ -488,11 +790,18 @@ def _sanitize_url_value(
488
790
  return None
489
791
 
490
792
  if normalized.startswith("#"):
491
- return stripped if rule.allow_fragment else None
492
-
493
- # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
494
- # Some user agents normalize backslashes and other characters during navigation.
495
- if rule.proxy_url and _has_invalid_scheme_like_prefix(normalized):
793
+ if not rule.allow_fragment:
794
+ return None
795
+ if mode == "strip":
796
+ return None
797
+ if mode == "proxy":
798
+ proxy = _effective_proxy(url_policy=url_policy, rule=rule)
799
+ return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
800
+ return stripped
801
+
802
+ if mode == "proxy" and _has_invalid_scheme_like_prefix(normalized):
803
+ # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
804
+ # Some user agents normalize backslashes and other characters during navigation.
496
805
  return None
497
806
 
498
807
  if normalized.startswith("//"):
@@ -513,12 +822,12 @@ def _sanitize_url_value(
513
822
  if not host or host not in rule.allowed_hosts:
514
823
  return None
515
824
 
516
- # Return the resolved URL.
517
- return (
518
- _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=resolved_url)
519
- if rule.proxy_url
520
- else resolved_url
521
- )
825
+ if mode == "strip":
826
+ return None
827
+ if mode == "proxy":
828
+ proxy = _effective_proxy(url_policy=url_policy, rule=rule)
829
+ return None if proxy is None else _proxy_url_value(proxy=proxy, value=resolved_url)
830
+ return resolved_url
522
831
 
523
832
  if _has_scheme(normalized):
524
833
  parsed = urlsplit(normalized)
@@ -529,235 +838,155 @@ def _sanitize_url_value(
529
838
  host = (parsed.hostname or "").lower()
530
839
  if not host or host not in rule.allowed_hosts:
531
840
  return None
532
- return (
533
- _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=stripped)
534
- if rule.proxy_url
535
- else stripped
536
- )
841
+ if mode == "strip":
842
+ return None
843
+ if mode == "proxy":
844
+ proxy = _effective_proxy(url_policy=url_policy, rule=rule)
845
+ return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
846
+ return stripped
537
847
 
538
- return stripped if rule.allow_relative else None
848
+ if not allow_relative:
849
+ return None
539
850
 
851
+ if mode == "strip":
852
+ return None
853
+ if mode == "proxy":
854
+ proxy = _effective_proxy(url_policy=url_policy, rule=rule)
855
+ return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
856
+ return stripped
540
857
 
541
- def _sanitize_attrs(
858
+
859
+ def _sanitize_srcset_value(
542
860
  *,
543
- policy: SanitizationPolicy,
861
+ url_policy: UrlPolicy,
862
+ rule: UrlRule,
544
863
  tag: str,
545
- attrs: dict[str, str | None] | None,
546
- ) -> dict[str, str | None]:
547
- if not attrs:
548
- attrs = {}
549
-
550
- allowed_global = set(policy.allowed_attributes.get("*", ()))
551
- allowed_tag = set(policy.allowed_attributes.get(tag, ()))
552
- allowed = allowed_global | allowed_tag
553
-
554
- out: dict[str, str | None] = {}
555
- for raw_name, raw_value in attrs.items():
556
- if not raw_name:
557
- continue
558
-
559
- name = str(raw_name).strip().lower()
560
- if not name:
561
- continue
562
-
563
- # Disallow namespace-ish attributes by default.
564
- if ":" in name:
565
- continue
566
-
567
- # Always drop event handlers.
568
- if name.startswith("on"):
569
- continue
570
-
571
- # Dangerous attribute contexts.
572
- if name == "srcdoc":
573
- continue
574
-
575
- if name not in allowed and not (tag == "a" and name == "rel" and policy.force_link_rel):
576
- continue
577
-
578
- if raw_value is None:
579
- out[name] = None
580
- continue
581
-
582
- value = str(raw_value)
583
- rule = policy.url_rules.get((tag, name))
584
- if rule is not None:
585
- sanitized = _sanitize_url_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)
586
- if sanitized is None:
587
- continue
588
- out[name] = sanitized
589
- elif name == "style":
590
- sanitized_style = _sanitize_inline_style(policy=policy, value=value)
591
- if sanitized_style is None:
592
- continue
593
- out[name] = sanitized_style
594
- else:
595
- out[name] = value
596
-
597
- # Link hardening (merge tokens; do not remove existing ones).
598
- forced_tokens = [t.strip().lower() for t in policy.force_link_rel if str(t).strip()]
599
- if tag == "a" and forced_tokens:
600
- existing_raw = out.get("rel")
601
- existing: list[str] = []
602
- if isinstance(existing_raw, str) and existing_raw:
603
- for tok in existing_raw.split():
604
- t = tok.strip().lower()
605
- if t and t not in existing:
606
- existing.append(t)
607
- for tok in sorted(forced_tokens):
608
- if tok not in existing:
609
- existing.append(tok)
610
- out["rel"] = " ".join(existing)
611
-
612
- return out
613
-
614
-
615
- def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
616
- stack: list[tuple[Any, Any]] = [(original, parent_out)]
617
- while stack:
618
- current, out_parent = stack.pop()
619
- name: str = current.name
620
-
621
- if name == "#text":
622
- out_parent.append_child(current.clone_node(deep=False))
623
- continue
624
-
625
- if name == "#comment":
626
- if policy.drop_comments:
627
- continue
628
- out_parent.append_child(current.clone_node(deep=False))
629
- continue
630
-
631
- if name == "!doctype":
632
- if policy.drop_doctype:
633
- continue
634
- out_parent.append_child(current.clone_node(deep=False))
635
- continue
636
-
637
- # Document containers.
638
- if name.startswith("#"):
639
- clone = current.clone_node(deep=False)
640
- clone.children.clear()
641
- out_parent.append_child(clone)
642
- children = current.children or []
643
- stack.extend((child, clone) for child in reversed(children))
644
- continue
645
-
646
- # Element.
647
- tag = str(name).lower()
648
- if policy.drop_foreign_namespaces:
649
- ns = current.namespace
650
- if ns not in (None, "html"):
651
- continue
652
-
653
- if tag in policy.drop_content_tags:
654
- continue
864
+ attr: str,
865
+ value: str,
866
+ ) -> str | None:
867
+ # Apply the URL filter once to the whole attribute value.
868
+ v = value
869
+ if url_policy.url_filter is not None:
870
+ rewritten = url_policy.url_filter(tag, attr, v)
871
+ if rewritten is None:
872
+ return None
873
+ v = rewritten
655
874
 
656
- if tag not in policy.allowed_tags:
657
- if policy.strip_disallowed_tags:
658
- children = current.children or []
659
- stack.extend((child, out_parent) for child in reversed(children))
875
+ stripped = str(v).strip()
876
+ if not stripped:
877
+ return None
660
878
 
661
- if tag == "template" and current.namespace in (None, "html") and current.template_content:
662
- tc_children = current.template_content.children or []
663
- stack.extend((child, out_parent) for child in reversed(tc_children))
879
+ out_candidates: list[str] = []
880
+ for raw_candidate in stripped.split(","):
881
+ c = raw_candidate.strip()
882
+ if not c:
664
883
  continue
665
884
 
666
- clone = current.clone_node(deep=False)
667
- # Ensure children list is empty before we append sanitized descendants.
668
- clone.children.clear()
669
- # Filter attributes.
670
- clone.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=current.attrs)
671
-
672
- out_parent.append_child(clone)
673
-
674
- # Template content is a separate subtree.
675
- if tag == "template" and current.namespace in (None, "html"):
676
- if current.template_content and clone.template_content:
677
- clone.template_content.children.clear()
678
- tc_children = current.template_content.children or []
679
- stack.extend((child, clone.template_content) for child in reversed(tc_children))
885
+ parts = c.split(None, 1)
886
+ url_token = parts[0]
887
+ desc = parts[1].strip() if len(parts) == 2 else ""
888
+
889
+ sanitized_url = _sanitize_url_value_inner(
890
+ url_policy=url_policy,
891
+ rule=rule,
892
+ tag=tag,
893
+ attr=attr,
894
+ value=url_token,
895
+ apply_filter=False,
896
+ )
897
+ if sanitized_url is None:
898
+ return None
680
899
 
681
- children = current.children or []
682
- stack.extend((child, clone) for child in reversed(children))
900
+ out_candidates.append(f"{sanitized_url} {desc}".strip())
901
+
902
+ return None if not out_candidates else ", ".join(out_candidates)
903
+
904
+
905
_URL_LIKE_ATTRS: frozenset[str] = frozenset(
    (
        # Common URL-valued attributes.
        "href",
        "src",
        "srcset",
        "poster",
        "action",
        "formaction",
        "data",
        "cite",
        "background",
        # Can trigger requests/pings.
        "ping",
    )
)
683
921
 
684
922
 
685
- def sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
923
+ def _sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
686
924
  """Return a sanitized clone of `node`.
687
925
 
688
- If `policy` is not provided, JustHTML uses a conservative default policy.
689
- For full documents (`#document` roots) it preserves `<html>`, `<head>`, and
690
- `<body>` wrappers; for fragments it prefers snippet-shaped output.
926
+ This returns a sanitized clone without mutating the original tree.
927
+ For performance, it builds the sanitized clone in a single pass.
691
928
  """
692
929
 
693
930
  if policy is None:
694
931
  policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY
695
932
 
696
- # Root handling.
697
- root_name: str = node.name
698
-
699
- if root_name == "#text":
700
- return node.clone_node(deep=False)
701
-
702
- if root_name == "#comment":
703
- out_root = node.clone_node(deep=False)
704
- if policy.drop_comments:
705
- out_root.name = "#document-fragment"
706
- return out_root
707
-
708
- if root_name == "!doctype":
709
- out_root = node.clone_node(deep=False)
710
- if policy.drop_doctype:
711
- out_root.name = "#document-fragment"
712
- return out_root
713
-
714
- # Containers.
715
- if root_name.startswith("#"):
716
- out_root = node.clone_node(deep=False)
717
- out_root.children.clear()
718
- for child in node.children or []:
719
- _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
720
- return out_root
721
-
722
- # Element root: keep element if allowed, otherwise unwrap into a fragment.
723
- tag = str(root_name).lower()
724
- if policy.drop_foreign_namespaces and node.namespace not in (None, "html"):
725
- out_root = node.clone_node(deep=False)
726
- out_root.name = "#document-fragment"
727
- out_root.children.clear()
728
- out_root.attrs.clear()
729
- return out_root
730
-
731
- if tag in policy.drop_content_tags or (tag not in policy.allowed_tags and not policy.strip_disallowed_tags):
732
- out_root = node.clone_node(deep=False)
733
- out_root.name = "#document-fragment"
734
- out_root.children.clear()
735
- out_root.attrs.clear()
736
- return out_root
737
-
738
- if tag not in policy.allowed_tags and policy.strip_disallowed_tags:
739
- out_root = node.clone_node(deep=False)
740
- out_root.name = "#document-fragment"
741
- out_root.children.clear()
742
- out_root.attrs.clear()
743
- for child in node.children or []:
744
- _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
745
-
746
- if tag == "template" and node.namespace in (None, "html") and node.template_content:
747
- for child in node.template_content.children or []:
748
- _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
749
- return out_root
750
-
751
- out_root = node.clone_node(deep=False)
752
- out_root.children.clear()
753
- out_root.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=node.attrs)
754
- for child in node.children or []:
755
- _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
756
-
757
- if tag == "template" and node.namespace in (None, "html"):
758
- if node.template_content and out_root.template_content:
759
- out_root.template_content.children.clear()
760
- for child in node.template_content.children or []:
761
- _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root.template_content)
762
-
763
- return out_root
933
+ # Escape-mode tag reconstruction may need access to the original source HTML.
934
+ # Historically we allow a child element to inherit _source_html from an
935
+ # ancestor container; keep that behavior even though we sanitize a clone.
936
+ if policy.disallowed_tag_handling == "escape":
937
+ root_source_html = getattr(node, "_source_html", None)
938
+ if root_source_html:
939
+ from .node import TemplateNode # noqa: PLC0415
940
+
941
+ stack: list[Any] = [node]
942
+ while stack:
943
+ current = stack.pop()
944
+ current_source_html = getattr(current, "_source_html", None) or root_source_html
945
+
946
+ children = getattr(current, "children", None) or []
947
+ for child in children:
948
+ # TextNode does not have _source_html.
949
+ if getattr(child, "name", "") == "#text":
950
+ continue
951
+ if getattr(child, "_source_html", None) is None:
952
+ child._source_html = current_source_html
953
+ stack.append(child)
954
+
955
+ if type(current) is TemplateNode and current.template_content is not None:
956
+ tc = current.template_content
957
+ if getattr(tc, "_source_html", None) is None:
958
+ tc._source_html = current_source_html
959
+ stack.append(tc)
960
+
961
+ # We intentionally implement safe-output sanitization by applying the
962
+ # `Sanitize(policy=...)` transform pipeline to a clone of the node.
963
+ # This keeps a single canonical sanitization algorithm.
964
+ from .transforms import Sanitize, apply_compiled_transforms, compile_transforms # noqa: PLC0415
965
+
966
+ compiled = policy._compiled_sanitize_transforms
967
+ if compiled is None:
968
+ compiled = compile_transforms((Sanitize(policy=policy),))
969
+ object.__setattr__(policy, "_compiled_sanitize_transforms", compiled)
970
+
971
+ # Container-root rule: transforms walk children of the provided root.
972
+ # For non-container roots, wrap the cloned node in a document fragment so
973
+ # the sanitizer can act on the root node itself.
974
+ if node.name in {"#document", "#document-fragment"}:
975
+ cloned = node.clone_node(deep=True)
976
+ apply_compiled_transforms(cloned, compiled, errors=None)
977
+ return cloned
978
+
979
+ from .node import SimpleDomNode # noqa: PLC0415
980
+
981
+ wrapper = SimpleDomNode("#document-fragment")
982
+ wrapper.append_child(node.clone_node(deep=True))
983
+ apply_compiled_transforms(wrapper, compiled, errors=None)
984
+
985
+ children = wrapper.children or []
986
+ if len(children) == 1:
987
+ only = children[0]
988
+ only.parent = None
989
+ wrapper.children = []
990
+ return only
991
+
992
+ return wrapper