justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic; consult the registry's advisory page for details.

justhtml/sanitize.py ADDED
@@ -0,0 +1,992 @@
1
+ """HTML sanitization policy API.
2
+
3
+ This module defines the public API for JustHTML sanitization.
4
+
5
+ The sanitizer operates on the parsed JustHTML DOM and is intentionally
6
+ policy-driven.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Callable, Collection, Mapping
12
+ from dataclasses import dataclass, field
13
+ from typing import Any, Literal, cast
14
+ from urllib.parse import quote, urlsplit
15
+
16
+ from .tokens import ParseError
17
+
18
# Hook signature for UrlPolicy.url_filter: url_filter(tag, attr, value)
# returns a replacement URL string to keep (possibly rewritten), or None
# to drop the attribute entirely.
UrlFilter = Callable[[str, str, str], str | None]
19
+
20
+
21
class UnsafeHtmlError(ValueError):
    """Signals that unsafe HTML was found while unsafe_handling='raise'."""
23
+
24
+
25
# How a security finding is handled: "strip" silently removes the unsafe
# construct, "raise" raises UnsafeHtmlError, "collect" records it as a
# ParseError with category "security".
UnsafeHandling = Literal["strip", "raise", "collect"]

# How a disallowed tag is handled: "unwrap" keeps (sanitized) children,
# "escape" re-emits the tag tokens as text, "drop" removes the subtree.
DisallowedTagHandling = Literal["unwrap", "escape", "drop"]

# What happens to a URL that passed validation: kept as-is, dropped, or
# rewritten through a proxy endpoint.
UrlHandling = Literal["allow", "strip", "proxy"]
30
+
31
+
32
+ @dataclass(frozen=True, slots=True)
33
+ class UrlProxy:
34
+ url: str
35
+ param: str = "url"
36
+
37
+ def __post_init__(self) -> None:
38
+ proxy_url = str(self.url)
39
+ if not proxy_url:
40
+ raise ValueError("UrlProxy.url must be a non-empty string")
41
+ object.__setattr__(self, "url", proxy_url)
42
+ object.__setattr__(self, "param", str(self.param))
43
+
44
+
45
+ @dataclass(frozen=True, slots=True)
46
+ class UrlRule:
47
+ """Rule for a single URL-valued attribute (e.g. a[href], img[src]).
48
+
49
+ This is intentionally rendering-oriented.
50
+
51
+ - Returning/keeping a URL can still cause network requests when the output
52
+ is rendered (notably for <img src>). Applications like email viewers often
53
+ want to block remote loads by default.
54
+ """
55
+
56
+ # Allow same-document fragments (#foo). Typically safe.
57
+ allow_fragment: bool = True
58
+
59
+ # If set, protocol-relative URLs (//example.com) are resolved to this scheme
60
+ # (e.g. "https") before checking allowed_schemes.
61
+ # If None, protocol-relative URLs are disallowed.
62
+ resolve_protocol_relative: str | None = "https"
63
+
64
+ # Allow absolute URLs with these schemes (lowercase), e.g. {"https"}.
65
+ # If empty, all absolute URLs with a scheme are disallowed.
66
+ allowed_schemes: Collection[str] = field(default_factory=set)
67
+
68
+ # If provided, absolute URLs are allowed only if the parsed host is in this
69
+ # allowlist.
70
+ allowed_hosts: Collection[str] | None = None
71
+
72
+ # Optional per-rule handling override.
73
+ # If None, the URL is kept ("allow") after it passes validation.
74
+ handling: UrlHandling | None = None
75
+
76
+ # Optional per-rule override of UrlPolicy.default_allow_relative.
77
+ # If None, UrlPolicy.default_allow_relative is used.
78
+ allow_relative: bool | None = None
79
+
80
+ # Optional proxy override for absolute/protocol-relative URLs.
81
+ # Used when the effective URL handling is "proxy".
82
+ proxy: UrlProxy | None = None
83
+
84
+ def __post_init__(self) -> None:
85
+ # Accept lists/tuples from user code, normalize for internal use.
86
+ if not isinstance(self.allowed_schemes, set):
87
+ object.__setattr__(self, "allowed_schemes", set(self.allowed_schemes))
88
+ if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
89
+ object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
90
+
91
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
92
+ raise TypeError("UrlRule.proxy must be a UrlProxy or None")
93
+
94
+ if self.handling is not None:
95
+ mode = str(self.handling)
96
+ if mode not in {"allow", "strip", "proxy"}:
97
+ raise ValueError("Invalid UrlRule.handling. Expected one of: 'allow', 'strip', 'proxy'")
98
+ object.__setattr__(self, "handling", mode)
99
+
100
+ if self.allow_relative is not None:
101
+ object.__setattr__(self, "allow_relative", bool(self.allow_relative))
102
+
103
+
104
+ @dataclass(frozen=True, slots=True)
105
+ class UrlPolicy:
106
+ # Default handling for URL-like attributes after they pass UrlRule checks.
107
+ # - "allow": keep the URL as-is
108
+ # - "strip": drop the attribute
109
+ # - "proxy": rewrite the URL through a proxy (UrlPolicy.proxy or UrlRule.proxy)
110
+ default_handling: UrlHandling = "strip"
111
+
112
+ # Default allowance for relative URLs (including /path, ./path, ../path, ?query)
113
+ # for URL-like attributes that have a matching UrlRule.
114
+ default_allow_relative: bool = True
115
+
116
+ # Rule configuration for URL-valued attributes.
117
+ allow_rules: Mapping[tuple[str, str], UrlRule] = field(default_factory=dict)
118
+
119
+ # Optional hook that can drop or rewrite URLs.
120
+ # url_filter(tag, attr, value) should return:
121
+ # - a replacement string to keep (possibly rewritten), or
122
+ # - None to drop the attribute.
123
+ url_filter: UrlFilter | None = None
124
+
125
+ # Default proxy config used when a rule is handled with "proxy" and
126
+ # the rule does not specify its own UrlRule.proxy override.
127
+ proxy: UrlProxy | None = None
128
+
129
+ def __post_init__(self) -> None:
130
+ mode = str(self.default_handling)
131
+ if mode not in {"allow", "strip", "proxy"}:
132
+ raise ValueError("Invalid default_handling. Expected one of: 'allow', 'strip', 'proxy'")
133
+ object.__setattr__(self, "default_handling", mode)
134
+
135
+ object.__setattr__(self, "default_allow_relative", bool(self.default_allow_relative))
136
+
137
+ if not isinstance(self.allow_rules, dict):
138
+ object.__setattr__(self, "allow_rules", dict(self.allow_rules))
139
+
140
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
141
+ raise TypeError("UrlPolicy.proxy must be a UrlProxy or None")
142
+
143
+ # Validate proxy configuration for any rules that are in proxy mode.
144
+ for rule in self.allow_rules.values():
145
+ if not isinstance(rule, UrlRule):
146
+ raise TypeError("UrlPolicy.allow_rules values must be UrlRule")
147
+ if rule.handling == "proxy" and self.proxy is None and rule.proxy is None:
148
+ raise ValueError("UrlRule.handling='proxy' requires a UrlPolicy.proxy or a per-rule UrlRule.proxy")
149
+
150
+
151
+ def _proxy_url_value(*, proxy: UrlProxy, value: str) -> str:
152
+ sep = "&" if "?" in proxy.url else "?"
153
+ return f"{proxy.url}{sep}{proxy.param}={quote(value, safe='')}"
154
+
155
+
156
+ @dataclass(slots=True)
157
+ class UnsafeHandler:
158
+ """Centralized handler for security findings.
159
+
160
+ This is intentionally a small stateful object so multiple sanitization-
161
+ related passes/transforms can share the same unsafe-handling behavior and
162
+ (in collect mode) append into the same error list.
163
+ """
164
+
165
+ unsafe_handling: UnsafeHandling
166
+
167
+ # Optional external sink (e.g. a JustHTML document's .errors list).
168
+ # When set and unsafe_handling == "collect", security findings are written
169
+ # into that list so multiple components can share a single sink.
170
+ sink: list[ParseError] | None = None
171
+
172
+ _errors: list[ParseError] | None = None
173
+
174
+ def reset(self) -> None:
175
+ if self.unsafe_handling != "collect":
176
+ self._errors = None
177
+ return
178
+
179
+ if self.sink is None:
180
+ self._errors = []
181
+ return
182
+
183
+ # Remove previously collected security findings from the shared sink to
184
+ # avoid accumulating duplicates across multiple runs.
185
+ errors = self.sink
186
+ write_i = 0
187
+ for e in errors:
188
+ if e.category == "security":
189
+ continue
190
+ errors[write_i] = e
191
+ write_i += 1
192
+ del errors[write_i:]
193
+
194
+ def collected(self) -> list[ParseError]:
195
+ src = self.sink if self.sink is not None else self._errors
196
+ if not src:
197
+ return []
198
+
199
+ if self.sink is not None:
200
+ out = [e for e in src if e.category == "security"]
201
+ else:
202
+ out = list(src)
203
+ out.sort(
204
+ key=lambda e: (
205
+ e.line if e.line is not None else 1_000_000_000,
206
+ e.column if e.column is not None else 1_000_000_000,
207
+ )
208
+ )
209
+ return out
210
+
211
+ def handle(self, msg: str, *, node: Any | None = None) -> None:
212
+ mode = self.unsafe_handling
213
+ if mode == "strip":
214
+ return
215
+ if mode == "raise":
216
+ raise UnsafeHtmlError(msg)
217
+ if mode == "collect":
218
+ dest = self.sink
219
+ if dest is None:
220
+ if self._errors is None:
221
+ self._errors = []
222
+ dest = self._errors
223
+
224
+ line: int | None = None
225
+ column: int | None = None
226
+ if node is not None:
227
+ # Best-effort: use node origin metadata when enabled.
228
+ # This stays allocation-light and avoids any input re-parsing.
229
+ line = node.origin_line
230
+ column = node.origin_col
231
+
232
+ dest.append(
233
+ ParseError(
234
+ "unsafe-html",
235
+ line=line,
236
+ column=column,
237
+ category="security",
238
+ message=msg,
239
+ )
240
+ )
241
+ return
242
+ raise AssertionError(f"Unhandled unsafe_handling: {mode!r}")
243
+
244
+
245
@dataclass(frozen=True, slots=True)
class SanitizationPolicy:
    """An allow-list driven policy for sanitizing a parsed DOM.

    This API is intentionally small. The implementation will interpret these
    fields strictly.

    - Tags not in `allowed_tags` are disallowed.
    - Attributes not in `allowed_attributes[tag]` (or `allowed_attributes["*"]`)
      are disallowed.
    - URL scheme checks apply to attributes listed in `url_attributes`.

    All tag and attribute names are expected to be ASCII-lowercase.
    """

    allowed_tags: Collection[str]
    allowed_attributes: Mapping[str, Collection[str]]

    # URL handling.
    url_policy: UrlPolicy = field(default_factory=UrlPolicy)

    drop_comments: bool = True
    drop_doctype: bool = True
    drop_foreign_namespaces: bool = True

    # Dangerous containers whose text payload should not be preserved.
    drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})

    # Inline style allowlist.
    # Only applies when the `style` attribute is allowed for a tag.
    # If empty, inline styles are effectively disabled (style attributes are
    # dropped); __post_init__ rejects the inconsistent combination outright.
    allowed_css_properties: Collection[str] = field(default_factory=set)

    # Link hardening.
    # If non-empty, ensure these tokens are present in <a rel="...">.
    # (The sanitizer will merge tokens; it will not remove existing ones.)
    force_link_rel: Collection[str] = field(default_factory=set)

    # Determines how unsafe input is handled.
    #
    # - "strip": Default. Remove/drop unsafe constructs and keep going.
    # - "raise": Raise UnsafeHtmlError on the first unsafe construct.
    # - "collect": Record findings; see collected_security_errors().
    #
    # This is intentionally a string mode (instead of a boolean) so we can add
    # more behaviors over time without changing the API shape.
    unsafe_handling: UnsafeHandling = "strip"

    # Determines how disallowed tags are handled.
    #
    # - "unwrap": Default. Drop the tag but keep/sanitize its children.
    # - "escape": Emit original tag tokens as text, keep/sanitize children.
    # - "drop": Drop the entire disallowed subtree.
    disallowed_tag_handling: DisallowedTagHandling = "unwrap"

    # Shared UnsafeHandler; rebuilt in __post_init__ from unsafe_handling.
    _unsafe_handler: UnsafeHandler = field(
        default_factory=lambda: UnsafeHandler("strip"),
        init=False,
        repr=False,
        compare=False,
    )

    # Internal caches to avoid per-node allocations in hot paths.
    _allowed_attrs_global: frozenset[str] = field(
        default_factory=frozenset,
        init=False,
        repr=False,
        compare=False,
    )
    _allowed_attrs_by_tag: dict[str, frozenset[str]] = field(
        default_factory=dict,
        init=False,
        repr=False,
        compare=False,
    )

    # Cache for the compiled `Sanitize(policy=...)` transform pipeline.
    # This lets safe serialization reuse the same compiled transforms.
    _compiled_sanitize_transforms: list[Any] | None = field(
        default=None,
        init=False,
        repr=False,
        compare=False,
    )

    def __post_init__(self) -> None:
        # Normalize to sets so the sanitizer can do fast membership checks.
        # Frozen dataclass: all writes must go through object.__setattr__.
        if not isinstance(self.allowed_tags, set):
            object.__setattr__(self, "allowed_tags", set(self.allowed_tags))

        if not isinstance(self.allowed_attributes, dict) or any(
            not isinstance(v, set) for v in self.allowed_attributes.values()
        ):
            normalized_attrs: dict[str, set[str]] = {}
            for tag, attrs in self.allowed_attributes.items():
                normalized_attrs[str(tag)] = attrs if isinstance(attrs, set) else set(attrs)
            object.__setattr__(self, "allowed_attributes", normalized_attrs)

        if not isinstance(self.drop_content_tags, set):
            object.__setattr__(self, "drop_content_tags", set(self.drop_content_tags))

        if not isinstance(self.allowed_css_properties, set):
            object.__setattr__(self, "allowed_css_properties", set(self.allowed_css_properties))

        if not isinstance(self.force_link_rel, set):
            object.__setattr__(self, "force_link_rel", set(self.force_link_rel))

        # Validate the string modes early so bad configs fail at construction.
        unsafe_handling = str(self.unsafe_handling)
        if unsafe_handling not in {"strip", "raise", "collect"}:
            raise ValueError("Invalid unsafe_handling. Expected one of: 'strip', 'raise', 'collect'")
        object.__setattr__(self, "unsafe_handling", unsafe_handling)

        disallowed_tag_handling = str(self.disallowed_tag_handling)
        if disallowed_tag_handling not in {"unwrap", "escape", "drop"}:
            raise ValueError("Invalid disallowed_tag_handling. Expected one of: 'unwrap', 'escape', 'drop'")
        object.__setattr__(self, "disallowed_tag_handling", disallowed_tag_handling)

        # Centralize unsafe-handling logic so multiple passes can share it.
        handler = UnsafeHandler(cast("UnsafeHandling", unsafe_handling))
        handler.reset()
        object.__setattr__(self, "_unsafe_handler", handler)

        # Normalize rel tokens once so downstream sanitization can stay allocation-light.
        # (Downstream code expects lowercase tokens and ignores empty/whitespace.)
        if self.force_link_rel:
            normalized_force_link_rel = {t.strip().lower() for t in self.force_link_rel if str(t).strip()}
            object.__setattr__(self, "force_link_rel", normalized_force_link_rel)

        # Allowing `style` without any allowed CSS properties is almost
        # certainly a misconfiguration; reject it rather than silently
        # dropping every style attribute.
        style_allowed = any("style" in attrs for attrs in self.allowed_attributes.values())
        if style_allowed and not self.allowed_css_properties:
            raise ValueError(
                "SanitizationPolicy allows the 'style' attribute but allowed_css_properties is empty. "
                "Either remove 'style' from allowed_attributes or set allowed_css_properties (for example CSS_PRESET_TEXT)."
            )

        # Precompute per-tag attribute allowlists ("*" merged into each tag)
        # so the hot per-node path is a single dict/frozenset lookup.
        allowed_attributes = self.allowed_attributes
        allowed_global = frozenset(allowed_attributes.get("*", ()))
        by_tag: dict[str, frozenset[str]] = {}
        for tag, attrs in allowed_attributes.items():
            if tag == "*":
                continue
            by_tag[tag] = frozenset(allowed_global.union(attrs))
        object.__setattr__(self, "_allowed_attrs_global", allowed_global)
        object.__setattr__(self, "_allowed_attrs_by_tag", by_tag)

    def reset_collected_security_errors(self) -> None:
        # Clears findings from previous runs (only meaningful in "collect" mode).
        self._unsafe_handler.reset()

    def collected_security_errors(self) -> list[ParseError]:
        # Position-sorted security findings gathered in "collect" mode.
        return self._unsafe_handler.collected()

    def handle_unsafe(self, msg: str, *, node: Any | None = None) -> None:
        # Delegate to the shared handler (strip / raise / collect).
        self._unsafe_handler.handle(msg, node=node)
397
+
398
+
399
+ _URL_NORMALIZE_STRIP_TABLE = {i: None for i in range(0x21)}
400
+ _URL_NORMALIZE_STRIP_TABLE[0x7F] = None
401
+
402
+
403
# Fragment-level default policy: basic text/structure markup, a small
# attribute allowlist, http(s)/mailto/tel links, and an img[src] rule whose
# empty scheme allowlist rejects absolute and protocol-relative URLs
# (relative image URLs remain allowed via UrlPolicy.default_allow_relative).
DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=[
        # Paragraphs / line breaks
        "p",
        "br",
        # Structure
        "div",
        "span",
        "blockquote",
        "pre",
        "code",
        # Headings
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        # Lists
        "ul",
        "ol",
        "li",
        # Tables
        "table",
        "thead",
        "tbody",
        "tfoot",
        "tr",
        "th",
        "td",
        # Text formatting
        "b",
        "strong",
        "i",
        "em",
        "u",
        "s",
        "sub",
        "sup",
        "small",
        "mark",
        # Thematic break
        "hr",
        # Links and images
        "a",
        "img",
    ],
    allowed_attributes={
        "*": ["class", "id", "title", "lang", "dir"],
        "a": ["href", "title"],
        "img": ["src", "alt", "title", "width", "height", "loading", "decoding"],
        "th": ["colspan", "rowspan"],
        "td": ["colspan", "rowspan"],
    },
    url_policy=UrlPolicy(
        default_handling="allow",
        allow_rules={
            # Links: safe navigation schemes only.
            ("a", "href"): UrlRule(
                allowed_schemes=["http", "https", "mailto", "tel"],
                resolve_protocol_relative="https",
            ),
            # Images: no schemes allowed at all, so no remote loads.
            ("img", "src"): UrlRule(
                allowed_schemes=[],
                resolve_protocol_relative=None,
            ),
        },
    ),
    allowed_css_properties=set(),
)
473
+
474
+
475
# A conservative preset for allowing a small amount of inline styling.
# Intentionally limited to text-level styling; layout/positioning properties
# that are commonly abused for UI redress are excluded.
CSS_PRESET_TEXT: frozenset[str] = frozenset(
    "background-color color font-size font-style font-weight "
    "letter-spacing line-height text-align text-decoration text-transform "
    "white-space word-break word-spacing word-wrap".split()
)
496
+
497
+
498
# Document-level variant of DEFAULT_POLICY: identical settings plus the
# document scaffolding tags ("html", "head", "body", "title").
DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
    allowed_attributes=DEFAULT_POLICY.allowed_attributes,
    url_policy=DEFAULT_POLICY.url_policy,
    drop_comments=DEFAULT_POLICY.drop_comments,
    drop_doctype=DEFAULT_POLICY.drop_doctype,
    drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
    drop_content_tags=DEFAULT_POLICY.drop_content_tags,
    allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
    force_link_rel=DEFAULT_POLICY.force_link_rel,
)
509
+
510
+
511
+ def _is_valid_css_property_name(name: str) -> bool:
512
+ # Conservative: allow only ASCII letters/digits/hyphen.
513
+ # This keeps parsing deterministic and avoids surprises with escapes.
514
+ if not name:
515
+ return False
516
+ for ch in name:
517
+ if "a" <= ch <= "z" or "0" <= ch <= "9" or ch == "-":
518
+ continue
519
+ return False
520
+ return True
521
+
522
+
523
+ def _css_value_may_load_external_resource(value: str) -> bool:
524
+ # Extremely conservative check: drop any declaration value that contains a
525
+ # CSS function call that can load external resources.
526
+ #
527
+ # We intentionally do not try to parse full CSS (escapes, comments, strings,
528
+ # etc.). Instead, we reject values that contain backslashes (common escape
529
+ # obfuscation) or that *look* like they contain url(…) / image-set(…). This
530
+ # ensures style attributes can't be used to trigger network requests even
531
+ # when users allow potentially dangerous properties.
532
+ if "\\" in value:
533
+ return True
534
+
535
+ # Scan while ignoring ASCII whitespace/control chars and CSS comments.
536
+ # Keep a small rolling buffer to avoid extra allocations.
537
+ buf: list[str] = []
538
+ max_len = len("alphaimageloader")
539
+
540
+ i = 0
541
+ n = len(value)
542
+ while i < n:
543
+ ch = value[i]
544
+
545
+ # Treat CSS comments as ignorable, so obfuscation like u/**/rl( is caught.
546
+ if ch == "/" and i + 1 < n and value[i + 1] == "*":
547
+ i += 2
548
+ while i + 1 < n:
549
+ if value[i] == "*" and value[i + 1] == "/":
550
+ i += 2
551
+ break
552
+ i += 1
553
+ else:
554
+ # Unterminated comments are invalid CSS; be conservative.
555
+ return True
556
+ continue
557
+
558
+ o = ord(ch)
559
+ if o <= 0x20 or o == 0x7F:
560
+ i += 1
561
+ continue
562
+
563
+ if "A" <= ch <= "Z":
564
+ lower_ch = chr(o + 0x20)
565
+ else:
566
+ lower_ch = ch
567
+
568
+ buf.append(lower_ch)
569
+ if len(buf) > max_len:
570
+ buf.pop(0)
571
+
572
+ # Check for url( and image-set( anywhere in the normalized stream.
573
+ if len(buf) >= 4 and buf[-4:] == ["u", "r", "l", "("]:
574
+ return True
575
+ if len(buf) >= 10 and buf[-10:] == [
576
+ "i",
577
+ "m",
578
+ "a",
579
+ "g",
580
+ "e",
581
+ "-",
582
+ "s",
583
+ "e",
584
+ "t",
585
+ "(",
586
+ ]:
587
+ return True
588
+
589
+ # IE-only but still worth blocking defensively.
590
+ if len(buf) >= 11 and buf[-11:] == [
591
+ "e",
592
+ "x",
593
+ "p",
594
+ "r",
595
+ "e",
596
+ "s",
597
+ "s",
598
+ "i",
599
+ "o",
600
+ "n",
601
+ "(",
602
+ ]:
603
+ return True
604
+
605
+ # Legacy IE CSS filters that can fetch remote resources.
606
+ if len(buf) >= 7 and buf[-7:] == ["p", "r", "o", "g", "i", "d", ":"]:
607
+ return True
608
+ if len(buf) >= 16 and buf[-16:] == [
609
+ "a",
610
+ "l",
611
+ "p",
612
+ "h",
613
+ "a",
614
+ "i",
615
+ "m",
616
+ "a",
617
+ "g",
618
+ "e",
619
+ "l",
620
+ "o",
621
+ "a",
622
+ "d",
623
+ "e",
624
+ "r",
625
+ ]:
626
+ return True
627
+
628
+ # Legacy bindings/behaviors that can pull remote content.
629
+ if len(buf) >= 9 and buf[-9:] == ["b", "e", "h", "a", "v", "i", "o", "r", ":"]:
630
+ return True
631
+ if len(buf) >= 12 and buf[-12:] == [
632
+ "-",
633
+ "m",
634
+ "o",
635
+ "z",
636
+ "-",
637
+ "b",
638
+ "i",
639
+ "n",
640
+ "d",
641
+ "i",
642
+ "n",
643
+ "g",
644
+ ]:
645
+ return True
646
+
647
+ i += 1
648
+
649
+ return False
650
+
651
+
652
def _sanitize_inline_style(*, allowed_css_properties: Collection[str], value: str) -> str | None:
    """Filter an inline style attribute down to allowlisted declarations.

    Returns the surviving declarations re-serialized as "prop: value; ...",
    or None when nothing survives (the caller should drop the attribute).
    """
    if not allowed_css_properties:
        return None

    text = str(value)
    if not text:
        return None

    kept: list[str] = []
    for raw in text.split(";"):
        declaration = raw.strip()
        if not declaration:
            continue

        name, sep, rest = declaration.partition(":")
        if not sep:
            continue

        prop = name.strip().lower()
        # Invalid names (including an empty name before the colon) and
        # non-allowlisted properties are silently dropped.
        if not _is_valid_css_property_name(prop):
            continue
        if prop not in allowed_css_properties:
            continue

        body = rest.strip()
        if not body:
            continue

        # Drop any value that could cause a network request.
        if _css_value_may_load_external_resource(body):
            continue

        kept.append(f"{prop}: {body}")

    return "; ".join(kept) if kept else None
688
+
689
+
690
def _normalize_url_for_checking(value: str) -> str:
    """Delete ASCII whitespace/control chars (0x00-0x20, 0x7F) from `value`.

    These characters are commonly used for scheme obfuscation (e.g.
    "java\\tscript:"). Note: backslashes are deliberately NOT stripped; they
    are not whitespace/control characters, and removing them can turn invalid
    schemes into valid ones.
    """
    return value.translate(_URL_NORMALIZE_STRIP_TABLE)
695
+
696
+
697
+ def _is_valid_scheme(scheme: str) -> bool:
698
+ first = scheme[0]
699
+ if not ("a" <= first <= "z" or "A" <= first <= "Z"):
700
+ return False
701
+ for ch in scheme[1:]:
702
+ if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9" or ch in "+-.":
703
+ continue
704
+ return False
705
+ return True
706
+
707
+
708
+ def _has_scheme(value: str) -> bool:
709
+ idx = value.find(":")
710
+ if idx <= 0:
711
+ return False
712
+ # Scheme must appear before any path/query/fragment separator.
713
+ end = len(value)
714
+ for sep in ("/", "?", "#"):
715
+ j = value.find(sep)
716
+ if j != -1 and j < end:
717
+ end = j
718
+ if idx >= end:
719
+ return False
720
+ return _is_valid_scheme(value[:idx])
721
+
722
+
723
+ def _has_invalid_scheme_like_prefix(value: str) -> bool:
724
+ idx = value.find(":")
725
+ if idx <= 0:
726
+ return False
727
+
728
+ end = len(value)
729
+ for sep in ("/", "?", "#"):
730
+ j = value.find(sep)
731
+ if j != -1 and j < end:
732
+ end = j
733
+ if idx >= end:
734
+ return False
735
+
736
+ return not _is_valid_scheme(value[:idx])
737
+
738
+
739
def _sanitize_url_value(
    *,
    url_policy: UrlPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
) -> str | None:
    """Validate/rewrite one URL attribute value; None means drop the attribute.

    Public entry point: delegates to the shared worker with the policy-level
    URL filter enabled.
    """
    return _sanitize_url_value_inner(
        url_policy=url_policy,
        rule=rule,
        tag=tag,
        attr=attr,
        value=value,
        apply_filter=True,
    )
750
+
751
+
752
+ def _effective_proxy(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlProxy | None:
753
+ return rule.proxy if rule.proxy is not None else url_policy.proxy
754
+
755
+
756
+ def _effective_url_handling(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlHandling:
757
+ # URL-like attributes are allowlisted via UrlPolicy.allow_rules. When they are
758
+ # allowlisted and the URL passes validation, the default action is to keep the URL.
759
+ return rule.handling if rule.handling is not None else "allow"
760
+
761
+
762
+ def _effective_allow_relative(*, url_policy: UrlPolicy, rule: UrlRule) -> bool:
763
+ return rule.allow_relative if rule.allow_relative is not None else url_policy.default_allow_relative
764
+
765
+
766
def _sanitize_url_value_inner(
    *,
    url_policy: UrlPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
    apply_filter: bool,
) -> str | None:
    """Core URL validation/rewriting shared by single-URL and srcset paths.

    Classifies the value as fragment, protocol-relative, absolute (has a
    scheme), or relative, validates it against `rule`, then applies the
    effective handling ("allow" keeps it, "strip" drops it, "proxy" rewrites
    it through the effective proxy). Returns the value to keep, or None to
    drop the attribute.
    """
    v = value
    mode = _effective_url_handling(url_policy=url_policy, rule=rule)
    allow_relative = _effective_allow_relative(url_policy=url_policy, rule=rule)

    # The policy-level filter runs first and may drop or rewrite the URL.
    # (srcset calls this per-candidate with apply_filter=False, having already
    # filtered the whole attribute once.)
    if apply_filter and url_policy.url_filter is not None:
        rewritten = url_policy.url_filter(tag, attr, v)
        if rewritten is None:
            return None
        v = rewritten

    # All checks below run on `normalized` (control/whitespace chars deleted)
    # to defeat scheme obfuscation like "java\tscript:".
    stripped = str(v).strip()
    normalized = _normalize_url_for_checking(stripped)
    if not normalized:
        # If normalization removes everything, the value was empty/whitespace/
        # control-only. Drop it rather than keeping weird control characters.
        return None

    # Same-document fragment (#foo).
    if normalized.startswith("#"):
        if not rule.allow_fragment:
            return None
        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
        return stripped

    if mode == "proxy" and _has_invalid_scheme_like_prefix(normalized):
        # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
        # Some user agents normalize backslashes and other characters during navigation.
        return None

    # Protocol-relative (//host/...): resolve to a concrete scheme first.
    if normalized.startswith("//"):
        if not rule.resolve_protocol_relative:
            return None

        # Resolve to absolute URL for checking.
        resolved_scheme = rule.resolve_protocol_relative.lower()
        resolved_url = f"{resolved_scheme}:{normalized}"

        parsed = urlsplit(resolved_url)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None

        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None

        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=resolved_url)
        return resolved_url

    # Absolute URL with an explicit scheme.
    if _has_scheme(normalized):
        parsed = urlsplit(normalized)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None
        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None
        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
        # NOTE(review): the kept value is `stripped`, not `normalized` —
        # embedded control characters survive even though the scheme/host
        # checks ran on the normalized form. Confirm this is intended.
        return stripped

    # Everything else is a relative URL (/path, ./path, ?query, ...).
    if not allow_relative:
        return None

    if mode == "strip":
        return None
    if mode == "proxy":
        proxy = _effective_proxy(url_policy=url_policy, rule=rule)
        return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
    return stripped
857
+
858
+
859
def _sanitize_srcset_value(
    *,
    url_policy: UrlPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
) -> str | None:
    """Sanitize an <img srcset>-style attribute (comma-separated candidates).

    Each candidate is "URL [descriptor]". Every URL must pass the same checks
    as a single URL attribute; if any candidate fails, the whole attribute is
    dropped (None) rather than partially kept.
    """
    # The policy-level URL filter is applied once to the whole attribute
    # value, not per candidate.
    v = value
    if url_policy.url_filter is not None:
        rewritten = url_policy.url_filter(tag, attr, v)
        if rewritten is None:
            return None
        v = rewritten

    whole = str(v).strip()
    if not whole:
        return None

    results: list[str] = []
    for chunk in whole.split(","):
        candidate = chunk.strip()
        if not candidate:
            continue

        # Split "URL descriptor" on the first run of whitespace.
        pieces = candidate.split(None, 1)
        url_token = pieces[0]
        descriptor = pieces[1].strip() if len(pieces) == 2 else ""

        cleaned = _sanitize_url_value_inner(
            url_policy=url_policy,
            rule=rule,
            tag=tag,
            attr=attr,
            value=url_token,
            apply_filter=False,
        )
        if cleaned is None:
            # One bad candidate poisons the whole attribute.
            return None

        results.append(f"{cleaned} {descriptor}".strip())

    return ", ".join(results) if results else None
903
+
904
+
905
+ _URL_LIKE_ATTRS: frozenset[str] = frozenset(
906
+ {
907
+ # Common URL-valued attributes.
908
+ "href",
909
+ "src",
910
+ "srcset",
911
+ "poster",
912
+ "action",
913
+ "formaction",
914
+ "data",
915
+ "cite",
916
+ "background",
917
+ # Can trigger requests/pings.
918
+ "ping",
919
+ }
920
+ )
921
+
922
+
923
def _sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
    """Return a sanitized clone of `node`.

    This returns a sanitized clone without mutating the original tree.
    For performance, it builds the sanitized clone in a single pass.
    When `policy` is None, documents get DEFAULT_DOCUMENT_POLICY and
    everything else DEFAULT_POLICY.
    """

    if policy is None:
        policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY

    # Escape-mode tag reconstruction may need access to the original source HTML.
    # Historically we allow a child element to inherit _source_html from an
    # ancestor container; keep that behavior even though we sanitize a clone.
    if policy.disallowed_tag_handling == "escape":
        root_source_html = getattr(node, "_source_html", None)
        if root_source_html:
            from .node import TemplateNode  # noqa: PLC0415

            # Iterative DFS: propagate _source_html down to descendants that
            # do not already carry their own copy.
            stack: list[Any] = [node]
            while stack:
                current = stack.pop()
                current_source_html = getattr(current, "_source_html", None) or root_source_html

                children = getattr(current, "children", None) or []
                for child in children:
                    # TextNode does not have _source_html.
                    if getattr(child, "name", "") == "#text":
                        continue
                    if getattr(child, "_source_html", None) is None:
                        child._source_html = current_source_html
                    stack.append(child)

                # <template> content lives in a separate subtree; walk it too.
                if type(current) is TemplateNode and current.template_content is not None:
                    tc = current.template_content
                    if getattr(tc, "_source_html", None) is None:
                        tc._source_html = current_source_html
                    stack.append(tc)

    # We intentionally implement safe-output sanitization by applying the
    # `Sanitize(policy=...)` transform pipeline to a clone of the node.
    # This keeps a single canonical sanitization algorithm.
    from .transforms import Sanitize, apply_compiled_transforms, compile_transforms  # noqa: PLC0415

    # Reuse the pipeline cached on the (frozen) policy; object.__setattr__
    # is required to write the cache field.
    compiled = policy._compiled_sanitize_transforms
    if compiled is None:
        compiled = compile_transforms((Sanitize(policy=policy),))
        object.__setattr__(policy, "_compiled_sanitize_transforms", compiled)

    # Container-root rule: transforms walk children of the provided root.
    # For non-container roots, wrap the cloned node in a document fragment so
    # the sanitizer can act on the root node itself.
    if node.name in {"#document", "#document-fragment"}:
        cloned = node.clone_node(deep=True)
        apply_compiled_transforms(cloned, compiled, errors=None)
        return cloned

    from .node import SimpleDomNode  # noqa: PLC0415

    wrapper = SimpleDomNode("#document-fragment")
    wrapper.append_child(node.clone_node(deep=True))
    apply_compiled_transforms(wrapper, compiled, errors=None)

    # If exactly one node survived, unwrap it from the temporary fragment;
    # otherwise return the fragment (zero or multiple surviving roots).
    children = wrapper.children or []
    if len(children) == 1:
        only = children[0]
        only.parent = None
        wrapper.children = []
        return only

    return wrapper