justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/sanitize.py ADDED
@@ -0,0 +1,1141 @@
1
+ """HTML sanitization policy API.
2
+
3
+ This module defines the public API for JustHTML sanitization.
4
+
5
+ The sanitizer operates on the parsed JustHTML DOM and is intentionally
6
+ policy-driven.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Callable, Collection, Mapping
12
+ from dataclasses import dataclass, field
13
+ from typing import Any, Literal
14
+ from urllib.parse import quote, urlsplit
15
+
16
+ from .tokens import ParseError
17
+
18
# Hook signature for UrlPolicy.url_filter: called as (tag, attr, value) and
# returns a replacement string to keep (possibly rewritten) or None to drop
# the attribute.
UrlFilter = Callable[[str, str, str], str | None]
19
+
20
+
21
class UnsafeHtmlError(ValueError):
    """Signals that unsafe HTML was found while unsafe_handling='raise'."""
23
+
24
+
25
# Modes for SanitizationPolicy.unsafe_handling: silently drop unsafe
# constructs, raise UnsafeHtmlError on the first one, or record them.
UnsafeHandling = Literal["strip", "raise", "collect"]


# Modes for URL-valued attributes that pass validation: keep the URL, drop
# the attribute, or rewrite the URL through a proxy endpoint.
UrlHandling = Literal["allow", "strip", "proxy"]
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class UrlProxy:
33
+ url: str
34
+ param: str = "url"
35
+
36
+ def __post_init__(self) -> None:
37
+ proxy_url = str(self.url)
38
+ if not proxy_url:
39
+ raise ValueError("UrlProxy.url must be a non-empty string")
40
+ object.__setattr__(self, "url", proxy_url)
41
+ object.__setattr__(self, "param", str(self.param))
42
+
43
+
44
+ @dataclass(frozen=True, slots=True)
45
+ class UrlRule:
46
+ """Rule for a single URL-valued attribute (e.g. a[href], img[src]).
47
+
48
+ This is intentionally rendering-oriented.
49
+
50
+ - Returning/keeping a URL can still cause network requests when the output
51
+ is rendered (notably for <img src>). Applications like email viewers often
52
+ want to block remote loads by default.
53
+ """
54
+
55
+ # Allow same-document fragments (#foo). Typically safe.
56
+ allow_fragment: bool = True
57
+
58
+ # If set, protocol-relative URLs (//example.com) are resolved to this scheme
59
+ # (e.g. "https") before checking allowed_schemes.
60
+ # If None, protocol-relative URLs are disallowed.
61
+ resolve_protocol_relative: str | None = "https"
62
+
63
+ # Allow absolute URLs with these schemes (lowercase), e.g. {"https"}.
64
+ # If empty, all absolute URLs with a scheme are disallowed.
65
+ allowed_schemes: Collection[str] = field(default_factory=set)
66
+
67
+ # If provided, absolute URLs are allowed only if the parsed host is in this
68
+ # allowlist.
69
+ allowed_hosts: Collection[str] | None = None
70
+
71
+ # Optional per-rule handling override.
72
+ # If None, the URL is kept ("allow") after it passes validation.
73
+ handling: UrlHandling | None = None
74
+
75
+ # Optional per-rule override of UrlPolicy.default_allow_relative.
76
+ # If None, UrlPolicy.default_allow_relative is used.
77
+ allow_relative: bool | None = None
78
+
79
+ # Optional proxy override for absolute/protocol-relative URLs.
80
+ # Used when the effective URL handling is "proxy".
81
+ proxy: UrlProxy | None = None
82
+
83
+ def __post_init__(self) -> None:
84
+ # Accept lists/tuples from user code, normalize for internal use.
85
+ if not isinstance(self.allowed_schemes, set):
86
+ object.__setattr__(self, "allowed_schemes", set(self.allowed_schemes))
87
+ if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
88
+ object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
89
+
90
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
91
+ raise TypeError("UrlRule.proxy must be a UrlProxy or None")
92
+
93
+ if self.handling is not None:
94
+ mode = str(self.handling)
95
+ if mode not in {"allow", "strip", "proxy"}:
96
+ raise ValueError("Invalid UrlRule.handling. Expected one of: 'allow', 'strip', 'proxy'")
97
+ object.__setattr__(self, "handling", mode)
98
+
99
+ if self.allow_relative is not None:
100
+ object.__setattr__(self, "allow_relative", bool(self.allow_relative))
101
+
102
+
103
+ @dataclass(frozen=True, slots=True)
104
+ class UrlPolicy:
105
+ # Default handling for URL-like attributes after they pass UrlRule checks.
106
+ # - "allow": keep the URL as-is
107
+ # - "strip": drop the attribute
108
+ # - "proxy": rewrite the URL through a proxy (UrlPolicy.proxy or UrlRule.proxy)
109
+ default_handling: UrlHandling = "strip"
110
+
111
+ # Default allowance for relative URLs (including /path, ./path, ../path, ?query)
112
+ # for URL-like attributes that have a matching UrlRule.
113
+ default_allow_relative: bool = True
114
+
115
+ # Rule configuration for URL-valued attributes.
116
+ allow_rules: Mapping[tuple[str, str], UrlRule] = field(default_factory=dict)
117
+
118
+ # Optional hook that can drop or rewrite URLs.
119
+ # url_filter(tag, attr, value) should return:
120
+ # - a replacement string to keep (possibly rewritten), or
121
+ # - None to drop the attribute.
122
+ url_filter: UrlFilter | None = None
123
+
124
+ # Default proxy config used when a rule is handled with "proxy" and
125
+ # the rule does not specify its own UrlRule.proxy override.
126
+ proxy: UrlProxy | None = None
127
+
128
+ def __post_init__(self) -> None:
129
+ mode = str(self.default_handling)
130
+ if mode not in {"allow", "strip", "proxy"}:
131
+ raise ValueError("Invalid default_handling. Expected one of: 'allow', 'strip', 'proxy'")
132
+ object.__setattr__(self, "default_handling", mode)
133
+
134
+ object.__setattr__(self, "default_allow_relative", bool(self.default_allow_relative))
135
+
136
+ if not isinstance(self.allow_rules, dict):
137
+ object.__setattr__(self, "allow_rules", dict(self.allow_rules))
138
+
139
+ if self.proxy is not None and not isinstance(self.proxy, UrlProxy):
140
+ raise TypeError("UrlPolicy.proxy must be a UrlProxy or None")
141
+
142
+ # Validate proxy configuration for any rules that are in proxy mode.
143
+ for rule in self.allow_rules.values():
144
+ if not isinstance(rule, UrlRule):
145
+ raise TypeError("UrlPolicy.allow_rules values must be UrlRule")
146
+ if rule.handling == "proxy" and self.proxy is None and rule.proxy is None:
147
+ raise ValueError("UrlRule.handling='proxy' requires a UrlPolicy.proxy or a per-rule UrlRule.proxy")
148
+
149
+
150
+ def _proxy_url_value(*, proxy: UrlProxy, value: str) -> str:
151
+ sep = "&" if "?" in proxy.url else "?"
152
+ return f"{proxy.url}{sep}{proxy.param}={quote(value, safe='')}"
153
+
154
+
155
@dataclass(frozen=True, slots=True)
class SanitizationPolicy:
    """An allow-list driven policy for sanitizing a parsed DOM.

    This API is intentionally small. The implementation will interpret these
    fields strictly.

    - Tags not in `allowed_tags` are disallowed.
    - Attributes not in `allowed_attributes[tag]` (or `allowed_attributes["*"]`)
      are disallowed.
    - URL scheme checks apply to attributes listed in `url_attributes`.

    All tag and attribute names are expected to be ASCII-lowercase.
    """

    allowed_tags: Collection[str]
    allowed_attributes: Mapping[str, Collection[str]]

    # URL handling.
    url_policy: UrlPolicy = field(default_factory=UrlPolicy)

    drop_comments: bool = True
    drop_doctype: bool = True
    drop_foreign_namespaces: bool = True

    # If True, disallowed elements are removed but their children may be kept
    # (except for tags in `drop_content_tags`).
    strip_disallowed_tags: bool = True

    # Dangerous containers whose text payload should not be preserved.
    drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})

    # Inline style allowlist.
    # Only applies when the `style` attribute is allowed for a tag.
    # If empty, inline styles are effectively disabled (style attributes are dropped).
    allowed_css_properties: Collection[str] = field(default_factory=set)

    # Link hardening.
    # If non-empty, ensure these tokens are present in <a rel="...">.
    # (The sanitizer will merge tokens; it will not remove existing ones.)
    force_link_rel: Collection[str] = field(default_factory=set)

    # Determines how unsafe input is handled.
    #
    # - "strip": Default. Remove/drop unsafe constructs and keep going.
    # - "raise": Raise UnsafeHtmlError on the first unsafe construct.
    # - "collect": Record a ParseError per unsafe construct (see
    #   collected_security_errors()) and keep going.
    #
    # This is intentionally a string mode (instead of a boolean) so we can add
    # more behaviors over time without changing the API shape.
    unsafe_handling: UnsafeHandling = "strip"

    # Mutable scratch buffer used in "collect" mode; excluded from init,
    # repr, and equality so it never affects the policy's value identity.
    _collected_security_errors: list[ParseError] | None = field(
        default=None,
        init=False,
        repr=False,
        compare=False,
    )

    # Internal caches to avoid per-node allocations in hot paths.
    _allowed_attrs_global: frozenset[str] = field(
        default_factory=frozenset,
        init=False,
        repr=False,
        compare=False,
    )
    _allowed_attrs_by_tag: dict[str, frozenset[str]] = field(
        default_factory=dict,
        init=False,
        repr=False,
        compare=False,
    )

    def __post_init__(self) -> None:
        """Normalize/validate all fields and precompute attribute allowlists."""
        # Normalize to sets so the sanitizer can do fast membership checks.
        if not isinstance(self.allowed_tags, set):
            object.__setattr__(self, "allowed_tags", set(self.allowed_tags))

        if not isinstance(self.allowed_attributes, dict) or any(
            not isinstance(v, set) for v in self.allowed_attributes.values()
        ):
            normalized_attrs: dict[str, set[str]] = {}
            for tag, attrs in self.allowed_attributes.items():
                normalized_attrs[str(tag)] = attrs if isinstance(attrs, set) else set(attrs)
            object.__setattr__(self, "allowed_attributes", normalized_attrs)

        if not isinstance(self.drop_content_tags, set):
            object.__setattr__(self, "drop_content_tags", set(self.drop_content_tags))

        if not isinstance(self.allowed_css_properties, set):
            object.__setattr__(self, "allowed_css_properties", set(self.allowed_css_properties))

        if not isinstance(self.force_link_rel, set):
            object.__setattr__(self, "force_link_rel", set(self.force_link_rel))

        unsafe_handling = str(self.unsafe_handling)
        if unsafe_handling not in {"strip", "raise", "collect"}:
            raise ValueError("Invalid unsafe_handling. Expected one of: 'strip', 'raise', 'collect'")
        object.__setattr__(self, "unsafe_handling", unsafe_handling)

        # Normalize rel tokens once so _sanitize_attrs() can stay allocation-light.
        # (Downstream code expects lowercase tokens and ignores empty/whitespace.)
        if self.force_link_rel:
            normalized_force_link_rel = {t.strip().lower() for t in self.force_link_rel if str(t).strip()}
            object.__setattr__(self, "force_link_rel", normalized_force_link_rel)

        # Fail fast on a config that would silently drop every style attribute.
        style_allowed = any("style" in attrs for attrs in self.allowed_attributes.values())
        if style_allowed and not self.allowed_css_properties:
            raise ValueError(
                "SanitizationPolicy allows the 'style' attribute but allowed_css_properties is empty. "
                "Either remove 'style' from allowed_attributes or set allowed_css_properties (for example CSS_PRESET_TEXT)."
            )

        # Precompute per-tag allowlists (global "*" attrs merged in) so the
        # hot attribute loop does a single dict lookup per element.
        allowed_attributes = self.allowed_attributes
        allowed_global = frozenset(allowed_attributes.get("*", ()))
        by_tag: dict[str, frozenset[str]] = {}
        for tag, attrs in allowed_attributes.items():
            if tag == "*":
                continue
            by_tag[tag] = frozenset(allowed_global.union(attrs))
        object.__setattr__(self, "_allowed_attrs_global", allowed_global)
        object.__setattr__(self, "_allowed_attrs_by_tag", by_tag)

    def reset_collected_security_errors(self) -> None:
        """Prepare (in "collect" mode) or clear the error buffer for a new run."""
        if self.unsafe_handling == "collect":
            object.__setattr__(self, "_collected_security_errors", [])
        else:
            object.__setattr__(self, "_collected_security_errors", None)

    def collected_security_errors(self) -> list[ParseError]:
        """Return errors recorded in "collect" mode, sorted by input position."""
        if self._collected_security_errors is None:
            return []
        out = list(self._collected_security_errors)
        # Keep ordering consistent with JustHTML error ordering: by input position.
        # Errors without a location sort last.
        out.sort(
            key=lambda e: (
                e.line if e.line is not None else 1_000_000_000,
                e.column if e.column is not None else 1_000_000_000,
            )
        )
        return out

    def handle_unsafe(self, msg: str, *, node: Any | None = None) -> None:
        """Dispatch one unsafe-construct report according to unsafe_handling."""
        mode = self.unsafe_handling
        if mode == "strip":
            return
        if mode == "raise":
            raise UnsafeHtmlError(msg)
        if mode == "collect":
            # Lazily create the buffer in case reset_collected_security_errors()
            # was never called on this policy instance.
            collected = self._collected_security_errors
            if collected is None:
                collected = []
                object.__setattr__(self, "_collected_security_errors", collected)

            line: int | None = None
            column: int | None = None
            if node is not None:
                # Best-effort: use node origin metadata when enabled.
                # This stays allocation-light and avoids any input re-parsing.
                line = node.origin_line
                column = node.origin_col

            collected.append(
                ParseError(
                    "unsafe-html",
                    line=line,
                    column=column,
                    category="security",
                    message=msg,
                )
            )
            return
        # __post_init__ validated the mode, so this is unreachable.
        raise AssertionError(f"Unhandled unsafe_handling: {mode!r}")
328
+
329
+
330
# str.translate table deleting ASCII controls/whitespace (0x00-0x20) and DEL
# (0x7F); used by _normalize_url_for_checking to defeat scheme obfuscation.
_URL_NORMALIZE_STRIP_TABLE = {i: None for i in range(0x21)}
_URL_NORMALIZE_STRIP_TABLE[0x7F] = None
332
+
333
+
334
# Conservative default for sanitizing HTML fragments: text-level markup,
# tables, links and images. Inline styles are disabled (empty
# allowed_css_properties) and <img> may only use relative URLs.
DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=[
        # Text / structure
        "p",
        "br",
        # Structure
        "div",
        "span",
        "blockquote",
        "pre",
        "code",
        # Headings
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        # Lists
        "ul",
        "ol",
        "li",
        # Tables
        "table",
        "thead",
        "tbody",
        "tfoot",
        "tr",
        "th",
        "td",
        # Text formatting
        "b",
        "strong",
        "i",
        "em",
        "u",
        "s",
        "sub",
        "sup",
        "small",
        "mark",
        # Thematic break
        "hr",
        # Links and images
        "a",
        "img",
    ],
    allowed_attributes={
        "*": ["class", "id", "title", "lang", "dir"],
        "a": ["href", "title"],
        "img": ["src", "alt", "title", "width", "height", "loading", "decoding"],
        "th": ["colspan", "rowspan"],
        "td": ["colspan", "rowspan"],
    },
    url_policy=UrlPolicy(
        default_handling="allow",
        allow_rules={
            # Links: common navigation schemes only.
            ("a", "href"): UrlRule(
                allowed_schemes=["http", "https", "mailto", "tel"],
                resolve_protocol_relative="https",
            ),
            # Images: no absolute or protocol-relative URLs (blocks remote
            # loads); relative URLs remain allowed via the policy default.
            ("img", "src"): UrlRule(
                allowed_schemes=[],
                resolve_protocol_relative=None,
            ),
        },
    ),
    allowed_css_properties=set(),
)
404
+
405
+
406
+ # A conservative preset for allowing a small amount of inline styling.
407
+ # This is intentionally focused on text-level styling and avoids layout/
408
+ # positioning properties that are commonly abused for UI redress.
409
# A conservative preset for allowing a small amount of inline styling.
# This is intentionally focused on text-level styling and avoids layout/
# positioning properties that are commonly abused for UI redress.
CSS_PRESET_TEXT: frozenset[str] = frozenset(
    {
        "background-color",
        "color",
        "font-size",
        "font-style",
        "font-weight",
        "letter-spacing",
        "line-height",
        "text-align",
        "text-decoration",
        "text-transform",
        "white-space",
        "word-break",
        "word-spacing",
        "word-wrap",
    }
)
427
+
428
+
429
# Document-level variant of DEFAULT_POLICY: additionally allows the basic
# document scaffolding tags; every other knob is copied from DEFAULT_POLICY.
DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
    allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
    allowed_attributes=DEFAULT_POLICY.allowed_attributes,
    url_policy=DEFAULT_POLICY.url_policy,
    drop_comments=DEFAULT_POLICY.drop_comments,
    drop_doctype=DEFAULT_POLICY.drop_doctype,
    drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
    strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
    drop_content_tags=DEFAULT_POLICY.drop_content_tags,
    allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
    force_link_rel=DEFAULT_POLICY.force_link_rel,
)
441
+
442
+
443
+ def _is_valid_css_property_name(name: str) -> bool:
444
+ # Conservative: allow only ASCII letters/digits/hyphen.
445
+ # This keeps parsing deterministic and avoids surprises with escapes.
446
+ if not name:
447
+ return False
448
+ for ch in name:
449
+ if "a" <= ch <= "z" or "0" <= ch <= "9" or ch == "-":
450
+ continue
451
+ return False
452
+ return True
453
+
454
+
455
+ def _css_value_may_load_external_resource(value: str) -> bool:
456
+ # Extremely conservative check: drop any declaration value that contains a
457
+ # CSS function call that can load external resources.
458
+ #
459
+ # We intentionally do not try to parse full CSS (escapes, comments, strings,
460
+ # etc.). Instead, we reject values that contain backslashes (common escape
461
+ # obfuscation) or that *look* like they contain url(…) / image-set(…). This
462
+ # ensures style attributes can't be used to trigger network requests even
463
+ # when users allow potentially dangerous properties.
464
+ if "\\" in value:
465
+ return True
466
+
467
+ # Scan while ignoring ASCII whitespace/control chars and CSS comments.
468
+ # Keep a small rolling buffer to avoid extra allocations.
469
+ buf: list[str] = []
470
+ max_len = len("alphaimageloader")
471
+
472
+ i = 0
473
+ n = len(value)
474
+ while i < n:
475
+ ch = value[i]
476
+
477
+ # Treat CSS comments as ignorable, so obfuscation like u/**/rl( is caught.
478
+ if ch == "/" and i + 1 < n and value[i + 1] == "*":
479
+ i += 2
480
+ while i + 1 < n:
481
+ if value[i] == "*" and value[i + 1] == "/":
482
+ i += 2
483
+ break
484
+ i += 1
485
+ else:
486
+ # Unterminated comments are invalid CSS; be conservative.
487
+ return True
488
+ continue
489
+
490
+ o = ord(ch)
491
+ if o <= 0x20 or o == 0x7F:
492
+ i += 1
493
+ continue
494
+
495
+ if "A" <= ch <= "Z":
496
+ lower_ch = chr(o + 0x20)
497
+ else:
498
+ lower_ch = ch
499
+
500
+ buf.append(lower_ch)
501
+ if len(buf) > max_len:
502
+ buf.pop(0)
503
+
504
+ # Check for url( and image-set( anywhere in the normalized stream.
505
+ if len(buf) >= 4 and buf[-4:] == ["u", "r", "l", "("]:
506
+ return True
507
+ if len(buf) >= 10 and buf[-10:] == [
508
+ "i",
509
+ "m",
510
+ "a",
511
+ "g",
512
+ "e",
513
+ "-",
514
+ "s",
515
+ "e",
516
+ "t",
517
+ "(",
518
+ ]:
519
+ return True
520
+
521
+ # IE-only but still worth blocking defensively.
522
+ if len(buf) >= 11 and buf[-11:] == [
523
+ "e",
524
+ "x",
525
+ "p",
526
+ "r",
527
+ "e",
528
+ "s",
529
+ "s",
530
+ "i",
531
+ "o",
532
+ "n",
533
+ "(",
534
+ ]:
535
+ return True
536
+
537
+ # Legacy IE CSS filters that can fetch remote resources.
538
+ if len(buf) >= 7 and buf[-7:] == ["p", "r", "o", "g", "i", "d", ":"]:
539
+ return True
540
+ if len(buf) >= 16 and buf[-16:] == [
541
+ "a",
542
+ "l",
543
+ "p",
544
+ "h",
545
+ "a",
546
+ "i",
547
+ "m",
548
+ "a",
549
+ "g",
550
+ "e",
551
+ "l",
552
+ "o",
553
+ "a",
554
+ "d",
555
+ "e",
556
+ "r",
557
+ ]:
558
+ return True
559
+
560
+ # Legacy bindings/behaviors that can pull remote content.
561
+ if len(buf) >= 9 and buf[-9:] == ["b", "e", "h", "a", "v", "i", "o", "r", ":"]:
562
+ return True
563
+ if len(buf) >= 12 and buf[-12:] == [
564
+ "-",
565
+ "m",
566
+ "o",
567
+ "z",
568
+ "-",
569
+ "b",
570
+ "i",
571
+ "n",
572
+ "d",
573
+ "i",
574
+ "n",
575
+ "g",
576
+ ]:
577
+ return True
578
+
579
+ i += 1
580
+
581
+ return False
582
+
583
+
584
def _sanitize_inline_style(*, policy: SanitizationPolicy, value: str) -> str | None:
    """Filter a style="..." value down to allow-listed, resource-safe declarations.

    Returns the re-serialized declarations, or None when nothing survives
    (callers then drop the attribute entirely).
    """
    permitted = policy.allowed_css_properties
    if not permitted:
        return None

    raw = str(value)
    if not raw:
        return None

    kept: list[str] = []
    for declaration in raw.split(";"):
        text = declaration.strip()
        if not text:
            continue
        sep = text.find(":")
        if sep <= 0:
            continue

        # Property must be syntactically valid and on the allowlist.
        name = text[:sep].strip().lower()
        if not _is_valid_css_property_name(name) or name not in permitted:
            continue

        # Value must be non-empty and unable to trigger network loads.
        body = text[sep + 1 :].strip()
        if not body or _css_value_may_load_external_resource(body):
            continue

        kept.append(f"{name}: {body}")

    return "; ".join(kept) if kept else None
620
+
621
+
622
def _normalize_url_for_checking(value: str) -> str:
    """Return *value* with ASCII whitespace/control chars (and DEL) removed."""
    # Strip whitespace/control chars commonly used for scheme obfuscation.
    # Note: do not strip backslashes; they are not whitespace/control chars,
    # and removing them can turn invalid schemes into valid ones.
    return value.translate(_URL_NORMALIZE_STRIP_TABLE)
627
+
628
+
629
+ def _is_valid_scheme(scheme: str) -> bool:
630
+ first = scheme[0]
631
+ if not ("a" <= first <= "z" or "A" <= first <= "Z"):
632
+ return False
633
+ for ch in scheme[1:]:
634
+ if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9" or ch in "+-.":
635
+ continue
636
+ return False
637
+ return True
638
+
639
+
640
def _has_scheme(value: str) -> bool:
    """Return True if *value* starts with a syntactically valid URL scheme.

    The colon must come before any '/', '?' or '#', otherwise the colon
    belongs to the path/query/fragment rather than a scheme.
    """
    colon = value.find(":")
    if colon <= 0:
        return False
    limit = len(value)
    for separator in ("/", "?", "#"):
        pos = value.find(separator)
        if pos != -1 and pos < limit:
            limit = pos
    return colon < limit and _is_valid_scheme(value[:colon])
653
+
654
+
655
def _has_invalid_scheme_like_prefix(value: str) -> bool:
    """Return True if *value* has a colon-terminated prefix that looks like a
    scheme but is not syntactically valid (likely scheme obfuscation)."""
    colon = value.find(":")
    if colon <= 0:
        return False

    limit = len(value)
    for separator in ("/", "?", "#"):
        pos = value.find(separator)
        if pos != -1 and pos < limit:
            limit = pos
    if colon >= limit:
        return False

    return not _is_valid_scheme(value[:colon])
669
+
670
+
671
def _sanitize_url_value(
    *,
    policy: SanitizationPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
) -> str | None:
    """Sanitize a single-URL attribute value; returns the value to keep or None.

    Thin wrapper that always applies UrlPolicy.url_filter; srcset handling
    applies the filter once for the whole attribute instead (apply_filter=False).
    """
    return _sanitize_url_value_inner(policy=policy, rule=rule, tag=tag, attr=attr, value=value, apply_filter=True)
680
+
681
+
682
+ def _effective_proxy(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlProxy | None:
683
+ return rule.proxy if rule.proxy is not None else url_policy.proxy
684
+
685
+
686
+ def _effective_url_handling(*, url_policy: UrlPolicy, rule: UrlRule) -> UrlHandling:
687
+ # URL-like attributes are allowlisted via UrlPolicy.allow_rules. When they are
688
+ # allowlisted and the URL passes validation, the default action is to keep the URL.
689
+ return rule.handling if rule.handling is not None else "allow"
690
+
691
+
692
+ def _effective_allow_relative(*, url_policy: UrlPolicy, rule: UrlRule) -> bool:
693
+ return rule.allow_relative if rule.allow_relative is not None else url_policy.default_allow_relative
694
+
695
+
696
def _sanitize_url_value_inner(
    *,
    policy: SanitizationPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
    apply_filter: bool,
) -> str | None:
    """Validate/rewrite one URL value; returns the value to keep or None to drop.

    Order of checks: optional url_filter hook, control-char normalization,
    fragments, scheme-obfuscation (proxy mode only), protocol-relative URLs,
    absolute URLs, and finally relative URLs. Each category independently
    honors the effective handling mode ("allow"/"strip"/"proxy").
    """
    v = value
    url_policy = policy.url_policy
    mode = _effective_url_handling(url_policy=url_policy, rule=rule)
    allow_relative = _effective_allow_relative(url_policy=url_policy, rule=rule)

    if apply_filter and url_policy.url_filter is not None:
        rewritten = url_policy.url_filter(tag, attr, v)
        if rewritten is None:
            return None
        v = rewritten

    stripped = str(v).strip()
    normalized = _normalize_url_for_checking(stripped)
    if not normalized:
        # If normalization removes everything, the value was empty/whitespace/
        # control-only. Drop it rather than keeping weird control characters.
        return None

    # Same-document fragment (#foo).
    if normalized.startswith("#"):
        if not rule.allow_fragment:
            return None
        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
        return stripped

    if mode == "proxy" and _has_invalid_scheme_like_prefix(normalized):
        # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
        # Some user agents normalize backslashes and other characters during navigation.
        return None

    # Protocol-relative (//example.com/...).
    if normalized.startswith("//"):
        if not rule.resolve_protocol_relative:
            return None

        # Resolve to absolute URL for checking.
        resolved_scheme = rule.resolve_protocol_relative.lower()
        resolved_url = f"{resolved_scheme}:{normalized}"

        parsed = urlsplit(resolved_url)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None

        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None

        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=resolved_url)
        return resolved_url

    # Absolute URL with an explicit scheme.
    if _has_scheme(normalized):
        parsed = urlsplit(normalized)
        scheme = (parsed.scheme or "").lower()
        if scheme not in rule.allowed_schemes:
            return None
        if rule.allowed_hosts is not None:
            host = (parsed.hostname or "").lower()
            if not host or host not in rule.allowed_hosts:
                return None
        if mode == "strip":
            return None
        if mode == "proxy":
            proxy = _effective_proxy(url_policy=url_policy, rule=rule)
            return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
        return stripped

    # Everything else is treated as a relative URL.
    if not allow_relative:
        return None

    if mode == "strip":
        return None
    if mode == "proxy":
        proxy = _effective_proxy(url_policy=url_policy, rule=rule)
        return None if proxy is None else _proxy_url_value(proxy=proxy, value=stripped)
    return stripped
788
+
789
+
790
def _sanitize_srcset_value(
    *,
    policy: SanitizationPolicy,
    rule: UrlRule,
    tag: str,
    attr: str,
    value: str,
) -> str | None:
    """Sanitize a srcset attribute value; returns the rebuilt value or None.

    All-or-nothing: if any candidate URL fails validation, the whole
    attribute is dropped rather than partially rewritten.
    """
    # Apply the URL filter once to the whole attribute value.
    url_policy = policy.url_policy
    v = value
    if url_policy.url_filter is not None:
        rewritten = url_policy.url_filter(tag, attr, v)
        if rewritten is None:
            return None
        v = rewritten

    stripped = str(v).strip()
    if not stripped:
        return None

    out_candidates: list[str] = []
    for raw_candidate in stripped.split(","):
        c = raw_candidate.strip()
        if not c:
            continue

        # Each candidate is "<url> [descriptor]"; split on first whitespace run.
        parts = c.split(None, 1)
        url_token = parts[0]
        desc = parts[1].strip() if len(parts) == 2 else ""

        sanitized_url = _sanitize_url_value_inner(
            policy=policy,
            rule=rule,
            tag=tag,
            attr=attr,
            value=url_token,
            # The filter already ran on the full attribute value above.
            apply_filter=False,
        )
        if sanitized_url is None:
            return None

        out_candidates.append(f"{sanitized_url} {desc}".strip())

    return None if not out_candidates else ", ".join(out_candidates)
835
+
836
+
837
# Attribute names whose values are treated as URLs regardless of tag; in
# _sanitize_attrs these survive only when UrlPolicy.allow_rules has an
# explicit rule for the (tag, attr) pair.
_URL_LIKE_ATTRS: frozenset[str] = frozenset(
    {
        # Common URL-valued attributes.
        "href",
        "src",
        "srcset",
        "poster",
        "action",
        "formaction",
        "data",
        "cite",
        "background",
        # Can trigger requests/pings.
        "ping",
    }
)
853
+
854
+
855
def _url_is_external(normalized_url: str) -> bool:
    """Return True if *normalized_url* is absolute or protocol-relative.

    Empty strings and same-document fragments (#foo) are not external.
    """
    if not normalized_url or normalized_url.startswith("#"):
        return False
    return normalized_url.startswith("//") or _has_scheme(normalized_url)
863
+
864
+
865
def _srcset_contains_external_url(value: str) -> bool:
    """Return True if any srcset candidate URL is absolute/protocol-relative."""
    # Minimal srcset scanner: parse comma-separated candidates, taking the URL
    # up to the first whitespace of each candidate.
    i = 0
    n = len(value)
    while i < n:
        # Skip whitespace and commas.
        while i < n and value[i] in "\t\n\r\f ,":
            i += 1
        if i >= n:
            break

        # The URL token runs until the next whitespace or comma.
        start = i
        while i < n and value[i] not in "\t\n\r\f ,":
            i += 1
        candidate = value[start:i]

        normalized = _normalize_url_for_checking(candidate.strip())
        if _url_is_external(normalized):
            return True

        # Skip the rest of the candidate (descriptors) until the next comma.
        while i < n and value[i] != ",":
            i += 1
        if i < n and value[i] == ",":
            i += 1

    return False
893
+
894
+
895
def _sanitize_attrs(
    *,
    policy: SanitizationPolicy,
    tag: str,
    attrs: dict[str, str | None] | None,
    node: Any | None = None,
) -> dict[str, str | None]:
    """Return a sanitized copy of *attrs* for an element named *tag*.

    Drops namespaced attributes, on* event handlers, srcdoc, attributes not
    on the allowlist, and URL/style values that fail their checks; every drop
    is reported through policy.handle_unsafe(). Finally merges
    policy.force_link_rel tokens into <a rel="...">.
    """
    if not attrs:
        attrs = {}

    # Precomputed per-tag allowlist (global "*" attrs already merged in).
    allowed = policy._allowed_attrs_by_tag.get(tag) or policy._allowed_attrs_global

    out: dict[str, str | None] = {}
    for raw_name, raw_value in attrs.items():
        if not raw_name:
            continue

        name = raw_name
        # Optimization: assume name is already a string and stripped (from tokenizer)
        if not name.islower():
            name = name.lower()

        # Disallow namespace-ish attributes by default.
        if ":" in name:
            policy.handle_unsafe(f"Unsafe attribute '{name}' (namespaced)", node=node)
            continue

        # Always drop event handlers.
        if name.startswith("on"):
            policy.handle_unsafe(f"Unsafe attribute '{name}' (event handler)", node=node)
            continue

        # Dangerous attribute contexts.
        if name == "srcdoc":
            policy.handle_unsafe(f"Unsafe attribute '{name}'", node=node)
            continue

        # "rel" on <a> is tolerated even when not allowlisted so that
        # force_link_rel can merge with author-provided tokens below.
        if name not in allowed and not (tag == "a" and name == "rel" and policy.force_link_rel):
            policy.handle_unsafe(f"Unsafe attribute '{name}' (not allowed)", node=node)
            continue

        # Boolean attribute (no value) - nothing further to check.
        if raw_value is None:
            out[name] = None
            continue

        value = raw_value

        if name in _URL_LIKE_ATTRS:
            # URL-like attributes require an explicit (tag, attr) rule.
            rule = policy.url_policy.allow_rules.get((tag, name))
            if rule is None:
                policy.handle_unsafe(f"Unsafe URL in attribute '{name}' (no rule)", node=node)
                continue

            if name == "srcset":
                sanitized = _sanitize_srcset_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)
            else:
                sanitized = _sanitize_url_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)

            if sanitized is None:
                policy.handle_unsafe(f"Unsafe URL in attribute '{name}'", node=node)
                continue

            out[name] = sanitized
        elif name == "style":
            sanitized_style = _sanitize_inline_style(policy=policy, value=value)
            if sanitized_style is None:
                policy.handle_unsafe(f"Unsafe inline style in attribute '{name}'", node=node)
                continue
            out[name] = sanitized_style
        else:
            out[name] = value

    # Link hardening (merge tokens; do not remove existing ones).
    if tag == "a" and policy.force_link_rel:
        existing_raw = out.get("rel")
        existing: list[str] = []
        if isinstance(existing_raw, str) and existing_raw:
            for tok in existing_raw.split():
                t = tok.strip().lower()
                if t and t not in existing:
                    existing.append(t)
        for tok in sorted(policy.force_link_rel):
            if tok not in existing:
                existing.append(tok)
        out["rel"] = " ".join(existing)

    return out
982
+
983
+
984
def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
    """Append a sanitized copy of the subtree rooted at `original` under `parent_out`.

    Iterative traversal using an explicit LIFO stack of
    (original_node, output_parent) pairs, so arbitrarily deep trees do not
    hit Python's recursion limit. Children are pushed in reverse so that
    popping yields them in document order.

    Mutates `parent_out` in place; returns None.
    """
    stack: list[tuple[Any, Any]] = [(original, parent_out)]
    while stack:
        current, out_parent = stack.pop()
        name: str = current.name

        if name == "#text":
            # Text nodes are copied verbatim (shallow clone; text has no children).
            out_parent.append_child(current.clone_node(deep=False))
            continue

        if name == "#comment":
            # Comments are kept or dropped wholesale per policy.
            if policy.drop_comments:
                continue
            out_parent.append_child(current.clone_node(deep=False))
            continue

        if name == "!doctype":
            if policy.drop_doctype:
                continue
            out_parent.append_child(current.clone_node(deep=False))
            continue

        # Document containers.
        if name.startswith("#"):
            # Keep the container node itself and sanitize its children into it.
            clone = current.clone_node(deep=False)
            clone.children.clear()
            out_parent.append_child(clone)
            children = current.children or []
            # Reversed so the LIFO stack emits children in document order.
            stack.extend((child, clone) for child in reversed(children))
            continue

        # Element.
        tag = str(name).lower()
        if policy.drop_foreign_namespaces:
            ns = current.namespace
            if ns not in (None, "html"):
                # Non-HTML (e.g. SVG/MathML) elements are dropped with their subtree.
                policy.handle_unsafe(f"Unsafe tag '{tag}' (foreign namespace)", node=current)
                continue

        if tag in policy.drop_content_tags:
            # Element AND its entire content are removed (e.g. <script>).
            policy.handle_unsafe(f"Unsafe tag '{tag}' (dropped content)", node=current)
            continue

        if tag not in policy.allowed_tags:
            policy.handle_unsafe(f"Unsafe tag '{tag}' (not allowed)", node=current)
            if policy.strip_disallowed_tags:
                # Unwrap: drop the element but hoist its children to the
                # current output parent for sanitization.
                children = current.children or []
                stack.extend((child, out_parent) for child in reversed(children))

                if tag == "template" and current.namespace in (None, "html") and current.template_content:
                    # Template content lives in a separate subtree; hoist it too.
                    tc_children = current.template_content.children or []
                    stack.extend((child, out_parent) for child in reversed(tc_children))
            continue

        # Filter attributes first to avoid copying them in clone_node.
        sanitized_attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=current.attrs, node=current)
        clone = current.clone_node(deep=False, override_attrs=sanitized_attrs)

        out_parent.append_child(clone)

        # Template content is a separate subtree.
        if tag == "template" and current.namespace in (None, "html"):
            if current.template_content and clone.template_content:
                clone.template_content.children.clear()
                tc_children = current.template_content.children or []
                stack.extend((child, clone.template_content) for child in reversed(tc_children))

        children = current.children or []
        stack.extend((child, clone) for child in reversed(children))
1053
+
1054
+
1055
def _sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
    """Return a sanitized clone of `node`.

    Private implementation detail.

    If `policy` is not provided, JustHTML uses a conservative default policy.
    For full documents (`#document` roots) it preserves `<html>`, `<head>`, and
    `<body>` wrappers; for fragments it prefers snippet-shaped output.

    The input `node` is never mutated; every path returns a fresh clone. When
    the root itself is rejected by policy, the result degrades to a
    `#document-fragment` (possibly empty) rather than returning None, so
    callers always get a renderable node back.
    """

    if policy is None:
        policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY

    # Root handling.
    root_name: str = node.name

    if root_name == "#text":
        # Text roots are always safe; return a shallow copy.
        return node.clone_node(deep=False)

    if root_name == "#comment":
        out_root = node.clone_node(deep=False)
        if policy.drop_comments:
            # Dropped comment root degrades to an empty-looking fragment.
            out_root.name = "#document-fragment"
        return out_root

    if root_name == "!doctype":
        out_root = node.clone_node(deep=False)
        if policy.drop_doctype:
            out_root.name = "#document-fragment"
        return out_root

    # Containers (#document, #document-fragment, ...): keep the container,
    # sanitize each child subtree into it.
    if root_name.startswith("#"):
        out_root = node.clone_node(deep=False)
        out_root.children.clear()
        for child in node.children or []:
            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
        return out_root

    # Element root: keep element if allowed, otherwise unwrap into a fragment.
    tag = str(root_name).lower()
    if policy.drop_foreign_namespaces and node.namespace not in (None, "html"):
        # Foreign-namespace root: drop element, attributes, and subtree.
        policy.handle_unsafe(f"Unsafe tag '{tag}' (foreign namespace)", node=node)
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        return out_root

    if tag in policy.drop_content_tags or (tag not in policy.allowed_tags and not policy.strip_disallowed_tags):
        # Either the tag's content must be dropped (e.g. <script>), or the tag
        # is disallowed and the policy does not unwrap: discard everything.
        if tag in policy.drop_content_tags:
            policy.handle_unsafe(f"Unsafe tag '{tag}' (dropped content)", node=node)
        else:
            policy.handle_unsafe(f"Unsafe tag '{tag}' (not allowed)", node=node)
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        return out_root

    if tag not in policy.allowed_tags and policy.strip_disallowed_tags:
        # Unwrap: drop the element itself but keep its sanitized children
        # hoisted into a fragment.
        policy.handle_unsafe(f"Unsafe tag '{tag}' (not allowed)", node=node)
        out_root = node.clone_node(deep=False)
        out_root.name = "#document-fragment"
        out_root.children.clear()
        out_root.attrs.clear()
        for child in node.children or []:
            _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)

        if tag == "template" and node.namespace in (None, "html") and node.template_content:
            # Template content is a separate subtree; hoist it as well.
            for child in node.template_content.children or []:
                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
        return out_root

    # Allowed element root: clone with sanitized attributes, then children.
    out_root = node.clone_node(deep=False)
    out_root.children.clear()
    out_root.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=node.attrs, node=node)
    for child in node.children or []:
        _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)

    if tag == "template" and node.namespace in (None, "html"):
        # Sanitize template content into the clone's own template subtree.
        if node.template_content and out_root.template_content:
            out_root.template_content.children.clear()
            for child in node.template_content.children or []:
                _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root.template_content)

    return out_root