justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


justhtml/sanitize.py ADDED
@@ -0,0 +1,763 @@
1
+ """HTML sanitization policy API.
2
+
3
+ This module defines the public API for JustHTML sanitization.
4
+
5
+ The sanitizer operates on the parsed JustHTML DOM and is intentionally
6
+ policy-driven.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Callable, Collection, Mapping
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+ from urllib.parse import quote, urlsplit
15
+
16
+ UrlFilter = Callable[[str, str, str], str | None]
17
+
18
+
19
+ @dataclass(frozen=True, slots=True)
20
+ class UrlRule:
21
+ """Rule for a single URL-valued attribute (e.g. a[href], img[src]).
22
+
23
+ This is intentionally rendering-oriented.
24
+
25
+ - Returning/keeping a URL can still cause network requests when the output
26
+ is rendered (notably for <img src>). Applications like email viewers often
27
+ want to block remote loads by default.
28
+ """
29
+
30
+ # Allow relative URLs (including /path, ./path, ../path, ?query).
31
+ allow_relative: bool = True
32
+
33
+ # Allow same-document fragments (#foo). Typically safe.
34
+ allow_fragment: bool = True
35
+
36
+ # If set, protocol-relative URLs (//example.com) are resolved to this scheme
37
+ # (e.g. "https") before checking allowed_schemes.
38
+ # If None, protocol-relative URLs are disallowed.
39
+ resolve_protocol_relative: str | None = "https"
40
+
41
+ # Allow absolute URLs with these schemes (lowercase), e.g. {"https"}.
42
+ # If empty, all absolute URLs with a scheme are disallowed.
43
+ allowed_schemes: Collection[str] = field(default_factory=set)
44
+
45
+ # If provided, absolute URLs are allowed only if the parsed host is in this
46
+ # allowlist.
47
+ allowed_hosts: Collection[str] | None = None
48
+
49
+ # Optional proxy rewrite for allowed absolute/protocol-relative URLs.
50
+ # Example: proxy_url="/proxy" -> https://google.com becomes
51
+ # /proxy?url=https%3A%2F%2Fgoogle.com
52
+ proxy_url: str | None = None
53
+
54
+ # Query parameter name used when proxy_url is set.
55
+ proxy_param: str = "url"
56
+
57
+ def __post_init__(self) -> None:
58
+ # Accept lists/tuples from user code, normalize for internal use.
59
+ if not isinstance(self.allowed_schemes, set):
60
+ object.__setattr__(self, "allowed_schemes", set(self.allowed_schemes))
61
+ if self.allowed_hosts is not None and not isinstance(self.allowed_hosts, set):
62
+ object.__setattr__(self, "allowed_hosts", set(self.allowed_hosts))
63
+
64
+ if self.proxy_url is not None:
65
+ proxy_url = str(self.proxy_url)
66
+ object.__setattr__(self, "proxy_url", proxy_url if proxy_url else None)
67
+ object.__setattr__(self, "proxy_param", str(self.proxy_param))
68
+
69
+
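# Illustrative sketch: a UrlRule for image sources that keeps only https URLs
# from a single host and routes them through a local proxy endpoint. The
# "/image-proxy" path and the host are made-up values for the example.
img_rule = UrlRule(
    allow_relative=False,
    allowed_schemes={"https"},
    allowed_hosts={"img.example.com"},
    proxy_url="/image-proxy",
)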
70
+ def _proxy_url_value(*, proxy_url: str, proxy_param: str, value: str) -> str:
71
+ sep = "&" if "?" in proxy_url else "?"
72
+ return f"{proxy_url}{sep}{proxy_param}={quote(value, safe='')}"
73
+
74
+
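# Sketch of the rewrite this helper performs (values are illustrative):
assert _proxy_url_value(
    proxy_url="/proxy", proxy_param="url", value="https://google.com"
) == "/proxy?url=https%3A%2F%2Fgoogle.com"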
75
+ @dataclass(frozen=True, slots=True)
76
+ class SanitizationPolicy:
77
+ """An allow-list driven policy for sanitizing a parsed DOM.
78
+
79
+ This API is intentionally small. The implementation will interpret these
80
+ fields strictly.
81
+
82
+ - Tags not in `allowed_tags` are disallowed.
83
+ - Attributes not in `allowed_attributes[tag]` (or `allowed_attributes["*"]`)
84
+ are disallowed.
85
+ - URL scheme checks apply to attributes listed in `url_attributes`.
86
+
87
+ All tag and attribute names are expected to be ASCII-lowercase.
88
+ """
89
+
90
+ allowed_tags: Collection[str]
91
+ allowed_attributes: Mapping[str, Collection[str]]
92
+
93
+ # URL handling:
94
+ # - `url_rules` is the data-driven allowlist for URL-valued attributes.
95
+ # - `url_filter` is an optional hook that can drop or rewrite URLs.
96
+ #
97
+ # `url_filter(tag, attr, value)` should return:
98
+ # - a replacement string to keep (possibly rewritten), or
99
+ # - None to drop the attribute.
100
+ url_rules: Mapping[tuple[str, str], UrlRule]
101
+ url_filter: UrlFilter | None = None
102
+
103
+ drop_comments: bool = True
104
+ drop_doctype: bool = True
105
+ drop_foreign_namespaces: bool = True
106
+
107
+ # If True, disallowed elements are removed but their children may be kept
108
+ # (except for tags in `drop_content_tags`).
109
+ strip_disallowed_tags: bool = True
110
+
111
+ # Dangerous containers whose text payload should not be preserved.
112
+ drop_content_tags: Collection[str] = field(default_factory=lambda: {"script", "style"})
113
+
114
+ # Inline style allowlist.
115
+ # Only applies when the `style` attribute is allowed for a tag.
116
+ # If empty, inline styles are effectively disabled (style attributes are dropped).
117
+ allowed_css_properties: Collection[str] = field(default_factory=set)
118
+
119
+ # Link hardening.
120
+ # If non-empty, ensure these tokens are present in <a rel="...">.
121
+ # (The sanitizer will merge tokens; it will not remove existing ones.)
122
+ force_link_rel: Collection[str] = field(default_factory=set)
123
+
124
+ def __post_init__(self) -> None:
125
+ # Normalize to sets so the sanitizer can do fast membership checks.
126
+ if not isinstance(self.allowed_tags, set):
127
+ object.__setattr__(self, "allowed_tags", set(self.allowed_tags))
128
+
129
+ if not isinstance(self.allowed_attributes, dict) or any(
130
+ not isinstance(v, set) for v in self.allowed_attributes.values()
131
+ ):
132
+ normalized_attrs: dict[str, set[str]] = {}
133
+ for tag, attrs in self.allowed_attributes.items():
134
+ normalized_attrs[str(tag)] = attrs if isinstance(attrs, set) else set(attrs)
135
+ object.__setattr__(self, "allowed_attributes", normalized_attrs)
136
+
137
+ if not isinstance(self.drop_content_tags, set):
138
+ object.__setattr__(self, "drop_content_tags", set(self.drop_content_tags))
139
+
140
+ if not isinstance(self.allowed_css_properties, set):
141
+ object.__setattr__(self, "allowed_css_properties", set(self.allowed_css_properties))
142
+
143
+ if not isinstance(self.force_link_rel, set):
144
+ object.__setattr__(self, "force_link_rel", set(self.force_link_rel))
145
+
146
+
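# Illustrative sketch: a small custom policy built on this API. The tag and
# attribute choices and the tracking-parameter filter are example values,
# not package defaults.
def _drop_tracking_links(tag: str, attr: str, url: str) -> str | None:
    # Drop links carrying an example tracking parameter; keep everything else.
    return None if "utm_source=" in url else url

COMMENT_POLICY = SanitizationPolicy(
    allowed_tags={"p", "br", "a", "em", "strong", "code"},
    allowed_attributes={"a": {"href"}},
    url_rules={("a", "href"): UrlRule(allowed_schemes={"https"})},
    url_filter=_drop_tracking_links,
    force_link_rel={"nofollow", "noopener", "noreferrer"},
)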
147
+ DEFAULT_POLICY: SanitizationPolicy = SanitizationPolicy(
148
+ allowed_tags=[
149
+ # Text / structure
150
+ "p",
151
+ "br",
152
+ # Structure
153
+ "div",
154
+ "span",
155
+ "blockquote",
156
+ "pre",
157
+ "code",
158
+ # Headings
159
+ "h1",
160
+ "h2",
161
+ "h3",
162
+ "h4",
163
+ "h5",
164
+ "h6",
165
+ # Lists
166
+ "ul",
167
+ "ol",
168
+ "li",
169
+ # Tables
170
+ "table",
171
+ "thead",
172
+ "tbody",
173
+ "tfoot",
174
+ "tr",
175
+ "th",
176
+ "td",
177
+ # Text formatting
178
+ "b",
179
+ "strong",
180
+ "i",
181
+ "em",
182
+ "u",
183
+ "s",
184
+ "sub",
185
+ "sup",
186
+ "small",
187
+ "mark",
188
+ # Horizontal rule
189
+ # (thematic break)
190
+ "hr",
191
+ # Links and images
192
+ "a",
193
+ "img",
194
+ ],
195
+ allowed_attributes={
196
+ "*": ["class", "id", "title", "lang", "dir"],
197
+ "a": ["href", "title"],
198
+ "img": ["src", "alt", "title", "width", "height", "loading", "decoding"],
199
+ "th": ["colspan", "rowspan"],
200
+ "td": ["colspan", "rowspan"],
201
+ },
202
+ # Default URL stance:
203
+ # - Links may point to http/https/mailto/tel and relative URLs.
204
+ # - Images may point to http/https and relative URLs.
205
+ url_rules={
206
+ ("a", "href"): UrlRule(
207
+ allowed_schemes=["http", "https", "mailto", "tel"],
208
+ resolve_protocol_relative="https",
209
+ ),
210
+ ("img", "src"): UrlRule(
211
+ allowed_schemes=["http", "https"],
212
+ resolve_protocol_relative="https",
213
+ ),
214
+ },
215
+ allowed_css_properties=set(),
216
+ )
217
+
218
+
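# Sketch: SanitizationPolicy is a frozen dataclass, so variants of the default
# can be derived with dataclasses.replace. The rel tokens are example values.
from dataclasses import replace

HARDENED_POLICY = replace(
    DEFAULT_POLICY,
    force_link_rel={"nofollow", "noopener", "noreferrer"},
)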
219
+ DEFAULT_DOCUMENT_POLICY: SanitizationPolicy = SanitizationPolicy(
220
+ allowed_tags=sorted(set(DEFAULT_POLICY.allowed_tags) | {"html", "head", "body", "title"}),
221
+ allowed_attributes=DEFAULT_POLICY.allowed_attributes,
222
+ url_rules=DEFAULT_POLICY.url_rules,
223
+ url_filter=DEFAULT_POLICY.url_filter,
224
+ drop_comments=DEFAULT_POLICY.drop_comments,
225
+ drop_doctype=DEFAULT_POLICY.drop_doctype,
226
+ drop_foreign_namespaces=DEFAULT_POLICY.drop_foreign_namespaces,
227
+ strip_disallowed_tags=DEFAULT_POLICY.strip_disallowed_tags,
228
+ drop_content_tags=DEFAULT_POLICY.drop_content_tags,
229
+ allowed_css_properties=DEFAULT_POLICY.allowed_css_properties,
230
+ force_link_rel=DEFAULT_POLICY.force_link_rel,
231
+ )
232
+
233
+
234
+ def _is_valid_css_property_name(name: str) -> bool:
235
+ # Conservative: allow only lowercase ASCII letters, digits, and hyphens.
236
+ # This keeps parsing deterministic and avoids surprises with escapes.
237
+ if not name:
238
+ return False
239
+ for ch in name:
240
+ if "a" <= ch <= "z" or "0" <= ch <= "9" or ch == "-":
241
+ continue
242
+ return False
243
+ return True
244
+
245
+
246
+ def _css_value_may_load_external_resource(value: str) -> bool:
247
+ # Extremely conservative check: drop any declaration value that contains a
248
+ # CSS function call that can load external resources.
249
+ #
250
+ # We intentionally do not try to parse full CSS (escapes, comments, strings,
251
+ # etc.). Instead, we reject values that contain backslashes (common escape
252
+ # obfuscation) or that *look* like they contain url(…) / image-set(…). This
253
+ # ensures style attributes can't be used to trigger network requests even
254
+ # when users allow potentially dangerous properties.
255
+ if "\\" in value:
256
+ return True
257
+
258
+ # Scan while ignoring ASCII whitespace/control chars and CSS comments.
259
+ # Keep a small rolling buffer to avoid extra allocations.
260
+ buf: list[str] = []
261
+ max_len = len("alphaimageloader")
262
+
263
+ i = 0
264
+ n = len(value)
265
+ while i < n:
266
+ ch = value[i]
267
+
268
+ # Treat CSS comments as ignorable, so obfuscation like u/**/rl( is caught.
269
+ if ch == "/" and i + 1 < n and value[i + 1] == "*":
270
+ i += 2
271
+ while i + 1 < n:
272
+ if value[i] == "*" and value[i + 1] == "/":
273
+ i += 2
274
+ break
275
+ i += 1
276
+ else:
277
+ # Unterminated comments are invalid CSS; be conservative.
278
+ return True
279
+ continue
280
+
281
+ o = ord(ch)
282
+ if o <= 0x20 or o == 0x7F:
283
+ i += 1
284
+ continue
285
+
286
+ if "A" <= ch <= "Z":
287
+ lower_ch = chr(o + 0x20)
288
+ else:
289
+ lower_ch = ch
290
+
291
+ buf.append(lower_ch)
292
+ if len(buf) > max_len:
293
+ buf.pop(0)
294
+
295
+ # Check for url( and image-set( anywhere in the normalized stream.
296
+ if len(buf) >= 4 and buf[-4:] == ["u", "r", "l", "("]:
297
+ return True
298
+ if len(buf) >= 10 and buf[-10:] == [
299
+ "i",
300
+ "m",
301
+ "a",
302
+ "g",
303
+ "e",
304
+ "-",
305
+ "s",
306
+ "e",
307
+ "t",
308
+ "(",
309
+ ]:
310
+ return True
311
+
312
+ # IE-only but still worth blocking defensively.
313
+ if len(buf) >= 11 and buf[-11:] == [
314
+ "e",
315
+ "x",
316
+ "p",
317
+ "r",
318
+ "e",
319
+ "s",
320
+ "s",
321
+ "i",
322
+ "o",
323
+ "n",
324
+ "(",
325
+ ]:
326
+ return True
327
+
328
+ # Legacy IE CSS filters that can fetch remote resources.
329
+ if len(buf) >= 7 and buf[-7:] == ["p", "r", "o", "g", "i", "d", ":"]:
330
+ return True
331
+ if len(buf) >= 16 and buf[-16:] == [
332
+ "a",
333
+ "l",
334
+ "p",
335
+ "h",
336
+ "a",
337
+ "i",
338
+ "m",
339
+ "a",
340
+ "g",
341
+ "e",
342
+ "l",
343
+ "o",
344
+ "a",
345
+ "d",
346
+ "e",
347
+ "r",
348
+ ]:
349
+ return True
350
+
351
+ # Legacy bindings/behaviors that can pull remote content.
352
+ if len(buf) >= 9 and buf[-9:] == ["b", "e", "h", "a", "v", "i", "o", "r", ":"]:
353
+ return True
354
+ if len(buf) >= 12 and buf[-12:] == [
355
+ "-",
356
+ "m",
357
+ "o",
358
+ "z",
359
+ "-",
360
+ "b",
361
+ "i",
362
+ "n",
363
+ "d",
364
+ "i",
365
+ "n",
366
+ "g",
367
+ ]:
368
+ return True
369
+
370
+ i += 1
371
+
372
+ return False
373
+
374
+
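# Sketch of what the conservative scan above flags (True means the declaration
# value would be dropped):
assert _css_value_may_load_external_resource("url(https://evil.example/a.gif)")
assert _css_value_may_load_external_resource("u/**/rl(https://evil.example)")  # comment obfuscation
assert _css_value_may_load_external_resource("expression(alert(1))")  # legacy IE
assert not _css_value_may_load_external_resource("1px solid #ccc")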
375
+ def _sanitize_inline_style(*, policy: SanitizationPolicy, value: str) -> str | None:
376
+ allowed = policy.allowed_css_properties
377
+ if not allowed:
378
+ return None
379
+
380
+ v = str(value)
381
+ if not v:
382
+ return None
383
+
384
+ out_parts: list[str] = []
385
+ for decl in v.split(";"):
386
+ d = decl.strip()
387
+ if not d:
388
+ continue
389
+ colon = d.find(":")
390
+ if colon <= 0:
391
+ continue
392
+
393
+ prop = d[:colon].strip().lower()
394
+ if not _is_valid_css_property_name(prop):
395
+ continue
396
+ if prop not in allowed:
397
+ continue
398
+
399
+ prop_value = d[colon + 1 :].strip()
400
+ if not prop_value:
401
+ continue
402
+
403
+ if _css_value_may_load_external_resource(prop_value):
404
+ continue
405
+
406
+ out_parts.append(f"{prop}: {prop_value}")
407
+
408
+ if not out_parts:
409
+ return None
410
+ return "; ".join(out_parts)
411
+
412
+
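# Sketch: with an illustrative allowlist of {"color", "text-align"}, only those
# declarations survive, and values that could trigger a fetch are dropped.
_example_policy = SanitizationPolicy(
    allowed_tags={"span"},
    allowed_attributes={"span": {"style"}},
    url_rules={},
    allowed_css_properties={"color", "text-align"},
)
assert _sanitize_inline_style(
    policy=_example_policy,
    value="color: red; position: fixed; background: url(https://evil.example)",
) == "color: red"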
413
+ def _normalize_url_for_checking(value: str) -> str:
414
+ # Strip whitespace/control chars commonly used for scheme obfuscation.
415
+ # Note: do not strip backslashes; they are not whitespace/control chars,
416
+ # and removing them can turn invalid schemes into valid ones.
417
+ out: list[str] = []
418
+ for ch in value:
419
+ o = ord(ch)
420
+ if o <= 0x20 or o == 0x7F:
421
+ continue
422
+ out.append(ch)
423
+ return "".join(out)
424
+
425
+
426
+ def _is_valid_scheme(scheme: str) -> bool:
427
+ first = scheme[0]
428
+ if not ("a" <= first <= "z" or "A" <= first <= "Z"):
429
+ return False
430
+ for ch in scheme[1:]:
431
+ if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9" or ch in "+-.":
432
+ continue
433
+ return False
434
+ return True
435
+
436
+
437
+ def _has_scheme(value: str) -> bool:
438
+ idx = value.find(":")
439
+ if idx <= 0:
440
+ return False
441
+ # Scheme must appear before any path/query/fragment separator.
442
+ end = len(value)
443
+ for sep in ("/", "?", "#"):
444
+ j = value.find(sep)
445
+ if j != -1 and j < end:
446
+ end = j
447
+ if idx >= end:
448
+ return False
449
+ return _is_valid_scheme(value[:idx])
450
+
451
+
452
+ def _has_invalid_scheme_like_prefix(value: str) -> bool:
453
+ idx = value.find(":")
454
+ if idx <= 0:
455
+ return False
456
+
457
+ end = len(value)
458
+ for sep in ("/", "?", "#"):
459
+ j = value.find(sep)
460
+ if j != -1 and j < end:
461
+ end = j
462
+ if idx >= end:
463
+ return False
464
+
465
+ return not _is_valid_scheme(value[:idx])
466
+
467
+
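# Sketch: how the helpers above treat an obfuscated scheme. The embedded tab is
# stripped before checking, so the value is recognized as having a scheme
# instead of being treated as a relative URL.
assert _normalize_url_for_checking("jav\tascript:alert(1)") == "javascript:alert(1)"
assert _has_scheme("javascript:alert(1)")
assert not _has_scheme("./docs/readme.html")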
468
+ def _sanitize_url_value(
469
+ *,
470
+ policy: SanitizationPolicy,
471
+ rule: UrlRule,
472
+ tag: str,
473
+ attr: str,
474
+ value: str,
475
+ ) -> str | None:
476
+ v = value
477
+ if policy.url_filter is not None:
478
+ rewritten = policy.url_filter(tag, attr, v)
479
+ if rewritten is None:
480
+ return None
481
+ v = rewritten
482
+
483
+ stripped = str(v).strip()
484
+ normalized = _normalize_url_for_checking(stripped)
485
+ if not normalized:
486
+ # If normalization removes everything, the value was empty/whitespace/
487
+ # control-only. Drop it rather than keeping weird control characters.
488
+ return None
489
+
490
+ if normalized.startswith("#"):
491
+ return stripped if rule.allow_fragment else None
492
+
493
+ # If proxying is enabled, do not treat scheme-obfuscation as a relative URL.
494
+ # Some user agents normalize backslashes and other characters during navigation.
495
+ if rule.proxy_url and _has_invalid_scheme_like_prefix(normalized):
496
+ return None
497
+
498
+ if normalized.startswith("//"):
499
+ if not rule.resolve_protocol_relative:
500
+ return None
501
+
502
+ # Resolve to absolute URL for checking.
503
+ resolved_scheme = rule.resolve_protocol_relative.lower()
504
+ resolved_url = f"{resolved_scheme}:{normalized}"
505
+
506
+ parsed = urlsplit(resolved_url)
507
+ scheme = (parsed.scheme or "").lower()
508
+ if scheme not in rule.allowed_schemes:
509
+ return None
510
+
511
+ if rule.allowed_hosts is not None:
512
+ host = (parsed.hostname or "").lower()
513
+ if not host or host not in rule.allowed_hosts:
514
+ return None
515
+
516
+ # Return the resolved URL.
517
+ return (
518
+ _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=resolved_url)
519
+ if rule.proxy_url
520
+ else resolved_url
521
+ )
522
+
523
+ if _has_scheme(normalized):
524
+ parsed = urlsplit(normalized)
525
+ scheme = (parsed.scheme or "").lower()
526
+ if scheme not in rule.allowed_schemes:
527
+ return None
528
+ if rule.allowed_hosts is not None:
529
+ host = (parsed.hostname or "").lower()
530
+ if not host or host not in rule.allowed_hosts:
531
+ return None
532
+ return (
533
+ _proxy_url_value(proxy_url=rule.proxy_url, proxy_param=rule.proxy_param, value=stripped)
534
+ if rule.proxy_url
535
+ else stripped
536
+ )
537
+
538
+ return stripped if rule.allow_relative else None
539
+
540
+
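# Sketch: the default a[href] rule keeps https links, resolves protocol-relative
# URLs against https, and drops javascript: URLs.
_href_rule = DEFAULT_POLICY.url_rules[("a", "href")]
assert _sanitize_url_value(
    policy=DEFAULT_POLICY, rule=_href_rule, tag="a", attr="href",
    value="//cdn.example.com/lib.js",
) == "https://cdn.example.com/lib.js"
assert _sanitize_url_value(
    policy=DEFAULT_POLICY, rule=_href_rule, tag="a", attr="href",
    value="javascript:alert(1)",
) is None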
541
+ def _sanitize_attrs(
542
+ *,
543
+ policy: SanitizationPolicy,
544
+ tag: str,
545
+ attrs: dict[str, str | None] | None,
546
+ ) -> dict[str, str | None]:
547
+ if not attrs:
548
+ attrs = {}
549
+
550
+ allowed_global = set(policy.allowed_attributes.get("*", ()))
551
+ allowed_tag = set(policy.allowed_attributes.get(tag, ()))
552
+ allowed = allowed_global | allowed_tag
553
+
554
+ out: dict[str, str | None] = {}
555
+ for raw_name, raw_value in attrs.items():
556
+ if not raw_name:
557
+ continue
558
+
559
+ name = str(raw_name).strip().lower()
560
+ if not name:
561
+ continue
562
+
563
+ # Disallow namespace-ish attributes by default.
564
+ if ":" in name:
565
+ continue
566
+
567
+ # Always drop event handlers.
568
+ if name.startswith("on"):
569
+ continue
570
+
571
+ # Dangerous attribute contexts.
572
+ if name == "srcdoc":
573
+ continue
574
+
575
+ if name not in allowed and not (tag == "a" and name == "rel" and policy.force_link_rel):
576
+ continue
577
+
578
+ if raw_value is None:
579
+ out[name] = None
580
+ continue
581
+
582
+ value = str(raw_value)
583
+ rule = policy.url_rules.get((tag, name))
584
+ if rule is not None:
585
+ sanitized = _sanitize_url_value(policy=policy, rule=rule, tag=tag, attr=name, value=value)
586
+ if sanitized is None:
587
+ continue
588
+ out[name] = sanitized
589
+ elif name == "style":
590
+ sanitized_style = _sanitize_inline_style(policy=policy, value=value)
591
+ if sanitized_style is None:
592
+ continue
593
+ out[name] = sanitized_style
594
+ else:
595
+ out[name] = value
596
+
597
+ # Link hardening (merge tokens; do not remove existing ones).
598
+ forced_tokens = [t.strip().lower() for t in policy.force_link_rel if str(t).strip()]
599
+ if tag == "a" and forced_tokens:
600
+ existing_raw = out.get("rel")
601
+ existing: list[str] = []
602
+ if isinstance(existing_raw, str) and existing_raw:
603
+ for tok in existing_raw.split():
604
+ t = tok.strip().lower()
605
+ if t and t not in existing:
606
+ existing.append(t)
607
+ for tok in sorted(forced_tokens):
608
+ if tok not in existing:
609
+ existing.append(tok)
610
+ out["rel"] = " ".join(existing)
611
+
612
+ return out
613
+
614
+
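# Sketch: under the default policy, event handlers and javascript: URLs are
# dropped while allowed attributes pass through unchanged.
assert _sanitize_attrs(
    policy=DEFAULT_POLICY,
    tag="a",
    attrs={"href": "javascript:alert(1)", "onclick": "alert(1)", "title": "docs"},
) == {"title": "docs"}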
615
+ def _append_sanitized_subtree(*, policy: SanitizationPolicy, original: Any, parent_out: Any) -> None:
616
+ stack: list[tuple[Any, Any]] = [(original, parent_out)]
617
+ while stack:
618
+ current, out_parent = stack.pop()
619
+ name: str = current.name
620
+
621
+ if name == "#text":
622
+ out_parent.append_child(current.clone_node(deep=False))
623
+ continue
624
+
625
+ if name == "#comment":
626
+ if policy.drop_comments:
627
+ continue
628
+ out_parent.append_child(current.clone_node(deep=False))
629
+ continue
630
+
631
+ if name == "!doctype":
632
+ if policy.drop_doctype:
633
+ continue
634
+ out_parent.append_child(current.clone_node(deep=False))
635
+ continue
636
+
637
+ # Document containers.
638
+ if name.startswith("#"):
639
+ clone = current.clone_node(deep=False)
640
+ clone.children.clear()
641
+ out_parent.append_child(clone)
642
+ children = current.children or []
643
+ stack.extend((child, clone) for child in reversed(children))
644
+ continue
645
+
646
+ # Element.
647
+ tag = str(name).lower()
648
+ if policy.drop_foreign_namespaces:
649
+ ns = current.namespace
650
+ if ns not in (None, "html"):
651
+ continue
652
+
653
+ if tag in policy.drop_content_tags:
654
+ continue
655
+
656
+ if tag not in policy.allowed_tags:
657
+ if policy.strip_disallowed_tags:
658
+ children = current.children or []
659
+ stack.extend((child, out_parent) for child in reversed(children))
660
+
661
+ if tag == "template" and current.namespace in (None, "html") and current.template_content:
662
+ tc_children = current.template_content.children or []
663
+ stack.extend((child, out_parent) for child in reversed(tc_children))
664
+ continue
665
+
666
+ clone = current.clone_node(deep=False)
667
+ # Ensure children list is empty before we append sanitized descendants.
668
+ clone.children.clear()
669
+ # Filter attributes.
670
+ clone.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=current.attrs)
671
+
672
+ out_parent.append_child(clone)
673
+
674
+ # Template content is a separate subtree.
675
+ if tag == "template" and current.namespace in (None, "html"):
676
+ if current.template_content and clone.template_content:
677
+ clone.template_content.children.clear()
678
+ tc_children = current.template_content.children or []
679
+ stack.extend((child, clone.template_content) for child in reversed(tc_children))
680
+
681
+ children = current.children or []
682
+ stack.extend((child, clone) for child in reversed(children))
683
+
684
+
685
+ def sanitize(node: Any, *, policy: SanitizationPolicy | None = None) -> Any:
686
+ """Return a sanitized clone of `node`.
687
+
688
+ If `policy` is not provided, JustHTML uses a conservative default policy.
689
+ For full documents (`#document` roots) it preserves `<html>`, `<head>`, and
690
+ `<body>` wrappers; for fragments it prefers snippet-shaped output.
691
+ """
692
+
693
+ if policy is None:
694
+ policy = DEFAULT_DOCUMENT_POLICY if node.name == "#document" else DEFAULT_POLICY
695
+
696
+ # Root handling.
697
+ root_name: str = node.name
698
+
699
+ if root_name == "#text":
700
+ return node.clone_node(deep=False)
701
+
702
+ if root_name == "#comment":
703
+ out_root = node.clone_node(deep=False)
704
+ if policy.drop_comments:
705
+ out_root.name = "#document-fragment"
706
+ return out_root
707
+
708
+ if root_name == "!doctype":
709
+ out_root = node.clone_node(deep=False)
710
+ if policy.drop_doctype:
711
+ out_root.name = "#document-fragment"
712
+ return out_root
713
+
714
+ # Containers.
715
+ if root_name.startswith("#"):
716
+ out_root = node.clone_node(deep=False)
717
+ out_root.children.clear()
718
+ for child in node.children or []:
719
+ _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
720
+ return out_root
721
+
722
+ # Element root: keep element if allowed, otherwise unwrap into a fragment.
723
+ tag = str(root_name).lower()
724
+ if policy.drop_foreign_namespaces and node.namespace not in (None, "html"):
725
+ out_root = node.clone_node(deep=False)
726
+ out_root.name = "#document-fragment"
727
+ out_root.children.clear()
728
+ out_root.attrs.clear()
729
+ return out_root
730
+
731
+ if tag in policy.drop_content_tags or (tag not in policy.allowed_tags and not policy.strip_disallowed_tags):
732
+ out_root = node.clone_node(deep=False)
733
+ out_root.name = "#document-fragment"
734
+ out_root.children.clear()
735
+ out_root.attrs.clear()
736
+ return out_root
737
+
738
+ if tag not in policy.allowed_tags and policy.strip_disallowed_tags:
739
+ out_root = node.clone_node(deep=False)
740
+ out_root.name = "#document-fragment"
741
+ out_root.children.clear()
742
+ out_root.attrs.clear()
743
+ for child in node.children or []:
744
+ _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
745
+
746
+ if tag == "template" and node.namespace in (None, "html") and node.template_content:
747
+ for child in node.template_content.children or []:
748
+ _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
749
+ return out_root
750
+
751
+ out_root = node.clone_node(deep=False)
752
+ out_root.children.clear()
753
+ out_root.attrs = _sanitize_attrs(policy=policy, tag=tag, attrs=node.attrs)
754
+ for child in node.children or []:
755
+ _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root)
756
+
757
+ if tag == "template" and node.namespace in (None, "html"):
758
+ if node.template_content and out_root.template_content:
759
+ out_root.template_content.children.clear()
760
+ for child in node.template_content.children or []:
761
+ _append_sanitized_subtree(policy=policy, original=child, parent_out=out_root.template_content)
762
+
763
+ return out_root
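# Sketch of end-to-end use. `node` stands for a parsed JustHTML DOM node
# (obtained from the package's parser, which is not part of this diff). The
# policy below is an illustrative email-style variant that allows only
# relative image URLs, so rendering cannot trigger remote image loads.
from dataclasses import replace

from justhtml.sanitize import DEFAULT_POLICY, UrlRule, sanitize

email_policy = replace(
    DEFAULT_POLICY,
    url_rules={
        ("a", "href"): DEFAULT_POLICY.url_rules[("a", "href")],
        ("img", "src"): UrlRule(allowed_schemes=set()),  # relative-only images
    },
    force_link_rel={"noopener", "noreferrer"},
)

clean = sanitize(node, policy=email_policy)  # `node`: parsed DOM from the parser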