justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/transforms.py ADDED
@@ -0,0 +1,2568 @@
1
+ """Constructor-time DOM transforms.
2
+
3
+ These transforms are intended as a migration path for Bleach/html5lib-style
4
+ post-processing, but are implemented as DOM (tree) operations to match
5
+ JustHTML's architecture.
6
+
7
+ Safety model: transforms shape the in-memory tree; safe-by-default output is
8
+ still enforced by `to_html()`/`to_text()`/`to_markdown()` via sanitization.
9
+
10
+ Performance: selectors are compiled (parsed) once before application.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from contextvars import ContextVar
17
+ from dataclasses import dataclass
18
+ from enum import Enum
19
+ from typing import TYPE_CHECKING, ClassVar, Literal, cast
20
+
21
+ from .constants import VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
22
+ from .linkify import LinkifyConfig, find_links_with_config
23
+ from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
24
+ from .sanitize import (
25
+ _URL_LIKE_ATTRS,
26
+ DEFAULT_POLICY,
27
+ SanitizationPolicy,
28
+ UrlPolicy,
29
+ _sanitize_inline_style,
30
+ _sanitize_srcset_value,
31
+ _sanitize_url_value,
32
+ )
33
+ from .selector import SelectorMatcher, parse_selector
34
+ from .serialize import serialize_end_tag, serialize_start_tag
35
+ from .tokens import ParseError
36
+
37
if TYPE_CHECKING:
    # Everything in this branch exists for static type checkers only; none of
    # these names are bound at runtime.
    from collections.abc import Callable, Collection
    from typing import Any, Protocol

    from .selector import ParsedSelector

    class NodeCallback(Protocol):
        """Callable that receives a single DOM node and returns nothing."""

        def __call__(self, node: SimpleDomNode) -> None: ...

    class EditAttrsCallback(Protocol):
        """Callable that receives a node and returns an attribute dict or None."""

        def __call__(self, node: SimpleDomNode) -> dict[str, str | None] | None: ...

    class ReportCallback(Protocol):
        """Callable that receives a message and an optional keyword-only node."""

        def __call__(self, msg: str, *, node: Any | None = None) -> None: ...
51
+
52
+
53
+ # -----------------
54
+ # Public transforms
55
+ # -----------------
56
+
57
+
58
# Context-local list that collects transform errors; None means "no sink active".
_ERROR_SINK: ContextVar[list[ParseError] | None] = ContextVar("justhtml_transform_error_sink", default=None)


def emit_error(
    code: str,
    *,
    node: SimpleDomNode | None = None,
    line: int | None = None,
    column: int | None = None,
    category: str = "transform",
    message: str | None = None,
) -> None:
    """Record a ParseError on the currently active error sink.

    Meant to be called from inside transform callbacks. When no sink is
    active in the current context the call does nothing.

    Args:
        code: Error code; stringified before use.
        node: When given, its ``origin_line``/``origin_col`` replace any
            explicitly passed ``line``/``column``.
        line: Source line used when ``node`` is None.
        column: Source column used when ``node`` is None.
        category: Error category; stringified before use.
        message: Human-readable text; falls back to ``str(code)`` when None.
    """
    active_sink = _ERROR_SINK.get()
    if active_sink is None:
        return

    if node is not None:
        # The node's recorded origin takes precedence over explicit arguments.
        line, column = node.origin_line, node.origin_col

    text = str(code) if message is None else str(message)
    error = ParseError(
        str(code),
        line=line,
        column=column,
        category=str(category),
        message=text,
    )
    active_sink.append(error)
93
+
94
+
95
class _StrEnum(str, Enum):
    """Small stand-in for ``enum.StrEnum``, which only exists on Python 3.11+.

    The package supports Python 3.10+, so string-valued enums derive from this
    ``str``/``Enum`` mixin. No methods are overridden here.
    """
100
+
101
+
102
class DecideAction(_StrEnum):
    """String-valued actions a ``Decide`` callback may return."""

    KEEP = "keep"
    DROP = "drop"
    UNWRAP = "unwrap"
    EMPTY = "empty"
    ESCAPE = "escape"
108
+
109
+
110
+ @dataclass(frozen=True, slots=True)
111
+ class SetAttrs:
112
+ selector: str
113
+ attrs: dict[str, str | None]
114
+ enabled: bool
115
+ callback: NodeCallback | None
116
+ report: ReportCallback | None
117
+
118
+ def __init__(
119
+ self,
120
+ selector: str,
121
+ *,
122
+ enabled: bool = True,
123
+ callback: NodeCallback | None = None,
124
+ report: ReportCallback | None = None,
125
+ attributes: dict[str, str | None] | None = None,
126
+ **attrs: str | None,
127
+ ) -> None:
128
+ object.__setattr__(self, "selector", str(selector))
129
+ merged = dict(attributes) if attributes else {}
130
+ merged.update(attrs)
131
+ object.__setattr__(self, "attrs", merged)
132
+ object.__setattr__(self, "enabled", bool(enabled))
133
+ object.__setattr__(self, "callback", callback)
134
+ object.__setattr__(self, "report", report)
135
+
136
+
137
+ @dataclass(frozen=True, slots=True)
138
+ class Drop:
139
+ selector: str
140
+
141
+ enabled: bool
142
+ callback: NodeCallback | None
143
+ report: ReportCallback | None
144
+
145
+ def __init__(
146
+ self,
147
+ selector: str,
148
+ *,
149
+ enabled: bool = True,
150
+ callback: NodeCallback | None = None,
151
+ report: ReportCallback | None = None,
152
+ ) -> None:
153
+ object.__setattr__(self, "selector", str(selector))
154
+ object.__setattr__(self, "enabled", bool(enabled))
155
+ object.__setattr__(self, "callback", callback)
156
+ object.__setattr__(self, "report", report)
157
+
158
+
159
+ @dataclass(frozen=True, slots=True)
160
+ class Unwrap:
161
+ selector: str
162
+
163
+ enabled: bool
164
+ callback: NodeCallback | None
165
+ report: ReportCallback | None
166
+
167
+ def __init__(
168
+ self,
169
+ selector: str,
170
+ *,
171
+ enabled: bool = True,
172
+ callback: NodeCallback | None = None,
173
+ report: ReportCallback | None = None,
174
+ ) -> None:
175
+ object.__setattr__(self, "selector", str(selector))
176
+ object.__setattr__(self, "enabled", bool(enabled))
177
+ object.__setattr__(self, "callback", callback)
178
+ object.__setattr__(self, "report", report)
179
+
180
+
181
+ @dataclass(frozen=True, slots=True)
182
+ class Empty:
183
+ selector: str
184
+
185
+ enabled: bool
186
+ callback: NodeCallback | None
187
+ report: ReportCallback | None
188
+
189
+ def __init__(
190
+ self,
191
+ selector: str,
192
+ *,
193
+ enabled: bool = True,
194
+ callback: NodeCallback | None = None,
195
+ report: ReportCallback | None = None,
196
+ ) -> None:
197
+ object.__setattr__(self, "selector", str(selector))
198
+ object.__setattr__(self, "enabled", bool(enabled))
199
+ object.__setattr__(self, "callback", callback)
200
+ object.__setattr__(self, "report", report)
201
+
202
+
203
+ @dataclass(frozen=True, slots=True)
204
+ class Edit:
205
+ selector: str
206
+ func: NodeCallback
207
+ enabled: bool
208
+ callback: NodeCallback | None
209
+ report: ReportCallback | None
210
+
211
+ def __init__(
212
+ self,
213
+ selector: str,
214
+ func: NodeCallback,
215
+ *,
216
+ enabled: bool = True,
217
+ callback: NodeCallback | None = None,
218
+ report: ReportCallback | None = None,
219
+ ) -> None:
220
+ object.__setattr__(self, "selector", str(selector))
221
+ object.__setattr__(self, "func", func)
222
+ object.__setattr__(self, "enabled", bool(enabled))
223
+ object.__setattr__(self, "callback", callback)
224
+ object.__setattr__(self, "report", report)
225
+
226
+
227
+ @dataclass(frozen=True, slots=True)
228
+ class EditDocument:
229
+ """Edit the document root in-place.
230
+
231
+ The callback is invoked exactly once with the provided root node.
232
+
233
+ This is intended for operations that need access to the root container
234
+ (e.g. #document / #document-fragment) which selector-based transforms do
235
+ not visit.
236
+ """
237
+
238
+ func: NodeCallback
239
+ enabled: bool
240
+ callback: NodeCallback | None
241
+ report: ReportCallback | None
242
+
243
+ def __init__(
244
+ self,
245
+ func: NodeCallback,
246
+ *,
247
+ enabled: bool = True,
248
+ callback: NodeCallback | None = None,
249
+ report: ReportCallback | None = None,
250
+ ) -> None:
251
+ object.__setattr__(self, "func", func)
252
+ object.__setattr__(self, "enabled", bool(enabled))
253
+ object.__setattr__(self, "callback", callback)
254
+ object.__setattr__(self, "report", report)
255
+
256
+
257
@dataclass(frozen=True, slots=True)
class Decide:
    """Choose a structural action per node by asking a callback.

    A generic building block for policy-driven transforms.

    - With any selector other than "*", the selector is matched against
      element nodes through the normal selector engine.
    - With the selector "*", the callback runs for every node type,
      including text/comment/doctype and document container nodes.

    The callback must return one of: Decide.KEEP, Decide.DROP, Decide.UNWRAP,
    Decide.EMPTY, Decide.ESCAPE.
    """

    selector: str
    func: Callable[[SimpleDomNode], DecideAction]
    enabled: bool
    callback: NodeCallback | None
    report: ReportCallback | None

    # Convenience aliases so callers can write ``Decide.KEEP`` and friends.
    KEEP: ClassVar[DecideAction] = DecideAction.KEEP
    DROP: ClassVar[DecideAction] = DecideAction.DROP
    UNWRAP: ClassVar[DecideAction] = DecideAction.UNWRAP
    EMPTY: ClassVar[DecideAction] = DecideAction.EMPTY
    ESCAPE: ClassVar[DecideAction] = DecideAction.ESCAPE

    def __init__(
        self,
        selector: str,
        func: Callable[[SimpleDomNode], DecideAction],
        *,
        enabled: bool = True,
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        # Frozen dataclass: fields can only be written via object.__setattr__.
        assign = object.__setattr__
        assign(self, "selector", str(selector))
        assign(self, "func", func)
        assign(self, "enabled", bool(enabled))
        assign(self, "callback", callback)
        assign(self, "report", report)
297
+
298
+
299
+ @dataclass(frozen=True, slots=True)
300
+ class EditAttrs:
301
+ """Edit element attributes using a callback.
302
+
303
+ The callback is invoked for matching element/template nodes.
304
+
305
+ - Return None to leave attributes unchanged.
306
+ - Return a dict to replace the node's attributes with that dict.
307
+ """
308
+
309
+ selector: str
310
+ func: EditAttrsCallback
311
+ enabled: bool
312
+ callback: NodeCallback | None
313
+ report: ReportCallback | None
314
+
315
+ def __init__(
316
+ self,
317
+ selector: str,
318
+ func: EditAttrsCallback,
319
+ *,
320
+ enabled: bool = True,
321
+ callback: NodeCallback | None = None,
322
+ report: ReportCallback | None = None,
323
+ ) -> None:
324
+ object.__setattr__(self, "selector", str(selector))
325
+ object.__setattr__(self, "func", func)
326
+ object.__setattr__(self, "enabled", bool(enabled))
327
+ object.__setattr__(self, "callback", callback)
328
+ object.__setattr__(self, "report", report)
329
+
330
+
331
+ # Backwards-compatible alias.
332
+ RewriteAttrs = EditAttrs
333
+
334
+
335
@dataclass(frozen=True, slots=True)
class Linkify:
    """Turn URLs/emails found in text nodes into links.

    The transform works on DOM text nodes (not raw HTML strings) and wraps
    each detected link in `<a href="...">...</a>`.

    ``skip_tags`` and ``extra_tlds`` are lower-cased and frozen at
    construction time.
    """

    skip_tags: frozenset[str]
    fuzzy_ip: bool
    extra_tlds: frozenset[str]
    enabled: bool
    callback: NodeCallback | None
    report: ReportCallback | None

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            "a",
            *WHITESPACE_PRESERVING_ELEMENTS,
        ),
        enabled: bool = True,
        fuzzy_ip: bool = False,
        extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (),
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        skipped = frozenset(str(tag).lower() for tag in skip_tags)
        tlds = frozenset(str(tld).lower() for tld in extra_tlds)

        # Frozen dataclass: fields can only be written via object.__setattr__.
        assign = object.__setattr__
        assign(self, "skip_tags", skipped)
        assign(self, "fuzzy_ip", bool(fuzzy_ip))
        assign(self, "extra_tlds", tlds)
        assign(self, "enabled", bool(enabled))
        assign(self, "callback", callback)
        assign(self, "report", report)
369
+
370
+
371
# One or more consecutive HTML whitespace characters (space, tab, LF, CR, FF).
_HTML_SPACE_RUN_RE = re.compile(r"[ \t\n\r\f]+")


def _collapse_html_space_characters(text: str) -> str:
    """Collapse each run of HTML whitespace characters to a single space.

    HTML whitespace here means space, tab, LF, CR and FF. Other whitespace
    (e.g. U+00A0) is left alone. Mirrors html5lib's whitespace filter
    behavior: leading/trailing whitespace is collapsed but not trimmed.

    Args:
        text: Input string.

    Returns:
        ``text`` itself when nothing needs collapsing, otherwise a new string.
    """
    # Fast path: a lone space is already in collapsed form, so work is only
    # needed when a formatting whitespace character or a run of two spaces
    # is present.
    if "  " not in text and not any(ch in text for ch in "\t\n\r\f"):
        return text

    return _HTML_SPACE_RUN_RE.sub(" ", text)
395
+
396
+
397
@dataclass(frozen=True, slots=True)
class CollapseWhitespace:
    """Collapse whitespace inside text nodes.

    Runs of HTML whitespace characters (space, tab, LF, CR, FF) become a
    single space, similar to `html5lib.filters.whitespace.Filter`.

    ``skip_tags`` is lower-cased and frozen at construction time.
    """

    skip_tags: frozenset[str]
    enabled: bool
    callback: NodeCallback | None
    report: ReportCallback | None

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            *WHITESPACE_PRESERVING_ELEMENTS,
            "title",
        ),
        enabled: bool = True,
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        skipped = frozenset(str(tag).lower() for tag in skip_tags)

        # Frozen dataclass: fields can only be written via object.__setattr__.
        assign = object.__setattr__
        assign(self, "skip_tags", skipped)
        assign(self, "enabled", bool(enabled))
        assign(self, "callback", callback)
        assign(self, "report", report)
427
+
428
+
429
+ @dataclass(frozen=True, slots=True)
430
+ class Sanitize:
431
+ """Sanitize the in-memory tree.
432
+
433
+ This transform replaces the current tree with a sanitized clone using the
434
+ same sanitizer that powers `safe=True` serialization.
435
+
436
+ Notes:
437
+ - This runs once at parse/transform time.
438
+ - If you apply transforms after `Sanitize`, they may reintroduce unsafe
439
+ content. Use safe serialization (`safe=True`) if you need output safety.
440
+ """
441
+
442
+ policy: SanitizationPolicy | None
443
+ enabled: bool
444
+ callback: NodeCallback | None
445
+ report: ReportCallback | None
446
+
447
+ def __init__(
448
+ self,
449
+ policy: SanitizationPolicy | None = None,
450
+ *,
451
+ enabled: bool = True,
452
+ callback: NodeCallback | None = None,
453
+ report: ReportCallback | None = None,
454
+ ) -> None:
455
+ object.__setattr__(self, "policy", policy)
456
+ object.__setattr__(self, "enabled", bool(enabled))
457
+ object.__setattr__(self, "callback", callback)
458
+ object.__setattr__(self, "report", report)
459
+
460
+
461
+ @dataclass(frozen=True, slots=True)
462
+ class DropComments:
463
+ """Drop comment nodes (#comment)."""
464
+
465
+ enabled: bool
466
+ callback: NodeCallback | None
467
+ report: ReportCallback | None
468
+
469
+ def __init__(
470
+ self,
471
+ *,
472
+ enabled: bool = True,
473
+ callback: NodeCallback | None = None,
474
+ report: ReportCallback | None = None,
475
+ ) -> None:
476
+ object.__setattr__(self, "enabled", bool(enabled))
477
+ object.__setattr__(self, "callback", callback)
478
+ object.__setattr__(self, "report", report)
479
+
480
+
481
+ @dataclass(frozen=True, slots=True)
482
+ class DropDoctype:
483
+ """Drop doctype nodes (!doctype)."""
484
+
485
+ enabled: bool
486
+ callback: NodeCallback | None
487
+ report: ReportCallback | None
488
+
489
+ def __init__(
490
+ self,
491
+ *,
492
+ enabled: bool = True,
493
+ callback: NodeCallback | None = None,
494
+ report: ReportCallback | None = None,
495
+ ) -> None:
496
+ object.__setattr__(self, "enabled", bool(enabled))
497
+ object.__setattr__(self, "callback", callback)
498
+ object.__setattr__(self, "report", report)
499
+
500
+
501
+ @dataclass(frozen=True, slots=True)
502
+ class DropForeignNamespaces:
503
+ """Drop elements in non-HTML namespaces."""
504
+
505
+ enabled: bool
506
+ callback: NodeCallback | None
507
+ report: ReportCallback | None
508
+
509
+ def __init__(
510
+ self,
511
+ *,
512
+ enabled: bool = True,
513
+ callback: NodeCallback | None = None,
514
+ report: ReportCallback | None = None,
515
+ ) -> None:
516
+ object.__setattr__(self, "enabled", bool(enabled))
517
+ object.__setattr__(self, "callback", callback)
518
+ object.__setattr__(self, "report", report)
519
+
520
+
521
+ @dataclass(frozen=True, slots=True)
522
+ class DropAttrs:
523
+ """Drop attributes whose names match simple patterns."""
524
+
525
+ selector: str
526
+ patterns: tuple[str, ...]
527
+ enabled: bool
528
+ callback: NodeCallback | None
529
+ report: ReportCallback | None
530
+
531
+ def __init__(
532
+ self,
533
+ selector: str,
534
+ *,
535
+ patterns: tuple[str, ...] = (),
536
+ enabled: bool = True,
537
+ callback: NodeCallback | None = None,
538
+ report: ReportCallback | None = None,
539
+ ) -> None:
540
+ object.__setattr__(self, "selector", str(selector))
541
+ object.__setattr__(
542
+ self,
543
+ "patterns",
544
+ tuple(sorted({str(p).strip().lower() for p in patterns if str(p).strip()})),
545
+ )
546
+ object.__setattr__(self, "enabled", bool(enabled))
547
+ object.__setattr__(self, "callback", callback)
548
+ object.__setattr__(self, "report", report)
549
+
550
+
551
+ @dataclass(frozen=True, slots=True)
552
+ class AllowlistAttrs:
553
+ """Retain only allowlisted attributes by tag and global allowlist."""
554
+
555
+ selector: str
556
+ allowed_attributes: dict[str, set[str]]
557
+ enabled: bool
558
+ callback: NodeCallback | None
559
+ report: ReportCallback | None
560
+
561
+ def __init__(
562
+ self,
563
+ selector: str,
564
+ *,
565
+ allowed_attributes: dict[str, Collection[str]],
566
+ enabled: bool = True,
567
+ callback: NodeCallback | None = None,
568
+ report: ReportCallback | None = None,
569
+ ) -> None:
570
+ normalized: dict[str, set[str]] = {}
571
+ for tag, attrs in allowed_attributes.items():
572
+ normalized[str(tag)] = {str(a).lower() for a in attrs}
573
+ object.__setattr__(self, "selector", str(selector))
574
+ object.__setattr__(self, "allowed_attributes", normalized)
575
+ object.__setattr__(self, "enabled", bool(enabled))
576
+ object.__setattr__(self, "callback", callback)
577
+ object.__setattr__(self, "report", report)
578
+
579
+
580
+ @dataclass(frozen=True, slots=True)
581
+ class DropUrlAttrs:
582
+ """Validate and rewrite/drop URL-valued attributes based on UrlPolicy rules."""
583
+
584
+ selector: str
585
+ url_policy: UrlPolicy
586
+ enabled: bool
587
+ callback: NodeCallback | None
588
+ report: ReportCallback | None
589
+
590
+ def __init__(
591
+ self,
592
+ selector: str,
593
+ *,
594
+ url_policy: UrlPolicy,
595
+ enabled: bool = True,
596
+ callback: NodeCallback | None = None,
597
+ report: ReportCallback | None = None,
598
+ ) -> None:
599
+ object.__setattr__(self, "selector", str(selector))
600
+ object.__setattr__(self, "url_policy", url_policy)
601
+ object.__setattr__(self, "enabled", bool(enabled))
602
+ object.__setattr__(self, "callback", callback)
603
+ object.__setattr__(self, "report", report)
604
+
605
+
606
+ @dataclass(frozen=True, slots=True)
607
+ class AllowStyleAttrs:
608
+ """Sanitize inline style attributes when present."""
609
+
610
+ selector: str
611
+ allowed_css_properties: tuple[str, ...]
612
+ enabled: bool
613
+ callback: NodeCallback | None
614
+ report: ReportCallback | None
615
+
616
+ def __init__(
617
+ self,
618
+ selector: str,
619
+ *,
620
+ allowed_css_properties: Collection[str],
621
+ enabled: bool = True,
622
+ callback: NodeCallback | None = None,
623
+ report: ReportCallback | None = None,
624
+ ) -> None:
625
+ object.__setattr__(self, "selector", str(selector))
626
+ object.__setattr__(
627
+ self,
628
+ "allowed_css_properties",
629
+ tuple(sorted({str(p).strip().lower() for p in allowed_css_properties if str(p).strip()})),
630
+ )
631
+ object.__setattr__(self, "enabled", bool(enabled))
632
+ object.__setattr__(self, "callback", callback)
633
+ object.__setattr__(self, "report", report)
634
+
635
+
636
+ @dataclass(frozen=True, slots=True)
637
+ class MergeAttrs:
638
+ """Merge tokens into a whitespace-delimited attribute without removing existing ones."""
639
+
640
+ tag: str
641
+ attr: str
642
+ tokens: tuple[str, ...]
643
+ enabled: bool
644
+ callback: NodeCallback | None
645
+ report: ReportCallback | None
646
+
647
+ def __init__(
648
+ self,
649
+ tag: str,
650
+ *,
651
+ attr: str,
652
+ tokens: Collection[str],
653
+ enabled: bool = True,
654
+ callback: NodeCallback | None = None,
655
+ report: ReportCallback | None = None,
656
+ ) -> None:
657
+ object.__setattr__(self, "tag", str(tag).lower())
658
+ object.__setattr__(self, "attr", str(attr).lower())
659
+ object.__setattr__(self, "tokens", tuple(sorted({str(t).strip().lower() for t in tokens if str(t).strip()})))
660
+ object.__setattr__(self, "enabled", bool(enabled))
661
+ object.__setattr__(self, "callback", callback)
662
+ object.__setattr__(self, "report", report)
663
+
664
+
665
+ @dataclass(frozen=True, slots=True)
666
+ class PruneEmpty:
667
+ """Recursively drop empty elements.
668
+
669
+ This transform removes elements that are empty at that point in the
670
+ transform pipeline.
671
+
672
+ "Empty" means:
673
+ - no element children, and
674
+ - no non-whitespace text nodes (unless `strip_whitespace=False`).
675
+
676
+ Comments/doctypes are ignored when determining emptiness.
677
+
678
+ Notes:
679
+ - Pruning uses a post-order traversal to be correct.
680
+ """
681
+
682
+ selector: str
683
+ strip_whitespace: bool
684
+ enabled: bool
685
+ callback: NodeCallback | None
686
+ report: ReportCallback | None
687
+
688
+ def __init__(
689
+ self,
690
+ selector: str,
691
+ *,
692
+ strip_whitespace: bool = True,
693
+ enabled: bool = True,
694
+ callback: NodeCallback | None = None,
695
+ report: ReportCallback | None = None,
696
+ ) -> None:
697
+ object.__setattr__(self, "selector", str(selector))
698
+ object.__setattr__(self, "strip_whitespace", bool(strip_whitespace))
699
+ object.__setattr__(self, "enabled", bool(enabled))
700
+ object.__setattr__(self, "callback", callback)
701
+ object.__setattr__(self, "report", report)
702
+
703
+
704
+ @dataclass(frozen=True, slots=True)
705
+ class Stage:
706
+ """Group transforms into an explicit stage.
707
+
708
+ Stages are intended to make transform passes explicit and readable.
709
+
710
+ - Stages can be nested; nested stages are flattened.
711
+ - If at least one Stage is present at the top level of a transform list,
712
+ any top-level transforms around it are automatically grouped into
713
+ implicit stages.
714
+ """
715
+
716
+ transforms: tuple[TransformSpec, ...]
717
+ enabled: bool
718
+ callback: NodeCallback | None
719
+ report: ReportCallback | None
720
+
721
+ def __init__(
722
+ self,
723
+ transforms: list[TransformSpec] | tuple[TransformSpec, ...],
724
+ *,
725
+ enabled: bool = True,
726
+ callback: NodeCallback | None = None,
727
+ report: ReportCallback | None = None,
728
+ ) -> None:
729
+ object.__setattr__(self, "transforms", tuple(transforms))
730
+ object.__setattr__(self, "enabled", bool(enabled))
731
+ object.__setattr__(self, "callback", callback)
732
+ object.__setattr__(self, "report", report)
733
+
734
+
735
+ # -----------------
736
+ # Compilation
737
+ # -----------------
738
+
739
+
740
# Union of every concrete transform spec (``Stage`` is not included).
Transform = (
    SetAttrs
    | Drop
    | Unwrap
    | Empty
    | Edit
    | EditDocument
    | Decide
    | EditAttrs
    | Linkify
    | CollapseWhitespace
    | PruneEmpty
    | Sanitize
    | DropComments
    | DropDoctype
    | DropForeignNamespaces
    | DropAttrs
    | AllowlistAttrs
    | DropUrlAttrs
    | AllowStyleAttrs
    | MergeAttrs
)


# The same classes as a tuple; ``compile_transforms`` passes it to
# ``isinstance`` to reject unsupported objects.
_TRANSFORM_CLASSES: tuple[type[object], ...] = (
    SetAttrs,
    Drop,
    Unwrap,
    Empty,
    Edit,
    EditDocument,
    Decide,
    EditAttrs,
    Linkify,
    CollapseWhitespace,
    PruneEmpty,
    Sanitize,
    DropComments,
    DropDoctype,
    DropForeignNamespaces,
    DropAttrs,
    AllowlistAttrs,
    DropUrlAttrs,
    AllowStyleAttrs,
    MergeAttrs,
)

# What callers may put in a transform list: a transform or a Stage of them.
TransformSpec = Transform | Stage
788
+
789
+
790
+ @dataclass(frozen=True, slots=True)
791
+ class _CompiledCollapseWhitespaceTransform:
792
+ kind: Literal["collapse_whitespace"]
793
+ skip_tags: frozenset[str]
794
+ callback: NodeCallback | None
795
+ report: ReportCallback | None
796
+
797
+
798
+ @dataclass(frozen=True, slots=True)
799
+ class _CompiledSelectorTransform:
800
+ kind: Literal["setattrs", "drop", "unwrap", "empty", "edit"]
801
+ selector_str: str
802
+ selector: ParsedSelector
803
+ payload: dict[str, str | None] | NodeCallback | None
804
+ callback: NodeCallback | None
805
+ report: ReportCallback | None
806
+
807
+
808
+ @dataclass(frozen=True, slots=True)
809
+ class _CompiledLinkifyTransform:
810
+ kind: Literal["linkify"]
811
+ skip_tags: frozenset[str]
812
+ config: LinkifyConfig
813
+ callback: NodeCallback | None
814
+ report: ReportCallback | None
815
+
816
+
817
+ @dataclass(frozen=True, slots=True)
818
+ class _CompiledEditDocumentTransform:
819
+ kind: Literal["edit_document"]
820
+ callback: NodeCallback
821
+
822
+
823
+ @dataclass(frozen=True, slots=True)
824
+ class _CompiledPruneEmptyTransform:
825
+ kind: Literal["prune_empty"]
826
+ selector_str: str
827
+ selector: ParsedSelector
828
+ strip_whitespace: bool
829
+ callback: NodeCallback | None
830
+ report: ReportCallback | None
831
+
832
+
833
+ @dataclass(frozen=True, slots=True)
834
+ class _CompiledStageBoundary:
835
+ kind: Literal["stage_boundary"]
836
+
837
+
838
+ @dataclass(frozen=True, slots=True)
839
+ class _CompiledDecideTransform:
840
+ kind: Literal["decide"]
841
+ selector_str: str
842
+ selector: ParsedSelector | None
843
+ all_nodes: bool
844
+ callback: Callable[[SimpleDomNode], DecideAction]
845
+
846
+
847
+ @dataclass(frozen=True, slots=True)
848
+ class _CompiledRewriteAttrsTransform:
849
+ kind: Literal["rewrite_attrs"]
850
+ selector_str: str
851
+ selector: ParsedSelector | None
852
+ all_nodes: bool
853
+ func: EditAttrsCallback
854
+
855
+
856
+ @dataclass(frozen=True, slots=True)
857
+ class _CompiledDropCommentsTransform:
858
+ kind: Literal["drop_comments"]
859
+ callback: NodeCallback | None
860
+ report: ReportCallback | None
861
+
862
+
863
+ @dataclass(frozen=True, slots=True)
864
+ class _CompiledDropDoctypeTransform:
865
+ kind: Literal["drop_doctype"]
866
+ callback: NodeCallback | None
867
+ report: ReportCallback | None
868
+
869
+
870
+ @dataclass(frozen=True, slots=True)
871
+ class _CompiledMergeAttrTokensTransform:
872
+ kind: Literal["merge_attr_tokens"]
873
+ tag: str
874
+ attr: str
875
+ tokens: tuple[str, ...]
876
+ callback: NodeCallback | None
877
+ report: ReportCallback | None
878
+
879
+
880
+ @dataclass(frozen=True, slots=True)
881
+ class _CompiledSanitizeTransform:
882
+ kind: Literal["sanitize"]
883
+ policy: SanitizationPolicy
884
+ attr_drop_regex: re.Pattern[str] | None
885
+ callback: NodeCallback | None
886
+ report: ReportCallback | None
887
+
888
+
889
+ @dataclass(frozen=True, slots=True)
890
+ class _CompiledStageHookTransform:
891
+ kind: Literal["stage_hook"]
892
+ index: int
893
+ callback: NodeCallback | None
894
+ report: ReportCallback | None
895
+
896
+
897
# Union of every compiled record type; ``compile_transforms`` returns a list
# of these.
CompiledTransform = (
    _CompiledSelectorTransform
    | _CompiledDecideTransform
    | _CompiledRewriteAttrsTransform
    | _CompiledLinkifyTransform
    | _CompiledCollapseWhitespaceTransform
    | _CompiledPruneEmptyTransform
    | _CompiledEditDocumentTransform
    | _CompiledDropCommentsTransform
    | _CompiledDropDoctypeTransform
    | _CompiledMergeAttrTokensTransform
    | _CompiledSanitizeTransform
    | _CompiledStageHookTransform
    | _CompiledStageBoundary
)
912
+
913
+
914
def _iter_flattened_transforms(specs: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[Transform]:
    """Return the transforms in ``specs`` with every Stage expanded in place.

    Enabled stages are replaced (recursively) by their contents, preserving
    order; disabled stages contribute nothing. Non-Stage items are passed
    through untouched, whatever their own ``enabled`` flag says.
    """
    flat: list[Transform] = []
    for spec in specs:
        if not isinstance(spec, Stage):
            flat.append(spec)
        elif spec.enabled:
            flat.extend(_iter_flattened_transforms(spec.transforms))
    return flat
927
+
928
+
929
def _glob_match(pattern: str, text: str) -> bool:
    """Match a glob pattern against text.

    Supported wildcards:
    - '*' matches any sequence (including empty)
    - '?' matches any single character

    Every other pattern character must equal the text character exactly.
    The whole of ``text`` must be matched.

    Args:
        pattern: Glob pattern.
        text: String to test.

    Returns:
        True when ``pattern`` matches all of ``text``.
    """
    if pattern == "*":
        return True
    if "*" not in pattern and "?" not in pattern:
        return pattern == text

    pattern_len = len(pattern)
    text_len = len(text)

    p_i = 0
    t_i = 0
    star_i = -1  # index of the most recent '*' in pattern, -1 if none yet
    match_i = 0  # text index where that '*' currently stops matching

    while t_i < text_len:
        if p_i < pattern_len:
            p_ch = pattern[p_i]

            # The wildcard must be tested BEFORE literal equality: when the
            # text itself contains '*', a pattern '*' would otherwise be
            # consumed as a literal and lose its "any sequence" meaning.
            if p_ch == "*":
                star_i = p_i
                match_i = t_i
                p_i += 1
                continue

            if p_ch == "?" or p_ch == text[t_i]:
                p_i += 1
                t_i += 1
                continue

        if star_i != -1:
            # Backtrack: let the last '*' swallow one more character.
            p_i = star_i + 1
            match_i += 1
            t_i = match_i
            continue

        return False

    # Text is exhausted; only trailing '*' may remain in the pattern.
    while p_i < pattern_len and pattern[p_i] == "*":
        p_i += 1

    return p_i == pattern_len
971
+
972
+
973
def _split_into_top_level_stages(specs: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[Stage]:
    """Partition a top-level transform list into stages.

    Auto-staging only kicks in when at least one enabled Stage sits at the
    top level; otherwise an empty list is returned. When it does, enabled
    stages are kept as-is, disabled stages are skipped, and each run of
    non-Stage items is wrapped in a new implicit Stage. A disabled stage does
    not break a run of non-Stage items.
    """
    if not any(isinstance(spec, Stage) and spec.enabled for spec in specs):
        return []

    result: list[Stage] = []
    loose: list[TransformSpec] = []

    for spec in specs:
        if not isinstance(spec, Stage):
            loose.append(spec)
            continue

        if not spec.enabled:
            continue

        # An explicit stage closes any implicit stage gathered so far.
        if loose:
            result.append(Stage(loose))
            loose = []
        result.append(spec)

    if loose:
        result.append(Stage(loose))

    return result
998
+
999
+
1000
def compile_transforms(transforms: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[CompiledTransform]:
    """Compile user-facing transform specs into their internal compiled form.

    Selectors are parsed once here so they are not re-parsed per node when the
    transforms are applied.

    Behaviour visible in this function:

    * An empty ``transforms`` collection compiles to an empty list.
    * If the input splits into top-level stages, every stage is compiled
      separately. A ``stage_boundary`` marker is emitted between consecutive
      stages and each stage is preceded by a ``stage_hook`` entry.
    * Transforms whose ``enabled`` flag is falsy are skipped.
    * ``MergeAttrs`` without tokens is skipped.
    * Adjacent attribute-rewriting transforms that share the same selector
      string are fused into one compiled transform (left-to-right order kept).

    Args:
        transforms: The transform specs to compile.

    Returns:
        The compiled transforms, in application order.

    Raises:
        TypeError: If an item is not one of the supported transform classes.
    """
    if not transforms:
        return []

    flattened = _iter_flattened_transforms(transforms)

    top_level_stages = _split_into_top_level_stages(transforms)
    if top_level_stages:
        # Stage is a pass boundary. Compile each stage separately and insert a
        # boundary marker so apply_compiled_transforms can flush batches.
        compiled_stage: list[CompiledTransform] = []
        for stage_i, stage in enumerate(top_level_stages):
            if stage_i:
                compiled_stage.append(_CompiledStageBoundary(kind="stage_boundary"))
            compiled_stage.append(
                _CompiledStageHookTransform(
                    kind="stage_hook",
                    index=stage_i,
                    callback=stage.callback,
                    report=stage.report,
                )
            )
            for inner in _iter_flattened_transforms(stage.transforms):
                compiled_stage.extend(compile_transforms((inner,)))
        return compiled_stage

    compiled: list[CompiledTransform] = []

    def _append_compiled(item: CompiledTransform) -> None:
        """Append ``item``, fusing it with the previous entry when possible."""
        # Optimization: fuse adjacent EditAttrs transforms that target the
        # same selector. This preserves left-to-right semantics but reduces
        # per-node selector matching and callback overhead.
        if (
            compiled
            and isinstance(item, _CompiledRewriteAttrsTransform)
            and isinstance(compiled[-1], _CompiledRewriteAttrsTransform)
        ):
            prev = compiled[-1]
            if prev.selector_str == item.selector_str and prev.all_nodes == item.all_nodes:
                prev_cb = prev.func
                next_cb = item.func

                # Callbacks are bound as defaults to avoid late-binding
                # surprises if the enclosing names are reassigned later.
                def _chained(
                    node: SimpleDomNode,
                    prev_cb: Callable[[SimpleDomNode], dict[str, str | None] | None] = prev_cb,
                    next_cb: Callable[[SimpleDomNode], dict[str, str | None] | None] = next_cb,
                ) -> dict[str, str | None] | None:
                    changed = False
                    out = prev_cb(node)
                    if out is not None:  # pragma: no cover
                        # The second callback must observe the first one's result.
                        node.attrs = out
                        changed = True
                    out = next_cb(node)
                    if out is not None:
                        node.attrs = out
                        changed = True
                    return node.attrs if changed else None

                compiled[-1] = _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=prev.selector_str,
                    selector=prev.selector,
                    all_nodes=prev.all_nodes,
                    func=_chained,
                )
                return

        compiled.append(item)

    for t in flattened:
        if not isinstance(t, _TRANSFORM_CLASSES):
            raise TypeError(f"Unsupported transform: {type(t).__name__}")
        if not t.enabled:
            continue
        if isinstance(t, SetAttrs):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="setattrs",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=t.attrs,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Drop):
            selector_str = t.selector

            # Fast-path: if selector is a simple comma-separated list of tag
            # names (e.g. "script, style"), avoid selector matching entirely.
            raw_parts = selector_str.split(",")
            tag_list: list[str] = []
            for part in raw_parts:
                p = part.strip().lower()
                if not p:
                    tag_list = []
                    break
                # Reject anything that isn't a plain tag name.
                if any(ch in p for ch in " .#[:>*+~\t\n\r\f"):
                    tag_list = []
                    break
                tag_list.append(p)

            if tag_list:
                tags = frozenset(tag_list)
                on_drop = t.callback
                on_report = t.report

                def _drop_if_tag(
                    node: SimpleDomNode,
                    tags: frozenset[str] = tags,
                    selector_str: str = selector_str,
                    on_drop: NodeCallback | None = on_drop,
                    on_report: ReportCallback | None = on_report,
                ) -> DecideAction:
                    name = node.name
                    # Names starting with "#" and the doctype are never elements.
                    if name.startswith("#") or name == "!doctype":
                        return Decide.KEEP
                    tag = str(name).lower()
                    if tag not in tags:
                        return Decide.KEEP
                    if on_drop is not None:
                        on_drop(node)
                    if on_report is not None:
                        on_report(f"Dropped tag '{tag}' (matched selector '{selector_str}')", node=node)
                    return Decide.DROP

                compiled.append(
                    _CompiledDecideTransform(
                        kind="decide",
                        selector_str="*",
                        selector=None,
                        all_nodes=True,
                        callback=_drop_if_tag,
                    )
                )
                continue

            compiled.append(
                _CompiledSelectorTransform(
                    kind="drop",
                    selector_str=selector_str,
                    selector=parse_selector(selector_str),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Unwrap):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="unwrap",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Empty):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="empty",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Edit):
            selector_str = t.selector
            edit_func = t.func
            on_hook = t.callback
            on_report = t.report

            # Hook and report are folded into the payload, so the compiled
            # transform itself carries callback=None / report=None.
            def _wrapped(
                node: SimpleDomNode,
                edit_func: NodeCallback = edit_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> None:
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    tag = str(node.name).lower()
                    on_report(f"Edited <{tag}> (matched selector '{selector_str}')", node=node)
                edit_func(node)

            compiled.append(
                _CompiledSelectorTransform(
                    kind="edit",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=_wrapped,
                    callback=None,
                    report=None,
                )
            )
            continue

        if isinstance(t, EditDocument):
            edit_document_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_root(
                node: SimpleDomNode,
                edit_document_func: NodeCallback = edit_document_func,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> None:
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    on_report("Edited document root", node=node)
                edit_document_func(node)

            compiled.append(_CompiledEditDocumentTransform(kind="edit_document", callback=_wrapped_root))
            continue

        if isinstance(t, Decide):
            selector_str = t.selector
            # "*" needs no parsed selector: the callback sees every node.
            all_nodes = selector_str.strip() == "*"
            decide_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_decide(
                node: SimpleDomNode,
                decide_func: Callable[[SimpleDomNode], DecideAction] = decide_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> DecideAction:
                action = decide_func(node)
                # Hook/report fire only for actions other than KEEP.
                if action is DecideAction.KEEP:
                    return action
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    nm = node.name
                    label = str(nm).lower() if not nm.startswith("#") and nm != "!doctype" else str(nm)
                    on_report(f"Decide -> {action.value} '{label}' (matched selector '{selector_str}')", node=node)
                return action

            compiled.append(
                _CompiledDecideTransform(
                    kind="decide",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    callback=_wrapped_decide,
                )
            )
            continue

        if isinstance(t, EditAttrs):
            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            edit_attrs_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_attrs(
                node: SimpleDomNode,
                edit_attrs_func: EditAttrsCallback = edit_attrs_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                out = edit_attrs_func(node)
                # None from the user callback means "no change".
                if out is None:
                    return None
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    tag = str(node.name).lower()
                    on_report(f"Edited attributes on <{tag}> (matched selector '{selector_str}')", node=node)
                return out

            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_wrapped_attrs,
                )
            )
            continue

        if isinstance(t, Linkify):
            compiled.append(
                _CompiledLinkifyTransform(
                    kind="linkify",
                    skip_tags=t.skip_tags,
                    config=LinkifyConfig(fuzzy_ip=t.fuzzy_ip, extra_tlds=t.extra_tlds),
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, CollapseWhitespace):
            compiled.append(
                _CompiledCollapseWhitespaceTransform(
                    kind="collapse_whitespace",
                    skip_tags=t.skip_tags,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, PruneEmpty):
            compiled.append(
                _CompiledPruneEmptyTransform(
                    kind="prune_empty",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    strip_whitespace=t.strip_whitespace,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropComments):
            compiled.append(
                _CompiledDropCommentsTransform(
                    kind="drop_comments",
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropDoctype):
            compiled.append(
                _CompiledDropDoctypeTransform(
                    kind="drop_doctype",
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropForeignNamespaces):
            on_hook = t.callback
            on_report = t.report

            def _drop_foreign(
                node: SimpleDomNode,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> DecideAction:
                name = node.name
                if name.startswith("#") or name == "!doctype":
                    return Decide.KEEP
                ns = node.namespace
                # A missing namespace is treated the same as "html".
                if ns not in (None, "html"):
                    if on_hook is not None:
                        on_hook(node)
                    if on_report is not None:
                        tag = str(name).lower()
                        on_report(f"Unsafe tag '{tag}' (foreign namespace)", node=node)
                    return Decide.DROP
                return Decide.KEEP

            compiled.append(
                _CompiledDecideTransform(
                    kind="decide",
                    selector_str="*",
                    selector=None,
                    all_nodes=True,
                    callback=_drop_foreign,
                )
            )
            continue

        if isinstance(t, DropAttrs):
            patterns = t.patterns
            on_hook = t.callback
            on_report = t.report

            # Optimize pattern matching: Compile all patterns into one regex
            compiled_regex = _compile_patterns_to_regex(patterns)

            def _drop_attrs(
                node: SimpleDomNode,
                patterns: tuple[str, ...] = patterns,
                compiled_regex: re.Pattern[str] | None = compiled_regex,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None

                if not patterns:
                    return None

                out: dict[str, str | None] = {}
                changed = False
                for raw_key, value in attrs.items():
                    # NOTE(review): empty/whitespace-only names are skipped
                    # without setting `changed`, so they are only removed when
                    # some pattern also matches on this node - confirm intent.
                    if not raw_key or not str(raw_key).strip():
                        continue
                    key = raw_key
                    if not key.islower():
                        key = key.lower()

                    if compiled_regex and compiled_regex.match(key):
                        if on_report is not None:
                            # Re-check to report which pattern matched (rare path)
                            found_pat = "?"
                            for pat in patterns:
                                if _glob_match(pat, key):  # pragma: no cover
                                    found_pat = pat
                                    break
                            on_report(
                                f"Unsafe attribute '{key}' (matched pattern '{found_pat}')",
                                node=node,
                            )
                        changed = True
                        continue

                    out[key] = value

                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)  # pragma: no cover
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_drop_attrs,
                )
            )
            continue

        if isinstance(t, AllowlistAttrs):
            allowed_attributes = t.allowed_attributes
            on_hook = t.callback
            on_report = t.report
            allowed_global = allowed_attributes.get("*", set())
            # Per-tag allowlists always include the global ("*") entries.
            allowed_by_tag: dict[str, set[str]] = {
                str(tag_name).lower(): set(allowed_global).union(tag_attrs)
                for tag_name, tag_attrs in allowed_attributes.items()
                if tag_name != "*"
            }

            def _allowlist_attrs(
                node: SimpleDomNode,
                allowed_by_tag: dict[str, set[str]] = allowed_by_tag,
                allowed_global: set[str] = allowed_global,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None
                tag = str(node.name).lower()
                allowed = allowed_by_tag.get(tag, allowed_global)

                changed = False
                out: dict[str, str | None] = {}
                for raw_key, value in attrs.items():
                    raw_key_str = str(raw_key)
                    if not raw_key_str.strip():
                        # Drop invalid attribute names like '' or whitespace-only.
                        changed = True
                        continue
                    key = raw_key_str
                    if not key.islower():
                        key = key.lower()
                        changed = True  # pragma: no cover
                    if key in allowed:
                        out[key] = value
                    else:
                        changed = True
                        if on_report is not None:
                            on_report(f"Unsafe attribute '{key}' (not allowed)", node=node)
                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)  # pragma: no cover
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_allowlist_attrs,
                )
            )
            continue

        if isinstance(t, DropUrlAttrs):
            url_policy = t.url_policy
            on_hook = t.callback
            on_report = t.report

            def _drop_url_attrs(
                node: SimpleDomNode,
                url_policy: UrlPolicy = url_policy,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None

                tag = str(node.name).lower()
                out = dict(attrs)
                changed = False
                # Iterate over a snapshot of the keys because `out` is mutated.
                for key in list(out.keys()):
                    # NOTE(review): the key is looked up as-is (not lowercased),
                    # unlike the other attribute callbacks here - confirm that
                    # attribute names are always lowercase at this point.
                    if key not in _URL_LIKE_ATTRS:
                        continue

                    raw_value = out.get(key)
                    if raw_value is None:
                        if on_report is not None:  # pragma: no cover
                            on_report(f"Unsafe URL in attribute '{key}'", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    rule = url_policy.allow_rules.get((tag, key))
                    if rule is None:
                        if on_report is not None:  # pragma: no cover
                            on_report(f"Unsafe URL in attribute '{key}' (no rule)", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    if key == "srcset":
                        sanitized = _sanitize_srcset_value(
                            url_policy=url_policy,
                            rule=rule,
                            tag=tag,
                            attr=key,
                            value=str(raw_value),
                        )
                    else:
                        sanitized = _sanitize_url_value(
                            url_policy=url_policy,
                            rule=rule,
                            tag=tag,
                            attr=key,
                            value=str(raw_value),
                        )

                    if sanitized is None:
                        if on_report is not None:
                            on_report(f"Unsafe URL in attribute '{key}'", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    out[key] = sanitized

                    if raw_value != sanitized:
                        changed = True

                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_drop_url_attrs,
                )
            )
            continue

        if isinstance(t, AllowStyleAttrs):
            allowed_css_properties = t.allowed_css_properties
            on_hook = t.callback
            on_report = t.report

            def _allow_style_attrs(
                node: SimpleDomNode,
                allowed_css_properties: tuple[str, ...] = allowed_css_properties,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs or "style" not in attrs:
                    return None

                raw_value = attrs.get("style")
                # A valueless style attribute is handled like a rejected one.
                sanitized_style = (
                    None
                    if raw_value is None
                    else _sanitize_inline_style(
                        allowed_css_properties=allowed_css_properties, value=str(raw_value)
                    )
                )
                if sanitized_style is None:
                    if on_report is not None:
                        on_report("Unsafe inline style in attribute 'style'", node=node)
                    out = dict(attrs)
                    out.pop("style", None)
                    if on_hook is not None:
                        on_hook(node)
                    return out

                if raw_value == sanitized_style:
                    # Nothing was rewritten. Return None so this callback
                    # follows the same "None means unchanged" contract as the
                    # other rewrite callbacks (and the fused `_chained` one),
                    # instead of handing back an identical copy.
                    return None

                out = dict(attrs)
                out["style"] = sanitized_style
                if on_hook is not None:
                    on_hook(node)
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_allow_style_attrs,
                )
            )
            continue

        if isinstance(t, MergeAttrs):
            # Nothing to merge: emit no compiled transform at all.
            if not t.tokens:
                continue
            compiled.append(
                _CompiledMergeAttrTokensTransform(
                    kind="merge_attr_tokens",
                    tag=t.tag,
                    attr=t.attr,
                    tokens=t.tokens,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, Sanitize):  # pragma: no branch
            policy = t.policy or DEFAULT_POLICY

            # Hardcoded patterns from original usage
            attr_patterns = ("on*", "srcdoc", "*:*")
            attr_regex = _compile_patterns_to_regex(attr_patterns)

            _append_compiled(
                _CompiledSanitizeTransform(
                    kind="sanitize",
                    policy=policy,
                    attr_drop_regex=attr_regex,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        raise TypeError(f"Unsupported transform: {type(t).__name__}")  # pragma: no cover

    return compiled
1692
+
1693
+
1694
+ # -----------------
1695
+ # Application
1696
+ # -----------------
1697
+
1698
+
1699
+ def _compile_patterns_to_regex(patterns: tuple[str, ...]) -> re.Pattern[str] | None:
1700
+ if not patterns:
1701
+ return None
1702
+ parts: list[str] = []
1703
+ for p in patterns:
1704
+ regex = re.escape(p)
1705
+ regex = regex.replace(r"\*", ".*")
1706
+ regex = regex.replace(r"\?", ".")
1707
+ parts.append(regex)
1708
+ full = "^(?:" + "|".join(parts) + ")$"
1709
+ return re.compile(full)
1710
+
1711
+
1712
+ def apply_compiled_transforms(
1713
+ root: SimpleDomNode,
1714
+ compiled: list[CompiledTransform],
1715
+ *,
1716
+ errors: list[ParseError] | None = None,
1717
+ ) -> None:
1718
+ if not compiled:
1719
+ return
1720
+
1721
+ token = _ERROR_SINK.set(errors)
1722
+ try:
1723
+ matcher = SelectorMatcher()
1724
+
1725
+ def apply_walk_transforms(root_node: SimpleDomNode, walk_transforms: list[CompiledTransform]) -> None:
1726
+ if not walk_transforms:
1727
+ return
1728
+
1729
+ def _raw_tag_text(node: SimpleDomNode, start_attr: str, end_attr: str) -> str | None:
1730
+ start = getattr(node, start_attr, None)
1731
+ end = getattr(node, end_attr, None)
1732
+ if start is None or end is None:
1733
+ return None
1734
+ src = node._source_html
1735
+ if src is None:
1736
+ cur: SimpleDomNode | None = node
1737
+ while cur is not None and src is None:
1738
+ cur = cur.parent
1739
+ if cur is None:
1740
+ break
1741
+ src = cur._source_html
1742
+ if src is not None:
1743
+ node._source_html = src
1744
+ if src is None:
1745
+ return None
1746
+ return src[start:end]
1747
+
1748
+ def _reconstruct_start_tag(node: SimpleDomNode) -> str | None:
1749
+ if node.name.startswith("#") or node.name == "!doctype":
1750
+ return None
1751
+ name = str(node.name)
1752
+ attrs = getattr(node, "attrs", None)
1753
+ tag = serialize_start_tag(name, attrs)
1754
+ if getattr(node, "_self_closing", False):
1755
+ tag = f"{tag[:-1]}/>"
1756
+ return tag
1757
+
1758
+ def _reconstruct_end_tag(node: SimpleDomNode) -> str | None:
1759
+ if getattr(node, "_self_closing", False):
1760
+ return None
1761
+
1762
+ # If explicit metadata says no end tag, respect it.
1763
+ if getattr(node, "_end_tag_present", None) is False:
1764
+ return None
1765
+
1766
+ # For nodes without metadata (or explicitly present), check void list.
1767
+ name = str(node.name)
1768
+ if name.startswith("#") or name == "!doctype":
1769
+ return None
1770
+
1771
+ if name.lower() in VOID_ELEMENTS:
1772
+ return None
1773
+
1774
+ return serialize_end_tag(name)
1775
+
1776
+ linkify_skip_tags: frozenset[str] = frozenset().union(
1777
+ *(t.skip_tags for t in walk_transforms if isinstance(t, _CompiledLinkifyTransform))
1778
+ )
1779
+ whitespace_skip_tags: frozenset[str] = frozenset().union(
1780
+ *(t.skip_tags for t in walk_transforms if isinstance(t, _CompiledCollapseWhitespaceTransform))
1781
+ )
1782
+
1783
+ # To preserve strict left-to-right semantics while still batching
1784
+ # compatible transforms into a single walk, we track the earliest
1785
+ # transform index that may run on a node.
1786
+ #
1787
+ # Example:
1788
+ # transforms=[Drop("a"), Linkify()]
1789
+ # Linkify introduces <a> elements. Those <a> nodes must not be
1790
+ # processed by earlier transforms (like Drop("a")), because Drop has
1791
+ # already run conceptually.
1792
+ created_start_index: dict[int, int] = {}
1793
+
1794
+ def _mark_start(n: object, start_index: int) -> None:
1795
+ key = id(n)
1796
+ created_start_index[key] = max(created_start_index.get(key, 0), start_index)
1797
+
1798
+ def _apply_fused_sanitize(
1799
+ node: SimpleDomNode,
1800
+ t: _CompiledSanitizeTransform,
1801
+ parent: SimpleDomNode,
1802
+ idx: int,
1803
+ ) -> bool:
1804
+ policy = t.policy
1805
+ report = t.report
1806
+ callback = t.callback
1807
+ name = node.name
1808
+
1809
+ # 1. Drop Nodes (Comments, Doctype, Foreign)
1810
+ if name.startswith("#") or name == "!doctype":
1811
+ if name == "#comment":
1812
+ if policy.drop_comments:
1813
+ if callback:
1814
+ callback(node)
1815
+ if report:
1816
+ report("Dropped comment", node=node)
1817
+ parent.remove_child(node)
1818
+ return True
1819
+ return False
1820
+ if name == "!doctype":
1821
+ if policy.drop_doctype:
1822
+ if callback:
1823
+ callback(node)
1824
+ if report:
1825
+ report("Dropped doctype", node=node)
1826
+ parent.remove_child(node)
1827
+ return True
1828
+ return False
1829
+ return False
1830
+
1831
+ # 2. Drop Foreign
1832
+ ns = node.namespace
1833
+ if ns and ns != "html":
1834
+ if policy.drop_foreign_namespaces:
1835
+ if callback:
1836
+ callback(node)
1837
+ tag = str(name).lower()
1838
+ msg = f"Unsafe tag '{tag}' (foreign namespace)"
1839
+ policy.handle_unsafe(msg, node=node)
1840
+ if report:
1841
+ report(msg, node=node)
1842
+ parent.remove_child(node)
1843
+ return True
1844
+
1845
+ # Element tag
1846
+ tag = str(name).lower()
1847
+
1848
+ # 3. Allowed Tags
1849
+ if tag in policy.allowed_tags:
1850
+ pass
1851
+ elif tag in policy.drop_content_tags:
1852
+ msg = f"Unsafe tag '{tag}' (dropped content)"
1853
+ policy.handle_unsafe(msg, node=node)
1854
+ if report:
1855
+ report(msg, node=node)
1856
+ if callback:
1857
+ callback(node)
1858
+ parent.remove_child(node)
1859
+ return True
1860
+ else:
1861
+ msg = f"Unsafe tag '{tag}' (not allowed)"
1862
+ policy.handle_unsafe(msg, node=node)
1863
+ if report:
1864
+ report(msg, node=node)
1865
+ if callback:
1866
+ callback(node)
1867
+
1868
+ handling = policy.disallowed_tag_handling
1869
+ if handling == "drop":
1870
+ parent.remove_child(node)
1871
+ return True
1872
+ if handling == "escape":
1873
+ raw_start = _raw_tag_text(node, "_start_tag_start", "_start_tag_end")
1874
+ if raw_start is None:
1875
+ raw_start = _reconstruct_start_tag(node)
1876
+ raw_end = _raw_tag_text(node, "_end_tag_start", "_end_tag_end")
1877
+ if raw_end is None:
1878
+ raw_end = _reconstruct_end_tag(node)
1879
+
1880
+ if raw_start: # pragma: no cover
1881
+ sn = TextNode(raw_start)
1882
+ _mark_start(sn, idx)
1883
+ parent.insert_before(sn, node)
1884
+
1885
+ moved: list[SimpleDomNode] = []
1886
+ if node.children:
1887
+ moved.extend(list(node.children))
1888
+ node.children = []
1889
+ if type(node) is TemplateNode and node.template_content:
1890
+ tc = node.template_content
1891
+ if tc.children:
1892
+ moved.extend(list(tc.children))
1893
+ tc.children = []
1894
+
1895
+ if moved:
1896
+ for child in moved:
1897
+ _mark_start(child, idx)
1898
+ parent.insert_before(child, node)
1899
+
1900
+ if raw_end:
1901
+ en = TextNode(raw_end)
1902
+ _mark_start(en, idx)
1903
+ parent.insert_before(en, node)
1904
+
1905
+ parent.remove_child(node)
1906
+ return True
1907
+
1908
+ # UNWRAP
1909
+ moved_nodes: list[SimpleDomNode] = []
1910
+ if node.children:
1911
+ moved_nodes.extend(list(node.children))
1912
+ node.children = []
1913
+ if type(node) is TemplateNode and node.template_content:
1914
+ tc = node.template_content
1915
+ if tc.children:
1916
+ moved_nodes.extend(list(tc.children))
1917
+ tc.children = []
1918
+
1919
+ if moved_nodes:
1920
+ for child in moved_nodes:
1921
+ _mark_start(child, idx)
1922
+ parent.insert_before(child, node)
1923
+ parent.remove_child(node)
1924
+ return True
1925
+
1926
+ # 4. Attributes
1927
+ attrs = node.attrs
1928
+ if not attrs:
1929
+ return False
1930
+
1931
+ changed_attrs = False
1932
+ out_attrs: dict[str, str | None] = {}
1933
+
1934
+ capture_rel = tag == "a" and bool(policy.force_link_rel)
1935
+ rel_input_value: str | None = None
1936
+
1937
+ # Optimized: pre-calc allowlist for this tag
1938
+ # Note: allowed_attributes values are sets.
1939
+ allowed_attr_set = policy._allowed_attrs_by_tag.get(tag, policy._allowed_attrs_global)
1940
+
1941
+ drop_regex = t.attr_drop_regex
1942
+
1943
+ for raw_key, original_value in attrs.items():
1944
+ value = original_value
1945
+ key = str(raw_key)
1946
+ if not key.strip():
1947
+ changed_attrs = True
1948
+ continue
1949
+ key_lower = key.lower() if not key.islower() else key
1950
+
1951
+ if capture_rel and key_lower == "rel":
1952
+ rel_input_value = str(value or "")
1953
+
1954
+ # DropAttrs
1955
+ if drop_regex and drop_regex.match(key_lower):
1956
+ msg = f"Unsafe attribute '{key_lower}' (matched forbidden pattern)"
1957
+ policy.handle_unsafe(msg, node=node)
1958
+ if report:
1959
+ report(msg, node=node)
1960
+ changed_attrs = True
1961
+ continue
1962
+
1963
+ # Allowlist
1964
+ if key_lower not in allowed_attr_set:
1965
+ msg = f"Unsafe attribute '{key_lower}' (not allowed)"
1966
+ policy.handle_unsafe(msg, node=node)
1967
+ changed_attrs = True
1968
+ continue
1969
+
1970
+ # DropUrlAttrs
1971
+ if key_lower in _URL_LIKE_ATTRS:
1972
+ url_rule = policy.url_policy.allow_rules.get((tag, key_lower))
1973
+ if url_rule is None:
1974
+ msg = f"Unsafe URL in attribute '{key_lower}' (no rule)"
1975
+ policy.handle_unsafe(msg, node=node)
1976
+ if report:
1977
+ report(msg, node=node)
1978
+ changed_attrs = True
1979
+ continue
1980
+
1981
+ val_str = str(value or "")
1982
+ if key_lower == "srcset":
1983
+ sanitized = _sanitize_srcset_value(
1984
+ url_policy=policy.url_policy,
1985
+ rule=url_rule,
1986
+ tag=tag,
1987
+ attr=key_lower,
1988
+ value=val_str,
1989
+ )
1990
+ else:
1991
+ sanitized = _sanitize_url_value(
1992
+ url_policy=policy.url_policy,
1993
+ rule=url_rule,
1994
+ tag=tag,
1995
+ attr=key_lower,
1996
+ value=val_str,
1997
+ )
1998
+
1999
+ if sanitized is None:
2000
+ msg = f"Unsafe URL in attribute '{key_lower}'"
2001
+ policy.handle_unsafe(msg, node=node)
2002
+ if report: # pragma: no cover
2003
+ report(msg, node=node)
2004
+ changed_attrs = True
2005
+ continue
2006
+
2007
+ if sanitized != val_str:
2008
+ changed_attrs = True
2009
+ value = sanitized
2010
+
2011
+ # AllowStyleAttrs
2012
+ if key_lower == "style" and policy.allowed_css_properties:
2013
+ val_str = str(value or "")
2014
+ sanitized_style = _sanitize_inline_style(
2015
+ allowed_css_properties=policy.allowed_css_properties, value=val_str
2016
+ )
2017
+ if sanitized_style is None:
2018
+ msg = "Unsafe inline style in attribute 'style'"
2019
+ policy.handle_unsafe(msg, node=node)
2020
+ if report:
2021
+ report(msg, node=node)
2022
+ changed_attrs = True
2023
+ continue
2024
+
2025
+ if sanitized_style != val_str:
2026
+ changed_attrs = True
2027
+ value = sanitized_style
2028
+
2029
+ # Ensure we flag changes if the key case is normalized
2030
+ if key != key_lower:
2031
+ changed_attrs = True
2032
+
2033
+ out_attrs[key_lower] = value
2034
+
2035
+ # MergeAttrs (a rel)
2036
+ if capture_rel:
2037
+ rel_attr = "rel"
2038
+ existing_raw = out_attrs.get(rel_attr)
2039
+ if existing_raw is None and rel_input_value is not None:
2040
+ existing_raw = rel_input_value
2041
+
2042
+ existing: list[str] = []
2043
+ if isinstance(existing_raw, str) and existing_raw:
2044
+ for tok in existing_raw.split():
2045
+ tt = tok.strip().lower()
2046
+ if tt and tt not in existing:
2047
+ existing.append(tt)
2048
+
2049
+ rel_changed = False
2050
+ # Ensure deterministic order for forced tokens
2051
+ for tok in sorted(policy.force_link_rel):
2052
+ if tok not in existing:
2053
+ existing.append(tok)
2054
+ rel_changed = True
2055
+
2056
+ normalized = " ".join(existing)
2057
+ if rel_changed or (existing_raw != normalized):
2058
+ out_attrs[rel_attr] = normalized
2059
+ changed_attrs = True
2060
+ if report and rel_changed: # pragma: no cover
2061
+ report("Merged tokens into attribute 'rel' on <a>", node=node)
2062
+
2063
+ if changed_attrs:
2064
+ node.attrs = out_attrs
2065
+ if callback:
2066
+ callback(node)
2067
+
2068
+ return False
2069
+
2070
def apply_to_children(parent: SimpleDomNode, *, skip_linkify: bool, skip_whitespace: bool) -> None:
    """Offer every child of ``parent`` to the pending walk transforms, depth-first.

    Each child is run through ``walk_transforms`` in order, beginning at the
    index recorded for it in ``created_start_index`` (0 when nothing is
    recorded).  When a transform removes or replaces the child, processing
    of that child stops and the same list position is examined again instead
    of being advanced.  A child that survives every transform is descended
    into, together with the ``template_content`` of a surviving
    ``TemplateNode``.

    Args:
        parent: Node whose ``children`` list is walked.
        skip_linkify: When true, "linkify" transforms leave text nodes alone.
        skip_whitespace: When true, "collapse_whitespace" transforms leave
            text nodes alone.
    """
    # NOTE(review): re-examining the same position after a removal assumes
    # remove_child()/insert_before() mutate this list object in place.
    siblings = parent.children
    if not siblings:
        return

    pos = 0
    while pos < len(siblings):
        node = siblings[pos]
        name = node.name
        # Both flags are derived from the name read once above.
        is_hash_node = name.startswith("#")
        non_element = is_hash_node or name == "!doctype"

        node_gone = False
        first_idx = created_start_index.get(id(node), 0)
        for idx in range(first_idx, len(walk_transforms)):
            tr = walk_transforms[idx]
            # Branch on the 'kind' string: this is the hot loop, and string
            # comparison avoids isinstance/class-hierarchy checks per node.
            kind: str = tr.kind

            # Sanitize (fused for performance)
            if kind == "sanitize":
                if TYPE_CHECKING:
                    tr = cast("_CompiledSanitizeTransform", tr)
                if not _apply_fused_sanitize(node, tr, parent, idx):
                    continue
                node_gone = True
                break

            # DropComments
            if kind == "drop_comments":
                if name != "#comment":
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledDropCommentsTransform", tr)
                if tr.callback is not None:
                    tr.callback(node)
                if tr.report is not None:
                    tr.report("Dropped comment", node=node)
                parent.remove_child(node)
                node_gone = True
                break

            # DropDoctype
            if kind == "drop_doctype":
                if name != "!doctype":
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledDropDoctypeTransform", tr)
                if tr.callback is not None:
                    tr.callback(node)  # pragma: no cover
                if tr.report is not None:
                    tr.report("Dropped doctype", node=node)  # pragma: no cover
                parent.remove_child(node)
                node_gone = True
                break

            # MergeAttrs
            if kind == "merge_attr_tokens":
                if non_element:
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledMergeAttrTokensTransform", tr)
                if str(name).lower() != tr.tag:
                    continue

                attrs = node.attrs
                raw_value = attrs.get(tr.attr)
                merged: list[str] = []
                if isinstance(raw_value, str) and raw_value:
                    # Lower-case the existing tokens and drop duplicates,
                    # keeping first-seen order.
                    lowered = (piece.strip().lower() for piece in raw_value.split())
                    merged = list(dict.fromkeys(piece for piece in lowered if piece))

                added_any = False
                for wanted in tr.tokens:
                    if wanted not in merged:
                        merged.append(wanted)
                        added_any = True

                joined = " ".join(merged)
                needs_write = (
                    added_any
                    or (raw_value is None and merged)
                    or (isinstance(raw_value, str) and raw_value != joined)
                )
                if needs_write:
                    attrs[tr.attr] = joined
                    if tr.callback is not None:
                        tr.callback(node)
                    if tr.report is not None:
                        tr.report(
                            f"Merged tokens into attribute '{tr.attr}' on <{tr.tag}>",
                            node=node,
                        )
                continue

            # CollapseWhitespace
            if kind == "collapse_whitespace":
                if name != "#text" or skip_whitespace:
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledCollapseWhitespaceTransform", tr)
                text = node.data or ""
                if not text:
                    continue
                squeezed = _collapse_html_space_characters(text)
                if squeezed == text:
                    continue
                # Callback and report run before the text is rewritten.
                if tr.callback is not None:
                    tr.callback(node)
                if tr.report is not None:
                    tr.report("Collapsed whitespace in text node", node=node)
                node.data = squeezed
                continue

            # Linkify
            if kind == "linkify":
                if name != "#text" or skip_linkify:
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledLinkifyTransform", tr)
                source_text = node.data or ""
                if not source_text:
                    continue
                matches = find_links_with_config(source_text, tr.config)
                if not matches:
                    continue

                if tr.callback is not None:
                    tr.callback(node)
                if tr.report is not None:
                    tr.report(
                        f"Linkified {len(matches)} link(s) in text node",
                        node=node,
                    )

                # Replace the text node with alternating text / <a> nodes.
                # The new nodes are registered to start at the transform
                # after this one.
                ns = parent.namespace or "html"
                cursor = 0
                for m in matches:
                    if m.start > cursor:
                        gap = TextNode(source_text[cursor : m.start])
                        _mark_start(gap, idx + 1)
                        parent.insert_before(gap, node)

                    anchor = ElementNode("a", {"href": m.href}, ns)
                    anchor.append_child(TextNode(m.text))
                    _mark_start(anchor, idx + 1)
                    parent.insert_before(anchor, node)
                    cursor = m.end

                if cursor < len(source_text):
                    remainder = TextNode(source_text[cursor:])
                    _mark_start(remainder, idx + 1)
                    parent.insert_before(remainder, node)

                parent.remove_child(node)
                node_gone = True
                break

            # Decide
            if kind == "decide":
                if TYPE_CHECKING:
                    tr = cast("_CompiledDecideTransform", tr)
                if not tr.all_nodes:
                    # Selector-scoped decisions only ever see elements.
                    if non_element:
                        continue
                    selector = tr.selector
                    if TYPE_CHECKING:
                        selector = cast("ParsedSelector", selector)
                    if not matcher.matches(node, selector):
                        continue
                verdict = tr.callback(node)

                if verdict is DecideAction.KEEP:
                    continue

                if verdict is DecideAction.EMPTY:
                    if name != "#text" and node.children:
                        for orphan in node.children:
                            orphan.parent = None
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        fragment = node.template_content
                        for orphan in fragment.children or []:
                            orphan.parent = None
                        fragment.children = []
                    continue

                if verdict is DecideAction.UNWRAP:
                    unwrapped_kids: list[SimpleDomNode] = []
                    if name != "#text" and node.children:
                        unwrapped_kids += node.children
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        fragment = node.template_content
                        if fragment.children:
                            unwrapped_kids += fragment.children
                            fragment.children = []
                    # Moved children are registered to start at this same transform.
                    for moved_child in unwrapped_kids:
                        _mark_start(moved_child, idx)
                        parent.insert_before(moved_child, node)
                    parent.remove_child(node)
                    node_gone = True
                    break

                if verdict is DecideAction.ESCAPE:
                    # Use the raw tag text when available, otherwise a
                    # reconstructed tag.
                    open_text = _raw_tag_text(node, "_start_tag_start", "_start_tag_end")
                    if open_text is None:
                        open_text = _reconstruct_start_tag(node)
                    close_text = _raw_tag_text(node, "_end_tag_start", "_end_tag_end")
                    if close_text is None:
                        close_text = _reconstruct_end_tag(node)

                    if open_text:
                        open_node = TextNode(open_text)
                        _mark_start(open_node, idx)
                        parent.insert_before(open_node, node)

                    escaped_kids: list[SimpleDomNode] = []
                    if name != "#text" and node.children:
                        escaped_kids += node.children
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        fragment = node.template_content
                        escaped_kids += fragment.children or []
                        fragment.children = []

                    for moved_child in escaped_kids:
                        _mark_start(moved_child, idx)
                        parent.insert_before(moved_child, node)

                    if close_text:
                        close_node = TextNode(close_text)
                        _mark_start(close_node, idx)
                        parent.insert_before(close_node, node)

                    parent.remove_child(node)
                    node_gone = True
                    break

                # DROP, and any unrecognised action value.
                parent.remove_child(node)
                node_gone = True
                break

            # EditAttrs (rewrite_attrs)
            if kind == "rewrite_attrs":
                if non_element:
                    continue
                if TYPE_CHECKING:
                    tr = cast("_CompiledRewriteAttrsTransform", tr)
                if not tr.all_nodes:
                    selector = tr.selector
                    if TYPE_CHECKING:
                        selector = cast("ParsedSelector", selector)
                    if not matcher.matches(node, selector):
                        continue
                replacement_attrs = tr.func(node)
                if replacement_attrs is not None:
                    node.attrs = replacement_attrs
                continue

            # Everything below is a selector transform; these apply to elements only.
            if TYPE_CHECKING:
                tr = cast("_CompiledSelectorTransform", tr)
            if non_element:
                continue
            if not matcher.matches(node, tr.selector):
                continue

            if kind == "setattrs":
                patch = cast("dict[str, str | None]", tr.payload)
                attrs = node.attrs
                touched = False
                for raw_key, raw_val in patch.items():
                    attr_key = str(raw_key)
                    attr_val = None if raw_val is None else str(raw_val)
                    if attrs.get(attr_key) != attr_val:
                        attrs[attr_key] = attr_val
                        touched = True
                if touched:
                    if tr.callback is not None:
                        tr.callback(node)
                    if tr.report is not None:
                        tag = str(node.name).lower()
                        tr.report(
                            f"Set attributes on <{tag}> (matched selector '{tr.selector_str}')", node=node
                        )
                continue

            if kind == "edit":
                edit_cb = cast("NodeCallback", tr.payload)
                edit_cb(node)
                continue

            if kind == "empty":
                own_children = node.children
                emptied = bool(own_children)
                if own_children:
                    for orphan in own_children:
                        orphan.parent = None
                    node.children = []
                if type(node) is TemplateNode and node.template_content is not None:
                    fragment = node.template_content
                    emptied = emptied or bool(fragment.children)
                    for orphan in fragment.children or []:
                        orphan.parent = None
                    fragment.children = []
                # Callback and report fire only if something was actually removed.
                if emptied:
                    if tr.callback is not None:
                        tr.callback(node)
                    if tr.report is not None:
                        tag = str(node.name).lower()
                        tr.report(f"Emptied <{tag}> (matched selector '{tr.selector_str}')", node=node)
                continue

            if kind == "drop":
                if tr.callback is not None:
                    tr.callback(node)
                if tr.report is not None:
                    tag = str(node.name).lower()
                    tr.report(f"Dropped <{tag}> (matched selector '{tr.selector_str}')", node=node)
                parent.remove_child(node)
                node_gone = True
                break

            # Remaining selector kind: "unwrap".
            if tr.callback is not None:
                tr.callback(node)
            if tr.report is not None:
                tag = str(node.name).lower()
                tr.report(f"Unwrapped <{tag}> (matched selector '{tr.selector_str}')", node=node)

            lifted_kids: list[SimpleDomNode] = []
            if node.children:
                lifted_kids += node.children
                node.children = []

            if type(node) is TemplateNode and node.template_content is not None:
                fragment = node.template_content
                lifted_kids += fragment.children or []
                fragment.children = []

            # Lifted children are registered to start at the transform after this one.
            for moved_child in lifted_kids:
                _mark_start(moved_child, idx + 1)
                parent.insert_before(moved_child, node)
            parent.remove_child(node)
            node_gone = True
            break

        if node_gone:
            # Do not advance: the same position is examined again.
            continue

        if is_hash_node:
            # Document containers (e.g. a nested #document-fragment) are
            # still traversed so their element descendants are reached.
            if node.children:
                apply_to_children(node, skip_linkify=skip_linkify, skip_whitespace=skip_whitespace)
        else:
            tag = node.name.lower()
            # Once set, a skip flag stays set for the whole subtree.
            child_skip = skip_linkify or (tag in linkify_skip_tags)
            child_skip_ws = skip_whitespace or (tag in whitespace_skip_tags)

            if node.children:
                apply_to_children(node, skip_linkify=child_skip, skip_whitespace=child_skip_ws)

            if type(node) is TemplateNode and node.template_content is not None:
                apply_to_children(
                    node.template_content, skip_linkify=child_skip, skip_whitespace=child_skip_ws
                )

        pos += 1
2436
+
2437
+ if type(root_node) is not TextNode:
2438
+ apply_to_children(root_node, skip_linkify=False, skip_whitespace=False)
2439
+
2440
+ # Root template nodes need special handling since the main walk
2441
+ # only visits children of the provided root.
2442
+ if type(root_node) is TemplateNode and root_node.template_content is not None:
2443
+ apply_to_children(root_node.template_content, skip_linkify=False, skip_whitespace=False)
2444
+
2445
def apply_prune_transforms(
    root_node: SimpleDomNode, prune_transforms: list[_CompiledPruneEmptyTransform]
) -> None:
    """Remove elements under ``root_node`` that match a prune selector and are empty.

    The tree is walked iteratively in post-order, so children are examined
    (and possibly removed) before their parent; a parent can therefore
    become empty as a result of its children being pruned.

    Args:
        root_node: Root of the subtree to process.  Nodes without a parent
            (the root included, if it has none) are never removed.
        prune_transforms: Prune rules tried in order for each element; the
            first rule that removes the element ends processing for it.
    """

    def _has_content(kids: list[SimpleDomNode] | None, strip_whitespace: bool) -> bool:
        # A text node counts when its data is non-empty (or, with
        # strip_whitespace, non-blank); any node whose name does not start
        # with "#" counts; other "#"-named nodes are ignored.
        for kid in kids or ():
            kid_name = kid.name
            if kid_name == "#text":
                text = str(getattr(kid, "data", "") or "")
                if strip_whitespace:
                    text = text.strip()
                if text:
                    return True
            elif not kid_name.startswith("#"):
                return True
        return False

    def _is_prunable(el: SimpleDomNode, *, strip_whitespace: bool) -> bool:
        # HTML void elements never count as empty here.
        if el.namespace == "html" and el.name.lower() in VOID_ELEMENTS:
            return False
        if _has_content(el.children, strip_whitespace):
            return False
        if (
            type(el) is TemplateNode
            and el.template_content is not None
            and _has_content(el.template_content.children, strip_whitespace)
        ):
            return False
        return True

    # Each stack entry is (node, expanded): a node is pushed once to expand
    # its children and handled when popped the second time.
    pending: list[tuple[SimpleDomNode, bool]] = [(root_node, False)]
    while pending:
        node, expanded = pending.pop()
        if not expanded:
            pending.append((node, True))

            for child in reversed(node.children or []):
                if isinstance(child, SimpleDomNode):
                    pending.append((child, False))

            if type(node) is TemplateNode and node.template_content is not None:
                pending.append((node.template_content, False))
            continue

        if node.parent is None:
            continue
        if node.name.startswith("#"):
            continue

        for rule in prune_transforms:
            if not matcher.matches(node, rule.selector):
                continue
            if not _is_prunable(node, strip_whitespace=rule.strip_whitespace):
                continue

            if rule.callback is not None:
                rule.callback(node)
            if rule.report is not None:
                tag = str(node.name).lower()
                rule.report(
                    f"Pruned empty <{tag}> (matched selector '{rule.selector_str}')",
                    node=node,
                )
            node.parent.remove_child(node)
            break
2511
+
2512
+ pending_walk: list[CompiledTransform] = []
2513
+
2514
+ i = 0
2515
+ while i < len(compiled):
2516
+ t = compiled[i]
2517
+ if isinstance(
2518
+ t,
2519
+ (
2520
+ _CompiledSelectorTransform,
2521
+ _CompiledDecideTransform,
2522
+ _CompiledRewriteAttrsTransform,
2523
+ _CompiledLinkifyTransform,
2524
+ _CompiledCollapseWhitespaceTransform,
2525
+ _CompiledDropCommentsTransform,
2526
+ _CompiledDropDoctypeTransform,
2527
+ _CompiledMergeAttrTokensTransform,
2528
+ _CompiledSanitizeTransform,
2529
+ ),
2530
+ ):
2531
+ pending_walk.append(t)
2532
+ i += 1
2533
+ continue
2534
+
2535
+ apply_walk_transforms(root, pending_walk)
2536
+ pending_walk = []
2537
+
2538
+ if isinstance(t, _CompiledStageBoundary):
2539
+ i += 1
2540
+ continue
2541
+
2542
+ if isinstance(t, _CompiledStageHookTransform):
2543
+ if t.callback is not None:
2544
+ t.callback(root)
2545
+ if t.report is not None:
2546
+ t.report(f"Stage {t.index + 1}", node=root)
2547
+ i += 1
2548
+ continue
2549
+
2550
+ if isinstance(t, _CompiledEditDocumentTransform):
2551
+ t.callback(root)
2552
+ i += 1
2553
+ continue
2554
+
2555
+ if isinstance(t, _CompiledPruneEmptyTransform):
2556
+ prune_batch: list[_CompiledPruneEmptyTransform] = [t]
2557
+ i += 1
2558
+ while i < len(compiled) and isinstance(compiled[i], _CompiledPruneEmptyTransform):
2559
+ prune_batch.append(cast("_CompiledPruneEmptyTransform", compiled[i]))
2560
+ i += 1
2561
+ apply_prune_transforms(root, prune_batch)
2562
+ continue
2563
+
2564
+ raise TypeError(f"Unsupported compiled transform: {type(t).__name__}")
2565
+
2566
+ apply_walk_transforms(root, pending_walk)
2567
+ finally:
2568
+ _ERROR_SINK.reset(token)