justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/transforms.py
ADDED
|
@@ -0,0 +1,2568 @@
|
|
|
1
|
+
"""Constructor-time DOM transforms.
|
|
2
|
+
|
|
3
|
+
These transforms are intended as a migration path for Bleach/html5lib-style
|
|
4
|
+
post-processing, but are implemented as DOM (tree) operations to match
|
|
5
|
+
JustHTML's architecture.
|
|
6
|
+
|
|
7
|
+
Safety model: transforms shape the in-memory tree; safe-by-default output is
|
|
8
|
+
still enforced by `to_html()`/`to_text()`/`to_markdown()` via sanitization.
|
|
9
|
+
|
|
10
|
+
Performance: selectors are compiled (parsed) once before application.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from contextvars import ContextVar
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import TYPE_CHECKING, ClassVar, Literal, cast
|
|
20
|
+
|
|
21
|
+
from .constants import VOID_ELEMENTS, WHITESPACE_PRESERVING_ELEMENTS
|
|
22
|
+
from .linkify import LinkifyConfig, find_links_with_config
|
|
23
|
+
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
24
|
+
from .sanitize import (
|
|
25
|
+
_URL_LIKE_ATTRS,
|
|
26
|
+
DEFAULT_POLICY,
|
|
27
|
+
SanitizationPolicy,
|
|
28
|
+
UrlPolicy,
|
|
29
|
+
_sanitize_inline_style,
|
|
30
|
+
_sanitize_srcset_value,
|
|
31
|
+
_sanitize_url_value,
|
|
32
|
+
)
|
|
33
|
+
from .selector import SelectorMatcher, parse_selector
|
|
34
|
+
from .serialize import serialize_end_tag, serialize_start_tag
|
|
35
|
+
from .tokens import ParseError
|
|
36
|
+
|
|
37
|
+
if TYPE_CHECKING:
    # Everything in this branch exists for static type checkers only; it is
    # never executed at runtime, so these names are usable in annotations only.
    from collections.abc import Callable, Collection
    from typing import Any, Protocol

    from .selector import ParsedSelector

    class NodeCallback(Protocol):
        """Callable taking a single node and returning nothing."""

        def __call__(self, node: SimpleDomNode) -> None: ...

    class EditAttrsCallback(Protocol):
        """Callable taking a node and returning an attribute dict, or None."""

        def __call__(self, node: SimpleDomNode) -> dict[str, str | None] | None: ...

    class ReportCallback(Protocol):
        """Callable taking a message plus an optional keyword-only `node`."""

        def __call__(self, msg: str, *, node: Any | None = None) -> None: ...
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# -----------------
|
|
54
|
+
# Public transforms
|
|
55
|
+
# -----------------
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Context-local list that receives errors emitted by transform callbacks.
# The default of None means no sink is active.
_ERROR_SINK: ContextVar[list[ParseError] | None] = ContextVar("justhtml_transform_error_sink", default=None)


def emit_error(
    code: str,
    *,
    node: SimpleDomNode | None = None,
    line: int | None = None,
    column: int | None = None,
    category: str = "transform",
    message: str | None = None,
) -> None:
    """Record a ParseError from inside a transform callback.

    The error is appended to the sink currently held by `_ERROR_SINK`. When
    no sink is active the call does nothing.

    Args:
        code: Error code; converted with `str()`.
        node: Optional node; when given, its `origin_line` / `origin_col`
            replace any explicit `line` / `column`.
        line: Line to report when no node is given.
        column: Column to report when no node is given.
        category: Error category; converted with `str()`.
        message: Optional message; falls back to the code text when None.
    """

    active_sink = _ERROR_SINK.get()
    if active_sink is None:
        return

    if node is not None:
        # The node's recorded origin wins over explicitly passed positions.
        line, column = node.origin_line, node.origin_col

    code_text = str(code)
    error = ParseError(
        code_text,
        line=line,
        column=column,
        category=str(category),
        message=code_text if message is None else str(message),
    )
    active_sink.append(error)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class _StrEnum(str, Enum):
    """Backport of enum.StrEnum (Python 3.11+).

    We support Python 3.10+, so we use this small mixin instead.

    As with the real ``StrEnum``, ``str()`` and ``format()`` of a member yield
    the member's value. A bare ``(str, Enum)`` mixin renders ``str(member)``
    as ``ClassName.MEMBER``, and its ``format()`` result has changed across
    Python releases; the overrides below make both stable.
    """

    def __str__(self) -> str:
        """Return the member's string value."""
        return str(self.value)

    def __format__(self, format_spec: str) -> str:
        """Format the member's string value using `format_spec`."""
        return format(str(self.value), format_spec)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class DecideAction(_StrEnum):
    """Closed set of actions a `Decide` callback may return.

    Each member is also a `str` equal to its lowercase value.
    """

    KEEP = "keep"
    DROP = "drop"
    UNWRAP = "unwrap"
    EMPTY = "empty"
    ESCAPE = "escape"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass(frozen=True, slots=True)
|
|
111
|
+
class SetAttrs:
|
|
112
|
+
selector: str
|
|
113
|
+
attrs: dict[str, str | None]
|
|
114
|
+
enabled: bool
|
|
115
|
+
callback: NodeCallback | None
|
|
116
|
+
report: ReportCallback | None
|
|
117
|
+
|
|
118
|
+
def __init__(
|
|
119
|
+
self,
|
|
120
|
+
selector: str,
|
|
121
|
+
*,
|
|
122
|
+
enabled: bool = True,
|
|
123
|
+
callback: NodeCallback | None = None,
|
|
124
|
+
report: ReportCallback | None = None,
|
|
125
|
+
attributes: dict[str, str | None] | None = None,
|
|
126
|
+
**attrs: str | None,
|
|
127
|
+
) -> None:
|
|
128
|
+
object.__setattr__(self, "selector", str(selector))
|
|
129
|
+
merged = dict(attributes) if attributes else {}
|
|
130
|
+
merged.update(attrs)
|
|
131
|
+
object.__setattr__(self, "attrs", merged)
|
|
132
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
133
|
+
object.__setattr__(self, "callback", callback)
|
|
134
|
+
object.__setattr__(self, "report", report)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@dataclass(frozen=True, slots=True)
|
|
138
|
+
class Drop:
|
|
139
|
+
selector: str
|
|
140
|
+
|
|
141
|
+
enabled: bool
|
|
142
|
+
callback: NodeCallback | None
|
|
143
|
+
report: ReportCallback | None
|
|
144
|
+
|
|
145
|
+
def __init__(
|
|
146
|
+
self,
|
|
147
|
+
selector: str,
|
|
148
|
+
*,
|
|
149
|
+
enabled: bool = True,
|
|
150
|
+
callback: NodeCallback | None = None,
|
|
151
|
+
report: ReportCallback | None = None,
|
|
152
|
+
) -> None:
|
|
153
|
+
object.__setattr__(self, "selector", str(selector))
|
|
154
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
155
|
+
object.__setattr__(self, "callback", callback)
|
|
156
|
+
object.__setattr__(self, "report", report)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass(frozen=True, slots=True)
|
|
160
|
+
class Unwrap:
|
|
161
|
+
selector: str
|
|
162
|
+
|
|
163
|
+
enabled: bool
|
|
164
|
+
callback: NodeCallback | None
|
|
165
|
+
report: ReportCallback | None
|
|
166
|
+
|
|
167
|
+
def __init__(
|
|
168
|
+
self,
|
|
169
|
+
selector: str,
|
|
170
|
+
*,
|
|
171
|
+
enabled: bool = True,
|
|
172
|
+
callback: NodeCallback | None = None,
|
|
173
|
+
report: ReportCallback | None = None,
|
|
174
|
+
) -> None:
|
|
175
|
+
object.__setattr__(self, "selector", str(selector))
|
|
176
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
177
|
+
object.__setattr__(self, "callback", callback)
|
|
178
|
+
object.__setattr__(self, "report", report)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass(frozen=True, slots=True)
|
|
182
|
+
class Empty:
|
|
183
|
+
selector: str
|
|
184
|
+
|
|
185
|
+
enabled: bool
|
|
186
|
+
callback: NodeCallback | None
|
|
187
|
+
report: ReportCallback | None
|
|
188
|
+
|
|
189
|
+
def __init__(
|
|
190
|
+
self,
|
|
191
|
+
selector: str,
|
|
192
|
+
*,
|
|
193
|
+
enabled: bool = True,
|
|
194
|
+
callback: NodeCallback | None = None,
|
|
195
|
+
report: ReportCallback | None = None,
|
|
196
|
+
) -> None:
|
|
197
|
+
object.__setattr__(self, "selector", str(selector))
|
|
198
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
199
|
+
object.__setattr__(self, "callback", callback)
|
|
200
|
+
object.__setattr__(self, "report", report)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass(frozen=True, slots=True)
|
|
204
|
+
class Edit:
|
|
205
|
+
selector: str
|
|
206
|
+
func: NodeCallback
|
|
207
|
+
enabled: bool
|
|
208
|
+
callback: NodeCallback | None
|
|
209
|
+
report: ReportCallback | None
|
|
210
|
+
|
|
211
|
+
def __init__(
|
|
212
|
+
self,
|
|
213
|
+
selector: str,
|
|
214
|
+
func: NodeCallback,
|
|
215
|
+
*,
|
|
216
|
+
enabled: bool = True,
|
|
217
|
+
callback: NodeCallback | None = None,
|
|
218
|
+
report: ReportCallback | None = None,
|
|
219
|
+
) -> None:
|
|
220
|
+
object.__setattr__(self, "selector", str(selector))
|
|
221
|
+
object.__setattr__(self, "func", func)
|
|
222
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
223
|
+
object.__setattr__(self, "callback", callback)
|
|
224
|
+
object.__setattr__(self, "report", report)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@dataclass(frozen=True, slots=True)
|
|
228
|
+
class EditDocument:
|
|
229
|
+
"""Edit the document root in-place.
|
|
230
|
+
|
|
231
|
+
The callback is invoked exactly once with the provided root node.
|
|
232
|
+
|
|
233
|
+
This is intended for operations that need access to the root container
|
|
234
|
+
(e.g. #document / #document-fragment) which selector-based transforms do
|
|
235
|
+
not visit.
|
|
236
|
+
"""
|
|
237
|
+
|
|
238
|
+
func: NodeCallback
|
|
239
|
+
enabled: bool
|
|
240
|
+
callback: NodeCallback | None
|
|
241
|
+
report: ReportCallback | None
|
|
242
|
+
|
|
243
|
+
def __init__(
|
|
244
|
+
self,
|
|
245
|
+
func: NodeCallback,
|
|
246
|
+
*,
|
|
247
|
+
enabled: bool = True,
|
|
248
|
+
callback: NodeCallback | None = None,
|
|
249
|
+
report: ReportCallback | None = None,
|
|
250
|
+
) -> None:
|
|
251
|
+
object.__setattr__(self, "func", func)
|
|
252
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
253
|
+
object.__setattr__(self, "callback", callback)
|
|
254
|
+
object.__setattr__(self, "report", report)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@dataclass(frozen=True, slots=True)
class Decide:
    """Choose a structural action per node by calling a function.

    A generic building block for policy-driven transforms.

    - With any selector other than "*", the selector is matched against
      element nodes by the normal selector engine.
    - With selector "*", the function is called for every node type, including
      text/comment/doctype and document container nodes.

    The function must return one of: Decide.KEEP, Decide.DROP, Decide.UNWRAP, Decide.EMPTY, Decide.ESCAPE.
    """

    selector: str  # selector text, coerced with str()
    func: Callable[[SimpleDomNode], DecideAction]  # decision callable, stored as given
    enabled: bool  # coerced with bool()
    callback: NodeCallback | None  # optional hook, stored as given
    report: ReportCallback | None  # optional hook, stored as given

    # Class-level aliases of the DecideAction members, so a decision function
    # can return e.g. `Decide.DROP`.
    KEEP: ClassVar[DecideAction] = DecideAction.KEEP
    DROP: ClassVar[DecideAction] = DecideAction.DROP
    UNWRAP: ClassVar[DecideAction] = DecideAction.UNWRAP
    EMPTY: ClassVar[DecideAction] = DecideAction.EMPTY
    ESCAPE: ClassVar[DecideAction] = DecideAction.ESCAPE

    def __init__(
        self,
        selector: str,
        func: Callable[[SimpleDomNode], DecideAction],
        *,
        enabled: bool = True,
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        """Store the normalized selector, the decision callable, flag and hooks."""
        # Frozen dataclass: fields must be written through object.__setattr__.
        for field_name, value in (
            ("selector", str(selector)),
            ("func", func),
            ("enabled", bool(enabled)),
            ("callback", callback),
            ("report", report),
        ):
            object.__setattr__(self, field_name, value)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
@dataclass(frozen=True, slots=True)
|
|
300
|
+
class EditAttrs:
|
|
301
|
+
"""Edit element attributes using a callback.
|
|
302
|
+
|
|
303
|
+
The callback is invoked for matching element/template nodes.
|
|
304
|
+
|
|
305
|
+
- Return None to leave attributes unchanged.
|
|
306
|
+
- Return a dict to replace the node's attributes with that dict.
|
|
307
|
+
"""
|
|
308
|
+
|
|
309
|
+
selector: str
|
|
310
|
+
func: EditAttrsCallback
|
|
311
|
+
enabled: bool
|
|
312
|
+
callback: NodeCallback | None
|
|
313
|
+
report: ReportCallback | None
|
|
314
|
+
|
|
315
|
+
def __init__(
|
|
316
|
+
self,
|
|
317
|
+
selector: str,
|
|
318
|
+
func: EditAttrsCallback,
|
|
319
|
+
*,
|
|
320
|
+
enabled: bool = True,
|
|
321
|
+
callback: NodeCallback | None = None,
|
|
322
|
+
report: ReportCallback | None = None,
|
|
323
|
+
) -> None:
|
|
324
|
+
object.__setattr__(self, "selector", str(selector))
|
|
325
|
+
object.__setattr__(self, "func", func)
|
|
326
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
327
|
+
object.__setattr__(self, "callback", callback)
|
|
328
|
+
object.__setattr__(self, "report", report)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# Backwards-compatible alias.
|
|
332
|
+
RewriteAttrs = EditAttrs
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
@dataclass(frozen=True, slots=True)
class Linkify:
    """Turn URLs/emails found in text nodes into links.

    The transform works on DOM text nodes (not raw HTML strings) and wraps
    each detected link in `<a href="...">...</a>`.
    """

    skip_tags: frozenset[str]  # lowercased tag names from `skip_tags`
    fuzzy_ip: bool  # coerced with bool()
    extra_tlds: frozenset[str]  # lowercased entries from `extra_tlds`
    enabled: bool  # coerced with bool()
    callback: NodeCallback | None  # optional hook, stored as given
    report: ReportCallback | None  # optional hook, stored as given

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            "a",
            *WHITESPACE_PRESERVING_ELEMENTS,
        ),
        enabled: bool = True,
        fuzzy_ip: bool = False,
        extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (),
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        """Lowercase the tag/TLD collections and store all options."""
        lowered_skip_tags = frozenset(str(tag).lower() for tag in skip_tags)
        lowered_tlds = frozenset(str(tld).lower() for tld in extra_tlds)

        # Frozen dataclass: bypass the blocked __setattr__.
        store = object.__setattr__
        store(self, "skip_tags", lowered_skip_tags)
        store(self, "fuzzy_ip", bool(fuzzy_ip))
        store(self, "extra_tlds", lowered_tlds)
        store(self, "enabled", bool(enabled))
        store(self, "callback", callback)
        store(self, "report", report)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# One or more consecutive HTML whitespace characters (space, tab, LF, CR, FF).
_HTML_SPACE_RUN_RE = re.compile(r"[ \t\n\r\f]+")


def _collapse_html_space_characters(text: str) -> str:
    """Collapse runs of HTML whitespace characters to a single space.

    This mirrors html5lib's whitespace filter behavior: it does not trim.

    Only space, tab, LF, CR and FF count as whitespace here; other Unicode
    spaces (e.g. U+00A0) are left untouched.

    Args:
        text: The text to normalize.

    Returns:
        `text` itself when nothing needs collapsing, otherwise a new string in
        which every whitespace run is replaced by one space.
    """

    # Fast path: no formatting whitespace and no double spaces. The check must
    # be for two adjacent spaces; testing for a single space would push
    # nearly all ordinary text onto the slow path.
    if "  " not in text and "\t" not in text and "\n" not in text and "\r" not in text and "\f" not in text:
        return text

    return _HTML_SPACE_RUN_RE.sub(" ", text)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
@dataclass(frozen=True, slots=True)
class CollapseWhitespace:
    """Collapse whitespace inside text nodes.

    Every run of HTML whitespace characters (space, tab, LF, CR, FF) becomes
    one space.

    Comparable to `html5lib.filters.whitespace.Filter`.
    """

    skip_tags: frozenset[str]  # lowercased tag names from `skip_tags`
    enabled: bool  # coerced with bool()
    callback: NodeCallback | None  # optional hook, stored as given
    report: ReportCallback | None  # optional hook, stored as given

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            *WHITESPACE_PRESERVING_ELEMENTS,
            "title",
        ),
        enabled: bool = True,
        callback: NodeCallback | None = None,
        report: ReportCallback | None = None,
    ) -> None:
        """Lowercase `skip_tags` and store all options."""
        lowered_skip_tags = frozenset(str(tag).lower() for tag in skip_tags)

        # Frozen dataclass: fields must be written through object.__setattr__.
        for field_name, value in (
            ("skip_tags", lowered_skip_tags),
            ("enabled", bool(enabled)),
            ("callback", callback),
            ("report", report),
        ):
            object.__setattr__(self, field_name, value)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
@dataclass(frozen=True, slots=True)
|
|
430
|
+
class Sanitize:
|
|
431
|
+
"""Sanitize the in-memory tree.
|
|
432
|
+
|
|
433
|
+
This transform replaces the current tree with a sanitized clone using the
|
|
434
|
+
same sanitizer that powers `safe=True` serialization.
|
|
435
|
+
|
|
436
|
+
Notes:
|
|
437
|
+
- This runs once at parse/transform time.
|
|
438
|
+
- If you apply transforms after `Sanitize`, they may reintroduce unsafe
|
|
439
|
+
content. Use safe serialization (`safe=True`) if you need output safety.
|
|
440
|
+
"""
|
|
441
|
+
|
|
442
|
+
policy: SanitizationPolicy | None
|
|
443
|
+
enabled: bool
|
|
444
|
+
callback: NodeCallback | None
|
|
445
|
+
report: ReportCallback | None
|
|
446
|
+
|
|
447
|
+
def __init__(
|
|
448
|
+
self,
|
|
449
|
+
policy: SanitizationPolicy | None = None,
|
|
450
|
+
*,
|
|
451
|
+
enabled: bool = True,
|
|
452
|
+
callback: NodeCallback | None = None,
|
|
453
|
+
report: ReportCallback | None = None,
|
|
454
|
+
) -> None:
|
|
455
|
+
object.__setattr__(self, "policy", policy)
|
|
456
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
457
|
+
object.__setattr__(self, "callback", callback)
|
|
458
|
+
object.__setattr__(self, "report", report)
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
@dataclass(frozen=True, slots=True)
|
|
462
|
+
class DropComments:
|
|
463
|
+
"""Drop comment nodes (#comment)."""
|
|
464
|
+
|
|
465
|
+
enabled: bool
|
|
466
|
+
callback: NodeCallback | None
|
|
467
|
+
report: ReportCallback | None
|
|
468
|
+
|
|
469
|
+
def __init__(
|
|
470
|
+
self,
|
|
471
|
+
*,
|
|
472
|
+
enabled: bool = True,
|
|
473
|
+
callback: NodeCallback | None = None,
|
|
474
|
+
report: ReportCallback | None = None,
|
|
475
|
+
) -> None:
|
|
476
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
477
|
+
object.__setattr__(self, "callback", callback)
|
|
478
|
+
object.__setattr__(self, "report", report)
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
@dataclass(frozen=True, slots=True)
|
|
482
|
+
class DropDoctype:
|
|
483
|
+
"""Drop doctype nodes (!doctype)."""
|
|
484
|
+
|
|
485
|
+
enabled: bool
|
|
486
|
+
callback: NodeCallback | None
|
|
487
|
+
report: ReportCallback | None
|
|
488
|
+
|
|
489
|
+
def __init__(
|
|
490
|
+
self,
|
|
491
|
+
*,
|
|
492
|
+
enabled: bool = True,
|
|
493
|
+
callback: NodeCallback | None = None,
|
|
494
|
+
report: ReportCallback | None = None,
|
|
495
|
+
) -> None:
|
|
496
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
497
|
+
object.__setattr__(self, "callback", callback)
|
|
498
|
+
object.__setattr__(self, "report", report)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@dataclass(frozen=True, slots=True)
|
|
502
|
+
class DropForeignNamespaces:
|
|
503
|
+
"""Drop elements in non-HTML namespaces."""
|
|
504
|
+
|
|
505
|
+
enabled: bool
|
|
506
|
+
callback: NodeCallback | None
|
|
507
|
+
report: ReportCallback | None
|
|
508
|
+
|
|
509
|
+
def __init__(
|
|
510
|
+
self,
|
|
511
|
+
*,
|
|
512
|
+
enabled: bool = True,
|
|
513
|
+
callback: NodeCallback | None = None,
|
|
514
|
+
report: ReportCallback | None = None,
|
|
515
|
+
) -> None:
|
|
516
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
517
|
+
object.__setattr__(self, "callback", callback)
|
|
518
|
+
object.__setattr__(self, "report", report)
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
@dataclass(frozen=True, slots=True)
|
|
522
|
+
class DropAttrs:
|
|
523
|
+
"""Drop attributes whose names match simple patterns."""
|
|
524
|
+
|
|
525
|
+
selector: str
|
|
526
|
+
patterns: tuple[str, ...]
|
|
527
|
+
enabled: bool
|
|
528
|
+
callback: NodeCallback | None
|
|
529
|
+
report: ReportCallback | None
|
|
530
|
+
|
|
531
|
+
def __init__(
|
|
532
|
+
self,
|
|
533
|
+
selector: str,
|
|
534
|
+
*,
|
|
535
|
+
patterns: tuple[str, ...] = (),
|
|
536
|
+
enabled: bool = True,
|
|
537
|
+
callback: NodeCallback | None = None,
|
|
538
|
+
report: ReportCallback | None = None,
|
|
539
|
+
) -> None:
|
|
540
|
+
object.__setattr__(self, "selector", str(selector))
|
|
541
|
+
object.__setattr__(
|
|
542
|
+
self,
|
|
543
|
+
"patterns",
|
|
544
|
+
tuple(sorted({str(p).strip().lower() for p in patterns if str(p).strip()})),
|
|
545
|
+
)
|
|
546
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
547
|
+
object.__setattr__(self, "callback", callback)
|
|
548
|
+
object.__setattr__(self, "report", report)
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
@dataclass(frozen=True, slots=True)
|
|
552
|
+
class AllowlistAttrs:
|
|
553
|
+
"""Retain only allowlisted attributes by tag and global allowlist."""
|
|
554
|
+
|
|
555
|
+
selector: str
|
|
556
|
+
allowed_attributes: dict[str, set[str]]
|
|
557
|
+
enabled: bool
|
|
558
|
+
callback: NodeCallback | None
|
|
559
|
+
report: ReportCallback | None
|
|
560
|
+
|
|
561
|
+
def __init__(
|
|
562
|
+
self,
|
|
563
|
+
selector: str,
|
|
564
|
+
*,
|
|
565
|
+
allowed_attributes: dict[str, Collection[str]],
|
|
566
|
+
enabled: bool = True,
|
|
567
|
+
callback: NodeCallback | None = None,
|
|
568
|
+
report: ReportCallback | None = None,
|
|
569
|
+
) -> None:
|
|
570
|
+
normalized: dict[str, set[str]] = {}
|
|
571
|
+
for tag, attrs in allowed_attributes.items():
|
|
572
|
+
normalized[str(tag)] = {str(a).lower() for a in attrs}
|
|
573
|
+
object.__setattr__(self, "selector", str(selector))
|
|
574
|
+
object.__setattr__(self, "allowed_attributes", normalized)
|
|
575
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
576
|
+
object.__setattr__(self, "callback", callback)
|
|
577
|
+
object.__setattr__(self, "report", report)
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
@dataclass(frozen=True, slots=True)
|
|
581
|
+
class DropUrlAttrs:
|
|
582
|
+
"""Validate and rewrite/drop URL-valued attributes based on UrlPolicy rules."""
|
|
583
|
+
|
|
584
|
+
selector: str
|
|
585
|
+
url_policy: UrlPolicy
|
|
586
|
+
enabled: bool
|
|
587
|
+
callback: NodeCallback | None
|
|
588
|
+
report: ReportCallback | None
|
|
589
|
+
|
|
590
|
+
def __init__(
|
|
591
|
+
self,
|
|
592
|
+
selector: str,
|
|
593
|
+
*,
|
|
594
|
+
url_policy: UrlPolicy,
|
|
595
|
+
enabled: bool = True,
|
|
596
|
+
callback: NodeCallback | None = None,
|
|
597
|
+
report: ReportCallback | None = None,
|
|
598
|
+
) -> None:
|
|
599
|
+
object.__setattr__(self, "selector", str(selector))
|
|
600
|
+
object.__setattr__(self, "url_policy", url_policy)
|
|
601
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
602
|
+
object.__setattr__(self, "callback", callback)
|
|
603
|
+
object.__setattr__(self, "report", report)
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
@dataclass(frozen=True, slots=True)
|
|
607
|
+
class AllowStyleAttrs:
|
|
608
|
+
"""Sanitize inline style attributes when present."""
|
|
609
|
+
|
|
610
|
+
selector: str
|
|
611
|
+
allowed_css_properties: tuple[str, ...]
|
|
612
|
+
enabled: bool
|
|
613
|
+
callback: NodeCallback | None
|
|
614
|
+
report: ReportCallback | None
|
|
615
|
+
|
|
616
|
+
def __init__(
|
|
617
|
+
self,
|
|
618
|
+
selector: str,
|
|
619
|
+
*,
|
|
620
|
+
allowed_css_properties: Collection[str],
|
|
621
|
+
enabled: bool = True,
|
|
622
|
+
callback: NodeCallback | None = None,
|
|
623
|
+
report: ReportCallback | None = None,
|
|
624
|
+
) -> None:
|
|
625
|
+
object.__setattr__(self, "selector", str(selector))
|
|
626
|
+
object.__setattr__(
|
|
627
|
+
self,
|
|
628
|
+
"allowed_css_properties",
|
|
629
|
+
tuple(sorted({str(p).strip().lower() for p in allowed_css_properties if str(p).strip()})),
|
|
630
|
+
)
|
|
631
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
632
|
+
object.__setattr__(self, "callback", callback)
|
|
633
|
+
object.__setattr__(self, "report", report)
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
@dataclass(frozen=True, slots=True)
|
|
637
|
+
class MergeAttrs:
|
|
638
|
+
"""Merge tokens into a whitespace-delimited attribute without removing existing ones."""
|
|
639
|
+
|
|
640
|
+
tag: str
|
|
641
|
+
attr: str
|
|
642
|
+
tokens: tuple[str, ...]
|
|
643
|
+
enabled: bool
|
|
644
|
+
callback: NodeCallback | None
|
|
645
|
+
report: ReportCallback | None
|
|
646
|
+
|
|
647
|
+
def __init__(
|
|
648
|
+
self,
|
|
649
|
+
tag: str,
|
|
650
|
+
*,
|
|
651
|
+
attr: str,
|
|
652
|
+
tokens: Collection[str],
|
|
653
|
+
enabled: bool = True,
|
|
654
|
+
callback: NodeCallback | None = None,
|
|
655
|
+
report: ReportCallback | None = None,
|
|
656
|
+
) -> None:
|
|
657
|
+
object.__setattr__(self, "tag", str(tag).lower())
|
|
658
|
+
object.__setattr__(self, "attr", str(attr).lower())
|
|
659
|
+
object.__setattr__(self, "tokens", tuple(sorted({str(t).strip().lower() for t in tokens if str(t).strip()})))
|
|
660
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
661
|
+
object.__setattr__(self, "callback", callback)
|
|
662
|
+
object.__setattr__(self, "report", report)
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
@dataclass(frozen=True, slots=True)
|
|
666
|
+
class PruneEmpty:
|
|
667
|
+
"""Recursively drop empty elements.
|
|
668
|
+
|
|
669
|
+
This transform removes elements that are empty at that point in the
|
|
670
|
+
transform pipeline.
|
|
671
|
+
|
|
672
|
+
"Empty" means:
|
|
673
|
+
- no element children, and
|
|
674
|
+
- no non-whitespace text nodes (unless `strip_whitespace=False`).
|
|
675
|
+
|
|
676
|
+
Comments/doctypes are ignored when determining emptiness.
|
|
677
|
+
|
|
678
|
+
Notes:
|
|
679
|
+
- Pruning uses a post-order traversal to be correct.
|
|
680
|
+
"""
|
|
681
|
+
|
|
682
|
+
selector: str
|
|
683
|
+
strip_whitespace: bool
|
|
684
|
+
enabled: bool
|
|
685
|
+
callback: NodeCallback | None
|
|
686
|
+
report: ReportCallback | None
|
|
687
|
+
|
|
688
|
+
def __init__(
|
|
689
|
+
self,
|
|
690
|
+
selector: str,
|
|
691
|
+
*,
|
|
692
|
+
strip_whitespace: bool = True,
|
|
693
|
+
enabled: bool = True,
|
|
694
|
+
callback: NodeCallback | None = None,
|
|
695
|
+
report: ReportCallback | None = None,
|
|
696
|
+
) -> None:
|
|
697
|
+
object.__setattr__(self, "selector", str(selector))
|
|
698
|
+
object.__setattr__(self, "strip_whitespace", bool(strip_whitespace))
|
|
699
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
700
|
+
object.__setattr__(self, "callback", callback)
|
|
701
|
+
object.__setattr__(self, "report", report)
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
@dataclass(frozen=True, slots=True)
|
|
705
|
+
class Stage:
|
|
706
|
+
"""Group transforms into an explicit stage.
|
|
707
|
+
|
|
708
|
+
Stages are intended to make transform passes explicit and readable.
|
|
709
|
+
|
|
710
|
+
- Stages can be nested; nested stages are flattened.
|
|
711
|
+
- If at least one Stage is present at the top level of a transform list,
|
|
712
|
+
any top-level transforms around it are automatically grouped into
|
|
713
|
+
implicit stages.
|
|
714
|
+
"""
|
|
715
|
+
|
|
716
|
+
transforms: tuple[TransformSpec, ...]
|
|
717
|
+
enabled: bool
|
|
718
|
+
callback: NodeCallback | None
|
|
719
|
+
report: ReportCallback | None
|
|
720
|
+
|
|
721
|
+
def __init__(
|
|
722
|
+
self,
|
|
723
|
+
transforms: list[TransformSpec] | tuple[TransformSpec, ...],
|
|
724
|
+
*,
|
|
725
|
+
enabled: bool = True,
|
|
726
|
+
callback: NodeCallback | None = None,
|
|
727
|
+
report: ReportCallback | None = None,
|
|
728
|
+
) -> None:
|
|
729
|
+
object.__setattr__(self, "transforms", tuple(transforms))
|
|
730
|
+
object.__setattr__(self, "enabled", bool(enabled))
|
|
731
|
+
object.__setattr__(self, "callback", callback)
|
|
732
|
+
object.__setattr__(self, "report", report)
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
# -----------------
|
|
736
|
+
# Compilation
|
|
737
|
+
# -----------------
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
# Union of every concrete (non-Stage) transform spec class.
Transform = (
    SetAttrs | Drop | Unwrap | Empty | Edit | EditDocument | Decide | EditAttrs
    | Linkify | CollapseWhitespace | PruneEmpty | Sanitize
    | DropComments | DropDoctype | DropForeignNamespaces
    | DropAttrs | AllowlistAttrs | DropUrlAttrs | AllowStyleAttrs | MergeAttrs
)


# The same classes as a tuple, in the same order as the union above.
_TRANSFORM_CLASSES: tuple[type[object], ...] = (
    SetAttrs, Drop, Unwrap, Empty, Edit, EditDocument, Decide, EditAttrs,
    Linkify, CollapseWhitespace, PruneEmpty, Sanitize,
    DropComments, DropDoctype, DropForeignNamespaces,
    DropAttrs, AllowlistAttrs, DropUrlAttrs, AllowStyleAttrs, MergeAttrs,
)  # fmt: skip

# Anything accepted in a transform list: a single transform or a Stage.
TransformSpec = Transform | Stage
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
@dataclass(frozen=True, slots=True)
|
|
791
|
+
class _CompiledCollapseWhitespaceTransform:
|
|
792
|
+
kind: Literal["collapse_whitespace"]
|
|
793
|
+
skip_tags: frozenset[str]
|
|
794
|
+
callback: NodeCallback | None
|
|
795
|
+
report: ReportCallback | None
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
@dataclass(frozen=True, slots=True)
|
|
799
|
+
class _CompiledSelectorTransform:
|
|
800
|
+
kind: Literal["setattrs", "drop", "unwrap", "empty", "edit"]
|
|
801
|
+
selector_str: str
|
|
802
|
+
selector: ParsedSelector
|
|
803
|
+
payload: dict[str, str | None] | NodeCallback | None
|
|
804
|
+
callback: NodeCallback | None
|
|
805
|
+
report: ReportCallback | None
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
@dataclass(frozen=True, slots=True)
|
|
809
|
+
class _CompiledLinkifyTransform:
|
|
810
|
+
kind: Literal["linkify"]
|
|
811
|
+
skip_tags: frozenset[str]
|
|
812
|
+
config: LinkifyConfig
|
|
813
|
+
callback: NodeCallback | None
|
|
814
|
+
report: ReportCallback | None
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
@dataclass(frozen=True, slots=True)
|
|
818
|
+
class _CompiledEditDocumentTransform:
|
|
819
|
+
kind: Literal["edit_document"]
|
|
820
|
+
callback: NodeCallback
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
@dataclass(frozen=True, slots=True)
|
|
824
|
+
class _CompiledPruneEmptyTransform:
|
|
825
|
+
kind: Literal["prune_empty"]
|
|
826
|
+
selector_str: str
|
|
827
|
+
selector: ParsedSelector
|
|
828
|
+
strip_whitespace: bool
|
|
829
|
+
callback: NodeCallback | None
|
|
830
|
+
report: ReportCallback | None
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
@dataclass(frozen=True, slots=True)
|
|
834
|
+
class _CompiledStageBoundary:
|
|
835
|
+
kind: Literal["stage_boundary"]
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
@dataclass(frozen=True, slots=True)
|
|
839
|
+
class _CompiledDecideTransform:
|
|
840
|
+
kind: Literal["decide"]
|
|
841
|
+
selector_str: str
|
|
842
|
+
selector: ParsedSelector | None
|
|
843
|
+
all_nodes: bool
|
|
844
|
+
callback: Callable[[SimpleDomNode], DecideAction]
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
@dataclass(frozen=True, slots=True)
class _CompiledRewriteAttrsTransform:
    """Compiled record for a transform that rewrites a node's attributes.

    compile_transforms emits this for EditAttrs, DropAttrs, AllowlistAttrs,
    DropUrlAttrs and AllowStyleAttrs. Adjacent records with the same
    selector text and `all_nodes` flag are fused into one record whose
    `func` runs the original callables in order.
    """

    # Discriminator; always "rewrite_attrs".
    kind: Literal["rewrite_attrs"]
    # Original selector text; also the key used when fusing adjacent records.
    selector_str: str
    # Parsed selector, or None when `all_nodes` is True.
    selector: ParsedSelector | None
    # True when the selector is exactly "*" (after stripping whitespace).
    all_nodes: bool
    # Callable returning the replacement attrs mapping for a node, or None
    # when the node's attributes should be left unchanged.
    func: EditAttrsCallback
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
@dataclass(frozen=True, slots=True)
class _CompiledDropCommentsTransform:
    """Compiled record for a DropComments transform."""

    # Discriminator; always "drop_comments".
    kind: Literal["drop_comments"]
    # Optional node hook forwarded unchanged from the transform.
    callback: NodeCallback | None
    # Optional report hook forwarded unchanged from the transform.
    report: ReportCallback | None
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
@dataclass(frozen=True, slots=True)
class _CompiledDropDoctypeTransform:
    """Compiled record for a DropDoctype transform."""

    # Discriminator; always "drop_doctype".
    kind: Literal["drop_doctype"]
    # Optional node hook forwarded unchanged from the transform.
    callback: NodeCallback | None
    # Optional report hook forwarded unchanged from the transform.
    report: ReportCallback | None
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
@dataclass(frozen=True, slots=True)
class _CompiledMergeAttrTokensTransform:
    """Compiled record for a MergeAttrs transform.

    compile_transforms only emits this when the transform has at least one
    token; a MergeAttrs with no tokens compiles to nothing.
    """

    # Discriminator; always "merge_attr_tokens".
    kind: Literal["merge_attr_tokens"]
    # Copied from the transform's `tag`.
    tag: str
    # Copied from the transform's `attr`.
    attr: str
    # Copied from the transform's `tokens` (never empty here).
    # NOTE(review): presumably these tokens are merged into `attr` on
    # matching `tag` elements; confirm against the application pass.
    tokens: tuple[str, ...]
    # Optional node hook forwarded unchanged from the transform.
    callback: NodeCallback | None
    # Optional report hook forwarded unchanged from the transform.
    report: ReportCallback | None
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
@dataclass(frozen=True, slots=True)
class _CompiledSanitizeTransform:
    """Compiled record for a Sanitize transform."""

    # Discriminator; always "sanitize".
    kind: Literal["sanitize"]
    # The transform's policy, or DEFAULT_POLICY when none was supplied.
    policy: SanitizationPolicy
    # Regex compiled by compile_transforms from the fixed glob patterns
    # ("on*", "srcdoc", "*:*"); the type allows None.
    attr_drop_regex: re.Pattern[str] | None
    # Optional node hook forwarded unchanged from the transform.
    callback: NodeCallback | None
    # Optional report hook forwarded unchanged from the transform.
    report: ReportCallback | None
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
@dataclass(frozen=True, slots=True)
class _CompiledStageHookTransform:
    """Record emitted at the start of each top-level stage.

    compile_transforms places one of these before the compiled transforms
    of every stage, carrying that stage's hooks.
    """

    # Discriminator; always "stage_hook".
    kind: Literal["stage_hook"]
    # Zero-based position of the stage among the top-level stages.
    index: int
    # Optional node hook copied from the stage.
    callback: NodeCallback | None
    # Optional report hook copied from the stage.
    report: ReportCallback | None
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
# Union of every compiled record type; this is the element type of the list
# returned by compile_transforms and consumed by apply_compiled_transforms.
CompiledTransform = (
    _CompiledSelectorTransform
    | _CompiledDecideTransform
    | _CompiledRewriteAttrsTransform
    | _CompiledLinkifyTransform
    | _CompiledCollapseWhitespaceTransform
    | _CompiledPruneEmptyTransform
    | _CompiledEditDocumentTransform
    | _CompiledDropCommentsTransform
    | _CompiledDropDoctypeTransform
    | _CompiledMergeAttrTokensTransform
    | _CompiledSanitizeTransform
    | _CompiledStageHookTransform
    | _CompiledStageBoundary
)
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def _iter_flattened_transforms(specs: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[Transform]:
    """Return the transforms in *specs* with all Stage nesting removed.

    Stages are expanded depth-first and in order. A disabled Stage
    contributes nothing, including anything nested inside it. Non-Stage
    items are passed through untouched (their own ``enabled`` flag is not
    inspected here).
    """
    flat: list[Transform] = []

    def _collect(entries: list[TransformSpec] | tuple[TransformSpec, ...]) -> None:
        for entry in entries:
            if not isinstance(entry, Stage):
                flat.append(entry)
            elif entry.enabled:
                _collect(entry.transforms)

    _collect(specs)
    return flat
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
def _glob_match(pattern: str, text: str) -> bool:
|
|
930
|
+
"""Match a glob pattern against text.
|
|
931
|
+
|
|
932
|
+
Supported wildcards:
|
|
933
|
+
- '*' matches any sequence (including empty)
|
|
934
|
+
- '?' matches any single character
|
|
935
|
+
"""
|
|
936
|
+
|
|
937
|
+
if pattern == "*":
|
|
938
|
+
return True
|
|
939
|
+
if "*" not in pattern and "?" not in pattern:
|
|
940
|
+
return pattern == text
|
|
941
|
+
|
|
942
|
+
p_i = 0
|
|
943
|
+
t_i = 0
|
|
944
|
+
star_i = -1
|
|
945
|
+
match_i = 0
|
|
946
|
+
|
|
947
|
+
while t_i < len(text):
|
|
948
|
+
if p_i < len(pattern) and (pattern[p_i] == "?" or pattern[p_i] == text[t_i]):
|
|
949
|
+
p_i += 1
|
|
950
|
+
t_i += 1
|
|
951
|
+
continue
|
|
952
|
+
|
|
953
|
+
if p_i < len(pattern) and pattern[p_i] == "*":
|
|
954
|
+
star_i = p_i
|
|
955
|
+
match_i = t_i
|
|
956
|
+
p_i += 1
|
|
957
|
+
continue
|
|
958
|
+
|
|
959
|
+
if star_i != -1:
|
|
960
|
+
p_i = star_i + 1
|
|
961
|
+
match_i += 1
|
|
962
|
+
t_i = match_i
|
|
963
|
+
continue
|
|
964
|
+
|
|
965
|
+
return False
|
|
966
|
+
|
|
967
|
+
while p_i < len(pattern) and pattern[p_i] == "*":
|
|
968
|
+
p_i += 1
|
|
969
|
+
|
|
970
|
+
return p_i == len(pattern)
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
def _split_into_top_level_stages(specs: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[Stage]:
    """Partition *specs* into an ordered list of top-level stages.

    Auto-staging only applies when *specs* contains at least one enabled
    Stage at the top level; otherwise an empty list is returned. When it
    applies, each enabled Stage is kept as is, disabled Stages are dropped,
    and every run of consecutive non-Stage items is wrapped in a new Stage.
    """
    if not any(isinstance(spec, Stage) and spec.enabled for spec in specs):
        return []

    staged: list[Stage] = []
    loose: list[TransformSpec] = []

    for spec in specs:
        if not isinstance(spec, Stage):
            loose.append(spec)
            continue
        if not spec.enabled:
            continue
        if loose:
            staged.append(Stage(loose))
            # Rebind rather than clear: the Stage just created was handed
            # this list object.
            loose = []
        staged.append(spec)

    if loose:
        staged.append(Stage(loose))

    return staged
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def compile_transforms(transforms: list[TransformSpec] | tuple[TransformSpec, ...]) -> list[CompiledTransform]:
    """Compile user-facing transform specs into internal compiled records.

    Selectors are parsed here, once, and per-transform hooks/reporting are
    folded into closures where the compiled record has no slot for them.
    Disabled transforms and disabled stages produce no records.

    When at least one enabled Stage appears at the top level, the specs are
    split into stages; each stage yields a stage-hook record followed by its
    compiled transforms, and a stage-boundary record separates consecutive
    stages. Otherwise the flattened transforms are compiled in order, and
    adjacent attribute-rewriting records that share a selector are fused.

    Args:
        transforms: Transform specs, possibly containing (nested) stages.

    Returns:
        The compiled records, in application order. Empty when *transforms*
        is empty.

    Raises:
        TypeError: If an item is not one of the supported transform classes.
    """
    if not transforms:
        return []

    flattened = _iter_flattened_transforms(transforms)

    top_level_stages = _split_into_top_level_stages(transforms)
    if top_level_stages:
        # Stage is a pass boundary. Compile each stage separately and insert a
        # boundary marker so apply_compiled_transforms can flush batches.
        compiled_stage: list[CompiledTransform] = []
        for stage_i, stage in enumerate(top_level_stages):
            if stage_i:
                compiled_stage.append(_CompiledStageBoundary(kind="stage_boundary"))
            compiled_stage.append(
                _CompiledStageHookTransform(
                    kind="stage_hook",
                    index=stage_i,
                    callback=stage.callback,
                    report=stage.report,
                )
            )
            # Each inner transform is compiled through its own recursive call,
            # so the adjacent-record fusion below never spans two of them.
            for inner in _iter_flattened_transforms(stage.transforms):
                compiled_stage.extend(compile_transforms((inner,)))
        return compiled_stage

    compiled: list[CompiledTransform] = []

    def _append_compiled(item: CompiledTransform) -> None:
        # Optimization: fuse adjacent EditAttrs transforms that target the
        # same selector. This preserves left-to-right semantics but reduces
        # per-node selector matching and callback overhead.
        if (
            compiled
            and isinstance(item, _CompiledRewriteAttrsTransform)
            and isinstance(compiled[-1], _CompiledRewriteAttrsTransform)
        ):
            prev = compiled[-1]
            if prev.selector_str == item.selector_str and prev.all_nodes == item.all_nodes:
                prev_cb = prev.func
                next_cb = item.func

                # Callables are bound as default arguments so the closure
                # captures their current values.
                def _chained(
                    node: SimpleDomNode,
                    prev_cb: Callable[[SimpleDomNode], dict[str, str | None] | None] = prev_cb,
                    next_cb: Callable[[SimpleDomNode], dict[str, str | None] | None] = next_cb,
                ) -> dict[str, str | None] | None:
                    changed = False
                    out = prev_cb(node)
                    if out is not None:  # pragma: no cover
                        # Apply the first result immediately so the second
                        # callable sees the updated attributes.
                        node.attrs = out
                        changed = True
                    out = next_cb(node)
                    if out is not None:
                        node.attrs = out
                        changed = True
                    return node.attrs if changed else None

                compiled[-1] = _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=prev.selector_str,
                    selector=prev.selector,
                    all_nodes=prev.all_nodes,
                    func=_chained,
                )
                return

        compiled.append(item)

    for t in flattened:
        if not isinstance(t, _TRANSFORM_CLASSES):
            raise TypeError(f"Unsupported transform: {type(t).__name__}")
        if not t.enabled:
            continue
        if isinstance(t, SetAttrs):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="setattrs",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=t.attrs,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Drop):
            selector_str = t.selector

            # Fast-path: if selector is a simple comma-separated list of tag
            # names (e.g. "script, style"), avoid selector matching entirely.
            raw_parts = selector_str.split(",")
            tag_list: list[str] = []
            for part in raw_parts:
                p = part.strip().lower()
                if not p:
                    # An empty part disqualifies the whole selector from the
                    # fast path.
                    tag_list = []
                    break
                # Reject anything that isn't a plain tag name.
                if any(ch in p for ch in " .#[:>*+~\t\n\r\f"):
                    tag_list = []
                    break
                tag_list.append(p)

            if tag_list:
                tags = frozenset(tag_list)
                on_drop = t.callback
                on_report = t.report

                def _drop_if_tag(
                    node: SimpleDomNode,
                    tags: frozenset[str] = tags,
                    selector_str: str = selector_str,
                    on_drop: NodeCallback | None = on_drop,
                    on_report: ReportCallback | None = on_report,
                ) -> DecideAction:
                    name = node.name
                    # Names starting with "#" and the doctype are never
                    # dropped by a tag-name selector.
                    if name.startswith("#") or name == "!doctype":
                        return Decide.KEEP
                    tag = str(name).lower()
                    if tag not in tags:
                        return Decide.KEEP
                    if on_drop is not None:
                        on_drop(node)
                    if on_report is not None:
                        on_report(f"Dropped tag '{tag}' (matched selector '{selector_str}')", node=node)
                    return Decide.DROP

                compiled.append(
                    _CompiledDecideTransform(
                        kind="decide",
                        selector_str="*",
                        selector=None,
                        all_nodes=True,
                        callback=_drop_if_tag,
                    )
                )
                continue

            compiled.append(
                _CompiledSelectorTransform(
                    kind="drop",
                    selector_str=selector_str,
                    selector=parse_selector(selector_str),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Unwrap):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="unwrap",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Empty):
            compiled.append(
                _CompiledSelectorTransform(
                    kind="empty",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=None,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue
        if isinstance(t, Edit):
            selector_str = t.selector
            edit_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped(
                node: SimpleDomNode,
                edit_func: NodeCallback = edit_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> None:
                # Hook and report run before the user's edit function.
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    tag = str(node.name).lower()
                    on_report(f"Edited <{tag}> (matched selector '{selector_str}')", node=node)
                edit_func(node)

            # Hooks live inside the wrapped payload, so the record itself
            # carries none.
            compiled.append(
                _CompiledSelectorTransform(
                    kind="edit",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    payload=_wrapped,
                    callback=None,
                    report=None,
                )
            )
            continue

        if isinstance(t, EditDocument):
            edit_document_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_root(
                node: SimpleDomNode,
                edit_document_func: NodeCallback = edit_document_func,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> None:
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    on_report("Edited document root", node=node)
                edit_document_func(node)

            compiled.append(_CompiledEditDocumentTransform(kind="edit_document", callback=_wrapped_root))
            continue

        if isinstance(t, Decide):
            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            decide_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_decide(
                node: SimpleDomNode,
                decide_func: Callable[[SimpleDomNode], DecideAction] = decide_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> DecideAction:
                action = decide_func(node)
                # KEEP is the no-op outcome: no hook, no report.
                if action is DecideAction.KEEP:
                    return action
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    nm = node.name
                    label = str(nm).lower() if not nm.startswith("#") and nm != "!doctype" else str(nm)
                    on_report(f"Decide -> {action.value} '{label}' (matched selector '{selector_str}')", node=node)
                return action

            compiled.append(
                _CompiledDecideTransform(
                    kind="decide",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    callback=_wrapped_decide,
                )
            )
            continue

        if isinstance(t, EditAttrs):
            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            edit_attrs_func = t.func
            on_hook = t.callback
            on_report = t.report

            def _wrapped_attrs(
                node: SimpleDomNode,
                edit_attrs_func: EditAttrsCallback = edit_attrs_func,
                selector_str: str = selector_str,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                out = edit_attrs_func(node)
                # None means "no change": skip hook and report.
                if out is None:
                    return None
                if on_hook is not None:
                    on_hook(node)
                if on_report is not None:
                    tag = str(node.name).lower()
                    on_report(f"Edited attributes on <{tag}> (matched selector '{selector_str}')", node=node)
                return out

            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_wrapped_attrs,
                )
            )
            continue

        if isinstance(t, Linkify):
            compiled.append(
                _CompiledLinkifyTransform(
                    kind="linkify",
                    skip_tags=t.skip_tags,
                    config=LinkifyConfig(fuzzy_ip=t.fuzzy_ip, extra_tlds=t.extra_tlds),
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, CollapseWhitespace):
            compiled.append(
                _CompiledCollapseWhitespaceTransform(
                    kind="collapse_whitespace",
                    skip_tags=t.skip_tags,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, PruneEmpty):
            compiled.append(
                _CompiledPruneEmptyTransform(
                    kind="prune_empty",
                    selector_str=t.selector,
                    selector=parse_selector(t.selector),
                    strip_whitespace=t.strip_whitespace,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropComments):
            compiled.append(
                _CompiledDropCommentsTransform(
                    kind="drop_comments",
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropDoctype):
            compiled.append(
                _CompiledDropDoctypeTransform(
                    kind="drop_doctype",
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, DropForeignNamespaces):
            on_hook = t.callback
            on_report = t.report

            def _drop_foreign(
                node: SimpleDomNode,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> DecideAction:
                name = node.name
                if name.startswith("#") or name == "!doctype":
                    return Decide.KEEP
                ns = node.namespace
                # A namespace of None or "html" is kept; any other value
                # causes the node to be dropped.
                if ns not in (None, "html"):
                    if on_hook is not None:
                        on_hook(node)
                    if on_report is not None:
                        tag = str(name).lower()
                        on_report(f"Unsafe tag '{tag}' (foreign namespace)", node=node)
                    return Decide.DROP
                return Decide.KEEP

            compiled.append(
                _CompiledDecideTransform(
                    kind="decide",
                    selector_str="*",
                    selector=None,
                    all_nodes=True,
                    callback=_drop_foreign,
                )
            )
            continue

        if isinstance(t, DropAttrs):
            patterns = t.patterns
            on_hook = t.callback
            on_report = t.report

            # Optimize pattern matching: Compile all patterns into one regex
            compiled_regex = _compile_patterns_to_regex(patterns)

            def _drop_attrs(
                node: SimpleDomNode,
                patterns: tuple[str, ...] = patterns,
                compiled_regex: re.Pattern[str] | None = compiled_regex,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None

                if not patterns:
                    return None

                out: dict[str, str | None] = {}
                changed = False
                for raw_key, value in attrs.items():
                    # Blank attribute names are left out of `out` without
                    # marking a change, so they only disappear when some
                    # other attribute is dropped as well.
                    if not raw_key or not str(raw_key).strip():
                        continue
                    key = raw_key
                    if not key.islower():
                        key = key.lower()

                    if compiled_regex and compiled_regex.match(key):
                        if on_report is not None:
                            # Re-check to report which pattern matched (rare path)
                            found_pat = "?"
                            for pat in patterns:
                                if _glob_match(pat, key):  # pragma: no cover
                                    found_pat = pat
                                    break
                            on_report(
                                f"Unsafe attribute '{key}' (matched pattern '{found_pat}')",
                                node=node,
                            )
                        changed = True
                        continue

                    out[key] = value

                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)  # pragma: no cover
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_drop_attrs,
                )
            )
            continue

        if isinstance(t, AllowlistAttrs):
            allowed_attributes = t.allowed_attributes
            on_hook = t.callback
            on_report = t.report
            # Precompute, per tag, the union of the global ("*") allowlist
            # and the tag-specific one.
            allowed_global = allowed_attributes.get("*", set())
            allowed_by_tag: dict[str, set[str]] = {}
            for tag, attrs in allowed_attributes.items():
                if tag == "*":
                    continue
                allowed_by_tag[str(tag).lower()] = set(allowed_global).union(attrs)

            def _allowlist_attrs(
                node: SimpleDomNode,
                allowed_by_tag: dict[str, set[str]] = allowed_by_tag,
                allowed_global: set[str] = allowed_global,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None
                tag = str(node.name).lower()
                # Tags without their own entry fall back to the global set.
                allowed = allowed_by_tag.get(tag, allowed_global)

                changed = False
                out: dict[str, str | None] = {}
                for raw_key, value in attrs.items():
                    raw_key_str = str(raw_key)
                    if not raw_key_str.strip():
                        # Drop invalid attribute names like '' or whitespace-only.
                        changed = True
                        continue
                    key = raw_key_str
                    if not key.islower():
                        key = key.lower()
                        changed = True  # pragma: no cover
                    if key in allowed:
                        out[key] = value
                    else:
                        changed = True
                        if on_report is not None:
                            on_report(f"Unsafe attribute '{key}' (not allowed)", node=node)
                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)  # pragma: no cover
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_allowlist_attrs,
                )
            )
            continue

        if isinstance(t, DropUrlAttrs):
            url_policy = t.url_policy
            on_hook = t.callback
            on_report = t.report

            def _drop_url_attrs(
                node: SimpleDomNode,
                url_policy: UrlPolicy = url_policy,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs:
                    return None

                tag = str(node.name).lower()
                out = dict(attrs)
                changed = False
                # Iterate over a snapshot of the keys because `out` is
                # mutated inside the loop.
                for key in list(out.keys()):
                    if key not in _URL_LIKE_ATTRS:
                        continue

                    raw_value = out.get(key)
                    if raw_value is None:
                        if on_report is not None:  # pragma: no cover
                            on_report(f"Unsafe URL in attribute '{key}'", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    # URL-like attributes with no (tag, attribute) rule in
                    # the policy are removed.
                    rule = url_policy.allow_rules.get((tag, key))
                    if rule is None:
                        if on_report is not None:  # pragma: no cover
                            on_report(f"Unsafe URL in attribute '{key}' (no rule)", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    if key == "srcset":
                        sanitized = _sanitize_srcset_value(
                            url_policy=url_policy,
                            rule=rule,
                            tag=tag,
                            attr=key,
                            value=str(raw_value),
                        )
                    else:
                        sanitized = _sanitize_url_value(
                            url_policy=url_policy,
                            rule=rule,
                            tag=tag,
                            attr=key,
                            value=str(raw_value),
                        )

                    if sanitized is None:
                        if on_report is not None:
                            on_report(f"Unsafe URL in attribute '{key}'", node=node)
                        out.pop(key, None)
                        changed = True
                        continue

                    out[key] = sanitized

                    if raw_value != sanitized:
                        changed = True

                if not changed:
                    return None
                if on_hook is not None:
                    on_hook(node)
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_drop_url_attrs,
                )
            )
            continue

        if isinstance(t, AllowStyleAttrs):
            allowed_css_properties = t.allowed_css_properties
            on_hook = t.callback
            on_report = t.report

            def _allow_style_attrs(
                node: SimpleDomNode,
                allowed_css_properties: tuple[str, ...] = allowed_css_properties,
                on_hook: NodeCallback | None = on_hook,
                on_report: ReportCallback | None = on_report,
            ) -> dict[str, str | None] | None:
                attrs = node.attrs
                if not attrs or "style" not in attrs:
                    return None

                raw_value = attrs.get("style")
                # A style attribute with a None value is removed outright.
                if raw_value is None:
                    if on_report is not None:
                        on_report("Unsafe inline style in attribute 'style'", node=node)
                    out = dict(attrs)
                    out.pop("style", None)
                    if on_hook is not None:
                        on_hook(node)
                    return out

                sanitized_style = _sanitize_inline_style(
                    allowed_css_properties=allowed_css_properties, value=str(raw_value)
                )
                if sanitized_style is None:
                    if on_report is not None:
                        on_report("Unsafe inline style in attribute 'style'", node=node)
                    out = dict(attrs)
                    out.pop("style", None)
                    if on_hook is not None:
                        on_hook(node)
                    return out

                # A new mapping is returned even when the sanitized value
                # equals the original; only the hook is conditional.
                out = dict(attrs)
                out["style"] = sanitized_style
                if raw_value != sanitized_style and on_hook is not None:
                    on_hook(node)
                return out

            selector_str = t.selector
            all_nodes = selector_str.strip() == "*"
            _append_compiled(
                _CompiledRewriteAttrsTransform(
                    kind="rewrite_attrs",
                    selector_str=selector_str,
                    selector=None if all_nodes else parse_selector(selector_str),
                    all_nodes=all_nodes,
                    func=_allow_style_attrs,
                )
            )
            continue

        if isinstance(t, MergeAttrs):
            # Nothing to merge: emit no record at all.
            if not t.tokens:
                continue
            compiled.append(
                _CompiledMergeAttrTokensTransform(
                    kind="merge_attr_tokens",
                    tag=t.tag,
                    attr=t.attr,
                    tokens=t.tokens,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        if isinstance(t, Sanitize):  # pragma: no branch
            policy = t.policy or DEFAULT_POLICY

            # Hardcoded patterns from original usage
            attr_patterns = ("on*", "srcdoc", "*:*")
            attr_regex = _compile_patterns_to_regex(attr_patterns)

            _append_compiled(
                _CompiledSanitizeTransform(
                    kind="sanitize",
                    policy=policy,
                    attr_drop_regex=attr_regex,
                    callback=t.callback,
                    report=t.report,
                )
            )
            continue

        raise TypeError(f"Unsupported transform: {type(t).__name__}")  # pragma: no cover

    return compiled
|
|
1692
|
+
|
|
1693
|
+
|
|
1694
|
+
# -----------------
|
|
1695
|
+
# Application
|
|
1696
|
+
# -----------------
|
|
1697
|
+
|
|
1698
|
+
|
|
1699
|
+
def _compile_patterns_to_regex(patterns: tuple[str, ...]) -> re.Pattern[str] | None:
|
|
1700
|
+
if not patterns:
|
|
1701
|
+
return None
|
|
1702
|
+
parts: list[str] = []
|
|
1703
|
+
for p in patterns:
|
|
1704
|
+
regex = re.escape(p)
|
|
1705
|
+
regex = regex.replace(r"\*", ".*")
|
|
1706
|
+
regex = regex.replace(r"\?", ".")
|
|
1707
|
+
parts.append(regex)
|
|
1708
|
+
full = "^(?:" + "|".join(parts) + ")$"
|
|
1709
|
+
return re.compile(full)
|
|
1710
|
+
|
|
1711
|
+
|
|
1712
|
+
def apply_compiled_transforms(
|
|
1713
|
+
root: SimpleDomNode,
|
|
1714
|
+
compiled: list[CompiledTransform],
|
|
1715
|
+
*,
|
|
1716
|
+
errors: list[ParseError] | None = None,
|
|
1717
|
+
) -> None:
|
|
1718
|
+
if not compiled:
|
|
1719
|
+
return
|
|
1720
|
+
|
|
1721
|
+
token = _ERROR_SINK.set(errors)
|
|
1722
|
+
try:
|
|
1723
|
+
matcher = SelectorMatcher()
|
|
1724
|
+
|
|
1725
|
+
def apply_walk_transforms(root_node: SimpleDomNode, walk_transforms: list[CompiledTransform]) -> None:
|
|
1726
|
+
if not walk_transforms:
|
|
1727
|
+
return
|
|
1728
|
+
|
|
1729
|
+
def _raw_tag_text(node: SimpleDomNode, start_attr: str, end_attr: str) -> str | None:
|
|
1730
|
+
start = getattr(node, start_attr, None)
|
|
1731
|
+
end = getattr(node, end_attr, None)
|
|
1732
|
+
if start is None or end is None:
|
|
1733
|
+
return None
|
|
1734
|
+
src = node._source_html
|
|
1735
|
+
if src is None:
|
|
1736
|
+
cur: SimpleDomNode | None = node
|
|
1737
|
+
while cur is not None and src is None:
|
|
1738
|
+
cur = cur.parent
|
|
1739
|
+
if cur is None:
|
|
1740
|
+
break
|
|
1741
|
+
src = cur._source_html
|
|
1742
|
+
if src is not None:
|
|
1743
|
+
node._source_html = src
|
|
1744
|
+
if src is None:
|
|
1745
|
+
return None
|
|
1746
|
+
return src[start:end]
|
|
1747
|
+
|
|
1748
|
+
def _reconstruct_start_tag(node: SimpleDomNode) -> str | None:
    """Rebuild a start tag string for an element node from its name and attrs.

    Returns ``None`` for nodes whose name starts with ``#`` and for
    ``!doctype``. When the node has a truthy ``_self_closing`` attribute the
    final character of the serialized tag is replaced by ``/>``.
    """
    node_name = node.name
    if node_name.startswith("#") or node_name == "!doctype":
        return None

    rebuilt = serialize_start_tag(str(node_name), getattr(node, "attrs", None))
    if not getattr(node, "_self_closing", False):
        return rebuilt
    return f"{rebuilt[:-1]}/>"
|
|
1757
|
+
|
|
1758
|
+
def _reconstruct_end_tag(node: SimpleDomNode) -> str | None:
    """Rebuild an end tag string for an element node, if it should have one.

    Returns ``None`` when the node is flagged ``_self_closing``, when
    ``_end_tag_present`` is explicitly ``False``, when the name starts with
    ``#`` or is ``!doctype``, or when the lowercased name is a void element.
    """
    # Explicit per-node metadata wins over the generic void-element rule.
    if getattr(node, "_self_closing", False) or getattr(node, "_end_tag_present", None) is False:
        return None

    tag_name = str(node.name)
    if tag_name.startswith("#") or tag_name == "!doctype" or tag_name.lower() in VOID_ELEMENTS:
        return None

    return serialize_end_tag(tag_name)
|
|
1775
|
+
|
|
1776
|
+
linkify_skip_tags: frozenset[str] = frozenset().union(
|
|
1777
|
+
*(t.skip_tags for t in walk_transforms if isinstance(t, _CompiledLinkifyTransform))
|
|
1778
|
+
)
|
|
1779
|
+
whitespace_skip_tags: frozenset[str] = frozenset().union(
|
|
1780
|
+
*(t.skip_tags for t in walk_transforms if isinstance(t, _CompiledCollapseWhitespaceTransform))
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
# To preserve strict left-to-right semantics while still batching
# compatible transforms into a single walk, we track the earliest
# transform index that may run on a node.
#
# Example:
#   transforms=[Drop("a"), Linkify()]
# Linkify introduces <a> elements. Those <a> nodes must not be
# processed by earlier transforms (like Drop("a")), because Drop has
# already run conceptually.
created_start_index: dict[int, int] = {}

# The map above is keyed by id(node). An id is only unique among objects that
# are alive at the same time, so a marked node that is later removed from the
# tree and garbage-collected could hand its id to a freshly created node,
# which would then inherit a stale start index and skip transforms. Holding a
# reference to every marked node for the duration of the walk rules that out.
_marked_node_refs: list[object] = []


def _mark_start(n: object, start_index: int) -> None:
    """Record that transforms before ``start_index`` must not run on ``n``.

    Repeated marks for the same node keep the highest index seen so far.
    """
    key = id(n)
    if key not in created_start_index:
        # First mark for this object: pin it so its id cannot be recycled.
        _marked_node_refs.append(n)
    created_start_index[key] = max(created_start_index.get(key, 0), start_index)
|
|
1797
|
+
|
|
1798
|
+
def _apply_fused_sanitize(
    node: SimpleDomNode,
    t: _CompiledSanitizeTransform,
    parent: SimpleDomNode,
    idx: int,
) -> bool:
    """Apply the sanitize policy of ``t`` to a single node in one pass.

    Args:
        node: The node being visited (a child of ``parent``).
        t: Compiled sanitize transform supplying ``policy``, ``report``,
            ``callback`` and ``attr_drop_regex``.
        parent: The node whose child list contains ``node``.
        idx: Position of ``t`` in the walk's transform list; used to mark
            nodes this function creates or re-parents so the walk resumes on
            them at this transform.

    Returns:
        ``True`` when ``node`` was removed from ``parent`` (dropped, escaped
        or unwrapped); ``False`` when the node stays in place, with its
        attributes possibly rewritten.
    """
    policy = t.policy
    report = t.report
    callback = t.callback
    name = node.name

    # 1. Non-element nodes: only comments and doctypes can be dropped here;
    #    every other "#..." node (e.g. text) is left untouched.
    if name.startswith("#") or name == "!doctype":
        if name == "#comment":
            if policy.drop_comments:
                if callback:
                    callback(node)
                if report:
                    report("Dropped comment", node=node)
                parent.remove_child(node)
                return True
            return False
        if name == "!doctype":
            if policy.drop_doctype:
                if callback:
                    callback(node)
                if report:
                    report("Dropped doctype", node=node)
                parent.remove_child(node)
                return True
            return False
        return False

    # 2. Drop Foreign: elements outside the "html" namespace are removed when
    #    the policy says so; otherwise they fall through to the tag checks.
    ns = node.namespace
    if ns and ns != "html":
        if policy.drop_foreign_namespaces:
            if callback:
                callback(node)
            tag = str(name).lower()
            msg = f"Unsafe tag '{tag}' (foreign namespace)"
            policy.handle_unsafe(msg, node=node)
            if report:
                report(msg, node=node)
            parent.remove_child(node)
            return True

    # Element tag
    tag = str(name).lower()

    # 3. Allowed Tags
    if tag in policy.allowed_tags:
        pass
    elif tag in policy.drop_content_tags:
        # Element is removed together with its whole subtree.
        msg = f"Unsafe tag '{tag}' (dropped content)"
        policy.handle_unsafe(msg, node=node)
        if report:
            report(msg, node=node)
        if callback:
            callback(node)
        parent.remove_child(node)
        return True
    else:
        msg = f"Unsafe tag '{tag}' (not allowed)"
        policy.handle_unsafe(msg, node=node)
        if report:
            report(msg, node=node)
        if callback:
            callback(node)

        handling = policy.disallowed_tag_handling
        if handling == "drop":
            parent.remove_child(node)
            return True
        if handling == "escape":
            # Prefer the exact source text of the tags; fall back to a
            # re-serialized tag when source offsets are unavailable.
            raw_start = _raw_tag_text(node, "_start_tag_start", "_start_tag_end")
            if raw_start is None:
                raw_start = _reconstruct_start_tag(node)
            raw_end = _raw_tag_text(node, "_end_tag_start", "_end_tag_end")
            if raw_end is None:
                raw_end = _reconstruct_end_tag(node)

            # The tag markup becomes plain text nodes placed around the
            # element's former children.
            if raw_start:  # pragma: no cover
                sn = TextNode(raw_start)
                _mark_start(sn, idx)
                parent.insert_before(sn, node)

            moved: list[SimpleDomNode] = []
            if node.children:
                moved.extend(list(node.children))
                node.children = []
            if type(node) is TemplateNode and node.template_content:
                tc = node.template_content
                if tc.children:
                    moved.extend(list(tc.children))
                    tc.children = []

            if moved:
                for child in moved:
                    # Marked with idx (not idx + 1): the walk will run this
                    # sanitize transform on the hoisted children as well.
                    _mark_start(child, idx)
                    parent.insert_before(child, node)

            if raw_end:
                en = TextNode(raw_end)
                _mark_start(en, idx)
                parent.insert_before(en, node)

            parent.remove_child(node)
            return True

        # UNWRAP: any other handling value hoists the children (including
        # template content) into the parent and removes the element itself.
        moved_nodes: list[SimpleDomNode] = []
        if node.children:
            moved_nodes.extend(list(node.children))
            node.children = []
        if type(node) is TemplateNode and node.template_content:
            tc = node.template_content
            if tc.children:
                moved_nodes.extend(list(tc.children))
                tc.children = []

        if moved_nodes:
            for child in moved_nodes:
                _mark_start(child, idx)
                parent.insert_before(child, node)
        parent.remove_child(node)
        return True

    # 4. Attributes (only reached for allowed tags)
    attrs = node.attrs
    if not attrs:
        return False

    # Attributes are rebuilt into out_attrs; node.attrs is only replaced at
    # the end if something actually changed.
    changed_attrs = False
    out_attrs: dict[str, str | None] = {}

    capture_rel = tag == "a" and bool(policy.force_link_rel)
    rel_input_value: str | None = None

    # Optimized: pre-calc allowlist for this tag
    # Note: allowed_attributes values are sets.
    allowed_attr_set = policy._allowed_attrs_by_tag.get(tag, policy._allowed_attrs_global)

    drop_regex = t.attr_drop_regex

    for raw_key, original_value in attrs.items():
        value = original_value
        key = str(raw_key)
        # Blank / whitespace-only attribute names are discarded silently.
        if not key.strip():
            changed_attrs = True
            continue
        key_lower = key.lower() if not key.islower() else key

        # Remember the incoming rel value before any filtering so it can
        # still be merged below even if the attribute itself gets dropped.
        if capture_rel and key_lower == "rel":
            rel_input_value = str(value or "")

        # DropAttrs
        if drop_regex and drop_regex.match(key_lower):
            msg = f"Unsafe attribute '{key_lower}' (matched forbidden pattern)"
            policy.handle_unsafe(msg, node=node)
            if report:
                report(msg, node=node)
            changed_attrs = True
            continue

        # Allowlist
        # NOTE(review): unlike the other rejection branches this one does not
        # call report(); confirm that is intentional.
        if key_lower not in allowed_attr_set:
            msg = f"Unsafe attribute '{key_lower}' (not allowed)"
            policy.handle_unsafe(msg, node=node)
            changed_attrs = True
            continue

        # DropUrlAttrs: URL-like attributes need an explicit (tag, attr) rule
        # and their value must survive URL sanitization.
        if key_lower in _URL_LIKE_ATTRS:
            url_rule = policy.url_policy.allow_rules.get((tag, key_lower))
            if url_rule is None:
                msg = f"Unsafe URL in attribute '{key_lower}' (no rule)"
                policy.handle_unsafe(msg, node=node)
                if report:
                    report(msg, node=node)
                changed_attrs = True
                continue

            val_str = str(value or "")
            if key_lower == "srcset":
                sanitized = _sanitize_srcset_value(
                    url_policy=policy.url_policy,
                    rule=url_rule,
                    tag=tag,
                    attr=key_lower,
                    value=val_str,
                )
            else:
                sanitized = _sanitize_url_value(
                    url_policy=policy.url_policy,
                    rule=url_rule,
                    tag=tag,
                    attr=key_lower,
                    value=val_str,
                )

            if sanitized is None:
                msg = f"Unsafe URL in attribute '{key_lower}'"
                policy.handle_unsafe(msg, node=node)
                if report:  # pragma: no cover
                    report(msg, node=node)
                changed_attrs = True
                continue

            if sanitized != val_str:
                changed_attrs = True
            value = sanitized

        # AllowStyleAttrs
        # NOTE(review): when policy.allowed_css_properties is empty, an
        # allowlisted 'style' attribute is kept without CSS filtering —
        # confirm the policy never allowlists 'style' in that configuration.
        if key_lower == "style" and policy.allowed_css_properties:
            val_str = str(value or "")
            sanitized_style = _sanitize_inline_style(
                allowed_css_properties=policy.allowed_css_properties, value=val_str
            )
            if sanitized_style is None:
                msg = "Unsafe inline style in attribute 'style'"
                policy.handle_unsafe(msg, node=node)
                if report:
                    report(msg, node=node)
                changed_attrs = True
                continue

            if sanitized_style != val_str:
                changed_attrs = True
            value = sanitized_style

        # Ensure we flag changes if the key case is normalized
        if key != key_lower:
            changed_attrs = True

        out_attrs[key_lower] = value

    # MergeAttrs (a rel): make sure every forced rel token is present on <a>.
    if capture_rel:
        rel_attr = "rel"
        existing_raw = out_attrs.get(rel_attr)
        # Fall back to the original rel value when it did not make it into
        # out_attrs (e.g. the attribute was filtered out above).
        if existing_raw is None and rel_input_value is not None:
            existing_raw = rel_input_value

        # Lowercase and de-duplicate the existing tokens, preserving order.
        existing: list[str] = []
        if isinstance(existing_raw, str) and existing_raw:
            for tok in existing_raw.split():
                tt = tok.strip().lower()
                if tt and tt not in existing:
                    existing.append(tt)

        rel_changed = False
        # Ensure deterministic order for forced tokens
        for tok in sorted(policy.force_link_rel):
            if tok not in existing:
                existing.append(tok)
                rel_changed = True

        normalized = " ".join(existing)
        if rel_changed or (existing_raw != normalized):
            out_attrs[rel_attr] = normalized
            changed_attrs = True
            if report and rel_changed:  # pragma: no cover
                report("Merged tokens into attribute 'rel' on <a>", node=node)

    if changed_attrs:
        node.attrs = out_attrs
        if callback:
            callback(node)

    # The node itself stays in the tree.
    return False
|
|
2069
|
+
|
|
2070
|
+
def apply_to_children(parent: SimpleDomNode, *, skip_linkify: bool, skip_whitespace: bool) -> None:
    """Run the batched walk transforms over every child of ``parent``.

    Each child is offered to the transforms in list order, starting at the
    index recorded for it in ``created_start_index`` (0 when unmarked). A
    transform that removes or replaces the child stops processing for that
    child, and the same list position is visited again because it now holds
    whatever took the child's place. Children that survive are then descended
    into recursively (one Python call per nesting level).

    Args:
        parent: Node whose ``children`` are processed.
        skip_linkify: When true, linkify transforms leave text children alone.
        skip_whitespace: When true, whitespace-collapsing transforms leave
            text children alone.
    """
    # NOTE(review): the loop holds on to this list object, so it presumably
    # relies on insert_before()/remove_child() mutating it in place — confirm.
    children = parent.children
    if not children:
        return

    i = 0
    while i < len(children):
        node = children[i]
        name = node.name

        # Set when the node was removed/replaced; position i must be re-read.
        changed = False
        start_at = created_start_index.get(id(node), 0)
        for idx in range(start_at, len(walk_transforms)):
            t = walk_transforms[idx]
            # Dispatch based on 'kind' string to avoid expensive isinstance/class hierarchy checks
            # in this hot loop (50k nodes * 10 transforms = 500k type checks otherwise).
            k: str = t.kind

            # Sanitize (Fused output for performance)
            if k == "sanitize":
                if TYPE_CHECKING:
                    t = cast("_CompiledSanitizeTransform", t)
                if _apply_fused_sanitize(node, t, parent, idx):
                    changed = True
                    break
                continue

            # DropComments
            if k == "drop_comments":
                if name == "#comment":
                    if TYPE_CHECKING:
                        t = cast("_CompiledDropCommentsTransform", t)
                    if t.callback is not None:
                        t.callback(node)
                    if t.report is not None:
                        t.report("Dropped comment", node=node)
                    parent.remove_child(node)
                    changed = True
                    break
                continue

            # DropDoctype
            if k == "drop_doctype":
                if name == "!doctype":
                    if TYPE_CHECKING:
                        t = cast("_CompiledDropDoctypeTransform", t)
                    if t.callback is not None:
                        t.callback(node)  # pragma: no cover
                    if t.report is not None:
                        t.report("Dropped doctype", node=node)  # pragma: no cover
                    parent.remove_child(node)
                    changed = True
                    break
                continue

            # MergeAttrs: ensure the configured tokens are present in a
            # space-separated attribute on elements with the configured tag.
            if k == "merge_attr_tokens":
                if not name.startswith("#") and name != "!doctype":
                    if TYPE_CHECKING:
                        t = cast("_CompiledMergeAttrTokensTransform", t)
                    if str(name).lower() == t.tag:
                        attrs = node.attrs
                        existing_raw = attrs.get(t.attr)
                        # Lowercase and de-duplicate current tokens, keeping order.
                        existing: list[str] = []
                        if isinstance(existing_raw, str) and existing_raw:
                            for tok in existing_raw.split():
                                tt = tok.strip().lower()
                                if tt and tt not in existing:
                                    existing.append(tt)

                        changed_rel = False
                        for tok in t.tokens:
                            if tok not in existing:
                                existing.append(tok)
                                changed_rel = True
                        normalized = " ".join(existing)
                        # Write back when tokens were added or when the stored
                        # value differs from its normalized form.
                        if (
                            changed_rel
                            or (existing_raw is None and existing)
                            or (isinstance(existing_raw, str) and existing_raw != normalized)
                        ):
                            attrs[t.attr] = normalized
                            if t.callback is not None:
                                t.callback(node)
                            if t.report is not None:
                                t.report(
                                    f"Merged tokens into attribute '{t.attr}' on <{t.tag}>",
                                    node=node,
                                )
                continue

            # CollapseWhitespace
            if k == "collapse_whitespace":
                if name == "#text" and not skip_whitespace:
                    if TYPE_CHECKING:
                        t = cast("_CompiledCollapseWhitespaceTransform", t)
                    data = node.data or ""
                    if data:
                        collapsed = _collapse_html_space_characters(data)
                        if collapsed != data:
                            # Callback/report see the node before its data is replaced.
                            if t.callback is not None:
                                t.callback(node)
                            if t.report is not None:
                                t.report("Collapsed whitespace in text node", node=node)
                            node.data = collapsed
                continue

            # Linkify: replace a text node by a run of text / <a> / text nodes.
            if k == "linkify":
                if name == "#text" and not skip_linkify:
                    if TYPE_CHECKING:
                        t = cast("_CompiledLinkifyTransform", t)
                    data = node.data or ""
                    if data:
                        matches = find_links_with_config(data, t.config)
                        if matches:
                            if t.callback is not None:
                                t.callback(node)
                            if t.report is not None:
                                t.report(
                                    f"Linkified {len(matches)} link(s) in text node",
                                    node=node,
                                )
                            # cursor tracks how much of data has been emitted.
                            cursor = 0
                            for m in matches:
                                if m.start > cursor:
                                    # Plain text between the previous match and this one.
                                    txt = TextNode(data[cursor : m.start])
                                    _mark_start(txt, idx + 1)
                                    parent.insert_before(txt, node)

                                ns = parent.namespace or "html"
                                a = ElementNode("a", {"href": m.href}, ns)
                                a.append_child(TextNode(m.text))
                                # idx + 1: new nodes are only seen by later transforms.
                                _mark_start(a, idx + 1)
                                parent.insert_before(a, node)
                                cursor = m.end

                            if cursor < len(data):
                                tail = TextNode(data[cursor:])
                                _mark_start(tail, idx + 1)
                                parent.insert_before(tail, node)

                            parent.remove_child(node)
                            changed = True
                            break
                continue

            # Decide: a user callback returns the action to take for the node.
            if k == "decide":
                if TYPE_CHECKING:
                    t = cast("_CompiledDecideTransform", t)
                if t.all_nodes:
                    action = t.callback(node)
                else:
                    # Selector mode only considers element nodes that match.
                    if name.startswith("#") or name == "!doctype":
                        continue
                    sel = t.selector
                    if TYPE_CHECKING:
                        sel = cast("ParsedSelector", sel)
                    if not matcher.matches(node, sel):
                        continue
                    action = t.callback(node)

                if action is DecideAction.KEEP:
                    continue

                if action is DecideAction.EMPTY:
                    # Detach all children (and template content) but keep the node.
                    if name != "#text" and node.children:
                        for child in node.children:
                            child.parent = None
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        tc = node.template_content
                        for child in tc.children or []:
                            child.parent = None
                        tc.children = []
                    continue

                if action is DecideAction.UNWRAP:
                    moved_nodes: list[SimpleDomNode] = []
                    if name != "#text" and node.children:
                        moved_nodes.extend(list(node.children))
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        tc = node.template_content
                        if tc.children:
                            moved_nodes.extend(list(tc.children))
                            tc.children = []
                    if moved_nodes:
                        for child in moved_nodes:
                            # Marked with idx: hoisted children are offered to
                            # this same Decide transform when visited.
                            _mark_start(child, idx)
                            parent.insert_before(child, node)
                    parent.remove_child(node)
                    changed = True
                    break

                if action is DecideAction.ESCAPE:
                    # Same scheme as the sanitize "escape" handling: tag markup
                    # becomes text nodes surrounding the hoisted children.
                    raw_start = _raw_tag_text(node, "_start_tag_start", "_start_tag_end")
                    if raw_start is None:
                        raw_start = _reconstruct_start_tag(node)
                    raw_end = _raw_tag_text(node, "_end_tag_start", "_end_tag_end")
                    if raw_end is None:
                        raw_end = _reconstruct_end_tag(node)
                    if raw_start:
                        start_node = TextNode(raw_start)
                        _mark_start(start_node, idx)
                        parent.insert_before(start_node, node)

                    moved: list[SimpleDomNode] = []
                    if name != "#text" and node.children:
                        moved.extend(list(node.children))
                        node.children = []
                    if type(node) is TemplateNode and node.template_content is not None:
                        tc = node.template_content
                        tc_children = tc.children or []
                        moved.extend(tc_children)
                        tc.children = []

                    if moved:
                        for child in moved:
                            _mark_start(child, idx)
                            parent.insert_before(child, node)

                    if raw_end:
                        end_node = TextNode(raw_end)
                        _mark_start(end_node, idx)
                        parent.insert_before(end_node, node)

                    parent.remove_child(node)
                    changed = True
                    break

                # action == DROP (and any invalid value)
                parent.remove_child(node)
                changed = True
                break

            # EditAttrs (rewrite_attrs): t.func may return a replacement attrs
            # mapping; None leaves the attributes as they are.
            if k == "rewrite_attrs":
                if name.startswith("#") or name == "!doctype":
                    continue
                if TYPE_CHECKING:
                    t = cast("_CompiledRewriteAttrsTransform", t)
                if not t.all_nodes:
                    sel = t.selector
                    if TYPE_CHECKING:
                        sel = cast("ParsedSelector", sel)
                    if not matcher.matches(node, sel):
                        continue
                new_attrs = t.func(node)
                if new_attrs is not None:
                    node.attrs = new_attrs
                continue

            # Selector transforms: every remaining kind applies to element
            # nodes that match t.selector.
            if TYPE_CHECKING:
                t = cast("_CompiledSelectorTransform", t)
            if name.startswith("#") or name == "!doctype":
                continue

            if not matcher.matches(node, t.selector):
                continue

            if t.kind == "setattrs":
                patch = cast("dict[str, str | None]", t.payload)
                attrs = node.attrs
                changed_any = False
                # This loop rebinds k; that is harmless because k is reassigned
                # from t.kind at the top of every transform iteration and the
                # remaining checks in this iteration read t.kind directly.
                for k, v in patch.items():
                    key = str(k)
                    new_val = None if v is None else str(v)
                    if attrs.get(key) != new_val:
                        attrs[key] = new_val
                        changed_any = True
                if changed_any:
                    if t.callback is not None:
                        t.callback(node)
                    if t.report is not None:
                        tag = str(node.name).lower()
                        t.report(
                            f"Set attributes on <{tag}> (matched selector '{t.selector_str}')", node=node
                        )
                continue

            if t.kind == "edit":
                cb = cast("NodeCallback", t.payload)
                cb(node)
                continue

            if t.kind == "empty":
                had_children = bool(node.children)
                if node.children:
                    for child in node.children:
                        child.parent = None
                    node.children = []
                if type(node) is TemplateNode and node.template_content is not None:
                    tc = node.template_content
                    had_children = had_children or bool(tc.children)
                    for child in tc.children or []:
                        child.parent = None
                    tc.children = []
                # Only signal callback/report when something was actually removed.
                if had_children:
                    if t.callback is not None:
                        t.callback(node)
                    if t.report is not None:
                        tag = str(node.name).lower()
                        t.report(f"Emptied <{tag}> (matched selector '{t.selector_str}')", node=node)
                continue

            if t.kind == "drop":
                if t.callback is not None:
                    t.callback(node)
                if t.report is not None:
                    tag = str(node.name).lower()
                    t.report(f"Dropped <{tag}> (matched selector '{t.selector_str}')", node=node)
                parent.remove_child(node)
                changed = True
                break

            # t.kind == "unwrap".
            if t.callback is not None:
                t.callback(node)
            if t.report is not None:
                tag = str(node.name).lower()
                t.report(f"Unwrapped <{tag}> (matched selector '{t.selector_str}')", node=node)

            moved_nodes_unwrap: list[SimpleDomNode] = []
            if node.children:
                moved_nodes_unwrap.extend(list(node.children))
                node.children = []

            if type(node) is TemplateNode and node.template_content is not None:
                tc = node.template_content
                tc_children = tc.children or []
                moved_nodes_unwrap.extend(tc_children)
                tc.children = []

            if moved_nodes_unwrap:
                for child in moved_nodes_unwrap:
                    # NOTE(review): marked idx + 1 here, whereas the Decide and
                    # sanitize unwrap paths mark idx. Hoisted children therefore
                    # start after this transform and are not offered to it or
                    # to any earlier transform in the batch — confirm intended.
                    _mark_start(child, idx + 1)
                    parent.insert_before(child, node)
            parent.remove_child(node)
            changed = True
            break

        if changed:
            # Position i now holds a different node (or the list shrank);
            # re-examine it without advancing.
            continue

        if name.startswith("#"):
            # Document containers (e.g. nested #document-fragment) should
            # still be traversed to reach their element descendants.
            if node.children:
                apply_to_children(node, skip_linkify=skip_linkify, skip_whitespace=skip_whitespace)
        else:
            # Skip flags are inherited and additionally switched on for
            # descendants of any tag listed in the transforms' skip_tags.
            tag = node.name.lower()
            child_skip = skip_linkify or (tag in linkify_skip_tags)
            child_skip_ws = skip_whitespace or (tag in whitespace_skip_tags)

            if node.children:
                apply_to_children(node, skip_linkify=child_skip, skip_whitespace=child_skip_ws)

            if type(node) is TemplateNode and node.template_content is not None:
                apply_to_children(
                    node.template_content, skip_linkify=child_skip, skip_whitespace=child_skip_ws
                )

        i += 1
|
|
2436
|
+
|
|
2437
|
+
if type(root_node) is not TextNode:
|
|
2438
|
+
apply_to_children(root_node, skip_linkify=False, skip_whitespace=False)
|
|
2439
|
+
|
|
2440
|
+
# Root template nodes need special handling since the main walk
|
|
2441
|
+
# only visits children of the provided root.
|
|
2442
|
+
if type(root_node) is TemplateNode and root_node.template_content is not None:
|
|
2443
|
+
apply_to_children(root_node.template_content, skip_linkify=False, skip_whitespace=False)
|
|
2444
|
+
|
|
2445
|
+
def apply_prune_transforms(
    root_node: SimpleDomNode, prune_transforms: list[_CompiledPruneEmptyTransform]
) -> None:
    """Remove matching elements that have no meaningful content.

    The tree below ``root_node`` is walked iteratively in post-order, so an
    element is evaluated after its descendants and can become prunable once
    they have been removed. For each element the first transform whose
    selector matches and for which the element is empty removes it; the
    transform's callback and report hooks run just before removal.
    """

    def _carries_content(kids: list[SimpleDomNode] | None, trim: bool) -> bool:
        # Any element child counts as content; text counts when non-empty
        # (after stripping if requested); other "#..." nodes never count.
        for kid in kids or ():
            kind = kid.name
            if kind == "#text":
                text = str(getattr(kid, "data", "") or "")
                visible = text.strip() if trim else text
                if visible:
                    return True
            elif not kind.startswith("#"):
                return True
        return False

    def _prunable(el: SimpleDomNode, *, strip_whitespace: bool) -> bool:
        # HTML void elements are never considered empty.
        if el.namespace == "html" and el.name.lower() in VOID_ELEMENTS:
            return False
        if _carries_content(el.children, strip_whitespace):
            return False
        if type(el) is TemplateNode and el.template_content is not None:
            return not _carries_content(el.template_content.children, strip_whitespace)
        return True

    # Each entry is (node, expanded); a node is pushed back as expanded before
    # its children so that it is handled only after all of them.
    pending: list[tuple[SimpleDomNode, bool]] = [(root_node, False)]
    while pending:
        current, expanded = pending.pop()

        if not expanded:
            pending.append((current, True))
            for kid in reversed(current.children or []):
                if isinstance(kid, SimpleDomNode):
                    pending.append((kid, False))
            if type(current) is TemplateNode and current.template_content is not None:
                pending.append((current.template_content, False))
            continue

        # Detached nodes and non-element nodes cannot be pruned.
        if current.parent is None or current.name.startswith("#"):
            continue

        for rule in prune_transforms:
            if not matcher.matches(current, rule.selector):
                continue
            if not _prunable(current, strip_whitespace=rule.strip_whitespace):
                continue
            if rule.callback is not None:
                rule.callback(current)
            if rule.report is not None:
                tag = str(current.name).lower()
                rule.report(
                    f"Pruned empty <{tag}> (matched selector '{rule.selector_str}')",
                    node=current,
                )
            current.parent.remove_child(current)
            break
|
|
2511
|
+
|
|
2512
|
+
pending_walk: list[CompiledTransform] = []
|
|
2513
|
+
|
|
2514
|
+
i = 0
|
|
2515
|
+
while i < len(compiled):
|
|
2516
|
+
t = compiled[i]
|
|
2517
|
+
if isinstance(
|
|
2518
|
+
t,
|
|
2519
|
+
(
|
|
2520
|
+
_CompiledSelectorTransform,
|
|
2521
|
+
_CompiledDecideTransform,
|
|
2522
|
+
_CompiledRewriteAttrsTransform,
|
|
2523
|
+
_CompiledLinkifyTransform,
|
|
2524
|
+
_CompiledCollapseWhitespaceTransform,
|
|
2525
|
+
_CompiledDropCommentsTransform,
|
|
2526
|
+
_CompiledDropDoctypeTransform,
|
|
2527
|
+
_CompiledMergeAttrTokensTransform,
|
|
2528
|
+
_CompiledSanitizeTransform,
|
|
2529
|
+
),
|
|
2530
|
+
):
|
|
2531
|
+
pending_walk.append(t)
|
|
2532
|
+
i += 1
|
|
2533
|
+
continue
|
|
2534
|
+
|
|
2535
|
+
apply_walk_transforms(root, pending_walk)
|
|
2536
|
+
pending_walk = []
|
|
2537
|
+
|
|
2538
|
+
if isinstance(t, _CompiledStageBoundary):
|
|
2539
|
+
i += 1
|
|
2540
|
+
continue
|
|
2541
|
+
|
|
2542
|
+
if isinstance(t, _CompiledStageHookTransform):
|
|
2543
|
+
if t.callback is not None:
|
|
2544
|
+
t.callback(root)
|
|
2545
|
+
if t.report is not None:
|
|
2546
|
+
t.report(f"Stage {t.index + 1}", node=root)
|
|
2547
|
+
i += 1
|
|
2548
|
+
continue
|
|
2549
|
+
|
|
2550
|
+
if isinstance(t, _CompiledEditDocumentTransform):
|
|
2551
|
+
t.callback(root)
|
|
2552
|
+
i += 1
|
|
2553
|
+
continue
|
|
2554
|
+
|
|
2555
|
+
if isinstance(t, _CompiledPruneEmptyTransform):
|
|
2556
|
+
prune_batch: list[_CompiledPruneEmptyTransform] = [t]
|
|
2557
|
+
i += 1
|
|
2558
|
+
while i < len(compiled) and isinstance(compiled[i], _CompiledPruneEmptyTransform):
|
|
2559
|
+
prune_batch.append(cast("_CompiledPruneEmptyTransform", compiled[i]))
|
|
2560
|
+
i += 1
|
|
2561
|
+
apply_prune_transforms(root, prune_batch)
|
|
2562
|
+
continue
|
|
2563
|
+
|
|
2564
|
+
raise TypeError(f"Unsupported compiled transform: {type(t).__name__}")
|
|
2565
|
+
|
|
2566
|
+
apply_walk_transforms(root, pending_walk)
|
|
2567
|
+
finally:
|
|
2568
|
+
_ERROR_SINK.reset(token)
|