justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/transforms.py
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
"""Constructor-time DOM transforms.
|
|
2
|
+
|
|
3
|
+
These transforms are intended as a migration path for Bleach/html5lib-style
|
|
4
|
+
post-processing, but are implemented as DOM (tree) operations to match
|
|
5
|
+
JustHTML's architecture.
|
|
6
|
+
|
|
7
|
+
Safety model: transforms shape the in-memory tree; safe-by-default output is
|
|
8
|
+
still enforced by `to_html()`/`to_text()`/`to_markdown()` via sanitization.
|
|
9
|
+
|
|
10
|
+
Performance: selectors are compiled (parsed) once before application.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import TYPE_CHECKING, Literal, cast
|
|
17
|
+
|
|
18
|
+
from .constants import WHITESPACE_PRESERVING_ELEMENTS
|
|
19
|
+
from .linkify import LinkifyConfig, find_links_with_config
|
|
20
|
+
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
21
|
+
from .sanitize import SanitizationPolicy, _sanitize
|
|
22
|
+
from .selector import SelectorMatcher, parse_selector
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from collections.abc import Callable
|
|
26
|
+
|
|
27
|
+
from .selector import ParsedSelector
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# -----------------
|
|
31
|
+
# Public transforms
|
|
32
|
+
# -----------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True, slots=True)
|
|
36
|
+
class SetAttrs:
|
|
37
|
+
selector: str
|
|
38
|
+
attrs: dict[str, str | None]
|
|
39
|
+
|
|
40
|
+
def __init__(self, selector: str, **attrs: str | None) -> None:
|
|
41
|
+
object.__setattr__(self, "selector", str(selector))
|
|
42
|
+
object.__setattr__(self, "attrs", dict(attrs))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True, slots=True)
|
|
46
|
+
class Drop:
|
|
47
|
+
selector: str
|
|
48
|
+
|
|
49
|
+
def __init__(self, selector: str) -> None:
|
|
50
|
+
object.__setattr__(self, "selector", str(selector))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True, slots=True)
|
|
54
|
+
class Unwrap:
|
|
55
|
+
selector: str
|
|
56
|
+
|
|
57
|
+
def __init__(self, selector: str) -> None:
|
|
58
|
+
object.__setattr__(self, "selector", str(selector))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True, slots=True)
|
|
62
|
+
class Empty:
|
|
63
|
+
selector: str
|
|
64
|
+
|
|
65
|
+
def __init__(self, selector: str) -> None:
|
|
66
|
+
object.__setattr__(self, "selector", str(selector))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass(frozen=True, slots=True)
|
|
70
|
+
class Edit:
|
|
71
|
+
selector: str
|
|
72
|
+
callback: Callable[[SimpleDomNode], None]
|
|
73
|
+
|
|
74
|
+
def __init__(self, selector: str, callback: Callable[[SimpleDomNode], None]) -> None:
|
|
75
|
+
object.__setattr__(self, "selector", str(selector))
|
|
76
|
+
object.__setattr__(self, "callback", callback)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(frozen=True, slots=True)
class Linkify:
    """Turn URLs/emails found in text nodes into links.

    Detection runs on DOM text nodes (never on raw HTML strings); each match
    is wrapped in `<a href="...">...</a>`.

    All options are keyword-only. Tag names and TLDs are stored lower-cased.
    """

    skip_tags: frozenset[str]
    fuzzy_ip: bool
    extra_tlds: frozenset[str]

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            "a",
            *WHITESPACE_PRESERVING_ELEMENTS,
        ),
        fuzzy_ip: bool = False,
        extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (),
    ) -> None:
        # Frozen dataclass: fields can only be written via object.__setattr__.
        assign = object.__setattr__

        lowered_tags = frozenset({str(tag).lower() for tag in skip_tags})
        assign(self, "skip_tags", lowered_tags)

        assign(self, "fuzzy_ip", bool(fuzzy_ip))

        lowered_tlds = frozenset({str(tld).lower() for tld in extra_tlds})
        assign(self, "extra_tlds", lowered_tlds)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _collapse_html_space_characters(text: str) -> str:
|
|
107
|
+
"""Collapse runs of HTML whitespace characters to a single space.
|
|
108
|
+
|
|
109
|
+
This mirrors html5lib's whitespace filter behavior: it does not trim.
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
# Fast path: no formatting whitespace and no double spaces.
|
|
113
|
+
if "\t" not in text and "\n" not in text and "\r" not in text and "\f" not in text and " " not in text:
|
|
114
|
+
return text
|
|
115
|
+
|
|
116
|
+
out: list[str] = []
|
|
117
|
+
in_ws = False
|
|
118
|
+
|
|
119
|
+
for ch in text:
|
|
120
|
+
if ch == " " or ch == "\t" or ch == "\n" or ch == "\r" or ch == "\f":
|
|
121
|
+
if in_ws:
|
|
122
|
+
continue
|
|
123
|
+
out.append(" ")
|
|
124
|
+
in_ws = True
|
|
125
|
+
continue
|
|
126
|
+
|
|
127
|
+
out.append(ch)
|
|
128
|
+
in_ws = False
|
|
129
|
+
return "".join(out)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass(frozen=True, slots=True)
class CollapseWhitespace:
    """Collapse whitespace inside text nodes.

    Every run of HTML whitespace characters (space, tab, LF, CR, FF) becomes
    a single space.

    Comparable to `html5lib.filters.whitespace.Filter`.
    """

    skip_tags: frozenset[str]

    def __init__(
        self,
        *,
        skip_tags: list[str] | tuple[str, ...] | set[str] | frozenset[str] = (
            *WHITESPACE_PRESERVING_ELEMENTS,
            "title",
        ),
    ) -> None:
        # Tag names are stored lower-cased; the dataclass is frozen, so the
        # field is written through object.__setattr__.
        lowered_tags = frozenset({str(tag).lower() for tag in skip_tags})
        object.__setattr__(self, "skip_tags", lowered_tags)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass(frozen=True, slots=True)
|
|
156
|
+
class Sanitize:
|
|
157
|
+
"""Sanitize the in-memory tree.
|
|
158
|
+
|
|
159
|
+
This transform replaces the current tree with a sanitized clone using the
|
|
160
|
+
same sanitizer that powers `safe=True` serialization.
|
|
161
|
+
|
|
162
|
+
Notes:
|
|
163
|
+
- This runs once at parse/transform time.
|
|
164
|
+
- This transform must be last.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
policy: SanitizationPolicy | None
|
|
168
|
+
|
|
169
|
+
def __init__(self, policy: SanitizationPolicy | None = None) -> None:
|
|
170
|
+
object.__setattr__(self, "policy", policy)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass(frozen=True, slots=True)
|
|
174
|
+
class PruneEmpty:
|
|
175
|
+
"""Recursively drop empty elements.
|
|
176
|
+
|
|
177
|
+
This transform removes elements that are empty at that point in the
|
|
178
|
+
transform pipeline.
|
|
179
|
+
|
|
180
|
+
"Empty" means:
|
|
181
|
+
- no element children, and
|
|
182
|
+
- no non-whitespace text nodes (unless `strip_whitespace=False`).
|
|
183
|
+
|
|
184
|
+
Comments/doctypes are ignored when determining emptiness.
|
|
185
|
+
|
|
186
|
+
Notes:
|
|
187
|
+
- Pruning uses a post-order traversal to be correct.
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
selector: str
|
|
191
|
+
strip_whitespace: bool
|
|
192
|
+
|
|
193
|
+
def __init__(self, selector: str, *, strip_whitespace: bool = True) -> None:
|
|
194
|
+
object.__setattr__(self, "selector", str(selector))
|
|
195
|
+
object.__setattr__(self, "strip_whitespace", bool(strip_whitespace))
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# -----------------
|
|
199
|
+
# Compilation
|
|
200
|
+
# -----------------
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# Union of the public transform types that `compile_transforms` accepts.
Transform = SetAttrs | Drop | Unwrap | Empty | Edit | Linkify | CollapseWhitespace | PruneEmpty | Sanitize
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@dataclass(frozen=True, slots=True)
class _CompiledCollapseWhitespaceTransform:
    """Compiled form of a `CollapseWhitespace` transform."""

    # Discriminator tag; always "collapse_whitespace".
    kind: Literal["collapse_whitespace"]
    # Tag names under which text nodes are not collapsed.
    skip_tags: frozenset[str]
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@dataclass(frozen=True, slots=True)
class _CompiledSelectorTransform:
    """Compiled form of the selector-driven transforms (`SetAttrs`, `Drop`, `Unwrap`, `Empty`, `Edit`)."""

    # Which operation to perform on matching elements.
    kind: Literal["setattrs", "drop", "unwrap", "empty", "edit"]
    # The selector as originally written.
    selector_str: str
    # The selector after `parse_selector`, ready for `SelectorMatcher.matches`.
    selector: ParsedSelector
    # Attribute patch for "setattrs", callback for "edit", None for the other kinds.
    payload: dict[str, str | None] | Callable[[SimpleDomNode], None] | None
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass(frozen=True, slots=True)
class _CompiledLinkifyTransform:
    """Compiled form of a `Linkify` transform."""

    # Discriminator tag; always "linkify".
    kind: Literal["linkify"]
    # Tag names under which text nodes are not linkified.
    skip_tags: frozenset[str]
    # Link-detection settings passed to `find_links_with_config`.
    config: LinkifyConfig
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@dataclass(frozen=True, slots=True)
class _CompiledSanitizeTransform:
    """Compiled form of a `Sanitize` transform."""

    # Discriminator tag; always "sanitize".
    kind: Literal["sanitize"]
    # Policy forwarded to `_sanitize`; may be None.
    policy: SanitizationPolicy | None
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@dataclass(frozen=True, slots=True)
class _CompiledPruneEmptyTransform:
    """Compiled form of a `PruneEmpty` transform."""

    # Discriminator tag; always "prune_empty".
    kind: Literal["prune_empty"]
    # The selector as originally written.
    selector_str: str
    # The selector after `parse_selector`, ready for `SelectorMatcher.matches`.
    selector: ParsedSelector
    # When True, whitespace-only text nodes do not count as content.
    strip_whitespace: bool
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# Union of every compiled transform type produced by `compile_transforms`
# and consumed by `apply_compiled_transforms`.
CompiledTransform = (
    _CompiledSelectorTransform
    | _CompiledLinkifyTransform
    | _CompiledCollapseWhitespaceTransform
    | _CompiledPruneEmptyTransform
    | _CompiledSanitizeTransform
)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def compile_transforms(transforms: list[Transform] | tuple[Transform, ...]) -> list[CompiledTransform]:
    """Validate a transform pipeline and compile it for application.

    Selectors are parsed once here so that applying the pipeline does not
    have to re-parse them.

    Args:
        transforms: Public transform objects, in application order.

    Returns:
        One compiled transform per input transform, in the same order.

    Raises:
        ValueError: If more than one `Sanitize` transform is present.
        TypeError: If anything other than `PruneEmpty`/`CollapseWhitespace`
            follows `Sanitize`, or if an item is not a supported transform.
    """
    if transforms:
        sanitize_positions = [pos for pos, item in enumerate(transforms) if isinstance(item, Sanitize)]
        if len(sanitize_positions) > 1:
            raise ValueError("Only one Sanitize transform is supported")
        if sanitize_positions:
            trailing = transforms[sanitize_positions[0] + 1 :]
            if any(not isinstance(item, (PruneEmpty, CollapseWhitespace)) for item in trailing):
                raise TypeError(
                    "Sanitize transform must be last (except for trailing PruneEmpty and CollapseWhitespace transforms)"
                )

    result: list[CompiledTransform] = []
    for item in transforms:
        # Selector-driven transforms share one compiled shape; only the kind
        # tag and the payload differ.
        selector_kind: Literal["setattrs", "drop", "unwrap", "empty", "edit"] | None = None
        payload: dict[str, str | None] | Callable[[SimpleDomNode], None] | None = None
        if isinstance(item, SetAttrs):
            selector_kind = "setattrs"
            payload = item.attrs
        elif isinstance(item, Drop):
            selector_kind = "drop"
        elif isinstance(item, Unwrap):
            selector_kind = "unwrap"
        elif isinstance(item, Empty):
            selector_kind = "empty"
        elif isinstance(item, Edit):
            selector_kind = "edit"
            payload = item.callback

        if selector_kind is not None:
            result.append(
                _CompiledSelectorTransform(
                    kind=selector_kind,
                    selector_str=item.selector,
                    selector=parse_selector(item.selector),
                    payload=payload,
                )
            )
        elif isinstance(item, Linkify):
            link_config = LinkifyConfig(fuzzy_ip=item.fuzzy_ip, extra_tlds=item.extra_tlds)
            result.append(_CompiledLinkifyTransform(kind="linkify", skip_tags=item.skip_tags, config=link_config))
        elif isinstance(item, CollapseWhitespace):
            result.append(_CompiledCollapseWhitespaceTransform(kind="collapse_whitespace", skip_tags=item.skip_tags))
        elif isinstance(item, PruneEmpty):
            result.append(
                _CompiledPruneEmptyTransform(
                    kind="prune_empty",
                    selector_str=item.selector,
                    selector=parse_selector(item.selector),
                    strip_whitespace=item.strip_whitespace,
                )
            )
        elif isinstance(item, Sanitize):
            result.append(_CompiledSanitizeTransform(kind="sanitize", policy=item.policy))
        else:
            raise TypeError(f"Unsupported transform: {type(item).__name__}")

    return result
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
# -----------------
|
|
356
|
+
# Application
|
|
357
|
+
# -----------------
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def apply_compiled_transforms(root: SimpleDomNode, compiled: list[CompiledTransform]) -> None:
    """Apply a compiled transform pipeline to the tree rooted at `root`, in place.

    Consecutive selector, linkify and collapse-whitespace transforms are
    collected into a batch and applied together in one tree walk. A
    prune-empty transform flushes the pending batch and then runs (together
    with any directly following prune-empty transforms) as its own walk.
    If a sanitize transform is present, it runs after everything before it;
    the sanitized result is copied onto `root` so callers keep their
    reference, and any trailing prune/whitespace transforms run afterwards.

    Args:
        root: Root node of the tree to transform; mutated in place.
        compiled: Compiled transforms in application order. The list itself
            is not mutated.

    Raises:
        TypeError: If a transform other than prune-empty/collapse-whitespace
            follows the sanitize transform, or if an unsupported compiled
            transform is encountered.
    """
    if not compiled:
        return

    # Split the pipeline at the first sanitize transform: everything before
    # it stays in `compiled`, everything after goes to `post_sanitize_transforms`.
    sanitize_transform: _CompiledSanitizeTransform | None = None
    post_sanitize_transforms: list[_CompiledPruneEmptyTransform | _CompiledCollapseWhitespaceTransform] = []
    for i, t in enumerate(compiled):
        if isinstance(t, _CompiledSanitizeTransform):
            sanitize_transform = t
            post = compiled[i + 1 :]
            if post and not all(
                isinstance(x, (_CompiledPruneEmptyTransform, _CompiledCollapseWhitespaceTransform)) for x in post
            ):
                raise TypeError(
                    "Sanitize must be the last transform (except for trailing PruneEmpty and CollapseWhitespace transforms)"
                )
            post_sanitize_transforms = [
                cast("_CompiledPruneEmptyTransform | _CompiledCollapseWhitespaceTransform", x) for x in post
            ]
            # Rebinds the local name only; the caller's list is untouched.
            compiled = compiled[:i]
            break

    # One matcher instance is shared by all walks below.
    matcher = SelectorMatcher()

    def apply_selector_and_linkify_transforms(
        root_node: SimpleDomNode,
        selector_transforms: list[_CompiledSelectorTransform],
        linkify_transforms: list[_CompiledLinkifyTransform],
        whitespace_transforms: list[_CompiledCollapseWhitespaceTransform],
    ) -> None:
        """Apply one batch of selector/linkify/whitespace transforms in a single walk below `root_node`."""
        if not selector_transforms and not linkify_transforms and not whitespace_transforms:
            return

        # Skip tags of all linkify transforms in the batch are merged into one set.
        linkify_skip_tags: frozenset[str] = (
            frozenset().union(*(t.skip_tags for t in linkify_transforms)) if linkify_transforms else frozenset()
        )

        # Same merging for the whitespace transforms.
        whitespace_skip_tags: frozenset[str] = (
            frozenset().union(*(t.skip_tags for t in whitespace_transforms)) if whitespace_transforms else frozenset()
        )

        def apply_to_children(parent: SimpleDomNode, *, skip_linkify: bool, skip_whitespace: bool) -> None:
            """Process the children of `parent`, recursing into elements.

            `skip_linkify` / `skip_whitespace` are True when an ancestor's tag
            is in the corresponding skip set.
            """
            children = parent.children
            if not children:
                return

            # Index-based loop: the child list can change while it is being walked
            # (nodes inserted/removed), so `len(children)` is re-read every iteration.
            i = 0
            while i < len(children):
                node = children[i]
                name = node.name
                # Names starting with "#" (e.g. "#text") are treated as non-element nodes.
                is_element = not name.startswith("#")

                if name == "#text" and not skip_whitespace and whitespace_transforms:
                    data = node.data or ""
                    if data:
                        collapsed = data
                        for _wt in whitespace_transforms:
                            collapsed = _collapse_html_space_characters(collapsed)
                        # Only write back when something actually changed.
                        if collapsed != data:
                            node.data = collapsed

                # Linkify applies to text nodes, context-aware.
                if name == "#text" and not skip_linkify and linkify_transforms:
                    data = node.data or ""
                    if data:
                        rewritten = False
                        # The first linkify transform that finds matches rewrites the node.
                        for lt in linkify_transforms:
                            matches = find_links_with_config(data, lt.config)
                            if not matches:
                                continue

                            # Replace the text node with: text before each match,
                            # an <a> per match, and any remaining tail text.
                            cursor = 0
                            for m in matches:
                                if m.start > cursor:
                                    parent.insert_before(TextNode(data[cursor : m.start]), node)

                                ns = parent.namespace or "html"
                                a = ElementNode("a", {"href": m.href}, ns)
                                a.append_child(TextNode(m.text))
                                parent.insert_before(a, node)
                                cursor = m.end

                            if cursor < len(data):
                                parent.insert_before(TextNode(data[cursor:]), node)

                            parent.remove_child(node)
                            rewritten = True
                            break

                        if rewritten:
                            # `i` is not advanced: index `i` now holds the first
                            # replacement node, which is processed next.
                            continue

                # `changed` is set when the node itself was removed from `parent`.
                changed = False
                for t in selector_transforms:
                    # Selector transforms only apply to elements.
                    if not is_element:
                        break

                    if not matcher.matches(node, t.selector):
                        continue

                    if t.kind == "setattrs":
                        patch = cast("dict[str, str | None]", t.payload)
                        attrs = node.attrs
                        for k, v in patch.items():
                            attrs[str(k)] = None if v is None else str(v)
                        continue

                    if t.kind == "edit":
                        cb = cast("Callable[[SimpleDomNode], None]", t.payload)
                        cb(node)
                        continue

                    if t.kind == "empty":
                        # Detach children, then clear the list.
                        if node.children:
                            for child in node.children:
                                child.parent = None
                            node.children = []
                        # Template elements also get their template content emptied.
                        if type(node) is TemplateNode and node.template_content is not None:
                            tc = node.template_content
                            for child in tc.children or []:
                                child.parent = None
                            tc.children = []
                        continue

                    if t.kind == "drop":
                        parent.remove_child(node)
                        changed = True
                        break

                    # t.kind == "unwrap".
                    # Move the children in front of the node, then remove the node.
                    if node.children:
                        moved = list(node.children)
                        node.children = []
                        for child in moved:
                            parent.insert_before(child, node)
                    parent.remove_child(node)
                    changed = True
                    break

                if changed:
                    # The node at index `i` was removed; whatever now sits at
                    # that index is processed next, so `i` is not advanced.
                    continue

                if is_element:
                    tag = node.name.lower()
                    # Skip flags are inherited and extended by this element's own tag.
                    child_skip = skip_linkify or (tag in linkify_skip_tags)
                    child_skip_ws = skip_whitespace or (tag in whitespace_skip_tags)

                    if node.children:
                        apply_to_children(node, skip_linkify=child_skip, skip_whitespace=child_skip_ws)

                    if type(node) is TemplateNode and node.template_content is not None:
                        apply_to_children(
                            node.template_content, skip_linkify=child_skip, skip_whitespace=child_skip_ws
                        )

                i += 1

        if type(root_node) is not TextNode:
            apply_to_children(root_node, skip_linkify=False, skip_whitespace=False)

    def apply_prune_transforms(root_node: SimpleDomNode, prune_transforms: list[_CompiledPruneEmptyTransform]) -> None:
        """Remove matching empty elements below `root_node` using a post-order walk."""

        def _is_effectively_empty_element(n: SimpleDomNode, *, strip_whitespace: bool) -> bool:
            """Return True when `n` has no element children and no counted text."""

            def _has_content(children: list[SimpleDomNode] | None) -> bool:
                if not children:
                    return False
                for ch in children:
                    nm = ch.name
                    if nm == "#text":
                        data = getattr(ch, "data", "") or ""
                        if strip_whitespace:
                            # Whitespace-only text does not count as content.
                            if str(data).strip():
                                return True
                        else:
                            if str(data) != "":
                                return True
                        continue
                    # Other "#"-named nodes are ignored.
                    if nm.startswith("#"):
                        continue
                    # Any element child counts as content.
                    return True
                return False

            if _has_content(n.children):
                return False

            if type(n) is TemplateNode and n.template_content is not None:
                if _has_content(n.template_content.children):
                    return False

            return True

        # Iterative post-order traversal: each node is pushed once unvisited
        # (to schedule its children) and once visited (to be evaluated after them).
        stack: list[tuple[SimpleDomNode, bool]] = [(root_node, False)]
        while stack:
            node, visited = stack.pop()
            if not visited:
                stack.append((node, True))

                children = node.children or []
                stack.extend((child, False) for child in reversed(children) if isinstance(child, SimpleDomNode))

                if type(node) is TemplateNode and node.template_content is not None:
                    stack.append((node.template_content, False))
                continue

            # Nodes without a parent cannot be removed.
            if node.parent is None:
                continue
            if node.name.startswith("#"):
                continue

            for pt in prune_transforms:
                if matcher.matches(node, pt.selector):
                    if _is_effectively_empty_element(node, strip_whitespace=pt.strip_whitespace):
                        node.parent.remove_child(node)
                        break

    # Batches of walk-compatible transforms awaiting application.
    pending_selector: list[_CompiledSelectorTransform] = []
    pending_linkify: list[_CompiledLinkifyTransform] = []
    pending_whitespace: list[_CompiledCollapseWhitespaceTransform] = []

    i = 0
    while i < len(compiled):
        t = compiled[i]
        if isinstance(t, _CompiledSelectorTransform):
            pending_selector.append(t)
            i += 1
            continue
        if isinstance(t, _CompiledLinkifyTransform):
            pending_linkify.append(t)
            i += 1
            continue
        if isinstance(t, _CompiledCollapseWhitespaceTransform):
            pending_whitespace.append(t)
            i += 1
            continue

        # Any other transform kind is a barrier: flush the pending batch first.
        apply_selector_and_linkify_transforms(root, pending_selector, pending_linkify, pending_whitespace)
        pending_selector = []
        pending_linkify = []
        pending_whitespace = []

        if isinstance(t, _CompiledPruneEmptyTransform):
            # Consecutive prune transforms are applied together in one walk.
            prune_batch: list[_CompiledPruneEmptyTransform] = [t]
            i += 1
            while i < len(compiled) and isinstance(compiled[i], _CompiledPruneEmptyTransform):
                prune_batch.append(cast("_CompiledPruneEmptyTransform", compiled[i]))
                i += 1
            apply_prune_transforms(root, prune_batch)
            continue

        raise TypeError(f"Unsupported compiled transform: {type(t).__name__}")

    # Flush whatever is still pending at the end of the pipeline.
    apply_selector_and_linkify_transforms(root, pending_selector, pending_linkify, pending_whitespace)

    if sanitize_transform is not None:
        sanitized = _sanitize(root, policy=sanitize_transform.policy)

        def _apply_post_sanitize_transforms() -> None:
            """Run the prune/whitespace transforms that were listed after the sanitize transform."""
            if not post_sanitize_transforms:
                return

            pending_post_ws: list[_CompiledCollapseWhitespaceTransform] = []
            i = 0
            while i < len(post_sanitize_transforms):
                t = post_sanitize_transforms[i]
                if isinstance(t, _CompiledCollapseWhitespaceTransform):
                    pending_post_ws.append(t)
                    i += 1
                    continue

                # A prune transform: flush pending whitespace transforms first.
                if pending_post_ws:
                    apply_selector_and_linkify_transforms(root, [], [], pending_post_ws)
                    pending_post_ws = []

                prune_batch: list[_CompiledPruneEmptyTransform] = [t]
                i += 1
                while i < len(post_sanitize_transforms) and isinstance(
                    post_sanitize_transforms[i], _CompiledPruneEmptyTransform
                ):
                    prune_batch.append(cast("_CompiledPruneEmptyTransform", post_sanitize_transforms[i]))
                    i += 1
                apply_prune_transforms(root, prune_batch)

            if pending_post_ws:
                apply_selector_and_linkify_transforms(root, [], [], pending_post_ws)

        def _detach_children(n: SimpleDomNode) -> None:
            """Clear the parent pointer of each child of `n`."""
            if n.children:
                for child in n.children:
                    child.parent = None

        def _reparent_children(n: SimpleDomNode) -> None:
            """Point each child of `n` back at `n`."""
            if n.children:
                for child in n.children:
                    child.parent = n

        # Overwrite the root node in-place so callers keep their reference.
        # This supports the common case (document/document-fragment root) as well
        # as advanced usage where callers pass an element root.
        if type(root) is TextNode:
            root.data = sanitized.data
            _apply_post_sanitize_transforms()
            return

        # The old children are about to be replaced by the sanitized ones.
        _detach_children(root)

        if type(root) is TemplateNode:
            root.name = sanitized.name
            root.namespace = sanitized.namespace
            root.attrs = sanitized.attrs
            root.children = sanitized.children
            root.template_content = sanitized.template_content
            _reparent_children(root)
            _apply_post_sanitize_transforms()
            return

        if type(root) is ElementNode:
            root.name = sanitized.name
            root.namespace = sanitized.namespace
            root.attrs = sanitized.attrs
            root.children = sanitized.children
            root.template_content = sanitized.template_content
            _reparent_children(root)
            _apply_post_sanitize_transforms()
            return

        # Any other root type: copy the generic node fields.
        root.name = sanitized.name
        root.namespace = sanitized.namespace
        root.data = sanitized.data
        root.attrs = sanitized.attrs
        root.children = sanitized.children
        _reparent_children(root)
        _apply_post_sanitize_transforms()
|