justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/treebuilder.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# ruff: noqa: S101, PLW2901
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
3
6
|
|
|
4
7
|
from .constants import (
|
|
5
8
|
BUTTON_SCOPE_TERMINATORS,
|
|
@@ -23,13 +26,16 @@ from .constants import (
|
|
|
23
26
|
)
|
|
24
27
|
from .errors import generate_error_message
|
|
25
28
|
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
26
|
-
from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
29
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
27
30
|
from .treebuilder_modes import TreeBuilderModesMixin
|
|
28
31
|
from .treebuilder_utils import (
|
|
29
32
|
InsertionMode,
|
|
30
33
|
is_all_whitespace,
|
|
31
34
|
)
|
|
32
35
|
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from collections.abc import Callable
|
|
38
|
+
|
|
33
39
|
|
|
34
40
|
class TreeBuilder(TreeBuilderModesMixin):
|
|
35
41
|
__slots__ = (
|
|
@@ -53,6 +59,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
53
59
|
"open_elements",
|
|
54
60
|
"original_mode",
|
|
55
61
|
"pending_table_text",
|
|
62
|
+
"pending_table_text_should_error",
|
|
56
63
|
"quirks_mode",
|
|
57
64
|
"table_text_original_mode",
|
|
58
65
|
"template_modes",
|
|
@@ -60,12 +67,39 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
60
67
|
"tokenizer_state_override",
|
|
61
68
|
)
|
|
62
69
|
|
|
70
|
+
_body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
71
|
+
_body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
72
|
+
_body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
73
|
+
_mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
|
|
74
|
+
active_formatting: list[Any]
|
|
75
|
+
collect_errors: bool
|
|
76
|
+
document: SimpleDomNode
|
|
77
|
+
errors: list[ParseError]
|
|
78
|
+
form_element: Any | None
|
|
79
|
+
fragment_context: Any | None
|
|
80
|
+
fragment_context_element: Any | None
|
|
81
|
+
frameset_ok: bool
|
|
82
|
+
head_element: Any | None
|
|
83
|
+
iframe_srcdoc: bool
|
|
84
|
+
ignore_lf: bool
|
|
85
|
+
insert_from_table: bool
|
|
86
|
+
mode: InsertionMode
|
|
87
|
+
open_elements: list[Any]
|
|
88
|
+
original_mode: InsertionMode | None # type: ignore[assignment]
|
|
89
|
+
pending_table_text: list[str]
|
|
90
|
+
pending_table_text_should_error: bool
|
|
91
|
+
quirks_mode: str
|
|
92
|
+
table_text_original_mode: InsertionMode | None # type: ignore[assignment]
|
|
93
|
+
template_modes: list[InsertionMode]
|
|
94
|
+
tokenizer: Any | None
|
|
95
|
+
tokenizer_state_override: Any | None # type: ignore[assignment]
|
|
96
|
+
|
|
63
97
|
def __init__(
|
|
64
98
|
self,
|
|
65
|
-
fragment_context=None,
|
|
66
|
-
iframe_srcdoc=False,
|
|
67
|
-
collect_errors=False,
|
|
68
|
-
):
|
|
99
|
+
fragment_context: Any | None = None,
|
|
100
|
+
iframe_srcdoc: bool = False,
|
|
101
|
+
collect_errors: bool = False,
|
|
102
|
+
) -> None:
|
|
69
103
|
self.fragment_context = fragment_context
|
|
70
104
|
self.iframe_srcdoc = iframe_srcdoc
|
|
71
105
|
self.collect_errors = collect_errors
|
|
@@ -86,6 +120,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
86
120
|
self.quirks_mode = "no-quirks"
|
|
87
121
|
self.ignore_lf = False
|
|
88
122
|
self.active_formatting = []
|
|
123
|
+
self.pending_table_text_should_error = False
|
|
89
124
|
self.insert_from_table = False
|
|
90
125
|
self.pending_table_text = []
|
|
91
126
|
self.template_modes = []
|
|
@@ -134,10 +169,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
134
169
|
# This prevents frameset from being inserted in fragment contexts
|
|
135
170
|
self.frameset_ok = False
|
|
136
171
|
|
|
137
|
-
def _set_quirks_mode(self, mode):
|
|
172
|
+
def _set_quirks_mode(self, mode: str) -> None:
|
|
138
173
|
self.quirks_mode = mode
|
|
139
174
|
|
|
140
|
-
def _parse_error(self, code, tag_name=None, token=None):
|
|
175
|
+
def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
|
|
141
176
|
if not self.collect_errors:
|
|
142
177
|
return
|
|
143
178
|
# Use the position of the last emitted token (set by tokenizer before emit)
|
|
@@ -174,13 +209,16 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
174
209
|
code,
|
|
175
210
|
line=line,
|
|
176
211
|
column=column,
|
|
212
|
+
category="treebuilder",
|
|
177
213
|
message=message,
|
|
178
214
|
source_html=source_html,
|
|
179
215
|
end_column=end_column,
|
|
180
216
|
)
|
|
181
217
|
)
|
|
182
218
|
|
|
183
|
-
def _has_element_in_scope(
|
|
219
|
+
def _has_element_in_scope(
|
|
220
|
+
self, target: str, terminators: set[str] | None = None, check_integration_points: bool = True
|
|
221
|
+
) -> bool:
|
|
184
222
|
if terminators is None:
|
|
185
223
|
terminators = DEFAULT_SCOPE_TERMINATORS
|
|
186
224
|
for node in reversed(self.open_elements):
|
|
@@ -196,33 +234,33 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
196
234
|
return False
|
|
197
235
|
return False
|
|
198
236
|
|
|
199
|
-
def _has_element_in_button_scope(self, target):
|
|
237
|
+
def _has_element_in_button_scope(self, target: str) -> bool:
|
|
200
238
|
return self._has_element_in_scope(target, BUTTON_SCOPE_TERMINATORS)
|
|
201
239
|
|
|
202
|
-
def _pop_until_inclusive(self, name):
|
|
240
|
+
def _pop_until_inclusive(self, name: str) -> None:
|
|
203
241
|
# Callers ensure element exists on stack
|
|
204
242
|
while self.open_elements: # pragma: no branch
|
|
205
243
|
node = self.open_elements.pop()
|
|
206
244
|
if node.name == name:
|
|
207
245
|
break
|
|
208
246
|
|
|
209
|
-
def _pop_until_any_inclusive(self, names):
|
|
247
|
+
def _pop_until_any_inclusive(self, names: set[str]) -> None:
|
|
210
248
|
# Pop elements until we find one in names (callers ensure element exists)
|
|
211
249
|
while self.open_elements:
|
|
212
250
|
node = self.open_elements.pop()
|
|
213
251
|
if node.name in names:
|
|
214
252
|
return
|
|
215
253
|
|
|
216
|
-
def _close_p_element(self):
|
|
254
|
+
def _close_p_element(self) -> bool:
|
|
217
255
|
if self._has_element_in_button_scope("p"):
|
|
218
256
|
self._generate_implied_end_tags("p")
|
|
219
257
|
if self.open_elements[-1].name != "p":
|
|
220
|
-
self._parse_error("end-tag
|
|
258
|
+
self._parse_error("unexpected-end-tag", tag_name="p")
|
|
221
259
|
self._pop_until_inclusive("p")
|
|
222
260
|
return True
|
|
223
261
|
return False
|
|
224
262
|
|
|
225
|
-
def process_token(self, token):
|
|
263
|
+
def process_token(self, token: Any) -> Any:
|
|
226
264
|
# Optimization: Use type() identity check instead of isinstance
|
|
227
265
|
token_type = type(token)
|
|
228
266
|
if token_type is DoctypeToken:
|
|
@@ -276,7 +314,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
276
314
|
self._insert_element(current_token, push=True)
|
|
277
315
|
result = None
|
|
278
316
|
elif name == "p":
|
|
279
|
-
result = self._handle_body_start_paragraph(current_token)
|
|
317
|
+
result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
|
|
280
318
|
elif name == "span":
|
|
281
319
|
if self.active_formatting:
|
|
282
320
|
self._reconstruct_active_formatting_elements()
|
|
@@ -284,7 +322,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
284
322
|
self.frameset_ok = False
|
|
285
323
|
result = None
|
|
286
324
|
elif name == "a":
|
|
287
|
-
result = self._handle_body_start_a(current_token)
|
|
325
|
+
result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
|
|
288
326
|
elif name == "br" or name == "img":
|
|
289
327
|
if self.active_formatting:
|
|
290
328
|
self._reconstruct_active_formatting_elements()
|
|
@@ -331,7 +369,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
331
369
|
if name == "br":
|
|
332
370
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
333
371
|
br_tag = Tag(0, "br", {}, False)
|
|
334
|
-
result = self._handle_body_start_br(br_tag)
|
|
372
|
+
result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
|
|
335
373
|
elif name in FORMATTING_ELEMENTS:
|
|
336
374
|
self._adoption_agency(name)
|
|
337
375
|
result = None
|
|
@@ -350,7 +388,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
350
388
|
self._append_text(current_token.data)
|
|
351
389
|
result = None
|
|
352
390
|
elif token_type is CommentToken:
|
|
353
|
-
result = self._handle_comment_in_body(current_token)
|
|
391
|
+
result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
|
|
354
392
|
else: # EOFToken
|
|
355
393
|
result = self._handle_eof_in_body(current_token)
|
|
356
394
|
else:
|
|
@@ -379,11 +417,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
379
417
|
# Tokenizer guarantees non-empty data
|
|
380
418
|
data = current_token.data
|
|
381
419
|
if "\x00" in data:
|
|
382
|
-
self._parse_error("invalid-codepoint")
|
|
383
420
|
data = data.replace("\x00", "")
|
|
384
|
-
if "\x0c" in data:
|
|
385
|
-
self._parse_error("invalid-codepoint")
|
|
386
|
-
data = data.replace("\x0c", "")
|
|
387
421
|
if data:
|
|
388
422
|
if not is_all_whitespace(data):
|
|
389
423
|
self._reconstruct_active_formatting_elements()
|
|
@@ -437,10 +471,11 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
437
471
|
current_token = token_override
|
|
438
472
|
# Continue loop to reprocess
|
|
439
473
|
|
|
440
|
-
def finish(self):
|
|
474
|
+
def finish(self) -> SimpleDomNode:
|
|
441
475
|
if self.fragment_context is not None:
|
|
442
476
|
# For fragments, remove the html wrapper and promote its children
|
|
443
477
|
# Note: html element is always created in fragment setup, so children[0] is always "html"
|
|
478
|
+
assert self.document.children is not None
|
|
444
479
|
root = self.document.children[0]
|
|
445
480
|
context_elem = self.fragment_context_element
|
|
446
481
|
if context_elem is not None and context_elem.parent is root:
|
|
@@ -460,20 +495,28 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
460
495
|
|
|
461
496
|
# Insertion mode dispatch ------------------------------------------------
|
|
462
497
|
|
|
463
|
-
def _append_comment_to_document(self, text):
|
|
498
|
+
def _append_comment_to_document(self, text: str) -> None:
|
|
464
499
|
node = SimpleDomNode("#comment", data=text)
|
|
500
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
501
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
502
|
+
if node._origin_pos is not None:
|
|
503
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
465
504
|
self.document.append_child(node)
|
|
466
505
|
|
|
467
|
-
def _append_comment(self, text, parent=None):
|
|
506
|
+
def _append_comment(self, text: str, parent: Any | None = None) -> None:
|
|
468
507
|
if parent is None:
|
|
469
508
|
parent = self._current_node_or_html()
|
|
470
509
|
# If parent is a template, insert into its content fragment
|
|
471
510
|
if type(parent) is TemplateNode and parent.template_content:
|
|
472
511
|
parent = parent.template_content
|
|
473
512
|
node = SimpleDomNode("#comment", data=text)
|
|
513
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
514
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
515
|
+
if node._origin_pos is not None:
|
|
516
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
474
517
|
parent.append_child(node)
|
|
475
518
|
|
|
476
|
-
def _append_text(self, text):
|
|
519
|
+
def _append_text(self, text: str) -> None:
|
|
477
520
|
if self.ignore_lf:
|
|
478
521
|
self.ignore_lf = False
|
|
479
522
|
if text.startswith("\n"):
|
|
@@ -481,6 +524,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
481
524
|
if not text:
|
|
482
525
|
return
|
|
483
526
|
|
|
527
|
+
if "\f" in text:
|
|
528
|
+
text = text.replace("\f", " ")
|
|
529
|
+
|
|
484
530
|
# Guard against empty stack
|
|
485
531
|
if not self.open_elements: # pragma: no cover
|
|
486
532
|
return
|
|
@@ -493,10 +539,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
493
539
|
if children:
|
|
494
540
|
last_child = children[-1]
|
|
495
541
|
if type(last_child) is TextNode:
|
|
496
|
-
last_child.data
|
|
542
|
+
last_child.data = (last_child.data or "") + text
|
|
497
543
|
return
|
|
498
544
|
|
|
499
545
|
node = TextNode(text)
|
|
546
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
547
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
548
|
+
if node._origin_pos is not None:
|
|
549
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
500
550
|
children.append(node)
|
|
501
551
|
node.parent = target
|
|
502
552
|
return
|
|
@@ -517,32 +567,45 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
517
567
|
return
|
|
518
568
|
|
|
519
569
|
node = TextNode(text)
|
|
570
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
571
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
572
|
+
if node._origin_pos is not None:
|
|
573
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
520
574
|
reference_node = parent.children[position] if position < len(parent.children) else None
|
|
521
575
|
parent.insert_before(node, reference_node)
|
|
522
576
|
|
|
523
|
-
def _current_node_or_html(self):
|
|
577
|
+
def _current_node_or_html(self) -> Any:
|
|
524
578
|
if self.open_elements:
|
|
525
579
|
return self.open_elements[-1]
|
|
526
580
|
# Stack empty - find html element in document children
|
|
527
581
|
# (may not be first if there are comments/doctype before it)
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
582
|
+
children = self.document.children
|
|
583
|
+
if children is not None:
|
|
584
|
+
for child in children:
|
|
585
|
+
if child.name == "html":
|
|
586
|
+
return child
|
|
587
|
+
# Edge case: no html found, return first child or None
|
|
588
|
+
return children[0] if children else None # pragma: no cover
|
|
589
|
+
return None # pragma: no cover
|
|
533
590
|
|
|
534
|
-
def _create_root(self, attrs):
|
|
591
|
+
def _create_root(self, attrs: dict[str, str | None]) -> Any:
|
|
535
592
|
node = SimpleDomNode("html", attrs=attrs, namespace="html")
|
|
536
593
|
self.document.append_child(node)
|
|
537
594
|
self.open_elements.append(node)
|
|
538
595
|
return node
|
|
539
596
|
|
|
540
|
-
def _insert_element(self, tag, *, push, namespace="html"):
|
|
597
|
+
def _insert_element(self, tag: Any, *, push: bool, namespace: str = "html") -> Any:
|
|
598
|
+
node: ElementNode | TemplateNode
|
|
541
599
|
if tag.name == "template" and namespace == "html":
|
|
542
600
|
node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
543
601
|
else:
|
|
544
602
|
node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
545
603
|
|
|
604
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
605
|
+
node._origin_pos = tag.start_pos
|
|
606
|
+
if node._origin_pos is not None:
|
|
607
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
608
|
+
|
|
546
609
|
# Fast path for common case: not inserting from table
|
|
547
610
|
if not self.insert_from_table:
|
|
548
611
|
target = self._current_node_or_html()
|
|
@@ -553,7 +616,8 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
553
616
|
else:
|
|
554
617
|
parent = target
|
|
555
618
|
|
|
556
|
-
parent
|
|
619
|
+
if parent is not None: # pragma: no branch
|
|
620
|
+
parent.append_child(node)
|
|
557
621
|
|
|
558
622
|
if push:
|
|
559
623
|
self.open_elements.append(node)
|
|
@@ -567,28 +631,30 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
567
631
|
self.open_elements.append(node)
|
|
568
632
|
return node
|
|
569
633
|
|
|
570
|
-
def _insert_phantom(self, name):
|
|
571
|
-
|
|
634
|
+
def _insert_phantom(self, name: str) -> Any:
|
|
635
|
+
attrs: dict[str, str | None] = {}
|
|
636
|
+
tag = Tag(Tag.START, name, attrs, False)
|
|
572
637
|
return self._insert_element(tag, push=True)
|
|
573
638
|
|
|
574
|
-
def _insert_body_if_missing(self):
|
|
639
|
+
def _insert_body_if_missing(self) -> None:
|
|
575
640
|
html_node = self._find_last_on_stack("html")
|
|
576
641
|
node = SimpleDomNode("body", namespace="html")
|
|
577
|
-
html_node
|
|
578
|
-
|
|
642
|
+
if html_node is not None: # pragma: no branch
|
|
643
|
+
html_node.append_child(node)
|
|
644
|
+
node.parent = html_node
|
|
579
645
|
self.open_elements.append(node)
|
|
580
646
|
|
|
581
|
-
def _create_element(self, name, namespace, attrs):
|
|
647
|
+
def _create_element(self, name: str, namespace: str | None, attrs: dict[str, str | None]) -> Any:
|
|
582
648
|
ns = namespace or "html"
|
|
583
649
|
return ElementNode(name, attrs, ns)
|
|
584
650
|
|
|
585
|
-
def _pop_current(self):
|
|
651
|
+
def _pop_current(self) -> Any:
|
|
586
652
|
return self.open_elements.pop()
|
|
587
653
|
|
|
588
|
-
def _in_scope(self, name):
|
|
654
|
+
def _in_scope(self, name: str) -> bool:
|
|
589
655
|
return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
|
|
590
656
|
|
|
591
|
-
def _close_element_by_name(self, name):
|
|
657
|
+
def _close_element_by_name(self, name: str) -> None:
|
|
592
658
|
# Simple element closing - pops from the named element onwards
|
|
593
659
|
# Used for explicit closing (e.g., when button start tag closes existing button)
|
|
594
660
|
# Caller guarantees name is on the stack via _has_in_scope check
|
|
@@ -599,7 +665,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
599
665
|
return
|
|
600
666
|
index -= 1
|
|
601
667
|
|
|
602
|
-
def _any_other_end_tag(self, name):
|
|
668
|
+
def _any_other_end_tag(self, name: str) -> None:
|
|
603
669
|
# Spec: "Any other end tag" in IN_BODY mode
|
|
604
670
|
# Loop through stack backwards (always terminates: html is special)
|
|
605
671
|
index = len(self.open_elements) - 1
|
|
@@ -624,7 +690,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
624
690
|
# Continue to next node (previous in stack)
|
|
625
691
|
index -= 1
|
|
626
692
|
|
|
627
|
-
def _add_missing_attributes(self, node, attrs):
|
|
693
|
+
def _add_missing_attributes(self, node: Any, attrs: dict[str, str]) -> None:
|
|
628
694
|
if not attrs:
|
|
629
695
|
return
|
|
630
696
|
existing = node.attrs
|
|
@@ -632,19 +698,19 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
632
698
|
if name not in existing:
|
|
633
699
|
existing[name] = value
|
|
634
700
|
|
|
635
|
-
def _remove_from_open_elements(self, node):
|
|
701
|
+
def _remove_from_open_elements(self, node: Any) -> bool:
|
|
636
702
|
for index, current in enumerate(self.open_elements):
|
|
637
703
|
if current is node:
|
|
638
704
|
del self.open_elements[index]
|
|
639
705
|
return True
|
|
640
706
|
return False
|
|
641
707
|
|
|
642
|
-
def _is_special_element(self, node):
|
|
708
|
+
def _is_special_element(self, node: Any) -> bool:
|
|
643
709
|
if node.namespace not in {None, "html"}:
|
|
644
710
|
return False
|
|
645
711
|
return node.name in SPECIAL_ELEMENTS
|
|
646
712
|
|
|
647
|
-
def _find_active_formatting_index(self, name):
|
|
713
|
+
def _find_active_formatting_index(self, name: str) -> int | None:
|
|
648
714
|
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
649
715
|
entry = self.active_formatting[index]
|
|
650
716
|
if entry is FORMAT_MARKER:
|
|
@@ -653,28 +719,28 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
653
719
|
return index
|
|
654
720
|
return None
|
|
655
721
|
|
|
656
|
-
def _find_active_formatting_index_by_node(self, node):
|
|
722
|
+
def _find_active_formatting_index_by_node(self, node: Any) -> int | None:
|
|
657
723
|
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
658
724
|
entry = self.active_formatting[index]
|
|
659
725
|
if entry is not FORMAT_MARKER and entry["node"] is node:
|
|
660
726
|
return index
|
|
661
727
|
return None
|
|
662
728
|
|
|
663
|
-
def _clone_attributes(self, attrs):
|
|
729
|
+
def _clone_attributes(self, attrs: dict[str, str | None]) -> dict[str, str | None]:
|
|
664
730
|
return attrs.copy() if attrs else {}
|
|
665
731
|
|
|
666
|
-
def _attrs_signature(self, attrs):
|
|
732
|
+
def _attrs_signature(self, attrs: dict[str, str | None]) -> tuple[tuple[str, str], ...]:
|
|
667
733
|
if not attrs:
|
|
668
734
|
return ()
|
|
669
|
-
items = []
|
|
735
|
+
items: list[tuple[str, str]] = []
|
|
670
736
|
for name, value in attrs.items():
|
|
671
737
|
items.append((name, value or ""))
|
|
672
738
|
items.sort()
|
|
673
739
|
return tuple(items)
|
|
674
740
|
|
|
675
|
-
def _find_active_formatting_duplicate(self, name, attrs):
|
|
741
|
+
def _find_active_formatting_duplicate(self, name: str, attrs: dict[str, str | None]) -> int | None:
|
|
676
742
|
signature = self._attrs_signature(attrs)
|
|
677
|
-
matches = []
|
|
743
|
+
matches: list[int] = []
|
|
678
744
|
for index, entry in enumerate(self.active_formatting):
|
|
679
745
|
if entry is FORMAT_MARKER:
|
|
680
746
|
matches.clear()
|
|
@@ -686,7 +752,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
686
752
|
return matches[0]
|
|
687
753
|
return None
|
|
688
754
|
|
|
689
|
-
def _has_active_formatting_entry(self, name):
|
|
755
|
+
def _has_active_formatting_entry(self, name: str) -> bool:
|
|
690
756
|
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
691
757
|
entry = self.active_formatting[index]
|
|
692
758
|
if entry is FORMAT_MARKER:
|
|
@@ -695,7 +761,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
695
761
|
return True
|
|
696
762
|
return False
|
|
697
763
|
|
|
698
|
-
def _remove_last_active_formatting_by_name(self, name):
|
|
764
|
+
def _remove_last_active_formatting_by_name(self, name: str) -> None:
|
|
699
765
|
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
700
766
|
entry = self.active_formatting[index]
|
|
701
767
|
if entry is FORMAT_MARKER:
|
|
@@ -704,13 +770,13 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
704
770
|
del self.active_formatting[index]
|
|
705
771
|
return
|
|
706
772
|
|
|
707
|
-
def _remove_last_open_element_by_name(self, name):
|
|
773
|
+
def _remove_last_open_element_by_name(self, name: str) -> None:
|
|
708
774
|
for index in range(len(self.open_elements) - 1, -1, -1):
|
|
709
775
|
if self.open_elements[index].name == name:
|
|
710
776
|
del self.open_elements[index]
|
|
711
777
|
return
|
|
712
778
|
|
|
713
|
-
def _append_active_formatting_entry(self, name, attrs, node):
|
|
779
|
+
def _append_active_formatting_entry(self, name: str, attrs: dict[str, str | None], node: Any) -> None:
|
|
714
780
|
entry_attrs = self._clone_attributes(attrs)
|
|
715
781
|
signature = self._attrs_signature(entry_attrs)
|
|
716
782
|
self.active_formatting.append(
|
|
@@ -722,20 +788,20 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
722
788
|
},
|
|
723
789
|
)
|
|
724
790
|
|
|
725
|
-
def _clear_active_formatting_up_to_marker(self):
|
|
791
|
+
def _clear_active_formatting_up_to_marker(self) -> None:
|
|
726
792
|
while self.active_formatting:
|
|
727
793
|
entry = self.active_formatting.pop()
|
|
728
794
|
if entry is FORMAT_MARKER:
|
|
729
795
|
break
|
|
730
796
|
|
|
731
|
-
def _push_formatting_marker(self):
|
|
797
|
+
def _push_formatting_marker(self) -> None:
|
|
732
798
|
self.active_formatting.append(FORMAT_MARKER)
|
|
733
799
|
|
|
734
|
-
def _remove_formatting_entry(self, index):
|
|
800
|
+
def _remove_formatting_entry(self, index: int) -> None:
|
|
735
801
|
assert 0 <= index < len(self.active_formatting), f"Invalid index: {index}"
|
|
736
802
|
del self.active_formatting[index]
|
|
737
803
|
|
|
738
|
-
def _reconstruct_active_formatting_elements(self):
|
|
804
|
+
def _reconstruct_active_formatting_elements(self) -> None:
|
|
739
805
|
if not self.active_formatting:
|
|
740
806
|
return
|
|
741
807
|
last_entry = self.active_formatting[-1]
|
|
@@ -757,22 +823,26 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
757
823
|
entry = self.active_formatting[index]
|
|
758
824
|
tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
|
|
759
825
|
new_node = self._insert_element(tag, push=True)
|
|
826
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
827
|
+
new_node._origin_pos = entry["node"].origin_offset
|
|
828
|
+
new_node._origin_line = entry["node"].origin_line
|
|
829
|
+
new_node._origin_col = entry["node"].origin_col
|
|
760
830
|
entry["node"] = new_node
|
|
761
831
|
index += 1
|
|
762
832
|
|
|
763
|
-
def _insert_node_at(self, parent, index, node):
|
|
833
|
+
def _insert_node_at(self, parent: Any, index: int, node: Any) -> None:
|
|
764
834
|
reference_node = None
|
|
765
835
|
if index is not None and index < len(parent.children):
|
|
766
836
|
reference_node = parent.children[index]
|
|
767
837
|
parent.insert_before(node, reference_node)
|
|
768
838
|
|
|
769
|
-
def _find_last_on_stack(self, name):
|
|
839
|
+
def _find_last_on_stack(self, name: str) -> Any | None:
|
|
770
840
|
for node in reversed(self.open_elements):
|
|
771
841
|
if node.name == name:
|
|
772
842
|
return node
|
|
773
843
|
return None
|
|
774
844
|
|
|
775
|
-
def _clear_stack_until(self, names):
|
|
845
|
+
def _clear_stack_until(self, names: set[str]) -> None:
|
|
776
846
|
# All callers include "html" in names, so this always terminates via break
|
|
777
847
|
while self.open_elements:
|
|
778
848
|
node = self.open_elements[-1]
|
|
@@ -780,7 +850,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
780
850
|
break
|
|
781
851
|
self.open_elements.pop()
|
|
782
852
|
|
|
783
|
-
def _generate_implied_end_tags(self, exclude=None):
|
|
853
|
+
def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
|
|
784
854
|
# Always terminates: html is not in IMPLIED_END_TAGS
|
|
785
855
|
while self.open_elements: # pragma: no branch
|
|
786
856
|
node = self.open_elements[-1]
|
|
@@ -789,10 +859,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
789
859
|
continue
|
|
790
860
|
break
|
|
791
861
|
|
|
792
|
-
def _has_in_table_scope(self, name):
|
|
862
|
+
def _has_in_table_scope(self, name: str) -> bool:
|
|
793
863
|
return self._has_element_in_scope(name, TABLE_SCOPE_TERMINATORS, check_integration_points=False)
|
|
794
864
|
|
|
795
|
-
def _close_table_cell(self):
|
|
865
|
+
def _close_table_cell(self) -> bool:
|
|
796
866
|
if self._has_in_table_scope("td"):
|
|
797
867
|
self._end_table_cell("td")
|
|
798
868
|
return True
|
|
@@ -801,7 +871,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
801
871
|
return True
|
|
802
872
|
return False
|
|
803
873
|
|
|
804
|
-
def _end_table_cell(self, name):
|
|
874
|
+
def _end_table_cell(self, name: str) -> None:
|
|
805
875
|
self._generate_implied_end_tags(name)
|
|
806
876
|
while self.open_elements:
|
|
807
877
|
node = self.open_elements.pop()
|
|
@@ -810,15 +880,22 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
810
880
|
self._clear_active_formatting_up_to_marker()
|
|
811
881
|
self.mode = InsertionMode.IN_ROW
|
|
812
882
|
|
|
813
|
-
def _flush_pending_table_text(self):
|
|
883
|
+
def _flush_pending_table_text(self) -> None:
|
|
814
884
|
data = "".join(self.pending_table_text)
|
|
815
885
|
self.pending_table_text.clear()
|
|
816
|
-
if not data:
|
|
886
|
+
if not data: # pragma: no cover
|
|
817
887
|
return
|
|
818
888
|
if is_all_whitespace(data):
|
|
819
889
|
self._append_text(data)
|
|
820
890
|
return
|
|
821
|
-
|
|
891
|
+
|
|
892
|
+
if self.pending_table_text_should_error:
|
|
893
|
+
# html5lib reports one foster-parenting error per non-whitespace character.
|
|
894
|
+
for ch in data:
|
|
895
|
+
if ch not in " \t\n\r\f":
|
|
896
|
+
self._parse_error("foster-parenting-character")
|
|
897
|
+
self.pending_table_text_should_error = False
|
|
898
|
+
|
|
822
899
|
previous = self.insert_from_table
|
|
823
900
|
self.insert_from_table = True
|
|
824
901
|
try:
|
|
@@ -827,7 +904,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
827
904
|
finally:
|
|
828
905
|
self.insert_from_table = previous
|
|
829
906
|
|
|
830
|
-
def _close_table_element(self):
|
|
907
|
+
def _close_table_element(self) -> bool:
|
|
831
908
|
if not self._has_in_table_scope("table"):
|
|
832
909
|
self._parse_error("unexpected-end-tag", tag_name="table")
|
|
833
910
|
return False
|
|
@@ -840,7 +917,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
840
917
|
self._reset_insertion_mode()
|
|
841
918
|
return True
|
|
842
919
|
|
|
843
|
-
def _reset_insertion_mode(self):
|
|
920
|
+
def _reset_insertion_mode(self) -> None:
|
|
844
921
|
# Walk stack backwards - html element always terminates
|
|
845
922
|
idx = len(self.open_elements) - 1
|
|
846
923
|
while idx >= 0:
|
|
@@ -880,7 +957,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
880
957
|
# Empty stack fallback
|
|
881
958
|
self.mode = InsertionMode.IN_BODY
|
|
882
959
|
|
|
883
|
-
def _should_foster_parenting(self, target, *, for_tag=None, is_text=False):
|
|
960
|
+
def _should_foster_parenting(self, target: Any, *, for_tag: str | None = None, is_text: bool = False) -> bool:
|
|
884
961
|
if not self.insert_from_table:
|
|
885
962
|
return False
|
|
886
963
|
if target.name not in TABLE_FOSTER_TARGETS:
|
|
@@ -891,17 +968,17 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
891
968
|
return False
|
|
892
969
|
return True
|
|
893
970
|
|
|
894
|
-
def _lower_ascii(self, value):
|
|
971
|
+
def _lower_ascii(self, value: str) -> str:
|
|
895
972
|
return value.lower() if value else ""
|
|
896
973
|
|
|
897
|
-
def _adjust_svg_tag_name(self, name):
|
|
974
|
+
def _adjust_svg_tag_name(self, name: str) -> str:
|
|
898
975
|
lowered = self._lower_ascii(name)
|
|
899
976
|
return SVG_TAG_NAME_ADJUSTMENTS.get(lowered, name)
|
|
900
977
|
|
|
901
|
-
def _prepare_foreign_attributes(self, namespace, attrs):
|
|
978
|
+
def _prepare_foreign_attributes(self, namespace: str, attrs: dict[str, str | None]) -> dict[str, str | None]:
|
|
902
979
|
if not attrs:
|
|
903
980
|
return {}
|
|
904
|
-
adjusted = {}
|
|
981
|
+
adjusted: dict[str, str | None] = {}
|
|
905
982
|
for name, value in attrs.items():
|
|
906
983
|
lower_name = self._lower_ascii(name)
|
|
907
984
|
if namespace == "math" and lower_name in MATHML_ATTRIBUTE_ADJUSTMENTS:
|
|
@@ -920,14 +997,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
920
997
|
adjusted[name] = value
|
|
921
998
|
return adjusted
|
|
922
999
|
|
|
923
|
-
def _node_attribute_value(self, node, name):
|
|
1000
|
+
def _node_attribute_value(self, node: Any, name: str) -> str | None:
|
|
924
1001
|
target = self._lower_ascii(name)
|
|
925
1002
|
for attr_name, attr_value in node.attrs.items():
|
|
926
1003
|
if self._lower_ascii(attr_name) == target:
|
|
927
1004
|
return attr_value or ""
|
|
928
1005
|
return None
|
|
929
1006
|
|
|
930
|
-
def _is_html_integration_point(self, node):
|
|
1007
|
+
def _is_html_integration_point(self, node: Any) -> bool:
|
|
931
1008
|
# annotation-xml is an HTML integration point only with specific encoding values
|
|
932
1009
|
if node.namespace == "math" and node.name == "annotation-xml":
|
|
933
1010
|
encoding = self._node_attribute_value(node, "encoding")
|
|
@@ -939,15 +1016,15 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
939
1016
|
# SVG foreignObject, desc, and title are always HTML integration points
|
|
940
1017
|
return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
|
|
941
1018
|
|
|
942
|
-
def _is_mathml_text_integration_point(self, node):
|
|
1019
|
+
def _is_mathml_text_integration_point(self, node: Any) -> bool:
|
|
943
1020
|
if node.namespace != "math":
|
|
944
1021
|
return False
|
|
945
1022
|
return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
|
|
946
1023
|
|
|
947
|
-
def _adjusted_current_node(self):
|
|
1024
|
+
def _adjusted_current_node(self) -> Any:
|
|
948
1025
|
return self.open_elements[-1]
|
|
949
1026
|
|
|
950
|
-
def _should_use_foreign_content(self, token):
|
|
1027
|
+
def _should_use_foreign_content(self, token: AnyToken) -> bool:
|
|
951
1028
|
current = self._adjusted_current_node()
|
|
952
1029
|
# HTML namespace elements don't use foreign content rules
|
|
953
1030
|
# (unreachable in practice as foreign content mode only entered for foreign elements)
|
|
@@ -978,13 +1055,13 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
978
1055
|
|
|
979
1056
|
return True
|
|
980
1057
|
|
|
981
|
-
def _foreign_breakout_font(self, tag):
|
|
1058
|
+
def _foreign_breakout_font(self, tag: Any) -> bool:
|
|
982
1059
|
for name in tag.attrs.keys():
|
|
983
1060
|
if self._lower_ascii(name) in {"color", "face", "size"}:
|
|
984
1061
|
return True
|
|
985
1062
|
return False
|
|
986
1063
|
|
|
987
|
-
def _pop_until_html_or_integration_point(self):
|
|
1064
|
+
def _pop_until_html_or_integration_point(self) -> None:
|
|
988
1065
|
# Always terminates: html element has html namespace
|
|
989
1066
|
while self.open_elements: # pragma: no branch
|
|
990
1067
|
node = self.open_elements[-1]
|
|
@@ -996,7 +1073,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
996
1073
|
return
|
|
997
1074
|
self.open_elements.pop()
|
|
998
1075
|
|
|
999
|
-
def _process_foreign_content(self, token):
|
|
1076
|
+
def _process_foreign_content(self, token: AnyToken) -> Any | None:
|
|
1000
1077
|
current = self._adjusted_current_node()
|
|
1001
1078
|
|
|
1002
1079
|
if isinstance(token, CharacterTokens):
|
|
@@ -1076,7 +1153,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1076
1153
|
|
|
1077
1154
|
# Per HTML5 spec: if first node doesn't match, it's a parse error
|
|
1078
1155
|
if first:
|
|
1079
|
-
self._parse_error("unexpected-end-tag
|
|
1156
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1080
1157
|
first = False
|
|
1081
1158
|
|
|
1082
1159
|
# If we hit an HTML element that doesn't match, process in secondary mode
|
|
@@ -1087,7 +1164,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1087
1164
|
# Stack exhausted without finding match - ignore tag (defensive, html always terminates)
|
|
1088
1165
|
return None # pragma: no cover
|
|
1089
1166
|
|
|
1090
|
-
def _appropriate_insertion_location(
|
|
1167
|
+
def _appropriate_insertion_location(
|
|
1168
|
+
self, override_target: Any | None = None, *, foster_parenting: bool = False
|
|
1169
|
+
) -> tuple[Any, int]:
|
|
1091
1170
|
if override_target is not None:
|
|
1092
1171
|
target = override_target
|
|
1093
1172
|
else:
|
|
@@ -1106,24 +1185,28 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1106
1185
|
parent = last_table.parent
|
|
1107
1186
|
# Table has no parent (e.g., detached) - fall back to target
|
|
1108
1187
|
if parent is None: # pragma: no cover
|
|
1109
|
-
|
|
1188
|
+
children = target.children
|
|
1189
|
+
return target, len(children) if children is not None else 0
|
|
1190
|
+
assert parent.children is not None
|
|
1110
1191
|
position = parent.children.index(last_table)
|
|
1111
1192
|
return parent, position
|
|
1112
1193
|
|
|
1113
1194
|
# If target is a template element, insert into its content document fragment
|
|
1114
1195
|
if type(target) is TemplateNode and target.template_content:
|
|
1115
|
-
|
|
1196
|
+
children = target.template_content.children
|
|
1197
|
+
return target.template_content, len(children) if children is not None else 0
|
|
1116
1198
|
|
|
1117
|
-
|
|
1199
|
+
target_children = target.children
|
|
1200
|
+
return target, len(target_children) if target_children is not None else 0
|
|
1118
1201
|
|
|
1119
|
-
def _populate_selectedcontent(self, root):
|
|
1202
|
+
def _populate_selectedcontent(self, root: Any) -> None:
|
|
1120
1203
|
"""Populate selectedcontent elements with content from selected option.
|
|
1121
1204
|
|
|
1122
1205
|
Per HTML5 spec: selectedcontent mirrors the content of the selected option,
|
|
1123
1206
|
or the first option if none is selected.
|
|
1124
1207
|
"""
|
|
1125
1208
|
# Find all select elements
|
|
1126
|
-
selects = []
|
|
1209
|
+
selects: list[Any] = []
|
|
1127
1210
|
self._find_elements(root, "select", selects)
|
|
1128
1211
|
|
|
1129
1212
|
for select in selects:
|
|
@@ -1133,7 +1216,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1133
1216
|
continue
|
|
1134
1217
|
|
|
1135
1218
|
# Find all option elements
|
|
1136
|
-
options = []
|
|
1219
|
+
options: list[Any] = []
|
|
1137
1220
|
self._find_elements(select, "option", options)
|
|
1138
1221
|
|
|
1139
1222
|
# Find selected option or use first one
|
|
@@ -1153,7 +1236,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1153
1236
|
# Clone content from selected option to selectedcontent
|
|
1154
1237
|
self._clone_children(selected_option, selectedcontent)
|
|
1155
1238
|
|
|
1156
|
-
def _find_elements(self, node, name, result):
|
|
1239
|
+
def _find_elements(self, node: Any, name: str, result: list[Any]) -> None:
|
|
1157
1240
|
"""Recursively find all elements with given name."""
|
|
1158
1241
|
if node.name == name:
|
|
1159
1242
|
result.append(node)
|
|
@@ -1162,7 +1245,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1162
1245
|
for child in node.children:
|
|
1163
1246
|
self._find_elements(child, name, result)
|
|
1164
1247
|
|
|
1165
|
-
def _find_element(self, node, name):
|
|
1248
|
+
def _find_element(self, node: Any, name: str) -> Any | None:
|
|
1166
1249
|
"""Find first element with given name."""
|
|
1167
1250
|
if node.name == name:
|
|
1168
1251
|
return node
|
|
@@ -1174,21 +1257,21 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1174
1257
|
return result
|
|
1175
1258
|
return None
|
|
1176
1259
|
|
|
1177
|
-
def _clone_children(self, source, target):
|
|
1260
|
+
def _clone_children(self, source: Any, target: Any) -> None:
|
|
1178
1261
|
"""Deep clone all children from source to target."""
|
|
1179
1262
|
for child in source.children:
|
|
1180
1263
|
target.append_child(child.clone_node(deep=True))
|
|
1181
1264
|
|
|
1182
|
-
def _has_in_scope(self, name):
|
|
1265
|
+
def _has_in_scope(self, name: str) -> bool:
|
|
1183
1266
|
return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
|
|
1184
1267
|
|
|
1185
|
-
def _has_in_list_item_scope(self, name):
|
|
1268
|
+
def _has_in_list_item_scope(self, name: str) -> bool:
|
|
1186
1269
|
return self._has_element_in_scope(name, LIST_ITEM_SCOPE_TERMINATORS)
|
|
1187
1270
|
|
|
1188
|
-
def _has_in_definition_scope(self, name):
|
|
1271
|
+
def _has_in_definition_scope(self, name: str) -> bool:
|
|
1189
1272
|
return self._has_element_in_scope(name, DEFINITION_SCOPE_TERMINATORS)
|
|
1190
1273
|
|
|
1191
|
-
def _has_any_in_scope(self, names):
|
|
1274
|
+
def _has_any_in_scope(self, names: set[str]) -> bool:
|
|
1192
1275
|
# Always terminates: html is in DEFAULT_SCOPE_TERMINATORS
|
|
1193
1276
|
terminators = DEFAULT_SCOPE_TERMINATORS
|
|
1194
1277
|
idx = len(self.open_elements) - 1
|
|
@@ -1201,7 +1284,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1201
1284
|
idx -= 1
|
|
1202
1285
|
return False # pragma: no cover - html always terminates
|
|
1203
1286
|
|
|
1204
|
-
def process_characters(self, data):
|
|
1287
|
+
def process_characters(self, data: str) -> Any:
|
|
1205
1288
|
"""Optimized path for character tokens."""
|
|
1206
1289
|
# Check for foreign content first
|
|
1207
1290
|
current_node = self.open_elements[-1] if self.open_elements else None
|
|
@@ -1211,19 +1294,21 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1211
1294
|
return self.process_token(CharacterTokens(data))
|
|
1212
1295
|
|
|
1213
1296
|
if self.mode == InsertionMode.IN_BODY:
|
|
1214
|
-
if "\x00" in data:
|
|
1215
|
-
self._parse_error("invalid-codepoint")
|
|
1216
|
-
data = data.replace("\x00", "")
|
|
1217
|
-
|
|
1218
1297
|
if not data:
|
|
1219
1298
|
return TokenSinkResult.Continue
|
|
1299
|
+
if "\x00" in data:
|
|
1300
|
+
data = data.replace("\x00", "")
|
|
1301
|
+
if not data:
|
|
1302
|
+
return TokenSinkResult.Continue
|
|
1220
1303
|
|
|
1221
1304
|
if is_all_whitespace(data):
|
|
1222
|
-
self.
|
|
1305
|
+
if self.active_formatting:
|
|
1306
|
+
self._reconstruct_active_formatting_elements()
|
|
1223
1307
|
self._append_text(data)
|
|
1224
1308
|
return TokenSinkResult.Continue
|
|
1225
1309
|
|
|
1226
|
-
self.
|
|
1310
|
+
if self.active_formatting:
|
|
1311
|
+
self._reconstruct_active_formatting_elements()
|
|
1227
1312
|
self.frameset_ok = False
|
|
1228
1313
|
self._append_text(data)
|
|
1229
1314
|
return TokenSinkResult.Continue
|