justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/treebuilder_modes.py
CHANGED
|
@@ -3,23 +3,30 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
7
7
|
|
|
8
8
|
from .constants import (
|
|
9
|
+
FORMAT_MARKER,
|
|
9
10
|
FORMATTING_ELEMENTS,
|
|
10
11
|
HEADING_ELEMENTS,
|
|
11
12
|
)
|
|
12
13
|
from .node import SimpleDomNode, TemplateNode
|
|
13
|
-
from .tokens import CharacterTokens, CommentToken, EOFToken, Tag, TokenSinkResult
|
|
14
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, Tag, TokenSinkResult
|
|
14
15
|
from .treebuilder_utils import (
|
|
15
16
|
InsertionMode,
|
|
16
17
|
doctype_error_and_quirks,
|
|
17
18
|
is_all_whitespace,
|
|
18
19
|
)
|
|
19
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from collections.abc import Callable
|
|
23
|
+
|
|
24
|
+
ModeResultTuple = tuple[str, InsertionMode, AnyToken] | tuple[str, InsertionMode, AnyToken, bool]
|
|
25
|
+
"Result is (instruction, mode, token) or (instruction, mode, token, force_html)"
|
|
26
|
+
|
|
20
27
|
|
|
21
28
|
class TreeBuilderModesMixin:
|
|
22
|
-
def _handle_doctype(self, token:
|
|
29
|
+
def _handle_doctype(self, token: DoctypeToken) -> Literal[0]:
|
|
23
30
|
if self.mode != InsertionMode.INITIAL:
|
|
24
31
|
self._parse_error("unexpected-doctype")
|
|
25
32
|
return TokenSinkResult.Continue
|
|
@@ -37,7 +44,7 @@ class TreeBuilderModesMixin:
|
|
|
37
44
|
self.mode = InsertionMode.BEFORE_HTML
|
|
38
45
|
return TokenSinkResult.Continue
|
|
39
46
|
|
|
40
|
-
def _mode_initial(self, token: Any) ->
|
|
47
|
+
def _mode_initial(self, token: Any) -> ModeResultTuple | None:
|
|
41
48
|
if isinstance(token, CharacterTokens):
|
|
42
49
|
if is_all_whitespace(token.data):
|
|
43
50
|
return None
|
|
@@ -54,13 +61,13 @@ class TreeBuilderModesMixin:
|
|
|
54
61
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
55
62
|
# Only Tags remain - no DOCTYPE seen, so quirks mode
|
|
56
63
|
if token.kind == Tag.START:
|
|
57
|
-
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name
|
|
64
|
+
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name)
|
|
58
65
|
else:
|
|
59
|
-
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name
|
|
66
|
+
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name)
|
|
60
67
|
self._set_quirks_mode("quirks")
|
|
61
68
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
62
69
|
|
|
63
|
-
def _mode_before_html(self, token:
|
|
70
|
+
def _mode_before_html(self, token: AnyToken) -> ModeResultTuple | None:
|
|
64
71
|
if isinstance(token, CharacterTokens) and is_all_whitespace(token.data):
|
|
65
72
|
return None
|
|
66
73
|
if isinstance(token, CommentToken):
|
|
@@ -93,7 +100,7 @@ class TreeBuilderModesMixin:
|
|
|
93
100
|
self.mode = InsertionMode.BEFORE_HEAD
|
|
94
101
|
return ("reprocess", InsertionMode.BEFORE_HEAD, token)
|
|
95
102
|
|
|
96
|
-
def _mode_before_head(self, token:
|
|
103
|
+
def _mode_before_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
97
104
|
if isinstance(token, CharacterTokens):
|
|
98
105
|
data = token.data or ""
|
|
99
106
|
if "\x00" in data:
|
|
@@ -136,7 +143,7 @@ class TreeBuilderModesMixin:
|
|
|
136
143
|
self.mode = InsertionMode.IN_HEAD
|
|
137
144
|
return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
138
145
|
|
|
139
|
-
def _mode_in_head(self, token:
|
|
146
|
+
def _mode_in_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
140
147
|
if isinstance(token, CharacterTokens):
|
|
141
148
|
if is_all_whitespace(token.data):
|
|
142
149
|
self._append_text(token.data)
|
|
@@ -212,7 +219,7 @@ class TreeBuilderModesMixin:
|
|
|
212
219
|
self.mode = InsertionMode.AFTER_HEAD
|
|
213
220
|
return ("reprocess", InsertionMode.AFTER_HEAD, token)
|
|
214
221
|
|
|
215
|
-
def _mode_in_head_noscript(self, token:
|
|
222
|
+
def _mode_in_head_noscript(self, token: AnyToken) -> ModeResultTuple | None:
|
|
216
223
|
"""Handle tokens in 'in head noscript' insertion mode (scripting disabled)."""
|
|
217
224
|
if isinstance(token, CharacterTokens):
|
|
218
225
|
data = token.data or ""
|
|
@@ -261,15 +268,11 @@ class TreeBuilderModesMixin:
|
|
|
261
268
|
# All token types are handled above - CharacterTokens, CommentToken, Tag, EOFToken
|
|
262
269
|
return None # pragma: no cover
|
|
263
270
|
|
|
264
|
-
def _mode_after_head(self, token:
|
|
271
|
+
def _mode_after_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
265
272
|
if isinstance(token, CharacterTokens):
|
|
266
273
|
data = token.data or ""
|
|
267
274
|
if "\x00" in data:
|
|
268
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
269
275
|
data = data.replace("\x00", "")
|
|
270
|
-
if "\x0c" in data:
|
|
271
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
272
|
-
data = data.replace("\x0c", "")
|
|
273
276
|
if not data or is_all_whitespace(data):
|
|
274
277
|
if data:
|
|
275
278
|
self._append_text(data)
|
|
@@ -331,6 +334,10 @@ class TreeBuilderModesMixin:
|
|
|
331
334
|
self.mode = InsertionMode.IN_HEAD
|
|
332
335
|
return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
333
336
|
if token.kind == Tag.END and token.name == "template":
|
|
337
|
+
has_template = any(node.name == "template" for node in self.open_elements)
|
|
338
|
+
if not has_template:
|
|
339
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
340
|
+
return None
|
|
334
341
|
return self._mode_in_head(token)
|
|
335
342
|
if token.kind == Tag.END and token.name == "body":
|
|
336
343
|
self._insert_body_if_missing()
|
|
@@ -350,7 +357,7 @@ class TreeBuilderModesMixin:
|
|
|
350
357
|
self._insert_body_if_missing()
|
|
351
358
|
return ("reprocess", InsertionMode.IN_BODY, token)
|
|
352
359
|
|
|
353
|
-
def _mode_text(self, token:
|
|
360
|
+
def _mode_text(self, token: AnyToken) -> ModeResultTuple | None:
|
|
354
361
|
if isinstance(token, CharacterTokens):
|
|
355
362
|
self._append_text(token.data)
|
|
356
363
|
return None
|
|
@@ -366,11 +373,11 @@ class TreeBuilderModesMixin:
|
|
|
366
373
|
self.mode = self.original_mode or InsertionMode.IN_BODY
|
|
367
374
|
return None
|
|
368
375
|
|
|
369
|
-
def _mode_in_body(self, token: Any) ->
|
|
376
|
+
def _mode_in_body(self, token: Any) -> ModeResultTuple | None:
|
|
370
377
|
handler = self._BODY_TOKEN_HANDLERS.get(type(token))
|
|
371
378
|
return handler(self, token) if handler else None
|
|
372
379
|
|
|
373
|
-
def _handle_characters_in_body(self, token:
|
|
380
|
+
def _handle_characters_in_body(self, token: CharacterTokens) -> None:
|
|
374
381
|
data = token.data or ""
|
|
375
382
|
if "\x00" in data:
|
|
376
383
|
self._parse_error("invalid-codepoint")
|
|
@@ -384,11 +391,11 @@ class TreeBuilderModesMixin:
|
|
|
384
391
|
self._append_text(data)
|
|
385
392
|
return
|
|
386
393
|
|
|
387
|
-
def _handle_comment_in_body(self, token:
|
|
394
|
+
def _handle_comment_in_body(self, token: CommentToken) -> None:
|
|
388
395
|
self._append_comment(token.data)
|
|
389
396
|
return
|
|
390
397
|
|
|
391
|
-
def _handle_tag_in_body(self, token:
|
|
398
|
+
def _handle_tag_in_body(self, token: Tag) -> ModeResultTuple | None:
|
|
392
399
|
if token.kind == Tag.START:
|
|
393
400
|
handler = self._BODY_START_HANDLERS.get(token.name)
|
|
394
401
|
if handler:
|
|
@@ -412,7 +419,7 @@ class TreeBuilderModesMixin:
|
|
|
412
419
|
self._any_other_end_tag(token.name)
|
|
413
420
|
return None
|
|
414
421
|
|
|
415
|
-
def _handle_eof_in_body(self, token:
|
|
422
|
+
def _handle_eof_in_body(self, token: EOFToken) -> ModeResultTuple | None:
|
|
416
423
|
# If we're in a template, handle EOF in template mode first
|
|
417
424
|
if self.template_modes:
|
|
418
425
|
return self._mode_in_template(token)
|
|
@@ -447,17 +454,19 @@ class TreeBuilderModesMixin:
|
|
|
447
454
|
# Body mode start tag handlers
|
|
448
455
|
# ---------------------
|
|
449
456
|
|
|
450
|
-
def _handle_body_start_html(self, token:
|
|
457
|
+
def _handle_body_start_html(self, token: Tag) -> None:
|
|
451
458
|
if self.template_modes:
|
|
452
459
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
453
460
|
return
|
|
461
|
+
# Per spec: parse error; merge attributes onto existing <html>.
|
|
462
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
454
463
|
# In IN_BODY mode, html element is always at open_elements[0]
|
|
455
464
|
if self.open_elements: # pragma: no branch
|
|
456
465
|
html = self.open_elements[0]
|
|
457
466
|
self._add_missing_attributes(html, token.attrs)
|
|
458
467
|
return
|
|
459
468
|
|
|
460
|
-
def _handle_body_start_body(self, token:
|
|
469
|
+
def _handle_body_start_body(self, token: Tag) -> None:
|
|
461
470
|
if self.template_modes:
|
|
462
471
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
463
472
|
return
|
|
@@ -471,19 +480,19 @@ class TreeBuilderModesMixin:
|
|
|
471
480
|
self.frameset_ok = False
|
|
472
481
|
return
|
|
473
482
|
|
|
474
|
-
def _handle_body_start_head(self, token:
|
|
483
|
+
def _handle_body_start_head(self, token: Tag) -> None:
|
|
475
484
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
476
485
|
return
|
|
477
486
|
|
|
478
|
-
def _handle_body_start_in_head(self, token:
|
|
487
|
+
def _handle_body_start_in_head(self, token: Tag) -> ModeResultTuple | None:
|
|
479
488
|
return self._mode_in_head(token)
|
|
480
489
|
|
|
481
|
-
def _handle_body_start_block_with_p(self, token:
|
|
490
|
+
def _handle_body_start_block_with_p(self, token: Tag) -> None:
|
|
482
491
|
self._close_p_element()
|
|
483
492
|
self._insert_element(token, push=True)
|
|
484
493
|
return
|
|
485
494
|
|
|
486
|
-
def _handle_body_start_heading(self, token:
|
|
495
|
+
def _handle_body_start_heading(self, token: Tag) -> None:
|
|
487
496
|
self._close_p_element()
|
|
488
497
|
if self.open_elements and self.open_elements[-1].name in HEADING_ELEMENTS:
|
|
489
498
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
@@ -492,14 +501,14 @@ class TreeBuilderModesMixin:
|
|
|
492
501
|
self.frameset_ok = False
|
|
493
502
|
return
|
|
494
503
|
|
|
495
|
-
def _handle_body_start_pre_listing(self, token:
|
|
504
|
+
def _handle_body_start_pre_listing(self, token: Tag) -> None:
|
|
496
505
|
self._close_p_element()
|
|
497
506
|
self._insert_element(token, push=True)
|
|
498
507
|
self.ignore_lf = True
|
|
499
508
|
self.frameset_ok = False
|
|
500
509
|
return
|
|
501
510
|
|
|
502
|
-
def _handle_body_start_form(self, token:
|
|
511
|
+
def _handle_body_start_form(self, token: Tag) -> None:
|
|
503
512
|
if self.form_element is not None:
|
|
504
513
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
505
514
|
return
|
|
@@ -509,7 +518,7 @@ class TreeBuilderModesMixin:
|
|
|
509
518
|
self.frameset_ok = False
|
|
510
519
|
return
|
|
511
520
|
|
|
512
|
-
def _handle_body_start_button(self, token:
|
|
521
|
+
def _handle_body_start_button(self, token: Tag) -> None:
|
|
513
522
|
if self._has_in_scope("button"):
|
|
514
523
|
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
515
524
|
self._close_element_by_name("button")
|
|
@@ -517,19 +526,19 @@ class TreeBuilderModesMixin:
|
|
|
517
526
|
self.frameset_ok = False
|
|
518
527
|
return
|
|
519
528
|
|
|
520
|
-
def _handle_body_start_paragraph(self, token:
|
|
529
|
+
def _handle_body_start_paragraph(self, token: Tag) -> None:
|
|
521
530
|
self._close_p_element()
|
|
522
531
|
self._insert_element(token, push=True)
|
|
523
532
|
return
|
|
524
533
|
|
|
525
|
-
def _handle_body_start_math(self, token:
|
|
534
|
+
def _handle_body_start_math(self, token: Tag) -> None:
|
|
526
535
|
self._reconstruct_active_formatting_elements()
|
|
527
536
|
attrs = self._prepare_foreign_attributes("math", token.attrs)
|
|
528
537
|
new_tag = Tag(Tag.START, token.name, attrs, token.self_closing)
|
|
529
538
|
self._insert_element(new_tag, push=not token.self_closing, namespace="math")
|
|
530
539
|
return
|
|
531
540
|
|
|
532
|
-
def _handle_body_start_svg(self, token:
|
|
541
|
+
def _handle_body_start_svg(self, token: Tag) -> None:
|
|
533
542
|
self._reconstruct_active_formatting_elements()
|
|
534
543
|
adjusted_name = self._adjust_svg_tag_name(token.name)
|
|
535
544
|
attrs = self._prepare_foreign_attributes("svg", token.attrs)
|
|
@@ -537,7 +546,7 @@ class TreeBuilderModesMixin:
|
|
|
537
546
|
self._insert_element(new_tag, push=not token.self_closing, namespace="svg")
|
|
538
547
|
return
|
|
539
548
|
|
|
540
|
-
def _handle_body_start_li(self, token:
|
|
549
|
+
def _handle_body_start_li(self, token: Tag) -> None:
|
|
541
550
|
self.frameset_ok = False
|
|
542
551
|
self._close_p_element()
|
|
543
552
|
if self._has_in_list_item_scope("li"):
|
|
@@ -545,7 +554,7 @@ class TreeBuilderModesMixin:
|
|
|
545
554
|
self._insert_element(token, push=True)
|
|
546
555
|
return
|
|
547
556
|
|
|
548
|
-
def _handle_body_start_dd_dt(self, token:
|
|
557
|
+
def _handle_body_start_dd_dt(self, token: Tag) -> None:
|
|
549
558
|
self.frameset_ok = False
|
|
550
559
|
self._close_p_element()
|
|
551
560
|
name = token.name
|
|
@@ -574,6 +583,10 @@ class TreeBuilderModesMixin:
|
|
|
574
583
|
# 3. Find formatting element
|
|
575
584
|
formatting_element_index = self._find_active_formatting_index(subject)
|
|
576
585
|
if formatting_element_index is None:
|
|
586
|
+
# html5lib reports a parse error when an end tag for a formatting
|
|
587
|
+
# element triggers the adoption agency algorithm but no matching
|
|
588
|
+
# active formatting entry exists.
|
|
589
|
+
self._parse_error("adoption-agency-1.3")
|
|
577
590
|
return
|
|
578
591
|
|
|
579
592
|
formatting_element_entry = self.active_formatting[formatting_element_index]
|
|
@@ -607,7 +620,7 @@ class TreeBuilderModesMixin:
|
|
|
607
620
|
if furthest_block is None:
|
|
608
621
|
# formatting_element is known to be on the stack
|
|
609
622
|
while True:
|
|
610
|
-
popped = self.
|
|
623
|
+
popped = self._pop_current()
|
|
611
624
|
if popped is formatting_element:
|
|
612
625
|
break
|
|
613
626
|
self._remove_formatting_entry(formatting_element_index)
|
|
@@ -651,6 +664,10 @@ class TreeBuilderModesMixin:
|
|
|
651
664
|
# 10.4 Replace entry with new element
|
|
652
665
|
entry = self.active_formatting[node_formatting_index]
|
|
653
666
|
new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
667
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
668
|
+
new_element._origin_pos = entry["node"].origin_offset
|
|
669
|
+
new_element._origin_line = entry["node"].origin_line
|
|
670
|
+
new_element._origin_col = entry["node"].origin_col
|
|
654
671
|
entry["node"] = new_element
|
|
655
672
|
self.open_elements[self.open_elements.index(node)] = new_element
|
|
656
673
|
node = new_element
|
|
@@ -684,6 +701,10 @@ class TreeBuilderModesMixin:
|
|
|
684
701
|
# 12. Create new formatting element
|
|
685
702
|
entry = self.active_formatting[formatting_element_index]
|
|
686
703
|
new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
704
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
705
|
+
new_formatting_element._origin_pos = entry["node"].origin_offset
|
|
706
|
+
new_formatting_element._origin_line = entry["node"].origin_line
|
|
707
|
+
new_formatting_element._origin_col = entry["node"].origin_col
|
|
687
708
|
entry["node"] = new_formatting_element
|
|
688
709
|
|
|
689
710
|
# 13. Move children of furthest block
|
|
@@ -706,8 +727,9 @@ class TreeBuilderModesMixin:
|
|
|
706
727
|
furthest_block_index = self.open_elements.index(furthest_block)
|
|
707
728
|
self.open_elements.insert(furthest_block_index + 1, new_formatting_element)
|
|
708
729
|
|
|
709
|
-
def _handle_body_start_a(self, token:
|
|
730
|
+
def _handle_body_start_a(self, token: Tag) -> None:
|
|
710
731
|
if self._has_active_formatting_entry("a"):
|
|
732
|
+
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
711
733
|
self._adoption_agency("a")
|
|
712
734
|
self._remove_last_active_formatting_by_name("a")
|
|
713
735
|
self._remove_last_open_element_by_name("a")
|
|
@@ -716,7 +738,7 @@ class TreeBuilderModesMixin:
|
|
|
716
738
|
self._append_active_formatting_entry("a", token.attrs, node)
|
|
717
739
|
return
|
|
718
740
|
|
|
719
|
-
def _handle_body_start_formatting(self, token:
|
|
741
|
+
def _handle_body_start_formatting(self, token: Tag) -> None:
|
|
720
742
|
name = token.name
|
|
721
743
|
if name == "nobr" and self._in_scope("nobr"):
|
|
722
744
|
self._adoption_agency("nobr")
|
|
@@ -730,21 +752,21 @@ class TreeBuilderModesMixin:
|
|
|
730
752
|
self._append_active_formatting_entry(name, token.attrs, node)
|
|
731
753
|
return
|
|
732
754
|
|
|
733
|
-
def _handle_body_start_applet_like(self, token:
|
|
755
|
+
def _handle_body_start_applet_like(self, token: Tag) -> None:
|
|
734
756
|
self._reconstruct_active_formatting_elements()
|
|
735
757
|
self._insert_element(token, push=True)
|
|
736
758
|
self._push_formatting_marker()
|
|
737
759
|
self.frameset_ok = False
|
|
738
760
|
return
|
|
739
761
|
|
|
740
|
-
def _handle_body_start_br(self, token:
|
|
762
|
+
def _handle_body_start_br(self, token: Tag) -> None:
|
|
741
763
|
self._close_p_element()
|
|
742
764
|
self._reconstruct_active_formatting_elements()
|
|
743
765
|
self._insert_element(token, push=False)
|
|
744
766
|
self.frameset_ok = False
|
|
745
767
|
return
|
|
746
768
|
|
|
747
|
-
def _handle_body_start_frameset(self, token:
|
|
769
|
+
def _handle_body_start_frameset(self, token: Tag) -> None:
|
|
748
770
|
if not self.frameset_ok:
|
|
749
771
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
750
772
|
return
|
|
@@ -769,17 +791,17 @@ class TreeBuilderModesMixin:
|
|
|
769
791
|
# Body mode end tag handlers
|
|
770
792
|
# ---------------------
|
|
771
793
|
|
|
772
|
-
def _handle_body_end_body(self, token:
|
|
794
|
+
def _handle_body_end_body(self, token: Tag) -> None:
|
|
773
795
|
if self._in_scope("body"):
|
|
774
796
|
self.mode = InsertionMode.AFTER_BODY
|
|
775
797
|
return
|
|
776
798
|
|
|
777
|
-
def _handle_body_end_html(self, token:
|
|
799
|
+
def _handle_body_end_html(self, token: Tag) -> ModeResultTuple | None:
|
|
778
800
|
if self._in_scope("body"):
|
|
779
801
|
return ("reprocess", InsertionMode.AFTER_BODY, token)
|
|
780
802
|
return None
|
|
781
803
|
|
|
782
|
-
def _handle_body_end_p(self, token:
|
|
804
|
+
def _handle_body_end_p(self, token: Tag) -> None:
|
|
783
805
|
if not self._close_p_element():
|
|
784
806
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
785
807
|
phantom = Tag(Tag.START, "p", {}, False)
|
|
@@ -787,21 +809,21 @@ class TreeBuilderModesMixin:
|
|
|
787
809
|
self._close_p_element()
|
|
788
810
|
return
|
|
789
811
|
|
|
790
|
-
def _handle_body_end_li(self, token:
|
|
812
|
+
def _handle_body_end_li(self, token: Tag) -> None:
|
|
791
813
|
if not self._has_in_list_item_scope("li"):
|
|
792
814
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
793
815
|
return
|
|
794
816
|
self._pop_until_any_inclusive({"li"})
|
|
795
817
|
return
|
|
796
818
|
|
|
797
|
-
def _handle_body_end_dd_dt(self, token:
|
|
819
|
+
def _handle_body_end_dd_dt(self, token: Tag) -> None:
|
|
798
820
|
name = token.name
|
|
799
821
|
if not self._has_in_definition_scope(name):
|
|
800
822
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
801
823
|
return
|
|
802
824
|
self._pop_until_any_inclusive({"dd", "dt"})
|
|
803
825
|
|
|
804
|
-
def _handle_body_end_form(self, token:
|
|
826
|
+
def _handle_body_end_form(self, token: Tag) -> None:
|
|
805
827
|
if self.form_element is None:
|
|
806
828
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
807
829
|
return
|
|
@@ -811,20 +833,20 @@ class TreeBuilderModesMixin:
|
|
|
811
833
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
812
834
|
return
|
|
813
835
|
|
|
814
|
-
def _handle_body_end_applet_like(self, token:
|
|
836
|
+
def _handle_body_end_applet_like(self, token: Tag) -> None:
|
|
815
837
|
name = token.name
|
|
816
838
|
if not self._in_scope(name):
|
|
817
839
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
818
840
|
return
|
|
819
841
|
# Element verified in scope above
|
|
820
842
|
while self.open_elements: # pragma: no branch
|
|
821
|
-
popped = self.
|
|
843
|
+
popped = self._pop_current()
|
|
822
844
|
if popped.name == name:
|
|
823
845
|
break
|
|
824
846
|
self._clear_active_formatting_up_to_marker()
|
|
825
847
|
return
|
|
826
848
|
|
|
827
|
-
def _handle_body_end_heading(self, token:
|
|
849
|
+
def _handle_body_end_heading(self, token: Tag) -> None:
|
|
828
850
|
name = token.name
|
|
829
851
|
if not self._has_any_in_scope(HEADING_ELEMENTS):
|
|
830
852
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
@@ -834,12 +856,12 @@ class TreeBuilderModesMixin:
|
|
|
834
856
|
self._parse_error("end-tag-too-early", tag_name=name)
|
|
835
857
|
# Heading verified in scope by caller
|
|
836
858
|
while self.open_elements: # pragma: no branch
|
|
837
|
-
popped = self.
|
|
859
|
+
popped = self._pop_current()
|
|
838
860
|
if popped.name in HEADING_ELEMENTS:
|
|
839
861
|
break
|
|
840
862
|
return
|
|
841
863
|
|
|
842
|
-
def _handle_body_end_block(self, token:
|
|
864
|
+
def _handle_body_end_block(self, token: Tag) -> None:
|
|
843
865
|
name = token.name
|
|
844
866
|
if not self._in_scope(name):
|
|
845
867
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
@@ -850,9 +872,10 @@ class TreeBuilderModesMixin:
|
|
|
850
872
|
self._pop_until_any_inclusive({name})
|
|
851
873
|
return
|
|
852
874
|
|
|
853
|
-
def _handle_body_end_template(self, token:
|
|
875
|
+
def _handle_body_end_template(self, token: Tag) -> None:
|
|
854
876
|
has_template = any(node.name == "template" for node in self.open_elements)
|
|
855
877
|
if not has_template:
|
|
878
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
856
879
|
return
|
|
857
880
|
self._generate_implied_end_tags()
|
|
858
881
|
self._pop_until_inclusive("template")
|
|
@@ -863,18 +886,18 @@ class TreeBuilderModesMixin:
|
|
|
863
886
|
self._reset_insertion_mode()
|
|
864
887
|
return
|
|
865
888
|
|
|
866
|
-
def _handle_body_start_structure_ignored(self, token:
|
|
889
|
+
def _handle_body_start_structure_ignored(self, token: Tag) -> None:
|
|
867
890
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
868
891
|
return
|
|
869
892
|
|
|
870
|
-
def _handle_body_start_col_or_frame(self, token:
|
|
893
|
+
def _handle_body_start_col_or_frame(self, token: Tag) -> None:
|
|
871
894
|
if self.fragment_context is None:
|
|
872
895
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
873
896
|
return
|
|
874
897
|
self._insert_element(token, push=False)
|
|
875
898
|
return
|
|
876
899
|
|
|
877
|
-
def _handle_body_start_image(self, token:
|
|
900
|
+
def _handle_body_start_image(self, token: Tag) -> None:
|
|
878
901
|
self._parse_error("image-start-tag", tag_name=token.name)
|
|
879
902
|
img_token = Tag(Tag.START, "img", token.attrs, token.self_closing)
|
|
880
903
|
self._reconstruct_active_formatting_elements()
|
|
@@ -882,17 +905,17 @@ class TreeBuilderModesMixin:
|
|
|
882
905
|
self.frameset_ok = False
|
|
883
906
|
return
|
|
884
907
|
|
|
885
|
-
def _handle_body_start_void_with_formatting(self, token:
|
|
908
|
+
def _handle_body_start_void_with_formatting(self, token: Tag) -> None:
|
|
886
909
|
self._reconstruct_active_formatting_elements()
|
|
887
910
|
self._insert_element(token, push=False)
|
|
888
911
|
self.frameset_ok = False
|
|
889
912
|
return
|
|
890
913
|
|
|
891
|
-
def _handle_body_start_simple_void(self, token:
|
|
914
|
+
def _handle_body_start_simple_void(self, token: Tag) -> None:
|
|
892
915
|
self._insert_element(token, push=False)
|
|
893
916
|
return
|
|
894
917
|
|
|
895
|
-
def _handle_body_start_input(self, token:
|
|
918
|
+
def _handle_body_start_input(self, token: Tag) -> None:
|
|
896
919
|
input_type = None
|
|
897
920
|
for name, value in token.attrs.items():
|
|
898
921
|
if name == "type":
|
|
@@ -903,7 +926,7 @@ class TreeBuilderModesMixin:
|
|
|
903
926
|
self.frameset_ok = False
|
|
904
927
|
return
|
|
905
928
|
|
|
906
|
-
def _handle_body_start_table(self, token:
|
|
929
|
+
def _handle_body_start_table(self, token: Tag) -> None:
|
|
907
930
|
if self.quirks_mode != "quirks":
|
|
908
931
|
self._close_p_element()
|
|
909
932
|
self._insert_element(token, push=True)
|
|
@@ -911,7 +934,7 @@ class TreeBuilderModesMixin:
|
|
|
911
934
|
self.mode = InsertionMode.IN_TABLE
|
|
912
935
|
return
|
|
913
936
|
|
|
914
|
-
def _handle_body_start_plaintext_xmp(self, token:
|
|
937
|
+
def _handle_body_start_plaintext_xmp(self, token: Tag) -> None:
|
|
915
938
|
self._close_p_element()
|
|
916
939
|
self._insert_element(token, push=True)
|
|
917
940
|
self.frameset_ok = False
|
|
@@ -923,66 +946,88 @@ class TreeBuilderModesMixin:
|
|
|
923
946
|
self.mode = InsertionMode.TEXT
|
|
924
947
|
return
|
|
925
948
|
|
|
926
|
-
def _handle_body_start_textarea(self, token:
|
|
949
|
+
def _handle_body_start_textarea(self, token: Tag) -> None:
|
|
927
950
|
self._insert_element(token, push=True)
|
|
928
951
|
self.ignore_lf = True
|
|
929
952
|
self.frameset_ok = False
|
|
930
953
|
return
|
|
931
954
|
|
|
932
|
-
def _handle_body_start_select(self, token:
|
|
955
|
+
def _handle_body_start_select(self, token: Tag) -> None:
|
|
933
956
|
self._reconstruct_active_formatting_elements()
|
|
934
957
|
self._insert_element(token, push=True)
|
|
935
958
|
self.frameset_ok = False
|
|
936
959
|
self._reset_insertion_mode()
|
|
937
960
|
return
|
|
938
961
|
|
|
939
|
-
def _handle_body_start_option(self, token:
|
|
962
|
+
def _handle_body_start_option(self, token: Tag) -> None:
|
|
940
963
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
941
|
-
self.
|
|
964
|
+
self._pop_current()
|
|
942
965
|
self._reconstruct_active_formatting_elements()
|
|
943
966
|
self._insert_element(token, push=True)
|
|
944
967
|
return
|
|
945
968
|
|
|
946
|
-
def _handle_body_start_optgroup(self, token:
|
|
969
|
+
def _handle_body_start_optgroup(self, token: Tag) -> None:
|
|
947
970
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
948
|
-
self.
|
|
971
|
+
self._pop_current()
|
|
949
972
|
self._reconstruct_active_formatting_elements()
|
|
950
973
|
self._insert_element(token, push=True)
|
|
951
974
|
return
|
|
952
975
|
|
|
953
|
-
def _handle_body_start_rp_rt(self, token:
|
|
976
|
+
def _handle_body_start_rp_rt(self, token: Tag) -> None:
|
|
954
977
|
self._generate_implied_end_tags(exclude="rtc")
|
|
955
978
|
self._insert_element(token, push=True)
|
|
956
979
|
return
|
|
957
980
|
|
|
958
|
-
def _handle_body_start_rb_rtc(self, token:
|
|
981
|
+
def _handle_body_start_rb_rtc(self, token: Tag) -> None:
|
|
959
982
|
if self.open_elements and self.open_elements[-1].name in {"rb", "rp", "rt", "rtc"}:
|
|
960
983
|
self._generate_implied_end_tags()
|
|
961
984
|
self._insert_element(token, push=True)
|
|
962
985
|
return
|
|
963
986
|
|
|
964
|
-
def _handle_body_start_table_parse_error(self, token:
|
|
987
|
+
def _handle_body_start_table_parse_error(self, token: Tag) -> None:
|
|
965
988
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
966
989
|
return
|
|
967
990
|
|
|
968
|
-
def _handle_body_start_default(self, token:
|
|
991
|
+
def _handle_body_start_default(self, token: Tag) -> ModeResultTuple | None:
|
|
969
992
|
self._reconstruct_active_formatting_elements()
|
|
970
993
|
self._insert_element(token, push=True)
|
|
971
994
|
if token.self_closing:
|
|
972
995
|
self._parse_error("non-void-html-element-start-tag-with-trailing-solidus", tag_name=token.name)
|
|
973
996
|
# Elements reaching here have no handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
974
997
|
self.frameset_ok = False
|
|
975
|
-
return
|
|
998
|
+
return None
|
|
976
999
|
|
|
977
|
-
def _mode_in_table(self, token:
|
|
1000
|
+
def _mode_in_table(self, token: AnyToken) -> ModeResultTuple | None:
|
|
978
1001
|
if isinstance(token, CharacterTokens):
|
|
979
1002
|
data = token.data or ""
|
|
980
1003
|
if "\x00" in data:
|
|
981
|
-
self._parse_error("unexpected-null-character")
|
|
982
1004
|
data = data.replace("\x00", "")
|
|
983
1005
|
if not data:
|
|
984
1006
|
return None
|
|
985
1007
|
token = CharacterTokens(data)
|
|
1008
|
+
|
|
1009
|
+
if is_all_whitespace(data):
|
|
1010
|
+
self._append_text(data)
|
|
1011
|
+
return None
|
|
1012
|
+
|
|
1013
|
+
# html5lib-tests expect that some table foster-parenting text triggered by a
|
|
1014
|
+
# misnested formatting element (<a>) only produces an implied-end-tag error
|
|
1015
|
+
# when the table closes, not an additional character-in-table error.
|
|
1016
|
+
suppress_table_char_error = False
|
|
1017
|
+
if self.active_formatting:
|
|
1018
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1019
|
+
entry = self.active_formatting[idx]
|
|
1020
|
+
if entry is FORMAT_MARKER:
|
|
1021
|
+
break
|
|
1022
|
+
if entry["name"] == "a":
|
|
1023
|
+
if entry["node"] not in self.open_elements:
|
|
1024
|
+
suppress_table_char_error = True
|
|
1025
|
+
break
|
|
1026
|
+
|
|
1027
|
+
if not suppress_table_char_error:
|
|
1028
|
+
self.pending_table_text_should_error = True
|
|
1029
|
+
else:
|
|
1030
|
+
self.pending_table_text_should_error = False
|
|
986
1031
|
self.pending_table_text = []
|
|
987
1032
|
self.table_text_original_mode = self.mode
|
|
988
1033
|
self.mode = InsertionMode.IN_TABLE_TEXT
|
|
@@ -1046,16 +1091,16 @@ class TreeBuilderModesMixin:
|
|
|
1046
1091
|
if input_type == "hidden":
|
|
1047
1092
|
self._parse_error("unexpected-hidden-input-in-table")
|
|
1048
1093
|
self._insert_element(token, push=True)
|
|
1049
|
-
self.
|
|
1094
|
+
self._pop_current() # push=True always adds to stack
|
|
1050
1095
|
return None
|
|
1051
1096
|
if name == "form":
|
|
1052
1097
|
self._parse_error("unexpected-form-in-table")
|
|
1053
1098
|
if self.form_element is None:
|
|
1054
1099
|
node = self._insert_element(token, push=True)
|
|
1055
1100
|
self.form_element = node
|
|
1056
|
-
self.
|
|
1101
|
+
self._pop_current() # push=True always adds to stack
|
|
1057
1102
|
return None
|
|
1058
|
-
self._parse_error("
|
|
1103
|
+
self._parse_error("foster-parenting-start-tag", tag_name=name)
|
|
1059
1104
|
previous = self.insert_from_table
|
|
1060
1105
|
self.insert_from_table = True
|
|
1061
1106
|
try:
|
|
@@ -1082,26 +1127,40 @@ class TreeBuilderModesMixin:
|
|
|
1082
1127
|
if self.template_modes:
|
|
1083
1128
|
return self._mode_in_template(token)
|
|
1084
1129
|
if self._has_in_table_scope("table"):
|
|
1085
|
-
self._parse_error("
|
|
1130
|
+
self._parse_error("eof-in-table")
|
|
1086
1131
|
return None
|
|
1087
1132
|
|
|
1088
|
-
def _mode_in_table_text(self, token:
|
|
1133
|
+
def _mode_in_table_text(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1089
1134
|
if isinstance(token, CharacterTokens):
|
|
1090
1135
|
# IN_TABLE mode guarantees non-empty data
|
|
1091
1136
|
data = token.data
|
|
1092
|
-
|
|
1093
|
-
self._parse_error("invalid-codepoint-in-table-text")
|
|
1094
|
-
data = data.replace("\x0c", "")
|
|
1095
|
-
if data:
|
|
1096
|
-
self.pending_table_text.append(data)
|
|
1137
|
+
self.pending_table_text.append(data)
|
|
1097
1138
|
return None
|
|
1139
|
+
|
|
1140
|
+
if (
|
|
1141
|
+
self.pending_table_text
|
|
1142
|
+
and isinstance(token, Tag)
|
|
1143
|
+
and token.kind == Tag.END
|
|
1144
|
+
and token.name == "table"
|
|
1145
|
+
and not is_all_whitespace("".join(self.pending_table_text))
|
|
1146
|
+
):
|
|
1147
|
+
# If a misnested <a> exists only in the active formatting list, html5lib
|
|
1148
|
+
# reports the implied close when the table ends.
|
|
1149
|
+
if self.active_formatting:
|
|
1150
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1151
|
+
entry = self.active_formatting[idx]
|
|
1152
|
+
if entry is FORMAT_MARKER:
|
|
1153
|
+
break
|
|
1154
|
+
if entry["name"] == "a" and entry["node"] not in self.open_elements:
|
|
1155
|
+
self._parse_error("unexpected-implied-end-tag-in-table-view")
|
|
1156
|
+
break
|
|
1098
1157
|
self._flush_pending_table_text()
|
|
1099
1158
|
original = self.table_text_original_mode or InsertionMode.IN_TABLE
|
|
1100
1159
|
self.table_text_original_mode = None
|
|
1101
1160
|
self.mode = original
|
|
1102
1161
|
return ("reprocess", original, token)
|
|
1103
1162
|
|
|
1104
|
-
def _mode_in_caption(self, token:
|
|
1163
|
+
def _mode_in_caption(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1105
1164
|
if isinstance(token, CharacterTokens):
|
|
1106
1165
|
return self._mode_in_body(token)
|
|
1107
1166
|
if isinstance(token, CommentToken):
|
|
@@ -1147,14 +1206,14 @@ class TreeBuilderModesMixin:
|
|
|
1147
1206
|
self._generate_implied_end_tags()
|
|
1148
1207
|
# Caption verified in scope above
|
|
1149
1208
|
while self.open_elements: # pragma: no branch
|
|
1150
|
-
node = self.
|
|
1209
|
+
node = self._pop_current()
|
|
1151
1210
|
if node.name == "caption":
|
|
1152
1211
|
break
|
|
1153
1212
|
self._clear_active_formatting_up_to_marker()
|
|
1154
1213
|
self.mode = InsertionMode.IN_TABLE
|
|
1155
1214
|
return True
|
|
1156
1215
|
|
|
1157
|
-
def _mode_in_column_group(self, token:
|
|
1216
|
+
def _mode_in_column_group(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1158
1217
|
current = self.open_elements[-1] if self.open_elements else None
|
|
1159
1218
|
if isinstance(token, CharacterTokens):
|
|
1160
1219
|
data = token.data or ""
|
|
@@ -1191,7 +1250,7 @@ class TreeBuilderModesMixin:
|
|
|
1191
1250
|
return self._mode_in_body(token)
|
|
1192
1251
|
if name == "col":
|
|
1193
1252
|
self._insert_element(token, push=True)
|
|
1194
|
-
self.
|
|
1253
|
+
self._pop_current() # push=True always adds to stack
|
|
1195
1254
|
return None
|
|
1196
1255
|
if name == "template":
|
|
1197
1256
|
# Template is handled by delegating to IN_HEAD
|
|
@@ -1249,7 +1308,7 @@ class TreeBuilderModesMixin:
|
|
|
1249
1308
|
return None
|
|
1250
1309
|
# Per spec: EOF when current is html - implicit None return
|
|
1251
1310
|
|
|
1252
|
-
def _mode_in_table_body(self, token:
|
|
1311
|
+
def _mode_in_table_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1253
1312
|
if isinstance(token, CharacterTokens) or isinstance(token, CommentToken):
|
|
1254
1313
|
return self._mode_in_table(token)
|
|
1255
1314
|
if isinstance(token, Tag):
|
|
@@ -1284,7 +1343,7 @@ class TreeBuilderModesMixin:
|
|
|
1284
1343
|
return None
|
|
1285
1344
|
# Pop tbody/tfoot/thead (stack always has elements here in normal parsing)
|
|
1286
1345
|
if self.open_elements:
|
|
1287
|
-
self.
|
|
1346
|
+
self._pop_current()
|
|
1288
1347
|
self.mode = InsertionMode.IN_TABLE
|
|
1289
1348
|
return ("reprocess", InsertionMode.IN_TABLE, token)
|
|
1290
1349
|
# Empty stack edge case - go directly to IN_TABLE without reprocess
|
|
@@ -1315,7 +1374,7 @@ class TreeBuilderModesMixin:
|
|
|
1315
1374
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1316
1375
|
return None
|
|
1317
1376
|
if current and current.name in {"tbody", "tfoot", "thead"}:
|
|
1318
|
-
self.
|
|
1377
|
+
self._pop_current()
|
|
1319
1378
|
self.mode = InsertionMode.IN_TABLE
|
|
1320
1379
|
return ("reprocess", InsertionMode.IN_TABLE, token)
|
|
1321
1380
|
if name in {"caption", "col", "colgroup", "td", "th", "tr"}:
|
|
@@ -1325,7 +1384,7 @@ class TreeBuilderModesMixin:
|
|
|
1325
1384
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1326
1385
|
return self._mode_in_table(token)
|
|
1327
1386
|
|
|
1328
|
-
def _mode_in_row(self, token:
|
|
1387
|
+
def _mode_in_row(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1329
1388
|
if isinstance(token, CharacterTokens) or isinstance(token, CommentToken):
|
|
1330
1389
|
return self._mode_in_table(token)
|
|
1331
1390
|
if isinstance(token, Tag):
|
|
@@ -1378,14 +1437,14 @@ class TreeBuilderModesMixin:
|
|
|
1378
1437
|
self._clear_stack_until({"tr", "template", "html"})
|
|
1379
1438
|
# Pop tr if on top (may not be if stack was exhausted)
|
|
1380
1439
|
if self.open_elements and self.open_elements[-1].name == "tr":
|
|
1381
|
-
self.
|
|
1440
|
+
self._pop_current()
|
|
1382
1441
|
# When in a template, restore template mode; otherwise use IN_TABLE_BODY
|
|
1383
1442
|
if self.template_modes:
|
|
1384
1443
|
self.mode = self.template_modes[-1]
|
|
1385
1444
|
else:
|
|
1386
1445
|
self.mode = InsertionMode.IN_TABLE_BODY
|
|
1387
1446
|
|
|
1388
|
-
def _mode_in_cell(self, token:
|
|
1447
|
+
def _mode_in_cell(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1389
1448
|
if isinstance(token, CharacterTokens):
|
|
1390
1449
|
previous = self.insert_from_table
|
|
1391
1450
|
self.insert_from_table = False
|
|
@@ -1439,15 +1498,11 @@ class TreeBuilderModesMixin:
|
|
|
1439
1498
|
return ("reprocess", self.mode, token)
|
|
1440
1499
|
return self._mode_in_table(token)
|
|
1441
1500
|
|
|
1442
|
-
def _mode_in_select(self, token:
|
|
1501
|
+
def _mode_in_select(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1443
1502
|
if isinstance(token, CharacterTokens):
|
|
1444
1503
|
data = token.data or ""
|
|
1445
1504
|
if "\x00" in data:
|
|
1446
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1447
1505
|
data = data.replace("\x00", "")
|
|
1448
|
-
if "\x0c" in data:
|
|
1449
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1450
|
-
data = data.replace("\x0c", "")
|
|
1451
1506
|
if data:
|
|
1452
1507
|
self._reconstruct_active_formatting_elements()
|
|
1453
1508
|
self._append_text(data)
|
|
@@ -1462,26 +1517,26 @@ class TreeBuilderModesMixin:
|
|
|
1462
1517
|
return ("reprocess", InsertionMode.IN_BODY, token)
|
|
1463
1518
|
if name == "option":
|
|
1464
1519
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1465
|
-
self.
|
|
1520
|
+
self._pop_current()
|
|
1466
1521
|
self._reconstruct_active_formatting_elements()
|
|
1467
1522
|
self._insert_element(token, push=True)
|
|
1468
1523
|
return None
|
|
1469
1524
|
if name == "optgroup":
|
|
1470
1525
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1471
|
-
self.
|
|
1526
|
+
self._pop_current()
|
|
1472
1527
|
if self.open_elements and self.open_elements[-1].name == "optgroup":
|
|
1473
|
-
self.
|
|
1528
|
+
self._pop_current()
|
|
1474
1529
|
self._reconstruct_active_formatting_elements()
|
|
1475
1530
|
self._insert_element(token, push=True)
|
|
1476
1531
|
return None
|
|
1477
1532
|
if name == "select":
|
|
1478
|
-
self._parse_error("unexpected-
|
|
1533
|
+
self._parse_error("unexpected-select-in-select")
|
|
1479
1534
|
# select is always in scope in IN_SELECT mode
|
|
1480
1535
|
self._pop_until_any_inclusive({"select"})
|
|
1481
1536
|
self._reset_insertion_mode()
|
|
1482
1537
|
return None
|
|
1483
1538
|
if name in {"input", "textarea"}:
|
|
1484
|
-
self._parse_error("unexpected-start-tag-
|
|
1539
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1485
1540
|
# select is always in scope in IN_SELECT mode
|
|
1486
1541
|
self._pop_until_any_inclusive({"select"})
|
|
1487
1542
|
self._reset_insertion_mode()
|
|
@@ -1491,7 +1546,7 @@ class TreeBuilderModesMixin:
|
|
|
1491
1546
|
self._insert_element(token, push=False)
|
|
1492
1547
|
return None
|
|
1493
1548
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1494
|
-
self._parse_error("unexpected-start-tag-
|
|
1549
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1495
1550
|
# select is always in scope in IN_SELECT mode
|
|
1496
1551
|
self._pop_until_any_inclusive({"select"})
|
|
1497
1552
|
self._reset_insertion_mode()
|
|
@@ -1509,45 +1564,53 @@ class TreeBuilderModesMixin:
|
|
|
1509
1564
|
self._append_active_formatting_entry(name, token.attrs, node)
|
|
1510
1565
|
return None
|
|
1511
1566
|
if name == "hr":
|
|
1567
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1512
1568
|
# Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
|
|
1513
1569
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1514
|
-
self.
|
|
1570
|
+
self._pop_current()
|
|
1515
1571
|
if self.open_elements and self.open_elements[-1].name == "optgroup":
|
|
1516
|
-
self.
|
|
1572
|
+
self._pop_current()
|
|
1517
1573
|
self._reconstruct_active_formatting_elements()
|
|
1518
1574
|
self._insert_element(token, push=False)
|
|
1519
1575
|
return None
|
|
1520
1576
|
if name == "menuitem":
|
|
1577
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1521
1578
|
self._reconstruct_active_formatting_elements()
|
|
1522
1579
|
self._insert_element(token, push=True)
|
|
1523
1580
|
return None
|
|
1524
1581
|
# Allow common HTML elements in select (newer spec)
|
|
1525
1582
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1583
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1526
1584
|
self._reconstruct_active_formatting_elements()
|
|
1527
1585
|
self._insert_element(token, push=not token.self_closing)
|
|
1528
1586
|
return None
|
|
1529
1587
|
if name in {"br", "img"}:
|
|
1588
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1530
1589
|
self._reconstruct_active_formatting_elements()
|
|
1531
1590
|
self._insert_element(token, push=False)
|
|
1532
1591
|
return None
|
|
1533
1592
|
if name == "plaintext":
|
|
1534
1593
|
# Per spec: plaintext element is inserted in select (consumes all remaining text)
|
|
1594
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1535
1595
|
self._reconstruct_active_formatting_elements()
|
|
1536
1596
|
self._insert_element(token, push=True)
|
|
1597
|
+
return None
|
|
1598
|
+
# Any other start tag: parse error, ignore.
|
|
1599
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1537
1600
|
return None
|
|
1538
1601
|
if name == "optgroup":
|
|
1539
1602
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1540
|
-
self.
|
|
1603
|
+
self._pop_current()
|
|
1541
1604
|
if self.open_elements and self.open_elements[-1].name == "optgroup":
|
|
1542
|
-
self.
|
|
1605
|
+
self._pop_current()
|
|
1543
1606
|
else:
|
|
1544
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1607
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1545
1608
|
return None
|
|
1546
1609
|
if name == "option":
|
|
1547
1610
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1548
|
-
self.
|
|
1611
|
+
self._pop_current()
|
|
1549
1612
|
else:
|
|
1550
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1613
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1551
1614
|
return None
|
|
1552
1615
|
if name == "select":
|
|
1553
1616
|
# In IN_SELECT mode, select is always in scope - pop to it
|
|
@@ -1559,17 +1622,20 @@ class TreeBuilderModesMixin:
|
|
|
1559
1622
|
# select is always on stack in IN_SELECT mode
|
|
1560
1623
|
select_node = self._find_last_on_stack("select")
|
|
1561
1624
|
fmt_index = self._find_active_formatting_index(name)
|
|
1562
|
-
if fmt_index is
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1625
|
+
if fmt_index is None:
|
|
1626
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1627
|
+
return None
|
|
1628
|
+
target = self.active_formatting[fmt_index]["node"]
|
|
1629
|
+
if target in self.open_elements: # pragma: no branch
|
|
1630
|
+
select_index = self.open_elements.index(select_node)
|
|
1631
|
+
target_index = self.open_elements.index(target)
|
|
1632
|
+
if target_index < select_index:
|
|
1633
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1634
|
+
return None
|
|
1570
1635
|
self._adoption_agency(name)
|
|
1571
1636
|
return None
|
|
1572
1637
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1638
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1573
1639
|
# Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
|
|
1574
1640
|
# But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
|
|
1575
1641
|
select_idx = None
|
|
@@ -1583,14 +1649,12 @@ class TreeBuilderModesMixin:
|
|
|
1583
1649
|
# i.e., the target is inside the select or there's no select
|
|
1584
1650
|
if target_idx is not None and (select_idx is None or target_idx > select_idx):
|
|
1585
1651
|
while True:
|
|
1586
|
-
popped = self.
|
|
1652
|
+
popped = self._pop_current()
|
|
1587
1653
|
if popped.name == name:
|
|
1588
1654
|
break
|
|
1589
|
-
else:
|
|
1590
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1591
1655
|
return None
|
|
1592
1656
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1593
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1657
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1594
1658
|
# select is always in scope in IN_SELECT mode
|
|
1595
1659
|
self._pop_until_any_inclusive({"select"})
|
|
1596
1660
|
self._reset_insertion_mode()
|
|
@@ -1601,7 +1665,7 @@ class TreeBuilderModesMixin:
|
|
|
1601
1665
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1602
1666
|
return self._mode_in_body(token)
|
|
1603
1667
|
|
|
1604
|
-
def _mode_in_template(self, token:
|
|
1668
|
+
def _mode_in_template(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1605
1669
|
# § The "in template" insertion mode
|
|
1606
1670
|
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
|
|
1607
1671
|
if isinstance(token, CharacterTokens):
|
|
@@ -1680,7 +1744,7 @@ class TreeBuilderModesMixin:
|
|
|
1680
1744
|
return ("reprocess", self.mode, token)
|
|
1681
1745
|
return None
|
|
1682
1746
|
|
|
1683
|
-
def _mode_after_body(self, token:
|
|
1747
|
+
def _mode_after_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1684
1748
|
if isinstance(token, CharacterTokens):
|
|
1685
1749
|
if is_all_whitespace(token.data):
|
|
1686
1750
|
# Whitespace is processed using InBody rules (appended to body)
|
|
@@ -1701,7 +1765,7 @@ class TreeBuilderModesMixin:
|
|
|
1701
1765
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1702
1766
|
return None
|
|
1703
1767
|
|
|
1704
|
-
def _mode_after_after_body(self, token:
|
|
1768
|
+
def _mode_after_after_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1705
1769
|
if isinstance(token, CharacterTokens):
|
|
1706
1770
|
if is_all_whitespace(token.data):
|
|
1707
1771
|
# Per spec: whitespace characters are inserted using the rules for the "in body" mode
|
|
@@ -1728,7 +1792,7 @@ class TreeBuilderModesMixin:
|
|
|
1728
1792
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1729
1793
|
return None
|
|
1730
1794
|
|
|
1731
|
-
def _mode_in_frameset(self, token:
|
|
1795
|
+
def _mode_in_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1732
1796
|
# Per HTML5 spec §13.2.6.4.16: In frameset insertion mode
|
|
1733
1797
|
if isinstance(token, CharacterTokens):
|
|
1734
1798
|
# Only whitespace characters allowed; ignore all others
|
|
@@ -1749,13 +1813,13 @@ class TreeBuilderModesMixin:
|
|
|
1749
1813
|
if self.open_elements and self.open_elements[-1].name == "html":
|
|
1750
1814
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1751
1815
|
return None
|
|
1752
|
-
self.
|
|
1816
|
+
self._pop_current()
|
|
1753
1817
|
if self.open_elements and self.open_elements[-1].name != "frameset":
|
|
1754
1818
|
self.mode = InsertionMode.AFTER_FRAMESET
|
|
1755
1819
|
return None
|
|
1756
1820
|
if token.kind == Tag.START and token.name == "frame":
|
|
1757
1821
|
self._insert_element(token, push=True)
|
|
1758
|
-
self.
|
|
1822
|
+
self._pop_current()
|
|
1759
1823
|
return None
|
|
1760
1824
|
if token.kind == Tag.START and token.name == "noframes":
|
|
1761
1825
|
# Per spec: use IN_HEAD rules but preserve current mode for TEXT restoration
|
|
@@ -1770,11 +1834,14 @@ class TreeBuilderModesMixin:
|
|
|
1770
1834
|
self._parse_error("unexpected-token-in-frameset")
|
|
1771
1835
|
return None
|
|
1772
1836
|
|
|
1773
|
-
def _mode_after_frameset(self, token:
|
|
1837
|
+
def _mode_after_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1774
1838
|
# Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
|
|
1775
1839
|
if isinstance(token, CharacterTokens):
|
|
1776
|
-
# Only whitespace characters allowed;
|
|
1777
|
-
|
|
1840
|
+
# Only whitespace characters allowed; non-whitespace is a parse error.
|
|
1841
|
+
data = token.data or ""
|
|
1842
|
+
whitespace = "".join(ch for ch in data if ch in "\t\n\f\r ")
|
|
1843
|
+
if any(ch not in "\t\n\f\r " for ch in data):
|
|
1844
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1778
1845
|
if whitespace:
|
|
1779
1846
|
self._append_text(whitespace)
|
|
1780
1847
|
return None
|
|
@@ -1787,6 +1854,9 @@ class TreeBuilderModesMixin:
|
|
|
1787
1854
|
if token.kind == Tag.END and token.name == "html":
|
|
1788
1855
|
self.mode = InsertionMode.AFTER_AFTER_FRAMESET
|
|
1789
1856
|
return None
|
|
1857
|
+
if token.kind == Tag.END and token.name == "frameset":
|
|
1858
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1859
|
+
return None
|
|
1790
1860
|
if token.kind == Tag.START and token.name == "noframes":
|
|
1791
1861
|
# Insert noframes element directly and switch to TEXT mode
|
|
1792
1862
|
self._insert_element(token, push=True)
|
|
@@ -1799,7 +1869,7 @@ class TreeBuilderModesMixin:
|
|
|
1799
1869
|
self.mode = InsertionMode.IN_FRAMESET
|
|
1800
1870
|
return ("reprocess", InsertionMode.IN_FRAMESET, token)
|
|
1801
1871
|
|
|
1802
|
-
def _mode_after_after_frameset(self, token:
|
|
1872
|
+
def _mode_after_after_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1803
1873
|
# Per HTML5 spec §13.2.6.4.18: After after frameset insertion mode
|
|
1804
1874
|
if isinstance(token, CharacterTokens):
|
|
1805
1875
|
# Whitespace is processed using InBody rules
|
|
@@ -1830,7 +1900,7 @@ class TreeBuilderModesMixin:
|
|
|
1830
1900
|
|
|
1831
1901
|
# Helpers ----------------------------------------------------------------
|
|
1832
1902
|
|
|
1833
|
-
_MODE_HANDLERS = [
|
|
1903
|
+
_MODE_HANDLERS: list[Callable[[TreeBuilderModesMixin, AnyToken], ModeResultTuple | None]] = [
|
|
1834
1904
|
_mode_initial,
|
|
1835
1905
|
_mode_before_html,
|
|
1836
1906
|
_mode_before_head,
|
|
@@ -1855,14 +1925,14 @@ class TreeBuilderModesMixin:
|
|
|
1855
1925
|
_mode_in_template,
|
|
1856
1926
|
]
|
|
1857
1927
|
|
|
1858
|
-
_BODY_TOKEN_HANDLERS = {
|
|
1928
|
+
_BODY_TOKEN_HANDLERS: dict[type[AnyToken], Callable[[TreeBuilderModesMixin, Any], ModeResultTuple | None]] = {
|
|
1859
1929
|
CharacterTokens: _handle_characters_in_body,
|
|
1860
1930
|
CommentToken: _handle_comment_in_body,
|
|
1861
1931
|
Tag: _handle_tag_in_body,
|
|
1862
1932
|
EOFToken: _handle_eof_in_body,
|
|
1863
1933
|
}
|
|
1864
1934
|
|
|
1865
|
-
_BODY_START_HANDLERS = {
|
|
1935
|
+
_BODY_START_HANDLERS: dict[str, Callable[[TreeBuilderModesMixin, Tag], ModeResultTuple | None]] = {
|
|
1866
1936
|
"a": _handle_body_start_a,
|
|
1867
1937
|
"address": _handle_body_start_block_with_p,
|
|
1868
1938
|
"applet": _handle_body_start_applet_like,
|
|
@@ -1967,7 +2037,7 @@ class TreeBuilderModesMixin:
|
|
|
1967
2037
|
"wbr": _handle_body_start_void_with_formatting,
|
|
1968
2038
|
"xmp": _handle_body_start_plaintext_xmp,
|
|
1969
2039
|
}
|
|
1970
|
-
_BODY_END_HANDLERS = {
|
|
2040
|
+
_BODY_END_HANDLERS: dict[str, Callable[[TreeBuilderModesMixin, Tag], ModeResultTuple | None]] = {
|
|
1971
2041
|
"address": _handle_body_end_block,
|
|
1972
2042
|
"applet": _handle_body_end_applet_like,
|
|
1973
2043
|
"article": _handle_body_end_block,
|