justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/treebuilder_modes.py
CHANGED
|
@@ -1,21 +1,32 @@
|
|
|
1
1
|
# ruff: noqa: S101, RUF012
|
|
2
|
+
# mypy: disable-error-code="attr-defined, has-type, var-annotated, assignment"
|
|
2
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
3
7
|
|
|
4
8
|
from .constants import (
|
|
9
|
+
FORMAT_MARKER,
|
|
5
10
|
FORMATTING_ELEMENTS,
|
|
6
11
|
HEADING_ELEMENTS,
|
|
7
12
|
)
|
|
8
13
|
from .node import SimpleDomNode, TemplateNode
|
|
9
|
-
from .tokens import CharacterTokens, CommentToken, EOFToken, Tag, TokenSinkResult
|
|
14
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, Tag, TokenSinkResult
|
|
10
15
|
from .treebuilder_utils import (
|
|
11
16
|
InsertionMode,
|
|
12
17
|
doctype_error_and_quirks,
|
|
13
18
|
is_all_whitespace,
|
|
14
19
|
)
|
|
15
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from collections.abc import Callable
|
|
23
|
+
|
|
24
|
+
ModeResultTuple = tuple[str, InsertionMode, AnyToken] | tuple[str, InsertionMode, AnyToken, bool]
|
|
25
|
+
"Result is (instruction, mode, token) or (instruction, mode, token, force_html)"
|
|
26
|
+
|
|
16
27
|
|
|
17
28
|
class TreeBuilderModesMixin:
|
|
18
|
-
def _handle_doctype(self, token):
|
|
29
|
+
def _handle_doctype(self, token: DoctypeToken) -> Literal[0]:
|
|
19
30
|
if self.mode != InsertionMode.INITIAL:
|
|
20
31
|
self._parse_error("unexpected-doctype")
|
|
21
32
|
return TokenSinkResult.Continue
|
|
@@ -33,7 +44,7 @@ class TreeBuilderModesMixin:
|
|
|
33
44
|
self.mode = InsertionMode.BEFORE_HTML
|
|
34
45
|
return TokenSinkResult.Continue
|
|
35
46
|
|
|
36
|
-
def _mode_initial(self, token):
|
|
47
|
+
def _mode_initial(self, token: Any) -> ModeResultTuple | None:
|
|
37
48
|
if isinstance(token, CharacterTokens):
|
|
38
49
|
if is_all_whitespace(token.data):
|
|
39
50
|
return None
|
|
@@ -50,13 +61,13 @@ class TreeBuilderModesMixin:
|
|
|
50
61
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
51
62
|
# Only Tags remain - no DOCTYPE seen, so quirks mode
|
|
52
63
|
if token.kind == Tag.START:
|
|
53
|
-
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name
|
|
64
|
+
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name)
|
|
54
65
|
else:
|
|
55
|
-
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name
|
|
66
|
+
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name)
|
|
56
67
|
self._set_quirks_mode("quirks")
|
|
57
68
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
58
69
|
|
|
59
|
-
def _mode_before_html(self, token):
|
|
70
|
+
def _mode_before_html(self, token: AnyToken) -> ModeResultTuple | None:
|
|
60
71
|
if isinstance(token, CharacterTokens) and is_all_whitespace(token.data):
|
|
61
72
|
return None
|
|
62
73
|
if isinstance(token, CommentToken):
|
|
@@ -89,7 +100,7 @@ class TreeBuilderModesMixin:
|
|
|
89
100
|
self.mode = InsertionMode.BEFORE_HEAD
|
|
90
101
|
return ("reprocess", InsertionMode.BEFORE_HEAD, token)
|
|
91
102
|
|
|
92
|
-
def _mode_before_head(self, token):
|
|
103
|
+
def _mode_before_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
93
104
|
if isinstance(token, CharacterTokens):
|
|
94
105
|
data = token.data or ""
|
|
95
106
|
if "\x00" in data:
|
|
@@ -132,7 +143,7 @@ class TreeBuilderModesMixin:
|
|
|
132
143
|
self.mode = InsertionMode.IN_HEAD
|
|
133
144
|
return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
134
145
|
|
|
135
|
-
def _mode_in_head(self, token):
|
|
146
|
+
def _mode_in_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
136
147
|
if isinstance(token, CharacterTokens):
|
|
137
148
|
if is_all_whitespace(token.data):
|
|
138
149
|
self._append_text(token.data)
|
|
@@ -208,7 +219,7 @@ class TreeBuilderModesMixin:
|
|
|
208
219
|
self.mode = InsertionMode.AFTER_HEAD
|
|
209
220
|
return ("reprocess", InsertionMode.AFTER_HEAD, token)
|
|
210
221
|
|
|
211
|
-
def _mode_in_head_noscript(self, token):
|
|
222
|
+
def _mode_in_head_noscript(self, token: AnyToken) -> ModeResultTuple | None:
|
|
212
223
|
"""Handle tokens in 'in head noscript' insertion mode (scripting disabled)."""
|
|
213
224
|
if isinstance(token, CharacterTokens):
|
|
214
225
|
data = token.data or ""
|
|
@@ -257,15 +268,11 @@ class TreeBuilderModesMixin:
|
|
|
257
268
|
# All token types are handled above - CharacterTokens, CommentToken, Tag, EOFToken
|
|
258
269
|
return None # pragma: no cover
|
|
259
270
|
|
|
260
|
-
def _mode_after_head(self, token):
|
|
271
|
+
def _mode_after_head(self, token: AnyToken) -> ModeResultTuple | None:
|
|
261
272
|
if isinstance(token, CharacterTokens):
|
|
262
273
|
data = token.data or ""
|
|
263
274
|
if "\x00" in data:
|
|
264
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
265
275
|
data = data.replace("\x00", "")
|
|
266
|
-
if "\x0c" in data:
|
|
267
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
268
|
-
data = data.replace("\x0c", "")
|
|
269
276
|
if not data or is_all_whitespace(data):
|
|
270
277
|
if data:
|
|
271
278
|
self._append_text(data)
|
|
@@ -327,6 +334,10 @@ class TreeBuilderModesMixin:
|
|
|
327
334
|
self.mode = InsertionMode.IN_HEAD
|
|
328
335
|
return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
329
336
|
if token.kind == Tag.END and token.name == "template":
|
|
337
|
+
has_template = any(node.name == "template" for node in self.open_elements)
|
|
338
|
+
if not has_template:
|
|
339
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
340
|
+
return None
|
|
330
341
|
return self._mode_in_head(token)
|
|
331
342
|
if token.kind == Tag.END and token.name == "body":
|
|
332
343
|
self._insert_body_if_missing()
|
|
@@ -346,7 +357,7 @@ class TreeBuilderModesMixin:
|
|
|
346
357
|
self._insert_body_if_missing()
|
|
347
358
|
return ("reprocess", InsertionMode.IN_BODY, token)
|
|
348
359
|
|
|
349
|
-
def _mode_text(self, token):
|
|
360
|
+
def _mode_text(self, token: AnyToken) -> ModeResultTuple | None:
|
|
350
361
|
if isinstance(token, CharacterTokens):
|
|
351
362
|
self._append_text(token.data)
|
|
352
363
|
return None
|
|
@@ -362,11 +373,11 @@ class TreeBuilderModesMixin:
|
|
|
362
373
|
self.mode = self.original_mode or InsertionMode.IN_BODY
|
|
363
374
|
return None
|
|
364
375
|
|
|
365
|
-
def _mode_in_body(self, token):
|
|
376
|
+
def _mode_in_body(self, token: Any) -> ModeResultTuple | None:
|
|
366
377
|
handler = self._BODY_TOKEN_HANDLERS.get(type(token))
|
|
367
378
|
return handler(self, token) if handler else None
|
|
368
379
|
|
|
369
|
-
def _handle_characters_in_body(self, token):
|
|
380
|
+
def _handle_characters_in_body(self, token: CharacterTokens) -> None:
|
|
370
381
|
data = token.data or ""
|
|
371
382
|
if "\x00" in data:
|
|
372
383
|
self._parse_error("invalid-codepoint")
|
|
@@ -380,11 +391,11 @@ class TreeBuilderModesMixin:
|
|
|
380
391
|
self._append_text(data)
|
|
381
392
|
return
|
|
382
393
|
|
|
383
|
-
def _handle_comment_in_body(self, token):
|
|
394
|
+
def _handle_comment_in_body(self, token: CommentToken) -> None:
|
|
384
395
|
self._append_comment(token.data)
|
|
385
396
|
return
|
|
386
397
|
|
|
387
|
-
def _handle_tag_in_body(self, token):
|
|
398
|
+
def _handle_tag_in_body(self, token: Tag) -> ModeResultTuple | None:
|
|
388
399
|
if token.kind == Tag.START:
|
|
389
400
|
handler = self._BODY_START_HANDLERS.get(token.name)
|
|
390
401
|
if handler:
|
|
@@ -408,7 +419,7 @@ class TreeBuilderModesMixin:
|
|
|
408
419
|
self._any_other_end_tag(token.name)
|
|
409
420
|
return None
|
|
410
421
|
|
|
411
|
-
def _handle_eof_in_body(self, token):
|
|
422
|
+
def _handle_eof_in_body(self, token: EOFToken) -> ModeResultTuple | None:
|
|
412
423
|
# If we're in a template, handle EOF in template mode first
|
|
413
424
|
if self.template_modes:
|
|
414
425
|
return self._mode_in_template(token)
|
|
@@ -443,17 +454,19 @@ class TreeBuilderModesMixin:
|
|
|
443
454
|
# Body mode start tag handlers
|
|
444
455
|
# ---------------------
|
|
445
456
|
|
|
446
|
-
def _handle_body_start_html(self, token):
|
|
457
|
+
def _handle_body_start_html(self, token: Tag) -> None:
|
|
447
458
|
if self.template_modes:
|
|
448
459
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
449
460
|
return
|
|
461
|
+
# Per spec: parse error; merge attributes onto existing <html>.
|
|
462
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
450
463
|
# In IN_BODY mode, html element is always at open_elements[0]
|
|
451
464
|
if self.open_elements: # pragma: no branch
|
|
452
465
|
html = self.open_elements[0]
|
|
453
466
|
self._add_missing_attributes(html, token.attrs)
|
|
454
467
|
return
|
|
455
468
|
|
|
456
|
-
def _handle_body_start_body(self, token):
|
|
469
|
+
def _handle_body_start_body(self, token: Tag) -> None:
|
|
457
470
|
if self.template_modes:
|
|
458
471
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
459
472
|
return
|
|
@@ -467,19 +480,19 @@ class TreeBuilderModesMixin:
|
|
|
467
480
|
self.frameset_ok = False
|
|
468
481
|
return
|
|
469
482
|
|
|
470
|
-
def _handle_body_start_head(self, token):
|
|
483
|
+
def _handle_body_start_head(self, token: Tag) -> None:
|
|
471
484
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
472
485
|
return
|
|
473
486
|
|
|
474
|
-
def _handle_body_start_in_head(self, token):
|
|
487
|
+
def _handle_body_start_in_head(self, token: Tag) -> ModeResultTuple | None:
|
|
475
488
|
return self._mode_in_head(token)
|
|
476
489
|
|
|
477
|
-
def _handle_body_start_block_with_p(self, token):
|
|
490
|
+
def _handle_body_start_block_with_p(self, token: Tag) -> None:
|
|
478
491
|
self._close_p_element()
|
|
479
492
|
self._insert_element(token, push=True)
|
|
480
493
|
return
|
|
481
494
|
|
|
482
|
-
def _handle_body_start_heading(self, token):
|
|
495
|
+
def _handle_body_start_heading(self, token: Tag) -> None:
|
|
483
496
|
self._close_p_element()
|
|
484
497
|
if self.open_elements and self.open_elements[-1].name in HEADING_ELEMENTS:
|
|
485
498
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
@@ -488,14 +501,14 @@ class TreeBuilderModesMixin:
|
|
|
488
501
|
self.frameset_ok = False
|
|
489
502
|
return
|
|
490
503
|
|
|
491
|
-
def _handle_body_start_pre_listing(self, token):
|
|
504
|
+
def _handle_body_start_pre_listing(self, token: Tag) -> None:
|
|
492
505
|
self._close_p_element()
|
|
493
506
|
self._insert_element(token, push=True)
|
|
494
507
|
self.ignore_lf = True
|
|
495
508
|
self.frameset_ok = False
|
|
496
509
|
return
|
|
497
510
|
|
|
498
|
-
def _handle_body_start_form(self, token):
|
|
511
|
+
def _handle_body_start_form(self, token: Tag) -> None:
|
|
499
512
|
if self.form_element is not None:
|
|
500
513
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
501
514
|
return
|
|
@@ -505,7 +518,7 @@ class TreeBuilderModesMixin:
|
|
|
505
518
|
self.frameset_ok = False
|
|
506
519
|
return
|
|
507
520
|
|
|
508
|
-
def _handle_body_start_button(self, token):
|
|
521
|
+
def _handle_body_start_button(self, token: Tag) -> None:
|
|
509
522
|
if self._has_in_scope("button"):
|
|
510
523
|
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
511
524
|
self._close_element_by_name("button")
|
|
@@ -513,19 +526,19 @@ class TreeBuilderModesMixin:
|
|
|
513
526
|
self.frameset_ok = False
|
|
514
527
|
return
|
|
515
528
|
|
|
516
|
-
def _handle_body_start_paragraph(self, token):
|
|
529
|
+
def _handle_body_start_paragraph(self, token: Tag) -> None:
|
|
517
530
|
self._close_p_element()
|
|
518
531
|
self._insert_element(token, push=True)
|
|
519
532
|
return
|
|
520
533
|
|
|
521
|
-
def _handle_body_start_math(self, token):
|
|
534
|
+
def _handle_body_start_math(self, token: Tag) -> None:
|
|
522
535
|
self._reconstruct_active_formatting_elements()
|
|
523
536
|
attrs = self._prepare_foreign_attributes("math", token.attrs)
|
|
524
537
|
new_tag = Tag(Tag.START, token.name, attrs, token.self_closing)
|
|
525
538
|
self._insert_element(new_tag, push=not token.self_closing, namespace="math")
|
|
526
539
|
return
|
|
527
540
|
|
|
528
|
-
def _handle_body_start_svg(self, token):
|
|
541
|
+
def _handle_body_start_svg(self, token: Tag) -> None:
|
|
529
542
|
self._reconstruct_active_formatting_elements()
|
|
530
543
|
adjusted_name = self._adjust_svg_tag_name(token.name)
|
|
531
544
|
attrs = self._prepare_foreign_attributes("svg", token.attrs)
|
|
@@ -533,7 +546,7 @@ class TreeBuilderModesMixin:
|
|
|
533
546
|
self._insert_element(new_tag, push=not token.self_closing, namespace="svg")
|
|
534
547
|
return
|
|
535
548
|
|
|
536
|
-
def _handle_body_start_li(self, token):
|
|
549
|
+
def _handle_body_start_li(self, token: Tag) -> None:
|
|
537
550
|
self.frameset_ok = False
|
|
538
551
|
self._close_p_element()
|
|
539
552
|
if self._has_in_list_item_scope("li"):
|
|
@@ -541,7 +554,7 @@ class TreeBuilderModesMixin:
|
|
|
541
554
|
self._insert_element(token, push=True)
|
|
542
555
|
return
|
|
543
556
|
|
|
544
|
-
def _handle_body_start_dd_dt(self, token):
|
|
557
|
+
def _handle_body_start_dd_dt(self, token: Tag) -> None:
|
|
545
558
|
self.frameset_ok = False
|
|
546
559
|
self._close_p_element()
|
|
547
560
|
name = token.name
|
|
@@ -558,7 +571,7 @@ class TreeBuilderModesMixin:
|
|
|
558
571
|
self._insert_element(token, push=True)
|
|
559
572
|
return
|
|
560
573
|
|
|
561
|
-
def _adoption_agency(self, subject):
|
|
574
|
+
def _adoption_agency(self, subject: Any) -> None:
|
|
562
575
|
# 1. If the current node is the subject, and it is not in the active formatting elements list...
|
|
563
576
|
if self.open_elements and self.open_elements[-1].name == subject:
|
|
564
577
|
if not self._has_active_formatting_entry(subject):
|
|
@@ -570,6 +583,10 @@ class TreeBuilderModesMixin:
|
|
|
570
583
|
# 3. Find formatting element
|
|
571
584
|
formatting_element_index = self._find_active_formatting_index(subject)
|
|
572
585
|
if formatting_element_index is None:
|
|
586
|
+
# html5lib reports a parse error when an end tag for a formatting
|
|
587
|
+
# element triggers the adoption agency algorithm but no matching
|
|
588
|
+
# active formatting entry exists.
|
|
589
|
+
self._parse_error("adoption-agency-1.3")
|
|
573
590
|
return
|
|
574
591
|
|
|
575
592
|
formatting_element_entry = self.active_formatting[formatting_element_index]
|
|
@@ -647,6 +664,10 @@ class TreeBuilderModesMixin:
|
|
|
647
664
|
# 10.4 Replace entry with new element
|
|
648
665
|
entry = self.active_formatting[node_formatting_index]
|
|
649
666
|
new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
667
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
668
|
+
new_element._origin_pos = entry["node"].origin_offset
|
|
669
|
+
new_element._origin_line = entry["node"].origin_line
|
|
670
|
+
new_element._origin_col = entry["node"].origin_col
|
|
650
671
|
entry["node"] = new_element
|
|
651
672
|
self.open_elements[self.open_elements.index(node)] = new_element
|
|
652
673
|
node = new_element
|
|
@@ -680,6 +701,10 @@ class TreeBuilderModesMixin:
|
|
|
680
701
|
# 12. Create new formatting element
|
|
681
702
|
entry = self.active_formatting[formatting_element_index]
|
|
682
703
|
new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
704
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
705
|
+
new_formatting_element._origin_pos = entry["node"].origin_offset
|
|
706
|
+
new_formatting_element._origin_line = entry["node"].origin_line
|
|
707
|
+
new_formatting_element._origin_col = entry["node"].origin_col
|
|
683
708
|
entry["node"] = new_formatting_element
|
|
684
709
|
|
|
685
710
|
# 13. Move children of furthest block
|
|
@@ -702,8 +727,9 @@ class TreeBuilderModesMixin:
|
|
|
702
727
|
furthest_block_index = self.open_elements.index(furthest_block)
|
|
703
728
|
self.open_elements.insert(furthest_block_index + 1, new_formatting_element)
|
|
704
729
|
|
|
705
|
-
def _handle_body_start_a(self, token):
|
|
730
|
+
def _handle_body_start_a(self, token: Tag) -> None:
|
|
706
731
|
if self._has_active_formatting_entry("a"):
|
|
732
|
+
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
707
733
|
self._adoption_agency("a")
|
|
708
734
|
self._remove_last_active_formatting_by_name("a")
|
|
709
735
|
self._remove_last_open_element_by_name("a")
|
|
@@ -712,7 +738,7 @@ class TreeBuilderModesMixin:
|
|
|
712
738
|
self._append_active_formatting_entry("a", token.attrs, node)
|
|
713
739
|
return
|
|
714
740
|
|
|
715
|
-
def _handle_body_start_formatting(self, token):
|
|
741
|
+
def _handle_body_start_formatting(self, token: Tag) -> None:
|
|
716
742
|
name = token.name
|
|
717
743
|
if name == "nobr" and self._in_scope("nobr"):
|
|
718
744
|
self._adoption_agency("nobr")
|
|
@@ -726,21 +752,21 @@ class TreeBuilderModesMixin:
|
|
|
726
752
|
self._append_active_formatting_entry(name, token.attrs, node)
|
|
727
753
|
return
|
|
728
754
|
|
|
729
|
-
def _handle_body_start_applet_like(self, token):
|
|
755
|
+
def _handle_body_start_applet_like(self, token: Tag) -> None:
|
|
730
756
|
self._reconstruct_active_formatting_elements()
|
|
731
757
|
self._insert_element(token, push=True)
|
|
732
758
|
self._push_formatting_marker()
|
|
733
759
|
self.frameset_ok = False
|
|
734
760
|
return
|
|
735
761
|
|
|
736
|
-
def _handle_body_start_br(self, token):
|
|
762
|
+
def _handle_body_start_br(self, token: Tag) -> None:
|
|
737
763
|
self._close_p_element()
|
|
738
764
|
self._reconstruct_active_formatting_elements()
|
|
739
765
|
self._insert_element(token, push=False)
|
|
740
766
|
self.frameset_ok = False
|
|
741
767
|
return
|
|
742
768
|
|
|
743
|
-
def _handle_body_start_frameset(self, token):
|
|
769
|
+
def _handle_body_start_frameset(self, token: Tag) -> None:
|
|
744
770
|
if not self.frameset_ok:
|
|
745
771
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
746
772
|
return
|
|
@@ -765,17 +791,17 @@ class TreeBuilderModesMixin:
|
|
|
765
791
|
# Body mode end tag handlers
|
|
766
792
|
# ---------------------
|
|
767
793
|
|
|
768
|
-
def _handle_body_end_body(self, token):
|
|
794
|
+
def _handle_body_end_body(self, token: Tag) -> None:
|
|
769
795
|
if self._in_scope("body"):
|
|
770
796
|
self.mode = InsertionMode.AFTER_BODY
|
|
771
797
|
return
|
|
772
798
|
|
|
773
|
-
def _handle_body_end_html(self, token):
|
|
799
|
+
def _handle_body_end_html(self, token: Tag) -> ModeResultTuple | None:
|
|
774
800
|
if self._in_scope("body"):
|
|
775
801
|
return ("reprocess", InsertionMode.AFTER_BODY, token)
|
|
776
802
|
return None
|
|
777
803
|
|
|
778
|
-
def _handle_body_end_p(self, token):
|
|
804
|
+
def _handle_body_end_p(self, token: Tag) -> None:
|
|
779
805
|
if not self._close_p_element():
|
|
780
806
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
781
807
|
phantom = Tag(Tag.START, "p", {}, False)
|
|
@@ -783,21 +809,21 @@ class TreeBuilderModesMixin:
|
|
|
783
809
|
self._close_p_element()
|
|
784
810
|
return
|
|
785
811
|
|
|
786
|
-
def _handle_body_end_li(self, token):
|
|
812
|
+
def _handle_body_end_li(self, token: Tag) -> None:
|
|
787
813
|
if not self._has_in_list_item_scope("li"):
|
|
788
814
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
789
815
|
return
|
|
790
816
|
self._pop_until_any_inclusive({"li"})
|
|
791
817
|
return
|
|
792
818
|
|
|
793
|
-
def _handle_body_end_dd_dt(self, token):
|
|
819
|
+
def _handle_body_end_dd_dt(self, token: Tag) -> None:
|
|
794
820
|
name = token.name
|
|
795
821
|
if not self._has_in_definition_scope(name):
|
|
796
822
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
797
823
|
return
|
|
798
824
|
self._pop_until_any_inclusive({"dd", "dt"})
|
|
799
825
|
|
|
800
|
-
def _handle_body_end_form(self, token):
|
|
826
|
+
def _handle_body_end_form(self, token: Tag) -> None:
|
|
801
827
|
if self.form_element is None:
|
|
802
828
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
803
829
|
return
|
|
@@ -807,7 +833,7 @@ class TreeBuilderModesMixin:
|
|
|
807
833
|
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
808
834
|
return
|
|
809
835
|
|
|
810
|
-
def _handle_body_end_applet_like(self, token):
|
|
836
|
+
def _handle_body_end_applet_like(self, token: Tag) -> None:
|
|
811
837
|
name = token.name
|
|
812
838
|
if not self._in_scope(name):
|
|
813
839
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
@@ -820,7 +846,7 @@ class TreeBuilderModesMixin:
|
|
|
820
846
|
self._clear_active_formatting_up_to_marker()
|
|
821
847
|
return
|
|
822
848
|
|
|
823
|
-
def _handle_body_end_heading(self, token):
|
|
849
|
+
def _handle_body_end_heading(self, token: Tag) -> None:
|
|
824
850
|
name = token.name
|
|
825
851
|
if not self._has_any_in_scope(HEADING_ELEMENTS):
|
|
826
852
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
@@ -835,7 +861,7 @@ class TreeBuilderModesMixin:
|
|
|
835
861
|
break
|
|
836
862
|
return
|
|
837
863
|
|
|
838
|
-
def _handle_body_end_block(self, token):
|
|
864
|
+
def _handle_body_end_block(self, token: Tag) -> None:
|
|
839
865
|
name = token.name
|
|
840
866
|
if not self._in_scope(name):
|
|
841
867
|
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
@@ -846,9 +872,10 @@ class TreeBuilderModesMixin:
|
|
|
846
872
|
self._pop_until_any_inclusive({name})
|
|
847
873
|
return
|
|
848
874
|
|
|
849
|
-
def _handle_body_end_template(self, token):
|
|
875
|
+
def _handle_body_end_template(self, token: Tag) -> None:
|
|
850
876
|
has_template = any(node.name == "template" for node in self.open_elements)
|
|
851
877
|
if not has_template:
|
|
878
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
852
879
|
return
|
|
853
880
|
self._generate_implied_end_tags()
|
|
854
881
|
self._pop_until_inclusive("template")
|
|
@@ -859,18 +886,18 @@ class TreeBuilderModesMixin:
|
|
|
859
886
|
self._reset_insertion_mode()
|
|
860
887
|
return
|
|
861
888
|
|
|
862
|
-
def _handle_body_start_structure_ignored(self, token):
|
|
889
|
+
def _handle_body_start_structure_ignored(self, token: Tag) -> None:
|
|
863
890
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
864
891
|
return
|
|
865
892
|
|
|
866
|
-
def _handle_body_start_col_or_frame(self, token):
|
|
893
|
+
def _handle_body_start_col_or_frame(self, token: Tag) -> None:
|
|
867
894
|
if self.fragment_context is None:
|
|
868
895
|
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
869
896
|
return
|
|
870
897
|
self._insert_element(token, push=False)
|
|
871
898
|
return
|
|
872
899
|
|
|
873
|
-
def _handle_body_start_image(self, token):
|
|
900
|
+
def _handle_body_start_image(self, token: Tag) -> None:
|
|
874
901
|
self._parse_error("image-start-tag", tag_name=token.name)
|
|
875
902
|
img_token = Tag(Tag.START, "img", token.attrs, token.self_closing)
|
|
876
903
|
self._reconstruct_active_formatting_elements()
|
|
@@ -878,17 +905,17 @@ class TreeBuilderModesMixin:
|
|
|
878
905
|
self.frameset_ok = False
|
|
879
906
|
return
|
|
880
907
|
|
|
881
|
-
def _handle_body_start_void_with_formatting(self, token):
|
|
908
|
+
def _handle_body_start_void_with_formatting(self, token: Tag) -> None:
|
|
882
909
|
self._reconstruct_active_formatting_elements()
|
|
883
910
|
self._insert_element(token, push=False)
|
|
884
911
|
self.frameset_ok = False
|
|
885
912
|
return
|
|
886
913
|
|
|
887
|
-
def _handle_body_start_simple_void(self, token):
|
|
914
|
+
def _handle_body_start_simple_void(self, token: Tag) -> None:
|
|
888
915
|
self._insert_element(token, push=False)
|
|
889
916
|
return
|
|
890
917
|
|
|
891
|
-
def _handle_body_start_input(self, token):
|
|
918
|
+
def _handle_body_start_input(self, token: Tag) -> None:
|
|
892
919
|
input_type = None
|
|
893
920
|
for name, value in token.attrs.items():
|
|
894
921
|
if name == "type":
|
|
@@ -899,7 +926,7 @@ class TreeBuilderModesMixin:
|
|
|
899
926
|
self.frameset_ok = False
|
|
900
927
|
return
|
|
901
928
|
|
|
902
|
-
def _handle_body_start_table(self, token):
|
|
929
|
+
def _handle_body_start_table(self, token: Tag) -> None:
|
|
903
930
|
if self.quirks_mode != "quirks":
|
|
904
931
|
self._close_p_element()
|
|
905
932
|
self._insert_element(token, push=True)
|
|
@@ -907,7 +934,7 @@ class TreeBuilderModesMixin:
|
|
|
907
934
|
self.mode = InsertionMode.IN_TABLE
|
|
908
935
|
return
|
|
909
936
|
|
|
910
|
-
def _handle_body_start_plaintext_xmp(self, token):
|
|
937
|
+
def _handle_body_start_plaintext_xmp(self, token: Tag) -> None:
|
|
911
938
|
self._close_p_element()
|
|
912
939
|
self._insert_element(token, push=True)
|
|
913
940
|
self.frameset_ok = False
|
|
@@ -919,66 +946,88 @@ class TreeBuilderModesMixin:
|
|
|
919
946
|
self.mode = InsertionMode.TEXT
|
|
920
947
|
return
|
|
921
948
|
|
|
922
|
-
def _handle_body_start_textarea(self, token):
|
|
949
|
+
def _handle_body_start_textarea(self, token: Tag) -> None:
|
|
923
950
|
self._insert_element(token, push=True)
|
|
924
951
|
self.ignore_lf = True
|
|
925
952
|
self.frameset_ok = False
|
|
926
953
|
return
|
|
927
954
|
|
|
928
|
-
def _handle_body_start_select(self, token):
|
|
955
|
+
def _handle_body_start_select(self, token: Tag) -> None:
|
|
929
956
|
self._reconstruct_active_formatting_elements()
|
|
930
957
|
self._insert_element(token, push=True)
|
|
931
958
|
self.frameset_ok = False
|
|
932
959
|
self._reset_insertion_mode()
|
|
933
960
|
return
|
|
934
961
|
|
|
935
|
-
def _handle_body_start_option(self, token):
|
|
962
|
+
def _handle_body_start_option(self, token: Tag) -> None:
|
|
936
963
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
937
964
|
self.open_elements.pop()
|
|
938
965
|
self._reconstruct_active_formatting_elements()
|
|
939
966
|
self._insert_element(token, push=True)
|
|
940
967
|
return
|
|
941
968
|
|
|
942
|
-
def _handle_body_start_optgroup(self, token):
|
|
969
|
+
def _handle_body_start_optgroup(self, token: Tag) -> None:
|
|
943
970
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
944
971
|
self.open_elements.pop()
|
|
945
972
|
self._reconstruct_active_formatting_elements()
|
|
946
973
|
self._insert_element(token, push=True)
|
|
947
974
|
return
|
|
948
975
|
|
|
949
|
-
def _handle_body_start_rp_rt(self, token):
|
|
976
|
+
def _handle_body_start_rp_rt(self, token: Tag) -> None:
|
|
950
977
|
self._generate_implied_end_tags(exclude="rtc")
|
|
951
978
|
self._insert_element(token, push=True)
|
|
952
979
|
return
|
|
953
980
|
|
|
954
|
-
def _handle_body_start_rb_rtc(self, token):
|
|
981
|
+
def _handle_body_start_rb_rtc(self, token: Tag) -> None:
|
|
955
982
|
if self.open_elements and self.open_elements[-1].name in {"rb", "rp", "rt", "rtc"}:
|
|
956
983
|
self._generate_implied_end_tags()
|
|
957
984
|
self._insert_element(token, push=True)
|
|
958
985
|
return
|
|
959
986
|
|
|
960
|
-
def _handle_body_start_table_parse_error(self, token):
|
|
987
|
+
def _handle_body_start_table_parse_error(self, token: Tag) -> None:
|
|
961
988
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
962
989
|
return
|
|
963
990
|
|
|
964
|
-
def _handle_body_start_default(self, token):
|
|
991
|
+
def _handle_body_start_default(self, token: Tag) -> ModeResultTuple | None:
|
|
965
992
|
self._reconstruct_active_formatting_elements()
|
|
966
993
|
self._insert_element(token, push=True)
|
|
967
994
|
if token.self_closing:
|
|
968
995
|
self._parse_error("non-void-html-element-start-tag-with-trailing-solidus", tag_name=token.name)
|
|
969
996
|
# Elements reaching here have no handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
970
997
|
self.frameset_ok = False
|
|
971
|
-
return
|
|
998
|
+
return None
|
|
972
999
|
|
|
973
|
-
def _mode_in_table(self, token):
|
|
1000
|
+
def _mode_in_table(self, token: AnyToken) -> ModeResultTuple | None:
|
|
974
1001
|
if isinstance(token, CharacterTokens):
|
|
975
1002
|
data = token.data or ""
|
|
976
1003
|
if "\x00" in data:
|
|
977
|
-
self._parse_error("unexpected-null-character")
|
|
978
1004
|
data = data.replace("\x00", "")
|
|
979
1005
|
if not data:
|
|
980
1006
|
return None
|
|
981
1007
|
token = CharacterTokens(data)
|
|
1008
|
+
|
|
1009
|
+
if is_all_whitespace(data):
|
|
1010
|
+
self._append_text(data)
|
|
1011
|
+
return None
|
|
1012
|
+
|
|
1013
|
+
# html5lib-tests expect that some table foster-parenting text triggered by a
|
|
1014
|
+
# misnested formatting element (<a>) only produces an implied-end-tag error
|
|
1015
|
+
# when the table closes, not an additional character-in-table error.
|
|
1016
|
+
suppress_table_char_error = False
|
|
1017
|
+
if self.active_formatting:
|
|
1018
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1019
|
+
entry = self.active_formatting[idx]
|
|
1020
|
+
if entry is FORMAT_MARKER:
|
|
1021
|
+
break
|
|
1022
|
+
if entry["name"] == "a":
|
|
1023
|
+
if entry["node"] not in self.open_elements:
|
|
1024
|
+
suppress_table_char_error = True
|
|
1025
|
+
break
|
|
1026
|
+
|
|
1027
|
+
if not suppress_table_char_error:
|
|
1028
|
+
self.pending_table_text_should_error = True
|
|
1029
|
+
else:
|
|
1030
|
+
self.pending_table_text_should_error = False
|
|
982
1031
|
self.pending_table_text = []
|
|
983
1032
|
self.table_text_original_mode = self.mode
|
|
984
1033
|
self.mode = InsertionMode.IN_TABLE_TEXT
|
|
@@ -1051,7 +1100,7 @@ class TreeBuilderModesMixin:
|
|
|
1051
1100
|
self.form_element = node
|
|
1052
1101
|
self.open_elements.pop() # push=True always adds to stack
|
|
1053
1102
|
return None
|
|
1054
|
-
self._parse_error("
|
|
1103
|
+
self._parse_error("foster-parenting-start-tag", tag_name=name)
|
|
1055
1104
|
previous = self.insert_from_table
|
|
1056
1105
|
self.insert_from_table = True
|
|
1057
1106
|
try:
|
|
@@ -1078,26 +1127,40 @@ class TreeBuilderModesMixin:
|
|
|
1078
1127
|
if self.template_modes:
|
|
1079
1128
|
return self._mode_in_template(token)
|
|
1080
1129
|
if self._has_in_table_scope("table"):
|
|
1081
|
-
self._parse_error("
|
|
1130
|
+
self._parse_error("eof-in-table")
|
|
1082
1131
|
return None
|
|
1083
1132
|
|
|
1084
|
-
def _mode_in_table_text(self, token):
|
|
1133
|
+
def _mode_in_table_text(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1085
1134
|
if isinstance(token, CharacterTokens):
|
|
1086
1135
|
# IN_TABLE mode guarantees non-empty data
|
|
1087
1136
|
data = token.data
|
|
1088
|
-
|
|
1089
|
-
self._parse_error("invalid-codepoint-in-table-text")
|
|
1090
|
-
data = data.replace("\x0c", "")
|
|
1091
|
-
if data:
|
|
1092
|
-
self.pending_table_text.append(data)
|
|
1137
|
+
self.pending_table_text.append(data)
|
|
1093
1138
|
return None
|
|
1139
|
+
|
|
1140
|
+
if (
|
|
1141
|
+
self.pending_table_text
|
|
1142
|
+
and isinstance(token, Tag)
|
|
1143
|
+
and token.kind == Tag.END
|
|
1144
|
+
and token.name == "table"
|
|
1145
|
+
and not is_all_whitespace("".join(self.pending_table_text))
|
|
1146
|
+
):
|
|
1147
|
+
# If a misnested <a> exists only in the active formatting list, html5lib
|
|
1148
|
+
# reports the implied close when the table ends.
|
|
1149
|
+
if self.active_formatting:
|
|
1150
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1151
|
+
entry = self.active_formatting[idx]
|
|
1152
|
+
if entry is FORMAT_MARKER:
|
|
1153
|
+
break
|
|
1154
|
+
if entry["name"] == "a" and entry["node"] not in self.open_elements:
|
|
1155
|
+
self._parse_error("unexpected-implied-end-tag-in-table-view")
|
|
1156
|
+
break
|
|
1094
1157
|
self._flush_pending_table_text()
|
|
1095
1158
|
original = self.table_text_original_mode or InsertionMode.IN_TABLE
|
|
1096
1159
|
self.table_text_original_mode = None
|
|
1097
1160
|
self.mode = original
|
|
1098
1161
|
return ("reprocess", original, token)
|
|
1099
1162
|
|
|
1100
|
-
def _mode_in_caption(self, token):
|
|
1163
|
+
def _mode_in_caption(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1101
1164
|
if isinstance(token, CharacterTokens):
|
|
1102
1165
|
return self._mode_in_body(token)
|
|
1103
1166
|
if isinstance(token, CommentToken):
|
|
@@ -1136,7 +1199,7 @@ class TreeBuilderModesMixin:
|
|
|
1136
1199
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1137
1200
|
return self._mode_in_body(token)
|
|
1138
1201
|
|
|
1139
|
-
def _close_caption_element(self):
|
|
1202
|
+
def _close_caption_element(self) -> bool:
|
|
1140
1203
|
if not self._has_in_table_scope("caption"):
|
|
1141
1204
|
self._parse_error("unexpected-end-tag", tag_name="caption")
|
|
1142
1205
|
return False
|
|
@@ -1150,7 +1213,7 @@ class TreeBuilderModesMixin:
|
|
|
1150
1213
|
self.mode = InsertionMode.IN_TABLE
|
|
1151
1214
|
return True
|
|
1152
1215
|
|
|
1153
|
-
def _mode_in_column_group(self, token):
|
|
1216
|
+
def _mode_in_column_group(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1154
1217
|
current = self.open_elements[-1] if self.open_elements else None
|
|
1155
1218
|
if isinstance(token, CharacterTokens):
|
|
1156
1219
|
data = token.data or ""
|
|
@@ -1245,7 +1308,7 @@ class TreeBuilderModesMixin:
|
|
|
1245
1308
|
return None
|
|
1246
1309
|
# Per spec: EOF when current is html - implicit None return
|
|
1247
1310
|
|
|
1248
|
-
def _mode_in_table_body(self, token):
|
|
1311
|
+
def _mode_in_table_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1249
1312
|
if isinstance(token, CharacterTokens) or isinstance(token, CommentToken):
|
|
1250
1313
|
return self._mode_in_table(token)
|
|
1251
1314
|
if isinstance(token, Tag):
|
|
@@ -1321,7 +1384,7 @@ class TreeBuilderModesMixin:
|
|
|
1321
1384
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1322
1385
|
return self._mode_in_table(token)
|
|
1323
1386
|
|
|
1324
|
-
def _mode_in_row(self, token):
|
|
1387
|
+
def _mode_in_row(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1325
1388
|
if isinstance(token, CharacterTokens) or isinstance(token, CommentToken):
|
|
1326
1389
|
return self._mode_in_table(token)
|
|
1327
1390
|
if isinstance(token, Tag):
|
|
@@ -1370,7 +1433,7 @@ class TreeBuilderModesMixin:
|
|
|
1370
1433
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1371
1434
|
return self._mode_in_table(token)
|
|
1372
1435
|
|
|
1373
|
-
def _end_tr_element(self):
|
|
1436
|
+
def _end_tr_element(self) -> None:
|
|
1374
1437
|
self._clear_stack_until({"tr", "template", "html"})
|
|
1375
1438
|
# Pop tr if on top (may not be if stack was exhausted)
|
|
1376
1439
|
if self.open_elements and self.open_elements[-1].name == "tr":
|
|
@@ -1381,7 +1444,7 @@ class TreeBuilderModesMixin:
|
|
|
1381
1444
|
else:
|
|
1382
1445
|
self.mode = InsertionMode.IN_TABLE_BODY
|
|
1383
1446
|
|
|
1384
|
-
def _mode_in_cell(self, token):
|
|
1447
|
+
def _mode_in_cell(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1385
1448
|
if isinstance(token, CharacterTokens):
|
|
1386
1449
|
previous = self.insert_from_table
|
|
1387
1450
|
self.insert_from_table = False
|
|
@@ -1435,15 +1498,11 @@ class TreeBuilderModesMixin:
|
|
|
1435
1498
|
return ("reprocess", self.mode, token)
|
|
1436
1499
|
return self._mode_in_table(token)
|
|
1437
1500
|
|
|
1438
|
-
def _mode_in_select(self, token):
|
|
1501
|
+
def _mode_in_select(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1439
1502
|
if isinstance(token, CharacterTokens):
|
|
1440
1503
|
data = token.data or ""
|
|
1441
1504
|
if "\x00" in data:
|
|
1442
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1443
1505
|
data = data.replace("\x00", "")
|
|
1444
|
-
if "\x0c" in data:
|
|
1445
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1446
|
-
data = data.replace("\x0c", "")
|
|
1447
1506
|
if data:
|
|
1448
1507
|
self._reconstruct_active_formatting_elements()
|
|
1449
1508
|
self._append_text(data)
|
|
@@ -1471,13 +1530,13 @@ class TreeBuilderModesMixin:
|
|
|
1471
1530
|
self._insert_element(token, push=True)
|
|
1472
1531
|
return None
|
|
1473
1532
|
if name == "select":
|
|
1474
|
-
self._parse_error("unexpected-
|
|
1533
|
+
self._parse_error("unexpected-select-in-select")
|
|
1475
1534
|
# select is always in scope in IN_SELECT mode
|
|
1476
1535
|
self._pop_until_any_inclusive({"select"})
|
|
1477
1536
|
self._reset_insertion_mode()
|
|
1478
1537
|
return None
|
|
1479
1538
|
if name in {"input", "textarea"}:
|
|
1480
|
-
self._parse_error("unexpected-start-tag-
|
|
1539
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1481
1540
|
# select is always in scope in IN_SELECT mode
|
|
1482
1541
|
self._pop_until_any_inclusive({"select"})
|
|
1483
1542
|
self._reset_insertion_mode()
|
|
@@ -1487,7 +1546,7 @@ class TreeBuilderModesMixin:
|
|
|
1487
1546
|
self._insert_element(token, push=False)
|
|
1488
1547
|
return None
|
|
1489
1548
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1490
|
-
self._parse_error("unexpected-start-tag-
|
|
1549
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1491
1550
|
# select is always in scope in IN_SELECT mode
|
|
1492
1551
|
self._pop_until_any_inclusive({"select"})
|
|
1493
1552
|
self._reset_insertion_mode()
|
|
@@ -1505,6 +1564,7 @@ class TreeBuilderModesMixin:
|
|
|
1505
1564
|
self._append_active_formatting_entry(name, token.attrs, node)
|
|
1506
1565
|
return None
|
|
1507
1566
|
if name == "hr":
|
|
1567
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1508
1568
|
# Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
|
|
1509
1569
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1510
1570
|
self.open_elements.pop()
|
|
@@ -1514,22 +1574,29 @@ class TreeBuilderModesMixin:
|
|
|
1514
1574
|
self._insert_element(token, push=False)
|
|
1515
1575
|
return None
|
|
1516
1576
|
if name == "menuitem":
|
|
1577
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1517
1578
|
self._reconstruct_active_formatting_elements()
|
|
1518
1579
|
self._insert_element(token, push=True)
|
|
1519
1580
|
return None
|
|
1520
1581
|
# Allow common HTML elements in select (newer spec)
|
|
1521
1582
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1583
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1522
1584
|
self._reconstruct_active_formatting_elements()
|
|
1523
1585
|
self._insert_element(token, push=not token.self_closing)
|
|
1524
1586
|
return None
|
|
1525
1587
|
if name in {"br", "img"}:
|
|
1588
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1526
1589
|
self._reconstruct_active_formatting_elements()
|
|
1527
1590
|
self._insert_element(token, push=False)
|
|
1528
1591
|
return None
|
|
1529
1592
|
if name == "plaintext":
|
|
1530
1593
|
# Per spec: plaintext element is inserted in select (consumes all remaining text)
|
|
1594
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1531
1595
|
self._reconstruct_active_formatting_elements()
|
|
1532
1596
|
self._insert_element(token, push=True)
|
|
1597
|
+
return None
|
|
1598
|
+
# Any other start tag: parse error, ignore.
|
|
1599
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1533
1600
|
return None
|
|
1534
1601
|
if name == "optgroup":
|
|
1535
1602
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
@@ -1537,13 +1604,13 @@ class TreeBuilderModesMixin:
|
|
|
1537
1604
|
if self.open_elements and self.open_elements[-1].name == "optgroup":
|
|
1538
1605
|
self.open_elements.pop()
|
|
1539
1606
|
else:
|
|
1540
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1607
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1541
1608
|
return None
|
|
1542
1609
|
if name == "option":
|
|
1543
1610
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1544
1611
|
self.open_elements.pop()
|
|
1545
1612
|
else:
|
|
1546
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1613
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1547
1614
|
return None
|
|
1548
1615
|
if name == "select":
|
|
1549
1616
|
# In IN_SELECT mode, select is always in scope - pop to it
|
|
@@ -1555,17 +1622,20 @@ class TreeBuilderModesMixin:
|
|
|
1555
1622
|
# select is always on stack in IN_SELECT mode
|
|
1556
1623
|
select_node = self._find_last_on_stack("select")
|
|
1557
1624
|
fmt_index = self._find_active_formatting_index(name)
|
|
1558
|
-
if fmt_index is
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1625
|
+
if fmt_index is None:
|
|
1626
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1627
|
+
return None
|
|
1628
|
+
target = self.active_formatting[fmt_index]["node"]
|
|
1629
|
+
if target in self.open_elements: # pragma: no branch
|
|
1630
|
+
select_index = self.open_elements.index(select_node)
|
|
1631
|
+
target_index = self.open_elements.index(target)
|
|
1632
|
+
if target_index < select_index:
|
|
1633
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1634
|
+
return None
|
|
1566
1635
|
self._adoption_agency(name)
|
|
1567
1636
|
return None
|
|
1568
1637
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1638
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1569
1639
|
# Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
|
|
1570
1640
|
# But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
|
|
1571
1641
|
select_idx = None
|
|
@@ -1582,11 +1652,9 @@ class TreeBuilderModesMixin:
|
|
|
1582
1652
|
popped = self.open_elements.pop()
|
|
1583
1653
|
if popped.name == name:
|
|
1584
1654
|
break
|
|
1585
|
-
else:
|
|
1586
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1587
1655
|
return None
|
|
1588
1656
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1589
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1657
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1590
1658
|
# select is always in scope in IN_SELECT mode
|
|
1591
1659
|
self._pop_until_any_inclusive({"select"})
|
|
1592
1660
|
self._reset_insertion_mode()
|
|
@@ -1597,7 +1665,7 @@ class TreeBuilderModesMixin:
|
|
|
1597
1665
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1598
1666
|
return self._mode_in_body(token)
|
|
1599
1667
|
|
|
1600
|
-
def _mode_in_template(self, token):
|
|
1668
|
+
def _mode_in_template(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1601
1669
|
# § The "in template" insertion mode
|
|
1602
1670
|
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
|
|
1603
1671
|
if isinstance(token, CharacterTokens):
|
|
@@ -1676,7 +1744,7 @@ class TreeBuilderModesMixin:
|
|
|
1676
1744
|
return ("reprocess", self.mode, token)
|
|
1677
1745
|
return None
|
|
1678
1746
|
|
|
1679
|
-
def _mode_after_body(self, token):
|
|
1747
|
+
def _mode_after_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1680
1748
|
if isinstance(token, CharacterTokens):
|
|
1681
1749
|
if is_all_whitespace(token.data):
|
|
1682
1750
|
# Whitespace is processed using InBody rules (appended to body)
|
|
@@ -1697,7 +1765,7 @@ class TreeBuilderModesMixin:
|
|
|
1697
1765
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1698
1766
|
return None
|
|
1699
1767
|
|
|
1700
|
-
def _mode_after_after_body(self, token):
|
|
1768
|
+
def _mode_after_after_body(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1701
1769
|
if isinstance(token, CharacterTokens):
|
|
1702
1770
|
if is_all_whitespace(token.data):
|
|
1703
1771
|
# Per spec: whitespace characters are inserted using the rules for the "in body" mode
|
|
@@ -1724,7 +1792,7 @@ class TreeBuilderModesMixin:
|
|
|
1724
1792
|
assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
|
|
1725
1793
|
return None
|
|
1726
1794
|
|
|
1727
|
-
def _mode_in_frameset(self, token):
|
|
1795
|
+
def _mode_in_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1728
1796
|
# Per HTML5 spec §13.2.6.4.16: In frameset insertion mode
|
|
1729
1797
|
if isinstance(token, CharacterTokens):
|
|
1730
1798
|
# Only whitespace characters allowed; ignore all others
|
|
@@ -1766,11 +1834,14 @@ class TreeBuilderModesMixin:
|
|
|
1766
1834
|
self._parse_error("unexpected-token-in-frameset")
|
|
1767
1835
|
return None
|
|
1768
1836
|
|
|
1769
|
-
def _mode_after_frameset(self, token):
|
|
1837
|
+
def _mode_after_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1770
1838
|
# Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
|
|
1771
1839
|
if isinstance(token, CharacterTokens):
|
|
1772
|
-
# Only whitespace characters allowed;
|
|
1773
|
-
|
|
1840
|
+
# Only whitespace characters allowed; non-whitespace is a parse error.
|
|
1841
|
+
data = token.data or ""
|
|
1842
|
+
whitespace = "".join(ch for ch in data if ch in "\t\n\f\r ")
|
|
1843
|
+
if any(ch not in "\t\n\f\r " for ch in data):
|
|
1844
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1774
1845
|
if whitespace:
|
|
1775
1846
|
self._append_text(whitespace)
|
|
1776
1847
|
return None
|
|
@@ -1783,6 +1854,9 @@ class TreeBuilderModesMixin:
|
|
|
1783
1854
|
if token.kind == Tag.END and token.name == "html":
|
|
1784
1855
|
self.mode = InsertionMode.AFTER_AFTER_FRAMESET
|
|
1785
1856
|
return None
|
|
1857
|
+
if token.kind == Tag.END and token.name == "frameset":
|
|
1858
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1859
|
+
return None
|
|
1786
1860
|
if token.kind == Tag.START and token.name == "noframes":
|
|
1787
1861
|
# Insert noframes element directly and switch to TEXT mode
|
|
1788
1862
|
self._insert_element(token, push=True)
|
|
@@ -1795,7 +1869,7 @@ class TreeBuilderModesMixin:
|
|
|
1795
1869
|
self.mode = InsertionMode.IN_FRAMESET
|
|
1796
1870
|
return ("reprocess", InsertionMode.IN_FRAMESET, token)
|
|
1797
1871
|
|
|
1798
|
-
def _mode_after_after_frameset(self, token):
|
|
1872
|
+
def _mode_after_after_frameset(self, token: AnyToken) -> ModeResultTuple | None:
|
|
1799
1873
|
# Per HTML5 spec §13.2.6.4.18: After after frameset insertion mode
|
|
1800
1874
|
if isinstance(token, CharacterTokens):
|
|
1801
1875
|
# Whitespace is processed using InBody rules
|
|
@@ -1826,7 +1900,7 @@ class TreeBuilderModesMixin:
|
|
|
1826
1900
|
|
|
1827
1901
|
# Helpers ----------------------------------------------------------------
|
|
1828
1902
|
|
|
1829
|
-
_MODE_HANDLERS = [
|
|
1903
|
+
_MODE_HANDLERS: list[Callable[[TreeBuilderModesMixin, AnyToken], ModeResultTuple | None]] = [
|
|
1830
1904
|
_mode_initial,
|
|
1831
1905
|
_mode_before_html,
|
|
1832
1906
|
_mode_before_head,
|
|
@@ -1851,14 +1925,14 @@ class TreeBuilderModesMixin:
|
|
|
1851
1925
|
_mode_in_template,
|
|
1852
1926
|
]
|
|
1853
1927
|
|
|
1854
|
-
_BODY_TOKEN_HANDLERS = {
|
|
1928
|
+
_BODY_TOKEN_HANDLERS: dict[type[AnyToken], Callable[[TreeBuilderModesMixin, Any], ModeResultTuple | None]] = {
|
|
1855
1929
|
CharacterTokens: _handle_characters_in_body,
|
|
1856
1930
|
CommentToken: _handle_comment_in_body,
|
|
1857
1931
|
Tag: _handle_tag_in_body,
|
|
1858
1932
|
EOFToken: _handle_eof_in_body,
|
|
1859
1933
|
}
|
|
1860
1934
|
|
|
1861
|
-
_BODY_START_HANDLERS = {
|
|
1935
|
+
_BODY_START_HANDLERS: dict[str, Callable[[TreeBuilderModesMixin, Tag], ModeResultTuple | None]] = {
|
|
1862
1936
|
"a": _handle_body_start_a,
|
|
1863
1937
|
"address": _handle_body_start_block_with_p,
|
|
1864
1938
|
"applet": _handle_body_start_applet_like,
|
|
@@ -1963,7 +2037,7 @@ class TreeBuilderModesMixin:
|
|
|
1963
2037
|
"wbr": _handle_body_start_void_with_formatting,
|
|
1964
2038
|
"xmp": _handle_body_start_plaintext_xmp,
|
|
1965
2039
|
}
|
|
1966
|
-
_BODY_END_HANDLERS = {
|
|
2040
|
+
_BODY_END_HANDLERS: dict[str, Callable[[TreeBuilderModesMixin, Tag], ModeResultTuple | None]] = {
|
|
1967
2041
|
"address": _handle_body_end_block,
|
|
1968
2042
|
"applet": _handle_body_end_applet_like,
|
|
1969
2043
|
"article": _handle_body_end_block,
|