justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/tokens.py
CHANGED
|
@@ -4,7 +4,7 @@ from typing import Literal
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class Tag:
|
|
7
|
-
__slots__ = ("attrs", "kind", "name", "self_closing")
|
|
7
|
+
__slots__ = ("attrs", "kind", "name", "self_closing", "start_pos")
|
|
8
8
|
|
|
9
9
|
START: Literal[0] = 0
|
|
10
10
|
END: Literal[1] = 1
|
|
@@ -13,6 +13,7 @@ class Tag:
|
|
|
13
13
|
name: str
|
|
14
14
|
attrs: dict[str, str | None]
|
|
15
15
|
self_closing: bool
|
|
16
|
+
start_pos: int | None
|
|
16
17
|
|
|
17
18
|
def __init__(
|
|
18
19
|
self,
|
|
@@ -20,11 +21,13 @@ class Tag:
|
|
|
20
21
|
name: str,
|
|
21
22
|
attrs: dict[str, str | None] | None,
|
|
22
23
|
self_closing: bool = False,
|
|
24
|
+
start_pos: int | None = None,
|
|
23
25
|
) -> None:
|
|
24
26
|
self.kind = kind
|
|
25
27
|
self.name = name
|
|
26
28
|
self.attrs = attrs if attrs is not None else {}
|
|
27
29
|
self.self_closing = bool(self_closing)
|
|
30
|
+
self.start_pos = start_pos
|
|
28
31
|
|
|
29
32
|
|
|
30
33
|
class CharacterTokens:
|
|
@@ -37,12 +40,14 @@ class CharacterTokens:
|
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
class CommentToken:
|
|
40
|
-
__slots__ = ("data",)
|
|
43
|
+
__slots__ = ("data", "start_pos")
|
|
41
44
|
|
|
42
45
|
data: str
|
|
46
|
+
start_pos: int | None
|
|
43
47
|
|
|
44
|
-
def __init__(self, data: str) -> None:
|
|
48
|
+
def __init__(self, data: str, start_pos: int | None = None) -> None:
|
|
45
49
|
self.data = data
|
|
50
|
+
self.start_pos = start_pos
|
|
46
51
|
|
|
47
52
|
|
|
48
53
|
class Doctype:
|
justhtml/treebuilder.py
CHANGED
|
@@ -59,6 +59,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
59
59
|
"open_elements",
|
|
60
60
|
"original_mode",
|
|
61
61
|
"pending_table_text",
|
|
62
|
+
"pending_table_text_should_error",
|
|
62
63
|
"quirks_mode",
|
|
63
64
|
"table_text_original_mode",
|
|
64
65
|
"template_modes",
|
|
@@ -86,6 +87,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
86
87
|
open_elements: list[Any]
|
|
87
88
|
original_mode: InsertionMode | None # type: ignore[assignment]
|
|
88
89
|
pending_table_text: list[str]
|
|
90
|
+
pending_table_text_should_error: bool
|
|
89
91
|
quirks_mode: str
|
|
90
92
|
table_text_original_mode: InsertionMode | None # type: ignore[assignment]
|
|
91
93
|
template_modes: list[InsertionMode]
|
|
@@ -118,6 +120,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
118
120
|
self.quirks_mode = "no-quirks"
|
|
119
121
|
self.ignore_lf = False
|
|
120
122
|
self.active_formatting = []
|
|
123
|
+
self.pending_table_text_should_error = False
|
|
121
124
|
self.insert_from_table = False
|
|
122
125
|
self.pending_table_text = []
|
|
123
126
|
self.template_modes = []
|
|
@@ -251,7 +254,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
251
254
|
if self._has_element_in_button_scope("p"):
|
|
252
255
|
self._generate_implied_end_tags("p")
|
|
253
256
|
if self.open_elements[-1].name != "p":
|
|
254
|
-
self._parse_error("end-tag
|
|
257
|
+
self._parse_error("unexpected-end-tag", tag_name="p")
|
|
255
258
|
self._pop_until_inclusive("p")
|
|
256
259
|
return True
|
|
257
260
|
return False
|
|
@@ -413,11 +416,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
413
416
|
# Tokenizer guarantees non-empty data
|
|
414
417
|
data = current_token.data
|
|
415
418
|
if "\x00" in data:
|
|
416
|
-
self._parse_error("invalid-codepoint")
|
|
417
419
|
data = data.replace("\x00", "")
|
|
418
|
-
if "\x0c" in data:
|
|
419
|
-
self._parse_error("invalid-codepoint")
|
|
420
|
-
data = data.replace("\x0c", "")
|
|
421
420
|
if data:
|
|
422
421
|
if not is_all_whitespace(data):
|
|
423
422
|
self._reconstruct_active_formatting_elements()
|
|
@@ -497,6 +496,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
497
496
|
|
|
498
497
|
def _append_comment_to_document(self, text: str) -> None:
|
|
499
498
|
node = SimpleDomNode("#comment", data=text)
|
|
499
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
500
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
501
|
+
if node._origin_pos is not None:
|
|
502
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
500
503
|
self.document.append_child(node)
|
|
501
504
|
|
|
502
505
|
def _append_comment(self, text: str, parent: Any | None = None) -> None:
|
|
@@ -506,6 +509,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
506
509
|
if type(parent) is TemplateNode and parent.template_content:
|
|
507
510
|
parent = parent.template_content
|
|
508
511
|
node = SimpleDomNode("#comment", data=text)
|
|
512
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
513
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
514
|
+
if node._origin_pos is not None:
|
|
515
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
509
516
|
parent.append_child(node)
|
|
510
517
|
|
|
511
518
|
def _append_text(self, text: str) -> None:
|
|
@@ -516,6 +523,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
516
523
|
if not text:
|
|
517
524
|
return
|
|
518
525
|
|
|
526
|
+
if "\f" in text:
|
|
527
|
+
text = text.replace("\f", " ")
|
|
528
|
+
|
|
519
529
|
# Guard against empty stack
|
|
520
530
|
if not self.open_elements: # pragma: no cover
|
|
521
531
|
return
|
|
@@ -532,6 +542,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
532
542
|
return
|
|
533
543
|
|
|
534
544
|
node = TextNode(text)
|
|
545
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
546
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
547
|
+
if node._origin_pos is not None:
|
|
548
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
535
549
|
children.append(node)
|
|
536
550
|
node.parent = target
|
|
537
551
|
return
|
|
@@ -552,6 +566,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
552
566
|
return
|
|
553
567
|
|
|
554
568
|
node = TextNode(text)
|
|
569
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
570
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
571
|
+
if node._origin_pos is not None:
|
|
572
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
555
573
|
reference_node = parent.children[position] if position < len(parent.children) else None
|
|
556
574
|
parent.insert_before(node, reference_node)
|
|
557
575
|
|
|
@@ -582,6 +600,11 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
582
600
|
else:
|
|
583
601
|
node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
584
602
|
|
|
603
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
604
|
+
node._origin_pos = tag.start_pos
|
|
605
|
+
if node._origin_pos is not None:
|
|
606
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
607
|
+
|
|
585
608
|
# Fast path for common case: not inserting from table
|
|
586
609
|
if not self.insert_from_table:
|
|
587
610
|
target = self._current_node_or_html()
|
|
@@ -799,6 +822,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
799
822
|
entry = self.active_formatting[index]
|
|
800
823
|
tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
|
|
801
824
|
new_node = self._insert_element(tag, push=True)
|
|
825
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
826
|
+
new_node._origin_pos = entry["node"].origin_offset
|
|
827
|
+
new_node._origin_line = entry["node"].origin_line
|
|
828
|
+
new_node._origin_col = entry["node"].origin_col
|
|
802
829
|
entry["node"] = new_node
|
|
803
830
|
index += 1
|
|
804
831
|
|
|
@@ -855,12 +882,19 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
855
882
|
def _flush_pending_table_text(self) -> None:
|
|
856
883
|
data = "".join(self.pending_table_text)
|
|
857
884
|
self.pending_table_text.clear()
|
|
858
|
-
if not data:
|
|
885
|
+
if not data: # pragma: no cover
|
|
859
886
|
return
|
|
860
887
|
if is_all_whitespace(data):
|
|
861
888
|
self._append_text(data)
|
|
862
889
|
return
|
|
863
|
-
|
|
890
|
+
|
|
891
|
+
if self.pending_table_text_should_error:
|
|
892
|
+
# html5lib reports one foster-parenting error per non-whitespace character.
|
|
893
|
+
for ch in data:
|
|
894
|
+
if ch not in " \t\n\r\f":
|
|
895
|
+
self._parse_error("foster-parenting-character")
|
|
896
|
+
self.pending_table_text_should_error = False
|
|
897
|
+
|
|
864
898
|
previous = self.insert_from_table
|
|
865
899
|
self.insert_from_table = True
|
|
866
900
|
try:
|
|
@@ -1118,7 +1152,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1118
1152
|
|
|
1119
1153
|
# Per HTML5 spec: if first node doesn't match, it's a parse error
|
|
1120
1154
|
if first:
|
|
1121
|
-
self._parse_error("unexpected-end-tag
|
|
1155
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1122
1156
|
first = False
|
|
1123
1157
|
|
|
1124
1158
|
# If we hit an HTML element that doesn't match, process in secondary mode
|
|
@@ -1259,19 +1293,21 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1259
1293
|
return self.process_token(CharacterTokens(data))
|
|
1260
1294
|
|
|
1261
1295
|
if self.mode == InsertionMode.IN_BODY:
|
|
1262
|
-
if "\x00" in data:
|
|
1263
|
-
self._parse_error("invalid-codepoint")
|
|
1264
|
-
data = data.replace("\x00", "")
|
|
1265
|
-
|
|
1266
1296
|
if not data:
|
|
1267
1297
|
return TokenSinkResult.Continue
|
|
1298
|
+
if "\x00" in data:
|
|
1299
|
+
data = data.replace("\x00", "")
|
|
1300
|
+
if not data:
|
|
1301
|
+
return TokenSinkResult.Continue
|
|
1268
1302
|
|
|
1269
1303
|
if is_all_whitespace(data):
|
|
1270
|
-
self.
|
|
1304
|
+
if self.active_formatting:
|
|
1305
|
+
self._reconstruct_active_formatting_elements()
|
|
1271
1306
|
self._append_text(data)
|
|
1272
1307
|
return TokenSinkResult.Continue
|
|
1273
1308
|
|
|
1274
|
-
self.
|
|
1309
|
+
if self.active_formatting:
|
|
1310
|
+
self._reconstruct_active_formatting_elements()
|
|
1275
1311
|
self.frameset_ok = False
|
|
1276
1312
|
self._append_text(data)
|
|
1277
1313
|
return TokenSinkResult.Continue
|
justhtml/treebuilder_modes.py
CHANGED
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
8
|
from .constants import (
|
|
9
|
+
FORMAT_MARKER,
|
|
9
10
|
FORMATTING_ELEMENTS,
|
|
10
11
|
HEADING_ELEMENTS,
|
|
11
12
|
)
|
|
@@ -54,9 +55,9 @@ class TreeBuilderModesMixin:
|
|
|
54
55
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
55
56
|
# Only Tags remain - no DOCTYPE seen, so quirks mode
|
|
56
57
|
if token.kind == Tag.START:
|
|
57
|
-
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name
|
|
58
|
+
self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name)
|
|
58
59
|
else:
|
|
59
|
-
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name
|
|
60
|
+
self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name)
|
|
60
61
|
self._set_quirks_mode("quirks")
|
|
61
62
|
return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
62
63
|
|
|
@@ -265,11 +266,7 @@ class TreeBuilderModesMixin:
|
|
|
265
266
|
if isinstance(token, CharacterTokens):
|
|
266
267
|
data = token.data or ""
|
|
267
268
|
if "\x00" in data:
|
|
268
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
269
269
|
data = data.replace("\x00", "")
|
|
270
|
-
if "\x0c" in data:
|
|
271
|
-
self._parse_error("invalid-codepoint-in-body")
|
|
272
|
-
data = data.replace("\x0c", "")
|
|
273
270
|
if not data or is_all_whitespace(data):
|
|
274
271
|
if data:
|
|
275
272
|
self._append_text(data)
|
|
@@ -331,6 +328,10 @@ class TreeBuilderModesMixin:
|
|
|
331
328
|
self.mode = InsertionMode.IN_HEAD
|
|
332
329
|
return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
333
330
|
if token.kind == Tag.END and token.name == "template":
|
|
331
|
+
has_template = any(node.name == "template" for node in self.open_elements)
|
|
332
|
+
if not has_template:
|
|
333
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
334
|
+
return None
|
|
334
335
|
return self._mode_in_head(token)
|
|
335
336
|
if token.kind == Tag.END and token.name == "body":
|
|
336
337
|
self._insert_body_if_missing()
|
|
@@ -451,6 +452,8 @@ class TreeBuilderModesMixin:
|
|
|
451
452
|
if self.template_modes:
|
|
452
453
|
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
453
454
|
return
|
|
455
|
+
# Per spec: parse error; merge attributes onto existing <html>.
|
|
456
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
454
457
|
# In IN_BODY mode, html element is always at open_elements[0]
|
|
455
458
|
if self.open_elements: # pragma: no branch
|
|
456
459
|
html = self.open_elements[0]
|
|
@@ -574,6 +577,10 @@ class TreeBuilderModesMixin:
|
|
|
574
577
|
# 3. Find formatting element
|
|
575
578
|
formatting_element_index = self._find_active_formatting_index(subject)
|
|
576
579
|
if formatting_element_index is None:
|
|
580
|
+
# html5lib reports a parse error when an end tag for a formatting
|
|
581
|
+
# element triggers the adoption agency algorithm but no matching
|
|
582
|
+
# active formatting entry exists.
|
|
583
|
+
self._parse_error("adoption-agency-1.3")
|
|
577
584
|
return
|
|
578
585
|
|
|
579
586
|
formatting_element_entry = self.active_formatting[formatting_element_index]
|
|
@@ -651,6 +658,10 @@ class TreeBuilderModesMixin:
|
|
|
651
658
|
# 10.4 Replace entry with new element
|
|
652
659
|
entry = self.active_formatting[node_formatting_index]
|
|
653
660
|
new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
661
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
662
|
+
new_element._origin_pos = entry["node"].origin_offset
|
|
663
|
+
new_element._origin_line = entry["node"].origin_line
|
|
664
|
+
new_element._origin_col = entry["node"].origin_col
|
|
654
665
|
entry["node"] = new_element
|
|
655
666
|
self.open_elements[self.open_elements.index(node)] = new_element
|
|
656
667
|
node = new_element
|
|
@@ -684,6 +695,10 @@ class TreeBuilderModesMixin:
|
|
|
684
695
|
# 12. Create new formatting element
|
|
685
696
|
entry = self.active_formatting[formatting_element_index]
|
|
686
697
|
new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
|
|
698
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
699
|
+
new_formatting_element._origin_pos = entry["node"].origin_offset
|
|
700
|
+
new_formatting_element._origin_line = entry["node"].origin_line
|
|
701
|
+
new_formatting_element._origin_col = entry["node"].origin_col
|
|
687
702
|
entry["node"] = new_formatting_element
|
|
688
703
|
|
|
689
704
|
# 13. Move children of furthest block
|
|
@@ -708,6 +723,7 @@ class TreeBuilderModesMixin:
|
|
|
708
723
|
|
|
709
724
|
def _handle_body_start_a(self, token: Any) -> Any:
|
|
710
725
|
if self._has_active_formatting_entry("a"):
|
|
726
|
+
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
711
727
|
self._adoption_agency("a")
|
|
712
728
|
self._remove_last_active_formatting_by_name("a")
|
|
713
729
|
self._remove_last_open_element_by_name("a")
|
|
@@ -853,6 +869,7 @@ class TreeBuilderModesMixin:
|
|
|
853
869
|
def _handle_body_end_template(self, token: Any) -> Any:
|
|
854
870
|
has_template = any(node.name == "template" for node in self.open_elements)
|
|
855
871
|
if not has_template:
|
|
872
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
856
873
|
return
|
|
857
874
|
self._generate_implied_end_tags()
|
|
858
875
|
self._pop_until_inclusive("template")
|
|
@@ -978,11 +995,33 @@ class TreeBuilderModesMixin:
|
|
|
978
995
|
if isinstance(token, CharacterTokens):
|
|
979
996
|
data = token.data or ""
|
|
980
997
|
if "\x00" in data:
|
|
981
|
-
self._parse_error("unexpected-null-character")
|
|
982
998
|
data = data.replace("\x00", "")
|
|
983
999
|
if not data:
|
|
984
1000
|
return None
|
|
985
1001
|
token = CharacterTokens(data)
|
|
1002
|
+
|
|
1003
|
+
if is_all_whitespace(data):
|
|
1004
|
+
self._append_text(data)
|
|
1005
|
+
return None
|
|
1006
|
+
|
|
1007
|
+
# html5lib-tests expect that some table foster-parenting text triggered by a
|
|
1008
|
+
# misnested formatting element (<a>) only produces an implied-end-tag error
|
|
1009
|
+
# when the table closes, not an additional character-in-table error.
|
|
1010
|
+
suppress_table_char_error = False
|
|
1011
|
+
if self.active_formatting:
|
|
1012
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1013
|
+
entry = self.active_formatting[idx]
|
|
1014
|
+
if entry is FORMAT_MARKER:
|
|
1015
|
+
break
|
|
1016
|
+
if entry["name"] == "a":
|
|
1017
|
+
if entry["node"] not in self.open_elements:
|
|
1018
|
+
suppress_table_char_error = True
|
|
1019
|
+
break
|
|
1020
|
+
|
|
1021
|
+
if not suppress_table_char_error:
|
|
1022
|
+
self.pending_table_text_should_error = True
|
|
1023
|
+
else:
|
|
1024
|
+
self.pending_table_text_should_error = False
|
|
986
1025
|
self.pending_table_text = []
|
|
987
1026
|
self.table_text_original_mode = self.mode
|
|
988
1027
|
self.mode = InsertionMode.IN_TABLE_TEXT
|
|
@@ -1055,7 +1094,7 @@ class TreeBuilderModesMixin:
|
|
|
1055
1094
|
self.form_element = node
|
|
1056
1095
|
self.open_elements.pop() # push=True always adds to stack
|
|
1057
1096
|
return None
|
|
1058
|
-
self._parse_error("
|
|
1097
|
+
self._parse_error("foster-parenting-start-tag", tag_name=name)
|
|
1059
1098
|
previous = self.insert_from_table
|
|
1060
1099
|
self.insert_from_table = True
|
|
1061
1100
|
try:
|
|
@@ -1082,19 +1121,33 @@ class TreeBuilderModesMixin:
|
|
|
1082
1121
|
if self.template_modes:
|
|
1083
1122
|
return self._mode_in_template(token)
|
|
1084
1123
|
if self._has_in_table_scope("table"):
|
|
1085
|
-
self._parse_error("
|
|
1124
|
+
self._parse_error("eof-in-table")
|
|
1086
1125
|
return None
|
|
1087
1126
|
|
|
1088
1127
|
def _mode_in_table_text(self, token: Any) -> Any:
|
|
1089
1128
|
if isinstance(token, CharacterTokens):
|
|
1090
1129
|
# IN_TABLE mode guarantees non-empty data
|
|
1091
1130
|
data = token.data
|
|
1092
|
-
|
|
1093
|
-
self._parse_error("invalid-codepoint-in-table-text")
|
|
1094
|
-
data = data.replace("\x0c", "")
|
|
1095
|
-
if data:
|
|
1096
|
-
self.pending_table_text.append(data)
|
|
1131
|
+
self.pending_table_text.append(data)
|
|
1097
1132
|
return None
|
|
1133
|
+
|
|
1134
|
+
if (
|
|
1135
|
+
self.pending_table_text
|
|
1136
|
+
and isinstance(token, Tag)
|
|
1137
|
+
and token.kind == Tag.END
|
|
1138
|
+
and token.name == "table"
|
|
1139
|
+
and not is_all_whitespace("".join(self.pending_table_text))
|
|
1140
|
+
):
|
|
1141
|
+
# If a misnested <a> exists only in the active formatting list, html5lib
|
|
1142
|
+
# reports the implied close when the table ends.
|
|
1143
|
+
if self.active_formatting:
|
|
1144
|
+
for idx in range(len(self.active_formatting) - 1, -1, -1):
|
|
1145
|
+
entry = self.active_formatting[idx]
|
|
1146
|
+
if entry is FORMAT_MARKER:
|
|
1147
|
+
break
|
|
1148
|
+
if entry["name"] == "a" and entry["node"] not in self.open_elements:
|
|
1149
|
+
self._parse_error("unexpected-implied-end-tag-in-table-view")
|
|
1150
|
+
break
|
|
1098
1151
|
self._flush_pending_table_text()
|
|
1099
1152
|
original = self.table_text_original_mode or InsertionMode.IN_TABLE
|
|
1100
1153
|
self.table_text_original_mode = None
|
|
@@ -1443,11 +1496,7 @@ class TreeBuilderModesMixin:
|
|
|
1443
1496
|
if isinstance(token, CharacterTokens):
|
|
1444
1497
|
data = token.data or ""
|
|
1445
1498
|
if "\x00" in data:
|
|
1446
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1447
1499
|
data = data.replace("\x00", "")
|
|
1448
|
-
if "\x0c" in data:
|
|
1449
|
-
self._parse_error("invalid-codepoint-in-select")
|
|
1450
|
-
data = data.replace("\x0c", "")
|
|
1451
1500
|
if data:
|
|
1452
1501
|
self._reconstruct_active_formatting_elements()
|
|
1453
1502
|
self._append_text(data)
|
|
@@ -1475,13 +1524,13 @@ class TreeBuilderModesMixin:
|
|
|
1475
1524
|
self._insert_element(token, push=True)
|
|
1476
1525
|
return None
|
|
1477
1526
|
if name == "select":
|
|
1478
|
-
self._parse_error("unexpected-
|
|
1527
|
+
self._parse_error("unexpected-select-in-select")
|
|
1479
1528
|
# select is always in scope in IN_SELECT mode
|
|
1480
1529
|
self._pop_until_any_inclusive({"select"})
|
|
1481
1530
|
self._reset_insertion_mode()
|
|
1482
1531
|
return None
|
|
1483
1532
|
if name in {"input", "textarea"}:
|
|
1484
|
-
self._parse_error("unexpected-start-tag-
|
|
1533
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1485
1534
|
# select is always in scope in IN_SELECT mode
|
|
1486
1535
|
self._pop_until_any_inclusive({"select"})
|
|
1487
1536
|
self._reset_insertion_mode()
|
|
@@ -1491,7 +1540,7 @@ class TreeBuilderModesMixin:
|
|
|
1491
1540
|
self._insert_element(token, push=False)
|
|
1492
1541
|
return None
|
|
1493
1542
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1494
|
-
self._parse_error("unexpected-start-tag-
|
|
1543
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1495
1544
|
# select is always in scope in IN_SELECT mode
|
|
1496
1545
|
self._pop_until_any_inclusive({"select"})
|
|
1497
1546
|
self._reset_insertion_mode()
|
|
@@ -1509,6 +1558,7 @@ class TreeBuilderModesMixin:
|
|
|
1509
1558
|
self._append_active_formatting_entry(name, token.attrs, node)
|
|
1510
1559
|
return None
|
|
1511
1560
|
if name == "hr":
|
|
1561
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1512
1562
|
# Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
|
|
1513
1563
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1514
1564
|
self.open_elements.pop()
|
|
@@ -1518,22 +1568,29 @@ class TreeBuilderModesMixin:
|
|
|
1518
1568
|
self._insert_element(token, push=False)
|
|
1519
1569
|
return None
|
|
1520
1570
|
if name == "menuitem":
|
|
1571
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1521
1572
|
self._reconstruct_active_formatting_elements()
|
|
1522
1573
|
self._insert_element(token, push=True)
|
|
1523
1574
|
return None
|
|
1524
1575
|
# Allow common HTML elements in select (newer spec)
|
|
1525
1576
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1577
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1526
1578
|
self._reconstruct_active_formatting_elements()
|
|
1527
1579
|
self._insert_element(token, push=not token.self_closing)
|
|
1528
1580
|
return None
|
|
1529
1581
|
if name in {"br", "img"}:
|
|
1582
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1530
1583
|
self._reconstruct_active_formatting_elements()
|
|
1531
1584
|
self._insert_element(token, push=False)
|
|
1532
1585
|
return None
|
|
1533
1586
|
if name == "plaintext":
|
|
1534
1587
|
# Per spec: plaintext element is inserted in select (consumes all remaining text)
|
|
1588
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1535
1589
|
self._reconstruct_active_formatting_elements()
|
|
1536
1590
|
self._insert_element(token, push=True)
|
|
1591
|
+
return None
|
|
1592
|
+
# Any other start tag: parse error, ignore.
|
|
1593
|
+
self._parse_error("unexpected-start-tag-in-select", tag_name=name)
|
|
1537
1594
|
return None
|
|
1538
1595
|
if name == "optgroup":
|
|
1539
1596
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
@@ -1541,13 +1598,13 @@ class TreeBuilderModesMixin:
|
|
|
1541
1598
|
if self.open_elements and self.open_elements[-1].name == "optgroup":
|
|
1542
1599
|
self.open_elements.pop()
|
|
1543
1600
|
else:
|
|
1544
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1601
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1545
1602
|
return None
|
|
1546
1603
|
if name == "option":
|
|
1547
1604
|
if self.open_elements and self.open_elements[-1].name == "option":
|
|
1548
1605
|
self.open_elements.pop()
|
|
1549
1606
|
else:
|
|
1550
|
-
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1607
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
|
|
1551
1608
|
return None
|
|
1552
1609
|
if name == "select":
|
|
1553
1610
|
# In IN_SELECT mode, select is always in scope - pop to it
|
|
@@ -1559,17 +1616,20 @@ class TreeBuilderModesMixin:
|
|
|
1559
1616
|
# select is always on stack in IN_SELECT mode
|
|
1560
1617
|
select_node = self._find_last_on_stack("select")
|
|
1561
1618
|
fmt_index = self._find_active_formatting_index(name)
|
|
1562
|
-
if fmt_index is
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1619
|
+
if fmt_index is None:
|
|
1620
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1621
|
+
return None
|
|
1622
|
+
target = self.active_formatting[fmt_index]["node"]
|
|
1623
|
+
if target in self.open_elements: # pragma: no branch
|
|
1624
|
+
select_index = self.open_elements.index(select_node)
|
|
1625
|
+
target_index = self.open_elements.index(target)
|
|
1626
|
+
if target_index < select_index:
|
|
1627
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1628
|
+
return None
|
|
1570
1629
|
self._adoption_agency(name)
|
|
1571
1630
|
return None
|
|
1572
1631
|
if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
|
|
1632
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1573
1633
|
# Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
|
|
1574
1634
|
# But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
|
|
1575
1635
|
select_idx = None
|
|
@@ -1586,11 +1646,9 @@ class TreeBuilderModesMixin:
|
|
|
1586
1646
|
popped = self.open_elements.pop()
|
|
1587
1647
|
if popped.name == name:
|
|
1588
1648
|
break
|
|
1589
|
-
else:
|
|
1590
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1591
1649
|
return None
|
|
1592
1650
|
if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
|
|
1593
|
-
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
1651
|
+
self._parse_error("unexpected-end-tag-in-select", tag_name=name)
|
|
1594
1652
|
# select is always in scope in IN_SELECT mode
|
|
1595
1653
|
self._pop_until_any_inclusive({"select"})
|
|
1596
1654
|
self._reset_insertion_mode()
|
|
@@ -1773,8 +1831,11 @@ class TreeBuilderModesMixin:
|
|
|
1773
1831
|
def _mode_after_frameset(self, token: Any) -> Any:
|
|
1774
1832
|
# Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
|
|
1775
1833
|
if isinstance(token, CharacterTokens):
|
|
1776
|
-
# Only whitespace characters allowed;
|
|
1777
|
-
|
|
1834
|
+
# Only whitespace characters allowed; non-whitespace is a parse error.
|
|
1835
|
+
data = token.data or ""
|
|
1836
|
+
whitespace = "".join(ch for ch in data if ch in "\t\n\f\r ")
|
|
1837
|
+
if any(ch not in "\t\n\f\r " for ch in data):
|
|
1838
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1778
1839
|
if whitespace:
|
|
1779
1840
|
self._append_text(whitespace)
|
|
1780
1841
|
return None
|
|
@@ -1787,6 +1848,9 @@ class TreeBuilderModesMixin:
|
|
|
1787
1848
|
if token.kind == Tag.END and token.name == "html":
|
|
1788
1849
|
self.mode = InsertionMode.AFTER_AFTER_FRAMESET
|
|
1789
1850
|
return None
|
|
1851
|
+
if token.kind == Tag.END and token.name == "frameset":
|
|
1852
|
+
self._parse_error("unexpected-token-after-frameset")
|
|
1853
|
+
return None
|
|
1790
1854
|
if token.kind == Tag.START and token.name == "noframes":
|
|
1791
1855
|
# Insert noframes element directly and switch to TEXT mode
|
|
1792
1856
|
self._insert_element(token, push=True)
|