justhtml-0.12.0-py3-none-any.whl → justhtml-0.24.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/tokens.py CHANGED
@@ -4,7 +4,7 @@ from typing import Literal
4
4
 
5
5
 
6
6
  class Tag:
7
- __slots__ = ("attrs", "kind", "name", "self_closing")
7
+ __slots__ = ("attrs", "kind", "name", "self_closing", "start_pos")
8
8
 
9
9
  START: Literal[0] = 0
10
10
  END: Literal[1] = 1
@@ -13,6 +13,7 @@ class Tag:
13
13
  name: str
14
14
  attrs: dict[str, str | None]
15
15
  self_closing: bool
16
+ start_pos: int | None
16
17
 
17
18
  def __init__(
18
19
  self,
@@ -20,11 +21,13 @@ class Tag:
20
21
  name: str,
21
22
  attrs: dict[str, str | None] | None,
22
23
  self_closing: bool = False,
24
+ start_pos: int | None = None,
23
25
  ) -> None:
24
26
  self.kind = kind
25
27
  self.name = name
26
28
  self.attrs = attrs if attrs is not None else {}
27
29
  self.self_closing = bool(self_closing)
30
+ self.start_pos = start_pos
28
31
 
29
32
 
30
33
  class CharacterTokens:
@@ -37,12 +40,14 @@ class CharacterTokens:
37
40
 
38
41
 
39
42
  class CommentToken:
40
- __slots__ = ("data",)
43
+ __slots__ = ("data", "start_pos")
41
44
 
42
45
  data: str
46
+ start_pos: int | None
43
47
 
44
- def __init__(self, data: str) -> None:
48
+ def __init__(self, data: str, start_pos: int | None = None) -> None:
45
49
  self.data = data
50
+ self.start_pos = start_pos
46
51
 
47
52
 
48
53
  class Doctype:
justhtml/treebuilder.py CHANGED
@@ -59,6 +59,7 @@ class TreeBuilder(TreeBuilderModesMixin):
59
59
  "open_elements",
60
60
  "original_mode",
61
61
  "pending_table_text",
62
+ "pending_table_text_should_error",
62
63
  "quirks_mode",
63
64
  "table_text_original_mode",
64
65
  "template_modes",
@@ -86,6 +87,7 @@ class TreeBuilder(TreeBuilderModesMixin):
86
87
  open_elements: list[Any]
87
88
  original_mode: InsertionMode | None # type: ignore[assignment]
88
89
  pending_table_text: list[str]
90
+ pending_table_text_should_error: bool
89
91
  quirks_mode: str
90
92
  table_text_original_mode: InsertionMode | None # type: ignore[assignment]
91
93
  template_modes: list[InsertionMode]
@@ -118,6 +120,7 @@ class TreeBuilder(TreeBuilderModesMixin):
118
120
  self.quirks_mode = "no-quirks"
119
121
  self.ignore_lf = False
120
122
  self.active_formatting = []
123
+ self.pending_table_text_should_error = False
121
124
  self.insert_from_table = False
122
125
  self.pending_table_text = []
123
126
  self.template_modes = []
@@ -251,7 +254,7 @@ class TreeBuilder(TreeBuilderModesMixin):
251
254
  if self._has_element_in_button_scope("p"):
252
255
  self._generate_implied_end_tags("p")
253
256
  if self.open_elements[-1].name != "p":
254
- self._parse_error("end-tag-too-early", tag_name="p")
257
+ self._parse_error("unexpected-end-tag", tag_name="p")
255
258
  self._pop_until_inclusive("p")
256
259
  return True
257
260
  return False
@@ -413,11 +416,7 @@ class TreeBuilder(TreeBuilderModesMixin):
413
416
  # Tokenizer guarantees non-empty data
414
417
  data = current_token.data
415
418
  if "\x00" in data:
416
- self._parse_error("invalid-codepoint")
417
419
  data = data.replace("\x00", "")
418
- if "\x0c" in data:
419
- self._parse_error("invalid-codepoint")
420
- data = data.replace("\x0c", "")
421
420
  if data:
422
421
  if not is_all_whitespace(data):
423
422
  self._reconstruct_active_formatting_elements()
@@ -497,6 +496,10 @@ class TreeBuilder(TreeBuilderModesMixin):
497
496
 
498
497
  def _append_comment_to_document(self, text: str) -> None:
499
498
  node = SimpleDomNode("#comment", data=text)
499
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
500
+ node._origin_pos = self.tokenizer.last_token_start_pos
501
+ if node._origin_pos is not None:
502
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
500
503
  self.document.append_child(node)
501
504
 
502
505
  def _append_comment(self, text: str, parent: Any | None = None) -> None:
@@ -506,6 +509,10 @@ class TreeBuilder(TreeBuilderModesMixin):
506
509
  if type(parent) is TemplateNode and parent.template_content:
507
510
  parent = parent.template_content
508
511
  node = SimpleDomNode("#comment", data=text)
512
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
513
+ node._origin_pos = self.tokenizer.last_token_start_pos
514
+ if node._origin_pos is not None:
515
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
509
516
  parent.append_child(node)
510
517
 
511
518
  def _append_text(self, text: str) -> None:
@@ -516,6 +523,9 @@ class TreeBuilder(TreeBuilderModesMixin):
516
523
  if not text:
517
524
  return
518
525
 
526
+ if "\f" in text:
527
+ text = text.replace("\f", " ")
528
+
519
529
  # Guard against empty stack
520
530
  if not self.open_elements: # pragma: no cover
521
531
  return
@@ -532,6 +542,10 @@ class TreeBuilder(TreeBuilderModesMixin):
532
542
  return
533
543
 
534
544
  node = TextNode(text)
545
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
546
+ node._origin_pos = self.tokenizer.last_token_start_pos
547
+ if node._origin_pos is not None:
548
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
535
549
  children.append(node)
536
550
  node.parent = target
537
551
  return
@@ -552,6 +566,10 @@ class TreeBuilder(TreeBuilderModesMixin):
552
566
  return
553
567
 
554
568
  node = TextNode(text)
569
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
570
+ node._origin_pos = self.tokenizer.last_token_start_pos
571
+ if node._origin_pos is not None:
572
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
555
573
  reference_node = parent.children[position] if position < len(parent.children) else None
556
574
  parent.insert_before(node, reference_node)
557
575
 
@@ -582,6 +600,11 @@ class TreeBuilder(TreeBuilderModesMixin):
582
600
  else:
583
601
  node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
584
602
 
603
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
604
+ node._origin_pos = tag.start_pos
605
+ if node._origin_pos is not None:
606
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
607
+
585
608
  # Fast path for common case: not inserting from table
586
609
  if not self.insert_from_table:
587
610
  target = self._current_node_or_html()
@@ -799,6 +822,10 @@ class TreeBuilder(TreeBuilderModesMixin):
799
822
  entry = self.active_formatting[index]
800
823
  tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
801
824
  new_node = self._insert_element(tag, push=True)
825
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
826
+ new_node._origin_pos = entry["node"].origin_offset
827
+ new_node._origin_line = entry["node"].origin_line
828
+ new_node._origin_col = entry["node"].origin_col
802
829
  entry["node"] = new_node
803
830
  index += 1
804
831
 
@@ -855,12 +882,19 @@ class TreeBuilder(TreeBuilderModesMixin):
855
882
  def _flush_pending_table_text(self) -> None:
856
883
  data = "".join(self.pending_table_text)
857
884
  self.pending_table_text.clear()
858
- if not data:
885
+ if not data: # pragma: no cover
859
886
  return
860
887
  if is_all_whitespace(data):
861
888
  self._append_text(data)
862
889
  return
863
- self._parse_error("foster-parenting-character")
890
+
891
+ if self.pending_table_text_should_error:
892
+ # html5lib reports one foster-parenting error per non-whitespace character.
893
+ for ch in data:
894
+ if ch not in " \t\n\r\f":
895
+ self._parse_error("foster-parenting-character")
896
+ self.pending_table_text_should_error = False
897
+
864
898
  previous = self.insert_from_table
865
899
  self.insert_from_table = True
866
900
  try:
@@ -1118,7 +1152,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1118
1152
 
1119
1153
  # Per HTML5 spec: if first node doesn't match, it's a parse error
1120
1154
  if first:
1121
- self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)
1155
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
1122
1156
  first = False
1123
1157
 
1124
1158
  # If we hit an HTML element that doesn't match, process in secondary mode
@@ -1259,19 +1293,21 @@ class TreeBuilder(TreeBuilderModesMixin):
1259
1293
  return self.process_token(CharacterTokens(data))
1260
1294
 
1261
1295
  if self.mode == InsertionMode.IN_BODY:
1262
- if "\x00" in data:
1263
- self._parse_error("invalid-codepoint")
1264
- data = data.replace("\x00", "")
1265
-
1266
1296
  if not data:
1267
1297
  return TokenSinkResult.Continue
1298
+ if "\x00" in data:
1299
+ data = data.replace("\x00", "")
1300
+ if not data:
1301
+ return TokenSinkResult.Continue
1268
1302
 
1269
1303
  if is_all_whitespace(data):
1270
- self._reconstruct_active_formatting_elements()
1304
+ if self.active_formatting:
1305
+ self._reconstruct_active_formatting_elements()
1271
1306
  self._append_text(data)
1272
1307
  return TokenSinkResult.Continue
1273
1308
 
1274
- self._reconstruct_active_formatting_elements()
1309
+ if self.active_formatting:
1310
+ self._reconstruct_active_formatting_elements()
1275
1311
  self.frameset_ok = False
1276
1312
  self._append_text(data)
1277
1313
  return TokenSinkResult.Continue
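[file header missing in extraction: the hunks below belong to a different file, the module that defines TreeBuilderModesMixin]
@@ -6,6 +6,7 @@ from __future__ import annotations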
6
6
  from typing import Any
7
7
 
8
8
  from .constants import (
9
+ FORMAT_MARKER,
9
10
  FORMATTING_ELEMENTS,
10
11
  HEADING_ELEMENTS,
11
12
  )
@@ -54,9 +55,9 @@ class TreeBuilderModesMixin:
54
55
  return ("reprocess", InsertionMode.BEFORE_HTML, token)
55
56
  # Only Tags remain - no DOCTYPE seen, so quirks mode
56
57
  if token.kind == Tag.START:
57
- self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name, token=token)
58
+ self._parse_error("expected-doctype-but-got-start-tag", tag_name=token.name)
58
59
  else:
59
- self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name, token=token)
60
+ self._parse_error("expected-doctype-but-got-end-tag", tag_name=token.name)
60
61
  self._set_quirks_mode("quirks")
61
62
  return ("reprocess", InsertionMode.BEFORE_HTML, token)
62
63
 
@@ -265,11 +266,7 @@ class TreeBuilderModesMixin:
265
266
  if isinstance(token, CharacterTokens):
266
267
  data = token.data or ""
267
268
  if "\x00" in data:
268
- self._parse_error("invalid-codepoint-in-body")
269
269
  data = data.replace("\x00", "")
270
- if "\x0c" in data:
271
- self._parse_error("invalid-codepoint-in-body")
272
- data = data.replace("\x0c", "")
273
270
  if not data or is_all_whitespace(data):
274
271
  if data:
275
272
  self._append_text(data)
@@ -331,6 +328,10 @@ class TreeBuilderModesMixin:
331
328
  self.mode = InsertionMode.IN_HEAD
332
329
  return ("reprocess", InsertionMode.IN_HEAD, token)
333
330
  if token.kind == Tag.END and token.name == "template":
331
+ has_template = any(node.name == "template" for node in self.open_elements)
332
+ if not has_template:
333
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
334
+ return None
334
335
  return self._mode_in_head(token)
335
336
  if token.kind == Tag.END and token.name == "body":
336
337
  self._insert_body_if_missing()
@@ -451,6 +452,8 @@ class TreeBuilderModesMixin:
451
452
  if self.template_modes:
452
453
  self._parse_error("unexpected-start-tag", tag_name=token.name)
453
454
  return
455
+ # Per spec: parse error; merge attributes onto existing <html>.
456
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
454
457
  # In IN_BODY mode, html element is always at open_elements[0]
455
458
  if self.open_elements: # pragma: no branch
456
459
  html = self.open_elements[0]
@@ -574,6 +577,10 @@ class TreeBuilderModesMixin:
574
577
  # 3. Find formatting element
575
578
  formatting_element_index = self._find_active_formatting_index(subject)
576
579
  if formatting_element_index is None:
580
+ # html5lib reports a parse error when an end tag for a formatting
581
+ # element triggers the adoption agency algorithm but no matching
582
+ # active formatting entry exists.
583
+ self._parse_error("adoption-agency-1.3")
577
584
  return
578
585
 
579
586
  formatting_element_entry = self.active_formatting[formatting_element_index]
@@ -651,6 +658,10 @@ class TreeBuilderModesMixin:
651
658
  # 10.4 Replace entry with new element
652
659
  entry = self.active_formatting[node_formatting_index]
653
660
  new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
661
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
662
+ new_element._origin_pos = entry["node"].origin_offset
663
+ new_element._origin_line = entry["node"].origin_line
664
+ new_element._origin_col = entry["node"].origin_col
654
665
  entry["node"] = new_element
655
666
  self.open_elements[self.open_elements.index(node)] = new_element
656
667
  node = new_element
@@ -684,6 +695,10 @@ class TreeBuilderModesMixin:
684
695
  # 12. Create new formatting element
685
696
  entry = self.active_formatting[formatting_element_index]
686
697
  new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
698
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
699
+ new_formatting_element._origin_pos = entry["node"].origin_offset
700
+ new_formatting_element._origin_line = entry["node"].origin_line
701
+ new_formatting_element._origin_col = entry["node"].origin_col
687
702
  entry["node"] = new_formatting_element
688
703
 
689
704
  # 13. Move children of furthest block
@@ -708,6 +723,7 @@ class TreeBuilderModesMixin:
708
723
 
709
724
  def _handle_body_start_a(self, token: Any) -> Any:
710
725
  if self._has_active_formatting_entry("a"):
726
+ self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
711
727
  self._adoption_agency("a")
712
728
  self._remove_last_active_formatting_by_name("a")
713
729
  self._remove_last_open_element_by_name("a")
@@ -853,6 +869,7 @@ class TreeBuilderModesMixin:
853
869
  def _handle_body_end_template(self, token: Any) -> Any:
854
870
  has_template = any(node.name == "template" for node in self.open_elements)
855
871
  if not has_template:
872
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
856
873
  return
857
874
  self._generate_implied_end_tags()
858
875
  self._pop_until_inclusive("template")
@@ -978,11 +995,33 @@ class TreeBuilderModesMixin:
978
995
  if isinstance(token, CharacterTokens):
979
996
  data = token.data or ""
980
997
  if "\x00" in data:
981
- self._parse_error("unexpected-null-character")
982
998
  data = data.replace("\x00", "")
983
999
  if not data:
984
1000
  return None
985
1001
  token = CharacterTokens(data)
1002
+
1003
+ if is_all_whitespace(data):
1004
+ self._append_text(data)
1005
+ return None
1006
+
1007
+ # html5lib-tests expect that some table foster-parenting text triggered by a
1008
+ # misnested formatting element (<a>) only produces an implied-end-tag error
1009
+ # when the table closes, not an additional character-in-table error.
1010
+ suppress_table_char_error = False
1011
+ if self.active_formatting:
1012
+ for idx in range(len(self.active_formatting) - 1, -1, -1):
1013
+ entry = self.active_formatting[idx]
1014
+ if entry is FORMAT_MARKER:
1015
+ break
1016
+ if entry["name"] == "a":
1017
+ if entry["node"] not in self.open_elements:
1018
+ suppress_table_char_error = True
1019
+ break
1020
+
1021
+ if not suppress_table_char_error:
1022
+ self.pending_table_text_should_error = True
1023
+ else:
1024
+ self.pending_table_text_should_error = False
986
1025
  self.pending_table_text = []
987
1026
  self.table_text_original_mode = self.mode
988
1027
  self.mode = InsertionMode.IN_TABLE_TEXT
@@ -1055,7 +1094,7 @@ class TreeBuilderModesMixin:
1055
1094
  self.form_element = node
1056
1095
  self.open_elements.pop() # push=True always adds to stack
1057
1096
  return None
1058
- self._parse_error("unexpected-start-tag-implies-table-voodoo", tag_name=name)
1097
+ self._parse_error("foster-parenting-start-tag", tag_name=name)
1059
1098
  previous = self.insert_from_table
1060
1099
  self.insert_from_table = True
1061
1100
  try:
@@ -1082,19 +1121,33 @@ class TreeBuilderModesMixin:
1082
1121
  if self.template_modes:
1083
1122
  return self._mode_in_template(token)
1084
1123
  if self._has_in_table_scope("table"):
1085
- self._parse_error("expected-closing-tag-but-got-eof", tag_name="table")
1124
+ self._parse_error("eof-in-table")
1086
1125
  return None
1087
1126
 
1088
1127
  def _mode_in_table_text(self, token: Any) -> Any:
1089
1128
  if isinstance(token, CharacterTokens):
1090
1129
  # IN_TABLE mode guarantees non-empty data
1091
1130
  data = token.data
1092
- if "\x0c" in data:
1093
- self._parse_error("invalid-codepoint-in-table-text")
1094
- data = data.replace("\x0c", "")
1095
- if data:
1096
- self.pending_table_text.append(data)
1131
+ self.pending_table_text.append(data)
1097
1132
  return None
1133
+
1134
+ if (
1135
+ self.pending_table_text
1136
+ and isinstance(token, Tag)
1137
+ and token.kind == Tag.END
1138
+ and token.name == "table"
1139
+ and not is_all_whitespace("".join(self.pending_table_text))
1140
+ ):
1141
+ # If a misnested <a> exists only in the active formatting list, html5lib
1142
+ # reports the implied close when the table ends.
1143
+ if self.active_formatting:
1144
+ for idx in range(len(self.active_formatting) - 1, -1, -1):
1145
+ entry = self.active_formatting[idx]
1146
+ if entry is FORMAT_MARKER:
1147
+ break
1148
+ if entry["name"] == "a" and entry["node"] not in self.open_elements:
1149
+ self._parse_error("unexpected-implied-end-tag-in-table-view")
1150
+ break
1098
1151
  self._flush_pending_table_text()
1099
1152
  original = self.table_text_original_mode or InsertionMode.IN_TABLE
1100
1153
  self.table_text_original_mode = None
@@ -1443,11 +1496,7 @@ class TreeBuilderModesMixin:
1443
1496
  if isinstance(token, CharacterTokens):
1444
1497
  data = token.data or ""
1445
1498
  if "\x00" in data:
1446
- self._parse_error("invalid-codepoint-in-select")
1447
1499
  data = data.replace("\x00", "")
1448
- if "\x0c" in data:
1449
- self._parse_error("invalid-codepoint-in-select")
1450
- data = data.replace("\x0c", "")
1451
1500
  if data:
1452
1501
  self._reconstruct_active_formatting_elements()
1453
1502
  self._append_text(data)
@@ -1475,13 +1524,13 @@ class TreeBuilderModesMixin:
1475
1524
  self._insert_element(token, push=True)
1476
1525
  return None
1477
1526
  if name == "select":
1478
- self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
1527
+ self._parse_error("unexpected-select-in-select")
1479
1528
  # select is always in scope in IN_SELECT mode
1480
1529
  self._pop_until_any_inclusive({"select"})
1481
1530
  self._reset_insertion_mode()
1482
1531
  return None
1483
1532
  if name in {"input", "textarea"}:
1484
- self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
1533
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1485
1534
  # select is always in scope in IN_SELECT mode
1486
1535
  self._pop_until_any_inclusive({"select"})
1487
1536
  self._reset_insertion_mode()
@@ -1491,7 +1540,7 @@ class TreeBuilderModesMixin:
1491
1540
  self._insert_element(token, push=False)
1492
1541
  return None
1493
1542
  if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
1494
- self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
1543
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1495
1544
  # select is always in scope in IN_SELECT mode
1496
1545
  self._pop_until_any_inclusive({"select"})
1497
1546
  self._reset_insertion_mode()
@@ -1509,6 +1558,7 @@ class TreeBuilderModesMixin:
1509
1558
  self._append_active_formatting_entry(name, token.attrs, node)
1510
1559
  return None
1511
1560
  if name == "hr":
1561
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1512
1562
  # Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
1513
1563
  if self.open_elements and self.open_elements[-1].name == "option":
1514
1564
  self.open_elements.pop()
@@ -1518,22 +1568,29 @@ class TreeBuilderModesMixin:
1518
1568
  self._insert_element(token, push=False)
1519
1569
  return None
1520
1570
  if name == "menuitem":
1571
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1521
1572
  self._reconstruct_active_formatting_elements()
1522
1573
  self._insert_element(token, push=True)
1523
1574
  return None
1524
1575
  # Allow common HTML elements in select (newer spec)
1525
1576
  if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
1577
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1526
1578
  self._reconstruct_active_formatting_elements()
1527
1579
  self._insert_element(token, push=not token.self_closing)
1528
1580
  return None
1529
1581
  if name in {"br", "img"}:
1582
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1530
1583
  self._reconstruct_active_formatting_elements()
1531
1584
  self._insert_element(token, push=False)
1532
1585
  return None
1533
1586
  if name == "plaintext":
1534
1587
  # Per spec: plaintext element is inserted in select (consumes all remaining text)
1588
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1535
1589
  self._reconstruct_active_formatting_elements()
1536
1590
  self._insert_element(token, push=True)
1591
+ return None
1592
+ # Any other start tag: parse error, ignore.
1593
+ self._parse_error("unexpected-start-tag-in-select", tag_name=name)
1537
1594
  return None
1538
1595
  if name == "optgroup":
1539
1596
  if self.open_elements and self.open_elements[-1].name == "option":
@@ -1541,13 +1598,13 @@ class TreeBuilderModesMixin:
1541
1598
  if self.open_elements and self.open_elements[-1].name == "optgroup":
1542
1599
  self.open_elements.pop()
1543
1600
  else:
1544
- self._parse_error("unexpected-end-tag", tag_name=token.name)
1601
+ self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
1545
1602
  return None
1546
1603
  if name == "option":
1547
1604
  if self.open_elements and self.open_elements[-1].name == "option":
1548
1605
  self.open_elements.pop()
1549
1606
  else:
1550
- self._parse_error("unexpected-end-tag", tag_name=token.name)
1607
+ self._parse_error("unexpected-end-tag-in-select", tag_name=token.name)
1551
1608
  return None
1552
1609
  if name == "select":
1553
1610
  # In IN_SELECT mode, select is always in scope - pop to it
@@ -1559,17 +1616,20 @@ class TreeBuilderModesMixin:
1559
1616
  # select is always on stack in IN_SELECT mode
1560
1617
  select_node = self._find_last_on_stack("select")
1561
1618
  fmt_index = self._find_active_formatting_index(name)
1562
- if fmt_index is not None:
1563
- target = self.active_formatting[fmt_index]["node"]
1564
- if target in self.open_elements: # pragma: no branch
1565
- select_index = self.open_elements.index(select_node)
1566
- target_index = self.open_elements.index(target)
1567
- if target_index < select_index:
1568
- self._parse_error("unexpected-end-tag", tag_name=name)
1569
- return None
1619
+ if fmt_index is None:
1620
+ self._parse_error("unexpected-end-tag-in-select", tag_name=name)
1621
+ return None
1622
+ target = self.active_formatting[fmt_index]["node"]
1623
+ if target in self.open_elements: # pragma: no branch
1624
+ select_index = self.open_elements.index(select_node)
1625
+ target_index = self.open_elements.index(target)
1626
+ if target_index < select_index:
1627
+ self._parse_error("unexpected-end-tag-in-select", tag_name=name)
1628
+ return None
1570
1629
  self._adoption_agency(name)
1571
1630
  return None
1572
1631
  if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
1632
+ self._parse_error("unexpected-end-tag-in-select", tag_name=name)
1573
1633
  # Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
1574
1634
  # But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
1575
1635
  select_idx = None
@@ -1586,11 +1646,9 @@ class TreeBuilderModesMixin:
1586
1646
  popped = self.open_elements.pop()
1587
1647
  if popped.name == name:
1588
1648
  break
1589
- else:
1590
- self._parse_error("unexpected-end-tag", tag_name=name)
1591
1649
  return None
1592
1650
  if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
1593
- self._parse_error("unexpected-end-tag", tag_name=name)
1651
+ self._parse_error("unexpected-end-tag-in-select", tag_name=name)
1594
1652
  # select is always in scope in IN_SELECT mode
1595
1653
  self._pop_until_any_inclusive({"select"})
1596
1654
  self._reset_insertion_mode()
@@ -1773,8 +1831,11 @@ class TreeBuilderModesMixin:
1773
1831
  def _mode_after_frameset(self, token: Any) -> Any:
1774
1832
  # Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
1775
1833
  if isinstance(token, CharacterTokens):
1776
- # Only whitespace characters allowed; ignore all others
1777
- whitespace = "".join(ch for ch in token.data if ch in "\t\n\f\r ")
1834
+ # Only whitespace characters allowed; non-whitespace is a parse error.
1835
+ data = token.data or ""
1836
+ whitespace = "".join(ch for ch in data if ch in "\t\n\f\r ")
1837
+ if any(ch not in "\t\n\f\r " for ch in data):
1838
+ self._parse_error("unexpected-token-after-frameset")
1778
1839
  if whitespace:
1779
1840
  self._append_text(whitespace)
1780
1841
  return None
@@ -1787,6 +1848,9 @@ class TreeBuilderModesMixin:
1787
1848
  if token.kind == Tag.END and token.name == "html":
1788
1849
  self.mode = InsertionMode.AFTER_AFTER_FRAMESET
1789
1850
  return None
1851
+ if token.kind == Tag.END and token.name == "frameset":
1852
+ self._parse_error("unexpected-token-after-frameset")
1853
+ return None
1790
1854
  if token.kind == Tag.START and token.name == "noframes":
1791
1855
  # Insert noframes element directly and switch to TEXT mode
1792
1856
  self._insert_element(token, push=True)