justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/treebuilder.py CHANGED
@@ -1,5 +1,8 @@
1
1
  # ruff: noqa: S101, PLW2901
2
2
 
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
3
6
 
4
7
  from .constants import (
5
8
  BUTTON_SCOPE_TERMINATORS,
@@ -23,13 +26,16 @@ from .constants import (
23
26
  )
24
27
  from .errors import generate_error_message
25
28
  from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
26
- from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
29
+ from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
27
30
  from .treebuilder_modes import TreeBuilderModesMixin
28
31
  from .treebuilder_utils import (
29
32
  InsertionMode,
30
33
  is_all_whitespace,
31
34
  )
32
35
 
36
+ if TYPE_CHECKING:
37
+ from collections.abc import Callable
38
+
33
39
 
34
40
  class TreeBuilder(TreeBuilderModesMixin):
35
41
  __slots__ = (
@@ -53,6 +59,7 @@ class TreeBuilder(TreeBuilderModesMixin):
53
59
  "open_elements",
54
60
  "original_mode",
55
61
  "pending_table_text",
62
+ "pending_table_text_should_error",
56
63
  "quirks_mode",
57
64
  "table_text_original_mode",
58
65
  "template_modes",
@@ -60,12 +67,39 @@ class TreeBuilder(TreeBuilderModesMixin):
60
67
  "tokenizer_state_override",
61
68
  )
62
69
 
70
+ _body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
71
+ _body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
72
+ _body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
73
+ _mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
74
+ active_formatting: list[Any]
75
+ collect_errors: bool
76
+ document: SimpleDomNode
77
+ errors: list[ParseError]
78
+ form_element: Any | None
79
+ fragment_context: Any | None
80
+ fragment_context_element: Any | None
81
+ frameset_ok: bool
82
+ head_element: Any | None
83
+ iframe_srcdoc: bool
84
+ ignore_lf: bool
85
+ insert_from_table: bool
86
+ mode: InsertionMode
87
+ open_elements: list[Any]
88
+ original_mode: InsertionMode | None # type: ignore[assignment]
89
+ pending_table_text: list[str]
90
+ pending_table_text_should_error: bool
91
+ quirks_mode: str
92
+ table_text_original_mode: InsertionMode | None # type: ignore[assignment]
93
+ template_modes: list[InsertionMode]
94
+ tokenizer: Any | None
95
+ tokenizer_state_override: Any | None # type: ignore[assignment]
96
+
63
97
  def __init__(
64
98
  self,
65
- fragment_context=None,
66
- iframe_srcdoc=False,
67
- collect_errors=False,
68
- ):
99
+ fragment_context: Any | None = None,
100
+ iframe_srcdoc: bool = False,
101
+ collect_errors: bool = False,
102
+ ) -> None:
69
103
  self.fragment_context = fragment_context
70
104
  self.iframe_srcdoc = iframe_srcdoc
71
105
  self.collect_errors = collect_errors
@@ -86,6 +120,7 @@ class TreeBuilder(TreeBuilderModesMixin):
86
120
  self.quirks_mode = "no-quirks"
87
121
  self.ignore_lf = False
88
122
  self.active_formatting = []
123
+ self.pending_table_text_should_error = False
89
124
  self.insert_from_table = False
90
125
  self.pending_table_text = []
91
126
  self.template_modes = []
@@ -134,10 +169,10 @@ class TreeBuilder(TreeBuilderModesMixin):
134
169
  # This prevents frameset from being inserted in fragment contexts
135
170
  self.frameset_ok = False
136
171
 
137
- def _set_quirks_mode(self, mode):
172
+ def _set_quirks_mode(self, mode: str) -> None:
138
173
  self.quirks_mode = mode
139
174
 
140
- def _parse_error(self, code, tag_name=None, token=None):
175
+ def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
141
176
  if not self.collect_errors:
142
177
  return
143
178
  # Use the position of the last emitted token (set by tokenizer before emit)
@@ -174,13 +209,16 @@ class TreeBuilder(TreeBuilderModesMixin):
174
209
  code,
175
210
  line=line,
176
211
  column=column,
212
+ category="treebuilder",
177
213
  message=message,
178
214
  source_html=source_html,
179
215
  end_column=end_column,
180
216
  )
181
217
  )
182
218
 
183
- def _has_element_in_scope(self, target, terminators=None, check_integration_points=True):
219
+ def _has_element_in_scope(
220
+ self, target: str, terminators: set[str] | None = None, check_integration_points: bool = True
221
+ ) -> bool:
184
222
  if terminators is None:
185
223
  terminators = DEFAULT_SCOPE_TERMINATORS
186
224
  for node in reversed(self.open_elements):
@@ -196,33 +234,33 @@ class TreeBuilder(TreeBuilderModesMixin):
196
234
  return False
197
235
  return False
198
236
 
199
- def _has_element_in_button_scope(self, target):
237
+ def _has_element_in_button_scope(self, target: str) -> bool:
200
238
  return self._has_element_in_scope(target, BUTTON_SCOPE_TERMINATORS)
201
239
 
202
- def _pop_until_inclusive(self, name):
240
+ def _pop_until_inclusive(self, name: str) -> None:
203
241
  # Callers ensure element exists on stack
204
242
  while self.open_elements: # pragma: no branch
205
243
  node = self.open_elements.pop()
206
244
  if node.name == name:
207
245
  break
208
246
 
209
- def _pop_until_any_inclusive(self, names):
247
+ def _pop_until_any_inclusive(self, names: set[str]) -> None:
210
248
  # Pop elements until we find one in names (callers ensure element exists)
211
249
  while self.open_elements:
212
250
  node = self.open_elements.pop()
213
251
  if node.name in names:
214
252
  return
215
253
 
216
- def _close_p_element(self):
254
+ def _close_p_element(self) -> bool:
217
255
  if self._has_element_in_button_scope("p"):
218
256
  self._generate_implied_end_tags("p")
219
257
  if self.open_elements[-1].name != "p":
220
- self._parse_error("end-tag-too-early", tag_name="p")
258
+ self._parse_error("unexpected-end-tag", tag_name="p")
221
259
  self._pop_until_inclusive("p")
222
260
  return True
223
261
  return False
224
262
 
225
- def process_token(self, token):
263
+ def process_token(self, token: Any) -> Any:
226
264
  # Optimization: Use type() identity check instead of isinstance
227
265
  token_type = type(token)
228
266
  if token_type is DoctypeToken:
@@ -276,7 +314,7 @@ class TreeBuilder(TreeBuilderModesMixin):
276
314
  self._insert_element(current_token, push=True)
277
315
  result = None
278
316
  elif name == "p":
279
- result = self._handle_body_start_paragraph(current_token)
317
+ result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
280
318
  elif name == "span":
281
319
  if self.active_formatting:
282
320
  self._reconstruct_active_formatting_elements()
@@ -284,7 +322,7 @@ class TreeBuilder(TreeBuilderModesMixin):
284
322
  self.frameset_ok = False
285
323
  result = None
286
324
  elif name == "a":
287
- result = self._handle_body_start_a(current_token)
325
+ result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
288
326
  elif name == "br" or name == "img":
289
327
  if self.active_formatting:
290
328
  self._reconstruct_active_formatting_elements()
@@ -331,7 +369,7 @@ class TreeBuilder(TreeBuilderModesMixin):
331
369
  if name == "br":
332
370
  self._parse_error("unexpected-end-tag", tag_name=name)
333
371
  br_tag = Tag(0, "br", {}, False)
334
- result = self._handle_body_start_br(br_tag)
372
+ result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
335
373
  elif name in FORMATTING_ELEMENTS:
336
374
  self._adoption_agency(name)
337
375
  result = None
@@ -350,7 +388,7 @@ class TreeBuilder(TreeBuilderModesMixin):
350
388
  self._append_text(current_token.data)
351
389
  result = None
352
390
  elif token_type is CommentToken:
353
- result = self._handle_comment_in_body(current_token)
391
+ result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
354
392
  else: # EOFToken
355
393
  result = self._handle_eof_in_body(current_token)
356
394
  else:
@@ -379,11 +417,7 @@ class TreeBuilder(TreeBuilderModesMixin):
379
417
  # Tokenizer guarantees non-empty data
380
418
  data = current_token.data
381
419
  if "\x00" in data:
382
- self._parse_error("invalid-codepoint")
383
420
  data = data.replace("\x00", "")
384
- if "\x0c" in data:
385
- self._parse_error("invalid-codepoint")
386
- data = data.replace("\x0c", "")
387
421
  if data:
388
422
  if not is_all_whitespace(data):
389
423
  self._reconstruct_active_formatting_elements()
@@ -437,10 +471,11 @@ class TreeBuilder(TreeBuilderModesMixin):
437
471
  current_token = token_override
438
472
  # Continue loop to reprocess
439
473
 
440
- def finish(self):
474
+ def finish(self) -> SimpleDomNode:
441
475
  if self.fragment_context is not None:
442
476
  # For fragments, remove the html wrapper and promote its children
443
477
  # Note: html element is always created in fragment setup, so children[0] is always "html"
478
+ assert self.document.children is not None
444
479
  root = self.document.children[0]
445
480
  context_elem = self.fragment_context_element
446
481
  if context_elem is not None and context_elem.parent is root:
@@ -460,20 +495,28 @@ class TreeBuilder(TreeBuilderModesMixin):
460
495
 
461
496
  # Insertion mode dispatch ------------------------------------------------
462
497
 
463
- def _append_comment_to_document(self, text):
498
+ def _append_comment_to_document(self, text: str) -> None:
464
499
  node = SimpleDomNode("#comment", data=text)
500
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
501
+ node._origin_pos = self.tokenizer.last_token_start_pos
502
+ if node._origin_pos is not None:
503
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
465
504
  self.document.append_child(node)
466
505
 
467
- def _append_comment(self, text, parent=None):
506
+ def _append_comment(self, text: str, parent: Any | None = None) -> None:
468
507
  if parent is None:
469
508
  parent = self._current_node_or_html()
470
509
  # If parent is a template, insert into its content fragment
471
510
  if type(parent) is TemplateNode and parent.template_content:
472
511
  parent = parent.template_content
473
512
  node = SimpleDomNode("#comment", data=text)
513
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
514
+ node._origin_pos = self.tokenizer.last_token_start_pos
515
+ if node._origin_pos is not None:
516
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
474
517
  parent.append_child(node)
475
518
 
476
- def _append_text(self, text):
519
+ def _append_text(self, text: str) -> None:
477
520
  if self.ignore_lf:
478
521
  self.ignore_lf = False
479
522
  if text.startswith("\n"):
@@ -481,6 +524,9 @@ class TreeBuilder(TreeBuilderModesMixin):
481
524
  if not text:
482
525
  return
483
526
 
527
+ if "\f" in text:
528
+ text = text.replace("\f", " ")
529
+
484
530
  # Guard against empty stack
485
531
  if not self.open_elements: # pragma: no cover
486
532
  return
@@ -493,10 +539,14 @@ class TreeBuilder(TreeBuilderModesMixin):
493
539
  if children:
494
540
  last_child = children[-1]
495
541
  if type(last_child) is TextNode:
496
- last_child.data += text
542
+ last_child.data = (last_child.data or "") + text
497
543
  return
498
544
 
499
545
  node = TextNode(text)
546
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
547
+ node._origin_pos = self.tokenizer.last_token_start_pos
548
+ if node._origin_pos is not None:
549
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
500
550
  children.append(node)
501
551
  node.parent = target
502
552
  return
@@ -517,32 +567,45 @@ class TreeBuilder(TreeBuilderModesMixin):
517
567
  return
518
568
 
519
569
  node = TextNode(text)
570
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
571
+ node._origin_pos = self.tokenizer.last_token_start_pos
572
+ if node._origin_pos is not None:
573
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
520
574
  reference_node = parent.children[position] if position < len(parent.children) else None
521
575
  parent.insert_before(node, reference_node)
522
576
 
523
- def _current_node_or_html(self):
577
+ def _current_node_or_html(self) -> Any:
524
578
  if self.open_elements:
525
579
  return self.open_elements[-1]
526
580
  # Stack empty - find html element in document children
527
581
  # (may not be first if there are comments/doctype before it)
528
- for child in self.document.children:
529
- if child.name == "html":
530
- return child
531
- # Edge case: no html found, return first child or None
532
- return self.document.children[0] if self.document.children else None # pragma: no cover
582
+ children = self.document.children
583
+ if children is not None:
584
+ for child in children:
585
+ if child.name == "html":
586
+ return child
587
+ # Edge case: no html found, return first child or None
588
+ return children[0] if children else None # pragma: no cover
589
+ return None # pragma: no cover
533
590
 
534
- def _create_root(self, attrs):
591
+ def _create_root(self, attrs: dict[str, str | None]) -> Any:
535
592
  node = SimpleDomNode("html", attrs=attrs, namespace="html")
536
593
  self.document.append_child(node)
537
594
  self.open_elements.append(node)
538
595
  return node
539
596
 
540
- def _insert_element(self, tag, *, push, namespace="html"):
597
+ def _insert_element(self, tag: Any, *, push: bool, namespace: str = "html") -> Any:
598
+ node: ElementNode | TemplateNode
541
599
  if tag.name == "template" and namespace == "html":
542
600
  node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
543
601
  else:
544
602
  node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
545
603
 
604
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
605
+ node._origin_pos = tag.start_pos
606
+ if node._origin_pos is not None:
607
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
608
+
546
609
  # Fast path for common case: not inserting from table
547
610
  if not self.insert_from_table:
548
611
  target = self._current_node_or_html()
@@ -553,7 +616,8 @@ class TreeBuilder(TreeBuilderModesMixin):
553
616
  else:
554
617
  parent = target
555
618
 
556
- parent.append_child(node)
619
+ if parent is not None: # pragma: no branch
620
+ parent.append_child(node)
557
621
 
558
622
  if push:
559
623
  self.open_elements.append(node)
@@ -567,28 +631,30 @@ class TreeBuilder(TreeBuilderModesMixin):
567
631
  self.open_elements.append(node)
568
632
  return node
569
633
 
570
- def _insert_phantom(self, name):
571
- tag = Tag(Tag.START, name, {}, False)
634
+ def _insert_phantom(self, name: str) -> Any:
635
+ attrs: dict[str, str | None] = {}
636
+ tag = Tag(Tag.START, name, attrs, False)
572
637
  return self._insert_element(tag, push=True)
573
638
 
574
- def _insert_body_if_missing(self):
639
+ def _insert_body_if_missing(self) -> None:
575
640
  html_node = self._find_last_on_stack("html")
576
641
  node = SimpleDomNode("body", namespace="html")
577
- html_node.append_child(node)
578
- node.parent = html_node
642
+ if html_node is not None: # pragma: no branch
643
+ html_node.append_child(node)
644
+ node.parent = html_node
579
645
  self.open_elements.append(node)
580
646
 
581
- def _create_element(self, name, namespace, attrs):
647
+ def _create_element(self, name: str, namespace: str | None, attrs: dict[str, str | None]) -> Any:
582
648
  ns = namespace or "html"
583
649
  return ElementNode(name, attrs, ns)
584
650
 
585
- def _pop_current(self):
651
+ def _pop_current(self) -> Any:
586
652
  return self.open_elements.pop()
587
653
 
588
- def _in_scope(self, name):
654
+ def _in_scope(self, name: str) -> bool:
589
655
  return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
590
656
 
591
- def _close_element_by_name(self, name):
657
+ def _close_element_by_name(self, name: str) -> None:
592
658
  # Simple element closing - pops from the named element onwards
593
659
  # Used for explicit closing (e.g., when button start tag closes existing button)
594
660
  # Caller guarantees name is on the stack via _has_in_scope check
@@ -599,7 +665,7 @@ class TreeBuilder(TreeBuilderModesMixin):
599
665
  return
600
666
  index -= 1
601
667
 
602
- def _any_other_end_tag(self, name):
668
+ def _any_other_end_tag(self, name: str) -> None:
603
669
  # Spec: "Any other end tag" in IN_BODY mode
604
670
  # Loop through stack backwards (always terminates: html is special)
605
671
  index = len(self.open_elements) - 1
@@ -624,7 +690,7 @@ class TreeBuilder(TreeBuilderModesMixin):
624
690
  # Continue to next node (previous in stack)
625
691
  index -= 1
626
692
 
627
- def _add_missing_attributes(self, node, attrs):
693
+ def _add_missing_attributes(self, node: Any, attrs: dict[str, str]) -> None:
628
694
  if not attrs:
629
695
  return
630
696
  existing = node.attrs
@@ -632,19 +698,19 @@ class TreeBuilder(TreeBuilderModesMixin):
632
698
  if name not in existing:
633
699
  existing[name] = value
634
700
 
635
- def _remove_from_open_elements(self, node):
701
+ def _remove_from_open_elements(self, node: Any) -> bool:
636
702
  for index, current in enumerate(self.open_elements):
637
703
  if current is node:
638
704
  del self.open_elements[index]
639
705
  return True
640
706
  return False
641
707
 
642
- def _is_special_element(self, node):
708
+ def _is_special_element(self, node: Any) -> bool:
643
709
  if node.namespace not in {None, "html"}:
644
710
  return False
645
711
  return node.name in SPECIAL_ELEMENTS
646
712
 
647
- def _find_active_formatting_index(self, name):
713
+ def _find_active_formatting_index(self, name: str) -> int | None:
648
714
  for index in range(len(self.active_formatting) - 1, -1, -1):
649
715
  entry = self.active_formatting[index]
650
716
  if entry is FORMAT_MARKER:
@@ -653,28 +719,28 @@ class TreeBuilder(TreeBuilderModesMixin):
653
719
  return index
654
720
  return None
655
721
 
656
- def _find_active_formatting_index_by_node(self, node):
722
+ def _find_active_formatting_index_by_node(self, node: Any) -> int | None:
657
723
  for index in range(len(self.active_formatting) - 1, -1, -1):
658
724
  entry = self.active_formatting[index]
659
725
  if entry is not FORMAT_MARKER and entry["node"] is node:
660
726
  return index
661
727
  return None
662
728
 
663
- def _clone_attributes(self, attrs):
729
+ def _clone_attributes(self, attrs: dict[str, str | None]) -> dict[str, str | None]:
664
730
  return attrs.copy() if attrs else {}
665
731
 
666
- def _attrs_signature(self, attrs):
732
+ def _attrs_signature(self, attrs: dict[str, str | None]) -> tuple[tuple[str, str], ...]:
667
733
  if not attrs:
668
734
  return ()
669
- items = []
735
+ items: list[tuple[str, str]] = []
670
736
  for name, value in attrs.items():
671
737
  items.append((name, value or ""))
672
738
  items.sort()
673
739
  return tuple(items)
674
740
 
675
- def _find_active_formatting_duplicate(self, name, attrs):
741
+ def _find_active_formatting_duplicate(self, name: str, attrs: dict[str, str | None]) -> int | None:
676
742
  signature = self._attrs_signature(attrs)
677
- matches = []
743
+ matches: list[int] = []
678
744
  for index, entry in enumerate(self.active_formatting):
679
745
  if entry is FORMAT_MARKER:
680
746
  matches.clear()
@@ -686,7 +752,7 @@ class TreeBuilder(TreeBuilderModesMixin):
686
752
  return matches[0]
687
753
  return None
688
754
 
689
- def _has_active_formatting_entry(self, name):
755
+ def _has_active_formatting_entry(self, name: str) -> bool:
690
756
  for index in range(len(self.active_formatting) - 1, -1, -1):
691
757
  entry = self.active_formatting[index]
692
758
  if entry is FORMAT_MARKER:
@@ -695,7 +761,7 @@ class TreeBuilder(TreeBuilderModesMixin):
695
761
  return True
696
762
  return False
697
763
 
698
- def _remove_last_active_formatting_by_name(self, name):
764
+ def _remove_last_active_formatting_by_name(self, name: str) -> None:
699
765
  for index in range(len(self.active_formatting) - 1, -1, -1):
700
766
  entry = self.active_formatting[index]
701
767
  if entry is FORMAT_MARKER:
@@ -704,13 +770,13 @@ class TreeBuilder(TreeBuilderModesMixin):
704
770
  del self.active_formatting[index]
705
771
  return
706
772
 
707
- def _remove_last_open_element_by_name(self, name):
773
+ def _remove_last_open_element_by_name(self, name: str) -> None:
708
774
  for index in range(len(self.open_elements) - 1, -1, -1):
709
775
  if self.open_elements[index].name == name:
710
776
  del self.open_elements[index]
711
777
  return
712
778
 
713
- def _append_active_formatting_entry(self, name, attrs, node):
779
+ def _append_active_formatting_entry(self, name: str, attrs: dict[str, str | None], node: Any) -> None:
714
780
  entry_attrs = self._clone_attributes(attrs)
715
781
  signature = self._attrs_signature(entry_attrs)
716
782
  self.active_formatting.append(
@@ -722,20 +788,20 @@ class TreeBuilder(TreeBuilderModesMixin):
722
788
  },
723
789
  )
724
790
 
725
- def _clear_active_formatting_up_to_marker(self):
791
+ def _clear_active_formatting_up_to_marker(self) -> None:
726
792
  while self.active_formatting:
727
793
  entry = self.active_formatting.pop()
728
794
  if entry is FORMAT_MARKER:
729
795
  break
730
796
 
731
- def _push_formatting_marker(self):
797
+ def _push_formatting_marker(self) -> None:
732
798
  self.active_formatting.append(FORMAT_MARKER)
733
799
 
734
- def _remove_formatting_entry(self, index):
800
+ def _remove_formatting_entry(self, index: int) -> None:
735
801
  assert 0 <= index < len(self.active_formatting), f"Invalid index: {index}"
736
802
  del self.active_formatting[index]
737
803
 
738
- def _reconstruct_active_formatting_elements(self):
804
+ def _reconstruct_active_formatting_elements(self) -> None:
739
805
  if not self.active_formatting:
740
806
  return
741
807
  last_entry = self.active_formatting[-1]
@@ -757,22 +823,26 @@ class TreeBuilder(TreeBuilderModesMixin):
757
823
  entry = self.active_formatting[index]
758
824
  tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
759
825
  new_node = self._insert_element(tag, push=True)
826
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
827
+ new_node._origin_pos = entry["node"].origin_offset
828
+ new_node._origin_line = entry["node"].origin_line
829
+ new_node._origin_col = entry["node"].origin_col
760
830
  entry["node"] = new_node
761
831
  index += 1
762
832
 
763
- def _insert_node_at(self, parent, index, node):
833
+ def _insert_node_at(self, parent: Any, index: int, node: Any) -> None:
764
834
  reference_node = None
765
835
  if index is not None and index < len(parent.children):
766
836
  reference_node = parent.children[index]
767
837
  parent.insert_before(node, reference_node)
768
838
 
769
- def _find_last_on_stack(self, name):
839
+ def _find_last_on_stack(self, name: str) -> Any | None:
770
840
  for node in reversed(self.open_elements):
771
841
  if node.name == name:
772
842
  return node
773
843
  return None
774
844
 
775
- def _clear_stack_until(self, names):
845
+ def _clear_stack_until(self, names: set[str]) -> None:
776
846
  # All callers include "html" in names, so this always terminates via break
777
847
  while self.open_elements:
778
848
  node = self.open_elements[-1]
@@ -780,7 +850,7 @@ class TreeBuilder(TreeBuilderModesMixin):
780
850
  break
781
851
  self.open_elements.pop()
782
852
 
783
- def _generate_implied_end_tags(self, exclude=None):
853
+ def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
784
854
  # Always terminates: html is not in IMPLIED_END_TAGS
785
855
  while self.open_elements: # pragma: no branch
786
856
  node = self.open_elements[-1]
@@ -789,10 +859,10 @@ class TreeBuilder(TreeBuilderModesMixin):
789
859
  continue
790
860
  break
791
861
 
792
- def _has_in_table_scope(self, name):
862
+ def _has_in_table_scope(self, name: str) -> bool:
793
863
  return self._has_element_in_scope(name, TABLE_SCOPE_TERMINATORS, check_integration_points=False)
794
864
 
795
- def _close_table_cell(self):
865
+ def _close_table_cell(self) -> bool:
796
866
  if self._has_in_table_scope("td"):
797
867
  self._end_table_cell("td")
798
868
  return True
@@ -801,7 +871,7 @@ class TreeBuilder(TreeBuilderModesMixin):
801
871
  return True
802
872
  return False
803
873
 
804
- def _end_table_cell(self, name):
874
+ def _end_table_cell(self, name: str) -> None:
805
875
  self._generate_implied_end_tags(name)
806
876
  while self.open_elements:
807
877
  node = self.open_elements.pop()
@@ -810,15 +880,22 @@ class TreeBuilder(TreeBuilderModesMixin):
810
880
  self._clear_active_formatting_up_to_marker()
811
881
  self.mode = InsertionMode.IN_ROW
812
882
 
813
- def _flush_pending_table_text(self):
883
+ def _flush_pending_table_text(self) -> None:
814
884
  data = "".join(self.pending_table_text)
815
885
  self.pending_table_text.clear()
816
- if not data:
886
+ if not data: # pragma: no cover
817
887
  return
818
888
  if is_all_whitespace(data):
819
889
  self._append_text(data)
820
890
  return
821
- self._parse_error("foster-parenting-character")
891
+
892
+ if self.pending_table_text_should_error:
893
+ # html5lib reports one foster-parenting error per non-whitespace character.
894
+ for ch in data:
895
+ if ch not in " \t\n\r\f":
896
+ self._parse_error("foster-parenting-character")
897
+ self.pending_table_text_should_error = False
898
+
822
899
  previous = self.insert_from_table
823
900
  self.insert_from_table = True
824
901
  try:
@@ -827,7 +904,7 @@ class TreeBuilder(TreeBuilderModesMixin):
827
904
  finally:
828
905
  self.insert_from_table = previous
829
906
 
830
- def _close_table_element(self):
907
+ def _close_table_element(self) -> bool:
831
908
  if not self._has_in_table_scope("table"):
832
909
  self._parse_error("unexpected-end-tag", tag_name="table")
833
910
  return False
@@ -840,7 +917,7 @@ class TreeBuilder(TreeBuilderModesMixin):
840
917
  self._reset_insertion_mode()
841
918
  return True
842
919
 
843
- def _reset_insertion_mode(self):
920
+ def _reset_insertion_mode(self) -> None:
844
921
  # Walk stack backwards - html element always terminates
845
922
  idx = len(self.open_elements) - 1
846
923
  while idx >= 0:
@@ -880,7 +957,7 @@ class TreeBuilder(TreeBuilderModesMixin):
880
957
  # Empty stack fallback
881
958
  self.mode = InsertionMode.IN_BODY
882
959
 
883
- def _should_foster_parenting(self, target, *, for_tag=None, is_text=False):
960
+ def _should_foster_parenting(self, target: Any, *, for_tag: str | None = None, is_text: bool = False) -> bool:
884
961
  if not self.insert_from_table:
885
962
  return False
886
963
  if target.name not in TABLE_FOSTER_TARGETS:
@@ -891,17 +968,17 @@ class TreeBuilder(TreeBuilderModesMixin):
891
968
  return False
892
969
  return True
893
970
 
894
- def _lower_ascii(self, value):
971
+ def _lower_ascii(self, value: str) -> str:
895
972
  return value.lower() if value else ""
896
973
 
897
- def _adjust_svg_tag_name(self, name):
974
+ def _adjust_svg_tag_name(self, name: str) -> str:
898
975
  lowered = self._lower_ascii(name)
899
976
  return SVG_TAG_NAME_ADJUSTMENTS.get(lowered, name)
900
977
 
901
- def _prepare_foreign_attributes(self, namespace, attrs):
978
+ def _prepare_foreign_attributes(self, namespace: str, attrs: dict[str, str | None]) -> dict[str, str | None]:
902
979
  if not attrs:
903
980
  return {}
904
- adjusted = {}
981
+ adjusted: dict[str, str | None] = {}
905
982
  for name, value in attrs.items():
906
983
  lower_name = self._lower_ascii(name)
907
984
  if namespace == "math" and lower_name in MATHML_ATTRIBUTE_ADJUSTMENTS:
@@ -920,14 +997,14 @@ class TreeBuilder(TreeBuilderModesMixin):
920
997
  adjusted[name] = value
921
998
  return adjusted
922
999
 
923
- def _node_attribute_value(self, node, name):
1000
+ def _node_attribute_value(self, node: Any, name: str) -> str | None:
924
1001
  target = self._lower_ascii(name)
925
1002
  for attr_name, attr_value in node.attrs.items():
926
1003
  if self._lower_ascii(attr_name) == target:
927
1004
  return attr_value or ""
928
1005
  return None
929
1006
 
930
- def _is_html_integration_point(self, node):
1007
+ def _is_html_integration_point(self, node: Any) -> bool:
931
1008
  # annotation-xml is an HTML integration point only with specific encoding values
932
1009
  if node.namespace == "math" and node.name == "annotation-xml":
933
1010
  encoding = self._node_attribute_value(node, "encoding")
@@ -939,15 +1016,15 @@ class TreeBuilder(TreeBuilderModesMixin):
939
1016
  # SVG foreignObject, desc, and title are always HTML integration points
940
1017
  return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
941
1018
 
942
- def _is_mathml_text_integration_point(self, node):
1019
+ def _is_mathml_text_integration_point(self, node: Any) -> bool:
943
1020
  if node.namespace != "math":
944
1021
  return False
945
1022
  return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
946
1023
 
947
- def _adjusted_current_node(self):
1024
+ def _adjusted_current_node(self) -> Any:
948
1025
  return self.open_elements[-1]
949
1026
 
950
- def _should_use_foreign_content(self, token):
1027
+ def _should_use_foreign_content(self, token: AnyToken) -> bool:
951
1028
  current = self._adjusted_current_node()
952
1029
  # HTML namespace elements don't use foreign content rules
953
1030
  # (unreachable in practice as foreign content mode only entered for foreign elements)
@@ -978,13 +1055,13 @@ class TreeBuilder(TreeBuilderModesMixin):
978
1055
 
979
1056
  return True
980
1057
 
981
- def _foreign_breakout_font(self, tag):
1058
+ def _foreign_breakout_font(self, tag: Any) -> bool:
982
1059
  for name in tag.attrs.keys():
983
1060
  if self._lower_ascii(name) in {"color", "face", "size"}:
984
1061
  return True
985
1062
  return False
986
1063
 
987
- def _pop_until_html_or_integration_point(self):
1064
+ def _pop_until_html_or_integration_point(self) -> None:
988
1065
  # Always terminates: html element has html namespace
989
1066
  while self.open_elements: # pragma: no branch
990
1067
  node = self.open_elements[-1]
@@ -996,7 +1073,7 @@ class TreeBuilder(TreeBuilderModesMixin):
996
1073
  return
997
1074
  self.open_elements.pop()
998
1075
 
999
- def _process_foreign_content(self, token):
1076
+ def _process_foreign_content(self, token: AnyToken) -> Any | None:
1000
1077
  current = self._adjusted_current_node()
1001
1078
 
1002
1079
  if isinstance(token, CharacterTokens):
@@ -1076,7 +1153,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1076
1153
 
1077
1154
  # Per HTML5 spec: if first node doesn't match, it's a parse error
1078
1155
  if first:
1079
- self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)
1156
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
1080
1157
  first = False
1081
1158
 
1082
1159
  # If we hit an HTML element that doesn't match, process in secondary mode
@@ -1087,7 +1164,9 @@ class TreeBuilder(TreeBuilderModesMixin):
1087
1164
  # Stack exhausted without finding match - ignore tag (defensive, html always terminates)
1088
1165
  return None # pragma: no cover
1089
1166
 
1090
- def _appropriate_insertion_location(self, override_target=None, *, foster_parenting=False):
1167
+ def _appropriate_insertion_location(
1168
+ self, override_target: Any | None = None, *, foster_parenting: bool = False
1169
+ ) -> tuple[Any, int]:
1091
1170
  if override_target is not None:
1092
1171
  target = override_target
1093
1172
  else:
@@ -1106,24 +1185,28 @@ class TreeBuilder(TreeBuilderModesMixin):
1106
1185
  parent = last_table.parent
1107
1186
  # Table has no parent (e.g., detached) - fall back to target
1108
1187
  if parent is None: # pragma: no cover
1109
- return target, len(target.children)
1188
+ children = target.children
1189
+ return target, len(children) if children is not None else 0
1190
+ assert parent.children is not None
1110
1191
  position = parent.children.index(last_table)
1111
1192
  return parent, position
1112
1193
 
1113
1194
  # If target is a template element, insert into its content document fragment
1114
1195
  if type(target) is TemplateNode and target.template_content:
1115
- return target.template_content, len(target.template_content.children)
1196
+ children = target.template_content.children
1197
+ return target.template_content, len(children) if children is not None else 0
1116
1198
 
1117
- return target, len(target.children)
1199
+ target_children = target.children
1200
+ return target, len(target_children) if target_children is not None else 0
1118
1201
 
1119
- def _populate_selectedcontent(self, root):
1202
+ def _populate_selectedcontent(self, root: Any) -> None:
1120
1203
  """Populate selectedcontent elements with content from selected option.
1121
1204
 
1122
1205
  Per HTML5 spec: selectedcontent mirrors the content of the selected option,
1123
1206
  or the first option if none is selected.
1124
1207
  """
1125
1208
  # Find all select elements
1126
- selects = []
1209
+ selects: list[Any] = []
1127
1210
  self._find_elements(root, "select", selects)
1128
1211
 
1129
1212
  for select in selects:
@@ -1133,7 +1216,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1133
1216
  continue
1134
1217
 
1135
1218
  # Find all option elements
1136
- options = []
1219
+ options: list[Any] = []
1137
1220
  self._find_elements(select, "option", options)
1138
1221
 
1139
1222
  # Find selected option or use first one
@@ -1153,7 +1236,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1153
1236
  # Clone content from selected option to selectedcontent
1154
1237
  self._clone_children(selected_option, selectedcontent)
1155
1238
 
1156
- def _find_elements(self, node, name, result):
1239
+ def _find_elements(self, node: Any, name: str, result: list[Any]) -> None:
1157
1240
  """Recursively find all elements with given name."""
1158
1241
  if node.name == name:
1159
1242
  result.append(node)
@@ -1162,7 +1245,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1162
1245
  for child in node.children:
1163
1246
  self._find_elements(child, name, result)
1164
1247
 
1165
- def _find_element(self, node, name):
1248
+ def _find_element(self, node: Any, name: str) -> Any | None:
1166
1249
  """Find first element with given name."""
1167
1250
  if node.name == name:
1168
1251
  return node
@@ -1174,21 +1257,21 @@ class TreeBuilder(TreeBuilderModesMixin):
1174
1257
  return result
1175
1258
  return None
1176
1259
 
1177
- def _clone_children(self, source, target):
1260
+ def _clone_children(self, source: Any, target: Any) -> None:
1178
1261
  """Deep clone all children from source to target."""
1179
1262
  for child in source.children:
1180
1263
  target.append_child(child.clone_node(deep=True))
1181
1264
 
1182
- def _has_in_scope(self, name):
1265
+ def _has_in_scope(self, name: str) -> bool:
1183
1266
  return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
1184
1267
 
1185
- def _has_in_list_item_scope(self, name):
1268
+ def _has_in_list_item_scope(self, name: str) -> bool:
1186
1269
  return self._has_element_in_scope(name, LIST_ITEM_SCOPE_TERMINATORS)
1187
1270
 
1188
- def _has_in_definition_scope(self, name):
1271
+ def _has_in_definition_scope(self, name: str) -> bool:
1189
1272
  return self._has_element_in_scope(name, DEFINITION_SCOPE_TERMINATORS)
1190
1273
 
1191
- def _has_any_in_scope(self, names):
1274
+ def _has_any_in_scope(self, names: set[str]) -> bool:
1192
1275
  # Always terminates: html is in DEFAULT_SCOPE_TERMINATORS
1193
1276
  terminators = DEFAULT_SCOPE_TERMINATORS
1194
1277
  idx = len(self.open_elements) - 1
@@ -1201,7 +1284,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1201
1284
  idx -= 1
1202
1285
  return False # pragma: no cover - html always terminates
1203
1286
 
1204
- def process_characters(self, data):
1287
+ def process_characters(self, data: str) -> Any:
1205
1288
  """Optimized path for character tokens."""
1206
1289
  # Check for foreign content first
1207
1290
  current_node = self.open_elements[-1] if self.open_elements else None
@@ -1211,19 +1294,21 @@ class TreeBuilder(TreeBuilderModesMixin):
1211
1294
  return self.process_token(CharacterTokens(data))
1212
1295
 
1213
1296
  if self.mode == InsertionMode.IN_BODY:
1214
- if "\x00" in data:
1215
- self._parse_error("invalid-codepoint")
1216
- data = data.replace("\x00", "")
1217
-
1218
1297
  if not data:
1219
1298
  return TokenSinkResult.Continue
1299
+ if "\x00" in data:
1300
+ data = data.replace("\x00", "")
1301
+ if not data:
1302
+ return TokenSinkResult.Continue
1220
1303
 
1221
1304
  if is_all_whitespace(data):
1222
- self._reconstruct_active_formatting_elements()
1305
+ if self.active_formatting:
1306
+ self._reconstruct_active_formatting_elements()
1223
1307
  self._append_text(data)
1224
1308
  return TokenSinkResult.Continue
1225
1309
 
1226
- self._reconstruct_active_formatting_elements()
1310
+ if self.active_formatting:
1311
+ self._reconstruct_active_formatting_elements()
1227
1312
  self.frameset_ok = False
1228
1313
  self._append_text(data)
1229
1314
  return TokenSinkResult.Continue