justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/treebuilder.py CHANGED
@@ -26,7 +26,7 @@ from .constants import (
26
26
  )
27
27
  from .errors import generate_error_message
28
28
  from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
29
- from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
29
+ from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
30
30
  from .treebuilder_modes import TreeBuilderModesMixin
31
31
  from .treebuilder_utils import (
32
32
  InsertionMode,
@@ -43,6 +43,9 @@ class TreeBuilder(TreeBuilderModesMixin):
43
43
  "_body_start_handlers",
44
44
  "_body_token_handlers",
45
45
  "_mode_handlers",
46
+ "_pending_end_tag_end",
47
+ "_pending_end_tag_name",
48
+ "_pending_end_tag_start",
46
49
  "active_formatting",
47
50
  "collect_errors",
48
51
  "document",
@@ -65,12 +68,17 @@ class TreeBuilder(TreeBuilderModesMixin):
65
68
  "template_modes",
66
69
  "tokenizer",
67
70
  "tokenizer_state_override",
71
+ "track_tag_spans",
68
72
  )
69
73
 
70
74
  _body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
71
75
  _body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
72
76
  _body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
73
77
  _mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
78
+ _pending_end_tag_name: str | None
79
+ _pending_end_tag_start: int | None
80
+ _pending_end_tag_end: int | None
81
+ track_tag_spans: bool
74
82
  active_formatting: list[Any]
75
83
  collect_errors: bool
76
84
  document: SimpleDomNode
@@ -99,10 +107,12 @@ class TreeBuilder(TreeBuilderModesMixin):
99
107
  fragment_context: Any | None = None,
100
108
  iframe_srcdoc: bool = False,
101
109
  collect_errors: bool = False,
110
+ track_tag_spans: bool = False,
102
111
  ) -> None:
103
112
  self.fragment_context = fragment_context
104
113
  self.iframe_srcdoc = iframe_srcdoc
105
114
  self.collect_errors = collect_errors
115
+ self.track_tag_spans = bool(track_tag_spans)
106
116
  self.errors = []
107
117
  self.tokenizer = None # Set by parser after tokenizer is created
108
118
  self.fragment_context_element = None
@@ -114,6 +124,9 @@ class TreeBuilder(TreeBuilderModesMixin):
114
124
  self.original_mode = None
115
125
  self.table_text_original_mode = None
116
126
  self.open_elements = []
127
+ self._pending_end_tag_name = None
128
+ self._pending_end_tag_start = None
129
+ self._pending_end_tag_end = None
117
130
  self.head_element = None
118
131
  self.form_element = None
119
132
  self.frameset_ok = True
@@ -172,7 +185,7 @@ class TreeBuilder(TreeBuilderModesMixin):
172
185
  def _set_quirks_mode(self, mode: str) -> None:
173
186
  self.quirks_mode = mode
174
187
 
175
- def _parse_error(self, code: str, tag_name: str | None = None, token: Any = None) -> None:
188
+ def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
176
189
  if not self.collect_errors:
177
190
  return
178
191
  # Use the position of the last emitted token (set by tokenizer before emit)
@@ -209,6 +222,7 @@ class TreeBuilder(TreeBuilderModesMixin):
209
222
  code,
210
223
  line=line,
211
224
  column=column,
225
+ category="treebuilder",
212
226
  message=message,
213
227
  source_html=source_html,
214
228
  end_column=end_column,
@@ -239,14 +253,14 @@ class TreeBuilder(TreeBuilderModesMixin):
239
253
  def _pop_until_inclusive(self, name: str) -> None:
240
254
  # Callers ensure element exists on stack
241
255
  while self.open_elements: # pragma: no branch
242
- node = self.open_elements.pop()
256
+ node = self._pop_current()
243
257
  if node.name == name:
244
258
  break
245
259
 
246
260
  def _pop_until_any_inclusive(self, names: set[str]) -> None:
247
261
  # Pop elements until we find one in names (callers ensure element exists)
248
262
  while self.open_elements:
249
- node = self.open_elements.pop()
263
+ node = self._pop_current()
250
264
  if node.name in names:
251
265
  return
252
266
 
@@ -273,202 +287,218 @@ class TreeBuilder(TreeBuilderModesMixin):
273
287
 
274
288
  current_token = token
275
289
  force_html_mode = False
290
+ if token_type is Tag and token.kind == Tag.END:
291
+ self._pending_end_tag_name = token.name
292
+ if self.track_tag_spans:
293
+ self._pending_end_tag_start = token.start_pos
294
+ self._pending_end_tag_end = token.end_pos
295
+ else:
296
+ self._pending_end_tag_start = None
297
+ self._pending_end_tag_end = None
276
298
 
277
299
  # Cache mode handlers list for speed
278
300
  mode_handlers = self._MODE_HANDLERS
279
301
 
280
- while True:
281
- # Update token type for current token (it might have changed if reprocessed)
282
- token_type = type(current_token)
283
-
284
- # Optimization: Check for HTML namespace first (common case)
285
- current_node = self.open_elements[-1] if self.open_elements else None
286
- is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
287
-
288
- if force_html_mode or is_html_namespace:
289
- force_html_mode = False
290
- if self.mode == InsertionMode.IN_BODY:
291
- # Inline _mode_in_body for performance
292
- if token_type is Tag:
293
- # Inline _handle_tag_in_body
294
- if current_token.kind == 0: # Tag.START
295
- name = current_token.name
296
- if name == "div" or name == "ul" or name == "ol":
297
- # Inline _handle_body_start_block_with_p
298
- # Check if p is in button scope (html always terminates)
299
- has_p = False
300
- idx = len(self.open_elements) - 1
301
- while idx >= 0: # pragma: no branch
302
- node = self.open_elements[idx]
303
- if node.name == "p":
304
- has_p = True
305
- break
306
- if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
307
- break
308
- idx -= 1
309
-
310
- if has_p:
311
- self._close_p_element()
312
-
313
- self._insert_element(current_token, push=True)
314
- result = None
315
- elif name == "p":
316
- result = self._handle_body_start_paragraph(current_token)
317
- elif name == "span":
318
- if self.active_formatting:
319
- self._reconstruct_active_formatting_elements()
320
- self._insert_element(current_token, push=True)
321
- self.frameset_ok = False
322
- result = None
323
- elif name == "a":
324
- result = self._handle_body_start_a(current_token)
325
- elif name == "br" or name == "img":
326
- if self.active_formatting:
327
- self._reconstruct_active_formatting_elements()
328
- self._insert_element(current_token, push=False)
329
- self.frameset_ok = False
330
- result = None
331
- elif name == "hr":
332
- has_p = False
333
- idx = len(self.open_elements) - 1
334
- while idx >= 0: # pragma: no branch
335
- node = self.open_elements[idx]
336
- if node.name == "p":
337
- has_p = True
338
- break
339
- if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
340
- break
341
- idx -= 1
342
-
343
- if has_p:
344
- self._close_p_element()
345
-
346
- self._insert_element(current_token, push=False)
347
- self.frameset_ok = False
348
- result = None
349
- else:
350
- handler = self._BODY_START_HANDLERS.get(name)
351
- if handler:
352
- result = handler(self, current_token)
353
- else:
354
- # Inline _handle_body_start_default
355
- # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
302
+ try:
303
+ while True:
304
+ # Update token type for current token (it might have changed if reprocessed)
305
+ token_type = type(current_token)
306
+
307
+ # Optimization: Check for HTML namespace first (common case)
308
+ current_node = self.open_elements[-1] if self.open_elements else None
309
+ is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
310
+
311
+ if force_html_mode or is_html_namespace:
312
+ force_html_mode = False
313
+ if self.mode == InsertionMode.IN_BODY:
314
+ # Inline _mode_in_body for performance
315
+ if token_type is Tag:
316
+ # Inline _handle_tag_in_body
317
+ if current_token.kind == 0: # Tag.START
318
+ name = current_token.name
319
+ if name == "div" or name == "ul" or name == "ol":
320
+ # Inline _handle_body_start_block_with_p
321
+ # Check if p is in button scope (html always terminates)
322
+ has_p = False
323
+ idx = len(self.open_elements) - 1
324
+ while idx >= 0: # pragma: no branch
325
+ node = self.open_elements[idx]
326
+ if node.name == "p":
327
+ has_p = True
328
+ break
329
+ if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
330
+ break
331
+ idx -= 1
332
+
333
+ if has_p:
334
+ self._close_p_element()
335
+
336
+ self._insert_element(current_token, push=True)
337
+ result = None
338
+ elif name == "p":
339
+ result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
340
+ elif name == "span":
356
341
  if self.active_formatting:
357
342
  self._reconstruct_active_formatting_elements()
358
343
  self._insert_element(current_token, push=True)
359
- if current_token.self_closing:
360
- self._parse_error(
361
- "non-void-html-element-start-tag-with-trailing-solidus",
362
- tag_name=current_token.name,
363
- )
364
344
  self.frameset_ok = False
365
345
  result = None
366
- else:
367
- name = current_token.name
368
- if name == "br":
369
- self._parse_error("unexpected-end-tag", tag_name=name)
370
- br_tag = Tag(0, "br", {}, False)
371
- result = self._handle_body_start_br(br_tag)
372
- elif name in FORMATTING_ELEMENTS:
373
- self._adoption_agency(name)
374
- result = None
375
- else:
376
- handler = self._BODY_END_HANDLERS.get(name)
377
- if handler:
378
- result = handler(self, current_token)
346
+ elif name == "a":
347
+ result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
348
+ elif name == "br" or name == "img":
349
+ if self.active_formatting:
350
+ self._reconstruct_active_formatting_elements()
351
+ self._insert_element(current_token, push=False)
352
+ self.frameset_ok = False
353
+ result = None
354
+ elif name == "hr":
355
+ has_p = False
356
+ idx = len(self.open_elements) - 1
357
+ while idx >= 0: # pragma: no branch
358
+ node = self.open_elements[idx]
359
+ if node.name == "p":
360
+ has_p = True
361
+ break
362
+ if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
363
+ break
364
+ idx -= 1
365
+
366
+ if has_p:
367
+ self._close_p_element()
368
+
369
+ self._insert_element(current_token, push=False)
370
+ self.frameset_ok = False
371
+ result = None
379
372
  else:
380
- self._any_other_end_tag(name)
373
+ handler = self._BODY_START_HANDLERS.get(name)
374
+ if handler:
375
+ result = handler(self, current_token)
376
+ else:
377
+ # Inline _handle_body_start_default
378
+ # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
379
+ if self.active_formatting:
380
+ self._reconstruct_active_formatting_elements()
381
+ self._insert_element(current_token, push=True)
382
+ if current_token.self_closing:
383
+ self._parse_error(
384
+ "non-void-html-element-start-tag-with-trailing-solidus",
385
+ tag_name=current_token.name,
386
+ )
387
+ self.frameset_ok = False
388
+ result = None
389
+ else:
390
+ name = current_token.name
391
+ if name == "br":
392
+ self._parse_error("unexpected-end-tag", tag_name=name)
393
+ br_tag = Tag(0, "br", {}, False)
394
+ result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
395
+ elif name in FORMATTING_ELEMENTS:
396
+ self._adoption_agency(name)
381
397
  result = None
382
- elif token_type is CharacterTokens:
383
- # Inline _handle_characters_in_body
384
- # Only non-whitespace data reaches here (whitespace handled in process_characters)
385
- self.frameset_ok = False
386
- self._reconstruct_active_formatting_elements()
387
- self._append_text(current_token.data)
388
- result = None
389
- elif token_type is CommentToken:
390
- result = self._handle_comment_in_body(current_token)
391
- else: # EOFToken
392
- result = self._handle_eof_in_body(current_token)
393
- else:
394
- result = mode_handlers[self.mode](self, current_token)
395
- elif self._should_use_foreign_content(current_token):
396
- result = self._process_foreign_content(current_token)
397
- else:
398
- # Foreign content stack logic
399
- current = current_node
400
- # Only pop foreign elements if we're NOT at an HTML/MathML integration point
401
- # and NOT about to insert a new foreign element (svg/math)
402
- if not isinstance(current_token, EOFToken):
403
- # Don't pop at integration points - they stay on stack to receive content
404
- if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
405
- pass
406
- # Don't pop when inserting new svg/math elements
407
- if isinstance(current_token, Tag) and current_token.kind == Tag.START:
408
- # Optimization: Tokenizer already lowercases tag names
409
- name_lower = current_token.name
410
- if name_lower in {"svg", "math"}:
411
- pass
412
-
413
- # Special handling: text at integration points inserts directly, bypassing mode dispatch
414
- if isinstance(current_token, CharacterTokens):
415
- if self._is_mathml_text_integration_point(current):
416
- # Tokenizer guarantees non-empty data
417
- data = current_token.data
418
- if "\x00" in data:
419
- data = data.replace("\x00", "")
420
- if data:
421
- if not is_all_whitespace(data):
422
- self._reconstruct_active_formatting_elements()
423
- self.frameset_ok = False
424
- self._append_text(data)
425
- result = None
398
+ else:
399
+ handler = self._BODY_END_HANDLERS.get(name)
400
+ if handler:
401
+ result = handler(self, current_token)
402
+ else:
403
+ self._any_other_end_tag(name)
404
+ result = None
405
+ elif token_type is CharacterTokens:
406
+ # Inline _handle_characters_in_body
407
+ # Only non-whitespace data reaches here (whitespace handled in process_characters)
408
+ self.frameset_ok = False
409
+ self._reconstruct_active_formatting_elements()
410
+ self._append_text(current_token.data)
411
+ result = None
412
+ elif token_type is CommentToken:
413
+ result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
414
+ else: # EOFToken
415
+ result = self._handle_eof_in_body(current_token)
426
416
  else:
427
417
  result = mode_handlers[self.mode](self, current_token)
418
+ elif self._should_use_foreign_content(current_token):
419
+ result = self._process_foreign_content(current_token)
428
420
  else:
429
- # At integration points inside foreign content, check if table tags make sense.
430
- if (
431
- (self._is_mathml_text_integration_point(current) or self._is_html_integration_point(current))
432
- and isinstance(current_token, Tag)
433
- and current_token.kind == Tag.START
434
- and self.mode not in {InsertionMode.IN_BODY}
435
- ):
436
- # Check if we're in a table mode but without an actual table in scope
437
- # If so, table tags should be ignored (use IN_BODY mode)
438
- is_table_mode = self.mode in {
439
- InsertionMode.IN_TABLE,
440
- InsertionMode.IN_TABLE_BODY,
441
- InsertionMode.IN_ROW,
442
- InsertionMode.IN_CELL,
443
- InsertionMode.IN_CAPTION,
444
- InsertionMode.IN_COLUMN_GROUP,
445
- }
446
- has_table_in_scope = self._has_in_table_scope("table")
447
- if is_table_mode and not has_table_in_scope:
448
- # Temporarily use IN_BODY mode for this tag
449
- saved_mode = self.mode
450
- self.mode = InsertionMode.IN_BODY
451
- result = mode_handlers[self.mode](self, current_token)
452
- # Restore mode if no mode change was requested
453
- if self.mode == InsertionMode.IN_BODY: # pragma: no branch
454
- self.mode = saved_mode
421
+ # Foreign content stack logic
422
+ current = current_node
423
+ # Only pop foreign elements if we're NOT at an HTML/MathML integration point
424
+ # and NOT about to insert a new foreign element (svg/math)
425
+ if not isinstance(current_token, EOFToken):
426
+ # Don't pop at integration points - they stay on stack to receive content
427
+ if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
428
+ pass
429
+ # Don't pop when inserting new svg/math elements
430
+ if isinstance(current_token, Tag) and current_token.kind == Tag.START:
431
+ # Optimization: Tokenizer already lowercases tag names
432
+ name_lower = current_token.name
433
+ if name_lower in {"svg", "math"}:
434
+ pass
435
+
436
+ # Special handling: text at integration points inserts directly, bypassing mode dispatch
437
+ if isinstance(current_token, CharacterTokens):
438
+ if self._is_mathml_text_integration_point(current):
439
+ # Tokenizer guarantees non-empty data
440
+ data = current_token.data
441
+ if "\x00" in data:
442
+ data = data.replace("\x00", "")
443
+ if data:
444
+ if not is_all_whitespace(data):
445
+ self._reconstruct_active_formatting_elements()
446
+ self.frameset_ok = False
447
+ self._append_text(data)
448
+ result = None
455
449
  else:
456
450
  result = mode_handlers[self.mode](self, current_token)
457
451
  else:
458
- result = mode_handlers[self.mode](self, current_token)
452
+ # At integration points inside foreign content, check if table tags make sense.
453
+ if (
454
+ (
455
+ self._is_mathml_text_integration_point(current)
456
+ or self._is_html_integration_point(current)
457
+ )
458
+ and isinstance(current_token, Tag)
459
+ and current_token.kind == Tag.START
460
+ and self.mode not in {InsertionMode.IN_BODY}
461
+ ):
462
+ # Check if we're in a table mode but without an actual table in scope
463
+ # If so, table tags should be ignored (use IN_BODY mode)
464
+ is_table_mode = self.mode in {
465
+ InsertionMode.IN_TABLE,
466
+ InsertionMode.IN_TABLE_BODY,
467
+ InsertionMode.IN_ROW,
468
+ InsertionMode.IN_CELL,
469
+ InsertionMode.IN_CAPTION,
470
+ InsertionMode.IN_COLUMN_GROUP,
471
+ }
472
+ has_table_in_scope = self._has_in_table_scope("table")
473
+ if is_table_mode and not has_table_in_scope:
474
+ # Temporarily use IN_BODY mode for this tag
475
+ saved_mode = self.mode
476
+ self.mode = InsertionMode.IN_BODY
477
+ result = mode_handlers[self.mode](self, current_token)
478
+ # Restore mode if no mode change was requested
479
+ if self.mode == InsertionMode.IN_BODY: # pragma: no branch
480
+ self.mode = saved_mode
481
+ else:
482
+ result = mode_handlers[self.mode](self, current_token)
483
+ else:
484
+ result = mode_handlers[self.mode](self, current_token)
459
485
 
460
- if result is None:
461
- result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
462
- self.tokenizer_state_override = None
463
- return result_to_return
464
- # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
465
- _instruction, mode, token_override = result[0], result[1], result[2]
466
- if len(result) == 4:
467
- force_html_mode = result[3]
468
- # All mode handlers that return a tuple use "reprocess" instruction
469
- self.mode = mode
470
- current_token = token_override
471
- # Continue loop to reprocess
486
+ if result is None:
487
+ result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
488
+ self.tokenizer_state_override = None
489
+ return result_to_return
490
+ # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
491
+ _instruction, mode, token_override = result[0], result[1], result[2]
492
+ if len(result) == 4:
493
+ force_html_mode = result[3]
494
+ # All mode handlers that return a tuple use "reprocess" instruction
495
+ self.mode = mode
496
+ current_token = token_override
497
+ # Continue loop to reprocess
498
+ finally:
499
+ self._pending_end_tag_name = None
500
+ self._pending_end_tag_start = None
501
+ self._pending_end_tag_end = None
472
502
 
473
503
  def finish(self) -> SimpleDomNode:
474
504
  if self.fragment_context is not None:
@@ -490,6 +520,9 @@ class TreeBuilder(TreeBuilderModesMixin):
490
520
  # Populate selectedcontent elements per HTML5 spec
491
521
  self._populate_selectedcontent(self.document)
492
522
 
523
+ if self.tokenizer is not None and self.track_tag_spans: # pragma: no branch
524
+ self.document._source_html = self.tokenizer.buffer
525
+
493
526
  return self.document
494
527
 
495
528
  # Insertion mode dispatch ------------------------------------------------
@@ -599,6 +632,10 @@ class TreeBuilder(TreeBuilderModesMixin):
599
632
  node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
600
633
  else:
601
634
  node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
635
+ if self.track_tag_spans:
636
+ node._start_tag_start = tag.start_pos
637
+ node._start_tag_end = tag.end_pos
638
+ node._self_closing = bool(getattr(tag, "self_closing", False))
602
639
 
603
640
  if self.tokenizer is not None and self.tokenizer.track_node_locations:
604
641
  node._origin_pos = tag.start_pos
@@ -647,8 +684,23 @@ class TreeBuilder(TreeBuilderModesMixin):
647
684
  ns = namespace or "html"
648
685
  return ElementNode(name, attrs, ns)
649
686
 
687
+ def _maybe_mark_end_tag(self, node: Any) -> None:
688
+ if self._pending_end_tag_name is None:
689
+ return
690
+ if getattr(node, "name", None) != self._pending_end_tag_name:
691
+ return
692
+ node._end_tag_present = True
693
+ if self.track_tag_spans:
694
+ node._end_tag_start = self._pending_end_tag_start
695
+ node._end_tag_end = self._pending_end_tag_end
696
+ self._pending_end_tag_name = None
697
+ self._pending_end_tag_start = None
698
+ self._pending_end_tag_end = None
699
+
650
700
  def _pop_current(self) -> Any:
651
- return self.open_elements.pop()
701
+ node = self.open_elements.pop()
702
+ self._maybe_mark_end_tag(node)
703
+ return node
652
704
 
653
705
  def _in_scope(self, name: str) -> bool:
654
706
  return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
@@ -660,6 +712,7 @@ class TreeBuilder(TreeBuilderModesMixin):
660
712
  index = len(self.open_elements) - 1
661
713
  while index >= 0: # pragma: no branch
662
714
  if self.open_elements[index].name == name:
715
+ self._maybe_mark_end_tag(self.open_elements[index])
663
716
  del self.open_elements[index:]
664
717
  return
665
718
  index -= 1
@@ -677,6 +730,7 @@ class TreeBuilder(TreeBuilderModesMixin):
677
730
  # If current node is not this node, parse error
678
731
  if index != len(self.open_elements) - 1:
679
732
  self._parse_error("end-tag-too-early")
733
+ self._maybe_mark_end_tag(node)
680
734
  # Pop all elements from this node onwards
681
735
  del self.open_elements[index:]
682
736
  return
@@ -700,6 +754,7 @@ class TreeBuilder(TreeBuilderModesMixin):
700
754
  def _remove_from_open_elements(self, node: Any) -> bool:
701
755
  for index, current in enumerate(self.open_elements):
702
756
  if current is node:
757
+ self._maybe_mark_end_tag(current)
703
758
  del self.open_elements[index]
704
759
  return True
705
760
  return False
@@ -772,6 +827,7 @@ class TreeBuilder(TreeBuilderModesMixin):
772
827
  def _remove_last_open_element_by_name(self, name: str) -> None:
773
828
  for index in range(len(self.open_elements) - 1, -1, -1):
774
829
  if self.open_elements[index].name == name:
830
+ self._maybe_mark_end_tag(self.open_elements[index])
775
831
  del self.open_elements[index]
776
832
  return
777
833
 
@@ -847,14 +903,14 @@ class TreeBuilder(TreeBuilderModesMixin):
847
903
  node = self.open_elements[-1]
848
904
  if node.name in names and node.namespace in {None, "html"}:
849
905
  break
850
- self.open_elements.pop()
906
+ self._pop_current()
851
907
 
852
908
  def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
853
909
  # Always terminates: html is not in IMPLIED_END_TAGS
854
910
  while self.open_elements: # pragma: no branch
855
911
  node = self.open_elements[-1]
856
912
  if node.name in IMPLIED_END_TAGS and node.name != exclude:
857
- self.open_elements.pop()
913
+ self._pop_current()
858
914
  continue
859
915
  break
860
916
 
@@ -873,7 +929,7 @@ class TreeBuilder(TreeBuilderModesMixin):
873
929
  def _end_table_cell(self, name: str) -> None:
874
930
  self._generate_implied_end_tags(name)
875
931
  while self.open_elements:
876
- node = self.open_elements.pop()
932
+ node = self._pop_current()
877
933
  if node.name == name and node.namespace in {None, "html"}:
878
934
  break
879
935
  self._clear_active_formatting_up_to_marker()
@@ -910,7 +966,7 @@ class TreeBuilder(TreeBuilderModesMixin):
910
966
  self._generate_implied_end_tags()
911
967
  # Table verified in scope above
912
968
  while self.open_elements: # pragma: no branch
913
- node = self.open_elements.pop()
969
+ node = self._pop_current()
914
970
  if node.name == "table":
915
971
  break
916
972
  self._reset_insertion_mode()
@@ -1023,7 +1079,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1023
1079
  def _adjusted_current_node(self) -> Any:
1024
1080
  return self.open_elements[-1]
1025
1081
 
1026
- def _should_use_foreign_content(self, token: Any) -> bool:
1082
+ def _should_use_foreign_content(self, token: AnyToken) -> bool:
1027
1083
  current = self._adjusted_current_node()
1028
1084
  # HTML namespace elements don't use foreign content rules
1029
1085
  # (unreachable in practice as foreign content mode only entered for foreign elements)
@@ -1070,9 +1126,9 @@ class TreeBuilder(TreeBuilderModesMixin):
1070
1126
  return
1071
1127
  if self.fragment_context_element is not None and node is self.fragment_context_element:
1072
1128
  return
1073
- self.open_elements.pop()
1129
+ self._pop_current()
1074
1130
 
1075
- def _process_foreign_content(self, token: Any) -> Any | None:
1131
+ def _process_foreign_content(self, token: AnyToken) -> Any | None:
1076
1132
  current = self._adjusted_current_node()
1077
1133
 
1078
1134
  if isinstance(token, CharacterTokens):
@@ -1147,6 +1203,7 @@ class TreeBuilder(TreeBuilderModesMixin):
1147
1203
  if is_html:
1148
1204
  return ("reprocess", self.mode, token, True)
1149
1205
  # Otherwise it's a foreign element - pop everything from this point up
1206
+ self._maybe_mark_end_tag(node)
1150
1207
  del self.open_elements[idx:]
1151
1208
  return None
1152
1209