justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1231 @@
1
+ # ruff: noqa: S101, PLW2901
2
+
3
+
4
+ from .constants import (
5
+ BUTTON_SCOPE_TERMINATORS,
6
+ DEFAULT_SCOPE_TERMINATORS,
7
+ DEFINITION_SCOPE_TERMINATORS,
8
+ FOREIGN_ATTRIBUTE_ADJUSTMENTS,
9
+ FOREIGN_BREAKOUT_ELEMENTS,
10
+ FORMAT_MARKER,
11
+ FORMATTING_ELEMENTS,
12
+ HTML_INTEGRATION_POINT_SET,
13
+ IMPLIED_END_TAGS,
14
+ LIST_ITEM_SCOPE_TERMINATORS,
15
+ MATHML_ATTRIBUTE_ADJUSTMENTS,
16
+ MATHML_TEXT_INTEGRATION_POINT_SET,
17
+ SPECIAL_ELEMENTS,
18
+ SVG_ATTRIBUTE_ADJUSTMENTS,
19
+ SVG_TAG_NAME_ADJUSTMENTS,
20
+ TABLE_ALLOWED_CHILDREN,
21
+ TABLE_FOSTER_TARGETS,
22
+ TABLE_SCOPE_TERMINATORS,
23
+ )
24
+ from .errors import generate_error_message
25
+ from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
26
+ from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
27
+ from .treebuilder_modes import TreeBuilderModesMixin
28
+ from .treebuilder_utils import (
29
+ InsertionMode,
30
+ is_all_whitespace,
31
+ )
32
+
33
+
34
class TreeBuilder(TreeBuilderModesMixin):
    # Names of the per-instance attributes this class declares.
    # NOTE(review): process_token reads self._MODE_HANDLERS / self._BODY_START_HANDLERS /
    # self._BODY_END_HANDLERS (upper-case), which are not slots listed here - presumably
    # class-level tables supplied by the mixin; confirm the lower-case slots are still used.
    __slots__ = (
        "_body_end_handlers",
        "_body_start_handlers",
        "_body_token_handlers",
        "_mode_handlers",
        "active_formatting",
        "collect_errors",
        "document",
        "errors",
        "form_element",
        "fragment_context",
        "fragment_context_element",
        "frameset_ok",
        "head_element",
        "iframe_srcdoc",
        "ignore_lf",
        "insert_from_table",
        "mode",
        "open_elements",
        "original_mode",
        "pending_table_text",
        "quirks_mode",
        "table_text_original_mode",
        "template_modes",
        "tokenizer",
        "tokenizer_state_override",
    )
62
+
63
def __init__(
    self,
    fragment_context=None,
    iframe_srcdoc=False,
    collect_errors=False,
):
    """Set up parser state for a full document or a fragment.

    Args:
        fragment_context: ``None`` for a full document; otherwise an object
            exposing ``namespace`` and ``tag_name`` that describes the
            element the fragment is parsed inside.
        iframe_srcdoc: Stored on the instance unchanged.
        collect_errors: When true, ``_parse_error`` records errors in
            ``self.errors``.
    """
    is_fragment = fragment_context is not None

    # Caller-supplied configuration.
    self.fragment_context = fragment_context
    self.iframe_srcdoc = iframe_srcdoc
    self.collect_errors = collect_errors

    # Fresh parser state.
    self.errors = []
    self.tokenizer = None  # Set by parser after tokenizer is created
    self.fragment_context_element = None
    self.document = SimpleDomNode("#document-fragment" if is_fragment else "#document")
    self.mode = InsertionMode.INITIAL
    self.original_mode = None
    self.table_text_original_mode = None
    self.open_elements = []
    self.head_element = None
    self.form_element = None
    self.frameset_ok = True
    self.quirks_mode = "no-quirks"
    self.ignore_lf = False
    self.active_formatting = []
    self.insert_from_table = False
    self.pending_table_text = []
    self.template_modes = []
    self.tokenizer_state_override = None

    if not is_fragment:
        return

    # Fragment parsing: the stack always starts with a synthetic <html> root.
    root = self._create_element("html", None, {})
    self.document.append_child(root)
    self.open_elements.append(root)

    namespace = fragment_context.namespace
    context_name = fragment_context.tag_name or ""
    name = context_name.lower()
    in_html_namespace = namespace in {None, "html"}

    # A foreign (non-HTML) context gets a stand-in element on the stack so
    # that foreign-content rules apply to the fragment's tokens.
    if namespace and not in_html_namespace:
        if namespace == "svg":
            adjusted_name = self._adjust_svg_tag_name(context_name)
        else:
            adjusted_name = context_name
        context_element = self._create_element(adjusted_name, namespace, {})
        root.append_child(context_element)
        self.open_elements.append(context_element)
        self.fragment_context_element = context_element

    # Starting insertion mode for HTML-namespace table-related contexts.
    table_context_modes = {
        "tbody": InsertionMode.IN_TABLE_BODY,
        "thead": InsertionMode.IN_TABLE_BODY,
        "tfoot": InsertionMode.IN_TABLE_BODY,
        "tr": InsertionMode.IN_ROW,
        "td": InsertionMode.IN_CELL,
        "th": InsertionMode.IN_CELL,
        "caption": InsertionMode.IN_CAPTION,
        "colgroup": InsertionMode.IN_COLUMN_GROUP,
        "table": InsertionMode.IN_TABLE,
    }
    if name == "html":
        # No head/body is pre-created for an html context.
        self.mode = InsertionMode.BEFORE_HEAD
    elif in_html_namespace and name in table_context_modes:
        self.mode = table_context_modes[name]
    else:
        self.mode = InsertionMode.IN_BODY

    # Fragments always start with frameset_ok cleared.
    self.frameset_ok = False
136
+
137
+ def _set_quirks_mode(self, mode):
138
+ self.quirks_mode = mode
139
+
140
+ def _parse_error(self, code, tag_name=None, token=None):
141
+ if not self.collect_errors:
142
+ return
143
+ # Use the position of the last emitted token (set by tokenizer before emit)
144
+ line = None
145
+ column = None
146
+ end_column = None
147
+ if self.tokenizer: # pragma: no branch
148
+ line = self.tokenizer.last_token_line
149
+ column = self.tokenizer.last_token_column
150
+
151
+ # Calculate start and end columns based on token type for precise highlighting
152
+ # Note: column from tokenizer points AFTER the last character (0-indexed)
153
+ if token is not None and isinstance(token, Tag):
154
+ # Tag: <name> or </name> plus attributes
155
+ tag_len = len(token.name) + 2 # < + name + >
156
+ if token.kind == Tag.END:
157
+ tag_len += 1 # </name>
158
+ # Add attribute lengths
159
+ for attr_name, attr_value in token.attrs.items():
160
+ tag_len += 1 + len(attr_name) # space + name
161
+ if attr_value:
162
+ tag_len += 1 + 2 + len(attr_value) # = + "value"
163
+ if token.self_closing:
164
+ tag_len += 1 # /
165
+ # column points after >, so start is column - tag_len + 1 (for 1-indexed)
166
+ start_column = column - tag_len + 1
167
+ column = start_column
168
+ end_column = column + tag_len
169
+
170
+ message = generate_error_message(code, tag_name)
171
+ source_html = self.tokenizer.buffer if self.tokenizer else None
172
+ self.errors.append(
173
+ ParseError(
174
+ code,
175
+ line=line,
176
+ column=column,
177
+ message=message,
178
+ source_html=source_html,
179
+ end_column=end_column,
180
+ )
181
+ )
182
+
183
+ def _has_element_in_scope(self, target, terminators=None, check_integration_points=True):
184
+ if terminators is None:
185
+ terminators = DEFAULT_SCOPE_TERMINATORS
186
+ for node in reversed(self.open_elements):
187
+ if node.name == target:
188
+ return True
189
+ ns = node.namespace
190
+ if ns == "html" or ns is None:
191
+ if node.name in terminators:
192
+ return False
193
+ elif check_integration_points and (
194
+ self._is_html_integration_point(node) or self._is_mathml_text_integration_point(node)
195
+ ):
196
+ return False
197
+ return False
198
+
199
def _has_element_in_button_scope(self, target):
    """Report whether *target* is in scope, using the button-scope boundary set."""
    button_boundaries = BUTTON_SCOPE_TERMINATORS
    return self._has_element_in_scope(target, button_boundaries)
201
+
202
+ def _pop_until_inclusive(self, name):
203
+ # Callers ensure element exists on stack
204
+ while self.open_elements: # pragma: no branch
205
+ node = self.open_elements.pop()
206
+ if node.name == name:
207
+ break
208
+
209
+ def _pop_until_any_inclusive(self, names):
210
+ # Pop elements until we find one in names (callers ensure element exists)
211
+ while self.open_elements:
212
+ node = self.open_elements.pop()
213
+ if node.name in names:
214
+ return
215
+
216
+ def _close_p_element(self):
217
+ if self._has_element_in_button_scope("p"):
218
+ self._generate_implied_end_tags("p")
219
+ if self.open_elements[-1].name != "p":
220
+ self._parse_error("end-tag-too-early", tag_name="p")
221
+ self._pop_until_inclusive("p")
222
+ return True
223
+ return False
224
+
225
def process_token(self, token):
    """Feed one tokenizer token through the tree-construction state machine.

    DOCTYPE tokens are handled up front. Every other token enters a loop that
    dispatches on the current node's namespace and on ``self.mode``; a handler
    may return a tuple asking for the (possibly replaced) token to be
    reprocessed in another mode, in which case the loop runs again.

    Returns:
        ``self.tokenizer_state_override`` if a handler set one (it is cleared
        before returning), otherwise ``TokenSinkResult.Continue``.
    """
    # Optimization: Use type() identity check instead of isinstance
    token_type = type(token)
    if token_type is DoctypeToken:
        # Check for foreign content first - DOCTYPE in SVG/MathML is a parse error
        if self.open_elements:
            current = self.open_elements[-1]
            if current.namespace not in {None, "html"}:
                self._parse_error("unexpected-doctype")
                return TokenSinkResult.Continue
        return self._handle_doctype(token)

    current_token = token
    force_html_mode = False

    # Cache mode handlers list for speed
    mode_handlers = self._MODE_HANDLERS

    while True:
        # Update token type for current token (it might have changed if reprocessed)
        token_type = type(current_token)

        # Optimization: Check for HTML namespace first (common case)
        current_node = self.open_elements[-1] if self.open_elements else None
        is_html_namespace = current_node is None or current_node.namespace in {None, "html"}

        if force_html_mode or is_html_namespace:
            # force_html_mode applies to a single reprocessing pass only.
            force_html_mode = False
            if self.mode == InsertionMode.IN_BODY:
                # Inline _mode_in_body for performance
                if token_type is Tag:
                    # Inline _handle_tag_in_body
                    if current_token.kind == 0:  # Tag.START
                        name = current_token.name
                        if name == "div" or name == "ul" or name == "ol":
                            # Inline _handle_body_start_block_with_p
                            # Check if p is in button scope (html always terminates)
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=True)
                            result = None
                        elif name == "p":
                            result = self._handle_body_start_paragraph(current_token)
                        elif name == "span":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            self._insert_element(current_token, push=True)
                            self.frameset_ok = False
                            result = None
                        elif name == "a":
                            result = self._handle_body_start_a(current_token)
                        elif name == "br" or name == "img":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            # Inserted but not pushed: these elements never stay open.
                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        elif name == "hr":
                            # Same button-scope search for an open <p> as the div/ul/ol branch.
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        else:
                            handler = self._BODY_START_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                # Inline _handle_body_start_default
                                # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
                                if self.active_formatting:
                                    self._reconstruct_active_formatting_elements()
                                self._insert_element(current_token, push=True)
                                if current_token.self_closing:
                                    self._parse_error(
                                        "non-void-html-element-start-tag-with-trailing-solidus",
                                        tag_name=current_token.name,
                                    )
                                self.frameset_ok = False
                                result = None
                    else:
                        # End tag in body.
                        name = current_token.name
                        if name == "br":
                            # </br> is reported as an error and then treated as a <br> start tag.
                            self._parse_error("unexpected-end-tag", tag_name=name)
                            br_tag = Tag(0, "br", {}, False)
                            result = self._handle_body_start_br(br_tag)
                        elif name in FORMATTING_ELEMENTS:
                            self._adoption_agency(name)
                            result = None
                        else:
                            handler = self._BODY_END_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                self._any_other_end_tag(name)
                                result = None
                elif token_type is CharacterTokens:
                    # Inline _handle_characters_in_body
                    # Only non-whitespace data reaches here (whitespace handled in process_characters)
                    self.frameset_ok = False
                    self._reconstruct_active_formatting_elements()
                    self._append_text(current_token.data)
                    result = None
                elif token_type is CommentToken:
                    result = self._handle_comment_in_body(current_token)
                else:  # EOFToken
                    result = self._handle_eof_in_body(current_token)
            else:
                result = mode_handlers[self.mode](self, current_token)
        elif self._should_use_foreign_content(current_token):
            result = self._process_foreign_content(current_token)
        else:
            # Foreign content stack logic
            current = current_node
            # Only pop foreign elements if we're NOT at an HTML/MathML integration point
            # and NOT about to insert a new foreign element (svg/math)
            # NOTE(review): every branch in the block below ends in `pass`, so apart from
            # calling the integration-point predicates it has no effect - candidate for removal.
            if not isinstance(current_token, EOFToken):
                # Don't pop at integration points - they stay on stack to receive content
                if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
                    pass
                # Don't pop when inserting new svg/math elements
                if isinstance(current_token, Tag) and current_token.kind == Tag.START:
                    # Optimization: Tokenizer already lowercases tag names
                    name_lower = current_token.name
                    if name_lower in {"svg", "math"}:
                        pass

            # Special handling: text at integration points inserts directly, bypassing mode dispatch
            if isinstance(current_token, CharacterTokens):
                if self._is_mathml_text_integration_point(current):
                    # Tokenizer guarantees non-empty data
                    data = current_token.data
                    # NUL and form feed are each reported once and stripped.
                    if "\x00" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x00", "")
                    if "\x0c" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x0c", "")
                    if data:
                        if not is_all_whitespace(data):
                            self._reconstruct_active_formatting_elements()
                            self.frameset_ok = False
                        self._append_text(data)
                    result = None
                else:
                    result = mode_handlers[self.mode](self, current_token)
            else:
                # At integration points inside foreign content, check if table tags make sense.
                if (
                    (self._is_mathml_text_integration_point(current) or self._is_html_integration_point(current))
                    and isinstance(current_token, Tag)
                    and current_token.kind == Tag.START
                    and self.mode not in {InsertionMode.IN_BODY}
                ):
                    # Check if we're in a table mode but without an actual table in scope
                    # If so, table tags should be ignored (use IN_BODY mode)
                    is_table_mode = self.mode in {
                        InsertionMode.IN_TABLE,
                        InsertionMode.IN_TABLE_BODY,
                        InsertionMode.IN_ROW,
                        InsertionMode.IN_CELL,
                        InsertionMode.IN_CAPTION,
                        InsertionMode.IN_COLUMN_GROUP,
                    }
                    has_table_in_scope = self._has_in_table_scope("table")
                    if is_table_mode and not has_table_in_scope:
                        # Temporarily use IN_BODY mode for this tag
                        saved_mode = self.mode
                        self.mode = InsertionMode.IN_BODY
                        result = mode_handlers[self.mode](self, current_token)
                        # Restore mode if no mode change was requested
                        if self.mode == InsertionMode.IN_BODY:  # pragma: no branch
                            self.mode = saved_mode
                    else:
                        result = mode_handlers[self.mode](self, current_token)
                else:
                    result = mode_handlers[self.mode](self, current_token)

        if result is None:
            # Token fully consumed: hand back any pending tokenizer state switch exactly once.
            result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
            self.tokenizer_state_override = None
            return result_to_return
        # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
        _instruction, mode, token_override = result[0], result[1], result[2]
        if len(result) == 4:
            force_html_mode = result[3]
        # All mode handlers that return a tuple use "reprocess" instruction
        self.mode = mode
        current_token = token_override
        # Continue loop to reprocess
439
+
440
def finish(self):
    """Finalize parsing and return the document (or fragment) node.

    For fragment parses the synthetic ``html`` wrapper, and the synthetic
    context element if it is still a child of that wrapper, are removed and
    their children hoisted so they end up directly under the fragment node.
    ``_populate_selectedcontent`` is then run on the document.
    """

    def _reparent_children(source, destination):
        # Iterate over a snapshot because remove_child mutates source.children.
        for moved in list(source.children):
            source.remove_child(moved)
            destination.append_child(moved)

    if self.fragment_context is not None:
        # The html element is always created in fragment setup, so it is children[0].
        wrapper = self.document.children[0]
        synthetic = self.fragment_context_element
        if synthetic is not None and synthetic.parent is wrapper:
            _reparent_children(synthetic, wrapper)
            wrapper.remove_child(synthetic)
        _reparent_children(wrapper, self.document)
        self.document.remove_child(wrapper)

    # Populate selectedcontent elements per HTML5 spec
    self._populate_selectedcontent(self.document)

    return self.document
460
+
461
+ # Insertion mode dispatch ------------------------------------------------
462
+
463
def _append_comment_to_document(self, text):
    """Append a ``#comment`` node holding *text* directly to the document node."""
    self.document.append_child(SimpleDomNode("#comment", data=text))
466
+
467
def _append_comment(self, text, parent=None):
    """Append a ``#comment`` node holding *text* to *parent*.

    When *parent* is omitted the current node (or the html element if the
    stack is empty) is used. A template parent with a truthy
    ``template_content`` receives the comment in that content instead.
    """
    destination = self._current_node_or_html() if parent is None else parent
    if type(destination) is TemplateNode and destination.template_content:
        destination = destination.template_content
    destination.append_child(SimpleDomNode("#comment", data=text))
475
+
476
def _append_text(self, text):
    """Insert character data at the current insertion point.

    Honours a pending ``ignore_lf`` flag by dropping one leading newline,
    merges with an immediately preceding text node where possible, and falls
    back to ``_appropriate_insertion_location`` (with foster parenting when
    required) for table-related and template targets.
    """
    if self.ignore_lf:
        self.ignore_lf = False
        if text.startswith("\n"):
            text = text[1:]
            if not text:
                return

    # Guard against empty stack
    if not self.open_elements:  # pragma: no cover
        return

    current = self.open_elements[-1]
    needs_location_lookup = current.name in TABLE_FOSTER_TARGETS or type(current) is TemplateNode

    if not needs_location_lookup:
        # Common case: append straight onto the current node.
        siblings = current.children
        if siblings and type(siblings[-1]) is TextNode:
            siblings[-1].data += text
            return

        fresh = TextNode(text)
        siblings.append(fresh)
        fresh.parent = current
        return

    current = self._current_node_or_html()
    foster = self._should_foster_parenting(current, is_text=True)

    # Formatting elements must be reconstructed BEFORE the insertion location
    # is computed when foster parenting.
    if foster:
        self._reconstruct_active_formatting_elements()

    # Always use appropriate insertion location to handle templates
    parent, position = self._appropriate_insertion_location(foster_parenting=foster)

    # Coalesce with adjacent text node if possible
    if position > 0:
        previous = parent.children[position - 1]
        if previous.name == "#text":
            previous.data = (previous.data or "") + text
            return

    fresh = TextNode(text)
    following = parent.children[position] if position < len(parent.children) else None
    parent.insert_before(fresh, following)
522
+
523
+ def _current_node_or_html(self):
524
+ if self.open_elements:
525
+ return self.open_elements[-1]
526
+ # Stack empty - find html element in document children
527
+ # (may not be first if there are comments/doctype before it)
528
+ for child in self.document.children:
529
+ if child.name == "html":
530
+ return child
531
+ # Edge case: no html found, return first child or None
532
+ return self.document.children[0] if self.document.children else None # pragma: no cover
533
+
534
def _create_root(self, attrs):
    """Create the ``html`` root with *attrs*, attach it to the document, push it, and return it."""
    root = SimpleDomNode("html", attrs=attrs, namespace="html")
    self.document.append_child(root)
    self.open_elements.append(root)
    return root
539
+
540
def _insert_element(self, tag, *, push, namespace="html"):
    """Create an element for *tag* and insert it at the current insertion point.

    Args:
        tag: Token supplying ``name`` and ``attrs`` for the new element.
        push: When true the new element is also pushed onto the open-elements stack.
        namespace: Namespace of the new element; an HTML ``template`` tag
            produces a ``TemplateNode`` instead of an ``ElementNode``.

    Returns:
        The newly created node.
    """
    is_html_template = tag.name == "template" and namespace == "html"
    node_class = TemplateNode if is_html_template else ElementNode
    element = node_class(tag.name, attrs=tag.attrs, namespace=namespace)

    if self.insert_from_table:
        # Table context: the location may be foster-parented.
        current = self._current_node_or_html()
        foster = self._should_foster_parenting(current, for_tag=tag.name)
        parent, position = self._appropriate_insertion_location(foster_parenting=foster)
        self._insert_node_at(parent, position, element)
    else:
        # Fast path for common case: not inserting from table
        current = self._current_node_or_html()
        # Children of a template go into its content fragment.
        container = current.template_content if type(current) is TemplateNode else current
        container.append_child(element)

    if push:
        self.open_elements.append(element)
    return element
569
+
570
def _insert_phantom(self, name):
    """Insert and push an attribute-less HTML element named *name* that has no source token."""
    synthetic_tag = Tag(Tag.START, name, {}, False)
    return self._insert_element(synthetic_tag, push=True)
573
+
574
def _insert_body_if_missing(self):
    """Create a ``body`` element under the ``html`` element on the stack and push it.

    NOTE(review): despite the name, no check for an existing body is made
    here - presumably callers only invoke this when none exists.
    """
    html_element = self._find_last_on_stack("html")
    body = SimpleDomNode("body", namespace="html")
    html_element.append_child(body)
    body.parent = html_element
    self.open_elements.append(body)
580
+
581
def _create_element(self, name, namespace, attrs):
    """Build an unattached ``ElementNode``; a falsy *namespace* means ``"html"``."""
    resolved_namespace = namespace if namespace else "html"
    return ElementNode(name, attrs, resolved_namespace)
584
+
585
+ def _pop_current(self):
586
+ return self.open_elements.pop()
587
+
588
def _in_scope(self, name):
    """Report whether *name* is in scope using the default scope boundaries."""
    default_boundaries = DEFAULT_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, default_boundaries)
590
+
591
+ def _close_element_by_name(self, name):
592
+ # Simple element closing - pops from the named element onwards
593
+ # Used for explicit closing (e.g., when button start tag closes existing button)
594
+ # Caller guarantees name is on the stack via _has_in_scope check
595
+ index = len(self.open_elements) - 1
596
+ while index >= 0: # pragma: no branch
597
+ if self.open_elements[index].name == name:
598
+ del self.open_elements[index:]
599
+ return
600
+ index -= 1
601
+
602
+ def _any_other_end_tag(self, name):
603
+ # Spec: "Any other end tag" in IN_BODY mode
604
+ # Loop through stack backwards (always terminates: html is special)
605
+ index = len(self.open_elements) - 1
606
+ while index >= 0: # pragma: no branch
607
+ node = self.open_elements[index]
608
+
609
+ # If node's name matches the end tag name
610
+ if node.name == name:
611
+ # Generate implied end tags (except for this name)
612
+ # If current node is not this node, parse error
613
+ if index != len(self.open_elements) - 1:
614
+ self._parse_error("end-tag-too-early")
615
+ # Pop all elements from this node onwards
616
+ del self.open_elements[index:]
617
+ return
618
+
619
+ # If node is a special element, parse error and ignore the tag
620
+ if self._is_special_element(node):
621
+ self._parse_error("unexpected-end-tag", tag_name=name)
622
+ return # Ignore the end tag
623
+
624
+ # Continue to next node (previous in stack)
625
+ index -= 1
626
+
627
+ def _add_missing_attributes(self, node, attrs):
628
+ if not attrs:
629
+ return
630
+ existing = node.attrs
631
+ for name, value in attrs.items():
632
+ if name not in existing:
633
+ existing[name] = value
634
+
635
+ def _remove_from_open_elements(self, node):
636
+ for index, current in enumerate(self.open_elements):
637
+ if current is node:
638
+ del self.open_elements[index]
639
+ return True
640
+ return False
641
+
642
+ def _is_special_element(self, node):
643
+ if node.namespace not in {None, "html"}:
644
+ return False
645
+ return node.name in SPECIAL_ELEMENTS
646
+
647
+ def _find_active_formatting_index(self, name):
648
+ for index in range(len(self.active_formatting) - 1, -1, -1):
649
+ entry = self.active_formatting[index]
650
+ if entry is FORMAT_MARKER:
651
+ break
652
+ if entry["name"] == name:
653
+ return index
654
+ return None
655
+
656
+ def _find_active_formatting_index_by_node(self, node):
657
+ for index in range(len(self.active_formatting) - 1, -1, -1):
658
+ entry = self.active_formatting[index]
659
+ if entry is not FORMAT_MARKER and entry["node"] is node:
660
+ return index
661
+ return None
662
+
663
+ def _clone_attributes(self, attrs):
664
+ return attrs.copy() if attrs else {}
665
+
666
+ def _attrs_signature(self, attrs):
667
+ if not attrs:
668
+ return ()
669
+ items = []
670
+ for name, value in attrs.items():
671
+ items.append((name, value or ""))
672
+ items.sort()
673
+ return tuple(items)
674
+
675
+ def _find_active_formatting_duplicate(self, name, attrs):
676
+ signature = self._attrs_signature(attrs)
677
+ matches = []
678
+ for index, entry in enumerate(self.active_formatting):
679
+ if entry is FORMAT_MARKER:
680
+ matches.clear()
681
+ continue
682
+ existing_signature = entry["signature"]
683
+ if entry["name"] == name and existing_signature == signature:
684
+ matches.append(index)
685
+ if len(matches) >= 3:
686
+ return matches[0]
687
+ return None
688
+
689
+ def _has_active_formatting_entry(self, name):
690
+ for index in range(len(self.active_formatting) - 1, -1, -1):
691
+ entry = self.active_formatting[index]
692
+ if entry is FORMAT_MARKER:
693
+ break
694
+ if entry["name"] == name:
695
+ return True
696
+ return False
697
+
698
+ def _remove_last_active_formatting_by_name(self, name):
699
+ for index in range(len(self.active_formatting) - 1, -1, -1):
700
+ entry = self.active_formatting[index]
701
+ if entry is FORMAT_MARKER:
702
+ break
703
+ if entry["name"] == name:
704
+ del self.active_formatting[index]
705
+ return
706
+
707
+ def _remove_last_open_element_by_name(self, name):
708
+ for index in range(len(self.open_elements) - 1, -1, -1):
709
+ if self.open_elements[index].name == name:
710
+ del self.open_elements[index]
711
+ return
712
+
713
+ def _append_active_formatting_entry(self, name, attrs, node):
714
+ entry_attrs = self._clone_attributes(attrs)
715
+ signature = self._attrs_signature(entry_attrs)
716
+ self.active_formatting.append(
717
+ {
718
+ "name": name,
719
+ "attrs": entry_attrs,
720
+ "node": node,
721
+ "signature": signature,
722
+ },
723
+ )
724
+
725
+ def _clear_active_formatting_up_to_marker(self):
726
+ while self.active_formatting:
727
+ entry = self.active_formatting.pop()
728
+ if entry is FORMAT_MARKER:
729
+ break
730
+
731
def _push_formatting_marker(self):
    """Append ``FORMAT_MARKER`` to the list of active formatting elements."""
    marker = FORMAT_MARKER
    self.active_formatting.append(marker)
733
+
734
+ def _remove_formatting_entry(self, index):
735
+ assert 0 <= index < len(self.active_formatting), f"Invalid index: {index}"
736
+ del self.active_formatting[index]
737
+
738
+ def _reconstruct_active_formatting_elements(self):
739
+ if not self.active_formatting:
740
+ return
741
+ last_entry = self.active_formatting[-1]
742
+ if last_entry is FORMAT_MARKER or last_entry["node"] in self.open_elements:
743
+ return
744
+
745
+ index = len(self.active_formatting) - 1
746
+ while True:
747
+ index -= 1
748
+ if index < 0:
749
+ break
750
+ entry = self.active_formatting[index]
751
+ if entry is FORMAT_MARKER or entry["node"] in self.open_elements:
752
+ index += 1
753
+ break
754
+ if index < 0:
755
+ index = 0
756
+ while index < len(self.active_formatting):
757
+ entry = self.active_formatting[index]
758
+ tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
759
+ new_node = self._insert_element(tag, push=True)
760
+ entry["node"] = new_node
761
+ index += 1
762
+
763
+ def _insert_node_at(self, parent, index, node):
764
+ reference_node = None
765
+ if index is not None and index < len(parent.children):
766
+ reference_node = parent.children[index]
767
+ parent.insert_before(node, reference_node)
768
+
769
+ def _find_last_on_stack(self, name):
770
+ for node in reversed(self.open_elements):
771
+ if node.name == name:
772
+ return node
773
+ return None
774
+
775
+ def _clear_stack_until(self, names):
776
+ # All callers include "html" in names, so this always terminates via break
777
+ while self.open_elements:
778
+ node = self.open_elements[-1]
779
+ if node.name in names and node.namespace in {None, "html"}:
780
+ break
781
+ self.open_elements.pop()
782
+
783
+ def _generate_implied_end_tags(self, exclude=None):
784
+ # Always terminates: html is not in IMPLIED_END_TAGS
785
+ while self.open_elements: # pragma: no branch
786
+ node = self.open_elements[-1]
787
+ if node.name in IMPLIED_END_TAGS and node.name != exclude:
788
+ self.open_elements.pop()
789
+ continue
790
+ break
791
+
792
def _has_in_table_scope(self, name):
    """Report whether *name* is in table scope (integration points are not treated as boundaries)."""
    table_boundaries = TABLE_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, table_boundaries, check_integration_points=False)
794
+
795
+ def _close_table_cell(self):
796
+ if self._has_in_table_scope("td"):
797
+ self._end_table_cell("td")
798
+ return True
799
+ if self._has_in_table_scope("th"):
800
+ self._end_table_cell("th")
801
+ return True
802
+ return False
803
+
804
def _end_table_cell(self, name):
    """Close the cell named *name* and switch to the in-row insertion mode.

    Implied end tags (other than *name*) are generated, the stack is popped
    through the HTML-namespace element named *name*, and the active
    formatting list is cleared back to its last marker.
    """
    self._generate_implied_end_tags(name)
    stack = self.open_elements
    while stack:
        popped = stack.pop()
        if popped.name == name and popped.namespace in {None, "html"}:
            break
    self._clear_active_formatting_up_to_marker()
    self.mode = InsertionMode.IN_ROW
812
+
813
+ def _flush_pending_table_text(self):
814
+ data = "".join(self.pending_table_text)
815
+ self.pending_table_text.clear()
816
+ if not data:
817
+ return
818
+ if is_all_whitespace(data):
819
+ self._append_text(data)
820
+ return
821
+ self._parse_error("foster-parenting-character")
822
+ previous = self.insert_from_table
823
+ self.insert_from_table = True
824
+ try:
825
+ self._reconstruct_active_formatting_elements()
826
+ self._append_text(data)
827
+ finally:
828
+ self.insert_from_table = previous
829
+
830
+ def _close_table_element(self):
831
+ if not self._has_in_table_scope("table"):
832
+ self._parse_error("unexpected-end-tag", tag_name="table")
833
+ return False
834
+ self._generate_implied_end_tags()
835
+ # Table verified in scope above
836
+ while self.open_elements: # pragma: no branch
837
+ node = self.open_elements.pop()
838
+ if node.name == "table":
839
+ break
840
+ self._reset_insertion_mode()
841
+ return True
842
+
843
def _reset_insertion_mode(self):
    """Choose ``self.mode`` from the nearest recognised element on the stack.

    The stack is scanned from the current node downwards. The first element
    whose name has a fixed mode decides the result. A ``template`` decides
    it only when ``self.template_modes`` is non-empty (the newest template
    mode is used). With no match at all the mode becomes in-body.
    """
    mode_for_name = {
        "select": InsertionMode.IN_SELECT,
        "td": InsertionMode.IN_CELL,
        "th": InsertionMode.IN_CELL,
        "tr": InsertionMode.IN_ROW,
        "tbody": InsertionMode.IN_TABLE_BODY,
        "tfoot": InsertionMode.IN_TABLE_BODY,
        "thead": InsertionMode.IN_TABLE_BODY,
        "caption": InsertionMode.IN_CAPTION,
        "table": InsertionMode.IN_TABLE,
        # If we're resetting and head is on stack, stay in IN_HEAD
        "head": InsertionMode.IN_HEAD,
        "html": InsertionMode.IN_BODY,
    }
    for element in reversed(self.open_elements):
        element_name = element.name
        if element_name == "template":
            if self.template_modes:
                self.mode = self.template_modes[-1]
                return
            # No recorded template mode: keep scanning down the stack.
            continue
        if element_name in mode_for_name:
            self.mode = mode_for_name[element_name]
            return
    # Empty stack fallback
    self.mode = InsertionMode.IN_BODY
882
+
883
+ def _should_foster_parenting(self, target, *, for_tag=None, is_text=False):
884
+ if not self.insert_from_table:
885
+ return False
886
+ if target.name not in TABLE_FOSTER_TARGETS:
887
+ return False
888
+ if is_text:
889
+ return True
890
+ if for_tag in TABLE_ALLOWED_CHILDREN:
891
+ return False
892
+ return True
893
+
894
+ def _lower_ascii(self, value):
895
+ return value.lower() if value else ""
896
+
897
def _adjust_svg_tag_name(self, name):
    """Map a tag name to its entry in ``SVG_TAG_NAME_ADJUSTMENTS``.

    The lookup key is the lowercased form of ``name``. When the table has no
    entry for it, ``name`` is returned unchanged (not lowercased).
    """
    try:
        return SVG_TAG_NAME_ADJUSTMENTS[self._lower_ascii(name)]
    except KeyError:
        return name
900
+
901
+ def _prepare_foreign_attributes(self, namespace, attrs):
902
+ if not attrs:
903
+ return {}
904
+ adjusted = {}
905
+ for name, value in attrs.items():
906
+ lower_name = self._lower_ascii(name)
907
+ if namespace == "math" and lower_name in MATHML_ATTRIBUTE_ADJUSTMENTS:
908
+ name = MATHML_ATTRIBUTE_ADJUSTMENTS[lower_name]
909
+ lower_name = self._lower_ascii(name)
910
+ elif namespace == "svg" and lower_name in SVG_ATTRIBUTE_ADJUSTMENTS:
911
+ name = SVG_ATTRIBUTE_ADJUSTMENTS[lower_name]
912
+ lower_name = self._lower_ascii(name)
913
+
914
+ foreign_adjustment = FOREIGN_ATTRIBUTE_ADJUSTMENTS.get(lower_name)
915
+ if foreign_adjustment is not None:
916
+ prefix, local, _ = foreign_adjustment
917
+ name = f"{prefix}:{local}"
918
+
919
+ # Tokenizer deduplicates attributes, so name collision impossible here
920
+ adjusted[name] = value
921
+ return adjusted
922
+
923
+ def _node_attribute_value(self, node, name):
924
+ target = self._lower_ascii(name)
925
+ for attr_name, attr_value in node.attrs.items():
926
+ if self._lower_ascii(attr_name) == target:
927
+ return attr_value or ""
928
+ return None
929
+
930
+ def _is_html_integration_point(self, node):
931
+ # annotation-xml is an HTML integration point only with specific encoding values
932
+ if node.namespace == "math" and node.name == "annotation-xml":
933
+ encoding = self._node_attribute_value(node, "encoding")
934
+ if encoding:
935
+ enc_lower = encoding.lower()
936
+ if enc_lower in {"text/html", "application/xhtml+xml"}:
937
+ return True
938
+ return False # annotation-xml without proper encoding is NOT an integration point
939
+ # SVG foreignObject, desc, and title are always HTML integration points
940
+ return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
941
+
942
+ def _is_mathml_text_integration_point(self, node):
943
+ if node.namespace != "math":
944
+ return False
945
+ return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
946
+
947
+ def _adjusted_current_node(self):
948
+ return self.open_elements[-1]
949
+
950
+ def _should_use_foreign_content(self, token):
951
+ current = self._adjusted_current_node()
952
+ # HTML namespace elements don't use foreign content rules
953
+ # (unreachable in practice as foreign content mode only entered for foreign elements)
954
+ if current.namespace in {None, "html"}:
955
+ return False # pragma: no cover
956
+
957
+ if isinstance(token, EOFToken):
958
+ return False
959
+
960
+ if self._is_mathml_text_integration_point(current):
961
+ if isinstance(token, CharacterTokens):
962
+ return False
963
+ if isinstance(token, Tag) and token.kind == Tag.START:
964
+ name_lower = self._lower_ascii(token.name)
965
+ if name_lower not in {"mglyph", "malignmark"}:
966
+ return False
967
+
968
+ if current.namespace == "math" and current.name == "annotation-xml":
969
+ if isinstance(token, Tag) and token.kind == Tag.START:
970
+ if self._lower_ascii(token.name) == "svg":
971
+ return False
972
+
973
+ if self._is_html_integration_point(current):
974
+ if isinstance(token, CharacterTokens):
975
+ return False
976
+ if isinstance(token, Tag) and token.kind == Tag.START:
977
+ return False
978
+
979
+ return True
980
+
981
+ def _foreign_breakout_font(self, tag):
982
+ for name in tag.attrs.keys():
983
+ if self._lower_ascii(name) in {"color", "face", "size"}:
984
+ return True
985
+ return False
986
+
987
+ def _pop_until_html_or_integration_point(self):
988
+ # Always terminates: html element has html namespace
989
+ while self.open_elements: # pragma: no branch
990
+ node = self.open_elements[-1]
991
+ if node.namespace in {None, "html"}:
992
+ return
993
+ if self._is_html_integration_point(node):
994
+ return
995
+ if self.fragment_context_element is not None and node is self.fragment_context_element:
996
+ return
997
+ self.open_elements.pop()
998
+
999
def _process_foreign_content(self, token):
    """Handle one token while the current node is a foreign element.

    Args:
        token: A ``CharacterTokens``, ``CommentToken`` or ``Tag``.

    Returns:
        ``None`` when the token was fully handled here, or a tuple
        ``("reprocess", self.mode, token, True)`` when it must be handled
        again under the current HTML insertion mode.
        NOTE(review): the meaning of the trailing ``True`` is defined by the
        caller and is not visible in this method.
    """
    current = self._adjusted_current_node()

    if isinstance(token, CharacterTokens):
        text = token.data or ""
        # One parse error per NUL; each NUL becomes U+FFFD in the output.
        for _ in range(text.count("\x00")):
            self._parse_error("invalid-codepoint-in-foreign-content")
        # Anything other than NUL or whitespace clears frameset_ok.
        if any(ch not in "\x00\t\n\f\r " for ch in text):
            self.frameset_ok = False
        self._append_text(text.replace("\x00", "\ufffd"))
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    # Foreign content only receives CharacterTokens, CommentToken, or Tag (not EOF)
    assert isinstance(token, Tag), f"Unexpected token type in foreign content: {type(token)}"
    tag_lower = self._lower_ascii(token.name)
    opens = token.kind == Tag.START
    if opens:
        leaves_foreign = tag_lower in FOREIGN_BREAKOUT_ELEMENTS or (
            tag_lower == "font" and self._foreign_breakout_font(token)
        )
    else:
        # Only START and END tag kinds exist
        assert token.kind == Tag.END, f"Unexpected tag kind: {token.kind}"
        # Special case: </br> and </p> end tags trigger breakout from foreign content
        leaves_foreign = tag_lower in {"br", "p"}

    if leaves_foreign:
        self._parse_error("unexpected-html-element-in-foreign-content")
        self._pop_until_html_or_integration_point()
        self._reset_insertion_mode()
        # self.mode is read only after the reset above.
        return ("reprocess", self.mode, token, True)

    if opens:
        namespace = current.namespace
        element_name = self._adjust_svg_tag_name(token.name) if namespace == "svg" else token.name
        foreign_attrs = self._prepare_foreign_attributes(namespace, token.attrs)
        foreign_tag = Tag(Tag.START, element_name, foreign_attrs, token.self_closing)
        # For foreign elements, honor the self-closing flag
        self._insert_element(foreign_tag, push=not token.self_closing, namespace=namespace)
        return None

    # Foreign end tag: walk the stack from the innermost element outwards
    # looking for an element with a matching (case-insensitive) name.
    stack = self.open_elements
    top = len(stack) - 1
    for position in range(top, -1, -1):
        node = stack[position]
        in_html = node.namespace in {None, "html"}

        if self._lower_ascii(node.name) == tag_lower:
            context = self.fragment_context_element
            if context is not None and node is context:
                self._parse_error("unexpected-end-tag-in-fragment-context")
                return None
            # A matching HTML element hands the token back to the HTML rules.
            if in_html:
                return ("reprocess", self.mode, token, True)
            # Matching foreign element: pop it and everything above it.
            del stack[position:]
            return None

        # A current node that does not match the end tag is a parse error.
        if position == top:
            self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)

        # A non-matching HTML element ends the walk.
        if in_html:
            return ("reprocess", self.mode, token, True)

    # Stack exhausted without finding match - ignore tag (defensive, html always terminates)
    return None  # pragma: no cover
1089
+
1090
+ def _appropriate_insertion_location(self, override_target=None, *, foster_parenting=False):
1091
+ if override_target is not None:
1092
+ target = override_target
1093
+ else:
1094
+ target = self._current_node_or_html()
1095
+
1096
+ if foster_parenting and target.name in {"table", "tbody", "tfoot", "thead", "tr"}:
1097
+ last_template = self._find_last_on_stack("template")
1098
+ last_table = self._find_last_on_stack("table")
1099
+ if last_template is not None and (
1100
+ last_table is None or self.open_elements.index(last_template) > self.open_elements.index(last_table)
1101
+ ):
1102
+ return last_template.template_content, len(last_template.template_content.children)
1103
+ # No table on stack - fall back to inserting in target
1104
+ if last_table is None:
1105
+ return target, len(target.children)
1106
+ parent = last_table.parent
1107
+ # Table has no parent (e.g., detached) - fall back to target
1108
+ if parent is None: # pragma: no cover
1109
+ return target, len(target.children)
1110
+ position = parent.children.index(last_table)
1111
+ return parent, position
1112
+
1113
+ # If target is a template element, insert into its content document fragment
1114
+ if type(target) is TemplateNode and target.template_content:
1115
+ return target.template_content, len(target.template_content.children)
1116
+
1117
+ return target, len(target.children)
1118
+
1119
+ def _populate_selectedcontent(self, root):
1120
+ """Populate selectedcontent elements with content from selected option.
1121
+
1122
+ Per HTML5 spec: selectedcontent mirrors the content of the selected option,
1123
+ or the first option if none is selected.
1124
+ """
1125
+ # Find all select elements
1126
+ selects = []
1127
+ self._find_elements(root, "select", selects)
1128
+
1129
+ for select in selects:
1130
+ # Find selectedcontent element in this select
1131
+ selectedcontent = self._find_element(select, "selectedcontent")
1132
+ if not selectedcontent:
1133
+ continue
1134
+
1135
+ # Find all option elements
1136
+ options = []
1137
+ self._find_elements(select, "option", options)
1138
+
1139
+ # Find selected option or use first one
1140
+ selected_option = None
1141
+ for opt in options:
1142
+ if opt.attrs:
1143
+ for attr_name in opt.attrs.keys():
1144
+ if attr_name == "selected":
1145
+ selected_option = opt
1146
+ break
1147
+ if selected_option:
1148
+ break
1149
+
1150
+ if not selected_option:
1151
+ selected_option = options[0]
1152
+
1153
+ # Clone content from selected option to selectedcontent
1154
+ self._clone_children(selected_option, selectedcontent)
1155
+
1156
+ def _find_elements(self, node, name, result):
1157
+ """Recursively find all elements with given name."""
1158
+ if node.name == name:
1159
+ result.append(node)
1160
+
1161
+ if node.has_child_nodes():
1162
+ for child in node.children:
1163
+ self._find_elements(child, name, result)
1164
+
1165
+ def _find_element(self, node, name):
1166
+ """Find first element with given name."""
1167
+ if node.name == name:
1168
+ return node
1169
+
1170
+ if node.has_child_nodes():
1171
+ for child in node.children:
1172
+ result = self._find_element(child, name)
1173
+ if result:
1174
+ return result
1175
+ return None
1176
+
1177
+ def _clone_children(self, source, target):
1178
+ """Deep clone all children from source to target."""
1179
+ for child in source.children:
1180
+ target.append_child(child.clone_node(deep=True))
1181
+
1182
def _has_in_scope(self, name):
    """Check for an open ``name`` element, bounded by ``DEFAULT_SCOPE_TERMINATORS``."""
    boundaries = DEFAULT_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, boundaries)
1184
+
1185
def _has_in_list_item_scope(self, name):
    """Check for an open ``name`` element, bounded by ``LIST_ITEM_SCOPE_TERMINATORS``."""
    boundaries = LIST_ITEM_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, boundaries)
1187
+
1188
def _has_in_definition_scope(self, name):
    """Check for an open ``name`` element, bounded by ``DEFINITION_SCOPE_TERMINATORS``."""
    boundaries = DEFINITION_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, boundaries)
1190
+
1191
def _has_any_in_scope(self, names):
    """Tell whether any element named in ``names`` is open in the default scope.

    The stack of open elements is scanned from the innermost element
    outwards. The scan succeeds at the first element whose name is in
    ``names`` and fails at the first HTML-namespace element whose name is in
    ``DEFAULT_SCOPE_TERMINATORS``.

    Args:
        names: Container of element names to look for.

    Returns:
        True if a matching element is reached before a scope terminator,
        otherwise False.
    """
    # Always terminates: html is in DEFAULT_SCOPE_TERMINATORS
    boundaries = DEFAULT_SCOPE_TERMINATORS
    for element in reversed(self.open_elements):
        tag = element.name
        if tag in names:
            return True
        if element.namespace in {None, "html"} and tag in boundaries:
            return False
    return False  # pragma: no cover - html always terminates
1203
+
1204
def process_characters(self, data):
    """Fast path for character data.

    When the current node is in the HTML namespace (or the stack is empty)
    and the insertion mode is ``IN_BODY``, the text is handled right here:
    NUL characters are reported and dropped, active formatting elements are
    reconstructed, and the text is appended. Every other situation is
    delegated to ``process_token`` with a ``CharacterTokens`` wrapper.

    Args:
        data: The character data to insert.

    Returns:
        ``TokenSinkResult.Continue`` on the fast path, otherwise whatever
        ``process_token`` returns.
    """
    stack = self.open_elements
    # Check for foreign content first
    in_foreign = bool(stack) and stack[-1].namespace not in {None, "html"}
    if in_foreign or self.mode != InsertionMode.IN_BODY:
        return self.process_token(CharacterTokens(data))

    if "\x00" in data:
        self._parse_error("invalid-codepoint")
        data = data.replace("\x00", "")

    if not data:
        return TokenSinkResult.Continue

    whitespace_only = is_all_whitespace(data)
    self._reconstruct_active_formatting_elements()
    if not whitespace_only:
        # Non-whitespace text clears frameset_ok.
        self.frameset_ok = False
    self._append_text(data)
    return TokenSinkResult.Continue