justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

@@ -0,0 +1,1279 @@
1
+ # ruff: noqa: S101, PLW2901
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from .constants import (
8
+ BUTTON_SCOPE_TERMINATORS,
9
+ DEFAULT_SCOPE_TERMINATORS,
10
+ DEFINITION_SCOPE_TERMINATORS,
11
+ FOREIGN_ATTRIBUTE_ADJUSTMENTS,
12
+ FOREIGN_BREAKOUT_ELEMENTS,
13
+ FORMAT_MARKER,
14
+ FORMATTING_ELEMENTS,
15
+ HTML_INTEGRATION_POINT_SET,
16
+ IMPLIED_END_TAGS,
17
+ LIST_ITEM_SCOPE_TERMINATORS,
18
+ MATHML_ATTRIBUTE_ADJUSTMENTS,
19
+ MATHML_TEXT_INTEGRATION_POINT_SET,
20
+ SPECIAL_ELEMENTS,
21
+ SVG_ATTRIBUTE_ADJUSTMENTS,
22
+ SVG_TAG_NAME_ADJUSTMENTS,
23
+ TABLE_ALLOWED_CHILDREN,
24
+ TABLE_FOSTER_TARGETS,
25
+ TABLE_SCOPE_TERMINATORS,
26
+ )
27
+ from .errors import generate_error_message
28
+ from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
29
+ from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
30
+ from .treebuilder_modes import TreeBuilderModesMixin
31
+ from .treebuilder_utils import (
32
+ InsertionMode,
33
+ is_all_whitespace,
34
+ )
35
+
36
+ if TYPE_CHECKING:
37
+ from collections.abc import Callable
38
+
39
+
40
+ class TreeBuilder(TreeBuilderModesMixin):
41
+ __slots__ = (
42
+ "_body_end_handlers",
43
+ "_body_start_handlers",
44
+ "_body_token_handlers",
45
+ "_mode_handlers",
46
+ "active_formatting",
47
+ "collect_errors",
48
+ "document",
49
+ "errors",
50
+ "form_element",
51
+ "fragment_context",
52
+ "fragment_context_element",
53
+ "frameset_ok",
54
+ "head_element",
55
+ "iframe_srcdoc",
56
+ "ignore_lf",
57
+ "insert_from_table",
58
+ "mode",
59
+ "open_elements",
60
+ "original_mode",
61
+ "pending_table_text",
62
+ "quirks_mode",
63
+ "table_text_original_mode",
64
+ "template_modes",
65
+ "tokenizer",
66
+ "tokenizer_state_override",
67
+ )
68
+
69
+ _body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
70
+ _body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
71
+ _body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
72
+ _mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
73
+ active_formatting: list[Any]
74
+ collect_errors: bool
75
+ document: SimpleDomNode
76
+ errors: list[ParseError]
77
+ form_element: Any | None
78
+ fragment_context: Any | None
79
+ fragment_context_element: Any | None
80
+ frameset_ok: bool
81
+ head_element: Any | None
82
+ iframe_srcdoc: bool
83
+ ignore_lf: bool
84
+ insert_from_table: bool
85
+ mode: InsertionMode
86
+ open_elements: list[Any]
87
+ original_mode: InsertionMode | None # type: ignore[assignment]
88
+ pending_table_text: list[str]
89
+ quirks_mode: str
90
+ table_text_original_mode: InsertionMode | None # type: ignore[assignment]
91
+ template_modes: list[InsertionMode]
92
+ tokenizer: Any | None
93
+ tokenizer_state_override: Any | None # type: ignore[assignment]
94
+
95
def __init__(
    self,
    fragment_context: Any | None = None,
    iframe_srcdoc: bool = False,
    collect_errors: bool = False,
) -> None:
    """Initialise tree-construction state for a document or a fragment parse.

    Args:
        fragment_context: Context element description for fragment parsing;
            its ``namespace`` and ``tag_name`` attributes are read. ``None``
            selects a full-document parse.
        iframe_srcdoc: Stored unchanged on ``self.iframe_srcdoc``.
        collect_errors: When true, ``_parse_error`` records entries in
            ``self.errors``.
    """
    is_fragment = fragment_context is not None

    self.fragment_context = fragment_context
    self.iframe_srcdoc = iframe_srcdoc
    self.collect_errors = collect_errors
    self.errors = []
    self.tokenizer = None  # Assigned by the parser once the tokenizer exists
    self.fragment_context_element = None
    self.document = SimpleDomNode("#document-fragment" if is_fragment else "#document")
    self.mode = InsertionMode.INITIAL
    self.original_mode = None
    self.table_text_original_mode = None
    self.open_elements = []
    self.head_element = None
    self.form_element = None
    self.frameset_ok = True
    self.quirks_mode = "no-quirks"
    self.ignore_lf = False
    self.active_formatting = []
    self.insert_from_table = False
    self.pending_table_text = []
    self.template_modes = []
    self.tokenizer_state_override = None

    if not is_fragment:
        return

    # Fragment parsing: start with an <html> root on the stack.
    html_root = self._create_element("html", None, {})
    self.document.append_child(html_root)
    self.open_elements.append(html_root)

    namespace = fragment_context.namespace
    context_name = fragment_context.tag_name or ""
    name = context_name.lower()

    # A foreign (non-HTML) context gets a stand-in context element on the
    # stack so the parse starts inside foreign content.
    if namespace and namespace != "html":
        adjusted_name = self._adjust_svg_tag_name(context_name) if namespace == "svg" else context_name
        context_element = self._create_element(adjusted_name, namespace, {})
        html_root.append_child(context_element)
        self.open_elements.append(context_element)
        self.fragment_context_element = context_element

    # Table-related starting modes apply only to HTML-namespace contexts.
    table_context_modes = {
        "tbody": InsertionMode.IN_TABLE_BODY,
        "thead": InsertionMode.IN_TABLE_BODY,
        "tfoot": InsertionMode.IN_TABLE_BODY,
        "tr": InsertionMode.IN_ROW,
        "td": InsertionMode.IN_CELL,
        "th": InsertionMode.IN_CELL,
        "caption": InsertionMode.IN_CAPTION,
        "colgroup": InsertionMode.IN_COLUMN_GROUP,
        "table": InsertionMode.IN_TABLE,
    }
    if name == "html":
        # No head/body is pre-created for an html context.
        self.mode = InsertionMode.BEFORE_HEAD
    elif namespace in {None, "html"} and name in table_context_modes:
        self.mode = table_context_modes[name]
    else:
        self.mode = InsertionMode.IN_BODY

    # Fragments always start with frameset_ok cleared.
    self.frameset_ok = False
168
+
169
+ def _set_quirks_mode(self, mode: str) -> None:
170
+ self.quirks_mode = mode
171
+
172
def _parse_error(self, code: str, tag_name: str | None = None, token: Any = None) -> None:
    """Record a ``ParseError`` for ``code`` if error collection is enabled.

    Args:
        code: Error code; also passed to ``generate_error_message``.
        tag_name: Optional tag name forwarded to ``generate_error_message``.
        token: Optional token; when it is a ``Tag`` its approximate source
            width is used to derive start/end columns.

    Does nothing when ``self.collect_errors`` is false.
    """
    if not self.collect_errors:
        return
    # Use the position of the last emitted token (set by tokenizer before emit)
    line = None
    column = None
    end_column = None
    if self.tokenizer:  # pragma: no branch
        line = self.tokenizer.last_token_line
        column = self.tokenizer.last_token_column

        # Calculate start and end columns based on token type for precise highlighting
        # Note: column from tokenizer points AFTER the last character (0-indexed)
        if token is not None and isinstance(token, Tag):
            # Tag: <name> or </name> plus attributes
            tag_len = len(token.name) + 2  # < + name + >
            if token.kind == Tag.END:
                tag_len += 1  # </name>
            # Add attribute lengths (an estimate: assumes one space before each
            # attribute and double-quoted values; falsy values add nothing)
            for attr_name, attr_value in token.attrs.items():
                tag_len += 1 + len(attr_name)  # space + name
                if attr_value:
                    tag_len += 1 + 2 + len(attr_value)  # = + "value"
            if token.self_closing:
                tag_len += 1  # /
            # column points after >, so start is column - tag_len + 1 (for 1-indexed)
            start_column = column - tag_len + 1
            column = start_column
            end_column = column + tag_len

    message = generate_error_message(code, tag_name)
    # The full tokenizer buffer is attached to the error as source_html.
    source_html = self.tokenizer.buffer if self.tokenizer else None
    self.errors.append(
        ParseError(
            code,
            line=line,
            column=column,
            message=message,
            source_html=source_html,
            end_column=end_column,
        )
    )
214
+
215
+ def _has_element_in_scope(
216
+ self, target: str, terminators: set[str] | None = None, check_integration_points: bool = True
217
+ ) -> bool:
218
+ if terminators is None:
219
+ terminators = DEFAULT_SCOPE_TERMINATORS
220
+ for node in reversed(self.open_elements):
221
+ if node.name == target:
222
+ return True
223
+ ns = node.namespace
224
+ if ns == "html" or ns is None:
225
+ if node.name in terminators:
226
+ return False
227
+ elif check_integration_points and (
228
+ self._is_html_integration_point(node) or self._is_mathml_text_integration_point(node)
229
+ ):
230
+ return False
231
+ return False
232
+
233
def _has_element_in_button_scope(self, target: str) -> bool:
    """Return whether ``target`` is in scope using ``BUTTON_SCOPE_TERMINATORS`` as boundaries."""
    in_scope = self._has_element_in_scope(target, BUTTON_SCOPE_TERMINATORS)
    return in_scope
235
+
236
+ def _pop_until_inclusive(self, name: str) -> None:
237
+ # Callers ensure element exists on stack
238
+ while self.open_elements: # pragma: no branch
239
+ node = self.open_elements.pop()
240
+ if node.name == name:
241
+ break
242
+
243
+ def _pop_until_any_inclusive(self, names: set[str]) -> None:
244
+ # Pop elements until we find one in names (callers ensure element exists)
245
+ while self.open_elements:
246
+ node = self.open_elements.pop()
247
+ if node.name in names:
248
+ return
249
+
250
def _close_p_element(self) -> bool:
    """Close the nearest ``p`` element if one is in button scope.

    Returns:
        ``True`` if a ``p`` was in button scope and was popped, else ``False``.
    """
    if not self._has_element_in_button_scope("p"):
        return False

    self._generate_implied_end_tags("p")
    current = self.open_elements[-1]
    if current.name != "p":
        # Something other than the p is still on top of the stack.
        self._parse_error("end-tag-too-early", tag_name="p")
    self._pop_until_inclusive("p")
    return True
258
+
259
def process_token(self, token: Any) -> Any:
    """Dispatch one tokenizer token to the handler for the current insertion mode.

    DOCTYPE tokens are handled up front. Every other token enters a loop
    that picks a handler based on the namespace of the current node and
    ``self.mode``; a handler may return ``None`` (done) or a tuple asking
    for the token to be reprocessed in another mode.

    Args:
        token: A ``DoctypeToken``, ``Tag``, ``CharacterTokens``,
            ``CommentToken`` or ``EOFToken`` instance.

    Returns:
        ``self.tokenizer_state_override`` if one was set during handling
        (it is cleared before returning), otherwise ``TokenSinkResult.Continue``.
        The DOCTYPE paths return ``TokenSinkResult.Continue`` or whatever
        ``_handle_doctype`` returns.
    """
    # Optimization: Use type() identity check instead of isinstance
    token_type = type(token)
    if token_type is DoctypeToken:
        # Check for foreign content first - DOCTYPE in SVG/MathML is a parse error
        if self.open_elements:
            current = self.open_elements[-1]
            if current.namespace not in {None, "html"}:
                self._parse_error("unexpected-doctype")
                return TokenSinkResult.Continue
        return self._handle_doctype(token)

    current_token = token
    force_html_mode = False

    # Cache mode handlers list for speed
    mode_handlers = self._MODE_HANDLERS

    while True:
        # Update token type for current token (it might have changed if reprocessed)
        token_type = type(current_token)

        # Optimization: Check for HTML namespace first (common case)
        current_node = self.open_elements[-1] if self.open_elements else None
        is_html_namespace = current_node is None or current_node.namespace in {None, "html"}

        if force_html_mode or is_html_namespace:
            # force_html_mode applies to a single reprocessing pass only
            force_html_mode = False
            if self.mode == InsertionMode.IN_BODY:
                # Inline _mode_in_body for performance
                if token_type is Tag:
                    # Inline _handle_tag_in_body
                    if current_token.kind == 0:  # Tag.START
                        name = current_token.name
                        if name == "div" or name == "ul" or name == "ol":
                            # Inline _handle_body_start_block_with_p
                            # Check if p is in button scope (html always terminates)
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=True)
                            result = None
                        elif name == "p":
                            result = self._handle_body_start_paragraph(current_token)
                        elif name == "span":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            self._insert_element(current_token, push=True)
                            self.frameset_ok = False
                            result = None
                        elif name == "a":
                            result = self._handle_body_start_a(current_token)
                        elif name == "br" or name == "img":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            # Inserted without being pushed onto the open-element stack
                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        elif name == "hr":
                            # Same inlined "p in button scope" scan as the div/ul/ol branch
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        else:
                            handler = self._BODY_START_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                # Inline _handle_body_start_default
                                # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
                                if self.active_formatting:
                                    self._reconstruct_active_formatting_elements()
                                self._insert_element(current_token, push=True)
                                if current_token.self_closing:
                                    self._parse_error(
                                        "non-void-html-element-start-tag-with-trailing-solidus",
                                        tag_name=current_token.name,
                                    )
                                self.frameset_ok = False
                                result = None
                    else:
                        # End tag in body
                        name = current_token.name
                        if name == "br":
                            # </br> is reported as an error, then handled as a <br> start tag
                            self._parse_error("unexpected-end-tag", tag_name=name)
                            br_tag = Tag(0, "br", {}, False)
                            result = self._handle_body_start_br(br_tag)
                        elif name in FORMATTING_ELEMENTS:
                            self._adoption_agency(name)
                            result = None
                        else:
                            handler = self._BODY_END_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                self._any_other_end_tag(name)
                                result = None
                elif token_type is CharacterTokens:
                    # Inline _handle_characters_in_body
                    # Only non-whitespace data reaches here (whitespace handled in process_characters)
                    self.frameset_ok = False
                    self._reconstruct_active_formatting_elements()
                    self._append_text(current_token.data)
                    result = None
                elif token_type is CommentToken:
                    result = self._handle_comment_in_body(current_token)
                else:  # EOFToken
                    result = self._handle_eof_in_body(current_token)
            else:
                result = mode_handlers[self.mode](self, current_token)
        elif self._should_use_foreign_content(current_token):
            result = self._process_foreign_content(current_token)
        else:
            # Foreign content stack logic
            current = current_node
            # Only pop foreign elements if we're NOT at an HTML/MathML integration point
            # and NOT about to insert a new foreign element (svg/math)
            # NOTE(review): every branch of the block below ends in `pass`, so it
            # changes no state here; only the predicate calls are evaluated.
            if not isinstance(current_token, EOFToken):
                # Don't pop at integration points - they stay on stack to receive content
                if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
                    pass
                # Don't pop when inserting new svg/math elements
                if isinstance(current_token, Tag) and current_token.kind == Tag.START:
                    # Optimization: Tokenizer already lowercases tag names
                    name_lower = current_token.name
                    if name_lower in {"svg", "math"}:
                        pass

            # Special handling: text at integration points inserts directly, bypassing mode dispatch
            if isinstance(current_token, CharacterTokens):
                if self._is_mathml_text_integration_point(current):
                    # Tokenizer guarantees non-empty data
                    data = current_token.data
                    # NUL and form-feed characters are reported and stripped
                    if "\x00" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x00", "")
                    if "\x0c" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x0c", "")
                    if data:
                        if not is_all_whitespace(data):
                            self._reconstruct_active_formatting_elements()
                            self.frameset_ok = False
                        self._append_text(data)
                    result = None
                else:
                    result = mode_handlers[self.mode](self, current_token)
            else:
                # At integration points inside foreign content, check if table tags make sense.
                if (
                    (self._is_mathml_text_integration_point(current) or self._is_html_integration_point(current))
                    and isinstance(current_token, Tag)
                    and current_token.kind == Tag.START
                    and self.mode not in {InsertionMode.IN_BODY}
                ):
                    # Check if we're in a table mode but without an actual table in scope
                    # If so, table tags should be ignored (use IN_BODY mode)
                    is_table_mode = self.mode in {
                        InsertionMode.IN_TABLE,
                        InsertionMode.IN_TABLE_BODY,
                        InsertionMode.IN_ROW,
                        InsertionMode.IN_CELL,
                        InsertionMode.IN_CAPTION,
                        InsertionMode.IN_COLUMN_GROUP,
                    }
                    has_table_in_scope = self._has_in_table_scope("table")
                    if is_table_mode and not has_table_in_scope:
                        # Temporarily use IN_BODY mode for this tag
                        saved_mode = self.mode
                        self.mode = InsertionMode.IN_BODY
                        result = mode_handlers[self.mode](self, current_token)
                        # Restore mode if no mode change was requested
                        if self.mode == InsertionMode.IN_BODY:  # pragma: no branch
                            self.mode = saved_mode
                    else:
                        result = mode_handlers[self.mode](self, current_token)
                else:
                    result = mode_handlers[self.mode](self, current_token)

        if result is None:
            # Handler finished: hand any pending tokenizer state change back once
            result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
            self.tokenizer_state_override = None
            return result_to_return
        # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
        _instruction, mode, token_override = result[0], result[1], result[2]
        if len(result) == 4:
            force_html_mode = result[3]
        # All mode handlers that return a tuple use "reprocess" instruction
        self.mode = mode
        current_token = token_override
        # Continue loop to reprocess
473
+
474
def finish(self) -> SimpleDomNode:
    """Finalise the tree and return the document (or fragment) node.

    For fragment parses the synthetic ``html`` wrapper, and the stand-in
    context element when it is a child of that wrapper, are removed and
    their children promoted. ``_populate_selectedcontent`` is then run on
    the document in every case.
    """
    document = self.document
    if self.fragment_context is not None:
        # Fragment setup always creates the html wrapper as the first child.
        assert document.children is not None
        html_wrapper = document.children[0]

        stand_in = self.fragment_context_element
        if stand_in is not None and stand_in.parent is html_wrapper:
            # Snapshot the child list because it is mutated while moving nodes.
            for moved in list(stand_in.children):
                stand_in.remove_child(moved)
                html_wrapper.append_child(moved)
            html_wrapper.remove_child(stand_in)

        for moved in list(html_wrapper.children):
            html_wrapper.remove_child(moved)
            document.append_child(moved)
        document.remove_child(html_wrapper)

    # Populate selectedcontent elements.
    self._populate_selectedcontent(document)

    return document
495
+
496
+ # Insertion mode dispatch ------------------------------------------------
497
+
498
def _append_comment_to_document(self, text: str) -> None:
    """Append a ``#comment`` node carrying ``text`` directly to the document node."""
    self.document.append_child(SimpleDomNode("#comment", data=text))
501
+
502
def _append_comment(self, text: str, parent: Any | None = None) -> None:
    """Append a ``#comment`` node carrying ``text`` to ``parent``.

    Args:
        text: Comment data.
        parent: Node to append to; defaults to the current node (or the
            html element when the stack is empty).
    """
    destination = self._current_node_or_html() if parent is None else parent
    # Comments inside a template go into its content fragment, when it has one.
    if type(destination) is TemplateNode and destination.template_content:
        destination = destination.template_content
    destination.append_child(SimpleDomNode("#comment", data=text))
510
+
511
def _append_text(self, text: str) -> None:
    """Insert character data at the current insertion point.

    Adjacent text is coalesced into an existing text node where possible.
    When ``self.ignore_lf`` is set, the flag is cleared and a single leading
    newline is dropped from ``text``.

    Args:
        text: Character data to insert.
    """
    if self.ignore_lf:
        self.ignore_lf = False
        if text.startswith("\n"):
            text = text[1:]
            if not text:
                return

    # Guard against empty stack
    if not self.open_elements:  # pragma: no cover
        return

    # Fast path optimization for common case
    target = self.open_elements[-1]

    # The fast path is skipped for foster-parenting targets and templates,
    # which need the full insertion-location logic below.
    if target.name not in TABLE_FOSTER_TARGETS and type(target) is not TemplateNode:
        children = target.children
        if children:
            last_child = children[-1]
            if type(last_child) is TextNode:
                # Extend the trailing text node instead of creating a new one
                last_child.data = (last_child.data or "") + text
                return

        # Append directly to the child list and set the parent link by hand
        node = TextNode(text)
        children.append(node)
        node.parent = target
        return

    target = self._current_node_or_html()
    foster_parenting = self._should_foster_parenting(target, is_text=True)

    # Reconstruct active formatting BEFORE getting insertion location when foster parenting
    if foster_parenting:
        self._reconstruct_active_formatting_elements()

    # Always use appropriate insertion location to handle templates
    parent, position = self._appropriate_insertion_location(foster_parenting=foster_parenting)

    # Coalesce with adjacent text node if possible
    if position > 0 and parent.children[position - 1].name == "#text":
        parent.children[position - 1].data = (parent.children[position - 1].data or "") + text
        return

    node = TextNode(text)
    # A position at or past the end means "append" (no reference node)
    reference_node = parent.children[position] if position < len(parent.children) else None
    parent.insert_before(node, reference_node)
557
+
558
+ def _current_node_or_html(self) -> Any:
559
+ if self.open_elements:
560
+ return self.open_elements[-1]
561
+ # Stack empty - find html element in document children
562
+ # (may not be first if there are comments/doctype before it)
563
+ children = self.document.children
564
+ if children is not None:
565
+ for child in children:
566
+ if child.name == "html":
567
+ return child
568
+ # Edge case: no html found, return first child or None
569
+ return children[0] if children else None # pragma: no cover
570
+ return None # pragma: no cover
571
+
572
def _create_root(self, attrs: dict[str, str | None]) -> Any:
    """Create the ``html`` root, attach it to the document, push it, and return it."""
    root = SimpleDomNode("html", attrs=attrs, namespace="html")
    document = self.document
    document.append_child(root)
    stack = self.open_elements
    stack.append(root)
    return root
577
+
578
def _insert_element(self, tag: Any, *, push: bool, namespace: str = "html") -> Any:
    """Create an element for ``tag`` and insert it into the tree.

    Args:
        tag: Token providing ``name`` and ``attrs``.
        push: Whether to push the new node onto ``open_elements``.
        namespace: Namespace for the new element.

    Returns:
        The new node: a ``TemplateNode`` for an HTML ``template`` tag,
        otherwise an ``ElementNode``.
    """
    is_html_template = tag.name == "template" and namespace == "html"
    node_class = TemplateNode if is_html_template else ElementNode
    node: ElementNode | TemplateNode = node_class(tag.name, attrs=tag.attrs, namespace=namespace)

    if self.insert_from_table:
        # Table path: the location may be redirected by foster parenting.
        target = self._current_node_or_html()
        foster_parenting = self._should_foster_parenting(target, for_tag=tag.name)
        parent, position = self._appropriate_insertion_location(foster_parenting=foster_parenting)
        self._insert_node_at(parent, position, node)
    else:
        # Common path: append to the current node, or to a template's content.
        target = self._current_node_or_html()
        parent = target.template_content if type(target) is TemplateNode else target
        if parent is not None:  # pragma: no branch
            parent.append_child(node)

    if push:
        self.open_elements.append(node)
    return node
609
+
610
def _insert_phantom(self, name: str) -> Any:
    """Insert and push an attribute-less element ``name`` via a synthesised start tag."""
    no_attrs: dict[str, str | None] = {}
    synthetic_tag = Tag(Tag.START, name, no_attrs, False)
    return self._insert_element(synthetic_tag, push=True)
614
+
615
def _insert_body_if_missing(self) -> None:
    """Create a ``body`` node, attach it under the html element on the stack, and push it.

    The node is pushed onto ``open_elements`` even when no ``html`` element
    is found on the stack.
    """
    html_element = self._find_last_on_stack("html")
    body = SimpleDomNode("body", namespace="html")
    if html_element is not None:  # pragma: no branch
        html_element.append_child(body)
        body.parent = html_element
    self.open_elements.append(body)
622
+
623
def _create_element(self, name: str, namespace: str | None, attrs: dict[str, str | None]) -> Any:
    """Build a detached ``ElementNode``; a falsy ``namespace`` becomes ``"html"``."""
    return ElementNode(name, attrs, namespace or "html")
626
+
627
+ def _pop_current(self) -> Any:
628
+ return self.open_elements.pop()
629
+
630
def _in_scope(self, name: str) -> bool:
    """Return whether ``name`` is in scope using ``DEFAULT_SCOPE_TERMINATORS`` as boundaries."""
    found = self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
    return found
632
+
633
+ def _close_element_by_name(self, name: str) -> None:
634
+ # Simple element closing - pops from the named element onwards
635
+ # Used for explicit closing (e.g., when button start tag closes existing button)
636
+ # Caller guarantees name is on the stack via _has_in_scope check
637
+ index = len(self.open_elements) - 1
638
+ while index >= 0: # pragma: no branch
639
+ if self.open_elements[index].name == name:
640
+ del self.open_elements[index:]
641
+ return
642
+ index -= 1
643
+
644
+ def _any_other_end_tag(self, name: str) -> None:
645
+ # Spec: "Any other end tag" in IN_BODY mode
646
+ # Loop through stack backwards (always terminates: html is special)
647
+ index = len(self.open_elements) - 1
648
+ while index >= 0: # pragma: no branch
649
+ node = self.open_elements[index]
650
+
651
+ # If node's name matches the end tag name
652
+ if node.name == name:
653
+ # Generate implied end tags (except for this name)
654
+ # If current node is not this node, parse error
655
+ if index != len(self.open_elements) - 1:
656
+ self._parse_error("end-tag-too-early")
657
+ # Pop all elements from this node onwards
658
+ del self.open_elements[index:]
659
+ return
660
+
661
+ # If node is a special element, parse error and ignore the tag
662
+ if self._is_special_element(node):
663
+ self._parse_error("unexpected-end-tag", tag_name=name)
664
+ return # Ignore the end tag
665
+
666
+ # Continue to next node (previous in stack)
667
+ index -= 1
668
+
669
+ def _add_missing_attributes(self, node: Any, attrs: dict[str, str]) -> None:
670
+ if not attrs:
671
+ return
672
+ existing = node.attrs
673
+ for name, value in attrs.items():
674
+ if name not in existing:
675
+ existing[name] = value
676
+
677
+ def _remove_from_open_elements(self, node: Any) -> bool:
678
+ for index, current in enumerate(self.open_elements):
679
+ if current is node:
680
+ del self.open_elements[index]
681
+ return True
682
+ return False
683
+
684
+ def _is_special_element(self, node: Any) -> bool:
685
+ if node.namespace not in {None, "html"}:
686
+ return False
687
+ return node.name in SPECIAL_ELEMENTS
688
+
689
def _find_active_formatting_index(self, name: str) -> int | None:
    """Find the last active-formatting entry named ``name`` after the last marker.

    The list is scanned from the end and the scan stops at the first
    ``FORMAT_MARKER``.

    Returns:
        The entry's index, or ``None`` if no such entry is found.
    """
    entries = self.active_formatting
    position = len(entries)
    while position > 0:
        position -= 1
        entry = entries[position]
        if entry is FORMAT_MARKER:
            return None
        if entry["name"] == name:
            return position
    return None
697
+
698
def _find_active_formatting_index_by_node(self, node: Any) -> int | None:
    """Find the last active-formatting entry whose ``"node"`` is ``node``.

    Unlike the name-based lookup, markers are skipped rather than ending
    the search, so the whole list is considered.

    Returns:
        The entry's index, or ``None`` if the node has no entry.
    """
    entries = self.active_formatting
    for position in reversed(range(len(entries))):
        entry = entries[position]
        if entry is FORMAT_MARKER:
            continue
        if entry["node"] is node:
            return position
    return None
704
+
705
+ def _clone_attributes(self, attrs: dict[str, str | None]) -> dict[str, str | None]:
706
+ return attrs.copy() if attrs else {}
707
+
708
+ def _attrs_signature(self, attrs: dict[str, str | None]) -> tuple[tuple[str, str], ...]:
709
+ if not attrs:
710
+ return ()
711
+ items: list[tuple[str, str]] = []
712
+ for name, value in attrs.items():
713
+ items.append((name, value or ""))
714
+ items.sort()
715
+ return tuple(items)
716
+
717
def _find_active_formatting_duplicate(self, name: str, attrs: dict[str, str | None]) -> int | None:
    """Find the earliest of three or more matching entries after the last marker.

    An entry matches when both its name and its attribute signature equal
    those of ``name``/``attrs``. Matches are counted only after the last
    ``FORMAT_MARKER``.

    Returns:
        The index of the earliest match when at least three exist,
        otherwise ``None``.
    """
    wanted = self._attrs_signature(attrs)
    earliest: int | None = None
    count = 0
    for index, entry in enumerate(self.active_formatting):
        if entry is FORMAT_MARKER:
            # A marker starts a fresh counting window.
            earliest = None
            count = 0
            continue
        entry_signature = entry["signature"]
        if entry["name"] == name and entry_signature == wanted:
            if count == 0:
                earliest = index
            count += 1
    return earliest if count >= 3 else None
730
+
731
def _has_active_formatting_entry(self, name: str) -> bool:
    """Return whether an entry named ``name`` exists after the last formatting marker."""
    for entry in reversed(self.active_formatting):
        if entry is FORMAT_MARKER:
            return False
        if entry["name"] == name:
            return True
    return False
739
+
740
def _remove_last_active_formatting_by_name(self, name: str) -> None:
    """Delete the last entry named ``name`` that follows the last formatting marker.

    Does nothing when a marker is reached before any matching entry.
    """
    entries = self.active_formatting
    position = len(entries) - 1
    while position >= 0:
        entry = entries[position]
        if entry is FORMAT_MARKER:
            return
        if entry["name"] == name:
            del entries[position]
            return
        position -= 1
748
+
749
+ def _remove_last_open_element_by_name(self, name: str) -> None:
750
+ for index in range(len(self.open_elements) - 1, -1, -1):
751
+ if self.open_elements[index].name == name:
752
+ del self.open_elements[index]
753
+ return
754
+
755
def _append_active_formatting_entry(self, name: str, attrs: dict[str, str | None], node: Any) -> None:
    """Append a new entry for ``node`` to the active formatting list.

    The entry stores the name, a private copy of ``attrs``, the node, and
    the attribute signature computed from that copy.
    """
    attrs_copy = self._clone_attributes(attrs)
    entry = {
        "name": name,
        "attrs": attrs_copy,
        "node": node,
        "signature": self._attrs_signature(attrs_copy),
    }
    self.active_formatting.append(entry)
766
+
767
def _clear_active_formatting_up_to_marker(self) -> None:
    """Pop active-formatting entries up to and including the last marker.

    If the list holds no marker it is emptied.
    """
    entries = self.active_formatting
    while entries and entries.pop() is not FORMAT_MARKER:
        pass
772
+
773
def _push_formatting_marker(self) -> None:
    """Append ``FORMAT_MARKER`` to the active formatting list."""
    entries = self.active_formatting
    entries.append(FORMAT_MARKER)
775
+
776
+ def _remove_formatting_entry(self, index: int) -> None:
777
+ assert 0 <= index < len(self.active_formatting), f"Invalid index: {index}"
778
+ del self.active_formatting[index]
779
+
780
def _reconstruct_active_formatting_elements(self) -> None:
    """Re-create formatting elements that are listed as active but no longer open.

    Does nothing when the list is empty, or when its last entry is a marker
    or refers to a node that is still on the open-element stack. Otherwise
    every entry after the most recent marker/open entry gets a fresh
    element inserted and pushed, and the entry is re-pointed at it.
    """
    if not self.active_formatting:
        return
    last_entry = self.active_formatting[-1]
    if last_entry is FORMAT_MARKER or last_entry["node"] in self.open_elements:
        return

    # Rewind: walk backwards to the entry just after the nearest marker or
    # still-open element (or to the start of the list).
    index = len(self.active_formatting) - 1
    while True:
        index -= 1
        if index < 0:
            break
        entry = self.active_formatting[index]
        if entry is FORMAT_MARKER or entry["node"] in self.open_elements:
            index += 1
            break
    if index < 0:
        # Ran off the front of the list: start from the first entry
        index = 0
    # Advance: re-create each remaining entry in order
    while index < len(self.active_formatting):
        entry = self.active_formatting[index]
        # Attributes are cloned so the new element does not share the entry's dict
        tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
        new_node = self._insert_element(tag, push=True)
        entry["node"] = new_node
        index += 1
804
+
805
def _insert_node_at(self, parent: Any, index: int, node: Any) -> None:
    """Insert ``node`` under ``parent`` before the child currently at ``index``.

    An ``index`` of ``None``, or one at or past the end of the child list,
    passes ``None`` as the reference node to ``insert_before``.
    """
    if index is None or index >= len(parent.children):
        before = None
    else:
        before = parent.children[index]
    parent.insert_before(node, before)
810
+
811
+ def _find_last_on_stack(self, name: str) -> Any | None:
812
+ for node in reversed(self.open_elements):
813
+ if node.name == name:
814
+ return node
815
+ return None
816
+
817
+ def _clear_stack_until(self, names: set[str]) -> None:
818
+ # All callers include "html" in names, so this always terminates via break
819
+ while self.open_elements:
820
+ node = self.open_elements[-1]
821
+ if node.name in names and node.namespace in {None, "html"}:
822
+ break
823
+ self.open_elements.pop()
824
+
825
def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
    """Pop elements off the stack while the top one is in ``IMPLIED_END_TAGS``.

    Args:
        exclude: Optional element name that stops the popping even though
            it is in ``IMPLIED_END_TAGS``.
    """
    stack = self.open_elements
    while stack:  # pragma: no branch
        top_name = stack[-1].name
        if top_name not in IMPLIED_END_TAGS or top_name == exclude:
            return
        stack.pop()
833
+
834
def _has_in_table_scope(self, name: str) -> bool:
    """Return whether ``name`` is in table scope.

    Uses ``TABLE_SCOPE_TERMINATORS`` as boundaries and does not treat
    foreign integration points as boundaries.
    """
    return self._has_element_in_scope(
        name,
        TABLE_SCOPE_TERMINATORS,
        check_integration_points=False,
    )
836
+
837
def _close_table_cell(self) -> bool:
    """Close the open table cell, if any.

    ``td`` is checked before ``th``.

    Returns:
        ``True`` if a cell was in table scope and was closed, else ``False``.
    """
    for cell_name in ("td", "th"):
        if self._has_in_table_scope(cell_name):
            self._end_table_cell(cell_name)
            return True
    return False
845
+
846
def _end_table_cell(self, name: str) -> None:
    """Close the table cell called *name* and switch to the IN_ROW mode.

    Implied end tags (other than *name*) are generated first, then elements
    are popped up to and including the HTML-namespace element named *name*.
    Finally the active formatting list is cleared back to its last marker.
    """
    self._generate_implied_end_tags(name)
    stack = self.open_elements
    while stack:
        popped = stack.pop()
        if popped.namespace in {None, "html"} and popped.name == name:
            break
    self._clear_active_formatting_up_to_marker()
    self.mode = InsertionMode.IN_ROW
854
+
855
+ def _flush_pending_table_text(self) -> None:
856
+ data = "".join(self.pending_table_text)
857
+ self.pending_table_text.clear()
858
+ if not data:
859
+ return
860
+ if is_all_whitespace(data):
861
+ self._append_text(data)
862
+ return
863
+ self._parse_error("foster-parenting-character")
864
+ previous = self.insert_from_table
865
+ self.insert_from_table = True
866
+ try:
867
+ self._reconstruct_active_formatting_elements()
868
+ self._append_text(data)
869
+ finally:
870
+ self.insert_from_table = previous
871
+
872
+ def _close_table_element(self) -> bool:
873
+ if not self._has_in_table_scope("table"):
874
+ self._parse_error("unexpected-end-tag", tag_name="table")
875
+ return False
876
+ self._generate_implied_end_tags()
877
+ # Table verified in scope above
878
+ while self.open_elements: # pragma: no branch
879
+ node = self.open_elements.pop()
880
+ if node.name == "table":
881
+ break
882
+ self._reset_insertion_mode()
883
+ return True
884
+
885
def _reset_insertion_mode(self) -> None:
    """Pick the insertion mode from the stack of open elements.

    The stack is scanned from the most recently opened element downwards and
    the first element with a known name decides the mode. A ``template``
    element decides it only when ``template_modes`` is non-empty. If no
    element decides, the mode becomes IN_BODY.
    """
    mode_by_name = {
        "select": InsertionMode.IN_SELECT,
        "td": InsertionMode.IN_CELL,
        "th": InsertionMode.IN_CELL,
        "tr": InsertionMode.IN_ROW,
        "tbody": InsertionMode.IN_TABLE_BODY,
        "tfoot": InsertionMode.IN_TABLE_BODY,
        "thead": InsertionMode.IN_TABLE_BODY,
        "caption": InsertionMode.IN_CAPTION,
        "table": InsertionMode.IN_TABLE,
        # If we're resetting and head is on stack, stay in IN_HEAD
        "head": InsertionMode.IN_HEAD,
        "html": InsertionMode.IN_BODY,
    }

    # Walk stack backwards - html element always terminates
    for node in reversed(self.open_elements):
        name = node.name
        if name == "template":
            # Use the last template mode from the stack, when there is one
            if self.template_modes:
                self.mode = self.template_modes[-1]
                return
            continue
        chosen = mode_by_name.get(name)
        if chosen is not None:
            self.mode = chosen
            return

    # Empty stack fallback
    self.mode = InsertionMode.IN_BODY
924
+
925
+ def _should_foster_parenting(self, target: Any, *, for_tag: str | None = None, is_text: bool = False) -> bool:
926
+ if not self.insert_from_table:
927
+ return False
928
+ if target.name not in TABLE_FOSTER_TARGETS:
929
+ return False
930
+ if is_text:
931
+ return True
932
+ if for_tag in TABLE_ALLOWED_CHILDREN:
933
+ return False
934
+ return True
935
+
936
+ def _lower_ascii(self, value: str) -> str:
937
+ return value.lower() if value else ""
938
+
939
def _adjust_svg_tag_name(self, name: str) -> str:
    """Map *name* through SVG_TAG_NAME_ADJUSTMENTS, keyed by its lowercase form.

    Names without an entry are returned unchanged.
    """
    key = self._lower_ascii(name)
    if key in SVG_TAG_NAME_ADJUSTMENTS:
        return SVG_TAG_NAME_ADJUSTMENTS[key]
    return name
942
+
943
+ def _prepare_foreign_attributes(self, namespace: str, attrs: dict[str, str | None]) -> dict[str, str | None]:
944
+ if not attrs:
945
+ return {}
946
+ adjusted: dict[str, str | None] = {}
947
+ for name, value in attrs.items():
948
+ lower_name = self._lower_ascii(name)
949
+ if namespace == "math" and lower_name in MATHML_ATTRIBUTE_ADJUSTMENTS:
950
+ name = MATHML_ATTRIBUTE_ADJUSTMENTS[lower_name]
951
+ lower_name = self._lower_ascii(name)
952
+ elif namespace == "svg" and lower_name in SVG_ATTRIBUTE_ADJUSTMENTS:
953
+ name = SVG_ATTRIBUTE_ADJUSTMENTS[lower_name]
954
+ lower_name = self._lower_ascii(name)
955
+
956
+ foreign_adjustment = FOREIGN_ATTRIBUTE_ADJUSTMENTS.get(lower_name)
957
+ if foreign_adjustment is not None:
958
+ prefix, local, _ = foreign_adjustment
959
+ name = f"{prefix}:{local}"
960
+
961
+ # Tokenizer deduplicates attributes, so name collision impossible here
962
+ adjusted[name] = value
963
+ return adjusted
964
+
965
+ def _node_attribute_value(self, node: Any, name: str) -> str | None:
966
+ target = self._lower_ascii(name)
967
+ for attr_name, attr_value in node.attrs.items():
968
+ if self._lower_ascii(attr_name) == target:
969
+ return attr_value or ""
970
+ return None
971
+
972
+ def _is_html_integration_point(self, node: Any) -> bool:
973
+ # annotation-xml is an HTML integration point only with specific encoding values
974
+ if node.namespace == "math" and node.name == "annotation-xml":
975
+ encoding = self._node_attribute_value(node, "encoding")
976
+ if encoding:
977
+ enc_lower = encoding.lower()
978
+ if enc_lower in {"text/html", "application/xhtml+xml"}:
979
+ return True
980
+ return False # annotation-xml without proper encoding is NOT an integration point
981
+ # SVG foreignObject, desc, and title are always HTML integration points
982
+ return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
983
+
984
+ def _is_mathml_text_integration_point(self, node: Any) -> bool:
985
+ if node.namespace != "math":
986
+ return False
987
+ return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
988
+
989
+ def _adjusted_current_node(self) -> Any:
990
+ return self.open_elements[-1]
991
+
992
+ def _should_use_foreign_content(self, token: Any) -> bool:
993
+ current = self._adjusted_current_node()
994
+ # HTML namespace elements don't use foreign content rules
995
+ # (unreachable in practice as foreign content mode only entered for foreign elements)
996
+ if current.namespace in {None, "html"}:
997
+ return False # pragma: no cover
998
+
999
+ if isinstance(token, EOFToken):
1000
+ return False
1001
+
1002
+ if self._is_mathml_text_integration_point(current):
1003
+ if isinstance(token, CharacterTokens):
1004
+ return False
1005
+ if isinstance(token, Tag) and token.kind == Tag.START:
1006
+ name_lower = self._lower_ascii(token.name)
1007
+ if name_lower not in {"mglyph", "malignmark"}:
1008
+ return False
1009
+
1010
+ if current.namespace == "math" and current.name == "annotation-xml":
1011
+ if isinstance(token, Tag) and token.kind == Tag.START:
1012
+ if self._lower_ascii(token.name) == "svg":
1013
+ return False
1014
+
1015
+ if self._is_html_integration_point(current):
1016
+ if isinstance(token, CharacterTokens):
1017
+ return False
1018
+ if isinstance(token, Tag) and token.kind == Tag.START:
1019
+ return False
1020
+
1021
+ return True
1022
+
1023
+ def _foreign_breakout_font(self, tag: Any) -> bool:
1024
+ for name in tag.attrs.keys():
1025
+ if self._lower_ascii(name) in {"color", "face", "size"}:
1026
+ return True
1027
+ return False
1028
+
1029
+ def _pop_until_html_or_integration_point(self) -> None:
1030
+ # Always terminates: html element has html namespace
1031
+ while self.open_elements: # pragma: no branch
1032
+ node = self.open_elements[-1]
1033
+ if node.namespace in {None, "html"}:
1034
+ return
1035
+ if self._is_html_integration_point(node):
1036
+ return
1037
+ if self.fragment_context_element is not None and node is self.fragment_context_element:
1038
+ return
1039
+ self.open_elements.pop()
1040
+
1041
def _process_foreign_content(self, token: Any) -> Any | None:
    """Handle *token* while the current node is a foreign (SVG/MathML) element.

    Returns ``None`` when the token was consumed here, or a
    ``("reprocess", mode, token, True)`` tuple when the token has to be
    handled again by the HTML insertion modes.
    """
    current = self._adjusted_current_node()

    def break_out() -> Any:
        # Leave foreign content and hand the token back to the HTML rules.
        self._parse_error("unexpected-html-element-in-foreign-content")
        self._pop_until_html_or_integration_point()
        self._reset_insertion_mode()
        return ("reprocess", self.mode, token, True)

    if isinstance(token, CharacterTokens):
        raw = token.data or ""
        # One parse error per NUL; each NUL becomes U+FFFD in the output.
        for _ in range(raw.count("\x00")):
            self._parse_error("invalid-codepoint-in-foreign-content")
        if any(ch not in "\x00\t\n\f\r " for ch in raw):
            self.frameset_ok = False
        self._append_text(raw.replace("\x00", "\ufffd"))
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    # Foreign content only receives CharacterTokens, CommentToken, or Tag (not EOF)
    assert isinstance(token, Tag), f"Unexpected token type in foreign content: {type(token)}"
    name_lower = self._lower_ascii(token.name)

    if token.kind == Tag.START:
        is_breakout = name_lower in FOREIGN_BREAKOUT_ELEMENTS or (
            name_lower == "font" and self._foreign_breakout_font(token)
        )
        if is_breakout:
            return break_out()

        namespace = current.namespace
        adjusted_name = self._adjust_svg_tag_name(token.name) if namespace == "svg" else token.name
        attrs = self._prepare_foreign_attributes(namespace, token.attrs)
        new_tag = Tag(Tag.START, adjusted_name, attrs, token.self_closing)
        # For foreign elements, honor the self-closing flag
        self._insert_element(new_tag, push=not token.self_closing, namespace=namespace)
        return None

    # Only START and END tag kinds exist, and START returns above
    assert token.kind == Tag.END, f"Unexpected tag kind: {token.kind}"

    # Special case: </br> and </p> end tags trigger breakout from foreign content
    if name_lower in {"br", "p"}:
        return break_out()

    # Process foreign end tag per spec: walk stack backwards looking for match
    stack = self.open_elements
    top_index = len(stack) - 1
    for position in range(top_index, -1, -1):
        node = stack[position]
        node_is_html = node.namespace in {None, "html"}

        # Check if this node matches the end tag (case-insensitive)
        if self._lower_ascii(node.name) == name_lower:
            if self.fragment_context_element is not None and node is self.fragment_context_element:
                self._parse_error("unexpected-end-tag-in-fragment-context")
                return None
            # If matched element is HTML namespace, break out to HTML mode
            if node_is_html:
                return ("reprocess", self.mode, token, True)
            # Otherwise it's a foreign element - pop everything from this point up
            del stack[position:]
            return None

        # Per HTML5 spec: if first node doesn't match, it's a parse error
        if position == top_index:
            self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)

        # If we hit an HTML element that doesn't match, process in secondary mode
        if node_is_html:
            return ("reprocess", self.mode, token, True)

    # Stack exhausted without finding match - ignore tag (defensive, html always terminates)
    return None  # pragma: no cover
1131
+
1132
+ def _appropriate_insertion_location(
1133
+ self, override_target: Any | None = None, *, foster_parenting: bool = False
1134
+ ) -> tuple[Any, int]:
1135
+ if override_target is not None:
1136
+ target = override_target
1137
+ else:
1138
+ target = self._current_node_or_html()
1139
+
1140
+ if foster_parenting and target.name in {"table", "tbody", "tfoot", "thead", "tr"}:
1141
+ last_template = self._find_last_on_stack("template")
1142
+ last_table = self._find_last_on_stack("table")
1143
+ if last_template is not None and (
1144
+ last_table is None or self.open_elements.index(last_template) > self.open_elements.index(last_table)
1145
+ ):
1146
+ return last_template.template_content, len(last_template.template_content.children)
1147
+ # No table on stack - fall back to inserting in target
1148
+ if last_table is None:
1149
+ return target, len(target.children)
1150
+ parent = last_table.parent
1151
+ # Table has no parent (e.g., detached) - fall back to target
1152
+ if parent is None: # pragma: no cover
1153
+ children = target.children
1154
+ return target, len(children) if children is not None else 0
1155
+ assert parent.children is not None
1156
+ position = parent.children.index(last_table)
1157
+ return parent, position
1158
+
1159
+ # If target is a template element, insert into its content document fragment
1160
+ if type(target) is TemplateNode and target.template_content:
1161
+ children = target.template_content.children
1162
+ return target.template_content, len(children) if children is not None else 0
1163
+
1164
+ target_children = target.children
1165
+ return target, len(target_children) if target_children is not None else 0
1166
+
1167
+ def _populate_selectedcontent(self, root: Any) -> None:
1168
+ """Populate selectedcontent elements with content from selected option.
1169
+
1170
+ Per HTML5 spec: selectedcontent mirrors the content of the selected option,
1171
+ or the first option if none is selected.
1172
+ """
1173
+ # Find all select elements
1174
+ selects: list[Any] = []
1175
+ self._find_elements(root, "select", selects)
1176
+
1177
+ for select in selects:
1178
+ # Find selectedcontent element in this select
1179
+ selectedcontent = self._find_element(select, "selectedcontent")
1180
+ if not selectedcontent:
1181
+ continue
1182
+
1183
+ # Find all option elements
1184
+ options: list[Any] = []
1185
+ self._find_elements(select, "option", options)
1186
+
1187
+ # Find selected option or use first one
1188
+ selected_option = None
1189
+ for opt in options:
1190
+ if opt.attrs:
1191
+ for attr_name in opt.attrs.keys():
1192
+ if attr_name == "selected":
1193
+ selected_option = opt
1194
+ break
1195
+ if selected_option:
1196
+ break
1197
+
1198
+ if not selected_option:
1199
+ selected_option = options[0]
1200
+
1201
+ # Clone content from selected option to selectedcontent
1202
+ self._clone_children(selected_option, selectedcontent)
1203
+
1204
+ def _find_elements(self, node: Any, name: str, result: list[Any]) -> None:
1205
+ """Recursively find all elements with given name."""
1206
+ if node.name == name:
1207
+ result.append(node)
1208
+
1209
+ if node.has_child_nodes():
1210
+ for child in node.children:
1211
+ self._find_elements(child, name, result)
1212
+
1213
+ def _find_element(self, node: Any, name: str) -> Any | None:
1214
+ """Find first element with given name."""
1215
+ if node.name == name:
1216
+ return node
1217
+
1218
+ if node.has_child_nodes():
1219
+ for child in node.children:
1220
+ result = self._find_element(child, name)
1221
+ if result:
1222
+ return result
1223
+ return None
1224
+
1225
+ def _clone_children(self, source: Any, target: Any) -> None:
1226
+ """Deep clone all children from source to target."""
1227
+ for child in source.children:
1228
+ target.append_child(child.clone_node(deep=True))
1229
+
1230
def _has_in_scope(self, name: str) -> bool:
    """Check whether *name* is in scope using DEFAULT_SCOPE_TERMINATORS."""
    terminators = DEFAULT_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
1232
+
1233
def _has_in_list_item_scope(self, name: str) -> bool:
    """Check whether *name* is in scope using LIST_ITEM_SCOPE_TERMINATORS."""
    terminators = LIST_ITEM_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
1235
+
1236
def _has_in_definition_scope(self, name: str) -> bool:
    """Check whether *name* is in scope using DEFINITION_SCOPE_TERMINATORS."""
    terminators = DEFINITION_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
1238
+
1239
def _has_any_in_scope(self, names: set[str]) -> bool:
    """Report whether any element named in *names* is in the default scope.

    The stack is scanned from the top. A name match gives ``True``; reaching
    an HTML-namespace element listed in DEFAULT_SCOPE_TERMINATORS first
    gives ``False``.
    """
    # Always terminates: html is in DEFAULT_SCOPE_TERMINATORS
    boundary_names = DEFAULT_SCOPE_TERMINATORS
    for element in reversed(self.open_elements):
        if element.name in names:
            return True
        if element.namespace in {None, "html"} and element.name in boundary_names:
            return False
    return False  # pragma: no cover - html always terminates
1251
+
1252
def process_characters(self, data: str) -> Any:
    """Optimized path for character tokens.

    Text arriving in IN_BODY mode with an HTML (or no) current node is
    handled inline: NUL characters are dropped with a parse error, active
    formatting elements are reconstructed and the text is appended. Every
    other case is wrapped in ``CharacterTokens`` and sent to
    ``process_token``.
    """
    # Check for foreign content first
    stack = self.open_elements
    in_foreign = bool(stack) and stack[-1].namespace not in {None, "html"}
    if in_foreign or self.mode != InsertionMode.IN_BODY:
        return self.process_token(CharacterTokens(data))

    if "\x00" in data:
        self._parse_error("invalid-codepoint")
        data = data.replace("\x00", "")

    if not data:
        return TokenSinkResult.Continue

    whitespace_only = is_all_whitespace(data)
    self._reconstruct_active_formatting_elements()
    if not whitespace_only:
        # Non-whitespace text rules out a later frameset.
        self.frameset_ok = False
    self._append_text(data)
    return TokenSinkResult.Continue