justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2012 @@
1
+ # ruff: noqa: S101, RUF012
2
+
3
+
4
+ from .constants import (
5
+ FORMATTING_ELEMENTS,
6
+ HEADING_ELEMENTS,
7
+ )
8
+ from .node import SimpleDomNode, TemplateNode
9
+ from .tokens import CharacterTokens, CommentToken, EOFToken, Tag, TokenSinkResult
10
+ from .treebuilder_utils import (
11
+ InsertionMode,
12
+ doctype_error_and_quirks,
13
+ is_all_whitespace,
14
+ )
15
+
16
+
17
+ class TreeBuilderModesMixin:
18
def _handle_doctype(self, token):
    """Process a DOCTYPE token.

    Outside the INITIAL insertion mode the token is reported as
    "unexpected-doctype" and nothing else happens. In INITIAL mode a
    "!doctype" node carrying ``token.doctype`` is appended to the document,
    the quirks mode returned by ``doctype_error_and_quirks`` is applied and
    the mode advances to BEFORE_HTML.

    Always returns ``TokenSinkResult.Continue``.
    """
    if self.mode == InsertionMode.INITIAL:
        doctype_info = token.doctype
        has_error, quirks = doctype_error_and_quirks(doctype_info, self.iframe_srcdoc)

        self.document.append_child(SimpleDomNode("!doctype", data=doctype_info))

        if has_error:
            self._parse_error("unknown-doctype")

        self._set_quirks_mode(quirks)
        self.mode = InsertionMode.BEFORE_HTML
    else:
        # A doctype anywhere but the very start is only reported.
        self._parse_error("unexpected-doctype")
    return TokenSinkResult.Continue
35
+
36
def _mode_initial(self, token):
    """Handle a token in the INITIAL insertion mode.

    Whitespace-only character tokens are dropped and comments are appended
    to the document; both return ``None``. Any other token means no DOCTYPE
    was seen: a parse error is reported, quirks mode is set to "quirks" and
    the token is handed back for reprocessing in BEFORE_HTML.
    """
    reprocess = ("reprocess", InsertionMode.BEFORE_HTML, token)

    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data):
            return None
        self._parse_error("expected-doctype-but-got-chars")
    elif isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None
    elif isinstance(token, EOFToken):
        self._parse_error("expected-doctype-but-got-eof")
        self._set_quirks_mode("quirks")
        # Only the EOF path assigns the mode directly before reprocessing.
        self.mode = InsertionMode.BEFORE_HTML
        return reprocess
    else:
        # Only Tag tokens can reach this point.
        if token.kind == Tag.START:
            code = "expected-doctype-but-got-start-tag"
        else:
            code = "expected-doctype-but-got-end-tag"
        self._parse_error(code, tag_name=token.name, token=token)

    self._set_quirks_mode("quirks")
    return reprocess
58
+
59
def _mode_before_html(self, token):
    """Handle a token in the BEFORE_HTML insertion mode.

    Returns ``None`` when the token is consumed here (whitespace, comments,
    an explicit ``<html>`` start tag, or an ignored end tag). Otherwise a
    root element with no attributes is created, the mode becomes
    BEFORE_HEAD and the token is returned for reprocessing there.
    """
    if isinstance(token, CharacterTokens):
        text = token.data
        if is_all_whitespace(text):
            return None
        # Leading whitespace is dropped before the text is reprocessed.
        trimmed = text.lstrip("\t\n\f\r ")
        if trimmed != text:
            token = CharacterTokens(trimmed)
    elif isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None
    elif isinstance(token, Tag):
        if token.kind == Tag.START:
            if token.name == "html":
                self._create_root(token.attrs)
                self.mode = InsertionMode.BEFORE_HEAD
                return None
        elif token.kind == Tag.END and token.name not in {"head", "body", "html", "br"}:
            # Every end tag except these four is reported and ignored.
            self._parse_error("unexpected-end-tag-before-html", tag_name=token.name)
            return None

    # Everything else (EOF included) implies the root element.
    self._create_root({})
    self.mode = InsertionMode.BEFORE_HEAD
    return ("reprocess", InsertionMode.BEFORE_HEAD, token)
91
+
92
def _mode_before_head(self, token):
    """Handle a token in the BEFORE_HEAD insertion mode.

    Returns ``None`` for tokens consumed here: whitespace-only text,
    comments, a duplicate ``<html>`` start tag (its attributes are merged
    into ``open_elements[0]``), an explicit ``<head>`` start tag, and
    ignored end tags. Anything else inserts a phantom ``head`` element,
    switches to IN_HEAD and returns the token for reprocessing.
    """
    if isinstance(token, CharacterTokens):
        text = token.data or ""
        if "\x00" in text:
            self._parse_error("invalid-codepoint-before-head")
            text = text.replace("\x00", "")
            if not text:
                return None
        if is_all_whitespace(text):
            return None
        # Reprocess the NUL-stripped text.
        token = CharacterTokens(text)
    elif isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    elif isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "html":
                # open_elements[0] is the html element created in BEFORE_HTML mode.
                self._add_missing_attributes(self.open_elements[0], token.attrs)
                return None
            if tag_name == "head":
                self.head_element = self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_HEAD
                return None
        elif token.kind == Tag.END and tag_name not in {"head", "body", "html", "br"}:
            self._parse_error("unexpected-end-tag-before-head", tag_name=tag_name)
            return None

    # Implied <head>: covers EOF, text, remaining start tags and the four
    # end tags that are not ignored.
    self.head_element = self._insert_phantom("head")
    self.mode = InsertionMode.IN_HEAD
    return ("reprocess", InsertionMode.IN_HEAD, token)
134
+
135
def _mode_in_head(self, token):
    """Handle a token in the IN_HEAD insertion mode.

    Returns ``None`` when the token is fully handled here. For any token
    not handled here the current node is popped, the mode becomes
    AFTER_HEAD and a reprocess tuple for that mode is returned.
    """
    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data):
            self._append_text(token.data)
            return None
        text = token.data or ""
        rest = text.lstrip("\t\n\f\r ")
        indent = text[: len(text) - len(rest)]
        if indent:
            # Leading whitespace is kept only when the current node
            # already has children.
            top = self.open_elements[-1] if self.open_elements else None
            if top is not None and top.has_child_nodes():
                self._append_text(indent)
        self._pop_current()
        self.mode = InsertionMode.AFTER_HEAD
        return ("reprocess", InsertionMode.AFTER_HEAD, CharacterTokens(rest))

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            # NOTE(review): a <html> start tag is not special-cased here; it
            # takes the generic "pop and reprocess in AFTER_HEAD" path below.
            if tag_name in {"base", "basefont", "bgsound", "link", "meta"}:
                self._insert_element(token, push=False)
                return None
            if tag_name == "template":
                self._insert_element(token, push=True)
                self._push_formatting_marker()
                self.frameset_ok = False
                self.mode = InsertionMode.IN_TEMPLATE
                self.template_modes.append(InsertionMode.IN_TEMPLATE)
                return None
            if tag_name in {"title", "style", "script", "noframes"}:
                self._insert_element(token, push=True)
                # Remember where to return once the text content ends.
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
            if tag_name == "noscript":
                # Scripting is disabled: noscript content is parsed as HTML.
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_HEAD_NOSCRIPT
                return None
        elif token.kind == Tag.END:
            if tag_name == "template":
                # Plain stack membership rather than a scope check
                # (per the original note, a table would block the scope check).
                if all(node.name != "template" for node in self.open_elements):
                    return None
                self._generate_implied_end_tags()
                self._pop_until_inclusive("template")
                self._clear_active_formatting_up_to_marker()
                self.template_modes.pop()
                self._reset_insertion_mode()
                return None
            if tag_name == "head":
                self._pop_current()
                self.mode = InsertionMode.AFTER_HEAD
                return None

    # <html> start tag, </body>, </html>, </br>, EOF and every remaining
    # token: leave the head and reprocess.
    self._pop_current()
    self.mode = InsertionMode.AFTER_HEAD
    return ("reprocess", InsertionMode.AFTER_HEAD, token)
210
+
211
def _mode_in_head_noscript(self, token):
    """Handle a token in the 'in head noscript' insertion mode.

    Whitespace, comments and a fixed set of head-level start tags are
    delegated to ``_mode_in_head``; a ``<html>`` start tag is delegated to
    ``_mode_in_body``. ``</noscript>`` pops the element and returns to
    IN_HEAD. Nested ``<head>``/``<noscript>`` start tags and unknown end tags
    are reported and ignored. Everything else pops the current node and
    returns a reprocess tuple for IN_HEAD.
    """
    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data or ""):
            return self._mode_in_head(token)
        self._parse_error("unexpected-start-tag", tag_name="text")
    elif isinstance(token, CommentToken):
        return self._mode_in_head(token)
    elif isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "html":
                return self._mode_in_body(token)
            if tag_name in {"basefont", "bgsound", "link", "meta", "noframes", "style"}:
                return self._mode_in_head(token)
            self._parse_error("unexpected-start-tag", tag_name=tag_name)
            if tag_name in {"head", "noscript"}:
                return None  # reported, then ignored
        else:
            if tag_name == "noscript":
                self._pop_current()
                self.mode = InsertionMode.IN_HEAD
                return None
            self._parse_error("unexpected-end-tag", tag_name=tag_name)
            if tag_name != "br":
                return None  # reported, then ignored
    elif isinstance(token, EOFToken):
        self._parse_error("expected-closing-tag-but-got-eof", tag_name="noscript")
    else:
        # Every token type is covered by the branches above.
        return None  # pragma: no cover

    # Shared exit: pop the current node (noscript) and reprocess in IN_HEAD.
    self._pop_current()
    self.mode = InsertionMode.IN_HEAD
    return ("reprocess", InsertionMode.IN_HEAD, token)
259
+
260
def _mode_after_head(self, token):
    """Handle a token in the AFTER_HEAD insertion mode.

    Returns ``None`` for tokens consumed here, the result of
    ``_mode_in_head`` for head-level tags, or a reprocess tuple. The default
    action is to insert a body if missing and reprocess in IN_BODY.
    """
    if isinstance(token, CharacterTokens):
        text = token.data or ""
        # NUL and form-feed are each reported and stripped.
        for unwanted in ("\x00", "\x0c"):
            if unwanted in text:
                self._parse_error("invalid-codepoint-in-body")
                text = text.replace(unwanted, "")
        if not text:
            return None
        if is_all_whitespace(text):
            self._append_text(text)
            return None
        self._insert_body_if_missing()
        return ("reprocess", InsertionMode.IN_BODY, CharacterTokens(text))

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "body":
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_BODY
                self.frameset_ok = False
                return None
            if tag_name == "frameset":
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_FRAMESET
                return None
            if tag_name == "input":
                declared_type = next(
                    ((value or "").lower() for key, value in token.attrs.items() if key == "type"),
                    None,
                )
                if declared_type == "hidden":
                    # Reported and dropped: no body is created and nothing is inserted.
                    self._parse_error("unexpected-hidden-input-after-head")
                    return None
                # Any other input takes the default path below.
            elif tag_name in {
                "base",
                "basefont",
                "bgsound",
                "link",
                "meta",
                "title",
                "style",
                "script",
                "noscript",
            }:
                # Put head back on the stack just for the in-head handling.
                self.open_elements.append(self.head_element)
                outcome = self._mode_in_head(token)
                # remove() by identity: head may no longer be the last entry
                # if the handler pushed another element (e.g. <title>).
                self.open_elements.remove(self.head_element)
                return outcome
            elif tag_name == "template":
                # Head stays on the stack; IN_HEAD handling takes over from here.
                self.open_elements.append(self.head_element)
                self.mode = InsertionMode.IN_HEAD
                return ("reprocess", InsertionMode.IN_HEAD, token)
        elif token.kind == Tag.END:
            if tag_name == "template":
                return self._mode_in_head(token)
            if tag_name not in {"body", "html", "br"}:
                self._parse_error("unexpected-end-tag-after-head", tag_name=tag_name)
                return None
    elif isinstance(token, EOFToken):
        self._insert_body_if_missing()
        self.mode = InsertionMode.IN_BODY
        return ("reprocess", InsertionMode.IN_BODY, token)

    # Default: <html>, non-hidden <input>, </body>, </html>, </br> and any
    # other start tag.
    self._insert_body_if_missing()
    return ("reprocess", InsertionMode.IN_BODY, token)
348
+
349
def _mode_text(self, token):
    """Handle a token in the TEXT insertion mode.

    Character data is appended and ``None`` returned. Any other token pops
    the current node and restores ``original_mode`` (IN_BODY when that is
    falsy). EOF additionally reports the unclosed element and returns a
    reprocess tuple for the restored mode; an end tag returns ``None``.
    """
    if isinstance(token, CharacterTokens):
        self._append_text(token.data)
        return None

    hit_eof = isinstance(token, EOFToken)
    if hit_eof:
        # Name the element that was left open, if the stack has one.
        unclosed = self.open_elements[-1].name if self.open_elements else None
        self._parse_error("expected-named-closing-tag-but-got-eof", tag_name=unclosed)

    self._pop_current()
    self.mode = self.original_mode or InsertionMode.IN_BODY
    if hit_eof:
        return ("reprocess", self.mode, token)
    return None
364
+
365
+ def _mode_in_body(self, token):
366
+ handler = self._BODY_TOKEN_HANDLERS.get(type(token))
367
+ return handler(self, token) if handler else None
368
+
369
def _handle_characters_in_body(self, token):
    """Insert character data while in body mode.

    NUL characters are reported as "invalid-codepoint" and stripped. Active
    formatting elements are reconstructed before the text is appended. Text
    that is not all whitespace also clears ``frameset_ok``.
    """
    text = token.data or ""
    if "\x00" in text:
        self._parse_error("invalid-codepoint")
        text = text.replace("\x00", "")

    only_whitespace = is_all_whitespace(text)
    self._reconstruct_active_formatting_elements()
    if not only_whitespace:
        self.frameset_ok = False
    self._append_text(text)
382
+
383
+ def _handle_comment_in_body(self, token):
384
+ self._append_comment(token.data)
385
+ return
386
+
387
def _handle_tag_in_body(self, token):
    """Route a start or end tag seen in body mode to its handler.

    Start tags go to the ``_BODY_START_HANDLERS`` entry for the tag name, or
    to ``_handle_body_start_default``. End tags are checked in this order:
    ``</br>`` (reprocessed as a ``<br>`` start tag), formatting elements
    (adoption agency), the ``_BODY_END_HANDLERS`` entry, then
    ``_any_other_end_tag``.
    """
    tag_name = token.name

    if token.kind == Tag.START:
        start_handler = self._BODY_START_HANDLERS.get(tag_name)
        if not start_handler:
            return self._handle_body_start_default(token)
        return start_handler(self, token)

    if tag_name == "br":
        # </br> is reported, then treated as a <br> start tag.
        self._parse_error("unexpected-end-tag", tag_name=tag_name, token=token)
        return self._mode_in_body(Tag(Tag.START, "br", {}, False))

    if tag_name in FORMATTING_ELEMENTS:
        self._adoption_agency(tag_name)
        return None

    end_handler = self._BODY_END_HANDLERS.get(tag_name)
    if end_handler:
        return end_handler(self, token)

    self._any_other_end_tag(token.name)
    return None
410
+
411
def _handle_eof_in_body(self, token):
    """Handle end-of-file while in body mode.

    With template modes active the token is delegated to
    ``_mode_in_template``. Otherwise at most one
    "expected-closing-tag-but-got-eof" error is reported, for the first open
    element whose name is not in the allowed set below, and the token is
    returned for reprocessing in AFTER_BODY.
    """
    if self.template_modes:
        return self._mode_in_template(token)

    # Elements that may still be open at EOF without an error.
    may_stay_open = {
        "dd",
        "dt",
        "li",
        "optgroup",
        "option",
        "p",
        "rb",
        "rp",
        "rt",
        "rtc",
        "tbody",
        "td",
        "tfoot",
        "th",
        "thead",
        "tr",
        "body",
        "html",
    }
    offender = next(
        (element for element in self.open_elements if element.name not in may_stay_open),
        None,
    )
    if offender is not None:
        self._parse_error("expected-closing-tag-but-got-eof", tag_name=offender.name)

    self.mode = InsertionMode.AFTER_BODY
    return ("reprocess", InsertionMode.AFTER_BODY, token)
441
+
442
+ # ---------------------
443
+ # Body mode start tag handlers
444
+ # ---------------------
445
+
446
+ def _handle_body_start_html(self, token):
447
+ if self.template_modes:
448
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
449
+ return
450
+ # In IN_BODY mode, html element is always at open_elements[0]
451
+ if self.open_elements: # pragma: no branch
452
+ html = self.open_elements[0]
453
+ self._add_missing_attributes(html, token.attrs)
454
+ return
455
+
456
+ def _handle_body_start_body(self, token):
457
+ if self.template_modes:
458
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
459
+ return
460
+ if len(self.open_elements) > 1:
461
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
462
+ body = self.open_elements[1] if len(self.open_elements) > 1 else None
463
+ if body and body.name == "body":
464
+ self._add_missing_attributes(body, token.attrs)
465
+ self.frameset_ok = False
466
+ return
467
+ self.frameset_ok = False
468
+ return
469
+
470
+ def _handle_body_start_head(self, token):
471
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
472
+ return
473
+
474
+ def _handle_body_start_in_head(self, token):
475
+ return self._mode_in_head(token)
476
+
477
+ def _handle_body_start_block_with_p(self, token):
478
+ self._close_p_element()
479
+ self._insert_element(token, push=True)
480
+ return
481
+
482
def _handle_body_start_heading(self, token):
    """Insert a heading start tag in body mode.

    ``_close_p_element`` is called first. If the current node is itself a
    heading, the new tag is reported as unexpected and the current node is
    popped. The heading is then inserted and pushed, and ``frameset_ok``
    is cleared.
    """
    self._close_p_element()

    stack = self.open_elements
    if stack and stack[-1].name in HEADING_ELEMENTS:
        # Headings do not nest: report the tag and drop the open heading.
        self._parse_error("unexpected-start-tag", tag_name=token.name)
        self._pop_current()

    self._insert_element(token, push=True)
    self.frameset_ok = False
490
+
491
+ def _handle_body_start_pre_listing(self, token):
492
+ self._close_p_element()
493
+ self._insert_element(token, push=True)
494
+ self.ignore_lf = True
495
+ self.frameset_ok = False
496
+ return
497
+
498
+ def _handle_body_start_form(self, token):
499
+ if self.form_element is not None:
500
+ self._parse_error("unexpected-start-tag", tag_name=token.name)
501
+ return
502
+ self._close_p_element()
503
+ node = self._insert_element(token, push=True)
504
+ self.form_element = node
505
+ self.frameset_ok = False
506
+ return
507
+
508
+ def _handle_body_start_button(self, token):
509
+ if self._has_in_scope("button"):
510
+ self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
511
+ self._close_element_by_name("button")
512
+ self._insert_element(token, push=True)
513
+ self.frameset_ok = False
514
+ return
515
+
516
+ def _handle_body_start_paragraph(self, token):
517
+ self._close_p_element()
518
+ self._insert_element(token, push=True)
519
+ return
520
+
521
def _handle_body_start_math(self, token):
    """Insert a start tag as an element in the "math" namespace.

    Active formatting elements are reconstructed first. The attributes are
    passed through ``_prepare_foreign_attributes("math", ...)`` and a new
    start ``Tag`` is built from them. The element is pushed onto the stack
    only when the token is not self-closing.
    """
    self._reconstruct_active_formatting_elements()
    foreign_attrs = self._prepare_foreign_attributes("math", token.attrs)
    foreign_tag = Tag(Tag.START, token.name, foreign_attrs, token.self_closing)
    keep_open = not token.self_closing
    self._insert_element(foreign_tag, push=keep_open, namespace="math")
527
+
528
def _handle_body_start_svg(self, token):
    """Insert a start tag as an element in the "svg" namespace.

    Active formatting elements are reconstructed first. The tag name goes
    through ``_adjust_svg_tag_name`` and the attributes through
    ``_prepare_foreign_attributes("svg", ...)``. The element is pushed onto
    the stack only when the token is not self-closing.
    """
    self._reconstruct_active_formatting_elements()
    svg_name = self._adjust_svg_tag_name(token.name)
    foreign_attrs = self._prepare_foreign_attributes("svg", token.attrs)
    foreign_tag = Tag(Tag.START, svg_name, foreign_attrs, token.self_closing)
    keep_open = not token.self_closing
    self._insert_element(foreign_tag, push=keep_open, namespace="svg")
535
+
536
+ def _handle_body_start_li(self, token):
537
+ self.frameset_ok = False
538
+ self._close_p_element()
539
+ if self._has_in_list_item_scope("li"):
540
+ self._pop_until_any_inclusive({"li"})
541
+ self._insert_element(token, push=True)
542
+ return
543
+
544
+ def _handle_body_start_dd_dt(self, token):
545
+ self.frameset_ok = False
546
+ self._close_p_element()
547
+ name = token.name
548
+ if name == "dd":
549
+ if self._has_in_definition_scope("dd"):
550
+ self._pop_until_any_inclusive({"dd"})
551
+ if self._has_in_definition_scope("dt"):
552
+ self._pop_until_any_inclusive({"dt"})
553
+ else:
554
+ if self._has_in_definition_scope("dt"):
555
+ self._pop_until_any_inclusive({"dt"})
556
+ if self._has_in_definition_scope("dd"):
557
+ self._pop_until_any_inclusive({"dd"})
558
+ self._insert_element(token, push=True)
559
+ return
560
+
561
def _adoption_agency(self, subject):
    """Run the adoption agency steps for the formatting end tag named *subject*.

    Mutates ``self.open_elements``, ``self.active_formatting`` and the DOM
    nodes they reference. Returns ``None``. The numbered comments below are
    the step labels used by the original author.
    """
    # 1. If the current node is the subject, and it is not in the active formatting elements list...
    if self.open_elements and self.open_elements[-1].name == subject:
        if not self._has_active_formatting_entry(subject):
            self._pop_until_inclusive(subject)
            return

    # 2. Outer loop (bounded to 8 iterations)
    for _ in range(8):
        # 3. Find formatting element
        formatting_element_index = self._find_active_formatting_index(subject)
        if formatting_element_index is None:
            return

        formatting_element_entry = self.active_formatting[formatting_element_index]
        formatting_element = formatting_element_entry["node"]

        # 4. If formatting element is not in open elements
        if formatting_element not in self.open_elements:
            self._parse_error("adoption-agency-1.3")
            self._remove_formatting_entry(formatting_element_index)
            return

        # 5. If formatting element is in open elements but not in scope
        if not self._has_element_in_scope(formatting_element.name):
            self._parse_error("adoption-agency-1.3")
            return

        # 6. If formatting element is not the current node
        # (reported only; processing continues)
        if formatting_element is not self.open_elements[-1]:
            self._parse_error("adoption-agency-1.3")

        # 7. Find furthest block: the first special element after the
        # formatting element on the stack
        furthest_block = None
        formatting_element_in_open_index = self.open_elements.index(formatting_element)

        for i in range(formatting_element_in_open_index + 1, len(self.open_elements)):
            node = self.open_elements[i]
            if self._is_special_element(node):
                furthest_block = node
                break

        if furthest_block is None:
            # No furthest block: pop through the formatting element and drop
            # its active-formatting entry.
            # formatting_element is known to be on the stack
            while True:
                popped = self.open_elements.pop()
                if popped is formatting_element:
                    break
            self._remove_formatting_entry(formatting_element_index)
            return

        # 8. Bookmark
        bookmark = formatting_element_index + 1

        # 9. Node and Last Node
        node = furthest_block
        last_node = furthest_block

        # 10. Inner loop
        inner_loop_counter = 0
        while True:
            inner_loop_counter += 1

            # 10.1 Node = element above node
            node_index = self.open_elements.index(node)
            node = self.open_elements[node_index - 1]

            # 10.2 If node is formatting element, break
            if node is formatting_element:
                break

            # 10.3 Find active formatting entry for node
            node_formatting_index = self._find_active_formatting_index_by_node(node)

            # After three iterations, entries found for node are removed
            # instead of being cloned.
            if inner_loop_counter > 3 and node_formatting_index is not None:
                self._remove_formatting_entry(node_formatting_index)
                if node_formatting_index < bookmark:
                    bookmark -= 1
                node_formatting_index = None

            if node_formatting_index is None:
                # Node has no active-formatting entry: take it off the stack.
                # The element now at its old index becomes the reference for
                # the next "element above" lookup.
                node_index = self.open_elements.index(node)
                self.open_elements.remove(node)
                node = self.open_elements[node_index]
                continue

            # 10.4 Replace entry with new element
            entry = self.active_formatting[node_formatting_index]
            new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
            entry["node"] = new_element
            self.open_elements[self.open_elements.index(node)] = new_element
            node = new_element

            # 10.5 If last node is furthest block, update bookmark
            if last_node is furthest_block:
                bookmark = node_formatting_index + 1

            # 10.6 Reparent last_node
            if last_node.parent:
                last_node.parent.remove_child(last_node)
            node.append_child(last_node)

            # 10.7
            last_node = node

        # 11. Insert last_node into common ancestor
        common_ancestor = self.open_elements[formatting_element_in_open_index - 1]
        if last_node.parent:
            last_node.parent.remove_child(last_node)

        if self._should_foster_parenting(common_ancestor, for_tag=last_node.name):
            parent, position = self._appropriate_insertion_location(common_ancestor, foster_parenting=True)
            self._insert_node_at(parent, position, last_node)
        else:
            # A TemplateNode with truthy template_content receives the child
            # in that content rather than directly.
            if type(common_ancestor) is TemplateNode and common_ancestor.template_content:
                common_ancestor.template_content.append_child(last_node)
            else:
                common_ancestor.append_child(last_node)

        # 12. Create new formatting element
        entry = self.active_formatting[formatting_element_index]
        new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
        entry["node"] = new_formatting_element

        # 13. Move children of furthest block
        while furthest_block.has_child_nodes():
            child = furthest_block.children[0]
            furthest_block.remove_child(child)
            new_formatting_element.append_child(child)

        furthest_block.append_child(new_formatting_element)

        # 14. Remove formatting element from active formatting and insert new at bookmark
        # Per spec, bookmark is always > formatting_element_index (starts at fmt_idx+1,
        # can only be set to higher values or decremented when entries above fmt_idx are removed)
        self._remove_formatting_entry(formatting_element_index)
        # The removal shifted later entries down by one.
        bookmark -= 1
        self.active_formatting.insert(bookmark, entry)

        # 15. Remove formatting element from open elements and insert new one
        self.open_elements.remove(formatting_element)
        furthest_block_index = self.open_elements.index(furthest_block)
        self.open_elements.insert(furthest_block_index + 1, new_formatting_element)
704
+
705
+ def _handle_body_start_a(self, token):
706
+ if self._has_active_formatting_entry("a"):
707
+ self._adoption_agency("a")
708
+ self._remove_last_active_formatting_by_name("a")
709
+ self._remove_last_open_element_by_name("a")
710
+ self._reconstruct_active_formatting_elements()
711
+ node = self._insert_element(token, push=True)
712
+ self._append_active_formatting_entry("a", token.attrs, node)
713
+ return
714
+
715
+ def _handle_body_start_formatting(self, token):
716
+ name = token.name
717
+ if name == "nobr" and self._in_scope("nobr"):
718
+ self._adoption_agency("nobr")
719
+ self._remove_last_active_formatting_by_name("nobr")
720
+ self._remove_last_open_element_by_name("nobr")
721
+ self._reconstruct_active_formatting_elements()
722
+ duplicate_index = self._find_active_formatting_duplicate(name, token.attrs)
723
+ if duplicate_index is not None:
724
+ self._remove_formatting_entry(duplicate_index)
725
+ node = self._insert_element(token, push=True)
726
+ self._append_active_formatting_entry(name, token.attrs, node)
727
+ return
728
+
729
+ def _handle_body_start_applet_like(self, token):
730
+ self._reconstruct_active_formatting_elements()
731
+ self._insert_element(token, push=True)
732
+ self._push_formatting_marker()
733
+ self.frameset_ok = False
734
+ return
735
+
736
+ def _handle_body_start_br(self, token):
737
+ self._close_p_element()
738
+ self._reconstruct_active_formatting_elements()
739
+ self._insert_element(token, push=False)
740
+ self.frameset_ok = False
741
+ return
742
+
743
+ def _handle_body_start_frameset(self, token):
744
+ if not self.frameset_ok:
745
+ self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
746
+ return
747
+ # Find body element on the stack (may not exist if already in frameset)
748
+ body_index = None
749
+ for i, elem in enumerate(self.open_elements):
750
+ if elem.name == "body":
751
+ body_index = i
752
+ break
753
+ if body_index is None:
754
+ # No body on stack (e.g., nested frameset after mode reset), ignore
755
+ self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
756
+ return
757
+ body_elem = self.open_elements[body_index]
758
+ body_elem.parent.remove_child(body_elem)
759
+ self.open_elements = self.open_elements[:body_index]
760
+ self._insert_element(token, push=True)
761
+ self.mode = InsertionMode.IN_FRAMESET
762
+ return
763
+
764
+ # ---------------------
765
+ # Body mode end tag handlers
766
+ # ---------------------
767
+
768
def _handle_body_end_body(self, token):
    """Switch to AFTER_BODY when a ``body`` element is in scope.

    Otherwise the end tag has no effect. Returns ``None`` either way.
    """
    if not self._in_scope("body"):
        return
    self.mode = InsertionMode.AFTER_BODY
772
+
773
def _handle_body_end_html(self, token):
    """Hand ``</html>`` to AFTER_BODY for reprocessing when ``body`` is in scope.

    Returns the reprocess tuple in that case, otherwise ``None``.
    """
    if not self._in_scope("body"):
        return None
    return ("reprocess", InsertionMode.AFTER_BODY, token)
777
+
778
def _handle_body_end_p(self, token):
    """Handle a ``</p>`` end tag in body mode.

    If ``_close_p_element`` returns a falsy value, the end tag is reported
    as unexpected, an empty ``p`` element is inserted and pushed, and
    ``_close_p_element`` is called again.
    """
    if self._close_p_element():
        return

    self._parse_error("unexpected-end-tag", tag_name=token.name)
    # Synthesize the missing <p> so the end tag has something to close.
    self._insert_element(Tag(Tag.START, "p", {}, False), push=True)
    self._close_p_element()
785
+
786
+ def _handle_body_end_li(self, token):
787
+ if not self._has_in_list_item_scope("li"):
788
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
789
+ return
790
+ self._pop_until_any_inclusive({"li"})
791
+ return
792
+
793
+ def _handle_body_end_dd_dt(self, token):
794
+ name = token.name
795
+ if not self._has_in_definition_scope(name):
796
+ self._parse_error("unexpected-end-tag", tag_name=name)
797
+ return
798
+ self._pop_until_any_inclusive({"dd", "dt"})
799
+
800
+ def _handle_body_end_form(self, token):
801
+ if self.form_element is None:
802
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
803
+ return
804
+ removed = self._remove_from_open_elements(self.form_element)
805
+ self.form_element = None
806
+ if not removed:
807
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
808
+ return
809
+
810
+ def _handle_body_end_applet_like(self, token):
811
+ name = token.name
812
+ if not self._in_scope(name):
813
+ self._parse_error("unexpected-end-tag", tag_name=name)
814
+ return
815
+ # Element verified in scope above
816
+ while self.open_elements: # pragma: no branch
817
+ popped = self.open_elements.pop()
818
+ if popped.name == name:
819
+ break
820
+ self._clear_active_formatting_up_to_marker()
821
+ return
822
+
823
def _handle_body_end_heading(self, token):
    """Handle a heading end tag in body mode.

    When no heading element is in scope the tag is reported as unexpected
    and ignored. Otherwise implied end tags are generated, a mismatch
    between the current node and the tag name is reported as
    "end-tag-too-early", and elements are popped until a heading of any
    level has been popped.
    """
    tag_name = token.name
    if not self._has_any_in_scope(HEADING_ELEMENTS):
        self._parse_error("unexpected-end-tag", tag_name=tag_name)
        return

    self._generate_implied_end_tags()

    if self.open_elements and self.open_elements[-1].name != tag_name:
        self._parse_error("end-tag-too-early", tag_name=tag_name)

    # Any heading level ends the popping; the scope check above found one.
    while self.open_elements and self.open_elements.pop().name not in HEADING_ELEMENTS:  # pragma: no branch
        pass
837
+
838
def _handle_body_end_block(self, token):
    """Handle a block-level end tag while in the "in body" insertion mode.

    Ignored with an ``unexpected-end-tag`` parse error when the element is not
    in scope. Otherwise implied end tags are generated, an
    ``end-tag-too-early`` parse error is recorded if the current node is a
    different element, and the stack is popped through the named element.
    """
    tag = token.name
    if not self._in_scope(tag):
        self._parse_error("unexpected-end-tag", tag_name=tag)
        return
    self._generate_implied_end_tags()
    stack = self.open_elements
    if stack and stack[-1].name != tag:
        self._parse_error("end-tag-too-early", tag_name=tag)
    self._pop_until_any_inclusive({tag})
848
+
849
def _handle_body_end_template(self, token):
    """Handle a ``</template>`` end tag.

    Does nothing when no ``template`` element is on the open-element stack.
    Otherwise: generate implied end tags, pop through the ``template``, clear
    the active formatting list to the last marker, drop the innermost template
    insertion mode (if any), and reset the insertion mode.
    """
    for open_node in self.open_elements:
        if open_node.name == "template":
            break
    else:
        return
    self._generate_implied_end_tags()
    self._pop_until_inclusive("template")
    self._clear_active_formatting_up_to_marker()
    # Discard the template's own insertion mode when one was recorded.
    if self.template_modes:  # pragma: no branch
        self.template_modes.pop()
    self._reset_insertion_mode()
861
+
862
def _handle_body_start_structure_ignored(self, token):
    """Record an ``unexpected-start-tag-ignored`` parse error and drop the token."""
    tag = token.name
    self._parse_error("unexpected-start-tag-ignored", tag_name=tag)
865
+
866
def _handle_body_start_col_or_frame(self, token):
    """Handle a column or frame start tag seen in the "in body" insertion mode.

    When ``self.fragment_context`` is ``None`` the tag is a parse error and is
    ignored; otherwise the element is inserted without being pushed onto the
    open-element stack.
    """
    if self.fragment_context is not None:
        self._insert_element(token, push=False)
    else:
        self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
872
+
873
def _handle_body_start_image(self, token):
    """Treat an ``<image>`` start tag as ``<img>``.

    Records an ``image-start-tag`` parse error, then inserts an ``img``
    element carrying the original token's attributes and self-closing flag.
    The element is not pushed onto the stack, and ``frameset_ok`` is cleared.
    """
    self._parse_error("image-start-tag", tag_name=token.name)
    replacement = Tag(Tag.START, "img", token.attrs, token.self_closing)
    self._reconstruct_active_formatting_elements()
    self._insert_element(replacement, push=False)
    self.frameset_ok = False
880
+
881
def _handle_body_start_void_with_formatting(self, token):
    """Insert a void element after reconstructing active formatting elements.

    The element is not pushed onto the open-element stack, and
    ``frameset_ok`` is cleared.
    """
    self._reconstruct_active_formatting_elements()
    self._insert_element(token, push=False)
    self.frameset_ok = False
886
+
887
def _handle_body_start_simple_void(self, token):
    """Insert a void element without pushing it onto the open-element stack."""
    self._insert_element(token, push=False)
890
+
891
def _handle_body_start_input(self, token):
    """Insert an ``<input>`` element seen in the "in body" insertion mode.

    The element is inserted without being pushed. ``frameset_ok`` is cleared
    unless the token has a ``type`` attribute whose lower-cased value is
    ``"hidden"`` (a missing or falsy value counts as not hidden).
    """
    attrs = token.attrs
    is_hidden = "type" in attrs and (attrs["type"] or "").lower() == "hidden"
    self._insert_element(token, push=False)
    if not is_hidden:
        self.frameset_ok = False
901
+
902
def _handle_body_start_table(self, token):
    """Open a ``<table>`` element and switch to the "in table" insertion mode.

    An open paragraph is closed first unless the document is in quirks mode.
    The table is pushed onto the stack and ``frameset_ok`` is cleared.
    """
    in_quirks = self.quirks_mode == "quirks"
    if not in_quirks:
        self._close_p_element()
    self._insert_element(token, push=True)
    self.frameset_ok = False
    self.mode = InsertionMode.IN_TABLE
909
+
910
def _handle_body_start_plaintext_xmp(self, token):
    """Open a raw-text style element such as ``<plaintext>`` or ``<xmp>``.

    Closes an open paragraph, pushes the element and clears ``frameset_ok``.
    For ``plaintext`` the tokenizer override is set to
    ``TokenSinkResult.Plaintext``; for any other tag the current mode is saved
    in ``original_mode`` and the builder switches to the text insertion mode.
    """
    self._close_p_element()
    self._insert_element(token, push=True)
    self.frameset_ok = False
    if token.name != "plaintext":
        # xmp, iframe, noembed, noframes, noscript (scripting disabled)
        self.original_mode = self.mode
        self.mode = InsertionMode.TEXT
        return
    self.tokenizer_state_override = TokenSinkResult.Plaintext
921
+
922
def _handle_body_start_textarea(self, token):
    """Open a ``<textarea>`` element.

    The element is pushed onto the stack, ``ignore_lf`` is set so that a
    leading line feed can be skipped, and ``frameset_ok`` is cleared.
    """
    self._insert_element(token, push=True)
    self.ignore_lf, self.frameset_ok = True, False
927
+
928
def _handle_body_start_select(self, token):
    """Open a ``<select>`` element and let the builder pick the matching mode.

    Active formatting elements are reconstructed, the element is pushed,
    ``frameset_ok`` is cleared, and the insertion mode is reset.
    """
    self._reconstruct_active_formatting_elements()
    self._insert_element(token, push=True)
    self.frameset_ok = False
    self._reset_insertion_mode()
934
+
935
def _handle_body_start_option(self, token):
    """Open an ``<option>`` element, implicitly closing an open ``option``.

    If the current node is an ``option`` it is popped first. Active formatting
    elements are then reconstructed and the new element is pushed.
    """
    stack = self.open_elements
    if stack and stack[-1].name == "option":
        stack.pop()
    self._reconstruct_active_formatting_elements()
    self._insert_element(token, push=True)
941
+
942
def _handle_body_start_optgroup(self, token):
    """Open an ``<optgroup>`` element, implicitly closing an open ``option``.

    If the current node is an ``option`` it is popped first. Active formatting
    elements are then reconstructed and the new element is pushed.
    """
    stack = self.open_elements
    if stack and stack[-1].name == "option":
        stack.pop()
    self._reconstruct_active_formatting_elements()
    self._insert_element(token, push=True)
948
+
949
def _handle_body_start_rp_rt(self, token):
    """Open an ``<rp>`` or ``<rt>`` element.

    Implied end tags are generated first, leaving any open ``rtc`` in place,
    and the new element is then pushed onto the stack.
    """
    self._generate_implied_end_tags(exclude="rtc")
    self._insert_element(token, push=True)
953
+
954
def _handle_body_start_rb_rtc(self, token):
    """Open an ``<rb>`` or ``<rtc>`` element.

    When the current node is one of ``rb``, ``rp``, ``rt`` or ``rtc``, implied
    end tags are generated before the new element is pushed.
    """
    ruby_parts = {"rb", "rp", "rt", "rtc"}
    stack = self.open_elements
    if stack and stack[-1].name in ruby_parts:
        self._generate_implied_end_tags()
    self._insert_element(token, push=True)
959
+
960
def _handle_body_start_table_parse_error(self, token):
    """Record an ``unexpected-start-tag`` parse error and drop the token."""
    tag = token.name
    self._parse_error("unexpected-start-tag", tag_name=tag)
963
+
964
def _handle_body_start_default(self, token):
    """Insert an ordinary element for a start tag with no dedicated handler.

    Active formatting elements are reconstructed and the element is pushed.
    A trailing solidus on the tag is reported as a parse error, and
    ``frameset_ok`` is always cleared.
    """
    self._reconstruct_active_formatting_elements()
    self._insert_element(token, push=True)
    has_trailing_solidus = token.self_closing
    if has_trailing_solidus:
        self._parse_error("non-void-html-element-start-tag-with-trailing-solidus", tag_name=token.name)
    # Elements reaching here have no handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
    self.frameset_ok = False
972
+
973
def _mode_in_table(self, token):
    """Process one token under the "in table" insertion mode.

    Returns ``None`` when the token has been consumed, a
    ``("reprocess", mode, token)`` tuple when it must be handled again, or
    whatever the delegated mode handler (``_mode_in_head``, ``_mode_in_body``,
    ``_mode_in_template``) returns.
    """
    if isinstance(token, CharacterTokens):
        data = token.data or ""
        if "\x00" in data:
            self._parse_error("unexpected-null-character")
            data = data.replace("\x00", "")
            if not data:
                return None
            token = CharacterTokens(data)
        # Start collecting table text; remember the mode to come back to.
        self.pending_table_text = []
        self.table_text_original_mode = self.mode
        self.mode = InsertionMode.IN_TABLE_TEXT
        return ("reprocess", InsertionMode.IN_TABLE_TEXT, token)
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "caption":
                self._clear_stack_until({"table", "template", "html"})
                self._push_formatting_marker()
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_CAPTION
                return None
            if name == "colgroup":
                self._clear_stack_until({"table", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_COLUMN_GROUP
                return None
            if name == "col":
                # A bare <col> gets an implied <colgroup> parent, then is reprocessed.
                self._clear_stack_until({"table", "template", "html"})
                implied = Tag(Tag.START, "colgroup", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_COLUMN_GROUP
                return ("reprocess", InsertionMode.IN_COLUMN_GROUP, token)
            if name in {"tbody", "tfoot", "thead"}:
                self._clear_stack_until({"table", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_TABLE_BODY
                return None
            if name in {"td", "th", "tr"}:
                # Rows and cells get an implied <tbody> parent, then are reprocessed.
                self._clear_stack_until({"table", "template", "html"})
                implied = Tag(Tag.START, "tbody", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_TABLE_BODY
                return ("reprocess", InsertionMode.IN_TABLE_BODY, token)
            if name == "table":
                # A nested <table> start tag closes the current table first.
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                closed = self._close_table_element()
                if closed:
                    return ("reprocess", self.mode, token)
                return None
            if name in {"style", "script"}:
                # Per HTML5 spec: style and script are inserted directly into the table
                # (not processed as in-head which would move them)
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
            if name == "template":
                # Template is handled by delegating to IN_HEAD
                return self._mode_in_head(token)
            if name == "input":
                input_type = None
                for attr_name, attr_value in token.attrs.items():
                    if attr_name == "type":
                        input_type = (attr_value or "").lower()
                        break
                if input_type == "hidden":
                    self._parse_error("unexpected-hidden-input-in-table")
                    self._insert_element(token, push=True)
                    self.open_elements.pop()  # push=True always adds to stack
                    return None
                # Non-hidden inputs fall through to the generic handling below.
            if name == "form":
                self._parse_error("unexpected-form-in-table")
                if self.form_element is None:
                    node = self._insert_element(token, push=True)
                    self.form_element = node
                    self.open_elements.pop()  # push=True always adds to stack
                return None
            # Anything else: process with the "in body" rules while
            # insert_from_table is set, restoring the flag afterwards.
            self._parse_error("unexpected-start-tag-implies-table-voodoo", tag_name=name)
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
        else:
            if name == "table":
                self._close_table_element()
                return None
            if name in {"body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"}:
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            # Any other end tag: same "in body" delegation as for start tags.
            self._parse_error("unexpected-end-tag-implies-table-voodoo", tag_name=name)
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
    # Per spec, only CharacterTokens, CommentToken, Tag, and EOFToken exist
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    # If we're in a template, handle EOF in template mode first
    if self.template_modes:
        return self._mode_in_template(token)
    if self._has_in_table_scope("table"):
        self._parse_error("expected-closing-tag-but-got-eof", tag_name="table")
    return None
1083
+
1084
def _mode_in_table_text(self, token):
    """Process one token under the "in table text" insertion mode.

    Character data is buffered in ``pending_table_text`` (form feeds are
    reported as a parse error and removed). Any other token flushes the
    buffer, restores the saved insertion mode, and is reprocessed there.
    """
    if not isinstance(token, CharacterTokens):
        self._flush_pending_table_text()
        resume_mode = self.table_text_original_mode or InsertionMode.IN_TABLE
        self.table_text_original_mode = None
        self.mode = resume_mode
        return ("reprocess", resume_mode, token)
    # IN_TABLE mode guarantees non-empty data
    text = token.data
    if "\x0c" in text:
        self._parse_error("invalid-codepoint-in-table-text")
        text = text.replace("\x0c", "")
    if text:
        self.pending_table_text.append(text)
    return None
1099
+
1100
def _mode_in_caption(self, token):
    """Process one token under the "in caption" insertion mode.

    Character data, EOF and most tags are handled with the "in body" rules.
    Table-structure start tags and ``</table>`` close the caption first and
    are then reprocessed under the "in table" insertion mode.
    """
    if isinstance(token, CharacterTokens):
        return self._mode_in_body(token)
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if not isinstance(token, Tag):
        assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
        return self._mode_in_body(token)

    tag = token.name
    if token.kind == Tag.START:
        is_table = tag == "table"
        if not is_table and tag not in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr", "td", "th"}:
            return self._mode_in_body(token)
        self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=tag)
        if self._close_caption_element():
            return ("reprocess", InsertionMode.IN_TABLE, token)
        if is_table:
            # Fragment parsing: no caption on stack - handle in body mode
            return self._mode_in_body(token)
        # Fragment parsing with caption context: caption not on stack, ignore table structure elements
        return None

    if tag == "caption":
        # The result is irrelevant here: the token is consumed either way.
        self._close_caption_element()
        return None
    if tag == "table":
        if not self._close_caption_element():
            return None
        return ("reprocess", InsertionMode.IN_TABLE, token)
    if tag in {"tbody", "tfoot", "thead"}:
        # These elements are never in table scope when in caption -
        # caption closes any open tbody/tfoot/thead when created
        self._parse_error("unexpected-end-tag", tag_name=tag)
        return None
    return self._mode_in_body(token)
1138
+
1139
def _close_caption_element(self):
    """Close the open ``caption`` element, if one is in table scope.

    Returns ``False`` (after recording a parse error) when no caption is in
    table scope. Otherwise generates implied end tags, pops through the
    caption, clears the active formatting list to the last marker, switches
    to the "in table" insertion mode, and returns ``True``.
    """
    if not self._has_in_table_scope("caption"):
        self._parse_error("unexpected-end-tag", tag_name="caption")
        return False
    self._generate_implied_end_tags()
    # The scope check above means a caption is on the stack.
    while self.open_elements:  # pragma: no branch
        if self.open_elements.pop().name == "caption":
            break
    self._clear_active_formatting_up_to_marker()
    self.mode = InsertionMode.IN_TABLE
    return True
1152
+
1153
def _mode_in_column_group(self, token):
    """Process one token under the "in column group" insertion mode.

    Returns ``None`` when the token has been consumed, a
    ``("reprocess", InsertionMode.IN_TABLE, token)`` tuple when the column
    group has been left and the token must be handled again, or whatever the
    delegated mode handler returns.
    """
    current = self.open_elements[-1] if self.open_elements else None
    if isinstance(token, CharacterTokens):
        data = token.data or ""
        # Find first non-whitespace character
        stripped = data.lstrip(" \t\n\r\f")

        if len(stripped) < len(data):
            # Has leading whitespace - insert it
            ws = data[: len(data) - len(stripped)]
            self._append_text(ws)

        if not stripped:
            # Whitespace-only (or empty) data is fully handled by the insertion
            # above. It must not raise a parse error or close the colgroup.
            return None

        # Continue processing non-whitespace with a new token
        non_ws_token = CharacterTokens(stripped)
        if current and current.name == "html":
            # Fragment parsing with colgroup context: drop non-whitespace characters
            # (This is the only way html can be current in IN_COLUMN_GROUP mode)
            self._parse_error("unexpected-characters-in-column-group")
            return None
        # In a template, non-whitespace characters are parse errors - ignore them
        if current and current.name == "template":
            self._parse_error("unexpected-characters-in-template-column-group")
            return None
        self._parse_error("unexpected-characters-in-column-group")
        self._pop_current()
        self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, non_ws_token)
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "html":
                return self._mode_in_body(token)
            if name == "col":
                self._insert_element(token, push=True)
                self.open_elements.pop()  # push=True always adds to stack
                return None
            if name == "template":
                # Template is handled by delegating to IN_HEAD
                return self._mode_in_head(token)
            if name == "colgroup":
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                # Don't pop template element - only pop actual colgroup
                if current and current.name == "colgroup":
                    self._pop_current()
                    self.mode = InsertionMode.IN_TABLE
                    return ("reprocess", InsertionMode.IN_TABLE, token)
                return None
            if (
                self.fragment_context
                and self.fragment_context.tag_name.lower() == "colgroup"
                and not self._has_in_table_scope("table")
            ):
                self._parse_error("unexpected-start-tag-in-column-group", tag_name=name)
                return None
            # Anything else: if we're in a colgroup, pop it and switch to IN_TABLE
            if current and current.name == "colgroup":
                self._pop_current()
                self.mode = InsertionMode.IN_TABLE
                return ("reprocess", InsertionMode.IN_TABLE, token)
            # In template column group context (via <col> in template), ignore non-column content
            # At this point current is template - the only other case after colgroup fragment
            # and colgroup element are handled
            self._parse_error("unexpected-start-tag-in-template-column-group", tag_name=name)
            return None
        if name == "colgroup":
            if current and current.name == "colgroup":
                self._pop_current()
                self.mode = InsertionMode.IN_TABLE
            else:
                self._parse_error("unexpected-end-tag", tag_name=token.name)
            return None
        if name == "col":
            self._parse_error("unexpected-end-tag", tag_name=name)
            return None
        if name == "template":
            # Template end tag needs proper handling
            return self._mode_in_head(token)
        if current and current.name != "html":  # pragma: no branch
            self._pop_current()
            self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, token)
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    if current and current.name == "colgroup":
        self._pop_current()
        self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, token)
    if current and current.name == "template":
        # In template, delegate EOF handling to IN_TEMPLATE
        return self._mode_in_template(token)
    # EOF when current is html (fragment case): nothing left to do.
    return None
1247
+
1248
def _mode_in_table_body(self, token):
    """Process one token under the "in table body" insertion mode.

    Character data, comments, EOF and unrecognised tags are handed to
    ``_mode_in_table``. Returns ``None`` when the token has been consumed, a
    ``("reprocess", mode, token)`` tuple when it must be handled again, or the
    delegated handler's result.
    """
    if isinstance(token, CharacterTokens) or isinstance(token, CommentToken):
        return self._mode_in_table(token)
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "tr":
                self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_ROW
                return None
            if name in {"td", "th"}:
                # A cell directly in a table body gets an implied <tr>, then is reprocessed.
                self._parse_error("unexpected-cell-in-table-body")
                self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
                implied = Tag(Tag.START, "tr", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_ROW
                return ("reprocess", InsertionMode.IN_ROW, token)
            if name in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "table"}:
                current = self.open_elements[-1] if self.open_elements else None
                # When in a template, these tags create invalid structure - treat as "anything else"
                if current and current.name == "template":
                    self._parse_error("unexpected-start-tag-in-template-table-context", tag_name=name)
                    return None
                # In fragment parsing with tbody/tfoot/thead context and no tbody on stack, ignore these tags
                if (
                    self.fragment_context
                    and current
                    and current.name == "html"
                    and self.fragment_context.tag_name.lower() in {"tbody", "tfoot", "thead"}
                ):
                    # NOTE(review): unlike sibling calls, no tag_name is passed here - confirm intended.
                    self._parse_error("unexpected-start-tag")
                    return None
                # Pop tbody/tfoot/thead (stack always has elements here in normal parsing)
                if self.open_elements:
                    self.open_elements.pop()
                    self.mode = InsertionMode.IN_TABLE
                    return ("reprocess", InsertionMode.IN_TABLE, token)
                # Empty stack edge case - go directly to IN_TABLE without reprocess
                self.mode = InsertionMode.IN_TABLE  # pragma: no cover
                return None  # pragma: no cover
            return self._mode_in_table(token)
        # End tags from here on.
        if name in {"tbody", "tfoot", "thead"}:
            if not self._has_in_table_scope(name):
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
            self._pop_current()
            self.mode = InsertionMode.IN_TABLE
            return None
        if name == "table":
            current = self.open_elements[-1] if self.open_elements else None
            # In a template, reject </table> as there's no table element
            if current and current.name == "template":
                self._parse_error("unexpected-end-tag", tag_name=token.name)
                return None
            # In fragment parsing with tbody/tfoot/thead context and no tbody on stack, ignore </table>
            if (
                self.fragment_context
                and current
                and current.name == "html"
                and self.fragment_context.tag_name.lower() in {"tbody", "tfoot", "thead"}
            ):
                self._parse_error("unexpected-end-tag", tag_name=token.name)
                return None
            # Leave the table body section, then let "in table" close the table.
            if current and current.name in {"tbody", "tfoot", "thead"}:
                self.open_elements.pop()
            self.mode = InsertionMode.IN_TABLE
            return ("reprocess", InsertionMode.IN_TABLE, token)
        if name in {"caption", "col", "colgroup", "td", "th", "tr"}:
            self._parse_error("unexpected-end-tag", tag_name=name)
            return None
        return self._mode_in_table(token)
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return self._mode_in_table(token)
1323
+
1324
def _mode_in_row(self, token):
    """Process one token under the "in row" insertion mode.

    Character data, comments and EOF are handed to ``_mode_in_table``.
    Returns ``None`` when the token has been consumed, a
    ``("reprocess", mode, token)`` tuple when it must be handled again, or the
    delegated handler's result.
    """
    if isinstance(token, (CharacterTokens, CommentToken)):
        return self._mode_in_table(token)
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name in {"td", "th"}:
                self._clear_stack_until({"tr", "template", "html"})
                self._insert_element(token, push=True)
                self._push_formatting_marker()
                self.mode = InsertionMode.IN_CELL
                return None
            if name in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr", "table"}:
                if not self._has_in_table_scope("tr"):
                    self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                    return None
                # Close the row, then handle the tag in whatever mode that leaves us in.
                self._end_tr_element()
                return ("reprocess", self.mode, token)
            # Anything else: "in body" rules while insert_from_table is set.
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
        else:
            if name == "tr":
                if not self._has_in_table_scope("tr"):
                    self._parse_error("unexpected-end-tag", tag_name=name)
                    return None
                self._end_tr_element()
                return None
            if name in {"table", "tbody", "tfoot", "thead"}:
                if self._has_in_table_scope(name):
                    self._end_tr_element()
                    return ("reprocess", self.mode, token)
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            # End tags that are always ignored inside a row. The set previously
            # listed "group", which matches no HTML element; "colgroup" is the
            # intended name.
            if name in {"caption", "col", "colgroup", "td", "th"}:
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return self._mode_in_table(token)
1372
+
1373
def _end_tr_element(self):
    """Close the current table row and select the follow-up insertion mode.

    The stack is cleared back to a row context and a ``tr`` on top is popped.
    Inside a template the innermost template mode becomes the insertion mode;
    otherwise the "in table body" mode is used.
    """
    self._clear_stack_until({"tr", "template", "html"})
    stack = self.open_elements
    # Pop tr if on top (may not be if stack was exhausted)
    if stack and stack[-1].name == "tr":
        stack.pop()
    # When in a template, restore template mode; otherwise use IN_TABLE_BODY
    self.mode = self.template_modes[-1] if self.template_modes else InsertionMode.IN_TABLE_BODY
1383
+
1384
def _mode_in_cell(self, token):
    """Process one token under the "in cell" insertion mode.

    Content is handled with the "in body" rules while ``insert_from_table``
    is temporarily cleared; the previous value is restored afterwards.
    Table-structure tags close the cell first and are then reprocessed in the
    resulting mode. Returns ``None``, a ``("reprocess", mode, token)`` tuple,
    or the delegated handler's result.
    """
    if isinstance(token, CharacterTokens):
        previous = self.insert_from_table
        self.insert_from_table = False
        try:
            return self._mode_in_body(token)
        finally:
            self.insert_from_table = previous
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"}:
                if self._close_table_cell():
                    return ("reprocess", self.mode, token)
                # Per spec: if we reach here in IN_CELL mode with no cell to close,
                # we're in a fragment context with td/th as context element and no table structure.
                # Issue parse error and ignore the token.
                self._parse_error("unexpected-start-tag-in-cell-fragment", tag_name=name)
                return None
            previous = self.insert_from_table
            self.insert_from_table = False
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
        else:
            if name in {"td", "th"}:
                if not self._has_in_table_scope(name):
                    self._parse_error("unexpected-end-tag", tag_name=name)
                    return None
                self._end_table_cell(name)
                return None
            if name in {"table", "tbody", "tfoot", "thead", "tr"}:
                # Per HTML5 spec: only close cell if the element is actually in scope
                # Otherwise it's a parse error and we ignore the token
                if not self._has_in_table_scope(name):
                    self._parse_error("unexpected-end-tag", tag_name=name)
                    return None
                self._close_table_cell()
                return ("reprocess", self.mode, token)
            previous = self.insert_from_table
            self.insert_from_table = False
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    # At EOF, close an open cell if there is one and reprocess; otherwise
    # fall back to the "in table" EOF handling.
    if self._close_table_cell():
        return ("reprocess", self.mode, token)
    return self._mode_in_table(token)
1437
+
1438
def _mode_in_select(self, token):
    """Process one token under the "in select" insertion mode.

    Returns ``None`` when the token has been consumed, a
    ``("reprocess", mode, token)`` tuple when it must be handled again, or
    the result of the delegated ``_mode_in_head`` / ``_mode_in_body`` call.
    """
    if isinstance(token, CharacterTokens):
        data = token.data or ""
        # NUL and form-feed characters are reported and stripped before insertion.
        if "\x00" in data:
            self._parse_error("invalid-codepoint-in-select")
            data = data.replace("\x00", "")
        if "\x0c" in data:
            self._parse_error("invalid-codepoint-in-select")
            data = data.replace("\x0c", "")
        if data:
            self._reconstruct_active_formatting_elements()
            self._append_text(data)
        return None
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "html":
                return ("reprocess", InsertionMode.IN_BODY, token)
            if name == "option":
                if self.open_elements and self.open_elements[-1].name == "option":
                    self.open_elements.pop()
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=True)
                return None
            if name == "optgroup":
                # Close an open option, then an open optgroup, before starting the new group.
                if self.open_elements and self.open_elements[-1].name == "option":
                    self.open_elements.pop()
                if self.open_elements and self.open_elements[-1].name == "optgroup":
                    self.open_elements.pop()
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=True)
                return None
            if name == "select":
                # A nested <select> start tag acts like </select>.
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                # select is always in scope in IN_SELECT mode
                self._pop_until_any_inclusive({"select"})
                self._reset_insertion_mode()
                return None
            if name in {"input", "textarea"}:
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                # select is always in scope in IN_SELECT mode
                self._pop_until_any_inclusive({"select"})
                self._reset_insertion_mode()
                return ("reprocess", self.mode, token)
            if name == "keygen":
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=False)
                return None
            if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                # select is always in scope in IN_SELECT mode
                self._pop_until_any_inclusive({"select"})
                self._reset_insertion_mode()
                return ("reprocess", self.mode, token)
            if name in {"script", "template"}:
                return self._mode_in_head(token)
            if name in {"svg", "math"}:
                # For foreign elements, honor the self-closing flag
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=not token.self_closing, namespace=name)
                return None
            if name in FORMATTING_ELEMENTS:
                self._reconstruct_active_formatting_elements()
                node = self._insert_element(token, push=True)
                self._append_active_formatting_entry(name, token.attrs, node)
                return None
            if name == "hr":
                # Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
                if self.open_elements and self.open_elements[-1].name == "option":
                    self.open_elements.pop()
                if self.open_elements and self.open_elements[-1].name == "optgroup":
                    self.open_elements.pop()
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=False)
                return None
            if name == "menuitem":
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=True)
                return None
            # Allow common HTML elements in select (newer spec)
            if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=not token.self_closing)
                return None
            if name in {"br", "img"}:
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=False)
                return None
            if name == "plaintext":
                # Per spec: plaintext element is inserted in select (consumes all remaining text)
                self._reconstruct_active_formatting_elements()
                self._insert_element(token, push=True)
                return None
        # NOTE(review): a start tag not matched above falls through to the
        # end-tag handling below and ends up reported as "unexpected-end-tag" -
        # confirm that this is intended.
        if name == "optgroup":
            if self.open_elements and self.open_elements[-1].name == "option":
                self.open_elements.pop()
            if self.open_elements and self.open_elements[-1].name == "optgroup":
                self.open_elements.pop()
            else:
                self._parse_error("unexpected-end-tag", tag_name=token.name)
            return None
        if name == "option":
            if self.open_elements and self.open_elements[-1].name == "option":
                self.open_elements.pop()
            else:
                self._parse_error("unexpected-end-tag", tag_name=token.name)
            return None
        if name == "select":
            # In IN_SELECT mode, select is always in scope - pop to it
            self._pop_until_any_inclusive({"select"})
            self._reset_insertion_mode()
            return None
        # Handle end tags for allowed HTML elements in select
        if name == "a" or name in FORMATTING_ELEMENTS:
            # select is always on stack in IN_SELECT mode
            select_node = self._find_last_on_stack("select")
            fmt_index = self._find_active_formatting_index(name)
            if fmt_index is not None:
                target = self.active_formatting[fmt_index]["node"]
                if target in self.open_elements:  # pragma: no branch
                    select_index = self.open_elements.index(select_node)
                    target_index = self.open_elements.index(target)
                    # A formatting element opened before the select must not
                    # be closed from inside it.
                    if target_index < select_index:
                        self._parse_error("unexpected-end-tag", tag_name=name)
                        return None
            self._adoption_agency(name)
            return None
        if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
            # Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
            # But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
            select_idx = None
            target_idx = None
            for i, node in enumerate(self.open_elements):
                if node.name == "select" and select_idx is None:
                    select_idx = i
                if node.name == name:
                    target_idx = i  # Track the LAST occurrence
            # Only pop if target exists and is AFTER (or at same level as) select
            # i.e., the target is inside the select or there's no select
            if target_idx is not None and (select_idx is None or target_idx > select_idx):
                while True:
                    popped = self.open_elements.pop()
                    if popped.name == name:
                        break
            else:
                self._parse_error("unexpected-end-tag", tag_name=name)
            return None
        if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
            self._parse_error("unexpected-end-tag", tag_name=name)
            # select is always in scope in IN_SELECT mode
            self._pop_until_any_inclusive({"select"})
            self._reset_insertion_mode()
            return ("reprocess", self.mode, token)
        # Any other end tag: parse error, ignore
        self._parse_error("unexpected-end-tag", tag_name=name)
        return None
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return self._mode_in_body(token)
1599
+
1600
def _mode_in_template(self, token):
    # § The "in template" insertion mode
    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
    #
    # Returns the delegated handler's result, a ("reprocess", mode, token)
    # tuple when the token must be run again under another mode, or None.

    # Text and comments are handed straight to the "in body" handler.
    if isinstance(token, (CharacterTokens, CommentToken)):
        return self._mode_in_body(token)

    if isinstance(token, Tag):
        tag_name = token.name
        # Tags that are processed with the "in head" handler.
        in_head_names = {
            "base",
            "basefont",
            "bgsound",
            "link",
            "meta",
            "noframes",
            "script",
            "style",
            "template",
            "title",
        }

        if token.kind == Tag.START:
            # Pick the mode that replaces the current template mode.
            # Head-related start tags leave the template mode untouched.
            if tag_name in {"caption", "colgroup", "tbody", "tfoot", "thead"}:
                switch_to = InsertionMode.IN_TABLE
            elif tag_name == "col":
                switch_to = InsertionMode.IN_COLUMN_GROUP
            elif tag_name == "tr":
                switch_to = InsertionMode.IN_TABLE_BODY
            elif tag_name in {"td", "th"}:
                switch_to = InsertionMode.IN_ROW
            elif tag_name not in in_head_names:
                # Default: any other start tag switches to IN_BODY.
                switch_to = InsertionMode.IN_BODY
            else:
                switch_to = None

            if switch_to is not None:
                # Swap the top of the template-mode stack, then replay the
                # token under the newly selected mode.
                self.template_modes.pop()
                self.template_modes.append(switch_to)
                self.mode = switch_to
                return ("reprocess", switch_to, token)
        elif token.kind == Tag.END and tag_name == "template":
            return self._mode_in_head(token)

        # Head-related tags (start or end) are processed in InHead.
        if tag_name in in_head_names:
            return self._mode_in_head(token)

    if not isinstance(token, EOFToken):
        # Remaining tags are dropped without further action.
        return None

    # EOF: scan the whole stack for a template (don't use _in_scope as
    # table blocks it).
    if all(node.name != "template" for node in self.open_elements):
        return None

    # Parse error for EOF in template
    self._parse_error("expected-closing-tag-but-got-eof", tag_name="template")
    # Close the template, then let the recomputed mode see the EOF.
    self._pop_until_inclusive("template")
    self._clear_active_formatting_up_to_marker()
    # template_modes is always non-empty when template is on stack
    self.template_modes.pop()
    self._reset_insertion_mode()
    return ("reprocess", self.mode, token)
1678
+
1679
def _mode_after_body(self, token):
    # Handle one token in the "after body" insertion mode.
    #
    # Returns None when the token is consumed here, or a
    # ("reprocess", InsertionMode.IN_BODY, token) tuple when it has to be
    # handled again by the "in body" handler.
    if isinstance(token, CharacterTokens):
        if not is_all_whitespace(token.data):
            return ("reprocess", InsertionMode.IN_BODY, token)
        # Whitespace goes through the InBody handler, but self.mode is
        # left as it is.
        self._mode_in_body(token)
        return None

    if isinstance(token, CommentToken):
        # Comments become children of the first element on the stack.
        root = self.open_elements[0]
        self._append_comment(token.data, parent=root)
        return None

    if isinstance(token, Tag):
        if token.kind == Tag.END and token.name == "html":
            self.mode = InsertionMode.AFTER_AFTER_BODY
            return None
        # Every other tag, <html> start tags included, is replayed in IN_BODY.
        return ("reprocess", InsertionMode.IN_BODY, token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return None
1699
+
1700
def _mode_after_after_body(self, token):
    # Handle one token in the "after after body" insertion mode.
    #
    # Returns None when the token is consumed here, or a
    # ("reprocess", InsertionMode.IN_BODY, token) tuple when it has to be
    # handled again by the "in body" handler.
    if isinstance(token, CharacterTokens):
        if not is_all_whitespace(token.data):
            # Non-whitespace character: parse error, reprocess in IN_BODY
            self._parse_error("unexpected-char-after-body")
            return ("reprocess", InsertionMode.IN_BODY, token)
        # Whitespace goes through the InBody handler, but self.mode is
        # left as it is.
        self._mode_in_body(token)
        return None

    if isinstance(token, CommentToken):
        if self.fragment_context is None:
            self._append_comment_to_document(token.data)
        else:
            # Fragment parsing: attach the comment to the <html> element
            # (html is always on stack in fragment parsing).
            comment = SimpleDomNode("#comment", data=token.data)
            self._find_last_on_stack("html").append_child(comment)
        return None

    if isinstance(token, Tag):
        is_html_start = token.kind == Tag.START and token.name == "html"
        if not is_html_start:
            # Any other tag: parse error before reprocessing in IN_BODY
            self._parse_error("unexpected-token-after-body")
        return ("reprocess", InsertionMode.IN_BODY, token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return None
1726
+
1727
def _mode_in_frameset(self, token):
    # Per HTML5 spec §13.2.6.4.16: In frameset insertion mode
    #
    # Returns None when the token is consumed here, or a
    # ("reprocess", InsertionMode.IN_BODY, token) tuple for <html> start tags.
    if isinstance(token, CharacterTokens):
        # Keep only the whitespace characters; everything else is dropped.
        kept = [ch for ch in token.data if ch in "\t\n\f\r "]
        whitespace = "".join(kept)
        if whitespace:
            self._append_text(whitespace)
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "html":
                return ("reprocess", InsertionMode.IN_BODY, token)
            if tag_name == "frameset":
                self._insert_element(token, push=True)
                return None
            if tag_name == "frame":
                # Insert the element and take it straight off the stack again.
                self._insert_element(token, push=True)
                self.open_elements.pop()
                return None
            if tag_name == "noframes":
                # Insert the element and switch to TEXT, remembering the
                # current mode in original_mode.
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
        elif token.kind == Tag.END and tag_name == "frameset":
            stack = self.open_elements
            if stack and stack[-1].name == "html":
                # Nothing but <html> is open: report and ignore.
                self._parse_error("unexpected-end-tag", tag_name=tag_name)
                return None
            stack.pop()
            # Leave frameset mode once the current node is no longer a frameset.
            if stack and stack[-1].name != "frameset":
                self.mode = InsertionMode.AFTER_FRAMESET
            return None

    if isinstance(token, EOFToken):
        stack = self.open_elements
        if stack and stack[-1].name != "html":
            self._parse_error("expected-closing-tag-but-got-eof", tag_name=stack[-1].name)
        return None

    # Any tag not handled above: parse error, token ignored.
    self._parse_error("unexpected-token-in-frameset")
    return None
1768
+
1769
def _mode_after_frameset(self, token):
    # Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
    #
    # Returns None when the token is consumed (or ignored) here, or a
    # ("reprocess", mode, token) tuple when it has to be handled again under
    # another insertion mode.
    if isinstance(token, CharacterTokens):
        # Only whitespace characters allowed; ignore all others
        whitespace = "".join(ch for ch in token.data if ch in "\t\n\f\r ")
        if whitespace:
            self._append_text(whitespace)
        return None
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None
    if isinstance(token, Tag):
        if token.kind == Tag.START and token.name == "html":
            return ("reprocess", InsertionMode.IN_BODY, token)
        if token.kind == Tag.END and token.name == "html":
            self.mode = InsertionMode.AFTER_AFTER_FRAMESET
            return None
        if token.kind == Tag.START and token.name == "noframes":
            # Insert noframes element directly and switch to TEXT mode
            self._insert_element(token, push=True)
            self.original_mode = self.mode
            self.mode = InsertionMode.TEXT
            return None
    if isinstance(token, EOFToken):
        return None

    # Anything else: parse error.
    self._parse_error("unexpected-token-after-frameset")
    if self.fragment_context is not None:
        # Fragment parsing: the spec never leaves "in frameset" in the
        # fragment case, but _mode_in_frameset switches to this mode
        # regardless. Hand the token back to IN_FRAMESET so fragments keep
        # their existing behaviour.
        self.mode = InsertionMode.IN_FRAMESET
        return ("reprocess", InsertionMode.IN_FRAMESET, token)
    # Document parsing: the spec says to ignore the token. Falling back to
    # IN_FRAMESET here would let a stray <frame>/<frameset> after the closing
    # </frameset> be inserted under <html>, and would stop a later </html>
    # from moving the parser to AFTER_AFTER_FRAMESET.
    return None
1797
+
1798
def _mode_after_after_frameset(self, token):
    # Per HTML5 spec §13.2.6.4.18: After after frameset insertion mode
    #
    # Returns None when the token is consumed (or ignored) here, or a
    # ("reprocess", mode, token) tuple when it has to be handled again under
    # another insertion mode.
    if isinstance(token, CharacterTokens):
        # Whitespace is processed using InBody rules
        # but we stay in AfterAfterFrameset mode
        if is_all_whitespace(token.data):
            self._mode_in_body(token)
            return None
        # Tokens containing non-whitespace fall through to "Anything else"
    if isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None
    if isinstance(token, Tag):
        if token.kind == Tag.START and token.name == "html":
            return ("reprocess", InsertionMode.IN_BODY, token)
        if token.kind == Tag.START and token.name == "noframes":
            # Insert noframes element directly and switch to TEXT mode
            self._insert_element(token, push=True)
            self.original_mode = self.mode
            self.mode = InsertionMode.TEXT
            return None
        # Other tags fall through to "Anything else"
    if isinstance(token, EOFToken):
        return None

    # Anything else: parse error.
    self._parse_error("unexpected-token-after-after-frameset")
    if self.fragment_context is not None:
        # Fragment parsing: the spec never leaves "in frameset" in the
        # fragment case, so keep handing the token back to IN_FRAMESET as
        # this handler has always done.
        self.mode = InsertionMode.IN_FRAMESET
        return ("reprocess", InsertionMode.IN_FRAMESET, token)

    # Document parsing: the spec says to ignore the token and stay in this
    # mode. Switching back to IN_FRAMESET would send later comments to the
    # current node instead of the document and would let a stray
    # <frame>/<frameset> be inserted.
    if isinstance(token, CharacterTokens):
        # A mixed run of characters: only the non-whitespace part is
        # ignored, the whitespace part is still inserted.
        whitespace = "".join(ch for ch in token.data if ch in "\t\n\f\r ")
        if whitespace:
            self._append_text(whitespace)
    return None
1826
+
1827
+ # Helpers ----------------------------------------------------------------
1828
+
1829
# Insertion-mode handler functions, one per tree-construction mode.
# NOTE(review): the order presumably mirrors the numeric values of
# InsertionMode so the list can be indexed by mode — confirm against the
# dispatcher and the enum before reordering or inserting entries.
_MODE_HANDLERS = [
    _mode_initial,
    _mode_before_html,
    _mode_before_head,
    _mode_in_head,
    _mode_in_head_noscript,
    _mode_after_head,
    _mode_text,
    _mode_in_body,
    _mode_after_body,
    _mode_after_after_body,
    _mode_in_table,
    _mode_in_table_text,
    _mode_in_caption,
    _mode_in_column_group,
    _mode_in_table_body,
    _mode_in_row,
    _mode_in_cell,
    _mode_in_frameset,
    _mode_after_frameset,
    _mode_after_after_frameset,
    _mode_in_select,
    _mode_in_template,
]
1853
+
1854
# Maps each token class to the function that handles that kind of token
# for the "in body" insertion mode.
# NOTE(review): presumably looked up by the exact token type in
# _mode_in_body — confirm against that method.
_BODY_TOKEN_HANDLERS = {
    CharacterTokens: _handle_characters_in_body,
    CommentToken: _handle_comment_in_body,
    Tag: _handle_tag_in_body,
    EOFToken: _handle_eof_in_body,
}
1860
+
1861
# Maps a tag name to the function that handles its start tag for the
# "in body" insertion mode. Several names share one handler (for example
# all block-level elements that go through _handle_body_start_block_with_p).
# NOTE(review): names missing from this table are presumably handled by a
# generic fallback in the tag dispatcher — confirm against
# _handle_tag_in_body.
_BODY_START_HANDLERS = {
    "a": _handle_body_start_a,
    "address": _handle_body_start_block_with_p,
    "applet": _handle_body_start_applet_like,
    "area": _handle_body_start_void_with_formatting,
    "article": _handle_body_start_block_with_p,
    "aside": _handle_body_start_block_with_p,
    "b": _handle_body_start_formatting,
    "base": _handle_body_start_in_head,
    "basefont": _handle_body_start_in_head,
    "bgsound": _handle_body_start_in_head,
    "big": _handle_body_start_formatting,
    "blockquote": _handle_body_start_block_with_p,
    "body": _handle_body_start_body,
    "br": _handle_body_start_br,
    "button": _handle_body_start_button,
    "caption": _handle_body_start_table_parse_error,
    "center": _handle_body_start_block_with_p,
    "code": _handle_body_start_formatting,
    "col": _handle_body_start_col_or_frame,
    "colgroup": _handle_body_start_structure_ignored,
    "dd": _handle_body_start_dd_dt,
    "details": _handle_body_start_block_with_p,
    "dialog": _handle_body_start_block_with_p,
    "dir": _handle_body_start_block_with_p,
    "div": _handle_body_start_block_with_p,
    "dl": _handle_body_start_block_with_p,
    "dt": _handle_body_start_dd_dt,
    "em": _handle_body_start_formatting,
    "embed": _handle_body_start_void_with_formatting,
    "fieldset": _handle_body_start_block_with_p,
    "figcaption": _handle_body_start_block_with_p,
    "figure": _handle_body_start_block_with_p,
    "font": _handle_body_start_formatting,
    "footer": _handle_body_start_block_with_p,
    "form": _handle_body_start_form,
    "frame": _handle_body_start_col_or_frame,
    "frameset": _handle_body_start_frameset,
    "h1": _handle_body_start_heading,
    "h2": _handle_body_start_heading,
    "h3": _handle_body_start_heading,
    "h4": _handle_body_start_heading,
    "h5": _handle_body_start_heading,
    "h6": _handle_body_start_heading,
    "head": _handle_body_start_head,
    "header": _handle_body_start_block_with_p,
    "hgroup": _handle_body_start_block_with_p,
    "html": _handle_body_start_html,
    "i": _handle_body_start_formatting,
    "image": _handle_body_start_image,
    "img": _handle_body_start_void_with_formatting,
    "input": _handle_body_start_input,
    "keygen": _handle_body_start_void_with_formatting,
    "li": _handle_body_start_li,
    "link": _handle_body_start_in_head,
    "listing": _handle_body_start_pre_listing,
    "main": _handle_body_start_block_with_p,
    "marquee": _handle_body_start_applet_like,
    "math": _handle_body_start_math,
    "menu": _handle_body_start_block_with_p,
    "meta": _handle_body_start_in_head,
    "nav": _handle_body_start_block_with_p,
    "nobr": _handle_body_start_formatting,
    "noframes": _handle_body_start_in_head,
    "object": _handle_body_start_applet_like,
    "ol": _handle_body_start_block_with_p,
    "optgroup": _handle_body_start_optgroup,
    "option": _handle_body_start_option,
    "p": _handle_body_start_paragraph,
    "param": _handle_body_start_simple_void,
    "plaintext": _handle_body_start_plaintext_xmp,
    "pre": _handle_body_start_pre_listing,
    "rb": _handle_body_start_rb_rtc,
    "rp": _handle_body_start_rp_rt,
    "rt": _handle_body_start_rp_rt,
    "rtc": _handle_body_start_rb_rtc,
    "s": _handle_body_start_formatting,
    "script": _handle_body_start_in_head,
    "search": _handle_body_start_block_with_p,
    "section": _handle_body_start_block_with_p,
    "select": _handle_body_start_select,
    "small": _handle_body_start_formatting,
    "source": _handle_body_start_simple_void,
    "strike": _handle_body_start_formatting,
    "strong": _handle_body_start_formatting,
    "style": _handle_body_start_in_head,
    "summary": _handle_body_start_block_with_p,
    "svg": _handle_body_start_svg,
    "table": _handle_body_start_table,
    "tbody": _handle_body_start_structure_ignored,
    "td": _handle_body_start_structure_ignored,
    "template": _handle_body_start_in_head,
    "textarea": _handle_body_start_textarea,
    "tfoot": _handle_body_start_structure_ignored,
    "th": _handle_body_start_structure_ignored,
    "thead": _handle_body_start_structure_ignored,
    "title": _handle_body_start_in_head,
    "tr": _handle_body_start_structure_ignored,
    "track": _handle_body_start_simple_void,
    "tt": _handle_body_start_formatting,
    "u": _handle_body_start_formatting,
    "ul": _handle_body_start_block_with_p,
    "wbr": _handle_body_start_void_with_formatting,
    "xmp": _handle_body_start_plaintext_xmp,
}
1966
# Maps a tag name to the function that handles its end tag for the
# "in body" insertion mode. Most block-level names share
# _handle_body_end_block.
# NOTE(review): names missing from this table (formatting elements, for
# instance) are presumably handled elsewhere in the tag dispatcher — confirm
# against _handle_tag_in_body.
_BODY_END_HANDLERS = {
    "address": _handle_body_end_block,
    "applet": _handle_body_end_applet_like,
    "article": _handle_body_end_block,
    "aside": _handle_body_end_block,
    "blockquote": _handle_body_end_block,
    "body": _handle_body_end_body,
    "button": _handle_body_end_block,
    "center": _handle_body_end_block,
    "dd": _handle_body_end_dd_dt,
    "details": _handle_body_end_block,
    "dialog": _handle_body_end_block,
    "dir": _handle_body_end_block,
    "div": _handle_body_end_block,
    "dl": _handle_body_end_block,
    "dt": _handle_body_end_dd_dt,
    "fieldset": _handle_body_end_block,
    "figcaption": _handle_body_end_block,
    "figure": _handle_body_end_block,
    "footer": _handle_body_end_block,
    "form": _handle_body_end_form,
    "h1": _handle_body_end_heading,
    "h2": _handle_body_end_heading,
    "h3": _handle_body_end_heading,
    "h4": _handle_body_end_heading,
    "h5": _handle_body_end_heading,
    "h6": _handle_body_end_heading,
    "header": _handle_body_end_block,
    "hgroup": _handle_body_end_block,
    "html": _handle_body_end_html,
    "li": _handle_body_end_li,
    "listing": _handle_body_end_block,
    "main": _handle_body_end_block,
    "marquee": _handle_body_end_applet_like,
    "menu": _handle_body_end_block,
    "nav": _handle_body_end_block,
    "object": _handle_body_end_applet_like,
    "ol": _handle_body_end_block,
    "p": _handle_body_end_p,
    "pre": _handle_body_end_block,
    "search": _handle_body_end_block,
    "section": _handle_body_end_block,
    "summary": _handle_body_end_block,
    "table": _handle_body_end_block,
    "template": _handle_body_end_template,
    "ul": _handle_body_end_block,
}