justhtml 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +144 -0
- justhtml/constants.py +445 -0
- justhtml/context.py +12 -0
- justhtml/encoding.py +405 -0
- justhtml/entities.py +344 -0
- justhtml/errors.py +140 -0
- justhtml/node.py +632 -0
- justhtml/parser.py +131 -0
- justhtml/py.typed +0 -0
- justhtml/selector.py +965 -0
- justhtml/serialize.py +258 -0
- justhtml/stream.py +107 -0
- justhtml/tokenizer.py +2647 -0
- justhtml/tokens.py +223 -0
- justhtml/treebuilder.py +1279 -0
- justhtml/treebuilder_modes.py +2016 -0
- justhtml/treebuilder_utils.py +93 -0
- justhtml-0.12.0.dist-info/METADATA +164 -0
- justhtml-0.12.0.dist-info/RECORD +23 -0
- justhtml-0.12.0.dist-info/WHEEL +4 -0
- justhtml-0.12.0.dist-info/entry_points.txt +2 -0
- justhtml-0.12.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,2016 @@
|
|
|
1
|
+
# ruff: noqa: S101, RUF012
|
|
2
|
+
# mypy: disable-error-code="attr-defined, has-type, var-annotated, assignment"
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .constants import (
|
|
9
|
+
FORMATTING_ELEMENTS,
|
|
10
|
+
HEADING_ELEMENTS,
|
|
11
|
+
)
|
|
12
|
+
from .node import SimpleDomNode, TemplateNode
|
|
13
|
+
from .tokens import CharacterTokens, CommentToken, EOFToken, Tag, TokenSinkResult
|
|
14
|
+
from .treebuilder_utils import (
|
|
15
|
+
InsertionMode,
|
|
16
|
+
doctype_error_and_quirks,
|
|
17
|
+
is_all_whitespace,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TreeBuilderModesMixin:
|
|
22
|
+
def _handle_doctype(self, token: Any) -> Any:
    """Process a DOCTYPE token and return ``TokenSinkResult.Continue``.

    Outside the INITIAL insertion mode the token is only reported as
    ``unexpected-doctype``. In INITIAL mode a ``!doctype`` node is appended
    to the document, the quirks mode computed by
    ``doctype_error_and_quirks`` is applied, and the builder moves on to
    BEFORE_HTML.
    """
    if self.mode == InsertionMode.INITIAL:
        doctype_info = token.doctype
        has_error, quirks = doctype_error_and_quirks(doctype_info, self.iframe_srcdoc)

        self.document.append_child(SimpleDomNode("!doctype", data=doctype_info))

        # The node is appended before the error is reported.
        if has_error:
            self._parse_error("unknown-doctype")

        self._set_quirks_mode(quirks)
        self.mode = InsertionMode.BEFORE_HTML
    else:
        self._parse_error("unexpected-doctype")
    return TokenSinkResult.Continue
|
|
39
|
+
|
|
40
|
+
def _mode_initial(self, token: Any) -> Any:
    """Handle a token in the INITIAL insertion mode.

    Comments are appended to the document and whitespace-only character
    tokens are dropped; both return ``None``. Anything else means no DOCTYPE
    was seen: a parse error is reported, quirks mode is set to ``"quirks"``
    and the token is handed back for reprocessing in BEFORE_HTML.
    """
    if isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None

    if isinstance(token, EOFToken):
        self._parse_error("expected-doctype-but-got-eof")
        self._set_quirks_mode("quirks")
        # Only the EOF path assigns the mode directly before reprocessing.
        self.mode = InsertionMode.BEFORE_HTML
        return ("reprocess", InsertionMode.BEFORE_HTML, token)

    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data):
            return None
        self._parse_error("expected-doctype-but-got-chars")
    else:
        # Only tags remain at this point.
        if token.kind == Tag.START:
            error_code = "expected-doctype-but-got-start-tag"
        else:
            error_code = "expected-doctype-but-got-end-tag"
        self._parse_error(error_code, tag_name=token.name, token=token)

    self._set_quirks_mode("quirks")
    return ("reprocess", InsertionMode.BEFORE_HTML, token)
|
|
62
|
+
|
|
63
|
+
def _mode_before_html(self, token: Any) -> Any:
    """Handle a token in the BEFORE_HTML insertion mode.

    Returns ``None`` when the token is consumed here (whitespace, comments,
    ``<html>``, ignored end tags). Otherwise an attribute-less root is
    created and the token is returned for reprocessing in BEFORE_HEAD.
    """
    if isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None

    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data):
            return None
        # Leading whitespace is dropped before the text is reprocessed.
        trimmed = token.data.lstrip("\t\n\f\r ")
        if len(trimmed) != len(token.data):
            token = CharacterTokens(trimmed)
    elif isinstance(token, Tag):
        if token.kind == Tag.START:
            if token.name == "html":
                self._create_root(token.attrs)
                self.mode = InsertionMode.BEFORE_HEAD
                return None
        elif token.kind == Tag.END and token.name not in {"head", "body", "html", "br"}:
            # Ignore other end tags
            self._parse_error("unexpected-end-tag-before-html", tag_name=token.name)
            return None

    # EOF, other start tags, the four listed end tags and non-whitespace text
    # all imply an <html> element without attributes.
    self._create_root({})
    self.mode = InsertionMode.BEFORE_HEAD
    return ("reprocess", InsertionMode.BEFORE_HEAD, token)
|
|
95
|
+
|
|
96
|
+
def _mode_before_head(self, token: Any) -> Any:
    """Handle a token in the BEFORE_HEAD insertion mode.

    Returns ``None`` when the token is consumed here. Otherwise a phantom
    ``head`` element is inserted, the mode becomes IN_HEAD, and the token is
    returned for reprocessing there.
    """
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, CharacterTokens):
        text = token.data or ""
        if "\x00" in text:
            self._parse_error("invalid-codepoint-before-head")
            text = text.replace("\x00", "")
            if not text:
                return None
        if is_all_whitespace(text):
            return None
        token = CharacterTokens(text)
    elif isinstance(token, Tag):
        is_start = token.kind == Tag.START
        if is_start and token.name == "html":
            # Duplicate html tag - add attributes to existing html element
            # Note: open_elements[0] is always html at this point (created in BEFORE_HTML mode)
            self._add_missing_attributes(self.open_elements[0], token.attrs)
            return None
        if is_start and token.name == "head":
            self.head_element = self._insert_element(token, push=True)
            self.mode = InsertionMode.IN_HEAD
            return None
        if token.kind == Tag.END and token.name not in {"head", "body", "html", "br"}:
            # Ignore other end tags
            self._parse_error("unexpected-end-tag-before-head", tag_name=token.name)
            return None

    # Everything else (EOF included) implies a head element.
    self.head_element = self._insert_phantom("head")
    self.mode = InsertionMode.IN_HEAD
    return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
138
|
+
|
|
139
|
+
def _mode_in_head(self, token: Any) -> Any:
    """Handle a token in the IN_HEAD insertion mode.

    Returns ``None`` when the token is consumed here. Any token without a
    dedicated branch pops the current element, switches to AFTER_HEAD and is
    returned for reprocessing in that mode.
    """
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, CharacterTokens):
        if is_all_whitespace(token.data):
            self._append_text(token.data)
            return None
        text = token.data or ""
        rest = text.lstrip("\t\n\f\r ")
        prefix = text[: len(text) - len(rest)]
        if prefix:
            top = self.open_elements[-1] if self.open_elements else None
            # Leading whitespace is kept only when the current element
            # already has children.
            if top is not None and top.has_child_nodes():
                self._append_text(prefix)
        # The non-whitespace remainder is what gets reprocessed.
        token = CharacterTokens(rest)
    elif isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name in {"base", "basefont", "bgsound", "link", "meta"}:
                self._insert_element(token, push=False)
                return None
            if tag_name == "template":
                self._insert_element(token, push=True)
                self._push_formatting_marker()
                self.frameset_ok = False
                self.mode = InsertionMode.IN_TEMPLATE
                self.template_modes.append(InsertionMode.IN_TEMPLATE)
                return None
            if tag_name in {"title", "style", "script", "noframes"}:
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
            if tag_name == "noscript":
                # Scripting is disabled: parse noscript content as HTML
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_HEAD_NOSCRIPT
                return None
        elif token.kind == Tag.END:
            if tag_name == "template":
                # Check if template is on the stack (don't use scope check as table blocks it)
                if all(element.name != "template" for element in self.open_elements):
                    return None
                self._generate_implied_end_tags()
                self._pop_until_inclusive("template")
                self._clear_active_formatting_up_to_marker()
                # template_modes always non-empty here since a template was found on the stack
                self.template_modes.pop()
                self._reset_insertion_mode()
                return None
            if tag_name == "head":
                self._pop_current()
                self.mode = InsertionMode.AFTER_HEAD
                return None

    # <html>, </body>, </html>, </br>, EOF, leftover text and every other
    # token: pop the current element and reprocess in AFTER_HEAD.
    self._pop_current()
    self.mode = InsertionMode.AFTER_HEAD
    return ("reprocess", InsertionMode.AFTER_HEAD, token)
|
|
214
|
+
|
|
215
|
+
def _mode_in_head_noscript(self, token: Any) -> Any:
    """Handle tokens in 'in head noscript' insertion mode (scripting disabled).

    Whitespace, comments and a fixed set of start tags are delegated to the
    IN_HEAD handler, and ``<html>`` to the IN_BODY handler. Tokens that end
    the noscript context pop the current element, switch to IN_HEAD and are
    returned for reprocessing; ignored tokens return ``None``.
    """
    if isinstance(token, CommentToken):
        return self._mode_in_head(token)

    if isinstance(token, CharacterTokens):
        # Whitespace: process using in head rules
        if is_all_whitespace(token.data or ""):
            return self._mode_in_head(token)
        # Non-whitespace: parse error, then leave noscript (shared exit below)
        self._parse_error("unexpected-start-tag", tag_name="text")
    elif isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "html":
                return self._mode_in_body(token)
            if tag_name in {"basefont", "bgsound", "link", "meta", "noframes", "style"}:
                return self._mode_in_head(token)
            self._parse_error("unexpected-start-tag", tag_name=tag_name)
            if tag_name in {"head", "noscript"}:
                return None  # Ignore
            # Any other start tag: leave noscript (shared exit below)
        else:
            # token.kind == Tag.END
            if tag_name == "noscript":
                self._pop_current()  # Pop noscript
                self.mode = InsertionMode.IN_HEAD
                return None
            self._parse_error("unexpected-end-tag", tag_name=tag_name)
            if tag_name != "br":
                # Any other end tag: parse error, ignore
                return None
    elif isinstance(token, EOFToken):
        self._parse_error("expected-closing-tag-but-got-eof", tag_name="noscript")
    else:
        # All token types are handled above - CharacterTokens, CommentToken, Tag, EOFToken
        return None  # pragma: no cover

    # Shared exit: pop noscript and reprocess the token in head.
    self._pop_current()  # Pop noscript
    self.mode = InsertionMode.IN_HEAD
    return ("reprocess", InsertionMode.IN_HEAD, token)
|
|
263
|
+
|
|
264
|
+
def _mode_after_head(self, token: Any) -> Any:
    """Handle a token in the AFTER_HEAD insertion mode.

    Returns ``None`` (or the IN_HEAD handler's result) when the token is
    consumed here. Tokens that imply a body call
    ``_insert_body_if_missing`` and are returned for reprocessing in IN_BODY.
    """
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, CharacterTokens):
        text = token.data or ""
        # NUL and form feed are each reported and then stripped.
        for bad_char in ("\x00", "\x0c"):
            if bad_char in text:
                self._parse_error("invalid-codepoint-in-body")
                text = text.replace(bad_char, "")
        if not text:
            return None
        if is_all_whitespace(text):
            self._append_text(text)
            return None
        token = CharacterTokens(text)
    elif isinstance(token, Tag):
        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name == "body":
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_BODY
                self.frameset_ok = False
                return None
            if tag_name == "frameset":
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_FRAMESET
                return None
            if tag_name == "input":
                # Special handling: input type="hidden" doesn't create body or affect frameset_ok
                input_type = next(
                    ((value or "").lower() for key, value in token.attrs.items() if key == "type"),
                    None,
                )
                if input_type == "hidden":
                    # Parse error but ignore - don't create body, don't insert element
                    self._parse_error("unexpected-hidden-input-after-head")
                    return None
                # Non-hidden input creates body (shared exit below)
            elif tag_name in {
                "base",
                "basefont",
                "bgsound",
                "link",
                "meta",
                "title",
                "style",
                "script",
                "noscript",
            }:
                self.open_elements.append(self.head_element)
                outcome = self._mode_in_head(token)
                # Remove the head element from wherever it is in the stack
                # (it might not be at the end if we inserted other elements like <title>)
                self.open_elements.remove(self.head_element)
                return outcome
            elif tag_name == "template":
                # Template in after-head needs special handling:
                # Process in IN_HEAD mode, which will switch to IN_TEMPLATE
                # Don't remove head from stack - let normal processing continue
                self.open_elements.append(self.head_element)
                self.mode = InsertionMode.IN_HEAD
                return ("reprocess", InsertionMode.IN_HEAD, token)
        elif token.kind == Tag.END:
            if tag_name == "template":
                return self._mode_in_head(token)
            if tag_name not in {"body", "html", "br"}:
                # Ignore other end tags
                self._parse_error("unexpected-end-tag-after-head", tag_name=tag_name)
                return None
    elif isinstance(token, EOFToken):
        self._insert_body_if_missing()
        # Only the EOF path assigns the mode directly before reprocessing.
        self.mode = InsertionMode.IN_BODY
        return ("reprocess", InsertionMode.IN_BODY, token)

    # <html>, </body>, </html>, </br>, text and any other start tag.
    self._insert_body_if_missing()
    return ("reprocess", InsertionMode.IN_BODY, token)
|
|
352
|
+
|
|
353
|
+
def _mode_text(self, token: Any) -> Any:
    """Handle a token in the TEXT insertion mode.

    Character data is appended to the current element. Any other token pops
    the current element and restores ``original_mode`` (IN_BODY when that is
    falsy). EOF is additionally reported as a parse error and returned for
    reprocessing in the restored mode.
    """
    if isinstance(token, CharacterTokens):
        self._append_text(token.data)
        return None

    reached_eof = isinstance(token, EOFToken)
    if reached_eof:
        # Get the tag name of the unclosed element
        unclosed = self.open_elements[-1].name if self.open_elements else None
        self._parse_error("expected-named-closing-tag-but-got-eof", tag_name=unclosed)

    # Both the end tag and the EOF path leave the text-only element.
    self._pop_current()
    self.mode = self.original_mode or InsertionMode.IN_BODY
    if reached_eof:
        return ("reprocess", self.mode, token)
    return None
|
|
368
|
+
|
|
369
|
+
def _mode_in_body(self, token: Any) -> Any:
|
|
370
|
+
handler = self._BODY_TOKEN_HANDLERS.get(type(token))
|
|
371
|
+
return handler(self, token) if handler else None
|
|
372
|
+
|
|
373
|
+
def _handle_characters_in_body(self, token: Any) -> Any:
    """Insert character data while in the IN_BODY mode.

    NUL characters are reported as ``invalid-codepoint`` and stripped. The
    active formatting elements are reconstructed before the text is
    appended; text that is not all whitespace also clears ``frameset_ok``.
    Always returns ``None``.
    """
    text = token.data or ""
    if "\x00" in text:
        self._parse_error("invalid-codepoint")
        text = text.replace("\x00", "")

    whitespace_only = is_all_whitespace(text)
    self._reconstruct_active_formatting_elements()
    if not whitespace_only:
        self.frameset_ok = False
    self._append_text(text)
    return None
|
|
386
|
+
|
|
387
|
+
def _handle_comment_in_body(self, token: Any) -> Any:
|
|
388
|
+
self._append_comment(token.data)
|
|
389
|
+
return
|
|
390
|
+
|
|
391
|
+
def _handle_tag_in_body(self, token: Any) -> Any:
    """Route a start or end tag seen in the IN_BODY mode.

    Start tags go to the handler registered in ``_BODY_START_HANDLERS`` or
    to ``_handle_body_start_default``. End tags are handled in this order:
    ``</br>``, formatting elements, ``_BODY_END_HANDLERS``, then
    ``_any_other_end_tag``.
    """
    tag_name = token.name

    if token.kind == Tag.START:
        start_handler = self._BODY_START_HANDLERS.get(tag_name)
        if not start_handler:
            return self._handle_body_start_default(token)
        return start_handler(self, token)

    # Special case: </br> end tag is treated as <br> start tag
    if tag_name == "br":
        self._parse_error("unexpected-end-tag", tag_name=tag_name, token=token)
        return self._mode_in_body(Tag(Tag.START, "br", {}, False))

    if tag_name in FORMATTING_ELEMENTS:
        self._adoption_agency(tag_name)
        return None

    end_handler = self._BODY_END_HANDLERS.get(tag_name)
    if end_handler:
        return end_handler(self, token)

    # Any other end tag
    self._any_other_end_tag(token.name)
    return None
|
|
414
|
+
|
|
415
|
+
def _handle_eof_in_body(self, token: Any) -> Any:
    """Handle end-of-file while in the IN_BODY mode.

    With active template modes the token is delegated to
    ``_mode_in_template``. Otherwise the first open element whose name is
    outside the allowed set is reported as
    ``expected-closing-tag-but-got-eof``, the mode becomes AFTER_BODY and
    the token is returned for reprocessing there.
    """
    # If we're in a template, handle EOF in template mode first
    if self.template_modes:
        return self._mode_in_template(token)

    # Elements that may remain open at EOF without a parse error.
    may_stay_open = {
        "dd",
        "dt",
        "li",
        "optgroup",
        "option",
        "p",
        "rb",
        "rp",
        "rt",
        "rtc",
        "tbody",
        "td",
        "tfoot",
        "th",
        "thead",
        "tr",
        "body",
        "html",
    }
    # Only the first offending element (bottom of the stack first) is reported.
    first_unclosed = next(
        (element for element in self.open_elements if element.name not in may_stay_open),
        None,
    )
    if first_unclosed is not None:
        self._parse_error("expected-closing-tag-but-got-eof", tag_name=first_unclosed.name)

    self.mode = InsertionMode.AFTER_BODY
    return ("reprocess", InsertionMode.AFTER_BODY, token)
|
|
445
|
+
|
|
446
|
+
# ---------------------
|
|
447
|
+
# Body mode start tag handlers
|
|
448
|
+
# ---------------------
|
|
449
|
+
|
|
450
|
+
def _handle_body_start_html(self, token: Any) -> Any:
|
|
451
|
+
if self.template_modes:
|
|
452
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
453
|
+
return
|
|
454
|
+
# In IN_BODY mode, html element is always at open_elements[0]
|
|
455
|
+
if self.open_elements: # pragma: no branch
|
|
456
|
+
html = self.open_elements[0]
|
|
457
|
+
self._add_missing_attributes(html, token.attrs)
|
|
458
|
+
return
|
|
459
|
+
|
|
460
|
+
def _handle_body_start_body(self, token: Any) -> Any:
|
|
461
|
+
if self.template_modes:
|
|
462
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
463
|
+
return
|
|
464
|
+
if len(self.open_elements) > 1:
|
|
465
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
466
|
+
body = self.open_elements[1] if len(self.open_elements) > 1 else None
|
|
467
|
+
if body and body.name == "body":
|
|
468
|
+
self._add_missing_attributes(body, token.attrs)
|
|
469
|
+
self.frameset_ok = False
|
|
470
|
+
return
|
|
471
|
+
self.frameset_ok = False
|
|
472
|
+
return
|
|
473
|
+
|
|
474
|
+
def _handle_body_start_head(self, token: Any) -> Any:
|
|
475
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
476
|
+
return
|
|
477
|
+
|
|
478
|
+
def _handle_body_start_in_head(self, token: Any) -> Any:
|
|
479
|
+
return self._mode_in_head(token)
|
|
480
|
+
|
|
481
|
+
def _handle_body_start_block_with_p(self, token: Any) -> Any:
|
|
482
|
+
self._close_p_element()
|
|
483
|
+
self._insert_element(token, push=True)
|
|
484
|
+
return
|
|
485
|
+
|
|
486
|
+
def _handle_body_start_heading(self, token: Any) -> Any:
    """Handle a heading start tag in the IN_BODY mode.

    Closes any open ``p`` element. If the current element is itself a
    heading, that is reported as a parse error and the element is popped.
    The new heading is then inserted and ``frameset_ok`` is cleared.
    Always returns ``None``.
    """
    self._close_p_element()

    stack = self.open_elements
    if stack and stack[-1].name in HEADING_ELEMENTS:
        self._parse_error("unexpected-start-tag", tag_name=token.name)
        self._pop_current()

    self._insert_element(token, push=True)
    self.frameset_ok = False
    return None
|
|
494
|
+
|
|
495
|
+
def _handle_body_start_pre_listing(self, token: Any) -> Any:
|
|
496
|
+
self._close_p_element()
|
|
497
|
+
self._insert_element(token, push=True)
|
|
498
|
+
self.ignore_lf = True
|
|
499
|
+
self.frameset_ok = False
|
|
500
|
+
return
|
|
501
|
+
|
|
502
|
+
def _handle_body_start_form(self, token: Any) -> Any:
|
|
503
|
+
if self.form_element is not None:
|
|
504
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
505
|
+
return
|
|
506
|
+
self._close_p_element()
|
|
507
|
+
node = self._insert_element(token, push=True)
|
|
508
|
+
self.form_element = node
|
|
509
|
+
self.frameset_ok = False
|
|
510
|
+
return
|
|
511
|
+
|
|
512
|
+
def _handle_body_start_button(self, token: Any) -> Any:
|
|
513
|
+
if self._has_in_scope("button"):
|
|
514
|
+
self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=token.name)
|
|
515
|
+
self._close_element_by_name("button")
|
|
516
|
+
self._insert_element(token, push=True)
|
|
517
|
+
self.frameset_ok = False
|
|
518
|
+
return
|
|
519
|
+
|
|
520
|
+
def _handle_body_start_paragraph(self, token: Any) -> Any:
|
|
521
|
+
self._close_p_element()
|
|
522
|
+
self._insert_element(token, push=True)
|
|
523
|
+
return
|
|
524
|
+
|
|
525
|
+
def _handle_body_start_math(self, token: Any) -> Any:
    """Insert a ``math``-namespace element for the start tag.

    Active formatting elements are reconstructed first. The tag is rebuilt
    with the attributes returned by ``_prepare_foreign_attributes``, and the
    element is pushed only when the tag is not self-closing. Always returns
    ``None``.
    """
    self._reconstruct_active_formatting_elements()
    foreign_attrs = self._prepare_foreign_attributes("math", token.attrs)
    math_tag = Tag(Tag.START, token.name, foreign_attrs, token.self_closing)
    keep_open = not token.self_closing
    self._insert_element(math_tag, push=keep_open, namespace="math")
    return None
|
|
531
|
+
|
|
532
|
+
def _handle_body_start_svg(self, token: Any) -> Any:
    """Insert an ``svg``-namespace element for the start tag.

    Active formatting elements are reconstructed first. The tag is rebuilt
    with the name from ``_adjust_svg_tag_name`` and the attributes from
    ``_prepare_foreign_attributes``, and the element is pushed only when the
    tag is not self-closing. Always returns ``None``.
    """
    self._reconstruct_active_formatting_elements()
    svg_name = self._adjust_svg_tag_name(token.name)
    foreign_attrs = self._prepare_foreign_attributes("svg", token.attrs)
    svg_tag = Tag(Tag.START, svg_name, foreign_attrs, token.self_closing)
    keep_open = not token.self_closing
    self._insert_element(svg_tag, push=keep_open, namespace="svg")
    return None
|
|
539
|
+
|
|
540
|
+
def _handle_body_start_li(self, token: Any) -> Any:
|
|
541
|
+
self.frameset_ok = False
|
|
542
|
+
self._close_p_element()
|
|
543
|
+
if self._has_in_list_item_scope("li"):
|
|
544
|
+
self._pop_until_any_inclusive({"li"})
|
|
545
|
+
self._insert_element(token, push=True)
|
|
546
|
+
return
|
|
547
|
+
|
|
548
|
+
def _handle_body_start_dd_dt(self, token: Any) -> Any:
|
|
549
|
+
self.frameset_ok = False
|
|
550
|
+
self._close_p_element()
|
|
551
|
+
name = token.name
|
|
552
|
+
if name == "dd":
|
|
553
|
+
if self._has_in_definition_scope("dd"):
|
|
554
|
+
self._pop_until_any_inclusive({"dd"})
|
|
555
|
+
if self._has_in_definition_scope("dt"):
|
|
556
|
+
self._pop_until_any_inclusive({"dt"})
|
|
557
|
+
else:
|
|
558
|
+
if self._has_in_definition_scope("dt"):
|
|
559
|
+
self._pop_until_any_inclusive({"dt"})
|
|
560
|
+
if self._has_in_definition_scope("dd"):
|
|
561
|
+
self._pop_until_any_inclusive({"dd"})
|
|
562
|
+
self._insert_element(token, push=True)
|
|
563
|
+
return
|
|
564
|
+
|
|
565
|
+
def _adoption_agency(self, subject: Any) -> None:
    """Run the adoption agency steps for the formatting element named ``subject``.

    ``subject`` is compared against element names and is passed to the
    active-formatting lookups. The method mutates ``self.open_elements``,
    ``self.active_formatting`` and the DOM nodes involved. The numbered
    comments follow the step numbering used by the original author.
    """
    # 1. If the current node is the subject, and it is not in the active formatting elements list...
    # ...pop it off the stack and stop; nothing needs to be adopted.
    if self.open_elements and self.open_elements[-1].name == subject:
        if not self._has_active_formatting_entry(subject):
            self._pop_until_inclusive(subject)
            return

    # 2. Outer loop (at most 8 iterations)
    for _ in range(8):
        # 3. Find formatting element
        formatting_element_index = self._find_active_formatting_index(subject)
        if formatting_element_index is None:
            return

        formatting_element_entry = self.active_formatting[formatting_element_index]
        formatting_element = formatting_element_entry["node"]

        # 4. If formatting element is not in open elements
        if formatting_element not in self.open_elements:
            self._parse_error("adoption-agency-1.3")
            self._remove_formatting_entry(formatting_element_index)
            return

        # 5. If formatting element is in open elements but not in scope
        if not self._has_element_in_scope(formatting_element.name):
            self._parse_error("adoption-agency-1.3")
            return

        # 6. If formatting element is not the current node
        # (parse error only; processing continues)
        if formatting_element is not self.open_elements[-1]:
            self._parse_error("adoption-agency-1.3")

        # 7. Find furthest block: the first element after the formatting
        # element on the stack for which _is_special_element() is true
        furthest_block = None
        formatting_element_in_open_index = self.open_elements.index(formatting_element)

        for i in range(formatting_element_in_open_index + 1, len(self.open_elements)):
            node = self.open_elements[i]
            if self._is_special_element(node):
                furthest_block = node
                break

        if furthest_block is None:
            # No furthest block: pop up to and including the formatting
            # element and drop its active-formatting entry.
            # formatting_element is known to be on the stack
            while True:
                popped = self.open_elements.pop()
                if popped is formatting_element:
                    break
            self._remove_formatting_entry(formatting_element_index)
            return

        # 8. Bookmark (index into active_formatting where the replacement entry goes)
        bookmark = formatting_element_index + 1

        # 9. Node and Last Node
        node = furthest_block
        last_node = furthest_block

        # 10. Inner loop
        inner_loop_counter = 0
        while True:
            inner_loop_counter += 1

            # 10.1 Node = element above node
            node_index = self.open_elements.index(node)
            node = self.open_elements[node_index - 1]

            # 10.2 If node is formatting element, break
            if node is formatting_element:
                break

            # 10.3 Find active formatting entry for node
            node_formatting_index = self._find_active_formatting_index_by_node(node)

            # After three inner iterations, formatting entries met on the way
            # up are dropped; the bookmark shifts if the removed entry sat
            # before it.
            if inner_loop_counter > 3 and node_formatting_index is not None:
                self._remove_formatting_entry(node_formatting_index)
                if node_formatting_index < bookmark:
                    bookmark -= 1
                node_formatting_index = None

            if node_formatting_index is None:
                # Not an active formatting element: remove it from the stack.
                # node is re-pointed at the element that slid into its slot
                # so the next "element above node" lookup stays correct.
                node_index = self.open_elements.index(node)
                self.open_elements.remove(node)
                node = self.open_elements[node_index]
                continue

            # 10.4 Replace entry with new element
            entry = self.active_formatting[node_formatting_index]
            new_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
            entry["node"] = new_element
            self.open_elements[self.open_elements.index(node)] = new_element
            node = new_element

            # 10.5 If last node is furthest block, update bookmark
            if last_node is furthest_block:
                bookmark = node_formatting_index + 1

            # 10.6 Reparent last_node
            if last_node.parent:
                last_node.parent.remove_child(last_node)
            node.append_child(last_node)

            # 10.7 The clone becomes last_node for the next iteration
            last_node = node

        # 11. Insert last_node into common ancestor
        common_ancestor = self.open_elements[formatting_element_in_open_index - 1]
        if last_node.parent:
            last_node.parent.remove_child(last_node)

        if self._should_foster_parenting(common_ancestor, for_tag=last_node.name):
            parent, position = self._appropriate_insertion_location(common_ancestor, foster_parenting=True)
            self._insert_node_at(parent, position, last_node)
        else:
            # A TemplateNode with template_content receives the child there.
            if type(common_ancestor) is TemplateNode and common_ancestor.template_content:
                common_ancestor.template_content.append_child(last_node)
            else:
                common_ancestor.append_child(last_node)

        # 12. Create new formatting element
        entry = self.active_formatting[formatting_element_index]
        new_formatting_element = self._create_element(entry["name"], entry["node"].namespace, entry["attrs"])
        entry["node"] = new_formatting_element

        # 13. Move children of furthest block
        while furthest_block.has_child_nodes():
            child = furthest_block.children[0]
            furthest_block.remove_child(child)
            new_formatting_element.append_child(child)

        furthest_block.append_child(new_formatting_element)

        # 14. Remove formatting element from active formatting and insert new at bookmark
        # Per spec, bookmark is always > formatting_element_index (starts at fmt_idx+1,
        # can only be set to higher values or decremented when entries above fmt_idx are removed)
        self._remove_formatting_entry(formatting_element_index)
        # The removal above shifted later entries down by one.
        bookmark -= 1
        self.active_formatting.insert(bookmark, entry)

        # 15. Remove formatting element from open elements and insert new one
        self.open_elements.remove(formatting_element)
        furthest_block_index = self.open_elements.index(furthest_block)
        self.open_elements.insert(furthest_block_index + 1, new_formatting_element)
|
|
708
|
+
|
|
709
|
+
def _handle_body_start_a(self, token: Any) -> Any:
|
|
710
|
+
if self._has_active_formatting_entry("a"):
|
|
711
|
+
self._adoption_agency("a")
|
|
712
|
+
self._remove_last_active_formatting_by_name("a")
|
|
713
|
+
self._remove_last_open_element_by_name("a")
|
|
714
|
+
self._reconstruct_active_formatting_elements()
|
|
715
|
+
node = self._insert_element(token, push=True)
|
|
716
|
+
self._append_active_formatting_entry("a", token.attrs, node)
|
|
717
|
+
return
|
|
718
|
+
|
|
719
|
+
def _handle_body_start_formatting(self, token: Any) -> Any:
|
|
720
|
+
name = token.name
|
|
721
|
+
if name == "nobr" and self._in_scope("nobr"):
|
|
722
|
+
self._adoption_agency("nobr")
|
|
723
|
+
self._remove_last_active_formatting_by_name("nobr")
|
|
724
|
+
self._remove_last_open_element_by_name("nobr")
|
|
725
|
+
self._reconstruct_active_formatting_elements()
|
|
726
|
+
duplicate_index = self._find_active_formatting_duplicate(name, token.attrs)
|
|
727
|
+
if duplicate_index is not None:
|
|
728
|
+
self._remove_formatting_entry(duplicate_index)
|
|
729
|
+
node = self._insert_element(token, push=True)
|
|
730
|
+
self._append_active_formatting_entry(name, token.attrs, node)
|
|
731
|
+
return
|
|
732
|
+
|
|
733
|
+
def _handle_body_start_applet_like(self, token: Any) -> Any:
|
|
734
|
+
self._reconstruct_active_formatting_elements()
|
|
735
|
+
self._insert_element(token, push=True)
|
|
736
|
+
self._push_formatting_marker()
|
|
737
|
+
self.frameset_ok = False
|
|
738
|
+
return
|
|
739
|
+
|
|
740
|
+
def _handle_body_start_br(self, token: Any) -> Any:
|
|
741
|
+
self._close_p_element()
|
|
742
|
+
self._reconstruct_active_formatting_elements()
|
|
743
|
+
self._insert_element(token, push=False)
|
|
744
|
+
self.frameset_ok = False
|
|
745
|
+
return
|
|
746
|
+
|
|
747
|
+
def _handle_body_start_frameset(self, token: Any) -> Any:
|
|
748
|
+
if not self.frameset_ok:
|
|
749
|
+
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
750
|
+
return
|
|
751
|
+
# Find body element on the stack (may not exist if already in frameset)
|
|
752
|
+
body_index = None
|
|
753
|
+
for i, elem in enumerate(self.open_elements):
|
|
754
|
+
if elem.name == "body":
|
|
755
|
+
body_index = i
|
|
756
|
+
break
|
|
757
|
+
if body_index is None:
|
|
758
|
+
# No body on stack (e.g., nested frameset after mode reset), ignore
|
|
759
|
+
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
760
|
+
return
|
|
761
|
+
body_elem = self.open_elements[body_index]
|
|
762
|
+
body_elem.parent.remove_child(body_elem)
|
|
763
|
+
self.open_elements = self.open_elements[:body_index]
|
|
764
|
+
self._insert_element(token, push=True)
|
|
765
|
+
self.mode = InsertionMode.IN_FRAMESET
|
|
766
|
+
return
|
|
767
|
+
|
|
768
|
+
# ---------------------
|
|
769
|
+
# Body mode end tag handlers
|
|
770
|
+
# ---------------------
|
|
771
|
+
|
|
772
|
+
def _handle_body_end_body(self, token: Any) -> Any:
    """Handle ``</body>`` in body mode.

    Switches the insertion mode to ``AFTER_BODY`` when a ``body`` element is
    in scope; otherwise the token is dropped without further action.
    """
    if not self._in_scope("body"):
        return None
    self.mode = InsertionMode.AFTER_BODY
    return None
|
|
776
|
+
|
|
777
|
+
def _handle_body_end_html(self, token: Any) -> Any:
    """Handle ``</html>`` in body mode.

    Returns a ``("reprocess", AFTER_BODY, token)`` request when a ``body``
    element is in scope, and ``None`` (token ignored) otherwise.
    """
    if not self._in_scope("body"):
        return None
    return ("reprocess", InsertionMode.AFTER_BODY, token)
|
|
781
|
+
|
|
782
|
+
def _handle_body_end_p(self, token: Any) -> Any:
    """Handle ``</p>`` in body mode.

    If ``_close_p_element`` reports that nothing was closed, a parse error is
    reported, an attribute-less ``p`` element is inserted and pushed, and the
    close is attempted again so the stray end tag yields an empty paragraph.
    """
    if self._close_p_element():
        return None

    self._parse_error("unexpected-end-tag", tag_name=token.name)
    # Synthesize the missing start tag, then close it straight away.
    implied_p = Tag(Tag.START, "p", {}, False)
    self._insert_element(implied_p, push=True)
    self._close_p_element()
    return None
|
|
789
|
+
|
|
790
|
+
def _handle_body_end_li(self, token: Any) -> Any:
|
|
791
|
+
if not self._has_in_list_item_scope("li"):
|
|
792
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
793
|
+
return
|
|
794
|
+
self._pop_until_any_inclusive({"li"})
|
|
795
|
+
return
|
|
796
|
+
|
|
797
|
+
def _handle_body_end_dd_dt(self, token: Any) -> Any:
|
|
798
|
+
name = token.name
|
|
799
|
+
if not self._has_in_definition_scope(name):
|
|
800
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
801
|
+
return
|
|
802
|
+
self._pop_until_any_inclusive({"dd", "dt"})
|
|
803
|
+
|
|
804
|
+
def _handle_body_end_form(self, token: Any) -> Any:
|
|
805
|
+
if self.form_element is None:
|
|
806
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
807
|
+
return
|
|
808
|
+
removed = self._remove_from_open_elements(self.form_element)
|
|
809
|
+
self.form_element = None
|
|
810
|
+
if not removed:
|
|
811
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
812
|
+
return
|
|
813
|
+
|
|
814
|
+
def _handle_body_end_applet_like(self, token: Any) -> Any:
|
|
815
|
+
name = token.name
|
|
816
|
+
if not self._in_scope(name):
|
|
817
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
818
|
+
return
|
|
819
|
+
# Element verified in scope above
|
|
820
|
+
while self.open_elements: # pragma: no branch
|
|
821
|
+
popped = self.open_elements.pop()
|
|
822
|
+
if popped.name == name:
|
|
823
|
+
break
|
|
824
|
+
self._clear_active_formatting_up_to_marker()
|
|
825
|
+
return
|
|
826
|
+
|
|
827
|
+
def _handle_body_end_heading(self, token: Any) -> Any:
    """Handle a heading end tag in body mode.

    Reports ``unexpected-end-tag`` when no heading element is in scope.
    Otherwise generates implied end tags, reports ``end-tag-too-early`` if
    the current node is not the named heading, and pops the stack through the
    nearest heading element (of any level).
    """
    tag = token.name
    if not self._has_any_in_scope(HEADING_ELEMENTS):
        self._parse_error("unexpected-end-tag", tag_name=tag)
        return None

    self._generate_implied_end_tags()
    stack = self.open_elements
    if stack and stack[-1].name != tag:
        self._parse_error("end-tag-too-early", tag_name=tag)

    # A heading was verified in scope above, so the loop stops on one.
    while stack and stack.pop().name not in HEADING_ELEMENTS:
        pass
    return None
|
|
841
|
+
|
|
842
|
+
def _handle_body_end_block(self, token: Any) -> Any:
|
|
843
|
+
name = token.name
|
|
844
|
+
if not self._in_scope(name):
|
|
845
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
846
|
+
return
|
|
847
|
+
self._generate_implied_end_tags()
|
|
848
|
+
if self.open_elements and self.open_elements[-1].name != name:
|
|
849
|
+
self._parse_error("end-tag-too-early", tag_name=name)
|
|
850
|
+
self._pop_until_any_inclusive({name})
|
|
851
|
+
return
|
|
852
|
+
|
|
853
|
+
def _handle_body_end_template(self, token: Any) -> Any:
|
|
854
|
+
has_template = any(node.name == "template" for node in self.open_elements)
|
|
855
|
+
if not has_template:
|
|
856
|
+
return
|
|
857
|
+
self._generate_implied_end_tags()
|
|
858
|
+
self._pop_until_inclusive("template")
|
|
859
|
+
self._clear_active_formatting_up_to_marker()
|
|
860
|
+
# Pop template mode if available
|
|
861
|
+
if self.template_modes: # pragma: no branch
|
|
862
|
+
self.template_modes.pop()
|
|
863
|
+
self._reset_insertion_mode()
|
|
864
|
+
return
|
|
865
|
+
|
|
866
|
+
def _handle_body_start_structure_ignored(self, token: Any) -> Any:
|
|
867
|
+
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
868
|
+
return
|
|
869
|
+
|
|
870
|
+
def _handle_body_start_col_or_frame(self, token: Any) -> Any:
|
|
871
|
+
if self.fragment_context is None:
|
|
872
|
+
self._parse_error("unexpected-start-tag-ignored", tag_name=token.name)
|
|
873
|
+
return
|
|
874
|
+
self._insert_element(token, push=False)
|
|
875
|
+
return
|
|
876
|
+
|
|
877
|
+
def _handle_body_start_image(self, token: Any) -> Any:
    """Handle an ``<image>`` start tag in body mode by treating it as ``<img>``.

    Reports an ``image-start-tag`` parse error, builds an ``img`` start tag
    with the same attributes and self-closing flag, and inserts that element
    (not pushed) after reconstructing the active formatting elements.
    ``frameset_ok`` is cleared.
    """
    self._parse_error("image-start-tag", tag_name=token.name)
    replacement = Tag(Tag.START, "img", token.attrs, token.self_closing)
    self._reconstruct_active_formatting_elements()
    self._insert_element(replacement, push=False)
    self.frameset_ok = False
    return None
|
|
884
|
+
|
|
885
|
+
def _handle_body_start_void_with_formatting(self, token: Any) -> Any:
|
|
886
|
+
self._reconstruct_active_formatting_elements()
|
|
887
|
+
self._insert_element(token, push=False)
|
|
888
|
+
self.frameset_ok = False
|
|
889
|
+
return
|
|
890
|
+
|
|
891
|
+
def _handle_body_start_simple_void(self, token: Any) -> Any:
|
|
892
|
+
self._insert_element(token, push=False)
|
|
893
|
+
return
|
|
894
|
+
|
|
895
|
+
def _handle_body_start_input(self, token: Any) -> Any:
|
|
896
|
+
input_type = None
|
|
897
|
+
for name, value in token.attrs.items():
|
|
898
|
+
if name == "type":
|
|
899
|
+
input_type = (value or "").lower()
|
|
900
|
+
break
|
|
901
|
+
self._insert_element(token, push=False)
|
|
902
|
+
if input_type != "hidden":
|
|
903
|
+
self.frameset_ok = False
|
|
904
|
+
return
|
|
905
|
+
|
|
906
|
+
def _handle_body_start_table(self, token: Any) -> Any:
    """Handle a ``<table>`` start tag in body mode.

    Outside quirks mode an open ``p`` is closed first. The table element is
    inserted and pushed, ``frameset_ok`` is cleared and the insertion mode
    becomes ``IN_TABLE``.
    """
    in_quirks = self.quirks_mode == "quirks"
    if not in_quirks:
        # In quirks mode a table may stay nested inside the paragraph.
        self._close_p_element()
    self._insert_element(token, push=True)
    self.frameset_ok = False
    self.mode = InsertionMode.IN_TABLE
    return None
|
|
913
|
+
|
|
914
|
+
def _handle_body_start_plaintext_xmp(self, token: Any) -> Any:
    """Handle ``<plaintext>`` and the raw-text-like start tags routed here.

    Closes an open ``p``, inserts and pushes the element and clears
    ``frameset_ok``. For ``plaintext`` the tokenizer state override is set to
    ``TokenSinkResult.Plaintext``; for every other tag the current mode is
    saved in ``original_mode`` and the insertion mode becomes ``TEXT``.
    """
    self._close_p_element()
    self._insert_element(token, push=True)
    self.frameset_ok = False

    if token.name != "plaintext":
        # xmp, iframe, noembed, noframes, noscript (scripting disabled)
        self.original_mode = self.mode
        self.mode = InsertionMode.TEXT
        return None

    self.tokenizer_state_override = TokenSinkResult.Plaintext
    return None
|
|
925
|
+
|
|
926
|
+
def _handle_body_start_textarea(self, token: Any) -> Any:
|
|
927
|
+
self._insert_element(token, push=True)
|
|
928
|
+
self.ignore_lf = True
|
|
929
|
+
self.frameset_ok = False
|
|
930
|
+
return
|
|
931
|
+
|
|
932
|
+
def _handle_body_start_select(self, token: Any) -> Any:
|
|
933
|
+
self._reconstruct_active_formatting_elements()
|
|
934
|
+
self._insert_element(token, push=True)
|
|
935
|
+
self.frameset_ok = False
|
|
936
|
+
self._reset_insertion_mode()
|
|
937
|
+
return
|
|
938
|
+
|
|
939
|
+
def _handle_body_start_option(self, token: Any) -> Any:
|
|
940
|
+
if self.open_elements and self.open_elements[-1].name == "option":
|
|
941
|
+
self.open_elements.pop()
|
|
942
|
+
self._reconstruct_active_formatting_elements()
|
|
943
|
+
self._insert_element(token, push=True)
|
|
944
|
+
return
|
|
945
|
+
|
|
946
|
+
def _handle_body_start_optgroup(self, token: Any) -> Any:
|
|
947
|
+
if self.open_elements and self.open_elements[-1].name == "option":
|
|
948
|
+
self.open_elements.pop()
|
|
949
|
+
self._reconstruct_active_formatting_elements()
|
|
950
|
+
self._insert_element(token, push=True)
|
|
951
|
+
return
|
|
952
|
+
|
|
953
|
+
def _handle_body_start_rp_rt(self, token: Any) -> Any:
|
|
954
|
+
self._generate_implied_end_tags(exclude="rtc")
|
|
955
|
+
self._insert_element(token, push=True)
|
|
956
|
+
return
|
|
957
|
+
|
|
958
|
+
def _handle_body_start_rb_rtc(self, token: Any) -> Any:
|
|
959
|
+
if self.open_elements and self.open_elements[-1].name in {"rb", "rp", "rt", "rtc"}:
|
|
960
|
+
self._generate_implied_end_tags()
|
|
961
|
+
self._insert_element(token, push=True)
|
|
962
|
+
return
|
|
963
|
+
|
|
964
|
+
def _handle_body_start_table_parse_error(self, token: Any) -> Any:
|
|
965
|
+
self._parse_error("unexpected-start-tag", tag_name=token.name)
|
|
966
|
+
return
|
|
967
|
+
|
|
968
|
+
def _handle_body_start_default(self, token: Any) -> Any:
|
|
969
|
+
self._reconstruct_active_formatting_elements()
|
|
970
|
+
self._insert_element(token, push=True)
|
|
971
|
+
if token.self_closing:
|
|
972
|
+
self._parse_error("non-void-html-element-start-tag-with-trailing-solidus", tag_name=token.name)
|
|
973
|
+
# Elements reaching here have no handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
974
|
+
self.frameset_ok = False
|
|
975
|
+
return
|
|
976
|
+
|
|
977
|
+
def _mode_in_table(self, token: Any) -> Any:
    """Process one token under the "in table" insertion mode.

    Returns ``None`` when the token was consumed here, a
    ``("reprocess", mode, token)`` tuple when it must be handled again under
    another mode, or whatever the delegated mode handler (in body, in head,
    in template) returns.

    Character data is handed to ``IN_TABLE_TEXT`` after NUL characters are
    stripped. Data that is empty (originally, or after stripping) is dropped
    here, so ``IN_TABLE_TEXT`` always receives non-empty data as it assumes.
    """
    if isinstance(token, CharacterTokens):
        data = token.data or ""
        had_null = "\x00" in data
        if had_null:
            self._parse_error("unexpected-null-character")
            data = data.replace("\x00", "")
        if not data:
            # Nothing to buffer; do not enter table-text mode for an empty run.
            return None
        if had_null:
            token = CharacterTokens(data)
        self.pending_table_text = []
        self.table_text_original_mode = self.mode
        self.mode = InsertionMode.IN_TABLE_TEXT
        return ("reprocess", InsertionMode.IN_TABLE_TEXT, token)

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "caption":
                self._clear_stack_until({"table", "template", "html"})
                self._push_formatting_marker()
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_CAPTION
                return None
            if name == "colgroup":
                self._clear_stack_until({"table", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_COLUMN_GROUP
                return None
            if name == "col":
                # A bare <col> implies an enclosing <colgroup>.
                self._clear_stack_until({"table", "template", "html"})
                implied = Tag(Tag.START, "colgroup", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_COLUMN_GROUP
                return ("reprocess", InsertionMode.IN_COLUMN_GROUP, token)
            if name in {"tbody", "tfoot", "thead"}:
                self._clear_stack_until({"table", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_TABLE_BODY
                return None
            if name in {"td", "th", "tr"}:
                # Rows and cells imply an enclosing <tbody>.
                self._clear_stack_until({"table", "template", "html"})
                implied = Tag(Tag.START, "tbody", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_TABLE_BODY
                return ("reprocess", InsertionMode.IN_TABLE_BODY, token)
            if name == "table":
                # A nested <table> start tag closes the current table first.
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                if self._close_table_element():
                    return ("reprocess", self.mode, token)
                return None
            if name in {"style", "script"}:
                # Per HTML5 spec: style and script are inserted directly into the table
                # (not processed as in-head which would move them)
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
            if name == "template":
                # Template is handled by delegating to IN_HEAD
                return self._mode_in_head(token)
            if name == "input":
                input_type = next(
                    (
                        (attr_value or "").lower()
                        for attr_name, attr_value in token.attrs.items()
                        if attr_name == "type"
                    ),
                    None,
                )
                if input_type == "hidden":
                    self._parse_error("unexpected-hidden-input-in-table")
                    self._insert_element(token, push=True)
                    self.open_elements.pop()  # push=True always adds to stack
                    return None
                # Non-hidden inputs fall through to the foster-parenting path.
            if name == "form":
                self._parse_error("unexpected-form-in-table")
                if self.form_element is None:
                    node = self._insert_element(token, push=True)
                    self.form_element = node
                    self.open_elements.pop()  # push=True always adds to stack
                return None
            # Anything else: handle as in body with insert_from_table set,
            # restoring the previous flag value afterwards.
            self._parse_error("unexpected-start-tag-implies-table-voodoo", tag_name=name)
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
        else:
            if name == "table":
                self._close_table_element()
                return None
            if name in {"body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"}:
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            self._parse_error("unexpected-end-tag-implies-table-voodoo", tag_name=name)
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous

    # Per spec, only CharacterTokens, CommentToken, Tag, and EOFToken exist
    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    # If we're in a template, handle EOF in template mode first
    if self.template_modes:
        return self._mode_in_template(token)
    if self._has_in_table_scope("table"):
        self._parse_error("expected-closing-tag-but-got-eof", tag_name="table")
    return None
|
|
1087
|
+
|
|
1088
|
+
def _mode_in_table_text(self, token: Any) -> Any:
    """Process one token under the "in table text" insertion mode.

    Character data is buffered in ``pending_table_text`` after form feeds are
    removed (with an ``invalid-codepoint-in-table-text`` parse error). Any
    other token flushes the buffer, restores the mode saved when table text
    began (``IN_TABLE`` if none was saved) and asks for the token to be
    reprocessed under that mode.
    """
    if not isinstance(token, CharacterTokens):
        self._flush_pending_table_text()
        resume_mode = self.table_text_original_mode or InsertionMode.IN_TABLE
        self.table_text_original_mode = None
        self.mode = resume_mode
        return ("reprocess", resume_mode, token)

    # IN_TABLE mode guarantees non-empty data
    text = token.data
    if "\x0c" in text:
        self._parse_error("invalid-codepoint-in-table-text")
        text = text.replace("\x0c", "")
    if text:
        self.pending_table_text.append(text)
    return None
|
|
1103
|
+
|
|
1104
|
+
def _mode_in_caption(self, token: Any) -> Any:
    """Process one token under the "in caption" insertion mode.

    Table-structure start tags and ``table`` tags close the caption and are
    reprocessed in ``IN_TABLE`` when a caption could be closed. Everything
    not handled explicitly is delegated to the in-body rules.
    """
    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        tag = token.name
        is_start = token.kind == Tag.START

        if is_start and tag in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr", "td", "th"}:
            self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=tag)
            if self._close_caption_element():
                return ("reprocess", InsertionMode.IN_TABLE, token)
            # Fragment parsing with caption context: caption not on stack, ignore table structure elements
            return None

        if is_start and tag == "table":
            self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=tag)
            if self._close_caption_element():
                return ("reprocess", InsertionMode.IN_TABLE, token)
            # Fragment parsing: no caption on stack - handle in body mode
            return self._mode_in_body(token)

        if is_start:
            return self._mode_in_body(token)

        # End tags from here on.
        if tag == "caption":
            # Consumed whether or not a caption was actually closed.
            self._close_caption_element()
            return None
        if tag == "table":
            if self._close_caption_element():
                return ("reprocess", InsertionMode.IN_TABLE, token)
            return None
        if tag in {"tbody", "tfoot", "thead"}:
            # These elements are never in table scope when in caption -
            # caption closes any open tbody/tfoot/thead when created
            self._parse_error("unexpected-end-tag", tag_name=tag)
            return None
        return self._mode_in_body(token)

    if not isinstance(token, CharacterTokens):
        assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    # Character data and EOF are both handled by the in-body rules.
    return self._mode_in_body(token)
|
|
1142
|
+
|
|
1143
|
+
def _close_caption_element(self) -> bool:
    """Close the open ``caption`` element, if one is in table scope.

    Returns ``False`` (after reporting ``unexpected-end-tag``) when no
    caption is in table scope. Otherwise generates implied end tags, pops the
    stack through the caption, clears the active formatting list back to the
    last marker, switches to ``IN_TABLE`` and returns ``True``.
    """
    if not self._has_in_table_scope("caption"):
        self._parse_error("unexpected-end-tag", tag_name="caption")
        return False

    self._generate_implied_end_tags()
    # Caption verified in scope above
    stack = self.open_elements
    while stack and stack.pop().name != "caption":
        pass
    self._clear_active_formatting_up_to_marker()
    self.mode = InsertionMode.IN_TABLE
    return True
|
|
1156
|
+
|
|
1157
|
+
def _mode_in_column_group(self, token: Any) -> Any:
    """Process one token under the "in column group" insertion mode.

    Returns ``None`` when the token was consumed here, a
    ``("reprocess", IN_TABLE, token)`` tuple when the column group was left
    and the token must be handled by the table rules, or the result of the
    delegated in-body / in-head / in-template handler.

    Leading whitespace in character data is inserted as text. Only a
    non-empty remainder triggers the "anything else" handling; a
    whitespace-only (or empty) run is fully consumed and leaves the
    ``colgroup`` open.
    """
    current = self.open_elements[-1] if self.open_elements else None

    if isinstance(token, CharacterTokens):
        data = token.data or ""
        stripped = data.lstrip(" \t\n\r\f")

        if len(stripped) < len(data):
            # Has leading whitespace - insert it
            self._append_text(data[: len(data) - len(stripped)])

        if not stripped:
            # Nothing but whitespace: do not close the colgroup or report an
            # error for an empty remainder.
            return None

        # Continue processing non-whitespace with a new token
        non_ws_token = CharacterTokens(stripped)
        if current and current.name == "html":
            # Fragment parsing with colgroup context: drop non-whitespace characters
            # (This is the only way html can be current in IN_COLUMN_GROUP mode)
            self._parse_error("unexpected-characters-in-column-group")
            return None
        # In a template, non-whitespace characters are parse errors - ignore them
        if current and current.name == "template":
            self._parse_error("unexpected-characters-in-template-column-group")
            return None
        self._parse_error("unexpected-characters-in-column-group")
        self._pop_current()
        self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, non_ws_token)

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "html":
                return self._mode_in_body(token)
            if name == "col":
                self._insert_element(token, push=True)
                self.open_elements.pop()  # push=True always adds to stack
                return None
            if name == "template":
                # Template is handled by delegating to IN_HEAD
                return self._mode_in_head(token)
            if name == "colgroup":
                self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                # Don't pop template element - only pop actual colgroup
                if current and current.name == "colgroup":
                    self._pop_current()
                    self.mode = InsertionMode.IN_TABLE
                    return ("reprocess", InsertionMode.IN_TABLE, token)
                return None
            if (
                self.fragment_context
                and self.fragment_context.tag_name.lower() == "colgroup"
                and not self._has_in_table_scope("table")
            ):
                self._parse_error("unexpected-start-tag-in-column-group", tag_name=name)
                return None
            # Anything else: if we're in a colgroup, pop it and switch to IN_TABLE
            if current and current.name == "colgroup":
                self._pop_current()
                self.mode = InsertionMode.IN_TABLE
                return ("reprocess", InsertionMode.IN_TABLE, token)
            # In template column group context (via <col> in template), ignore non-column content
            # At this point current is template - the only other case after colgroup fragment
            # and colgroup element are handled
            self._parse_error("unexpected-start-tag-in-template-column-group", tag_name=name)
            return None

        # End tags.
        if name == "colgroup":
            if current and current.name == "colgroup":
                self._pop_current()
                self.mode = InsertionMode.IN_TABLE
            else:
                self._parse_error("unexpected-end-tag", tag_name=token.name)
            return None
        if name == "col":
            self._parse_error("unexpected-end-tag", tag_name=name)
            return None
        if name == "template":
            # Template end tag needs proper handling
            return self._mode_in_head(token)
        # NOTE(review): the original nesting of this fallback is not
        # recoverable from the flattened source; this keeps the reading in
        # which the token is always handed on to IN_TABLE - confirm.
        if current and current.name != "html":  # pragma: no branch
            self._pop_current()
            self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    if current and current.name == "colgroup":
        self._pop_current()
        self.mode = InsertionMode.IN_TABLE
        return ("reprocess", InsertionMode.IN_TABLE, token)
    if current and current.name == "template":
        # In template, delegate EOF handling to IN_TEMPLATE
        return self._mode_in_template(token)
    # Per spec: EOF when current is html - nothing more to do
    return None
|
|
1251
|
+
|
|
1252
|
+
def _mode_in_table_body(self, token: Any) -> Any:
    """Process one token under the "in table body" insertion mode.

    Returns ``None`` when the token was consumed here, a
    ``("reprocess", mode, token)`` tuple when it must be handled again under
    ``IN_ROW`` or ``IN_TABLE``, or the result of the in-table handler for
    everything this mode does not treat specially (including character data,
    comments and EOF).
    """
    if isinstance(token, (CharacterTokens, CommentToken)):
        return self._mode_in_table(token)

    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "tr":
                self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
                self._insert_element(token, push=True)
                self.mode = InsertionMode.IN_ROW
                return None
            if name in {"td", "th"}:
                # A cell directly in a table body implies an enclosing <tr>.
                self._parse_error("unexpected-cell-in-table-body")
                self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
                implied = Tag(Tag.START, "tr", {}, False)
                self._insert_element(implied, push=True)
                self.mode = InsertionMode.IN_ROW
                return ("reprocess", InsertionMode.IN_ROW, token)
            if name in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "table"}:
                current = self.open_elements[-1] if self.open_elements else None
                # When in a template, these tags create invalid structure - treat as "anything else"
                if current and current.name == "template":
                    self._parse_error("unexpected-start-tag-in-template-table-context", tag_name=name)
                    return None
                # In fragment parsing with tbody/tfoot/thead context and no tbody on stack, ignore these tags
                if (
                    self.fragment_context
                    and current
                    and current.name == "html"
                    and self.fragment_context.tag_name.lower() in {"tbody", "tfoot", "thead"}
                ):
                    # Pass the tag name like every other report of this error.
                    self._parse_error("unexpected-start-tag", tag_name=name)
                    return None
                # Pop tbody/tfoot/thead (stack always has elements here in normal parsing)
                if self.open_elements:
                    self.open_elements.pop()
                    self.mode = InsertionMode.IN_TABLE
                    return ("reprocess", InsertionMode.IN_TABLE, token)
                # Empty stack edge case - go directly to IN_TABLE without reprocess
                self.mode = InsertionMode.IN_TABLE  # pragma: no cover
                return None  # pragma: no cover
            return self._mode_in_table(token)

        # End tags.
        if name in {"tbody", "tfoot", "thead"}:
            if not self._has_in_table_scope(name):
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            self._clear_stack_until({"tbody", "tfoot", "thead", "template", "html"})
            self._pop_current()
            self.mode = InsertionMode.IN_TABLE
            return None
        if name == "table":
            current = self.open_elements[-1] if self.open_elements else None
            # In a template, reject </table> as there's no table element
            if current and current.name == "template":
                self._parse_error("unexpected-end-tag", tag_name=token.name)
                return None
            # In fragment parsing with tbody/tfoot/thead context and no tbody on stack, ignore </table>
            if (
                self.fragment_context
                and current
                and current.name == "html"
                and self.fragment_context.tag_name.lower() in {"tbody", "tfoot", "thead"}
            ):
                self._parse_error("unexpected-end-tag", tag_name=token.name)
                return None
            # NOTE(review): nesting reconstructed from flattened source; only
            # the pop is treated as conditional - confirm.
            if current and current.name in {"tbody", "tfoot", "thead"}:
                self.open_elements.pop()
            self.mode = InsertionMode.IN_TABLE
            return ("reprocess", InsertionMode.IN_TABLE, token)
        if name in {"caption", "col", "colgroup", "td", "th", "tr"}:
            self._parse_error("unexpected-end-tag", tag_name=name)
            return None
        return self._mode_in_table(token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return self._mode_in_table(token)
|
|
1327
|
+
|
|
1328
|
+
def _mode_in_row(self, token: Any) -> Any:
    """Process one token under the "in row" insertion mode.

    Returns ``None`` when the token was consumed here, a
    ``("reprocess", mode, token)`` tuple when the row was closed and the
    token must be handled under the mode that is current afterwards, or the
    result of the delegated in-table / in-body handler.

    End tags ``body``, ``caption``, ``col``, ``colgroup``, ``html``, ``td``
    and ``th`` are reported as ``unexpected-end-tag`` and ignored.
    """
    if isinstance(token, (CharacterTokens, CommentToken)):
        return self._mode_in_table(token)

    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name in {"td", "th"}:
                self._clear_stack_until({"tr", "template", "html"})
                self._insert_element(token, push=True)
                self._push_formatting_marker()
                self.mode = InsertionMode.IN_CELL
                return None
            if name in {"caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr", "table"}:
                if not self._has_in_table_scope("tr"):
                    self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
                    return None
                # Close the row, then let the mode chosen by _end_tr_element
                # handle the tag.
                self._end_tr_element()
                return ("reprocess", self.mode, token)
            # Anything else: in-body rules with insert_from_table set,
            # restoring the previous flag value afterwards.
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous
        else:
            if name == "tr":
                if not self._has_in_table_scope("tr"):
                    self._parse_error("unexpected-end-tag", tag_name=name)
                    return None
                self._end_tr_element()
                return None
            if name in {"table", "tbody", "tfoot", "thead"}:
                if self._has_in_table_scope(name):
                    self._end_tr_element()
                    return ("reprocess", self.mode, token)
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            # "colgroup" replaces the former "group" typo; "body" and "html"
            # complete the set of end tags the HTML standard ignores here.
            if name in {"body", "caption", "col", "colgroup", "html", "td", "th"}:
                self._parse_error("unexpected-end-tag", tag_name=name)
                return None
            previous = self.insert_from_table
            self.insert_from_table = True
            try:
                return self._mode_in_body(token)
            finally:
                self.insert_from_table = previous

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return self._mode_in_table(token)
|
|
1376
|
+
|
|
1377
|
+
def _end_tr_element(self) -> None:
|
|
1378
|
+
self._clear_stack_until({"tr", "template", "html"})
|
|
1379
|
+
# Pop tr if on top (may not be if stack was exhausted)
|
|
1380
|
+
if self.open_elements and self.open_elements[-1].name == "tr":
|
|
1381
|
+
self.open_elements.pop()
|
|
1382
|
+
# When in a template, restore template mode; otherwise use IN_TABLE_BODY
|
|
1383
|
+
if self.template_modes:
|
|
1384
|
+
self.mode = self.template_modes[-1]
|
|
1385
|
+
else:
|
|
1386
|
+
self.mode = InsertionMode.IN_TABLE_BODY
|
|
1387
|
+
|
|
1388
|
+
def _mode_in_cell(self, token: Any) -> Any:
    """Process one token while the builder is in the "in cell" mode.

    Returns ``None`` when the token was consumed or ignored, a
    ``("reprocess", mode, token)`` tuple when the token has to be handled
    again in ``mode``, or the result of the handler the token was delegated to.
    """
    if not isinstance(token, CharacterTokens):
        if isinstance(token, CommentToken):
            self._append_comment(token.data)
            return None

        if not isinstance(token, Tag):
            assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
            # EOF: if a cell could be closed, retry in the resulting mode;
            # otherwise hand the token to the in-table handler.
            if self._close_table_cell():
                return ("reprocess", self.mode, token)
            return self._mode_in_table(token)

        tag_name = token.name
        if token.kind == Tag.START:
            if tag_name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"}:
                if self._close_table_cell():
                    return ("reprocess", self.mode, token)
                # Per spec: if we reach here in IN_CELL mode with no cell to close,
                # we're in a fragment context with td/th as context element and no
                # table structure. Issue parse error and ignore the token.
                self._parse_error("unexpected-start-tag-in-cell-fragment", tag_name=tag_name)
                return None
        elif tag_name in {"td", "th"}:
            if self._has_in_table_scope(tag_name):
                self._end_table_cell(tag_name)
            else:
                self._parse_error("unexpected-end-tag", tag_name=tag_name)
            return None
        elif tag_name in {"table", "tbody", "tfoot", "thead", "tr"}:
            # Per HTML5 spec: only close the cell if the element is actually in
            # table scope; otherwise it's a parse error and the token is ignored.
            if self._has_in_table_scope(tag_name):
                self._close_table_cell()
                return ("reprocess", self.mode, token)
            self._parse_error("unexpected-end-tag", tag_name=tag_name)
            return None

    # Character data and every tag not handled above are processed by the
    # in-body handler with insert_from_table forced to False for the duration
    # of the call; the previous flag value is restored even if it raises.
    saved_flag = self.insert_from_table
    self.insert_from_table = False
    try:
        return self._mode_in_body(token)
    finally:
        self.insert_from_table = saved_flag
|
|
1441
|
+
|
|
1442
|
+
def _mode_in_select(self, token: Any) -> Any:
    """Process one token while the builder is in the "in select" mode.

    Returns ``None`` when the token was consumed or ignored, a
    ``("reprocess", mode, token)`` tuple when the token has to be handled
    again in ``mode``, or the result of the handler the token was delegated to.

    Character data has NUL and form-feed characters stripped (one
    ``invalid-codepoint-in-select`` parse error per kind present) before it
    is appended.

    Start tags that match none of the start-tag branches are not returned
    early: they continue into the checks written for end tags below.
    """
    if isinstance(token, CharacterTokens):
        text = token.data or ""
        for bad_char in ("\x00", "\x0c"):
            if bad_char in text:
                self._parse_error("invalid-codepoint-in-select")
                text = text.replace(bad_char, "")
        if text:
            self._reconstruct_active_formatting_elements()
            self._append_text(text)
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if not isinstance(token, Tag):
        assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
        return self._mode_in_body(token)

    name = token.name

    if token.kind == Tag.START:
        if name == "html":
            return ("reprocess", InsertionMode.IN_BODY, token)

        if name in {"option", "optgroup"}:
            # A new option closes an open option; a new optgroup additionally
            # closes an open optgroup.
            if self.open_elements and self.open_elements[-1].name == "option":
                self.open_elements.pop()
            if name == "optgroup" and self.open_elements and self.open_elements[-1].name == "optgroup":
                self.open_elements.pop()
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=True)
            return None

        if name in {
            "select",
            "input",
            "textarea",
            "caption",
            "col",
            "colgroup",
            "tbody",
            "td",
            "tfoot",
            "th",
            "thead",
            "tr",
            "table",
        }:
            self._parse_error("unexpected-start-tag-implies-end-tag", tag_name=name)
            # select is always in scope in IN_SELECT mode
            self._pop_until_any_inclusive({"select"})
            self._reset_insertion_mode()
            # A nested <select> only closes the open one; every other tag in
            # this group is reprocessed in the mode chosen by the reset.
            if name == "select":
                return None
            return ("reprocess", self.mode, token)

        if name == "keygen":
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=False)
            return None

        if name in {"script", "template"}:
            return self._mode_in_head(token)

        if name in {"svg", "math"}:
            # For foreign elements, honor the self-closing flag
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=not token.self_closing, namespace=name)
            return None

        if name in FORMATTING_ELEMENTS:
            self._reconstruct_active_formatting_elements()
            inserted = self._insert_element(token, push=True)
            self._append_active_formatting_entry(name, token.attrs, inserted)
            return None

        if name == "hr":
            # Per spec: pop option and optgroup before inserting hr (makes hr sibling, not child)
            if self.open_elements and self.open_elements[-1].name == "option":
                self.open_elements.pop()
            if self.open_elements and self.open_elements[-1].name == "optgroup":
                self.open_elements.pop()
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=False)
            return None

        if name in {"menuitem", "plaintext"}:
            # Per spec: plaintext element is inserted in select (consumes all remaining text)
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=True)
            return None

        # Allow common HTML elements in select (newer spec)
        if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=not token.self_closing)
            return None

        if name in {"br", "img"}:
            self._reconstruct_active_formatting_elements()
            self._insert_element(token, push=False)
            return None

        # No return here: remaining start tags continue into the checks below.

    if name == "optgroup":
        if self.open_elements and self.open_elements[-1].name == "option":
            self.open_elements.pop()
        if self.open_elements and self.open_elements[-1].name == "optgroup":
            self.open_elements.pop()
        else:
            self._parse_error("unexpected-end-tag", tag_name=token.name)
        return None

    if name == "option":
        if self.open_elements and self.open_elements[-1].name == "option":
            self.open_elements.pop()
        else:
            self._parse_error("unexpected-end-tag", tag_name=token.name)
        return None

    if name == "select":
        # In IN_SELECT mode, select is always in scope - pop to it
        self._pop_until_any_inclusive({"select"})
        self._reset_insertion_mode()
        return None

    # Handle end tags for allowed HTML elements in select
    if name == "a" or name in FORMATTING_ELEMENTS:
        select_node = self._find_last_on_stack("select")
        fmt_index = self._find_active_formatting_index(name)
        if fmt_index is not None:
            target = self.active_formatting[fmt_index]["node"]
            # Guard against a missing select: passing None to list.index()
            # would raise ValueError. NOTE(review): a select-less stack
            # presumably only occurs in fragment parsing with a select
            # context element - confirm. Without a select there is no
            # boundary to protect, so fall through to the adoption agency.
            if select_node is not None and target in self.open_elements:  # pragma: no branch
                select_index = self.open_elements.index(select_node)
                target_index = self.open_elements.index(target)
                if target_index < select_index:
                    # The formatting element was opened outside the select;
                    # it must not be closed from inside it.
                    self._parse_error("unexpected-end-tag", tag_name=name)
                    return None
        self._adoption_agency(name)
        return None

    if name in {"p", "div", "span", "button", "datalist", "selectedcontent"}:
        # Per HTML5 spec: these end tags in select mode close the element if it's on the stack.
        # But we must not pop across the select boundary (i.e., don't pop elements BEFORE select).
        stack = self.open_elements
        first_select = next((i for i, el in enumerate(stack) if el.name == "select"), None)
        last_target = next((i for i in reversed(range(len(stack))) if stack[i].name == name), None)
        if last_target is not None and (first_select is None or last_target > first_select):
            # Drop the matching element and everything opened after it.
            del stack[last_target:]
        else:
            self._parse_error("unexpected-end-tag", tag_name=name)
        return None

    if name in {"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "table"}:
        self._parse_error("unexpected-end-tag", tag_name=name)
        # select is always in scope in IN_SELECT mode
        self._pop_until_any_inclusive({"select"})
        self._reset_insertion_mode()
        return ("reprocess", self.mode, token)

    # Any other end tag: parse error, ignore
    self._parse_error("unexpected-end-tag", tag_name=name)
    return None
|
|
1603
|
+
|
|
1604
|
+
def _mode_in_template(self, token: Any) -> Any:
    """Process one token while the builder is in the "in template" mode.

    Start tags either replace the innermost template mode (and are
    reprocessed in the new mode) or, for head-related tags, are delegated to
    the in-head handler. Returns ``None`` for ignored tokens, a
    ``("reprocess", mode, token)`` tuple, or the delegated handler's result.
    """
    # § The "in template" insertion mode
    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
    if isinstance(token, (CharacterTokens, CommentToken)):
        return self._mode_in_body(token)

    if isinstance(token, Tag):
        name = token.name

        if token.kind == Tag.START:
            # Pick the mode this start tag switches the template to, if any.
            if name in {"caption", "colgroup", "tbody", "tfoot", "thead"}:
                next_mode = InsertionMode.IN_TABLE
            elif name == "col":
                next_mode = InsertionMode.IN_COLUMN_GROUP
            elif name == "tr":
                next_mode = InsertionMode.IN_TABLE_BODY
            elif name in {"td", "th"}:
                next_mode = InsertionMode.IN_ROW
            elif name not in {
                "base",
                "basefont",
                "bgsound",
                "link",
                "meta",
                "noframes",
                "script",
                "style",
                "template",
                "title",
            }:
                # Default: pop template mode and push IN_BODY
                next_mode = InsertionMode.IN_BODY
            else:
                next_mode = None

            if next_mode is not None:
                self.template_modes.pop()
                self.template_modes.append(next_mode)
                self.mode = next_mode
                return ("reprocess", next_mode, token)

        # Head-related tags process in InHead. "template" is part of this
        # set, so the </template> end tag is delegated here as well.
        if name in {
            "base",
            "basefont",
            "bgsound",
            "link",
            "meta",
            "noframes",
            "script",
            "style",
            "template",
            "title",
        }:
            return self._mode_in_head(token)

    if isinstance(token, EOFToken):
        # Check if template is on the stack (don't use _in_scope as table blocks it)
        if all(node.name != "template" for node in self.open_elements):
            return None
        # Parse error for EOF in template
        self._parse_error("expected-closing-tag-but-got-eof", tag_name="template")
        # Pop until template, then handle EOF in reset mode
        self._pop_until_inclusive("template")
        self._clear_active_formatting_up_to_marker()
        # template_modes is always non-empty when template is on stack
        self.template_modes.pop()
        self._reset_insertion_mode()
        return ("reprocess", self.mode, token)

    # Remaining tokens (e.g. other end tags) are ignored.
    return None
|
|
1682
|
+
|
|
1683
|
+
def _mode_after_body(self, token: Any) -> Any:
    """Process one token while the builder is in the "after body" mode.

    Returns ``None`` when the token was consumed, or
    ``("reprocess", InsertionMode.IN_BODY, token)`` when it has to be handled
    by the in-body mode.
    """
    if isinstance(token, CharacterTokens):
        if not is_all_whitespace(token.data):
            return ("reprocess", InsertionMode.IN_BODY, token)
        # Whitespace is processed using InBody rules (appended to body)
        # but we stay in AfterBody mode, so the handler's result is dropped.
        self._mode_in_body(token)
        return None

    if isinstance(token, CommentToken):
        # Comments are attached to the first element on the stack of open elements.
        self._append_comment(token.data, parent=self.open_elements[0])
        return None

    if isinstance(token, Tag):
        if token.name == "html":
            if token.kind == Tag.START:
                return ("reprocess", InsertionMode.IN_BODY, token)
            if token.kind == Tag.END:
                self.mode = InsertionMode.AFTER_AFTER_BODY
                return None
        return ("reprocess", InsertionMode.IN_BODY, token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return None
|
|
1703
|
+
|
|
1704
|
+
def _mode_after_after_body(self, token: Any) -> Any:
    """Process one token while the builder is in the "after after body" mode.

    Returns ``None`` when the token was consumed, or
    ``("reprocess", InsertionMode.IN_BODY, token)`` when it has to be handled
    by the in-body mode.
    """
    if isinstance(token, CharacterTokens):
        if not is_all_whitespace(token.data):
            # Non-whitespace character: parse error, reprocess in IN_BODY
            self._parse_error("unexpected-char-after-body")
            return ("reprocess", InsertionMode.IN_BODY, token)
        # Per spec: whitespace characters are inserted using the rules for the
        # "in body" mode. The mode itself stays AfterAfterBody.
        self._mode_in_body(token)
        return None

    if isinstance(token, CommentToken):
        if self.fragment_context is None:
            self._append_comment_to_document(token.data)
        else:
            # html is always on stack in fragment parsing
            html_node = self._find_last_on_stack("html")
            html_node.append_child(SimpleDomNode("#comment", data=token.data))
        return None

    if isinstance(token, Tag):
        # Any tag other than <html>: parse error first; either way the token
        # is reprocessed in IN_BODY.
        if not (token.kind == Tag.START and token.name == "html"):
            self._parse_error("unexpected-token-after-body")
        return ("reprocess", InsertionMode.IN_BODY, token)

    assert isinstance(token, EOFToken), f"Unexpected token type: {type(token)}"
    return None
|
|
1730
|
+
|
|
1731
|
+
def _mode_in_frameset(self, token: Any) -> Any:
    """Process one token while the builder is in the "in frameset" mode.

    Returns ``None`` when the token was consumed or ignored, or
    ``("reprocess", InsertionMode.IN_BODY, token)`` for an ``<html>`` start tag.
    """
    # Per HTML5 spec §13.2.6.4.16: In frameset insertion mode
    if isinstance(token, CharacterTokens):
        # Only whitespace characters allowed; ignore all others
        kept = [ch for ch in token.data if ch in "\t\n\f\r "]
        if kept:
            self._append_text("".join(kept))
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        kind = token.kind
        name = token.name

        if kind == Tag.START:
            if name == "html":
                return ("reprocess", InsertionMode.IN_BODY, token)
            if name == "frameset":
                self._insert_element(token, push=True)
                return None
            if name == "frame":
                # Inserted and immediately taken off the stack again.
                self._insert_element(token, push=True)
                self.open_elements.pop()
                return None
            if name == "noframes":
                # Per spec: use IN_HEAD rules but preserve current mode for TEXT restoration
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
        elif kind == Tag.END and name == "frameset":
            if self.open_elements and self.open_elements[-1].name == "html":
                self._parse_error("unexpected-end-tag", tag_name=token.name)
                return None
            self.open_elements.pop()
            # Leaving the outermost frameset switches to AfterFrameset.
            if self.open_elements and self.open_elements[-1].name != "frameset":
                self.mode = InsertionMode.AFTER_FRAMESET
            return None
        # Any other tag continues to the parse error at the end.

    if isinstance(token, EOFToken):
        if self.open_elements and self.open_elements[-1].name != "html":
            self._parse_error("expected-closing-tag-but-got-eof", tag_name=self.open_elements[-1].name)
        return None

    self._parse_error("unexpected-token-in-frameset")
    return None
|
|
1772
|
+
|
|
1773
|
+
def _mode_after_frameset(self, token: Any) -> Any:
    """Process one token while the builder is in the "after frameset" mode.

    Returns ``None`` when the token was consumed or ignored, or a
    ``("reprocess", mode, token)`` tuple when it has to be handled in ``mode``.
    """
    # Per HTML5 spec §13.2.6.4.17: After frameset insertion mode
    if isinstance(token, CharacterTokens):
        # Only whitespace characters allowed; ignore all others
        kept = [ch for ch in token.data if ch in "\t\n\f\r "]
        if kept:
            self._append_text("".join(kept))
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    if isinstance(token, Tag):
        name = token.name
        if token.kind == Tag.START:
            if name == "html":
                return ("reprocess", InsertionMode.IN_BODY, token)
            if name == "noframes":
                # Insert noframes element directly and switch to TEXT mode
                self._insert_element(token, push=True)
                self.original_mode = self.mode
                self.mode = InsertionMode.TEXT
                return None
        elif token.kind == Tag.END and name == "html":
            self.mode = InsertionMode.AFTER_AFTER_FRAMESET
            return None
        # Any other tag continues to the parse error at the end.

    if isinstance(token, EOFToken):
        return None

    # Everything else: parse error, then switch back to IN_FRAMESET and
    # reprocess the token there.
    self._parse_error("unexpected-token-after-frameset")
    self.mode = InsertionMode.IN_FRAMESET
    return ("reprocess", InsertionMode.IN_FRAMESET, token)
|
|
1801
|
+
|
|
1802
|
+
def _mode_after_after_frameset(self, token: Any) -> Any:
    """Process one token while the builder is in the "after after frameset" mode.

    Returns ``None`` when the token was consumed, or a
    ``("reprocess", mode, token)`` tuple when it has to be handled in ``mode``.
    """
    # Per HTML5 spec §13.2.6.4.18: After after frameset insertion mode
    if isinstance(token, CharacterTokens) and is_all_whitespace(token.data):
        # Whitespace is processed using InBody rules
        # but we stay in AfterAfterFrameset mode
        self._mode_in_body(token)
        return None
    # Non-whitespace character data continues to "Anything else" below.

    if isinstance(token, CommentToken):
        self._append_comment_to_document(token.data)
        return None

    if isinstance(token, Tag) and token.kind == Tag.START:
        if token.name == "html":
            return ("reprocess", InsertionMode.IN_BODY, token)
        if token.name == "noframes":
            # Insert noframes element directly and switch to TEXT mode
            self._insert_element(token, push=True)
            self.original_mode = self.mode
            self.mode = InsertionMode.TEXT
            return None
    # Other tags continue to "Anything else" below.

    if isinstance(token, EOFToken):
        return None

    # Anything else: parse error, reprocess in IN_FRAMESET
    self._parse_error("unexpected-token-after-after-frameset")
    self.mode = InsertionMode.IN_FRAMESET
    return ("reprocess", InsertionMode.IN_FRAMESET, token)
|
|
1830
|
+
|
|
1831
|
+
# Helpers ----------------------------------------------------------------
|
|
1832
|
+
|
|
1833
|
+
# Insertion-mode handler functions collected in a fixed order.
# NOTE(review): presumably indexed by the numeric InsertionMode value, in
# which case the order below must match that enum exactly — confirm against
# treebuilder_utils.InsertionMode before reordering or inserting entries.
_MODE_HANDLERS = [
    _mode_initial,
    _mode_before_html,
    _mode_before_head,
    _mode_in_head,
    _mode_in_head_noscript,
    _mode_after_head,
    _mode_text,
    _mode_in_body,
    _mode_after_body,
    _mode_after_after_body,
    _mode_in_table,
    _mode_in_table_text,
    _mode_in_caption,
    _mode_in_column_group,
    _mode_in_table_body,
    _mode_in_row,
    _mode_in_cell,
    _mode_in_frameset,
    _mode_after_frameset,
    _mode_after_after_frameset,
    _mode_in_select,
    _mode_in_template,
]
|
|
1857
|
+
|
|
1858
|
+
# Maps each token class to the function that handles tokens of that class
# under the in-body rules.
# NOTE(review): presumably looked up by the exact type of the token — confirm
# at the call site before introducing token subclasses.
_BODY_TOKEN_HANDLERS = {
    CharacterTokens: _handle_characters_in_body,
    CommentToken: _handle_comment_in_body,
    Tag: _handle_tag_in_body,
    EOFToken: _handle_eof_in_body,
}
|
|
1864
|
+
|
|
1865
|
+
# Start-tag name -> handler function for the in-body rules. Tag names that
# share a handler are grouped; names with a dedicated handler follow.
_BODY_START_HANDLERS = {
    **dict.fromkeys(
        (
            "address",
            "article",
            "aside",
            "blockquote",
            "center",
            "details",
            "dialog",
            "dir",
            "div",
            "dl",
            "fieldset",
            "figcaption",
            "figure",
            "footer",
            "header",
            "hgroup",
            "main",
            "menu",
            "nav",
            "ol",
            "search",
            "section",
            "summary",
            "ul",
        ),
        _handle_body_start_block_with_p,
    ),
    **dict.fromkeys(
        ("b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"),
        _handle_body_start_formatting,
    ),
    **dict.fromkeys(
        ("base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title"),
        _handle_body_start_in_head,
    ),
    **dict.fromkeys(("h1", "h2", "h3", "h4", "h5", "h6"), _handle_body_start_heading),
    **dict.fromkeys(
        ("colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"),
        _handle_body_start_structure_ignored,
    ),
    **dict.fromkeys(("area", "embed", "img", "keygen", "wbr"), _handle_body_start_void_with_formatting),
    **dict.fromkeys(("applet", "marquee", "object"), _handle_body_start_applet_like),
    **dict.fromkeys(("param", "source", "track"), _handle_body_start_simple_void),
    **dict.fromkeys(("col", "frame"), _handle_body_start_col_or_frame),
    **dict.fromkeys(("dd", "dt"), _handle_body_start_dd_dt),
    **dict.fromkeys(("listing", "pre"), _handle_body_start_pre_listing),
    **dict.fromkeys(("plaintext", "xmp"), _handle_body_start_plaintext_xmp),
    **dict.fromkeys(("rb", "rtc"), _handle_body_start_rb_rtc),
    **dict.fromkeys(("rp", "rt"), _handle_body_start_rp_rt),
    # Tags with a dedicated handler
    "a": _handle_body_start_a,
    "body": _handle_body_start_body,
    "br": _handle_body_start_br,
    "button": _handle_body_start_button,
    "caption": _handle_body_start_table_parse_error,
    "form": _handle_body_start_form,
    "frameset": _handle_body_start_frameset,
    "head": _handle_body_start_head,
    "html": _handle_body_start_html,
    "image": _handle_body_start_image,
    "input": _handle_body_start_input,
    "li": _handle_body_start_li,
    "math": _handle_body_start_math,
    "optgroup": _handle_body_start_optgroup,
    "option": _handle_body_start_option,
    "p": _handle_body_start_paragraph,
    "select": _handle_body_start_select,
    "svg": _handle_body_start_svg,
    "table": _handle_body_start_table,
    "textarea": _handle_body_start_textarea,
}
|
|
1970
|
+
# End-tag name -> handler function for the in-body rules. Tag names that
# share a handler are grouped; names with a dedicated handler follow.
_BODY_END_HANDLERS = {
    **dict.fromkeys(
        (
            "address",
            "article",
            "aside",
            "blockquote",
            "button",
            "center",
            "details",
            "dialog",
            "dir",
            "div",
            "dl",
            "fieldset",
            "figcaption",
            "figure",
            "footer",
            "header",
            "hgroup",
            "listing",
            "main",
            "menu",
            "nav",
            "ol",
            "pre",
            "search",
            "section",
            "summary",
            "table",
            "ul",
        ),
        _handle_body_end_block,
    ),
    **dict.fromkeys(("h1", "h2", "h3", "h4", "h5", "h6"), _handle_body_end_heading),
    **dict.fromkeys(("applet", "marquee", "object"), _handle_body_end_applet_like),
    **dict.fromkeys(("dd", "dt"), _handle_body_end_dd_dt),
    # Tags with a dedicated handler
    "body": _handle_body_end_body,
    "form": _handle_body_end_form,
    "html": _handle_body_end_html,
    "li": _handle_body_end_li,
    "p": _handle_body_end_p,
    "template": _handle_body_end_template,
}
|