justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
justhtml/treebuilder.py
ADDED
|
@@ -0,0 +1,1231 @@
|
|
|
1
|
+
# ruff: noqa: S101, PLW2901
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from .constants import (
|
|
5
|
+
BUTTON_SCOPE_TERMINATORS,
|
|
6
|
+
DEFAULT_SCOPE_TERMINATORS,
|
|
7
|
+
DEFINITION_SCOPE_TERMINATORS,
|
|
8
|
+
FOREIGN_ATTRIBUTE_ADJUSTMENTS,
|
|
9
|
+
FOREIGN_BREAKOUT_ELEMENTS,
|
|
10
|
+
FORMAT_MARKER,
|
|
11
|
+
FORMATTING_ELEMENTS,
|
|
12
|
+
HTML_INTEGRATION_POINT_SET,
|
|
13
|
+
IMPLIED_END_TAGS,
|
|
14
|
+
LIST_ITEM_SCOPE_TERMINATORS,
|
|
15
|
+
MATHML_ATTRIBUTE_ADJUSTMENTS,
|
|
16
|
+
MATHML_TEXT_INTEGRATION_POINT_SET,
|
|
17
|
+
SPECIAL_ELEMENTS,
|
|
18
|
+
SVG_ATTRIBUTE_ADJUSTMENTS,
|
|
19
|
+
SVG_TAG_NAME_ADJUSTMENTS,
|
|
20
|
+
TABLE_ALLOWED_CHILDREN,
|
|
21
|
+
TABLE_FOSTER_TARGETS,
|
|
22
|
+
TABLE_SCOPE_TERMINATORS,
|
|
23
|
+
)
|
|
24
|
+
from .errors import generate_error_message
|
|
25
|
+
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
26
|
+
from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
27
|
+
from .treebuilder_modes import TreeBuilderModesMixin
|
|
28
|
+
from .treebuilder_utils import (
|
|
29
|
+
InsertionMode,
|
|
30
|
+
is_all_whitespace,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class TreeBuilder(TreeBuilderModesMixin):
|
|
35
|
+
__slots__ = (
|
|
36
|
+
"_body_end_handlers",
|
|
37
|
+
"_body_start_handlers",
|
|
38
|
+
"_body_token_handlers",
|
|
39
|
+
"_mode_handlers",
|
|
40
|
+
"active_formatting",
|
|
41
|
+
"collect_errors",
|
|
42
|
+
"document",
|
|
43
|
+
"errors",
|
|
44
|
+
"form_element",
|
|
45
|
+
"fragment_context",
|
|
46
|
+
"fragment_context_element",
|
|
47
|
+
"frameset_ok",
|
|
48
|
+
"head_element",
|
|
49
|
+
"iframe_srcdoc",
|
|
50
|
+
"ignore_lf",
|
|
51
|
+
"insert_from_table",
|
|
52
|
+
"mode",
|
|
53
|
+
"open_elements",
|
|
54
|
+
"original_mode",
|
|
55
|
+
"pending_table_text",
|
|
56
|
+
"quirks_mode",
|
|
57
|
+
"table_text_original_mode",
|
|
58
|
+
"template_modes",
|
|
59
|
+
"tokenizer",
|
|
60
|
+
"tokenizer_state_override",
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
def __init__(
    self,
    fragment_context=None,
    iframe_srcdoc=False,
    collect_errors=False,
):
    """Initialise tree-builder state for a document or a fragment parse.

    Args:
        fragment_context: Context element description for fragment parsing
            (its ``namespace`` and ``tag_name`` attributes are read), or
            ``None`` to parse a whole document.
        iframe_srcdoc: Stored on the instance as ``iframe_srcdoc``.
        collect_errors: When true, ``_parse_error`` records errors in
            ``self.errors``.
    """
    parsing_fragment = fragment_context is not None

    self.fragment_context = fragment_context
    self.iframe_srcdoc = iframe_srcdoc
    self.collect_errors = collect_errors
    self.errors = []
    self.tokenizer = None  # Set by parser after tokenizer is created
    self.fragment_context_element = None
    self.document = SimpleDomNode("#document-fragment" if parsing_fragment else "#document")
    self.mode = InsertionMode.INITIAL
    self.original_mode = None
    self.table_text_original_mode = None
    self.open_elements = []
    self.head_element = None
    self.form_element = None
    self.frameset_ok = True
    self.quirks_mode = "no-quirks"
    self.ignore_lf = False
    self.active_formatting = []
    self.insert_from_table = False
    self.pending_table_text = []
    self.template_modes = []
    self.tokenizer_state_override = None

    if not parsing_fragment:
        return

    # Fragment parsing per HTML5 spec: a synthetic <html> root is always
    # created and becomes the bottom of the stack of open elements.
    html_root = self._create_element("html", None, {})
    self.document.append_child(html_root)
    self.open_elements.append(html_root)

    namespace = fragment_context.namespace
    context_name = fragment_context.tag_name or ""
    lowered_name = context_name.lower()

    # A foreign (non-HTML) context gets a stand-in context element so that
    # foreign-content handling applies to the fragment's tokens.
    if namespace and namespace not in {None, "html"}:
        if namespace == "svg":
            element_name = self._adjust_svg_tag_name(context_name)
        else:
            element_name = context_name
        foreign_context = self._create_element(element_name, namespace, {})
        html_root.append_child(foreign_context)
        self.open_elements.append(foreign_context)
        self.fragment_context_element = foreign_context

    # Initial insertion mode for table-related HTML context elements.
    table_context_modes = {
        "tbody": InsertionMode.IN_TABLE_BODY,
        "thead": InsertionMode.IN_TABLE_BODY,
        "tfoot": InsertionMode.IN_TABLE_BODY,
        "tr": InsertionMode.IN_ROW,
        "td": InsertionMode.IN_CELL,
        "th": InsertionMode.IN_CELL,
        "caption": InsertionMode.IN_CAPTION,
        "colgroup": InsertionMode.IN_COLUMN_GROUP,
        "table": InsertionMode.IN_TABLE,
    }

    if lowered_name == "html":
        # No head/body is pre-created for an html context; start before head.
        self.mode = InsertionMode.BEFORE_HEAD
    elif namespace in {None, "html"} and lowered_name in table_context_modes:
        # Table modes only apply to HTML-namespace context elements.
        self.mode = table_context_modes[lowered_name]
    else:
        self.mode = InsertionMode.IN_BODY

    # For fragments, frameset_ok starts as False.
    self.frameset_ok = False
|
|
136
|
+
|
|
137
|
+
def _set_quirks_mode(self, mode):
    """Store ``mode`` as the document's quirks mode (initially "no-quirks")."""
    self.quirks_mode = mode
|
|
139
|
+
|
|
140
|
+
def _parse_error(self, code, tag_name=None, token=None):
|
|
141
|
+
if not self.collect_errors:
|
|
142
|
+
return
|
|
143
|
+
# Use the position of the last emitted token (set by tokenizer before emit)
|
|
144
|
+
line = None
|
|
145
|
+
column = None
|
|
146
|
+
end_column = None
|
|
147
|
+
if self.tokenizer: # pragma: no branch
|
|
148
|
+
line = self.tokenizer.last_token_line
|
|
149
|
+
column = self.tokenizer.last_token_column
|
|
150
|
+
|
|
151
|
+
# Calculate start and end columns based on token type for precise highlighting
|
|
152
|
+
# Note: column from tokenizer points AFTER the last character (0-indexed)
|
|
153
|
+
if token is not None and isinstance(token, Tag):
|
|
154
|
+
# Tag: <name> or </name> plus attributes
|
|
155
|
+
tag_len = len(token.name) + 2 # < + name + >
|
|
156
|
+
if token.kind == Tag.END:
|
|
157
|
+
tag_len += 1 # </name>
|
|
158
|
+
# Add attribute lengths
|
|
159
|
+
for attr_name, attr_value in token.attrs.items():
|
|
160
|
+
tag_len += 1 + len(attr_name) # space + name
|
|
161
|
+
if attr_value:
|
|
162
|
+
tag_len += 1 + 2 + len(attr_value) # = + "value"
|
|
163
|
+
if token.self_closing:
|
|
164
|
+
tag_len += 1 # /
|
|
165
|
+
# column points after >, so start is column - tag_len + 1 (for 1-indexed)
|
|
166
|
+
start_column = column - tag_len + 1
|
|
167
|
+
column = start_column
|
|
168
|
+
end_column = column + tag_len
|
|
169
|
+
|
|
170
|
+
message = generate_error_message(code, tag_name)
|
|
171
|
+
source_html = self.tokenizer.buffer if self.tokenizer else None
|
|
172
|
+
self.errors.append(
|
|
173
|
+
ParseError(
|
|
174
|
+
code,
|
|
175
|
+
line=line,
|
|
176
|
+
column=column,
|
|
177
|
+
message=message,
|
|
178
|
+
source_html=source_html,
|
|
179
|
+
end_column=end_column,
|
|
180
|
+
)
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
def _has_element_in_scope(self, target, terminators=None, check_integration_points=True):
|
|
184
|
+
if terminators is None:
|
|
185
|
+
terminators = DEFAULT_SCOPE_TERMINATORS
|
|
186
|
+
for node in reversed(self.open_elements):
|
|
187
|
+
if node.name == target:
|
|
188
|
+
return True
|
|
189
|
+
ns = node.namespace
|
|
190
|
+
if ns == "html" or ns is None:
|
|
191
|
+
if node.name in terminators:
|
|
192
|
+
return False
|
|
193
|
+
elif check_integration_points and (
|
|
194
|
+
self._is_html_integration_point(node) or self._is_mathml_text_integration_point(node)
|
|
195
|
+
):
|
|
196
|
+
return False
|
|
197
|
+
return False
|
|
198
|
+
|
|
199
|
+
def _has_element_in_button_scope(self, target):
    """Return the result of the scope check for ``target`` using BUTTON_SCOPE_TERMINATORS as boundaries."""
    return self._has_element_in_scope(target, BUTTON_SCOPE_TERMINATORS)
|
|
201
|
+
|
|
202
|
+
def _pop_until_inclusive(self, name):
|
|
203
|
+
# Callers ensure element exists on stack
|
|
204
|
+
while self.open_elements: # pragma: no branch
|
|
205
|
+
node = self.open_elements.pop()
|
|
206
|
+
if node.name == name:
|
|
207
|
+
break
|
|
208
|
+
|
|
209
|
+
def _pop_until_any_inclusive(self, names):
|
|
210
|
+
# Pop elements until we find one in names (callers ensure element exists)
|
|
211
|
+
while self.open_elements:
|
|
212
|
+
node = self.open_elements.pop()
|
|
213
|
+
if node.name in names:
|
|
214
|
+
return
|
|
215
|
+
|
|
216
|
+
def _close_p_element(self):
|
|
217
|
+
if self._has_element_in_button_scope("p"):
|
|
218
|
+
self._generate_implied_end_tags("p")
|
|
219
|
+
if self.open_elements[-1].name != "p":
|
|
220
|
+
self._parse_error("end-tag-too-early", tag_name="p")
|
|
221
|
+
self._pop_until_inclusive("p")
|
|
222
|
+
return True
|
|
223
|
+
return False
|
|
224
|
+
|
|
225
|
+
def process_token(self, token):
    """Route one tokenizer token to the handler for the current state.

    DOCTYPE tokens are handled up front. Every other token enters a loop
    that picks between the HTML insertion-mode handlers, the foreign-content
    handler, and special handling at integration points. A handler returns
    either ``None`` (token consumed) or a tuple
    ``(instruction, mode, token[, force_html])`` asking for the given token
    to be reprocessed in the given mode.

    Returns:
        ``self.tokenizer_state_override`` if one was set while handling the
        token (it is cleared afterwards), otherwise
        ``TokenSinkResult.Continue``.
    """
    # Optimization: Use type() identity check instead of isinstance
    token_type = type(token)
    if token_type is DoctypeToken:
        # Check for foreign content first - DOCTYPE in SVG/MathML is a parse error
        if self.open_elements:
            current = self.open_elements[-1]
            if current.namespace not in {None, "html"}:
                self._parse_error("unexpected-doctype")
                return TokenSinkResult.Continue
        return self._handle_doctype(token)

    current_token = token
    force_html_mode = False

    # Cache mode handlers list for speed
    mode_handlers = self._MODE_HANDLERS

    while True:
        # Update token type for current token (it might have changed if reprocessed)
        token_type = type(current_token)

        # Optimization: Check for HTML namespace first (common case)
        current_node = self.open_elements[-1] if self.open_elements else None
        is_html_namespace = current_node is None or current_node.namespace in {None, "html"}

        if force_html_mode or is_html_namespace:
            # force_html_mode applies to a single pass only.
            force_html_mode = False
            if self.mode == InsertionMode.IN_BODY:
                # Inline _mode_in_body for performance
                if token_type is Tag:
                    # Inline _handle_tag_in_body
                    if current_token.kind == 0:  # Tag.START
                        name = current_token.name
                        if name == "div" or name == "ul" or name == "ol":
                            # Inline _handle_body_start_block_with_p
                            # Check if p is in button scope (html always terminates)
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=True)
                            result = None
                        elif name == "p":
                            result = self._handle_body_start_paragraph(current_token)
                        elif name == "span":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            self._insert_element(current_token, push=True)
                            self.frameset_ok = False
                            result = None
                        elif name == "a":
                            result = self._handle_body_start_a(current_token)
                        elif name == "br" or name == "img":
                            if self.active_formatting:
                                self._reconstruct_active_formatting_elements()
                            # push=False: the element is not added to the open-element stack.
                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        elif name == "hr":
                            # Same inlined "p in button scope" scan as for div/ul/ol above.
                            has_p = False
                            idx = len(self.open_elements) - 1
                            while idx >= 0:  # pragma: no branch
                                node = self.open_elements[idx]
                                if node.name == "p":
                                    has_p = True
                                    break
                                if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
                                    break
                                idx -= 1

                            if has_p:
                                self._close_p_element()

                            self._insert_element(current_token, push=False)
                            self.frameset_ok = False
                            result = None
                        else:
                            handler = self._BODY_START_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                # Inline _handle_body_start_default
                                # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
                                if self.active_formatting:
                                    self._reconstruct_active_formatting_elements()
                                self._insert_element(current_token, push=True)
                                if current_token.self_closing:
                                    self._parse_error(
                                        "non-void-html-element-start-tag-with-trailing-solidus",
                                        tag_name=current_token.name,
                                    )
                                self.frameset_ok = False
                                result = None
                    else:
                        # End tag in body.
                        name = current_token.name
                        if name == "br":
                            # </br> is reported as an error and then handled as a <br> start tag.
                            self._parse_error("unexpected-end-tag", tag_name=name)
                            br_tag = Tag(0, "br", {}, False)
                            result = self._handle_body_start_br(br_tag)
                        elif name in FORMATTING_ELEMENTS:
                            self._adoption_agency(name)
                            result = None
                        else:
                            handler = self._BODY_END_HANDLERS.get(name)
                            if handler:
                                result = handler(self, current_token)
                            else:
                                self._any_other_end_tag(name)
                                result = None
                elif token_type is CharacterTokens:
                    # Inline _handle_characters_in_body
                    # Only non-whitespace data reaches here (whitespace handled in process_characters)
                    self.frameset_ok = False
                    self._reconstruct_active_formatting_elements()
                    self._append_text(current_token.data)
                    result = None
                elif token_type is CommentToken:
                    result = self._handle_comment_in_body(current_token)
                else:  # EOFToken
                    result = self._handle_eof_in_body(current_token)
            else:
                result = mode_handlers[self.mode](self, current_token)
        elif self._should_use_foreign_content(current_token):
            result = self._process_foreign_content(current_token)
        else:
            # Foreign content stack logic
            current = current_node
            # Only pop foreign elements if we're NOT at an HTML/MathML integration point
            # and NOT about to insert a new foreign element (svg/math)
            # NOTE(review): every branch in the block below is `pass`, so nothing is
            # popped here; apart from evaluating the conditions it appears to be a no-op.
            if not isinstance(current_token, EOFToken):
                # Don't pop at integration points - they stay on stack to receive content
                if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
                    pass
                # Don't pop when inserting new svg/math elements
                if isinstance(current_token, Tag) and current_token.kind == Tag.START:
                    # Optimization: Tokenizer already lowercases tag names
                    name_lower = current_token.name
                    if name_lower in {"svg", "math"}:
                        pass

            # Special handling: text at integration points inserts directly, bypassing mode dispatch
            if isinstance(current_token, CharacterTokens):
                if self._is_mathml_text_integration_point(current):
                    # Tokenizer guarantees non-empty data
                    data = current_token.data
                    if "\x00" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x00", "")
                    if "\x0c" in data:
                        self._parse_error("invalid-codepoint")
                        data = data.replace("\x0c", "")
                    if data:
                        if not is_all_whitespace(data):
                            self._reconstruct_active_formatting_elements()
                            self.frameset_ok = False
                        self._append_text(data)
                    result = None
                else:
                    result = mode_handlers[self.mode](self, current_token)
            else:
                # At integration points inside foreign content, check if table tags make sense.
                if (
                    (self._is_mathml_text_integration_point(current) or self._is_html_integration_point(current))
                    and isinstance(current_token, Tag)
                    and current_token.kind == Tag.START
                    and self.mode not in {InsertionMode.IN_BODY}
                ):
                    # Check if we're in a table mode but without an actual table in scope
                    # If so, table tags should be ignored (use IN_BODY mode)
                    is_table_mode = self.mode in {
                        InsertionMode.IN_TABLE,
                        InsertionMode.IN_TABLE_BODY,
                        InsertionMode.IN_ROW,
                        InsertionMode.IN_CELL,
                        InsertionMode.IN_CAPTION,
                        InsertionMode.IN_COLUMN_GROUP,
                    }
                    has_table_in_scope = self._has_in_table_scope("table")
                    if is_table_mode and not has_table_in_scope:
                        # Temporarily use IN_BODY mode for this tag
                        saved_mode = self.mode
                        self.mode = InsertionMode.IN_BODY
                        result = mode_handlers[self.mode](self, current_token)
                        # Restore mode if no mode change was requested
                        if self.mode == InsertionMode.IN_BODY:  # pragma: no branch
                            self.mode = saved_mode
                    else:
                        result = mode_handlers[self.mode](self, current_token)
                else:
                    result = mode_handlers[self.mode](self, current_token)

        if result is None:
            # Token consumed: report any tokenizer state change requested by a handler.
            result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
            self.tokenizer_state_override = None
            return result_to_return
        # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
        _instruction, mode, token_override = result[0], result[1], result[2]
        if len(result) == 4:
            force_html_mode = result[3]
        # All mode handlers that return a tuple use "reprocess" instruction
        self.mode = mode
        current_token = token_override
        # Continue loop to reprocess
|
|
439
|
+
|
|
440
|
+
def finish(self):
    """Finalise the tree and return the document node.

    For fragment parses the synthetic ``html`` wrapper (and the stand-in
    context element, if it is still a child of that wrapper) is removed and
    its children are moved directly under the document. Afterwards
    ``_populate_selectedcontent`` is run on the document.
    """
    if self.fragment_context is not None:
        document = self.document
        # The html element is always created in fragment setup, so it is children[0].
        wrapper = document.children[0]
        context_node = self.fragment_context_element

        if context_node is not None and context_node.parent is wrapper:
            # Hoist the context element's children into the wrapper first.
            for moved in list(context_node.children):
                context_node.remove_child(moved)
                wrapper.append_child(moved)
            wrapper.remove_child(context_node)

        # Then promote everything under the wrapper to the document itself.
        for moved in list(wrapper.children):
            wrapper.remove_child(moved)
            document.append_child(moved)
        document.remove_child(wrapper)

    # Populate selectedcontent elements per HTML5 spec
    self._populate_selectedcontent(self.document)

    return self.document
|
|
460
|
+
|
|
461
|
+
# Insertion mode dispatch ------------------------------------------------
|
|
462
|
+
|
|
463
|
+
def _append_comment_to_document(self, text):
    """Append a ``#comment`` node holding ``text`` as the last child of the document."""
    comment = SimpleDomNode("#comment", data=text)
    document = self.document
    document.append_child(comment)
|
|
466
|
+
|
|
467
|
+
def _append_comment(self, text, parent=None):
    """Append a ``#comment`` node holding ``text`` to ``parent``.

    When ``parent`` is omitted the current node (or the html element) is
    used. A template parent with a content fragment receives the comment in
    that fragment instead.
    """
    destination = self._current_node_or_html() if parent is None else parent
    # If the destination is a template, insert into its content fragment
    if type(destination) is TemplateNode and destination.template_content:
        destination = destination.template_content
    comment = SimpleDomNode("#comment", data=text)
    destination.append_child(comment)
|
|
475
|
+
|
|
476
|
+
def _append_text(self, text):
    """Insert character data at the current insertion point.

    Adjacent text is merged into an existing text node where possible. If
    ``self.ignore_lf`` is set, the flag is cleared and one leading newline
    is dropped from ``text``.
    """
    if self.ignore_lf:
        self.ignore_lf = False
        if text.startswith("\n"):
            text = text[1:]
            if not text:
                return

    # Guard against empty stack
    if not self.open_elements:  # pragma: no cover
        return

    # Fast path optimization for common case
    target = self.open_elements[-1]

    # Fast path applies when the current node is neither a table foster
    # target nor a template; text is then appended straight to its children.
    if target.name not in TABLE_FOSTER_TARGETS and type(target) is not TemplateNode:
        children = target.children
        if children:
            last_child = children[-1]
            if type(last_child) is TextNode:
                # Merge into the trailing text node rather than creating a new one.
                last_child.data += text
                return

        node = TextNode(text)
        # Appends directly to the children list and sets parent by hand
        # instead of going through append_child.
        children.append(node)
        node.parent = target
        return

    # Slow path: table-related targets and templates.
    target = self._current_node_or_html()
    foster_parenting = self._should_foster_parenting(target, is_text=True)

    # Reconstruct active formatting BEFORE getting insertion location when foster parenting
    if foster_parenting:
        self._reconstruct_active_formatting_elements()

    # Always use appropriate insertion location to handle templates
    parent, position = self._appropriate_insertion_location(foster_parenting=foster_parenting)

    # Coalesce with adjacent text node if possible
    if position > 0 and parent.children[position - 1].name == "#text":
        parent.children[position - 1].data = (parent.children[position - 1].data or "") + text
        return

    node = TextNode(text)
    # reference_node is None when inserting at the end of parent's children.
    reference_node = parent.children[position] if position < len(parent.children) else None
    parent.insert_before(node, reference_node)
|
|
522
|
+
|
|
523
|
+
def _current_node_or_html(self):
|
|
524
|
+
if self.open_elements:
|
|
525
|
+
return self.open_elements[-1]
|
|
526
|
+
# Stack empty - find html element in document children
|
|
527
|
+
# (may not be first if there are comments/doctype before it)
|
|
528
|
+
for child in self.document.children:
|
|
529
|
+
if child.name == "html":
|
|
530
|
+
return child
|
|
531
|
+
# Edge case: no html found, return first child or None
|
|
532
|
+
return self.document.children[0] if self.document.children else None # pragma: no cover
|
|
533
|
+
|
|
534
|
+
def _create_root(self, attrs):
    """Create the ``html`` root element with ``attrs``, attach it to the document, and open it.

    Returns:
        The newly created root node.
    """
    node = SimpleDomNode("html", attrs=attrs, namespace="html")
    self.document.append_child(node)
    # The root becomes the newest entry on the stack of open elements.
    self.open_elements.append(node)
    return node
|
|
539
|
+
|
|
540
|
+
def _insert_element(self, tag, *, push, namespace="html"):
    """Create an element for ``tag`` and insert it at the current insertion point.

    Args:
        tag: Token supplying the element's ``name`` and ``attrs``.
        push: When true, the new element is also pushed onto the stack of
            open elements.
        namespace: Namespace for the new element. An ``html``-namespace
            ``template`` tag produces a ``TemplateNode``; everything else an
            ``ElementNode``.

    Returns:
        The newly created node.
    """
    wants_template = tag.name == "template" and namespace == "html"
    node_class = TemplateNode if wants_template else ElementNode
    new_node = node_class(tag.name, attrs=tag.attrs, namespace=namespace)

    if self.insert_from_table:
        # Table context: the insertion location may be foster-parented.
        current = self._current_node_or_html()
        use_foster = self._should_foster_parenting(current, for_tag=tag.name)
        parent, position = self._appropriate_insertion_location(foster_parenting=use_foster)
        self._insert_node_at(parent, position, new_node)
    else:
        # Fast path for the common case: append to the current node, or to
        # the content fragment when the current node is a template.
        current = self._current_node_or_html()
        container = current.template_content if type(current) is TemplateNode else current
        container.append_child(new_node)

    if push:
        self.open_elements.append(new_node)
    return new_node
|
|
569
|
+
|
|
570
|
+
def _insert_phantom(self, name):
    """Insert and open an attribute-less element ``name`` via a synthesized start tag.

    Returns:
        The node returned by ``_insert_element``.
    """
    # Build a start tag with no attributes that is not self-closing.
    tag = Tag(Tag.START, name, {}, False)
    return self._insert_element(tag, push=True)
|
|
573
|
+
|
|
574
|
+
def _insert_body_if_missing(self):
    """Create a ``body`` element under the open ``html`` element and push it on the stack."""
    # NOTE(review): _find_last_on_stack returns None when no "html" element is
    # open, which would make append_child below fail; presumably callers
    # guarantee html is on the stack - confirm.
    html_node = self._find_last_on_stack("html")
    node = SimpleDomNode("body", namespace="html")
    html_node.append_child(node)
    # NOTE(review): parent is assigned explicitly even though append_child was
    # just called; possibly redundant - confirm against append_child.
    node.parent = html_node
    self.open_elements.append(node)
|
|
580
|
+
|
|
581
|
+
def _create_element(self, name, namespace, attrs):
    """Build a detached ``ElementNode``; a falsy ``namespace`` is treated as ``"html"``."""
    return ElementNode(name, attrs, namespace or "html")
|
|
584
|
+
|
|
585
|
+
def _pop_current(self):
    """Pop and return the current node (the last entry of ``open_elements``).

    Raises:
        IndexError: If the stack of open elements is empty.
    """
    return self.open_elements.pop()
|
|
587
|
+
|
|
588
|
+
def _in_scope(self, name):
    """Return the result of the scope check for ``name`` using DEFAULT_SCOPE_TERMINATORS as boundaries."""
    return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
|
|
590
|
+
|
|
591
|
+
def _close_element_by_name(self, name):
|
|
592
|
+
# Simple element closing - pops from the named element onwards
|
|
593
|
+
# Used for explicit closing (e.g., when button start tag closes existing button)
|
|
594
|
+
# Caller guarantees name is on the stack via _has_in_scope check
|
|
595
|
+
index = len(self.open_elements) - 1
|
|
596
|
+
while index >= 0: # pragma: no branch
|
|
597
|
+
if self.open_elements[index].name == name:
|
|
598
|
+
del self.open_elements[index:]
|
|
599
|
+
return
|
|
600
|
+
index -= 1
|
|
601
|
+
|
|
602
|
+
def _any_other_end_tag(self, name):
|
|
603
|
+
# Spec: "Any other end tag" in IN_BODY mode
|
|
604
|
+
# Loop through stack backwards (always terminates: html is special)
|
|
605
|
+
index = len(self.open_elements) - 1
|
|
606
|
+
while index >= 0: # pragma: no branch
|
|
607
|
+
node = self.open_elements[index]
|
|
608
|
+
|
|
609
|
+
# If node's name matches the end tag name
|
|
610
|
+
if node.name == name:
|
|
611
|
+
# Generate implied end tags (except for this name)
|
|
612
|
+
# If current node is not this node, parse error
|
|
613
|
+
if index != len(self.open_elements) - 1:
|
|
614
|
+
self._parse_error("end-tag-too-early")
|
|
615
|
+
# Pop all elements from this node onwards
|
|
616
|
+
del self.open_elements[index:]
|
|
617
|
+
return
|
|
618
|
+
|
|
619
|
+
# If node is a special element, parse error and ignore the tag
|
|
620
|
+
if self._is_special_element(node):
|
|
621
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
622
|
+
return # Ignore the end tag
|
|
623
|
+
|
|
624
|
+
# Continue to next node (previous in stack)
|
|
625
|
+
index -= 1
|
|
626
|
+
|
|
627
|
+
def _add_missing_attributes(self, node, attrs):
|
|
628
|
+
if not attrs:
|
|
629
|
+
return
|
|
630
|
+
existing = node.attrs
|
|
631
|
+
for name, value in attrs.items():
|
|
632
|
+
if name not in existing:
|
|
633
|
+
existing[name] = value
|
|
634
|
+
|
|
635
|
+
def _remove_from_open_elements(self, node):
|
|
636
|
+
for index, current in enumerate(self.open_elements):
|
|
637
|
+
if current is node:
|
|
638
|
+
del self.open_elements[index]
|
|
639
|
+
return True
|
|
640
|
+
return False
|
|
641
|
+
|
|
642
|
+
def _is_special_element(self, node):
|
|
643
|
+
if node.namespace not in {None, "html"}:
|
|
644
|
+
return False
|
|
645
|
+
return node.name in SPECIAL_ELEMENTS
|
|
646
|
+
|
|
647
|
+
def _find_active_formatting_index(self, name):
|
|
648
|
+
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
649
|
+
entry = self.active_formatting[index]
|
|
650
|
+
if entry is FORMAT_MARKER:
|
|
651
|
+
break
|
|
652
|
+
if entry["name"] == name:
|
|
653
|
+
return index
|
|
654
|
+
return None
|
|
655
|
+
|
|
656
|
+
def _find_active_formatting_index_by_node(self, node):
|
|
657
|
+
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
658
|
+
entry = self.active_formatting[index]
|
|
659
|
+
if entry is not FORMAT_MARKER and entry["node"] is node:
|
|
660
|
+
return index
|
|
661
|
+
return None
|
|
662
|
+
|
|
663
|
+
def _clone_attributes(self, attrs):
|
|
664
|
+
return attrs.copy() if attrs else {}
|
|
665
|
+
|
|
666
|
+
def _attrs_signature(self, attrs):
|
|
667
|
+
if not attrs:
|
|
668
|
+
return ()
|
|
669
|
+
items = []
|
|
670
|
+
for name, value in attrs.items():
|
|
671
|
+
items.append((name, value or ""))
|
|
672
|
+
items.sort()
|
|
673
|
+
return tuple(items)
|
|
674
|
+
|
|
675
|
+
def _find_active_formatting_duplicate(self, name, attrs):
|
|
676
|
+
signature = self._attrs_signature(attrs)
|
|
677
|
+
matches = []
|
|
678
|
+
for index, entry in enumerate(self.active_formatting):
|
|
679
|
+
if entry is FORMAT_MARKER:
|
|
680
|
+
matches.clear()
|
|
681
|
+
continue
|
|
682
|
+
existing_signature = entry["signature"]
|
|
683
|
+
if entry["name"] == name and existing_signature == signature:
|
|
684
|
+
matches.append(index)
|
|
685
|
+
if len(matches) >= 3:
|
|
686
|
+
return matches[0]
|
|
687
|
+
return None
|
|
688
|
+
|
|
689
|
+
def _has_active_formatting_entry(self, name):
|
|
690
|
+
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
691
|
+
entry = self.active_formatting[index]
|
|
692
|
+
if entry is FORMAT_MARKER:
|
|
693
|
+
break
|
|
694
|
+
if entry["name"] == name:
|
|
695
|
+
return True
|
|
696
|
+
return False
|
|
697
|
+
|
|
698
|
+
def _remove_last_active_formatting_by_name(self, name):
|
|
699
|
+
for index in range(len(self.active_formatting) - 1, -1, -1):
|
|
700
|
+
entry = self.active_formatting[index]
|
|
701
|
+
if entry is FORMAT_MARKER:
|
|
702
|
+
break
|
|
703
|
+
if entry["name"] == name:
|
|
704
|
+
del self.active_formatting[index]
|
|
705
|
+
return
|
|
706
|
+
|
|
707
|
+
def _remove_last_open_element_by_name(self, name):
|
|
708
|
+
for index in range(len(self.open_elements) - 1, -1, -1):
|
|
709
|
+
if self.open_elements[index].name == name:
|
|
710
|
+
del self.open_elements[index]
|
|
711
|
+
return
|
|
712
|
+
|
|
713
|
+
def _append_active_formatting_entry(self, name, attrs, node):
|
|
714
|
+
entry_attrs = self._clone_attributes(attrs)
|
|
715
|
+
signature = self._attrs_signature(entry_attrs)
|
|
716
|
+
self.active_formatting.append(
|
|
717
|
+
{
|
|
718
|
+
"name": name,
|
|
719
|
+
"attrs": entry_attrs,
|
|
720
|
+
"node": node,
|
|
721
|
+
"signature": signature,
|
|
722
|
+
},
|
|
723
|
+
)
|
|
724
|
+
|
|
725
|
+
def _clear_active_formatting_up_to_marker(self):
|
|
726
|
+
while self.active_formatting:
|
|
727
|
+
entry = self.active_formatting.pop()
|
|
728
|
+
if entry is FORMAT_MARKER:
|
|
729
|
+
break
|
|
730
|
+
|
|
731
|
+
def _push_formatting_marker(self):
    """Append a FORMAT_MARKER sentinel to the active formatting list."""
    formatting = self.active_formatting
    formatting.append(FORMAT_MARKER)
|
|
733
|
+
|
|
734
|
+
def _remove_formatting_entry(self, index):
|
|
735
|
+
assert 0 <= index < len(self.active_formatting), f"Invalid index: {index}"
|
|
736
|
+
del self.active_formatting[index]
|
|
737
|
+
|
|
738
|
+
def _reconstruct_active_formatting_elements(self):
    """Re-create elements for trailing formatting entries that are not open.

    Starting from the end of the active formatting list, the run of entries
    that are neither a FORMAT_MARKER nor backed by a node on the open-element
    stack is located. For each entry of that run, oldest first, a fresh
    element is inserted (and pushed) and stored as the entry's node.
    """
    formatting = self.active_formatting
    if not formatting:
        return

    def is_settled(entry):
        # A marker, or an entry whose node is still on the stack, ends the run.
        return entry is FORMAT_MARKER or entry["node"] in self.open_elements

    if is_settled(formatting[-1]):
        return

    # Rewind to the first entry of the trailing run of unsettled entries.
    start = len(formatting) - 1
    while start > 0 and not is_settled(formatting[start - 1]):
        start -= 1

    # Advance: build a new element for every entry from there to the end.
    position = start
    while position < len(self.active_formatting):
        entry = self.active_formatting[position]
        attrs_copy = self._clone_attributes(entry["attrs"])
        start_tag = Tag(Tag.START, entry["name"], attrs_copy, False)
        entry["node"] = self._insert_element(start_tag, push=True)
        position += 1
|
|
762
|
+
|
|
763
|
+
def _insert_node_at(self, parent, index, node):
|
|
764
|
+
reference_node = None
|
|
765
|
+
if index is not None and index < len(parent.children):
|
|
766
|
+
reference_node = parent.children[index]
|
|
767
|
+
parent.insert_before(node, reference_node)
|
|
768
|
+
|
|
769
|
+
def _find_last_on_stack(self, name):
|
|
770
|
+
for node in reversed(self.open_elements):
|
|
771
|
+
if node.name == name:
|
|
772
|
+
return node
|
|
773
|
+
return None
|
|
774
|
+
|
|
775
|
+
def _clear_stack_until(self, names):
|
|
776
|
+
# All callers include "html" in names, so this always terminates via break
|
|
777
|
+
while self.open_elements:
|
|
778
|
+
node = self.open_elements[-1]
|
|
779
|
+
if node.name in names and node.namespace in {None, "html"}:
|
|
780
|
+
break
|
|
781
|
+
self.open_elements.pop()
|
|
782
|
+
|
|
783
|
+
def _generate_implied_end_tags(self, exclude=None):
    """Pop open elements while the top one is listed in IMPLIED_END_TAGS.

    Popping stops at the first element that is not listed, or whose name
    equals *exclude*.
    """
    # Always terminates: html is not in IMPLIED_END_TAGS
    stack = self.open_elements
    while stack:  # pragma: no branch
        top_name = stack[-1].name
        if top_name not in IMPLIED_END_TAGS or top_name == exclude:
            break
        stack.pop()
|
|
791
|
+
|
|
792
|
+
def _has_in_table_scope(self, name):
    """Report whether *name* is in scope, bounded by TABLE_SCOPE_TERMINATORS."""
    in_scope = self._has_element_in_scope(
        name,
        TABLE_SCOPE_TERMINATORS,
        check_integration_points=False,
    )
    return in_scope
|
|
794
|
+
|
|
795
|
+
def _close_table_cell(self):
|
|
796
|
+
if self._has_in_table_scope("td"):
|
|
797
|
+
self._end_table_cell("td")
|
|
798
|
+
return True
|
|
799
|
+
if self._has_in_table_scope("th"):
|
|
800
|
+
self._end_table_cell("th")
|
|
801
|
+
return True
|
|
802
|
+
return False
|
|
803
|
+
|
|
804
|
+
def _end_table_cell(self, name):
    """End the table cell *name* and switch the insertion mode to IN_ROW.

    Implied end tags are generated first (never for *name* itself), then
    elements are popped up to and including the HTML-namespace element
    named *name*, and the active formatting list is cleared back to its
    last marker.
    """
    self._generate_implied_end_tags(name)
    stack = self.open_elements
    while stack:
        closed = stack.pop()
        if closed.name == name and closed.namespace in {None, "html"}:
            break
    self._clear_active_formatting_up_to_marker()
    self.mode = InsertionMode.IN_ROW
|
|
812
|
+
|
|
813
|
+
def _flush_pending_table_text(self):
    """Emit the buffered table text and empty the buffer.

    Whitespace-only text is appended directly. Any other text reports a
    "foster-parenting-character" parse error and is appended with
    ``insert_from_table`` temporarily forced to True, after reconstructing
    the active formatting elements.
    """
    buffered = "".join(self.pending_table_text)
    self.pending_table_text.clear()
    if not buffered:
        return

    if not is_all_whitespace(buffered):
        self._parse_error("foster-parenting-character")
        saved_flag = self.insert_from_table
        self.insert_from_table = True
        try:
            self._reconstruct_active_formatting_elements()
            self._append_text(buffered)
        finally:
            # Restore the flag even if insertion raised.
            self.insert_from_table = saved_flag
        return

    self._append_text(buffered)
|
|
829
|
+
|
|
830
|
+
def _close_table_element(self):
|
|
831
|
+
if not self._has_in_table_scope("table"):
|
|
832
|
+
self._parse_error("unexpected-end-tag", tag_name="table")
|
|
833
|
+
return False
|
|
834
|
+
self._generate_implied_end_tags()
|
|
835
|
+
# Table verified in scope above
|
|
836
|
+
while self.open_elements: # pragma: no branch
|
|
837
|
+
node = self.open_elements.pop()
|
|
838
|
+
if node.name == "table":
|
|
839
|
+
break
|
|
840
|
+
self._reset_insertion_mode()
|
|
841
|
+
return True
|
|
842
|
+
|
|
843
|
+
def _reset_insertion_mode(self):
    """Choose the insertion mode implied by the open-element stack.

    The stack is walked from the top down; the first element with a
    recognised name decides the mode. An empty stack, or one without any
    recognised element, yields IN_BODY.
    """
    # Walk stack backwards - html element always terminates
    for node in reversed(self.open_elements):
        name = node.name
        if name == "template":
            # Use the last recorded template mode; with none recorded the
            # current mode is left as it is.
            if self.template_modes:
                self.mode = self.template_modes[-1]
            return
        if name == "select":
            chosen = InsertionMode.IN_SELECT
        elif name in {"td", "th"}:
            chosen = InsertionMode.IN_CELL
        elif name == "tr":
            chosen = InsertionMode.IN_ROW
        elif name in {"tbody", "tfoot", "thead"}:
            chosen = InsertionMode.IN_TABLE_BODY
        elif name == "caption":
            chosen = InsertionMode.IN_CAPTION
        elif name == "table":
            chosen = InsertionMode.IN_TABLE
        elif name == "head":
            # If we're resetting and head is on stack, stay in IN_HEAD
            chosen = InsertionMode.IN_HEAD
        elif name == "html":
            chosen = InsertionMode.IN_BODY
        else:
            continue
        self.mode = chosen
        return
    # Empty stack fallback
    self.mode = InsertionMode.IN_BODY
|
|
882
|
+
|
|
883
|
+
def _should_foster_parenting(self, target, *, for_tag=None, is_text=False):
|
|
884
|
+
if not self.insert_from_table:
|
|
885
|
+
return False
|
|
886
|
+
if target.name not in TABLE_FOSTER_TARGETS:
|
|
887
|
+
return False
|
|
888
|
+
if is_text:
|
|
889
|
+
return True
|
|
890
|
+
if for_tag in TABLE_ALLOWED_CHILDREN:
|
|
891
|
+
return False
|
|
892
|
+
return True
|
|
893
|
+
|
|
894
|
+
def _lower_ascii(self, value):
|
|
895
|
+
return value.lower() if value else ""
|
|
896
|
+
|
|
897
|
+
def _adjust_svg_tag_name(self, name):
    """Map *name* through SVG_TAG_NAME_ADJUSTMENTS, keyed by its lowercase form.

    Names without an adjustment are returned unchanged.
    """
    key = self._lower_ascii(name)
    if key in SVG_TAG_NAME_ADJUSTMENTS:
        return SVG_TAG_NAME_ADJUSTMENTS[key]
    return name
|
|
900
|
+
|
|
901
|
+
def _prepare_foreign_attributes(self, namespace, attrs):
|
|
902
|
+
if not attrs:
|
|
903
|
+
return {}
|
|
904
|
+
adjusted = {}
|
|
905
|
+
for name, value in attrs.items():
|
|
906
|
+
lower_name = self._lower_ascii(name)
|
|
907
|
+
if namespace == "math" and lower_name in MATHML_ATTRIBUTE_ADJUSTMENTS:
|
|
908
|
+
name = MATHML_ATTRIBUTE_ADJUSTMENTS[lower_name]
|
|
909
|
+
lower_name = self._lower_ascii(name)
|
|
910
|
+
elif namespace == "svg" and lower_name in SVG_ATTRIBUTE_ADJUSTMENTS:
|
|
911
|
+
name = SVG_ATTRIBUTE_ADJUSTMENTS[lower_name]
|
|
912
|
+
lower_name = self._lower_ascii(name)
|
|
913
|
+
|
|
914
|
+
foreign_adjustment = FOREIGN_ATTRIBUTE_ADJUSTMENTS.get(lower_name)
|
|
915
|
+
if foreign_adjustment is not None:
|
|
916
|
+
prefix, local, _ = foreign_adjustment
|
|
917
|
+
name = f"{prefix}:{local}"
|
|
918
|
+
|
|
919
|
+
# Tokenizer deduplicates attributes, so name collision impossible here
|
|
920
|
+
adjusted[name] = value
|
|
921
|
+
return adjusted
|
|
922
|
+
|
|
923
|
+
def _node_attribute_value(self, node, name):
|
|
924
|
+
target = self._lower_ascii(name)
|
|
925
|
+
for attr_name, attr_value in node.attrs.items():
|
|
926
|
+
if self._lower_ascii(attr_name) == target:
|
|
927
|
+
return attr_value or ""
|
|
928
|
+
return None
|
|
929
|
+
|
|
930
|
+
def _is_html_integration_point(self, node):
|
|
931
|
+
# annotation-xml is an HTML integration point only with specific encoding values
|
|
932
|
+
if node.namespace == "math" and node.name == "annotation-xml":
|
|
933
|
+
encoding = self._node_attribute_value(node, "encoding")
|
|
934
|
+
if encoding:
|
|
935
|
+
enc_lower = encoding.lower()
|
|
936
|
+
if enc_lower in {"text/html", "application/xhtml+xml"}:
|
|
937
|
+
return True
|
|
938
|
+
return False # annotation-xml without proper encoding is NOT an integration point
|
|
939
|
+
# SVG foreignObject, desc, and title are always HTML integration points
|
|
940
|
+
return (node.namespace, node.name) in HTML_INTEGRATION_POINT_SET
|
|
941
|
+
|
|
942
|
+
def _is_mathml_text_integration_point(self, node):
|
|
943
|
+
if node.namespace != "math":
|
|
944
|
+
return False
|
|
945
|
+
return (node.namespace, node.name) in MATHML_TEXT_INTEGRATION_POINT_SET
|
|
946
|
+
|
|
947
|
+
def _adjusted_current_node(self):
|
|
948
|
+
return self.open_elements[-1]
|
|
949
|
+
|
|
950
|
+
def _should_use_foreign_content(self, token):
|
|
951
|
+
current = self._adjusted_current_node()
|
|
952
|
+
# HTML namespace elements don't use foreign content rules
|
|
953
|
+
# (unreachable in practice as foreign content mode only entered for foreign elements)
|
|
954
|
+
if current.namespace in {None, "html"}:
|
|
955
|
+
return False # pragma: no cover
|
|
956
|
+
|
|
957
|
+
if isinstance(token, EOFToken):
|
|
958
|
+
return False
|
|
959
|
+
|
|
960
|
+
if self._is_mathml_text_integration_point(current):
|
|
961
|
+
if isinstance(token, CharacterTokens):
|
|
962
|
+
return False
|
|
963
|
+
if isinstance(token, Tag) and token.kind == Tag.START:
|
|
964
|
+
name_lower = self._lower_ascii(token.name)
|
|
965
|
+
if name_lower not in {"mglyph", "malignmark"}:
|
|
966
|
+
return False
|
|
967
|
+
|
|
968
|
+
if current.namespace == "math" and current.name == "annotation-xml":
|
|
969
|
+
if isinstance(token, Tag) and token.kind == Tag.START:
|
|
970
|
+
if self._lower_ascii(token.name) == "svg":
|
|
971
|
+
return False
|
|
972
|
+
|
|
973
|
+
if self._is_html_integration_point(current):
|
|
974
|
+
if isinstance(token, CharacterTokens):
|
|
975
|
+
return False
|
|
976
|
+
if isinstance(token, Tag) and token.kind == Tag.START:
|
|
977
|
+
return False
|
|
978
|
+
|
|
979
|
+
return True
|
|
980
|
+
|
|
981
|
+
def _foreign_breakout_font(self, tag):
|
|
982
|
+
for name in tag.attrs.keys():
|
|
983
|
+
if self._lower_ascii(name) in {"color", "face", "size"}:
|
|
984
|
+
return True
|
|
985
|
+
return False
|
|
986
|
+
|
|
987
|
+
def _pop_until_html_or_integration_point(self):
|
|
988
|
+
# Always terminates: html element has html namespace
|
|
989
|
+
while self.open_elements: # pragma: no branch
|
|
990
|
+
node = self.open_elements[-1]
|
|
991
|
+
if node.namespace in {None, "html"}:
|
|
992
|
+
return
|
|
993
|
+
if self._is_html_integration_point(node):
|
|
994
|
+
return
|
|
995
|
+
if self.fragment_context_element is not None and node is self.fragment_context_element:
|
|
996
|
+
return
|
|
997
|
+
self.open_elements.pop()
|
|
998
|
+
|
|
999
|
+
def _process_foreign_content(self, token):
    """Handle *token* while the current node is a foreign (SVG/MathML) element.

    Returns None when the token was consumed here, or a
    ``("reprocess", mode, token, True)`` tuple when the token has to be
    handled again by the HTML insertion mode given in the tuple.
    """
    current = self._adjusted_current_node()

    if isinstance(token, CharacterTokens):
        pieces = []
        saw_visible = False
        for ch in token.data or "":
            if ch == "\x00":
                # Each NUL is reported and replaced by U+FFFD.
                self._parse_error("invalid-codepoint-in-foreign-content")
                pieces.append("\ufffd")
            else:
                pieces.append(ch)
                if ch not in "\t\n\f\r ":
                    saw_visible = True
        if saw_visible:
            self.frameset_ok = False
        self._append_text("".join(pieces))
        return None

    if isinstance(token, CommentToken):
        self._append_comment(token.data)
        return None

    # Foreign content only receives CharacterTokens, CommentToken, or Tag (not EOF)
    assert isinstance(token, Tag), f"Unexpected token type in foreign content: {type(token)}"
    name_lower = self._lower_ascii(token.name)

    if token.kind == Tag.START:
        breaks_out = name_lower in FOREIGN_BREAKOUT_ELEMENTS or (
            name_lower == "font" and self._foreign_breakout_font(token)
        )
        if breaks_out:
            self._parse_error("unexpected-html-element-in-foreign-content")
            self._pop_until_html_or_integration_point()
            self._reset_insertion_mode()
            return ("reprocess", self.mode, token, True)

        namespace = current.namespace
        element_name = self._adjust_svg_tag_name(token.name) if namespace == "svg" else token.name
        foreign_attrs = self._prepare_foreign_attributes(namespace, token.attrs)
        foreign_tag = Tag(Tag.START, element_name, foreign_attrs, token.self_closing)
        # For foreign elements, honor the self-closing flag
        self._insert_element(foreign_tag, push=not token.self_closing, namespace=namespace)
        return None

    # Only START and END tag kinds exist, and START returns above
    assert token.kind == Tag.END, f"Unexpected tag kind: {token.kind}"

    # Special case: </br> and </p> end tags trigger breakout from foreign content
    if name_lower in {"br", "p"}:
        self._parse_error("unexpected-html-element-in-foreign-content")
        self._pop_until_html_or_integration_point()
        self._reset_insertion_mode()
        return ("reprocess", self.mode, token, True)

    # Process foreign end tag per spec: walk stack backwards looking for match
    top_index = len(self.open_elements) - 1
    for depth in range(top_index, -1, -1):
        node = self.open_elements[depth]
        in_html = node.namespace in {None, "html"}

        # Check if this node matches the end tag (case-insensitive)
        if self._lower_ascii(node.name) == name_lower:
            if self.fragment_context_element is not None and node is self.fragment_context_element:
                self._parse_error("unexpected-end-tag-in-fragment-context")
                return None
            # If matched element is HTML namespace, break out to HTML mode
            if in_html:
                return ("reprocess", self.mode, token, True)
            # Otherwise it's a foreign element - pop everything from this point up
            del self.open_elements[depth:]
            return None

        # Per HTML5 spec: if first node doesn't match, it's a parse error
        if depth == top_index:
            self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)

        # If we hit an HTML element that doesn't match, process in secondary mode
        if in_html:
            return ("reprocess", self.mode, token, True)

    # Stack exhausted without finding match - ignore tag (defensive, html always terminates)
    return None  # pragma: no cover
|
|
1089
|
+
|
|
1090
|
+
def _appropriate_insertion_location(self, override_target=None, *, foster_parenting=False):
|
|
1091
|
+
if override_target is not None:
|
|
1092
|
+
target = override_target
|
|
1093
|
+
else:
|
|
1094
|
+
target = self._current_node_or_html()
|
|
1095
|
+
|
|
1096
|
+
if foster_parenting and target.name in {"table", "tbody", "tfoot", "thead", "tr"}:
|
|
1097
|
+
last_template = self._find_last_on_stack("template")
|
|
1098
|
+
last_table = self._find_last_on_stack("table")
|
|
1099
|
+
if last_template is not None and (
|
|
1100
|
+
last_table is None or self.open_elements.index(last_template) > self.open_elements.index(last_table)
|
|
1101
|
+
):
|
|
1102
|
+
return last_template.template_content, len(last_template.template_content.children)
|
|
1103
|
+
# No table on stack - fall back to inserting in target
|
|
1104
|
+
if last_table is None:
|
|
1105
|
+
return target, len(target.children)
|
|
1106
|
+
parent = last_table.parent
|
|
1107
|
+
# Table has no parent (e.g., detached) - fall back to target
|
|
1108
|
+
if parent is None: # pragma: no cover
|
|
1109
|
+
return target, len(target.children)
|
|
1110
|
+
position = parent.children.index(last_table)
|
|
1111
|
+
return parent, position
|
|
1112
|
+
|
|
1113
|
+
# If target is a template element, insert into its content document fragment
|
|
1114
|
+
if type(target) is TemplateNode and target.template_content:
|
|
1115
|
+
return target.template_content, len(target.template_content.children)
|
|
1116
|
+
|
|
1117
|
+
return target, len(target.children)
|
|
1118
|
+
|
|
1119
|
+
def _populate_selectedcontent(self, root):
|
|
1120
|
+
"""Populate selectedcontent elements with content from selected option.
|
|
1121
|
+
|
|
1122
|
+
Per HTML5 spec: selectedcontent mirrors the content of the selected option,
|
|
1123
|
+
or the first option if none is selected.
|
|
1124
|
+
"""
|
|
1125
|
+
# Find all select elements
|
|
1126
|
+
selects = []
|
|
1127
|
+
self._find_elements(root, "select", selects)
|
|
1128
|
+
|
|
1129
|
+
for select in selects:
|
|
1130
|
+
# Find selectedcontent element in this select
|
|
1131
|
+
selectedcontent = self._find_element(select, "selectedcontent")
|
|
1132
|
+
if not selectedcontent:
|
|
1133
|
+
continue
|
|
1134
|
+
|
|
1135
|
+
# Find all option elements
|
|
1136
|
+
options = []
|
|
1137
|
+
self._find_elements(select, "option", options)
|
|
1138
|
+
|
|
1139
|
+
# Find selected option or use first one
|
|
1140
|
+
selected_option = None
|
|
1141
|
+
for opt in options:
|
|
1142
|
+
if opt.attrs:
|
|
1143
|
+
for attr_name in opt.attrs.keys():
|
|
1144
|
+
if attr_name == "selected":
|
|
1145
|
+
selected_option = opt
|
|
1146
|
+
break
|
|
1147
|
+
if selected_option:
|
|
1148
|
+
break
|
|
1149
|
+
|
|
1150
|
+
if not selected_option:
|
|
1151
|
+
selected_option = options[0]
|
|
1152
|
+
|
|
1153
|
+
# Clone content from selected option to selectedcontent
|
|
1154
|
+
self._clone_children(selected_option, selectedcontent)
|
|
1155
|
+
|
|
1156
|
+
def _find_elements(self, node, name, result):
|
|
1157
|
+
"""Recursively find all elements with given name."""
|
|
1158
|
+
if node.name == name:
|
|
1159
|
+
result.append(node)
|
|
1160
|
+
|
|
1161
|
+
if node.has_child_nodes():
|
|
1162
|
+
for child in node.children:
|
|
1163
|
+
self._find_elements(child, name, result)
|
|
1164
|
+
|
|
1165
|
+
def _find_element(self, node, name):
|
|
1166
|
+
"""Find first element with given name."""
|
|
1167
|
+
if node.name == name:
|
|
1168
|
+
return node
|
|
1169
|
+
|
|
1170
|
+
if node.has_child_nodes():
|
|
1171
|
+
for child in node.children:
|
|
1172
|
+
result = self._find_element(child, name)
|
|
1173
|
+
if result:
|
|
1174
|
+
return result
|
|
1175
|
+
return None
|
|
1176
|
+
|
|
1177
|
+
def _clone_children(self, source, target):
|
|
1178
|
+
"""Deep clone all children from source to target."""
|
|
1179
|
+
for child in source.children:
|
|
1180
|
+
target.append_child(child.clone_node(deep=True))
|
|
1181
|
+
|
|
1182
|
+
def _has_in_scope(self, name):
    """Report whether *name* is in scope, bounded by DEFAULT_SCOPE_TERMINATORS."""
    terminators = DEFAULT_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
|
|
1184
|
+
|
|
1185
|
+
def _has_in_list_item_scope(self, name):
    """Report whether *name* is in scope, bounded by LIST_ITEM_SCOPE_TERMINATORS."""
    terminators = LIST_ITEM_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
|
|
1187
|
+
|
|
1188
|
+
def _has_in_definition_scope(self, name):
    """Report whether *name* is in scope, bounded by DEFINITION_SCOPE_TERMINATORS."""
    terminators = DEFINITION_SCOPE_TERMINATORS
    return self._has_element_in_scope(name, terminators)
|
|
1190
|
+
|
|
1191
|
+
def _has_any_in_scope(self, names):
    """Return True if any element named in *names* is in the default scope.

    The open-element stack is scanned from the top; an HTML element listed
    in DEFAULT_SCOPE_TERMINATORS ends the search with False.
    """
    # Always terminates: html is in DEFAULT_SCOPE_TERMINATORS
    for node in reversed(self.open_elements):
        if node.name in names:
            return True
        if node.namespace in {None, "html"} and node.name in DEFAULT_SCOPE_TERMINATORS:
            return False
    return False  # pragma: no cover - html always terminates
|
|
1203
|
+
|
|
1204
|
+
def process_characters(self, data):
    """Optimized path for character tokens.

    Text is handled inline only when the current node is an HTML element
    (or the stack is empty) and the mode is IN_BODY; every other case is
    delegated to ``process_token`` with a CharacterTokens wrapper. On the
    inline path NUL characters are reported and dropped, and
    ``frameset_ok`` is cleared for text that is not all whitespace.
    """
    top = self.open_elements[-1] if self.open_elements else None
    in_foreign = top is not None and top.namespace not in {None, "html"}
    if in_foreign or self.mode != InsertionMode.IN_BODY:
        return self.process_token(CharacterTokens(data))

    if "\x00" in data:
        self._parse_error("invalid-codepoint")
        data = data.replace("\x00", "")

    if not data:
        return TokenSinkResult.Continue

    has_visible_text = not is_all_whitespace(data)
    self._reconstruct_active_formatting_elements()
    if has_visible_text:
        self.frameset_ok = False
    self._append_text(data)
    return TokenSinkResult.Continue
|