justhtml-0.12.0-py3-none-any.whl → justhtml-0.38.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/treebuilder.py
CHANGED
|
@@ -26,7 +26,7 @@ from .constants import (
|
|
|
26
26
|
)
|
|
27
27
|
from .errors import generate_error_message
|
|
28
28
|
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
29
|
-
from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
29
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
30
30
|
from .treebuilder_modes import TreeBuilderModesMixin
|
|
31
31
|
from .treebuilder_utils import (
|
|
32
32
|
InsertionMode,
|
|
@@ -43,6 +43,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
43
43
|
"_body_start_handlers",
|
|
44
44
|
"_body_token_handlers",
|
|
45
45
|
"_mode_handlers",
|
|
46
|
+
"_pending_end_tag_end",
|
|
47
|
+
"_pending_end_tag_name",
|
|
48
|
+
"_pending_end_tag_start",
|
|
46
49
|
"active_formatting",
|
|
47
50
|
"collect_errors",
|
|
48
51
|
"document",
|
|
@@ -59,17 +62,23 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
59
62
|
"open_elements",
|
|
60
63
|
"original_mode",
|
|
61
64
|
"pending_table_text",
|
|
65
|
+
"pending_table_text_should_error",
|
|
62
66
|
"quirks_mode",
|
|
63
67
|
"table_text_original_mode",
|
|
64
68
|
"template_modes",
|
|
65
69
|
"tokenizer",
|
|
66
70
|
"tokenizer_state_override",
|
|
71
|
+
"track_tag_spans",
|
|
67
72
|
)
|
|
68
73
|
|
|
69
74
|
_body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
70
75
|
_body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
71
76
|
_body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
72
77
|
_mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
|
|
78
|
+
_pending_end_tag_name: str | None
|
|
79
|
+
_pending_end_tag_start: int | None
|
|
80
|
+
_pending_end_tag_end: int | None
|
|
81
|
+
track_tag_spans: bool
|
|
73
82
|
active_formatting: list[Any]
|
|
74
83
|
collect_errors: bool
|
|
75
84
|
document: SimpleDomNode
|
|
@@ -86,6 +95,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
86
95
|
open_elements: list[Any]
|
|
87
96
|
original_mode: InsertionMode | None # type: ignore[assignment]
|
|
88
97
|
pending_table_text: list[str]
|
|
98
|
+
pending_table_text_should_error: bool
|
|
89
99
|
quirks_mode: str
|
|
90
100
|
table_text_original_mode: InsertionMode | None # type: ignore[assignment]
|
|
91
101
|
template_modes: list[InsertionMode]
|
|
@@ -97,10 +107,12 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
97
107
|
fragment_context: Any | None = None,
|
|
98
108
|
iframe_srcdoc: bool = False,
|
|
99
109
|
collect_errors: bool = False,
|
|
110
|
+
track_tag_spans: bool = False,
|
|
100
111
|
) -> None:
|
|
101
112
|
self.fragment_context = fragment_context
|
|
102
113
|
self.iframe_srcdoc = iframe_srcdoc
|
|
103
114
|
self.collect_errors = collect_errors
|
|
115
|
+
self.track_tag_spans = bool(track_tag_spans)
|
|
104
116
|
self.errors = []
|
|
105
117
|
self.tokenizer = None # Set by parser after tokenizer is created
|
|
106
118
|
self.fragment_context_element = None
|
|
@@ -112,12 +124,16 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
112
124
|
self.original_mode = None
|
|
113
125
|
self.table_text_original_mode = None
|
|
114
126
|
self.open_elements = []
|
|
127
|
+
self._pending_end_tag_name = None
|
|
128
|
+
self._pending_end_tag_start = None
|
|
129
|
+
self._pending_end_tag_end = None
|
|
115
130
|
self.head_element = None
|
|
116
131
|
self.form_element = None
|
|
117
132
|
self.frameset_ok = True
|
|
118
133
|
self.quirks_mode = "no-quirks"
|
|
119
134
|
self.ignore_lf = False
|
|
120
135
|
self.active_formatting = []
|
|
136
|
+
self.pending_table_text_should_error = False
|
|
121
137
|
self.insert_from_table = False
|
|
122
138
|
self.pending_table_text = []
|
|
123
139
|
self.template_modes = []
|
|
@@ -169,7 +185,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
169
185
|
def _set_quirks_mode(self, mode: str) -> None:
|
|
170
186
|
self.quirks_mode = mode
|
|
171
187
|
|
|
172
|
-
def _parse_error(self, code: str, tag_name: str | None = None, token:
|
|
188
|
+
def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
|
|
173
189
|
if not self.collect_errors:
|
|
174
190
|
return
|
|
175
191
|
# Use the position of the last emitted token (set by tokenizer before emit)
|
|
@@ -206,6 +222,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
206
222
|
code,
|
|
207
223
|
line=line,
|
|
208
224
|
column=column,
|
|
225
|
+
category="treebuilder",
|
|
209
226
|
message=message,
|
|
210
227
|
source_html=source_html,
|
|
211
228
|
end_column=end_column,
|
|
@@ -236,14 +253,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
236
253
|
def _pop_until_inclusive(self, name: str) -> None:
|
|
237
254
|
# Callers ensure element exists on stack
|
|
238
255
|
while self.open_elements: # pragma: no branch
|
|
239
|
-
node = self.
|
|
256
|
+
node = self._pop_current()
|
|
240
257
|
if node.name == name:
|
|
241
258
|
break
|
|
242
259
|
|
|
243
260
|
def _pop_until_any_inclusive(self, names: set[str]) -> None:
|
|
244
261
|
# Pop elements until we find one in names (callers ensure element exists)
|
|
245
262
|
while self.open_elements:
|
|
246
|
-
node = self.
|
|
263
|
+
node = self._pop_current()
|
|
247
264
|
if node.name in names:
|
|
248
265
|
return
|
|
249
266
|
|
|
@@ -251,7 +268,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
251
268
|
if self._has_element_in_button_scope("p"):
|
|
252
269
|
self._generate_implied_end_tags("p")
|
|
253
270
|
if self.open_elements[-1].name != "p":
|
|
254
|
-
self._parse_error("end-tag
|
|
271
|
+
self._parse_error("unexpected-end-tag", tag_name="p")
|
|
255
272
|
self._pop_until_inclusive("p")
|
|
256
273
|
return True
|
|
257
274
|
return False
|
|
@@ -270,206 +287,218 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
270
287
|
|
|
271
288
|
current_token = token
|
|
272
289
|
force_html_mode = False
|
|
290
|
+
if token_type is Tag and token.kind == Tag.END:
|
|
291
|
+
self._pending_end_tag_name = token.name
|
|
292
|
+
if self.track_tag_spans:
|
|
293
|
+
self._pending_end_tag_start = token.start_pos
|
|
294
|
+
self._pending_end_tag_end = token.end_pos
|
|
295
|
+
else:
|
|
296
|
+
self._pending_end_tag_start = None
|
|
297
|
+
self._pending_end_tag_end = None
|
|
273
298
|
|
|
274
299
|
# Cache mode handlers list for speed
|
|
275
300
|
mode_handlers = self._MODE_HANDLERS
|
|
276
301
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
force_html_mode
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
self._reconstruct_active_formatting_elements()
|
|
317
|
-
self._insert_element(current_token, push=True)
|
|
318
|
-
self.frameset_ok = False
|
|
319
|
-
result = None
|
|
320
|
-
elif name == "a":
|
|
321
|
-
result = self._handle_body_start_a(current_token)
|
|
322
|
-
elif name == "br" or name == "img":
|
|
323
|
-
if self.active_formatting:
|
|
324
|
-
self._reconstruct_active_formatting_elements()
|
|
325
|
-
self._insert_element(current_token, push=False)
|
|
326
|
-
self.frameset_ok = False
|
|
327
|
-
result = None
|
|
328
|
-
elif name == "hr":
|
|
329
|
-
has_p = False
|
|
330
|
-
idx = len(self.open_elements) - 1
|
|
331
|
-
while idx >= 0: # pragma: no branch
|
|
332
|
-
node = self.open_elements[idx]
|
|
333
|
-
if node.name == "p":
|
|
334
|
-
has_p = True
|
|
335
|
-
break
|
|
336
|
-
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
337
|
-
break
|
|
338
|
-
idx -= 1
|
|
339
|
-
|
|
340
|
-
if has_p:
|
|
341
|
-
self._close_p_element()
|
|
342
|
-
|
|
343
|
-
self._insert_element(current_token, push=False)
|
|
344
|
-
self.frameset_ok = False
|
|
345
|
-
result = None
|
|
346
|
-
else:
|
|
347
|
-
handler = self._BODY_START_HANDLERS.get(name)
|
|
348
|
-
if handler:
|
|
349
|
-
result = handler(self, current_token)
|
|
350
|
-
else:
|
|
351
|
-
# Inline _handle_body_start_default
|
|
352
|
-
# Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
302
|
+
try:
|
|
303
|
+
while True:
|
|
304
|
+
# Update token type for current token (it might have changed if reprocessed)
|
|
305
|
+
token_type = type(current_token)
|
|
306
|
+
|
|
307
|
+
# Optimization: Check for HTML namespace first (common case)
|
|
308
|
+
current_node = self.open_elements[-1] if self.open_elements else None
|
|
309
|
+
is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
|
|
310
|
+
|
|
311
|
+
if force_html_mode or is_html_namespace:
|
|
312
|
+
force_html_mode = False
|
|
313
|
+
if self.mode == InsertionMode.IN_BODY:
|
|
314
|
+
# Inline _mode_in_body for performance
|
|
315
|
+
if token_type is Tag:
|
|
316
|
+
# Inline _handle_tag_in_body
|
|
317
|
+
if current_token.kind == 0: # Tag.START
|
|
318
|
+
name = current_token.name
|
|
319
|
+
if name == "div" or name == "ul" or name == "ol":
|
|
320
|
+
# Inline _handle_body_start_block_with_p
|
|
321
|
+
# Check if p is in button scope (html always terminates)
|
|
322
|
+
has_p = False
|
|
323
|
+
idx = len(self.open_elements) - 1
|
|
324
|
+
while idx >= 0: # pragma: no branch
|
|
325
|
+
node = self.open_elements[idx]
|
|
326
|
+
if node.name == "p":
|
|
327
|
+
has_p = True
|
|
328
|
+
break
|
|
329
|
+
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
330
|
+
break
|
|
331
|
+
idx -= 1
|
|
332
|
+
|
|
333
|
+
if has_p:
|
|
334
|
+
self._close_p_element()
|
|
335
|
+
|
|
336
|
+
self._insert_element(current_token, push=True)
|
|
337
|
+
result = None
|
|
338
|
+
elif name == "p":
|
|
339
|
+
result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
|
|
340
|
+
elif name == "span":
|
|
353
341
|
if self.active_formatting:
|
|
354
342
|
self._reconstruct_active_formatting_elements()
|
|
355
343
|
self._insert_element(current_token, push=True)
|
|
356
|
-
if current_token.self_closing:
|
|
357
|
-
self._parse_error(
|
|
358
|
-
"non-void-html-element-start-tag-with-trailing-solidus",
|
|
359
|
-
tag_name=current_token.name,
|
|
360
|
-
)
|
|
361
344
|
self.frameset_ok = False
|
|
362
345
|
result = None
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
346
|
+
elif name == "a":
|
|
347
|
+
result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
|
|
348
|
+
elif name == "br" or name == "img":
|
|
349
|
+
if self.active_formatting:
|
|
350
|
+
self._reconstruct_active_formatting_elements()
|
|
351
|
+
self._insert_element(current_token, push=False)
|
|
352
|
+
self.frameset_ok = False
|
|
353
|
+
result = None
|
|
354
|
+
elif name == "hr":
|
|
355
|
+
has_p = False
|
|
356
|
+
idx = len(self.open_elements) - 1
|
|
357
|
+
while idx >= 0: # pragma: no branch
|
|
358
|
+
node = self.open_elements[idx]
|
|
359
|
+
if node.name == "p":
|
|
360
|
+
has_p = True
|
|
361
|
+
break
|
|
362
|
+
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
363
|
+
break
|
|
364
|
+
idx -= 1
|
|
365
|
+
|
|
366
|
+
if has_p:
|
|
367
|
+
self._close_p_element()
|
|
368
|
+
|
|
369
|
+
self._insert_element(current_token, push=False)
|
|
370
|
+
self.frameset_ok = False
|
|
371
|
+
result = None
|
|
376
372
|
else:
|
|
377
|
-
self.
|
|
373
|
+
handler = self._BODY_START_HANDLERS.get(name)
|
|
374
|
+
if handler:
|
|
375
|
+
result = handler(self, current_token)
|
|
376
|
+
else:
|
|
377
|
+
# Inline _handle_body_start_default
|
|
378
|
+
# Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
379
|
+
if self.active_formatting:
|
|
380
|
+
self._reconstruct_active_formatting_elements()
|
|
381
|
+
self._insert_element(current_token, push=True)
|
|
382
|
+
if current_token.self_closing:
|
|
383
|
+
self._parse_error(
|
|
384
|
+
"non-void-html-element-start-tag-with-trailing-solidus",
|
|
385
|
+
tag_name=current_token.name,
|
|
386
|
+
)
|
|
387
|
+
self.frameset_ok = False
|
|
388
|
+
result = None
|
|
389
|
+
else:
|
|
390
|
+
name = current_token.name
|
|
391
|
+
if name == "br":
|
|
392
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
393
|
+
br_tag = Tag(0, "br", {}, False)
|
|
394
|
+
result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
|
|
395
|
+
elif name in FORMATTING_ELEMENTS:
|
|
396
|
+
self._adoption_agency(name)
|
|
378
397
|
result = None
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
# Only pop foreign elements if we're NOT at an HTML/MathML integration point
|
|
398
|
-
# and NOT about to insert a new foreign element (svg/math)
|
|
399
|
-
if not isinstance(current_token, EOFToken):
|
|
400
|
-
# Don't pop at integration points - they stay on stack to receive content
|
|
401
|
-
if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
|
|
402
|
-
pass
|
|
403
|
-
# Don't pop when inserting new svg/math elements
|
|
404
|
-
if isinstance(current_token, Tag) and current_token.kind == Tag.START:
|
|
405
|
-
# Optimization: Tokenizer already lowercases tag names
|
|
406
|
-
name_lower = current_token.name
|
|
407
|
-
if name_lower in {"svg", "math"}:
|
|
408
|
-
pass
|
|
409
|
-
|
|
410
|
-
# Special handling: text at integration points inserts directly, bypassing mode dispatch
|
|
411
|
-
if isinstance(current_token, CharacterTokens):
|
|
412
|
-
if self._is_mathml_text_integration_point(current):
|
|
413
|
-
# Tokenizer guarantees non-empty data
|
|
414
|
-
data = current_token.data
|
|
415
|
-
if "\x00" in data:
|
|
416
|
-
self._parse_error("invalid-codepoint")
|
|
417
|
-
data = data.replace("\x00", "")
|
|
418
|
-
if "\x0c" in data:
|
|
419
|
-
self._parse_error("invalid-codepoint")
|
|
420
|
-
data = data.replace("\x0c", "")
|
|
421
|
-
if data:
|
|
422
|
-
if not is_all_whitespace(data):
|
|
423
|
-
self._reconstruct_active_formatting_elements()
|
|
424
|
-
self.frameset_ok = False
|
|
425
|
-
self._append_text(data)
|
|
426
|
-
result = None
|
|
398
|
+
else:
|
|
399
|
+
handler = self._BODY_END_HANDLERS.get(name)
|
|
400
|
+
if handler:
|
|
401
|
+
result = handler(self, current_token)
|
|
402
|
+
else:
|
|
403
|
+
self._any_other_end_tag(name)
|
|
404
|
+
result = None
|
|
405
|
+
elif token_type is CharacterTokens:
|
|
406
|
+
# Inline _handle_characters_in_body
|
|
407
|
+
# Only non-whitespace data reaches here (whitespace handled in process_characters)
|
|
408
|
+
self.frameset_ok = False
|
|
409
|
+
self._reconstruct_active_formatting_elements()
|
|
410
|
+
self._append_text(current_token.data)
|
|
411
|
+
result = None
|
|
412
|
+
elif token_type is CommentToken:
|
|
413
|
+
result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
|
|
414
|
+
else: # EOFToken
|
|
415
|
+
result = self._handle_eof_in_body(current_token)
|
|
427
416
|
else:
|
|
428
417
|
result = mode_handlers[self.mode](self, current_token)
|
|
418
|
+
elif self._should_use_foreign_content(current_token):
|
|
419
|
+
result = self._process_foreign_content(current_token)
|
|
429
420
|
else:
|
|
430
|
-
#
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
#
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
421
|
+
# Foreign content stack logic
|
|
422
|
+
current = current_node
|
|
423
|
+
# Only pop foreign elements if we're NOT at an HTML/MathML integration point
|
|
424
|
+
# and NOT about to insert a new foreign element (svg/math)
|
|
425
|
+
if not isinstance(current_token, EOFToken):
|
|
426
|
+
# Don't pop at integration points - they stay on stack to receive content
|
|
427
|
+
if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
|
|
428
|
+
pass
|
|
429
|
+
# Don't pop when inserting new svg/math elements
|
|
430
|
+
if isinstance(current_token, Tag) and current_token.kind == Tag.START:
|
|
431
|
+
# Optimization: Tokenizer already lowercases tag names
|
|
432
|
+
name_lower = current_token.name
|
|
433
|
+
if name_lower in {"svg", "math"}:
|
|
434
|
+
pass
|
|
435
|
+
|
|
436
|
+
# Special handling: text at integration points inserts directly, bypassing mode dispatch
|
|
437
|
+
if isinstance(current_token, CharacterTokens):
|
|
438
|
+
if self._is_mathml_text_integration_point(current):
|
|
439
|
+
# Tokenizer guarantees non-empty data
|
|
440
|
+
data = current_token.data
|
|
441
|
+
if "\x00" in data:
|
|
442
|
+
data = data.replace("\x00", "")
|
|
443
|
+
if data:
|
|
444
|
+
if not is_all_whitespace(data):
|
|
445
|
+
self._reconstruct_active_formatting_elements()
|
|
446
|
+
self.frameset_ok = False
|
|
447
|
+
self._append_text(data)
|
|
448
|
+
result = None
|
|
456
449
|
else:
|
|
457
450
|
result = mode_handlers[self.mode](self, current_token)
|
|
458
451
|
else:
|
|
459
|
-
|
|
452
|
+
# At integration points inside foreign content, check if table tags make sense.
|
|
453
|
+
if (
|
|
454
|
+
(
|
|
455
|
+
self._is_mathml_text_integration_point(current)
|
|
456
|
+
or self._is_html_integration_point(current)
|
|
457
|
+
)
|
|
458
|
+
and isinstance(current_token, Tag)
|
|
459
|
+
and current_token.kind == Tag.START
|
|
460
|
+
and self.mode not in {InsertionMode.IN_BODY}
|
|
461
|
+
):
|
|
462
|
+
# Check if we're in a table mode but without an actual table in scope
|
|
463
|
+
# If so, table tags should be ignored (use IN_BODY mode)
|
|
464
|
+
is_table_mode = self.mode in {
|
|
465
|
+
InsertionMode.IN_TABLE,
|
|
466
|
+
InsertionMode.IN_TABLE_BODY,
|
|
467
|
+
InsertionMode.IN_ROW,
|
|
468
|
+
InsertionMode.IN_CELL,
|
|
469
|
+
InsertionMode.IN_CAPTION,
|
|
470
|
+
InsertionMode.IN_COLUMN_GROUP,
|
|
471
|
+
}
|
|
472
|
+
has_table_in_scope = self._has_in_table_scope("table")
|
|
473
|
+
if is_table_mode and not has_table_in_scope:
|
|
474
|
+
# Temporarily use IN_BODY mode for this tag
|
|
475
|
+
saved_mode = self.mode
|
|
476
|
+
self.mode = InsertionMode.IN_BODY
|
|
477
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
478
|
+
# Restore mode if no mode change was requested
|
|
479
|
+
if self.mode == InsertionMode.IN_BODY: # pragma: no branch
|
|
480
|
+
self.mode = saved_mode
|
|
481
|
+
else:
|
|
482
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
483
|
+
else:
|
|
484
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
460
485
|
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
486
|
+
if result is None:
|
|
487
|
+
result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
|
|
488
|
+
self.tokenizer_state_override = None
|
|
489
|
+
return result_to_return
|
|
490
|
+
# Result is (instruction, mode, token) or (instruction, mode, token, force_html)
|
|
491
|
+
_instruction, mode, token_override = result[0], result[1], result[2]
|
|
492
|
+
if len(result) == 4:
|
|
493
|
+
force_html_mode = result[3]
|
|
494
|
+
# All mode handlers that return a tuple use "reprocess" instruction
|
|
495
|
+
self.mode = mode
|
|
496
|
+
current_token = token_override
|
|
497
|
+
# Continue loop to reprocess
|
|
498
|
+
finally:
|
|
499
|
+
self._pending_end_tag_name = None
|
|
500
|
+
self._pending_end_tag_start = None
|
|
501
|
+
self._pending_end_tag_end = None
|
|
473
502
|
|
|
474
503
|
def finish(self) -> SimpleDomNode:
|
|
475
504
|
if self.fragment_context is not None:
|
|
@@ -491,12 +520,19 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
491
520
|
# Populate selectedcontent elements per HTML5 spec
|
|
492
521
|
self._populate_selectedcontent(self.document)
|
|
493
522
|
|
|
523
|
+
if self.tokenizer is not None and self.track_tag_spans: # pragma: no branch
|
|
524
|
+
self.document._source_html = self.tokenizer.buffer
|
|
525
|
+
|
|
494
526
|
return self.document
|
|
495
527
|
|
|
496
528
|
# Insertion mode dispatch ------------------------------------------------
|
|
497
529
|
|
|
498
530
|
def _append_comment_to_document(self, text: str) -> None:
|
|
499
531
|
node = SimpleDomNode("#comment", data=text)
|
|
532
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
533
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
534
|
+
if node._origin_pos is not None:
|
|
535
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
500
536
|
self.document.append_child(node)
|
|
501
537
|
|
|
502
538
|
def _append_comment(self, text: str, parent: Any | None = None) -> None:
|
|
@@ -506,6 +542,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
506
542
|
if type(parent) is TemplateNode and parent.template_content:
|
|
507
543
|
parent = parent.template_content
|
|
508
544
|
node = SimpleDomNode("#comment", data=text)
|
|
545
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
546
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
547
|
+
if node._origin_pos is not None:
|
|
548
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
509
549
|
parent.append_child(node)
|
|
510
550
|
|
|
511
551
|
def _append_text(self, text: str) -> None:
|
|
@@ -516,6 +556,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
516
556
|
if not text:
|
|
517
557
|
return
|
|
518
558
|
|
|
559
|
+
if "\f" in text:
|
|
560
|
+
text = text.replace("\f", " ")
|
|
561
|
+
|
|
519
562
|
# Guard against empty stack
|
|
520
563
|
if not self.open_elements: # pragma: no cover
|
|
521
564
|
return
|
|
@@ -532,6 +575,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
532
575
|
return
|
|
533
576
|
|
|
534
577
|
node = TextNode(text)
|
|
578
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
579
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
580
|
+
if node._origin_pos is not None:
|
|
581
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
535
582
|
children.append(node)
|
|
536
583
|
node.parent = target
|
|
537
584
|
return
|
|
@@ -552,6 +599,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
552
599
|
return
|
|
553
600
|
|
|
554
601
|
node = TextNode(text)
|
|
602
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
603
|
+
node._origin_pos = self.tokenizer.last_token_start_pos
|
|
604
|
+
if node._origin_pos is not None:
|
|
605
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
555
606
|
reference_node = parent.children[position] if position < len(parent.children) else None
|
|
556
607
|
parent.insert_before(node, reference_node)
|
|
557
608
|
|
|
@@ -581,6 +632,15 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
581
632
|
node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
582
633
|
else:
|
|
583
634
|
node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
635
|
+
if self.track_tag_spans:
|
|
636
|
+
node._start_tag_start = tag.start_pos
|
|
637
|
+
node._start_tag_end = tag.end_pos
|
|
638
|
+
node._self_closing = bool(getattr(tag, "self_closing", False))
|
|
639
|
+
|
|
640
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
641
|
+
node._origin_pos = tag.start_pos
|
|
642
|
+
if node._origin_pos is not None:
|
|
643
|
+
node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
|
|
584
644
|
|
|
585
645
|
# Fast path for common case: not inserting from table
|
|
586
646
|
if not self.insert_from_table:
|
|
@@ -624,8 +684,23 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
624
684
|
ns = namespace or "html"
|
|
625
685
|
return ElementNode(name, attrs, ns)
|
|
626
686
|
|
|
687
|
+
def _maybe_mark_end_tag(self, node: Any) -> None:
|
|
688
|
+
if self._pending_end_tag_name is None:
|
|
689
|
+
return
|
|
690
|
+
if getattr(node, "name", None) != self._pending_end_tag_name:
|
|
691
|
+
return
|
|
692
|
+
node._end_tag_present = True
|
|
693
|
+
if self.track_tag_spans:
|
|
694
|
+
node._end_tag_start = self._pending_end_tag_start
|
|
695
|
+
node._end_tag_end = self._pending_end_tag_end
|
|
696
|
+
self._pending_end_tag_name = None
|
|
697
|
+
self._pending_end_tag_start = None
|
|
698
|
+
self._pending_end_tag_end = None
|
|
699
|
+
|
|
627
700
|
def _pop_current(self) -> Any:
|
|
628
|
-
|
|
701
|
+
node = self.open_elements.pop()
|
|
702
|
+
self._maybe_mark_end_tag(node)
|
|
703
|
+
return node
|
|
629
704
|
|
|
630
705
|
def _in_scope(self, name: str) -> bool:
|
|
631
706
|
return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
|
|
@@ -637,6 +712,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
637
712
|
index = len(self.open_elements) - 1
|
|
638
713
|
while index >= 0: # pragma: no branch
|
|
639
714
|
if self.open_elements[index].name == name:
|
|
715
|
+
self._maybe_mark_end_tag(self.open_elements[index])
|
|
640
716
|
del self.open_elements[index:]
|
|
641
717
|
return
|
|
642
718
|
index -= 1
|
|
@@ -654,6 +730,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
654
730
|
# If current node is not this node, parse error
|
|
655
731
|
if index != len(self.open_elements) - 1:
|
|
656
732
|
self._parse_error("end-tag-too-early")
|
|
733
|
+
self._maybe_mark_end_tag(node)
|
|
657
734
|
# Pop all elements from this node onwards
|
|
658
735
|
del self.open_elements[index:]
|
|
659
736
|
return
|
|
@@ -677,6 +754,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
677
754
|
def _remove_from_open_elements(self, node: Any) -> bool:
|
|
678
755
|
for index, current in enumerate(self.open_elements):
|
|
679
756
|
if current is node:
|
|
757
|
+
self._maybe_mark_end_tag(current)
|
|
680
758
|
del self.open_elements[index]
|
|
681
759
|
return True
|
|
682
760
|
return False
|
|
@@ -749,6 +827,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
749
827
|
def _remove_last_open_element_by_name(self, name: str) -> None:
|
|
750
828
|
for index in range(len(self.open_elements) - 1, -1, -1):
|
|
751
829
|
if self.open_elements[index].name == name:
|
|
830
|
+
self._maybe_mark_end_tag(self.open_elements[index])
|
|
752
831
|
del self.open_elements[index]
|
|
753
832
|
return
|
|
754
833
|
|
|
@@ -799,6 +878,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
799
878
|
entry = self.active_formatting[index]
|
|
800
879
|
tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
|
|
801
880
|
new_node = self._insert_element(tag, push=True)
|
|
881
|
+
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
882
|
+
new_node._origin_pos = entry["node"].origin_offset
|
|
883
|
+
new_node._origin_line = entry["node"].origin_line
|
|
884
|
+
new_node._origin_col = entry["node"].origin_col
|
|
802
885
|
entry["node"] = new_node
|
|
803
886
|
index += 1
|
|
804
887
|
|
|
@@ -820,14 +903,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
820
903
|
node = self.open_elements[-1]
|
|
821
904
|
if node.name in names and node.namespace in {None, "html"}:
|
|
822
905
|
break
|
|
823
|
-
self.
|
|
906
|
+
self._pop_current()
|
|
824
907
|
|
|
825
908
|
def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
|
|
826
909
|
# Always terminates: html is not in IMPLIED_END_TAGS
|
|
827
910
|
while self.open_elements: # pragma: no branch
|
|
828
911
|
node = self.open_elements[-1]
|
|
829
912
|
if node.name in IMPLIED_END_TAGS and node.name != exclude:
|
|
830
|
-
self.
|
|
913
|
+
self._pop_current()
|
|
831
914
|
continue
|
|
832
915
|
break
|
|
833
916
|
|
|
@@ -846,7 +929,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
846
929
|
def _end_table_cell(self, name: str) -> None:
|
|
847
930
|
self._generate_implied_end_tags(name)
|
|
848
931
|
while self.open_elements:
|
|
849
|
-
node = self.
|
|
932
|
+
node = self._pop_current()
|
|
850
933
|
if node.name == name and node.namespace in {None, "html"}:
|
|
851
934
|
break
|
|
852
935
|
self._clear_active_formatting_up_to_marker()
|
|
@@ -855,12 +938,19 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
855
938
|
def _flush_pending_table_text(self) -> None:
|
|
856
939
|
data = "".join(self.pending_table_text)
|
|
857
940
|
self.pending_table_text.clear()
|
|
858
|
-
if not data:
|
|
941
|
+
if not data: # pragma: no cover
|
|
859
942
|
return
|
|
860
943
|
if is_all_whitespace(data):
|
|
861
944
|
self._append_text(data)
|
|
862
945
|
return
|
|
863
|
-
|
|
946
|
+
|
|
947
|
+
if self.pending_table_text_should_error:
|
|
948
|
+
# html5lib reports one foster-parenting error per non-whitespace character.
|
|
949
|
+
for ch in data:
|
|
950
|
+
if ch not in " \t\n\r\f":
|
|
951
|
+
self._parse_error("foster-parenting-character")
|
|
952
|
+
self.pending_table_text_should_error = False
|
|
953
|
+
|
|
864
954
|
previous = self.insert_from_table
|
|
865
955
|
self.insert_from_table = True
|
|
866
956
|
try:
|
|
@@ -876,7 +966,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
876
966
|
self._generate_implied_end_tags()
|
|
877
967
|
# Table verified in scope above
|
|
878
968
|
while self.open_elements: # pragma: no branch
|
|
879
|
-
node = self.
|
|
969
|
+
node = self._pop_current()
|
|
880
970
|
if node.name == "table":
|
|
881
971
|
break
|
|
882
972
|
self._reset_insertion_mode()
|
|
@@ -989,7 +1079,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
989
1079
|
def _adjusted_current_node(self) -> Any:
|
|
990
1080
|
return self.open_elements[-1]
|
|
991
1081
|
|
|
992
|
-
def _should_use_foreign_content(self, token:
|
|
1082
|
+
def _should_use_foreign_content(self, token: AnyToken) -> bool:
|
|
993
1083
|
current = self._adjusted_current_node()
|
|
994
1084
|
# HTML namespace elements don't use foreign content rules
|
|
995
1085
|
# (unreachable in practice as foreign content mode only entered for foreign elements)
|
|
@@ -1036,9 +1126,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1036
1126
|
return
|
|
1037
1127
|
if self.fragment_context_element is not None and node is self.fragment_context_element:
|
|
1038
1128
|
return
|
|
1039
|
-
self.
|
|
1129
|
+
self._pop_current()
|
|
1040
1130
|
|
|
1041
|
-
def _process_foreign_content(self, token:
|
|
1131
|
+
def _process_foreign_content(self, token: AnyToken) -> Any | None:
|
|
1042
1132
|
current = self._adjusted_current_node()
|
|
1043
1133
|
|
|
1044
1134
|
if isinstance(token, CharacterTokens):
|
|
@@ -1113,12 +1203,13 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1113
1203
|
if is_html:
|
|
1114
1204
|
return ("reprocess", self.mode, token, True)
|
|
1115
1205
|
# Otherwise it's a foreign element - pop everything from this point up
|
|
1206
|
+
self._maybe_mark_end_tag(node)
|
|
1116
1207
|
del self.open_elements[idx:]
|
|
1117
1208
|
return None
|
|
1118
1209
|
|
|
1119
1210
|
# Per HTML5 spec: if first node doesn't match, it's a parse error
|
|
1120
1211
|
if first:
|
|
1121
|
-
self._parse_error("unexpected-end-tag
|
|
1212
|
+
self._parse_error("unexpected-end-tag", tag_name=token.name)
|
|
1122
1213
|
first = False
|
|
1123
1214
|
|
|
1124
1215
|
# If we hit an HTML element that doesn't match, process in secondary mode
|
|
@@ -1259,19 +1350,21 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1259
1350
|
return self.process_token(CharacterTokens(data))
|
|
1260
1351
|
|
|
1261
1352
|
if self.mode == InsertionMode.IN_BODY:
|
|
1262
|
-
if "\x00" in data:
|
|
1263
|
-
self._parse_error("invalid-codepoint")
|
|
1264
|
-
data = data.replace("\x00", "")
|
|
1265
|
-
|
|
1266
1353
|
if not data:
|
|
1267
1354
|
return TokenSinkResult.Continue
|
|
1355
|
+
if "\x00" in data:
|
|
1356
|
+
data = data.replace("\x00", "")
|
|
1357
|
+
if not data:
|
|
1358
|
+
return TokenSinkResult.Continue
|
|
1268
1359
|
|
|
1269
1360
|
if is_all_whitespace(data):
|
|
1270
|
-
self.
|
|
1361
|
+
if self.active_formatting:
|
|
1362
|
+
self._reconstruct_active_formatting_elements()
|
|
1271
1363
|
self._append_text(data)
|
|
1272
1364
|
return TokenSinkResult.Continue
|
|
1273
1365
|
|
|
1274
|
-
self.
|
|
1366
|
+
if self.active_formatting:
|
|
1367
|
+
self._reconstruct_active_formatting_elements()
|
|
1275
1368
|
self.frameset_ok = False
|
|
1276
1369
|
self._append_text(data)
|
|
1277
1370
|
return TokenSinkResult.Continue
|