justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/treebuilder.py
CHANGED
|
@@ -26,7 +26,7 @@ from .constants import (
|
|
|
26
26
|
)
|
|
27
27
|
from .errors import generate_error_message
|
|
28
28
|
from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
|
|
29
|
-
from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
29
|
+
from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
|
|
30
30
|
from .treebuilder_modes import TreeBuilderModesMixin
|
|
31
31
|
from .treebuilder_utils import (
|
|
32
32
|
InsertionMode,
|
|
@@ -43,6 +43,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
43
43
|
"_body_start_handlers",
|
|
44
44
|
"_body_token_handlers",
|
|
45
45
|
"_mode_handlers",
|
|
46
|
+
"_pending_end_tag_end",
|
|
47
|
+
"_pending_end_tag_name",
|
|
48
|
+
"_pending_end_tag_start",
|
|
46
49
|
"active_formatting",
|
|
47
50
|
"collect_errors",
|
|
48
51
|
"document",
|
|
@@ -65,12 +68,17 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
65
68
|
"template_modes",
|
|
66
69
|
"tokenizer",
|
|
67
70
|
"tokenizer_state_override",
|
|
71
|
+
"track_tag_spans",
|
|
68
72
|
)
|
|
69
73
|
|
|
70
74
|
_body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
71
75
|
_body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
72
76
|
_body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
|
|
73
77
|
_mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
|
|
78
|
+
_pending_end_tag_name: str | None
|
|
79
|
+
_pending_end_tag_start: int | None
|
|
80
|
+
_pending_end_tag_end: int | None
|
|
81
|
+
track_tag_spans: bool
|
|
74
82
|
active_formatting: list[Any]
|
|
75
83
|
collect_errors: bool
|
|
76
84
|
document: SimpleDomNode
|
|
@@ -99,10 +107,12 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
99
107
|
fragment_context: Any | None = None,
|
|
100
108
|
iframe_srcdoc: bool = False,
|
|
101
109
|
collect_errors: bool = False,
|
|
110
|
+
track_tag_spans: bool = False,
|
|
102
111
|
) -> None:
|
|
103
112
|
self.fragment_context = fragment_context
|
|
104
113
|
self.iframe_srcdoc = iframe_srcdoc
|
|
105
114
|
self.collect_errors = collect_errors
|
|
115
|
+
self.track_tag_spans = bool(track_tag_spans)
|
|
106
116
|
self.errors = []
|
|
107
117
|
self.tokenizer = None # Set by parser after tokenizer is created
|
|
108
118
|
self.fragment_context_element = None
|
|
@@ -114,6 +124,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
114
124
|
self.original_mode = None
|
|
115
125
|
self.table_text_original_mode = None
|
|
116
126
|
self.open_elements = []
|
|
127
|
+
self._pending_end_tag_name = None
|
|
128
|
+
self._pending_end_tag_start = None
|
|
129
|
+
self._pending_end_tag_end = None
|
|
117
130
|
self.head_element = None
|
|
118
131
|
self.form_element = None
|
|
119
132
|
self.frameset_ok = True
|
|
@@ -172,7 +185,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
172
185
|
def _set_quirks_mode(self, mode: str) -> None:
|
|
173
186
|
self.quirks_mode = mode
|
|
174
187
|
|
|
175
|
-
def _parse_error(self, code: str, tag_name: str | None = None, token:
|
|
188
|
+
def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
|
|
176
189
|
if not self.collect_errors:
|
|
177
190
|
return
|
|
178
191
|
# Use the position of the last emitted token (set by tokenizer before emit)
|
|
@@ -209,6 +222,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
209
222
|
code,
|
|
210
223
|
line=line,
|
|
211
224
|
column=column,
|
|
225
|
+
category="treebuilder",
|
|
212
226
|
message=message,
|
|
213
227
|
source_html=source_html,
|
|
214
228
|
end_column=end_column,
|
|
@@ -239,14 +253,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
239
253
|
def _pop_until_inclusive(self, name: str) -> None:
|
|
240
254
|
# Callers ensure element exists on stack
|
|
241
255
|
while self.open_elements: # pragma: no branch
|
|
242
|
-
node = self.
|
|
256
|
+
node = self._pop_current()
|
|
243
257
|
if node.name == name:
|
|
244
258
|
break
|
|
245
259
|
|
|
246
260
|
def _pop_until_any_inclusive(self, names: set[str]) -> None:
|
|
247
261
|
# Pop elements until we find one in names (callers ensure element exists)
|
|
248
262
|
while self.open_elements:
|
|
249
|
-
node = self.
|
|
263
|
+
node = self._pop_current()
|
|
250
264
|
if node.name in names:
|
|
251
265
|
return
|
|
252
266
|
|
|
@@ -273,202 +287,218 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
273
287
|
|
|
274
288
|
current_token = token
|
|
275
289
|
force_html_mode = False
|
|
290
|
+
if token_type is Tag and token.kind == Tag.END:
|
|
291
|
+
self._pending_end_tag_name = token.name
|
|
292
|
+
if self.track_tag_spans:
|
|
293
|
+
self._pending_end_tag_start = token.start_pos
|
|
294
|
+
self._pending_end_tag_end = token.end_pos
|
|
295
|
+
else:
|
|
296
|
+
self._pending_end_tag_start = None
|
|
297
|
+
self._pending_end_tag_end = None
|
|
276
298
|
|
|
277
299
|
# Cache mode handlers list for speed
|
|
278
300
|
mode_handlers = self._MODE_HANDLERS
|
|
279
301
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
force_html_mode
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
self._reconstruct_active_formatting_elements()
|
|
320
|
-
self._insert_element(current_token, push=True)
|
|
321
|
-
self.frameset_ok = False
|
|
322
|
-
result = None
|
|
323
|
-
elif name == "a":
|
|
324
|
-
result = self._handle_body_start_a(current_token)
|
|
325
|
-
elif name == "br" or name == "img":
|
|
326
|
-
if self.active_formatting:
|
|
327
|
-
self._reconstruct_active_formatting_elements()
|
|
328
|
-
self._insert_element(current_token, push=False)
|
|
329
|
-
self.frameset_ok = False
|
|
330
|
-
result = None
|
|
331
|
-
elif name == "hr":
|
|
332
|
-
has_p = False
|
|
333
|
-
idx = len(self.open_elements) - 1
|
|
334
|
-
while idx >= 0: # pragma: no branch
|
|
335
|
-
node = self.open_elements[idx]
|
|
336
|
-
if node.name == "p":
|
|
337
|
-
has_p = True
|
|
338
|
-
break
|
|
339
|
-
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
340
|
-
break
|
|
341
|
-
idx -= 1
|
|
342
|
-
|
|
343
|
-
if has_p:
|
|
344
|
-
self._close_p_element()
|
|
345
|
-
|
|
346
|
-
self._insert_element(current_token, push=False)
|
|
347
|
-
self.frameset_ok = False
|
|
348
|
-
result = None
|
|
349
|
-
else:
|
|
350
|
-
handler = self._BODY_START_HANDLERS.get(name)
|
|
351
|
-
if handler:
|
|
352
|
-
result = handler(self, current_token)
|
|
353
|
-
else:
|
|
354
|
-
# Inline _handle_body_start_default
|
|
355
|
-
# Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
302
|
+
try:
|
|
303
|
+
while True:
|
|
304
|
+
# Update token type for current token (it might have changed if reprocessed)
|
|
305
|
+
token_type = type(current_token)
|
|
306
|
+
|
|
307
|
+
# Optimization: Check for HTML namespace first (common case)
|
|
308
|
+
current_node = self.open_elements[-1] if self.open_elements else None
|
|
309
|
+
is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
|
|
310
|
+
|
|
311
|
+
if force_html_mode or is_html_namespace:
|
|
312
|
+
force_html_mode = False
|
|
313
|
+
if self.mode == InsertionMode.IN_BODY:
|
|
314
|
+
# Inline _mode_in_body for performance
|
|
315
|
+
if token_type is Tag:
|
|
316
|
+
# Inline _handle_tag_in_body
|
|
317
|
+
if current_token.kind == 0: # Tag.START
|
|
318
|
+
name = current_token.name
|
|
319
|
+
if name == "div" or name == "ul" or name == "ol":
|
|
320
|
+
# Inline _handle_body_start_block_with_p
|
|
321
|
+
# Check if p is in button scope (html always terminates)
|
|
322
|
+
has_p = False
|
|
323
|
+
idx = len(self.open_elements) - 1
|
|
324
|
+
while idx >= 0: # pragma: no branch
|
|
325
|
+
node = self.open_elements[idx]
|
|
326
|
+
if node.name == "p":
|
|
327
|
+
has_p = True
|
|
328
|
+
break
|
|
329
|
+
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
330
|
+
break
|
|
331
|
+
idx -= 1
|
|
332
|
+
|
|
333
|
+
if has_p:
|
|
334
|
+
self._close_p_element()
|
|
335
|
+
|
|
336
|
+
self._insert_element(current_token, push=True)
|
|
337
|
+
result = None
|
|
338
|
+
elif name == "p":
|
|
339
|
+
result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
|
|
340
|
+
elif name == "span":
|
|
356
341
|
if self.active_formatting:
|
|
357
342
|
self._reconstruct_active_formatting_elements()
|
|
358
343
|
self._insert_element(current_token, push=True)
|
|
359
|
-
if current_token.self_closing:
|
|
360
|
-
self._parse_error(
|
|
361
|
-
"non-void-html-element-start-tag-with-trailing-solidus",
|
|
362
|
-
tag_name=current_token.name,
|
|
363
|
-
)
|
|
364
344
|
self.frameset_ok = False
|
|
365
345
|
result = None
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
346
|
+
elif name == "a":
|
|
347
|
+
result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
|
|
348
|
+
elif name == "br" or name == "img":
|
|
349
|
+
if self.active_formatting:
|
|
350
|
+
self._reconstruct_active_formatting_elements()
|
|
351
|
+
self._insert_element(current_token, push=False)
|
|
352
|
+
self.frameset_ok = False
|
|
353
|
+
result = None
|
|
354
|
+
elif name == "hr":
|
|
355
|
+
has_p = False
|
|
356
|
+
idx = len(self.open_elements) - 1
|
|
357
|
+
while idx >= 0: # pragma: no branch
|
|
358
|
+
node = self.open_elements[idx]
|
|
359
|
+
if node.name == "p":
|
|
360
|
+
has_p = True
|
|
361
|
+
break
|
|
362
|
+
if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
|
|
363
|
+
break
|
|
364
|
+
idx -= 1
|
|
365
|
+
|
|
366
|
+
if has_p:
|
|
367
|
+
self._close_p_element()
|
|
368
|
+
|
|
369
|
+
self._insert_element(current_token, push=False)
|
|
370
|
+
self.frameset_ok = False
|
|
371
|
+
result = None
|
|
379
372
|
else:
|
|
380
|
-
self.
|
|
373
|
+
handler = self._BODY_START_HANDLERS.get(name)
|
|
374
|
+
if handler:
|
|
375
|
+
result = handler(self, current_token)
|
|
376
|
+
else:
|
|
377
|
+
# Inline _handle_body_start_default
|
|
378
|
+
# Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
|
|
379
|
+
if self.active_formatting:
|
|
380
|
+
self._reconstruct_active_formatting_elements()
|
|
381
|
+
self._insert_element(current_token, push=True)
|
|
382
|
+
if current_token.self_closing:
|
|
383
|
+
self._parse_error(
|
|
384
|
+
"non-void-html-element-start-tag-with-trailing-solidus",
|
|
385
|
+
tag_name=current_token.name,
|
|
386
|
+
)
|
|
387
|
+
self.frameset_ok = False
|
|
388
|
+
result = None
|
|
389
|
+
else:
|
|
390
|
+
name = current_token.name
|
|
391
|
+
if name == "br":
|
|
392
|
+
self._parse_error("unexpected-end-tag", tag_name=name)
|
|
393
|
+
br_tag = Tag(0, "br", {}, False)
|
|
394
|
+
result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
|
|
395
|
+
elif name in FORMATTING_ELEMENTS:
|
|
396
|
+
self._adoption_agency(name)
|
|
381
397
|
result = None
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
# Only pop foreign elements if we're NOT at an HTML/MathML integration point
|
|
401
|
-
# and NOT about to insert a new foreign element (svg/math)
|
|
402
|
-
if not isinstance(current_token, EOFToken):
|
|
403
|
-
# Don't pop at integration points - they stay on stack to receive content
|
|
404
|
-
if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
|
|
405
|
-
pass
|
|
406
|
-
# Don't pop when inserting new svg/math elements
|
|
407
|
-
if isinstance(current_token, Tag) and current_token.kind == Tag.START:
|
|
408
|
-
# Optimization: Tokenizer already lowercases tag names
|
|
409
|
-
name_lower = current_token.name
|
|
410
|
-
if name_lower in {"svg", "math"}:
|
|
411
|
-
pass
|
|
412
|
-
|
|
413
|
-
# Special handling: text at integration points inserts directly, bypassing mode dispatch
|
|
414
|
-
if isinstance(current_token, CharacterTokens):
|
|
415
|
-
if self._is_mathml_text_integration_point(current):
|
|
416
|
-
# Tokenizer guarantees non-empty data
|
|
417
|
-
data = current_token.data
|
|
418
|
-
if "\x00" in data:
|
|
419
|
-
data = data.replace("\x00", "")
|
|
420
|
-
if data:
|
|
421
|
-
if not is_all_whitespace(data):
|
|
422
|
-
self._reconstruct_active_formatting_elements()
|
|
423
|
-
self.frameset_ok = False
|
|
424
|
-
self._append_text(data)
|
|
425
|
-
result = None
|
|
398
|
+
else:
|
|
399
|
+
handler = self._BODY_END_HANDLERS.get(name)
|
|
400
|
+
if handler:
|
|
401
|
+
result = handler(self, current_token)
|
|
402
|
+
else:
|
|
403
|
+
self._any_other_end_tag(name)
|
|
404
|
+
result = None
|
|
405
|
+
elif token_type is CharacterTokens:
|
|
406
|
+
# Inline _handle_characters_in_body
|
|
407
|
+
# Only non-whitespace data reaches here (whitespace handled in process_characters)
|
|
408
|
+
self.frameset_ok = False
|
|
409
|
+
self._reconstruct_active_formatting_elements()
|
|
410
|
+
self._append_text(current_token.data)
|
|
411
|
+
result = None
|
|
412
|
+
elif token_type is CommentToken:
|
|
413
|
+
result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
|
|
414
|
+
else: # EOFToken
|
|
415
|
+
result = self._handle_eof_in_body(current_token)
|
|
426
416
|
else:
|
|
427
417
|
result = mode_handlers[self.mode](self, current_token)
|
|
418
|
+
elif self._should_use_foreign_content(current_token):
|
|
419
|
+
result = self._process_foreign_content(current_token)
|
|
428
420
|
else:
|
|
429
|
-
#
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
#
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
421
|
+
# Foreign content stack logic
|
|
422
|
+
current = current_node
|
|
423
|
+
# Only pop foreign elements if we're NOT at an HTML/MathML integration point
|
|
424
|
+
# and NOT about to insert a new foreign element (svg/math)
|
|
425
|
+
if not isinstance(current_token, EOFToken):
|
|
426
|
+
# Don't pop at integration points - they stay on stack to receive content
|
|
427
|
+
if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
|
|
428
|
+
pass
|
|
429
|
+
# Don't pop when inserting new svg/math elements
|
|
430
|
+
if isinstance(current_token, Tag) and current_token.kind == Tag.START:
|
|
431
|
+
# Optimization: Tokenizer already lowercases tag names
|
|
432
|
+
name_lower = current_token.name
|
|
433
|
+
if name_lower in {"svg", "math"}:
|
|
434
|
+
pass
|
|
435
|
+
|
|
436
|
+
# Special handling: text at integration points inserts directly, bypassing mode dispatch
|
|
437
|
+
if isinstance(current_token, CharacterTokens):
|
|
438
|
+
if self._is_mathml_text_integration_point(current):
|
|
439
|
+
# Tokenizer guarantees non-empty data
|
|
440
|
+
data = current_token.data
|
|
441
|
+
if "\x00" in data:
|
|
442
|
+
data = data.replace("\x00", "")
|
|
443
|
+
if data:
|
|
444
|
+
if not is_all_whitespace(data):
|
|
445
|
+
self._reconstruct_active_formatting_elements()
|
|
446
|
+
self.frameset_ok = False
|
|
447
|
+
self._append_text(data)
|
|
448
|
+
result = None
|
|
455
449
|
else:
|
|
456
450
|
result = mode_handlers[self.mode](self, current_token)
|
|
457
451
|
else:
|
|
458
|
-
|
|
452
|
+
# At integration points inside foreign content, check if table tags make sense.
|
|
453
|
+
if (
|
|
454
|
+
(
|
|
455
|
+
self._is_mathml_text_integration_point(current)
|
|
456
|
+
or self._is_html_integration_point(current)
|
|
457
|
+
)
|
|
458
|
+
and isinstance(current_token, Tag)
|
|
459
|
+
and current_token.kind == Tag.START
|
|
460
|
+
and self.mode not in {InsertionMode.IN_BODY}
|
|
461
|
+
):
|
|
462
|
+
# Check if we're in a table mode but without an actual table in scope
|
|
463
|
+
# If so, table tags should be ignored (use IN_BODY mode)
|
|
464
|
+
is_table_mode = self.mode in {
|
|
465
|
+
InsertionMode.IN_TABLE,
|
|
466
|
+
InsertionMode.IN_TABLE_BODY,
|
|
467
|
+
InsertionMode.IN_ROW,
|
|
468
|
+
InsertionMode.IN_CELL,
|
|
469
|
+
InsertionMode.IN_CAPTION,
|
|
470
|
+
InsertionMode.IN_COLUMN_GROUP,
|
|
471
|
+
}
|
|
472
|
+
has_table_in_scope = self._has_in_table_scope("table")
|
|
473
|
+
if is_table_mode and not has_table_in_scope:
|
|
474
|
+
# Temporarily use IN_BODY mode for this tag
|
|
475
|
+
saved_mode = self.mode
|
|
476
|
+
self.mode = InsertionMode.IN_BODY
|
|
477
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
478
|
+
# Restore mode if no mode change was requested
|
|
479
|
+
if self.mode == InsertionMode.IN_BODY: # pragma: no branch
|
|
480
|
+
self.mode = saved_mode
|
|
481
|
+
else:
|
|
482
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
483
|
+
else:
|
|
484
|
+
result = mode_handlers[self.mode](self, current_token)
|
|
459
485
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
486
|
+
if result is None:
|
|
487
|
+
result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
|
|
488
|
+
self.tokenizer_state_override = None
|
|
489
|
+
return result_to_return
|
|
490
|
+
# Result is (instruction, mode, token) or (instruction, mode, token, force_html)
|
|
491
|
+
_instruction, mode, token_override = result[0], result[1], result[2]
|
|
492
|
+
if len(result) == 4:
|
|
493
|
+
force_html_mode = result[3]
|
|
494
|
+
# All mode handlers that return a tuple use "reprocess" instruction
|
|
495
|
+
self.mode = mode
|
|
496
|
+
current_token = token_override
|
|
497
|
+
# Continue loop to reprocess
|
|
498
|
+
finally:
|
|
499
|
+
self._pending_end_tag_name = None
|
|
500
|
+
self._pending_end_tag_start = None
|
|
501
|
+
self._pending_end_tag_end = None
|
|
472
502
|
|
|
473
503
|
def finish(self) -> SimpleDomNode:
|
|
474
504
|
if self.fragment_context is not None:
|
|
@@ -490,6 +520,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
490
520
|
# Populate selectedcontent elements per HTML5 spec
|
|
491
521
|
self._populate_selectedcontent(self.document)
|
|
492
522
|
|
|
523
|
+
if self.tokenizer is not None and self.track_tag_spans: # pragma: no branch
|
|
524
|
+
self.document._source_html = self.tokenizer.buffer
|
|
525
|
+
|
|
493
526
|
return self.document
|
|
494
527
|
|
|
495
528
|
# Insertion mode dispatch ------------------------------------------------
|
|
@@ -599,6 +632,10 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
599
632
|
node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
600
633
|
else:
|
|
601
634
|
node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
|
|
635
|
+
if self.track_tag_spans:
|
|
636
|
+
node._start_tag_start = tag.start_pos
|
|
637
|
+
node._start_tag_end = tag.end_pos
|
|
638
|
+
node._self_closing = bool(getattr(tag, "self_closing", False))
|
|
602
639
|
|
|
603
640
|
if self.tokenizer is not None and self.tokenizer.track_node_locations:
|
|
604
641
|
node._origin_pos = tag.start_pos
|
|
@@ -647,8 +684,23 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
647
684
|
ns = namespace or "html"
|
|
648
685
|
return ElementNode(name, attrs, ns)
|
|
649
686
|
|
|
687
|
+
def _maybe_mark_end_tag(self, node: Any) -> None:
|
|
688
|
+
if self._pending_end_tag_name is None:
|
|
689
|
+
return
|
|
690
|
+
if getattr(node, "name", None) != self._pending_end_tag_name:
|
|
691
|
+
return
|
|
692
|
+
node._end_tag_present = True
|
|
693
|
+
if self.track_tag_spans:
|
|
694
|
+
node._end_tag_start = self._pending_end_tag_start
|
|
695
|
+
node._end_tag_end = self._pending_end_tag_end
|
|
696
|
+
self._pending_end_tag_name = None
|
|
697
|
+
self._pending_end_tag_start = None
|
|
698
|
+
self._pending_end_tag_end = None
|
|
699
|
+
|
|
650
700
|
def _pop_current(self) -> Any:
|
|
651
|
-
|
|
701
|
+
node = self.open_elements.pop()
|
|
702
|
+
self._maybe_mark_end_tag(node)
|
|
703
|
+
return node
|
|
652
704
|
|
|
653
705
|
def _in_scope(self, name: str) -> bool:
|
|
654
706
|
return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
|
|
@@ -660,6 +712,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
660
712
|
index = len(self.open_elements) - 1
|
|
661
713
|
while index >= 0: # pragma: no branch
|
|
662
714
|
if self.open_elements[index].name == name:
|
|
715
|
+
self._maybe_mark_end_tag(self.open_elements[index])
|
|
663
716
|
del self.open_elements[index:]
|
|
664
717
|
return
|
|
665
718
|
index -= 1
|
|
@@ -677,6 +730,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
677
730
|
# If current node is not this node, parse error
|
|
678
731
|
if index != len(self.open_elements) - 1:
|
|
679
732
|
self._parse_error("end-tag-too-early")
|
|
733
|
+
self._maybe_mark_end_tag(node)
|
|
680
734
|
# Pop all elements from this node onwards
|
|
681
735
|
del self.open_elements[index:]
|
|
682
736
|
return
|
|
@@ -700,6 +754,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
700
754
|
def _remove_from_open_elements(self, node: Any) -> bool:
|
|
701
755
|
for index, current in enumerate(self.open_elements):
|
|
702
756
|
if current is node:
|
|
757
|
+
self._maybe_mark_end_tag(current)
|
|
703
758
|
del self.open_elements[index]
|
|
704
759
|
return True
|
|
705
760
|
return False
|
|
@@ -772,6 +827,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
772
827
|
def _remove_last_open_element_by_name(self, name: str) -> None:
|
|
773
828
|
for index in range(len(self.open_elements) - 1, -1, -1):
|
|
774
829
|
if self.open_elements[index].name == name:
|
|
830
|
+
self._maybe_mark_end_tag(self.open_elements[index])
|
|
775
831
|
del self.open_elements[index]
|
|
776
832
|
return
|
|
777
833
|
|
|
@@ -847,14 +903,14 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
847
903
|
node = self.open_elements[-1]
|
|
848
904
|
if node.name in names and node.namespace in {None, "html"}:
|
|
849
905
|
break
|
|
850
|
-
self.
|
|
906
|
+
self._pop_current()
|
|
851
907
|
|
|
852
908
|
def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
|
|
853
909
|
# Always terminates: html is not in IMPLIED_END_TAGS
|
|
854
910
|
while self.open_elements: # pragma: no branch
|
|
855
911
|
node = self.open_elements[-1]
|
|
856
912
|
if node.name in IMPLIED_END_TAGS and node.name != exclude:
|
|
857
|
-
self.
|
|
913
|
+
self._pop_current()
|
|
858
914
|
continue
|
|
859
915
|
break
|
|
860
916
|
|
|
@@ -873,7 +929,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
873
929
|
def _end_table_cell(self, name: str) -> None:
|
|
874
930
|
self._generate_implied_end_tags(name)
|
|
875
931
|
while self.open_elements:
|
|
876
|
-
node = self.
|
|
932
|
+
node = self._pop_current()
|
|
877
933
|
if node.name == name and node.namespace in {None, "html"}:
|
|
878
934
|
break
|
|
879
935
|
self._clear_active_formatting_up_to_marker()
|
|
@@ -910,7 +966,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
910
966
|
self._generate_implied_end_tags()
|
|
911
967
|
# Table verified in scope above
|
|
912
968
|
while self.open_elements: # pragma: no branch
|
|
913
|
-
node = self.
|
|
969
|
+
node = self._pop_current()
|
|
914
970
|
if node.name == "table":
|
|
915
971
|
break
|
|
916
972
|
self._reset_insertion_mode()
|
|
@@ -1023,7 +1079,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1023
1079
|
def _adjusted_current_node(self) -> Any:
|
|
1024
1080
|
return self.open_elements[-1]
|
|
1025
1081
|
|
|
1026
|
-
def _should_use_foreign_content(self, token:
|
|
1082
|
+
def _should_use_foreign_content(self, token: AnyToken) -> bool:
|
|
1027
1083
|
current = self._adjusted_current_node()
|
|
1028
1084
|
# HTML namespace elements don't use foreign content rules
|
|
1029
1085
|
# (unreachable in practice as foreign content mode only entered for foreign elements)
|
|
@@ -1070,9 +1126,9 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1070
1126
|
return
|
|
1071
1127
|
if self.fragment_context_element is not None and node is self.fragment_context_element:
|
|
1072
1128
|
return
|
|
1073
|
-
self.
|
|
1129
|
+
self._pop_current()
|
|
1074
1130
|
|
|
1075
|
-
def _process_foreign_content(self, token:
|
|
1131
|
+
def _process_foreign_content(self, token: AnyToken) -> Any | None:
|
|
1076
1132
|
current = self._adjusted_current_node()
|
|
1077
1133
|
|
|
1078
1134
|
if isinstance(token, CharacterTokens):
|
|
@@ -1147,6 +1203,7 @@ class TreeBuilder(TreeBuilderModesMixin):
|
|
|
1147
1203
|
if is_html:
|
|
1148
1204
|
return ("reprocess", self.mode, token, True)
|
|
1149
1205
|
# Otherwise it's a foreign element - pop everything from this point up
|
|
1206
|
+
self._maybe_mark_end_tag(node)
|
|
1150
1207
|
del self.open_elements[idx:]
|
|
1151
1208
|
return None
|
|
1152
1209
|
|