justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/treebuilder.py CHANGED
@@ -26,7 +26,7 @@ from .constants import (
26
26
  )
27
27
  from .errors import generate_error_message
28
28
  from .node import ElementNode, SimpleDomNode, TemplateNode, TextNode
29
- from .tokens import CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
29
+ from .tokens import AnyToken, CharacterTokens, CommentToken, DoctypeToken, EOFToken, ParseError, Tag, TokenSinkResult
30
30
  from .treebuilder_modes import TreeBuilderModesMixin
31
31
  from .treebuilder_utils import (
32
32
  InsertionMode,
@@ -43,6 +43,9 @@ class TreeBuilder(TreeBuilderModesMixin):
43
43
  "_body_start_handlers",
44
44
  "_body_token_handlers",
45
45
  "_mode_handlers",
46
+ "_pending_end_tag_end",
47
+ "_pending_end_tag_name",
48
+ "_pending_end_tag_start",
46
49
  "active_formatting",
47
50
  "collect_errors",
48
51
  "document",
@@ -59,17 +62,23 @@ class TreeBuilder(TreeBuilderModesMixin):
59
62
  "open_elements",
60
63
  "original_mode",
61
64
  "pending_table_text",
65
+ "pending_table_text_should_error",
62
66
  "quirks_mode",
63
67
  "table_text_original_mode",
64
68
  "template_modes",
65
69
  "tokenizer",
66
70
  "tokenizer_state_override",
71
+ "track_tag_spans",
67
72
  )
68
73
 
69
74
  _body_end_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
70
75
  _body_start_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
71
76
  _body_token_handlers: dict[str, Callable[[TreeBuilder, Any], Any]]
72
77
  _mode_handlers: dict[InsertionMode, Callable[[TreeBuilder, Any], Any]]
78
+ _pending_end_tag_name: str | None
79
+ _pending_end_tag_start: int | None
80
+ _pending_end_tag_end: int | None
81
+ track_tag_spans: bool
73
82
  active_formatting: list[Any]
74
83
  collect_errors: bool
75
84
  document: SimpleDomNode
@@ -86,6 +95,7 @@ class TreeBuilder(TreeBuilderModesMixin):
86
95
  open_elements: list[Any]
87
96
  original_mode: InsertionMode | None # type: ignore[assignment]
88
97
  pending_table_text: list[str]
98
+ pending_table_text_should_error: bool
89
99
  quirks_mode: str
90
100
  table_text_original_mode: InsertionMode | None # type: ignore[assignment]
91
101
  template_modes: list[InsertionMode]
@@ -97,10 +107,12 @@ class TreeBuilder(TreeBuilderModesMixin):
97
107
  fragment_context: Any | None = None,
98
108
  iframe_srcdoc: bool = False,
99
109
  collect_errors: bool = False,
110
+ track_tag_spans: bool = False,
100
111
  ) -> None:
101
112
  self.fragment_context = fragment_context
102
113
  self.iframe_srcdoc = iframe_srcdoc
103
114
  self.collect_errors = collect_errors
115
+ self.track_tag_spans = bool(track_tag_spans)
104
116
  self.errors = []
105
117
  self.tokenizer = None # Set by parser after tokenizer is created
106
118
  self.fragment_context_element = None
@@ -112,12 +124,16 @@ class TreeBuilder(TreeBuilderModesMixin):
112
124
  self.original_mode = None
113
125
  self.table_text_original_mode = None
114
126
  self.open_elements = []
127
+ self._pending_end_tag_name = None
128
+ self._pending_end_tag_start = None
129
+ self._pending_end_tag_end = None
115
130
  self.head_element = None
116
131
  self.form_element = None
117
132
  self.frameset_ok = True
118
133
  self.quirks_mode = "no-quirks"
119
134
  self.ignore_lf = False
120
135
  self.active_formatting = []
136
+ self.pending_table_text_should_error = False
121
137
  self.insert_from_table = False
122
138
  self.pending_table_text = []
123
139
  self.template_modes = []
@@ -169,7 +185,7 @@ class TreeBuilder(TreeBuilderModesMixin):
169
185
  def _set_quirks_mode(self, mode: str) -> None:
170
186
  self.quirks_mode = mode
171
187
 
172
- def _parse_error(self, code: str, tag_name: str | None = None, token: Any = None) -> None:
188
+ def _parse_error(self, code: str, tag_name: str | None = None, token: AnyToken | None = None) -> None:
173
189
  if not self.collect_errors:
174
190
  return
175
191
  # Use the position of the last emitted token (set by tokenizer before emit)
@@ -206,6 +222,7 @@ class TreeBuilder(TreeBuilderModesMixin):
206
222
  code,
207
223
  line=line,
208
224
  column=column,
225
+ category="treebuilder",
209
226
  message=message,
210
227
  source_html=source_html,
211
228
  end_column=end_column,
@@ -236,14 +253,14 @@ class TreeBuilder(TreeBuilderModesMixin):
236
253
  def _pop_until_inclusive(self, name: str) -> None:
237
254
  # Callers ensure element exists on stack
238
255
  while self.open_elements: # pragma: no branch
239
- node = self.open_elements.pop()
256
+ node = self._pop_current()
240
257
  if node.name == name:
241
258
  break
242
259
 
243
260
  def _pop_until_any_inclusive(self, names: set[str]) -> None:
244
261
  # Pop elements until we find one in names (callers ensure element exists)
245
262
  while self.open_elements:
246
- node = self.open_elements.pop()
263
+ node = self._pop_current()
247
264
  if node.name in names:
248
265
  return
249
266
 
@@ -251,7 +268,7 @@ class TreeBuilder(TreeBuilderModesMixin):
251
268
  if self._has_element_in_button_scope("p"):
252
269
  self._generate_implied_end_tags("p")
253
270
  if self.open_elements[-1].name != "p":
254
- self._parse_error("end-tag-too-early", tag_name="p")
271
+ self._parse_error("unexpected-end-tag", tag_name="p")
255
272
  self._pop_until_inclusive("p")
256
273
  return True
257
274
  return False
@@ -270,206 +287,218 @@ class TreeBuilder(TreeBuilderModesMixin):
270
287
 
271
288
  current_token = token
272
289
  force_html_mode = False
290
+ if token_type is Tag and token.kind == Tag.END:
291
+ self._pending_end_tag_name = token.name
292
+ if self.track_tag_spans:
293
+ self._pending_end_tag_start = token.start_pos
294
+ self._pending_end_tag_end = token.end_pos
295
+ else:
296
+ self._pending_end_tag_start = None
297
+ self._pending_end_tag_end = None
273
298
 
274
299
  # Cache mode handlers list for speed
275
300
  mode_handlers = self._MODE_HANDLERS
276
301
 
277
- while True:
278
- # Update token type for current token (it might have changed if reprocessed)
279
- token_type = type(current_token)
280
-
281
- # Optimization: Check for HTML namespace first (common case)
282
- current_node = self.open_elements[-1] if self.open_elements else None
283
- is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
284
-
285
- if force_html_mode or is_html_namespace:
286
- force_html_mode = False
287
- if self.mode == InsertionMode.IN_BODY:
288
- # Inline _mode_in_body for performance
289
- if token_type is Tag:
290
- # Inline _handle_tag_in_body
291
- if current_token.kind == 0: # Tag.START
292
- name = current_token.name
293
- if name == "div" or name == "ul" or name == "ol":
294
- # Inline _handle_body_start_block_with_p
295
- # Check if p is in button scope (html always terminates)
296
- has_p = False
297
- idx = len(self.open_elements) - 1
298
- while idx >= 0: # pragma: no branch
299
- node = self.open_elements[idx]
300
- if node.name == "p":
301
- has_p = True
302
- break
303
- if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
304
- break
305
- idx -= 1
306
-
307
- if has_p:
308
- self._close_p_element()
309
-
310
- self._insert_element(current_token, push=True)
311
- result = None
312
- elif name == "p":
313
- result = self._handle_body_start_paragraph(current_token)
314
- elif name == "span":
315
- if self.active_formatting:
316
- self._reconstruct_active_formatting_elements()
317
- self._insert_element(current_token, push=True)
318
- self.frameset_ok = False
319
- result = None
320
- elif name == "a":
321
- result = self._handle_body_start_a(current_token)
322
- elif name == "br" or name == "img":
323
- if self.active_formatting:
324
- self._reconstruct_active_formatting_elements()
325
- self._insert_element(current_token, push=False)
326
- self.frameset_ok = False
327
- result = None
328
- elif name == "hr":
329
- has_p = False
330
- idx = len(self.open_elements) - 1
331
- while idx >= 0: # pragma: no branch
332
- node = self.open_elements[idx]
333
- if node.name == "p":
334
- has_p = True
335
- break
336
- if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
337
- break
338
- idx -= 1
339
-
340
- if has_p:
341
- self._close_p_element()
342
-
343
- self._insert_element(current_token, push=False)
344
- self.frameset_ok = False
345
- result = None
346
- else:
347
- handler = self._BODY_START_HANDLERS.get(name)
348
- if handler:
349
- result = handler(self, current_token)
350
- else:
351
- # Inline _handle_body_start_default
352
- # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
302
+ try:
303
+ while True:
304
+ # Update token type for current token (it might have changed if reprocessed)
305
+ token_type = type(current_token)
306
+
307
+ # Optimization: Check for HTML namespace first (common case)
308
+ current_node = self.open_elements[-1] if self.open_elements else None
309
+ is_html_namespace = current_node is None or current_node.namespace in {None, "html"}
310
+
311
+ if force_html_mode or is_html_namespace:
312
+ force_html_mode = False
313
+ if self.mode == InsertionMode.IN_BODY:
314
+ # Inline _mode_in_body for performance
315
+ if token_type is Tag:
316
+ # Inline _handle_tag_in_body
317
+ if current_token.kind == 0: # Tag.START
318
+ name = current_token.name
319
+ if name == "div" or name == "ul" or name == "ol":
320
+ # Inline _handle_body_start_block_with_p
321
+ # Check if p is in button scope (html always terminates)
322
+ has_p = False
323
+ idx = len(self.open_elements) - 1
324
+ while idx >= 0: # pragma: no branch
325
+ node = self.open_elements[idx]
326
+ if node.name == "p":
327
+ has_p = True
328
+ break
329
+ if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
330
+ break
331
+ idx -= 1
332
+
333
+ if has_p:
334
+ self._close_p_element()
335
+
336
+ self._insert_element(current_token, push=True)
337
+ result = None
338
+ elif name == "p":
339
+ result = self._handle_body_start_paragraph(current_token) # type: ignore[func-returns-value]
340
+ elif name == "span":
353
341
  if self.active_formatting:
354
342
  self._reconstruct_active_formatting_elements()
355
343
  self._insert_element(current_token, push=True)
356
- if current_token.self_closing:
357
- self._parse_error(
358
- "non-void-html-element-start-tag-with-trailing-solidus",
359
- tag_name=current_token.name,
360
- )
361
344
  self.frameset_ok = False
362
345
  result = None
363
- else:
364
- name = current_token.name
365
- if name == "br":
366
- self._parse_error("unexpected-end-tag", tag_name=name)
367
- br_tag = Tag(0, "br", {}, False)
368
- result = self._handle_body_start_br(br_tag)
369
- elif name in FORMATTING_ELEMENTS:
370
- self._adoption_agency(name)
371
- result = None
372
- else:
373
- handler = self._BODY_END_HANDLERS.get(name)
374
- if handler:
375
- result = handler(self, current_token)
346
+ elif name == "a":
347
+ result = self._handle_body_start_a(current_token) # type: ignore[func-returns-value]
348
+ elif name == "br" or name == "img":
349
+ if self.active_formatting:
350
+ self._reconstruct_active_formatting_elements()
351
+ self._insert_element(current_token, push=False)
352
+ self.frameset_ok = False
353
+ result = None
354
+ elif name == "hr":
355
+ has_p = False
356
+ idx = len(self.open_elements) - 1
357
+ while idx >= 0: # pragma: no branch
358
+ node = self.open_elements[idx]
359
+ if node.name == "p":
360
+ has_p = True
361
+ break
362
+ if node.namespace in {None, "html"} and node.name in BUTTON_SCOPE_TERMINATORS:
363
+ break
364
+ idx -= 1
365
+
366
+ if has_p:
367
+ self._close_p_element()
368
+
369
+ self._insert_element(current_token, push=False)
370
+ self.frameset_ok = False
371
+ result = None
376
372
  else:
377
- self._any_other_end_tag(name)
373
+ handler = self._BODY_START_HANDLERS.get(name)
374
+ if handler:
375
+ result = handler(self, current_token)
376
+ else:
377
+ # Inline _handle_body_start_default
378
+ # Elements here have no special handler - never in FRAMESET_NEUTRAL/FORMATTING_ELEMENTS
379
+ if self.active_formatting:
380
+ self._reconstruct_active_formatting_elements()
381
+ self._insert_element(current_token, push=True)
382
+ if current_token.self_closing:
383
+ self._parse_error(
384
+ "non-void-html-element-start-tag-with-trailing-solidus",
385
+ tag_name=current_token.name,
386
+ )
387
+ self.frameset_ok = False
388
+ result = None
389
+ else:
390
+ name = current_token.name
391
+ if name == "br":
392
+ self._parse_error("unexpected-end-tag", tag_name=name)
393
+ br_tag = Tag(0, "br", {}, False)
394
+ result = self._handle_body_start_br(br_tag) # type: ignore[func-returns-value]
395
+ elif name in FORMATTING_ELEMENTS:
396
+ self._adoption_agency(name)
378
397
  result = None
379
- elif token_type is CharacterTokens:
380
- # Inline _handle_characters_in_body
381
- # Only non-whitespace data reaches here (whitespace handled in process_characters)
382
- self.frameset_ok = False
383
- self._reconstruct_active_formatting_elements()
384
- self._append_text(current_token.data)
385
- result = None
386
- elif token_type is CommentToken:
387
- result = self._handle_comment_in_body(current_token)
388
- else: # EOFToken
389
- result = self._handle_eof_in_body(current_token)
390
- else:
391
- result = mode_handlers[self.mode](self, current_token)
392
- elif self._should_use_foreign_content(current_token):
393
- result = self._process_foreign_content(current_token)
394
- else:
395
- # Foreign content stack logic
396
- current = current_node
397
- # Only pop foreign elements if we're NOT at an HTML/MathML integration point
398
- # and NOT about to insert a new foreign element (svg/math)
399
- if not isinstance(current_token, EOFToken):
400
- # Don't pop at integration points - they stay on stack to receive content
401
- if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
402
- pass
403
- # Don't pop when inserting new svg/math elements
404
- if isinstance(current_token, Tag) and current_token.kind == Tag.START:
405
- # Optimization: Tokenizer already lowercases tag names
406
- name_lower = current_token.name
407
- if name_lower in {"svg", "math"}:
408
- pass
409
-
410
- # Special handling: text at integration points inserts directly, bypassing mode dispatch
411
- if isinstance(current_token, CharacterTokens):
412
- if self._is_mathml_text_integration_point(current):
413
- # Tokenizer guarantees non-empty data
414
- data = current_token.data
415
- if "\x00" in data:
416
- self._parse_error("invalid-codepoint")
417
- data = data.replace("\x00", "")
418
- if "\x0c" in data:
419
- self._parse_error("invalid-codepoint")
420
- data = data.replace("\x0c", "")
421
- if data:
422
- if not is_all_whitespace(data):
423
- self._reconstruct_active_formatting_elements()
424
- self.frameset_ok = False
425
- self._append_text(data)
426
- result = None
398
+ else:
399
+ handler = self._BODY_END_HANDLERS.get(name)
400
+ if handler:
401
+ result = handler(self, current_token)
402
+ else:
403
+ self._any_other_end_tag(name)
404
+ result = None
405
+ elif token_type is CharacterTokens:
406
+ # Inline _handle_characters_in_body
407
+ # Only non-whitespace data reaches here (whitespace handled in process_characters)
408
+ self.frameset_ok = False
409
+ self._reconstruct_active_formatting_elements()
410
+ self._append_text(current_token.data)
411
+ result = None
412
+ elif token_type is CommentToken:
413
+ result = self._handle_comment_in_body(current_token) # type: ignore[func-returns-value]
414
+ else: # EOFToken
415
+ result = self._handle_eof_in_body(current_token)
427
416
  else:
428
417
  result = mode_handlers[self.mode](self, current_token)
418
+ elif self._should_use_foreign_content(current_token):
419
+ result = self._process_foreign_content(current_token)
429
420
  else:
430
- # At integration points inside foreign content, check if table tags make sense.
431
- if (
432
- (self._is_mathml_text_integration_point(current) or self._is_html_integration_point(current))
433
- and isinstance(current_token, Tag)
434
- and current_token.kind == Tag.START
435
- and self.mode not in {InsertionMode.IN_BODY}
436
- ):
437
- # Check if we're in a table mode but without an actual table in scope
438
- # If so, table tags should be ignored (use IN_BODY mode)
439
- is_table_mode = self.mode in {
440
- InsertionMode.IN_TABLE,
441
- InsertionMode.IN_TABLE_BODY,
442
- InsertionMode.IN_ROW,
443
- InsertionMode.IN_CELL,
444
- InsertionMode.IN_CAPTION,
445
- InsertionMode.IN_COLUMN_GROUP,
446
- }
447
- has_table_in_scope = self._has_in_table_scope("table")
448
- if is_table_mode and not has_table_in_scope:
449
- # Temporarily use IN_BODY mode for this tag
450
- saved_mode = self.mode
451
- self.mode = InsertionMode.IN_BODY
452
- result = mode_handlers[self.mode](self, current_token)
453
- # Restore mode if no mode change was requested
454
- if self.mode == InsertionMode.IN_BODY: # pragma: no branch
455
- self.mode = saved_mode
421
+ # Foreign content stack logic
422
+ current = current_node
423
+ # Only pop foreign elements if we're NOT at an HTML/MathML integration point
424
+ # and NOT about to insert a new foreign element (svg/math)
425
+ if not isinstance(current_token, EOFToken):
426
+ # Don't pop at integration points - they stay on stack to receive content
427
+ if self._is_html_integration_point(current) or self._is_mathml_text_integration_point(current):
428
+ pass
429
+ # Don't pop when inserting new svg/math elements
430
+ if isinstance(current_token, Tag) and current_token.kind == Tag.START:
431
+ # Optimization: Tokenizer already lowercases tag names
432
+ name_lower = current_token.name
433
+ if name_lower in {"svg", "math"}:
434
+ pass
435
+
436
+ # Special handling: text at integration points inserts directly, bypassing mode dispatch
437
+ if isinstance(current_token, CharacterTokens):
438
+ if self._is_mathml_text_integration_point(current):
439
+ # Tokenizer guarantees non-empty data
440
+ data = current_token.data
441
+ if "\x00" in data:
442
+ data = data.replace("\x00", "")
443
+ if data:
444
+ if not is_all_whitespace(data):
445
+ self._reconstruct_active_formatting_elements()
446
+ self.frameset_ok = False
447
+ self._append_text(data)
448
+ result = None
456
449
  else:
457
450
  result = mode_handlers[self.mode](self, current_token)
458
451
  else:
459
- result = mode_handlers[self.mode](self, current_token)
452
+ # At integration points inside foreign content, check if table tags make sense.
453
+ if (
454
+ (
455
+ self._is_mathml_text_integration_point(current)
456
+ or self._is_html_integration_point(current)
457
+ )
458
+ and isinstance(current_token, Tag)
459
+ and current_token.kind == Tag.START
460
+ and self.mode not in {InsertionMode.IN_BODY}
461
+ ):
462
+ # Check if we're in a table mode but without an actual table in scope
463
+ # If so, table tags should be ignored (use IN_BODY mode)
464
+ is_table_mode = self.mode in {
465
+ InsertionMode.IN_TABLE,
466
+ InsertionMode.IN_TABLE_BODY,
467
+ InsertionMode.IN_ROW,
468
+ InsertionMode.IN_CELL,
469
+ InsertionMode.IN_CAPTION,
470
+ InsertionMode.IN_COLUMN_GROUP,
471
+ }
472
+ has_table_in_scope = self._has_in_table_scope("table")
473
+ if is_table_mode and not has_table_in_scope:
474
+ # Temporarily use IN_BODY mode for this tag
475
+ saved_mode = self.mode
476
+ self.mode = InsertionMode.IN_BODY
477
+ result = mode_handlers[self.mode](self, current_token)
478
+ # Restore mode if no mode change was requested
479
+ if self.mode == InsertionMode.IN_BODY: # pragma: no branch
480
+ self.mode = saved_mode
481
+ else:
482
+ result = mode_handlers[self.mode](self, current_token)
483
+ else:
484
+ result = mode_handlers[self.mode](self, current_token)
460
485
 
461
- if result is None:
462
- result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
463
- self.tokenizer_state_override = None
464
- return result_to_return
465
- # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
466
- _instruction, mode, token_override = result[0], result[1], result[2]
467
- if len(result) == 4:
468
- force_html_mode = result[3]
469
- # All mode handlers that return a tuple use "reprocess" instruction
470
- self.mode = mode
471
- current_token = token_override
472
- # Continue loop to reprocess
486
+ if result is None:
487
+ result_to_return = self.tokenizer_state_override or TokenSinkResult.Continue
488
+ self.tokenizer_state_override = None
489
+ return result_to_return
490
+ # Result is (instruction, mode, token) or (instruction, mode, token, force_html)
491
+ _instruction, mode, token_override = result[0], result[1], result[2]
492
+ if len(result) == 4:
493
+ force_html_mode = result[3]
494
+ # All mode handlers that return a tuple use "reprocess" instruction
495
+ self.mode = mode
496
+ current_token = token_override
497
+ # Continue loop to reprocess
498
+ finally:
499
+ self._pending_end_tag_name = None
500
+ self._pending_end_tag_start = None
501
+ self._pending_end_tag_end = None
473
502
 
474
503
  def finish(self) -> SimpleDomNode:
475
504
  if self.fragment_context is not None:
@@ -491,12 +520,19 @@ class TreeBuilder(TreeBuilderModesMixin):
491
520
  # Populate selectedcontent elements per HTML5 spec
492
521
  self._populate_selectedcontent(self.document)
493
522
 
523
+ if self.tokenizer is not None and self.track_tag_spans: # pragma: no branch
524
+ self.document._source_html = self.tokenizer.buffer
525
+
494
526
  return self.document
495
527
 
496
528
  # Insertion mode dispatch ------------------------------------------------
497
529
 
498
530
  def _append_comment_to_document(self, text: str) -> None:
499
531
  node = SimpleDomNode("#comment", data=text)
532
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
533
+ node._origin_pos = self.tokenizer.last_token_start_pos
534
+ if node._origin_pos is not None:
535
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
500
536
  self.document.append_child(node)
501
537
 
502
538
  def _append_comment(self, text: str, parent: Any | None = None) -> None:
@@ -506,6 +542,10 @@ class TreeBuilder(TreeBuilderModesMixin):
506
542
  if type(parent) is TemplateNode and parent.template_content:
507
543
  parent = parent.template_content
508
544
  node = SimpleDomNode("#comment", data=text)
545
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
546
+ node._origin_pos = self.tokenizer.last_token_start_pos
547
+ if node._origin_pos is not None:
548
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
509
549
  parent.append_child(node)
510
550
 
511
551
  def _append_text(self, text: str) -> None:
@@ -516,6 +556,9 @@ class TreeBuilder(TreeBuilderModesMixin):
516
556
  if not text:
517
557
  return
518
558
 
559
+ if "\f" in text:
560
+ text = text.replace("\f", " ")
561
+
519
562
  # Guard against empty stack
520
563
  if not self.open_elements: # pragma: no cover
521
564
  return
@@ -532,6 +575,10 @@ class TreeBuilder(TreeBuilderModesMixin):
532
575
  return
533
576
 
534
577
  node = TextNode(text)
578
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
579
+ node._origin_pos = self.tokenizer.last_token_start_pos
580
+ if node._origin_pos is not None:
581
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
535
582
  children.append(node)
536
583
  node.parent = target
537
584
  return
@@ -552,6 +599,10 @@ class TreeBuilder(TreeBuilderModesMixin):
552
599
  return
553
600
 
554
601
  node = TextNode(text)
602
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
603
+ node._origin_pos = self.tokenizer.last_token_start_pos
604
+ if node._origin_pos is not None:
605
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
555
606
  reference_node = parent.children[position] if position < len(parent.children) else None
556
607
  parent.insert_before(node, reference_node)
557
608
 
@@ -581,6 +632,15 @@ class TreeBuilder(TreeBuilderModesMixin):
581
632
  node = TemplateNode(tag.name, attrs=tag.attrs, namespace=namespace)
582
633
  else:
583
634
  node = ElementNode(tag.name, attrs=tag.attrs, namespace=namespace)
635
+ if self.track_tag_spans:
636
+ node._start_tag_start = tag.start_pos
637
+ node._start_tag_end = tag.end_pos
638
+ node._self_closing = bool(getattr(tag, "self_closing", False))
639
+
640
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
641
+ node._origin_pos = tag.start_pos
642
+ if node._origin_pos is not None:
643
+ node._origin_line, node._origin_col = self.tokenizer.location_at_pos(node._origin_pos)
584
644
 
585
645
  # Fast path for common case: not inserting from table
586
646
  if not self.insert_from_table:
@@ -624,8 +684,23 @@ class TreeBuilder(TreeBuilderModesMixin):
624
684
  ns = namespace or "html"
625
685
  return ElementNode(name, attrs, ns)
626
686
 
687
+ def _maybe_mark_end_tag(self, node: Any) -> None:
688
+ if self._pending_end_tag_name is None:
689
+ return
690
+ if getattr(node, "name", None) != self._pending_end_tag_name:
691
+ return
692
+ node._end_tag_present = True
693
+ if self.track_tag_spans:
694
+ node._end_tag_start = self._pending_end_tag_start
695
+ node._end_tag_end = self._pending_end_tag_end
696
+ self._pending_end_tag_name = None
697
+ self._pending_end_tag_start = None
698
+ self._pending_end_tag_end = None
699
+
627
700
  def _pop_current(self) -> Any:
628
- return self.open_elements.pop()
701
+ node = self.open_elements.pop()
702
+ self._maybe_mark_end_tag(node)
703
+ return node
629
704
 
630
705
  def _in_scope(self, name: str) -> bool:
631
706
  return self._has_element_in_scope(name, DEFAULT_SCOPE_TERMINATORS)
@@ -637,6 +712,7 @@ class TreeBuilder(TreeBuilderModesMixin):
637
712
  index = len(self.open_elements) - 1
638
713
  while index >= 0: # pragma: no branch
639
714
  if self.open_elements[index].name == name:
715
+ self._maybe_mark_end_tag(self.open_elements[index])
640
716
  del self.open_elements[index:]
641
717
  return
642
718
  index -= 1
@@ -654,6 +730,7 @@ class TreeBuilder(TreeBuilderModesMixin):
654
730
  # If current node is not this node, parse error
655
731
  if index != len(self.open_elements) - 1:
656
732
  self._parse_error("end-tag-too-early")
733
+ self._maybe_mark_end_tag(node)
657
734
  # Pop all elements from this node onwards
658
735
  del self.open_elements[index:]
659
736
  return
@@ -677,6 +754,7 @@ class TreeBuilder(TreeBuilderModesMixin):
677
754
  def _remove_from_open_elements(self, node: Any) -> bool:
678
755
  for index, current in enumerate(self.open_elements):
679
756
  if current is node:
757
+ self._maybe_mark_end_tag(current)
680
758
  del self.open_elements[index]
681
759
  return True
682
760
  return False
@@ -749,6 +827,7 @@ class TreeBuilder(TreeBuilderModesMixin):
749
827
  def _remove_last_open_element_by_name(self, name: str) -> None:
750
828
  for index in range(len(self.open_elements) - 1, -1, -1):
751
829
  if self.open_elements[index].name == name:
830
+ self._maybe_mark_end_tag(self.open_elements[index])
752
831
  del self.open_elements[index]
753
832
  return
754
833
 
@@ -799,6 +878,10 @@ class TreeBuilder(TreeBuilderModesMixin):
799
878
  entry = self.active_formatting[index]
800
879
  tag = Tag(Tag.START, entry["name"], self._clone_attributes(entry["attrs"]), False)
801
880
  new_node = self._insert_element(tag, push=True)
881
+ if self.tokenizer is not None and self.tokenizer.track_node_locations:
882
+ new_node._origin_pos = entry["node"].origin_offset
883
+ new_node._origin_line = entry["node"].origin_line
884
+ new_node._origin_col = entry["node"].origin_col
802
885
  entry["node"] = new_node
803
886
  index += 1
804
887
 
@@ -820,14 +903,14 @@ class TreeBuilder(TreeBuilderModesMixin):
820
903
  node = self.open_elements[-1]
821
904
  if node.name in names and node.namespace in {None, "html"}:
822
905
  break
823
- self.open_elements.pop()
906
+ self._pop_current()
824
907
 
825
908
  def _generate_implied_end_tags(self, exclude: str | None = None) -> None:
826
909
  # Always terminates: html is not in IMPLIED_END_TAGS
827
910
  while self.open_elements: # pragma: no branch
828
911
  node = self.open_elements[-1]
829
912
  if node.name in IMPLIED_END_TAGS and node.name != exclude:
830
- self.open_elements.pop()
913
+ self._pop_current()
831
914
  continue
832
915
  break
833
916
 
@@ -846,7 +929,7 @@ class TreeBuilder(TreeBuilderModesMixin):
846
929
  def _end_table_cell(self, name: str) -> None:
847
930
  self._generate_implied_end_tags(name)
848
931
  while self.open_elements:
849
- node = self.open_elements.pop()
932
+ node = self._pop_current()
850
933
  if node.name == name and node.namespace in {None, "html"}:
851
934
  break
852
935
  self._clear_active_formatting_up_to_marker()
@@ -855,12 +938,19 @@ class TreeBuilder(TreeBuilderModesMixin):
855
938
  def _flush_pending_table_text(self) -> None:
856
939
  data = "".join(self.pending_table_text)
857
940
  self.pending_table_text.clear()
858
- if not data:
941
+ if not data: # pragma: no cover
859
942
  return
860
943
  if is_all_whitespace(data):
861
944
  self._append_text(data)
862
945
  return
863
- self._parse_error("foster-parenting-character")
946
+
947
+ if self.pending_table_text_should_error:
948
+ # html5lib reports one foster-parenting error per non-whitespace character.
949
+ for ch in data:
950
+ if ch not in " \t\n\r\f":
951
+ self._parse_error("foster-parenting-character")
952
+ self.pending_table_text_should_error = False
953
+
864
954
  previous = self.insert_from_table
865
955
  self.insert_from_table = True
866
956
  try:
@@ -876,7 +966,7 @@ class TreeBuilder(TreeBuilderModesMixin):
876
966
  self._generate_implied_end_tags()
877
967
  # Table verified in scope above
878
968
  while self.open_elements: # pragma: no branch
879
- node = self.open_elements.pop()
969
+ node = self._pop_current()
880
970
  if node.name == "table":
881
971
  break
882
972
  self._reset_insertion_mode()
@@ -989,7 +1079,7 @@ class TreeBuilder(TreeBuilderModesMixin):
989
1079
  def _adjusted_current_node(self) -> Any:
990
1080
  return self.open_elements[-1]
991
1081
 
992
- def _should_use_foreign_content(self, token: Any) -> bool:
1082
+ def _should_use_foreign_content(self, token: AnyToken) -> bool:
993
1083
  current = self._adjusted_current_node()
994
1084
  # HTML namespace elements don't use foreign content rules
995
1085
  # (unreachable in practice as foreign content mode only entered for foreign elements)
@@ -1036,9 +1126,9 @@ class TreeBuilder(TreeBuilderModesMixin):
1036
1126
  return
1037
1127
  if self.fragment_context_element is not None and node is self.fragment_context_element:
1038
1128
  return
1039
- self.open_elements.pop()
1129
+ self._pop_current()
1040
1130
 
1041
- def _process_foreign_content(self, token: Any) -> Any | None:
1131
+ def _process_foreign_content(self, token: AnyToken) -> Any | None:
1042
1132
  current = self._adjusted_current_node()
1043
1133
 
1044
1134
  if isinstance(token, CharacterTokens):
@@ -1113,12 +1203,13 @@ class TreeBuilder(TreeBuilderModesMixin):
1113
1203
  if is_html:
1114
1204
  return ("reprocess", self.mode, token, True)
1115
1205
  # Otherwise it's a foreign element - pop everything from this point up
1206
+ self._maybe_mark_end_tag(node)
1116
1207
  del self.open_elements[idx:]
1117
1208
  return None
1118
1209
 
1119
1210
  # Per HTML5 spec: if first node doesn't match, it's a parse error
1120
1211
  if first:
1121
- self._parse_error("unexpected-end-tag-in-foreign-content", tag_name=token.name)
1212
+ self._parse_error("unexpected-end-tag", tag_name=token.name)
1122
1213
  first = False
1123
1214
 
1124
1215
  # If we hit an HTML element that doesn't match, process in secondary mode
@@ -1259,19 +1350,21 @@ class TreeBuilder(TreeBuilderModesMixin):
1259
1350
  return self.process_token(CharacterTokens(data))
1260
1351
 
1261
1352
  if self.mode == InsertionMode.IN_BODY:
1262
- if "\x00" in data:
1263
- self._parse_error("invalid-codepoint")
1264
- data = data.replace("\x00", "")
1265
-
1266
1353
  if not data:
1267
1354
  return TokenSinkResult.Continue
1355
+ if "\x00" in data:
1356
+ data = data.replace("\x00", "")
1357
+ if not data:
1358
+ return TokenSinkResult.Continue
1268
1359
 
1269
1360
  if is_all_whitespace(data):
1270
- self._reconstruct_active_formatting_elements()
1361
+ if self.active_formatting:
1362
+ self._reconstruct_active_formatting_elements()
1271
1363
  self._append_text(data)
1272
1364
  return TokenSinkResult.Continue
1273
1365
 
1274
- self._reconstruct_active_formatting_elements()
1366
+ if self.active_formatting:
1367
+ self._reconstruct_active_formatting_elements()
1275
1368
  self.frameset_ok = False
1276
1369
  self._append_text(data)
1277
1370
  return TokenSinkResult.Continue