justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of justhtml has been flagged as potentially problematic.

justhtml/tokenizer.py CHANGED
@@ -9,9 +9,9 @@ if TYPE_CHECKING:
 
 from .entities import decode_entities_in_text
 from .errors import generate_error_message
-from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
+from .tokens import AnyToken, CharacterTokens, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
 
-_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
+_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
 _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
 _RCDATA_ELEMENTS = {"title", "textarea"}
 _RAWTEXT_SWITCH_TAGS = {
@@ -29,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
 _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
 _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
 
-_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
-_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
+_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
+_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
 _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
 _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
 
@@ -44,6 +44,13 @@ for _plane in range(17):
 _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
 
 
+def _is_noncharacter_codepoint(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
 def _xml_coercion_callback(match: re.Match[str]) -> str:
     if match.group(0) == "\f":
         return " "
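
Note: the new `_is_noncharacter_codepoint` helper encodes the Unicode noncharacter set — U+FDD0–U+FDEF plus the last two code points of each of the 17 planes (`xxFFFE`/`xxFFFF`). A standalone sketch of the same predicate:

```python
def is_noncharacter(codepoint: int) -> bool:
    # U+FDD0..U+FDEF is a contiguous noncharacter block in the BMP.
    if 0xFDD0 <= codepoint <= 0xFDEF:
        return True
    # The last two code points of every plane (xxFFFE / xxFFFF) are noncharacters.
    return codepoint & 0xFFFF in (0xFFFE, 0xFFFF)

assert is_noncharacter(0xFDD0)
assert is_noncharacter(0xFFFE) and is_noncharacter(0x10FFFF)
assert not is_noncharacter(ord("A")) and not is_noncharacter(0x1F600)
```
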
@@ -72,7 +79,14 @@ def _coerce_comment_for_xml(text: str) -> str:
 
 
 class TokenizerOpts:
-    __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")
+    __slots__ = (
+        "discard_bom",
+        "emit_bogus_markup_as_text",
+        "exact_errors",
+        "initial_rawtext_tag",
+        "initial_state",
+        "xml_coercion",
+    )
 
     discard_bom: bool
     exact_errors: bool
@@ -84,12 +98,14 @@ class TokenizerOpts:
         self,
         exact_errors: bool = False,
         discard_bom: bool = True,
+        emit_bogus_markup_as_text: bool = False,
         initial_state: int | None = None,
         initial_rawtext_tag: str | None = None,
         xml_coercion: bool = False,
     ) -> None:
         self.exact_errors = bool(exact_errors)
         self.discard_bom = bool(discard_bom)
+        self.emit_bogus_markup_as_text = bool(emit_bogus_markup_as_text)
         self.initial_state = initial_state
         self.initial_rawtext_tag = initial_rawtext_tag
         self.xml_coercion = bool(xml_coercion)
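
Note: the new option is opt-in and defaults to off, so existing callers see no behavior change. A sketch of constructing the widened options (assuming the module path `justhtml.tokenizer` for this file):

```python
from justhtml.tokenizer import TokenizerOpts

# All new options default to False, so existing callers are unaffected.
opts = TokenizerOpts(
    exact_errors=False,
    emit_bogus_markup_as_text=True,  # new in this release
    xml_coercion=False,
)
assert opts.emit_bogus_markup_as_text is True
```
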
@@ -178,11 +194,12 @@ class Tokenizer:
         "current_tag_kind",
         "current_tag_name",
         "current_tag_self_closing",
+        "current_token_start_pos",
         "errors",
-        "ignore_lf",
         "last_start_tag_name",
         "last_token_column",
         "last_token_line",
+        "last_token_start_pos",
         "length",
         "opts",
         "original_tag_name",
@@ -194,6 +211,8 @@ class Tokenizer:
         "temp_buffer",
         "text_buffer",
         "text_start_pos",
+        "track_node_locations",
+        "track_tag_positions",
     )
 
     _comment_token: CommentToken
@@ -202,6 +221,8 @@ class Tokenizer:
     _tag_token: Tag
     buffer: str
     collect_errors: bool
+    track_tag_positions: bool
+    track_node_locations: bool
     current_attr_name: list[str]
     current_attr_value: list[str]
     current_attr_value_has_amp: bool
@@ -215,11 +236,12 @@ class Tokenizer:
     current_tag_kind: int
     current_tag_name: list[str]
     current_tag_self_closing: bool
+    current_token_start_pos: int
     errors: list[ParseError]
-    ignore_lf: bool
     last_start_tag_name: str | None
     last_token_column: int
     last_token_line: int
+    last_token_start_pos: int | None
     length: int
     opts: TokenizerOpts
     original_tag_name: list[str]
@@ -234,10 +256,20 @@ class Tokenizer:
 
     # _STATE_HANDLERS is defined at the end of the file
 
-    def __init__(self, sink: Any, opts: TokenizerOpts | None = None, collect_errors: bool = False) -> None:
+    def __init__(
+        self,
+        sink: Any,
+        opts: TokenizerOpts | None = None,
+        *,
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+        track_tag_positions: bool = False,
+    ) -> None:
         self.sink = sink
         self.opts = opts or TokenizerOpts()
         self.collect_errors = collect_errors
+        self.track_node_locations = bool(track_node_locations)
+        self.track_tag_positions = bool(track_tag_positions)
         self.errors = []
 
         self.state = self.DATA
@@ -246,9 +278,10 @@ class Tokenizer:
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
 
         # Reusable buffers to avoid per-token allocations.
         self.text_buffer = []
@@ -276,14 +309,20 @@ class Tokenizer:
         if html and html[0] == "\ufeff" and self.opts.discard_bom:
             html = html[1:]
 
+        # Normalize newlines per §13.2.2.5
+        if html:
+            if "\r" in html:
+                html = html.replace("\r\n", "\n").replace("\r", "\n")
+
         self.buffer = html or ""
         self.length = len(self.buffer)
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
         self.errors = []
         self.text_buffer.clear()
         self.text_start_pos = 0
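
Note: this release moves CR/LF handling out of the hot loops entirely — input is normalized once in `initialize`, which is why the streaming `ignore_lf` bookkeeping disappears throughout the rest of the diff. The normalization itself is two order-sensitive `str.replace` calls:

```python
def normalize_newlines(html: str) -> str:
    # Replace CRLF pairs first so the lone-CR pass cannot split them.
    return html.replace("\r\n", "\n").replace("\r", "\n")

assert normalize_newlines("a\r\nb\rc\nd") == "a\nb\nc\nd"
```
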
@@ -313,8 +352,9 @@ class Tokenizer:
         else:
             self.state = self.DATA
 
-        # Pre-compute newline positions for O(log n) line lookups
-        if self.collect_errors:
+        # Pre-compute newline positions for O(log n) line lookups.
+        # Only do this when errors are collected or when node locations are requested.
+        if self.collect_errors or self.track_node_locations:
             self._newline_positions = []
             pos = -1
             buffer = self.buffer
@@ -334,6 +374,34 @@ class Tokenizer:
             return 1
         return bisect_right(newline_positions, pos - 1) + 1
 
+    def location_at_pos(self, pos: int) -> tuple[int, int]:
+        """Return (line, column) for a 0-indexed offset in the current buffer.
+
+        Column is 1-indexed. Newline positions are computed lazily when needed.
+        """
+        newline_positions = self._newline_positions
+        if newline_positions is None:
+            newline_positions = []
+            scan = -1
+            buffer = self.buffer
+            while True:
+                scan = buffer.find("\n", scan + 1)
+                if scan == -1:
+                    break
+                newline_positions.append(scan)
+            self._newline_positions = newline_positions
+
+        line_index = bisect_right(newline_positions, pos - 1)
+        line = line_index + 1
+
+        # Compute column using newline index rather than rfind() to avoid O(n) scans.
+        if line_index == 0:
+            last_newline = -1
+        else:
+            last_newline = newline_positions[line_index - 1]
+        column = pos - last_newline
+        return line, column
+
     def step(self) -> bool:
         """Run one step of the tokenizer state machine. Returns True if EOF reached."""
         handler = self._STATE_HANDLERS[self.state]  # type: ignore[attr-defined]
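
Note: `location_at_pos` collects every `\n` offset once, then answers any offset→(line, column) query with a single `bisect_right`, O(log n) per query. The same lookup, standalone:

```python
from bisect import bisect_right

def location_at(text: str, pos: int) -> tuple[int, int]:
    # Offsets of every newline, computed once per buffer.
    newlines = [i for i, ch in enumerate(text) if ch == "\n"]
    line_index = bisect_right(newlines, pos - 1)
    last_newline = newlines[line_index - 1] if line_index else -1
    return line_index + 1, pos - last_newline  # 1-indexed line and column

text = "ab\ncde\nf"
assert location_at(text, 0) == (1, 1)
assert location_at(text, 3) == (2, 1)  # "c"
assert location_at(text, 7) == (3, 1)  # "f"
```
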
@@ -341,8 +409,9 @@ class Tokenizer:
 
     def run(self, html: str | None) -> None:
         self.initialize(html)
+        handlers = self._STATE_HANDLERS  # type: ignore[attr-defined]
         while True:
-            if self.step():
+            if handlers[self.state](self):  # type: ignore[no-any-return]
                 break
 
     # ---------------------
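
Note: `run` now indexes the handler table directly instead of calling `step()`, saving one method call and one attribute lookup per state transition in the hot loop. The dispatch pattern in miniature:

```python
class Machine:
    A, B, DONE = 0, 1, 2

    def __init__(self) -> None:
        self.state = self.A

    def _state_a(self) -> bool:
        self.state = self.B
        return False  # not finished

    def _state_b(self) -> bool:
        self.state = self.DONE
        return True  # finished

    _STATE_HANDLERS = (_state_a, _state_b)

    def run(self) -> None:
        handlers = self._STATE_HANDLERS  # local alias: one lookup, not one per step
        while True:
            if handlers[self.state](self):  # unbound call avoids per-step binding
                break

m = Machine()
m.run()
assert m.state == Machine.DONE
```
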
@@ -356,9 +425,8 @@ class Tokenizer:
             return self.buffer[peek_pos]
         return None
 
-    def _append_text_chunk(self, chunk: str, *, ends_with_cr: bool = False) -> None:
+    def _append_text_chunk(self, chunk: str) -> None:
         self._append_text(chunk)
-        self.ignore_lf = ends_with_cr
 
     # ---------------------
     # State handlers
@@ -392,12 +460,12 @@ class Tokenizer:
 
         if end > pos:
             chunk = buffer[pos:end]
-
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
+            if self.collect_errors and not chunk.isascii():
+                base_pos = pos
+                for offset, ch in enumerate(chunk):
+                    if _is_noncharacter_codepoint(ord(ch)):
+                        self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
             self._append_text(chunk)
-            self.ignore_lf = chunk.endswith("\r")
 
             pos = end
             self.pos = pos
@@ -410,8 +478,8 @@ class Tokenizer:
            pos += 1
            self.pos = pos
            self.current_char = c
-            self.ignore_lf = False
            # c is always '<' here due to find() optimization above
+            self.current_token_start_pos = pos - 1
            # Optimization: Peek ahead for common tag starts
            if pos < length:
                nc = buffer[pos]
@@ -432,7 +500,7 @@ class Tokenizer:
                    self.state = self.TAG_NAME
                    return self._state_tag_name()
 
-                if nc == "!":
+                if nc == "!" and not self.opts.emit_bogus_markup_as_text:
                    # Optimization: Peek ahead for comments
                    if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
                        self._flush_text()
@@ -475,12 +543,20 @@ class Tokenizer:
            self._emit_token(EOFToken())
            return True
        if c == "!":
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("<!")
+                self.state = self.DATA
+                return False
            self.state = self.MARKUP_DECLARATION_OPEN
            return False
        if c == "/":
            self.state = self.END_TAG_OPEN
            return False
        if c == "?":
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("<?")
+                self.state = self.DATA
+                return False
            self._emit_error("unexpected-question-mark-instead-of-tag-name")
            self.current_comment.clear()
            self._reconsume_current()
@@ -497,6 +573,11 @@ class Tokenizer:
        c = self._get_char()
        if c is None:
            self._emit_error("eof-before-tag-name")
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("</")
+                self._flush_text()
+                self._emit_token(EOFToken())
+                return True
            self._append_text("<")
            self._append_text("/")
            self._flush_text()
@@ -504,6 +585,16 @@ class Tokenizer:
            return True
        if c == ">":
            self._emit_error("empty-end-tag")
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("</>")
+                self.state = self.DATA
+                return False
+            self.state = self.DATA
+            return False
+
+        if self.opts.emit_bogus_markup_as_text:
+            self._append_text("</")
+            self._append_text(c)
        self.state = self.DATA
        return False
 
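
Note: with `emit_bogus_markup_as_text=True`, stray markup (`<!`, `<?`, `</>`, end tags truncated by EOF) is surfaced to the sink as literal character data instead of becoming comments or being dropped. A hedged sketch of driving the tokenizer with a minimal sink — the `process_characters`/`process_token` protocol is taken from this diff, while the `return 0` ("no state switch") convention is an assumption:

```python
from justhtml.tokenizer import Tokenizer, TokenizerOpts

class CollectingSink:
    """Minimal sink: records text and tokens, never requests a state switch."""

    def __init__(self) -> None:
        self.text: list[str] = []
        self.tokens: list[object] = []

    def process_characters(self, data: str) -> None:
        self.text.append(data)

    def process_token(self, token: object) -> int:
        self.tokens.append(token)
        return 0  # assumption: 0 = no switch (1 would mean Plaintext)

sink = CollectingSink()
Tokenizer(sink, TokenizerOpts(emit_bogus_markup_as_text=True)).run("a </> b <?pi?> c")
# The bogus "</>" and "<?pi?>" should now show up in sink.text as raw text.
```
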
@@ -518,15 +609,15 @@ class Tokenizer:
         append_tag_char = self.current_tag_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_tag_name_run
-            # Note: reconsume and ignore_lf are never True when entering TAG_NAME
-            pos = self.pos
+            # Note: reconsume is never True when entering TAG_NAME
             if pos < length:
                 # Optimization: Check for common terminators before regex
                 match = None
-                if buffer[pos] not in "\t\n\f />\0\r":
+                if buffer[pos] not in "\t\n\f />\0":
                     match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
 
                 if match:
@@ -534,46 +625,68 @@ class Tokenizer:
                     if not chunk.islower():
                         chunk = chunk.translate(_ASCII_LOWER_TABLE)
                     append_tag_char(chunk)
-                    self.pos = match.end()
-
-                    if self.pos < length:
-                        c = buffer[self.pos]
-                        if c in (" ", "\t", "\n", "\f", "\r"):
-                            self.pos += 1
-                            if c == "\r":
-                                self.ignore_lf = True
+                    pos = match.end()
+
+                    if pos < length:
+                        next_char = buffer[pos]
+                        if next_char in (" ", "\t", "\n", "\f"):
+                            if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                                return self._emit_raw_end_tag_as_text(pos)
+                            pos += 1
+                            self.pos = pos
                             self.state = self.BEFORE_ATTRIBUTE_NAME
                             return self._state_before_attribute_name()
-                        if c == ">":
-                            self.pos += 1
+                        if next_char == ">":
+                            pos += 1
+                            self.pos = pos
                             if not self._emit_current_tag():
                                 self.state = self.DATA
                             return False
-                        if c == "/":
-                            self.pos += 1
+                        if next_char == "/":
+                            if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                                return self._emit_raw_end_tag_as_text(pos)
+                            pos += 1
+                            self.pos = pos
                             self.state = self.SELF_CLOSING_START_TAG
                             return self._state_self_closing_start_tag()
 
-            c = self._get_char()  # type: ignore[assignment]
+            # Inline _get_char
+            # Note: reconsume is never True in this state.
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
             if c is None:
+                self.pos = pos
                 self._emit_error("eof-in-tag")
-                # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
-                # The incomplete tag is discarded (not emitted as text)
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
+                if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                    self.pos = pos
+                    return self._emit_raw_end_tag_as_text(pos)
+                self.pos = pos
                 self.state = self.BEFORE_ATTRIBUTE_NAME
                 return self._state_before_attribute_name()
             if c == "/":
+                if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                    self.pos = pos
+                    return self._emit_raw_end_tag_as_text(pos)
+                self.pos = pos
                 self.state = self.SELF_CLOSING_START_TAG
                 return self._state_self_closing_start_tag()
             if c == ">":
                 # In slow path, tag name is only first char (from DATA),
                 # so no rawtext elements possible - always set DATA state
+                self.pos = pos
                 self._emit_current_tag()
                 self.state = self.DATA
                 return False
             # c == "\0" - the only remaining possibility after fast-path
+            self.pos = pos
             self._emit_error("unexpected-null-character")
             append_tag_char(replacement)
 
@@ -583,7 +696,7 @@ class Tokenizer:
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume and not self.ignore_lf:
+            if not self.reconsume:
                 if self.pos < length:
                     # Check if current char is whitespace before running regex
                     if buffer[self.pos] in " \t\n\f":
@@ -603,25 +716,12 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                # Line tracking now computed on-demand via _get_line_at_pos()
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = False
-                if self.pos < length and buffer[self.pos] == "\n":
-                    self.pos += 1
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -661,52 +761,62 @@ class Tokenizer:
         append_attr_char = self.current_attr_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_attribute_name_run
-            if not self.reconsume and not self.ignore_lf:
-                pos = self.pos
-                if pos < length:
-                    # Optimization: Check for common terminators before regex
-                    match = None
-                    if buffer[pos] not in "\t\n\f />=\0\"'<\r":
-                        match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
-
-                    if match:
-                        chunk = match.group(0)
-                        if not chunk.islower():
-                            chunk = chunk.translate(_ASCII_LOWER_TABLE)
-                        append_attr_char(chunk)
-                        self.pos = match.end()
-
-                        if self.pos < length:
-                            c = buffer[self.pos]
-                            if c == "=":
-                                self.pos += 1
-                                self.state = self.BEFORE_ATTRIBUTE_VALUE
-                                return self._state_before_attribute_value()
-                            if c in (" ", "\t", "\n", "\f", "\r"):
-                                self.pos += 1
-                                if c == "\r":
-                                    self.ignore_lf = True
-                                self._finish_attribute()
-                                self.state = self.AFTER_ATTRIBUTE_NAME
-                                return False  # Let main loop dispatch to avoid recursion
-                            if c == ">":
-                                self.pos += 1
-                                self._finish_attribute()
-                                if not self._emit_current_tag():
-                                    self.state = self.DATA
-                                return False
-                            if c == "/":
-                                self.pos += 1
-                                self._finish_attribute()
-                                self.state = self.SELF_CLOSING_START_TAG
-                                return self._state_self_closing_start_tag()
-
-            c = self._get_char()  # type: ignore[assignment]
+            # Note: reconsume is never True in this state.
+            if pos < length:
+                # Optimization: Check for common terminators before regex
+                match = None
+                if buffer[pos] not in "\t\n\f />=\0\"'<":
+                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
+
+                if match:
+                    chunk = match.group(0)
+                    if not chunk.islower():
+                        chunk = chunk.translate(_ASCII_LOWER_TABLE)
+                    append_attr_char(chunk)
+                    pos = match.end()
+
+                    if pos < length:
+                        next_char = buffer[pos]
+                        if next_char == "=":
+                            pos += 1
+                            self.pos = pos
+                            self.state = self.BEFORE_ATTRIBUTE_VALUE
+                            return self._state_before_attribute_value()
+                        if next_char in (" ", "\t", "\n", "\f"):
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            self.state = self.AFTER_ATTRIBUTE_NAME
+                            return False  # Let main loop dispatch to avoid recursion
+                        if next_char == ">":
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            if not self._emit_current_tag():
+                                self.state = self.DATA
+                            return False
+                        if next_char == "/":
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            self.state = self.SELF_CLOSING_START_TAG
+                            return self._state_self_closing_start_tag()
+
+            # Inline _get_char (reconsume is never True in this state)
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
+            self.pos = pos
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -730,8 +840,7 @@ class Tokenizer:
                 self._emit_error("unexpected-null-character")
                 append_attr_char(replacement)
                 continue
-            if c in ('"', "'", "<"):
-                self._emit_error("unexpected-character-in-attribute-name")
+            self._emit_error("unexpected-character-in-attribute-name")
             append_attr_char(c)
 
     def _state_after_attribute_name(self) -> bool:
@@ -740,11 +849,10 @@ class Tokenizer:
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume and not self.ignore_lf:
+            if not self.reconsume:
                 if self.pos < length:
-                    match = _WHITESPACE_PATTERN.match(buffer, self.pos)
-                    if match:
-                        self.pos = match.end()
+                    if buffer[self.pos] in " \t\n\f":
+                        self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end()  # type: ignore[union-attr]
 
             # Inline _get_char
             if self.pos >= length:
@@ -755,25 +863,12 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                # Note: Only reachable when ignore_lf=True (CR-LF handling)
-                # Standalone \n is caught by whitespace optimization
-                self.ignore_lf = False
+            if c in (" ", "\n", "\t", "\f"):
                 continue
-            if c == "\r":
-                self.ignore_lf = True
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-
-            self.ignore_lf = False
 
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -804,9 +899,17 @@ class Tokenizer:
 
     def _state_before_attribute_value(self) -> bool:
         while True:
-            c = self._get_char()
+            # Inline _get_char (reconsume is never True in this state)
+            pos = self.pos
+            if pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[pos]
+                self.pos = pos + 1
+            self.current_char = c
            if c is None:
                self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                self._flush_text()
                self._emit_token(EOFToken())
                return True
@@ -857,10 +960,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end
 
@@ -868,6 +967,7 @@ class Tokenizer:
             if self.pos >= length:
                 self.current_char = None
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
 
@@ -916,10 +1016,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end
 
@@ -927,6 +1023,7 @@ class Tokenizer:
             if self.pos >= length:
                 self.current_char = None
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
 
@@ -965,11 +1062,22 @@ class Tokenizer:
                 self.current_attr_value.append(buffer[pos:end])
                 self.pos = end
 
-            c = self._get_char()
+            # Inline _get_char
+            if self.reconsume:
+                self.reconsume = False
+                c = self.current_char
+            elif self.pos >= length:
+                c = None
+            else:
+                c = buffer[self.pos]
+                self.pos += 1
+                self.current_char = c
+
             if c is None:
                 # Per HTML5 spec: EOF in attribute value is a parse error
                 # The incomplete tag is discarded (not emitted)
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
@@ -995,9 +1103,17 @@ class Tokenizer:
 
     def _state_after_attribute_value_quoted(self) -> bool:
         """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
-        c = self._get_char()
+        # Inline _get_char
+        if self.pos >= self.length:
+            c: str | None = None
+        else:
+            c = self.buffer[self.pos]
+            self.pos += 1
+        self.current_char = c
+
         if c is None:
             self._emit_error("eof-in-tag")
+            self._emit_incomplete_tag_as_text()
             self._flush_text()
             self._emit_token(EOFToken())
             return True
@@ -1025,6 +1141,7 @@ class Tokenizer:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-in-tag")
+            self._emit_incomplete_tag_as_text()
             self._flush_text()
             self._emit_token(EOFToken())
             return True
@@ -1125,7 +1242,14 @@ class Tokenizer:
         while True:
             if self._consume_comment_run():
                 continue
-            c = self._get_char()
+            # Inline _get_char
+            if self.pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 self._emit_error("eof-in-comment")
                 self._emit_comment()
@@ -1264,7 +1388,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype-name")
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1291,7 +1415,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype-name")
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1675,36 +1799,19 @@ class Tokenizer:
             self.reconsume = False
             return self.current_char
 
-        buffer = self.buffer
         pos = self.pos
-        length = self.length
-        while True:
-            if pos >= length:
-                self.pos = pos
-                self.current_char = None
-                return None
-
-            c = buffer[pos]
-            pos += 1
-
-            if c == "\r":
-                self.ignore_lf = True
-                self.current_char = "\n"
-                self.pos = pos
-                return "\n"
-
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                    continue
-                # Line tracking now computed on-demand via _get_line_at_pos()
+        if pos >= self.length:
+            self.current_char = None
+            return None
 
-            else:
-                self.ignore_lf = False
-
-            self.current_char = c
-            self.pos = pos
-            return c
+        c = self.buffer[pos]
+        self.pos = pos + 1
+        self.current_char = c
+        if c == "<":
+            self.current_token_start_pos = pos
+        if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
+            self._emit_error_at_pos("noncharacter-in-input-stream", pos)
+        return c
 
     def _reconsume_current(self) -> None:
         self.reconsume = True
@@ -1731,10 +1838,38 @@ class Tokenizer:
         raw_len = len(data)
 
         self.text_buffer.clear()
-        if self.state == self.DATA and "\0" in data:
-            count = data.count("\0")
-            for _ in range(count):
-                self._emit_error("unexpected-null-character")
+        # U+0000 NULL is a parse error in text.
+        # Emit one error per NULL at the *actual* character position.
+        if "\0" in data:
+            base_pos = self.text_start_pos
+            search_from = 0
+            while True:
+                idx = data.find("\0", search_from)
+                if idx == -1:
+                    break
+                error_pos = base_pos + idx
+
+                # Compute column at error_pos (1-indexed).
+                last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
+                if last_newline == -1:
+                    column = error_pos + 1
+                else:
+                    column = error_pos - last_newline
+                line = self._get_line_at_pos(error_pos)
+
+                message = generate_error_message("unexpected-null-character")
+                self.errors.append(
+                    ParseError(
+                        "unexpected-null-character",
+                        line=line,
+                        column=column,
+                        category="tokenizer",
+                        message=message,
+                        source_html=self.buffer,
+                    )
+                )
+
+                search_from = idx + 1
 
         # Per HTML5 spec:
         # - RCDATA state (title, textarea): decode character references
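
Note: the old code emitted `data.count("\0")` identical errors at whatever position the tokenizer happened to be in; the new code locates each NULL with `str.find` and reports it at its own offset. The scan pattern in isolation:

```python
def null_offsets(text: str, base: int = 0) -> list[int]:
    # Collects the absolute offset of every "\0", one find() per occurrence.
    offsets = []
    search_from = 0
    while True:
        idx = text.find("\0", search_from)
        if idx == -1:
            return offsets
        offsets.append(base + idx)
        search_from = idx + 1

assert null_offsets("a\0b\0", base=10) == [11, 13]
```
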
@@ -1747,13 +1882,16 @@ class Tokenizer:
             pass
         else:
             if "&" in data:
-                data = decode_entities_in_text(data)
+                report_error = self._emit_error if self.collect_errors else None
+                data = decode_entities_in_text(data, report_error=report_error)
         # Apply XML coercion if enabled
         if self.opts.xml_coercion:
             data = _coerce_text_for_xml(data)
 
         # Record position at END of raw text (1-indexed column = raw_len)
-        self._record_text_end_position(raw_len)
+        if self.collect_errors:
+            self._record_text_end_position(raw_len)
+        self.last_token_start_pos = self.text_start_pos
         self.sink.process_characters(data)
         # Note: process_characters never returns Plaintext or RawData
         # State switches happen via _emit_current_tag instead
@@ -1785,7 +1923,8 @@ class Tokenizer:
             else:
                 value = "".join(attr_value_buffer)
                 if self.current_attr_value_has_amp:
-                    value = decode_entities_in_text(value, in_attribute=True)
+                    report_error = self._emit_error if self.collect_errors else None
+                    value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
                 attrs[name] = value
                 attr_value_buffer.clear()
                 self.current_attr_value_has_amp = False
@@ -1806,6 +1945,13 @@ class Tokenizer:
         tag.name = name
         tag.attrs = attrs
         tag.self_closing = self.current_tag_self_closing
+        if self.track_tag_positions:
+            tag.start_pos = self.current_token_start_pos
+            tag.end_pos = self.pos
+        else:
+            tag.start_pos = None
+            tag.end_pos = None
+        self.last_token_start_pos = tag.start_pos
 
         switched_to_rawtext = False
         if self.current_tag_kind == Tag.START:
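
Note: with the keyword-only `track_tag_positions=True`, each emitted `Tag` carries `start_pos`/`end_pos` offsets into the newline-normalized buffer, so a sink can slice a tag's raw source text back out. A sketch — the field names follow this diff, but the sink itself is hypothetical:

```python
from justhtml.tokenizer import Tokenizer

class RawTagSink:
    """Hypothetical sink that slices each tag's raw source text."""

    def __init__(self, source: str) -> None:
        self.source = source
        self.raw_tags: list[str] = []

    def process_characters(self, data: str) -> None:
        pass  # text content is not needed here

    def process_token(self, token: object) -> int:
        start = getattr(token, "start_pos", None)
        end = getattr(token, "end_pos", None)
        if start is not None and end is not None:
            self.raw_tags.append(self.source[start:end])  # e.g. '<p class="intro">'
        return 0

html = '<p class="intro">hi</p>'
sink = RawTagSink(html)
Tokenizer(sink, track_tag_positions=True).run(html)
```
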
@@ -1831,7 +1977,8 @@ class Tokenizer:
         # Remember current state before emitting
 
         # Emit token to sink
-        self._record_token_position()
+        if self.collect_errors:
+            self._record_token_position()
         result = self.sink.process_token(tag)
         if result == 1:  # TokenSinkResult.Plaintext
             self.state = self.PLAINTEXT
@@ -1844,6 +1991,30 @@ class Tokenizer:
         self.current_tag_kind = Tag.START
         return switched_to_rawtext
 
+    def _emit_incomplete_tag_as_text(self) -> None:
+        if not self.opts.emit_bogus_markup_as_text:
+            return
+        start = self.current_token_start_pos
+        if start is None:  # pragma: no cover
+            return
+        raw = self.buffer[start : self.pos]
+        if raw:  # pragma: no branch
+            self._emit_token(CharacterTokens(raw))
+
+    def _emit_raw_end_tag_as_text(self, pos: int) -> bool:
+        end = self.buffer.find(">", pos)
+        if end == -1:
+            self.pos = self.length
+            self._emit_incomplete_tag_as_text()
+            self._emit_token(EOFToken())
+            return True
+        self.pos = end + 1
+        raw = self.buffer[self.current_token_start_pos : self.pos]
+        if raw:  # pragma: no branch
+            self._emit_token(CharacterTokens(raw))
+        self.state = self.DATA
+        return False
+
     def _emit_comment(self) -> None:
         data = "".join(self.current_comment)
         self.current_comment.clear()
@@ -1851,6 +2022,8 @@ class Tokenizer:
         if self.opts.xml_coercion:
             data = _coerce_comment_for_xml(data)
         self._comment_token.data = data
+        self._comment_token.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = self._comment_token.start_pos
         self._emit_token(self._comment_token)
 
     def _emit_doctype(self) -> None:
@@ -1870,8 +2043,9 @@ class Tokenizer:
         self.current_doctype_force_quirks = False
         self._emit_token(DoctypeToken(doctype))
 
-    def _emit_token(self, token: Any) -> None:
-        self._record_token_position()
+    def _emit_token(self, token: AnyToken) -> None:
+        if self.collect_errors:
+            self._record_token_position()
         self.sink.process_token(token)
         # Note: process_token never returns Plaintext or RawData for state switches
         # State switches happen via _emit_current_tag checking sink response
@@ -1881,8 +2055,6 @@ class Tokenizer:
 
         Per the spec, the position should be at the end of the token (after the last char).
         """
-        if not self.collect_errors:
-            return
         # pos points after the last consumed character, which is exactly what we want
         pos = self.pos
         last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1899,8 +2071,6 @@ class Tokenizer:
         Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
         behavior of reporting the column of the last character (1-indexed).
         """
-        if not self.collect_errors:
-            return
         # Position of last character of text (0-indexed)
         end_pos = self.text_start_pos + raw_len
         last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1924,7 +2094,22 @@ class Tokenizer:
 
         message = generate_error_message(code)
         line = self._get_line_at_pos(self.pos)
-        self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
+
+    def _emit_error_at_pos(self, code: str, pos: int) -> None:
+        last_newline = self.buffer.rfind("\n", 0, pos + 1)
+        if last_newline == -1:
+            column = pos + 1
+        else:
+            column = pos - last_newline
+
+        message = generate_error_message(code)
+        line = self._get_line_at_pos(pos)
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
 
     def _consume_if(self, literal: str) -> bool:
         end = self.pos + len(literal)
@@ -1953,21 +2138,9 @@ class Tokenizer:
         if pos >= length:
             return False
 
-        # Handle ignore_lf for CRLF sequences
-        if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
-            self.ignore_lf = False
-            pos += 1
-            self.pos = pos
-            if pos >= length:
-                return False
-
         match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
         if match:
             chunk = match.group(0)
-            # Handle CRLF normalization for comments
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-                self.ignore_lf = chunk.endswith("\r")
             self.current_comment.append(chunk)
             self.pos = match.end()
             return True
@@ -2061,7 +2234,7 @@ class Tokenizer:
             # Consume everything up to the special character
             if next_special > pos:
                 chunk = buffer[pos:next_special]
-                self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+                self._append_text_chunk(chunk)
                 pos = next_special
                 self.pos = pos
 
@@ -2073,7 +2246,6 @@ class Tokenizer:
 
             # Handle special characters - we're at one of them after find()
             if null_index == pos:
-                self.ignore_lf = False
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos += 1
@@ -2188,9 +2360,7 @@ class Tokenizer:
             if null_index != -1 and null_index < next_special:
                 if null_index > pos:
                     chunk = buffer[pos:null_index]
-                    self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
-                else:
-                    self.ignore_lf = False
+                    self._append_text_chunk(chunk)
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos = null_index + 1
@@ -2199,14 +2369,14 @@ class Tokenizer:
             if lt_index == -1:
                 if pos < length:
                     chunk = buffer[pos:length]
-                    self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+                    self._append_text_chunk(chunk)
                 self.pos = length
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
             if lt_index > pos:
                 chunk = buffer[pos:lt_index]
-                self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+                self._append_text_chunk(chunk)
             pos = lt_index + 1
             self.pos = pos
             # Handle script escaped transition before treating '<' as markup boundary