justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
justhtml/tokenizer.py CHANGED
@@ -1,11 +1,17 @@
1
+ from __future__ import annotations
2
+
1
3
  import re
2
4
  from bisect import bisect_right
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Callable
3
9
 
4
10
  from .entities import decode_entities_in_text
5
11
  from .errors import generate_error_message
6
- from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
12
+ from .tokens import AnyToken, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
7
13
 
8
- _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
14
+ _ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
9
15
  _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
10
16
  _RCDATA_ELEMENTS = {"title", "textarea"}
11
17
  _RAWTEXT_SWITCH_TAGS = {
@@ -23,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
23
29
  _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
24
30
  _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
25
31
 
26
- _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
27
- _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
32
+ _TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
33
+ _ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
28
34
  _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
29
35
  _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
30
36
 
@@ -38,13 +44,20 @@ for _plane in range(17):
38
44
  _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
39
45
 
40
46
 
41
- def _xml_coercion_callback(match):
47
+ def _is_noncharacter_codepoint(codepoint: int) -> bool:
48
+ if 0xFDD0 <= codepoint <= 0xFDEF:
49
+ return True
50
+ last = codepoint & 0xFFFF
51
+ return last == 0xFFFE or last == 0xFFFF
52
+
53
+
54
+ def _xml_coercion_callback(match: re.Match[str]) -> str:
42
55
  if match.group(0) == "\f":
43
56
  return " "
44
57
  return "\ufffd"
45
58
 
46
59
 
47
- def _coerce_text_for_xml(text):
60
+ def _coerce_text_for_xml(text: str) -> str:
48
61
  """Apply XML coercion to text content."""
49
62
  # Fast path for ASCII
50
63
  if text.isascii():
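
The new _is_noncharacter_codepoint helper mirrors the Unicode definition of noncharacters: U+FDD0–U+FDEF, plus the last two code points (..FFFE/..FFFF) of every plane. A few illustrative checks of the expected behaviour (not part of the package code):

    assert _is_noncharacter_codepoint(0xFDD0)       # inside the U+FDD0..U+FDEF block
    assert _is_noncharacter_codepoint(0x1FFFE)      # plane 1, ends in FFFE
    assert _is_noncharacter_codepoint(0x10FFFF)     # last code point of plane 16, ends in FFFF
    assert not _is_noncharacter_codepoint(0xFFFD)   # U+FFFD REPLACEMENT CHARACTER is ordinary
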
@@ -57,7 +70,7 @@ def _coerce_text_for_xml(text):
57
70
  return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
58
71
 
59
72
 
60
- def _coerce_comment_for_xml(text):
73
+ def _coerce_comment_for_xml(text: str) -> str:
61
74
  """Apply XML coercion to comment content - handle double hyphens."""
62
75
  # Replace -- with - - (with space)
63
76
  if "--" in text:
@@ -68,14 +81,20 @@ def _coerce_comment_for_xml(text):
68
81
  class TokenizerOpts:
69
82
  __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")
70
83
 
84
+ discard_bom: bool
85
+ exact_errors: bool
86
+ initial_rawtext_tag: str | None
87
+ initial_state: int | None
88
+ xml_coercion: bool
89
+
71
90
  def __init__(
72
91
  self,
73
- exact_errors=False,
74
- discard_bom=True,
75
- initial_state=None,
76
- initial_rawtext_tag=None,
77
- xml_coercion=False,
78
- ):
92
+ exact_errors: bool = False,
93
+ discard_bom: bool = True,
94
+ initial_state: int | None = None,
95
+ initial_rawtext_tag: str | None = None,
96
+ xml_coercion: bool = False,
97
+ ) -> None:
79
98
  self.exact_errors = bool(exact_errors)
80
99
  self.discard_bom = bool(discard_bom)
81
100
  self.initial_state = initial_state
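
The bare class-level annotations added to TokenizerOpts coexist with __slots__ because an annotation without an assignment only records the name in __annotations__; it does not create a class attribute that would shadow the slot descriptor. A minimal sketch of the same pattern (illustrative, not from the package):

    class Opts:
        __slots__ = ("exact_errors",)

        exact_errors: bool  # annotation only; no class attribute is created

        def __init__(self, exact_errors: bool = False) -> None:
            self.exact_errors = bool(exact_errors)

    opts = Opts(True)
    opts.exact_errors = False   # fine: "exact_errors" is a declared slot
    # opts.other = 1            # would raise AttributeError: not in __slots__
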
@@ -166,11 +185,12 @@ class Tokenizer:
166
185
  "current_tag_kind",
167
186
  "current_tag_name",
168
187
  "current_tag_self_closing",
188
+ "current_token_start_pos",
169
189
  "errors",
170
- "ignore_lf",
171
190
  "last_start_tag_name",
172
191
  "last_token_column",
173
192
  "last_token_line",
193
+ "last_token_start_pos",
174
194
  "length",
175
195
  "opts",
176
196
  "original_tag_name",
@@ -182,14 +202,61 @@ class Tokenizer:
182
202
  "temp_buffer",
183
203
  "text_buffer",
184
204
  "text_start_pos",
205
+ "track_node_locations",
185
206
  )
186
207
 
208
+ _comment_token: CommentToken
209
+ _newline_positions: list[int] | None
210
+ _state_handlers: list[Callable[[Tokenizer], bool]]
211
+ _tag_token: Tag
212
+ buffer: str
213
+ collect_errors: bool
214
+ track_node_locations: bool
215
+ current_attr_name: list[str]
216
+ current_attr_value: list[str]
217
+ current_attr_value_has_amp: bool
218
+ current_char: str | None
219
+ current_comment: list[str]
220
+ current_doctype_force_quirks: bool
221
+ current_doctype_name: list[str]
222
+ current_doctype_public: list[str] | None
223
+ current_doctype_system: list[str] | None
224
+ current_tag_attrs: dict[str, str | None]
225
+ current_tag_kind: int
226
+ current_tag_name: list[str]
227
+ current_tag_self_closing: bool
228
+ current_token_start_pos: int
229
+ errors: list[ParseError]
230
+ last_start_tag_name: str | None
231
+ last_token_column: int
232
+ last_token_line: int
233
+ last_token_start_pos: int | None
234
+ length: int
235
+ opts: TokenizerOpts
236
+ original_tag_name: list[str]
237
+ pos: int
238
+ rawtext_tag_name: str | None
239
+ reconsume: bool
240
+ sink: Any
241
+ state: int
242
+ temp_buffer: list[str]
243
+ text_buffer: list[str]
244
+ text_start_pos: int
245
+
187
246
  # _STATE_HANDLERS is defined at the end of the file
188
247
 
189
- def __init__(self, sink, opts=None, collect_errors=False):
248
+ def __init__(
249
+ self,
250
+ sink: Any,
251
+ opts: TokenizerOpts | None = None,
252
+ *,
253
+ collect_errors: bool = False,
254
+ track_node_locations: bool = False,
255
+ ) -> None:
190
256
  self.sink = sink
191
257
  self.opts = opts or TokenizerOpts()
192
258
  self.collect_errors = collect_errors
259
+ self.track_node_locations = bool(track_node_locations)
193
260
  self.errors = []
194
261
 
195
262
  self.state = self.DATA
@@ -198,9 +265,10 @@ class Tokenizer:
198
265
  self.pos = 0
199
266
  self.reconsume = False
200
267
  self.current_char = ""
201
- self.ignore_lf = False
202
268
  self.last_token_line = 1
203
269
  self.last_token_column = 0
270
+ self.current_token_start_pos = 0
271
+ self.last_token_start_pos = None
204
272
 
205
273
  # Reusable buffers to avoid per-token allocations.
206
274
  self.text_buffer = []
@@ -224,18 +292,24 @@ class Tokenizer:
224
292
  self._tag_token = Tag(Tag.START, "", {}, False)
225
293
  self._comment_token = CommentToken("")
226
294
 
227
- def initialize(self, html):
295
+ def initialize(self, html: str | None) -> None:
228
296
  if html and html[0] == "\ufeff" and self.opts.discard_bom:
229
297
  html = html[1:]
230
298
 
299
+ # Normalize newlines per §13.2.2.5
300
+ if html:
301
+ if "\r" in html:
302
+ html = html.replace("\r\n", "\n").replace("\r", "\n")
303
+
231
304
  self.buffer = html or ""
232
305
  self.length = len(self.buffer)
233
306
  self.pos = 0
234
307
  self.reconsume = False
235
308
  self.current_char = ""
236
- self.ignore_lf = False
237
309
  self.last_token_line = 1
238
310
  self.last_token_column = 0
311
+ self.current_token_start_pos = 0
312
+ self.last_token_start_pos = None
239
313
  self.errors = []
240
314
  self.text_buffer.clear()
241
315
  self.text_start_pos = 0
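
With carriage-return handling moved into initialize(), the rest of the state machine can assume the buffer contains only "\n" line endings. A minimal usage sketch; it assumes a sink only needs the process_token/process_characters calls the tokenizer makes for this input, and the CollectingSink stub below is illustrative rather than part of the package:

    from justhtml.tokenizer import Tokenizer, TokenizerOpts

    class CollectingSink:
        # Illustrative stub: records everything the tokenizer emits.
        def __init__(self):
            self.tokens = []
            self.text = []

        def process_token(self, token):
            self.tokens.append(token)
            return None  # no Plaintext/RawData state switch requested

        def process_characters(self, data):
            self.text.append(data)

    sink = CollectingSink()
    tokenizer = Tokenizer(sink, TokenizerOpts(), collect_errors=True)
    tokenizer.run("<p id=a>line one\r\nline two</p>")
    # tokenizer.buffer now holds only "\n" newlines; any parse errors are in tokenizer.errors
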
@@ -265,8 +339,9 @@ class Tokenizer:
265
339
  else:
266
340
  self.state = self.DATA
267
341
 
268
- # Pre-compute newline positions for O(log n) line lookups
269
- if self.collect_errors:
342
+ # Pre-compute newline positions for O(log n) line lookups.
343
+ # Only do this when errors are collected or when node locations are requested.
344
+ if self.collect_errors or self.track_node_locations:
270
345
  self._newline_positions = []
271
346
  pos = -1
272
347
  buffer = self.buffer
@@ -278,42 +353,73 @@ class Tokenizer:
278
353
  else:
279
354
  self._newline_positions = None
280
355
 
281
- def _get_line_at_pos(self, pos):
356
+ def _get_line_at_pos(self, pos: int) -> int:
282
357
  """Get line number (1-indexed) for a position using binary search."""
283
358
  # Line number = count of newlines before pos + 1
284
- return bisect_right(self._newline_positions, pos - 1) + 1
359
+ newline_positions = self._newline_positions
360
+ if newline_positions is None: # pragma: no cover
361
+ return 1
362
+ return bisect_right(newline_positions, pos - 1) + 1
363
+
364
+ def location_at_pos(self, pos: int) -> tuple[int, int]:
365
+ """Return (line, column) for a 0-indexed offset in the current buffer.
366
+
367
+ Column is 1-indexed. Newline positions are computed lazily when needed.
368
+ """
369
+ newline_positions = self._newline_positions
370
+ if newline_positions is None:
371
+ newline_positions = []
372
+ scan = -1
373
+ buffer = self.buffer
374
+ while True:
375
+ scan = buffer.find("\n", scan + 1)
376
+ if scan == -1:
377
+ break
378
+ newline_positions.append(scan)
379
+ self._newline_positions = newline_positions
380
+
381
+ line_index = bisect_right(newline_positions, pos - 1)
382
+ line = line_index + 1
285
383
 
286
- def step(self):
384
+ # Compute column using newline index rather than rfind() to avoid O(n) scans.
385
+ if line_index == 0:
386
+ last_newline = -1
387
+ else:
388
+ last_newline = newline_positions[line_index - 1]
389
+ column = pos - last_newline
390
+ return line, column
391
+
392
+ def step(self) -> bool:
287
393
  """Run one step of the tokenizer state machine. Returns True if EOF reached."""
288
- handler = self._STATE_HANDLERS[self.state]
289
- return handler(self)
394
+ handler = self._STATE_HANDLERS[self.state] # type: ignore[attr-defined]
395
+ return handler(self) # type: ignore[no-any-return]
290
396
 
291
- def run(self, html):
397
+ def run(self, html: str | None) -> None:
292
398
  self.initialize(html)
399
+ handlers = self._STATE_HANDLERS # type: ignore[attr-defined]
293
400
  while True:
294
- if self.step():
401
+ if handlers[self.state](self): # type: ignore[no-any-return]
295
402
  break
296
403
 
297
404
  # ---------------------
298
405
  # Helper methods
299
406
  # ---------------------
300
407
 
301
- def _peek_char(self, offset):
408
+ def _peek_char(self, offset: int) -> str | None:
302
409
  """Peek ahead at character at current position + offset without consuming"""
303
410
  peek_pos = self.pos + offset
304
411
  if peek_pos < self.length:
305
412
  return self.buffer[peek_pos]
306
413
  return None
307
414
 
308
- def _append_text_chunk(self, chunk, *, ends_with_cr=False):
415
+ def _append_text_chunk(self, chunk: str) -> None:
309
416
  self._append_text(chunk)
310
- self.ignore_lf = ends_with_cr
311
417
 
312
418
  # ---------------------
313
419
  # State handlers
314
420
  # ---------------------
315
421
 
316
- def _state_data(self):
422
+ def _state_data(self) -> bool:
317
423
  buffer = self.buffer
318
424
  length = self.length
319
425
  pos = self.pos
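
location_at_pos turns a 0-indexed buffer offset into a 1-indexed (line, column) pair by binary-searching a precomputed list of newline offsets instead of rescanning the buffer. A standalone sketch of the same arithmetic (illustrative only; line_and_column is not a package function):

    from bisect import bisect_right

    def line_and_column(buffer: str, pos: int) -> tuple[int, int]:
        newline_positions = [i for i, ch in enumerate(buffer) if ch == "\n"]
        line_index = bisect_right(newline_positions, pos - 1)  # newlines strictly before pos
        last_newline = newline_positions[line_index - 1] if line_index else -1
        return line_index + 1, pos - last_newline              # both 1-indexed

    assert line_and_column("ab\ncd", 0) == (1, 1)  # 'a'
    assert line_and_column("ab\ncd", 3) == (2, 1)  # 'c', first column of line 2
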
@@ -341,12 +447,12 @@ class Tokenizer:
341
447
 
342
448
  if end > pos:
343
449
  chunk = buffer[pos:end]
344
-
345
- if "\r" in chunk:
346
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
347
-
450
+ if self.collect_errors and not chunk.isascii():
451
+ base_pos = pos
452
+ for offset, ch in enumerate(chunk):
453
+ if _is_noncharacter_codepoint(ord(ch)):
454
+ self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
348
455
  self._append_text(chunk)
349
- self.ignore_lf = chunk.endswith("\r")
350
456
 
351
457
  pos = end
352
458
  self.pos = pos
@@ -359,8 +465,8 @@ class Tokenizer:
359
465
  pos += 1
360
466
  self.pos = pos
361
467
  self.current_char = c
362
- self.ignore_lf = False
363
468
  # c is always '<' here due to find() optimization above
469
+ self.current_token_start_pos = pos - 1
364
470
  # Optimization: Peek ahead for common tag starts
365
471
  if pos < length:
366
472
  nc = buffer[pos]
@@ -415,7 +521,7 @@ class Tokenizer:
415
521
  self.state = self.TAG_OPEN
416
522
  return self._state_tag_open()
417
523
 
418
- def _state_tag_open(self):
524
+ def _state_tag_open(self) -> bool:
419
525
  c = self._get_char()
420
526
  if c is None:
421
527
  self._emit_error("eof-before-tag-name")
@@ -442,7 +548,7 @@ class Tokenizer:
442
548
  self.state = self.DATA
443
549
  return False
444
550
 
445
- def _state_end_tag_open(self):
551
+ def _state_end_tag_open(self) -> bool:
446
552
  c = self._get_char()
447
553
  if c is None:
448
554
  self._emit_error("eof-before-tag-name")
@@ -462,20 +568,20 @@ class Tokenizer:
462
568
  self.state = self.BOGUS_COMMENT
463
569
  return False
464
570
 
465
- def _state_tag_name(self):
571
+ def _state_tag_name(self) -> bool:
466
572
  replacement = "\ufffd"
467
573
  append_tag_char = self.current_tag_name.append
468
574
  buffer = self.buffer
469
575
  length = self.length
576
+ pos = self.pos
470
577
 
471
578
  while True:
472
579
  # Inline _consume_tag_name_run
473
- # Note: reconsume and ignore_lf are never True when entering TAG_NAME
474
- pos = self.pos
580
+ # Note: reconsume is never True when entering TAG_NAME
475
581
  if pos < length:
476
582
  # Optimization: Check for common terminators before regex
477
583
  match = None
478
- if buffer[pos] not in "\t\n\f />\0\r":
584
+ if buffer[pos] not in "\t\n\f />\0":
479
585
  match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
480
586
 
481
587
  if match:
@@ -483,56 +589,69 @@ class Tokenizer:
483
589
  if not chunk.islower():
484
590
  chunk = chunk.translate(_ASCII_LOWER_TABLE)
485
591
  append_tag_char(chunk)
486
- self.pos = match.end()
487
-
488
- if self.pos < length:
489
- c = buffer[self.pos]
490
- if c in (" ", "\t", "\n", "\f", "\r"):
491
- self.pos += 1
492
- if c == "\r":
493
- self.ignore_lf = True
592
+ pos = match.end()
593
+
594
+ if pos < length:
595
+ next_char = buffer[pos]
596
+ if next_char in (" ", "\t", "\n", "\f"):
597
+ pos += 1
598
+ self.pos = pos
494
599
  self.state = self.BEFORE_ATTRIBUTE_NAME
495
600
  return self._state_before_attribute_name()
496
- if c == ">":
497
- self.pos += 1
601
+ if next_char == ">":
602
+ pos += 1
603
+ self.pos = pos
498
604
  if not self._emit_current_tag():
499
605
  self.state = self.DATA
500
606
  return False
501
- if c == "/":
502
- self.pos += 1
607
+ if next_char == "/":
608
+ pos += 1
609
+ self.pos = pos
503
610
  self.state = self.SELF_CLOSING_START_TAG
504
611
  return self._state_self_closing_start_tag()
505
612
 
506
- c = self._get_char()
613
+ # Inline _get_char
614
+ # Note: reconsume is never True in this state.
615
+ if pos >= length:
616
+ c: str | None = None
617
+ else:
618
+ c = buffer[pos]
619
+ pos += 1
620
+ self.current_char = c
507
621
  if c is None:
622
+ self.pos = pos
508
623
  self._emit_error("eof-in-tag")
509
624
  # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
510
625
  # The incomplete tag is discarded (not emitted as text)
511
626
  self._emit_token(EOFToken())
512
627
  return True
513
628
  if c in ("\t", "\n", "\f", " "):
629
+ self.pos = pos
514
630
  self.state = self.BEFORE_ATTRIBUTE_NAME
515
631
  return self._state_before_attribute_name()
516
632
  if c == "/":
633
+ self.pos = pos
517
634
  self.state = self.SELF_CLOSING_START_TAG
518
635
  return self._state_self_closing_start_tag()
519
636
  if c == ">":
520
637
  # In slow path, tag name is only first char (from DATA),
521
638
  # so no rawtext elements possible - always set DATA state
639
+ self.pos = pos
522
640
  self._emit_current_tag()
523
641
  self.state = self.DATA
524
642
  return False
525
643
  # c == "\0" - the only remaining possibility after fast-path
644
+ self.pos = pos
526
645
  self._emit_error("unexpected-null-character")
527
646
  append_tag_char(replacement)
528
647
 
529
- def _state_before_attribute_name(self):
648
+ def _state_before_attribute_name(self) -> bool:
530
649
  buffer = self.buffer
531
650
  length = self.length
532
651
 
533
652
  while True:
534
653
  # Optimization: Skip whitespace
535
- if not self.reconsume and not self.ignore_lf:
654
+ if not self.reconsume:
536
655
  if self.pos < length:
537
656
  # Check if current char is whitespace before running regex
538
657
  if buffer[self.pos] in " \t\n\f":
@@ -552,21 +671,7 @@ class Tokenizer:
552
671
 
553
672
  self.current_char = c
554
673
 
555
- if c == " ":
556
- self.ignore_lf = False
557
- continue
558
- if c == "\n":
559
- if self.ignore_lf:
560
- self.ignore_lf = False
561
- # Line tracking now computed on-demand via _get_line_at_pos()
562
- continue
563
- if c == "\t" or c == "\f":
564
- self.ignore_lf = False
565
- continue
566
- if c == "\r":
567
- self.ignore_lf = False
568
- if self.pos < length and buffer[self.pos] == "\n":
569
- self.pos += 1
674
+ if c in (" ", "\n", "\t", "\f"):
570
675
  continue
571
676
 
572
677
  if c is None:
@@ -605,55 +710,64 @@ class Tokenizer:
605
710
  self.state = self.ATTRIBUTE_NAME
606
711
  return False # Let main loop dispatch to avoid recursion
607
712
 
608
- def _state_attribute_name(self):
713
+ def _state_attribute_name(self) -> bool:
609
714
  replacement = "\ufffd"
610
715
  append_attr_char = self.current_attr_name.append
611
716
  buffer = self.buffer
612
717
  length = self.length
718
+ pos = self.pos
613
719
 
614
720
  while True:
615
721
  # Inline _consume_attribute_name_run
616
- if not self.reconsume and not self.ignore_lf:
617
- pos = self.pos
618
- if pos < length:
619
- # Optimization: Check for common terminators before regex
620
- match = None
621
- if buffer[pos] not in "\t\n\f />=\0\"'<\r":
622
- match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
623
-
624
- if match:
625
- chunk = match.group(0)
626
- if not chunk.islower():
627
- chunk = chunk.translate(_ASCII_LOWER_TABLE)
628
- append_attr_char(chunk)
629
- self.pos = match.end()
630
-
631
- if self.pos < length:
632
- c = buffer[self.pos]
633
- if c == "=":
634
- self.pos += 1
635
- self.state = self.BEFORE_ATTRIBUTE_VALUE
636
- return self._state_before_attribute_value()
637
- if c in (" ", "\t", "\n", "\f", "\r"):
638
- self.pos += 1
639
- if c == "\r":
640
- self.ignore_lf = True
641
- self._finish_attribute()
642
- self.state = self.AFTER_ATTRIBUTE_NAME
643
- return False # Let main loop dispatch to avoid recursion
644
- if c == ">":
645
- self.pos += 1
646
- self._finish_attribute()
647
- if not self._emit_current_tag():
648
- self.state = self.DATA
649
- return False
650
- if c == "/":
651
- self.pos += 1
652
- self._finish_attribute()
653
- self.state = self.SELF_CLOSING_START_TAG
654
- return self._state_self_closing_start_tag()
722
+ # Note: reconsume is never True in this state.
723
+ if pos < length:
724
+ # Optimization: Check for common terminators before regex
725
+ match = None
726
+ if buffer[pos] not in "\t\n\f />=\0\"'<":
727
+ match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
655
728
 
656
- c = self._get_char()
729
+ if match:
730
+ chunk = match.group(0)
731
+ if not chunk.islower():
732
+ chunk = chunk.translate(_ASCII_LOWER_TABLE)
733
+ append_attr_char(chunk)
734
+ pos = match.end()
735
+
736
+ if pos < length:
737
+ next_char = buffer[pos]
738
+ if next_char == "=":
739
+ pos += 1
740
+ self.pos = pos
741
+ self.state = self.BEFORE_ATTRIBUTE_VALUE
742
+ return self._state_before_attribute_value()
743
+ if next_char in (" ", "\t", "\n", "\f"):
744
+ pos += 1
745
+ self.pos = pos
746
+ self._finish_attribute()
747
+ self.state = self.AFTER_ATTRIBUTE_NAME
748
+ return False # Let main loop dispatch to avoid recursion
749
+ if next_char == ">":
750
+ pos += 1
751
+ self.pos = pos
752
+ self._finish_attribute()
753
+ if not self._emit_current_tag():
754
+ self.state = self.DATA
755
+ return False
756
+ if next_char == "/":
757
+ pos += 1
758
+ self.pos = pos
759
+ self._finish_attribute()
760
+ self.state = self.SELF_CLOSING_START_TAG
761
+ return self._state_self_closing_start_tag()
762
+
763
+ # Inline _get_char (reconsume is never True in this state)
764
+ if pos >= length:
765
+ c: str | None = None
766
+ else:
767
+ c = buffer[pos]
768
+ pos += 1
769
+ self.current_char = c
770
+ self.pos = pos
657
771
  if c is None:
658
772
  self._emit_error("eof-in-tag")
659
773
  self._flush_text()
@@ -679,21 +793,19 @@ class Tokenizer:
679
793
  self._emit_error("unexpected-null-character")
680
794
  append_attr_char(replacement)
681
795
  continue
682
- if c in ('"', "'", "<"):
683
- self._emit_error("unexpected-character-in-attribute-name")
796
+ self._emit_error("unexpected-character-in-attribute-name")
684
797
  append_attr_char(c)
685
798
 
686
- def _state_after_attribute_name(self):
799
+ def _state_after_attribute_name(self) -> bool:
687
800
  buffer = self.buffer
688
801
  length = self.length
689
802
 
690
803
  while True:
691
804
  # Optimization: Skip whitespace
692
- if not self.reconsume and not self.ignore_lf:
805
+ if not self.reconsume:
693
806
  if self.pos < length:
694
- match = _WHITESPACE_PATTERN.match(buffer, self.pos)
695
- if match:
696
- self.pos = match.end()
807
+ if buffer[self.pos] in " \t\n\f":
808
+ self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end() # type: ignore[union-attr]
697
809
 
698
810
  # Inline _get_char
699
811
  if self.pos >= length:
@@ -704,23 +816,9 @@ class Tokenizer:
704
816
 
705
817
  self.current_char = c
706
818
 
707
- if c == " ":
708
- self.ignore_lf = False
709
- continue
710
- if c == "\n":
711
- # Note: Only reachable when ignore_lf=True (CR-LF handling)
712
- # Standalone \n is caught by whitespace optimization
713
- self.ignore_lf = False
714
- continue
715
- if c == "\r":
716
- self.ignore_lf = True
717
- continue
718
- if c == "\t" or c == "\f":
719
- self.ignore_lf = False
819
+ if c in (" ", "\n", "\t", "\f"):
720
820
  continue
721
821
 
722
- self.ignore_lf = False
723
-
724
822
  if c is None:
725
823
  self._emit_error("eof-in-tag")
726
824
  self._flush_text()
@@ -751,9 +849,16 @@ class Tokenizer:
751
849
  self.state = self.ATTRIBUTE_NAME
752
850
  return False # Let main loop dispatch to avoid recursion
753
851
 
754
- def _state_before_attribute_value(self):
852
+ def _state_before_attribute_value(self) -> bool:
755
853
  while True:
756
- c = self._get_char()
854
+ # Inline _get_char (reconsume is never True in this state)
855
+ pos = self.pos
856
+ if pos >= self.length:
857
+ c: str | None = None
858
+ else:
859
+ c = self.buffer[pos]
860
+ self.pos = pos + 1
861
+ self.current_char = c
757
862
  if c is None:
758
863
  self._emit_error("eof-in-tag")
759
864
  self._flush_text()
@@ -777,7 +882,7 @@ class Tokenizer:
777
882
  self.state = self.ATTRIBUTE_VALUE_UNQUOTED
778
883
  return self._state_attribute_value_unquoted()
779
884
 
780
- def _state_attribute_value_double(self):
885
+ def _state_attribute_value_double(self) -> bool:
781
886
  replacement = "\ufffd"
782
887
  stop_pattern = _ATTR_VALUE_DOUBLE_PATTERN
783
888
  buffer = self.buffer
@@ -797,8 +902,7 @@ class Tokenizer:
797
902
  if "&" in chunk or "\0" in chunk:
798
903
  # Fallback to regex if complex chars present
799
904
  match = stop_pattern.search(buffer, pos)
800
- # Note: match is always found because we checked for & or \0 above
801
- end = match.start()
905
+ end = length if match is None else match.start()
802
906
  else:
803
907
  end = next_quote
804
908
 
@@ -807,10 +911,6 @@ class Tokenizer:
807
911
  if end != next_quote:
808
912
  chunk = buffer[pos:end]
809
913
 
810
- # Normalize chunk for value if needed
811
- if "\r" in chunk:
812
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
813
-
814
914
  self.current_attr_value.append(chunk)
815
915
  self.pos = end
816
916
 
@@ -837,7 +937,7 @@ class Tokenizer:
837
937
  self._emit_error("unexpected-null-character")
838
938
  self._append_attr_value_char(replacement)
839
939
 
840
- def _state_attribute_value_single(self):
940
+ def _state_attribute_value_single(self) -> bool:
841
941
  replacement = "\ufffd"
842
942
  stop_pattern = _ATTR_VALUE_SINGLE_PATTERN
843
943
  buffer = self.buffer
@@ -857,8 +957,7 @@ class Tokenizer:
857
957
  if "&" in chunk or "\0" in chunk:
858
958
  # Fallback to regex if complex chars present
859
959
  match = stop_pattern.search(buffer, pos)
860
- # Note: match is always found because we checked for & or \0 above
861
- end = match.start()
960
+ end = length if match is None else match.start()
862
961
  else:
863
962
  end = next_quote
864
963
 
@@ -867,10 +966,6 @@ class Tokenizer:
867
966
  if end != next_quote:
868
967
  chunk = buffer[pos:end]
869
968
 
870
- # Normalize chunk for value if needed
871
- if "\r" in chunk:
872
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
873
-
874
969
  self.current_attr_value.append(chunk)
875
970
  self.pos = end
876
971
 
@@ -897,7 +992,7 @@ class Tokenizer:
897
992
  self._emit_error("unexpected-null-character")
898
993
  self._append_attr_value_char(replacement)
899
994
 
900
- def _state_attribute_value_unquoted(self):
995
+ def _state_attribute_value_unquoted(self) -> bool:
901
996
  replacement = "\ufffd"
902
997
  stop_pattern = _ATTR_VALUE_UNQUOTED_PATTERN
903
998
  buffer = self.buffer
@@ -916,7 +1011,17 @@ class Tokenizer:
916
1011
  self.current_attr_value.append(buffer[pos:end])
917
1012
  self.pos = end
918
1013
 
919
- c = self._get_char()
1014
+ # Inline _get_char
1015
+ if self.reconsume:
1016
+ self.reconsume = False
1017
+ c = self.current_char
1018
+ elif self.pos >= length:
1019
+ c = None
1020
+ else:
1021
+ c = buffer[self.pos]
1022
+ self.pos += 1
1023
+ self.current_char = c
1024
+
920
1025
  if c is None:
921
1026
  # Per HTML5 spec: EOF in attribute value is a parse error
922
1027
  # The incomplete tag is discarded (not emitted)
@@ -944,9 +1049,16 @@ class Tokenizer:
944
1049
  continue
945
1050
  self._append_attr_value_char(c)
946
1051
 
947
- def _state_after_attribute_value_quoted(self):
1052
+ def _state_after_attribute_value_quoted(self) -> bool:
948
1053
  """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
949
- c = self._get_char()
1054
+ # Inline _get_char
1055
+ if self.pos >= self.length:
1056
+ c: str | None = None
1057
+ else:
1058
+ c = self.buffer[self.pos]
1059
+ self.pos += 1
1060
+ self.current_char = c
1061
+
950
1062
  if c is None:
951
1063
  self._emit_error("eof-in-tag")
952
1064
  self._flush_text()
@@ -972,7 +1084,7 @@ class Tokenizer:
972
1084
  self.state = self.BEFORE_ATTRIBUTE_NAME
973
1085
  return False
974
1086
 
975
- def _state_self_closing_start_tag(self):
1087
+ def _state_self_closing_start_tag(self) -> bool:
976
1088
  c = self._get_char()
977
1089
  if c is None:
978
1090
  self._emit_error("eof-in-tag")
@@ -989,7 +1101,7 @@ class Tokenizer:
989
1101
  self.state = self.BEFORE_ATTRIBUTE_NAME
990
1102
  return False
991
1103
 
992
- def _state_markup_declaration_open(self):
1104
+ def _state_markup_declaration_open(self) -> bool:
993
1105
  # Note: Comment handling (<!--) is optimized in DATA state fast-path
994
1106
  # This code only handles DOCTYPE and CDATA, or malformed markup
995
1107
  if self._consume_case_insensitive("DOCTYPE"):
@@ -1023,7 +1135,7 @@ class Tokenizer:
1023
1135
  self.state = self.BOGUS_COMMENT
1024
1136
  return False
1025
1137
 
1026
- def _state_comment_start(self):
1138
+ def _state_comment_start(self) -> bool:
1027
1139
  replacement = "\ufffd"
1028
1140
  c = self._get_char()
1029
1141
  if c is None:
@@ -1047,7 +1159,7 @@ class Tokenizer:
1047
1159
  self.state = self.COMMENT
1048
1160
  return False
1049
1161
 
1050
- def _state_comment_start_dash(self):
1162
+ def _state_comment_start_dash(self) -> bool:
1051
1163
  replacement = "\ufffd"
1052
1164
  c = self._get_char()
1053
1165
  if c is None:
@@ -1071,12 +1183,19 @@ class Tokenizer:
1071
1183
  self.state = self.COMMENT
1072
1184
  return False
1073
1185
 
1074
- def _state_comment(self):
1186
+ def _state_comment(self) -> bool:
1075
1187
  replacement = "\ufffd"
1076
1188
  while True:
1077
1189
  if self._consume_comment_run():
1078
1190
  continue
1079
- c = self._get_char()
1191
+ # Inline _get_char
1192
+ if self.pos >= self.length:
1193
+ c: str | None = None
1194
+ else:
1195
+ c = self.buffer[self.pos]
1196
+ self.pos += 1
1197
+ self.current_char = c
1198
+
1080
1199
  if c is None:
1081
1200
  self._emit_error("eof-in-comment")
1082
1201
  self._emit_comment()
@@ -1089,7 +1208,7 @@ class Tokenizer:
1089
1208
  self._emit_error("unexpected-null-character")
1090
1209
  self.current_comment.append(replacement)
1091
1210
 
1092
- def _state_comment_end_dash(self):
1211
+ def _state_comment_end_dash(self) -> bool:
1093
1212
  replacement = "\ufffd"
1094
1213
  c = self._get_char()
1095
1214
  if c is None:
@@ -1110,7 +1229,7 @@ class Tokenizer:
1110
1229
  self.state = self.COMMENT
1111
1230
  return False
1112
1231
 
1113
- def _state_comment_end(self):
1232
+ def _state_comment_end(self) -> bool:
1114
1233
  replacement = "\ufffd"
1115
1234
  c = self._get_char()
1116
1235
  if c is None:
@@ -1138,7 +1257,7 @@ class Tokenizer:
1138
1257
  self.state = self.COMMENT
1139
1258
  return False
1140
1259
 
1141
- def _state_comment_end_bang(self):
1260
+ def _state_comment_end_bang(self) -> bool:
1142
1261
  replacement = "\ufffd"
1143
1262
  c = self._get_char()
1144
1263
  if c is None:
@@ -1172,7 +1291,7 @@ class Tokenizer:
1172
1291
  self.state = self.COMMENT
1173
1292
  return False
1174
1293
 
1175
- def _state_bogus_comment(self):
1294
+ def _state_bogus_comment(self) -> bool:
1176
1295
  replacement = "\ufffd"
1177
1296
  while True:
1178
1297
  c = self._get_char()
@@ -1189,7 +1308,7 @@ class Tokenizer:
1189
1308
  else:
1190
1309
  self.current_comment.append(c)
1191
1310
 
1192
- def _state_doctype(self):
1311
+ def _state_doctype(self) -> bool:
1193
1312
  c = self._get_char()
1194
1313
  if c is None:
1195
1314
  self._emit_error("eof-in-doctype")
@@ -1211,11 +1330,11 @@ class Tokenizer:
1211
1330
  self.state = self.BEFORE_DOCTYPE_NAME
1212
1331
  return False
1213
1332
 
1214
- def _state_before_doctype_name(self):
1333
+ def _state_before_doctype_name(self) -> bool:
1215
1334
  while True:
1216
1335
  c = self._get_char()
1217
1336
  if c is None:
1218
- self._emit_error("eof-in-doctype-name")
1337
+ self._emit_error("eof-in-doctype")
1219
1338
  self.current_doctype_force_quirks = True
1220
1339
  self._emit_doctype()
1221
1340
  self._emit_token(EOFToken())
@@ -1238,11 +1357,11 @@ class Tokenizer:
1238
1357
  self.state = self.DOCTYPE_NAME
1239
1358
  return False
1240
1359
 
1241
- def _state_doctype_name(self):
1360
+ def _state_doctype_name(self) -> bool:
1242
1361
  while True:
1243
1362
  c = self._get_char()
1244
1363
  if c is None:
1245
- self._emit_error("eof-in-doctype-name")
1364
+ self._emit_error("eof-in-doctype")
1246
1365
  self.current_doctype_force_quirks = True
1247
1366
  self._emit_doctype()
1248
1367
  self._emit_token(EOFToken())
@@ -1263,7 +1382,7 @@ class Tokenizer:
1263
1382
  continue
1264
1383
  self.current_doctype_name.append(c)
1265
1384
 
1266
- def _state_after_doctype_name(self):
1385
+ def _state_after_doctype_name(self) -> bool:
1267
1386
  if self._consume_case_insensitive("PUBLIC"):
1268
1387
  self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
1269
1388
  return False
@@ -1290,7 +1409,7 @@ class Tokenizer:
1290
1409
  self.state = self.BOGUS_DOCTYPE
1291
1410
  return False
1292
1411
 
1293
- def _state_after_doctype_public_keyword(self):
1412
+ def _state_after_doctype_public_keyword(self) -> bool:
1294
1413
  while True:
1295
1414
  c = self._get_char()
1296
1415
  if c is None:
@@ -1324,7 +1443,7 @@ class Tokenizer:
1324
1443
  self.state = self.BOGUS_DOCTYPE
1325
1444
  return False
1326
1445
 
1327
- def _state_after_doctype_system_keyword(self):
1446
+ def _state_after_doctype_system_keyword(self) -> bool:
1328
1447
  while True:
1329
1448
  c = self._get_char()
1330
1449
  if c is None:
@@ -1358,7 +1477,7 @@ class Tokenizer:
1358
1477
  self.state = self.BOGUS_DOCTYPE
1359
1478
  return False
1360
1479
 
1361
- def _state_before_doctype_public_identifier(self):
1480
+ def _state_before_doctype_public_identifier(self) -> bool:
1362
1481
  while True:
1363
1482
  c = self._get_char()
1364
1483
  if c is None:
@@ -1389,7 +1508,9 @@ class Tokenizer:
1389
1508
  self.state = self.BOGUS_DOCTYPE
1390
1509
  return False
1391
1510
 
1392
- def _state_doctype_public_identifier_double_quoted(self):
1511
+ def _state_doctype_public_identifier_double_quoted(self) -> bool:
1512
+ if self.current_doctype_public is None: # pragma: no cover
1513
+ self.current_doctype_public = []
1393
1514
  while True:
1394
1515
  c = self._get_char()
1395
1516
  if c is None:
@@ -1413,7 +1534,9 @@ class Tokenizer:
1413
1534
  return False
1414
1535
  self.current_doctype_public.append(c)
1415
1536
 
1416
- def _state_doctype_public_identifier_single_quoted(self):
1537
+ def _state_doctype_public_identifier_single_quoted(self) -> bool:
1538
+ if self.current_doctype_public is None: # pragma: no cover
1539
+ self.current_doctype_public = []
1417
1540
  while True:
1418
1541
  c = self._get_char()
1419
1542
  if c is None:
@@ -1437,7 +1560,7 @@ class Tokenizer:
1437
1560
  return False
1438
1561
  self.current_doctype_public.append(c)
1439
1562
 
1440
- def _state_after_doctype_public_identifier(self):
1563
+ def _state_after_doctype_public_identifier(self) -> bool:
1441
1564
  while True:
1442
1565
  c = self._get_char()
1443
1566
  if c is None:
@@ -1469,7 +1592,7 @@ class Tokenizer:
1469
1592
  self.state = self.BOGUS_DOCTYPE
1470
1593
  return False
1471
1594
 
1472
- def _state_between_doctype_public_and_system_identifiers(self):
1595
+ def _state_between_doctype_public_and_system_identifiers(self) -> bool:
1473
1596
  while True:
1474
1597
  c = self._get_char()
1475
1598
  if c is None:
@@ -1498,7 +1621,7 @@ class Tokenizer:
1498
1621
  self.state = self.BOGUS_DOCTYPE
1499
1622
  return False
1500
1623
 
1501
- def _state_before_doctype_system_identifier(self):
1624
+ def _state_before_doctype_system_identifier(self) -> bool:
1502
1625
  while True:
1503
1626
  c = self._get_char()
1504
1627
  if c is None:
@@ -1529,7 +1652,9 @@ class Tokenizer:
1529
1652
  self.state = self.BOGUS_DOCTYPE
1530
1653
  return False
1531
1654
 
1532
- def _state_doctype_system_identifier_double_quoted(self):
1655
+ def _state_doctype_system_identifier_double_quoted(self) -> bool:
1656
+ if self.current_doctype_system is None: # pragma: no cover
1657
+ self.current_doctype_system = []
1533
1658
  while True:
1534
1659
  c = self._get_char()
1535
1660
  if c is None:
@@ -1553,7 +1678,9 @@ class Tokenizer:
1553
1678
  return False
1554
1679
  self.current_doctype_system.append(c)
1555
1680
 
1556
- def _state_doctype_system_identifier_single_quoted(self):
1681
+ def _state_doctype_system_identifier_single_quoted(self) -> bool:
1682
+ if self.current_doctype_system is None: # pragma: no cover
1683
+ self.current_doctype_system = []
1557
1684
  while True:
1558
1685
  c = self._get_char()
1559
1686
  if c is None:
@@ -1577,7 +1704,7 @@ class Tokenizer:
1577
1704
  return False
1578
1705
  self.current_doctype_system.append(c)
1579
1706
 
1580
- def _state_after_doctype_system_identifier(self):
1707
+ def _state_after_doctype_system_identifier(self) -> bool:
1581
1708
  while True:
1582
1709
  c = self._get_char()
1583
1710
  if c is None:
@@ -1597,7 +1724,7 @@ class Tokenizer:
1597
1724
  self.state = self.BOGUS_DOCTYPE
1598
1725
  return False
1599
1726
 
1600
- def _state_bogus_doctype(self):
1727
+ def _state_bogus_doctype(self) -> bool:
1601
1728
  while True:
1602
1729
  c = self._get_char()
1603
1730
  if c is None:
@@ -1613,53 +1740,36 @@ class Tokenizer:
1613
1740
  # Low-level helpers
1614
1741
  # ---------------------
1615
1742
 
1616
- def _get_char(self):
1743
+ def _get_char(self) -> str | None:
1617
1744
  if self.reconsume:
1618
1745
  self.reconsume = False
1619
1746
  return self.current_char
1620
1747
 
1621
- buffer = self.buffer
1622
1748
  pos = self.pos
1623
- length = self.length
1624
- while True:
1625
- if pos >= length:
1626
- self.pos = pos
1627
- self.current_char = None
1628
- return None
1629
-
1630
- c = buffer[pos]
1631
- pos += 1
1749
+ if pos >= self.length:
1750
+ self.current_char = None
1751
+ return None
1632
1752
 
1633
- if c == "\r":
1634
- self.ignore_lf = True
1635
- self.current_char = "\n"
1636
- self.pos = pos
1637
- return "\n"
1638
-
1639
- if c == "\n":
1640
- if self.ignore_lf:
1641
- self.ignore_lf = False
1642
- continue
1643
- # Line tracking now computed on-demand via _get_line_at_pos()
1644
-
1645
- else:
1646
- self.ignore_lf = False
1647
-
1648
- self.current_char = c
1649
- self.pos = pos
1650
- return c
1753
+ c = self.buffer[pos]
1754
+ self.pos = pos + 1
1755
+ self.current_char = c
1756
+ if c == "<":
1757
+ self.current_token_start_pos = pos
1758
+ if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
1759
+ self._emit_error_at_pos("noncharacter-in-input-stream", pos)
1760
+ return c
1651
1761
 
1652
- def _reconsume_current(self):
1762
+ def _reconsume_current(self) -> None:
1653
1763
  self.reconsume = True
1654
1764
 
1655
- def _append_text(self, text):
1765
+ def _append_text(self, text: str) -> None:
1656
1766
  """Append text to buffer, recording start position if this is the first chunk."""
1657
1767
  if not self.text_buffer:
1658
1768
  # Record where text started (current position before this chunk)
1659
1769
  self.text_start_pos = self.pos
1660
1770
  self.text_buffer.append(text)
1661
1771
 
1662
- def _flush_text(self):
1772
+ def _flush_text(self) -> None:
1663
1773
  if not self.text_buffer:
1664
1774
  return
1665
1775
 
@@ -1674,10 +1784,38 @@ class Tokenizer:
1674
1784
  raw_len = len(data)
1675
1785
 
1676
1786
  self.text_buffer.clear()
1677
- if self.state == self.DATA and "\0" in data:
1678
- count = data.count("\0")
1679
- for _ in range(count):
1680
- self._emit_error("unexpected-null-character")
1787
+ # U+0000 NULL is a parse error in text.
1788
+ # Emit one error per NULL at the *actual* character position.
1789
+ if "\0" in data:
1790
+ base_pos = self.text_start_pos
1791
+ search_from = 0
1792
+ while True:
1793
+ idx = data.find("\0", search_from)
1794
+ if idx == -1:
1795
+ break
1796
+ error_pos = base_pos + idx
1797
+
1798
+ # Compute column at error_pos (1-indexed).
1799
+ last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
1800
+ if last_newline == -1:
1801
+ column = error_pos + 1
1802
+ else:
1803
+ column = error_pos - last_newline
1804
+ line = self._get_line_at_pos(error_pos)
1805
+
1806
+ message = generate_error_message("unexpected-null-character")
1807
+ self.errors.append(
1808
+ ParseError(
1809
+ "unexpected-null-character",
1810
+ line=line,
1811
+ column=column,
1812
+ category="tokenizer",
1813
+ message=message,
1814
+ source_html=self.buffer,
1815
+ )
1816
+ )
1817
+
1818
+ search_from = idx + 1
1681
1819
 
1682
1820
  # Per HTML5 spec:
1683
1821
  # - RCDATA state (title, textarea): decode character references
@@ -1690,21 +1828,24 @@ class Tokenizer:
1690
1828
  pass
1691
1829
  else:
1692
1830
  if "&" in data:
1693
- data = decode_entities_in_text(data)
1831
+ report_error = self._emit_error if self.collect_errors else None
1832
+ data = decode_entities_in_text(data, report_error=report_error)
1694
1833
  # Apply XML coercion if enabled
1695
1834
  if self.opts.xml_coercion:
1696
1835
  data = _coerce_text_for_xml(data)
1697
1836
 
1698
1837
  # Record position at END of raw text (1-indexed column = raw_len)
1699
- self._record_text_end_position(raw_len)
1838
+ if self.collect_errors:
1839
+ self._record_text_end_position(raw_len)
1840
+ self.last_token_start_pos = self.text_start_pos
1700
1841
  self.sink.process_characters(data)
1701
1842
  # Note: process_characters never returns Plaintext or RawData
1702
1843
  # State switches happen via _emit_current_tag instead
1703
1844
 
1704
- def _append_attr_value_char(self, c):
1845
+ def _append_attr_value_char(self, c: str) -> None:
1705
1846
  self.current_attr_value.append(c)
1706
1847
 
1707
- def _finish_attribute(self):
1848
+ def _finish_attribute(self) -> None:
1708
1849
  attr_name_buffer = self.current_attr_name
1709
1850
  if not attr_name_buffer:
1710
1851
  return
@@ -1728,12 +1869,13 @@ class Tokenizer:
1728
1869
  else:
1729
1870
  value = "".join(attr_value_buffer)
1730
1871
  if self.current_attr_value_has_amp:
1731
- value = decode_entities_in_text(value, in_attribute=True)
1872
+ report_error = self._emit_error if self.collect_errors else None
1873
+ value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
1732
1874
  attrs[name] = value
1733
1875
  attr_value_buffer.clear()
1734
1876
  self.current_attr_value_has_amp = False
1735
1877
 
1736
- def _emit_current_tag(self):
1878
+ def _emit_current_tag(self) -> bool:
1737
1879
  name_parts = self.current_tag_name
1738
1880
  part_count = len(name_parts)
1739
1881
  # Note: part_count is always >= 1 because fast-path appends before entering TAG_NAME
@@ -1749,6 +1891,8 @@ class Tokenizer:
1749
1891
  tag.name = name
1750
1892
  tag.attrs = attrs
1751
1893
  tag.self_closing = self.current_tag_self_closing
1894
+ tag.start_pos = self.current_token_start_pos
1895
+ self.last_token_start_pos = tag.start_pos
1752
1896
 
1753
1897
  switched_to_rawtext = False
1754
1898
  if self.current_tag_kind == Tag.START:
@@ -1774,7 +1918,8 @@ class Tokenizer:
1774
1918
  # Remember current state before emitting
1775
1919
 
1776
1920
  # Emit token to sink
1777
- self._record_token_position()
1921
+ if self.collect_errors:
1922
+ self._record_token_position()
1778
1923
  result = self.sink.process_token(tag)
1779
1924
  if result == 1: # TokenSinkResult.Plaintext
1780
1925
  self.state = self.PLAINTEXT
@@ -1787,16 +1932,18 @@ class Tokenizer:
1787
1932
  self.current_tag_kind = Tag.START
1788
1933
  return switched_to_rawtext
1789
1934
 
1790
- def _emit_comment(self):
1935
+ def _emit_comment(self) -> None:
1791
1936
  data = "".join(self.current_comment)
1792
1937
  self.current_comment.clear()
1793
1938
  # Apply XML coercion if enabled
1794
1939
  if self.opts.xml_coercion:
1795
1940
  data = _coerce_comment_for_xml(data)
1796
1941
  self._comment_token.data = data
1942
+ self._comment_token.start_pos = self.current_token_start_pos
1943
+ self.last_token_start_pos = self._comment_token.start_pos
1797
1944
  self._emit_token(self._comment_token)
1798
1945
 
1799
- def _emit_doctype(self):
1946
+ def _emit_doctype(self) -> None:
1800
1947
  name = "".join(self.current_doctype_name) if self.current_doctype_name else None
1801
1948
  # If public_id/system_id is a list (even empty), join it; if None, keep None
1802
1949
  public_id = "".join(self.current_doctype_public) if self.current_doctype_public is not None else None
@@ -1813,19 +1960,18 @@ class Tokenizer:
1813
1960
  self.current_doctype_force_quirks = False
1814
1961
  self._emit_token(DoctypeToken(doctype))
1815
1962
 
1816
- def _emit_token(self, token):
1817
- self._record_token_position()
1963
+ def _emit_token(self, token: AnyToken) -> None:
1964
+ if self.collect_errors:
1965
+ self._record_token_position()
1818
1966
  self.sink.process_token(token)
1819
1967
  # Note: process_token never returns Plaintext or RawData for state switches
1820
1968
  # State switches happen via _emit_current_tag checking sink response
1821
1969
 
1822
- def _record_token_position(self):
1970
+ def _record_token_position(self) -> None:
1823
1971
  """Record current position as 0-indexed column for the last emitted token.
1824
1972
 
1825
1973
  Per the spec, the position should be at the end of the token (after the last char).
1826
1974
  """
1827
- if not self.collect_errors:
1828
- return
1829
1975
  # pos points after the last consumed character, which is exactly what we want
1830
1976
  pos = self.pos
1831
1977
  last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1836,14 +1982,12 @@ class Tokenizer:
1836
1982
  self.last_token_line = self._get_line_at_pos(pos)
1837
1983
  self.last_token_column = column
1838
1984
 
1839
- def _record_text_end_position(self, raw_len):
1985
+ def _record_text_end_position(self, raw_len: int) -> None:
1840
1986
  """Record position at end of text token (after last character).
1841
1987
 
1842
1988
  Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
1843
1989
  behavior of reporting the column of the last character (1-indexed).
1844
1990
  """
1845
- if not self.collect_errors:
1846
- return
1847
1991
  # Position of last character of text (0-indexed)
1848
1992
  end_pos = self.text_start_pos + raw_len
1849
1993
  last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1854,7 +1998,7 @@ class Tokenizer:
1854
1998
  self.last_token_line = self._get_line_at_pos(end_pos)
1855
1999
  self.last_token_column = column
1856
2000
 
1857
- def _emit_error(self, code):
2001
+ def _emit_error(self, code: str) -> None:
1858
2002
  if not self.collect_errors:
1859
2003
  return
1860
2004
  # Compute column on-demand: scan backwards to find last newline
@@ -1867,9 +2011,24 @@ class Tokenizer:
1867
2011
 
1868
2012
  message = generate_error_message(code)
1869
2013
  line = self._get_line_at_pos(self.pos)
1870
- self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
2014
+ self.errors.append(
2015
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2016
+ )
1871
2017
 
1872
- def _consume_if(self, literal):
2018
+ def _emit_error_at_pos(self, code: str, pos: int) -> None:
2019
+ last_newline = self.buffer.rfind("\n", 0, pos + 1)
2020
+ if last_newline == -1:
2021
+ column = pos + 1
2022
+ else:
2023
+ column = pos - last_newline
2024
+
2025
+ message = generate_error_message(code)
2026
+ line = self._get_line_at_pos(pos)
2027
+ self.errors.append(
2028
+ ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
2029
+ )
2030
+
2031
+ def _consume_if(self, literal: str) -> bool:
1873
2032
  end = self.pos + len(literal)
1874
2033
  if end > self.length:
1875
2034
  return False
@@ -1879,7 +2038,7 @@ class Tokenizer:
1879
2038
  self.pos = end
1880
2039
  return True
1881
2040
 
1882
- def _consume_case_insensitive(self, literal):
2041
+ def _consume_case_insensitive(self, literal: str) -> bool:
1883
2042
  end = self.pos + len(literal)
1884
2043
  if end > self.length:
1885
2044
  return False
@@ -1889,34 +2048,22 @@ class Tokenizer:
1889
2048
  self.pos = end
1890
2049
  return True
1891
2050
 
1892
- def _consume_comment_run(self):
2051
+ def _consume_comment_run(self) -> bool:
1893
2052
  # Note: Comments are never reconsumed
1894
2053
  pos = self.pos
1895
2054
  length = self.length
1896
2055
  if pos >= length:
1897
2056
  return False
1898
2057
 
1899
- # Handle ignore_lf for CRLF sequences
1900
- if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
1901
- self.ignore_lf = False
1902
- pos += 1
1903
- self.pos = pos
1904
- if pos >= length:
1905
- return False
1906
-
1907
2058
  match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
1908
2059
  if match:
1909
2060
  chunk = match.group(0)
1910
- # Handle CRLF normalization for comments
1911
- if "\r" in chunk:
1912
- chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
1913
- self.ignore_lf = chunk.endswith("\r")
1914
2061
  self.current_comment.append(chunk)
1915
2062
  self.pos = match.end()
1916
2063
  return True
1917
2064
  return False
1918
2065
 
1919
- def _state_cdata_section(self):
2066
+ def _state_cdata_section(self) -> bool:
1920
2067
  # CDATA section state - consume characters until we see ']'
1921
2068
  while True:
1922
2069
  c = self._get_char()
@@ -1930,7 +2077,7 @@ class Tokenizer:
1930
2077
  return False
1931
2078
  self._append_text(c)
1932
2079
 
1933
- def _state_cdata_section_bracket(self):
2080
+ def _state_cdata_section_bracket(self) -> bool:
1934
2081
  # Seen one ']', check for second ']'
1935
2082
  c = self._get_char()
1936
2083
  if c == "]":
@@ -1947,7 +2094,7 @@ class Tokenizer:
1947
2094
  self.state = self.CDATA_SECTION
1948
2095
  return False
1949
2096
 
1950
- def _state_cdata_section_end(self):
2097
+ def _state_cdata_section_end(self) -> bool:
1951
2098
  # Seen ']]', check for '>'
1952
2099
  c = self._get_char()
1953
2100
  if c == ">":
@@ -1973,7 +2120,7 @@ class Tokenizer:
1973
2120
  self.state = self.CDATA_SECTION
1974
2121
  return False
1975
2122
 
1976
- def _state_rcdata(self):
2123
+ def _state_rcdata(self) -> bool:
1977
2124
  buffer = self.buffer
1978
2125
  length = self.length
1979
2126
  pos = self.pos
@@ -2004,7 +2151,7 @@ class Tokenizer:
2004
2151
  # Consume everything up to the special character
2005
2152
  if next_special > pos:
2006
2153
  chunk = buffer[pos:next_special]
2007
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2154
+ self._append_text_chunk(chunk)
2008
2155
  pos = next_special
2009
2156
  self.pos = pos
2010
2157
 
@@ -2016,7 +2163,6 @@ class Tokenizer:
2016
2163
 
2017
2164
  # Handle special characters - we're at one of them after find()
2018
2165
  if null_index == pos:
2019
- self.ignore_lf = False
2020
2166
  self._emit_error("unexpected-null-character")
2021
2167
  self._append_text("\ufffd")
2022
2168
  pos += 1
@@ -2034,7 +2180,7 @@ class Tokenizer:
2034
2180
  self.state = self.RCDATA_LESS_THAN_SIGN
2035
2181
  return False
2036
2182
 
2037
- def _state_rcdata_less_than_sign(self):
2183
+ def _state_rcdata_less_than_sign(self) -> bool:
2038
2184
  c = self._get_char()
2039
2185
  if c == "/":
2040
2186
  self.current_tag_name.clear()
@@ -2045,7 +2191,7 @@ class Tokenizer:
2045
2191
  self.state = self.RCDATA
2046
2192
  return False
2047
2193
 
2048
- def _state_rcdata_end_tag_open(self):
2194
+ def _state_rcdata_end_tag_open(self) -> bool:
2049
2195
  c = self._get_char()
2050
2196
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2051
2197
  self.current_tag_name.append(c.lower())
@@ -2057,7 +2203,7 @@ class Tokenizer:
2057
2203
  self.state = self.RCDATA
2058
2204
  return False
2059
2205
 
2060
- def _state_rcdata_end_tag_name(self):
2206
+ def _state_rcdata_end_tag_name(self) -> bool:
2061
2207
  # Check if this matches the opening tag name
2062
2208
  while True:
2063
2209
  c = self._get_char()
@@ -2069,7 +2215,7 @@ class Tokenizer:
2069
2215
  tag_name = "".join(self.current_tag_name)
2070
2216
  if tag_name == self.rawtext_tag_name:
2071
2217
  if c == ">":
2072
- attrs = []
2218
+ attrs: dict[str, str | None] = {}
2073
2219
  tag = Tag(Tag.END, tag_name, attrs, False)
2074
2220
  self._flush_text()
2075
2221
  self._emit_token(tag)
@@ -2110,7 +2256,7 @@ class Tokenizer:
2110
2256
  self.state = self.RCDATA
2111
2257
  return False
2112
2258
 
2113
- def _state_rawtext(self):
2259
+ def _state_rawtext(self) -> bool:
2114
2260
  buffer = self.buffer
2115
2261
  length = self.length
2116
2262
  pos = self.pos
@@ -2131,9 +2277,7 @@ class Tokenizer:
2131
2277
  if null_index != -1 and null_index < next_special:
2132
2278
  if null_index > pos:
2133
2279
  chunk = buffer[pos:null_index]
2134
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2135
- else:
2136
- self.ignore_lf = False
2280
+ self._append_text_chunk(chunk)
2137
2281
  self._emit_error("unexpected-null-character")
2138
2282
  self._append_text("\ufffd")
2139
2283
  pos = null_index + 1
@@ -2142,14 +2286,14 @@ class Tokenizer:
2142
2286
  if lt_index == -1:
2143
2287
  if pos < length:
2144
2288
  chunk = buffer[pos:length]
2145
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2289
+ self._append_text_chunk(chunk)
2146
2290
  self.pos = length
2147
2291
  self._flush_text()
2148
2292
  self._emit_token(EOFToken())
2149
2293
  return True
2150
2294
  if lt_index > pos:
2151
2295
  chunk = buffer[pos:lt_index]
2152
- self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
2296
+ self._append_text_chunk(chunk)
2153
2297
  pos = lt_index + 1
2154
2298
  self.pos = pos
2155
2299
  # Handle script escaped transition before treating '<' as markup boundary
@@ -2167,7 +2311,7 @@ class Tokenizer:
2167
2311
  self.state = self.RAWTEXT_LESS_THAN_SIGN
2168
2312
  return False
2169
2313
 
2170
- def _state_rawtext_less_than_sign(self):
2314
+ def _state_rawtext_less_than_sign(self) -> bool:
2171
2315
  c = self._get_char()
2172
2316
  if c == "/":
2173
2317
  self.current_tag_name.clear()
@@ -2178,7 +2322,7 @@ class Tokenizer:
2178
2322
  self.state = self.RAWTEXT
2179
2323
  return False
2180
2324
 
2181
- def _state_rawtext_end_tag_open(self):
2325
+ def _state_rawtext_end_tag_open(self) -> bool:
2182
2326
  c = self._get_char()
2183
2327
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2184
2328
  self.current_tag_name.append(c.lower())
@@ -2190,7 +2334,7 @@ class Tokenizer:
2190
2334
  self.state = self.RAWTEXT
2191
2335
  return False
2192
2336
 
2193
- def _state_rawtext_end_tag_name(self):
2337
+ def _state_rawtext_end_tag_name(self) -> bool:
2194
2338
  # Check if this matches the opening tag name
2195
2339
  while True:
2196
2340
  c = self._get_char()
@@ -2202,7 +2346,7 @@ class Tokenizer:
2202
2346
  tag_name = "".join(self.current_tag_name)
2203
2347
  if tag_name == self.rawtext_tag_name:
2204
2348
  if c == ">":
2205
- attrs = []
2349
+ attrs: dict[str, str | None] = {}
2206
2350
  tag = Tag(Tag.END, tag_name, attrs, False)
2207
2351
  self._flush_text()
2208
2352
  self._emit_token(tag)
@@ -2243,7 +2387,7 @@ class Tokenizer:
2243
2387
  self.state = self.RAWTEXT
2244
2388
  return False
2245
2389
 
2246
- def _state_plaintext(self):
2390
+ def _state_plaintext(self) -> bool:
2247
2391
  # PLAINTEXT state - consume everything as text, no end tag
2248
2392
  if self.pos < self.length:
2249
2393
  remaining = self.buffer[self.pos :]
@@ -2257,7 +2401,7 @@ class Tokenizer:
2257
2401
  self._emit_token(EOFToken())
2258
2402
  return True
2259
2403
 
2260
- def _state_script_data_escaped(self):
2404
+ def _state_script_data_escaped(self) -> bool:
2261
2405
  c = self._get_char()
2262
2406
  if c is None:
2263
2407
  self._flush_text()
@@ -2277,7 +2421,7 @@ class Tokenizer:
2277
2421
  self._append_text(c)
2278
2422
  return False
2279
2423
 
2280
- def _state_script_data_escaped_dash(self):
2424
+ def _state_script_data_escaped_dash(self) -> bool:
2281
2425
  c = self._get_char()
2282
2426
  if c is None:
2283
2427
  self._flush_text()
@@ -2299,7 +2443,7 @@ class Tokenizer:
2299
2443
  self.state = self.SCRIPT_DATA_ESCAPED
2300
2444
  return False
2301
2445
 
2302
- def _state_script_data_escaped_dash_dash(self):
2446
+ def _state_script_data_escaped_dash_dash(self) -> bool:
2303
2447
  c = self._get_char()
2304
2448
  if c is None:
2305
2449
  self._flush_text()
@@ -2325,7 +2469,7 @@ class Tokenizer:
2325
2469
  self.state = self.SCRIPT_DATA_ESCAPED
2326
2470
  return False
2327
2471
 
2328
- def _state_script_data_escaped_less_than_sign(self):
2472
+ def _state_script_data_escaped_less_than_sign(self) -> bool:
2329
2473
  c = self._get_char()
2330
2474
  if c == "/":
2331
2475
  self.temp_buffer.clear()
@@ -2343,7 +2487,7 @@ class Tokenizer:
2343
2487
 
2344
2488
  return False
2345
2489
 
2346
- def _state_script_data_escaped_end_tag_open(self):
2490
+ def _state_script_data_escaped_end_tag_open(self) -> bool:
2347
2491
  c = self._get_char()
2348
2492
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2349
2493
  self.current_tag_name.clear()
@@ -2356,7 +2500,7 @@ class Tokenizer:
2356
2500
  self.state = self.SCRIPT_DATA_ESCAPED
2357
2501
  return False
2358
2502
 
2359
- def _state_script_data_escaped_end_tag_name(self):
2503
+ def _state_script_data_escaped_end_tag_name(self) -> bool:
2360
2504
  c = self._get_char()
2361
2505
  if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
2362
2506
  self.current_tag_name.append(c.lower())
@@ -2381,7 +2525,7 @@ class Tokenizer:
2381
2525
  return False
2382
2526
  if c == ">":
2383
2527
  self._flush_text()
2384
- attrs = []
2528
+ attrs: dict[str, str | None] = {}
2385
2529
  tag = Tag(Tag.END, tag_name, attrs, False)
2386
2530
  self._emit_token(tag)
2387
2531
  self.state = self.DATA
@@ -2397,7 +2541,7 @@ class Tokenizer:
2397
2541
  self.state = self.SCRIPT_DATA_ESCAPED
2398
2542
  return False
2399
2543
 
2400
- def _state_script_data_double_escape_start(self):
2544
+ def _state_script_data_double_escape_start(self) -> bool:
2401
2545
  c = self._get_char()
2402
2546
  if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2403
2547
  # Check if temp_buffer contains "script"
@@ -2416,7 +2560,7 @@ class Tokenizer:
2416
2560
  self.state = self.SCRIPT_DATA_ESCAPED
2417
2561
  return False
2418
2562
 
2419
- def _state_script_data_double_escaped(self):
2563
+ def _state_script_data_double_escaped(self) -> bool:
2420
2564
  c = self._get_char()
2421
2565
  if c is None:
2422
2566
  self._flush_text()
@@ -2437,7 +2581,7 @@ class Tokenizer:
2437
2581
  self._append_text(c)
2438
2582
  return False
2439
2583
 
2440
- def _state_script_data_double_escaped_dash(self):
2584
+ def _state_script_data_double_escaped_dash(self) -> bool:
2441
2585
  c = self._get_char()
2442
2586
  if c is None:
2443
2587
  self._flush_text()
@@ -2460,7 +2604,7 @@ class Tokenizer:
2460
2604
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2461
2605
  return False
2462
2606
 
2463
- def _state_script_data_double_escaped_dash_dash(self):
2607
+ def _state_script_data_double_escaped_dash_dash(self) -> bool:
2464
2608
  c = self._get_char()
2465
2609
  if c is None:
2466
2610
  self._flush_text()
@@ -2488,7 +2632,7 @@ class Tokenizer:
2488
2632
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2489
2633
  return False
2490
2634
 
2491
- def _state_script_data_double_escaped_less_than_sign(self):
2635
+ def _state_script_data_double_escaped_less_than_sign(self) -> bool:
2492
2636
  c = self._get_char()
2493
2637
  if c == "/":
2494
2638
  self.temp_buffer.clear()
@@ -2504,7 +2648,7 @@ class Tokenizer:
2504
2648
  self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
2505
2649
  return False
2506
2650
 
2507
- def _state_script_data_double_escape_end(self):
2651
+ def _state_script_data_double_escape_end(self) -> bool:
2508
2652
  c = self._get_char()
2509
2653
  if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
2510
2654
  # Check if temp_buffer contains "script"
@@ -2525,7 +2669,7 @@ class Tokenizer:
2525
2669
  return False
2526
2670
 
2527
2671
 
2528
- Tokenizer._STATE_HANDLERS = [
2672
+ Tokenizer._STATE_HANDLERS = [ # type: ignore[attr-defined]
2529
2673
  Tokenizer._state_data,
2530
2674
  Tokenizer._state_tag_open,
2531
2675
  Tokenizer._state_end_tag_open,