justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


justhtml/tokenizer.py CHANGED
@@ -11,7 +11,7 @@ from .entities import decode_entities_in_text
 from .errors import generate_error_message
 from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
 
-_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"
+_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
 _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
 _RCDATA_ELEMENTS = {"title", "textarea"}
 _RAWTEXT_SWITCH_TAGS = {
@@ -29,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
 _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
 _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
 
-_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
-_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
+_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
+_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
 _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
 _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
 
@@ -44,6 +44,13 @@ for _plane in range(17):
 _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
 
 
+def _is_noncharacter_codepoint(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
 def _xml_coercion_callback(match: re.Match[str]) -> str:
     if match.group(0) == "\f":
         return " "
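The helper added above encodes the Unicode definition of a noncharacter: U+FDD0 through U+FDEF, plus the last two code points of each of the 17 planes (U+FFFE/U+FFFF, U+1FFFE/U+1FFFF, and so on). A minimal standalone sketch of the same test, using an equivalent single-mask form:

```python
# Standalone sketch, not the package's code: the mask below is equivalent
# to checking last == 0xFFFE or last == 0xFFFF for the plane-final pair.
def is_noncharacter(codepoint: int) -> bool:
    if 0xFDD0 <= codepoint <= 0xFDEF:
        return True
    return codepoint & 0xFFFE == 0xFFFE

assert is_noncharacter(0xFDD0)
assert is_noncharacter(0xFFFE) and is_noncharacter(0x10FFFF)
assert not is_noncharacter(ord("a")) and not is_noncharacter(0xFDF0)
```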
@@ -178,11 +185,12 @@ class Tokenizer:
         "current_tag_kind",
         "current_tag_name",
         "current_tag_self_closing",
+        "current_token_start_pos",
         "errors",
-        "ignore_lf",
         "last_start_tag_name",
         "last_token_column",
         "last_token_line",
+        "last_token_start_pos",
         "length",
         "opts",
         "original_tag_name",
@@ -194,6 +202,7 @@ class Tokenizer:
         "temp_buffer",
         "text_buffer",
         "text_start_pos",
+        "track_node_locations",
     )
 
     _comment_token: CommentToken
@@ -202,6 +211,7 @@
     _tag_token: Tag
     buffer: str
     collect_errors: bool
+    track_node_locations: bool
     current_attr_name: list[str]
     current_attr_value: list[str]
     current_attr_value_has_amp: bool
@@ -215,11 +225,12 @@
     current_tag_kind: int
     current_tag_name: list[str]
     current_tag_self_closing: bool
+    current_token_start_pos: int
     errors: list[ParseError]
-    ignore_lf: bool
     last_start_tag_name: str | None
     last_token_column: int
     last_token_line: int
+    last_token_start_pos: int | None
     length: int
     opts: TokenizerOpts
     original_tag_name: list[str]
@@ -234,10 +245,18 @@
 
     # _STATE_HANDLERS is defined at the end of the file
 
-    def __init__(self, sink: Any, opts: TokenizerOpts | None = None, collect_errors: bool = False) -> None:
+    def __init__(
+        self,
+        sink: Any,
+        opts: TokenizerOpts | None = None,
+        *,
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+    ) -> None:
         self.sink = sink
         self.opts = opts or TokenizerOpts()
         self.collect_errors = collect_errors
+        self.track_node_locations = bool(track_node_locations)
         self.errors = []
 
         self.state = self.DATA
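The rewritten signature puts both flags after a bare `*`, so they become keyword-only: a caller that used to pass `collect_errors` positionally now fails with a `TypeError` instead of silently binding the value to the wrong flag once `track_node_locations` exists. A small standalone demonstration of the pattern (the function name is illustrative, not from the package):

```python
# Minimal demonstration of the bare-* pattern adopted by __init__ above:
# parameters after * can only be passed by keyword.
def init(sink, opts=None, *, collect_errors=False, track_node_locations=False):
    return collect_errors, track_node_locations

print(init("sink", collect_errors=True))   # (True, False)
try:
    init("sink", None, True)               # positional flag is rejected
except TypeError as exc:
    print(exc)
```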
@@ -246,9 +265,10 @@
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
 
         # Reusable buffers to avoid per-token allocations.
         self.text_buffer = []
@@ -276,14 +296,20 @@
         if html and html[0] == "\ufeff" and self.opts.discard_bom:
             html = html[1:]
 
+        # Normalize newlines per §13.2.2.5
+        if html:
+            if "\r" in html:
+                html = html.replace("\r\n", "\n").replace("\r", "\n")
+
         self.buffer = html or ""
         self.length = len(self.buffer)
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
         self.errors = []
         self.text_buffer.clear()
         self.text_start_pos = 0
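Normalizing CRLF and lone CR to LF once, before tokenization starts, is what lets this release delete the `ignore_lf` flag and every `\r` special case from the state machine below. A one-function sketch of the §13.2.2.5 normalization, with the CRLF pass first so the lone-CR pass cannot double-convert:

```python
# One-shot newline normalization per HTML §13.2.2.5, mirroring the hunk above.
def normalize_newlines(html: str) -> str:
    if "\r" in html:
        html = html.replace("\r\n", "\n").replace("\r", "\n")
    return html

assert normalize_newlines("a\r\nb\rc\nd") == "a\nb\nc\nd"
```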
@@ -313,8 +339,9 @@
         else:
             self.state = self.DATA
 
-        # Pre-compute newline positions for O(log n) line lookups
-        if self.collect_errors:
+        # Pre-compute newline positions for O(log n) line lookups.
+        # Only do this when errors are collected or when node locations are requested.
+        if self.collect_errors or self.track_node_locations:
             self._newline_positions = []
             pos = -1
             buffer = self.buffer
@@ -334,6 +361,34 @@
             return 1
         return bisect_right(newline_positions, pos - 1) + 1
 
+    def location_at_pos(self, pos: int) -> tuple[int, int]:
+        """Return (line, column) for a 0-indexed offset in the current buffer.
+
+        Column is 1-indexed. Newline positions are computed lazily when needed.
+        """
+        newline_positions = self._newline_positions
+        if newline_positions is None:
+            newline_positions = []
+            scan = -1
+            buffer = self.buffer
+            while True:
+                scan = buffer.find("\n", scan + 1)
+                if scan == -1:
+                    break
+                newline_positions.append(scan)
+            self._newline_positions = newline_positions
+
+        line_index = bisect_right(newline_positions, pos - 1)
+        line = line_index + 1
+
+        # Compute column using newline index rather than rfind() to avoid O(n) scans.
+        if line_index == 0:
+            last_newline = -1
+        else:
+            last_newline = newline_positions[line_index - 1]
+        column = pos - last_newline
+        return line, column
+
     def step(self) -> bool:
         """Run one step of the tokenizer state machine. Returns True if EOF reached."""
         handler = self._STATE_HANDLERS[self.state]  # type: ignore[attr-defined]
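`location_at_pos` maps a 0-indexed buffer offset to a 1-indexed (line, column) pair with a single `bisect_right` over the precomputed newline offsets. A standalone sketch of the same arithmetic (helper name hypothetical):

```python
# Standalone sketch of the bisect-based offset -> (line, column) mapping;
# columns are 1-indexed, matching location_at_pos above.
from bisect import bisect_right

def location(text: str, pos: int) -> tuple[int, int]:
    newlines = [i for i, ch in enumerate(text) if ch == "\n"]
    line_index = bisect_right(newlines, pos - 1)
    last_newline = newlines[line_index - 1] if line_index else -1
    return line_index + 1, pos - last_newline

text = "ab\ncde\nf"
assert location(text, 0) == (1, 1)   # 'a'
assert location(text, 3) == (2, 1)   # 'c'
assert location(text, 5) == (2, 3)   # 'e'
```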
@@ -356,9 +411,8 @@
             return self.buffer[peek_pos]
         return None
 
-    def _append_text_chunk(self, chunk: str, *, ends_with_cr: bool = False) -> None:
+    def _append_text_chunk(self, chunk: str) -> None:
         self._append_text(chunk)
-        self.ignore_lf = ends_with_cr
 
     # ---------------------
     # State handlers
@@ -392,12 +446,12 @@
 
             if end > pos:
                 chunk = buffer[pos:end]
-
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
+                if self.collect_errors and not chunk.isascii():
+                    base_pos = pos
+                    for offset, ch in enumerate(chunk):
+                        if _is_noncharacter_codepoint(ord(ch)):
+                            self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
                 self._append_text(chunk)
-                self.ignore_lf = chunk.endswith("\r")
 
                 pos = end
                 self.pos = pos
@@ -410,8 +464,8 @@
             pos += 1
             self.pos = pos
             self.current_char = c
-            self.ignore_lf = False
             # c is always '<' here due to find() optimization above
+            self.current_token_start_pos = pos - 1
             # Optimization: Peek ahead for common tag starts
             if pos < length:
                 nc = buffer[pos]
@@ -518,15 +572,15 @@
         append_tag_char = self.current_tag_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_tag_name_run
-            # Note: reconsume and ignore_lf are never True when entering TAG_NAME
-            pos = self.pos
+            # Note: reconsume is never True when entering TAG_NAME
            if pos < length:
                 # Optimization: Check for common terminators before regex
                 match = None
-                if buffer[pos] not in "\t\n\f />\0\r":
+                if buffer[pos] not in "\t\n\f />\0":
                     match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
 
 
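The fast path here (and in the attribute-name state below) leans on `re.Pattern.match(string, pos)`, which anchors the match at `pos` without slicing the buffer, guarded by a one-character membership test so the regex engine is only entered when a run is actually present. A standalone sketch of the idiom:

```python
# Sketch of the anchored run-scan idiom: Pattern.match(buffer, pos) starts
# matching at pos with no slicing; the membership test skips the regex
# entirely when the first character already terminates the run.
import re

_RUN = re.compile(r"[^\t\n\f />\0]+")

def consume_name_run(buffer: str, pos: int) -> tuple[str, int]:
    if pos < len(buffer) and buffer[pos] not in "\t\n\f />\0":
        match = _RUN.match(buffer, pos)
        if match:
            return match.group(0), match.end()
    return "", pos

assert consume_name_run("<div class=x>", 1) == ("div", 4)
```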
@@ -534,46 +588,59 @@ class Tokenizer:
                 if not chunk.islower():
                     chunk = chunk.translate(_ASCII_LOWER_TABLE)
                 append_tag_char(chunk)
-                self.pos = match.end()
-
-                if self.pos < length:
-                    c = buffer[self.pos]
-                    if c in (" ", "\t", "\n", "\f", "\r"):
-                        self.pos += 1
-                        if c == "\r":
-                            self.ignore_lf = True
+                pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
                         self.state = self.BEFORE_ATTRIBUTE_NAME
                         return self._state_before_attribute_name()
-                    if c == ">":
-                        self.pos += 1
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
                         if not self._emit_current_tag():
                             self.state = self.DATA
                         return False
-                    if c == "/":
-                        self.pos += 1
+                    if next_char == "/":
+                        pos += 1
+                        self.pos = pos
                         self.state = self.SELF_CLOSING_START_TAG
                         return self._state_self_closing_start_tag()
 
-            c = self._get_char()  # type: ignore[assignment]
+            # Inline _get_char
+            # Note: reconsume is never True in this state.
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
             if c is None:
+                self.pos = pos
                 self._emit_error("eof-in-tag")
                 # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
                 # The incomplete tag is discarded (not emitted as text)
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
+                self.pos = pos
                 self.state = self.BEFORE_ATTRIBUTE_NAME
                 return self._state_before_attribute_name()
             if c == "/":
+                self.pos = pos
                 self.state = self.SELF_CLOSING_START_TAG
                 return self._state_self_closing_start_tag()
             if c == ">":
                 # In slow path, tag name is only first char (from DATA),
                 # so no rawtext elements possible - always set DATA state
+                self.pos = pos
                 self._emit_current_tag()
                 self.state = self.DATA
                 return False
             # c == "\0" - the only remaining possibility after fast-path
+            self.pos = pos
             self._emit_error("unexpected-null-character")
             append_tag_char(replacement)
 
@@ -583,7 +650,7 @@
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume and not self.ignore_lf:
+            if not self.reconsume:
                 if self.pos < length:
                     # Check if current char is whitespace before running regex
                     if buffer[self.pos] in " \t\n\f":
@@ -603,21 +670,7 @@
 
             self.current_char = c
 
-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                    # Line tracking now computed on-demand via _get_line_at_pos()
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = False
-                if self.pos < length and buffer[self.pos] == "\n":
-                    self.pos += 1
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
             if c is None:
@@ -661,50 +714,54 @@
         append_attr_char = self.current_attr_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_attribute_name_run
-            if not self.reconsume and not self.ignore_lf:
-                pos = self.pos
-                if pos < length:
-                    # Optimization: Check for common terminators before regex
-                    match = None
-                    if buffer[pos] not in "\t\n\f />=\0\"'<\r":
-                        match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
-
-                    if match:
-                        chunk = match.group(0)
-                        if not chunk.islower():
-                            chunk = chunk.translate(_ASCII_LOWER_TABLE)
-                        append_attr_char(chunk)
-                        self.pos = match.end()
+            # Note: reconsume is never True in this state.
+            if pos < length:
+                # Optimization: Check for common terminators before regex
+                match = None
+                if buffer[pos] not in "\t\n\f />=\0\"'<":
+                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
 
-                    if self.pos < length:
-                        c = buffer[self.pos]
-                        if c == "=":
-                            self.pos += 1
-                            self.state = self.BEFORE_ATTRIBUTE_VALUE
-                            return self._state_before_attribute_value()
-                        if c in (" ", "\t", "\n", "\f", "\r"):
-                            self.pos += 1
-                            if c == "\r":
-                                self.ignore_lf = True
-                            self._finish_attribute()
-                            self.state = self.AFTER_ATTRIBUTE_NAME
-                            return False  # Let main loop dispatch to avoid recursion
-                        if c == ">":
-                            self.pos += 1
-                            self._finish_attribute()
-                            if not self._emit_current_tag():
-                                self.state = self.DATA
-                            return False
-                        if c == "/":
-                            self.pos += 1
-                            self._finish_attribute()
-                            self.state = self.SELF_CLOSING_START_TAG
-                            return self._state_self_closing_start_tag()
+                if match:
+                    chunk = match.group(0)
+                    if not chunk.islower():
+                        chunk = chunk.translate(_ASCII_LOWER_TABLE)
+                    append_attr_char(chunk)
+                    pos = match.end()
+
+                    if pos < length:
+                        c = buffer[pos]
+                        if c == "=":
+                            pos += 1
+                            self.pos = pos
+                            self.state = self.BEFORE_ATTRIBUTE_VALUE
+                            return self._state_before_attribute_value()
+                        if c in (" ", "\t", "\n", "\f"):
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            self.state = self.AFTER_ATTRIBUTE_NAME
+                            return False  # Let main loop dispatch to avoid recursion
+                        if c == ">":
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            if not self._emit_current_tag():
+                                self.state = self.DATA
+                            return False
+                        if c == "/":
+                            pos += 1
+                            self.pos = pos
+                            self._finish_attribute()
+                            self.state = self.SELF_CLOSING_START_TAG
+                            return self._state_self_closing_start_tag()
 
+            self.pos = pos
             c = self._get_char()  # type: ignore[assignment]
+            pos = self.pos
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
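The structural change running through this hunk is local-variable caching: `self.pos` is read into a local `pos` once, the hot loop works on the local, and the attribute is written back only at state transitions, since attribute access in a tight CPython loop costs noticeably more than local access. A toy illustration of the shape (class and method names are stand-ins):

```python
# Toy illustration of the local-variable caching used in this hunk: read
# self.pos once, loop on the local, write back at the state boundary.
class Scanner:
    def __init__(self, buffer: str) -> None:
        self.buffer = buffer
        self.pos = 0

    def skip_spaces(self) -> None:
        buffer, length, pos = self.buffer, len(self.buffer), self.pos
        while pos < length and buffer[pos] == " ":
            pos += 1       # local increment, no per-character attribute store
        self.pos = pos     # single write-back

s = Scanner("   x")
s.skip_spaces()
assert s.pos == 3
```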
@@ -730,8 +787,7 @@
                 self._emit_error("unexpected-null-character")
                 append_attr_char(replacement)
                 continue
-            if c in ('"', "'", "<"):
-                self._emit_error("unexpected-character-in-attribute-name")
+            self._emit_error("unexpected-character-in-attribute-name")
             append_attr_char(c)
 
     def _state_after_attribute_name(self) -> bool:
@@ -740,7 +796,7 @@
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume and not self.ignore_lf:
+            if not self.reconsume:
                 if self.pos < length:
                     match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                     if match:
@@ -755,23 +811,9 @@
 
             self.current_char = c
 
-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                # Note: Only reachable when ignore_lf=True (CR-LF handling)
-                # Standalone \n is caught by whitespace optimization
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = True
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
-            self.ignore_lf = False
-
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -857,10 +899,6 @@
                 if end != next_quote:
                     chunk = buffer[pos:end]
 
-                    # Normalize chunk for value if needed
-                    if "\r" in chunk:
-                        chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                     self.current_attr_value.append(chunk)
                     self.pos = end
 
@@ -916,10 +954,6 @@
                 if end != next_quote:
                     chunk = buffer[pos:end]
 
-                    # Normalize chunk for value if needed
-                    if "\r" in chunk:
-                        chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                     self.current_attr_value.append(chunk)
                     self.pos = end
 
@@ -965,7 +999,17 @@
             self.current_attr_value.append(buffer[pos:end])
             self.pos = end
 
-            c = self._get_char()
+            # Inline _get_char
+            if self.reconsume:
+                self.reconsume = False
+                c = self.current_char
+            elif self.pos >= length:
+                c = None
+            else:
+                c = buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 # Per HTML5 spec: EOF in attribute value is a parse error
                 # The incomplete tag is discarded (not emitted)
@@ -995,7 +1039,14 @@
 
     def _state_after_attribute_value_quoted(self) -> bool:
         """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
-        c = self._get_char()
+        # Inline _get_char
+        if self.pos >= self.length:
+            c: str | None = None
+        else:
+            c = self.buffer[self.pos]
+            self.pos += 1
+        self.current_char = c
+
         if c is None:
             self._emit_error("eof-in-tag")
             self._flush_text()
@@ -1125,7 +1176,14 @@
         while True:
             if self._consume_comment_run():
                 continue
-            c = self._get_char()
+            # Inline _get_char
+            if self.pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 self._emit_error("eof-in-comment")
                 self._emit_comment()
@@ -1264,7 +1322,7 @@
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype-name")
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1291,7 +1349,7 @@
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype-name")
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1675,36 +1733,19 @@
             self.reconsume = False
             return self.current_char
 
-        buffer = self.buffer
         pos = self.pos
-        length = self.length
-        while True:
-            if pos >= length:
-                self.pos = pos
-                self.current_char = None
-                return None
-
-            c = buffer[pos]
-            pos += 1
+        if pos >= self.length:
+            self.current_char = None
+            return None
 
-            if c == "\r":
-                self.ignore_lf = True
-                self.current_char = "\n"
-                self.pos = pos
-                return "\n"
-
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                    continue
-                # Line tracking now computed on-demand via _get_line_at_pos()
-
-            else:
-                self.ignore_lf = False
-
-            self.current_char = c
-            self.pos = pos
-            return c
+        c = self.buffer[pos]
+        self.pos = pos + 1
+        self.current_char = c
+        if c == "<":
+            self.current_token_start_pos = pos
+        if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
+            self._emit_error_at_pos("noncharacter-in-input-stream", pos)
+        return c
 
     def _reconsume_current(self) -> None:
         self.reconsume = True
@@ -1731,10 +1772,37 @@
         raw_len = len(data)
 
         self.text_buffer.clear()
-        if self.state == self.DATA and "\0" in data:
-            count = data.count("\0")
-            for _ in range(count):
-                self._emit_error("unexpected-null-character")
+        # U+0000 NULL is a parse error in text.
+        # Emit one error per NULL at the *actual* character position.
+        if "\0" in data:
+            base_pos = self.text_start_pos
+            search_from = 0
+            while True:
+                idx = data.find("\0", search_from)
+                if idx == -1:
+                    break
+                error_pos = base_pos + idx
+
+                # Compute column at error_pos (1-indexed).
+                last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
+                if last_newline == -1:
+                    column = error_pos + 1
+                else:
+                    column = error_pos - last_newline
+                line = self._get_line_at_pos(error_pos)
+
+                message = generate_error_message("unexpected-null-character")
+                self.errors.append(
+                    ParseError(
+                        "unexpected-null-character",
+                        line=line,
+                        column=column,
+                        message=message,
+                        source_html=self.buffer,
+                    )
+                )
+
+                search_from = idx + 1
 
        # Per HTML5 spec:
        # - RCDATA state (title, textarea): decode character references
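The replacement loop above reports each NULL at its true offset by walking the chunk with `str.find`, rather than emitting `count` identical errors at the current position. A standalone sketch of that scan (function name hypothetical):

```python
# Sketch of the str.find scan: one error per NULL, each at its true offset
# within the flushed text (base_pos plays the role of text_start_pos).
def null_offsets(data: str, base_pos: int) -> list[int]:
    offsets, search_from = [], 0
    while True:
        idx = data.find("\0", search_from)
        if idx == -1:
            return offsets
        offsets.append(base_pos + idx)
        search_from = idx + 1

assert null_offsets("a\0b\0", base_pos=10) == [11, 13]
```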
@@ -1747,13 +1815,16 @@
             pass
         else:
             if "&" in data:
-                data = decode_entities_in_text(data)
+                report_error = self._emit_error if self.collect_errors else None
+                data = decode_entities_in_text(data, report_error=report_error)
             # Apply XML coercion if enabled
             if self.opts.xml_coercion:
                 data = _coerce_text_for_xml(data)
 
         # Record position at END of raw text (1-indexed column = raw_len)
-        self._record_text_end_position(raw_len)
+        if self.collect_errors:
+            self._record_text_end_position(raw_len)
+        self.last_token_start_pos = self.text_start_pos
         self.sink.process_characters(data)
         # Note: process_characters never returns Plaintext or RawData
         # State switches happen via _emit_current_tag instead
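Entity decoding now threads an optional `report_error` callback, passed only when error collection is enabled, so the decoder can surface character-reference parse errors without the tokenizer paying for it otherwise. The real `decode_entities_in_text` lives in `justhtml.entities`; this is only a hedged sketch of the callback plumbing with a toy decoder:

```python
# Hedged sketch of the report_error plumbing; the real decoder is far more
# involved. The toy rule and error code here are illustrative only.
from typing import Callable

def decode_entities(text: str, report_error: Callable[[str], None] | None = None) -> str:
    if report_error is not None and "&" in text and ";" not in text:
        report_error("missing-semicolon-after-character-reference")
    return text.replace("&amp;", "&")  # toy decode

errors: list[str] = []
decode_entities("fish &amp chips", report_error=errors.append)
assert errors == ["missing-semicolon-after-character-reference"]
```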
@@ -1785,7 +1856,8 @@
         else:
             value = "".join(attr_value_buffer)
             if self.current_attr_value_has_amp:
-                value = decode_entities_in_text(value, in_attribute=True)
+                report_error = self._emit_error if self.collect_errors else None
+                value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
             attrs[name] = value
             attr_value_buffer.clear()
             self.current_attr_value_has_amp = False
@@ -1806,6 +1878,8 @@
         tag.name = name
         tag.attrs = attrs
         tag.self_closing = self.current_tag_self_closing
+        tag.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = tag.start_pos
 
         switched_to_rawtext = False
         if self.current_tag_kind == Tag.START:
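Stamping `tag.start_pos` with the offset of the opening `<` (captured into `current_token_start_pos` when the `<` was consumed) is what makes node-location tracking possible downstream. A toy tokenizer showing the same bookkeeping (all names illustrative, not justhtml's API):

```python
# Toy tokenizer demonstrating the start_pos bookkeeping added here: record
# the offset of '<' when a tag opens and stamp it onto the emitted token.
class TagToken:
    def __init__(self, name: str, start_pos: int) -> None:
        self.name = name
        self.start_pos = start_pos

def tokenize_tags(source: str) -> list[TagToken]:
    tokens, pos = [], 0
    while (lt := source.find("<", pos)) != -1:
        gt = source.find(">", lt)
        if gt == -1:
            break
        tokens.append(TagToken(source[lt + 1 : gt], lt))
        pos = gt + 1
    return tokens

tags = tokenize_tags("<p>hi <em>there</em></p>")
assert [(t.name, t.start_pos) for t in tags] == [("p", 0), ("em", 6), ("/em", 15), ("/p", 20)]
```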
@@ -1831,7 +1905,8 @@
         # Remember current state before emitting
 
         # Emit token to sink
-        self._record_token_position()
+        if self.collect_errors:
+            self._record_token_position()
         result = self.sink.process_token(tag)
         if result == 1:  # TokenSinkResult.Plaintext
             self.state = self.PLAINTEXT
@@ -1851,6 +1926,8 @@
         if self.opts.xml_coercion:
             data = _coerce_comment_for_xml(data)
         self._comment_token.data = data
+        self._comment_token.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = self._comment_token.start_pos
         self._emit_token(self._comment_token)
 
     def _emit_doctype(self) -> None:
@@ -1871,7 +1948,8 @@
         self._emit_token(DoctypeToken(doctype))
 
     def _emit_token(self, token: Any) -> None:
-        self._record_token_position()
+        if self.collect_errors:
+            self._record_token_position()
         self.sink.process_token(token)
         # Note: process_token never returns Plaintext or RawData for state switches
         # State switches happen via _emit_current_tag checking sink response
@@ -1881,8 +1959,6 @@
 
         Per the spec, the position should be at the end of the token (after the last char).
         """
-        if not self.collect_errors:
-            return
         # pos points after the last consumed character, which is exactly what we want
         pos = self.pos
         last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1899,8 +1975,6 @@
         Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
         behavior of reporting the column of the last character (1-indexed).
         """
-        if not self.collect_errors:
-            return
         # Position of last character of text (0-indexed)
         end_pos = self.text_start_pos + raw_len
         last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1926,6 +2000,17 @@
         line = self._get_line_at_pos(self.pos)
         self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
 
+    def _emit_error_at_pos(self, code: str, pos: int) -> None:
+        last_newline = self.buffer.rfind("\n", 0, pos + 1)
+        if last_newline == -1:
+            column = pos + 1
+        else:
+            column = pos - last_newline
+
+        message = generate_error_message(code)
+        line = self._get_line_at_pos(pos)
+        self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
+
     def _consume_if(self, literal: str) -> bool:
         end = self.pos + len(literal)
         if end > self.length:
@@ -1953,21 +2038,9 @@
         if pos >= length:
             return False
 
-        # Handle ignore_lf for CRLF sequences
-        if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
-            self.ignore_lf = False
-            pos += 1
-            self.pos = pos
-            if pos >= length:
-                return False
-
         match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
         if match:
             chunk = match.group(0)
-            # Handle CRLF normalization for comments
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-                self.ignore_lf = chunk.endswith("\r")
             self.current_comment.append(chunk)
             self.pos = match.end()
             return True
@@ -2061,7 +2134,7 @@
         # Consume everything up to the special character
         if next_special > pos:
             chunk = buffer[pos:next_special]
-            self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+            self._append_text_chunk(chunk)
             pos = next_special
             self.pos = pos
 
@@ -2073,7 +2146,6 @@
 
         # Handle special characters - we're at one of them after find()
         if null_index == pos:
-            self.ignore_lf = False
             self._emit_error("unexpected-null-character")
             self._append_text("\ufffd")
             pos += 1
@@ -2188,9 +2260,7 @@
         if null_index != -1 and null_index < next_special:
             if null_index > pos:
                 chunk = buffer[pos:null_index]
-                self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
-            else:
-                self.ignore_lf = False
+                self._append_text_chunk(chunk)
             self._emit_error("unexpected-null-character")
             self._append_text("\ufffd")
             pos = null_index + 1
@@ -2199,14 +2269,14 @@
         if lt_index == -1:
             if pos < length:
                 chunk = buffer[pos:length]
-                self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+                self._append_text_chunk(chunk)
             self.pos = length
             self._flush_text()
             self._emit_token(EOFToken())
             return True
         if lt_index > pos:
             chunk = buffer[pos:lt_index]
-            self._append_text_chunk(chunk, ends_with_cr=chunk.endswith("\r"))
+            self._append_text_chunk(chunk)
             pos = lt_index + 1
             self.pos = pos
             # Handle script escaped transition before treating '<' as markup boundary